def compute_distortion(cluster_list): """ takes a list of clusters and uses cluster_error to compute its distortion """ # use compute_distortion to compute the distortions of the two clusterings in questions 5 and 6 num_clusters = 9 num_iterations = 5 hier_list = [] kmeans_list = [] # load data table data_table = load_data_table(DATA_111_URL) hier_list = cpf.hierarchical_clustering(cluster_list, num_clusters) hier_error = 0 for cluster in hier_list: hier_error += cluster.cluster_error(data_table) kmeans_list = cpf.kmeans_clustering(cluster_list, num_clusters, num_iterations) kmeans_error = 0 for cluster in kmeans_list: kmeans_error += cluster.cluster_error(data_table) print "\n\n\n\n-------- -------- Results -------- --------" print "Number of clusters: %r" % num_clusters print "hierarchical_clustering error: ", hier_error print "kmeans_clustering error: ", kmeans_error print "----- ----- ----- ----- ----- ----- -----" return [hier_error, kmeans_error]
def compute_distortion(cluster_list, data_table, num_clusters):
    """
    Compute the distortion of a hierarchical and a k-means clustering of
    cluster_list.

    cluster_list -- list of clusters handed to both clustering functions
    data_table   -- data table passed to each cluster's cluster_error
    num_clusters -- number of clusters each algorithm should produce

    k-means is run with 5 iterations.  The distortion of a clustering is
    the sum of cluster_error over its clusters.

    Returns [hier_error, kmeans_error]
    """
    num_iterations = 5

    # NOTE(review): k-means is run first on purpose.  Both algorithms are
    # handed the same cluster_list; if hierarchical_clustering merges
    # clusters in place (TODO confirm against cpf), running it first would
    # leave k-means with an already-merged list and make the comparison
    # meaningless.  k-means is expected not to mutate its input.
    kmeans_list = cpf.kmeans_clustering(cluster_list, num_clusters,
                                        num_iterations)
    kmeans_error = sum(cluster.cluster_error(data_table)
                       for cluster in kmeans_list)

    hier_list = cpf.hierarchical_clustering(cluster_list, num_clusters)
    hier_error = sum(cluster.cluster_error(data_table)
                     for cluster in hier_list)

    return [hier_error, kmeans_error]
def run_question(number, data_set): """ Load a data table, compute a list of clusters and plot a list of clusters. Set DESKTOP = True/False to use either matplotlib or simplegui """ global DESKTOP print "Loading data table ..." data_table = load_data_table(data_set) print "Data table loaded. Creating clusters ..." singleton_list = [] # set correct number of clusters if number in [2, 3]: num_clusters = 15 elif number in [5, 6]: num_clusters = 9 print "\nQuestion number: ", number print "Number of clusters to be calculated: ", num_clusters # parse data_table into cluster objects for line in data_table: singleton_list.append( alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) print "\nCluster list created. Passing list to hierarchical_clustering ..." # calculate clusters if number == 0: cluster_list = sequential_clustering(singleton_list, 15) print "Displaying", len(cluster_list), "sequential clusters" elif number in [2, 5]: cluster_list = cpf.hierarchical_clustering(singleton_list, num_clusters) print "Displaying", len(cluster_list), "hierarchical clusters" elif number in [3, 6]: cluster_list = cpf.kmeans_clustering(singleton_list, num_clusters, 5) print "Displaying", len(cluster_list), "k-means clusters" else: "Please pass a valid number to run_question. Valid options are 0, 2, 3, 5, or 6." # draw the clusters using matplotlib or simplegui if DESKTOP: # alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False) alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True) #add cluster centers else: alg_clusters_simplegui.PlotClusters( data_table, cluster_list) # use toggle in GUI to add cluster centers
def run_question(number, data_set): """ Load a data table, compute a list of clusters and plot a list of clusters. Set DESKTOP = True/False to use either matplotlib or simplegui """ global DESKTOP print "Loading data table ..." data_table = load_data_table(data_set) print "Data table loaded. Creating clusters ..." singleton_list = [] for line in data_table: singleton_list.append( alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) print "Cluster list created. Passing list to hierarchical_clustering ..." if number == 0: cluster_list = sequential_clustering(singleton_list, 15) print "Displaying", len(cluster_list), "sequential clusters" elif number in [2, 5]: cluster_list = cpf.hierarchical_clustering(singleton_list, 9) print "Displaying", len(cluster_list), "hierarchical clusters" elif number in [3, 6]: cluster_list = cpf.kmeans_clustering(singleton_list, 9, 5) print "Displaying", len(cluster_list), "k-means clusters" else: "Please pass a valid number to run_question." # draw the clusters using matplotlib or simplegui if DESKTOP: # alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False) alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True) #add cluster centers else: alg_clusters_simplegui.PlotClusters( data_table, cluster_list) # use toggle in GUI to add cluster centers
def compute_distortion_plot_helper(cluster_list, data_table, num_clusters):
    """
    Compute the distortion of a hierarchical and a k-means clustering of
    cluster_list.

    cluster_list -- list of clusters handed to both clustering functions
    data_table   -- data table passed to each cluster's cluster_error
    num_clusters -- number of clusters each algorithm should produce

    k-means is run with 5 iterations.  The distortion of a clustering is
    the sum of cluster_error over its clusters.

    Returns [hier_error, kmeans_error]
    """
    num_iterations = 5

    # NOTE(review): k-means is run first on purpose.  Both algorithms are
    # handed the same cluster_list; if hierarchical_clustering merges
    # clusters in place (TODO confirm against cpf), running it first would
    # leave k-means with an already-merged list and make the comparison
    # meaningless.  k-means is expected not to mutate its input.
    kmeans_list = cpf.kmeans_clustering(cluster_list, num_clusters,
                                        num_iterations)
    kmeans_error = sum(cluster.cluster_error(data_table)
                       for cluster in kmeans_list)

    hier_list = cpf.hierarchical_clustering(cluster_list, num_clusters)
    hier_error = sum(cluster.cluster_error(data_table)
                     for cluster in hier_list)

    return [hier_error, kmeans_error]
def test_kmeans(): """ Test for k-means clustering kmeans_clustering should not mutate cluster_list, but make a new copy of each test anyways """ # load small data table print print "\n\nTesting kmeans_clustering on 24 county set" data_24_table = load_data_table(DATA_24_URL) kmeansdata_24 = [[ 15, 1, set([('34017', '36061'), ('06037', ), ('06059', ), ('36047', ), ('36081', ), ('06071', '08031'), ('36059', ), ('36005', ), ('55079', ), ('34013', '34039'), ('06075', ), ('01073', ), ('06029', ), ('41051', '41067'), ('11001', '24510', '51013', '51760', '51840', '54009')]) ], [ 15, 3, set([('34017', '36061'), ('06037', '06059'), ('06071', ), ('36047', ), ('36081', ), ('08031', ), ('36059', ), ('36005', ), ('55079', ), ('34013', '34039'), ('06075', ), ('01073', ), ('06029', ), ('41051', '41067'), ('11001', '24510', '51013', '51760', '51840', '54009')]) ], [ 15, 5, set([('34017', '36061'), ('06037', '06059'), ('06071', ), ('36047', ), ('36081', ), ('08031', ), ('36059', ), ('36005', ), ('55079', ), ('34013', '34039'), ('06075', ), ('01073', ), ('06029', ), ('41051', '41067'), ('11001', '24510', '51013', '51760', '51840', '54009')]) ], [ 10, 1, set([('34017', '36061'), ('06029', '06037', '06075'), ('11001', '24510', '34013', '34039', '51013', '51760', '51840', '54009'), ('06059', ), ('36047', ), ('36081', ), ('06071', '08031', '41051', '41067'), ('36059', ), ('36005', ), ('01073', '55079')]) ], [ 10, 3, set([('34013', '34017', '36061'), ('06029', '06037', '06075'), ('08031', '41051', '41067'), ('06059', '06071'), ('34039', '36047'), ('36081', ), ('36059', ), ('36005', ), ('01073', '55079'), ('11001', '24510', '51013', '51760', '51840', '54009')]) ], [ 10, 5, set([('34013', '34017', '36061'), ('06029', '06037', '06075'), ('08031', '41051', '41067'), ('06059', '06071'), ('34039', '36047'), ('36081', ), ('36059', ), ('36005', ), ('01073', '55079'), ('11001', '24510', '51013', '51760', '51840', '54009')]) ], [ 5, 1, set([('06029', '06037', '06075'), ('01073', '11001', '24510', 
'34013', '34017', '34039', '36047', '51013', '51760', '51840', '54009', '55079'), ('06059', ), ('36005', '36059', '36061', '36081'), ('06071', '08031', '41051', '41067')]) ], [ 5, 3, set([('06029', '06037', '06075'), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013'), ('08031', '41051', '41067'), ('06059', '06071'), ('01073', '51760', '51840', '54009', '55079')]) ], [ 5, 5, set([('06029', '06037', '06075'), ('08031', '41051', '41067'), ('06059', '06071'), ('01073', '55079'), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')]) ]] suite = poc_simpletest.TestSuite() for num_clusters, num_iterations, expected_county_tuple in kmeansdata_24: # build initial list of clusters for each test since mutation is allowed cluster_list = [] for idx in range(len(data_24_table)): line = data_24_table[idx] cluster_list.append( alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) # compute student answer student_clustering = student.kmeans_clustering(cluster_list, num_clusters, num_iterations) student_county_tuple = set_of_county_tuples(student_clustering) # Prepare test error_message = "\n\nTesting kmeans_custering on 24 county table, \nnum_clusters = " + str( num_clusters) error_message += " num_iterations = " + str(num_iterations) error_message += "\n\nStudent county tuples: " + str( student_county_tuple) error_message += "\n\nExpected county tuples: " + str( expected_county_tuple) suite.run_test(student_county_tuple == expected_county_tuple, True, error_message) print "\n\n" suite.report_results()