def run_example(data=3108, algorithm="sequential", display_centers=False):
    """
    Load a data table, compute a list of clusters and plot a list of clusters

    Arguments:
    data -- which data set to load; one of 3108, 896, 290 or 111
    algorithm -- "sequential" (15 clusters), "hierarchical" (9 clusters)
                 or "k-means" (9 clusters, 5 iterations)
    display_centers -- forwarded to the matplotlib plotter only; the
                       simplegui plotter is not given this flag (use the
                       toggle in its GUI to add cluster centers)

    Raises ValueError if data or algorithm is not one of the values above.

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    # Reject an unknown algorithm before the data table is loaded, so a
    # typo fails immediately instead of after the load.
    if algorithm not in ("sequential", "hierarchical", "k-means"):
        raise ValueError("Unsupported algorithm: %r" % (algorithm,))

    if data == 3108:
        data_url = DATA_3108_URL
    elif data == 896:
        data_url = DATA_896_URL
    elif data == 290:
        data_url = DATA_290_URL
    elif data == 111:
        data_url = DATA_111_URL
    else:
        raise ValueError("Unsupported data set: %r" % (data,))

    data_table = load_data_table(data_url)

    # Start from one singleton cluster per row of the table.
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]

    if algorithm == "sequential":
        cluster_list = sequential_clustering(singleton_list, 15)
    elif algorithm == "hierarchical":
        cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 9)
    else:
        # Only "k-means" can reach this branch after the validation above.
        cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 9, 5)

    # Single-argument print: same output text as the original
    # comma-separated print statement.
    print("Displaying %d %s clusters" % (len(cluster_list), algorithm))

    # draw the clusters using matplotlib or simplegui
    if DESKTOP:
        # display_centers = True adds cluster centers
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, display_centers)
    else:
        # use toggle in GUI to add cluster centers
        alg_clusters_simplegui.PlotClusters(data_table, cluster_list)
def test_kmeans():
    """
    Test for k-means clustering

    Runs student.kmeans_clustering on the 24 county data table for several
    (num_clusters, num_iterations) combinations and compares the resulting
    set of county tuples with the expected one.

    A fresh list of singleton clusters is built for every case, so each
    case starts from the same input regardless of what the previous call
    did to its cluster_list.
    """
    # load small data table; the leading newline prints the blank separator line
    print("\nTesting kmeans_clustering on 24 county set")
    data_24_table = load_data_table(DATA_24_URL)

    # each entry is [num_clusters, num_iterations, expected set of county tuples]
    kmeansdata_24 = [
        [15, 1, set([('34017', '36061'), ('06037',), ('06059',), ('36047',),
                     ('36081',), ('06071', '08031'), ('36059',), ('36005',),
                     ('55079',), ('34013', '34039'), ('06075',), ('01073',),
                     ('06029',), ('41051', '41067'),
                     ('11001', '24510', '51013', '51760', '51840', '54009')])],
        [15, 3, set([('34017', '36061'), ('06037', '06059'), ('06071',),
                     ('36047',), ('36081',), ('08031',), ('36059',), ('36005',),
                     ('55079',), ('34013', '34039'), ('06075',), ('01073',),
                     ('06029',), ('41051', '41067'),
                     ('11001', '24510', '51013', '51760', '51840', '54009')])],
        [15, 5, set([('34017', '36061'), ('06037', '06059'), ('06071',),
                     ('36047',), ('36081',), ('08031',), ('36059',), ('36005',),
                     ('55079',), ('34013', '34039'), ('06075',), ('01073',),
                     ('06029',), ('41051', '41067'),
                     ('11001', '24510', '51013', '51760', '51840', '54009')])],
        [10, 1, set([('34017', '36061'), ('06029', '06037', '06075'),
                     ('11001', '24510', '34013', '34039', '51013', '51760',
                      '51840', '54009'),
                     ('06059',), ('36047',), ('36081',),
                     ('06071', '08031', '41051', '41067'),
                     ('36059',), ('36005',), ('01073', '55079')])],
        [10, 3, set([('34013', '34017', '36061'), ('06029', '06037', '06075'),
                     ('08031', '41051', '41067'), ('06059', '06071'),
                     ('34039', '36047'), ('36081',), ('36059',), ('36005',),
                     ('01073', '55079'),
                     ('11001', '24510', '51013', '51760', '51840', '54009')])],
        [10, 5, set([('34013', '34017', '36061'), ('06029', '06037', '06075'),
                     ('08031', '41051', '41067'), ('06059', '06071'),
                     ('34039', '36047'), ('36081',), ('36059',), ('36005',),
                     ('01073', '55079'),
                     ('11001', '24510', '51013', '51760', '51840', '54009')])],
        [5, 1, set([('06029', '06037', '06075'),
                    ('01073', '11001', '24510', '34013', '34017', '34039',
                     '36047', '51013', '51760', '51840', '54009', '55079'),
                    ('06059',),
                    ('36005', '36059', '36061', '36081'),
                    ('06071', '08031', '41051', '41067')])],
        [5, 3, set([('06029', '06037', '06075'),
                    ('11001', '24510', '34013', '34017', '34039', '36005',
                     '36047', '36059', '36061', '36081', '51013'),
                    ('08031', '41051', '41067'),
                    ('06059', '06071'),
                    ('01073', '51760', '51840', '54009', '55079')])],
        [5, 5, set([('06029', '06037', '06075'),
                    ('08031', '41051', '41067'),
                    ('06059', '06071'),
                    ('01073', '55079'),
                    ('11001', '24510', '34013', '34017', '34039', '36005',
                     '36047', '36059', '36061', '36081', '51013', '51760',
                     '51840', '54009')])],
    ]

    suite = poc_simpletest.TestSuite()

    for num_clusters, num_iterations, expected_county_tuple in kmeansdata_24:

        # build initial list of clusters for each test so no case can be
        # affected by what an earlier call did to its input
        cluster_list = [
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
            for line in data_24_table
        ]

        # compute student answer
        student_clustering = student.kmeans_clustering(cluster_list, num_clusters, num_iterations)
        student_county_tuple = set_of_county_tuples(student_clustering)

        # Prepare test
        error_message = "Testing kmeans_clustering on 24 county table, num_clusters = " + str(num_clusters)
        error_message += " num_iterations = " + str(num_iterations)
        error_message += "\nStudent county tuples: " + str(student_county_tuple)
        error_message += "\nExpected county tuples: " + str(expected_county_tuple)
        suite.run_test(student_county_tuple == expected_county_tuple, True, error_message)

    suite.report_results()