def run_example(): """ Load a data table, compute a list of clusters and plot a list of clusters Set DESKTOP = True/False to use either matplotlib or simplegui """ data_table = load_data_table(DATA_111_URL) singleton_list = [] for line in data_table: singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) # cluster_list = sequential_clustering(singleton_list, 15) # print "Displaying", len(cluster_list), "sequential clusters" cluster_list = project.hierarchical_clustering(singleton_list, 20) print "Displaying", len(cluster_list), "hierarchical clusters" cluster_list = project.kmeans_clustering(singleton_list, 16, 5) # print "Displaying", len(cluster_list), "k-means clusters" # draw the clusters using matplotlib or simplegui # if DESKTOP: # alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True) # #alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True) #add cluster centers # else: # alg_clusters_simplegui.PlotClusters(data_table, cluster_list) # use toggle in GUI to add cluster centers print "cluster:", cluster_list
def run_example(): """ Load a data table, compute a list of clusters and plot a list of clusters Set DESKTOP = True/False to use either matplotlib or simplegui """ data_table = load_data_table(DATA_111_URL) singleton_list = [] for line in data_table: singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) #cluster_list = sequential_clustering(singleton_list, 15) #print "Displaying", len(cluster_list), "sequential clusters" cluster_list = project3.hierarchical_clustering(singleton_list, 9) print "Displaying", len(cluster_list), "hierarchical clusters" #cluster_list = project3.kmeans_clustering(singleton_list, 9, 5) #print "Displaying", len(cluster_list), "k-means clusters" # draw the clusters using matplotlib or simplegui if DESKTOP: alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True) else: alg_clusters_simplegui.PlotClusters(data_table, cluster_list)
def gen_all_distortions(data_table, cluster_list, clustering_type, min_size, max_size):
    """
    Return the distortions of the clusterings of cluster_list for every
    output size from min_size to max_size (inclusive).

    data_table: the county data table passed on to compute_distortion
    cluster_list: the list of clusters to start each clustering from;
        it is deep-copied for every size and is never modified here
    clustering_type: either "hierarchical" or "kmeans"
    min_size: the minimum number of desired output clusters
    max_size: the maximum number of desired output clusters

    Returns a list with one distortion per size, in increasing size order.
    The list is empty when min_size > max_size.

    Raises ValueError if clustering_type is not one of the two supported
    names (checked once a clustering is actually requested).
    """
    all_distortions = []

    for size in range(min_size, max_size + 1):
        # Work on a deep copy because hierarchical clustering mutates the
        # list it is given.
        copy_list = deepcopy(cluster_list)

        # all() is required here: a bare generator expression is always
        # truthy, so asserting on it directly can never fail.
        assert all(copied.fips_codes() == original.fips_codes()
                   for copied, original in zip(copy_list, cluster_list)), \
            "copy_list != cluster_list"

        # Compute hierarchical or kmeans clustering (k-means uses 5 iterations).
        if clustering_type == "hierarchical":
            clustering = project.hierarchical_clustering(copy_list, size)
        elif clustering_type == "kmeans":
            clustering = project.kmeans_clustering(copy_list, size, 5)
        else:
            raise ValueError(
                "clustering_type must be 'hierarchical' or 'kmeans', got %r"
                % (clustering_type,))

        # Compute distortion and append to all_distortions list.
        all_distortions.append(compute_distortion(clustering, data_table))

    return all_distortions
def test_hierarchical24():
    """
    Test for hierarchical clustering on the 24 county data set.

    For every expected clustering in the fixture below (output sizes 23
    down to 2), a fresh list of singleton clusters is built from the data
    table, student.hierarchical_clustering is run on it, and the resulting
    set of county tuples is compared against the expected set.  Results are
    collected in a poc_simpletest.TestSuite and reported at the end.

    Note that hierarchical_clustering mutates cluster_list
    """

    # load small data table
    print
    print "Testing hierarchical_clustering on 24 county set"
    data_24_table = load_data_table(DATA_24_URL)

    # test data of the form [size of output cluster, sets of county tuples]
    # Each tuple holds the county codes (column 0 of the data table) that
    # are expected to end up together in one cluster.
    hierdata_24 = [[23, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36061',), ('36005',), ('36047',), ('36059',), ('36081',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [22, set([('11001', '51013'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36061',), ('36005',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [21, set([('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [20, set([('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34039',), ('34013', '34017'), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [19, set([('34013', '34017', '34039'), ('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [18, set([('34013', '34017', '34039'), ('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('36005', '36047', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [17, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('34013', '34017', '34039', '36005', '36047', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [16, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [15, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [14, set([('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013')])],
                   [13, set([('06037', '06059'), ('01073',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013')])],
                   [12, set([('06037', '06059'), ('01073',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013', '51840')])],
                   [11, set([('06029', '06037', '06059'), ('01073',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013', '51840')])],
                   [10, set([('06029', '06037', '06059'), ('01073',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '51013', '51760', '51840')])],
                   [9, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '51013', '51760', '51840')])],
                   [8, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840')])],
                   [7, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('41051', '41067'), ('55079',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])],
                   [6, set([('06029', '06037', '06059', '06071', '06075'), ('01073',), ('08031',), ('41051', '41067'), ('55079',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])],
                   [5, set([('06029', '06037', '06059', '06071', '06075'), ('08031',), ('41051', '41067'), ('01073', '55079'), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])],
                   [4, set([('06029', '06037', '06059', '06071', '06075'), ('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('08031',), ('41051', '41067')])],
                   [3, set([('06029', '06037', '06059', '06071', '06075', '41051', '41067'), ('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('08031',)])],
                   [2, set([('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('06029', '06037', '06059', '06071', '06075', '08031', '41051', '41067')])],
                   ]

    suite = poc_simpletest.TestSuite()

    for num_clusters, expected_county_tuple in hierdata_24:

        # build initial list of clusters for each test since mutation is allowed
        cluster_list = []
        for idx in range(len(data_24_table)):
            line = data_24_table[idx]
            cluster_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

        # compute student answer
        student_clustering = student.hierarchical_clustering(cluster_list, num_clusters)
        # set_of_county_tuples is defined elsewhere; presumably it reduces each
        # cluster to a tuple of its county codes so that whole clusterings can
        # be compared as sets -- confirm against its definition.
        student_county_tuple = set_of_county_tuples(student_clustering)

        # Prepare test
        error_message = "Testing hierarchical_clustering on 24 county table, num_clusters = " + str(num_clusters)
        error_message += "\nStudent county tuples: " + str(student_county_tuple)
        error_message += "\nExpected county tuples: " + str(expected_county_tuple)
        # The comparison result is checked against True; error_message is the
        # text attached to this check (presumably shown on failure).
        suite.run_test(student_county_tuple == expected_county_tuple, True, error_message)

    suite.report_results()
def run_example(table, method):
    """
    Compute the distortion of clusterings with 20 down to 6 output
    clusters for the data set at `table`.

    method selects the algorithm: 'h_cluster' for hierarchical clustering
    (each clustering is derived from the previous, larger one) or
    'k_cluster' for k-means clustering with 5 iterations (always started
    from the singleton clusters).

    When DESKTOP is true, returns a dictionary mapping number of clusters
    to distortion.  Otherwise the last clustering computed is drawn with
    simplegui and nothing is returned.
    """
    data_table = load_data_table(table)

    # One single-county cluster per row of the table.
    singleton_list = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]

    cluster_distortion_dict = {}
    first_size, last_size = 20, 6

    # Hierarchical clustering keeps refining this list from one size to
    # the next; k-means never touches it.
    working_list = list(singleton_list)

    for size in range(first_size, last_size - 1, -1):
        if method == 'k_cluster':
            cluster_list = alg_project3_solution.kmeans_clustering(
                singleton_list, size, 5)
            cluster_distortion_dict[size] = compute_distortion(
                cluster_list, data_table)
        elif method == 'h_cluster':
            cluster_list = alg_project3_solution.hierarchical_clustering(
                working_list, size)
            cluster_distortion_dict[size] = compute_distortion(
                cluster_list, data_table)
            working_list = cluster_list

    if DESKTOP:
        return cluster_distortion_dict

    # use toggle in GUI to add cluster centers
    alg_clusters_simplegui.PlotClusters(data_table, cluster_list)
def question_10():
    """
    Plot the distortion of k-means and hierarchical clusterings of the
    111 county data set for 6 to 20 output clusters.
    """
    DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    DATA_896_URL = DIRECTORY + "data_clustering/unifiedCancerData_896.csv"
    DATA_290_URL = DIRECTORY + "data_clustering/unifiedCancerData_290.csv"
    DATA_111_URL = DIRECTORY + "data_clustering/unifiedCancerData_111.csv"

    # Data file used for this plot.
    data_table = cluster_visual.load_data_table(DATA_111_URL)

    singleton_list = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]

    num_clusters = range(6, 20 + 1)

    # k-means distortions, one per requested size in increasing order.
    distortion_k_means = [
        compute_distortion(
            project3.kmeans_clustering(singleton_list, size, 5), data_table)
        for size in num_clusters
    ]

    # Hierarchical distortions: walk from 20 clusters down to 6, feeding
    # each result into the next call, then flip into increasing order.
    largest_first = []
    hier_clusters = singleton_list
    for size in reversed(num_clusters):
        hier_clusters = project3.hierarchical_clustering(hier_clusters, size)
        largest_first.append(compute_distortion(hier_clusters, data_table))
    distortion_hier = largest_first[::-1]

    # Single 8x8 inch figure at 80 dpi with one subplot.
    plt.figure(figsize=(8,8), dpi=80)
    plt.subplot(1,1,1)

    plt.xlabel("Number of Clusters")
    plt.ylabel("Distortion")
    plt.title("Quality comparison - Two Clustering Methods - 111 Data")

    # k-means curve: solid blue line, width 2.
    plt.plot(num_clusters, distortion_k_means, color="blue", linewidth=2.0,
             linestyle="-", label="k-means")

    # Hierarchical curve: solid green line, width 2.
    plt.plot(num_clusters, distortion_hier, color="green", linewidth=2.0,
             linestyle="-", label="hierarchical")

    plt.legend(loc='upper right', frameon=False)

    # Show result on screen
    plt.show()
def question2_plot():
    """
    Produce the question 2 plot: the 3108 county data set grouped into
    15 clusters by hierarchical clustering.
    """
    data_table = load_data_table(DATA_3108_URL)

    # One single-county cluster per row of the table.
    singleton_list = [
        Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]

    plot_clusters(data_table,
                  hierarchical_clustering(singleton_list, 15),
                  True)
def question_ten(): """ Load a data table, compute a list of clusters and plot a list of clusters Set DESKTOP = True/False to use either matplotlib or simplegui """ #data_table = load_data_table(DATA_3108_URL) #data_table = load_data_table(DATA_896_URL) #data_table = load_data_table(DATA_290_URL) data_table = load_data_table(DATA_111_URL) singleton_list = [] for line in data_table: singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) #original_dict = create_dictionary(data_table) hierarchical_distortion_list = dict([]) for num_clusters in range(6, 21): print "hierarchical: num_cluster=", num_clusters cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, num_clusters) distortion = compute_distortion(cluster_list, data_table) hierarchical_distortion_list[num_clusters] = distortion print "About to display ...." #cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 16) #cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 9) #print "Displaying", len(cluster_list), "hierarchical clusters" cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 9, 5) #print "Displaying", len(cluster_list), "k-means clusters" # compute the distortion if (True): distortion = compute_distortion(cluster_list, data_table) print distortion # draw the clusters using matplotlib or simplegui if (False): if DESKTOP: #alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False) alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True) #add cluster centers else: alg_clusters_simplegui.PlotClusters(data_table, cluster_list) # use toggle in GUI to add cluster centers
def question7(): data_table = alg_project3_viz.load_data_table(DATA_111_URL) singleton_list = [] for line in data_table: singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) clusters_hierarchical = project3.hierarchical_clustering(singleton_list, 9) clusters_kmeans = project3.kmeans_clustering(singleton_list, 9, 5) distortion_hierarchical = compute_distortion(clusters_hierarchical, data_table) distortion_kmeans = compute_distortion(clusters_kmeans, data_table) print "distortion hierarchical: ", distortion_hierarchical print "distortion k-means: ", distortion_kmeans
def compute_distortions():
    """
    Compute k-means and hierarchical distortions for the 111, 290 and 896
    county data sets and 6 to 20 output clusters.

    Returns a numpy array of shape (3, 15, 2): axis 0 is the data set (in
    the order 111, 290, 896), axis 1 is the number of clusters minus 6,
    and axis 2 is the method (0 = k-means with 5 iterations,
    1 = hierarchical).
    """
    data_tables = [load_data_table(url)
                   for url in [DATA_111_URL, DATA_290_URL, DATA_896_URL]]
    distortions = np.zeros((3, 15, 2))

    for table_idx in range(3):
        table = data_tables[table_idx]
        for size_idx, nclust in enumerate(range(6, 21)):
            # Fresh singleton clusters for every (data set, size) pair.
            singletons = []
            for row in table:
                singletons.append(alg_cluster.Cluster(
                    set([row[0]]), row[1], row[2], row[3], row[4]))

            # k-means runs before hierarchical on the same list.
            kmeans_result = alg_project3_solution.kmeans_clustering(
                singletons, nclust, 5)
            distortions[table_idx, size_idx, 0] = compute_distortion(
                kmeans_result, table)

            hier_result = alg_project3_solution.hierarchical_clustering(
                singletons, nclust)
            distortions[table_idx, size_idx, 1] = compute_distortion(
                hier_result, table)

    return distortions
def distortion_of_clustering():
    """
    Plot the distortion of hierarchical and k-means clusterings of the
    896 county data set for 6 to 20 output clusters.
    """
    data_table = load_data_table(DATA_896_URL)

    def build_singletons():
        """Return a new list with one single-county cluster per row."""
        return [alg_cluster.Cluster(set([row[0]]),
                                    row[1], row[2], row[3], row[4])
                for row in data_table]

    hierarchical_distortion_list = []
    kmeans_distortion_list = []

    for num_cluster in xrange(6, 21):
        # Both algorithms start from their own freshly built singletons.
        hierarchical_cluster_list = project3.hierarchical_clustering(
            build_singletons(), num_cluster)
        hierarchical_distortion_list.append(
            compute_distortion(hierarchical_cluster_list, data_table))

        kmeans_cluster_list = project3.kmeans_clustering(
            build_singletons(), num_cluster, 5)
        kmeans_distortion_list.append(
            compute_distortion(kmeans_cluster_list, data_table))

    # Both curves go on one figure: hierarchical in red, k-means in blue.
    plt.figure()
    plt.hold(True)
    plt.plot(range(6, 21), hierarchical_distortion_list, 'r',
             label=' hierarchical')
    plt.plot(range(6, 21), kmeans_distortion_list, 'b', label='kmeans')
    plt.legend(loc='upper right')
    plt.title('Quality Comparision DataSet=896')
    plt.xlabel('Num_clusters')
    plt.ylabel('Distortion')
    plt.hold(False)
    plt.show()
def distortion_of_clustering(): """ Load a data table, compute a list of clusters and plot a list of clusters """ data_table = load_data_table(DATA_111_URL) singleton_list = [] for line in data_table: singleton_list.append(alg_cluster.Cluster(set([line[0]]), \ line[1], line[2], line[3], line[4])) hierarchical_cluster_list = project3.hierarchical_clustering( singleton_list, 9) print "hierarchical", compute_distortion(hierarchical_cluster_list, data_table) for line in data_table: singleton_list.append(alg_cluster.Cluster(set([line[0]]), \ line[1], line[2], line[3], line[4])) kmeans_cluster_list = project3.kmeans_clustering(singleton_list, 9, 5) print "kmeans", compute_distortion(kmeans_cluster_list, data_table)
def run_example(): """ Load a data table, compute a list of clusters and plot a list of clusters """ data_table = load_data_table(DATA_111_URL) singleton_list = [] for line in data_table: singleton_list.append(alg_cluster.Cluster(set([line[0]]), \ line[1], line[2], line[3], line[4])) cluster_list = project3.hierarchical_clustering(singleton_list, 9) print "Displaying", len(cluster_list), "hierarchical clusters" #cluster_list = project3.kmeans_clustering(singleton_list, 15, 5) #print "Displaying", len(cluster_list), "k-means clusters" # draw the clusters using matplotlib or simplegui alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)
def question_7(): ''' Write a function compute_distortion(cluster_list) that takes a list of clusters and uses cluster_error to compute its distortion. Now, use compute_distortion to compute the distortions of the two clusterings in questions 5 and 6. Enter the values for the distortions (with at least four significant digits) for these two clusterings in the box below. Clearly indicate the clusterings to which each value corresponds. As a check on the correctness of your code, the distortions associated with the 16 output clusters produced by hierarchical clustering and k-means clustering (with 5 iterations) on the 290 county data set are approximately 2.575×1011 and 2.323×1011, respectively. ''' data_table = load_data_table(DATA_111_URL) singleton_list = [] for line in data_table: singleton_list.append( alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) cluster_list_h = project3.hierarchical_clustering(singleton_list, 9) print "Distortion of", len(cluster_list_h), "hierarchical clusters" print compute_distortion(cluster_list_h, data_table) singleton_list = [] for line in data_table: singleton_list.append( alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) cluster_list_k = project3.kmeans_clustering(singleton_list, 9, 5) print "Distortion of", len(cluster_list_k), "k-means clusters" print compute_distortion(cluster_list_k, data_table) return
def gen_cluster_list(data_table, clustering_type, num_clusters, num_iterations=None):
    """
    Generate and return a list of clusters from a data table.

    data_table: rows from which singleton clusters are built; columns 0-4
        of each row are passed to alg_cluster.Cluster
    clustering_type: either "hierarchical" or "kmeans"
    num_clusters: number of desired output clusters
    num_iterations: number of iterations, passed through to k-means
        clustering; ignored for hierarchical clustering

    Raises ValueError if clustering_type is not one of the two supported
    names.
    """
    # Create list of singleton clusters from data_table.
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]

    # Compute hierarchical or kmeans clustering.
    if clustering_type == "hierarchical":
        return project.hierarchical_clustering(singleton_list, num_clusters)
    if clustering_type == "kmeans":
        # NOTE(review): num_iterations defaults to None and is passed on
        # unchanged -- confirm kmeans_clustering callers always supply it.
        return project.kmeans_clustering(singleton_list, num_clusters,
                                         num_iterations)

    # Fail with a clear message instead of an unbound-variable error.
    raise ValueError(
        "clustering_type must be 'hierarchical' or 'kmeans', got %r"
        % (clustering_type,))
def _question10_distortions(data_table, sizes):
    """
    Return (hierarchical, kmeans) distortion lists for data_table, each
    ordered like sizes.
    """
    # assumes compute_data_table(data_table) returns a new list of singleton
    # clusters on every call -- TODO confirm against its definition.

    # k-means (5 iterations): every run starts from its own singleton list.
    kmeans = [
        compute_distortion(
            project3.kmeans_clustering(compute_data_table(data_table), size, 5),
            data_table)
        for size in sizes
    ]

    # Hierarchical: go from the largest size down and feed each result
    # into the next call.  Clusters are only ever merged, so a list that
    # has been reduced to a small size cannot be used to produce a larger
    # clustering afterwards.
    by_size = {}
    working = compute_data_table(data_table)
    for size in sorted(sizes, reverse=True):
        working = project3.hierarchical_clustering(working, size)
        by_size[size] = compute_distortion(working, data_table)
    hierarchical = [by_size[size] for size in sizes]

    return hierarchical, kmeans


def _question10_plot(sizes, hierarchical, kmeans, title):
    """Draw one hierarchical-vs-k-means distortion plot and show it."""
    plt.plot(sizes, hierarchical, '-b', label='hierarchical')
    plt.plot(sizes, kmeans, '-r', label='k-means')
    plt.title(title)
    plt.legend(loc='upper right')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.show()


def question10():
    """
    Plot hierarchical versus k-means distortion for 6 to 20 output
    clusters on the 111, 290 and 896 county data sets.

    All distortions are computed first; the three plots are then shown one
    after another, one per data set.
    """
    clusters = range(6, 21)

    data_sets = [
        (alg_project3_viz.load_data_table(DATA_111_URL), 'Distortion for 111 points'),
        (alg_project3_viz.load_data_table(DATA_290_URL), 'Distortion for 290 points'),
        (alg_project3_viz.load_data_table(DATA_896_URL), 'Distortion for 896 points'),
    ]

    curves = [_question10_distortions(data_table, clusters)
              for data_table, _ in data_sets]

    for (_, title), (hierarchical, kmeans) in zip(data_sets, curves):
        _question10_plot(clusters, hierarchical, kmeans, title)
def run_example(table, method):
    """
    Build a table of distortions for clusterings of the data set at
    `table`, for 20 down to 6 output clusters.

    method is 'h_cluster' (hierarchical; every clustering is computed from
    the previous, larger one) or 'k_cluster' (k-means with 5 iterations,
    always computed from the singleton clusters).

    If DESKTOP is true the {number of clusters: distortion} dictionary is
    returned.  Otherwise the last clustering computed is drawn with
    simplegui and the function returns nothing.
    """
    data_table = load_data_table(table)

    singleton_list = [
        alg_cluster.Cluster(set([entry[0]]), entry[1], entry[2], entry[3],
                            entry[4])
        for entry in data_table
    ]

    cluster_distortion_dict = {}
    start = 20
    end = 6

    # Input for the next hierarchical step; replaced after every step.
    new_list = list(singleton_list)

    for count in reversed(range(end, start + 1)):
        if method == 'h_cluster':
            cluster_list = alg_project3_solution.hierarchical_clustering(
                new_list, count)
            new_list = cluster_list
        elif method == 'k_cluster':
            cluster_list = alg_project3_solution.kmeans_clustering(
                singleton_list, count, 5)
        else:
            # Unknown method: nothing is computed for this size.
            continue
        cluster_distortion_dict[count] = compute_distortion(
            cluster_list, data_table)

    if not DESKTOP:
        # use toggle in GUI to add cluster centers
        alg_clusters_simplegui.PlotClusters(data_table, cluster_list)
    else:
        return cluster_distortion_dict
print "Loaded", len(data_lines), "data points" data_tokens = [line.split(',') for line in data_lines] return [[tokens[0], float(tokens[1]), float(tokens[2]), int(tokens[3]), float(tokens[4])] for tokens in data_tokens] def compute_distortion(cluster_list, data_table): return sum([cluster_list[i].cluster_error(data_table) for i in range (len(cluster_list))]) data_table = load_data_table(DATA_896_URL) singleton_list = [] for line in data_table: singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) distortion_h = [] distortion_k = [] for i in range(6, 21): cluster_list = project3.hierarchical_clustering(singleton_list, i) distortion_h.append(compute_distortion(cluster_list, data_table)) cluster_list1 = project3.kmeans_clustering(singleton_list, i, 5) distortion_k.append(compute_distortion(cluster_list1, data_table)) x_axix1 = [n for n in range(6, 21)] y_axix1 = distortion_h y_axix2 = distortion_k plt.plot(x_axix1, y_axix1, marker = "o", color = "red") plt.plot(x_axix1, y_axix2, marker = "*", color = "blue") plt.xlabel("number of output clusters") plt.ylabel("Distortion") plt.title("Comparison of distortion of alg (DATA_896_URL)") plt.legend(["hierarchical_clustering", "k-means_clustering"], loc = "upper left")
def question_10():
    """
    Draw distortion against number of clusters (6 to 20) for k-means and
    hierarchical clustering of the 111 county data set.
    """
    DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    DATA_896_URL = DIRECTORY + "data_clustering/unifiedCancerData_896.csv"
    DATA_290_URL = DIRECTORY + "data_clustering/unifiedCancerData_290.csv"
    DATA_111_URL = DIRECTORY + "data_clustering/unifiedCancerData_111.csv"

    # Data file used for this plot.
    data_table = cluster_visual.load_data_table(DATA_111_URL)

    singleton_list = []
    for entry in data_table:
        county_cluster = alg_cluster.Cluster(
            set([entry[0]]), entry[1], entry[2], entry[3], entry[4])
        singleton_list.append(county_cluster)

    num_clusters = range(6, 20 + 1)

    # k-means distortion for each size, in increasing size order.
    distortion_k_means = []
    for size in num_clusters:
        k_means_clusters = project3.kmeans_clustering(singleton_list, size, 5)
        distortion_k_means.append(
            compute_distortion(k_means_clusters, data_table))

    # Hierarchical distortion: computed from 20 clusters down to 6, each
    # step starting from the previous result.  Inserting at the front
    # leaves the list in increasing size order.
    distortion_hier = []
    hier_clusters = singleton_list
    for size in range(20, 5, -1):
        hier_clusters = project3.hierarchical_clustering(hier_clusters, size)
        distortion_hier.insert(
            0, compute_distortion(hier_clusters, data_table))

    # One 8x8 inch figure at 80 dpi holding a single subplot.
    plt.figure(figsize=(8, 8), dpi=80)
    plt.subplot(1, 1, 1)

    plt.xlabel("Number of Clusters")
    plt.ylabel("Distortion")
    plt.title("Quality comparison - Two Clustering Methods - 111 Data")

    # k-means: solid blue line of width 2.
    plt.plot(num_clusters,
             distortion_k_means,
             color="blue",
             linewidth=2.0,
             linestyle="-",
             label="k-means")

    # hierarchical: solid green line of width 2.
    plt.plot(num_clusters,
             distortion_hier,
             color="green",
             linewidth=2.0,
             linestyle="-",
             label="hierarchical")

    plt.legend(loc='upper right', frameon=False)

    # Show result on screen
    plt.show()
def question_10(data_set): ''' Compute the distortion of the list of clusters produced by hierarchical clustering and k-means clustering (using 5 iterations) on the 111, 290, and 896 county data sets, respectively, where the number of output clusters ranges from 6 to 20 (inclusive).Important note:To compute the distortion for all 15 output clusterings produced by hierarchical_clustering, you should remember that you can use the hierarchical cluster of size 20 to compute the hierarchical clustering of size 19 and so on. Otherwise, you will introduce an unnecessary factor of 15 into the computation of the 15 hierarchical clusterings. Once you have computed these distortions for both clustering methods, create three separate plots (one for each data set) that compare the distortion of the clusterings produced by both methods. Each plot should include two curves drawn as line plots. The horizontal axis for each plot should indicate the number of output clusters while the vertical axis should indicate the distortion associated with each output clustering. For each plot, include a title that indicates the data set used in creating the plots and a legend that distinguishes the two curves. 
Takes a data set of either 3108, 896, 290, or 111 points ''' xvals = xrange(20, 5, -1) kmeans_y = [] hierarchical_y = [] # load data by county data_urls = { 3108: DATA_3108_URL, 896: DATA_896_URL, 290: DATA_290_URL, 111: DATA_111_URL } data_table = load_data_table(data_urls[data_set]) singleton_list = [] for line in data_table: singleton_list.append( alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) # compute k-means cluster distortion for num_clusters in xvals: print "Computing", num_clusters, "k-means clusters" kmeans = project3.kmeans_clustering(singleton_list, num_clusters, 5) kmeans_y.append(compute_distortion(kmeans, data_table)) # compute hierarchical cluster distortion hierarchical = singleton_list for num_clusters in xvals: print "Computing", num_clusters, "hierarchical clusters" hierarchical = project3.hierarchical_clustering( hierarchical, num_clusters) hierarchical_y.append(compute_distortion(hierarchical, data_table)) # plot results plt.plot(xvals, kmeans_y, color='r', label="K-Means Clustering") plt.plot(xvals, hierarchical_y, color='b', label="Hierarchical Clustering") plt.legend() plt.title("Distortion Comparison Between Clustering Methods on " + str(data_set) + " County Data Set") plt.xlabel("Number of Output Clusters") plt.ylabel("Distortion") plt.show() return #question_1() #question_7() #question_10(111) #question_10(290) #question_10(896) #question_10(3108)