def run_question_10():
    """Plot distortion against the number of output clusters (6..20).

    One figure is shown per data set (DATA_111_URL, DATA_290_URL,
    DATA_896_URL). Each figure compares hierarchical clustering with
    5-iteration k-means; distortions are divided by 10^11 for display.
    """
    cluster_counts = list(range(6, 21))
    data_tables = [
        pro3_viz.load_data_table(DATA_111_URL),
        pro3_viz.load_data_table(DATA_290_URL),
        pro3_viz.load_data_table(DATA_896_URL),
    ]

    for data_table in data_tables:
        # Each algorithm gets its own cluster list: hierarchical clustering
        # may merge clusters in place, which would otherwise corrupt the
        # input that k-means sees.
        kmeans_input = clusterize_data(data_table)
        hierarchical_clusters = clusterize_data(data_table)
        num_points = len(kmeans_input)

        # note that 'data_table' (not the clusterized list) goes to
        # compute_distortion
        k_dist = []
        for num_of_clusters in cluster_counts:
            kmeans_cluster_list = pro3.kmeans_clustering(
                kmeans_input, num_of_clusters, 5)
            k_dist.append(
                compute_distortion(data_table, kmeans_cluster_list) / 10**11)

        # Walk from 20 clusters down to 6, feeding each result into the next
        # call, so the 19-cluster answer is derived from the 20-cluster one
        # instead of starting from scratch every time.
        h_dist = []
        for num_of_clusters in reversed(cluster_counts):
            hierarchical_clusters = pro3.hierarchical_clustering(
                hierarchical_clusters, num_of_clusters)
            h_dist.append(
                compute_distortion(data_table, hierarchical_clusters) / 10**11)
        h_dist.reverse()  # back to ascending cluster counts

        plt.title("Distortion of clustering methods for " + str(num_points) +
                  " points")
        plt.xlabel("Number of clusters")
        plt.ylabel("Distortion (10^11)")
        plt.plot(cluster_counts, h_dist, label='hierarchical clustering')
        plt.plot(cluster_counts, k_dist,
                 label='k-means clustering (5 iterations)')
        plt.axis([6, 20, 0, 25])
        plt.legend()
        plt.show()
def q6():
    """Run 5-iteration k-means (9 clusters) on the DATA_111_URL table and plot it."""
    table = viz.load_data_table(viz.DATA_111_URL)
    # One single-member cluster per table row.
    singletons = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in table
    ]
    clusters = alg_project3.kmeans_clustering(singletons, 9, 5)
    alg_clusters_matplotlib.plot_clusters(table, clusters, True)
def q2():
    """Run hierarchical clustering (15 clusters) on the DATA_3108_URL table and plot it."""
    table = viz.load_data_table(viz.DATA_3108_URL)
    # One single-member cluster per table row.
    singletons = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in table
    ]
    clusters = alg_project3.hierarchical_clustering(singletons, 15)
    alg_clusters_matplotlib.plot_clusters(table, clusters, True)
def question10(data, filename):
    """Plot hierarchical vs. k-means distortion for 6..20 clusters and save it.

    Args:
        data: data-set identifier handed to load_data_table / load_as_list;
            it is also interpolated into the plot title.
        filename: destination passed to plt.savefig.
    """
    table = load_data_table(data)
    hier_input = load_as_list(data)
    cluster_counts = range(6, 21)

    hier_distortions = []

    def record(current_clusters):
        # Callback given to hierarchical_clustering together with the set of
        # cluster counts (presumably invoked at those sizes - verify against
        # hierarchical_clustering).
        hier_distortions.append(distortion(current_clusters, table))

    hierarchical_clustering(hier_input, 6, record, set(cluster_counts))
    # Values were collected in the callback's order; flip to match the x axis.
    hier_distortions.reverse()

    # k-means gets a freshly loaded list.
    kmeans_input = load_as_list(data)
    kmeans_distortions = []
    for count in cluster_counts:
        kmeans_distortions.append(
            distortion(kmeans_clustering(kmeans_input, count, 5), table))

    plt.cla()
    plt.plot(cluster_counts, hier_distortions, '-r',
             label='Hierarchical clustering distortion')
    plt.plot(cluster_counts, kmeans_distortions, '-b',
             label='k-means clustering distortion')
    plt.title('Clustering distortion (%s)' % data)
    plt.xlabel('Number of output clusters')
    plt.ylabel('Distortion')
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig(filename)
    print('Saved plot to %s' % filename)
def question5(filename):
    """Visualize a 9-cluster hierarchical clustering and print its distortion.

    Args:
        filename: forwarded to visualize() as its second argument.
    """
    data = 'unifiedCancerData_111.csv'

    def cluster(points):
        return hierarchical_clustering(points, 9)

    # visualize() runs first, then the table is loaded for the distortion.
    clusters = visualize(data, filename, cluster)
    table = load_data_table(data)
    dist = distortion(clusters, table)
    print('Distortion in question5, hierarchical_clustering = %f (%s)' %
          (dist, dist))
def run_example(data_dir, num_clusters):
    """Time hierarchical and k-means clustering on one data set.

    Args:
        data_dir: location passed to load_data_table.
        num_clusters: number of output clusters requested from both methods.

    Returns:
        Tuple (hierarchical_dur, hierarchical_dist, kmeans_dur, kmeans_dist):
        wall-clock seconds and scaled distortion for each method
        (k-means uses 5 iterations).
    """
    data_table = load_data_table(data_dir)

    def make_singletons():
        # One single-member cluster per table row.
        return [
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4])
            for line in data_table
        ]

    # Build both inputs before timing so list construction is not measured.
    # They are separate lists because hierarchical clustering may merge
    # clusters in place and would otherwise change what k-means receives.
    hierarchical_input = make_singletons()
    kmeans_input = make_singletons()

    begin = time.time()
    cluster_list = alg_project3_solution.hierarchical_clustering(
        hierarchical_input, num_clusters)
    end = time.time()
    hierarchical_dur = end - begin
    # NOTE(review): 10e10 equals 1e11 and the value is *multiplied*; confirm
    # whether dividing (to scale down) was intended. Kept as-is for callers.
    hierarchical_dist = compute_distortion(cluster_list, data_table) * 10e10

    begin = time.time()
    cluster_list = alg_project3_solution.kmeans_clustering(
        kmeans_input, num_clusters, 5)
    end = time.time()
    kmeans_dur = end - begin
    kmeans_dist = compute_distortion(cluster_list, data_table) * 10e10

    return hierarchical_dur, hierarchical_dist, kmeans_dur, kmeans_dist
def run_question_3():
    """Cluster the DATA_3108_URL table with k-means (15 clusters, 5 iterations) and plot it."""
    data_table = pro3_viz.load_data_table(DATA_3108_URL)
    data_table_cl = clusterize_data(data_table)
    cluster_list = pro3.kmeans_clustering(data_table_cl, 15, 5)
    # Single-argument print(...) yields the same output on Python 2 and 3.
    print("Displaying %s k-means clusters" % len(cluster_list))
    # Passing False instead of True draws simple filled-in circles;
    # True additionally draws the cluster centers.
    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)
def q2():
    """Plot a 15-cluster hierarchical clustering of the DATA_3108_URL table."""
    rows = viz.load_data_table(viz.DATA_3108_URL)

    def as_singleton(row):
        # Wrap one table row in a single-member cluster.
        return alg_cluster.Cluster(
            set([row[0]]), row[1], row[2], row[3], row[4])

    initial_clusters = [as_singleton(row) for row in rows]
    result = alg_project3.hierarchical_clustering(initial_clusters, 15)
    alg_clusters_matplotlib.plot_clusters(rows, result, True)
def compute_distortion():
    """Print the total cluster error of the clustering from alp.run_example().

    The error of every cluster is measured against the DATA_111_URL table and
    the sum is printed; nothing is returned.
    """
    data_table = alp.load_data_table(alp.DATA_111_URL)
    cluster_list = alp.run_example()
    list1 = [cluster.cluster_error(data_table) for cluster in cluster_list]
    # print(...) with one argument behaves the same on Python 2 and 3.
    print(sum(list1))
def q6():
    """Plot a 9-cluster, 5-iteration k-means clustering of the DATA_111_URL table."""
    rows = viz.load_data_table(viz.DATA_111_URL)

    def as_singleton(row):
        # Wrap one table row in a single-member cluster.
        return alg_cluster.Cluster(
            set([row[0]]), row[1], row[2], row[3], row[4])

    initial_clusters = [as_singleton(row) for row in rows]
    result = alg_project3.kmeans_clustering(initial_clusters, 9, 5)
    alg_clusters_matplotlib.plot_clusters(rows, result, True)
def question7(url, num_clusters):
    """Print the distortion of a 5-iteration k-means clustering.

    Args:
        url: location of the data table to load.
        num_clusters: number of clusters requested from k-means.
    """
    data_table = load_data_table(url)
    singleton_list = [
        Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]
    # Swap in hierarchical_clustering(singleton_list, num_clusters) here to
    # measure the hierarchical method instead.
    cluster_list = kmeans_clustering(singleton_list, num_clusters, 5)
    # print(...) with one argument behaves the same on Python 2 and 3.
    print(compute_distortion(cluster_list, data_table))
def quality_check(data_url, alg):
    """Collect distortion values for cluster counts 6 through 20.

    Args:
        data_url: location of the data table; also forwarded to
            get_clusters and compute_distortion.
        alg: algorithm selector forwarded to get_clusters.

    Returns:
        Tuple (x, y) of two lists: the cluster counts and the matching
        distortions.
    """
    table = load_data_table(data_url)
    counts = list(range(6, 21))
    distortions = [
        compute_distortion(
            get_clusters(data_url, count, alg, table), data_url, table)
        for count in counts
    ]
    return (counts, distortions)
def run_question_5():
    """Cluster the DATA_111_URL table hierarchically into 9 clusters and plot it.

    Returns:
        The list of clusters produced by pro3.hierarchical_clustering.
    """
    data_table = pro3_viz.load_data_table(DATA_111_URL)
    data_table_cl = clusterize_data(data_table)
    cluster_list = pro3.hierarchical_clustering(data_table_cl, 9)
    # Single-argument print(...) yields the same output on Python 2 and 3.
    print("Displaying %s hierarchical clusters" % len(cluster_list))
    # Passing False instead of True draws simple filled-in circles;
    # True additionally draws the cluster centers.
    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)
    return cluster_list
def question5_plot():
    """
    Generate the plot for question 5: a 9-cluster hierarchical clustering
    of the DATA_111_URL table
    """
    rows = load_data_table(DATA_111_URL)
    singletons = [
        Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in rows
    ]
    plot_clusters(rows, hierarchical_clustering(singletons, 9), True)
def run_question_7():
    """Print the distortion of 9-cluster hierarchical and k-means clusterings.

    Both methods run on the DATA_111_URL table; k-means uses 5 iterations.
    """
    data_table = pro3_viz.load_data_table(DATA_111_URL)
    # Separate inputs: hierarchical clustering may merge clusters in place,
    # so k-means must not be handed the same list afterwards.
    hierarchical_input = clusterize_data(data_table)
    kmeans_input = clusterize_data(data_table)

    hierarchical_cluster_list = pro3.hierarchical_clustering(
        hierarchical_input, 9)
    kmeans_cluster_list = pro3.kmeans_clustering(kmeans_input, 9, 5)

    # note that 'data_table' (not the clusterized list) goes to
    # compute_distortion. Single-argument print(...) gives the same output on
    # Python 2 and 3.
    print("hierarchical clustering: %s" %
          (compute_distortion(data_table, hierarchical_cluster_list),))
    print("k-means clustering: %s" %
          (compute_distortion(data_table, kmeans_cluster_list),))
def question3_plot():
    """
    Generate the plot for question 3: a 15-cluster, 5-iteration k-means
    clustering of the DATA_3108_URL table
    """
    rows = load_data_table(DATA_3108_URL)
    singletons = [
        Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in rows
    ]
    plot_clusters(rows, kmeans_clustering(singletons, 15, 5), True)
def q10():
    """Plot k-means vs. hierarchical distortion for 6..20 clusters.

    One figure is shown for each of DATA_111_URL, DATA_290_URL and
    DATA_896_URL; k-means uses 5 iterations.
    """
    nodes_list = {viz.DATA_111_URL:111, viz.DATA_290_URL:290, viz.DATA_896_URL:896}
    url_list = [viz.DATA_111_URL, viz.DATA_290_URL, viz.DATA_896_URL]
    cluster_range = list(range(6, 20 + 1))
    kmeans_dict = dict()
    hierarchical_dict = dict()

    for url in url_list:
        data_table = viz.load_data_table(url)
        singleton_list = [
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
            for line in data_table
        ]

        # kmeans - runs first, while singleton_list is still untouched
        kmeans_dict[url] = [
            compute_distortion(
                alg_project3.kmeans_clustering(singleton_list, cluster_count, 5),
                data_table)
            for cluster_count in cluster_range
        ]

        # hierarchical - go from 20 clusters down to 6, always continuing from
        # the clustering that was *returned*. Measuring the input list instead
        # is only correct when the implementation happens to mutate in place.
        hierarchical_errors = []
        cluster_list = singleton_list
        for count in reversed(cluster_range):
            cluster_list = alg_project3.hierarchical_clustering(cluster_list, count)
            hierarchical_errors.append(compute_distortion(cluster_list, data_table))
        hierarchical_errors.reverse()  # ascending, to line up with cluster_range
        hierarchical_dict[url] = hierarchical_errors

    for url in url_list:
        plt.title('Distortion for hierarchical and k-means clustering for '+str(nodes_list[url])+' points')
        plt.xlabel('Number of clusters')
        plt.ylabel('Distortion')
        line1, = plt.plot(cluster_range, kmeans_dict[url],'g')
        line2, = plt.plot(cluster_range, hierarchical_dict[url],'b')
        plt.legend((line1, line2), ('kmeans clustering', 'hierarchical clustering'))
        plt.show()

#q2()
#q2()
#q3()
#q5()
#q6()
#q7()
#q10()
def q7():
    """Print the distortion of 9-cluster hierarchical and k-means clusterings.

    Both run on the DATA_111_URL table; k-means uses 5 iterations.
    """
    data_table = viz.load_data_table(viz.DATA_111_URL)
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]

    # k-means goes first so it sees the pristine singleton list; the
    # hierarchical call afterwards is free to merge clusters in place.
    cluster_list = alg_project3.kmeans_clustering(singleton_list, 9, 5)
    error2 = compute_distortion(cluster_list, data_table)

    cluster_list = alg_project3.hierarchical_clustering(singleton_list, 9)
    error1 = compute_distortion(cluster_list, data_table)

    # Single-argument print(...) gives the same output on Python 2 and 3.
    print('hierarchical clustering %s' % (error1,))
    print('kmeans clustering %s' % (error2,))
def q10():
    """Plot k-means vs. hierarchical distortion for 6..20 clusters.

    One figure is shown for each of DATA_111_URL, DATA_290_URL and
    DATA_896_URL; k-means uses 5 iterations.
    """
    nodes_list = {
        viz.DATA_111_URL: 111,
        viz.DATA_290_URL: 290,
        viz.DATA_896_URL: 896
    }
    url_list = [viz.DATA_111_URL, viz.DATA_290_URL, viz.DATA_896_URL]
    cluster_range = list(range(6, 20 + 1))
    kmeans_dict = dict()
    hierarchical_dict = dict()

    for url in url_list:
        data_table = viz.load_data_table(url)
        singleton_list = [
            alg_cluster.Cluster(
                set([line[0]]), line[1], line[2], line[3], line[4])
            for line in data_table
        ]

        # kmeans - runs first, while singleton_list is still untouched
        kmeans_errors = []
        for cluster_count in cluster_range:
            cluster_list = alg_project3.kmeans_clustering(
                singleton_list, cluster_count, 5)
            kmeans_errors.append(compute_distortion(cluster_list, data_table))
        kmeans_dict[url] = kmeans_errors

        # hierarchical - step from 20 clusters down to 6, continuing from the
        # clustering the previous call returned. The input list is no longer
        # assumed to have been updated in place.
        hierarchical_errors = []
        cluster_list = singleton_list
        for count in range(20, 5, -1):
            cluster_list = alg_project3.hierarchical_clustering(
                cluster_list, count)
            hierarchical_errors.append(
                compute_distortion(cluster_list, data_table))
        hierarchical_errors.reverse()  # ascending, to match cluster_range
        hierarchical_dict[url] = hierarchical_errors

    for url in url_list:
        plt.title('Distortion for hierarchical and k-means clustering for ' +
                  str(nodes_list[url]) + ' points')
        plt.xlabel('Number of clusters')
        plt.ylabel('Distortion')
        line1, = plt.plot(cluster_range, kmeans_dict[url], 'g')
        line2, = plt.plot(cluster_range, hierarchical_dict[url], 'b')
        plt.legend((line1, line2),
                   ('kmeans clustering', 'hierarchical clustering'))
        plt.show()
def question7():
    """Print the distortion of 9-cluster hierarchical and k-means clusterings.

    Both run on the DATA_111_URL table; k-means uses 5 iterations.
    """
    data_table = alg_project3_viz.load_data_table(DATA_111_URL)

    def make_singletons():
        # One single-member cluster per table row.
        return [
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4])
            for line in data_table
        ]

    # Each method gets its own list: hierarchical clustering may merge
    # clusters in place, which would otherwise alter the k-means input.
    clusters_hierarchical = project3.hierarchical_clustering(
        make_singletons(), 9)
    clusters_kmeans = project3.kmeans_clustering(make_singletons(), 9, 5)

    distortion_hierarchical = compute_distortion(clusters_hierarchical,
                                                 data_table)
    distortion_kmeans = compute_distortion(clusters_kmeans, data_table)

    # Single-argument print(...) gives the same output on Python 2 and 3.
    # The double space reproduces the original label's trailing blank plus
    # the separator the print statement inserted.
    print("distortion hierarchical:  %s" % (distortion_hierarchical,))
    print("distortion k-means:  %s" % (distortion_kmeans,))
def q7():
    """Print the distortion of 9-cluster hierarchical and k-means clusterings.

    Both run on the DATA_111_URL table; k-means (5 iterations) is evaluated
    before the hierarchical method, on the same singleton list.
    """
    rows = viz.load_data_table(viz.DATA_111_URL)
    singletons = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in rows
    ]

    kmeans_error = compute_distortion(
        alg_project3.kmeans_clustering(singletons, 9, 5), rows)
    hierarchical_error = compute_distortion(
        alg_project3.hierarchical_clustering(singletons, 9), rows)

    print('hierarchical clustering', hierarchical_error)
    print('kmeans clustering', kmeans_error)
def question10_plot():
    """Plot hierarchical vs. k-means distortion for 6..20 clusters.

    One figure is shown for each of DATA_111_URL, DATA_290_URL and
    DATA_896_URL; k-means uses 5 iterations.
    """
    sizes = range(6, 21)
    datasets = zip([DATA_111_URL, DATA_290_URL, DATA_896_URL],
                   [111, 290, 896])

    for url, data_size in datasets:
        data_table = load_data_table(url)

        def make_singletons():
            # One single-member cluster per table row.
            return [
                Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
                for row in data_table
            ]

        # hierarchical clustering: merge the closest pair repeatedly and
        # record the distortion whenever the cluster count is one we plot.
        merged = make_singletons()
        distortion_hier = []
        while len(merged) > 6:
            closest = fast_closest_pair(merged)
            keep_idx = closest[1]
            drop_idx = closest[2]
            merged[keep_idx].merge_clusters(merged[drop_idx])
            merged.pop(drop_idx)
            if len(merged) in sizes:
                distortion_hier.append(compute_distortion(merged, data_table))

        # k-means: a fresh singleton list for every requested size
        distortion_kmeans = []
        for size in sizes:
            result = kmeans_clustering(make_singletons(), size, 5)
            distortion_kmeans.append(compute_distortion(result, data_table))

        # Hierarchical values were gathered from 20 clusters down to 6, so
        # they are reversed to line up with the ascending x axis.
        plt.plot(sizes, distortion_hier[::-1])
        plt.plot(sizes, distortion_kmeans)
        plt.legend(['hierarchical clustering',
                    'k-means clustering (5 iterations)'],
                   loc='upper right')
        plt.title('Distortion with %d county data' % (data_size))
        plt.xlabel('Size of Clusters')
        plt.ylabel('Distortion')
        plt.show()
def load_data(cancer_id):
    '''
    Load one cancer data table and wrap each row in a singleton cluster.

    Input: cancer_id - string inserted into the data file name
    ("unifiedCancerData_<cancer_id>.csv").
    Output: a tuple (data_table, singleton_list).
    '''
    base_url = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    table_url = (base_url + "data_clustering/unifiedCancerData_" + cancer_id +
                 ".csv")
    table = viz.load_data_table(table_url)
    singletons = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in table
    ]
    return (table, singletons)
def compute_distortion(cluster_list, data_table=None):
    '''
    Computes the total distortion of a list of clusters.

    Input: cluster_list - a list of clusters (objects providing
    cluster_error(data_table)).
    data_table - optional table the errors are measured against; when
    omitted, the unifiedCancerData_111 table is downloaded, which is what
    this function always did before the parameter existed.
    Output: total_distortion - the sum of the individual cluster errors
    (0 for an empty cluster_list).
    '''
    if data_table is None:
        DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
        DATA_111_URL = DIRECTORY + "data_clustering/unifiedCancerData_111.csv"
        data_table = viz.load_data_table(DATA_111_URL)

    # Collect the per-cluster errors, then add them up
    distortion = [cluster.cluster_error(data_table) for cluster in cluster_list]
    return sum(distortion)
def cluster_data():
    '''
    Load the unifiedCancerData_111 table and cluster it with both methods.

    Output: a tuple of two lists of clusters (hierarchical, kmeans), each
    with 9 clusters; k-means uses 5 iterations.
    '''
    DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    DATA_111_URL = DIRECTORY + "data_clustering/unifiedCancerData_111.csv"
    data_table = viz.load_data_table(DATA_111_URL)

    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]
    # k-means works on copies so it is unaffected by whatever the
    # hierarchical run does to the clusters it is given.
    singleton_list_copy = [cluster.copy() for cluster in singleton_list]

    return (project.hierarchical_clustering(singleton_list, 9),
            project.kmeans_clustering(singleton_list_copy, 9, 5))
def plot_Q6():
    """
    Load the unifiedCancerData_111 table, compute 9 clusters with
    5-iteration k-means and plot them with matplotlib
    """
    DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    DATA_111_URL = DIRECTORY + "data_clustering/unifiedCancerData_111.csv"
    data_table = viz.load_data_table(DATA_111_URL)

    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]
    cluster_list = project.kmeans_clustering(singleton_list, 9, 5)
    # The message used to say "hierarchical" although the clusters come from
    # k-means. Single-argument print(...) works on Python 2 and 3 alike.
    print("Displaying %s k-means clusters" % len(cluster_list))

    # draw the clusters using matplotlib
    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False)
def question10(data, filename):
    """Plot hierarchical vs. k-means distortion for 6..20 clusters and save it.

    Args:
        data: data-set identifier handed to load_data_table and
            Cluster.load_as_list; also interpolated into the plot title.
        filename: destination passed to plt.savefig.
    """
    table = load_data_table(data)
    clusters = Cluster.load_as_list(data)
    xs = range(6, 21)

    ys_hier = []

    def dist(clusters):
        # Callback given to hierarchical_clustering together with set(xs)
        # (presumably invoked at those cluster counts - verify against
        # hierarchical_clustering).
        ys_hier.append(distortion(clusters, table))

    hierarchical_clustering(clusters, 6, dist, set(xs))
    # Values were collected in the callback's order; flip to match xs.
    ys_hier.reverse()

    # Reload before k-means: the list handed to hierarchical_clustering may
    # have been merged down in place and no longer holds the original points.
    clusters = Cluster.load_as_list(data)
    ys_kmeans = [distortion(kmeans_clustering(clusters, x, 5), table)
                 for x in xs]

    plt.cla()
    plt.plot(xs, ys_hier, '-r', label='Hierarchical clustering distortion')
    plt.plot(xs, ys_kmeans, '-b', label='K-means clustering distortion')
    plt.title('Clustering distortion (%s)' % data)
    plt.xlabel('Number of output clusters')
    plt.ylabel('Distortion')
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig(filename)
    print('Saved plot to %s' % filename)
def gen_random_clusters():
    """Plot total cluster error for 6..20 clusters on the DATA_896_URL table.

    For each cluster count, alp.run_example(count, True) supplies the curve
    labelled 'hierarchical clustering' and alp.run_example(count, False) the
    one labelled 'kmeans clustering'.
    """
    table = alp.load_data_table(alp.DATA_896_URL)
    counts = []
    hierarchical_errors = []
    kmeans_errors = []

    for count in range(6, 21):
        # Both clusterings are produced before either is measured.
        hierarchical_clusters = alp.run_example(count, True)
        kmeans_clusters = alp.run_example(count, False)

        counts.append(count)
        hierarchical_errors.append(
            sum([c.cluster_error(table) for c in hierarchical_clusters]))
        kmeans_errors.append(
            sum([c.cluster_error(table) for c in kmeans_clusters]))

    plt.plot(counts, hierarchical_errors, '-r',
             label='hierarchical clustering')
    plt.plot(counts, kmeans_errors, '-b', label='kmeans clustering')
    plt.legend(loc='upper right')
    plt.title("Quality - Data Set of 896")
    plt.ylabel('Total error')
    plt.xlabel('Number of clusters')
    plt.show()
def compute_q5_q6():
    """Print k-means and hierarchical distortion (9 clusters) for DATA_111_URL."""
    rows = viz_tools.load_data_table(DATA_111_URL)

    # Wrap every row in a single-member cluster
    singletons = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in rows
    ]

    # Order matters: clustering.hierarchical_clustering mutates the list of
    # clusters, so k-means (5 iterations) has to run first.
    k_distortion = compute_distortion(
        clustering.kmeans_clustering(singletons, 9, 5), rows)
    print("K-means Distortion: {}".format(k_distortion))

    h_distortion = compute_distortion(
        clustering.hierarchical_clustering(singletons, 9), rows)
    print("Hierarchical Distortion: {}".format(h_distortion))
def run_example():
    """
    Question 7: cluster the DATA_111_URL table into 9 clusters with
    hierarchical clustering and with 5-iteration k-means, and print the
    number of clusters and the distortion of each result
    """
    data_table = load_data_table(DATA_111_URL)

    def make_singletons():
        # One single-member cluster per table row.
        return [
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4])
            for line in data_table
        ]

    # All print(...) calls take a single argument so the output is the same
    # on Python 2 and Python 3.
    print("___________")
    cluster_list = alg_project3_solution.hierarchical_clustering(
        make_singletons(), 9)
    print("Displaying %s hierarchical clusters" % len(cluster_list))
    print("with Distoration of {0}".format(
        compute_distortion(cluster_list, data_table)))

    print("___________")
    # k-means gets a fresh singleton list: the hierarchical run above may
    # have merged the clusters of its input in place.
    cluster_list = alg_project3_solution.kmeans_clustering(
        make_singletons(), 9, 5)
    print("Displaying %s k-means clusters" % len(cluster_list))
    print("with Distoration of {0}".format(
        compute_distortion(cluster_list, data_table)))
def test_compute_distortion():
    """Print k-means and hierarchical distortion (16 clusters) for DATA_290_URL.

    The values are meant to be compared by eye with the expected figures
    noted in the comments below.
    """
    rows = viz_tools.load_data_table(DATA_290_URL)

    # Wrap every row in a single-member cluster
    singletons = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in rows
    ]

    # Order matters: clustering.hierarchical_clustering mutates the list of
    # clusters, so k-means (5 iterations) has to run first.

    # Test 2: Expect 2.323×10^11
    k_distortion = compute_distortion(
        clustering.kmeans_clustering(singletons, 16, 5), rows)
    print("K-means Distortion: {}".format(k_distortion))

    # Test 1: Expect 2.575×10^11
    h_distortion = compute_distortion(
        clustering.hierarchical_clustering(singletons, 16), rows)
    print("Hierarchical Distortion: {}".format(h_distortion))
def compute_and_plot_distortions():
    """
    Compute the distortion of the list of clusters produced by hierarchical
    clustering and k-means clustering (using 5 iterations) on the 111, 290,
    and 896 county data sets, respectively, where the number of output
    clusters ranges from 6 to 20 (inclusive).

    Important note: To compute the distortion for all 15 output clusterings
    produced by hierarchical_clustering, you should remember that you can use
    the hierarchical cluster of size 20 to compute the hierarchical
    clustering of size 19 and so on. Otherwise, you will introduce an
    unnecessary factor of 15 into the computation of the 15 hierarchical
    clusterings.

    This function handles one data set per run (selected below), prints each
    distortion as it is computed and finally shows the comparison plot.
    """
    # choose data set:
    # data_table = viz.load_data_table(viz.DATA_111_URL)
    # data_table = viz.load_data_table(viz.DATA_290_URL)
    data_table = viz.load_data_table(viz.DATA_896_URL)

    def make_singletons():
        # One single-member cluster per table row.
        return [
            c.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
            for line in data_table
        ]

    num_output_clusters = []
    kmeans_distortion = []
    hierarchical_distortion = []

    # Every print(...) takes a single argument so the output is identical on
    # Python 2 and Python 3.
    print("\nComputing kmeans distortions")
    # This loop could be optimized to reuse computations, but that is not
    # necessary because k-means is so fast.
    for indx in range(6, 21):
        num_output_clusters.append(indx)
        kmeans_cluster_list = p.kmeans_clustering(make_singletons(), indx, 5)
        distortion = compute_distortion(kmeans_cluster_list, data_table)
        kmeans_distortion.append(distortion)
        print("%s %s" % (indx, distortion))
    print("Computed kmeans distortions")
    print("")

    print("Computing hierarchical distortions")
    # Start from a brand-new singleton list. Appending to the list left over
    # from the k-means loop would put every row into the input twice.
    hierarchical_cluster_list = make_singletons()
    for indx in range(20, 5, -1):
        # Reuse the previous (larger) clustering as the starting point.
        hierarchical_cluster_list = p.hierarchical_clustering(
            hierarchical_cluster_list, indx)
        distortion = compute_distortion(hierarchical_cluster_list, data_table)
        hierarchical_distortion.append(distortion)
        print("%s %s" % (indx, distortion))
    hierarchical_distortion.reverse()  # ascending, like num_output_clusters
    print("Computed hierarchical distortions\n")

    print("Plotting data")
    plt.plot(num_output_clusters, kmeans_distortion,
             label="K-means clustering")
    plt.plot(num_output_clusters, hierarchical_distortion,
             label="Hierarchical clustering")
    plt.xlabel("Number of output clusters")
    plt.ylabel('Distortion')
    # draw the legend
    plt.legend()
    plt.title(
        "Comparison of distortion of two clustering methods \n Dataset: 896 counties"
    )
    # make everything visible
    plt.show()
import alg_project3_viz
import proj3_solution
import alg_cluster
import matplotlib.pyplot as plt

DATA_URL = alg_project3_viz.DATA_896_URL
DATA_TABLE = alg_project3_viz.load_data_table(DATA_URL)


def compute_distortion(cluster_list):
    """
    Return the distortion of cluster_list: the sum of every cluster's
    cluster_error measured against the module-level DATA_TABLE.
    """
    running_total = 0
    for member in cluster_list:
        running_total = running_total + member.cluster_error(DATA_TABLE)
    return running_total


def compute_plot(x_list, y1_list, y2_list):
    """
    Draw the distortion curves of two clustering methods on one figure:
    y1_list is labelled as hierarchical clustering, y2_list as k-means.
    The figure is configured but not shown here.
    """
    plt.figure(figsize=(12, 8), dpi=80)
    curves = (
        (y1_list, '-b', 'hierarchical clustering'),
        (y2_list, '-r', 'k-means clustering(5 iterations)'),
    )
    for values, line_format, caption in curves:
        plt.plot(x_list, values, line_format, label=caption)
    plt.xlabel('The number of output clusters')
    plt.ylabel('The distortion produced by the clustering methods')
    plt.title('Comparison of Distortion by two clustering methods on 896 counties')
    plt.legend(loc='upper right', prop={'size': 13.5})
def question10():
    """Plot hierarchical vs. k-means distortion for 6..20 clusters.

    All distortions for DATA_111_URL, DATA_290_URL and DATA_896_URL are
    computed first (k-means with 5 iterations); then one figure per data set
    is shown, in that order.
    """
    clusters = list(range(6, 21))
    datasets = [(111, DATA_111_URL), (290, DATA_290_URL), (896, DATA_896_URL)]

    results = []
    for num_points, url in datasets:
        data_table = alg_project3_viz.load_data_table(url)

        # k-means works on its own cluster list so it never sees clusters
        # that the hierarchical run merged in place.
        kmeans_input = compute_data_table(data_table)
        distortion_k = [
            compute_distortion(
                project3.kmeans_clustering(kmeans_input, idx, 5), data_table)
            for idx in clusters
        ]

        # Hierarchical: go from 20 clusters down to 6, continuing from the
        # previous result, instead of asking the same (possibly already
        # merged) list for an increasing number of clusters.
        hierarchical = compute_data_table(data_table)
        distortion_h = []
        for idx in reversed(clusters):
            hierarchical = project3.hierarchical_clustering(hierarchical, idx)
            distortion_h.append(compute_distortion(hierarchical, data_table))
        distortion_h.reverse()  # ascending, to match `clusters`

        results.append((num_points, distortion_h, distortion_k))

    for num_points, distortion_h, distortion_k in results:
        plt.plot(clusters, distortion_h, '-b', label='hierarchical')
        plt.plot(clusters, distortion_k, '-r', label='k-means')
        plt.title('Distortion for %d points' % num_points)
        plt.legend(loc='upper right')
        plt.xlabel('Number of clusters')
        plt.ylabel('Distortion')
        plt.show()
def create_cluster_list(url):
    """Load the table at url and wrap each row in a single-member cluster.

    Returns:
        Tuple (singleton_list, data_table).
    """
    table = viz.load_data_table(url)
    singletons = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in table
    ]
    return singletons, table
def question5(filename):
    """Visualize a 9-cluster hierarchical clustering and print its distortion.

    Args:
        filename: forwarded to visualize() as its second argument.
    """
    data = 'data/unifiedCancerData_111.csv'

    def cluster(points):
        return hierarchical_clustering(points, 9)

    # visualize() runs first, then the table is loaded for the distortion.
    clusters = visualize(data, filename, cluster)
    table = load_data_table(data)
    dist = distortion(clusters, table)
    print('Distortion in question5, hierarchical_clustering = %f (%s)' %
          (dist, dist))
def question6(filename):
    """Visualize a 9-cluster, 5-iteration k-means clustering and print its distortion.

    Args:
        filename: forwarded to visualize() as its second argument.
    """
    data = 'data/unifiedCancerData_111.csv'

    def cluster(points):
        return kmeans_clustering(points, 9, 5)

    # visualize() runs first, then the table is loaded for the distortion.
    clusters = visualize(data, filename, cluster)
    table = load_data_table(data)
    dist = distortion(clusters, table)
    print('Distortion in question6, kmeans = %f (%s)' % (dist, dist))
""" Assignment 3 Question 7 Answer """ import alg_project3_viz as viz import alg_project3_solution as sol import alg_cluster data_table = viz.load_data_table(viz.DATA_111_URL) hier_data_list = sol.make_data_list(data_table) kmeans_data_list = sol.make_data_list(data_table) hier_cluster_list = sol.hierarchical_clustering(hier_data_list, 9) kmeans_cluster_list = sol.kmeans_clustering(kmeans_data_list, 9, 5) print("hierarchical:", sol.compute_distortion(hier_cluster_list, data_table)) print("kmeans:", sol.compute_distortion(kmeans_cluster_list, data_table)) # Hierarchical: 175163886915.8305 or 1.752 x 10^11 with four significant figures # K-means: 271254226924.20047 or 2.712 x 10^11
# Copyright 2016 by Soros Liu
#
# All Rights Reserved
"""
Compute the distortion of a clustering and, when run as a script, print it
for a 16-cluster hierarchical clustering of unifiedCancerData_290.csv.
"""
import project_closest_pair_clustering as provided
import alg_project3_viz as viz
import alg_cluster

__author__ = 'Soros Liu'


def compute_distortion(cluster_list, data_table):
    """Return the sum of cluster_error(data_table) over all clusters.

    Args:
        cluster_list: iterable of clusters providing cluster_error().
        data_table: table the errors are measured against.

    Returns:
        The total distortion as a float (0.0 for an empty cluster_list).
    """
    distortion = 0.0
    for cluster in cluster_list:
        distortion += cluster.cluster_error(data_table)
    return distortion


if __name__ == '__main__':
    data_table = viz.load_data_table('unifiedCancerData_290.csv')
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]
    cluster_list = provided.hierarchical_clustering(singleton_list, 16)
    # print(...) with one argument behaves the same on Python 2 and 3.
    print(compute_distortion(cluster_list, data_table))
    # To measure k-means instead, cluster with
    # provided.kmeans_clustering(singleton_list, 16, 5) before the
    # hierarchical call above.
def q10(plot_key):
    """Plot k-means vs. hierarchical distortion for 20 down to 6 clusters.

    Args:
        plot_key: 111, 290 or 896, selecting the data set. Any other value
            produces an empty, titled figure, as before.

    Only the selected data set is loaded and clustered; k-means uses 5
    iterations.
    """
    # (plot_key, data URL, k-means line format, hierarchical line format)
    plot_configs = (
        (111, DATA_111_URL, '-bo', '-co'),
        (290, DATA_290_URL, '-go', '-mo'),
        (896, DATA_896_URL, '-ro', '-yo'),
    )
    selected = None
    for config in plot_configs:
        if plot_key == config[0]:
            selected = config
            break

    x = list(range(20, 5, -1))
    distortion_k, distortion_h = [], []

    if selected is not None:
        num_points, data_url, kmeans_format, hierarchical_format = selected
        table = viz_tools.load_data_table(data_url)

        # Create cluster function
        create_cluster = lambda line: alg_cluster.Cluster(
            set([line[0]]), line[1], line[2], line[3], line[4])

        # Separate inputs so the hierarchical run cannot disturb k-means.
        klist = [create_cluster(line) for line in table]
        hlist = [create_cluster(line) for line in table]

        # Calculate distortion lists, from 20 clusters down to 6
        for num in x:
            kmeans_cluster = clustering.kmeans_clustering(klist, num, 5)
            h_cluster = clustering.hierarchical_clustering(hlist, num)
            distortion_k.append(compute_distortion(kmeans_cluster, table))
            distortion_h.append(compute_distortion(h_cluster, table))

    # Plot results
    plt.figure('Distortion for Different Clustering Methods')
    plt.title('Distortion for Different Clustering Methods: {} Points'.format(
        plot_key))
    plt.xlabel('Number of Clusters')
    plt.ylabel('Distortion')
    if selected is not None:
        plt.plot(x, distortion_k, kmeans_format, markersize=1,
                 label='K-means ({})'.format(num_points))
        plt.plot(x, distortion_h, hierarchical_format, markersize=1,
                 label='Hierarchical ({})'.format(num_points))
    plt.legend(loc='best')
    plt.show()
# -*- Mode: Python -*-
# Author: Soros Liu <*****@*****.**>
# ==================================================================================================
# Copyright 2016 by Soros Liu
#
# All Rights Reserved
"""
Compute the distortion of a clustering and, when run as a script, print it
for a 16-cluster hierarchical clustering of unifiedCancerData_290.csv.
"""
import project_closest_pair_clustering as provided
import alg_project3_viz as viz
import alg_cluster

__author__ = 'Soros Liu'


def compute_distortion(cluster_list, data_table):
    """Return the sum of cluster_error(data_table) over all clusters.

    Args:
        cluster_list: iterable of clusters providing cluster_error().
        data_table: table the errors are measured against.

    Returns:
        The total distortion as a float (0.0 for an empty cluster_list).
    """
    distortion = 0.0
    for cluster in cluster_list:
        distortion += cluster.cluster_error(data_table)
    return distortion


if __name__ == '__main__':
    data_table = viz.load_data_table('unifiedCancerData_290.csv')
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]
    cluster_list = provided.hierarchical_clustering(singleton_list, 16)
    # print(...) with one argument behaves the same on Python 2 and 3.
    print(compute_distortion(cluster_list, data_table))
    # To measure k-means instead, cluster with
    # provided.kmeans_clustering(singleton_list, 16, 5) before the
    # hierarchical call above.
from alg_project3_viz import load_data_table, DATA_896_URL, DATA_290_URL, DATA_111_URL, DATA_3108_URL
from clustering_algorithms import *
from cluster import *
import matplotlib.pyplot as plt

# All four data tables are loaded once, at import time.
DATA_TABLE_3108 = load_data_table(DATA_3108_URL)
DATA_TABLE_896 = load_data_table(DATA_896_URL)
DATA_TABLE_290 = load_data_table(DATA_290_URL)
DATA_TABLE_111 = load_data_table(DATA_111_URL)
# The three smaller tables, in ascending size order (3108 is not included).
data_table_list = [DATA_TABLE_111, DATA_TABLE_290, DATA_TABLE_896]


def compute_distortion(cluster_list, data_table):
    """
    compute distortion, sum of cluster errors in each cluster of cluster_list

    cluster_list: clusters providing cluster_error(data_table)
    data_table: table the errors are measured against
    returns: the summed error (0 for an empty cluster_list)
    """
    return sum([cluster.cluster_error(data_table) for cluster in cluster_list])


def create_singleton_list(data_table):
    """
    create initial cluster_list where each cluster contains exactly one
    row of data_table

    returns: list of Cluster objects, one per row, in table order
    """
    singleton_list = []
    for line in data_table:
        # Columns 0..4 of the row are passed positionally to Cluster, with
        # column 0 wrapped in a one-element set.
        singleton_list.append(Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))
    return singleton_list


def distortion_analysis(clustering_func, data_table, min_cluster_num, max_cluster_num):
    # NOTE(review): this function appears to be cut off in the visible
    # source - `ans` is never filled or returned and min_cluster_num /
    # max_cluster_num are unused here. Confirm against the full file.
    ans = []
    if clustering_func == hierarchical_clustering:
        # The hierarchical branch starts from a fresh singleton list.
        cluster_list = create_singleton_list(data_table)
def a3q10_distortion(data_table):
    """
    Collect distortion values for every cluster count from 6 through 20.

    data_table -- table forwarded unchanged to distortion()

    Returns a tuple (hierarchical_distortion, kmean_distortion) of two lists,
    each holding one value per cluster count in range(6, 21), in increasing
    order of cluster count.
    """
    kmean_distortion = []
    hierarchical_distortion = []
    for num in range(6, 21):
        # distortion() is evaluated once per cluster count and both of its
        # components are reused; this code stores element 0 as the k-means
        # value and element 1 as the hierarchical value.
        result = distortion(data_table, num)
        kmean_distortion.append(result[0])
        hierarchical_distortion.append(result[1])
    return (hierarchical_distortion, kmean_distortion)


def a3q10_plot(x, y1, y2, data):
    """
    Show a two-curve distortion plot with a legend.

    x    -- x values shared by both curves
    y1   -- values drawn in blue and labelled 'Hierarchical clustering'
    y2   -- values drawn in red and labelled 'KMeans clustering'
    data -- identifier of the data set; str(data) is embedded in the title

    Displays the figure with plt.show(); returns None.
    """
    plt.title("Distortion of Hierarchical/KMeans of " + str(data) + " data")
    plt.xlabel("number of initial clusters")
    plt.ylabel("distortion")
    plt.plot(x, y1, '-b', label='Hierarchical clustering')
    plt.plot(x, y2, '-r', label='KMeans clustering')
    plt.legend(loc='upper right')
    plt.show()


data = 896
data_table = viz.load_data_table(viz.DATA_896_URL)
x = list(range(6, 21))
# Run the sweep once and unpack both series, instead of repeating the whole
# sweep for each series.
y1, y2 = a3q10_distortion(data_table)
a3q10_plot(x, y1, y2, data)
def question6(filename):
    """
    Report the k-means distortion for 'unifiedCancerData_111.csv'.

    filename -- forwarded as the second argument of visualize()

    visualize() is given the data file name, filename and a callback that
    runs kmeans_clustering(<its argument>, 9, 5). Whatever visualize()
    returns is passed, together with the loaded data table, to distortion(),
    and the resulting value is printed. Returns None.
    """
    source = 'unifiedCancerData_111.csv'

    def run_kmeans(cluster_list):
        return kmeans_clustering(cluster_list, 9, 5)

    # visualize() runs before the table is loaded, as in the original
    # argument evaluation order.
    clustered = visualize(source, filename, run_kmeans)
    table = load_data_table(source)
    dist = distortion(clustered, table)
    print('Distortion in question6, kmeans = %f (%s)' % (dist, dist))
import alg_project3_viz
import proj3_solution
import alg_cluster
import matplotlib.pyplot as plt

# The data table is loaded once, at import time, from DATA_896_URL.
DATA_URL = alg_project3_viz.DATA_896_URL
DATA_TABLE = alg_project3_viz.load_data_table(DATA_URL)


def compute_distortion(cluster_list):
    """
    Return the distortion of cluster_list: the sum of
    cluster.cluster_error(DATA_TABLE) over every cluster, measured against
    the module-level DATA_TABLE (0 when cluster_list is empty).
    """
    return sum(cluster.cluster_error(DATA_TABLE) for cluster in cluster_list)


def compute_plot(x_list, y1_list, y2_list):
    """
    Draw the distortion curves of the two clustering methods on one figure.

    x_list  -- x values shared by both curves
    y1_list -- values drawn in blue, labelled 'hierarchical clustering'
    y2_list -- values drawn in red, labelled
               'k-means clustering(5 iterations)'

    Creates a new 12x8 figure at 80 dpi and sets axis labels, title and
    legend on it.
    """
    plt.figure(figsize=(12, 8), dpi=80)

    curves = (
        (y1_list, '-b', 'hierarchical clustering'),
        (y2_list, '-r', 'k-means clustering(5 iterations)'),
    )
    for y_values, line_format, curve_label in curves:
        plt.plot(x_list, y_values, line_format, label=curve_label)

    plt.xlabel('The number of output clusters')
    plt.ylabel('The distortion produced by the clustering methods')
    plt.title(
        'Comparison of Distortion by two clustering methods on 896 counties')
    plt.legend(loc='upper right', prop={'size': 13.5})
def _build_singleton_clusters(data_table):
    """Return a fresh list with one single-member Cluster per row of data_table."""
    return [
        Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]


def test_distortion():
    """
    Plot k-means versus hierarchical clustering distortion for three data sets.

    For each of DATA_111_URL, DATA_290_URL and DATA_896_URL the table is
    loaded, the distortion (via compute_distortion) is computed for 6 through
    20 output clusters with both clustering methods, and the two curves are
    shown in one pyplot figure titled with the data set size.

    Takes no arguments and returns None; each figure is displayed with
    pyplot.show().
    """
    datasets = (
        ('111', DATA_111_URL),
        ('290', DATA_290_URL),
        ('896', DATA_896_URL),
    )
    ascending_cluster_range = range(6, 21)

    for title, url in datasets:
        data_table = alg_project3_viz.load_data_table(url)

        # k-means: one independent run per cluster count, 5 iterations each.
        # Lists are built eagerly: on Python 3 a bare map() is a lazy
        # iterator, which cannot be plotted and would only evaluate its
        # lambda after singleton_list had been rebound below.
        singleton_list = _build_singleton_clusters(data_table)
        cluster_lists = [
            closestpair.kmeans_clustering(singleton_list, cluster_count, 5)
            for cluster_count in ascending_cluster_range
        ]
        kmeans_cluster_distortion = [
            compute_distortion(cluster_list, data_table)
            for cluster_list in cluster_lists
        ]

        # Hierarchical: start from a fresh singleton list (presumably because
        # clustering may modify its input -- confirm) and walk from 20 down
        # to 6 clusters, feeding each result into the next call so earlier
        # merges are reused rather than recomputed.
        previous_clusters = _build_singleton_clusters(data_table)
        hierarchical_cluster_distortion = []
        for num_clusters in range(20, 5, -1):
            new_clusters = closestpair.hierarchical_clustering(
                previous_clusters, num_clusters)
            hierarchical_cluster_distortion.append(
                compute_distortion(new_clusters, data_table))
            previous_clusters = new_clusters
        # Values were collected for 20..6; flip them to match the 6..20 axis.
        hierarchical_cluster_distortion.reverse()

        pyplot.plot(ascending_cluster_range, hierarchical_cluster_distortion,
                    '-b', label='hierarchical clustering')
        pyplot.plot(ascending_cluster_range, kmeans_cluster_distortion,
                    '-r', label='k-means clustering')
        pyplot.legend(loc='upper right')
        pyplot.ylabel('cluster distortion')
        pyplot.xlabel('number of clusters in graph')
        pyplot.title('distortion from hierarchical and k-means clustering: '
                     + title + ' counties')
        pyplot.show()