def question10(data, filename):
    """
    Plot hierarchical versus k-means clustering distortion for 6 to 20
    output clusters and save the figure.

    data -- data-set identifier handed to load_data_table / load_as_list
    filename -- destination passed to plt.savefig
    """
    table = load_data_table(data)
    cluster_counts = range(6, 21)
    hier_distortions = []

    def record(current_clusters):
        # Callback given to hierarchical_clustering; presumably it is
        # invoked once per size in the set passed below -- confirm against
        # that helper's implementation.
        hier_distortions.append(distortion(current_clusters, table))

    hierarchical_clustering(load_as_list(data), 6, record, set(cluster_counts))
    # Reversed so the recorded values line up with the ascending x axis.
    hier_distortions.reverse()

    # k-means starts from a freshly loaded cluster list.
    kmeans_input = load_as_list(data)
    kmeans_distortions = []
    for count in cluster_counts:
        clustered = kmeans_clustering(kmeans_input, count, 5)
        kmeans_distortions.append(distortion(clustered, table))

    plt.cla()
    plt.plot(cluster_counts, hier_distortions, '-r',
             label='Hierarchical clustering distortion')
    plt.plot(cluster_counts, kmeans_distortions, '-b',
             label='k-means clustering distortion')
    plt.title('Clustering distortion (%s)' % data)
    plt.xlabel('Number of output clusters')
    plt.ylabel('Distortion')
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig(filename)
    print('Saved plot to %s' % filename)
def run_example(): """ Load a data table, compute a list of clusters and plot a list of clusters Set DESKTOP = True/False to use either matplotlib or simplegui """ data_table = load_data_table(DATA_111_URL) singleton_list = [] for line in data_table: singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) #cluster_list = sequential_clustering(singleton_list, 15) #print "Displaying", len(cluster_list), "sequential clusters" #cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 9) #print "Displaying", len(cluster_list), "hierarchical clusters" cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 9, 5) print "Displaying", len(cluster_list), "k-means clusters" print "Distortion", alg_project3_solution.compute_distortion(cluster_list, data_table) # draw the clusters using matplotlib or simplegui if DESKTOP: #alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False) alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True) #add cluster centers else: alg_clusters_simplegui.PlotClusters(data_table, cluster_list) # use toggle in GUI to add cluster centers
def clustering_distortion(data_url, cluster_method):
    """Return the distortions for 6 through 20 output clusters.

    Input: a data_url for the cancer data and a clustering method, either
    des.kmeans_clustering or des.hierarchical_clustering.
    Output: a list of distortions ordered from 6 clusters up to 20, or the
    string "Invalid cluster_method" for any other method.
    """
    cluster_list = des.cluster_lst(data_url)

    if cluster_method == des.kmeans_clustering:
        return [
            compute_distortion(
                des.kmeans_clustering(cluster_list, num_clstr, 5), data_url)
            for num_clstr in range(6, 21)
        ]

    if cluster_method != des.hierarchical_clustering:
        return "Invalid cluster_method"

    # Reduce to 20 clusters once, then derive every smaller clustering from
    # that 20-cluster list instead of from the full input.
    base_clusters = des.hierarchical_clustering(cluster_list, 20)
    descending = [compute_distortion(base_clusters, data_url)]
    for num_clstr in range(19, 5, -1):
        reduced = des.hierarchical_clustering(base_clusters, num_clstr)
        descending.append(compute_distortion(reduced, data_url))
    # Collected for 20 down to 6; flip to ascending order.
    return descending[::-1]
def run_example(): """ Load a data table, compute a list of clusters and plot a list of clusters Set DESKTOP = True/False to use either matplotlib or simplegui """ data_table = load_data_table(DATA_290_URL) singleton_list = [] for line in data_table: singleton_list.append( alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) #cluster_list = sequential_clustering(singleton_list, 15) #print "Displaying", len(cluster_list), "sequential clusters" cluster_list = alg_project3_solution.hierarchical_clustering( list(singleton_list), 16) print "Displaying", len(cluster_list), "hierarchical clusters" print "Distortion", compute_distortion(cluster_list, data_table) singleton_list = [] for line in data_table: singleton_list.append( alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) cluster_list2 = alg_project3_solution.kmeans_clustering( list(singleton_list), 16, 5) print "Displaying", len(cluster_list2), "k-means clusters" print "Distortion", compute_distortion(cluster_list2, data_table)
def test_four_pairs(self):
    """
    Hierarchical and k-means clustering must split two well-separated
    groups of four points into the same two clusters.
    """
    rows = [
        (set([1]), 0, 0, 13, 0.1),
        (set([2]), 0, 0, 14, 0.2),
        (set([3]), 0, 0, 14, 0.2),
        (set([4]), 0, 0, 14, 0.2),
        (set([10]), 10, 10, 14, 0.2),
        (set([11]), 10, 10, 14, 0.2),
        (set([12]), 10, 10, 14, 0.2),
        (set([13]), 10, 10, 14, 0.2),
    ]

    def fresh_clusters():
        # Each algorithm gets its own Cluster objects and its own FIPS sets,
        # so any in-place merging done by one run cannot affect the other.
        return [Cluster(set(fips), horiz, vert, population, risk)
                for fips, horiz, vert, population, risk in rows]

    def grouping(cluster_list):
        # Sort the codes inside each cluster and then the clusters
        # themselves, so the comparison ignores set iteration order.
        return sorted(sorted(cluster.fips_codes())
                      for cluster in cluster_list)

    h = grouping(hierarchical_clustering(fresh_clusters(), 2))
    k = grouping(kmeans_clustering(fresh_clusters(), 2, 10))
    self.assertEqual(h, k)
def test_four_pairs(self):
    """
    Hierarchical and k-means clustering must split two well-separated
    groups of four points into the same two clusters.
    """
    rows = [
        (set([1]), 0, 0, 13, 0.1),
        (set([2]), 0, 0, 14, 0.2),
        (set([3]), 0, 0, 14, 0.2),
        (set([4]), 0, 0, 14, 0.2),
        (set([10]), 10, 10, 14, 0.2),
        (set([11]), 10, 10, 14, 0.2),
        (set([12]), 10, 10, 14, 0.2),
        (set([13]), 10, 10, 14, 0.2),
    ]

    def fresh_clusters():
        # Each algorithm gets its own Cluster objects and its own FIPS sets,
        # so any in-place merging done by one run cannot affect the other.
        return [Cluster(set(fips), horiz, vert, population, risk)
                for fips, horiz, vert, population, risk in rows]

    def grouping(cluster_list):
        # Sort the codes inside each cluster and then the clusters
        # themselves, so the comparison ignores set iteration order.
        return sorted(sorted(cluster.fips_codes())
                      for cluster in cluster_list)

    h = grouping(hierarchical_clustering(fresh_clusters(), 2))
    k = grouping(kmeans_clustering(fresh_clusters(), 2, 10))
    self.assertEqual(h, k)
def q10_legend(DATA_URL):
    """
    Plot, with simpleplot, the distortion of hierarchical and k-means
    clusterings of the table at DATA_URL for 20 down to 6 output clusters.
    """
    data_table = load_data_table(DATA_URL)

    def singleton(row):
        return alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3],
                                   row[4])

    # Two independent sets of singleton clusters: one for k-means, one that
    # the hierarchical sweep keeps shrinking.
    singleton_list = [singleton(row) for row in data_table]
    hierarchical_cluster_list = [singleton(row) for row in data_table]

    hierarchical_curve = []
    kmeans_curve = []
    for num_clusters in range(20, 5, -1):
        # Each hierarchical step starts from the previous step's output.
        hierarchical_cluster_list = alg_project3_solution.hierarchical_clustering(
            hierarchical_cluster_list, num_clusters)
        hierarchical_curve.append(
            [num_clusters,
             compute_distortion(hierarchical_cluster_list, data_table)])

        kmeans_result = alg_project3_solution.kmeans_clustering(
            singleton_list, num_clusters, 5)
        kmeans_curve.append(
            [num_clusters, compute_distortion(kmeans_result, data_table)])

    plot_title = ("The distortion of output clusters uesd " +
                  str(len(data_table)) + "-county data set")
    simpleplot.plot_lines(
        plot_title, 800, 600,
        "the number of output clusters",
        "the distortion associated with each output clustering",
        [hierarchical_curve, kmeans_curve], True,
        ["hierarchical cluster", "kmeans cluster"])
def q10(): sizes = xrange(6,21) data_file = open('unifiedCancerData_896.csv','r') data = data_file.read() data_lines = data.split('\n') data_tokens = [line.split(',') for line in data_lines] data_table = [[tokens[0], float(tokens[1]), float(tokens[2]), int(tokens[3]), float(tokens[4])] for tokens in data_tokens] singleton_list = [] singleton_list1 = [] t1 = [] t2 = [] for item in sizes: singleton_list = [] singleton_list1 = [] for line in data_table: singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) singleton_list1.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, item) t1.append(compute_distortion(cluster_list,data_table)) cluster_list1 = alg_project3_solution.kmeans_clustering(singleton_list1, item, 5) t2.append(compute_distortion(cluster_list1,data_table)) print t1 print t2 plt.plot(sizes,t1,'r-',label='hierarchical_clustering') plt.plot(sizes,t2,'b-',label='kmeans_clustering') plt.title('CancerData_896') plt.xlabel('Number of output clusters') plt.ylabel('Distortion associated with each output clustering') plt.legend(loc='upper right') plt.show()
def run_example(): """ Load a data table, compute a list of clusters and plot a list of clusters Set DESKTOP = True/False to use either matplotlib or simplegui """ # data_table = load_data_table(DATA_3108_URL) print 'in run_example' k_n=[] h_n=[] for x in range(6,21): print '------>:',x,'<-----\n' # kmeans data_table=load_data_table(DATA_111_URL) singleton_list=[] for line in data_table: singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) cluster_list_k = alg_project3_solution.kmeans_clustering(singleton_list, x, 5) kmeans=reduce(lambda x,y:x+y,map(lambda x:x.cluster_error(data_table),cluster_list_k)) k_n.append(kmeans) #hierarchical data_table=load_data_table(DATA_111_URL) singleton_list=[] for line in data_table: singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) cluster_list_h = alg_project3_solution.hierarchical_clustering(singleton_list, x) hierarchical=reduce(lambda x,y:x+y,map(lambda x:x.cluster_error(data_table),cluster_list_h)) h_n.append(hierarchical) print 'kmean:',k_n print 'hierarchical:',h_n
def test02(self):
    """k-means on six unit-population points must return two clusters."""
    coordinates = [(0, 0), (0, 3), (1, 2), (2, 2), (3, 0), (3, 3)]
    points = []
    for index, (horiz, vert) in enumerate(coordinates):
        points.append(Cluster(set([index]), horiz, vert, 1, 0))
    cluster_list = kmeans_clustering(points, 2, 3)
    self.assertEqual(len(cluster_list), 2)
def test01(self):
    """k-means on four unit-population points must return two clusters."""
    coordinates = [(-4.0, 0.0), (0.0, -1.0), (0.0, 1.0), (4.0, 0.0)]
    points = []
    for index, (horiz, vert) in enumerate(coordinates):
        points.append(Cluster(set([index]), horiz, vert, 1, 0))
    cluster_list = kmeans_clustering(points, 2, 3)
    self.assertEqual(len(cluster_list), 2)
def cluster_by_kmeans(data_table_url,num_clusters,num_iterate):
    """
    Load the table at data_table_url, wrap every row in a singleton
    cluster and return the result of k-means clustering with the given
    number of clusters and iterations.
    """
    rows = load_data_table(data_table_url)
    singletons = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in rows
    ]
    return alg_project3_solution.kmeans_clustering(
        singletons, num_clusters, num_iterate)
def run_example():
    """
    Compute distortions of hierarchical and k-means (5 iterations)
    clusterings of the DATA_290 table for 6 to 20 output clusters.

    Returns [hierarchical_distortions, kmeans_distortions], each ordered
    from 6 clusters up to 20.
    """
    data_table = load_data_table_local(DATA_290)

    singleton_list = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]
    # Copies are taken before the hierarchical sweep so k-means gets its
    # own, untouched singleton clusters.
    singleton_list_copy = [singleton.copy() for singleton in singleton_list]

    # Sweep from 20 clusters down to 6; each step starts from the previous
    # step's output (the first step starts from the singletons).
    descending = []
    cluster_list = singleton_list
    for num_clusters in range(20, 5, -1):
        cluster_list = alg_project3_solution.hierarchical_clustering(
            cluster_list, num_clusters)
        descending.append(compute_distortion(cluster_list, data_table))
    hierarchical_distortions = descending[::-1]

    kmeans_distortions = []
    for num_clusters in range(6, 21):
        cluster_list = alg_project3_solution.kmeans_clustering(
            singleton_list_copy, num_clusters, 5)
        kmeans_distortions.append(compute_distortion(cluster_list, data_table))

    return [hierarchical_distortions, kmeans_distortions]
def question_seven():
    """Return labelled distortions of the 9-cluster hierarchical and
    k-means clusterings as a 4-tuple (label, value, label, value)."""
    # change url depending on desired data table
    data_url = des.DATA_111_URL
    clusters = des.cluster_lst(data_url)

    # k-means runs first on the shared cluster list, then hierarchical.
    kmeans_result = des.kmeans_clustering(clusters, 9, 5)
    hierarchical_result = des.hierarchical_clustering(clusters, 9)

    kmeans_dist = compute_distortion(kmeans_result, data_url)
    hierarchical_dist = compute_distortion(hierarchical_result, data_url)
    return ("hierarchical distortion =", hierarchical_dist,
            "kmeans distortion =", kmeans_dist)
def run_example(): """ Load a data table, compute a list of clusters and plot a list of clusters Set DESKTOP = True/False to use either matplotlib or simplegui """ #data_table = load_data_table(DATA_3108_URL) #data_table = load_data_table(DATA_111_URL) #data_table = load_data_table(DATA_290_URL) data_table = load_data_table(DATA_896_URL) singleton_list = [] for line in data_table: singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) hier = [] k_means = [] for num in range(6, 21): singleton_list_copy = [item.copy() for item in singleton_list] #cluster_list = sequential_clustering(singleton_list, 15) #print "Displaying", len(cluster_list), "sequential clusters" cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list_copy, num) hier.append(compute_distortion(cluster_list, data_table)) #print "Displaying", len(cluster_list), "hierarchical clusters" cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, num, 5) k_means.append(compute_distortion(cluster_list, data_table)) print hier print k_means plt.plot(range(6, 21), hier, label="hierarchical_clustering") plt.plot(range(6, 21), k_means, label="kmeans_clustering") plt.xlabel("Number of outcome clusters") plt.ylabel("Distrotion") plt.title("Distrotion with 896 counties") plt.legend() plt.show() #print "Displaying", len(cluster_list), "hierarchical clusters" #cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 9, 5) #print "k means", compute_distortion(cluster_list, data_table) #print "Displaying", len(cluster_list), "k-means clusters" #cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 9, 5) #print "Displaying", len(cluster_list), "k-means clusters" # draw the clusters using matplotlib or simplegui """
def compute_kmeans_distortions(cluster_list):
    """
    list -> list

    Runs k-means (5 iterations) on cluster_list once for every number of
    output clusters from 6 to 20 and returns the distortion of each
    result, in that order.

    NOTE(review): data_table is not a parameter; it is read from an
    enclosing/module scope that is not visible here.
    """
    return [
        sol.compute_distortion(
            sol.kmeans_clustering(cluster_list, num_clusters, 5), data_table)
        for num_clusters in range(6, 21)
    ]
def run_example():
    """
    Plot, with simpleplot, the distortion of k-means (5 iterations) and
    hierarchical clusterings of the DATA_896_URL table for 6 to 20 output
    clusters.
    """
    data_table = load_data_table(DATA_896_URL)
    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))

    kmeans = []
    for clusters_number in xrange(6, 21):
        cluster_list = alg_project3_solution.kmeans_clustering(
            singleton_list, clusters_number, 5)
        kmeans.append([
            clusters_number,
            0.0 + alg_project3_solution.compute_distortion(cluster_list,
                                                           data_table)
        ])

    # Sweep from 20 clusters down to 6, feeding each result into the next
    # step. Working on copies and chaining the returned list makes the
    # sweep independent of whether hierarchical_clustering mutates its
    # argument in place.
    hierarchical = []
    cluster_list = [cluster.copy() for cluster in singleton_list]
    for clusters_number in xrange(20, 5, -1):
        cluster_list = alg_project3_solution.hierarchical_clustering(
            cluster_list, clusters_number)
        hierarchical.append([
            clusters_number,
            0.0 + alg_project3_solution.compute_distortion(cluster_list,
                                                           data_table)
        ])
    # Collected for 20 down to 6; put the points in ascending order.
    hierarchical.reverse()

    simpleplot.plot_lines(
        "Distortion of the clusterings produced by hierarchical and k-means metods on 896 county data set",
        800, 600, "Number of clusters n [6 .. 20]", "Distortion",
        [hierarchical, kmeans], False,
        ["Hierarchical clustering", "k-means clustering with 5 iterations"])
def q7(): data_file = open('unifiedCancerData_111.csv','r') data = data_file.read() data_lines = data.split('\n') data_tokens = [line.split(',') for line in data_lines] data_table = [[tokens[0], float(tokens[1]), float(tokens[2]), int(tokens[3]), float(tokens[4])] for tokens in data_tokens] singleton_list = [] for line in data_table: singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) #cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 9) cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 9, 5) print compute_distortion(cluster_list,data_table)
def clustering(): title_list = ['111 counties', '290 counties', '896 counties'] url_list = [DATA_111_URL, DATA_290_URL, DATA_896_URL] distortion_hierarchical = [[], [], []] distortion_kmeans = [[], [], []] num_clusters_list = range(20, 5, -1) for idx in range(len(url_list)): data_table = load_data_table(url_list[idx]) cluster_list = [] for line in data_table: cluster_list.append( alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) cluster_list_copy = [cluster.copy() for cluster in cluster_list] for num_cluster in num_clusters_list: cluster_list = student.hierarchical_clustering( cluster_list, num_cluster) distortion = compute_distortion(cluster_list, data_table) distortion_hierarchical[idx].append(distortion) print "Displaying", len( cluster_list), "hierarchical clusters, distortion:", distortion for num_cluster in num_clusters_list: cluster_list = student.kmeans_clustering(cluster_list_copy, num_cluster, 5) distortion = compute_distortion(cluster_list, data_table) distortion_kmeans[idx].append(distortion) print "Displaying", len( cluster_list), "k-means clusters, distortion:", distortion plot_num = 131 + idx plt.subplot(plot_num) plt.plot(num_clusters_list, distortion_hierarchical[idx], "o-", label="hierarchical") plt.plot(num_clusters_list, distortion_kmeans[idx], "x-", label="kmeans") plt.legend() plt.ylabel('Distortion') plt.xlabel('Number of clusters') plt.grid(True) plt.title(title_list[idx]) plt.show()
def compute_distortion(cluster_list, data_table, out_size):
    """
    Cluster cluster_list into out_size clusters with both algorithms and
    return (hierarchical distortion, k-means distortion), where each
    distortion is the sum of cluster_error over the resulting clusters.
    """
    # note that hierarchical_clustering mutates cluster_list, so k-means is
    # given a shallow copy of the list and is run first
    kmeans_input = cluster_list[:]
    kmeans_result = cluster_algs.kmeans_clustering(kmeans_input, out_size, 5)
    hierarchical_result = cluster_algs.hierarchical_clustering(
        cluster_list, out_size)

    hierarchical_total = sum(
        cluster.cluster_error(data_table) for cluster in hierarchical_result)
    kmeans_total = sum(
        cluster.cluster_error(data_table) for cluster in kmeans_result)
    return (hierarchical_total, kmeans_total)
def run_example_two():
    """
    For 6 to 20 output clusters, compute the distortion of hierarchical
    and k-means (5 iterations) clusterings of the DATA_111_URL table and
    pass both {cluster count: distortion} dicts to plot_graphs.
    """
    data_table = load_data_table(DATA_111_URL)
    min_num_of_clusters = 6
    max_num_of_clusters = 20

    def build_singletons():
        # A new single-county cluster for every table row.
        return [
            alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
            for row in data_table
        ]

    kmeans_points = {}
    hierarchical_points = {}
    for num_of_clusters in range(min_num_of_clusters, max_num_of_clusters + 1):
        # generate the clusters, then calculate the distortion
        hierarchical_result = alg_project3_solution.hierarchical_clustering(
            build_singletons(), num_of_clusters)
        hierarchical_points[num_of_clusters] = compute_distortion(
            hierarchical_result, data_table)

        kmeans_result = alg_project3_solution.kmeans_clustering(
            build_singletons(), num_of_clusters, 5)
        kmeans_points[num_of_clusters] = compute_distortion(
            kmeans_result, data_table)

    plot_graphs(hierarchical_points, kmeans_points)
def quality_k(data_table, interval):
    """
    Input: loaded data table
           iterable with the numbers of output clusters
    Return: list with the distortion of the k-means clustering
            (5 iterations) for each requested number of clusters
    """
    singleton_list = []
    for row in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3],
                                row[4]))

    distortion = []
    for num_clstrs in interval:
        clustered = alg_project3_solution.kmeans_clustering(
            singleton_list, num_clstrs, 5)
        # Distortion is the sum of the errors of all output clusters.
        total = sum(clstr.cluster_error(data_table) for clstr in clustered)
        distortion.append(total)
    return distortion
def run_kmeans(): """ Load a data table, compute a list of clusters and plot a list of clusters Set DESKTOP = True/False to use either matplotlib or simplegui """ data_table = load_data_table(DATA_3108_URL) singleton_list = [] for line in data_table: singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 15, 5) print "Displaying", len(cluster_list), "k-means clusters" # draw the clusters using matplotlib or simplegui alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)
def test_kmeans(): """ Test for k-means clustering kmeans_clustering should not mutate cluster_list, but make a new copy of each test anyways """ # load small data table print print "Testing kmeans_clustering on 24 county set" data_24_table = load_data_table(DATA_24_URL) kmeansdata_24 = [[15, 1, set([('34017', '36061'), ('06037',), ('06059',), ('36047',), ('36081',), ('06071', '08031'), ('36059',), ('36005',), ('55079',), ('34013', '34039'), ('06075',), ('01073',), ('06029',), ('41051', '41067'), ('11001', '24510', '51013', '51760', '51840', '54009')])], [15, 3, set([('34017', '36061'), ('06037', '06059'), ('06071',), ('36047',), ('36081',), ('08031',), ('36059',), ('36005',), ('55079',), ('34013', '34039'), ('06075',), ('01073',), ('06029',), ('41051', '41067'), ('11001', '24510', '51013', '51760', '51840', '54009')])], [15, 5, set([('34017', '36061'), ('06037', '06059'), ('06071',), ('36047',), ('36081',), ('08031',), ('36059',), ('36005',), ('55079',), ('34013', '34039'), ('06075',), ('01073',), ('06029',), ('41051', '41067'), ('11001', '24510', '51013', '51760', '51840', '54009')])], [10, 1, set([('34017', '36061'), ('06029', '06037', '06075'), ('11001', '24510', '34013', '34039', '51013', '51760', '51840', '54009'), ('06059',), ('36047',), ('36081',), ('06071', '08031', '41051', '41067'), ('36059',), ('36005',), ('01073', '55079')])], [10, 3, set([('34013', '34017', '36061'), ('06029', '06037', '06075'), ('08031', '41051', '41067'), ('06059', '06071'), ('34039', '36047'), ('36081',), ('36059',), ('36005',), ('01073', '55079'), ('11001', '24510', '51013', '51760', '51840', '54009')])], [10, 5, set([('34013', '34017', '36061'), ('06029', '06037', '06075'), ('08031', '41051', '41067'), ('06059', '06071'), ('34039', '36047'), ('36081',), ('36059',), ('36005',), ('01073', '55079'), ('11001', '24510', '51013', '51760', '51840', '54009')])], [5, 1, set([('06029', '06037', '06075'), ('01073', '11001', '24510', '34013', '34017', '34039', '36047', '51013', '51760', 
'51840', '54009', '55079'), ('06059',), ('36005', '36059', '36061', '36081'), ('06071', '08031', '41051', '41067')])], [5, 3, set([('06029', '06037', '06075'), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013'), ('08031', '41051', '41067'), ('06059', '06071'), ('01073', '51760', '51840', '54009', '55079')])], [5, 5, set([('06029', '06037', '06075'), ('08031', '41051', '41067'), ('06059', '06071'), ('01073', '55079'), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])]] suite = poc_simpletest.TestSuite() for num_clusters, num_iterations, expected_county_tuple in kmeansdata_24: # build initial list of clusters for each test since mutation is allowed cluster_list = [] for idx in range(len(data_24_table)): line = data_24_table[idx] cluster_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) # compute student answer student_clustering = student.kmeans_clustering(cluster_list, num_clusters, num_iterations) student_county_tuple = set_of_county_tuples(student_clustering) # Prepare test error_message = "Testing kmeans_custering on 24 county table, num_clusters = " + str(num_clusters) error_message += " num_iterations = " + str(num_iterations) error_message += "\nStudent county tuples: " + str(student_county_tuple) error_message += "\nExpected county tuples: " + str(expected_county_tuple) suite.run_test(student_county_tuple == expected_county_tuple, True, error_message) suite.report_results()
def run_example():
    """
    Plot, with simpleplot, the distortion of k-means (5 iterations) and
    hierarchical clusterings of the DATA_896_URL table for 6 to 20 output
    clusters.
    """
    data_table = load_data_table(DATA_896_URL)
    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))

    kmeans = []
    for clusters_number in xrange(6, 21):
        cluster_list = alg_project3_solution.kmeans_clustering(
            singleton_list, clusters_number, 5)
        kmeans.append([
            clusters_number,
            0.0 + alg_project3_solution.compute_distortion(cluster_list,
                                                           data_table)
        ])

    # Sweep from 20 clusters down to 6, feeding each result into the next
    # step. Working on copies and chaining the returned list makes the
    # sweep independent of whether hierarchical_clustering mutates its
    # argument in place.
    hierarchical = []
    cluster_list = [cluster.copy() for cluster in singleton_list]
    for clusters_number in xrange(20, 5, -1):
        cluster_list = alg_project3_solution.hierarchical_clustering(
            cluster_list, clusters_number)
        hierarchical.append([
            clusters_number,
            0.0 + alg_project3_solution.compute_distortion(cluster_list,
                                                           data_table)
        ])
    # Collected for 20 down to 6; put the points in ascending order.
    hierarchical.reverse()

    simpleplot.plot_lines(
        "Distortion of the clusterings produced by hierarchical and k-means metods on 896 county data set",
        800, 600, "Number of clusters n [6 .. 20]", "Distortion",
        [hierarchical, kmeans], False,
        ["Hierarchical clustering", "k-means clustering with 5 iterations"])
def run_distortion():
    """
    Cluster the DATA_111_URL table into 9 k-means clusters (5 iterations)
    and print the distortion of the result.
    """
    data_table = load_data_table(DATA_111_URL)
    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))

    cluster_list = alg_project3_solution.kmeans_clustering(
        singleton_list, 9, 5)
    print('KMeans Distortion = ', compute_distortion(cluster_list, data_table))
def plot_distortions():
    """
    Plot the distortion of hierarchical and k-means (5 iterations)
    clusterings of the 896-county table for 6 to 20 output clusters.
    """
    DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    DATA_3108_URL = DIRECTORY + "data_clustering/unifiedCancerData_3108.csv"
    DATA_896_URL = DIRECTORY + "data_clustering/unifiedCancerData_896.csv"
    DATA_290_URL = DIRECTORY + "data_clustering/unifiedCancerData_290.csv"
    DATA_111_URL = DIRECTORY + "data_clustering/unifiedCancerData_111.csv"
    DATA_24_URL = DIRECTORY + "data_clustering/unifiedCancerData_24.csv"

    data_table = load_data_table(DATA_896_URL)
    singleton_list = [
        Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]
    cluster_counts = range(6, 21)

    def distortion_curve(run_clustering):
        # Every run receives fresh copies of the singleton clusters.
        curve = []
        for num_clusters in cluster_counts:
            fresh = [clu.copy() for clu in singleton_list]
            clustered = run_clustering(fresh, num_clusters)
            curve.append(compute_distortion(clustered, data_table))
        return curve

    errors_h = distortion_curve(hierarchical_clustering)
    errors_k = distortion_curve(
        lambda clusters, count: kmeans_clustering(clusters, count, 5))

    xlabel("number of output clusters")
    ylabel("distortion")
    plot(cluster_counts, errors_h, '-b', label="hierarchical")
    plot(cluster_counts, errors_k, '-r', label="kmeans")
    legend(loc="upper left")
    title("896 county data sets")
    show()
def plot_distortions():
    """
    Plot the distortion of hierarchical and k-means (5 iterations)
    clusterings of the 896-county table for 6 to 20 output clusters.
    """
    DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    DATA_3108_URL = DIRECTORY + "data_clustering/unifiedCancerData_3108.csv"
    DATA_896_URL = DIRECTORY + "data_clustering/unifiedCancerData_896.csv"
    DATA_290_URL = DIRECTORY + "data_clustering/unifiedCancerData_290.csv"
    DATA_111_URL = DIRECTORY + "data_clustering/unifiedCancerData_111.csv"
    DATA_24_URL = DIRECTORY + "data_clustering/unifiedCancerData_24.csv"

    data_table = load_data_table(DATA_896_URL)
    singleton_list = [
        Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]
    cluster_counts = range(6, 21)

    def distortion_curve(run_clustering):
        # Every run receives fresh copies of the singleton clusters.
        curve = []
        for num_clusters in cluster_counts:
            fresh = [clu.copy() for clu in singleton_list]
            clustered = run_clustering(fresh, num_clusters)
            curve.append(compute_distortion(clustered, data_table))
        return curve

    errors_h = distortion_curve(hierarchical_clustering)
    errors_k = distortion_curve(
        lambda clusters, count: kmeans_clustering(clusters, count, 5))

    xlabel("number of output clusters")
    ylabel("distortion")
    plot(cluster_counts, errors_h, '-b', label="hierarchical")
    plot(cluster_counts, errors_k, '-r', label="kmeans")
    legend(loc="upper left")
    title("896 county data sets")
    show()
def run_distortion_graph():
    """
    Plot the distortion (scaled by 10^11) of hierarchical and k-means
    (5 iterations) clusterings of the DATA_896_URL table for 6 to 20
    output clusters.
    """
    data_table = load_data_table(DATA_896_URL)
    size_clusters = range(6, 21)
    scale = 100000000000.0

    def build_singletons():
        # A new single-county cluster for every table row.
        return [
            alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
            for row in data_table
        ]

    hierarchical_distortion = []
    kmeans_distortion = []
    for size in size_clusters:
        # Both clusterings are produced before either distortion is taken.
        hierarchical_result = alg_project3_solution.hierarchical_clustering(
            build_singletons(), size)
        kmeans_result = alg_project3_solution.kmeans_clustering(
            build_singletons(), size, 5)

        hierarchical_distortion.append(
            compute_distortion(hierarchical_result, data_table) / scale)
        kmeans_distortion.append(
            compute_distortion(kmeans_result, data_table) / scale)

    plt.figure()
    plt.plot(size_clusters, hierarchical_distortion, '-b',
             label='Hierarchical_distortion')
    plt.plot(size_clusters, kmeans_distortion, '-g',
             label='Kmeans_distortion')
    plt.legend(loc='upper right')
    plt.title(
        'Distortion for hierarchical and k-means clustering for 896 data')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion(x 10^11)')
    plt.show()
def question10(data, filename):
    """
    Plot hierarchical versus k-means clustering distortion for 6 to 20
    output clusters and save the figure.

    data -- data-set identifier handed to load_data_table and
            Cluster.load_as_list
    filename -- destination passed to plt.savefig
    """
    table = load_data_table(data)
    clusters = Cluster.load_as_list(data)
    xs = range(6, 21)

    ys_hier = []

    def dist(clusters):
        # Callback given to hierarchical_clustering; presumably invoked once
        # per size in the set passed below -- confirm against that helper.
        ys_hier.append(distortion(clusters, table))

    hierarchical_clustering(clusters, 6, dist, set(xs))
    # Reversed so the recorded values line up with the ascending x axis.
    ys_hier.reverse()

    # Reload the clusters before k-means: the list above has already been
    # handed to hierarchical_clustering, which may have merged it in place,
    # and k-means must start from the original data.
    clusters = Cluster.load_as_list(data)
    ys_kmeans = [distortion(kmeans_clustering(clusters, x, 5), table)
                 for x in xs]

    plt.cla()
    plt.plot(xs, ys_hier, '-r', label='Hierarchical clustering distortion')
    plt.plot(xs, ys_kmeans, '-b', label='K-means clustering distortion')
    plt.title('Clustering distortion (%s)' % data)
    plt.xlabel('Number of output clusters')
    plt.ylabel('Distortion')
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig(filename)
    print('Saved plot to %s' % filename)
def run_example(): """ Load a data table, compute a list of clusters and plot a list of clusters Set DESKTOP = True/False to use either matplotlib or simplegui """ data_table = load_data_table(DATA_3108_URL) #data_table = load_data_table(DATA_111_URL) singleton_list = [] for line in data_table: singleton_list.append( alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) #cluster_list = sequential_clustering(singleton_list, 15) #print "Displaying", len(cluster_list), "sequential clusters" start_time = time() cluster_list = alg_project3_solution.hierarchical_clustering( singleton_list, 15) print "Displaying", len(cluster_list), "hierarchical clusters" end_time = time() hierarchical_clustering_time = end_time - start_time start_time = time() cluster_list = alg_project3_solution.kmeans_clustering( singleton_list, 15, 5) print "Displaying", len(cluster_list), "k-means clusters" end_time = time() kmeans_clustering_time = end_time - start_time print hierarchical_clustering_time, kmeans_clustering_time """
def question10_plot(date_url):
    """
    Plot k-means and hierarchical distortion for 6 to 20 output clusters.

    Arguments:
    date_url -- location of the data set, handed to load_data_table; the
                first run of digits in it names the data set in the plot
                title (the whole string is used when it has no digits)
    """
    data_table = load_data_table(date_url)
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                            line[4])
        for line in data_table
    ]

    xvals = list(range(6, 21))

    # k-means is run from the singleton list for every cluster count
    kc_cd_yvals = []
    for cluster_num_k in range(20, 5, -1):
        cluster_list = alg_project3_solution.kmeans_clustering(
            singleton_list, cluster_num_k, 5)
        kc_cd_yvals.append(compute_distortion(cluster_list, data_table))
    kc_cd_yvals.reverse()

    # hierarchical clustering feeds each result into the next, smaller run
    # instead of starting over from the singletons
    hc_cd_yvals = []
    cluster_list = list(singleton_list)
    for cluster_num_k in range(20, 5, -1):
        cluster_list = alg_project3_solution.hierarchical_clustering(
            cluster_list, cluster_num_k)
        hc_cd_yvals.append(compute_distortion(cluster_list, data_table))
    hc_cd_yvals.reverse()

    plt.plot(xvals, kc_cd_yvals, '-b', label='kmeans_clustering_distortion')
    plt.plot(xvals, hc_cd_yvals, '-r',
             label='hierarchical_clustering_distortion')
    plt.legend(loc='upper right')
    plt.xlabel('cluster num')
    plt.ylabel('distortion')

    # re.search returns None when the URL has no digits; fall back to the
    # URL itself rather than failing on .group()
    size_match = re.search(r'[0-9]+', date_url)
    data_name = size_match.group(0) if size_match else date_url
    title_str = 'DATA: ' + data_name
    plt.title(title_str)
    plt.grid(True)
    plt.show()
def run_example(): """ Load a data table, compute a list of clusters and plot a list of clusters Set DESKTOP = True/False to use either matplotlib or simplegui """ #data_table = load_data_table(DATA_3108_URL) data_table = load_data_table(DATA_111_URL) #data_table = load_data_table(DATA_290_URL) singleton_list = [] for line in data_table: singleton_list.append( alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) #cluster_list = sequential_clustering(singleton_list, 15) #print "Displaying", len(cluster_list), "sequential clusters" # cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 9) # print "Displaying", len(cluster_list), "hierarchical clusters" # print "distortion:", compute_distortion(cluster_list, data_table) # start = time.clock() # cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 15) # elapsed = (time.clock() - start) # print "elapsed:",elapsed # print "Displaying", len(cluster_list), "hierarchical clusters" # cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 16) # print "Displaying", len(cluster_list), "hierarchical clusters" # print "distortion:", compute_distortion(cluster_list, data_table) cluster_list = alg_project3_solution.kmeans_clustering( singleton_list, 9, 5) print "Displaying", len(cluster_list), "k-means clusters" print "distortion:", compute_distortion(cluster_list, data_table)
def compute_and_plot_distortions(): """ Compute the distortion of the list of clusters produced by hierarchical clustering and k-means clustering (using 5 iterations) on the 111, 290, and 896 county data sets, respectively, where the number of output clusters ranges from 6 to 20 (inclusive). Important note:To compute the distortion for all 15 output clusterings produced by hierarchical_clustering, you should remember that you can use the hierarchical cluster of size 20 to compute the hierarchical clustering of size 19 and so on. Otherwise, you will introduce an unnecessary factor of 15 into the computation of the 15 hierarchical clusterings. """ #choose data set: #data_table = viz.load_data_table(viz.DATA_111_URL) #data_table = viz.load_data_table(viz.DATA_290_URL) data_table = viz.load_data_table(viz.DATA_896_URL) num_output_clusters = [] kmeans_distortion = [] hierarchical_distortion = [] print "\nComputing kmeans distortions" for indx in range(6, 21): ##Dette loop kunne optimeres, saa beregningerne genbruges, men det er ikke noedvendigt, da k_means er saa hurtig num_output_clusters.append(indx) singleton_list = [] for line in data_table: singleton_list.append( c.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) kmeans_cluster_list = p.kmeans_clustering(singleton_list, indx, 5) distortion = compute_distortion(kmeans_cluster_list, data_table) kmeans_distortion.append(distortion) print indx, distortion print "Computed kmeans distortions" print "" print "Computing hierarchical distortions" for line in data_table: singleton_list.append( c.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) hierarchical_cluster_list = singleton_list for indx in range(20, 5, -1): hierarchical_cluster_list = p.hierarchical_clustering( hierarchical_cluster_list, indx) distortion = compute_distortion(hierarchical_cluster_list, data_table) hierarchical_distortion.append(distortion) print indx, distortion hierarchical_distortion.reverse() print "Computed hierarchical 
distortions\n" print "Plotting data" plt.plot(num_output_clusters, kmeans_distortion, label="K-means clustering") plt.plot(num_output_clusters, hierarchical_distortion, label="Hierarchical clustering") plt.xlabel("Number of output clusters") plt.ylabel('Distortion') #tegner plt.legend() plt.title( "Comparison of distortion of two clustering methods \n Dataset: 896 counties" ) #goer det hele synligt plt.show()
def run_suite(): """ Testing code for the functions written for Word Wrangler """ # create a TestSuite (and an object) suite = poc_simpletest.TestSuite() # create a set of 3 clusters cluster1 = CC.Cluster([1, 1], 0, 0, 100, 0.00001) cluster2 = CC.Cluster([2, 2, 2], 3, 4, 200, 0.00002) cluster3 = CC.Cluster([3, 3, 3, 3], 6, 8, 300, 0.00003) list_of_clusters = [cluster1, cluster2, cluster3] # testing the slow_closest_pair function with the 3 cluster list suite.run_test(student.slow_closest_pair(list_of_clusters), (5., 0, 1), "Test #1: testing slow_closest_pair on 3 clusters") # testing the fast_closest_pair function with the 3 cluster list suite.run_test(student.fast_closest_pair(list_of_clusters), (5., 0, 1), "Test #2: testing fast_closest_pair on 3 clusters") # add a fourth cluster to the list cluster4 = CC.Cluster([4, 4, 4, 4, 4], 12, 16, 400, 0.00004) list_of_clusters.append(cluster4) # testing the slow_closest_pair function with the 4 cluster list suite.run_test(student.slow_closest_pair(list_of_clusters), (5., 0, 1), "Test #3: testing slow_closest_pair on 4 clusters") # testing the fast_closest_pair function with the 4 cluster list suite.run_test(student.fast_closest_pair(list_of_clusters), (5., 0, 1), "Test #4: testing fast_closest_pair on 4 clusters") # create a set of 4 clusters cluster1 = CC.Cluster(set([]), 0, 0, 1, 0) cluster2 = CC.Cluster(set([]), 1, 0, 1, 0) cluster3 = CC.Cluster(set([]), 2, 0, 1, 0) cluster4 = CC.Cluster(set([]), 3, 0, 1, 0) list_of_clusters = [cluster1, cluster2, cluster3, cluster4] # testing closest_pair_strip on 4 clusters suite.run_test(student.closest_pair_strip(list_of_clusters, 1.5, 1.0), (1.0, 1, 2), "Test #5: testing closest_pair_strip on 4 clusters") # create a set of 4 clusters cluster1 = CC.Cluster(set([]), 1.0, 0.0, 1, 0) cluster2 = CC.Cluster(set([]), 4.0, 0.0, 1, 0) cluster3 = CC.Cluster(set([]), 5.0, 0.0, 1, 0) cluster4 = CC.Cluster(set([]), 7.0, 0.0, 1, 0) list_of_clusters = [cluster1, cluster2, cluster3, cluster4] # 
testing fast_closest_pair on 4 clusters suite.run_test(student.fast_closest_pair(list_of_clusters), (1.0, 1, 2), "Test #6: testing closest_pair_strip on 4 clusters") # create a set of 4 clusters cluster1 = CC.Cluster(set([]), -4.0, 0.0, 1, 0) cluster2 = CC.Cluster(set([]), 0.0, -1.0, 1, 0) cluster3 = CC.Cluster(set([]), 0.0, 1.0, 1, 0) cluster4 = CC.Cluster(set([]), 4.0, 0.0, 1, 0) list_of_clusters = [cluster1, cluster2, cluster3, cluster4] # testing closest_pair_strip on 4 clusters suite.run_test(student.closest_pair_strip(list_of_clusters, 0.0, 4.1231059999999999), (2.0, 1, 2), "Test #7: testing closest_pair_strip on 4 clusters") # create a set of 4 clusters cluster1 = CC.Cluster(set([]), -4.0, 0.0, 1, 0) cluster2 = CC.Cluster(set([]), 0.0, -1.0, 1, 0) cluster3 = CC.Cluster(set([]), 0.0, 1.0, 1, 0) cluster4 = CC.Cluster(set([]), 4.0, 0.0, 1, 0) list_of_clusters = [cluster1, cluster2, cluster3, cluster4] # testing fast_closest_pair on 4 clusters suite.run_test(student.fast_closest_pair(list_of_clusters), (2.0, 1, 2), "Test #8: testing fast_closest_pair on 4 clusters") # create a sorted list_of_clusters from a small dataset containing 8 clusters fhandle = open("unifiedCancerData_8.txt") list_of_clusters = [] for line in fhandle: tokens = line.split(',') cluster = CC.Cluster(set([tokens[0]]), float(tokens[1]), float(tokens[2]), int(tokens[3]), float(tokens[4])) list_of_clusters.append(cluster) list_of_clusters.sort(key = lambda cluster: cluster.horiz_center()) print "The following list_of_clusters was loaded:" for index in range(len(list_of_clusters)): print index, list_of_clusters[index] print # testing the slow_closest_pair function with 8 cluster list suite.run_test(student.slow_closest_pair(list_of_clusters), (2.4479655653349655, 5, 7), "Test #9: testing slow_closest_pair on 8 clusters") # testing the fast_closest_pair function with 8 cluster list suite.run_test(student.fast_closest_pair(list_of_clusters), (2.4479655653349655, 5, 7), "Test #10: testing 
fast_closest_pair on 8 clusters") # testing the hierarchical_clustering function with 8 clusters clustering_result = student.hierarchical_clustering(list_of_clusters, 5) for index in range(len(clustering_result)): print clustering_result[index] print # testing the kmeans_clustering function with 8 clusters clustering_result = student.kmeans_clustering(list_of_clusters, 5, 3) for index in range(len(clustering_result)): print clustering_result[index] print # create a sorted list_of_clusters from a small dataset containing 17 clusters fhandle = open("unifiedCancerData_17.txt") list_of_clusters = [] for line in fhandle: tokens = line.split(',') cluster = CC.Cluster(set([tokens[0]]), float(tokens[1]), float(tokens[2]), int(tokens[3]), float(tokens[4])) list_of_clusters.append(cluster) list_of_clusters.sort(key = lambda cluster: cluster.horiz_center()) # testing the slow_closest_pair function with 17 cluster list suite.run_test(student.slow_closest_pair(list_of_clusters), (1.9439662413427632, 9, 10), "Test #11: testing slow_closest_pair on 17 clusters") # testing the fast_closest_pair function with 17 cluster list suite.run_test(student.fast_closest_pair(list_of_clusters), (1.9439662413427632, 9, 10), "Test #12: testing fast_closest_pair on 17 clusters") # create a sorted list_of_clusters from a small dataset containing 24 clusters fhandle = open("unifiedCancerData_24.txt") list_of_clusters = [] for line in fhandle: tokens = line.split(',') cluster = CC.Cluster(set([tokens[0]]), float(tokens[1]), float(tokens[2]), int(tokens[3]), float(tokens[4])) list_of_clusters.append(cluster) list_of_clusters.sort(key = lambda cluster: cluster.horiz_center()) print "The following list_of_clusters was loaded:" for index in range(len(list_of_clusters)): print index, list_of_clusters[index] print # testing the kmeans_clustering function with 24 clusters clustering_result = student.kmeans_clustering(list_of_clusters, 10, 1) print "This output was created by kmeans_slustering:" for index 
in range(len(clustering_result)): print index, clustering_result[index] print # create a sorted list_of_clusters from a small dataset containing 39 clusters fhandle = open("unifiedCancerData_39.txt") list_of_clusters = [] for line in fhandle: tokens = line.split(',') cluster = CC.Cluster(set([tokens[0]]), float(tokens[1]), float(tokens[2]), int(tokens[3]), float(tokens[4])) list_of_clusters.append(cluster) list_of_clusters.sort(key = lambda cluster: cluster.horiz_center()) # testing the slow_closest_pair function with 39 cluster list suite.run_test(student.slow_closest_pair(list_of_clusters), (1.6612217536988727, 22, 24), "Test #13: testing slow_closest_pair on 39 clusters") # testing the fast_closest_pair function with 39 cluster list suite.run_test(student.fast_closest_pair(list_of_clusters), (1.6612217536988727, 22, 24), "Test #14: testing fast_closest_pair on 39 clusters") # create a sorted list_of_clusters from a small dataset containing 111 clusters fhandle = open("unifiedCancerData_111.csv") list_of_clusters = [] for line in fhandle: tokens = line.split(',') cluster = CC.Cluster(set([tokens[0]]), float(tokens[1]), float(tokens[2]), int(tokens[3]), float(tokens[4])) list_of_clusters.append(cluster) list_of_clusters.sort(key = lambda cluster: cluster.horiz_center()) print "The following list_of_clusters was loaded:" for index in range(len(list_of_clusters)): print index, list_of_clusters[index] print # testing the slow_closest_pair function with 111 cluster list suite.run_test(student.slow_closest_pair(list_of_clusters), (1.266216002018164, 79, 81), "Test #15: testing slow_closest_pair on 111 clusters") # testing the fast_closest_pair function with 111 cluster list suite.run_test(student.fast_closest_pair(list_of_clusters), (1.266216002018164, 79, 81), "Test #16: testing fast_closest_pair on 111 clusters") # testing the hierarchical_clustering function with 111 clusters clustering_result = student.hierarchical_clustering(list_of_clusters, 5) for index in 
range(len(clustering_result)): print clustering_result[index] print # report number of tests and failures print suite.report_results()
for tokens in data_tokens]

#####################################
# Code for answering question 7 of the application
#
# Clusters the 290 county data into 16 clusters with both algorithms and
# prints the total distortion (sum of the per-cluster errors) of each result.

# Read the input data for 290 county data and create a list of clusters
data_table = load_data_table(DATA_290_URL)
singleton_list = []
for line in data_table:
    singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

# Create the clustered lists needed for computing the distortions
# NOTE(review): both calls receive the same singleton_list object; if
# hierarchical_clustering merges its input in place, k-means does not start
# from the singletons -- confirm against the sol module.
hierarchical_list = sol.hierarchical_clustering(singleton_list, 16)
kmeans_list = sol.kmeans_clustering(singleton_list, 16, 5)

# Compute and print the distortions
# The loop indexes both result lists with the length of kmeans_list, so it
# relies on hierarchical_list having at least that many clusters.
num_clusters = len(kmeans_list)
hierarchical_distortion = 0
kmeans_distortion = 0
for index in range(num_clusters):
    hierarchical_distortion += hierarchical_list[index].cluster_error(data_table)
    kmeans_distortion += kmeans_list[index].cluster_error(data_table)

# Print the results
print
print "=====> Results for 290 county datapoints in 16 clusters"
print ".......... Distortion for hiearchical_clustering:", hierarchical_distortion
print ".......... Distortion for kmeans_clustering: ", kmeans_distortion
print
def question6(filename):
    """
    Run k-means (9 clusters, 5 iterations) on the 111 county data through
    visualize and print the distortion of what visualize returns.

    Arguments:
    filename -- passed on to visualize unchanged
    """
    data = 'unifiedCancerData_111.csv'

    def cluster(singletons):
        """Clustering function handed to visualize."""
        return kmeans_clustering(singletons, 9, 5)

    clustering = visualize(data, filename, cluster)
    table = load_data_table(data)
    dist = distortion(clustering, table)
    # the value is shown twice: fixed-point (%f) and in its str() form (%s)
    print('Distortion in question6, kmeans = %f (%s)' % (dist, dist))
def question3(filename):
    """
    Visualize a k-means clustering (15 clusters, 5 iterations) of the 3108
    county data set.

    Arguments:
    filename -- passed on to visualize unchanged
    """
    def cluster(singletons):
        """Clustering function handed to visualize."""
        return kmeans_clustering(singletons, 15, 5)

    visualize('unifiedCancerData_3108.csv', filename, cluster)
def kmeans():
    """
    Call sol.kmeans_clustering on the module-level list s with the
    arguments 9 and 10; the result is discarded.
    """
    # presumably (number of clusters, number of iterations) -- confirm
    # against sol.kmeans_clustering
    clustering_args = (9, 10)
    sol.kmeans_clustering(s, *clustering_args)
def question6(filename):
    """
    Run k-means (9 clusters, 5 iterations) on data/unifiedCancerData_111.csv
    through visualize and print the distortion of what visualize returns.

    Arguments:
    filename -- passed on to visualize unchanged
    """
    data = 'data/unifiedCancerData_111.csv'
    run_kmeans = lambda x: kmeans_clustering(x, 9, 5)

    clusters = visualize(data, filename, run_kmeans)
    dist = distortion(clusters, load_data_table(data))

    message = 'Distortion in question6, kmeans = %f (%s)' % (dist, dist)
    print(message)
"""
Assignment 3 Question 7 Answer
"""

import alg_project3_viz as viz
import alg_project3_solution as sol
import alg_cluster

data_table = viz.load_data_table(viz.DATA_111_URL)

# each algorithm works on its own data list built from the same table
hier_data_list = sol.make_data_list(data_table)
kmeans_data_list = sol.make_data_list(data_table)

# 9 output clusters for both; k-means additionally gets the argument 5
hier_cluster_list = sol.hierarchical_clustering(hier_data_list, 9)
kmeans_cluster_list = sol.kmeans_clustering(kmeans_data_list, 9, 5)

for label, clusters in (("hierarchical:", hier_cluster_list),
                        ("kmeans:", kmeans_cluster_list)):
    print(label, sol.compute_distortion(clusters, data_table))

# Recorded results (four significant figures):
# Hierarchical: 175163886915.8305  or 1.752 x 10^11
# K-means:      271254226924.20047 or 2.713 x 10^11
def error_data(): data_table = load_data_table(DATA_896_URL) singleton_list = [] for line in data_table: singleton_list.append( alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) num_clusters = [] hier_error = [] k_means_error = [] for num in range(6, 21): singleton_list = [] for line in data_table: singleton_list.append( alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) num_clusters.append(num) cluster_list = alg_project3_solution.hierarchical_clustering( singleton_list, num) print "Displaying", len(cluster_list), "hierarchical clusters" error_sum = 0.0 for cluster in cluster_list: error_sum += cluster.cluster_error(data_table) hier_error.append(error_sum / 1e11) print(error_sum / 1e11) singleton_list = [] for line in data_table: singleton_list.append( alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) cluster_list = alg_project3_solution.kmeans_clustering( singleton_list, num, 5) print "Displaying", len(cluster_list), "k-means clusters" error_sum = 0.0 for cluster in cluster_list: error_sum += cluster.cluster_error(data_table) k_means_error.append(error_sum / 1e11) print(error_sum / 1e11) matplotlib.rc('figure', figsize=(16, 8)) plt.plot(num_clusters, hier_error, label="Hierarchical") plt.plot(num_clusters, k_means_error, label="K-means") plt.legend() plt.xlabel('Number of Clusters', fontsize=14, color='Green') plt.ylabel('Distortion x 10^11 ', fontsize=14, color='Brown') plt.title( 'Distortion for Hierarchical and K-means custering for 896 points') plt.grid(True) plt.show() #plt.savefig('question_10_896.png') #error_data()
def run_example():
    """
    Compute the question 10 distortions for the 896 county data set.

    k-means clustering (5 iterations) and hierarchical clustering are run
    for 6 to 20 output clusters on the singleton list built from the data
    table.  Each result is stored as a [number of clusters, distortion]
    pair.  With DESKTOP set the two result lists go to
    create_separate_plots; otherwise the last computed cluster list is shown
    with simplegui.
    """
    data_table = load_data_table(DATA_896_URL)
    singleton_list = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]

    # question 10
    kmeans_res = [
        [clusters_number,
         alg_project3_solution.compute_distortion(
             alg_project3_solution.kmeans_clustering(
                 singleton_list, clusters_number, 5),
             data_table)]
        for clusters_number in range(6, 21)
    ]

    # collected for 20 down to 6 clusters, then flipped to ascending order
    descending_res = []
    for clusters_number in range(20, 5, -1):
        cluster_list = alg_project3_solution.hierarchical_clustering(
            singleton_list, clusters_number)
        pair = [
            clusters_number,
            alg_project3_solution.compute_distortion(cluster_list, data_table)
        ]
        descending_res.append(pair)
    hier_res = descending_res[::-1]

    # draw the results using matplotlib or simplegui
    if not DESKTOP:
        # use toggle in GUI to add cluster centers
        alg_clusters_simplegui.PlotClusters(data_table, cluster_list)
    else:
        create_separate_plots(kmeans_res, hier_res)
def question3(filename):
    """
    Visualize a k-means clustering (15 clusters, 5 iterations) of
    data/unifiedCancerData_3108.csv.

    Arguments:
    filename -- passed on to visualize unchanged
    """
    data = 'data/unifiedCancerData_3108.csv'
    run_kmeans = lambda x: kmeans_clustering(x, 15, 5)
    visualize(data, filename, run_kmeans)