def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters.

    Note: cluster_list is only read here; the population ordering is done
    on a sorted copy.

    Input: List of clusters, integers number of clusters and number of
    iterations.
    Output: List of clusters whose length is num_clusters.  When
    num_iterations is 0 the (empty) initial centers are returned; there is
    one per seed, i.e. at most num_clusters of them.
    """
    # Sorted copy, largest population first.
    by_population = sorted(cluster_list,
                           key=lambda cluster: cluster.total_population(),
                           reverse=True)

    # Seed the centers at the positions of the most populous clusters.
    centers = [alg_cluster.Cluster(set(), seed.horiz_center(),
                                   seed.vert_center(), 0, 0)
               for seed in by_population[:num_clusters]]

    # Defined up front so that zero iterations still returns a list.
    k_clusters = centers
    for dummy_iter in range(num_iterations):
        # Start every round from num_clusters empty clusters.
        k_clusters = [alg_cluster.Cluster(set(), 0, 0, 0, 0)
                      for dummy_idx in range(num_clusters)]

        for cluster in cluster_list:
            # Reset per cluster so a previous cluster's choice is never
            # reused; strict '<' keeps the first of several equidistant
            # centers.
            closest = 0
            min_dist = float('inf')
            for idx, center in enumerate(centers):
                dist = cluster.distance(center)
                if dist < min_dist:
                    min_dist = dist
                    closest = idx
            k_clusters[closest].merge_clusters(cluster)

        # The next round measures distances to the merged clusters' centers.
        centers = [alg_cluster.Cluster(set(), merged.horiz_center(),
                                       merged.vert_center(), 0, 0)
                   for merged in k_clusters]
    return k_clusters
def compare_distortions():
    """
    Compare hierarchical and k-means (5 iterations) clustering distortion
    on the DATA_896_URL table for 6 through 20 output clusters.

    Returns a tuple (hierarchical distortions, k-means distortions,
    list of cluster counts); the three lists are index-aligned.
    """
    # Swap in DATA_111_URL or DATA_290_URL here to use another data set.
    data_table = load_data_table(DATA_896_URL)

    def fresh_singletons():
        # One single-county cluster per table row, rebuilt for every run.
        return [alg_cluster.Cluster(set([row[0]]), row[1], row[2],
                                    row[3], row[4])
                for row in data_table]

    out_cluster_k = list(range(6, 21))
    print(out_cluster_k)

    dist_hierarchical = []
    dist_kmeans = []
    for k in out_cluster_k:
        hierarchical = alg_project3_solution.hierarchical_clustering(
            fresh_singletons(), k)
        dist_hierarchical.append(compute_distortion(hierarchical))

        kmeans = alg_project3_solution.kmeans_clustering(
            fresh_singletons(), k, 5)
        dist_kmeans.append(compute_distortion(kmeans))
    return dist_hierarchical, dist_kmeans, out_cluster_k
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters.

    Note: cluster_list itself is never reordered; sorting happens on a copy.

    Input: List of clusters, integers number of clusters and number of
    iterations.
    Output: List of clusters whose length is num_clusters.
    """
    def empty_at(source):
        # Empty cluster placed at the center of 'source'.
        return alg_cluster.Cluster(set([]), source.horiz_center(),
                                   source.vert_center(), 0, 0.0)

    # Ascending sort followed by a reversal: largest population first.
    by_population = sorted(cluster_list,
                           key=lambda cluster: cluster.total_population())
    by_population.reverse()

    new_clusters = [empty_at(by_population[pos])
                    for pos in range(num_clusters)]

    for dummy_iter in range(num_iterations):
        old_clusters = list(new_clusters)
        # New accumulators start at the previous centers.
        new_clusters = [empty_at(old_clusters[pos])
                        for pos in range(num_clusters)]
        for member in cluster_list:
            target = kmeans_closest_idx(member, old_clusters)
            new_clusters[target].merge_clusters(member)
    return new_clusters
def q10_legend(DATA_URL):
    """
    Plot distortion against the number of output clusters (20 down to 6)
    for hierarchical clustering and for k-means clustering (5 iterations).

    DATA_URL is passed to load_data_table; the resulting table is used both
    to build the clusters and to compute each distortion.
    """
    data_table = load_data_table(DATA_URL)

    # Each algorithm gets its own Cluster objects built from the same rows.
    singleton_list = []
    hierarchical_cluster_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2],
                                line[3], line[4]))
        hierarchical_cluster_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2],
                                line[3], line[4]))

    xvals = []
    yvals1 = []
    yvals2 = []
    for num_clusters in range(20, 5, -1):
        xvals.append(num_clusters)
        # The previous (larger) hierarchical result is fed back in, so each
        # step continues merging from where the last one stopped.
        hierarchical_cluster_list = (
            alg_project3_solution.hierarchical_clustering(
                hierarchical_cluster_list, num_clusters))
        yvals1.append(
            compute_distortion(hierarchical_cluster_list, data_table))

        kmeans_cluster_list = alg_project3_solution.kmeans_clustering(
            singleton_list, num_clusters, 5)
        yvals2.append(compute_distortion(kmeans_cluster_list, data_table))

    curve1 = [[xval, yval] for xval, yval in zip(xvals, yvals1)]
    curve2 = [[xval, yval] for xval, yval in zip(xvals, yvals2)]
    # Title typo fixed: "uesd" -> "used".
    simpleplot.plot_lines(
        "The distortion of output clusters used " + str(len(data_table)) +
        "-county data set", 800, 600, "the number of output clusters",
        "the distortion associated with each output clustering",
        [curve1, curve2], True, ["hierarchical cluster", "kmeans cluster"])
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters.

    Note: cluster_list is only read; seeds are chosen from a separate list
    of (population, x, y) tuples.

    Input: List of clusters, integers number of clusters and number of
    iterations.
    Output: List of clusters whose length is num_clusters (an empty list
    when num_iterations is 0).
    Raises: ValueError when num_clusters exceeds len(cluster_list).
    """
    if num_clusters > len(cluster_list):
        # Same exception type the repeated max() selection used to raise.
        raise ValueError("num_clusters exceeds the number of input clusters")

    # One descending sort replaces the repeated max()/remove() selection.
    point_list = sorted(
        ((cluster.total_population(), cluster.horiz_center(),
          cluster.vert_center()) for cluster in cluster_list),
        reverse=True)
    centers = [(point[1], point[2])
               for point in point_list[:max(num_clusters, 0)]]

    answer = []
    for dummy_i in range(num_iterations):
        answer = [alg_cluster.Cluster(set([]), horiz, vert, 0, 0)
                  for horiz, vert in centers]
        # Fixed probes at the current centers, built once per iteration
        # instead of once per (cluster, center) pair.  'answer' cannot be
        # used for this because its clusters move as members are merged.
        probes = [alg_cluster.Cluster(set([]), horiz, vert, 0, 0)
                  for horiz, vert in centers]

        for cluster in cluster_list:
            # (distance, index) tuples: ties go to the lowest index.
            best = (float("inf"), -1)
            for idx, probe in enumerate(probes):
                best = min(best, (cluster.distance(probe), idx))
            answer[best[1]].merge_clusters(cluster)

        centers = [(mean.horiz_center(), mean.vert_center())
                   for mean in answer]
    return answer
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters.

    Note: cluster_list is only read; sorting is done on a copy.

    Input: List of clusters, integers number of clusters and number of
    iterations.
    Output: List of clusters whose length is num_clusters.  When
    num_iterations is 0 the seed centers are returned.
    """
    nodelist = list(cluster_list)
    nodelist_by_pop = sorted(nodelist,
                             key=lambda cluster: cluster.total_population(),
                             reverse=True)

    # Seed centers sit at the most populous clusters; each seed keeps that
    # cluster's population but has no fips codes.
    centers = [alg_cluster.Cluster(set([]),
                                   nodelist_by_pop[idx].horiz_center(),
                                   nodelist_by_pop[idx].vert_center(),
                                   nodelist_by_pop[idx].total_population(),
                                   0)
               for idx in range(num_clusters)]

    # Defined up front so that zero iterations still returns a list.
    results = list(centers)
    for dummy_iter in range(num_iterations):
        results = [alg_cluster.Cluster(set([]), 0, 0, 0, 0)
                   for dummy_idx in range(num_clusters)]
        for each in nodelist:
            shortest_dist = float("inf")
            center_position = 0
            # enumerate() yields the position directly instead of a
            # centers.index() search for every improvement.
            for position, center in enumerate(centers):
                dist = each.distance(center)
                if dist < shortest_dist:
                    shortest_dist = dist
                    center_position = position
            results[center_position].merge_clusters(each)
        # The merged clusters are the centers of the next round.
        centers = list(results)
    return results
def run_example():
    """
    Cluster the DATA_290_URL table into 16 clusters with hierarchical
    clustering and with k-means (5 iterations), printing the cluster count
    and distortion of each result.
    """
    data_table = load_data_table(DATA_290_URL)

    def build_singletons():
        # One single-county cluster per table row.
        return [alg_cluster.Cluster(set([row[0]]), row[1], row[2],
                                    row[3], row[4])
                for row in data_table]

    def report(*fields):
        # Space-separated output, identical under Python 2 and 3.
        print(" ".join(str(field) for field in fields))

    singleton_list = build_singletons()
    cluster_list = alg_project3_solution.hierarchical_clustering(
        list(singleton_list), 16)
    report("Displaying", len(cluster_list), "hierarchical clusters")
    report("Distortion", compute_distortion(cluster_list, data_table))

    # Fresh Cluster objects for the second algorithm.
    singleton_list = build_singletons()
    cluster_list2 = alg_project3_solution.kmeans_clustering(
        list(singleton_list), 16, 5)
    report("Displaying", len(cluster_list2), "k-means clusters")
    report("Distortion", compute_distortion(cluster_list2, data_table))
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters.

    Note: cluster_list is only read; work is done on a shallow copy.

    Input: List of clusters, integers number of clusters and number of
    iterations.
    Output: List of clusters, one per center.  When num_iterations is 0 the
    empty seed centers are returned.
    """
    points = cluster_list[:]

    # Seed the centers at the most populous clusters: sort ascending and
    # walk backwards until num_clusters seeds have been taken.
    by_population = sorted(points,
                           key=lambda cluster: cluster.total_population())
    cluster_centers = []
    for cluster in reversed(by_population):
        if len(cluster_centers) >= num_clusters:
            break
        cluster_centers.append(
            alg_cluster.Cluster(set([]), cluster.horiz_center(),
                                cluster.vert_center(), 0, 0))

    # Defined up front so that zero iterations still returns a list.
    cluster_groupings = list(cluster_centers)
    for dummy_var in range(num_iterations):
        # Initialize k empty sets C1, ..., Ck.
        cluster_groupings = [alg_cluster.Cluster(set(), 0, 0, 0, 0)
                             for dummy_center in cluster_centers]

        for point in points:
            # L <- argmin over centers of the distance to this point;
            # strict '<' keeps the first of several equidistant centers.
            min_dist = float('inf')
            nearest_cluster_index = None
            for idx, center in enumerate(cluster_centers):
                dist = point.distance(center)
                if dist < min_dist:
                    min_dist = dist
                    nearest_cluster_index = idx
            cluster_groupings[nearest_cluster_index].merge_clusters(point)

        # The next round measures against copies of the new groupings, so
        # the returned clusters are not shared with the center list.
        cluster_centers = [grouping.copy() for grouping in cluster_groupings]
    return cluster_groupings
def run_kmeans_example():
    """
    Compute the distortion of k-means clusterings (5 iterations) of the
    DATA_896_URL table for 6 through 20 output clusters.

    Returns the list of distortions, ordered by increasing cluster count.

    The original version had printing and plotting statements after the
    return; they could never execute and have been removed.
    """
    data_table = load_data_table(DATA_896_URL)

    def build_singletons():
        # One single-county cluster per table row.
        return [alg_cluster.Cluster(set([line[0]]), line[1], line[2],
                                    line[3], line[4])
                for line in data_table]

    def compute_distortion(cluster_list):
        # Sum of every cluster's error measured against the data table.
        error = 0
        for cluster in cluster_list:
            error += cluster.cluster_error(data_table)
        return error

    error = []
    for cluster_num in range(6, 21):
        # Every run starts from freshly built singleton clusters.
        cluster_list = kmeans_clustering(build_singletons(), cluster_num, 5)
        error.append(compute_distortion(cluster_list))
    return error
def setUp(self):
    """
    Build the fixtures: three unit-population clusters on the x-axis and a
    list of six unit-population clusters at fixed positions.
    """
    def unit_cluster(horiz, vert):
        # Cluster with no fips codes, population 1 and risk 0.
        return alg_cluster.Cluster(set([]), horiz, vert, 1, 0)

    self.cluster1 = unit_cluster(0, 0)
    self.cluster2 = unit_cluster(1, 0)
    self.cluster3 = unit_cluster(2, 0)

    positions = [
        (0.02, 0.39),
        (0.19, 0.75),
        (0.35, 0.03),
        (0.73, 0.81),
        (0.76, 0.88),
        (0.78, 0.11),
    ]
    self.cluster_list1 = [unit_cluster(horiz, vert)
                          for horiz, vert in positions]
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters.

    Note: the function works on copies of the clusters in cluster_list.

    Input: List of clusters, integers number of clusters and number of
    iterations.
    Output: List of clusters whose length is num_clusters.
    """
    # Copies of the input, largest population first.
    new_cluster_list = sorted(
        (cluster.copy() for cluster in cluster_list),
        key=lambda cluster: cluster.total_population(),
        reverse=True)

    # The initial centers are the most populous copies themselves.
    miu_list = list(new_cluster_list[:num_clusters])

    for dummy_iter in range(num_iterations):
        c_buckets = [alg_cluster.Cluster(set(), 0, 0, 0, 0)
                     for dummy_bucket in range(num_clusters)]

        for member in new_cluster_list:
            min_dist = float("inf")
            for f_index in range(num_clusters):
                dist = member.distance(miu_list[f_index])
                if dist < min_dist:
                    min_dist = dist
                    l_index = f_index
            c_buckets[l_index].merge_clusters(member)

        # Re-center: empty clusters at the merged buckets' positions.
        for f_index in range(num_clusters):
            bucket = c_buckets[f_index]
            miu_list[f_index] = alg_cluster.Cluster(
                set(), bucket.horiz_center(), bucket.vert_center(), 0, 0)
    return c_buckets
def kmeans_clustering(cluster_list, len_k, iter_times_q):
    """
    Compute the len_k-means clustering of a set of clusters.

    Note: cluster_list is only read; sorting is done on a copy.

    Input: List of clusters, integer number of clusters (len_k) and integer
    number of iterations (iter_times_q).
    Output: List of clusters whose length is len_k.
    """
    ranked = sorted(cluster_list,
                    key=lambda cluster: cluster.total_population(),
                    reverse=True)
    # Centers are kept as (x, y) pairs, seeded from the largest clusters.
    miu = [(ranked[pos].horiz_center(), ranked[pos].vert_center())
           for pos in range(len_k)]

    for dummy_i in range(iter_times_q):
        # Probes at the current centers, used only for distance queries.
        k_cluster_list = [
            alg_cluster.Cluster(set(), float(horiz), float(vert), 0, 0.0)
            for horiz, vert in miu
        ]
        # Empty accumulators that receive the merged members.
        nk_cluster_list = [
            alg_cluster.Cluster(set(), 0.0, 0.0, 0, 0.0)
            for dummy_center in miu
        ]

        for member in cluster_list:
            min_distance = float('inf')
            idx_l = -1
            for idx_f, probe in enumerate(k_cluster_list):
                temp_distance = member.distance(probe)
                # '<=' means the last of several equidistant centers wins.
                if temp_distance <= min_distance:
                    min_distance = temp_distance
                    idx_l = idx_f
            nk_cluster_list[idx_l].merge_clusters(member)

        miu = [(merged.horiz_center(), merged.vert_center())
               for merged in nk_cluster_list]
    return nk_cluster_list
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters.

    Note: seeds are taken from copies of the clusters in cluster_list.

    Input: List of clusters, integers number of clusters and number of
    iterations.
    Output: List of clusters whose length is num_clusters.
    """
    # Negated population as the key sorts largest first.
    ranked_copies = sorted(
        (cluster.copy() for cluster in cluster_list),
        key=lambda cluster: -cluster.total_population())
    centers = ranked_copies[:num_clusters]

    for dummy_iter in range(num_iterations):
        cluster_set = [alg_cluster.Cluster(set(), 0, 0, 0, 0)
                       for dummy_k in range(num_clusters)]
        for member in cluster_list:
            cluster_set[find_index(member, centers)].merge_clusters(member)
        # The centers of the next round are copies of the merged clusters.
        for idx_k, merged in enumerate(cluster_set):
            centers[idx_k] = merged.copy()
    return centers
def plot_distortion(data_url, size_range):
    """
    Plot the distortion of hierarchical and k-means (5 iterations)
    clusterings against the number of output clusters.

    :param data_url: passed to load_data_table; characters [-7:-4] of it
        are used as the data-set label in the plot title.
    :param size_range: indexable pair (smallest, largest) number of output
        clusters, both inclusive.
    :return: None; the figure is shown with plt.show().
    """
    data_table = load_data_table(data_url)

    def build_singletons():
        # One single-county cluster per table row.
        return [alg_cluster.Cluster(set([line[0]]), line[1], line[2],
                                    line[3], line[4])
                for line in data_table]

    singleton_list = build_singletons()
    # Hierarchical clustering gets its OWN list and Cluster objects.  The
    # previous alias (hc_list = singleton_list) let anything it did to its
    # input leak into the k-means runs below.
    # NOTE(review): presumably hierarchical_clustering merges its input in
    # place; separate objects are harmless if it does not.
    hc_list = build_singletons()

    y_hc = []
    y_kmc = []

    # Work from the largest size downwards so the size-n result can be
    # reused to compute size n-1.
    for size in range(size_range[1], size_range[0] - 1, -1):
        print(len(hc_list))
        hc_list = alg_project3_solution.hierarchical_clustering(hc_list, size)
        y_hc.append(compute_distortion(hc_list, data_table))
    y_hc.reverse()  # back to increasing size, matching the x axis
    print(y_hc)

    x = list(range(size_range[0], size_range[1] + 1))
    for size in x:
        kmc_list = alg_project3_solution.kmeans_clustering(
            singleton_list, size, 5)
        y_kmc.append(compute_distortion(kmc_list, data_table))

    title = data_url[-7:-4]
    plt.plot(x, y_hc, '-b', label='hierarchical_clustering')
    plt.plot(x, y_kmc, '-r', label='k-means_clustering')
    plt.legend(loc='upper right')
    plt.xlabel("Number of Output Clusters")
    plt.ylabel("Distortion")
    plt.title(title + " county data sets")
    plt.show()
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters.

    Input: List of clusters, number of clusters, number of iterations.
    Output: List of clusters whose length is num_clusters.  When
    num_iterations is 0, copies of the num_clusters most populous input
    clusters are returned.
    """
    num = len(cluster_list)

    # sorted() is used because range(num) has no .sort() on Python 3.
    # Ascending sort followed by reverse() keeps the original ordering of
    # equal populations.
    cluster_index = sorted(
        range(num), key=lambda pos: cluster_list[pos].total_population())
    cluster_index.reverse()

    # Initial centers are copies of the most populous clusters.
    center_list = [cluster_list[cluster_index[pos]].copy()
                   for pos in range(num_clusters)]

    for dummy_iter in range(num_iterations):
        prev_center_list = center_list
        center_list = [alg_cluster.Cluster(set(), 0.0, 0.0, 0, 0.0)
                       for dummy_idx in range(num_clusters)]

        for member in cluster_list:
            # Start from center 0 and only switch on a strictly smaller
            # distance, so ties go to the lowest index.
            best_idx = 0
            best_dist = prev_center_list[0].distance(member)
            for cidx in range(1, num_clusters):
                dist = prev_center_list[cidx].distance(member)
                if best_dist > dist:
                    best_idx = cidx
                    best_dist = dist
            # Distances use prev_center_list only, so merging right away
            # gives the same result as collecting the indices first.
            center_list[best_idx].merge_clusters(member)
    return center_list
def visualize_data(cluster_input, data, method=None, display_centers=False):
    """
    Load a data table, compute a list of clusters and plot them with
    matplotlib.

    cluster_input: number of clusters for sequential and hierarchical
        clustering; for 'kmeans_clustering' an indexable pair
        (number of clusters, number of iterations).
    data: passed to load_data_table.
    method: None (sequential clustering), 'hierarchical_clustering' or
        'kmeans_clustering'.  Any other value prints an error and returns
        without plotting.
    display_centers: forwarded to plot_clusters.
    """
    data_table = load_data_table(data)
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2],
                            line[3], line[4])
        for line in data_table
    ]

    if method is None:
        cluster_list = sequential_clustering(singleton_list, cluster_input)
        print("Displaying", len(cluster_list), "sequential clusters")
    elif method == 'hierarchical_clustering':
        cluster_list = clustering.hierarchical_clustering(
            singleton_list, cluster_input)
        print("Displaying", len(cluster_list), "hierarchical clusters")
    elif method == 'kmeans_clustering':
        cluster_list = clustering.kmeans_clustering(
            singleton_list, cluster_input[0], cluster_input[1])
        print("Displaying", len(cluster_list), "k-means clusters")
    else:
        print("ERROR: method entered into visualize_data not recognized")
        # Nothing was clustered; stop here rather than reaching the plot
        # call with cluster_list undefined.
        return

    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list,
                                          display_centers)
def compare_distortion():
    """
    Plot the distortion of hierarchical and k-means (5 iterations)
    clusterings of 'unifiedCancerData_896.csv' for 20 down to 6 output
    clusters.
    """
    # Swap in 'unifiedCancerData_111.csv' or 'unifiedCancerData_290.csv'
    # here to use another data set.
    data_table = load_data_table('unifiedCancerData_896.csv')

    def build_singletons():
        # One single-county cluster per table row.
        return [alg_cluster.Cluster(set([line[0]]), line[1], line[2],
                                    line[3], line[4])
                for line in data_table]

    singleton_list = build_singletons()
    # Hierarchical clustering gets its own Cluster objects.  The previous
    # shallow copy (singleton_list[:]) still shared every cluster with the
    # k-means input.
    # NOTE(review): presumably hierarchical_clustering merges clusters in
    # place; separate objects are harmless if it does not.
    hie_list = build_singletons()

    hie_distortion = []
    kmeans_distortion = []
    num_output = list(range(20, 5, -1))
    for num_cluster in num_output:
        # The previous (larger) hierarchical result is reused as the input.
        hie_list = alg_project3_solution.hierarchical_clustering(
            hie_list, num_cluster)
        hie_distortion.append(app3_7.compute_distortion(hie_list, data_table))

        kmeans_list = alg_project3_solution.kmeans_clustering(
            singleton_list, num_cluster, 5)
        kmeans_distortion.append(
            app3_7.compute_distortion(kmeans_list, data_table))

    plt.plot(num_output, hie_distortion, '-b', label='hierarchical clustering')
    plt.plot(num_output, kmeans_distortion, '-r', label='k-means clustering')
    plt.legend(loc='upper right')
    plt.xlabel('number of output clusters')
    plt.ylabel('distortion of two clustering methods')
    plt.title('Comparison of the Distortion with ' +
              str(len(singleton_list)) + ' County Data Set')
    plt.show()
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters.

    Note: cluster_list is only read; sorting is done on a copy.

    Input: List of clusters, integers number of clusters and number of
    iterations.
    Output: List of clusters whose length is num_clusters (None when
    num_iterations is 0).
    """
    # Initial centers: the most populous input clusters.
    ranked = cluster_list[:]
    ranked.sort(key=lambda cluster: cluster.total_population(), reverse=True)
    centers = ranked[:num_clusters]

    cluster_sets = None
    for dummy_i in range(num_iterations):
        cluster_sets = [
            alg_cluster.Cluster(set([]), 0.0, 0.0, 0.0, 0.0)
            for dummy_idx in range(num_clusters)
        ]
        for member in cluster_list:
            distances = [member.distance(centers[center_idx])
                         for center_idx in range(num_clusters)]
            # Index of the first smallest distance.
            min_idx = min(range(num_clusters), key=distances.__getitem__)
            cluster_sets[min_idx].merge_clusters(member)
        centers = cluster_sets
    return cluster_sets
def run_example():
    """
    Load the DATA_3108_URL table, run k-means clustering (9 clusters,
    5 iterations) on it and plot the result.

    DESKTOP selects the backend: matplotlib when true, simplegui otherwise.
    """
    # Use DATA_111_URL here for the smaller data set.
    data_table = load_data_table(DATA_3108_URL)
    singleton_list = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]

    # Clustering used for the visualization; sequential_clustering or
    # alg_project3_solution.hierarchical_clustering can be substituted.
    cluster_list = alg_project3_solution.kmeans_clustering(
        singleton_list, 9, 5)
    # Space-separated output, identical under Python 2 and 3.
    print(" ".join(["Displaying", str(len(cluster_list)),
                    "k-means clusters"]))

    if not DESKTOP:
        # The GUI has a toggle for showing cluster centers.
        alg_clusters_simplegui.PlotClusters(data_table, cluster_list)
    else:
        # False: plot without cluster centers.
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False)
def run_example():
    """
    Load the DATA_111_URL table, run k-means clustering (9 clusters,
    5 iterations) on it and plot the result with cluster centers.

    DESKTOP selects the backend: matplotlib when true, simplegui otherwise.
    """
    data_table = load_data_table(DATA_111_URL)
    singleton_list = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]

    # sequential_clustering or the module's hierarchical_clustering can be
    # substituted here.
    cluster_list = closest_pairs_and_clustering_algorithms.kmeans_clustering(
        singleton_list, 9, 5)
    # Space-separated output, identical under Python 2 and 3.
    print(" ".join(["Displaying", str(len(cluster_list)),
                    "k-means clusters"]))

    if not DESKTOP:
        # The GUI has a toggle for showing cluster centers.
        alg_clusters_simplegui.PlotClusters(data_table, cluster_list)
    else:
        # True: also draw the cluster centers.
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    '''
    Apply k-means clustering to a list of Cluster objects.

    The initial centers are the num_clusters clusters with the largest
    populations, in ascending population order.  num_iterations rounds of
    k-means are then computed and the resulting list of clusters returned.

    With num_iterations == 0 the selected input clusters themselves are
    returned.  With num_clusters == 0 the result is an empty list; the
    previous slice `[-0:]` selected every cluster instead.
    '''
    by_population = sorted(cluster_list,
                           key=lambda cluster: cluster.total_population())
    # Explicit start index: a negative slice breaks for num_clusters == 0
    # because -0 == 0 selects the whole list.
    first_seed = max(len(by_population) - num_clusters, 0)
    k_centers = by_population[first_seed:]

    for dummy_iteration in range(num_iterations):
        new_sets = [alg_cluster.Cluster(set([]), 0, 0, 0, 0)
                    for dummy_idx in range(num_clusters)]

        for point in cluster_list:
            min_dist = float('inf')
            closest_center = -1
            for k_idx in range(num_clusters):
                current_dist = point.distance(k_centers[k_idx])
                # Strict '<': the first of several equidistant centers wins.
                if current_dist < min_dist:
                    closest_center = k_idx
                    min_dist = current_dist
            new_sets[closest_center].merge_clusters(point)

        # The merged clusters are the centers of the next round.
        k_centers = new_sets
    return k_centers
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters.

    Note: cluster_list is only read; sorting is done on a copy.

    Input: List of clusters, integers number of clusters and number of
    iterations.
    Output: List of clusters whose length is num_clusters.  When
    num_iterations is 0 the most populous input clusters are returned.
    """
    # Initial centers: the most populous clusters, largest first.  A plain
    # list replaces the index-keyed dict, whose .values() is not a list on
    # Python 3.
    centers = sorted(cluster_list,
                     key=lambda z: z.total_population(),
                     reverse=True)[:num_clusters]

    # range() instead of the Python-2-only xrange().
    for dummy_iter in range(num_iterations):
        clusters = [
            alg_cluster.Cluster(fips_codes=set(), horiz_pos=0, vert_pos=0,
                                population=0, risk=0)
            for dummy_idx in range(num_clusters)
        ]
        for member in cluster_list:
            # min() keeps the first of several equidistant centers.
            minimum = min(range(num_clusters),
                          key=lambda f: member.distance(centers[f]))
            clusters[minimum].merge_clusters(member)
        centers = clusters
    return list(centers)
def run_example():
    """
    Load the DATA_3108_URL table, compute 15 hierarchical clusters and plot
    them.

    DESKTOP selects the backend: matplotlib (with cluster centers) when
    true, simplegui otherwise.
    """
    data_table = load_data_table(DATA_3108_URL)
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2],
                            line[3], line[4])
        for line in data_table
    ]

    # sequential_clustering or alg_project3_solution's hierarchical/k-means
    # functions can be substituted here.
    cluster_list = hierarchical_clustering(singleton_list, 15)
    # The label used to read "sequential clusters" although hierarchical
    # clustering is what runs.  A single-argument print gives the same
    # space-separated output under Python 2 and 3.
    print(" ".join(["Displaying", str(len(cluster_list)),
                    "hierarchical clusters"]))

    # Draw the clusters using matplotlib or simplegui.
    if DESKTOP:
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)
    else:
        alg_clusters_simplegui.PlotClusters(data_table, cluster_list)
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters.

    Note: every step works on copies of the clusters in cluster_list.

    Input: List of clusters, integers number of clusters and number of
    iterations.
    Output: List of clusters, one per center.
    """
    working = [cluster.copy() for cluster in cluster_list]

    # Initial centers: the most populous copies, largest first.
    ranked = sorted(working, key=lambda cls: cls.total_population(),
                    reverse=True)
    initial_centers = ranked[:num_clusters]

    for dummy_iteration in range(num_iterations):
        # Pair an empty accumulator with the center it is measured by.
        pairs = []
        for center in initial_centers:
            pairs.append((alg_cluster.Cluster(set([]), 0, 0, 0, 0), center))

        for member in working:
            # The pair whose center is closest receives the member.
            accumulator, dummy_center = min(
                pairs, key=lambda pair: pair[1].distance(member))
            accumulator.merge_clusters(member)

        # The accumulators are the centers of the next iteration.
        initial_centers = [accumulator
                           for accumulator, dummy_center in pairs]
    return initial_centers
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters.

    Input: List of clusters, number of clusters, number of iterations.
    Output: List of clusters whose length is num_clusters.  When
    num_iterations is 0 the most populous input clusters are returned.
    """
    cluster_list_sorted = sorted(cluster_list,
                                 key=lambda x: x.total_population(),
                                 reverse=True)
    # Initial centers are the clusters with the largest populations.
    k_clusters = cluster_list_sorted[:num_clusters]

    # range() instead of the Python-2-only xrange().
    for dummy_iter in range(num_iterations):
        # Accumulators start with population 0, like every other k-means
        # implementation in this file; they used to start at 1.
        # NOTE(review): presumably merge_clusters adds populations and
        # weights the center by them, so a starting 1 would count one extra
        # person at (0, 0) -- confirm against alg_cluster.
        new_clusters = [alg_cluster.Cluster(set([]), 0, 0, 0, 0)
                        for dummy_idx in range(num_clusters)]

        for member in cluster_list:
            current_dist = [member.distance(k_clusters[idx_l])
                            for idx_l in range(num_clusters)]
            # Index of the first smallest distance.
            idx_l = min(range(len(current_dist)),
                        key=current_dist.__getitem__)
            new_clusters[idx_l].merge_clusters(member)

        k_clusters = new_clusters[:]
    return k_clusters
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters.

    Input: List of clusters, number of clusters, number of iterations.
    Output: List of clusters whose length is num_clusters.  When
    num_iterations is 0 the selected seed clusters (objects from
    cluster_list, smallest of the seeds first) are returned.
    """
    # Pick the num_clusters most populous clusters; a chosen entry is
    # marked with -1 so the next max() skips it.
    populations = [item.total_population() for item in cluster_list]
    cluster_list_k = []
    for dummy_k in range(num_clusters):
        idx = populations.index(max(populations))
        cluster_list_k.append(cluster_list[idx])
        populations[idx] = -1
    cluster_list_k.reverse()

    # Defined up front so that zero iterations still returns a list.
    temp_clusters = list(cluster_list_k)
    for dummy_iter in range(num_iterations):
        # Empty accumulators placed at the current centers.
        temp_clusters = [
            alg_cluster.Cluster(set([]), item.horiz_center(),
                                item.vert_center(), 0, 0.0)
            for item in cluster_list_k
        ]
        for member in cluster_list:
            min_index = 0
            # Infinity rather than a fixed 1e8 cutoff, so arbitrarily large
            # distances are still compared correctly.
            min_value = float('inf')
            for idx, center in enumerate(cluster_list_k):
                dist = member.distance(center)
                if dist < min_value:
                    min_index = idx
                    min_value = dist
            temp_clusters[min_index].merge_clusters(member)

        # The merged clusters are the centers of the next round.
        cluster_list_k = temp_clusters
    return temp_clusters
def gen_singleton_list(data_table):
    """
    Build one cluster per row of data_table.

    Each row is indexed 0..4: element 0 becomes the only member of the
    cluster's set, elements 1..4 are passed on as the remaining four
    Cluster arguments.
    """
    return [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters.

    Note: cluster_list is only read; sorting is done on a copy.

    Input: List of clusters, integers number of clusters and number of
    iterations.
    Output: List of clusters whose length is num_clusters.
    """
    # Initial centers: the most populous clusters, largest first.
    ranked = sorted(cluster_list[::],
                    key=lambda cluster: cluster.total_population(),
                    reverse=True)
    clusters = ranked[:num_clusters]

    for dummy_iter in range(num_iterations):
        new_clusters = [alg_cluster.Cluster(set(), 0, 0, 0, 0)
                        for dummy_idx in range(num_clusters)]

        for member in cluster_list:
            # Linear scan for the closest current center.
            closest = float("inf")
            for idx, candidate in enumerate(clusters):
                dist = candidate.distance(member)
                if dist < closest:
                    closest = dist
                    center = idx
            new_clusters[center].merge_clusters(member)

        # The merged clusters are the centers of the next round.
        clusters = new_clusters
    return clusters
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters.

    Note: cluster_list is only read; sorting is done on a copy.

    Input: List of clusters, integers number of clusters and number of
    iterations.
    Output: List of clusters whose length is num_clusters.
    """
    # Initial centers: the most populous clusters, largest first.
    ranked = list(cluster_list)
    ranked.sort(key=lambda x: x.total_population(), reverse=True)
    clusters = ranked[:num_clusters]

    for dummy_iter in range(num_iterations):
        # One empty accumulator per center.
        empty_cluster = [alg_cluster.Cluster(set([]), 0, 0, 0, 0)
                         for dummy_idx in range(num_clusters)]

        for member in cluster_list:
            distance = float('inf')
            merge_with = None
            for candidate in clusters:
                candidate_distance = member.distance(candidate)
                if candidate_distance < distance:
                    distance = candidate_distance
                    merge_with = candidate
            # The accumulator sits at the same position as its center.
            slot = clusters.index(merge_with)
            empty_cluster[slot].merge_clusters(member)

        clusters = empty_cluster
    return clusters
def gen_random_clusters(num_clusters):
    """
    Return a list of num_clusters clusters with empty member sets whose
    horizontal and vertical positions are each random.random() * 2 - 1.
    """
    def random_cluster():
        # Draw x before y so the random sequence is consumed in that order.
        horiz = random.random() * 2 - 1
        vert = random.random() * 2 - 1
        return alg_cluster.Cluster(set([]), horiz, vert, 0, 0)

    return [random_cluster() for dummy_idx in range(num_clusters)]