def setUp(self):
    """
    Run before each test.

    Stores three fixture clusters in self._cluster_list and a
    subtraction-based comparison function in self._compare_func.
    """
    fixture_args = [
        (set([32001, 32013, 32031]), 2, 2, 400, .1),
        (set([51121, 51155, 51161]), 0, 0, 600, .2),
        (set([51059, 51013, 51107]), -2, -2, 800, .3),
    ]
    self._cluster_list = [Cluster(*args) for args in fixture_args]
    self._compare_func = lambda x1, x2: x1 - x2
def test_closest_pair_strip(self):
    """
    Check closest_pair_strip on the shared fixture list and on a
    separate five-cluster list.
    """
    fixture_result = util_clustering.closest_pair_strip(
        self._cluster_list, -1, 2)
    self.assertEqual(fixture_result, (math.sqrt(8), 1, 2))

    # Every cluster gets its own code list so no two clusters share one.
    strip_clusters = [
        Cluster([51121, 51155, 51161], -7, 3, 600, .2),
        Cluster([51059, 51013, 51107], -1, 1, 800, .3),
        Cluster([51121, 51155, 51161], 3, 6, 600, .2),
        Cluster([32001, 32013, 32031], 3, 5, 400, .1),
        Cluster([32001, 32013, 32031], 4, 6, 400, .1),
    ]
    strip_result = util_clustering.closest_pair_strip(
        strip_clusters, 2, math.sqrt(2))
    self.assertEqual(strip_result, (1.0, 2, 3))
def test_special(self):
    """
    fast_closest_pair must not report the pair (0, 1) for this input.

    The first cluster sits at (0, 0); the other three all sit at (10, 10).
    """
    points = [
        Cluster(set([1, 2, 3, 4]), 0.0, 0.0, 55, 0.176363636364),
        Cluster(set([10, 11]), 10.0, 10.0, 28, 0.2),
        Cluster(set([12]), 10, 10, 14, 0.2),
        Cluster(set([13]), 10, 10, 14, 0.2),
    ]
    # The distance is unpacked only to enforce the three-element result.
    _distance, first, second = fast_closest_pair(points)
    self.assertNotEqual((first, second), (0, 1))
def hierarchical_clustering(cluster_list, num_clusters):
    """
    Compute a hierarchical clustering of a set of clusters
    Note: the function mutates cluster_list

    Input: List of clusters, number of clusters
    Output: List of clusters whose length is num_clusters
            (the same list object that was passed in)
    """
    while len(cluster_list) > num_clusters:
        (dummy_distance, inx1, inx2) = fast_closest_pair(cluster_list)
        # Fold the second cluster of the closest pair into the first ...
        Cluster.merge_clusters(cluster_list[inx1], cluster_list[inx2])
        # ... and drop it by position. list.remove() would delete the first
        # element that compares equal, which need not be the one at inx2.
        del cluster_list[inx2]
    return cluster_list
def test():
    """
    Smoke-run kmeans_clustering on four hand-made clusters.

    Prints the result of asking for 2 clusters with 2 iterations.
    """
    clusters = [
        Cluster(set(["Al"]), 0, 0, 10, 20),
        Cluster(set(["DK"]), 1, 1, 10, 10),
        Cluster(set(["SW"]), 9, 9, 10, 1000),
        Cluster(set(["Brasil"]), 10, 10, 100, 10),
    ]
    # Parenthesised single-argument print produces the same output under
    # Python 2 and is also valid Python 3 syntax.
    print(kmeans_clustering(clusters, 2, 2))
def test():
    """
    Smoke-run hierarchical_clustering on three hand-made clusters.

    Prints the result of merging them down to a single cluster.
    """
    clusters = [
        Cluster(set(["Al"]), 1.1, 10, 0, 0),
        Cluster(set(["DK"]), 1, 2, 100, 0.01),
        Cluster(set(["SW"]), 4, 6, 200, 0.05),
    ]
    # Parenthesised single-argument print produces the same output under
    # Python 2 and is also valid Python 3 syntax.
    print(hierarchical_clustering(clusters, 1))
def test_four_pairs(self):
    """
    Hierarchical and k-means clustering should yield the same code groups.

    Eight single-code clusters are placed four at (0, 0) and four at
    (10, 10). Both algorithms are asked for two clusters, and the sorted
    code lists of their results are compared.
    """
    s = [Cluster(*args) for args in [
        (set([1]), 0, 0, 13, 0.1),
        (set([2]), 0, 0, 14, 0.2),
        (set([3]), 0, 0, 14, 0.2),
        (set([4]), 0, 0, 14, 0.2),
        (set([10]), 10, 10, 14, 0.2),
        (set([11]), 10, 10, 14, 0.2),
        (set([12]), 10, 10, 14, 0.2),
        (set([13]), 10, 10, 14, 0.2),
    ]]
    # NOTE(review): the same list `s` is handed to both algorithms, so any
    # in-place merging done by hierarchical_clustering is visible to
    # kmeans_clustering -- confirm this is intended.
    h = [x.fips_codes() for x in hierarchical_clustering(s, 2)]
    k = [x.fips_codes() for x in kmeans_clustering(s, 2, 10)]
    # Sort the codes inside each group as well as the groups themselves:
    # set iteration order is not guaranteed, so list(set) is not a reliable
    # basis for an equality check.
    h = sorted(sorted(x) for x in h)
    k = sorted(sorted(x) for x in k)
    self.assertEqual(h, k)
def test_one_pair(self):
    """
    Clustering two clusters down to one must keep both of their codes.
    """
    pair = [Cluster(*args) for args in [(set([5]), 0, 0, 13, 0.1),
                                        (set([10]), 42, 0, 14, 0.2)]]
    merged_codes = set()
    for cluster in hierarchical_clustering(pair, 1):
        merged_codes.update(cluster.fips_codes())
    self.assertEqual(set([5, 10]), merged_codes)
def question10(data, filename):
    """
    Plot clustering distortion against the number of output clusters.

    Loads `data` both as a table and as a cluster list, collects
    distortion values for hierarchical and k-means clustering over the
    cluster counts 6..20, and saves the line plot to `filename`.
    """
    table = load_data_table(data)
    clusters = Cluster.load_as_list(data)
    cluster_counts = range(6, 21)
    hier_distortions = []

    def record_distortion(current_clusters):
        # Callback handed to hierarchical_clustering; presumably invoked
        # once for each cluster count in the set passed alongside it.
        hier_distortions.append(distortion(current_clusters, table))

    hierarchical_clustering(clusters, 6, record_distortion,
                            set(cluster_counts))
    # Reverse the recorded values before plotting them against the
    # ascending cluster counts.
    hier_distortions.reverse()

    kmeans_distortions = []
    for count in cluster_counts:
        kmeans_result = kmeans_clustering(clusters, count, 5)
        kmeans_distortions.append(distortion(kmeans_result, table))

    plt.cla()
    plt.plot(cluster_counts, hier_distortions, '-r',
             label='Hierarchical clustering distortion')
    plt.plot(cluster_counts, kmeans_distortions, '-b',
             label='K-means clustering distortion')
    plt.title('Clustering distortion (%s)' % data)
    plt.xlabel('Number of output clusters')
    plt.ylabel('Distortion')
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig(filename)
    print('Saved plot to %s' % filename)
def renew_centering(cluster_list, center_clusters,
                    num_clusters, num_iterations):
    """
    Function renews the centering.

    On every iteration each cluster in cluster_list is merged into the
    new cluster that corresponds to its nearest center, and the new
    clusters then become the centers for the next iteration.

    Return a list of clusters. With zero iterations the given
    center_clusters list is returned unchanged.
    """
    # Start from the given centers so that a zero-iteration call has a
    # defined result instead of referencing an unbound local.
    clustering_list = center_clusters
    for dummy_iteration in range(num_iterations):
        # Initialize num_clusters empty clusters whose counties are the
        # empty set and whose populations are zero.
        clustering_list = [Cluster(set([]), 0, 0, 0, 0)
                           for dummy_num in range(num_clusters)]
        # For each cluster, find the min distance center-cluster pair
        # and merge the cluster into the matching new cluster.
        for cluster_index, cluster in enumerate(cluster_list):
            center_cluster_distance_list = [
                pair_distance_not_sort([cluster, center_cluster],
                                       cluster_index, center_index)
                for center_index, center_cluster in enumerate(center_clusters)
            ]
            # Element 2 of the smallest result selects the target cluster;
            # presumably it is the center index passed in above.
            min_center_cluster_distance = min(center_cluster_distance_list)
            clustering_list[min_center_cluster_distance[2]].merge_clusters(
                cluster)
        center_clusters = clustering_list
    return clustering_list
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters

    Input: List of clusters, number of clusters, number of iterations
    Output: List of clusters whose length is num_clusters, or the length
            of cluster_list when that is smaller
    """

    def nearest_to(point, clusters):
        """Find index of the nearest cluster to the point"""
        distances = [point.distance(cluster) for cluster in clusters]
        # index() of the minimum picks the first of several equally
        # near clusters.
        return distances.index(min(distances))

    # initialize k-means clusters to be initial clusters with largest
    # populations; copies keep the caller's clusters untouched
    centers = sorted((cluster.copy() for cluster in cluster_list),
                     key=lambda cluster: cluster.total_population(),
                     reverse=True)[:num_clusters]
    if not centers:
        # Nothing to assign to: no input clusters or no centers requested.
        return centers

    for dummy_iteration in range(num_iterations):
        # One empty group per actual center. Sizing by num_clusters would
        # overrun `centers` when fewer input clusters exist.
        groups = [Cluster(set(), 0, 0, 0, 0) for dummy_center in centers]
        for cluster in cluster_list:
            groups[nearest_to(cluster, centers)].merge_clusters(cluster)
        centers = [group.copy() for group in groups]
    return centers
def test_one_pair(self):
    """
    With only two clusters, the slow and fast closest-pair functions
    must both report that pair.
    """
    # Constructor arguments: codes, x, y, population, risk
    specs = [(5, 0, 0, 13, 0.1), (10, 42, 0, 14, 0.2)]
    clusters = [Cluster(*spec) for spec in specs]
    slow_result = list(slow_closest_pairs(clusters))[0]
    fast_result = fast_closest_pair(clusters)
    self.assertEqual(slow_result, fast_result)
    self.assertEqual(fast_result, (42.0, 0, 1))
def load_as_list(filename):
    """
    Read clusters from a comma-separated text file.

    Each non-blank line must hold exactly five fields: code, x, y,
    population, risk. The code is kept as a string inside a one-element
    set; x, y and risk are parsed as floats and population as an int.

    Returns a list with one Cluster per non-blank line.
    Raises ValueError for a line that does not have five fields or whose
    numeric fields cannot be parsed.
    """
    clusters = []
    with open(filename) as f:
        # Iterate the file directly instead of loading every line first.
        for line in f:
            if not line.strip():
                # Blank lines, such as a trailing one, carry no record.
                continue
            fips, x, y, pop, risk = line.split(',')
            clusters.append(
                Cluster(set([fips]), float(x), float(y), int(pop),
                        float(risk)))
    return clusters
def gen_random_clusters(num_clusters):
    """
    Build num_clusters clusters, each placed at a random point of the
    square with corners (plus/minus 1, plus/minus 1).

    Every cluster is created with an empty code list, 1 as its fourth
    argument and 0 as its fifth.
    """
    clusters = []
    for _ in range(num_clusters):
        # Draw the horizontal coordinate before the vertical one.
        horiz = random.uniform(-1, 1)
        vert = random.uniform(-1, 1)
        clusters.append(Cluster([], horiz, vert, 1, 0))
    return clusters
def gen_random_clusters(num_clusters):
    """
    Build a list of num_clusters clusters, each placed at a randomly
    generated point in the square with corners (+-1, +-1).

    Each coordinate is a random magnitude from random() multiplied by a
    randomly sampled sign. The clusters get an empty code set and zeros
    for the last two constructor arguments.
    """
    rand_cluster_list = []
    for dummy_num in xrange(num_clusters):
        # Magnitude first, then sign, for x and then for y.
        horiz = random() * sample([1, -1], 1)[0]
        vert = random() * sample([1, -1], 1)[0]
        rand_cluster_list.append(Cluster(set([]), horiz, vert, 0, 0))
    return rand_cluster_list
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters

    :param cluster_list: List of clusters
    :param num_clusters: number of clusters
    :param num_iterations: number of iterations
    :return: List of clusters whose length is num_clusters (or the length
        of cluster_list when that is smaller). With zero iterations,
        copies of the initial centers are returned.
    """
    # initialize k-means clusters to be initial clusters with largest
    # populations
    centers = sorted(cluster_list,
                     key=lambda x: x.total_population(),
                     reverse=True)[:num_clusters]
    if not centers:
        # No input clusters or no centers requested: nothing to assign.
        return []

    # Defined result for a zero-iteration call; copies so that the caller
    # does not receive aliases of its own input clusters.
    result = [center.copy() for center in centers]

    # main loop
    while num_iterations > 0:
        num_iterations -= 1  # update countdown of iterations
        # Empty clusters positioned at the current centers, so a group
        # that receives no members keeps its center.
        result = [
            Cluster(set([]), center.horiz_center(), center.vert_center(),
                    0, 0.0) for center in centers
        ]
        # organize free clusters into k sets
        for cluster in cluster_list:
            # Measure against the centers as they stood at the start of
            # this iteration, not against the groups being merged into
            # during this same pass.
            distances = [center.distance(cluster) for center in centers]
            # merge current cluster into the group of the closest center
            result[distances.index(min(distances))].merge_clusters(cluster)
        # update the center of every cluster, taking it from the groups
        centers = [
            Cluster(set([]), group.horiz_center(), group.vert_center(),
                    0, 0.0) for group in result
        ]
    return result
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters
    Note: the function may not mutate cluster_list

    Input: List of clusters, integers number of clusters and number of
           iterations
    Output: List of clusters whose length is num_clusters (or the length
            of cluster_list when that is smaller). With zero iterations
            the initial centers are returned.
    """
    # Initialize the centers: copies of the input clusters with the
    # largest populations. Slicing tolerates num_clusters exceeding the
    # number of input clusters.
    centers = sorted((cluster.copy() for cluster in cluster_list),
                     key=lambda cluster: cluster.total_population(),
                     reverse=True)[:num_clusters]
    if not centers:
        # No input clusters or no centers requested: nothing to assign.
        return centers

    while num_iterations > 0:
        # One empty cluster per center.
        new_clusters = [Cluster(set(), 0, 0, 0, 0)
                        for dummy_center in centers]
        for cluster in cluster_list:
            # argmin over the centers; the first of equally near centers
            # wins because only a strictly smaller distance replaces it
            nearest = 0
            distance = float('inf')
            for index, center in enumerate(centers):
                current_distance = center.distance(cluster)
                if current_distance < distance:
                    distance = current_distance
                    nearest = index
            new_clusters[nearest].merge_clusters(cluster)
        # The freshly built clusters become the centers of the next round.
        centers = new_clusters
        num_iterations -= 1
    return centers
def test_three_vert_pairs(self):
    """
    Six clusters on the line x = 0: the slow and fast closest-pair
    functions must agree, and the expected result is (1.0, 2, 3).
    """
    specs = [
        (5, 0, 0, 13, 0.1),
        (10, 0, 2, 14, 0.2),
        (15, 0, 4, 15, 0.2),
        (20, 0, 5, 16, 0.2),
        (25, 0, 7, 17, 0.2),
        (30, 0, 9, 18, 0.2),
    ]
    clusters = [Cluster(*spec) for spec in specs]
    slow_result = list(slow_closest_pairs(clusters))[0]
    fast_result = fast_closest_pair(clusters)
    self.assertEqual(slow_result, fast_result)
    self.assertEqual(slow_result, (1.0, 2, 3))
def get_random_clusters(num_clusters):
    """
    Return num_clusters clusters whose x and y are each drawn as
    random() * 2 - 1.

    Every cluster carries the code set {'0'} and 1 for the last two
    constructor arguments.
    """
    # Arguments are evaluated left to right, so x is drawn before y.
    return [
        Cluster(set(['0']), random() * 2 - 1, random() * 2 - 1, 1, 1)
        for dummy_idx in range(num_clusters)
    ]
def load_data(data_file):
    '''
    load the data

    Reads a comma-separated text file and builds one Cluster per
    non-blank line from its first five fields, parsed as
    int, float, float, int, float.

    Returns the list of clusters.
    Raises IndexError for a line with fewer than five fields and
    ValueError for a field that cannot be parsed.
    '''
    clusters = []
    # The with-block closes the file even when a line fails to parse.
    with open(data_file) as data:
        for line in data:
            if not line.strip():
                # A trailing newline or empty file yields no record.
                continue
            content = line.split(',')
            clusters.append(
                Cluster(int(content[0]), float(content[1]),
                        float(content[2]), int(content[3]),
                        float(content[4])))
    return clusters
def test_slow_closest_pair(self):
    """
    Check slow_closest_pair on the shared fixture, on six clusters and
    on a two-cluster list.
    """
    def unit_clusters(points):
        # Clusters with an empty code set, built from (x, y) pairs.
        return [Cluster(set([]), horiz, vert, 1, 0)
                for horiz, vert in points]

    fixture_result = util_clustering.slow_closest_pair(self._cluster_list)
    self.assertEqual(fixture_result, (math.sqrt(8), 0, 1))

    six_clusters = unit_clusters([(0.38, 0.26), (0.42, 0.03), (0.48, 0.23),
                                  (0.8, 0.65), (0.95, 0.85), (0.97, 0.61)])
    six_result = util_clustering.slow_closest_pair(six_clusters)
    self.assertEqual(six_result, (0.10440306508910548, 0, 2))

    two_clusters = unit_clusters([(0.38, 0.26), (0.42, 0.03)])
    two_result = util_clustering.slow_closest_pair(two_clusters)
    self.assertEqual(two_result, (0.23345235059857505, 0, 1))
def test_compare_x(self):
    """
    Check compare_x for a negative, a zero and a positive expected
    result.
    """
    # ((x, y) of first cluster, (x, y) of second cluster, expected value)
    cases = [
        ((-2, 2), (0, 0), -2),
        ((-2, 2), (-2, 2), 0),
        ((-2, -4), (-4, 2), 2),
    ]
    for first_pos, second_pos, expected in cases:
        cluster_1 = Cluster([22152], first_pos[0], first_pos[1], 0, 0)
        cluster_2 = Cluster([90210], second_pos[0], second_pos[1], 0, 0)
        self.assertEqual(
            util_clustering.compare_x(cluster_1, cluster_2), expected)
def test_compare_y(self):
    """
    Check compare_y on [number, cluster] pairs for a positive, a zero
    and a negative expected result.
    """
    # (number and (x, y) of the first entry, same for the second entry,
    #  expected value)
    cases = [
        ((1, (-2, 2)), (3, (0, 0)), 2),
        ((0, (-2, 2)), (5, (0, 2)), 0),
        ((2, (-2, -4)), (4, (0, 2)), -6),
    ]
    for first, second, expected in cases:
        entry_1 = [first[0], Cluster([22152], first[1][0], first[1][1], 0, 0)]
        entry_2 = [second[0],
                   Cluster([90210], second[1][0], second[1][1], 0, 0)]
        self.assertEqual(
            util_clustering.compare_y(entry_1, entry_2), expected)
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    '''
    Takes a list of Cluster objects and applies kmeans clustering

    Returns a list of clusters of desired number after specific number
    of operations. The list is empty when no iteration runs or when
    there is no center to start from.
    '''
    # initialize num_clusters centers
    # use the centers of those clusters which have the largest populations
    populations = sorted((item.total_population(), idx)
                         for idx, item in enumerate(cluster_list))
    # Take the tail through an explicit start index: the slice
    # [-num_clusters:] would select every entry when num_clusters is 0.
    start = max(len(populations) - num_clusters, 0)
    centers = [(cluster_list[idx].horiz_center(),
                cluster_list[idx].vert_center())
               for dummy_population, idx in populations[start:]]

    final_clusters = []
    if not centers:
        # Without centers there is nothing a cluster could be merged into.
        return final_clusters

    while num_iterations > 0:
        # Empty clusters at the current centers serve as fixed reference
        # points; members are merged into separate copies of them.
        anchors = [Cluster(set(), horiz, vert, 0, 0)
                   for horiz, vert in centers]
        merged = [anchor.copy() for anchor in anchors]

        # find distance of clusters from newly created clusters
        for member in cluster_list:
            best_dist = float('inf')
            best_idx = None
            for idx, anchor in enumerate(anchors):
                dist = member.distance(anchor)
                # Replace unless the current best is strictly smaller,
                # so the last of equally near anchors wins.
                if not best_dist < dist:
                    best_dist, best_idx = dist, idx
            # merge the cluster into its closest anchor's copy
            merged[best_idx].merge_clusters(member)

        # update the centers of the desired clusters
        centers = [(item.horiz_center(), item.vert_center())
                   for item in merged]
        final_clusters = merged
        num_iterations -= 1
    return final_clusters
def question10(data, filename):
    """
    Save a plot comparing the distortion of hierarchical and k-means
    clustering.

    `data` is loaded as a table and as a cluster list. Distortion values
    are collected for the cluster counts 6..20 and drawn as two lines;
    the figure is written to `filename`.
    """
    table = load_data_table(data)
    clusters = Cluster.load_as_list(data)
    sizes = range(6, 21)
    hier_series = []

    def collect(snapshot):
        # Callback given to hierarchical_clustering together with the
        # set of sizes; presumably called once per listed size.
        hier_series.append(distortion(snapshot, table))

    hierarchical_clustering(clusters, 6, collect, set(sizes))
    # Values are reversed so they can be plotted against ascending sizes.
    hier_series.reverse()

    kmeans_series = [
        distortion(kmeans_clustering(clusters, size, 5), table)
        for size in sizes
    ]

    plt.cla()
    series = [
        (hier_series, '-r', 'Hierarchical clustering distortion'),
        (kmeans_series, '-b', 'K-means clustering distortion'),
    ]
    for values, line_format, label in series:
        plt.plot(sizes, values, line_format, label=label)
    plt.title('Clustering distortion (%s)' % data)
    plt.xlabel('Number of output clusters')
    plt.ylabel('Distortion')
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig(filename)
    print('Saved plot to %s' % filename)
def closest_pair_strip(cluster_list, horiz_center, half_width):
    """
    Helper function to compute the closest pair of clusters in a vertical
    strip

    Input: cluster_list is a list of clusters produced by fast_closest_pair
    horiz_center is the horizontal position of the strip's vertical center
    line
    half_width is the half the width of the strip (i.e. the maximum
    horizontal distance that a cluster can lie from the center line)

    Output: tuple of the form (dist, idx1, idx2) where the centers of the
    clusters cluster_list[idx1] and cluster_list[idx2] lie in the strip and
    have minimum distance dist. idx1 < idx2 are positions in cluster_list.
    When fewer than two clusters lie in the strip, the result is
    (float('inf'), -1, -1).
    """
    # Positions (not cluster objects) of the clusters inside the strip, so
    # the result can refer back to cluster_list.
    strip = [idx for idx, cluster in enumerate(cluster_list)
             if abs(cluster.horiz_center() - horiz_center) < half_width]
    strip.sort(key=lambda idx: cluster_list[idx].vert_center())

    count = len(strip)
    best = (float('inf'), -1, -1)
    for pos1 in range(count - 1):
        # Compare each cluster only with the next three in vertical order,
        # without running past the end of the strip.
        for pos2 in range(pos1 + 1, min(pos1 + 4, count)):
            idx1, idx2 = strip[pos1], strip[pos2]
            dist = cluster_list[idx1].distance(cluster_list[idx2])
            if dist < best[0]:
                best = (dist, min(idx1, idx2), max(idx1, idx2))
    return best
def test_are_clusters_equal(self):
    """
    are_clusters_equal must reject clusters that differ from the first
    fixture cluster and accept one built from the same values.
    """
    reference = self._cluster_list[0]
    self.assertFalse(util_clustering.are_clusters_equal(
        reference, self._cluster_list[1]))

    # Constructor arguments of clusters that must not match the reference.
    mismatches = [
        (2, 2, 400, .1),
        (3, 2, 400, .1),
        (3, 5, 400, .1),
        (3, 2, 425, .1),
        (3, 2, 400, 1.1),
    ]
    for horiz, vert, population, risk in mismatches:
        other = Cluster(set([32001, 32013, 32041]),
                        horiz, vert, population, risk)
        self.assertFalse(util_clustering.are_clusters_equal(reference, other))

    twin = Cluster(set([32001, 32013, 32031]), 2, 2, 400, .1)
    self.assertTrue(util_clustering.are_clusters_equal(reference, twin))
def make(_):
    """
    Return a cluster whose x and y are each drawn as random() * 2 - 1.

    The argument is ignored. The cluster carries the code set {'0'} and
    1 for the last two constructor arguments.
    """
    horiz = random() * 2 - 1
    vert = random() * 2 - 1
    return Cluster(set(['0']), horiz, vert, 1, 1)
def test_fast_closest_pair(self):
    """
    Check fast_closest_pair on the shared fixture, on five clusters with
    code lists and on six clusters with empty code sets.
    """
    fixture_result = util_clustering.fast_closest_pair(self._cluster_list)
    self.assertEqual(fixture_result, (math.sqrt(8), 0, 1))

    # Every cluster gets its own code list so no two clusters share one.
    county_clusters = [
        Cluster([51121, 51155, 51161], -7, 3, 600, .2),
        Cluster([51059, 51013, 51107], -1, 1, 800, .3),
        Cluster([51121, 51155, 51161], 3, 6, 600, .2),
        Cluster([32001, 32013, 32031], 3, 5, 400, .1),
        Cluster([32001, 32013, 32031], 4, 6, 400, .1),
    ]
    county_result = util_clustering.fast_closest_pair(county_clusters)
    self.assertEqual(county_result, (1.0, 2, 3))

    unit_points = [(0.38, 0.26), (0.42, 0.03), (0.48, 0.23),
                   (0.8, 0.65), (0.95, 0.85), (0.97, 0.61)]
    unit_clusters = [Cluster(set([]), horiz, vert, 1, 0)
                     for horiz, vert in unit_points]
    unit_result = util_clustering.fast_closest_pair(unit_clusters)
    self.assertEqual(unit_result, (0.10440306508910548, 0, 2))
def test_min_dist_to_cluster(self):
    """
    min_dist_to_cluster on the fixture list and a probe cluster at
    (3, 3) is expected to return 0.
    """
    probe = Cluster(set([22140, 22141, 22142]), 3, 3, 400, .1)
    result = util_clustering.min_dist_to_cluster(self._cluster_list, probe)
    self.assertEqual(result, 0)
def test_are_cluster_lists_equal(self):
    """
    are_cluster_lists_equal must reject lists that differ from the
    fixture list and accept a list built from the same values.
    """
    compare = util_clustering.are_cluster_lists_equal

    # First cluster has a different code (32133 instead of 32013).
    different_code = [
        Cluster(set([32001, 32133, 32031]), 2, 2, 400, .1),
        Cluster(set([51121, 51155, 51161]), 0, 0, 600, .2),
        Cluster(set([51059, 51013, 51107]), -2, -2, 800, .3),
    ]
    self.assertFalse(compare(self._cluster_list, different_code))

    # Last cluster has a different fourth argument (1200 instead of 800).
    different_population = [
        Cluster(set([32001, 32013, 32031]), 2, 2, 400, .1),
        Cluster(set([51121, 51155, 51161]), 0, 0, 600, .2),
        Cluster(set([51059, 51013, 51107]), -2, -2, 1200, .3),
    ]
    self.assertFalse(compare(self._cluster_list, different_population))

    # One extra cluster in front of the fixture values.
    extra_cluster = [
        Cluster(set([51121, 51155, 51161]), -7, 3, 600, .2),
        Cluster(set([32001, 32013, 32031]), 2, 2, 400, .1),
        Cluster(set([51121, 51155, 51161]), 0, 0, 600, .2),
        Cluster(set([51059, 51013, 51107]), -2, -2, 800, .3),
    ]
    self.assertFalse(compare(self._cluster_list, extra_cluster))

    # Same values as the fixture list.
    identical = [
        Cluster(set([32001, 32013, 32031]), 2, 2, 400, .1),
        Cluster(set([51121, 51155, 51161]), 0, 0, 600, .2),
        Cluster(set([51059, 51013, 51107]), -2, -2, 800, .3),
    ]
    self.assertTrue(compare(self._cluster_list, identical))
> cluster_list[dummy_right].total_population(): index_list.append(index_left[dummy_left]) if dummy_left < len(index_left) - 1: dummy_left = dummy_left + 1 print 'dummy_left=', dummy_left else: index_list.append(index_right[dummy_right]) if dummy_right < len(index_right) - 1: dummy_right = dummy_right + 1 print 'dummy_right=', dummy_right return index_list if __name__ == "__main__": CLUSTER_LIST = [Cluster(set([]), 0, 0, 12, 0),\ Cluster(set([]), 0, 0, 20, 0),\ Cluster(set([]), 0, 0, 30, 0),\ Cluster(set([]), 0, 0, 40, 0),\ Cluster(set([]), 0, 0, 100, 0),\ Cluster(set([]), 0, 0, 90, 0),\ Cluster(set([]), 0, 0, 80, 0),\ Cluster(set([]), 0, 0, 70, 0),\ Cluster(set([]), 0, 0, 1, 0),\ Cluster(set([]), 0, 0, 2, 0),\ Cluster(set([]), 0, 0, 3, 0),\ Cluster(set([]), 0, 0, 4, 0),\ Cluster(set([]), 0, 0, 5, 0),\ Cluster(set([]), 0, 0, 6, 0),\ Cluster(set([]), 0, 0, 7, 0)] INDEX = range(15)