# NOTE(review): this physical line holds many statements run together.
#
# It opens with the tail of a function whose `def` header lies above this
# chunk: a loop that adds cluster.cluster_error(datatable) to `ans` for every
# cluster in cluster_list, followed by `return ans`.  The script part below
# calls compute_distortion(cluster_list, data_table) -- presumably that same
# function; confirm against the header.
#
# The remainder is the "Quality Analysis - 896" script:
#   1. Calls load_data_table(DATA_896_URL) twice and, for each table, builds a
#      list with one alg_cluster.Cluster per row (row[0] wrapped in a
#      one-element set, row[1]..row[4] passed through unchanged).
#   2. Seeds cluster_list1 with cl.hierarchical_clustering(singleton_list1, 21).
#   3. Iterates i over range(20, 5, -1), i.e. 20 down to 6.  The loop body
#      appears to be (indentation is lost, so confirm): take the result of
#      cl.fast_closest_pair(cluster_list1), call merge_clusters on the element
#      at index new_pair[1] with the element at index new_pair[2], remove the
#      latter from cluster_list1, run cl.kmeans_clustering(singleton_list2, i, 1),
#      and append (i, compute_distortion(...)) to data1 / data2.
#   4. Plots data1 and data2 with simpleplot.plot_lines under the legends
#      'hierarchical_clustering' and 'kmeans_clustering'.
#
# NOTE(review): cluster_list1.remove(cluster_list1[new_pair[2]]) deletes by
# value (first equal element); since the index is already known,
# `del cluster_list1[new_pair[2]]` would state the intent directly -- confirm
# Cluster equality semantics before changing.
# NOTE(review): cluster_list1 is not re-sorted between merges; if
# cl.fast_closest_pair assumes sorted input, the pair found here may be
# wrong -- confirm against its contract.
for cluster in cluster_list: ans += cluster.cluster_error(datatable) return ans data_table1 = load_data_table(DATA_896_URL) data_table2 = load_data_table(DATA_896_URL) singleton_list1 = [] singleton_list2 = [] for line in data_table1: singleton_list1.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) for line in data_table2: singleton_list2.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) data1 = [] data2 = [] cluster_list1 = cl.hierarchical_clustering(singleton_list1, 21) for i in range(20, 5, -1): new_pair = cl.fast_closest_pair(cluster_list1) cluster_list1[new_pair[1]].merge_clusters(cluster_list1[new_pair[2]]) cluster_list1.remove(cluster_list1[new_pair[2]]) cluster_list2 = cl.kmeans_clustering(singleton_list2, i, 1) data1.append((i, compute_distortion(cluster_list1, data_table1))) data2.append((i, compute_distortion(cluster_list2, data_table2))) simpleplot.plot_lines('Quality Analysis - 896', 800, 600, 'num of clusters', 'Total Error', [data1, data2], False, ['hierarchical_clustering', 'kmeans_clustering'])
import random
import time

import simpleplot
import codeskulptor

# Raise CodeSkulptor's execution time limit; the timing loop below makes
# 199 pairs of closest-pair calls.
codeskulptor.set_timeout(100)

# NOTE(review): alg_cluster and cl are used below but not imported in the
# visible part of the file -- presumably imported earlier; confirm.

#########################################


def gen_random_clusters(num_clusters):
    """
    Generate a list of num_clusters clusters at random positions.

    Every cluster is built with an empty set as its first constructor
    argument and 0 for its last two.  The second and third arguments
    (presumably the horizontal and vertical position -- confirm against
    alg_cluster.Cluster) are each drawn independently from [-1, 1).

    Returns a list of alg_cluster.Cluster objects; the list is empty when
    num_clusters is 0 or negative.
    """
    ans = []
    for _ in range(num_clusters):
        # random.randrange(-1, 1) only ever yields the integers -1 and 0,
        # which collapsed every cluster onto one of four coordinate pairs.
        # random.random() * 2 - 1 gives a float in [-1, 1) instead.
        horiz = random.random() * 2 - 1
        vert = random.random() * 2 - 1
        ans.append(alg_cluster.Cluster(set([]), horiz, vert, 0, 0))
    return ans


# Running-time comparison: for each input size from 2 to 200, time one call
# of cl.slow_closest_pair and one call of cl.fast_closest_pair on the same
# freshly generated cluster list.
data1 = []  # (num clusters, seconds spent in slow_closest_pair)
data2 = []  # (num clusters, seconds spent in fast_closest_pair)
for i in range(2, 201):
    cluster_set = gen_random_clusters(i)
    time0 = time.time()
    cl.slow_closest_pair(cluster_set)
    time1 = time.time()
    cl.fast_closest_pair(cluster_set)
    time2 = time.time()
    data1.append((i, time1 - time0))
    data2.append((i, time2 - time1))

simpleplot.plot_lines('Running Time Analysis', 400, 300,
                      'num of clusters', 'time usage',
                      [data1, data2], False,
                      ['slow_closest_pair', 'fast_closest_pair'])