def question_one():
    """
    Function for answering first question.

    Times prj3.slow_closest_pairs and prj3.fast_closest_pair on randomly
    generated cluster lists holding 2 to 199 clusters and plots both
    running-time curves on one figure.

    Every measurement uses its own freshly generated cluster list, so the
    two algorithms are not timed on identical inputs.

    Input: NONE
    Output: NONE (displays a matplotlib figure)
    """
    xvals = range(2, 200)

    def _elapsed_ms(closest_pair_func, num):
        """Time one call on a new random list of num clusters, in ms."""
        cluster_list = gen_random_clusters(num)
        initial = time.time()
        closest_pair_func(cluster_list)
        final = time.time()
        # time.time() reports seconds; the y axis is labelled in
        # milliseconds, so convert before plotting.
        return (final - initial) * 1000.0

    # All slow timings are collected before the fast ones, as before.
    slow_yvals = [_elapsed_ms(prj3.slow_closest_pairs, num) for num in xvals]
    fast_yvals = [_elapsed_ms(prj3.fast_closest_pair, num) for num in xvals]

    plt.plot(xvals, slow_yvals, color='r', label="Slow Closest Pair")
    plt.plot(xvals, fast_yvals, color='b', label="Fast Closest Pair")
    plt.legend(loc=2)
    plt.title("Efficiency of Slow and Fast Closest Pairs Algorithms")
    plt.xlabel("Number of Clusters")
    plt.ylabel("Run Times in Milliseconds")
    plt.show()
def test_clustering(data_set, method):
    '''
    Test the distortion of a data_set under clustering method.

    Input: a tuple (data_table, cluster_list) and a clustering algorithm
           name; 'K-Means' selects k-means (10 iterations), any other
           value selects hierarchical clustering
    Output: a list of distortion values, one per cluster count 6..20
    '''
    data_table = data_set[0]
    source_clusters = data_set[1]

    # init list to store distortion
    distortion_list = []

    # Loop over the number of clusters to form
    for num_cluster in np.arange(6, 21, 1):
        # work on a copy so every run starts from the caller's clusters
        cluster_list = [cluster.copy() for cluster in source_clusters]

        if method == 'K-Means':
            cluster_list = project.kmeans_clustering(cluster_list,
                                                     num_cluster, 10)
        else:
            # Use the returned clusters instead of relying on the call
            # mutating its argument in place.
            clustered = project.hierarchical_clustering(cluster_list,
                                                        num_cluster)
            # NOTE(review): keeps the previous in-place behaviour if the
            # implementation returns None -- confirm and simplify.
            if clustered is not None:
                cluster_list = clustered

        # calculate and save distortion
        distortion_list.append(
            compute_distortion_data_set(data_table, cluster_list))

    return distortion_list
def compute_running_times():
    '''
    Computes the running time for slow_closest_pair() and
    fast_closest_pair() for cluster lists holding 2 to 200 clusters.

    Both functions are timed on the same randomly generated list, sorted
    by horiz_center() beforehand. Garbage collection is switched off while
    timing and put back to its previous state before returning.

    Input: NONE
    Output: a pandas df with columns 'slow', 'fast' and 'num clusters'
            (run times in seconds)
    '''
    # time.clock no longer exists on Python 3.8+; prefer perf_counter when
    # the interpreter offers it and fall back to clock otherwise.
    timer = getattr(time, 'perf_counter', None) or time.clock

    # Init cluster sizes
    num_clusters_list = np.arange(2, 201, 1)

    # Init list variables for storing run times
    slow_runtimes = []
    fast_runtimes = []

    # Disable garbage collection for the measurements only
    gc_was_enabled = gc.isenabled()
    gc.disable()
    try:
        for num_clusters in num_clusters_list:
            # Generate and sort a cluster_list of size num_clusters
            clusters_list = gen_random_clusters(num_clusters)
            clusters_list.sort(key=lambda cluster: cluster.horiz_center())

            # Time the slow closest pair
            start_time = timer()
            project.slow_closest_pair(clusters_list)
            slow_runtimes.append(timer() - start_time)

            # Time the fast closest pair
            start_time = timer()
            project.fast_closest_pair(clusters_list)
            fast_runtimes.append(timer() - start_time)
    finally:
        # Do not leave the collector off for the rest of the process
        if gc_was_enabled:
            gc.enable()

    # create df from results
    return pd.DataFrame({
        'slow': slow_runtimes,
        'fast': fast_runtimes,
        'num clusters': num_clusters_list
    })
def clustering(algo_used, num_clusters, num_iter=5):
    """
    Uses specified algorithm to cluster data

    The rows come from data_table, which is not a parameter: it is looked
    up in the enclosing/module scope when the function runs.

    input: algo_used -- 1 for sequential, 2 for hierarchical, 3 for
               k-means clustering
           num_clusters -- number of clusters requested
           num_iter -- iteration count, passed to k-means only
    output: cluster_list
    raises: ValueError if algo_used is not 1, 2 or 3
    """
    # Reject unknown selectors up front; previously they fell through to
    # the return statement with cluster_list unbound.
    if algo_used not in (1, 2, 3):
        raise ValueError(
            "algo_used must be 1 (sequential), 2 (hierarchical) or "
            "3 (k-means), got %r" % (algo_used,))

    # One single-member cluster per data_table row
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2],
                            line[3], line[4])
        for line in data_table
    ]

    if algo_used == 1:
        cluster_list = sequential_clustering(singleton_list, num_clusters)
        label = "sequential clusters"
    elif algo_used == 2:
        cluster_list = prj3.hierarchical_clustering(singleton_list,
                                                    num_clusters)
        label = "hierarchical clusters"
    else:
        cluster_list = prj3.kmeans_clustering(singleton_list, num_clusters,
                                              num_iter)
        label = "k-means clusters"

    # Single-argument print(...) emits the same text under Python 2 and 3
    print("Displaying %d %s" % (len(cluster_list), label))
    return cluster_list
def cluster_data():
    '''
    Load the 111-row data table and cluster it two ways.

    One single-member cluster is built per table row. The hierarchical run
    receives that list; the k-means run receives a copy made before either
    algorithm is called.

    Output: a tuple of two lists of clusters (hierarchical, kmeans), each
            computed for 9 clusters; k-means uses 5 iterations
    '''
    base_url = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    url_111 = base_url + "data_clustering/unifiedCancerData_111.csv"
    # Built exactly as before, although nothing below reads it.
    url_290 = base_url + "data_clustering/unifiedCancerData_290.csv"

    rows = viz.load_data_table(url_111)

    hierarchical_input = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in rows
    ]
    kmeans_input = [cluster.copy() for cluster in hierarchical_input]

    hierarchical_result = project.hierarchical_clustering(
        hierarchical_input, 9)
    kmeans_result = project.kmeans_clustering(kmeans_input, 9, 5)
    return (hierarchical_result, kmeans_result)
def run_example():
    """
    Compare the distortion of hierarchical and k-means clustering on the
    data table loaded from DATA_111_URL.

    For every number of final clusters from 6 to 20, both algorithms are
    run (k-means with 5 iterations), the distortion of each result is
    recorded, and the two curves are plotted and saved to
    'Comparison of distortion (111)'.

    Input: NONE
    Output: NONE (prints progress and writes the figure)
    """
    data_table = load_data_table(DATA_111_URL)

    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2],
                            line[3], line[4])
        for line in data_table
    ]

    def compute_distortion(cluster_list):
        """Sum cluster_error(data_table) over all clusters in the list."""
        return sum(clus.cluster_error(data_table) for clus in cluster_list)

    def fresh_singletons():
        """Copy the singleton clusters so each run has untouched input."""
        return [clus.copy() for clus in singleton_list]

    cluster_counts = range(6, 21)
    hierarchical_distortion = []
    kmeans_distortion = []

    for num_final_clusters in cluster_counts:
        # Each call gets its own copy: reusing one list would let a
        # clustering routine that mutates its input corrupt later runs.
        hierarchical_cluster_list = \
            alg_project3_solution.hierarchical_clustering(
                fresh_singletons(), num_final_clusters)
        print("Displaying %d hierarchical clusters"
              % len(hierarchical_cluster_list))
        hierarchical_distortion.append(
            compute_distortion(hierarchical_cluster_list))

        kmeans_cluster_list = alg_project3_solution.kmeans_clustering(
            fresh_singletons(), num_final_clusters, 5)
        print("Displaying %d k-means clusters" % len(kmeans_cluster_list))
        kmeans_distortion.append(compute_distortion(kmeans_cluster_list))

    plt.plot(cluster_counts, hierarchical_distortion, 'g', lw=2,
             label="hierarchical distortion")
    plt.plot(cluster_counts, kmeans_distortion, 'r', lw=2,
             label="kmeans distortion")
    plt.legend(loc='upper left')
    plt.xlabel('Number of final clusters')
    plt.ylabel('Distortion')
    plt.title('Comparison of distortion between two clustering methods \n based on 111 county data set')
    plt.grid()
    plt.savefig('Comparison of distortion (111)')
def plot_Q6():
    """
    Load the 111-row data table, compute 9 k-means clusters (5 iterations)
    and draw them with alg_clusters_matplotlib.plot_clusters.

    Input: NONE
    Output: NONE (prints the cluster count and draws the plot)
    """
    DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    DATA_111_URL = DIRECTORY + "data_clustering/unifiedCancerData_111.csv"

    data_table = viz.load_data_table(DATA_111_URL)

    # One single-member cluster per data_table row
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2],
                            line[3], line[4])
        for line in data_table
    ]

    cluster_list = project.kmeans_clustering(singleton_list, 9, 5)
    # The message used to say "hierarchical" although k-means produced
    # the clusters.
    print("Displaying %d k-means clusters" % len(cluster_list))

    # draw the clusters using matplotlib
    # NOTE(review): meaning of the trailing False flag is not visible
    # here -- check plot_clusters.
    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False)
"""
Divide and Conquer Method and Clustering
Closest Pairs and Clustering Algorithms Test File
"""
import alg_cluster
import Project_3 as prj3


def _point_cluster(horiz, vert):
    """
    Build the fixture cluster used throughout this script:
    Cluster(set([]), horiz, vert, 1, 0).

    NOTE(review): horiz/vert are presumed to be the cluster position --
    confirm against alg_cluster.Cluster.
    """
    return alg_cluster.Cluster(set([]), horiz, vert, 1, 0)


# Switches selecting which groups of checks are printed
slow_dist = True
fast_dist = True

if slow_dist:
    print("-------Testing Slow closest pairs first-------")

    print("\nTest 1..")
    print(prj3.slow_closest_pairs([_point_cluster(0, 0),
                                   _point_cluster(1, 0)]))
    print("Expected: set([(1.0, 0, 1)])")

    print("\nTest 2..")
    print(prj3.slow_closest_pairs([_point_cluster(0, 0),
                                   _point_cluster(0, 1),
                                   _point_cluster(0, 2)]))
    print("Expected: set([(1.0, 0, 1), (1.0, 1, 2)])")

if fast_dist:
    print("\n" + "-------Testing fast closest pairs-------")

    print("\nTest 1...")
    print(prj3.fast_closest_pair([_point_cluster(0, 0),
                                  _point_cluster(1, 0)]))

    print("\nTest 2...")
    # Twenty clusters at (0, 0), (1, 0), ..., (19, 0)
    print(prj3.fast_closest_pair(
        [_point_cluster(horiz, 0) for horiz in range(20)]))
    print("Expected: one of the tuples in set([(1.0, 9, 10), (1.0, 2, 3), "
          "(1.0, 15, 16), (1.0, 11, 12), (1.0, 13, 14), (1.0, 16, 17), "
          "(1.0, 14, 15), (1.0, 12, 13), (1.0, 4, 5), (1.0, 18, 19), "
          "(1.0, 3, 4), (1.0, 8, 9), (1.0, 17, 18), (1.0, 6, 7), "
          "(1.0, 7, 8), (1.0, 5, 6), (1.0, 10, 11), (1.0, 0, 1), "
          "(1.0, 1, 2)])")

    print("\nTest 3...")
    print(prj3.fast_closest_pair([
        _point_cluster(90.9548590217, -17.089022585),
        _point_cluster(90.2536656675, -70.5911544718),
        _point_cluster(-57.5872347006, 99.7124028905),
        _point_cluster(-15.9338519877, 5.91547495626),
        _point_cluster(19.1869055492, -28.0681513017),
        _point_cluster(-23.0752410653, -42.1353490324),
        _point_cluster(-65.1732261872, 19.675582646),
        _point_cluster(99.7789872101, -11.2619165604),
        _point_cluster(-43.3699854405, -94.7349852817),
        _point_cluster(48.2281912402, -53.3441788034),
    ]))
    print("Expected: one of the tuples in set([(10.5745166749, 0, 7)])")