def test_clustering(data_set, method):
    """
    Measure the distortion produced by a clustering method over a range
    of cluster counts.

    Input: data_set is a tuple (data_table, cluster_list); method is the
    string 'K-Means' to run k-means clustering (10 iterations), any other
    value runs hierarchical clustering.
    Output: a list of distortion values, one per cluster count from 6 to
    20 inclusive, in increasing order of cluster count.
    """
    data_table = data_set[0]
    base_clusters = data_set[1]

    distortion_list = []
    for num_cluster in np.arange(6, 21, 1):
        # every cluster count starts from its own copies of the input clusters
        cluster_list = [cluster.copy() for cluster in base_clusters]

        if method == 'K-Means':
            cluster_list = project.kmeans_clustering(cluster_list, num_cluster, 10)
        else:
            # keep the returned list instead of relying on the call
            # having modified cluster_list in place
            cluster_list = project.hierarchical_clustering(cluster_list, num_cluster)

        distortion_list.append(
            compute_distortion_data_set(data_table, cluster_list))

    return distortion_list
def clustering(algo_used, num_clusters, num_iter=5):
    """
    Cluster the rows of data_table with the selected algorithm.

    data_table is not a parameter; it is read from the enclosing scope.

    Input: algo_used is 1 (sequential), 2 (hierarchical) or 3 (k-means);
    num_clusters is the number of clusters requested; num_iter is only
    passed to k-means clustering.
    Output: the cluster_list returned by the chosen algorithm
    Raises: ValueError if algo_used is not 1, 2 or 3
    """
    # reject unknown selectors up front; previously this fell through to
    # the return statement with cluster_list never assigned
    if algo_used not in (1, 2, 3):
        raise ValueError(
            "algo_used must be 1 (sequential), 2 (hierarchical) or 3 (k-means)")

    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]

    if algo_used == 1:
        cluster_list = sequential_clustering(singleton_list, num_clusters)
        label = "sequential clusters"
    elif algo_used == 2:
        cluster_list = prj3.hierarchical_clustering(singleton_list, num_clusters)
        label = "hierarchical clusters"
    else:
        cluster_list = prj3.kmeans_clustering(singleton_list, num_clusters, num_iter)
        label = "k-means clusters"

    # single-argument print emits the same space-separated text whether
    # print is a statement or a function
    print(" ".join(["Displaying", str(len(cluster_list)), label]))
    return cluster_list
def cluster_data():
    """
    Load the 111-county data table and cluster it two ways.

    Output: a tuple (hierarchical clusters, k-means clusters); both
    clusterings request 9 clusters and k-means is given 5 iterations.
    """
    base_url = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    url_111 = base_url + "data_clustering/unifiedCancerData_111.csv"
    # 290-county table URL; built here but not loaded by this function
    url_290 = base_url + "data_clustering/unifiedCancerData_290.csv"

    county_rows = viz.load_data_table(url_111)

    # one single-county cluster per row of the table
    hierarchical_input = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in county_rows
    ]
    # independent copies for k-means, taken before hierarchical clustering runs
    kmeans_input = [cluster.copy() for cluster in hierarchical_input]

    hierarchical_result = project.hierarchical_clustering(hierarchical_input, 9)
    kmeans_result = project.kmeans_clustering(kmeans_input, 9, 5)
    return (hierarchical_result, kmeans_result)
def run_example():
    """
    Compare the distortion of hierarchical and k-means clustering on the
    data table loaded from DATA_111_URL.

    For every target cluster count from 6 to 20 inclusive, both
    clusterings are computed (k-means with 5 iterations) and their
    distortions recorded. The two curves are then plotted with matplotlib
    and the figure is saved as 'Comparison of distortion (111)'.
    """
    data_table = load_data_table(DATA_111_URL)

    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]

    def compute_distortion(cluster_list):
        """Sum cluster_error(data_table) over every cluster in cluster_list."""
        return sum(clus.cluster_error(data_table) for clus in cluster_list)

    def fresh_singletons():
        """Return independent copies of the singleton clusters."""
        return [cluster.copy() for cluster in singleton_list]

    cluster_counts = range(6, 21)
    hierarchical_distortion = []
    kmeans_distortion = []

    for num_final_clusters in cluster_counts:
        # each call gets its own copies so that a clustering routine which
        # modifies its input cannot affect later runs
        hierarchical_cluster_list = alg_project3_solution.hierarchical_clustering(
            fresh_singletons(), num_final_clusters)
        print(" ".join(["Displaying", str(len(hierarchical_cluster_list)),
                        "hierarchical clusters"]))
        hierarchical_distortion.append(compute_distortion(hierarchical_cluster_list))

        kmeans_cluster_list = alg_project3_solution.kmeans_clustering(
            fresh_singletons(), num_final_clusters, 5)
        print(" ".join(["Displaying", str(len(kmeans_cluster_list)),
                        "k-means clusters"]))
        kmeans_distortion.append(compute_distortion(kmeans_cluster_list))

    plt.plot(cluster_counts, hierarchical_distortion, 'g', lw=2,
             label="hierarchical distortion")
    plt.plot(cluster_counts, kmeans_distortion, 'r', lw=2,
             label="kmeans distortion")
    plt.legend(loc='upper left')
    plt.xlabel('Number of final clusters')
    plt.ylabel('Distortion')
    plt.title('Comparison of distortion between two clustering methods \n based on 111 county data set')
    plt.grid()
    plt.savefig('Comparison of distortion (111)')
def plot_Q5():
    """
    Load the 111-county data table, run hierarchical clustering with 9
    requested clusters, report how many clusters came back and draw them
    with alg_clusters_matplotlib.plot_clusters.
    """
    base_url = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    table_url = base_url + "data_clustering/unifiedCancerData_111.csv"
    county_rows = viz.load_data_table(table_url)

    # one single-county cluster per row of the table
    singletons = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in county_rows
    ]

    clusters = project.hierarchical_clustering(singletons, 9)

    # single-argument print emits the same space-separated text whether
    # print is a statement or a function
    print(" ".join(["Displaying", str(len(clusters)), "hierarchical clusters"]))

    # draw the clusters using matplotlib
    alg_clusters_matplotlib.plot_clusters(county_rows, clusters, False)