Example #1
0
def test_clustering(data_set, method):
    '''
    Compute the distortion of a data set for 6 through 20 output clusters.

    Input: data_set is a tuple (data_table, cluster_list); method selects
    the algorithm: the string 'K-Means' runs project.kmeans_clustering with
    10 iterations, any other value runs project.hierarchical_clustering.

    Output: a list of distortion values, one per cluster count from 6 to 20
    in ascending order.
    '''
    distortion_list = []
    for num_cluster in range(6, 21):
        # Cluster fresh copies on every pass so the caller's cluster_list
        # is never handed to the clustering code directly.
        cluster_list = [cluster.copy() for cluster in data_set[1]]
        if method == 'K-Means':
            cluster_list = project.kmeans_clustering(cluster_list, num_cluster,
                                                     10)
        else:
            # Keep the returned list instead of relying on the callee having
            # rewritten its argument in place; otherwise the distortion could
            # be computed on the untouched copies.
            cluster_list = project.hierarchical_clustering(cluster_list,
                                                           num_cluster)
        distortion_list.append(
            compute_distortion_data_set(data_set[0], cluster_list))
    return distortion_list
 def clustering(algo_used, num_clusters, num_iter = 5):
     """
     Cluster the rows of the global data_table with the selected algorithm.

     input: algo_used - 1 for sequential_clustering,
                        2 for prj3.hierarchical_clustering,
                        3 for prj3.kmeans_clustering
            num_clusters - number of clusters requested from the algorithm
            num_iter - iteration count, passed to k-means only
            The rows are read from the module-level data_table; it is not
            a parameter of this function.
     output: cluster_list returned by the chosen algorithm

     Raises ValueError when algo_used is not 1, 2 or 3.
     """
     # Reject an unknown selector before doing any work; previously this
     # case fell through every branch and failed on the final return with
     # an unbound local name.
     if algo_used not in (1, 2, 3):
         raise ValueError("algo_used must be 1, 2 or 3, got %r" % (algo_used,))

     # One single-element cluster per data row, built from its first five
     # fields.
     singleton_list = [
         alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
         for line in data_table
     ]

     # Single-argument print with %-formatting produces the same output
     # under the print statement and the print function.
     if algo_used == 1:
         cluster_list = sequential_clustering(singleton_list, num_clusters)
         print("Displaying %d sequential clusters" % len(cluster_list))
     elif algo_used == 2:
         cluster_list = prj3.hierarchical_clustering(singleton_list, num_clusters)
         print("Displaying %d hierarchical clusters" % len(cluster_list))
     else:
         cluster_list = prj3.kmeans_clustering(singleton_list, num_clusters, num_iter)
         print("Displaying %d k-means clusters" % len(cluster_list))

     return cluster_list
Example #3
0
def cluster_data():
    '''
    Load the unifiedCancerData_111 data table and cluster it twice.

    Builds one single-element cluster per data row, then runs
    project.hierarchical_clustering (9 clusters requested) on that list and
    project.kmeans_clustering (9 clusters requested, 5 iterations) on a
    copy of it.

    Output: a tuple of two lists of clusters (hierarchical, kmeans)
    '''
    DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    DATA_111_URL = DIRECTORY + "data_clustering/unifiedCancerData_111.csv"

    data_table = viz.load_data_table(DATA_111_URL)

    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]

    # Give k-means its own copies so that whatever the hierarchical run
    # does to the clusters it receives cannot leak into the k-means input.
    singleton_list_copy = [cluster.copy() for cluster in singleton_list]

    return (project.hierarchical_clustering(singleton_list, 9),
            project.kmeans_clustering(singleton_list_copy, 9, 5))
def run_example():
    """
    Plot distortion of hierarchical vs. k-means clustering for 6-20 clusters.

    Loads the data table at DATA_111_URL, builds one single-element cluster
    per row, and for every cluster count from 6 to 20 runs both
    alg_project3_solution.hierarchical_clustering and
    alg_project3_solution.kmeans_clustering (5 iterations). The distortion of
    each result is collected and both curves are drawn with matplotlib; the
    figure is handed to plt.savefig under the name
    'Comparison of distortion (111)'.
    """
    data_table = load_data_table(DATA_111_URL)

    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]

    def compute_distortion(cluster_list):
        """Return the sum of cluster_error(data_table) over cluster_list."""
        return sum(clus.cluster_error(data_table) for clus in cluster_list)

    def fresh_singletons():
        """Return independent copies of every singleton cluster."""
        return [cluster.copy() for cluster in singleton_list]

    cluster_counts = list(range(6, 21))
    hierarchical_distortion = []
    kmeans_distortion = []
    for num_final_clusters in cluster_counts:
        # NOTE(review): the clustering functions may modify the list they are
        # handed, so every call gets its own copies and each run starts from
        # the full set of singleton clusters.
        hierarchical_cluster_list = alg_project3_solution.hierarchical_clustering(
            fresh_singletons(), num_final_clusters)
        print("Displaying %d hierarchical clusters" % len(hierarchical_cluster_list))
        hierarchical_distortion.append(compute_distortion(hierarchical_cluster_list))

        kmeans_cluster_list = alg_project3_solution.kmeans_clustering(
            fresh_singletons(), num_final_clusters, 5)
        print("Displaying %d k-means clusters" % len(kmeans_cluster_list))
        kmeans_distortion.append(compute_distortion(kmeans_cluster_list))

    plt.plot(cluster_counts, hierarchical_distortion, 'g', lw = 2, label = "hierarchical distortion")
    plt.plot(cluster_counts, kmeans_distortion, 'r', lw = 2, label = "kmeans distortion")
    plt.legend(loc = 'upper left')
    plt.xlabel('Number of final clusters')
    plt.ylabel('Distortion')
    plt.title('Comparison of distortion between two clustering methods \n based on 111 county data set')
    plt.grid()
    plt.savefig('Comparison of distortion (111)')
Example #5
0
def plot_Q6():
    """
    Plot k-means clusters of the unifiedCancerData_111 data table.

    Loads the table, builds one single-element cluster per row, runs
    project.kmeans_clustering on them (9 clusters requested, 5 iterations),
    prints how many clusters came back, and draws them with
    alg_clusters_matplotlib.plot_clusters.
    """
    DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    DATA_111_URL = DIRECTORY + "data_clustering/unifiedCancerData_111.csv"

    data_table = viz.load_data_table(DATA_111_URL)

    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]

    cluster_list = project.kmeans_clustering(singleton_list, 9, 5)
    # The clusters come from k-means, so the message says so (it used to
    # read "hierarchical clusters").
    print("Displaying %d k-means clusters" % len(cluster_list))

    # draw the clusters using matplotlib
    # NOTE(review): the meaning of the third argument (False) is defined by
    # plot_clusters, which is not visible here; confirm against that module.
    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False)