Exemple #1
0
def run_example():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    data_table = load_data_table(DATA_111_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))
        
    # cluster_list = sequential_clustering(singleton_list, 15)	
    # print "Displaying", len(cluster_list), "sequential clusters"

    cluster_list = project.hierarchical_clustering(singleton_list, 20)
    print "Displaying", len(cluster_list), "hierarchical clusters"

    cluster_list = project.kmeans_clustering(singleton_list, 16, 5)	
    # print "Displaying", len(cluster_list), "k-means clusters"

            
    # draw the clusters using matplotlib or simplegui
    # if DESKTOP:
    #     alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)
    #     #alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)  #add cluster centers
    # else:
    #     alg_clusters_simplegui.PlotClusters(data_table, cluster_list)   # use toggle in GUI to add cluster centers
    
    print "cluster:", cluster_list
def run_example():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    data_table = load_data_table(DATA_111_URL)
    
    singleton_list = []
    for line in data_table:
        singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))
        
    #cluster_list = sequential_clustering(singleton_list, 15)	
    #print "Displaying", len(cluster_list), "sequential clusters"

    cluster_list = project3.hierarchical_clustering(singleton_list, 9)
    print "Displaying", len(cluster_list), "hierarchical clusters"

    #cluster_list = project3.kmeans_clustering(singleton_list, 9, 5)	
    #print "Displaying", len(cluster_list), "k-means clusters"

            
    # draw the clusters using matplotlib or simplegui
    if DESKTOP:
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)
    else:
        alg_clusters_simplegui.PlotClusters(data_table, cluster_list)
def gen_all_distortions(data_table, cluster_list, clustering_type, min_size,
                        max_size):
    """
    Returns a list of the distortions of cluster_list for
    cluster outputs of min_size to max_size (both inclusive).

    data_table: either a 111, 290, or 896 county data set
    cluster_list: the list of clusters every clustering run starts from;
        it is deep-copied for each size and never handed to the
        clustering functions directly
    clustering_type: either "hierarchical" or "kmeans"
    min_size: the minimum number of desired cluster outputs
    max_size: the maximum number of desired cluster outputs

    The returned list is ordered by size, min_size first.  It is empty
    when min_size > max_size.

    Raises ValueError when a clustering has to be computed and
    clustering_type is neither "hierarchical" nor "kmeans".
    """
    all_distortions = []
    for size in range(min_size, max_size + 1):
        # Work on a deep copy since hierarchical clustering mutates its
        # cluster_list input.
        copy_list = deepcopy(cluster_list)
        # all() is required here: asserting on a bare generator expression
        # is always true because the generator object itself is truthy.
        assert all(dup.fips_codes() == src.fips_codes()
                   for dup, src in zip(copy_list, cluster_list)), \
            "copy_list != cluster_list"

        # Compute hierarchical or kmeans clustering.  No post-condition is
        # asserted on copy_list: whether the clustering function returns
        # its (mutated) input list is an implementation detail.
        if clustering_type == "hierarchical":
            clustering = project.hierarchical_clustering(copy_list, size)
        elif clustering_type == "kmeans":
            clustering = project.kmeans_clustering(copy_list, size, 5)
        else:
            raise ValueError("unknown clustering_type: %r" % (clustering_type,))

        # Compute distortion and append to all_distortions list.
        all_distortions.append(compute_distortion(clustering, data_table))
    return all_distortions
def test_hierarchical24():
    """
    Test for hierarchical clustering
    Note that hierarchical_clustering mutates cluster_list
    """
    
    # load small data table
    print
    print "Testing hierarchical_clustering on 24 county set"
    data_24_table = load_data_table(DATA_24_URL)
    
    
    # test data of the form [size of output cluster, sets of county tuples]
    hierdata_24 = [[23, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36061',), ('36005',), ('36047',), ('36059',), ('36081',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [22, set([('11001', '51013'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36061',), ('36005',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [21, set([('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [20, set([('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34039',), ('34013', '34017'), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [19, set([('34013', '34017', '34039'), ('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [18, set([('34013', '34017', '34039'), ('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('36005', '36047', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [17, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('34013', '34017', '34039', '36005', '36047', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [16, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [15, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [14, set([('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013')])],
                   [13, set([('06037', '06059'), ('01073',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013')])],
                   [12, set([('06037', '06059'), ('01073',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013', '51840')])],
                   [11, set([('06029', '06037', '06059'), ('01073',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013', '51840')])],
                   [10, set([('06029', '06037', '06059'), ('01073',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '51013', '51760', '51840')])],
                   [9, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '51013', '51760', '51840')])],
                   [8, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840')])],
                   [7, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('41051', '41067'), ('55079',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])],
                   [6, set([('06029', '06037', '06059', '06071', '06075'), ('01073',), ('08031',), ('41051', '41067'), ('55079',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])],
                   [5, set([('06029', '06037', '06059', '06071', '06075'), ('08031',), ('41051', '41067'), ('01073', '55079'), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])],
                   [4, set([('06029', '06037', '06059', '06071', '06075'), ('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('08031',), ('41051', '41067')])],
                   [3, set([('06029', '06037', '06059', '06071', '06075', '41051', '41067'), ('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('08031',)])],
                   [2, set([('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('06029', '06037', '06059', '06071', '06075', '08031', '41051', '41067')])],
                   ]

        
    suite = poc_simpletest.TestSuite()
    
    for num_clusters, expected_county_tuple in hierdata_24:
        
        # build initial list of clusters for each test since mutation is allowed
        cluster_list = []
        for idx in range(len(data_24_table)):
            line = data_24_table[idx]
            cluster_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

        # compute student answer
        student_clustering = student.hierarchical_clustering(cluster_list, num_clusters)
        student_county_tuple = set_of_county_tuples(student_clustering)
        
        # Prepare test
        error_message = "Testing hierarchical_clustering on 24 county table, num_clusters = " + str(num_clusters)
        error_message += "\nStudent county tuples: " + str(student_county_tuple)
        error_message += "\nExpected county tuples: " + str(expected_county_tuple)
        suite.run_test(student_county_tuple == expected_county_tuple, True, error_message)

    suite.report_results()
Exemple #5
0
def test_hierarchical24():
    """
    Test for hierarchical clustering
    Note that hierarchical_clustering mutates cluster_list
    """

    # load small data table
    print
    print "Testing hierarchical_clustering on 24 county set"
    data_24_table = load_data_table(DATA_24_URL)


    # test data of the form [size of output cluster, sets of county tuples]
    hierdata_24 = [[23, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36061',), ('36005',), ('36047',), ('36059',), ('36081',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [22, set([('11001', '51013'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36061',), ('36005',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [21, set([('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [20, set([('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34039',), ('34013', '34017'), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [19, set([('34013', '34017', '34039'), ('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [18, set([('34013', '34017', '34039'), ('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('36005', '36047', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [17, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('34013', '34017', '34039', '36005', '36047', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [16, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [15, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [14, set([('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013')])],
                   [13, set([('06037', '06059'), ('01073',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013')])],
                   [12, set([('06037', '06059'), ('01073',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013', '51840')])],
                   [11, set([('06029', '06037', '06059'), ('01073',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013', '51840')])],
                   [10, set([('06029', '06037', '06059'), ('01073',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '51013', '51760', '51840')])],
                   [9, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '51013', '51760', '51840')])],
                   [8, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840')])],
                   [7, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('41051', '41067'), ('55079',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])],
                   [6, set([('06029', '06037', '06059', '06071', '06075'), ('01073',), ('08031',), ('41051', '41067'), ('55079',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])],
                   [5, set([('06029', '06037', '06059', '06071', '06075'), ('08031',), ('41051', '41067'), ('01073', '55079'), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])],
                   [4, set([('06029', '06037', '06059', '06071', '06075'), ('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('08031',), ('41051', '41067')])],
                   [3, set([('06029', '06037', '06059', '06071', '06075', '41051', '41067'), ('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('08031',)])],
                   [2, set([('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('06029', '06037', '06059', '06071', '06075', '08031', '41051', '41067')])],
                   ]


    suite = poc_simpletest.TestSuite()

    for num_clusters, expected_county_tuple in hierdata_24:

        # build initial list of clusters for each test since mutation is allowed
        cluster_list = []
        for idx in range(len(data_24_table)):
            line = data_24_table[idx]
            cluster_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

        # compute student answer
        student_clustering = student.hierarchical_clustering(cluster_list, num_clusters)
        student_county_tuple = set_of_county_tuples(student_clustering)

        # Prepare test
        error_message = "Testing hierarchical_clustering on 24 county table, num_clusters = " + str(num_clusters)
        error_message += "\nStudent county tuples: " + str(student_county_tuple)
        error_message += "\nExpected county tuples: " + str(expected_county_tuple)
        suite.run_test(student_county_tuple == expected_county_tuple, True, error_message)

    suite.report_results()
def run_example(table, method):
    """
    Compute distortions for 20 down to 6 clusters with one clustering method.

    table: argument passed to load_data_table to obtain the data table
    method: 'h_cluster' for hierarchical clustering or 'k_cluster' for
        k-means clustering (5 iterations); any other value computes nothing

    Hierarchical runs are chained: each run starts from the cluster list
    returned by the previous, larger run.  K-means runs always start from
    the full singleton list.

    When DESKTOP is true, returns a dict mapping cluster count to
    distortion.  Otherwise the last computed cluster list is drawn with
    simplegui and nothing is returned.
    """
    data_table = load_data_table(table)

    singleton_list = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]

    cluster_distortion_dict = {}

    # Input for the next hierarchical run; starts as a shallow copy of
    # the singleton list.
    hierarchical_input = list(singleton_list)

    for num_clusters in range(20, 5, -1):
        if method == 'h_cluster':
            cluster_list = alg_project3_solution.hierarchical_clustering(hierarchical_input, num_clusters)
            hierarchical_input = cluster_list
        elif method == 'k_cluster':
            cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, num_clusters, 5)
        else:
            continue
        cluster_distortion_dict[num_clusters] = compute_distortion(cluster_list, data_table)

    if DESKTOP:
        return cluster_distortion_dict

    # use toggle in GUI to add cluster centers
    alg_clusters_simplegui.PlotClusters(data_table, cluster_list)
def question_10():
    """
    Plot distortion against number of clusters for k-means and
    hierarchical clustering of the 111 data table.

    Cluster counts run from 6 to 20.  K-means (5 iterations) is run on the
    singleton list once per count.  Hierarchical clustering is chained from
    20 clusters down to 6, each run starting from the previous result, and
    its distortions are then put into ascending cluster-count order.  Both
    curves are drawn on one figure and shown on screen.
    """

    DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    DATA_896_URL = DIRECTORY + "data_clustering/unifiedCancerData_896.csv"
    DATA_290_URL = DIRECTORY + "data_clustering/unifiedCancerData_290.csv"
    DATA_111_URL = DIRECTORY + "data_clustering/unifiedCancerData_111.csv"
    # Only the 111 table is used below.
    data_table = cluster_visual.load_data_table(DATA_111_URL)

    singleton_list = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]

    num_clusters = range(6, 20 + 1)

    # K-means distortions, in ascending cluster-count order.
    distortion_k_means = [
        compute_distortion(project3.kmeans_clustering(singleton_list, size, 5), data_table)
        for size in num_clusters
    ]

    # Hierarchical distortions are gathered from 20 clusters down to 6
    # because each run feeds the next, then flipped to match num_clusters.
    distortion_hier = []
    hier_clusters = singleton_list
    for size in reversed(num_clusters):
        hier_clusters = project3.hierarchical_clustering(hier_clusters, size)
        distortion_hier.append(compute_distortion(hier_clusters, data_table))
    distortion_hier.reverse()

    # New 8x8 figure at 80 dpi holding a single subplot.
    plt.figure(figsize=(8,8), dpi=80)
    plt.subplot(1,1,1)

    plt.xlabel("Number of Clusters")
    plt.ylabel("Distortion")
    plt.title("Quality comparison - Two Clustering Methods - 111 Data")

    # K-means curve: solid blue line of width 2.
    plt.plot(num_clusters, distortion_k_means, color="blue", linewidth=2.0, linestyle="-", label="k-means")

    # Hierarchical curve: solid green line of width 2.
    plt.plot(num_clusters, distortion_hier, color="green", linewidth=2.0, linestyle="-", label="hierarchical")

    # Legend in the upper right corner, without a frame.
    plt.legend(loc='upper right', frameon=False)

    # Show the result on screen.
    plt.show()
def question2_plot():
    """
    Generate the plot for question 2.

    Loads the table at DATA_3108_URL, builds one singleton Cluster per row,
    reduces them to 15 clusters with hierarchical_clustering and passes the
    result to plot_clusters.
    """
    data_table = load_data_table(DATA_3108_URL)

    singleton_list = [
        Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]

    plot_clusters(data_table, hierarchical_clustering(singleton_list, 15), True)
def question_ten():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    #data_table = load_data_table(DATA_3108_URL)
    #data_table = load_data_table(DATA_896_URL)
    #data_table = load_data_table(DATA_290_URL)
    data_table = load_data_table(DATA_111_URL)
    
    singleton_list = []
    for line in data_table:
        singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

    #original_dict = create_dictionary(data_table)

    hierarchical_distortion_list = dict([])
    for num_clusters in range(6, 21):
        print "hierarchical: num_cluster=", num_clusters
        cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, num_clusters)
        distortion = compute_distortion(cluster_list, data_table)
        hierarchical_distortion_list[num_clusters] = distortion
        

    print "About to display ...."

    #cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 16)
    #cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 9)

    #print "Displaying", len(cluster_list), "hierarchical clusters"

    cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 9, 5)	
    #print "Displaying", len(cluster_list), "k-means clusters"


    # compute the distortion
    if (True):
        distortion = compute_distortion(cluster_list, data_table)
        print distortion
     
        

    # draw the clusters using matplotlib or simplegui
    if (False):    
        if DESKTOP:
            #alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False)
            alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)  #add cluster centers
        else:
            alg_clusters_simplegui.PlotClusters(data_table, cluster_list)   # use toggle in GUI to add cluster centers
def question7():

    data_table = alg_project3_viz.load_data_table(DATA_111_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

    clusters_hierarchical = project3.hierarchical_clustering(singleton_list, 9)
    clusters_kmeans = project3.kmeans_clustering(singleton_list, 9, 5)

    distortion_hierarchical = compute_distortion(clusters_hierarchical, data_table)
    distortion_kmeans = compute_distortion(clusters_kmeans, data_table)

    print "distortion hierarchical: ", distortion_hierarchical
    print "distortion k-means: ", distortion_kmeans
def compute_distortions():
    """
    Compute k-means and hierarchical distortions for three data tables.

    The tables are loaded from DATA_111_URL, DATA_290_URL and DATA_896_URL
    (in that order).  For each table and each cluster count from 6 to 20,
    k-means (5 iterations) and then hierarchical clustering are run on the
    same freshly built singleton list.

    Returns a numpy array of shape (3, 15, 2) indexed by
    [table, cluster count - 6, method], where method 0 is k-means and
    method 1 is hierarchical.
    """
    data_tables = [load_data_table(url)
                   for url in (DATA_111_URL, DATA_290_URL, DATA_896_URL)]

    distortions = np.zeros((3, 15, 2))
    for table_idx, table in enumerate(data_tables):
        for size_idx in range(15):
            singletons = [
                alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
                for row in table
            ]
            nclust = size_idx + 6

            # Slot 0: k-means distortion.
            kmeans_result = alg_project3_solution.kmeans_clustering(singletons, nclust, 5)
            distortions[table_idx, size_idx, 0] = compute_distortion(kmeans_result, table)

            # Slot 1: hierarchical distortion.
            hier_result = alg_project3_solution.hierarchical_clustering(singletons, nclust)
            distortions[table_idx, size_idx, 1] = compute_distortion(hier_result, table)

    return distortions
def distortion_of_clustering():
    """
    Plot distortion against cluster count for hierarchical and k-means
    clustering of the table at DATA_896_URL.

    For each cluster count from 6 to 20, each algorithm is run on its own
    freshly built list of singleton clusters (k-means with 5 iterations)
    and the resulting distortion is recorded.  Both curves are then drawn
    on one figure through plt and shown.
    """
    data_table = load_data_table(DATA_896_URL)

    def make_singletons():
        """Return a new list with one single-row cluster per table row."""
        return [alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
                for row in data_table]

    cluster_counts = range(6, 21)
    hierarchical_distortion_list = []
    kmeans_distortion_list = []

    for num_cluster in cluster_counts:
        hierarchical_cluster_list = project3.hierarchical_clustering(
            make_singletons(), num_cluster)
        hierarchical_distortion_list.append(
            compute_distortion(hierarchical_cluster_list, data_table))

        kmeans_cluster_list = project3.kmeans_clustering(
            make_singletons(), num_cluster, 5)
        kmeans_distortion_list.append(
            compute_distortion(kmeans_cluster_list, data_table))

    # Hierarchical in red, k-means in blue, sharing one set of axes.
    plt.figure()
    plt.hold(True)
    plt.plot(cluster_counts, hierarchical_distortion_list, 'r', label=' hierarchical')
    plt.plot(cluster_counts, kmeans_distortion_list, 'b', label='kmeans')
    plt.legend(loc='upper right')
    plt.title('Quality Comparision DataSet=896')
    plt.xlabel('Num_clusters')
    plt.ylabel('Distortion')
    plt.hold(False)
    plt.show()
def distortion_of_clustering():
    """
    Load a data table, compute a list of clusters and
    plot a list of clusters

    """
    data_table = load_data_table(DATA_111_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(alg_cluster.Cluster(set([line[0]]), \
            line[1], line[2], line[3], line[4]))
    hierarchical_cluster_list = project3.hierarchical_clustering(
        singleton_list, 9)
    print "hierarchical", compute_distortion(hierarchical_cluster_list,
                                             data_table)
    for line in data_table:
        singleton_list.append(alg_cluster.Cluster(set([line[0]]), \
        line[1], line[2], line[3], line[4]))
    kmeans_cluster_list = project3.kmeans_clustering(singleton_list, 9, 5)
    print "kmeans", compute_distortion(kmeans_cluster_list, data_table)
Exemple #14
0
def run_example():
    """
    Load a data table, compute a list of clusters and
    plot a list of clusters

    """
    data_table = load_data_table(DATA_111_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(alg_cluster.Cluster(set([line[0]]), \
            line[1], line[2], line[3], line[4]))

    cluster_list = project3.hierarchical_clustering(singleton_list, 9)
    print "Displaying", len(cluster_list), "hierarchical clusters"

    #cluster_list = project3.kmeans_clustering(singleton_list, 15, 5)
    #print "Displaying", len(cluster_list), "k-means clusters"

    # draw the clusters using matplotlib or simplegui
    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)
def question_7():
    '''
    Write a function compute_distortion(cluster_list) that takes a list of 
    clusters and uses cluster_error to compute its distortion. Now, use 
    compute_distortion to compute the distortions of the two clusterings in 
    questions 5 and 6. Enter the values for the distortions (with at least 
    four significant digits) for these two clusterings in the box below. 
    Clearly indicate the clusterings to which each value corresponds.

    As a check on the correctness of your code, the distortions associated with 
    the 16 output clusters produced by hierarchical clustering and k-means 
    clustering (with 5 iterations) on the 290 county data set are approximately 
    2.575×1011 and 2.323×1011, respectively.
    '''

    data_table = load_data_table(DATA_111_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))

    cluster_list_h = project3.hierarchical_clustering(singleton_list, 9)
    print "Distortion of", len(cluster_list_h), "hierarchical clusters"
    print compute_distortion(cluster_list_h, data_table)

    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))

    cluster_list_k = project3.kmeans_clustering(singleton_list, 9, 5)
    print "Distortion of", len(cluster_list_k), "k-means clusters"
    print compute_distortion(cluster_list_k, data_table)

    return
def gen_cluster_list(data_table,
                     clustering_type,
                     num_clusters,
                     num_iterations=None):
    """
    Generate and return a list of clusters from a data table.

    data_table: iterable of rows; the first five fields of every row are
        passed, in order, to alg_cluster.Cluster (the first one wrapped
        in a one-element set).
    clustering_type: "hierarchical" or "kmeans".
    num_clusters: number of clusters requested from the algorithm.
    num_iterations: forwarded to kmeans_clustering; not used for
        hierarchical clustering.

    Returns whatever the selected project clustering function returns.

    Raises ValueError if clustering_type is neither "hierarchical" nor
    "kmeans" (previously this surfaced as an UnboundLocalError).
    """
    # Reject unknown types before doing any work.
    if clustering_type not in ("hierarchical", "kmeans"):
        raise ValueError(
            "unknown clustering_type %r; expected 'hierarchical' or 'kmeans'"
            % (clustering_type,))

    # Create one singleton cluster per row of data_table.
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                            line[4])
        for line in data_table
    ]

    if clustering_type == "hierarchical":
        return project.hierarchical_clustering(singleton_list, num_clusters)
    return project.kmeans_clustering(singleton_list, num_clusters,
                                     num_iterations)
def question10():
    """
    Plot hierarchical versus k-means distortion for three data sets.

    Loads the tables behind DATA_111_URL, DATA_290_URL and DATA_896_URL,
    converts each one with compute_data_table (presumably into the list of
    clusters the clustering functions expect -- confirm), and for every
    cluster count from 6 to 20 records the distortion of hierarchical
    clustering and of k-means clustering (5 iterations). One figure per
    data set is then shown, with the hierarchical curve in blue and the
    k-means curve in red.
    """
    urls = (DATA_111_URL, DATA_290_URL, DATA_896_URL)
    titles = ('Distortion for 111 points',
              'Distortion for 290 points',
              'Distortion for 896 points')

    tables = [alg_project3_viz.load_data_table(url) for url in urls]
    cluster_inputs = [compute_data_table(table) for table in tables]

    clusters = range(6, 21)

    # One list of y values per data set, in the same order as `tables`.
    hier_curves = [[] for _ in tables]
    kmeans_curves = [[] for _ in tables]

    for count in clusters:
        for pos, table in enumerate(tables):
            # y point for hierarchical clustering on this data set
            hier = project3.hierarchical_clustering(cluster_inputs[pos],
                                                    count)
            hier_curves[pos].append(compute_distortion(hier, table))

            # y point for k-means clustering on this data set
            kmeans = project3.kmeans_clustering(cluster_inputs[pos], count, 5)
            kmeans_curves[pos].append(compute_distortion(kmeans, table))

    # One figure per data set; plt.show() is called once per figure.
    for title, hier_y, kmeans_y in zip(titles, hier_curves, kmeans_curves):
        plt.plot(clusters, hier_y, '-b', label='hierarchical')
        plt.plot(clusters, kmeans_y, '-r', label='k-means')
        plt.title(title)
        plt.legend(loc='upper right')
        plt.xlabel('Number of clusters')
        plt.ylabel('Distortion')
        plt.show()
Exemple #18
0
def run_example(table, method):
    """
    Compute the clustering distortion for every cluster count from 20
    down to 6 (inclusive) on one data table.

    table: argument passed straight to load_data_table.
    method: 'h_cluster' for hierarchical clustering, where each result is
        fed back in as the input of the next, smaller clustering, or
        'k_cluster' for k-means clustering (5 iterations), which is always
        run on the full list of singleton clusters.

    Returns a dict mapping each cluster count to the value returned by
    compute_distortion for that clustering.

    When DESKTOP is false, the last clustering computed (6 clusters) is
    additionally displayed with alg_clusters_simplegui.PlotClusters.

    Raises ValueError if method is not one of the two supported names.
    """
    # Fail fast, before any data is loaded: an unknown method used to
    # yield an empty result or an unbound cluster_list further down.
    if method not in ('h_cluster', 'k_cluster'):
        raise ValueError(
            "unknown method %r; expected 'h_cluster' or 'k_cluster'"
            % (method,))

    data_table = load_data_table(table)

    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                            line[4])
        for line in data_table
    ]

    cluster_distortion_dict = {}
    start = 20
    end = 6

    # Working list for the hierarchical case: it is replaced by each
    # result so the next (smaller) clustering continues from it.
    new_list = list(singleton_list)

    for count in range(start, end - 1, -1):
        if method == 'h_cluster':
            cluster_list = alg_project3_solution.hierarchical_clustering(
                new_list, count)
            new_list = cluster_list
        else:
            cluster_list = alg_project3_solution.kmeans_clustering(
                singleton_list, count, 5)

        cluster_distortion_dict[count] = compute_distortion(
            cluster_list, data_table)

    if not DESKTOP:
        alg_clusters_simplegui.PlotClusters(
            data_table,
            cluster_list)  # use toggle in GUI to add cluster centers

    # Hand the distortions back in both display modes.
    return cluster_distortion_dict
    print "Loaded", len(data_lines), "data points"
    data_tokens = [line.split(',') for line in data_lines]
    return [[tokens[0], float(tokens[1]), float(tokens[2]), int(tokens[3]), float(tokens[4])] 
            for tokens in data_tokens]
def compute_distortion(cluster_list, data_table):
    """
    Return the sum of cluster.cluster_error(data_table) over every
    cluster in cluster_list (0 for an empty list).
    """
    total = 0
    for cluster in cluster_list:
        total += cluster.cluster_error(data_table)
    return total
    

# Compare the distortion of hierarchical and k-means clustering on the
# DATA_896_URL data set for every output cluster count from 6 to 20.
data_table = load_data_table(DATA_896_URL)
singleton_list = [
    alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
    for line in data_table
]
cluster_counts = list(range(6, 21))

# Run k-means first, while singleton_list is still untouched: the
# hierarchical pass below feeds each result back in as the next input,
# and hierarchical_clustering may modify the list it is given
# (NOTE(review): confirm against project3).
distortion_k = []
for count in cluster_counts:
    kmeans_clusters = project3.kmeans_clustering(singleton_list, count, 5)
    distortion_k.append(compute_distortion(kmeans_clusters, data_table))

# Hierarchical clustering from the largest count down, reusing the
# clustering of size k + 1 to obtain the one of size k instead of
# restarting from the singletons for each of the 15 sizes.
distortion_h = []
cluster_list = singleton_list
for count in reversed(cluster_counts):
    cluster_list = project3.hierarchical_clustering(cluster_list, count)
    distortion_h.append(compute_distortion(cluster_list, data_table))
# Values were collected for 20..6; flip them to line up with cluster_counts.
distortion_h.reverse()

plt.plot(cluster_counts, distortion_h, marker="o", color="red")
plt.plot(cluster_counts, distortion_k, marker="*", color="blue")
plt.xlabel("number of output clusters")
plt.ylabel("Distortion")
plt.title("Comparison of distortion of alg (DATA_896_URL)")
# Legend entries follow the order of the plot calls above.
plt.legend(["hierarchical_clustering", "k-means_clustering"], loc="upper left")
Exemple #20
0
def question_10():
    """
    Plot the distortion of k-means and hierarchical clustering against the
    number of output clusters (6 to 20 inclusive) for the 111 data set.

    k-means (5 iterations) is run on the singleton clusters for every
    cluster count. Hierarchical clustering is run from 20 clusters down to
    6, each step starting from the previous step's result.
    """

    base_url = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    url_111 = base_url + "data_clustering/unifiedCancerData_111.csv"
    # data file used for this comparison
    data_table = cluster_visual.load_data_table(url_111)

    singleton_list = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]

    num_clusters = range(6, 20 + 1)

    # distortion values for k-means clustering, in increasing cluster count
    distortion_k_means = [
        compute_distortion(
            project3.kmeans_clustering(singleton_list, count, 5), data_table)
        for count in num_clusters
    ]

    # distortion values for hierarchical clustering; computed from 20 down
    # to 6, so each value is put at the front to end up in increasing order
    distortion_hier = []
    current = singleton_list
    for count in range(20, 5, -1):
        current = project3.hierarchical_clustering(current, count)
        distortion_hier.insert(0, compute_distortion(current, data_table))

    # plot the results on a single 8x8 figure at 80 dpi
    plt.figure(figsize=(8, 8), dpi=80)
    plt.subplot(1, 1, 1)  # one row, one column, first cell

    plt.xlabel("Number of Clusters")
    plt.ylabel("Distortion")
    plt.title("Quality comparison - Two Clustering Methods - 111 Data")

    # both curves are solid lines of width 2
    shared_style = dict(linewidth=2.0, linestyle="-")
    plt.plot(num_clusters, distortion_k_means, color="blue", label="k-means",
             **shared_style)
    plt.plot(num_clusters, distortion_hier, color="green",
             label="hierarchical", **shared_style)

    plt.legend(loc='upper right', frameon=False)
    plt.show()
def question_10(data_set):
    '''
    Compute the distortion of the list of clusters produced by hierarchical 
    clustering and k-means clustering (using 5 iterations) on the 111, 290, 
    and 896 county data sets, respectively, where the number of output clusters 
    ranges from 6 to 20 (inclusive).Important note:To compute the distortion 
    for all 15 output clusterings produced by hierarchical_clustering, you 
    should remember that you can use the hierarchical cluster of size 20 to 
    compute the hierarchical clustering of size 19 and so on. Otherwise, you 
    will introduce an unnecessary factor of 15 into the computation of the 15 
    hierarchical clusterings.

    Once you have computed these distortions for both clustering methods, create 
    three separate plots (one for each data set) that compare the distortion of the 
    clusterings produced by both methods. Each plot should include two curves drawn 
    as line plots. The horizontal axis for each plot should indicate the number of 
    output clusters while the vertical axis should indicate the distortion 
    associated with each output clustering. For each plot, include a title that 
    indicates the data set used in creating the plots and a legend that 
    distinguishes the two curves.
    
    Takes a data set of either 3108, 896, 290, or 111 points
    '''

    xvals = xrange(20, 5, -1)
    kmeans_y = []
    hierarchical_y = []

    # load data by county
    data_urls = {
        3108: DATA_3108_URL,
        896: DATA_896_URL,
        290: DATA_290_URL,
        111: DATA_111_URL
    }
    data_table = load_data_table(data_urls[data_set])

    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))

    # compute k-means cluster distortion
    for num_clusters in xvals:
        print "Computing", num_clusters, "k-means clusters"
        kmeans = project3.kmeans_clustering(singleton_list, num_clusters, 5)
        kmeans_y.append(compute_distortion(kmeans, data_table))

    # compute hierarchical cluster distortion
    hierarchical = singleton_list
    for num_clusters in xvals:
        print "Computing", num_clusters, "hierarchical clusters"
        hierarchical = project3.hierarchical_clustering(
            hierarchical, num_clusters)
        hierarchical_y.append(compute_distortion(hierarchical, data_table))

    # plot results
    plt.plot(xvals, kmeans_y, color='r', label="K-Means Clustering")
    plt.plot(xvals, hierarchical_y, color='b', label="Hierarchical Clustering")
    plt.legend()
    plt.title("Distortion Comparison Between Clustering Methods on " +
              str(data_set) + " County Data Set")
    plt.xlabel("Number of Output Clusters")
    plt.ylabel("Distortion")
    plt.show()

    return


#question_1()
#question_7()
#question_10(111)
#question_10(290)
#question_10(896)
#question_10(3108)