Esempio n. 1
0
def run_question_10():
    """Plot distortion against cluster count for three cancer data sets.

    For each of the 111, 290 and 896 tables, runs hierarchical clustering and
    k-means (third argument 5) for every cluster count from 6 to 20, divides
    each distortion by 10**11, and shows one figure per data set.
    """
    tables = [
        pro3_viz.load_data_table(url)
        for url in (DATA_111_URL, DATA_290_URL, DATA_896_URL)
    ]

    for table in tables:
        points = clusterize_data(table)
        counts, hier_curve, kmeans_curve = [], [], []

        # NOTE(review): the same `points` list is handed to every clustering
        # call; this presumes neither routine modifies its input - confirm.
        for count in range(6, 21):
            by_hierarchy = pro3.hierarchical_clustering(points, count)
            by_kmeans = pro3.kmeans_clustering(points, count, 5)
            counts.append(count)
            # compute_distortion is given the raw table first, then clusters.
            hier_curve.append(
                compute_distortion(table, by_hierarchy) / 10**11)
            kmeans_curve.append(
                compute_distortion(table, by_kmeans) / 10**11)

        plt.plot(counts, hier_curve, label='hierarchical clustering')
        plt.plot(counts, kmeans_curve,
                 label='k-means clustering (5 iterations)')
        # Fixed viewport: x spans the cluster counts, y spans 0..25.
        plt.axis([6, 20, 0, 25])
        plt.title("Distortion of clustering methods for " +
                  str(len(points)) + " points")
        plt.xlabel("Number of clusters")
        plt.ylabel("Distortion (10^11)")
        plt.legend()
        plt.show()
def q6():
    """Run k-means (9 clusters, third argument 5) on the 111 table and plot it."""
    table = viz.load_data_table(viz.DATA_111_URL)
    # One single-entry cluster per table row.
    singletons = [
        alg_cluster.Cluster({row[0]}, row[1], row[2], row[3], row[4])
        for row in table
    ]
    clusters = alg_project3.kmeans_clustering(singletons, 9, 5)
    alg_clusters_matplotlib.plot_clusters(table, clusters, True)
def q2():
    """Run hierarchical clustering (15 clusters) on the 3108 table and plot it."""
    table = viz.load_data_table(viz.DATA_3108_URL)
    # One single-entry cluster per table row.
    singletons = [
        alg_cluster.Cluster({row[0]}, row[1], row[2], row[3], row[4])
        for row in table
    ]
    clusters = alg_project3.hierarchical_clustering(singletons, 15)
    alg_clusters_matplotlib.plot_clusters(table, clusters, True)
Esempio n. 4
0
def question10(data, filename):
    """Save a plot comparing distortion of both clustering methods.

    data is passed to load_data_table/load_as_list and appears in the plot
    title; filename is where the figure is saved. Cluster counts run from
    6 to 20.
    """
    table = load_data_table(data)
    counts = range(6, 21)

    hier_curve = []

    def record(current_clusters):
        # Callback handed to hierarchical_clustering together with the set
        # of cluster counts of interest.
        hier_curve.append(distortion(current_clusters, table))

    hierarchical_clustering(load_as_list(data), 6, record, set(counts))
    # Values were collected in the opposite order to `counts`.
    hier_curve.reverse()

    # k-means starts from a freshly loaded list.
    kmeans_input = load_as_list(data)
    kmeans_curve = []
    for count in counts:
        clustered = kmeans_clustering(kmeans_input, count, 5)
        kmeans_curve.append(distortion(clustered, table))

    plt.cla()
    plt.plot(counts, hier_curve, '-r',
             label='Hierarchical clustering distortion')
    plt.plot(counts, kmeans_curve, '-b',
             label='k-means clustering distortion')
    plt.title('Clustering distortion (%s)' % data)
    plt.xlabel('Number of output clusters')
    plt.ylabel('Distortion')
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig(filename)
    print('Saved plot to %s' % filename)
Esempio n. 5
0
def question5(filename):
    """Visualize 9-cluster hierarchical clustering and print its distortion.

    filename is forwarded to visualize(); the distortion is printed both as
    a fixed-point float and in its default string form.
    """
    source = 'unifiedCancerData_111.csv'

    def cluster_into_nine(clusters):
        return hierarchical_clustering(clusters, 9)

    clustered = visualize(source, filename, cluster_into_nine)
    table = load_data_table(source)
    dist = distortion(clustered, table)
    print('Distortion in question5, hierarchical_clustering = %f (%s)' %
          (dist, dist))
def run_example(data_dir,  num_clusters):
    """
    Time both clustering methods on one data set and report their distortion.

    data_dir is handed to load_data_table(); num_clusters is the number of
    output clusters requested from each method (k-means gets 5 as its third
    argument).

    Returns the tuple
    (hierarchical_dur, hierarchical_dist, kmeans_dur, kmeans_dist), where
    the durations are wall-clock seconds around the clustering call only.
    """
    data_table = load_data_table(data_dir)

    def make_singletons():
        # One single-entry cluster per table row.
        return [alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
                for line in data_table]

    # Each algorithm gets its own input list: other users of this assignment
    # note that hierarchical clustering mutates the list it is given, which
    # would leave k-means working on already-merged clusters.
    hierarchical_input = make_singletons()
    begin = time.time()
    cluster_list = alg_project3_solution.hierarchical_clustering(hierarchical_input, num_clusters)
    hierarchical_dur = time.time() - begin
    # NOTE(review): 10e10 == 1e11, so this MULTIPLIES the distortion by 1e11.
    # If the intent was to express it in units of 10^11, this should be a
    # division - confirm with callers before changing.
    hierarchical_dist = compute_distortion(cluster_list, data_table) * 10e10

    kmeans_input = make_singletons()
    begin = time.time()
    cluster_list = alg_project3_solution.kmeans_clustering(kmeans_input, num_clusters, 5)
    kmeans_dur = time.time() - begin
    kmeans_dist = compute_distortion(cluster_list, data_table) * 10e10

    return hierarchical_dur, hierarchical_dist, kmeans_dur, kmeans_dist
Esempio n. 7
0
def run_question_3():
    data_table = pro3_viz.load_data_table(DATA_3108_URL)
    data_table_cl = clusterize_data(data_table)
    cluster_list = pro3.kmeans_clustering(data_table_cl, 15, 5)
    print "Displaying", len(cluster_list), "k-means clusters"

    # alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False)  # simple filled-in circles
    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list,
                                          True)  # add cluster centers
def q2():
    """Plot a 15-cluster hierarchical clustering of the 3108 table."""
    rows = viz.load_data_table(viz.DATA_3108_URL)

    def as_singleton(row):
        # Wrap one table row in a single-entry cluster.
        return alg_cluster.Cluster({row[0]}, row[1], row[2], row[3], row[4])

    singletons = [as_singleton(row) for row in rows]
    result = alg_project3.hierarchical_clustering(singletons, 15)
    alg_clusters_matplotlib.plot_clusters(rows, result, True)
Esempio n. 9
0
def compute_distortion():
    data_table = alp.load_data_table(alp.DATA_111_URL)
    cluster_list = alp.run_example()

    list1 = []
    for i in cluster_list:
        error = i.cluster_error(data_table)
        list1.append(error)
    print sum(list1)
def q6():
    """Plot a 9-cluster k-means clustering (third argument 5) of the 111 table."""
    rows = viz.load_data_table(viz.DATA_111_URL)

    def as_singleton(row):
        # Wrap one table row in a single-entry cluster.
        return alg_cluster.Cluster({row[0]}, row[1], row[2], row[3], row[4])

    singletons = [as_singleton(row) for row in rows]
    result = alg_project3.kmeans_clustering(singletons, 9, 5)
    alg_clusters_matplotlib.plot_clusters(rows, result, True)
def question7(url, num_clusters):
    data_table = load_data_table(url)
    singleton_list = []
    for line in data_table:
        cluster = Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        singleton_list.append(cluster)

    # cluster_list = hierarchical_clustering(singleton_list, num_clusters)
    cluster_list = kmeans_clustering(singleton_list, num_clusters, 5)
    print compute_distortion(cluster_list, data_table)
Esempio n. 12
0
def quality_check(data_url, alg):
    """Collect distortion values for cluster counts 6 through 20.

    data_url and alg are forwarded to get_clusters(); data_url is also
    forwarded to compute_distortion().

    Returns a tuple (counts, distortions) of two equally long lists.
    """
    table = load_data_table(data_url)
    counts = list(range(6, 21))
    distortions = [
        compute_distortion(
            get_clusters(data_url, count, alg, table), data_url, table)
        for count in counts
    ]
    return (counts, distortions)
Esempio n. 13
0
def run_question_5():
    data_table = pro3_viz.load_data_table(DATA_111_URL)
    data_table_cl = clusterize_data(data_table)
    cluster_list = pro3.hierarchical_clustering(data_table_cl, 9)
    print "Displaying", len(cluster_list), "hierarchical clusters"

    # alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False)  # simple filled-in circles
    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list,
                                          True)  # add cluster centers

    return cluster_list
def question5_plot():
    """
    Draw the question 5 figure: 9 hierarchical clusters of the 111 table.
    """
    table = load_data_table(DATA_111_URL)
    # One single-entry cluster per table row.
    singletons = [
        Cluster({row[0]}, row[1], row[2], row[3], row[4]) for row in table
    ]
    plot_clusters(table, hierarchical_clustering(singletons, 9), True)
Esempio n. 15
0
def run_question_7():
    data_table = pro3_viz.load_data_table(DATA_111_URL)
    data_table_cl = clusterize_data(data_table)

    hierarchical_cluster_list = pro3.hierarchical_clustering(data_table_cl, 9)
    kmeans_cluster_list = pro3.kmeans_clustering(data_table_cl, 9, 5)

    print "hierarchical clustering:", compute_distortion(
        data_table,
        hierarchical_cluster_list)  # note that 'data_table' is not clusterized
    print "k-means clustering:", compute_distortion(data_table,
                                                    kmeans_cluster_list)
def question3_plot():
    """
    Draw the question 3 figure: 15 k-means clusters of the 3108 table.
    """
    table = load_data_table(DATA_3108_URL)
    # One single-entry cluster per table row.
    singletons = [
        Cluster({row[0]}, row[1], row[2], row[3], row[4]) for row in table
    ]
    plot_clusters(table, kmeans_clustering(singletons, 15, 5), True)
def q10():
    """Plot k-means vs. hierarchical distortion for the 111/290/896 tables.

    For each table, distortion is computed for every cluster count from 6 to
    20 with both methods, then one figure per table is shown.
    """
    nodes_list = {viz.DATA_111_URL: 111, viz.DATA_290_URL: 290, viz.DATA_896_URL: 896}
    url_list = [viz.DATA_111_URL, viz.DATA_290_URL, viz.DATA_896_URL]
    # Defined once up front so the plotting loop never depends on a variable
    # that only exists as a side effect of the computation loop.
    cluster_range = range(6, 20 + 1)

    kmeans_dict = dict()
    hierarchical_dict = dict()

    for url in url_list:
        data_table = viz.load_data_table(url)
        singleton_list = [
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
            for line in data_table
        ]

        # k-means runs first, while singleton_list is still untouched.
        kmeans_dict[url] = [
            compute_distortion(
                alg_project3.kmeans_clustering(singleton_list, cluster_count, 5),
                data_table)
            for cluster_count in cluster_range
        ]

        # Hierarchical: walk from 20 clusters down to 6, feeding each result
        # into the next call so earlier merges are reused. The returned list
        # is what gets measured, so this no longer relies on
        # hierarchical_clustering mutating its argument in place.
        cluster_list = singleton_list
        hierarchical_errors = []
        for cluster_count in reversed(cluster_range):
            cluster_list = alg_project3.hierarchical_clustering(cluster_list, cluster_count)
            hierarchical_errors.append(compute_distortion(cluster_list, data_table))
        # Collected for 20..6; flip to line up with cluster_range (6..20).
        hierarchical_errors.reverse()
        hierarchical_dict[url] = hierarchical_errors

    for url in url_list:
        plt.title('Distortion for hierarchical and k-means clustering for '+str(nodes_list[url])+' points')
        plt.xlabel('Number of clusters')
        plt.ylabel('Distortion')
        line1, = plt.plot(cluster_range, kmeans_dict[url], 'g')
        line2, = plt.plot(cluster_range, hierarchical_dict[url], 'b')
        plt.legend((line1, line2), ('kmeans clustering', 'hierarchical clustering'))
        plt.show()




#q2()
#q2()
#q3()
#q5()
#q6()
#q7()
#q10()
def q7():

	data_table = viz.load_data_table(viz.DATA_111_URL)
	singleton_list = []
	for line in data_table:
		singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

	cluster_list = alg_project3.kmeans_clustering(singleton_list, 9, 5)
	error2 = compute_distortion(cluster_list, data_table)
	
	cluster_list = alg_project3.hierarchical_clustering(singleton_list, 9)
	error1 = compute_distortion(cluster_list, data_table)

	print 'hierarchical clustering',error1
	print 'kmeans clustering', error2
def q10():
    """Plot k-means vs. hierarchical distortion for the 111/290/896 tables.

    For each table, distortion is computed for every cluster count from 6 to
    20 with both methods, then one figure per table is shown.
    """
    nodes_list = {
        viz.DATA_111_URL: 111,
        viz.DATA_290_URL: 290,
        viz.DATA_896_URL: 896
    }
    url_list = [viz.DATA_111_URL, viz.DATA_290_URL, viz.DATA_896_URL]
    # Defined once up front so the plotting loop never depends on a variable
    # that only exists as a side effect of the computation loop.
    cluster_range = range(6, 20 + 1)

    kmeans_dict = dict()
    hierarchical_dict = dict()

    for url in url_list:
        data_table = viz.load_data_table(url)
        singleton_list = [
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]) for line in data_table
        ]

        # k-means runs first, while singleton_list is still untouched.
        kmeans_dict[url] = [
            compute_distortion(
                alg_project3.kmeans_clustering(singleton_list, cluster_count,
                                               5), data_table)
            for cluster_count in cluster_range
        ]

        # Hierarchical: walk from 20 clusters down to 6, feeding each result
        # into the next call so earlier merges are reused. The returned list
        # is what gets measured, so this no longer relies on
        # hierarchical_clustering mutating its argument in place.
        cluster_list = singleton_list
        hierarchical_errors = []
        for cluster_count in reversed(cluster_range):
            cluster_list = alg_project3.hierarchical_clustering(
                cluster_list, cluster_count)
            hierarchical_errors.append(
                compute_distortion(cluster_list, data_table))
        # Collected for 20..6; flip to line up with cluster_range (6..20).
        hierarchical_errors.reverse()
        hierarchical_dict[url] = hierarchical_errors

    for url in url_list:
        plt.title('Distortion for hierarchical and k-means clustering for ' +
                  str(nodes_list[url]) + ' points')
        plt.xlabel('Number of clusters')
        plt.ylabel('Distortion')
        line1, = plt.plot(cluster_range, kmeans_dict[url], 'g')
        line2, = plt.plot(cluster_range, hierarchical_dict[url], 'b')
        plt.legend((line1, line2),
                   ('kmeans clustering', 'hierarchical clustering'))
        plt.show()
def question7():

    data_table = alg_project3_viz.load_data_table(DATA_111_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

    clusters_hierarchical = project3.hierarchical_clustering(singleton_list, 9)
    clusters_kmeans = project3.kmeans_clustering(singleton_list, 9, 5)

    distortion_hierarchical = compute_distortion(clusters_hierarchical, data_table)
    distortion_kmeans = compute_distortion(clusters_kmeans, data_table)

    print "distortion hierarchical: ", distortion_hierarchical
    print "distortion k-means: ", distortion_kmeans
def q7():
    """Print distortion of 9-cluster hierarchical and k-means runs on the 111 table."""
    table = viz.load_data_table(viz.DATA_111_URL)
    singletons = [
        alg_cluster.Cluster({row[0]}, row[1], row[2], row[3], row[4])
        for row in table
    ]

    # k-means is evaluated before hierarchical clustering; both share the
    # same input list.
    kmeans_error = compute_distortion(
        alg_project3.kmeans_clustering(singletons, 9, 5), table)
    hierarchical_error = compute_distortion(
        alg_project3.hierarchical_clustering(singletons, 9), table)

    print('hierarchical clustering', hierarchical_error)
    print('kmeans clustering', kmeans_error)
def question10_plot():
    """Show distortion curves of both clustering methods for three data sets.

    Hierarchical clustering is carried out inline by repeatedly merging the
    pair reported by fast_closest_pair, recording the distortion whenever
    the number of remaining clusters is between 6 and 20.
    """
    def fresh_singletons(table):
        # One single-entry cluster per table row.
        return [
            Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
            for row in table
        ]

    sizes = range(6, 21)
    sources = zip([DATA_111_URL, DATA_290_URL, DATA_896_URL],
                  [111, 290, 896])

    for url, data_size in sources:
        data_table = load_data_table(url)

        # --- hierarchical clustering, merged down to 6 clusters ---
        # NOTE(review): fast_closest_pair may expect its input sorted by
        # horizontal position; no sort happens here - confirm.
        merged = fresh_singletons(data_table)
        hier_descending = []
        while len(merged) > 6:
            closest = fast_closest_pair(merged)
            keep, drop = closest[1], closest[2]
            merged[keep].merge_clusters(merged[drop])
            merged.pop(drop)
            if len(merged) in sizes:
                hier_descending.append(
                    compute_distortion(merged, data_table))

        # --- k-means, restarted from fresh singletons for every size ---
        kmeans_ascending = [
            compute_distortion(
                kmeans_clustering(fresh_singletons(data_table), size, 5),
                data_table)
            for size in sizes
        ]

        # Hierarchical values were gathered from 20 down to 6; flip them.
        plt.plot(sizes, list(reversed(hier_descending)))
        plt.plot(sizes, kmeans_ascending)
        plt.legend(['hierarchical clustering',
                    'k-means clustering (5 iterations)'],
                   loc='upper right')
        plt.title('Distortion with %d county data' % (data_size))
        plt.xlabel('Size of Clusters')
        plt.ylabel('Distortion')
        plt.show()
Esempio n. 23
0
def load_data(cancer_id):
    '''
    Fetch one cancer data table and wrap each row in its own cluster.

    Input: cancer_id - string spliced into the CSV file name.

    Output: a tuple (data_table, singleton_list).
    '''
    base = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    table = viz.load_data_table(
        base + "data_clustering/unifiedCancerData_" + cancer_id + ".csv")

    singletons = [
        alg_cluster.Cluster({row[0]}, row[1], row[2], row[3], row[4])
        for row in table
    ]
    return (table, singletons)
Esempio n. 24
0
def compute_distortion(cluster_list, data_table=None):
    '''
    Computes the total distortion of a list of clusters.

    Input: cluster_list - a list of clusters.
           data_table - optional table passed to each cluster's
           cluster_error(); when omitted, the 111 cancer data table is
           downloaded, which is what this function always did before.

    Output: total_distortion - the sum of cluster_error() over all clusters
    '''
    if data_table is None:
        # Backward-compatible default: fetch the 111 table on demand.
        DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
        DATA_111_URL = DIRECTORY + "data_clustering/unifiedCancerData_111.csv"
        data_table = viz.load_data_table(DATA_111_URL)

    return sum(cluster.cluster_error(data_table) for cluster in cluster_list)
Esempio n. 25
0
def cluster_data():
    '''
    Cluster the 111 cancer data table with both methods (9 clusters each).

    Output: a tuple of two lists of clusters (hierarchical, kmeans)
    '''
    DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    DATA_111_URL = DIRECTORY + "data_clustering/unifiedCancerData_111.csv"
    DATA_290_URL = DIRECTORY + "data_clustering/unifiedCancerData_290.csv"

    table = viz.load_data_table(DATA_111_URL)
    for_hierarchical = [
        alg_cluster.Cluster({row[0]}, row[1], row[2], row[3], row[4])
        for row in table
    ]
    # k-means gets independent copies of the singleton clusters.
    for_kmeans = [cluster.copy() for cluster in for_hierarchical]

    hierarchical = project.hierarchical_clustering(for_hierarchical, 9)
    kmeans = project.kmeans_clustering(for_kmeans, 9, 5)
    return (hierarchical, kmeans)
Esempio n. 26
0
def plot_Q6():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    DATA_111_URL = DIRECTORY + "data_clustering/unifiedCancerData_111.csv"

    data_table = viz.load_data_table(DATA_111_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))

    cluster_list = project.kmeans_clustering(singleton_list, 9, 5)
    print "Displaying", len(cluster_list), "hierarchical clusters"

    # draw the clusters using matplotlib
    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False)
def question10(data, filename):
    """Save a plot comparing distortion of both clustering methods.

    data is passed to load_data_table/Cluster.load_as_list and appears in
    the plot title; filename is where the figure is saved. Cluster counts
    run from 6 to 20.
    """
    table = load_data_table(data)
    clusters = Cluster.load_as_list(data)
    xs = range(6, 21)
    ys_hier = []

    def dist(clusters):
        # Callback handed to hierarchical_clustering together with the set
        # of cluster counts of interest.
        ys_hier.append(distortion(clusters, table))

    hierarchical_clustering(clusters, 6, dist, set(xs))
    # Values were collected in the opposite order to xs.
    ys_hier.reverse()

    # Reload before k-means: the list above has just been through
    # hierarchical clustering down to 6 clusters, and k-means must start
    # from the original data rather than from whatever that call left behind.
    clusters = Cluster.load_as_list(data)
    ys_kmeans = [distortion(kmeans_clustering(clusters, x, 5), table) for x in xs]

    plt.cla()
    plt.plot(xs, ys_hier, '-r', label='Hierarchical clustering distortion')
    plt.plot(xs, ys_kmeans, '-b', label='K-means clustering distortion')
    plt.title('Clustering distortion (%s)' % data)
    plt.xlabel('Number of output clusters')
    plt.ylabel('Distortion')
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig(filename)
    print('Saved plot to %s' % filename)
Esempio n. 28
0
def gen_random_clusters():
    """Plot total cluster error for cluster counts 6..20 on the 896 table.

    alp.run_example(count, True) supplies the curve labelled
    'hierarchical clustering' and alp.run_example(count, False) the one
    labelled 'kmeans clustering'.
    """
    table = alp.load_data_table(alp.DATA_896_URL)

    def total_error(clusters):
        # Sum of every cluster's error against the 896 table.
        return sum([cluster.cluster_error(table) for cluster in clusters])

    counts = []
    errors_true = []
    errors_false = []
    for count in range(6, 21):
        # Both clusterings are produced before either is measured.
        clusters_true = alp.run_example(count, True)
        clusters_false = alp.run_example(count, False)

        error_true = total_error(clusters_true)
        error_false = total_error(clusters_false)

        counts.append(count)
        errors_true.append(error_true)
        errors_false.append(error_false)

    plt.plot(counts, errors_true, '-r', label='hierarchical clustering')
    plt.plot(counts, errors_false, '-b', label='kmeans clustering')
    plt.legend(loc='upper right')
    plt.title("Quality - Data Set of 896")
    plt.ylabel('Total error')
    plt.xlabel('Number of clusters')
    plt.show()
Esempio n. 29
0
def compute_q5_q6():
    """Print the distortion of 9-cluster k-means and hierarchical runs (111 table)."""
    table111 = viz_tools.load_data_table(DATA_111_URL)

    # Wrap every row in its own single-entry cluster.
    singletons = [
        alg_cluster.Cluster({row[0]}, row[1], row[2], row[3], row[4])
        for row in table111
    ]

    # Order matters: clustering.hierarchical_clustering mutates the list it
    # is given, so k-means has to run on it first.
    k_distortion = compute_distortion(
        clustering.kmeans_clustering(singletons, 9, 5), table111)
    print("K-means Distortion: {}".format(k_distortion))

    h_distortion = compute_distortion(
        clustering.hierarchical_clustering(singletons, 9), table111)
    print("Hierarchical Distortion: {}".format(h_distortion))
def run_example():
    """
    Modified to do question 7 showing distoration
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    # DATA_3108_URL DATA_111_URL DATA_290_URL
    data_table = load_data_table(DATA_111_URL) 
    
    singleton_list = []
    for line in data_table:
        singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))
        
    print "___________"
    cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 9)
    print "Displaying", len(cluster_list), "hierarchical clusters"
    print "with Distoration of {0}".format(compute_distortion(cluster_list, data_table))
    
    print "___________"
    cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 9, 5)	
    print "Displaying", len(cluster_list), "k-means clusters"
    print "with Distoration of {0}".format(compute_distortion(cluster_list, data_table))
Esempio n. 31
0
def test_compute_distortion():
    """Print 16-cluster k-means and hierarchical distortion for the 290 table."""
    table290 = viz_tools.load_data_table(DATA_290_URL)

    # Wrap every row in its own single-entry cluster.
    singletons = [
        alg_cluster.Cluster({row[0]}, row[1], row[2], row[3], row[4])
        for row in table290
    ]

    # Order matters: clustering.hierarchical_clustering mutates the list it
    # is given, so k-means has to run on it first.

    # Test 2: expected value 2.323 x 10^11
    k_distortion = compute_distortion(
        clustering.kmeans_clustering(singletons, 16, 5), table290)
    print("K-means Distortion: {}".format(k_distortion))

    # Test 1: expected value 2.575 x 10^11
    h_distortion = compute_distortion(
        clustering.hierarchical_clustering(singletons, 16), table290)
    print("Hierarchical Distortion: {}".format(h_distortion))
def compute_and_plot_distortions():
    """
    Compute the distortion of the list of clusters produced by hierarchical clustering and k-means clustering (using 5 iterations)
    on the 111, 290, and 896 county data sets, respectively, where the number of output clusters ranges from 6 to 20 (inclusive).
    Important note:To compute the distortion for all 15 output clusterings produced by hierarchical_clustering, you should remember
    that you can use the hierarchical cluster of size 20 to compute the hierarchical clustering of size 19 and so on. Otherwise,
     you will introduce an unnecessary factor of 15 into the computation of the 15 hierarchical clusterings.
    """

    #choose data set:
    #data_table = viz.load_data_table(viz.DATA_111_URL)
    #data_table = viz.load_data_table(viz.DATA_290_URL)
    data_table = viz.load_data_table(viz.DATA_896_URL)

    num_output_clusters = []
    kmeans_distortion = []
    hierarchical_distortion = []

    print "\nComputing kmeans distortions"
    for indx in range(6, 21):
        ##Dette loop kunne optimeres, saa beregningerne genbruges, men det er ikke noedvendigt, da k_means er saa hurtig
        num_output_clusters.append(indx)

        singleton_list = []
        for line in data_table:
            singleton_list.append(
                c.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

        kmeans_cluster_list = p.kmeans_clustering(singleton_list, indx, 5)
        distortion = compute_distortion(kmeans_cluster_list, data_table)
        kmeans_distortion.append(distortion)
        print indx, distortion

    print "Computed kmeans distortions"
    print ""
    print "Computing hierarchical distortions"

    for line in data_table:
        singleton_list.append(
            c.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

    hierarchical_cluster_list = singleton_list

    for indx in range(20, 5, -1):
        hierarchical_cluster_list = p.hierarchical_clustering(
            hierarchical_cluster_list, indx)
        distortion = compute_distortion(hierarchical_cluster_list, data_table)
        hierarchical_distortion.append(distortion)
        print indx, distortion

    hierarchical_distortion.reverse()

    print "Computed hierarchical distortions\n"
    print "Plotting data"

    plt.plot(num_output_clusters,
             kmeans_distortion,
             label="K-means clustering")
    plt.plot(num_output_clusters,
             hierarchical_distortion,
             label="Hierarchical clustering")

    plt.xlabel("Number of output clusters")
    plt.ylabel('Distortion')

    #tegner
    plt.legend()

    plt.title(
        "Comparison of distortion of two clustering methods \n Dataset: 896 counties"
    )

    #goer det hele synligt
    plt.show()
import alg_project3_viz
import proj3_solution
import alg_cluster
import matplotlib.pyplot as plt


# Data set used by the functions below: the 896 table, loaded once when this
# module is imported.
DATA_URL = alg_project3_viz.DATA_896_URL
DATA_TABLE = alg_project3_viz.load_data_table(DATA_URL)


def compute_distortion(cluster_list, data_table=None):
    """
    Computes the distortion of a list of clusters: the sum of each cluster's
    cluster_error() against a data table.

    cluster_list is the list of clusters to measure. data_table is optional;
    when omitted the module-level DATA_TABLE is used, which matches the
    previous behaviour.
    """
    if data_table is None:
        data_table = DATA_TABLE
    total_distortion = 0
    for cluster in cluster_list:
        total_distortion += cluster.cluster_error(data_table)
    return total_distortion


def compute_plot(x_list, y1_list, y2_list):
    """
    Draws two distortion curves over x_list on a new 12x8 figure:
    y1_list labelled as hierarchical clustering (blue) and y2_list labelled
    as k-means clustering (red). The figure is not shown or saved here.
    """
    plt.figure(figsize=(12, 8), dpi=80)

    curves = (
        (y1_list, '-b', 'hierarchical clustering'),
        (y2_list, '-r', 'k-means clustering(5 iterations)'),
    )
    for values, line_style, caption in curves:
        plt.plot(x_list, values, line_style, label=caption)

    plt.xlabel('The number of output clusters')
    plt.ylabel('The distortion produced by the clustering methods')
    plt.title('Comparison of Distortion by two clustering methods on 896 counties')
    plt.legend(loc='upper right', prop={'size': 13.5})
def question10():
    """Plot hierarchical vs. k-means distortion for the 111/290/896 tables.

    For each table, both methods are run for every cluster count from 6 to
    20 (k-means with 5 as its third argument). All distortions are computed
    first; afterwards one figure per table is shown, in the order
    111, 290, 896.
    """
    datasets = [
        (111, DATA_111_URL),
        (290, DATA_290_URL),
        (896, DATA_896_URL),
    ]
    clusters = range(6, 21)

    # Load every table, then build every cluster list, as before.
    data_tables = [alg_project3_viz.load_data_table(url)
                   for _, url in datasets]
    cluster_lists = [compute_data_table(table) for table in data_tables]

    # One (hierarchical_y, kmeans_y) pair per data set. Each data set has its
    # own cluster list, and within a data set the calls still alternate
    # hierarchical / k-means for increasing cluster counts.
    # NOTE(review): each cluster list is reused for all 30 clustering calls
    # of its data set; this presumes the clustering routines leave their
    # input unchanged - confirm.
    curves = []
    for data_table, cluster_list in zip(data_tables, cluster_lists):
        distortion_h_y = []
        distortion_k_y = []
        for idx in clusters:
            hierarchical = project3.hierarchical_clustering(cluster_list, idx)
            distortion_h_y.append(compute_distortion(hierarchical, data_table))

            kmeans = project3.kmeans_clustering(cluster_list, idx, 5)
            distortion_k_y.append(compute_distortion(kmeans, data_table))
        curves.append((distortion_h_y, distortion_k_y))

    for (size, _), (distortion_h_y, distortion_k_y) in zip(datasets, curves):
        plt.plot(clusters, distortion_h_y, '-b', label='hierarchical')
        plt.plot(clusters, distortion_k_y, '-r', label='k-means')
        plt.title('Distortion for %d points' % size)
        plt.legend(loc='upper right')
        plt.xlabel('Number of clusters')
        plt.ylabel('Distortion')
        plt.show()
def create_cluster_list(url):
    """Load the table at url and wrap each row in a single-entry cluster.

    Returns the pair (singleton_list, data_table).
    """
    table = viz.load_data_table(url)
    singletons = [
        alg_cluster.Cluster({row[0]}, row[1], row[2], row[3], row[4])
        for row in table
    ]
    return singletons, table
def question5(filename):
    """Visualize 9-cluster hierarchical clustering and print its distortion.

    filename is forwarded to visualize(); the distortion is printed both as
    a fixed-point float and in its default string form.
    """
    source = 'data/unifiedCancerData_111.csv'

    def cluster_into_nine(clusters):
        return hierarchical_clustering(clusters, 9)

    clustered = visualize(source, filename, cluster_into_nine)
    table = load_data_table(source)
    dist = distortion(clustered, table)
    print('Distortion in question5, hierarchical_clustering = %f (%s)' % (dist, dist))
def question6(filename):
    """Visualize 9-cluster k-means (third argument 5) and print its distortion.

    filename is forwarded to visualize(); the distortion is printed both as
    a fixed-point float and in its default string form.
    """
    source = 'data/unifiedCancerData_111.csv'

    def cluster_into_nine(clusters):
        return kmeans_clustering(clusters, 9, 5)

    clustered = visualize(source, filename, cluster_into_nine)
    table = load_data_table(source)
    dist = distortion(clustered, table)
    print('Distortion in question6, kmeans = %f (%s)' % (dist, dist))
Esempio n. 38
0
"""
Assignment 3 Question 7 Answer
"""

import alg_project3_viz as viz
import alg_project3_solution as sol
import alg_cluster

# Load the 111 table that both clusterings are measured against.
data_table = viz.load_data_table(viz.DATA_111_URL)

# Build a separate input list for each algorithm so neither run sees a list
# the other has already been given.
hier_data_list = sol.make_data_list(data_table)
kmeans_data_list = sol.make_data_list(data_table)

# 9 output clusters each; k-means takes 5 as its third argument
# (presumably the iteration count - confirm against sol.kmeans_clustering).
hier_cluster_list = sol.hierarchical_clustering(hier_data_list, 9)
kmeans_cluster_list = sol.kmeans_clustering(kmeans_data_list, 9, 5)

# Report the distortion of each clustering.
print("hierarchical:", sol.compute_distortion(hier_cluster_list, data_table))
print("kmeans:", sol.compute_distortion(kmeans_cluster_list, data_table))


# Hierarchical: 175163886915.8305 or 1.752 x 10^11 with four significant figures
# K-means: 271254226924.20047 or 2.712 x 10^11
# Copyright 2016 by Soros Liu
#
#                                                                          All Rights Reserved
"""

"""
import project_closest_pair_clustering as provided
import alg_project3_viz as viz
import alg_cluster
__author__ = 'Soros Liu'


def compute_distortion(cluster_list, data_table):
    """
    Return the total distortion of a clustering.

    The distortion is the sum of ``cluster.cluster_error(data_table)`` over
    every cluster in ``cluster_list``. An empty ``cluster_list`` yields 0.0.
    """
    total = 0.0
    for member in cluster_list:
        total = total + member.cluster_error(data_table)
    return total


if __name__ == '__main__':
    # Script entry point: print the distortion of hierarchical clustering
    # with 16 output clusters on the 290-row data file.
    data_table = viz.load_data_table('unifiedCancerData_290.csv')
    # One single-member cluster per table row.
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]
    cluster_list = provided.hierarchical_clustering(singleton_list, 16)
    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the former print statement was a SyntaxError on Python 3.
    print(compute_distortion(cluster_list, data_table))
    # k-means alternative (16 clusters, 5 iterations):
    # cluster_list = provided.kmeans_clustering(singleton_list, 16, 5)
    # print(compute_distortion(cluster_list, data_table))
Esempio n. 40
0
def q10(plot_key):
    """
    Plot k-means vs. hierarchical distortion for one cancer data set.

    For every cluster count from 20 down to 6 the selected data set is
    clustered with k-means (5 iterations) and with hierarchical clustering,
    and the distortion of each result is plotted against the cluster count.

    Only the data set selected by ``plot_key`` is loaded and clustered; the
    other two are no longer fetched needlessly.

    Args:
        plot_key: 111, 290 or 896, selecting the data set. Any other value
            produces a titled figure without curves, as it did before.
    """
    # (size, data URL, k-means line format, hierarchical line format)
    configs = (
        (111, DATA_111_URL, '-bo', '-co'),
        (290, DATA_290_URL, '-go', '-mo'),
        (896, DATA_896_URL, '-ro', '-yo'),
    )
    selected = None
    for config in configs:
        if plot_key == config[0]:
            selected = config
            break

    cluster_counts = list(range(20, 5, -1))
    kmeans_distortion = []
    hier_distortion = []

    if selected is not None:
        table = viz_tools.load_data_table(selected[1])

        def create_cluster(line):
            return alg_cluster.Cluster(set([line[0]]), line[1], line[2],
                                       line[3], line[4])

        # Separate lists (and separate Cluster objects) per algorithm, so
        # that neither run can observe changes made by the other.
        klist = [create_cluster(line) for line in table]
        hlist = [create_cluster(line) for line in table]

        # Counts are visited in descending order and the same hierarchical
        # input list is passed on every iteration, exactly as before.
        for num in cluster_counts:
            kmeans_clusters = clustering.kmeans_clustering(klist, num, 5)
            hier_clusters = clustering.hierarchical_clustering(hlist, num)
            kmeans_distortion.append(
                compute_distortion(kmeans_clusters, table))
            hier_distortion.append(compute_distortion(hier_clusters, table))

    # Plot results
    plt.figure('Distortion for Different Clustering Methods')
    plt.title('Distortion for Different Clustering Methods: {} Points'.format(
        plot_key))
    plt.xlabel('Number of Clusters')
    plt.ylabel('Distortion')

    if selected is not None:
        size, _, kmeans_format, hier_format = selected
        plt.plot(cluster_counts, kmeans_distortion, kmeans_format,
                 markersize=1, label='K-means ({})'.format(size))
        plt.plot(cluster_counts, hier_distortion, hier_format,
                 markersize=1, label='Hierarchical ({})'.format(size))

    plt.legend(loc='best')

    plt.show()
# -*- Mode: Python -*-
# Author: Soros Liu <*****@*****.**>

# ==================================================================================================
# Copyright 2016 by Soros Liu
#
#                                                                          All Rights Reserved
"""

"""
import project_closest_pair_clustering as provided
import alg_project3_viz as viz
import alg_cluster
__author__ = 'Soros Liu'

def compute_distortion(cluster_list, data_table):
    """
    Return the total distortion of a clustering.

    The distortion is the sum of ``cluster.cluster_error(data_table)`` over
    every cluster in ``cluster_list``. An empty ``cluster_list`` yields 0.0.
    """
    running_total = 0.0
    for current in cluster_list:
        running_total = running_total + current.cluster_error(data_table)
    return running_total

if __name__ == '__main__':
    # Script entry point: print the distortion of hierarchical clustering
    # with 16 output clusters on the 290-row data file.
    data_table = viz.load_data_table('unifiedCancerData_290.csv')
    # One single-member cluster per table row.
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]
    cluster_list = provided.hierarchical_clustering(singleton_list, 16)
    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the former print statement was a SyntaxError on Python 3.
    print(compute_distortion(cluster_list, data_table))
    # k-means alternative (16 clusters, 5 iterations):
    # cluster_list = provided.kmeans_clustering(singleton_list, 16, 5)
    # print(compute_distortion(cluster_list, data_table))
from alg_project3_viz import load_data_table, DATA_896_URL, DATA_290_URL, DATA_111_URL, DATA_3108_URL
from clustering_algorithms import *
from cluster import *
import matplotlib.pyplot as plt

# Every data set is loaded once, at import time, and kept in a module-level
# constant named after its URL constant.
DATA_TABLE_3108 = load_data_table(DATA_3108_URL)
DATA_TABLE_896 = load_data_table(DATA_896_URL)
DATA_TABLE_290 = load_data_table(DATA_290_URL)
DATA_TABLE_111 = load_data_table(DATA_111_URL)
# The 111, 290 and 896 tables, in that order; the 3108 table is not included.
data_table_list = [DATA_TABLE_111, DATA_TABLE_290, DATA_TABLE_896]

def compute_distortion(cluster_list, data_table):
    """
    Return the distortion of a clustering: the sum of
    ``cluster_error(data_table)`` over all clusters in ``cluster_list``.
    """
    per_cluster_errors = (
        member.cluster_error(data_table) for member in cluster_list
    )
    return sum(per_cluster_errors)


def create_singleton_list(data_table):
    """
    Build the initial cluster list: one ``Cluster`` per row of ``data_table``.

    For each row, a one-element set holding ``row[0]`` is the first
    constructor argument and ``row[1]`` .. ``row[4]`` are passed through
    unchanged. An empty table gives an empty list.
    """
    return [
        Cluster({row[0]}, row[1], row[2], row[3], row[4])
        for row in data_table
    ]

def distortion_analysis(clustering_func, data_table, min_cluster_num, max_cluster_num):
	"""
	NOTE(review): this definition appears to be cut off in this excerpt.

	The visible body only initialises an empty result list and, when
	clustering_func is hierarchical_clustering, builds a singleton cluster
	list from data_table. min_cluster_num and max_cluster_num are not used
	in the visible part and nothing is returned -- confirm against the
	complete source before relying on this function.
	"""
	ans = []
	if clustering_func == hierarchical_clustering:
		cluster_list = create_singleton_list(data_table)
def a3q10_distortion(data_table):
    """
    Collect distortion values for 6 through 20 output clusters.

    ``distortion(data_table, num)`` is evaluated once per cluster count;
    element 0 of its result is filed as the k-means value and element 1 as
    the hierarchical value.
    NOTE(review): that element order is taken from how the results were
    filed here -- confirm against ``distortion``.

    Returns:
        A ``(hierarchical_distortion, kmean_distortion)`` tuple of lists,
        both ordered by ascending cluster count.
    """
    kmean_distortion = []
    hierarchical_distortion = []
    for num in range(6, 21):
        # One call per cluster count: the previous version ran the whole
        # computation twice just to pick out each element separately.
        result = distortion(data_table, num)
        kmean_distortion.append(result[0])
        hierarchical_distortion.append(result[1])
    return (hierarchical_distortion, kmean_distortion)


def a3q10_plot(x, y1, y2, data):
    """
    Show a figure with two labelled distortion curves over ``x``.

    ``y1`` is drawn in blue and labelled as hierarchical clustering, ``y2``
    in red and labelled as k-means clustering. ``data`` is only used, via
    ``str(data)``, in the figure title.
    """
    heading = "Distortion of Hierarchical/KMeans of " + str(data) + " data"
    plt.title(heading)
    plt.xlabel("number of initial clusters")
    plt.ylabel("distortion")
    curves = (
        (y1, '-b', 'Hierarchical clustering'),
        (y2, '-r', 'KMeans clustering'),
    )
    for y_values, line_format, curve_label in curves:
        plt.plot(x, y_values, line_format, label=curve_label)
    plt.legend(loc='upper right')
    plt.show()


# Plot the distortion comparison for the 896-row data set.
data = 896
data_table = viz.load_data_table(viz.DATA_896_URL)
x = list(range(6, 21))
# a3q10_distortion returns (hierarchical, k-means); run the sweep once and
# unpack both series instead of recomputing everything for each of them.
y1, y2 = a3q10_distortion(data_table)
a3q10_plot(x, y1, y2, data)
Esempio n. 44
0
def question6(filename):
    """
    Report the distortion of k-means clustering (9 clusters, 5 iterations)
    on the 111-row cancer data file.

    ``visualize`` is called first with the data path, ``filename`` and a
    callable that runs ``kmeans_clustering(x, 9, 5)``; its result and a
    freshly loaded copy of the data table are then handed to ``distortion``.
    The resulting value is printed, nothing is returned.
    """
    csv_path = 'unifiedCancerData_111.csv'

    def run_kmeans(singletons):
        return kmeans_clustering(singletons, 9, 5)

    # Keep the original evaluation order: visualize first, then reload the table.
    clustered = visualize(csv_path, filename, run_kmeans)
    table = load_data_table(csv_path)
    dist = distortion(clustered, table)
    print('Distortion in question6, kmeans = %f (%s)' % (dist, dist))
Esempio n. 45
0
import alg_project3_viz
import proj3_solution
import alg_cluster
import matplotlib.pyplot as plt

# Data set used by this module: the 896 URL constant from alg_project3_viz.
DATA_URL = alg_project3_viz.DATA_896_URL
# Loaded once at import time; compute_distortion below reads this global.
DATA_TABLE = alg_project3_viz.load_data_table(DATA_URL)


def compute_distortion(cluster_list):
    """
    Return the distortion of ``cluster_list`` measured against the
    module-level ``DATA_TABLE``.

    The distortion is the sum of ``cluster.cluster_error(DATA_TABLE)`` over
    all clusters; an empty list gives 0.
    """
    accumulated = 0
    for member in cluster_list:
        accumulated = accumulated + member.cluster_error(DATA_TABLE)
    return accumulated


def compute_plot(x_list, y1_list, y2_list):
    """
    Draw two distortion curves over ``x_list`` on a new 12x8 figure.

    ``y1_list`` is drawn in blue and labelled as hierarchical clustering,
    ``y2_list`` in red and labelled as k-means clustering (5 iterations).
    Axis labels, title and legend are set; the visible code does not call
    ``plt.show()``.
    """
    plt.figure(figsize=(12, 8), dpi=80)
    curves = (
        (y1_list, '-b', 'hierarchical clustering'),
        (y2_list, '-r', 'k-means clustering(5 iterations)'),
    )
    for y_values, line_format, curve_label in curves:
        plt.plot(x_list, y_values, line_format, label=curve_label)
    plt.xlabel('The number of output clusters')
    plt.ylabel('The distortion produced by the clustering methods')
    heading = 'Comparison of Distortion by two clustering methods on 896 counties'
    plt.title(heading)
    plt.legend(loc='upper right', prop={'size': 13.5})
def test_distortion():
    """
    Plot k-means vs. hierarchical distortion for the 111, 290 and 896 data
    sets, one figure per data set.

    For each data set the distortion (sum of the errors of all clusters) is
    computed for 6 through 20 output clusters with both methods and shown
    with ``pyplot``.

    The k-means results are built with list comprehensions rather than
    ``map``: on Python 3 ``map`` is lazy, so the clustering would only run
    when the result was consumed -- after ``singleton_list`` had been
    rebound to the list used by hierarchical clustering -- and a ``map``
    object would have been passed to ``pyplot.plot``.
    """
    def make_singletons(table):
        # One single-member cluster per table row.
        return [
            Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
            for line in table
        ]

    titled_urls = (
        ('111', DATA_111_URL),
        ('290', DATA_290_URL),
        ('896', DATA_896_URL),
    )
    ascending_cluster_range = list(range(6, 21))

    for title, url in titled_urls:
        data_table = alg_project3_viz.load_data_table(url)

        # k-means: every cluster count starts from the same singleton list.
        kmeans_singletons = make_singletons(data_table)
        kmeans_cluster_lists = [
            closestpair.kmeans_clustering(kmeans_singletons, cluster_count, 5)
            for cluster_count in ascending_cluster_range
        ]
        kmeans_cluster_distortion = [
            compute_distortion(cluster_list, data_table)
            for cluster_list in kmeans_cluster_lists
        ]

        # Hierarchical: walk from 20 down to 6 clusters, feeding each result
        # into the next call so earlier merging work is reused.
        hierarchical_cluster_distortion = []
        previous_clusters = make_singletons(data_table)
        for num_clusters in range(20, 5, -1):
            new_clusters = closestpair.hierarchical_clustering(
                previous_clusters, num_clusters)
            hierarchical_cluster_distortion.append(
                compute_distortion(new_clusters, data_table))
            previous_clusters = new_clusters

        # Values were collected for 20..6; flip them to match the x axis.
        hierarchical_cluster_distortion.reverse()

        pyplot.plot(ascending_cluster_range,
                    hierarchical_cluster_distortion,
                    '-b',
                    label='hierarchical clustering')
        pyplot.plot(ascending_cluster_range,
                    kmeans_cluster_distortion,
                    '-r',
                    label='k-means clustering')
        pyplot.legend(loc='upper right')
        pyplot.ylabel('cluster distortion')
        pyplot.xlabel('number of clusters in graph')
        pyplot.title('distortion from hierarchical and k-means clustering: ' +
                     title + ' counties')
        pyplot.show()