Example #1
0
def question10(data, filename):
    """Plot clustering distortion against the number of output clusters.

    Distortion is evaluated for 6 through 20 output clusters, once with
    hierarchical clustering and once with k-means (5 iterations), and both
    curves are drawn on one figure that is written to disk.

    data -- data set identifier handed to load_data_table and
        load_as_list; it is also interpolated into the plot title
    filename -- destination handed to plt.savefig
    """
    cluster_counts = range(6, 21)
    table = load_data_table(data)

    hier_distortions = []

    def record_distortion(current_clusters):
        # Handed to hierarchical_clustering as its third argument; each
        # call stores the distortion of the clustering it is given.
        hier_distortions.append(distortion(current_clusters, table))

    hierarchical_clustering(
        load_as_list(data), 6, record_distortion, set(cluster_counts))
    # NOTE(review): presumably the values are recorded from most to fewest
    # clusters, hence the reversal to line them up with the ascending
    # x axis -- confirm against hierarchical_clustering.
    hier_distortions.reverse()

    # k-means works from a freshly loaded cluster list.
    kmeans_input = load_as_list(data)
    kmeans_distortions = []
    for count in cluster_counts:
        kmeans_distortions.append(
            distortion(kmeans_clustering(kmeans_input, count, 5), table))

    plt.cla()
    plt.plot(cluster_counts, hier_distortions, '-r',
             label='Hierarchical clustering distortion')
    plt.plot(cluster_counts, kmeans_distortions, '-b',
             label='k-means clustering distortion')
    plt.title('Clustering distortion (%s)' % data)
    plt.xlabel('Number of output clusters')
    plt.ylabel('Distortion')
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig(filename)
    print('Saved plot to %s' % filename)
Example #2
0
def run_example():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    data_table = load_data_table(DATA_111_URL)
    
    singleton_list = []
    for line in data_table:
        singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

       
        
    #cluster_list = sequential_clustering(singleton_list, 15)	
    #print "Displaying", len(cluster_list), "sequential clusters"

    #cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 9)
    #print "Displaying", len(cluster_list), "hierarchical clusters"

    cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 9, 5)	
    print "Displaying", len(cluster_list), "k-means clusters"

    print "Distortion", alg_project3_solution.compute_distortion(cluster_list, data_table)
    

            
    # draw the clusters using matplotlib or simplegui
    if DESKTOP:
        #alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False)
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)  #add cluster centers
    else:
        alg_clusters_simplegui.PlotClusters(data_table, cluster_list)   # use toggle in GUI to add cluster centers
def clustering_distortion(data_url, cluster_method):
    """Return the distortions for 6 through 20 output clusters.

    Input: data_url, the location of the cancer data, and cluster_method,
    which must be either des.kmeans_clustering or
    des.hierarchical_clustering.

    Output: a list of distortions ordered from 6 clusters up to 20
    clusters, or the string "Invalid cluster_method" when cluster_method
    is neither of the two supported functions.
    """
    cluster_list = des.cluster_lst(data_url)

    if cluster_method == des.kmeans_clustering:
        # k-means (5 iterations) runs once per target size, always on the
        # same starting list.
        return [
            compute_distortion(
                des.kmeans_clustering(cluster_list, size, 5), data_url)
            for size in range(6, 21)
        ]

    if cluster_method != des.hierarchical_clustering:
        return "Invalid cluster_method"

    # Hierarchical clustering is evaluated from 20 clusters down to 6.
    base_clusters = des.hierarchical_clustering(cluster_list, 20)
    descending = [compute_distortion(base_clusters, data_url)]
    for size in range(19, 5, -1):
        # Every call is handed base_clusters again, not the previous
        # call's return value.
        reduced = des.hierarchical_clustering(base_clusters, size)
        descending.append(compute_distortion(reduced, data_url))

    # Flip into ascending order of cluster count.
    descending.reverse()
    return descending
Example #4
0
def run_example():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    data_table = load_data_table(DATA_290_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))
    #cluster_list = sequential_clustering(singleton_list, 15)
    #print "Displaying", len(cluster_list), "sequential clusters"
    cluster_list = alg_project3_solution.hierarchical_clustering(
        list(singleton_list), 16)
    print "Displaying", len(cluster_list), "hierarchical clusters"
    print "Distortion", compute_distortion(cluster_list, data_table)
    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))
    cluster_list2 = alg_project3_solution.kmeans_clustering(
        list(singleton_list), 16, 5)
    print "Displaying", len(cluster_list2), "k-means clusters"
    print "Distortion", compute_distortion(cluster_list2, data_table)
    def test_four_pairs(self):
        """Both algorithms must split two separated groups the same way.

        Four coincident points at (0, 0) and four at (10, 10) are reduced
        to two clusters by hierarchical_clustering and by
        kmeans_clustering (10 iterations); the resulting groupings of
        FIPS codes have to be equal.
        """
        # (fips code, horizontal, vertical, population, risk)
        points = [
            (1, 0, 0, 13, 0.1),
            (2, 0, 0, 14, 0.2),
            (3, 0, 0, 14, 0.2),
            (4, 0, 0, 14, 0.2),
            (10, 10, 10, 14, 0.2),
            (11, 10, 10, 14, 0.2),
            (12, 10, 10, 14, 0.2),
            (13, 10, 10, 14, 0.2),
        ]

        def fresh_clusters():
            # A real list of brand-new Cluster objects (with brand-new
            # sets) on every call: an algorithm that mutates its input
            # cannot affect the other one, and the input is never a
            # one-shot iterator.
            return [Cluster(set([code]), horiz, vert, population, risk)
                    for code, horiz, vert, population, risk in points]

        h = [cluster.fips_codes()
             for cluster in hierarchical_clustering(fresh_clusters(), 2)]
        k = [cluster.fips_codes()
             for cluster in kmeans_clustering(fresh_clusters(), 2, 10)]

        # Normalise to sorted lists so that cluster order is irrelevant.
        h = sorted([list(codes) for codes in h])
        k = sorted([list(codes) for codes in k])

        self.assertEqual(h, k)
    def test_four_pairs(self):
        """Hierarchical and k-means clustering must agree on two groups.

        The input is four coincident points at (0, 0) and four at
        (10, 10). Each algorithm reduces it to two clusters (k-means with
        10 iterations) and the FIPS-code groupings are compared.
        """
        # (fips code, horizontal, vertical, population, risk)
        points = [
            (1, 0, 0, 13, 0.1),
            (2, 0, 0, 14, 0.2),
            (3, 0, 0, 14, 0.2),
            (4, 0, 0, 14, 0.2),

            (10, 10, 10, 14, 0.2),
            (11, 10, 10, 14, 0.2),
            (12, 10, 10, 14, 0.2),
            (13, 10, 10, 14, 0.2),
        ]

        def make_input():
            # Each algorithm gets its own list of new Cluster objects, so
            # in-place merging by one cannot leak into the other and the
            # input is a list rather than a single-use iterator.
            return [Cluster(set([code]), horiz, vert, population, risk)
                    for code, horiz, vert, population, risk in points]

        hierarchical_groups = sorted(
            [list(cluster.fips_codes())
             for cluster in hierarchical_clustering(make_input(), 2)])
        kmeans_groups = sorted(
            [list(cluster.fips_codes())
             for cluster in kmeans_clustering(make_input(), 2, 10)])

        self.assertEqual(hierarchical_groups, kmeans_groups)
Example #7
0
def q10_legend(DATA_URL):
    """Plot distortion versus cluster count for both clustering methods.

    The table behind DATA_URL is clustered down from 20 to 6 output
    clusters. Hierarchical clustering continues from its previous result
    on every step, while k-means (5 iterations) always starts from the
    singleton list. Both distortion curves are drawn with simpleplot.
    """
    data_table = load_data_table(DATA_URL)

    def make_singletons():
        # One one-element cluster per row of the data table.
        return [
            alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
            for row in data_table
        ]

    singleton_list = make_singletons()
    hierarchical_cluster_list = make_singletons()

    # Each curve is a list of [number of clusters, distortion] points.
    hierarchical_curve = []
    kmeans_curve = []
    for num_clusters in range(20, 5, -1):
        hierarchical_cluster_list = alg_project3_solution.hierarchical_clustering(
            hierarchical_cluster_list, num_clusters)
        hierarchical_curve.append(
            [num_clusters,
             compute_distortion(hierarchical_cluster_list, data_table)])

        kmeans_cluster_list = alg_project3_solution.kmeans_clustering(
            singleton_list, num_clusters, 5)
        kmeans_curve.append(
            [num_clusters,
             compute_distortion(kmeans_cluster_list, data_table)])

    plot_title = ("The distortion of output clusters uesd " +
                  str(len(data_table)) + "-county data set")
    simpleplot.plot_lines(
        plot_title, 800, 600, "the number of output clusters",
        "the distortion associated with each output clustering",
        [hierarchical_curve, kmeans_curve], True,
        ["hierarchical cluster", "kmeans cluster"])
Example #8
0
def q10():
	sizes = xrange(6,21)
	data_file = open('unifiedCancerData_896.csv','r')
	data = data_file.read()
	data_lines = data.split('\n')
	data_tokens = [line.split(',') for line in data_lines]
	data_table = [[tokens[0], float(tokens[1]), float(tokens[2]), int(tokens[3]), float(tokens[4])] for tokens in data_tokens]
	singleton_list = []
	singleton_list1 = []
	
	t1 = []
	t2 = []
	for item in sizes:
		singleton_list = []
		singleton_list1 = []
		for line in data_table:
			singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))
			singleton_list1.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))
		cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, item)
		t1.append(compute_distortion(cluster_list,data_table))
		cluster_list1 = alg_project3_solution.kmeans_clustering(singleton_list1, item, 5)
		t2.append(compute_distortion(cluster_list1,data_table))
	print t1
	print t2
	plt.plot(sizes,t1,'r-',label='hierarchical_clustering')
	plt.plot(sizes,t2,'b-',label='kmeans_clustering')
	plt.title('CancerData_896')
	plt.xlabel('Number of output clusters')
	plt.ylabel('Distortion associated with each output clustering')
	plt.legend(loc='upper right')
	plt.show()
def run_example():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    # data_table = load_data_table(DATA_3108_URL)

    print 'in run_example'

    k_n=[]
    h_n=[]
    for x in range(6,21):
        print '------>:',x,'<-----\n'
        # kmeans
        data_table=load_data_table(DATA_111_URL)
        singleton_list=[]
        for line in data_table:
            singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))
        cluster_list_k = alg_project3_solution.kmeans_clustering(singleton_list, x, 5)
        kmeans=reduce(lambda x,y:x+y,map(lambda x:x.cluster_error(data_table),cluster_list_k))
        k_n.append(kmeans)

        #hierarchical
        data_table=load_data_table(DATA_111_URL)
        singleton_list=[]
        for line in data_table:
            singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))
        cluster_list_h = alg_project3_solution.hierarchical_clustering(singleton_list, x)
        hierarchical=reduce(lambda x,y:x+y,map(lambda x:x.cluster_error(data_table),cluster_list_h))
        h_n.append(hierarchical)
    print 'kmean:',k_n
    print 'hierarchical:',h_n
 def test02(self):
     """kmeans_clustering yields exactly two clusters for six points."""
     # (horizontal, vertical) of each point; its position in the list is
     # used as the FIPS code, and every point has population 1, risk 0.
     coordinates = [(0, 0), (0, 3), (1, 2), (2, 2), (3, 0), (3, 3)]
     singletons = [Cluster(set([code]), horiz, vert, 1, 0)
                   for code, (horiz, vert) in enumerate(coordinates)]

     cluster_list = kmeans_clustering(singletons, 2, 3)

     self.assertEqual(len(cluster_list),2)
     
 def test01(self):
     """kmeans_clustering yields exactly two clusters for four points."""
     # (horizontal, vertical) of each point; its position in the list is
     # used as the FIPS code, and every point has population 1, risk 0.
     coordinates = [(-4.0, 0.0), (0.0, -1.0), (0.0, 1.0), (4.0, 0.0)]
     singletons = [Cluster(set([code]), horiz, vert, 1, 0)
                   for code, (horiz, vert) in enumerate(coordinates)]

     cluster_list = kmeans_clustering(singletons, 2, 3)

     self.assertEqual(len(cluster_list),2)
Example #12
0
def cluster_by_kmeans(data_table_url,num_clusters,num_iterate):
    """
    Load a data table and cluster its rows with k-means.

    data_table_url -- location handed to load_data_table
    num_clusters -- number of clusters requested from kmeans_clustering
    num_iterate -- number of k-means iterations

    Returns whatever alg_project3_solution.kmeans_clustering returns for
    the singleton clusters built from the table.
    """
    data_table = load_data_table(data_table_url)

    # One one-element cluster per row of the table.
    singleton_list = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]

    return alg_project3_solution.kmeans_clustering(
        singleton_list, num_clusters, num_iterate)
Example #13
0
def run_example():
    """
    Compute distortion curves for the locally stored 290-county table.

    Hierarchical clustering is applied repeatedly, going from 20 clusters
    down to 6 and feeding each result into the next step. k-means
    (5 iterations) is run for 6 through 20 clusters on a copy of the
    singleton clusters taken before hierarchical clustering starts.

    Returns [hierarchical_distortions, kmeans_distortions], both ordered
    from 6 clusters up to 20 clusters.
    """
    data_table = load_data_table_local(DATA_290)

    singleton_list = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]
    # Copies are made before hierarchical clustering gets hold of the
    # originals.
    singleton_list_copy = [singleton.copy() for singleton in singleton_list]

    # Step down from 20 to 6 clusters; the first step starts from the
    # singletons, every later step from the previous result.
    hierarchical_distortions = []
    current_clusters = singleton_list
    for num_clusters in range(20, 5, -1):
        current_clusters = alg_project3_solution.hierarchical_clustering(
            current_clusters, num_clusters)
        hierarchical_distortions.append(
            compute_distortion(current_clusters, data_table))
    # Collected in descending order; flip to ascending cluster count.
    hierarchical_distortions.reverse()

    kmeans_distortions = [
        compute_distortion(
            alg_project3_solution.kmeans_clustering(
                singleton_list_copy, num_clusters, 5),
            data_table)
        for num_clusters in range(6, 21)
    ]

    return [hierarchical_distortions, kmeans_distortions]
def question_seven():
    """Return labelled distortions of 9-cluster hierarchical and k-means runs.

    The result is the 4-tuple
    ("hierarchical distortion =", value, "kmeans distortion =", value).
    """
    # change url depending on desired data table
    source_url = des.DATA_111_URL
    clusters = des.cluster_lst(source_url)

    # k-means (5 iterations) runs before hierarchical clustering on the
    # same starting list; the call order is kept deliberately.
    kmeans_result = des.kmeans_clustering(clusters, 9, 5)
    hierarchical_result = des.hierarchical_clustering(clusters, 9)

    kmeans_dist = compute_distortion(kmeans_result, source_url)
    hierarchical_dist = compute_distortion(hierarchical_result, source_url)

    report = ("hierarchical distortion =", hierarchical_dist,
              "kmeans distortion =", kmeans_dist)
    return report
def run_example():
    """
    Load a data table, compute a list of clusters and
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    #data_table = load_data_table(DATA_3108_URL)
    #data_table = load_data_table(DATA_111_URL)
    #data_table = load_data_table(DATA_290_URL)
    data_table = load_data_table(DATA_896_URL)


    singleton_list = []
    for line in data_table:
        singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

    hier = []
    k_means = []
    for num in range(6, 21):
        singleton_list_copy = [item.copy() for item in singleton_list]
        #cluster_list = sequential_clustering(singleton_list, 15)
        #print "Displaying", len(cluster_list), "sequential clusters"

        cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list_copy, num)
        hier.append(compute_distortion(cluster_list, data_table))
        #print "Displaying", len(cluster_list), "hierarchical clusters"

        cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, num, 5)
        k_means.append(compute_distortion(cluster_list, data_table))
    print hier
    print k_means
    plt.plot(range(6, 21), hier, label="hierarchical_clustering")
    plt.plot(range(6, 21), k_means, label="kmeans_clustering")
    plt.xlabel("Number of outcome clusters")
    plt.ylabel("Distrotion")
    plt.title("Distrotion with 896 counties")
    plt.legend()
    plt.show()
    #print "Displaying", len(cluster_list), "hierarchical clusters"

    #cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 9, 5)
    #print "k means", compute_distortion(cluster_list, data_table)
    #print "Displaying", len(cluster_list), "k-means clusters"


    #cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 9, 5)
    #print "Displaying", len(cluster_list), "k-means clusters"


    # draw the clusters using matplotlib or simplegui
    """
Example #16
0
def compute_kmeans_distortions(cluster_list):
    """ list -> list
    Runs k-means (5 iterations) on cluster_list once for every target
    size from 6 through 20 and returns the distortion of each result, in
    that order.

    NOTE(review): data_table is not a parameter; it is looked up in an
    enclosing scope -- confirm it is defined wherever this is called.
    """
    return [
        sol.compute_distortion(
            sol.kmeans_clustering(cluster_list, target_size, 5), data_table)
        for target_size in range(6, 21)
    ]
def run_example():
    """
    Plot hierarchical and k-means distortion for the 896-county data set.

    k-means (5 iterations) is evaluated for 6 through 20 clusters first;
    hierarchical clustering is then evaluated from 20 clusters down to 6.
    Both curves, as lists of [number of clusters, distortion] points in
    ascending order, are drawn with simpleplot.
    """
    data_table = load_data_table(DATA_896_URL)

    singleton_list = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]

    def distortion_of(clusters):
        # "0.0 +" forces the plotted value to be a float.
        return 0.0 + alg_project3_solution.compute_distortion(
            clusters, data_table)

    kmeans = [
        [clusters_number,
         distortion_of(alg_project3_solution.kmeans_clustering(
             singleton_list, clusters_number, 5))]
        for clusters_number in xrange(6, 21)
    ]

    # Hierarchical clustering is given singleton_list on every step,
    # working from the largest cluster count to the smallest.
    descending = []
    for clusters_number in xrange(20, 5, -1):
        cluster_list = alg_project3_solution.hierarchical_clustering(
            singleton_list, clusters_number)
        descending.append([clusters_number, distortion_of(cluster_list)])
    hierarchical = descending[::-1]

    simpleplot.plot_lines(
        "Distortion of the clusterings produced by hierarchical and k-means metods on 896 county data set",
        800, 600, "Number of clusters n [6 .. 20]", "Distortion",
        [hierarchical, kmeans], False,
        ["Hierarchical clustering", "k-means clustering with 5 iterations"])
Example #18
0
def q7():
	data_file = open('unifiedCancerData_111.csv','r')
	data = data_file.read()
	data_lines = data.split('\n')
	data_tokens = [line.split(',') for line in data_lines]
	data_table = [[tokens[0], float(tokens[1]), float(tokens[2]), int(tokens[3]), float(tokens[4])] for tokens in data_tokens]
    
	singleton_list = []
	for line in data_table:
		singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

	#cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 9)
	cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 9, 5)
	print compute_distortion(cluster_list,data_table)
Example #19
0
def clustering():
    title_list = ['111 counties', '290 counties', '896 counties']
    url_list = [DATA_111_URL, DATA_290_URL, DATA_896_URL]
    distortion_hierarchical = [[], [], []]
    distortion_kmeans = [[], [], []]
    num_clusters_list = range(20, 5, -1)

    for idx in range(len(url_list)):
        data_table = load_data_table(url_list[idx])
        cluster_list = []
        for line in data_table:
            cluster_list.append(
                alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                    line[4]))
        cluster_list_copy = [cluster.copy() for cluster in cluster_list]
        for num_cluster in num_clusters_list:
            cluster_list = student.hierarchical_clustering(
                cluster_list, num_cluster)
            distortion = compute_distortion(cluster_list, data_table)
            distortion_hierarchical[idx].append(distortion)
            print "Displaying", len(
                cluster_list), "hierarchical clusters, distortion:", distortion

        for num_cluster in num_clusters_list:
            cluster_list = student.kmeans_clustering(cluster_list_copy,
                                                     num_cluster, 5)
            distortion = compute_distortion(cluster_list, data_table)
            distortion_kmeans[idx].append(distortion)
            print "Displaying", len(
                cluster_list), "k-means clusters, distortion:", distortion

        plot_num = 131 + idx
        plt.subplot(plot_num)
        plt.plot(num_clusters_list,
                 distortion_hierarchical[idx],
                 "o-",
                 label="hierarchical")
        plt.plot(num_clusters_list,
                 distortion_kmeans[idx],
                 "x-",
                 label="kmeans")
        plt.legend()
        plt.ylabel('Distortion')
        plt.xlabel('Number of clusters')
        plt.grid(True)
        plt.title(title_list[idx])
    plt.show()
def compute_distortion(cluster_list, data_table, out_size):
    """Return (hierarchical distortion, k-means distortion) for out_size clusters.

    k-means (5 iterations) runs on a shallow copy of cluster_list and
    hierarchical clustering on cluster_list itself. Each distortion is
    the sum of cluster_error(data_table) over the resulting clusters.
    """
    # k-means is run first, on its own list object; per the original
    # note, hierarchical_clustering mutates the list it is given.
    kmeans_input = cluster_list[:]
    clusters_k = cluster_algs.kmeans_clustering(kmeans_input, out_size, 5)
    clusters_h = cluster_algs.hierarchical_clustering(cluster_list, out_size)

    distortion_h = sum(
        cluster.cluster_error(data_table) for cluster in clusters_h)
    distortion_k = sum(
        cluster.cluster_error(data_table) for cluster in clusters_k)

    return (distortion_h, distortion_k)
def run_example_two():
    """
    Plot distortion per cluster count for the 111-county data set.

    For every cluster count from 6 through 20, hierarchical clustering
    and k-means (5 iterations) are each run on newly built singleton
    clusters. The distortions are collected in two dictionaries keyed by
    cluster count and handed to plot_graphs.
    """
    data_table = load_data_table(DATA_111_URL)

    min_num_of_clusters = 6
    max_num_of_clusters = 20

    def build_singletons():
        # New one-element clusters for every single clustering run.
        return [
            alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
            for row in data_table
        ]

    hierarchical_points = {}
    kmeans_points = {}

    for num_of_clusters in range(min_num_of_clusters, max_num_of_clusters + 1):
        hierarchical_clusters = alg_project3_solution.hierarchical_clustering(
            build_singletons(), num_of_clusters)
        hierarchical_points[num_of_clusters] = compute_distortion(
            hierarchical_clusters, data_table)

        kmeans_clusters = alg_project3_solution.kmeans_clustering(
            build_singletons(), num_of_clusters, 5)
        kmeans_points[num_of_clusters] = compute_distortion(
            kmeans_clusters, data_table)

    plot_graphs(hierarchical_points, kmeans_points)
Example #22
0
def quality_k(data_table, interval):
    """
    Input:
        data_table: loaded data table
        interval: iterable with the numbers of output clusters to try
    Return:
        List with one distortion per entry of interval, each being the
        summed cluster_error of a k-means clustering (5 iterations)
    """
    singleton_list = []
    for row in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4]))

    def distortion_for(num_clstrs):
        # Every run starts from the same singleton list.
        clusters = alg_project3_solution.kmeans_clustering(
            singleton_list, num_clstrs, 5)
        return sum([clstr.cluster_error(data_table) for clstr in clusters])

    return [distortion_for(num_clstrs) for num_clstrs in interval]
Example #23
0
def run_kmeans():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    data_table = load_data_table(DATA_3108_URL)
    
    singleton_list = []
    for line in data_table:
        singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))
        
    cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 15, 5)	
    print "Displaying", len(cluster_list), "k-means clusters"
            
    # draw the clusters using matplotlib or simplegui
    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)
def test_kmeans():
    """
    Test for k-means clustering
    kmeans_clustering should not mutate cluster_list, but make a new copy of each test anyways
    """

    # load small data table
    print
    print "Testing kmeans_clustering on 24 county set"
    data_24_table = load_data_table(DATA_24_URL)

    kmeansdata_24 = [[15, 1, set([('34017', '36061'), ('06037',), ('06059',), ('36047',), ('36081',), ('06071', '08031'), ('36059',), ('36005',), ('55079',), ('34013', '34039'), ('06075',), ('01073',), ('06029',), ('41051', '41067'), ('11001', '24510', '51013', '51760', '51840', '54009')])],
                     [15, 3, set([('34017', '36061'), ('06037', '06059'), ('06071',), ('36047',), ('36081',), ('08031',), ('36059',), ('36005',), ('55079',), ('34013', '34039'), ('06075',), ('01073',), ('06029',), ('41051', '41067'), ('11001', '24510', '51013', '51760', '51840', '54009')])],
                     [15, 5, set([('34017', '36061'), ('06037', '06059'), ('06071',), ('36047',), ('36081',), ('08031',), ('36059',), ('36005',), ('55079',), ('34013', '34039'), ('06075',), ('01073',), ('06029',), ('41051', '41067'), ('11001', '24510', '51013', '51760', '51840', '54009')])],
                     [10, 1, set([('34017', '36061'), ('06029', '06037', '06075'), ('11001', '24510', '34013', '34039', '51013', '51760', '51840', '54009'), ('06059',), ('36047',), ('36081',), ('06071', '08031', '41051', '41067'), ('36059',), ('36005',), ('01073', '55079')])],
                     [10, 3, set([('34013', '34017', '36061'), ('06029', '06037', '06075'), ('08031', '41051', '41067'), ('06059', '06071'), ('34039', '36047'), ('36081',), ('36059',), ('36005',), ('01073', '55079'), ('11001', '24510', '51013', '51760', '51840', '54009')])],
                     [10, 5, set([('34013', '34017', '36061'), ('06029', '06037', '06075'), ('08031', '41051', '41067'), ('06059', '06071'), ('34039', '36047'), ('36081',), ('36059',), ('36005',), ('01073', '55079'), ('11001', '24510', '51013', '51760', '51840', '54009')])],
                     [5, 1, set([('06029', '06037', '06075'), ('01073', '11001', '24510', '34013', '34017', '34039', '36047', '51013', '51760', '51840', '54009', '55079'), ('06059',), ('36005', '36059', '36061', '36081'), ('06071', '08031', '41051', '41067')])],
                     [5, 3, set([('06029', '06037', '06075'), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013'), ('08031', '41051', '41067'), ('06059', '06071'), ('01073', '51760', '51840', '54009', '55079')])],
                     [5, 5, set([('06029', '06037', '06075'), ('08031', '41051', '41067'), ('06059', '06071'), ('01073', '55079'), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])]]

    suite = poc_simpletest.TestSuite()

    for num_clusters, num_iterations, expected_county_tuple in kmeansdata_24:

        # build initial list of clusters for each test since mutation is allowed
        cluster_list = []
        for idx in range(len(data_24_table)):
            line = data_24_table[idx]
            cluster_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

        # compute student answer
        student_clustering = student.kmeans_clustering(cluster_list, num_clusters, num_iterations)
        student_county_tuple = set_of_county_tuples(student_clustering)

        # Prepare test
        error_message = "Testing kmeans_custering on 24 county table, num_clusters = " + str(num_clusters)
        error_message += " num_iterations = " + str(num_iterations)
        error_message += "\nStudent county tuples: " + str(student_county_tuple)
        error_message += "\nExpected county tuples: " + str(expected_county_tuple)
        suite.run_test(student_county_tuple == expected_county_tuple, True, error_message)

    suite.report_results()
def run_example():
    """
    Draw the distortion curves of both clustering methods for the
    896-county data set.

    k-means (5 iterations) is evaluated first, for 6 through 20 clusters.
    Hierarchical clustering follows, evaluated from 20 clusters down to 6
    and then put into ascending order. simpleplot draws both curves as
    lists of [number of clusters, distortion] points.
    """
    data_table = load_data_table(DATA_896_URL)

    singleton_list = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]

    kmeans = []
    for clusters_number in xrange(6, 21):
        kmeans_clusters = alg_project3_solution.kmeans_clustering(
            singleton_list, clusters_number, 5)
        kmeans_distortion = alg_project3_solution.compute_distortion(
            kmeans_clusters, data_table)
        # "0.0 +" forces the plotted value to be a float.
        kmeans.append([clusters_number, 0.0 + kmeans_distortion])

    hierarchical = []
    for clusters_number in xrange(20, 5, -1):
        # singleton_list is handed in again on every step.
        hierarchical_clusters = alg_project3_solution.hierarchical_clustering(
            singleton_list, clusters_number)
        hierarchical_distortion = alg_project3_solution.compute_distortion(
            hierarchical_clusters, data_table)
        hierarchical.append([clusters_number, 0.0 + hierarchical_distortion])
    # Collected from 20 down to 6; the plot wants ascending order.
    hierarchical.reverse()

    plot_title = "Distortion of the clusterings produced by hierarchical and k-means metods on 896 county data set"
    series_labels = ["Hierarchical clustering", "k-means clustering with 5 iterations"]
    simpleplot.plot_lines(plot_title,
                          800, 600, "Number of clusters n [6 .. 20]", "Distortion",
                          [hierarchical, kmeans], False,
                          series_labels)
def run_distortion():
    """
    Print the distortion of a k-means clustering of the data table
    loaded from DATA_111_URL.

    kmeans_clustering is called with the arguments 9 and 5; the result
    is scored with compute_distortion against the same data table.
    """
    data_table = load_data_table(DATA_111_URL)

    # one single-row cluster per line of the data table
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                            line[4])
        for line in data_table
    ]

    #cluster_list = sequential_clustering(singleton_list, 15)
    #print("Displaying", len(cluster_list), "sequential clusters")

    #cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 9)
    #print( 'Hierarchical Distortion = ', compute_distortion(cluster_list, data_table) )

    cluster_list = alg_project3_solution.kmeans_clustering(
        singleton_list, 9, 5)
    print('KMeans Distortion = ', compute_distortion(cluster_list, data_table))
Example #27
0
def plot_distortions():
    """
    Plot distortion against the number of output clusters (6 to 20) for
    hierarchical clustering and for k-means clustering on the 896 county
    data set, then show the figure.
    """
    DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    # only the 896 county file is used by this plot
    DATA_896_URL = DIRECTORY + "data_clustering/unifiedCancerData_896.csv"

    #cluster_list = sequential_clustering(singleton_list, 15)
    #print "Displaying", len(cluster_list), "sequential clusters"

    data_table = load_data_table(DATA_896_URL)
    singleton_list = [
        Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]

    # x axis shared by both curves
    cluster_counts = range(6, 21)

    def copied_singletons():
        # every clustering run is handed its own copies of the singletons
        return [clu.copy() for clu in singleton_list]

    errors_h = []
    for num_clusters in cluster_counts:
        cluster_list = hierarchical_clustering(copied_singletons(),
                                               num_clusters)
        errors_h.append(compute_distortion(cluster_list, data_table))

    errors_k = []
    for num_clusters in cluster_counts:
        cluster_list = kmeans_clustering(copied_singletons(), num_clusters, 5)
        errors_k.append(compute_distortion(cluster_list, data_table))

    xlabel("number of output clusters")
    ylabel("distortion")
    #xscale('log')
    #yscale('log')
    plot(cluster_counts, errors_h, '-b', label="hierarchical")
    plot(cluster_counts, errors_k, '-r', label="kmeans")

    legend(loc="upper left")
    title("896 county data sets")
    show()
Example #28
0
def plot_distortions():
    """
    Compare hierarchical and k-means clustering on the 896 county data
    set: compute the distortion of each method for 6 to 20 output
    clusters, plot both curves and show the figure.
    """
    DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    # the other data set URLs were never read here, so only this one is kept
    DATA_896_URL = DIRECTORY + "data_clustering/unifiedCancerData_896.csv"

    #cluster_list = sequential_clustering(singleton_list, 15)
    #print "Displaying", len(cluster_list), "sequential clusters"

    data_table = load_data_table(DATA_896_URL)
    singleton_list = [Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
                      for line in data_table]

    # number of output clusters for each point on the x axis
    sizes = range(6, 21)

    # each run receives copies of the singleton clusters
    errors_h = [
        compute_distortion(
            hierarchical_clustering([clu.copy() for clu in singleton_list], size),
            data_table)
        for size in sizes
    ]

    errors_k = [
        compute_distortion(
            kmeans_clustering([clu.copy() for clu in singleton_list], size, 5),
            data_table)
        for size in sizes
    ]

    xlabel("number of output clusters")
    ylabel("distortion")
    #xscale('log')
    #yscale('log')
    plot(sizes, errors_h, '-b', label="hierarchical")
    plot(sizes, errors_k, '-r', label="kmeans")

    legend(loc="upper left")
    title("896 county data sets")
    show()
Example #29
0
def run_distortion_graph():
    """
    Plot the distortion (divided by 10^11) of hierarchical and k-means
    clusterings of the DATA_896_URL data table for 6 to 20 clusters.
    """
    data_table = load_data_table(DATA_896_URL)
    size_clusters = range(6, 21)
    scale = 100000000000.0

    def new_singletons():
        # a fresh list of one-row clusters for each clustering call
        return [
            alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
            for row in data_table
        ]

    hierarchical_distortion = []
    kmeans_distortion = []
    for size in size_clusters:
        by_hierarchical = alg_project3_solution.hierarchical_clustering(
            new_singletons(), size)
        by_kmeans = alg_project3_solution.kmeans_clustering(
            new_singletons(), size, 5)

        hierarchical_distortion.append(
            compute_distortion(by_hierarchical, data_table) / scale)
        kmeans_distortion.append(
            compute_distortion(by_kmeans, data_table) / scale)

    plt.figure()
    plt.plot(size_clusters, hierarchical_distortion, '-b',
             label='Hierarchical_distortion')
    plt.plot(size_clusters, kmeans_distortion, '-g', label='Kmeans_distortion')
    plt.legend(loc='upper right')
    plt.title(
        'Distortion for hierarchical and k-means clustering for 896 data')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion(x 10^11)')
    plt.show()
def question10(data, filename):
    """
    Plot hierarchical and k-means clustering distortion for 6 to 20
    output clusters and save the figure.

    data -- data set location, passed to load_data_table and
            Cluster.load_as_list and shown in the plot title
    filename -- path the figure is saved to
    """
    table = load_data_table(data)
    xs = range(6, 21)
    ys_hier = []

    def dist(clusters):
        # callback handed to hierarchical_clustering; records one distortion
        ys_hier.append(distortion(clusters, table))

    hierarchical_clustering(Cluster.load_as_list(data), 6, dist, set(xs))
    # presumably the callback fires from the largest size down to 6, so the
    # values are reversed to line up with xs -- TODO confirm
    ys_hier.reverse()

    # NOTE(review): k-means gets a freshly loaded list so it cannot see
    # clusters that hierarchical_clustering may have merged in place
    clusters = Cluster.load_as_list(data)
    ys_kmeans = [distortion(kmeans_clustering(clusters, x, 5), table) for x in xs]

    plt.cla()
    plt.plot(xs, ys_hier, '-r', label='Hierarchical clustering distortion')
    plt.plot(xs, ys_kmeans, '-b', label='K-means clustering distortion')
    plt.title('Clustering distortion (%s)' % data)
    plt.xlabel('Number of output clusters')
    plt.ylabel('Distortion')
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig(filename)
    print('Saved plot to %s' % filename)
def run_example():
    """
    Load a data table, compute a list of clusters and
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    data_table = load_data_table(DATA_3108_URL)
    #data_table = load_data_table(DATA_111_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))

    #cluster_list = sequential_clustering(singleton_list, 15)
    #print "Displaying", len(cluster_list), "sequential clusters"

    start_time = time()
    cluster_list = alg_project3_solution.hierarchical_clustering(
        singleton_list, 15)
    print "Displaying", len(cluster_list), "hierarchical clusters"
    end_time = time()

    hierarchical_clustering_time = end_time - start_time

    start_time = time()
    cluster_list = alg_project3_solution.kmeans_clustering(
        singleton_list, 15, 5)
    print "Displaying", len(cluster_list), "k-means clusters"
    end_time = time()

    kmeans_clustering_time = end_time - start_time

    print hierarchical_clustering_time, kmeans_clustering_time
    """
Example #32
0
def question10_plot(date_url):
    """
    Plot k-means and hierarchical clustering distortion for 6 to 20
    output clusters of the data table loaded from date_url.

    date_url -- location handed to load_data_table; the first run of
                digits in it is used as the plot title, and the whole
                string is used when it contains no digits
    """
    data_table = load_data_table(date_url)
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                            line[4])
        for line in data_table
    ]
    xvals = list(range(6, 21))

    # computed from 20 down to 6, then reversed to line up with xvals
    kc_cd_yvals = []
    for cluster_num_k in range(20, 5, -1):
        cluster_list = alg_project3_solution.kmeans_clustering(
            singleton_list, cluster_num_k, 5)
        kc_cd_yvals.append(compute_distortion(cluster_list, data_table))
    kc_cd_yvals.reverse()

    # each hierarchical result is fed into the next, smaller, clustering
    hc_cd_yvals = []
    cluster_list = list(singleton_list)
    for cluster_num_k in range(20, 5, -1):
        cluster_list = alg_project3_solution.hierarchical_clustering(
            cluster_list, cluster_num_k)
        hc_cd_yvals.append(compute_distortion(cluster_list, data_table))
    hc_cd_yvals.reverse()

    plt.plot(xvals, kc_cd_yvals, '-b', label='kmeans_clustering_distortion')
    plt.plot(xvals,
             hc_cd_yvals,
             '-r',
             label='hierarchical_clustering_distortion')
    plt.legend(loc='upper right')
    plt.xlabel('cluster num')
    plt.ylabel('distortion')
    # re.search returns None when nothing matches, so guard before .group()
    size_match = re.search(r'[0-9]+', date_url)
    title_str = 'DATA: ' + (size_match.group(0) if size_match else date_url)
    plt.title(title_str)
    plt.grid(True)
    plt.show()
Example #33
0
def run_example():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    #data_table = load_data_table(DATA_3108_URL)
    data_table = load_data_table(DATA_111_URL)
    #data_table = load_data_table(DATA_290_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))

    #cluster_list = sequential_clustering(singleton_list, 15)
    #print "Displaying", len(cluster_list), "sequential clusters"

    # cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 9)
    # print "Displaying", len(cluster_list), "hierarchical clusters"
    # print "distortion:", compute_distortion(cluster_list, data_table)
    # start = time.clock()
    # cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 15)
    # elapsed = (time.clock() - start)
    # print "elapsed:",elapsed
    # print "Displaying", len(cluster_list), "hierarchical clusters"
    # cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 16)
    # print "Displaying", len(cluster_list), "hierarchical clusters"
    # print "distortion:", compute_distortion(cluster_list, data_table)

    cluster_list = alg_project3_solution.kmeans_clustering(
        singleton_list, 9, 5)
    print "Displaying", len(cluster_list), "k-means clusters"
    print "distortion:", compute_distortion(cluster_list, data_table)
def compute_and_plot_distortions():
    """
    Compute the distortion of the list of clusters produced by hierarchical clustering and k-means clustering (using 5 iterations)
    on the 111, 290, and 896 county data sets, respectively, where the number of output clusters ranges from 6 to 20 (inclusive).
    Important note:To compute the distortion for all 15 output clusterings produced by hierarchical_clustering, you should remember
    that you can use the hierarchical cluster of size 20 to compute the hierarchical clustering of size 19 and so on. Otherwise,
     you will introduce an unnecessary factor of 15 into the computation of the 15 hierarchical clusterings.
    """

    #choose data set:
    #data_table = viz.load_data_table(viz.DATA_111_URL)
    #data_table = viz.load_data_table(viz.DATA_290_URL)
    data_table = viz.load_data_table(viz.DATA_896_URL)

    num_output_clusters = []
    kmeans_distortion = []
    hierarchical_distortion = []

    print "\nComputing kmeans distortions"
    for indx in range(6, 21):
        ##Dette loop kunne optimeres, saa beregningerne genbruges, men det er ikke noedvendigt, da k_means er saa hurtig
        num_output_clusters.append(indx)

        singleton_list = []
        for line in data_table:
            singleton_list.append(
                c.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

        kmeans_cluster_list = p.kmeans_clustering(singleton_list, indx, 5)
        distortion = compute_distortion(kmeans_cluster_list, data_table)
        kmeans_distortion.append(distortion)
        print indx, distortion

    print "Computed kmeans distortions"
    print ""
    print "Computing hierarchical distortions"

    for line in data_table:
        singleton_list.append(
            c.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

    hierarchical_cluster_list = singleton_list

    for indx in range(20, 5, -1):
        hierarchical_cluster_list = p.hierarchical_clustering(
            hierarchical_cluster_list, indx)
        distortion = compute_distortion(hierarchical_cluster_list, data_table)
        hierarchical_distortion.append(distortion)
        print indx, distortion

    hierarchical_distortion.reverse()

    print "Computed hierarchical distortions\n"
    print "Plotting data"

    plt.plot(num_output_clusters,
             kmeans_distortion,
             label="K-means clustering")
    plt.plot(num_output_clusters,
             hierarchical_distortion,
             label="Hierarchical clustering")

    plt.xlabel("Number of output clusters")
    plt.ylabel('Distortion')

    #tegner
    plt.legend()

    plt.title(
        "Comparison of distortion of two clustering methods \n Dataset: 896 counties"
    )

    #goer det hele synligt
    plt.show()
def run_suite():
    """
    Testing code for the functions written for Word Wrangler
    """
    
    # create a TestSuite (and an object)
    suite = poc_simpletest.TestSuite()

    # create a set of 3 clusters
    cluster1 = CC.Cluster([1, 1], 0, 0, 100, 0.00001)
    cluster2 = CC.Cluster([2, 2, 2], 3, 4, 200, 0.00002)
    cluster3 = CC.Cluster([3, 3, 3, 3], 6, 8, 300, 0.00003)
    list_of_clusters = [cluster1, cluster2, cluster3]
        
    # testing the slow_closest_pair function with the 3 cluster list
    suite.run_test(student.slow_closest_pair(list_of_clusters), (5., 0, 1),
                   "Test #1: testing slow_closest_pair on 3 clusters")
    # testing the fast_closest_pair function with the 3 cluster list    
    suite.run_test(student.fast_closest_pair(list_of_clusters), (5., 0, 1),
                   "Test #2: testing fast_closest_pair on 3 clusters")

    # add a fourth cluster to the list
    cluster4 = CC.Cluster([4, 4, 4, 4, 4], 12, 16, 400, 0.00004)
    list_of_clusters.append(cluster4)

    # testing the slow_closest_pair function with the 4 cluster list
    suite.run_test(student.slow_closest_pair(list_of_clusters), (5., 0, 1),
                   "Test #3: testing slow_closest_pair on 4 clusters")
    # testing the fast_closest_pair function with the 4 cluster list    
    suite.run_test(student.fast_closest_pair(list_of_clusters), (5., 0, 1),
                   "Test #4: testing fast_closest_pair on 4 clusters")

    # create a set of 4 clusters
    cluster1 = CC.Cluster(set([]), 0, 0, 1, 0)
    cluster2 = CC.Cluster(set([]), 1, 0, 1, 0)
    cluster3 = CC.Cluster(set([]), 2, 0, 1, 0)
    cluster4 = CC.Cluster(set([]), 3, 0, 1, 0)
    list_of_clusters = [cluster1, cluster2, cluster3, cluster4]
        
    # testing closest_pair_strip on 4 clusters
    suite.run_test(student.closest_pair_strip(list_of_clusters, 1.5, 1.0), (1.0, 1, 2),
                   "Test #5: testing closest_pair_strip on 4 clusters")

    # create a set of 4 clusters
    cluster1 = CC.Cluster(set([]), 1.0, 0.0, 1, 0)
    cluster2 = CC.Cluster(set([]), 4.0, 0.0, 1, 0)
    cluster3 = CC.Cluster(set([]), 5.0, 0.0, 1, 0)
    cluster4 = CC.Cluster(set([]), 7.0, 0.0, 1, 0)
    list_of_clusters = [cluster1, cluster2, cluster3, cluster4]
        
    # testing fast_closest_pair on 4 clusters
    suite.run_test(student.fast_closest_pair(list_of_clusters), (1.0, 1, 2),
                   "Test #6: testing closest_pair_strip on 4 clusters")

    # create a set of 4 clusters
    cluster1 = CC.Cluster(set([]), -4.0, 0.0, 1, 0)
    cluster2 = CC.Cluster(set([]), 0.0, -1.0, 1, 0)
    cluster3 = CC.Cluster(set([]), 0.0, 1.0, 1, 0)
    cluster4 = CC.Cluster(set([]), 4.0, 0.0, 1, 0)
    list_of_clusters = [cluster1, cluster2, cluster3, cluster4]
        
    # testing closest_pair_strip on 4 clusters
    suite.run_test(student.closest_pair_strip(list_of_clusters, 0.0, 4.1231059999999999), (2.0, 1, 2),
                   "Test #7: testing closest_pair_strip on 4 clusters")

    # create a set of 4 clusters
    cluster1 = CC.Cluster(set([]), -4.0, 0.0, 1, 0)
    cluster2 = CC.Cluster(set([]), 0.0, -1.0, 1, 0)
    cluster3 = CC.Cluster(set([]), 0.0, 1.0, 1, 0)
    cluster4 = CC.Cluster(set([]), 4.0, 0.0, 1, 0)
    list_of_clusters = [cluster1, cluster2, cluster3, cluster4]
        
    # testing fast_closest_pair on 4 clusters
    suite.run_test(student.fast_closest_pair(list_of_clusters), (2.0, 1, 2),
                   "Test #8: testing fast_closest_pair on 4 clusters")

    # create a sorted list_of_clusters from a small dataset containing 8 clusters
    fhandle = open("unifiedCancerData_8.txt")
    list_of_clusters = []
    for line in fhandle:
        tokens = line.split(',')
        cluster = CC.Cluster(set([tokens[0]]), float(tokens[1]), float(tokens[2]),
                             int(tokens[3]), float(tokens[4]))
        list_of_clusters.append(cluster)
    list_of_clusters.sort(key = lambda cluster: cluster.horiz_center())
    print "The following list_of_clusters was loaded:"
    for index in range(len(list_of_clusters)):
        print index, list_of_clusters[index]
    print

    # testing the slow_closest_pair function with 8 cluster list
    suite.run_test(student.slow_closest_pair(list_of_clusters), (2.4479655653349655, 5, 7),
                   "Test #9: testing slow_closest_pair on 8 clusters")
    # testing the fast_closest_pair function with 8 cluster list    
    suite.run_test(student.fast_closest_pair(list_of_clusters), (2.4479655653349655, 5, 7),
                   "Test #10: testing fast_closest_pair on 8 clusters")
    # testing the hierarchical_clustering function with 8 clusters
    clustering_result = student.hierarchical_clustering(list_of_clusters, 5)
    for index in range(len(clustering_result)):
        print clustering_result[index]
    print
    # testing the kmeans_clustering function with 8 clusters
    clustering_result = student.kmeans_clustering(list_of_clusters, 5, 3)
    for index in range(len(clustering_result)):
        print clustering_result[index]
    print

    # create a sorted list_of_clusters from a small dataset containing 17 clusters
    fhandle = open("unifiedCancerData_17.txt")
    list_of_clusters = []
    for line in fhandle:
        tokens = line.split(',')
        cluster = CC.Cluster(set([tokens[0]]), float(tokens[1]), float(tokens[2]),
                             int(tokens[3]), float(tokens[4]))
        list_of_clusters.append(cluster)
    list_of_clusters.sort(key = lambda cluster: cluster.horiz_center())
 
    # testing the slow_closest_pair function with 17 cluster list
    suite.run_test(student.slow_closest_pair(list_of_clusters), (1.9439662413427632, 9, 10),
                   "Test #11: testing slow_closest_pair on 17 clusters")
    # testing the fast_closest_pair function with 17 cluster list    
    suite.run_test(student.fast_closest_pair(list_of_clusters), (1.9439662413427632, 9, 10),
                   "Test #12: testing fast_closest_pair on 17 clusters")

    # create a sorted list_of_clusters from a small dataset containing 24 clusters
    fhandle = open("unifiedCancerData_24.txt")
    list_of_clusters = []
    for line in fhandle:
        tokens = line.split(',')
        cluster = CC.Cluster(set([tokens[0]]), float(tokens[1]), float(tokens[2]),
                             int(tokens[3]), float(tokens[4]))
        list_of_clusters.append(cluster)
    list_of_clusters.sort(key = lambda cluster: cluster.horiz_center())
    print "The following list_of_clusters was loaded:"
    for index in range(len(list_of_clusters)):
        print index, list_of_clusters[index]
    print

    # testing the kmeans_clustering function with 24 clusters
    clustering_result = student.kmeans_clustering(list_of_clusters, 10, 1)
    print "This output was created by kmeans_slustering:"
    for index in range(len(clustering_result)):
        print index, clustering_result[index]
    print

    # create a sorted list_of_clusters from a small dataset containing 39 clusters
    fhandle = open("unifiedCancerData_39.txt")
    list_of_clusters = []
    for line in fhandle:
        tokens = line.split(',')
        cluster = CC.Cluster(set([tokens[0]]), float(tokens[1]), float(tokens[2]),
                             int(tokens[3]), float(tokens[4]))
        list_of_clusters.append(cluster)
    list_of_clusters.sort(key = lambda cluster: cluster.horiz_center())
 
    # testing the slow_closest_pair function with 39 cluster list
    suite.run_test(student.slow_closest_pair(list_of_clusters), (1.6612217536988727, 22, 24),
                   "Test #13: testing slow_closest_pair on 39 clusters")
    # testing the fast_closest_pair function with 39 cluster list    
    suite.run_test(student.fast_closest_pair(list_of_clusters), (1.6612217536988727, 22, 24),
                   "Test #14: testing fast_closest_pair on 39 clusters")

    # create a sorted list_of_clusters from a small dataset containing 111 clusters
    fhandle = open("unifiedCancerData_111.csv")
    list_of_clusters = []
    for line in fhandle:
        tokens = line.split(',')
        cluster = CC.Cluster(set([tokens[0]]), float(tokens[1]), float(tokens[2]),
                             int(tokens[3]), float(tokens[4]))
        list_of_clusters.append(cluster)
    list_of_clusters.sort(key = lambda cluster: cluster.horiz_center())
    print "The following list_of_clusters was loaded:"
    for index in range(len(list_of_clusters)):
        print index, list_of_clusters[index]
    print
 
    # testing the slow_closest_pair function with 111 cluster list
    suite.run_test(student.slow_closest_pair(list_of_clusters), (1.266216002018164, 79, 81),
                   "Test #15: testing slow_closest_pair on 111 clusters")
    # testing the fast_closest_pair function with 111 cluster list    
    suite.run_test(student.fast_closest_pair(list_of_clusters), (1.266216002018164, 79, 81),
                   "Test #16: testing fast_closest_pair on 111 clusters")
    # testing the hierarchical_clustering function with 111 clusters
    clustering_result = student.hierarchical_clustering(list_of_clusters, 5)
    for index in range(len(clustering_result)):
        print clustering_result[index]
    print

    # report number of tests and failures
    print
    suite.report_results()
            for tokens in data_tokens]


#####################################
# Code for answering question 7 of the application

# Read the input data for 290 county data and create a list of clusters
data_table = load_data_table(DATA_290_URL)
    
singleton_list = []
for line in data_table:
    singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

# Create the clustered lists needed for computing the distortions
hierarchical_list = sol.hierarchical_clustering(singleton_list, 16)
kmeans_list = sol.kmeans_clustering(singleton_list, 16, 5) 

# Compute and print the distortions
num_clusters = len(kmeans_list)
hierarchical_distortion = 0
kmeans_distortion = 0
for index in range(num_clusters):
    hierarchical_distortion += hierarchical_list[index].cluster_error(data_table)
    kmeans_distortion += kmeans_list[index].cluster_error(data_table)
    
# Print the results
print
print "=====> Results for 290 county datapoints in 16 clusters"
print ".......... Distortion for hiearchical_clustering:", hierarchical_distortion
print ".......... Distortion for kmeans_clustering:     ", kmeans_distortion
print
Example #37
0
def question6(filename):
    """
    Run visualize on 'unifiedCancerData_111.csv' with a k-means callback
    and print the distortion of the clusters it returns.
    """
    data = 'unifiedCancerData_111.csv'

    def cluster_with_kmeans(x):
        return kmeans_clustering(x, 9, 5)

    # visualize runs before the table is loaded, as in the nested call form
    clusters = visualize(data, filename, cluster_with_kmeans)
    table = load_data_table(data)
    dist = distortion(clusters, table)
    print('Distortion in question6, kmeans = %f (%s)' % (dist, dist))
Example #38
0
def question3(filename):
    """
    Call visualize on 'unifiedCancerData_3108.csv', passing filename
    through, with a callback that applies kmeans_clustering(x, 15, 5).
    """
    # presumably 15 is the cluster count and 5 the iteration count -- TODO confirm
    visualize('unifiedCancerData_3108.csv', filename,
              lambda x: kmeans_clustering(x, 15, 5))
Example #39
0
def kmeans():
    """
    Call sol.kmeans_clustering(s, 9, 10) and discard the result.

    `s` is read from the enclosing scope; it is not defined in this function.
    """
    # NOTE(review): the result is unused, so this looks like a timing or
    # benchmark hook -- confirm against the caller
    sol.kmeans_clustering(s, 9, 10)
def question6(filename):
    """
    Visualize a k-means clustering of 'data/unifiedCancerData_111.csv'
    and print the distortion of the resulting clusters.
    """
    data = 'data/unifiedCancerData_111.csv'
    run_kmeans = lambda x: kmeans_clustering(x, 9, 5)

    # same call order as the nested form: visualize first, then the table
    clustered = visualize(data, filename, run_kmeans)
    dist = distortion(clustered, load_data_table(data))
    print('Distortion in question6, kmeans = %f (%s)' % (dist, dist))
Example #41
0
"""
Assignment 3 Question 7 Answer
"""

import alg_project3_viz as viz
import alg_project3_solution as sol
import alg_cluster

# Load the data table found at viz.DATA_111_URL
data_table = viz.load_data_table(viz.DATA_111_URL)

# Build a separate input list for each algorithm from the same table
hier_data_list = sol.make_data_list(data_table)
kmeans_data_list = sol.make_data_list(data_table)

# Run both clusterings with 9 as the second argument; k-means also gets 5
# (presumably 9 output clusters and 5 iterations -- TODO confirm)
hier_cluster_list = sol.hierarchical_clustering(hier_data_list, 9)
kmeans_cluster_list = sol.kmeans_clustering(kmeans_data_list, 9, 5)

# Report the distortion of each clustering against the original table
print("hierarchical:", sol.compute_distortion(hier_cluster_list, data_table))
print("kmeans:", sol.compute_distortion(kmeans_cluster_list, data_table))


# Hierarchical: 175163886915.8305 or 1.752 x 10^11 with four significant figures
# K-means: 271254226924.20047 or 2.713 x 10^11 with four significant figures
Example #42
0
def error_data():
    data_table = load_data_table(DATA_896_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))

    num_clusters = []
    hier_error = []
    k_means_error = []

    for num in range(6, 21):

        singleton_list = []
        for line in data_table:
            singleton_list.append(
                alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                    line[4]))

        num_clusters.append(num)
        cluster_list = alg_project3_solution.hierarchical_clustering(
            singleton_list, num)
        print "Displaying", len(cluster_list), "hierarchical clusters"
        error_sum = 0.0
        for cluster in cluster_list:
            error_sum += cluster.cluster_error(data_table)
        hier_error.append(error_sum / 1e11)
        print(error_sum / 1e11)

        singleton_list = []
        for line in data_table:
            singleton_list.append(
                alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                    line[4]))

        cluster_list = alg_project3_solution.kmeans_clustering(
            singleton_list, num, 5)
        print "Displaying", len(cluster_list), "k-means clusters"
        error_sum = 0.0
        for cluster in cluster_list:
            error_sum += cluster.cluster_error(data_table)
        k_means_error.append(error_sum / 1e11)
        print(error_sum / 1e11)

    matplotlib.rc('figure', figsize=(16, 8))

    plt.plot(num_clusters, hier_error, label="Hierarchical")
    plt.plot(num_clusters, k_means_error, label="K-means")

    plt.legend()
    plt.xlabel('Number of Clusters', fontsize=14, color='Green')
    plt.ylabel('Distortion x 10^11 ', fontsize=14, color='Brown')
    plt.title(
        'Distortion for Hierarchical and K-means custering for 896 points')
    plt.grid(True)
    plt.show()
    #plt.savefig('question_10_896.png')


#error_data()
Example #43
0
def kmeans():
    """
    Call sol.kmeans_clustering(s, 9, 10) and discard the result.

    `s` is read from the enclosing scope; it is not defined in this function.
    """
    # NOTE(review): the result is unused, so this looks like a timing or
    # benchmark hook -- confirm against the caller
    sol.kmeans_clustering(s, 9, 10)
Example #44
0
def run_example():
    """
    Compute k-means and hierarchical clustering distortions on the
    DATA_896_URL data table for 6 to 20 clusters and display them.

    With DESKTOP set, the two result lists go to create_separate_plots;
    otherwise the last computed cluster list is shown with simplegui.
    """
    data_table = load_data_table(DATA_896_URL)

    # one single-row cluster per line of the data table
    singleton_list = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]

    # cluster_list = sequential_clustering(singleton_list, 15)
    # print "Displaying", len(cluster_list), "sequential clusters"

    # question 5
    # cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 9)
    # print "Displaying", len(cluster_list), "hierarchical clusters"

    # question 6
    # cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 9, 5)
    # print "Displaying", len(cluster_list), "k-means clusters"

    # question 7
    # cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 9, 5)
    # kmeans_result = alg_project3_solution.compute_distortion(cluster_list, data_table)
    # print("Displaying", kmeans_result, "kmeans_result")
    # cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 9)
    # hierarchical_result = alg_project3_solution.compute_distortion(cluster_list, data_table)
    # print("Displaying", hierarchical_result, "hierarchical_result")

    # question 10: [cluster count, distortion] pairs in ascending count order
    kmeans_res = []
    for clusters_number in range(6, 21):
        cluster_list = alg_project3_solution.kmeans_clustering(
            singleton_list, clusters_number, 5)
        score = alg_project3_solution.compute_distortion(
            cluster_list, data_table)
        kmeans_res.append([clusters_number, score])

    # computed from 20 down to 6; inserting at the front keeps ascending order
    hier_res = []
    for clusters_number in reversed(range(6, 21)):
        cluster_list = alg_project3_solution.hierarchical_clustering(
            singleton_list, clusters_number)
        score = alg_project3_solution.compute_distortion(
            cluster_list, data_table)
        hier_res.insert(0, [clusters_number, score])

    # draw the clusters using matplotlib or simplegui
    if not DESKTOP:
        # use toggle in GUI to add cluster centers
        alg_clusters_simplegui.PlotClusters(data_table, cluster_list)
    else:
        # alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False)
        # alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)  # add cluster centers
        create_separate_plots(kmeans_res, hier_res)
def question3(filename):
    """
    Call visualize on 'data/unifiedCancerData_3108.csv', passing filename
    through, with a callback that applies kmeans_clustering(x, 15, 5).
    """
    # presumably 15 is the cluster count and 5 the iteration count -- TODO confirm
    visualize('data/unifiedCancerData_3108.csv', filename,
              lambda x: kmeans_clustering(x, 15, 5))