def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters
    Note: the function may not mutate cluster_list
    
    Input: List of clusters, integers number of clusters and number of iterations
    Output: List of clusters whose length is num_clusters
    """
    # Create copy of cluster_list, sorted in descending order of clusters' populations.
    sorted_cluster_list = sorted(cluster_list, key = lambda cluster: cluster.total_population(), reverse = True)        

    # Position initial clusters at the location of clusters with largest populations.
    centers = [alg_cluster.Cluster(set(), cluster.horiz_center(), cluster.vert_center(), 0, 0) 
               for cluster in sorted_cluster_list[:num_clusters]]

    for dummy_idx in range(num_iterations):
        # Initialize num_clusters empty clusters.
        k_clusters = [alg_cluster.Cluster(set(), 0, 0, 0, 0) for dummy_idx in range(num_clusters)]
        # For every cluster, merge cluster into the closest k_cluster.
        for cluster in cluster_list:
            min_dist = float('inf')
            for center in centers:
                if cluster.distance(center) < min_dist:
                    min_dist = cluster.distance(center)
                    closest = centers.index(center)
            k_clusters[closest].merge_clusters(cluster)
        # Update centers.
        centers = [alg_cluster.Cluster(set(), cluster.horiz_center(), cluster.vert_center(), 0, 0) 
                   for cluster in k_clusters]

    return k_clusters
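
A minimal usage sketch for the function above, assuming the alg_cluster module used throughout these examples and data rows with the layout [fips_code, horiz_center, vert_center, population, risk]; build_singletons is a hypothetical helper, not part of the example:

import alg_cluster

def build_singletons(data_table):
    # One singleton Cluster per county row: (fips, x, y, population, risk).
    return [alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
            for line in data_table]

# singleton_list = build_singletons(data_table)
# cluster_list = kmeans_clustering(singleton_list, 9, 5)   # 9 clusters, 5 iterations
# assert len(cluster_list) == 9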
Example 2
def compare_distortions():
    
     #data_table = load_data_table(DATA_111_URL)
     #data_table = load_data_table(DATA_290_URL)
     data_table = load_data_table(DATA_896_URL)
     dist_hierarchical = []
     dist_kmeans = []
     
     out_cluster_k = [k for k in range(6,21)]
     
     print out_cluster_k
     
     for k in out_cluster_k:
        
        singleton_list = []
        for line in data_table:
            singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

        cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, k)
        dist_hierarchical.append(compute_distortion(cluster_list))
        
        singleton_list = []
        for line in data_table:
            singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))


        cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, k, 5)
        dist_kmeans.append(compute_distortion(cluster_list))

     return dist_hierarchical, dist_kmeans, out_cluster_k
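
compare_distortions relies on a compute_distortion helper that is not shown in this excerpt. A minimal sketch, consistent with the inline helper defined inside run_kmeans_example further down this page and assuming each Cluster exposes cluster_error(data_table); note that this example passes only the cluster list (presumably a variant that closes over a module-level data_table), while the two-argument form below matches the other examples:

def compute_distortion(cluster_list, data_table):
    # Distortion is the sum of each output cluster's error over the data table.
    return sum(cluster.cluster_error(data_table) for cluster in cluster_list)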
Example 3
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters
    Note: the function may not mutate cluster_list
    
    Input: List of clusters, integers number of clusters and number of iterations
    Output: List of clusters whose length is num_clusters
    """

    # position initial clusters at the location of clusters with largest populations
    copy_cluster_list = list(cluster_list)
    copy_cluster_list.sort(key=lambda cluster: cluster.total_population())
    copy_cluster_list.reverse()
    size = len(cluster_list)
    new_clusters = [
        alg_cluster.Cluster(set([]),
                            copy_cluster_list[dummy_idx].horiz_center(),
                            copy_cluster_list[dummy_idx].vert_center(), 0, 0.0)
        for dummy_idx in range(num_clusters)
    ]
    for idx1 in range(num_iterations):
        old_clusters = list(new_clusters)
        new_clusters = [
            alg_cluster.Cluster(set([]),
                                old_clusters[dummy_idx].horiz_center(),
                                old_clusters[dummy_idx].vert_center(), 0, 0.0)
            for dummy_idx in range(num_clusters)
        ]
        for idx2 in range(size):
            closest_idx = kmeans_closest_idx(cluster_list[idx2], old_clusters)
            new_clusters[closest_idx].merge_clusters(cluster_list[idx2])
    return new_clusters
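
kmeans_closest_idx is not defined in this excerpt. A plausible sketch: the name and arguments are taken from the call above, and the body is an assumption based on the nearest-center searches in the other examples:

def kmeans_closest_idx(cluster, centers):
    # Index of the center nearest to cluster, using Cluster.distance.
    return min(range(len(centers)),
               key=lambda idx: cluster.distance(centers[idx]))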
Example 4
def q10_legend(DATA_URL):
    data_table = load_data_table(DATA_URL)
    singleton_list = []
    hierarchical_cluster_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))
        hierarchical_cluster_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))

    xvals = []
    yvals1 = []
    yvals2 = []
    for num_clusters in range(20, 5, -1):
        xvals.append(num_clusters)
        hierarchical_cluster_list = alg_project3_solution.hierarchical_clustering(
            hierarchical_cluster_list, num_clusters)
        yvals1.append(compute_distortion(hierarchical_cluster_list,
                                         data_table))
        yvals2.append(
            compute_distortion(
                alg_project3_solution.kmeans_clustering(
                    singleton_list, num_clusters, 5), data_table))
    curve1 = [[xvals[idx], yvals1[idx]] for idx in range(len(xvals))]
    curve2 = [[xvals[idx], yvals2[idx]] for idx in range(len(xvals))]
    simpleplot.plot_lines(
        "The distortion of output clusters uesd " + str(len(data_table)) +
        "-county data set", 800, 600, "the number of output clusters",
        "the distortion associated with each output clustering",
        [curve1, curve2], True, ["hierarchical cluster", "kmeans cluster"])
Example 5
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters
    Note: the function may not mutate cluster_list
    
    Input: List of clusters, integers number of clusters and number of iterations
    Output: List of clusters whose length is num_clusters
    """

    # position initial clusters at the location of clusters with largest populations
    point_list = [(cluster.total_population(), cluster.horiz_center(),
                   cluster.vert_center()) for cluster in cluster_list]
    centers = []
    for dummy_i in range(num_clusters):
        temp_point = max(point_list)
        centers.append((temp_point[1], temp_point[2]))
        point_list.remove(temp_point)
    answer = []
    for dummy_i in range(num_iterations):
        answer = []
        for center in centers:
            answer.append(
                alg_cluster.Cluster(set([]), center[0], center[1], 0, 0))
        for cluster in cluster_list:
            temp_dist = (float("inf"), -1)
            for idx in range(num_clusters):
                temp_dist = min(temp_dist, (cluster.distance(
                    alg_cluster.Cluster(set([]), centers[idx][0],
                                        centers[idx][1], 0, 0)), idx))
            answer[temp_dist[1]].merge_clusters(cluster)
        centers = [(mean.horiz_center(), mean.vert_center())
                   for mean in answer]
    return answer
Example 6
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters
    Note: the function may not mutate cluster_list
    
    Input: List of clusters, integers number of clusters and number of iterations
    Output: List of clusters whose length is num_clusters
    """
    nodelist = list(cluster_list)
    centers = []
    nodelist_by_pop = list(nodelist)
    nodelist_by_pop.sort(key = lambda cluster: cluster.total_population(), reverse=True)      

    # position initial clusters at the location of clusters with largest populations
    for idx in range(num_clusters):
        centers.append(alg_cluster.Cluster(set([]), nodelist_by_pop[idx].horiz_center(), nodelist_by_pop[idx].vert_center(), nodelist_by_pop[idx].total_population(), 0))

    for idx in range(num_iterations):
        # set empty list of clusters with clusters = num_clusters
        results = [alg_cluster.Cluster(set([]), 0, 0, 0, 0) for _ in range(num_clusters)]
        for each in nodelist:
            shortest_dist = float("inf")
            center_position = 0
            for center in centers:
                dist = each.distance(center)
                if dist < shortest_dist:
                    shortest_dist = dist
                    center_position = centers.index(center)
                    
            results[center_position].merge_clusters(each)
            
        # reset centers
        centers = list(results)
        
    return results
Example 7
def run_example():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    data_table = load_data_table(DATA_290_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))
    #cluster_list = sequential_clustering(singleton_list, 15)
    #print "Displaying", len(cluster_list), "sequential clusters"
    cluster_list = alg_project3_solution.hierarchical_clustering(
        list(singleton_list), 16)
    print "Displaying", len(cluster_list), "hierarchical clusters"
    print "Distortion", compute_distortion(cluster_list, data_table)
    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))
    cluster_list2 = alg_project3_solution.kmeans_clustering(
        list(singleton_list), 16, 5)
    print "Displaying", len(cluster_list2), "k-means clusters"
    print "Distortion", compute_distortion(cluster_list2, data_table)
Example 8
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters
    Note: the function may not mutate cluster_list
    """
    points = cluster_list[:]

    # n <-- |p|;
    len_points_list = len(points)

    # position initial clusters at the location of clusters with largest populations (i.e., cluster[3] which is population)
    cluster_centers = []
    temp_cl = points[:]

    temp_cl.sort(key=lambda cluster: cluster.total_population())
    for cluster in reversed(temp_cl):
        if len(cluster_centers) < num_clusters:
            cluster_centers.append(
                alg_cluster.Cluster(set([]), cluster.horiz_center(),
                                    cluster.vert_center(), 0, 0))

    # For number of iterations
    for dummy_var in range(num_iterations):
        # initialize k (num_clusters) empty sets C1, ... Ck;
        cluster_groupings = []
        for index in range(len(cluster_centers)):
            cluster_groupings.append(alg_cluster.Cluster(set(), 0, 0, 0, 0))
        # # For each county
        # for j = 0 to n - 1 do
        for index in range(len_points_list):
            # Find the old cluster center that is closest
            # L <-- argminsub(1<=f<=k) (dsub(psubj), musubf);
            min_dist = float('inf')
            nearest_cluster_index = None

            for idx, cluster in enumerate(cluster_centers):
                if points[index].distance(cluster) < min_dist:
                    min_dist = points[index].distance(cluster)
                    nearest_cluster_index = idx

            # Add the county to the corresponding new cluster
            # Handled with Cluster class merge_clusters method, which will automatically update the cluster centers to correct locations.
            cluster_groupings[nearest_cluster_index].merge_clusters(
                points[index])
        # Set old clusters equal to new clusters
        # for f = 1 to k do
        for index in range(len(cluster_centers)):
            # muf = center (Cf)     // handled with Cluster class built-in method(s)
            cluster_centers[index] = cluster_groupings[index].copy()

    # return {C1, C2, ..., Ck};
    return cluster_groupings
def run_kmeans_example():
    """
    Load a data table, compute a list of clusters and
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """

    data_table = load_data_table(DATA_896_URL)
    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))

    def compute_distortion(cluster_list):
        error = 0
        for cluster in cluster_list:
            error += cluster.cluster_error(data_table)
        return error

    error = []
    for cluster_num in range(6, 21):
        cluster_list = kmeans_clustering(singleton_list, cluster_num, 5)
        error.append(compute_distortion(cluster_list))
        singleton_list = []
        for line in data_table:
            singleton_list.append(
                alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                    line[4]))
    print("Displaying", len(cluster_list), "kmeans clusters")

    # cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 9)
    # print "Displaying", len(cluster_list), "hierarchical clusters"

    # cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 9, 5)
    # print "Displaying", len(cluster_list), "k-means clusters"

    # draw the clusters using matplotlib or simplegui
    if DESKTOP:
        # alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False)
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list,
                                              True)  # add cluster centers
    else:
        alg_clusters_simplegui.PlotClusters(
            data_table,
            cluster_list)  # use toggle in GUI to add cluster centers

    # return the distortion values for output-cluster counts 6 through 20
    return error
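
A possible follow-up sketch, assuming matplotlib is available: plot the distortion values returned by run_kmeans_example against the 6 to 20 output-cluster counts it iterates over.

import matplotlib.pyplot as plt

errors = run_kmeans_example()
plt.plot(range(6, 21), errors, '-r', label='k-means clustering')
plt.xlabel('number of output clusters')
plt.ylabel('distortion')
plt.legend(loc='upper right')
plt.show()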
Example 10
def setUp(self):
    self.cluster1 = alg_cluster.Cluster(set([]), 0, 0, 1, 0)
    self.cluster2 = alg_cluster.Cluster(set([]), 1, 0, 1, 0)
    self.cluster3 = alg_cluster.Cluster(set([]), 2, 0, 1, 0)
    self.cluster_list1 = [
        alg_cluster.Cluster(set([]), 0.02, 0.39, 1, 0),
        alg_cluster.Cluster(set([]), 0.19, 0.75, 1, 0),
        alg_cluster.Cluster(set([]), 0.35, 0.03, 1, 0),
        alg_cluster.Cluster(set([]), 0.73, 0.81, 1, 0),
        alg_cluster.Cluster(set([]), 0.76, 0.88, 1, 0),
        alg_cluster.Cluster(set([]), 0.78, 0.11, 1, 0)
    ]
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters
    Note: the function may not mutate cluster_list

    Input: List of clusters, integers number of clusters and number of iterations
    Output: List of clusters whose length is num_clusters
    """

    # position initial clusters at the location of clusters with largest populations
    new_cluster_list = [cluster.copy() for cluster in cluster_list]
    new_cluster_list.sort(key=lambda cluster: cluster.total_population(),
                          reverse=True)

    n_cluster = len(new_cluster_list)
    miu_list = list(new_cluster_list[:num_clusters])
    #print miu_list
    #for i in range(num_clusters):
    #   miu_list.append(alg_cluster.Cluster())

    while num_iterations > 0:
        c_buckets = []
        num_clusters1 = num_clusters
        while num_clusters1 > 0:
            c_buckets.append(alg_cluster.Cluster(set(), 0, 0, 0, 0))
            num_clusters1 -= 1

        #print len(c_buckets)
        for j_index in range(n_cluster):
            min_dist = float("inf")
            for f_index in range(num_clusters):
                dist = new_cluster_list[j_index].distance(miu_list[f_index])
                if dist < min_dist:
                    min_dist = dist
                    l_index = f_index
            #print "mindist", min_dist, l_index

            c_buckets[l_index].merge_clusters(new_cluster_list[j_index])
            #print c_buckets[:2]

        for f_index in range(num_clusters):
            x_pos = c_buckets[f_index].horiz_center()
            y_pos = c_buckets[f_index].vert_center()
            miu_list[f_index] = alg_cluster.Cluster(set(), x_pos, y_pos, 0, 0)

        num_iterations -= 1

    return c_buckets
def kmeans_clustering(cluster_list, len_k, iter_times_q):
    """
    Compute the len_k-means clustering of a set of clusters
    Note: the function may not mutate cluster_list
    
    Input: List of clusters, integers number of clusters and number of iterations
    Output: List of clusters whose length is num_clusters
    """

    # position initial clusters at the location of clusters with largest populations

    cluster_list_copy = list(cluster_list)
    cluster_list_copy.sort(key=lambda cluster: cluster.total_population(),
                           reverse=True)
    miu = []
    for idx_f in range(len_k):
        miu.append((cluster_list_copy[idx_f].horiz_center(),
                    cluster_list_copy[idx_f].vert_center()))

    for dummy_i in range(iter_times_q):
        k_cluster_list = []
        nk_cluster_list = []
        for idx_x in range(len_k):
            temp_cluster = alg_cluster.Cluster(set(), float(miu[idx_x][0]),
                                               float(miu[idx_x][1]), int(0),
                                               float(0))
            k_cluster_list.append(temp_cluster)
            temp_cluster = alg_cluster.Cluster(set(), float(0), float(0),
                                               int(0), float(0))
            nk_cluster_list.append(temp_cluster)

        for idx_j in range(len(cluster_list)):
            min_distance = float('inf')
            idx_l = -1
            for idx_f in range(len_k):
                temp_distance = cluster_list[idx_j].distance(
                    k_cluster_list[idx_f])
                if temp_distance <= min_distance:
                    min_distance = temp_distance
                    idx_l = idx_f
            nk_cluster_list[idx_l].merge_clusters(cluster_list[idx_j])

        miu = []
        for idx_f in range(len_k):
            miu.append((nk_cluster_list[idx_f].horiz_center(),
                        nk_cluster_list[idx_f].vert_center()))

    return nk_cluster_list
Example 13
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters
    Note: the function may not mutate cluster_list
    
    Input: List of clusters, integers number of clusters and number of iterations
    Output: List of clusters whose length is num_clusters
    """

    size = len(cluster_list)
    copy_list = [cluster.copy() for cluster in cluster_list]
    copy_list.sort(key=lambda cluster: -1 * cluster.total_population())
    # position initial clusters at the location of clusters with largest populations
    centers = copy_list[:num_clusters]

    for _ in range(0, num_iterations):
        cluster_set = [
            alg_cluster.Cluster(set(), 0, 0, 0, 0) for _ in range(num_clusters)
        ]

        for idx_j in range(0, size):
            index = find_index(cluster_list[idx_j], centers)
            cluster_set[index].merge_clusters(cluster_list[idx_j])

        for idx_k in range(0, num_clusters):
            centers[idx_k] = cluster_set[idx_k].copy()

    return centers
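
find_index is referenced above but not shown in this excerpt. A minimal sketch under the same Cluster.distance assumption used by the other examples:

def find_index(cluster, centers):
    # Return the index of the center in centers closest to cluster.
    distances = [cluster.distance(center) for center in centers]
    return distances.index(min(distances))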
Example 14
def plot_distortion(data_url, size_range):
    """
    Plot the distortion of hierarchical and k-means clusterings of the data set
    at data_url for output-cluster counts in the inclusive range size_range.
    """
    data_table = load_data_table(data_url)
    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))
    hc_list = singleton_list
    x = []
    y_hc = []
    y_kmc = []
    # compute distortion of hierarchical_clustering in specific range
    # Note: use the hierarchical cluster of size n to compute the size n-1 and so on.
    for size in range(size_range[1], size_range[0] - 1, -1):
        print len(hc_list)
        hc_list = alg_project3_solution.hierarchical_clustering(hc_list, size)
        y_hc.append(compute_distortion(hc_list, data_table))
    y_hc.reverse()  # reverse y_hc so it aligns with the ascending cluster sizes in x
    print y_hc
    for size in range(size_range[0], size_range[1] + 1):
        x.append(size)
        kmc_list = alg_project3_solution.kmeans_clustering(
            singleton_list, size, 5)
        y_kmc.append(compute_distortion(kmc_list, data_table))
    title = data_url[-7:-4]
    plt.plot(x, y_hc, '-b', label='hierarchical_clustering')
    plt.plot(x, y_kmc, '-r', label='k-means_clustering')
    plt.legend(loc='upper right')
    plt.xlabel("Number of Output Clusters")
    plt.ylabel("Distortion")
    plt.title(title + " county data sets")
    plt.show()
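
A usage sketch, assuming the DATA_896_URL constant defined alongside the other examples; size_range is an inclusive (low, high) pair of output-cluster counts:

# Hypothetical call: distortion for 6 to 20 output clusters on the 896-county set.
plot_distortion(DATA_896_URL, (6, 20))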
Example 15
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters
    
    Input: List of clusters, number of clusters, number of iterations
    Output: List of clusters whose length is num_clusters
    """
    
    num = len(cluster_list)
    cluster_index = list(range(num))
    cluster_index.sort(key=lambda x:cluster_list[x].total_population())
    cluster_index.reverse()
    # initialize k-means clusters to be initial clusters with largest populations
    center_list = [ cluster_list[cluster_index[pos]].copy() for pos in range(num_clusters) ]
    prev_center_list = []

    for _ in range(num_iterations):
        prev_center_list = center_list
        center_list = []
        for idx in range(num_clusters):
            center_list.append(alg_cluster.Cluster(set(), 0.0, 0.0, 0, 0.0))
        best_idx_list = []
        for idx in range(num):
            best_dist = prev_center_list[0].distance(cluster_list[idx])
            best_idx = 0
            for cidx in range(num_clusters):
                dist = prev_center_list[cidx].distance(cluster_list[idx])
                if best_dist > dist:
                    best_idx = cidx
                    best_dist = dist
            best_idx_list.append(best_idx)
        for idx in range(num):
            center_list[best_idx_list[idx]].merge_clusters(cluster_list[idx])

    return center_list
Example 16
def visualize_data(cluster_input, data, method=None, display_centers=False):
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    data_table = load_data_table(data)
    
    singleton_list = []
    for line in data_table:
        singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))
        
    if method is None:
        cluster_list = sequential_clustering(singleton_list, cluster_input)
        print("Displaying", len(cluster_list), "sequential clusters")
    elif method == 'hierarchical_clustering':
        cluster_list = clustering.hierarchical_clustering(singleton_list, cluster_input)
        print("Displaying", len(cluster_list), "hierarchical clusters")
    elif method == 'kmeans_clustering':
        cluster_list = clustering.kmeans_clustering(singleton_list,
                                                    cluster_input[0],
                                                    cluster_input[1])
        print("Displaying", len(cluster_list), "k-means clusters")
    else:
        print("ERROR: method entered into visualize_data not recognized")

    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, display_centers)
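
A usage sketch for visualize_data; note that 'kmeans_clustering' expects cluster_input to be a (num_clusters, num_iterations) pair, while the other methods take a single integer. DATA_3108_URL is assumed to be defined as in the other examples:

# Hypothetical calls illustrating the two cluster_input conventions.
visualize_data(9, DATA_3108_URL, method='hierarchical_clustering')
visualize_data((9, 5), DATA_3108_URL, method='kmeans_clustering', display_centers=True)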
Example 17
def compare_distortion():
    # data_table = load_data_table('unifiedCancerData_111.csv')
    # data_table = load_data_table('unifiedCancerData_290.csv')
    data_table = load_data_table('unifiedCancerData_896.csv')
    hie_distortion = []
    kmeans_distortion = []
    num_output = range(20, 5, -1)
    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))
    hie_list = singleton_list[:]
    for num_cluster in num_output:
        hie_list = alg_project3_solution.hierarchical_clustering(
            hie_list, num_cluster)
        hie_distortion.append(app3_7.compute_distortion(hie_list, data_table))
        kmeans_list = alg_project3_solution.kmeans_clustering(
            singleton_list, num_cluster, 5)
        kmeans_distortion.append(
            app3_7.compute_distortion(kmeans_list, data_table))
    plt.plot(num_output, hie_distortion, '-b', label='hierarchical clustering')
    plt.plot(num_output, kmeans_distortion, '-r', label='k-means clustering')
    plt.legend(loc='upper right')
    plt.xlabel('number of output clusters')
    plt.ylabel('distortion of two clustering methods')
    plt.title('Comparison of the Distortion with ' + str(len(singleton_list)) +
              ' County Data Set')
    plt.show()
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters
    Note: the function may not mutate cluster_list
    
    Input: List of clusters, integers number of clusters and number of iterations
    Output: List of clusters whose length is num_clusters
    """
    # position initial clusters at the location of clusters with largest populations
    list_len = len(cluster_list)
    centers = cluster_list[:]
    centers.sort(key=lambda cluster: cluster.total_population(), reverse=True)
    centers = centers[:num_clusters]
    cluster_sets = None
    for dummy_i in range(num_iterations):
        cluster_sets = [
            alg_cluster.Cluster(set([]), 0.0, 0.0, 0.0, 0.0)
            for dummy_idx in range(num_clusters)
        ]
        for idx in range(list_len):
            min_idx = min(range(num_clusters),
                          key=lambda center_idx: cluster_list[idx].distance(
                              centers[center_idx]))
            cluster_sets[min_idx].merge_clusters(cluster_list[idx])
        centers = cluster_sets
    return cluster_sets
Example 19
def run_example():
    """ Load a data table, compute a list of clusters and plot a list of clusters. Set DESKTOP = True/False to use either matplotlib or simplegui """
    data_table = load_data_table(DATA_3108_URL)
    #data_table = load_data_table(DATA_111_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))

    # ************** Here we have to choose the type of clustering we want to use for visualization ********************
    #cluster_list = sequential_clustering(singleton_list, 15); print "Displaying", len(cluster_list), "sequential clusters"
    #cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 9); print "Displaying", len(cluster_list), "hierarchical clusters"
    cluster_list = alg_project3_solution.kmeans_clustering(
        singleton_list, 9, 5)
    print "Displaying", len(cluster_list), "k-means clusters"

    if DESKTOP:  # draw the clusters using matplotlib or simplegui
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False)
        # alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)  # add cluster centers
    else:
        alg_clusters_simplegui.PlotClusters(
            data_table,
            cluster_list)  # use toggle in GUI to add cluster centers
Example 20
def run_example():
    """
    Load a data table, compute a list of clusters and
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    data_table = load_data_table(DATA_111_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))

    #cluster_list = sequential_clustering(singleton_list, 15)
    #print "Displaying", len(cluster_list), "sequential clusters"

    #cluster_list = closest_pairs_and_clustering_algorithms.hierarchical_clustering(singleton_list, 9)
    #print "Displaying", len(cluster_list), "hierarchical clusters"

    cluster_list = closest_pairs_and_clustering_algorithms.kmeans_clustering(
        singleton_list, 9, 5)
    print "Displaying", len(cluster_list), "k-means clusters"

    # draw the clusters using matplotlib or simplegui
    if DESKTOP:
        #alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False)
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list,
                                              True)  #add cluster centers
    else:
        alg_clusters_simplegui.PlotClusters(
            data_table,
            cluster_list)  # use toggle in GUI to add cluster centers
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    '''
    Takes a list of Cluster objects and applies k-means clustering, as described in the
    KMeansClustering pseudo-code from Homework 3, to this list of clusters. The function
    first computes an initial list of clusters (line 2 in the pseudo-code) with the property
    that each cluster consists of a single county chosen from the set of the num_clusters
    counties with the largest populations. It then computes num_iterations of k-means
    clustering and returns the resulting list of clusters.
    '''
    _cluster_length = len(cluster_list)
    _temp = cluster_list[:]
    _cluster_copy = cluster_list[:]
    _temp.sort(key=lambda cluster: cluster.total_population())
    _k_centers = _temp[-(num_clusters):]
    #	ans = -1
    #print _k_centers
    for _current_iteration in range(0, num_iterations):
        _k_initial_sets = [
            alg_cluster.Cluster(set([]), 0, 0, 0, 0)
            for _dmy in range(0, num_clusters)
        ]
        for _point in range(0, _cluster_length):
            _min_dist = float('inf')
            _closest_center = -1
            for _k_idx in range(0, num_clusters):
                _current_dist = _cluster_copy[_point].distance(
                    _k_centers[_k_idx])
                if _current_dist < _min_dist:
                    _closest_center = _k_idx
                    _min_dist = _current_dist
            _k_initial_sets[_closest_center].merge_clusters(
                _cluster_copy[_point])
            #dist = min([_inner_cluster_list[_inner1].distance(center, for center in _k_centers if ])
        for _each in range(0, num_clusters):
            _k_centers[_each] = _k_initial_sets[_each]
#		ans = _k_initial_sets
    return _k_centers
Example 22
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters
    Note: the function may not mutate cluster_list

    Input: List of clusters, integers number of clusters and number of iterations
    Output: List of clusters whose length is num_clusters
    """
    # position initial clusters at the location of clusters with largest populations
    num = len(cluster_list)

    clusters_by_pop = sorted(cluster_list,
                             key=lambda z: z.total_population(),
                             reverse=True)[:num_clusters]
    cluster_dict = {
        idx: cluster
        for idx, cluster in enumerate(clusters_by_pop)
    }

    for _ in xrange(num_iterations):
        clusters = [
            alg_cluster.Cluster(fips_codes=set(),
                                horiz_pos=0,
                                vert_pos=0,
                                population=0,
                                risk=0) for _ in xrange(num_clusters)
        ]
        for index in xrange(num):
            minimum = min(
                xrange(num_clusters),
                key=lambda f: cluster_list[index].distance(cluster_dict[f]))
            clusters[minimum].merge_clusters(cluster_list[index])
        for idy in xrange(num_clusters):
            cluster_dict[idy] = clusters[idy]
    return cluster_dict.values()
def run_example():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    data_table = load_data_table(DATA_3108_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))

    #cluster_list = sequential_clustering(singleton_list, 15)
    cluster_list = hierarchical_clustering(singleton_list, 15)

    print "Displaying", len(cluster_list), "sequential clusters"

    #cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 9)
    #print "Displaying", len(cluster_list), "hierarchical clusters"

    #cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 9, 5)
    #print "Displaying", len(cluster_list), "k-means clusters"

    # draw the clusters using matplotlib or simplegui
    if DESKTOP:
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)
    else:
        alg_clusters_simplegui.PlotClusters(data_table, cluster_list)
Example 24
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """ Compute the k-means clustering of a set of clusters. Note: the function may not mutate cluster_list. Input: List of clusters, integers number of clusters and number of iterations Output: List of clusters whose length is num_clusters """
    cluster_list = [
        cluster.copy() for cluster in cluster_list
    ]  # first we create a list of cluster object copies, as we need to work on a copy and the 'fips_codes' don't come as sets for the cancer data
    initial_centers = sorted(
        cluster_list, reverse=True, key=lambda cls: cls.total_population()
    )[:
      num_clusters]  # position initial clusters of size num_clusters at the location of clusters with largest populations

    for dummy_iteration in range(
            num_iterations
    ):  # the main loop that counts for the number of iterations
        clusters = [
            (alg_cluster.Cluster(set([]), 0, 0, 0, 0), cluster)
            for cluster in initial_centers
        ]  # we need an empty cluster to merge with the incoming clusters, we also add the initial coords to simplify future ops.
        for cluster in cluster_list:
            new_cluster, dummy_center = min(
                clusters, key=lambda cls: cls[1].distance(cluster)
            )  # as we iterate, we store 'cluster' object as 'new_cluster' where the initial coords are the closest
            new_cluster.merge_clusters(
                cluster
            )  # by using the reference to the cluster object, we merge the object in 'clusters' with the one we picked
        initial_centers = [
            cluster for cluster, dummy_center in clusters
        ]  # now we have to update, the initial centers for the next iteration, as this is where the initial positions into 'clusters' came from

    return initial_centers
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters
    
    Input: List of clusters, number of clusters, number of iterations
    Output: List of clusters whose length is num_clusters
    """
    cluster_list_sorted = sorted(cluster_list,
                                 key=lambda x: x.total_population(),
                                 reverse=True)
    k_clusters = cluster_list_sorted[:num_clusters]
    # initialize k-means clusters to be initial clusters with largest populations
    for dummy_idx in xrange(num_iterations):
        new_clusters = [
            alg_cluster.Cluster(set([]), 0, 0, 1, 0)
            for dummy_idx in xrange(num_clusters)
        ]
        for idx_j in xrange(len(cluster_list)):
            current_dist = [
                cluster_list[idx_j].distance(k_clusters[idx_l])
                for idx_l in xrange(num_clusters)
            ]
            idx_l = min(xrange(len(current_dist)),
                        key=current_dist.__getitem__)
            new_clusters[idx_l].merge_clusters(cluster_list[idx_j])
        k_clusters = new_clusters[:]

    return k_clusters
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters
    
    Input: List of clusters, number of clusters, number of iterations
    Output: List of clusters whose length is num_clusters
    """
    cluster_list_k = []
    population_counties_list = [item.total_population() for item in cluster_list]
    dummy_k = num_clusters
    while dummy_k>0:
        max_population = max(population_counties_list)
        idx = population_counties_list.index(max_population)
        cluster_list_k.append(cluster_list[idx])
        population_counties_list[idx] = -1
        dummy_k-=1
    cluster_list_k.reverse()    
    for dummy_iteration in range(num_iterations):
        temp_clusters = [alg_cluster.Cluster(set([]), item.horiz_center(), item.vert_center(), 0, 0.0) for item in cluster_list_k]
        for index in range(len(cluster_list)):
            min_index = 0
            min_value = float("inf")
            for idx in range(num_clusters):
                if cluster_list[index].distance(cluster_list_k[idx]) < min_value:
                    min_index = idx
                    min_value = cluster_list[index].distance(cluster_list_k[idx])
            temp_clusters[min_index].merge_clusters(cluster_list[index])
        for idx in range(num_clusters):
            cluster_list_k[idx] = temp_clusters[idx]
    return temp_clusters
Example 27
def gen_singleton_list(data_table):
    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))
    return singleton_list
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters
    Note: the function may not mutate cluster_list
    
    Input: List of clusters, integers number of clusters and number of iterations
    Output: List of clusters whose length is num_clusters
    """
    clusters = cluster_list[::]
    # position initial clusters at the location of clusters with largest populations
    clusters.sort(key = lambda cluster: cluster.total_population(), reverse = True)
    clusters = clusters[:num_clusters]
   
    for _ in range(num_iterations):
        new_clusters = [alg_cluster.Cluster(set(),0,0,0,0) for _ in range(num_clusters)]
        for cluster in cluster_list:
            # find closest center
            closest = float("inf")
            for idx in range(len(clusters)):
                dist = clusters[idx].distance(cluster)
                if dist < closest:
                    closest = dist
                    center = idx
            # merge the cluster closest to new_clusters
            new_clusters[center].merge_clusters(cluster)
        # store new center
        clusters = new_clusters
    return clusters
Example 29
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters
    Note: the function may not mutate cluster_list
    
    Input: List of clusters, integers number of clusters and number of iterations
    Output: List of clusters whose length is num_clusters
    """

    # position initial clusters at the location of clusters with largest populations
    clusters = [cluster for cluster in cluster_list]
    clusters.sort(key=lambda x: x.total_population(), reverse=True)
    clusters = clusters[:num_clusters]

    for _ in range(num_iterations):
        # num_iterations == q
        # initalize num_clusters i.e k empty cluster
        empty_cluster = [
            alg_cluster.Cluster(set([]), 0, 0, 0, 0)
            for _ in range(num_clusters)
        ]

        for jdx in range(len(cluster_list)):
            distance, merge_with = float('inf'), None
            for cluster in clusters:
                if cluster_list[jdx].distance(cluster) < distance:
                    distance, merge_with = cluster_list[jdx].distance(
                        cluster), cluster

            empty_cluster[clusters.index(merge_with)].merge_clusters(
                cluster_list[jdx])
        # new_clusters[.index(closest_cluster_center)].merge_clusters(county)
        clusters = empty_cluster
    return clusters
def gen_random_clusters(num_clusters):
    random_clusters = list()
    for cluster in range(num_clusters):
        x = random.random() * 2 - 1
        y = random.random() * 2 - 1
        random_clusters.append(alg_cluster.Cluster(set([]), x, y, 0, 0))
    return random_clusters
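
A quick sanity-check sketch of gen_random_clusters' output, assuming random and alg_cluster are imported at module level; the generated clusters carry empty FIPS sets and zero population, with coordinates drawn from [-1, 1):

# Hypothetical check of the generated clusters' count and coordinate range.
random_points = gen_random_clusters(200)
assert len(random_points) == 200
assert all(-1 <= c.horiz_center() <= 1 and -1 <= c.vert_center() <= 1
           for c in random_points)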