def question_one():
    """
    Plot running times of the slow and fast closest-pair algorithms.

    For every cluster count in range(2, 200) a random cluster list is
    generated with gen_random_clusters, and both
    prj3.slow_closest_pairs and prj3.fast_closest_pair are timed on
    that same list with time.time(), so the two curves are measured on
    identical input. The timings are then plotted against the cluster
    count and the figure is shown.

    Takes no arguments and returns None.
    """
    xvals = range(2, 200)
    slow_yvals = []
    fast_yvals = []
    for num in xvals:
        cluster_list = gen_random_clusters(num)
        # NOTE(review): fast_closest_pair may expect its input sorted by
        # horizontal position -- confirm against its implementation.

        initial = time.time()
        prj3.slow_closest_pairs(cluster_list)
        slow_yvals.append(time.time() - initial)

        initial = time.time()
        prj3.fast_closest_pair(cluster_list)
        fast_yvals.append(time.time() - initial)

    plt.plot(xvals, slow_yvals, color='r', label="Slow Closest Pair")
    plt.plot(xvals, fast_yvals, color='b', label="Fast Closest Pair")
    plt.legend(loc=2)
    plt.title("Efficiency of Slow and Fast Closest Pairs Algorithms")
    plt.xlabel("Number of Clusters")
    # time.time() differences are expressed in seconds, not milliseconds.
    plt.ylabel("Run Times in Seconds")
    plt.show()
# Example 2
# 0
def test_clustering(data_set, method):
    '''
    Compute the distortion of a data set for 6 to 20 output clusters.

    Input: data_set, an indexable pair whose element 0 is the data table
    and element 1 is the list of clusters to start from; method, the
    string 'K-Means' to use project.kmeans_clustering (10 iterations),
    anything else to use project.hierarchical_clustering.

    Output: a list of distortion values, one per cluster count, as
    returned by compute_distortion_data_set.
    '''
    # number of clusters to form
    num_cluster_list = np.arange(6, 21, 1)
    distortion_list = []
    for num_cluster in num_cluster_list:
        # Work on copies so that each run starts from the original clusters
        cluster_list = [cluster.copy() for cluster in data_set[1]]
        if method == 'K-Means':
            cluster_list = project.kmeans_clustering(cluster_list, num_cluster,
                                                     10)
        else:
            # Use the returned list rather than relying on the input being
            # modified in place by hierarchical_clustering.
            cluster_list = project.hierarchical_clustering(cluster_list,
                                                           num_cluster)
        distortion_list.append(
            compute_distortion_data_set(data_set[0], cluster_list))
    return distortion_list
# Example 3
# 0
def compute_running_times():
    '''
    Time project.slow_closest_pair and project.fast_closest_pair on
    random cluster lists of size 2 to 200.

    Garbage collection is switched off while timing and restored to its
    previous state afterwards, even if one of the calls raises.

    Input: NONE

    Output: a pandas DataFrame with columns 'slow', 'fast' (elapsed time
    per call) and 'num clusters'
    '''
    # time.clock was removed in Python 3.8; prefer perf_counter when the
    # interpreter provides it and fall back to clock on older versions.
    timer = getattr(time, 'perf_counter', None) or time.clock

    num_clusters_list = np.arange(2, 201, 1)
    slow_runtimes = []
    fast_runtimes = []

    # Remember the collector state so that it is not left disabled for
    # the rest of the process.
    gc_was_enabled = gc.isenabled()
    gc.disable()
    try:
        for num_clusters in num_clusters_list:
            clusters_list = gen_random_clusters(num_clusters)
            # Sort by horizontal center before timing either algorithm
            clusters_list.sort(key=lambda cluster: cluster.horiz_center())

            start_time = timer()
            project.slow_closest_pair(clusters_list)
            slow_runtimes.append(timer() - start_time)

            start_time = timer()
            project.fast_closest_pair(clusters_list)
            fast_runtimes.append(timer() - start_time)
    finally:
        if gc_was_enabled:
            gc.enable()

    return pd.DataFrame({
        'slow': slow_runtimes,
        'fast': fast_runtimes,
        'num clusters': num_clusters_list
    })
 def clustering(algo_used, num_clusters, num_iter = 5):
     """
     Cluster the module-level data_table with the selected algorithm.

     input: algo_used -- 1 for sequential_clustering, 2 for
            prj3.hierarchical_clustering, 3 for prj3.kmeans_clustering;
            num_clusters -- number of clusters passed to the algorithm;
            num_iter -- iteration count, used by k-means only
     output: cluster_list returned by the chosen algorithm
     raises: ValueError if algo_used is not 1, 2 or 3
     """
     # Reject unknown selectors up front; otherwise the function would
     # reach the return statement with cluster_list never assigned.
     if algo_used not in (1, 2, 3):
         raise ValueError("algo_used must be 1, 2 or 3, got %r" % (algo_used,))

     # One singleton cluster per row of the data table
     singleton_list = [
         alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
         for line in data_table
     ]

     if algo_used == 1:
         cluster_list = sequential_clustering(singleton_list, num_clusters)
         print("Displaying %d sequential clusters" % len(cluster_list))
     elif algo_used == 2:
         cluster_list = prj3.hierarchical_clustering(singleton_list, num_clusters)
         print("Displaying %d hierarchical clusters" % len(cluster_list))
     else:
         cluster_list = prj3.kmeans_clustering(singleton_list, num_clusters, num_iter)
         print("Displaying %d k-means clusters" % len(cluster_list))

     return cluster_list
# Example 5
# 0
def cluster_data():
    '''
    Load the 111-county data table and cluster it with both algorithms.

    Each row of the table becomes one singleton cluster. Hierarchical
    clustering runs on that list, and k-means (5 iterations) runs on an
    independent copy made beforehand; both produce 9 clusters.

    Output: a tuple of two lists of clusters (hierarchical, kmeans)
    '''
    base_url = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    table = viz.load_data_table(
        base_url + "data_clustering/unifiedCancerData_111.csv")

    hierarchical_input = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in table
    ]
    # Copy before clustering so k-means starts from the untouched singletons
    kmeans_input = [cluster.copy() for cluster in hierarchical_input]

    hierarchical_result = project.hierarchical_clustering(hierarchical_input, 9)
    kmeans_result = project.kmeans_clustering(kmeans_input, 9, 5)
    return (hierarchical_result, kmeans_result)
def run_example():
    """
    Compare the distortion of hierarchical and k-means clustering.

    Loads the data table from DATA_111_URL, builds one singleton cluster
    per row, and for every target size from 6 to 20 runs both
    alg_project3_solution.hierarchical_clustering and
    alg_project3_solution.kmeans_clustering (5 iterations). The
    distortion of each result is the sum of cluster_error(data_table)
    over its clusters. Both curves are plotted and the figure is saved
    as 'Comparison of distortion (111)'.

    Takes no arguments and returns None.
    """
    data_table = load_data_table(DATA_111_URL)

    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]

    def compute_distortion(cluster_list):
        """Return the summed cluster_error of cluster_list on data_table."""
        return sum(clus.cluster_error(data_table) for clus in cluster_list)

    def fresh_singletons():
        """Return independent copies so no run sees clusters altered by another."""
        return [cluster.copy() for cluster in singleton_list]

    cluster_counts = range(6, 21)
    hierarchical_distortion = []
    kmeans_distortion = []
    for num_final_clusters in cluster_counts:
        hierarchical_cluster_list = alg_project3_solution.hierarchical_clustering(
            fresh_singletons(), num_final_clusters)
        print("Displaying %d hierarchical clusters" % len(hierarchical_cluster_list))
        hierarchical_distortion.append(compute_distortion(hierarchical_cluster_list))

        kmeans_cluster_list = alg_project3_solution.kmeans_clustering(
            fresh_singletons(), num_final_clusters, 5)
        print("Displaying %d k-means clusters" % len(kmeans_cluster_list))
        kmeans_distortion.append(compute_distortion(kmeans_cluster_list))

    plt.plot(cluster_counts, hierarchical_distortion, 'g', lw = 2, label = "hierarchical distortion")
    plt.plot(cluster_counts, kmeans_distortion, 'r', lw = 2, label = "kmeans distortion")
    plt.legend(loc = 'upper left')
    plt.xlabel('Number of final clusters')
    plt.ylabel('Distortion')
    plt.title('Comparison of distortion between two clustering methods \n based on 111 county data set')
    plt.grid()
    plt.savefig('Comparison of distortion (111)')
# Example 7
# 0
def plot_Q6():
    """
    Load the 111-county data table, cluster it with k-means and plot
    the resulting clusters.

    One singleton cluster is built per table row, then
    project.kmeans_clustering is run for 9 clusters and 5 iterations and
    the result is drawn with alg_clusters_matplotlib.plot_clusters.

    Takes no arguments and returns None.
    """
    DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    DATA_111_URL = DIRECTORY + "data_clustering/unifiedCancerData_111.csv"

    data_table = viz.load_data_table(DATA_111_URL)

    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                            line[4])
        for line in data_table
    ]

    cluster_list = project.kmeans_clustering(singleton_list, 9, 5)
    # The clusters come from k-means, so report them as such.
    print("Displaying %d k-means clusters" % len(cluster_list))

    # draw the clusters using matplotlib
    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False)
"""
Divide and Conquer Method and Clustering
Closest Pairs and Clustering Algorithms
Test File
"""

import alg_cluster
import Project_3 as prj3

# Switches selecting which group of manual checks is run.
slow_dist = True
fast_dist = True


def _make_clusters(points):
    """
    Build one alg_cluster.Cluster per (horiz, vert) pair in points, with an
    empty set as first constructor argument and 1 and 0 as the last two.
    """
    return [alg_cluster.Cluster(set([]), horiz, vert, 1, 0)
            for horiz, vert in points]


if slow_dist:
    print("-------Testing Slow closest pairs first-------")
    print("\nTest 1..")
    print(prj3.slow_closest_pairs(_make_clusters([(0, 0), (1, 0)])))
    print("Expected: set([(1.0, 0, 1)])")
    print("\nTest 2..")
    print(prj3.slow_closest_pairs(_make_clusters([(0, 0), (0, 1), (0, 2)])))
    print("Expected: set([(1.0, 0, 1), (1.0, 1, 2)])")

if fast_dist:
    print("\n" + "-------Testing fast closest pairs-------")
    print("\nTest 1...")
    print(prj3.fast_closest_pair(_make_clusters([(0, 0), (1, 0)])))
    # Same two clusters as slow Test 1, so the same single pair is expected.
    print("Expected: one of the tuples in set([(1.0, 0, 1)])")
    print("\nTest 2...")
    # Twenty clusters spaced one unit apart along the horizontal axis
    print(prj3.fast_closest_pair(_make_clusters([(horiz, 0) for horiz in range(20)])))
    print("Expected: one of the tuples in set([(1.0, 9, 10), (1.0, 2, 3), (1.0, 15, 16), (1.0, 11, 12), (1.0, 13, 14), (1.0, 16, 17), (1.0, 14, 15), (1.0, 12, 13), (1.0, 4, 5), (1.0, 18, 19), (1.0, 3, 4), (1.0, 8, 9), (1.0, 17, 18), (1.0, 6, 7), (1.0, 7, 8), (1.0, 5, 6), (1.0, 10, 11), (1.0, 0, 1), (1.0, 1, 2)])")
    print("\nTest 3...")
    print(prj3.fast_closest_pair(_make_clusters([
        (90.9548590217, -17.089022585),
        (90.2536656675, -70.5911544718),
        (-57.5872347006, 99.7124028905),
        (-15.9338519877, 5.91547495626),
        (19.1869055492, -28.0681513017),
        (-23.0752410653, -42.1353490324),
        (-65.1732261872, 19.675582646),
        (99.7789872101, -11.2619165604),
        (-43.3699854405, -94.7349852817),
        (48.2281912402, -53.3441788034),
    ])))
    print("Expected: one of the tuples in set([(10.5745166749, 0, 7)])")