예제 #1
0
def compute_distortion(cluster_list, num_clusters=9, num_iterations=5):
    """
    Compare hierarchical and k-means clustering on the DATA_111_URL table.

    cluster_list is clustered twice -- once by cpf.hierarchical_clustering
    and once by cpf.kmeans_clustering -- and for each result the values
    returned by every cluster's cluster_error(data_table) are added up.

    Arguments:
    cluster_list -- list of cluster objects both clusterings start from
    num_clusters -- number of clusters requested from both algorithms
    num_iterations -- iteration count passed to kmeans_clustering

    Prints a short report and returns [hier_error, kmeans_error].
    """
    # Imported locally so this function does not depend on the module's
    # import block.
    import copy

    # load data table
    data_table = load_data_table(DATA_111_URL)

    def total_error(clusters):
        """ add up cluster_error(data_table) over one clustering """
        return sum(cluster.cluster_error(data_table) for cluster in clusters)

    # NOTE(review): clustering functions may mutate the list (or the cluster
    # objects) they receive -- confirm against cpf.  Each algorithm therefore
    # works on its own deep copy, so k-means starts from the same input as
    # the hierarchical run and the caller's list is left untouched.
    hier_list = cpf.hierarchical_clustering(copy.deepcopy(cluster_list),
                                            num_clusters)
    hier_error = total_error(hier_list)

    kmeans_list = cpf.kmeans_clustering(copy.deepcopy(cluster_list),
                                        num_clusters, num_iterations)
    kmeans_error = total_error(kmeans_list)

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print("\n\n\n\n-------- -------- Results -------- --------")
    print("Number of clusters:  %r" % num_clusters)
    print("hierarchical_clustering error:  %s" % hier_error)
    print("kmeans_clustering error:  %s" % kmeans_error)
    print("----- ----- ----- ----- ----- ----- -----")
    return [hier_error, kmeans_error]
예제 #2
0
def compute_distortion(cluster_list, data_table, num_clusters,
                       num_iterations=5):
    """
    Compute the total cluster_error of a hierarchical and a k-means clustering.

    Arguments:
    cluster_list -- list of cluster objects both clusterings start from
    data_table -- table handed to each resulting cluster's cluster_error
    num_clusters -- number of clusters requested from both algorithms
    num_iterations -- iteration count passed to kmeans_clustering

    Returns [hier_error, kmeans_error], where each entry is the sum of
    cluster_error(data_table) over the clusters produced by
    cpf.hierarchical_clustering and cpf.kmeans_clustering respectively.
    """
    # Imported locally so this function does not depend on the module's
    # import block.
    import copy

    def total_error(clusters):
        """ add up cluster_error(data_table) over one clustering """
        return sum(cluster.cluster_error(data_table) for cluster in clusters)

    # NOTE(review): clustering functions may mutate the list (or the cluster
    # objects) they receive -- confirm against cpf.  Each algorithm therefore
    # works on its own deep copy, so k-means starts from the same input as
    # the hierarchical run and the caller's list is left untouched.
    hier_list = cpf.hierarchical_clustering(copy.deepcopy(cluster_list),
                                            num_clusters)
    hier_error = total_error(hier_list)

    kmeans_list = cpf.kmeans_clustering(copy.deepcopy(cluster_list),
                                        num_clusters, num_iterations)
    kmeans_error = total_error(kmeans_list)

    return [hier_error, kmeans_error]
예제 #3
0
def run_question(number, data_set):
    """
    Load a data table, compute a list of clusters and
    plot a list of clusters.

    number selects the clustering to run:
        0    -- sequential_clustering, 15 clusters
        2, 3 -- 15 clusters (2: hierarchical, 3: k-means with 5 iterations)
        5, 6 -- 9 clusters (5: hierarchical, 6: k-means with 5 iterations)
    data_set is handed unchanged to load_data_table.

    Raises ValueError for any other number, before any data is loaded.

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    # Settle the cluster count first: this rejects bad input before the
    # (possibly slow) load, and guarantees num_clusters is bound for every
    # accepted number, including 0.
    if number in [0, 2, 3]:
        num_clusters = 15
    elif number in [5, 6]:
        num_clusters = 9
    else:
        raise ValueError(
            "Please pass a valid number to run_question.  Valid options are 0, 2, 3, 5, or 6."
        )

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print("Loading data table ...")
    data_table = load_data_table(data_set)
    print("Data table loaded.  Creating clusters ...")

    print("\nQuestion number:   %s" % number)
    print("Number of clusters to be calculated:   %s" % num_clusters)

    # parse data_table into cluster objects, one singleton cluster per row
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                            line[4]) for line in data_table
    ]

    # calculate clusters; the progress line names the algorithm actually used
    if number == 0:
        print("\nCluster list created.  Passing list to sequential_clustering ...")
        cluster_list = sequential_clustering(singleton_list, num_clusters)
        print("Displaying %d sequential clusters" % len(cluster_list))
    elif number in [2, 5]:
        print("\nCluster list created.  Passing list to hierarchical_clustering ...")
        cluster_list = cpf.hierarchical_clustering(singleton_list,
                                                   num_clusters)
        print("Displaying %d hierarchical clusters" % len(cluster_list))
    else:
        # only 3 and 6 can reach this branch after the validation above
        print("\nCluster list created.  Passing list to kmeans_clustering ...")
        cluster_list = cpf.kmeans_clustering(singleton_list, num_clusters, 5)
        print("Displaying %d k-means clusters" % len(cluster_list))

    # draw the clusters using matplotlib or simplegui
    if DESKTOP:
        # True adds the cluster centers to the plot
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)
    else:
        # use toggle in GUI to add cluster centers
        alg_clusters_simplegui.PlotClusters(data_table, cluster_list)
예제 #4
0
def run_question(number, data_set):
    """
    Load a data table, compute a list of clusters and
    plot a list of clusters.

    number selects the clustering to run:
        0    -- sequential_clustering into 15 clusters
        2, 5 -- cpf.hierarchical_clustering into 9 clusters
        3, 6 -- cpf.kmeans_clustering into 9 clusters, 5 iterations
    data_set is handed unchanged to load_data_table.

    Raises ValueError for any other number, before any data is loaded.

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    # Reject unsupported questions up front; otherwise cluster_list would be
    # left unbound and the plotting step would fail with a NameError.
    if number not in [0, 2, 3, 5, 6]:
        raise ValueError(
            "Please pass a valid number to run_question.  Valid options are 0, 2, 3, 5, or 6."
        )

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print("Loading data table ...")
    data_table = load_data_table(data_set)
    print("Data table loaded.  Creating clusters ...")

    # one singleton cluster per row of the table
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                            line[4]) for line in data_table
    ]

    # the progress line names the algorithm actually used
    if number == 0:
        print("Cluster list created.  Passing list to sequential_clustering ...")
        cluster_list = sequential_clustering(singleton_list, 15)
        print("Displaying %d sequential clusters" % len(cluster_list))
    elif number in [2, 5]:
        print("Cluster list created.  Passing list to hierarchical_clustering ...")
        cluster_list = cpf.hierarchical_clustering(singleton_list, 9)
        print("Displaying %d hierarchical clusters" % len(cluster_list))
    else:
        # only 3 and 6 can reach this branch after the validation above
        print("Cluster list created.  Passing list to kmeans_clustering ...")
        cluster_list = cpf.kmeans_clustering(singleton_list, 9, 5)
        print("Displaying %d k-means clusters" % len(cluster_list))

    # draw the clusters using matplotlib or simplegui
    if DESKTOP:
        # True adds the cluster centers to the plot
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)
    else:
        # use toggle in GUI to add cluster centers
        alg_clusters_simplegui.PlotClusters(data_table, cluster_list)
예제 #5
0
def compute_distortion_plot_helper(cluster_list, data_table, num_clusters,
                                   num_iterations=5):
    """
    Return the summed cluster_error of a hierarchical and a k-means clustering.

    Arguments:
    cluster_list -- list of cluster objects both clusterings start from
    data_table -- table handed to each resulting cluster's cluster_error
    num_clusters -- number of clusters requested from both algorithms
    num_iterations -- iteration count passed to kmeans_clustering

    Returns [hier_error, kmeans_error]: for the output of
    cpf.hierarchical_clustering and of cpf.kmeans_clustering, the sum of
    cluster_error(data_table) over the produced clusters.  Nothing is printed.
    """
    # Imported locally so this function does not depend on the module's
    # import block.
    import copy

    def summed_error(clusters):
        """ add up cluster_error(data_table) over one clustering """
        return sum(cluster.cluster_error(data_table) for cluster in clusters)

    # NOTE(review): clustering functions may mutate the list (or the cluster
    # objects) they receive -- confirm against cpf.  Working on deep copies
    # keeps the two runs independent and lets a caller reuse cluster_list
    # across several calls with different num_clusters.
    hier_input = copy.deepcopy(cluster_list)
    kmeans_input = copy.deepcopy(cluster_list)

    hier_error = summed_error(
        cpf.hierarchical_clustering(hier_input, num_clusters))
    kmeans_error = summed_error(
        cpf.kmeans_clustering(kmeans_input, num_clusters, num_iterations))

    return [hier_error, kmeans_error]
예제 #6
0
def test_kmeans():
    """
    Check student.kmeans_clustering against known answers for the 24 county set.

    kmeans_clustering is not supposed to mutate the list it is given, but every
    case still runs on a freshly built list of clusters so that one case can
    never influence the next.
    """

    # load small data table
    print("")
    print("\n\nTesting kmeans_clustering on 24 county set")
    county_table = load_data_table(DATA_24_URL)

    # Each case: (num_clusters, num_iterations, expected set of county tuples).
    # set([...]) is kept (rather than a set literal) so elements are inserted
    # in the written order.
    known_answers = (
        (15, 1, set([
            ('34017', '36061'),
            ('06037', ),
            ('06059', ),
            ('36047', ),
            ('36081', ),
            ('06071', '08031'),
            ('36059', ),
            ('36005', ),
            ('55079', ),
            ('34013', '34039'),
            ('06075', ),
            ('01073', ),
            ('06029', ),
            ('41051', '41067'),
            ('11001', '24510', '51013', '51760', '51840', '54009'),
        ])),
        (15, 3, set([
            ('34017', '36061'),
            ('06037', '06059'),
            ('06071', ),
            ('36047', ),
            ('36081', ),
            ('08031', ),
            ('36059', ),
            ('36005', ),
            ('55079', ),
            ('34013', '34039'),
            ('06075', ),
            ('01073', ),
            ('06029', ),
            ('41051', '41067'),
            ('11001', '24510', '51013', '51760', '51840', '54009'),
        ])),
        (15, 5, set([
            ('34017', '36061'),
            ('06037', '06059'),
            ('06071', ),
            ('36047', ),
            ('36081', ),
            ('08031', ),
            ('36059', ),
            ('36005', ),
            ('55079', ),
            ('34013', '34039'),
            ('06075', ),
            ('01073', ),
            ('06029', ),
            ('41051', '41067'),
            ('11001', '24510', '51013', '51760', '51840', '54009'),
        ])),
        (10, 1, set([
            ('34017', '36061'),
            ('06029', '06037', '06075'),
            ('11001', '24510', '34013', '34039', '51013', '51760', '51840',
             '54009'),
            ('06059', ),
            ('36047', ),
            ('36081', ),
            ('06071', '08031', '41051', '41067'),
            ('36059', ),
            ('36005', ),
            ('01073', '55079'),
        ])),
        (10, 3, set([
            ('34013', '34017', '36061'),
            ('06029', '06037', '06075'),
            ('08031', '41051', '41067'),
            ('06059', '06071'),
            ('34039', '36047'),
            ('36081', ),
            ('36059', ),
            ('36005', ),
            ('01073', '55079'),
            ('11001', '24510', '51013', '51760', '51840', '54009'),
        ])),
        (10, 5, set([
            ('34013', '34017', '36061'),
            ('06029', '06037', '06075'),
            ('08031', '41051', '41067'),
            ('06059', '06071'),
            ('34039', '36047'),
            ('36081', ),
            ('36059', ),
            ('36005', ),
            ('01073', '55079'),
            ('11001', '24510', '51013', '51760', '51840', '54009'),
        ])),
        (5, 1, set([
            ('06029', '06037', '06075'),
            ('01073', '11001', '24510', '34013', '34017', '34039', '36047',
             '51013', '51760', '51840', '54009', '55079'),
            ('06059', ),
            ('36005', '36059', '36061', '36081'),
            ('06071', '08031', '41051', '41067'),
        ])),
        (5, 3, set([
            ('06029', '06037', '06075'),
            ('11001', '24510', '34013', '34017', '34039', '36005', '36047',
             '36059', '36061', '36081', '51013'),
            ('08031', '41051', '41067'),
            ('06059', '06071'),
            ('01073', '51760', '51840', '54009', '55079'),
        ])),
        (5, 5, set([
            ('06029', '06037', '06075'),
            ('08031', '41051', '41067'),
            ('06059', '06071'),
            ('01073', '55079'),
            ('11001', '24510', '34013', '34017', '34039', '36005', '36047',
             '36059', '36061', '36081', '51013', '51760', '51840', '54009'),
        ])),
    )

    checker = poc_simpletest.TestSuite()

    for num_clusters, num_iterations, expected_tuples in known_answers:

        # fresh singleton clusters for this case, one per table row
        fresh_clusters = [
            alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
            for row in county_table
        ]

        # compute student answer
        clustering = student.kmeans_clustering(fresh_clusters, num_clusters,
                                               num_iterations)
        actual_tuples = set_of_county_tuples(clustering)

        # message shown by the suite when the comparison below fails
        message = "".join([
            "\n\nTesting kmeans_custering on 24 county table, \nnum_clusters = ",
            str(num_clusters),
            "   num_iterations = ",
            str(num_iterations),
            "\n\nStudent county tuples: ",
            str(actual_tuples),
            "\n\nExpected county tuples: ",
            str(expected_tuples),
        ])
        checker.run_test(actual_tuples == expected_tuples, True, message)

    print("\n\n")
    checker.report_results()