Ejemplo n.º 1
0
def compute_distortion():
    '''
    Print the total distortion of the clusters returned by ``alp.run_example``.

    The data table is loaded from ``alp.DATA_111_URL``; the printed value is
    the sum of ``cluster_error(data_table)`` over every returned cluster.

    :return: None
    '''
    table = alp.load_data_table(alp.DATA_111_URL)
    clusters = alp.run_example()

    # One error value per cluster, folded into a single total.
    total_error = sum([cluster.cluster_error(table) for cluster in clusters])
    print(total_error)
Ejemplo n.º 2
0
def question10(URLs, min_number_clusters=6, max_number_clusters=20, steps=5, file_to_save="Question10.png"):
    '''
    Plot hierarchical vs. k-means distortion over a range of cluster counts.

    For each URL the data set is clustered once per cluster count in
    ``min_number_clusters..max_number_clusters`` (inclusive) with both
    algorithms. The summed ``cluster_error`` of every result is printed and
    plotted, and the figure is optionally saved before being cleared.

    :param URLs: iterable of data-set URLs; each must contain
        ``_<digits>.csv``, the digits being used as the county count in the
        printed messages, legend labels and output file name
    :param min_number_clusters: smallest number of clusters to evaluate
    :param max_number_clusters: largest number of clusters to evaluate
    :param steps: iteration count handed to ``kmeans_clustering``
    :param file_to_save: base image name; its extension is replaced by
        ``_<counties>.png`` for every URL. A falsy value skips saving.
    :return: None
    :raises ValueError: if a URL does not contain ``_<digits>.csv``
    '''
    import os  # function-scope import: only needed to split the output name

    cluster_counts = list(range(min_number_clusters, max_number_clusters + 1))

    for url in URLs:
        # Validate the URL first so a bad one fails before the costly
        # clustering runs instead of after them.
        match = re.search(r'_(\d+)\.csv', url)
        if match is None:
            raise ValueError("cannot find '_<digits>.csv' in URL: %r" % (url,))
        counties = match.group(1)

        # The data is reloaded before every clustering call, presumably
        # because the clustering helpers may modify the list they receive.
        # Every run is therefore independent and can go in ascending order.
        hierarchical_distortion_result = []
        for num in cluster_counts:
            data_table, singleton_list = alg_project3_viz.run_example(url)
            cluster_list_hierarchical = hierarchical_clustering(singleton_list, num)
            hierarchical_distortion_result.append(
                sum([cluster.cluster_error(data_table) for cluster in cluster_list_hierarchical]))

        kmeans_distortion_result = []
        for num in cluster_counts:
            data_table, singleton_list = alg_project3_viz.run_example(url)
            cluster_list_kmeans = kmeans_clustering(singleton_list, num, steps)
            kmeans_distortion_result.append(
                sum([cluster.cluster_error(data_table) for cluster in cluster_list_kmeans]))

        # Report the whole evaluated range; the result lists hold one value
        # per cluster count, not the value for a single count.
        print("hierarchical clustering with %s to %s clusters for %s counties %s"
              % (min_number_clusters, max_number_clusters, counties,
                 hierarchical_distortion_result))
        print("k-means clustering with %s to %s clusters for %s counties %s \n"
              % (min_number_clusters, max_number_clusters, counties,
                 kmeans_distortion_result))

        label_hierar = 'Distortion of hierarchical clustering for ' + counties
        label_kmeans = 'Distortion of k-means clustering for ' + counties

        plot.plot(cluster_counts, hierarchical_distortion_result, '-r', label=label_hierar)
        plot.plot(cluster_counts, kmeans_distortion_result, '-g', label=label_kmeans)

        plot.title('Hierarchical vs K-means distortion - PyCharm (IntelliJ)')
        plot.xlabel('Size of Cluster')
        plot.ylabel('Distortion')
        plot.legend(loc='upper right')
        plot.tight_layout()
        if file_to_save:
            # splitext copes with extensions of any length (or none at all).
            base_name = os.path.splitext(file_to_save)[0]
            plot.savefig(base_name + "_" + counties + '.png')
        # Start the next URL from an empty figure.
        plot.clf()
Ejemplo n.º 3
0
def question6(URL, file_to_save, number_clusters=9, iterations=5, centers=False):
    '''
    Cluster a data set with k-means and plot the resulting clusters.

    The plot without cluster centers is always written to ``file_to_save``.
    When ``centers`` is true, a second plot with centers is written first to
    ``<file_to_save without extension>with_centers.png``.

    :param URL: data-set location handed to ``alg_project3_viz.run_example``
    :param file_to_save: output image name for the plot without centers
    :param number_clusters: number of clusters requested from k-means
    :param iterations: iteration count handed to ``kmeans_clustering``
    :param centers: also produce the plot that shows cluster centers
    :return: None
    '''
    import os  # function-scope import: only needed to split the output name

    data_table, singleton_list = alg_project3_viz.run_example(URL)
    cluster_list = kmeans_clustering(singleton_list, number_clusters, iterations)
    print("Displaying %s k-means clusters" % len(cluster_list))

    if centers:
        # splitext instead of [:-4]: works for any extension length and does
        # not shadow the ``file`` builtin.
        centers_file = os.path.splitext(file_to_save)[0] + 'with_centers' + '.png'
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, centers_file, True)
    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, file_to_save, False)
Ejemplo n.º 4
0
def question2(URL, file_to_save, number_clusters=15, centers=False):
    '''
    Cluster a data set hierarchically and plot the resulting clusters.

    The plot without cluster centers is always written to ``file_to_save``.
    When ``centers`` is true, a second plot with centers is written first to
    ``<file_to_save without extension>with_centers.png``.

    :param URL: data-set location handed to ``alg_project3_viz.run_example``
    :param file_to_save: output image name for the plot without centers
    :param number_clusters: number of clusters requested
    :param centers: also produce the plot that shows cluster centers
    :return: None
    '''
    import os  # function-scope import: only needed to split the output name

    data_table, singleton_list = alg_project3_viz.run_example(URL)
    cluster_list = hierarchical_clustering(singleton_list, number_clusters)
    print("Displaying %s hierarchical clusters" % len(cluster_list))

    if centers:
        # splitext instead of [:-4]: works for any extension length and does
        # not shadow the ``file`` builtin.
        centers_file = os.path.splitext(file_to_save)[0] + 'with_centers' + '.png'
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, centers_file, True)
    # The plain file is drawn without centers; passing ``centers`` here made
    # it a duplicate of the with_centers image whenever centers was true.
    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, file_to_save, False)
Ejemplo n.º 5
0
def question7(URL, number_clusters=9, iterations=5):
    '''
    Print the distortion of hierarchical and k-means clustering for one data set.

    The distortion of a clustering is the sum of ``cluster_error(data_table)``
    over its clusters.

    :param URL: data-set location; must contain ``_<digits>.csv``, the digits
        being reported as the county count
    :param number_clusters: number of clusters requested from both algorithms
    :param iterations: iteration count handed to ``kmeans_clustering``
    :return: None
    :raises ValueError: if ``URL`` does not contain ``_<digits>.csv``
    '''
    # Validate the URL first so a bad one fails before any clustering work.
    match = re.search(r'_(\d+)\.csv', URL)
    if match is None:
        raise ValueError("cannot find '_<digits>.csv' in URL: %r" % (URL,))
    counties = match.group(1)

    data_table, singleton_list = alg_project3_viz.run_example(URL)
    cluster_list_hierarchical = hierarchical_clustering(singleton_list, number_clusters)
    hierarchical_distortion = sum(
        [cluster.cluster_error(data_table) for cluster in cluster_list_hierarchical])

    # Reload before k-means, as question10 does, so it starts from untouched
    # singleton clusters even if hierarchical_clustering modifies the list it
    # is given (NOTE(review): that helper is not visible here -- confirm).
    data_table, singleton_list = alg_project3_viz.run_example(URL)
    cluster_list_kmeans = kmeans_clustering(singleton_list, number_clusters, iterations)
    kmeans_distortion = sum(
        [cluster.cluster_error(data_table) for cluster in cluster_list_kmeans])

    print("Displaying distortion of hierarchical clustering for %s counties %s"
          % (counties, hierarchical_distortion))
    print("Displaying distortion of k-means clustering for %s counties %s"
          % (counties, kmeans_distortion))
0
def gen_random_clusters():
    '''
    Plot total clustering error against cluster count for ``alp.DATA_896_URL``.

    For every cluster count from 6 to 20, ``alp.run_example`` is called once
    with ``True`` and once with ``False``. The summed ``cluster_error`` of
    each result is drawn as its own curve; the legend labels the ``True``
    curve as hierarchical and the ``False`` curve as k-means clustering.

    :return: None
    '''
    table = alp.load_data_table(alp.DATA_896_URL)

    cluster_counts = []
    slow_totals = []
    fast_totals = []

    for count in range(6, 21):
        # Both clusterings are produced before any error is measured.
        slow_clusters = alp.run_example(count, True)
        fast_clusters = alp.run_example(count, False)

        slow_total = sum([cluster.cluster_error(table) for cluster in slow_clusters])
        fast_total = sum([cluster.cluster_error(table) for cluster in fast_clusters])

        cluster_counts.append(count)
        slow_totals.append(slow_total)
        fast_totals.append(fast_total)

    plt.plot(cluster_counts, slow_totals, '-r', label='hierarchical clustering')
    plt.plot(cluster_counts, fast_totals, '-b', label='kmeans clustering')
    plt.legend(loc='upper right')
    plt.title("Quality - Data Set of 896")
    plt.ylabel('Total error')
    plt.xlabel('Number of clusters')
    plt.show()