Example #1
0
def clusterData(data, clust, results, to_plot):
    """Cluster ``data`` with the algorithm named in ``clust`` and summarise it.

    The estimator is selected by ``clust['clustering_type']``; recognised
    values are 'kmeans', 'spectral', 'birch', 'dbscan',
    'affinity_propagation', 'ward' and 'average_linkage'.  The fitted
    estimator's ``labels_`` are stored in ``results['cluster_labels']`` for
    every recognised type, so the summary below always describes the
    clustering that was just computed.

    Args:
        data: samples handed to the estimator's ``fit`` (and to
            ``kneighbors_graph`` for the agglomerative variants).
        clust: configuration mapping.  Reads 'clustering_type', 'exp' (when
            plotting) and, for most algorithms, 'n_clusters'.  For 'kmeans'
            the fitted centres are written back to ``clust['centers']``.
        results: mapping that receives 'cluster_labels'; it is then passed
            to ``csr.clustDfFromRes``.
        to_plot: when truthy, the fitted estimator is plotted through
            ``plot.PlotData``.  This flag is honoured for every algorithm.

    Returns:
        Tuple ``(df, stats)`` where ``df`` is ``csr.clustDfFromRes(results)``
        and ``stats`` is ``csr.clusterResults(df, clust)``.
    """
    plot_sample_size = 6000
    method = clust['clustering_type']
    model = None

    if method == 'kmeans':
        # TODO kmeans works well even on 2.000.000 questions
        model = KMeans(init='k-means++', n_clusters=clust['n_clusters'], n_init=10)
    elif method == 'spectral':
        model = cluster.SpectralClustering(n_clusters=clust['n_clusters'],
                                           eigen_solver='arpack',
                                           affinity="nearest_neighbors")
    elif method == 'birch':
        # Historically the cluster count for birch was read from `results`;
        # keep honouring it when present, otherwise use the same `clust`
        # setting every other algorithm uses instead of raising KeyError.
        if 'n_clusters' in results:
            n_clusters = results['n_clusters']
        else:
            n_clusters = clust['n_clusters']
        model = cluster.Birch(n_clusters=n_clusters)
    elif method == 'dbscan':
        model = cluster.DBSCAN(eps=.2)
    elif method == 'affinity_propagation':
        model = cluster.AffinityPropagation(damping=.9, preference=-200)
    elif method in ('ward', 'average_linkage'):
        # k-nearest-neighbours connectivity constraint for structured
        # agglomerative clustering, made symmetric by averaging with its
        # transpose.
        connectivity = kneighbors_graph(data, n_neighbors=10, include_self=False)
        connectivity = 0.5 * (connectivity + connectivity.T)
        if method == 'ward':
            model = cluster.AgglomerativeClustering(n_clusters=clust['n_clusters'],
                                                    linkage='ward',
                                                    connectivity=connectivity)
        else:
            # NOTE(review): recent scikit-learn releases rename `affinity`
            # to `metric`; kept as-is for the version this file targets.
            model = cluster.AgglomerativeClustering(linkage="average",
                                                    affinity="cityblock",
                                                    n_clusters=clust['n_clusters'],
                                                    connectivity=connectivity)

    # An unrecognised clustering type leaves `model` as None: nothing is
    # fitted and `results` is summarised unchanged, as before.
    if model is not None:
        model.fit(data)
        if method == 'kmeans':
            clust['centers'] = model.cluster_centers_
        results['cluster_labels'] = model.labels_
        if method == 'birch':
            # Single pre-formatted argument so the output is identical under
            # Python 2 and Python 3.
            print('number of entries clustered %d' % len(results['cluster_labels']))
        if to_plot:
            plot.PlotData(data, model, plot_sample_size, clust['exp'])

    df = csr.clustDfFromRes(results)
    stats = csr.clusterResults(df, clust)
    return df, stats
Example #2
0
def clusterData(data, clust, results, to_plot):
    """Cluster ``data`` with the algorithm named in ``clust`` and summarise it.

    The estimator is selected by ``clust['clustering_type']``; recognised
    values are 'kmeans', 'spectral', 'birch', 'dbscan',
    'affinity_propagation', 'ward' and 'average_linkage'.  The fitted
    estimator's ``labels_`` are stored in ``results['cluster_labels']`` for
    every recognised type, so the summary below always describes the
    clustering that was just computed.

    Args:
        data: samples handed to the estimator's ``fit`` (and to
            ``kneighbors_graph`` for the agglomerative variants).
        clust: configuration mapping.  Reads 'clustering_type', 'exp' (when
            plotting) and, for most algorithms, 'n_clusters'.  For 'kmeans'
            the fitted centres are written back to ``clust['centers']``.
        results: mapping that receives 'cluster_labels'; it is then passed
            to ``csr.clustDfFromRes``.
        to_plot: when truthy, the fitted estimator is plotted through
            ``plot.PlotData``.  This flag is honoured for every algorithm.

    Returns:
        Tuple ``(df, stats)`` where ``df`` is ``csr.clustDfFromRes(results)``
        and ``stats`` is ``csr.clusterResults(df, clust)``.
    """
    plot_sample_size = 6000
    method = clust['clustering_type']
    model = None

    if method == 'kmeans':
        # TODO kmeans works well even on 2.000.000 questions
        model = KMeans(init='k-means++',
                       n_clusters=clust['n_clusters'],
                       n_init=10)
    elif method == 'spectral':
        model = cluster.SpectralClustering(n_clusters=clust['n_clusters'],
                                           eigen_solver='arpack',
                                           affinity="nearest_neighbors")
    elif method == 'birch':
        # Historically the cluster count for birch was read from `results`;
        # keep honouring it when present, otherwise use the same `clust`
        # setting every other algorithm uses instead of raising KeyError.
        if 'n_clusters' in results:
            n_clusters = results['n_clusters']
        else:
            n_clusters = clust['n_clusters']
        model = cluster.Birch(n_clusters=n_clusters)
    elif method == 'dbscan':
        model = cluster.DBSCAN(eps=.2)
    elif method == 'affinity_propagation':
        model = cluster.AffinityPropagation(damping=.9,
                                            preference=-200)
    elif method in ('ward', 'average_linkage'):
        # k-nearest-neighbours connectivity constraint for structured
        # agglomerative clustering, made symmetric by averaging with its
        # transpose.
        connectivity = kneighbors_graph(data,
                                        n_neighbors=10,
                                        include_self=False)
        connectivity = 0.5 * (connectivity + connectivity.T)
        if method == 'ward':
            model = cluster.AgglomerativeClustering(
                n_clusters=clust['n_clusters'],
                linkage='ward',
                connectivity=connectivity)
        else:
            # NOTE(review): recent scikit-learn releases rename `affinity`
            # to `metric`; kept as-is for the version this file targets.
            model = cluster.AgglomerativeClustering(
                linkage="average",
                affinity="cityblock",
                n_clusters=clust['n_clusters'],
                connectivity=connectivity)

    # An unrecognised clustering type leaves `model` as None: nothing is
    # fitted and `results` is summarised unchanged, as before.
    if model is not None:
        model.fit(data)
        if method == 'kmeans':
            clust['centers'] = model.cluster_centers_
        results['cluster_labels'] = model.labels_
        if method == 'birch':
            # Single pre-formatted argument so the output is identical under
            # Python 2 and Python 3.
            print('number of entries clustered %d' %
                  len(results['cluster_labels']))
        if to_plot:
            plot.PlotData(data, model, plot_sample_size, clust['exp'])

    df = csr.clustDfFromRes(results)
    stats = csr.clusterResults(df, clust)
    return df, stats