Example #1
# Imports assumed for running this cuML test standalone; the original test
# module may organize them differently.
import cupy as cp
from cuml.cluster import AgglomerativeClustering
from cuml.datasets import make_blobs
from cuml.metrics import adjusted_rand_score
from sklearn import cluster


def test_single_linkage_sklearn_compare(nrows, ncols, nclusters, k,
                                        connectivity):

    X, y = make_blobs(int(nrows),
                      ncols,
                      nclusters,
                      cluster_std=1.0,
                      shuffle=False)

    cuml_agg = AgglomerativeClustering(n_clusters=nclusters,
                                       affinity='euclidean',
                                       linkage='single',
                                       n_neighbors=k,
                                       connectivity=connectivity)

    cuml_agg.fit(X)

    sk_agg = cluster.AgglomerativeClustering(n_clusters=nclusters,
                                             affinity='euclidean',
                                             linkage='single')
    sk_agg.fit(cp.asnumpy(X))

    # Cluster assignments should be exact, even though the actual
    # labels may differ
    assert adjusted_rand_score(cuml_agg.labels_, sk_agg.labels_) == 1.0
    assert cuml_agg.n_connected_components_ == sk_agg.n_connected_components_
    assert cuml_agg.n_leaves_ == sk_agg.n_leaves_
    assert cuml_agg.n_clusters_ == sk_agg.n_clusters_
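This reads like a parametrized pytest test from cuML's test suite. A
hypothetical parametrization is sketched below; the grids are illustrative
assumptions, not the ones from the actual suite.

import pytest


@pytest.mark.parametrize('nrows', [1000])
@pytest.mark.parametrize('ncols', [25, 50])
@pytest.mark.parametrize('nclusters', [2, 10])
@pytest.mark.parametrize('k', [15])
@pytest.mark.parametrize('connectivity', ['knn', 'pairwise'])
def test_single_linkage_sklearn_compare(nrows, ncols, nclusters, k,
                                        connectivity):
    ...  # body as above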
Example #2
# Imports assumed for standalone use; accuracy_0, accuracy_1, get_keys and
# features_label are helpers defined elsewhere in the original module.
import numpy as np
from sklearn import cluster
from sklearn.metrics import silhouette_score


def hierarchical(data, accuracy, si_scores):
    combination = []
    data = np.array(data)
    # data=data.T
    # clust = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward')
    clust = cluster.AgglomerativeClustering(n_clusters=2)
    label = clust.fit_predict(data.T)
    print(label)
    label = list(label)
    acc_0 = accuracy_0(label)
    acc_1 = accuracy_1(label)
    for i in range(len(data)):
        combination.append(get_keys(features_label, data[i].tolist()))
    print('feature combination: {}'.format(combination))
    print('accuracy_0: {}'.format(acc_0))
    # print('accuracy_1: {}'.format(acc_1))
    # Evaluate the model quality with the silhouette coefficient
    si_score = silhouette_score(data.T,
                                label,
                                metric='euclidean',
                                sample_size=len(data.T))
    print('si_score: {:.4f}'.format(si_score))
    accuracy.append(acc_0)
    si_scores.append(si_score)
    return combination, acc_0, si_score, accuracy, si_scores
Example #3
# Imports assumed: typing.Iterable, numpy, and scikit-learn's cluster and
# metrics modules.
from typing import Iterable

import numpy as np
from sklearn import cluster, metrics


def evalClusteringOnLabels(
        SentenceEmbeddings: Iterable[Iterable[float]],
        groupLabels: Iterable,
        verbose=True) -> Iterable[float]:  # pragma: no cover
    """
    Evaluate a vector of sentence embeddings for clustering around
        pre-defined categories.

    For example, you can test that embeddings in an ad campaign cluster 
        around the campaigns, by passing the ad embeddings and the campaign labels.

    Since cluster evaluation is hard, we use agglomerative hierarchical with euclidean/ward linkage. 
    See below for why this specific algorithm was chosen. We also pass through three metrics:
        adjusted mutual information score,
        adj. Rand index,
        Fowlkes-Mallows score

    The result is a vector of all results in order (
        adj. MI score, adj. Rand score, F-M score
    ) for the agglomerative clustering.

    :param SentenceEmbeddings: 
        A vector of sentence embedding vectors. Co-indexed with groupLabels
    :param groupLabels: 
        Category labels for the sentences. Co-indexed with SentenceEmbeddings
    :param verbose: 
        prints a result table if true.

    :returns: a np.array of all index score results
    -------------------------------
    Algorithm choice:
        We need a clustering algorithm that does not assume cluster shape,
        is stable across runs, and is stable across parameter choices,
        so that the evaluation is as deterministic as possible.

        This means the following are unacceptable: k-means (unstable across runs), 
        spectral (unstable across parameters), mean shift (assumes globular shape).

        This leaves DBSCAN and agglomerative clustering. 
        
        DBSCAN tends to perform poorly on clusters of word embeddings. 
        It seems they are not clustered by density.

        Agglomerative has an added advantage: on normalized embeddings,
            the euclidean metric is equivalent to the well-liked cosine distance
            (for semantic similarity). Thus agglomerative is our choice.
    """
    n_clusters = len(set(groupLabels))
    agglo = cluster.AgglomerativeClustering(
        n_clusters=n_clusters, affinity='euclidean',
        linkage='ward').fit(SentenceEmbeddings).labels_
    results = []
    results.append(metrics.adjusted_mutual_info_score(agglo, groupLabels))
    results.append(metrics.adjusted_rand_score(agglo, groupLabels))
    results.append(metrics.fowlkes_mallows_score(agglo, groupLabels))
    if verbose:
        print("adj. MI score:   {0:.2f}".format(results[0]))
        print("adj. RAND score: {0:.2f}".format(results[1]))
        print("F-M score:       {0:.2f}".format(results[2]))
    return np.array(results)
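A quick check of the docstring's claim that, on normalized embeddings,
euclidean distance agrees with cosine distance: for unit vectors u and v,
||u - v||^2 = 2 * (1 - cos(u, v)). A minimal numpy sketch:

import numpy as np

rng = np.random.default_rng(0)
u, v = rng.normal(size=(2, 300))
u /= np.linalg.norm(u)  # normalize to unit length
v /= np.linalg.norm(v)

sq_dist = np.sum((u - v) ** 2)  # squared euclidean distance
cos_sim = float(u @ v)          # cosine similarity of unit vectors
assert np.isclose(sq_dist, 2 * (1 - cos_sim))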
Example #4
# Assumes `from sklearn import cluster` at module level, as in the examples above.
def hierarchy(feat, n_clusters=2, knn=30):
    from sklearn.neighbors import kneighbors_graph
    knn_graph = kneighbors_graph(feat, knn, include_self=False)
    hierarchy = cluster.AgglomerativeClustering(n_clusters=n_clusters,
                                                connectivity=knn_graph,
                                                linkage='ward').fit(feat)
    return hierarchy.labels_
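A hypothetical call to hierarchy() on synthetic blobs; the make_blobs
parameters are illustrative only.

from sklearn.datasets import make_blobs

feat, _ = make_blobs(n_samples=500, n_features=16, centers=4, random_state=0)
labels = hierarchy(feat, n_clusters=4, knn=15)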
Example #5
# Assumes `from sklearn import cluster` at module level.
def cluster_faces_by_Agglomerative(data, threshold=1.0):
    encodings = [d["encoding"] for d in data]
    clt = cluster.AgglomerativeClustering(distance_threshold=threshold,
                                          n_clusters=None)
    clt.fit(encodings)
    # print(clt.labels_)
    return clt.labels_
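When distance_threshold is set, scikit-learn requires n_clusters=None and
derives the number of clusters from where the tree is cut (exposed as
clt.n_clusters_ after fitting). A minimal sketch with fake 128-dimensional
"encodings"; the dimensionality and threshold are illustrative.

import numpy as np

rng = np.random.default_rng(1)
fake_data = [{"encoding": rng.normal(size=128)} for _ in range(20)]
labels = cluster_faces_by_Agglomerative(fake_data, threshold=12.0)
print(len(set(labels)), "clusters found")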
Example #6
# Imports assumed: matplotlib and scikit-learn.
import matplotlib.pyplot as plt
from sklearn import cluster
from sklearn.metrics import adjusted_rand_score


def test_AgglomerativeClustering_linkage(*data):
    '''
    Test clustering performance with different linkage criteria.
    :param data: data, target
    :return: None
    '''
    X, labels_true = data
    nums = range(1, 50)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    linkages = ['ward', 'complete', 'average']
    markers = "+o*"
    for i, linkage in enumerate(linkages):
        ARIs = []
        for num in nums:
            clst = cluster.AgglomerativeClustering(n_clusters=num,
                                                   linkage=linkage)
            predicted_labels = clst.fit_predict(X)
            ARIs.append(adjusted_rand_score(labels_true, predicted_labels))
        ax.plot(nums,
                ARIs,
                marker=markers[i],
                label="linkage:{0}".format(linkage))

    ax.set_xlabel("n_clusters")
    ax.set_ylabel("ARI")
    ax.legend(loc="best")
    fig.suptitle("AgglomerativeClustering")
    plt.show()
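An illustrative invocation with synthetic blobs; the original presumably
loads its own dataset.

from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=1000, centers=4, random_state=0)
test_AgglomerativeClustering_linkage(X, y)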
Example #7
# Imports assumed for this Flask endpoint: flask, pandas, ast, and
# scikit-learn's StandardScaler and cluster module.
import ast

import pandas as pd
from flask import jsonify, request
from sklearn import cluster
from sklearn.preprocessing import StandardScaler


def get_assignments():

	key2val = request.args.to_dict()

	n_clusters = int(key2val["n_clusters"]) if "n_clusters" in key2val else 3

	data = pd.read_csv("../data/clustering/mergedforCluster.csv")
	fips = data["FIPS Code"].copy().values
	columns = set(data.columns.values)
	del data["FIPS Code"]

	X = data.fillna(0)

	if "subset" in key2val and len(ast.literal_eval(key2val["subset"])) > 0:
		subset = [s for s in ast.literal_eval(key2val["subset"]) if s in columns]
		X = X[subset]		

	db = cluster.AgglomerativeClustering(n_clusters=n_clusters, linkage="ward")
	X = StandardScaler().fit_transform(X)
	db.fit(X)

	response = []

	fips = [str(f) for f in fips]
	labels = [int(l) for l in db.labels_]

	for kv in zip(fips, labels):
		response.append({"fips": kv[0], "cluster": kv[1]})

	return jsonify(response)
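A hypothetical client call. The route path is not visible in the snippet,
so "/assignments" is an assumption, and the column names are made up; note
that "subset" is sent as a string because the endpoint parses it with
ast.literal_eval.

import requests

resp = requests.get(
    "http://localhost:5000/assignments",                     # assumed route
    params={"n_clusters": 4, "subset": "['colA', 'colB']"},  # illustrative columns
)
print(resp.json()[:3])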
Example #8
# Imports assumed here and in Example #9: scikit-learn's cluster module
# and v_measure_score.
from sklearn import cluster
from sklearn.metrics import v_measure_score


def Agg(tfidf, clusters):
    agg = cluster.AgglomerativeClustering(n_clusters=89,
                                          linkage='average').fit_predict(tfidf)
    # result = normalized_mutual_info_score(agg, clusters)
    result = v_measure_score(agg, clusters)

    print("the Aggiomerative clustering cluster algorithm result is: ", result)
Example #9
def Ward(tfidf, clusters):
    ward = cluster.AgglomerativeClustering(n_clusters=101, linkage='ward').fit(tfidf)
    # result = normalized_mutual_info_score(ward.labels_, clusters)
    result = v_measure_score(ward.labels_, clusters)

    print("the Ward hierarchical clustering cluster algorithm result is: ",
          result)
Example #10
# Method excerpted from a class; `self.Feature` is assumed to hold the
# feature matrix, and `from sklearn import cluster` is assumed at module
# level, as in the other examples.
def ClusterTrain(self, component=2, model='Agglomerative'):
    """Use a clustering method to divide the samples into categories
    in an unsupervised way. Available models:
        1. Spectral Clustering
        2. Agglomerative Clustering
        3. MiniBatch KMeans

    Parameters
    ----------
    component: int, the number of clusters to form.
    model: string, the clustering model to use.
    """
    print('-' * 49 + '\n' + 'Clustering\n' + '-' * 49)
    clusterlist = {
        'spectral':
        cluster.SpectralClustering(n_clusters=component,
                                   eigen_solver='arpack',
                                   affinity="nearest_neighbors",
                                   random_state=0),
        'Agglomerative':
        cluster.AgglomerativeClustering(n_clusters=component,
                                        linkage='ward'),
        'MiniBatch':
        cluster.MiniBatchKMeans(n_clusters=component)
    }
    MyCluster = clusterlist[model]
    return MyCluster.fit_predict(self.Feature)
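A hypothetical host class for the excerpted method above; only a Feature
attribute is required.

import numpy as np


class FeatureClusterer:
    def __init__(self, feature):
        self.Feature = feature


FeatureClusterer.ClusterTrain = ClusterTrain  # attach the method above

labels = FeatureClusterer(np.random.rand(100, 8)).ClusterTrain(
    component=3, model='MiniBatch')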