Ejemplo n.º 1
0
def compute_kmeans(corpus: VectorizedCorpus, tokens: List[str] = None, n_clusters: int = 8, **kwargs):
    """Cluster the columns of the corpus matrix with `sklearn.cluster.KMeans`.

    Reference: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

    Args:
        corpus: Corpus whose `data` matrix is clustered.
        tokens: Tokens whose columns (looked up via `corpus.token_indices`) are selected.
            When None, every column of `corpus.data` is used.
        n_clusters: Number of clusters, forwarded to `KMeans`.
        **kwargs: Additional keyword arguments forwarded to the `KMeans` constructor.

    Returns:
        A `KMeansCorpusClusters` built from `corpus`, `tokens` and a `KMeansResult`
        holding the fitted model's `cluster_centers_` and `labels_`.
    """
    if tokens is None:
        matrix = corpus.data
    else:
        matrix = corpus.data[:, corpus.token_indices(tokens)]

    # The matrix is transposed before fitting, so each column of `corpus.data`
    # is one sample for the estimator.
    model = sklearn.cluster.KMeans(n_clusters=n_clusters, **kwargs)
    model.fit(matrix.T)

    result = KMeansResult(centroids=model.cluster_centers_, labels=model.labels_)
    return KMeansCorpusClusters(corpus, tokens, result)
Ejemplo n.º 2
0
def compute_kmeans2(corpus: VectorizedCorpus, tokens: List[str] = None, n_clusters: int = 8, **kwargs):
    """Computes KMeans clusters using `scipy.cluster.vq.kmeans2`.

    Reference: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.vq.kmeans2.html

    Args:
        corpus: Corpus whose `data` matrix is clustered.
        tokens: Tokens whose columns (looked up via `corpus.token_indices`) are selected.
            When None, every column of `corpus.data` is used.
        n_clusters: Passed to `kmeans2` as its second positional argument (`k`).
        **kwargs: Additional keyword arguments forwarded to `kmeans2`.

    Returns:
        A `KMeansCorpusClusters` built from `corpus`, `tokens` and a `KMeansResult`
        holding the centroids and labels returned by `kmeans2`.
    """
    data: scipy.sparse.spmatrix = corpus.data if tokens is None else corpus.data[:, corpus.token_indices(tokens)]

    # `toarray()` yields a plain `numpy.ndarray`. `todense()` would produce a
    # `numpy.matrix`, whose use NumPy discourages and whose `*` operator means
    # matrix multiplication rather than element-wise multiplication.
    # The transpose makes each column of the selected data one observation.
    observations = data.T.toarray()

    # Non-floating dtypes are promoted to float64 before clustering.
    if not np.issubdtype(observations.dtype, np.floating):
        observations = observations.astype(np.float64)

    centroids, labels = scipy.cluster.vq.kmeans2(observations, n_clusters, **kwargs)

    return KMeansCorpusClusters(corpus, tokens, KMeansResult(centroids=centroids, labels=labels))
Ejemplo n.º 3
0
def compute_hca(
    corpus: VectorizedCorpus, tokens: List[str], linkage_method: str = 'ward', linkage_metric: str = 'euclidean'
) -> HCACorpusClusters:
    """Computes HCA clusters using `scipy.cluster.hierarchy.linkage`.

    Reference: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html

    Args:
        corpus: Corpus whose `data` matrix is clustered.
        tokens: Tokens whose columns (looked up via `corpus.token_indices`) are selected.
            When None, every column of `corpus.data` is used.
        linkage_method: Forwarded to `linkage` as `method`.
        linkage_metric: Forwarded to `linkage` as `metric`.

    Returns:
        An `HCACorpusClusters` built from `corpus`, `tokens` and the linkage matrix.
    """
    data = corpus.data if tokens is None else corpus.data[:, corpus.token_indices(tokens)]

    # `toarray()` yields a plain `numpy.ndarray`; `todense()` would produce a
    # `numpy.matrix`, whose use NumPy discourages. The transpose makes each
    # column of the selected data one observation.
    observations = data.T.toarray()

    linkage_matrix = linkage(observations, method=linkage_method, metric=linkage_metric)
    # From the `linkage` documentation:
    #
    #   A (n-1) by 4 matrix Z is returned. At the i-th iteration, clusters with
    #   token_ids Z[i, 0] and Z[i, 1] are combined to form cluster n + i.
    #   A cluster with an index less than n corresponds to one of the original
    #   observations.
    #   The distance between clusters Z[i, 0] and Z[i, 1] is given by Z[i, 2].
    #   The fourth value Z[i, 3] represents the number of original observations
    #   in the newly formed cluster.

    return HCACorpusClusters(corpus, tokens, linkage_matrix)
Ejemplo n.º 4
0
def test_token_indices(corpus: VectorizedCorpus):
    """Check that `token_indices` maps the tokens 'a', 'c' and 'z' to `[0, 2]`.

    The expected list has two entries for three requested tokens, so presumably
    'z' is absent from the fixture's vocabulary and is skipped (verify against
    the `corpus` fixture).
    """
    requested = ['a', 'c', 'z']
    expected = [0, 2]

    found = corpus.token_indices(requested)

    assert found == expected