def test_sklearn_kmeans(assign_labels):
    """After fitting, the assign_labels_ attribute holds a scikit-learn KMeans."""
    model = SpectralClustering(
        n_components=25,
        random_state=0,
        assign_labels=assign_labels,
        kmeans_params={'n_clusters': 8},
    )
    model.fit(X)
    assert isinstance(model.assign_labels_, sklearn.cluster.KMeans)
def test_callable_affinity():
    """A callable affinity (an RBF kernel built with partial) is accepted by fit."""
    rbf_kernel = partial(
        metrics.pairwise.pairwise_kernels,
        metric="rbf",
        filter_params=True,
        gamma=1.0 / len(X),
    )
    model = SpectralClustering(affinity=rbf_kernel, gamma=None)
    model.fit(X)
# Exemple #3
# 0
def test_basic(as_ndarray, persist_embedding):
    """fit accepts both dask and materialized input and labels every row."""
    model = SpectralClustering(
        n_components=25, random_state=0, persist_embedding=persist_embedding
    )
    # Materialize X into an in-memory ndarray when the parametrization asks for it.
    data = X.compute() if as_ndarray else X
    model.fit(data)
    assert len(model.labels_) == len(data)
# Exemple #4
# 0
def test_affinity_raises():
    """fit rejects unknown affinity names (ValueError) and wrong types (TypeError)."""
    sc = SpectralClustering(affinity="foo")
    with pytest.raises(ValueError) as m:
        sc.fit(X)

    assert m.match("Unknown affinity metric name 'foo'")

    sc = SpectralClustering(affinity=np.array([]))
    with pytest.raises(TypeError) as m:
        sc.fit(X)

    # BUG FIX: this assertion was previously indented inside the `with` block,
    # where it never executed -- sc.fit raises, so control leaves the block
    # before reaching it.  It must sit outside, as in test_assign_labels_raises.
    assert m.match("Unexpected type for affinity 'ndarray'")
# Exemple #5
# 0
def test_assign_labels_raises():
    """fit rejects unrecognized assign_labels names and non-string values."""
    model = SpectralClustering(assign_labels="foo")
    with pytest.raises(ValueError) as excinfo:
        model.fit(X)

    assert excinfo.match("Unknown 'assign_labels' 'foo'")

    model = SpectralClustering(assign_labels=dict())
    with pytest.raises(TypeError) as excinfo:
        model.fit(X)

    assert excinfo.match("Invalid type ")
# Exemple #6
# 0
    def run(self):
        """Cluster article word embeddings with K-Means and Spectral clustering.

        Reads the CSV parts of the embeddings for ``self.word_vectors``,
        builds a dask-array design matrix, fits both clustering models with
        ``self.num_clusters`` clusters, writes each labelled result set to
        its own directory under ``config.CLUSTERING_RESULTS_DIR``, and
        finally writes a success flag to the task output.

        Raises:
            ValueError: if ``self.word_vectors`` is not 'fasttext' or 'word2vec'.
        """
        if self.word_vectors not in {"fasttext", "word2vec"}:
            raise ValueError(
                f'Expected fasttext or word2vec; got {self.word_vectors}')

        print(
            f'Initializing dask dataframe of word embeddings at {datetime.now()}'
        )
        ddf = dask.dataframe.read_csv(config.ARTICLE_EMBEDDINGS_DIR /
                                      f'{self.word_vectors}_to_csv' / "*.part")

        print(
            f'Dropping columns and converting to design matrix (dask array) at {datetime.now()}'
        )
        # Keep only the embedding columns; lengths=True computes concrete chunk
        # sizes so the array's rows stay aligned with the dataframe partitions
        # when the labels are attached back below.
        X = ddf.drop(['Unnamed: 0', "id", "url", "title"], axis=1)
        X = X.to_dask_array(lengths=True)

        # Perform k-means clustering
        print(f'Starting K-Means clustering at {datetime.now()}')
        k_means_clustering_model = KMeans(n_clusters=self.num_clusters,
                                          n_jobs=-1,
                                          max_iter=config.K_MEANS_MAX_ITER)
        # BUG FIX: fit() returns the fitted estimator, not the label vector;
        # the per-row cluster assignments live on ``labels_`` after fitting.
        k_means_clustering_model.fit(X)
        k_means_cluster_labels = k_means_clustering_model.labels_

        # Write k-means results to disk
        print(
            f'Joining K-means results and writing to disk at {datetime.now()}')
        # BUG FIX: the previous code joined the model object itself into the
        # dataframe, which is not a valid join; attach the label vector as a
        # new column instead (row order matches because X was derived from ddf
        # with known chunk lengths).
        k_means_results_ddf = ddf.assign(cluster=k_means_cluster_labels)
        k_means_ddf_output_path = config.CLUSTERING_RESULTS_DIR / f'{self.word_vectors}_w_k_means'
        k_means_ddf_output_path.mkdir(parents=True, exist_ok=True)
        dask.dataframe.to_csv(k_means_results_ddf, k_means_ddf_output_path)

        # Perform spectral clustering
        print(f'Starting Spectral clustering at {datetime.now()}')
        spectral_clustering_model = SpectralClustering(
            n_clusters=self.num_clusters,
            n_jobs=-1,
            persist_embedding=True,
            kmeans_params={"max_iter": config.K_MEANS_MAX_ITER})
        # Same fix as above: fit, then read the labels_ attribute.
        spectral_clustering_model.fit(X)
        spectral_cluster_labels = spectral_clustering_model.labels_

        # Write spectral results to disk
        print(
            f'Joining Spectral results and writing to disk at {datetime.now()}'
        )
        spectral_results_ddf = ddf.assign(cluster=spectral_cluster_labels)
        spectral_ddf_output_path = config.CLUSTERING_RESULTS_DIR / f'{self.word_vectors}_w_spectral'
        spectral_ddf_output_path.mkdir(parents=True, exist_ok=True)
        dask.dataframe.to_csv(spectral_results_ddf, spectral_ddf_output_path)

        # And save the success flag (presumably a luigi task target -- the
        # output() method is defined elsewhere on this class).
        with self.output().open("w") as f:
            f.write(f'{self.word_vectors}: Success!')
# Exemple #7
# 0
def test_n_components_raises():
    """Requesting as many components as samples is rejected with a ValueError."""
    model = SpectralClustering(n_components=len(X))
    with pytest.raises(ValueError) as excinfo:
        model.fit(X)
    assert excinfo.match("n_components")
def test_callable_affinity():
    """A callable affinity without an explicit gamma is accepted by fit."""
    rbf_kernel = partial(metrics.pairwise.pairwise_kernels,
                         metric='rbf',
                         filter_params=True)
    model = SpectralClustering(affinity=rbf_kernel)
    model.fit(X)
def test_basic(data, persist_embedding):
    """Fitting a parametrized input produces one label per row of X."""
    model = SpectralClustering(n_components=25,
                               random_state=0,
                               persist_embedding=persist_embedding)
    model.fit(data)
    # NOTE(review): this compares against len(X) although the fit used `data`;
    # presumably the `data` fixture has the same length as X -- confirm.
    assert len(model.labels_) == len(X)