def test_sklearn_kmeans(assign_labels):
    """Check that the fitted ``assign_labels_`` is a scikit-learn ``KMeans``.

    ``assign_labels`` is forwarded unchanged to ``SpectralClustering``.
    """
    estimator_kwargs = {
        "n_components": 25,
        "random_state": 0,
        "assign_labels": assign_labels,
        "kmeans_params": {'n_clusters': 8},
    }
    clusterer = SpectralClustering(**estimator_kwargs)
    clusterer.fit(X)

    fitted_assigner = clusterer.assign_labels_
    assert isinstance(fitted_assigner, sklearn.cluster.KMeans)
def test_callable_affinity():
    """Fit with a callable affinity (an RBF ``pairwise_kernels`` partial).

    The kernel's ``gamma`` is bound in the partial, so the estimator itself
    is created with ``gamma=None``.
    """
    kernel_options = {
        "metric": "rbf",
        "filter_params": True,
        "gamma": 1.0 / len(X),
    }
    rbf_kernel = partial(metrics.pairwise.pairwise_kernels, **kernel_options)

    clusterer = SpectralClustering(affinity=rbf_kernel, gamma=None)
    clusterer.fit(X)
def test_basic(as_ndarray, persist_embedding):
    """Fit on ``X`` (or on ``X.compute()`` when ``as_ndarray`` is truthy).

    Asserts that the fitted estimator exposes one label per input row.
    """
    clusterer = SpectralClustering(
        n_components=25,
        random_state=0,
        persist_embedding=persist_embedding,
    )

    # Materialise the module-level fixture only when requested.
    samples = X.compute() if as_ndarray else X

    clusterer.fit(samples)
    assert len(clusterer.labels_) == len(samples)
def test_affinity_raises():
    """Invalid ``affinity`` values are rejected when ``fit`` is called.

    An unknown string must raise ``ValueError``; an ndarray must raise
    ``TypeError``. Both messages are checked by regex search.
    """
    unknown_name = SpectralClustering(affinity="foo")
    with pytest.raises(ValueError, match="Unknown affinity metric name 'foo'"):
        unknown_name.fit(X)

    wrong_type = SpectralClustering(affinity=np.array([]))
    with pytest.raises(TypeError, match="Unexpected type for affinity 'ndarray'"):
        wrong_type.fit(X)
def test_assign_labels_raises():
    """Invalid ``assign_labels`` values are rejected when ``fit`` is called.

    An unknown string must raise ``ValueError``; a dict must raise
    ``TypeError``. Both messages are checked by regex search.
    """
    unknown_name = SpectralClustering(assign_labels="foo")
    with pytest.raises(ValueError, match="Unknown 'assign_labels' 'foo'"):
        unknown_name.fit(X)

    wrong_type = SpectralClustering(assign_labels={})
    with pytest.raises(TypeError, match="Invalid type "):
        wrong_type.fit(X)
def run(self):
    """Cluster the article embeddings with K-Means and Spectral clustering.

    Reads the embedding CSV parts for ``self.word_vectors``, fits both models
    with ``self.num_clusters`` clusters, and for each model writes the input
    rows plus a cluster-label column as CSV parts under
    ``config.CLUSTERING_RESULTS_DIR``:

    * ``<word_vectors>_w_k_means``  (label column ``k_means_cluster``)
    * ``<word_vectors>_w_spectral`` (label column ``spectral_cluster``)

    Finally writes the success flag to ``self.output()``.

    Raises:
        ValueError: if ``self.word_vectors`` is neither "fasttext" nor
            "word2vec".
    """
    # Function-scope import: binds only ``da`` and does not rely on how the
    # module itself imports dask.
    import dask.array as da

    if self.word_vectors not in {"fasttext", "word2vec"}:
        raise ValueError(
            f'Expected fasttext or word2vec; got {self.word_vectors}')

    print(
        f'Initializing dask dataframe of word embeddings at {datetime.now()}'
    )
    ddf = dask.dataframe.read_csv(
        config.ARTICLE_EMBEDDINGS_DIR / f'{self.word_vectors}_to_csv' / "*.part")

    print(
        f'Dropping columns and converting to design matrix (dask array) at {datetime.now()}'
    )
    X = ddf.drop(['Unnamed: 0', "id", "url", "title"], axis=1)
    # lengths=True computes the chunk sizes so the array has a known shape.
    X = X.to_dask_array(lengths=True)

    def _write_with_labels(labels, column, dir_name):
        """Attach *labels* to ``ddf`` as *column* and write CSV parts to *dir_name*."""
        # Give the labels the same row-chunking as X (one chunk per ddf
        # partition) so they can be attached partition-by-partition. The
        # labels are attached positionally rather than joined on the index.
        # NOTE(review): assumes read_csv leaves the index non-unique across
        # partitions, which would make an index join unsafe - confirm.
        aligned = da.asarray(labels).rechunk((X.chunks[0],))
        labelled_ddf = ddf.assign(**{column: aligned})

        output_path = config.CLUSTERING_RESULTS_DIR / dir_name
        output_path.mkdir(parents=True, exist_ok=True)
        dask.dataframe.to_csv(labelled_ddf, output_path)

    # Perform k-means clustering
    print(f'Starting K-Means clustering at {datetime.now()}')
    k_means_clustering_model = KMeans(n_clusters=self.num_clusters,
                                      n_jobs=-1,
                                      max_iter=config.K_MEANS_MAX_ITER)
    # The original code used the return value of fit() as the labels.
    # NOTE(review): assumes the scikit-learn estimator convention (fit()
    # returns the estimator, labels are exposed as ``labels_``) - confirm
    # against the KMeans / SpectralClustering classes imported by this module.
    k_means_clustering_model.fit(X)
    k_means_cluster_labels = k_means_clustering_model.labels_

    # Write k-means results to disk
    print(
        f'Joining K-means results and writing to disk at {datetime.now()}')
    _write_with_labels(k_means_cluster_labels, "k_means_cluster",
                       f'{self.word_vectors}_w_k_means')

    # Perform spectral clustering
    print(f'Starting Spectral clustering at {datetime.now()}')
    spectral_clustering_model = SpectralClustering(
        n_clusters=self.num_clusters,
        n_jobs=-1,
        persist_embedding=True,
        kmeans_params={"max_iter": config.K_MEANS_MAX_ITER})
    spectral_clustering_model.fit(X)
    spectral_cluster_labels = spectral_clustering_model.labels_

    # Write spectral results to disk
    print(
        f'Joining Spectral results and writing to disk at {datetime.now()}'
    )
    _write_with_labels(spectral_cluster_labels, "spectral_cluster",
                       f'{self.word_vectors}_w_spectral')

    # And save the success flag
    with self.output().open("w") as f:
        f.write(f'{self.word_vectors}: Success!')
def test_n_components_raises():
    """``n_components`` equal to the number of samples is rejected by ``fit``."""
    n_samples = len(X)
    clusterer = SpectralClustering(n_components=n_samples)

    with pytest.raises(ValueError) as excinfo:
        clusterer.fit(X)

    # The error message must mention the offending parameter.
    assert excinfo.match("n_components")
def test_callable_affinity():
    """Fit with a callable affinity (an RBF ``pairwise_kernels`` partial)."""
    rbf_kernel = partial(
        metrics.pairwise.pairwise_kernels,
        metric='rbf',
        filter_params=True,
    )

    clusterer = SpectralClustering(affinity=rbf_kernel)
    clusterer.fit(X)
def test_basic(data, persist_embedding):
    """Fit on ``data`` and check the label count against module-level ``X``."""
    estimator_kwargs = dict(
        n_components=25,
        random_state=0,
        persist_embedding=persist_embedding,
    )
    clusterer = SpectralClustering(**estimator_kwargs)
    clusterer.fit(data)

    n_labels = len(clusterer.labels_)
    # The comparison is against X, not data, exactly as in the original test.
    assert n_labels == len(X)