def test_bad_arguments():
    """Check that MSTClustering raises ValueError for invalid usage.

    Four cases are exercised: fitting without a cutoff, fitting with a
    negative cutoff, asking for graph segments before fitting, and asking
    for graph segments after fitting on a precomputed distance matrix.
    """
    X, y = make_blobs(100, random_state=42)

    # No cutoff specified at construction time.
    mst = MSTClustering()
    assert_raises_regex(ValueError,
                        "Must specify either cutoff or cutoff_frac",
                        mst.fit, X, y)

    # Negative cutoff.
    mst = MSTClustering(cutoff=-1)
    assert_raises_regex(ValueError, "cutoff must be positive", mst.fit, X)

    # get_graph_segments() on an unfitted estimator.
    mst = MSTClustering()
    # Raw string: "\(" is a regex escape, not a valid Python string escape.
    # The pattern value is unchanged from the previous non-raw literal
    # (the trailing "()" is deliberately left as-is).
    msg = r"Must call fit\(\) before get_graph_segments()"
    assert_raises_regex(ValueError, msg, mst.get_graph_segments)

    # get_graph_segments() after fitting on precomputed distances.
    mst = MSTClustering(cutoff=0, metric='precomputed')
    mst.fit(pairwise_distances(X))
    msg = "Cannot use ``get_graph_segments`` with precomputed metric."
    assert_raises_regex(ValueError, msg, mst.get_graph_segments)
def MST_clustering(filename):
    """Cluster words read from ``filename`` and show a t-SNE scatter plot.

    The file is read line by line; lines longer than 4 characters are kept
    (with trailing whitespace stripped). A pairwise ``jaccard`` matrix is
    built for the first 500 kept words, an ``MSTClustering`` model is fit on
    that matrix, and the model's ``full_tree_`` is embedded in 2-D with
    t-SNE and plotted, coloured by cluster label.

    Parameters
    ----------
    filename : str
        Path of a text file, one word per line.

    Returns
    -------
    None
        The result is displayed with ``plt.show()``.
    """
    with open(filename, 'r') as f:
        # The length check runs before rstrip(), so any trailing newline
        # is counted towards the "> 4" threshold.
        words = [line.rstrip() for line in f if len(line) > 4]
    words = np.asarray(words)

    # Only the first 500 words are compared; slice once instead of per row.
    sample = words[:500]
    jac_similarity = np.array([[jaccard(w1, w2) for w1 in sample]
                               for w2 in sample])

    # NOTE(review): the matrix is passed to fit() as ordinary feature rows
    # (no metric='precomputed'); confirm this is intended.
    mst = MSTClustering(min_cluster_size=10, cutoff_scale=1)
    mst.fit(jac_similarity)

    # toarray() yields a plain ndarray; todense() yields np.matrix, which
    # current scikit-learn estimators reject.
    X_tsne = TSNE(learning_rate=100).fit_transform(mst.full_tree_.toarray())

    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=mst.labels_)
    plt.show()
def get_mst(dataframe):
    """Fit an ``MSTClustering`` (``cutoff_scale=2``) and return its labels.

    Parameters
    ----------
    dataframe
        Data handed unchanged to ``MSTClustering.fit``.

    Returns
    -------
    The ``labels_`` attribute of the fitted model.
    """
    clusterer = MSTClustering(cutoff_scale=2)
    clusterer.fit(dataframe)
    cluster_labels = clusterer.labels_
    return cluster_labels
# --- Cluster the clean data and show labels plus the spanning tree ---------
model = MSTClustering(cutoff_scale=2, approximate=False)
labels = model.fit_predict(X)

plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='rainbow')
plt.show()

plot_minimum_spanning_tree(model)
plt.show()

# --- Add 200 uniformly distributed noise points ----------------------------
# The seed is derived from the last entry of y.
rng = np.random.RandomState(int(100 * y[-1]))
noise = -14 + 28 * rng.rand(200, 2)  # uniform in [-14, 14) on both axes
X_noisy = np.vstack([X, noise])
y_noisy = np.concatenate([y, np.full(200, -1, dtype=int)])  # noise marked -1

# A single colour is used, so no colormap is passed: matplotlib ignores
# cmap when c is not numeric data (and warns about it).
plt.scatter(X_noisy[:, 0], X_noisy[:, 1], c='lightblue')
plt.xlim(-15, 15)
plt.ylim(-15, 15)
plt.show()

# --- Cluster the noisy data without a minimum cluster size -----------------
noisy_model = MSTClustering(cutoff_scale=1)
noisy_model.fit(X_noisy)
plot_minimum_spanning_tree(noisy_model)
plt.show()

# --- Same again, with min_cluster_size=10 ----------------------------------
noisy_model = MSTClustering(cutoff_scale=1, min_cluster_size=10)
noisy_model.fit(X_noisy)
plot_minimum_spanning_tree(noisy_model)
plt.show()