def test_precomputed():
    """Labels from raw features must equal labels from a precomputed distance matrix."""
    features, _ = make_blobs(100, random_state=42)
    distances = pairwise_distances(features)

    feature_model = MSTClustering(cutoff=0.1)
    distance_model = MSTClustering(cutoff=0.1, metric='precomputed')

    labels_from_features = feature_model.fit_predict(features)
    labels_from_distances = distance_model.fit_predict(distances)
    assert_equal(labels_from_features, labels_from_distances)
def test_precomputed_metric():
    """Compare labels from raw points, a sparse k-NN graph and its dense form.

    The dense graph has its zero entries replaced by NaN before being
    passed to the ``metric='precomputed'`` estimator.
    """
    n_points = 30
    n_neighbors = 10
    rng = np.random.RandomState(42)
    points = rng.rand(n_points, 3)

    sparse_graph = kneighbors_graph(points, n_neighbors=n_neighbors,
                                    mode='distance')
    dense_graph = sparse_graph.toarray()
    missing = dense_graph == 0
    dense_graph[missing] = np.nan

    options = dict(cutoff=0.1)
    from_points = MSTClustering(n_neighbors=n_neighbors,
                                **options).fit_predict(points)
    from_sparse = MSTClustering(metric='precomputed',
                                **options).fit_predict(sparse_graph)
    from_dense = MSTClustering(metric='precomputed',
                               **options).fit_predict(dense_graph)

    assert_allclose(from_points, from_sparse)
    assert_allclose(from_sparse, from_dense)
def _check(n, min_cluster_size):
    """Assert every non-negative label occurs at least ``min_cluster_size`` times.

    ``X`` is read from the enclosing scope.  Labels below zero are excluded
    from the size check.
    """
    model = MSTClustering(cutoff=n, n_neighbors=2,
                          min_cluster_size=min_cluster_size,
                          approximate=True)
    predicted = model.fit_predict(X)

    labels, counts = np.unique(predicted, return_counts=True)
    cluster_sizes = counts[labels >= 0]
    if cluster_sizes.size:
        assert_(cluster_sizes.min() >= min_cluster_size)
def test_bad_arguments():
    """Check that invalid configurations raise ``ValueError`` with the expected text.

    Covers four cases: no cutoff given, a negative cutoff, requesting graph
    segments before fitting, and requesting graph segments after fitting on
    a precomputed metric.
    """
    X, y = make_blobs(100, random_state=42)

    # Neither cutoff nor cutoff_frac supplied.
    mst = MSTClustering()
    assert_raises_regex(ValueError,
                        "Must specify either cutoff or cutoff_frac",
                        mst.fit, X, y)

    # Negative cutoff.
    mst = MSTClustering(cutoff=-1)
    assert_raises_regex(ValueError, "cutoff must be positive", mst.fit, X)

    # Graph segments requested on an unfitted estimator.
    # Raw string: the pattern is a regex, and "\(" in a normal string literal
    # is an invalid escape sequence.  The trailing parentheses are escaped as
    # well so they match literally instead of forming an empty group.
    mst = MSTClustering()
    msg = r"Must call fit\(\) before get_graph_segments\(\)"
    assert_raises_regex(ValueError, msg, mst.get_graph_segments)

    # Graph segments requested after fitting on precomputed distances.
    mst = MSTClustering(cutoff=0, metric='precomputed')
    mst.fit(pairwise_distances(X))
    msg = "Cannot use ``get_graph_segments`` with precomputed metric."
    assert_raises_regex(ValueError, msg, mst.get_graph_segments)
def test_precomputed_metric_with_duplicates():
    """Same comparison as the precomputed-metric test, with duplicated points.

    The last five rows of the data are copies of the first five.
    """
    n_points = 30
    n_neighbors = n_points - 1
    rng = np.random.RandomState(42)

    # Build the data set and overwrite its tail with copies of its head.
    points = rng.rand(n_points, 3)
    points[-5:] = points[:5]

    # Sparse neighbour distances and the full dense distance matrix.
    sparse_graph = kneighbors_graph(points, n_neighbors=n_neighbors,
                                    mode='distance')
    dense_distances = pairwise_distances(points, points)

    options = dict(cutoff=0.1)
    from_points = MSTClustering(n_neighbors=n_neighbors,
                                **options).fit_predict(points)
    from_sparse = MSTClustering(metric='precomputed',
                                **options).fit_predict(sparse_graph)
    from_dense = MSTClustering(metric='precomputed',
                               **options).fit_predict(dense_distances)

    assert_allclose(from_points, from_sparse)
    assert_allclose(from_sparse, from_dense)
def check_shape(ndim, cutoff, N=10):
    """Verify the shapes returned by ``get_graph_segments``.

    Fits on ``N`` random points in ``ndim`` dimensions and checks that one
    segment array is returned per dimension, each of shape
    ``(2, N - 1 - cutoff)`` for the default graph and ``(2, N - 1)`` when
    ``full_graph=True``.
    """
    data = np.random.rand(N, ndim)
    mst = MSTClustering(cutoff=cutoff).fit(data)

    trimmed = mst.get_graph_segments()
    print(ndim, cutoff, trimmed[0].shape)
    assert len(trimmed) == ndim
    trimmed_shape = (2, N - 1 - cutoff)
    for seg in trimmed:
        assert seg.shape == trimmed_shape

    full = mst.get_graph_segments(full_graph=True)
    print(full[0].shape)
    assert len(full) == ndim
    full_shape = (2, N - 1)
    for seg in full:
        assert seg.shape == full_shape
def do_clustering():
    """Cluster the rows of ``./file16.csv`` and save a scatter plot.

    The first CSV column is dropped; the remaining columns are clustered
    with ``MSTClustering(cutoff_scale=2)``.  The first two remaining
    columns are plotted, coloured by label, and written to ``./mst.png``.
    """
    table = np.genfromtxt('./file16.csv', delimiter=',')
    print(table.shape)
    features = table[:, 1:]

    # Label every row with the MST algorithm.
    clusterer = MSTClustering(cutoff_scale=2)
    labels = clusterer.fit_predict(features)

    # Draw and save the result.
    plt.scatter(features[:, 0], features[:, 1], c=labels, cmap='rainbow',
                marker='.')
    plt.savefig('./mst.png')
def __init__(self, df, cutoff_scale=None, min_cluster_size=None, n_neighbors=None, set_mst=None, labels=None, segments=None, seps=None):
    """Fit an MST clustering on the ``ra``/``dec`` columns of ``df``.

    Stores ``df`` and the three clustering settings, builds the
    ``MSTClustering`` estimator, fits it on the (ra, dec) pairs, and stores
    the labels, the full-graph segments and the result of
    ``self.get_sep_mst()``.

    The ``set_mst``, ``labels``, ``segments`` and ``seps`` arguments are
    accepted but never read; the corresponding attributes are always
    computed here.
    """
    self.df = df
    self.cutoff_scale = cutoff_scale
    self.min_cluster_size = min_cluster_size
    self.n_neighbors = n_neighbors

    clusterer = MSTClustering(cutoff_scale=cutoff_scale,
                              min_cluster_size=min_cluster_size,
                              n_neighbors=n_neighbors)
    self.set_mst = clusterer

    # One [ra, dec] row per record.
    coords = np.array([[ra, dec] for ra, dec in zip(df.ra, df.dec)])
    self.labels = clusterer.fit_predict(coords)
    self.segments = clusterer.get_graph_segments(full_graph=True)
    self.seps = self.get_sep_mst()
def MST_clustering(filename):
    """Cluster the words in ``filename`` with an MST and plot a t-SNE embedding.

    Reads one word per line, keeping lines longer than four characters
    (measured before trailing whitespace is stripped).  A pairwise
    ``jaccard`` matrix is built for the first 500 kept words and handed to
    ``MSTClustering``; the fitted full tree is embedded with t-SNE and
    shown as a scatter plot coloured by cluster label.

    Parameters
    ----------
    filename : str
        Path of the text file to read.
    """
    with open(filename, 'r') as f:
        words = [line.rstrip() for line in f if len(line) > 4]
    words = np.asarray(words)

    # Slice once instead of re-slicing inside the nested comprehension.
    subset = words[:500]
    jac_similarity = np.array([[jaccard(w1, w2) for w1 in subset]
                               for w2 in subset])

    # NOTE(review): the original author was unsure about cutoff_scale=1.
    # NOTE(review): the matrix is passed with the default metric, so its rows
    # are presumably treated as feature vectors rather than precomputed
    # distances -- confirm this is intended.
    mst = MSTClustering(min_cluster_size=10, cutoff_scale=1)
    mst.fit(jac_similarity)

    # toarray() yields a plain ndarray; todense() yields np.matrix, which is
    # a poor fit for estimator input.
    tree_dense = mst.full_tree_.toarray()
    X_tsne = TSNE(learning_rate=100).fit_transform(tree_dense)
    labels = mst.labels_

    # The leftover interactive pdb breakpoint was removed so the function
    # runs unattended.
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=labels)
    plt.show()
def PRIM_algo(X_train, y_train):
    """Run MST clustering for cutoff scales 2..9 and report silhouette scores.

    The data is first projected onto two PCA components.  For each cutoff
    scale the points are clustered, plotted, scored with the silhouette
    coefficient (``-1`` is recorded when only one label is produced) and
    passed to ``t_Test``.  Afterwards the knee of the score curve is
    printed and the scores are plotted with ``create_graph``.

    Parameters
    ----------
    X_train : array-like
        Feature matrix; reduced to two dimensions before clustering.
    y_train : array-like
        Forwarded to ``fit_predict`` alongside the reduced features.
    """
    silhouette_score_list = []
    X_train = PCA(2, svd_solver="full").fit_transform(X_train)

    # The plotted coordinates do not depend on the cutoff scale, so build
    # them once rather than on every iteration.
    x = [item[0] for item in X_train]
    y = [item[1] for item in X_train]

    for i in range(2, 10):
        model = MSTClustering(cutoff_scale=i)
        labels = model.fit_predict(X_train, y_train)

        print("this is x: ", x)
        print("this is y: ", y)
        plt.scatter(x, y, c=labels, cmap=cm.jet)
        # Only this title was ever visible; an earlier plt.title call was
        # overwritten immediately and has been dropped.
        plt.title("PRIM - " + str(i) + " scatter")
        plt.show()

        try:
            if len(set(labels)) > 1:
                silhouette_score_list.append(
                    metrics.silhouette_score(X_train, labels,
                                             metric='euclidean'))
            else:
                silhouette_score_list.append(-1)
        except Exception:
            # Best effort: report and carry on.  Catching Exception rather
            # than using a bare except lets KeyboardInterrupt/SystemExit
            # propagate.
            print("silhouette_score did not work")

        t_Test(X_train, labels)
        print(labels)

    if silhouette_score_list:
        kn = KneeLocator(
            [i + 1 for i in range(len(silhouette_score_list))],
            silhouette_score_list, curve='convex', direction='decreasing')
        print(kn.elbow)
        # NOTE(review): the axis label says "SSE" but the plotted values are
        # silhouette scores -- confirm the intended label.
        create_graph(silhouette_score_list, y_text="SSE", start_point=2)
matrix.append(row) row = [float(w)] else: row.append(float(w)) old_sample=sample matrix.append(row) mat=np.array(matrix) mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=3, dissimilarity="precomputed", n_jobs=1) pos = mds.fit(mat).embedding_ clf = PCA(n_components=2) pos = clf.fit_transform(pos) fig, ax = plt.subplots() model = MSTClustering(cutoff_scale=200, approximate=False) labels = model.fit_predict(pos) #### Om man vill ha kanter: #X = model.X_fit_ #segments = model.get_graph_segments(full_graph=False) #ax.plot(segments[0], segments[1], '-k', zorder=1, lw=1) #ax.scatter(X[:, 0], X[:, 1], c=model.labels_, cmap='rainbow', zorder=2) #ax.axis('tight') ##### #### Utan kanter: plt.scatter(pos[:, 0], pos[:, 1], c=labels, s=100, lw=0) ####
def _check_n(n):
    """Assert that ``cutoff=n`` produces exactly ``n + 1`` distinct labels.

    ``X`` is read from the enclosing scope.
    """
    model = MSTClustering(cutoff=n)
    distinct_labels = np.unique(model.fit_predict(X))
    assert_equal(distinct_labels.size, n + 1)
def _check_n(n):
    """Assert ``n + 1`` distinct labels for ``cutoff=n`` in approximate mode.

    Uses ``n_neighbors=2`` and ``approximate=True``; ``X`` is read from the
    enclosing scope.
    """
    model = MSTClustering(cutoff=n, n_neighbors=2, approximate=True)
    distinct_labels = np.unique(model.fit_predict(X))
    assert_equal(distinct_labels.size, n + 1)
def check_graph_segments_vals():
    """Check the exact segment endpoints for the 1-D points 0, 1, 4, 9, 16."""
    squares = (np.arange(5) ** 2).reshape(-1, 1)
    mst = MSTClustering(cutoff=0).fit(squares)

    segments = mst.get_graph_segments()
    assert len(segments) == 1

    expected = [[0, 4, 4, 9],
                [1, 1, 9, 16]]
    assert_allclose(segments[0], expected)
def _check_params(kwds):
    """Assert that clustering with ``kwds`` yields three pure clusters.

    ``X`` and ``y`` are read from the enclosing scope (``y`` is presumably
    the reference labels matching ``X`` -- confirm against the caller).

    Parameters
    ----------
    kwds : dict
        Extra keyword arguments forwarded to ``MSTClustering``.
    """
    y_pred = MSTClustering(n_neighbors=100, **kwds).fit_predict(X)
    assert_equal(len(np.unique(y_pred)), 3)

    # Group the reference labels by *predicted* cluster: a zero standard
    # deviation means each predicted cluster holds a single reference label.
    # The previous mask ``y == i`` selected values that are all equal to
    # ``i`` by construction, so the check could never detect a bad clustering.
    assert_allclose([np.std(y[y_pred == i]) for i in range(3)], 0)
for axi, full_graph, colors in zip(ax, [True, False], ['lightblue', model.labels_]): segments = model.get_graph_segments(full_graph=full_graph) axi.plot(segments[0], segments[1], '-k', zorder=1, lw=1) axi.scatter(X[:, 0], X[:, 1], c=colors, cmap=cmap, zorder=2) axi.axis('tight') ax[0].set_title('Full Minimum Spanning Tree', size=16) ax[1].set_title('Trimmed Minimum Spanning Tree', size=16) X, y = make_blobs(200, centers=4, random_state=42) plt.scatter(X[:, 0], X[:, 1], c='lightblue') plt.show() model = MSTClustering(cutoff_scale=2, approximate=False) labels = model.fit_predict(X) plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='rainbow') plt.show() plot_minimum_spanning_tree(model) plt.show() rng = np.random.RandomState(int(100 * y[-1])) noise = -14 + 28 * rng.rand(200, 2) X_noisy = np.vstack([X, noise]) y_noisy = np.concatenate([y, np.full(200, -1, dtype=int)]) plt.scatter(X_noisy[:, 0], X_noisy[:, 1], c='lightblue', cmap='spectral_r') plt.xlim(-15, 15)
#print(k) #print(pkl[k[n]]) #print(list(pkl)[0:5]) X = pkl[k[n]] for i in range(0, len(X)): for j in range(0, len(X)): if i == j: #print(i,j) X[i, j] = maxa #print(X) #print(X[0]) cut = 1.4 from mst_clustering import MSTClustering model = MSTClustering(cutoff_scale=maxa * cut, approximate=False) labels = model.fit_predict(X) #print(labels) # model2 = MSTClustering(cutoff_scale=maxa*0.9, approximate=False) # labels2 = model2.fit_predict(X) # print(labels2) data_src = data + k[n] #print(data_src) c = 0 for pic in os.listdir(data_src): #print(pic) img = cv2.imread(os.path.join(data_src, pic)) #print(labels[c])
def cluster(self):
    """Cluster ``self.positions`` and store the labels in ``self.colors``.

    Uses exact (``approximate=False``) MST clustering with
    ``self.classifyer`` as the cutoff scale.
    """
    clusterer = MSTClustering(cutoff_scale=self.classifyer,
                              approximate=False)
    labels = clusterer.fit_predict(self.positions)
    self.colors = labels
['lightblue', model.labels_]): segments = model.get_graph_segments(full_graph=full_graph) axi.plot(segments[0], segments[1], '-ok', zorder=1, lw=1) axi.scatter(X[:, 0], X[:, 1], c=colors, cmap=cmap, zorder=2) axi.axis('tight') ax[0].set_title('Full Minimum Spanning Tree', size=16) ax[1].set_title('Trimmed Minimum Spanning Tree', size=16) # create some data X, y = make_blobs(100, centers=5, cluster_std=0.90) print(X) # predict the labels with the MST algorithm model = MSTClustering(cutoff_scale=1.5, approximate=True, n_neighbors=100) labels = model.fit_predict(X) counts = np.bincount(labels) print("No. of clusters: ") clusters = len(counts) print(len(counts)) print("No. of elements in each Clusters: ") print(counts) # plot the results plt.scatter(X[0:, 0], X[0:, 1], marker='o', c=labels, cmap='rainbow') plt.show() # plot the brief model plot_mst(model) wcss = []
def get_mst(dataframe):
    """Return the ``labels_`` of an ``MSTClustering(cutoff_scale=2)`` fitted on ``dataframe``."""
    clusterer = MSTClustering(cutoff_scale=2)
    clusterer.fit(dataframe)
    labels = clusterer.labels_
    return labels