Example #1
0
def test_precomputed():
    """Clustering raw points must agree with clustering their distance matrix."""
    points, _ = make_blobs(100, random_state=42)
    distances = pairwise_distances(points)

    shared = {'cutoff': 0.1}
    from_points = MSTClustering(**shared)
    from_distances = MSTClustering(metric='precomputed', **shared)

    labels_points = from_points.fit_predict(points)
    labels_distances = from_distances.fit_predict(distances)
    assert_equal(labels_points, labels_distances)
Example #2
0
def test_precomputed_metric():
    """Raw data, a sparse k-neighbors graph and its dense copy give equal labels."""
    n_samples, k = 30, 10
    points = np.random.RandomState(42).rand(n_samples, 3)

    sparse_graph = kneighbors_graph(points, n_neighbors=k, mode='distance')
    # Dense copy of the same graph; entries equal to zero are replaced by NaN.
    dense_graph = sparse_graph.toarray()
    dense_graph[dense_graph == 0] = np.nan

    def labels_for(data, **options):
        # Every estimator in this test shares the same cutoff.
        return MSTClustering(cutoff=0.1, **options).fit_predict(data)

    from_points = labels_for(points, n_neighbors=k)
    from_sparse = labels_for(sparse_graph, metric='precomputed')
    from_dense = labels_for(dense_graph, metric='precomputed')

    assert_allclose(from_points, from_sparse)
    assert_allclose(from_sparse, from_dense)
Example #3
0
 def _check(n, min_cluster_size):
     """Assert every non-negative label has at least ``min_cluster_size`` members.

     ``X`` comes from the enclosing scope.  Negative labels are excluded
     from the size check (presumably they mark unassigned points).
     """
     model = MSTClustering(cutoff=n,
                           n_neighbors=2,
                           min_cluster_size=min_cluster_size,
                           approximate=True)
     predicted = model.fit_predict(X)
     unique_labels, sizes = np.unique(predicted, return_counts=True)
     kept_sizes = sizes[unique_labels >= 0]
     if kept_sizes.size == 0:
         # Nothing but negative labels: there is no cluster size to verify.
         return
     assert_(kept_sizes.min() >= min_cluster_size)
Example #4
0
def test_bad_arguments():
    """Check that invalid settings and call orders raise ``ValueError``."""
    X, y = make_blobs(100, random_state=42)

    # No cutoff of any kind was supplied.
    mst = MSTClustering()
    assert_raises_regex(ValueError,
                        "Must specify either cutoff or cutoff_frac", mst.fit,
                        X, y)

    # A negative cutoff is rejected.
    mst = MSTClustering(cutoff=-1)
    assert_raises_regex(ValueError, "cutoff must be positive", mst.fit, X)

    # Graph segments requested before the estimator was fitted.
    # The pattern is a raw string because ``\(`` is a regex escape, not a
    # valid Python string escape (a plain literal triggers an invalid-escape
    # warning).  The pattern value itself is unchanged.
    mst = MSTClustering()
    msg = r"Must call fit\(\) before get_graph_segments()"
    assert_raises_regex(ValueError, msg, mst.get_graph_segments)

    # Graph segments are not available when fitting on precomputed distances.
    mst = MSTClustering(cutoff=0, metric='precomputed')
    mst.fit(pairwise_distances(X))
    msg = "Cannot use ``get_graph_segments`` with precomputed metric."
    assert_raises_regex(ValueError, msg, mst.get_graph_segments)
Example #5
0
def test_precomputed_metric_with_duplicates():
    """Precomputed sparse/dense distances match raw data when points repeat."""
    n_samples = 30
    k = n_samples - 1

    # Random points whose last five rows are copies of the first five.
    points = np.random.RandomState(42).rand(n_samples, 3)
    points[-5:] = points[:5]

    # Sparse k-neighbors distances and the full dense distance matrix.
    sparse_dist = kneighbors_graph(points, n_neighbors=k, mode='distance')
    dense_dist = pairwise_distances(points, points)

    from_points = MSTClustering(n_neighbors=k, cutoff=0.1).fit_predict(points)
    from_sparse, from_dense = [
        MSTClustering(metric='precomputed', cutoff=0.1).fit_predict(graph)
        for graph in (sparse_dist, dense_dist)
    ]

    assert_allclose(from_points, from_sparse)
    assert_allclose(from_sparse, from_dense)
Example #6
0
    def check_shape(ndim, cutoff, N=10):
        """Verify the shapes of trimmed and full graph segments.

        Fits on ``N`` random ``ndim``-dimensional points and expects one
        segment array per dimension, each with two rows and one column per
        edge: ``N - 1 - cutoff`` edges when trimmed, ``N - 1`` when full.
        """
        data = np.random.rand(N, ndim)
        model = MSTClustering(cutoff=cutoff).fit(data)

        trimmed = model.get_graph_segments()
        print(ndim, cutoff, trimmed[0].shape)
        assert len(trimmed) == ndim
        trimmed_shape = (2, N - 1 - cutoff)
        for seg in trimmed:
            assert seg.shape == trimmed_shape

        full = model.get_graph_segments(full_graph=True)
        print(full[0].shape)
        assert len(full) == ndim
        full_shape = (2, N - 1)
        for seg in full:
            assert seg.shape == full_shape
Example #7
0
def do_clustering():
    """Cluster the rows of ``./file16.csv`` and save a scatter plot.

    The first CSV column is dropped before clustering; the plot of the
    first two remaining columns, colored by label, goes to ``./mst.png``.
    """
    raw = np.genfromtxt('./file16.csv', delimiter=',')
    print(raw.shape)
    features = raw[:, 1:]

    # Label every row with the MST algorithm.
    labels = MSTClustering(cutoff_scale=2).fit_predict(features)

    # Scatter the first two feature columns, colored by cluster label.
    xs, ys = features[:, 0], features[:, 1]
    plt.scatter(xs, ys, c=labels, cmap='rainbow', marker='.')
    plt.savefig('./mst.png')
Example #8
0
 def __init__(self,
              df,
              cutoff_scale=None,
              min_cluster_size=None,
              n_neighbors=None,
              set_mst=None,
              labels=None,
              segments=None,
              seps=None):
     """Cluster the ``ra``/``dec`` columns of ``df`` with an MST model.

     ``cutoff_scale``, ``min_cluster_size`` and ``n_neighbors`` are stored
     and forwarded to ``MSTClustering``.  The ``set_mst``, ``labels``,
     ``segments`` and ``seps`` arguments are accepted but not read; the
     attributes of the same names are always computed here.
     """
     self.df = df
     self.cutoff_scale = cutoff_scale
     self.min_cluster_size = min_cluster_size
     self.n_neighbors = n_neighbors

     clusterer = MSTClustering(cutoff_scale=cutoff_scale,
                               min_cluster_size=min_cluster_size,
                               n_neighbors=n_neighbors)
     self.set_mst = clusterer

     # One (ra, dec) row per entry of the frame.
     coords = np.array(list(zip(df.ra, df.dec)))
     self.labels = clusterer.fit_predict(coords)
     self.segments = clusterer.get_graph_segments(full_graph=True)
     self.seps = self.get_sep_mst()
Example #9
0
def MST_clustering(filename):
    """Cluster words from a file with an MST and plot a t-SNE embedding.

    Lines longer than four characters (line ending included) are read from
    ``filename``.  ``jaccard`` is evaluated for every pair among the first
    500 of them, ``MSTClustering`` is fitted on that matrix, and the fitted
    tree is embedded with t-SNE and shown as a scatter plot colored by label.

    Parameters
    ----------
    filename : str
        Path of a text file with one word per line.
    """
    with open(filename, 'r') as f:
        words = np.asarray([line.rstrip() for line in f if len(line) > 4])

    sample = words[:500]
    jac_similarity = np.array([[jaccard(w1, w2) for w1 in sample]
                               for w2 in sample])

    mst = MSTClustering(min_cluster_size=10,
                        cutoff_scale=1)  # cut-off scale ??
    mst.fit(jac_similarity)

    # ``toarray()`` returns a plain ndarray.  ``todense()`` returns an
    # ``np.matrix``, which current scikit-learn input validation rejects.
    mst_matrix = mst.full_tree_.toarray()
    X_tsne = TSNE(learning_rate=100).fit_transform(mst_matrix)
    labels = mst.labels_

    # The leftover ``pdb.set_trace()`` breakpoint was removed so the function
    # runs to completion without dropping into the debugger.
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=labels)
    plt.show()
Example #10
0
def PRIM_algo(X_train, y_train):
    """Run MST clustering on a 2-D PCA projection for cutoff scales 2..9.

    For each scale the clustering is plotted, scored with the silhouette
    coefficient (``-1`` when only one label is produced) and passed to
    ``t_Test``.  Afterwards the knee of the score curve is printed and the
    scores are handed to ``create_graph``.

    Parameters
    ----------
    X_train : array-like
        Feature matrix; it is reduced to two PCA components first.
    y_train : array-like
        Forwarded unchanged as ``y`` to ``MSTClustering.fit_predict``.
    """
    silhouette_score_list = []
    X_train = PCA(2, svd_solver="full").fit_transform(X_train)

    # The projected coordinates do not depend on the cutoff scale.
    x = [item[0] for item in X_train]
    y = [item[1] for item in X_train]

    for i in range(2, 10):
        # predict the labels with the MST algorithm
        model = MSTClustering(cutoff_scale=i)
        labels = model.fit_predict(X_train, y_train)

        print("this is x: ", x)
        print("this is y: ", y)
        plt.scatter(x, y, c=labels, cmap=cm.jet)
        plt.title("PRIM - " + str(i) + " scatter")
        plt.show()

        if len(set(labels)) > 1:
            try:
                score = metrics.silhouette_score(X_train,
                                                 labels,
                                                 metric='euclidean')
            except Exception as exc:
                # Best effort: report the failure and keep going, without
                # swallowing KeyboardInterrupt/SystemExit as a bare
                # ``except`` would.  No score is recorded for this scale.
                print("silhouette_score did not work", exc)
            else:
                silhouette_score_list.append(score)
        else:
            # The silhouette score needs at least two distinct labels.
            silhouette_score_list.append(-1)

        t_Test(X_train, labels)
        print(labels)

    if silhouette_score_list:
        kn = KneeLocator([i + 1 for i in range(len(silhouette_score_list))],
                         silhouette_score_list,
                         curve='convex',
                         direction='decreasing')
        print(kn.elbow)
    create_graph(silhouette_score_list, y_text="SSE", start_point=2)
Example #11
0
            matrix.append(row)
            row = [float(w)]
        else:
            row.append(float(w))
        old_sample=sample

# Flush the last accumulated row and turn the row list into an array.
matrix.append(row)
mat=np.array(matrix)
# Embed the matrix in two dimensions with MDS; it is passed as precomputed
# dissimilarities.
mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=3,
                   dissimilarity="precomputed", n_jobs=1)
pos = mds.fit(mat).embedding_
# Re-express the 2-D embedding along its two principal components.
clf = PCA(n_components=2)
pos = clf.fit_transform(pos)
fig, ax = plt.subplots()

# Cluster the embedded points with an exact (non-approximate) MST.
model = MSTClustering(cutoff_scale=200, approximate=False)
labels = model.fit_predict(pos)


#### If you want the edges drawn:
#X = model.X_fit_
#segments = model.get_graph_segments(full_graph=False)
#ax.plot(segments[0], segments[1], '-k', zorder=1, lw=1)
#ax.scatter(X[:, 0], X[:, 1], c=model.labels_, cmap='rainbow', zorder=2)
#ax.axis('tight')
#####

#### Without edges:
plt.scatter(pos[:, 0], pos[:, 1], c=labels, s=100, lw=0)
####
Example #12
0
 def _check_n(n):
     """Assert that a cutoff of ``n`` yields ``n + 1`` distinct labels on ``X``."""
     predicted = MSTClustering(cutoff=n).fit_predict(X)
     n_labels = np.unique(predicted).size
     assert_equal(n_labels, n + 1)
Example #13
0
 def _check_n(n):
     """Assert ``n + 1`` distinct labels for cutoff ``n`` in approximate mode."""
     model = MSTClustering(cutoff=n, n_neighbors=2, approximate=True)
     predicted = model.fit_predict(X)
     n_labels = np.unique(predicted).size
     assert_equal(n_labels, n + 1)
Example #14
0
def check_graph_segments_vals():
    """Check the segment end points for the 1-D points 0, 1, 4, 9, 16."""
    squares = (np.arange(5) ** 2).reshape(-1, 1)
    model = MSTClustering(cutoff=0).fit(squares)

    segments = model.get_graph_segments()
    assert len(segments) == 1

    expected = [[0, 4, 4, 9], [1, 1, 9, 16]]
    assert_allclose(segments[0], expected)
Example #15
0
 def _check_params(kwds):
     """Assert that ``kwds`` recovers exactly the three true clusters.

     ``X`` and ``y`` come from the enclosing scope.  Three distinct labels
     must be predicted, and all points sharing a true label must share one
     predicted label.
     """
     y_pred = MSTClustering(n_neighbors=100, **kwds).fit_predict(X)
     assert_equal(len(np.unique(y_pred)), 3)
     # Check the spread of the *predicted* labels inside each true class.
     # The spread of ``y`` inside its own class is zero by construction, so
     # testing ``y[y == i]`` could never fail.
     assert_allclose([np.std(y_pred[y == i]) for i in range(3)], 0)
Example #16
0
    for axi, full_graph, colors in zip(ax, [True, False],
                                       ['lightblue', model.labels_]):
        segments = model.get_graph_segments(full_graph=full_graph)
        axi.plot(segments[0], segments[1], '-k', zorder=1, lw=1)
        axi.scatter(X[:, 0], X[:, 1], c=colors, cmap=cmap, zorder=2)
        axi.axis('tight')

    ax[0].set_title('Full Minimum Spanning Tree', size=16)
    ax[1].set_title('Trimmed Minimum Spanning Tree', size=16)


# Generate 200 sample points around four centers and show them unclustered.
X, y = make_blobs(200, centers=4, random_state=42)
plt.scatter(X[:, 0], X[:, 1], c='lightblue')
plt.show()

# Cluster with an exact (non-approximate) MST and color the points by label.
model = MSTClustering(cutoff_scale=2, approximate=False)
labels = model.fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='rainbow')
plt.show()

# Draw the fitted model with the plotting helper (presumably defined above).
plot_minimum_spanning_tree(model)
plt.show()

# Build 200 extra points spread over [-14, 14) in both coordinates; the RNG
# seed is derived from the last blob label.
rng = np.random.RandomState(int(100 * y[-1]))
noise = -14 + 28 * rng.rand(200, 2)

# Append the extra points to the blobs and give them the label -1.
X_noisy = np.vstack([X, noise])
y_noisy = np.concatenate([y, np.full(200, -1, dtype=int)])

# NOTE(review): ``c`` is a single color here, so ``cmap`` presumably has no
# effect; also confirm 'spectral_r' exists in the installed matplotlib.
plt.scatter(X_noisy[:, 0], X_noisy[:, 1], c='lightblue', cmap='spectral_r')
plt.xlim(-15, 15)
Example #17
0
        #print(k)
        #print(pkl[k[n]])
        #print(list(pkl)[0:5])

        X = pkl[k[n]]
        for i in range(0, len(X)):
            for j in range(0, len(X)):
                if i == j:
                    #print(i,j)
                    X[i, j] = maxa

        #print(X)
        #print(X[0])
        cut = 1.4
        from mst_clustering import MSTClustering
        model = MSTClustering(cutoff_scale=maxa * cut, approximate=False)
        labels = model.fit_predict(X)
        #print(labels)

        # model2 = MSTClustering(cutoff_scale=maxa*0.9, approximate=False)
        # labels2 = model2.fit_predict(X)
        # print(labels2)

        data_src = data + k[n]
        #print(data_src)
        c = 0
        for pic in os.listdir(data_src):

            #print(pic)
            img = cv2.imread(os.path.join(data_src, pic))
            #print(labels[c])
Example #18
0
 def cluster(self):
     """Cluster ``self.positions`` with an exact MST and store the labels.

     ``self.classifyer`` is used as the cutoff scale; the predicted labels
     are written to ``self.colors``.
     """
     self.colors = MSTClustering(
         cutoff_scale=self.classifyer,
         approximate=False,
     ).fit_predict(self.positions)
Example #19
0
                                       ['lightblue', model.labels_]):
        segments = model.get_graph_segments(full_graph=full_graph)
        axi.plot(segments[0], segments[1], '-ok', zorder=1, lw=1)
        axi.scatter(X[:, 0], X[:, 1], c=colors, cmap=cmap, zorder=2)
        axi.axis('tight')

    ax[0].set_title('Full Minimum Spanning Tree', size=16)
    ax[1].set_title('Trimmed Minimum Spanning Tree', size=16)


# Create 100 sample points around five centers.  No random_state is given,
# so the data differs from run to run.
X, y = make_blobs(100, centers=5, cluster_std=0.90)
print(X)

# predict the labels with the MST algorithm
model = MSTClustering(cutoff_scale=1.5, approximate=True, n_neighbors=100)
labels = model.fit_predict(X)
# Count the points per label; the number of bins is reported as the number
# of clusters.
# NOTE(review): np.bincount needs non-negative integers — confirm the labels
# can never contain -1 with these settings.
counts = np.bincount(labels)
print("No. of clusters: ")
clusters = len(counts)
print(len(counts))
print("No. of elements in each Clusters: ")
print(counts)

# plot the results
plt.scatter(X[0:, 0], X[0:, 1], marker='o', c=labels, cmap='rainbow')
plt.show()
# plot the brief model
plot_mst(model)

# Empty list; presumably filled by code that follows this excerpt.
wcss = []
Example #20
0
def get_mst(dataframe):
    """Return the MST cluster labels for ``dataframe`` (cutoff scale 2)."""
    fitted = MSTClustering(cutoff_scale=2).fit(dataframe)
    return fitted.labels_