def perform_clustering(G, clusters):
    """Run Genie on shortest-path lengths of a graph and plot the outcome.

    A dense ``len(nodes) x len(nodes)`` matrix of pairwise shortest-path
    lengths is computed with ``nx.shortest_path_length`` and passed to
    ``genieclust.Genie.fit_predict``.  The number of requested clusters is
    the number of distinct values in ``clusters``.  Two figures are shown
    with the same spring layout: one coloured by ``clusters`` and one
    coloured by the labels returned by Genie.

    Parameters
    ----------
    G
        Graph object accepted by ``nx.shortest_path_length``,
        ``nx.spring_layout`` and ``nx.draw``.
    clusters
        Reference labels, used both to derive the number of clusters and as
        ``node_color`` for the first plot (presumably one label per node, in
        node iteration order -- confirm against the caller).

    Returns
    -------
    None
        The function only displays figures.
    """
    n_clusters = len(np.unique(clusters))
    nodes = list(G.nodes())
    distances = np.zeros((len(nodes), len(nodes)))

    # enumerate() keeps the row/column indices in step with the node order.
    # The previous hand-maintained column counter was not advanced on the
    # diagonal, which shifted every entry after the diagonal one column left.
    for i, n1 in enumerate(nodes):
        for j, n2 in enumerate(nodes):
            if n1 == n2:
                continue  # the diagonal keeps its initial value of 0
            distances[i, j] = nx.shortest_path_length(G, n1, n2)

    import genieclust
    # NOTE(review): no `affinity` is passed, so Genie's default is applied to
    # the rows of `distances`; if the matrix is meant to be consumed as
    # precomputed distances, confirm whether affinity="precomputed" is needed.
    g = genieclust.Genie(n_clusters=n_clusters, gini_threshold=0.3)

    labels = g.fit_predict(distances)

    # One layout shared by both figures so they can be compared visually
    pos = nx.spring_layout(G)
    nx.draw(G, pos=pos, node_color=clusters)
    plt.title('Correct labels')
    plt.show()

    nx.draw(G, pos=pos, node_color=labels)
    plt.title('Clustering result')
    plt.show()
Exemple #2
0
def Genie_with_n_threads(X, n_clusters, n_threads, **kwargs):
    """Fit a Genie model while ``OMP_NUM_THREADS`` is temporarily overridden.

    The environment variable is set to ``str(n_threads)`` for the duration
    of the fit and afterwards put back to its previous state: restored to
    the old value, or removed again if it was not set before the call.  The
    restoration also happens when construction or fitting raises.

    NOTE(review): whether the fitting code re-reads ``OMP_NUM_THREADS`` at
    this point depends on the OpenMP runtime in use -- confirm.

    Parameters
    ----------
    X
        Data passed to ``Genie.fit``.
    n_clusters
        Forwarded to ``genieclust.Genie`` as ``n_clusters``.
    n_threads
        Value written to ``OMP_NUM_THREADS`` (converted with ``str``).
    **kwargs
        Further keyword arguments forwarded to ``genieclust.Genie``.

    Returns
    -------
    The fitted ``genieclust.Genie`` instance.
    """
    # .get() instead of [] so that an unset variable is not a KeyError
    n_threads_old = os.environ.get("OMP_NUM_THREADS")
    os.environ["OMP_NUM_THREADS"] = str(n_threads)
    try:
        g = genieclust.Genie(n_clusters=n_clusters, **kwargs)
        g.fit(X)
    finally:
        if n_threads_old is None:
            os.environ.pop("OMP_NUM_THREADS", None)
        else:
            os.environ["OMP_NUM_THREADS"] = n_threads_old
    return g
Exemple #3
0
def __test_string(affinity='leven'):
    """Fit approximate Genie (``exact=False``) on a list of strings.

    The input consists of the strings ``"a"``, ``"aa"``, ... up to 99
    characters.  The function returns immediately when ``nmslib`` is
    ``None``; otherwise it only checks that fitting completes.
    """
    if nmslib is None:
        return

    np.random.seed(123)
    X = ["a" * length for length in range(1, 100)]

    model = genieclust.Genie(affinity=affinity, exact=False, cast_float32=False)
    model.fit(X)
Exemple #4
0
def __test_sparse(affinity='euclidean_sparse'):
    """Fit approximate Genie (``exact=False``) on a SciPy CSR matrix.

    A 100-row matrix is drawn from the values -2.0 .. 2.0, multiplied
    element-wise by uniform random numbers and converted to CSR.  The
    function returns immediately when ``nmslib`` is ``None``; otherwise it
    only checks that fitting completes.
    """
    if nmslib is None:
        return

    np.random.seed(123)
    candidates = np.arange(-2.0, 3.0)
    dense = np.random.choice(candidates, 1000).reshape(100, -1)
    dense = dense * np.random.rand(*dense.shape)

    sparse_X = scipy.sparse.csr_matrix(dense)

    model = genieclust.Genie(affinity=affinity, exact=False)
    model.fit(sparse_X)
Exemple #5
0
def test_mutreach():
    """Check mutual reachability distances and Genie labels for M > 1.

    For every dataset and every smoothing factor ``M``:

    * ``genieclust.tools._mutual_reachability_distance`` (fed with
      ``genieclust.tools._core_distance``) must match
      ``mutual_reachability_distance_old`` with a mean squared difference
      below 1e-12; both computations are timed and the timings printed;
    * for several Gini thresholds and cluster counts, the labels shifted by
      one must have maximum ``n_clusters`` and ``n_clusters + 1`` distinct
      values, and must agree with the ``postprocess='all'`` and
      ``postprocess='none'`` variants on the index sets compared below.
    """
    for dataset in ["jain", "pathbased"]:
        data_file = "%s/%s.data.gz" % (path, dataset)
        labels_file = "%s/%s.labels0.gz" % (path, dataset)

        X = np.loadtxt(data_file, ndmin=2)
        # The label file is still read, but none of the checks below use it
        np.loadtxt(labels_file, dtype=np.intc)

        D = scipy.spatial.distance.squareform(
            scipy.spatial.distance.pdist(X))

        for M in [2, 3, 5, 10]:
            gc.collect()

            started = time.time()
            core = genieclust.tools._core_distance(D, M)
            D_new = genieclust.tools._mutual_reachability_distance(D, core)
            print("%-20s\tM=%2d\tt=%.3f" % (dataset, M, time.time()-started), end="\t")

            started = time.time()
            D_old = mutual_reachability_distance_old(D, M)
            print("t_old=%.3f" % (time.time()-started,))

            mean_sq_diff = np.mean((D_new - D_old)**2)
            assert mean_sq_diff < 1e-12

            for gini in [0.01, 0.3, 0.5, 0.7, 1.0]:
                for n_clusters in [2, 3, 5]:
                    # Labels are shifted by one before being compared
                    base = genieclust.Genie(
                        n_clusters, gini_threshold=gini, M=M
                    ).fit_predict(X) + 1
                    assert max(base) == n_clusters
                    assert np.unique(base).shape[0] == n_clusters+1

                    merged = genieclust.Genie(
                        n_clusters, gini_threshold=gini, M=M, postprocess='all'
                    ).fit_predict(X) + 1
                    base_positive = base > 0
                    assert np.all(merged[base_positive] == base[base_positive])

                    raw = genieclust.Genie(
                        n_clusters, gini_threshold=gini, M=M, postprocess='none'
                    ).fit_predict(X) + 1
                    raw_positive = raw > 0
                    assert np.all(raw[raw_positive] == base[raw_positive])
                    assert np.all(raw[raw_positive] == merged[raw_positive])

                    # TODO: what other tests?

            # Drop the references to the two matrices before the next M
            D_new = None
            D_old = None
Exemple #6
0
def __test_genie_approx(metric='euclidean'):
    """Compare approximate Genie (``exact=False``) with the exact variant.

    For each dataset, smoothing factor ``M`` and Gini threshold:

    * unless ``metric`` is ``'maximum'``, the exact model is fitted and timed;
    * when the R bindings (``stats``, ``genie``) are available, ``M == 1``
      and the metric is neither ``'cosine'`` nor ``'maximum'``, the result of
      ``genie.hclust2`` must match the exact result (ARI above 1 - 1e-12);
    * the approximate model is fitted and timed; if its ARI against the
      exact result falls below 1 - 1e-12 a warning is issued instead of a
      failure.

    The function returns immediately when ``nmslib`` is ``None``.
    Timings and ARI values are printed on one line per configuration.
    """
    if nmslib is None:
        return

    for dataset in ["t4_8k", "h2mg_64_50"]:
        if dataset == "bigger":
            # Synthetic data; only reached if "bigger" is added to the list
            np.random.seed(123)
            n, d = 10_000, 10
            X = np.random.normal(size=(n, d))
            labels = np.random.choice(np.r_[1:9], n)
        else:
            X = np.loadtxt("%s/%s.data.gz" % (path, dataset), ndmin=2)
            labels = np.loadtxt(
                "%s/%s.labels0.gz" % (path, dataset), dtype=np.intp) - 1

        k = len(np.unique(labels[labels >= 0]))

        # Centre each column, divide by the overall standard deviation
        # (not a per-column standardisation), then add a little noise
        X = (X - X.mean(axis=0)) / X.std(axis=None, ddof=1)
        X += np.random.normal(0, 0.0001, X.shape)

        for M in [1, 2, 25]:
            for g in [0.01, 0.3, 0.7]:
                gc.collect()

                print("%-20s M=%2d g=%.2f n=%7d d=%4d" %
                      (dataset, M, g, X.shape[0], X.shape[1]),
                      end="\t")

                # Arguments shared by the exact and the approximate model
                common = dict(gini_threshold=g, affinity=metric,
                              verbose=verbose, M=M)

                if metric == 'maximum':
                    res1 = None
                else:
                    t01 = time.time()
                    exact_model = genieclust.Genie(k, exact=True, **common)
                    res1 = exact_model.fit_predict(X) + 1
                    t11 = time.time()
                    print("t_py=%.3f" % (t11 - t01), end="\t")

                compare_with_r = (
                    stats is not None
                    and genie is not None
                    and M == 1
                    and metric not in ('cosine', 'maximum')
                )
                if compare_with_r:
                    t02 = time.time()
                    tree = genie.hclust2(objects=X, d=metric, thresholdGini=g)
                    res2 = stats.cutree(tree, k)
                    t12 = time.time()
                    print("t_r=%.3f" % (t12 - t02), end="\t")
                    res2 = np.array(res2, np.intp)
                    assert len(np.unique(res2)) == k

                    ari = genieclust.compare_partitions.adjusted_rand_score(
                        res1, res2)
                    print("ARI=%.3f" % ari, end="\t")
                    assert ari > 1.0 - 1e-12

                    print("t_rel=%.3f" % ((t11 - t01) / (t12 - t02), ),
                          end="\t")

                t03 = time.time()
                approx_model = genieclust.Genie(k, exact=False, **common)
                res3 = approx_model.fit_predict(X) + 1
                t13 = time.time()
                print("t_py2=%.3f" % (t13 - t03), end="\t")

                if res1 is not None:
                    # Approximate time relative to exact time
                    print("t_rel=%.3f" % ((t13 - t03) / (t11 - t01), ),
                          end="\t")

                    ari = genieclust.compare_partitions.adjusted_rand_score(
                        res1, res3)
                    print("ARI2=%.3f" % ari, end="\t")
                    if ari < 1.0 - 1e-12:
                        warnings.warn(
                            "(exact=False) ARI=%.3f for dataset=%s, g=%.2f, affinity=%s"
                            % (ari, dataset, g, metric))

                    res1, res2 = None, None

                print("")
Exemple #7
0
                            layer='counts')
# Demo script: cluster the data held in `ad` with Genie and compare with Louvain.
# NOTE(review): `sc` and `ad` are defined outside this excerpt -- presumably
# the scanpy module and an AnnData object; confirm.

# Preprocessing: scaling (with max_value=8) followed by PCA
sc.pp.scale(ad, max_value=8)
sc.pp.pca(ad)

# Neighbour computation, run before the UMAP and Louvain steps below
sc.pp.neighbors(ad)

sc.tl.umap(ad)
sc.tl.louvain(ad, resolution=0.2)

# Two inputs for clustering: the full matrix and the PCA coordinates
X_hidim = ad.X
X_lodim = ad.obsm['X_pca']

import genieclust
import numpy as np

# Genie on the full matrix: 4 clusters, cosine affinity, M=25, postprocess="all"
g = genieclust.Genie(n_clusters=4, affinity='cosine', M=25, postprocess="all")
labels = g.fit_predict(X_hidim)
# Labels are stored as strings and shown on the UMAP plot
ad.obs['genie_labels'] = labels.astype(str)
sc.pl.umap(ad, color='genie_labels')

# Genie on the PCA coordinates: 3 clusters; overwrites 'genie_labels'
g = genieclust.Genie(n_clusters=3, affinity='cosine')
labels = g.fit_predict(X_lodim)
ad.obs['genie_labels'] = labels.astype(str)
sc.pl.umap(ad, color='genie_labels')

# Louvain result on the same UMAP plot type, for comparison
sc.pl.umap(ad, color='louvain')

# Spanning tree computed from the full matrix; its second element is passed
# to plot_segments together with the UMAP coordinates
mst = genieclust.internal.mst_from_distance(X_hidim)
genieclust.plots.plot_segments(mst[1], ad.obsm["X_umap"])

# Column-wise standardisation (mean and sample standard deviation along axis 0)
X_hidim_std = (X_hidim - X_hidim.mean(axis=0)) / (X_hidim.std(axis=0, ddof=1))
Exemple #8
0
def test_warnerr(metric='euclidean'):
    """Check that invalid parameter combinations raise or warn.

    Each case constructs an estimator and calls ``fit`` inside a
    ``pytest.raises(Exception)`` or ``pytest.warns(Warning)`` context.  The
    cases that depend on optional back-ends are selected by whether
    ``mlpack`` / ``nmslib`` are ``None``.
    """
    np.random.seed(123)
    n, d = 1_000, 10
    K = 2
    X = np.random.normal(size=(n, d))
    # The drawn labels are not used; the call is kept so that the random
    # stream (and therefore the noise added below) stays the same
    np.random.choice(np.r_[0:K], n)

    # Centre each column, divide by the overall standard deviation
    # (not a per-column standardisation), then add a little noise
    X = (X - X.mean(axis=0)) / X.std(axis=None, ddof=1)
    X += np.random.normal(0, 0.0001, X.shape)

    def fails(action):
        # Construction and fit both happen inside the context manager
        with pytest.raises(Exception):
            action()

    def warns(action):
        with pytest.warns(Warning):
            action()

    # --- Genie: invalid arguments ---
    fails(lambda: genieclust.Genie(n_clusters=-1).fit(X))
    fails(lambda: genieclust.Genie(gini_threshold=-1e-12).fit(X))
    fails(lambda: genieclust.Genie(gini_threshold=1+1e-12).fit(X))
    fails(lambda: genieclust.Genie(affinity="euclidianne").fit(X))
    fails(lambda: genieclust.Genie(affinity="precomputed").fit(X))
    fails(lambda: genieclust.Genie(M=0).fit(X))
    fails(lambda: genieclust.Genie(M=n+1).fit(X))
    fails(lambda: genieclust.Genie(exact=True).fit(scipy.sparse.csr_matrix(X)))
    fails(lambda: genieclust.Genie(postprocess="say what??").fit(X))
    fails(lambda: genieclust.Genie(mlpack_enabled="say what??").fit(X))

    # --- GIc: invalid arguments ---
    fails(lambda: genieclust.GIc(add_clusters=-1).fit(X))
    fails(lambda: genieclust.GIc(gini_thresholds=[-1e-12]).fit(X))
    fails(lambda: genieclust.GIc(affinity="precomputed").fit(
        scipy.spatial.distance.pdist(X)))

    warns(lambda: genieclust.Genie(M=2, compute_full_tree=True).fit(X))

    # --- mlpack-dependent cases ---
    if mlpack is None:
        fails(lambda: genieclust.Genie(mlpack_enabled=True).fit(X))
    else:
        fails(lambda: genieclust.Genie(mlpack_enabled=True, affinity="l1").fit(X))
        fails(lambda: genieclust.Genie(mlpack_enabled=True, M=2).fit(X))

    # --- nmslib-dependent cases ---
    if nmslib is None:
        fails(lambda: genieclust.Genie(exact=False).fit(X))
    else:
        fails(lambda: genieclust.Genie(
            affinity="leven", exact=False, cast_float32=True).fit(X))
        fails(lambda: genieclust.Genie(
            affinity="precomputed", exact=False).fit(X))
        fails(lambda: genieclust.Genie(
            M=10, nmslib_n_neighbors=8, exact=False).fit(X))
        fails(lambda: genieclust.Genie(
            nmslib_params_init=[], exact=False).fit(X))
        fails(lambda: genieclust.Genie(
            nmslib_params_index=[], exact=False).fit(X))
        fails(lambda: genieclust.Genie(
            nmslib_params_query=[], exact=False).fit(X))
        warns(lambda: genieclust.Genie(
            nmslib_params_index=dict(indexThreadQty=3), exact=False).fit(X))
        warns(lambda: genieclust.Genie(
            nmslib_params_init=dict(space="outer"), exact=False).fit(X))
Exemple #9
0
# Demo script: run Genie and GIc on one benchmark dataset and compare the
# resulting partitions.

# Display settings for NumPy, pandas and Matplotlib
np.set_printoptions(precision=5, threshold=10, edgeitems=5)
pd.set_option("min_rows", 20)
# NOTE(review): newer Matplotlib releases renamed the seaborn styles
# (e.g. 'seaborn-v0_8-whitegrid') -- confirm against the installed version.
plt.style.use('seaborn-whitegrid')
#plt.rcParams["figure.figsize"] = (8,4)

# NOTE(review): `load_dataset` and `benchmarks_path` are defined outside this
# excerpt; `labels_true` appears to be a collection of label vectors, since
# it is iterated below and then indexed with [0] -- confirm.
X, labels_true, dataset = load_dataset("sipu/aggregation", benchmarks_path)
# Centre each column and divide by the overall standard deviation
X = ((X-X.mean(axis=0))/X.std(axis=None, ddof=1))
X = X.astype(np.float32, order="C", copy=False)
labels_true = [l-1 for l in labels_true] # noise class==-1

# Keep only the first label vector
labels_true = labels_true[0]
# Number of distinct labels, minus one if the smallest label is -1
n_clusters = int(len(np.unique(labels_true))-(np.min(labels_true)==-1))


# Genie with a single Gini threshold
g = genieclust.Genie(n_clusters=n_clusters,
            gini_threshold=0.3,
            M=1)
labels_g = g.fit_predict(X)
print(genieclust.compare_partitions.compare_partitions2(labels_true, labels_g))
print(labels_g)

# GIc with several Gini thresholds and add_clusters=10
gic = genieclust.GIc(n_clusters=n_clusters,
            gini_thresholds=[0.1, 0.3, 0.5, 0.7],
            add_clusters=10,
            M=1)
labels_gic = gic.fit_predict(X)
print(labels_gic)
# GIc result compared against the Genie result (not against labels_true)
print(genieclust.compare_partitions.compare_partitions2(labels_gic, labels_g))

#%%eof
Exemple #10
0
def test_gic():
    """Check GIc against Genie and basic invariants of its output.

    For each dataset:

    * with a single Gini threshold, GIc (``compute_full_tree=True``) must
      give non-decreasing ``distances_``, exactly ``K`` clusters among the
      non-negative labels, and a partition matching Genie run with the same
      threshold (ARI above 1 - 1e-6);
    * with several thresholds, and with an empty threshold array, the
      full-tree fit must give non-decreasing ``distances_`` and the
      ``M=10`` fit must give exactly ``K`` clusters among the non-negative
      labels.

    Fitting times are printed along the way.
    """
    for dataset in ["jain", "Aggregation", "unbalance", "h2mg_64_50"]:
        if dataset == "bigger":
            # Synthetic data; only reached if "bigger" is added to the list
            np.random.seed(123)
            n = 100000
            X = np.random.normal(size=(n, 32))
            labels = np.random.choice(np.r_[1, 2], n)
        else:
            X = np.loadtxt("%s/%s.data.gz" % (path, dataset), ndmin=2)
            labels = np.loadtxt(
                "%s/%s.labels0.gz" % (path, dataset), dtype=np.intc) - 1

        K = len(np.unique(labels[labels >= 0]))

        # Centre each column, divide by the overall standard deviation
        # (not a per-column standardisation), then add a little noise
        X = (X - X.mean(axis=0)) / X.std(axis=None, ddof=1)
        X += np.random.normal(0, 0.0001, X.shape)

        print("%-20s n=%7d d=%4d"%(dataset, X.shape[0], X.shape[1]))

        # --- single threshold: GIc must reproduce Genie ---
        for threshold in (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7):
            g = np.r_[threshold]
            print(g, end="\t")
            gc.collect()

            started = time.time()
            _gic = genieclust.GIc(
                n_clusters=K, gini_thresholds=g, compute_full_tree=True)
            labels_gic = _gic.fit_predict(X)
            finished = time.time()
            print("t_py=%.3f" % (finished-started), end="\t")

            genie_model = genieclust.Genie(n_clusters=K, gini_threshold=g[0])
            labels_g = genie_model.fit_predict(X)

            assert np.all(np.diff(_gic.distances_) >= 0.0)
            assert len(np.unique(labels_gic[labels_gic >= 0])) == K
            ari = genieclust.compare_partitions.adjusted_rand_score(
                labels_gic, labels_g)
            assert ari > 1-1e-6
            print()

        # --- several thresholds, and no thresholds at all ---
        for g in [np.arange(1, 8)/10, np.empty(0)]:
            print(g, end="\t")
            gc.collect()

            started = time.time()
            _gic = genieclust.GIc(
                n_clusters=K, gini_thresholds=g, compute_full_tree=True)
            labels_gic = _gic.fit_predict(X)
            finished = time.time()
            print("t_py=%.3f" % (finished-started), end="\t")

            assert np.all(np.diff(_gic.distances_) >= 0.0)

            started = time.time()
            _gic = genieclust.GIc(n_clusters=K, gini_thresholds=g, M=10)
            labels_gic = _gic.fit_predict(X)
            finished = time.time()
            print("t_py=%.3f" % (finished-started), end="\t")

            # what tests should be added here???

            assert len(np.unique(labels_gic[labels_gic >= 0])) == K
            print()
Exemple #11
0
def __test_genie(metric='euclidean'):
    """Check exact Genie on several datasets and Gini thresholds.

    For each configuration the model is fitted with
    ``compute_full_tree=True``; its ``distances_`` must be non-decreasing and
    the labelling must contain exactly ``k`` distinct values.  When the R
    bindings (``stats``, ``genie``) are available and the metric is not
    ``'cosine'``, the partition must also match ``genie.hclust2`` (ARI above
    1 - 1e-12).  Timings are printed on one line per configuration.
    """
    for dataset in ["s1", "Aggregation", "unbalance", "h2mg_64_50", "bigger"]:
        if dataset == "bigger":
            # Synthetic data with random labels
            np.random.seed(123)
            n, d, K = 10_000, 10, 2
            X = np.random.normal(size=(n, d))
            labels = np.random.choice(np.r_[0:K], n)
        else:
            X = np.loadtxt("%s/%s.data.gz" % (path, dataset), ndmin=2)
            labels = np.loadtxt(
                "%s/%s.labels0.gz" % (path, dataset), dtype=np.intp) - 1

        k = len(np.unique(labels[labels >= 0]))

        # Centre each column, divide by the overall standard deviation
        # (not a per-column standardisation), then add a little noise
        X = (X - X.mean(axis=0)) / X.std(axis=None, ddof=1)
        X += np.random.normal(0, 0.0001, X.shape)

        for g in [0.01, 0.3, 0.5, 0.7, 1.0]:
            gc.collect()

            print("%-20s g=%.2f n=%5d d=%2d"%(dataset, g, X.shape[0], X.shape[1]), end="\t")

            t01 = time.time()
            _res1 = genieclust.Genie(
                k,
                gini_threshold=g,
                exact=True,
                affinity=metric,
                compute_full_tree=True,
            )
            res1 = _res1.fit_predict(X) + 1
            t11 = time.time()
            print("t_py=%.3f" % (t11-t01), end="\t")

            assert np.all(np.diff(_res1.distances_) >= 0.0)
            assert len(np.unique(res1)) == k

            r_available = stats is not None and genie is not None
            if r_available and metric != 'cosine':
                t02 = time.time()
                tree = genie.hclust2(objects=X, d=metric, thresholdGini=g)
                res2 = stats.cutree(tree, k)
                t12 = time.time()
                print("t_r=%.3f" % (t12-t02), end="\t")
                res2 = np.array(res2, np.intp)
                assert len(np.unique(res2)) == k

                ari = genieclust.compare_partitions.adjusted_rand_score(
                    res1, res2)
                print("ARI=%.3f" % ari, end="\t")
                assert ari > 1.0-1e-12

                print("t_rel=%.3f" % ((t11-t01)/(t12-t02),), end="\t")

            # Drop the references to the label vectors before the next run
            res1 = None
            res2 = None
            print("")
Exemple #12
0
def test_genie_precomputed():
    """Check Genie on precomputed distances and the ``compute_all_cuts`` mode.

    For each dataset:

    * Genie fitted on a precomputed Euclidean distance structure (condensed,
      or square when a random draw exceeds 0.5) must match Genie fitted on
      the raw data with ``affinity="euclidean"`` -- with mlpack enabled
      (when ``mlpack`` is available) and with mlpack disabled; the
      raw-data fits must also have non-decreasing ``distances_``;
    * with ``compute_all_cuts=True`` (for ``M=2`` and ``M=25``), row ``k`` of
      the returned label matrix must equal the labels of a separate fit
      requesting ``k`` clusters.
    """
    for dataset in ["x1", "s1", "Aggregation"]:
        if dataset == "bigger":
            # Synthetic data; only reached if "bigger" is added to the list
            np.random.seed(123)
            n = 10000
            X = np.random.normal(size=(n, 2))
            labels = np.random.choice(np.r_[1, 2], n)
        else:
            X = np.loadtxt("%s/%s.data.gz" % (path, dataset), ndmin=2)
            labels = np.loadtxt(
                "%s/%s.labels0.gz" % (path, dataset), dtype=np.intp) - 1

        k = len(np.unique(labels[labels >= 0]))

        # Add a little noise, centre each column and divide by the overall
        # standard deviation (not a per-column standardisation)
        X = X + np.random.normal(0, 0.0001, X.shape)
        X = (X - X.mean(axis=0)) / X.std(axis=None, ddof=1)
        X = X.astype("float32")

        # Condensed distances; expanded to a square matrix at random
        D = scipy.spatial.distance.pdist(X)
        if np.random.rand(1) > 0.5:
            D = scipy.spatial.distance.squareform(D)

        for g in [0.01, 0.3, 0.5, 0.7, 1.0]:
            gc.collect()

            print("%-20s g=%.2f n=%5d d=%2d"%(dataset, g, X.shape[0], X.shape[1]), end="\t")

            _res1 = genieclust.Genie(
                k, gini_threshold=g, exact=True, affinity="precomputed")
            res1 = _res1.fit_predict(D) + 1

            if mlpack is not None:
                _res2 = genieclust.Genie(
                    k,
                    gini_threshold=g,
                    exact=True,
                    compute_full_tree=True,
                    affinity="euclidean",
                    mlpack_enabled=True,
                )
                res2 = _res2.fit_predict(X) + 1
                ari = genieclust.compare_partitions.adjusted_rand_score(
                    res1, res2)
                print("ARI=%.3f" % ari, end="\t")
                assert ari > 1.0-1e-12
                assert np.all(np.diff(_res2.distances_) >= 0.0)

            _res2 = genieclust.Genie(
                k,
                gini_threshold=g,
                exact=True,
                affinity="euclidean",
                compute_full_tree=True,
                mlpack_enabled=False,
            )
            res2 = _res2.fit_predict(X) + 1
            ari = genieclust.compare_partitions.adjusted_rand_score(res1, res2)
            print("ARI=%.3f" % ari, end="\t")
            assert ari > 1.0-1e-12
            assert np.all(np.diff(_res2.distances_) >= 0.0)

            # Drop the references to models and labels before the next run
            _res1 = _res2 = None
            res1 = res2 = None
            print("")

        # test compute_all_cuts, once for each smoothing factor
        for M in (2, 25):
            K = 16
            g = 0.1
            all_cuts = genieclust.Genie(
                K, gini_threshold=g, exact=True, affinity="euclidean",
                compute_all_cuts=True, M=M).fit_predict(X)
            assert all_cuts.shape[1] == X.shape[0]
            # all_cuts.shape[0] == K+1 is not necessarily true!
            for k in range(1, all_cuts.shape[0]):
                single_cut = genieclust.Genie(
                    k, gini_threshold=g, exact=True, affinity="euclidean",
                    M=M).fit_predict(X)
                assert np.all(single_cut == all_cuts[k, :])
Exemple #13
0
def get_timing(n, d, s, mu1, mu2, random_state):
    """Time several Genie configurations on one generated two-cluster dataset.

    The data come from ``generate_gKmg`` with two groups of ``n // 2`` points
    each (``n`` must be even).  Every fit is timed and passed to
    ``register_result``; the collected results are returned as a list, in
    this order:

    1. approximate Genie (``exact=False``), freshly constructed;
    2. the same approximate model refitted for each Gini threshold;
    3. Genie with ``mlpack_enabled=True`` (only when ``d <= 10``);
    4. Genie with ``mlpack_enabled=False``, freshly constructed;
    5. that model refitted for each Gini threshold.

    For the fresh fits the timed interval includes construction of the
    model.  ``OMP_NUM_THREADS`` is read from the environment for items 1
    and 4, so it must be set.
    """
    dataset = "g2mg_%d_%s"%(d, s)
    s_cor = s*np.sqrt(d/2)
    assert n % 2 == 0

    group_sizes = np.r_[n//2, n//2]
    centres = np.array([[mu1]*d, [mu2]*d])
    spreads = np.r_[s_cor, s_cor]
    X, labels0, labels1 = generate_gKmg(
        d, group_sizes, centres, spreads, random_state)

    labels_true = [labels0, labels1]

    res = list()
    gini_thresholds = [0.1, 0.3, 0.5, 0.7, 1.0]

    def record(method, n_threads, elapsed, labels_pred):
        # Append one timing entry for the current dataset
        res.append(register_result(
            random_state, dataset, n, d,
            method, 2, n_threads, elapsed,
            labels_pred, labels_true))

    def fit_fresh(method, threads_from_env, **genie_params):
        # Construct and fit a new model; construction is part of the timing
        started = time.time()
        model = genieclust.Genie(n_clusters=2, **genie_params)
        labels_pred = model.fit_predict(X)
        elapsed = time.time() - started
        # The environment is read only after the fit has finished
        n_threads = os.environ["OMP_NUM_THREADS"] if threads_from_env else 1
        record(method, n_threads, elapsed, labels_pred)
        return model

    def refit_per_threshold(model, method_format):
        # Refit an already fitted model once per Gini threshold
        for gini_threshold in gini_thresholds:
            started = time.time()
            model.set_params(gini_threshold=gini_threshold)
            labels_pred = model.fit_predict(X)
            elapsed = time.time() - started
            record(method_format % gini_threshold, 0, elapsed, labels_pred)

    approx_model = fit_fresh("Genie_0.3_approx", True, exact=False)
    refit_per_threshold(approx_model, "Genie_%.1f_approx")

    if d <= 10:
        fit_fresh("Genie_0.3_mlpack", False, mlpack_enabled=True)

    plain_model = fit_fresh("Genie_0.3_nomlpack", True, mlpack_enabled=False)
    refit_per_threshold(plain_model, "Genie_%.1f")

    return res