def perform_clustering(G, clusters):
    """Cluster the nodes of ``G`` with Genie and plot the outcome.

    A matrix of pairwise shortest-path lengths between the nodes is built
    and passed to ``genieclust.Genie``.  Two figures are then shown with the
    same spring layout: the graph coloured by ``clusters`` and the graph
    coloured by the predicted labels.

    Parameters
    ----------
    G
        Graph object accepted by ``nx.shortest_path_length``,
        ``nx.spring_layout`` and ``nx.draw``.
    clusters
        Reference label of every node; colours the first figure and its
        number of distinct values is the requested number of clusters.

    Returns
    -------
    None
        The function only displays the two figures.
    """
    import genieclust

    n_clusters = len(np.unique(clusters))
    nodes = list(G.nodes())
    n_nodes = len(nodes)

    # distances[i, j] is the shortest-path length between the i-th and the
    # j-th node.  Both indices come from enumerate() so that the column
    # index also advances on the diagonal; a hand-maintained counter that
    # is skipped by ``continue`` would shift every entry to the right of
    # the diagonal one column to the left.
    distances = np.zeros((n_nodes, n_nodes))
    for i, n1 in enumerate(nodes):
        for j, n2 in enumerate(nodes):
            if i != j:
                distances[i, j] = nx.shortest_path_length(G, n1, n2)

    # NOTE(review): the matrix is passed with the default affinity, i.e. it
    # is not declared as affinity="precomputed" -- confirm this is intended.
    g = genieclust.Genie(n_clusters=n_clusters, gini_threshold=0.3)
    labels = g.fit_predict(distances)

    # One layout for both figures so that they are directly comparable.
    pos = nx.spring_layout(G)

    nx.draw(G, pos=pos, node_color=clusters)
    plt.title('Correct labels')
    plt.show()

    nx.draw(G, pos=pos, node_color=labels)
    plt.title('Clustering result')
    plt.show()
def Genie_with_n_threads(X, n_clusters, n_threads, **kwargs):
    """Fit a ``genieclust.Genie`` model with ``OMP_NUM_THREADS`` overridden.

    The environment variable is set to ``n_threads`` only for the duration
    of the fit and is put back to its previous state afterwards, also when
    constructing or fitting the model raises.

    Parameters
    ----------
    X
        Data passed to ``Genie.fit``.
    n_clusters
        Forwarded to ``genieclust.Genie`` as ``n_clusters``.
    n_threads
        Value written (as a string) to ``OMP_NUM_THREADS``.
    **kwargs
        Further keyword arguments forwarded to ``genieclust.Genie``.

    Returns
    -------
    The fitted ``genieclust.Genie`` object.
    """
    # None means "was not set": in that case the variable is removed again
    # instead of failing with a KeyError on lookup.
    n_threads_old = os.environ.get("OMP_NUM_THREADS")
    os.environ["OMP_NUM_THREADS"] = str(n_threads)
    try:
        g = genieclust.Genie(n_clusters=n_clusters, **kwargs)
        g.fit(X)
    finally:
        # Restore the caller's environment even if the fit failed.
        if n_threads_old is None:
            os.environ.pop("OMP_NUM_THREADS", None)
        else:
            os.environ["OMP_NUM_THREADS"] = n_threads_old
    return g
def __test_string(affinity='leven'):
    """Fit an approximate Genie model on strings of increasing length.

    Does nothing when ``nmslib`` is unavailable (``None``).  The input is
    the list ``"a"``, ``"aa"``, ..., up to 99 characters.
    """
    if nmslib is None:
        return

    np.random.seed(123)
    X = ["a" * length for length in range(1, 100)]
    model = genieclust.Genie(
        affinity=affinity,
        exact=False,
        cast_float32=False,
    )
    model.fit(X)
def __test_sparse(affinity='euclidean_sparse'):
    """Fit an approximate Genie model on a random CSR matrix.

    Does nothing when ``nmslib`` is unavailable (``None``).  The data are
    100 rows drawn from {-2, -1, 0, 1, 2}, scaled element-wise by uniform
    random factors and converted to ``scipy.sparse.csr_matrix``.
    """
    if nmslib is None:
        return

    np.random.seed(123)
    support = np.arange(-2.0, 3.0)
    dense = np.random.choice(support, 1000).reshape(100, -1)
    dense *= np.random.rand(*dense.shape)
    sparse_X = scipy.sparse.csr_matrix(dense)

    model = genieclust.Genie(affinity=affinity, exact=False)
    model.fit(sparse_X)
def test_mutreach():
    """Check the mutual reachability distance and the noise post-processing.

    For every dataset and every ``M`` the matrix returned by
    ``genieclust.tools._mutual_reachability_distance`` must agree with
    ``mutual_reachability_distance_old`` (mean squared difference below
    1e-12).  Afterwards Genie is fitted for several thresholds and cluster
    counts and the labellings obtained with the default, ``'all'`` and
    ``'none'`` post-processing are checked for consistency.
    """
    datasets = ["jain", "pathbased"]  # , "s1", "Aggregation", "WUT_Smile", "unbalance", "a1"
    smoothing_factors = [2, 3, 5, 10]
    gini_thresholds = [0.01, 0.3, 0.5, 0.7, 1.0]
    cluster_counts = [2, 3, 5]

    for dataset in datasets:
        X = np.loadtxt("%s/%s.data.gz" % (path, dataset), ndmin=2)
        labels = np.loadtxt("%s/%s.labels0.gz" % (path, dataset),
                            dtype=np.intc)
        _, label_counts = np.unique(labels, return_counts=True)
        k = len(label_counts)

        # full square matrix of pairwise distances
        D = scipy.spatial.distance.squareform(
            scipy.spatial.distance.pdist(X))

        for M in smoothing_factors:
            gc.collect()

            started = time.time()
            core = genieclust.tools._core_distance(D, M)
            D1 = genieclust.tools._mutual_reachability_distance(D, core)
            elapsed = time.time() - started
            print("%-20s\tM=%2d\tt=%.3f" % (dataset, M, elapsed), end="\t")

            started = time.time()
            D2 = mutual_reachability_distance_old(D, M)
            elapsed_old = time.time() - started
            print("t_old=%.3f" % (elapsed_old,))

            dist = np.mean((D1 - D2) ** 2)
            assert dist < 1e-12

            for g in gini_thresholds:
                for k in cluster_counts:
                    # labels are shifted by one so that 0 marks noise points
                    cl = genieclust.Genie(
                        k, gini_threshold=g, M=M).fit_predict(X) + 1
                    assert max(cl) == k
                    assert np.unique(cl).shape[0] == k + 1

                    cl2 = genieclust.Genie(
                        k, gini_threshold=g, M=M,
                        postprocess='all').fit_predict(X) + 1
                    kept = cl > 0
                    assert np.all(cl2[kept] == cl[kept])

                    cl3 = genieclust.Genie(
                        k, gini_threshold=g, M=M,
                        postprocess='none').fit_predict(X) + 1
                    kept3 = cl3 > 0
                    assert np.all(cl3[kept3] == cl[kept3])
                    assert np.all(cl3[kept3] == cl2[kept3])

            # TODO: what other tests?

            # drop the references to the two large matrices
            D1 = None
            D2 = None
def __test_genie_approx(metric='euclidean'):
    """Compare approximate Genie (``exact=False``) with the exact variant.

    Does nothing when ``nmslib`` is unavailable (``None``).  For every
    dataset, ``M`` and Gini threshold the exact Python result is compared
    with the R implementation (when ``stats`` and ``genie`` are available,
    ``M == 1`` and the metric is supported) and with the approximate
    result; a disagreement with the latter only triggers a warning.
    """
    if nmslib is None:
        return

    datasets = ["t4_8k", "h2mg_64_50"]  # , "bigger", "s1", "Aggregation", "unbalance", "h2mg_1024_50"
    for dataset in datasets:
        if dataset == "bigger":
            np.random.seed(123)
            n, d = 10_000, 10
            X = np.random.normal(size=(n, d))
            labels = np.random.choice(np.r_[1, 2, 3, 4, 5, 6, 7, 8], n)
        else:
            X = np.loadtxt("%s/%s.data.gz" % (path, dataset), ndmin=2)
            labels = np.loadtxt("%s/%s.labels0.gz" % (path, dataset),
                                dtype=np.intp) - 1

        k = len(np.unique(labels[labels >= 0]))

        # center X + scale (NOT: standardize!), then jitter slightly
        X = (X - X.mean(axis=0)) / X.std(axis=None, ddof=1)
        X += np.random.normal(0, 0.0001, X.shape)

        for M in [1, 2, 25]:
            for g in [0.01, 0.3, 0.7]:
                gc.collect()

                print("%-20s M=%2d g=%.2f n=%7d d=%4d" % (
                    dataset, M, g, X.shape[0], X.shape[1]), end="\t")

                # --- exact Python implementation ----------------------
                if metric in ['maximum']:
                    res1 = None
                else:
                    t01 = time.time()
                    exact_model = genieclust.Genie(
                        k, gini_threshold=g, exact=True, affinity=metric,
                        verbose=verbose, M=M)
                    res1 = exact_model.fit_predict(X) + 1
                    t11 = time.time()
                    print("t_py=%.3f" % (t11 - t01), end="\t")

                # --- reference R implementation -----------------------
                r_available = stats is not None and genie is not None
                r_applicable = M == 1 and metric not in ['cosine', 'maximum']
                if r_available and r_applicable:
                    t02 = time.time()
                    tree = genie.hclust2(objects=X, d=metric, thresholdGini=g)
                    res2 = stats.cutree(tree, k)
                    t12 = time.time()
                    print("t_r=%.3f" % (t12 - t02), end="\t")

                    res2 = np.array(res2, np.intp)
                    assert len(np.unique(res2)) == k

                    ari = genieclust.compare_partitions.adjusted_rand_score(
                        res1, res2)
                    print("ARI=%.3f" % ari, end="\t")
                    assert ari > 1.0 - 1e-12

                    print("t_rel=%.3f" % ((t11 - t01) / (t12 - t02), ),
                          end="\t")

                # --- approximate Python implementation ----------------
                t03 = time.time()
                approx_model = genieclust.Genie(
                    k, gini_threshold=g, exact=False, affinity=metric,
                    verbose=verbose, M=M)
                res3 = approx_model.fit_predict(X) + 1
                t13 = time.time()
                print("t_py2=%.3f" % (t13 - t03), end="\t")

                if res1 is not None:
                    # ratio of two negated durations: approximate / exact
                    print("t_rel=%.3f" % ((t03 - t13) / (t01 - t11), ),
                          end="\t")
                    ari = genieclust.compare_partitions.adjusted_rand_score(
                        res1, res3)
                    print("ARI2=%.3f" % ari, end="\t")
                    if ari < 1.0 - 1e-12:
                        warnings.warn(
                            "(exact=False) ARI=%.3f for dataset=%s, g=%.2f, affinity=%s"
                            % (ari, dataset, g, metric))

                res1, res2 = None, None
                print("")
layer='counts') sc.pp.scale(ad, max_value=8) sc.pp.pca(ad) sc.pp.neighbors(ad) sc.tl.umap(ad) sc.tl.louvain(ad, resolution=0.2) X_hidim = ad.X X_lodim = ad.obsm['X_pca'] import genieclust import numpy as np g = genieclust.Genie(n_clusters=4, affinity='cosine', M=25, postprocess="all") labels = g.fit_predict(X_hidim) ad.obs['genie_labels'] = labels.astype(str) sc.pl.umap(ad, color='genie_labels') g = genieclust.Genie(n_clusters=3, affinity='cosine') labels = g.fit_predict(X_lodim) ad.obs['genie_labels'] = labels.astype(str) sc.pl.umap(ad, color='genie_labels') sc.pl.umap(ad, color='louvain') mst = genieclust.internal.mst_from_distance(X_hidim) genieclust.plots.plot_segments(mst[1], ad.obsm["X_umap"]) X_hidim_std = (X_hidim - X_hidim.mean(axis=0)) / (X_hidim.std(axis=0, ddof=1))
def test_warnerr(metric='euclidean'):
    """Check that invalid estimator settings raise and dubious ones warn.

    Every entry below is a zero-argument callable that builds an estimator
    and fits it; the callable is invoked inside the ``pytest.raises`` /
    ``pytest.warns`` context so that construction, argument preparation
    and fitting are all covered by it.
    """
    np.random.seed(123)
    n, d, K = 1_000, 10, 2
    X = np.random.normal(size=(n, d))
    labels = np.random.choice(np.r_[0:K], n)
    k = len(np.unique(labels[labels >= 0]))

    # center X + scale (NOT: standardize!)
    X = (X - X.mean(axis=0)) / X.std(axis=None, ddof=1)
    X += np.random.normal(0, 0.0001, X.shape)

    def all_raise(attempts):
        for attempt in attempts:
            with pytest.raises(Exception):
                attempt()

    def all_warn(attempts):
        for attempt in attempts:
            with pytest.warns(Warning):
                attempt()

    all_raise([
        lambda: genieclust.Genie(n_clusters=-1).fit(X),
        lambda: genieclust.Genie(gini_threshold=-1e-12).fit(X),
        lambda: genieclust.Genie(gini_threshold=1+1e-12).fit(X),
        lambda: genieclust.Genie(affinity="euclidianne").fit(X),
        lambda: genieclust.Genie(affinity="precomputed").fit(X),
        lambda: genieclust.Genie(M=0).fit(X),
        lambda: genieclust.Genie(M=n+1).fit(X),
        lambda: genieclust.Genie(exact=True).fit(scipy.sparse.csr_matrix(X)),
        lambda: genieclust.Genie(postprocess="say what??").fit(X),
        lambda: genieclust.Genie(mlpack_enabled="say what??").fit(X),
        lambda: genieclust.GIc(add_clusters=-1).fit(X),
        lambda: genieclust.GIc(gini_thresholds=[-1e-12]).fit(X),
        lambda: genieclust.GIc(affinity="precomputed").fit(
            scipy.spatial.distance.pdist(X)),
    ])

    all_warn([
        lambda: genieclust.Genie(M=2, compute_full_tree=True).fit(X),
    ])

    # mlpack-related settings
    if mlpack is None:
        all_raise([
            lambda: genieclust.Genie(mlpack_enabled=True).fit(X),
        ])
    else:
        all_raise([
            lambda: genieclust.Genie(mlpack_enabled=True, affinity="l1").fit(X),
            lambda: genieclust.Genie(mlpack_enabled=True, M=2).fit(X),
        ])

    # nmslib-related settings
    if nmslib is None:
        all_raise([
            lambda: genieclust.Genie(exact=False).fit(X),
        ])
    else:
        all_raise([
            lambda: genieclust.Genie(
                affinity="leven", exact=False, cast_float32=True).fit(X),
            lambda: genieclust.Genie(
                affinity="precomputed", exact=False).fit(X),
            lambda: genieclust.Genie(
                M=10, nmslib_n_neighbors=8, exact=False).fit(X),
            lambda: genieclust.Genie(
                nmslib_params_init=[], exact=False).fit(X),
            lambda: genieclust.Genie(
                nmslib_params_index=[], exact=False).fit(X),
            lambda: genieclust.Genie(
                nmslib_params_query=[], exact=False).fit(X),
        ])
        all_warn([
            lambda: genieclust.Genie(
                nmslib_params_index=dict(indexThreadQty=3),
                exact=False).fit(X),
            lambda: genieclust.Genie(
                nmslib_params_init=dict(space="outer"),
                exact=False).fit(X),
        ])
# Display settings for the printed arrays, data frames and the figures.
np.set_printoptions(precision=5, threshold=10, edgeitems=5)
pd.set_option("min_rows", 20)
plt.style.use('seaborn-whitegrid')
#plt.rcParams["figure.figsize"] = (8,4)

# Load the benchmark dataset, centre it and divide by one global standard
# deviation (computed over all entries), then store it as C-ordered float32.
X, labels_true, dataset = load_dataset("sipu/aggregation", benchmarks_path)
X = ((X-X.mean(axis=0))/X.std(axis=None, ddof=1))
X = X.astype(np.float32, order="C", copy=False)

# Shift every reference labelling down by one and keep only the first one.
labels_true = [l-1 for l in labels_true] # noise class==-1
labels_true = labels_true[0]

# Number of distinct labels, minus one if the noise label -1 is present.
n_clusters = int(len(np.unique(labels_true))-(np.min(labels_true)==-1))

# Genie with a single Gini threshold; compare against the reference labels.
g = genieclust.Genie(n_clusters=n_clusters, gini_threshold=0.3, M=1)
labels_g = g.fit_predict(X)
print(genieclust.compare_partitions.compare_partitions2(labels_true, labels_g))
print(labels_g)

# GIc with several Gini thresholds; compare its labels with Genie's labels.
gic = genieclust.GIc(n_clusters=n_clusters, gini_thresholds=[0.1, 0.3, 0.5, 0.7], add_clusters=10, M=1)
labels_gic = gic.fit_predict(X)
print(labels_gic)
print(genieclust.compare_partitions.compare_partitions2(labels_gic, labels_g))

#%%eof
def test_gic():
    """Check GIc against Genie and basic invariants of its output.

    With a single Gini threshold, GIc must reproduce the Genie partition
    (adjusted Rand score above 1-1e-6).  With several thresholds, or with
    none, only the monotonicity of ``distances_`` and the number of
    detected clusters are verified.
    """
    datasets = ["jain", "Aggregation", "unbalance", "h2mg_64_50"]  # , "h2mg_1024_50", "t4_8k", "bigger"
    single_thresholds = [np.r_[t] for t in (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)]

    for dataset in datasets:
        if dataset == "bigger":
            np.random.seed(123)
            n = 100000
            X = np.random.normal(size=(n, 32))
            labels = np.random.choice(np.r_[1, 2], n)
        else:
            X = np.loadtxt("%s/%s.data.gz" % (path, dataset), ndmin=2)
            labels = np.loadtxt("%s/%s.labels0.gz" % (path, dataset),
                                dtype=np.intc) - 1

        K = len(np.unique(labels[labels >= 0]))

        # center X + scale (NOT: standardize!), then jitter slightly
        X = (X - X.mean(axis=0)) / X.std(axis=None, ddof=1)
        X += np.random.normal(0, 0.0001, X.shape)

        print("%-20s n=%7d d=%4d" % (dataset, X.shape[0], X.shape[1]))

        # One threshold only: GIc should coincide with Genie.
        for g in single_thresholds:
            print(g, end="\t")
            gc.collect()

            t01 = time.time()
            _gic = genieclust.GIc(
                n_clusters=K, gini_thresholds=g, compute_full_tree=True)
            labels_gic = _gic.fit_predict(X)
            t11 = time.time()
            print("t_py=%.3f" % (t11 - t01), end="\t")

            labels_g = genieclust.Genie(
                n_clusters=K, gini_threshold=g[0]).fit_predict(X)

            assert np.all(np.diff(_gic.distances_) >= 0.0)
            found = np.unique(labels_gic[labels_gic >= 0])
            assert len(found) == K
            score = genieclust.compare_partitions.adjusted_rand_score(
                labels_gic, labels_g)
            assert score > 1 - 1e-6
            print()

        # Several thresholds, then an empty set of thresholds.
        for g in [np.arange(1, 8) / 10, np.empty(0)]:
            print(g, end="\t")
            gc.collect()

            t01 = time.time()
            _gic = genieclust.GIc(
                n_clusters=K, gini_thresholds=g, compute_full_tree=True)
            labels_gic = _gic.fit_predict(X)
            t11 = time.time()
            print("t_py=%.3f" % (t11 - t01), end="\t")
            assert np.all(np.diff(_gic.distances_) >= 0.0)

            t01 = time.time()
            _gic = genieclust.GIc(n_clusters=K, gini_thresholds=g, M=10)
            labels_gic = _gic.fit_predict(X)
            t11 = time.time()
            print("t_py=%.3f" % (t11 - t01), end="\t")
            # monotonicity of distances_ is not asserted for M=10
            # what tests should be added here???

            found = np.unique(labels_gic[labels_gic >= 0])
            assert len(found) == K
            print()
def __test_genie(metric='euclidean'):
    """Validate exact Genie and compare it with the R implementation.

    For every dataset and Gini threshold the merge distances must be
    non-decreasing and exactly ``k`` clusters must be returned.  When the
    R objects ``stats`` and ``genie`` are available and the metric is not
    ``'cosine'``, the partition must match the R result (adjusted Rand
    score above 1-1e-12).
    """
    datasets = ["s1", "Aggregation", "unbalance", "h2mg_64_50", "bigger"]  # , "h2mg_1024_50", "t4_8k"
    for dataset in datasets:
        if dataset == "bigger":
            np.random.seed(123)
            n, d, K = 10_000, 10, 2
            X = np.random.normal(size=(n, d))
            labels = np.random.choice(np.r_[0:K], n)
        else:
            X = np.loadtxt("%s/%s.data.gz" % (path, dataset), ndmin=2)
            labels = np.loadtxt("%s/%s.labels0.gz" % (path, dataset),
                                dtype=np.intp) - 1

        k = len(np.unique(labels[labels >= 0]))

        # center X + scale (NOT: standardize!), then jitter slightly
        X = (X - X.mean(axis=0)) / X.std(axis=None, ddof=1)
        X += np.random.normal(0, 0.0001, X.shape)

        #t01 = time.time()
        #hdbscan.RobustSingleLinkage().fit_predict(X)
        #t11 = time.time()
        #print("t_robustsl=%.3f" % (t11-t01), end="\t")

        #t01 = time.time()
        #hdbscan.HDBSCAN().fit_predict(X)
        #t11 = time.time()
        #print("t_hdbscan=%.3f" % (t11-t01), end="\t")

        for g in [0.01, 0.3, 0.5, 0.7, 1.0]:
            gc.collect()

            print("%-20s g=%.2f n=%5d d=%2d" % (
                dataset, g, X.shape[0], X.shape[1]), end="\t")

            t01 = time.time()
            _res1 = genieclust.Genie(
                k, gini_threshold=g, exact=True, affinity=metric,
                compute_full_tree=True)
            res1 = _res1.fit_predict(X) + 1
            t11 = time.time()
            print("t_py=%.3f" % (t11 - t01), end="\t")

            assert np.all(np.diff(_res1.distances_) >= 0.0)
            assert len(np.unique(res1)) == k

            r_available = stats is not None and genie is not None
            if r_available and metric != 'cosine':
                t02 = time.time()
                tree = genie.hclust2(objects=X, d=metric, thresholdGini=g)
                res2 = stats.cutree(tree, k)
                t12 = time.time()
                print("t_r=%.3f" % (t12 - t02), end="\t")

                res2 = np.array(res2, np.intp)
                assert len(np.unique(res2)) == k

                ari = genieclust.compare_partitions.adjusted_rand_score(
                    res1, res2)
                print("ARI=%.3f" % ari, end="\t")
                assert ari > 1.0 - 1e-12

                print("t_rel=%.3f" % ((t11 - t01) / (t12 - t02),), end="\t")

            res1, res2 = None, None
            print("")
def test_genie_precomputed():
    """Check Genie on precomputed distances and the ``compute_all_cuts`` mode.

    The partition obtained from a precomputed Euclidean distance vector
    (or, at random, its square form) must match the one computed directly
    from the data, with and without mlpack.  Afterwards, for ``M`` equal to
    2 and 25, every row ``k`` of the ``compute_all_cuts`` output must equal
    the labels of a separate fit with ``k`` clusters.
    """
    datasets = ["x1", "s1", "Aggregation"]  # , "h2mg_1024_50", "t4_8k", "bigger"
    for dataset in datasets:
        if dataset == "bigger":
            np.random.seed(123)
            n = 10000
            X = np.random.normal(size=(n, 2))
            labels = np.random.choice(np.r_[1, 2], n)
        else:
            X = np.loadtxt("%s/%s.data.gz" % (path, dataset), ndmin=2)
            labels = np.loadtxt("%s/%s.labels0.gz" % (path, dataset),
                                dtype=np.intp) - 1

        k = len(np.unique(labels[labels >= 0]))

        # jitter, then center X + scale (NOT: standardize!)
        X = X + np.random.normal(0, 0.0001, X.shape)
        X = (X - X.mean(axis=0)) / X.std(axis=None, ddof=1)
        X = X.astype("float32")

        # condensed distance vector; converted to a square matrix at random
        D = scipy.spatial.distance.pdist(X)
        if np.random.rand(1) > 0.5:
            D = scipy.spatial.distance.squareform(D)

        # mlpack is exercised first, and only when it is available
        mlpack_settings = [False] if mlpack is None else [True, False]

        for g in [0.01, 0.3, 0.5, 0.7, 1.0]:
            gc.collect()

            print("%-20s g=%.2f n=%5d d=%2d" % (
                dataset, g, X.shape[0], X.shape[1]), end="\t")

            _res1 = genieclust.Genie(
                k, gini_threshold=g, exact=True, affinity="precomputed")
            res1 = _res1.fit_predict(D) + 1

            for use_mlpack in mlpack_settings:
                _res2 = genieclust.Genie(
                    k, gini_threshold=g, exact=True,
                    compute_full_tree=True, affinity="euclidean",
                    mlpack_enabled=use_mlpack)
                res2 = _res2.fit_predict(X) + 1
                ari = genieclust.compare_partitions.adjusted_rand_score(
                    res1, res2)
                print("ARI=%.3f" % ari, end="\t")
                assert ari > 1.0 - 1e-12
                assert np.all(np.diff(_res2.distances_) >= 0.0)

            _res1, _res2 = None, None
            res1, res2 = None, None
            print("")

        # test compute_all_cuts, once per smoothing factor
        for M in (2, 25):
            K = 16
            g = 0.1
            res1 = genieclust.Genie(
                K, gini_threshold=g, exact=True, affinity="euclidean",
                compute_all_cuts=True, M=M).fit_predict(X)
            assert res1.shape[1] == X.shape[0]
            # res1.shape[0] == K+1 is not necessarily true!
            for k in range(1, res1.shape[0]):
                res2 = genieclust.Genie(
                    k, gini_threshold=g, exact=True, affinity="euclidean",
                    M=M).fit_predict(X)
                assert np.all(res2 == res1[k, :])
def get_timing(n, d, s, mu1, mu2, random_state):
    """Time several Genie configurations on a generated two-cluster dataset.

    The data come from ``generate_gKmg`` (two groups of ``n//2`` points in
    ``d`` dimensions).  Timed are: the approximate variant, the mlpack
    variant (only when ``d <= 10``) and the non-mlpack variant; after the
    approximate and the non-mlpack fits the same estimator is re-fitted
    for every Gini threshold.

    Returns
    -------
    list
        One ``register_result(...)`` value per timed fit, in run order.
    """
    dataset = "g2mg_%d_%s" % (d, s)
    s_cor = s * np.sqrt(d / 2)
    assert n % 2 == 0

    centres = np.array([[mu1] * d, [mu2] * d])
    X, labels0, labels1 = generate_gKmg(
        d, np.r_[n // 2, n // 2], centres, np.r_[s_cor, s_cor], random_state)
    labels_true = [labels0, labels1]

    res = list()
    gini_thresholds = [0.1, 0.3, 0.5, 0.7, 1.0]

    def record(method, threads, elapsed, labels_pred):
        # append one benchmark entry; the cluster count is always 2
        res.append(register_result(
            random_state, dataset, n, d, method, 2, threads,
            elapsed, labels_pred, labels_true))

    def sweep_thresholds(estimator, name_format):
        # re-fit an already fitted estimator for each Gini threshold
        for gini_threshold in gini_thresholds:
            t0 = time.time()
            estimator.set_params(gini_threshold=gini_threshold)
            labels_pred = estimator.fit_predict(X)
            t1 = time.time()
            record(name_format % gini_threshold, 0, t1 - t0, labels_pred)

    # approximate variant (construction is part of the timed section)
    t0 = time.time()
    last_g = genieclust.Genie(n_clusters=2, exact=False)
    labels_pred = last_g.fit_predict(X)
    t1 = time.time()
    record("Genie_0.3_approx", os.environ["OMP_NUM_THREADS"],
           t1 - t0, labels_pred)
    sweep_thresholds(last_g, "Genie_%.1f_approx")

    # mlpack variant, low-dimensional data only
    if d <= 10:
        t0 = time.time()
        last_g = genieclust.Genie(n_clusters=2, mlpack_enabled=True)
        labels_pred = last_g.fit_predict(X)
        t1 = time.time()
        record("Genie_0.3_mlpack", 1, t1 - t0, labels_pred)

    # variant with mlpack disabled
    t0 = time.time()
    last_g = genieclust.Genie(n_clusters=2, mlpack_enabled=False)
    labels_pred = last_g.fit_predict(X)
    t1 = time.time()
    record("Genie_0.3_nomlpack", os.environ["OMP_NUM_THREADS"],
           t1 - t0, labels_pred)
    sweep_thresholds(last_g, "Genie_%.1f")

    return res