def evaluate(self, memberships):
    groundTruth = self.groundTruth
    n_graphs = self.n_graphs
    individual_nmi = np.zeros([n_graphs])
    individual_ari = np.zeros([n_graphs])
    individual_mcr = np.zeros([n_graphs])
    for n in range(n_graphs):
        individual_nmi[n] = nmi(memberships[n], groundTruth[n])
        individual_ari[n] = ari(memberships[n], groundTruth[n])
        individual_mcr[n] = mcr(memberships[n], groundTruth[n])
    # stack the per-graph label vectors and score them as one big partition
    trueMemberships_stacked = np.reshape(np.hstack(groundTruth), [-1])
    memberships_stacked = np.hstack(memberships)
    overall_nmi = nmi(memberships_stacked, trueMemberships_stacked)
    overall_ari = ari(memberships_stacked, trueMemberships_stacked)
    overall_mcr = mcr(memberships_stacked, trueMemberships_stacked)
    return {
        "NMI": {'nmi': np.mean(individual_nmi), 'overall_nmi': overall_nmi},
        "ARI": {'ari': np.mean(individual_ari), 'overall_ari': overall_ari},
        "MCR": {'mcr': np.mean(individual_mcr), 'overall_mcr': overall_mcr}
    }
def test_leiden(self):
    m, w, ll = uncurl.run_state_estimation(self.data_subset, 8,
                                           max_iters=20, inner_max_iters=50)
    print('nmi basic: ' + str(nmi(self.labels, w.argmax(0))))
    g = clustering_methods.create_graph(w.T, metric='cosine')
    leiden_clustering = clustering_methods.run_leiden(g)
    self.assertTrue(nmi(self.labels, leiden_clustering) >= 0.7)
    louvain_clustering = clustering_methods.run_louvain(g)
    self.assertTrue(nmi(self.labels, louvain_clustering) >= 0.7)
def test_clustering(n_runs=20, alpha=0.5):
    nmis_both = []
    nmis_attributes = []
    nmis_structure = []
    for i in range(n_runs):
        print("Run number {0}".format(i))
        ensemble_density_huge('file.csv', "'\t'")
        dist_dense = pd.read_csv("./matrix.csv", delimiter="\t", header=None).values
        dist_dense = dist_dense[:, :-1]
        sims_attributes = ensemble_attributes("file_attributes.csv", "\t")
        sim_attributes = pd.read_csv("./matrix_uet.csv", delimiter="\t", header=None).values
        sim_attributes = sim_attributes[:, :-1]
        dist_attributes = sim_to_dist(np.array(sim_attributes))
        # blend structural and attribute distances
        dist = alpha * dist_dense + (1 - alpha) * dist_attributes
        dist = dist / 2
        model_kmeans = KMeans(n_clusters=len(set(true)))
        scaler = QuantileTransformer(n_quantiles=10)
        dist_scaled = scaler.fit_transform(dist)
        dist_dense_scaled = scaler.fit_transform(dist_dense)
        dist_attributes_scaled = scaler.fit_transform(dist_attributes)
        results_dense = TSNE(metric="precomputed").fit_transform(dist_dense_scaled)
        results_dense_both = TSNE(metric="precomputed").fit_transform(dist_scaled)
        results_dense_attributes = TSNE(metric="precomputed").fit_transform(dist_attributes_scaled)
        labels_dense_kmeans_both = model_kmeans.fit_predict(results_dense_both)
        labels_dense_kmeans_attributes = model_kmeans.fit_predict(results_dense_attributes)
        labels_dense_kmeans_structure = model_kmeans.fit_predict(results_dense)
        nmis_both.append(
            nmi(labels_dense_kmeans_both, true, average_method="arithmetic"))
        nmis_attributes.append(
            nmi(labels_dense_kmeans_attributes, true, average_method="arithmetic"))
        nmis_structure.append(
            nmi(labels_dense_kmeans_structure, true, average_method="arithmetic"))
    print("Structure : {0}, {1}".format(np.mean(nmis_structure), np.std(nmis_structure)))
    print("Attributes : {0}, {1}".format(np.mean(nmis_attributes), np.std(nmis_attributes)))
    print("Both : {0}, {1}".format(np.mean(nmis_both), np.std(nmis_both)))
    return (nmis_structure, nmis_attributes, nmis_both)
def test_run_uncurl(self):
    sca = sc_analysis.SCAnalysis(self.data_dir,
                                 clusters=8,
                                 frac=0.2,
                                 data_filename='data.mtx',
                                 max_iters=20,
                                 inner_max_iters=50)
    sca.run_uncurl()
    self.assertTrue(sca.has_w)
    self.assertTrue(sca.has_m)
    self.assertTrue(sca.w.shape[0] == 8)
    self.assertTrue(sca.w.shape[1] == self.data.shape[1])
    self.assertTrue(os.path.exists(sca.w_f))
    self.assertTrue(os.path.exists(sca.m_f))
    print(nmi(sca.labels, self.labs))
    self.assertTrue(nmi(sca.labels, self.labs) > 0.65)
def test_add_color_track(self):
    sca = sc_analysis.SCAnalysis(self.data_dir,
                                 frac=0.2,
                                 clusters=8,
                                 data_filename='data.mtx',
                                 baseline_dim_red='tsvd',
                                 dim_red_option='MDS',
                                 clustering_method='leiden',
                                 cell_frac=1.0,
                                 max_iters=20,
                                 inner_max_iters=10)
    sca.add_color_track('true_labels', self.labs, is_discrete=True)
    true_labels, is_discrete = sca.get_color_track('true_labels')
    self.assertTrue(nmi(true_labels, self.labs) > 0.99)
    top_genes, top_pvals = sca.calculate_diffexp('true_labels')
    self.assertEqual(len(top_genes), 8)
    self.assertEqual(len(top_pvals), 8)
    sca.add_color_track('true_labels_2', self.labs, is_discrete=False)
    true_labels_2, _ = sca.get_color_track('true_labels_2')
    self.assertTrue((true_labels_2.astype(int) == self.labs).all())
    # repeated pairwise call should return results with the same shapes
    pairwise_genes, pairwise_pvals = sca.calculate_diffexp('true_labels', mode='pairwise')
    self.assertEqual(pairwise_genes.shape, pairwise_pvals.shape)
    pairwise_genes, pairwise_pvals = sca.calculate_diffexp('true_labels', mode='pairwise')
    self.assertEqual(pairwise_genes.shape, pairwise_pvals.shape)
    self.assertEqual(pairwise_genes.shape[0], 8)
    top_genes, top_pvals = sca.calculate_diffexp('true_labels')
    self.assertEqual(len(top_genes[0]), len(sca.gene_names))
    self.assertEqual(len(top_genes), 8)
    self.assertEqual(len(top_pvals), 8)
def check_clusterpurity(f, cluster, dataset, gnd_label, sys_label):
    """
    Check cluster purity with respect to x-vectors belonging to single speakers.
    """
    clusterpurity = []
    clustervar = []
    fullclasslabel = []
    clean_ind = []
    for c in cluster:
        classlabel = []
        clustervar.append(np.var(c))
        for a in c:
            # keep only x-vectors with exactly one ground-truth speaker
            if len(gnd_label[a]) == 1:
                classlabel.append(gnd_label[a][0])
                clean_ind.append(a)
            # else:
            #     sys_label = np.delete(sys_label, a)
        fullclasslabel.extend(classlabel)
        classlabel = np.array(classlabel)
        if len(classlabel) == 0:
            clusterpurity.append(0)
            continue
        unilabel = mostFrequent(classlabel, len(classlabel))
        purity = (len(np.where(classlabel == unilabel)[0]) / len(classlabel)) * 100
        clusterpurity.append(purity)
    sys_label = sys_label[clean_ind]
    Nmi_score = nmi(fullclasslabel, sys_label.tolist())
    print('NMI score for n_cluster:{} is {}'.format(len(cluster), Nmi_score))
    print('cluster purity for n_cluster:{} is {}'.format(len(cluster), clusterpurity))
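# A hypothetical call illustrating the expected input shapes (not from the
# original source): `cluster` is a list of per-cluster x-vector index lists,
# `gnd_label[i]` is the list of ground-truth speakers for x-vector i, and
# `sys_label` is the per-x-vector system label array. Assumes numpy as np,
# sklearn's normalized_mutual_info_score as nmi, and a mostFrequent helper
# defined elsewhere in this codebase.
cluster = [[0, 1], [2, 3]]
gnd_label = [['spk1'], ['spk1'], ['spk2'], ['spk2', 'spk3']]  # index 3 has two speakers, so it is skipped
sys_label = np.array([0, 0, 1, 1])
check_clusterpurity(None, cluster, None, gnd_label, sys_label)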
def clust(data_path, label_path, pca_com, phate_com):
    input_path = data_path + ".csv"
    label_path = label_path + ".csv"
    X = pd.read_csv(input_path, header=None)
    X = X.drop(0)
    X = np.array(X)
    X = X.transpose()
    pca = PCA(n_components=pca_com)
    b = pca.fit_transform(X)
    phate_op = phate.PHATE(n_components=phate_com)
    data_phate = phate_op.fit_transform(b)
    label = pd.read_csv(label_path)
    y = np.array(label)
    label = y.ravel()
    c = label.max()
    centList, clusterAssment = biKmeans(data_phate, c)
    julei = clusterAssment[:, 0]
    y = np.array(julei)
    julei = y.ravel()
    print('NMI value is %f \n' % nmi(julei.flatten(), label.flatten()))
    print('ARI value is %f \n' % ari(julei.flatten(), label.flatten()))
    print('HOM value is %f \n' % metrics.homogeneity_score(julei, label))
    print('AMI value is %f \n' % metrics.adjusted_mutual_info_score(label, julei))
    return julei
def eval_cluster_on_test(self):
    # Embed points in the test data into the latent space
    inp_encoder = self.data_test
    latent_matrix = self.sess.run(self.z,
                                  feed_dict={
                                      self.x_input: inp_encoder,
                                      self.keep_prob: 1.0
                                  })
    labels = self.labels_test
    K = np.size(np.unique(labels))
    kmeans = KMeans(n_clusters=K, random_state=0).fit(latent_matrix)
    y_pred = kmeans.labels_
    print('Computing NMI ...')
    NMI = nmi(labels.flatten(), y_pred.flatten())
    print('Done !')
    print('NMI = {}'.format(NMI))
    if not os.path.exists('Res_DRA/tune_logs'):
        os.makedirs('Res_DRA/tune_logs')
    out_file_name = 'Res_DRA/tune_logs/Metrics_{}.txt'.format(self.dataset_name)
    with open(out_file_name, 'a') as f:
        f.write('\n{}, NMI = {}'.format(self.model_dir, NMI))
def training_and_testing():
    k = 40
    # X_train, y_train = data_collect_for_wefcm()
    X_train, X_test, y_train, y_test = data_collection_from_file()
    # X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
    start = timeit.default_timer()
    # U = ARWEFCM(X_train, k)
    print(np.shape(X_train))
    U, C = WEFCM(X_train, k)
    # X_test, y_test = data_collection_from_test_file()  # this is ofcm
    # C = calculateClusterCenter(U, X_train, k)
    # my_df = pd.DataFrame(C)
    # my_df.to_csv('out.csv', index=False, header=False)
    y1_train = label_cluster(X_train, y_train, C)
    pre_labels = getpredicted_labels(U, len(y_train))
    stop = timeit.default_timer()
    print('run time:= ', stop - start)
    r1 = nmi(y_train, pre_labels)
    r2 = ari(y_train, pre_labels)
    print('NMI:= ', r1)
    print('ARI:= ', r2)
    y1_test = test_data(X_test, C, y1_train)
    accuracy = float(np.sum(y1_test == y_test)) / len(y_test)
    print('accuracy:= ', accuracy)
    # print(classification_report(y_test, y1_test, target_names=y_test))
    # f = open("result1.ods", "a+")
def graph_learning_perf_eval(L_orig, L_pred):
    """
    Evaluate the performance of graph learning algorithms.

    L_orig : ground-truth graph Laplacian
    L_pred : learned graph Laplacian
    """
    n = L_orig.shape[0]
    idx_non_diag = np.triu_indices(n, 1)  # upper triangle, excluding the diagonal
    # recover (non-negative) edge weights from the Laplacian off-diagonals
    L_orig_nd = np.diag(np.diag(L_orig)) - L_orig
    edges_groundtruth = (L_orig_nd > 1e-4)[idx_non_diag] + 0
    L_pred_nd = np.diag(np.diag(L_pred)) - L_pred
    edges_learned = (L_pred_nd > 1e-4)[idx_non_diag] + 0
    condition_positive = np.sum(edges_groundtruth)
    prediction_positive = np.sum(edges_learned)
    true_positive = np.sum(np.logical_and(edges_groundtruth, edges_learned))
    print(f"condition positive: {condition_positive}, "
          f"prediction positive: {prediction_positive}, "
          f"true positive: {true_positive}")
    precision = true_positive / prediction_positive
    recall = true_positive / condition_positive
    if precision == 0 or recall == 0:
        f = 0
    else:
        f = 2 * precision * recall / (precision + recall)
    NMI = nmi(edges_groundtruth, edges_learned)
    R, _ = pearsonr(L_orig[idx_non_diag], L_pred[idx_non_diag])
    return precision, recall, f, NMI, R
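# A minimal usage sketch with synthetic data (not from the original source):
# compare a slightly perturbed copy of a 3-node path-graph Laplacian against
# the ground truth. Assumes numpy as np, scipy.stats.pearsonr, and sklearn's
# normalized_mutual_info_score as nmi are imported as in the function above.
L_orig = np.array([[ 1., -1.,  0.],
                   [-1.,  2., -1.],
                   [ 0., -1.,  1.]])
noise = np.random.normal(scale=0.01, size=(3, 3))
L_pred = L_orig + (noise + noise.T) / 2  # symmetric perturbation
precision, recall, f, NMI, R = graph_learning_perf_eval(L_orig, L_pred)
print(precision, recall, f, NMI, R)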
def testSparsePoissonLookup(self):
    data = self.data
    labels = []
    for i in range(data.shape[1]):
        cell = data[:, i]
        scores = bulk_lookup(self.bulk_means, cell)
        labels.append(scores[0][0])
    nmi_val = nmi(self.labs, labels)
    self.assertTrue(nmi_val > 0.99)
def testCorrLookup(self):
    data_dense = self.data_dense
    labels = []
    for i in range(data_dense.shape[1]):
        cell = data_dense[:, i]
        scores = bulk_lookup(self.bulk_means, cell, method='corr')
        labels.append(scores[0][0])
    nmi_val = nmi(self.labs, labels)
    self.assertTrue(nmi_val > 0.85)
def test_merge(self):
    # create distance matrix
    # find the min distance between two cluster pairs
    m = self.m
    w = self.w
    data_subset = self.data_subset
    clusters_to_merge = [0, 1, 2]
    # merge the min distance pair
    m_merge, w_merge = relabeling.merge_clusters(data_subset, m, w,
                                                 clusters_to_merge,
                                                 max_iters=20,
                                                 inner_max_iters=50)
    nmi_base = nmi(self.labels, w.argmax(0))
    nmi_merge = nmi(self.labels, w_merge.argmax(0))
    print('nmi after merging the closest pairs: ' + str(nmi_merge))
    self.assertTrue(nmi_merge >= nmi_base - 0.3)
    self.assertEqual(w_merge.shape[0], w.shape[0] - 2)
def eval_clustering(y_true, y_pred):
    # re-index both label vectors to consecutive integers
    _, y_true = np.unique(y_true, return_inverse=True)
    _, y_pred = np.unique(y_pred, return_inverse=True)
    acc_score = accuracy_clustering(y_true, y_pred)
    pu_score = purity(y_true, y_pred)
    nmi_score = nmi(y_true, y_pred, average_method='geometric')  # average_method='arithmetic'
    ri_score = ri(y_true, y_pred)
    return acc_score, pu_score, nmi_score, ri_score
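# Hypothetical usage sketch: because both label vectors are re-indexed via
# np.unique, the inputs may use any encoding (strings, non-contiguous ints).
# Assumes accuracy_clustering, purity, and ri (Rand index) are helpers
# defined elsewhere in this codebase.
acc_s, pu_s, nmi_s, ri_s = eval_clustering(['a', 'a', 'b', 'b', 'c'],
                                           [1, 1, 0, 0, 2])
print(acc_s, pu_s, nmi_s, ri_s)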
def test_json(self):
    sca = sc_analysis.SCAnalysis(self.data_dir,
                                 frac=0.2,
                                 clusters=8,
                                 data_filename='data.mtx',
                                 baseline_dim_red='tsvd',
                                 dim_red_option='umap',
                                 normalize=1,
                                 cell_frac=0.5,
                                 max_iters=20,
                                 inner_max_iters=20,
                                 use_fdr=1)
    sca.run_full_analysis()
    sca.save_json_reset()
    # delete the whole sca, re-load it from json
    del sca
    sca = sc_analysis.SCAnalysis(self.data_dir)
    sca = sca.load_params_from_folder()
    self.assertEqual(sca.params['clusters'], 8)
    self.assertEqual(sca.params['baseline_dim_red'], 'tsvd')
    self.assertEqual(sca.params['dim_red_option'], 'umap')
    self.assertEqual(sca.params['cell_frac'], 0.5)
    self.assertEqual(sca.params['genes_frac'], 0.2)
    self.assertTrue(sca.params['normalize'])
    self.assertTrue(sca.params['use_fdr'])
    self.assertEqual(sca.uncurl_kwargs['max_iters'], 20)
    self.assertTrue(sca.has_dim_red)
    self.assertTrue(sca.has_w)
    self.assertTrue(sca.has_m)
    self.assertEqual(sca.cell_subset.shape[0], 400)
    means = sca.cluster_means
    self.assertEqual(means.shape[1], 8)
    self.assertEqual(means.shape[0], self.data.shape[0])
    # TODO: do re-clustering
    sca.add_color_track('true_labels', self.labs, is_discrete=True)
    old_labels = sca.labels
    sca.relabel('louvain')
    self.assertFalse((old_labels == sca.labels).all())
    true_labels, is_discrete = sca.get_color_track('true_labels')
    self.assertTrue(nmi(sca.labels, true_labels) > 0.65)
    sca.relabel('leiden')
    self.assertTrue(nmi(sca.labels, true_labels) > 0.65)
def test_clustering_structure(n_runs=20):
    nmis_gt = []
    nmis_mcl = []
    nmis_louvain = []
    for i in range(n_runs):
        print("Run number {0}".format(i))
        ensemble_density_huge("file.csv", "\\t")
        dist_dense = pd.read_csv("./matrix.csv", delimiter="\t", header=None).values
        dist_dense = dist_dense[:, :-1]
        scaler = QuantileTransformer(n_quantiles=10)
        dist_dense_scaled = scaler.fit_transform(dist_dense)
        results_dense = TSNE(metric="precomputed").fit_transform(dist_dense_scaled)
        model_kmeans = KMeans(n_clusters=len(set(true)))
        labels_dense_kmeans = model_kmeans.fit_predict(results_dense)
        clusters_mcl = [0 for _ in range(len(adj))]
        result_mcl = mc.run_mcl(adj)  # run MCL with default parameters
        clusters = mc.get_clusters(result_mcl)  # get clusters
        # assign each node the index of the MCL cluster it belongs to
        # (enumerate avoids clobbering the run counter i)
        for c_idx, cluster in enumerate(clusters):
            for j in cluster:
                clusters_mcl[j] = c_idx
        partition = louvain.best_partition(G)
        labels_louvain = [v for k, v in partition.items()]
        nmis_gt.append(nmi(labels_dense_kmeans, true, average_method="arithmetic"))
        nmis_mcl.append(nmi(clusters_mcl, true, average_method="arithmetic"))
        nmis_louvain.append(nmi(labels_louvain, true, average_method="arithmetic"))
    print("GT : {0}, {1}".format(np.mean(nmis_gt), np.std(nmis_gt)))
    print("MCL : {0}, {1}".format(np.mean(nmis_mcl), np.std(nmis_mcl)))
    print("Louvain : {0}, {1}".format(np.mean(nmis_louvain), np.std(nmis_louvain)))
    return (nmis_gt, nmis_mcl, nmis_louvain)
def test_split(self):
    # 5. build a distance matrix between clusters, find the closest pair
    # 6. run split_cluster - split the largest cluster
    m = self.m
    w = self.w
    data_subset = self.data_subset
    labels = self.labels
    clusters = w.argmax(0)
    cluster_counts = Counter(clusters)
    top_cluster, top_count = cluster_counts.most_common()[0]
    m_split, w_split = relabeling.split_cluster(data_subset, m, w,
                                                top_cluster,
                                                max_iters=20,
                                                inner_max_iters=50)
    nmi_base = nmi(labels, w.argmax(0))
    nmi_split = nmi(labels, w_split.argmax(0))
    print('nmi after splitting the largest cluster: ' + str(nmi_split))
    self.assertTrue(nmi_split >= nmi_base - 0.02)
    self.assertEqual(w_split.shape[0], w.shape[0] + 1)
def test_new(self):
    """
    Tests creating a new cluster from a selection of cells.
    """
    data_subset = self.data_subset
    m = self.m
    w = self.w
    selected_cells = list(range(375, w.shape[1]))
    m_new, w_new = relabeling.new_cluster(data_subset, m, w,
                                          selected_cells,
                                          max_iters=20,
                                          inner_max_iters=50)
    nmi_base = nmi(self.labels, w.argmax(0))
    nmi_new = nmi(self.labels, w_new.argmax(0))
    self.assertTrue(w_new.shape[0] == 9)
    print('nmi after creating a new cluster: ' + str(nmi_new))
    self.assertTrue(nmi_new >= nmi_base - 0.1)
    self.assertEqual(w_new.shape[0], w.shape[0] + 1)
    # at least half of the selected cells should land in the new cluster
    self.assertTrue(sum(w_new.argmax(0)[selected_cells] == 8) >= len(selected_cells) / 2)
def setUp(self):
    # 1. load data
    data = scipy.io.loadmat('data/10x_pooled_400.mat')
    data_csc = data['data']
    self.labels = data['labels'].flatten()
    # gene_names = data['gene_names']
    # 2. gene selection
    genes = uncurl.max_variance_genes(data_csc)
    self.data_subset = data_csc[genes, :]
    # gene_names_subset = gene_names[genes]
    # 3. run uncurl
    m, w, ll = uncurl.run_state_estimation(self.data_subset, 8,
                                           max_iters=20,
                                           inner_max_iters=50)
    print('nmi basic: ' + str(nmi(self.labels, w.argmax(0))))
    self.m = m
    self.w = w
def test_10x_auto_cluster(self):
    """
    Test using automatic cluster size determination.
    """
    from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
    # gene selection
    genes = uncurl.max_variance_genes(self.data)
    data_subset = self.data[genes, :]  # smaller subset of genes
    # use fewer iterations than the default so the test finishes faster
    M, W, ll = uncurl.run_state_estimation(data_subset, clusters=0,
                                           max_iters=10, inner_max_iters=80)
    labels = W.argmax(0)
    # NMI should be > 0.75 on 10x_pure_pooled
    # (threshold lowered to account for the reduced iteration count)
    self.assertTrue(nmi(self.labs, labels) > 0.6)
    # test RMSE
    test_data = np.dot(M, W)
    error = data_subset.toarray() - test_data
    error = np.sqrt(np.mean(error**2))
    print('data subset RMSE:', error)
    self.assertTrue(error < 2.0)
                                 n_col_clusters=4, max_iter=100)
model_2._fit_single(X, random_state=None)

# In[5]:

model_2.fit(X)

# In[8]:

model_2.row_labels_

# In[9]:

predicted_labels_2 = model_2.row_labels_
print(nmi(true_labels, predicted_labels_2),
      acc(true_labels, predicted_labels_2),
      ars(true_labels, predicted_labels_2),
      amis(true_labels, predicted_labels_2))

# In[11]:

model_5 = NMTFcoclus_ONM3F.ONM3F(n_row_clusters=4, n_col_clusters=4)
model_5.fit(X)

# In[15]:

predicted_labels_5 = model_5.row_labels_
print(nmi(true_labels, predicted_labels_5),
      acc(true_labels, predicted_labels_5),
      ars(true_labels, predicted_labels_5),
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 2 19:00:21 2018

@author: jimmybow
"""
from pyclustering.utils import read_sample
from pyclustering.samples.definitions import FCPS_SAMPLES
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
from waveCluster import *

data = np.array(read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS))
tags = waveCluster(data, scale=144, threshold=-0.5, plot=True)
true_tags = np.arange(len(data)) >= 400
draw2Darray(data[:, 0], data[:, 1], tags)
draw2Darray(data[:, 0], data[:, 1], true_tags)
print(pd.Series.value_counts(tags))
# normalized mutual information score: normalized_mutual_info_score
print(nmi(true_tags, tags))
""" import pandas as pd import numpy as np from sklearn.decomposition import PCA from sklearn.cluster import KMeans from sklearn import metrics from sklearn.metrics.cluster import adjusted_rand_score as ari from sklearn.metrics.cluster import normalized_mutual_info_score as nmi X = pd.read_csv('yan/yan.csv', header=None) X = np.array(X) X = X.transpose() label = pd.read_csv('yan/yan_label.csv') y = np.array(label) label = y.ravel() pca = PCA(n_components=2) A = pca.fit_transform(X) c = label.max() kk = KMeans(n_clusters=c) julei = kk.fit(A) julei = julei.labels_ print('NMI value is %f \n' % nmi(julei.flatten(), label.flatten())) print('ARI value is %f \n' % ari(julei.flatten(), label.flatten())) print('HOM value is %f \n' % metrics.homogeneity_score(julei, label)) print('AMI value is %f \n' % metrics.adjusted_mutual_info_score(label, julei))
def evaluate(result, clu_list):
    eva = nmi(clu_list, result, average_method='arithmetic')
    # print("Using the NMI evaluation method, prediction accuracy: %s" % eva)
    # print(nmi(clu_list, result, average_method='warn'))
    return eva
filename = address + str(noise) + '.csv'
data = []
with open(filename) as f:
    f_csv = csv.reader(f)
    for row in f_csv:
        data.append(row)
data = np.array(data).astype(float)

# finished reading, start clustering
normData = normalizeData(data)
scale = 128
dim = 2
wavelet = 'db2'
wavelength = {'db1': 0, 'db2': 1, 'bior1.3': 2}
dataDic = map2ScaleDomain(normData, scale)
dwtResult = ndWT(dataDic, 2, scale, wavelet)
threshold = getThreshold(dwtResult)
print("threshold:")
print(threshold)
# show the threshold on the chart
showThreshold(dwtResult, threshold)
lineLen = scale // 2 + wavelength.get(wavelet)  # integer division keeps lineLen an int
result = thresholding(dwtResult, threshold, lineLen, dim)
tags = markData(normData, result, lineLen)
# show the result after clustering
draw2Darray(normData[:, 0], normData[:, 1], np.array(tags))
quality = nmi(list(normData[:, normData.shape[1] - 1]), tags)
print("NMI:")
print(quality)
sca.cell_sample
sca.cell_subset
sca.cell_subset.shape
labels = sca.w.argmax(0)

from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

labels_b = pd.read_csv('80k_cluster_numbers.csv')
labels_b = labels_b.iloc[:, 1]
labels_b = labels_b.values
labels_b_subset = labels_b[sca.cell_subset]
nmi(labels_b_subset, labels)

from sklearn.metrics.cluster import adjusted_rand_score as ari
ari(labels_b_subset, labels)

# cross-tabulate the two clusterings and plot the row-normalized counts
cluster_counts = np.zeros((len(set(labels_b)), len(set(labels))))
for i, j in zip(labels_b_subset, labels):
    cluster_counts[i, j] += 1
plt.figure(figsize=(10, 25))
sns.heatmap(cluster_counts / cluster_counts.sum(1)[:, np.newaxis],
            yticklabels=sorted(list(set(labels_b_subset))),
            vmin=0, vmax=1, linewidths=0.5)
plt.xlabel('UNCURL clusters')
        X, sigma, K, W, bound_, SLK_option, C_init,
        bound_lambda=lmbda, bound_iterations=200)
    if ts:
        trivial[count] = 1
        continue
    # Evaluate the performance on the validation set
    current_nmi = nmi(gnd_val, l[val_ind])
    acc, _ = get_accuracy(gnd_val, l[val_ind])
    print('lambda = ', lmbda, ' : NMI= %0.4f' % current_nmi)
    print('accuracy %0.4f' % acc)
    if current_nmi > bestnmi:
        bestnmi = current_nmi
        best_lambda_nmi = lmbda
    if acc > bestacc:
        bestacc = acc
        best_lambda_acc = lmbda
        best_C_init = C_init.copy()
print('Best result: NMI= %0.4f' % bestnmi, '|NMI lambda = ',
if __name__=="__main__": import sys import math as m import numpy as np from sklearn.metrics.cluster import normalized_mutual_info_score as nmi fp=open(sys.argv[1],'r') featLen=len(fp.readline().strip().split("\t"))-1 fp.close() # dlt=sys.argv[2] # if dlt == "0": dlt="\t" col=int(sys.argv[2]) train=np.loadtxt(sys.argv[1],dtype=str,delimiter=dlt,usecols=(range(featLen))) target=np.loadtxt(sys.argv[1],dtype=str,delimiter=dlt,usecols=(featLen,)) mi=[] for i in range(featLen): mi.append((nmi(train[:,i],target),i)) # sys.exit() print sorted(mi, reverse=True) # print "Relevance = ",mi target=train[:,col] mi=[] for i in range(0,featLen): if i==col: continue data=train[:,i] mi.append((nmi(data,target),i)) print 'Redundance of ',sys.argv[2],sorted(mi, reverse=True)
if __name__ == '__main__':
    import time

    # load/subset data
    data_mat = scipy.io.loadmat('../data/10x_pooled_400.mat')
    data = data_mat['data']
    gene_subset = uncurl.max_variance_genes(data)
    data_subset = data[gene_subset, :]
    # run bnpy clustering?
    true_labels = data_mat['labels'].flatten()
    t0 = time.time()
    selected_k, labels = bnpy_select_clusters(data_subset)
    print(selected_k)
    print('nmi: ' + str(nmi(true_labels, labels)))
    print('time: ' + str(time.time() - t0))

    data_mat_2 = scipy.io.loadmat('../../uncurl_python/data/SCDE_k2_sup.mat')
    data = data_mat_2['Dat']
    t0 = time.time()
    selected_k, labels = bnpy_select_clusters(data)
    true_labels = data_mat_2['Lab'].flatten()
    print(selected_k)
    print('nmi: ' + str(nmi(true_labels, labels)))
    print('time: ' + str(time.time() - t0))

    # Zeisel 7-cluster dataset
    data_mat_3 = scipy.io.loadmat('../../uncurl_python/data/GSE60361_dat.mat')
    data = data_mat_3['Dat']
    gene_subset = uncurl.max_variance_genes(data)
def nmi_acc(U, labels):
    X = np.argmax(U, axis=1)
    return nmi(X, labels), acc(X, labels)
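# Minimal sketch (hypothetical data): U is a soft-membership matrix with one
# row per sample and one column per cluster; argmax converts it to hard
# labels. Assumes an `acc` clustering-accuracy helper is defined elsewhere.
U = np.array([[0.9, 0.1],
              [0.2, 0.8],
              [0.7, 0.3]])
print(nmi_acc(U, np.array([0, 1, 0])))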
# 1. load data
data = scipy.io.loadmat('data/10x_pooled_400.mat')
data_csc = data['data']
labels = data['labels'].flatten()
gene_names = data['gene_names']

# 2. gene selection
genes = uncurl.max_variance_genes(data_csc)
data_subset = data_csc[genes, :]
gene_names_subset = gene_names[genes]

# 3. run uncurl
m, w, ll = uncurl.run_state_estimation(data_subset, 8)
print('nmi basic: ' + str(nmi(labels, w.argmax(0))))

# 4. run clustering
for metric in ['euclidean', 'cosine']:
    for n_neighbors in [10, 15, 20]:
        print('n_neighbors: ', n_neighbors, ' metric: ', metric)
        w_graph = clustering_methods.create_graph(w.T, n_neighbors=n_neighbors, metric=metric)
        clusters = clustering_methods.run_leiden(w_graph)
        print('nmi leiden: ' + str(nmi(labels, clusters)))
        clusters_louvain = clustering_methods.run_louvain(w_graph)
        print('nmi louvain: ' + str(nmi(labels, clusters_louvain)))

# 5. try running clustering w/o uncurl
clustering_result = clustering_methods.baseline_cluster(data_subset)
# TODO: figure out cuts