def testMaxVarGenes(self):
    """
    Test max_variance_genes for dense and sparse matrices.
    """
    n_genes = self.data_sparse.shape[0]
    genes1 = uncurl.max_variance_genes(self.data_dense, nbins=1, frac=0.5)
    genes2 = uncurl.max_variance_genes(self.data_sparse, nbins=1, frac=0.5)
    self.assertEqual(set(genes1), set(genes2))
    self.assertEqual(len(genes1), int(0.5*n_genes))
    genes1 = uncurl.max_variance_genes(self.data_dense, nbins=5, frac=0.2)
    genes2 = uncurl.max_variance_genes(self.data_sparse, nbins=5, frac=0.2)
    self.assertEqual(set(genes1), set(genes2))
    self.assertEqual(len(genes1), 5*int((n_genes/5)*0.2))
def setUp(self):
    dat = scipy.io.loadmat('data/10x_pooled_400.mat')
    self.data = scipy.sparse.csc_matrix(dat['data'])
    self.labels = dat['labels'].flatten()
    # 2. gene selection
    genes = uncurl.max_variance_genes(self.data)
    self.data_subset = self.data[genes, :]
def preproc_data(data):
    """
    Basic data preprocessing: max-variance gene selection, cell
    normalization, log transform, and truncated SVD (8 components).
    """
    import uncurl
    from uncurl.preprocessing import log1p, cell_normalize
    from sklearn.decomposition import TruncatedSVD
    gene_subset = uncurl.max_variance_genes(data)
    data_subset = data[gene_subset, :]
    tsvd = TruncatedSVD(8)
    data_tsvd = tsvd.fit_transform(log1p(cell_normalize(data_subset)).T)
    return data_tsvd
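# Usage sketch (illustrative, not part of the original script): preproc_data
# takes a (genes, cells) matrix and returns a (cells, 8) array of SVD
# coordinates. The input file is the 10x dataset used elsewhere in these
# tests.
if __name__ == '__main__':
    import scipy.io
    import scipy.sparse
    dat = scipy.io.loadmat('data/10x_pooled_400.mat')
    data = scipy.sparse.csc_matrix(dat['data'])  # shape: (genes, cells)
    data_tsvd = preproc_data(data)
    print(data_tsvd.shape)  # expected: (n_cells, 8)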
def test_real_data_pairwise(self):
    mat = scipy.io.loadmat('data/10x_pooled_400.mat')
    data = mat['data']
    # do uncurl, followed by update_m
    selected_genes = uncurl.max_variance_genes(data)
    data_subset = data[selected_genes, :]
    m, w, ll = uncurl.run_state_estimation(data_subset, 8, max_iters=20,
                                           inner_max_iters=50)
    m = uncurl.update_m(data, m, w, selected_genes)
    # test pairwise
    all_pvs, all_ratios = poisson_diffexp.uncurl_test_pairwise(m, w,
                                                               mode='counts')
    self.assertEqual(all_pvs.shape, (data.shape[0], 8, 8))
    self.assertEqual(all_ratios.shape, (data.shape[0], 8, 8))
    self.assertTrue((all_pvs < 0.001).sum() < data.shape[0])
    self.assertTrue((all_pvs < 0.01).sum() > 100)
def preproc_data(data, gene_subset=False, **kwargs):
    """
    Basic data preprocessing before running gap score.

    Assumes that data is a matrix of shape (genes, cells). Returns a matrix
    of shape (cells, 8), using the first 8 SVD components. Why 8? It's an
    arbitrary selection...
    """
    import uncurl
    from uncurl.preprocessing import log1p, cell_normalize
    from sklearn.decomposition import TruncatedSVD
    data_subset = data
    if gene_subset:
        gene_subset = uncurl.max_variance_genes(data)
        data_subset = data[gene_subset, :]
    tsvd = TruncatedSVD(min(8, data_subset.shape[0] - 1))
    data_tsvd = tsvd.fit_transform(log1p(cell_normalize(data_subset)).T)
    return data_tsvd
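# Usage sketch (illustrative; the synthetic data below is an assumption):
# gene_subset=True restricts to max-variance genes before the SVD, and the
# min(8, ...) guard keeps the SVD rank valid for very small matrices.
if __name__ == '__main__':
    import numpy as np
    data = np.random.poisson(1.0, size=(500, 100))  # (genes, cells)
    data_tsvd = preproc_data(data, gene_subset=True)
    print(data_tsvd.shape)  # expected: (100, 8)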
def run_partition(data, smallk, largek, method, max_depth):
    """
    Very simple recursive partitioning-based state estimation system.

    Args:
        data: matrix of shape (genes, cells)
        smallk (int): k for each individual clustering
        largek (int): k for the whole global clustering
    """
    # what if some cell subsets have zero gene expression values?
    # we reduce the gene subset and then re-position m
    print('run partition: data shape={0}, smallk={1}, largek={2}'.format(
        data.shape, smallk, largek))
    genes = uncurl.max_variance_genes(data, nbins=1, frac=1.0)
    results, ll = method.run(data[genes, :])
    w = results[0]
    m_ = results[1]
    m = np.zeros((data.shape[0], smallk))
    m[genes, :] = m_
    clusters_0 = w.argmax(0)
    if max_depth == 0:
        print('return at depth 0')
        return m, w
    m_new = np.zeros((m.shape[0], largek))
    w_new = np.zeros((largek, w.shape[1]))
    # the size of each sub-cluster (integer division: largek must be a
    # multiple of smallk; with smallk == 2 this equals largek // 2,
    # matching the recursive call below)
    n_k = largek // smallk
    for i in range(smallk):
        # TODO: how to deal with uncertain (high entropy) cells?
        # soft-cluster n percentile of the cells with the highest entropy
        # (include them in both subsets);
        # after returning, use the sub-cluster with lower entropy.
        data_c0 = data[:, clusters_0 == i]
        m_s1, w_s1 = run_partition(data_c0, smallk, largek // 2, method,
                                   max_depth - 1)
        print(m_s1.shape)
        print(w_s1.shape)
        # place the sub-results for m and w back into the big one
        k_range = range(i * n_k, (i + 1) * n_k)
        m_new[:, k_range] = m_s1
        w_new[np.ix_(k_range, clusters_0 == i)] = w_s1
    return m_new, w_new
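# Hedged usage sketch mirroring the call in the benchmarking script further
# below: split 2 ways at each level, recursing to depth 2 to reach 8 global
# clusters. `data_subset` is assumed to be a (genes, cells) matrix prepared
# as in that script.
se_mw_2 = uncurl.experiment_runner.PoissonSE(clusters=2, return_m=True)
m, w = run_partition(data_subset, smallk=2, largek=8, method=se_mw_2,
                     max_depth=2)
print(w.argmax(0))  # one of 8 global cluster labels per cell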
def setUp(self):
    data = scipy.io.loadmat('data/10x_pooled_400.mat')
    data_csc = data['data']
    self.labels = data['labels'].flatten()
    #gene_names = data['gene_names']
    # 2. gene selection
    genes = uncurl.max_variance_genes(data_csc)
    self.data_subset = data_csc[genes, :]
    #gene_names_subset = gene_names[genes]
    # 3. run uncurl
    m, w, ll = uncurl.run_state_estimation(self.data_subset, 8, max_iters=20,
                                           inner_max_iters=50)
    print('nmi basic: ' + str(nmi(self.labels, w.argmax(0))))
    self.m = m
    self.w = w
def test_10x_update_m(self):
    """
    Test after updating M
    """
    from uncurl.state_estimation import update_m
    genes = uncurl.max_variance_genes(self.data)
    data_subset = self.data[genes, :]
    # smaller # of iterations than default so it finishes faster...
    M, W, ll = uncurl.run_state_estimation(data_subset, clusters=0,
                                           max_iters=10, inner_max_iters=50)
    new_M = update_m(self.data, M, W, genes)
    self.assertEqual(new_M.shape, (self.data.shape[0], W.shape[0]))
    self.assertFalse(np.isnan(new_M).any())
    # test RMSE
    test_data = np.dot(new_M, W)
    error = self.data.toarray() - test_data
    error = np.sqrt(np.mean(error**2))
    print('M update RMSE:', error)
    self.assertTrue(error < 2.0)
def test_real_data_1_vs_rest(self):
    mat = scipy.io.loadmat('data/10x_pooled_400.mat')
    data = mat['data']
    # do uncurl, followed by update_m
    selected_genes = uncurl.max_variance_genes(data)
    data_subset = data[selected_genes, :]
    m, w, ll = uncurl.run_state_estimation(data_subset, 8, max_iters=20,
                                           inner_max_iters=50)
    m = uncurl.update_m(data, m, w, selected_genes)
    # TODO: how should the p-values be tested?
    all_pvs, all_ratios = poisson_diffexp.uncurl_test_1_vs_rest(m, w)
    all_pvs = np.array(all_pvs)
    all_ratios = np.array(all_ratios)
    self.assertTrue((all_pvs < 0.05).sum() > 100)
    self.assertTrue((all_ratios > 10).sum() > 100)
    self.assertEqual(all_pvs.shape, (data.shape[0], 8))
    all_pvs, all_ratios = poisson_diffexp.uncurl_test_1_vs_rest(m, w,
                                                                mode='counts')
    all_pvs = np.array(all_pvs)
    all_ratios = np.array(all_ratios)
    self.assertEqual(all_pvs.shape, (data.shape[0], 8))
    self.assertTrue((all_pvs < 0.01).sum() > 100)
    self.assertTrue((all_pvs < 0.01).sum() < data.shape[0])
    self.assertTrue((all_ratios > 10).sum() > 100)
def test_Zeisel(self):
    # gene selection
    genes = uncurl.max_variance_genes(self.data_z)
    data_subset = self.data_z[genes, :]
    # smaller # of iterations than default so it finishes faster...
    se = uncurl.experiment_runner.PoissonSE(clusters=7, max_iters=10,
                                            inner_max_iters=80)
    argmax = uncurl.experiment_runner.Argmax(n_classes=7)
    km = uncurl.experiment_runner.KM(n_classes=7)
    methods = [(se, [argmax, km])]
    results, names, other = uncurl.experiment_runner.run_experiment(
        methods, data_subset, 7, self.labs_z, n_runs=1,
        use_purity=False, use_nmi=True)
    print(results)
    # NMI should be > 0.75 on the Zeisel subset as well
    self.assertTrue(results[0][0] > 0.75)
    self.assertTrue(results[0][1] > 0.75)
def test_10x_auto_cluster(self):
    """
    Test using automatic cluster size determination
    """
    from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
    # gene selection
    genes = uncurl.max_variance_genes(self.data)
    data_subset = self.data[genes, :]
    # smaller # of iterations than default so it finishes faster...
    M, W, ll = uncurl.run_state_estimation(data_subset, clusters=0,
                                           max_iters=10, inner_max_iters=80)
    labels = W.argmax(0)
    # NMI should be > 0.6 on 10x_pure_pooled
    # (accounting for the lower than default iteration count)
    self.assertTrue(nmi(self.labs, labels) > 0.6)
    # test RMSE
    test_data = np.dot(M, W)
    error = data_subset.toarray() - test_data
    error = np.sqrt(np.mean(error**2))
    print('data subset RMSE:', error)
    self.assertTrue(error < 2.0)
def test_10xSE(self):
    # gene selection
    genes = uncurl.max_variance_genes(self.data)
    data_subset = self.data[genes, :]
    # smaller # of iterations than default so it finishes faster...
    se = uncurl.experiment_runner.PoissonSE(clusters=8, max_iters=10,
                                            inner_max_iters=80)
    argmax = uncurl.experiment_runner.Argmax(n_classes=8)
    km = uncurl.experiment_runner.KM(n_classes=8)
    methods = [(se, [argmax, km])]
    results, names, other = uncurl.experiment_runner.run_experiment(
        methods, data_subset, 8, self.labs, n_runs=1,
        use_purity=False, use_nmi=True)
    print(results)
    # NMI should be > 0.75 on 10x_pure_pooled
    # (accounting for lower than default iter count)
    self.assertTrue(results[0][0] > 0.75)
    self.assertTrue(results[0][1] > 0.75)
import numpy as np
import scipy.io
from uncurl_analysis import clustering_methods
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi

import uncurl

# 1. load data
data = scipy.io.loadmat('data/10x_pooled_400.mat')
data_csc = data['data']
labels = data['labels'].flatten()
gene_names = data['gene_names']

# 2. gene selection
genes = uncurl.max_variance_genes(data_csc)
data_subset = data_csc[genes, :]
gene_names_subset = gene_names[genes]

# 3. run uncurl
m, w, ll = uncurl.run_state_estimation(data_subset, 8)
print('nmi basic: ' + str(nmi(labels, w.argmax(0))))

# 4. run clustering
for metric in ['euclidean', 'cosine']:
    for n_neighbors in [10, 15, 20]:
        print('n_neighbors: ', n_neighbors, ' metric: ', metric)
        w_graph = clustering_methods.create_graph(w.T,
                                                  n_neighbors=n_neighbors,
                                                  metric=metric)
        clusters = clustering_methods.run_leiden(w_graph)
        print('nmi leiden: ' + str(nmi(labels, clusters)))
import time

import numpy as np
import scipy.io

from uncurl_analysis import poisson_diffexp
import uncurl

mat = scipy.io.loadmat('data/10x_pooled_400.mat')
data = mat['data']

# do uncurl, followed by update_m
selected_genes = uncurl.max_variance_genes(data)
data_subset = data[selected_genes, :]
m, w, ll = uncurl.run_state_estimation(data_subset, 8, max_iters=20,
                                       inner_max_iters=50)
m = uncurl.update_m(data, m, w, selected_genes)

t0 = time.time()
all_pvs, all_ratios = poisson_diffexp.uncurl_poisson_test_1_vs_rest(
    m, w, mode='counts')
print('diffexp time: ', time.time() - t0)

t0 = time.time()
all_pvs_2, all_ratios_2 = poisson_diffexp.uncurl_poisson_test_pairwise(
    m, w, mode='counts')
print('pairwise diffexp time: ', time.time() - t0)

# test on simulated data

# plotting mw
import matplotlib.pyplot as plt
l3 = np.copy(l1)
np.random.shuffle(l3)
print(m_ndcg(l1, l2, l3))
print(m_ndcg(l1, l1, l1))
print(m_ndcg(l1, l2, l1))
print(m_ndcg(l1, l2, l2))
print(m_ndcg(l1, l3, l3))
print(m_ndcg(l1, l1, l3))
exit(0)

X1 = scipy.io.mmread('data_8000_cells.mtx')
X1 = X1.tocsc()
true_labels1 = np.loadtxt('labels_8000_cells.txt').astype(int).flatten()

k = 8
frac = 0.2
genes = uncurl.max_variance_genes(X1, nbins=5, frac=frac)
data_subset = X1[genes, :]
n_genes = data_subset.shape[0]

# TODO: run uncurl
se_mw = uncurl.experiment_runner.PoissonSE(clusters=k, return_m=True)
se_mw_2 = uncurl.experiment_runner.PoissonSE(clusters=2, return_m=True)

# layer 1:
t0 = time.time()
print('starting recursive uncurl')
m, w = run_partition(data_subset, 2, 8, se_mw_2, 2)
print('time elapsed: {0}'.format(time.time() - t0))
print('nmi: {0}'.format(nmi(w.argmax(0), true_labels1)))
    mw = torch.matmul(m, w)
    return mw.numpy()


if __name__ == '__main__':
    import uncurl
    from uncurl.state_estimation import objective
    from uncurl.preprocessing import cell_normalize, log1p
    import scipy.io
    from sklearn.cluster import KMeans
    from sklearn.metrics.cluster import normalized_mutual_info_score as nmi

    mat = scipy.io.loadmat('data/10x_pooled_400.mat')
    actual_labels = mat['labels'].squeeze()
    X = mat['data'].toarray().astype(np.float32)
    genes = uncurl.max_variance_genes(X, 5, 0.2)
    X_subset = X[genes, :]
    X_log_norm = log1p(cell_normalize(X_subset)).astype(np.float32)

    uncurl_net = UncurlNet(X_log_norm, 8,
                           use_reparam=False, use_decoder=False,
                           use_batch_norm=True,
                           hidden_layers=2,
                           hidden_units=200,
                           loss='mse')
    m_init = torch.tensor(uncurl_net.M)
    uncurl_net.pre_train_encoder(None, lr=1e-3, n_epochs=20, log_interval=10)
    uncurl_net.train_model(None, lr=1e-3, n_epochs=50, log_interval=10)
from scipy import sparse
from scipy.io import loadmat

import uncurl
from uncurl.sparse_utils import symmetric_kld
from uncurl.vis import visualize_dim_red

# note: this whole script should finish in under a few minutes.

if __name__ == '__main__':
    # 1. load data - 753 cells, 19971 genes
    dat = loadmat('data/GSE60361_dat.mat')
    data = dat['Dat']
    true_labels = dat['ActLabs'].flatten()
    data_csc = sparse.csc_matrix(data)

    # 2. gene selection
    genes = uncurl.max_variance_genes(data_csc, nbins=5, frac=0.2)
    data_subset = data_csc[genes, :]

    # 3. state estimation
    k = 7  # number of clusters to use
    M, W, ll = uncurl.poisson_estimate_state(data_subset, k)
    argmax_labels = W.argmax(0)

    # 4. visualization
    # mds visualization
    mds_proj = uncurl.mds(M, W, 2)
    visualize_dim_red(mds_proj, true_labels, 'GSE60361_mds_true_labels.png',
                      title='MDS')
def generate_uncurl_analysis(data, output_dir,
                             data_type='dense',
                             gene_names=None,
                             gene_sub=True,
                             **uncurl_kwargs):
    """
    Performs an uncurl analysis of the data, writing the results in the
    given directory.

    Outputs:
        output_dir/m.txt
        output_dir/w.txt
        output_dir/labels.txt (integer labels)
        output_dir/top_genes.txt (json of a dict mapping cluster ids to a
            list of (gene_id : c_score) sorted by c_score)
        output_dir/mds_means.txt (mds of the means)
        output_dir/mds_data.txt (mds projection of data)
        output_dir/gene_subset.txt (gene subset selected by uncurl)
        output_dir/gene_names.txt (list of all gene names in data subset)

    Args:
        data (array or str): either a data array, or a string containing
            the path to a data array.
        output_dir (str): directory to write output to.
        data_type (str): if data is a path, this indicates whether the data
            is a dense or sparse array.
        gene_names (list or array): list of all gene names
        gene_sub (bool): whether or not to use gene subset selection
            (max_variance_genes)
        **uncurl_kwargs: arguments to pass to uncurl.run_state_estimation.
            Has to include clusters=k.
    """
    try:
        os.makedirs(output_dir)
    except OSError:
        print('could not make output dir: {0}'.format(output_dir))
    if isinstance(data, str):
        if data_type == 'dense':
            data = np.loadtxt(data)
        elif data_type == 'sparse':
            data = scipy.io.mmread(data)
            data = sparse.csc_matrix(data)
    if isinstance(gene_names, str):
        gene_names = np.loadtxt(gene_names, dtype=str)
    # run uncurl
    if gene_sub:
        genes_subset = np.array(uncurl.max_variance_genes(data))
        np.savetxt(os.path.join(output_dir, 'gene_subset.txt'),
                   genes_subset, fmt='%d')
        data = data[genes_subset, :]
        if gene_names is not None:
            gene_names = gene_names[genes_subset]
    print(uncurl_kwargs)
    m, w, ll = uncurl.run_state_estimation(data, **uncurl_kwargs)
    np.savetxt(os.path.join(output_dir, 'm.txt'), m)
    np.savetxt(os.path.join(output_dir, 'w.txt'), w)
    labels = w.argmax(0)
    np.savetxt(os.path.join(output_dir, 'labels.txt'), labels, fmt='%d')
    # find overexpressed genes for clusters
    top_genes = uncurl_analysis.find_overexpressed_genes(data, w.argmax(0))
    with open(os.path.join(output_dir, 'top_genes.txt'), 'w') as f:
        json.dump(top_genes, f)
    # run mds
    mds_output = uncurl.dim_reduce(m, w, 2)
    print(mds_output.shape)
    np.savetxt(os.path.join(output_dir, 'mds_means.txt'), mds_output.T)
    mds_data = uncurl.mds(m, w, 2)
    np.savetxt(os.path.join(output_dir, 'mds_data.txt'), mds_data)
    if gene_names is not None:
        np.savetxt(os.path.join(output_dir, 'gene_names.txt'),
                   gene_names, fmt='%s')
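# Minimal usage sketch (the file paths here are hypothetical). Per the
# docstring, uncurl_kwargs must include clusters=k, which is forwarded to
# uncurl.run_state_estimation along with any other keyword arguments.
generate_uncurl_analysis('data.mtx', 'uncurl_output',
                         data_type='sparse',
                         gene_names='gene_names.txt',
                         clusters=8, max_iters=20, inner_max_iters=50)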
import os

import pandas as pd
import scipy.io

from purity_analysis import plot_df, build_simple_table

data_counts = pd.read_csv(
    '../uncurl_test_datasets/tasic_allen_brain_map/genes_counts.csv')
# .values replaces the removed DataFrame.as_matrix()
X1 = data_counts.iloc[:, 1:].values
X1 = sparse.csc_matrix(X1)
cell_classification = pd.read_csv(
    '../uncurl_test_datasets/tasic_allen_brain_map/cell_classification.csv')
actual_labels = cell_classification.primary

k = 49
genes = uncurl.max_variance_genes(X1, 5, 0.2)
data_subset = X1[genes, :]

log = uncurl.experiment_runner.Log()
log_norm = uncurl.experiment_runner.LogNorm()
uncurl_net_runner = UncurlNetRunner(k=k, loss='mse')
uncurl_runner = experiment_runner.PoissonSE(clusters=k)
uncurl_net_runner_2_hidden_layers = UncurlNetRunner(
    k=k, hidden_layers=2, loss='mse',
    output_names=['UncurlNetW_2_400'])
uncurl_net_runner_2_hidden_layers_2 = UncurlNetRunner(
    k=k, hidden_layers=2, loss='mse', n_model_epochs=100,
    output_names=['UncurlNetW_2_400_100iters'])
uncurl_net_runner_100_units = UncurlNetRunner(
        moves='birth,merge,shuffle',
        m_startLap=5, b_startLap=2, b_Kfresh=4)
    selected_k = info_dict['K_history'][-1]
    results = trained_model.calc_local_params(data_dense_bnpy)
    cluster_labels = results['resp'].argmax(1)
    return selected_k, cluster_labels


if __name__ == '__main__':
    import time
    # load/subset data
    data_mat = scipy.io.loadmat('../data/10x_pooled_400.mat')
    data = data_mat['data']
    gene_subset = uncurl.max_variance_genes(data)
    data_subset = data[gene_subset, :]
    # run bnpy clustering
    true_labels = data_mat['labels'].flatten()
    t0 = time.time()
    selected_k, labels = bnpy_select_clusters(data_subset)
    print(selected_k)
    print('nmi: ' + str(nmi(true_labels, labels)))
    print('time: ' + str(time.time() - t0))

    data_mat_2 = scipy.io.loadmat('../../uncurl_python/data/SCDE_k2_sup.mat')
    data = data_mat_2['Dat']
    t0 = time.time()
    selected_k, labels = bnpy_select_clusters(data)
    true_labels = data_mat_2['Lab'].flatten()
try:
    os.makedirs(dire_name)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise e

args = parse_args()
print("run with these parameters: %s" % str(args))
data = pd.read_csv(args.input, index_col=0)
if args.gene_subset == 'non_zero':
    # select nonzero genes
    genes_subset = np.sum(data.values, axis=1) != 0
elif args.gene_subset == 'max_variance':
    # select genes with max variance
    genes_subset = max_variance_genes(data.values, nbins=5, frac=0.2)
else:
    raise NotImplementedError("option `%s` for `gene_subset` not defined."
                              % args.gene_subset)
data_subset = data.iloc[genes_subset, :]
M, W, ll = run_state_estimation(data_subset.values, clusters=args.clusters,
                                dist=args.dist, disp=True,
                                max_iters=args.max_iters,
                                inner_max_iters=args.inner_max_iters,
                                initialization=args.initialization,
                                threads=args.threads)
print("ll: %f" % ll)
# imputation
data.iloc[genes_subset, :] = np.matmul(M, W)
        m = m.numpy()
        mw = m.dot(w)
        return [w, mw], 0
    else:
        return [w], 0


if __name__ == '__main__':
    import os
    import pandas as pd
    import scipy.io
    from purity_analysis import plot_df, build_simple_table

    data = scipy.io.mmread(
        '../uncurl_test_datasets/10x_pure_pooled/data_8000_cells.mtx.gz')
    data = sparse.csc_matrix(data)
    actual_labels = np.loadtxt(
        '../uncurl_test_datasets/10x_pure_pooled/labels_8000_cells.txt'
    ).astype(int).flatten()
    k = len(set(actual_labels))
    genes = uncurl.max_variance_genes(data, 5, 0.2)
    data_subset = data[genes, :]

    # TODO: add experiment for NMF
    log = uncurl.experiment_runner.Log()
    log_norm = uncurl.experiment_runner.LogNorm()
    uncurl_net_runner = UncurlNetRunner(k=k, loss='mse')
    uncurl_runner = experiment_runner.PoissonSE(clusters=k)
    uncurl_net_runner_2_hidden_layers = UncurlNetRunner(
        k=k, hidden_layers=2, loss='mse',
        output_names=['UncurlNetW_2_400'])
    uncurl_net_runner_2_hidden_layers_2 = UncurlNetRunner(
        k=k, hidden_layers=2, loss='mse', n_model_epochs=100,
        output_names=['UncurlNetW_2_400_100iters'])
    uncurl_net_runner_100_units = UncurlNetRunner(
        k=k, hidden_units=100, hidden_layers=2, loss='mse',
        output_names=['UncurlNetW_2_100'])

    vis_dir = '10x_8k_vis'
    try:
        os.makedirs(vis_dir)
    except:
def gene_subset_creator(data_normalized, cell_subset, **kwargs):
    # restrict to the given cells, then select max-variance genes
    data = data_normalized[:, cell_subset]
    gene_subset = uncurl.max_variance_genes(data, nbins=5,
                                            frac=kwargs['frac'])
    gene_subset = np.array(gene_subset)
    return gene_subset
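# Usage sketch (illustrative; the random data below is an assumption):
# cell_subset is a boolean mask over cells, and frac is forwarded to
# max_variance_genes via kwargs.
if __name__ == '__main__':
    import numpy as np
    import scipy.sparse
    data_normalized = scipy.sparse.random(2000, 300, density=0.1,
                                          format='csc')  # (genes, cells)
    cell_subset = np.random.rand(300) > 0.5  # boolean mask over cells
    genes = gene_subset_creator(data_normalized, cell_subset, frac=0.2)
    print(len(genes))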