def test_1_v_rest_TTest(self):
    """Check the one-vs-rest t-test output on the fixture data.

    Calls ``one_vs_rest_t`` on ``self.data`` / ``self.labs`` and verifies,
    for every distinct label, that:

    * scores and p-values are returned for the same number of labels, and
      that number equals the number of distinct labels;
    * the first entry for the label has a score >= 1 and a p-value <= 0.05;
    * every p-value lies in [0, 1] and every score is non-negative.

    The call is then repeated with ``calc_pvals=False`` and the first score
    of every label is checked again.

    Each entry of the per-label lists is indexed as ``entry[1]`` for its
    value (score or p-value); entry[0] is not inspected here.
    """
    # Build the label set once; it is needed for the count check and for
    # both loops below.
    labels = set(self.labs)

    t_test_scores, t_test_p_vals = one_vs_rest_t(self.data, self.labs)
    self.assertEqual(len(t_test_scores), len(t_test_p_vals))
    self.assertEqual(len(t_test_scores), len(labels))

    for k in labels:
        print(k)
        print(t_test_scores[k][:10])
        print(t_test_p_vals[k][:10])
        pvals = np.array([x[1] for x in t_test_p_vals[k]])
        cscores = np.array([x[1] for x in t_test_scores[k]])
        # Dedicated comparison asserts report both operands on failure,
        # unlike assertTrue(a >= b), which only reports "False is not true".
        self.assertGreaterEqual(t_test_scores[k][0][1], 1)
        self.assertLessEqual(t_test_p_vals[k][0][1], 0.05)
        self.assertTrue((pvals >= 0).all())
        self.assertTrue((pvals <= 1).all())
        self.assertTrue((cscores >= 0).all())

    # Scores must still satisfy the same bound when p-values are skipped.
    t_test_scores, _ = one_vs_rest_t(self.data, self.labs, calc_pvals=False)
    for k in labels:
        self.assertGreaterEqual(t_test_scores[k][0][1], 1)
def test_1_v_rest_simulated_data(self):
    """Check the one-vs-rest u-test on simulated Poisson data.

    Data is generated by ``simulation.generate_poisson_data`` from a 3x3
    matrix of means (second argument 100 -- presumably the number of cells;
    confirm against uncurl). The data is converted to a CSC sparse matrix
    and passed to ``one_vs_rest_t`` with ``test='u'``.

    The assertions pin which id comes first for clusters 0 and 1 (id 1 and
    id 0 respectively, in both the ratio and the p-value lists), that those
    first p-values are below 0.05, and that the third p-value entry for
    cluster 0 is above 0.01.
    """
    from uncurl import simulation

    means = np.array([[0.5, 4.0, 3.0],
                      [10.0, 1.0, 1.0],
                      [0.5, 0.5, 0.5]])
    data, clusters = simulation.generate_poisson_data(means, 100)
    data_csc = sparse.csc_matrix(data)
    ratios, pvals = one_vs_rest_t(data_csc, clusters, eps=1e-8, test='u')

    print(ratios)
    # Empirical ratio of row 1's mean in cluster 0 to its mean in cluster 1,
    # printed for comparison with the reported ratios.
    print(data[1, clusters == 0].mean() / data[1, clusters == 1].mean())
    print(pvals)

    # Dedicated asserts show the offending values on failure instead of
    # the bare "False is not true" produced by assertTrue(a == b).
    self.assertEqual(ratios[0][0][0], 1)
    self.assertEqual(ratios[1][0][0], 0)
    self.assertEqual(pvals[0][0][0], 1)
    self.assertLess(pvals[0][0][1], 0.05)
    self.assertGreater(pvals[0][2][1], 0.01)
    self.assertEqual(pvals[1][0][0], 0)
    self.assertLess(pvals[1][0][1], 0.05)
def get_genes_tm(all_matrices, genes, all_labels, n_top_per_label=50,
                 n_variable=1000, min_nonzero_frac=0.1):
    """
    This function returns the top genes from Tabula Muris.

    Two gene sets are combined:

    1. For every distinct label, the first ``n_top_per_label`` genes from
       the one-vs-rest t-test scores for that label.
    2. Among the ``n_variable`` genes with the highest variance/mean ratio,
       those that are nonzero in more than ``min_nonzero_frac`` of the
       cells.

    Args:
        all_matrices: 2-D (sparse) expression matrix. If its second
            dimension equals ``len(genes)`` it is transposed so that rows
            are genes.
        genes: sequence of gene names, indexed by gene row.
        all_labels: per-cell labels (its ``.shape`` is printed, so an
            array-like with a ``shape`` attribute is expected).
        n_top_per_label: number of t-test genes kept per label.
        n_variable: number of highest variance/mean genes considered.
        min_nonzero_frac: minimum fraction of cells in which a
            high-variance gene must be nonzero to be kept.

    Returns:
        set of gene names (elements of ``genes``).
    """
    from uncurl import preprocessing
    from uncurl_analysis import gene_extraction

    # Orient the matrix as genes x cells.
    if len(genes) == all_matrices.shape[1]:
        all_matrices = all_matrices.T
    print(all_matrices.shape)
    print(all_labels.shape)
    n_cells = all_matrices.shape[1]

    scores_t, _ = gene_extraction.one_vs_rest_t(
        all_matrices, all_labels, eps=0.1, test='t')

    # Take the first n_top_per_label genes for each cell type; each score
    # entry is indexed as entry[0] for the gene index.
    all_top_genes = set()
    for label in set(all_labels):
        all_top_genes.update(
            genes[entry[0]] for entry in scores_t[label][:n_top_per_label])

    # Add the genes with the highest variance/mean ratio.
    means, variances = preprocessing.sparse_mean_var(all_matrices)
    # Per gene, the number of cells with a nonzero value.
    nonzero_counts = np.array((all_matrices > 0).sum(1)).flatten()
    dispersion = variances / (means + 1e-8)  # epsilon guards all-zero genes
    candidates = dispersion.argsort()[::-1][:n_variable]
    # The nonzero count is a number of cells, so the threshold must be a
    # fraction of the cell count (the previous code compared it against a
    # tenth of the gene count).
    expressed = nonzero_counts[candidates] > (min_nonzero_frac * n_cells)
    all_top_genes.update(genes[idx] for idx in candidates[expressed])
    return all_top_genes
#reject, pvals_corrected, a, b = multitest.multipletests(pvals[1,0,:]) #pvals[0,0,:] = pvals_corrected # calculate actual ratios between means of the two clusters import numpy as np actual_ratios = (data_csc[:, labels == 2].mean(1) + 0.1) / (data_csc[:, labels == 1].mean(1) + 0.1) actual_ratios = np.array(actual_ratios).flatten() # calculate p-values c_scores, c_pvals = c_scores_from_t(scores, pvals) print('c score time: ', time.time() - t0) # 1 vs rest u-test t0 = time.time() c_scores, c_pvals = one_vs_rest_t(data_csc, labels, eps=0.1, test='u') print('mann-whitney u-test time: ', time.time() - t0) # step 3: map gene ids to gene names new_pvals = {k: [] for k in c_pvals.keys()} for k, p in c_pvals.items(): for gene_id, pv in p: new_pvals[k].append((table_gene_names.iloc[gene_id], pv)) # save new_pvals with open('scde_c_score_pvals.pkl', 'wb') as f: pickle.dump(new_pvals, f) with open('scde_c_scores.pkl', 'wb') as f: pickle.dump(c_scores, f)
# save all_matrices, all_labels scipy.io.mmwrite('mca_fine_all_matrices.mtx', all_matrices) np.savetxt('mca_fine_all_labels.txt', all_labels, fmt='%s') ############################################################################################ genes = np.loadtxt('genes_mca.txt', dtype=str) all_matrices = scipy.io.mmread('mca_fine_all_matrices.mtx') all_labels = np.loadtxt('mca_fine_all_labels.txt', dtype=str, delimiter='##') # 2. calculate diffexp for each cluster from uncurl_analysis import gene_extraction import time t0 = time.time() scores_t, pvals_t = gene_extraction.one_vs_rest_t(all_matrices, all_labels, eps=0.1, test='t') print('diffexp time for t test:', time.time() - t0) t0 = time.time() #scores_u, pvals_u = gene_extraction.one_vs_rest_t(all_matrices, all_labels, eps=0.1, test='u') print('diffexp time for u test:', time.time() - t0) with open('mca_fine_t_scores.pkl', 'wb') as f: pickle.dump(scores_t, f) with open('mca_fine_t_pvals.pkl', 'wb') as f: pickle.dump(pvals_t, f) #with open('mca_fine_u_pvals.pkl', 'wb') as f: # pickle.dump(pvals_u, f) ########################################################################################### # 3. for each cluster, run cellmesh and cellmarker