def test_split_delete_cluster(self):
    """
    Split the most populated cluster, then delete ten cells.

    Checks that the split raises the cluster count from 8 to 9 and that
    deleting cells shrinks ``cell_sample`` by exactly the number of cells
    removed while leaving the cluster count unchanged.
    """
    analysis_options = dict(
        frac=0.2,
        clusters=8,
        data_filename='data.mtx',
        baseline_dim_red='tsvd',
        dim_red_option='MDS',
        cell_frac=1.0,
        max_iters=20,
        inner_max_iters=10,
    )
    sca = sc_analysis.SCAnalysis(self.data_dir, **analysis_options)
    sca.run_full_analysis()

    # The split target is whichever label occurs most often.
    label_counts = Counter(sca.labels)
    biggest_cluster, biggest_size = label_counts.most_common()[0]
    print(label_counts)
    print(biggest_cluster, biggest_size)
    sca.recluster('split', [biggest_cluster])
    sca.run_post_analysis()
    self.assertEqual(sca.params['clusters'], 9)
    self.assertEqual(sca.w_sampled.shape[0], 9)

    # Delete the first ten cells and compare sample sizes.
    cells_before = len(sca.cell_sample)
    doomed_cells = list(range(10))
    sca.recluster('delete', doomed_cells)
    sca.run_post_analysis()
    self.assertEqual(sca.params['clusters'], 9)
    cells_after = len(sca.cell_sample)
    self.assertEqual(cells_after, cells_before - len(doomed_cells))
def test_run_full_analysis(self):
    """
    Run the full analysis with UMAP for both visualizations.

    Checks the subset sizes before the run, then the ``has_*`` flags and
    the shapes of the top-gene tables and the 2-row embedding afterwards.
    """
    sca = sc_analysis.SCAnalysis(
        self.data_dir,
        clusters=8,
        frac=0.2,
        data_filename='data.mtx',
        baseline_dim_red='umap',
        dim_red_option='umap',
        normalize=True,
        use_fdr=True,
        min_reads=10,
        max_reads=4000,
        cell_frac=0.5,
        max_iters=20,
        inner_max_iters=20,
    )
    print(sca.data.shape)
    print(sca.cell_subset.shape)
    print(sca.cell_subset)
    print(sca.data_subset.shape)
    self.assertEqual(sca.cell_subset.shape[0], 400)
    self.assertGreater(sca.data_subset.shape[1], 200)

    sca.run_full_analysis()
    # Every stage of the pipeline should report that it has results.
    for flag_name in ('has_dim_red', 'has_pvals', 'has_top_genes_1_vs_rest',
                      'has_top_genes', 'has_baseline_vis'):
        self.assertTrue(getattr(sca, flag_name))

    # One entry per cluster, each covering every gene (row) of the data.
    ranked = sca.top_genes
    self.assertEqual(len(ranked), 8)
    self.assertEqual(len(ranked[0]), sca.data.shape[0])
    ranked_1_vs_rest = sca.top_genes_1_vs_rest
    self.assertEqual(len(ranked_1_vs_rest), 8)
    self.assertEqual(len(ranked_1_vs_rest[0]), sca.data.shape[0])
    self.assertEqual(sca.dim_red.shape[0], 2)
def test_merge_new_cluster(self):
    """
    Merge two clusters, then create a new cluster from a subset of cells.

    The merge of clusters 0 and 1 should leave 7 clusters; creating a new
    cluster from the selected cells should bring the count back to 8.
    """
    sca = sc_analysis.SCAnalysis(
        self.data_dir,
        frac=0.2,
        clusters=8,
        data_filename='data.mtx',
        baseline_dim_red='tsvd',
        cell_frac=0.99,
        max_iters=20,
        inner_max_iters=20,
    )
    sca.run_full_analysis()
    print(sca.w_sampled.argmax(0))

    # Merge clusters 0 and 1.
    to_merge = [0, 1]
    sca.recluster('merge', to_merge)
    print(sca.w_sampled.argmax(0))
    # These values are not asserted on; the evaluation is kept as in the
    # original test.
    assignments = sca.w.argmax(0)
    assignment_counts = Counter(assignments)
    biggest_cluster, biggest_size = assignment_counts.most_common()[0]
    sca.run_post_analysis()
    self.assertEqual(sca.params['clusters'], 7)
    self.assertEqual(sca.w_sampled.shape[0], 7)

    # TODO: due to sampling, this won't actually be the cluster 7 cells...
    first_selected = 350
    selected_cells = list(range(first_selected, sca.w_sampled.shape[1]))
    sca.recluster('new', selected_cells)
    self.assertEqual(sca.params['clusters'], 8)
    self.assertEqual(sca.w_sampled.shape[0], 8)
def test_add_color_track(self):
    """
    Add discrete and non-discrete color tracks and run diffexp on them.

    Verifies that a stored track round-trips, and that differential
    expression on the 'true_labels' track yields 8 groups in both the
    default and the pairwise mode.
    """
    sca = sc_analysis.SCAnalysis(
        self.data_dir,
        frac=0.2,
        clusters=8,
        data_filename='data.mtx',
        baseline_dim_red='tsvd',
        dim_red_option='MDS',
        clustering_method='leiden',
        cell_frac=1.0,
        max_iters=20,
        inner_max_iters=10,
    )

    # Discrete track: read it back and compare with the input labels.
    sca.add_color_track('true_labels', self.labs, is_discrete=True)
    stored_labels, is_discrete = sca.get_color_track('true_labels')
    self.assertGreater(nmi(stored_labels, self.labs), 0.99)
    genes, pvalues = sca.calculate_diffexp('true_labels')
    self.assertEqual(len(genes), 8)
    self.assertEqual(len(pvalues), 8)

    # Non-discrete track built from the same values.
    sca.add_color_track('true_labels_2', self.labs, is_discrete=False)
    stored_labels_2, _ = sca.get_color_track('true_labels_2')
    self.assertTrue((stored_labels_2.astype(int) == self.labs).all())

    # Pairwise mode is requested twice, exactly as in the original test.
    pair_genes, pair_pvalues = sca.calculate_diffexp('true_labels',
                                                     mode='pairwise')
    self.assertEqual(pair_genes.shape, pair_pvalues.shape)
    pair_genes, pair_pvalues = sca.calculate_diffexp('true_labels',
                                                     mode='pairwise')
    self.assertEqual(pair_genes.shape, pair_pvalues.shape)
    self.assertEqual(pair_genes.shape[0], 8)

    # The default mode is re-checked after the pairwise runs.
    genes, pvalues = sca.calculate_diffexp('true_labels')
    self.assertEqual(len(genes[0]), len(sca.gene_names))
    self.assertEqual(len(genes), 8)
    self.assertEqual(len(pvalues), 8)
def test_load_from_folder(self):
    """
    Construct an analysis from the data folder and check its basic fields.
    """
    sca = sc_analysis.SCAnalysis(
        self.data_dir,
        clusters=8,
        data_filename='data.mtx',
    )
    expected_data_path = os.path.join(self.data_dir, 'data.mtx')
    self.assertEqual(sca.params['clusters'], 8)
    self.assertEqual(sca.data_dir, self.data_dir)
    self.assertEqual(sca.data_f, expected_data_path)
    # test read counts: they must equal the sums over axis 0 of the data
    counts_match = (sca.read_counts == self.data.sum(0))
    self.assertTrue(counts_match.all())
def test_json(self):
    """
    Save an analysis to json, reload it from the folder, and relabel.

    After the reload the stored parameters, the ``has_*`` flags and the
    cluster means are checked; then the labels are recomputed with
    'louvain' and 'leiden' and compared against the true labels.
    """
    sca = sc_analysis.SCAnalysis(
        self.data_dir,
        frac=0.2,
        clusters=8,
        data_filename='data.mtx',
        baseline_dim_red='tsvd',
        dim_red_option='umap',
        normalize=1,
        cell_frac=0.5,
        max_iters=20,
        inner_max_iters=20,
        use_fdr=1,
    )
    sca.run_full_analysis()
    sca.save_json_reset()

    # delete the whole sca, re-load it from json
    del sca
    sca = sc_analysis.SCAnalysis(self.data_dir)
    sca = sca.load_params_from_folder()

    restored = sca.params
    self.assertEqual(restored['clusters'], 8)
    self.assertEqual(restored['baseline_dim_red'], 'tsvd')
    self.assertEqual(restored['dim_red_option'], 'umap')
    self.assertEqual(restored['cell_frac'], 0.5)
    self.assertEqual(restored['genes_frac'], 0.2)
    self.assertTrue(restored['normalize'])
    self.assertTrue(restored['use_fdr'])
    self.assertEqual(sca.uncurl_kwargs['max_iters'], 20)
    self.assertTrue(sca.has_dim_red)
    self.assertTrue(sca.has_w)
    self.assertTrue(sca.has_m)
    self.assertEqual(sca.cell_subset.shape[0], 400)

    # Cluster means: one row per gene of the data, one column per cluster.
    cluster_means = sca.cluster_means
    self.assertEqual(cluster_means.shape[1], 8)
    self.assertEqual(cluster_means.shape[0], self.data.shape[0])

    # TODO: do re-clustering
    sca.add_color_track('true_labels', self.labs, is_discrete=True)
    labels_before = sca.labels
    sca.relabel('louvain')
    self.assertFalse((labels_before == sca.labels).all())
    reference_labels, is_discrete = sca.get_color_track('true_labels')
    self.assertGreater(nmi(sca.labels, reference_labels), 0.65)
    sca.relabel('leiden')
    self.assertGreater(nmi(sca.labels, reference_labels), 0.65)
def test_dim_red_sample(self):
    """
    Check embedding shapes when only 20% of the cells are sampled.
    """
    sample_fraction = 0.2
    sca = sc_analysis.SCAnalysis(
        self.data_dir,
        frac=0.2,
        clusters=8,
        data_filename='data.mtx',
        cell_frac=sample_fraction,
        max_iters=20,
        inner_max_iters=20,
    )

    # Embedding of the cluster means: 2 rows, one column per cluster.
    means_embedding = sca.mds_means
    self.assertEqual(means_embedding.shape[0], 2)
    self.assertEqual(means_embedding.shape[1], 8)

    # Embedding of the cells: 2 rows, one column per sampled cell.
    cell_embedding = sca.dim_red
    self.assertEqual(cell_embedding.shape[0], 2)
    self.assertEqual(cell_embedding.shape[1],
                     int(sample_fraction * sca.data.shape[1]))
def setUp(self):
    """
    Build a fresh on-disk fixture before each test.

    Loads the matrix and labels from 'data/10x_pooled_400.mat', recreates
    the scratch directory, writes the matrix there as 'data.mtx', and
    creates ``self.sca`` with the labels attached as the discrete
    'true_labels' color track.

    Raises:
        OSError: if the scratch directory cannot be removed or created.
    """
    dat = loadmat('data/10x_pooled_400.mat')
    self.data = sparse.csc_matrix(dat['data'])
    self.labels = dat['labels'].flatten()
    self.data_dir = '/tmp/uncurl_analysis/test'
    # Only an existing directory needs clearing. A blanket ``except:`` here
    # would also swallow KeyboardInterrupt and hide the real reason a
    # removal failed behind a later "file exists" error.
    if os.path.isdir(self.data_dir):
        shutil.rmtree(self.data_dir)
    os.makedirs(self.data_dir)
    scipy.io.mmwrite(os.path.join(self.data_dir, 'data.mtx'), self.data)
    self.sca = sc_analysis.SCAnalysis(self.data_dir,
                                      clusters=8,
                                      data_filename='data.mtx')
    self.sca.add_color_track('true_labels', self.labels, is_discrete=True)
def test_run_uncurl(self):
    """
    Run uncurl alone and check its outputs.

    Verifies the ``has_w`` / ``has_m`` flags, the shape of ``w``, that the
    ``w_f`` and ``m_f`` files exist, and that the resulting labels agree
    with the reference labels (NMI above 0.65).
    """
    sca = sc_analysis.SCAnalysis(
        self.data_dir,
        clusters=8,
        frac=0.2,
        data_filename='data.mtx',
        max_iters=20,
        inner_max_iters=50,
    )
    sca.run_uncurl()
    self.assertTrue(sca.has_w)
    self.assertTrue(sca.has_m)
    # w has one row per cluster and one column per column of the data.
    self.assertEqual(sca.w.shape[0], 8)
    self.assertEqual(sca.w.shape[1], self.data.shape[1])
    for output_path in (sca.w_f, sca.m_f):
        self.assertTrue(os.path.exists(output_path))
    agreement = nmi(sca.labels, self.labs)
    print(agreement)
    self.assertGreater(agreement, 0.65)
def test_dim_red_2(self):
    """
    Check embedding shapes with 'UMAP' for both visualizations.

    Only 20% of the cells are sampled, so both the main and the baseline
    embeddings should have that many columns.
    """
    sample_fraction = 0.2
    sca = sc_analysis.SCAnalysis(
        self.data_dir,
        clusters=8,
        frac=0.2,
        data_filename='data.mtx',
        dim_red_option='UMAP',
        baseline_dim_red='UMAP',
        cell_frac=sample_fraction,
        max_iters=20,
        inner_max_iters=20,
    )

    means_embedding = sca.mds_means
    self.assertEqual(means_embedding.shape[0], 2)
    self.assertEqual(means_embedding.shape[1], 8)

    cell_embedding = sca.dim_red
    self.assertEqual(cell_embedding.shape[0], 2)
    self.assertEqual(cell_embedding.shape[1],
                     int(sample_fraction * sca.data.shape[1]))

    baseline_embedding = sca.baseline_vis
    self.assertEqual(baseline_embedding.shape[0], 2)
    self.assertEqual(baseline_embedding.shape[1],
                     int(sample_fraction * sca.data.shape[1]))
def test_merge_cluster_history(self):
    """
    Test merging and splitting with the history log enabled.

    Performs a logged merge followed by a logged split, checks that the
    log holds two entries, then restores the state recorded in the first
    entry and expects the original 8-cluster labelling back.
    """
    sca = sc_analysis.SCAnalysis(
        self.data_dir,
        frac=0.2,
        clusters=8,
        data_filename='data.mtx',
        baseline_dim_red='tsvd',
        dim_red_option='MDS',
        cell_frac=1.0,
        max_iters=20,
        inner_max_iters=10,
    )
    sca.run_full_analysis()
    labels_at_start = sca.labels.copy()

    # Report the cluster sizes before any reclustering.
    label_counts = Counter(sca.labels)
    biggest_cluster, biggest_size = label_counts.most_common()[0]
    print(label_counts)
    print(biggest_cluster, biggest_size)

    # Logged merge of clusters 0 and 1: 8 -> 7 clusters.
    sca.recluster('merge', [0, 1], write_log_entry=True)
    sca.run_post_analysis()
    self.assertEqual(sca.params['clusters'], 7)
    self.assertEqual(sca.w_sampled.shape[0], 7)

    # Logged split of cluster 0: 7 -> 8 clusters.
    sca.recluster('split', [0], write_log_entry=True)
    sca.run_post_analysis()
    self.assertEqual(sca.params['clusters'], 8)

    # TODO: check history
    history = sca.log
    print(history)
    self.assertEqual(len(history), 2)
    merge_entry = history[0]
    self.assertTrue(merge_entry[3])
    split_entry = history[1]
    self.assertTrue(split_entry[3])

    # try to re-load? Restore using field 1 of the first log entry.
    sca.restore_prev(merge_entry[1])
    self.assertEqual(sca.params['clusters'], 8)
    self.assertEqual(sca.w_sampled.shape[0], 8)
    print(labels_at_start)
    print(sca.labels)
    self.assertTrue((sca.labels == labels_at_start).all())
def test_gene_names(self):
    """
    Look up 100 randomly chosen genes by name and compare with the matrix.

    For each gene whose name is unique, the values returned by
    ``data_sampled_gene`` must match the corresponding row of
    ``data_sampled_all_genes``.
    """
    import random

    sca = sc_analysis.SCAnalysis(
        self.data_dir,
        clusters=8,
        data_filename='data.mtx',
        baseline_dim_red='tsvd',
        dim_red_option='MDS',
        cell_frac=1.0,
        max_iters=20,
        inner_max_iters=10,
    )
    gene_indices = random.sample(range(len(sca.gene_names)), 100)
    for gene_index in gene_indices:
        gene_name = sca.gene_names[gene_index]
        # A duplicated name cannot be tied to a single row, so skip it.
        occurrences = (sca.gene_names == gene_name).sum()
        if occurrences > 1:
            print('duplicate gene name')
            continue
        gene_info = sca.data_sampled_gene(gene_name)
        expected_row = sca.data_sampled_all_genes[gene_index, :]
        total_difference = np.abs(gene_info - expected_row).sum()
        self.assertTrue(total_difference < 0.01)
        self.assertEqual(gene_info.shape[0],
                         sca.data_sampled_all_genes.shape[1])
def generate_uncurl_analysis(data, output_dir, **uncurl_kwargs):
    """
    Performs an uncurl analysis of the data, writing the results in the given
    directory.

    Assumes that output_dir contains a file named params.json, with all the
    parameters.

    Outputs:
        output_dir/data.txt or output_dir/data.mtx
        output_dir/m.txt
        output_dir/w.txt
        output_dir/labels.txt (integer labels)
        output_dir/top_genes.txt (json of a dict mapping cluster ids to a
            list of (gene_id : c_score) sorted by c_score)
        output_dir/mds_means.txt (mds of the means - 2 x k)
        output_dir/mds_data.txt (mds projection of data - 2 x n)
        output_dir/gene_subset.txt (gene subset selected by uncurl)
        output_dir/gene_names.txt (list of all gene names in data subset)
        output_dir/entropy.txt (entropy of cell labels)

    Additionally written by this function itself: an empty
    output_dir/submitted marker, output_dir/uncurl_kwargs.json, and, if the
    analysis raises, output_dir/error.txt containing the traceback.

    Args:
        data (array or str): either a data array, or a string containing
            the path to a data array. An array (numpy or scipy sparse) is
            written to output_dir/data.mtx as a sparse matrix and loaded
            from there.
        output_dir (str): directory to write output to. contains
            params.json, data.mtx/.txt/.gz, and optionally gene_names.txt.
        **uncurl_kwargs: arguments to pass to uncurl.run_state_estimation.

    Returns:
        None. When the analysis fails the function returns early, after
        writing error.txt, without saving the json state.
    """
    # TODO: what about init?
    # Only try to create the directory when it is missing, so that the
    # normal case (directory already holds params.json) prints nothing.
    # Creation stays best-effort: a failure is reported, not raised.
    if not os.path.isdir(output_dir):
        try:
            os.makedirs(output_dir)
        except OSError:
            print('could not make output dir: {0}'.format(output_dir))
    with open(os.path.join(output_dir, 'submitted'), 'w') as f:
        f.write('')
    if isinstance(data, (np.ndarray, sparse.spmatrix)):
        # In-memory input: previously data_filename was never assigned on
        # this path, so the call below failed with an unbound local. Store
        # the matrix in output_dir and refer to it by file name instead.
        import scipy.io
        data_filename = 'data.mtx'
        data_is_sparse = True
        scipy.io.mmwrite(os.path.join(output_dir, data_filename),
                         sparse.csc_matrix(data))
    else:
        # Path input: the extension decides whether the file is sparse.
        data_filename = data
        data_is_sparse = data.endswith(('.mtx', '.mtx.gz'))
    with open(os.path.join(output_dir, 'uncurl_kwargs.json'), 'w') as f:
        json.dump(uncurl_kwargs, f)
    sca = sc_analysis.SCAnalysis(output_dir,
                                 data_filename=data_filename,
                                 data_is_sparse=data_is_sparse)
    sca.load_params_json()
    # Attach per-cell sample names as a discrete color track, unless a
    # 'samples' track is already present.
    samples_path = os.path.join(output_dir, 'samples.txt')
    if os.path.exists(samples_path) and 'samples' not in sca.color_tracks:
        samples = np.loadtxt(samples_path, dtype=str)
        sca.add_color_track('samples', samples, True)
    try:
        sca.run_full_analysis()
    except Exception:
        # Record the failure in the output directory instead of raising.
        import traceback
        text = traceback.format_exc()
        with open(os.path.join(output_dir, 'error.txt'), 'w') as f:
            f.write(text)
        return
    sca.save_json_reset()
    print('done with generate_analysis')
# Start from an empty data directory. Only an existing directory needs
# removing; any real failure to remove or create it is allowed to surface
# instead of being swallowed by a blanket ``except:``.
if os.path.isdir(data_dir):
    shutil.rmtree(data_dir)
os.makedirs(data_dir)
data = sparse.csc_matrix(dat['data'])
# Write the full matrix and the gene names into the data directory.
scipy.io.mmwrite(os.path.join(data_dir, 'data.mtx'), data)
shutil.copy('data/10x_pooled_400_gene_names.tsv',
            os.path.join(data_dir, 'gene_names.txt'))
sca = sc_analysis.SCAnalysis(data_dir,
                             frac=0.2,
                             clusters=8,
                             data_filename='data.mtx',
                             baseline_dim_red='tsvd',
                             dim_red_option='MDS',
                             cell_frac=1.0,
                             max_iters=20,
                             inner_max_iters=10)
sca.run_full_analysis()
# Keep copies of the initial labels and w for later inspection.
original_labels = sca.labels.copy()
print(original_labels)
original_w = sca.w.copy()
print(original_w)
# Merge clusters 0 and 1, then split cluster 0, logging both operations.
sca.recluster('merge', [0, 1], write_log_entry=True)
sca.run_post_analysis()
sca.recluster('split', [0], write_log_entry=True)
sca.run_post_analysis()
def test_custom_label(self):
    """
    Exercise custom labels, custom color maps, and their persistence.

    Covers: selecting cells with 'or' criteria on 'true_labels', mixing in
    an 'and' criterion on 'cluster', labelling cells through a color map,
    saving/loading the color map as json, registering it on the analysis
    object, and updating the criteria of one of its labels.
    """
    selection = custom_cell_selection

    def true_label_is(target):
        # Criterion matching cells whose 'true_labels' value equals target.
        return selection.LabelCriterion(selection_type='true_labels',
                                        comparison='=',
                                        target=target,
                                        and_or='or')

    def any_of(values, *wanted):
        # Elementwise "values equals one of wanted", OR-ed left to right.
        mask = (values == wanted[0])
        for candidate in wanted[1:]:
            mask = mask | (values == candidate)
        return mask

    c1 = true_label_is('0')
    c2 = true_label_is('1')
    c3 = true_label_is('2')
    label1 = selection.CustomLabel('label1', criteria=[c1, c2, c3])
    results = label1.select_cells(self.sca)
    self.assertEqual(len(results), 150)
    self.assertTrue(any_of(self.labels[results], 0, 1, 2).all())

    c4 = selection.LabelCriterion(selection_type='cluster',
                                  comparison='=',
                                  target='0',
                                  and_or='and')
    c5 = true_label_is('4')
    c6 = true_label_is('6')
    c7 = true_label_is('7')
    label1 = selection.CustomLabel(
        'label1', criteria=[c1, c2, c3, c4, c5, c6, c7])
    results = label1.select_cells(self.sca)
    if len(results) > 0:
        self.assertTrue(any_of(self.labels[results], 0, 1, 2, 4, 6, 7).all())
        self.assertTrue((self.sca.labels[results] == 0).all())

    # test colormaps
    label1 = selection.CustomLabel('label1', criteria=[c1, c2, c3])
    label2 = selection.CustomLabel('label2', criteria=[c5, c6, c7])
    cmap1 = selection.CustomColorMap('cmap1', [label1, label2])
    labels = cmap1.label_cells(self.sca)
    self.assertEqual((labels == 'label1').sum(), 150)
    self.assertEqual((labels == 'label2').sum(), 150)
    self.assertTrue(any_of(self.labels[labels == 'label1'], 0, 1, 2).all())
    self.assertTrue(any_of(self.labels[labels == 'label2'], 4, 6, 7).all())

    # test json saving/loading
    cmap_path = os.path.join(self.data_dir, 'cmap.json')
    selection.save_json(cmap_path, {'cmap1': cmap1})
    cmap1 = selection.load_json(
        os.path.join(self.data_dir, 'cmap.json'))['cmap1']
    print(selection.create_json(cmap1))
    labels = cmap1.label_cells(self.sca)
    self.assertEqual((labels == 'label1').sum(), 150)
    self.assertEqual((labels == 'label2').sum(), 150)
    self.assertTrue(any_of(self.labels[labels == 'label1'], 0, 1, 2).all())
    self.assertTrue(any_of(self.labels[labels == 'label2'], 4, 6, 7).all())

    # test adding colormaps to sca; a new SCAnalysis is then built on the
    # same directory before the track is used.
    self.sca.create_custom_selection('cmap1', cmap1.labels)
    self.sca = sc_analysis.SCAnalysis(self.data_dir,
                                      clusters=8,
                                      data_filename='data.mtx')
    scores, pvals = self.sca.calculate_diffexp('cmap1', mode='pairwise')
    print(scores.shape)
    self.assertEqual(scores.shape, (3, 3, self.sca.data.shape[0]))
    print(pvals.shape)

    # update color map criteria
    c8 = true_label_is('8')
    c9 = true_label_is('9')
    self.sca.update_custom_color_track_label('cmap1', 'label2',
                                             [c5, c8, c9])
    data, is_discrete = self.sca.get_color_track('cmap1')
    self.assertEqual((data == 'label1').sum(), 150)
    self.assertEqual((data == 'label2').sum(), 150)
    self.assertTrue(any_of(self.labels[data == 'label2'], 4, 8, 9).all())