def test_filter_and_concat_datasets():
    """Check gene subsampling, cell-type filtering and dataset concatenation.

    Two cortex datasets restricted to overlapping gene index windows are
    merged and the merged gene count is checked; two synthetic datasets are
    then merged with and without ``shared_labels`` and their batch/label
    counts are checked; finally the first synthetic dataset is filtered and
    subsampled in place.
    """

    def _cortex_window(first_gene, last_gene):
        # Fresh cortex dataset limited to gene indices [first_gene, last_gene).
        windowed = CortexDataset()
        windowed.subsample_genes(subset_genes=np.arange(first_gene, last_gene))
        return windowed

    cortex_a = _cortex_window(0, 300)
    cortex_a.filter_cell_types(["microglia", "oligodendrocytes"])

    cortex_b = _cortex_window(100, 400)
    named_types = [
        "endothelial-mural",
        "interneurons",
        "microglia",
        "oligodendrocytes",
    ]
    cortex_b.filter_cell_types(named_types)
    # Second filtering pass is given integer entries instead of names.
    cortex_b.filter_cell_types([2, 0])

    cortex_merged = GeneExpressionDataset.concat_datasets(cortex_a, cortex_b)
    # The index windows [0, 300) and [100, 400) overlap on 200 indices.
    assert cortex_merged.nb_genes == 200

    synth_a = SyntheticDataset(n_batches=2, n_labels=5)
    synth_b = SyntheticDataset(n_batches=3, n_labels=3)

    # Default call first, then the shared_labels=False variant; the expected
    # batch count is 5 in both cases, only the expected label count differs.
    merge_cases = (
        ({}, 5),
        ({"shared_labels": False}, 8),
    )
    for concat_options, expected_label_count in merge_cases:
        synth_merged = GeneExpressionDataset.concat_datasets(
            synth_a, synth_b, **concat_options
        )
        assert synth_merged.n_batches == 5
        assert synth_merged.n_labels == expected_label_count

    # In-place operations on the first synthetic dataset.
    synth_a.filter_cell_types([0, 1, 2, 3])
    assert synth_a.n_labels == 4

    synth_a.subsample_cells(50)
    assert len(synth_a) == 50
def test_filter_and_concat_datasets():
    """Check gene subsampling, cell-type filtering, concatenation and mapping.

    Uses the cortex data stored under ``tests/data/`` with tiny gene windows,
    merges two synthetic datasets with and without ``shared_labels``, filters
    and subsamples a synthetic dataset in place, and finally calls
    ``map_cell_types`` on a dataset whose cell types are string labels.
    """
    cortex_dataset_1 = CortexDataset(save_path='tests/data/')
    cortex_dataset_1.subsample_genes(subset_genes=np.arange(0, 3))
    cortex_dataset_1.filter_cell_types(["microglia", "oligodendrocytes"])

    cortex_dataset_2 = CortexDataset(save_path='tests/data/')
    cortex_dataset_2.subsample_genes(subset_genes=np.arange(1, 4))
    cortex_dataset_2.filter_cell_types(
        ["endothelial-mural", "interneurons", "microglia", "oligodendrocytes"]
    )
    # Second filtering pass is given integer entries instead of names.
    cortex_dataset_2.filter_cell_types([2, 0])

    cortex_dataset_merged = GeneExpressionDataset.concat_datasets(
        cortex_dataset_1, cortex_dataset_2
    )
    # The index windows [0, 3) and [1, 4) overlap on 2 indices.
    assert cortex_dataset_merged.nb_genes == 2

    synthetic_dataset_1 = SyntheticDataset(n_batches=2, n_labels=5)
    synthetic_dataset_2 = SyntheticDataset(n_batches=3, n_labels=3)

    synthetic_merged_1 = GeneExpressionDataset.concat_datasets(
        synthetic_dataset_1, synthetic_dataset_2
    )
    assert synthetic_merged_1.n_batches == 5
    assert synthetic_merged_1.n_labels == 5

    synthetic_merged_2 = GeneExpressionDataset.concat_datasets(
        synthetic_dataset_1, synthetic_dataset_2, shared_labels=False
    )
    assert synthetic_merged_2.n_batches == 5
    assert synthetic_merged_2.n_labels == 8

    synthetic_dataset_1.filter_cell_types([0, 1, 2, 3])
    assert synthetic_dataset_1.n_labels == 4

    synthetic_dataset_1.subsample_cells(50)
    assert len(synthetic_dataset_1) == 50

    synthetic_dataset_3 = SyntheticDataset(n_labels=6)
    # Builtin ``str`` instead of ``np.str``: the alias was deprecated in
    # NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
    synthetic_dataset_3.cell_types = np.arange(6).astype(str)
    # No assertion follows: this only checks that mapping with both a single
    # key and a tuple key does not raise.
    synthetic_dataset_3.map_cell_types({"2": "9", ("4", "3"): "8"})
# NOTE(review): fragment of a larger routine whose beginning is outside this
# chunk; dataset1, batch_array, label_array, gene_names, save_path, plotname
# and models are expected to be defined there.

# Passes a boolean mask selecting batch index 0 -- presumably restricts
# dataset1 to those cells; confirm against update_cells.
dataset1.update_cells(batch_array.ravel() == 0)

# Read the second count table and transpose it after loading.
count_matrix = pd.read_csv(os.path.join(save_path, "DE.obsv.4.csv"), sep=",", index_col=0).T
# Build the second dataset from the matrix values, reusing the same label and
# batch arrays; cell types are the unique label values.
dataset2 = GeneExpressionDataset(
    *GeneExpressionDataset.get_attributes_from_matrix(
        count_matrix.values, labels=label_array, batch_indices=batch_array),
    gene_names=gene_names, cell_types=np.unique(label_array))
# Same masking as above, this time with batch index 1.
dataset2.update_cells(batch_array.ravel() == 1)

gene_dataset = GeneExpressionDataset.concat_datasets(dataset1, dataset2)
# gene_dataset.subsample_genes(500)

# Re-encode the labels: each label indexes cell_types, the entry is parsed as
# an int and shifted down by one.
labels = [
    int(gene_dataset.cell_types[i]) - 1 for i in gene_dataset.labels.ravel()
]
# Store the labels back as a column vector of shape (len(labels), 1).
gene_dataset.labels = np.asarray(labels).reshape(len(labels), 1)
gene_dataset.cell_types = dataset2.cell_types
# from scipy import sparse
# gene_dataset.X = sparse.csr_matrix(gene_dataset.X )

# Cast gene names to integers on all three datasets.
gene_dataset.gene_names = gene_dataset.gene_names.astype('int')
dataset1.gene_names = dataset1.gene_names.astype('int')
dataset2.gene_names = dataset2.gene_names.astype('int')
# dataset1, dataset2, gene_dataset = SubsetGenes(dataset1, dataset2, gene_dataset, plotname)
CompareModels(gene_dataset, dataset1, dataset2, plotname, models)
# NOTE(review): this `return` is the tail of a function whose header lies
# outside this chunk.
return auc_1, auc_2, spear, kend

# Script section: build a combined dataset from PbmcDataset and a 10X donor
# dataset, then prepare a partially mislabelled copy of the labels.
save_path = "../symsim_scVI/symsim_result/DE/"
pbmc = PbmcDataset()
de_data = pbmc.de_metadata
# Passes a boolean mask selecting batch index 0 -- presumably restricts pbmc to
# those cells; confirm against update_cells.
pbmc.update_cells(pbmc.batch_indices.ravel()==0)
donor = Dataset10X('fresh_68k_pbmc_donor_a')
donor.gene_names = donor.gene_symbols
# Every donor cell gets label 0, stored as a column vector, with a single
# 'unlabelled' cell type.
donor.labels = np.repeat(0,len(donor)).reshape(len(donor),1)
donor.cell_types = ['unlabelled']
donor.subsample_genes(donor.nb_genes)
gene_dataset = GeneExpressionDataset.concat_datasets(pbmc, donor)
################## Generate Mis-labels ######################################################################################
labels = np.asarray(gene_dataset.labels.ravel())
# pop1 = np.where(gene_dataset.cell_types=='B cells')[0][0]
# pop2 = np.where(gene_dataset.cell_types=='Dendritic Cells')[0][0]
# Index of each population's name within gene_dataset.cell_types.
pop1 = np.where(gene_dataset.cell_types=='CD4 T cells')[0][0]
pop2 = np.where(gene_dataset.cell_types=='CD8 T cells')[0][0]
mislabels = deepcopy(labels)
# Per-cell 0/1 draw, 1 with probability misprop.
# NOTE(review): misprop is not defined in the visible part of the file.
mises = np.random.choice([0,1],len(mislabels),p=[1-misprop, misprop])
pop1cells = (labels==pop1)
pop2cells = (labels==pop2)
# flip the DE: pop1 cells with a non-zero draw get the pop2 label
mislabels[np.logical_and(mises, pop1cells)] = pop2