def __init__(
    self,
    save_path: str = "data/",
    save_path_10X: str = None,
    remove_extracted_data: bool = False,
    delayed_populating: bool = False,
):
    """Set up the PBMC dataset and register its two metadata downloads.

    :param save_path: Location handed to the parent initializer for the
        metadata files.
    :param save_path_10X: Location used for the underlying 10X datasets;
        falls back to ``save_path`` when ``None``.
    :param remove_extracted_data: Stored on the instance and forwarded to
        ``Dataset10X`` by ``populate``.
    :param delayed_populating: Forwarded to the parent initializer; when
        true, the two 10X datasets are also instantiated in delayed mode.
    """
    if save_path_10X is None:
        save_path_10X = save_path
    self.save_path_10X = save_path_10X
    self.remove_extracted_data = remove_extracted_data
    self.barcodes = None

    metadata_urls = [
        "https://github.com/YosefLab/scVI-data/raw/master/gene_info.csv",
        "https://github.com/YosefLab/scVI-data/raw/master/pbmc_metadata.pickle",
    ]
    metadata_filenames = ["gene_info_pbmc.csv", "pbmc_metadata.pickle"]
    super().__init__(
        urls=metadata_urls,
        filenames=metadata_filenames,
        save_path=save_path,
        delayed_populating=delayed_populating,
    )

    # this downloads the necessary file for a future call to populate
    if delayed_populating:
        for tenx_name in ("pbmc8k", "pbmc4k"):
            Dataset10X(
                tenx_name,
                save_path=self.save_path_10X,
                delayed_populating=True,
            )
def populate(self):
    """Load the PBMC metadata, merge the two 10X datasets and align them.

    Steps, in order:

    1. Read ``gene_info_pbmc.csv`` and ``pbmc_metadata.pickle`` from
       ``self.save_path``.
    2. Build the ``pbmc8k`` and ``pbmc4k`` ``Dataset10X`` objects and merge
       them through ``populate_from_datasets``.
    3. Keep only the cells whose barcode appears in the metadata.
    4. Derive labels, cell types and QC attributes from the metadata rows
       whose barcode does not end in ``"11"``.
    5. Keep only the genes listed in the ``ENSG`` column of the gene info.
    """
    self.de_metadata = pd.read_csv(
        os.path.join(self.save_path, "gene_info_pbmc.csv"), sep=","
    )
    # NOTE(review): pickle can execute arbitrary code while loading. This
    # file is fetched from a remote URL, so it must come from a trusted source.
    metadata_path = os.path.join(self.save_path, "pbmc_metadata.pickle")
    with open(metadata_path, "rb") as metadata_file:
        pbmc_metadata = pickle.load(metadata_file)

    datasets = [
        Dataset10X(
            dataset_name,
            save_path=self.save_path_10X,
            remove_extracted_data=self.remove_extracted_data,
            measurement_names_column=0,
        )
        for dataset_name in ("pbmc8k", "pbmc4k")
    ]
    self.populate_from_datasets(datasets)

    # filter cells according to barcodes
    dict_barcodes = dict(zip(self.barcodes, np.arange(len(self.barcodes))))
    # builtin ``str`` replaces the ``np.str`` alias removed in NumPy 1.24
    barcodes_metadata = (
        pbmc_metadata["barcodes"].index.values.ravel().astype(str)
    )
    # barcodes with end -11 filtered on 10X website (49 cells)
    subset_cells = [
        dict_barcodes[barcode]
        for barcode in barcodes_metadata
        if barcode in dict_barcodes
    ]
    self.update_cells(subset_cells=np.asarray(subset_cells))

    # builtin ``bool`` replaces the ``np.bool`` alias removed in NumPy 1.24
    idx_metadata = np.asarray(
        [not barcode.endswith("11") for barcode in barcodes_metadata],
        dtype=bool,
    )
    labels = pbmc_metadata["clusters"][idx_metadata].reshape(-1, 1)[: len(self)]
    self.labels, self.n_labels = remap_categories(labels)
    self.cell_types = pbmc_metadata["list_clusters"][: self.n_labels]

    # only keep the genes for which we have de data
    genes_to_keep = list(self.de_metadata["ENSG"].values)
    # Non empty only for unit tests
    difference = list(set(genes_to_keep).difference(set(self.gene_names)))
    for gene in difference:
        genes_to_keep.remove(gene)
    self.filter_genes_by_attribute(genes_to_keep)
    # this would only affect the unit tests
    self.de_metadata = self.de_metadata.head(len(genes_to_keep))

    self.design = pbmc_metadata["design"][idx_metadata]
    self.raw_qc = pbmc_metadata["raw_qc"][idx_metadata]
    self.qc_names = self.raw_qc.columns
    self.qc = self.raw_qc.values
    self.qc_pc = pbmc_metadata["qc_pc"][idx_metadata]
    self.normalized_qc = pbmc_metadata["normalized_qc"][idx_metadata]
def test_populate_and_train_one_v1(self):
    """Build the ``cd4_t_helper`` 10X dataset and pass it to the training helper."""
    cd4_dataset = Dataset10X(
        dataset_name="cd4_t_helper",
        save_path="tests/data/10X",
        remove_extracted_data=True,
    )
    unsupervised_training_one_epoch(cd4_dataset)
def populate(self):
    """Load every dataset in ``self.dataset_names`` and merge them.

    Each ``Dataset10X`` gets a single-entry ``cell_types`` mapping holding
    its own dataset name, registered for the ``labels`` attribute, before
    all of them are handed to ``populate_from_datasets``.
    """
    loaded = []
    for name in self.dataset_names:
        single = Dataset10X(
            name,
            save_path=self.save_path,
            remove_extracted_data=self.remove_extracted_data,
        )
        cell_type_names = np.asarray([name], dtype="<U128")
        single.initialize_mapped_attribute(
            "labels", "cell_types", cell_type_names
        )
        loaded.append(single)
    self.populate_from_datasets(loaded)
def __init__(
    self,
    save_path: str = "data/",
    subset_datasets: Union[List[int], np.ndarray] = None,
    remove_extracted_data: bool = False,
    delayed_populating: bool = False,
):
    """Select which purified PBMC datasets to use and initialize the parent.

    :param save_path: Location forwarded to the parent initializer and to
        ``Dataset10X``.
    :param subset_datasets: Index (list of ints or numpy array) into the
        list of dataset names below. ``None`` or an empty index selects
        every dataset.
    :param remove_extracted_data: Stored on the instance for later use.
    :param delayed_populating: Forwarded to the parent initializer; when
        true, each selected ``Dataset10X`` is also instantiated in delayed
        mode.
    """
    # NOTE(review): "cd4_t_helper" appears twice (positions 0 and 7). It is
    # kept as-is because callers index this list by position; confirm
    # whether position 7 was meant to be a different dataset.
    self.dataset_names = np.asarray([
        "cd4_t_helper",
        "regulatory_t",
        "naive_t",
        "memory_t",
        "cytotoxic_t",
        "naive_cytotoxic",
        "b_cells",
        "cd4_t_helper",
        "cd34",
        "cd56_nk",
        "cd14_monocytes",
    ])
    # Explicit None/empty check instead of truthiness: a multi-element
    # ndarray raises ValueError on bool(), and np.array([0]) is falsy, which
    # would silently select every dataset instead of the first one.
    if subset_datasets is None or np.size(subset_datasets) == 0:
        subset_datasets = slice(None)
    self.remove_extracted_data = remove_extracted_data
    self.dataset_names = self.dataset_names[subset_datasets]
    super().__init__(save_path=save_path, delayed_populating=delayed_populating)
    if delayed_populating:
        for dataset_name in self.dataset_names:
            Dataset10X(
                dataset_name,
                save_path=save_path,
                delayed_populating=delayed_populating,
            )
    # NOTE(review): these run unconditionally, following the statement order
    # of the original; confirm they are safe when delayed_populating is True.
    self.filter_genes_by_count()
    self.filter_cells_by_count()
""" bayes_f = np.log(bayes_f + 1e-8) - np.log(1 - bayes_f + 1e-8) auc_1 = roc_auc_score(np.abs(log_fold_change) >= 0.6, np.abs(bayes_f)) auc_2 = roc_auc_score(np.abs(log_fold_change) >= 0.8, np.abs(bayes_f)) spear = spearmanr(bayes_f, log_fold_change)[0] kend = kendalltau(bayes_f, log_fold_change)[0] return auc_1, auc_2, spear, kend save_path = "../symsim_scVI/symsim_result/DE/" pbmc = PbmcDataset() de_data = pbmc.de_metadata pbmc.update_cells(pbmc.batch_indices.ravel()==0) donor = Dataset10X('fresh_68k_pbmc_donor_a') donor.gene_names = donor.gene_symbols donor.labels = np.repeat(0,len(donor)).reshape(len(donor),1) donor.cell_types = ['unlabelled'] donor.subsample_genes(donor.nb_genes) gene_dataset = GeneExpressionDataset.concat_datasets(pbmc, donor) ################## Generate Mis-labels ###################################################################################### labels = np.asarray(gene_dataset.labels.ravel()) # pop1 = np.where(gene_dataset.cell_types=='B cells')[0][0] # pop2 = np.where(gene_dataset.cell_types=='Dendritic Cells')[0][0] pop1 = np.where(gene_dataset.cell_types=='CD4 T cells')[0][0] pop2 = np.where(gene_dataset.cell_types=='CD8 T cells')[0][0]