def test_remap_categories(self):
    """Exercise remap_categories with and without a mappings dictionary."""
    # Plain call: non-contiguous category ids are expected to come back
    # renumbered contiguously, along with the number of distinct categories.
    raw_labels = [0, 0, 0, 2, 2, 3]
    expected_labels = [0, 0, 0, 1, 1, 2]
    remapped, n_categories = remap_categories(raw_labels)
    self.assertListEqual(expected_labels, remapped.tolist())
    self.assertEqual(3, n_categories)

    # with absent categories and mappings
    raw_labels = [2, 2, 3]
    expected_labels = [0, 0, 1]
    cell_type_mappings = {"cell_types": ["0", "1", "2", "3"]}
    remapped, n_categories, new_mappings = remap_categories(
        raw_labels, mappings_dict=cell_type_mappings
    )
    self.assertListEqual(expected_labels, remapped.tolist())
    self.assertEqual(2, n_categories)
    # Only the entries for categories that actually occur should survive.
    self.assertListEqual(["2", "3"], new_mappings["cell_types"].tolist())
def populate(self):
    """Build the dataset from the pbmc8k and pbmc4k 10X datasets.

    Reads ``gene_info_pbmc.csv`` and ``pbmc_metadata.pickle`` from
    ``self.save_path``, populates this dataset from the two 10X datasets,
    then keeps only the cells whose barcode appears in the metadata and the
    genes listed in the ``ENSG`` column of the gene table. Finally sets the
    label, cell-type, design and QC attributes from the pickled metadata.
    """
    self.de_metadata = pd.read_csv(
        os.path.join(self.save_path, "gene_info_pbmc.csv"), sep=","
    )
    # NOTE(security): unpickling can execute arbitrary code; this file must
    # come from a trusted source.
    metadata_path = os.path.join(self.save_path, "pbmc_metadata.pickle")
    with open(metadata_path, "rb") as metadata_file:
        pbmc_metadata = pickle.load(metadata_file)

    datasets = [
        Dataset10X(
            dataset_name,
            save_path=self.save_path_10X,
            remove_extracted_data=self.remove_extracted_data,
            measurement_names_column=0,
        )
        for dataset_name in ("pbmc8k", "pbmc4k")
    ]
    self.populate_from_datasets(datasets)

    # filter cells according to barcodes
    dict_barcodes = dict(zip(self.barcodes, np.arange(len(self.barcodes))))
    # Builtin ``str``/``bool`` replace the ``np.str``/``np.bool`` aliases,
    # which were removed from NumPy in 1.24.
    barcodes_metadata = (
        pbmc_metadata["barcodes"].index.values.ravel().astype(str)
    )
    # barcodes with end -11 filtered on 10X website (49 cells)
    subset_cells = [
        dict_barcodes[barcode]
        for barcode in barcodes_metadata
        if barcode in dict_barcodes
    ]
    self.update_cells(subset_cells=np.asarray(subset_cells))

    idx_metadata = np.asarray(
        [not barcode.endswith("11") for barcode in barcodes_metadata],
        dtype=bool,
    )
    labels = pbmc_metadata["clusters"][idx_metadata].reshape(-1, 1)[: len(self)]
    self.labels, self.n_labels = remap_categories(labels)
    self.cell_types = pbmc_metadata["list_clusters"][: self.n_labels]

    # only keep the genes for which we have de data; genes missing from
    # self.gene_names are dropped (non empty only for unit tests), keeping
    # the order of the gene table.
    available_genes = set(self.gene_names)
    genes_to_keep = [
        gene
        for gene in self.de_metadata["ENSG"].values
        if gene in available_genes
    ]
    self.filter_genes_by_attribute(genes_to_keep)
    # this would only affect the unit tests
    self.de_metadata = self.de_metadata.head(len(genes_to_keep))

    self.design = pbmc_metadata["design"][idx_metadata]
    self.raw_qc = pbmc_metadata["raw_qc"][idx_metadata]
    self.qc_names = self.raw_qc.columns
    self.qc = self.raw_qc.values
    self.qc_pc = pbmc_metadata["qc_pc"][idx_metadata]
    self.normalized_qc = pbmc_metadata["normalized_qc"][idx_metadata]
def extract_data_from_anndata(ad: anndata.AnnData):
    """Pull the expression matrix and annotations out of an AnnData object.

    Returns a 10-tuple ``(data, batch_indices, labels, gene_names,
    cell_types, obs, obsm, var, varm, uns)``. ``data`` stays ``None`` when
    ``ad.X`` is not an ``np.ndarray``, ``pd.DataFrame`` or ``csr_matrix``;
    ``batch_indices``, ``labels`` and ``cell_types`` stay ``None`` when the
    corresponding ``obs`` columns are missing.
    """
    matrix = ad.X
    obs = ad.obs
    obs_columns = obs.columns

    # treat all possible cases according to anndata doc
    data = None
    if isinstance(matrix, np.ndarray):
        data = matrix.copy()
    elif isinstance(matrix, pd.DataFrame):
        data = matrix.values
    elif isinstance(matrix, csr_matrix):
        # Densify only when the dense array would take less than 1e9 bytes;
        # larger matrices are kept sparse.
        n_rows, n_cols = matrix.shape
        dense_bytes = n_rows * n_cols * matrix.dtype.itemsize
        if dense_bytes < 1e9:
            logger.info(
                "Dense size under 1Gb, casting to dense format (np.ndarray)."
            )
            data = matrix.toarray()
        else:
            data = matrix.copy()

    gene_names = np.asarray(ad.var.index.values, dtype=str)

    batch_indices = None
    if "batch_indices" in obs_columns:
        batch_indices = obs["batch_indices"].values

    labels = None
    cell_types = None
    if "cell_types" in obs_columns:
        cell_type_names = obs["cell_types"].astype(str)
        cell_types = cell_type_names.drop_duplicates().values
        labels, _ = remap_categories(
            cell_type_names.values, mapping_from=cell_types
        )

    # An explicit "labels" column overrides labels derived from cell types.
    if "labels" in obs_columns:
        labels = obs["labels"]

    return (
        data,
        batch_indices,
        labels,
        gene_names,
        cell_types,
        obs,
        ad.obsm,
        ad.var,
        ad.varm,
        ad.uns,
    )