def _match_genes(sco: SingleCellOMIC, gene_names: dict):
    r""" Select the transcriptomic columns of `sco` that match `gene_names`

    Arguments:
      sco : object providing `get_var_names('transcriptomic')` and
        `get_omic("transcriptomic")`
      gene_names : iterable of gene names (iterating a `dict` yields its keys);
        a name that is not a transcriptomic variable raises `KeyError`

    Return:
      X : the transcriptomic data indexed as `[:, ids]`
      ids : list of column indices, in the same order as `gene_names`
    """
    name_to_index = {}
    for index, var_name in enumerate(sco.get_var_names('transcriptomic')):
        # a duplicated variable name keeps its last position
        name_to_index[var_name] = index
    ids = list(map(name_to_index.__getitem__, gene_names))
    transcriptomic = sco.get_omic("transcriptomic")
    return transcriptomic[:, ids], ids
def read_scale_dataset(dsname="leukemia",
                       filtered_genes=True,
                       override=False,
                       verbose=True) -> SingleCellOMIC:
    r""" Datasets provided by (Xiong et al. 2019), four datasets are supported:

      - 'breast_tumor'
      - 'forebrain'
      - 'leukemia'
      - 'insilico'

    The archive is downloaded once, all of its files are extracted (flattened,
    directory entries skipped) into the preprocessed folder when that folder
    is empty, then the `{dsname}_cell`, `{dsname}_labels`, `{dsname}_peak` and
    `{dsname}_x` files are loaded.

    Arguments:
      dsname : name of the dataset, one of the four names above
      filtered_genes, override, verbose : accepted for signature compatibility
        with the other readers, not used by this function

    Return:
      `SingleCellOMIC` with ATAC as main OMIC and one-hot encoded cell types

    Reference:
      Xiong, L. et al. SCALE method for single-cell ATAC-seq analysis via latent
        feature extraction. Nat Commun 10, 4576 (2019).
    """
    supported = {'breast_tumor', 'forebrain', 'leukemia', 'insilico'}
    assert dsname in supported, \
      f"Cannot find dataset with name {dsname}, available datasets are: {supported}"
    download_path = os.path.join(DOWNLOAD_DIR, "scale_dataset")
    preprocessed_path = os.path.join(DATA_DIR, "scale_preprocessed")
    for folder in (download_path, preprocessed_path):
        if not os.path.exists(folder):
            os.makedirs(folder)
    ### Download data
    url = str(base64.decodebytes(_URL), 'utf-8')
    archive_path = os.path.join(download_path, os.path.basename(url))
    download_file(url, archive_path, override=False, md5=_MD5)
    ### extract the data (only when nothing has been extracted before)
    if not os.listdir(preprocessed_path):
        with zipfile.ZipFile(archive_path, "r") as archive:
            for member in archive.filelist:
                basename = os.path.basename(member.filename)
                if not basename:  # directory entry
                    continue
                target = os.path.join(preprocessed_path, basename)
                with open(target, 'wb') as fout:
                    fout.write(archive.read(member))

    ### load the data
    def _stored(kind):
        return os.path.join(preprocessed_path, f"{dsname}_{kind}")

    cell = np.load(_stored("cell"))
    labels = np.load(_stored("labels"))
    peak = np.load(_stored("peak"))
    x = sparse.load_npz(_stored("x"))
    sco = SingleCellOMIC(X=x,
                         cell_id=cell,
                         gene_id=peak,
                         omic=OMIC.atac,
                         name=dsname)
    # map every distinct label (sorted) to an integer class index
    label_ids = {}
    for index, key in enumerate(sorted(set(labels))):
        label_ids[key] = index
    encoded = np.array([label_ids[i] for i in labels])
    sco.add_omic(OMIC.celltype,
                 X=one_hot(encoded, len(label_ids)),
                 var_names=list(label_ids.keys()))
    return sco
def get_dataset(dataset_name, override=False, verbose=True) -> SingleCellOMIC:
    r""" Load a dataset by name, check `get_dataset_meta` for more information

    List of all dataset available:
      ['call', 'callall', 'mpal', 'mpalall', 'mpalatac', '100yo', '8klyall',
       '8kmyall', '8kly', '8kmy', '8k', '8kall', 'ecclyall', 'eccly',
       'eccmyall', 'eccmy', 'ecc', 'eccall', '8kx', '8kxall', 'eccx',
       'eccxall', 'vdj1x', 'vdj1xall', 'vdj4x', 'vdj4xall', 'mpalx',
       'mpalxall', 'callx', 'callxall', 'pbmcciteseq', 'cbmcciteseq',
       'pbmc5000', 'facs7', 'facs5', 'facs2', 'pbmcscvi', 'cortex', 'retina',
       'hemato', 'vdj1', 'vdj1all', 'vdj2', 'vdj2all', 'vdj3', 'vdj3all',
       'vdj4', 'vdj4all', 'vdjhs3', 'vdjhs3all', 'vdjhs4', 'vdjhs4all',
       'neuron10k', 'neuron10kall', 'heart10k', 'heart10kall', 'memoryt',
       'memorytall', 'naivet', 'naivetall', 'regulatoryt', 'regulatorytall',
       'cd4t', 'cd4tall', '5k', '5kall', '18k', '18kall', '4k', '4kall',
       '10k', '10kall']

    Arguments:
      dataset_name : name of the dataset, converted to a lower-cased and
        stripped string before the lookup
      override, verbose : forwarded to the reader registered under that name

    Return:
      a single `SingleCellOMIC`. When the reader returns a dictionary-like
      dataset containing 'y', the labels are attached to the returned object
      as `OMIC.celltype` (binary labels) or `OMIC.proteomic` (otherwise).

    Raise:
      RuntimeError : if `dataset_name` is not a key of `get_dataset_meta()`

    Example:
      sco = get_dataset("cortex")
    """
    data_meta = get_dataset_meta()
    dataset_name = str(dataset_name).lower().strip()
    if dataset_name not in data_meta:
        available = ", ".join(data_meta.keys())
        raise RuntimeError(
            'Cannot find dataset with name: "%s", all dataset include: %s' %
            (dataset_name, available))
    reader = data_meta[dataset_name]
    with catch_warnings_ignore(FutureWarning):
        ds = reader(override=override, verbose=verbose)
    # the reader already created the SingleCellOMIC, nothing more to do
    if isinstance(ds, SingleCellOMIC):
        return ds
    # otherwise, wrap the raw arrays into a SingleCellOMIC
    validating_dataset(ds)
    with catch_warnings_ignore(FutureWarning):
        sco = SingleCellOMIC(X=ds['X'],
                             cell_id=ds['X_row'],
                             gene_id=ds['X_col'],
                             name=dataset_name)
    if 'y' not in ds:
        return sco
    y = ds['y']
    # binary labels are cell types, everything else is protein levels
    label_omic = OMIC.celltype if is_binary_dtype(y) else OMIC.proteomic
    sco.add_omic(label_omic, y, ds['y_col'])
    return sco
def read_mouse_ATLAS(filtered_genes=True,
                     override=False,
                     verbose=True) -> SingleCellOMIC:
    r""" sci-ATAC-seq, to profile genome-wide chromatin accessibility in ∼100,000
    single cells from 13 adult mouse tissues:

      - The regulatory landscape of adult mouse tissues mapped by single-cell
        chromatin assay
      - Characterization of 85 distinct chromatin patterns across 13 different
        tissues
      - Annotation of key regulators and regulatory sequences in diverse
        mammalian cell types
      - Dataset allows resolution of cell types underlying common human traits
        and diseases

    The counts matrix and the metadata are cached in the preprocessed folder
    (files 'counts' and 'metadata') the first time they are read.

    Arguments:
      filtered_genes, override, verbose : accepted for signature compatibility
        with the other readers, not used by this function

    Return:
      `SingleCellOMIC` named "mouse_atlas" with ATAC as main OMIC, and one-hot
      encoded `OMIC.celltype` and `OMIC.tissue`

    References:
      Cusanovich, D. A. et al. A Single-Cell Atlas of In Vivo Mammalian
        Chromatin Accessibility. Cell 174, 1309-1324.e18 (2018).
      Link https://atlas.gs.washington.edu/mouse-atac/
    """
    download_path = os.path.join(DOWNLOAD_DIR, "mouse_atac")
    preprocessed_path = os.path.join(DATA_DIR, "mouse_atac_preprocessed")
    os.makedirs(download_path, exist_ok=True)
    os.makedirs(preprocessed_path, exist_ok=True)
    ### Download data
    files = {}
    for name, (url, md5) in _URLs.items():
        filepath = os.path.join(download_path, os.path.basename(url))
        files[name] = download_file(url, filepath, override=False, md5=md5)
    ### save counts matrix
    counts_path = os.path.join(preprocessed_path, 'counts')
    if not os.path.exists(counts_path):
        print("Reading counts matrix ...")
        counts = mmread(files['counts'])
        # FIX: the dtype was misspelled `np.unit8`, which raises AttributeError
        counts = counts.astype(np.uint8)
        with open(counts_path, 'wb') as f:
            sparse.save_npz(f, counts, compressed=False)
    ### save metadata
    metadata_path = os.path.join(preprocessed_path, 'metadata')
    if not os.path.exists(metadata_path):
        with open(files['cellids'], 'r') as f:
            cell = np.array([i for i in f.read().split('\n') if len(i) > 0])
        with open(files['peakids'], 'r') as f:
            peak = np.array([i for i in f.read().split('\n') if len(i) > 0])
        metadata = pd.read_csv(files['metadata'], sep="\t")
        assert metadata.shape[0] == len(cell)
        tissue = metadata['tissue'].to_numpy()
        celltype = metadata['cell_label'].to_numpy()
        with open(metadata_path, 'wb') as f:
            np.savez(f, cell=cell, peak=peak, tissue=tissue, celltype=celltype)
    ### Read all data and create SCO
    counts = sparse.csr_matrix(
        sparse.load_npz(os.path.join(preprocessed_path, 'counts')))
    metadata = np.load(os.path.join(preprocessed_path, 'metadata'),
                       allow_pickle=True)
    cell = metadata['cell']
    peak = metadata['peak']
    tissue = metadata['tissue']
    celltype = metadata['celltype']
    # need to transpose here, counts matrix is [peaks, cells]
    sco = SingleCellOMIC(X=counts.T,
                         cell_id=cell,
                         gene_id=peak,
                         omic=OMIC.atac,
                         name="mouse_atlas")
    # add celltype
    labels = {name: i for i, name in enumerate(sorted(set(celltype)))}
    sco.add_omic(OMIC.celltype,
                 X=one_hot(np.array([labels[i] for i in celltype]),
                           len(labels)),
                 var_names=list(labels.keys()))
    # add tissue type
    labels = {name: i for i, name in enumerate(sorted(set(tissue)))}
    sco.add_omic(OMIC.tissue,
                 X=one_hot(np.array([labels[i] for i in tissue]), len(labels)),
                 var_names=list(labels.keys()))
    return sco
def read_melanoma_cisTopicData(filtered_genes=True,
                               override=False,
                               verbose=True):
    r""" melanoma ATAC data from (Bravo González-Blas, et al. 2019)

    The R data files are downloaded once, converted with `rpy2` and pickled
    into the preprocessed folder when that folder is empty.

    Arguments:
      verbose : print the download and conversion progress
      filtered_genes, override : accepted for signature compatibility with the
        other readers, not used by this function

    Return:
      `SingleCellOMIC` with ATAC as main OMIC and one-hot encoded cell types,
      the cell type label is `<cellLine>_<first part of LineType before '-'>`

    Raise:
      ImportError : if `rpy2` is required (nothing preprocessed yet) but
        cannot be imported

    Reference:
      Bravo González-Blas, C. et al. cisTopic: cis-regulatory topic modeling
        on single-cell ATAC-seq data. Nat Methods 16, 397–400 (2019).
      Verfaillie, A. et al. Decoding the regulatory landscape of melanoma
        reveals TEADS as regulators of the invasive cell state.
        Nat Commun 6, (2015).
    """
    download_dir = os.path.join(DOWNLOAD_DIR, 'cistopic')
    preprocessed_path = os.path.join(DATA_DIR, 'cistopic_preprocessed')
    for folder in (download_dir, preprocessed_path):
        if not os.path.exists(folder):
            os.makedirs(folder)
    ### downloading the data, map <file name without extension> -> local path
    rdata_files = {}
    for url in _URL:
        fname = os.path.basename(url)
        fpath = os.path.join(download_dir, fname)
        if not os.path.exists(fpath):
            if verbose:
                print(f"Downloading file: {fname} ...")
            urlretrieve(url, filename=fpath)
        rdata_files[fname.split(".")[0]] = fpath
    ### preprocess data
    if not os.listdir(preprocessed_path):
        try:
            import rpy2.robjects as robjects
            from rpy2.robjects import pandas2ri
            from rpy2.robjects.conversion import localconverter
            robjects.r['options'](warn=-1)
            robjects.r("library(Matrix)")
            pandas2ri.activate()
        except ImportError:
            raise ImportError("Require package 'rpy2' for reading Rdata file.")
        for key, rdata_path in rdata_files.items():
            robjects.r['load'](rdata_path)
            x = robjects.r[key]
            if key != "counts_mel":
                x = robjects.conversion.rpy2py(x)
            else:
                converter = robjects.default_converter + pandas2ri.converter
                with localconverter(converter):
                    # dgCMatrix: reuse its slots directly with the reversed
                    # dimensions
                    dims = tuple(robjects.r("dim")(x))
                    x = sparse.csr_matrix(
                        (x.slots["x"], x.slots["i"], x.slots["p"]),
                        shape=dims[::-1],
                        dtype=np.float32)
            with open(os.path.join(preprocessed_path, key), "wb") as f:
                pickle.dump(x, f)
            if verbose:
                print(f"Loaded file: {key} - {type(x)} - {x.shape}")
        pandas2ri.deactivate()
    ### load_data
    data = {}
    for fname in os.listdir(preprocessed_path):
        with open(os.path.join(preprocessed_path, fname), 'rb') as f:
            data[fname] = pickle.load(f)
    ### sco
    x = data['counts_mel']
    region_names = [f"Region{i + 1}" for i in range(x.shape[1])]
    sco = SingleCellOMIC(X=x,
                         cell_id=data["cellData_mel"].index,
                         gene_id=region_names,
                         omic=OMIC.atac)
    # celltype
    cell_lines = data["cellData_mel"]['cellLine']
    line_types = data["cellData_mel"]['LineType']
    labels = np.array([
        line + '_' + ltype.split("-")[0]
        for line, ltype in zip(cell_lines, line_types)
    ])
    labels_name = {}
    for index, label in enumerate(sorted(set(labels))):
        labels_name[label] = index
    labels = np.array([labels_name[i] for i in labels])
    sco.add_omic(OMIC.celltype, one_hot(labels, len(labels_name)),
                 list(labels_name.keys()))
    return sco
def read_PBMCeec(subset='ly',
                 override=False,
                 verbose=True,
                 filtered_genes=True) -> SingleCellOMIC:
    r""" Read the PBMC-ecc dataset (only the lymphoid subset is supported)

    Arguments:
      subset : {'ly', 'my', 'full'}, converted to a stripped lower-cased
        string; only 'ly' (lymphoid) is implemented
      override : remove the preprocessed folder (if it exists) and
        preprocess again
      verbose : print progress and forward it as `print_log` to the helpers
      filtered_genes : use the 'X_var' matrix of the downloaded file instead
        of 'X_full'

    Return:
      `SingleCellOMIC` with transcriptomic as main OMIC, plus 'proteomic' and
      a one-hot 'progenitor' OMIC with columns ('myeloid', 'lymphoid')

    Raise:
      ValueError : if `subset` is not one of 'ly', 'my', 'full'
      NotImplementedError : if `subset` is 'my' or 'full'
    """
    subset = str(subset).strip().lower()
    if subset not in ('ly', 'my', 'full'):
        raise ValueError(
            "subset can only be 'ly'-lymphoid and 'my'-myeloid or 'full'")
    if subset in ('my', 'full'):
        raise NotImplementedError("No support for subset: %s - PBMCecc" %
                                  subset)
    # from here on `subset` is always 'ly'; the branches that handled 'full'
    # and the myeloid URL were unreachable and have been removed
    download_path = os.path.join(DOWNLOAD_DIR, "PBMCecc_%s_original" % subset)
    # makedirs (instead of mkdir) also works when the parent folder is missing
    os.makedirs(download_path, exist_ok=True)
    preprocessed_path = os.path.join(
        DATA_DIR,
        f"PBMCecc_{subset}_{'filtered' if filtered_genes else 'all'}_preprocessed"
    )
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
        if verbose:
            print(f"Override preprocessed data at path {preprocessed_path}")
    os.makedirs(preprocessed_path, exist_ok=True)
    # ******************** preprocessed ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        url = str(base64.decodebytes(_URL_LYMPHOID), 'utf-8')
        base_name = os.path.basename(url)
        path = os.path.join(download_path, base_name)
        download_file(filename=path, url=url, override=False)
        # ====== extract the data ====== #
        data = np.load(path)
        X_row = data['X_row']
        y = data['y']
        y_col = data['y_col']
        if filtered_genes:
            X = data['X_var']
            X_col = data['X_var_col']
        else:
            X = data['X_full']
            X_col = data['X_full_col']
        cell_types = np.array(['ly'] * X.shape[0])
        # ====== save everything ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        assert X.shape == (len(X_row), len(X_col))
        assert len(X) == len(y)
        assert y.shape[1] == len(y_col)
        with open(os.path.join(preprocessed_path, 'cell_types'), 'wb') as f:
            pickle.dump(cell_types, f)
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    sco = SingleCellOMIC(X=ds['X'],
                         cell_id=ds['X_row'],
                         gene_id=ds['X_col'],
                         omic='transcriptomic',
                         name=f"ecc{subset}{'' if filtered_genes else 'all'}")
    sco.add_omic('proteomic', X=ds['y'], var_names=ds['y_col'])
    progenitor = ds['cell_types']
    sco.add_omic(
        'progenitor',
        X=np.array([(1, 0) if i == 'my' else (0, 1) for i in progenitor],
                   dtype=np.float32),
        var_names=np.array(['myeloid', 'lymphoid']),
    )
    return sco
def read_human_embryos(filtered_genes=True,
                       override=False,
                       verbose=True) -> SingleCellOMIC:
    r""" Transcriptional map of human embryo development, including the sequenced
    transcriptomes of 1529 individual cells from 88 human preimplantation
    embryos. These data show that cells undergo an intermediate state of
    co-expression of lineage-specific genes, followed by a concurrent
    establishment of the trophectoderm, epiblast, and primitive endoderm
    lineages, which coincide with blastocyst formation.

    Arguments:
      filtered_genes : keep only the highly variable genes selected during
        the first preprocessing
      override : remove the preprocessed folder (if it exists) and preprocess
        again
      verbose : print a message when the preprocessed folder is overridden

    Return:
      `SingleCellOMIC` named "HumanEmbryos" with the counts as transcriptomic
      OMIC, plus `OMIC.rpkm`, one-hot `OMIC.celltype` and `OMIC.ercc`

    References:
      Petropoulos S, Edsgärd D, Reinius B, et al. Single-Cell RNA-Seq Reveals
        Lineage and X Chromosome Dynamics in Human Preimplantation Embryos.
        Cell. 2016 Sep

    Note:
      Gene expression levels (RefSeq annotations) were estimated in terms of
        reads per kilobase exon model and per million mapped reads (RPKM)
        using rpkmforgenes
      Genes were filtered, keeping 15633/26178 genes that
        * were expressed in at least 5 out of 1919 sequenced cells
          (RPKM >= 10). and
        * for which cells with expression came from at least two
          different embryos.
      Cells were quality-filtered based on 4 criteria, keeping 1529/1919 cells.
        * First, Spearman correlations, using the RPKM expression levels of
          all genes, for every possible pair of cells were calculated and a
          histogram of the maximum correlation obtained for each cell,
          corresponding to the most similar cell, was used to identify 305
          outlier cells with a maximum pair-wise correlations below 0.63.
        * Second, a histogram of the number of expressed genes per cell was
          used to identify 330 outlier cells with less than 5000 expressed
          genes.
        * Third, a histogram of the total transcriptional expression output
          from the sex chromosomes (RPKM sum) was used to identify 33 cells
          with indeterminable sex, or a called sex that was inconsistent with
          other cells of that embryo
        * Fourth, 13 outlier cells were identified using PCA and t-SNE
          dimensionality reduction.
    """
    download_dir = os.path.join(DOWNLOAD_DIR, 'human_embryos')
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    preprocessed_path = os.path.join(DATA_DIR, 'human_embryos_preprocessed')
    # FIX: only remove the folder when it exists, `shutil.rmtree` raises
    # FileNotFoundError otherwise (e.g. `override=True` on the first call)
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
        if verbose:
            print(f"Override preprocessed data at {preprocessed_path}")
    if not os.path.exists(preprocessed_path):
        os.makedirs(preprocessed_path)
    ### download data
    files = []
    for url, md5 in zip(_URLs, _MD5s):
        path = download_file(url=url,
                             filename=os.path.join(download_dir,
                                                   os.path.basename(url)),
                             override=False,
                             md5=md5)
        files.append(path)
    ### preprocessing
    if len(os.listdir(preprocessed_path)) == 0:
        data_map = {}
        for zip_path in files:
            zipname = os.path.basename(zip_path)
            with zipfile.ZipFile(zip_path, mode="r") as archive:
                for dat_file in archive.filelist:
                    filename = dat_file.filename
                    dat = str(archive.read(filename), 'utf-8')
                    # tab-separated text table, empty lines are skipped
                    x = []
                    for line in dat.split('\n'):
                        if len(line) == 0:
                            continue
                        x.append(line.split('\t'))
                    # after the transpose, the first column holds the row
                    # names and the first row holds the column names
                    x = np.asarray(x).T
                    row_name = x[1:, 0]
                    col_name = x[0, 1:]
                    x = x[1:, 1:].astype(np.float32)
                    x = sparse.coo_matrix(x)
                    data_map[filename] = (x, row_name, col_name)
                    print(f"Read: {zipname} - {filename}")
                    print(f" * Matrix: {x.shape}")
                    print(f" * Row : {row_name.shape}-{row_name[:3]}")
                    print(f" * Col : {col_name.shape}-{col_name[:3]}")
        # save loaded data to disk
        for name, (x, row, col) in data_map.items():
            with open(os.path.join(preprocessed_path, f"{name}:x"), "wb") as f:
                sparse.save_npz(f, x)
            with open(os.path.join(preprocessed_path, f"{name}:row"),
                      "wb") as f:
                np.save(f, row)
            with open(os.path.join(preprocessed_path, f"{name}:col"),
                      "wb") as f:
                np.save(f, col)
        del data_map
    ### read the data
    # counts.txt (1529, 26178)
    # ercc.counts.txt (1529, 92)
    # rpkm.txt (1529, 26178)
    # ercc.rpkm.txt (1529, 92)
    data = {}
    genes_path = os.path.join(preprocessed_path, "filtered_genes")
    for path in os.listdir(preprocessed_path):
        if path == os.path.basename(genes_path):
            continue
        # files are stored as "<name>:<x|row|col>"
        name, ftype = os.path.basename(path).split(':')
        with open(os.path.join(preprocessed_path, path), 'rb') as f:
            if ftype == 'x':
                x = sparse.load_npz(f).tocsr()
            else:
                x = np.load(f)
        data[f"{name}_{ftype}"] = x
    rpkm = data['rpkm.txt_x']
    counts = data['counts.txt_x']
    genes = data['counts.txt_col']
    cells = data['counts.txt_row']
    ### filter genes
    if not os.path.exists(genes_path):
        # filter genes by rpkm
        ids = np.asarray(np.sum(rpkm, axis=0) >= 10).ravel()
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]
        # filter genes by min 5 cells
        ids = np.asarray(np.sum(counts > 0, axis=0) >= 5).ravel()
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]
        # filter highly variable genes
        sco = SingleCellOMIC(X=counts, cell_id=cells, gene_id=genes)
        sco.normalize(omic=OMIC.transcriptomic, log1p=True)
        sco.filter_highly_variable_genes(n_top_genes=2000)
        filtered = sco.var_names.to_numpy()
        # cache both the kept genes and the highly variable genes
        with open(genes_path, 'wb') as f:
            pickle.dump([genes, filtered], f)
        del sco
    else:
        with open(genes_path, 'rb') as f:
            ids, filtered = pickle.load(f)
        ids = set(ids)
        ids = np.asarray([i in ids for i in genes])
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]
    # last filtering
    if filtered_genes:
        filtered = set(filtered)
        ids = np.asarray([i in filtered for i in genes])
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]
    ### create the SingleCellOMIC
    sco = SingleCellOMIC(X=counts,
                         cell_id=cells,
                         gene_id=genes,
                         omic=OMIC.transcriptomic,
                         name="HumanEmbryos")
    sco.add_omic(omic=OMIC.rpkm, X=rpkm, var_names=genes)
    # the label is the cell name without its last two dot-separated fields
    labels = ['.'.join(i.split('.')[:-2]) for i in sco.obs_names]
    labels = ['E7' if i == 'E7.4' else i for i in labels]
    labels_name = {j: i for i, j in enumerate(sorted(set(labels)))}
    labels = np.array([labels_name[i] for i in labels])
    sco.add_omic(omic=OMIC.celltype,
                 X=one_hot(labels, len(labels_name)),
                 var_names=list(labels_name.keys()))
    sco.add_omic(omic=OMIC.ercc,
                 X=data['ercc.counts.txt_x'],
                 var_names=data['ercc.counts.txt_col'])
    return sco
def read_PBMC_crossdataset(name,
                           filtered_genes=True,
                           override=False,
                           verbose=True) -> SingleCellOMIC:
    r""" This create a dataset with shared genes among multiple datasets

      - 'pbmc8k' (6290, 17870)->(6290, 11299) genes
      - 'pbmcecc' (2941, 15634)->(2941, 11299) genes
      - 'pbmcciteseq' (7985, 17006)->(7985, 11299) genes
      - 'cbmcciteseq' (8617, 20400)->(8617, 11299) genes
      - 'call' (37552, 33694)->(37552, 11299) genes
      - 'mpal' (52396, 20287)->(52396, 11299) genes
      - 'pbmc5k' (5247, 33538)->(5247, 11299) genes
      - 'vdj1' (55206, 33538)->(55206, 11299) genes
      - 'vdj4' (36619, 33538)->(36619, 11299) genes

    Total transcriptomic data: 212853(cells) 11299(genes)
    Highly variable genes: 2000

    Arguments:
      name : {'pbmc8k', 'pbmcecc', 'call', 'mpal', 'pbmc5k', 'vdj1', 'vdj4'},
        must be a key of `_DATASETS`
      filtered_genes : additionally keep only the highly variable (and marker)
        genes selected during preprocessing
      override : remove the preprocessed folder (if it exists) and preprocess
        again
      verbose : print progress, also forwarded to the dataset readers

    Return:
      `SingleCellOMIC` of the dataset `name` restricted to the shared genes,
      its name gets the suffix 'x'
    """
    assert name in _DATASETS, \
      (f"Invalid dataset name='{name}', "
       f"available datasets are: {list(_DATASETS.keys())}")
    preprocessed_path = os.path.join(DATA_DIR,
                                     'PBMC_crossdataset_preprocessed')
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
        if verbose:
            print(f"Override preprocessed data at path {preprocessed_path}")
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessing ******************** #
    if len(os.listdir(preprocessed_path)) == 0 or \
      md5_folder(preprocessed_path) != _MD5:
        datasets = {}
        for i, j in _DATASETS.items():
            ds = j(verbose=verbose)
            datasets[i] = ds
            if verbose:
                print(f"Read dataset='{i}' shape={ds.shape}")
        gene_names = sorted(
            reduce(lambda x, y: x & y,
                   (set(i.var_names.values) for i in datasets.values())))
        # this make sure the gene order is random and consistent among all
        # machines
        rand = np.random.RandomState(seed=1)
        rand.shuffle(gene_names)
        # some debugging
        if verbose:
            omics = reduce(lambda x, y: x | y,
                           (i.omics for i in datasets.values()))
            n_samples = {k: v.shape[0] for k, v in datasets.items()}
            print(f"Select {len(gene_names)} common genes "
                  f"among {', '.join(datasets.keys())}.")
            print(f"All available OMICs are: {omics}")
            print(f"Amount of samples: {n_samples}")
        # read data from all available OMICs
        indices = {}
        mRNA = []
        # FIX: the loop variable used to be called `name`, which overwrote the
        # `name` argument, hence the last dataset of `_DATASETS` was returned
        # (instead of the requested one) whenever the preprocessing was run
        for ds_name, ds in datasets.items():
            X, ids = _match_genes(ds, gene_names)
            indices[ds_name] = ids
            mRNA.append(X)
            if verbose:
                print(f"Matching genes for dataset '{ds_name}' "
                      f"{ds.X.shape}->{X.shape} genes")
        mRNA = np.concatenate(mRNA, axis=0)
        if verbose:
            print("Total transcriptomic data:",
                  f"{mRNA.shape[0]}(cells) {mRNA.shape[1]}(genes)")
        # filter genes seurat
        sco = SingleCellOMIC(mRNA, gene_id=gene_names)
        sc.pp.filter_cells(sco, min_genes=200)
        sc.pp.filter_genes(sco, min_cells=3)
        sc.pp.normalize_total(sco, target_sum=1e4)
        result = sc.pp.filter_genes_dispersion(sco.X,
                                               min_mean=0.0125,
                                               max_mean=3,
                                               min_disp=0.5,
                                               log=False,
                                               n_top_genes=2000)
        gene_subset = result.gene_subset
        # make sure all marker genes are included
        # NOTE(review): the positions come from `gene_names`, this presumably
        # relies on `filter_genes` above not removing any gene - confirm
        for i, gene in enumerate(gene_names):
            if gene in MARKER_GENES:
                gene_subset[i] = True
        sco._inplace_subset_var(gene_subset)
        top_genes = set(sco.var_names.values)
        if verbose:
            print(f"Filtered highly variable genes: {len(top_genes)}")
        del sco
        # save the indices and top_genes
        with open(os.path.join(preprocessed_path, 'gene_indices'), 'wb') as f:
            pickle.dump([gene_names, indices, top_genes], f)
        print(f"Preprocessed MD5: {md5_folder(preprocessed_path)}")
    # ******************** load the dataset ******************** #
    with open(os.path.join(preprocessed_path, 'gene_indices'), 'rb') as f:
        gene_names, indices, top_genes = pickle.load(f)
    sco = _DATASETS[name](verbose=verbose)
    sco._inplace_subset_var(indices[name])
    if filtered_genes:
        top_indices = [i in top_genes for i in sco.var_names]
        sco._inplace_subset_var(top_indices)
    sco._name += 'x'
    return sco
def read_PBMC8k(subset='full',
                override=False,
                verbose=True,
                filtered_genes=True,
                return_arrays=False) -> SingleCellOMIC:
    r""" Read the PBMC-8k dataset or one of its two subsets

    Arguments:
      subset : {'ly', 'my', 'full'}, converted to a stripped lower-cased
        string. For 'full', the genes and proteins are the union of those of
        the 'ly' and 'my' subsets.
      override : remove the preprocessed folder of this subset (if it exists)
        and preprocess again
      verbose : print progress and forward it as `print_log` to the helpers
      filtered_genes : for 'ly' and 'my', use the 'X_filt' matrix of the
        downloaded file instead of 'X_full'
      return_arrays : return the preprocessed `Dataset` instead of creating
        the `SingleCellOMIC`

    Return:
      `SingleCellOMIC` with transcriptomic as main OMIC, plus 'proteomic' and
      a one-hot 'progenitor' OMIC with columns ('myeloid', 'lymphoid');
      or the `Dataset` if `return_arrays=True`

    Raise:
      ValueError : if `subset` is not one of 'ly', 'my', 'full'
    """
    subset = str(subset).strip().lower()
    if subset not in ('ly', 'my', 'full'):
        raise ValueError(
            "subset can only be 'ly'-lymphoid and 'my'-myeloid or 'full'")
    # prepare the path
    # makedirs (instead of mkdir) also works when the parent folder is missing
    download_path = os.path.join(DOWNLOAD_DIR, f"PBMC8k_{subset}_original")
    os.makedirs(download_path, exist_ok=True)
    preprocessed_path = os.path.join(
        DATA_DIR,
        f"PBMC8k_{subset}_{'filtered' if filtered_genes else 'all'}_preprocessed"
    )
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
    os.makedirs(preprocessed_path, exist_ok=True)
    # ******************** preprocessed ******************** #
    if len(os.listdir(preprocessed_path)) == 0:
        # ====== pbmc 8k ====== #
        if subset == 'full':
            # `verbose` is forwarded so a silent call stays silent
            ly = read_PBMC8k('ly',
                             verbose=verbose,
                             filtered_genes=filtered_genes,
                             return_arrays=True)
            my = read_PBMC8k('my',
                             verbose=verbose,
                             filtered_genes=filtered_genes,
                             return_arrays=True)
            url = str(base64.decodebytes(_URL_PBMC8k), 'utf-8')
            base_name = os.path.basename(url)
            path = os.path.join(download_path, base_name)
            download_file(filename=path, url=url, override=False)
            # load data
            data = np.load(path)
            X = data['X']
            X_row = data['X_row']
            X_col = data['X_col'].tolist()
            y = data['y']
            y_col = data['y_col'].tolist()
            # name -> first position, replaces the quadratic `list.index`
            # lookups (a missing name now raises KeyError, not ValueError)
            gene_position = {}
            for idx, gene in enumerate(X_col):
                gene_position.setdefault(gene, idx)
            protein_position = {}
            for idx, protein in enumerate(y_col):
                protein_position.setdefault(protein, idx)
            # merge all genes from my and ly subset
            all_genes = set(ly['X_col'].tolist() + my['X_col'].tolist())
            all_genes = sorted(gene_position[i] for i in all_genes)
            # same for protein
            all_proteins = set(ly['y_col'].tolist() + my['y_col'].tolist())
            all_proteins = sorted(protein_position[i] for i in all_proteins)
            #
            X = X[:, all_genes]
            y = y[:, all_proteins]
            X_col = np.array(X_col)[all_genes]
            y_col = np.array(y_col)[all_proteins]
            # a cell that is not in the lymphoid subset is labeled myeloid
            cell_types = np.array(
                ['ly' if i in ly['X_row'] else 'my' for i in X_row])
        # ====== pbmc ly and my ====== #
        else:
            url = str(
                base64.decodebytes(
                    _URL_LYMPHOID if subset == 'ly' else _URL_MYELOID),
                'utf-8')
            base_name = os.path.basename(url)
            path = os.path.join(download_path, base_name)
            download_file(filename=path, url=url, override=False)
            # extract the data
            data = np.load(path)
            X_row = data['X_row']
            y = data['y']
            y_col = data['y_col']
            if filtered_genes:
                X = data['X_filt']
                X_col = data['X_filt_col']
            else:
                X = data['X_full']
                X_col = data['X_full_col']
            cell_types = np.array([subset] * X.shape[0])
        # ====== save everything ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        assert X.shape == (len(X_row), len(X_col))
        assert len(X) == len(y)
        assert y.shape[1] == len(y_col)
        with open(os.path.join(preprocessed_path, 'cell_types'), 'wb') as f:
            pickle.dump(cell_types, f)
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    if return_arrays:
        return ds
    sco = SingleCellOMIC(X=ds['X'],
                         cell_id=ds['X_row'],
                         gene_id=ds['X_col'],
                         omic='transcriptomic',
                         name=f"8k{subset}{'' if filtered_genes else 'all'}")
    sco.add_omic('proteomic', X=ds['y'], var_names=ds['y_col'])
    progenitor = ds['cell_types']
    sco.add_omic(
        'progenitor',
        X=np.array([(1, 0) if i == 'my' else (0, 1) for i in progenitor],
                   dtype=np.float32),
        var_names=np.array(['myeloid', 'lymphoid']),
    )
    return sco
def read_CITEseq_CBMC(filtered_genes=True, override=False, verbose=True):
    r""" Read the CITE-seq CBMC dataset

    On the first call (no 'X' in the preprocessed folder) the encrypted
    archive is downloaded and extracted, the data are saved to the
    preprocessed folder, and the names of the top 2000 dispersed genes are
    pickled to the file 'top_genes'.

    Arguments:
      filtered_genes : keep only the genes stored in 'top_genes'
      override : if the preprocessed folder exists, remove and recreate it
      verbose : print progress and forward it as `print_log` to
        `save_to_dataset`

    Return:
      the object returned by `SingleCellOMIC(...).add_omic('proteomic', ...)`
      with transcriptomic as main OMIC
    """
    download_path = os.path.join(DOWNLOAD_DIR, "CBMC_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    preprocessed_path = _CITEseq_CBMC_PREPROCESSED
    if os.path.exists(preprocessed_path):
        if override:
            if verbose:
                print("Overriding path: %s" % _CITEseq_CBMC_PREPROCESSED)
            shutil.rmtree(_CITEseq_CBMC_PREPROCESSED)
            os.mkdir(_CITEseq_CBMC_PREPROCESSED)
    else:
        os.mkdir(preprocessed_path)
    # ******************** preprocessed data NOT found ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        # ====== download the data ====== #
        url = str(base64.decodebytes(_URL), 'utf-8')
        zip_path = os.path.join(download_path, os.path.basename(url))
        download_file(filename=zip_path,
                      url=url,
                      override=False,
                      md5=r"beb76d01a67707c61c21bfb188e1b69f")
        # ====== extract the data ====== #
        arrays = {}
        for fname, raw in crypto.unzip_aes(zip_path,
                                           password=_PASSWORD,
                                           verbose=False):
            if '.npz' in fname:
                content = sp.sparse.load_npz(BytesIO(raw)).todense()
            elif '.csv' in fname:
                content = np.loadtxt(StringIO(str(raw, 'utf-8')),
                                     dtype=str,
                                     delimiter=',')
            else:
                raise RuntimeError("Unknown format: %s" % fname)
            # the key is the file name without its extension
            arrays[os.path.splitext(fname)[0]] = content
        # ====== post-processing ====== #
        X = np.array(arrays['X'].astype('float32'))
        X_row = arrays['X_row']
        X_col = arrays['X_col']
        X, X_col = remove_allzeros_columns(matrix=X, colname=X_col)
        assert len(X_row) == X.shape[0] and len(X_col) == X.shape[1]
        y = arrays['y'].astype('float32')
        y_row = arrays['y_row']
        y_col = arrays['y_col']
        assert len(y_row) == y.shape[0] and len(y_col) == y.shape[1]
        assert np.all(X_row == y_row), \
          "Cell order mismatch between gene count and protein count"
        # save data
        if verbose:
            print(f"Saving data to {preprocessed_path} ...")
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
        # select the top genes on a temporary object and cache their names
        tmp = SingleCellOMIC(X, cell_id=X_row, gene_id=X_col)
        sc.pp.filter_cells(tmp, min_genes=200)
        sc.pp.filter_genes(tmp, min_cells=3)
        sc.pp.normalize_total(tmp, target_sum=1e4)
        dispersion = sc.pp.filter_genes_dispersion(tmp.X,
                                                   min_mean=0.0125,
                                                   max_mean=3,
                                                   min_disp=0.5,
                                                   log=False,
                                                   n_top_genes=2000)
        tmp._inplace_subset_var(dispersion.gene_subset)
        with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
            pickle.dump(set(tmp.var_names.values), f)
        del tmp
    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    transcriptome = SingleCellOMIC(
        X=ds['X'],
        cell_id=ds['X_row'],
        gene_id=ds['X_col'],
        omic='transcriptomic',
        name=f"cbmcCITEseq{'' if filtered_genes else 'all'}",
    )
    sco = transcriptome.add_omic('proteomic', ds['y'], ds['y_col'])
    if not filtered_genes:
        return sco
    with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
        top_genes = pickle.load(f)
    sco._inplace_subset_var([i in top_genes for i in sco.var_names])
    return sco
def read_dataset10x(name,
                    filtered_cells=True,
                    filtered_genes=True,
                    override=False,
                    verbose=True) -> SingleCellOMIC:
    r""" Predefined procedure for download and preprocessing 10x dataset into
    `SingleCellOMIC` i.e. scanpy.AnnData object

    Arguments:
      name : name of the dataset (converted to a stripped lower-cased string),
        must match exactly one entry of `all_datasets`
      filtered_cells : use the 'filtered' (True) or the 'raw' (False) matrix
      filtered_genes : keep only the genes stored in the 'top_genes' file,
        which is created during preprocessing
      override : remove the preprocessed folder (if it exists) and preprocess
        again
      verbose : print progress

    Return:
      `SingleCellOMIC` named `<name>_<filtered|raw>` (with suffix 'all' if
      `filtered_genes=False`), all the other OMICs found in the preprocessed
      folder are added to it

    Raise:
      ValueError : if no dataset with the given name is found, or an unknown
        feature type or features layout is found in the data
      RuntimeError : if multiple datasets match the name, or the downloaded
        file is not a tar file
      NotImplementedError : if the experiment is not 'cell-atac', 'cell-exp'
        or 'cell-vdj'

    Reference:
      https://artyomovlab.wustl.edu/publications/supp_materials/4Oleg/2019_sc_ATAC_seq_DT1634_Denis/sc-atacseq-explorer-Denis-121119.html
    """
    ### prepare the URL
    name = str(name).lower().strip()
    spec = 'filtered' if filtered_cells else 'raw'
    flatten_datasets = [(exp, version, dsname)
                        for exp, i in all_datasets.items()
                        for version, j in i.items()
                        for dsname in j]
    found = [(exp, version, dsname)
             for exp, version, dsname in flatten_datasets
             if name == dsname]
    if not found:
        raise ValueError(f"Cannot find data with name {name}, "
                         f"all available datasets are: {flatten_datasets}")
    if len(found) > 1:
        raise RuntimeError(
            f"Found multiple datasets {found} with name='{name}'")
    exp, version, name = found[0]
    dataset_name = name + '_' + spec
    url = group_to_url_skeleton[exp][version].format(version, name, name, spec)
    ### prepare the output path
    filename = os.path.basename(url)
    # download path
    download_path = os.path.join(DOWNLOAD_DIR, exp, version)
    os.makedirs(download_path, exist_ok=True)
    # preprocessing path
    preprocessed_path = os.path.join(DATA_DIR,
                                     f'10x_{exp}_{name}_{spec}_preprocessed')
    if override and os.path.exists(preprocessed_path):
        if verbose:
            print("Overriding path: %s" % preprocessed_path)
        shutil.rmtree(preprocessed_path)
    # makedirs (instead of mkdir) also works when the parent folder is missing
    os.makedirs(preprocessed_path, exist_ok=True)
    # ******************** preprocessed ******************** #
    if len(os.listdir(preprocessed_path)) == 0:
        if verbose:
            print("Dataset10X:")
            print(" Meta :", found)
            print(" File :", filename)
            print(" URL :", url)
            print(" Download :", download_path)
            print(" Preprocess :", preprocessed_path)
        ### download the tar file
        path = download_file(url=url,
                             filename=os.path.join(download_path, filename),
                             override=False,
                             md5=_MD5.get(f"{exp}*{version}*{name}*{spec}",
                                          None))
        if not tarfile.is_tarfile(path):
            raise RuntimeError("Expecting tarfile but received: %s" % path)
        contents = {}
        with tarfile.open(path, mode="r:gz") as f:
            all_files = [(path, info.name, info.size, verbose)
                         for info in f
                         if info.isfile()]
        # FIX: the loop variables below no longer reuse `name`, the dataset
        # name used to be overwritten by the name of the last tar entry
        for content_name, content in MPI(jobs=all_files,
                                         func=_read_tarinfo,
                                         batch=1,
                                         ncpu=4):
            contents[content_name] = content
        # cell barcodes
        barcodes = contents['barcodes']
        ### cell-atac
        if exp == 'cell-atac':
            n_top_genes = 20000  # this is ad-hoc value
            X = contents['matrix'].T.todense()
            peaks = contents['peaks']
            X_col_name = np.array([':'.join(i) for i in peaks])
            save_data = [(OMIC.atac.name, X)]
            save_metadata = {
                'main_omic': OMIC.atac.name,
                'barcodes': barcodes,
                # legacy key, kept for backward compatibility
                'chromatin_var': X_col_name,
                # FIX: the loading code below reads `f"{main_omic}_var"`
                f"{OMIC.atac.name}_var": X_col_name,
            }
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.atac,
                                 name=name)
        ### cell-exp and cell-vdj
        elif exp in ('cell-exp', 'cell-vdj'):
            n_top_genes = 2000
            # feature (Id, Name, Type(antibody or gene-expression))
            X_col = contents[
                'features'] if 'features' in contents else contents['genes']
            # data matrix
            X = contents['matrix'].T
            if not isinstance(X, csr_matrix) and hasattr(X, 'tocsr'):
                X = X.tocsr()
            X = X.astype('float32')
            assert X.shape[0] == barcodes.shape[0] and \
              X.shape[1] == X_col.shape[0]
            # antibody and gene are provided
            prot_ids = []
            pmhc_ids = []
            gene_ids = []
            if X_col.shape[1] == 3:
                for idx, (feat_id, feat_name, feat_type) in enumerate(X_col):
                    if feat_type == 'Antibody Capture':
                        if exp == "cell-vdj" and "_TotalSeqC" not in feat_name:
                            pmhc_ids.append(idx)
                        else:
                            prot_ids.append(idx)
                    elif feat_type == 'Gene Expression':
                        gene_ids.append(idx)
                    else:
                        raise ValueError(
                            f"Unknown feature type:{feat_id}-{feat_name}-{feat_type}"
                        )
            elif X_col.shape[1] == 2:
                # only (Id, Name) are given, every feature is a gene
                gene_ids = slice(None, None)
            else:
                raise ValueError(f"No support for features matrix\n{X_col}")
            # Antibody Name (second column of the features)
            y = X[:, prot_ids]
            y_col_name = X_col[prot_ids][:, 1]
            # pMHC peptide
            if len(pmhc_ids) > 0:
                z = X[:, pmhc_ids]
                z_col_name = X_col[pmhc_ids][:, 1]
            # Gene Name
            X = X[:, gene_ids].todense()
            X_col_name = X_col[gene_ids][:, 1]
            assert np.min(X) >= 0 and np.max(X) < 65000, \
              f"Only support uint16 data type, given data with max={np.max(X)}"
            # data and metadata
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.transcriptomic,
                                 name=name)
            save_data = [(OMIC.transcriptomic.name, X),
                         (OMIC.proteomic.name, y)]
            save_metadata = {
                'main_omic': OMIC.transcriptomic.name,
                'barcodes': barcodes,
                f"{OMIC.transcriptomic.name}_var": X_col_name,
                f"{OMIC.proteomic.name}_var": y_col_name
            }
            if len(pmhc_ids) > 0:
                save_data.append((OMIC.pmhc.name, z))
                save_metadata[f"{OMIC.pmhc.name}_var"] = z_col_name
        ### others
        else:
            raise NotImplementedError(f"No support for experiment: {exp}")
        ### save data and metadata
        for omic_name, omic_data in save_data:
            outpath = os.path.join(preprocessed_path, omic_name)
            n_samples, n_features = omic_data.shape
            # nothing to save, e.g. no antibody was found
            if n_samples == 0 or n_features == 0:
                continue
            with MmapArrayWriter(outpath,
                                 shape=(0, n_features),
                                 dtype=np.uint16,
                                 remove_exist=True) as f:
                if verbose:
                    # FIX: the description must be given as `desc`, the first
                    # positional argument of tqdm is the iterable
                    prog = tqdm(desc=f"Saving {outpath}",
                                total=n_samples,
                                unit='samples')
                for s, e in batching(batch_size=5120, n=n_samples):
                    x = omic_data[s:e]
                    if hasattr(x, 'todense'):
                        x = x.todense()
                    f.write(x)
                    if verbose:
                        prog.update(e - s)
                if verbose:
                    prog.clear()
                    prog.close()
        # save metadata
        outpath = os.path.join(preprocessed_path, 'metadata')
        with open(outpath, 'wb') as f:
            pickle.dump(save_metadata, f)
        if verbose:
            print(f"Saved metadata to path {outpath}")
        ### filter genes, follow 10x and use Cell Ranger recipe,
        # this is copied from Scanpy
        n_genes = sco.shape[1]
        sc.pp.filter_genes(sco, min_counts=1)
        # normalize with total UMI count per cell
        sc.pp.normalize_total(sco, key_added='n_counts_all')
        filter_result = sc.pp.filter_genes_dispersion(sco.X,
                                                      flavor='cell_ranger',
                                                      n_top_genes=n_top_genes,
                                                      log=False)
        gene_subset = filter_result.gene_subset
        # the marker genes (or peaks) are always kept
        indices = sco.get_var_indices()
        markers = (MARKER_GENES if sco.current_omic == OMIC.transcriptomic else
                   MARKER_ATAC)
        for marker in markers:
            idx = indices.get(marker, None)
            if idx is not None:
                gene_subset[idx] = True
        sco._inplace_subset_var(gene_subset)  # filter genes
        if verbose:
            print(
                f"Filtering genes {n_genes} to {sco.shape[1]} variated genes.")
        with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
            pickle.dump(sco.var_names.values, f)
    # ******************** load and return the dataset ******************** #
    # every file without '_' in its name (except the metadata) is an OMIC
    omics = [
        fname for fname in os.listdir(preprocessed_path)
        if fname not in ('metadata', 'top_genes') and '_' not in fname
    ]
    with open(os.path.join(preprocessed_path, 'metadata'), 'rb') as f:
        metadata = pickle.load(f)
    with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
        top_genes = pickle.load(f)
    data = {
        omic_name:
            MmapArray(os.path.join(preprocessed_path,
                                   omic_name)).astype(np.float32)
        for omic_name in omics
    }
    main_omic = metadata['main_omic']
    X = data[main_omic]
    var_names = metadata[f'{main_omic}_var']
    if filtered_genes:
        var_ids = {j: i for i, j in enumerate(var_names)}
        ids = [var_ids[i] for i in top_genes]
        X = X[:, ids]
        var_names = var_names[ids]
    sco = SingleCellOMIC(
        X,
        cell_id=metadata['barcodes'],
        gene_id=var_names,
        omic=main_omic,
        name=f"{dataset_name}{'' if filtered_genes else 'all'}")
    for o in omics:
        if o != main_omic:
            sco.add_omic(omic=o,
                         X=data[o],
                         var_names=np.asarray(metadata[f'{o}_var']))
    return sco
def read_CITEseq_PBMC(override=False,
                      verbose=True,
                      filtered_genes=False) -> SingleCellOMIC:
    r""" Load the PBMC CITE-seq dataset (gene counts + protein counts).

    On the first call the encrypted archives are downloaded, decrypted,
    checked against their MD5 sums, parsed and stored under the preprocessed
    directory; later calls only read the preprocessed data back.

    Arguments:
      override : a Boolean. If True, delete the preprocessed directory (when
        it exists) and run the preprocessing again.
      verbose : a Boolean. If True, print progress messages.
      filtered_genes : a Boolean. If True, use the '5000' variant of the
        gene-count archive, otherwise the full one.

    Returns:
      a `SingleCellOMIC` whose main omic is 'transcriptomic', with the
      protein counts attached as the 'proteomic' omic.

    Raises:
      AssertionError : on MD5 mismatch, inconsistent number of fields per
        line among the parsed files, or mismatching cell order between the
        gene and the protein tables.
    """
    download_path = os.path.join(
        DOWNLOAD_DIR,
        "PBMC_%s_original" % ('5000' if filtered_genes else 'CITEseq'))
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    preprocessed_path = (_5000_PBMC_PREPROCESSED
                         if filtered_genes else _CITEseq_PBMC_PREPROCESSED)
    if override:
        # `shutil.rmtree` raises if the directory is missing, which is the
        # case when `override=True` is used on the very first run.
        if os.path.exists(preprocessed_path):
            shutil.rmtree(preprocessed_path)
        os.makedirs(preprocessed_path)
    # ******************** preprocessed data NOT found ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        X, y = [], []
        # ====== download the data ====== #
        sources = (
            (_URL_5000 if filtered_genes else _URL_FULL,
             _MD5_5000 if filtered_genes else _MD5_FULL),
            (_URL_PROTEIN, _MD5_PROTEIN),
        )
        download_files = {}
        for url, md5 in sources:
            url = str(base64.decodebytes(url), 'utf-8')
            base_name = os.path.basename(url)
            path = os.path.join(download_path, base_name)
            download_file(filename=path, url=url, override=False)
            download_files[base_name] = (path, md5)
        # ====== extract the data ====== #
        # number of comma-separated fields seen on each line; every line of
        # every file must agree (checked below)
        n = set()
        for name, (path, md5) in sorted(download_files.items()):
            if verbose:
                print(f"Extracting {name} ...")
            # the MD5 is verified on the decrypted content, not on the
            # downloaded file
            binary_data = decrypt_aes(path, password=_PASSWORD)
            md5_ = md5_checksum(binary_data)
            assert md5_ == md5, f"MD5 checksum mismatch for file: {name}"
            with zipfile.ZipFile(file=BytesIO(binary_data), mode='r') as f:
                for member in f.namelist():
                    # protein tables are recognised by the archive member name
                    target = y if 'Protein' in member else X
                    data = str(f.read(member), 'utf8')
                    for line in data.split('\n'):
                        # strip first so whitespace-only lines (e.g. a lone
                        # '\r') are skipped instead of becoming 1-field rows
                        line = line.strip()
                        if not line:
                            continue
                        fields = line.split(',')
                        n.add(len(fields))
                        target.append(fields)
        # ====== post-processing ====== #
        assert len(n) == 1, \
            "Number of samples inconsistent between raw count and protein count"
        if verbose:
            print("Processing gene count ...")
        # transpose, then split off the first column (row names) and the
        # first row (column names)
        X = np.array(X).T
        X_row, X_col = X[1:, 0], X[0, 1:]
        X = X[1:, 1:].astype('float32')
        # ====== filter mouse genes ====== #
        human_cols = np.array(["HUMAN_" in i for i in X_col], dtype=bool)
        if verbose:
            print(f"Removing {np.sum(np.logical_not(human_cols))} MOUSE genes ...")
        X = X[:, human_cols]
        X_col = np.array([i.replace('HUMAN_', '') for i in X_col[human_cols]])
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        # ====== protein ====== #
        if verbose:
            print("Processing protein count ...")
        y = np.array(y).T
        y_row, y_col = y[1:, 0], y[0, 1:]
        y = y[1:, 1:].astype('float32')
        assert np.all(X_row == y_row), \
            "Cell order mismatch between gene count and protein count"
        # save data
        if verbose:
            print(f"Saving data to {preprocessed_path} ...")
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    return SingleCellOMIC(
        X=ds['X'],
        cell_id=ds['X_row'],
        gene_id=ds['X_col'],
        omic='transcriptomic',
        name=f"pbmcCITEseq{'' if filtered_genes else 'all'}",
    ).add_omic('proteomic', ds['y'], ds['y_col'])
def read_leukemia_MixedPhenotypes(filtered_genes=True,
                                  omic='rna',
                                  ignore_na=True,
                                  override=False,
                                  verbose=True) -> SingleCellOMIC:
    r""" Integrates highly multiplexed protein quantification, transcriptome
    profiling, and chromatin accessibility analysis. Using this approach,
    we establish a normal epigenetic baseline for healthy blood development,
    which we then use to deconvolve aberrant molecular features within blood
    from mixed-phenotype acute leukemia (MPAL) patients.

    scATAC-seq and CITE-seq performed on healthy bone marrow, CD34+ bone marrow,
    peripheral blood, and MPAL donors

    Arguments:
      filtered_genes : a Boolean (only used when `omic='rna'`). If True, keep
        only the genes listed in the cached 'top_genes' file of the
        preprocessed directory (the list is computed and cached on the first
        run); otherwise keep all genes.
      omic : {'rna', 'atac'}. 'rna' loads the scRNA counts together with the
        ADT counts (added as proteomic omic) for the barcodes present in
        both; 'atac' loads the scATAC data.
      ignore_na : a Boolean (only used when `omic='rna'`). If True, drop the
        ADT columns whose column-wise maximum is NaN.
      override : a Boolean. If True, remove the preprocessed directory (when
        it exists) before loading.
      verbose : a Boolean. If True, print progress messages.

    Returns:
      a `SingleCellOMIC` with the cell types attached as `OMIC.celltype`.

    Raises:
      NotImplementedError : if `omic` is neither 'rna' nor 'atac'.

    References:
      Granja JM et al., 2019. "Single-cell multiomic analysis identifies
        regulatory programs in mixed-phenotype acute leukemia".
        Nature Biotechnology.
      https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE139369
      https://github.com/GreenleafLab/MPAL-Single-Cell-2019
    """
    # fail fast, before any download is started
    if omic not in ('rna', 'atac'):
        raise NotImplementedError(f"No support for omic type: {omic}")
    ### prepare the path
    download_dir = os.path.join(DOWNLOAD_DIR, 'mpal')
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    preprocessed_path = os.path.join(DATA_DIR, 'mpal_preprocessed')
    if override:
        # `shutil.rmtree` raises if the directory does not exist yet
        if os.path.exists(preprocessed_path):
            shutil.rmtree(preprocessed_path)
        if verbose:
            print(f"Override preprocessed data at {preprocessed_path}")
    if not os.path.exists(preprocessed_path):
        os.makedirs(preprocessed_path)
    ### download
    files = {}
    for name, (url, md5) in _URL.items():
        path = download_file(url=url,
                             filename=os.path.join(download_dir,
                                                   os.path.basename(url)),
                             override=False,
                             md5=md5)
        files[name] = path
    ### read the files
    if omic == 'atac':
        del files['rna']
        del files['adt']
    else:  # omic == 'rna'
        del files['atac']
    all_data = {}
    for name, data in MPI(jobs=list(files.items()),
                          func=partial(_read_data,
                                       verbose=verbose,
                                       preprocessed_path=preprocessed_path),
                          batch=1,
                          ncpu=4):
        all_data[name] = data.load()
    ### load scRNA and ADT
    if omic == 'rna':
        rna = all_data['rna']
        adt = all_data['adt']
        rna_barcodes = rna.celldata['Barcode']
        adt_barcodes = adt.celldata['Barcode']
        # Barcodes available in both modalities. The order of the RNA table
        # is kept (duplicates dropped) so the cell order is reproducible;
        # iterating a plain set intersection is not, because string hashing
        # is randomised per process.
        adt_barcode_set = set(adt_barcodes)
        cell_id = list(
            dict.fromkeys(b for b in rna_barcodes if b in adt_barcode_set))
        #
        barcode2ids = {j: i for i, j in enumerate(rna_barcodes)}
        rna_rows = [barcode2ids[i] for i in cell_id]
        X_rna = rna.X[rna_rows].astype(np.float32)
        classification = rna.celldata['ProjectClassification'][rna_rows].values
        #
        barcode2ids = {j: i for i, j in enumerate(adt_barcodes)}
        X_adt = adt.X[[barcode2ids[i] for i in cell_id]].astype(np.float32)
        #
        if filtered_genes:
            top_genes_path = os.path.join(preprocessed_path, 'top_genes')
            if os.path.exists(top_genes_path):
                # NOTE: the pickle is a cache written by this function;
                # never point this path at untrusted data.
                with open(top_genes_path, 'rb') as f:
                    top_genes = set(pickle.load(f))
                gene_cols = [
                    i for i, j in enumerate(rna.genenames) if j in top_genes
                ]
                sco = SingleCellOMIC(X_rna[:, gene_cols],
                                     cell_id=cell_id,
                                     gene_id=rna.genenames[gene_cols],
                                     omic=OMIC.transcriptomic,
                                     name='mpalRNA')
            else:
                # NOTE(review): on this first-run path the returned object
                # has gone through `filter_cells` / `normalize_total`, while
                # the cached path above is built from the raw `X_rna` with
                # all cells. If `filter_cells` removes any cell, `X_adt` and
                # `classification` below would no longer match the rows of
                # `sco` - confirm whether this is intended.
                sco = SingleCellOMIC(X_rna,
                                     cell_id=cell_id,
                                     gene_id=rna.genenames,
                                     omic=OMIC.transcriptomic,
                                     name='mpalRNA')
                sc.pp.filter_cells(sco, min_genes=200)
                sc.pp.filter_genes(sco, min_cells=3)
                sc.pp.normalize_total(sco, target_sum=1e4)
                result = sc.pp.filter_genes_dispersion(sco.X,
                                                       min_mean=0.0125,
                                                       max_mean=3,
                                                       min_disp=0.5,
                                                       log=False,
                                                       n_top_genes=2000)
                # make sure all marker genes are included
                gene_subset = result.gene_subset
                gene_indices = sco.get_var_indices()
                for gene in MARKER_GENES:
                    idx = gene_indices.get(gene, None)
                    if idx is not None:
                        gene_subset[idx] = True
                sco._inplace_subset_var(gene_subset)
                # cache the selected gene names for the next runs
                with open(top_genes_path, 'wb') as f:
                    pickle.dump(sco.var_names.values, f)
        else:
            sco = SingleCellOMIC(X_rna,
                                 cell_id=cell_id,
                                 gene_id=rna.genenames,
                                 omic=OMIC.transcriptomic,
                                 name='mpalRNAall')
        # loading dataset
        if ignore_na:
            # a NaN anywhere in a column propagates to its maximum
            valid_adt = np.logical_not(np.isnan(np.max(X_adt, axis=0)))
            sco.add_omic(OMIC.proteomic, X_adt[:, valid_adt],
                         adt.genenames[valid_adt])
        else:
            sco.add_omic(OMIC.proteomic, X_adt, adt.genenames)
        y, labels = _celltypes(classification)
        sco.add_omic(OMIC.celltype, y, labels)
        exon = {
            i: j for i, j in rna.genedata[['gene_name', 'exonLength']].values
        }
        sco.var['exonlength'] = np.array([exon[i] for i in sco.var_names],
                                         dtype=np.float32)
    ### load ATAC
    else:
        atac = all_data['atac']
        sco = SingleCellOMIC(atac.X.astype(np.float32),
                             cell_id=atac.celldata['Barcode'],
                             gene_id=atac.genenames,
                             omic=OMIC.atac,
                             name='mpalATAC')
        y, labels = _celltypes(atac.celldata['ProjectClassification'].values)
        sco.add_omic(OMIC.celltype, y, labels)
        sco.obs['clusters'] = atac.celldata['Clusters'].values
        sco.var['score'] = atac.genedata['score'].values
    return sco