def read_scale_dataset(dsname="leukemia",
                       filtered_genes=True,
                       override=False,
                       verbose=True) -> SingleCellOMIC:
    r"""Load one of the scATAC-seq datasets distributed with SCALE.

    Datasets provided by (Xiong et al. 2019), four datasets are supported:

    - 'breast_tumor'
    - 'forebrain'
    - 'leukemia'
    - 'insilico'

    The archive is downloaded (with MD5 verification delegated to
    `download_file`), every archive member is written flat into the
    preprocessed folder if that folder is still empty, and the arrays
    belonging to `dsname` are wrapped into a `SingleCellOMIC` whose main
    omic is ATAC, plus a one-hot cell-type omic built from the labels.

    Arguments:
      dsname : name of the dataset, must be one of the four names above.
      filtered_genes, override, verbose : accepted so the signature matches
        the other readers; this function does not use them.

    Returns:
      SingleCellOMIC named after `dsname`.

    Reference:
      Xiong, L. et al. SCALE method for single-cell ATAC-seq analysis via
      latent feature extraction. Nat Commun 10, 4576 (2019).
    """
    datasets = {'breast_tumor', 'forebrain', 'leukemia', 'insilico'}
    assert dsname in datasets, \
        f"Cannot find dataset with name {dsname}, available datasets are: {datasets}"
    download_path = os.path.join(DOWNLOAD_DIR, "scale_dataset")
    preprocessed_path = os.path.join(DATA_DIR, "scale_preprocessed")
    for folder in (download_path, preprocessed_path):
        if not os.path.exists(folder):
            os.makedirs(folder)

    # --- download the archive
    url = str(base64.decodebytes(_URL), 'utf-8')
    archive_path = os.path.join(download_path, os.path.basename(url))
    download_file(url, archive_path, override=False, md5=_MD5)

    # --- unpack only once: an empty folder means nothing was extracted yet
    if not os.listdir(preprocessed_path):
        with zipfile.ZipFile(archive_path, "r") as archive:
            for member in archive.filelist:
                flat_name = os.path.basename(member.filename)
                # directory entries have an empty basename
                if flat_name:
                    target = os.path.join(preprocessed_path, flat_name)
                    with open(target, 'wb') as fout:
                        fout.write(archive.read(member))

    # --- load the arrays of the requested dataset
    def _stored(suffix):
        return os.path.join(preprocessed_path, f"{dsname}_{suffix}")

    cell = np.load(_stored("cell"))
    labels = np.load(_stored("labels"))
    peak = np.load(_stored("peak"))
    x = sparse.load_npz(_stored("x"))

    sco = SingleCellOMIC(X=x,
                         cell_id=cell,
                         gene_id=peak,
                         omic=OMIC.atac,
                         name=dsname)
    # map every distinct label to its rank in sorted order
    label_index = {}
    for rank, label in enumerate(sorted(set(labels))):
        label_index[label] = rank
    encoded = np.array([label_index[label] for label in labels])
    sco.add_omic(OMIC.celltype,
                 X=one_hot(encoded, len(label_index)),
                 var_names=list(label_index))
    return sco
def read_mouse_ATLAS(filtered_genes=True,
                     override=False,
                     verbose=True) -> SingleCellOMIC:
    r"""Load the sci-ATAC-seq mouse atlas as a `SingleCellOMIC`.

    sci-ATAC-seq, to profile genome-wide chromatin accessibility in ∼100,000
    single cells from 13 adult mouse tissues:

    - The regulatory landscape of adult mouse tissues mapped by single-cell
      chromatin assay
    - Characterization of 85 distinct chromatin patterns across 13 different
      tissues
    - Annotation of key regulators and regulatory sequences in diverse
      mammalian cell types
    - Dataset allows resolution of cell types underlying common human traits
      and diseases

    The raw files are downloaded once; the counts matrix and the metadata
    (cell ids, peak ids, tissue and cell-type labels) are cached in the
    preprocessed folder and reused on later calls.

    Arguments:
      filtered_genes, override, verbose : accepted so the signature matches
        the other readers; this function does not use them.

    Returns:
      SingleCellOMIC named "mouse_atlas" with ATAC as the main omic and
      one-hot `celltype` and `tissue` omics.

    References:
      Cusanovich, D. A. et al. A Single-Cell Atlas of In Vivo Mammalian
      Chromatin Accessibility. Cell 174, 1309-1324.e18 (2018).
      Link https://atlas.gs.washington.edu/mouse-atac/
    """
    download_path = os.path.join(DOWNLOAD_DIR, "mouse_atac")
    preprocessed_path = os.path.join(DATA_DIR, "mouse_atac_preprocessed")
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    if not os.path.exists(preprocessed_path):
        os.makedirs(preprocessed_path)

    ### Download data
    files = {}
    for name, (url, md5) in _URLs.items():
        filepath = os.path.join(download_path, os.path.basename(url))
        files[name] = download_file(url, filepath, override=False, md5=md5)

    ### save counts matrix
    path = os.path.join(preprocessed_path, 'counts')
    if not os.path.exists(path):
        print("Reading counts matrix ...")
        # mmread is expected to return a scipy sparse COO matrix here
        counts = mmread(files['counts'])
        # FIX: the original used `np.unit8`, which does not exist and raised
        # AttributeError the first time the cache was built.
        # NOTE(review): uint8 wraps values above 255 - confirm the matrix only
        # holds small counts.
        counts = counts.astype(np.uint8)
        with open(path, 'wb') as f:
            sparse.save_npz(f, counts, compressed=False)

    ### save metadata
    path = os.path.join(preprocessed_path, 'metadata')
    if not os.path.exists(path):
        with open(files['cellids'], 'r') as f:
            cell = np.array([i for i in f.read().split('\n') if len(i) > 0])
        with open(files['peakids'], 'r') as f:
            peak = np.array([i for i in f.read().split('\n') if len(i) > 0])
        metadata = pd.read_csv(files['metadata'], sep="\t")
        assert metadata.shape[0] == len(cell)
        tissue = metadata['tissue'].to_numpy()
        celltype = metadata['cell_label'].to_numpy()
        with open(path, 'wb') as f:
            np.savez(f, cell=cell, peak=peak, tissue=tissue, celltype=celltype)

    ### Read all data and create SCO
    counts = sparse.csr_matrix(
        sparse.load_npz(os.path.join(preprocessed_path, 'counts')))
    metadata = np.load(os.path.join(preprocessed_path, 'metadata'),
                       allow_pickle=True)
    cell = metadata['cell']
    peak = metadata['peak']
    tissue = metadata['tissue']
    celltype = metadata['celltype']
    # need to transpose here, counts matrix is [peaks, cells]
    sco = SingleCellOMIC(X=counts.T,
                         cell_id=cell,
                         gene_id=peak,
                         omic=OMIC.atac,
                         name="mouse_atlas")
    # add celltype
    labels = {name: i for i, name in enumerate(sorted(set(celltype)))}
    sco.add_omic(OMIC.celltype,
                 X=one_hot(np.array([labels[i] for i in celltype]),
                           len(labels)),
                 var_names=list(labels.keys()))
    # add tissue type
    labels = {name: i for i, name in enumerate(sorted(set(tissue)))}
    sco.add_omic(OMIC.tissue,
                 X=one_hot(np.array([labels[i] for i in tissue]), len(labels)),
                 var_names=list(labels.keys()))
    return sco
def read_PBMCeec(subset='ly',
                 override=False,
                 verbose=True,
                 filtered_genes=True) -> SingleCellOMIC:
    r"""Load the PBMC "ecc" dataset (only the lymphoid subset is supported).

    Arguments:
      subset : 'ly', 'my' or 'full' (case and surrounding spaces ignored);
        'my' and 'full' are recognised but raise `NotImplementedError`.
      override : remove an existing preprocessed folder before loading.
      verbose : print progress messages.
      filtered_genes : read the 'X_var' matrix of the archive instead of
        'X_full'.

    Returns:
      SingleCellOMIC with transcriptomic main omic plus 'proteomic' and
      one-hot 'progenitor' (myeloid, lymphoid) omics.

    Raises:
      ValueError : `subset` is not one of the three known names.
      NotImplementedError : `subset` is 'my' or 'full'.
    """
    subset = str(subset).strip().lower()
    if subset not in ('ly', 'my', 'full'):
        raise ValueError(
            "subset can only be 'ly'-lymphoid and 'my'-myeloid or 'full'")
    if subset in ('my', 'full'):
        raise NotImplementedError("No support for subset: %s - PBMCecc" %
                                  subset)

    download_path = os.path.join(DOWNLOAD_DIR, "PBMCecc_%s_original" % subset)
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    gene_tag = 'filtered' if filtered_genes else 'all'
    preprocessed_path = os.path.join(
        DATA_DIR, f"PBMCecc_{subset}_{gene_tag}_preprocessed")
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
        if verbose:
            print(f"Override preprocessed data at path {preprocessed_path}")
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)

    # ******************** preprocessed ******************** #
    # the cache is considered built once the 'X' entry exists
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        if subset == 'full':
            raise NotImplementedError
        encoded_url = _URL_LYMPHOID if subset == 'ly' else _URL_MYELOID
        url = str(base64.decodebytes(encoded_url), 'utf-8')
        archive_path = os.path.join(download_path, os.path.basename(url))
        download_file(filename=archive_path, url=url, override=False)
        # ====== extract the data ====== #
        archive = np.load(archive_path)
        X_row = archive['X_row']
        y = archive['y']
        y_col = archive['y_col']
        matrix_key = 'X_var' if filtered_genes else 'X_full'
        X = archive[matrix_key]
        X_col = archive[matrix_key + '_col']
        cell_types = np.array(['ly' for _ in range(X.shape[0])])
        # ====== save everything ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        assert X.shape == (len(X_row), len(X_col))
        assert len(X) == len(y)
        assert y.shape[1] == len(y_col)
        with open(os.path.join(preprocessed_path, 'cell_types'), 'wb') as f:
            pickle.dump(cell_types, f)
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)

    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    name_suffix = '' if filtered_genes else 'all'
    sco = SingleCellOMIC(X=ds['X'],
                         cell_id=ds['X_row'],
                         gene_id=ds['X_col'],
                         omic='transcriptomic',
                         name=f"ecc{subset}{name_suffix}")
    sco.add_omic('proteomic', X=ds['y'], var_names=ds['y_col'])
    # one-hot over (myeloid, lymphoid): anything that is not 'my' is lymphoid
    one_hot_rows = []
    for cell_type in ds['cell_types']:
        one_hot_rows.append((1, 0) if cell_type == 'my' else (0, 1))
    sco.add_omic(
        'progenitor',
        X=np.array(one_hot_rows, dtype=np.float32),
        var_names=np.array(['myeloid', 'lymphoid']),
    )
    return sco
def read_human_embryos(filtered_genes=True,
                       override=False,
                       verbose=True) -> SingleCellOMIC:
    r"""Load the human preimplantation embryo scRNA-seq dataset.

    Transcriptional map of human embryo development, including the sequenced
    transcriptomes of 1529 individual cells from 88 human preimplantation
    embryos. These data show that cells undergo an intermediate state of
    co-expression of lineage-specific genes, followed by a concurrent
    establishment of the trophectoderm, epiblast, and primitive endoderm
    lineages, which coincide with blastocyst formation.

    Arguments:
      filtered_genes : keep only the highly variable genes selected during
        the first run (otherwise keep every gene passing the basic filters).
      override : remove an existing preprocessed folder before loading.
      verbose : print a message when the preprocessed folder is overridden.

    Returns:
      SingleCellOMIC named "HumanEmbryos" with transcriptomic counts as main
      omic and `rpkm`, `celltype` (one-hot) and `ercc` omics attached.

    References:
      Petropoulos S, Edsgärd D, Reinius B, et al. Single-Cell RNA-Seq Reveals
      Lineage and X Chromosome Dynamics in Human Preimplantation Embryos.
      Cell. 2016 Sep

    Note:
      Gene expression levels (RefSeq annotations) were estimated in terms of
      reads per kilobase exon model and per million mapped reads (RPKM)
      using rpkmforgenes.

      Genes were filtered, keeping 15633/26178 genes that
        * were expressed in at least 5 out of 1919 sequenced cells
          (RPKM >= 10), and
        * for which cells with expression came from at least two different
          embryos.

      Cells were quality-filtered based on 4 criteria, keeping 1529/1919
      cells.
        * First, Spearman correlations, using the RPKM expression levels of
          all genes, for every possible pair of cells were calculated and a
          histogram of the maximum correlation obtained for each cell,
          corresponding to the most similar cell, was used to identify 305
          outlier cells with a maximum pair-wise correlations below 0.63.
        * Second, a histogram of the number of expressed genes per cell was
          used to identify 330 outlier cells with less than 5000 expressed
          genes.
        * Third, a histogram of the total transcriptional expression output
          from the sex chromosomes (RPKM sum) was used to identify 33 cells
          with indeterminable sex, or a called sex that was inconsistent
          with other cells of that embryo.
        * Fourth, 13 outlier cells were identified using PCA and t-SNE
          dimensionality reduction.
    """
    download_dir = os.path.join(DOWNLOAD_DIR, 'human_embryos')
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    preprocessed_path = os.path.join(DATA_DIR, 'human_embryos_preprocessed')
    # FIX: only remove the folder when it exists; the unguarded rmtree raised
    # FileNotFoundError when override=True was used on a fresh machine.
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
        if verbose:
            print(f"Override preprocessed data at {preprocessed_path}")
    if not os.path.exists(preprocessed_path):
        os.makedirs(preprocessed_path)

    ### download data
    files = []
    for url, md5 in zip(_URLs, _MD5s):
        path = download_file(url=url,
                             filename=os.path.join(download_dir,
                                                   os.path.basename(url)),
                             override=False,
                             md5=md5)
        files.append(path)

    ### preprocessing
    if len(os.listdir(preprocessed_path)) == 0:
        data_map = {}
        for zip_path in files:
            zipname = os.path.basename(zip_path)
            with zipfile.ZipFile(zip_path, mode="r") as archive:
                for dat_file in archive.filelist:
                    filename = dat_file.filename
                    dat = str(archive.read(filename), 'utf-8')
                    # parse the tab-separated table, skipping empty lines
                    x = [
                        line.split('\t')
                        for line in dat.split('\n')
                        if len(line) > 0
                    ]
                    # transpose so that the first file column becomes the
                    # first row of the array
                    x = np.asarray(x).T
                    row_name = x[1:, 0]
                    col_name = x[0, 1:]
                    x = x[1:, 1:].astype(np.float32)
                    x = sparse.coo_matrix(x)
                    data_map[filename] = (x, row_name, col_name)
                    print(f"Read: {zipname} - {filename}")
                    print(f" * Matrix: {x.shape}")
                    print(f" * Row : {row_name.shape}-{row_name[:3]}")
                    print(f" * Col : {col_name.shape}-{col_name[:3]}")
        # save loaded data to disk, one file per (table, part) pair; the ':'
        # separator is parsed again when the cache is read below
        for name, (x, row, col) in data_map.items():
            with open(os.path.join(preprocessed_path, f"{name}:x"),
                      "wb") as f:
                sparse.save_npz(f, x)
            with open(os.path.join(preprocessed_path, f"{name}:row"),
                      "wb") as f:
                np.save(f, row)
            with open(os.path.join(preprocessed_path, f"{name}:col"),
                      "wb") as f:
                np.save(f, col)
        del data_map

    ### read the data
    # counts.txt (1529, 26178)
    # ercc.counts.txt (1529, 92)
    # rpkm.txt (1529, 26178)
    # ercc.rpkm.txt (1529, 92)
    data = {}
    genes_path = os.path.join(preprocessed_path, "filtered_genes")
    for path in os.listdir(preprocessed_path):
        # the gene-filter cache lives in the same folder but is not a table
        if path == os.path.basename(genes_path):
            continue
        name, ftype = os.path.basename(path).split(':')
        with open(os.path.join(preprocessed_path, path), 'rb') as f:
            if ftype == 'x':
                x = sparse.load_npz(f).tocsr()
            else:
                x = np.load(f)
        data[f"{name}_{ftype}"] = x
    rpkm = data['rpkm.txt_x']
    counts = data['counts.txt_x']
    genes = data['counts.txt_col']
    cells = data['counts.txt_row']

    ### filter genes
    if not os.path.exists(genes_path):
        # filter genes by rpkm
        ids = np.asarray(np.sum(rpkm, axis=0) >= 10).ravel()
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]
        # filter genes by min 5 cells
        ids = np.asarray(np.sum(counts > 0, axis=0) >= 5).ravel()
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]
        # filter highly variable genes
        sco = SingleCellOMIC(X=counts, cell_id=cells, gene_id=genes)
        sco.normalize(omic=OMIC.transcriptomic, log1p=True)
        sco.filter_highly_variable_genes(n_top_genes=2000)
        filtered = sco.var_names.to_numpy()
        # cache both gene lists so later calls skip the selection
        with open(genes_path, 'wb') as f:
            pickle.dump([genes, filtered], f)
        del sco
    else:
        with open(genes_path, 'rb') as f:
            ids, filtered = pickle.load(f)
        ids = set(ids)
        ids = np.asarray([i in ids for i in genes])
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]
    # last filtering
    if filtered_genes:
        filtered = set(filtered)
        ids = np.asarray([i in filtered for i in genes])
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]

    ### create the SingleCellOMIC
    sco = SingleCellOMIC(X=counts,
                         cell_id=cells,
                         gene_id=genes,
                         omic=OMIC.transcriptomic,
                         name="HumanEmbryos")
    sco.add_omic(omic=OMIC.rpkm, X=rpkm, var_names=genes)
    # the label is the cell name without its last two dot-separated fields
    labels = ['.'.join(i.split('.')[:-2]) for i in sco.obs_names]
    labels = ['E7' if i == 'E7.4' else i for i in labels]
    labels_name = {j: i for i, j in enumerate(sorted(set(labels)))}
    labels = np.array([labels_name[i] for i in labels])
    sco.add_omic(omic=OMIC.celltype,
                 X=one_hot(labels, len(labels_name)),
                 var_names=list(labels_name.keys()))
    sco.add_omic(omic=OMIC.ercc,
                 X=data['ercc.counts.txt_x'],
                 var_names=data['ercc.counts.txt_col'])
    return sco
def read_PBMC8k(subset='full',
                override=False,
                verbose=True,
                filtered_genes=True,
                return_arrays=False) -> SingleCellOMIC:
    r"""Load the PBMC 8k dataset, or its lymphoid/myeloid subset.

    For `subset='full'` the lymphoid and myeloid subsets are loaded first
    (through recursive calls) and the full matrix is restricted to the
    union of their genes and proteins; each cell is tagged 'ly' when its row
    name appears in the lymphoid subset and 'my' otherwise.

    Arguments:
      subset : 'ly', 'my' or 'full' (case and surrounding spaces ignored).
      override : remove an existing preprocessed folder before loading.
      verbose : forwarded as `print_log` to the preprocessing helpers.
      filtered_genes : for 'ly'/'my', read 'X_filt' instead of 'X_full'.
      return_arrays : return the underlying `Dataset` instead of wrapping it
        into a `SingleCellOMIC`.

    Returns:
      SingleCellOMIC (or the `Dataset` when `return_arrays=True`).

    Raises:
      ValueError : `subset` is not one of the three known names.
    """
    subset = str(subset).strip().lower()
    if subset not in ('ly', 'my', 'full'):
        raise ValueError(
            "subset can only be 'ly'-lymphoid and 'my'-myeloid or 'full'")

    # ====== prepare the paths ====== #
    download_path = os.path.join(DOWNLOAD_DIR, f"PBMC8k_{subset}_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    gene_tag = 'filtered' if filtered_genes else 'all'
    preprocessed_path = os.path.join(
        DATA_DIR, f"PBMC8k_{subset}_{gene_tag}_preprocessed")
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)

    # ******************** preprocessed ******************** #
    if not os.listdir(preprocessed_path):
        if subset == 'full':
            # ====== pbmc 8k ====== #
            ly_ds = read_PBMC8k('ly',
                                filtered_genes=filtered_genes,
                                return_arrays=True)
            my_ds = read_PBMC8k('my',
                                filtered_genes=filtered_genes,
                                return_arrays=True)
            url = str(base64.decodebytes(_URL_PBMC8k), 'utf-8')
            archive_path = os.path.join(download_path, os.path.basename(url))
            download_file(filename=archive_path, url=url, override=False)
            # load data
            archive = np.load(archive_path)
            X = archive['X']
            X_row = archive['X_row']
            gene_names = archive['X_col'].tolist()
            y = archive['y']
            protein_names = archive['y_col'].tolist()
            # column positions (in the full matrix) of every gene that
            # appears in either subset
            subset_genes = set(ly_ds['X_col'].tolist() +
                               my_ds['X_col'].tolist())
            gene_ids = sorted(gene_names.index(g) for g in subset_genes)
            # same for protein
            subset_proteins = set(ly_ds['y_col'].tolist() +
                                  my_ds['y_col'].tolist())
            protein_ids = sorted(
                protein_names.index(p) for p in subset_proteins)
            X = X[:, gene_ids]
            y = y[:, protein_ids]
            X_col = np.array(gene_names)[gene_ids]
            y_col = np.array(protein_names)[protein_ids]
            ly_rows = ly_ds['X_row']
            cell_types = np.array(
                ['ly' if row in ly_rows else 'my' for row in X_row])
        else:
            # ====== pbmc ly and my ====== #
            encoded_url = _URL_LYMPHOID if subset == 'ly' else _URL_MYELOID
            url = str(base64.decodebytes(encoded_url), 'utf-8')
            archive_path = os.path.join(download_path, os.path.basename(url))
            download_file(filename=archive_path, url=url, override=False)
            # extract the data
            archive = np.load(archive_path)
            X_row = archive['X_row']
            y = archive['y']
            y_col = archive['y_col']
            matrix_key = 'X_filt' if filtered_genes else 'X_full'
            X = archive[matrix_key]
            X_col = archive[matrix_key + '_col']
            cell_types = np.array([subset for _ in range(X.shape[0])])
        # ====== save everything ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        assert X.shape == (len(X_row), len(X_col))
        assert len(X) == len(y)
        assert y.shape[1] == len(y_col)
        with open(os.path.join(preprocessed_path, 'cell_types'), 'wb') as f:
            pickle.dump(cell_types, f)
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)

    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    if return_arrays:
        return ds
    name_suffix = '' if filtered_genes else 'all'
    sco = SingleCellOMIC(X=ds['X'],
                         cell_id=ds['X_row'],
                         gene_id=ds['X_col'],
                         omic='transcriptomic',
                         name=f"8k{subset}{name_suffix}")
    sco.add_omic('proteomic', X=ds['y'], var_names=ds['y_col'])
    # one-hot over (myeloid, lymphoid): anything that is not 'my' is lymphoid
    one_hot_rows = []
    for cell_type in ds['cell_types']:
        one_hot_rows.append((1, 0) if cell_type == 'my' else (0, 1))
    sco.add_omic(
        'progenitor',
        X=np.array(one_hot_rows, dtype=np.float32),
        var_names=np.array(['myeloid', 'lymphoid']),
    )
    return sco
def read_CITEseq_CBMC(filtered_genes=True, override=False, verbose=True):
    r"""Load the CITE-seq CBMC dataset (gene counts + protein counts).

    On the first call the encrypted archive is downloaded and unpacked, the
    matrices are stored in the preprocessed folder, and a list of top genes
    (scanpy dispersion filter, 2000 genes) is pickled next to them.

    Arguments:
      filtered_genes : restrict the returned genes to the cached top genes.
      override : wipe and recreate an existing preprocessed folder.
      verbose : print progress messages.

    Returns:
      The object returned by `SingleCellOMIC(...).add_omic('proteomic', ...)`
      (after the optional in-place gene subset).

    Raises:
      RuntimeError : an archive member is neither '.npz' nor '.csv'.
    """
    download_path = os.path.join(DOWNLOAD_DIR, "CBMC_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    preprocessed_path = _CITEseq_CBMC_PREPROCESSED
    folder_exists = os.path.exists(preprocessed_path)
    if not folder_exists:
        os.mkdir(preprocessed_path)
    elif override:
        if verbose:
            print("Overriding path: %s" % _CITEseq_CBMC_PREPROCESSED)
        shutil.rmtree(_CITEseq_CBMC_PREPROCESSED)
        os.mkdir(_CITEseq_CBMC_PREPROCESSED)

    top_genes_path = os.path.join(preprocessed_path, 'top_genes')
    # ******************** preprocessed data NOT found ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        # ====== download the data ====== #
        url = str(base64.decodebytes(_URL), 'utf-8')
        zip_path = os.path.join(download_path, os.path.basename(url))
        download_file(filename=zip_path,
                      url=url,
                      override=False,
                      md5=r"beb76d01a67707c61c21bfb188e1b69f")
        # ====== extract the data ====== #
        # every archive member is keyed by its name without extension
        arrays = {}
        for member, raw in crypto.unzip_aes(zip_path,
                                            password=_PASSWORD,
                                            verbose=False):
            if '.npz' in member:
                parsed = sp.sparse.load_npz(BytesIO(raw)).todense()
            elif '.csv' in member:
                parsed = np.loadtxt(StringIO(str(raw, 'utf-8')),
                                    dtype=str,
                                    delimiter=',')
            else:
                raise RuntimeError("Unknown format: %s" % member)
            arrays[os.path.splitext(member)[0]] = parsed
        # ====== post-processing ====== #
        X = np.array(arrays['X'].astype('float32'))
        X_row = arrays['X_row']
        X_col = arrays['X_col']
        X, X_col = remove_allzeros_columns(matrix=X, colname=X_col)
        assert len(X_row) == X.shape[0] and len(X_col) == X.shape[1]
        y = arrays['y'].astype('float32')
        y_row = arrays['y_row']
        y_col = arrays['y_col']
        assert len(y_row) == y.shape[0] and len(y_col) == y.shape[1]
        assert np.all(X_row == y_row), \
            "Cell order mismatch between gene count and protein count"
        # save data
        if verbose:
            print(f"Saving data to {preprocessed_path} ...")
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
        # select the top genes on a scratch object and cache their names
        scratch = SingleCellOMIC(X, cell_id=X_row, gene_id=X_col)
        sc.pp.filter_cells(scratch, min_genes=200)
        sc.pp.filter_genes(scratch, min_cells=3)
        sc.pp.normalize_total(scratch, target_sum=1e4)
        dispersion = sc.pp.filter_genes_dispersion(scratch.X,
                                                   min_mean=0.0125,
                                                   max_mean=3,
                                                   min_disp=0.5,
                                                   log=False,
                                                   n_top_genes=2000)
        scratch._inplace_subset_var(dispersion.gene_subset)
        with open(top_genes_path, 'wb') as f:
            pickle.dump(set(scratch.var_names.values), f)
        del scratch

    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    name_suffix = '' if filtered_genes else 'all'
    sco = SingleCellOMIC(
        X=ds['X'],
        cell_id=ds['X_row'],
        gene_id=ds['X_col'],
        omic='transcriptomic',
        name=f"cbmcCITEseq{name_suffix}",
    ).add_omic('proteomic', ds['y'], ds['y_col'])
    if filtered_genes:
        with open(top_genes_path, 'rb') as f:
            top_genes = pickle.load(f)
        keep = [gene in top_genes for gene in sco.var_names]
        sco._inplace_subset_var(keep)
    return sco
def read_dataset10x(name,
                    filtered_cells=True,
                    filtered_genes=True,
                    override=False,
                    verbose=True) -> SingleCellOMIC:
    r"""Predefined procedure for download and preprocessing 10x dataset into
    `SingleCellOMIC` i.e. scanpy.AnnData object

    The dataset is looked up by name in `all_datasets`; its tar archive is
    downloaded, every omic matrix is stored as a uint16 memory-mapped array
    in the preprocessed folder together with a pickled metadata dictionary
    and a pickled list of top genes. Later calls read that cache directly.

    Arguments:
      name : dataset name (case and surrounding spaces ignored).
      filtered_cells : use the 'filtered' instead of the 'raw' archive.
      filtered_genes : restrict the main omic to the cached top genes.
      override : remove an existing preprocessed folder before loading.
      verbose : print progress information.

    Returns:
      SingleCellOMIC whose main omic is the one recorded in the metadata;
      every other stored omic is attached with `add_omic`.

    Raises:
      ValueError : no dataset with that name, or an unsupported feature
        table layout/type.
      RuntimeError : the name matches several datasets, or the downloaded
        file is not a tar archive.
      NotImplementedError : the experiment type is not supported.

    Reference:
      https://artyomovlab.wustl.edu/publications/supp_materials/4Oleg/2019_sc_ATAC_seq_DT1634_Denis/sc-atacseq-explorer-Denis-121119.html
    """
    ### prepare the URL
    name = str(name).lower().strip()
    spec = 'filtered' if filtered_cells else 'raw'
    flatten_datasets = [(exp, version, dsname)
                        for exp, i in all_datasets.items()
                        for version, j in i.items()
                        for dsname in j]
    found = [(exp, version, dsname)
             for exp, version, dsname in flatten_datasets
             if name == dsname]
    if not found:
        raise ValueError(f"Cannot find data with name {name}, "
                         f"all available datasets are: {flatten_datasets}")
    if len(found) > 1:
        raise RuntimeError(
            f"Found multiple datasets {found} with name='{name}'")
    exp, version, name = found[0]
    dataset_name = name + '_' + spec
    url = group_to_url_skeleton[exp][version].format(version, name, name, spec)
    ### prepare the output path
    filename = os.path.basename(url)
    # download path
    download_path = os.path.join(DOWNLOAD_DIR, exp, version)
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    # preprocessing path
    preprocessed_path = os.path.join(DATA_DIR,
                                     f'10x_{exp}_{name}_{spec}_preprocessed')
    if override and os.path.exists(preprocessed_path):
        if verbose:
            print("Overriding path: %s" % preprocessed_path)
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)

    # ******************** preprocessed ******************** #
    if len(os.listdir(preprocessed_path)) == 0:
        if verbose:
            print("Dataset10X:")
            print(" Meta :", found)
            print(" File :", filename)
            print(" URL :", url)
            print(" Download :", download_path)
            print(" Preprocess :", preprocessed_path)
        ### download the tar file
        path = download_file(url=url,
                             filename=os.path.join(download_path, filename),
                             override=False,
                             md5=_MD5.get(f"{exp}*{version}*{name}*{spec}",
                                          None))
        if not tarfile.is_tarfile(path):
            raise RuntimeError("Expecting tarfile but received: %s" % path)
        contents = {}
        with tarfile.open(path, mode="r:gz") as f:
            all_files = [(path, info.name, info.size, verbose)
                         for info in f
                         if info.isfile()]
        # FIX: dedicated loop names; the original reused `name` here, so the
        # scratch SingleCellOMIC below was named after the last tar member
        # instead of the dataset.
        for member_name, member_data in MPI(jobs=all_files,
                                            func=_read_tarinfo,
                                            batch=1,
                                            ncpu=4):
            contents[member_name] = member_data
        # cell barcodes
        barcodes = contents['barcodes']
        ### cell-atac
        if exp == 'cell-atac':
            n_top_genes = 20000  # this is ad-hoc value
            X = contents['matrix'].T.todense()
            peaks = contents['peaks']
            X_col_name = np.array([':'.join(i) for i in peaks])
            save_data = [(OMIC.atac.name, X)]
            # FIX: the loader below reads f"{main_omic}_var"; the original
            # stored the names under the fixed key 'chromatin_var', which
            # only matches when OMIC.atac.name happens to be 'chromatin'.
            save_metadata = {
                'main_omic': OMIC.atac.name,
                'barcodes': barcodes,
                f"{OMIC.atac.name}_var": X_col_name,
            }
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.atac,
                                 name=name)
        ### cell-exp and cell-vdj
        elif exp in ('cell-exp', 'cell-vdj'):
            n_top_genes = 2000
            # feature (Id, Name, Type(antibody or gene-expression))
            X_col = contents[
                'features'] if 'features' in contents else contents['genes']
            # data matrix
            X = contents['matrix'].T
            if not isinstance(X, csr_matrix) and hasattr(X, 'tocsr'):
                X = X.tocsr()
            X = X.astype('float32')
            assert X.shape[0] == barcodes.shape[0] and X.shape[
                1] == X_col.shape[0]
            # antibody and gene are provided
            prot_ids = []
            pmhc_ids = []
            gene_ids = []
            if X_col.shape[1] == 3:
                for idx, (feat_id, feat_name, feat_type) in enumerate(X_col):
                    if feat_type == 'Antibody Capture':
                        # in cell-vdj data, antibody features without the
                        # '_TotalSeqC' tag are treated as pMHC
                        if exp == "cell-vdj" and "_TotalSeqC" not in feat_name:
                            pmhc_ids.append(idx)
                        else:
                            prot_ids.append(idx)
                    elif feat_type == 'Gene Expression':
                        gene_ids.append(idx)
                    else:
                        raise ValueError(
                            f"Unknown feature type:{feat_id}-{feat_name}-{feat_type}"
                        )
            elif X_col.shape[1] == 2:
                # only (id, name) columns: every feature is a gene
                gene_ids = slice(None, None)
            else:
                raise ValueError(f"No support for features matrix\n{X_col}")
            # Antibody counts and Antibody Name
            y = X[:, prot_ids]
            y_col_name = X_col[prot_ids][:, 1]  # the name
            # pMHC peptide
            if len(pmhc_ids) > 0:
                z = X[:, pmhc_ids]
                z_col_name = X_col[pmhc_ids][:, 1]  # the name
            # Gene counts and Gene Name
            X = X[:, gene_ids].todense()
            X_col_name = X_col[gene_ids][:, 1]  # the name
            # the cache below is written as uint16
            assert np.min(X) >= 0 and np.max(X) < 65000, \
                f"Only support uint16 data type, given data with max={np.max(X)}"
            # data and metadata
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.transcriptomic,
                                 name=name)
            save_data = [(OMIC.transcriptomic.name, X),
                         (OMIC.proteomic.name, y)]
            save_metadata = {
                'main_omic': OMIC.transcriptomic.name,
                'barcodes': barcodes,
                f"{OMIC.transcriptomic.name}_var": X_col_name,
                f"{OMIC.proteomic.name}_var": y_col_name
            }
            if len(pmhc_ids) > 0:
                save_data.append((OMIC.pmhc.name, z))
                save_metadata[f"{OMIC.pmhc.name}_var"] = z_col_name
        ### others
        else:
            raise NotImplementedError(f"No support for experiment: {exp}")
        ### save data and metadata
        for omic_name, omic_data in save_data:
            outpath = os.path.join(preprocessed_path, omic_name)
            n_samples, n_features = omic_data.shape
            # nothing to store for an empty omic
            if n_samples == 0 or n_features == 0:
                continue
            with MmapArrayWriter(outpath,
                                 shape=(0, n_features),
                                 dtype=np.uint16,
                                 remove_exist=True) as f:
                if verbose:
                    # FIX: the message is the description, not the iterable
                    prog = tqdm(desc=f"Saving {outpath}",
                                total=n_samples,
                                unit='samples')
                # write in row batches to bound memory when densifying
                for s, e in batching(batch_size=5120, n=n_samples):
                    x = omic_data[s:e]
                    if hasattr(x, 'todense'):
                        x = x.todense()
                    f.write(x)
                    if verbose:
                        prog.update(e - s)
                if verbose:
                    prog.clear()
                    prog.close()
        # save metadata
        outpath = os.path.join(preprocessed_path, 'metadata')
        with open(outpath, 'wb') as f:
            pickle.dump(save_metadata, f)
        if verbose:
            print(f"Saved metadata to path {outpath}")
        ### filter genes, follow 10x and use Cell Ranger recipe,
        # this is copied from Scanpy
        n_genes = sco.shape[1]
        sc.pp.filter_genes(sco, min_counts=1)
        # normalize with total UMI count per cell
        sc.pp.normalize_total(sco, key_added='n_counts_all')
        filter_result = sc.pp.filter_genes_dispersion(sco.X,
                                                      flavor='cell_ranger',
                                                      n_top_genes=n_top_genes,
                                                      log=False)
        gene_subset = filter_result.gene_subset
        # always keep the marker features that are present
        indices = sco.get_var_indices()
        markers = (MARKER_GENES if sco.current_omic == OMIC.transcriptomic else
                   MARKER_ATAC)
        for marker in markers:
            idx = indices.get(marker, None)
            if idx is not None:
                gene_subset[idx] = True
        sco._inplace_subset_var(gene_subset)  # filter genes
        if verbose:
            print(
                f"Filtering genes {n_genes} to {sco.shape[1]} variated genes.")
        with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
            pickle.dump(sco.var_names.values, f)

    # ******************** load and return the dataset ******************** #
    # every entry without '_' in its name (besides the two pickles) is an
    # omic matrix
    omics = [
        entry for entry in os.listdir(preprocessed_path)
        if entry not in ('metadata', 'top_genes') and '_' not in entry
    ]
    with open(os.path.join(preprocessed_path, 'metadata'), 'rb') as f:
        metadata = pickle.load(f)
    with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
        top_genes = pickle.load(f)
    data = {
        entry: MmapArray(os.path.join(preprocessed_path,
                                      entry)).astype(np.float32)
        for entry in omics
    }
    main_omic = metadata['main_omic']
    X = data[main_omic]
    var_names = metadata[f'{main_omic}_var']
    if filtered_genes:
        var_ids = {j: i for i, j in enumerate(var_names)}
        ids = [var_ids[i] for i in top_genes]
        X = X[:, ids]
        var_names = var_names[ids]
    sco = SingleCellOMIC(
        X,
        cell_id=metadata['barcodes'],
        gene_id=var_names,
        omic=main_omic,
        name=f"{dataset_name}{'' if filtered_genes else 'all'}")
    for o in omics:
        if o != main_omic:
            sco.add_omic(omic=o,
                         X=data[o],
                         var_names=np.asarray(metadata[f'{o}_var']))
    return sco
def read_CITEseq_PBMC(override=False,
                      verbose=True,
                      filtered_genes=False) -> SingleCellOMIC:
    r"""Load the CITE-seq PBMC dataset (gene counts + protein counts).

    On the first call the two encrypted archives (gene counts and protein
    counts) are downloaded, decrypted, checked against their MD5 and parsed
    from CSV; mouse genes are dropped and the result is cached in the
    preprocessed folder.

    Arguments:
      override : wipe and recreate the preprocessed folder before loading.
      verbose : print progress messages.
      filtered_genes : use the "5000" archive and its own preprocessed
        folder instead of the full one.

    Returns:
      The object returned by `SingleCellOMIC(...).add_omic('proteomic', ...)`.
    """
    download_path = os.path.join(
        DOWNLOAD_DIR,
        "PBMC_%s_original" % ('5000' if filtered_genes else 'CITEseq'))
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    preprocessed_path = (_5000_PBMC_PREPROCESSED
                         if filtered_genes else _CITEseq_PBMC_PREPROCESSED)
    if override:
        # FIX: the unguarded rmtree raised FileNotFoundError when the cache
        # folder had never been created.
        if os.path.exists(preprocessed_path):
            shutil.rmtree(preprocessed_path)
        os.makedirs(preprocessed_path)

    # ******************** preprocessed data NOT found ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        X, y = [], []
        # ====== download the data ====== #
        download_files = {}
        for url, md5 in zip(
            [_URL_5000 if filtered_genes else _URL_FULL, _URL_PROTEIN],
            [_MD5_5000 if filtered_genes else _MD5_FULL, _MD5_PROTEIN]):
            url = str(base64.decodebytes(url), 'utf-8')
            base_name = os.path.basename(url)
            path = os.path.join(download_path, base_name)
            download_file(filename=path, url=url, override=False)
            download_files[base_name] = (path, md5)
        # ====== extract the data ====== #
        # distinct row lengths seen over both tables; must end up with one
        n = set()
        for name, (path, md5) in sorted(download_files.items()):
            if verbose:
                print(f"Extracting {name} ...")
            binary_data = decrypt_aes(path, password=_PASSWORD)
            # the checksum is taken over the decrypted payload
            md5_ = md5_checksum(binary_data)
            assert md5_ == md5, f"MD5 checksum mismatch for file: {name}"
            with zipfile.ZipFile(file=BytesIO(binary_data), mode='r') as f:
                for member in f.namelist():
                    data = str(f.read(member), 'utf8')
                    for line in data.split('\n'):
                        if len(line) == 0:
                            continue
                        line = line.strip().split(',')
                        n.add(len(line))
                        if 'Protein' in member:
                            y.append(line)
                        else:
                            X.append(line)
        # ====== post-processing ====== #
        assert len(n) == 1, \
            "Number of samples inconsistent between raw count and protein count"
        if verbose:
            print("Processing gene count ...")
        # transpose so that the first file column becomes the first row
        X = np.array(X).T
        X_row, X_col = X[1:, 0], X[0, 1:]
        X = X[1:, 1:].astype('float32')
        # ====== filter mouse genes ====== #
        human_cols = ["HUMAN_" in i for i in X_col]
        if verbose:
            print(
                f"Removing {np.sum(np.logical_not(human_cols))} MOUSE genes ..."
            )
        X = X[:, human_cols]
        X_col = np.array([i.replace('HUMAN_', '') for i in X_col[human_cols]])
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        # ====== protein ====== #
        if verbose:
            print("Processing protein count ...")
        y = np.array(y).T
        y_row, y_col = y[1:, 0], y[0, 1:]
        y = y[1:, 1:].astype('float32')
        assert np.all(X_row == y_row), \
            "Cell order mismatch between gene count and protein count"
        # save data
        if verbose:
            print(f"Saving data to {preprocessed_path} ...")
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)

    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    return SingleCellOMIC(
        X=ds['X'],
        cell_id=ds['X_row'],
        gene_id=ds['X_col'],
        omic='transcriptomic',
        name=f"pbmcCITEseq{'' if filtered_genes else 'all'}",
    ).add_omic('proteomic', ds['y'], ds['y_col'])
def read_leukemia_MixedPhenotypes(filtered_genes=True,
                                  omic='rna',
                                  ignore_na=True,
                                  override=False,
                                  verbose=True) -> SingleCellOMIC:
    r"""Load the mixed-phenotype acute leukemia (MPAL) dataset.

    Integrates highly multiplexed protein quantification, transcriptome
    profiling, and chromatin accessibility analysis. Using this approach, we
    establish a normal epigenetic baseline for healthy blood development,
    which we then use to deconvolve aberrant molecular features within blood
    from mixed-phenotype acute leukemia (MPAL) patients.

    scATAC-seq and CITE-seq performed on healthy bone marrow, CD34+ bone
    marrow, peripheral blood, and MPAL donors

    Arguments:
      filtered_genes : (rna only) keep only the cached top genes; the list
        is computed and pickled on the first call.
      omic : 'rna' (scRNA + ADT) or 'atac'.
      ignore_na : (rna only) drop ADT columns whose maximum is NaN.
      override : remove an existing preprocessed folder before loading.
      verbose : print progress messages.

    Returns:
      SingleCellOMIC; for 'rna' with proteomic and celltype omics and an
      'exonlength' variable annotation, for 'atac' with a celltype omic,
      'clusters' observation and 'score' variable annotations.

    Raises:
      NotImplementedError : `omic` is neither 'rna' nor 'atac'.

    References:
      Granja JM et al., 2019. "Single-cell multiomic analysis identifies
      regulatory programs in mixed-phenotype acute leukemia".
      Nature Biotechnology.
      https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE139369
      https://github.com/GreenleafLab/MPAL-Single-Cell-2019
    """
    # FIX: validate before touching the disk or the network; the original
    # raised the same error only after every file had been downloaded.
    if omic not in ('atac', 'rna'):
        raise NotImplementedError(f"No support for omic type: {omic}")
    ### prepare the path
    download_dir = os.path.join(DOWNLOAD_DIR, 'mpal')
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    preprocessed_path = os.path.join(DATA_DIR, 'mpal_preprocessed')
    # FIX: the unguarded rmtree raised FileNotFoundError when the cache
    # folder had never been created.
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
        if verbose:
            print(f"Override preprocessed data at {preprocessed_path}")
    if not os.path.exists(preprocessed_path):
        os.makedirs(preprocessed_path)
    ### download
    files = {}
    for name, (url, md5) in _URL.items():
        path = download_file(url=url,
                             filename=os.path.join(download_dir,
                                                   os.path.basename(url)),
                             override=False,
                             md5=md5)
        files[name] = path
    ### read the files
    # only parse what the requested omic needs
    if omic == 'atac':
        del files['rna']
        del files['adt']
    else:
        del files['atac']
    all_data = {}
    # FIX: forward `verbose` (the original hard-coded verbose=True)
    for name, data in MPI(jobs=list(files.items()),
                          func=partial(_read_data,
                                       verbose=verbose,
                                       preprocessed_path=preprocessed_path),
                          batch=1,
                          ncpu=4):
        all_data[name] = data.load()

    ### load scRNA and ADT
    if omic == 'rna':
        rna = all_data['rna']
        adt = all_data['adt']
        # cells measured in both modalities.
        # FIX: sorted, because plain set iteration order of strings changes
        # between interpreter runs and made the row order unstable.
        cell_id = sorted(
            set(rna.celldata['Barcode']) & set(adt.celldata['Barcode']))
        #
        barcode2ids = {j: i for i, j in enumerate(rna.celldata['Barcode'])}
        ids = [barcode2ids[i] for i in cell_id]
        X_rna = rna.X[ids].astype(np.float32)
        classification = rna.celldata['ProjectClassification'][ids].values
        #
        barcode2ids = {j: i for i, j in enumerate(adt.celldata['Barcode'])}
        X_adt = adt.X[[barcode2ids[i] for i in cell_id]].astype(np.float32)
        #
        if filtered_genes:
            top_genes_path = os.path.join(preprocessed_path, 'top_genes')
            if not os.path.exists(top_genes_path):
                # FIX: select the genes on a scratch copy. The original
                # returned this cell-filtered, normalised object on the first
                # call (while X_adt and the labels still covered all cells),
                # but raw counts on every later call.
                scratch = SingleCellOMIC(X_rna.copy(),
                                         cell_id=cell_id,
                                         gene_id=rna.genenames,
                                         omic=OMIC.transcriptomic,
                                         name='mpalRNA')
                sc.pp.filter_cells(scratch, min_genes=200)
                sc.pp.filter_genes(scratch, min_cells=3)
                sc.pp.normalize_total(scratch, target_sum=1e4)
                result = sc.pp.filter_genes_dispersion(scratch.X,
                                                       min_mean=0.0125,
                                                       max_mean=3,
                                                       min_disp=0.5,
                                                       log=False,
                                                       n_top_genes=2000)
                # make sure all marker genes are included
                gene_subset = result.gene_subset
                gene_indices = scratch.get_var_indices()
                for gene in MARKER_GENES:
                    idx = gene_indices.get(gene, None)
                    if idx is not None:
                        gene_subset[idx] = True
                scratch._inplace_subset_var(gene_subset)
                with open(top_genes_path, 'wb') as f:
                    pickle.dump(scratch.var_names.values, f)
                del scratch
            with open(top_genes_path, 'rb') as f:
                top_genes = set(pickle.load(f))
            ids = [i for i, j in enumerate(rna.genenames) if j in top_genes]
            sco = SingleCellOMIC(X_rna[:, ids],
                                 cell_id=cell_id,
                                 gene_id=rna.genenames[ids],
                                 omic=OMIC.transcriptomic,
                                 name='mpalRNA')
        else:
            sco = SingleCellOMIC(X_rna,
                                 cell_id=cell_id,
                                 gene_id=rna.genenames,
                                 omic=OMIC.transcriptomic,
                                 name='mpalRNAall')
        # loading dataset
        if ignore_na:
            # a NaN anywhere in a column makes its max NaN
            ids = np.logical_not(np.isnan(np.max(X_adt, axis=0)))
            sco.add_omic(OMIC.proteomic, X_adt[:, ids], adt.genenames[ids])
        else:
            sco.add_omic(OMIC.proteomic, X_adt, adt.genenames)
        y, labels = _celltypes(classification)
        sco.add_omic(OMIC.celltype, y, labels)
        exon = {
            i: j for i, j in rna.genedata[['gene_name', 'exonLength']].values
        }
        sco.var['exonlength'] = np.array([exon[i] for i in sco.var_names],
                                         dtype=np.float32)
    ### load ATAC
    else:
        atac = all_data['atac']
        sco = SingleCellOMIC(atac.X.astype(np.float32),
                             cell_id=atac.celldata['Barcode'],
                             gene_id=atac.genenames,
                             omic=OMIC.atac,
                             name='mpalATAC')
        y, labels = _celltypes(atac.celldata['ProjectClassification'].values)
        sco.add_omic(OMIC.celltype, y, labels)
        sco.obs['clusters'] = atac.celldata['Clusters'].values
        sco.var['score'] = atac.genedata['score'].values
    return sco