def read_PBMC_crossdataset(name,
                           filtered_genes=True,
                           override=False,
                           verbose=True) -> SingleCellOMIC:
    r""" Load one dataset restricted to the genes shared by every dataset
    registered in `_DATASETS`.

    On the first call (or when the preprocessed folder is empty, its MD5
    differs from `_MD5`, or `override=True`) all datasets are read, the
    intersection of their gene names is computed, and a set of highly
    variable genes (plus every gene in `MARKER_GENES`) is selected on the
    concatenated data. The result is pickled to `gene_indices` inside the
    preprocessed folder and reused by later calls.

    Shapes reported for the shared-gene datasets:
      - 'pbmc8k'      (6290, 17870)  -> (6290, 11299) genes
      - 'pbmcecc'     (2941, 15634)  -> (2941, 11299) genes
      - 'pbmcciteseq' (7985, 17006)  -> (7985, 11299) genes
      - 'cbmcciteseq' (8617, 20400)  -> (8617, 11299) genes
      - 'call'        (37552, 33694) -> (37552, 11299) genes
      - 'mpal'        (52396, 20287) -> (52396, 11299) genes
      - 'pbmc5k'      (5247, 33538)  -> (5247, 11299) genes
      - 'vdj1'        (55206, 33538) -> (55206, 11299) genes
      - 'vdj4'        (36619, 33538) -> (36619, 11299) genes

    Total transcriptomic data: 212853(cells) 11299(genes)
    Highly variable genes: 2000

    Arguments:
      name : a key of `_DATASETS`, e.g.
        {'pbmc8k', 'pbmcecc', 'call', 'mpal', 'pbmc5k', 'vdj1', 'vdj4'}
      filtered_genes : if True, keep only the selected highly variable genes
        (and marker genes); otherwise keep all shared genes.
      override : if True, delete the preprocessed folder and recompute.
      verbose : print progress information.

    Returns:
      the requested `SingleCellOMIC`, subset to the shared genes, with 'x'
      appended to its `_name`.

    Raises:
      AssertionError : if `name` is not a key of `_DATASETS`.
    """
    assert name in _DATASETS, \
      (f"Invalid dataset name='{name}', "
       f"available datasets are: {list(_DATASETS.keys())}")
    preprocessed_path = os.path.join(DATA_DIR,
                                     'PBMC_crossdataset_preprocessed')
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
        if verbose:
            print(f"Override preprocessed data at path {preprocessed_path}")
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessing ******************** #
    if len(os.listdir(preprocessed_path)) == 0 or \
      md5_folder(preprocessed_path) != _MD5:
        # NOTE: loop variables below must never be called `name`; the
        # requested dataset name is needed again in the loading stage.
        datasets = {}
        for ds_name, reader in _DATASETS.items():
            ds = reader(verbose=verbose)
            datasets[ds_name] = ds
            if verbose:
                print(f"Read dataset='{ds_name}' shape={ds.shape}")
        gene_names = sorted(
            reduce(lambda x, y: x & y,
                   (set(ds.var_names.values) for ds in datasets.values())))
        # this make sure the gene order is random and consistent among all
        # machines (sorted first, then shuffled with a fixed seed)
        rand = np.random.RandomState(seed=1)
        rand.shuffle(gene_names)
        # some debugging
        if verbose:
            omics = reduce(lambda x, y: x | y,
                           (ds.omics for ds in datasets.values()))
            n_samples = {k: v.shape[0] for k, v in datasets.items()}
            print(f"Select {len(gene_names)} common genes "
                  f"among {', '.join(datasets.keys())}.")
            print(f"All available OMICs are: {omics}")
            print(f"Amount of samples: {n_samples}")
        # read data from all available OMICs
        indices = {}
        mRNA = []
        for ds_name, ds in datasets.items():
            X, ids = _match_genes(ds, gene_names)
            indices[ds_name] = ids
            mRNA.append(X)
            if verbose:
                print(f"Matching genes for dataset '{ds_name}' "
                      f"{ds.X.shape}->{X.shape} genes")
        mRNA = np.concatenate(mRNA, axis=0)
        if verbose:
            print("Total transcriptomic data:",
                  f"{mRNA.shape[0]}(cells) {mRNA.shape[1]}(genes)")
        # filter genes seurat
        sco = SingleCellOMIC(mRNA, gene_id=gene_names)
        sc.pp.filter_cells(sco, min_genes=200)
        sc.pp.filter_genes(sco, min_cells=3)
        sc.pp.normalize_total(sco, target_sum=1e4)
        result = sc.pp.filter_genes_dispersion(sco.X,
                                               min_mean=0.0125,
                                               max_mean=3,
                                               min_disp=0.5,
                                               log=False,
                                               n_top_genes=2000)
        gene_subset = result.gene_subset
        # make sure all marker genes are included; `gene_subset` is aligned
        # with the columns of `sco` *after* `filter_genes`, so iterate over
        # the current variable names rather than the unfiltered gene list
        for i, gene in enumerate(sco.var_names):
            if gene in MARKER_GENES:
                gene_subset[i] = True
        sco._inplace_subset_var(gene_subset)
        top_genes = set(sco.var_names.values)
        if verbose:
            print(f"Filtered highly variable genes: {len(top_genes)}")
        del sco
        # save the indices and top_genes
        with open(os.path.join(preprocessed_path, 'gene_indices'),
                  'wb') as f:
            pickle.dump([gene_names, indices, top_genes], f)
        print(f"Preprocessed MD5: {md5_folder(preprocessed_path)}")
    # ******************** load the dataset ******************** #
    with open(os.path.join(preprocessed_path, 'gene_indices'), 'rb') as f:
        _, indices, top_genes = pickle.load(f)
    sco = _DATASETS[name](verbose=verbose)
    sco._inplace_subset_var(indices[name])
    if filtered_genes:
        top_indices = [i in top_genes for i in sco.var_names]
        sco._inplace_subset_var(top_indices)
    sco._name += 'x'
    return sco
def read_dataset10x(name,
                    filtered_cells=True,
                    filtered_genes=True,
                    override=False,
                    verbose=True) -> SingleCellOMIC:
    r""" Predefined procedure for download and preprocessing 10x dataset into
    `SingleCellOMIC` i.e. scanpy.AnnData object

    The dataset is looked up by name in `all_datasets`, its tar archive is
    downloaded, and each OMIC matrix is written as a `uint16` memory-mapped
    array next to a pickled `metadata` file and a pickled `top_genes` list.
    Later calls read these files directly.

    Arguments:
      name : dataset name; lower-cased and stripped before the lookup.
      filtered_cells : if True use the 'filtered' matrix, otherwise 'raw'.
      filtered_genes : if True keep only the genes stored in `top_genes`
        (highly variable genes plus the marker genes).
      override : if True, remove existing preprocessed data first.
      verbose : print progress information.

    Returns:
      a `SingleCellOMIC` whose main OMIC is the one recorded in the
      metadata; every other stored OMIC is attached with `add_omic`.

    Raises:
      ValueError : no dataset matches `name`, or an unknown feature type or
        feature-table layout is encountered.
      RuntimeError : several datasets match `name`, or the downloaded file is
        not a tar archive.
      NotImplementedError : the experiment type is not supported.

    Reference:
      https://artyomovlab.wustl.edu/publications/supp_materials/4Oleg/2019_sc_ATAC_seq_DT1634_Denis/sc-atacseq-explorer-Denis-121119.html
    """
    ### prepare the URL
    name = str(name).lower().strip()
    spec = 'filtered' if filtered_cells else 'raw'
    flatten_datasets = [(exp, version, dsname)
                        for exp, i in all_datasets.items()
                        for version, j in i.items()
                        for dsname in j]
    found = [entry for entry in flatten_datasets if entry[2] == name]
    if not found:
        raise ValueError(f"Cannot find data with name {name}, "
                         f"all available datasets are: {flatten_datasets}")
    if len(found) > 1:
        raise RuntimeError(
            f"Found multiple datasets {found} with name='{name}'")
    exp, version, name = found[0]
    dataset_name = name + '_' + spec
    url = group_to_url_skeleton[exp][version].format(version, name, name,
                                                     spec)
    ### prepare the output path
    filename = os.path.basename(url)
    # download path
    download_path = os.path.join(DOWNLOAD_DIR, exp, version)
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    # preprocessing path
    preprocessed_path = os.path.join(
        DATA_DIR, f'10x_{exp}_{name}_{spec}_preprocessed')
    if override and os.path.exists(preprocessed_path):
        if verbose:
            print("Overriding path: %s" % preprocessed_path)
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if len(os.listdir(preprocessed_path)) == 0:
        if verbose:
            print("Dataset10X:")
            print(" Meta :", found)
            print(" File :", filename)
            print(" URL :", url)
            print(" Download :", download_path)
            print(" Preprocess :", preprocessed_path)
        ### download the tar file
        path = download_file(url=url,
                             filename=os.path.join(download_path, filename),
                             override=False,
                             md5=_MD5.get(f"{exp}*{version}*{name}*{spec}",
                                          None))
        if not tarfile.is_tarfile(path):
            raise RuntimeError("Expecting tarfile but received: %s" % path)
        contents = {}
        with tarfile.open(path, mode="r:gz") as f:
            # each job carries the archive path, so workers do not depend on
            # this open handle
            all_files = [(path, info.name, info.size, verbose)
                         for info in f
                         if info.isfile()]
        # NOTE: loop variables are deliberately not called `name`, which
        # holds the dataset name used for the objects created below
        for key, content in MPI(jobs=all_files,
                                func=_read_tarinfo,
                                batch=1,
                                ncpu=4):
            contents[key] = content
        # cell barcodes
        barcodes = contents['barcodes']
        ### cell-atac
        if exp == 'cell-atac':
            n_top_genes = 20000  # this is ad-hoc value
            X = contents['matrix'].T.todense()
            peaks = contents['peaks']
            X_col_name = np.array([':'.join(i) for i in peaks])
            save_data = [(OMIC.atac.name, X)]
            save_metadata = dict(main_omic=OMIC.atac.name,
                                 barcodes=barcodes,
                                 chromatin_var=X_col_name)
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.atac,
                                 name=name)
        ### cell-exp and cell-vdj
        elif exp in ('cell-exp', 'cell-vdj'):
            n_top_genes = 2000
            # feature (Id, Name, Type(antibody or gene-expression))
            X_col = contents[
                'features'] if 'features' in contents else contents['genes']
            # data matrix
            X = contents['matrix'].T
            if not isinstance(X, csr_matrix) and hasattr(X, 'tocsr'):
                X = X.tocsr()
            X = X.astype('float32')
            assert X.shape[0] == barcodes.shape[0] and X.shape[
                1] == X_col.shape[0]
            # antibody and gene are provided
            prot_ids = []
            pmhc_ids = []
            gene_ids = []
            if X_col.shape[1] == 3:
                for idx, (feat_id, feat_name, feat_type) in enumerate(X_col):
                    if feat_type == 'Antibody Capture':
                        # in cell-vdj data, antibody features without the
                        # "_TotalSeqC" tag are stored as pMHC features
                        if exp == "cell-vdj" and \
                          "_TotalSeqC" not in feat_name:
                            pmhc_ids.append(idx)
                        else:
                            prot_ids.append(idx)
                    elif feat_type == 'Gene Expression':
                        gene_ids.append(idx)
                    else:
                        raise ValueError(
                            f"Unknown feature type:{feat_id}-{feat_name}-{feat_type}"
                        )
            elif X_col.shape[1] == 2:
                # only (Id, Name) columns: every feature is a gene
                gene_ids = slice(None, None)
            else:
                raise ValueError(f"No support for features matrix\n{X_col}")
            # Antibody counts and names
            y = X[:, prot_ids]
            y_col_name = X_col[prot_ids][:, 1]  # the name
            # pMHC peptide
            if len(pmhc_ids) > 0:
                z = X[:, pmhc_ids]
                z_col_name = X_col[pmhc_ids][:, 1]  # the name
            # Gene counts and names
            X = X[:, gene_ids].todense()
            X_col_name = X_col[gene_ids][:, 1]  # the name
            assert np.min(X) >= 0 and np.max(X) < 65000, \
              f"Only support uint16 data type, given data with max={np.max(X)}"
            # data and metadata
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.transcriptomic,
                                 name=name)
            save_data = [(OMIC.transcriptomic.name, X),
                         (OMIC.proteomic.name, y)]
            save_metadata = {
                'main_omic': OMIC.transcriptomic.name,
                'barcodes': barcodes,
                f"{OMIC.transcriptomic.name}_var": X_col_name,
                f"{OMIC.proteomic.name}_var": y_col_name
            }
            if len(pmhc_ids) > 0:
                save_data.append((OMIC.pmhc.name, z))
                save_metadata[f"{OMIC.pmhc.name}_var"] = z_col_name
        ### others
        else:
            raise NotImplementedError(f"No support for experiment: {exp}")
        ### save data and metadata
        for omic_name, omic_data in save_data:
            outpath = os.path.join(preprocessed_path, omic_name)
            n_samples, n_features = omic_data.shape
            # empty matrices (e.g. no antibody features) are not written
            if n_samples == 0 or n_features == 0:
                continue
            with MmapArrayWriter(outpath,
                                 shape=(0, n_features),
                                 dtype=np.uint16,
                                 remove_exist=True) as f:
                if verbose:
                    # `desc` must be a keyword: the first positional
                    # argument of tqdm is the iterable, not the description
                    prog = tqdm(desc=f"Saving {outpath}",
                                total=n_samples,
                                unit='samples')
                for s, e in batching(batch_size=5120, n=n_samples):
                    x = omic_data[s:e]
                    if hasattr(x, 'todense'):
                        x = x.todense()
                    f.write(x)
                    if verbose:
                        prog.update(e - s)
                if verbose:
                    prog.clear()
                    prog.close()
        # save metadata
        outpath = os.path.join(preprocessed_path, 'metadata')
        with open(outpath, 'wb') as f:
            pickle.dump(save_metadata, f)
        if verbose:
            print(f"Saved metadata to path {outpath}")
        ### filter genes, follow 10x and use Cell Ranger recipe,
        # this is copied from Scanpy
        n_genes = sco.shape[1]
        sc.pp.filter_genes(sco, min_counts=1)
        # normalize with total UMI count per cell
        sc.pp.normalize_total(sco, key_added='n_counts_all')
        filter_result = sc.pp.filter_genes_dispersion(sco.X,
                                                      flavor='cell_ranger',
                                                      n_top_genes=n_top_genes,
                                                      log=False)
        gene_subset = filter_result.gene_subset
        # always keep the known markers that are present in the data
        indices = sco.get_var_indices()
        markers = (MARKER_GENES if sco.current_omic == OMIC.transcriptomic
                   else MARKER_ATAC)
        for marker in markers:
            idx = indices.get(marker, None)
            if idx is not None:
                gene_subset[idx] = True
        sco._inplace_subset_var(gene_subset)  # filter genes
        if verbose:
            print(
                f"Filtering genes {n_genes} to {sco.shape[1]} variated genes.")
        with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
            pickle.dump(sco.var_names.values, f)
    # ******************** load and return the dataset ******************** #
    # every file except the two pickles is treated as an OMIC matrix; names
    # containing '_' are skipped
    # NOTE(review): presumably auxiliary files written alongside the
    # memory-mapped arrays — confirm
    omics = [
        fname for fname in os.listdir(preprocessed_path)
        if fname not in ('metadata', 'top_genes') and '_' not in fname
    ]
    with open(os.path.join(preprocessed_path, 'metadata'), 'rb') as f:
        metadata = pickle.load(f)
    with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
        top_genes = pickle.load(f)
    data = {
        o: MmapArray(os.path.join(preprocessed_path, o)).astype(np.float32)
        for o in omics
    }
    main_omic = metadata['main_omic']
    X = data[main_omic]
    var_names = metadata[f'{main_omic}_var']
    if filtered_genes:
        var_ids = {j: i for i, j in enumerate(var_names)}
        ids = [var_ids[i] for i in top_genes]
        X = X[:, ids]
        var_names = var_names[ids]
    sco = SingleCellOMIC(
        X,
        cell_id=metadata['barcodes'],
        gene_id=var_names,
        omic=main_omic,
        name=f"{dataset_name}{'' if filtered_genes else 'all'}")
    for o in omics:
        if o != main_omic:
            sco.add_omic(omic=o,
                         X=data[o],
                         var_names=np.asarray(metadata[f'{o}_var']))
    return sco
def read_CITEseq_CBMC(filtered_genes=True,
                      override=False,
                      verbose=True) -> "SingleCellOMIC":
    r""" Download, preprocess and load the CITE-seq CBMC dataset.

    When no preprocessed matrix `X` exists yet, the encrypted archive is
    downloaded and extracted, all-zero gene columns are removed, the gene and
    protein matrices are stored with `save_to_dataset`, and the names of the
    highly variable genes are pickled to `top_genes`.

    Arguments:
      filtered_genes : if True keep only the genes stored in `top_genes`.
      override : if True, remove existing preprocessed data first.
      verbose : print progress information.

    Returns:
      a transcriptomic `SingleCellOMIC` with the protein counts attached as
      the 'proteomic' OMIC.

    Raises:
      RuntimeError : the archive contains a file that is neither `.npz`
        nor `.csv`.
      AssertionError : row/column names do not match the matrices, or the
        cell order differs between gene and protein counts.
    """
    download_path = os.path.join(DOWNLOAD_DIR, "CBMC_original")
    # makedirs also creates missing parent directories
    os.makedirs(download_path, exist_ok=True)
    preprocessed_path = _CITEseq_CBMC_PREPROCESSED
    if not os.path.exists(preprocessed_path):
        os.makedirs(preprocessed_path)
    elif override:
        if verbose:
            print("Overriding path: %s" % preprocessed_path)
        shutil.rmtree(preprocessed_path)
        os.makedirs(preprocessed_path)
    # ******************** preprocessed data NOT found ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        # ====== download the data ====== #
        url = str(base64.decodebytes(_URL), 'utf-8')
        zip_path = os.path.join(download_path, os.path.basename(url))
        download_file(filename=zip_path,
                      url=url,
                      override=False,
                      md5=r"beb76d01a67707c61c21bfb188e1b69f")
        # ====== extract the data ====== #
        data_dict = {}
        for fname, raw in crypto.unzip_aes(zip_path,
                                           password=_PASSWORD,
                                           verbose=False):
            key = os.path.splitext(fname)[0]
            if '.npz' in fname:
                parsed = sp.sparse.load_npz(BytesIO(raw)).todense()
            elif '.csv' in fname:
                parsed = np.loadtxt(StringIO(str(raw, 'utf-8')),
                                    dtype=str,
                                    delimiter=',')
            else:
                raise RuntimeError("Unknown format: %s" % fname)
            data_dict[key] = parsed
        # ====== post-processing ====== #
        X = np.array(data_dict['X'].astype('float32'))
        X_row, X_col = data_dict['X_row'], data_dict['X_col']
        X, X_col = remove_allzeros_columns(matrix=X, colname=X_col)
        assert len(X_row) == X.shape[0] and len(X_col) == X.shape[1]
        y = data_dict['y'].astype('float32')
        y_row, y_col = data_dict['y_row'], data_dict['y_col']
        assert len(y_row) == y.shape[0] and len(y_col) == y.shape[1]
        assert np.all(X_row == y_row), \
          "Cell order mismatch between gene count and protein count"
        # save data
        if verbose:
            print(f"Saving data to {preprocessed_path} ...")
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
        # select highly variable genes on a temporary object; only the gene
        # names are kept, the stored matrices stay unfiltered
        sco = SingleCellOMIC(X, cell_id=X_row, gene_id=X_col)
        sc.pp.filter_cells(sco, min_genes=200)
        sc.pp.filter_genes(sco, min_cells=3)
        sc.pp.normalize_total(sco, target_sum=1e4)
        result = sc.pp.filter_genes_dispersion(sco.X,
                                               min_mean=0.0125,
                                               max_mean=3,
                                               min_disp=0.5,
                                               log=False,
                                               n_top_genes=2000)
        sco._inplace_subset_var(result.gene_subset)
        with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
            pickle.dump(set(sco.var_names.values), f)
        del sco
    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    sco = SingleCellOMIC(
        X=ds['X'],
        cell_id=ds['X_row'],
        gene_id=ds['X_col'],
        omic='transcriptomic',
        name=f"cbmcCITEseq{'' if filtered_genes else 'all'}",
    ).add_omic('proteomic', ds['y'], ds['y_col'])
    if filtered_genes:
        with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
            top_genes = pickle.load(f)
        sco._inplace_subset_var([i in top_genes for i in sco.var_names])
    return sco
def read_leukemia_MixedPhenotypes(filtered_genes=True,
                                  omic='rna',
                                  ignore_na=True,
                                  override=False,
                                  verbose=True) -> SingleCellOMIC:
    r""" Integrates highly multiplexed protein quantification, transcriptome
    profiling, and chromatin accessibility analysis. Using this approach, we
    establish a normal epigenetic baseline for healthy blood development,
    which we then use to deconvolve aberrant molecular features within blood
    from mixed-phenotype acute leukemia (MPAL) patients.

    scATAC-seq and CITE-seq performed on healthy bone marrow, CD34+ bone
    marrow, peripheral blood, and MPAL donors

    Arguments:
      filtered_genes : (only for `omic='rna'`) if True keep only the highly
        variable genes plus the genes in `MARKER_GENES`; the selection is
        cached in the file `top_genes` of the preprocessed folder.
      omic : 'rna' loads scRNA with the ADT (protein) counts attached,
        'atac' loads scATAC.
      ignore_na : (only for `omic='rna'`) if True drop the ADT columns whose
        maximum is NaN.
      override : if True, remove existing preprocessed data first.
      verbose : print progress information.

    Returns:
      a `SingleCellOMIC` with the cell types attached as `OMIC.celltype`.
      For RNA, `var['exonlength']` is set; for ATAC, `obs['clusters']` and
      `var['score']` are set.

    Raises:
      NotImplementedError : `omic` is neither 'rna' nor 'atac'.

    References:
      Granja JM et al., 2019. "Single-cell multiomic analysis identifies
        regulatory programs in mixed-phenotype acute leukemia".
        Nature Biotechnology.
      https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE139369
      https://github.com/GreenleafLab/MPAL-Single-Cell-2019
    """
    # fail fast, before any download or removal of cached data
    if omic not in ('rna', 'atac'):
        raise NotImplementedError(f"No support for omic type: {omic}")
    ### prepare the path
    download_dir = os.path.join(DOWNLOAD_DIR, 'mpal')
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    preprocessed_path = os.path.join(DATA_DIR, 'mpal_preprocessed')
    # only remove the folder if it exists, otherwise `override=True` on a
    # fresh machine raises FileNotFoundError
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
        if verbose:
            print(f"Override preprocessed data at {preprocessed_path}")
    if not os.path.exists(preprocessed_path):
        os.makedirs(preprocessed_path)
    ### download
    files = {}
    for name, (url, md5) in _URL.items():
        path = download_file(url=url,
                             filename=os.path.join(download_dir,
                                                   os.path.basename(url)),
                             override=False,
                             md5=md5)
        files[name] = path
    ### read the files
    if omic == 'atac':
        del files['rna']
        del files['adt']
    else:
        del files['atac']
    all_data = {}
    for name, data in MPI(jobs=list(files.items()),
                          func=partial(_read_data,
                                       verbose=verbose,
                                       preprocessed_path=preprocessed_path),
                          batch=1,
                          ncpu=4):
        all_data[name] = data.load()
    ### load scRNA and ADT
    if omic == 'rna':
        rna = all_data['rna']
        adt = all_data['adt']
        # sorted, so the cell order does not depend on set iteration order
        cell_id = sorted(
            set(rna.celldata['Barcode']) & set(adt.celldata['Barcode']))
        # align RNA rows with the shared barcodes
        barcode2ids = {j: i for i, j in enumerate(rna.celldata['Barcode'])}
        ids = [barcode2ids[i] for i in cell_id]
        X_rna = rna.X[ids].astype(np.float32)
        classification = rna.celldata['ProjectClassification'][ids].values
        # align ADT rows with the shared barcodes
        barcode2ids = {j: i for i, j in enumerate(adt.celldata['Barcode'])}
        X_adt = adt.X[[barcode2ids[i] for i in cell_id]].astype(np.float32)
        #
        if filtered_genes:
            top_genes_path = os.path.join(preprocessed_path, 'top_genes')
            if os.path.exists(top_genes_path):
                with open(top_genes_path, 'rb') as f:
                    top_genes = set(pickle.load(f))
            else:
                # Select the genes on a temporary object built from a copy:
                # the scanpy calls below filter cells and normalise counts in
                # place, which must not leak into the returned dataset or
                # break the alignment with `X_adt` and `classification`.
                tmp = SingleCellOMIC(X_rna.copy(),
                                     cell_id=cell_id,
                                     gene_id=rna.genenames,
                                     omic=OMIC.transcriptomic,
                                     name='mpalRNA')
                sc.pp.filter_cells(tmp, min_genes=200)
                sc.pp.filter_genes(tmp, min_cells=3)
                sc.pp.normalize_total(tmp, target_sum=1e4)
                result = sc.pp.filter_genes_dispersion(tmp.X,
                                                       min_mean=0.0125,
                                                       max_mean=3,
                                                       min_disp=0.5,
                                                       log=False,
                                                       n_top_genes=2000)
                # make sure all marker genes are included
                gene_subset = result.gene_subset
                gene_indices = tmp.get_var_indices()
                for gene in MARKER_GENES:
                    idx = gene_indices.get(gene, None)
                    if idx is not None:
                        gene_subset[idx] = True
                tmp._inplace_subset_var(gene_subset)
                selected = tmp.var_names.values
                with open(top_genes_path, 'wb') as f:
                    pickle.dump(selected, f)
                top_genes = set(selected)
                del tmp
            # same construction whether or not the selection was cached
            ids = [i for i, j in enumerate(rna.genenames) if j in top_genes]
            sco = SingleCellOMIC(X_rna[:, ids],
                                 cell_id=cell_id,
                                 gene_id=rna.genenames[ids],
                                 omic=OMIC.transcriptomic,
                                 name='mpalRNA')
        else:
            sco = SingleCellOMIC(X_rna,
                                 cell_id=cell_id,
                                 gene_id=rna.genenames,
                                 omic=OMIC.transcriptomic,
                                 name='mpalRNAall')
        # loading dataset
        if ignore_na:
            # the column maximum is NaN if the column contains any NaN
            ids = np.logical_not(np.isnan(np.max(X_adt, axis=0)))
            sco.add_omic(OMIC.proteomic, X_adt[:, ids], adt.genenames[ids])
        else:
            sco.add_omic(OMIC.proteomic, X_adt, adt.genenames)
        y, labels = _celltypes(classification)
        sco.add_omic(OMIC.celltype, y, labels)
        exon = {
            i: j for i, j in rna.genedata[['gene_name', 'exonLength']].values
        }
        sco.var['exonlength'] = np.array([exon[i] for i in sco.var_names],
                                         dtype=np.float32)
    ### load ATAC
    else:
        atac = all_data['atac']
        sco = SingleCellOMIC(atac.X.astype(np.float32),
                             cell_id=atac.celldata['Barcode'],
                             gene_id=atac.genenames,
                             omic=OMIC.atac,
                             name='mpalATAC')
        y, labels = _celltypes(atac.celldata['ProjectClassification'].values)
        sco.add_omic(OMIC.celltype, y, labels)
        sco.obs['clusters'] = atac.celldata['Clusters'].values
        sco.var['score'] = atac.genedata['score'].values
    return sco