def _load_annotation_simulation(
    name: str, save_path: str = "data/", run_setup_anndata: bool = True
) -> AnnData:
    """\
    Simulated datasets for scANVI tutorials.

    Parameters
    ----------
    name
        One of "1", "2", or "3".
    """
    save_path = os.path.abspath(save_path)
    url = "https://github.com/YosefLab/scVI-data/raw/master/simulation/simulation_{}.loom".format(
        name
    )
    save_fn = "simulation_{}.loom".format(name)
    _download(url, save_path, save_fn)
    adata = _load_loom(os.path.join(save_path, save_fn))
    adata.obs["labels"] = adata.obs.ClusterID.values
    del adata.obs["ClusterID"]
    adata.obs["batch"] = adata.obs.BatchID.values
    del adata.obs["BatchID"]
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata


def _load_seqfishplus(
    save_path: str = "data/",
    tissue_region: str = "subventricular cortex",
    run_setup_anndata: bool = True,
) -> anndata.AnnData:
    if tissue_region == "subventricular cortex":
        file_prefix = "cortex_svz"
    elif tissue_region == "olfactory bulb":
        file_prefix = "ob"
    else:
        raise ValueError(
            '`tissue_region` must be "subventricular cortex" or "olfactory bulb", but got {}'.format(
                tissue_region
            )
        )
    save_path = os.path.abspath(save_path)
    url = "https://github.com/CaiGroup/seqFISH-PLUS/raw/master/sourcedata.zip"
    save_fn = "seqfishplus.zip"
    _download(url, save_path, save_fn)
    adata = _load_seqfishplus_data(
        os.path.join(save_path, save_fn), file_prefix, save_path, gene_by_cell=False
    )
    adata.obs["batch"] = np.zeros(adata.shape[0], dtype=np.int64)
    adata.obs["labels"] = np.zeros(adata.shape[0], dtype=np.int64)
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata


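# A minimal usage sketch (a hypothetical helper, not part of the original
# module): both accepted `tissue_region` values select a different file
# prefix ("cortex_svz" vs. "ob") inside the same seqFISH+ source archive,
# so both regions come from a single download.
def _example_seqfishplus_regions():
    cortex = _load_seqfishplus(tissue_region="subventricular cortex")
    bulb = _load_seqfishplus(tissue_region="olfactory bulb")
    try:
        _load_seqfishplus(tissue_region="hippocampus")  # not offered
    except ValueError as err:
        print(err)
    return cortex, bulb

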
def _load_cortex(
    save_path: str = "data/", run_setup_anndata: bool = True
) -> anndata.AnnData:
    """Loads cortex dataset."""
    save_path = os.path.abspath(save_path)
    url = "https://storage.googleapis.com/linnarsson-lab-www-blobs/blobs/cortex/expression_mRNA_17-Aug-2014.txt"
    save_fn = "expression.bin"
    _download(url, save_path, save_fn)
    adata = _load_cortex_txt(os.path.join(save_path, save_fn))
    if run_setup_anndata:
        setup_anndata(adata, labels_key="labels")
    return adata


def _load_seqfish(
    save_path: str = "data/", run_setup_anndata: bool = True
) -> anndata.AnnData:
    save_path = os.path.abspath(save_path)
    url = "https://www.cell.com/cms/attachment/2080562255/2072099886/mmc6.xlsx"
    save_fn = "SeqFISH.xlsx"
    _download(url, save_path, save_fn)
    adata = _load_seqfish_data(os.path.join(save_path, save_fn))
    adata.obs["batch"] = np.zeros(adata.shape[0], dtype=np.int64)
    adata.obs["labels"] = np.zeros(adata.shape[0], dtype=np.int64)
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata


def _load_smfish(
    save_path: str = "data/",
    use_high_level_cluster: bool = True,
    run_setup_anndata: bool = True,
) -> anndata.AnnData:
    save_path = os.path.abspath(save_path)
    url = "http://linnarssonlab.org/osmFISH/osmFISH_SScortex_mouse_all_cells.loom"
    save_fn = "osmFISH_SScortex_mouse_all_cell.loom"
    _download(url, save_path, save_fn)
    adata = _load_smfish_data(
        os.path.join(save_path, save_fn), use_high_level_cluster=use_high_level_cluster
    )
    adata.obs["batch"] = np.zeros(adata.shape[0], dtype=np.int64)
    if run_setup_anndata:
        setup_anndata(adata, labels_key="labels", batch_key="batch")
    return adata


def _load_mouse_ob_dataset(save_path: str = "data/", run_setup_anndata: bool = True):
    save_path = os.path.abspath(save_path)
    url = "http://www.spatialtranscriptomicsresearch.org/wp-content/uploads/2016/07/Rep11_MOB_count_matrix-1.tsv"
    save_fn = "Rep11_MOB_count_matrix-1.tsv"
    _download(url, save_path, save_fn)
    adata = _load_csv(
        os.path.join(save_path, save_fn), delimiter="\t", gene_by_cell=False
    )
    adata.obs["batch"] = np.zeros(adata.shape[0]).astype(int)
    adata.obs["labels"] = np.zeros(adata.shape[0]).astype(int)
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata


def _load_frontalcortex_dropseq(
    save_path: str = "data/", run_setup_anndata: bool = True
) -> AnnData:
    save_path = os.path.abspath(save_path)
    url = "https://github.com/YosefLab/scVI-data/raw/master/fc-dropseq.loom"
    save_fn = "fc-dropseq.loom"
    _download(url, save_path, save_fn)
    adata = _load_loom(os.path.join(save_path, save_fn))
    adata.obs["batch"] = adata.obs["Clusters"]
    del adata.obs["Clusters"]
    adata.obs["labels"] = np.zeros(adata.shape[0], dtype=np.int64)

    # reorder labels such that layers of the cortex are in order
    # order_labels = [5, 6, 3, 2, 4, 0, 1, 8, 7, 9, 10, 11, 12, 13]
    # self.reorder_cell_types(self.cell_types[order_labels])

    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata


def _load_retina(save_path: str = "data/", run_setup_anndata: bool = True) -> AnnData:
    """\
    Loads retina dataset.

    After the authors' original filtering pipeline, the dataset of bipolar
    cells contains 27,499 cells and 13,166 genes coming from two batches.
    We use the authors' cluster annotations (15 cell types). We also extract
    their ComBat-normalized data and use it for benchmarking.
    """
    save_path = os.path.abspath(save_path)
    url = "https://github.com/YosefLab/scVI-data/raw/master/retina.loom"
    save_fn = "retina.loom"
    _download(url, save_path, save_fn)
    adata = _load_loom(os.path.join(save_path, save_fn))
    cell_types = [
        "RBC",
        "MG",
        "BC5A",
        "BC7",
        "BC6",
        "BC5C",
        "BC1A",
        "BC3B",
        "BC1B",
        "BC2",
        "BC5D",
        "BC3A",
        "BC5B",
        "BC4",
        "BC8_9",
    ]
    adata.obs["labels"] = [
        cell_types[i] for i in adata.obs["ClusterID"].values.astype(int).ravel()
    ]
    del adata.obs["ClusterID"]
    adata.obs["batch"] = pd.Categorical(adata.obs["BatchID"].values.copy())
    del adata.obs["BatchID"]
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata


def _load_prefrontalcortex_starmap(
    save_path: str = "data/", run_setup_anndata: bool = True
) -> AnnData:
    """\
    Loads a STARmap dataset of 3,704 cells and 166 genes from the mouse
    prefrontal cortex (Wang et al., 2018).
    """
    save_path = os.path.abspath(save_path)
    url = "https://github.com/YosefLab/scVI-data/raw/master/mpfc-starmap.loom"
    save_fn = "mpfc-starmap.loom"
    _download(url, save_path, save_fn)
    adata = _load_loom(os.path.join(save_path, save_fn))
    adata.obs["labels"] = adata.obs.Clusters.values
    del adata.obs["Clusters"]
    adata.obs["batch"] = adata.obs.BatchID.values
    del adata.obs["BatchID"]
    adata.obs["x_coord"] = adata.obsm["Spatial_coordinates"][:, 0]
    adata.obs["y_coord"] = adata.obsm["Spatial_coordinates"][:, 1]
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata


def _load_brainlarge_dataset(
    save_path: str = "data/",
    run_setup_anndata: bool = True,
    sample_size_gene_var: int = 10000,
    max_cells_to_keep: int = None,
    n_genes_to_keep: int = 720,
    loading_batch_size: int = 100000,
) -> anndata.AnnData:
    """Loads brain-large dataset."""
    url = "http://cf.10xgenomics.com/samples/cell-exp/1.3.0/1M_neurons/1M_neurons_filtered_gene_bc_matrices_h5.h5"
    save_fn = "brain_large.h5"
    _download(url, save_path, save_fn)
    adata = _load_brainlarge_file(
        os.path.join(save_path, save_fn),
        sample_size_gene_var=sample_size_gene_var,
        max_cells_to_keep=max_cells_to_keep,
        n_genes_to_keep=n_genes_to_keep,
        loading_batch_size=loading_batch_size,
    )
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata


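# A minimal usage sketch (a hypothetical helper, not part of the original
# module). Judging by the parameter names, _load_brainlarge_file streams the
# 1.3M-cell 10x matrix in chunks of `loading_batch_size` cells, estimates
# gene variance on `sample_size_gene_var` cells to pick the `n_genes_to_keep`
# most variable genes, and `max_cells_to_keep` caps the cells loaded; that
# reading of the internals is an assumption here.
def _example_brainlarge_subsample():
    adata = _load_brainlarge_dataset(
        save_path="data/",
        max_cells_to_keep=100000,  # cap memory use for quick experiments
        n_genes_to_keep=720,
        loading_batch_size=100000,
    )
    print(adata.shape)  # at most (100000, 720)
    return adata

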
def _load_purified_pbmc_dataset(
    save_path: str = "data/",
    subset_datasets: List[str] = None,
    run_setup_anndata: bool = True,
) -> anndata.AnnData:
    url = "https://github.com/YosefLab/scVI-data/raw/master/PurifiedPBMCDataset.h5ad"
    save_fn = "PurifiedPBMCDataset.h5ad"
    _download(url, save_path, save_fn)
    path_to_file = os.path.join(save_path, save_fn)
    adata = anndata.read(path_to_file)
    dataset_names = [
        "cd4_t_helper",
        "regulatory_t",
        "naive_t",
        "memory_t",
        "cytotoxic_t",
        "naive_cytotoxic",
        "b_cells",
        "cd34",
        "cd56_nk",
        "cd14_monocytes",
    ]
    if subset_datasets is not None:
        row_indices = []
        for dataset in subset_datasets:
            assert dataset in dataset_names
            idx = np.where(adata.obs["cell_types"] == dataset)[0]
            row_indices.append(idx)
        row_indices = np.concatenate(row_indices)
        adata = adata[row_indices].copy()
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata


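# A minimal usage sketch (a hypothetical helper, not part of the original
# module): `subset_datasets` restricts the purified-PBMC AnnData to the named
# sorted populations; each name must appear in the `dataset_names` list above,
# otherwise the assertion in the loader fails.
def _example_purified_pbmc_subset():
    adata = _load_purified_pbmc_dataset(
        subset_datasets=["b_cells", "cd14_monocytes"],
        run_setup_anndata=False,
    )
    # only the requested populations remain
    print(adata.obs["cell_types"].unique())
    return adata

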
def _load_pbmcs_10x_cite_seq(
    save_path: str = "data/",
    protein_join: str = "inner",
    run_setup_anndata: bool = True,
):
    """\
    Filtered PBMCs from 10x Genomics profiled with RNA and protein.

    Datasets were filtered for doublets and other outliers as in
    https://github.com/YosefLab/totalVI_reproducibility/blob/master/data/data_filtering_scripts/pbmc_10k/pbmc_10k.py

    Parameters
    ----------
    protein_join
        Whether to take an inner join or outer join of proteins.

    Returns
    -------
    `AnnData` with `.obsm["protein_expression"]`.

    Missing protein values are zero, and are identified during `AnnData` setup.
    """
    url = "https://github.com/YosefLab/scVI-data/raw/master/pbmc_10k_protein_v3.h5ad?raw=true"
    save_fn = "pbmc_10k_protein_v3.h5ad"
    _download(url, save_path, save_fn)
    dataset1 = anndata.read_h5ad(os.path.join(save_path, save_fn))
    url = "https://github.com/YosefLab/scVI-data/raw/master/pbmc_5k_protein_v3.h5ad?raw=true"
    save_fn = "pbmc_5k_protein_v3.h5ad"
    _download(url, save_path, save_fn)
    dataset2 = anndata.read_h5ad(os.path.join(save_path, save_fn))
    common_genes = dataset1.var_names.intersection(dataset2.var_names)
    dataset1 = dataset1[:, common_genes]
    dataset2 = dataset2[:, common_genes]
    dataset1.obsm["protein_expression"] = pd.DataFrame(
        dataset1.obsm["protein_expression"],
        columns=dataset1.uns["protein_names"],
        index=dataset1.obs_names,
    )
    dataset2.obsm["protein_expression"] = pd.DataFrame(
        dataset2.obsm["protein_expression"],
        columns=dataset2.uns["protein_names"],
        index=dataset2.obs_names,
    )
    del dataset1.uns["protein_names"]
    del dataset2.uns["protein_names"]
    dataset = dataset1.concatenate(dataset2, join=protein_join)
    dataset.obsm["protein_expression"] = dataset.obsm["protein_expression"].fillna(0)
    dataset.obs["labels"] = np.zeros(dataset.shape[0], dtype=np.int64)
    dataset.obs["batch"] = dataset.obs["batch"].astype(np.int64)
    if run_setup_anndata:
        setup_anndata(
            dataset,
            batch_key="batch",
            labels_key="labels",
            protein_expression_obsm_key="protein_expression",
        )
    return dataset


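# A minimal usage sketch (a hypothetical helper, not part of the original
# module): with protein_join="inner" only proteins measured in both the 10k
# and 5k datasets are kept, while "outer" keeps the union and the loader
# zero-fills the values for proteins missing from one dataset.
def _example_cite_seq_protein_join():
    inner = _load_pbmcs_10x_cite_seq(protein_join="inner", run_setup_anndata=False)
    outer = _load_pbmcs_10x_cite_seq(protein_join="outer", run_setup_anndata=False)
    # the outer join can only add protein columns relative to the inner join
    assert (
        outer.obsm["protein_expression"].shape[1]
        >= inner.obsm["protein_expression"].shape[1]
    )
    return inner, outer

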
def _load_pbmc_dataset(
    save_path: str = "data/",
    run_setup_anndata: bool = True,
    remove_extracted_data: bool = True,
) -> anndata.AnnData:
    urls = [
        "https://github.com/YosefLab/scVI-data/raw/master/gene_info.csv",
        "https://github.com/YosefLab/scVI-data/raw/master/pbmc_metadata.pickle",
    ]
    save_fns = ["gene_info_pbmc.csv", "pbmc_metadata.pickle"]

    for i in range(len(urls)):
        _download(urls[i], save_path, save_fns[i])

    de_metadata = pd.read_csv(os.path.join(save_path, "gene_info_pbmc.csv"), sep=",")
    pbmc_metadata = pickle.load(
        open(os.path.join(save_path, "pbmc_metadata.pickle"), "rb")
    )
    pbmc8k = _load_dataset_10x(
        "pbmc8k",
        save_path=save_path,
        var_names="gene_ids",
        remove_extracted_data=remove_extracted_data,
    )
    pbmc4k = _load_dataset_10x(
        "pbmc4k",
        save_path=save_path,
        var_names="gene_ids",
        remove_extracted_data=remove_extracted_data,
    )
    barcodes = np.concatenate((pbmc8k.obs_names, pbmc4k.obs_names))

    adata = pbmc8k.concatenate(pbmc4k)
    adata.obs_names = barcodes

    dict_barcodes = dict(zip(barcodes, np.arange(len(barcodes))))
    subset_cells = []
    barcodes_metadata = pbmc_metadata["barcodes"].index.values.ravel().astype(str)
    for barcode in barcodes_metadata:
        if barcode in dict_barcodes:
            # barcodes ending in -11 are filtered on the 10X website (49 cells)
            subset_cells += [dict_barcodes[barcode]]
    adata = adata[np.asarray(subset_cells), :].copy()
    idx_metadata = np.asarray(
        [not barcode.endswith("11") for barcode in barcodes_metadata], dtype=bool
    )
    # only keep the genes for which we have DE data
    genes_to_keep = list(de_metadata["ENSG"].values)
    # non-empty only for unit tests
    difference = list(set(genes_to_keep).difference(set(adata.var_names)))
    for gene in difference:
        genes_to_keep.remove(gene)

    adata = adata[:, genes_to_keep].copy()
    design = pbmc_metadata["design"][idx_metadata]
    raw_qc = pbmc_metadata["raw_qc"][idx_metadata]
    normalized_qc = pbmc_metadata["normalized_qc"][idx_metadata]
    design.index = adata.obs_names
    raw_qc.index = adata.obs_names
    normalized_qc.index = adata.obs_names
    adata.obs["batch"] = adata.obs["batch"].astype(np.int64)
    adata.obsm["design"] = design
    adata.obsm["raw_qc"] = raw_qc
    adata.obsm["normalized_qc"] = normalized_qc
    adata.obsm["qc_pc"] = pbmc_metadata["qc_pc"][idx_metadata]

    labels = pbmc_metadata["clusters"][idx_metadata]
    cell_types = pbmc_metadata["list_clusters"]
    adata.obs["labels"] = labels
    adata.uns["cell_types"] = cell_types
    adata.obs["str_labels"] = [cell_types[i] for i in labels]

    adata.var["n_counts"] = np.squeeze(np.asarray(np.sum(adata.X, axis=0)))
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata


def _load_dataset_10x(
    dataset_name: str = None,
    filename: str = None,
    save_path: str = "data/10X",
    url: str = None,
    return_filtered: bool = True,
    remove_extracted_data: bool = False,
    **scanpy_read_10x_kwargs,
):
    try:
        import scanpy
    except ImportError:
        raise ImportError("Please install scanpy -- `pip install scanpy`")

    # form data url and filename unless manual override
    if dataset_name is not None:
        if url is not None:
            logger.warning("dataset_name provided, manual url is disregarded.")
        if filename is not None:
            logger.warning("dataset_name provided, manual filename is disregarded.")
        group = dataset_to_group[dataset_name]
        url_skeleton = group_to_url_skeleton[group]
        filter_type = "filtered" if return_filtered else "raw"
        url = url_skeleton.format(group, dataset_name, dataset_name, filter_type)
        filename_skeleton = group_to_filename_skeleton[group]
        filename = filename_skeleton.format(filter_type)
        save_path = os.path.join(save_path, dataset_name)
    elif filename is not None and url is not None:
        logger.info("Loading 10X dataset with custom url and filename")
    elif filename is not None and url is None:
        logger.info("Loading local 10X dataset with custom filename")
    else:
        logger.info("Loading extracted local 10X dataset with custom filename")
    _download(url, save_path=save_path, filename=filename)
    file_path = os.path.join(save_path, filename)

    # untar (guard against url being None for local datasets)
    download_is_targz = url is not None and url[-7:] == ".tar.gz"
    was_extracted = False
    if download_is_targz:
        if not os.path.exists(file_path[:-7]):  # nothing extracted yet
            if tarfile.is_tarfile(file_path):
                logger.info("Extracting tar file")
                tar = tarfile.open(file_path, "r:gz")
                tar.extractall(path=save_path)
                was_extracted = True
                tar.close()
        path_to_data_folder, suffix = _find_path_to_mtx(save_path)
        adata = scanpy.read_10x_mtx(path_to_data_folder, **scanpy_read_10x_kwargs)
        if was_extracted and remove_extracted_data:
            folders_in_save_path = path_to_data_folder[len(save_path) + 1 :].split("/")
            extracted_folder_path = save_path + "/" + folders_in_save_path[0]
            logger.info("Removing extracted data at {}".format(extracted_folder_path))
            shutil.rmtree(extracted_folder_path)
    else:
        adata = scanpy.read_10x_h5(file_path, **scanpy_read_10x_kwargs)

    adata.var_names_make_unique()
    scanpy.pp.filter_cells(adata, min_counts=1)
    scanpy.pp.filter_genes(adata, min_counts=1)

    return adata


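# A minimal usage sketch (a hypothetical helper with illustrative arguments,
# not part of the original module): the supported ways to point the loader at
# data. When `dataset_name` is given it wins, and any manual `url`/`filename`
# is disregarded with a warning; extra keyword arguments are forwarded to
# scanpy's read_10x_mtx/read_10x_h5. The custom URL below is illustrative.
def _example_load_dataset_10x_modes():
    # 1) a registered dataset name: url and filename are derived from the
    #    group_to_url_skeleton / group_to_filename_skeleton tables
    pbmc = _load_dataset_10x("pbmc4k", save_path="data/10X")
    # 2) a custom url plus the filename to save it under (tar.gz archives
    #    are extracted and read with scanpy.read_10x_mtx)
    custom = _load_dataset_10x(
        url="http://cf.10xgenomics.com/samples/cell-exp/2.1.0/pbmc4k/pbmc4k_filtered_gene_bc_matrices.tar.gz",
        filename="pbmc4k_filtered_gene_bc_matrices.tar.gz",
    )
    # 3) a 10x h5 file that already exists locally under save_path
    local = _load_dataset_10x(filename="brain_large.h5", save_path="data/")
    return pbmc, custom, local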