def _load_mouse_ob_dataset(save_path: str = "data/", run_setup_anndata: bool = True):
    """Load the Rep11 mouse olfactory bulb spatial count matrix.

    Downloads the TSV count matrix if needed, loads it as an AnnData
    (cells as rows), and adds constant ``batch``/``labels`` obs columns.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    run_setup_anndata
        If true, runs setup_anndata() on dataset before returning.
    """
    save_path = os.path.abspath(save_path)
    data_url = "http://www.spatialtranscriptomicsresearch.org/wp-content/uploads/2016/07/Rep11_MOB_count_matrix-1.tsv"
    local_fn = "Rep11_MOB_count_matrix-1.tsv"
    _download(data_url, save_path, local_fn)
    adata = _load_csv(
        os.path.join(save_path, local_fn), delimiter="\t", gene_by_cell=False
    )
    n_cells = adata.shape[0]
    # single dummy batch / label so downstream setup has the expected keys
    adata.obs["batch"] = np.zeros(n_cells).astype(int)
    adata.obs["labels"] = np.zeros(n_cells).astype(int)
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
def _load_smfish(
    save_path: str = "data/",
    use_high_level_cluster: bool = True,
    run_setup_anndata: bool = True,
) -> anndata.AnnData:
    """Load the osmFISH mouse somatosensory cortex dataset.

    Downloads the loom file if needed, parses it via ``_load_smfish_data``,
    and adds a constant ``batch`` obs column.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    use_high_level_cluster
        Forwarded to ``_load_smfish_data``.
    run_setup_anndata
        If true, runs setup_anndata() on dataset before returning.
    """
    save_path = os.path.abspath(save_path)
    loom_url = "http://linnarssonlab.org/osmFISH/osmFISH_SScortex_mouse_all_cells.loom"
    # NOTE(review): the local filename drops the trailing "s" of the remote
    # name — kept as-is so existing caches stay valid.
    local_fn = "osmFISH_SScortex_mouse_all_cell.loom"
    _download(loom_url, save_path, local_fn)
    adata = _load_smfish_data(
        os.path.join(save_path, local_fn),
        use_high_level_cluster=use_high_level_cluster,
    )
    adata.obs["batch"] = np.zeros(adata.shape[0], dtype=np.int64)
    if run_setup_anndata:
        _setup_anndata(adata, labels_key="labels", batch_key="batch")
    return adata
def _load_heart_cell_atlas_subsampled(
    save_path: str = "data/",
    run_setup_anndata: bool = True,
    remove_nuisance_clusters: bool = True,
):
    """
    Combined single cell and single nuclei RNA-Seq data of 485K cardiac cells with annotations.

    Dataset was filtered down randomly to 20k cells using :func:`~scanpy.pp.subsample`.
    The original data can be sourced from https://www.heartcellatlas.org/#DataSources.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    run_setup_anndata
        If true, runs setup_anndata() on dataset before returning
    remove_nuisance_clusters
        Remove doublets and unassigned cells

    Returns
    -------
    AnnData

    Notes
    -----
    The data were filtered using the following sequence::

        >>> adata = anndata.read_h5ad(path_to_anndata)
        >>> bdata = sc.pp.subsample(adata, n_obs=20000, copy=True)
        >>> sc.pp.filter_genes(bdata, min_counts=3)
        >>> bdata.write_h5ad(path, compression="gzip")
    """
    # normalize the target directory, consistent with the other loaders
    save_path = os.path.abspath(save_path)
    url = "https://github.com/YosefLab/scVI-data/blob/master/hca_subsampled_20k.h5ad?raw=true"
    save_fn = "hca_subsampled_20k.h5ad"
    _download(url, save_path, save_fn)
    dataset = anndata.read_h5ad(os.path.join(save_path, save_fn))
    if remove_nuisance_clusters:
        # drop cells annotated as doublets or left unassigned
        remove = ["doublets", "NotAssigned"]
        keep = [c not in remove for c in dataset.obs.cell_type.values]
        dataset = dataset[keep, :].copy()
    if run_setup_anndata:
        setup_anndata(dataset)
    return dataset
def _load_frontalcortex_dropseq(
    save_path: str = "data/", run_setup_anndata: bool = True
) -> AnnData:
    """Load the frontal-cortex Drop-seq loom dataset.

    The loom ``Clusters`` annotation is repurposed as the ``batch`` obs
    column and a constant zero ``labels`` column is added.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    run_setup_anndata
        If true, runs setup_anndata() on dataset before returning.
    """
    save_path = os.path.abspath(save_path)
    loom_url = "https://github.com/YosefLab/scVI-data/raw/master/fc-dropseq.loom"
    loom_fn = "fc-dropseq.loom"
    _download(loom_url, save_path, loom_fn)
    adata = _load_loom(os.path.join(save_path, loom_fn))
    adata.obs["batch"] = adata.obs["Clusters"]
    del adata.obs["Clusters"]
    adata.obs["labels"] = np.zeros(adata.shape[0], dtype=np.int64)
    # NOTE: the legacy implementation reordered cell types so cortical
    # layers were in order ([5, 6, 3, 2, 4, 0, 1, 8, 7, 9, 10, 11, 12, 13]);
    # that step is intentionally not performed here.
    if run_setup_anndata:
        _setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
def _load_retina(save_path: str = "data/", run_setup_anndata: bool = True) -> AnnData:
    """Load the retina bipolar-cell dataset.

    After the authors' original filtering pipeline the dataset contains
    27,499 cells and 13,166 genes over two batches, annotated with 15
    cell types. Their Combat-normalized data is used for benchmarking.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    run_setup_anndata
        If true, runs setup_anndata() on dataset before returning.
    """
    save_path = os.path.abspath(save_path)
    loom_url = "https://github.com/YosefLab/scVI-data/raw/master/retina.loom"
    loom_fn = "retina.loom"
    _download(loom_url, save_path, loom_fn)
    adata = _load_loom(os.path.join(save_path, loom_fn))
    # cluster id (0-based) -> author cell-type name
    cluster_names = [
        "RBC", "MG", "BC5A", "BC7", "BC6", "BC5C", "BC1A", "BC3B",
        "BC1B", "BC2", "BC5D", "BC3A", "BC5B", "BC4", "BC8_9",
    ]
    cluster_ids = adata.obs["ClusterID"].values.astype(int).ravel()
    adata.obs["labels"] = [cluster_names[i] for i in cluster_ids]
    del adata.obs["ClusterID"]
    adata.obs["batch"] = pd.Categorical(adata.obs["BatchID"].values.copy())
    del adata.obs["BatchID"]
    if run_setup_anndata:
        _setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
def _load_prefrontalcortex_starmap(save_path: str = "data/") -> AnnData:
    """Load the starMAP mouse pre-frontal cortex dataset.

    3,704 cells and 166 genes (Wang et al., 2018). The loom cluster and
    batch annotations are moved into ``labels``/``batch`` obs columns and
    the spatial coordinates are copied into ``x_coord``/``y_coord``.
    """
    save_path = os.path.abspath(save_path)
    loom_url = "https://github.com/YosefLab/scVI-data/raw/master/mpfc-starmap.loom"
    loom_fn = "mpfc-starmap.loom"
    _download(loom_url, save_path, loom_fn)
    adata = _load_loom(os.path.join(save_path, loom_fn))
    adata.obs["labels"] = adata.obs.Clusters.values
    del adata.obs["Clusters"]
    adata.obs["batch"] = adata.obs.BatchID.values
    del adata.obs["BatchID"]
    coords = adata.obsm["Spatial_coordinates"]
    adata.obs["x_coord"] = coords[:, 0]
    adata.obs["y_coord"] = coords[:, 1]
    return adata
def _load_brainlarge_dataset(
    save_path: str = "data/",
    sample_size_gene_var: int = 10000,
    max_cells_to_keep: int = None,
    n_genes_to_keep: int = 720,
    loading_batch_size: int = 100000,
) -> anndata.AnnData:
    """Load the 10x "brain large" (1.3M neurons) dataset.

    The h5 file is downloaded if needed and parsed in batches by
    ``_load_brainlarge_file`` with the given subsampling parameters.
    """
    h5_url = "http://cf.10xgenomics.com/samples/cell-exp/1.3.0/1M_neurons/1M_neurons_filtered_gene_bc_matrices_h5.h5"
    local_fn = "brain_large.h5"
    _download(h5_url, save_path, local_fn)
    return _load_brainlarge_file(
        os.path.join(save_path, local_fn),
        sample_size_gene_var=sample_size_gene_var,
        max_cells_to_keep=max_cells_to_keep,
        n_genes_to_keep=n_genes_to_keep,
        loading_batch_size=loading_batch_size,
    )
def _load_annotation_simulation(name: str, save_path: str = "data/") -> AnnData:
    """Load a simulated dataset used in the scANVI tutorials.

    Parameters
    ----------
    name
        One of "1", "2", or "3".
    save_path
        Location to use when saving/loading the data.
    """
    save_path = os.path.abspath(save_path)
    loom_fn = "simulation_{}.loom".format(name)
    loom_url = "https://github.com/YosefLab/scVI-data/raw/master/simulation/" + loom_fn
    _download(loom_url, save_path, loom_fn)
    adata = _load_loom(os.path.join(save_path, loom_fn))
    adata.obs["labels"] = adata.obs.ClusterID.values
    del adata.obs["ClusterID"]
    adata.obs["batch"] = adata.obs.BatchID.values
    del adata.obs["BatchID"]
    return adata
def _load_purified_pbmc_dataset(
    save_path: str = "data/",
    subset_datasets: List[str] = None,
    run_setup_anndata: bool = True,
) -> anndata.AnnData:
    """Load the purified PBMC dataset, optionally subset to named populations.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    subset_datasets
        Optional list of purified-population names to keep; each must be
        one of the names in ``dataset_names`` below.
    run_setup_anndata
        If true, runs setup_anndata() on dataset before returning.

    Raises
    ------
    ValueError
        If a name in ``subset_datasets`` is not a known population.
    """
    url = "https://github.com/YosefLab/scVI-data/raw/master/PurifiedPBMCDataset.h5ad"
    save_fn = "PurifiedPBMCDataset.h5ad"
    _download(url, save_path, save_fn)
    path_to_file = os.path.join(save_path, save_fn)
    adata = anndata.read(path_to_file)
    # valid population names ("cd4_t_helper" was duplicated previously)
    dataset_names = [
        "cd4_t_helper",
        "regulatory_t",
        "naive_t",
        "memory_t",
        "cytotoxic_t",
        "naive_cytotoxic",
        "b_cells",
        "cd34",
        "cd56_nk",
        "cd14_monocytes",
    ]
    if subset_datasets is not None:
        row_indices = []
        for dataset in subset_datasets:
            # explicit error instead of `assert`, which is stripped under -O
            if dataset not in dataset_names:
                raise ValueError(
                    "{} is not a valid purified PBMC dataset name".format(dataset)
                )
            idx = np.where(adata.obs["cell_types"] == dataset)[0]
            row_indices.append(idx)
        row_indices = np.concatenate(row_indices)
        adata = adata[row_indices].copy()
    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
def _load_pbmc_dataset(
    save_path: str = "data/",
    remove_extracted_data: bool = True,
) -> anndata.AnnData:
    """Load the combined pbmc8k + pbmc4k 10x dataset with DE metadata.

    Downloads the gene info / metadata files and the two 10x runs,
    concatenates them, subsets to the cells and genes present in the
    metadata, and attaches QC / design matrices and cluster labels.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    remove_extracted_data
        Forwarded to ``_load_dataset_10x``; removes extracted tar contents.
    """
    urls = [
        "https://github.com/YosefLab/scVI-data/raw/master/gene_info.csv",
        "https://github.com/YosefLab/scVI-data/raw/master/pbmc_metadata.pickle",
    ]
    save_fns = ["gene_info_pbmc.csv", "pbmc_metadata.pickle"]
    for url, fn in zip(urls, save_fns):
        _download(url, save_path, fn)
    de_metadata = pd.read_csv(os.path.join(save_path, "gene_info_pbmc.csv"), sep=",")
    pbmc_metadata = pd.read_pickle(os.path.join(save_path, "pbmc_metadata.pickle"))
    pbmc8k = _load_dataset_10x(
        "pbmc8k",
        save_path=save_path,
        var_names="gene_ids",
        remove_extracted_data=remove_extracted_data,
    )
    pbmc4k = _load_dataset_10x(
        "pbmc4k",
        save_path=save_path,
        var_names="gene_ids",
        remove_extracted_data=remove_extracted_data,
    )
    barcodes = np.concatenate((pbmc8k.obs_names, pbmc4k.obs_names))
    adata = pbmc8k.concatenate(pbmc4k)
    adata.obs_names = barcodes

    dict_barcodes = dict(zip(barcodes, np.arange(len(barcodes))))
    subset_cells = []
    # np.str was removed in NumPy 1.24; the builtin str is the equivalent dtype
    barcodes_metadata = pbmc_metadata["barcodes"].index.values.ravel().astype(str)
    for barcode in barcodes_metadata:
        if (
            barcode in dict_barcodes
        ):  # barcodes with end -11 filtered on 10X website (49 cells)
            subset_cells += [dict_barcodes[barcode]]
    adata = adata[np.asarray(subset_cells), :].copy()
    # np.bool was removed in NumPy 1.24; use the builtin bool dtype
    idx_metadata = np.asarray(
        [not barcode.endswith("11") for barcode in barcodes_metadata], dtype=bool
    )
    genes_to_keep = list(
        de_metadata["ENSG"].values
    )  # only keep the genes for which we have de data
    difference = list(
        set(genes_to_keep).difference(set(adata.var_names))
    )  # Non empty only for unit tests
    for gene in difference:
        genes_to_keep.remove(gene)
    adata = adata[:, genes_to_keep].copy()
    design = pbmc_metadata["design"][idx_metadata]
    raw_qc = pbmc_metadata["raw_qc"][idx_metadata]
    normalized_qc = pbmc_metadata["normalized_qc"][idx_metadata]
    design.index = adata.obs_names
    raw_qc.index = adata.obs_names
    normalized_qc.index = adata.obs_names
    # `batch` is created by AnnData.concatenate above; normalize its dtype
    adata.obs["batch"] = adata.obs["batch"].astype(np.int64)
    adata.obsm["design"] = design
    adata.obsm["raw_qc"] = raw_qc
    adata.obsm["normalized_qc"] = normalized_qc
    adata.obsm["qc_pc"] = pbmc_metadata["qc_pc"][idx_metadata]
    labels = pbmc_metadata["clusters"][idx_metadata]
    cell_types = pbmc_metadata["list_clusters"]
    adata.obs["labels"] = labels
    adata.uns["cell_types"] = cell_types
    adata.obs["str_labels"] = [cell_types[i] for i in labels]
    adata.var["n_counts"] = np.squeeze(np.asarray(np.sum(adata.X, axis=0)))
    return adata
def _load_dataset_10x(
    dataset_name: str = None,
    filename: str = None,
    save_path: str = "data/10X",
    url: str = None,
    return_filtered: bool = True,
    remove_extracted_data: bool = False,
    **scanpy_read_10x_kwargs,
):
    """Download (if needed) and load a 10x Genomics dataset via scanpy.

    Either a known ``dataset_name`` (resolved through the module's
    group/url skeleton tables) or a manual ``url``/``filename`` pair may
    be given. ``.tar.gz`` payloads are extracted and read as mtx;
    anything else is read as h5. Cells/genes with zero counts are
    filtered out before returning.

    Parameters
    ----------
    dataset_name
        Name of a known 10x dataset; overrides ``url`` and ``filename``.
    filename
        Local filename (custom datasets).
    save_path
        Directory to use when saving/loading the data.
    url
        Manual download url (custom datasets).
    return_filtered
        Use the "filtered" (vs "raw") matrix of a known dataset.
    remove_extracted_data
        Remove the extracted tar contents after loading.
    **scanpy_read_10x_kwargs
        Forwarded to ``scanpy.read_10x_mtx`` / ``scanpy.read_10x_h5``.
    """
    try:
        import scanpy
    except ImportError:
        raise ImportError("Please install scanpy -- `pip install scanpy`")

    # form data url and filename unless manual override
    if dataset_name is not None:
        if url is not None:
            logger.warning("dataset_name provided, manual url is disregarded.")
        if filename is not None:
            logger.warning("dataset_name provided, manual filename is disregarded.")
        group = dataset_to_group[dataset_name]
        url_skeleton = group_to_url_skeleton[group]
        filter_type = "filtered" if return_filtered else "raw"
        url = url_skeleton.format(group, dataset_name, dataset_name, filter_type)
        filename_skeleton = group_to_filename_skeleton[group]
        filename = filename_skeleton.format(filter_type)
        save_path = os.path.join(save_path, dataset_name)
    elif filename is not None and url is not None:
        logger.info("Loading 10X dataset with custom url and filename")
    elif filename is not None and url is None:
        logger.info("Loading local 10X dataset with custom filename")
    else:
        logger.info("Loading extracted local 10X dataset with custom filename")
    _download(url, save_path=save_path, filename=filename)
    file_path = os.path.join(save_path, filename)

    # untar if needed; `url` may be None for local datasets, so guard the
    # suffix check (previously `url[-7:]` raised TypeError on None)
    download_is_targz = url is not None and url.endswith(".tar.gz")
    was_extracted = False
    if download_is_targz:
        if not os.path.exists(file_path[:-7]):  # nothing extracted yet
            if tarfile.is_tarfile(file_path):
                logger.info("Extracting tar file")
                # context manager closes the archive even if extractall raises
                with tarfile.open(file_path, "r:gz") as tar:
                    tar.extractall(path=save_path)
                    was_extracted = True
        path_to_data_folder, suffix = _find_path_to_mtx(save_path)
        adata = scanpy.read_10x_mtx(path_to_data_folder, **scanpy_read_10x_kwargs)
        if was_extracted and remove_extracted_data:
            folders_in_save_path = path_to_data_folder[len(save_path) + 1 :].split("/")
            extracted_folder_path = save_path + "/" + folders_in_save_path[0]
            logger.info("Removing extracted data at {}".format(extracted_folder_path))
            shutil.rmtree(extracted_folder_path)
    else:
        adata = scanpy.read_10x_h5(file_path, **scanpy_read_10x_kwargs)
    adata.var_names_make_unique()
    scanpy.pp.filter_cells(adata, min_counts=1)
    scanpy.pp.filter_genes(adata, min_counts=1)
    return adata
def _load_pbmcs_10x_cite_seq(
    save_path: str = "data/",
    protein_join: str = "inner",
    run_setup_anndata: bool = True,
):
    """
    Filtered PBMCs from 10x Genomics profiled with RNA and protein.

    Datasets were filtered for doublets and other outliers as in
    https://github.com/YosefLab/totalVI_reproducibility/blob/master/data/data_filtering_scripts/pbmc_10k/pbmc_10k.py

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    protein_join
        Whether to take an inner join or outer join of proteins
    run_setup_anndata
        If true, runs setup_anndata() on dataset before returning

    Returns
    -------
    `AnnData` with `.obsm["protein_expression"]

    Missing protein values are zero, and are identified during `AnnData` setup.
    """

    def _fetch(file_url, fn, batch_name):
        # download if needed, read, and tag with its batch of origin
        _download(file_url, save_path, fn)
        d = anndata.read_h5ad(os.path.join(save_path, fn))
        d.obs["batch"] = batch_name
        return d

    dataset1 = _fetch(
        "https://github.com/YosefLab/scVI-data/raw/master/pbmc_10k_protein_v3.h5ad?raw=true",
        "pbmc_10k_protein_v3.h5ad",
        "PBMC10k",
    )
    dataset2 = _fetch(
        "https://github.com/YosefLab/scVI-data/raw/master/pbmc_5k_protein_v3.h5ad?raw=true",
        "pbmc_5k_protein_v3.h5ad",
        "PBMC5k",
    )

    common_genes = dataset1.var_names.intersection(dataset2.var_names)
    dataset1 = dataset1[:, common_genes]
    dataset2 = dataset2[:, common_genes]

    for d in (dataset1, dataset2):
        # promote the raw protein matrix to a labeled DataFrame
        d.obsm["protein_expression"] = pd.DataFrame(
            d.obsm["protein_expression"],
            columns=d.uns["protein_names"],
            index=d.obs_names,
        )
        del d.uns["protein_names"]

    dataset = anndata.concat([dataset1, dataset2], join=protein_join)
    # proteins absent from one dataset become NaN on an outer join
    dataset.obsm["protein_expression"] = dataset.obsm["protein_expression"].fillna(0)

    if run_setup_anndata:
        setup_anndata(
            dataset,
            batch_key="batch",
            protein_expression_obsm_key="protein_expression",
        )
    return dataset
def _load_spleen_lymph_cite_seq(
    save_path: str = "data/",
    protein_join: str = "inner",
    remove_outliers: bool = True,
    run_setup_anndata: bool = True,
):
    """
    Immune cells from the murine spleen and lymph nodes [GayosoSteier20]_.

    This dataset was used throughout the totalVI manuscript, and named SLN-all.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    protein_join
        Whether to take an inner join or outer join of proteins
    remove_outliers
        Whether to remove clusters annotated as doublet or low quality
    run_setup_anndata
        If true, runs setup_anndata() on dataset before returning

    Returns
    -------
    `AnnData` with `.obsm["protein_expression"]

    Missing protein values are zero, and are identified during `AnnData` setup.
    """
    repo = "https://github.com/YosefLab/scVI-data/raw/master/"

    fn1 = "sln_111.h5ad"
    _download(repo + "sln_111.h5ad?raw=true", save_path, fn1)
    dataset1 = anndata.read_h5ad(os.path.join(save_path, fn1))
    # rename the hashtag-oligo matrix before concatenation
    dataset1.obsm["isotypes_htos"] = dataset1.obsm["htos"].copy()
    del dataset1.obsm["htos"]

    fn2 = "sln_208.h5ad"
    _download(repo + "sln_208.h5ad?raw=true", save_path, fn2)
    dataset2 = anndata.read_h5ad(os.path.join(save_path, fn2))

    common_genes = dataset1.var_names.intersection(dataset2.var_names)
    dataset1 = dataset1[:, common_genes]
    dataset2 = dataset2[:, common_genes]
    del dataset1.uns["protein_names"]
    del dataset2.uns["protein_names"]

    dataset = anndata.concat([dataset1, dataset2], join=protein_join)
    # proteins absent from one dataset become NaN on an outer join
    dataset.obsm["protein_expression"] = dataset.obsm["protein_expression"].fillna(0)

    if remove_outliers:
        # subclusters annotated as doublets / low quality
        nuisance = {"16,0", "17", "19", "21", "23", "24,0", "24,2", "25", "29"}
        keep = [c not in nuisance for c in dataset.obs["leiden_subclusters"]]
        dataset = dataset[keep].copy()

    if run_setup_anndata:
        setup_anndata(
            dataset,
            batch_key="batch",
            labels_key="cell_types",
            protein_expression_obsm_key="protein_expression",
        )
    return dataset
def _load_pbmc_seurat_v4_cite_seq(
    save_path: str = "data/",
    apply_filters: bool = True,
    aggregate_proteins: bool = True,
    mask_protein_batches: int = 0,
    run_setup_anndata: bool = True,
):
    """Load the Seurat v4 PBMC CITE-seq dataset.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    apply_filters
        Apply QC filters (protein/RNA library size, proteins detected,
        doublets, percent mitochondrial counts).
    aggregate_proteins
        Sum "-1"/"-2" duplicated antibody columns into a single column
        (except the Notch/TCR families).
    mask_protein_batches
        Number of randomly chosen ``orig.ident`` batches whose protein
        counts are zeroed out; at most 24.
    run_setup_anndata
        If true, runs setup on the dataset before returning.

    Raises
    ------
    ValueError
        If ``mask_protein_batches`` exceeds 24.
    """
    # validate before the (large) download and filtering — fail fast.
    # NOTE: the check allows exactly 24; the old message ("less than 24")
    # contradicted it and has been corrected.
    if mask_protein_batches > 24:
        raise ValueError("mask_protein_batches must be at most 24")

    url = "https://ndownloader.figshare.com/files/27458840"
    save_fn = "pbmc_seurat_v4.h5ad"
    _download(url, save_path, save_fn)
    adata = anndata.read_h5ad(os.path.join(save_path, save_fn))

    if aggregate_proteins:
        protein_df = pd.DataFrame(index=adata.obsm["protein_counts"].index)
        ref_proteins = adata.obsm["protein_counts"].columns
        for p in ref_proteins:
            if p.split("-")[-1] == "1" or p.split("-")[-1] == "2":
                root = p.split("-")[0]
                if root not in ["Notch", "TCR"]:
                    try:
                        protein_df[root] = (
                            adata.obsm["protein_counts"][root + "-1"]
                            + adata.obsm["protein_counts"][root + "-2"]
                        ).values
                    except KeyError:
                        # only one member of the "-1"/"-2" pair exists
                        protein_df[p] = adata.obsm["protein_counts"][p]
                else:
                    protein_df[p] = adata.obsm["protein_counts"][p]
            else:
                protein_df[p] = adata.obsm["protein_counts"][p]
        adata.obsm["protein_counts"] = protein_df

    if apply_filters:
        adata.obs["total_counts"] = np.ravel(adata.X.sum(axis=1).A)
        adata.var["mt"] = adata.var_names.str.startswith("MT-")
        adata.obs["total_counts_mt"] = np.ravel(
            adata.X[:, adata.var["mt"].values].sum(axis=1).A
        )
        adata.obs["pct_counts_mt"] = (
            adata.obs["total_counts_mt"] / adata.obs["total_counts"] * 100
        )
        adata.obs["Protein log library size"] = np.log(
            adata.obsm["protein_counts"].sum(1)
        )
        adata.obs["Number proteins detected"] = (
            adata.obsm["protein_counts"] > 0
        ).sum(1)
        adata.obs["RNA log library size"] = np.log(adata.X.sum(1).A)
        # actually filter
        adata = adata[adata.obs["Protein log library size"] > 7.6]
        adata = adata[adata.obs["Protein log library size"] < 10.3]
        adata = adata[adata.obs["Number proteins detected"] > 150]
        # filter doublet
        adata = adata[adata.obs["celltype.l2"] != "Doublet"]
        # MT
        adata = adata[adata.obs["pct_counts_mt"] < 12].copy()

    if mask_protein_batches > 0:
        # deterministic given the global settings seed
        random_state = np.random.RandomState(seed=settings.seed)
        rand_cats = random_state.permutation(
            adata.obs["orig.ident"].astype("category").cat.categories
        )[:mask_protein_batches]
        for r in rand_cats:
            adata.obsm["protein_counts"][adata.obs["orig.ident"] == r] = 0.0

    if run_setup_anndata:
        _setup_anndata(
            adata,
            batch_key="orig.ident",
            protein_expression_obsm_key="protein_counts",
        )
    return adata