Example #1
File: _csv.py Project: vals/scVI
def _load_mouse_ob_dataset(save_path: str = "data/", run_setup_anndata: bool = True):
    save_path = os.path.abspath(save_path)
    url = "http://www.spatialtranscriptomicsresearch.org/wp-content/uploads/2016/07/Rep11_MOB_count_matrix-1.tsv"
    save_fn = "Rep11_MOB_count_matrix-1.tsv"
    _download(url, save_path, save_fn)
    adata = _load_csv(
        os.path.join(save_path, save_fn), delimiter="\t", gene_by_cell=False
    )
    adata.obs["batch"] = np.zeros(adata.shape[0]).astype(int)
    adata.obs["labels"] = np.zeros(adata.shape[0]).astype(int)

    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")

    return adata
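A usage sketch for this loader. The function is module-private (leading underscore), so calling it directly as below is illustrative only; in scvi-tools the public dataset functions wrap loaders like this one.

# Illustrative call; assumes this module's private helpers are importable.
adata = _load_mouse_ob_dataset(save_path="data/", run_setup_anndata=False)
# Every cell gets batch 0 and label 0, so both obs columns are constant.
print(adata.shape)
print(adata.obs["batch"].unique())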
Example #2
def _load_smfish(
    save_path: str = "data/",
    use_high_level_cluster: bool = True,
    run_setup_anndata: bool = True,
) -> anndata.AnnData:
    save_path = os.path.abspath(save_path)
    url = "http://linnarssonlab.org/osmFISH/osmFISH_SScortex_mouse_all_cells.loom"
    save_fn = "osmFISH_SScortex_mouse_all_cell.loom"
    _download(url, save_path, save_fn)
    adata = _load_smfish_data(
        os.path.join(save_path, save_fn), use_high_level_cluster=use_high_level_cluster
    )
    adata.obs["batch"] = np.zeros(adata.shape[0], dtype=np.int64)
    if run_setup_anndata:
        _setup_anndata(adata, labels_key="labels", batch_key="batch")
    return adata
Example #3
def _load_heart_cell_atlas_subsampled(
    save_path: str = "data/",
    run_setup_anndata: bool = True,
    remove_nuisance_clusters: bool = True,
):
    """
    Combined single cell and single nuclei RNA-Seq data of 485K cardiac cells with annotations.

    Dataset was filtered down randomly to 20k cells using :func:`~scanpy.pp.subsample`. The original
    data can be sourced from https://www.heartcellatlas.org/#DataSources.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    run_setup_anndata
        If True, runs setup_anndata() on the dataset before returning.
    remove_nuisance_clusters
        Remove doublets and unassigned cells.

    Returns
    -------
    AnnData

    Notes
    -----
    The data were filtered using the following sequence::

        >>> adata = anndata.read_h5ad(path_to_anndata)
        >>> bdata = sc.pp.subsample(adata, n_obs=20000, copy=True)
        >>> sc.pp.filter_genes(bdata, min_counts=3)
        >>> bdata.write_h5ad(path, compression="gzip")
    """
    url = "https://github.com/YosefLab/scVI-data/blob/master/hca_subsampled_20k.h5ad?raw=true"
    save_fn = "hca_subsampled_20k.h5ad"
    _download(url, save_path, save_fn)
    dataset = anndata.read_h5ad(os.path.join(save_path, save_fn))

    if remove_nuisance_clusters:
        remove = ["doublets", "NotAssigned"]
        keep = [c not in remove for c in dataset.obs.cell_type.values]
        dataset = dataset[keep, :].copy()

    if run_setup_anndata:
        setup_anndata(dataset)

    return dataset
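The list comprehension over `dataset.obs.cell_type.values` builds the keep-mask one cell at a time; a vectorized sketch of the same filter with pandas' `isin` (an equivalent alternative, not the author's code):

remove = ["doublets", "NotAssigned"]
# ~ negates the boolean Series returned by isin(): True means keep the cell.
keep = ~dataset.obs["cell_type"].isin(remove)
dataset = dataset[keep.values, :].copy()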
Example #4
def _load_frontalcortex_dropseq(save_path: str = "data/",
                                run_setup_anndata: bool = True) -> AnnData:
    save_path = os.path.abspath(save_path)
    url = "https://github.com/YosefLab/scVI-data/raw/master/fc-dropseq.loom"
    save_fn = "fc-dropseq.loom"
    _download(url, save_path, save_fn)
    adata = _load_loom(os.path.join(save_path, save_fn))
    adata.obs["batch"] = adata.obs["Clusters"]
    del adata.obs["Clusters"]
    adata.obs["labels"] = np.zeros(adata.shape[0], dtype=np.int64)

    # reorder labels such that layers of the cortex are in order
    # order_labels = [5, 6, 3, 2, 4, 0, 1, 8, 7, 9, 10, 11, 12, 13]
    # self.reorder_cell_types(self.cell_types[order_labels])

    if run_setup_anndata:
        _setup_anndata(adata, batch_key="batch", labels_key="labels")

    return adata
Example #5
def _load_retina(save_path: str = "data/",
                 run_setup_anndata: bool = True) -> AnnData:
    """\
    Loads retina dataset

    The dataset of bipolar cells contains after their original pipeline for filtering 27,499 cells and
    13,166 genes coming from two batches. We use the cluster annotation from 15 cell-types from the author.
    We also extract their normalized data with Combat and use it for benchmarking.

    """
    save_path = os.path.abspath(save_path)
    url = "https://github.com/YosefLab/scVI-data/raw/master/retina.loom"
    save_fn = "retina.loom"
    _download(url, save_path, save_fn)
    adata = _load_loom(os.path.join(save_path, save_fn))
    cell_types = [
        "RBC",
        "MG",
        "BC5A",
        "BC7",
        "BC6",
        "BC5C",
        "BC1A",
        "BC3B",
        "BC1B",
        "BC2",
        "BC5D",
        "BC3A",
        "BC5B",
        "BC4",
        "BC8_9",
    ]
    adata.obs["labels"] = [
        cell_types[i]
        for i in adata.obs["ClusterID"].values.astype(int).ravel()
    ]
    del adata.obs["ClusterID"]
    adata.obs["batch"] = pd.Categorical(adata.obs["BatchID"].values.copy())
    del adata.obs["BatchID"]
    if run_setup_anndata:
        _setup_anndata(adata, batch_key="batch", labels_key="labels")

    return adata
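The labels above come from indexing the `cell_types` list with each integer `ClusterID`; an equivalent sketch using `pandas.Categorical.from_codes`, which additionally validates the codes against the category list (an alternative, not the original implementation):

codes = adata.obs["ClusterID"].values.astype(int).ravel()
# from_codes validates that every code maps to an entry of cell_types
# and yields a categorical column instead of a plain object column.
adata.obs["labels"] = pd.Categorical.from_codes(codes, categories=cell_types)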
Example #6
def _load_prefrontalcortex_starmap(save_path: str = "data/") -> AnnData:
    """\
    Loads a starMAP dataset of 3,704 cells and 166 genes from the mouse pre-frontal cortex (Wang et al., 2018)
    """
    save_path = os.path.abspath(save_path)
    url = "https://github.com/YosefLab/scVI-data/raw/master/mpfc-starmap.loom"
    save_fn = "mpfc-starmap.loom"
    _download(url, save_path, save_fn)
    adata = _load_loom(os.path.join(save_path, save_fn))

    adata.obs["labels"] = adata.obs.Clusters.values
    del adata.obs["Clusters"]

    adata.obs["batch"] = adata.obs.BatchID.values
    del adata.obs["BatchID"]
    adata.obs["x_coord"] = adata.obsm["Spatial_coordinates"][:, 0]
    adata.obs["y_coord"] = adata.obsm["Spatial_coordinates"][:, 1]

    return adata
Example #7
def _load_brainlarge_dataset(
    save_path: str = "data/",
    sample_size_gene_var: int = 10000,
    max_cells_to_keep: int = None,
    n_genes_to_keep: int = 720,
    loading_batch_size: int = 100000,
) -> anndata.AnnData:
    """Loads brain-large dataset."""
    url = "http://cf.10xgenomics.com/samples/cell-exp/1.3.0/1M_neurons/1M_neurons_filtered_gene_bc_matrices_h5.h5"
    save_fn = "brain_large.h5"

    _download(url, save_path, save_fn)
    adata = _load_brainlarge_file(
        os.path.join(save_path, save_fn),
        sample_size_gene_var=sample_size_gene_var,
        max_cells_to_keep=max_cells_to_keep,
        n_genes_to_keep=n_genes_to_keep,
        loading_batch_size=loading_batch_size,
    )
    return adata
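A usage sketch for the size-control knobs. From the parameter names, gene variance is estimated on a `sample_size_gene_var`-cell subsample, the `n_genes_to_keep` most variable genes are retained, and cells are streamed in `loading_batch_size` chunks; the actual behavior lives in `_load_brainlarge_file`, which is not shown here.

adata = _load_brainlarge_dataset(
    save_path="data/",
    sample_size_gene_var=10000,   # cells used to estimate gene variance
    max_cells_to_keep=50000,      # None would keep all cells
    n_genes_to_keep=720,
    loading_batch_size=100000,
)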
Example #8
def _load_annotation_simulation(name: str, save_path: str = "data/") -> AnnData:
    """\
    Simulated datasets for scANVI tutorials

    name
        One of "1", "2", or "3"
    """

    save_path = os.path.abspath(save_path)
    url = "https://github.com/YosefLab/scVI-data/raw/master/simulation/simulation_{}.loom".format(
        name
    )
    save_fn = "simulation_{}.loom".format(name)
    _download(url, save_path, save_fn)
    adata = _load_loom(os.path.join(save_path, save_fn))

    adata.obs["labels"] = adata.obs.ClusterID.values
    del adata.obs["ClusterID"]

    adata.obs["batch"] = adata.obs.BatchID.values
    del adata.obs["BatchID"]

    return adata
Example #9
def _load_purified_pbmc_dataset(
    save_path: str = "data/",
    subset_datasets: List[str] = None,
    run_setup_anndata: bool = True,
) -> anndata.AnnData:
    url = "https://github.com/YosefLab/scVI-data/raw/master/PurifiedPBMCDataset.h5ad"
    save_fn = "PurifiedPBMCDataset.h5ad"
    _download(url, save_path, save_fn)
    path_to_file = os.path.join(save_path, save_fn)
    adata = anndata.read(path_to_file)

    dataset_names = [
        "cd4_t_helper",
        "regulatory_t",
        "naive_t",
        "memory_t",
        "cytotoxic_t",
        "naive_cytotoxic",
        "b_cells",
        "cd4_t_helper",
        "cd34",
        "cd56_nk",
        "cd14_monocytes",
    ]
    if subset_datasets is not None:
        row_indices = []
        for dataset in subset_datasets:
            assert dataset in dataset_names
            idx = np.where(adata.obs["cell_types"] == dataset)[0]
            row_indices.append(idx)
        row_indices = np.concatenate(row_indices)
        adata = adata[row_indices].copy()

    if run_setup_anndata:
        setup_anndata(adata, batch_key="batch", labels_key="labels")

    return adata
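The loop above checks each requested subset with a bare `assert` and collects row indices via `np.where`; a vectorized sketch of the same subset with an explicit error (an alternative, not the original; note it keeps cells in their original order rather than grouped by `subset_datasets`):

unknown = set(subset_datasets) - set(dataset_names)
if unknown:
    raise ValueError("Unknown dataset(s): {}".format(sorted(unknown)))
# One boolean mask selects all matching cells at once.
mask = adata.obs["cell_types"].isin(subset_datasets).values
adata = adata[mask].copy()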
Example #10
def _load_pbmc_dataset(
    save_path: str = "data/",
    remove_extracted_data: bool = True,
) -> anndata.AnnData:
    urls = [
        "https://github.com/YosefLab/scVI-data/raw/master/gene_info.csv",
        "https://github.com/YosefLab/scVI-data/raw/master/pbmc_metadata.pickle",
    ]
    save_fns = ["gene_info_pbmc.csv", "pbmc_metadata.pickle"]

    for i in range(len(urls)):
        _download(urls[i], save_path, save_fns[i])

    de_metadata = pd.read_csv(os.path.join(save_path, "gene_info_pbmc.csv"), sep=",")
    pbmc_metadata = pd.read_pickle(os.path.join(save_path, "pbmc_metadata.pickle"))
    pbmc8k = _load_dataset_10x(
        "pbmc8k",
        save_path=save_path,
        var_names="gene_ids",
        remove_extracted_data=remove_extracted_data,
    )
    pbmc4k = _load_dataset_10x(
        "pbmc4k",
        save_path=save_path,
        var_names="gene_ids",
        remove_extracted_data=remove_extracted_data,
    )
    barcodes = np.concatenate((pbmc8k.obs_names, pbmc4k.obs_names))

    adata = pbmc8k.concatenate(pbmc4k)
    adata.obs_names = barcodes

    dict_barcodes = dict(zip(barcodes, np.arange(len(barcodes))))
    subset_cells = []
    barcodes_metadata = pbmc_metadata["barcodes"].index.values.ravel().astype(str)
    for barcode in barcodes_metadata:
        if (
            barcode in dict_barcodes
        ):  # barcodes with end -11 filtered on 10X website (49 cells)
            subset_cells += [dict_barcodes[barcode]]
    adata = adata[np.asarray(subset_cells), :].copy()
    idx_metadata = np.asarray(
        [not barcode.endswith("11") for barcode in barcodes_metadata], dtype=bool
    )
    genes_to_keep = list(
        de_metadata["ENSG"].values
    )  # only keep the genes for which we have de data
    difference = list(
        set(genes_to_keep).difference(set(adata.var_names))
    )  # Non empty only for unit tests
    for gene in difference:
        genes_to_keep.remove(gene)

    adata = adata[:, genes_to_keep].copy()
    design = pbmc_metadata["design"][idx_metadata]
    raw_qc = pbmc_metadata["raw_qc"][idx_metadata]
    normalized_qc = pbmc_metadata["normalized_qc"][idx_metadata]

    design.index = adata.obs_names
    raw_qc.index = adata.obs_names
    normalized_qc.index = adata.obs_names
    adata.obs["batch"] = adata.obs["batch"].astype(np.int64)
    adata.obsm["design"] = design
    adata.obsm["raw_qc"] = raw_qc
    adata.obsm["normalized_qc"] = normalized_qc

    adata.obsm["qc_pc"] = pbmc_metadata["qc_pc"][idx_metadata]
    labels = pbmc_metadata["clusters"][idx_metadata]
    cell_types = pbmc_metadata["list_clusters"]
    adata.obs["labels"] = labels
    adata.uns["cell_types"] = cell_types
    adata.obs["str_labels"] = [cell_types[i] for i in labels]

    adata.var["n_counts"] = np.squeeze(np.asarray(np.sum(adata.X, axis=0)))

    return adata
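The barcode matching above goes through a Python dict in a loop; a sketch of the same alignment with `pandas.Index.get_indexer`, which assumes the concatenated barcodes are unique (get_indexer raises on a non-unique index):

barcodes_metadata = pbmc_metadata["barcodes"].index.values.ravel().astype(str)
# get_indexer returns each barcode's integer position in adata, or -1 if
# absent (e.g. the -11-suffixed barcodes filtered on the 10X website).
positions = adata.obs_names.get_indexer(barcodes_metadata)
subset_cells = positions[positions >= 0]
adata = adata[subset_cells, :].copy()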
Example #11
def _load_dataset_10x(
    dataset_name: str = None,
    filename: str = None,
    save_path: str = "data/10X",
    url: str = None,
    return_filtered: bool = True,
    remove_extracted_data: bool = False,
    **scanpy_read_10x_kwargs,
):
    try:
        import scanpy
    except ImportError:
        raise ImportError("Please install scanpy -- `pip install scanpy`")

    # form data url and filename unless manual override
    if dataset_name is not None:
        if url is not None:
            logger.warning("dataset_name provided, manual url is disregarded.")
        if filename is not None:
            logger.warning(
                "dataset_name provided, manual filename is disregarded.")
        group = dataset_to_group[dataset_name]
        url_skeleton = group_to_url_skeleton[group]

        filter_type = "filtered" if return_filtered else "raw"
        url = url_skeleton.format(group, dataset_name, dataset_name,
                                  filter_type)
        filename_skeleton = group_to_filename_skeleton[group]
        filename = filename_skeleton.format(filter_type)
        save_path = os.path.join(save_path, dataset_name)
    elif filename is not None and url is not None:
        logger.info("Loading 10X dataset with custom url and filename")
    elif filename is not None and url is None:
        logger.info("Loading local 10X dataset with custom filename")
    else:
        logger.info("Loading extracted local 10X dataset with custom filename")
    _download(url, save_path=save_path, filename=filename)
    file_path = os.path.join(save_path, filename)

    # untar
    download_is_targz = url is not None and url.endswith(".tar.gz")
    was_extracted = False
    if download_is_targz:
        if not os.path.exists(file_path[:-7]):  # nothing extracted yet
            if tarfile.is_tarfile(file_path):
                logger.info("Extracting tar file")
                tar = tarfile.open(file_path, "r:gz")
                tar.extractall(path=save_path)
                was_extracted = True
                tar.close()
        path_to_data_folder, suffix = _find_path_to_mtx(save_path)
        adata = scanpy.read_10x_mtx(path_to_data_folder,
                                    **scanpy_read_10x_kwargs)
        if was_extracted and remove_extracted_data:
            folders_in_save_path = path_to_data_folder[len(save_path) +
                                                       1:].split("/")
            extracted_folder_path = save_path + "/" + folders_in_save_path[0]
            logger.info(
                "Removing extracted data at {}".format(extracted_folder_path))
            shutil.rmtree(extracted_folder_path)
    else:
        adata = scanpy.read_10x_h5(file_path, **scanpy_read_10x_kwargs)

    adata.var_names_make_unique()
    scanpy.pp.filter_cells(adata, min_counts=1)
    scanpy.pp.filter_genes(adata, min_counts=1)

    return adata
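A usage sketch for two of the entry modes handled above; the catalog name is taken from Example #10, while the custom URL and filename are hypothetical:

# 1) Catalog mode: url and filename are derived from the module's
#    dataset_to_group / group_to_url_skeleton lookup tables.
pbmc4k = _load_dataset_10x("pbmc4k", save_path="data/10X")

# 2) Custom mode: explicit url + filename for a dataset not in the tables.
custom = _load_dataset_10x(
    url="https://example.com/custom_filtered_feature_bc_matrix.h5",  # hypothetical
    filename="custom_filtered_feature_bc_matrix.h5",
    save_path="data/10X",
)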
Example #12
def _load_pbmcs_10x_cite_seq(
    save_path: str = "data/",
    protein_join: str = "inner",
    run_setup_anndata: bool = True,
):
    """
    Filtered PBMCs from 10x Genomics profiled with RNA and protein.

    Datasets were filtered for doublets and other outliers as in
    https://github.com/YosefLab/totalVI_reproducibility/blob/master/data/data_filtering_scripts/pbmc_10k/pbmc_10k.py

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    protein_join
        Whether to take an inner join or outer join of proteins.
    run_setup_anndata
        If True, runs setup_anndata() on the dataset before returning.

    Returns
    -------
    `AnnData` with `.obsm["protein_expression"]`

    Missing protein values are zero, and are identified during `AnnData` setup.
    """
    url = "https://github.com/YosefLab/scVI-data/raw/master/pbmc_10k_protein_v3.h5ad?raw=true"
    save_fn = "pbmc_10k_protein_v3.h5ad"
    _download(url, save_path, save_fn)
    dataset1 = anndata.read_h5ad(os.path.join(save_path, save_fn))
    dataset1.obs["batch"] = "PBMC10k"

    url = "https://github.com/YosefLab/scVI-data/raw/master/pbmc_5k_protein_v3.h5ad?raw=true"
    save_fn = "pbmc_5k_protein_v3.h5ad"
    _download(url, save_path, save_fn)
    dataset2 = anndata.read_h5ad(os.path.join(save_path, save_fn))
    dataset2.obs["batch"] = "PBMC5k"

    common_genes = dataset1.var_names.intersection(dataset2.var_names)
    dataset1 = dataset1[:, common_genes]
    dataset2 = dataset2[:, common_genes]
    dataset1.obsm["protein_expression"] = pd.DataFrame(
        dataset1.obsm["protein_expression"],
        columns=dataset1.uns["protein_names"],
        index=dataset1.obs_names,
    )
    dataset2.obsm["protein_expression"] = pd.DataFrame(
        dataset2.obsm["protein_expression"],
        columns=dataset2.uns["protein_names"],
        index=dataset2.obs_names,
    )
    del dataset1.uns["protein_names"]
    del dataset2.uns["protein_names"]

    dataset = anndata.concat([dataset1, dataset2], join=protein_join)
    dataset.obsm["protein_expression"] = dataset.obsm[
        "protein_expression"].fillna(0)

    if run_setup_anndata:
        setup_anndata(
            dataset,
            batch_key="batch",
            protein_expression_obsm_key="protein_expression",
        )

    return dataset
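On toy data, here is how `join` drives the protein-panel merge that motivates the `fillna(0)` above; pandas' `concat` behaves analogously to `anndata.concat` for the `.obsm` DataFrames (illustration only, not the real panels):

df1 = pd.DataFrame({"CD3": [5], "CD4": [2]}, index=["cell1"])   # PBMC10k panel
df2 = pd.DataFrame({"CD4": [7], "CD8": [1]}, index=["cell2"])   # PBMC5k panel

pd.concat([df1, df2], join="inner")            # shared proteins only: CD4
pd.concat([df1, df2], join="outer").fillna(0)  # union: CD3, CD4, CD8; gaps -> 0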
Example #13
def _load_spleen_lymph_cite_seq(
    save_path: str = "data/",
    protein_join: str = "inner",
    remove_outliers: bool = True,
    run_setup_anndata: bool = True,
):
    """
    Immune cells from the murine spleen and lymph nodes [GayosoSteier20]_.

    This dataset was used throughout the totalVI manuscript, and named SLN-all.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    protein_join
        Whether to take an inner join or outer join of proteins.
    remove_outliers
        Whether to remove clusters annotated as doublet or low quality.
    run_setup_anndata
        If True, runs setup_anndata() on the dataset before returning.

    Returns
    -------
    `AnnData` with `.obsm["protein_expression"]`

    Missing protein values are zero, and are identified during `AnnData` setup.
    """
    url = "https://github.com/YosefLab/scVI-data/raw/master/sln_111.h5ad?raw=true"
    save_fn = "sln_111.h5ad"
    _download(url, save_path, save_fn)
    dataset1 = anndata.read_h5ad(os.path.join(save_path, save_fn))
    dataset1.obsm["isotypes_htos"] = dataset1.obsm["htos"].copy()
    del dataset1.obsm["htos"]

    url = "https://github.com/YosefLab/scVI-data/raw/master/sln_208.h5ad?raw=true"
    save_fn = "sln_208.h5ad"
    _download(url, save_path, save_fn)
    dataset2 = anndata.read_h5ad(os.path.join(save_path, save_fn))

    common_genes = dataset1.var_names.intersection(dataset2.var_names)
    dataset1 = dataset1[:, common_genes]
    dataset2 = dataset2[:, common_genes]

    del dataset1.uns["protein_names"]
    del dataset2.uns["protein_names"]

    dataset = anndata.concat(
        [dataset1, dataset2],
        join=protein_join,
    )
    dataset.obsm["protein_expression"] = dataset.obsm[
        "protein_expression"].fillna(0)

    if remove_outliers:
        include_cells = [
            c not in [
                "16,0", "17", "19", "21", "23", "24,0", "24,2", "25", "29"
            ] for c in dataset.obs["leiden_subclusters"]
        ]
        dataset = dataset[include_cells].copy()

    if run_setup_anndata:
        setup_anndata(
            dataset,
            batch_key="batch",
            labels_key="cell_types",
            protein_expression_obsm_key="protein_expression",
        )

    return dataset
Example #14
def _load_pbmc_seurat_v4_cite_seq(
    save_path: str = "data/",
    apply_filters: bool = True,
    aggregate_proteins: bool = True,
    mask_protein_batches: int = 0,
    run_setup_anndata: bool = True,
):
    url = "https://ndownloader.figshare.com/files/27458840"
    save_fn = "pbmc_seurat_v4.h5ad"
    _download(url, save_path, save_fn)
    adata = anndata.read_h5ad(os.path.join(save_path, save_fn))

    if aggregate_proteins:
        protein_df = pd.DataFrame(index=adata.obsm["protein_counts"].index)
        ref_proteins = adata.obsm["protein_counts"].columns
        for p in ref_proteins:
            if p.split("-")[-1] == "1" or p.split("-")[-1] == "2":
                root = p.split("-")[0]
                if root not in ["Notch", "TCR"]:
                    try:
                        protein_df[root] = (
                            adata.obsm["protein_counts"][root + "-1"] +
                            adata.obsm["protein_counts"][root + "-2"]).values
                    except KeyError:
                        protein_df[p] = adata.obsm["protein_counts"][p]
                else:
                    protein_df[p] = adata.obsm["protein_counts"][p]
            else:
                protein_df[p] = adata.obsm["protein_counts"][p]
        adata.obsm["protein_counts"] = protein_df

    if apply_filters:
        adata.obs["total_counts"] = np.ravel(adata.X.sum(axis=1).A)
        adata.var["mt"] = adata.var_names.str.startswith("MT-")
        adata.obs["total_counts_mt"] = np.ravel(
            adata.X[:, adata.var["mt"].values].sum(axis=1).A)
        adata.obs["pct_counts_mt"] = (adata.obs["total_counts_mt"] /
                                      adata.obs["total_counts"] * 100)

        adata.obs["Protein log library size"] = np.log(
            adata.obsm["protein_counts"].sum(1))
        adata.obs["Number proteins detected"] = (adata.obsm["protein_counts"] >
                                                 0).sum(1)
        adata.obs["RNA log library size"] = np.log(adata.X.sum(1).A)

        # actually filter
        adata = adata[adata.obs["Protein log library size"] > 7.6]
        adata = adata[adata.obs["Protein log library size"] < 10.3]
        adata = adata[adata.obs["Number proteins detected"] > 150]
        # filter doublet
        adata = adata[adata.obs["celltype.l2"] != "Doublet"]
        # MT
        adata = adata[adata.obs["pct_counts_mt"] < 12].copy()

    if mask_protein_batches > 24:
        raise ValueError("mask_protein_batches must be at most 24")

    if mask_protein_batches > 0:
        random_state = np.random.RandomState(seed=settings.seed)
        rand_cats = random_state.permutation(adata.obs["orig.ident"].astype(
            "category").cat.categories)[:mask_protein_batches]
        for r in rand_cats:
            adata.obsm["protein_counts"][adata.obs["orig.ident"] == r] = 0.0

    if run_setup_anndata:
        _setup_anndata(
            adata,
            batch_key="orig.ident",
            protein_expression_obsm_key="protein_counts",
        )

    return adata
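A usage sketch: `mask_protein_batches=4` zeroes the protein panel for four randomly chosen `orig.ident` categories, a common setup for benchmarking protein imputation. The ceiling of 24 in the check above suggests 24 source batches, but that is inferred, not documented.

adata = _load_pbmc_seurat_v4_cite_seq(
    save_path="data/",
    apply_filters=True,          # QC filters from the function body above
    aggregate_proteins=True,     # sum the "-1"/"-2" duplicate antibodies
    mask_protein_batches=4,      # hide proteins for 4 random batches
    run_setup_anndata=False,
)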