コード例 #1
0
ファイル: _seqfish.py プロジェクト: vitkl/scvi-tools
def _load_seqfishplus(
    save_path: str = "data/",
    tissue_region: str = "subventricular cortex",
    run_setup_anndata: bool = True,
) -> anndata.AnnData:
    """
    Fetch the seqFISH+ source archive and load one tissue region as AnnData.

    Parameters
    ----------
    save_path
        Directory used for the downloaded archive; made absolute before use.
    tissue_region
        Either ``"subventricular cortex"`` or ``"olfactory bulb"``.
    run_setup_anndata
        If true, runs ``_setup_anndata`` on the dataset before returning.

    Returns
    -------
    AnnData whose ``.obs`` carries constant (all-zero) ``batch`` and ``labels``
    columns.

    Raises
    ------
    ValueError
        If ``tissue_region`` is not one of the two supported values.
    """
    if tissue_region == "subventricular cortex":
        file_prefix = "cortex_svz"
    elif tissue_region == "olfactory bulb":
        file_prefix = "ob"
    else:
        # Name the real parameter (`tissue_region`) so the caller can locate
        # the offending argument.
        raise ValueError(
            '`tissue_region` must be "subventricular cortex" or "olfactory bulb", but got {}'.format(
                tissue_region
            )
        )

    save_path = os.path.abspath(save_path)
    url = "https://github.com/CaiGroup/seqFISH-PLUS/raw/master/sourcedata.zip"
    save_fn = "seqfishplus.zip"

    _download(url, save_path, save_fn)
    adata = _load_seqfishplus_data(
        os.path.join(save_path, save_fn), file_prefix, save_path, gene_by_cell=False
    )
    # No batch or label annotation is assigned here: every cell gets index 0.
    adata.obs["batch"] = np.zeros(adata.shape[0], dtype=np.int64)
    adata.obs["labels"] = np.zeros(adata.shape[0], dtype=np.int64)

    if run_setup_anndata:
        _setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
コード例 #2
0
ファイル: test_anndata.py プロジェクト: vitkl/scvi-tools
def test_extra_covariates_transfer():
    """Transfer a setup with extra covariates, then extend it with a new category."""
    source = synthetic_iid()
    n_source = source.shape[0]
    for cont_key in ("cont1", "cont2"):
        source.obs[cont_key] = np.random.normal(size=(n_source, ))
    for cat_key in ("cat1", "cat2"):
        source.obs[cat_key] = np.random.randint(0, 5, size=(n_source, ))

    setup_kwargs = {
        "batch_key": "batch",
        "labels_key": "labels",
        "protein_expression_obsm_key": "protein_expression",
        "protein_names_uns_key": "protein_names",
        "continuous_covariate_keys": ["cont1", "cont2"],
        "categorical_covariate_keys": ["cat1", "cat2"],
    }
    _setup_anndata(source, **setup_kwargs)

    target = synthetic_iid()
    n_target = target.shape[0]
    for cont_key in ("cont1", "cont2"):
        target.obs[cont_key] = np.random.normal(size=(n_target, ))
    # Constant categorical columns in the target.
    target.obs["cat1"] = 0
    target.obs["cat2"] = 1

    transfer_anndata_setup(adata_source=source, adata_target=target)

    # Drop the transferred setup, introduce a value of "cat1" that the source
    # draw (0..4) cannot contain, and transfer again with category extension.
    del target.uns["_scvi"]
    target.obs["cat1"] = 6
    transfer_anndata_setup(
        adata_source=source,
        adata_target=target,
        extend_categories=True,
    )
    cat1_mapping = target.uns["_scvi"]["extra_categoricals"]["mappings"]["cat1"]
    assert cat1_mapping[-1] == 6
コード例 #3
0
def _load_annotation_simulation(name: str,
                                save_path: str = "data/",
                                run_setup_anndata: bool = True) -> AnnData:
    """\
    Load one of the simulated datasets used in the scANVI tutorials.

    name
        One of "1", "2", or "3"
    save_path
        Directory used for the downloaded loom file; made absolute before use.
    run_setup_anndata
        If true, runs ``_setup_anndata`` on the dataset before returning.
    """
    target_dir = os.path.abspath(save_path)
    save_fn = "simulation_{}.loom".format(name)
    url = "https://github.com/YosefLab/scVI-data/raw/master/simulation/simulation_{}.loom".format(
        name)
    _download(url, target_dir, save_fn)
    adata = _load_loom(os.path.join(target_dir, save_fn))

    # Re-home the loom annotation columns under the names used for setup.
    for new_key, loom_key in (("labels", "ClusterID"), ("batch", "BatchID")):
        adata.obs[new_key] = adata.obs[loom_key].values
        del adata.obs[loom_key]

    if not run_setup_anndata:
        return adata

    _setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
コード例 #4
0
def test_brain_small(save_path):
    """Smoke test: load the "neuron_9k" 10X dataset and train for one epoch."""
    tenx_dir = os.path.join(save_path, "10X")
    load_kwargs = {
        "dataset_name": "neuron_9k",
        "save_path": tenx_dir,
        "remove_extracted_data": True,
    }
    adata = dataset_10x(**load_kwargs)
    _setup_anndata(adata)
    unsupervised_training_one_epoch(adata)
コード例 #5
0
def test_populate_and_train_one_v1(save_path):
    """Smoke test: load the "cd4_t_helper" 10X dataset and train for one epoch."""
    tenx_dir = os.path.join(save_path, "10X")
    load_kwargs = {
        "dataset_name": "cd4_t_helper",
        "remove_extracted_data": True,
        "save_path": tenx_dir,
    }
    adata = dataset_10x(**load_kwargs)
    _setup_anndata(adata)
    unsupervised_training_one_epoch(adata)
コード例 #6
0
def _load_cortex(save_path: str = "data/",
                 run_setup_anndata: bool = True) -> anndata.AnnData:
    """
    Fetch the cortex expression table and load it as AnnData.

    The file is stored as ``expression.bin`` under the absolute form of
    ``save_path``. When ``run_setup_anndata`` is true, ``_setup_anndata`` is
    run with ``labels_key="labels"`` before the dataset is returned.
    """
    url = "https://storage.googleapis.com/linnarsson-lab-www-blobs/blobs/cortex/expression_mRNA_17-Aug-2014.txt"
    save_fn = "expression.bin"
    target_dir = os.path.abspath(save_path)

    _download(url, target_dir, save_fn)
    adata = _load_cortex_txt(os.path.join(target_dir, save_fn))

    if not run_setup_anndata:
        return adata
    _setup_anndata(adata, labels_key="labels")
    return adata
コード例 #7
0
def test_pbmc_cite(save_path):
    """Smoke test: extract the PBMC 10k CITE-seq archive, set it up, train one epoch."""
    file_path = os.path.join(
        save_path, "10X/pbmc_10k_protein_v3/filtered_feature_bc_matrix.tar.gz")
    sp = os.path.join(save_path, "10X/pbmc_10k_protein_v3/")
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(file_path, "r:gz") as tar:
        tar.extractall(path=sp)
    # gex_only=False keeps the non-gene-expression features in the matrix.
    dataset = sc.read_10x_mtx(os.path.join(sp, "filtered_feature_bc_matrix"),
                              gex_only=False)
    organize_cite_seq_10x(dataset)
    _setup_anndata(dataset, protein_expression_obsm_key="protein_expression")
    unsupervised_training_one_epoch(dataset)
コード例 #8
0
ファイル: _seqfish.py プロジェクト: vitkl/scvi-tools
def _load_seqfish(
    save_path: str = "data/", run_setup_anndata: bool = True
) -> anndata.AnnData:
    """
    Fetch the SeqFISH spreadsheet and load it as AnnData.

    The file is stored as ``SeqFISH.xlsx`` under the absolute form of
    ``save_path``. All cells receive batch index 0 and label index 0. When
    ``run_setup_anndata`` is true, ``_setup_anndata`` is run before returning.
    """
    url = "https://www.cell.com/cms/attachment/2080562255/2072099886/mmc6.xlsx"
    save_fn = "SeqFISH.xlsx"
    target_dir = os.path.abspath(save_path)

    _download(url, target_dir, save_fn)
    adata = _load_seqfish_data(os.path.join(target_dir, save_fn))

    n_cells = adata.shape[0]
    for obs_key in ("batch", "labels"):
        adata.obs[obs_key] = np.zeros(n_cells, dtype=np.int64)

    if not run_setup_anndata:
        return adata
    _setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
コード例 #9
0
ファイル: test_anndata.py プロジェクト: vitkl/scvi-tools
def test_view_anndata_setup(save_path):
    """Exercise ``view_anndata_setup`` with each accepted input and two invalid ones.

    Covered inputs: a set-up AnnData, the ``_scvi`` setup dict, a saved model
    folder (with and without the AnnData saved alongside), and a path to an
    ``.h5ad`` file. An AnnData that was never set up and a non-supported type
    must both raise ``ValueError``.
    """
    adata = synthetic_iid(run_setup_anndata=False)
    # NOTE(review): without ``size=`` this draws a single scalar from
    # [5, n_obs), which is then broadcast to the whole column.
    adata.obs["cont1"] = np.random.uniform(5, adata.n_obs)
    adata.obs["cont2"] = np.random.uniform(5, adata.n_obs)
    # Write single cells through one positional ``.iloc`` call. Chained
    # indexing (``obs[col][i] = v``) may assign into a temporary copy.
    adata.obs.iloc[0, adata.obs.columns.get_loc(
        "cont1")] = 939543895847598301.423432423523512351234123421341234
    adata.obs.iloc[1, adata.obs.columns.get_loc(
        "cont2")] = 0.12938471298374691827634

    adata.obs["cat1"] = np.random.randint(0, 5, adata.n_obs).astype(str)
    cat1_col = adata.obs.columns.get_loc("cat1")
    adata.obs.iloc[8, cat1_col] = "asdf"
    adata.obs.iloc[9, cat1_col] = "f34"
    adata.obs["cat2"] = np.random.randint(0, 7, adata.n_obs)

    _setup_anndata(
        adata,
        protein_expression_obsm_key="protein_expression",
        batch_key="batch",
        labels_key="labels",
        categorical_covariate_keys=["cat1", "cat2"],
        continuous_covariate_keys=["cont1", "cont2"],
    )
    # test it works with adata
    view_anndata_setup(adata)

    # test it works with scvi setup dict
    view_anndata_setup(adata.uns["_scvi"])

    adata = scvi.data.synthetic_iid()
    m = scvi.model.SCVI(adata)
    folder_path = os.path.join(save_path, "tmp")
    m.save(folder_path, save_anndata=True)

    # test it works with a saved model folder
    view_anndata_setup(folder_path)
    adata_path = os.path.join(folder_path, "adata.h5ad")
    # test it works with the path to an anndata
    view_anndata_setup(adata_path)

    m = scvi.model.SCVI(adata)
    m.save(folder_path, overwrite=True)
    # test it works without saving the anndata
    view_anndata_setup(folder_path)

    # test it throws error if adata was not setup
    # Build the fixture outside the ``raises`` block so that only the call
    # under test can satisfy the expectation.
    adata = synthetic_iid(run_setup_anndata=False)
    with pytest.raises(ValueError):
        view_anndata_setup(adata)

    # test it throws error if we dont pass dict, anndata or str in
    with pytest.raises(ValueError):
        view_anndata_setup(0)
コード例 #10
0
def _load_heart_cell_atlas_subsampled(
    save_path: str = "data/",
    run_setup_anndata: bool = True,
    remove_nuisance_clusters: bool = True,
):
    """
    Load a 20k-cell random subsample of the heart cell atlas.

    The full resource combines single cell and single nuclei RNA-Seq data of 485K
    cardiac cells with annotations and is available from
    https://www.heartcellatlas.org/#DataSources. The subsample was produced with
    :func:`~scanpy.pp.subsample`.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    run_setup_anndata
        If true, runs setup_anndata() on dataset before returning
    remove_nuisance_clusters
        If true, drops cells whose ``cell_type`` is "doublets" or "NotAssigned"

    Returns
    -------
    AnnData

    Notes
    -----
    The data were filtered using the following sequence::

        >>> adata = anndata.read_h5ad(path_to_anndata)
        >>> bdata = sc.pp.subsample(adata, n_obs=20000, copy=True)
        >>> sc.pp.filter_genes(bdata, min_counts=3)
        >>> bdata.write_h5ad(path, compression="gzip")
    """
    save_fn = "hca_subsampled_20k.h5ad"
    url = "https://github.com/YosefLab/scVI-data/blob/master/hca_subsampled_20k.h5ad?raw=true"
    _download(url, save_path, save_fn)
    dataset = anndata.read_h5ad(os.path.join(save_path, save_fn))

    if remove_nuisance_clusters:
        nuisance = ["doublets", "NotAssigned"]
        # Boolean row mask: True for every cell outside the nuisance clusters.
        keep_mask = []
        for cell_type in dataset.obs.cell_type.values:
            keep_mask.append(cell_type not in nuisance)
        dataset = dataset[keep_mask, :].copy()

    if not run_setup_anndata:
        return dataset

    _setup_anndata(dataset)
    return dataset
コード例 #11
0
def _load_breast_cancer_dataset(save_path: str = "data/",
                                run_setup_anndata: bool = True):
    """
    Fetch the breast cancer spatial transcriptomics count matrix as AnnData.

    The tab-separated file is stored under the absolute form of ``save_path``
    and read with ``gene_by_cell=False``. All cells receive batch index 0 and
    label index 0. When ``run_setup_anndata`` is true, ``_setup_anndata`` is
    run before returning.
    """
    url = "http://www.spatialtranscriptomicsresearch.org/wp-content/uploads/2016/07/Layer2_BC_count_matrix-1.tsv"
    save_fn = "Layer2_BC_count_matrix-1.tsv"
    target_dir = os.path.abspath(save_path)

    _download(url, target_dir, save_fn)
    tsv_path = os.path.join(target_dir, save_fn)
    adata = _load_csv(tsv_path, delimiter="\t", gene_by_cell=False)

    n_cells = adata.shape[0]
    for obs_key in ("batch", "labels"):
        adata.obs[obs_key] = np.zeros(n_cells).astype(int)

    if not run_setup_anndata:
        return adata
    _setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
コード例 #12
0
ファイル: _autozi.py プロジェクト: vitkl/scvi-tools
    def setup_anndata(
        adata: AnnData,
        batch_key: Optional[str] = None,
        labels_key: Optional[str] = None,
        layer: Optional[str] = None,
        copy: bool = False,
    ) -> Optional[AnnData]:
        """
        %(summary)s.

        Parameters
        ----------
        %(param_adata)s
        %(param_batch_key)s
        %(param_labels_key)s
        %(param_layer)s
        %(param_copy)s

        Returns
        -------
        %(returns)s
        """
        # NOTE(review): the %(...)s placeholders above are presumably filled in
        # by a docstring-substitution decorator outside this view; keep them
        # verbatim.
        # Forward the options this wrapper exposes, unchanged, to the shared
        # setup routine.
        forwarded = {
            "batch_key": batch_key,
            "labels_key": labels_key,
            "layer": layer,
            "copy": copy,
        }
        return _setup_anndata(adata, **forwarded)
コード例 #13
0
ファイル: _smfish.py プロジェクト: vitkl/scvi-tools
def _load_smfish(
    save_path: str = "data/",
    use_high_level_cluster: bool = True,
    run_setup_anndata: bool = True,
) -> anndata.AnnData:
    """
    Fetch the osmFISH somatosensory cortex loom file and load it as AnnData.

    ``use_high_level_cluster`` is forwarded unchanged to ``_load_smfish_data``.
    All cells receive batch index 0. When ``run_setup_anndata`` is true,
    ``_setup_anndata`` is run with the "labels" and "batch" columns.
    """
    url = "http://linnarssonlab.org/osmFISH/osmFISH_SScortex_mouse_all_cells.loom"
    save_fn = "osmFISH_SScortex_mouse_all_cell.loom"
    target_dir = os.path.abspath(save_path)

    _download(url, target_dir, save_fn)
    loom_path = os.path.join(target_dir, save_fn)
    adata = _load_smfish_data(
        loom_path, use_high_level_cluster=use_high_level_cluster
    )
    adata.obs["batch"] = np.zeros(adata.shape[0], dtype=np.int64)

    if not run_setup_anndata:
        return adata
    _setup_anndata(adata, labels_key="labels", batch_key="batch")
    return adata
コード例 #14
0
    def setup_anndata(
        adata: AnnData,
        batch_key: Optional[str] = None,
        layer: Optional[str] = None,
        categorical_covariate_keys: Optional[List[str]] = None,
        continuous_covariate_keys: Optional[List[str]] = None,
        copy: bool = False,
    ) -> Optional[AnnData]:
        """
        %(summary)s.

        Parameters
        ----------
        %(param_adata)s
        %(param_batch_key)s
        %(param_layer)s
        %(param_cat_cov_keys)s
        %(param_cont_cov_keys)s
        %(param_copy)s

        Returns
        -------
        %(returns)s
        """
        # NOTE(review): the %(...)s placeholders above are presumably filled in
        # by a docstring-substitution decorator outside this view; keep them
        # verbatim.
        # Forward the options this wrapper exposes, unchanged, to the shared
        # setup routine.
        forwarded = {
            "batch_key": batch_key,
            "layer": layer,
            "categorical_covariate_keys": categorical_covariate_keys,
            "continuous_covariate_keys": continuous_covariate_keys,
            "copy": copy,
        }
        return _setup_anndata(adata, **forwarded)
コード例 #15
0
def _load_retina(save_path: str = "data/",
                 run_setup_anndata: bool = True) -> AnnData:
    """\
    Load the retina (bipolar cell) dataset.

    After the authors' original filtering pipeline the dataset contains 27,499 cells and
    13,166 genes coming from two batches. The authors' cluster annotation of 15 cell types
    is used for the labels. Their Combat-normalized data is also extracted for benchmarking.

    """
    target_dir = os.path.abspath(save_path)
    save_fn = "retina.loom"
    url = "https://github.com/YosefLab/scVI-data/raw/master/retina.loom"
    _download(url, target_dir, save_fn)
    adata = _load_loom(os.path.join(target_dir, save_fn))

    # Cell-type names, indexed by the integer cluster id stored in the loom file.
    cell_types = [
        "RBC", "MG", "BC5A", "BC7", "BC6",
        "BC5C", "BC1A", "BC3B", "BC1B", "BC2",
        "BC5D", "BC3A", "BC5B", "BC4", "BC8_9",
    ]
    cluster_ids = adata.obs["ClusterID"].values.astype(int).ravel()
    adata.obs["labels"] = [cell_types[cluster_id] for cluster_id in cluster_ids]
    del adata.obs["ClusterID"]

    batch_ids = adata.obs["BatchID"].values.copy()
    adata.obs["batch"] = pd.Categorical(batch_ids)
    del adata.obs["BatchID"]

    if not run_setup_anndata:
        return adata

    _setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
コード例 #16
0
def _load_frontalcortex_dropseq(save_path: str = "data/",
                                run_setup_anndata: bool = True) -> AnnData:
    """
    Fetch the frontal cortex Drop-seq loom file and load it as AnnData.

    The loom "Clusters" column is moved to ``obs["batch"]``, and every cell
    receives label index 0. When ``run_setup_anndata`` is true,
    ``_setup_anndata`` is run before returning.
    """
    target_dir = os.path.abspath(save_path)
    save_fn = "fc-dropseq.loom"
    url = "https://github.com/YosefLab/scVI-data/raw/master/fc-dropseq.loom"
    _download(url, target_dir, save_fn)
    adata = _load_loom(os.path.join(target_dir, save_fn))

    clusters = adata.obs["Clusters"]
    adata.obs["batch"] = clusters
    del adata.obs["Clusters"]
    adata.obs["labels"] = np.zeros(adata.shape[0], dtype=np.int64)

    # NOTE: a disabled step used to reorder labels so that the cortex layers
    # come in order; it is kept here for reference only:
    # order_labels = [5, 6, 3, 2, 4, 0, 1, 8, 7, 9, 10, 11, 12, 13]
    # self.reorder_cell_types(self.cell_types[order_labels])

    if not run_setup_anndata:
        return adata

    _setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
コード例 #17
0
ファイル: _synthetic.py プロジェクト: vitkl/scvi-tools
def _generate_synthetic(
    batch_size: int = 128,
    n_genes: int = 100,
    n_proteins: int = 100,
    n_batches: int = 2,
    n_labels: int = 3,
    run_setup_anndata: bool = True,
) -> AnnData:
    """
    Build a random AnnData with gene counts, protein counts, batches and labels.

    The dataset has ``batch_size * n_batches`` cells. Gene and protein counts
    are negative-binomial draws; gene counts are additionally multiplied by a
    Bernoulli(0.7) mask. Protein counts go to ``obsm["protein_expression"]``
    and protein names to ``uns["protein_names"]``. When ``run_setup_anndata``
    is true, ``_setup_anndata`` is run on the result.
    """
    n_cells = batch_size * n_batches
    counts_shape = (n_cells, n_genes)

    # Draw order is kept fixed so results under a given seed do not change.
    counts = np.random.negative_binomial(5, 0.3, size=counts_shape)
    keep_mask = np.random.binomial(n=1, p=0.7, size=counts_shape)
    counts = counts * keep_mask

    label_ids = np.random.randint(0, n_labels, size=(n_cells, ))
    labels = np.array(["label_%d" % label_id for label_id in label_ids])

    # Cells are laid out batch by batch: batch_size cells of batch_0, then
    # batch_size cells of batch_1, and so on.
    batch = [
        "batch_{}".format(batch_id)
        for batch_id in range(n_batches)
        for _ in range(batch_size)
    ]

    adata = AnnData(counts)
    adata.obs["batch"] = pd.Categorical(batch)
    adata.obs["labels"] = pd.Categorical(labels)

    # Protein measurements
    protein_shape = (adata.shape[0], n_proteins)
    adata.obsm["protein_expression"] = np.random.negative_binomial(
        5, 0.3, size=protein_shape)
    adata.uns["protein_names"] = np.arange(n_proteins).astype(str)

    if not run_setup_anndata:
        return adata

    _setup_anndata(
        adata,
        batch_key="batch",
        labels_key="labels",
        protein_expression_obsm_key="protein_expression",
        protein_names_uns_key="protein_names",
    )
    return adata
コード例 #18
0
ファイル: test_anndata.py プロジェクト: vitkl/scvi-tools
def test_saving(save_path):
    """Check that an AnnData set up with extra covariates can be written and read back."""
    save_path = os.path.join(save_path, "tmp_adata.h5ad")
    adata = synthetic_iid(run_setup_anndata=False)
    # NOTE(review): without ``size=`` this draws a single scalar from
    # [5, n_obs), which is then broadcast to the whole column.
    adata.obs["cont1"] = np.random.uniform(5, adata.n_obs)
    adata.obs["cont2"] = np.random.uniform(5, adata.n_obs)
    adata.obs["cat1"] = np.random.randint(0, 3, adata.n_obs).astype(str)
    # Write single cells through one positional ``.iloc`` call. Chained
    # indexing (``obs[col][i] = v``) may assign into a temporary copy.
    cat1_col = adata.obs.columns.get_loc("cat1")
    adata.obs.iloc[1, cat1_col] = "asdf"
    adata.obs.iloc[2, cat1_col] = "f34"
    adata.obs["cat2"] = np.random.randint(0, 7, adata.n_obs)

    _setup_anndata(
        adata,
        protein_expression_obsm_key="protein_expression",
        batch_key="batch",
        labels_key="labels",
        categorical_covariate_keys=["cat1", "cat2"],
        continuous_covariate_keys=["cont1", "cont2"],
    )
    adata.write(save_path)
    # Use the explicit h5ad reader, as the other tests in this file do.
    anndata.read_h5ad(save_path)
コード例 #19
0
ファイル: test_anndata.py プロジェクト: vitkl/scvi-tools
def test_data_format():
    """Check that setup leaves X and protein data C-contiguous and otherwise unchanged."""
    # Case 1: dense numpy inputs handed over in Fortran order.
    adata = synthetic_iid(run_setup_anndata=False)

    x_before = adata.X
    protein_before = adata.obsm["protein_expression"]
    obs_before = adata.obs
    adata.X = np.asfortranarray(x_before)
    adata.obsm["protein_expression"] = np.asfortranarray(protein_before)
    assert adata.X.flags["C_CONTIGUOUS"] is False
    assert adata.obsm["protein_expression"].flags["C_CONTIGUOUS"] is False

    _setup_anndata(adata, protein_expression_obsm_key="protein_expression")
    assert adata.X.flags["C_CONTIGUOUS"] is True
    assert adata.obsm["protein_expression"].flags["C_CONTIGUOUS"] is True

    # Values must be untouched by the layout change.
    assert np.array_equal(x_before, adata.X)
    assert np.array_equal(protein_before, adata.obsm["protein_expression"])
    assert np.array_equal(obs_before, adata.obs)

    # The registry must serve the same arrays.
    registered_x = get_from_registry(adata, _CONSTANTS.X_KEY)
    assert np.array_equal(adata.X, registered_x)
    registered_protein = get_from_registry(adata, _CONSTANTS.PROTEIN_EXP_KEY)
    assert np.array_equal(adata.obsm["protein_expression"], registered_protein)

    # Case 2: protein data stored as a DataFrame backed by a Fortran-ordered array.
    adata = synthetic_iid()
    fortran_protein = np.asfortranarray(adata.obsm["protein_expression"])
    adata.obsm["protein_expression"] = pd.DataFrame(fortran_protein,
                                                    index=adata.obs_names)
    protein_frame = adata.obsm["protein_expression"]
    assert protein_frame.to_numpy().flags["C_CONTIGUOUS"] is False

    _setup_anndata(adata, protein_expression_obsm_key="protein_expression")
    registered_frame = get_from_registry(adata, "protein_expression")
    assert registered_frame.to_numpy().flags["C_CONTIGUOUS"] is True
    assert np.array_equal(fortran_protein, registered_frame)

    registered_x = get_from_registry(adata, _CONSTANTS.X_KEY)
    assert np.array_equal(adata.X, registered_x)
    registered_protein = get_from_registry(adata, _CONSTANTS.PROTEIN_EXP_KEY)
    assert np.array_equal(adata.obsm["protein_expression"], registered_protein)
コード例 #20
0
def _load_prefrontalcortex_starmap(save_path: str = "data/",
                                   run_setup_anndata: bool = True) -> AnnData:
    """\
    Load the starMAP mouse pre-frontal cortex dataset (Wang et al., 2018): 3,704 cells, 166 genes.

    The loom "Clusters" and "BatchID" columns become ``obs["labels"]`` and
    ``obs["batch"]``; the two columns of ``obsm["Spatial_coordinates"]`` are
    copied to ``obs["x_coord"]`` and ``obs["y_coord"]``.
    """
    target_dir = os.path.abspath(save_path)
    save_fn = "mpfc-starmap.loom"
    url = "https://github.com/YosefLab/scVI-data/raw/master/mpfc-starmap.loom"
    _download(url, target_dir, save_fn)
    adata = _load_loom(os.path.join(target_dir, save_fn))

    # Re-home the loom annotation columns under the names used for setup.
    for new_key, loom_key in (("labels", "Clusters"), ("batch", "BatchID")):
        adata.obs[new_key] = adata.obs[loom_key].values
        del adata.obs[loom_key]

    for column, obs_key in enumerate(("x_coord", "y_coord")):
        adata.obs[obs_key] = adata.obsm["Spatial_coordinates"][:, column]

    if not run_setup_anndata:
        return adata
    _setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
コード例 #21
0
ファイル: test_anndata.py プロジェクト: vitkl/scvi-tools
def test_backed_anndata(save_path):
    """Check setup and item access on file-backed AnnData, for dense and sparse X."""
    scenarios = (
        ("test_data.h5ad", False),  # dense X
        ("test_data2.h5ad", True),  # sparse (CSR) X
    )
    for file_name, as_sparse in scenarios:
        in_memory = scvi.data.synthetic_iid()
        if as_sparse:
            in_memory.X = csr_matrix(in_memory.X)
        path = os.path.join(save_path, file_name)
        in_memory.write_h5ad(path)

        backed = anndata.read_h5ad(path, backed="r+")
        _setup_anndata(backed, batch_key="batch")

        # test get item
        torch_dataset = AnnTorchDataset(backed)
        torch_dataset[np.arange(backed.n_obs)]
コード例 #22
0
ファイル: _brain_large.py プロジェクト: vitkl/scvi-tools
def _load_brainlarge_dataset(
    save_path: str = "data/",
    run_setup_anndata: bool = True,
    sample_size_gene_var: int = 10000,
    max_cells_to_keep: int = None,
    n_genes_to_keep: int = 720,
    loading_batch_size: int = 100000,
) -> anndata.AnnData:
    """
    Fetch the 10x "1M neurons" file and load the brain-large dataset.

    The file is stored as ``brain_large.h5`` under ``save_path``. The sampling
    and loading options are forwarded unchanged to ``_load_brainlarge_file``.
    When ``run_setup_anndata`` is true, ``_setup_anndata`` is run on the result.
    """
    save_fn = "brain_large.h5"
    url = "http://cf.10xgenomics.com/samples/cell-exp/1.3.0/1M_neurons/1M_neurons_filtered_gene_bc_matrices_h5.h5"
    _download(url, save_path, save_fn)

    loader_options = {
        "sample_size_gene_var": sample_size_gene_var,
        "max_cells_to_keep": max_cells_to_keep,
        "n_genes_to_keep": n_genes_to_keep,
        "loading_batch_size": loading_batch_size,
    }
    adata = _load_brainlarge_file(os.path.join(save_path, save_fn),
                                  **loader_options)

    if not run_setup_anndata:
        return adata
    _setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
コード例 #23
0
ファイル: _pbmc.py プロジェクト: vitkl/scvi-tools
def _load_purified_pbmc_dataset(
    save_path: str = "data/",
    subset_datasets: List[str] = None,
    run_setup_anndata: bool = True,
) -> anndata.AnnData:
    """
    Fetch the purified PBMC dataset, optionally restricted to some populations.

    Parameters
    ----------
    save_path
        Directory used for the downloaded ``.h5ad`` file.
    subset_datasets
        Population names to keep; each must be one of the known names listed
        in the function body. Rows are kept in the order the names are given.
        If None, all cells are kept.
    run_setup_anndata
        If true, runs ``_setup_anndata`` on the dataset before returning.
    """
    save_fn = "PurifiedPBMCDataset.h5ad"
    url = "https://github.com/YosefLab/scVI-data/raw/master/PurifiedPBMCDataset.h5ad"
    _download(url, save_path, save_fn)
    adata = anndata.read(os.path.join(save_path, save_fn))

    dataset_names = [
        "cd4_t_helper",
        "regulatory_t",
        "naive_t",
        "memory_t",
        "cytotoxic_t",
        "naive_cytotoxic",
        "b_cells",
        "cd4_t_helper",
        "cd34",
        "cd56_nk",
        "cd14_monocytes",
    ]
    if subset_datasets is not None:
        # Collect the row positions of each requested population in turn.
        selected_rows = []
        for requested in subset_datasets:
            assert requested in dataset_names
            matches = np.where(adata.obs["cell_types"] == requested)[0]
            selected_rows.append(matches)
        adata = adata[np.concatenate(selected_rows)].copy()

    if not run_setup_anndata:
        return adata

    _setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
コード例 #24
0
    def setup_anndata(
        adata: AnnData,
        size_factor_key: str,
        batch_key: Optional[str] = None,
        layer: Optional[str] = None,
        categorical_covariate_keys: Optional[List[str]] = None,
        continuous_covariate_keys: Optional[List[str]] = None,
        copy: bool = False,
    ) -> Optional[AnnData]:
        """
        %(summary)s.

        Parameters
        ----------
        %(param_adata)s
        size_factor_key
            key in `adata.obs` with continuous valued size factors.
        %(param_batch_key)s
        %(param_layer)s
        %(param_cat_cov_keys)s
        %(param_cont_cov_keys)s
        %(param_copy)s

        Returns
        -------
        %(returns)s
        """
        # NOTE(review): the %(...)s placeholders above are presumably filled in
        # by a docstring-substitution decorator outside this view. The
        # continuous-covariate entry previously repeated the categorical
        # placeholder.
        setup_data = _setup_anndata(
            adata,
            batch_key=batch_key,
            layer=layer,
            categorical_covariate_keys=categorical_covariate_keys,
            continuous_covariate_keys=continuous_covariate_keys,
            copy=copy,
        )
        # `_setup_anndata` may return None (see the Optional return type); in
        # that case register the size factor on `adata` itself, otherwise on
        # the object it returned.
        register_tensor_from_anndata(
            adata if setup_data is None else setup_data,
            "_size_factor",
            "obs",
            size_factor_key,
        )
        return setup_data
コード例 #25
0
ファイル: _totalvi.py プロジェクト: vitkl/scvi-tools
    def setup_anndata(
        adata: AnnData,
        protein_expression_obsm_key: str,
        protein_names_uns_key: Optional[str] = None,
        batch_key: Optional[str] = None,
        layer: Optional[str] = None,
        categorical_covariate_keys: Optional[List[str]] = None,
        continuous_covariate_keys: Optional[List[str]] = None,
        copy: bool = False,
    ) -> Optional[AnnData]:
        """
        %(summary)s.

        Parameters
        ----------
        %(param_adata)s
        protein_expression_obsm_key
            key in `adata.obsm` for protein expression data.
        protein_names_uns_key
            key in `adata.uns` for protein names. If None, will use the column names of `adata.obsm[protein_expression_obsm_key]`
            if it is a DataFrame, else will assign sequential names to proteins.
        %(param_batch_key)s
        %(param_layer)s
        %(param_cat_cov_keys)s
        %(param_cont_cov_keys)s
        %(param_copy)s

        Returns
        -------
        %(returns)s
        """
        # NOTE(review): the %(...)s placeholders above are presumably filled in
        # by a docstring-substitution decorator outside this view; keep them
        # verbatim.
        # Forward the options this wrapper exposes, unchanged, to the shared
        # setup routine.
        forwarded = {
            "batch_key": batch_key,
            "layer": layer,
            "protein_expression_obsm_key": protein_expression_obsm_key,
            "protein_names_uns_key": protein_names_uns_key,
            "categorical_covariate_keys": categorical_covariate_keys,
            "continuous_covariate_keys": continuous_covariate_keys,
            "copy": copy,
        }
        return _setup_anndata(adata, **forwarded)
コード例 #26
0
ファイル: test_anndata.py プロジェクト: vitkl/scvi-tools
def test_setup_anndata():
    """Cover the main ``_setup_anndata`` paths.

    Checks the regular setup, rejection of views, protein names taken from
    DataFrame columns, use of a layer, and the defaults when no batch or
    labels key is given.
    """
    protein_kwargs = {
        "protein_expression_obsm_key": "protein_expression",
        "protein_names_uns_key": "protein_names",
    }

    # Regular setup: each registered field must mirror its source in the AnnData.
    adata = synthetic_iid(run_setup_anndata=False)
    _setup_anndata(adata, batch_key="batch", labels_key="labels", **protein_kwargs)

    registered_batch = get_from_registry(adata, "batch_indices")
    expected_batch = np.array(adata.obs["_scvi_batch"]).reshape((-1, 1))
    np.testing.assert_array_equal(registered_batch, expected_batch)

    registered_labels = get_from_registry(adata, "labels")
    expected_labels = np.array(adata.obs["labels"].cat.codes).reshape((-1, 1))
    np.testing.assert_array_equal(registered_labels, expected_labels)

    np.testing.assert_array_equal(get_from_registry(adata, "X"), adata.X)

    registered_protein = get_from_registry(adata, "protein_expression")
    np.testing.assert_array_equal(registered_protein,
                                  adata.obsm["protein_expression"])
    np.testing.assert_array_equal(adata.uns["_scvi"]["protein_names"],
                                  adata.uns["protein_names"])

    # A view of an AnnData must be rejected.
    adata = synthetic_iid()
    with pytest.raises(ValueError):
        _setup_anndata(adata[1])

    # With a DataFrame in obsm and no protein_names_uns_key, the protein names
    # come from the DataFrame's columns.
    adata = synthetic_iid()
    new_protein_names = np.array(random.sample(range(100), 100)).astype("str")
    protein_frame = pd.DataFrame(
        adata.obsm["protein_expression"],
        index=adata.obs_names,
        columns=new_protein_names,
    )
    adata.obsm["protein_expression"] = protein_frame
    _setup_anndata(adata, protein_expression_obsm_key="protein_expression")
    np.testing.assert_array_equal(adata.uns["_scvi"]["protein_names"],
                                  new_protein_names)

    # With layer="X", the registry must serve the layer rather than adata.X.
    adata = synthetic_iid()
    true_x = adata.X
    adata.layers["X"] = true_x
    adata.X = np.ones_like(adata.X)
    _setup_anndata(adata, layer="X")
    np.testing.assert_array_equal(get_from_registry(adata, "X"), true_x)

    # Without batch_key and labels_key, all-zero batch and labels are registered.
    adata = synthetic_iid()
    _setup_anndata(adata, **protein_kwargs)
    all_zero = np.zeros((adata.shape[0], 1))
    np.testing.assert_array_equal(get_from_registry(adata, "batch_indices"),
                                  all_zero)
    np.testing.assert_array_equal(get_from_registry(adata, "labels"),
                                  np.zeros((adata.shape[0], 1)))
コード例 #27
0
ファイル: test_anndata.py プロジェクト: vitkl/scvi-tools
def test_transfer_anndata_setup():
    """Exercise ``transfer_anndata_setup`` on matching and mismatching targets.

    Covers: plain transfer, transfer when a layer was used, unknown batch and
    unknown label (both must raise ``ValueError``), correctness of the label
    mapping, a missing batch column (``KeyError``), the single-batch default,
    and the implicit transfer done by a model when the target's categories
    are a subset of the source's.
    """
    # test transfer_anndata function
    adata1 = synthetic_iid(run_setup_anndata=False)
    adata2 = synthetic_iid(run_setup_anndata=False)
    adata2.X = adata1.X
    _setup_anndata(adata1)
    transfer_anndata_setup(adata1, adata2)
    np.testing.assert_array_equal(adata1.obs["_scvi_labels"],
                                  adata2.obs["_scvi_labels"])

    # test if layer was used initially, again used in transfer setup
    adata1 = synthetic_iid(run_setup_anndata=False)
    adata2 = synthetic_iid(run_setup_anndata=False)
    raw_counts = adata1.X.copy()
    adata1.layers["raw"] = raw_counts
    adata2.layers["raw"] = raw_counts
    zeros = np.zeros_like(adata1.X)
    ones = np.ones_like(adata1.X)
    adata1.X = zeros
    adata2.X = ones
    _setup_anndata(adata1, layer="raw")
    transfer_anndata_setup(adata1, adata2)
    np.testing.assert_array_equal(adata1.obs["_scvi_labels"],
                                  adata2.obs["_scvi_labels"])

    # test that an unknown batch throws an error
    adata1 = synthetic_iid()
    adata2 = synthetic_iid(run_setup_anndata=False)
    adata2.obs["batch"] = [2] * adata2.n_obs
    with pytest.raises(ValueError):
        transfer_anndata_setup(adata1, adata2)

    # TODO: test that a batch with wrong dtype throws an error
    # adata1 = synthetic_iid()
    # adata2 = synthetic_iid(run_setup_anndata=False)
    # adata2.obs["batch"] = ["0"] * adata2.n_obs
    # with pytest.raises(ValueError):
    #     transfer_anndata_setup(adata1, adata2)

    # test that an unknown label throws an error
    adata1 = synthetic_iid()
    adata2 = synthetic_iid(run_setup_anndata=False)
    adata2.obs["labels"] = ["label_123"] * adata2.n_obs
    with pytest.raises(ValueError):
        transfer_anndata_setup(adata1, adata2)

    # test that correct mapping was applied
    adata1 = synthetic_iid()
    adata2 = synthetic_iid(run_setup_anndata=False)
    adata2.obs["labels"] = ["label_1"] * adata2.n_obs
    transfer_anndata_setup(adata1, adata2)
    labels_mapping = adata1.uns["_scvi"]["categorical_mappings"][
        "_scvi_labels"]["mapping"]
    correct_label = np.where(labels_mapping == "label_1")[0][0]
    # This comparison used to be a bare expression, so it checked nothing.
    # ``.iloc`` makes the lookup explicitly positional.
    assert adata2.obs["_scvi_labels"].iloc[0] == correct_label

    # test that transfer_anndata_setup correctly looks for adata.obs['batch']
    adata1 = synthetic_iid()
    adata2 = synthetic_iid(run_setup_anndata=False)
    del adata2.obs["batch"]
    with pytest.raises(KeyError):
        transfer_anndata_setup(adata1, adata2)

    # test that transfer_anndata_setup assigns same batch and label to cells
    # if the original anndata was also same batch and label
    adata1 = synthetic_iid(run_setup_anndata=False)
    _setup_anndata(adata1)
    adata2 = synthetic_iid(run_setup_anndata=False)
    del adata2.obs["batch"]
    transfer_anndata_setup(adata1, adata2)
    assert adata2.obs["_scvi_batch"].iloc[0] == 0
    assert adata2.obs["_scvi_labels"].iloc[0] == 0

    # test that if a category mapping is a subset, transfer anndata is called
    a1 = scvi.data.synthetic_iid()
    a2 = scvi.data.synthetic_iid(run_setup_anndata=False)
    a2.obs["batch"] = "batch_1"
    scvi.model.SCVI.setup_anndata(a2, batch_key="batch")
    m = scvi.model.SCVI(a1)
    m.train(1)
    m.get_latent_representation(a2)
    # Compare element-wise first: ``series.all() == 1`` only checks that every
    # code is non-zero, not that every code equals 1.
    assert (a2.obs["_scvi_batch"] == 1).all()
コード例 #28
0
ファイル: test_models.py プロジェクト: vitkl/scvi-tools
def test_scvi(save_path):
    """Exercise the SCVI model API on synthetic data.

    Covers training, the inference helpers (ELBO, marginal likelihood,
    reconstruction error, latent representation, normalized expression,
    posterior predictive samples, feature correlations, likelihood
    parameters, latent library size), transfer of the AnnData setup to new
    objects, differential expression, ``transform_batch`` input types and
    training callbacks.

    ``save_path`` is accepted but not used in the body
    (presumably a pytest fixture -- confirm against conftest).
    """
    n_latent = 5

    # Test with observed lib size.
    adata = synthetic_iid(run_setup_anndata=False)
    SCVI.setup_anndata(
        adata,
        batch_key="batch",
        labels_key="labels",
    )
    model = SCVI(adata, n_latent=n_latent)
    model.train(1, check_val_every_n_epoch=1, train_size=0.5)

    # Same data with use_observed_lib_size=False. train() is called twice;
    # the history-length assertion below expects exactly two entries.
    model = SCVI(
        adata,
        n_latent=n_latent,
        var_activation=Softplus(),
        use_observed_lib_size=False,
    )
    model.train(1, check_val_every_n_epoch=1, train_size=0.5)
    model.train(1, check_val_every_n_epoch=1, train_size=0.5)

    # tests __repr__
    print(model)

    assert model.is_trained is True
    z = model.get_latent_representation()
    assert z.shape == (adata.shape[0], n_latent)
    assert len(model.history["elbo_train"]) == 2
    model.get_elbo()
    model.get_marginal_ll(n_mc_samples=3)
    model.get_reconstruction_error()
    model.get_normalized_expression(transform_batch="batch_1")

    # Same helpers, but on a second AnnData passed in explicitly.
    adata2 = synthetic_iid()
    model.get_elbo(adata2)
    model.get_marginal_ll(adata2, n_mc_samples=3)
    model.get_reconstruction_error(adata2)
    latent = model.get_latent_representation(adata2, indices=[1, 2, 3])
    assert latent.shape == (3, n_latent)
    denoised = model.get_normalized_expression(adata2)
    assert denoised.shape == adata.shape

    denoised = model.get_normalized_expression(
        adata2, indices=[1, 2, 3], transform_batch="batch_1"
    )
    denoised = model.get_normalized_expression(
        adata2, indices=[1, 2, 3], transform_batch=["batch_0", "batch_1"]
    )
    assert denoised.shape == (3, adata2.n_vars)
    sample = model.posterior_predictive_sample(adata2)
    assert sample.shape == adata2.shape
    sample = model.posterior_predictive_sample(
        adata2, indices=[1, 2, 3], gene_list=["1", "2"]
    )
    assert sample.shape == (3, 2)
    sample = model.posterior_predictive_sample(
        adata2, indices=[1, 2, 3], gene_list=["1", "2"], n_samples=3
    )
    assert sample.shape == (3, 2, 3)

    model.get_feature_correlation_matrix(correlation_type="pearson")
    model.get_feature_correlation_matrix(
        adata2,
        indices=[1, 2, 3],
        correlation_type="spearman",
        rna_size_factor=500,
        n_samples=5,
    )
    model.get_feature_correlation_matrix(
        adata2,
        indices=[1, 2, 3],
        correlation_type="spearman",
        rna_size_factor=500,
        n_samples=5,
        transform_batch=["batch_0", "batch_1"],
    )
    params = model.get_likelihood_parameters()
    assert params["mean"].shape == adata.shape
    assert (
        params["mean"].shape
        == params["dispersions"].shape
        == params["dropout"].shape
    )
    params = model.get_likelihood_parameters(adata2, indices=[1, 2, 3])
    assert params["mean"].shape == (3, adata.n_vars)
    params = model.get_likelihood_parameters(
        adata2, indices=[1, 2, 3], n_samples=3, give_mean=True
    )
    assert params["mean"].shape == (3, adata.n_vars)
    model.get_latent_library_size()
    model.get_latent_library_size(adata2, indices=[1, 2, 3])

    # test transfer_anndata_setup
    adata2 = synthetic_iid(run_setup_anndata=False)
    transfer_anndata_setup(adata, adata2)
    model.get_elbo(adata2)

    # test automatic transfer_anndata_setup + on a view
    adata = synthetic_iid()
    model = SCVI(adata)
    adata2 = synthetic_iid(run_setup_anndata=False)
    model.get_elbo(adata2[:10])

    # test that we catch incorrect mappings
    adata = synthetic_iid()
    adata2 = synthetic_iid(run_setup_anndata=False)
    transfer_anndata_setup(adata, adata2)
    adata2.uns["_scvi"]["categorical_mappings"]["_scvi_labels"][
        "mapping"
    ] = np.array(["label_4", "label_0", "label_2"])
    with pytest.raises(ValueError):
        model.get_elbo(adata2)

    # test that same mapping different order doesn't raise error
    adata = synthetic_iid()
    adata2 = synthetic_iid(run_setup_anndata=False)
    transfer_anndata_setup(adata, adata2)
    adata2.uns["_scvi"]["categorical_mappings"]["_scvi_labels"][
        "mapping"
    ] = np.array(["label_1", "label_0", "label_2"])
    model.get_elbo(adata2)  # should automatically transfer setup

    # test mismatched categories raises ValueError
    adata2 = synthetic_iid(run_setup_anndata=False)
    # rename_categories(..., inplace=True) no longer exists in pandas >= 2.0;
    # assign the renamed Series back instead, which works on every version.
    adata2.obs["labels"] = adata2.obs["labels"].cat.rename_categories(
        ["a", "b", "c"]
    )
    with pytest.raises(ValueError):
        model.get_elbo(adata2)

    # test differential expression
    model.differential_expression(groupby="labels", group1="label_1")
    model.differential_expression(
        groupby="labels", group1="label_1", group2="label_2", mode="change"
    )
    model.differential_expression(groupby="labels")
    model.differential_expression(idx1=[0, 1, 2], idx2=[3, 4, 5])
    model.differential_expression(idx1=[0, 1, 2])

    # transform batch works with all different types
    # (here the batch column holds floats: 1.0 for the first 64 cells, else 0.0)
    a = synthetic_iid(run_setup_anndata=False)
    batch = np.zeros(a.n_obs)
    batch[:64] += 1
    a.obs["batch"] = batch
    _setup_anndata(a, batch_key="batch")
    m = SCVI(a)
    m.train(1, train_size=0.5)
    m.get_normalized_expression(transform_batch=1)
    m.get_normalized_expression(transform_batch=[0, 1])

    # test get_likelihood_parameters() when dispersion=='gene-cell'
    model = SCVI(adata, dispersion="gene-cell")
    model.get_likelihood_parameters()

    # test train callbacks work
    a = synthetic_iid()
    m = scvi.model.SCVI(a)
    lr_monitor = LearningRateMonitor()
    m.train(
        callbacks=[lr_monitor],
        max_epochs=10,
        check_val_every_n_epoch=1,
        log_every_n_steps=1,
        plan_kwargs={"reduce_lr_on_plateau": True},
    )
    assert "lr-Adam" in m.history.keys()
コード例 #29
0
ファイル: _pbmc.py プロジェクト: vitkl/scvi-tools
def _load_pbmc_dataset(
    save_path: str = "data/",
    run_setup_anndata: bool = True,
    remove_extracted_data: bool = True,
) -> anndata.AnnData:
    """Build the combined PBMC 8k + 4k AnnData with its metadata attached.

    Downloads ``gene_info.csv`` and ``pbmc_metadata.pickle`` into
    ``save_path``, loads the ``pbmc8k`` and ``pbmc4k`` 10x datasets,
    concatenates them, keeps only the cells listed in the metadata and the
    genes listed in the ``ENSG`` column of the gene info table, then copies
    the metadata into ``obs``, ``obsm``, ``uns`` and ``var``.

    Parameters
    ----------
    save_path
        Directory the files are downloaded to and read from.
    run_setup_anndata
        If true, call ``_setup_anndata`` with ``batch_key="batch"`` and
        ``labels_key="labels"`` before returning.
    remove_extracted_data
        Forwarded unchanged to ``_load_dataset_10x`` for both 10x datasets.

    Returns
    -------
    The assembled ``AnnData`` object.
    """
    urls = [
        "https://github.com/YosefLab/scVI-data/raw/master/gene_info.csv",
        "https://github.com/YosefLab/scVI-data/raw/master/pbmc_metadata.pickle",
    ]
    save_fns = ["gene_info_pbmc.csv", "pbmc_metadata.pickle"]

    for url, save_fn in zip(urls, save_fns):
        _download(url, save_path, save_fn)

    de_metadata = pd.read_csv(
        os.path.join(save_path, "gene_info_pbmc.csv"), sep=","
    )
    # NOTE(security): this unpickles a file fetched from a remote URL, and
    # unpickling can execute arbitrary code. Only safe while that source is
    # trusted.
    pbmc_metadata = pd.read_pickle(
        os.path.join(save_path, "pbmc_metadata.pickle")
    )
    pbmc8k = _load_dataset_10x(
        "pbmc8k",
        save_path=save_path,
        var_names="gene_ids",
        remove_extracted_data=remove_extracted_data,
    )
    pbmc4k = _load_dataset_10x(
        "pbmc4k",
        save_path=save_path,
        var_names="gene_ids",
        remove_extracted_data=remove_extracted_data,
    )
    barcodes = np.concatenate((pbmc8k.obs_names, pbmc4k.obs_names))

    adata = pbmc8k.concatenate(pbmc4k)
    # Restore the original barcodes as observation names after concatenation.
    adata.obs_names = barcodes

    # Map each barcode to its row position in the concatenated object.
    dict_barcodes = dict(zip(barcodes, np.arange(len(barcodes))))
    # Builtin ``str`` replaces the ``np.str`` alias removed in NumPy 1.24.
    barcodes_metadata = (
        pbmc_metadata["barcodes"].index.values.ravel().astype(str)
    )
    # barcodes with end -11 filtered on 10X website (49 cells)
    subset_cells = [
        dict_barcodes[barcode]
        for barcode in barcodes_metadata
        if barcode in dict_barcodes
    ]
    adata = adata[np.asarray(subset_cells), :].copy()
    # Row mask over the metadata tables; builtin ``bool`` replaces the
    # ``np.bool`` alias removed in NumPy 1.24.
    idx_metadata = np.asarray(
        [not barcode.endswith("11") for barcode in barcodes_metadata],
        dtype=bool,
    )

    # only keep the genes for which we have de data; genes absent from the
    # loaded matrix are dropped (non empty only for unit tests)
    available_genes = set(adata.var_names)
    genes_to_keep = [
        gene
        for gene in de_metadata["ENSG"].values
        if gene in available_genes
    ]

    adata = adata[:, genes_to_keep].copy()
    design = pbmc_metadata["design"][idx_metadata]
    raw_qc = pbmc_metadata["raw_qc"][idx_metadata]
    normalized_qc = pbmc_metadata["normalized_qc"][idx_metadata]

    # Re-index the metadata tables by the cell names of the final object
    # before storing them in obsm.
    design.index = adata.obs_names
    raw_qc.index = adata.obs_names
    normalized_qc.index = adata.obs_names
    adata.obs["batch"] = adata.obs["batch"].astype(np.int64)
    adata.obsm["design"] = design
    adata.obsm["raw_qc"] = raw_qc
    adata.obsm["normalized_qc"] = normalized_qc

    adata.obsm["qc_pc"] = pbmc_metadata["qc_pc"][idx_metadata]
    labels = pbmc_metadata["clusters"][idx_metadata]
    cell_types = pbmc_metadata["list_clusters"]
    adata.obs["labels"] = labels
    adata.uns["cell_types"] = cell_types
    adata.obs["str_labels"] = [cell_types[i] for i in labels]

    # Per-gene total counts (column sums of X), flattened to 1-D.
    adata.var["n_counts"] = np.squeeze(np.asarray(np.sum(adata.X, axis=0)))

    if run_setup_anndata:
        _setup_anndata(adata, batch_key="batch", labels_key="labels")
    return adata
コード例 #30
0
ファイル: test_anndata.py プロジェクト: vitkl/scvi-tools
def test_anntorchdataset_getitem():
    """Check the keys, dtypes and container type of ``AnnTorchDataset`` items.

    Indexes the dataset with ``bd[1]`` under several configurations: an
    explicit list of tensors, a dict of tensors with requested dtypes, the
    default (all registered tensors), and inputs where ``X`` or the protein
    expression is sparse or a DataFrame.
    """
    adata = synthetic_iid()
    _setup_anndata(
        adata,
        batch_key="batch",
        labels_key="labels",
        protein_expression_obsm_key="protein_expression",
        protein_names_uns_key="protein_names",
    )
    # check that we can successfully pass in a list of tensors to get
    tensors_to_get = ["batch_indices", "labels"]
    bd = AnnTorchDataset(adata, getitem_tensors=tensors_to_get)
    np.testing.assert_array_equal(tensors_to_get, list(bd[1].keys()))

    # check that we can successfully pass in a dict of tensors and their associated types
    # ``np.int`` was removed in NumPy 1.24; request int64 explicitly because
    # that is the precision asserted right below.
    bd = AnnTorchDataset(
        adata, getitem_tensors={"X": np.int64, "labels": np.int64}
    )
    assert bd[1]["X"].dtype == np.int64
    assert bd[1]["labels"].dtype == np.int64

    # check that by default we get all the registered tensors
    bd = AnnTorchDataset(adata)
    all_registered_tensors = list(adata.uns["_scvi"]["data_registry"].keys())
    np.testing.assert_array_equal(all_registered_tensors, list(bd[1].keys()))
    n_vars = bd.adata.uns["_scvi"]["summary_stats"]["n_vars"]
    assert bd[1]["X"].shape[0] == n_vars

    # The loops below use an exact type check (not isinstance) so that
    # ndarray subclasses such as np.matrix are rejected as well.

    # check that AnnTorchDataset returns numpy array
    adata1 = synthetic_iid()
    bd = AnnTorchDataset(adata1)
    for _, value in bd[1].items():
        assert type(value) is np.ndarray

    # check AnnTorchDataset returns numpy array counts were sparse
    adata = synthetic_iid(run_setup_anndata=False)
    adata.X = sparse.csr_matrix(adata.X)
    _setup_anndata(adata)
    bd = AnnTorchDataset(adata)
    for _, value in bd[1].items():
        assert type(value) is np.ndarray

    # check AnnTorchDataset returns numpy array if pro exp was sparse
    adata = synthetic_iid(run_setup_anndata=False)
    adata.obsm["protein_expression"] = sparse.csr_matrix(
        adata.obsm["protein_expression"]
    )
    _setup_anndata(
        adata,
        batch_key="batch",
        protein_expression_obsm_key="protein_expression",
    )
    bd = AnnTorchDataset(adata)
    for _, value in bd[1].items():
        assert type(value) is np.ndarray

    # check pro exp is being returned as numpy array even if its DF
    adata = synthetic_iid(run_setup_anndata=False)
    adata.obsm["protein_expression"] = pd.DataFrame(
        adata.obsm["protein_expression"], index=adata.obs_names
    )
    _setup_anndata(
        adata,
        batch_key="batch",
        protein_expression_obsm_key="protein_expression",
    )
    bd = AnnTorchDataset(adata)
    for _, value in bd[1].items():
        assert type(value) is np.ndarray