Exemple #1
0
def test_extra_covariates_transfer():
    """Transfer a covariate-aware setup to a second AnnData, then extend a category.

    The source data gets two continuous and two categorical covariates; the
    target is first transferred as-is, then given an unseen ``cat1`` value and
    transferred again with ``extend_categories=True``.
    """
    continuous_keys = ["cont1", "cont2"]
    categorical_keys = ["cat1", "cat2"]

    adata = synthetic_iid()
    n_source = adata.shape[0]
    for key in continuous_keys:
        adata.obs[key] = np.random.normal(size=(n_source,))
    for key in categorical_keys:
        adata.obs[key] = np.random.randint(0, 5, size=(n_source,))

    adata_manager = generic_setup_adata_manager(
        adata,
        batch_key="batch",
        labels_key="labels",
        protein_expression_obsm_key="protein_expression",
        protein_names_uns_key="protein_names",
        continuous_covariate_keys=continuous_keys,
        categorical_covariate_keys=categorical_keys,
    )

    bdata = synthetic_iid()
    n_target = bdata.shape[0]
    for key in continuous_keys:
        bdata.obs[key] = np.random.normal(size=(n_target,))
    bdata.obs["cat1"] = 0
    bdata.obs["cat2"] = 1

    # Plain transfer: every target value is inside the 0..4 range drawn for the source.
    adata_manager.transfer_setup(bdata)

    # Now introduce a value outside that range and allow the categories to grow.
    bdata.obs["cat1"] = 6
    bdata_manager = adata_manager.transfer_setup(bdata, extend_categories=True)
    cat_registry = bdata_manager.get_state_registry(REGISTRY_KEYS.CAT_COVS_KEY)
    assert cat_registry.mappings["cat1"][-1] == 6
Exemple #2
0
def test_gimvi():
    """Smoke-test GIMVI with explicit setup and check that mismatched var names raise."""
    adata_seq = synthetic_iid()
    adata_spatial = synthetic_iid()
    for dataset in (adata_seq, adata_spatial):
        GIMVI.setup_anndata(dataset, batch_key="batch", labels_key="labels")

    model = GIMVI(adata_seq, adata_spatial, n_latent=10)
    # With default arguments only the first dataset gets library-size statistics.
    assert hasattr(model.module, "library_log_means_0")
    assert not hasattr(model.module, "library_log_means_1")

    model.train(1, check_val_every_n_epoch=1, train_size=0.5)
    model.get_latent_representation()
    model.get_imputed_values()

    # Rename the spatial genes, re-register, and expect construction to fail.
    adata_spatial.var_names += "asdf"
    GIMVI.setup_anndata(adata_spatial, batch_key="batch", labels_key="labels")
    with pytest.raises(ValueError):
        GIMVI(adata_seq, adata_spatial)
Exemple #3
0
def test_totalvi_online_update(save_path):
    """Load query data into TOTALVI from a saved directory and from an in-memory model."""

    def _make_query_adata():
        # Fresh, un-registered data with its batches renamed to batch_2 / batch_3.
        query = synthetic_iid(run_setup_anndata=False)
        query.obs["batch"] = query.obs.batch.cat.rename_categories(
            ["batch_2", "batch_3"]
        )
        return query

    # basic case: train a reference model and persist it
    reference_adata = synthetic_iid()
    reference_model = TOTALVI(reference_adata, n_latent=5, use_batch_norm="decoder")
    reference_model.train(1, check_val_every_n_epoch=1)
    model_dir = os.path.join(save_path, "saved_model/")
    reference_model.save(model_dir, overwrite=True)

    query_model = TOTALVI.load_query_data(_make_query_adata(), model_dir)
    assert query_model.module.background_pro_alpha.requires_grad is True
    query_model.train(max_epochs=1)
    query_model.get_latent_representation()

    # batch 3 has no proteins
    no_protein_query = _make_query_adata()
    batch_3_rows = no_protein_query.obs.batch == "batch_3"
    no_protein_query.obsm["protein_expression"][batch_3_rows] = 0

    # load from model in memory
    memory_model = TOTALVI.load_query_data(no_protein_query, reference_model)
    # Bare lookups: the test only requires that indexing these entries does not raise.
    memory_model.module.protein_batch_mask[2]
    memory_model.module.protein_batch_mask[3]
    memory_model.train(max_epochs=1)
    memory_model.get_latent_representation()
Exemple #4
0
def test_gimvi():
    """Smoke-test GIMVI through the trainer-based API and its history bookkeeping."""
    adata_seq = synthetic_iid()
    adata_spatial = synthetic_iid()
    model = GIMVI(adata_seq, adata_spatial, n_latent=10)
    model.get_latent_representation()
    model.get_imputed_values()
    model.train(1, frequency=1, early_stopping_kwargs=None, train_size=0.5)

    # Every tracked ELBO series is expected to hold exactly two entries.
    for history_key in (
        "elbo_train_0",
        "elbo_train_1",
        "elbo_test_0",
        "elbo_test_1",
    ):
        assert len(model.history[history_key]) == 2

    trainer = model.trainer
    # Building the frame checks that the loss magnitudes fit a 3 x 2 layout.
    loss_table = pd.DataFrame(
        trainer.get_loss_magnitude(),
        index=["reconstruction", "kl_divergence", "discriminator"],
        columns=["Sequencing", "Spatial"],
    )
    loss_table.columns.name = "Dataset"
    loss_table.index.name = "Loss"
    trainer.get_discriminator_confusion()

    # Mismatched var names between the two datasets must be rejected.
    adata_spatial.var_names += "asdf"
    with pytest.raises(ValueError):
        GIMVI(adata_seq, adata_spatial)
Exemple #5
0
def test_extra_covariates_transfer():
    """Transfer an ``uns``-based setup with extra covariates and extend a category."""
    continuous_keys = ["cont1", "cont2"]
    categorical_keys = ["cat1", "cat2"]

    adata = synthetic_iid()
    n_source = adata.shape[0]
    for key in continuous_keys:
        adata.obs[key] = np.random.normal(size=(n_source,))
    for key in categorical_keys:
        adata.obs[key] = np.random.randint(0, 5, size=(n_source,))
    setup_anndata(
        adata,
        batch_key="batch",
        labels_key="labels",
        protein_expression_obsm_key="protein_expression",
        protein_names_uns_key="protein_names",
        continuous_covariate_keys=continuous_keys,
        categorical_covariate_keys=categorical_keys,
    )

    bdata = synthetic_iid()
    n_target = bdata.shape[0]
    for key in continuous_keys:
        bdata.obs[key] = np.random.normal(size=(n_target,))
    bdata.obs["cat1"] = 0
    bdata.obs["cat2"] = 1

    transfer_anndata_setup(adata_source=adata, adata_target=bdata)

    # give it a new category: drop the previous registration first, then
    # transfer again while allowing the categorical mapping to grow
    del bdata.uns["_scvi"]
    bdata.obs["cat1"] = 6
    transfer_anndata_setup(
        adata_source=adata,
        adata_target=bdata,
        extend_categories=True,
    )
    cat1_mapping = bdata.uns["_scvi"]["extra_categoricals"]["mappings"]["cat1"]
    assert cat1_mapping[-1] == 6
Exemple #6
0
def test_scvi_library_size_update(save_path):
    """Check library-size statistics before and after loading query data into SCVI."""
    adata1 = synthetic_iid()
    model = SCVI(adata1, n_latent=5, use_observed_lib_size=False)

    # Reference model: one column per batch, all means non-zero.
    ref_means = getattr(model.module, "library_log_means", None)
    assert ref_means is not None
    assert model.module.library_log_means.shape == (1, 2)
    assert model.module.library_log_means.count_nonzero().item() == 2
    ref_vars = getattr(model.module, "library_log_vars", None)
    assert ref_vars is not None
    assert model.module.library_log_vars.shape == (1, 2)

    model.train(1, check_val_every_n_epoch=1)
    dir_path = os.path.join(save_path, "saved_model/")
    model.save(dir_path, overwrite=True)

    # also test subset var option
    adata2 = synthetic_iid(run_setup_anndata=False, n_genes=110)
    adata2.obs["batch"] = adata2.obs.batch.cat.rename_categories(
        ["batch_2", "batch_3"]
    )
    model2 = SCVI.load_query_data(
        adata2,
        dir_path,
        inplace_subset_query_vars=True,
    )

    # Query model: the first two columns are carried over from the reference.
    assert getattr(model2.module, "library_log_means", None) is not None
    assert model2.module.library_log_means.shape == (1, 4)
    assert model2.module.library_log_means[:, :2].equal(
        model.module.library_log_means
    )
    assert model2.module.library_log_means.count_nonzero().item() == 4

    assert getattr(model2.module, "library_log_vars", None) is not None
    assert model2.module.library_log_vars.shape == (1, 4)
    assert model2.module.library_log_vars[:, :2].equal(model.module.library_log_vars)
Exemple #7
0
def test_lda_model():
    """Train AmortizedLDA with several prior specifications and validate its outputs."""
    use_gpu = torch.cuda.is_available()
    n_topics = 5

    def _fit(lda, max_epochs):
        # Shared training configuration for every model in this test.
        lda.train(
            max_epochs=max_epochs,
            batch_size=256,
            lr=0.01,
            use_gpu=use_gpu,
        )

    def _assert_topic_proportions(latent, n_obs):
        # One row per cell, entries in [0, 1], rows summing to one.
        assert latent.shape == (n_obs, n_topics)
        assert np.all((latent <= 1) & (latent >= 0))
        assert np.allclose(latent.sum(axis=1), 1)

    adata = synthetic_iid(run_setup_anndata=False)

    # Test with float and Sequence priors.
    AmortizedLDA.setup_anndata(adata)
    mod1 = AmortizedLDA(
        adata,
        n_topics=n_topics,
        cell_topic_prior=1.5,
        topic_feature_prior=1.5,
    )
    _fit(mod1, 1)
    mod2 = AmortizedLDA(
        adata,
        n_topics=n_topics,
        cell_topic_prior=[1.5] * n_topics,
        topic_feature_prior=[1.5] * adata.n_vars,
    )
    _fit(mod2, 1)

    # Default priors, trained a little longer, then inspected.
    mod = AmortizedLDA(adata, n_topics=n_topics)
    _fit(mod, 5)
    feature_by_topic = mod.get_feature_by_topic().to_numpy()
    assert np.allclose(feature_by_topic.sum(axis=0), 1)
    _assert_topic_proportions(
        mod.get_latent_representation(adata).to_numpy(), adata.n_obs
    )
    mod.get_elbo()
    mod.get_perplexity()

    # The same checks on a second, separately registered dataset.
    adata2 = synthetic_iid(run_setup_anndata=False)
    AmortizedLDA.setup_anndata(adata2)
    _assert_topic_proportions(
        mod.get_latent_representation(adata2).to_numpy(), adata2.n_obs
    )
    mod.get_elbo(adata2)
    mod.get_perplexity(adata2)
Exemple #8
0
    def test_save_and_load(save_path, legacy=False):
        """Round-trip a trained GIMVI model through save/load.

        ``legacy`` selects ``legacy_save`` instead of ``model.save`` for writing.
        """
        prefix = "GIMVI_"
        adata = synthetic_iid()
        GIMVI.setup_anndata(adata, batch_key="batch")
        adata2 = synthetic_iid()
        GIMVI.setup_anndata(adata2, batch_key="batch")

        # GIMVI: two successive latent computations must agree exactly.
        model = GIMVI(adata, adata2)
        model.train(3, train_size=0.5)
        z1 = model.get_latent_representation([adata])
        z2 = model.get_latent_representation([adata])
        np.testing.assert_array_equal(z1, z2)

        save_kwargs = {"overwrite": True, "save_anndata": True, "prefix": prefix}
        if legacy:
            legacy_save(model, save_path, **save_kwargs)
        else:
            model.save(save_path, **save_kwargs)

        # Load relying on the AnnData objects stored next to the model.
        model = GIMVI.load(save_path, prefix=prefix)
        model.get_latent_representation()

        # Datasets with a different gene count must be rejected.
        tmp_adata = scvi.data.synthetic_iid(n_genes=200)
        tmp_adata2 = scvi.data.synthetic_iid(n_genes=200)
        with pytest.raises(ValueError):
            GIMVI.load(
                save_path,
                adata_seq=tmp_adata,
                adata_spatial=tmp_adata2,
                prefix=prefix,
            )

        # Load with the original data: latents are reproduced exactly.
        model = GIMVI.load(
            save_path,
            adata_seq=adata,
            adata_spatial=adata2,
            prefix=prefix,
        )
        z2 = model.get_latent_representation([adata])
        np.testing.assert_array_equal(z1, z2)

        # Load with use_gpu=False: latents only need to agree to 3 decimals.
        model = GIMVI.load(
            save_path,
            adata_seq=adata,
            adata_spatial=adata2,
            use_gpu=False,
            prefix=prefix,
        )
        z2 = model.get_latent_representation([adata])
        np.testing.assert_almost_equal(z1, z2, decimal=3)
        assert model.is_trained is True
Exemple #9
0
def test_gimvi():
    """Smoke-test GIMVI training and inference; mismatched var names must raise."""
    adata_seq = synthetic_iid()
    adata_spatial = synthetic_iid()

    model = GIMVI(adata_seq, adata_spatial, n_latent=10)
    model.train(1, check_val_every_n_epoch=1, train_size=0.5)
    model.get_latent_representation()
    model.get_imputed_values()

    # After renaming the spatial genes the two datasets no longer line up.
    adata_spatial.var_names += "asdf"
    with pytest.raises(ValueError):
        GIMVI(adata_seq, adata_spatial)
Exemple #10
0
def test_scanvi(save_path):
    """Exercise SCANVI training, logging, prediction, DE and ``from_scvi_model``."""
    adata = synthetic_iid(run_setup_anndata=False)
    SCANVI.setup_anndata(adata, batch_key="batch", labels_key="labels")
    model = SCANVI(adata, "label_0", n_latent=10)
    model.train(1, train_size=0.5, check_val_every_n_epoch=1)

    # Metrics that must have been logged during training.
    logged_keys = model.history.keys()
    for expected_key in (
        "elbo_validation",
        "reconstruction_loss_validation",
        "kl_local_validation",
        "elbo_train",
        "reconstruction_loss_train",
        "kl_local_train",
        "classification_loss_validation",
    ):
        assert expected_key in logged_keys

    # Prediction on a second dataset, hard and soft, with and without indices.
    adata2 = synthetic_iid()
    subset = [1, 2, 3]
    predictions = model.predict(adata2, indices=subset)
    assert len(predictions) == 3
    model.predict()
    soft_predictions = model.predict(adata2, soft=True)
    assert isinstance(soft_predictions, pd.DataFrame)
    model.predict(adata2, soft=True, indices=subset)
    model.get_normalized_expression(adata2)
    model.differential_expression(groupby="labels", group1="label_1")
    model.differential_expression(
        groupby="labels",
        group1="label_1",
        group2="label_2",
    )

    # test that all data labeled runs ("asdf" matches no label), then a mix of
    # labeled and unlabeled data ("label_0" is treated as unlabeled)
    for unknown_label, train_kwargs in (
        ("asdf", {}),
        ("label_0", {"train_size": 0.9}),
    ):
        a = scvi.data.synthetic_iid()
        scvi.model.SCANVI.setup_anndata(a, batch_key="batch", labels_key="labels")
        m = scvi.model.SCANVI(a, unknown_label)
        m.train(1, **train_kwargs)

    # test from_scvi_model
    a = scvi.data.synthetic_iid()
    m = scvi.model.SCVI(a, use_observed_lib_size=False)
    a2 = scvi.data.synthetic_iid()
    scanvi_model = scvi.model.SCANVI.from_scvi_model(m, "label_0", adata=a2)
    scanvi_model = scvi.model.SCANVI.from_scvi_model(
        m,
        "label_0",
        use_labels_groups=False,
    )
    scanvi_model.train(1)
Exemple #11
0
def test_gimvi_model_library_size():
    """With ``model_library_size=[True, True]`` both datasets get library statistics."""
    adata_seq = synthetic_iid()
    adata_spatial = synthetic_iid()
    model = GIMVI(
        adata_seq,
        adata_spatial,
        model_library_size=[True, True],
        n_latent=10,
    )

    assert hasattr(model.module, "library_log_means_0")
    assert hasattr(model.module, "library_log_means_1")

    model.train(1, check_val_every_n_epoch=1, train_size=0.5)
    model.get_latent_representation()
    model.get_imputed_values()
Exemple #12
0
    def test_save_load_model(cls, adata, save_path, prefix=None, legacy=False):
        """Round-trip a model of class ``cls`` through save/load and verify the result.

        ``legacy`` selects ``legacy_save`` instead of ``model.save`` for writing.
        """
        setup_kwargs = {"batch_key": "batch", "labels_key": "labels"}
        if cls is TOTALVI:
            # TOTALVI additionally needs the protein fields registered.
            setup_kwargs["protein_expression_obsm_key"] = "protein_expression"
            setup_kwargs["protein_names_uns_key"] = "protein_names"
        cls.setup_anndata(adata, **setup_kwargs)

        model = cls(adata, latent_distribution="normal")
        model.train(1, train_size=0.2)
        z1 = model.get_latent_representation(adata)
        test_idx1 = model.validation_indices

        save_kwargs = {"overwrite": True, "save_anndata": True, "prefix": prefix}
        if legacy:
            legacy_save(model, save_path, **save_kwargs)
        else:
            model.save(save_path, **save_kwargs)

        # Load relying on the AnnData stored next to the model.
        model = cls.load(save_path, prefix=prefix)
        model.get_latent_representation()

        # Load with mismatched genes.
        mismatched_genes = synthetic_iid(n_genes=200)
        with pytest.raises(ValueError):
            cls.load(save_path, adata=mismatched_genes, prefix=prefix)

        # Load with different batches.
        renamed_batches = synthetic_iid()
        renamed_batches.obs["batch"] = renamed_batches.obs[
            "batch"
        ].cat.rename_categories(["batch_2", "batch_3"])
        with pytest.raises(ValueError):
            cls.load(save_path, adata=renamed_batches, prefix=prefix)

        # Load with the original data and check the registry and the outputs.
        model = cls.load(save_path, adata=adata, prefix=prefix)
        registry = model.adata_manager.data_registry
        assert "batch" in registry
        assert model.adata_manager.data_registry["batch"] == {
            "attr_name": "obs",
            "attr_key": "_scvi_batch",
        }

        z2 = model.get_latent_representation()
        test_idx2 = model.validation_indices
        np.testing.assert_array_equal(z1, z2)
        np.testing.assert_array_equal(test_idx1, test_idx2)
        assert model.is_trained is True
Exemple #13
0
def test_view_anndata_setup(save_path):
    """Check that ``view_anndata_setup`` accepts every supported input kind.

    Covered inputs: a registered AnnData, its ``uns["_scvi"]`` dict, a saved
    model folder (with and without the AnnData), and a path to an ``.h5ad``
    file. Unregistered data and unsupported types must raise ``ValueError``.
    """

    def _set_obs_value(target, row, column, value):
        # Single-step positional write. The chained form
        # ``target.obs[column][row] = value`` assigns into an intermediate
        # Series, which is not guaranteed to write through to the frame.
        obs = target.obs
        obs.iloc[row, obs.columns.get_loc(column)] = value

    adata = synthetic_iid(run_setup_anndata=False)
    # NOTE(review): ``uniform(5, adata.n_obs)`` draws ONE scalar from
    # [5, n_obs) which is broadcast to the whole column — confirm that a
    # per-cell draw was not intended.
    adata.obs["cont1"] = np.random.uniform(5, adata.n_obs)
    adata.obs["cont2"] = np.random.uniform(5, adata.n_obs)
    # Extreme magnitudes / many digits to stress the number formatting.
    _set_obs_value(
        adata, 0, "cont1", 939543895847598301.423432423523512351234123421341234
    )
    _set_obs_value(adata, 1, "cont2", 0.12938471298374691827634)

    adata.obs["cat1"] = np.random.randint(0, 5, adata.n_obs).astype(str)
    _set_obs_value(adata, 8, "cat1", "asdf")
    _set_obs_value(adata, 9, "cat1", "f34")
    adata.obs["cat2"] = np.random.randint(0, 7, adata.n_obs)

    setup_anndata(
        adata,
        protein_expression_obsm_key="protein_expression",
        batch_key="batch",
        labels_key="labels",
        categorical_covariate_keys=["cat1", "cat2"],
        continuous_covariate_keys=["cont1", "cont2"],
    )
    # test it works with adata
    view_anndata_setup(adata)

    # test it works with scvi setup dict
    view_anndata_setup(adata.uns["_scvi"])

    adata = scvi.data.synthetic_iid()
    m = scvi.model.SCVI(adata)
    folder_path = os.path.join(save_path, "tmp")
    m.save(folder_path, save_anndata=True)

    # test it works with a saved model folder
    view_anndata_setup(folder_path)
    adata_path = os.path.join(folder_path, "adata.h5ad")
    # test it works with the path to an anndata
    view_anndata_setup(adata_path)

    m = scvi.model.SCVI(adata)
    m.save(folder_path, overwrite=True)
    # test it works without saving the anndata
    view_anndata_setup(folder_path)

    # test it throws error if adata was not setup; the fixture is built
    # outside the ``raises`` block so only the call under test can satisfy it
    adata = synthetic_iid(run_setup_anndata=False)
    with pytest.raises(ValueError):
        view_anndata_setup(adata)

    # test it throws error if we dont pass dict, anndata or str in
    with pytest.raises(ValueError):
        view_anndata_setup(0)
Exemple #14
0
def test_scanvi_online_update(save_path):
    """Load query data into SCANVI for semi-observed and fully-observed references."""
    n_latent = 5
    dir_path = os.path.join(save_path, "saved_model/")

    def _train_and_save_reference(reference_adata):
        # Register, fit and persist a reference model under ``dir_path``.
        setup_anndata(reference_adata, batch_key="batch", labels_key="labels")
        reference = SCANVI(
            reference_adata, "Unknown", n_latent=n_latent, encode_covariates=True
        )
        reference.train(
            n_epochs_unsupervised=1, n_epochs_semisupervised=1, frequency=1
        )
        reference.save(dir_path, overwrite=True)

    def _make_query_adata():
        # Un-registered data with its batches renamed to batch_2 / batch_3.
        query = synthetic_iid(run_setup_anndata=False)
        query.obs["batch"] = query.obs.batch.cat.rename_categories(
            ["batch_2", "batch_3"]
        )
        return query

    # ref has semi-observed labels
    adata1 = synthetic_iid(run_setup_anndata=False)
    new_labels = adata1.obs.labels.to_numpy()
    new_labels[0] = "Unknown"
    adata1.obs["labels"] = pd.Categorical(new_labels)
    _train_and_save_reference(adata1)

    adata2 = _make_query_adata()
    adata2.obs["labels"] = "Unknown"

    model = SCANVI.load_query_data(adata2, dir_path, freeze_batchnorm_encoder=True)
    model.train(
        n_epochs_unsupervised=1,
        n_epochs_semisupervised=1,
        train_base_model=False,
    )
    model.get_latent_representation()
    model.predict()

    # ref has fully-observed labels
    adata1 = synthetic_iid(run_setup_anndata=False)
    new_labels = adata1.obs.labels.to_numpy()
    adata1.obs["labels"] = pd.Categorical(new_labels)
    _train_and_save_reference(adata1)

    # query has one new label
    adata2 = _make_query_adata()
    new_labels = adata2.obs.labels.to_numpy()
    new_labels[0] = "Unknown"
    adata2.obs["labels"] = pd.Categorical(new_labels)

    model = SCANVI.load_query_data(adata2, dir_path, freeze_batchnorm_encoder=True)
    # Force every query cell to be treated as unlabeled.
    model._unlabeled_indices = np.arange(adata2.n_obs)
    model._labeled_indices = []
    model.train(
        n_epochs_unsupervised=1,
        n_epochs_semisupervised=1,
        train_base_model=False,
    )
    model.get_latent_representation()
    model.predict()
def test_scanvi():
    """Smoke-test SCANVI prediction and DE with ``undefined_*`` label names."""
    adata = synthetic_iid()
    model = SCANVI(adata, "undefined_0", n_latent=10)
    model.train(1)

    adata2 = synthetic_iid()
    subset = [1, 2, 3]
    predictions = model.predict(adata2, indices=subset)
    assert len(predictions) == 3

    model.predict()
    model.predict(adata2, soft=True)
    model.predict(adata2, soft=True, indices=subset)
    model.get_normalized_expression(adata2)

    model.differential_expression(groupby="labels", group1="undefined_1")
    model.differential_expression(
        groupby="labels",
        group1="undefined_1",
        group2="undefined_2",
    )
Exemple #16
0
def test_scanvi():
    """Smoke-test SCANVI and check the lengths of both trainer histories."""
    adata = synthetic_iid()
    model = SCANVI(adata, "label_0", n_latent=10)
    model.train(1, train_size=0.5, frequency=1)

    # Expected number of recorded entries per trainer history.
    expected_history_lengths = (
        ("unsupervised_trainer_history", 2),
        ("semisupervised_trainer_history", 7),
    )
    for history_key, expected_length in expected_history_lengths:
        assert len(model.history[history_key]) == expected_length

    adata2 = synthetic_iid()
    subset = [1, 2, 3]
    predictions = model.predict(adata2, indices=subset)
    assert len(predictions) == 3

    model.predict()
    model.predict(adata2, soft=True)
    model.predict(adata2, soft=True, indices=subset)
    model.get_normalized_expression(adata2)

    model.differential_expression(groupby="labels", group1="label_1")
    model.differential_expression(
        groupby="labels",
        group1="label_1",
        group2="label_2",
    )
Exemple #17
0
def test_pyro_bayesian_regression_jit():
    """Fit the Bayesian regression module with a JIT-traced ELBO and sample from it."""
    use_gpu = int(torch.cuda.is_available())
    adata = synthetic_iid()
    pyro.clear_param_store()
    model = BayesianRegressionModule(adata.shape[1], 1)
    # One loader shared by the guide warm-up callback, the fit and the
    # predictive loop (the original built a second, unused loader up front).
    train_dl = AnnDataLoader(adata, shuffle=True, batch_size=128)
    plan = PyroTrainingPlan(model, loss_fn=pyro.infer.JitTrace_ELBO())
    trainer = Trainer(
        gpus=use_gpu,
        max_epochs=2,
        callbacks=[PyroJitGuideWarmup(train_dl)],
    )
    trainer.fit(plan, train_dl)

    # 100 features, 1 for sigma, 1 for bias
    assert list(model.guide.parameters())[0].shape[0] == 102

    if use_gpu == 1:
        model.cuda()

    # test Predictive: draw samples for every batch; results are discarded,
    # the loop only has to run without raising
    num_samples = 5
    predictive = model.create_predictive(num_samples=num_samples)
    for tensor_dict in train_dl:
        args, kwargs = model._get_fn_args_from_batch(tensor_dict)
        _ = {
            k: v.detach().cpu().numpy()
            for k, v in predictive(*args, **kwargs).items()
            if k != "obs"
        }
Exemple #18
0
def test_save_best_state_callback(save_path):
    """Train SCVI with a verbose ``SaveBestState`` callback attached."""
    adata = synthetic_iid()
    SCVI.setup_anndata(adata, batch_key="batch", labels_key="labels")
    model = SCVI(adata, n_latent=5)
    model.train(
        3,
        check_val_every_n_epoch=1,
        train_size=0.5,
        callbacks=[SaveBestState(verbose=True)],
    )
Exemple #19
0
def test_multiple_covariates_scvi(save_path):
    """Train SCVI, SCANVI and TOTALVI on data carrying extra covariates."""

    def _covariate_kwargs():
        # Fresh lists on every call so each setup receives its own objects.
        return {
            "continuous_covariate_keys": ["cont1", "cont2"],
            "categorical_covariate_keys": ["cat1", "cat2"],
        }

    adata = synthetic_iid()
    n_cells = adata.shape[0]
    for key in ("cont1", "cont2"):
        adata.obs[key] = np.random.normal(size=(n_cells,))
    for key in ("cat1", "cat2"):
        adata.obs[key] = np.random.randint(0, 5, size=(n_cells,))

    SCVI.setup_anndata(
        adata,
        batch_key="batch",
        labels_key="labels",
        **_covariate_kwargs(),
    )
    scvi_model = SCVI(adata)
    scvi_model.train(1)

    # SCANVI is built on the same registration as SCVI above.
    scanvi_model = SCANVI(adata, unlabeled_category="Unknown")
    scanvi_model.train(1)

    TOTALVI.setup_anndata(
        adata,
        batch_key="batch",
        protein_expression_obsm_key="protein_expression",
        protein_names_uns_key="protein_names",
        **_covariate_kwargs(),
    )
    totalvi_model = TOTALVI(adata)
    totalvi_model.train(1)
Exemple #20
0
def test_multivi():
    """Smoke-test MULTIVI training variants and its accessor methods."""
    adata = synthetic_iid(run_setup_anndata=False)
    MULTIVI.setup_anndata(adata, batch_key="batch")
    model = MULTIVI(adata, n_genes=50, n_regions=50)

    # Three training runs: without best-state saving, without adversarial
    # mixing, and with defaults.
    model.train(1, save_best=False)
    model.train(1, adversarial_mixing=False)
    model.train(3)

    model.get_elbo(indices=model.validation_indices)
    for estimate_kwargs in (
        {},
        {"normalize_cells": True},
        {"normalize_regions": True},
    ):
        model.get_accessibility_estimates(**estimate_kwargs)
    model.get_normalized_expression()
    model.get_library_size_factors()
    model.get_region_factors()
    model.get_reconstruction_error(indices=model.validation_indices)
    model.get_latent_representation()

    model.differential_accessibility(groupby="labels", group1="label_1")
    model.differential_expression(groupby="labels", group1="label_1")
Exemple #21
0
def test_pyro_bayesian_train_sample_mixin_with_local_full_data():
    """Train on the full data in one batch, then sample the posterior incl. local sites."""
    use_gpu = torch.cuda.is_available()
    adata = synthetic_iid()
    mod = BayesianRegressionModel(adata, per_cell_weight=True)
    mod.train(
        max_epochs=2,
        batch_size=None,
        lr=0.01,
        train_size=1,  # does not work when there is a validation set.
        use_gpu=use_gpu,
    )

    # The guide's linear weight is expected to cover the 100 features.
    guide_state = mod.module.guide.state_dict()
    weight_shape = guide_state["locs.linear.weight_unconstrained"].shape
    assert list(weight_shape) == [1, 100]

    # test posterior sampling
    samples = mod.sample_posterior(
        num_samples=10,
        use_gpu=use_gpu,
        batch_size=adata.n_obs,
        return_samples=True,
    )
    posterior = samples["posterior_samples"]
    assert len(posterior["sigma"]) == 10
    assert posterior["per_cell_weights"].shape == (10, adata.n_obs, 1)
Exemple #22
0
def test_destvi(save_path):
    """Fit CondSCVI, then DestVI under every amortization scheme, checking shapes."""
    # Step1 learn CondSCVI
    n_latent = 2
    n_labels = 5
    n_layers = 2
    dataset = synthetic_iid(n_labels=n_labels)
    sc_model = CondSCVI(dataset, n_latent=n_latent, n_layers=n_layers)
    sc_model.train(1, train_size=1)

    # step 2 learn destVI with multiple amortization scheme
    n_query_cells = 50
    for amor_scheme in ("both", "none", "proportion", "latent"):
        spatial_model = DestVI.from_rna_model(
            dataset,
            sc_model,
            amortization=amor_scheme,
        )
        spatial_model.train(max_epochs=1)

        # The first recorded training ELBO must be a real number.
        first_elbo = spatial_model.history["elbo_train"].values[0][0]
        assert not np.isnan(first_elbo)

        proportions = spatial_model.get_proportions()
        assert proportions.shape == (dataset.n_obs, n_labels)

        gamma = spatial_model.get_gamma(return_numpy=True)
        assert gamma.shape == (dataset.n_obs, n_latent, n_labels)

        scale = spatial_model.get_scale_for_ct("label_0", np.arange(n_query_cells))
        assert scale.shape == (n_query_cells, dataset.n_vars)
Exemple #23
0
def test_pyro_bayesian_regression(save_path):
    """Fit the Bayesian regression module, sample from it, and round-trip its state dict."""

    def _guide_medians(module):
        # Median (0.5 quantile) of sigma and of the linear weights, as numpy arrays.
        quants = module.guide.quantiles([0.5])
        sigma = quants["sigma"][0].detach().cpu().numpy()
        linear = quants["linear.weight"][0].detach().cpu().numpy()
        return sigma, linear

    use_gpu = int(torch.cuda.is_available())
    adata = synthetic_iid()
    train_dl = AnnDataLoader(adata, shuffle=True, batch_size=128)
    pyro.clear_param_store()
    model = BayesianRegressionModule(adata.shape[1], 1)
    plan = PyroTrainingPlan(model)
    plan.n_obs_training = len(train_dl.indices)
    trainer = Trainer(gpus=use_gpu, max_epochs=2)
    trainer.fit(plan, train_dl)
    if use_gpu == 1:
        model.cuda()

    # test Predictive: results are discarded, the loop only has to run
    num_samples = 5
    predictive = model.create_predictive(num_samples=num_samples)
    for tensor_dict in train_dl:
        args, kwargs = model._get_fn_args_from_batch(tensor_dict)
        _ = {
            name: value.detach().cpu().numpy()
            for name, value in predictive(*args, **kwargs).items()
            if name != "obs"
        }

    # test save and load
    # cpu/gpu has minor difference
    model.cpu()
    sigma_median, linear_median = _guide_medians(model)

    model_save_path = os.path.join(save_path, "model_params.pt")
    torch.save(model.state_dict(), model_save_path)

    pyro.clear_param_store()
    new_model = BayesianRegressionModule(adata.shape[1], 1)
    try:
        new_model.load_state_dict(torch.load(model_save_path))
    except RuntimeError as err:
        if not isinstance(new_model, PyroBaseModuleClass):
            raise err
        # run model one step to get autoguide params, then retry the load
        plan = PyroTrainingPlan(new_model)
        plan.n_obs_training = len(train_dl.indices)
        trainer = Trainer(gpus=use_gpu, max_steps=1)
        trainer.fit(plan, train_dl)
        new_model.load_state_dict(torch.load(model_save_path))

    sigma_median_new, linear_median_new = _guide_medians(new_model)

    np.testing.assert_array_equal(sigma_median_new, sigma_median)
    np.testing.assert_array_equal(linear_median_new, linear_median)
Exemple #24
0
def test_stereoscope(save_path):
    """Train RNAStereoscope and SpatialStereoscope, round-tripping both through disk."""
    n_labels = 5
    dataset = synthetic_iid(n_labels=n_labels, run_setup_anndata=False)
    RNAStereoscope.setup_anndata(dataset, labels_key="labels")

    # train with no proportions
    sc_model = RNAStereoscope(dataset)
    sc_model.train(max_epochs=1)

    # train again with proportions
    uniform_weights = np.ones((n_labels,))
    sc_model = RNAStereoscope(dataset, ct_weights=uniform_weights)
    sc_model.train(max_epochs=1)
    # test save/load
    sc_model.save(save_path, overwrite=True, save_anndata=True)
    sc_model = RNAStereoscope.load(save_path)

    st_model = SpatialStereoscope.from_rna_model(
        dataset,
        sc_model,
        prior_weight="minibatch",
    )
    st_model.train(max_epochs=1)
    st_model.get_proportions()
    # test save/load
    st_model.save(save_path, overwrite=True, save_anndata=True)
    st_model = SpatialStereoscope.load(save_path)
    st_model.get_proportions()

    # try imputation code
    cell_type_labels = np.array(["label_0"] * 50)
    st_model.get_scale_for_ct(cell_type_labels)
Exemple #25
0
def test_peakvi():
    """Smoke-test PEAKVI under several constructor options and its accessors."""
    adata = synthetic_iid()

    # Single-epoch runs with one optional component switched off at a time.
    for model_kwargs in ({"model_depth": False}, {"region_factors": False}):
        model = PEAKVI(adata, **model_kwargs)
        model.train(1, save_best=False)

    # Default configuration, trained slightly longer, then inspected.
    model = PEAKVI(adata)
    model.train(3)
    model.get_elbo(indices=model.validation_indices)
    for estimate_kwargs in (
        {},
        {"normalize_cells": True},
        {"normalize_regions": True},
    ):
        model.get_accessibility_estimates(**estimate_kwargs)
    model.get_library_size_factors()
    model.get_region_factors()
    model.get_reconstruction_error(indices=model.validation_indices)
    model.get_latent_representation()
    model.differential_accessibility(groupby="labels", group1="label_1")
Exemple #26
0
def test_lda_model_save_load(save_path):
    """Save and reload AmortizedLDA; history must match and outputs agree to 2 decimals."""
    use_gpu = torch.cuda.is_available()
    n_samples = 5000

    adata = synthetic_iid(run_setup_anndata=False)
    AmortizedLDA.setup_anndata(adata)
    mod = AmortizedLDA(adata, n_topics=5)
    mod.train(max_epochs=5, batch_size=256, lr=0.01, use_gpu=use_gpu)
    elbo_before = mod.history_["elbo_train"]

    topics_before = mod.get_feature_by_topic(n_samples=n_samples)
    latent_before = mod.get_latent_representation(n_samples=n_samples)

    model_dir = os.path.join(save_path, "tmp")
    mod.save(model_dir, overwrite=True, save_anndata=True)
    mod = AmortizedLDA.load(model_dir)

    # The training history is expected to survive the round trip unchanged.
    np.testing.assert_array_equal(mod.history_["elbo_train"], elbo_before)

    # Outputs are only compared to two decimals.
    topics_after = mod.get_feature_by_topic(n_samples=n_samples)
    latent_after = mod.get_latent_representation(n_samples=n_samples)
    np.testing.assert_almost_equal(
        topics_before.to_numpy(),
        topics_after.to_numpy(),
        decimal=2,
    )
    np.testing.assert_almost_equal(
        latent_before.to_numpy(),
        latent_after.to_numpy(),
        decimal=2,
    )
Exemple #27
0
def test_data_splitter():
    """Check DataSplitter split proportions and its rejection of bad sizes.

    Valid configurations must partition every observation into
    train/validation/test in the expected fractions; invalid size arguments
    must raise ``ValueError`` somewhere between construction and dataloader
    creation.
    """
    a = synthetic_iid()
    adata_manager = generic_setup_adata_manager(
        a, batch_key="batch", labels_key="labels"
    )

    def split_counts(**size_kwargs):
        """Set up a DataSplitter and return (n_train, n_validation, n_test)."""
        splitter = DataSplitter(adata_manager, **size_kwargs)
        splitter.setup()
        # Build every dataloader once before reading the index arrays.
        splitter.train_dataloader()
        splitter.val_dataloader()
        splitter.test_dataloader()
        n_train = len(splitter.train_idx)
        n_val = 0 if splitter.val_idx is None else len(splitter.val_idx)
        n_test = 0 if splitter.test_idx is None else len(splitter.test_idx)
        return n_train, n_val, n_test

    valid_cases = [
        # validation_size left unset: the remainder all goes to validation
        ({"train_size": 0.4}, (0.4, 0.6, 0)),
        # explicit validation_size: the remainder becomes the test split
        ({"train_size": 0.4, "validation_size": 0.3}, (0.4, 0.3, 0.3)),
    ]
    for size_kwargs, (train_frac, val_frac, test_frac) in valid_cases:
        n_train_idx, n_validation_idx, n_test_idx = split_counts(**size_kwargs)

        assert n_train_idx + n_validation_idx + n_test_idx == a.n_obs
        assert np.isclose(n_train_idx / a.n_obs, train_frac)
        assert np.isclose(n_validation_idx / a.n_obs, val_frac)
        assert np.isclose(n_test_idx / a.n_obs, test_frac)

    invalid_cases = [
        # train_size must satisfy 0 < train_size <= 1
        ({"train_size": 2}, ("train_dataloader",)),
        ({"train_size": -2}, ("train_dataloader",)),
        # validation_size must satisfy 0 <= validation_size < 1
        ({"train_size": 0.1, "validation_size": 1}, ("val_dataloader",)),
        ({"train_size": 0.1, "validation_size": -1}, ("val_dataloader",)),
        # train_size + validation_size must not exceed 1
        (
            {"train_size": 1, "validation_size": 0.1},
            ("train_dataloader", "val_dataloader"),
        ),
    ]
    for size_kwargs, loader_names in invalid_cases:
        with pytest.raises(ValueError):
            splitter = DataSplitter(adata_manager, **size_kwargs)
            splitter.setup()
            for loader_name in loader_names:
                getattr(splitter, loader_name)()
Exemple #28
0
def test_solo(save_path):
    """Train SOLO from a fitted SCVI model, with and without a second AnnData.

    Each SOLO model is trained for one epoch, must record a
    ``validation_loss`` entry in its history, and must be able to predict.

    Parameters
    ----------
    save_path
        Not used in the body of this test.
    """

    def train_and_predict(solo_model):
        """Run one training epoch, check validation was logged, then predict."""
        solo_model.train(1, check_val_every_n_epoch=1, train_size=0.9)
        assert "validation_loss" in solo_model.history.keys()
        solo_model.predict()

    adata = synthetic_iid(run_setup_anndata=False)
    setup_anndata(adata)
    scvi_model = SCVI(adata, n_latent=5)
    scvi_model.train(1, check_val_every_n_epoch=1, train_size=0.5)

    # SOLO built directly from the SCVI model.
    train_and_predict(SOLO.from_scvi_model(scvi_model))

    # SOLO built from the SCVI model plus a separate, un-set-up AnnData.
    other_adata = synthetic_iid(run_setup_anndata=False)
    train_and_predict(SOLO.from_scvi_model(scvi_model, other_adata))
Exemple #29
0
def test_data_format():
    """Check that setup stores X and protein data C-contiguously, values intact.

    Two inputs are covered: dense Fortran-ordered ndarrays for both ``X`` and
    the protein ``obsm`` entry, and a protein ``obsm`` entry given as a
    DataFrame backed by a Fortran-ordered array.
    """
    protein_key = "protein_expression"

    def c_contiguous(array):
        """Return the C_CONTIGUOUS flag of a numpy array."""
        return array.flags["C_CONTIGUOUS"]

    def assert_registry_matches(adata, manager):
        """Assert the registry returns the same X and protein data as adata."""
        assert np.array_equal(
            adata.X, manager.get_from_registry(REGISTRY_KEYS.X_KEY)
        )
        assert np.array_equal(
            adata.obsm[protein_key],
            manager.get_from_registry(REGISTRY_KEYS.PROTEIN_EXP_KEY),
        )

    # Case 1: dense ndarrays handed over in Fortran (column-major) order.
    adata = synthetic_iid()
    x_before = adata.X
    protein_before = adata.obsm[protein_key]
    obs_before = adata.obs
    adata.X = np.asfortranarray(x_before)
    adata.obsm[protein_key] = np.asfortranarray(protein_before)
    assert c_contiguous(adata.X) is False
    assert c_contiguous(adata.obsm[protein_key]) is False

    adata_manager = generic_setup_adata_manager(
        adata, protein_expression_obsm_key=protein_key
    )
    # Setup must have converted both arrays to C order ...
    assert c_contiguous(adata.X) is True
    assert c_contiguous(adata.obsm[protein_key]) is True

    # ... while leaving the stored values equal to what was there before.
    assert np.array_equal(x_before, adata.X)
    assert np.array_equal(protein_before, adata.obsm[protein_key])
    assert np.array_equal(obs_before, adata.obs)

    assert_registry_matches(adata, adata_manager)

    # Case 2: the protein obsm entry is a DataFrame over a Fortran-ordered array.
    adata = synthetic_iid()
    pe = np.asfortranarray(adata.obsm[protein_key])
    adata.obsm[protein_key] = pd.DataFrame(pe, index=adata.obs_names)
    assert c_contiguous(adata.obsm[protein_key].to_numpy()) is False

    adata_manager = generic_setup_adata_manager(
        adata, protein_expression_obsm_key=protein_key
    )
    registered_pe = adata_manager.get_from_registry(
        REGISTRY_KEYS.PROTEIN_EXP_KEY
    )
    assert c_contiguous(registered_pe.to_numpy()) is True
    assert np.array_equal(pe, registered_pe)

    assert_registry_matches(adata, adata_manager)
Exemple #30
0
def test_saving_and_loading(save_path):
    """Check that several model classes give identical results after save/load.

    SCVI, LinearSCVI and TOTALVI go through a shared helper; AUTOZI and
    SCANVI are exercised inline because they compare different outputs
    (alpha/beta posteriors and predictions, respectively).

    Parameters
    ----------
    save_path
        Base directory; models are written to its ``tmp`` sub-directory.
    """
    def test_save_load_model(cls, adata, save_path):
        """Train ``cls`` on ``adata``, save it, reload it, and compare outputs."""
        model = cls(adata, latent_distribution="normal")
        model.train(1, train_size=0.2)
        z1 = model.get_latent_representation(adata)
        # NOTE: despite the name, this holds the model's validation indices.
        test_idx1 = model.validation_indices
        model.save(save_path, overwrite=True, save_anndata=True)
        # Load without passing an AnnData (one was saved with the model above).
        model = cls.load(save_path)
        model.get_latent_representation()
        # Loading with this other AnnData must be rejected with a ValueError
        # (presumably because its gene count differs from the training data).
        tmp_adata = scvi.data.synthetic_iid(n_genes=200)
        with pytest.raises(ValueError):
            cls.load(save_path, tmp_adata)
        # Reload against the original AnnData and compare with pre-save results.
        model = cls.load(save_path, adata)
        z2 = model.get_latent_representation()
        test_idx2 = model.validation_indices
        np.testing.assert_array_equal(z1, z2)
        np.testing.assert_array_equal(test_idx1, test_idx2)
        assert model.is_trained is True

    # Every model below is saved to (and overwrites) the same directory.
    save_path = os.path.join(save_path, "tmp")
    adata = synthetic_iid()

    for cls in [SCVI, LinearSCVI, TOTALVI]:
        print(cls)
        test_save_load_model(cls, adata, save_path)

    # AUTOZI
    # Same round trip as the helper, but comparing the alpha/beta posteriors.
    model = AUTOZI(adata, latent_distribution="normal")
    model.train(1, train_size=0.5)
    ab1 = model.get_alphas_betas()
    model.save(save_path, overwrite=True, save_anndata=True)
    model = AUTOZI.load(save_path)
    model.get_latent_representation()
    tmp_adata = scvi.data.synthetic_iid(n_genes=200)
    with pytest.raises(ValueError):
        AUTOZI.load(save_path, tmp_adata)
    model = AUTOZI.load(save_path, adata)
    ab2 = model.get_alphas_betas()
    np.testing.assert_array_equal(ab1["alpha_posterior"],
                                  ab2["alpha_posterior"])
    np.testing.assert_array_equal(ab1["beta_posterior"], ab2["beta_posterior"])
    assert model.is_trained is True

    # SCANVI
    # Same round trip again, this time comparing the output of predict().
    model = SCANVI(adata, "label_0")
    model.train(max_epochs=1, train_size=0.5)
    p1 = model.predict()
    model.save(save_path, overwrite=True, save_anndata=True)
    model = SCANVI.load(save_path)
    model.get_latent_representation()
    tmp_adata = scvi.data.synthetic_iid(n_genes=200)
    with pytest.raises(ValueError):
        SCANVI.load(save_path, tmp_adata)
    model = SCANVI.load(save_path, adata)
    p2 = model.predict()
    np.testing.assert_array_equal(p1, p2)
    assert model.is_trained is True