Example No. 1
def totalvi_benchmark(dataset, n_epochs, use_cuda=True):
    totalvae = TOTALVI(dataset.nb_genes,
                       len(dataset.protein_names),
                       n_batch=dataset.n_batches)
    trainer = TotalTrainer(totalvae,
                           dataset,
                           train_size=0.5,
                           use_cuda=use_cuda,
                           early_stopping_kwargs=None)
    trainer.train(n_epochs=n_epochs)
    trainer.test_set.reconstruction_error()
    trainer.test_set.marginal_ll()

    trainer.test_set.get_protein_background_mean()
    trainer.test_set.get_latent()
    trainer.test_set.generate()
    trainer.test_set.get_sample_dropout()
    trainer.test_set.get_normalized_denoised_expression(transform_batch=0)
    trainer.test_set.imputation()
    trainer.test_set.get_protein_mean()
    trainer.test_set.one_vs_all_degenes(n_samples=2, M_permutation=10)
    trainer.test_set.generate_feature_correlation_matrix(n_samples=2)
    trainer.test_set.generate_feature_correlation_matrix(n_samples=2,
                                                         transform_batch=0)

    return trainer
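
Note: the snippets in this listing are shown without their imports. A minimal sketch of the imports they appear to rely on, assuming the legacy scvi 0.x (pre-scvi-tools) module layout, is given below; the exact module paths and the module-level use_cuda flag are assumptions, not taken from the snippets themselves.

# Hypothetical imports for the snippets in this listing, assuming the legacy
# scvi 0.x (pre-scvi-tools) package layout; adjust paths to the installed version.
import os
import tempfile

import numpy as np
import torch

from scvi.dataset import (
    CellMeasurement,
    CortexDataset,
    Dataset10X,
    GeneExpressionDataset,
    SyntheticDataset,
)
from scvi.inference import (
    JVAETrainer,
    TotalPosterior,
    TotalTrainer,
    UnsupervisedTrainer,
    load_posterior,
)
from scvi.models import JVAE, TOTALVI, VAE, Classifier

# Module-level flag used throughout the tests (an assumption; the original
# test suite defines it once at import time).
use_cuda = torch.cuda.is_available()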
Example No. 2
    def test_special_dataset_size(self):
        gene_dataset = GeneExpressionDataset()
        x = np.random.randint(1, 100, (17 * 2, 10))
        y = np.random.randint(1, 100, (17 * 2, 10))
        gene_dataset.populate_from_data(x)
        protein_data = CellMeasurement(
            name="protein_expression",
            data=y,
            columns_attr_name="protein_names",
            columns=np.arange(10),
        )
        gene_dataset.initialize_cell_measurement(protein_data)

        # Test UnsupervisedTrainer
        vae = VAE(
            gene_dataset.nb_genes,
            n_batch=gene_dataset.n_batches,
            n_labels=gene_dataset.n_labels,
        )
        trainer = UnsupervisedTrainer(
            vae,
            gene_dataset,
            train_size=0.5,
            use_cuda=False,
            data_loader_kwargs={"batch_size": 8},
        )
        trainer.train(n_epochs=1)

        # Test JVAETrainer
        jvae = JVAE(
            [gene_dataset.nb_genes, gene_dataset.nb_genes],
            gene_dataset.nb_genes,
            [slice(None)] * 2,
            ["zinb", "zinb"],
            [True, True],
            n_batch=1,
        )
        cls = Classifier(gene_dataset.nb_genes, n_labels=2, logits=True)
        trainer = JVAETrainer(
            jvae,
            cls,
            [gene_dataset, gene_dataset],
            train_size=0.5,
            use_cuda=False,
            data_loader_kwargs={"batch_size": 8},
        )
        trainer.train(n_epochs=1)

        totalvae = TOTALVI(gene_dataset.nb_genes,
                           len(gene_dataset.protein_names))
        trainer = TotalTrainer(
            totalvae,
            gene_dataset,
            train_size=0.5,
            use_cuda=False,
            data_loader_kwargs={"batch_size": 8},
            early_stopping_kwargs=None,
        )
        trainer.train(n_epochs=1)
Example No. 3
def totalvi_benchmark(dataset, n_epochs, use_cuda=True):
    totalvae = TOTALVI(
        dataset.nb_genes, len(dataset.protein_names), n_batch=dataset.n_batches
    )
    trainer = TotalTrainer(totalvae, dataset, train_size=0.5, use_cuda=use_cuda)
    trainer.train(n_epochs=n_epochs)
    trainer.test_set.reconstruction_error()
    trainer.test_set.marginal_ll()

    trainer.test_set.get_protein_background_mean()
    trainer.test_set.get_latent()
    trainer.test_set.generate()
    trainer.test_set.get_sample_dropout()
    trainer.test_set.get_normalized_denoised_expression()
    trainer.test_set.imputation()

    return trainer
Example No. 4
def test_totalvi(save_path):
    synthetic_dataset_one_batch = SyntheticDataset(n_batches=1)
    totalvi_benchmark(synthetic_dataset_one_batch,
                      n_epochs=1,
                      use_cuda=use_cuda)
    synthetic_dataset_two_batches = SyntheticDataset(n_batches=2)
    totalvi_benchmark(synthetic_dataset_two_batches,
                      n_epochs=1,
                      use_cuda=use_cuda)

    # adversarial testing
    dataset = synthetic_dataset_two_batches
    totalvae = TOTALVI(dataset.nb_genes,
                       len(dataset.protein_names),
                       n_batch=dataset.n_batches)
    trainer = TotalTrainer(
        totalvae,
        dataset,
        train_size=0.5,
        use_cuda=use_cuda,
        early_stopping_kwargs=None,
        use_adversarial_loss=True,
    )
    trainer.train(n_epochs=1)

    with tempfile.TemporaryDirectory() as temp_dir:
        posterior_save_path = os.path.join(temp_dir, "posterior_data")
        original_post = trainer.create_posterior(
            totalvae,
            dataset,
            indices=np.arange(len(dataset)),
            type_class=TotalPosterior,
        )
        original_post.save_posterior(posterior_save_path)
        new_totalvae = TOTALVI(dataset.nb_genes,
                               len(dataset.protein_names),
                               n_batch=dataset.n_batches)
        new_post = load_posterior(posterior_save_path,
                                  model=new_totalvae,
                                  use_cuda=False)
        assert new_post.posterior_type == "TotalPosterior"
        assert np.array_equal(new_post.gene_dataset.protein_expression,
                              dataset.protein_expression)
Example No. 5
def test_totalvi(save_path):
    synthetic_dataset_one_batch = SyntheticDataset(n_batches=1)
    totalvi_benchmark(synthetic_dataset_one_batch, n_epochs=1, use_cuda=use_cuda)
    synthetic_dataset_two_batches = SyntheticDataset(n_batches=2)
    totalvi_benchmark(synthetic_dataset_two_batches, n_epochs=1, use_cuda=use_cuda)

    # adversarial testing
    dataset = synthetic_dataset_two_batches
    totalvae = TOTALVI(
        dataset.nb_genes, len(dataset.protein_names), n_batch=dataset.n_batches
    )
    trainer = TotalTrainer(
        totalvae,
        dataset,
        train_size=0.5,
        use_cuda=use_cuda,
        early_stopping_kwargs=None,
        use_adversarial_loss=True,
    )
    trainer.train(n_epochs=1)
Example No. 6
    early_stopping_kwargs = {
        "early_stopping_metric": "elbo",
        "save_best_state_metric": "elbo",
        "patience": 45,
        "threshold": 0,
        "reduce_lr_on_plateau": True,
        "lr_patience": 30,
        "lr_factor": 0.6,
        "posterior_class": TotalPosterior,
    }
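    # NOTE: `model`, `lr`, and `n` used below are assumed to be defined earlier
    # in the originating script (a trained TOTALVI model, a learning rate, and
    # a save-file name); they are not part of this excerpt.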

    trainer = TotalTrainer(
        model,
        dataset,
        train_size=0.9,
        test_size=0.1,
        use_cuda=use_cuda,
        frequency=1,
        data_loader_kwargs={"batch_size": 256, "pin_memory": False},
        early_stopping_kwargs=early_stopping_kwargs,
    )
    trainer.train(lr=lr, n_epochs=500)
    # create posterior on full data
    full_posterior = trainer.create_posterior(
        model, dataset, indices=np.arange(len(dataset)), type_class=TotalPosterior,
    )

    torch.save(
        trainer.model.state_dict(), "differential_expression/saved_models/" + n + ".pt"
    )
Example No. 7
def test_differential_expression(save_path):
    dataset = CortexDataset(save_path=save_path)
    n_cells = len(dataset)
    all_indices = np.arange(n_cells)
    vae = VAE(dataset.nb_genes, dataset.n_batches)
    trainer = UnsupervisedTrainer(vae, dataset, train_size=0.5, use_cuda=use_cuda)
    trainer.train(n_epochs=2)
    post = trainer.create_posterior(vae, dataset, shuffle=False, indices=all_indices)

    with tempfile.TemporaryDirectory() as temp_dir:
        posterior_save_path = os.path.join(temp_dir, "posterior_data")
        post.save_posterior(posterior_save_path)
        new_vae = VAE(dataset.nb_genes, dataset.n_batches)
        new_post = load_posterior(posterior_save_path, model=new_vae, use_cuda=False)
    assert np.array_equal(new_post.indices, post.indices)
    assert np.array_equal(new_post.gene_dataset.X, post.gene_dataset.X)

    # Sample scale example
    px_scales = post.scale_sampler(
        n_samples_per_cell=4, n_samples=None, selection=all_indices
    )["scale"]
    assert (
        px_scales.shape[1] == dataset.nb_genes
    ), "posterior scales should have shape (n_samples, n_genes)"

    # Differential expression different models
    idx_1 = [1, 2, 3]
    idx_2 = [4, 5, 6, 7]
    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="vanilla",
        use_permutation=True,
        M_permutation=100,
    )

    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="change",
        use_permutation=True,
        M_permutation=100,
        cred_interval_lvls=[0.5, 0.95],
    )
    print(de_dataframe.keys())
    assert (
        de_dataframe["confidence_interval_0.5_min"]
        <= de_dataframe["confidence_interval_0.5_max"]
    ).all()
    assert (
        de_dataframe["confidence_interval_0.95_min"]
        <= de_dataframe["confidence_interval_0.95_max"]
    ).all()

    # DE estimation example
    de_probabilities = de_dataframe.loc[:, "proba_de"]
    assert ((0.0 <= de_probabilities) & (de_probabilities <= 1.0)).all()

    # Test totalVI DE
    sp = os.path.join(save_path, "10X")
    dataset = Dataset10X(dataset_name="pbmc_10k_protein_v3", save_path=sp)
    n_cells = len(dataset)
    all_indices = np.arange(n_cells)
    vae = TOTALVI(
        dataset.nb_genes, len(dataset.protein_names), n_batch=dataset.n_batches
    )
    trainer = TotalTrainer(
        vae, dataset, train_size=0.5, use_cuda=use_cuda, early_stopping_kwargs=None
    )
    trainer.train(n_epochs=2)
    post = trainer.create_posterior(
        vae, dataset, shuffle=False, indices=all_indices, type_class=TotalPosterior
    )

    # Differential expression different models
    idx_1 = [1, 2, 3]
    idx_2 = [4, 5, 6, 7]
    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="vanilla",
        use_permutation=True,
        M_permutation=100,
    )

    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="change",
        use_permutation=True,
        M_permutation=100,
    )
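
As a hedged follow-up (not part of the original test), the change-mode result can be inspected further: differential_expression_score returns a pandas DataFrame, and the only column name used below that is taken from the test above is proba_de.

# Minimal sketch: rank the change-mode DE table by the posterior probability of
# differential expression. Only "proba_de" comes from the test above; keeping
# the top 10 genes is an arbitrary, illustrative choice.
top_genes = de_dataframe.sort_values("proba_de", ascending=False).head(10)
print(top_genes.index.tolist())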