Beispiel #1
0
def test_fish_rna(save_path):
    """Run the smFISH / scRNA-seq benchmark on a gene-matched Cortex dataset.

    The Cortex dataset is asked to keep the smFISH gene names and to hold
    50 genes more than the smFISH dataset in total.

    :param save_path: location handed to both dataset constructors.
    """
    fish_data = SmfishDataset(save_path)
    seq_kwargs = dict(
        save_path=save_path,
        genes_to_keep=fish_data.gene_names,
        total_genes=fish_data.nb_genes + 50,
    )
    seq_data = CortexDataset(**seq_kwargs)
    benchmark_fish_scrna(seq_data, fish_data)
Beispiel #2
0
def test_iwae(save_path):
    """Compare training time with and without ``single_backward``.

    A fresh CUDA ``VAE`` is trained on the Cortex dataset for 10 epochs with
    ``ratio_loss=True`` and ``k_importance_weighted=5``, once with
    ``single_backward=True`` and once with ``single_backward=False``. The
    elapsed time of each run (model construction excluded) is printed.

    :param save_path: location handed to ``CortexDataset``.
    """
    import time

    dataset = CortexDataset(save_path=save_path)
    torch.manual_seed(42)

    def _timed_run(single_backward):
        """Build a new model, train it, and return the elapsed seconds."""
        vae = VAE(n_input=dataset.nb_genes, n_batch=dataset.n_batches).cuda()
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments the way time.time() differences can.
        start = time.perf_counter()
        trainer = UnsupervisedTrainer(vae,
                                      gene_dataset=dataset,
                                      ratio_loss=True,
                                      k_importance_weighted=5,
                                      single_backward=single_backward)
        trainer.train(n_epochs=10)
        return time.perf_counter() - start

    stop1 = _timed_run(single_backward=True)
    stop2 = _timed_run(single_backward=False)

    print('Time single backward : ', stop1)
    print('Time all elements : ', stop2)
Beispiel #3
0
def cortex_benchmark(n_epochs=250,
                     use_cuda=True,
                     save_path='data/',
                     show_plot=True):
    """Train VAEs on the Cortex dataset and run a series of evaluations.

    Two models are trained in sequence: the first is used for a differential
    expression score and a test-set ``ll()`` call, the second is trained on
    corrupted posteriors and then used for the imputation benchmark and the
    t-SNE plot.

    :param n_epochs: number of training epochs for each model.
    :param use_cuda: forwarded to ``UnsupervisedTrainer``.
    :param save_path: used for the dataset and the imputation benchmark.
    :param show_plot: forwarded to ``imputation_benchmark``.
    :return: the trainer of the second (imputation) model.
    """
    cortex_dataset = CortexDataset(save_path=save_path)

    # --- first model: differential expression + likelihood ---
    de_model = VAE(cortex_dataset.nb_genes)
    de_trainer = UnsupervisedTrainer(de_model, cortex_dataset, use_cuda=use_cuda)
    de_trainer.train(n_epochs=n_epochs)
    de_trainer.train_set.differential_expression_score(
        'oligodendrocytes',
        'pyramidal CA1',
        genes=["THY1", "MBP"],
    )
    de_trainer.test_set.ll()  # original author's note: assert ~ 1200

    # --- second model: trained on corrupted data for imputation ---
    imputation_model = VAE(cortex_dataset.nb_genes)
    imputation_trainer = UnsupervisedTrainer(imputation_model,
                                             cortex_dataset,
                                             use_cuda=use_cuda)
    imputation_trainer.corrupt_posteriors()
    imputation_trainer.train(n_epochs=n_epochs)
    imputation_trainer.uncorrupt_posteriors()
    imputation_trainer.train_set.imputation_benchmark(
        verbose=(n_epochs > 1),
        save_path=save_path,
        show_plot=show_plot,
    )

    # A single epoch signals a unit-test run, so the t-SNE is kept small.
    if n_epochs == 1:
        n_samples = 10
    else:
        n_samples = None
    imputation_trainer.train_set.show_t_sne(n_samples=n_samples)
    return imputation_trainer
Beispiel #4
0
def load_datasets(dataset_name, save_path="data/", url=None):
    """Instantiate the dataset designated by ``dataset_name``.

    Known names (``"synthetic"``, ``"cortex"``, ``"brain_large"``,
    ``"retina"``, ``"cbmc"``, ``"brain_small"``, ``"hemato"``, ``"pbmc"``)
    map to their dedicated dataset classes. Any other value is treated as a
    file name and dispatched on its extension: ``.loom``, ``.h5ad``, or any
    name containing ``.csv``.

    :param dataset_name: dataset key or file name.
    :param save_path: directory forwarded to the dataset constructors.
    :param url: forwarded to the loom and h5ad loaders only.
    :return: the constructed dataset object.
    :raises ValueError: if ``dataset_name`` matches no known dataset.
    """
    if dataset_name == "synthetic":
        gene_dataset = SyntheticDataset()
    elif dataset_name == "cortex":
        # Honour the caller's save_path like the other file-backed datasets.
        gene_dataset = CortexDataset(save_path=save_path)
    elif dataset_name == "brain_large":
        gene_dataset = BrainLargeDataset(save_path=save_path)
    elif dataset_name == "retina":
        gene_dataset = RetinaDataset(save_path=save_path)
    elif dataset_name == "cbmc":
        gene_dataset = CbmcDataset(save_path=save_path)
    elif dataset_name == "brain_small":
        gene_dataset = BrainSmallDataset(save_path=save_path)
    elif dataset_name == "hemato":
        # NOTE(review): this path is hard-coded and ignores save_path;
        # confirm whether it should be derived from save_path instead.
        gene_dataset = HematoDataset(save_path="data/HEMATO/")
    elif dataset_name == "pbmc":
        gene_dataset = PbmcDataset(save_path=save_path)
    elif dataset_name.endswith(".loom"):
        gene_dataset = LoomDataset(filename=dataset_name, save_path=save_path, url=url)
    elif dataset_name.endswith(".h5ad"):
        gene_dataset = AnnDataset(dataset_name, save_path=save_path, url=url)
    elif ".csv" in dataset_name:
        # Substring match (not endswith) is kept from the original.
        gene_dataset = CsvDataset(dataset_name, save_path=save_path)
    else:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError(
            "No such dataset available: {!r}".format(dataset_name)
        )
    return gene_dataset
Beispiel #5
0
def cortex_benchmark(n_epochs=250,
                     use_cuda=True,
                     save_path="data/",
                     show_plot=True):
    """Train VAEs on the Cortex dataset and run a series of evaluations.

    The first model is used for a differential expression score between the
    cells labelled 4 and those labelled 5, followed by a test-set
    reconstruction error. The second model is trained on corrupted
    posteriors and used for the imputation benchmark and the t-SNE plot.

    :param n_epochs: number of training epochs for each model.
    :param use_cuda: forwarded to ``UnsupervisedTrainer``.
    :param save_path: used for the dataset and the imputation benchmark.
    :param show_plot: forwarded to ``imputation_benchmark``.
    :return: the trainer of the second (imputation) model.
    """
    cortex_dataset = CortexDataset(save_path=save_path)

    # --- first model: differential expression + reconstruction error ---
    de_model = VAE(cortex_dataset.nb_genes)
    de_trainer = UnsupervisedTrainer(de_model, cortex_dataset, use_cuda=use_cuda)
    de_trainer.train(n_epochs=n_epochs)

    # Pair of label values whose cells are compared for DE.
    first_label, second_label = (4, 5)
    mask_first = cortex_dataset.labels.ravel() == first_label
    mask_second = cortex_dataset.labels.ravel() == second_label
    de_trainer.train_set.differential_expression_score(
        mask_first,
        mask_second,
        genes=["THY1", "MBP"],
    )
    # original author's note: assert ~ 1200
    de_trainer.test_set.reconstruction_error()

    # --- second model: trained on corrupted data for imputation ---
    imputation_model = VAE(cortex_dataset.nb_genes)
    imputation_trainer = UnsupervisedTrainer(imputation_model,
                                             cortex_dataset,
                                             use_cuda=use_cuda)
    imputation_trainer.corrupt_posteriors()
    imputation_trainer.train(n_epochs=n_epochs)
    imputation_trainer.uncorrupt_posteriors()
    imputation_trainer.train_set.imputation_benchmark(
        save_path=save_path,
        show_plot=show_plot,
    )

    # A single epoch signals a unit-test run, so the t-SNE is kept small.
    if n_epochs == 1:
        n_samples = 10
    else:
        n_samples = None
    imputation_trainer.train_set.show_t_sne(n_samples=n_samples)
    return imputation_trainer
Beispiel #6
0
def test_fish_rna(save_path):
    """Run the smFISH / scRNA-seq benchmark on a gene-matched Cortex dataset.

    The Cortex dataset receives the smFISH gene names as ``genes_fish``, an
    empty ``genes_to_keep`` list, and ``additional_genes=50``.

    :param save_path: location handed to both dataset constructors.
    """
    fish_data = SmfishDataset(save_path)
    seq_kwargs = dict(
        save_path=save_path,
        genes_fish=fish_data.gene_names,
        genes_to_keep=[],
        additional_genes=50,
    )
    seq_data = CortexDataset(**seq_kwargs)
    benchmark_fish_scrna(seq_data, fish_data)
Beispiel #7
0
def test_gamma_de():
    """Smoke-test both differential expression methods on a full posterior.

    A VAE is trained for two epochs on half of the Cortex dataset; a
    posterior over every cell is then built, and
    ``differential_expression_score`` and ``differential_expression_gamma``
    are called comparing label 0 against label 1.
    """
    dataset = CortexDataset()
    model = VAE(dataset.nb_genes, dataset.n_batches)
    trainer = UnsupervisedTrainer(
        model,
        dataset,
        train_size=0.5,
        use_cuda=use_cuda,
    )
    trainer.train(n_epochs=2)

    # Posterior spanning all cells, not only the training split.
    full = trainer.create_posterior(
        trainer.model,
        dataset,
        indices=np.arange(len(dataset)),
    )

    de_options = dict(n_samples=10, M_permutation=100)
    in_group_a = dataset.labels.ravel() == 0
    in_group_b = dataset.labels.ravel() == 1

    full.differential_expression_score(in_group_a, in_group_b, **de_options)
    full.differential_expression_gamma(in_group_a, in_group_b, **de_options)
Beispiel #8
0
def test_annealing_procedures(save_path):
    """Check the final KL weight under three warm-up configurations.

    The same VAE instance is trained for two epochs by three successive
    trainers: a 1-epoch warm-up (weight expected >= 0.99), a 5-epoch warm-up
    (weight expected <= 0.99), and a 1-iteration warm-up with the epoch
    warm-up disabled (weight expected >= 0.99).

    :param save_path: location handed to ``CortexDataset``.
    """
    cortex_dataset = CortexDataset(save_path=save_path)
    cortex_vae = VAE(cortex_dataset.nb_genes, cortex_dataset.n_batches)

    def _kl_weight_after_two_epochs(**warmup_options):
        """Train a new trainer on the shared model and return its KL weight."""
        trainer = UnsupervisedTrainer(
            cortex_vae,
            cortex_dataset,
            train_size=0.5,
            use_cuda=use_cuda,
            **warmup_options
        )
        trainer.train(n_epochs=2)
        return trainer.kl_weight

    # Epoch-based warm-up shorter than the training run.
    weight = _kl_weight_after_two_epochs(n_epochs_kl_warmup=1)
    assert weight >= 0.99, "Annealing should be over"

    # Epoch-based warm-up longer than the training run.
    weight = _kl_weight_after_two_epochs(n_epochs_kl_warmup=5)
    assert weight <= 0.99, "Annealing should be proceeding"

    # Iteration-based warm-up, epoch-based warm-up switched off.
    weight = _kl_weight_after_two_epochs(
        n_iter_kl_warmup=1,
        n_epochs_kl_warmup=None,
    )
    assert weight >= 0.99, "Annealing should be over"
Beispiel #9
0
def load_datasets(dataset_name, save_path='data/', url=None):
    """Instantiate the dataset designated by ``dataset_name``.

    Known names (``'synthetic'``, ``'cortex'``, ``'brain_large'``,
    ``'retina'``, ``'cbmc'``, ``'brain_small'``, ``'hemato'``, ``'pbmc'``)
    map to their dedicated dataset classes. Any other value is treated as a
    file name and dispatched on its extension: ``.loom``, ``.h5ad``, or any
    name containing ``.csv``.

    :param dataset_name: dataset key or file name.
    :param save_path: directory forwarded to the dataset constructors.
    :param url: forwarded to the loom and h5ad loaders only.
    :return: the constructed dataset object.
    :raises ValueError: if ``dataset_name`` matches no known dataset.
    """
    if dataset_name == 'synthetic':
        gene_dataset = SyntheticDataset()
    elif dataset_name == 'cortex':
        # Honour the caller's save_path like the other file-backed datasets.
        gene_dataset = CortexDataset(save_path=save_path)
    elif dataset_name == 'brain_large':
        gene_dataset = BrainLargeDataset(save_path=save_path)
    elif dataset_name == 'retina':
        gene_dataset = RetinaDataset(save_path=save_path)
    elif dataset_name == 'cbmc':
        gene_dataset = CbmcDataset(save_path=save_path)
    elif dataset_name == 'brain_small':
        gene_dataset = BrainSmallDataset(save_path=save_path)
    elif dataset_name == 'hemato':
        # NOTE(review): this path is hard-coded and ignores save_path;
        # confirm whether it should be derived from save_path instead.
        gene_dataset = HematoDataset(save_path='data/HEMATO/')
    elif dataset_name == 'pbmc':
        gene_dataset = PbmcDataset(save_path=save_path)
    elif dataset_name.endswith(".loom"):
        gene_dataset = LoomDataset(filename=dataset_name,
                                   save_path=save_path,
                                   url=url)
    elif dataset_name.endswith(".h5ad"):
        gene_dataset = AnnDataset(dataset_name, save_path=save_path, url=url)
    elif ".csv" in dataset_name:
        # Substring match (not endswith) is kept from the original.
        gene_dataset = CsvDataset(dataset_name, save_path=save_path)
    else:
        # Raising a plain string is itself a TypeError in Python 3; raise a
        # real exception so callers see the intended message.
        raise ValueError(
            "No such dataset available: {!r}".format(dataset_name)
        )
    return gene_dataset
Beispiel #10
0
    def test_variance_and_order_and_size(self):
        """Check gene selection when ``genes_to_keep`` and ``total_genes`` combine.

        The reduced dataset must list the requested genes first, have its
        remaining columns ordered by decreasing standard deviation, and match
        the gene list rebuilt by hand from the unrestricted dataset.
        """
        kept_genes = ["THY1", "sst", "Tomem2", "Crhbp"]
        n_total = 10
        full_data = CortexDataset(save_path="tests/data", total_genes=None)
        small_data = CortexDataset(
            save_path="tests/data",
            genes_to_keep=kept_genes,
            total_genes=n_total,
        )

        # The four requested genes come first, in the requested order.
        leading_names = small_data.gene_names[:4].tolist()
        self.assertListEqual(leading_names, kept_genes)

        # The six remaining columns are already sorted by decreasing std.
        extra_std = np.std(small_data.X[:, 4:], axis=0)
        extra_order = extra_std.argsort()[::-1]
        self.assertListEqual(extra_order.tolist(), [0, 1, 2, 3, 4, 5])

        # Rebuild the expected gene list from the unrestricted dataset.
        full_order = np.std(full_data.X, axis=0).argsort()[::-1]
        ranked_names = full_data.gene_names[full_order]
        expected = list(kept_genes)
        for gene in ranked_names:
            if gene not in kept_genes:
                expected.append(gene)
        expected = expected[:n_total]
        self.assertListEqual(small_data.gene_names.tolist(), expected)
Beispiel #11
0
def cortex_benchmark(n_epochs=250, use_cuda=True, unit_test=False):
    """Train a VAE on the Cortex dataset and evaluate it on the 'test' split.

    :param n_epochs: number of training epochs.
    :param use_cuda: forwarded to ``VariationalInference``.
    :param unit_test: when true, the t-SNE uses 10 samples instead of 1000.
    :return: the trained inference object.
    """
    dataset = CortexDataset()
    model = VAE(dataset.nb_genes)
    inference = VariationalInference(model, dataset, use_cuda=use_cuda)
    inference.train(n_epochs=n_epochs)

    split = 'test'
    inference.ll(split)  # original author's note: assert ~ 1200
    inference.differential_expression(split)
    inference.imputation(split, rate=0.1)  # original author's note: assert ~ 2.3

    if unit_test:
        n_samples = 10
    else:
        n_samples = 1000
    inference.show_t_sne(split, n_samples=n_samples)
    return inference
Beispiel #12
0
def test_iaf2(save_path):
    """Train an ``IALogNormalPoissonVAE`` (``do_h=True``) on Cortex for 1000 epochs.

    Prints the recorded training losses, then draws latents from the test
    set on the CPU. The drawn values are not inspected.

    :param save_path: location handed to ``CortexDataset``.
    """
    dataset = CortexDataset(save_path=save_path)
    model = IALogNormalPoissonVAE(
        n_input=dataset.nb_genes,
        n_batch=dataset.n_batches,
        do_h=True,
    )
    model = model.cuda()
    trainer = UnsupervisedTrainer(
        model,
        dataset,
        train_size=0.5,
        ratio_loss=True,
    )
    trainer.train(n_epochs=1000)
    print(trainer.train_losses)
    # Only checks that sampling runs and yields two values to unpack.
    _latents, _second = trainer.test_set.get_latents(n_samples=5, device='cpu')
Beispiel #13
0
def test_classifier_accuracy(save_path):
    """Train a classifier on Cortex with accuracy-driven early stopping.

    Accuracy is monitored at every epoch (``frequency=1``) and is used both
    as the early-stopping metric and as the best-state metric.

    :param save_path: location handed to ``CortexDataset``.
    """
    cortex_dataset = CortexDataset(save_path=save_path)
    classifier = Classifier(
        cortex_dataset.nb_genes,
        n_labels=cortex_dataset.n_labels,
    )
    stopping_options = {
        'early_stopping_metric': 'accuracy',
        'save_best_state_metric': 'accuracy',
    }
    classifier_trainer = ClassifierTrainer(
        classifier,
        cortex_dataset,
        metrics_to_monitor=['accuracy'],
        frequency=1,
        early_stopping_kwargs=stopping_options,
    )
    classifier_trainer.train(n_epochs=2)
    classifier_trainer.train_set.accuracy()
Beispiel #14
0
def test_differential_expression(save_path):
    """Exercise scale sampling and differential expression on a VAE posterior.

    A VAE is trained for two epochs on Cortex, a non-shuffled posterior over
    all cells is built, and the test checks the gene dimension of the sampled
    scales, the ordering of the reported interval bounds in "change" mode,
    and that ``proba_de`` lies in [0, 1].

    :param save_path: location handed to ``CortexDataset``.
    """
    dataset = CortexDataset(save_path=save_path)
    every_cell = np.arange(len(dataset))
    vae = VAE(dataset.nb_genes, dataset.n_batches)
    trainer = UnsupervisedTrainer(
        vae, dataset, train_size=0.5, use_cuda=use_cuda
    )
    trainer.train(n_epochs=2)
    post = trainer.create_posterior(
        vae, dataset, shuffle=False, indices=every_cell
    )

    # Sample scale example
    sampled = post.scale_sampler(
        n_samples_per_cell=4, n_samples=None, selection=every_cell
    )
    px_scales = sampled["scale"]
    assert (
        px_scales.shape[1] == dataset.nb_genes
    ), "posterior scales should have shape (n_samples, n_genes)"

    # Differential expression with both modes on the same two cell groups.
    shared_options = dict(
        idx1=[1, 2, 3],
        idx2=[4, 5, 6, 7],
        n_samples=10,
        use_permutation=True,
        M_permutation=100,
    )
    post.differential_expression_score(mode="vanilla", **shared_options)
    de_dataframe = post.differential_expression_score(
        mode="change", **shared_options
    )
    print(de_dataframe.keys())

    # Each interval's lower bound must not exceed its upper bound.
    interval_columns = (
        ("confidence_interval_0.5_min", "confidence_interval_0.5_max"),
        ("confidence_interval_0.95_min", "confidence_interval_0.95_max"),
    )
    for low_column, high_column in interval_columns:
        assert (de_dataframe[low_column] <= de_dataframe[high_column]).all()

    # DE estimation example
    de_probabilities = de_dataframe.loc[:, "proba_de"]
    within_unit_interval = (0.0 <= de_probabilities) & (de_probabilities <= 1.0)
    assert within_unit_interval.all()
Beispiel #15
0
def test_sampling_zl(save_path):
    """Train a classifier on values sampled from a trained VAE (``sampling_zl``).

    The classifier input size is the VAE's ``n_latent`` plus one.

    :param save_path: location handed to ``CortexDataset``.
    """
    cortex_dataset = CortexDataset(save_path=save_path)

    # Stage 1: the VAE that later acts as the sampling model.
    sampling_vae = VAE(cortex_dataset.nb_genes, cortex_dataset.n_batches)
    vae_trainer = UnsupervisedTrainer(
        sampling_vae,
        cortex_dataset,
        train_size=0.5,
        use_cuda=use_cuda,
    )
    vae_trainer.train(n_epochs=2)

    # Stage 2: the classifier fed through the sampling model.
    classifier_input_size = sampling_vae.n_latent + 1
    classifier = Classifier(
        classifier_input_size,
        n_labels=cortex_dataset.n_labels,
    )
    classifier_trainer = ClassifierTrainer(
        classifier,
        cortex_dataset,
        sampling_model=sampling_vae,
        sampling_zl=True,
    )
    classifier_trainer.train(n_epochs=2)
    classifier_trainer.test_set.accuracy()
Beispiel #16
0
def test_cortex():
    """Smoke-test the inference classes on the Cortex dataset.

    Runs, in order: a VAE with ``VariationalInference``, an SVAEC with joint
    semi-supervised inference, an SVAEC (``logreg_classifier=True``) with
    alternate semi-supervised inference, and a plain classifier. Each is
    trained for a single epoch before its evaluation methods are called.
    """
    cortex_dataset = CortexDataset()

    # --- unsupervised VAE ---
    vae = VAE(cortex_dataset.nb_genes, cortex_dataset.n_batches)
    vae_inference = VariationalInference(
        vae, cortex_dataset, train_size=0.1, use_cuda=use_cuda
    )
    vae_inference.train(n_epochs=1)
    vae_inference.ll('train')
    vae_inference.differential_expression_stats('train')
    vae_inference.differential_expression('test')
    vae_inference.imputation('train', corruption='uniform')
    vae_inference.imputation('test', n_samples=2, corruption='binomial')

    # --- SVAEC, joint semi-supervised inference ---
    joint_model = SVAEC(
        cortex_dataset.nb_genes,
        cortex_dataset.n_batches,
        cortex_dataset.n_labels,
    )
    joint_inference = JointSemiSupervisedVariationalInference(
        joint_model,
        cortex_dataset,
        n_labelled_samples_per_class=50,
        use_cuda=use_cuda,
    )
    joint_inference.train(n_epochs=1)
    joint_inference.accuracy('labelled')
    joint_inference.ll('all')

    # --- SVAEC, alternate semi-supervised inference ---
    alternate_model = SVAEC(
        cortex_dataset.nb_genes,
        cortex_dataset.n_batches,
        cortex_dataset.n_labels,
        logreg_classifier=True,
    )
    alternate_inference = AlternateSemiSupervisedVariationalInference(
        alternate_model,
        cortex_dataset,
        n_labelled_samples_per_class=50,
        use_cuda=use_cuda,
    )
    alternate_inference.train(n_epochs=1, lr=1e-2)
    alternate_inference.accuracy('unlabelled')
    alternate_inference.svc_rf(unit_test=True)

    # --- plain classifier ---
    classifier = Classifier(
        cortex_dataset.nb_genes, n_labels=cortex_dataset.n_labels
    )
    classifier_inference = ClassifierInference(classifier, cortex_dataset)
    classifier_inference.train(n_epochs=1)
    classifier_inference.accuracy('train')
Beispiel #17
0
def test_full_cov():
    """Train a 2-latent ZINB VAE with ``full_cov=True`` and check for NaNs.

    Training uses ELBO-based early stopping and per-epoch metric tracking;
    the test passes if ``trainer.history['ll_test_set']`` contains no NaN.
    """
    dataset = CortexDataset()
    model_options = dict(
        n_input=dataset.nb_genes,
        n_batch=dataset.n_batches,
        reconstruction_loss='zinb',
        n_latent=2,
        full_cov=True,
    )
    mdl = VAE(**model_options)

    stopping_options = {
        'early_stopping_metric': 'elbo',
        'save_best_state_metric': 'elbo',
        'patience': 15,
        'threshold': 3,
    }
    trainer = UnsupervisedTrainer(
        model=mdl,
        gene_dataset=dataset,
        use_cuda=True,
        train_size=0.7,
        frequency=1,
        early_stopping_kwargs=stopping_options,
    )
    trainer.train(n_epochs=20, lr=1e-3)

    test_set_history = trainer.history['ll_test_set']
    assert not np.isnan(test_set_history).any()
Beispiel #18
0
def test_cortex(save_path):
    """Smoke-test the trainer classes on the Cortex dataset.

    Covers an unsupervised VAE (including corruption and the imputation
    benchmark), SCANVI with the joint and the alternate semi-supervised
    trainers, the SVC / random-forest accuracy helpers on raw data, and a
    plain classifier. Every model is trained for one epoch per ``train``
    call.

    :param save_path: used for the dataset and the imputation benchmark.
    """
    cortex_dataset = CortexDataset(save_path=save_path)

    # --- unsupervised VAE ---
    vae = VAE(cortex_dataset.nb_genes, cortex_dataset.n_batches)
    vae_trainer = UnsupervisedTrainer(
        vae, cortex_dataset, train_size=0.5, use_cuda=use_cuda
    )
    vae_trainer.train(n_epochs=1)
    # train_set is re-read from the trainer on every use rather than cached,
    # since the corrupt / uncorrupt calls below may swap the posterior objects.
    vae_trainer.train_set.ll()
    vae_trainer.train_set.differential_expression_stats()

    # Corruption is requested twice: explicit 'binomial', then the default.
    vae_trainer.corrupt_posteriors(corruption='binomial')
    vae_trainer.corrupt_posteriors()
    vae_trainer.train(n_epochs=1)
    vae_trainer.uncorrupt_posteriors()

    vae_trainer.train_set.imputation_benchmark(
        n_samples=1,
        show_plot=False,
        title_plot='imputation',
        save_path=save_path,
    )

    # --- SCANVI, joint semi-supervised trainer ---
    joint_model = SCANVI(
        cortex_dataset.nb_genes,
        cortex_dataset.n_batches,
        cortex_dataset.n_labels,
    )
    joint_trainer = JointSemiSupervisedTrainer(
        joint_model,
        cortex_dataset,
        n_labelled_samples_per_class=3,
        use_cuda=use_cuda,
    )
    joint_trainer.train(n_epochs=1)
    joint_trainer.labelled_set.accuracy()
    joint_trainer.full_dataset.ll()

    # --- SCANVI, alternate semi-supervised trainer ---
    alternate_model = SCANVI(
        cortex_dataset.nb_genes,
        cortex_dataset.n_batches,
        cortex_dataset.n_labels,
    )
    alternate_trainer = AlternateSemiSupervisedTrainer(
        alternate_model,
        cortex_dataset,
        n_labelled_samples_per_class=3,
        use_cuda=use_cuda,
    )
    alternate_trainer.train(n_epochs=1, lr=1e-2)
    alternate_trainer.unlabelled_set.accuracy()

    # Baseline classifiers on raw data, with single-point parameter grids.
    data_train, labels_train = alternate_trainer.labelled_set.raw_data()
    data_test, labels_test = alternate_trainer.unlabelled_set.raw_data()
    svc_grid = [{'C': [1], 'kernel': ['linear']}]
    forest_grid = [{'max_depth': [3], 'n_estimators': [10]}]
    compute_accuracy_svc(
        data_train, labels_train, data_test, labels_test, param_grid=svc_grid
    )
    compute_accuracy_rf(
        data_train, labels_train, data_test, labels_test, param_grid=forest_grid
    )

    # --- plain classifier ---
    classifier = Classifier(
        cortex_dataset.nb_genes, n_labels=cortex_dataset.n_labels
    )
    classifier_trainer = ClassifierTrainer(classifier, cortex_dataset)
    classifier_trainer.train(n_epochs=1)
    classifier_trainer.train_set.accuracy()
Beispiel #19
0
def test_vae_ratio_loss(save_path):
    """Train two models for two epochs each with ``ratio_loss=True``.

    The first is a ``VAE`` on the Cortex dataset; the second is a
    ``LogNormalPoissonVAE`` (``full_cov=True``) on a small
    ``LatentLogPoissonDataset``.

    :param save_path: location handed to ``CortexDataset``.
    """
    trainer_options = dict(train_size=0.5, use_cuda=use_cuda, ratio_loss=True)

    # --- VAE on Cortex ---
    cortex_dataset = CortexDataset(save_path=save_path)
    cortex_vae = VAE(cortex_dataset.nb_genes, cortex_dataset.n_batches)
    cortex_trainer = UnsupervisedTrainer(
        cortex_vae, cortex_dataset, **trainer_options
    )
    cortex_trainer.train(n_epochs=2)

    # --- log-normal Poisson VAE on the latent log-Poisson dataset ---
    latent_dataset = LatentLogPoissonDataset(
        n_genes=5, n_latent=2, n_cells=300, n_comps=1
    )
    latent_vae = LogNormalPoissonVAE(
        latent_dataset.nb_genes, latent_dataset.n_batches, full_cov=True
    )
    latent_trainer = UnsupervisedTrainer(
        latent_vae, latent_dataset, **trainer_options
    )
    latent_trainer.train(n_epochs=2)
Beispiel #20
0
def test_iaf(save_path):
    """Smoke-test the IAF encoder and the two IAF-based VAEs on CUDA.

    Checks the encoder's output shape on a random batch, then trains an
    ``IAVAE`` and an ``IALogNormalPoissonVAE`` on Cortex for two epochs each
    and calls their latent / inference routines. Apart from the encoder
    shape, results are not inspected.

    :param save_path: location handed to ``CortexDataset``.
    """
    # --- encoder alone ---
    encoder = EncoderIAF(
        n_in=5, n_latent=2, n_cat_list=None, n_hidden=12, n_layers=2, t=3
    ).cuda()
    random_batch = torch.rand(64, 5, device='cuda')
    z1, _ = encoder(random_batch)
    assert z1.shape == (64, 2)

    dataset = CortexDataset(save_path=save_path)
    trainer_options = dict(train_size=0.5, ratio_loss=True)

    # --- IAVAE ---
    iaf_vae = IAVAE(n_input=dataset.nb_genes, n_batch=dataset.n_batches).cuda()
    iaf_trainer = UnsupervisedTrainer(iaf_vae, dataset, **trainer_options)
    iaf_trainer.train(n_epochs=2)
    _z, _labels = iaf_trainer.train_set.get_latents(n_samples=10, device='cuda')

    # --- IALogNormalPoissonVAE ---
    poisson_vae = IALogNormalPoissonVAE(
        n_input=dataset.nb_genes, n_batch=dataset.n_batches
    ).cuda()
    poisson_trainer = UnsupervisedTrainer(poisson_vae, dataset, **trainer_options)
    poisson_trainer.train(n_epochs=2)
    with torch.no_grad():
        # Random integer-valued input in [1, 10), stored as floats.
        fake_counts = torch.randint(
            low=1,
            high=10,
            size=(128, dataset.nb_genes),
            device='cuda',
            dtype=torch.float,
        )
        _outputs = poisson_vae.inference(x=fake_counts, n_samples=3)
    _z, _second = poisson_trainer.test_set.get_latents(n_samples=5, device='cpu')
Beispiel #21
0
def test_cortex(save_path):
    """Smoke-test the trainer and posterior API on the Cortex dataset.

    Covers an unsupervised VAE (reconstruction error, DE statistics, feature
    correlation matrices, imputation, corruption, parameter generation and
    sample generation with shape checks), SCANVI with the joint and alternate
    semi-supervised trainers, the SVC / random-forest accuracy helpers, and a
    plain classifier.

    :param save_path: used for the dataset and the imputation benchmarks.
    """
    cortex_dataset = CortexDataset(save_path=save_path)

    # --- unsupervised VAE ---
    vae = VAE(cortex_dataset.nb_genes, cortex_dataset.n_batches)
    vae_trainer = UnsupervisedTrainer(
        vae, cortex_dataset, train_size=0.5, use_cuda=use_cuda
    )
    vae_trainer.train(n_epochs=1)
    # train_set / test_set are re-read from the trainer on every use rather
    # than cached, since corrupt / uncorrupt may swap the posterior objects.
    vae_trainer.train_set.reconstruction_error()
    vae_trainer.train_set.differential_expression_stats()
    for correlation_type in ("pearson", "spearman"):
        vae_trainer.train_set.generate_feature_correlation_matrix(
            n_samples=2, correlation_type=correlation_type
        )
    vae_trainer.train_set.imputation(n_samples=1)
    vae_trainer.test_set.imputation(n_samples=5)

    # Corruption is requested twice: explicit "binomial", then the default.
    vae_trainer.corrupt_posteriors(corruption="binomial")
    vae_trainer.corrupt_posteriors()
    vae_trainer.train(n_epochs=1)
    vae_trainer.uncorrupt_posteriors()

    benchmark_options = dict(
        n_samples=1, show_plot=False, title_plot="imputation", save_path=save_path
    )
    vae_trainer.train_set.imputation_benchmark(**benchmark_options)
    vae_trainer.train_set.generate_parameters()

    # --- shapes returned by generate_parameters ---
    n_cells = len(vae_trainer.train_set.indices)
    n_genes = cortex_dataset.nb_genes
    n_samples = 3
    per_cell_shape = (n_cells, n_genes)
    per_sample_shape = (n_samples, n_cells, n_genes)

    dropout, means, dispersions = vae_trainer.train_set.generate_parameters()
    assert dropout.shape == per_cell_shape
    assert means.shape == per_cell_shape
    assert dispersions.shape == per_cell_shape

    dropout, means, dispersions = vae_trainer.train_set.generate_parameters(
        n_samples=n_samples
    )
    assert dropout.shape == per_sample_shape
    assert means.shape == per_sample_shape

    dropout, means, dispersions = vae_trainer.train_set.generate_parameters(
        n_samples=n_samples, give_mean=True
    )
    assert dropout.shape == per_cell_shape
    assert means.shape == per_cell_shape

    # --- sample generation from a posterior over every cell ---
    every_cell = np.arange(len(cortex_dataset))
    full = vae_trainer.create_posterior(vae, cortex_dataset, indices=every_cell)
    x_new, x_old = full.generate(n_samples=10)
    assert x_new.shape == (cortex_dataset.nb_cells, cortex_dataset.nb_genes, 10)
    assert x_old.shape == (cortex_dataset.nb_cells, cortex_dataset.nb_genes)

    vae_trainer.train_set.imputation_benchmark(**benchmark_options)

    # --- SCANVI, joint semi-supervised trainer ---
    joint_model = SCANVI(
        cortex_dataset.nb_genes, cortex_dataset.n_batches, cortex_dataset.n_labels
    )
    joint_trainer = JointSemiSupervisedTrainer(
        joint_model,
        cortex_dataset,
        n_labelled_samples_per_class=3,
        use_cuda=use_cuda,
    )
    joint_trainer.train(n_epochs=1)
    joint_trainer.labelled_set.accuracy()
    joint_trainer.full_dataset.reconstruction_error()

    # --- SCANVI, alternate semi-supervised trainer ---
    alternate_model = SCANVI(
        cortex_dataset.nb_genes, cortex_dataset.n_batches, cortex_dataset.n_labels
    )
    alternate_trainer = AlternateSemiSupervisedTrainer(
        alternate_model,
        cortex_dataset,
        n_labelled_samples_per_class=3,
        use_cuda=use_cuda,
    )
    alternate_trainer.train(n_epochs=1, lr=1e-2)
    alternate_trainer.unlabelled_set.accuracy()

    # Baseline classifiers on raw data, with single-point parameter grids.
    data_train, labels_train = alternate_trainer.labelled_set.raw_data()
    data_test, labels_test = alternate_trainer.unlabelled_set.raw_data()
    svc_grid = [{"C": [1], "kernel": ["linear"]}]
    forest_grid = [{"max_depth": [3], "n_estimators": [10]}]
    compute_accuracy_svc(
        data_train, labels_train, data_test, labels_test, param_grid=svc_grid
    )
    compute_accuracy_rf(
        data_train, labels_train, data_test, labels_test, param_grid=forest_grid
    )

    # --- plain classifier ---
    classifier = Classifier(
        cortex_dataset.nb_genes, n_labels=cortex_dataset.n_labels
    )
    classifier_trainer = ClassifierTrainer(classifier, cortex_dataset)
    classifier_trainer.train(n_epochs=1)
    classifier_trainer.train_set.accuracy()
Beispiel #22
0
def test_fish_rna():
    """Run the smFISH / scRNA-seq benchmark with default dataset locations.

    The Cortex dataset receives the smFISH gene names as ``genes_fish``, an
    empty ``genes_to_keep`` list, and ``additional_genes=50``.
    """
    fish_data = SmfishDataset()
    seq_kwargs = dict(
        genes_fish=fish_data.gene_names,
        genes_to_keep=[],
        additional_genes=50,
    )
    seq_data = CortexDataset(**seq_kwargs)
    # NOTE(review): the callee is spelled `benchamrk_...` here; confirm this
    # matches the name actually exported by the benchmark module in use.
    benchamrk_fish_scrna(seq_data, fish_data)
Beispiel #23
0
def test_filter_and_concat_datasets():
    """Exercise gene subsampling, cell-type filtering, concatenation and mapping.

    Two Cortex datasets restricted to the gene index ranges [0, 3) and [1, 4)
    are filtered by cell type and concatenated; the merged dataset is
    expected to hold 2 genes. Synthetic datasets are then concatenated with
    and without shared labels, filtered, subsampled, and finally used to
    exercise ``map_cell_types``.
    """
    cortex_dataset_1 = CortexDataset(save_path='tests/data/')
    cortex_dataset_1.subsample_genes(subset_genes=np.arange(0, 3))
    cortex_dataset_1.filter_cell_types(["microglia", "oligodendrocytes"])
    cortex_dataset_2 = CortexDataset(save_path='tests/data/')
    cortex_dataset_2.subsample_genes(subset_genes=np.arange(1, 4))
    cortex_dataset_2.filter_cell_types(["endothelial-mural", "interneurons", "microglia", "oligodendrocytes"])
    # Second filter uses integer cell-type values instead of names.
    cortex_dataset_2.filter_cell_types([2, 0])
    cortex_dataset_merged = GeneExpressionDataset.concat_datasets(cortex_dataset_1, cortex_dataset_2)
    assert cortex_dataset_merged.nb_genes == 2

    synthetic_dataset_1 = SyntheticDataset(n_batches=2, n_labels=5)
    synthetic_dataset_2 = SyntheticDataset(n_batches=3, n_labels=3)
    synthetic_merged_1 = GeneExpressionDataset.concat_datasets(synthetic_dataset_1, synthetic_dataset_2)
    assert synthetic_merged_1.n_batches == 5
    assert synthetic_merged_1.n_labels == 5

    # Without shared labels the label counts add up (5 + 3).
    synthetic_merged_2 = GeneExpressionDataset.concat_datasets(synthetic_dataset_1, synthetic_dataset_2,
                                                               shared_labels=False)
    assert synthetic_merged_2.n_batches == 5
    assert synthetic_merged_2.n_labels == 8

    synthetic_dataset_1.filter_cell_types([0, 1, 2, 3])
    assert synthetic_dataset_1.n_labels == 4

    synthetic_dataset_1.subsample_cells(50)
    assert len(synthetic_dataset_1) == 50

    synthetic_dataset_3 = SyntheticDataset(n_labels=6)
    # The `np.str` alias was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `str` yields the same dtype on every NumPy version.
    synthetic_dataset_3.cell_types = np.arange(6).astype(str)
    synthetic_dataset_3.map_cell_types({"2": "9", ("4", "3"): "8"})
Beispiel #24
0
def test_differential_expression(save_path):
    """Exercise posterior save/load and differential expression for VAE and totalVI.

    For a VAE trained on Cortex: saves the posterior to a temporary
    directory, reloads it with a fresh model, and compares indices and data;
    then checks the sampled scale shape, the interval bounds in "change"
    mode, and that ``proba_de`` lies in [0, 1]. For totalVI trained on the
    "pbmc_10k_protein_v3" 10X dataset: runs both DE modes without inspecting
    the results.

    :param save_path: used for the Cortex dataset and, joined with "10X",
        for the 10X dataset.
    """
    # ------------------------------------------------------------- VAE part
    dataset = CortexDataset(save_path=save_path)
    every_cell = np.arange(len(dataset))
    vae = VAE(dataset.nb_genes, dataset.n_batches)
    trainer = UnsupervisedTrainer(vae, dataset, train_size=0.5, use_cuda=use_cuda)
    trainer.train(n_epochs=2)
    post = trainer.create_posterior(vae, dataset, shuffle=False, indices=every_cell)

    # Round-trip the posterior through disk using an untrained model.
    with tempfile.TemporaryDirectory() as temp_dir:
        posterior_save_path = os.path.join(temp_dir, "posterior_data")
        post.save_posterior(posterior_save_path)
        fresh_vae = VAE(dataset.nb_genes, dataset.n_batches)
        reloaded = load_posterior(
            posterior_save_path, model=fresh_vae, use_cuda=False
        )
    assert np.array_equal(reloaded.indices, post.indices)
    assert np.array_equal(reloaded.gene_dataset.X, post.gene_dataset.X)

    # Sample scale example
    sampled = post.scale_sampler(
        n_samples_per_cell=4, n_samples=None, selection=every_cell
    )
    px_scales = sampled["scale"]
    assert (
        px_scales.shape[1] == dataset.nb_genes
    ), "posterior scales should have shape (n_samples, n_genes)"

    # Differential expression with both modes on the same two cell groups.
    shared_options = dict(
        idx1=[1, 2, 3],
        idx2=[4, 5, 6, 7],
        n_samples=10,
        use_permutation=True,
        M_permutation=100,
    )
    post.differential_expression_score(mode="vanilla", **shared_options)
    de_dataframe = post.differential_expression_score(
        mode="change", cred_interval_lvls=[0.5, 0.95], **shared_options
    )
    print(de_dataframe.keys())

    # Each interval's lower bound must not exceed its upper bound.
    interval_columns = (
        ("confidence_interval_0.5_min", "confidence_interval_0.5_max"),
        ("confidence_interval_0.95_min", "confidence_interval_0.95_max"),
    )
    for low_column, high_column in interval_columns:
        assert (de_dataframe[low_column] <= de_dataframe[high_column]).all()

    # DE estimation example
    de_probabilities = de_dataframe.loc[:, "proba_de"]
    within_unit_interval = (0.0 <= de_probabilities) & (de_probabilities <= 1.0)
    assert within_unit_interval.all()

    # --------------------------------------------------------- totalVI part
    tenx_path = os.path.join(save_path, "10X")
    protein_dataset = Dataset10X(
        dataset_name="pbmc_10k_protein_v3", save_path=tenx_path
    )
    every_protein_cell = np.arange(len(protein_dataset))
    total_vae = TOTALVI(
        protein_dataset.nb_genes,
        len(protein_dataset.protein_names),
        n_batch=protein_dataset.n_batches,
    )
    total_trainer = TotalTrainer(
        total_vae,
        protein_dataset,
        train_size=0.5,
        use_cuda=use_cuda,
        early_stopping_kwargs=None,
    )
    total_trainer.train(n_epochs=2)
    total_post = total_trainer.create_posterior(
        total_vae,
        protein_dataset,
        shuffle=False,
        indices=every_protein_cell,
        type_class=TotalPosterior,
    )

    # Both modes again, with freshly built index lists; results are unused.
    total_options = dict(
        idx1=[1, 2, 3],
        idx2=[4, 5, 6, 7],
        n_samples=10,
        use_permutation=True,
        M_permutation=100,
    )
    total_post.differential_expression_score(mode="vanilla", **total_options)
    total_post.differential_expression_score(mode="change", **total_options)
Beispiel #25
0
def test_filter_and_concat_datasets():
    """Check dataset filtering, concatenation and subsampling helpers.

    Covers ``subsample_genes``, ``filter_cell_types`` (called with names and
    with integers), ``GeneExpressionDataset.concat_datasets`` (with and
    without ``shared_labels=False``) and ``subsample_cells``.
    """
    # Two Cortex datasets restricted to gene index ranges [0, 300) and
    # [100, 400); the merged dataset is expected to hold 200 genes.
    first_cortex = CortexDataset()
    first_cortex.subsample_genes(subset_genes=np.arange(0, 300))
    first_cortex.filter_cell_types(["microglia", "oligodendrocytes"])

    second_cortex = CortexDataset()
    second_cortex.subsample_genes(subset_genes=np.arange(100, 400))
    named_cell_types = [
        "endothelial-mural",
        "interneurons",
        "microglia",
        "oligodendrocytes",
    ]
    second_cortex.filter_cell_types(named_cell_types)
    # The second pass hands over integers instead of cell-type names.
    second_cortex.filter_cell_types([2, 0])

    merged_cortex = GeneExpressionDataset.concat_datasets(
        first_cortex, second_cortex
    )
    assert merged_cortex.nb_genes == 200

    two_batch_synth = SyntheticDataset(n_batches=2, n_labels=5)
    three_batch_synth = SyntheticDataset(n_batches=3, n_labels=3)

    # Default concatenation: 5 batches and 5 labels are expected.
    merged_default = GeneExpressionDataset.concat_datasets(
        two_batch_synth, three_batch_synth
    )
    assert merged_default.n_batches == 5
    assert merged_default.n_labels == 5

    # With shared_labels=False the expected label count becomes 8.
    merged_separate_labels = GeneExpressionDataset.concat_datasets(
        two_batch_synth, three_batch_synth, shared_labels=False
    )
    assert merged_separate_labels.n_batches == 5
    assert merged_separate_labels.n_labels == 8

    # In-place filtering and subsampling on the first synthetic dataset.
    two_batch_synth.filter_cell_types([0, 1, 2, 3])
    assert two_batch_synth.n_labels == 4

    two_batch_synth.subsample_cells(50)
    assert len(two_batch_synth) == 50
Beispiel #26
0
 def test_populate_from_datasets_cortex(self):
     """Populate an empty dataset from two filtered Cortex datasets.

     Each source dataset is gene-subsampled in ``"variance"`` mode and
     filtered by cell type first; the populated dataset is then expected
     to report 2 genes.
     """
     first_cortex = CortexDataset(save_path="tests/data")
     first_cortex.subsample_genes(
         subset_genes=np.arange(0, 3), mode="variance"
     )
     first_cortex.filter_cell_types(["microglia", "oligodendrocytes"])

     second_cortex = CortexDataset(save_path="tests/data")
     second_cortex.subsample_genes(
         subset_genes=np.arange(1, 4), mode="variance"
     )
     named_cell_types = [
         "endothelial-mural",
         "interneurons",
         "microglia",
         "oligodendrocytes",
     ]
     second_cortex.filter_cell_types(named_cell_types)
     # The second pass hands over integers instead of cell-type names.
     second_cortex.filter_cell_types([2, 0])

     combined = GeneExpressionDataset()
     combined.populate_from_datasets([first_cortex, second_cortex])
     self.assertEqual(2, combined.nb_genes)
Beispiel #27
0
def to_tensor(x):
    """Convert a numpy array to a float32 torch tensor on ``torch_device``.

    The input is cast with ``astype('float32')`` before being wrapped, and
    the resulting tensor is moved to the module-level ``torch_device``.
    """
    as_float32 = x.astype('float32')
    tensor = torch.from_numpy(as_float32)
    return tensor.to(torch_device)


def to_array(x):
    """Convert a sparse-like object or a torch tensor to a numpy array.

    Objects exposing ``todense`` are densified and wrapped with ``np.array``;
    objects exposing ``cpu`` go through ``x.data.cpu().numpy()``; anything
    else is returned unchanged.
    """
    sparse_like = hasattr(x, 'todense')
    # Neither conversion applies: hand the input back untouched.
    if not sparse_like and not hasattr(x, 'cpu'):
        return x
    if sparse_like:
        dense = x.todense()
        return np.array(dense)
    return x.data.cpu().numpy()


# Load dataset
# NOTE: SAVE_DATA_PATH and one_hot are defined outside this excerpt.
cortex = CortexDataset(save_path=SAVE_DATA_PATH)
X = cortex.X
labels = cortex.cell_types
# Number of entries in cortex.cell_types; passed as the second argument to
# one_hot below.
n_labels = len(labels)
# The label array is flattened with ravel() before encoding; presumably Y has
# one row per cell and n_labels columns — TODO confirm against one_hot.
Y = one_hot(cortex.labels.ravel(), n_labels)

# ===========================================================================
# scVI
# ===========================================================================
scvi = VAE(n_input=cortex.nb_genes,
           n_batch=0,
           n_labels=0,
           n_hidden=n_hidden,
           n_latent=n_latent,
           n_layers=n_layer,
           dispersion=dispersion,
Beispiel #28
0
 def test_populate(self):
     """Load the Cortex dataset and pass it to the one-epoch training helper."""
     cortex = CortexDataset(save_path="tests/data")
     unsupervised_training_one_epoch(cortex)
Beispiel #29
0
# Plotting toggle; not read in the visible part of this script — presumably
# consumed further down or by whatever runs this script (TODO confirm).
show_plot = True

import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from scvi.dataset import CortexDataset, RetinaDataset
from scvi.models import *
from scvi.inference import UnsupervisedTrainer
import torch

import ssl

# SECURITY NOTE(review): this swaps the stdlib's default HTTPS context factory
# for the unverified one, which turns off TLS certificate verification for
# stdlib HTTPS clients that rely on the default context, process-wide.
# Presumably added so dataset downloads succeed on machines with a broken
# certificate store — confirm, and prefer fixing the CA bundle instead.
ssl._create_default_https_context = ssl._create_unverified_context

# Load the Cortex dataset; ``save_path`` is defined outside this excerpt.
gene_dataset = CortexDataset(save_path=save_path)

# Training configuration. ``n_epochs_all`` (defined outside this excerpt)
# overrides the default of 400 epochs whenever it is not None.
n_epochs = n_epochs_all if n_epochs_all is not None else 400
lr = 1e-3
use_batches = False
use_cuda = True

# ``use_batches`` is a bool, so the product is ``n_batches`` when it is True
# and 0 when it is False.
vae = VAE(gene_dataset.nb_genes, n_batch=gene_dataset.n_batches * use_batches)
trainer = UnsupervisedTrainer(
    vae,
    gene_dataset,
    train_size=0.75,
    use_cuda=use_cuda,
    frequency=5,
    verbose=True,
)

trainer.train(n_epochs=n_epochs, lr=lr)