Example #1
def correct_scvi(Xs, genes):
    import numpy as np
    import torch
    from anndata import AnnData  # needed for the AnnData(...) wrapper below

    use_cuda = True
    torch.cuda.set_device(1)

    from scvi.dataset.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import SCANVI, VAE
    from scvi.dataset.anndata import AnnDataset

    all_ann = [AnnDataset(AnnData(X, var=genes)) for X in Xs]

    all_dataset = GeneExpressionDataset.concat_datasets(*all_ann)

    vae = VAE(all_dataset.nb_genes,
              n_batch=all_dataset.n_batches,
              n_labels=all_dataset.n_labels,
              n_hidden=128,
              n_latent=30,
              n_layers=2,
              dispersion='gene')
    trainer = UnsupervisedTrainer(vae, all_dataset, train_size=0.99999)
    n_epochs = 100
    #trainer.train(n_epochs=n_epochs)
    #torch.save(trainer.model.state_dict(),
    #           'data/harmonization.vae.pkl')
    trainer.model.load_state_dict(torch.load('data/harmonization.vae.pkl'))
    trainer.model.eval()

    full = trainer.create_posterior(trainer.model,
                                    all_dataset,
                                    indices=np.arange(len(all_dataset)))
    latent, batch_indices, labels = full.sequential().get_latent()

    return latent
Example #2
def trainVAE(gene_dataset,
             filename,
             rep,
             nlayers=2,
             n_hidden=128,
             reconstruction_loss: str = 'zinb'):
    vae = VAE(gene_dataset.nb_genes,
              n_batch=gene_dataset.n_batches,
              n_labels=gene_dataset.n_labels,
              n_hidden=n_hidden,
              n_latent=10,
              n_layers=nlayers,
              dispersion='gene',
              reconstruction_loss=reconstruction_loss)
    trainer = UnsupervisedTrainer(vae, gene_dataset, train_size=1.0)
    filename = f'../{filename}/vae.{reconstruction_loss}.rep{rep}.pkl'
    if os.path.isfile(filename):
        trainer.model.load_state_dict(torch.load(filename))
        trainer.model.eval()
    else:
        trainer.train(n_epochs=250)
        torch.save(trainer.model.state_dict(), filename)
    full = trainer.create_posterior(trainer.model,
                                    gene_dataset,
                                    indices=np.arange(len(gene_dataset)))
    return full
Example #3
def scVI_latent(csv_file,
                csv_path,
                vae_model=VAE,
                train_size=1.0,
                n_labels=0,
                seed=1234,
                n_cores=1,
                lr=1e-3,
                use_cuda=False):
    set_seed(seed)
    dat = CsvDataset(csv_file, save_path=csv_path, new_n_genes=None)
    # Based on recommendations in basic_tutorial.ipynb
    n_epochs = 400 if (len(dat) < 10000) else 200
    # trainer and model
    vae = vae_model(dat.nb_genes, n_labels=n_labels)
    trainer = UnsupervisedTrainer(
        vae,
        dat,
        train_size=train_size,  # defaults to 0.8; the documentation recommends 1.0
        use_cuda=use_cuda)
    # limit cpu usage
    torch.set_num_threads(n_cores)
    trainer.train(n_epochs=n_epochs, lr=lr)
    full = trainer.create_posterior(trainer.model,
                                    dat,
                                    indices=np.arange(len(dat)))
    # Updating the "minibatch" size after training is useful in low memory configurations
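    # A hedged sketch of that note (assuming the old scVI Posterior API, where
    # Posterior.update takes DataLoader kwargs and returns a new posterior):
    # full = full.update({"batch_size": 32})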
    Z_hat = full.sequential().get_latent()[0]
    adata = anndata.AnnData(dat.X)
    for i, z in enumerate(Z_hat.T):
        adata.obs[f'Z_{i}'] = z
    # reorder for convenience and correspondence with PCA's ordering
    cellLoads = adata.obs.reindex(adata.obs.std().sort_values().index, axis=1)
    return cellLoads
Example #4
def test_gamma_de():
    cortex_dataset = CortexDataset()
    cortex_vae = VAE(cortex_dataset.nb_genes, cortex_dataset.n_batches)
    trainer_cortex_vae = UnsupervisedTrainer(cortex_vae,
                                             cortex_dataset,
                                             train_size=0.5,
                                             use_cuda=use_cuda)
    trainer_cortex_vae.train(n_epochs=2)

    full = trainer_cortex_vae.create_posterior(trainer_cortex_vae.model,
                                               cortex_dataset,
                                               indices=np.arange(
                                                   len(cortex_dataset)))

    n_samples = 10
    M_permutation = 100
    cell_idx1 = cortex_dataset.labels.ravel() == 0
    cell_idx2 = cortex_dataset.labels.ravel() == 1

    full.differential_expression_score(cell_idx1,
                                       cell_idx2,
                                       n_samples=n_samples,
                                       M_permutation=M_permutation)
    full.differential_expression_gamma(cell_idx1,
                                       cell_idx2,
                                       n_samples=n_samples,
                                       M_permutation=M_permutation)
Example #5
def scVI_norm(csv_file,
              csv_path,
              vae_model=VAE,
              train_size=1.0,
              n_labels=0,
              seed=1234,
              n_cores=1,
              lr=1e-3,
              use_cuda=False):
    set_seed(seed)
    dat = CsvDataset(csv_file, save_path=csv_path, new_n_genes=None)
    dat.subsample_genes(1000, mode="variance")
    # Based on recommendations in basic_tutorial.ipynb
    n_epochs = 400 if (len(dat) < 10000) else 200
    # trainer and model
    vae = vae_model(dat.nb_genes, n_labels=n_labels)
    trainer = UnsupervisedTrainer(
        vae,
        dat,
        train_size=train_size,  # defaults to 0.8; the documentation recommends 1.0
        use_cuda=use_cuda)
    # limit cpu usage
    torch.set_num_threads(n_cores)
    trainer.train(n_epochs=n_epochs, lr=lr)
    full = trainer.create_posterior(trainer.model,
                                    dat,
                                    indices=np.arange(len(dat)))
    # Updating the "minibatch" size after training is useful in low memory configurations
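    # As in the example above, a hedged sketch (old scVI Posterior API assumed):
    # full = full.update({"batch_size": 32})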
    normalized_values = full.sequential().get_sample_scale()
    return [normalized_values, dat.gene_names]
Example #6
def scVI_ld(csv_file,
            csv_path,
            ndims,
            vae_model=VAE,
            n_labels=0,
            n_cores=1,
            seed=1234,
            lr=1e-3,
            use_cuda=False):
    set_seed(seed)
    dat = CsvDataset(csv_file, save_path=csv_path, new_n_genes=None)
    # Based on recommendations in linear_decoder.ipynb
    n_epochs = 250
    # trainer and model
    ldvae = LDVAE(dat.nb_genes,
                  n_batch=dat.n_batches,
                  n_latent=ndims,
                  n_labels=n_labels)
    trainerLD = UnsupervisedTrainer(ldvae, dat, use_cuda=use_cuda)
    # limit cpu usage
    torch.set_num_threads(n_cores)
    trainerLD.train(n_epochs=n_epochs, lr=lr)
    # extract the mean latent values from the linearly decoded model
    full = trainerLD.create_posterior(trainerLD.model,
                                      dat,
                                      indices=np.arange(len(dat)))
    Z_hat = full.sequential().get_latent()[0]
    adata = anndata.AnnData(dat.X)
    for i, z in enumerate(Z_hat.T):
        adata.obs[f'Z_{i}'] = z
    # reorder for convenience and correspondence with PCA's ordering
    cellLoads = adata.obs.reindex(adata.obs.std().sort_values().index, axis=1)
    return cellLoads
Example #7
    def run(self):
        n_epochs = 100
        n_latent = 10
        n_hidden = 128
        n_layers = 2
        net_data = self.data.copy()
        net_data.X = self.data.layers['counts']
        del net_data.layers['counts']
        net_data.raw = None  # Ensure that the raw counts are not accidentally used

        # Define batch indices
        le = LabelEncoder()
        net_data.obs['batch_indices'] = le.fit_transform(
            net_data.obs[self.batch].values)
        net_data = AnnDatasetFromAnnData(net_data)
        vae = VAE(net_data.nb_genes,
                  reconstruction_loss='nb',
                  n_batch=net_data.n_batches,
                  n_layers=n_layers,
                  n_latent=n_latent,
                  n_hidden=n_hidden)
        trainer = UnsupervisedTrainer(vae,
                                      net_data,
                                      train_size=1,
                                      use_cuda=False)
        trainer.train(n_epochs=n_epochs, lr=1e-3)
        full = trainer.create_posterior(trainer.model,
                                        net_data,
                                        indices=np.arange(len(net_data)))
        latent, _, _ = full.sequential().get_latent()
        self.data.obsm['X_emb'] = latent
        self.dump_to_h5ad("scvi")
Example #8
def test_encoder_only():
    # torch.autograd.set_detect_anomaly(mode=True)
    dataset = LatentLogPoissonDataset(n_genes=5,
                                      n_latent=2,
                                      n_cells=300,
                                      n_comps=1)
    dataset = LatentLogPoissonDataset(n_genes=3,
                                      n_latent=2,
                                      n_cells=15,
                                      n_comps=2)
    dataset = LatentLogPoissonDataset(n_genes=5,
                                      n_latent=2,
                                      n_cells=150,
                                      n_comps=1,
                                      learn_prior_scale=True)

    # _, _, marginals = dataset.compute_posteriors(
    #     x_obs=torch.randint(0, 150, size=(1, 5), dtype=torch.float),
    #     mcmc_kwargs={"num_samples": 20, "warmup_steps": 20, "num_chains": 1}
    # )
    # stats = marginals.diagnostics()
    # print(stats)
    dataset.cuda()

    vae_mdl = LogNormalPoissonVAE(
        dataset.nb_genes,
        dataset.n_batches,
        autoregressive=False,
        full_cov=True,
        n_latent=2,
        gt_decoder=dataset.nn_model,
    )
    params = vae_mdl.encoder_params
    trainer = UnsupervisedTrainer(
        model=vae_mdl,
        gene_dataset=dataset,
        use_cuda=True,
        train_size=0.7,
        n_epochs_kl_warmup=1,
        ratio_loss=True,
    )
    trainer.train(
        n_epochs=2,
        lr=1e-3,
        params=params,
    )

    full = trainer.create_posterior(trainer.model,
                                    dataset,
                                    indices=np.arange(len(dataset)))
    lkl_estimate = vae_mdl.marginal_ll(full, n_samples_mc=50)
Example #9
def test_differential_expression(save_path):
    dataset = CortexDataset(save_path=save_path)
    n_cells = len(dataset)
    all_indices = np.arange(n_cells)
    vae = VAE(dataset.nb_genes, dataset.n_batches)
    trainer = UnsupervisedTrainer(vae,
                                  dataset,
                                  train_size=0.5,
                                  use_cuda=use_cuda)
    trainer.train(n_epochs=2)
    post = trainer.create_posterior(vae,
                                    dataset,
                                    shuffle=False,
                                    indices=all_indices)

    # Sample scale example
    px_scales = post.scale_sampler(n_samples_per_cell=4,
                                   n_samples=None,
                                   selection=all_indices)["scale"]
    assert (px_scales.shape[1] == dataset.nb_genes
            ), "posterior scales should have shape (n_samples, n_genes)"

    # Differential expression different models
    idx_1 = [1, 2, 3]
    idx_2 = [4, 5, 6, 7]
    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="vanilla",
        use_permutation=True,
        M_permutation=100,
    )

    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="change",
        use_permutation=True,
        M_permutation=100,
    )
    print(de_dataframe.keys())
    assert (de_dataframe["confidence_interval_0.5_min"] <=
            de_dataframe["confidence_interval_0.5_max"]).all()
    assert (de_dataframe["confidence_interval_0.95_min"] <=
            de_dataframe["confidence_interval_0.95_max"]).all()

    # DE estimation example
    de_probabilities = de_dataframe.loc[:, "proba_de"]
    assert ((0.0 <= de_probabilities) & (de_probabilities <= 1.0)).all()
Example #10
def trainVAE(gene_dataset, rmCellTypes, rep):
    vae = VAE(gene_dataset.nb_genes, n_batch=gene_dataset.n_batches, n_labels=gene_dataset.n_labels,
              n_hidden=128, n_latent=10, n_layers=2, dispersion='gene')
    trainer = UnsupervisedTrainer(vae, gene_dataset, train_size=1.0)
    if os.path.isfile('../NoOverlap/vae.%s%s.pkl' % (rmCellTypes,rep)):
        trainer.model.load_state_dict(torch.load('../NoOverlap/vae.%s%s.pkl' % (rmCellTypes,rep)))
        trainer.model.eval()
    else:
        trainer.train(n_epochs=150)
        torch.save(trainer.model.state_dict(), '../NoOverlap/vae.%s%s.pkl' % (rmCellTypes,rep))
    full = trainer.create_posterior(trainer.model, gene_dataset, indices=np.arange(len(gene_dataset)))
    latent, batch_indices, labels = full.sequential().get_latent()
    batch_indices = batch_indices.ravel()
    return latent, batch_indices, labels, trainer
Example #11
def compute_scvi_latent(
    adata: sc.AnnData,
    n_latent: int = 5,
    n_epochs: int = 100,
    lr: float = 1e-3,
    use_batches: bool = False,
    use_cuda: bool = True,
) -> Tuple[scvi.inference.Posterior, np.ndarray]:
    """Train and return a scVI model and sample a latent space

    :param adata: sc.AnnData object containing unnormalized counts
    :param n_latent: dimension of the latent space
    :param n_epochs: number of training epochs
    :param lr: learning rate
    :param use_batches: whether to give the model access to batch annotations for batch correction
    :param use_cuda: whether to train on the GPU
    :return: (scvi.Posterior, latent_space)
    """
    # Convert easily to scvi dataset
    scviDataset = AnnDataset(adata)

    # Train a model
    vae = VAE(
        scviDataset.nb_genes,
        n_batch=scviDataset.n_batches * use_batches,
        n_latent=n_latent,
    )
    trainer = UnsupervisedTrainer(vae,
                                  scviDataset,
                                  train_size=1.0,
                                  use_cuda=use_cuda)
    trainer.train(n_epochs=n_epochs, lr=lr)
    ####

    # Extract latent space
    posterior = trainer.create_posterior(trainer.model,
                                         scviDataset,
                                         indices=np.arange(
                                             len(scviDataset))).sequential()

    latent, _, _ = posterior.get_latent()

    return posterior, latent
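A minimal usage sketch for the helper above (hedged: the file name and the obsm key below are illustrative, not taken from the source):

import scanpy as sc

# load raw, unnormalized counts; "pbmc_raw.h5ad" is only a placeholder name
adata = sc.read_h5ad("pbmc_raw.h5ad")
posterior, latent = compute_scvi_latent(adata, n_latent=10, n_epochs=50, use_cuda=False)
adata.obsm["X_scvi"] = latent  # store the latent space for neighbors/UMAP downstream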
Example #12
def correct_scvi(Xs, genes):
    import numpy as np
    import torch
    from anndata import AnnData  # needed for the AnnData(...) wrapper below
    torch.manual_seed(0)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    from scvi.dataset import AnnDatasetFromAnnData
    from scvi.dataset.dataset import GeneExpressionDataset
    from scvi.inference import UnsupervisedTrainer
    from scvi.models import VAE

    all_ann = [AnnDatasetFromAnnData(AnnData(X, var=genes)) for X in Xs]

    all_dataset = GeneExpressionDataset()
    all_dataset.populate_from_datasets(all_ann)

    vae = VAE(all_dataset.nb_genes,
              n_batch=all_dataset.n_batches,
              n_labels=all_dataset.n_labels,
              n_hidden=128,
              n_latent=30,
              n_layers=2,
              dispersion='gene')
    trainer = UnsupervisedTrainer(
        vae,
        all_dataset,
        train_size=1.,
        use_cuda=True,
    )
    n_epochs = 100
    #trainer.train(n_epochs=n_epochs)
    #torch.save(trainer.model.state_dict(),
    #           'data/harmonization.vae.pkl')
    trainer.model.load_state_dict(torch.load('data/harmonization.vae.pkl'))
    trainer.model.eval()

    full = trainer.create_posterior(trainer.model,
                                    all_dataset,
                                    indices=np.arange(len(all_dataset)))
    latent, batch_indices, labels = full.sequential().get_latent()

    return latent
Example #13
def scvi_impute() -> None:
    fnm: str = "sc_10x_5cl_forimput_cnt.csv"
    save_path: PosixPath = here('./10xGenomics/scRNAseq')

    symsim_dataset = CsvDataset(fnm, save_path=save_path, gene_by_cell=True)

    vae = VAE(symsim_dataset.nb_genes)

    trainer = UnsupervisedTrainer(vae,
                                  symsim_dataset,
                                  train_size=1.0,
                                  use_cuda=use_cuda,
                                  frequency=5)

    trainer.train(n_epochs=n_epochs, lr=lr)

    full = trainer.create_posterior(trainer.model,
                                    symsim_dataset,
                                    indices=np.arange(len(symsim_dataset)))
    impute_values = full.sequential().imputation()

    outfnm: str = "scvi_impt.csv"
    out_path = here("./10xGenomics/impt/").joinpath(outfnm)
    np.savetxt(out_path, impute_values, delimiter=",")
Example #14
def scvi_impute(seed: int = 1, platform: str = "umi") -> None:
    fnm: str = f"sim_{ncell}_{ngene}_{seed}_{platform}_.csv"
    save_path: PosixPath = here('./scVI/data/symsim')
    # fullpath:PosixPath = here('./scVI/data/symsim').joinpath(fnm)

    symsim_dataset = CsvDataset(fnm, save_path=save_path, gene_by_cell=True)

    vae = VAE(symsim_dataset.nb_genes)

    trainer = UnsupervisedTrainer(vae,
                                  symsim_dataset,
                                  train_size=1.0,
                                  use_cuda=use_cuda,
                                  frequency=5)

    trainer.train(n_epochs=n_epochs, lr=lr)

    full = trainer.create_posterior(trainer.model,
                                    symsim_dataset,
                                    indices=np.arange(len(symsim_dataset)))
    impute_values = full.sequential().imputation()

    out_path = here("./simutool/jobs/scvi_result").joinpath(fnm)
    np.savetxt(out_path, impute_values, delimiter=",")
Example #15
def test_differential_expression(save_path):
    dataset = CortexDataset(save_path=save_path)
    n_cells = len(dataset)
    all_indices = np.arange(n_cells)
    vae = VAE(dataset.nb_genes, dataset.n_batches)
    trainer = UnsupervisedTrainer(vae, dataset, train_size=0.5, use_cuda=use_cuda)
    trainer.train(n_epochs=2)
    post = trainer.create_posterior(vae, dataset, shuffle=False, indices=all_indices)

    with tempfile.TemporaryDirectory() as temp_dir:
        posterior_save_path = os.path.join(temp_dir, "posterior_data")
        post.save_posterior(posterior_save_path)
        new_vae = VAE(dataset.nb_genes, dataset.n_batches)
        new_post = load_posterior(posterior_save_path, model=new_vae, use_cuda=False)
    assert np.array_equal(new_post.indices, post.indices)
    assert np.array_equal(new_post.gene_dataset.X, post.gene_dataset.X)

    # Sample scale example
    px_scales = post.scale_sampler(
        n_samples_per_cell=4, n_samples=None, selection=all_indices
    )["scale"]
    assert (
        px_scales.shape[1] == dataset.nb_genes
    ), "posterior scales should have shape (n_samples, n_genes)"

    # Differential expression different models
    idx_1 = [1, 2, 3]
    idx_2 = [4, 5, 6, 7]
    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="vanilla",
        use_permutation=True,
        M_permutation=100,
    )

    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="change",
        use_permutation=True,
        M_permutation=100,
        cred_interval_lvls=[0.5, 0.95],
    )
    print(de_dataframe.keys())
    assert (
        de_dataframe["confidence_interval_0.5_min"]
        <= de_dataframe["confidence_interval_0.5_max"]
    ).all()
    assert (
        de_dataframe["confidence_interval_0.95_min"]
        <= de_dataframe["confidence_interval_0.95_max"]
    ).all()

    # DE estimation example
    de_probabilities = de_dataframe.loc[:, "proba_de"]
    assert ((0.0 <= de_probabilities) & (de_probabilities <= 1.0)).all()

    # Test totalVI DE
    sp = os.path.join(save_path, "10X")
    dataset = Dataset10X(dataset_name="pbmc_10k_protein_v3", save_path=sp)
    n_cells = len(dataset)
    all_indices = np.arange(n_cells)
    vae = TOTALVI(
        dataset.nb_genes, len(dataset.protein_names), n_batch=dataset.n_batches
    )
    trainer = TotalTrainer(
        vae, dataset, train_size=0.5, use_cuda=use_cuda, early_stopping_kwargs=None
    )
    trainer.train(n_epochs=2)
    post = trainer.create_posterior(
        vae, dataset, shuffle=False, indices=all_indices, type_class=TotalPosterior
    )

    # Differential expression different models
    idx_1 = [1, 2, 3]
    idx_2 = [4, 5, 6, 7]
    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="vanilla",
        use_permutation=True,
        M_permutation=100,
    )

    de_dataframe = post.differential_expression_score(
        idx1=idx_1,
        idx2=idx_2,
        n_samples=10,
        mode="change",
        use_permutation=True,
        M_permutation=100,
    )
Example #16
def test_cortex(save_path):
    cortex_dataset = CortexDataset(save_path=save_path)
    vae = VAE(cortex_dataset.nb_genes, cortex_dataset.n_batches)
    trainer_cortex_vae = UnsupervisedTrainer(
        vae, cortex_dataset, train_size=0.5, use_cuda=use_cuda
    )
    trainer_cortex_vae.train(n_epochs=1)
    trainer_cortex_vae.train_set.reconstruction_error()
    trainer_cortex_vae.train_set.differential_expression_stats()
    trainer_cortex_vae.train_set.generate_feature_correlation_matrix(
        n_samples=2, correlation_type="pearson"
    )
    trainer_cortex_vae.train_set.generate_feature_correlation_matrix(
        n_samples=2, correlation_type="spearman"
    )
    trainer_cortex_vae.train_set.imputation(n_samples=1)
    trainer_cortex_vae.test_set.imputation(n_samples=5)

    trainer_cortex_vae.corrupt_posteriors(corruption="binomial")
    trainer_cortex_vae.corrupt_posteriors()
    trainer_cortex_vae.train(n_epochs=1)
    trainer_cortex_vae.uncorrupt_posteriors()

    trainer_cortex_vae.train_set.imputation_benchmark(
        n_samples=1, show_plot=False, title_plot="imputation", save_path=save_path
    )
    trainer_cortex_vae.train_set.generate_parameters()

    n_cells, n_genes = (
        len(trainer_cortex_vae.train_set.indices),
        cortex_dataset.nb_genes,
    )
    n_samples = 3
    (dropout, means, dispersions,) = trainer_cortex_vae.train_set.generate_parameters()
    assert dropout.shape == (n_cells, n_genes) and means.shape == (n_cells, n_genes)
    assert dispersions.shape == (n_cells, n_genes)
    (dropout, means, dispersions,) = trainer_cortex_vae.train_set.generate_parameters(
        n_samples=n_samples
    )
    assert dropout.shape == (n_samples, n_cells, n_genes)
    assert means.shape == (n_samples, n_cells, n_genes,)
    (dropout, means, dispersions,) = trainer_cortex_vae.train_set.generate_parameters(
        n_samples=n_samples, give_mean=True
    )
    assert dropout.shape == (n_cells, n_genes) and means.shape == (n_cells, n_genes)

    full = trainer_cortex_vae.create_posterior(
        vae, cortex_dataset, indices=np.arange(len(cortex_dataset))
    )
    x_new, x_old = full.generate(n_samples=10)
    assert x_new.shape == (cortex_dataset.nb_cells, cortex_dataset.nb_genes, 10)
    assert x_old.shape == (cortex_dataset.nb_cells, cortex_dataset.nb_genes)

    trainer_cortex_vae.train_set.imputation_benchmark(
        n_samples=1, show_plot=False, title_plot="imputation", save_path=save_path
    )

    svaec = SCANVI(
        cortex_dataset.nb_genes, cortex_dataset.n_batches, cortex_dataset.n_labels
    )
    trainer_cortex_svaec = JointSemiSupervisedTrainer(
        svaec, cortex_dataset, n_labelled_samples_per_class=3, use_cuda=use_cuda
    )
    trainer_cortex_svaec.train(n_epochs=1)
    trainer_cortex_svaec.labelled_set.accuracy()
    trainer_cortex_svaec.full_dataset.reconstruction_error()

    svaec = SCANVI(
        cortex_dataset.nb_genes, cortex_dataset.n_batches, cortex_dataset.n_labels
    )
    trainer_cortex_svaec = AlternateSemiSupervisedTrainer(
        svaec, cortex_dataset, n_labelled_samples_per_class=3, use_cuda=use_cuda
    )
    trainer_cortex_svaec.train(n_epochs=1, lr=1e-2)
    trainer_cortex_svaec.unlabelled_set.accuracy()
    data_train, labels_train = trainer_cortex_svaec.labelled_set.raw_data()
    data_test, labels_test = trainer_cortex_svaec.unlabelled_set.raw_data()
    compute_accuracy_svc(
        data_train,
        labels_train,
        data_test,
        labels_test,
        param_grid=[{"C": [1], "kernel": ["linear"]}],
    )
    compute_accuracy_rf(
        data_train,
        labels_train,
        data_test,
        labels_test,
        param_grid=[{"max_depth": [3], "n_estimators": [10]}],
    )

    cls = Classifier(cortex_dataset.nb_genes, n_labels=cortex_dataset.n_labels)
    cls_trainer = ClassifierTrainer(cls, cortex_dataset)
    cls_trainer.train(n_epochs=1)
    cls_trainer.train_set.accuracy()
Example #17
#Train the scVI model
#depending on the size of your data and if you have an NVIDIA GPU, this could
#take 10 minutes to 1+ hours. If you'd like to make some tea or coffee, now
#would be an appropriate time to do so.
trainer = UnsupervisedTrainer(vae,
                              dataset,
                              train_size=train_size,
                              use_cuda=use_cuda,
                              frequency=5)
trainer.train(n_epochs=n_epochs, lr=lr)
print("Model training finished!")

#Create the posterior representation of the data, and extract the latent space
#and imputed data
downsampled_gene_names = dataset.gene_names
full_posterior = trainer.create_posterior(vae,
                                          dataset,
                                          indices=np.arange(len(dataset)))
scVI_latent = full_posterior.sequential().get_latent()[0]
scVI_imputed = full_posterior.sequential().imputation()

#Save the relevant output files
np.savetxt(latent_save_file, scVI_latent, fmt='%s', delimiter=",")
np.savetxt(imputation_save_file, scVI_imputed, fmt='%s', delimiter=",")
np.savetxt(gene_names_save_file,
           downsampled_gene_names,
           fmt='%s',
           delimiter=",")
torch.save(trainer.model.state_dict(), scVI_model_save_file)
Example #18
    trainer.train(n_epochs=n_epochs, lr=0.001)
    torch.save(trainer.model.state_dict(), file_name)

    # write training info
    ll_train_set = trainer.history["ll_train_set"][1:]
    ll_test_set = trainer.history["ll_test_set"][1:]
    x = np.linspace(1, n_epochs, len(ll_train_set))
    plt.plot(x, ll_train_set)
    plt.plot(x, ll_test_set)
    plt.title("training ll")
    plt.savefig("figures/simulations_scRNA/loss_training.png")
    plt.clf()

# get latent space
full = trainer.create_posterior(trainer.model,
                                gene_dataset,
                                indices=np.arange(len(gene_dataset)))
latent, batch_indices, labels = full.sequential().get_latent()
if plot:
    n_samples_tsne = 4000
    full.show_t_sne(n_samples=n_samples_tsne,
                    color_by='labels',
                    save_name="figures/simulations_scRNA/tSNE.png")

# prepare for differential expression
cell_types = gene_dataset.cell_types
print(gene_dataset.cell_types)
couple_celltypes_list = [(0, 1), (1, 2), (1, 3), (3, 4)]

for key in theoretical_FC.columns:
    print(key)
Example #19
dataset = "path/to/UMI_count_table.csv.gz"
dataset_dir = "path/to/"
outdir = "path/to/output/directory/"

# Read count matrix with all genes (from https://github.com/YosefLab/scVI/blob/master/tests/notebooks/data_loading.ipynb)
local_csv_dataset = CsvDataset(dataset,
                               save_path=dataset_dir,
                               compression='gzip',
                               new_n_genes=False)

# Process data (from https://github.com/YosefLab/scVI/blob/master/tests/notebooks/basic_tutorial.ipynb)
use_batches = False
use_cuda = True
vae = VAE(local_csv_dataset.nb_genes,
          n_batch=local_csv_dataset.n_batches * use_batches)
trainer = UnsupervisedTrainer(vae,
                              local_csv_dataset,
                              train_size=0.75,
                              use_cuda=use_cuda)
trainer.train()
full = trainer.create_posterior(trainer.model,
                                local_csv_dataset,
                                indices=np.arange(len(local_csv_dataset)))
imputed_values = full.sequential().imputation()

# Write output matrix
np.savetxt(outdir + '/scvi_normalization.txt',
           imputed_values.T,
           fmt='%.6e',
           delimiter='\t')
Example #20
trainer = UnsupervisedTrainer(vae,
                              gene_dataset,
                              train_size=0.9,
                              use_cuda=use_cuda,
                              frequency=5)
trainer.train(n_epochs=n_epochs, lr=lr)

ll_train = trainer.history["ll_train_set"]
ll_test = trainer.history["ll_test_set"]
x = np.linspace(0, 50, len(ll_train))
plt.plot(x, ll_train)
plt.plot(x, ll_test)
plt.ylim(min(ll_train)-50, 1000)
plt.show()

full = trainer.create_posterior(trainer.model, gene_dataset, indices=np.arange(len(gene_dataset)))
print("Entropy batch mixing :", full.entropy_batch_mixing())

full.clustering_scores(prediction_algorithm = "gmm")
full.show_t_sne()
xx = full.one_vs_all_degenes()


# ========

from scvi.inference import Trainer

from scvi.inference.posterior import Posterior
from sklearn.model_selection._split import _validate_shuffle_split
trainerr = Trainer(vae,gene_dataset)
Example #21
def test_model_fit(model_fit: bool):
    """
    Test that controls that scVI inferred distributions make sense on a non-trivial synthetic
    dataset.

    We define technical zeros of the synthetic dataset as the zeros that result from
    highly expressed genes (relatively to the considered cell) and the biological zeros as the
    rest of the zeros
    :return: None
    """
    print('model_fit set to : ', model_fit)
    folder = '/tmp/scVI_zeros_test'
    print('Saving graphs in : {}'.format(folder))
    if not os.path.exists(folder):
        os.makedirs(folder)

    n_epochs = 150 if model_fit else 1
    n_mc_sim_total = 100 if model_fit else 1
    n_cells_cluster = 1000 if model_fit else 100

    torch.manual_seed(seed=42)
    synth_data = ZISyntheticDatasetCorr(n_clusters=8,
                                        n_genes_high=15,
                                        n_overlap=8,
                                        lam_0=320,
                                        n_cells_cluster=n_cells_cluster,
                                        weight_high=1.714286,
                                        weight_low=1,
                                        dropout_coef_low=0.08,
                                        dropout_coef_high=0.05)

    is_high = synth_data.is_highly_exp.squeeze()
    poisson_params_gt = synth_data.exprs_param.squeeze()

    # Step 2: Training scVI model
    mdl = VAE(n_input=synth_data.nb_genes,
              n_batch=synth_data.n_batches,
              reconstruction_loss='zinb',
              n_latent=5)

    trainer = UnsupervisedTrainer(model=mdl,
                                  gene_dataset=synth_data,
                                  use_cuda=True,
                                  train_size=1.0)
    trainer.train(n_epochs=n_epochs, lr=1e-3)
    full = trainer.create_posterior(trainer.model,
                                    synth_data,
                                    indices=np.arange(len(synth_data)))

    # Step 3: Inference
    poisson_params = []
    p_dropout_infered = []
    latent_reps = []
    bio_zero_p = []
    tech_zero_p = []
    with torch.no_grad():
        for tensors in full.sequential():
            # TODO: Properly sample posterior
            sample_batch, _, _, batch_index, labels = tensors
            px_scale, px_dispersion, px_rate, px_dropout, qz_m, qz_v, z, ql_m, ql_v, library = mdl.inference(
                sample_batch, batch_index)
            p_zero = 1.0 / (1.0 + torch.exp(-px_dropout))
            p_dropout_infered.append(p_zero.cpu().numpy())

            l_train_batch = torch.zeros(
                (sample_batch.size(0), sample_batch.size(1), n_mc_sim_total),
                device=sample_batch.device)

            for n_mc_sim in range(n_mc_sim_total):
                p = px_rate / (px_rate + px_dispersion)
                r = px_dispersion
                l_train = torch.distributions.Gamma(concentration=r,
                                                    rate=(1 - p) / p).sample()
                l_train = torch.clamp(l_train, max=1e18)
                X = torch.distributions.Poisson(l_train).sample()
                l_train_batch[:, :, n_mc_sim] = l_train
                p_zero = 1.0 / (1.0 + torch.exp(-px_dropout))
                random_prob = torch.rand_like(p_zero)
                X[random_prob <= p_zero] = 0

            l_train_batch = torch.mean(l_train_batch, dim=(-1))

            bio_zero_prob_batch = torch.exp(-l_train_batch)
            tech_zero_prob_batch = p_zero

            bio_zero_p.append(bio_zero_prob_batch.cpu().numpy())
            tech_zero_p.append(tech_zero_prob_batch.cpu().numpy())
            latent_reps.append(z.cpu().numpy())
            poisson_params.append(l_train_batch.cpu().numpy())

    latent_reps = np.concatenate(latent_reps)
    bio_zero_p = np.concatenate(bio_zero_p)
    tech_zero_p = np.concatenate(tech_zero_p)
    bio_zero_tech_no = bio_zero_p * (1.0 - tech_zero_p)
    tech_zero_bio_no = (1.0 - bio_zero_p) * tech_zero_p

    # Final Step: Checking predictions
    # Dropout checks
    p_dropout_infered_all = np.concatenate(p_dropout_infered)
    p_dropout_gt = synth_data.p_dropout.squeeze()
    vmin = 0.0
    vmax = 2.0 * p_dropout_gt.max()
    fig, axes = plt.subplots(ncols=2, nrows=2, figsize=(10, 10))
    sns.heatmap(p_dropout_infered_all, vmin=vmin, vmax=vmax, ax=axes[0, 1])
    axes[0, 1].set_title('Dropout Rate Predicted')
    sns.heatmap(p_dropout_gt, vmin=vmin, vmax=vmax, ax=axes[0, 0])
    axes[0, 0].set_title('Dropout Rate GT')

    # Poisson Params checks
    poisson_params = np.concatenate(poisson_params)
    vmin = min(poisson_params_gt.min(), poisson_params.min())
    vmax = max(poisson_params_gt.max(), poisson_params.max())
    sns.heatmap(poisson_params, vmin=vmin, vmax=vmax, ax=axes[1, 1])
    axes[1, 1].set_title('Poisson Distribution Parameter Predicted')

    sns.heatmap(poisson_params_gt, vmin=vmin, vmax=vmax, ax=axes[1, 0])
    axes[1, 0].set_title('Poisson Distribution Parameter GT')
    plt.savefig(os.path.join(folder, 'params_comparison.png'))
    plt.close()

    # TODO: Decrease test tolerances
    l1_poisson = np.abs(poisson_params - poisson_params_gt).mean()
    if model_fit:
        print('Average Poisson L1 error: ', l1_poisson)
        assert l1_poisson <= 0.75, \
            'High Error on Poisson parameter inference'
        l1_dropout = np.abs(p_dropout_infered_all -
                            synth_data.p_dropout).mean()
        print('Average Dropout L1 error: ', l1_dropout)
        assert l1_dropout <= 5e-2, \
            'High Error on Dropout parameter inference'

    # tSNE plot
    print("Computing tSNE rep ...")
    x_rep = TSNE(n_components=2).fit_transform(latent_reps)
    print("Done!")
    pos = np.random.permutation(len(x_rep))[:1000]
    labels = ['c_{}'.format(idx) for idx in synth_data.labels[pos].squeeze()]
    sns.scatterplot(x=x_rep[pos, 0],
                    y=x_rep[pos, 1],
                    hue=labels,
                    palette='Set2')
    plt.title('Synthetic Dataset latent space')
    plt.savefig(os.path.join(folder, 't_sne.png'))
    plt.close()

    # Tech/Bio classification checks
    # -- For highly expressed genes
    # --- Biological (Poisson) zero and no technical (ZI) zero
    print(bio_zero_tech_no[is_high].mean(),
          synth_data.probas_zero_bio_tech_high[1, 0])
    # --- Technical (ZI) zero and no biological (Poisson) zero
    print(tech_zero_bio_no[is_high].mean(),
          synth_data.probas_zero_bio_tech_high[0, 1])

    # -- For lowly expressed genes
    # --- Biological (Poisson) zero and no technical (ZI) zero
    print(bio_zero_tech_no[~is_high].mean(),
          synth_data.probas_zero_bio_tech_low[1, 0])
    # --- Technical (ZI) zero and no biological (Poisson) zero
    print(tech_zero_bio_no[~is_high].mean(),
          synth_data.probas_zero_bio_tech_low[0, 1])

    diff1 = np.abs(bio_zero_tech_no[is_high].mean() -
                   synth_data.probas_zero_bio_tech_high[1, 0])
    diff2 = np.abs(tech_zero_bio_no[is_high].mean() -
                   synth_data.probas_zero_bio_tech_high[0, 1])
    diff3 = np.abs(bio_zero_tech_no[~is_high].mean() -
                   synth_data.probas_zero_bio_tech_low[1, 0])
    diff4 = np.abs(tech_zero_bio_no[~is_high].mean() -
                   synth_data.probas_zero_bio_tech_low[0, 1])

    if model_fit:
        assert diff1 <= 2e-2
        assert diff2 <= 2e-2
        assert diff3 <= 2e-2
        assert diff4 <= 2e-2
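For reference, the zero decomposition that the test above checks, as a small self-contained sketch. The parameter values are illustrative only, and the Poisson rate is used directly rather than being sampled from the Gamma (NB) mixture as in the test:

import torch

px_rate = torch.tensor([0.5, 3.0])       # illustrative NB means per gene
px_dropout = torch.tensor([-1.0, 0.0])   # illustrative zero-inflation logits

p_tech_zero = torch.sigmoid(px_dropout)  # technical zero: ZI dropout probability
p_bio_zero = torch.exp(-px_rate)         # biological zero: Poisson mass at 0
# joint events compared against the ground-truth tables in test_model_fit
bio_zero_tech_no = p_bio_zero * (1.0 - p_tech_zero)
tech_zero_bio_no = (1.0 - p_bio_zero) * p_tech_zero
print(bio_zero_tech_no, tech_zero_bio_no)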
Example #22
def solo(X,
         gene_names,
         doublet_depth=2.0,
         gpu=False,
         out_dir='solo_out',
         doublet_ratio=2.0,
         seed=None,
         known_doublets=None,
         doublet_type='multinomial',
         expected_number_of_doublets=None,
         plot=False,
         normal_logging=False,
         n_hidden=128,
         n_latent=16,
         cl_hidden=64,
         cl_layers=1,
         dropout_rate=0.1,
         learning_rate=0.001,
         valid_pct=0.1):
    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
    import json
    import os
    import shutil
    import anndata
    import numpy as np
    from anndata import AnnData
    from sklearn.metrics import roc_auc_score, roc_curve
    from scipy.sparse import issparse
    from collections import defaultdict
    import scvi
    from scvi.dataset import AnnDatasetFromAnnData, LoomDataset, GeneExpressionDataset
    from scvi.models import Classifier, VAE
    from scvi.inference import UnsupervisedTrainer, ClassifierTrainer
    import torch
    from solo.utils import create_average_doublet, create_summed_doublet, create_multinomial_doublet, make_gene_expression_dataset
    if not normal_logging:
        scvi._settings.set_verbosity(10)
    if gpu and not torch.cuda.is_available():
        gpu = torch.cuda.is_available()
        print('Cuda is not available, switching to cpu running!')
    # if not os.path.isdir(out_dir):
    #     os.mkdir(out_dir)
    # data_ext = os.path.splitext(data_file)[-1]
    # if data_ext == '.loom':
    #     scvi_data = LoomDataset(data_file)
    # elif data_ext == '.h5ad':
    #     scvi_data = AnnDatasetFromAnnData(anndata.read(data_file))
    # else:
    #     msg = f'{data_ext} is not a recognized format.\n'
    #     msg += 'must be one of {h5ad, loom}'
    #     raise TypeError(msg)
    # if issparse(scvi_data.X):
    #     scvi_data.X = scvi_data.X.todense()
    scvi_data = make_gene_expression_dataset(X, gene_names)
    num_cells, num_genes = scvi_data.X.shape
    if known_doublets is not None:
        print('Removing known doublets for in silico doublet generation')
        print('Make sure known doublets are in the same order as your data')
        known_doublets = np.loadtxt(known_doublets, dtype=str) == 'True'
        assert len(known_doublets) == scvi_data.X.shape[0]
        known_doublet_data = make_gene_expression_dataset(
            scvi_data.X[known_doublets], scvi_data.gene_names)
        known_doublet_data.labels = np.ones(known_doublet_data.X.shape[0])
        singlet_scvi_data = make_gene_expression_dataset(
            scvi_data.X[~known_doublets], scvi_data.gene_names)
        singlet_num_cells, _ = singlet_scvi_data.X.shape
    else:
        known_doublet_data = None
        singlet_num_cells = num_cells
        known_doublets = np.zeros(num_cells, dtype=bool)
        singlet_scvi_data = scvi_data
    singlet_scvi_data.labels = np.zeros(singlet_scvi_data.X.shape[0])
    scvi_data.labels = known_doublets.astype(int)
    params = {
        "n_hidden": n_hidden,
        "n_latent": n_latent,
        "cl_hidden": cl_hidden,
        "cl_layers": cl_layers,
        "dropout_rate": dropout_rate,
        "learning_rate": learning_rate,
        "valid_pct": valid_pct
    }
    # set VAE params
    vae_params = {}
    for par in [
            'n_hidden', 'n_latent', 'n_layers', 'dropout_rate', 'ignore_batch'
    ]:
        if par in params:
            vae_params[par] = params[par]
    vae_params['n_batch'] = 0 if params.get('ignore_batch',
                                            False) else scvi_data.n_batches
    # training parameters
    valid_pct = params.get('valid_pct', 0.1)
    learning_rate = params.get('learning_rate', 1e-3)
    stopping_params = {'patience': params.get('patience', 10), 'threshold': 0}
    ##################################################
    # VAE
    vae = VAE(n_input=singlet_scvi_data.nb_genes,
              n_labels=2,
              reconstruction_loss='nb',
              log_variational=True,
              **vae_params)
    if seed:
        if gpu:
            device = torch.device('cuda')
            vae.load_state_dict(torch.load(os.path.join(seed, 'vae.pt')))
            vae.to(device)
        else:
            map_loc = 'cpu'
            vae.load_state_dict(
                torch.load(os.path.join(seed, 'vae.pt'), map_location=map_loc))
        # copy latent representation
        latent_file = os.path.join(seed, 'latent.npy')
        if os.path.isfile(latent_file):
            shutil.copy(latent_file, os.path.join(out_dir, 'latent.npy'))
    else:
        stopping_params['early_stopping_metric'] = 'reconstruction_error'
        stopping_params['save_best_state_metric'] = 'reconstruction_error'
        # initialize unsupervised trainer
        utrainer = \
            UnsupervisedTrainer(vae, singlet_scvi_data,
                                train_size=(1. - valid_pct),
                                frequency=2,
                                metrics_to_monitor=['reconstruction_error'],
                                use_cuda=gpu,
                                early_stopping_kwargs=stopping_params)
        utrainer.history['reconstruction_error_test_set'].append(0)
        # initial epoch
        utrainer.train(n_epochs=2000, lr=learning_rate)
        # drop learning rate and continue
        utrainer.early_stopping.wait = 0
        utrainer.train(n_epochs=500, lr=0.5 * learning_rate)
        # save VAE
        torch.save(vae.state_dict(), os.path.join(out_dir, 'vae.pt'))
        # save latent representation
        full_posterior = utrainer.create_posterior(utrainer.model,
                                                   singlet_scvi_data,
                                                   indices=np.arange(
                                                       len(singlet_scvi_data)))
        latent, _, _ = full_posterior.sequential().get_latent()
        np.save(os.path.join(out_dir, 'latent.npy'), latent.astype('float32'))
    ##################################################
    # simulate doublets
    non_zero_indexes = np.where(singlet_scvi_data.X > 0)
    cells = non_zero_indexes[0]
    genes = non_zero_indexes[1]
    cells_ids = defaultdict(list)
    for cell_id, gene in zip(cells, genes):
        cells_ids[cell_id].append(gene)
    # choose doublets function type
    if doublet_type == 'average':
        doublet_function = create_average_doublet
    elif doublet_type == 'sum':
        doublet_function = create_summed_doublet
    else:
        doublet_function = create_multinomial_doublet
    cell_depths = singlet_scvi_data.X.sum(axis=1)
    num_doublets = int(doublet_ratio * singlet_num_cells)
    if known_doublet_data is not None:
        num_doublets -= known_doublet_data.X.shape[0]
        # make sure we are making a non negative amount of doublets
        assert num_doublets >= 0
    in_silico_doublets = np.zeros((num_doublets, num_genes), dtype='float32')
    # for desired # doublets
    for di in range(num_doublets):
        # sample two cells
        i, j = np.random.choice(singlet_num_cells, size=2)
        # generate doublets
        in_silico_doublets[di, :] = \
            doublet_function(singlet_scvi_data.X, i, j,
                             doublet_depth=doublet_depth,
                             cell_depths=cell_depths, cells_ids=cells_ids)
    # merge datasets
    # we can maybe up sample the known doublets
    # concatentate
    classifier_data = GeneExpressionDataset()
    classifier_data.populate_from_data(
        X=np.vstack([scvi_data.X, in_silico_doublets]),
        labels=np.hstack(
            [np.ravel(scvi_data.labels),
             np.ones(in_silico_doublets.shape[0])]),
        remap_attributes=False)
    assert (len(np.unique(classifier_data.labels.flatten())) == 2)
    ##################################################
    # classifier
    # model
    classifier = Classifier(n_input=(vae.n_latent + 1),
                            n_hidden=params['cl_hidden'],
                            n_layers=params['cl_layers'],
                            n_labels=2,
                            dropout_rate=params['dropout_rate'])
    # trainer
    stopping_params['early_stopping_metric'] = 'accuracy'
    stopping_params['save_best_state_metric'] = 'accuracy'
    strainer = ClassifierTrainer(classifier,
                                 classifier_data,
                                 train_size=(1. - valid_pct),
                                 frequency=2,
                                 metrics_to_monitor=['accuracy'],
                                 use_cuda=gpu,
                                 sampling_model=vae,
                                 sampling_zl=True,
                                 early_stopping_kwargs=stopping_params)
    # initial
    strainer.train(n_epochs=1000, lr=learning_rate)
    # drop learning rate and continue
    strainer.early_stopping.wait = 0
    strainer.train(n_epochs=300, lr=0.1 * learning_rate)
    torch.save(classifier.state_dict(), os.path.join(out_dir, 'classifier.pt'))
    ##################################################
    # post-processing
    # use logits for predictions for better results
    logits_classifier = Classifier(n_input=(vae.n_latent + 1),
                                   n_hidden=params['cl_hidden'],
                                   n_layers=params['cl_layers'],
                                   n_labels=2,
                                   dropout_rate=params['dropout_rate'],
                                   logits=True)
    logits_classifier.load_state_dict(classifier.state_dict())
    # using logits leads to better performance for ranking
    logits_strainer = ClassifierTrainer(logits_classifier,
                                        classifier_data,
                                        train_size=(1. - valid_pct),
                                        frequency=2,
                                        metrics_to_monitor=['accuracy'],
                                        use_cuda=gpu,
                                        sampling_model=vae,
                                        sampling_zl=True,
                                        early_stopping_kwargs=stopping_params)
    # models evaluation mode
    vae.eval()
    classifier.eval()
    logits_classifier.eval()
    print('Train accuracy: %.4f' % strainer.train_set.accuracy())
    print('Test accuracy:  %.4f' % strainer.test_set.accuracy())
    # compute predictions manually
    # output logits
    train_y, train_score = strainer.train_set.compute_predictions(soft=True)
    test_y, test_score = strainer.test_set.compute_predictions(soft=True)
    # train_y == true label
    # train_score[:, 0] == singlet score; train_score[:, 1] == doublet score
    train_score = train_score[:, 1]
    train_y = train_y.astype('bool')
    test_score = test_score[:, 1]
    test_y = test_y.astype('bool')
    train_auroc = roc_auc_score(train_y, train_score)
    test_auroc = roc_auc_score(test_y, test_score)
    print('Train AUROC: %.4f' % train_auroc)
    print('Test AUROC:  %.4f' % test_auroc)
    train_fpr, train_tpr, train_t = roc_curve(train_y, train_score)
    test_fpr, test_tpr, test_t = roc_curve(test_y, test_score)
    train_t = np.minimum(train_t, 1 + 1e-9)
    test_t = np.minimum(test_t, 1 + 1e-9)
    train_acc = np.zeros(len(train_t))
    for i in range(len(train_t)):
        train_acc[i] = np.mean(train_y == (train_score > train_t[i]))
    test_acc = np.zeros(len(test_t))
    for i in range(len(test_t)):
        test_acc[i] = np.mean(test_y == (test_score > test_t[i]))
    # write predictions
    # softmax predictions
    order_y, order_score = strainer.compute_predictions(soft=True)
    _, order_pred = strainer.compute_predictions()
    doublet_score = order_score[:, 1]
    np.save(os.path.join(out_dir, 'softmax_scores.npy'),
            doublet_score[:num_cells])
    np.save(os.path.join(out_dir, 'softmax_scores_sim.npy'),
            doublet_score[num_cells:])
    # logit predictions
    logit_y, logit_score = logits_strainer.compute_predictions(soft=True)
    logit_doublet_score = logit_score[:, 1]
    np.save(os.path.join(out_dir, 'logit_scores.npy'),
            logit_doublet_score[:num_cells])
    np.save(os.path.join(out_dir, 'logit_scores_sim.npy'),
            logit_doublet_score[num_cells:])
    if expected_number_of_doublets is not None:
        solo_scores = doublet_score[:num_cells]
        k = len(solo_scores) - expected_number_of_doublets
        if expected_number_of_doublets / len(solo_scores) > .5:
            print(
                'Make sure you actually expect more than half your cells to be doublets. If not change your -e parameter value'
            )
        assert k > 0
        idx = np.argpartition(solo_scores, k)
        threshold = np.max(solo_scores[idx[:k]])
        is_solo_doublet = doublet_score > threshold
    else:
        is_solo_doublet = order_pred[:num_cells]
    is_doublet = known_doublets
    new_doublets_idx = np.where(~(is_doublet) & is_solo_doublet[:num_cells])[0]
    is_doublet[new_doublets_idx] = True
    np.save(os.path.join(out_dir, 'is_doublet.npy'), is_doublet[:num_cells])
    np.save(os.path.join(out_dir, 'is_doublet_sim.npy'),
            is_doublet[num_cells:])
    np.save(os.path.join(out_dir, 'preds.npy'), order_pred[:num_cells])
    np.save(os.path.join(out_dir, 'preds_sim.npy'), order_pred[num_cells:])
    if plot:
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        import seaborn as sns
        # plot ROC
        plt.figure()
        plt.plot(train_fpr, train_tpr, label='Train')
        plt.plot(test_fpr, test_tpr, label='Test')
        plt.gca().set_xlabel('False positive rate')
        plt.gca().set_ylabel('True positive rate')
        plt.legend()
        plt.savefig(os.path.join(out_dir, 'roc.pdf'))
        plt.close()
        # plot accuracy
        plt.figure()
        plt.plot(train_t, train_acc, label='Train')
        plt.plot(test_t, test_acc, label='Test')
        plt.axvline(0.5, color='black', linestyle='--')
        plt.gca().set_xlabel('Threshold')
        plt.gca().set_ylabel('Accuracy')
        plt.legend()
        plt.savefig(os.path.join(out_dir, 'accuracy.pdf'))
        plt.close()
        # plot distributions
        plt.figure()
        sns.distplot(test_score[test_y], label='Simulated')
        sns.distplot(test_score[~test_y], label='Observed')
        plt.legend()
        plt.savefig(os.path.join(out_dir, 'train_v_test_dist.pdf'))
        plt.close()
        plt.figure()
        sns.distplot(doublet_score[:num_cells], label='Simulated')
        plt.legend()
        plt.savefig(os.path.join(out_dir, 'real_cells_dist.pdf'))
        plt.close()
Example #23
# NOTE: this example begins mid-statement in the source; the orphaned fragment of
# the call that builds `hemat_data` is preserved below as a comment.
#                                                    hemat_batch_2)

hemat_vae = VAE(hemat_data.nb_genes,
                n_batch=hemat_data.n_batches,
                n_labels=hemat_data.n_labels,
                n_hidden=128,
                n_latent=30,
                n_layers=2,
                dispersion='gene')

hemat_trainer = UnsupervisedTrainer(hemat_vae, hemat_data, train_size=0.9)

hemat_trainer.train(n_epochs=100)

hemat_full = hemat_trainer.create_posterior(hemat_trainer.model,
                                            hemat_data,
                                            indices=np.arange(len(hemat_data)))
hemat_latent, hemat_batch_indices, hemat_labels = hemat_full.sequential(
).get_latent()
hemat_batch_indices = hemat_batch_indices.ravel()

np.savetxt("scVI_hemat_v1_latent_0716.txt",
           hemat_latent,
           fmt="%10.9f",
           delimiter="\t")

hemat_adata_latent = sc.AnnData(hemat_latent)
sc.pp.neighbors(hemat_adata_latent,
                use_rep='X',
                n_neighbors=30,
                metric='minkowski')
Example #24
def scvi(
    adata: AnnData,
    n_hidden: int = 128,
    n_latent: int = 10,
    n_layers: int = 1,
    dispersion: str = "gene",
    n_epochs: int = 400,
    lr: float = 1e-3,
    train_size: float = 1.0,
    batch_key: Optional[str] = None,
    use_highly_variable_genes: bool = True,
    subset_genes: Optional[Sequence[Union[int, str]]] = None,
    linear_decoder: bool = False,
    copy: bool = False,
    use_cuda: bool = True,
    return_posterior: bool = True,
    trainer_kwargs: dict = {},
    model_kwargs: dict = {},
) -> Optional[AnnData]:
    """\
    SCVI [Lopez18]_.

    Fits scVI model onto raw count data given an anndata object

    scVI uses stochastic optimization and deep neural networks to aggregate information 
    across similar cells and genes and to approximate the distributions that underlie
    observed expression values, while accounting for batch effects and limited sensitivity.

    To use a linear-decoded Variational AutoEncoder model (implementation of [Svensson20]_),
    set linear_decoder = True. Compared to the standard VAE, this model is less powerful, but it
    can be used to inspect which genes contribute to variation in the dataset. It may also be used
    for all scVI tasks, like differential expression, batch correction, imputation, etc.
    However, batch correction may be less powerful as it assumes a linear model.

    .. note::
        More information and bug reports `here <https://github.com/YosefLab/scVI>`__.

    Parameters
    ----------
    adata
        An anndata file with `X` attribute of unnormalized count data
    n_hidden
        Number of nodes per hidden layer
    n_latent
        Dimensionality of the latent space
    n_layers
        Number of hidden layers used for encoder and decoder NNs
    dispersion
        One of the following
        * `'gene'` - dispersion parameter of NB is constant per gene across cells
        * `'gene-batch'` - dispersion can differ between different batches
        * `'gene-label'` - dispersion can differ between different labels
        * `'gene-cell'` - dispersion can differ for every gene in every cell
    n_epochs
        Number of epochs to train
    lr
        Learning rate
    train_size
        The train size, either a float between 0 and 1 or an integer for the number of training samples to use
    batch_key
        Column name in anndata.obs for batches. 
        If None, no batch correction is performed
        If not None, batch correction is performed per batch category
    use_highly_variable_genes
        If true, uses only the genes in anndata.var["highly_variable"]
    subset_genes
        Optional list of indices or gene names to subset anndata. 
        If not None, use_highly_variable_genes is ignored
    linear_decoder
        If true, uses LDVAE model, which is an implementation of [Svensson20]_.
    copy
        If true, a copy of anndata is returned
    return_posterior
        If true, posterior object is returned
    use_cuda
        If true, uses cuda
    trainer_kwargs
        Extra arguments for UnsupervisedTrainer
    model_kwargs
        Extra arguments for VAE or LDVAE model
    
    Returns
    -------
    If `copy` is true, anndata is returned.
    If `return_posterior` is true, the posterior object is returned
    If both `copy` and `return_posterior` are true, 
    a tuple of anndata and the posterior are returned in that order. 

    `adata.obsm['X_scvi']` stores the latent representations
    `adata.obsm['X_scvi_denoised']` stores the normalized mean of the negative binomial
    `adata.obsm['X_scvi_sample_rate']` stores the mean of the negative binomial
    
    If linear_decoder is true:
    `adata.uns['ldvae_loadings']` stores the per-gene weights in the linear decoder as a
    genes by n_latent matrix.

    """
    warnings.warn(
        "scvi via scanpy external API is no longer supported. " +
        "Please use the new scvi-tools package from `scvi-tools.org`",
        FutureWarning,
    )

    try:
        from scvi.models import VAE, LDVAE
        from scvi.inference import UnsupervisedTrainer
        from scvi.dataset import AnnDatasetFromAnnData
    except ImportError:
        raise ImportError(
            "Please install scvi package from https://github.com/YosefLab/scVI"
        )

    # check if observations are unnormalized using first 10
    # code from: https://github.com/theislab/dca/blob/89eee4ed01dd969b3d46e0c815382806fbfc2526/dca/io.py#L63-L69
    if len(adata) > 10:
        X_subset = adata.X[:10]
    else:
        X_subset = adata.X
    norm_error = (
        'Make sure that the dataset (adata.X) contains unnormalized count data.'
    )
    if sp.sparse.issparse(X_subset):
        assert (X_subset.astype(int) != X_subset).nnz == 0, norm_error
    else:
        assert np.all(X_subset.astype(int) == X_subset), norm_error

    if subset_genes is not None:
        adata_subset = adata[:, subset_genes]
    elif use_highly_variable_genes and "highly_variable" in adata.var:
        adata_subset = adata[:, adata.var["highly_variable"]]
    else:
        adata_subset = adata

    if batch_key is not None:
        codes, uniques = pd.factorize(adata_subset.obs[batch_key])
        adata_subset.obs['_tmp_scvi_batch'] = codes
        n_batches = len(uniques)
    else:
        n_batches = 0

    dataset = AnnDatasetFromAnnData(adata_subset.copy(),
                                    batch_label='_tmp_scvi_batch')

    if linear_decoder:
        vae = LDVAE(
            n_input=dataset.nb_genes,
            n_batch=n_batches,
            n_labels=dataset.n_labels,
            n_hidden=n_hidden,
            n_latent=n_latent,
            n_layers_encoder=n_layers,
            dispersion=dispersion,
            **model_kwargs,
        )

    else:
        vae = VAE(
            dataset.nb_genes,
            n_batch=n_batches,
            n_labels=dataset.n_labels,
            n_hidden=n_hidden,
            n_latent=n_latent,
            n_layers=n_layers,
            dispersion=dispersion,
            **model_kwargs,
        )

    trainer = UnsupervisedTrainer(
        model=vae,
        gene_dataset=dataset,
        use_cuda=use_cuda,
        train_size=train_size,
        **trainer_kwargs,
    )

    trainer.train(n_epochs=n_epochs, lr=lr)

    full = trainer.create_posterior(trainer.model,
                                    dataset,
                                    indices=np.arange(len(dataset)))
    latent, batch_indices, labels = full.sequential().get_latent()

    if copy:
        adata = adata.copy()

    adata.obsm['X_scvi'] = latent
    adata.obsm['X_scvi_denoised'] = full.sequential().get_sample_scale()
    adata.obsm['X_scvi_sample_rate'] = full.sequential().imputation()

    if linear_decoder:
        loadings = vae.get_loadings()
        df = pd.DataFrame(loadings, index=adata_subset.var_names)
        adata.uns['ldvae_loadings'] = df

    if copy and return_posterior:
        return adata, full
    elif copy:
        return adata
    elif return_posterior:
        return full
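
A minimal usage sketch for the wrapper documented above, assuming it is exposed
as `scanpy.external.pp.scvi`; the file name and the `'batch'` column are
hypothetical:

import scanpy as sc
import scanpy.external as sce

adata = sc.read_h5ad('counts.h5ad')        # raw, unnormalized counts in adata.X
sc.pp.highly_variable_genes(adata, flavor='seurat_v3', n_top_genes=2000)
posterior = sce.pp.scvi(adata, batch_key='batch', n_epochs=100,
                        use_highly_variable_genes=True)   # fills adata.obsm['X_scvi']
sc.pp.neighbors(adata, use_rep='X_scvi')   # downstream analysis on the scVI latent space
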
simulation_vae = VAE(simulation_data.nb_genes,
                     n_batch=simulation_data.n_batches,
                     n_labels=simulation_data.n_labels,
                     n_hidden=128,
                     n_latent=30,
                     n_layers=2,
                     dispersion='gene')

simulation_trainer = UnsupervisedTrainer(simulation_vae,
                                         simulation_data,
                                         train_size=0.9)

simulation_trainer.train(n_epochs=100)

simulation_full = simulation_trainer.create_posterior(
    simulation_trainer.model,
    simulation_data,
    indices=np.arange(len(simulation_data)))
simulation_latent, simulation_batch_indices, simulation_labels = \
    simulation_full.sequential().get_latent()
simulation_batch_indices = simulation_batch_indices.ravel()

np.savetxt("scVI_simulation_v1_latent.txt",
           simulation_latent,
           fmt="%10.9f",
           delimiter="\t")

simulation_adata_latent = sc.AnnData(simulation_latent)
sc.pp.neighbors(simulation_adata_latent,
                use_rep='X',
                n_neighbors=30,
                metric='minkowski')
Example #26
0
    # LOAD
    full_file_save_path = os.path.join(save_path, vae_file_name)
    trainer.model.load_state_dict(torch.load(full_file_save_path))
    trainer.model.eval()
    print('	### ### ###  loaded vae')
    print(datetime.datetime.now())

    # n_epochs = 5
    # lr = 0.001
    # full_file_save_path = os.path.join(save_path, vae_file_name)
    # trainer.train(n_epochs=n_epochs, lr=lr)
    # torch.save(trainer.model.state_dict(), full_file_save_path)
    # train_test_results = pd.DataFrame(trainer.history).rename(columns={'elbo_train_set':'Train', 'elbo_test_set':'Test'})
    # print(train_test_results)

    full = trainer.create_posterior(trainer.model, gene_dataset, indices=np.arange(len(gene_dataset)))
    latent, batch_indices, labels = full.sequential().get_latent()
    batch_indices = batch_indices.ravel()

    print('	### ### ###  computed full posterior')


    print('	### ### ###  url = ', url)
    # read submission csv and fetch selected cells
    submission = pd.read_csv(io.StringIO(requests.get(url).content.decode('utf-8')), index_col=0)
    selected_cells_csv_string = submission.to_csv(index=False).replace('\n', '<br>')

    # reconstruct user email from submission url
    email = url.split('https://aavcells-de.s3.us-west-2.amazonaws.com/submissions/')[1]
    email = email.split('%25')[0]
    email = email.replace('%40', '@')
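    # Worked trace with a hypothetical URL: a submission stored at
    #   .../submissions/user%40example.com%25<rest>.csv
    # yields 'user%40example.com' after the two splits above and
    # 'user@example.com' after replacing '%40' with '@'.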
Example #27
0
File: solo.py Project: yynst2/solo
def main():
    usage = 'solo'
    parser = ArgumentParser(usage,
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument(dest='model_json_file',
                        help='json file to pass VAE parameters')
    parser.add_argument(
        dest='data_path',
        help=
        'path to h5ad, loom or 10x directory containing cell by genes counts')
    parser.add_argument('-d',
                        dest='doublet_depth',
                        default=2.,
                        type=float,
                        help='Depth multiplier for a doublet relative to the \
                        average of its constituents')
    parser.add_argument('-g',
                        dest='gpu',
                        default=True,
                        action='store_true',
                        help='Run on GPU')
    parser.add_argument('-a',
                        dest='anndata_output',
                        default=False,
                        action='store_true',
                        help='output modified anndata object with solo scores. \
                        Only works for anndata input')
    parser.add_argument('-o', dest='out_dir', default='solo_out')
    parser.add_argument('-r',
                        dest='doublet_ratio',
                        default=2.,
                        type=float,
                        help='Ratio of doublets to true \
                        cells')
    parser.add_argument('-s',
                        dest='seed',
                        default=None,
                        help='Path to a previous solo output \
                        directory. Seed VAE models with a previously \
                        trained solo model. The directory structure is assumed \
                        to match the solo output directory structure and \
                        should at least contain vae.pt, a pickled object of \
                        VAE weights, and latent.npy, an np.ndarray of the \
                        latents of your cells.')
    parser.add_argument('-k',
                        dest='known_doublets',
                        help='Experimentally defined doublets tsv file. \
                        Should be a single column of True/False. True \
                        indicates the cell is a doublet. No header.',
                        type=str)
    parser.add_argument('-t',
                        dest='doublet_type',
                        help='Please enter \
                        multinomial, average, or sum',
                        default='multinomial',
                        choices=['multinomial', 'average', 'sum'])
    parser.add_argument('-e',
                        dest='expected_number_of_doublets',
                        help='Experimentally expected number of doublets',
                        type=int,
                        default=None)
    parser.add_argument('-p',
                        dest='plot',
                        default=False,
                        action='store_true',
                        help='Plot outputs for solo')
    parser.add_argument('-l',
                        dest='normal_logging',
                        default=False,
                        action='store_true',
                        help='Logging level set to normal (aka not debug)')
    parser.add_argument('--random_size',
                        dest='randomize_doublet_size',
                        default=False,
                        action='store_true',
                        help='Sample depth multipliers from Unif(1, \
                        DoubletDepth) \
                        to provide a diversity of possible doublet depths.')
    args = parser.parse_args()
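
    # A hypothetical invocation of this CLI (file names are placeholders):
    #   solo -p -o solo_out model_params.json counts.h5ad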

    if not args.normal_logging:
        scvi._settings.set_verbosity(10)

    model_json_file = args.model_json_file
    data_path = args.data_path
    if args.gpu and not torch.cuda.is_available():
        args.gpu = torch.cuda.is_available()
        print('Cuda is not available, switching to cpu running!')

    if not os.path.isdir(args.out_dir):
        os.mkdir(args.out_dir)

    ##################################################
    # data

    # read loom/anndata
    data_ext = os.path.splitext(data_path)[-1]
    if data_ext == '.loom':
        scvi_data = LoomDataset(data_path)
    elif data_ext == '.h5ad':
        adata = anndata.read(data_path)
        if issparse(adata.X):
            adata.X = adata.X.todense()
        scvi_data = AnnDatasetFromAnnData(adata)
    elif os.path.isdir(data_path):
        scvi_data = Dataset10X(save_path=data_path,
                               measurement_names_column=1,
                               dense=True)
        cell_umi_depth = scvi_data.X.sum(axis=1)
        fifth, ninetyfifth = np.percentile(cell_umi_depth, [5, 95])
        min_cell_umi_depth = np.min(cell_umi_depth)
        max_cell_umi_depth = np.max(cell_umi_depth)
        if fifth * 10 < ninetyfifth:
            print("""WARNING YOUR DATA HAS A WIDE RANGE OF CELL DEPTHS.
            PLEASE MANUALLY REVIEW YOUR DATA""")
        print(
            f"Min cell depth: {min_cell_umi_depth}, Max cell depth: {max_cell_umi_depth}"
        )
    else:
        msg = f'{data_path} is not a recognized format.\n'
        msg += 'must be one of {h5ad, loom, 10x directory}'
        raise TypeError(msg)

    num_cells, num_genes = scvi_data.X.shape

    if args.known_doublets is not None:
        print('Removing known doublets for in silico doublet generation')
        print('Make sure known doublets are in the same order as your data')
        known_doublets = np.loadtxt(args.known_doublets, dtype=str) == 'True'

        assert len(known_doublets) == scvi_data.X.shape[0]
        known_doublet_data = make_gene_expression_dataset(
            scvi_data.X[known_doublets], scvi_data.gene_names)
        known_doublet_data.labels = np.ones(known_doublet_data.X.shape[0])
        singlet_scvi_data = make_gene_expression_dataset(
            scvi_data.X[~known_doublets], scvi_data.gene_names)
        singlet_num_cells, _ = singlet_scvi_data.X.shape
    else:
        known_doublet_data = None
        singlet_num_cells = num_cells
        known_doublets = np.zeros(num_cells, dtype=bool)
        singlet_scvi_data = scvi_data
    singlet_scvi_data.labels = np.zeros(singlet_scvi_data.X.shape[0])
    scvi_data.labels = known_doublets.astype(int)
    ##################################################
    # parameters

    # check for parameters
    if not os.path.exists(model_json_file):
        raise FileNotFoundError(f'{model_json_file} does not exist.')
    # read parameters
    with open(model_json_file, 'r') as model_json_open:
        params = json.load(model_json_open)

    # set VAE params
    vae_params = {}
    for par in [
            'n_hidden', 'n_latent', 'n_layers', 'dropout_rate', 'ignore_batch'
    ]:
        if par in params:
            vae_params[par] = params[par]
    vae_params['n_batch'] = 0 if params.get('ignore_batch',
                                            False) else scvi_data.n_batches

    # training parameters
    batch_size = params.get('batch_size', 128)
    valid_pct = params.get('valid_pct', 0.1)
    learning_rate = params.get('learning_rate', 1e-3)
    stopping_params = {'patience': params.get('patience', 10), 'threshold': 0}

    # protect against single example batch
    while num_cells % batch_size == 1:
        batch_size = int(np.round(1.25 * batch_size))
        print('Increasing batch_size to %d to avoid single example batch.' %
              batch_size)
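
    # A hypothetical model_json_file covering the keys read above plus the
    # classifier keys required further below (values are illustrative only):
    #   {
    #     "n_hidden": 128, "n_latent": 16, "n_layers": 1, "dropout_rate": 0.2,
    #     "ignore_batch": false,
    #     "batch_size": 128, "valid_pct": 0.1, "learning_rate": 0.001, "patience": 10,
    #     "cl_hidden": 64, "cl_layers": 1
    #   }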

    ##################################################
    # VAE

    vae = VAE(n_input=singlet_scvi_data.nb_genes,
              n_labels=2,
              reconstruction_loss='nb',
              log_variational=True,
              **vae_params)

    if args.seed:
        if args.gpu:
            device = torch.device('cuda')
            vae.load_state_dict(torch.load(os.path.join(args.seed, 'vae.pt')))
            vae.to(device)
        else:
            map_loc = 'cpu'
            vae.load_state_dict(
                torch.load(os.path.join(args.seed, 'vae.pt'),
                           map_location=map_loc))

        # save latent representation
        utrainer = \
            UnsupervisedTrainer(vae, singlet_scvi_data,
                                train_size=(1. - valid_pct),
                                frequency=2,
                                metrics_to_monitor=['reconstruction_error'],
                                use_cuda=args.gpu,
                                early_stopping_kwargs=stopping_params,
                                batch_size=batch_size)

        full_posterior = utrainer.create_posterior(utrainer.model,
                                                   singlet_scvi_data,
                                                   indices=np.arange(
                                                       len(singlet_scvi_data)))
        latent, _, _ = full_posterior.sequential(batch_size).get_latent()
        np.save(os.path.join(args.out_dir, 'latent.npy'),
                latent.astype('float32'))

    else:
        stopping_params['early_stopping_metric'] = 'reconstruction_error'
        stopping_params['save_best_state_metric'] = 'reconstruction_error'

        # initialize unsupervised trainer
        utrainer = \
            UnsupervisedTrainer(vae, singlet_scvi_data,
                                train_size=(1. - valid_pct),
                                frequency=2,
                                metrics_to_monitor=['reconstruction_error'],
                                use_cuda=args.gpu,
                                early_stopping_kwargs=stopping_params,
                                batch_size=batch_size)
        utrainer.history['reconstruction_error_test_set'].append(0)
        # initial epoch
        utrainer.train(n_epochs=2000, lr=learning_rate)

        # drop learning rate and continue
        utrainer.early_stopping.wait = 0
        utrainer.train(n_epochs=500, lr=0.5 * learning_rate)

        # save VAE
        torch.save(vae.state_dict(), os.path.join(args.out_dir, 'vae.pt'))

        # save latent representation
        full_posterior = utrainer.create_posterior(utrainer.model,
                                                   singlet_scvi_data,
                                                   indices=np.arange(
                                                       len(singlet_scvi_data)))
        latent, _, _ = full_posterior.sequential(batch_size).get_latent()
        np.save(os.path.join(args.out_dir, 'latent.npy'),
                latent.astype('float32'))

    ##################################################
    # simulate doublets

    non_zero_indexes = np.where(singlet_scvi_data.X > 0)
    cells = non_zero_indexes[0]
    genes = non_zero_indexes[1]
    cells_ids = defaultdict(list)
    for cell_id, gene in zip(cells, genes):
        cells_ids[cell_id].append(gene)

    # choose doublets function type
    if args.doublet_type == 'average':
        doublet_function = create_average_doublet
    elif args.doublet_type == 'sum':
        doublet_function = create_summed_doublet
    else:
        doublet_function = create_multinomial_doublet
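
    # Each in silico doublet combines the counts of a random pair of cells;
    # judging by the helper names, the pair is either averaged, summed, or
    # resampled from a multinomial, with total depth scaled by doublet_depth
    # relative to the average depth of the pair (see the -d help text above).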

    cell_depths = singlet_scvi_data.X.sum(axis=1)
    num_doublets = int(args.doublet_ratio * singlet_num_cells)
    if known_doublet_data is not None:
        num_doublets -= known_doublet_data.X.shape[0]
        # make sure we are making a non-negative number of doublets
        assert num_doublets >= 0

    in_silico_doublets = np.zeros((num_doublets, num_genes), dtype='float32')
    # generate the desired number of doublets
    for di in range(num_doublets):
        # sample two cells
        i, j = np.random.choice(singlet_num_cells, size=2)

        # generate doublets
        in_silico_doublets[di, :] = \
            doublet_function(singlet_scvi_data.X, i, j,
                             doublet_depth=args.doublet_depth,
                             cell_depths=cell_depths, cells_ids=cells_ids,
                             randomize_doublet_size=args.randomize_doublet_size)

    # merge datasets
    # we can maybe up sample the known doublets
    # concatenate observed cells and simulated doublets
    classifier_data = GeneExpressionDataset()
    classifier_data.populate_from_data(
        X=np.vstack([scvi_data.X, in_silico_doublets]),
        labels=np.hstack(
            [np.ravel(scvi_data.labels),
             np.ones(in_silico_doublets.shape[0])]),
        remap_attributes=False)

    assert (len(np.unique(classifier_data.labels.flatten())) == 2)

    ##################################################
    # classifier

    # model
    classifier = Classifier(n_input=(vae.n_latent + 1),
                            n_hidden=params['cl_hidden'],
                            n_layers=params['cl_layers'],
                            n_labels=2,
                            dropout_rate=params['dropout_rate'])

    # trainer
    stopping_params['early_stopping_metric'] = 'accuracy'
    stopping_params['save_best_state_metric'] = 'accuracy'
    strainer = ClassifierTrainer(classifier,
                                 classifier_data,
                                 train_size=(1. - valid_pct),
                                 frequency=2,
                                 metrics_to_monitor=['accuracy'],
                                 use_cuda=args.gpu,
                                 sampling_model=vae,
                                 sampling_zl=True,
                                 early_stopping_kwargs=stopping_params,
                                 batch_size=batch_size)

    # initial
    strainer.train(n_epochs=1000, lr=learning_rate)

    # drop learning rate and continue
    strainer.early_stopping.wait = 0
    strainer.train(n_epochs=300, lr=0.1 * learning_rate)
    torch.save(classifier.state_dict(),
               os.path.join(args.out_dir, 'classifier.pt'))

    ##################################################
    # post-processing
    # use logits for predictions for better results
    logits_classifier = Classifier(n_input=(vae.n_latent + 1),
                                   n_hidden=params['cl_hidden'],
                                   n_layers=params['cl_layers'],
                                   n_labels=2,
                                   dropout_rate=params['dropout_rate'],
                                   logits=True)
    logits_classifier.load_state_dict(classifier.state_dict())

    # using logits leads to better performance for ranking
    logits_strainer = ClassifierTrainer(logits_classifier,
                                        classifier_data,
                                        train_size=(1. - valid_pct),
                                        frequency=2,
                                        metrics_to_monitor=['accuracy'],
                                        use_cuda=args.gpu,
                                        sampling_model=vae,
                                        sampling_zl=True,
                                        early_stopping_kwargs=stopping_params,
                                        batch_size=batch_size)

    # models evaluation mode
    vae.eval()
    classifier.eval()
    logits_classifier.eval()

    print('Train accuracy: %.4f' % strainer.train_set.accuracy())
    print('Test accuracy:  %.4f' % strainer.test_set.accuracy())

    # compute predictions manually
    # output logits
    train_y, train_score = strainer.train_set.compute_predictions(soft=True)
    test_y, test_score = strainer.test_set.compute_predictions(soft=True)
    # train_y == true label
    # train_score[:, 0] == singlet score; train_score[:, 1] == doublet score
    train_score = train_score[:, 1]
    train_y = train_y.astype('bool')
    test_score = test_score[:, 1]
    test_y = test_y.astype('bool')

    train_auroc = roc_auc_score(train_y, train_score)
    test_auroc = roc_auc_score(test_y, test_score)

    print('Train AUROC: %.4f' % train_auroc)
    print('Test AUROC:  %.4f' % test_auroc)

    train_fpr, train_tpr, train_t = roc_curve(train_y, train_score)
    test_fpr, test_tpr, test_t = roc_curve(test_y, test_score)
    train_t = np.minimum(train_t, 1 + 1e-9)
    test_t = np.minimum(test_t, 1 + 1e-9)

    train_acc = np.zeros(len(train_t))
    for i in range(len(train_t)):
        train_acc[i] = np.mean(train_y == (train_score > train_t[i]))
    test_acc = np.zeros(len(test_t))
    for i in range(len(test_t)):
        test_acc[i] = np.mean(test_y == (test_score > test_t[i]))

    # write predictions
    # softmax predictions
    order_y, order_score = strainer.compute_predictions(soft=True)
    _, order_pred = strainer.compute_predictions()
    doublet_score = order_score[:, 1]
    np.save(os.path.join(args.out_dir, 'no_updates_softmax_scores.npy'),
            doublet_score[:num_cells])
    np.save(os.path.join(args.out_dir, 'no_updates_softmax_scores_sim.npy'),
            doublet_score[num_cells:])

    # logit predictions
    logit_y, logit_score = logits_strainer.compute_predictions(soft=True)
    logit_doublet_score = logit_score[:, 1]
    np.save(os.path.join(args.out_dir, 'logit_scores.npy'),
            logit_doublet_score[:num_cells])
    np.save(os.path.join(args.out_dir, 'logit_scores_sim.npy'),
            logit_doublet_score[num_cells:])

    # update threshold as a function of Solo's estimate of the number of
    # doublets
    # essentially a log odds update
    # TODO put in a function
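    #
    # Reading the loop below: with d_s the doublet fraction implied by the
    # doublet ratio (-r) and d_o the mean of the current scores, the calibration
    # constant is
    #   c = log(d_o / (1 - d_o)) - log(d_s / (1 - d_s))
    # and the scores are re-mapped through sigmoid(logit_score + c) until the
    # observed doublet fraction stabilizes.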
    diff = np.inf
    counter_update = 0
    solo_scores = doublet_score[:num_cells]
    logit_scores = logit_doublet_score[:num_cells]
    d_s = (args.doublet_ratio / (args.doublet_ratio + 1))
    while (diff > .01) | (counter_update < 5):

        # calculate log odds calibration for the logits
        d_o = np.mean(solo_scores)
        c = np.log(d_o / (1 - d_o)) - np.log(d_s / (1 - d_s))

        # update solo scores
        solo_scores = 1 / (1 + np.exp(-(logit_scores + c)))

        # update while conditions
        diff = np.abs(d_o - np.mean(solo_scores))
        counter_update += 1

    np.save(os.path.join(args.out_dir, 'softmax_scores.npy'), solo_scores)

    if args.expected_number_of_doublets is not None:
        k = len(solo_scores) - args.expected_number_of_doublets
        if args.expected_number_of_doublets / len(solo_scores) > .5:
            print('''Make sure you actually expect more than half your cells
                   to be doublets. If not change your
                   -e parameter value''')
        assert k > 0
        idx = np.argpartition(solo_scores, k)
        threshold = np.max(solo_scores[idx[:k]])
        is_solo_doublet = solo_scores > threshold
    else:
        is_solo_doublet = solo_scores > .5

    is_doublet = known_doublets
    new_doublets_idx = np.where(~(is_doublet) & is_solo_doublet[:num_cells])[0]
    is_doublet[new_doublets_idx] = True

    np.save(os.path.join(args.out_dir, 'is_doublet.npy'),
            is_doublet[:num_cells])
    np.save(os.path.join(args.out_dir, 'is_doublet_sim.npy'),
            is_doublet[num_cells:])

    np.save(os.path.join(args.out_dir, 'preds.npy'), order_pred[:num_cells])
    np.save(os.path.join(args.out_dir, 'preds_sim.npy'),
            order_pred[num_cells:])

    smoothed_preds = knn_smooth_pred_class(X=latent,
                                           pred_class=is_doublet[:num_cells])
    np.save(os.path.join(args.out_dir, 'smoothed_preds.npy'), smoothed_preds)

    if args.anndata_output and data_ext == '.h5ad':
        adata.obs['is_doublet'] = is_doublet[:num_cells]
        adata.obs['logit_scores'] = logit_doublet_score[:num_cells]
        adata.obs['softmax_scores'] = doublet_score[:num_cells]
        adata.write(os.path.join(args.out_dir, "soloed.h5ad"))

    if args.plot:
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        import seaborn as sns
        # plot ROC
        plt.figure()
        plt.plot(train_fpr, train_tpr, label='Train')
        plt.plot(test_fpr, test_tpr, label='Test')
        plt.gca().set_xlabel('False positive rate')
        plt.gca().set_ylabel('True positive rate')
        plt.legend()
        plt.savefig(os.path.join(args.out_dir, 'roc.pdf'))
        plt.close()

        # plot accuracy
        plt.figure()
        plt.plot(train_t, train_acc, label='Train')
        plt.plot(test_t, test_acc, label='Test')
        plt.axvline(0.5, color='black', linestyle='--')
        plt.gca().set_xlabel('Threshold')
        plt.gca().set_ylabel('Accuracy')
        plt.legend()
        plt.savefig(os.path.join(args.out_dir, 'accuracy.pdf'))
        plt.close()

        # plot distributions
        plt.figure()
        sns.distplot(test_score[test_y], label='Simulated')
        sns.distplot(test_score[~test_y], label='Observed')
        plt.legend()
        plt.savefig(os.path.join(args.out_dir, 'train_v_test_dist.pdf'))
        plt.close()

        plt.figure()
        sns.distplot(doublet_score[:num_cells], label='Observed')
        plt.legend()
        plt.savefig(os.path.join(args.out_dir, 'real_cells_dist.pdf'))
        plt.close()

        scvi_umap = umap.UMAP(n_neighbors=16).fit_transform(latent)
        fig, ax = plt.subplots(1, 1, figsize=(10, 10))
        ax.scatter(scvi_umap[:, 0],
                   scvi_umap[:, 1],
                   c=doublet_score[:num_cells],
                   s=8,
                   cmap="GnBu")

        ax.set_xlabel("UMAP 1")
        ax.set_ylabel("UMAP 2")
        ax.set_xticks([])
        ax.set_yticks([])
        fig.savefig(os.path.join(args.out_dir, 'umap_solo_scores.pdf'))
Example #28
0
def runScvi(adata, batch, hvg=None):
    # Use non-normalized (count) data for scvi!
    # Expects data only on HVGs

    checkSanity(adata, batch, hvg)

    # Check for counts data layer
    if 'counts' not in adata.layers:
        raise TypeError(
            'AnnData object does not contain a `counts` layer in `adata.layers["counts"]`'
        )

    from scvi.models import VAE
    from scvi.inference import UnsupervisedTrainer
    from sklearn.preprocessing import LabelEncoder
    from scvi.dataset import AnnDatasetFromAnnData

    # Defaults from SCVI github tutorials scanpy_pbmc3k and harmonization
    n_epochs = np.min([round((20000 / adata.n_obs) * 400), 400])
    n_latent = 30
    n_hidden = 128
    n_layers = 2
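
    # Worked example of the epoch heuristic above: a hypothetical dataset with
    # adata.n_obs = 40,000 cells gives round((20000 / 40000) * 400) = 200 epochs,
    # while anything under 20,000 cells is capped at the 400-epoch maximum.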

    net_adata = adata.copy()
    net_adata.X = adata.layers['counts']
    del net_adata.layers['counts']
    # Ensure that the raw counts are not accidentally used
    del net_adata.raw  # Note that this only works from anndata 0.7

    # Define batch indices
    le = LabelEncoder()
    net_adata.obs['batch_indices'] = le.fit_transform(
        net_adata.obs[batch].values)

    net_adata = AnnDatasetFromAnnData(net_adata)

    vae = VAE(
        net_adata.nb_genes,
        reconstruction_loss='nb',
        n_batch=net_adata.n_batches,
        n_layers=n_layers,
        n_latent=n_latent,
        n_hidden=n_hidden,
    )

    trainer = UnsupervisedTrainer(
        vae,
        net_adata,
        train_size=1.0,
        use_cuda=False,
    )

    trainer.train(n_epochs=n_epochs, lr=1e-3)

    full = trainer.create_posterior(trainer.model,
                                    net_adata,
                                    indices=np.arange(len(net_adata)))
    latent, _, _ = full.sequential().get_latent()

    adata.obsm['X_emb'] = latent

    return adata
Example #29
0
                    n_latent=30,
                    n_layers=2,
                    dispersion='gene',
                )

                print('Prepare the trainer')
                trainer = UnsupervisedTrainer(vae, all_dataset, train_size=1.0)

                print('Train neural network')
                n_epochs = 100
                trainer.train(n_epochs=n_epochs)

                print('Get posteriors (latent space)')
                full = trainer.create_posterior(
                    trainer.model,
                    all_dataset,
                    indices=np.arange(len(all_dataset)),
                )
                latent, batch_indices, labels = full.sequential().get_latent()
                batch_indices = batch_indices.ravel()

                print('Use scanpy and Leiden to cluster in latent space')
                adata_latent = sc.AnnData(latent)
                sc.pp.neighbors(adata_latent,
                                use_rep='X',
                                n_neighbors=30,
                                metric='minkowski')
                sc.tl.leiden(adata_latent, resolution=0.8)
                clusters = adata_latent.obs['leiden'].astype(str).values
Example #30
0
def benchmark_scvi(dataset, dataset_name, cfg, **kwargs):
    log_name = dataset_name
    n_genes = min(dataset.X.shape[1], cfg.n_genes)

    vae = VAE(
        dataset.nb_genes,
        n_batch=dataset.n_batches,
        n_labels=dataset.n_labels,
        n_hidden=128,
        n_latent=30,
        n_layers=2,
        dispersion="gene",
    )

    trainer = UnsupervisedTrainer(vae, dataset, train_size=0.75)
    n_epochs = cfg.epochs if "epochs" not in kwargs else kwargs["epochs"]
    trainer.train(n_epochs=n_epochs)

    full = trainer.create_posterior(trainer.model,
                                    dataset,
                                    indices=np.arange(len(dataset)))
    latents, batch_indices, labels = full.sequential().get_latent()

    res = {}
    res["knn purity"] = []
    res["entropy batch mixing"] = []
    res["knn purity"].append(get_knn_purity(latents, labels.reshape((-1, 1))))
    ebm = entropy_batch_mixing(latents, batch_indices)
    res["entropy batch mixing"].append(
        ebm[1] if isinstance(ebm, tuple) else ebm)

    cfg.input_dim = latents.shape[1]
    cfg.count_classes = np.unique(dataset.batch_indices).shape[0]
    cfg.count_labels = np.unique(dataset.labels).shape[0]

    (
        latents_train,
        latents_test,
        batches_train,
        batches_test,
        labels_train,
        labels_test,
    ) = train_test_split(
        latents,
        batch_indices,
        labels,
        test_size=0.25,
        stratify=batch_indices.reshape(-1),
    )

    latents_train = torch.Tensor(latents_train).cuda()
    latents_test = torch.Tensor(latents_test).cuda()
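
    # The scatter calls below build one-hot matrices: for training cell i, row i
    # gets a 1 in the column given by its batch index (and, further down, its
    # label index).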

    batches_train_tensor = torch.zeros(latents_train.shape[0],
                                       cfg.count_classes)
    batches_train_tensor = batches_train_tensor.scatter(
        1,
        LongTensor(batches_train.astype("int16")).view(-1, 1), 1)
    batches_train_tensor = batches_train_tensor.cuda()

    labels_train_tensor = torch.zeros(latents_train.shape[0], cfg.count_labels)
    labels_train_tensor = labels_train_tensor.scatter(
        1,
        LongTensor(labels_train.astype("int16")).view(-1, 1), 1)
    labels_train_tensor = labels_train_tensor.cuda()

    train_dataset = torch.utils.data.TensorDataset(latents_train,
                                                   batches_train_tensor,
                                                   labels_train_tensor)
    dataloader = torch.utils.data.DataLoader(train_dataset,
                                             batch_size=cfg.batch_size)

    cfg.classifier_input_dim = cfg.bottleneck
    ohe_classifier, form_classifier = train_classifiers(
        cfg, dataloader, cfg.count_labels, cfg.count_classes)
    preds_batches = ohe_classifier(latents_test)
    preds_labels = form_classifier(latents_test)

    res["batch classifing accuracy"] = (
        preds_batches.argmax(1).cpu().detach().numpy() == batches_test).mean()
    res["labels classifing accuracy"] = (
        preds_labels.argmax(1).cpu().detach().numpy() == labels_test).mean()

    (Path(cfg.metrics_dir) / 'scVI').mkdir(parents=True, exist_ok=True)
    with open(os.path.join(Path(cfg.metrics_dir) / "scVI", log_name + ".json"),
              "w") as file:
        for key in res.keys():
            if type(key) is not str:
                try:
                    res[str(key)] = res[key]
                except:
                    try:
                        res[repr(key)] = res[key]
                    except:
                        raise TypeError("Unexpected key")
        json.dump(res, file)

    del vae, trainer
    del latents, batch_indices, labels, full
    del preds_batches, preds_labels, train_dataset, dataloader
    cuda.empty_cache()