def trainVAE(gene_dataset, filename, rep, nlayers=2, n_hidden=128,
             reconstruction_loss: str = 'zinb', n_epochs: int = 250):
    """Train a VAE on ``gene_dataset`` (or load cached weights) and return a posterior.

    :param gene_dataset: dataset exposing ``nb_genes`` / ``n_batches`` / ``n_labels``
    :param filename: sub-directory name under ``..`` used to cache model weights
    :param rep: replicate index, used to disambiguate cached weight files
    :param nlayers: number of hidden layers in the VAE
    :param n_hidden: width of each hidden layer
    :param reconstruction_loss: likelihood model name (e.g. ``'zinb'``)
    :param n_epochs: training epochs when no cached weights exist
        (default 250, matching the previously hard-coded value)
    :return: a posterior over all cells of ``gene_dataset``
    """
    vae = VAE(gene_dataset.nb_genes,
              n_batch=gene_dataset.n_batches,
              n_labels=gene_dataset.n_labels,
              n_hidden=n_hidden, n_latent=10, n_layers=nlayers,
              dispersion='gene', reconstruction_loss=reconstruction_loss)
    trainer = UnsupervisedTrainer(vae, gene_dataset, train_size=1.0)
    # Keep the weights path in its own variable instead of clobbering the
    # `filename` parameter; same resulting path as the original concatenation.
    weights_path = os.path.join(
        '..', filename, 'vae.%s.rep%s.pkl' % (reconstruction_loss, rep))
    if os.path.isfile(weights_path):
        # Reuse cached weights instead of retraining.
        trainer.model.load_state_dict(torch.load(weights_path))
        trainer.model.eval()
    else:
        trainer.train(n_epochs=n_epochs)
        torch.save(trainer.model.state_dict(), weights_path)
    full = trainer.create_posterior(trainer.model, gene_dataset,
                                    indices=np.arange(len(gene_dataset)))
    return full
def trainVAE(gene_dataset, rmCellTypes, rep):
    """Train a VAE (or load cached weights) and return latent-space results.

    :param gene_dataset: dataset exposing ``nb_genes`` / ``n_batches`` / ``n_labels``
    :param rmCellTypes: tag naming the removed cell types; part of the cache filename
    :param rep: replicate index; part of the cache filename
    :return: tuple ``(latent, batch_indices, labels, trainer)`` where
        ``batch_indices`` has been flattened with ``ravel()``
    """
    vae = VAE(gene_dataset.nb_genes,
              n_batch=gene_dataset.n_batches,
              n_labels=gene_dataset.n_labels,
              n_hidden=128, n_latent=10, n_layers=2, dispersion='gene')
    trainer = UnsupervisedTrainer(vae, gene_dataset, train_size=1.0)
    # Build the cache path once instead of re-formatting the same string
    # three separate times.
    weights_path = '../NoOverlap/vae.%s%s.pkl' % (rmCellTypes, rep)
    if os.path.isfile(weights_path):
        # Reuse cached weights instead of retraining.
        trainer.model.load_state_dict(torch.load(weights_path))
        trainer.model.eval()
    else:
        trainer.train(n_epochs=150)
        torch.save(trainer.model.state_dict(), weights_path)
    full = trainer.create_posterior(trainer.model, gene_dataset,
                                    indices=np.arange(len(gene_dataset)))
    latent, batch_indices, labels = full.sequential().get_latent()
    batch_indices = batch_indices.ravel()
    return latent, batch_indices, labels, trainer
def compute_scvi_latent(
    adata: sc.AnnData,
    n_latent: int = 5,
    n_epochs: int = 100,
    lr: float = 1e-3,
    use_batches: bool = False,
    use_cuda: bool = True,
) -> Tuple[scvi.inference.Posterior, np.ndarray]:
    """Fit a scVI model on *adata* and sample its latent space.

    :param adata: non-normalized ``sc.AnnData`` object
    :param n_latent: dimension of the latent space
    :param n_epochs: number of training epochs
    :param lr: learning rate
    :param use_batches: whether to model batch effects
    :param use_cuda: whether to train on GPU
    :return: ``(scvi.Posterior, latent_space)``
    """
    # Wrap the AnnData object in scVI's dataset type.
    scvi_dataset = AnnDataset(adata)

    # Batch modelling is switched off by multiplying by the boolean flag.
    n_batch = scvi_dataset.n_batches * use_batches
    model = VAE(scvi_dataset.nb_genes, n_batch=n_batch, n_latent=n_latent)

    trainer = UnsupervisedTrainer(model, scvi_dataset,
                                  train_size=1.0, use_cuda=use_cuda)
    trainer.train(n_epochs=n_epochs, lr=lr)

    # Build a posterior over every cell and pull the latent coordinates.
    all_cells = np.arange(len(scvi_dataset))
    posterior = trainer.create_posterior(trainer.model, scvi_dataset,
                                         indices=all_cells)
    posterior = posterior.sequential()
    latent, _, _ = posterior.get_latent()
    return posterior, latent
nsamples = np.asarray(nsamples) sample = sample_by_batch(labels[batch_id == batch], nsamples) sample = cellid[batch_id == batch][sample] count.append(nsamples) cells.append(sample) correlation = (np.corrcoef(count[0], count[1])[0, 1]) print("dataset 1 has %d cells" % (np.sum(count[0]))) print("dataset 2 has %d cells" % (np.sum(count[1]))) print( "correlation between the cell-type composition of the subsampled dataset is %.3f" % correlation) sub_dataset = deepcopy(gene_dataset) sub_dataset.update_cells(np.concatenate(cells)) vae = VAE(sub_dataset.nb_genes, n_batch=sub_dataset.n_batches, n_labels=sub_dataset.n_labels, n_hidden=128, dispersion='gene') infer = VariationalInference(vae, sub_dataset, use_cuda=use_cuda) infer.train(n_epochs=250) latent, batch_indices, labels = infer.get_latent('sequential') keys = sub_dataset.cell_types batch_entropy = entropy_batch_mixing(latent, batch_indices) print("Entropy batch mixing :", batch_entropy) sample = select_indices_evenly(1000, labels) res = knn_purity_avg(latent[sample, :], labels[sample].astype('int'), keys=keys, acc=True) print('average classification accuracy per cluster') for x in res:
from scvi.harmonization.clustering.Combat import COMBAT
from scvi.harmonization.benchmark import knn_purity_avg
from scvi.metrics.clustering import select_indices_evenly, entropy_batch_mixing, clustering_scores
import sys

# Which model to benchmark ('vae' or 'svaec') is chosen on the command line.
model_type = str(sys.argv[1])
plotname = 'Macosko_Regev'

# Concatenate the two retina datasets and keep 5000 subsampled genes.
dataset1 = MacoskoDataset()
dataset2 = RegevDataset()
gene_dataset = GeneExpressionDataset.concat_datasets(dataset1, dataset2)
gene_dataset.subsample_genes(5000)

if model_type == 'vae':
    vae = VAE(gene_dataset.nb_genes,
              n_batch=gene_dataset.n_batches,
              n_labels=gene_dataset.n_labels,
              n_hidden=128, n_latent=10, n_layers=2, dispersion='gene')
    # NOTE(review): `use_cuda` is assumed to be defined earlier in the
    # script — confirm against the full file.
    infer = VariationalInference(vae, gene_dataset, use_cuda=use_cuda)
    infer.train(n_epochs=250)
    data_loader = infer.data_loaders['sequential']
    latent, batch_indices, labels = get_latent(vae, data_loader)
    # `keys` was assigned twice in the original; a single assignment suffices.
    keys = gene_dataset.cell_types
    batch_indices = np.concatenate(batch_indices)
elif model_type == 'svaec':
    svaec = SCANVI(gene_dataset.nb_genes,
                   gene_dataset.n_batches,
                   gene_dataset.n_labels,
                   use_labels_groups=False,
                   n_latent=10, n_layers=2)
    infer = SemiSupervisedVariationalInference(svaec, gene_dataset)
    infer.train(n_epochs=50)
    print('svaec acc =', infer.accuracy('unlabelled'))
    data_loader = infer.data_loaders['unlabelled']
from scvi.dataset import LoomDataset, CsvDataset, Dataset10X ## Correction for batch effects gene_dataset = RetinaDataset(save_path=save_path) #tenX_dataset = Dataset10X("neuron_9k", save_path=save_path) n_epochs=50 if n_epochs_all is None else n_epochs_all lr=1e-3 use_batches=True use_cuda=True ### Train the model and output model likelihood every 5 epochs from scvi.models.vae import VAE vae = VAE(gene_dataset.nb_genes, n_batch=gene_dataset.n_batches * use_batches) n_batch = gene_dataset.n_batches * use_batches n_input = gene_dataset.nb_genes n_hidden = 128 n_latent = 10 n_layers= 1 dropout_rate = 0.1 dispersion = "gene" log_variational = True reconstruction_loss: str = "zinb" x = torch.rand([n_batch,n_input]) px_scale, px_r, px_rate, px_dropout, qz_m, qz_v, z, ql_m, ql_v, library = vae.inference(x)
# Wrap the raw UMI / non-UMI count matrices (2000 genes, 5 cell types) as
# annotated GeneExpressionDatasets.
UMI = GeneExpressionDataset(
    *GeneExpressionDataset.get_attributes_from_matrix(
        csr_matrix(countUMI), labels=labelUMI),
    gene_names=['gene' + str(i) for i in range(2000)],
    cell_types=['type' + str(i + 1) for i in range(5)])
nonUMI = GeneExpressionDataset(
    *GeneExpressionDataset.get_attributes_from_matrix(
        csr_matrix(countnonUMI), labels=labelnonUMI),
    gene_names=['gene' + str(i) for i in range(2000)],
    cell_types=['type' + str(i + 1) for i in range(5)])

# These model types all operate on the concatenated dataset.
if model_type in ['vae', 'svaec', 'Seurat', 'Combat']:
    gene_dataset = GeneExpressionDataset.concat_datasets(UMI, nonUMI)

if model_type == 'vae':
    vae = VAE(gene_dataset.nb_genes,
              n_batch=gene_dataset.n_batches,
              n_labels=gene_dataset.n_labels,
              n_hidden=128, n_latent=10, n_layers=2, dispersion='gene')
    # NOTE(review): `use_cuda` is assumed to be defined earlier in the
    # script — confirm against the full file.
    infer_vae = VariationalInference(vae, gene_dataset, use_cuda=use_cuda)
    infer_vae.train(n_epochs=250)
    data_loader = infer_vae.data_loaders['sequential']
    latent, batch_indices, labels = get_latent(vae, data_loader)
    # `keys` was assigned twice in the original; a single assignment suffices.
    keys = gene_dataset.cell_types
    batch_indices = np.concatenate(batch_indices)
elif model_type == 'svaec':
    gene_dataset.subsample_genes(1000)
    n_epochs_vae = 100
    n_epochs_scanvi = 50
    vae = VAE(gene_dataset.nb_genes,
              gene_dataset.n_batches,
              gene_dataset.n_labels,
              n_latent=10, n_layers=2)
    trainer = UnsupervisedTrainer(vae, gene_dataset, train_size=1.0)
    trainer.train(n_epochs=n_epochs_vae)