def _per_label_accuracy(predictions, true_labels, label_values):
    """Return one accuracy per entry of ``label_values``.

    For each value, the accuracy is the mean of ``predictions == value`` taken
    over the positions where ``true_labels == value``. A value with no such
    position yields ``nan`` (mean of an empty selection).
    """
    return [
        np.mean(predictions[true_labels == value] == value)
        for value in label_values
    ]


def _format_acc_row(name, acc):
    """Format a result row: ``name``, then every accuracy as ``%.4f`` followed by a tab."""
    return name + "\t" + "%.4f\t" * len(acc) % tuple(acc) + "\n"


def SCANVI_acc(gene_dataset: GeneExpressionDataset, plotname: str, pred1, pred2, coral1, coral2, rep='0'):
    """Write per-cell-type accuracies of scANVI, scmap and CORAL to a text file.

    The output goes to ``../<plotname>/scanvi_acc.txt``. The first row is a
    header built from ``gene_dataset.cell_types``; each following row is a
    method name followed by one accuracy per unique label.

    Three scANVI variants are trained for 5 epochs each, starting from the
    weights of the model returned by ``trainVAE``:

    * ``scanvi``  - ``SemiSupervisedTrainer`` with its own labelled/unlabelled
      split.
    * ``scanvi1`` - ``AlternateSemiSupervisedTrainer`` with batch 0 labelled
      and batch 1 unlabelled.
    * ``scanvi2`` - the same trainer with batch 1 labelled and batch 0
      unlabelled.

    Accuracy is measured on the unlabelled cells. For ``scanvi1`` and
    ``scanvi2``, a label that does not occur in both the labelled and the
    unlabelled set is reported as ``-1``.

    Args:
        gene_dataset: Dataset providing ``nb_genes``, ``n_batches``,
            ``n_labels``, ``cell_types``, ``labels`` and ``batch_indices``.
        plotname: Name of the output directory under ``..``; also forwarded to
            ``trainVAE``.
        pred1: Predictions scored against the labels of batch-1 cells
            (written as ``scmap1``). Presumably a NumPy array aligned with
            those cells - confirm against the caller.
        pred2: Predictions scored against the labels of batch-0 cells
            (written as ``scmap2``).
        coral1: Same role as ``pred1``, written as ``coral1``.
        coral2: Same role as ``pred2``, written as ``coral2``.
        rep: Forwarded unchanged to ``trainVAE``.
    """
    fname = '../%s/scanvi_acc.txt' % (plotname)
    methods = ['scanvi', 'scanvi1', 'scanvi2']
    # method -> (batch used as labelled set, batch used as unlabelled set)
    batch_split = {'scanvi1': (0, 1), 'scanvi2': (1, 0)}

    # The context manager guarantees the file is closed even if training or
    # prediction raises part-way through.
    with open(fname, "w+") as f:
        f.write('method\t' + "%s\t" * len(gene_dataset.cell_types) % tuple(gene_dataset.cell_types) + "\n")

        for method in methods:
            vae_posterior = trainVAE(gene_dataset, plotname, rep)
            scanvi = SCANVI(gene_dataset.nb_genes, gene_dataset.n_batches, gene_dataset.n_labels, n_layers=2)
            # strict=False: only the parameters shared with the VAE are loaded.
            scanvi.load_state_dict(vae_posterior.model.state_dict(), strict=False)

            if method in batch_split:
                labelled_batch, unlabelled_batch = batch_split[method]
                trainer_scanvi = AlternateSemiSupervisedTrainer(
                    scanvi, gene_dataset, classification_ratio=10,
                    n_epochs_classifier=50, lr_classification=5 * 1e-3)
                trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(
                    indices=(gene_dataset.batch_indices == labelled_batch))
                trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(
                    indices=(gene_dataset.batch_indices == unlabelled_batch))
            else:
                trainer_scanvi = SemiSupervisedTrainer(
                    scanvi, gene_dataset, classification_ratio=50,
                    n_epochs_classifier=1, lr_classification=5 * 1e-3)

            trainer_scanvi.train(n_epochs=5)

            labelled_idx = trainer_scanvi.labelled_set.indices
            unlabelled_idx = trainer_scanvi.unlabelled_set.indices
            full = trainer_scanvi.create_posterior(
                trainer_scanvi.model, gene_dataset, indices=np.arange(len(gene_dataset)))
            labels, labels_pred = full.sequential().compute_predictions()

            unique_labels = np.unique(labels)
            shared = set(labels[labelled_idx]).intersection(set(labels[unlabelled_idx]))
            acc = _per_label_accuracy(
                labels_pred[unlabelled_idx], labels[unlabelled_idx], unique_labels)

            if method != 'scanvi':
                # Index by position in ``unique_labels`` (the order of ``acc``),
                # not by the label value itself, so non-contiguous labels
                # neither hit the wrong slot nor run past the end of the list.
                for position, label in enumerate(unique_labels):
                    if label not in shared:
                        acc[position] = -1

            f.write(_format_acc_row(method, acc))

        labels = gene_dataset.labels.ravel()
        batch = gene_dataset.batch_indices.ravel()
        unique_labels = np.unique(labels)
        labels_batch0 = labels[batch == 0]
        labels_batch1 = labels[batch == 1]

        # External predictions: the *1 variants are scored on batch 1, the *2
        # variants on batch 0.
        f.write(_format_acc_row('scmap1', _per_label_accuracy(pred1, labels_batch1, unique_labels)))
        f.write(_format_acc_row('scmap2', _per_label_accuracy(pred2, labels_batch0, unique_labels)))
        f.write(_format_acc_row('coral1', _per_label_accuracy(coral1, labels_batch1, unique_labels)))
        f.write(_format_acc_row('coral2', _per_label_accuracy(coral2, labels_batch0, unique_labels)))
def runScanvi(adata, batch, labels):
    """Train scVI, then scANVI, on the raw counts of ``adata`` and store the embedding.

    A copy of ``adata`` is rebuilt around ``adata.layers['counts']``, converted
    to an scVI dataset, and used to train a VAE whose weights initialise a
    SCANVI model. The SCANVI latent representation of all cells is written to
    ``adata.obsm['X_emb']``.

    Args:
        adata: Annotated data object with a ``counts`` layer holding
            non-normalized counts.
        batch: Name of the ``adata.obs`` column holding the batch assignment.
        labels: Name of the ``adata.obs`` column holding the cell-type labels.

    Returns:
        The input ``adata`` object, with ``obsm['X_emb']`` set.

    Raises:
        TypeError: If ``adata.layers`` has no ``counts`` entry.
    """
    # scANVI has to be fed non-normalized counts, so refuse to run without them.
    if 'counts' not in adata.layers:
        raise TypeError(
            'Adata does not contain a `counts` layer in `adata.layers[`counts`]`'
        )

    from scvi.models import VAE, SCANVI
    from scvi.inference import UnsupervisedTrainer, SemiSupervisedTrainer
    from sklearn.preprocessing import LabelEncoder
    from scvi.dataset import AnnDatasetFromAnnData
    import numpy as np

    # ---- Step 1: build the scVI dataset from the counts layer ----
    counts_adata = adata.copy()
    counts_adata.X = adata.layers['counts']
    del counts_adata.layers['counts']
    # Drop .raw so the raw counts cannot be picked up by accident
    # (deleting .raw only works from anndata 0.7 on).
    del counts_adata.raw

    # Integer-encode the batch and label columns under the names scVI expects.
    batch_codes = LabelEncoder().fit_transform(counts_adata.obs[batch].values)
    counts_adata.obs['batch_indices'] = batch_codes
    label_codes = LabelEncoder().fit_transform(counts_adata.obs[labels].values)
    counts_adata.obs['labels'] = label_codes

    dataset = AnnDatasetFromAnnData(counts_adata)
    print("scANVI dataset object with {} batches and {} cell types".format(
        dataset.n_batches, dataset.n_labels))

    # Defaults taken from the scVI GitHub tutorials (scanpy_pbmc3k and
    # harmonization): at most 400 scVI epochs, fewer for larger datasets, and
    # roughly a third of that for scANVI, clamped to the range [2, 10].
    scvi_epochs = np.min([round((20000 / adata.n_obs) * 400), 400])
    scanvi_epochs = int(np.min([10, np.max([2, round(scvi_epochs / 3.)])]))
    architecture = {'n_latent': 30, 'n_hidden': 128, 'n_layers': 2}

    # ---- Step 2: train scVI to initialise scANVI ----
    vae_model = VAE(
        dataset.nb_genes,
        reconstruction_loss='nb',
        n_batch=dataset.n_batches,
        **architecture,
    )
    vae_trainer = UnsupervisedTrainer(
        vae_model,
        dataset,
        train_size=1.0,
        use_cuda=False,
    )
    vae_trainer.train(n_epochs=scvi_epochs, lr=1e-3)

    # ---- Step 3: train scANVI starting from the scVI weights ----
    scanvi_model = SCANVI(
        dataset.nb_genes,
        dataset.n_batches,
        dataset.n_labels,
        dispersion='gene',
        reconstruction_loss='nb',
        **architecture,
    )
    pretrained_weights = vae_trainer.model.state_dict()
    scanvi_model.load_state_dict(pretrained_weights, strict=False)

    # The semi-supervised trainer is used with its default parameters.
    scanvi_trainer = SemiSupervisedTrainer(scanvi_model, dataset)
    n_cells = len(dataset)
    # Every cell goes into the labelled set ...
    scanvi_trainer.labelled_set = scanvi_trainer.create_posterior(
        scanvi_trainer.model, dataset, indices=np.arange(n_cells))
    # ... and a single cell is placed in the unlabelled set.
    scanvi_trainer.unlabelled_set = scanvi_trainer.create_posterior(
        indices=[0])
    scanvi_trainer.train(n_epochs=scanvi_epochs)

    # ---- Step 4: read the latent representation of all cells ----
    full_posterior = scanvi_trainer.create_posterior(
        scanvi_trainer.model, dataset, indices=np.arange(n_cells))
    embedding, _, _ = full_posterior.sequential().get_latent()
    adata.obsm['X_emb'] = embedding

    return adata