def entropy_batch_mixing(self, name, verbose=False, **kwargs):
    # Batch-mixing entropy is only computed for two-batch datasets.
    if self.gene_dataset.n_batches == 2:
        latent, batch_indices, labels = get_latent(self.model, self.data_loaders[name])
        # This call resolves to the module-level entropy_batch_mixing metric,
        # not to this method, despite the shared name.
        be_score = entropy_batch_mixing(latent, batch_indices, **kwargs)
        if verbose:
            print("Entropy batch mixing :", be_score)
        return be_score
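# Hedged usage sketch, not from the original source: `trainer` stands for an inference
# object exposing `model`, `gene_dataset`, and a `data_loaders` mapping; the 'sequential'
# loader name is an assumption borrowed from infer.get_latent('sequential') used later
# in this file, and `report_batch_mixing` itself is a hypothetical helper.
def report_batch_mixing(trainer):
    # The method above only returns a score for two-batch datasets; otherwise it yields None.
    score = trainer.entropy_batch_mixing('sequential', verbose=True)
    if score is not None:
        print("Batch-mixing entropy: %.3f" % score)
    return score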
def harmonization_stat(model, data_loader, keys, pop1, pop2):
    latent, batch_indices, labels = get_latent(model, data_loader)
    batch_indices = np.concatenate(batch_indices)
    # Subsample cells evenly per batch, then keep only the two populations being compared.
    sample = sample_by_batch(batch_indices, 2000)
    sample_2batch = sample[(batch_indices[sample] == pop1) + (batch_indices[sample] == pop2)]
    batch_entropy = entropy_batch_mixing(latent[sample_2batch, :], batch_indices[sample_2batch])
    print("Entropy batch mixing : %.3f" % batch_entropy)
    # Subsample evenly per label and report the average kNN purity over cell types.
    sample = sample_by_batch(labels, 200)
    res = knn_purity_avg(latent[sample, :], labels.astype('int')[sample], keys, acc=True)
    print("Average knn purity : %.3f" % np.mean([x[1] for x in res]))
    return batch_entropy, res
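# Hedged usage sketch, not part of the original script: `report_harmonization` is a
# hypothetical wrapper; `trained_model`, `loader`, and `dataset` are assumed to be the
# trained VAE, its data loader, and the GeneExpressionDataset used elsewhere in this file.
def report_harmonization(trained_model, loader, dataset, pop1=0, pop2=1):
    batch_entropy, res = harmonization_stat(trained_model, loader, dataset.cell_types, pop1, pop2)
    # res is consumed the same way as elsewhere in this file: res[i][1] is a per-cell-type accuracy.
    for x in res:
        print(x)
    return batch_entropy, res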
print("dataset 2 has %d cells" % (np.sum(count[1]))) print( "correlation between the cell-type composition of the subsampled dataset is %.3f" % correlation) sub_dataset = deepcopy(gene_dataset) sub_dataset.update_cells(np.concatenate(cells)) vae = VAE(sub_dataset.nb_genes, n_batch=sub_dataset.n_batches, n_labels=sub_dataset.n_labels, n_hidden=128, dispersion='gene') infer = VariationalInference(vae, sub_dataset, use_cuda=use_cuda) infer.train(n_epochs=250) latent, batch_indices, labels = infer.get_latent('sequential') keys = sub_dataset.cell_types batch_entropy = entropy_batch_mixing(latent, batch_indices) print("Entropy batch mixing :", batch_entropy) sample = select_indices_evenly(1000, labels) res = knn_purity_avg(latent[sample, :], labels[sample].astype('int'), keys=keys, acc=True) print('average classification accuracy per cluster') for x in res: print(x) knn_acc = np.mean([x[1] for x in res]) print("average KNN accuracy:", knn_acc) res = clustering_scores( np.asarray(latent)[sample, :], labels[sample], 'knn', len(np.unique(labels[sample]))) for x in res:
    latent = np.genfromtxt('../macosko_regev.CCA.txt')
    label = np.genfromtxt('../macosko_regev.CCA.label.txt', dtype='str')
    keys = gene_dataset.cell_types
    batch_indices = np.genfromtxt('../macosko_regev.CCA.batch.txt')
elif model_type == 'Combat':
    # Use a lowercase instance name to avoid shadowing the COMBAT class.
    combat = COMBAT()
    latent = combat.combat_pca(gene_dataset)
    latent = latent.T
    batch_indices = np.concatenate(gene_dataset.batch_indices)
    labels = np.concatenate(gene_dataset.labels)
    keys = gene_dataset.cell_types

sample = select_indices_evenly(2000, batch_indices)
batch_entropy = entropy_batch_mixing(latent[sample, :], batch_indices[sample])
print("Entropy batch mixing :", batch_entropy)

sample = select_indices_evenly(1000, labels)
res = knn_purity_avg(latent[sample, :], labels[sample], keys=keys[np.unique(labels)], acc=True)
print('average classification accuracy per cluster', np.mean([x[1] for x in res]))
for x in res:
    print(x)

res = clustering_scores(np.asarray(latent)[sample, :], labels[sample], 'knn',
                        len(np.unique(labels[sample])))
for x in res:
    print(x)
def run_benchmarks(dataset_name, model=VAE, n_epochs=1000, lr=1e-3, use_batches=False, use_cuda=True,
                   show_batch_mixing=True, benchmark=False, tt_split=0.9, unit_test=False):
    # options:
    # - gene_dataset: a GeneExpressionDataset object
    # call each of the 4 benchmarks:
    # - log-likelihood
    # - imputation
    # - batch mixing
    # - cluster scores
    gene_dataset = load_datasets(dataset_name, unit_test=unit_test)
    example_indices = np.random.permutation(len(gene_dataset))
    tt_split = int(tt_split * len(gene_dataset))  # 90%/10% train/test split

    data_loader_train = DataLoader(gene_dataset, batch_size=128, pin_memory=use_cuda,
                                   sampler=SubsetRandomSampler(example_indices[:tt_split]),
                                   collate_fn=gene_dataset.collate_fn)
    data_loader_test = DataLoader(gene_dataset, batch_size=128, pin_memory=use_cuda,
                                  sampler=SubsetRandomSampler(example_indices[tt_split:]),
                                  collate_fn=gene_dataset.collate_fn)
    vae = model(gene_dataset.nb_genes, n_batch=gene_dataset.n_batches * use_batches,
                n_labels=gene_dataset.n_labels, use_cuda=use_cuda)
    stats = train(vae, data_loader_train, data_loader_test, n_epochs=n_epochs, lr=lr, benchmark=benchmark)

    if isinstance(vae, VAE):
        best_ll = adapt_encoder(vae, data_loader_test, n_path=1, n_epochs=1, record_freq=1)
        print("Best ll was :", best_ll)

    # - log-likelihood
    print("Log-likelihood Train:", stats.history["LL_train"][stats.best_index])
    print("Log-likelihood Test:", stats.history["LL_test"][stats.best_index])

    # - imputation
    imputation_test = imputation(vae, data_loader_test)
    print("Imputation score on test (MAE) is:", imputation_test.item())

    # - batch mixing
    if gene_dataset.n_batches == 2:
        latent, batch_indices, labels = get_latent(vae, data_loader_train)
        print("Entropy batch mixing :",
              entropy_batch_mixing(latent.cpu().numpy(), batch_indices.cpu().numpy()))
        if show_batch_mixing:
            show_t_sne(latent.cpu().numpy(),
                       np.array([batch[0] for batch in batch_indices.cpu().numpy()]))

    # - differential expression
    if type(gene_dataset) == CortexDataset:
        get_statistics(vae, data_loader_train, M_sampling=1, M_permutation=1)  # 200 - 100000
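# Hedged usage sketch: the 'cortex' dataset name is an assumption (suggested by the
# CortexDataset check above), not a confirmed argument of load_datasets; adjust it to
# whatever names the loader actually accepts.
if __name__ == '__main__':
    run_benchmarks('cortex', n_epochs=250, lr=1e-3, use_batches=False, use_cuda=True)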