コード例 #1
0
 def entropy_batch_mixing(self, name, verbose=False, **kwargs):
     """Score batch mixing in the latent space of one data loader.

     The score is only computed when ``self.gene_dataset.n_batches`` equals
     2; for any other value nothing is computed and ``None`` is returned.

     :param name: key into ``self.data_loaders`` selecting the loader to embed.
     :param verbose: when true, print the score before returning it.
     :param kwargs: forwarded unchanged to the ``entropy_batch_mixing`` call
         made in the body.
     :return: the value produced by that call, or ``None`` when the dataset
         does not have exactly two batches.
     """
     if self.gene_dataset.n_batches != 2:
         return None

     model, loader = self.model, self.data_loaders[name]
     # get_latent also returns labels, which this metric does not use.
     latent, batch_indices, _ = get_latent(model, loader)
     # NOTE(review): presumably this resolves to a module-level metric function
     # sharing this method's name, not to the method itself -- confirm.
     score = entropy_batch_mixing(latent, batch_indices, **kwargs)
     if verbose:
         print("Entropy batch mixing :", score)
     return score
コード例 #2
0
ファイル: benchmark.py プロジェクト: Edouard360/scVI
def harmonization_stat(model, data_loader, keys, pop1, pop2):
    """Print and return batch-mixing entropy and average k-NN purity.

    The entropy is computed only on sampled cells whose batch index equals
    ``pop1`` or ``pop2``; the k-NN purity is computed on a separate sample
    drawn with respect to the labels.

    :param model: model passed to ``get_latent``.
    :param data_loader: data loader passed to ``get_latent``.
    :param keys: passed through to ``knn_purity_avg``.
    :param pop1: first batch index to keep for the entropy computation.
    :param pop2: second batch index to keep for the entropy computation.
    :return: tuple ``(batch_entropy, res)`` where ``res`` is the value
        returned by ``knn_purity_avg``.
    """
    latent, batch_indices, labels = get_latent(model, data_loader)
    batch_indices = np.concatenate(batch_indices)

    # NOTE(review): presumably sample_by_batch returns an index array with up
    # to 2000 cells per batch -- confirm against its definition.
    sample = sample_by_batch(batch_indices, 2000)
    sampled_batches = batch_indices[sample]
    # Keep only the sampled cells that belong to one of the two populations.
    in_pair = (sampled_batches == pop1) | (sampled_batches == pop2)
    sample_2batch = sample[in_pair]

    batch_entropy = entropy_batch_mixing(latent[sample_2batch, :],
                                         batch_indices[sample_2batch])
    # "%.3f" (three decimals); the previous "%f.3" printed six decimals
    # followed by a literal ".3".
    print("Entropy batch mixing : %.3f" % batch_entropy)

    sample = sample_by_batch(labels, 200)
    res = knn_purity_avg(latent[sample, :],
                         labels.astype('int')[sample],
                         keys,
                         acc=True)
    # Each entry of res is indexable; position 1 holds the value averaged here.
    print("Average knn purity : %.3f" % np.mean([x[1] for x in res]))
    return (batch_entropy, res)
コード例 #3
0
ファイル: Prop_cor.py プロジェクト: Edouard360/scVI
 # NOTE(review): this is a fragment; `count`, `correlation`, `cells`,
 # `gene_dataset` and `use_cuda` are defined before the visible lines.
 print("dataset 2 has %d cells" % (np.sum(count[1])))
 print(
     "correlation between the cell-type composition of the subsampled dataset is %.3f"
     % correlation)
 # Work on a deep copy so the original gene_dataset object is not modified
 # by update_cells.
 sub_dataset = deepcopy(gene_dataset)
 sub_dataset.update_cells(np.concatenate(cells))
 # Build a VAE sized from the subsampled dataset and train it for 250 epochs.
 vae = VAE(sub_dataset.nb_genes,
           n_batch=sub_dataset.n_batches,
           n_labels=sub_dataset.n_labels,
           n_hidden=128,
           dispersion='gene')
 infer = VariationalInference(vae, sub_dataset, use_cuda=use_cuda)
 infer.train(n_epochs=250)
 latent, batch_indices, labels = infer.get_latent('sequential')
 keys = sub_dataset.cell_types
 # Batch-mixing entropy over the full latent representation.
 batch_entropy = entropy_batch_mixing(latent, batch_indices)
 print("Entropy batch mixing :", batch_entropy)
 # NOTE(review): presumably select_indices_evenly returns indices balanced
 # across label values (1000 being the sample size) -- confirm.
 sample = select_indices_evenly(1000, labels)
 res = knn_purity_avg(latent[sample, :],
                      labels[sample].astype('int'),
                      keys=keys,
                      acc=True)
 print('average classification accuracy per cluster')
 for x in res:
     print(x)
 # Position 1 of each result entry is the value that gets averaged.
 knn_acc = np.mean([x[1] for x in res])
 print("average KNN accuracy:", knn_acc)
 # `res` is rebound here: from this point on it holds the clustering scores,
 # computed on the same sample with as many clusters as distinct labels.
 res = clustering_scores(
     np.asarray(latent)[sample, :], labels[sample], 'knn',
     len(np.unique(labels[sample])))
 for x in res:
コード例 #4
0
ファイル: Easycase2.compare.py プロジェクト: Edouard360/scVI
    latent = np.genfromtxt('../macosko_regev.CCA.txt')
    label = np.genfromtxt('../macosko_regev.CCA.label.txt',dtype='str')
    keys = gene_dataset.cell_types
    batch_indices = np.genfromtxt('../macosko_regev.CCA.batch.txt')
elif model_type == 'Combat':
    COMBAT = COMBAT()
    latent = COMBAT.combat_pca(gene_dataset)
    latent = latent.T
    batch_indices = np.concatenate(gene_dataset.batch_indices)
    labels = np.concatenate(gene_dataset.labels)
    keys = gene_dataset.cell_types



# Evaluation shared by every model_type branch: `latent`, `batch_indices`,
# `labels` and `keys` are expected to have been set by the branch that ran.
# NOTE(review): the CCA branch visible above assigns `label`, not `labels`;
# unless `labels` is defined earlier, that path fails here -- confirm.
sample = select_indices_evenly(2000,batch_indices)
batch_entropy = entropy_batch_mixing(latent[sample, :], batch_indices[sample])
print("Entropy batch mixing :", batch_entropy)


# Re-sample with respect to the labels for the k-NN purity computation.
sample = select_indices_evenly(1000,labels)
res = knn_purity_avg(
    latent[sample, :], labels[sample],
    keys=keys[np.unique(labels)], acc=True
)

# Position 1 of each result entry is the value that gets averaged.
print('average classification accuracy per cluster',np.mean([x[1] for x in res]))
for x in res:
    print(x)

# `res` is rebound here to the clustering scores, computed on the same sample
# with as many clusters as there are distinct labels in it.
res = clustering_scores(np.asarray(latent)[sample,:],labels[sample],'knn',len(np.unique(labels[sample])))
for x in res:
コード例 #5
0
ファイル: benchmark.py プロジェクト: jstjohn/scVI-dev
def run_benchmarks(dataset_name,
                   model=VAE,
                   n_epochs=1000,
                   lr=1e-3,
                   use_batches=False,
                   use_cuda=True,
                   show_batch_mixing=True,
                   benchmark=False,
                   tt_split=0.9,
                   unit_test=False):
    """Train a model on a named dataset and print its benchmark metrics.

    The dataset is loaded with ``load_datasets``, randomly split into train
    and test subsets, and the model is trained with ``train``. The function
    then prints, in order: the encoder-adaptation log-likelihood (only when
    the model is a ``VAE``), the train/test log-likelihood at the best
    recorded index, the test imputation score, the batch-mixing entropy
    (only for datasets with exactly two batches), and finally runs
    ``get_statistics`` when the dataset is a ``CortexDataset``.

    :param dataset_name: name passed to ``load_datasets``.
    :param model: callable building the model; called with the number of
        genes plus ``n_batch``, ``n_labels`` and ``use_cuda`` keywords.
    :param n_epochs: number of epochs passed to ``train``.
    :param lr: learning rate passed to ``train``.
    :param use_batches: when false, the model is built with ``n_batch=0``.
    :param use_cuda: passed to the model and used as ``pin_memory`` for the
        data loaders.
    :param show_batch_mixing: when true (and the dataset has two batches),
        call ``show_t_sne`` on the latent space.
    :param benchmark: passed to ``train``.
    :param tt_split: fraction of the examples used for training.
    :param unit_test: passed to ``load_datasets``.
    """
    gene_dataset = load_datasets(dataset_name, unit_test=unit_test)
    example_indices = np.random.permutation(len(gene_dataset))
    # Number of training examples; kept separate from the `tt_split` fraction
    # so the parameter is not rebound with a different meaning.
    n_train = int(tt_split * len(gene_dataset))

    def make_loader(indices):
        # Both loaders differ only in the subset of indices they sample from.
        return DataLoader(gene_dataset,
                          batch_size=128,
                          pin_memory=use_cuda,
                          sampler=SubsetRandomSampler(indices),
                          collate_fn=gene_dataset.collate_fn)

    data_loader_train = make_loader(example_indices[:n_train])
    data_loader_test = make_loader(example_indices[n_train:])

    vae = model(gene_dataset.nb_genes,
                # Multiplying by the flag yields 0 batches when use_batches
                # is false.
                n_batch=gene_dataset.n_batches * use_batches,
                n_labels=gene_dataset.n_labels,
                use_cuda=use_cuda)
    stats = train(vae,
                  data_loader_train,
                  data_loader_test,
                  n_epochs=n_epochs,
                  lr=lr,
                  benchmark=benchmark)

    if isinstance(vae, VAE):
        best_ll = adapt_encoder(vae,
                                data_loader_test,
                                n_path=1,
                                n_epochs=1,
                                record_freq=1)
        print("Best ll was :", best_ll)

    # - log-likelihood
    print("Log-likelihood Train:", stats.history["LL_train"][stats.best_index])
    print("Log-likelihood Test:", stats.history["LL_test"][stats.best_index])

    # - imputation
    imputation_test = imputation(vae, data_loader_test)
    print("Imputation score on test (MAE) is:", imputation_test.item())

    # - batch mixing
    if gene_dataset.n_batches == 2:
        latent, batch_indices, _ = get_latent(vae, data_loader_train)
        # Convert each tensor to NumPy once and reuse the arrays below.
        latent_np = latent.cpu().numpy()
        batch_indices_np = batch_indices.cpu().numpy()
        print("Entropy batch mixing :",
              entropy_batch_mixing(latent_np, batch_indices_np))
        if show_batch_mixing:
            # Only the first entry of each batch-index row is used for the plot.
            show_t_sne(latent_np,
                       np.array([batch[0] for batch in batch_indices_np]))

    # - differential expression
    # isinstance also accepts subclasses of CortexDataset.
    if isinstance(gene_dataset, CortexDataset):
        get_statistics(vae, data_loader_train, M_sampling=1,
                       M_permutation=1)  # 200 - 100000