Example #1
def entropy_batch_mixing(self, verbose=False, **kwargs):
    # Batch-mixing entropy is only computed for the two-batch case: sample
    # cells evenly from both batches and score how well they mix in the
    # latent space.
    if self.gene_dataset.n_batches == 2:
        latent, batch_indices, labels = self.get_latent()
        sample = select_indices_evenly(2000, batch_indices)
        be_score = entropy_batch_mixing(latent[sample, :],
                                        batch_indices[sample], **kwargs)
        if verbose:
            print("Entropy batch mixing :", be_score)
        return be_score
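# Hedged sketch (not the scVI implementation) of the batch-mixing entropy idea
# used above: for a sample of cells, look at the batch composition of each
# cell's k nearest neighbours in latent space and average the entropy of that
# composition. With two equally sized, well-mixed batches the score is close
# to 1; a score near 0 means the batches are separated.
import numpy as np
from sklearn.neighbors import NearestNeighbors


def batch_mixing_entropy_sketch(latent, batch_indices, n_neighbors=50,
                                n_samples=500, seed=0):
    rng = np.random.default_rng(seed)
    nn = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(latent)
    picks = rng.choice(latent.shape[0],
                       size=min(n_samples, latent.shape[0]), replace=False)
    neighbors = nn.kneighbors(latent[picks], return_distance=False)[:, 1:]
    entropies = []
    for row in neighbors:
        p = batch_indices[row].mean()          # fraction of neighbours in batch 1
        p = min(max(p, 1e-8), 1 - 1e-8)        # guard the log terms
        entropies.append(-(p * np.log2(p) + (1 - p) * np.log2(1 - p)))
    return float(np.mean(entropies))

# e.g. two overlapping Gaussian clouds with random batch assignment score ~1:
# batch_mixing_entropy_sketch(np.random.randn(1000, 10),
#                             np.random.randint(0, 2, 1000))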
def eval_latent(batch_indices,
                labels,
                latent,
                keys,
                labelled_idx=None,
                unlabelled_idx=None,
                plotname=None,
                plotting=False,
                partial_only=True):
    # Clustering metrics on the labelled / unlabelled split, plus optional
    # scatter plots of the latent space coloured by cell type and by batch.
    res_knn_partial = clustering_scores(latent, labels, 'knn', True,
                                        labelled_idx, unlabelled_idx)
    res_kmeans_partial = clustering_scores(latent, labels, 'KMeans', True,
                                           labelled_idx, unlabelled_idx)
    if not partial_only:
        res_knn = clustering_scores(np.asarray(latent), labels, 'knn')
        res_kmeans = clustering_scores(np.asarray(latent), labels, 'KMeans')
    # sample = select_indices_evenly(2000, batch_indices)
    # batch_entropy = entropy_batch_mixing(latent[sample, :], batch_indices[sample])
    # print("Entropy batch mixing :", batch_entropy)
    if (plotting and plotname is not None
            and not os.path.isfile('../' + plotname + '.labels.pdf')):
        sample = select_indices_evenly(2000, batch_indices)
        # sample = select_indices_evenly(2000, labels)
        colors = sns.color_palette('bright') + \
                 sns.color_palette('muted') + \
                 sns.color_palette('dark') + \
                 sns.color_palette('pastel') + \
                 sns.color_palette('colorblind')
        latent_s = latent[sample, :]
        label_s = labels[sample]
        batch_s = batch_indices[sample]
        if latent_s.shape[1] != 2:
            latent_s = UMAP(spread=2).fit_transform(latent_s)
        # latent space coloured by cell type
        fig, ax = plt.subplots(figsize=(18, 18))
        key_order = np.argsort(keys)
        for i, k in enumerate(key_order):
            ax.scatter(latent_s[label_s == k, 0],
                       latent_s[label_s == k, 1],
                       c=colors[i % 30],
                       label=keys[k],
                       edgecolors='none')
            # ax.legend(bbox_to_anchor=(1.1, 0.5), borderaxespad=0, fontsize='x-large')
        fig.patch.set_visible(False)
        ax.axis('off')
        fig.tight_layout()
        plt.savefig('../' + plotname + '.labels.pdf')
        # latent space coloured by batch
        fig, ax = plt.subplots(figsize=(18, 18))
        ax.scatter(latent_s[:, 0], latent_s[:, 1], c=batch_s, alpha=0.8)
        # ax.legend(bbox_to_anchor=(1.1, 0.5), borderaxespad=0, fontsize='x-large')
        plt.axis('off')
        plt.savefig('../' + plotname + '.batchid.pdf')
    if not partial_only:
        return res_knn, res_knn_partial, res_kmeans, res_kmeans_partial
    return 0, res_knn_partial, 0, res_kmeans_partial
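# Hedged, illustrative sketch of the kind of metrics clustering_scores is
# assumed to report (asw / nmi / ari; the 'ca' accuracy keys used elsewhere in
# this file are omitted). Not the scVI implementation.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import (adjusted_rand_score,
                             normalized_mutual_info_score, silhouette_score)


def clustering_scores_sketch(latent, labels, seed=0):
    n_clusters = len(np.unique(labels))
    pred = KMeans(n_clusters=n_clusters, n_init=10,
                  random_state=seed).fit_predict(latent)
    return {
        'asw': silhouette_score(latent, labels),  # how well labels separate in latent space
        'nmi': normalized_mutual_info_score(labels, pred),
        'ari': adjusted_rand_score(labels, pred),
    }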
def VAEstats(full):
    # Statistics for an unsupervised VAE posterior. The stats list follows the
    # layout [ll, batch_entropy, accuracy, labelled_idx, unlabelled_idx]; the
    # VAE has no classifier, so accuracy and labelled_idx are -1 placeholders.
    ll = full.ll(verbose=True)
    latent, batch_indices, labels = full.sequential().get_latent()
    batch_indices = batch_indices.ravel()
    if len(np.unique(batch_indices)) == 2:
        # subsample both batches down to the size of the smaller one
        sample = select_indices_evenly(
            np.min(np.unique(batch_indices, return_counts=True)[1]),
            batch_indices)
        batch_entropy = entropy_batch_mixing(latent[sample, :],
                                             batch_indices[sample])
    else:
        batch_entropy = -1
    labels = labels.ravel()
    stats = [ll, batch_entropy, -1, -1, np.arange(0, len(labels))]
    return latent, batch_indices, labels, stats
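# Hedged sketch of what select_indices_evenly is assumed to do (hypothetical
# helper, not the scVI one): draw up to n cells per group so that every batch
# or label is equally represented before computing mixing / purity metrics.
import numpy as np


def select_indices_evenly_sketch(n_per_group, group_ids, seed=0):
    rng = np.random.default_rng(seed)
    picked = []
    for g in np.unique(group_ids):
        members = np.where(group_ids == g)[0]
        picked.append(rng.choice(members,
                                 size=min(n_per_group, len(members)),
                                 replace=False))
    return np.concatenate(picked)

# usage in the same spirit as above:
# sample = select_indices_evenly_sketch(2000, batch_indices)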
def SCANVIstats(trainer_scanvi, gene_dataset):
    # Same statistics for a SCANVI trainer; additionally reports the
    # classification accuracy on the unlabelled set.
    full = trainer_scanvi.create_posterior(trainer_scanvi.model,
                                           gene_dataset,
                                           indices=np.arange(
                                               len(gene_dataset)))
    ll = full.ll(verbose=True)
    latent, batch_indices, labels = full.sequential().get_latent()
    batch_indices = batch_indices.ravel()
    if len(np.unique(batch_indices)) == 2:
        sample = select_indices_evenly(
            np.min(np.unique(batch_indices, return_counts=True)[1]),
            batch_indices)
        batch_entropy = entropy_batch_mixing(latent[sample, :],
                                             batch_indices[sample])
    else:
        batch_entropy = -1
    labelled_idx = trainer_scanvi.labelled_set.indices
    unlabelled_idx = trainer_scanvi.unlabelled_set.indices
    trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(
        trainer_scanvi.model, gene_dataset, indices=unlabelled_idx)
    acc = trainer_scanvi.unlabelled_set.accuracy()
    stats = [ll, batch_entropy, acc, labelled_idx, unlabelled_idx]
    return latent, batch_indices, labels, stats
Example #5
        "correlation between the cell-type composition of the subsampled dataset is %.3f"
        % correlation)
    # build a subsampled copy of the dataset and train an unsupervised VAE on it
    sub_dataset = deepcopy(gene_dataset)
    sub_dataset.update_cells(np.concatenate(cells))
    vae = VAE(sub_dataset.nb_genes,
              n_batch=sub_dataset.n_batches,
              n_labels=sub_dataset.n_labels,
              n_hidden=128,
              dispersion='gene')
    infer = VariationalInference(vae, sub_dataset, use_cuda=use_cuda)
    infer.train(n_epochs=250)
    latent, batch_indices, labels = infer.get_latent('sequential')
    keys = sub_dataset.cell_types
    batch_entropy = entropy_batch_mixing(latent, batch_indices)
    print("Entropy batch mixing :", batch_entropy)
    sample = select_indices_evenly(1000, labels)
    res = knn_purity_avg(latent[sample, :],
                         labels[sample].astype('int'),
                         keys=keys,
                         acc=True)
    print('average classification accuracy per cluster')
    for x in res:
        print(x)
    knn_acc = np.mean([x[1] for x in res])
    print("average KNN accuracy:", knn_acc)
    res = clustering_scores(
        np.asarray(latent)[sample, :], labels[sample], 'knn',
        len(np.unique(labels[sample])))
    for x in res:
        print(x, res[x])
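# Hedged sketch of a per-cell-type kNN purity score, in the spirit of (but not
# identical to) knn_purity_avg: the fraction of each cell's k nearest
# neighbours sharing its label, averaged within every cell type. Assumes
# integer-coded labels and a `keys` array mapping label code -> cell-type name.
import numpy as np
from sklearn.neighbors import NearestNeighbors


def knn_purity_per_label_sketch(latent, labels, keys, k=30):
    nn = NearestNeighbors(n_neighbors=k + 1).fit(latent)
    idx = nn.kneighbors(latent, return_distance=False)[:, 1:]  # drop self
    per_cell = (labels[idx] == labels[:, None]).mean(axis=1)
    return [(keys[lab], float(per_cell[labels == lab].mean()))
            for lab in np.unique(labels)]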
Example #6
    # latent, batch_indices, labels, keys = SEURAT.get_cca()
    latent = np.genfromtxt('../macosko_regev.CCA.txt')
    labels = np.genfromtxt('../macosko_regev.CCA.label.txt', dtype='str')
    keys = gene_dataset.cell_types
    batch_indices = np.genfromtxt('../macosko_regev.CCA.batch.txt')
elif model_type == 'Combat':
    combat = COMBAT()
    latent = combat.combat_pca(gene_dataset)
    latent = latent.T
    batch_indices = np.concatenate(gene_dataset.batch_indices)
    labels = np.concatenate(gene_dataset.labels)
    keys = gene_dataset.cell_types



sample = select_indices_evenly(2000, batch_indices)
batch_entropy = entropy_batch_mixing(latent[sample, :], batch_indices[sample])
print("Entropy batch mixing :", batch_entropy)

sample = select_indices_evenly(1000, labels)
res = knn_purity_avg(
    latent[sample, :], labels[sample],
    keys=keys[np.unique(labels)], acc=True
)

print('average classification accuracy per cluster', np.mean([x[1] for x in res]))
for x in res:
    print(x)

res = clustering_scores(np.asarray(latent)[sample, :], labels[sample], 'knn',
                        len(np.unique(labels[sample])))
def CompareModels(gene_dataset, dataset1, dataset2, plotname, models):
    # k values at which the KNN Jaccard index is evaluated
    KNeighbors = np.concatenate(
        [np.arange(10, 100, 10),
         np.arange(100, 500, 50)])
    # spacing of the k grid; used further down to collapse the Jaccard-vs-k
    # curve into a single area-like score
    K_int = np.concatenate([np.repeat(10, 10), np.repeat(50, 7)])
    f = open('../' + plotname + '/' + models + '.res.txt', "w+")
    f.write("model_type " + \
            "knn_asw knn_nmi knn_ari knn_uca knn_wuca " + \
            "p_knn_asw p_knn_nmi p_knn_ari p_knn_uca p_knn_wuca " + \
            "p1_knn_asw p1_knn_nmi p1_knn_ari p1_knn_uca p1_knn_wuca " + \
            "p2_knn_asw p2_knn_nmi p2_knn_ari p2_knn_uca p2_knn_wuca " + \
            "kmeans_asw kmeans_nmi kmeans_ari kmeans_uca kmeans_wuca " + \
            "p_kmeans_asw p_kmeans_nmi p_kmeans_ari p_kmeans_uca p_kmeans_wuca " + \
            "p1_kmeans_asw p1_kmeans_nmi p1_kmeans_ari p1_kmeans_uca p1_kmeans_wuca " + \
            "p2_kmeans_asw p2_kmeans_nmi p2_kmeans_ari p2_kmeans_uca p2_kmeans_wuca " + \
            " ".join(['res_jaccard' + x for x in
                      np.concatenate([np.repeat(10, 10), np.repeat(50, 7)]).astype('str')]) + " " + \
            'jaccard_score likelihood BE classifier_acc\n'
            )
    g = open('../' + plotname + '/' + models + '.percluster.res.txt', "w+")
    g.write("model_type\tannotation\t" + "\t".join(gene_dataset.cell_types) +
            "\n")

    scanvi = SCANVI(gene_dataset.nb_genes, gene_dataset.n_batches,
                    gene_dataset.n_labels)
    trainer_scanvi = SemiSupervisedTrainer(scanvi,
                                           gene_dataset,
                                           classification_ratio=1,
                                           n_epochs_classifier=1,
                                           lr_classification=5 * 1e-3)
    labelled_idx = trainer_scanvi.labelled_set.indices
    unlabelled_idx = trainer_scanvi.unlabelled_set.indices

    if models == 'others':
        latent1 = np.genfromtxt('../harmonization/Seurat_data/' + plotname +
                                '.1.CCA.txt')
        latent2 = np.genfromtxt('../harmonization/Seurat_data/' + plotname +
                                '.2.CCA.txt')
        for model_type in [
                'scmap', 'readSeurat', 'coral', 'Combat', 'MNN', 'PCA'
        ]:
            print(model_type)
            if model_type in ('scmap', 'coral'):
                latent, batch_indices, labels, keys, stats = run_model(
                    model_type,
                    gene_dataset,
                    dataset1,
                    dataset2,
                    filename=plotname)
                # for these classifiers, run_model returns label predictions
                # for the two datasets in the latent / stats return slots
                pred1 = latent
                pred2 = stats
                res1 = scmap_eval(pred1, labels[batch_indices == 1], labels)
                res2 = scmap_eval(pred2, labels[batch_indices == 0], labels)
                g.write("%s\t" % (model_type) + "p1\t" +
                        ("%.4f\t" * len(gene_dataset.cell_types) %
                         tuple(res1['clusteracc']) + "\n"))
                g.write("%s\t" % (model_type) + "p2\t" +
                        ("%.4f\t" * len(gene_dataset.cell_types) %
                         tuple(res2['clusteracc']) + "\n"))
                res = [-1] * 10 + \
                      [-1] + [res1[x] for x in ['nmi', 'ari', 'ca', 'weighted ca']] + \
                      [-1] + [res2[x] for x in ['nmi', 'ari', 'ca', 'weighted ca']] + \
                      [-1] * 41

                f.write(model_type + (" %.4f" * 61 + "\n") % tuple(res))
            else:
                if model_type == 'readSeurat':
                    dataset1, dataset2, gene_dataset = SubsetGenes(
                        dataset1, dataset2, gene_dataset, plotname)

                latent, batch_indices, labels, keys, stats = run_model(
                    model_type,
                    gene_dataset,
                    dataset1,
                    dataset2,
                    filename=plotname)

                res_jaccard = [
                    KNNJaccardIndex(latent1, latent2, latent, batch_indices,
                                    k)[0] for k in KNeighbors
                ]
                res_jaccard_score = np.sum(res_jaccard * K_int)
                res_knn, res_knn_partial, res_kmeans, res_kmeans_partial = \
                    eval_latent(batch_indices, labels, latent, keys,
                                labelled_idx, unlabelled_idx,
                                plotname=plotname + '.' + model_type, plotting=False, partial_only=False)

                _, res_knn_partial1, _, res_kmeans_partial1 = \
                    eval_latent(batch_indices, labels, latent, keys,
                                batch_indices == 0, batch_indices == 1,
                                plotname=plotname + '.' + model_type, plotting=False)

                _, res_knn_partial2, _, res_kmeans_partial2 = \
                    eval_latent(batch_indices, labels, latent, keys,
                                batch_indices == 1, batch_indices == 0,
                                plotname=plotname + '.' + model_type, plotting=False)

                sample = select_indices_evenly(
                    np.min(np.unique(batch_indices, return_counts=True)[1]),
                    batch_indices)
                batch_entropy = entropy_batch_mixing(latent[sample, :],
                                                     batch_indices[sample])

                res = [res_knn[x] for x in ['asw', 'nmi', 'ari', 'ca', 'weighted ca']] + \
                      [res_knn_partial[x] for x in ['asw', 'nmi', 'ari', 'ca', 'weighted ca']] + \
                      [res_knn_partial1[x] for x in ['asw', 'nmi', 'ari', 'ca', 'weighted ca']] + \
                      [res_knn_partial2[x] for x in ['asw', 'nmi', 'ari', 'ca', 'weighted ca']] + \
                      [res_kmeans[x] for x in ['asw', 'nmi', 'ari', 'uca', 'weighted uca']] + \
                      [res_kmeans_partial[x] for x in ['asw', 'nmi', 'ari', 'uca', 'weighted uca']] + \
                      [res_kmeans_partial1[x] for x in ['asw', 'nmi', 'ari', 'uca', 'weighted uca']] + \
                      [res_kmeans_partial2[x] for x in ['asw', 'nmi', 'ari', 'uca', 'weighted uca']] + \
                      res_jaccard + \
                      [res_jaccard_score, -1, batch_entropy, -1]

                f.write(model_type + (" %.4f" * 61 + "\n") % tuple(res))
                g.write("%s\t" % (model_type) + 'all\t' +
                        ("%.4f\t" * len(gene_dataset.cell_types) %
                         tuple(res_knn['clusteracc']) + "\n"))
                g.write("%s\t" % (model_type) + 'p\t' +
                        ("%.4f\t" * len(gene_dataset.cell_types) %
                         tuple(res_knn_partial['clusteracc']) + "\n"))
                g.write("%s\t" % (model_type) + 'p1\t' +
                        ("%.4f\t" * len(gene_dataset.cell_types) %
                         tuple(res_knn_partial1['clusteracc']) + "\n"))
                g.write("%s\t" % (model_type) + 'p2\t' +
                        ("%.4f\t" * len(gene_dataset.cell_types) %
                         tuple(res_knn_partial2['clusteracc']) + "\n"))

    elif models in ('scvi', 'scvi_nb'):
        dataset1, dataset2, gene_dataset = SubsetGenes(dataset1, dataset2,
                                                       gene_dataset, plotname)
        if models == 'scvi_nb':
            latent1, _, _, _, _ = run_model('vae_nb',
                                            dataset1,
                                            0,
                                            0,
                                            filename=plotname,
                                            rep='vae1_nb')
            latent2, _, _, _, _ = run_model('vae_nb',
                                            dataset2,
                                            0,
                                            0,
                                            filename=plotname,
                                            rep='vae2_nb')
        else:
            latent1, _, _, _, _ = run_model('vae',
                                            dataset1,
                                            0,
                                            0,
                                            filename=plotname,
                                            rep='vae1')
            latent2, _, _, _, _ = run_model('vae',
                                            dataset2,
                                            0,
                                            0,
                                            filename=plotname,
                                            rep='vae2')

        for model_type in [
                'vae', 'scanvi1', 'scanvi2', 'vae_nb', 'scanvi1_nb',
                'scanvi2_nb'
        ]:
            print(model_type)
            latent, batch_indices, labels, keys, stats = run_model(
                model_type,
                gene_dataset,
                dataset1,
                dataset2,
                filename=plotname,
                rep='0')

            res_jaccard = [
                KNNJaccardIndex(latent1, latent2, latent, batch_indices, k)[0]
                for k in KNeighbors
            ]
            res_jaccard_score = np.sum(res_jaccard * K_int)
            res_knn, res_knn_partial, res_kmeans, res_kmeans_partial = \
                eval_latent(batch_indices=batch_indices, labels=labels, latent=latent, keys=keys,
                            labelled_idx=labelled_idx, unlabelled_idx=unlabelled_idx,
                            plotname=plotname + '.' + model_type, plotting=False, partial_only=False)

            _, res_knn_partial1, _, res_kmeans_partial1 = \
                eval_latent(batch_indices=batch_indices, labels=labels, latent=latent, keys=keys,
                            labelled_idx=(batch_indices == 0), unlabelled_idx=(batch_indices == 1),
                            plotname=plotname + '.' + model_type, plotting=False)

            _, res_knn_partial2, _, res_kmeans_partial2 = \
                eval_latent(batch_indices=batch_indices, labels=labels, latent=latent, keys=keys,
                            labelled_idx=(batch_indices == 1), unlabelled_idx=(batch_indices == 0),
                            plotname=plotname + '.' + model_type, plotting=False)

            res = [res_knn[x] for x in ['asw', 'nmi', 'ari', 'ca', 'weighted ca']] + \
                  [res_knn_partial[x] for x in ['asw', 'nmi', 'ari', 'ca', 'weighted ca']] + \
                  [res_knn_partial1[x] for x in ['asw', 'nmi', 'ari', 'ca', 'weighted ca']] + \
                  [res_knn_partial2[x] for x in ['asw', 'nmi', 'ari', 'ca', 'weighted ca']] + \
                  [res_kmeans[x] for x in ['asw', 'nmi', 'ari', 'uca', 'weighted uca']] + \
                  [res_kmeans_partial[x] for x in ['asw', 'nmi', 'ari', 'uca', 'weighted uca']] + \
                  [res_kmeans_partial1[x] for x in ['asw', 'nmi', 'ari', 'uca', 'weighted uca']] + \
                  [res_kmeans_partial2[x] for x in ['asw', 'nmi', 'ari', 'uca', 'weighted uca']] + \
                  res_jaccard + \
                  [res_jaccard_score, stats[0], stats[1], stats[2]]

            f.write(model_type + (" %.4f" * 61 + "\n") % tuple(res))
            g.write("%s\t" % (model_type) + 'all\t' +
                    ("%.4f\t" * len(gene_dataset.cell_types) %
                     tuple(res_knn['clusteracc']) + "\n"))
            g.write("%s\t" % (model_type) + 'p\t' +
                    ("%.4f\t" * len(gene_dataset.cell_types) %
                     tuple(res_knn_partial['clusteracc']) + "\n"))
            g.write("%s\t" % (model_type) + 'p1\t' +
                    ("%.4f\t" * len(gene_dataset.cell_types) %
                     tuple(res_knn_partial1['clusteracc']) + "\n"))
            g.write("%s\t" % (model_type) + 'p2\t' +
                    ("%.4f\t" * len(gene_dataset.cell_types) %
                     tuple(res_knn_partial2['clusteracc']) + "\n"))
            # for i in [1, 2, 3]:
            #     latent, batch_indices, labels, keys, stats = run_model(model_type, gene_dataset, dataset1, dataset2,
            #                                                            filename=plotname, rep=str(i))
            #     res_jaccard, res_jaccard_score = KNNJaccardIndex(latent1, latent2, latent, batch_indices)
            #
            #     res_knn, res_knn_partial, res_kmeans, res_kmeans_partial = \
            #         eval_latent(batch_indices=batch_indices, labels=labels, latent=latent, keys=keys,
            #                     labelled_idx=labelled_idx, unlabelled_idx=unlabelled_idx,
            #                     plotname=plotname + '.' + model_type, plotting=False,partial_only=False)
            #
            #     _, res_knn_partial1, _, res_kmeans_partial1 = \
            #         eval_latent(batch_indices=batch_indices, labels=labels, latent=latent, keys=keys,
            #                     labelled_idx=(batch_indices == 0), unlabelled_idx=(batch_indices == 1),
            #                     plotname=plotname + '.' + model_type, plotting=False)
            #
            #     _, res_knn_partial2, _, res_kmeans_partial2 = \
            #         eval_latent(batch_indices=batch_indices, labels=labels, latent=latent, keys=keys,
            #                     labelled_idx=(batch_indices == 1), unlabelled_idx=(batch_indices == 0),
            #                     plotname=plotname + '.' + model_type, plotting=False)
            #
            #     res = [res_knn[x] for x in res_knn] + \
            #           [res_knn_partial[x] for x in res_knn_partial] + \
            #           [res_knn_partial1[x] for x in res_knn_partial1] + \
            #           [res_knn_partial2[x] for x in res_knn_partial2] + \
            #           [res_kmeans[x] for x in res_kmeans] + \
            #           [res_kmeans_partial[x] for x in res_kmeans_partial] + \
            #           [res_kmeans_partial1[x] for x in res_kmeans_partial1] + \
            #           [res_kmeans_partial2[x] for x in res_kmeans_partial2] + \
            #           res_jaccard + \
            #           [res_jaccard_score,stats[0], stats[1], stats[2]]
            #     f.write(model_type + (" %.4f" * 61 + "\n") % tuple(res))

    elif models == 'writedata':
        _, _, _, _, _ = run_model('writedata',
                                  gene_dataset,
                                  dataset1,
                                  dataset2,
                                  filename=plotname)
    f.close()
    g.close()
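# Hedged sketch of the KNN Jaccard index that CompareModels relies on (assumed
# behaviour of KNNJaccardIndex, not its actual implementation): compare each
# cell's k-nearest-neighbour set in a per-dataset latent space with its set in
# the joint latent space and average the Jaccard overlap. Assumes both
# matrices index the same cells in the same order; an embedding that preserves
# local structure after harmonization scores close to 1.
import numpy as np
from sklearn.neighbors import NearestNeighbors


def knn_jaccard_sketch(latent_single, latent_joint, k=10):
    idx1 = NearestNeighbors(n_neighbors=k + 1).fit(latent_single) \
        .kneighbors(latent_single, return_distance=False)[:, 1:]
    idx2 = NearestNeighbors(n_neighbors=k + 1).fit(latent_joint) \
        .kneighbors(latent_joint, return_distance=False)[:, 1:]
    scores = [len(set(a) & set(b)) / len(set(a) | set(b))
              for a, b in zip(idx1, idx2)]
    return float(np.mean(scores))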