Esempio n. 1
0
    def test_populate_from_datasets_dummy_data(self):
        """Merge small random datasets and check the combined attributes.

        Three scenarios are exercised in sequence on the same fixtures:
        the cell/gene counts and gene names of a three-way merge, the
        labels and ``cell_types`` produced with ``shared_labels`` on and
        off, and the batch indices and ``experiment`` mapping of a merge.
        """

        def build_dataset(n_cells, n_genes):
            # Random integer counts with genes named gene_0 .. gene_<n-1>.
            counts = np.random.randint(1, 5, size=(n_cells, n_genes))
            names = np.array(["gene_%d" % idx for idx in range(n_genes)])
            built = GeneExpressionDataset()
            built.populate_from_data(counts, gene_names=names)
            return built

        ten_gene_ds = build_dataset(5, 10)
        three_gene_ds = build_dataset(7, 3)
        five_gene_ds = build_dataset(2, 5)

        # Three-way merge: 5 + 7 + 2 cells, and only gene_0..gene_2 are
        # present in every input.
        merged = GeneExpressionDataset()
        merged.populate_from_datasets(
            [ten_gene_ds, three_gene_ds, five_gene_ds])
        self.assertEqual(14, merged.nb_cells)
        self.assertEqual(3, merged.nb_genes)
        expected_gene_names = ["GENE_0", "GENE_1", "GENE_2"]
        self.assertListEqual(expected_gene_names, merged.gene_names.tolist())

        # test for labels sharing
        # Both inputs call their label 0 "0"; their label 1 is named
        # differently ("1" vs "2").
        three_gene_ds.labels = [0, 0, 0, 1, 1, 1, 1]
        three_gene_ds.initialize_mapped_attribute(
            "labels", "cell_types", ["0", "1"])
        five_gene_ds.labels = [0, 1]
        five_gene_ds.initialize_mapped_attribute(
            "labels", "cell_types", ["0", "2"])

        merged_shared = GeneExpressionDataset()
        merged_shared.populate_from_datasets(
            [three_gene_ds, five_gene_ds], shared_labels=True)
        shared_labels_expected = [0, 0, 0, 1, 1, 1, 1, 0, 2]
        self.assertListEqual(
            np.squeeze(merged_shared.labels).tolist(),
            shared_labels_expected)
        self.assertListEqual(merged_shared.cell_types, ["0", "1", "2"])

        merged_unshared = GeneExpressionDataset()
        merged_unshared.populate_from_datasets(
            [three_gene_ds, five_gene_ds], shared_labels=False)
        unshared_labels_expected = [0, 0, 0, 1, 1, 1, 1, 2, 3]
        self.assertListEqual(
            np.squeeze(merged_unshared.labels).tolist(),
            unshared_labels_expected)
        self.assertListEqual(
            merged_unshared.cell_types, ["0", "1", "0", "2"])

        # test for batch_indices offsetting
        three_gene_ds.batch_indices = [0, 0, 0, 1, 1, 1, 1]
        three_gene_ds.initialize_mapped_attribute(
            "batch_indices", "experiment", ["fish_2", "scrna_2"])
        five_gene_ds.batch_indices = [0, 1]
        five_gene_ds.initialize_mapped_attribute(
            "batch_indices", "experiment", ["fish_3", "scrna_3"])

        merged_batches = GeneExpressionDataset()
        merged_batches.populate_from_datasets([three_gene_ds, five_gene_ds])
        # The second dataset's batch indices are expected to be shifted
        # past the first dataset's (0, 1 -> 2, 3).
        batch_indices_expected = [0, 0, 0, 1, 1, 1, 1, 2, 3]
        self.assertListEqual(
            np.squeeze(merged_batches.batch_indices).tolist(),
            batch_indices_expected)
        self.assertListEqual(
            merged_batches.experiment,
            ["fish_2", "scrna_2", "fish_3", "scrna_3"])
Esempio n. 2
0
# Index the cell metadata by cell name and the gene metadata by symbol.
meta.index = meta.loc[:, "cell_name"].values.astype(str)
var = feather.read_dataframe(input_var)
var.index = var.loc[:, "symbol"].values.astype(str)

# Wrap the raw counts together with the two metadata tables.
annobj = anndata.AnnData(X=rawcounts)
annobj.obs = meta
annobj.var = var

X, local_mean, local_var, batch_indices, labels = (
    GeneExpressionDataset.get_attributes_from_matrix(annobj.X)
)

geneExp = GeneExpressionDataset(
    X,
    local_mean,
    local_var,
    batch_indices,
    labels,
    gene_names=annobj.var.index,
)

# Batches are used only when a (truthy) batch column key was supplied.
use_batches = bool(batch_id)
if use_batches:
    # Encode the batch column as integer codes, one code per distinct value.
    # NOTE(review): pd.factorize codes missing values as -1 by default, which
    # would be counted as a batch below — confirm the column has no NaNs.
    plates, plates_ids = pd.factorize(annobj.obs[batch_id])
    geneExp.batch_indices = plates.reshape(-1, 1)
    geneExp.n_batches = np.unique(plates).size

# Multiplying by the boolean passes n_batch=0 when batches are not used.
ldvae = LDVAE(
    geneExp.nb_genes,
    n_batch=geneExp.n_batches * use_batches,
    n_latent=latent,
    n_layers=layer,
    n_hidden=hidden,
    dispersion=dispersion,
    reconstruction_loss=reconstruction_loss,
)
            
trainer = UnsupervisedTrainer(ldvae,
                              geneExp,
                              train_size=size,