Example #1
0
    def test_populate_from_datasets_with_measurments(self):
        """Check how a shared ``dev`` measurement is merged across two datasets.

        Every row of each input measurement is ``0..k-1``, so a merged value
        reveals which original column it came from. The test expects the
        merged column names to be the sorted names common to both inputs,
        with the data columns reordered to match.
        """
        counts = np.random.randint(1, 5, size=(5, 10))
        genes = np.array(["gene_%d" % idx for idx in range(10)])

        # Column names of the "dev" measurement attached to each input dataset.
        column_sets = (
            ["gabou", "achille", "pedro", "oclivio", "gayoso"],
            ["gabou", "oclivio", "achille", "pedro"],
        )
        measurements = [
            CellMeasurement(
                name="dev",
                data=np.ones((5, len(names))) * np.arange(0, len(names)),
                columns_attr_name="dev_names",
                columns=names,
            )
            for names in column_sets
        ]

        sources = [GeneExpressionDataset() for _ in measurements]
        for source, measurement in zip(sources, measurements):
            source.populate_from_data(counts,
                                      Ys=[measurement],
                                      gene_names=genes)

        merged = GeneExpressionDataset()
        merged.populate_from_datasets(sources)

        for attribute in ("dev", "dev_names"):
            self.assertTrue(hasattr(merged, attribute))

        expected_names = ["achille", "gabou", "oclivio", "pedro"]
        self.assertListEqual(merged.dev_names.tolist(), expected_names)
        # Row 0 is the first cell of the first input, row 5 the first cell of
        # the second input (each input contributes 5 cells).
        self.assertListEqual(merged.dev[0].tolist(), [1, 0, 3, 2])
        self.assertListEqual(merged.dev[5].tolist(), [2, 0, 1, 3])
Example #2
0
    def test_populate_from_datasets_with_measurments(self):
        """Check intersection and union merging of a ``dev`` cell measurement.

        Two datasets with differently ordered, partly different ``dev``
        columns are merged twice from deep copies: once with default options
        (the test expects only the shared columns to remain) and once with
        ``cell_measurement_intersection={"dev": False}`` (the test expects
        all columns to remain).
        """
        counts = np.random.randint(1, 5, size=(5, 10))
        genes = np.array(["gene_%d" % idx for idx in range(10)])

        # Every row is 0..k-1, so a merged value identifies its source column.
        names_a = ["gabou", "achille", "pedro", "oclivio", "gayoso"]
        names_b = ["gabou", "oclivio", "achille", "pedro"]
        measurement_a = CellMeasurement(
            name="dev",
            data=np.ones((5, 5)) * np.arange(0, 5),
            columns_attr_name="dev_names",
            columns=names_a,
        )
        measurement_b = CellMeasurement(
            name="dev",
            data=np.ones((5, 4)) * np.arange(0, 4),
            columns_attr_name="dev_names",
            columns=names_b,
        )

        source_a = GeneExpressionDataset()
        source_b = GeneExpressionDataset()
        source_a.populate_from_data(counts,
                                    Ys=[measurement_a],
                                    gene_names=genes)
        source_b.populate_from_data(counts,
                                    Ys=[measurement_b],
                                    gene_names=genes)

        def merge_copies(**merge_options):
            # Merge deep copies so both sources can be reused for each case.
            combined = GeneExpressionDataset()
            combined.populate_from_datasets(
                [copy.deepcopy(source_a),
                 copy.deepcopy(source_b)],
                **merge_options)
            return combined

        shared = merge_copies()

        for attribute in ("dev", "dev_names"):
            self.assertTrue(hasattr(shared, attribute))

        self.assertListEqual(shared.dev_names.tolist(),
                             ["achille", "gabou", "oclivio", "pedro"])
        self.assertListEqual(shared.dev[0].tolist(), [1, 0, 3, 2])
        self.assertListEqual(shared.dev[5].tolist(), [2, 0, 1, 3])

        # Take union of dev columns, 0s fill remainder
        unioned = merge_copies(cell_measurement_intersection={"dev": False})
        self.assertListEqual(
            unioned.dev_names.tolist(),
            ["achille", "gabou", "gayoso", "oclivio", "pedro"],
        )
        # Column 2 is "gayoso", which only the first source provides, so the
        # mask entry for the second source is expected to be 0.
        batch_mask = unioned.get_batch_mask_cell_measurement("dev")
        self.assertEqual(batch_mask[1][2].astype(int), 0)
Example #3
0
 def test_populate_from_datasets_cortex(self):
     """Merge two differently filtered Cortex datasets and count the genes.

     The two inputs are subsampled with gene subsets ``0..2`` and ``1..3``
     and filtered to different cell types; the merged dataset is expected
     to contain exactly 2 genes.
     """
     save_dir = "tests/data"

     first = CortexDataset(save_path=save_dir)
     first.subsample_genes(subset_genes=np.arange(0, 3), mode="variance")
     first.filter_cell_types(["microglia", "oligodendrocytes"])

     second = CortexDataset(save_path=save_dir)
     second.subsample_genes(subset_genes=np.arange(1, 4), mode="variance")
     kept_by_name = [
         "endothelial-mural",
         "interneurons",
         "microglia",
         "oligodendrocytes",
     ]
     second.filter_cell_types(kept_by_name)
     # Second pass filters by integer identifiers instead of names.
     second.filter_cell_types([2, 0])

     merged = GeneExpressionDataset()
     merged.populate_from_datasets([first, second])
     self.assertEqual(2, merged.nb_genes)
Example #4
0
    def test_populate_from_datasets_dummy_data(self):
        """Exercise merging of random datasets: genes, labels and batches.

        Three datasets with 10, 3 and 5 genes are merged first; the test
        expects all 14 cells and the 3 common genes (upper-cased names).
        The last two datasets are then reused to check label merging with
        and without ``shared_labels``, and the offsetting of batch indices.
        """
        # (number of cells, number of genes) for each random input dataset.
        shapes = ((5, 10), (7, 3), (2, 5))
        sources = []
        for n_cells, n_genes in shapes:
            counts = np.random.randint(1, 5, size=(n_cells, n_genes))
            names = np.array(["gene_%d" % idx for idx in range(n_genes)])
            source = GeneExpressionDataset()
            source.populate_from_data(counts, gene_names=names)
            sources.append(source)
        _, seven_cells, two_cells = sources

        merged = GeneExpressionDataset()
        merged.populate_from_datasets(sources)
        self.assertEqual(14, merged.nb_cells)
        self.assertEqual(3, merged.nb_genes)
        self.assertListEqual(["GENE_0", "GENE_1", "GENE_2"],
                             merged.gene_names.tolist())

        # test for labels sharing
        label_setup = (
            (seven_cells, [0, 0, 0, 1, 1, 1, 1], ["0", "1"]),
            (two_cells, [0, 1], ["0", "2"]),
        )
        for source, labels, cell_types in label_setup:
            source.labels = labels
            source.initialize_mapped_attribute("labels", "cell_types",
                                               cell_types)

        shared = GeneExpressionDataset()
        shared.populate_from_datasets([seven_cells, two_cells],
                                      shared_labels=True)
        shared_labels = np.squeeze(shared.labels).tolist()
        self.assertListEqual(shared_labels, [0, 0, 0, 1, 1, 1, 1, 0, 2])
        self.assertListEqual(shared.cell_types, ["0", "1", "2"])

        dataset_unshared = GeneExpressionDataset()
        dataset_unshared.populate_from_datasets([seven_cells, two_cells],
                                                shared_labels=False)
        unshared_labels = np.squeeze(dataset_unshared.labels).tolist()
        self.assertListEqual(unshared_labels, [0, 0, 0, 1, 1, 1, 1, 2, 3])
        self.assertListEqual(dataset_unshared.cell_types, ["0", "1", "0", "2"])

        # test for batch_indices offsetting
        batch_setup = (
            (seven_cells, [0, 0, 0, 1, 1, 1, 1], ["fish_2", "scrna_2"]),
            (two_cells, [0, 1], ["fish_3", "scrna_3"]),
        )
        for source, batch_indices, experiments in batch_setup:
            source.batch_indices = batch_indices
            source.initialize_mapped_attribute("batch_indices", "experiment",
                                               experiments)

        batched = GeneExpressionDataset()
        batched.populate_from_datasets([seven_cells, two_cells])
        merged_batches = np.squeeze(batched.batch_indices).tolist()
        self.assertListEqual(merged_batches, [0, 0, 0, 1, 1, 1, 1, 2, 3])
        self.assertListEqual(batched.experiment,
                             ["fish_2", "scrna_2", "fish_3", "scrna_3"])
Example #5
0
    def test_populate_from_datasets_cell_attributes_merging(self):
        """Check that a per-cell attribute is stacked across merged datasets.

        Two 5-cell datasets carry a ``test`` cell attribute filled with "1"
        and "2" respectively; the merged attribute is expected to have shape
        ``(10, 1)`` with the first dataset's values followed by the second's.
        """
        counts = np.random.randint(1, 5, size=(5, 10))
        genes = np.array(["gene_%d" % idx for idx in range(10)])

        # One single-column attribute per dataset, constant within a dataset.
        attributes = [np.array([[tag]] * 5) for tag in ("1", "2")]
        sources = [GeneExpressionDataset() for _ in attributes]
        for source, attribute in zip(sources, attributes):
            source.populate_from_data(
                counts,
                gene_names=genes,
                cell_attributes_dict={"test": attribute},
            )

        merged = GeneExpressionDataset()
        merged.populate_from_datasets(sources)

        self.assertTupleEqual(merged.test.shape, (10, 1))
        expected = ["1"] * 5 + ["2"] * 5
        self.assertListEqual(np.squeeze(merged.test).tolist(), expected)
Example #6
0
    def test_populate_from_datasets_gene_attributes_merging(self):
        """Check which dataset wins when gene attributes conflict on merge.

        Both inputs define a ``test`` gene attribute over the same 10 genes,
        filled with "1" and "2" respectively; the merged dataset is expected
        to expose the first dataset's value.
        """
        counts = np.random.randint(1, 5, size=(5, 10))
        genes = np.array(["gene_%d" % idx for idx in range(10)])

        # One single-column attribute per dataset, constant within a dataset.
        attributes = [np.array([[tag]] * 10) for tag in ("1", "2")]
        sources = [GeneExpressionDataset() for _ in attributes]
        for source, attribute in zip(sources, attributes):
            source.populate_from_data(
                counts,
                gene_names=genes,
                gene_attributes_dict={"test": attribute},
            )

        merged = GeneExpressionDataset()
        merged.populate_from_datasets(sources)

        # Should keep the gene attribute of the first dataset
        self.assertEqual(merged.test[0, 0], "1")
Example #7
0
class scVI(Base_scVI):
    """Joint model trained on the partial FISH data merged with the seq data.

    NOTE(review): ``self.data``, ``self.n_latent``, ``self.USE_CUDA``,
    ``self.trainer_fish`` and ``self.trainer_seq`` are read here but never
    assigned in this class; presumably ``Base_scVI`` provides them — confirm.
    """

    def __init__(self, data, name, n_latent=10, reconstruction_seq='zinb'):
        """Build the merged dataset used for joint training.

        Args:
            data: Object exposing ``data_fish_partial`` and ``data_seq``
                datasets (``test_indices`` is also read later on).
            name: Forwarded unchanged to ``Base_scVI.__init__``.
            n_latent: Forwarded to ``Base_scVI.__init__``; later used as the
                latent dimension of the joint VAE.
            reconstruction_seq: Reconstruction loss name passed to the VAE.
        """
        super().__init__(data, name, n_latent)

        self.full_dataset = GeneExpressionDataset()

        # Merge deep copies rather than the originals, which are read again
        # later (FISH row count, library sizes, normalisation).
        self.full_dataset.populate_from_datasets([
            copy.deepcopy(data.data_fish_partial),
            copy.deepcopy(data.data_seq)
        ])
        self.full_dataset.compute_library_size_batch()
        self.reconstruction_seq = reconstruction_seq

    def train_both(self, n_epochs=20):
        """Train a VAE on the merged dataset and keep its trainer.

        Args:
            n_epochs: Number of training epochs.
        """
        vae_both = VAE(
            self.full_dataset.nb_genes,
            n_latent=self.n_latent,
            n_batch=self.full_dataset.n_batches,
            dispersion="gene-batch",
            reconstruction_loss=self.reconstruction_seq,
        )
        self.trainer_both = UnsupervisedTrainer(
            vae_both,
            self.full_dataset,
            train_size=0.95,
            use_cuda=self.USE_CUDA,
            frequency=1,
        )
        self.trainer_both.train(n_epochs=n_epochs, lr=0.001)

    def _compute_latent_both(self):
        """Compute the joint latent space and split it into FISH / seq rows."""
        self.latent_both = self.trainer_both.create_posterior().get_latent()[0]

        # The FISH dataset is listed first in the merge done by __init__, so
        # its cells are taken to be the leading rows of the joint latent space.
        n_fish_cells = self.data.data_fish_partial.X.shape[0]
        self.latent_both_fish = self.latent_both[:n_fish_cells, :]
        self.latent_both_seq = self.latent_both[n_fish_cells:, :]

    def compute_latent(self):
        """ Return latent_both_fish, latent_both_seq, latent_only_fish, latent_only_seq
        """
        self._compute_latent_both()

        fish_posterior = self.trainer_fish.create_posterior()
        self.latent_only_fish = fish_posterior.get_latent()[0]

        seq_posterior = self.trainer_seq.create_posterior()
        self.latent_only_seq = seq_posterior.get_latent()[0]

        return (
            self.latent_both_fish,
            self.latent_both_seq,
            self.latent_only_fish,
            self.latent_only_seq,
        )

    def compute_imputed_values(self, k=10):
        """Impute FISH expression from seq cells by kNN in the joint latent space.

        The joint latent representations are computed on demand if
        ``compute_latent`` has not been called yet, so this method no longer
        depends on a hidden call order.

        Args:
            k: Number of neighbours used by the distance-weighted regressor.

        Returns:
            The imputed matrix restricted to the columns ``data.test_indices``.
        """
        if (getattr(self, "latent_both_seq", None) is None
                or getattr(self, "latent_both_fish", None) is None):
            self._compute_latent_both()

        dataset = self.data.data_seq
        # Per-cell proportions: each row is divided by its total count.
        normalized_matrix = dataset.X / np.sum(dataset.X, axis=1)[:,
                                                                  np.newaxis]
        knn = KNeighborsRegressor(k, weights="distance")
        predicted = knn.fit(self.latent_both_seq,
                            normalized_matrix).predict(self.latent_both_fish)
        # Rescale the predicted proportions by each FISH cell's total count.
        self.imputed_full = predicted * self.data.data_fish_partial.X.sum(
            axis=1).reshape(-1, 1)
        self.imputed = self.imputed_full[:, self.data.test_indices]
        return self.imputed