Example #1
0
 def populate(self):
     """Extract the counts and centroid tables from the archive and load them.

     Reads ``self.file_prefix``, ``self.save_path``, ``self.filenames[0]`` and
     ``self.tissue_region``. The two CSV members are unpacked into
     ``<save_path>/seqfishplus`` and parsed with pandas, then handed to
     ``self.populate_from_data`` together with:

     * a ``CellMeasurement`` named ``"coords"`` built from the ``X``/``Y``
       columns of the centroid table, and
     * per-cell attributes ``cell_id`` and ``field_of_view`` (plus ``region``
       when the tissue region is ``"subventricular cortex"``).
     """
     counts_filename = "sourcedata/{}_counts.csv".format(self.file_prefix)
     coordinates_filename = "sourcedata/{}_cellcentroids.csv".format(
         self.file_prefix
     )

     data_path = os.path.join(self.save_path, "seqfishplus")
     # exist_ok=True replaces the check-then-create pattern, which could fail
     # if another process created the directory between the two calls.
     os.makedirs(data_path, exist_ok=True)

     archive_path = os.path.join(self.save_path, self.filenames[0])
     with zipfile.ZipFile(archive_path) as archive:
         for member in (counts_filename, coordinates_filename):
             archive.extract(member, path=data_path)

     df_counts = pd.read_csv(os.path.join(data_path, counts_filename))
     df_coordinates = pd.read_csv(os.path.join(data_path, coordinates_filename))

     coordinates = CellMeasurement(
         name="coords",
         data=df_coordinates[["X", "Y"]],
         columns_attr_name="axis",
         columns=["x", "y"],
     )

     # Maps CSV column names to the attribute names stored on the dataset.
     cell_attributes_name_mapping = {
         "Cell ID": "cell_id",
         "Field of View": "field_of_view",
     }
     if self.tissue_region == "subventricular cortex":
         # Only this tissue region gets the extra "Region" column.
         cell_attributes_name_mapping["Region"] = "region"
     cell_attributes_dict = {
         attribute_name: df_coordinates[column_name]
         for column_name, attribute_name in cell_attributes_name_mapping.items()
     }

     self.populate_from_data(
         X=df_counts.values,
         gene_names=df_counts.columns,
         Ys=[coordinates],
         cell_attributes_dict=cell_attributes_dict,
     )
Example #2
0
    def test_special_dataset_size(self):
        """Train several models for one epoch on a 34-cell dataset.

        The dataset holds 17 * 2 cells with 10 genes and 10 proteins of random
        integer counts. Each trainer uses ``train_size=0.5`` and a batch size
        of 8, runs on CPU, and is trained for a single epoch.
        """
        gene_dataset = GeneExpressionDataset()
        n_cells, n_features = 17 * 2, 10
        gene_counts = np.random.randint(1, 100, (n_cells, n_features))
        protein_counts = np.random.randint(1, 100, (n_cells, n_features))
        gene_dataset.populate_from_data(gene_counts)
        gene_dataset.initialize_cell_measurement(
            CellMeasurement(
                name="protein_expression",
                data=protein_counts,
                columns_attr_name="protein_names",
                columns=np.arange(n_features),
            )
        )
        n_genes = gene_dataset.nb_genes

        def small_batch_kwargs():
            # Built fresh on every call so the trainers never share a dict.
            # NOTE(review): half of 34 cells is 17, and 17 % 8 == 1, so this
            # presumably exercises a trailing batch of a single cell -- confirm.
            return {
                "train_size": 0.5,
                "use_cuda": False,
                "data_loader_kwargs": {"batch_size": 8},
            }

        # UnsupervisedTrainer with a plain VAE
        vae = VAE(
            n_genes,
            n_batch=gene_dataset.n_batches,
            n_labels=gene_dataset.n_labels,
        )
        UnsupervisedTrainer(vae, gene_dataset, **small_batch_kwargs()).train(
            n_epochs=1
        )

        # JVAETrainer with the same dataset supplied twice
        jvae = JVAE(
            [n_genes] * 2,
            n_genes,
            [slice(None), slice(None)],
            ["zinb"] * 2,
            [True] * 2,
            n_batch=1,
        )
        discriminator = Classifier(n_genes, n_labels=2, logits=True)
        JVAETrainer(
            jvae,
            discriminator,
            [gene_dataset, gene_dataset],
            **small_batch_kwargs()
        ).train(n_epochs=1)

        # TotalTrainer with TOTALVI, early stopping disabled
        totalvae = TOTALVI(n_genes, len(gene_dataset.protein_names))
        TotalTrainer(
            totalvae,
            gene_dataset,
            early_stopping_kwargs=None,
            **small_batch_kwargs()
        ).train(n_epochs=1)
Example #3
0
 def test_data_loader(self):
     """Round-trip a dataset carrying a cell measurement through AnnData.

     Builds a 25-cell dataset with a four-column ``dev`` measurement, converts
     it with ``to_anndata`` and reloads it via ``AnnDatasetFromAnnData``. The
     measurement, the expression matrix and the cell types must all match.
     """
     n_cells = 25
     expression = np.full((n_cells, 10), 100.0)
     column_labels = ["gabou", "achille", "pedro", "oclivio"]
     # Broadcasting makes every row equal to [0, 1, 2, 3].
     measurement_values = np.ones((n_cells, 4)) * np.arange(0, 4)
     dev_measurement = CellMeasurement(
         name="dev",
         data=measurement_values,
         columns_attr_name="dev_names",
         columns=column_labels,
     )

     original = GeneExpressionDataset()
     original.populate_from_data(expression, Ys=[dev_measurement])
     reloaded = AnnDatasetFromAnnData(
         original.to_anndata(),
         cell_measurements_col_mappings={"dev": "dev_names"},
     )

     dev_matches = (measurement_values == reloaded.dev).all()
     self.assertTrue(dev_matches)
     x_matches = (original.X == reloaded.X).all()
     self.assertTrue(x_matches)
     cell_types_match = (original.cell_types == reloaded.cell_types).all()
     self.assertTrue(cell_types_match)
    adatas.append(anndataset_111[anndataset_111.obs["batch_indices"] == b, :].copy())
    adatas[-1].obs["batch_indices"] *= 0
# Split anndataset_206 by batch: append one copied subset per unique value of
# obs["batch_indices"] to adatas.
for b in np.unique(anndataset_206.obs["batch_indices"]):
    adatas.append(anndataset_206[anndataset_206.obs["batch_indices"] == b, :].copy())
    # In-place multiplication by 0 resets every batch index of the copy just
    # appended to 0.
    adatas[-1].obs["batch_indices"] *= 0

# Labels paired positionally with the entries of adatas by the zip() below.
# NOTE(review): assumes adatas holds exactly four entries in this order -- confirm.
names = ["111_d1", "111_d2", "206_d1", "206_d2"]

# Iterate over datasets
for n, adata in zip(names, adatas):
    hvg = adata.var["hvg_encode"]

    dataset = AnnDatasetFromAnnData(ad=adata[:, hvg])
    protein_data = CellMeasurement(
        name="protein_expression",
        data=adata.obsm["protein_expression"].astype(np.float32),
        columns_attr_name="protein_names",
        columns=adata.uns["protein_names"],
    )
    dataset.initialize_cell_measurement(protein_data)
    dataset.gene_names = adata[:, hvg].var_names.values
    
    set_seed(0)

    model = TOTALVI(dataset.nb_genes, dataset.protein_expression.shape[1], n_latent=20,)
    use_cuda = True
    lr = 4e-3
    early_stopping_kwargs = {
        "early_stopping_metric": "elbo",
        "save_best_state_metric": "elbo",
        "patience": 45,
        "threshold": 0,