def populate(self):
    """Extract the counts and centroid CSVs from the archive and load them.

    The zip archive ``self.filenames[0]`` located in ``self.save_path`` is
    opened and the two ``sourcedata/<file_prefix>_*.csv`` members are
    extracted into ``<save_path>/seqfishplus``. The counts table becomes the
    expression matrix (its column labels are passed as gene names), the
    ``X``/``Y`` columns of the centroid table become a ``"coords"`` cell
    measurement, and selected centroid columns are forwarded as per-cell
    attributes through ``self.populate_from_data``.
    """
    counts_filename = "sourcedata/{}_counts.csv".format(self.file_prefix)
    coordinates_filename = "sourcedata/{}_cellcentroids.csv".format(
        self.file_prefix
    )

    data_path = os.path.join(self.save_path, "seqfishplus")
    # exist_ok=True replaces the exists()/makedirs() pair, which could fail
    # if the directory was created between the check and the call.
    os.makedirs(data_path, exist_ok=True)

    archive_path = os.path.join(self.save_path, self.filenames[0])
    with zipfile.ZipFile(archive_path) as archive:
        for member in (counts_filename, coordinates_filename):
            # Members keep their "sourcedata/" prefix below data_path.
            archive.extract(member, path=data_path)

    df_counts = pd.read_csv(os.path.join(data_path, counts_filename))
    df_coordinates = pd.read_csv(os.path.join(data_path, coordinates_filename))

    coordinates = CellMeasurement(
        name="coords",
        data=df_coordinates[["X", "Y"]],
        columns_attr_name="axis",
        columns=["x", "y"],
    )

    # Source column name -> attribute name exposed on the dataset.
    cell_attributes_name_mapping = {
        "Cell ID": "cell_id",
        "Field of View": "field_of_view",
    }
    # The "Region" column is only read for the subventricular cortex tissue.
    if self.tissue_region == "subventricular cortex":
        cell_attributes_name_mapping["Region"] = "region"

    cell_attributes_dict = {
        attribute_name: df_coordinates[column_name]
        for column_name, attribute_name in cell_attributes_name_mapping.items()
    }

    self.populate_from_data(
        X=df_counts.values,
        gene_names=df_counts.columns,
        Ys=[coordinates],
        cell_attributes_dict=cell_attributes_dict,
    )
def test_special_dataset_size(self):
    """Train each trainer for one epoch on a 34-cell dataset.

    Every trainer is built with ``train_size=0.5`` and a batch size of 8.
    NOTE(review): presumably the 17 * 2 cell count is chosen so the split
    leaves a minibatch with a single cell -- confirm against the trainers.
    The test passes if no trainer raises.
    """

    def trainer_options(**extra):
        # A fresh dict (and fresh data_loader_kwargs) for every trainer.
        options = {
            "train_size": 0.5,
            "use_cuda": False,
            "data_loader_kwargs": {"batch_size": 8},
        }
        options.update(extra)
        return options

    gene_dataset = GeneExpressionDataset()
    n_cells, n_features = 17 * 2, 10
    rna_counts = np.random.randint(1, 100, (n_cells, n_features))
    protein_counts = np.random.randint(1, 100, (n_cells, n_features))
    gene_dataset.populate_from_data(rna_counts)

    # Attach a second modality so that TOTALVI can be exercised below.
    gene_dataset.initialize_cell_measurement(
        CellMeasurement(
            name="protein_expression",
            data=protein_counts,
            columns_attr_name="protein_names",
            columns=np.arange(n_features),
        )
    )

    # Test UnsupervisedTrainer
    vae = VAE(
        gene_dataset.nb_genes,
        n_batch=gene_dataset.n_batches,
        n_labels=gene_dataset.n_labels,
    )
    UnsupervisedTrainer(vae, gene_dataset, **trainer_options()).train(n_epochs=1)

    # Test JVAETrainer
    n_genes = gene_dataset.nb_genes
    jvae = JVAE(
        [n_genes, n_genes],
        n_genes,
        [slice(None)] * 2,
        ["zinb", "zinb"],
        [True, True],
        n_batch=1,
    )
    discriminator = Classifier(gene_dataset.nb_genes, n_labels=2, logits=True)
    JVAETrainer(
        jvae,
        discriminator,
        [gene_dataset, gene_dataset],
        **trainer_options()
    ).train(n_epochs=1)

    # Test TotalTrainer
    totalvae = TOTALVI(gene_dataset.nb_genes, len(gene_dataset.protein_names))
    TotalTrainer(
        totalvae,
        gene_dataset,
        **trainer_options(early_stopping_kwargs=None)
    ).train(n_epochs=1)
def test_data_loader(self):
    """Round-trip a dataset through AnnData and compare the contents.

    A dataset with one extra cell measurement ("dev") is converted with
    ``to_anndata`` and read back with ``AnnDatasetFromAnnData``; the
    measurement, ``X`` and ``cell_types`` must be equal on both sides.
    """
    n_cells = 25
    expression = np.ones((n_cells, 10)) * 100
    # Every row is [0, 1, 2, 3]: one constant value per measurement column.
    dev_values = np.ones((n_cells, 4)) * np.arange(0, 4)
    dev_columns = ["gabou", "achille", "pedro", "oclivio"]
    dev_measurement = CellMeasurement(
        name="dev",
        data=dev_values,
        columns_attr_name="dev_names",
        columns=dev_columns,
    )

    dataset = GeneExpressionDataset()
    dataset.populate_from_data(expression, Ys=[dev_measurement])

    adata = dataset.to_anndata()
    reloaded = AnnDatasetFromAnnData(
        adata, cell_measurements_col_mappings={"dev": "dev_names"}
    )

    dev_matches = (dev_values == reloaded.dev).all()
    self.assertTrue(dev_matches)
    counts_match = (dataset.X == reloaded.X).all()
    self.assertTrue(counts_match)
    cell_types_match = (dataset.cell_types == reloaded.cell_types).all()
    self.assertTrue(cell_types_match)
adatas.append(anndataset_111[anndataset_111.obs["batch_indices"] == b, :].copy()) adatas[-1].obs["batch_indices"] *= 0 for b in np.unique(anndataset_206.obs["batch_indices"]): adatas.append(anndataset_206[anndataset_206.obs["batch_indices"] == b, :].copy()) adatas[-1].obs["batch_indices"] *= 0 names = ["111_d1", "111_d2", "206_d1", "206_d2"] # Iterate over datasets for n, adata in zip(names, adatas): hvg = adata.var["hvg_encode"] dataset = AnnDatasetFromAnnData(ad=adata[:, hvg]) protein_data = CellMeasurement( name="protein_expression", data=adata.obsm["protein_expression"].astype(np.float32), columns_attr_name="protein_names", columns=adata.uns["protein_names"], ) dataset.initialize_cell_measurement(protein_data) dataset.gene_names = adata[:, hvg].var_names.values set_seed(0) model = TOTALVI(dataset.nb_genes, dataset.protein_expression.shape[1], n_latent=20,) use_cuda = True lr = 4e-3 early_stopping_kwargs = { "early_stopping_metric": "elbo", "save_best_state_metric": "elbo", "patience": 45, "threshold": 0,