def test_build_gene_map_from_memory(self):
    """Check the gene map built from two in-memory datasets.

    After ``build_genemap`` the union is expected to expose the gene names
    ``A`` .. ``F`` and a ``gene_map`` whose index is those names and whose
    values are ``0`` .. ``5``.
    """
    genes = list("ABCDEF")
    expected_map = pd.Series(np.arange(len(genes)), index=genes)

    # Work on copies so the module-level fixtures are never handed to the union.
    first = copy.deepcopy(dataset1)
    second = copy.deepcopy(dataset2)

    union = UnionDataset(
        save_path=save_path,
        low_memory=True,
        ignore_batch_annotation=False,
    )
    union.build_genemap(data_source="memory", gene_datasets=[first, second])

    self.assertEqual(union.gene_names.tolist(), genes)

    index_matches = union.gene_map.index.values == expected_map.index.values
    self.assertTrue(index_matches.all())

    values_match = union.gene_map.values == expected_map.values
    self.assertTrue(values_match.all())
def test_concatenate_from_memory_to_memory(self):
    """Join two in-memory datasets into an in-memory union and verify it.

    The expected gene names and cell types are the sorted unique values of
    both fixtures. The expected batch indices are the fixtures' batch
    indices stacked, with the second fixture's shifted by 5. The expected
    labels are the position of every cell's cell-type name within the
    sorted unique cell types. Finally the union is passed to
    ``unsupervised_training_one_epoch``.
    """
    dset1, dset2 = copy.deepcopy(dataset1), copy.deepcopy(dataset2)
    union = UnionDataset(
        save_path=save_path,
        low_memory=True,
        ignore_batch_annotation=False,
    )
    union.build_genemap(data_source="memory", gene_datasets=[dset1, dset2])
    union.join_datasets(
        data_source="memory",
        data_target="memory",
        gene_datasets=[dset1, dset2],
    )

    # np.unique already returns its result sorted, so no extra sort is needed.
    expected_gene_names = np.unique(
        np.concatenate([dataset1.gene_names, dataset2.gene_names])
    )
    expected_cell_types = np.unique(
        np.concatenate([dataset1.cell_types, dataset2.cell_types])
    )

    # NOTE(review): the offset 5 is hard-coded; presumably it is the number
    # of batches in dataset1 -- confirm against the fixture definition.
    expected_batch_indices = np.concatenate(
        [dataset1.batch_indices, dataset2.batch_indices + 5]
    ).reshape(-1, 1)

    # Cell-type name of every cell, in concatenation order.
    cell_type_per_cell = np.concatenate(
        [
            dataset1.cell_types[dataset1.labels],
            dataset2.cell_types[dataset2.labels],
        ]
    )
    # Look each name up in the sorted unique cell types instead of writing
    # integer ranks into the string array in place: that relied on implicit
    # int -> str coercion, which can truncate multi-digit ranks and can
    # collide with cell-type names that are themselves digit strings.
    expected_labels = np.searchsorted(
        expected_cell_types, cell_type_per_cell
    ).astype(int)

    self.assertTrue((union.gene_names == expected_gene_names).all())
    self.assertTrue((union.cell_types == expected_cell_types).all())
    self.assertTrue((union.batch_indices == expected_batch_indices).all())
    self.assertTrue((union.labels == expected_labels).all())

    unsupervised_training_one_epoch(union)
def test_concatenate_from_loom_to_hdf5(self):
    """Write a union to loom, convert it to hdf5 and compare with memory.

    Three unions are built from copies of the two fixtures:

    1. memory -> memory, used as the reference;
    2. memory -> loom, written to ``test_concat.loom``;
    3. loom -> hdf5, reading ``test_concat.loom`` and writing
       ``test_concat.h5``.

    The hdf5-backed union must match the reference in length, in a random
    sample of rows of ``X``, and in gene names, cell types, batch indices
    and labels. It is then passed to ``unsupervised_training_one_epoch``.

    On failure, both files this test may have created are removed before
    the exception is re-raised.
    """
    loom_name = "test_concat.loom"
    hdf5_name = "test_concat.h5"
    try:
        dset1, dset2 = copy.deepcopy(dataset1), copy.deepcopy(dataset2)

        # Concatenate the datasets in memory first as reference.
        union_from_mem_to_mem = UnionDataset(
            save_path=save_path,
            low_memory=True,
            ignore_batch_annotation=False,
        )
        union_from_mem_to_mem.build_genemap(
            data_source="memory", gene_datasets=[dset1, dset2]
        )
        union_from_mem_to_mem.join_datasets(
            data_source="memory",
            data_target="memory",
            gene_datasets=[dset1, dset2],
        )

        # Do the concatenation directly onto a loom file.
        union_from_mem_to_loom = UnionDataset(
            save_path=save_path,
            low_memory=True,
            ignore_batch_annotation=False,
        )
        union_from_mem_to_loom.build_genemap(
            data_source="memory", gene_datasets=[dset1, dset2]
        )
        union_from_mem_to_loom.join_datasets(
            data_source="memory",
            data_target="loom",
            gene_datasets=[dset1, dset2],
            out_filename=loom_name,
        )

        # Convert the loom file to an hdf5 file.
        union_from_loom_to_hdf5 = UnionDataset(
            save_path=save_path,
            low_memory=True,
            ignore_batch_annotation=False,
        )
        union_from_loom_to_hdf5.build_genemap(
            data_source="memory", gene_datasets=[dset1, dset2]
        )
        union_from_loom_to_hdf5.join_datasets(
            data_source="loom",
            data_target="hdf5",
            in_filename=loom_name,
            out_filename=hdf5_name,
        )

        # assertEqual reports both lengths on failure, unlike assertTrue.
        self.assertEqual(
            len(union_from_loom_to_hdf5), len(union_from_mem_to_mem)
        )

        # Compare a sorted random fifth of the rows.
        random_indices = np.sort(
            np.random.choice(
                np.arange(len(union_from_mem_to_mem)),
                size=int(len(union_from_mem_to_mem) / 5),
                replace=False,
            )
        )
        # The reference rows are densified with .toarray() before comparing.
        self.assertTrue(
            (
                union_from_loom_to_hdf5.X[random_indices]
                == union_from_mem_to_mem.X[random_indices].toarray()
            ).all()
        )
        self.assertTrue(
            (
                union_from_loom_to_hdf5.gene_names
                == union_from_mem_to_mem.gene_names
            ).all()
        )
        self.assertTrue(
            (
                union_from_loom_to_hdf5.cell_types
                == union_from_mem_to_mem.cell_types
            ).all()
        )
        self.assertTrue(
            (
                union_from_loom_to_hdf5.batch_indices
                == union_from_mem_to_mem.batch_indices
            ).all()
        )
        self.assertTrue(
            (
                union_from_loom_to_hdf5.labels
                == union_from_mem_to_mem.labels
            ).all()
        )

        unsupervised_training_one_epoch(union_from_loom_to_hdf5)
    except Exception:
        # This test writes both a loom and an hdf5 file; remove whichever
        # exist so a failed run does not leave a stale artifact behind.
        for artifact in (loom_name, hdf5_name):
            artifact_path = os.path.join(save_path, artifact)
            if os.path.exists(artifact_path):
                os.remove(artifact_path)
        # Bare raise re-raises the active exception with its traceback.
        raise
def test_concatenate_from_scvi_to_loom(self):
    """Join datasets given as classes plus arguments into a loom file.

    Two ``SyntheticDataset`` instances are joined in memory in both orders
    to serve as references. A third union is then built with
    ``data_source='scvi'`` from the dataset classes and their constructor
    arguments, targeting ``test_concat.loom``.

    Gene names and cell types must match the first reference. A random
    sample of rows of ``X``, the batch indices and the labels must match
    either reference. The loom-backed union is finally passed to
    ``unsupervised_training_one_epoch``. On failure the loom file is
    removed and the exception is re-raised.
    """
    loom_path = os.path.join(save_path, "test_concat.loom")

    def new_union():
        # Every union in this test is configured identically.
        return UnionDataset(
            save_path=save_path,
            low_memory=True,
            ignore_batch_annotation=False,
        )

    try:
        random_seed = 0
        dset1_args = {
            "batch_size": 10,
            "nb_genes": 4,
            "n_proteins": 4,
            "n_batches": 4,
            "n_labels": 3,
            "seed": random_seed,
        }
        dset2_args = {
            "batch_size": 30,
            "nb_genes": 2,
            "n_proteins": 6,
            "n_batches": 2,
            "n_labels": 4,
            "seed": random_seed,
        }
        dset1 = SyntheticDataset(**dset1_args)
        dset2 = SyntheticDataset(**dset2_args)

        # Reference: both datasets joined in memory, in the given order.
        reference = new_union()
        reference.build_genemap(
            data_source="memory", gene_datasets=[dset1, dset2]
        )
        reference.join_datasets(
            data_source="memory",
            data_target="memory",
            gene_datasets=[dset1, dset2],
        )

        # Second reference: same gene map, datasets joined in swapped order.
        swapped = new_union()
        swapped.build_genemap(
            data_source="memory", gene_datasets=[dset1, dset2]
        )
        swapped.join_datasets(
            data_source="memory",
            data_target="memory",
            gene_datasets=[dset2, dset1],
        )

        # Union under test: built from classes and arguments, written to loom.
        loom_union = new_union()
        loom_union.build_genemap(
            data_source="memory", gene_datasets=[dset1, dset2]
        )
        loom_union.join_datasets(
            data_source="scvi",
            data_target="loom",
            dataset_classes=[SyntheticDataset, SyntheticDataset],
            dataset_args=[dset1_args, dset2_args],
            out_filename="test_concat.loom",
        )

        self.assertTrue(len(loom_union) == (len(dset1) + len(dset2)))

        # Sorted random fifth of the row indices.
        n_cells = len(loom_union)
        rows = np.sort(
            np.random.choice(
                np.arange(n_cells), size=int(n_cells / 5), replace=False
            )
        )

        # X may match either join order; the swapped reference is only
        # consulted when the first comparison fails.
        x_ok = (loom_union.X[rows] == reference.X[rows].toarray()).all()
        if not x_ok:
            x_ok = (loom_union.X[rows] == swapped.X[rows].toarray()).all()
        self.assertTrue(x_ok)

        # These attributes must match the first reference exactly.
        for attr in ("gene_names", "cell_types"):
            same = getattr(loom_union, attr) == getattr(reference, attr)
            self.assertTrue(same.all())

        # These attributes may match either join order.
        for attr in ("batch_indices", "labels"):
            observed = getattr(loom_union, attr)
            attr_ok = (observed == getattr(reference, attr)).all()
            if not attr_ok:
                attr_ok = (observed == getattr(swapped, attr)).all()
            self.assertTrue(attr_ok)

        unsupervised_training_one_epoch(loom_union)
    except Exception as err:
        # Remove the loom file before propagating the failure.
        if os.path.exists(loom_path):
            os.remove(loom_path)
        raise err
def test_concatenate_from_hdf5_to_memory(self):
    """Write a union to hdf5, load it back into memory and compare.

    A memory -> memory union of the two fixtures is the reference. A
    second union writes the same join to ``test_concat.h5``, and a third
    (created with ``low_memory=False``) reads that file back with
    ``data_target="memory"``.

    The reloaded union must match the reference in length, in a random
    sample of rows of ``X``, and in gene names, cell types, batch indices
    and labels. It is then passed to ``unsupervised_training_one_epoch``.
    On failure the hdf5 file is removed and the exception is re-raised.
    """
    h5_path = os.path.join(save_path, "test_concat.h5")
    try:
        first = copy.deepcopy(dataset1)
        second = copy.deepcopy(dataset2)
        sources = [first, second]

        # Reference union, joined entirely in memory.
        reference = UnionDataset(
            save_path=save_path,
            low_memory=True,
            ignore_batch_annotation=False,
        )
        reference.build_genemap(data_source="memory", gene_datasets=sources)
        reference.join_datasets(
            data_source="memory",
            data_target="memory",
            gene_datasets=sources,
        )

        # Same join, written out to an hdf5 file.
        writer = UnionDataset(
            save_path=save_path,
            low_memory=True,
            ignore_batch_annotation=False,
        )
        writer.build_genemap(data_source="memory", gene_datasets=sources)
        writer.join_datasets(
            data_source="memory",
            data_target="hdf5",
            gene_datasets=sources,
            out_filename="test_concat.h5",
        )

        # Read the hdf5 file back into memory.
        reloaded = UnionDataset(
            save_path=save_path,
            low_memory=False,
            ignore_batch_annotation=False,
        )
        reloaded.build_genemap(data_source="memory", gene_datasets=sources)
        reloaded.join_datasets(
            data_source="hdf5",
            data_target="memory",
            in_filename="test_concat.h5",
        )

        self.assertTrue(len(reloaded) == len(reference))

        # Sorted random fifth of the row indices.
        n_cells = len(reference)
        rows = np.sort(
            np.random.choice(
                np.arange(n_cells), size=int(n_cells / 5), replace=False
            )
        )

        # The element-wise "!=" result must have no stored non-zero entries.
        mismatches = reloaded.X[rows] != reference.X[rows]
        self.assertTrue(mismatches.nnz == 0)

        for attr in ("gene_names", "cell_types", "batch_indices", "labels"):
            same = getattr(reloaded, attr) == getattr(reference, attr)
            self.assertTrue(same.all())

        unsupervised_training_one_epoch(reloaded)
    except Exception as err:
        # Remove the hdf5 file before propagating the failure.
        if os.path.exists(h5_path):
            os.remove(h5_path)
        raise err