def test_heterogeneous_collections(self):
    """Test detection of heterogeneous collections.

    The check is done in ``__init__``, so it is sufficient to
    initialize RNATables with an appropriate collection.
    """
    # (process slug, source, expected error pattern) for the second
    # Data object that is placed in the collection next to ``self.data``.
    cases = (
        # Different processes
        ("process-slug2", "ENSEMBL", r"Expressions of all samples.*"),
        # Different source
        ("process-slug", "GENCODE", r"Alignment of all samples.*"),
    )

    for slug, source, error_pattern in cases:
        other = MagicMock()
        other.id = 12345
        other.process.slug = slug
        other.output.__getitem__.side_effect = {"source": source}.__getitem__
        self.collection.data.filter = self.web_request([self.data, other])

        with self.assertRaisesRegex(ValueError, error_pattern):
            RNATables(self.collection)
def test_id_to_symbol(self, mapping_mock):
    """Check ``id_to_symbol``: needs gene ids, maps them and is cached.

    ``mapping_mock`` is the injected mock; its side effect is
    ``self.web_request(self.gene_map)``.
    """
    gene_ids = ["ENSG001", "ENSG002", "ENSG003"]
    mapping_mock.side_effect = self.web_request(self.gene_map)

    # No gene ids set on the instance: the property must raise.
    tables = RNATables(self.collection)
    with self.assertRaises(ValueError):
        _ = tables.id_to_symbol

    tables = RNATables(self.collection)
    tables.gene_ids = list(gene_ids)

    # First access is expected to take longer than 0.1 s.
    started = time()
    symbols = tables.id_to_symbol
    elapsed = time() - started
    self.assertGreater(elapsed, 0.1)
    mapping_mock.assert_called_with(gene_ids, "ENSEMBL", "H**o sapiens")
    self.assertIs(symbols, self.gene_map)

    # test if use case works
    renamed = self.expressions_df.rename(columns=tables.id_to_symbol)
    self.assertListEqual(renamed.columns.tolist(), ["GA", "GB", "GC"])

    # use cache
    started = time()
    symbols = tables.id_to_symbol
    elapsed = time() - started
    self.assertLess(elapsed, 0.1)
    self.assertIs(symbols, self.gene_map)
def test_get_descriptors(self):
    """Compare ``_get_descriptors`` output with the expected frame."""
    # Single float ``PFS`` value for sample 123, index named ``sample_id``.
    wanted = pd.DataFrame(
        [1],
        index=pd.Index([123], name="sample_id"),
        columns=["PFS"],
        dtype=float,
    )

    tables = RNATables(self.collection)
    assert_frame_equal(tables._get_descriptors(), wanted)
def test_get_relations(self):
    """Compare ``_get_relations`` output with the expected frame."""
    # Single ``Category`` label for sample 123, index named ``sample_id``.
    wanted = pd.DataFrame(
        ["L1"],
        index=pd.Index([123], name="sample_id"),
        columns=["Category"],
    )

    tables = RNATables(self.collection)
    assert_frame_equal(tables._get_relations(), wanted)
def setUp(self):
    """Create a temporary cache dir, connect and build the tables object."""
    self.cache_dir = tempfile.mkdtemp()
    self.test_server_url = "https://app.genialis.com"
    self.test_collection_slug = "resdk-test-collection-tables"

    connection = resdk.Resolwe(
        url=self.test_server_url,
        username="******",
        password="******",
    )
    self.res = connection
    self.collection = connection.collection.get(self.test_collection_slug)
    self.ct = RNATables(self.collection, cache_dir=self.cache_dir)
def test_init(self, exists_mock):
    """Check the stored collection and the default/custom cache directory."""
    default_dir = "/tmp/resdk/"
    tables = RNATables(self.collection)
    self.assertIs(tables.collection, self.collection)
    self.assertEqual(tables.cache_dir, default_dir)
    exists_mock.assert_called_with(default_dir)

    # using different cache dir
    custom_dir = "/tmp/cache_dir/"
    tables = RNATables(self.collection, cache_dir=custom_dir)
    self.assertEqual(tables.cache_dir, custom_dir)
    exists_mock.assert_called_with(custom_dir)
def test_get_orange_data(self):
    """Parse a tab-separated Orange payload into a sample-indexed frame."""
    # Fake HTTP response carrying a header row and one data row.
    fake_response = MagicMock()
    fake_response.content = b"mS#Sample ID\tCol1\n123\t42"
    self.collection.resolwe.session.get.return_value = fake_response
    self.collection.data.get = self.web_request(self.orange_data)

    wanted = pd.DataFrame(
        [42],
        index=pd.Index([123], name="sample_id"),
        columns=["Col1"],
    )

    tables = RNATables(self.collection)
    assert_frame_equal(tables._get_orange_data(), wanted)
def test_get_data_uri(self):
    """``_get_data_uri`` needs exactly one file, otherwise ``LookupError``."""
    self.data.files.return_value = ["exp_file.csv"]
    tables = RNATables(self.collection)
    uri = tables._get_data_uri(self.data, RNATables.EXP)
    self.assertEqual(uri, "12345/exp_file.csv")

    # Zero files and more than one file are both rejected.
    for file_names in ([], ["exp_file1.csv", "exp_file2.csv"]):
        self.data.files.return_value = file_names
        with self.assertRaises(LookupError):
            tables._get_data_uri(self.data, RNATables.EXP)
def test_data_version(self):
    """``_data_version`` is the stringified hash of the data ids tuple."""
    tables = RNATables(self.collection)
    self.assertEqual(tables._data_version, str(hash((12345,))))

    # use cache
    started = time()
    _ = tables._data_version
    self.assertLess(time() - started, 0.1)

    # A collection without data must be rejected.
    self.collection.data.filter = MagicMock(return_value=[])
    empty_tables = RNATables(self.collection)
    with self.assertRaises(ValueError):
        _ = empty_tables._data_version
def test_qc_version(self):
    """``_qc_version`` is the stringified hash of the data ids tuple."""
    self.collection.data.filter = self.web_request([self.data])
    tables = RNATables(self.collection)
    self.assertEqual(tables._qc_version, str(hash((12345,))))

    # use cache
    started = time()
    _ = tables._qc_version
    self.assertLess(time() - started, 0.1)

    # An empty filter result must be rejected.
    self.collection.data.filter = self.web_request([])
    empty_tables = RNATables(self.collection)
    with self.assertRaises(ValueError):
        _ = empty_tables._qc_version
def test_caching(self):
    """Check that ``rc`` is cached on disk and in memory.

    Each successive access (download, disk cache, memory cache) is
    asserted to be faster than the previous one.
    """

    def timed_rc(tables):
        """Return ``tables.rc`` together with the seconds it took to get."""
        started = time.time()
        frame = tables.rc
        return frame, time.time() - started

    # First access with self.ct populates the cache
    rc_first, first_duration = timed_rc(self.ct)

    # Make sure that cache file is created
    self.assertTrue(os.path.isfile(self.ct._cache_file(self.ct.RC)))

    # Make new table instance (to prevent loading from memory)
    fresh_tables = RNATables(self.collection, cache_dir=self.cache_dir)

    # Second access, on the new instance, should load from disk cache
    rc_disk, disk_duration = timed_rc(fresh_tables)
    self.assertTrue((rc_first == rc_disk).all(axis=None))
    self.assertTrue(disk_duration < first_duration)

    # Third access, again on the new instance, tests loading from memory
    rc_memory, memory_duration = timed_rc(fresh_tables)
    self.assertTrue((rc_disk == rc_memory).all(axis=None))
    self.assertTrue(memory_duration < disk_duration)
def test_metadata_version(self):
    """Check the ``_metadata_version`` value, its caching and error path."""
    collection = self.collection
    collection.samples.get = self.web_request(self.sample)
    collection.relations.get = self.web_request(self.relation)
    collection.data.get = self.web_request(self.orange_data)

    tables = RNATables(collection)
    self.assertEqual(tables._metadata_version, "2020-11-01T12:15:00Z")

    # use cache
    started = time()
    _ = tables._metadata_version
    self.assertLess(time() - started, 0.1)

    # A failing sample lookup must surface as ValueError.
    collection.samples.get = MagicMock(side_effect=LookupError())
    failing_tables = RNATables(collection)
    with self.assertRaises(ValueError):
        _ = failing_tables._metadata_version
def test_download_metadata(self, orange_mock, relations_mock, descriptors_mock):
    """Check that ``_download_metadata`` combines all three sources.

    The three injected mocks supply the descriptor, relation and Orange
    frames; the result must hold one column from each.
    """
    sample_index = [123]
    descriptors_mock.return_value = self.metadata_df
    relations_mock.return_value = pd.DataFrame(
        [["A"]], index=sample_index, columns=["Replicate"]
    )
    orange_mock.return_value = pd.DataFrame(
        [["X"]], index=sample_index, columns=["Clinical"]
    )

    wanted = pd.DataFrame(
        [[0, "A", "X"]],
        columns=["PFS", "Replicate", "Clinical"],
        index=[123],
    )
    wanted.index.name = "sample_id"

    tables = RNATables(self.collection)
    assert_frame_equal(tables._download_metadata(), wanted)
def test_download_mapping(self):
    """Check the feature query issued by ``_download_mapping`` and its result."""

    def fake_feature(feature_id, symbol):
        """Build a mock feature exposing ``feature_id`` and ``name``."""
        feature = MagicMock(feature_id=feature_id)
        # name can't be set on initialization
        feature.name = symbol
        return feature

    feature_filter = self.resolwe.feature.filter
    features = []
    for feature_id, symbol in self.gene_map.items():
        features.append(fake_feature(feature_id, symbol))
    feature_filter.return_value = features

    tables = RNATables(self.collection)
    result = tables._download_mapping(
        ["ENSG001", "ENSG002", "ENSG003"], "ENSEMBL", "H**o sapiens"
    )

    feature_filter.assert_called_once()
    feature_filter.assert_called_once_with(
        source="ENSEMBL",
        species="H**o sapiens",
        feature_id__in=["ENSG001", "ENSG002", "ENSG003"],
    )
    self.assertDictEqual(result, self.gene_map)
def test_meta(self, load_mock):
    """Check that ``meta`` is loaded with ``RNATables.META`` and then cached."""
    load_mock.side_effect = self.web_request(self.metadata_df)
    tables = RNATables(self.collection)

    # First access is expected to take longer than 0.1 s.
    started = time()
    frame = tables.meta
    elapsed = time() - started
    self.assertGreater(elapsed, 0.1)
    self.assertIs(frame, self.metadata_df)
    load_mock.assert_called_with(RNATables.META)

    # use cache
    started = time()
    frame = tables.meta
    elapsed = time() - started
    self.assertLess(elapsed, 0.1)
    self.assertIs(frame, self.metadata_df)
def test_mapping(self, download_mock, save_mock, load_mock):
    """Check that ``_mapping`` downloads missing ids and saves the result.

    First pass: nothing is cached (``load_mock`` returns ``None``), so all
    ids are requested from ``download_mock`` and the full map is passed to
    ``save_mock``. Second pass: two ids are already cached, so only the
    missing one may be requested.
    """
    gene_ids = ["ENSG001", "ENSG002", "ENSG003"]
    # The species must agree with the cache file name asserted below
    # ("ENSEMBL_Homo sapiens.pickle"); the previous literal was garbled.
    source, species = "ENSEMBL", "Homo sapiens"

    load_mock.return_value = None
    download_mock.return_value = self.gene_map

    ct = RNATables(self.collection)
    mapping = ct._mapping(list(gene_ids), source, species)
    self.assertDictEqual(mapping, self.gene_map)
    # Order of the requested ids is not asserted, hence the sort.
    self.assertListEqual(sorted(download_mock.call_args[0][0]), gene_ids)
    save_mock.assert_called_with(
        self.gene_map, "/tmp/resdk/ENSEMBL_Homo sapiens.pickle", override=True
    )

    # download only missing values
    download_mock.reset_mock()
    load_mock.return_value = {"ENSG002": "GB", "ENSG003": "GC"}
    mapping = ct._mapping(list(gene_ids), source, species)
    self.assertDictEqual(mapping, self.gene_map)
    self.assertListEqual(sorted(download_mock.call_args[0][0]), ["ENSG001"])
def test_rc(self, load_mock):
    """Check that ``rc`` is loaded with ``RNATables.RC``, sets gene ids and caches."""
    load_mock.side_effect = self.web_request(self.expressions_df)
    tables = RNATables(self.collection)

    # First access is expected to take longer than 0.1 s.
    started = time()
    counts = tables.rc
    elapsed = time() - started
    self.assertGreater(elapsed, 0.1)
    self.assertIs(counts, self.expressions_df)
    load_mock.assert_called_with(RNATables.RC)
    self.assertListEqual(tables.gene_ids, ["ENSG001", "ENSG002", "ENSG003"])

    # use cache
    started = time()
    counts = tables.rc
    elapsed = time() - started
    self.assertLess(elapsed, 0.1)
    self.assertIs(counts, self.expressions_df)
def test_qc(self, load_mock):
    """Check that ``qc`` is loaded through ``load_mock`` and then cached.

    The test previously read ``ct.meta`` and asserted ``RNATables.META``,
    so the ``qc`` property itself was never exercised.
    """
    qc_df = pd.DataFrame(
        [[None, 30], [12, 42]],
        index=["0", "1"],
        columns=["total_read_count_raw", "total_read_count_trimmed"],
    )
    load_mock.side_effect = self.web_request(qc_df)

    ct = RNATables(self.collection)

    # First access is expected to take longer than 0.1 s.
    t = time()
    qc = ct.qc
    self.assertGreater(time() - t, 0.1)
    self.assertIs(qc, qc_df)
    # NOTE(review): assumes RNATables defines a ``QC`` table identifier
    # next to ``META``/``RC``/``EXP`` - confirm against the class.
    load_mock.assert_called_with(RNATables.QC)

    # use cache
    t = time()
    qc = ct.qc
    self.assertLess(time() - t, 0.1)
    self.assertIs(qc, qc_df)
def test_load_fetch(self, data_mock, meta_mock, save_mock, load_mock):
    """Check ``_load_fetch``: fetch and save when uncached, else just load.

    While ``load_mock`` returns ``None`` every table is expected to be
    fetched and handed to ``save_mock`` with a versioned file name. Once
    ``load_mock`` returns a frame, ``data_mock`` must not be called.
    """
    data_mock.return_value = self.expressions_df
    meta_mock.return_value = self.metadata_df
    load_mock.return_value = None

    collection = self.collection
    collection.samples.get = self.web_request(self.sample)
    collection.relations.get = self.web_request(self.relation)
    collection.data.get = self.web_request(self.orange_data)
    tables = RNATables(collection)

    # Metadata: the file name carries the metadata version.
    loaded = tables._load_fetch(RNATables.META)
    self.assertIs(loaded, self.metadata_df)
    save_mock.assert_called_with(
        self.metadata_df,
        "/tmp/resdk/slug_meta_None_None_2020-11-01T12:15:00Z.pickle",
    )

    # Expressions and raw counts: the file name carries the data version.
    data_version = str(hash((12345,)))
    for data_type, label in ((RNATables.EXP, "exp"), (RNATables.RC, "rc")):
        data_mock.reset_mock()
        save_mock.reset_mock()
        loaded = tables._load_fetch(data_type)
        self.assertIs(loaded, self.expressions_df)
        data_mock.assert_called_with(data_type)
        save_mock.assert_called_with(
            self.expressions_df,
            f"/tmp/resdk/slug_{label}_None_None_{data_version}.pickle",
        )

    # Cached data available: nothing may be fetched.
    data_mock.reset_mock()
    load_mock.return_value = self.expressions_df
    loaded = tables._load_fetch(RNATables.EXP)
    self.assertIs(loaded, self.expressions_df)
    data_mock.assert_not_called()
def test_clear_cache(self, clear_mock):
    """Calling ``RNATables.clear_cache`` must invoke ``clear_mock``."""
    RNATables.clear_cache()

    clear_mock.assert_called()
class TestTables(BaseResdkFunctionalTest):
    """Functional tests of RNATables on the ``resdk-test-collection-tables`` collection."""

    def setUp(self):
        """Create a temporary cache dir, connect and build the tables object."""
        self.cache_dir = tempfile.mkdtemp()
        self.test_server_url = "https://app.genialis.com"
        self.test_collection_slug = "resdk-test-collection-tables"

        connection = resdk.Resolwe(
            url=self.test_server_url,
            username="******",
            password="******",
        )
        self.res = connection
        self.collection = connection.collection.get(self.test_collection_slug)
        self.ct = RNATables(self.collection, cache_dir=self.cache_dir)

    def tearDown(self):
        """Remove the temporary cache directory."""
        shutil.rmtree(self.cache_dir)

    def test_meta(self):
        """Check shape, a known sample id and a known column of ``meta``."""
        meta = self.ct.meta
        self.assertEqual(meta.shape, (8, 9))
        self.assertIn(39000, meta.index)
        self.assertIn("general.species", meta.columns)

    def test_qc(self):
        """Check shape, a known sample id and a known value of ``qc``."""
        qc = self.ct.qc
        self.assertEqual(qc.shape, (8, 12))
        self.assertIn(39000, qc.index)
        self.assertIn("total_read_count_raw", qc.columns)
        raw_reads = qc.loc[39000, "total_read_count_raw"]
        self.assertEqual(int(raw_reads), 42738650)

    def test_rc(self):
        """Check shape, labels and the first value (an ``np.int64``) of ``rc``."""
        rc = self.ct.rc
        self.assertEqual(rc.shape, (8, 58487))
        self.assertIn(39000, rc.index)
        self.assertIn("ENSG00000000003", rc.columns)
        first_value = rc.iloc[0, 0]
        self.assertEqual(first_value, 792)
        self.assertIsInstance(first_value, np.int64)

    def test_exp(self):
        """Check shape, labels and the first value (an ``np.float64``) of ``exp``."""
        exp = self.ct.exp
        self.assertEqual(exp.shape, (8, 58487))
        self.assertIn(39000, exp.index)
        self.assertIn("ENSG00000000003", exp.columns)
        first_value = exp.iloc[0, 0]
        self.assertAlmostEqual(first_value, 19.447467, places=3)
        self.assertIsInstance(first_value, np.float64)

    def test_consistent_index(self):
        """``exp`` and ``rc`` must share their index with ``meta``."""
        exp_matches = self.ct.exp.index == self.ct.meta.index
        self.assertTrue(all(exp_matches))
        rc_matches = self.ct.rc.index == self.ct.meta.index
        self.assertTrue(all(rc_matches))

    def test_caching(self):
        """Check that ``rc`` is cached on disk and in memory.

        Each successive access (download, disk cache, memory cache) is
        asserted to be faster than the previous one.
        """

        def timed_rc(tables):
            """Return ``tables.rc`` together with the seconds it took to get."""
            started = time.time()
            frame = tables.rc
            return frame, time.time() - started

        # First access with self.ct populates the cache
        rc_first, first_duration = timed_rc(self.ct)

        # Make sure that cache file is created
        self.assertTrue(os.path.isfile(self.ct._cache_file(self.ct.RC)))

        # Make new table instance (to prevent loading from memory)
        fresh_tables = RNATables(self.collection, cache_dir=self.cache_dir)

        # Second access, on the new instance, should load from disk cache
        rc_disk, disk_duration = timed_rc(fresh_tables)
        self.assertTrue((rc_first == rc_disk).all(axis=None))
        self.assertTrue(disk_duration < first_duration)

        # Third access, again on the new instance, tests loading from memory
        rc_memory, memory_duration = timed_rc(fresh_tables)
        self.assertTrue((rc_disk == rc_memory).all(axis=None))
        self.assertTrue(memory_duration < disk_duration)
def test_get_orange_object(self):
    """``_get_orange_object`` returns the object served by ``collection.data.get``."""
    # Orange Data is found ad-hoc
    self.collection.data.get = self.web_request(self.orange_data)

    tables = RNATables(self.collection)
    found = tables._get_orange_object()

    self.assertEqual(found, self.orange_data)