def test_malformatted_filter_dict(self):
    '''
    Tests that we reject the query if a cancer type maps to
    something that is NOT a list.

    The request is expected to fail with an exception whose message
    matches 'a list of sample identifiers'.
    '''
    hdf_path = os.path.join(
        THIS_DIR,
        'public_data_test_files',
        'tcga_rnaseq.hd5'
    )

    # this dict is what the database record is expected to contain
    # in the file_mapping field
    mock_mapping = {
        # this key doesn't matter- we just include it as a correct
        # representation of the database record
        RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
        RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path]
    }
    mock_db_record = mock.MagicMock()
    mock_db_record.file_mapping = mock_mapping

    query = {
        # This should be a list:
        'TCGA-DEF': 'abc'
    }

    data_src = RnaSeqMixin()
    # again, the children will provide an EXAMPLE_PAYLOAD attribute
    # which we patch into this mixin class here
    data_src.EXAMPLE_PAYLOAD = {
        'TCGA-UVM': ["<UUID>", "<UUID>"],
        'TCGA-MESO': ["<UUID>", "<UUID>", "<UUID>"]
    }

    with self.assertRaisesRegex(Exception, 'a list of sample identifiers'):
        # The call is expected to raise, so the return value is
        # deliberately not unpacked. Binding it to a fixed number of
        # names would only produce a misleading unpacking error if the
        # call ever stopped raising.
        data_src.create_from_query(mock_db_record, query)
def test_empty_filters(self):
    '''
    Tests that we reject the query if the filtering list is empty.

    The request is expected to fail with an exception whose message
    matches 'empty'.
    '''
    hdf_path = os.path.join(
        THIS_DIR,
        'public_data_test_files',
        'tcga_rnaseq.hd5'
    )

    # this dict is what the database record is expected to contain
    # in the file_mapping field
    mock_mapping = {
        # this key doesn't matter- we just include it as a correct
        # representation of the database record
        RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
        RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path]
    }
    mock_db_record = mock.MagicMock()
    mock_db_record.file_mapping = mock_mapping

    query = {
        # This should have some strings:
        'TCGA-DEF': []
    }

    data_src = RnaSeqMixin()
    with self.assertRaisesRegex(Exception, 'empty'):
        # The call is expected to raise, so there is no result to bind.
        data_src.create_from_query(mock_db_record, query)
def test_filters_with_bad_sample_id(self):
    '''
    Tests that we handle missing samples appropriately.

    The query includes the sample ID 's1111', and the request is
    expected to fail with an exception whose message mentions that ID.
    '''
    hdf_path = os.path.join(
        THIS_DIR,
        'public_data_test_files',
        'tcga_rnaseq.hd5'
    )

    # this dict is what the database record is expected to contain
    # in the file_mapping field
    mock_mapping = {
        # this key doesn't matter- we just include it as a correct
        # representation of the database record
        RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
        RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path]
    }
    mock_db_record = mock.MagicMock()
    mock_db_record.file_mapping = mock_mapping

    query = {
        # add a bad sample ID to the TCGA-ABC set:
        'TCGA-ABC': ['s1111', 's3'],
        'TCGA-DEF': ['s5']
    }

    data_src = RnaSeqMixin()
    with self.assertRaisesRegex(Exception, 's1111'):
        # The call is expected to raise, so the return value is
        # deliberately not unpacked. Binding it to a fixed number of
        # names would only produce a misleading unpacking error if the
        # call ever stopped raising.
        data_src.create_from_query(mock_db_record, query)
def test_filters_with_cancer_type(self):
    '''
    Tests that we handle a bad group ID appropriately.

    The query includes the group 'TCGA-XYZ', and the request is
    expected to fail with an exception whose message mentions that
    group.
    '''
    hdf_path = os.path.join(
        THIS_DIR,
        'public_data_test_files',
        'tcga_rnaseq.hd5'
    )

    # this dict is what the database record is expected to contain
    # in the file_mapping field
    mock_mapping = {
        # this key doesn't matter- we just include it as a correct
        # representation of the database record
        RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
        RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path]
    }
    mock_db_record = mock.MagicMock()
    mock_db_record.file_mapping = mock_mapping

    query = {
        # the only datasets in the hdf5 file are for TCGA-ABC
        # and TCGA-DEF. Below, we ask for a non-existent one
        'TCGA-ABC': ['s1', 's3'],
        'TCGA-XYZ': ['s5']
    }

    data_src = RnaSeqMixin()
    with self.assertRaisesRegex(Exception, 'TCGA-XYZ'):
        # The call is expected to raise, so the return value is
        # deliberately not unpacked. Binding it to a fixed number of
        # names would only produce a misleading unpacking error if the
        # call ever stopped raising.
        data_src.create_from_query(mock_db_record, query)
def test_rejects_whole_dataset_with_null_filter(self):
    '''
    Tests that we reject the request (raise an exception)
    if a filter of None is applied. This would be too large for us
    to handle.

    The request is expected to fail with an exception whose message
    matches 'too large'.
    '''
    hdf_path = os.path.join(
        THIS_DIR,
        'public_data_test_files',
        'tcga_rnaseq.hd5'
    )

    # this dict is what the database record is expected to contain
    # in the file_mapping field
    mock_mapping = {
        # this key doesn't matter- we just include it as a correct
        # representation of the database record
        RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
        RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path]
    }
    mock_db_record = mock.MagicMock()
    mock_db_record.file_mapping = mock_mapping

    data_src = RnaSeqMixin()
    # the actual implementing class would define this attr typically
    data_src.PUBLIC_NAME = 'foo'

    with self.assertRaisesRegex(Exception, 'too large'):
        # The call is expected to raise, so the return value is
        # deliberately not unpacked. Binding it to a fixed number of
        # names would only produce a misleading unpacking error if the
        # call ever stopped raising.
        data_src.create_from_query(mock_db_record, None)
def test_filters_hdf_correctly(self, mock_uuid_mod):
    '''
    Tests that we filter properly for a dummy dataset stored in HDF5
    format.

    Checks the resource types, the contents of the written count and
    annotation tables, and the output filenames, both when an explicit
    output name is given and when it is omitted.

    `mock_uuid_mod` is the patched uuid module handed to this test; its
    `uuid4` is given a fixed sequence of values below so the
    auto-generated filename can be predicted.
    '''
    hdf_path = os.path.join(
        THIS_DIR,
        'public_data_test_files',
        'tcga_rnaseq.hd5'
    )
    ann_path = os.path.join(
        THIS_DIR,
        'public_data_test_files',
        'tcga_rnaseq_ann.csv'
    )

    # create 5 mock UUIDs. The first two are used in the
    # first call to the tested method. The final 3 are used in the second
    # call to the tested method. The reason for that is we auto-generate
    # the output filename when the calling function has not provided an
    # `output_name` arg to the method. In the first call to the tested
    # method, we provide that name, so only two calls are made to the
    # uuid.uuid4 function. In the second call, we omit that arg and we
    # hence make an extra call to the uuid4 func.
    mock_uuids = [uuid.uuid4() for _ in range(5)]
    mock_uuid_mod.uuid4.side_effect = mock_uuids

    # this dict is what the database record is expected to contain
    # in the file_mapping field. Both entries point at real fixture
    # files here, since both outputs are read back and checked below.
    mock_mapping = {
        RnaSeqMixin.ANNOTATION_FILE_KEY: [ann_path],
        RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path]
    }
    mock_db_record = mock.MagicMock()
    mock_db_record.file_mapping = mock_mapping

    query = {
        'TCGA-ABC': ['s1', 's3'],
        'TCGA-DEF': ['s5']
    }

    data_src = RnaSeqMixin()
    # the children classes will have a TAG attribute. Since we are
    # testing this mixin here, we simply patch it
    tag = 'foo'
    data_src.TAG = tag

    output_name = 'abc'
    paths, filenames, resource_types = data_src.create_from_query(
        mock_db_record, query, output_name)

    # The order of these doesn't matter in practice, but to check the file
    # contents, we need to be sure we're looking at the correct files for
    # this test.
    # assertEqual (rather than assertTrue(a == b)) so that a failure
    # reports the differing values.
    self.assertEqual(resource_types[0], 'RNASEQ_COUNT_MTX')
    self.assertEqual(resource_types[1], 'ANN')

    expected_df = pd.DataFrame(
        [[26, 86, 67], [54, 59, 29], [24, 12, 37]],
        index=['gA', 'gB', 'gC'],
        columns=['s1', 's3', 's5']
    )
    actual_df = pd.read_table(paths[0], index_col=0)
    self.assertTrue(actual_df.equals(expected_df))

    ann_df = pd.DataFrame(
        [['TCGA-ABC', 1990], ['TCGA-ABC', 1992], ['TCGA-DEF', 1994]],
        index=['s1', 's3', 's5'],
        columns=['cancer_type', 'year_of_birth']
    )
    actual_df = pd.read_table(paths[1], index_col=0)
    self.assertTrue(actual_df.equals(ann_df))

    self.assertEqual(
        filenames[0],
        '{x}_counts.{t}.tsv'.format(x=output_name, t=tag))
    self.assertEqual(
        filenames[1],
        '{x}_ann.{t}.tsv'.format(x=output_name, t=tag))

    # use index 4 below as 2 uuid.uuid4 calls were 'consumed' by the
    # first call to `create_from_query`, while the second call (the one
    # we are testing now) uses 3 calls. The filenames are expected to
    # carry the last of those values, i.e. mock_uuids[4].
    paths, filenames, resource_types = data_src.create_from_query(
        mock_db_record, query)
    self.assertEqual(
        filenames[0],
        '{t}_counts.{u}.tsv'.format(u=mock_uuids[4], t=tag))
    self.assertEqual(
        filenames[1],
        '{t}_ann.{u}.tsv'.format(u=mock_uuids[4], t=tag))