def test_empty_filters(self):
    '''
    An empty list of sample identifiers for a cancer type
    must be rejected.
    '''
    counts_path = os.path.join(
        THIS_DIR, 'public_data_test_files', 'tcga_rnaseq.hd5')

    # Stand-in for the database record. Only the `file_mapping`
    # attribute is populated.
    db_record = mock.MagicMock()
    db_record.file_mapping = {
        # The annotation entry is a dummy path; it is included only
        # so the mapping looks like a real database record.
        RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
        RnaSeqMixin.COUNTS_FILE_KEY: [counts_path],
    }

    # The list should hold some sample ID strings, but it is empty.
    empty_query = {'TCGA-DEF': []}

    source = RnaSeqMixin()
    with self.assertRaisesRegex(Exception, 'empty'):
        paths, names, resource_types = source.create_from_query(
            db_record, empty_query)
def test_malformatted_filter_dict(self):
    '''
    Tests that we reject if the cancer type refers to something
    that is NOT a list
    '''
    hdf_path = os.path.join(
        THIS_DIR,
        'public_data_test_files',
        'tcga_rnaseq.hd5'
    )

    # this dict is what the database record is expected to contain
    # in the file_mapping field
    mock_mapping = {
        # this key doesn't matter- we just include it as a correct
        # representation of the database record
        RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
        RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path]
    }
    mock_db_record = mock.MagicMock()
    mock_db_record.file_mapping = mock_mapping

    query = {
        # This should be a list:
        'TCGA-DEF': 'abc'
    }

    data_src = RnaSeqMixin()
    # The implementing child classes provide an EXAMPLE_PAYLOAD
    # attribute. Since the bare mixin is tested here, we set one
    # directly on the instance.
    data_src.EXAMPLE_PAYLOAD = {
        'TCGA-UVM': ["<UUID>", "<UUID>"],
        'TCGA-MESO': ["<UUID>", "<UUID>", "<UUID>"]
    }

    with self.assertRaisesRegex(Exception, 'a list of sample identifiers'):
        # The call is expected to raise, so the return value is
        # deliberately not unpacked. A mismatched unpack would only
        # obscure the failure message if the call ever stopped raising.
        data_src.create_from_query(mock_db_record, query)
def test_filters_with_cancer_type(self):
    '''
    Tests that we handle a bad group ID appropriately
    '''
    hdf_path = os.path.join(
        THIS_DIR,
        'public_data_test_files',
        'tcga_rnaseq.hd5'
    )

    # this dict is what the database record is expected to contain
    # in the file_mapping field
    mock_mapping = {
        # this key doesn't matter- we just include it as a correct
        # representation of the database record
        RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
        RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path]
    }
    mock_db_record = mock.MagicMock()
    mock_db_record.file_mapping = mock_mapping

    query = {
        # the only datasets in the hdf5 file are for TCGA-ABC
        # and TCGA-DEF. Below, we ask for a non-existent one
        'TCGA-ABC': ['s1', 's3'],
        'TCGA-XYZ': ['s5']
    }

    data_src = RnaSeqMixin()
    with self.assertRaisesRegex(Exception, 'TCGA-XYZ'):
        # The call is expected to raise, so the return value is
        # deliberately not unpacked.
        data_src.create_from_query(mock_db_record, query)
def test_filters_with_bad_sample_id(self):
    '''
    Tests that we handle missing samples appropriately
    '''
    hdf_path = os.path.join(
        THIS_DIR,
        'public_data_test_files',
        'tcga_rnaseq.hd5'
    )

    # this dict is what the database record is expected to contain
    # in the file_mapping field
    mock_mapping = {
        # this key doesn't matter- we just include it as a correct
        # representation of the database record
        RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
        RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path]
    }
    mock_db_record = mock.MagicMock()
    mock_db_record.file_mapping = mock_mapping

    query = {
        # add a bad sample ID to the TCGA-ABC set:
        'TCGA-ABC': ['s1111', 's3'],
        'TCGA-DEF': ['s5']
    }

    data_src = RnaSeqMixin()
    with self.assertRaisesRegex(Exception, 's1111'):
        # The call is expected to raise, so the return value is
        # deliberately not unpacked.
        data_src.create_from_query(mock_db_record, query)
def test_rejects_whole_dataset_with_null_filter(self):
    '''
    Tests that we reject the request (raise an exception)
    if a filter of None is applied. This would be too large for us
    to handle.
    '''
    hdf_path = os.path.join(
        THIS_DIR,
        'public_data_test_files',
        'tcga_rnaseq.hd5'
    )

    # this dict is what the database record is expected to contain
    # in the file_mapping field
    mock_mapping = {
        # this key doesn't matter- we just include it as a correct
        # representation of the database record
        RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
        RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path]
    }
    mock_db_record = mock.MagicMock()
    mock_db_record.file_mapping = mock_mapping

    data_src = RnaSeqMixin()
    # the actual implementing class would define this attr typically
    data_src.PUBLIC_NAME = 'foo'

    with self.assertRaisesRegex(Exception, 'too large'):
        # The call is expected to raise, so the return value is
        # deliberately not unpacked.
        data_src.create_from_query(mock_db_record, None)
def test_indexes_only_annotation_file(self):
    '''
    An RNA-seq dataset is made of a metadata (annotation) file and
    a count matrix. Checks that `get_indexable_files` hands back
    only the annotation file(s).
    '''
    file_dict = {
        RnaSeqMixin.ANNOTATION_FILE_KEY: ['/path/to/A.txt'],
        RnaSeqMixin.COUNTS_FILE_KEY: ['/path/to/counts.tsv'],
    }
    indexable = RnaSeqMixin().get_indexable_files(file_dict)
    self.assertCountEqual(
        indexable, file_dict[RnaSeqMixin.ANNOTATION_FILE_KEY])
def _pull_data(self, program_id, tag):
    '''
    Method for downloading and munging an RNA-seq dataset to a HDF5 file

    Note that creating a flat file of everything was not performant
    and created a >2Gb matrix. Instead, we organize the RNA-seq data
    hierarchically by splitting into the individual projects
    (e.g. TCGA cancer types). Each of those is assigned to a "dataset"
    in the HDF5 file. Therefore, instead of a giant matrix we have to
    load each time, we can directly go to cancer-specific count
    matrices for much better performance.

    `program_id` is handed to
    `GDCDataSource.query_for_project_names_within_program` to find the
    projects to download. `tag` (together with `self.date_str`) is
    interpolated into the names of the two output files, both written
    under `self.ROOT_DIR`:
    - the HDF5 count file (`self.COUNT_OUTPUT_FILE_TEMPLATE`)
    - the CSV annotation file (`self.ANNOTATION_OUTPUT_FILE_TEMPLATE`)

    Returns nothing.
    '''

    # first get all the cancer types so we can split the downloads
    # and HDF5 file
    project_dict = GDCDataSource.query_for_project_names_within_program(
        program_id)

    # Get the data dictionary, which will tell us the universe of
    # available fields and how to interpret them:
    data_fields = self.get_data_dictionary()

    # Collect the per-project annotation frames and concatenate once
    # after the loop. Growing a DataFrame with pd.concat inside the
    # loop re-copies all prior rows on every pass, and seeding it with
    # an empty DataFrame makes the result dtypes depend on how pandas
    # treats empty frames.
    annotation_frames = []

    counts_output_path = os.path.join(
        self.ROOT_DIR,
        self.COUNT_OUTPUT_FILE_TEMPLATE.format(
            tag=tag, date=self.date_str))

    with pd.HDFStore(counts_output_path) as hdf_out:
        for project_id in project_dict.keys():
            logger.info('Pull data for %s', project_id)
            ann_df, count_df = self._download_cohort(
                project_id, data_fields)
            annotation_frames.append(ann_df)

            # save the counts to a cancer-specific dataset. Store each
            # dataset in a cancer-specific group. On testing, this
            # seemed to be a bit faster for recall than keeping all the
            # dataframes as datasets in the root group
            group_id = (
                RnaSeqMixin.create_python_compatible_id(project_id)
                + '/ds')
            hdf_out.put(group_id, count_df)
            logger.info('Added the {ct} matrix to the HDF5'
                        ' count matrix'.format(ct=project_id))

    # pd.concat raises on an empty list, so fall back to an empty
    # frame when there were no projects.
    if annotation_frames:
        total_annotation_df = pd.concat(annotation_frames, axis=0)
    else:
        total_annotation_df = pd.DataFrame()

    # Write all the metadata to a file
    ann_output_path = os.path.join(
        self.ROOT_DIR,
        self.ANNOTATION_OUTPUT_FILE_TEMPLATE.format(
            tag=tag, date=self.date_str))
    total_annotation_df.to_csv(ann_output_path, sep=',', index_label='id')
    logger.info(
        'The metadata/annotation file for your {program} RNA-seq data'
        ' is available at {p}'.format(p=ann_output_path,
                                      program=program_id))
def prepare(self):
    '''
    Handles prep of the dataset. Does NOT index!

    Steps:
    - fetch the sample annotations and phenotype table into a
      temporary directory and merge them on the subject ID
    - rename/select columns according to `self.COLUMN_MAPPING`
    - for every tissue that has an entry in `self.TISSUE_TO_FILE_MAP`,
      download and gunzip its GCT count file, clean it, and store it
      in the HDF5 count file under '<tissue id>/ds'
    - write the annotations for the samples found in the count
      matrices to a CSV file

    Both output files go under `self.ROOT_DIR`, named from the
    count/annotation templates using `self.TAG` and `self.date_str`.
    Returns nothing.
    '''
    tmp_dir = self._create_tmp_dir()
    ann_df = self._get_sample_annotations(tmp_dir)
    pheno_df = self._get_phenotype_data(tmp_dir)

    # Merge the sample-level table with the patient-level data.
    # The subject ID is the first two dash-delimited fields of the
    # sample ID.
    ann_df['subject_id'] = ann_df['SAMPID'].apply(
        lambda x: '-'.join(x.split('-')[:2]))

    # In the phenotypes file, sex is 2=F, 1=M
    pheno_df['_SEX'] = pheno_df['SEX'].apply(
        lambda x: 'M' if x == 1 else 'F')
    merged_ann = pd.merge(
        ann_df, pheno_df, left_on='subject_id', right_on='SUBJID')

    # remap the column names and drop the others. Pass a real list
    # (not a dict view) as the column selector.
    merged_ann = merged_ann.rename(columns=self.COLUMN_MAPPING)
    merged_ann = merged_ann[list(self.COLUMN_MAPPING.values())]
    merged_ann = merged_ann.set_index('sample_id')

    # Per-tissue annotation frames, concatenated once after the loop
    # rather than growing a DataFrame with pd.concat on every pass.
    annotation_frames = []

    counts_output_path = os.path.join(
        self.ROOT_DIR,
        self.COUNT_OUTPUT_FILE_TEMPLATE.format(
            tag=self.TAG, date=self.date_str))

    with pd.HDFStore(counts_output_path) as hdf_out:
        # NOTE: `i` advances for skipped tissues as well, so the
        # downloaded file numbering can have gaps.
        for i, (tissue, tissue_subdf) in enumerate(
                merged_ann.groupby('tissue')):
            logger.info('Handling tissue {t}'.format(t=tissue))
            try:
                url = self.TISSUE_TO_FILE_MAP[tissue]
            except KeyError:
                logger.info(
                    'No file exists in the map for {t}. Skipping.'.format(
                        t=tissue))
                continue

            output_file = '{d}/f{i}.gct.gz'.format(d=tmp_dir, i=i)
            self._download_file(url, output_file)
            run_shell_command('gunzip {f}'.format(f=output_file))
            # the decompressed file has the same name minus '.gz'
            output_file = output_file[:-3]

            # the GCT-format file has two header lines. The third line
            # has the usual column headers
            counts = pd.read_table(
                output_file,
                sep='\t',
                skiprows=2,
                header=0,
                index_col=1)
            counts = counts.drop(['Description', 'id'], axis=1)

            # As of this writing, there are alternate ENSG Ids that are
            # suffixed with _PAR_Y to denote features that are on the
            # regions of chrY which are identical to those on chrX.
            # https://www.gencodegenes.org/pages/faq.html
            # (search "PAR_Y")
            # We drop those here. It appears the mapping does not count
            # to these regions anyway, since the rows are all zeros
            # (while the canonical transcript is generally non-zero)
            keep_rows = [
                not x.endswith('_PAR_Y') for x in counts.index]
            counts = counts.loc[keep_rows]

            # Remove the version from the ENSG gene ID
            counts.index = [x.split('.')[0] for x in counts.index]

            # Keep only the annotations of samples that are present in
            # this tissue's count matrix.
            samples_in_matrix = counts.columns
            tissue_subdf = tissue_subdf.loc[samples_in_matrix]
            annotation_frames.append(tissue_subdf)

            group_id = RnaSeqMixin.create_python_compatible_id(
                tissue) + '/ds'
            hdf_out.put(group_id, counts)

    # pd.concat raises on an empty list, so fall back to an empty
    # frame when no tissue was processed.
    if annotation_frames:
        final_ann = pd.concat(annotation_frames, axis=0)
    else:
        final_ann = pd.DataFrame()

    final_ann.to_csv(
        os.path.join(
            self.ROOT_DIR,
            self.ANNOTATION_OUTPUT_FILE_TEMPLATE.format(
                tag=self.TAG, date=self.date_str)),
        sep=',',
        index_label='sample_id')
def test_filters_hdf_correctly(self, mock_uuid_mod):
    '''
    Checks that a query against the dummy HDF5-backed dataset
    produces the expected count matrix, annotation table, and
    output file names.
    '''
    fixture_dir = os.path.join(THIS_DIR, 'public_data_test_files')
    hdf_path = os.path.join(fixture_dir, 'tcga_rnaseq.hd5')
    ann_path = os.path.join(fixture_dir, 'tcga_rnaseq_ann.csv')

    # Prepare 5 UUIDs for the patched uuid module to hand out.
    # The first call to the tested method passes an `output_name`,
    # so it only consumes two uuid4 calls. The second call omits
    # `output_name`, so the output file name is auto-generated and
    # one extra uuid4 call is made (three in total).
    mock_uuids = [uuid.uuid4() for _ in range(5)]
    mock_uuid_mod.uuid4.side_effect = mock_uuids

    # Stand-in for the database record; `file_mapping` points at
    # the annotation and count fixture files.
    db_record = mock.MagicMock()
    db_record.file_mapping = {
        RnaSeqMixin.ANNOTATION_FILE_KEY: [ann_path],
        RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path],
    }

    query = {
        'TCGA-ABC': ['s1', 's3'],
        'TCGA-DEF': ['s5'],
    }

    data_src = RnaSeqMixin()
    # Child classes define a TAG attribute. We are testing the
    # mixin directly, so set one here.
    tag = 'foo'
    data_src.TAG = tag

    # --- first call: explicit output name ---
    output_name = 'abc'
    paths, filenames, resource_types = data_src.create_from_query(
        db_record, query, output_name)

    # The ordering is not important in practice, but we need to
    # know which file is which to check the contents below.
    self.assertEqual(resource_types[0], 'RNASEQ_COUNT_MTX')
    self.assertEqual(resource_types[1], 'ANN')

    expected_counts = pd.DataFrame(
        [[26, 86, 67], [54, 59, 29], [24, 12, 37]],
        index=['gA', 'gB', 'gC'],
        columns=['s1', 's3', 's5'],
    )
    actual_counts = pd.read_table(paths[0], index_col=0)
    self.assertTrue(actual_counts.equals(expected_counts))

    expected_ann = pd.DataFrame(
        [['TCGA-ABC', 1990], ['TCGA-ABC', 1992], ['TCGA-DEF', 1994]],
        index=['s1', 's3', 's5'],
        columns=['cancer_type', 'year_of_birth'],
    )
    actual_ann = pd.read_table(paths[1], index_col=0)
    self.assertTrue(actual_ann.equals(expected_ann))

    self.assertEqual(
        filenames[0],
        '{x}_counts.{t}.tsv'.format(x=output_name, t=tag))
    self.assertEqual(
        filenames[1],
        '{x}_ann.{t}.tsv'.format(x=output_name, t=tag))

    # --- second call: auto-generated output name ---
    # Two uuid4 values were consumed by the first call and this
    # call consumes three, so the one that ends up in the file
    # names is at index 4.
    paths, filenames, resource_types = data_src.create_from_query(
        db_record, query)
    auto_uuid = mock_uuids[4]
    self.assertEqual(
        filenames[0],
        '{t}_counts.{u}.tsv'.format(u=auto_uuid, t=tag))
    self.assertEqual(
        filenames[1],
        '{t}_ann.{u}.tsv'.format(u=auto_uuid, t=tag))
def test_data_prep(self,
                   mock_run_shell_command,
                   mock_create_tmp_dir,
                   mock_download_file,
                   mock_get_phenotype_data,
                   mock_get_sample_annotations):
    '''
    Test that we munge everything correctly

    Runs `GtexRnaseqDataSource.prepare` with the download, shell and
    annotation-fetching helpers mocked out, then checks that:
    - the HDF5 count file and the annotation file were written
    - the HDF5 file holds one '/<tissue id>/ds' entry per tissue
    - gunzip and the download helper were called for each tissue
    '''
    mock_tmp_dir = os.path.join(
        THIS_DIR,
        'public_data_test_files'
    )
    mock_create_tmp_dir.return_value = mock_tmp_dir

    ann_path = os.path.join(
        THIS_DIR,
        'public_data_test_files',
        'gtex_rnaseq_ann.tsv'
    )
    pheno_path = os.path.join(
        THIS_DIR,
        'public_data_test_files',
        'gtex_rnaseq_pheno.tsv'
    )
    mock_get_sample_annotations.return_value = pd.read_table(ann_path)
    mock_get_phenotype_data.return_value = pd.read_table(pheno_path)

    tmp_testing_dir = os.path.join(settings.DATA_DIR, 'test-gtex-rnaseq')
    os.mkdir(tmp_testing_dir)
    # Register the removal right away so the folder is cleaned up even
    # when an assertion below fails. Otherwise a single failure leaves
    # the folder behind and every later run dies in os.mkdir.
    self.addCleanup(shutil.rmtree, tmp_testing_dir)

    mock_adipose_url = 'adipose_url'
    mock_blood_url = 'blood_url'

    # Override the class-level settings only for the duration of this
    # test so they do not leak into other tests. `create=True` keeps
    # this working even if the attribute is not defined on the class.
    root_dir_patch = mock.patch.object(
        GtexRnaseqDataSource, 'ROOT_DIR', tmp_testing_dir, create=True)
    tissue_map_patch = mock.patch.object(
        GtexRnaseqDataSource,
        'TISSUE_TO_FILE_MAP',
        {
            'Adipose - Subcutaneous': mock_adipose_url,
            'Whole Blood': mock_blood_url
        },
        create=True)
    for patcher in (root_dir_patch, tissue_map_patch):
        patcher.start()
        self.addCleanup(patcher.stop)

    data_src = GtexRnaseqDataSource()
    data_src.prepare()

    counts_output = RnaSeqMixin.COUNT_OUTPUT_FILE_TEMPLATE.format(
        tag=data_src.TAG,
        date=data_src.date_str
    )
    ann_output = RnaSeqMixin.ANNOTATION_OUTPUT_FILE_TEMPLATE.format(
        tag=data_src.TAG,
        date=data_src.date_str
    )
    expected_output_hdf = os.path.join(tmp_testing_dir, counts_output)
    expected_output_ann = os.path.join(tmp_testing_dir, ann_output)
    self.assertTrue(os.path.exists(expected_output_hdf))
    self.assertTrue(os.path.exists(expected_output_ann))

    expected_tissue_list = [
        'Adipose - Subcutaneous',
        'Whole Blood'
    ]
    converted_tissue_list = [
        RnaSeqMixin.create_python_compatible_id(x)
        for x in expected_tissue_list
    ]
    groups_list = ['/{x}/ds'.format(x=x) for x in converted_tissue_list]
    with pd.HDFStore(expected_output_hdf) as hdf:
        self.assertCountEqual(groups_list, list(hdf.keys()))

    # One gunzip call is expected per downloaded tissue file.
    shell_calls = [
        mock.call('gunzip {f}'.format(
            f=os.path.join(mock_tmp_dir, 'f{x}.gct.gz'.format(x=i))))
        for i in range(2)
    ]
    mock_run_shell_command.assert_has_calls(shell_calls)

    mock_download_file.assert_has_calls([
        mock.call(
            mock_adipose_url,
            os.path.join(mock_tmp_dir, 'f0.gct.gz')
        ),
        mock.call(
            mock_blood_url,
            os.path.join(mock_tmp_dir, 'f1.gct.gz'))
    ])