Beispiel #1
0
    def test_empty_filters(self):
        '''
        Verifies that a query containing an empty sample list is rejected.

        The query maps 'TCGA-DEF' to an empty list and `create_from_query`
        is expected to raise an exception whose message matches 'empty'.
        '''
        counts_file = os.path.join(
            THIS_DIR, 'public_data_test_files', 'tcga_rnaseq.hd5')

        # Stand-in for the database record; only its `file_mapping`
        # attribute is populated. The annotation entry is a placeholder
        # so the mapping has both of the expected keys.
        db_record = mock.MagicMock()
        db_record.file_mapping = {
            RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
            RnaSeqMixin.COUNTS_FILE_KEY: [counts_file],
        }

        # A well-formed query would list sample identifiers here.
        empty_query = {'TCGA-DEF': []}

        source = RnaSeqMixin()
        with self.assertRaisesRegex(Exception, 'empty'):
            paths, names, resource_types = source.create_from_query(
                db_record, empty_query)
Beispiel #2
0
    def test_malformatted_filter_dict(self):
        '''
        Tests that we reject if the cancer type refers to something
        that is NOT a list.

        The query maps 'TCGA-DEF' to a plain string and
        `create_from_query` is expected to raise an exception whose
        message matches 'a list of sample identifiers'.
        '''
        hdf_path = os.path.join(
            THIS_DIR,
            'public_data_test_files',
            'tcga_rnaseq.hd5'
        )

        # this dict is what the database record is expected to contain
        # in the file_mapping field
        mock_mapping = {
            # the annotation entry doesn't matter here- we just include it
            # as a correct representation of the database record
            RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
            RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path],
        }
        mock_db_record = mock.MagicMock()
        mock_db_record.file_mapping = mock_mapping
        query = {
            # This should be a list:
            'TCGA-DEF': 'abc'
        }
        data_src = RnaSeqMixin()
        # The implementing child classes provide an EXAMPLE_PAYLOAD
        # attribute, which we patch onto this mixin instance here.
        data_src.EXAMPLE_PAYLOAD = {
            'TCGA-UVM': ["<UUID>", "<UUID>"],
            'TCGA-MESO': ["<UUID>", "<UUID>", "<UUID>"]
        }
        # The return value is deliberately not unpacked: the call must
        # raise, and binding it to a fixed number of names would turn a
        # missing exception into a misleading unpacking error.
        with self.assertRaisesRegex(Exception, 'a list of sample identifiers'):
            data_src.create_from_query(mock_db_record, query)
Beispiel #3
0
    def test_filters_with_cancer_type(self):
        '''
        Tests that we handle a bad group ID appropriately.

        The query requests 'TCGA-XYZ' alongside 'TCGA-ABC', and
        `create_from_query` is expected to raise an exception whose
        message matches 'TCGA-XYZ'.
        '''
        hdf_path = os.path.join(
            THIS_DIR,
            'public_data_test_files',
            'tcga_rnaseq.hd5'
        )

        # this dict is what the database record is expected to contain
        # in the file_mapping field
        mock_mapping = {
            # the annotation entry doesn't matter here- we just include it
            # as a correct representation of the database record
            RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
            RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path],
        }
        mock_db_record = mock.MagicMock()
        mock_db_record.file_mapping = mock_mapping
        query = {
            # the only datasets in the hdf5 file are for TCGA-ABC
            # and TCGA-DEF. Below, we ask for a non-existent one
            'TCGA-ABC': ['s1', 's3'],
            'TCGA-XYZ': ['s5']
        }
        data_src = RnaSeqMixin()
        # The return value is deliberately not unpacked: the call must
        # raise, and binding it to a fixed number of names would turn a
        # missing exception into a misleading unpacking error.
        with self.assertRaisesRegex(Exception, 'TCGA-XYZ'):
            data_src.create_from_query(mock_db_record, query)
Beispiel #4
0
    def test_filters_with_bad_sample_id(self):
        '''
        Tests that we handle missing samples appropriately.

        The query lists the sample ID 's1111' under 'TCGA-ABC', and
        `create_from_query` is expected to raise an exception whose
        message matches 's1111'.
        '''
        hdf_path = os.path.join(
            THIS_DIR,
            'public_data_test_files',
            'tcga_rnaseq.hd5'
        )

        # this dict is what the database record is expected to contain
        # in the file_mapping field
        mock_mapping = {
            # the annotation entry doesn't matter here- we just include it
            # as a correct representation of the database record
            RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
            RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path],
        }
        mock_db_record = mock.MagicMock()
        mock_db_record.file_mapping = mock_mapping
        query = {
            # add a bad sample ID to the TCGA-ABC set:
            'TCGA-ABC': ['s1111', 's3'],
            'TCGA-DEF': ['s5']
        }
        data_src = RnaSeqMixin()
        # The return value is deliberately not unpacked: the call must
        # raise, and binding it to a fixed number of names would turn a
        # missing exception into a misleading unpacking error.
        with self.assertRaisesRegex(Exception, 's1111'):
            data_src.create_from_query(mock_db_record, query)
Beispiel #5
0
    def test_rejects_whole_dataset_with_null_filter(self):
        '''
        Tests that we reject the request (raise an exception)
        if a filter of None is applied. This would be too large
        for us to handle.

        `create_from_query` is called with `None` as the query and is
        expected to raise an exception whose message matches 'too large'.
        '''
        hdf_path = os.path.join(
            THIS_DIR,
            'public_data_test_files',
            'tcga_rnaseq.hd5'
        )

        # this dict is what the database record is expected to contain
        # in the file_mapping field
        mock_mapping = {
            # the annotation entry doesn't matter here- we just include it
            # as a correct representation of the database record
            RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
            RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path],
        }
        mock_db_record = mock.MagicMock()
        mock_db_record.file_mapping = mock_mapping
        data_src = RnaSeqMixin()
        # the actual implementing class would typically define this attr
        data_src.PUBLIC_NAME = 'foo'
        # The return value is deliberately not unpacked: the call must
        # raise, and binding it to a fixed number of names would turn a
        # missing exception into a misleading unpacking error.
        with self.assertRaisesRegex(Exception, 'too large'):
            data_src.create_from_query(mock_db_record, None)
Beispiel #6
0
    def test_indexes_only_annotation_file(self):
        '''
        An RNA-seq dataset is made up of a metadata (annotation) file and
        a count matrix. This checks that `get_indexable_files` hands back
        exactly the entries stored under the annotation key.
        '''
        source = RnaSeqMixin()

        file_dict = {
            RnaSeqMixin.ANNOTATION_FILE_KEY: ['/path/to/A.txt'],
            RnaSeqMixin.COUNTS_FILE_KEY: ['/path/to/counts.tsv'],
        }
        indexable = source.get_indexable_files(file_dict)

        # Compare against the annotation entry only; ordering is ignored.
        expected = file_dict[RnaSeqMixin.ANNOTATION_FILE_KEY]
        self.assertCountEqual(indexable, expected)
Beispiel #7
0
    def _pull_data(self, program_id, tag):
        '''
        Method for downloading and munging an RNA-seq dataset
        to a HDF5 file, plus a CSV file of the combined annotations.

        Note that creating a flat file of everything was not performant
        and created a >2Gb matrix. Instead, we organize the RNA-seq data
        hierarchically by splitting into the individual projects (e.g.
        TCGA cancer types).
        Each of those is assigned to a "dataset" in the HDF5 file. Therefore,
        instead of a giant matrix we have to load each time, we can directly
        go to cancer-specific count matrices for much better performance.

        `program_id` is passed to
        `GDCDataSource.query_for_project_names_within_program` to obtain the
        projects to download. `tag` is interpolated (together with
        `self.date_str`) into the count and annotation output file name
        templates. Both output files are written under `self.ROOT_DIR`.
        Nothing is returned.
        '''

        # first get all the cancer types so we can split the downloads
        # and HDF5 file
        project_dict = GDCDataSource.query_for_project_names_within_program(
            program_id)

        # Get the data dictionary, which will tell us the universe of available
        # fields and how to interpret them:
        data_fields = self.get_data_dictionary()

        # Collect the per-project annotations and concatenate once at the
        # end, rather than re-copying a growing dataframe on each iteration.
        annotation_frames = []
        counts_output_path = os.path.join(
            self.ROOT_DIR,
            self.COUNT_OUTPUT_FILE_TEMPLATE.format(tag=tag,
                                                   date=self.date_str))
        with pd.HDFStore(counts_output_path) as hdf_out:
            for project_id in project_dict:
                logger.info('Pull data for %s', project_id)
                ann_df, count_df = self._download_cohort(
                    project_id, data_fields)
                annotation_frames.append(ann_df)

                # save the counts to a cancer-specific dataset. Store each
                # dataset in a cancer-specific group. On testing, this seemed
                # to be a bit faster for recall than keeping all the dataframes
                # as datasets in the root group
                group_id = (
                    RnaSeqMixin.create_python_compatible_id(project_id) +
                    '/ds')
                hdf_out.put(group_id, count_df)
                logger.info('Added the {ct} matrix to the HDF5'
                            ' count matrix'.format(ct=project_id))

        # pd.concat raises on an empty list, so fall back to an empty
        # dataframe when no projects were returned.
        if annotation_frames:
            total_annotation_df = pd.concat(annotation_frames, axis=0)
        else:
            total_annotation_df = pd.DataFrame()

        # Write all the metadata to a file
        ann_output_path = os.path.join(
            self.ROOT_DIR,
            self.ANNOTATION_OUTPUT_FILE_TEMPLATE.format(tag=tag,
                                                        date=self.date_str))
        total_annotation_df.to_csv(ann_output_path, sep=',', index_label='id')
        logger.info(
            'The metadata/annotation file for your {program} RNA-seq data'
            ' is available at {p}'.format(p=ann_output_path,
                                          program=program_id))
Beispiel #8
0
    def prepare(self):
        '''
        Handles prep of the dataset. Does NOT index!

        Merges the sample-level annotations with the subject-level
        phenotype data, then, for each tissue that has an entry in
        `self.TISSUE_TO_FILE_MAP`, downloads and gunzips the GCT count
        file and stores the count matrix in an HDF5 file under a
        tissue-specific group. The annotations of the samples present in
        the stored matrices are written to a CSV file. Both outputs are
        placed under `self.ROOT_DIR`. Nothing is returned.
        '''
        tmp_dir = self._create_tmp_dir()

        ann_df = self._get_sample_annotations(tmp_dir)
        pheno_df = self._get_phenotype_data(tmp_dir)

        # Merge the sample-level table with the patient-level data. The
        # subject ID is the first two dash-delimited fields of the sample ID.
        ann_df['subject_id'] = ann_df['SAMPID'].apply(
            lambda x: '-'.join(x.split('-')[:2]))

        # In the phenotypes file, sex is 2=F, 1=M
        # NOTE(review): every value other than 1 (including a missing
        # value) is labelled 'F' by this mapping - confirm that is intended.
        pheno_df['_SEX'] = pheno_df['SEX'].apply(lambda x: 'M'
                                                 if x == 1 else 'F')
        merged_ann = pd.merge(ann_df,
                              pheno_df,
                              left_on='subject_id',
                              right_on='SUBJID')

        # remap the column names and drop the others
        merged_ann = merged_ann.rename(columns=self.COLUMN_MAPPING)
        merged_ann = merged_ann[list(self.COLUMN_MAPPING.values())]
        merged_ann = merged_ann.set_index('sample_id')

        # Collect the per-tissue annotations and concatenate once at the
        # end, rather than re-copying a growing dataframe on each iteration.
        annotation_frames = []
        counts_output_path = os.path.join(
            self.ROOT_DIR,
            self.COUNT_OUTPUT_FILE_TEMPLATE.format(tag=self.TAG,
                                                   date=self.date_str))
        with pd.HDFStore(counts_output_path) as hdf_out:
            # Note that `i` advances for skipped tissues as well, so the
            # temporary file numbering follows the position in the groupby.
            for i, (tissue,
                    tissue_subdf) in enumerate(merged_ann.groupby('tissue')):
                logger.info('Handling tissue {t}'.format(t=tissue))
                try:
                    url = self.TISSUE_TO_FILE_MAP[tissue]
                except KeyError:
                    logger.info(
                        'No file exists in the map for {t}. Skipping.'.format(
                            t=tissue))
                    continue
                output_file = '{d}/f{i}.gct.gz'.format(d=tmp_dir, i=i)
                self._download_file(url, output_file)
                run_shell_command('gunzip {f}'.format(f=output_file))
                # strip the '.gz' suffix to get the decompressed file's path
                output_file = output_file[:-3]

                # the GCT-format file has two header lines. The third line has the usual
                # column headers
                counts = pd.read_table(output_file,
                                       sep='\t',
                                       skiprows=2,
                                       header=0,
                                       index_col=1)
                counts = counts.drop(['Description', 'id'], axis=1)

                # As of this writing, there are alternate ENSG Ids that are suffixed with _PAR_Y
                # to denote features that are on the regions of chrY which are identical to those
                # on chrX.
                # https://www.gencodegenes.org/pages/faq.html (search "PAR_Y")
                # We drop those here.
                # It appears the mapping does not count to these regions anyway, since the rows are all
                # zeros (while the canonical transcript is generally non-zero)
                keep_rows = [
                    not gene_id.endswith('_PAR_Y') for gene_id in counts.index
                ]
                counts = counts.loc[keep_rows]

                # Remove the version from the ENSG gene ID
                counts.index = [x.split('.')[0] for x in counts.index]

                # Keep only the annotations for samples that are present
                # in this tissue's count matrix.
                samples_in_matrix = counts.columns
                tissue_subdf = tissue_subdf.loc[samples_in_matrix]
                annotation_frames.append(tissue_subdf)

                group_id = RnaSeqMixin.create_python_compatible_id(
                    tissue) + '/ds'
                hdf_out.put(group_id, counts)

        # pd.concat raises on an empty list, so fall back to an empty
        # dataframe when no tissue was processed.
        if annotation_frames:
            final_ann = pd.concat(annotation_frames, axis=0)
        else:
            final_ann = pd.DataFrame()

        final_ann.to_csv(os.path.join(
            self.ROOT_DIR,
            self.ANNOTATION_OUTPUT_FILE_TEMPLATE.format(tag=self.TAG,
                                                        date=self.date_str)),
                         sep=',',
                         index_label='sample_id')
Beispiel #9
0
    def test_filters_hdf_correctly(self, mock_uuid_mod):
        '''
        Tests that we filter properly for a
        dummy dataset stored in HDF5 format.

        `mock_uuid_mod` is a mock supplied to this test (by a patch that
        is applied outside this method); its `uuid4` attribute is given a
        fixed sequence of return values so the auto-generated output
        file names can be checked.
        '''
        hdf_path = os.path.join(
            THIS_DIR,
            'public_data_test_files',
            'tcga_rnaseq.hd5'
        )

        ann_path = os.path.join(
            THIS_DIR,
            'public_data_test_files',
            'tcga_rnaseq_ann.csv'
        )

        # create 5 mock UUIDs. The first two are used in the
        # first call to the tested method. The final 3 are used in the second
        # call to the tested method. The reason for that is we auto-generate
        # the output filename when the calling function has not provided an
        # `output_name` arg to the method. In the first call to the tested
        # method, we provide that name, so only two calls are made to the
        # uuid.uuid4 function. In the second call, we omit that arg and we
        # hence make an extra call to the uuid4 func.
        mock_uuids = [uuid.uuid4() for _ in range(5)]
        mock_uuid_mod.uuid4.side_effect = mock_uuids

        # this dict is what the database record is expected to contain
        # in the file_mapping field. Unlike a placeholder path, both files
        # here are real test fixtures since their contents are read.
        mock_mapping = {
            RnaSeqMixin.ANNOTATION_FILE_KEY: [ann_path],
            RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path],
        }
        mock_db_record = mock.MagicMock()
        mock_db_record.file_mapping = mock_mapping
        query = {
            'TCGA-ABC': ['s1', 's3'],
            'TCGA-DEF': ['s5']
        }
        data_src = RnaSeqMixin()
        # the children classes will have a TAG attribute. Since we are
        # testing this mixin here, we simply patch it
        tag = 'foo'
        data_src.TAG = tag
        output_name = 'abc'
        paths, filenames, resource_types = data_src.create_from_query(
            mock_db_record, query, output_name)

        # The order of these doesn't matter in practice, but to check the file contents,
        # we need to be sure we're looking at the correct files for this test.
        self.assertEqual(resource_types[0], 'RNASEQ_COUNT_MTX')
        self.assertEqual(resource_types[1], 'ANN')
        expected_df = pd.DataFrame(
            [[26, 86, 67], [54, 59, 29], [24, 12, 37]],
            index=['gA', 'gB', 'gC'],
            columns=['s1', 's3', 's5']
        )
        actual_df = pd.read_table(paths[0], index_col=0)
        self.assertTrue(actual_df.equals(expected_df))

        ann_df = pd.DataFrame(
            [['TCGA-ABC', 1990], ['TCGA-ABC', 1992], ['TCGA-DEF', 1994]],
            index=['s1', 's3', 's5'],
            columns=['cancer_type', 'year_of_birth']
        )
        actual_df = pd.read_table(paths[1], index_col=0)
        self.assertTrue(actual_df.equals(ann_df))

        self.assertEqual(filenames[0], '{x}_counts.{t}.tsv'.format(x=output_name, t=tag))
        self.assertEqual(filenames[1], '{x}_ann.{t}.tsv'.format(x=output_name, t=tag))

        # use index 4 below as 2 uuid.uuid4 calls were 'consumed' by the first call to
        # `create_from_query` while the second call (the one we are testing now) uses
        # 3 calls to uuid.uuid4, the last of which ends up in the output file names.
        paths, filenames, resource_types = data_src.create_from_query(mock_db_record, query)
        self.assertEqual(filenames[0], '{t}_counts.{u}.tsv'.format(u=mock_uuids[4], t=tag))
        self.assertEqual(filenames[1], '{t}_ann.{u}.tsv'.format(u=mock_uuids[4], t=tag))
Beispiel #10
0
    def test_data_prep(self,
                       mock_run_shell_command,
                       mock_create_tmp_dir,
                       mock_download_file,
                       mock_get_phenotype_data,
                       mock_get_sample_annotations):
        '''
        Test that we munge everything correctly.

        The five mock arguments are supplied to this test by patches
        applied outside this method. The test runs
        `GtexRnaseqDataSource.prepare` against local fixture files and
        checks that the HDF5 and annotation outputs exist, that the HDF5
        file has one group per expected tissue, and that the download and
        gunzip helpers were called with the expected arguments.
        '''
        mock_tmp_dir = os.path.join(
            THIS_DIR,
            'public_data_test_files'
        )
        mock_create_tmp_dir.return_value = mock_tmp_dir
        ann_path = os.path.join(
            THIS_DIR,
            'public_data_test_files',
            'gtex_rnaseq_ann.tsv'
        )
        pheno_path = os.path.join(
            THIS_DIR,
            'public_data_test_files',
            'gtex_rnaseq_pheno.tsv'
        )

        mock_get_sample_annotations.return_value = pd.read_table(ann_path)
        mock_get_phenotype_data.return_value = pd.read_table(pheno_path)

        tmp_testing_dir = os.path.join(settings.DATA_DIR, 'test-gtex-rnaseq')
        os.mkdir(tmp_testing_dir)
        # Register the removal immediately so the folder is cleaned up even
        # when an assertion below fails; otherwise the next run would fail
        # at os.mkdir because the folder already exists.
        self.addCleanup(shutil.rmtree, tmp_testing_dir)

        mock_adipose_url = 'adipose_url'
        mock_blood_url = 'blood_url'
        class_overrides = {
            'ROOT_DIR': tmp_testing_dir,
            'TISSUE_TO_FILE_MAP': {
                'Adipose - Subcutaneous': mock_adipose_url,
                'Whole Blood': mock_blood_url
            }
        }
        # Override the class attributes through patchers so the original
        # values are restored after the test and do not leak into others.
        for attr_name, attr_value in class_overrides.items():
            patcher = mock.patch.object(
                GtexRnaseqDataSource, attr_name, attr_value, create=True)
            patcher.start()
            self.addCleanup(patcher.stop)

        data_src = GtexRnaseqDataSource()
        data_src.prepare()
        f = RnaSeqMixin.COUNT_OUTPUT_FILE_TEMPLATE.format(
            tag=data_src.TAG,
            date=data_src.date_str
        )
        ann_output = RnaSeqMixin.ANNOTATION_OUTPUT_FILE_TEMPLATE.format(
            tag=data_src.TAG,
            date=data_src.date_str
        )
        expected_output_hdf = os.path.join(tmp_testing_dir, f)
        expected_output_ann = os.path.join(tmp_testing_dir, ann_output)
        self.assertTrue(os.path.exists(expected_output_hdf))
        self.assertTrue(os.path.exists(expected_output_ann))

        expected_tissue_list = [
            'Adipose - Subcutaneous',
            'Whole Blood'
        ]
        converted_tissue_list = [
            RnaSeqMixin.create_python_compatible_id(x)
            for x in expected_tissue_list
        ]
        groups_list = ['/{x}/ds'.format(x=x) for x in converted_tissue_list]
        with pd.HDFStore(expected_output_hdf) as hdf:
            self.assertCountEqual(groups_list, list(hdf.keys()))

        # The files are numbered f0, f1 in the order the tissues were handled.
        shell_calls = []
        for i in range(2):
            f = os.path.join(mock_tmp_dir, 'f{x}.gct.gz'.format(x=i))
            shell_calls.append(mock.call('gunzip {f}'.format(f=f)))
        mock_run_shell_command.assert_has_calls(shell_calls)
        mock_download_file.assert_has_calls([
            mock.call(
                mock_adipose_url, os.path.join(mock_tmp_dir, 'f0.gct.gz')
            ),
            mock.call(
                mock_blood_url, os.path.join(mock_tmp_dir, 'f1.gct.gz'))
        ])