Exemple #1
0
    def test_malformatted_filter_dict(self):
        '''
        Tests that we reject the query if a cancer type refers to
        something that is NOT a list (here, a bare string).
        '''
        hdf_path = os.path.join(
            THIS_DIR,
            'public_data_test_files',
            'tcga_rnaseq.hd5'
        )

        # this dict is what the database record is expected to contain
        # in the file_mapping field
        mock_mapping = {
            # this key doesn't matter- we just include it as a correct
            # representation of the database record
            RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
            RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path]
        }
        mock_db_record = mock.MagicMock()
        mock_db_record.file_mapping = mock_mapping
        query = {
            # This should be a list:
            'TCGA-DEF': 'abc'
        }
        data_src = RnaSeqMixin()
        # the children will provide an EXAMPLE_PAYLOAD attribute
        # which we patch onto this mixin instance here
        data_src.EXAMPLE_PAYLOAD = {
            'TCGA-UVM': ["<UUID>", "<UUID>"],
            'TCGA-MESO': ["<UUID>", "<UUID>", "<UUID>"]
        }
        # The return value is deliberately not unpacked: the call is expected
        # to raise, and unpacking here would let a ValueError caused by a
        # return-arity mismatch become the exception checked against the regex.
        with self.assertRaisesRegex(Exception, 'a list of sample identifiers'):
            data_src.create_from_query(mock_db_record, query)
Exemple #2
0
    def test_empty_filters(self):
        '''
        Tests that we reject the query if the filtering list
        for a cancer type is empty.
        '''
        hdf_path = os.path.join(
            THIS_DIR,
            'public_data_test_files',
            'tcga_rnaseq.hd5'
        )

        # this dict is what the database record is expected to contain
        # in the file_mapping field
        mock_mapping = {
            # this key doesn't matter- we just include it as a correct
            # representation of the database record
            RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
            RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path]
        }
        mock_db_record = mock.MagicMock()
        mock_db_record.file_mapping = mock_mapping
        query = {
            # This should have some strings:
            'TCGA-DEF': []
        }
        data_src = RnaSeqMixin()
        # The return value is deliberately not unpacked: the call is expected
        # to raise, so the bound names would never be used, and unpacking
        # would couple this negative test to the method's return arity.
        with self.assertRaisesRegex(Exception, 'empty'):
            data_src.create_from_query(mock_db_record, query)
Exemple #3
0
    def test_filters_with_bad_sample_id(self):
        '''
        Tests that we handle missing samples appropriately: the raised
        exception is expected to mention the unknown sample ID.
        '''
        hdf_path = os.path.join(
            THIS_DIR,
            'public_data_test_files',
            'tcga_rnaseq.hd5'
        )

        # this dict is what the database record is expected to contain
        # in the file_mapping field
        mock_mapping = {
            # this key doesn't matter- we just include it as a correct
            # representation of the database record
            RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
            RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path]
        }
        mock_db_record = mock.MagicMock()
        mock_db_record.file_mapping = mock_mapping
        query = {
            # add a bad sample ID to the TCGA-ABC set:
            'TCGA-ABC': ['s1111', 's3'],
            'TCGA-DEF': ['s5']
        }
        data_src = RnaSeqMixin()
        # The return value is deliberately not unpacked: the call is expected
        # to raise, and unpacking here would let a ValueError caused by a
        # return-arity mismatch become the exception checked against the regex.
        with self.assertRaisesRegex(Exception, 's1111'):
            data_src.create_from_query(mock_db_record, query)
Exemple #4
0
    def test_filters_with_cancer_type(self):
        '''
        Tests that we handle a bad group ID appropriately: the raised
        exception is expected to mention the unknown cancer type.
        '''
        hdf_path = os.path.join(
            THIS_DIR,
            'public_data_test_files',
            'tcga_rnaseq.hd5'
        )

        # this dict is what the database record is expected to contain
        # in the file_mapping field
        mock_mapping = {
            # this key doesn't matter- we just include it as a correct
            # representation of the database record
            RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
            RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path]
        }
        mock_db_record = mock.MagicMock()
        mock_db_record.file_mapping = mock_mapping
        query = {
            # the only datasets in the hdf5 file are for TCGA-ABC
            # and TCGA-DEF. Below, we ask for a non-existent one
            'TCGA-ABC': ['s1', 's3'],
            'TCGA-XYZ': ['s5']
        }
        data_src = RnaSeqMixin()
        # The return value is deliberately not unpacked: the call is expected
        # to raise, and unpacking here would let a ValueError caused by a
        # return-arity mismatch become the exception checked against the regex.
        with self.assertRaisesRegex(Exception, 'TCGA-XYZ'):
            data_src.create_from_query(mock_db_record, query)
Exemple #5
0
    def test_rejects_whole_dataset_with_null_filter(self):
        '''
        Tests that we reject the request (raise an exception)
        if a filter of None is applied. This would be too large
        for us to handle.
        '''
        hdf_path = os.path.join(
            THIS_DIR,
            'public_data_test_files',
            'tcga_rnaseq.hd5'
        )

        # this dict is what the database record is expected to contain
        # in the file_mapping field
        mock_mapping = {
            # this key doesn't matter- we just include it as a correct
            # representation of the database record
            RnaSeqMixin.ANNOTATION_FILE_KEY: ['/dummy.tsv'],
            RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path]
        }
        mock_db_record = mock.MagicMock()
        mock_db_record.file_mapping = mock_mapping
        data_src = RnaSeqMixin()
        # the actual implementing class would typically define this attr
        data_src.PUBLIC_NAME = 'foo'
        # The return value is deliberately not unpacked: the call is expected
        # to raise, and unpacking here would let a ValueError caused by a
        # return-arity mismatch become the exception checked against the regex.
        with self.assertRaisesRegex(Exception, 'too large'):
            data_src.create_from_query(mock_db_record, None)
Exemple #6
0
    def test_filters_hdf_correctly(self, mock_uuid_mod):
        '''
        Tests that we filter properly for a
        dummy dataset stored in HDF5 format.

        `mock_uuid_mod` stands in for the uuid module (presumably injected
        by a patch decorator outside this block); its `uuid4` is given a
        fixed sequence of return values so that the auto-generated output
        filenames can be asserted.
        '''
        hdf_path = os.path.join(
            THIS_DIR,
            'public_data_test_files',
            'tcga_rnaseq.hd5'
        )

        ann_path = os.path.join(
            THIS_DIR,
            'public_data_test_files',
            'tcga_rnaseq_ann.csv'
        )

        # create 5 mock UUIDs. The first two are used in the
        # first call to the tested method. The final 3 are used in the second
        # call to the tested method. The reason for that is we auto-generate
        # the output filename when the calling function has not provided an
        # `output_name` arg to the method. In the first call to the tested
        # method, we provide that name, so only two calls are made to the
        # uuid.uuid4 function. In the second call, we omit that arg and we
        # hence make an extra call to the uuid4 func.
        mock_uuids = [uuid.uuid4() for _ in range(5)]
        mock_uuid_mod.uuid4.side_effect = mock_uuids

        # this dict is what the database record is expected to contain
        # in the file_mapping field
        mock_mapping = {
            # unlike the failure-path tests, both entries point at real
            # fixture files since the outputs are read back and checked below
            RnaSeqMixin.ANNOTATION_FILE_KEY: [ann_path],
            RnaSeqMixin.COUNTS_FILE_KEY: [hdf_path]
        }
        mock_db_record = mock.MagicMock()
        mock_db_record.file_mapping = mock_mapping
        query = {
            'TCGA-ABC': ['s1', 's3'],
            'TCGA-DEF': ['s5']
        }
        data_src = RnaSeqMixin()
        # the children classes will have a TAG attribute. Since we are
        # testing this mixin here, we simply patch it
        tag = 'foo'
        data_src.TAG = tag
        output_name = 'abc'
        paths, filenames, resource_types = data_src.create_from_query(
            mock_db_record, query, output_name)

        # The order of these doesn't matter in practice, but to check the file contents,
        # we need to be sure we're looking at the correct files for this test.
        # assertEqual (rather than assertTrue(a == b)) reports both values on failure.
        self.assertEqual(resource_types[0], 'RNASEQ_COUNT_MTX')
        self.assertEqual(resource_types[1], 'ANN')
        expected_df = pd.DataFrame(
            [[26, 86, 67], [54, 59, 29], [24, 12, 37]],
            index=['gA', 'gB', 'gC'],
            columns=['s1', 's3', 's5']
        )
        actual_df = pd.read_table(paths[0], index_col=0)
        self.assertTrue(actual_df.equals(expected_df))

        ann_df = pd.DataFrame(
            [['TCGA-ABC', 1990], ['TCGA-ABC', 1992], ['TCGA-DEF', 1994]],
            index=['s1', 's3', 's5'],
            columns=['cancer_type', 'year_of_birth']
        )
        actual_df = pd.read_table(paths[1], index_col=0)
        self.assertTrue(actual_df.equals(ann_df))

        self.assertEqual(filenames[0], '{x}_counts.{t}.tsv'.format(x=output_name, t=tag))
        self.assertEqual(filenames[1], '{x}_ann.{t}.tsv'.format(x=output_name, t=tag))

        # use index 4 below as 2 uuid.uuid4 calls were 'consumed' by the first
        # call to `create_from_query`, while the second call (the one we are
        # testing now) uses 3 calls to uuid.uuid4. The value checked in the
        # auto-generated filenames is therefore the fifth mocked UUID.
        paths, filenames, resource_types = data_src.create_from_query(mock_db_record, query)
        self.assertEqual(filenames[0], '{t}_counts.{u}.tsv'.format(u=mock_uuids[4], t=tag))
        self.assertEqual(filenames[1], '{t}_ann.{u}.tsv'.format(u=mock_uuids[4], t=tag))