Example #1
def preprocess_mapping_file(data, headers, columns, unique=False, single=False,
                            clones=0):
    """Process a mapping file to expand the data or remove unneeded fields

    Inputs:
    data: mapping file data
    headers: mapping file headers
    columns: list of headers to keep; if one of these headers includes two
    ampersands ('&&'), this function will create a new column by merging the
    delimited columns.
    unique: remove columns where all values are unique
    single: remove columns where all values are the same
    clones: number of times to replicate the metadata

    Outputs:
    data: processed mapping file data
    headers: processed mapping file headers
    """

    # The sample ID must always be there, else it's meaningless data
    if 'SampleID' != columns[0]:
        columns = ['SampleID'] + columns

    # process concatenated columns if needed
    merge = []
    for column in columns:
        if '&&' in column:
            merge.append(column)
    # each of these new columns is built by merging several existing columns
    for new_column in merge:
        indices = [headers.index(header_name) for header_name in
            new_column.split('&&')]

        # join all the fields of the metadata that are listed in indices
        for line in data:
            line.append(''.join([line[index] for index in indices]))
        headers.append(new_column)

    # remove all unique or single-valued columns
    if unique or single:
        columns_to_remove = []
        metadata = MetadataMap(mapping_file_to_dict(data, headers), [])

        # find columns that have values that are all unique
        if unique:
            columns_to_remove += [column_name for column_name in headers[1::]
                if metadata.hasUniqueCategoryValues(column_name)]

        # remove categories where there is only one value
        if single:
            columns_to_remove += [column_name for column_name in headers[1::]
                if metadata.hasSingleCategoryValue(column_name)]
        columns_to_remove = list(set(columns_to_remove))

        # remove the single or unique columns
        data, headers = keep_columns_from_mapping_file(data, headers,
            columns_to_remove, negate=True)

    # remove anything not specified in the input
    data, headers = keep_columns_from_mapping_file(data, headers, columns)

    # sanitize the mapping file data and headers
    data, headers = sanitize_mapping_file(data, headers)

    # clones: replicate the metadata, re-tagging the sample IDs with a suffix
    if clones:
        out_data = []
        for index in range(0, clones):
            out_data.extend([[element[0]+'_%d' % index]+element[1::]
                for element in data])
        data = out_data

    return data, headers
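
The '&&' merging step in the loop above can be shown in isolation. The
following is a minimal, self-contained sketch (plain Python, with no
QIIME/Emperor helpers) of how a merged column is derived from a mapping file's
headers and rows; the sample values are invented for illustration.

headers = ['SampleID', 'Treatment', 'DOB']
data = [['PC.354', 'Control', '20061218'],
        ['PC.607', 'Fast', '20071112']]

# a requested column such as 'Treatment&&DOB' is split on '&&' and the
# referenced columns are concatenated per row, then appended as a new column
new_column = 'Treatment&&DOB'
indices = [headers.index(name) for name in new_column.split('&&')]
for row in data:
    row.append(''.join(row[index] for index in indices))
headers.append(new_column)

print(headers)  # ['SampleID', 'Treatment', 'DOB', 'Treatment&&DOB']
print(data[0])  # ['PC.354', 'Control', '20061218', 'Control20061218']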
Example #2
class MetadataMapTests(TestCase):
    """Tests for the MetadataMap class."""
    def setUp(self):
        """Create MetadataMap objects that will be used in the tests."""
        # Create a map using the overview tutorial mapping file.
        self.overview_map_str = [
            "#SampleID\tBarcodeSequence\tTreatment\tDOB\tDescription",
            "PC.354\tAGCACGAGCCTA\tControl\t20061218\t354",
            "PC.355\tAACTCGTCGATG\tControl\t20061218\t355",
            "PC.356\tACAGACCACTCA\tControl\t20061126\t356",
            "PC.481\tACCAGCGACTAG\tControl\t20070314\t481",
            "PC.593\tAGCAGCACTTGT\tControl\t20071210\t593",
            "PC.607\tAACTGTGCGTAC\tFast\t20071112\t607",
            "PC.634\tACAGAGTCGGCT\tFast\t20080116\t634",
            "PC.635\tACCGCAGAGTCA\tFast\t20080116\t635",
            "PC.636\tACGGTGAGTGTC\tFast\t20080116\t636"
        ]
        self.overview_map = MetadataMap(
            *parse_mapping_file_to_dict(self.overview_map_str))

        # Create the same overview tutorial map, but this time with some
        # comments.
        self.comment = "# Some comments about this mapping file"
        self.map_with_comments_str = self.overview_map_str[:]
        self.map_with_comments_str.insert(1, self.comment)
        self.map_with_comments = MetadataMap(
            *parse_mapping_file_to_dict(self.map_with_comments_str))

        # Create a MetadataMap object that has no metadata (i.e. no sample IDs,
        # so no metadata about samples).
        self.empty_map = MetadataMap({}, [])

        # Create a MetadataMap object that has samples (i.e. sample IDs) but
        # not associated metadata (i.e. no columns other than SampleID).
        self.no_metadata_str = [
            "#SampleID", "PC.354", "PC.355", "PC.356", "PC.481", "PC.593",
            "PC.607", "PC.634", "PC.635", "PC.636"
        ]
        self.no_metadata = MetadataMap(
            *parse_mapping_file_to_dict(self.no_metadata_str))

        # Create a MetadataMap object that has a category with only one value
        # throughout the entire column.
        self.single_value_str = [
            "#SampleID\tFoo", "PC.354\tfoo", "PC.355\tfoo", "PC.356\tfoo",
            "PC.481\tfoo", "PC.593\tfoo", "PC.607\tfoo", "PC.634\tfoo",
            "PC.635\tfoo", "PC.636\tfoo"
        ]
        self.single_value = MetadataMap(
            *parse_mapping_file_to_dict(self.single_value_str))

    def test_parseMetadataMap(self):
        """Test parsing a mapping file into a MetadataMap instance."""
        obs = MetadataMap.parseMetadataMap(self.overview_map_str)
        self.assertEqual(obs, self.overview_map)

    def test_parseMetadataMap_empty(self):
        """Test parsing empty mapping file contents."""
        self.assertRaises(QiimeParseError, MetadataMap.parseMetadataMap, [])

    def test_eq(self):
        """Test whether two MetadataMaps are equal."""
        self.assertTrue(self.empty_map == MetadataMap({}, []))
        self.assertTrue(self.overview_map == MetadataMap(
            self.overview_map._metadata, self.overview_map.Comments))

    def test_ne(self):
        """Test whether two MetadataMaps are not equal."""
        self.assertTrue(self.empty_map != MetadataMap({}, ["foo"]))
        self.assertTrue(self.overview_map != MetadataMap(
            self.overview_map._metadata, ["foo"]))
        self.assertTrue(
            self.overview_map != MetadataMap({}, self.overview_map.Comments))
        self.assertTrue(self.overview_map != self.empty_map)
        self.assertTrue(self.overview_map != self.map_with_comments)
        self.assertTrue(self.overview_map != self.no_metadata)

    def test_getSampleMetadata(self):
        """Test metadata by sample ID accessor with valid sample IDs."""
        exp = {
            'BarcodeSequence': 'AGCACGAGCCTA',
            'Treatment': 'Control',
            'DOB': '20061218',
            'Description': '354'
        }
        obs = self.overview_map.getSampleMetadata('PC.354')
        self.assertEqual(obs, exp)

        exp = {
            'BarcodeSequence': 'ACCAGCGACTAG',
            'Treatment': 'Control',
            'DOB': '20070314',
            'Description': '481'
        }
        obs = self.map_with_comments.getSampleMetadata('PC.481')
        self.assertEqual(obs, exp)

        exp = {
            'BarcodeSequence': 'ACGGTGAGTGTC',
            'Treatment': 'Fast',
            'DOB': '20080116',
            'Description': '636'
        }
        obs = self.map_with_comments.getSampleMetadata('PC.636')
        self.assertEqual(obs, exp)

        exp = {}
        obs = self.no_metadata.getSampleMetadata('PC.636')
        self.assertEqual(obs, exp)

    def test_getSampleMetadata_bad_sample_id(self):
        """Test metadata by sample ID accessor with invalid sample IDs."""
        # Nonexistent sample ID.
        self.assertRaises(KeyError, self.overview_map.getSampleMetadata,
                          'PC.000')
        self.assertRaises(KeyError, self.no_metadata.getSampleMetadata,
                          'PC.000')
        # Integer sample ID.
        self.assertRaises(KeyError, self.overview_map.getSampleMetadata, 42)
        # Sample ID of type None.
        self.assertRaises(KeyError, self.overview_map.getSampleMetadata, None)

        # Sample ID on empty map.
        self.assertRaises(KeyError, self.empty_map.getSampleMetadata, 's1')
        # Integer sample ID on empty map.
        self.assertRaises(KeyError, self.empty_map.getSampleMetadata, 1)
        # Sample ID of None on empty map.
        self.assertRaises(KeyError, self.empty_map.getSampleMetadata, None)

    def test_getCategoryValue(self):
        """Test category value by sample ID/category name accessor."""
        exp = "Fast"
        obs = self.overview_map.getCategoryValue('PC.634', 'Treatment')
        self.assertEqual(obs, exp)

        exp = "20070314"
        obs = self.overview_map.getCategoryValue('PC.481', 'DOB')
        self.assertEqual(obs, exp)

        exp = "ACGGTGAGTGTC"
        obs = self.map_with_comments.getCategoryValue('PC.636',
                                                      'BarcodeSequence')
        self.assertEqual(obs, exp)

    def test_getCategoryValues(self):
        """Test category value list by sample ID/category name accessor."""
        smpl_ids = [
            'PC.354', 'PC.355', 'PC.356', 'PC.481', 'PC.593', 'PC.607',
            'PC.634', 'PC.635', 'PC.636'
        ]

        exp = [
            'Control', 'Control', 'Control', 'Control', 'Control', 'Fast',
            'Fast', 'Fast', 'Fast'
        ]
        obs = self.overview_map.getCategoryValues(smpl_ids, 'Treatment')
        self.assertEqual(obs, exp)

    def test_isNumericCategory(self):
        """Test checking if a category is numeric."""
        obs = self.overview_map.isNumericCategory('Treatment')
        self.assertEqual(obs, False)

        obs = self.overview_map.isNumericCategory('DOB')
        self.assertEqual(obs, True)

    def test_hasUniqueCategoryValues(self):
        """Test checking if a category has unique values."""
        obs = self.overview_map.hasUniqueCategoryValues('Treatment')
        self.assertEqual(obs, False)

        obs = self.overview_map.hasUniqueCategoryValues('DOB')
        self.assertEqual(obs, False)

        obs = self.overview_map.hasUniqueCategoryValues('Description')
        self.assertEqual(obs, True)

    def test_hasSingleCategoryValue(self):
        """Test checking if a category has only a single value."""
        obs = self.overview_map.hasSingleCategoryValue('Treatment')
        self.assertEqual(obs, False)

        obs = self.single_value.hasSingleCategoryValue('Foo')
        self.assertEqual(obs, True)

    def test_getCategoryValue_bad_sample_id(self):
        """Test category value by sample ID accessor with bad sample IDs."""
        # Nonexistent sample ID.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
                          'PC.000', 'Treatment')
        self.assertRaises(KeyError, self.no_metadata.getCategoryValue,
                          'PC.000', 'Treatment')
        # Integer sample ID.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue, 42,
                          'DOB')
        # Sample ID of type None.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue, None,
                          'Treatment')

        # Sample ID on empty map.
        self.assertRaises(KeyError, self.empty_map.getCategoryValue, 's1',
                          'foo')
        # Integer sample ID on empty map.
        self.assertRaises(KeyError, self.empty_map.getCategoryValue, 1, 'bar')
        # Sample ID of None on empty map.
        self.assertRaises(KeyError, self.empty_map.getCategoryValue, None,
                          'baz')

    def test_getCategoryValue_bad_category(self):
        """Test category value by sample ID accessor with bad categories."""
        # Nonexistent category.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
                          'PC.354', 'foo')
        # Integer category.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
                          'PC.354', 42)
        # Category of type None.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
                          'PC.354', None)

        # Category on map with no metadata, but that has sample IDs.
        self.assertRaises(KeyError, self.no_metadata.getCategoryValue,
                          'PC.354', 'Treatment')
        # Integer category on map with no metadata.
        self.assertRaises(KeyError, self.no_metadata.getCategoryValue,
                          'PC.354', 34)
        # Category of type None on map with no metadata.
        self.assertRaises(KeyError, self.no_metadata.getCategoryValue,
                          'PC.354', None)

    def test_SampleIds(self):
        """Test sample IDs accessor."""
        exp = [
            "PC.354", "PC.355", "PC.356", "PC.481", "PC.593", "PC.607",
            "PC.634", "PC.635", "PC.636"
        ]
        obs = self.overview_map.SampleIds
        self.assertEqual(obs, exp)

        obs = self.no_metadata.SampleIds
        self.assertEqual(obs, exp)

        obs = self.empty_map.SampleIds
        self.assertEqual(obs, [])

    def test_CategoryNames(self):
        """Test category names accessor."""
        exp = ["BarcodeSequence", "DOB", "Description", "Treatment"]
        obs = self.overview_map.CategoryNames
        self.assertEqual(obs, exp)

        obs = self.no_metadata.CategoryNames
        self.assertEqual(obs, [])

        obs = self.empty_map.CategoryNames
        self.assertEqual(obs, [])

    def test_filterSamples(self):
        """Test filtering out samples from metadata map."""
        exp = ['PC.356', 'PC.593']
        self.overview_map.filterSamples(['PC.593', 'PC.356'])
        obs = self.overview_map.SampleIds
        self.assertEqual(obs, exp)

        self.overview_map.filterSamples([])
        self.assertEqual(self.overview_map.SampleIds, [])

    def test_filterSamples_strict(self):
        """Test strict checking of sample presence when filtering."""
        with self.assertRaises(ValueError):
            self.overview_map.filterSamples(['PC.356', 'abc123'])

        with self.assertRaises(ValueError):
            self.empty_map.filterSamples(['foo'])

    def test_filterSamples_no_strict(self):
        """Test that missing samples do not raise an error."""
        self.overview_map.filterSamples(['PC.356', 'abc123'], strict=False)
        self.assertEqual(self.overview_map.SampleIds, ['PC.356'])

        self.empty_map.filterSamples(['foo'], strict=False)
        self.assertEqual(self.empty_map.SampleIds, [])

    def test_is_valid_git_refname(self):
        """Test correct validation of refnames"""
        # valid branchnames
        self.assertTrue(is_valid_git_refname('master'))
        self.assertTrue(is_valid_git_refname('debuggatron_2000'))
        self.assertTrue(is_valid_git_refname('refname/bar'))
        self.assertTrue(is_valid_git_refname('ref.nameslu/_eggs_/spam'))
        self.assertTrue(is_valid_git_refname('valid{0}char'.format(
            unichr(40))))
        self.assertTrue(is_valid_git_refname('master@head'))
        self.assertTrue(is_valid_git_refname('bar{thing}foo'))

        # case happening with git < 1.6.6
        self.assertFalse(
            is_valid_git_refname(
                '--abbrev-ref\nbaa350d7b7063d585ca293fc16ef15e0765dc9ee'))

        # different invalid refnames, for a description of each group see the
        # man page of git check-ref-format
        self.assertFalse(is_valid_git_refname('bar/.spam/eggs'))
        self.assertFalse(is_valid_git_refname('bar.lock/spam/eggs'))
        self.assertFalse(is_valid_git_refname('bar.lock'))
        self.assertFalse(is_valid_git_refname('.foobar'))

        self.assertFalse(is_valid_git_refname('ref..name'))

        self.assertFalse(
            is_valid_git_refname(u'invalid{0}char'.format(unichr(177))))
        self.assertFalse(
            is_valid_git_refname('invalid{0}char'.format(unichr(39))))
        self.assertFalse(is_valid_git_refname('ref~name/bar'))
        self.assertFalse(is_valid_git_refname('refname spam'))
        self.assertFalse(is_valid_git_refname('bar/foo/eggs~spam'))
        self.assertFalse(is_valid_git_refname('bar:_spam_'))
        self.assertFalse(is_valid_git_refname('eggtastic^2'))

        self.assertFalse(is_valid_git_refname('areyourandy?'))
        self.assertFalse(is_valid_git_refname('bar/*/spam'))
        self.assertFalse(is_valid_git_refname('bar[spam]/eggs'))

        self.assertFalse(is_valid_git_refname('/barfooeggs'))
        self.assertFalse(is_valid_git_refname('barfooeggs/'))
        self.assertFalse(is_valid_git_refname('bar/foo//////eggs'))

        self.assertFalse(is_valid_git_refname('dotEnding.'))

        self.assertFalse(is_valid_git_refname('@{branch'))

        self.assertFalse(is_valid_git_refname('contains\\slash'))

        self.assertFalse(is_valid_git_refname('$newbranch'))

    def test_is_valid_git_sha1(self):
        """Test correct validation of SHA-1 strings."""

        # valid sha1 strings
        self.assertTrue(
            is_valid_git_sha1('65a9ba2ef4b126fb5b054ea6b89b457463db4ec6'))
        self.assertTrue(
            is_valid_git_sha1('a29a9911e41253405494c43889925a6d79ca26db'))
        self.assertTrue(
            is_valid_git_sha1('e099cd5fdea89eba929d6051fbd26cc9e7a0c961'))
        self.assertTrue(
            is_valid_git_sha1('44235d322c3386bd5ce872d9d7ea2e10d27c86cb'))
        self.assertTrue(
            is_valid_git_sha1('7d2fc23E04540EE92c742948cca9ed5bc54d08d1'))
        self.assertTrue(
            is_valid_git_sha1('fb5dc0285a8b11f199c4f3a7547a2da38138373f'))
        self.assertTrue(
            is_valid_git_sha1('0b2abAEb195ba7ebc5cfdb53213a66fbaddefdb8'))

        # invalid length
        self.assertFalse(is_valid_git_sha1('cca9ed5bc54d08d1'))
        self.assertFalse(is_valid_git_sha1(''))

        # invalid characters
        self.assertFalse(
            is_valid_git_sha1('fb5dy0f85a8b11f199c4f3a75474a2das8138373'))
        self.assertFalse(
            is_valid_git_sha1('0x5dcc816fbc1c2e8eX087d7d2ed8d2950a7c16b'))
Example #3
class MetadataMapTests(TestCase):
    """Tests for the MetadataMap class."""

    def setUp(self):
        """Create MetadataMap objects that will be used in the tests."""
        # Create a map using the overview tutorial mapping file.
        self.overview_map_str = [
                "#SampleID\tBarcodeSequence\tTreatment\tDOB\tDescription",
                "PC.354\tAGCACGAGCCTA\tControl\t20061218\t354",
                "PC.355\tAACTCGTCGATG\tControl\t20061218\t355",
                "PC.356\tACAGACCACTCA\tControl\t20061126\t356",
                "PC.481\tACCAGCGACTAG\tControl\t20070314\t481",
                "PC.593\tAGCAGCACTTGT\tControl\t20071210\t593",
                "PC.607\tAACTGTGCGTAC\tFast\t20071112\t607",
                "PC.634\tACAGAGTCGGCT\tFast\t20080116\t634",
                "PC.635\tACCGCAGAGTCA\tFast\t20080116\t635",
                "PC.636\tACGGTGAGTGTC\tFast\t20080116\t636"]
        self.overview_map = MetadataMap(
            *parse_mapping_file_to_dict(self.overview_map_str))

        # Create the same overview tutorial map, but this time with some
        # comments.
        self.comment = "# Some comments about this mapping file"
        self.map_with_comments_str = self.overview_map_str[:]
        self.map_with_comments_str.insert(1, self.comment)
        self.map_with_comments = MetadataMap(*parse_mapping_file_to_dict(
            self.map_with_comments_str))

        # Create a MetadataMap object that has no metadata (i.e. no sample IDs,
        # so no metadata about samples).
        self.empty_map = MetadataMap({}, [])

        # Create a MetadataMap object that has samples (i.e. sample IDs) but
        # not associated metadata (i.e. no columns other than SampleID).
        self.no_metadata_str = ["#SampleID",
                                "PC.354",
                                "PC.355",
                                "PC.356",
                                "PC.481",
                                "PC.593",
                                "PC.607",
                                "PC.634",
                                "PC.635",
                                "PC.636"]
        self.no_metadata = MetadataMap(*parse_mapping_file_to_dict(
            self.no_metadata_str))

        # Create a MetadataMap object that has a category with only one value
        # throughout the entire column.
        self.single_value_str = ["#SampleID\tFoo",
                                 "PC.354\tfoo",
                                 "PC.355\tfoo",
                                 "PC.356\tfoo",
                                 "PC.481\tfoo",
                                 "PC.593\tfoo",
                                 "PC.607\tfoo",
                                 "PC.634\tfoo",
                                 "PC.635\tfoo",
                                 "PC.636\tfoo"]
        self.single_value = MetadataMap(*parse_mapping_file_to_dict(
            self.single_value_str))

    def test_parseMetadataMap(self):
        """Test parsing a mapping file into a MetadataMap instance."""
        obs = MetadataMap.parseMetadataMap(self.overview_map_str)
        self.assertEqual(obs, self.overview_map)

    def test_parseMetadataMap_empty(self):
        """Test parsing empty mapping file contents."""
        self.assertRaises(QiimeParseError, MetadataMap.parseMetadataMap, [])

    def test_eq(self):
        """Test whether two MetadataMaps are equal."""
        self.assertTrue(self.empty_map == MetadataMap({}, []))
        self.assertTrue(self.overview_map == MetadataMap(
            self.overview_map._metadata, self.overview_map.Comments))

    def test_ne(self):
        """Test whether two MetadataMaps are not equal."""
        self.assertTrue(self.empty_map != MetadataMap({}, ["foo"]))
        self.assertTrue(self.overview_map != MetadataMap(
            self.overview_map._metadata, ["foo"]))
        self.assertTrue(self.overview_map != MetadataMap({},
            self.overview_map.Comments))
        self.assertTrue(self.overview_map != self.empty_map)
        self.assertTrue(self.overview_map != self.map_with_comments)
        self.assertTrue(self.overview_map != self.no_metadata)

    def test_getSampleMetadata(self):
        """Test metadata by sample ID accessor with valid sample IDs."""
        exp = {'BarcodeSequence': 'AGCACGAGCCTA', 'Treatment': 'Control',
                'DOB': '20061218', 'Description': '354'}
        obs = self.overview_map.getSampleMetadata('PC.354')
        self.assertEqual(obs, exp)

        exp = {'BarcodeSequence': 'ACCAGCGACTAG', 'Treatment': 'Control',
                'DOB': '20070314', 'Description': '481'}
        obs = self.map_with_comments.getSampleMetadata('PC.481')
        self.assertEqual(obs, exp)

        exp = {'BarcodeSequence': 'ACGGTGAGTGTC', 'Treatment': 'Fast',
                'DOB': '20080116', 'Description': '636'}
        obs = self.map_with_comments.getSampleMetadata('PC.636')
        self.assertEqual(obs, exp)

        exp = {}
        obs = self.no_metadata.getSampleMetadata('PC.636')
        self.assertEqual(obs, exp)

    def test_getSampleMetadata_bad_sample_id(self):
        """Test metadata by sample ID accessor with invalid sample IDs."""
        # Nonexistent sample ID.
        self.assertRaises(KeyError, self.overview_map.getSampleMetadata,
            'PC.000')
        self.assertRaises(KeyError, self.no_metadata.getSampleMetadata,
            'PC.000')
        # Integer sample ID.
        self.assertRaises(KeyError, self.overview_map.getSampleMetadata, 42)
        # Sample ID of type None.
        self.assertRaises(KeyError, self.overview_map.getSampleMetadata, None)

        # Sample ID on empty map.
        self.assertRaises(KeyError, self.empty_map.getSampleMetadata, 's1')
        # Integer sample ID on empty map.
        self.assertRaises(KeyError, self.empty_map.getSampleMetadata, 1)
        # Sample ID of None on empty map.
        self.assertRaises(KeyError, self.empty_map.getSampleMetadata, None)

    def test_getCategoryValue(self):
        """Test category value by sample ID/category name accessor."""
        exp = "Fast"
        obs = self.overview_map.getCategoryValue('PC.634', 'Treatment')
        self.assertEqual(obs, exp)

        exp = "20070314"
        obs = self.overview_map.getCategoryValue('PC.481', 'DOB')
        self.assertEqual(obs, exp)

        exp = "ACGGTGAGTGTC"
        obs = self.map_with_comments.getCategoryValue(
                'PC.636', 'BarcodeSequence')
        self.assertEqual(obs, exp)

    def test_getCategoryValues(self):
        """Test category value list by sample ID/category name accessor."""
        smpl_ids = ['PC.354', 'PC.355', 'PC.356', 'PC.481', 'PC.593', 'PC.607',
                    'PC.634', 'PC.635', 'PC.636']

        exp = ['Control', 'Control', 'Control', 'Control', 'Control', 'Fast',
               'Fast', 'Fast', 'Fast']
        obs = self.overview_map.getCategoryValues(smpl_ids, 'Treatment')
        self.assertEqual(obs, exp)

    def test_isNumericCategory(self):
        """Test checking if a category is numeric."""
        obs = self.overview_map.isNumericCategory('Treatment')
        self.assertEqual(obs, False)

        obs = self.overview_map.isNumericCategory('DOB')
        self.assertEqual(obs, True)

    def test_hasUniqueCategoryValues(self):
        """Test checking if a category has unique values."""
        obs = self.overview_map.hasUniqueCategoryValues('Treatment')
        self.assertEqual(obs, False)

        obs = self.overview_map.hasUniqueCategoryValues('DOB')
        self.assertEqual(obs, False)

        obs = self.overview_map.hasUniqueCategoryValues('Description')
        self.assertEqual(obs, True)

    def test_hasSingleCategoryValue(self):
        """Test checking if a category has only a single value."""
        obs = self.overview_map.hasSingleCategoryValue('Treatment')
        self.assertEqual(obs, False)

        obs = self.single_value.hasSingleCategoryValue('Foo')
        self.assertEqual(obs, True)

    def test_getCategoryValue_bad_sample_id(self):
        """Test category value by sample ID accessor with bad sample IDs."""
        # Nonexistent sample ID.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
            'PC.000', 'Treatment')
        self.assertRaises(KeyError, self.no_metadata.getCategoryValue,
            'PC.000', 'Treatment')
        # Integer sample ID.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue, 42,
            'DOB')
        # Sample ID of type None.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue, None,
            'Treatment')

        # Sample ID on empty map.
        self.assertRaises(KeyError, self.empty_map.getCategoryValue, 's1',
            'foo')
        # Integer sample ID on empty map.
        self.assertRaises(KeyError, self.empty_map.getCategoryValue, 1,
            'bar')
        # Sample ID of None on empty map.
        self.assertRaises(KeyError, self.empty_map.getCategoryValue, None,
            'baz')

    def test_getCategoryValue_bad_category(self):
        """Test category value by sample ID accessor with bad categories."""
        # Nonexistent category.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
            'PC.354', 'foo')
        # Integer category.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
            'PC.354', 42)
        # Category of type None.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
            'PC.354', None)

        # Category on map with no metadata, but that has sample IDs.
        self.assertRaises(KeyError, self.no_metadata.getCategoryValue,
            'PC.354', 'Treatment')
        # Integer category on map with no metadata.
        self.assertRaises(KeyError, self.no_metadata.getCategoryValue,
            'PC.354', 34)
        # Category of type None on map with no metadata.
        self.assertRaises(KeyError, self.no_metadata.getCategoryValue,
            'PC.354', None)

    def test_SampleIds(self):
        """Test sample IDs accessor."""
        exp = ["PC.354", "PC.355", "PC.356", "PC.481", "PC.593", "PC.607",
               "PC.634", "PC.635", "PC.636"]
        obs = self.overview_map.SampleIds
        self.assertEqual(obs, exp)

        obs = self.no_metadata.SampleIds
        self.assertEqual(obs, exp)

        obs = self.empty_map.SampleIds
        self.assertEqual(obs, [])

    def test_CategoryNames(self):
        """Test category names accessor."""
        exp = ["BarcodeSequence", "DOB", "Description", "Treatment"]
        obs = self.overview_map.CategoryNames
        self.assertEqual(obs, exp)

        obs = self.no_metadata.CategoryNames
        self.assertEqual(obs, [])

        obs = self.empty_map.CategoryNames
        self.assertEqual(obs, [])

    def test_filterSamples(self):
        """Test filtering out samples from metadata map."""
        exp = ['PC.356', 'PC.593']
        self.overview_map.filterSamples(['PC.593', 'PC.356'])
        obs = self.overview_map.SampleIds
        self.assertEqual(obs, exp)

        self.overview_map.filterSamples([])
        self.assertEqual(self.overview_map.SampleIds, [])

    def test_filterSamples_strict(self):
        """Test strict checking of sample presence when filtering."""
        with self.assertRaises(ValueError):
            self.overview_map.filterSamples(['PC.356', 'abc123'])

        with self.assertRaises(ValueError):
            self.empty_map.filterSamples(['foo'])

    def test_filterSamples_no_strict(self):
        """Test that missing samples do not raise an error."""
        self.overview_map.filterSamples(['PC.356', 'abc123'], strict=False)
        self.assertEqual(self.overview_map.SampleIds, ['PC.356'])

        self.empty_map.filterSamples(['foo'], strict=False)
        self.assertEqual(self.empty_map.SampleIds, [])

    def test_is_valid_git_refname(self):
        """Test correct validation of refnames"""
        # valid branchnames
        self.assertTrue(is_valid_git_refname('master'))
        self.assertTrue(is_valid_git_refname('debuggatron_2000'))
        self.assertTrue(is_valid_git_refname('refname/bar'))
        self.assertTrue(is_valid_git_refname('ref.nameslu/_eggs_/spam'))
        self.assertTrue(is_valid_git_refname('valid{0}char'.format(
            unichr(40))))
        self.assertTrue(is_valid_git_refname('master@head'))
        self.assertTrue(is_valid_git_refname('bar{thing}foo'))

        # case happening with git < 1.6.6
        self.assertFalse(is_valid_git_refname(
            '--abbrev-ref\nbaa350d7b7063d585ca293fc16ef15e0765dc9ee'))

        # different invalid refnames, for a description of each group see the
        # man page of git check-ref-format
        self.assertFalse(is_valid_git_refname('bar/.spam/eggs'))
        self.assertFalse(is_valid_git_refname('bar.lock/spam/eggs'))
        self.assertFalse(is_valid_git_refname('bar.lock'))
        self.assertFalse(is_valid_git_refname('.foobar'))

        self.assertFalse(is_valid_git_refname('ref..name'))

        self.assertFalse(is_valid_git_refname(u'invalid{0}char'.format(
            unichr(177))))
        self.assertFalse(is_valid_git_refname('invalid{0}char'.format(
            unichr(39))))
        self.assertFalse(is_valid_git_refname('ref~name/bar'))
        self.assertFalse(is_valid_git_refname('refname spam'))
        self.assertFalse(is_valid_git_refname('bar/foo/eggs~spam'))
        self.assertFalse(is_valid_git_refname('bar:_spam_'))
        self.assertFalse(is_valid_git_refname('eggtastic^2'))

        self.assertFalse(is_valid_git_refname('areyourandy?'))
        self.assertFalse(is_valid_git_refname('bar/*/spam'))
        self.assertFalse(is_valid_git_refname('bar[spam]/eggs'))

        self.assertFalse(is_valid_git_refname('/barfooeggs'))
        self.assertFalse(is_valid_git_refname('barfooeggs/'))
        self.assertFalse(is_valid_git_refname('bar/foo//////eggs'))

        self.assertFalse(is_valid_git_refname('dotEnding.'))

        self.assertFalse(is_valid_git_refname('@{branch'))

        self.assertFalse(is_valid_git_refname('contains\\slash'))

        self.assertFalse(is_valid_git_refname('$newbranch'))

    def test_is_valid_git_sha1(self):
        """Test correct validation of SHA-1 strings."""

        # valid sha1 strings
        self.assertTrue(is_valid_git_sha1(
            '65a9ba2ef4b126fb5b054ea6b89b457463db4ec6'))
        self.assertTrue(is_valid_git_sha1(
            'a29a9911e41253405494c43889925a6d79ca26db'))
        self.assertTrue(is_valid_git_sha1(
            'e099cd5fdea89eba929d6051fbd26cc9e7a0c961'))
        self.assertTrue(is_valid_git_sha1(
            '44235d322c3386bd5ce872d9d7ea2e10d27c86cb'))
        self.assertTrue(is_valid_git_sha1(
            '7d2fc23E04540EE92c742948cca9ed5bc54d08d1'))
        self.assertTrue(is_valid_git_sha1(
            'fb5dc0285a8b11f199c4f3a7547a2da38138373f'))
        self.assertTrue(is_valid_git_sha1(
            '0b2abAEb195ba7ebc5cfdb53213a66fbaddefdb8'))

        # invalid length
        self.assertFalse(is_valid_git_sha1('cca9ed5bc54d08d1'))
        self.assertFalse(is_valid_git_sha1(''))

        # invalid characters
        self.assertFalse(is_valid_git_sha1(
            'fb5dy0f85a8b11f199c4f3a75474a2das8138373'))
        self.assertFalse(is_valid_git_sha1(
            '0x5dcc816fbc1c2e8eX087d7d2ed8d2950a7c16b'))
Example #4
def preprocess_mapping_file(data,
                            headers,
                            columns,
                            unique=False,
                            single=False,
                            clones=0):
    """Process a mapping file to expand the data or remove unneeded fields

    Inputs:
    data: mapping file data
    headers: mapping file headers
    columns: list of headers to keep; if one of these headers includes two
    ampersands ('&&'), this function will create a new column by merging the
    delimited columns.
    unique: remove columns where all values are unique
    single: remove columns where all values are the same
    clones: number of times to replicate the metadata

    Outputs:
    data: processed mapping file data
    headers: processed mapping file headers
    """

    # The sample ID must always be there, else it's meaningless data
    if 'SampleID' != columns[0]:
        columns = ['SampleID'] + columns

    # process concatenated columns if needed
    merge = []
    for column in columns:
        if '&&' in column:
            merge.append(column)
    # each of these new columns is built by merging several existing columns
    for new_column in merge:
        indices = [
            headers.index(header_name)
            for header_name in new_column.split('&&')
        ]

        # join all the fields of the metadata that are listed in indices
        for line in data:
            line.append(''.join([line[index] for index in indices]))
        headers.append(new_column)

    # remove all unique or single-valued columns
    if unique or single:
        columns_to_remove = []
        metadata = MetadataMap(mapping_file_to_dict(data, headers), [])

        # find columns that have values that are all unique
        if unique:
            columns_to_remove += [
                column_name for column_name in headers[1::]
                if metadata.hasUniqueCategoryValues(column_name)
            ]

        # remove categories where there is only one value
        if single:
            columns_to_remove += [
                column_name for column_name in headers[1::]
                if metadata.hasSingleCategoryValue(column_name)
            ]
        columns_to_remove = list(set(columns_to_remove))

        # remove the single or unique columns
        data, headers = keep_columns_from_mapping_file(data,
                                                       headers,
                                                       columns_to_remove,
                                                       negate=True)

    # remove anything not specified in the input
    data, headers = keep_columns_from_mapping_file(data, headers, columns)

    # sanitize the mapping file data and headers
    data, headers = sanitize_mapping_file(data, headers)

    # clones: replicate the metadata, re-tagging the sample IDs with a suffix
    if clones:
        out_data = []
        for index in range(0, clones):
            out_data.extend([[element[0] + '_%d' % index] + element[1::]
                             for element in data])
        data = out_data

    return data, headers
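
The clones step at the end of the function re-tags the sample IDs of each
replica with a numeric suffix. A self-contained sketch of just that step,
using made-up rows:

data = [['PC.354', 'Control'],
        ['PC.607', 'Fast']]
clones = 2

# each replica copies every row and appends '_<replica index>' to its sample ID
out_data = []
for index in range(clones):
    out_data.extend([[row[0] + '_%d' % index] + row[1:] for row in data])

print(out_data)
# [['PC.354_0', 'Control'], ['PC.607_0', 'Fast'],
#  ['PC.354_1', 'Control'], ['PC.607_1', 'Fast']]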
Example #5
def preprocess_mapping_file(data, headers, columns, unique=False, single=False,
                            clones=0):
    """Process a mapping file to expand the data or remove unneeded fields

    Inputs:
    data: mapping file data
    headers: mapping file headers
    columns: list of headers to keep; if one of these headers includes two
    ampersands ('&&'), this function will create a new column by merging the
    delimited columns.
    unique: remove columns where all values are unique
    single: remove columns where all values are the same
    clones: number of times to replicate the metadata

    Outputs:
    data: processed mapping file data
    headers: processed mapping file headers
    """

    # The sample ID must always be there, else it's meaningless data
    if 'SampleID' != columns[0]:
        columns = ['SampleID'] + columns

    # process concatenated columns if needed
    merge = []
    for column in columns:
        # the list can contain None so check "if column" before treating as str
        if column and '&&' in column:
            merge.append(column)
    # each of these new columns is built by merging several existing columns
    for new_column in merge:
        indices = [headers.index(header_name) for header_name in
            new_column.split('&&')]

        # join all the fields of the metadata that are listed in indices
        for line in data:
            line.append(''.join([line[index] for index in indices]))
        headers.append(new_column)

    # remove all unique or single-valued columns that are not included in
    # the list of categories that should be kept, i.e. `columns`
    if unique or single:
        columns_to_remove = []
        metadata = MetadataMap(mapping_file_to_dict(data, headers), [])

        # the --color_by option in the script interface allows the user to
        # specify the categories to use in the generated plot; the default
        # behaviour is to color by all categories that are not unique. If the
        # user explicitly requests a category with the --color_by option and
        # that category contains unique values, it must still be kept, hence
        # the two different routes below: (1) no value is specified in the CLI
        # (the value of columns will be [None, x1, x2, x3] where x{1,2,3} are
        # categories requested in other CLI options) and (2) a value is
        # specified in the CLI.
        #
        # TL;DR
        # see https://github.com/biocore/emperor/issues/271
        if None in columns:
            columns = headers[:]
            f_unique = metadata.hasUniqueCategoryValues
            f_single = metadata.hasSingleCategoryValue
        else:
            f_unique = lambda x: metadata.hasUniqueCategoryValues(x) and\
                                 x not in columns
            f_single = lambda x: metadata.hasSingleCategoryValue(x) and\
                                 x not in columns

        # find columns that have values that are all unique
        if unique:
            for c in headers[1::]:
                if f_unique(c):
                    columns_to_remove.append(c)
        # remove categories where there is only one value
        if single:
            for c in headers[1::]:
                if f_single(c):
                    columns_to_remove.append(c)
        columns_to_remove = list(set(columns_to_remove))

        # remove the single or unique columns
        data, headers = keep_columns_from_mapping_file(data, headers,
            columns_to_remove, negate=True)
    else:
        # a None in columns means that all the available categories in the
        # mapping file should be used, so just overwrite the value
        if None in columns:
            columns = headers[:]

    # remove anything not specified in the input
    data, headers = keep_columns_from_mapping_file(data, headers, columns)

    # sanitize the mapping file data and headers
    data, headers = sanitize_mapping_file(data, headers)

    # clones: replicate the metadata, re-tagging the sample IDs with a suffix
    if clones:
        out_data = []
        for index in range(0, clones):
            out_data.extend([[element[0]+'_%d' % index]+element[1::]
                for element in data])
        data = out_data

    return data, headers
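
The two routes described in the long comment above (a None in columns meaning
"color by every category", versus an explicit --color_by list that protects
the requested categories) can be sketched in isolation. The snippet below
mimics that predicate logic with plain dictionaries; has_unique_values and
has_single_value are stand-ins for MetadataMap.hasUniqueCategoryValues and
MetadataMap.hasSingleCategoryValue, and the category names are invented for
illustration.

categories = {'Treatment': ['Control', 'Control', 'Fast'],
              'Description': ['354', '355', '356'],  # all values unique
              'Foo': ['foo', 'foo', 'foo']}          # all values identical

def has_unique_values(category):
    values = categories[category]
    return len(set(values)) == len(values)

def has_single_value(category):
    return len(set(categories[category])) == 1

def columns_to_drop(requested):
    # route (1): a None means "use every category", so nothing is protected
    if None in requested:
        protected = []
    # route (2): categories explicitly requested via --color_by stay, even
    # when they are unique- or single-valued
    else:
        protected = requested
    return sorted(c for c in categories
                  if (has_unique_values(c) or has_single_value(c))
                  and c not in protected)

print(columns_to_drop([None]))           # ['Description', 'Foo']
print(columns_to_drop(['Description']))  # ['Foo']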