Python taxdump_from_text Examples, hgtector.util.taxdump_from_text Python Examples

Example #1

0

Show file

    def test_find_match(self):
        """Check the taxId returned by find_match at several thresholds.

        The same hit table is queried with progressively lower ``match_th``
        values, and finally an empty table is queried.
        """
        analyzer = Analyze()
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(analyzer.taxdump)

        # one [score, taxid] row per hit
        rows = [
            [100, '585056'],  # E. coli UMN026
            [99, '1038927'],  # E. coli O104:H4
            [97, '562'],  # Escherichia coli
            [95, '622'],  # Shigella dysenteriae
            [92, '543'],  # Enterobacteriaceae
            [88, '548'],  # Klebsiella aerogenes
            [80, '766'],  # Rickettsiales
        ]
        hits = pd.DataFrame(rows, columns=['score', 'taxid'])

        # (match_th, expected taxId)
        scenarios = (
            (0.99, '562'),  # keep top 1% hits
            (0.9, '543'),  # keep top 10% hits
            (0.8, '1224'),  # keep top 20% hits
        )
        for threshold, expected in scenarios:
            analyzer.match_th = threshold
            self.assertEqual(analyzer.find_match(hits), expected)

        # input DataFrame is empty
        self.assertEqual(analyzer.find_match(pd.DataFrame()), '0')

Example #2

0

Show file

File: test_util.py Project: DittmarLab/HGTector_legacy

    def test_sort_by_hierarchy(self):
        """Check sort_by_hierarchy on a sortable lineage and two failures."""
        taxdump = taxdump_from_text(taxdump_archaea)
        failure = 'Cannot sort taxIds by hierarchy.'

        def assert_unsortable(taxids):
            # the call must raise ValueError carrying the expected message
            with self.assertRaises(ValueError) as caught:
                sort_by_hierarchy(taxids, taxdump)
            self.assertEqual(str(caught.exception), failure)

        # sort by hierarchy from low to high:
        # Lokiarchaeum sp. GC14_75, Lokiarchaeum, Candidatus Lokiarchaeota,
        # Asgard group, Archaea
        tids = ['1935183', '1655637', '2157', '1538547', '1655434']
        expected = ['1538547', '1655637', '1655434', '1935183', '2157']
        self.assertListEqual(sort_by_hierarchy(tids, taxdump), expected)

        # add DPANN group, which cannot be sorted
        tids.append('1783276')
        assert_unsortable(tids)

        # drop the last two entries (DPANN group and Candidatus
        # Lokiarchaeota) to break the sequence
        del tids[-2:]
        assert_unsortable(tids)

Example #3

0

Show file

 def test_genome_lineages(self):
     """Check the lineage strings that genome_lineages writes to disk."""
     db = Database()
     db.output = self.tmpdir
     db.taxdump = taxdump_from_text(taxdump_proteo)
     records = (
         ('G1', '1224', ''),  # Proteobacteria
         ('G2', '562', '562'),  # Escherichia coli
         ('G3', '622', '622'),  # Shigella dysenteriae
         ('G4', '548', '548'),  # Klebsiella aerogenes
     )
     table = pd.DataFrame(records, columns=['genome', 'taxid', 'species'])
     db.df = table.set_index('genome')

     # every rank column starts out blank
     ranks = ('superkingdom', 'kingdom', 'phylum', 'class', 'order',
              'family', 'genus')
     for rank in ranks:
         db.df[rank] = ''
     db.genome_lineages()

     # read the output back as a genome -> lineage mapping
     lineage_path = join(self.tmpdir, 'lineages.txt')
     with open(lineage_path, 'r') as handle:
         lines = handle.read().splitlines()
     obs = dict(line.split('\t') for line in lines)

     proteo = 'k__Bacteria; p__Proteobacteria;'
     entero = (proteo + ' c__Gammaproteobacteria; o__Enterobacterales;'
               + ' f__Enterobacteriaceae;')
     expected = {
         'G1': proteo + ' c__; o__; f__; g__; s__',
         'G2': entero + ' g__Escherichia; s__Escherichia coli',
         'G3': entero + ' g__Shigella; s__Shigella dysenteriae',
         'G4': entero + ' g__Klebsiella; s__Klebsiella aerogenes',
     }
     for genome, lineage in expected.items():
         self.assertEqual(obs[genome], lineage)
     remove(lineage_path)

Example #4

0

Show file

    def test_infer_close_group(self):
        """Check infer_close_group under three self-group configurations."""
        analyzer = Analyze()
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(analyzer.taxdump)
        analyzer.groups = {}

        # close group is parent of LCA of self group
        analyzer.self_tax = ['562']  # E. coli
        analyzer.groups['self'] = {
            '562', *get_descendants('562', analyzer.taxdump)}
        analyzer.close_tax = None
        analyzer.close_size = None
        analyzer.infer_close_group()
        self.assertListEqual(analyzer.close_tax, ['561'])  # Escherichia
        self.assertSetEqual(analyzer.groups['close'], {'561', '2580236'})

        # close group must have at least 5 taxa
        analyzer.close_tax = None
        analyzer.groups['close'] = None
        analyzer.close_size = 5
        analyzer.infer_close_group()
        # Enterobacteriaceae
        self.assertListEqual(analyzer.close_tax, ['543'])
        expected = {'543', '620', '622', '570', '548', '561', '2580236'}
        self.assertSetEqual(analyzer.groups['close'], expected)

        # close group is LCA of multiple self groups
        analyzer.self_tax = ['561', '620']  # Escherichia and Shigella
        members = set()
        for tid in analyzer.self_tax:
            # each self taxon together with all of its descendants
            members.add(tid)
            members.update(get_descendants(tid, analyzer.taxdump))
        analyzer.groups['self'] = members
        analyzer.close_tax = None
        analyzer.groups['close'] = None
        analyzer.close_size = None
        analyzer.infer_close_group()
        # Enterobacteriaceae
        self.assertListEqual(analyzer.close_tax, ['543'])
        expected = {'543', '570', '548'}
        self.assertSetEqual(analyzer.groups['close'], expected)

Example #5

0

Show file

    def test_infer_self_group(self):
        """Check infer_self_group for three LCA / rank combinations."""
        analyzer = Analyze()
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(analyzer.taxdump)

        # (lca, self_rank, expected self_tax)
        scenarios = (
            # assign to LCA of all genomes (E. coli)
            ('562', None, ['562']),
            # raise LCA to genus level (Escherichia)
            ('562', 'genus', ['561']),
            # LCA (Enterobacteriaceae) is already above designated rank
            # (genus)
            ('543', 'genus', ['543']),
        )
        for lca, rank, expected in scenarios:
            # self_tax is cleared so that it has to be inferred each time
            analyzer.self_tax = None
            analyzer.lca = lca
            analyzer.self_rank = rank
            analyzer.infer_self_group()
            self.assertListEqual(analyzer.self_tax, expected)

Example #6

0

Show file

    def test_define_groups(self):
        """Check define_groups with user-defined and auto-inferred groups.

        The first scenario passes comma-separated taxId strings for the self
        and close groups; the second leaves both empty so that they are
        derived from ``lca``, ``self_rank`` and ``close_size``.
        """
        me = Analyze()
        me.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(me.taxdump)
        # initialized once; the original assigned this same empty dict twice
        me.groups = {}

        # user defined groups:
        # self: genera Escherichia and Shigella
        # close: family Enterobacteriaceae
        me.self_tax = '561,620'
        me.close_tax = '543'
        me.define_groups()
        self.assertListEqual(me.self_tax, ['561', '620'])
        exp = {'561', '562', '585056', '1038927', '2580236', '620', '622'}
        self.assertSetEqual(me.groups['self'], exp)
        self.assertListEqual(me.close_tax, ['543'])
        exp = {'543', '548', '570'}
        self.assertSetEqual(me.groups['close'], exp)

        # auto-infer groups (me.groups is deliberately not reset here)
        me.self_tax = {}
        me.close_tax = {}
        me.lca = '562'  # all inputs are E. coli
        me.self_rank = 'genus'  # but we want to raise self to genus
        me.close_size = 2  # close group must be this big or bigger
        me.define_groups()
        self.assertListEqual(me.self_tax, ['561'])
        exp = {'561', '562', '585056', '1038927', '2580236'}
        self.assertSetEqual(me.groups['self'], exp)
        self.assertListEqual(me.close_tax, ['543'])
        exp = {'543', '548', '570', '620', '622'}
        self.assertSetEqual(me.groups['close'], exp)

Example #7

0

Show file

    def test_infer_genome_tax(self):
        """Check infer_genome_tax at different coverage thresholds."""
        taxdump = taxdump_from_text(taxdump_proteo)

        # five proteins, in which four have hits
        hit_taxids = [
            ['562', '620', '570'],  # E. coli
            ['562', '585056', '1038927', '2'],  # E. coli
            ['561', '543', '776'],  # Escherichia
            ['548', '570', '1236'],  # K. aerogenes
            [],
        ]
        prots = []
        for tids in hit_taxids:
            prots.append({'hits': pd.DataFrame(tids, columns=['taxid'])})

        # 3 / 4 best hits assigned to Escherichia
        result = Analyze.infer_genome_tax(prots, taxdump, 75)
        self.assertTupleEqual(result, ('561', 75.0))

        # reduce coverage threshold: 2 / 4 best hits assigned to E. coli
        result = Analyze.infer_genome_tax(prots, taxdump, 50)
        self.assertTupleEqual(result, ('562', 50.0))

        # remove one protein that best matches E. coli;
        # 3 / 3 best hits are then assigned to Enterobacteriaceae
        del prots[0]
        result = Analyze.infer_genome_tax(prots, taxdump, 75)
        self.assertTupleEqual(result, ('543', 100.0))

        # no input protein
        with self.assertRaises(ValueError) as caught:
            Analyze.infer_genome_tax({}, taxdump, 75)
        self.assertEqual(str(caught.exception), 'Cannot auto-infer taxonomy.')

Example #8

0

Show file

File: test_util.py Project: DittmarLab/HGTector_legacy

 def test_add_children(self):
     """Check the 'children' entries that add_children attaches to taxa."""
     taxdump = taxdump_from_text(taxdump_archaea)
     add_children(taxdump)

     # child order is not checked here, so compare as sets
     expected_children = (
         ('1', {'131567'}),
         ('2157', {'1935183', '1783276'}),
         ('1655434', {'1655637'}),
     )
     for tid, children in expected_children:
         self.assertSetEqual(set(taxdump[tid]['children']), children)

     # taxId 2 is expected to end up with an empty list
     self.assertListEqual(taxdump['2']['children'], [])

Example #9

0

Show file

File: test_util.py Project: DittmarLab/HGTector_legacy

 def test_get_lineage(self):
     """Check the lineage lists returned by get_lineage for two taxIds."""
     taxdump = taxdump_from_text(taxdump_archaea)

     # each expected lineage starts with the query taxId and ends with '1'
     short_expected = ['2157', '131567', '1']
     self.assertListEqual(get_lineage('2157', taxdump), short_expected)

     long_expected = ['1538547', '1655637', '1655434', '1935183', '2157',
                      '131567', '1']
     self.assertListEqual(get_lineage('1538547', taxdump), long_expected)

Example #10

0

Show file

File: test_util.py Project: DittmarLab/HGTector_legacy

 def test_taxids_at_ranks(self):
     """Check the rank -> taxId mapping returned by taxids_at_ranks."""
     taxdump = taxdump_from_text(taxdump_archaea)
     ranks = ['phylum', 'class', 'genus', 'species']
     # expected taxId for each rank above, in the same order;
     # 'class' is expected to map to None
     expected_tids = ['1655434', None, '1655637', '1538547']
     expected = dict(zip(ranks, expected_tids))
     observed = taxids_at_ranks('1538547', ranks, taxdump)
     self.assertDictEqual(observed, expected)

Example #11

0

Show file

File: test_util.py Project: DittmarLab/HGTector_legacy

 def test_taxdump_from_text(self):
     """Check the size and two entries of the parsed archaea taxdump."""
     parsed = taxdump_from_text(taxdump_archaea)
     self.assertEqual(len(parsed), 9)

     # the root entry lists itself as its parent
     root = {'name': 'root', 'parent': '1', 'rank': 'no rank'}
     self.assertDictEqual(parsed['1'], root)

     archaea = {'name': 'Archaea', 'parent': '131567',
                'rank': 'superkingdom'}
     self.assertDictEqual(parsed['2157'], archaea)

Example #12

0

Show file

    def test_calc_scores(self):
        """Check group labels, group scores and match taxId from calc_scores.

        Three proteins across two samples are scored with weighting on and
        ``match_th`` set to 0.9.
        """
        columns = ('id', 'taxid', 'score')

        def _hits_df(data):
            # hit table indexed by the 'id' column
            return pd.DataFrame(data, columns=columns).set_index('id')

        analyzer = Analyze()
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(analyzer.taxdump)
        analyzer.groups = {
            'self': {'561', '562', '585056'},
            'close': {'543', '91347', '1236'},
        }

        # one record per protein: its own score plus its hit table
        s1_first = {
            'score': 100,
            'hits': _hits_df((('P1', '561', 100), ('P2', '562', 95))),
        }
        s1_second = {
            'score': 90,
            'hits': _hits_df((('P3', '561', 81), ('P4', '543', 72))),
        }
        s2_first = {
            'score': 96,
            'hits': _hits_df(
                (('P5', '561', 90), ('P6', '543', 84), ('P7', '620', 66))),
        }
        analyzer.data = {'S1': [s1_first, s1_second], 'S2': [s2_first]}
        analyzer.weighted = True
        analyzer.match_th = 0.9
        analyzer.calc_scores()

        def _check(prot, groups, scores, match):
            # per-hit group labels, then per-group scores, then match taxId
            self.assertListEqual(prot['hits']['group'].tolist(), groups)
            observed = [prot[key] for key in ('self', 'close', 'distal')]
            self.assertListEqual(observed, scores)
            self.assertEqual(prot['match'], match)

        _check(analyzer.data['S1'][0],
               ['self', 'self'], [1.95, 0.0, 0.0], '0')
        _check(analyzer.data['S1'][1],
               ['self', 'close'], [0.9, 0.8, 0.0], '0')
        _check(analyzer.data['S2'][0],
               ['self', 'close', 'distal'], [0.9375, 0.875, 0.6875], '620')

Example #13

0

Show file

File: test_util.py Project: DittmarLab/HGTector_legacy

    def test_find_lca(self):
        """Check find_lca on several taxId sets and on an unresolvable one."""
        taxdump = taxdump_from_text(taxdump_archaea)

        # (query taxIds, expected LCA)
        scenarios = (
            (['131567'], '131567'),
            (['1935183', '1783276'], '2157'),
            (['1935183', '1783276', '1655434'], '2157'),
            (['1935183', '1783276', '2157'], '2157'),
            (['1935183', '2'], '131567'),
            (['1', '2', '1'], '1'),
        )
        for query, expected in scenarios:
            self.assertEqual(find_lca(query, taxdump), expected)

        # 'x' is its own parent, so it is not connected to the other taxa
        taxdump['x'] = {'name': 'x', 'parent': 'x'}
        with self.assertRaises(ValueError) as caught:
            find_lca(['2', 'x'], taxdump)
        expected_msg = 'Cannot find LCA of taxIds in database.'
        self.assertEqual(str(caught.exception), expected_msg)

Example #14

0

Show file

    def test_write_hgt_list(self):
        """Check the per-sample files written by write_hgt_list.

        Covers the default output, a sample with a non-HGT row, raising the
        donor to family rank, and reporting donor names instead of taxIds.
        """
        analyzer = Analyze()
        analyzer.output = self.tmpdir
        hgt_dir = join(analyzer.output, 'hgts')
        makedirs(hgt_dir, exist_ok=True)
        analyzer.donor_name = False
        analyzer.donor_rank = None
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(analyzer.taxdump)
        rows = [
            ['S1', 'P1', 0.85, '562', True],
            ['S1', 'P2', 0.95, '622', True],
            ['S1', 'P3', 1.05, '0', True],
            ['S2', 'P4', 0.80, '766', True],
            ['S2', 'P5', 0.20, '0', False],
        ]
        analyzer.df = pd.DataFrame(
            rows, columns=['sample', 'protein', 'silh', 'match', 'hgt'])

        def _written(filename):
            # full text of one output file under the hgts directory
            with open(join(hgt_dir, filename), 'r') as handle:
                return handle.read()

        # default
        analyzer.write_hgt_list('S1')
        expected = ('P1\t0.85\t562\n' 'P2\t0.95\t622\n' 'P3\t1.05\t0\n')
        self.assertEqual(_written('S1.txt'), expected)

        # number format and negative result
        analyzer.write_hgt_list('S2')
        self.assertEqual(_written('S2.txt'), 'P4\t0.8\t766\n')

        # raise to family
        analyzer.donor_rank = 'family'
        analyzer.write_hgt_list('S1')
        expected = ('P1\t0.85\t543\n' 'P2\t0.95\t543\n' 'P3\t1.05\t0\n')
        self.assertEqual(_written('S1.txt'), expected)

        # report taxon name
        analyzer.donor_rank = None
        analyzer.donor_name = True
        analyzer.write_hgt_list('S1')
        expected = ('P1\t0.85\tEscherichia coli\n'
                    'P2\t0.95\tShigella dysenteriae\n'
                    'P3\t1.05\tN/A\n')
        self.assertEqual(_written('S1.txt'), expected)
        rmtree(hgt_dir)

Example #15

0

Show file

 def test_build_taxonmap(self):
     """Check the protein -> taxId map built in memory and written to disk."""
     db = Database()
     db.output = self.tmpdir
     db.taxdump = taxdump_from_text(taxdump_proteo)
     # candidate taxIds per protein; comments name the expected result
     db.p2tids = {
         'P1': {'766'},  # Rickettsiales
         'P2': {'570', '548'},  # Klebsiella
         'P3': {'620', '622'},  # Shigella
         'P4': {'561', '562'},  # Escherichia
         'P5': {'126792', '28211'},  # root
     }
     db.build_taxonmap()

     expected = {
         'P1': '766',
         'P2': '570',
         'P3': '620',
         'P4': '561',
         'P5': '1',
     }
     self.assertDictEqual(db.taxonmap, expected)

     # the gzipped file on disk must hold the same mapping
     map_path = join(self.tmpdir, 'taxon.map.gz')
     with gzip.open(map_path, 'rt') as handle:
         lines = handle.read().splitlines()
     on_disk = dict(line.split('\t') for line in lines)
     self.assertDictEqual(on_disk, expected)
     remove(map_path)

Example #16

0

Show file

    def test_sample_by_taxonomy(self):
        """Check which genomes sample_by_taxonomy keeps in several setups."""
        db = Database()

        # do nothing
        db.sample = None
        self.assertIsNone(db.sample_by_taxonomy())

        # genome table shared by the sampling scenarios below
        header = ('genome', 'taxid', 'refseq_category', 'assembly_level')
        records = (
            ('G1', '585056', '', 'Chromosome'),  # E. coli UMN026
            # E. coli O104:H4 (rep. genome to be prioritized over G1)
            ('G2', '1038927', 'representative genome', 'Chromosome'),
            ('G3', '2580236', '', 'Contig'),  # sync E. coli
            ('G4', '622', '', 'Scaffold'),  # Shigella
            ('G5', '548', '', 'Scaffold'),  # Klebsiella
            ('G6', '126792', 'reference genome', 'Contig'),  # plasmid
        )
        genomes = pd.DataFrame(records, columns=header)
        db.reference = False
        db.representative = False
        db.taxdump = taxdump_from_text(taxdump_proteo)

        def _sampled():
            # sample a fresh copy of the table and return the result
            db.df = genomes.copy()
            db.sample_by_taxonomy()
            return db.df

        # up to one genome per genus
        db.rank = 'genus'
        db.sample = 1
        result = _sampled()
        self.assertListEqual(result.columns.tolist(), [*header, 'genus'])
        self.assertListEqual(result['genome'].tolist(), ['G2', 'G4', 'G5'])

        # include reference genome (plasmid)
        db.reference = True
        result = _sampled()
        self.assertEqual(result['genome'].tolist()[-1], 'G6')

        # up to two genomes for entire cellular life
        db.rank = 'superkingdom'
        db.sample = 2
        db.reference = False
        result = _sampled()
        self.assertListEqual(result['genome'].tolist(), ['G1', 'G2'])

Example #17

0

Show file

 def test_build_taxdump(self):
     """Check the taxIds that build_taxdump writes to nodes.dmp."""
     db = Database()
     db.output = self.tmpdir
     db.tmpdir = join(self.datadir, 'DnaK', 'taxdump')
     db.taxdump = taxdump_from_text(taxdump_proteo)
     records = (
         ('G1', '1224'),  # Proteobacteria
         ('G2', '562'),  # Escherichia coli
         ('G3', '585056'),  # E. coli UMN026
         ('G4', '1038927'),  # E. coli O104:H4
     )
     table = pd.DataFrame(records, columns=['genome', 'taxid'])
     db.df = table.set_index('genome')
     db.build_taxdump()

     # collect the first tab-delimited field of every line in nodes.dmp
     dump_dir = join(self.tmpdir, 'taxdump')
     with open(join(dump_dir, 'nodes.dmp'), 'r') as handle:
         lines = handle.read().splitlines()
     written = {line.split('\t')[0] for line in lines}

     expected = {
         '1', '131567', '2', '1224', '1236', '91347', '543', '561', '562',
         '585056', '1038927',
     }
     self.assertSetEqual(written, expected)
     rmtree(dump_dir)

Example #18

0

Show file

    def test_identify_taxonomy(self):
        """Check which rows identify_taxonomy keeps under three filters."""
        db = Database()
        header = ('organism_name', 'taxid', 'species', 'species_taxid')
        records = (
            ('Escherichia coli UMN026', '585056', 'E. coli', '562'),
            ('Escherichia coli O104:H4', '1038927', 'E. coli', '562'),
            ('Klebsiella aerogenes', '548', 'Klebsiella aerogenes', '548'),
            ('unclassified Gammaproteobacteria', '118884', '', ''),
            ('Plasmid pPY113', '126792', '', ''),
        )
        genomes = pd.DataFrame(records, columns=header)

        # organism names must be capital and latinate
        db.capital = True
        db.block = None
        db.latin = True
        db.taxids = None
        db.exclude = False
        db.taxdump = taxdump_from_text(taxdump_proteo)
        db.df = genomes.copy()
        db.identify_taxonomy()
        self.assertNotIn('species_taxid', db.df.columns)
        kept = db.df.index.tolist()
        self.assertListEqual(kept, [0, 1, 2])
        species = db.df['species'].tolist()
        self.assertListEqual(species, ['562', '562', '548'])

        # block word
        db.block = 'plasmid'
        db.latin = False
        db.df = genomes.copy()
        db.identify_taxonomy()
        kept = db.df.index.tolist()
        self.assertListEqual(kept, [0, 1, 2])

        # no Escherichia
        db.taxids = '561'
        db.exclude = True
        db.df = genomes.copy()
        db.identify_taxonomy()
        kept = db.df.index.tolist()
        self.assertListEqual(kept, [2])

Example #19

0

Show file

File: test_util.py Project: DittmarLab/HGTector_legacy

 def test__get_taxon(self):
     """Check the error _get_taxon raises for a taxId it is given here."""
     taxdump = taxdump_from_text(taxdump_archaea)
     expected_msg = 'TaxID 12345 is not found in taxonomy database.'
     with self.assertRaises(ValueError) as caught:
         _get_taxon('12345', taxdump)
     self.assertEqual(str(caught.exception), expected_msg)

Example #20

0

Show file

File: test_util.py Project: DittmarLab/HGTector_legacy

 def test_is_ancestral(self):
     """Check is_ancestral for one positive and one negative case."""
     taxdump = taxdump_from_text(taxdump_archaea)
     query = '1538547'
     # expected True against {'2157'} and False against {'2'}
     under_archaea = is_ancestral(query, {'2157'}, taxdump)
     self.assertTrue(under_archaea)
     under_bacteria = is_ancestral(query, {'2'}, taxdump)
     self.assertFalse(under_bacteria)

Example #21

0

Show file

File: test_util.py Project: DittmarLab/HGTector_legacy

 def test_get_descendants(self):
     """Check the ordered descendant list of the Asgard group."""
     taxdump = taxdump_from_text(taxdump_archaea)
     add_children(taxdump)
     asgard = '1935183'  # Asgard group
     expected = ['1655434', '1655637', '1538547']
     self.assertListEqual(get_descendants(asgard, taxdump), expected)

Example #22

0

Show file

File: test_util.py Project: DittmarLab/HGTector_legacy

 def test_taxid_at_rank(self):
     """Check the taxId that taxid_at_rank returns for two ranks."""
     taxdump = taxdump_from_text(taxdump_archaea)
     # (rank, expected taxId) for query taxId 1538547
     expectations = (
         ('genus', '1655637'),
         ('phylum', '1655434'),
     )
     for rank, expected in expectations:
         self.assertEqual(taxid_at_rank('1538547', rank, taxdump), expected)

Example #23

0

Show file

File: test_util.py Project: DittmarLab/HGTector_legacy

 def test_refine_taxdump(self):
     """Check the taxIds that refine_taxdump returns for two domains."""
     taxdump = taxdump_from_text(taxdump_archaea)
     domains = ['2', '2157']  # Bacteria and Archaea
     refined = refine_taxdump(domains, taxdump)
     # plus cellular organisms and root
     expected = {'1', '131567', '2', '2157'}
     self.assertSetEqual(set(refined), expected)

Example #24

0

Show file

    def test_assign_taxonomy(self):
        """Check assign_taxonomy with given, inferred and invalid taxonomy."""

        def _hits_df(d):
            # helper for making hit table
            return pd.Series(d, name='taxid', dtype=object).to_frame()

        def _prot(hits):
            # protein record holding only a hit table
            return {'hits': _hits_df(hits)}

        def _assert_fails(analyzer, message):
            # assign_taxonomy must raise ValueError with this exact message
            with self.assertRaises(ValueError) as caught:
                analyzer.assign_taxonomy()
            self.assertEqual(str(caught.exception), message)

        # input are two genomes with defined taxonomy
        analyzer = Analyze()
        analyzer.input_tax = 'S1:561,S2:620'  # Escherichia and Shigella
        analyzer.data = {}
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        analyzer.assign_taxonomy()
        # test input taxonomy extraction
        self.assertDictEqual(analyzer.input_tax, {'S1': '561', 'S2': '620'})
        # test taxonomy refinement
        expected = {
            '1', '131567', '2', '1224', '1236', '91347', '543', '561', '620',
        }
        self.assertSetEqual(set(analyzer.taxdump.keys()), expected)
        # test LCA discovery
        self.assertEqual(analyzer.lca, '543')

        # input is one genome with defined taxonomy
        analyzer = Analyze()
        analyzer.data = {'S1': [{'hits': pd.DataFrame(columns=['taxid'])}]}
        analyzer.input_tax = '561'  # Escherichia
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        analyzer.assign_taxonomy()
        self.assertDictEqual(analyzer.input_tax, {'S1': '561'})

        # input taxonomy not found in database
        analyzer.input_tax = '1234'
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        _assert_fails(analyzer,
                      'TaxID 1234 is not present in taxonomy database.')

        # input are two genome whose taxonomies are to be inferred based on
        # search results
        analyzer = Analyze()
        analyzer.input_tax = None
        analyzer.data = {
            'S1': [
                _prot({'P1': '561', 'P2': '562'}),
                _prot({'P3': '543', 'P4': '561'}),
            ],
            'S2': [
                _prot({'P5': '562', 'P6': '585056'}),
                _prot({'P7': '561', 'P8': '1038927'}),
                _prot({'P9': '2580236'}),
            ],
        }
        analyzer.input_cov = 75
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        analyzer.assign_taxonomy()
        self.assertDictEqual(analyzer.input_tax, {'S1': '543', 'S2': '561'})
        self.assertEqual(analyzer.lca, '543')

        # cannot auto-infer taxonomy
        analyzer.data['S3'] = [_prot({})]
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        _assert_fails(
            analyzer,
            'Cannot auto-infer taxonomy for S3. Please specify manually.')

        # invalid input taxonomy string (taxdump is not reloaded here)
        analyzer.input_tax = '561'
        _assert_fails(analyzer, 'Invalid input taxonomy format.')