Exemple #1
0
    def test_find_match(self):
        # find_match is exercised at three match_th values, then on an
        # empty hit table
        analyzer = Analyze()
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(analyzer.taxdump)

        # one [score, taxid] row per hit
        rows = [
            [100, '585056'],  # E. coli UMN026
            [99, '1038927'],  # E. coli O104:H4
            [97, '562'],  # Escherichia coli
            [95, '622'],  # Shigella dysenteriae
            [92, '543'],  # Enterobacteriaceae
            [88, '548'],  # Klebsiella aerogenes
            [80, '766'],  # Rickettsiales
        ]
        hits = pd.DataFrame(rows, columns=['score', 'taxid'])

        # (match_th, expected match), checked in this order
        cases = (
            (0.99, '562'),  # keep top 1% hits
            (0.9, '543'),  # keep top 10% hits
            (0.8, '1224'),  # keep top 20% hits
        )
        for threshold, expected in cases:
            analyzer.match_th = threshold
            self.assertEqual(analyzer.find_match(hits), expected)

        # input DataFrame is empty
        no_hits = pd.DataFrame()
        self.assertEqual(analyzer.find_match(no_hits), '0')
Exemple #2
0
 def test_add_children(self):
     # copy the top-level mapping of the fixture (values are shared)
     tree = dict(taxdump_archaea.items())
     add_children(tree)

     # children are compared as sets, so their order does not matter here
     kids_2157 = set(tree['2157']['children'])
     self.assertSetEqual(kids_2157, {'1935183', '1783276'})
     kids_1655434 = set(tree['1655434']['children'])
     self.assertSetEqual(kids_1655434, {'1655637'})

     # taxon '2' is expected to end up with an empty children list
     self.assertListEqual(tree['2']['children'], [])
Exemple #3
0
    def test_infer_close_group(self):
        # three scenarios run against the same Analyze instance, each
        # resetting close_tax / close_size before calling infer_close_group
        analyzer = Analyze()
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(analyzer.taxdump)
        analyzer.groups = {}

        # close group is parent of LCA of self group
        analyzer.self_tax = ['562']  # E. coli
        analyzer.groups['self'] = {
            '562', *get_descendants('562', analyzer.taxdump)}
        analyzer.close_tax = None
        analyzer.close_size = None
        analyzer.infer_close_group()
        self.assertListEqual(analyzer.close_tax, ['561'])  # Escherichia
        self.assertSetEqual(analyzer.groups['close'], {'561', '2580236'})

        # close group must have at least 5 taxa
        analyzer.close_tax = None
        analyzer.groups['close'] = None
        analyzer.close_size = 5
        analyzer.infer_close_group()
        # Enterobacteriaceae
        self.assertListEqual(analyzer.close_tax, ['543'])
        self.assertSetEqual(
            analyzer.groups['close'],
            {'543', '620', '622', '570', '548', '561', '2580236'})

        # close group is LCA of multiple self groups
        analyzer.self_tax = ['561', '620']  # Escherichia and Shigella
        self_members = set()
        for tid in analyzer.self_tax:
            # each self taxon plus everything below it
            self_members.add(tid)
            self_members.update(get_descendants(tid, analyzer.taxdump))
        analyzer.groups['self'] = self_members
        analyzer.close_tax = None
        analyzer.groups['close'] = None
        analyzer.close_size = None
        analyzer.infer_close_group()
        # Enterobacteriaceae
        self.assertListEqual(analyzer.close_tax, ['543'])
        self.assertSetEqual(analyzer.groups['close'], {'543', '570', '548'})
Exemple #4
0
    def test_infer_self_group(self):
        # infer_self_group is run once per case after resetting self_tax
        analyzer = Analyze()
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(analyzer.taxdump)

        # (lca, self_rank, expected self_tax), checked in this order
        cases = (
            # assign to LCA of all genomes (E. coli)
            ('562', None, ['562']),
            # raise LCA to genus level (Escherichia)
            ('562', 'genus', ['561']),
            # LCA (Enterobacteriaceae) is already above designated rank
            # (genus)
            ('543', 'genus', ['543']),
        )
        for lca, rank, expected in cases:
            analyzer.self_tax = None
            analyzer.lca = lca
            analyzer.self_rank = rank
            analyzer.infer_self_group()
            self.assertListEqual(analyzer.self_tax, expected)
Exemple #5
0
    def test_define_groups(self):
        # define_groups is checked first with user-supplied taxIds, then
        # with empty self/close settings so that both are inferred
        analyzer = Analyze()
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(analyzer.taxdump)
        analyzer.groups = {}

        # user defined groups:
        # self: genera Escherichia and Shigella
        # close: family Enterobacteriaceae
        analyzer.self_tax = '561,620'
        analyzer.close_tax = '543'
        analyzer.define_groups()
        self.assertListEqual(analyzer.self_tax, ['561', '620'])
        self.assertSetEqual(
            analyzer.groups['self'],
            {'561', '562', '585056', '1038927', '2580236', '620', '622'})
        self.assertListEqual(analyzer.close_tax, ['543'])
        self.assertSetEqual(analyzer.groups['close'], {'543', '548', '570'})

        # auto-infer groups
        analyzer.self_tax = {}
        analyzer.close_tax = {}
        # all inputs are E. coli
        analyzer.lca = '562'
        # but we want to raise self to genus
        analyzer.self_rank = 'genus'
        # close group must be this big or bigger
        analyzer.close_size = 2
        analyzer.define_groups()
        self.assertListEqual(analyzer.self_tax, ['561'])
        self.assertSetEqual(
            analyzer.groups['self'],
            {'561', '562', '585056', '1038927', '2580236'})
        self.assertListEqual(analyzer.close_tax, ['543'])
        self.assertSetEqual(
            analyzer.groups['close'],
            {'543', '548', '570', '620', '622'})
 def test_add_children(self):
     # build the taxonomy from the text fixture, then attach children
     tree = taxdump_from_text(taxdump_archaea)
     add_children(tree)

     # children are compared as sets, so their order does not matter here
     expected_children = (
         ('1', {'131567'}),
         ('2157', {'1935183', '1783276'}),
         ('1655434', {'1655637'}),
     )
     for parent, kids in expected_children:
         self.assertSetEqual(set(tree[parent]['children']), kids)

     # taxon '2' is expected to end up with an empty children list
     self.assertListEqual(tree['2']['children'], [])
Exemple #7
0
    def test_calc_scores(self):
        # calc_scores is run once on three proteins across two samples;
        # the group labels, per-group scores and match are then checked
        analyzer = Analyze()
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(analyzer.taxdump)
        analyzer.groups = {
            'self': {'561', '562', '585056'},
            'close': {'543', '91347', '1236'},
        }

        header = ('id', 'taxid', 'score')

        def make_hits(*rows):
            # hit table built from (id, taxid, score) rows, indexed by 'id'
            return pd.DataFrame(rows, columns=header).set_index('id')

        analyzer.data = {
            'S1': [
                {
                    'score': 100,
                    'hits': make_hits(('P1', '561', 100), ('P2', '562', 95)),
                },
                {
                    'score': 90,
                    'hits': make_hits(('P3', '561', 81), ('P4', '543', 72)),
                },
            ],
            'S2': [
                {
                    'score': 96,
                    'hits': make_hits(('P5', '561', 90), ('P6', '543', 84),
                                      ('P7', '620', 66)),
                },
            ],
        }
        analyzer.weighted = True
        analyzer.match_th = 0.9
        analyzer.calc_scores()

        # (sample, protein index, hit groups, [self, close, distal], match)
        expectations = (
            ('S1', 0, ['self', 'self'], [1.95, 0.0, 0.0], '0'),
            ('S1', 1, ['self', 'close'], [0.9, 0.8, 0.0], '0'),
            ('S2', 0, ['self', 'close', 'distal'],
             [0.9375, 0.875, 0.6875], '620'),
        )
        for sample, idx, groups, scores, match in expectations:
            prot = analyzer.data[sample][idx]
            self.assertListEqual(prot['hits']['group'].tolist(), groups)
            observed = [prot[key] for key in ('self', 'close', 'distal')]
            self.assertListEqual(observed, scores)
            self.assertEqual(prot['match'], match)
Exemple #8
0
    def assign_taxonomy(self):
        """Assign taxonomy to genomes.

        User-specified taxIds in ``self.input_tax`` are parsed and checked
        against ``self.taxdump``. Every sample in ``self.data`` that still
        lacks a taxId gets one from ``self.infer_genome_tax``. The taxonomy
        database is then passed to ``refine_taxdump`` and ``add_children``,
        and the lowest common ancestor of all assigned taxIds is stored in
        ``self.lca``. Progress is reported with ``print``.

        Raises
        ------
        ValueError
            If the user-specified taxonomy cannot be parsed while there is
            more than one sample, if a specified taxId is not present in the
            taxonomy database, or if ``self.infer_genome_tax`` fails for a
            sample. Re-raised errors carry the original error as their cause.
        """
        # take user-defined taxIds of input genomes
        if self.input_tax:
            try:
                self.input_tax = dict_from_param(self.input_tax)
            except ValueError as err:
                if len(self.data) > 1:
                    # chain explicitly so the traceback shows the parsing
                    # failure as the direct cause
                    raise ValueError(
                        'Invalid input taxonomy format.') from err
                # for single-sample analysis, one can simply enter a taxId
                self.input_tax = {max(self.data): self.input_tax}
            print('User-specified TaxIDs of input genomes:')
            for sid, tid in sorted(self.input_tax.items()):
                if tid not in self.taxdump:
                    # TODO: read from both temp and master taxdump
                    raise ValueError('TaxID {} is not present in taxonomy '
                                     'database.'.format(tid))
                print('  {}: {} ({}).'.format(sid, tid,
                                              self.taxdump[tid]['name']))
        else:
            self.input_tax = {}

        # auto-infer taxIds of remaining genomes
        sids = sorted(x for x in self.data if x not in self.input_tax)
        if sids:
            print('Auto-inferring plausible taxIds for input genomes based on '
                  'taxonomy of search results...')
            for sid in sids:
                # only the inference call is guarded; the bookkeeping below
                # cannot raise the error being translated
                try:
                    tid, cov = self.infer_genome_tax(self.data[sid],
                                                     self.taxdump,
                                                     self.input_cov)
                except ValueError as err:
                    raise ValueError(
                        'Cannot auto-infer taxonomy for {}. '
                        'Please specify manually.'.format(sid)) from err
                self.input_tax[sid] = tid
                print('  {}: {} ({}) (covering {:2g}% best hits).'.format(
                    sid, tid, self.taxdump[tid]['name'], cov))

        # refine taxonomy database
        print('Refining taxonomy database...')
        refine_taxdump(self.sum_taxids(), self.taxdump)
        add_children(self.taxdump)
        print('Done. Retained {} taxa.'.format(len(self.taxdump)))

        # find lowest common ancestor (LCA) of all genomes
        self.lca = find_lca(self.input_tax.values(), self.taxdump)
        print('All input genomes belong to {} ({}).'.format(
            self.lca, describe_taxon(self.lca, self.taxdump)))
Exemple #9
0
    def test_write_hgt_list(self):
        # write_hgt_list is run per sample under several donor_rank /
        # donor_name settings and the written file is compared verbatim
        analyzer = Analyze()
        analyzer.output = self.tmpdir
        makedirs(join(analyzer.output, 'hgts'), exist_ok=True)
        analyzer.donor_name = False
        analyzer.donor_rank = None
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(analyzer.taxdump)
        records = [
            ['S1', 'P1', 0.85, '562', True],
            ['S1', 'P2', 0.95, '622', True],
            ['S1', 'P3', 1.05, '0', True],
            ['S2', 'P4', 0.80, '766', True],
            ['S2', 'P5', 0.20, '0', False],
        ]
        analyzer.df = pd.DataFrame(
            records, columns=['sample', 'protein', 'silh', 'match', 'hgt'])

        def written(sample):
            # run write_hgt_list for one sample, then read its file back
            analyzer.write_hgt_list(sample)
            path = join(analyzer.output, 'hgts', sample + '.txt')
            with open(path, 'r') as fh:
                return fh.read()

        # default
        expected = ('P1\t0.85\t562\n' 'P2\t0.95\t622\n' 'P3\t1.05\t0\n')
        self.assertEqual(written('S1'), expected)

        # number format and negative result
        self.assertEqual(written('S2'), 'P4\t0.8\t766\n')

        # raise to family
        analyzer.donor_rank = 'family'
        expected = ('P1\t0.85\t543\n' 'P2\t0.95\t543\n' 'P3\t1.05\t0\n')
        self.assertEqual(written('S1'), expected)

        # report taxon name
        analyzer.donor_rank = None
        analyzer.donor_name = True
        expected = ('P1\t0.85\tEscherichia coli\n'
                    'P2\t0.95\tShigella dysenteriae\n'
                    'P3\t1.05\tN/A\n')
        self.assertEqual(written('S1'), expected)

        # remove the output subdirectory created above
        rmtree(join(analyzer.output, 'hgts'))
 def test_get_descendants(self):
     # build the taxonomy from the text fixture, then attach children
     tree = taxdump_from_text(taxdump_archaea)
     add_children(tree)

     # descendants of the Asgard group; list equality also pins the order
     expected = ['1655434', '1655637', '1538547']
     self.assertListEqual(get_descendants('1935183', tree), expected)
Exemple #11
0
 def test_get_descendants(self):
     # copy the top-level mapping of the fixture (values are shared)
     tree = dict(taxdump_archaea.items())
     add_children(tree)

     # descendants of the Asgard group; list equality also pins the order
     expected = ['1655434', '1655637', '1538547']
     self.assertListEqual(get_descendants('1935183', tree), expected)