Example #1
0
    def test_sum_taxids(self):
        # sum_taxids() is expected to return the union of the input TaxIDs
        # and every TaxID found in the hit tables, without duplicates.

        # wrap a {hit id: taxid} mapping as a protein record whose hit table
        # has a single 'taxid' column
        def _protein(hit2taxid):
            return {'hits': pd.Series(hit2taxid, name='taxid').to_frame()}

        analyzer = Analyze()
        analyzer.input_tax = {'S1': '1', 'S2': '3'}
        analyzer.data = {
            'S1': [
                _protein({'a': '4', 'b': '6'}),
                _protein({'a': '4', 'c': '8'}),
            ],
            'S2': [
                _protein({'b': '6', 'd': '1'}),
            ],
        }

        # '4', '6' and '1' each occur twice in the inputs above
        expected = {'1', '3', '4', '6', '8'}
        self.assertSetEqual(analyzer.sum_taxids(), expected)
Example #2
0
    def test_calc_scores(self):
        # Feed calc_scores() three proteins with hand-made hit tables, then
        # check the group given to each hit, the per-group scores and the
        # 'match' value stored on every protein.

        # build one protein record; each row is (hit id, taxid, score) and
        # the hit id becomes the index of the hit table
        def _protein(score, *rows):
            table = pd.DataFrame(rows, columns=('id', 'taxid', 'score'))
            return {'score': score, 'hits': table.set_index('id')}

        me = Analyze()
        me.taxdump = taxdump_from_text(taxdump_proteo)
        add_children(me.taxdump)
        me.groups = {
            'self': {'561', '562', '585056'},
            'close': {'543', '91347', '1236'},
        }
        me.data = {
            'S1': [
                _protein(100, ('P1', '561', 100), ('P2', '562', 95)),
                _protein(90, ('P3', '561', 81), ('P4', '543', 72)),
            ],
            'S2': [
                _protein(96, ('P5', '561', 90), ('P6', '543', 84),
                         ('P7', '620', 66)),
            ],
        }
        me.weighted = True
        me.match_th = 0.9
        me.calc_scores()

        # (sample, protein index, group per hit, [self, close, distal], match)
        expectations = (
            ('S1', 0, ['self', 'self'], [1.95, 0.0, 0.0], '0'),
            ('S1', 1, ['self', 'close'], [0.9, 0.8, 0.0], '0'),
            ('S2', 0, ['self', 'close', 'distal'],
             [0.9375, 0.875, 0.6875], '620'),
        )
        for sample, idx, groups, scores, match in expectations:
            protein = me.data[sample][idx]
            self.assertListEqual(protein['hits']['group'].tolist(), groups)
            observed = [protein[key] for key in ('self', 'close', 'distal')]
            self.assertListEqual(observed, scores)
            self.assertEqual(protein['match'], match)
Example #3
0
 def test_make_score_table(self):
     # Check that make_score_table() builds me.df from the per-protein
     # records and writes the same rows to 'scores.tsv' in the output
     # directory. The output file is removed even if an assertion fails.
     me = Analyze()
     me.output = self.tmpdir
     me.data = {
         'S1': [{
             'id': 'P1',
             'length': 100,
             'match': '0',
             'self': 1.5,
             'close': 0.75,
             'distal': 0.0,
             'hits': pd.DataFrame([0] * 3)
         }, {
             'id': 'P2',
             'length': 120,
             'match': '1224',
             'self': 1.625,
             'close': 0.225,
             'distal': 0.375,
             'hits': pd.DataFrame([0] * 5)
         }],
         'S2': [{
             'id': 'P1',
             'length': 225,
             'match': '620',
             'self': 2.35,
             'close': 1.05,
             'distal': 0.75,
             'hits': pd.DataFrame([0] * 6)
         }]
     }
     me.make_score_table()
     fp = join(self.tmpdir, 'scores.tsv')
     try:
         # in-memory table; the fourth value of each row equals the number
         # of rows in that protein's 'hits' frame (3, 5 and 6)
         obs = me.df.values.tolist()
         exp = [['S1', 'P1', 100, 3, 1.5, 0.75, 0, '0'],
                ['S1', 'P2', 120, 5, 1.625, 0.225, 0.375, '1224'],
                ['S2', 'P1', 225, 6, 2.35, 1.05, 0.75, '620']]
         self.assertListEqual(obs, exp)

         # on-disk table; the first line is skipped (presumably the header)
         # and every remaining cell is compared as text
         with open(fp, 'r') as f:
             obs = [x.split('\t') for x in f.read().splitlines()[1:]]
         exp = [[str(y) for y in x] for x in exp]
         self.assertListEqual(obs, exp)
     finally:
         # clean up inside the test body rather than via addCleanup():
         # cleanups run after tearDown, which may already have dealt with
         # self.tmpdir
         try:
             remove(fp)
         except FileNotFoundError:
             # nothing was written; let the original failure surface
             pass
Example #4
0
    def test_assign_taxonomy(self):
        # Walk assign_taxonomy() through user-defined, single-genome and
        # auto-inferred input taxonomy plus three error paths. The scenarios
        # run in sequence and later ones reuse state left by earlier ones.

        # wrap a {hit id: taxid} mapping as a protein record with a hit table
        def _protein(hit2taxid):
            taxids = pd.Series(hit2taxid, name='taxid', dtype=object)
            return {'hits': taxids.to_frame()}

        # give the analyzer a freshly parsed copy of the test taxonomy
        def _reload_taxdump(analyzer):
            analyzer.taxdump = taxdump_from_text(taxdump_proteo)

        # assign_taxonomy() must raise ValueError with exactly this message
        def _expect_failure(analyzer, message):
            with self.assertRaises(ValueError) as ctx:
                analyzer.assign_taxonomy()
            self.assertEqual(str(ctx.exception), message)

        # --- two genomes with defined taxonomy ---
        run = Analyze()
        run.input_tax = 'S1:561,S2:620'  # Escherichia and Shigella
        run.data = {}
        _reload_taxdump(run)
        run.assign_taxonomy()
        # input taxonomy string is extracted into a sample-to-TaxID map
        self.assertDictEqual(run.input_tax, {'S1': '561', 'S2': '620'})
        # taxonomy is refined to the following TaxIDs only
        retained = {
            '1', '131567', '2', '1224', '1236', '91347', '543', '561', '620'
        }
        self.assertSetEqual(set(run.taxdump.keys()), retained)
        # LCA of the two genomes
        self.assertEqual(run.lca, '543')

        # --- one genome with defined taxonomy ---
        run = Analyze()
        run.data = {'S1': [{'hits': pd.DataFrame(columns=['taxid'])}]}
        run.input_tax = '561'  # Escherichia
        _reload_taxdump(run)
        run.assign_taxonomy()
        self.assertDictEqual(run.input_tax, {'S1': '561'})

        # --- input taxonomy not found in database (same analyzer) ---
        run.input_tax = '1234'
        _reload_taxdump(run)
        _expect_failure(
            run, 'TaxID 1234 is not present in taxonomy database.')

        # --- two genomes whose taxonomies are inferred from search results
        run = Analyze()
        run.input_tax = None
        run.data = {
            'S1': [
                _protein({'P1': '561', 'P2': '562'}),
                _protein({'P3': '543', 'P4': '561'}),
            ],
            'S2': [
                _protein({'P5': '562', 'P6': '585056'}),
                _protein({'P7': '561', 'P8': '1038927'}),
                _protein({'P9': '2580236'}),
            ],
        }
        run.input_cov = 75
        _reload_taxdump(run)
        run.assign_taxonomy()
        self.assertDictEqual(run.input_tax, {'S1': '543', 'S2': '561'})
        self.assertEqual(run.lca, '543')

        # --- cannot auto-infer taxonomy: S3 has an empty hit table ---
        run.data['S3'] = [_protein({})]
        _reload_taxdump(run)
        _expect_failure(
            run, 'Cannot auto-infer taxonomy for S3. Please specify manually.')

        # --- invalid input taxonomy string: a bare TaxID while the analyzer
        # holds three samples; the taxdump is deliberately not reloaded here
        run.input_tax = '561'
        _expect_failure(run, 'Invalid input taxonomy format.')