Exemple #1
0
    def test_sum_taxids(self):
        """Check the set of TaxIDs returned by ``Analyze.sum_taxids``.

        The fixture assigns one TaxID to each of two samples and attaches
        hit tables holding further TaxIDs; the expected result is the set of
        all distinct TaxIDs found in either place.
        """
        def make_hits(mapping):
            # single-column ('taxid') table indexed by the mapping's keys
            column = pd.Series(mapping, name='taxid')
            return column.to_frame()

        analyzer = Analyze()
        analyzer.input_tax = {'S1': '1', 'S2': '3'}

        # sample -> one {hit id: taxid} mapping per record of that sample
        hits_by_sample = {
            'S1': [{'a': '4', 'b': '6'}, {'a': '4', 'c': '8'}],
            'S2': [{'b': '6', 'd': '1'}],
        }
        analyzer.data = {
            sample: [{'hits': make_hits(mapping)} for mapping in mappings]
            for sample, mappings in hits_by_sample.items()
        }

        observed = analyzer.sum_taxids()
        expected = {'1', '3', '4', '6', '8'}
        self.assertSetEqual(observed, expected)
Exemple #2
0
    def test_assign_taxonomy(self):
        """Exercise ``Analyze.assign_taxonomy`` over several input scenarios.

        Scenarios, in order: two genomes with user-defined taxonomy, one
        genome with user-defined taxonomy, a TaxID missing from the database,
        taxonomy inferred from search hits, a genome whose taxonomy cannot be
        inferred, and a malformed input taxonomy string.
        """
        def make_hits(mapping):
            # explicit object dtype, also applied when the mapping is empty
            column = pd.Series(mapping, name='taxid', dtype=object)
            return column.to_frame()

        def assert_fails(obj, message):
            # assign_taxonomy must raise ValueError carrying exactly `message`
            with self.assertRaises(ValueError) as raised:
                obj.assign_taxonomy()
            self.assertEqual(str(raised.exception), message)

        # --- two genomes with defined taxonomy ---
        analyzer = Analyze()
        analyzer.input_tax = 'S1:561,S2:620'  # Escherichia and Shigella
        analyzer.data = {}
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        analyzer.assign_taxonomy()
        # the input string is parsed into a sample-to-TaxID dict
        self.assertDictEqual(
            analyzer.input_tax, {'S1': '561', 'S2': '620'})
        # the taxonomy database is reduced to these TaxIDs
        kept_taxids = set(analyzer.taxdump.keys())
        expected_taxids = {
            '1', '131567', '2', '1224', '1236', '91347', '543', '561', '620'
        }
        self.assertSetEqual(kept_taxids, expected_taxids)
        # the LCA of the two input TaxIDs is found
        self.assertEqual(analyzer.lca, '543')

        # --- one genome with defined taxonomy ---
        analyzer = Analyze()
        empty_hits = pd.DataFrame(columns=['taxid'])
        analyzer.data = {'S1': [{'hits': empty_hits}]}
        analyzer.input_tax = '561'  # Escherichia
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        analyzer.assign_taxonomy()
        self.assertDictEqual(analyzer.input_tax, {'S1': '561'})

        # --- input taxonomy not found in database (same instance) ---
        analyzer.input_tax = '1234'
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        assert_fails(
            analyzer, 'TaxID 1234 is not present in taxonomy database.')

        # --- two genomes whose taxonomies are inferred from search results
        analyzer = Analyze()
        analyzer.input_tax = None
        # sample -> one {hit id: taxid} mapping per record of that sample
        hits_by_sample = {
            'S1': [
                {'P1': '561', 'P2': '562'},
                {'P3': '543', 'P4': '561'},
            ],
            'S2': [
                {'P5': '562', 'P6': '585056'},
                {'P7': '561', 'P8': '1038927'},
                {'P9': '2580236'},
            ],
        }
        analyzer.data = {
            sample: [{'hits': make_hits(mapping)} for mapping in mappings]
            for sample, mappings in hits_by_sample.items()
        }
        analyzer.input_cov = 75
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        analyzer.assign_taxonomy()
        self.assertDictEqual(
            analyzer.input_tax, {'S1': '543', 'S2': '561'})
        self.assertEqual(analyzer.lca, '543')

        # --- cannot auto-infer taxonomy: S3 has an empty hit table ---
        analyzer.data['S3'] = [{'hits': make_hits({})}]
        analyzer.taxdump = taxdump_from_text(taxdump_proteo)
        assert_fails(
            analyzer,
            'Cannot auto-infer taxonomy for S3. Please specify manually.')

        # --- invalid input taxonomy string (data still holds S1, S2, S3) ---
        analyzer.input_tax = '561'
        assert_fails(analyzer, 'Invalid input taxonomy format.')