def test_sum_taxids(self):
    """Check the set of taxIDs returned by ``Analyze.sum_taxids``.

    The expected set in this fixture is the union of the input genome
    taxIDs and every taxID that appears in the per-protein hit tables.
    """
    analyzer = Analyze()
    analyzer.input_tax = {'S1': '1', 'S2': '3'}

    def _as_hit_table(mapping):
        # one-column frame named "taxid", indexed by the mapping's keys
        series = pd.Series(mapping, name='taxid')
        return series.to_frame()

    # sample -> list of proteins, each described by its {hit: taxid} map
    raw_hits = {
        'S1': [{'a': '4', 'b': '6'}, {'a': '4', 'c': '8'}],
        'S2': [{'b': '6', 'd': '1'}],
    }
    analyzer.data = {
        sample: [{'hits': _as_hit_table(hits)} for hits in proteins]
        for sample, proteins in raw_hits.items()
    }

    expected = {'1', '3', '4', '6', '8'}
    self.assertSetEqual(analyzer.sum_taxids(), expected)
def test_calc_scores(self):
    """Run ``Analyze.calc_scores`` on a small fixture and verify results.

    With ``weighted=True`` and ``match_th=0.9``, each protein is checked
    for three things: the ``group`` column added to its hit table, its
    ``self`` / ``close`` / ``distal`` scores, and its ``match`` value.
    """
    hit_columns = ('id', 'taxid', 'score')
    score_keys = ('self', 'close', 'distal')

    def _hit_table(*rows):
        # rows arrive as a tuple of (id, taxid, score) tuples
        table = pd.DataFrame(rows, columns=hit_columns)
        return table.set_index('id')

    analyzer = Analyze()
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    add_children(analyzer.taxdump)
    analyzer.groups = {
        'self': {'561', '562', '585056'},
        'close': {'543', '91347', '1236'},
    }
    analyzer.data = {
        'S1': [
            {
                'score': 100,
                'hits': _hit_table(('P1', '561', 100), ('P2', '562', 95)),
            },
            {
                'score': 90,
                'hits': _hit_table(('P3', '561', 81), ('P4', '543', 72)),
            },
        ],
        'S2': [
            {
                'score': 96,
                'hits': _hit_table(
                    ('P5', '561', 90),
                    ('P6', '543', 84),
                    ('P7', '620', 66),
                ),
            },
        ],
    }
    analyzer.weighted = True
    analyzer.match_th = 0.9
    analyzer.calc_scores()

    # (sample, protein index, hit groups, [self, close, distal], match)
    expectations = (
        ('S1', 0, ['self', 'self'], [1.95, 0.0, 0.0], '0'),
        ('S1', 1, ['self', 'close'], [0.9, 0.8, 0.0], '0'),
        ('S2', 0, ['self', 'close', 'distal'],
         [0.9375, 0.875, 0.6875], '620'),
    )
    for sample, index, groups, scores, match in expectations:
        protein = analyzer.data[sample][index]
        self.assertListEqual(protein['hits']['group'].tolist(), groups)
        self.assertListEqual([protein[key] for key in score_keys], scores)
        self.assertEqual(protein['match'], match)
def test_make_score_table(self):
    """Verify the score table built by ``Analyze.make_score_table``.

    Both the in-memory table (``df``) and the ``scores.tsv`` file written
    to the output directory are compared against the same expected rows.
    """
    record_keys = ('id', 'length', 'match', 'self', 'close', 'distal')

    def _protein(values, n_hits):
        # only the number of rows in the placeholder hit table varies
        record = dict(zip(record_keys, values))
        record['hits'] = pd.DataFrame([0] * n_hits)
        return record

    analyzer = Analyze()
    analyzer.output = self.tmpdir
    analyzer.data = {
        'S1': [
            _protein(('P1', 100, '0', 1.5, 0.75, 0.0), 3),
            _protein(('P2', 120, '1224', 1.625, 0.225, 0.375), 5),
        ],
        'S2': [
            _protein(('P1', 225, '620', 2.35, 1.05, 0.75), 6),
        ],
    }
    analyzer.make_score_table()

    # NOTE: the integer 0 in the first row is deliberate; the rows are
    # stringified below, and str(0) is '0' whereas str(0.0) is '0.0'.
    expected = [
        ['S1', 'P1', 100, 3, 1.5, 0.75, 0, '0'],
        ['S1', 'P2', 120, 5, 1.625, 0.225, 0.375, '1224'],
        ['S2', 'P1', 225, 6, 2.35, 1.05, 0.75, '620'],
    ]
    self.assertListEqual(analyzer.df.values.tolist(), expected)

    # compare the written file, skipping its first (header) line
    table_path = join(self.tmpdir, 'scores.tsv')
    with open(table_path, 'r') as handle:
        lines = handle.read().splitlines()
    written = [line.split('\t') for line in lines[1:]]
    expected_text = [list(map(str, row)) for row in expected]
    self.assertListEqual(written, expected_text)
    remove(table_path)
def test_assign_taxonomy(self):
    """Exercise ``Analyze.assign_taxonomy`` across several input modes.

    Scenarios, in order: two genomes with explicit taxIDs; one genome
    with a bare taxID; a taxID missing from the database; taxonomy
    inferred from search hits; a genome whose taxonomy cannot be
    inferred; and an input taxonomy string in an invalid format.
    """

    def _hit_table(mapping):
        # one-column "taxid" frame; dtype is pinned to object explicitly
        series = pd.Series(mapping, name='taxid', dtype=object)
        return series.to_frame()

    def _assert_value_error(target, message):
        # assign_taxonomy must raise ValueError with exactly this text
        with self.assertRaises(ValueError) as caught:
            target.assign_taxonomy()
        self.assertEqual(str(caught.exception), message)

    # --- two genomes, taxonomy given as "sample:taxid" pairs ---
    analyzer = Analyze()
    analyzer.input_tax = 'S1:561,S2:620'  # Escherichia and Shigella
    analyzer.data = {}
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    analyzer.assign_taxonomy()

    # the input string is parsed into a sample -> taxid mapping
    self.assertDictEqual(analyzer.input_tax, {'S1': '561', 'S2': '620'})

    # the taxonomy database is reduced to these taxIDs
    retained = {
        '1', '131567', '2', '1224', '1236', '91347', '543', '561', '620',
    }
    self.assertSetEqual(set(analyzer.taxdump.keys()), retained)

    # the lowest common ancestor of the two genomes
    self.assertEqual(analyzer.lca, '543')

    # --- single genome, taxonomy given as a bare taxID ---
    analyzer = Analyze()
    analyzer.data = {'S1': [{'hits': pd.DataFrame(columns=['taxid'])}]}
    analyzer.input_tax = '561'  # Escherichia
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    analyzer.assign_taxonomy()
    self.assertDictEqual(analyzer.input_tax, {'S1': '561'})

    # --- taxID absent from the taxonomy database ---
    analyzer.input_tax = '1234'
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    _assert_value_error(
        analyzer, 'TaxID 1234 is not present in taxonomy database.')

    # --- two genomes, taxonomy inferred from search results ---
    hit_taxids = {
        'S1': [
            {'P1': '561', 'P2': '562'},
            {'P3': '543', 'P4': '561'},
        ],
        'S2': [
            {'P5': '562', 'P6': '585056'},
            {'P7': '561', 'P8': '1038927'},
            {'P9': '2580236'},
        ],
    }
    analyzer = Analyze()
    analyzer.input_tax = None
    analyzer.data = {
        sample: [{'hits': _hit_table(hits)} for hits in proteins]
        for sample, proteins in hit_taxids.items()
    }
    analyzer.input_cov = 75
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    analyzer.assign_taxonomy()
    self.assertDictEqual(analyzer.input_tax, {'S1': '543', 'S2': '561'})
    self.assertEqual(analyzer.lca, '543')

    # --- a genome with no hits cannot have its taxonomy inferred ---
    analyzer.data['S3'] = [{'hits': _hit_table({})}]
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    _assert_value_error(
        analyzer,
        'Cannot auto-infer taxonomy for S3. '
        'Please specify manually.')

    # --- bare taxID while data holds three genomes: invalid format ---
    analyzer.input_tax = '561'
    _assert_value_error(analyzer, 'Invalid input taxonomy format.')