def test_find_match(self):
    """Exercise `Analyze.find_match` on one hit table at several thresholds.

    The same seven-row hit table is evaluated with `match_th` set to
    0.99, 0.9 and 0.8 in turn, and finally an empty DataFrame is passed
    in. Each call must return the expected taxid string.
    """
    analyzer = Analyze()
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    add_children(analyzer.taxdump)

    # one (score, taxid) pair per hit, listed from best to worst score
    hit_rows = [
        (100, '585056'),   # E. coli UMN026
        (99, '1038927'),   # E. coli O104:H4
        (97, '562'),       # Escherichia coli
        (95, '622'),       # Shigella dysenteriae
        (92, '543'),       # Enterobacteriaceae
        (88, '548'),       # Klebsiella aerogenes
        (80, '766'),       # Rickettsiales
    ]
    hits = pd.DataFrame(hit_rows, columns=['score', 'taxid'])

    # (match threshold, taxid that find_match must return)
    expectations = (
        (0.99, '562'),   # keep top 1% hits
        (0.9, '543'),    # keep top 10% hits
        (0.8, '1224'),   # keep top 20% hits
    )
    for threshold, expected_taxid in expectations:
        analyzer.match_th = threshold
        self.assertEqual(analyzer.find_match(hits), expected_taxid)

    # input DataFrame is empty (match_th is still 0.8 at this point)
    self.assertEqual(analyzer.find_match(pd.DataFrame()), '0')
def test_calc_scores(self):
    """Exercise `Analyze.calc_scores` on two samples holding three proteins.

    After the call, every protein record must carry a 'group' column in
    its hit table, numeric 'self' / 'close' / 'distal' entries, and a
    'match' taxid string.
    """
    hit_columns = ('id', 'taxid', 'score')

    def _make_hits(*rows):
        # build a hit table from (id, taxid, score) rows, indexed by id
        table = pd.DataFrame(rows, columns=hit_columns)
        return table.set_index('id')

    analyzer = Analyze()
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    add_children(analyzer.taxdump)
    analyzer.groups = {
        'self': {'561', '562', '585056'},
        'close': {'543', '91347', '1236'},
    }
    analyzer.data = {
        'S1': [
            {
                'score': 100,
                'hits': _make_hits(('P1', '561', 100),
                                   ('P2', '562', 95)),
            },
            {
                'score': 90,
                'hits': _make_hits(('P3', '561', 81),
                                   ('P4', '543', 72)),
            },
        ],
        'S2': [
            {
                'score': 96,
                'hits': _make_hits(('P5', '561', 90),
                                   ('P6', '543', 84),
                                   ('P7', '620', 66)),
            },
        ],
    }
    analyzer.weighted = True
    analyzer.match_th = 0.9
    analyzer.calc_scores()

    score_keys = ('self', 'close', 'distal')

    # (sample, protein index, hit groups, [self, close, distal], match)
    # NOTE(review): each expected score equals the sum of
    # hit score / protein score within that group (e.g. 81 / 90 = 0.9);
    # confirm against calc_scores that this is the weighting applied.
    expectations = (
        ('S1', 0, ['self', 'self'], [1.95, 0.0, 0.0], '0'),
        ('S1', 1, ['self', 'close'], [0.9, 0.8, 0.0], '0'),
        ('S2', 0, ['self', 'close', 'distal'],
         [0.9375, 0.875, 0.6875], '620'),
    )
    for sample, index, groups, scores, match in expectations:
        protein = analyzer.data[sample][index]
        self.assertListEqual(protein['hits']['group'].tolist(), groups)
        self.assertListEqual([protein[key] for key in score_keys], scores)
        self.assertEqual(protein['match'], match)