def test_define_groups(self):
    """Exercise `define_groups` with user-defined, then auto-inferred groups.

    Both scenarios run on the same `Analyze` instance, and each one checks
    the resulting `self_tax` / `close_tax` lists and the taxId sets stored
    under `groups['self']` and `groups['close']`.
    """
    obj = Analyze()
    obj.taxdump = taxdump_from_text(taxdump_proteo)
    add_children(obj.taxdump)
    obj.groups = {}

    def _expect(self_taxa, self_members, close_taxa, close_members):
        # assertion order: self list, self set, close list, close set
        self.assertListEqual(obj.self_tax, self_taxa)
        self.assertSetEqual(obj.groups['self'], self_members)
        self.assertListEqual(obj.close_tax, close_taxa)
        self.assertSetEqual(obj.groups['close'], close_members)

    # --- scenario 1: user-defined groups -------------------------------
    #   self:  genera Escherichia and Shigella
    #   close: family Enterobacteriaceae
    # groups is reset again here (it is already empty at this point)
    obj.groups = {}
    obj.self_tax, obj.close_tax = '561,620', '543'
    obj.define_groups()
    _expect(
        ['561', '620'],
        {'561', '562', '585056', '1038927', '2580236', '620', '622'},
        ['543'],
        {'543', '548', '570'},
    )

    # --- scenario 2: auto-inferred groups ------------------------------
    # note: obj.groups still holds the result of scenario 1 at this point
    obj.self_tax, obj.close_tax = {}, {}
    obj.lca = '562'           # all inputs are E. coli
    obj.self_rank = 'genus'   # but we want to raise self to genus
    obj.close_size = 2        # close group must be this big or bigger
    obj.define_groups()
    _expect(
        ['561'],
        {'561', '562', '585056', '1038927', '2580236'},
        ['543'],
        {'543', '548', '570', '620', '622'},
    )
def test_infer_close_group(self):
    """Exercise `infer_close_group` under three configurations.

    1. Single self taxon, no size requirement.
    2. Same self taxon, close group required to hold at least 5 taxa.
    3. Two self taxa, no size requirement.
    """
    obj = Analyze()
    obj.taxdump = taxdump_from_text(taxdump_proteo)
    add_children(obj.taxdump)
    obj.groups = {}

    def _clade(taxid):
        # the taxon itself plus everything get_descendants reports for it
        return {taxid}.union(get_descendants(taxid, obj.taxdump))

    def _infer_and_expect(taxa, members):
        # run the inference, then check the chosen taxa and the group set
        obj.infer_close_group()
        self.assertListEqual(obj.close_tax, taxa)
        self.assertSetEqual(obj.groups['close'], members)

    # close group is parent of LCA of self group
    obj.self_tax = ['562']  # E. coli
    obj.groups['self'] = _clade('562')
    obj.close_tax = None
    obj.close_size = None
    _infer_and_expect(
        ['561'],  # Escherichia
        {'561', '2580236'},
    )

    # close group must have at least 5 taxa
    obj.close_tax = None
    obj.groups['close'] = None
    obj.close_size = 5
    _infer_and_expect(
        ['543'],  # Enterobacteriaceae
        {'543', '620', '622', '570', '548', '561', '2580236'},
    )

    # close group is LCA of multiple self groups
    obj.self_tax = ['561', '620']  # Escherichia and Shigella
    merged = set()
    for taxid in obj.self_tax:
        merged |= _clade(taxid)
    obj.groups['self'] = merged
    obj.close_tax = None
    obj.groups['close'] = None
    obj.close_size = None
    _infer_and_expect(
        ['543'],  # Enterobacteriaceae
        {'543', '570', '548'},
    )
def test_calc_scores(self):
    """Exercise `calc_scores` on two samples with hand-built hit tables.

    After the call, each protein record is checked for three things: the
    'group' column of its hit table, its 'self' / 'close' / 'distal'
    values, and its 'match' value.
    """
    columns = ('id', 'taxid', 'score')

    def _hits(*rows):
        # hit table indexed by hit id; rows are (id, taxid, score) tuples
        return pd.DataFrame(rows, columns=columns).set_index('id')

    def _protein(score, *rows):
        # one protein record: its own score plus its hit table
        return {'score': score, 'hits': _hits(*rows)}

    obj = Analyze()
    obj.taxdump = taxdump_from_text(taxdump_proteo)
    add_children(obj.taxdump)
    obj.groups = {
        'self': {'561', '562', '585056'},
        'close': {'543', '91347', '1236'},
    }
    obj.data = {
        'S1': [
            _protein(100, ('P1', '561', 100), ('P2', '562', 95)),
            _protein(90, ('P3', '561', 81), ('P4', '543', 72)),
        ],
        'S2': [
            _protein(96, ('P5', '561', 90), ('P6', '543', 84),
                     ('P7', '620', 66)),
        ],
    }
    obj.weighted = True
    obj.match_th = 0.9
    obj.calc_scores()

    # (sample, protein index, hit groups, [self, close, distal], match)
    # NOTE(review): the expected values equal the sum of hit score divided
    # by protein score within each group -- confirm against calc_scores.
    expectations = (
        ('S1', 0, ['self', 'self'], [1.95, 0.0, 0.0], '0'),
        ('S1', 1, ['self', 'close'], [0.9, 0.8, 0.0], '0'),
        ('S2', 0, ['self', 'close', 'distal'],
         [0.9375, 0.875, 0.6875], '620'),
    )
    for sample, idx, hit_groups, group_scores, match in expectations:
        prot = obj.data[sample][idx]
        self.assertListEqual(prot['hits']['group'].tolist(), hit_groups)
        self.assertListEqual(
            [prot[key] for key in ('self', 'close', 'distal')],
            group_scores)
        self.assertEqual(prot['match'], match)