def test_find_match(self):
    """Exercise `find_match` at several `match_th` cutoffs and on no hits."""
    analyzer = Analyze()
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    add_children(analyzer.taxdump)

    # hit table, listed from the highest score to the lowest
    rows = [
        [100, '585056'],   # E. coli UMN026
        [99, '1038927'],   # E. coli O104:H4
        [97, '562'],       # Escherichia coli
        [95, '622'],       # Shigella dysenteriae
        [92, '543'],       # Enterobacteriaceae
        [88, '548'],       # Klebsiella aerogenes
        [80, '766'],       # Rickettsiales
    ]
    hits = pd.DataFrame(rows, columns=['score', 'taxid'])

    # (match threshold, expected matching taxId)
    cases = (
        (0.99, '562'),    # keep top 1% hits
        (0.9, '543'),     # keep top 10% hits
        (0.8, '1224'),    # keep top 20% hits
    )
    for threshold, expected in cases:
        analyzer.match_th = threshold
        self.assertEqual(analyzer.find_match(hits), expected)

    # an empty input DataFrame yields '0'
    self.assertEqual(analyzer.find_match(pd.DataFrame()), '0')
def test_sort_by_hierarchy(self):
    """Check ordering of a single lineage and the two failure modes."""
    taxdump = taxdump_from_text(taxdump_archaea)
    failure = 'Cannot sort taxIds by hierarchy.'

    # expected order is from low to high:
    # Lokiarchaeum sp. GC14_75, Lokiarchaeum, Candidatus Lokiarchaeota,
    # Asgard group, Archaea
    taxids = ['1935183', '1655637', '2157', '1538547', '1655434']
    observed = sort_by_hierarchy(taxids, taxdump)
    expected = ['1538547', '1655637', '1655434', '1935183', '2157']
    self.assertListEqual(observed, expected)

    # the DPANN group cannot be placed in the same sequence
    taxids.append('1783276')
    with self.assertRaises(ValueError) as caught:
        sort_by_hierarchy(taxids, taxdump)
    self.assertEqual(str(caught.exception), failure)

    # drop DPANN again plus Candidatus Lokiarchaeota, which breaks the
    # sequence
    del taxids[-2:]
    with self.assertRaises(ValueError) as caught:
        sort_by_hierarchy(taxids, taxdump)
    self.assertEqual(str(caught.exception), failure)
def test_genome_lineages(self):
    """Check the content of `lineages.txt` written by `genome_lineages`."""
    db = Database()
    db.output = self.tmpdir
    db.taxdump = taxdump_from_text(taxdump_proteo)

    records = (
        ('G1', '1224', ''),      # Proteobacteria
        ('G2', '562', '562'),    # Escherichia coli
        ('G3', '622', '622'),    # Shigella dysenteriae
        ('G4', '548', '548'),    # Klebsiella aerogenes
    )
    table = pd.DataFrame(records, columns=['genome', 'taxid', 'species'])
    db.df = table.set_index('genome')

    # rank columns start out blank
    ranks = ['superkingdom', 'kingdom', 'phylum', 'class', 'order',
             'family', 'genus']
    for rank in ranks:
        db.df[rank] = ''

    db.genome_lineages()

    outfile = join(self.tmpdir, 'lineages.txt')
    with open(outfile, 'r') as fh:
        lineages = dict(
            line.split('\t') for line in fh.read().splitlines())

    # shared lineage prefixes
    proteo = 'k__Bacteria; p__Proteobacteria;'
    entero = (proteo + ' c__Gammaproteobacteria; o__Enterobacterales;'
              + ' f__Enterobacteriaceae;')

    expected = {
        'G1': proteo + ' c__; o__; f__; g__; s__',
        'G2': entero + ' g__Escherichia; s__Escherichia coli',
        'G3': entero + ' g__Shigella; s__Shigella dysenteriae',
        'G4': entero + ' g__Klebsiella; s__Klebsiella aerogenes',
    }
    for genome, lineage in expected.items():
        self.assertEqual(lineages[genome], lineage)

    remove(outfile)
def test_infer_close_group(self):
    """Check `infer_close_group` for one and several self taxa."""
    analyzer = Analyze()
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    add_children(analyzer.taxdump)
    analyzer.groups = {}

    # close group is parent of LCA of self group
    analyzer.self_tax = ['562']  # E. coli
    analyzer.groups['self'] = {
        '562', *get_descendants('562', analyzer.taxdump)}
    analyzer.close_tax = None
    analyzer.close_size = None
    analyzer.infer_close_group()
    self.assertListEqual(analyzer.close_tax, ['561'])  # Escherichia
    self.assertSetEqual(analyzer.groups['close'], {'561', '2580236'})

    # close group must have at least 5 taxa
    analyzer.close_tax = None
    analyzer.groups['close'] = None
    analyzer.close_size = 5
    analyzer.infer_close_group()
    # Enterobacteriaceae
    self.assertListEqual(analyzer.close_tax, ['543'])
    expected = {'543', '620', '622', '570', '548', '561', '2580236'}
    self.assertSetEqual(analyzer.groups['close'], expected)

    # close group is LCA of multiple self groups
    analyzer.self_tax = ['561', '620']  # Escherichia and Shigella
    members = set()
    for taxid in analyzer.self_tax:
        members.add(taxid)
        members.update(get_descendants(taxid, analyzer.taxdump))
    analyzer.groups['self'] = members
    analyzer.close_tax = None
    analyzer.groups['close'] = None
    analyzer.close_size = None
    analyzer.infer_close_group()
    # Enterobacteriaceae
    self.assertListEqual(analyzer.close_tax, ['543'])
    expected = {'543', '570', '548'}
    self.assertSetEqual(analyzer.groups['close'], expected)
def test_infer_self_group(self):
    """Check `infer_self_group` with and without a designated rank."""
    analyzer = Analyze()
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    add_children(analyzer.taxdump)

    # (LCA of genomes, designated self rank, expected self taxa)
    cases = (
        # assign to LCA of all genomes (E. coli)
        ('562', None, ['562']),
        # raise LCA to genus level (Escherichia)
        ('562', 'genus', ['561']),
        # LCA (Enterobacteriaceae) is already above designated rank
        # (genus)
        ('543', 'genus', ['543']),
    )
    for lca, rank, expected in cases:
        analyzer.self_tax = None
        analyzer.lca = lca
        analyzer.self_rank = rank
        analyzer.infer_self_group()
        self.assertListEqual(analyzer.self_tax, expected)
def test_define_groups(self):
    """Check `define_groups` with user-defined and auto-inferred groups.

    The first scenario passes comma-separated taxId strings for the self
    and close groups; the second leaves both empty so they are inferred
    from `lca`, `self_rank` and `close_size`.
    """
    me = Analyze()
    me.taxdump = taxdump_from_text(taxdump_proteo)
    add_children(me.taxdump)

    # user defined groups:
    # self: genera Escherichia and Shigella
    # close: family Enterobacteriaceae
    me.groups = {}
    me.self_tax = '561,620'
    me.close_tax = '543'
    me.define_groups()
    self.assertListEqual(me.self_tax, ['561', '620'])
    exp = {'561', '562', '585056', '1038927', '2580236', '620', '622'}
    self.assertSetEqual(me.groups['self'], exp)
    self.assertListEqual(me.close_tax, ['543'])
    exp = {'543', '548', '570'}
    self.assertSetEqual(me.groups['close'], exp)

    # auto-infer groups
    me.self_tax = {}
    me.close_tax = {}
    me.lca = '562'  # all inputs are E. coli
    me.self_rank = 'genus'  # but we want to raise self to genus
    me.close_size = 2  # close group must be this big or bigger
    me.define_groups()
    self.assertListEqual(me.self_tax, ['561'])
    exp = {'561', '562', '585056', '1038927', '2580236'}
    self.assertSetEqual(me.groups['self'], exp)
    self.assertListEqual(me.close_tax, ['543'])
    exp = {'543', '548', '570', '620', '622'}
    self.assertSetEqual(me.groups['close'], exp)
def test_infer_genome_tax(self):
    """Check `infer_genome_tax` at different coverage thresholds."""
    taxdump = taxdump_from_text(taxdump_proteo)

    # five proteins, in which four have hits
    hit_taxids = [
        ['562', '620', '570'],               # E. coli
        ['562', '585056', '1038927', '2'],   # E. coli
        ['561', '543', '776'],               # Escherichia
        ['548', '570', '1236'],              # K. aerogenes
        [],
    ]
    proteins = []
    for taxids in hit_taxids:
        proteins.append({'hits': pd.DataFrame(taxids, columns=['taxid'])})

    # 3 / 4 best hits assigned to Escherichia
    observed = Analyze.infer_genome_tax(proteins, taxdump, 75)
    self.assertTupleEqual(observed, ('561', 75.0))

    # reduce coverage threshold: 2 / 4 best hits assigned to E. coli
    observed = Analyze.infer_genome_tax(proteins, taxdump, 50)
    self.assertTupleEqual(observed, ('562', 50.0))

    # remove one protein that best matches E. coli:
    # 3 / 3 best hits assigned to Enterobacteriaceae
    del proteins[0]
    observed = Analyze.infer_genome_tax(proteins, taxdump, 75)
    self.assertTupleEqual(observed, ('543', 100.0))

    # no input protein
    with self.assertRaises(ValueError) as caught:
        Analyze.infer_genome_tax({}, taxdump, 75)
    self.assertEqual(str(caught.exception), 'Cannot auto-infer taxonomy.')
def test_add_children(self):
    """Check the `children` entries that `add_children` attaches."""
    taxdump = taxdump_from_text(taxdump_archaea)
    add_children(taxdump)

    # order of children is not asserted, hence the set comparison
    expected_children = (
        ('1', {'131567'}),
        ('2157', {'1935183', '1783276'}),
        ('1655434', {'1655637'}),
    )
    for taxid, children in expected_children:
        self.assertSetEqual(set(taxdump[taxid]['children']), children)

    # taxId '2' gets an empty list of children
    self.assertListEqual(taxdump['2']['children'], [])
def test_get_lineage(self):
    """Check that `get_lineage` lists a taxId followed by its ancestors."""
    taxdump = taxdump_from_text(taxdump_archaea)

    short_lineage = get_lineage('2157', taxdump)
    self.assertListEqual(short_lineage, ['2157', '131567', '1'])

    long_lineage = get_lineage('1538547', taxdump)
    self.assertListEqual(long_lineage, [
        '1538547', '1655637', '1655434', '1935183', '2157', '131567', '1',
    ])
def test_taxids_at_ranks(self):
    """Check `taxids_at_ranks`, including a rank with no taxId (None)."""
    taxdump = taxdump_from_text(taxdump_archaea)
    wanted_ranks = ['phylum', 'class', 'genus', 'species']

    observed = taxids_at_ranks('1538547', wanted_ranks, taxdump)

    expected = dict(zip(
        wanted_ranks, ('1655434', None, '1655637', '1538547')))
    self.assertDictEqual(observed, expected)
def test_taxdump_from_text(self):
    """Check the size and two entries of the parsed archaea fixture."""
    parsed = taxdump_from_text(taxdump_archaea)
    self.assertEqual(len(parsed), 9)

    root = {'name': 'root', 'parent': '1', 'rank': 'no rank'}
    self.assertDictEqual(parsed['1'], root)

    archaea = {
        'name': 'Archaea',
        'parent': '131567',
        'rank': 'superkingdom',
    }
    self.assertDictEqual(parsed['2157'], archaea)
def test_calc_scores(self):
    """Check group labels, per-group scores and match from `calc_scores`."""
    columns = ('id', 'taxid', 'score')

    def _hits_df(data):
        """Build a hit table indexed by hit Id."""
        return pd.DataFrame(data, columns=columns).set_index('id')

    def _prot_scores(prot):
        """Collect the three per-group scores of a protein."""
        return [prot[x] for x in ('self', 'close', 'distal')]

    analyzer = Analyze()
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    add_children(analyzer.taxdump)
    analyzer.groups = {
        'self': {'561', '562', '585056'},
        'close': {'543', '91347', '1236'},
    }

    s1_first = {
        'score': 100,
        'hits': _hits_df((('P1', '561', 100), ('P2', '562', 95))),
    }
    s1_second = {
        'score': 90,
        'hits': _hits_df((('P3', '561', 81), ('P4', '543', 72))),
    }
    s2_first = {
        'score': 96,
        'hits': _hits_df(
            (('P5', '561', 90), ('P6', '543', 84), ('P7', '620', 66))),
    }
    analyzer.data = {'S1': [s1_first, s1_second], 'S2': [s2_first]}
    analyzer.weighted = True
    analyzer.match_th = 0.9
    analyzer.calc_scores()

    # (sample, protein index, hit groups, group scores, match)
    expectations = (
        ('S1', 0, ['self', 'self'], [1.95, 0.0, 0.0], '0'),
        ('S1', 1, ['self', 'close'], [0.9, 0.8, 0.0], '0'),
        ('S2', 0, ['self', 'close', 'distal'],
         [0.9375, 0.875, 0.6875], '620'),
    )
    for sample, idx, groups, scores, match in expectations:
        prot = analyzer.data[sample][idx]
        self.assertListEqual(prot['hits']['group'].tolist(), groups)
        self.assertListEqual(_prot_scores(prot), scores)
        self.assertEqual(prot['match'], match)
def test_find_lca(self):
    """Check `find_lca` on several taxId lists and on a detached taxon."""
    taxdump = taxdump_from_text(taxdump_archaea)

    # (input taxIds, expected lowest common ancestor)
    cases = (
        (['131567'], '131567'),
        (['1935183', '1783276'], '2157'),
        (['1935183', '1783276', '1655434'], '2157'),
        (['1935183', '1783276', '2157'], '2157'),
        (['1935183', '2'], '131567'),
        (['1', '2', '1'], '1'),
    )
    for taxids, expected in cases:
        self.assertEqual(find_lca(taxids, taxdump), expected)

    # 'x' is its own parent, so it shares no ancestor with taxId '2'
    taxdump['x'] = {'name': 'x', 'parent': 'x'}
    with self.assertRaises(ValueError) as caught:
        find_lca(['2', 'x'], taxdump)
    self.assertEqual(
        str(caught.exception), 'Cannot find LCA of taxIds in database.')
def test_write_hgt_list(self):
    """Check the per-sample files produced by `write_hgt_list`."""
    analyzer = Analyze()
    analyzer.output = self.tmpdir
    hgt_dir = join(analyzer.output, 'hgts')
    makedirs(hgt_dir, exist_ok=True)
    analyzer.donor_name = False
    analyzer.donor_rank = None
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    add_children(analyzer.taxdump)
    analyzer.df = pd.DataFrame(
        [['S1', 'P1', 0.85, '562', True],
         ['S1', 'P2', 0.95, '622', True],
         ['S1', 'P3', 1.05, '0', True],
         ['S2', 'P4', 0.80, '766', True],
         ['S2', 'P5', 0.20, '0', False]],
        columns=['sample', 'protein', 'silh', 'match', 'hgt'])

    def _written(sample):
        """Run the writer for one sample and return the file content."""
        analyzer.write_hgt_list(sample)
        with open(join(hgt_dir, f'{sample}.txt'), 'r') as fh:
            return fh.read()

    # default
    expected = ('P1\t0.85\t562\n'
                'P2\t0.95\t622\n'
                'P3\t1.05\t0\n')
    self.assertEqual(_written('S1'), expected)

    # number format and negative result
    self.assertEqual(_written('S2'), 'P4\t0.8\t766\n')

    # raise to family
    analyzer.donor_rank = 'family'
    expected = ('P1\t0.85\t543\n'
                'P2\t0.95\t543\n'
                'P3\t1.05\t0\n')
    self.assertEqual(_written('S1'), expected)

    # report taxon name
    analyzer.donor_rank = None
    analyzer.donor_name = True
    expected = ('P1\t0.85\tEscherichia coli\n'
                'P2\t0.95\tShigella dysenteriae\n'
                'P3\t1.05\tN/A\n')
    self.assertEqual(_written('S1'), expected)

    rmtree(hgt_dir)
def test_build_taxonmap(self):
    """Check the in-memory and on-disk output of `build_taxonmap`."""
    db = Database()
    db.output = self.tmpdir
    db.taxdump = taxdump_from_text(taxdump_proteo)
    db.p2tids = {
        'P1': {'766'},              # Rickettsiales
        'P2': {'570', '548'},       # Klebsiella
        'P3': {'620', '622'},       # Shigella
        'P4': {'561', '562'},       # Escherichia
        'P5': {'126792', '28211'},  # root
    }
    db.build_taxonmap()

    expected = {
        'P1': '766',
        'P2': '570',
        'P3': '620',
        'P4': '561',
        'P5': '1',
    }
    self.assertDictEqual(db.taxonmap, expected)

    # the same mapping is written as a gzipped two-column table
    mapfile = join(self.tmpdir, 'taxon.map.gz')
    with gzip.open(mapfile, 'rt') as fh:
        on_disk = dict(
            line.split('\t') for line in fh.read().splitlines())
    self.assertDictEqual(on_disk, expected)

    remove(mapfile)
def test_sample_by_taxonomy(self):
    """Check `sample_by_taxonomy` under several rank / size settings."""
    db = Database()

    # do nothing
    db.sample = None
    self.assertIsNone(db.sample_by_taxonomy())

    # genome metadata table shared by the scenarios below
    header = ('genome', 'taxid', 'refseq_category', 'assembly_level')
    records = (
        # E. coli UMN026
        ('G1', '585056', '', 'Chromosome'),
        # E. coli O104:H4 (rep. genome to be prioritized over G1)
        ('G2', '1038927', 'representative genome', 'Chromosome'),
        # sync E. coli
        ('G3', '2580236', '', 'Contig'),
        # Shigella
        ('G4', '622', '', 'Scaffold'),
        # Klebsiella
        ('G5', '548', '', 'Scaffold'),
        # plasmid
        ('G6', '126792', 'reference genome', 'Contig'),
    )
    genomes = pd.DataFrame(records, columns=header)
    db.reference = False
    db.representative = False
    db.taxdump = taxdump_from_text(taxdump_proteo)

    # up to one genome per genus
    db.rank = 'genus'
    db.sample = 1
    db.df = genomes.copy()
    db.sample_by_taxonomy()
    self.assertListEqual(db.df.columns.tolist(), [*header, 'genus'])
    self.assertListEqual(db.df['genome'].tolist(), ['G2', 'G4', 'G5'])

    # include reference genome (plasmid)
    db.reference = True
    db.df = genomes.copy()
    db.sample_by_taxonomy()
    self.assertEqual(db.df['genome'].tolist()[-1], 'G6')

    # up to two genomes for entire cellular life
    db.rank = 'superkingdom'
    db.sample = 2
    db.reference = False
    db.df = genomes.copy()
    db.sample_by_taxonomy()
    self.assertListEqual(db.df['genome'].tolist(), ['G1', 'G2'])
def test_build_taxdump(self):
    """Check which taxIds end up in `nodes.dmp` after `build_taxdump`."""
    db = Database()
    db.output = self.tmpdir
    db.tmpdir = join(self.datadir, 'DnaK', 'taxdump')
    db.taxdump = taxdump_from_text(taxdump_proteo)

    records = (
        ('G1', '1224'),      # Proteobacteria
        ('G2', '562'),       # Escherichia coli
        ('G3', '585056'),    # E. coli UMN026
        ('G4', '1038927'),   # E. coli O104:H4
    )
    table = pd.DataFrame(records, columns=['genome', 'taxid'])
    db.df = table.set_index('genome')
    db.build_taxdump()

    # the first tab-delimited field of every line is the taxId
    outdir = join(self.tmpdir, 'taxdump')
    with open(join(outdir, 'nodes.dmp'), 'r') as fh:
        written = {
            line.split('\t')[0] for line in fh.read().splitlines()}

    expected = {
        '1', '131567', '2', '1224', '1236', '91347', '543', '561', '562',
        '585056', '1038927',
    }
    self.assertSetEqual(written, expected)

    rmtree(outdir)
def test_identify_taxonomy(self):
    """Check the row filters applied by `identify_taxonomy`."""
    db = Database()
    header = ('organism_name', 'taxid', 'species', 'species_taxid')
    records = (
        ('Escherichia coli UMN026', '585056', 'E. coli', '562'),
        ('Escherichia coli O104:H4', '1038927', 'E. coli', '562'),
        ('Klebsiella aerogenes', '548', 'Klebsiella aerogenes', '548'),
        ('unclassified Gammaproteobacteria', '118884', '', ''),
        ('Plasmid pPY113', '126792', '', ''),
    )
    genomes = pd.DataFrame(records, columns=header)

    # organism names must be capital and latinate
    db.capital = True
    db.block = None
    db.latin = True
    db.taxids = None
    db.exclude = False
    db.taxdump = taxdump_from_text(taxdump_proteo)
    db.df = genomes.copy()
    db.identify_taxonomy()
    self.assertNotIn('species_taxid', db.df.columns)
    self.assertListEqual(db.df.index.tolist(), [0, 1, 2])
    self.assertListEqual(db.df['species'].tolist(), ['562', '562', '548'])

    # block word
    db.block = 'plasmid'
    db.latin = False
    db.df = genomes.copy()
    db.identify_taxonomy()
    self.assertListEqual(db.df.index.tolist(), [0, 1, 2])

    # no Escherichia
    db.taxids = '561'
    db.exclude = True
    db.df = genomes.copy()
    db.identify_taxonomy()
    self.assertListEqual(db.df.index.tolist(), [2])
def test__get_taxon(self):
    """Check the error raised by `_get_taxon` for an unknown taxId."""
    taxdump = taxdump_from_text(taxdump_archaea)
    expected = 'TaxID 12345 is not found in taxonomy database.'

    with self.assertRaises(ValueError) as caught:
        _get_taxon('12345', taxdump)

    self.assertEqual(str(caught.exception), expected)
def test_is_ancestral(self):
    """Check `is_ancestral` for one true and one false case."""
    taxdump = taxdump_from_text(taxdump_archaea)
    query = '1538547'

    under_archaea = is_ancestral(query, {'2157'}, taxdump)
    self.assertTrue(under_archaea)

    under_taxid_2 = is_ancestral(query, {'2'}, taxdump)
    self.assertFalse(under_taxid_2)
def test_get_descendants(self):
    """Check the descendants that `get_descendants` lists for one taxon."""
    taxdump = taxdump_from_text(taxdump_archaea)
    add_children(taxdump)

    # Asgard group
    descendants = get_descendants('1935183', taxdump)

    self.assertListEqual(descendants, ['1655434', '1655637', '1538547'])
def test_taxid_at_rank(self):
    """Check `taxid_at_rank` at genus and phylum level for one taxId."""
    taxdump = taxdump_from_text(taxdump_archaea)

    # (requested rank, expected taxId)
    for rank, expected in (('genus', '1655637'), ('phylum', '1655434')):
        self.assertEqual(taxid_at_rank('1538547', rank, taxdump), expected)
def test_refine_taxdump(self):
    """Check that `refine_taxdump` yields the given taxIds plus ancestors."""
    taxdump = taxdump_from_text(taxdump_archaea)

    # Bacteria and Archaea
    kept = set(refine_taxdump(['2', '2157'], taxdump))

    # plus cellular organisms and root
    self.assertSetEqual(kept, {'1', '131567', '2', '2157'})
def test_assign_taxonomy(self):
    """Check `assign_taxonomy` with given, inferred and invalid taxonomy."""

    def _hits_df(d):
        """Build a one-column ('taxid') hit table from a dict."""
        return pd.Series(d, name='taxid', dtype=object).to_frame()

    # input are two genomes with defined taxonomy
    analyzer = Analyze()
    analyzer.input_tax = 'S1:561,S2:620'  # Escherichia and Shigella
    analyzer.data = {}
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    analyzer.assign_taxonomy()

    # test input taxonomy extraction
    self.assertDictEqual(analyzer.input_tax, {'S1': '561', 'S2': '620'})

    # test taxonomy refinement
    kept = {
        '1', '131567', '2', '1224', '1236', '91347', '543', '561', '620',
    }
    self.assertSetEqual(set(analyzer.taxdump), kept)

    # test LCA discovery
    self.assertEqual(analyzer.lca, '543')

    # input is one genome with defined taxonomy
    analyzer = Analyze()
    analyzer.data = {'S1': [{'hits': pd.DataFrame(columns=['taxid'])}]}
    analyzer.input_tax = '561'  # Escherichia
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    analyzer.assign_taxonomy()
    self.assertDictEqual(analyzer.input_tax, {'S1': '561'})

    # input taxonomy not found in database
    analyzer.input_tax = '1234'
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    with self.assertRaises(ValueError) as caught:
        analyzer.assign_taxonomy()
    self.assertEqual(
        str(caught.exception),
        'TaxID 1234 is not present in taxonomy database.')

    # input are two genome whose taxonomies are to be inferred based on
    # search results
    hit_specs = {
        'S1': [
            {'P1': '561', 'P2': '562'},
            {'P3': '543', 'P4': '561'},
        ],
        'S2': [
            {'P5': '562', 'P6': '585056'},
            {'P7': '561', 'P8': '1038927'},
            {'P9': '2580236'},
        ],
    }
    analyzer = Analyze()
    analyzer.input_tax = None
    analyzer.data = {
        sample: [{'hits': _hits_df(spec)} for spec in specs]
        for sample, specs in hit_specs.items()
    }
    analyzer.input_cov = 75
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    analyzer.assign_taxonomy()
    self.assertDictEqual(analyzer.input_tax, {'S1': '543', 'S2': '561'})
    self.assertEqual(analyzer.lca, '543')

    # cannot auto-infer taxonomy
    analyzer.data['S3'] = [{'hits': _hits_df({})}]
    analyzer.taxdump = taxdump_from_text(taxdump_proteo)
    with self.assertRaises(ValueError) as caught:
        analyzer.assign_taxonomy()
    expected = ('Cannot auto-infer taxonomy for S3. '
                'Please specify manually.')
    self.assertEqual(str(caught.exception), expected)

    # invalid input taxonomy string
    analyzer.input_tax = '561'
    with self.assertRaises(ValueError) as caught:
        analyzer.assign_taxonomy()
    self.assertEqual(
        str(caught.exception), 'Invalid input taxonomy format.')