def run_leave_seq_out_test(self):
    """Run the leave-one-sequence-out test over all EPA placements.

    Placements are parsed from ``self.jplace_fname`` when it is set;
    otherwise EPA is run through ``self.raxml`` in ``l1o_seq`` mode.
    For every placement the EPA-derived label is compared with the
    original one via ``check_seq_tax_labels``. When ``self.ranktest`` is
    enabled and a mislabel record was returned, the record receives a
    ``rank_conf`` entry: the largest value found in
    ``self.misrank_conf_map`` for the rank paths of the original label
    (levels 2 and up), or 0 when none is listed.

    Returns:
        The number of placements that were iterated.
    """
    job_name = self.cfg.subst_name("l1out_seq_%NAME%")
    if not self.jplace_fname:
        jp = self.raxml.run_epa(job_name, self.refalign_fname,
                                self.reftree_fname, self.optmod_fname,
                                mode="l1o_seq")
    else:
        jp = EpaJsonParser(self.jplace_fname)

    seq_count = 0
    for seq_count, place in enumerate(jp.get_placement(), 1):
        seq_name = place["n"][0]

        # Label recorded for this sequence vs. label derived from EPA.
        orig_ranks = self.get_orig_ranks(seq_name)
        ranks, lws = self.classify_seq(place)
        mis_rec = self.check_seq_tax_labels(seq_name, orig_ranks, ranks, lws)

        if not (self.ranktest and mis_rec):
            continue

        # Cross-check with higher-rank mislabels: keep the strongest
        # confidence among the rank paths of the original label.
        rank_uids = [Taxonomy.get_rank_uid(orig_ranks, lvl)
                     for lvl in range(2, len(orig_ranks))]
        known_confs = [self.misrank_conf_map[uid]
                       for uid in rank_uids
                       if uid in self.misrank_conf_map]
        mis_rec['rank_conf'] = max([0] + known_confs)

    return seq_count
def classify(self, query_fname, minp=0.9, ptp=False):
    """Assign taxonomic labels to query sequences based on EPA placements.

    Placements are parsed from ``self.jplace_fname`` when it is set;
    otherwise the input is validated with ``checkinput`` and EPA is run
    via ``run_epa``. Each result line is handed to ``print_result_line``
    together with the assignment file handle (``None`` when
    ``self.out_assign_fname`` is not set).

    Args:
        query_fname: Query file name, forwarded to ``checkinput``.
        minp: Forwarded to ``checkinput``.
        ptp: When true, ``run_ptp`` is invoked on the placement parser
            after the assignments have been written.
    """
    if self.jplace_fname:
        jp = EpaJsonParser(self.jplace_fname)
    else:
        self.checkinput(query_fname, minp)
        jp = self.run_epa()

    self.cfg.log.info(
        "Assigning taxonomic labels based on EPA placements...\n")

    placements = jp.get_placement()

    fo = open(self.out_assign_fname, "w") if self.out_assign_fname else None
    try:
        noassign_list = []
        for place in placements:
            taxon_name = place["n"][0]
            origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
            edges = place["p"]

            ranks, lws = self.classify_helper.classify_seq(edges)
            rankout = self.print_ranks(ranks, lws, self.cfg.min_lhw)

            if rankout is None:
                # No printable assignment; reported with "?" further down.
                noassign_list.append(origin_taxon_name)
                continue

            output = "%s\t%s\t" % (origin_taxon_name, rankout)
            if self.cfg.check_novelty:
                isnovo = self.novelty_check(place_edge=str(edges[0][0]),
                                            ranks=ranks, lws=lws)
                output += "*" if isnovo else "o"
            self.print_result_line(fo, output)

        noassign_list += self.get_noalign_list()

        # FIX: report each unassigned sequence under its own name. The
        # previous code formatted the stale ``origin_taxon_name`` left over
        # from the placement loop, so every line carried the same name (and
        # raised NameError when there were no placements at all).
        for taxon_name in noassign_list:
            self.print_result_line(fo, "%s\t\t\t?" % taxon_name)
    finally:
        # Close the assignment file even if classification fails midway.
        if fo:
            fo.close()

    #############################################
    #
    # EPA-PTP species delimitation
    #
    #############################################
    if ptp:
        self.run_ptp(jp)
def run_leave_seq_out_test(self):
    """Run the leave-one-sequence-out test and write interim assignments.

    Placements are either loaded from existing jplace file(s) --
    ``self.cfg.jplace_fname`` may be a directory (all ``*.jplace`` files
    in it are read) or a glob mask -- or produced by running EPA in
    ``l1o_seq`` mode. Every placement is classified, compared against the
    label obtained from ``self.taxtree_helper``, and stored; the collected
    assignments are passed to ``write_assignments`` with ``final=False``.

    Returns:
        The number of placements processed.
    """
    job_name = self.cfg.subst_name("l1out_seq_%NAME%")
    placements = []

    if self.cfg.jplace_fname:
        jplace_fmask = (os.path.join(self.cfg.jplace_fname, '*.jplace')
                        if os.path.isdir(self.cfg.jplace_fname)
                        else self.cfg.jplace_fname)
        for jplace_fname in glob.glob(jplace_fmask):
            placements.extend(EpaJsonParser(jplace_fname).get_placement())
        # NOTE(review): this uses a module-level ``config`` rather than
        # ``self.cfg`` -- confirm that global is always defined here.
        config.log.debug("Loaded %d placements from %s\n", len(placements), jplace_fmask)
    else:
        jp = self.raxml.run_epa(job_name, self.refalign_fname,
                                self.reftree_fname, self.optmod_fname,
                                mode="l1o_seq")
        placements = jp.get_placement()
        if self.cfg.output_interim_files:
            out_jplace_fname = self.cfg.out_fname("%NAME%.l1out_seq.jplace")
            self.raxml.copy_epa_jplace(job_name, out_jplace_fname,
                                       move=True, mode="l1o_seq")

    l1out_ass = {}
    seq_count = 0
    for place in placements:
        seq_count += 1
        seq_name = place["n"][0]

        # Original label from the taxonomy tree vs. label inferred by EPA.
        orig_ranks = self.taxtree_helper.get_seq_ranks_from_tree(seq_name)
        ranks, lws = self.classify_seq(place)
        l1out_ass[seq_name] = (ranks, lws)

        mis_rec = self.check_seq_tax_labels(seq_name, orig_ranks, ranks, lws)
        if not (self.cfg.ranktest and mis_rec):
            continue

        # Cross-check with higher-rank mislabels.
        best_conf = 0
        for lvl in range(2, len(orig_ranks)):
            rank_uid = Taxonomy.get_rank_uid(orig_ranks, lvl)
            if rank_uid in self.misrank_conf_map:
                best_conf = max(best_conf, self.misrank_conf_map[rank_uid])
        mis_rec['rank_conf'] = best_conf

    self.write_assignments(l1out_ass, final=False)

    return seq_count
def test_assign_taxonomy(self):
    """Check classified ranks of the test placements against expectations.

    ``true_assign.txt`` is read as three tab-separated fields per line
    (sequence id, ``;``-joined ranks, and a third field that is ignored
    here). Every placement in ``test.jplace`` must be classified to
    exactly the expected rank list.
    """
    truth_path = os.path.join(self.testfile_dir, "true_assign.txt")
    expected_ranks = {}
    with open(truth_path) as truth_file:
        for row in truth_file:
            seq_id, rank_field, _lws = row.strip().split("\t")
            expected_ranks[seq_id] = rank_field.split(";")

    jplace_path = os.path.join(self.testfile_dir, "test.jplace")
    for placement in EpaJsonParser(jplace_path).get_placement():
        seq_id = placement["n"][0]
        ranks, _conf = self.classify_helper.classify_seq(placement["p"])
        self.assertEqual(ranks, expected_ranks[seq_id])
def test_jplace_read(self):
    """Verify that ``test.jplace`` is parsed into the expected structure.

    Checks the RAxML version string, the size of the tree, the number of
    placements, and that every placement edge has a plausible branch id
    and a likelihood weight within [0, 1].
    """
    jplace_fname = os.path.join(self.testfile_dir, "test.jplace")
    parser = EpaJsonParser(jplace_fname)

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(parser.get_raxml_version(), "8.2.3")

    t = Tree(parser.get_tree())
    t_len = len(t)
    self.assertEqual(t_len, 32)
    self.assertEqual(len(parser.get_placement()), 6)

    # Upper bound for branch ids -- presumably the branch count of an
    # unrooted binary tree with t_len leaves; TODO confirm.
    branch_limit = t_len * 2 - 3

    for p in parser.get_placement():
        # A placed query must not already be part of the tree.
        self.assertNotIn(p["n"][0], t)
        self.assertGreater(len(p["p"]), 0)
        for edge in p["p"]:
            branch = int(edge[0])
            lhw = edge[2]
            self.assertTrue(0 <= branch < branch_limit)
            self.assertTrue(0.0 <= lhw <= 1.0)
def test_jplace_read(self):
    """Parse ``test.jplace`` and sanity-check its contents.

    Expects RAxML version "8.2.3", a tree of size 32 and 6 placements.
    Each placement's name must be absent from the tree and each of its
    edges must carry a branch id below ``2 * size - 3`` and a likelihood
    weight between 0 and 1 inclusive.
    """
    parser = EpaJsonParser(os.path.join(self.testfile_dir, "test.jplace"))
    self.assertEqual(parser.get_raxml_version(), "8.2.3")

    tree = Tree(parser.get_tree())
    tree_size = len(tree)
    self.assertEqual(tree_size, 32)
    self.assertEqual(len(parser.get_placement()), 6)

    max_branch = tree_size * 2 - 3
    for placement in parser.get_placement():
        query_name = placement["n"][0]
        self.assertFalse(query_name in tree)
        self.assertTrue(len(placement["p"]) > 0)
        for edge in placement["p"]:
            branch_id = int(edge[0])
            weight = edge[2]
            self.assertTrue(0 <= branch_id < max_branch)
            self.assertTrue(0.0 <= weight <= 1.0)
def classify(self, query_fname, fout=None, method="1", minlw=0.0, pv=0.02, minp=0.9, ptp=False):
    """Classify query sequences from EPA placements; optionally run PTP.

    Placements are parsed from ``self.jplace_fname`` when it is set;
    otherwise the input is validated with ``checkinput`` and RAxML-EPA is
    run against the reference stored in ``self.refjson``.

    Result lines are printed when ``self.cfg.verbose`` is set and written
    to ``fout`` when it is given. Output order: assigned queries first,
    then the entries of the ``self.noalign`` file (if it exists), then the
    placed queries that got no assignment (marked with ``?``).

    Args:
        query_fname: Query file name, forwarded to ``checkinput``.
        fout: Optional path of the assignment output file. With ``ptp``
            the species clusters go to ``fout + ".species"``.
        method: Forwarded to ``classify_helper.classify_seq``.
        minlw: Forwarded to ``classify_seq``, ``novelty_check`` and
            ``print_ranks``.
        pv: Unused by the active code (only referenced by a disabled
            edge filter); kept for interface compatibility.
        minp: Forwarded to ``checkinput``.
        ptp: When true, run ``epa_2_ptp`` species delimitation afterwards.
    """
    if self.jplace_fname:
        jp = EpaJsonParser(self.jplace_fname)
    else:
        self.checkinput(query_fname, minp)
        raxml = RaxmlWrapper(config)
        reftree_fname = self.cfg.tmp_fname("ref_%NAME%.tre")
        self.refjson.get_raxml_readable_tree(reftree_fname)
        optmod_fname = self.cfg.tmp_fname("%NAME%.opt")
        self.refjson.get_binary_model(optmod_fname)
        job_name = self.cfg.subst_name("epa_%NAME%")

        reftree_str = self.refjson.get_raxml_readable_tree()
        reftree = Tree(reftree_str)
        self.reftree_size = len(reftree.get_leaves())

        # IMPORTANT: set EPA heuristic rate based on tree size!
        self.cfg.resolve_auto_settings(self.reftree_size)
        # If we're loading the pre-optimized model, we MUST set the same
        # rate het. mode as in the ref file
        if self.cfg.epa_load_optmod:
            self.cfg.raxml_model = self.refjson.get_ratehet_model()

        reduced_align_fname = raxml.reduce_alignment(self.epa_alignment)
        jp = raxml.run_epa(job_name, reduced_align_fname, reftree_fname, optmod_fname)

    placements = jp.get_placement()

    fo = open(fout, "w") if fout else None

    def emit(line):
        # Echo one result line to stdout (verbose) and to the output file.
        if self.cfg.verbose:
            print(line)
        if fo:
            fo.write(line + "\n")

    try:
        # Lines for placed-but-unassigned queries; emitted at the very end.
        unassigned = []
        for place in placements:
            taxon_name = place["n"][0]
            origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
            edges = place["p"]

            rankout = None
            if len(edges) > 0:
                ranks, lws = self.classify_helper.classify_seq(edges, method, minlw)
                isnovo = self.novelty_check(place_edge=str(edges[0][0]),
                                            ranks=ranks, lws=lws, minlw=minlw)
                # Computed once and reused (previously called twice).
                rankout = self.print_ranks(ranks, lws, minlw)

            if rankout is None:
                unassigned.append(origin_taxon_name + "\t\t\t?\n")
                continue

            emit("%s\t%s\t%s" % (origin_taxon_name, rankout,
                                 "*" if isnovo else "o"))

        if os.path.exists(self.noalign):
            with open(self.noalign) as fnoa:
                for line in fnoa:
                    # Drops the first character of each line -- presumably
                    # a FASTA ">" marker; verify against the file's writer.
                    taxon_name = line.strip()[1:]
                    origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
                    emit("%s\t\t\t?" % origin_taxon_name)

        output2 = "".join(unassigned)
        if self.cfg.verbose:
            print(output2)
        if fo:
            fo.write(output2)
    finally:
        # Close the output file even if classification fails midway.
        if fo:
            fo.close()

    #############################################
    #
    # EPA-PTP species delimitation
    #
    #############################################
    if ptp:
        full_aln = SeqGroup(self.epa_alignment)
        species_list = epa_2_ptp(epa_jp=jp, ref_jp=self.refjson,
                                 full_alignment=full_aln, min_lw=0.5,
                                 debug=self.cfg.debug)

        # print(...) with a single argument behaves identically as a
        # Python 2 statement and a Python 3 function call.
        if self.cfg.verbose:
            print("Species clusters:")

        fo2 = open(fout + ".species", "w") if fout else None
        try:
            for sp_cluster in species_list:
                s = ",".join([EpacConfig.strip_query_prefix(taxon)
                              for taxon in sp_cluster])
                if fo2:
                    fo2.write(s + "\n")
                if self.cfg.verbose:
                    print(s)
        finally:
            if fo2:
                fo2.close()
    #############################################

    # Temporary files exist only when EPA was actually run here; they are
    # kept in debug mode.
    if not self.jplace_fname and not self.cfg.debug:
        raxml.cleanup(job_name)
        FileUtils.remove_if_exists(reduced_align_fname)
        FileUtils.remove_if_exists(reftree_fname)
        FileUtils.remove_if_exists(optmod_fname)
def run_final_epa_test(self):
    """Re-check taxonomic labels against a reference pruned of mislabels.

    The mislabels found so far are pruned from the tree, ``self.mislabels``
    is reset, and placements are obtained either from existing jplace
    file(s) (``self.cfg.final_jplace_fname``: a directory of ``*.jplace``
    files or a glob mask) or from a fresh EPA run on the pruned reference
    tree. Each placement is classified with a helper built on the
    refreshed branch-id/taxonomy map and compared with its original label.
    The branch map and the assignments are written with ``final=True``.
    """
    self.reftree_outgroup = self.refjson.get_outgroup()
    pruned_reftree = self.prune_mislabels_from_tree(self.reftree, "reference")
    pruned_taxtree = self.prune_mislabels_from_tree(self.reftree, "taxonomic")

    # remove unifurcation at the root
    if len(pruned_reftree.children) == 1:
        pruned_reftree = pruned_reftree.children[0]

    self.mislabels = []

    th = TaxTreeHelper(self.cfg, self.origin_taxonomy)
    th.set_mf_rooted_tree(pruned_taxtree)

    reftree_epalbl_str = None
    if not self.cfg.final_jplace_fname:
        epa_result = self.run_epa_once(pruned_reftree)
        reftree_epalbl_str = epa_result.get_std_newick_tree()
        placements = epa_result.get_placement()
    else:
        jplace_fmask = self.cfg.final_jplace_fname
        if os.path.isdir(self.cfg.final_jplace_fname):
            jplace_fmask = os.path.join(self.cfg.final_jplace_fname, '*.jplace')

        placements = []
        for jplace_fname in glob.glob(jplace_fmask):
            jp = EpaJsonParser(jplace_fname)
            placements.extend(jp.get_placement())
            # The labelled tree is taken from the first file that yields one.
            if not reftree_epalbl_str:
                reftree_epalbl_str = jp.get_std_newick_tree()

        # NOTE(review): module-level ``config`` rather than ``self.cfg`` --
        # confirm that global is always defined here.
        config.log.debug("Loaded %d final epa placements from %s\n", len(placements), jplace_fmask)

    # update branchid-taxonomy mapping to account for possible changes in
    # branch numbering
    th.set_bf_unrooted_tree(Tree(reftree_epalbl_str))
    bid_tax_map = th.get_bid_taxonomy_map()
    self.write_bid_tax_map(bid_tax_map, final=True)

    cl = TaxClassifyHelper(self.cfg, bid_tax_map, self.rate, self.node_height)

    final_ass = {}
    for place in placements:
        seq_name = place["n"][0]

        # get original taxonomic label
        orig_ranks = self.taxtree_helper.get_seq_ranks_from_tree(seq_name)

        # EXPERIMENTAL FEATURE - disabled for now!
        # Ranks present in the original reference tree can be missing
        # entirely from the pruned tree (e.g. when all sequences of a
        # species were pruned as suspicious after the leave-one-out test),
        # so EPA cannot recover the full original annotation. The disabled
        # idea is to amend the original ranks via
        # th.strip_missing_ranks(orig_ranks), setting such ranks to
        # "Undefined".

        # get EPA tax label
        ranks, lws = cl.classify_seq(place["p"])
        final_ass[seq_name] = (ranks, lws)

        # check if they match
        self.check_seq_tax_labels(seq_name, orig_ranks, ranks, lws)

    self.write_assignments(final_ass, final=True)
def run_final_epa_test(self):
    """Run the final EPA-based label check on the mislabel-free reference.

    Steps, in order: prune known mislabels from the tree (both the
    "reference" and "taxonomic" variants), reset ``self.mislabels``,
    obtain placements (from ``self.cfg.final_jplace_fname`` if set,
    otherwise from ``run_epa_once``), rebuild the branch-id/taxonomy map
    from the EPA-labelled tree, classify every placement and compare it
    with the sequence's original label. The map and the resulting
    assignments are both written with ``final=True``.
    """
    self.reftree_outgroup = self.refjson.get_outgroup()

    ref_pruned = self.prune_mislabels_from_tree(self.reftree, "reference")
    tax_pruned = self.prune_mislabels_from_tree(self.reftree, "taxonomic")

    # A root with a single child is a unifurcation; descend into the child.
    if len(ref_pruned.children) == 1:
        ref_pruned = ref_pruned.children[0]

    self.mislabels = []

    tax_helper = TaxTreeHelper(self.cfg, self.origin_taxonomy)
    tax_helper.set_mf_rooted_tree(tax_pruned)

    labelled_newick = None

    if self.cfg.final_jplace_fname:
        mask = (os.path.join(self.cfg.final_jplace_fname, '*.jplace')
                if os.path.isdir(self.cfg.final_jplace_fname)
                else self.cfg.final_jplace_fname)
        matching_files = glob.glob(mask)

        placements = []
        for path in matching_files:
            parser = EpaJsonParser(path)
            placements += parser.get_placement()
            if not labelled_newick:
                labelled_newick = parser.get_std_newick_tree()

        # NOTE(review): relies on a module-level ``config`` object instead
        # of ``self.cfg`` -- verify it exists in every calling context.
        config.log.debug("Loaded %d final epa placements from %s\n", len(placements), mask)
    else:
        epa_result = self.run_epa_once(ref_pruned)
        labelled_newick = epa_result.get_std_newick_tree()
        placements = epa_result.get_placement()

    # Branch numbering may have changed, so the branch-id -> taxonomy map
    # is rebuilt from the EPA-labelled tree.
    labelled_tree = Tree(labelled_newick)
    tax_helper.set_bf_unrooted_tree(labelled_tree)
    branch_tax_map = tax_helper.get_bid_taxonomy_map()
    self.write_bid_tax_map(branch_tax_map, final=True)

    classifier = TaxClassifyHelper(self.cfg, branch_tax_map, self.rate,
                                   self.node_height)

    assignments = {}
    for placement in placements:
        name = placement["n"][0]

        # Original label of the sequence.
        # (Disabled experimental step: ranks that vanished from the pruned
        # tree -- e.g. a species whose sequences were all pruned -- could
        # be set to "Undefined" via strip_missing_ranks on the helper,
        # since EPA cannot infer them anymore.)
        original = self.taxtree_helper.get_seq_ranks_from_tree(name)

        # Label inferred from the EPA placement edges.
        inferred, weights = classifier.classify_seq(placement["p"])
        assignments[name] = (inferred, weights)

        # Compare the two labels; the returned record is not needed here.
        self.check_seq_tax_labels(name, original, inferred, weights)

    self.write_assignments(assignments, final=True)
def run_leave_seq_out_test(self):
    """Leave-one-sequence-out test: classify each sequence and flag mislabels.

    Without ``self.cfg.jplace_fname`` EPA is run in ``l1o_seq`` mode (and
    its jplace output is moved to the output location when
    ``self.cfg.output_interim_files`` is set). With it, placements are
    collected from the given jplace file/mask, or from every ``*.jplace``
    file if it names a directory. The per-sequence assignments are
    written with ``final=False``.

    Returns:
        The number of placements processed.
    """
    def max_listed_conf(rank_list):
        # Highest confidence listed in misrank_conf_map for the rank
        # paths of rank_list (levels 2 and up); 0 when none is listed.
        conf = 0
        for level in range(2, len(rank_list)):
            uid = Taxonomy.get_rank_uid(rank_list, level)
            if uid in self.misrank_conf_map:
                conf = max(conf, self.misrank_conf_map[uid])
        return conf

    job_name = self.cfg.subst_name("l1out_seq_%NAME%")
    placements = []

    if not self.cfg.jplace_fname:
        jp = self.raxml.run_epa(job_name, self.refalign_fname,
                                self.reftree_fname, self.optmod_fname,
                                mode="l1o_seq")
        placements = jp.get_placement()
        if self.cfg.output_interim_files:
            out_jplace_fname = self.cfg.out_fname(
                "%NAME%.l1out_seq.jplace")
            self.raxml.copy_epa_jplace(job_name, out_jplace_fname,
                                       move=True, mode="l1o_seq")
    else:
        if os.path.isdir(self.cfg.jplace_fname):
            jplace_fmask = os.path.join(self.cfg.jplace_fname, '*.jplace')
        else:
            jplace_fmask = self.cfg.jplace_fname

        for path in glob.glob(jplace_fmask):
            placements += EpaJsonParser(path).get_placement()

        # NOTE(review): ``config`` is a module-level name here, unlike the
        # ``self.cfg`` used elsewhere -- confirm it is always defined.
        config.log.debug("Loaded %d placements from %s\n", len(placements), jplace_fmask)

    processed = 0
    l1out_ass = {}
    for place in placements:
        seq_name = place["n"][0]

        original = self.taxtree_helper.get_seq_ranks_from_tree(seq_name)
        ranks, lws = self.classify_seq(place)
        l1out_ass[seq_name] = (ranks, lws)

        mis_rec = self.check_seq_tax_labels(seq_name, original, ranks, lws)

        # Cross-check with higher-rank mislabels.
        if self.cfg.ranktest and mis_rec:
            mis_rec['rank_conf'] = max_listed_conf(original)

        processed += 1

    self.write_assignments(l1out_ass, final=False)

    return processed
def classify(self, query_fname, minp=0.9, ptp=False):
    """Classify query sequences from EPA placements; optionally run PTP.

    Placements are parsed from ``self.jplace_fname`` when it is set;
    otherwise the input is validated with ``checkinput``, RAxML-EPA is run
    against the reference in ``self.refjson`` and the resulting jplace
    file is moved to ``self.out_jplace_fname``.

    Assigned queries are reported as ``name<TAB>ranks<TAB>flag`` (flag is
    ``*`` when ``novelty_check`` is true, else ``o``). Queries with no
    placement edges, with no printable ranks, or listed in the
    ``self.noalign`` file are reported afterwards as
    ``name<TAB><TAB><TAB>?``. Lines are printed when ``self.cfg.verbose``
    is set and written to ``self.out_assign_fname`` when it is set.

    Args:
        query_fname: Query file name, forwarded to ``checkinput``.
        minp: Forwarded to ``checkinput``.
        ptp: When true, run ``epa_2_ptp`` species delimitation afterwards.
    """
    if self.jplace_fname:
        jp = EpaJsonParser(self.jplace_fname)
    else:
        self.checkinput(query_fname, minp)

        self.cfg.log.info("Running RAxML-EPA to place %d query sequences...\n" % self.query_count)
        raxml = RaxmlWrapper(config)
        reftree_fname = self.cfg.tmp_fname("ref_%NAME%.tre")
        self.refjson.get_raxml_readable_tree(reftree_fname)
        optmod_fname = self.cfg.tmp_fname("%NAME%.opt")
        self.refjson.get_binary_model(optmod_fname)
        job_name = self.cfg.subst_name("epa_%NAME%")

        reftree_str = self.refjson.get_raxml_readable_tree()
        reftree = Tree(reftree_str)
        self.reftree_size = len(reftree.get_leaves())

        # IMPORTANT: set EPA heuristic rate based on tree size!
        self.cfg.resolve_auto_settings(self.reftree_size)
        # If we're loading the pre-optimized model, we MUST set the same
        # rate het. mode as in the ref file
        if self.cfg.epa_load_optmod:
            self.cfg.raxml_model = self.refjson.get_ratehet_model()

        reduced_align_fname = raxml.reduce_alignment(self.epa_alignment)

        jp = raxml.run_epa(job_name, reduced_align_fname, reftree_fname, optmod_fname)
        raxml.copy_epa_jplace(job_name, self.out_jplace_fname, move=True)

    self.cfg.log.info("Assigning taxonomic labels based on EPA placements...\n")

    placements = jp.get_placement()

    fo = open(self.out_assign_fname, "w") if self.out_assign_fname else None

    def emit(line):
        # Echo one result line to stdout (verbose) and to the output file.
        if self.cfg.verbose:
            print(line)
        if fo:
            fo.write(line + "\n")

    try:
        noassign_list = []
        for place in placements:
            taxon_name = place["n"][0]
            origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
            edges = place["p"]

            rankout = None
            if len(edges) > 0:
                ranks, lws = self.classify_helper.classify_seq(edges)
                isnovo = self.novelty_check(place_edge=str(edges[0][0]),
                                            ranks=ranks, lws=lws)
                rankout = self.print_ranks(ranks, lws, self.cfg.min_lhw)

            if rankout is None:
                noassign_list.append(origin_taxon_name)
                continue

            emit("%s\t%s\t%s" % (origin_taxon_name, rankout,
                                 "*" if isnovo else "o"))

        if os.path.exists(self.noalign):
            with open(self.noalign) as fnoa:
                for line in fnoa:
                    # Drops the first character of each line -- presumably
                    # a FASTA ">" marker; verify against the file's writer.
                    taxon_name = line.strip()[1:]
                    noassign_list.append(
                        EpacConfig.strip_query_prefix(taxon_name))

        # FIX: report each unassigned sequence under its own name. The
        # previous code formatted the stale ``origin_taxon_name`` left over
        # from the loops above, so every line carried the same name.
        for taxon_name in noassign_list:
            emit("%s\t\t\t?" % taxon_name)
    finally:
        # Close the assignment file even if classification fails midway.
        if fo:
            fo.close()

    #############################################
    #
    # EPA-PTP species delimitation
    #
    #############################################
    if ptp:
        full_aln = SeqGroup(self.epa_alignment)
        species_list = epa_2_ptp(epa_jp=jp, ref_jp=self.refjson,
                                 full_alignment=full_aln, min_lw=0.5,
                                 debug=self.cfg.debug)

        self.cfg.log.debug("Species clusters:")

        # FIX: ``fout`` is not defined in this method (NameError). The
        # species file is now derived from the assignment output file.
        # NOTE(review): confirm this is the intended base name.
        species_base = self.out_assign_fname
        fo2 = open(species_base + ".species", "w") if species_base else None
        try:
            for sp_cluster in species_list:
                s = ",".join([EpacConfig.strip_query_prefix(taxon)
                              for taxon in sp_cluster])
                if fo2:
                    fo2.write(s + "\n")
                self.cfg.log.debug(s)
        finally:
            if fo2:
                fo2.close()