def test_outgroup(self): mfu_tree_fname = os.path.join(self.testfile_dir, "taxtree.nw") mfu_tree = Tree(mfu_tree_fname) self.taxtree_helper.set_mf_rooted_tree(mfu_tree) outgr = self.taxtree_helper.get_outgroup() self.assertEqual(outgr.get_leaf_names(), self.expected_outgr.get_leaf_names())
def run_epa(self): self.cfg.log.info( "Running RAxML-EPA to place %d query sequences...\n" % self.query_count) raxml = RaxmlWrapper(config) reftree_fname = self.cfg.tmp_fname("ref_%NAME%.tre") self.refjson.get_raxml_readable_tree(reftree_fname) optmod_fname = self.cfg.tmp_fname("%NAME%.opt") self.refjson.get_binary_model(optmod_fname) job_name = self.cfg.subst_name("epa_%NAME%") reftree_str = self.refjson.get_raxml_readable_tree() reftree = Tree(reftree_str) self.reftree_size = len(reftree.get_leaves()) # IMPORTANT: set EPA heuristic rate based on tree size! self.cfg.resolve_auto_settings(self.reftree_size) # If we're loading the pre-optimized model, we MUST set the same rate het. mode as in the ref file if self.cfg.epa_load_optmod: self.cfg.raxml_model = self.refjson.get_ratehet_model() reduced_align_fname = raxml.reduce_alignment(self.epa_alignment) jp = raxml.run_epa(job_name, reduced_align_fname, reftree_fname, optmod_fname) raxml.copy_epa_jplace(job_name, self.out_jplace_fname, move=True) return jp
def load_refjson(self, refjson_fname): try: self.refjson = RefJsonParser(refjson_fname) except ValueError: self.cfg.exit_user_error("ERROR: Invalid json file format!") #validate input json format (valid, err) = self.refjson.validate() if not valid: self.cfg.log.error( "ERROR: Parsing reference JSON file failed:\n%s", err) self.cfg.exit_user_error() self.rate = self.refjson.get_rate() self.node_height = self.refjson.get_node_height() self.origin_taxonomy = self.refjson.get_origin_taxonomy() self.tax_tree = self.refjson.get_tax_tree() self.cfg.compress_patterns = self.refjson.get_pattern_compression() self.bid_taxonomy_map = self.refjson.get_branch_tax_map() if not self.bid_taxonomy_map: # old file format (before 1.6), need to rebuild this map from scratch th = TaxTreeHelper(self.cfg, self.origin_taxonomy) th.set_mf_rooted_tree(self.tax_tree) th.set_bf_unrooted_tree(self.refjson.get_reftree()) self.bid_taxonomy_map = th.get_bid_taxonomy_map() self.write_bid_tax_map(self.bid_taxonomy_map, final=False) reftree_str = self.refjson.get_raxml_readable_tree() self.reftree = Tree(reftree_str) self.reftree_size = len(self.reftree.get_leaves()) # IMPORTANT: set EPA heuristic rate based on tree size! self.cfg.resolve_auto_settings(self.reftree_size) # If we're loading the pre-optimized model, we MUST set the same rate het. mode as in the ref file if self.cfg.epa_load_optmod: self.cfg.raxml_model = self.refjson.get_ratehet_model() self.classify_helper = TaxClassifyHelper(self.cfg, self.bid_taxonomy_map, self.rate, self.node_height) self.taxtree_helper = TaxTreeHelper(self.cfg, self.origin_taxonomy, self.tax_tree) tax_code_name = self.refjson.get_taxcode() self.tax_code = TaxCode(tax_code_name) self.taxonomy = Taxonomy(prefix=EpacConfig.REF_SEQ_PREFIX, tax_map=self.origin_taxonomy) self.tax_common_ranks = self.taxonomy.get_common_ranks() # print "Common ranks: ", self.tax_common_ranks self.mislabels_cnt = [0] * TaxCode.UNI_TAX_LEVELS self.rank_mislabels_cnt = [0] * TaxCode.UNI_TAX_LEVELS
def setUp(self): self.testfile_dir = os.path.join( os.path.dirname(os.path.abspath(__file__)), "testfiles") self.tax_fname = os.path.join(self.testfile_dir, "test_clean.tax") self.taxonomy = Taxonomy(EpacConfig.REF_SEQ_PREFIX, self.tax_fname) tax_map = self.taxonomy.get_map() cfg = EpacConfig() self.taxtree_helper = TaxTreeHelper(cfg, tax_map) outgr_fname = os.path.join(self.testfile_dir, "outgroup.nw") self.expected_outgr = Tree(outgr_fname)
def test_taxtree_builder(self): cfg = EpacConfig() testfile_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testfiles") tax_fname = os.path.join(testfile_dir, "test.tax") tax = Taxonomy(EpacConfig.REF_SEQ_PREFIX, tax_fname) tree_fname = os.path.join(testfile_dir, "taxtree.nw") expected_tree = Tree(tree_fname, format=8) tb = TaxTreeBuilder(cfg, tax) tax_tree, seq_ids = tb.build() self.assertEqual(seq_ids, tax.get_map().keys()) self.assertEqual(tax_tree.write(format=8), expected_tree.write(format=8))
def epa_post_process(self): lbl_tree = Tree(self.reftree_lbl_str) self.taxtree_helper.set_bf_unrooted_tree(lbl_tree) self.reftree_tax = self.taxtree_helper.get_tax_tree() self.bid_ranks_map = self.taxtree_helper.get_bid_taxonomy_map() if self.cfg.debug: self.reftree_tax.write(outfile=self.reftree_tax_fname, format=3) with open(self.reftree_lbl_fname, "w") as outf: outf.write(self.reftree_lbl_str) with open(self.brmap_fname, "w") as outf: for bid, br_rec in self.bid_ranks_map.iteritems(): outf.write("%s\t%s\t%d\t%f\n" % (bid, br_rec[0], br_rec[1], br_rec[2]))
def test_jplace_read(self): jplace_fname = os.path.join(self.testfile_dir, "test.jplace") parser = EpaJsonParser(jplace_fname) self.assertEqual(parser.get_raxml_version(), "8.2.3") t = Tree(parser.get_tree()) t_len = len(t) self.assertEqual(t_len, 32) self.assertEqual(len(parser.get_placement()), 6) for p in parser.get_placement(): self.assertFalse(p["n"][0] in t) self.assertTrue(len(p["p"]) > 0) for edge in p["p"]: branch = int(edge[0]) lh = edge[1] lhw = edge[2] self.assertTrue(branch >= 0 and branch < (t_len * 2 - 3)) self.assertTrue(lhw >= 0.0 and lhw <= 1.0)
def test_branch_labeling(self): bfu_tree_fname = os.path.join(self.testfile_dir, "resolved_tree.nw") bfu_tree = Tree(bfu_tree_fname) map_fname = os.path.join(self.testfile_dir, "bid_tax_map.txt") self.expected_map = {} with open(map_fname) as inf: for line in inf: bid, rank_id, rdiff, brlen = line.strip().split("\t") self.expected_map[bid] = (rank_id, int(rdiff), float(brlen)) self.taxtree_helper.set_outgroup(self.expected_outgr) self.taxtree_helper.set_bf_unrooted_tree(bfu_tree) bid_tax_map = self.taxtree_helper.get_bid_taxonomy_map() self.assertEqual(len(bid_tax_map), 2 * len(bfu_tree) - 3) for bid in self.expected_map.iterkeys(): e_rec = self.expected_map[bid] rec = bid_tax_map[bid] self.assertEqual(e_rec[0], rec[0]) self.assertEqual(e_rec[1], rec[1]) self.assertAlmostEqual(e_rec[2], rec[2], 6)
def run_final_epa_test(self): self.reftree_outgroup = self.refjson.get_outgroup() pruned_reftree = self.prune_mislabels_from_tree( self.reftree, "reference") pruned_taxtree = self.prune_mislabels_from_tree( self.reftree, "taxonomic") # remove unifurcation at the root if len(pruned_reftree.children) == 1: pruned_reftree = pruned_reftree.children[0] self.mislabels = [] th = TaxTreeHelper(self.cfg, self.origin_taxonomy) th.set_mf_rooted_tree(pruned_taxtree) reftree_epalbl_str = None if self.cfg.final_jplace_fname: if os.path.isdir(self.cfg.final_jplace_fname): jplace_fmask = os.path.join(self.cfg.final_jplace_fname, '*.jplace') else: jplace_fmask = self.cfg.final_jplace_fname jplace_fname_list = glob.glob(jplace_fmask) placements = [] for jplace_fname in jplace_fname_list: jp = EpaJsonParser(jplace_fname) placements += jp.get_placement() if not reftree_epalbl_str: reftree_epalbl_str = jp.get_std_newick_tree() config.log.debug("Loaded %d final epa placements from %s\n", len(placements), jplace_fmask) else: epa_result = self.run_epa_once(pruned_reftree) reftree_epalbl_str = epa_result.get_std_newick_tree() placements = epa_result.get_placement() # update branchid-taxonomy mapping to account for possible changes in branch numbering reftree_tax = Tree(reftree_epalbl_str) th.set_bf_unrooted_tree(reftree_tax) bid_tax_map = th.get_bid_taxonomy_map() self.write_bid_tax_map(bid_tax_map, final=True) cl = TaxClassifyHelper(self.cfg, bid_tax_map, self.rate, self.node_height) # newtax_fname = self.cfg.subst_name("newtax_%NAME%.tre") # th.get_tax_tree().write(outfile=newtax_fname, format=3) final_ass = {} for place in placements: seq_name = place["n"][0] # get original taxonomic label orig_ranks = self.taxtree_helper.get_seq_ranks_from_tree(seq_name) # EXPERIMENTAL FEATURE - disabled for now! # It could happen that certain ranks were present in the "original" reference tree, but # are completely missing in the pruned tree (e.g., all seqs of a species were considered "suspicious" # after the leave-one-out test and thus pruned) # In this case, EPA has no chance to infer full original taxonomic annotation (=species) since the corresponding clade # is now missing. To account for this fact, we amend the original taxonomic annotation and set ranks missing from # pruned tree to "Undefined". # orig_ranks = th.strip_missing_ranks(orig_ranks) # print orig_ranks # get EPA tax label ranks, lws = cl.classify_seq(place["p"]) final_ass[seq_name] = (ranks, lws) #print seq_name, ": ", orig_ranks, "--->", ranks # check if they match mis_rec = self.check_seq_tax_labels(seq_name, orig_ranks, ranks, lws) self.write_assignments(final_ass, final=True)
def run_final_epa_test(self): self.reftree_outgroup = self.refjson.get_outgroup() tmp_reftree = self.reftree.copy(method="newick") name2refnode = {} for leaf in tmp_reftree.iter_leaves(): name2refnode[leaf.name] = leaf tmp_taxtree = self.tax_tree.copy(method="newick") name2taxnode = {} for leaf in tmp_taxtree.iter_leaves(): name2taxnode[leaf.name] = leaf for mis_rec in self.mislabels: rname = mis_rec['name'] # rname = EpacConfig.REF_SEQ_PREFIX + name if rname in name2refnode: name2refnode[rname].delete() else: print "Node not found in the reference tree: %s" % rname if rname in name2taxnode: name2taxnode[rname].delete() else: print "Node not found in the taxonomic tree: %s" % rname # remove unifurcation at the root if len(tmp_reftree.children) == 1: tmp_reftree = tmp_reftree.children[0] self.mislabels = [] th = TaxTreeHelper(self.cfg, self.origin_taxonomy) th.set_mf_rooted_tree(tmp_taxtree) epa_result = self.run_epa_once(tmp_reftree) reftree_epalbl_str = epa_result.get_std_newick_tree() placements = epa_result.get_placement() # update branchid-taxonomy mapping to account for possible changes in branch numbering reftree_tax = Tree(reftree_epalbl_str) th.set_bf_unrooted_tree(reftree_tax) bid_tax_map = th.get_bid_taxonomy_map() self.write_bid_tax_map(bid_tax_map, final=True) cl = TaxClassifyHelper(self.cfg, bid_tax_map, self.rate, self.node_height) # newtax_fname = self.cfg.subst_name("newtax_%NAME%.tre") # th.get_tax_tree().write(outfile=newtax_fname, format=3) final_ass = {} for place in placements: seq_name = place["n"][0] # get original taxonomic label orig_ranks = self.taxtree_helper.get_seq_ranks_from_tree(seq_name) # EXPERIMENTAL FEATURE - disabled for now! # It could happen that certain ranks were present in the "original" reference tree, but # are completely missing in the pruned tree (e.g., all seqs of a species were considered "suspicious" # after the leave-one-out test and thus pruned) # In this case, EPA has no chance to infer full original taxonomic annotation (=species) since the corresponding clade # is now missing. To account for this fact, we amend the original taxonomic annotation and set ranks missing from # pruned tree to "Undefined". # orig_ranks = th.strip_missing_ranks(orig_ranks) # print orig_ranks # get EPA tax label ranks, lws = cl.classify_seq(place["p"]) final_ass[seq_name] = (ranks, lws) #print seq_name, ": ", orig_ranks, "--->", ranks # check if they match mis_rec = self.check_seq_tax_labels(seq_name, orig_ranks, ranks, lws) self.write_assignments(final_ass, final=True)