Ejemplo n.º 1
0
 def test_outgroup(self):
     """Check that the helper reports the expected outgroup leaves.

     The tree stored in ``taxtree.nw`` is handed to the helper via
     ``set_mf_rooted_tree``; the leaf names of the outgroup it returns must
     equal the leaf names of ``self.expected_outgr``.
     """
     rooted_tree = Tree(os.path.join(self.testfile_dir, "taxtree.nw"))
     self.taxtree_helper.set_mf_rooted_tree(rooted_tree)

     actual_leaves = self.taxtree_helper.get_outgroup().get_leaf_names()
     expected_leaves = self.expected_outgr.get_leaf_names()
     self.assertEqual(actual_leaves, expected_leaves)
Ejemplo n.º 2
0
    def run_epa(self):
        """Place the query sequences onto the reference tree using RAxML-EPA.

        The reference tree and the binary model are exported from
        ``self.refjson`` into temporary files, the auto settings are resolved
        from the number of leaves in the reference tree, and EPA is run on the
        reduced alignment. Afterwards ``copy_epa_jplace`` is called with
        ``move=True`` for ``self.out_jplace_fname``.

        Side effects: sets ``self.reftree_size`` and, when
        ``self.cfg.epa_load_optmod`` is set, overwrites
        ``self.cfg.raxml_model``.

        Returns:
            The value returned by ``RaxmlWrapper.run_epa`` for this job.
        """
        self.cfg.log.info(
            "Running RAxML-EPA to place %d query sequences...\n" %
            self.query_count)
        # Use the configuration owned by this instance; the bare name
        # ``config`` is not defined inside this method and would only resolve
        # if a module-level global of that name happened to exist.
        raxml = RaxmlWrapper(self.cfg)
        reftree_fname = self.cfg.tmp_fname("ref_%NAME%.tre")
        self.refjson.get_raxml_readable_tree(reftree_fname)
        optmod_fname = self.cfg.tmp_fname("%NAME%.opt")
        self.refjson.get_binary_model(optmod_fname)
        job_name = self.cfg.subst_name("epa_%NAME%")

        # Parse the tree in memory as well, only to count its leaves.
        reftree = Tree(self.refjson.get_raxml_readable_tree())
        self.reftree_size = len(reftree.get_leaves())

        # IMPORTANT: set EPA heuristic rate based on tree size!
        self.cfg.resolve_auto_settings(self.reftree_size)
        # If we're loading the pre-optimized model, we MUST set the same
        # rate het. mode as in the ref file
        if self.cfg.epa_load_optmod:
            self.cfg.raxml_model = self.refjson.get_ratehet_model()

        reduced_align_fname = raxml.reduce_alignment(self.epa_alignment)

        jp = raxml.run_epa(job_name, reduced_align_fname, reftree_fname,
                           optmod_fname)

        raxml.copy_epa_jplace(job_name, self.out_jplace_fname, move=True)

        return jp
Ejemplo n.º 3
0
    def load_refjson(self, refjson_fname):
        """Load the reference JSON file and set up all state derived from it.

        Parses and validates ``refjson_fname``, copies the values exposed by
        the parser onto ``self`` / ``self.cfg``, makes sure a branch-id to
        taxonomy map is available (rebuilding it when the file does not
        provide one), and creates the helper objects used later on.

        Args:
            refjson_fname: path of the reference JSON file to load.
        """
        try:
            self.refjson = RefJsonParser(refjson_fname)
        except ValueError:
            self.cfg.exit_user_error("ERROR: Invalid json file format!")

        # Validate the input JSON before reading anything from it.
        is_valid, err_msg = self.refjson.validate()
        if not is_valid:
            self.cfg.log.error(
                "ERROR: Parsing reference JSON file failed:\n%s", err_msg)
            self.cfg.exit_user_error()

        ref = self.refjson
        self.rate = ref.get_rate()
        self.node_height = ref.get_node_height()
        self.origin_taxonomy = ref.get_origin_taxonomy()
        self.tax_tree = ref.get_tax_tree()
        self.cfg.compress_patterns = ref.get_pattern_compression()

        self.bid_taxonomy_map = ref.get_branch_tax_map()
        if not self.bid_taxonomy_map:
            # Old file format (before 1.6): the map is not stored in the
            # file, so it has to be rebuilt from scratch.
            map_builder = TaxTreeHelper(self.cfg, self.origin_taxonomy)
            map_builder.set_mf_rooted_tree(self.tax_tree)
            map_builder.set_bf_unrooted_tree(ref.get_reftree())
            self.bid_taxonomy_map = map_builder.get_bid_taxonomy_map()

        self.write_bid_tax_map(self.bid_taxonomy_map, final=False)

        self.reftree = Tree(ref.get_raxml_readable_tree())
        self.reftree_size = len(self.reftree.get_leaves())

        # IMPORTANT: set EPA heuristic rate based on tree size!
        self.cfg.resolve_auto_settings(self.reftree_size)
        # When loading the pre-optimized model, the rate heterogeneity mode
        # MUST be the same as in the reference file.
        if self.cfg.epa_load_optmod:
            self.cfg.raxml_model = ref.get_ratehet_model()

        self.classify_helper = TaxClassifyHelper(
            self.cfg, self.bid_taxonomy_map, self.rate, self.node_height)
        self.taxtree_helper = TaxTreeHelper(
            self.cfg, self.origin_taxonomy, self.tax_tree)

        self.tax_code = TaxCode(ref.get_taxcode())

        self.taxonomy = Taxonomy(prefix=EpacConfig.REF_SEQ_PREFIX,
                                 tax_map=self.origin_taxonomy)
        self.tax_common_ranks = self.taxonomy.get_common_ranks()

        # One zero-initialised counter per taxonomic level.
        level_count = TaxCode.UNI_TAX_LEVELS
        self.mislabels_cnt = [0] * level_count
        self.rank_mislabels_cnt = [0] * level_count
Ejemplo n.º 4
0
    def setUp(self):
        """Create the fixtures shared by the tests of this case.

        Sets the test file directory, loads the ``test_clean.tax`` taxonomy,
        builds a ``TaxTreeHelper`` from its map and reads the expected
        outgroup tree from ``outgroup.nw``.
        """
        module_dir = os.path.dirname(os.path.abspath(__file__))
        self.testfile_dir = os.path.join(module_dir, "testfiles")

        self.tax_fname = os.path.join(self.testfile_dir, "test_clean.tax")
        self.taxonomy = Taxonomy(EpacConfig.REF_SEQ_PREFIX, self.tax_fname)
        seq_tax_map = self.taxonomy.get_map()
        self.taxtree_helper = TaxTreeHelper(EpacConfig(), seq_tax_map)

        self.expected_outgr = Tree(
            os.path.join(self.testfile_dir, "outgroup.nw"))
Ejemplo n.º 5
0
 def test_taxtree_builder(self):
     """Build a tree from ``test.tax`` and compare it with ``taxtree.nw``.

     The sequence ids returned by the builder must equal the keys of the
     taxonomy map, and the built tree must serialise (format 8) to the same
     string as the stored tree.
     """
     epac_cfg = EpacConfig()
     fixture_dir = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "testfiles")
     taxonomy = Taxonomy(EpacConfig.REF_SEQ_PREFIX,
                         os.path.join(fixture_dir, "test.tax"))
     reference_tree = Tree(os.path.join(fixture_dir, "taxtree.nw"), format=8)

     built_tree, built_seq_ids = TaxTreeBuilder(epac_cfg, taxonomy).build()

     self.assertEqual(built_seq_ids, taxonomy.get_map().keys())
     self.assertEqual(built_tree.write(format=8),
                      reference_tree.write(format=8))
Ejemplo n.º 6
0
    def epa_post_process(self):
        """Derive the taxonomic tree and branch map from the labelled tree.

        Parses ``self.reftree_lbl_str``, passes the tree to
        ``self.taxtree_helper`` and stores the helper's results in
        ``self.reftree_tax`` and ``self.bid_ranks_map``.

        When ``self.cfg.debug`` is set, three files are written as well: the
        taxonomic tree (``self.reftree_tax_fname``), the labelled tree string
        (``self.reftree_lbl_fname``) and a tab-separated dump of the branch
        map (``self.brmap_fname``).
        """
        lbl_tree = Tree(self.reftree_lbl_str)
        self.taxtree_helper.set_bf_unrooted_tree(lbl_tree)
        self.reftree_tax = self.taxtree_helper.get_tax_tree()
        self.bid_ranks_map = self.taxtree_helper.get_bid_taxonomy_map()

        if not self.cfg.debug:
            return

        self.reftree_tax.write(outfile=self.reftree_tax_fname, format=3)
        with open(self.reftree_lbl_fname, "w") as outf:
            outf.write(self.reftree_lbl_str)
        with open(self.brmap_fname, "w") as outf:
            # items() exists on both Python 2 and Python 3 dicts, whereas
            # iteritems() is Python-2-only.
            for bid, br_rec in self.bid_ranks_map.items():
                outf.write("%s\t%s\t%d\t%f\n" %
                           (bid, br_rec[0], br_rec[1], br_rec[2]))
Ejemplo n.º 7
0
 def test_jplace_read(self):
     """Parse ``test.jplace`` and sanity-check its tree and placements.

     Verifies the reported RAxML version, the length of the parsed tree, the
     number of placements, and for every placement that its first name is
     not contained in the tree and that each edge record carries a branch
     index and weight within the expected bounds.
     """
     parser = EpaJsonParser(os.path.join(self.testfile_dir, "test.jplace"))
     self.assertEqual(parser.get_raxml_version(), "8.2.3")

     tree = Tree(parser.get_tree())
     tree_len = len(tree)
     self.assertEqual(tree_len, 32)
     self.assertEqual(len(parser.get_placement()), 6)

     # Exclusive upper bound for valid branch indices.
     branch_limit = tree_len * 2 - 3
     for placement in parser.get_placement():
         self.assertFalse(placement["n"][0] in tree)
         self.assertTrue(len(placement["p"]) > 0)
         for edge in placement["p"]:
             branch_id = int(edge[0])
             weight = edge[2]
             self.assertTrue(0 <= branch_id < branch_limit)
             self.assertTrue(0.0 <= weight <= 1.0)
Ejemplo n.º 8
0
    def test_branch_labeling(self):
        """Compare the helper's branch map with the one in ``bid_tax_map.txt``.

        Each non-empty line of the map file holds four tab-separated fields
        (branch id, rank id, integer rank difference, float branch length).
        After setting the outgroup and the tree from ``resolved_tree.nw`` on
        the helper, the computed map must have ``2 * len(tree) - 3`` entries
        and must agree with every expected record (branch length compared to
        6 places).
        """
        bfu_tree_fname = os.path.join(self.testfile_dir, "resolved_tree.nw")
        bfu_tree = Tree(bfu_tree_fname)
        map_fname = os.path.join(self.testfile_dir, "bid_tax_map.txt")
        self.expected_map = {}
        with open(map_fname) as inf:
            for line in inf:
                line = line.strip()
                if not line:
                    # Tolerate blank lines instead of failing on unpacking.
                    continue
                bid, rank_id, rdiff, brlen = line.split("\t")
                self.expected_map[bid] = (rank_id, int(rdiff), float(brlen))

        self.taxtree_helper.set_outgroup(self.expected_outgr)
        self.taxtree_helper.set_bf_unrooted_tree(bfu_tree)
        bid_tax_map = self.taxtree_helper.get_bid_taxonomy_map()
        self.assertEqual(len(bid_tax_map), 2 * len(bfu_tree) - 3)
        # items() works on Python 2 and 3 alike (iterkeys() is Python-2-only)
        # and avoids a second lookup per key.
        for bid, e_rec in self.expected_map.items():
            rec = bid_tax_map[bid]
            self.assertEqual(e_rec[0], rec[0])
            self.assertEqual(e_rec[1], rec[1])
            self.assertAlmostEqual(e_rec[2], rec[2], 6)
Ejemplo n.º 9
0
    def run_final_epa_test(self):
        """Run the final EPA test on the trees with mislabels pruned.

        Steps:
          1. Prune the reference tree and the taxonomic tree via
             ``prune_mislabels_from_tree`` and reset ``self.mislabels``.
          2. Obtain placements: from the jplace file(s) named by
             ``self.cfg.final_jplace_fname`` (a directory is expanded to
             ``*.jplace``) when that setting is non-empty, otherwise from a
             fresh ``run_epa_once`` call on the pruned reference tree.
          3. Rebuild the branch-id to taxonomy map for the EPA-labelled tree
             and write it with ``final=True``.
          4. Classify every placement, compare the result against the
             original ranks via ``check_seq_tax_labels`` and write all
             assignments with ``final=True``.
        """
        self.reftree_outgroup = self.refjson.get_outgroup()

        pruned_reftree = self.prune_mislabels_from_tree(
            self.reftree, "reference")
        # The taxonomic tree must be derived from self.tax_tree: it is passed
        # to set_mf_rooted_tree() below, which is fed the taxonomic tree
        # everywhere else. Pruning self.reftree a second time here would hand
        # the helper the wrong tree.
        pruned_taxtree = self.prune_mislabels_from_tree(
            self.tax_tree, "taxonomic")

        # remove unifurcation at the root
        if len(pruned_reftree.children) == 1:
            pruned_reftree = pruned_reftree.children[0]

        self.mislabels = []

        th = TaxTreeHelper(self.cfg, self.origin_taxonomy)
        th.set_mf_rooted_tree(pruned_taxtree)

        reftree_epalbl_str = None
        if self.cfg.final_jplace_fname:
            if os.path.isdir(self.cfg.final_jplace_fname):
                jplace_fmask = os.path.join(self.cfg.final_jplace_fname,
                                            '*.jplace')
            else:
                jplace_fmask = self.cfg.final_jplace_fname

            # NOTE(review): if nothing matches the mask, placements stays
            # empty and reftree_epalbl_str stays None -- confirm whether this
            # should be reported as a user error.
            placements = []
            for jplace_fname in glob.glob(jplace_fmask):
                jp = EpaJsonParser(jplace_fname)
                placements += jp.get_placement()
                # The labelled tree is taken from the first file only.
                if not reftree_epalbl_str:
                    reftree_epalbl_str = jp.get_std_newick_tree()

            # Log through the instance configuration rather than a bare
            # ``config`` name that is not defined in this method.
            self.cfg.log.debug("Loaded %d final epa placements from %s\n",
                               len(placements), jplace_fmask)
        else:
            epa_result = self.run_epa_once(pruned_reftree)
            reftree_epalbl_str = epa_result.get_std_newick_tree()
            placements = epa_result.get_placement()

        # update branchid-taxonomy mapping to account for possible changes in
        # branch numbering
        reftree_tax = Tree(reftree_epalbl_str)
        th.set_bf_unrooted_tree(reftree_tax)
        bid_tax_map = th.get_bid_taxonomy_map()

        self.write_bid_tax_map(bid_tax_map, final=True)

        cl = TaxClassifyHelper(self.cfg, bid_tax_map, self.rate,
                               self.node_height)

        final_ass = {}
        for place in placements:
            seq_name = place["n"][0]

            # get original taxonomic label
            orig_ranks = self.taxtree_helper.get_seq_ranks_from_tree(seq_name)

            # EXPERIMENTAL FEATURE - disabled for now!
            # It could happen that certain ranks were present in the
            # "original" reference tree, but are completely missing in the
            # pruned tree (e.g., all seqs of a species were considered
            # "suspicious" after the leave-one-out test and thus pruned).
            # In this case, EPA has no chance to infer full original
            # taxonomic annotation (=species) since the corresponding clade
            # is now missing. To account for this fact, the original
            # annotation would be amended by setting ranks missing from the
            # pruned tree to "Undefined" (th.strip_missing_ranks).

            # get EPA tax label
            ranks, lws = cl.classify_seq(place["p"])
            final_ass[seq_name] = (ranks, lws)

            # check if they match (the return value is not needed here)
            self.check_seq_tax_labels(seq_name, orig_ranks, ranks, lws)

        self.write_assignments(final_ass, final=True)
Ejemplo n.º 10
0
    def run_final_epa_test(self):
        """Run the final EPA test on copies of the trees without mislabels.

        Copies the reference and taxonomic trees, deletes from both copies
        every leaf named in ``self.mislabels`` (printing a message for names
        that are not found), runs EPA once on the pruned reference tree,
        rebuilds the branch-id to taxonomy map for the EPA-labelled tree,
        classifies every placement, compares it against the original ranks
        via ``check_seq_tax_labels`` and writes the map and the assignments
        with ``final=True``.
        """
        self.reftree_outgroup = self.refjson.get_outgroup()

        # Work on copies so the original trees stay untouched; index the
        # leaves by name for direct lookup.
        tmp_reftree = self.reftree.copy(method="newick")
        name2refnode = dict(
            (leaf.name, leaf) for leaf in tmp_reftree.iter_leaves())

        tmp_taxtree = self.tax_tree.copy(method="newick")
        name2taxnode = dict(
            (leaf.name, leaf) for leaf in tmp_taxtree.iter_leaves())

        for mislabel in self.mislabels:
            rname = mislabel['name']

            # Single-argument parenthesised print behaves the same on
            # Python 2 and is valid syntax on Python 3.
            if rname in name2refnode:
                name2refnode[rname].delete()
            else:
                print("Node not found in the reference tree: %s" % rname)

            if rname in name2taxnode:
                name2taxnode[rname].delete()
            else:
                print("Node not found in the taxonomic tree: %s" % rname)

        # remove unifurcation at the root
        if len(tmp_reftree.children) == 1:
            tmp_reftree = tmp_reftree.children[0]

        self.mislabels = []

        th = TaxTreeHelper(self.cfg, self.origin_taxonomy)
        th.set_mf_rooted_tree(tmp_taxtree)

        epa_result = self.run_epa_once(tmp_reftree)

        reftree_epalbl_str = epa_result.get_std_newick_tree()
        placements = epa_result.get_placement()

        # update branchid-taxonomy mapping to account for possible changes in
        # branch numbering
        reftree_tax = Tree(reftree_epalbl_str)
        th.set_bf_unrooted_tree(reftree_tax)
        bid_tax_map = th.get_bid_taxonomy_map()

        self.write_bid_tax_map(bid_tax_map, final=True)

        cl = TaxClassifyHelper(self.cfg, bid_tax_map, self.rate,
                               self.node_height)

        final_ass = {}
        for place in placements:
            seq_name = place["n"][0]

            # get original taxonomic label
            orig_ranks = self.taxtree_helper.get_seq_ranks_from_tree(seq_name)

            # EXPERIMENTAL FEATURE - disabled for now!
            # It could happen that certain ranks were present in the
            # "original" reference tree, but are completely missing in the
            # pruned tree (e.g., all seqs of a species were considered
            # "suspicious" after the leave-one-out test and thus pruned).
            # In this case, EPA has no chance to infer full original
            # taxonomic annotation (=species) since the corresponding clade
            # is now missing. To account for this fact, the original
            # annotation would be amended by setting ranks missing from the
            # pruned tree to "Undefined" (th.strip_missing_ranks).

            # get EPA tax label
            ranks, lws = cl.classify_seq(place["p"])
            final_ass[seq_name] = (ranks, lws)

            # check if they match (the return value is not needed here)
            self.check_seq_tax_labels(seq_name, orig_ranks, ranks, lws)

        self.write_assignments(final_ass, final=True)