Example #1
0
    def run_final_epa_test(self):
        """Classify placed sequences against the mislabel-pruned reference tree.

        Steps, in order:
          1. Store the outgroup from ``self.refjson`` in ``self.reftree_outgroup``.
          2. Build two pruned trees via ``self.prune_mislabels_from_tree``.
          3. Reset ``self.mislabels`` to an empty list.
          4. Obtain placements and the EPA-labelled tree: from the jplace
             file(s) given by ``self.cfg.final_jplace_fname`` when that option
             is set (a directory is expanded to ``<dir>/*.jplace``), otherwise
             from ``self.run_epa_once``.
          5. Rebuild the branch-id -> taxonomy map from the EPA-labelled tree
             and write it with ``self.write_bid_tax_map(..., final=True)``.
          6. Classify every placement, compare the result with the original
             ranks through ``self.check_seq_tax_labels`` and write all
             assignments with ``self.write_assignments(..., final=True)``.

        Returns nothing; results are delivered through the calls above.
        """
        self.reftree_outgroup = self.refjson.get_outgroup()

        # Pruning has to happen before self.mislabels is reset below.
        pruned_reftree = self.prune_mislabels_from_tree(self.reftree, "reference")
        # NOTE(review): the "taxonomic" tree is pruned from self.reftree as well --
        # confirm this is intended and not meant to start from a separate taxonomy tree.
        pruned_taxtree = self.prune_mislabels_from_tree(self.reftree, "taxonomic")

        # remove unifurcation at the root
        if len(pruned_reftree.children) == 1:
            pruned_reftree = pruned_reftree.children[0]

        # Start the final pass with an empty list
        # (presumably refilled by check_seq_tax_labels -- verify).
        self.mislabels = []

        th = TaxTreeHelper(self.cfg, self.origin_taxonomy)
        th.set_mf_rooted_tree(pruned_taxtree)

        reftree_epalbl_str = None
        if self.cfg.final_jplace_fname:
            # Re-use precomputed placements: either every *.jplace file in a
            # directory, or whatever the given path/mask matches.
            if os.path.isdir(self.cfg.final_jplace_fname):
                jplace_fmask = os.path.join(self.cfg.final_jplace_fname, '*.jplace')
            else:
                jplace_fmask = self.cfg.final_jplace_fname

            # NOTE(review): if the mask matches no file, reftree_epalbl_str
            # stays None and placements stays empty -- confirm Tree(None)
            # below is acceptable in that case.
            placements = []
            for jplace_fname in glob.glob(jplace_fmask):
                jp = EpaJsonParser(jplace_fname)
                placements += jp.get_placement()
                # The tree is taken from the first file only.
                if not reftree_epalbl_str:
                    reftree_epalbl_str = jp.get_std_newick_tree()

            config.log.debug("Loaded %d final epa placements from %s\n", len(placements), jplace_fmask)
        else:
            epa_result = self.run_epa_once(pruned_reftree)
            reftree_epalbl_str = epa_result.get_std_newick_tree()
            placements = epa_result.get_placement()

        # update branchid-taxonomy mapping to account for possible changes in branch numbering
        reftree_tax = Tree(reftree_epalbl_str)
        th.set_bf_unrooted_tree(reftree_tax)
        bid_tax_map = th.get_bid_taxonomy_map()

        self.write_bid_tax_map(bid_tax_map, final=True)

        cl = TaxClassifyHelper(self.cfg, bid_tax_map, self.rate, self.node_height)

        final_ass = {}
        for place in placements:
            # Only the first name of the placement's "n" entry is used.
            seq_name = place["n"][0]

            # get original taxonomic label
            orig_ranks = self.taxtree_helper.get_seq_ranks_from_tree(seq_name)

            # EXPERIMENTAL FEATURE - disabled for now!
            # It could happen that certain ranks were present in the "original" reference tree, but
            # are completely missing in the pruned tree (e.g., all seqs of a species were considered
            # "suspicious" after the leave-one-out test and thus pruned).
            # In this case, EPA has no chance to infer full original taxonomic annotation (=species)
            # since the corresponding clade is now missing. To account for this fact, the original
            # taxonomic annotation would be amended by setting ranks missing from the pruned tree
            # to "Undefined" (th.strip_missing_ranks(orig_ranks)).

            # get EPA tax label
            ranks, lws = cl.classify_seq(place["p"])
            final_ass[seq_name] = (ranks, lws)

            # check if they match; the return value is not needed here
            self.check_seq_tax_labels(seq_name, orig_ranks, ranks, lws)

        self.write_assignments(final_ass, final=True)
Example #2
0
    def run_final_epa_test(self):
        """Classify placed sequences against the mislabel-pruned reference tree.

        The method stores the outgroup from ``self.refjson``, builds two
        pruned trees with ``self.prune_mislabels_from_tree``, resets
        ``self.mislabels`` and then obtains EPA placements together with the
        EPA-labelled tree. Placements are loaded from the jplace file(s) named
        by ``self.cfg.final_jplace_fname`` when that option is set (a
        directory is expanded to ``<dir>/*.jplace``); otherwise they come from
        ``self.run_epa_once``.

        The branch-id -> taxonomy map is rebuilt from the EPA-labelled tree
        and written with ``self.write_bid_tax_map(..., final=True)``. Each
        placement is then classified, compared with the original ranks via
        ``self.check_seq_tax_labels``, and all assignments are written with
        ``self.write_assignments(..., final=True)``.

        Returns nothing; results are delivered through the calls above.
        """
        self.reftree_outgroup = self.refjson.get_outgroup()

        # Pruning has to happen before self.mislabels is reset below.
        pruned_reftree = self.prune_mislabels_from_tree(
            self.reftree, "reference")
        # NOTE(review): the "taxonomic" tree is pruned from self.reftree as
        # well -- confirm this is intended and not meant to start from a
        # separate taxonomy tree.
        pruned_taxtree = self.prune_mislabels_from_tree(
            self.reftree, "taxonomic")

        # remove unifurcation at the root
        if len(pruned_reftree.children) == 1:
            pruned_reftree = pruned_reftree.children[0]

        # Start the final pass with an empty list
        # (presumably refilled by check_seq_tax_labels -- verify).
        self.mislabels = []

        th = TaxTreeHelper(self.cfg, self.origin_taxonomy)
        th.set_mf_rooted_tree(pruned_taxtree)

        reftree_epalbl_str = None
        if self.cfg.final_jplace_fname:
            # Re-use precomputed placements: either every *.jplace file in a
            # directory, or whatever the given path/mask matches.
            if os.path.isdir(self.cfg.final_jplace_fname):
                jplace_fmask = os.path.join(self.cfg.final_jplace_fname,
                                            '*.jplace')
            else:
                jplace_fmask = self.cfg.final_jplace_fname

            # NOTE(review): if the mask matches no file, reftree_epalbl_str
            # stays None and placements stays empty -- confirm Tree(None)
            # below is acceptable in that case.
            placements = []
            for jplace_fname in glob.glob(jplace_fmask):
                jp = EpaJsonParser(jplace_fname)
                placements += jp.get_placement()
                # The tree is taken from the first file only.
                if not reftree_epalbl_str:
                    reftree_epalbl_str = jp.get_std_newick_tree()

            config.log.debug("Loaded %d final epa placements from %s\n",
                             len(placements), jplace_fmask)
        else:
            epa_result = self.run_epa_once(pruned_reftree)
            reftree_epalbl_str = epa_result.get_std_newick_tree()
            placements = epa_result.get_placement()

        # update branchid-taxonomy mapping to account for possible changes in
        # branch numbering
        reftree_tax = Tree(reftree_epalbl_str)
        th.set_bf_unrooted_tree(reftree_tax)
        bid_tax_map = th.get_bid_taxonomy_map()

        self.write_bid_tax_map(bid_tax_map, final=True)

        cl = TaxClassifyHelper(self.cfg, bid_tax_map, self.rate,
                               self.node_height)

        final_ass = {}
        for place in placements:
            # Only the first name of the placement's "n" entry is used.
            seq_name = place["n"][0]

            # get original taxonomic label
            orig_ranks = self.taxtree_helper.get_seq_ranks_from_tree(seq_name)

            # EXPERIMENTAL FEATURE - disabled for now!
            # It could happen that certain ranks were present in the
            # "original" reference tree, but are completely missing in the
            # pruned tree (e.g., all seqs of a species were considered
            # "suspicious" after the leave-one-out test and thus pruned).
            # In this case, EPA has no chance to infer full original taxonomic
            # annotation (=species) since the corresponding clade is now
            # missing. To account for this fact, the original taxonomic
            # annotation would be amended by setting ranks missing from the
            # pruned tree to "Undefined" (th.strip_missing_ranks(orig_ranks)).

            # get EPA tax label
            ranks, lws = cl.classify_seq(place["p"])
            final_ass[seq_name] = (ranks, lws)

            # check if they match; the return value is not needed here
            self.check_seq_tax_labels(seq_name, orig_ranks, ranks, lws)

        self.write_assignments(final_ass, final=True)