Example #1
0
class RefTreeBuilder:
    """Build a taxonomy-annotated reference tree and store it in a reference JSON file.

    build_ref_tree() is the entry point and runs the individual steps in order.
    Intermediate results are handed from step to step as instance attributes,
    so the step methods are only meaningful when called in that order.

    NOTE(review): write_json() reads self.invocation_epac, which is not assigned
    anywhere in this class -- presumably the caller sets it on the instance
    before calling build_ref_tree(); confirm.
    """

    def __init__(self, config):
        """Remember the configuration and derive job names and temp file names.

        config -- configuration object; its subst_name() and tmp_fname() methods
                  are called with templates containing the %NAME% placeholder
        """
        self.cfg = config

        # names of the RAxML jobs run by this builder
        job_name = config.subst_name
        self.mfresolv_job_name = job_name("mfresolv_%NAME%")
        self.epalbl_job_name = job_name("epalbl_%NAME%")
        self.optmod_job_name = job_name("optmod_%NAME%")
        self.raxml_wrapper = RaxmlWrapper(config)

        # temporary files shared between the build steps
        tmp = config.tmp_fname
        self.outgr_fname = tmp("%NAME%_outgr.tre")
        self.reftree_mfu_fname = tmp("%NAME%_mfu.tre")
        self.reftree_bfu_fname = tmp("%NAME%_bfu.tre")
        self.optmod_fname = tmp("%NAME%.opt")
        self.lblalign_fname = tmp("%NAME%_lblq.fa")
        self.reftree_lbl_fname = tmp("%NAME%_lbl.tre")
        self.reftree_tax_fname = tmp("%NAME%_tax.tre")
        self.brmap_fname = tmp("%NAME%_map.txt")

    def load_alignment(self):
        """Load the input alignment, guessing its format by trial and error.

        Each candidate format is tried in turn until SeqGroup accepts the file
        named by cfg.align_fname. The result is stored in self.input_seqs; if
        no format fits, cfg.exit_user_error() is called.
        """
        in_file = self.cfg.align_fname
        self.input_seqs = None
        formats = [
            "fasta", "phylip_relaxed", "iphylip_relaxed", "phylip", "iphylip"
        ]
        for fmt in formats:
            try:
                self.input_seqs = SeqGroup(sequences=in_file, format=fmt)
                break
            except Exception:
                # a failure here only means "wrong guess"; KeyboardInterrupt
                # and SystemExit must still propagate, hence no bare except
                self.cfg.log.debug("Guessing input format: not " + fmt)
        if self.input_seqs is None:
            self.cfg.exit_user_error(
                "Invalid input file format: %s\nThe supported input formats are fasta and phylip"
                % in_file)

    def validate_taxonomy(self):
        """Run InputValidator on the loaded taxonomy and alignment.

        The validator is kept in self.input_validator because later steps read
        its corr_seqid, corr_ranks and merged_ranks attributes.
        """
        validator = InputValidator(self.cfg, self.taxonomy, self.input_seqs)
        self.input_validator = validator
        validator.validate()

    def build_multif_tree(self):
        """Build the multifurcating reference tree from the taxonomy.

        Stores the tree (self.reftree_multif), the ids of the sequences it is
        built from (self.reftree_ids) and their number (self.reftree_size),
        then lets the configuration resolve its tree-size dependent settings.
        In debug mode the sequence ids and the tree are also dumped to
        temporary files.
        """
        cfg = self.cfg

        builder = TaxTreeBuilder(cfg, self.taxonomy)
        tree, seq_ids = builder.build(cfg.reftree_min_rank,
                                      cfg.reftree_max_seqs_per_leaf,
                                      cfg.reftree_clades_to_include,
                                      cfg.reftree_clades_to_ignore)
        self.reftree_ids = frozenset(seq_ids)
        self.reftree_size = len(seq_ids)
        self.reftree_multif = tree

        # IMPORTANT: select GAMMA or CAT model based on tree size!
        cfg.resolve_auto_settings(self.reftree_size)

        if not cfg.debug:
            return

        # list of sequence ids which comprise the reference tree
        refseq_fname = cfg.tmp_fname("%NAME%_seq_ids.txt")
        with open(refseq_fname, "w") as fout:
            fout.writelines("%s\n" % sid for sid in seq_ids)

        # original tree with taxonomic ranks as internal node labels
        reftax_fname = cfg.tmp_fname("%NAME%_mfu_tax.tre")
        tree.write(outfile=reftax_fname, format=8)

    def export_ref_alignment(self):
        """Write the reference alignment and release the input alignment.

        The input alignment is transformed in the following way:
           1. Add sequence name prefix (r_)
           2. Replace names listed in input_validator.corr_seqid by their mapped ids
           3. Filter out sequences which are not part of the reference tree
        """
        self.refalign_fname = self.cfg.tmp_fname("%NAME%_matrix.afa")
        with open(self.refalign_fname, "w") as fout:
            corrected_ids = self.input_validator.corr_seqid
            for name, seq, _comment, _sid in self.input_seqs.iter_entries():
                seq_name = EpacConfig.REF_SEQ_PREFIX + name
                seq_name = corrected_ids.get(seq_name, seq_name)
                if seq_name not in self.reftree_ids:
                    continue
                fout.write(">" + seq_name + "\n" + seq + "\n")

        # we do not need the original alignment anymore, so free its memory
        self.input_seqs = None

    def export_ref_taxonomy(self):
        """Restrict the taxonomy to the sequences of the reference tree.

        The result is stored in self.taxonomy_map (sequence id -> ranks). In
        debug mode the lineage string of every kept sequence is also written
        to a temporary file.
        """
        self.taxonomy_map = {
            sid: ranks
            for sid, ranks in self.taxonomy.iteritems()
            if sid in self.reftree_ids
        }

        if not self.cfg.debug:
            return

        tax_fname = self.cfg.tmp_fname("%NAME%_tax.txt")
        with open(tax_fname, "w") as fout:
            for sid in self.taxonomy_map:
                fout.write(sid + "\t" + self.taxonomy.seq_lineage_str(sid) + "\n")

    def save_rooting(self):
        """Save the outgroup for later re-rooting and write the tree for RAxML.

        The outgroup returned by TaxTreeHelper is written to self.outgr_fname
        and kept in self.reftree_outgroup; the multifurcating tree is written
        to self.reftree_mfu_fname.
        """
        tree = self.reftree_multif

        tax_map = self.taxonomy.get_map()
        helper = TaxTreeHelper(self.cfg, tax_map)
        self.taxtree_helper = helper
        helper.set_mf_rooted_tree(tree)

        outgroup = helper.get_outgroup()
        outgroup_size = len(outgroup.get_leaves())
        outgroup.write(outfile=self.outgr_fname, format=9)
        self.reftree_outgroup = outgroup
        self.cfg.log.debug(
            "Outgroup for rooting was saved to: %s, outgroup size: %d",
            self.outgr_fname, outgroup_size)

        # remove unifurcation at the root
        if len(tree.children) == 1:
            tree = tree.children[0]

        # now we can safely unroot the tree and remove internal node labels to make it suitable for raxml
        tree.write(outfile=self.reftree_mfu_fname, format=9)

    # RAxML call to convert multifurcating tree to the strictly bifurcating one
    def resolve_multif(self):
        """Resolve the multifurcating tree into a bifurcating one with RAxML.

        Steps:
          1. reduce the reference alignment (raxml_wrapper.reduce_alignment)
          2. run RAxML with the multifurcating tree passed via "-g", unless
             cfg.restart is set and a result of that job already exists
          3. either take tree and model parameters from that run, or (if
             cfg.reopt_model is set) run a second "-f e" job on the best tree
          4. store the log-likelihood of the final tree in self.reftree_loglh

        Results end up in self.reftree_bfu_fname and self.optmod_fname. A
        missing RAxML result is reported through cfg.exit_fatal_error().
        """
        self.cfg.log.debug("\nReducing the alignment: \n")
        self.reduced_refalign_fname = self.raxml_wrapper.reduce_alignment(
            self.refalign_fname)

        self.cfg.log.debug("\nConstrained ML inference: \n")
        raxml_params = [
            "-s", self.reduced_refalign_fname, "-g", self.reftree_mfu_fname,
            "--no-seq-check", "-N",
            str(self.cfg.rep_num)
        ]
        # the resolution method only changes which extra flags RAxML gets
        if self.cfg.mfresolv_method == "fast":
            raxml_params += ["-D"]
        elif self.cfg.mfresolv_method == "ultrafast":
            raxml_params += ["-f", "e"]
        if self.cfg.restart and self.raxml_wrapper.result_exists(
                self.mfresolv_job_name):
            # restart mode: reuse the finished job instead of re-running it
            self.invocation_raxml_multif = self.raxml_wrapper.get_invocation_str(
                self.mfresolv_job_name)
            self.cfg.log.debug(
                "\nUsing existing ML tree found in: %s\n",
                self.raxml_wrapper.result_fname(self.mfresolv_job_name))
        else:
            self.invocation_raxml_multif = self.raxml_wrapper.run(
                self.mfresolv_job_name, raxml_params)
            #            self.invocation_raxml_multif = self.raxml_wrapper.run_multiple(self.mfresolv_job_name, raxml_params, self.cfg.rep_num)
            if self.cfg.mfresolv_method == "ultrafast":
                # copy the result tree to the best-tree file name so that the
                # besttree_exists() check below finds it
                self.raxml_wrapper.copy_result_tree(
                    self.mfresolv_job_name,
                    self.raxml_wrapper.besttree_fname(self.mfresolv_job_name))

        if self.raxml_wrapper.besttree_exists(self.mfresolv_job_name):
            if not self.cfg.reopt_model:
                # take tree and model parameters straight from the first job
                self.raxml_wrapper.copy_best_tree(self.mfresolv_job_name,
                                                  self.reftree_bfu_fname)
                self.raxml_wrapper.copy_optmod_params(self.mfresolv_job_name,
                                                      self.optmod_fname)
                self.invocation_raxml_optmod = ""
                job_name = self.mfresolv_job_name
            else:
                bfu_fname = self.raxml_wrapper.besttree_fname(
                    self.mfresolv_job_name)
                job_name = self.optmod_job_name

                # RAxML call to optimize model parameters and write them down to the binary model file
                self.cfg.log.debug("\nOptimizing model parameters: \n")
                raxml_params = [
                    "-f", "e", "-s", self.reduced_refalign_fname, "-t",
                    bfu_fname, "--no-seq-check"
                ]
                if self.cfg.raxml_model.startswith(
                        "GTRCAT") and not self.cfg.compress_patterns:
                    raxml_params += ["-H"]
                if self.cfg.restart and self.raxml_wrapper.result_exists(
                        self.optmod_job_name):
                    self.invocation_raxml_optmod = self.raxml_wrapper.get_invocation_str(
                        self.optmod_job_name)
                    self.cfg.log.debug(
                        "\nUsing existing optimized tree and parameters found in: %s\n",
                        self.raxml_wrapper.result_fname(self.optmod_job_name))
                else:
                    self.invocation_raxml_optmod = self.raxml_wrapper.run(
                        self.optmod_job_name, raxml_params)
                if self.raxml_wrapper.result_exists(self.optmod_job_name):
                    self.raxml_wrapper.copy_result_tree(
                        self.optmod_job_name, self.reftree_bfu_fname)
                    self.raxml_wrapper.copy_optmod_params(
                        self.optmod_job_name, self.optmod_fname)
                else:
                    errmsg = "RAxML run failed (model optimization), please examine the log for details: %s" \
                            % self.raxml_wrapper.make_raxml_fname("output", self.optmod_job_name)
                    self.cfg.exit_fatal_error(errmsg)

            # job_name is whichever job produced the final tree
            if self.cfg.raxml_model.startswith("GTRCAT"):
                mod_name = "CAT"
            else:
                mod_name = "GAMMA"
            self.reftree_loglh = self.raxml_wrapper.get_tree_lh(
                job_name, mod_name)
            self.cfg.log.debug("\n%s-based logLH of the reference tree: %f\n" %
                               (mod_name, self.reftree_loglh))

        else:
            errmsg = "RAxML run failed (mutlifurcation resolution), please examine the log for details: %s" \
                    % self.raxml_wrapper.make_raxml_fname("output", self.mfresolv_job_name)
            self.cfg.exit_fatal_error(errmsg)

    def load_reduced_refalign(self):
        """Load the reduced reference alignment produced by resolve_multif().

        The file named by self.reduced_refalign_fname is parsed as "fasta" or
        "phylip_relaxed", whichever SeqGroup accepts first, and stored in
        self.reduced_refalign_seqs. If neither works, cfg.exit_fatal_error()
        is called.
        """
        # must be defined even if every format fails; otherwise the check
        # below raises AttributeError instead of reporting the real problem
        self.reduced_refalign_seqs = None
        formats = ["fasta", "phylip_relaxed"]
        for fmt in formats:
            try:
                self.reduced_refalign_seqs = SeqGroup(
                    sequences=self.reduced_refalign_fname, format=fmt)
                break
            except Exception:
                # wrong guess, try the next format (KeyboardInterrupt and
                # SystemExit are deliberately not caught)
                self.cfg.log.debug("Guessing reduced alignment format: not " +
                                   fmt)
        if self.reduced_refalign_seqs is None:
            errmsg = "FATAL ERROR: Invalid input file format in %s! (load_reduced_refalign)" % self.reduced_refalign_fname
            self.cfg.exit_fatal_error(errmsg)

    # dummy EPA run to label the branches of the reference tree, which we need to build a mapping to tax ranks
    def epa_branch_labeling(self):
        """Run RAxML-EPA with a dummy query to obtain a branch-labelled tree.

        Writes the reduced alignment plus one all-"A" dummy sequence to
        self.lblalign_fname, runs EPA on it and stores the labelled tree
        (self.reftree_lbl_str), the RAxML version (self.raxml_version) and
        the invocation string (self.invocation_raxml_epalbl). The alignment
        width is kept in self.refalign_width.
        """
        # create alignment with dummy query seq
        self.refalign_width = len(self.reduced_refalign_seqs.get_seqbyid(0))
        self.reduced_refalign_seqs.write(format="fasta",
                                         outfile=self.lblalign_fname)

        with open(self.lblalign_fname, "a") as fout:
            fout.write(">" + "DUMMY131313" + "\n")
            fout.write("A" * self.refalign_width + "\n")

        # TODO always load model regardless of the config file settings?
        epa_result = self.raxml_wrapper.run_epa(self.epalbl_job_name,
                                                self.lblalign_fname,
                                                self.reftree_bfu_fname,
                                                self.optmod_fname,
                                                mode="epa_mp")

        # check for failure BEFORE touching the result object, so that a
        # failed run is reported with this message rather than with whatever
        # error dereferencing an unusable result would raise
        if not self.raxml_wrapper.epa_result_exists(self.epalbl_job_name):
            errmsg = "RAxML EPA run failed, please examine the log for details: %s" \
                    % self.raxml_wrapper.make_raxml_fname("output", self.epalbl_job_name)
            self.cfg.exit_fatal_error(errmsg)

        self.reftree_lbl_str = epa_result.get_std_newick_tree()
        self.raxml_version = epa_result.get_raxml_version()
        self.invocation_raxml_epalbl = epa_result.get_raxml_invocation()

    def epa_post_process(self):
        """Turn the EPA-labelled tree into the taxonomic reference tree.

        Stores the tree returned by TaxTreeHelper in self.reftree_tax and the
        branch id -> record map in self.bid_ranks_map. In debug mode both
        trees and the map are written to temporary files.
        """
        lbl_tree = Tree(self.reftree_lbl_str)
        helper = self.taxtree_helper
        helper.set_bf_unrooted_tree(lbl_tree)
        self.reftree_tax = helper.get_tax_tree()
        self.bid_ranks_map = helper.get_bid_taxonomy_map()

        if not self.cfg.debug:
            return

        self.reftree_tax.write(outfile=self.reftree_tax_fname, format=3)
        with open(self.reftree_lbl_fname, "w") as outf:
            outf.write(self.reftree_lbl_str)
        with open(self.brmap_fname, "w") as outf:
            for bid, br_rec in self.bid_ranks_map.iteritems():
                outf.write("%s\t%s\t%d\t%f\n" %
                           (bid, br_rec[0], br_rec[1], br_rec[2]))

    def calc_node_heights(self):
        """Calculate node heights on the reference tree (used to define branch-length cutoff during classification step)
           Algorithm is as follows:
           Tip node or node resolved to Species level: height = 1
           Inner node resolved to Genus or above:      height = min(left_height, right_height) + 1

           The result (branch id -> height) is stored in self.node_height_map.
         """
        heights = {}
        dummy_added = False
        for node in self.reftree_tax.traverse("postorder"):
            if node.is_root():
                continue

            if hasattr(node, "B"):
                # NOTE(review): epa_post_process() formats the first three
                # fields of these records with %s/%d/%f, while here the last
                # field is taken as a rank -- confirm the record layout
                species_rank = self.bid_ranks_map[node.B][-1]
            elif not dummy_added:
                # In a rooted tree, there is always one more node/branch than in unrooted one
                # That's why one branch will be always not EPA-labelled after the rooting
                node.B = "DDD"
                dummy_added = True
                species_rank = Taxonomy.EMPTY_RANK
            else:
                errmsg = "FATAL ERROR: More than one tree branch without EPA label (calc_node_heights)"
                self.cfg.exit_fatal_error(errmsg)

            bid = node.B
            if node.is_leaf() or species_rank != Taxonomy.EMPTY_RANK:
                heights[bid] = 1
            else:
                # postorder traversal guarantees both children were visited
                left, right = node.children[0], node.children[1]
                heights[bid] = min(heights[left.B], heights[right.B]) + 1

        # remove heights for dummy nodes, since there won't be any placements on them
        if dummy_added:
            del heights["DDD"]

        self.node_height_map = heights

    def __get_all_rank_names(self, root):
        """Return the set of all entries found in node.ranks below (and including) root."""
        rank_names = set()
        for node in root.traverse("postorder"):
            rank_names.update(node.ranks)
        return rank_names

    def mono_index(self):
        """This method will calculate monophyly index by looking at the left and right hand side of the tree

        Returns the set of rank names that occur on both sides of the first
        bifurcation of self.reftree_tax, or an empty set (after printing an
        error) if the tree does not bifurcate there.
        """
        children = self.reftree_tax.children
        # skip over unifurcations at the top of the tree
        while len(children) == 1:
            children = children[0].children

        if len(children) != 2:
            print("Error: input tree not birfurcating")
            return set()

        left_names = self.__get_all_rank_names(children[0])
        right_names = self.__get_all_rank_names(children[1])
        return left_names & right_names

    def build_hmm_profile(self, json_builder):
        """Build a HMMER profile from the reduced alignment and hand it to json_builder.

        json_builder -- object whose set_hmm_profile() receives the profile
        """
        # call form of print: same output under Python 2, valid under Python 3,
        # and consistent with mono_index()
        print("Building the HMMER profile...\n")

        # this stupid workaround is needed because RAxML outputs the reduced
        # alignment in relaxed PHYLIP format, which is not supported by HMMER
        refalign_fasta = self.cfg.tmp_fname("%NAME%_ref_reduced.fa")
        self.reduced_refalign_seqs.write(outfile=refalign_fasta)

        hmm = hmmer(self.cfg, refalign_fasta)
        fprofile = hmm.build_hmm_profile()

        json_builder.set_hmm_profile(fprofile)

    def write_json(self):
        """Collect everything computed so far and dump it to cfg.refjson_fname."""
        jw = RefJsonBuilder()

        jw.set_branch_tax_map(self.bid_ranks_map)
        jw.set_tree(self.reftree_lbl_str)
        jw.set_outgroup(self.reftree_outgroup)
        jw.set_ratehet_model(self.cfg.raxml_model)
        jw.set_tax_tree(self.reftree_multif)
        jw.set_pattern_compression(self.cfg.compress_patterns)
        jw.set_taxcode(self.cfg.taxcode_name)

        validator = self.input_validator
        jw.set_merged_ranks_map(validator.merged_ranks)
        # the correction maps are stored with keys and values swapped
        corr_ranks_reverse = {
            value: key
            for key, value in validator.corr_ranks.items()
        }
        jw.set_corr_ranks_map(corr_ranks_reverse)
        corr_seqid_reverse = {
            value: key
            for key, value in validator.corr_seqid.items()
        }
        jw.set_corr_seqid_map(corr_seqid_reverse)

        mdata = {
            "ref_tree_size": self.reftree_size,
            "ref_alignment_width": self.refalign_width,
            "raxml_version": self.raxml_version,
            "timestamp": str(datetime.datetime.now()),
            "invocation_epac": self.invocation_epac,
            "invocation_raxml_multif": self.invocation_raxml_multif,
            "invocation_raxml_optmod": self.invocation_raxml_optmod,
            "invocation_raxml_epalbl": self.invocation_raxml_epalbl,
            "reftree_loglh": self.reftree_loglh
        }
        jw.set_metadata(mdata)

        seqs = self.reduced_refalign_seqs.get_entries()
        jw.set_sequences(seqs)

        if not self.cfg.no_hmmer:
            self.build_hmm_profile(jw)

        orig_tax = self.taxonomy_map
        jw.set_origin_taxonomy(orig_tax)

        self.cfg.log.debug("Calculating the speciation rate...\n")
        tp = tree_param(tree=self.reftree_lbl_str, origin_taxonomy=orig_tax)
        jw.set_rate(tp.get_speciation_rate_fast())
        jw.set_nodes_height(self.node_height_map)

        jw.set_binary_model(self.optmod_fname)

        self.cfg.log.debug("Writing down the reference file...\n")
        jw.dump(self.cfg.refjson_fname)

    # top-level function to build a reference tree
    def build_ref_tree(self):
        """Run the complete reference-building pipeline, logging each step.

        Every step relies on attributes set by the previous ones, so the order
        of the calls below must not be changed.
        """
        self.cfg.log.info("=> Loading taxonomy from file: %s ...\n",
                          self.cfg.taxonomy_fname)
        self.taxonomy = Taxonomy(prefix=EpacConfig.REF_SEQ_PREFIX,
                                 tax_fname=self.cfg.taxonomy_fname)
        self.cfg.log.info(
            "==> Loading reference alignment from file: %s ...\n",
            self.cfg.align_fname)
        self.load_alignment()
        self.cfg.log.info("===> Validating taxonomy and alignment ...\n")
        self.validate_taxonomy()
        self.cfg.log.info(
            "====> Building a multifurcating tree from taxonomy with %d seqs ...\n",
            self.taxonomy.seq_count())
        self.build_multif_tree()
        self.cfg.log.info("=====> Building the reference alignment ...\n")
        self.export_ref_alignment()
        self.export_ref_taxonomy()
        self.cfg.log.info(
            "======> Saving the outgroup for later re-rooting ...\n")
        self.save_rooting()
        self.cfg.log.info(
            "=======> Resolving multifurcation: choosing the best topology from %d independent RAxML runs ...\n"
            % self.cfg.rep_num)
        self.resolve_multif()
        self.load_reduced_refalign()
        self.cfg.log.info(
            "========> Calling RAxML-EPA to obtain branch labels ...\n")
        self.epa_branch_labeling()
        self.cfg.log.info(
            "=========> Post-processing the EPA tree (re-rooting, taxonomic labeling etc.) ...\n"
        )
        self.epa_post_process()
        self.calc_node_heights()

        self.cfg.log.debug("\n==========> Checking branch labels ...")
        self.cfg.log.debug("shared rank names before training: %s",
                           repr(self.taxonomy.get_common_ranks()))
        self.cfg.log.debug("shared rank names after  training: %s\n",
                           repr(self.mono_index()))

        self.cfg.log.info("==========> Saving the reference JSON file: %s\n" %
                          self.cfg.refjson_fname)
        self.write_json()
Example #2
0
class RefTreeBuilder:
    def __init__(self, config):
        """Remember the configuration and derive job names and temp file names.

        config -- configuration object; its subst_name() and tmp_fname() methods
                  are called with templates containing the %NAME% placeholder
        """
        self.cfg = config

        # names of the RAxML jobs run by this builder
        job_name = config.subst_name
        self.mfresolv_job_name = job_name("mfresolv_%NAME%")
        self.epalbl_job_name = job_name("epalbl_%NAME%")
        self.optmod_job_name = job_name("optmod_%NAME%")
        self.raxml_wrapper = RaxmlWrapper(config)

        # temporary files shared between the build steps
        tmp = config.tmp_fname
        self.outgr_fname = tmp("%NAME%_outgr.tre")
        self.reftree_mfu_fname = tmp("%NAME%_mfu.tre")
        self.reftree_bfu_fname = tmp("%NAME%_bfu.tre")
        self.optmod_fname = tmp("%NAME%.opt")
        self.lblalign_fname = tmp("%NAME%_lblq.fa")
        self.reftree_lbl_fname = tmp("%NAME%_lbl.tre")
        self.reftree_tax_fname = tmp("%NAME%_tax.tre")
        self.brmap_fname = tmp("%NAME%_map.txt")

    def load_alignment(self):
        """Load the input alignment, guessing its format by trial and error.

        Each candidate format is tried in turn until SeqGroup accepts the file
        named by cfg.align_fname. The result is stored in self.input_seqs; if
        no format fits, cfg.exit_user_error() is called.
        """
        in_file = self.cfg.align_fname
        self.input_seqs = None
        formats = ["fasta", "phylip_relaxed", "iphylip_relaxed", "phylip", "iphylip"]
        for fmt in formats:
            try:
                self.input_seqs = SeqGroup(sequences=in_file, format=fmt)
                break
            except Exception:
                # a failure here only means "wrong guess"; KeyboardInterrupt
                # and SystemExit must still propagate, hence no bare except
                self.cfg.log.debug("Guessing input format: not " + fmt)
        if self.input_seqs is None:
            self.cfg.exit_user_error("Invalid input file format: %s\nThe supported input formats are fasta and phylip" % in_file)
            
    def validate_taxonomy(self):
        """Run InputValidator on the loaded taxonomy and alignment.

        The validator is kept in self.input_validator; export_ref_alignment()
        later reads its corr_seqid attribute.
        """
        validator = InputValidator(self.cfg, self.taxonomy, self.input_seqs)
        self.input_validator = validator
        validator.validate()
        
    def build_multif_tree(self):
        """Build the multifurcating reference tree from the taxonomy.

        Stores the tree (self.reftree_multif), the ids of the sequences it is
        built from (self.reftree_ids) and their number (self.reftree_size),
        then lets the configuration resolve its tree-size dependent settings.
        In debug mode the sequence ids and the tree are also dumped to
        temporary files.
        """
        cfg = self.cfg

        builder = TaxTreeBuilder(cfg, self.taxonomy)
        tree, seq_ids = builder.build(cfg.reftree_min_rank,
                                      cfg.reftree_max_seqs_per_leaf,
                                      cfg.reftree_clades_to_include,
                                      cfg.reftree_clades_to_ignore)
        self.reftree_ids = frozenset(seq_ids)
        self.reftree_size = len(seq_ids)
        self.reftree_multif = tree

        # IMPORTANT: select GAMMA or CAT model based on tree size!
        cfg.resolve_auto_settings(self.reftree_size)

        if not cfg.debug:
            return

        # list of sequence ids which comprise the reference tree
        refseq_fname = cfg.tmp_fname("%NAME%_seq_ids.txt")
        with open(refseq_fname, "w") as fout:
            fout.writelines("%s\n" % sid for sid in seq_ids)

        # original tree with taxonomic ranks as internal node labels
        reftax_fname = cfg.tmp_fname("%NAME%_mfu_tax.tre")
        tree.write(outfile=reftax_fname, format=8)

    def export_ref_alignment(self):
        """Write the reference alignment and release the input alignment.

        The input alignment is transformed in the following way:
           1. Add sequence name prefix (r_)
           2. Replace names listed in input_validator.corr_seqid by their mapped ids
           3. Filter out sequences which are not part of the reference tree
        """
        self.refalign_fname = self.cfg.tmp_fname("%NAME%_matrix.afa")
        with open(self.refalign_fname, "w") as fout:
            corrected_ids = self.input_validator.corr_seqid
            for name, seq, _comment, _sid in self.input_seqs.iter_entries():
                seq_name = EpacConfig.REF_SEQ_PREFIX + name
                seq_name = corrected_ids.get(seq_name, seq_name)
                if seq_name not in self.reftree_ids:
                    continue
                fout.write(">" + seq_name + "\n" + seq + "\n")

        # we do not need the original alignment anymore, so free its memory
        self.input_seqs = None

    def export_ref_taxonomy(self):
        self.taxonomy_map = {}
        
        for sid, ranks in self.taxonomy.iteritems():
            if sid in self.reftree_ids:
                self.taxonomy_map[sid] = ranks
            
        if self.cfg.debug:
            tax_fname = self.cfg.tmp_fname("%NAME%_tax.txt")
            with open(tax_fname, "w") as fout:
                for sid, ranks in self.taxonomy_map.iteritems():
                    ranks_str = self.taxonomy.seq_lineage_str(sid) 
                    fout.write(sid + "\t" + ranks_str + "\n")   

    def save_rooting(self):
        """Save the outgroup for later re-rooting and write the tree for RAxML.

        The outgroup returned by TaxTreeHelper is written to self.outgr_fname
        and kept in self.reftree_outgroup; the multifurcating tree is written
        to self.reftree_mfu_fname.
        """
        tree = self.reftree_multif

        tax_map = self.taxonomy.get_map()
        helper = TaxTreeHelper(self.cfg, tax_map)
        self.taxtree_helper = helper
        helper.set_mf_rooted_tree(tree)

        outgroup = helper.get_outgroup()
        outgroup_size = len(outgroup.get_leaves())
        outgroup.write(outfile=self.outgr_fname, format=9)
        self.reftree_outgroup = outgroup
        self.cfg.log.debug("Outgroup for rooting was saved to: %s, outgroup size: %d", self.outgr_fname, outgroup_size)

        # remove unifurcation at the root
        if len(tree.children) == 1:
            tree = tree.children[0]

        # now we can safely unroot the tree and remove internal node labels to make it suitable for raxml
        tree.write(outfile=self.reftree_mfu_fname, format=9)

    # RAxML call to convert multifurcating tree to the strictly bifurcating one
    def resolve_multif(self):
        """Resolve the multifurcating tree into a bifurcating one with RAxML.

        Steps:
          1. reduce the reference alignment (raxml_wrapper.reduce_alignment)
          2. run RAxML with the multifurcating tree passed via "-g", unless
             cfg.restart is set and a result of that job already exists
          3. either take tree and model parameters from that run, or (if
             cfg.reopt_model is set) run a second "-f e" job on the best tree
          4. store the log-likelihood of the final tree in self.reftree_loglh

        Results end up in self.reftree_bfu_fname and self.optmod_fname. A
        missing RAxML result is reported through cfg.exit_fatal_error().
        """
        self.cfg.log.debug("\nReducing the alignment: \n")
        self.reduced_refalign_fname = self.raxml_wrapper.reduce_alignment(self.refalign_fname)
        
        self.cfg.log.debug("\nConstrained ML inference: \n")
        raxml_params = ["-s", self.reduced_refalign_fname, "-g", self.reftree_mfu_fname, "--no-seq-check", "-N", str(self.cfg.rep_num)] 
        # the resolution method only changes which extra flags RAxML gets
        if self.cfg.mfresolv_method  == "fast":
            raxml_params += ["-D"]
        elif self.cfg.mfresolv_method  == "ultrafast":
            raxml_params += ["-f", "e"]
        if self.cfg.restart and self.raxml_wrapper.result_exists(self.mfresolv_job_name):
            # restart mode: reuse the finished job instead of re-running it
            self.invocation_raxml_multif = self.raxml_wrapper.get_invocation_str(self.mfresolv_job_name)
            self.cfg.log.debug("\nUsing existing ML tree found in: %s\n", self.raxml_wrapper.result_fname(self.mfresolv_job_name))
        else:
            self.invocation_raxml_multif = self.raxml_wrapper.run(self.mfresolv_job_name, raxml_params)
#            self.invocation_raxml_multif = self.raxml_wrapper.run_multiple(self.mfresolv_job_name, raxml_params, self.cfg.rep_num)
            if self.cfg.mfresolv_method  == "ultrafast":
              # copy the result tree to the best-tree file name so that the
              # besttree_exists() check below finds it
              self.raxml_wrapper.copy_result_tree(self.mfresolv_job_name, self.raxml_wrapper.besttree_fname(self.mfresolv_job_name))
              
        if self.raxml_wrapper.besttree_exists(self.mfresolv_job_name):        
            if not self.cfg.reopt_model:
                # take tree and model parameters straight from the first job
                self.raxml_wrapper.copy_best_tree(self.mfresolv_job_name, self.reftree_bfu_fname)
                self.raxml_wrapper.copy_optmod_params(self.mfresolv_job_name, self.optmod_fname)
                self.invocation_raxml_optmod = ""
                job_name = self.mfresolv_job_name
            else:
                bfu_fname = self.raxml_wrapper.besttree_fname(self.mfresolv_job_name)
                job_name = self.optmod_job_name

                # RAxML call to optimize model parameters and write them down to the binary model file
                self.cfg.log.debug("\nOptimizing model parameters: \n")
                raxml_params = ["-f", "e", "-s", self.reduced_refalign_fname, "-t", bfu_fname, "--no-seq-check"]
                if self.cfg.raxml_model.startswith("GTRCAT") and not self.cfg.compress_patterns:
                    raxml_params +=  ["-H"]
                if self.cfg.restart and self.raxml_wrapper.result_exists(self.optmod_job_name):
                    # restart mode: reuse the finished optimization job
                    self.invocation_raxml_optmod = self.raxml_wrapper.get_invocation_str(self.optmod_job_name)
                    self.cfg.log.debug("\nUsing existing optimized tree and parameters found in: %s\n", self.raxml_wrapper.result_fname(self.optmod_job_name))
                else:
                    self.invocation_raxml_optmod = self.raxml_wrapper.run(self.optmod_job_name, raxml_params)
                if self.raxml_wrapper.result_exists(self.optmod_job_name):
                    self.raxml_wrapper.copy_result_tree(self.optmod_job_name, self.reftree_bfu_fname)
                    self.raxml_wrapper.copy_optmod_params(self.optmod_job_name, self.optmod_fname)
                else:
                    errmsg = "RAxML run failed (model optimization), please examine the log for details: %s" \
                            % self.raxml_wrapper.make_raxml_fname("output", self.optmod_job_name)
                    self.cfg.exit_fatal_error(errmsg)
                    
            # job_name is whichever job produced the final tree
            if self.cfg.raxml_model.startswith("GTRCAT"):
              mod_name = "CAT"
            else:
              mod_name = "GAMMA" 
            self.reftree_loglh = self.raxml_wrapper.get_tree_lh(job_name, mod_name)
            self.cfg.log.debug("\n%s-based logLH of the reference tree: %f\n" % (mod_name, self.reftree_loglh))

        else:
            errmsg = "RAxML run failed (mutlifurcation resolution), please examine the log for details: %s" \
                    % self.raxml_wrapper.make_raxml_fname("output", self.mfresolv_job_name)
            self.cfg.exit_fatal_error(errmsg)
            
    def load_reduced_refalign(self):
        formats = ["fasta", "phylip_relaxed"]
        for fmt in formats:
            try:
                self.reduced_refalign_seqs = SeqGroup(sequences=self.reduced_refalign_fname, format = fmt)
                break
            except:
                pass
        if self.reduced_refalign_seqs == None:
            errmsg = "FATAL ERROR: Invalid input file format in %s! (load_reduced_refalign)" % self.reduced_refalign_fname
            self.cfg.exit_fatal_error(errmsg)
    
    # dummy EPA run to label the branches of the reference tree, which we need to build a mapping to tax ranks    
    def epa_branch_labeling(self):
        # create alignment with dummy query seq
        self.refalign_width = len(self.reduced_refalign_seqs.get_seqbyid(0))
        self.reduced_refalign_seqs.write(format="fasta", outfile=self.lblalign_fname)
        
        with open(self.lblalign_fname, "a") as fout:
            fout.write(">" + "DUMMY131313" + "\n")        
            fout.write("A"*self.refalign_width + "\n")        
        
        # TODO always load model regardless of the config file settings?
        epa_result = self.raxml_wrapper.run_epa(self.epalbl_job_name, self.lblalign_fname, self.reftree_bfu_fname, self.optmod_fname, mode="epa_mp")
        self.reftree_lbl_str = epa_result.get_std_newick_tree()
        self.raxml_version = epa_result.get_raxml_version()
        self.invocation_raxml_epalbl = epa_result.get_raxml_invocation()

        if not self.raxml_wrapper.epa_result_exists(self.epalbl_job_name):        
            errmsg = "RAxML EPA run failed, please examine the log for details: %s" \
                    % self.raxml_wrapper.make_raxml_fname("output", self.epalbl_job_name)
            self.cfg.exit_fatal_error(errmsg)

    def epa_post_process(self):
        """Turn the EPA-labelled tree into the taxonomic reference tree.

        Stores the tree returned by TaxTreeHelper in self.reftree_tax and the
        branch id -> record map in self.bid_ranks_map. In debug mode both
        trees and the map are written to temporary files.
        """
        lbl_tree = Tree(self.reftree_lbl_str)
        helper = self.taxtree_helper
        helper.set_bf_unrooted_tree(lbl_tree)
        self.reftree_tax = helper.get_tax_tree()
        self.bid_ranks_map = helper.get_bid_taxonomy_map()

        if not self.cfg.debug:
            return

        self.reftree_tax.write(outfile=self.reftree_tax_fname, format=3)
        with open(self.reftree_lbl_fname, "w") as outf:
            outf.write(self.reftree_lbl_str)
        with open(self.brmap_fname, "w") as outf:
            for bid, br_rec in self.bid_ranks_map.iteritems():
                fields = (bid, br_rec[0], br_rec[1], br_rec[2])
                outf.write("%s\t%s\t%d\t%f\n" % fields)

    def calc_node_heights(self):
        """Calculate node heights on the reference tree (used to define branch-length cutoff during classification step)
           Algorithm is as follows:
           Tip node or node resolved to Species level: height = 1 
           Inner node resolved to Genus or above:      height = min(left_height, right_height) + 1 
         """
        nh_map = {}
        dummy_added = False
        for node in self.reftree_tax.traverse("postorder"):
            if not node.is_root():
                if not hasattr(node, "B"):                
                    # In a rooted tree, there is always one more node/branch than in unrooted one
                    # That's why one branch will be always not EPA-labelled after the rooting
                    if not dummy_added: 
                        node.B = "DDD"
                        dummy_added = True
                        species_rank = Taxonomy.EMPTY_RANK
                    else:
                        errmsg = "FATAL ERROR: More than one tree branch without EPA label (calc_node_heights)"
                        self.cfg.exit_fatal_error(errmsg)
                else:
                    species_rank = self.bid_ranks_map[node.B][-1]
                bid = node.B
                if node.is_leaf() or species_rank != Taxonomy.EMPTY_RANK:
                    nh_map[bid] = 1
                else:
                    lchild = node.children[0]
                    rchild = node.children[1]
                    nh_map[bid] = min(nh_map[lchild.B], nh_map[rchild.B]) + 1

        # remove heights for dummy nodes, since there won't be any placements on them
        if dummy_added:
            del nh_map["DDD"]
            
        self.node_height_map = nh_map

    def __get_all_rank_names(self, root):
        rnames = set([])
        for node in root.traverse("postorder"):
            ranks = node.ranks
            for rk in ranks:
                rnames.add(rk)
        return rnames

    def mono_index(self):
        """This method will calculate monophyly index by looking at the left and right hand side of the tree"""
        children = self.reftree_tax.children
        if len(children) == 1:
            while len(children) == 1:
                children = children[0].children 
        if len(children) == 2:
            left = children[0]
            right =children[1]
            lset = self.__get_all_rank_names(left)
            rset = self.__get_all_rank_names(right)
            iset = lset & rset
            return iset
        else:
            print("Error: input tree not birfurcating")
            return set([])

    def build_hmm_profile(self, json_builder):
        """Build a HMMER profile from the reduced reference alignment and hand it to `json_builder`."""
        print("Building the HMMER profile...\n")

        # RAxML outputs the reduced alignment in relaxed PHYLIP format, which
        # HMMER does not support, so the alignment is re-exported to a temp file
        reduced_fname = self.cfg.tmp_fname("%NAME%_ref_reduced.fa")
        self.reduced_refalign_seqs.write(outfile=reduced_fname)

        profile = hmmer(self.cfg, reduced_fname).build_hmm_profile()
        json_builder.set_hmm_profile(profile)
        
    def write_json(self):
        """Collect all reference data in a RefJsonBuilder and dump it to `cfg.refjson_fname`.

        The correction maps of the input validator (`corr_ranks`, `corr_seqid`)
        are inverted (value -> key) before being stored. The HMMER profile is
        only built when `cfg.no_hmmer` is not set.
        """
        cfg = self.cfg
        builder = RefJsonBuilder()

        builder.set_branch_tax_map(self.bid_ranks_map)
        builder.set_tree(self.reftree_lbl_str)
        builder.set_outgroup(self.reftree_outgroup)
        builder.set_ratehet_model(cfg.raxml_model)
        builder.set_tax_tree(self.reftree_multif)
        builder.set_pattern_compression(cfg.compress_patterns)
        builder.set_taxcode(cfg.taxcode_name)

        builder.set_merged_ranks_map(self.input_validator.merged_ranks)
        inverted_ranks = dict((val, key) for key, val in self.input_validator.corr_ranks.items())
        builder.set_corr_ranks_map(inverted_ranks)
        inverted_seqids = dict((val, key) for key, val in self.input_validator.corr_seqid.items())
        builder.set_corr_seqid_map(inverted_seqids)

        metadata = {
            "ref_tree_size": self.reftree_size,
            "ref_alignment_width": self.refalign_width,
            "raxml_version": self.raxml_version,
            "timestamp": str(datetime.datetime.now()),
            "invocation_epac": self.invocation_epac,
            "invocation_raxml_multif": self.invocation_raxml_multif,
            "invocation_raxml_optmod": self.invocation_raxml_optmod,
            "invocation_raxml_epalbl": self.invocation_raxml_epalbl,
            "reftree_loglh": self.reftree_loglh,
        }
        builder.set_metadata(metadata)

        builder.set_sequences(self.reduced_refalign_seqs.get_entries())

        if not cfg.no_hmmer:
            self.build_hmm_profile(builder)

        origin_taxonomy = self.taxonomy_map
        builder.set_origin_taxonomy(origin_taxonomy)

        cfg.log.debug("Calculating the speciation rate...\n")
        params = tree_param(tree=self.reftree_lbl_str, origin_taxonomy=origin_taxonomy)
        builder.set_rate(params.get_speciation_rate_fast())
        builder.set_nodes_height(self.node_height_map)

        builder.set_binary_model(self.optmod_fname)

        cfg.log.debug("Writing down the reference file...\n")
        builder.dump(cfg.refjson_fname)

    # top-level function to build a reference tree    
    def build_ref_tree(self):
        """Run the complete reference-building pipeline, from loading the inputs to writing the JSON file.

        Every stage is announced through the config's logger before the
        corresponding method is called; the stages run strictly in sequence.
        """
        cfg = self.cfg
        log = cfg.log

        log.info("=> Loading taxonomy from file: %s ...\n" , cfg.taxonomy_fname)
        self.taxonomy = Taxonomy(prefix=EpacConfig.REF_SEQ_PREFIX, tax_fname=cfg.taxonomy_fname)

        log.info("==> Loading reference alignment from file: %s ...\n" , cfg.align_fname)
        self.load_alignment()

        log.info("===> Validating taxonomy and alignment ...\n")
        self.validate_taxonomy()

        log.info("====> Building a multifurcating tree from taxonomy with %d seqs ...\n" , self.taxonomy.seq_count())
        self.build_multif_tree()

        log.info("=====> Building the reference alignment ...\n")
        self.export_ref_alignment()
        self.export_ref_taxonomy()

        log.info("======> Saving the outgroup for later re-rooting ...\n")
        self.save_rooting()

        log.info("=======> Resolving multifurcation: choosing the best topology from %d independent RAxML runs ...\n" % cfg.rep_num)
        self.resolve_multif()
        self.load_reduced_refalign()

        log.info("========> Calling RAxML-EPA to obtain branch labels ...\n")
        self.epa_branch_labeling()

        log.info("=========> Post-processing the EPA tree (re-rooting, taxonomic labeling etc.) ...\n")
        self.epa_post_process()
        self.calc_node_heights()

        log.debug("\n==========> Checking branch labels ...")
        log.debug("shared rank names before training: %s", repr(self.taxonomy.get_common_ranks()))
        log.debug("shared rank names after  training: %s\n", repr(self.mono_index()))

        log.info("==========> Saving the reference JSON file: %s\n" % cfg.refjson_fname)
        self.write_json()
class RefTreeBuilder:
    def __init__(self, config):
        """Keep the configuration and derive RAxML job names and temporary file names from it."""
        self.cfg = cfg = config

        # names of the RAxML jobs run by the pipeline
        self.mfresolv_job_name = cfg.subst_name("mfresolv_%NAME%")
        self.epalbl_job_name = cfg.subst_name("epalbl_%NAME%")
        self.optmod_job_name = cfg.subst_name("optmod_%NAME%")
        self.raxml_wrapper = RaxmlWrapper(config)

        # temporary files shared between the pipeline stages
        tmp_fname = cfg.tmp_fname
        self.outgr_fname = tmp_fname("%NAME%_outgr.tre")
        self.reftree_mfu_fname = tmp_fname("%NAME%_mfu.tre")
        self.reftree_bfu_fname = tmp_fname("%NAME%_bfu.tre")
        self.optmod_fname = tmp_fname("%NAME%.opt")
        self.lblalign_fname = tmp_fname("%NAME%_lblq.fa")
        self.reftree_lbl_fname = tmp_fname("%NAME%_lbl.tre")
        self.reftree_tax_fname = tmp_fname("%NAME%_tax.tre")
        self.brmap_fname = tmp_fname("%NAME%_map.txt")

    def validate_taxonomy(self):
        # make sure we don't taxonomy "irregularities" (more than 7 ranks or missing ranks in the middle)
        action = self.cfg.wrong_rank_count
        if action != "ignore":
            autofix = action == "autofix"
            errs = self.taxonomy.check_for_disbalance(autofix)
            if len(errs) > 0:
                if action == "autofix":
                    print "WARNING: %d sequences with invalid annotation (missing/redundant ranks) found and were fixed as follows:\n" % len(errs)
                    for err in errs:
                        print "Original:   %s\t%s"   % (err[0], err[1])
                        print "Fixed as:   %s\t%s\n" % (err[0], err[2])
                elif action == "skip":
                    print "WARNING: Following %d sequences with invalid annotation (missing/redundant ranks) were skipped:\n" % len(errs)
                    for err in errs:
                        self.taxonomy.remove_seq(err[0])
                        print "%s\t%s" % err
                else:  # abort
                    print "ERROR: %d sequences with invalid annotation (missing/redundant ranks) found:\n" % len(errs)
                    for err in errs:
                        print "%s\t%s" % err
                    print "\nPlease fix them manually (add/remove ranks) and run the pipeline again (or use -wrong-rank-count autofix option)"
                    print "NOTE: Only standard 7-level taxonomies are supported at the moment. Although missing trailing ranks (e.g. species) are allowed,"
                    print "missing intermediate ranks (e.g. family) or sublevels (e.g. suborder) are not!\n"
                    sys.exit()

        # check for duplicate rank names
        action = self.cfg.dup_rank_names
        if action != "ignore":
            autofix = action == "autofix"
            dups = self.taxonomy.check_for_duplicates(autofix)
            if len(dups) > 0:
                if action == "autofix":
                    print "WARNING: %d sequences with duplicate rank names found and were renamed as follows:\n" % len(dups)
                    for dup in dups:
                        print "Original:    %s\t%s"   %  (dup[0], dup[1])
                        print "Duplicate:   %s\t%s"   %  (dup[2], dup[3])
                        print "Renamed to:  %s\t%s\n" %  (dup[2], dup[4])
                elif action == "skip":
                    print "WARNING: Following %d sequences with duplicate rank names were skipped:\n" % len(dups)
                    for dup in dups:
                        self.taxonomy.remove_seq(dup[2])
                        print "%s\t%s\n" % (dup[2], dup[3])
                else:  # abort
                    print "ERROR: %d sequences with duplicate rank names found:\n" % len(dups)
                    for dup in dups:
                        print "%s\t%s\n%s\t%s\n" % dup
                    print "Please fix (rename) them and run the pipeline again (or use -dup-rank-names autofix option)" 
                    sys.exit()
        
        # check for invalid characters in rank names
        self.taxonomy.normalize_rank_names()
        
        self.taxonomy.close_taxonomy_gaps()

    def build_multif_tree(self):
        """Build the multifurcating reference tree from the taxonomy.

        Stores the tree (`self.reftree_multif`), the ids of its sequences
        (`self.reftree_ids`) and their number (`self.reftree_size`), then lets
        the config resolve its automatic settings for that tree size. In debug
        mode the sequence ids and the tree are also written to temporary files.
        """
        cfg = self.cfg

        builder = TaxTreeBuilder(cfg, self.taxonomy)
        (tax_tree, seq_ids) = builder.build(cfg.reftree_min_rank, cfg.reftree_max_seqs_per_leaf,
                                            cfg.reftree_clades_to_include, cfg.reftree_clades_to_ignore)
        self.reftree_ids = frozenset(seq_ids)
        self.reftree_size = len(seq_ids)
        self.reftree_multif = tax_tree

        # IMPORTANT: select GAMMA or CAT model based on tree size!
        cfg.resolve_auto_settings(self.reftree_size)

        if not cfg.debug:
            return

        # list of sequence ids which comprise the reference tree
        with open(cfg.tmp_fname("%NAME%_seq_ids.txt"), "w") as ids_file:
            ids_file.writelines("%s\n" % sid for sid in seq_ids)

        # original tree with taxonomic ranks as internal node labels
        tax_tree.write(outfile=cfg.tmp_fname("%NAME%_mfu_tax.tre"), format=8)

    def export_ref_alignment(self):
        """This function transforms the input alignment in the following way:
           1. Filter out sequences which are not part of the reference tree
           2. Add sequence name prefix (r_)"""
        in_file = self.cfg.align_fname
        ref_seqs = None
        formats = ["fasta", "phylip", "iphylip", "phylip_relaxed", "iphylip_relaxed"]
        for fmt in formats:
            try:
                ref_seqs = SeqGroup(sequences=in_file, format = fmt)
                break
            except:
                if self.cfg.debug:
                    print("Guessing input format: not " + fmt)
        if ref_seqs == None:
            print("Invalid input file format: %s" % in_file)
            print("The supported input formats are fasta and phylip")
            sys.exit()

        self.refalign_fname = self.cfg.tmp_fname("%NAME%_matrix.afa")
        with open(self.refalign_fname, "w") as fout:
            for name, seq, comment, sid in ref_seqs.iter_entries():
                seq_name = EpacConfig.REF_SEQ_PREFIX + name
                if seq_name in self.reftree_ids:
                    fout.write(">" + seq_name + "\n" + seq + "\n")

    def export_ref_taxonomy(self):
        self.taxonomy_map = {}
        
        for sid, ranks in self.taxonomy.iteritems():
            if sid in self.reftree_ids:
                self.taxonomy_map[sid] = ranks
            
        if self.cfg.debug:
            tax_fname = self.cfg.tmp_fname("%NAME%_tax.txt")
            with open(tax_fname, "w") as fout:
                for sid, ranks in self.taxonomy_map.iteritems():
                    ranks_str = self.taxonomy.lineage_str(sid, True) 
                    fout.write(sid + "\t" + ranks_str + "\n")   

    def save_rooting(self):
        """Save the outgroup for later re-rooting and write the tree that is passed to RAxML.

        A TaxTreeHelper is created for the multifurcating tree; the outgroup it
        reports is written to `self.outgr_fname` and kept in
        `self.reftree_outgroup`. The tree itself (minus a unifurcation at the
        root, if any) is written to `self.reftree_mfu_fname`.
        """
        tree = self.reftree_multif

        helper = TaxTreeHelper(self.taxonomy.get_map(), self.cfg)
        self.taxtree_helper = helper
        helper.set_mf_rooted_tree(tree)

        outgroup = helper.get_outgroup()
        outgroup_size = len(outgroup.get_leaves())
        outgroup.write(outfile=self.outgr_fname, format=9)
        self.reftree_outgroup = outgroup
        if self.cfg.verbose:
            print("Outgroup for rooting was saved to: %s, outgroup size: %d" % (self.outgr_fname, outgroup_size))

        # remove unifurcation at the root
        if len(tree.children) == 1:
            tree = tree.children[0]

        # now we can safely unroot the tree and remove internal node labels to make it suitable for raxml
        tree.write(outfile=self.reftree_mfu_fname, format=9)

    # RAxML call to convert multifurcating tree to the strictly bifurcating one
    def resolve_multif(self):
        print "\nReducing the alignment: \n"
        self.reduced_refalign_fname = self.raxml_wrapper.reduce_alignment(self.refalign_fname)
        
        print "\nResolving multifurcation: \n"
        raxml_params = ["-s", self.reduced_refalign_fname, "-g", self.reftree_mfu_fname, "-F", "--no-seq-check"]
        if self.cfg.mfresolv_method  == "fast":
            raxml_params += ["-D"]
        elif self.cfg.mfresolv_method  == "ultrafast":
            raxml_params += ["-f", "e"]
        self.invocation_raxml_multif = self.raxml_wrapper.run(self.mfresolv_job_name, raxml_params)
        if self.raxml_wrapper.result_exists(self.mfresolv_job_name):        
#            self.raxml_wrapper.copy_result_tree(self.mfresolv_job_name, self.reftree_bfu_fname)
#            self.raxml_wrapper.copy_optmod_params(self.mfresolv_job_name, self.optmod_fname)

            bfu_fname = self.raxml_wrapper.result_fname(self.mfresolv_job_name)

            # RAxML call to optimize model parameters and write them down to the binary model file
            print "\nOptimizing model parameters: \n"
            raxml_params = ["-f", "e", "-s", self.reduced_refalign_fname, "-t", bfu_fname, "--no-seq-check"]
            if self.cfg.raxml_model == "GTRCAT" and not self.cfg.compress_patterns:
                raxml_params +=  ["-H"]
            self.invocation_raxml_optmod = self.raxml_wrapper.run(self.optmod_job_name, raxml_params)
            if self.raxml_wrapper.result_exists(self.optmod_job_name):
                self.raxml_wrapper.copy_result_tree(self.optmod_job_name, self.reftree_bfu_fname)
                self.raxml_wrapper.copy_optmod_params(self.optmod_job_name, self.optmod_fname)
                if not self.cfg.debug:
                    self.raxml_wrapper.cleanup(self.optmod_job_name)
            else:
                print "RAxML run failed (model optimization), please examine the log for details: %s" \
                        % self.raxml_wrapper.make_raxml_fname("output", self.optmod_job_name)
                sys.exit()  

            if not self.cfg.debug:
                self.raxml_wrapper.cleanup(self.mfresolv_job_name)
        else:
            print "RAxML run failed (mutlifurcation resolution), please examine the log for details: %s" \
                    % self.raxml_wrapper.make_raxml_fname("output", self.mfresolv_job_name)
            sys.exit()  
            
    def load_reduced_refalign(self):
        formats = ["fasta", "phylip_relaxed"]
        for fmt in formats:
            try:
                self.reduced_refalign_seqs = SeqGroup(sequences=self.reduced_refalign_fname, format = fmt)
                break
            except:
                pass
        if self.reduced_refalign_seqs == None:
            print("FATAL ERROR: Invalid input file format in %s! (load_reduced_refalign)" % self.reduced_refalign_fname)
            sys.exit()
    
    # dummy EPA run to label the branches of the reference tree, which we need to build a mapping to tax ranks    
    def epa_branch_labeling(self):
        # create alignment with dummy query seq
        self.refalign_width = len(self.reduced_refalign_seqs.get_seqbyid(0))
        self.reduced_refalign_seqs.write(format="fasta", outfile=self.lblalign_fname)
        
        with open(self.lblalign_fname, "a") as fout:
            fout.write(">" + "DUMMY131313" + "\n")        
            fout.write("A"*self.refalign_width + "\n")        
        
        epa_result = self.raxml_wrapper.run_epa(self.epalbl_job_name, self.lblalign_fname, self.reftree_bfu_fname, self.optmod_fname)
        self.reftree_lbl_str = epa_result.get_std_newick_tree()
        self.raxml_version = epa_result.get_raxml_version()
        self.invocation_raxml_epalbl = epa_result.get_raxml_invocation()

        if self.raxml_wrapper.epa_result_exists(self.epalbl_job_name):        
            if not self.cfg.debug:
                self.raxml_wrapper.cleanup(self.epalbl_job_name, True)
        else:
            print "RAxML EPA run failed, please examine the log for details: %s" \
                    % self.raxml_wrapper.make_raxml_fname("output", self.epalbl_job_name)
            sys.exit()        

    def epa_post_process(self):
        """Derive the taxonomic tree and the branch-to-taxonomy map from the EPA-labelled tree.

        The results are stored in `self.reftree_tax` and `self.bid_ranks_map`.
        In debug mode the taxonomic tree is written out twice, in ETE formats
        5 and 3, to the labelled-tree and tax-tree temporary files.
        """
        helper = self.taxtree_helper
        helper.set_bf_unrooted_tree(Tree(self.reftree_lbl_str))
        self.reftree_tax = helper.get_tax_tree()
        self.bid_ranks_map = helper.get_bid_taxonomy_map()

        if not self.cfg.debug:
            return

        for out_fname, newick_format in ((self.reftree_lbl_fname, 5), (self.reftree_tax_fname, 3)):
            self.reftree_tax.write(outfile=out_fname, format=newick_format)

    def build_branch_rank_map(self):
        self.bid_ranks_map = {}
        for node in self.reftree_tax.traverse("postorder"):
            if not node.is_root() and hasattr(node, "B"):                
                parent = node.up                
                self.bid_ranks_map[node.B] = parent.ranks
#                print "%s => %s" % (node.B, parent.ranks)
            elif self.cfg.verbose:
                print "INFO: EPA branch label missing, mapping to taxon skipped (%s)" % node.name
    
    def write_branch_rank_map(self):
        with open(self.brmap_fname, "w") as fbrmap:    
            for node in self.reftree_tax.traverse("postorder"):
                if not node.is_root() and hasattr(node, "B"):                
                    fbrmap.write(node.B + "\t" + ";".join(self.bid_ranks_map[node.B]) + "\n")
    
    def calc_node_heights(self):
        """Calculate node heights on the reference tree (used to define branch-length cutoff during classification step)
           Algorithm is as follows:
           Tip node or node resolved to Species level: height = 1 
           Inner node resolved to Genus or above:      height = min(left_height, right_height) + 1 
         """
        nh_map = {}
        dummy_added = False
        for node in self.reftree_tax.traverse("postorder"):
            if not node.is_root():
                if not hasattr(node, "B"):                
                    # In a rooted tree, there is always one more node/branch than in unrooted one
                    # That's why one branch will be always not EPA-labelled after the rooting
                    if not dummy_added: 
                        node.B = "DDD"
                        dummy_added = True
                        species_rank = Taxonomy.EMPTY_RANK
                    else:
                        print "FATAL ERROR: More than one tree branch without EPA label (calc_node_heights)"
                        sys.exit()
                else:
                    species_rank = self.bid_ranks_map[node.B][6]
                bid = node.B
                if node.is_leaf() or species_rank != Taxonomy.EMPTY_RANK:
                    nh_map[bid] = 1
                else:
                    lchild = node.children[0]
                    rchild = node.children[1]
                    nh_map[bid] = min(nh_map[lchild.B], nh_map[rchild.B]) + 1

        # remove heights for dummy nodes, since there won't be any placements on them
        if dummy_added:
            del nh_map["DDD"]
            
        self.node_height_map = nh_map

    def __get_all_rank_names(self, root):
        rnames = set([])
        for node in root.traverse("postorder"):
            ranks = node.ranks
            for rk in ranks:
                rnames.add(rk)
        return rnames

    def mono_index(self):
        """This method will calculate monophyly index by looking at the left and right hand side of the tree"""
        children = self.reftree_tax.children
        if len(children) == 1:
            while len(children) == 1:
                children = children[0].children 
        if len(children) == 2:
            left = children[0]
            right =children[1]
            lset = self.__get_all_rank_names(left)
            rset = self.__get_all_rank_names(right)
            iset = lset & rset
            return iset
        else:
            print("Error: input tree not birfurcating")
            return set([])

    def build_hmm_profile(self, json_builder):
        """Build a HMMER profile from the reduced reference alignment and hand it to `json_builder`.

        Unless `cfg.debug` is set, the re-exported alignment and the profile
        file are removed once the profile has been passed on.
        """
        print("Building the HMMER profile...\n")

        # RAxML outputs the reduced alignment in relaxed PHYLIP format, which
        # HMMER does not support, so the alignment is re-exported to a temp file
        reduced_fname = self.cfg.tmp_fname("%NAME%_ref_reduced.fa")
        self.reduced_refalign_seqs.write(outfile=reduced_fname)

        profile_fname = hmmer(self.cfg, reduced_fname).build_hmm_profile()
        json_builder.set_hmm_profile(profile_fname)

        if self.cfg.debug:
            return
        for leftover in (reduced_fname, profile_fname):
            FileUtils.remove_if_exists(leftover)

    def write_json(self):
        """Collect all reference data in a RefJsonBuilder and dump it to `cfg.refjson_fname`.

        The HMMER profile is only built when `cfg.no_hmmer` is not set.
        """
        cfg = self.cfg
        builder = RefJsonBuilder()

        builder.set_taxonomy(self.bid_ranks_map)
        builder.set_tree(self.reftree_lbl_str)
        builder.set_outgroup(self.reftree_outgroup)
        builder.set_ratehet_model(cfg.raxml_model)
        builder.set_tax_tree(self.reftree_multif)
        builder.set_pattern_compression(cfg.compress_patterns)

        metadata = {
            "ref_tree_size": self.reftree_size,
            "ref_alignment_width": self.refalign_width,
            "raxml_version": self.raxml_version,
            "timestamp": str(datetime.datetime.now()),
            "invocation_epac": self.invocation_epac,
            "invocation_raxml_multif": self.invocation_raxml_multif,
            "invocation_raxml_optmod": self.invocation_raxml_optmod,
            "invocation_raxml_epalbl": self.invocation_raxml_epalbl,
        }
        builder.set_metadata(metadata)

        builder.set_sequences(self.reduced_refalign_seqs.get_entries())

        if not cfg.no_hmmer:
            self.build_hmm_profile(builder)

        origin_taxonomy = self.taxonomy_map
        builder.set_origin_taxonomy(origin_taxonomy)

        print("Calculating the speciation rate...\n")
        params = tree_param(tree=self.reftree_lbl_str, origin_taxonomy=origin_taxonomy)
        builder.set_rate(params.get_speciation_rate_fast())
        builder.set_nodes_height(self.node_height_map)

        builder.set_binary_model(self.optmod_fname)

        print("Writing down the reference file...\n")
        builder.dump(cfg.refjson_fname)

    def cleanup(self):
        """Remove the temporary files created while building the reference tree.

        Each file is removed once (the outgroup file used to be listed twice).
        File names that have not been assigned yet are skipped:
        `refalign_fname` and `reduced_refalign_fname` are only set by later
        pipeline stages, so an early cleanup must not fail on them.
        """
        fname_attrs = (
            "outgr_fname",
            "reftree_mfu_fname",
            "reftree_bfu_fname",
            "optmod_fname",
            "lblalign_fname",
            "reduced_refalign_fname",
            "refalign_fname",
        )
        for attr in fname_attrs:
            fname = getattr(self, attr, None)
            if fname is not None:
                FileUtils.remove_if_exists(fname)

    # top-level function to build a reference tree    
    def build_ref_tree(self):
        """Run the complete reference-building pipeline and report the elapsed time.

        Every stage is announced on stdout before the corresponding method is
        called; the stages run strictly in sequence.
        """
        started = time.time()
        cfg = self.cfg

        print("\n> Loading taxonomy from file: %s ...\n" % (cfg.taxonomy_fname))
        self.taxonomy = GGTaxonomyFile(cfg.taxonomy_fname, EpacConfig.REF_SEQ_PREFIX)

        print("\n=> Building a multifurcating tree from taxonomy with %d seqs ...\n" % self.taxonomy.seq_count())
        self.validate_taxonomy()
        self.build_multif_tree()

        print("\n==> Building the reference alignment ...\n")
        self.export_ref_alignment()
        self.export_ref_taxonomy()

        print("\n===> Saving the outgroup for later re-rooting ...\n")
        self.save_rooting()

        print("\n====> RAxML call: resolve multifurcation ...\n")
        self.resolve_multif()
        self.load_reduced_refalign()

        print("\n=====> RAxML-EPA call: labeling the branches ...\n")
        self.epa_branch_labeling()

        print("\n======> Post-processing the EPA tree (re-rooting, taxonomic labeling etc.) ...\n")
        self.epa_post_process()
        self.calc_node_heights()

        if cfg.verbose:
            print("\n=======> Checking branch labels ...\n")
            print("shared rank names before training: " + repr(self.taxonomy.get_common_ranks()))
            print("shared rank names after  training: " + repr(self.mono_index()))

        print("\n=======> Saving the reference JSON file ...\n")
        self.write_json()

        elapsed = time.time() - started
        print("\n***********  Done! (%.0f s) **********\n" % elapsed)