class EpaClassifier:
    def __init__(self, config, args):
        """Reserve temporary file names and load the reference JSON file.

        config -- configuration object; tmp_fname() and refjson_fname are
                  read here and compress_patterns is set from the reference
        args   -- parsed arguments; jplace_fname, ignore_refalign and
                  p_value are read here

        Terminates the process with exit status 1 when RefJsonParser raises
        ValueError for the reference file.
        """
        self.cfg = config
        self.jplace_fname = args.jplace_fname
        self.ignore_refalign = args.ignore_refalign

        self.tmp_refaln = config.tmp_fname("%NAME%.refaln")
        # here is the final alignment file for running EPA
        self.epa_alignment = config.tmp_fname("%NAME%.afa")
        self.hmmprofile = config.tmp_fname("%NAME%.hmmprofile")
        self.tmpquery = config.tmp_fname("%NAME%.tmpquery")
        self.noalign = config.tmp_fname("%NAME%.noalign")
        # filled in by checkinput()
        self.seqs = None

        try:
            self.refjson = RefJsonParser(config.refjson_fname)
        except ValueError:
            print("Invalid json file format!")
            # a bare sys.exit() reports success (status 0) to the caller;
            # an unusable reference file is an error
            sys.exit(1)
        # validate input json format
        self.refjson.validate()
        self.bid_taxonomy_map = self.refjson.get_bid_tanomomy_map()
        self.reftree = self.refjson.get_reftree()
        self.rate = self.refjson.get_rate()
        self.node_height = self.refjson.get_node_height()
        self.cfg.compress_patterns = self.refjson.get_pattern_compression()

        self.classify_helper = TaxClassifyHelper(self.cfg, self.bid_taxonomy_map, args.p_value, self.rate, self.node_height)

    def cleanup(self):
        """Remove the temporary files reserved in __init__, if they exist."""
        temp_files = (
            self.tmp_refaln,
            self.epa_alignment,
            self.hmmprofile,
            self.tmpquery,
            self.noalign,
        )
        for fname in temp_files:
            FileUtils.remove_if_exists(fname)

    def align_to_refenence(self, noalign, minp = 0.9):
        """Align the query sequences to the reference alignment with hmmer.

        noalign -- passed to hmmer as its `discard` argument
        minp    -- passed to hmmer as its `minp` argument

        The value returned by hmmer.align() replaces self.epa_alignment.
        """
        ref_alignment = self.refjson.get_alignment(fout = self.tmp_refaln)

        profile = self.refjson.get_hmm_profile(self.hmmprofile)
        if not profile:
            # the json file carries no hmmer profile: build one from scratch
            profile = hmmer(self.cfg, ref_alignment).build_hmm_profile()

        aligner_options = {
            "config": self.cfg,
            "refalign": ref_alignment,
            "query": self.tmpquery,
            "refprofile": profile,
            "discard": noalign,
            "seqs": self.seqs,
            "minp": minp,
        }
        aligner = hmmer(**aligner_options)
        self.epa_alignment = aligner.align()

    def merge_alignment(self, query_seqs):
        refaln = self.refjson.get_alignment_list()
        with open(self.epa_alignment, "w") as fout:
            for seq in refaln:
                fout.write(">" + seq[0] + "\n" + seq[1] + "\n")
            for name, seq, comment, sid in query_seqs.iter_entries():
                fout.write(">" + name + "\n" + seq + "\n")


    def checkinput(self, query_fname, minp = 0.9):
        """Load the query file and produce the alignment file used by EPA.

        The input format is guessed by trying each supported format in turn.
        Afterwards one of the following happens:

        * self.ignore_refalign is set: the query entries are written to
          self.epa_alignment as they are, renamed with the reference or the
          query prefix, and the method returns.
        * all query sequences have equal length (and there is more than one):
          they are merged with the reference alignment, directly when the
          lengths match and through MUSCLE otherwise.
        * otherwise the queries are aligned to the reference with HMMER.

        query_fname -- name of the query sequence file
        minp        -- passed through to align_to_refenence()

        Terminates the process with exit status 1 when no format matches or
        the query file holds no sequences.
        """
        formats = ["fasta", "phylip", "iphylip", "phylip_relaxed", "iphylip_relaxed"]
        for fmt in formats:
            try:
                self.seqs = SeqGroup(sequences=query_fname, format = fmt)
                break
            except Exception:
                # a failed parse only means "wrong guess"; KeyboardInterrupt
                # and SystemExit are deliberately not caught here
                print("Guessing input format: not " + fmt)
        if self.seqs is None:
            print("Invalid input file format!")
            print("The supported input formats are fasta and phylip")
            sys.exit(1)

        if self.ignore_refalign:
            print("Assuming query file contains reference sequences, skipping the alignment step...")
            # the reference names do not change while we write: fetch them once
            ref_names = self.refjson.get_sequences_names()
            with open(self.epa_alignment, "w") as fout:
                for name, seq, comment, sid in self.seqs.iter_entries():
                    # NOTE(review): REF_PREFIX is not defined in the visible
                    # part of this class - confirm where it comes from
                    ref_name = self.REF_PREFIX + name
                    if ref_name in ref_names:
                        seq_name = ref_name
                    else:
                        seq_name = EpacConfig.QUERY_SEQ_PREFIX + name
                    fout.write(">" + seq_name + "\n" + seq + "\n")
            return

        # add query seq name prefix to avoid confusion between reference and query sequences
        self.seqs.add_name_prefix(EpacConfig.QUERY_SEQ_PREFIX)

        self.seqs.write(format="fasta", outfile=self.tmpquery)
        print("Checking if query sequences are aligned ...")
        entries = self.seqs.get_entries()
        if not entries:
            # entries[0] below would otherwise fail with a bare IndexError
            print("Query file contains no sequences!")
            sys.exit(1)
        seql = len(entries[0][1])
        # "aligned" here simply means: every sequence has the same length
        aligned = all(len(entry[1]) == seql for entry in entries[1:])

        if aligned and len(self.seqs) > 1:
            print("Query sequences are aligned")
            refalnl = self.refjson.get_alignment_length()
            if refalnl == seql:
                print("Merging query alignment with reference alignment")
                self.merge_alignment(self.seqs)
            else:
                print("Merging query alignment with reference alignment using MUSCLE")
                require_muscle()
                refaln = self.refjson.get_alignment(fout = self.tmp_refaln)
                m = muscle(self.cfg)
                self.epa_alignment = m.merge(refaln, self.tmpquery)
        else:
            print("Query sequences are not aligned")
            print("Align query sequences to the reference alignment using HMMER")
            require_hmmer()
            self.align_to_refenence(self.noalign, minp = minp)

        print("Running EPA ......")
        print("")


    def print_ranks(self, rks, confs, minlw = 0.0):
        """Format ranks and their confidences as two ';'-joined, tab-separated lists.

        Ranks are taken in order until the first one whose confidence is not
        >= minlw. A confidence equal to the first one is reported as 1.0 when
        that first value is at least 0.99. Returns None when no rank
        qualifies.
        """
        kept_ranks = []
        kept_confs = []
        for idx, rank_name in enumerate(rks):
            score = confs[idx]
            if score == confs[0] and confs[0] >= 0.99:
                score = 1.0
            if not score >= minlw:
                break
            kept_ranks.append(rank_name)
            kept_confs.append("{0:.3f}".format(score))

        if not kept_ranks:
            return None
        return ";".join(kept_ranks) + "\t" + ";".join(kept_confs)


    def classify(self, query_fname, fout = None, method = "1", minlw = 0.0, pv = 0.02, minp = 0.9, ptp = False):
        """Assign taxonomic ranks to the query sequences and report them.

        Placements are read from self.jplace_fname when it is set; otherwise
        the query file is prepared with checkinput() and placed with
        RaxmlWrapper.run_epa(). One line per query is written to `fout` (if
        given) and printed when self.cfg.verbose is set:

            name <TAB> ranks <TAB> confidences <TAB> "*" or "o"

        where "*" means novelty_check() returned True. Queries without
        placement edges, without a reportable rank, or listed in the
        self.noalign file are reported as  name <TAB><TAB><TAB> "?".

        query_fname -- query sequence file (not read when a jplace file is set)
        fout        -- output file name, or None for no file output
        method      -- passed through to classify_helper.classify_seq()
        minlw       -- minimum confidence, passed to classify_seq(),
                       novelty_check() and print_ranks()
        pv          -- not used by this method; kept so existing callers work
        minp        -- passed through to checkinput()
        ptp         -- when true, also run epa_2_ptp() and write the species
                       clusters to fout + ".species"
        """
        if self.jplace_fname:
            jp = EpaJsonParser(self.jplace_fname)
        else:
            self.checkinput(query_fname, minp)
            # use the configuration this object was built with rather than
            # depending on a module-level name `config`
            raxml = RaxmlWrapper(self.cfg)
            reftree_fname = self.cfg.tmp_fname("ref_%NAME%.tre")
            self.refjson.get_raxml_readable_tree(reftree_fname)
            optmod_fname = self.cfg.tmp_fname("%NAME%.opt")
            self.refjson.get_binary_model(optmod_fname)
            job_name = self.cfg.subst_name("epa_%NAME%")

            reftree_str = self.refjson.get_raxml_readable_tree()
            reftree = Tree(reftree_str)

            self.reftree_size = len(reftree.get_leaves())

            # IMPORTANT: set EPA heuristic rate based on tree size!
            self.cfg.resolve_auto_settings(self.reftree_size)
            # If we're loading the pre-optimized model, we MUST set the same rate het. mode as in the ref file
            if self.cfg.epa_load_optmod:
                self.cfg.raxml_model = self.refjson.get_ratehet_model()

            reduced_align_fname = raxml.reduce_alignment(self.epa_alignment)

            jp = raxml.run_epa(job_name, reduced_align_fname, reftree_fname, optmod_fname)

        placements = jp.get_placement()

        fo = open(fout, "w") if fout else None
        try:
            # names of queries that could not be assigned; reported last
            unassigned = []
            for place in placements:
                taxon_name = place["n"][0]
                origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
                edges = place["p"]
                if len(edges) == 0:
                    unassigned.append(origin_taxon_name)
                    continue

                ranks, lws = self.classify_helper.classify_seq(edges, method, minlw)
                isnovo = self.novelty_check(place_edge = str(edges[0][0]), ranks = ranks, lws = lws, minlw = minlw)
                rankout = self.print_ranks(ranks, lws, minlw)
                if rankout is None:
                    unassigned.append(origin_taxon_name)
                    continue

                output = "%s\t%s\t%s" % (origin_taxon_name, rankout, "*" if isnovo else "o")
                if self.cfg.verbose:
                    print(output)
                if fo:
                    fo.write(output + "\n")

            # sequences listed in the noalign file never reached EPA
            if os.path.exists(self.noalign):
                with open(self.noalign) as fnoa:
                    for line in fnoa:
                        # drop the first character of the stripped line
                        taxon_name = line.strip()[1:]
                        origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
                        output = "%s\t\t\t?" % origin_taxon_name
                        if self.cfg.verbose:
                            print(output)
                        if fo:
                            fo.write(output + "\n")

            output2 = "".join(name + "\t\t\t?\n" for name in unassigned)
            if self.cfg.verbose:
                print(output2)
            if fo:
                fo.write(output2)
        finally:
            # close the report even if classification fails half-way
            if fo:
                fo.close()

        #############################################
        #
        # EPA-PTP species delimitation
        #
        #############################################
        if ptp:
            full_aln = SeqGroup(self.epa_alignment)
            species_list = epa_2_ptp(epa_jp = jp, ref_jp = self.refjson, full_alignment = full_aln, min_lw = 0.5, debug = self.cfg.debug)

            if self.cfg.verbose:
                print("Species clusters:")

            fo2 = open(fout + ".species", "w") if fout else None
            try:
                for sp_cluster in species_list:
                    translated_taxa = [EpacConfig.strip_query_prefix(taxon) for taxon in sp_cluster]
                    s = ",".join(translated_taxa)
                    if fo2:
                        fo2.write(s + "\n")
                    if self.cfg.verbose:
                        print(s)
            finally:
                if fo2:
                    fo2.close()
        #############################################

        # the RAxML temporaries only exist when EPA was run by this call
        if not self.jplace_fname:
            if not self.cfg.debug:
                raxml.cleanup(job_name)
                FileUtils.remove_if_exists(reduced_align_fname)
                FileUtils.remove_if_exists(reftree_fname)
                FileUtils.remove_if_exists(optmod_fname)

    def novelty_check(self, place_edge, ranks, lws, minlw):
        """If the taxonomic assignment is not assigned to the genus level, 
        we need to check if it is due to the incomplete reference taxonomy or 
        it is likely to be something new:
        
        1. If the final ranks are assinged because of lw cut, that means with samller lw
        the ranks can be further assinged to lowers. This indicate the undetermined ranks 
        in the assignment is not due to the incomplete reference taxonomy, so the query 
        sequence is likely to be something new.
        
        2. Otherwise We check all leaf nodes' immediate lower rank below this ml placement point, 
        if they are not empty, output all ranks and indicate this could be novelty.
        """
        
        lowrank = 0
        for i in range(len(ranks)):
            if i < 6:
                """above genus level"""
                rk = ranks[i]
                lw = lws[i]
                if rk == "-":
                    break
                else:
                    lowrank = lowrank + 1
                    if lw >=0 and lw < minlw:
                        return True
        
        if lowrank >= 5 and not ranks[lowrank] == "-":
            return False
        else:
            placenode = self.reftree.search_nodes(B = place_edge)[0]
            if placenode.is_leaf():
                return False
            else:
                leafnodes = placenode.get_leaves()
                flag = True
                for leaf in leafnodes:
                    br_num = leaf.B
                    branks = self.bid_taxonomy_map[br_num]
                    if branks[lowrank] == "-":
                        flag = False
                        break
                        
                return flag
Example #2
0
class EpaClassifier:
    def __init__(self, config, args):
        """Reserve file names, load the reference JSON and build the helper.

        config -- configuration object (tmp_fname(), refjson_fname, log and
                  exit_user_error() are used; compress_patterns is set)
        args   -- parsed arguments (jplace_fname, ignore_refalign,
                  output_name and output_dir are read)
        """
        self.cfg = config
        self.jplace_fname = args.jplace_fname
        self.ignore_refalign = args.ignore_refalign

        # temporary working files
        new_tmp = config.tmp_fname
        self.tmp_refaln = new_tmp("%NAME%.refaln")
        # the final alignment file for running EPA
        self.epa_alignment = new_tmp("%NAME%.afa")
        self.hmmprofile = new_tmp("%NAME%.hmmprofile")
        self.tmpquery = new_tmp("%NAME%.tmpquery")
        self.noalign = new_tmp("%NAME%.noalign")
        self.seqs = None

        # result files
        self.out_assign_fname = os.path.join(
            args.output_dir, args.output_name + ".assignment.txt")
        self.out_jplace_fname = os.path.join(
            args.output_dir, args.output_name + ".jplace")

        try:
            self.refjson = RefJsonParser(config.refjson_fname)
        except ValueError:
            message = "Invalid json file format: %s" % config.refjson_fname
            self.cfg.exit_user_error(message)
        # validate input json format
        self.refjson.validate()
        self.reftree = self.refjson.get_reftree()
        self.rate = self.refjson.get_rate()
        self.node_height = self.refjson.get_node_height()
        self.cfg.compress_patterns = self.refjson.get_pattern_compression()

        branch_map = self.refjson.get_branch_tax_map()
        if not branch_map:
            # old file format (before 1.6): rebuild the map from scratch
            tree_helper = TaxTreeHelper(self.cfg,
                                        self.refjson.get_origin_taxonomy())
            tree_helper.set_mf_rooted_tree(self.refjson.get_tax_tree())
            tree_helper.set_bf_unrooted_tree(self.refjson.get_reftree())
            branch_map = tree_helper.get_bid_taxonomy_map()
        self.bid_taxonomy_map = branch_map

        taxa_count = len(self.reftree.get_leaves())
        self.cfg.log.info("Loaded reference tree with %d taxa\n" % taxa_count)

        self.classify_helper = TaxClassifyHelper(
            self.cfg, self.bid_taxonomy_map, self.rate, self.node_height)

    def require_muscle(self):
        basepath = os.path.dirname(os.path.abspath(__file__))
        if not os.path.exists(basepath + "/epac/bin/muscle"):
            errmsg = "The pipeline uses MUSCLE to merge alignments, please download the programm from:\n" + \
                     "http://www.drive5.com/muscle/downloads.htm\n" + \
                     "and specify path to your installation in the config file (sativa.cfg)\n"
            self.cfg.exit_user_error(errmsg)

    def require_hmmer(self):
        basepath = os.path.dirname(os.path.abspath(__file__))
        if not os.path.exists(basepath +
                              "/epac/bin/hmmbuild") or not os.path.exists(
                                  basepath + "/epac/bin/hmmalign"):
            errmsg = "The pipeline uses HAMMER to align the query seqeunces, please download the programm from:\n" + \
                     "http://hmmer.janelia.org/\n" + \
                     "and specify path to your installation in the config file (sativa.cfg)\n"
            self.cfg.exit_user_error(errmsg)

    def align_to_refenence(self, noalign, minp=0.9):
        """Align the query sequences to the reference alignment with hmmer.

        noalign -- passed to hmmer as its `discard` argument
        minp    -- passed to hmmer as its `minp` argument

        The value returned by hmmer.align() replaces self.epa_alignment.
        """
        ref_alignment = self.refjson.get_alignment(fout=self.tmp_refaln)

        profile = self.refjson.get_hmm_profile(self.hmmprofile)
        if not profile:
            # the json file carries no hmmer profile: build one from scratch
            profile = hmmer(self.cfg, ref_alignment).build_hmm_profile()

        aligner_options = {
            "config": self.cfg,
            "refalign": ref_alignment,
            "query": self.tmpquery,
            "refprofile": profile,
            "discard": noalign,
            "seqs": self.seqs,
            "minp": minp,
        }
        self.epa_alignment = hmmer(**aligner_options).align()

    def merge_alignment(self, query_seqs):
        refaln = self.refjson.get_alignment_list()
        with open(self.epa_alignment, "w") as fout:
            for seq in refaln:
                fout.write(">" + seq[0] + "\n" + seq[1] + "\n")
            for name, seq, comment, sid in query_seqs.iter_entries():
                fout.write(">" + name + "\n" + seq + "\n")

    def write_combined_alignment(self):
        """Write all loaded sequences to self.epa_alignment, renaming them.

        A sequence whose prefixed and corrected name (get_corr_seqid) is one
        of the reference sequence names keeps that reference name; every
        other sequence gets the query prefix and is counted in
        self.query_count.
        """
        # the reference names do not change while we write: fetch them once
        # and use a set so each membership test is O(1)
        ref_names = set(self.refjson.get_sequences_names())
        self.query_count = 0
        with open(self.epa_alignment, "w") as fout:
            for name, seq, comment, sid in self.seqs.iter_entries():
                ref_name = self.refjson.get_corr_seqid(
                    EpacConfig.REF_SEQ_PREFIX + name)
                if ref_name in ref_names:
                    seq_name = ref_name
                else:
                    seq_name = EpacConfig.QUERY_SEQ_PREFIX + name
                    self.query_count += 1
                fout.write(">" + seq_name + "\n" + seq + "\n")

    def checkinput(self, query_fname, minp=0.9):
        """Load the query file and produce the alignment file used by EPA.

        The input format is guessed by trying each supported format in turn.
        Afterwards one of the following happens:

        * self.ignore_refalign is set: write_combined_alignment() writes the
          sequences as they are and the method returns.
        * all query sequences have equal length (and there is more than one):
          they are merged with the reference alignment, directly when the
          lengths match and through MUSCLE otherwise.
        * otherwise the queries are aligned to the reference with HMMER.

        query_fname -- name of the query sequence file
        minp        -- passed through to align_to_refenence()

        Errors (no format matches, no sequences in the file) are reported
        through self.cfg.exit_user_error().
        """
        formats = [
            "fasta", "phylip", "iphylip", "phylip_relaxed", "iphylip_relaxed"
        ]
        for fmt in formats:
            try:
                self.seqs = SeqGroup(sequences=query_fname, format=fmt)
                break
            except Exception:
                # a failed parse only means "wrong guess"; KeyboardInterrupt
                # and SystemExit are deliberately not caught here
                self.cfg.log.debug("Guessing input format: not " + fmt)
        if self.seqs is None:
            self.cfg.exit_user_error(
                "Invalid input file format: %s\nThe supported input formats are fasta and phylip"
                % query_fname)

        if self.ignore_refalign:
            self.cfg.log.info(
                "Assuming query file contains reference sequences, skipping the alignment step...\n"
            )
            self.write_combined_alignment()
            return

        self.query_count = len(self.seqs)

        # add query seq name prefix to avoid confusion between reference and query sequences
        self.seqs.add_name_prefix(EpacConfig.QUERY_SEQ_PREFIX)

        self.seqs.write(format="fasta", outfile=self.tmpquery)
        self.cfg.log.info("Checking if query sequences are aligned ...")
        entries = self.seqs.get_entries()
        if not entries:
            # entries[0] below would otherwise fail with a bare IndexError
            self.cfg.exit_user_error(
                "Query file contains no sequences: %s" % query_fname)
        seql = len(entries[0][1])
        # "aligned" here simply means: every sequence has the same length
        aligned = all(len(entry[1]) == seql for entry in entries[1:])

        if aligned and len(self.seqs) > 1:
            self.cfg.log.info("Query sequences are aligned")
            refalnl = self.refjson.get_alignment_length()
            if refalnl == seql:
                self.cfg.log.info(
                    "Merging query alignment with reference alignment")
                self.merge_alignment(self.seqs)
            else:
                self.cfg.log.info(
                    "Merging query alignment with reference alignment using MUSCLE"
                )
                self.require_muscle()
                refaln = self.refjson.get_alignment(fout=self.tmp_refaln)
                m = muscle(self.cfg)
                self.epa_alignment = m.merge(refaln, self.tmpquery)
        else:
            self.cfg.log.info("Query sequences are not aligned")
            self.cfg.log.info(
                "Align query sequences to the reference alignment using HMMER")
            self.require_hmmer()
            self.align_to_refenence(self.noalign, minp=minp)

    def print_ranks(self, rks, confs, minlw=0.0):
        uncorr_ranks = self.refjson.get_uncorr_ranks(rks)
        ss = ""
        css = ""
        for i in range(len(uncorr_ranks)):
            conf = confs[i]
            if conf == confs[0] and confs[0] >= 0.99:
                conf = 1.0
            if conf >= minlw:
                ss = ss + uncorr_ranks[i] + ";"
                css = css + "{0:.3f}".format(conf) + ";"
            else:
                break
        if ss == "":
            return None
        else:
            return ss[:-1] + "\t" + css[:-1]

    def run_epa(self):
        """Place the query sequences with RAxML-EPA and return the result.

        Writes the reference tree and the binary model from the reference
        JSON to temporary files, reduces self.epa_alignment, runs EPA and
        moves the resulting jplace file to self.out_jplace_fname.

        Returns whatever RaxmlWrapper.run_epa() returns (the placement
        parser consumed by classify()).
        """
        self.cfg.log.info(
            "Running RAxML-EPA to place %d query sequences...\n" %
            self.query_count)
        # use the configuration this object was built with rather than
        # depending on a module-level name `config`
        raxml = RaxmlWrapper(self.cfg)
        reftree_fname = self.cfg.tmp_fname("ref_%NAME%.tre")
        self.refjson.get_raxml_readable_tree(reftree_fname)
        optmod_fname = self.cfg.tmp_fname("%NAME%.opt")
        self.refjson.get_binary_model(optmod_fname)
        job_name = self.cfg.subst_name("epa_%NAME%")

        reftree_str = self.refjson.get_raxml_readable_tree()
        reftree = Tree(reftree_str)

        self.reftree_size = len(reftree.get_leaves())

        # IMPORTANT: set EPA heuristic rate based on tree size!
        self.cfg.resolve_auto_settings(self.reftree_size)
        # If we're loading the pre-optimized model, we MUST set the same rate het. mode as in the ref file
        if self.cfg.epa_load_optmod:
            self.cfg.raxml_model = self.refjson.get_ratehet_model()

        reduced_align_fname = raxml.reduce_alignment(self.epa_alignment)

        jp = raxml.run_epa(job_name, reduced_align_fname, reftree_fname,
                           optmod_fname)

        raxml.copy_epa_jplace(job_name, self.out_jplace_fname, move=True)

        return jp

    def run_ptp(self, jp):
        """Run EPA-PTP species delimitation and report the clusters.

        jp -- EPA placement result, passed to epa_2_ptp() as `epa_jp`

        Each cluster is logged at debug level as a comma-separated list of
        names with the query prefix stripped. When self.out_assign_fname is
        set, the same lines are written to that name plus ".species".
        """
        full_aln = SeqGroup(self.epa_alignment)
        species_list = epa_2_ptp(epa_jp=jp,
                                 ref_jp=self.refjson,
                                 full_alignment=full_aln,
                                 min_lw=0.5,
                                 debug=self.cfg.debug)

        self.cfg.log.debug("Species clusters:")

        # this method has no `fout` argument; the species file is named
        # after the assignment output file
        if self.out_assign_fname:
            fo2 = open(self.out_assign_fname + ".species", "w")
        else:
            fo2 = None

        try:
            for sp_cluster in species_list:
                translated_taxa = [
                    EpacConfig.strip_query_prefix(taxon)
                    for taxon in sp_cluster
                ]
                s = ",".join(translated_taxa)
                if fo2:
                    fo2.write(s + "\n")
                self.cfg.log.debug(s)
        finally:
            # close the file even if a cluster cannot be written
            if fo2:
                fo2.close()

    def print_result_line(self, fo, line):
        if self.cfg.verbose:
            print(line)
        if fo:
            fo.write(line + "\n")

    def get_noalign_list(self):
        noalign_list = []
        if os.path.exists(self.noalign):
            with open(self.noalign) as fnoa:
                lines = fnoa.readlines()
                for line in lines:
                    taxon_name = line.strip()[1:]
                    origin_taxon_name = EpacConfig.strip_query_prefix(
                        taxon_name)
                    noalign_list.append(origin_taxon_name)
        return noalign_list

    def classify(self, query_fname, minp=0.9, ptp=False):
        """Assign taxonomic ranks to the query sequences and report them.

        Placements are read from self.jplace_fname when it is set; otherwise
        the query file is prepared with checkinput() and placed by run_epa().
        One line per query goes through print_result_line():

            name <TAB> ranks <TAB> confidences <TAB> ["*" or "o"]

        The last column is only filled when self.cfg.check_novelty is set;
        "*" means novelty_check() returned True. Queries without a
        reportable rank, and those returned by get_noalign_list(), are
        reported last as  name <TAB><TAB><TAB> "?".

        query_fname -- query sequence file (not read when a jplace file is set)
        minp        -- passed through to checkinput()
        ptp         -- when true, run_ptp() is called afterwards
        """
        if self.jplace_fname:
            jp = EpaJsonParser(self.jplace_fname)
        else:
            self.checkinput(query_fname, minp)
            jp = self.run_epa()

        self.cfg.log.info(
            "Assigning taxonomic labels based on EPA placements...\n")

        placements = jp.get_placement()

        if self.out_assign_fname:
            fo = open(self.out_assign_fname, "w")
        else:
            fo = None

        try:
            noassign_list = []
            for place in placements:
                taxon_name = place["n"][0]
                origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
                edges = place["p"]

                ranks, lws = self.classify_helper.classify_seq(edges)
                rankout = self.print_ranks(ranks, lws, self.cfg.min_lhw)

                if rankout is None:
                    noassign_list.append(origin_taxon_name)
                    continue

                output = "%s\t%s\t" % (origin_taxon_name, rankout)
                if self.cfg.check_novelty:
                    isnovo = self.novelty_check(place_edge=str(edges[0][0]),
                                                ranks=ranks,
                                                lws=lws)
                    output += "*" if isnovo else "o"
                self.print_result_line(fo, output)

            noassign_list += self.get_noalign_list()

            # both sources already hold names without the query prefix, so
            # the loop variable itself is the name to report
            for taxon_name in noassign_list:
                self.print_result_line(fo, "%s\t\t\t?" % taxon_name)
        finally:
            # close the report even if classification fails half-way
            if fo:
                fo.close()

        #############################################
        #
        # EPA-PTP species delimitation
        #
        #############################################
        if ptp:
            self.run_ptp(jp)

    def novelty_check(self, place_edge, ranks, lws):
        """If the taxonomic assignment is not assigned to the genus level, 
        we need to check if it is due to the incomplete reference taxonomy or 
        it is likely to be something new:
        
        1. If the final ranks are assinged because of lw cut, that means with samller lw
        the ranks can be further assinged to lowers. This indicate the undetermined ranks 
        in the assignment is not due to the incomplete reference taxonomy, so the query 
        sequence is likely to be something new.
        
        2. Otherwise We check all leaf nodes' immediate lower rank below this ml placement point, 
        if they are not empty, output all ranks and indicate this could be novelty.
        """

        lowrank = 0
        for i in max(range(len(ranks)), 6):
            """above genus level"""
            rk = ranks[i]
            lw = lws[i]
            if rk == "-":
                break
            else:
                lowrank = lowrank + 1
                if lw >= 0 and lw < self.cfg.min_lhw:
                    return True

        if lowrank >= 5 and lowrank < len(ranks) and not ranks[lowrank] == "-":
            return False
        else:
            placenode = self.reftree.search_nodes(B=place_edge)[0]
            if placenode.is_leaf():
                return False
            else:
                leafnodes = placenode.get_leaves()
                flag = True
                for leaf in leafnodes:
                    br_num = leaf.B
                    branks = self.bid_taxonomy_map[br_num]
                    if lowrank >= len(branks) or branks[lowrank] == "-":
                        flag = False
                        break

                return flag
Example #3
0
class EpaClassifier:
    def __init__(self, config, args):
        """Set up file names, load the reference JSON and build the helper.

        Args:
            config: Configuration object; provides ``tmp_fname``,
                ``refjson_fname``, ``log`` and ``exit_user_error``.
            args: Parsed arguments; ``jplace_fname``, ``ignore_refalign``,
                ``output_name`` and ``output_dir`` are read here.
        """
        self.cfg = config
        self.jplace_fname = args.jplace_fname
        self.ignore_refalign = args.ignore_refalign

        # Temporary working files
        self.tmp_refaln = config.tmp_fname("%NAME%.refaln")
        # final alignment file handed to EPA
        self.epa_alignment = config.tmp_fname("%NAME%.afa")
        self.hmmprofile = config.tmp_fname("%NAME%.hmmprofile")
        self.tmpquery = config.tmp_fname("%NAME%.tmpquery")
        self.noalign = config.tmp_fname("%NAME%.noalign")
        self.seqs = None

        # Result files
        self.out_assign_fname = os.path.join(args.output_dir, args.output_name + ".assignment.txt")
        self.out_jplace_fname = os.path.join(args.output_dir, args.output_name + ".jplace")

        try:
            self.refjson = RefJsonParser(config.refjson_fname)
        except ValueError:
            self.cfg.exit_user_error("Invalid json file format: %s" % config.refjson_fname)
        # make sure the reference json has the expected structure
        self.refjson.validate()
        self.reftree = self.refjson.get_reftree()
        self.rate = self.refjson.get_rate()
        self.node_height = self.refjson.get_node_height()
        self.cfg.compress_patterns = self.refjson.get_pattern_compression()

        tax_map = self.refjson.get_branch_tax_map()
        if not tax_map:
            # old file format (before 1.6): the branch-to-taxonomy map is not
            # stored, so it has to be rebuilt from the trees
            helper = TaxTreeHelper(self.cfg, self.refjson.get_origin_taxonomy())
            helper.set_mf_rooted_tree(self.refjson.get_tax_tree())
            helper.set_bf_unrooted_tree(self.refjson.get_reftree())
            tax_map = helper.get_bid_taxonomy_map()
        self.bid_taxonomy_map = tax_map

        taxa_count = len(self.reftree.get_leaves())
        self.cfg.log.info("Loaded reference tree with %d taxa\n" % taxa_count)

        self.classify_helper = TaxClassifyHelper(self.cfg, self.bid_taxonomy_map, self.rate, self.node_height)
        
    def require_muscle(self):
        """Report a user error unless ``epac/bin/muscle`` exists next to this file."""
        here = os.path.dirname(os.path.abspath(__file__))
        if os.path.exists(here + "/epac/bin/muscle"):
            return
        errmsg = "".join((
            "The pipeline uses MUSCLE to merge alignments, please download the programm from:\n",
            "http://www.drive5.com/muscle/downloads.htm\n",
            "and specify path to your installation in the config file (sativa.cfg)\n",
        ))
        self.cfg.exit_user_error(errmsg)

    def require_hmmer(self):
        """Report a user error unless both ``hmmbuild`` and ``hmmalign`` exist in ``epac/bin``."""
        bin_prefix = os.path.dirname(os.path.abspath(__file__)) + "/epac/bin/"
        if os.path.exists(bin_prefix + "hmmbuild") and os.path.exists(bin_prefix + "hmmalign"):
            return
        errmsg = "".join((
            "The pipeline uses HAMMER to align the query seqeunces, please download the programm from:\n",
            "http://hmmer.janelia.org/\n",
            "and specify path to your installation in the config file (sativa.cfg)\n",
        ))
        self.cfg.exit_user_error(errmsg)

    def align_to_refenence(self, noalign, minp = 0.9):
        """Align the query sequences to the reference alignment with hmmer.

        The reference alignment is written to ``self.tmp_refaln`` and the
        profile is taken from the reference json; when the json holds no
        profile, one is built from the reference alignment.  The value
        returned by ``align()`` is stored in ``self.epa_alignment``.

        Args:
            noalign: Passed to hmmer as its ``discard`` argument.
            minp: Passed through to hmmer unchanged.
        """
        ref_alignment = self.refjson.get_alignment(fout = self.tmp_refaln)
        profile = self.refjson.get_hmm_profile(self.hmmprofile)

        if not profile:
            # no hmmer profile stored in the json file: build it from scratch
            profile = hmmer(self.cfg, ref_alignment).build_hmm_profile()

        aligner = hmmer(
            config = self.cfg,
            refalign = ref_alignment,
            query = self.tmpquery,
            refprofile = profile,
            discard = noalign,
            seqs = self.seqs,
            minp = minp,
        )
        self.epa_alignment = aligner.align()

    def merge_alignment(self, query_seqs):
        """Write reference and query sequences into ``self.epa_alignment`` as FASTA.

        Reference records come first (from the reference json), followed by
        every entry of ``query_seqs`` in iteration order.
        """
        ref_records = self.refjson.get_alignment_list()
        with open(self.epa_alignment, "w") as fasta:
            fasta.writelines(
                ">" + record[0] + "\n" + record[1] + "\n" for record in ref_records
            )
            fasta.writelines(
                ">" + name + "\n" + seq + "\n"
                for name, seq, _comment, _sid in query_seqs.iter_entries()
            )


    def checkinput(self, query_fname, minp = 0.9):
        """Load the query file and prepare the alignment file for EPA.

        The input format is guessed by trying each supported format in turn.
        Depending on the input, the combined alignment is produced in one of
        four ways:

        * ``self.ignore_refalign`` set: the query file is assumed to already
          contain the reference sequences and is written out as is, with
          reference/query name prefixes applied;
        * queries aligned and as long as the reference alignment: plain merge;
        * queries aligned but of a different length: merge with MUSCLE;
        * otherwise: align the queries to the reference with HMMER.

        ``self.seqs`` and ``self.query_count`` are set as a side effect.

        Args:
            query_fname: Path of the query sequence file (fasta or phylip).
            minp: Forwarded to ``align_to_refenence``.
        """
        formats = ["fasta", "phylip", "iphylip", "phylip_relaxed", "iphylip_relaxed"]
        for fmt in formats:
            try:
                self.seqs = SeqGroup(sequences=query_fname, format = fmt)
                break
            except Exception:
                # A parse failure only means "not this format".  Catching
                # Exception (not everything) lets Ctrl-C and SystemExit
                # propagate instead of being mistaken for a format mismatch.
                self.cfg.log.debug("Guessing input format: not " + fmt)
        if self.seqs is None:
            self.cfg.exit_user_error("Invalid input file format: %s\nThe supported input formats are fasta and phylip" % query_fname)

        if self.ignore_refalign:
            self.cfg.log.info("Assuming query file contains reference sequences, skipping the alignment step...\n")
            self.query_count = 0
            # Fetch the reference names once instead of once per sequence.
            ref_names = set(self.refjson.get_sequences_names())
            with open(self.epa_alignment, "w") as fout:
                for name, seq, comment, sid in self.seqs.iter_entries():
                    ref_name = self.refjson.get_corr_seqid(EpacConfig.REF_SEQ_PREFIX + name)
                    if ref_name in ref_names:
                        seq_name = ref_name
                    else:
                        seq_name = EpacConfig.QUERY_SEQ_PREFIX + name
                        self.query_count += 1
                    fout.write(">" + seq_name + "\n" + seq + "\n")
            return

        self.query_count = len(self.seqs)

        # add query seq name prefix to avoid confusion between reference and query sequences
        self.seqs.add_name_prefix(EpacConfig.QUERY_SEQ_PREFIX)

        self.seqs.write(format="fasta", outfile=self.tmpquery)
        self.cfg.log.info("Checking if query sequences are aligned ...")
        entries = self.seqs.get_entries()
        if not entries:
            # Without this guard an input with no sequences would fail below
            # with an IndexError instead of a readable message.
            self.cfg.exit_user_error("No sequences found in the input file: %s" % query_fname)

        # Queries count as aligned when all sequences have the same length.
        seql = len(entries[0][1])
        aligned = all(len(entry[1]) == seql for entry in entries[1:])

        if aligned and len(self.seqs) > 1:
            self.cfg.log.info("Query sequences are aligned")
            refalnl = self.refjson.get_alignment_length()
            if refalnl == seql:
                self.cfg.log.info("Merging query alignment with reference alignment")
                self.merge_alignment(self.seqs)
            else:
                self.cfg.log.info("Merging query alignment with reference alignment using MUSCLE")
                self.require_muscle()
                refaln = self.refjson.get_alignment(fout = self.tmp_refaln)
                m = muscle(self.cfg)
                self.epa_alignment = m.merge(refaln, self.tmpquery)
        else:
            self.cfg.log.info("Query sequences are not aligned")
            self.cfg.log.info("Align query sequences to the reference alignment using HMMER")
            self.require_hmmer()
            self.align_to_refenence(self.noalign, minp = minp)

    def print_ranks(self, rks, confs, minlw = 0.0):
        """Format ranks and confidences as ``"r1;r2;...<TAB>c1;c2;..."``.

        Ranks are first translated with ``self.refjson.get_uncorr_ranks``.
        Output stops at the first rank whose confidence is below ``minlw``.
        A confidence equal to the first one is reported as 1.0 when that
        first confidence is at least 0.99.

        Returns:
            The formatted string, or None when no rank passes the threshold.
        """
        rank_names = self.refjson.get_uncorr_ranks(rks)
        kept_names = []
        kept_confs = []
        for idx in range(len(rank_names)):
            conf = confs[idx]
            if conf == confs[0] and confs[0] >= 0.99:
                conf = 1.0
            if not conf >= minlw:
                break
            kept_names.append(rank_names[idx])
            kept_confs.append("{0:.3f}".format(conf))

        if not kept_names:
            return None
        return ";".join(kept_names) + "\t" + ";".join(kept_confs)


    def classify(self, query_fname, minp = 0.9, ptp = False):
        """Place the query sequences with EPA and write taxonomic assignments.

        When ``self.jplace_fname`` is set, placements are read from that
        file.  Otherwise the input is prepared with ``checkinput``, RAxML-EPA
        is run, and ``raxml.copy_epa_jplace(..., move=True)`` transfers the
        resulting jplace file to ``self.out_jplace_fname``.

        One tab-separated line is produced per assigned query: name, ranks,
        confidences and a final ``*`` (flagged by ``novelty_check``) or ``o``.
        Queries without an assignment, including those listed in the
        ``self.noalign`` file, get a line ending in ``?``.  Lines are written
        to ``self.out_assign_fname`` and printed when ``self.cfg.verbose``.

        Args:
            query_fname: Path of the query sequence file.
            minp: Forwarded to ``checkinput``.
            ptp: If true, additionally run EPA-PTP species delimitation and
                write the clusters to ``<assignment file>.species``.
        """
        if self.jplace_fname:
            jp = EpaJsonParser(self.jplace_fname)
        else:
            self.checkinput(query_fname, minp)

            self.cfg.log.info("Running RAxML-EPA to place %d query sequences...\n" % self.query_count)
            # Use the instance configuration: no name `config` is bound in
            # this method.
            raxml = RaxmlWrapper(self.cfg)
            reftree_fname = self.cfg.tmp_fname("ref_%NAME%.tre")
            self.refjson.get_raxml_readable_tree(reftree_fname)
            optmod_fname = self.cfg.tmp_fname("%NAME%.opt")
            self.refjson.get_binary_model(optmod_fname)
            job_name = self.cfg.subst_name("epa_%NAME%")

            reftree_str = self.refjson.get_raxml_readable_tree()
            reftree = Tree(reftree_str)

            self.reftree_size = len(reftree.get_leaves())

            # IMPORTANT: set EPA heuristic rate based on tree size!
            self.cfg.resolve_auto_settings(self.reftree_size)
            # If we're loading the pre-optimized model, we MUST set the same rate het. mode as in the ref file
            if self.cfg.epa_load_optmod:
                self.cfg.raxml_model = self.refjson.get_ratehet_model()

            reduced_align_fname = raxml.reduce_alignment(self.epa_alignment)

            jp = raxml.run_epa(job_name, reduced_align_fname, reftree_fname, optmod_fname)

            raxml.copy_epa_jplace(job_name, self.out_jplace_fname, move=True)

        self.cfg.log.info("Assigning taxonomic labels based on EPA placements...\n")

        placements = jp.get_placement()

        fo = open(self.out_assign_fname, "w") if self.out_assign_fname else None

        def emit(line):
            # Send one result line to stdout (verbose mode) and to the file.
            if self.cfg.verbose:
                print(line)
            if fo:
                fo.write(line + "\n")

        # try/finally so the assignment file is closed even if classification
        # of some placement fails.
        try:
            noassign_list = []
            for place in placements:
                taxon_name = place["n"][0]
                origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
                edges = place["p"]
                if len(edges) == 0:
                    noassign_list.append(origin_taxon_name)
                    continue

                ranks, lws = self.classify_helper.classify_seq(edges)

                # The first listed edge is used as the placement branch.
                isnovo = self.novelty_check(place_edge = str(edges[0][0]), ranks=ranks, lws=lws)
                rankout = self.print_ranks(ranks, lws, self.cfg.min_lhw)

                if rankout is None:
                    noassign_list.append(origin_taxon_name)
                else:
                    marker = "*" if isnovo else "o"
                    emit("%s\t%s\t%s" % (origin_taxon_name, rankout, marker))

            # Sequences listed in the noalign file are reported as unassigned
            # too; each line is stripped and its first character dropped.
            if os.path.exists(self.noalign):
                with open(self.noalign) as fnoa:
                    for line in fnoa:
                        taxon_name = line.strip()[1:]
                        noassign_list.append(EpacConfig.strip_query_prefix(taxon_name))

            # Report each unassigned sequence under its own name.
            for taxon_name in noassign_list:
                emit("%s\t\t\t?" % taxon_name)
        finally:
            if fo:
                fo.close()

        #############################################
        #
        # EPA-PTP species delimitation
        #
        #############################################
        if ptp:
            full_aln = SeqGroup(self.epa_alignment)
            species_list = epa_2_ptp(epa_jp = jp, ref_jp = self.refjson, full_alignment = full_aln, min_lw = 0.5, debug = self.cfg.debug)

            self.cfg.log.debug("Species clusters:")

            # The species file is named after the assignment output file.
            if self.out_assign_fname:
                fo2 = open(self.out_assign_fname + ".species", "w")
            else:
                fo2 = None

            try:
                for sp_cluster in species_list:
                    s = ",".join(EpacConfig.strip_query_prefix(taxon) for taxon in sp_cluster)
                    if fo2:
                        fo2.write(s + "\n")
                    self.cfg.log.debug(s)
            finally:
                if fo2:
                    fo2.close()
        #############################################
        
    def novelty_check(self, place_edge, ranks, lws):
        """Tell whether an assignment that stops above genus level looks novel.

        An assignment can be incomplete either because the reference taxonomy
        itself is incomplete or because the query is something new:

        1. If one of the assigned ranks has a likelihood weight below
           ``self.cfg.min_lhw``, the ranks were cut by the weight threshold
           rather than by missing reference labels, so the query is treated
           as a likely novelty.

        2. Otherwise the leaves below the placement node are inspected: if
           every one of them carries a label at the first unassigned rank,
           the query is flagged as a possible novelty.

        Args:
            place_edge: Value matched against the ``B`` attribute of the
                reference tree nodes to locate the placement node.
            ranks: Assigned rank names; ``"-"`` marks an undetermined rank.
            lws: Per-rank weights, parallel to ``ranks``. Negative values are
                never treated as falling below the threshold.

        Returns:
            True if the query is flagged as a potential novelty, else False.
        """
        # Only the ranks above genus level (at most the first six) are scanned.
        lowrank = 0
        for i in range(min(len(ranks), 6)):
            rk = ranks[i]
            lw = lws[i]
            if rk == "-":
                break
            lowrank += 1
            if 0 <= lw < self.cfg.min_lhw:
                return True

        # The rank right below the scanned prefix is assigned: nothing novel.
        if 5 <= lowrank < len(ranks) and ranks[lowrank] != "-":
            return False

        placenode = self.reftree.search_nodes(B = place_edge)[0]
        if placenode.is_leaf():
            return False

        # Novel only if every leaf below the placement has a label at the
        # first unassigned rank.  lowrank can equal the length of a leaf's
        # taxonomy (all ranks assigned), so guard the index: a missing entry
        # counts as "no label" rather than raising IndexError.
        for leaf in placenode.get_leaves():
            branks = self.bid_taxonomy_map[leaf.B]
            if lowrank >= len(branks) or branks[lowrank] == "-":
                return False
        return True