Ejemplo n.º 1
0
    def run_ptp(self, jp):
        """Run EPA-PTP species delimitation and report the species clusters.

        Every cluster returned by ``epa_2_ptp`` is turned into one
        comma-separated line of taxon names (query prefix stripped) and
        logged at debug level.  When an assignment output file is configured,
        the same lines are also written to ``<out_assign_fname>.species``.

        :param jp: EPA placement results, passed to ``epa_2_ptp`` as
            ``epa_jp``.
        """
        full_aln = SeqGroup(self.epa_alignment)
        species_list = epa_2_ptp(epa_jp=jp,
                                 ref_jp=self.refjson,
                                 full_alignment=full_aln,
                                 min_lw=0.5,
                                 debug=self.cfg.debug)

        self.cfg.log.debug("Species clusters:")

        # This method has no ``fout`` argument, so the species file name is
        # derived from the assignment output path instead.
        # NOTE(review): confirm out_assign_fname is the intended base name.
        fout = getattr(self, "out_assign_fname", None)
        fo2 = open(fout + ".species", "w") if fout else None

        try:
            for sp_cluster in species_list:
                s = ",".join([EpacConfig.strip_query_prefix(taxon)
                              for taxon in sp_cluster])
                if fo2:
                    fo2.write(s + "\n")
                self.cfg.log.debug(s)
        finally:
            # Close the file even if a cluster fails to format or write.
            if fo2:
                fo2.close()
Ejemplo n.º 2
0
    def check_seq_tax_labels(self, seq_name, orig_ranks, ranks, lws):
        """Record the first rank level at which ``ranks`` contradicts
        ``orig_ranks``.

        Levels are compared pairwise up to the shorter of the two rank lists.
        A level counts as mislabeled when the entry in ``ranks`` is not
        ``Taxonomy.EMPTY_RANK`` and differs from the entry in ``orig_ranks``.

        :param seq_name: sequence name; stored with the reference prefix
            stripped.
        :param orig_ranks: original rank list of the sequence.
        :param ranks: rank list to compare against ``orig_ranks``.
        :param lws: per-level weights; the value at the mislabeled level is
            stored as ``conf``.
        :returns: the mislabel record (also appended to ``self.mislabels``),
            or ``None`` when no level disagrees.
        """
        mislabel_lvl = -1
        for level, (old_rank, new_rank) in enumerate(zip(orig_ranks, ranks)):
            if new_rank != Taxonomy.EMPTY_RANK and new_rank != old_rank:
                mislabel_lvl = level
                break

        if mislabel_lvl < 0:
            return None

        real_lvl = self.guess_rank_level(orig_ranks, mislabel_lvl)
        mis_rec = {
            'name': EpacConfig.strip_ref_prefix(seq_name),
            'orig_level': mislabel_lvl,
            'real_level': real_lvl,
            'level_name': self.rank_level_name(real_lvl)[1],
            # negated level, kept only so records can be sorted on it
            'inv_level': -1 * real_lvl,
            'orig_ranks': orig_ranks,
            'ranks': ranks,
            'lws': lws,
            'conf': lws[mislabel_lvl],
        }
        self.mislabels.append(mis_rec)
        return mis_rec
Ejemplo n.º 3
0
 def get_noalign_list(self):
     """Return the taxon names listed in the ``self.noalign`` file.

     Each line is stripped of surrounding whitespace, its first character
     is dropped, and the query prefix is removed from what remains.  An
     empty list is returned when the file does not exist.
     """
     if not os.path.exists(self.noalign):
         return []

     with open(self.noalign) as fnoa:
         return [EpacConfig.strip_query_prefix(raw_line.strip()[1:])
                 for raw_line in fnoa]
Ejemplo n.º 4
0
    def setUp(self):
        """Prepare the fixtures shared by the tests of this case.

        Loads the ``test_clean.tax`` taxonomy and the ``outgroup.nw`` tree
        from the ``testfiles`` directory next to this module, and builds a
        ``TaxTreeHelper`` from a default configuration and the taxonomy map.
        """
        module_dir = os.path.dirname(os.path.abspath(__file__))
        self.testfile_dir = os.path.join(module_dir, "testfiles")

        self.tax_fname = os.path.join(self.testfile_dir, "test_clean.tax")
        self.taxonomy = Taxonomy(EpacConfig.REF_SEQ_PREFIX, self.tax_fname)

        taxonomy_map = self.taxonomy.get_map()
        self.taxtree_helper = TaxTreeHelper(EpacConfig(), taxonomy_map)

        self.expected_outgr = Tree(
            os.path.join(self.testfile_dir, "outgroup.nw"))
Ejemplo n.º 5
0
 def test_taxtree_builder(self):
     """Build a tree from ``test.tax`` and compare it to ``taxtree.nw``.

     Checks that the sequence IDs returned by the builder equal the keys
     of the taxonomy map, and that the built tree serializes (format 8)
     to the same string as the expected tree.
     """
     config = EpacConfig()
     module_dir = os.path.dirname(os.path.abspath(__file__))
     fixtures = os.path.join(module_dir, "testfiles")

     taxonomy = Taxonomy(EpacConfig.REF_SEQ_PREFIX,
                         os.path.join(fixtures, "test.tax"))
     reference_tree = Tree(os.path.join(fixtures, "taxtree.nw"), format=8)

     built_tree, built_ids = TaxTreeBuilder(config, taxonomy).build()

     self.assertEqual(built_ids, taxonomy.get_map().keys())
     self.assertEqual(built_tree.write(format=8),
                      reference_tree.write(format=8))
Ejemplo n.º 6
0
    def classify(self, query_fname, minp=0.9, ptp=False):
        """Assign taxonomic labels to query sequences from EPA placements.

        Placements are read from ``self.jplace_fname`` when it is set;
        otherwise the input is checked and ``self.run_epa()`` supplies them.
        One result line per query is emitted through
        ``self.print_result_line``: assigned queries first, then every query
        without an assignment (including those from
        ``self.get_noalign_list()``) as ``<name>\\t\\t\\t?``.

        :param query_fname: query file, forwarded to ``self.checkinput``
            (only used when no jplace file is configured).
        :param minp: forwarded to ``self.checkinput``.
        :param ptp: when true, also run EPA-PTP species delimitation on the
            placements.
        """
        if self.jplace_fname:
            jp = EpaJsonParser(self.jplace_fname)
        else:
            self.checkinput(query_fname, minp)
            jp = self.run_epa()

        self.cfg.log.info(
            "Assigning taxonomic labels based on EPA placements...\n")

        placements = jp.get_placement()

        fo = open(self.out_assign_fname, "w") if self.out_assign_fname else None
        try:
            noassign_list = []
            for place in placements:
                taxon_name = place["n"][0]
                origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
                edges = place["p"]

                ranks, lws = self.classify_helper.classify_seq(edges)
                rankout = self.print_ranks(ranks, lws, self.cfg.min_lhw)

                if rankout is None:
                    noassign_list.append(origin_taxon_name)
                    continue

                output = "%s\t%s\t" % (origin_taxon_name, rankout)
                if self.cfg.check_novelty:
                    isnovo = self.novelty_check(place_edge=str(edges[0][0]),
                                                ranks=ranks,
                                                lws=lws)
                    output += "*" if isnovo else "o"
                self.print_result_line(fo, output)

            noassign_list += self.get_noalign_list()

            # Report each unassigned query under its own name.  Formatting
            # the leftover ``origin_taxon_name`` from the loop above would
            # repeat the last placed query for every entry.
            for taxon_name in noassign_list:
                self.print_result_line(fo, "%s\t\t\t?" % taxon_name)
        finally:
            # Close the assignment file even if classification fails midway.
            if fo:
                fo.close()

        #############################################
        #
        # EPA-PTP species delimitation
        #
        #############################################
        if ptp:
            self.run_ptp(jp)
Ejemplo n.º 7
0
 def check_seq_ids(self):
     """Check that every taxonomy sequence ID is present in the alignment.

     The reference prefix is stripped from each key of
     ``self.taxonomy.seq_ranks_map`` and the result is looked up with
     ``self.alignment.has_seq``.  Missing IDs are collected in
     ``self.mis_ids``.  When IDs are missing and ``self.verbose`` is set,
     an error message listing them is passed to
     ``self.cfg.exit_user_error``.

     :returns: the list of missing (unprefixed) sequence IDs.
     """
     self.mis_ids = []
     # Iterate the mapping directly: this yields the keys on both Python 2
     # and Python 3, whereas ``iterkeys()`` exists only on Python 2.
     for sid in self.taxonomy.seq_ranks_map:
         unprefixed_sid = EpacConfig.strip_ref_prefix(sid)
         if not self.alignment.has_seq(unprefixed_sid):
             self.mis_ids.append(unprefixed_sid)

     if self.mis_ids and self.verbose:
         errmsg = "ERROR: Following %d sequence(s) are missing in your alignment file:\n%s\n\n" % (len(self.mis_ids), "\n".join(self.mis_ids))
         errmsg += "Please make sure sequence IDs in taxonomic annotation file and in alignment are identical!\n"
         self.cfg.exit_user_error(errmsg)

     return self.mis_ids
Ejemplo n.º 8
0
    def check_seq_ids(self):
        """Verify that taxonomy sequence IDs also occur in the alignment.

        For each key of ``self.taxonomy.seq_ranks_map`` the reference prefix
        is stripped and ``self.alignment.has_seq`` is queried.  IDs that are
        not found end up in ``self.mis_ids``.  If any are missing and
        ``self.verbose`` is set, ``self.cfg.exit_user_error`` is called with
        a message listing them.

        :returns: the list of missing (unprefixed) sequence IDs.
        """
        self.mis_ids = []
        # ``iterkeys()`` does not exist on Python 3; iterating the mapping
        # itself visits the same keys on every Python version.
        for sid in self.taxonomy.seq_ranks_map:
            unprefixed_sid = EpacConfig.strip_ref_prefix(sid)
            if not self.alignment.has_seq(unprefixed_sid):
                self.mis_ids.append(unprefixed_sid)

        if self.mis_ids and self.verbose:
            errmsg = "ERROR: Following %d sequence(s) are missing in your alignment file:\n%s\n\n" % (
                len(self.mis_ids), "\n".join(self.mis_ids))
            errmsg += "Please make sure sequence IDs in taxonomic annotation file and in alignment are identical!\n"
            self.cfg.exit_user_error(errmsg)

        return self.mis_ids
Ejemplo n.º 9
0
 def mis_rec_to_string(self, mis_rec):
     """Format a mislabel record as one tab-separated line.

     Columns: sequence name, level name, original rank, new rank, weight
     at the mislabeled level, original lineage, new lineage, all weights
     joined with ``;``, and ``rank_conf`` when the record has that key.
     Names and ranks are first mapped through ``self.refjson``'s
     ``get_uncorr_*`` methods.  For a negative ``orig_level`` the two rank
     columns read ``NA`` and the first weight is used.

     :param mis_rec: mislabel record (dict).
     :returns: the formatted line, without a trailing newline.
     """
     level = mis_rec['orig_level']
     seq_label = EpacConfig.strip_ref_prefix(
         self.refjson.get_uncorr_seqid(mis_rec['name']))
     old_lineage = self.refjson.get_uncorr_ranks(mis_rec['orig_ranks'])
     new_lineage = self.refjson.get_uncorr_ranks(mis_rec['ranks'])

     if level >= 0:
         summary = (mis_rec['level_name'], old_lineage[level],
                    new_lineage[level], mis_rec['lws'][level])
     else:
         summary = (mis_rec['level_name'], "NA", "NA", mis_rec['lws'][0])

     fields = [
         seq_label,
         "%s\t%s\t%s\t%.3f" % summary,
         Taxonomy.lineage_str(old_lineage),
         Taxonomy.lineage_str(new_lineage),
         ";".join(["%.3f" % weight for weight in mis_rec['lws']]),
     ]
     if 'rank_conf' in mis_rec:
         fields.append("%.3f" % mis_rec['rank_conf'])

     return "\t".join(fields)
Ejemplo n.º 10
0
    def mis_rec_to_string(self, mis_rec):
        """Render a mislabel record as a single tab-delimited string.

        The line holds, in order: the sequence name, the level name, the
        original and new rank at the mislabeled level, the weight at that
        level, the original lineage, the new lineage and all weights joined
        by ``;``.  ``rank_conf`` is appended as a final column when present.
        Name and ranks are passed through ``self.refjson``'s ``get_uncorr_*``
        methods before formatting.  When ``orig_level`` is negative the rank
        columns are ``NA`` and the first weight is reported.

        :param mis_rec: mislabel record (dict).
        :returns: the formatted line, without a trailing newline.
        """
        lvl = mis_rec['orig_level']
        seq_id = EpacConfig.strip_ref_prefix(
            self.refjson.get_uncorr_seqid(mis_rec['name']))
        orig_lineage = self.refjson.get_uncorr_ranks(mis_rec['orig_ranks'])
        new_lineage = self.refjson.get_uncorr_ranks(mis_rec['ranks'])

        if lvl < 0:
            level_cols = (mis_rec['level_name'], "NA", "NA",
                          mis_rec['lws'][0])
        else:
            level_cols = (mis_rec['level_name'], orig_lineage[lvl],
                          new_lineage[lvl], mis_rec['lws'][lvl])

        chunks = [seq_id + "\t"]
        chunks.append("%s\t%s\t%s\t%.3f\t" % level_cols)
        chunks.append(Taxonomy.lineage_str(orig_lineage) + "\t")
        chunks.append(Taxonomy.lineage_str(new_lineage) + "\t")
        chunks.append(";".join(["%.3f" % lw for lw in mis_rec['lws']]))
        if 'rank_conf' in mis_rec:
            chunks.append("\t%.3f" % mis_rec['rank_conf'])

        return "".join(chunks)
Ejemplo n.º 11
0
 def test_epac_config(self):
     """Build an ``EpacConfig`` from the default namespace and run the
     common configuration checks on it."""
     default_args = self.get_default_namespace()
     self.check_common_config(EpacConfig(default_args))
Ejemplo n.º 12
0
    def classify(self, query_fname, fout=None, method="1", minlw=0.0, pv=0.02, minp=0.9, ptp=False):
        """Classify query sequences from EPA placements and report the results.

        Placements are read from ``self.jplace_fname`` when it is set;
        otherwise RAxML-EPA is run on ``self.epa_alignment`` using the
        reference tree and model exported from ``self.refjson``.  Assigned
        queries are reported first, then the entries of the ``self.noalign``
        file, then queries without an assignment (``<name>\\t\\t\\t?``).
        Lines are printed when ``self.cfg.verbose`` is set and written to
        ``fout`` when it is given.

        :param query_fname: query file, forwarded to ``self.checkinput``
            (only used when EPA has to be run).
        :param fout: path of the assignment output file, or ``None`` for no
            file.  Species clusters are written to ``fout + ".species"``.
        :param method: forwarded to ``classify_helper.classify_seq``.
        :param minlw: forwarded to ``classify_seq``, ``novelty_check`` and
            ``print_ranks``.
        :param pv: currently unused; it parameterised the Erlang filter,
            which is disabled below.
        :param minp: forwarded to ``self.checkinput``.
        :param ptp: when true, also run EPA-PTP species delimitation.
        """
        if self.jplace_fname:
            jp = EpaJsonParser(self.jplace_fname)
        else:
            self.checkinput(query_fname, minp)
            # Use the instance configuration; a bare ``config`` is not
            # defined in this method.
            # NOTE(review): confirm self.cfg is the object RaxmlWrapper expects.
            raxml = RaxmlWrapper(self.cfg)
            reftree_fname = self.cfg.tmp_fname("ref_%NAME%.tre")
            self.refjson.get_raxml_readable_tree(reftree_fname)
            optmod_fname = self.cfg.tmp_fname("%NAME%.opt")
            self.refjson.get_binary_model(optmod_fname)
            job_name = self.cfg.subst_name("epa_%NAME%")

            reftree_str = self.refjson.get_raxml_readable_tree()
            reftree = Tree(reftree_str)

            self.reftree_size = len(reftree.get_leaves())

            # IMPORTANT: set EPA heuristic rate based on tree size!
            self.cfg.resolve_auto_settings(self.reftree_size)
            # If we're loading the pre-optimized model, we MUST set the same
            # rate het. mode as in the ref file
            if self.cfg.epa_load_optmod:
                self.cfg.raxml_model = self.refjson.get_ratehet_model()

            reduced_align_fname = raxml.reduce_alignment(self.epa_alignment)

            jp = raxml.run_epa(job_name, reduced_align_fname, reftree_fname, optmod_fname)

        placements = jp.get_placement()

        fo = open(fout, "w") if fout else None
        try:
            # Lines for queries without an assignment; emitted after all
            # assigned queries and the noalign entries.
            unassigned_lines = []
            for place in placements:
                taxon_name = place["n"][0]
                origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
                edges = place["p"]
                # Erlang filter is disabled:
                # edges = self.erlang_filter(edges, p = pv)
                rankout = None
                if len(edges) > 0:
                    ranks, lws = self.classify_helper.classify_seq(edges, method, minlw)
                    isnovo = self.novelty_check(place_edge=str(edges[0][0]), ranks=ranks,
                                                lws=lws, minlw=minlw)
                    rankout = self.print_ranks(ranks, lws, minlw)

                if rankout is None:
                    unassigned_lines.append(origin_taxon_name + "\t\t\t?\n")
                    continue

                # Reuse ``rankout`` instead of formatting the ranks again.
                output = "%s\t%s\t" % (origin_taxon_name, rankout)
                output += "*" if isnovo else "o"
                if self.cfg.verbose:
                    print(output)
                if fo:
                    fo.write(output + "\n")

            if os.path.exists(self.noalign):
                with open(self.noalign) as fnoa:
                    for line in fnoa:
                        taxon_name = line.strip()[1:]
                        origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
                        output = "%s\t\t\t?" % origin_taxon_name
                        if self.cfg.verbose:
                            print(output)
                        if fo:
                            fo.write(output + "\n")

            output2 = "".join(unassigned_lines)
            if self.cfg.verbose:
                print(output2)
            if fo:
                fo.write(output2)
        finally:
            # Close the assignment file even if classification fails midway.
            if fo:
                fo.close()

        #############################################
        #
        # EPA-PTP species delimitation
        #
        #############################################
        if ptp:
            full_aln = SeqGroup(self.epa_alignment)
            species_list = epa_2_ptp(epa_jp=jp, ref_jp=self.refjson, full_alignment=full_aln,
                                     min_lw=0.5, debug=self.cfg.debug)

            if self.cfg.verbose:
                print("Species clusters:")

            fo2 = open(fout + ".species", "w") if fout else None
            try:
                for sp_cluster in species_list:
                    s = ",".join([EpacConfig.strip_query_prefix(taxon)
                                  for taxon in sp_cluster])
                    if fo2:
                        fo2.write(s + "\n")
                    if self.cfg.verbose:
                        print(s)
            finally:
                if fo2:
                    fo2.close()
        #############################################

        # Temporary EPA files exist only when EPA was run here; they are
        # kept in debug mode.
        if not self.jplace_fname and not self.cfg.debug:
            raxml.cleanup(job_name)
            FileUtils.remove_if_exists(reduced_align_fname)
            FileUtils.remove_if_exists(reftree_fname)
            FileUtils.remove_if_exists(optmod_fname)
Ejemplo n.º 13
0
 def setUp(self):
     """Create a default configuration and locate the ``testfiles``
     directory next to this module."""
     self.cfg = EpacConfig()
     module_dir = os.path.dirname(os.path.abspath(__file__))
     self.testfile_dir = os.path.join(module_dir, "testfiles")
Ejemplo n.º 14
0
    def classify(self, query_fname, minp=0.9, ptp=False):
        """Assign taxonomic labels to query sequences from EPA placements.

        Placements are read from ``self.jplace_fname`` when it is set;
        otherwise RAxML-EPA is run on ``self.epa_alignment`` using the
        reference tree and model exported from ``self.refjson``, and the
        resulting jplace file is moved to ``self.out_jplace_fname``.
        Assigned queries are reported first, followed by all queries without
        an assignment (including the entries of the ``self.noalign`` file) as
        ``<name>\\t\\t\\t?``.  Lines are printed when ``self.cfg.verbose`` is
        set and written to ``self.out_assign_fname`` when it is configured.

        :param query_fname: query file, forwarded to ``self.checkinput``
            (only used when EPA has to be run).
        :param minp: forwarded to ``self.checkinput``.
        :param ptp: when true, also run EPA-PTP species delimitation and
            write the clusters to ``<out_assign_fname>.species``.
        """
        if self.jplace_fname:
            jp = EpaJsonParser(self.jplace_fname)
        else:
            self.checkinput(query_fname, minp)

            self.cfg.log.info("Running RAxML-EPA to place %d query sequences...\n" % self.query_count)
            # Use the instance configuration; a bare ``config`` is not
            # defined in this method.
            # NOTE(review): confirm self.cfg is the object RaxmlWrapper expects.
            raxml = RaxmlWrapper(self.cfg)
            reftree_fname = self.cfg.tmp_fname("ref_%NAME%.tre")
            self.refjson.get_raxml_readable_tree(reftree_fname)
            optmod_fname = self.cfg.tmp_fname("%NAME%.opt")
            self.refjson.get_binary_model(optmod_fname)
            job_name = self.cfg.subst_name("epa_%NAME%")

            reftree_str = self.refjson.get_raxml_readable_tree()
            reftree = Tree(reftree_str)

            self.reftree_size = len(reftree.get_leaves())

            # IMPORTANT: set EPA heuristic rate based on tree size!
            self.cfg.resolve_auto_settings(self.reftree_size)
            # If we're loading the pre-optimized model, we MUST set the same
            # rate het. mode as in the ref file
            if self.cfg.epa_load_optmod:
                self.cfg.raxml_model = self.refjson.get_ratehet_model()

            reduced_align_fname = raxml.reduce_alignment(self.epa_alignment)

            jp = raxml.run_epa(job_name, reduced_align_fname, reftree_fname, optmod_fname)

            raxml.copy_epa_jplace(job_name, self.out_jplace_fname, move=True)

        self.cfg.log.info("Assigning taxonomic labels based on EPA placements...\n")

        placements = jp.get_placement()

        fo = open(self.out_assign_fname, "w") if self.out_assign_fname else None
        try:
            noassign_list = []
            for place in placements:
                taxon_name = place["n"][0]
                origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
                edges = place["p"]
                rankout = None
                if len(edges) > 0:
                    ranks, lws = self.classify_helper.classify_seq(edges)
                    isnovo = self.novelty_check(place_edge=str(edges[0][0]), ranks=ranks, lws=lws)
                    rankout = self.print_ranks(ranks, lws, self.cfg.min_lhw)

                if rankout is None:
                    noassign_list.append(origin_taxon_name)
                    continue

                output = "%s\t%s\t" % (origin_taxon_name, rankout)
                output += "*" if isnovo else "o"
                if self.cfg.verbose:
                    print(output)
                if fo:
                    fo.write(output + "\n")

            if os.path.exists(self.noalign):
                with open(self.noalign) as fnoa:
                    for line in fnoa:
                        taxon_name = line.strip()[1:]
                        noassign_list.append(EpacConfig.strip_query_prefix(taxon_name))

            # Report each unassigned query under its own name.  Formatting
            # the leftover ``origin_taxon_name`` from the loops above would
            # repeat one name for every entry.
            for taxon_name in noassign_list:
                output = "%s\t\t\t?" % taxon_name
                if self.cfg.verbose:
                    print(output)
                if fo:
                    fo.write(output + "\n")
        finally:
            # Close the assignment file even if classification fails midway.
            if fo:
                fo.close()

        #############################################
        #
        # EPA-PTP species delimitation
        #
        #############################################
        if ptp:
            full_aln = SeqGroup(self.epa_alignment)
            species_list = epa_2_ptp(epa_jp=jp, ref_jp=self.refjson, full_alignment=full_aln,
                                     min_lw=0.5, debug=self.cfg.debug)

            self.cfg.log.debug("Species clusters:")

            # This method has no ``fout`` argument, so the species file name
            # is derived from the assignment output path instead.
            # NOTE(review): confirm out_assign_fname is the intended base name.
            fout = self.out_assign_fname
            fo2 = open(fout + ".species", "w") if fout else None
            try:
                for sp_cluster in species_list:
                    s = ",".join([EpacConfig.strip_query_prefix(taxon)
                                  for taxon in sp_cluster])
                    if fo2:
                        fo2.write(s + "\n")
                    self.cfg.log.debug(s)
            finally:
                if fo2:
                    fo2.close()