class LeaveOneTest:
    """Leave-one-out test for mislabeled entries in a reference taxonomy.

    Every reference sequence (and, when ``ranktest`` is enabled, every
    eligible taxonomic rank subtree) is placed by EPA, classified via
    ``TaxClassifyHelper`` and compared with its original taxonomic label.
    Disagreements are collected in ``self.mislabels`` /
    ``self.rank_mislabels`` and written to ``<output>.mis``,
    ``<output>.premis``, ``<output>.misrank`` and ``<output>.stats``.
    """

    # universal rank level -> (name prefix, human-readable rank name)
    _RANK_LEVEL_NAMES = {
        0: ("?__", "Unknown"),
        1: ("k__", "Kingdom"),
        2: ("p__", "Phylum"),
        3: ("c__", "Class"),
        4: ("d__", "Subclass"),
        5: ("o__", "Order"),
        6: ("n__", "Suborder"),
        7: ("f__", "Family"),
        8: ("g__", "Genus"),
        9: ("s__", "Species"),
    }

    def __init__(self, config, args):
        """Load the reference JSON and prepare the classifier state.

        config -- project configuration object (temp-file naming, verbosity,
                  EPA settings); stored as ``self.cfg``
        args   -- parsed command-line arguments; reads ``method``,
                  ``min_lhw``, ``jplace_fname``, ``ranktest``,
                  ``output_dir`` and ``output_name``

        Exits the process with status 1 if the reference JSON cannot be
        parsed (``RefJsonParser`` raises ``ValueError``).
        """
        self.cfg = config
        self.method = args.method
        self.minlw = args.min_lhw
        self.jplace_fname = args.jplace_fname
        self.ranktest = args.ranktest
        self.output_fname = args.output_dir + "/" + args.output_name

        # switch off branch length filter
        self.brlen_pv = 0.

        self.tmp_refaln = config.tmp_fname("%NAME%.refaln")
        self.reftree_lbl_fname = config.tmp_fname("%NAME%_lbl.tre")
        self.reftree_tax_fname = config.tmp_fname("%NAME%_tax.tre")
        self.optmod_fname = self.cfg.tmp_fname("%NAME%.opt")
        self.reftree_fname = self.cfg.tmp_fname("ref_%NAME%.tre")

        try:
            self.refjson = RefJsonParser(config.refjson_fname, ver="1.2")
        except ValueError:
            print("Invalid json file format!")
            # a failed start-up must not report success to the calling shell
            sys.exit(1)
        # validate input json format
        self.refjson.validate()
        self.rate = self.refjson.get_rate()
        self.node_height = self.refjson.get_node_height()
        self.origin_taxonomy = self.refjson.get_origin_taxonomy()
        self.bid_taxonomy_map = self.refjson.get_bid_tanomomy_map()
        self.tax_tree = self.refjson.get_tax_tree()
        self.cfg.compress_patterns = self.refjson.get_pattern_compression()

        reftree_str = self.refjson.get_raxml_readable_tree()
        self.reftree = Tree(reftree_str)
        self.reftree_size = len(self.reftree.get_leaves())

        # IMPORTANT: set EPA heuristic rate based on tree size!
        self.cfg.resolve_auto_settings(self.reftree_size)
        # If we're loading the pre-optimized model, we MUST set the same
        # rate het. mode as in the ref file
        if self.cfg.epa_load_optmod:
            self.cfg.raxml_model = self.refjson.get_ratehet_model()

        self.classify_helper = TaxClassifyHelper(self.cfg, self.bid_taxonomy_map,
                                                 self.brlen_pv, self.rate,
                                                 self.node_height)

        self.TAXONOMY_RANKS_COUNT = 10
        self.mislabels = []
        self.mislabels_cnt = [0] * self.TAXONOMY_RANKS_COUNT
        self.rank_mislabels = []
        self.rank_mislabels_cnt = [0] * self.TAXONOMY_RANKS_COUNT
        self.misrank_conf_map = {}

    def cleanup(self):
        """Remove the temporary reference alignment file, if present."""
        FileUtils.remove_if_exists(self.tmp_refaln)

    def classify_seq(self, placement):
        """Classify one EPA placement record.

        placement -- dict with the placement edges under key ``"p"``

        Returns the result of ``classify_helper.classify_seq`` (callers
        unpack it as ``(ranks, lws)``).  When the record has no edges, an
        error is printed and ``([], [])`` is returned so that callers can
        still unpack the result instead of failing on ``None``.
        """
        edges = placement["p"]
        if len(edges) > 0:
            return self.classify_helper.classify_seq(edges, self.method, self.minlw)
        print("ERROR: no placements! something is definitely wrong!")
        return [], []

    def rank_level_name(self, uni_rank_level):
        """Return ``(prefix, name)`` for a universal rank level (0..9).

        Raises ``KeyError`` for a level outside the table.
        """
        return self._RANK_LEVEL_NAMES[uni_rank_level]

    def guess_rank_level(self, ranks, rank_level):
        """Guess the universal rank level of ``ranks[rank_level]``.

        The name is first matched against known prefixes/suffixes; if none
        matches, the level is derived recursively from the parent rank
        (parent level + 1, with position 0 treated as kingdom).
        """
        rank_name = ranks[rank_level]

        real_level = 0

        # check common prefixes and suffixes
        if rank_name.startswith("k__") or rank_name.lower() in ["bacteria", "archaea", "eukaryota"]:
            real_level = 1
        elif rank_name.startswith("p__"):
            real_level = 2
        elif rank_name.startswith("c__"):
            real_level = 3
        elif rank_name.endswith("dae"):
            real_level = 4
        elif rank_name.startswith("o__") or rank_name.endswith("ales"):
            real_level = 5
        elif rank_name.endswith("neae"):
            real_level = 6
        elif rank_name.startswith("f__") or rank_name.endswith("ceae"):
            real_level = 7
        elif rank_name.startswith("g__"):
            real_level = 8
        elif rank_name.startswith("s__"):
            real_level = 9

        if real_level == 0:
            if rank_level == 0:    # kingdom
                real_level = 1
            else:
                parent_level = self.guess_rank_level(ranks, rank_level - 1)
                real_level = parent_level + 1
                # NOTE(review): nesting reconstructed from whitespace-collapsed
                # source -- the sub-rank skip is applied only to levels
                # inferred from the parent; confirm against the upstream file.
                if len(ranks) < 8 and (real_level in [4, 6]):
                    real_level += 1

        return real_level

    def guess_rank_level_name(self, ranks, rank_level):
        """Return ``(prefix, name)`` for the guessed level of ``ranks[rank_level]``."""
        real_level = self.guess_rank_level(ranks, rank_level)
        return self.rank_level_name(real_level)

    def _record_mislabel(self, name, orig_ranks, ranks, lws, records):
        """Compare two lineages and append a mislabel record on mismatch.

        The first level at which ``ranks`` holds a non-empty rank that
        differs from ``orig_ranks`` is taken as the mislabel level.  Returns
        the new record (also appended to ``records``), or ``None`` when the
        lineages agree over their common length.
        """
        mislabel_lvl = -1
        min_len = min(len(orig_ranks), len(ranks))
        for rank_lvl in range(min_len):
            if ranks[rank_lvl] != Taxonomy.EMPTY_RANK and ranks[rank_lvl] != orig_ranks[rank_lvl]:
                mislabel_lvl = rank_lvl
                break

        if mislabel_lvl < 0:
            return None

        real_lvl = self.guess_rank_level(orig_ranks, mislabel_lvl)
        mis_rec = {
            'name': name,
            'orig_level': mislabel_lvl,
            'real_level': real_lvl,
            'level_name': self.rank_level_name(real_lvl)[1],
            'inv_level': -1 * real_lvl,  # just for sorting
            'orig_ranks': orig_ranks,
            'ranks': ranks,
            'lws': lws,
            'conf': lws[mislabel_lvl],
        }
        records.append(mis_rec)
        return mis_rec

    def check_seq_tax_labels(self, seq_name, orig_ranks, ranks, lws):
        """Check a sequence's original label against its EPA-derived label.

        Returns the mislabel record (also stored in ``self.mislabels``) or
        ``None`` if the labels agree.
        """
        if min(len(orig_ranks), len(ranks)) == 0:
            # nothing to compare; avoids touching the name helper needlessly
            return None
        return self._record_mislabel(EpacConfig.strip_ref_prefix(seq_name),
                                     orig_ranks, ranks, lws, self.mislabels)

    def check_rank_tax_labels(self, rank_name, orig_ranks, ranks, lws):
        """Check a rank subtree's parent lineage against its EPA-derived label.

        Returns the mislabel record (also stored in ``self.rank_mislabels``)
        or ``None`` if the labels agree.
        """
        return self._record_mislabel(rank_name, orig_ranks, ranks, lws,
                                     self.rank_mislabels)

    def mis_rec_to_string_old(self, mis_rec):
        """Format a mislabel record in the older multi-line layout."""
        lvl = mis_rec['orig_level']
        output = mis_rec['name'] + "\t"
        output += "%s\t%s\t%s\t%.3f\n" % (mis_rec['level_name'],
                                          mis_rec['orig_ranks'][lvl],
                                          mis_rec['ranks'][lvl],
                                          mis_rec['lws'][lvl])
        output += ";".join(mis_rec['orig_ranks']) + "\n"
        output += ";".join(mis_rec['ranks']) + "\n"
        output += "\t".join(["%.3f" % conf for conf in mis_rec['lws']]) + "\n"
        return output

    def mis_rec_to_string(self, mis_rec):
        """Format a mislabel record as one tab-separated line (no newline)."""
        lvl = mis_rec['orig_level']
        output = mis_rec['name'] + "\t"
        output += "%s\t%s\t%s\t%.3f\t" % (mis_rec['level_name'],
                                          mis_rec['orig_ranks'][lvl],
                                          mis_rec['ranks'][lvl],
                                          mis_rec['lws'][lvl])
        output += Taxonomy.lineage_str(mis_rec['orig_ranks']) + "\t"
        output += Taxonomy.lineage_str(mis_rec['ranks']) + "\t"
        output += ";".join(["%.3f" % conf for conf in mis_rec['lws']])
        if 'rank_conf' in mis_rec:
            output += "\t%.3f" % mis_rec['rank_conf']
        return output

    def sort_mislabels(self):
        """Sort mislabel records and accumulate per-rank-level counters.

        Records are ordered by ``(inv_level, conf)`` descending.  Counters
        are incremented, not reset, so this is meant to run once per result
        set.
        """
        self.mislabels = sorted(self.mislabels,
                                key=itemgetter('inv_level', 'conf'), reverse=True)
        for mis_rec in self.mislabels:
            self.mislabels_cnt[mis_rec["real_level"]] += 1

        if self.ranktest:
            self.rank_mislabels = sorted(self.rank_mislabels,
                                         key=itemgetter('inv_level', 'conf'),
                                         reverse=True)
            for mis_rec in self.rank_mislabels:
                self.rank_mislabels_cnt[mis_rec["real_level"]] += 1

    def _write_mislabel_table(self, fname, fields, records, title, echo):
        """Write a header plus one line per record to ``fname``.

        When ``echo`` is true the same lines are printed to stdout; the
        title and header are only echoed if there is at least one record.
        """
        header = ";" + "\t".join(fields) + "\n"
        with open(fname, "w") as fo_all:
            fo_all.write(header)
            if echo and len(records) > 0:
                print(title)
                print(header)
            for mis_rec in records:
                output = self.mis_rec_to_string(mis_rec) + "\n"
                fo_all.write(output)
                if echo:
                    print(output)

    def write_mislabels(self, final=True):
        """Write mislabel reports.

        final -- if false, only the preliminary ``.premis`` table is written
                 (nothing is echoed); otherwise ``.mis``, optionally
                 ``.misrank`` (with ``ranktest``) and the cumulative
                 ``.stats`` summary are written.
        """
        if final:
            out_fname = "%s.mis" % self.output_fname
        else:
            out_fname = "%s.premis" % self.output_fname

        fields = ["SeqID", "MislabeledLevel", "OriginalLabel", "ProposedLabel",
                  "Confidence", "OriginalTaxonomyPath", "ProposedTaxonomyPath",
                  "PerRankConfidence"]
        if self.ranktest:
            fields += ["HigherRankMisplacedConfidence"]
        self._write_mislabel_table(out_fname, fields, self.mislabels,
                                   "Mislabeled sequences:\n",
                                   self.cfg.verbose and final)

        if not final:
            return

        if self.ranktest:
            rank_fields = ["RankID", "MislabeledLevel", "OriginalLabel",
                           "ProposedLabel", "Confidence", "OriginalTaxonomyPath",
                           "ProposedTaxonomyPath", "PerRankConfidence"]
            self._write_mislabel_table("%s.misrank" % self.output_fname,
                                       rank_fields, self.rank_mislabels,
                                       "\nMislabeled higher ranks:\n",
                                       self.cfg.verbose)

        print("Mislabels counts by ranks:")

        with open("%s.stats" % self.output_fname, "w") as fo_stat:
            seq_sum = 0
            rank_sum = 0
            for i in range(1, self.TAXONOMY_RANKS_COUNT):
                rname = self.rank_level_name(i)[1].ljust(10)
                # sub-ranks (levels 4 and 6) are listed only when non-empty.
                # NOTE(review): nesting reconstructed from collapsed source.
                if self.mislabels_cnt[i] > 0 or i not in [4, 6]:
                    seq_sum += self.mislabels_cnt[i]
                    output = "%s:\t%d" % (rname, seq_sum)
                    if self.ranktest:
                        rank_sum += self.rank_mislabels_cnt[i]
                        output += "\t%d" % rank_sum
                    fo_stat.write(output + "\n")
                    print(output)

    def get_orig_ranks(self, seq_name):
        """Return the original rank list of a sequence from the taxonomic tree.

        The ranks are taken from the name of the leaf's parent node.  Exits
        the process with status 1 unless exactly one leaf matches
        ``seq_name``.
        """
        nodes = self.tax_tree.get_leaves_by_name(seq_name)
        if len(nodes) != 1:
            print("FATAL ERROR: Sequence %s is not found in the taxonomic tree, or is present more than once!" % seq_name)
            sys.exit(1)
        seq_node = nodes[0]
        orig_ranks = Taxonomy.split_rank_uid(seq_node.up.name)
        return orig_ranks

    def run_leave_subtree_out_test(self):
        """Run the leave-one-rank-out test.

        Collects eligible internal nodes of the taxonomic tree, writes their
        tip lists to a temp file, runs EPA in ``l1o_subtree`` mode and
        records every rank whose placement disagrees with its parent
        lineage.  Returns the number of placements processed (0 if no
        subtree qualifies).
        """
        job_name = self.cfg.subst_name("l1out_rank_%NAME%")

        # create file with subtrees
        rank_tips = {}
        rank_parent = {}
        for node in self.tax_tree.traverse("postorder"):
            if node.is_leaf() or node.is_root():
                continue
            tax_path = node.name
            ranks = Taxonomy.split_rank_uid(tax_path)
            rank_lvl = Taxonomy.lowest_assigned_rank_level(ranks)
            if rank_lvl < 2:
                continue

            parent_ranks = Taxonomy.split_rank_uid(node.up.name)
            parent_lvl = Taxonomy.lowest_assigned_rank_level(parent_ranks)
            if parent_lvl < 1:
                continue

            rank_seqs = node.get_leaf_names()
            rank_size = len(rank_seqs)
            if rank_size < 2 or rank_size > self.reftree_size - 4:
                continue

            rank_tips[tax_path] = rank_seqs
            rank_parent[tax_path] = parent_ranks

        # materialise once: the same ordering is used for the file written
        # below and for positional lookup of the results (dict views are
        # not indexable on Python 3)
        subtree_list = list(rank_tips.items())

        if len(subtree_list) == 0:
            return 0

        subtree_list_file = self.cfg.tmp_fname("treelist_%NAME%.txt")
        with open(subtree_list_file, "w") as fout:
            for rank_name, tips in subtree_list:
                fout.write("%s\n" % " ".join(tips))

        jp_list = self.raxml.run_epa(job_name, self.refalign_fname,
                                     self.reftree_fname, self.optmod_fname,
                                     mode="l1o_subtree",
                                     subtree_fname=subtree_list_file)

        subtree_count = 0
        for jp in jp_list:
            placements = jp.get_placement()
            for place in placements:
                ranks, lws = self.classify_seq(place)
                tax_path = subtree_list[subtree_count][0]
                orig_ranks = Taxonomy.split_rank_uid(tax_path)
                rank_level = Taxonomy.lowest_assigned_rank_level(orig_ranks)
                rank_prefix = self.guess_rank_level_name(orig_ranks, rank_level)[0]
                rank_name = orig_ranks[rank_level]
                if not rank_name.startswith(rank_prefix):
                    rank_name = rank_prefix + rank_name
                parent_ranks = rank_parent[tax_path]
                mis_rec = self.check_rank_tax_labels(rank_name, parent_ranks, ranks, lws)
                if mis_rec:
                    self.misrank_conf_map[tax_path] = mis_rec['conf']
                # NOTE(review): counter advances once per placement
                # (nesting reconstructed from collapsed source).
                subtree_count += 1

        return subtree_count

    def run_leave_seq_out_test(self):
        """Run the leave-one-sequence-out test.

        Placements come from ``self.jplace_fname`` if set, otherwise from an
        EPA run in ``l1o_seq`` mode.  With ``ranktest`` enabled, each
        mislabel record additionally gets ``rank_conf``: the highest
        confidence among mislabeled higher ranks on its original lineage.
        Returns the number of placements processed.
        """
        job_name = self.cfg.subst_name("l1out_seq_%NAME%")
        if self.jplace_fname:
            jp = EpaJsonParser(self.jplace_fname)
        else:
            jp = self.raxml.run_epa(job_name, self.refalign_fname,
                                    self.reftree_fname, self.optmod_fname,
                                    mode="l1o_seq")

        placements = jp.get_placement()
        seq_count = 0
        for place in placements:
            seq_name = place["n"][0]

            # get original taxonomic label
            orig_ranks = self.get_orig_ranks(seq_name)

            # get EPA tax label
            ranks, lws = self.classify_seq(place)

            # check if they match
            mis_rec = self.check_seq_tax_labels(seq_name, orig_ranks, ranks, lws)

            # cross-check with higher rank mislabels
            if self.ranktest and mis_rec:
                rank_conf = 0
                for lvl in range(2, len(orig_ranks)):
                    tax_path = Taxonomy.get_rank_uid(orig_ranks, lvl)
                    if tax_path in self.misrank_conf_map:
                        rank_conf = max(rank_conf, self.misrank_conf_map[tax_path])
                mis_rec['rank_conf'] = rank_conf

            seq_count += 1

        return seq_count

    def run_final_epa_test(self):
        """Re-test suspicious sequences against a pruned reference.

        All currently recorded mislabels are deleted from copies of the
        reference and taxonomic trees, ``self.mislabels`` is reset, and one
        EPA run on the pruned tree repopulates it.
        """
        self.reftree_outgroup = self.refjson.get_outgroup()
        tmp_reftree = self.reftree.copy()
        tmp_taxtree = self.tax_tree.copy()

        for mis_rec in self.mislabels:
            rname = EpacConfig.REF_SEQ_PREFIX + mis_rec['name']

            leaf_nodes = tmp_reftree.get_leaves_by_name(rname)
            if len(leaf_nodes) > 0:
                leaf_nodes[0].delete()
            else:
                print("Node not found in the reference tree: %s" % rname)

            leaf_nodes = tmp_taxtree.get_leaves_by_name(rname)
            if len(leaf_nodes) > 0:
                leaf_nodes[0].delete()
            else:
                print("Node not found in the taxonomic tree: %s" % rname)

        # remove unifurcation at the root
        if len(tmp_reftree.children) == 1:
            tmp_reftree = tmp_reftree.children[0]

        self.mislabels = []

        th = TaxTreeHelper(self.origin_taxonomy, self.cfg)
        th.set_mf_rooted_tree(tmp_taxtree)

        self.run_epa_once(tmp_reftree, th)

    def run_epa_once(self, reftree, th):
        """Place sequences on ``reftree`` with EPA and record mislabels.

        reftree -- tree object to write out and use as the EPA reference
        th      -- ``TaxTreeHelper`` used to rebuild the branch-id ->
                   taxonomy map for the tree labelled by EPA
        """
        reftree_fname = self.cfg.tmp_fname("final_ref_%NAME%.tre")
        job_name = self.cfg.subst_name("final_epa_%NAME%")

        reftree.write(outfile=reftree_fname)

        # IMPORTANT: don't load the model, since it's invalid for the pruned tree !!!
        optmod_fname = ""
        epa_result = self.raxml.run_epa(job_name, self.refalign_fname,
                                        reftree_fname, optmod_fname)

        reftree_epalbl_str = epa_result.get_std_newick_tree()
        placements = epa_result.get_placement()

        # update branchid-taxonomy mapping to account for possible changes
        # in branch numbering
        reftree_tax = Tree(reftree_epalbl_str)
        th.set_bf_unrooted_tree(reftree_tax)
        bid_tax_map = th.get_bid_taxonomy_map()

        cl = TaxClassifyHelper(self.cfg, bid_tax_map, self.brlen_pv,
                               self.rate, self.node_height)

        for place in placements:
            seq_name = place["n"][0]

            # get original taxonomic label
            orig_ranks = self.get_orig_ranks(seq_name)

            # get EPA tax label
            ranks, lws = cl.classify_seq(place["p"])

            # check if they match (result is stored in self.mislabels)
            self.check_seq_tax_labels(seq_name, orig_ranks, ranks, lws)

        if not self.cfg.debug:
            self.raxml.cleanup(job_name)
            FileUtils.remove_if_exists(reftree_fname)

    def run_test(self):
        """Run the complete mislabel detection pipeline and write the reports."""
        self.raxml = RaxmlWrapper(self.cfg)

        print("Number of sequences in the reference: %d\n" % self.reftree_size)

        self.refjson.get_raxml_readable_tree(self.reftree_fname)
        self.refalign_fname = self.refjson.get_alignment(self.tmp_refaln)
        self.refjson.get_binary_model(self.optmod_fname)

        if self.ranktest:
            print("Running the leave-one-rank-out test...\n")
            self.run_leave_subtree_out_test()

        print("Running the leave-one-sequence-out test...\n")
        self.run_leave_seq_out_test()

        if len(self.mislabels) > 0:
            print("Leave-one-out test identified %d suspicious sequences; running final EPA test to check them...\n" % len(self.mislabels))
            self.write_mislabels(final=False)
            self.run_final_epa_test()

        self.sort_mislabels()
        self.write_mislabels()
        print("\nPercentage of mislabeled sequences: %.2f %%" % (float(len(self.mislabels)) / self.reftree_size * 100))

        if not self.cfg.debug:
            FileUtils.remove_if_exists(self.reftree_fname)
            FileUtils.remove_if_exists(self.optmod_fname)
            FileUtils.remove_if_exists(self.refalign_fname)
class EpaClassifier:
    """Assign taxonomy to query sequences by EPA placement on a reference tree.

    The reference (tree, alignment, taxonomy map, optional HMM profile and
    model) is read from the reference JSON named by
    ``config.refjson_fname``.
    """

    def __init__(self, config, args):
        """Load the reference JSON and set up temp file names.

        config -- project configuration object; stored as ``self.cfg``
        args   -- parsed command-line arguments; reads ``jplace_fname``,
                  ``ignore_refalign`` and ``p_value``

        Exits the process with status 1 if the reference JSON cannot be
        parsed (``RefJsonParser`` raises ``ValueError``).
        """
        self.cfg = config
        self.jplace_fname = args.jplace_fname
        self.ignore_refalign = args.ignore_refalign

        self.tmp_refaln = config.tmp_fname("%NAME%.refaln")
        # here is the final alignment file for running EPA
        self.epa_alignment = config.tmp_fname("%NAME%.afa")
        self.hmmprofile = config.tmp_fname("%NAME%.hmmprofile")
        self.tmpquery = config.tmp_fname("%NAME%.tmpquery")
        self.noalign = config.tmp_fname("%NAME%.noalign")
        self.seqs = None

        try:
            self.refjson = RefJsonParser(config.refjson_fname)
        except ValueError:
            print("Invalid json file format!")
            # a failed start-up must not report success to the calling shell
            sys.exit(1)
        # validate input json format
        self.refjson.validate()
        self.bid_taxonomy_map = self.refjson.get_bid_tanomomy_map()
        self.reftree = self.refjson.get_reftree()
        self.rate = self.refjson.get_rate()
        self.node_height = self.refjson.get_node_height()
        self.cfg.compress_patterns = self.refjson.get_pattern_compression()

        self.classify_helper = TaxClassifyHelper(self.cfg, self.bid_taxonomy_map,
                                                 args.p_value, self.rate,
                                                 self.node_height)

    def cleanup(self):
        """Remove all temporary files owned by this instance, if present."""
        for fname in (self.tmp_refaln, self.epa_alignment, self.hmmprofile,
                      self.tmpquery, self.noalign):
            FileUtils.remove_if_exists(fname)

    def align_to_refenence(self, noalign, minp = 0.9):
        """Align the query sequences to the reference alignment with HMMER.

        noalign -- file name passed to ``hmmer`` as ``discard``
        minp    -- passed through to ``hmmer`` as ``minp``

        The resulting alignment file name is stored in
        ``self.epa_alignment``.
        """
        refaln = self.refjson.get_alignment(fout = self.tmp_refaln)
        fprofile = self.refjson.get_hmm_profile(self.hmmprofile)

        # if there is no hmmer profile in json file, build it from scratch
        if not fprofile:
            hmm = hmmer(self.cfg, refaln)
            fprofile = hmm.build_hmm_profile()

        hm = hmmer(config = self.cfg, refalign = refaln, query = self.tmpquery,
                   refprofile = fprofile, discard = noalign, seqs = self.seqs,
                   minp = minp)
        self.epa_alignment = hm.align()

    def merge_alignment(self, query_seqs):
        """Write reference and query sequences into one FASTA file.

        The output goes to ``self.epa_alignment``; reference entries come
        first, followed by the entries of ``query_seqs``.
        """
        refaln = self.refjson.get_alignment_list()
        with open(self.epa_alignment, "w") as fout:
            for seq in refaln:
                fout.write(">" + seq[0] + "\n" + seq[1] + "\n")
            for name, seq, comment, sid in query_seqs.iter_entries():
                fout.write(">" + name + "\n" + seq + "\n")

    def checkinput(self, query_fname, minp = 0.9):
        """Load the query file and prepare the alignment used for EPA.

        The input format is guessed by trying each supported format in
        turn.  Depending on the data, the query is then either written out
        as-is (``ignore_refalign``), merged with the reference alignment
        directly or via MUSCLE, or aligned with HMMER.

        Exits the process with status 1 if no format could be parsed.
        """
        formats = ["fasta", "phylip", "iphylip", "phylip_relaxed", "iphylip_relaxed"]
        for fmt in formats:
            try:
                self.seqs = SeqGroup(sequences=query_fname, format = fmt)
                break
            except Exception:
                # parsing failed for this format; try the next one
                print("Guessing input format: not " + fmt)
        if self.seqs is None:
            print("Invalid input file format!")
            print("The supported input formats are fasta and phylip")
            sys.exit(1)

        if self.ignore_refalign:
            print("Assuming query file contains reference sequences, skipping the alignment step...")
            # look the reference names up once instead of once per sequence
            ref_names = set(self.refjson.get_sequences_names())
            with open(self.epa_alignment, "w") as fout:
                for name, seq, comment, sid in self.seqs.iter_entries():
                    ref_name = EpacConfig.REF_SEQ_PREFIX + name
                    if ref_name in ref_names:
                        seq_name = ref_name
                    else:
                        seq_name = EpacConfig.QUERY_SEQ_PREFIX + name
                    fout.write(">" + seq_name + "\n" + seq + "\n")
            return

        # add query seq name prefix to avoid confusion between reference
        # and query sequences
        self.seqs.add_name_prefix(EpacConfig.QUERY_SEQ_PREFIX)

        self.seqs.write(format="fasta", outfile=self.tmpquery)
        print("Checking if query sequences are aligned ...")
        entries = self.seqs.get_entries()
        seql = len(entries[0][1])
        # "aligned" here means: all sequences have the same length
        aligned = all(len(entry[1]) == seql for entry in entries[1:])

        if aligned and len(self.seqs) > 1:
            print("Query sequences are aligned")
            refalnl = self.refjson.get_alignment_length()
            if refalnl == seql:
                print("Merging query alignment with reference alignment")
                self.merge_alignment(self.seqs)
            else:
                print("Merging query alignment with reference alignment using MUSCLE")
                require_muscle()
                refaln = self.refjson.get_alignment(fout = self.tmp_refaln)
                m = muscle(self.cfg)
                self.epa_alignment = m.merge(refaln, self.tmpquery)
        else:
            print("Query sequences are not aligned")
            print("Align query sequences to the reference alignment using HMMER")
            require_hmmer()
            self.align_to_refenence(self.noalign, minp = minp)

        print("Running EPA ......")
        print("")

    def print_ranks(self, rks, confs, minlw = 0.0):
        """Format ranks and confidences as ``"r1;r2\\tc1;c2"``.

        Ranks are emitted in order until the first one whose confidence is
        below ``minlw``.  A confidence equal to the first one is reported as
        1.0 when that first confidence is at least 0.99.  Returns ``None``
        if no rank passes the threshold.
        """
        kept_ranks = []
        kept_confs = []
        for rank, conf in zip(rks, confs):
            if conf == confs[0] and confs[0] >= 0.99:
                conf = 1.0
            if conf < minlw:
                break
            kept_ranks.append(rank)
            kept_confs.append("{0:.3f}".format(conf))

        if not kept_ranks or "".join(kept_ranks) == "" and len(kept_ranks) == 0:
            return None
        return ";".join(kept_ranks) + "\t" + ";".join(kept_confs)

    def classify(self, query_fname, fout = None, method = "1", minlw = 0.0, pv = 0.02, minp = 0.9, ptp = False):
        """Classify the query sequences and write the assignments.

        query_fname -- query sequence file (ignored when a jplace file was
                       given on the command line)
        fout        -- output file name; ``None`` to skip file output
        method      -- passed to ``TaxClassifyHelper.classify_seq``
        minlw       -- minimum confidence for a rank to be reported
        pv          -- currently unused
        minp        -- passed to ``checkinput``
        ptp         -- also run EPA-PTP species delimitation and write
                       ``<fout>.species``

        Each output line is ``name<TAB>ranks<TAB>confidences<TAB>flag``
        where the flag is ``*`` if ``novelty_check`` returns true and ``o``
        otherwise; unclassified sequences get ``name<TAB><TAB><TAB>?``.
        """
        if self.jplace_fname:
            jp = EpaJsonParser(self.jplace_fname)
        else:
            self.checkinput(query_fname, minp)

            # use the instance configuration rather than relying on a
            # module-level ``config`` global being defined
            raxml = RaxmlWrapper(self.cfg)
            reftree_fname = self.cfg.tmp_fname("ref_%NAME%.tre")
            self.refjson.get_raxml_readable_tree(reftree_fname)
            optmod_fname = self.cfg.tmp_fname("%NAME%.opt")
            self.refjson.get_binary_model(optmod_fname)
            job_name = self.cfg.subst_name("epa_%NAME%")

            reftree_str = self.refjson.get_raxml_readable_tree()
            reftree = Tree(reftree_str)

            self.reftree_size = len(reftree.get_leaves())

            # IMPORTANT: set EPA heuristic rate based on tree size!
            self.cfg.resolve_auto_settings(self.reftree_size)
            # If we're loading the pre-optimized model, we MUST set the same
            # rate het. mode as in the ref file
            if self.cfg.epa_load_optmod:
                self.cfg.raxml_model = self.refjson.get_ratehet_model()

            reduced_align_fname = raxml.reduce_alignment(self.epa_alignment)

            jp = raxml.run_epa(job_name, reduced_align_fname, reftree_fname, optmod_fname)

        placements = jp.get_placement()

        fo = open(fout, "w") if fout else None
        try:
            # lines for unclassified sequences are collected and written last
            output2 = ""
            for place in placements:
                taxon_name = place["n"][0]
                origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
                edges = place["p"]
                if len(edges) == 0:
                    output2 = output2 + origin_taxon_name + "\t\t\t?\n"
                    continue

                ranks, lws = self.classify_helper.classify_seq(edges, method, minlw)
                isnovo = self.novelty_check(place_edge = str(edges[0][0]),
                                            ranks = ranks, lws = lws, minlw = minlw)
                rankout = self.print_ranks(ranks, lws, minlw)

                if rankout is None:
                    output2 = output2 + origin_taxon_name + "\t\t\t?\n"
                    continue

                output = "%s\t%s\t" % (origin_taxon_name, rankout)
                if isnovo:
                    output += "*"
                else:
                    output += "o"
                if self.cfg.verbose:
                    print(output)
                if fo:
                    fo.write(output + "\n")

            if os.path.exists(self.noalign):
                with open(self.noalign) as fnoa:
                    for line in fnoa:
                        # drop the first character of the stripped line
                        # (presumably a FASTA ">" marker -- TODO confirm)
                        taxon_name = line.strip()[1:]
                        origin_taxon_name = EpacConfig.strip_query_prefix(taxon_name)
                        output = "%s\t\t\t?" % origin_taxon_name
                        if self.cfg.verbose:
                            print(output)
                        if fo:
                            fo.write(output + "\n")

            if self.cfg.verbose:
                print(output2)

            if fo:
                fo.write(output2)
        finally:
            if fo:
                fo.close()

        #############################################
        #
        # EPA-PTP species delimitation
        #
        #############################################
        if ptp:
            full_aln = SeqGroup(self.epa_alignment)
            species_list = epa_2_ptp(epa_jp = jp, ref_jp = self.refjson,
                                     full_alignment = full_aln, min_lw = 0.5,
                                     debug = self.cfg.debug)

            if self.cfg.verbose:
                print("Species clusters:")

            fo2 = open(fout + ".species", "w") if fout else None
            try:
                for sp_cluster in species_list:
                    translated_taxa = [EpacConfig.strip_query_prefix(taxon)
                                       for taxon in sp_cluster]
                    s = ",".join(translated_taxa)
                    if fo2:
                        fo2.write(s + "\n")
                    if self.cfg.verbose:
                        print(s)
            finally:
                if fo2:
                    fo2.close()
        #############################################

        # temp files only exist when EPA was actually run in this call
        if not self.jplace_fname:
            if not self.cfg.debug:
                raxml.cleanup(job_name)
                FileUtils.remove_if_exists(reduced_align_fname)
                FileUtils.remove_if_exists(reftree_fname)
                FileUtils.remove_if_exists(optmod_fname)

    def novelty_check(self, place_edge, ranks, lws, minlw):
        """Decide whether an incomplete assignment hints at a novel taxon.

        If the taxonomic assignment does not reach the genus level, we need
        to check whether this is due to an incomplete reference taxonomy or
        whether the query is likely to be something new:

        1. If the final ranks were cut off by the lw threshold, then with a
           smaller lw the ranks could be assigned further down.  The
           undetermined ranks are therefore not due to an incomplete
           reference taxonomy, so the query sequence is likely to be
           something new.

        2. Otherwise, check the immediate lower rank of all leaf nodes
           below the ML placement point; if none of them is empty, report
           possible novelty.

        Returns ``True`` if the assignment could indicate novelty.
        """
        lowrank = 0
        # only the first six ranks are inspected (above genus level)
        for i in range(min(len(ranks), 6)):
            rk = ranks[i]
            lw = lws[i]
            if rk == "-":
                break
            lowrank = lowrank + 1
            if lw >= 0 and lw < minlw:
                return True

        if lowrank >= 5 and not ranks[lowrank] == "-":
            return False

        placenode = self.reftree.search_nodes(B = place_edge)[0]
        if placenode.is_leaf():
            return False

        # novelty only if every leaf below the placement has this rank set
        for leaf in placenode.get_leaves():
            branks = self.bid_taxonomy_map[leaf.B]
            if branks[lowrank] == "-":
                return False
        return True