def annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args):
    """Annotate seed-ortholog hits in parallel and write a tab-separated table.

    Items yielded by ``iter_hit_lines(seed_orthologs_file, args)`` are handed
    to ``annotate_hit_line`` through a ``multiprocessing.Pool`` of
    ``args.cpu`` workers.  Every truthy result becomes one row of
    ``annot_file``; falsy results are counted but produce no row.  When
    ``args.report_orthologs`` is set, a ``query<TAB>ortholog,ortholog,...``
    line per annotated query is also written to ``annot_file + ".orthologs"``.

    Args:
        seed_orthologs_file: Path passed through to ``iter_hit_lines``.
        annot_file: Path of the annotation table to create (overwritten).
        hmm_hits_file: Path of an optional HMM hits file.  When it exists, the
            "best eggNOG OG" and description columns of matching queries are
            taken from it instead of ``annota.get_best_og_description``.
        args: Options object.  Attributes read here: ``cpu``,
            ``report_orthologs`` and ``no_file_comments``.

    The output files and the worker pool are released even if annotation
    fails part-way through.

    NOTE(review): ``annotate_hits_file`` is defined more than once in this
    file; at import time the last definition replaces the earlier ones.
    """
    hit_header = [
        "#query_name",
        "seed_eggNOG_ortholog",
        "seed_ortholog_evalue",
        "seed_ortholog_score",
        "best_tax_level",
    ]
    hit_og_header = [
        "taxonomic scope",
        "eggNOG OGs",
        "best eggNOG OG",
        "COG Functional cat.",
        "eggNOG free text desc.",
    ]

    start_time = time.time()

    # OG annotations are only consulted for queries present in seq2best_og,
    # so both mappings default to empty when there is no HMM hits file.
    seq2best_og = {}
    seq2annot_og = {}
    if pexists(hmm_hits_file):
        seq2best_og = get_seq_hmm_matches(hmm_hits_file)
        seq2annot_og = annota.get_ogs_annotations(
            set(hit[0] for hit in seq2best_og.values()))

    sys.stdout.write("%s\n" % colorify(
        "Functional annotation of refined hits starts now", 'green'))

    qn = 0
    with open(annot_file, "w") as out:
        orthologs_out = None
        pool = None
        try:
            if args.report_orthologs:
                orthologs_out = open(annot_file + ".orthologs", "w")

            if not args.no_file_comments:
                out.write('# emapper version: %s emapper DB: %s\n'
                          % (get_version(), get_db_version()))
                out.write('# command: ./emapper.py  %s\n'
                          % ' '.join(sys.argv[1:]))
                out.write('# time: ' + time.ctime() + '\n')
                # The column header starts with '#', so it is emitted
                # together with the other comment lines.
                out.write('\t'.join(
                    hit_header + ANNOTATIONS_HEADER + hit_og_header) + '\n')

            pool = multiprocessing.Pool(args.cpu)
            hit_lines = iter_hit_lines(seed_orthologs_file, args)
            for result in pool.imap(annotate_hit_line, hit_lines):
                qn += 1
                if qn % 500 == 0:
                    total_time = time.time() - start_time
                    sys.stderr.write(
                        "%s %s %0.2f q/s (func. annotation)\n"
                        % (qn, total_time, float(qn) / total_time))
                    sys.stderr.flush()

                if not result:
                    continue

                (query_name, best_hit_name, best_hit_evalue, best_hit_score,
                 annotations, annot_level_max, swallowest_level, match_nogs,
                 orthologs) = result

                if query_name in seq2best_og:
                    # Only the first three fields of the HMM hit record
                    # (name, e-value, score) are reported.
                    hitname, evalue, score = seq2best_og[query_name][:3]
                    best_og = '%s|%s|%s' % (hitname, evalue, score)
                    og_cat, og_desc = seq2annot_og.get(hitname, ['', ''])
                else:
                    best_og = 'NA|NA|NA'
                    og_cat, og_desc = annota.get_best_og_description(
                        match_nogs)

                if orthologs_out is not None:
                    orthologs_out.write(
                        '%s\t%s\n' % (query_name, ','.join(orthologs)))

                row = [
                    query_name,
                    best_hit_name,
                    str(best_hit_evalue),
                    str(best_hit_score),
                    LEVEL_NAMES[swallowest_level],
                ]
                # One column per annotation source, empty when absent.
                row.extend(
                    ','.join(sorted(annotations[field]))
                    if field in annotations else ''
                    for field in ANNOTATIONS_HEADER)
                row.extend([
                    annot_level_max,
                    ','.join(match_nogs),
                    best_og,
                    # Embedded newlines would break the one-row-per-query
                    # layout of the table.
                    og_cat.replace('\n', ''),
                    og_desc.replace('\n', ' '),
                ])
                out.write('\t'.join(row) + '\n')
        finally:
            # Always stop the workers and close the side file, including
            # when a worker or a write raised.
            if pool is not None:
                pool.terminate()
            if orthologs_out is not None:
                orthologs_out.close()

        elapsed_time = time.time() - start_time
        # Guard against a zero interval on coarse clocks.
        rate = float(qn) / elapsed_time if elapsed_time > 0 else 0.0
        rate_text = "%0.2f q/s" % rate

        if not args.no_file_comments:
            out.write('# %d queries scanned\n' % qn)
            out.write('# Total time (seconds): %s\n' % elapsed_time)
            out.write('# Rate: %s\n' % rate_text)

    sys.stdout.write("%s\n" % colorify(
        " Processed queries:%s total_time:%s rate:%s"
        % (qn, elapsed_time, rate_text), 'lblue'))
def annotate_hits_file_sequential(seed_orthologs_file, annot_file,
                                  hmm_hits_file, args):
    """Annotate seed-ortholog hits one by one and write a tab-separated table.

    ``seed_orthologs_file`` is read line by line.  Blank lines and lines
    starting with ``#`` are ignored; every other line is split on tabs and its
    first four fields are used as query name, best hit name, e-value and
    score.  A query is skipped (but still counted as scanned) when:

    * the best hit name is ``-`` or ``ERROR``;
    * the score is below ``args.seed_ortholog_score`` or the e-value is above
      ``args.seed_ortholog_evalue``;
    * ``annota.get_member_ogs`` returns nothing for the hit;
    * ``args.tax_scope`` is ``"auto"`` and none of ``TAXONOMIC_RESOLUTION``
      occurs among the hit's OG levels.

    Args:
        seed_orthologs_file: Path of the tab-separated seed ortholog hits.
        annot_file: Path of the annotation table to create (overwritten).
        hmm_hits_file: Path of an optional HMM hits file.  When it exists, the
            ``bestOG|evalue|score``, ``COG cat`` and ``eggNOG annot`` columns
            are filled from it for matching queries.
        args: Options object.  Attributes read here: ``no_file_comments``,
            ``seed_ortholog_score``, ``seed_ortholog_evalue``, ``tax_scope``,
            ``target_orthologs``, ``excluded_taxa``, ``go_evidence`` and
            ``go_excluded``.

    Raises:
        ValueError: If the e-value or score field of a hit line is not a
            number (raised by ``float``).
        IndexError: If a hit line has fewer than the fields it needs.
    """
    annot_header = (
        "#query_name",
        "seed_eggNOG_ortholog",
        "seed_ortholog_evalue",
        "seed_ortholog_score",
        "predicted_gene_name",
        "GO_terms",
        "KEGG_pathways",
        "Annotation_tax_scope",
        "OGs",
        "bestOG|evalue|score",
        "COG cat",
        "eggNOG annot",
    )

    start_time = time.time()

    # OG annotations are only consulted for queries present in seq2best_og,
    # so both mappings default to empty when there is no HMM hits file.
    seq2best_og = {}
    seq2annot_og = {}
    if pexists(hmm_hits_file):
        seq2best_og = get_seq_hmm_matches(hmm_hits_file)
        seq2annot_og = annota.get_ogs_annotations(
            set(hit[0] for hit in seq2best_og.values()))

    sys.stdout.write("%s\n" % colorify(
        "Functional annotation of refined hits starts now", 'green'))

    qn = 0
    with open(annot_file, "w") as out:
        if not args.no_file_comments:
            out.write('# ' + time.ctime() + '\n')
            out.write('# ' + ' '.join(sys.argv) + '\n')
            # The column header starts with '#', so it is emitted together
            # with the other comment lines.
            out.write('\t'.join(annot_header) + '\n')

        with open(seed_orthologs_file) as hits:
            for line in hits:
                if not line.strip() or line.startswith('#'):
                    continue

                # Count first, then report, so progress and footer show the
                # number of hit lines actually read.
                qn += 1
                if qn % 500 == 0:
                    total_time = time.time() - start_time
                    sys.stderr.write(
                        "%s %s %0.2f q/s (refinement)\n"
                        % (qn, total_time, float(qn) / total_time))
                    sys.stderr.flush()

                fields = [field.strip() for field in line.split('\t')]
                query_name = fields[0]
                best_hit_name = fields[1]
                if best_hit_name in ('-', 'ERROR'):
                    continue

                best_hit_evalue = float(fields[2])
                best_hit_score = float(fields[3])
                if (best_hit_score < args.seed_ortholog_score
                        or best_hit_evalue > args.seed_ortholog_evalue):
                    continue

                match_nogs = annota.get_member_ogs(best_hit_name)
                if not match_nogs:
                    continue
                match_levels = set(nog.split("@")[1] for nog in match_nogs)

                if args.tax_scope == "auto":
                    # Use the first level of TAXONOMIC_RESOLUTION that the
                    # hit belongs to.
                    for level in TAXONOMIC_RESOLUTION:
                        if level in match_levels:
                            scope = level
                            break
                    else:
                        # No usable level: skip the query instead of reusing
                        # the scope computed for a previous one.
                        continue
                else:
                    scope = args.tax_scope

                annot_levels = set(LEVEL_CONTENT.get(scope, [scope]))
                annot_levels.add(scope)
                annot_level_max = "%s[%d]" % (scope, len(annot_levels))

                all_orthologies = annota.get_member_orthologs(
                    best_hit_name, target_levels=annot_levels)
                orthologs = sorted(all_orthologies[args.target_orthologs])
                if args.excluded_taxa:
                    excluded_prefix = "%s." % args.excluded_taxa
                    orthologs = [o for o in orthologs
                                 if not o.startswith(excluded_prefix)]

                best_name = ''
                gos = set()
                keggs = set()
                if orthologs:
                    pname, gos, keggs = annota.get_member_annotations(
                        orthologs,
                        target_go_ev=args.go_evidence,
                        excluded_go_ev=args.go_excluded)
                    if pname:
                        # A name is only reported when at least two
                        # orthologs agree on it.
                        name_candidate, freq = pname.most_common(1)[0]
                        if freq >= 2:
                            best_name = name_candidate

                if query_name in seq2best_og:
                    # Only the first three fields of the HMM hit record
                    # (name, e-value, score) are reported.
                    hitname, evalue, score = seq2best_og[query_name][:3]
                    best_og = '%s|%s|%s' % (hitname, evalue, score)
                    og_cat, og_desc = seq2annot_og.get(hitname, ['', ''])
                else:
                    best_og = 'NA|NA|NA'
                    og_cat, og_desc = '', ''

                row = (
                    query_name,
                    best_hit_name,
                    best_hit_evalue,
                    best_hit_score,
                    best_name,
                    ','.join(sorted(gos)),
                    ','.join(sorted("map%05d" % int(k) for k in keggs)),
                    annot_level_max,
                    ','.join(match_nogs),
                    best_og,
                    # Embedded newlines would break the one-row-per-query
                    # layout of the table.
                    og_cat.replace('\n', ''),
                    og_desc.replace('\n', ' '),
                )
                out.write('\t'.join(map(str, row)) + '\n')
                # Keep the table on disk current during long runs.
                out.flush()

        elapsed_time = time.time() - start_time
        # Guard against a zero interval (e.g. empty input on coarse clocks).
        rate = float(qn) / elapsed_time if elapsed_time > 0 else 0.0
        rate_text = "%0.2f q/s" % rate

        if not args.no_file_comments:
            out.write('# %d queries scanned\n' % qn)
            out.write('# Total time (seconds): %s\n' % elapsed_time)
            out.write('# Rate: %s\n' % rate_text)

    sys.stdout.write("%s\n" % colorify(
        " Processed queries:%s total_time:%s rate:%s"
        % (qn, elapsed_time, rate_text), 'lblue'))
def annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args):
    """Annotate seed-ortholog hits in parallel and write a tab-separated table.

    Items yielded by ``iter_hit_lines(seed_orthologs_file, args)`` are handed
    to ``annotate_hit_line`` through a ``multiprocessing.Pool`` of
    ``args.cpu`` workers.  Every truthy result becomes one row of
    ``annot_file`` (gene name, GO terms, KEGG KOs, BiGG reactions, OGs, ...);
    falsy results are counted but produce no row.  When
    ``args.report_orthologs`` is set, a ``query<TAB>ortholog,ortholog,...``
    line per annotated query is also written to ``annot_file + ".orthologs"``.

    Args:
        seed_orthologs_file: Path passed through to ``iter_hit_lines``.
        annot_file: Path of the annotation table to create (overwritten).
        hmm_hits_file: Path of an optional HMM hits file.  When it exists, the
            ``bestOG|evalue|score`` and description columns of matching
            queries are taken from it instead of
            ``annota.get_best_og_description``.
        args: Options object.  Attributes read here: ``cpu``,
            ``report_orthologs`` and ``no_file_comments``.

    The output files and the worker pool are released even if annotation
    fails part-way through.

    NOTE(review): ``annotate_hits_file`` is defined more than once in this
    file; at import time the last definition replaces the earlier ones.
    """
    annot_header = (
        "#query_name",
        "seed_eggNOG_ortholog",
        "seed_ortholog_evalue",
        "seed_ortholog_score",
        "predicted_gene_name",
        "GO_terms",
        "KEGG_KOs",
        "BiGG_reactions",
        "Annotation_tax_scope",
        "OGs",
        "bestOG|evalue|score",
        "COG cat",
        "eggNOG annot",
    )

    start_time = time.time()

    # OG annotations are only consulted for queries present in seq2best_og,
    # so both mappings default to empty when there is no HMM hits file.
    seq2best_og = {}
    seq2annot_og = {}
    if pexists(hmm_hits_file):
        seq2best_og = get_seq_hmm_matches(hmm_hits_file)
        seq2annot_og = annota.get_ogs_annotations(
            set(hit[0] for hit in seq2best_og.values()))

    sys.stdout.write("%s\n" % colorify(
        "Functional annotation of refined hits starts now", 'green'))

    qn = 0
    with open(annot_file, "w") as out:
        orthologs_out = None
        pool = None
        try:
            if args.report_orthologs:
                orthologs_out = open(annot_file + ".orthologs", "w")

            if not args.no_file_comments:
                out.write('# emapper version: %s emapper DB: %s\n'
                          % (get_version(), get_db_version()))
                out.write('# command: ./emapper.py  %s\n'
                          % ' '.join(sys.argv[1:]))
                out.write('# time: ' + time.ctime() + '\n')
                # The column header starts with '#', so it is emitted
                # together with the other comment lines.
                out.write('\t'.join(annot_header) + '\n')

            pool = multiprocessing.Pool(args.cpu)
            hit_lines = iter_hit_lines(seed_orthologs_file, args)
            for result in pool.imap(annotate_hit_line, hit_lines):
                qn += 1
                if qn % 500 == 0:
                    total_time = time.time() - start_time
                    sys.stderr.write(
                        "%s %s %0.2f q/s (refinement)\n"
                        % (qn, total_time, float(qn) / total_time))
                    sys.stderr.flush()

                if not result:
                    continue

                (query_name, best_hit_name, best_hit_evalue, best_hit_score,
                 best_name, gos, kegg, bigg, annot_level_max, match_nogs,
                 orthologs) = result

                if query_name in seq2best_og:
                    # Only the first three fields of the HMM hit record
                    # (name, e-value, score) are reported.
                    hitname, evalue, score = seq2best_og[query_name][:3]
                    best_og = '%s|%s|%s' % (hitname, evalue, score)
                    og_cat, og_desc = seq2annot_og.get(hitname, ['', ''])
                else:
                    best_og = 'NA|NA|NA'
                    og_cat, og_desc = annota.get_best_og_description(
                        match_nogs)

                if orthologs_out is not None:
                    orthologs_out.write(
                        '%s\t%s\n' % (query_name, ','.join(orthologs)))

                row = (
                    query_name,
                    best_hit_name,
                    best_hit_evalue,
                    best_hit_score,
                    best_name,
                    ','.join(sorted(gos)),
                    ','.join(sorted(kegg)),
                    ','.join(sorted(bigg)),
                    annot_level_max,
                    ','.join(match_nogs),
                    best_og,
                    # Embedded newlines would break the one-row-per-query
                    # layout of the table.
                    og_cat.replace('\n', ''),
                    og_desc.replace('\n', ' '),
                )
                out.write('\t'.join(map(str, row)) + '\n')
                # Keep the table on disk current during long runs.
                out.flush()
        finally:
            # Always stop the workers and close the side file, including
            # when a worker or a write raised.
            if pool is not None:
                pool.terminate()
            if orthologs_out is not None:
                orthologs_out.close()

        elapsed_time = time.time() - start_time
        # Guard against a zero interval on coarse clocks.
        rate = float(qn) / elapsed_time if elapsed_time > 0 else 0.0
        rate_text = "%0.2f q/s" % rate

        if not args.no_file_comments:
            out.write('# %d queries scanned\n' % qn)
            out.write('# Total time (seconds): %s\n' % elapsed_time)
            out.write('# Rate: %s\n' % rate_text)

    sys.stdout.write("%s\n" % colorify(
        " Processed queries:%s total_time:%s rate:%s"
        % (qn, elapsed_time, rate_text), 'lblue'))
def annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args):
    """Annotate seed-ortholog hits one by one and write a tab-separated table.

    ``seed_orthologs_file`` is read line by line.  Blank lines and lines
    starting with ``#`` are ignored; every other line is split on tabs and its
    first four fields are used as query name, best hit name, e-value and
    score.  A query is skipped (but still counted as scanned) when:

    * the best hit name is ``-`` or ``ERROR``;
    * the score is below ``args.seed_ortholog_score`` or the e-value is above
      ``args.seed_ortholog_evalue``;
    * ``annota.get_member_ogs`` returns nothing for the hit;
    * ``args.tax_scope`` is ``"auto"`` and none of ``TAXONOMIC_RESOLUTION``
      occurs among the hit's OG levels.

    Annotations are requested with ``excluded_gos`` fixed to ``IEA`` and
    ``ND``.

    Args:
        seed_orthologs_file: Path of the tab-separated seed ortholog hits.
        annot_file: Path of the annotation table to create (overwritten).
        hmm_hits_file: Path of an optional HMM hits file.  When it exists, the
            ``bestOG|evalue|score``, ``COG cat`` and ``eggNOG annot`` columns
            are filled from it for matching queries.
        args: Options object.  Attributes read here: ``no_file_comments``,
            ``seed_ortholog_score``, ``seed_ortholog_evalue``, ``tax_scope``,
            ``target_orthologs`` and ``excluded_taxa``.

    Raises:
        ValueError: If the e-value or score field of a hit line is not a
            number (raised by ``float``).
        IndexError: If a hit line has fewer than the fields it needs.

    NOTE(review): ``annotate_hits_file`` is defined more than once in this
    file; a later definition replaces an earlier one at import time --
    confirm which variant is meant to be live.
    """
    annot_header = (
        "#query_name",
        "seed_eggNOG_ortholog",
        "seed_ortholog_evalue",
        "seed_ortholog_score",
        "predicted_gene_name",
        "GO_terms",
        "KEGG_pathways",
        "Annotation_tax_scope",
        "OGs",
        "bestOG|evalue|score",
        "COG cat",
        "eggNOG annot",
    )

    start_time = time.time()

    # OG annotations are only consulted for queries present in seq2best_og,
    # so both mappings default to empty when there is no HMM hits file.
    seq2best_og = {}
    seq2annot_og = {}
    if pexists(hmm_hits_file):
        seq2best_og = get_seq_hmm_matches(hmm_hits_file)
        seq2annot_og = annota.get_ogs_annotations(
            set(hit[0] for hit in seq2best_og.values()))

    sys.stdout.write("%s\n" % colorify(
        "Functional annotation of refined hits starts now", 'green'))

    qn = 0
    with open(annot_file, "w") as out:
        if not args.no_file_comments:
            out.write('# ' + time.ctime() + '\n')
            out.write('# ' + ' '.join(sys.argv) + '\n')
            # The column header starts with '#', so it is emitted together
            # with the other comment lines.
            out.write('\t'.join(annot_header) + '\n')

        with open(seed_orthologs_file) as hits:
            for line in hits:
                if not line.strip() or line.startswith('#'):
                    continue

                # Count first, then report, so progress and footer show the
                # number of hit lines actually read.
                qn += 1
                if qn % 500 == 0:
                    total_time = time.time() - start_time
                    sys.stderr.write(
                        "%s %s %0.2f q/s (refinement)\n"
                        % (qn, total_time, float(qn) / total_time))
                    sys.stderr.flush()

                fields = [field.strip() for field in line.split('\t')]
                query_name = fields[0]
                best_hit_name = fields[1]
                if best_hit_name in ('-', 'ERROR'):
                    continue

                best_hit_evalue = float(fields[2])
                best_hit_score = float(fields[3])
                if (best_hit_score < args.seed_ortholog_score
                        or best_hit_evalue > args.seed_ortholog_evalue):
                    continue

                match_nogs = annota.get_member_ogs(best_hit_name)
                if not match_nogs:
                    continue
                match_levels = set(nog.split("@")[1] for nog in match_nogs)

                if args.tax_scope == "auto":
                    # Use the first level of TAXONOMIC_RESOLUTION that the
                    # hit belongs to.
                    for level in TAXONOMIC_RESOLUTION:
                        if level in match_levels:
                            scope = level
                            break
                    else:
                        # No usable level: skip the query instead of reusing
                        # the scope computed for a previous one.
                        continue
                else:
                    scope = args.tax_scope

                annot_levels = set(LEVEL_CONTENT.get(scope, [scope]))
                annot_levels.add(scope)
                annot_level_max = "%s[%d]" % (scope, len(annot_levels))

                all_orthologies = annota.get_member_orthologs(
                    best_hit_name, target_levels=annot_levels)
                orthologs = sorted(all_orthologies[args.target_orthologs])
                if args.excluded_taxa:
                    excluded_prefix = "%s." % args.excluded_taxa
                    orthologs = [o for o in orthologs
                                 if not o.startswith(excluded_prefix)]

                best_name = ''
                gos = set()
                keggs = set()
                if orthologs:
                    pname, gos, keggs = annota.get_member_annotations(
                        orthologs, excluded_gos=set(["IEA", "ND"]))
                    if pname:
                        # A name is only reported when at least two
                        # orthologs agree on it.
                        name_candidate, freq = pname.most_common(1)[0]
                        if freq >= 2:
                            best_name = name_candidate

                if query_name in seq2best_og:
                    # Only the first three fields of the HMM hit record
                    # (name, e-value, score) are reported.
                    hitname, evalue, score = seq2best_og[query_name][:3]
                    best_og = '%s|%s|%s' % (hitname, evalue, score)
                    og_cat, og_desc = seq2annot_og.get(hitname, ['', ''])
                else:
                    best_og = 'NA|NA|NA'
                    og_cat, og_desc = '', ''

                row = (
                    query_name,
                    best_hit_name,
                    best_hit_evalue,
                    best_hit_score,
                    best_name,
                    ','.join(sorted(gos)),
                    ','.join(sorted("map%05d" % int(k) for k in keggs)),
                    annot_level_max,
                    ','.join(match_nogs),
                    best_og,
                    # Embedded newlines would break the one-row-per-query
                    # layout of the table.
                    og_cat.replace('\n', ''),
                    og_desc.replace('\n', ' '),
                )
                out.write('\t'.join(map(str, row)) + '\n')
                # Keep the table on disk current during long runs.
                out.flush()

        elapsed_time = time.time() - start_time
        # Guard against a zero interval (e.g. empty input on coarse clocks).
        rate = float(qn) / elapsed_time if elapsed_time > 0 else 0.0
        rate_text = "%0.2f q/s" % rate

        if not args.no_file_comments:
            out.write('# %d queries scanned\n' % qn)
            out.write('# Total time (seconds): %s\n' % elapsed_time)
            out.write('# Rate: %s\n' % rate_text)

    sys.stdout.write("%s\n" % colorify(
        " Processed queries:%s total_time:%s rate:%s"
        % (qn, elapsed_time, rate_text), 'lblue'))