import errno
import os
import shutil
import sys
import time

# The functions below also rely on eggNOG-mapper helpers that are not defined in
# this excerpt: annota, orthology, pexists, colorify, get_call_info, get_version,
# get_citation, cleanup_og_name, silent_rm, emapperException, shutdown_server,
# the dump_*/refine_matches/setup_hmm_search/annotate_hits_file routines, and the
# EGGNOG_DATABASES / TAXONOMIC_RESOLUTION / LEVEL_* constants.


def find_orthologs_per_hit(arguments):
    annota.connect()
    line, args = arguments
    if not line.strip() or line.startswith('#'):
        return None

    r = map(str.strip, line.split('\t'))
    query_name = r[0]
    best_hit_name = r[1]
    if best_hit_name == '-' or best_hit_name == 'ERROR':
        return None

    best_hit_evalue = float(r[2])
    best_hit_score = float(r[3])
    # do we need this?
    #if best_hit_score < args.seed_ortholog_score or best_hit_evalue > args.seed_ortholog_evalue:
    #    return None

    all_orthologies = annota.get_member_orthologs(best_hit_name)
    orthologs = sorted(all_orthologies[args.orthology_type])
    taxid = query_name.split(".")[0]

    # target species and taxid to be added
    return (query_name, [], taxid, orthologs)
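
# find_orthologs_per_hit() takes a single (line, args) tuple so that it can be
# mapped over the rows of a seed-orthologs table in parallel. The driver below is
# only an illustrative sketch: its name, the `cpu` argument and the output row
# format are assumptions, not part of the original pipeline.
def _sketch_parallel_find_orthologs(seed_orthologs_file, out_file, args, cpu=2):
    import multiprocessing
    pool = multiprocessing.Pool(cpu)
    OUT = open(out_file, "w")
    try:
        # imap streams (line, args) tuples to the workers and yields results in order
        for result in pool.imap(find_orthologs_per_hit,
                                ((line, args) for line in open(seed_orthologs_file))):
            if result is None:
                continue
            query_name, _, taxid, orthologs = result
            print >>OUT, '\t'.join([query_name, taxid, ','.join(orthologs)])
    finally:
        OUT.close()
        pool.close()
        pool.join()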
def main(args):
    fasta_file = args.input
    fields = fasta_file.split("/")
    file_id = fields[len(fields) - 1].split(".fa")[0]
    orthologs_file = "%s.orthologs" % file_id
    print orthologs_file

    hmm_hits_file = "tmp.emapper.hmm_hits"
    seed_orthologs_file = "tmp.emapper.seed_orthologs"

    if args.target_species:
        target_species = args.target_species

    # Sequence search with hmmer
    host, port, dbpath, scantype, idmap = setup_hmm_search(args)

    # Start HMM SCANNING sequences (if requested)
    if not pexists(hmm_hits_file) or args.override:
        dump_hmm_matches(args.input, hmm_hits_file, dbpath, port, scantype, idmap, args)

    if not args.no_refine and (not pexists(seed_orthologs_file) or args.override):
        if args.db == 'viruses':
            print 'Skipping seed ortholog detection in "viruses" database'
        elif args.db in EGGNOG_DATABASES:
            refine_matches(args.input, seed_orthologs_file, hmm_hits_file, args)
        else:
            print 'refined hits not available for custom hmm databases.'

    # Orthologs search
    annota.connect()
    find_orthologs(seed_orthologs_file, orthologs_file, hmm_hits_file, args)

    os.system("rm %s %s" % (hmm_hits_file, seed_orthologs_file))
    print "done"
def annotate_hit_line(arguments):
    annota.connect()
    line, args = arguments
    if not line.strip() or line.startswith('#'):
        return None

    r = map(str.strip, line.split('\t'))
    query_name = r[0]
    best_hit_name = r[1]
    if best_hit_name == '-' or best_hit_name == 'ERROR':
        return None

    best_hit_evalue = float(r[2])
    best_hit_score = float(r[3])
    if best_hit_score < args.seed_ortholog_score or best_hit_evalue > args.seed_ortholog_evalue:
        return None

    match_nogs = annota.get_member_ogs(best_hit_name)
    if not match_nogs:
        return None

    match_levels = set([nog.split("@")[1] for nog in match_nogs])
    if args.tax_scope == "auto":
        for level in TAXONOMIC_RESOLUTION:
            if level in match_levels:
                annot_levels = set(LEVEL_CONTENT.get(level, [level]))
                annot_levels.add(level)
                annot_level_max = "%s[%d]" % (level, len(annot_levels))
                break
    else:
        annot_levels = set(LEVEL_CONTENT.get(args.tax_scope, [args.tax_scope]))
        annot_levels.add(args.tax_scope)
        annot_level_max = "%s[%d]" % (args.tax_scope, len(annot_levels))

    all_orthologies = annota.get_member_orthologs(best_hit_name, target_levels=annot_levels)
    orthologs = sorted(all_orthologies[args.target_orthologs])
    if args.excluded_taxa:
        orthologs = [o for o in orthologs if not o.startswith("%s." % args.excluded_taxa)]

    if orthologs:
        pname, gos, kegg, bigg = annota.summarize_annotations(
            orthologs,
            target_go_ev=args.go_evidence,
            excluded_go_ev=args.go_excluded)
        best_name = ''
        if pname:
            name_candidate, freq = pname.most_common(1)[0]
            if freq >= 2:
                best_name = name_candidate
    else:
        pname = []
        best_name = ''
        gos = set()
        kegg = set()
        bigg = set()

    return (query_name, best_hit_name, best_hit_evalue, best_hit_score,
            best_name, gos, kegg, bigg, annot_level_max, match_nogs, orthologs)
def annotate_hmm_matches(hits_file, hits_annot_file, args):
    hits_annot_header = map(str.strip,
                            '''#query_name, hit, level, evalue, sum_score,
                            query_length, hmmfrom, hmmto, seqfrom, seqto,
                            query_coverage, members_in_og, og_description,
                            og_COG_categories'''.split(','))

    annota.connect()
    print colorify("Functional annotation of hits starts now", 'green')
    start_time = time.time()
    if pexists(hits_file):
        OUT = open(hits_annot_file, "w")
        if not args.no_file_comments:
            print >>OUT, get_call_info()
        print >>OUT, '\t'.join(hits_annot_header)
        qn = 0
        t1 = time.time()
        for line in open(hits_file):
            if not line.strip() or line.startswith('#'):
                continue
            qn += 1
            if qn and (qn % 10000 == 0):
                total_time = time.time() - start_time
                print >>sys.stderr, qn, total_time, "%0.2f q/s (refinement)" % \
                    (float(qn) / total_time)
                sys.stderr.flush()

            (query, hit, evalue, sum_score, query_length, hmmfrom, hmmto,
             seqfrom, seqto, q_coverage) = map(str.strip, line.split('\t'))

            if hit not in ['ERROR', '-']:
                hitname = cleanup_og_name(hit)
                level, nm, desc, cats = annota.get_og_annotations(hitname)
                print >>OUT, '\t'.join(map(str, [query, hitname, level, evalue,
                                                 sum_score, query_length,
                                                 hmmfrom, hmmto, seqfrom, seqto,
                                                 q_coverage, nm, desc, cats]))
            else:
                print >>OUT, '\t'.join([query] + [hit] * (len(hits_annot_header) - 1))

        elapsed_time = time.time() - t1
        if not args.no_file_comments:
            print >>OUT, '# %d queries scanned' % (qn)
            print >>OUT, '# Total time (seconds):', elapsed_time
            print >>OUT, '# Rate:', "%0.2f q/s" % (float(qn) / elapsed_time)
        OUT.close()
        print colorify(" Processed queries:%s total_time:%s rate:%s" %
                       (qn, elapsed_time, "%0.2f q/s" % (float(qn) / elapsed_time)), 'lblue')
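
# annotate_hmm_matches() expects every non-comment row of `hits_file` to carry the
# ten tab-separated fields unpacked above. The helper below only illustrates that
# layout with a made-up row (the query and OG names are invented for the example).
def _sketch_parse_hit_row():
    example_row = '\t'.join(['query1', 'ENOG410XXXX', '1e-50', '250.0',
                             '300', '1', '120', '5', '290', '0.95'])
    (query, hit, evalue, sum_score, query_length, hmmfrom, hmmto,
     seqfrom, seqto, q_coverage) = map(str.strip, example_row.split('\t'))
    return query, hit, float(evalue), float(sum_score)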
def get_seq_hmm_matches(hits_file):
    annota.connect()
    print colorify("Reading HMM matches", 'green')
    seq2oginfo = {}
    start_time = time.time()
    hitnames = set()
    if pexists(hits_file):
        for line in open(hits_file):
            if not line.strip() or line.startswith('#'):
                continue

            (query, hit, evalue, sum_score, query_length, hmmfrom, hmmto,
             seqfrom, seqto, q_coverage) = map(str.strip, line.split('\t'))

            if query not in seq2oginfo and hit not in ['ERROR', '-']:
                hitname = cleanup_og_name(hit)
                seq2oginfo[query] = [hitname, evalue, sum_score, query_length,
                                     hmmfrom, hmmto, seqfrom, seqto, q_coverage]
    return seq2oginfo
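
# get_seq_hmm_matches() returns a dict keyed by query name, keeping only the first
# usable hit per query. A minimal usage sketch; the default file name mirrors the
# temporary name used in main() above, but is otherwise an assumption.
def _sketch_best_og_lookup(hits_file="tmp.emapper.hmm_hits"):
    seq2oginfo = get_seq_hmm_matches(hits_file)
    for query, info in seq2oginfo.iteritems():
        hitname, evalue, sum_score = info[:3]
        print query, hitname, evalue, sum_score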
def _annotate_hit_line(arguments):
    annota.connect()
    line, args = arguments

    if not line.strip() or line.startswith('#'):
        return None

    r = map(str.strip, line.split('\t'))
    query_name = r[0]
    best_hit_name = r[1]
    if best_hit_name == '-' or best_hit_name == 'ERROR':
        return None

    best_hit_evalue = float(r[2])
    best_hit_score = float(r[3])
    if best_hit_score < args.seed_ortholog_score or best_hit_evalue > args.seed_ortholog_evalue:
        return None

    match_nogs = annota.get_member_ogs(best_hit_name)
    if not match_nogs:
        return None

    match_levels = set()
    for nog in match_nogs:
        match_levels.update(LEVEL_PARENTS[nog.split("@")[1]])

    swallowest_level = sorted(match_levels & set(LEVEL_DEPTH.keys()),
                              key=lambda x: LEVEL_DEPTH[x], reverse=True)[0]

    annot_levels = set()
    if args.tax_scope == "auto":
        for level in TAXONOMIC_RESOLUTION:
            if level in match_levels:
                annot_levels.add(level)
                annot_level_max = LEVEL_NAMES.get(level, level)
                break
    else:
        annot_levels.add(args.tax_scope)
        annot_level_max = LEVEL_NAMES.get(args.tax_scope, args.tax_scope)

    if args.target_taxa != 'all':
        target_taxa = orthology.normalize_target_taxa(args.target_taxa)
    else:
        target_taxa = None

    try:
        all_orthologies = annota.get_member_orthologs(best_hit_name,
                                                      target_taxa=target_taxa,
                                                      target_levels=annot_levels)
    except Exception:
        orthologs = None
        status = 'Error'
    else:
        orthologs = sorted(all_orthologies[args.target_orthologs])
        if args.excluded_taxa:
            orthologs = [o for o in orthologs
                         if not o.startswith("%s." % args.excluded_taxa)]
        status = 'OK'

    if orthologs:
        annotations = annota.summarize_annotations(orthologs,
                                                   target_go_ev=args.go_evidence,
                                                   excluded_go_ev=args.go_excluded)
    else:
        annotations = {}

    return (query_name, best_hit_name, best_hit_evalue, best_hit_score,
            annotations, annot_level_max, swallowest_level, match_nogs, orthologs)
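
# Like find_orthologs_per_hit(), _annotate_hit_line() is written to be mapped over
# (line, args) tuples. The sketch below only shows how its return tuple would be
# unpacked by a caller; the function name and the printed columns are assumptions,
# not the pipeline's real output format.
def _sketch_consume_annotation(line, args):
    result = _annotate_hit_line((line, args))
    if result is None:
        return
    (query_name, best_hit_name, best_hit_evalue, best_hit_score,
     annotations, annot_level_max, swallowest_level, match_nogs, orthologs) = result
    print '\t'.join(map(str, [query_name, best_hit_name, annot_level_max,
                              len(orthologs or []), ','.join(match_nogs)]))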
def main(args):
    # Output and intermediate files
    hmm_hits_file = "%s.emapper.hmm_hits" % args.output
    seed_orthologs_file = "%s.emapper.seed_orthologs" % args.output
    annot_file = "%s.emapper.annotations" % args.output
    orthologs_file = "%s.emapper.predict_orthologs" % args.output

    if args.no_search:
        output_files = [annot_file]
    elif args.no_annot:
        output_files = [hmm_hits_file, seed_orthologs_file]
    else:
        output_files = [hmm_hits_file, seed_orthologs_file, annot_file]

    # convert to absolute path before changing directory
    if args.annotate_hits_table:
        args.annotate_hits_table = os.path.abspath(args.annotate_hits_table)

    # force user to decide what to do with existing files
    os.chdir(args.output_dir)
    files_present = set([pexists(fname) for fname in output_files])
    if True in files_present and not args.resume and not args.override:
        print "Output files detected in disk. Use --resume or --override to continue"
        raise emapperException()

    if args.override:
        for outf in output_files:
            silent_rm(outf)

    print '# ', get_version()
    print '# ./emapper.py ', ' '.join(sys.argv[1:])

    if args.scratch_dir:
        # If resuming and using --scratch_dir, transfer existing files.
        if args.resume and args.scratch_dir:
            for f in output_files:
                if pexists(f):
                    print " Copying input file %s to scratch dir %s" % (f, args.scratch_dir)
                    shutil.copy(f, args.scratch_dir)
        # Change working dir
        os.chdir(args.scratch_dir)

    # Step 1. Sequence search
    if not args.no_search:
        if args.mode == 'diamond' and not args.no_search:
            dump_diamond_matches(args.input, seed_orthologs_file, args)
        elif args.mode == 'hmmer' and not args.no_search:
            host, port, dbpath, scantype, idmap = setup_hmm_search(args)

            # Start HMM SCANNING sequences (if requested)
            if not pexists(hmm_hits_file) or args.override:
                dump_hmm_matches(args.input, hmm_hits_file, dbpath, port,
                                 scantype, idmap, args)

            if not args.no_refine and (not pexists(seed_orthologs_file) or args.override):
                if args.db == 'viruses':
                    print 'Skipping seed ortholog detection in "viruses" database'
                elif args.db in EGGNOG_DATABASES:
                    refine_matches(args.input, seed_orthologs_file, hmm_hits_file, args)
                else:
                    print 'refined hits not available for custom hmm databases.'

    # Step 2. Annotation
    if not args.no_annot:
        annota.connect()
        if args.annotate_hits_table:
            if not os.path.exists(args.annotate_hits_table):
                raise IOError(errno.ENOENT, os.strerror(errno.ENOENT),
                              args.annotate_hits_table)
            annotate_hits_file(args.annotate_hits_table, annot_file, hmm_hits_file, args)
        elif args.db == 'viruses':
            annotate_hmm_matches(hmm_hits_file, hmm_hits_file + '.annotations', args)
            OUT = open(annot_file, 'w')
            for line in open(hmm_hits_file + '.annotations'):
                if line.startswith('#') or not line.strip():
                    continue
                (query, hitname, level, evalue, sum_score, query_length,
                 hmmfrom, hmmto, seqfrom, seqto, q_coverage, nm, desc,
                 cats) = line.split("\t")

                if hitname != '-' and hitname != 'ERROR':
                    print >>OUT, '\t'.join(map(str, (query, hitname, evalue, sum_score,
                                                     '', '', '', 'viruses',
                                                     hitname + "@viruses",
                                                     "%s|%s|%s" % (hitname, evalue, sum_score),
                                                     cats.replace('\n', ''),
                                                     desc.replace('\n', ' '))))
            OUT.close()
        else:
            annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args)

    if args.predict_ortho:
        orthology.connect()
        dump_orthologs(seed_orthologs_file, orthologs_file, args)

    # If running in scratch, move files to real output dir and clean up
    if args.scratch_dir:
        for fname in output_files:
            if pexists(fname):
                print " Copying result file %s from scratch to %s" % (fname, args.output_dir)
                # copy the file being iterated back to the real output dir
                shutil.copy(fname, args.output_dir)
                print " Cleaning result file %s from scratch dir" % (fname)

    # Finalize and exit
    print colorify('Done', 'green')
    print colorify('Result files:', 'yellow')
    for f in output_files:
        if pexists(f):
            print " %s" % (f)

    print 'Total time: %g secs' % (time.time() - _total_time)

    if args.mode == 'hmmer':
        print get_citation(['hmmer'])
    elif args.mode == 'diamond':
        print get_citation(['diamond'])

    shutdown_server()
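
# For reference, main() derives all of its result files from args.output via the
# format strings at the top of the function; e.g. with args.output == "mysample"
# (an invented prefix) the expected files are:
#   mysample.emapper.hmm_hits
#   mysample.emapper.seed_orthologs
#   mysample.emapper.annotations
#   mysample.emapper.predict_orthologs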