def dump_diamond_matches(fasta_file, seed_orthologs_file, args): cpu = args.cpu score_thr = args.seed_ortholog_score evalue_thr = args.seed_ortholog_evalue excluded_taxa = args.excluded_taxa if args.excluded_taxa else None if args.translate: tool = 'blastx' else: tool = 'blastp' if not DIAMOND: raise ValueError("diamond not found in path") tempdir = mkdtemp(prefix='emappertmp_dmdn_', dir=args.temp_dir) raw_output_file = pjoin(tempdir, uuid.uuid4().hex) if excluded_taxa: cmd = '%s %s -d %s -q %s --more-sensitive --threads %s -e %f -o %s --max-target-seqs 25' %\ (DIAMOND, tool, EGGNOG_DMND_DB, fasta_file, cpu, evalue_thr, raw_output_file) else: cmd = '%s %s -d %s -q %s --more-sensitive --threads %s -e %f -o %s --top 3' %\ (DIAMOND, tool, EGGNOG_DMND_DB, fasta_file, cpu, evalue_thr, raw_output_file) print colorify(' '+cmd, 'yellow') status = subprocess.call(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if status == 0: OUT = open('%s' %seed_orthologs_file, 'w') if not args.no_file_comments: print >>OUT, get_call_info() print >>OUT, '#', cmd visited = set() for line in open(raw_output_file): if not line.strip() or line.startswith('#'): continue fields = map(str.strip, line.split('\t')) query = fields[0] hit = fields[1] evalue = float(fields[10]) score = float(fields[11]) if query in visited: continue if evalue > evalue_thr or score < score_thr: continue if excluded_taxa and hit.startswith("%s." % excluded_taxa): continue visited.add(query) print >>OUT, '\t'.join(map(str, [query, hit, evalue, score])) OUT.close() else: print cmd raise ValueError('Error running diamond') shutil.rmtree(tempdir)
def refine_matches(fasta_file, refine_file, hits_file, args): refine_header = map( str.strip, '''#query_name, best_hit_eggNOG_ortholog, best_hit_evalue, best_hit_score'''.split(',')) print colorify("Hit refinement starts now", 'green') start_time = time.time() og2level = dict([ tuple(map(str.strip, line.split('\t'))) for line in gopen(OGLEVELS_FILE) ]) OUT = open(refine_file, "w") if not args.no_file_comments: print >> OUT, get_call_info() print >> OUT, '\t'.join(refine_header) qn = 0 for qn, r in enumerate( process_nog_hits_file(hits_file, fasta_file, og2level, translate=args.translate, cpu=args.cpu, excluded_taxa=args.excluded_taxa, base_tempdir=args.temp_dir)): if qn and (qn % 25 == 0): total_time = time.time() - start_time print >>sys.stderr, qn + \ 1, total_time, "%0.2f q/s (refinement)" % ( (float(qn + 1) / total_time)) sys.stderr.flush() query_name = r[0] best_hit_name = r[1] if best_hit_name == '-' or best_hit_name == 'ERROR': continue best_hit_evalue = float(r[2]) best_hit_score = float(r[3]) print >> OUT, '\t'.join( map(str, (query_name, best_hit_name, best_hit_evalue, best_hit_score))) #OUT.flush() elapsed_time = time.time() - start_time if not args.no_file_comments: print >> OUT, '# %d queries scanned' % (qn + 1) print >> OUT, '# Total time (seconds):', elapsed_time print >> OUT, '# Rate:', "%0.2f q/s" % ((float(qn + 1) / elapsed_time)) OUT.close() print colorify(" Processed queries:%s total_time:%s rate:%s" %\ (qn+1, elapsed_time, "%0.2f q/s" % ((float(qn+1) / elapsed_time))), 'lblue')
def dump_diamond_matches(fasta_file, seed_orthologs_file, args): cpu = args.cpu score_thr = args.seed_ortholog_score evalue_thr = args.seed_ortholog_evalue excluded_taxa = args.excluded_taxa if args.excluded_taxa else None if not DIAMOND: raise ValueError("diamond not found in path") tempdir = mkdtemp(prefix='emappertmp_dmdn_', dir=TEMPDIR) raw_output_file = pjoin(tempdir, uuid.uuid4().hex) cmd = '%s blastp -d %s -q %s --more-sensitive --threads %s -e %f -o %s --top 3' %\ (DIAMOND, EGGNOG_DMND_DB, fasta_file, cpu, evalue_thr, raw_output_file) print colorify(' '+cmd, 'yellow') status = subprocess.call(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if status == 0: OUT = open('%s' %seed_orthologs_file, 'w') if not args.no_file_comments: print >>OUT, get_call_info() print >>OUT, '#', cmd visited = set() for line in open(raw_output_file): if not line.strip() or line.startswith('#'): continue fields = map(str.strip, line.split('\t')) query = fields[0] hit = fields[1] evalue = float(fields[10]) score = float(fields[11]) if query in visited: continue if evalue > evalue_thr or score < score_thr: continue if excluded_taxa or hit.startswith("%s." % excluded_taxa): continue visited.add(query) print >>OUT, '\t'.join(map(str, [query, hit, evalue, score])) OUT.close() else: print cmd raise ValueError('Error running diamond') shutil.rmtree(tempdir)
def annotate_hmm_matches(hits_file, hits_annot_file, args): hits_annot_header = map( str.strip, '''#query_name, hit, level, evalue, sum_score, query_length, hmmfrom, hmmto, seqfrom, seqto, query_coverage, members_in_og, og_description, og_COG_categories'''. split(',')) annota.connect() print colorify("Functional annotation of hits starts now", 'green') start_time = time.time() if pexists(hits_file): OUT = open(hits_annot_file, "w") if not args.no_file_comments: print >> OUT, get_call_info() print >> OUT, '\t'.join(hits_annot_header) qn = 0 t1 = time.time() for line in open(hits_file): if not line.strip() or line.startswith('#'): continue qn += 1 if qn and (qn % 10000 == 0): total_time = time.time() - start_time print >>sys.stderr, qn, total_time, "%0.2f q/s (refinement)" %\ ((float(qn) / total_time)) sys.stderr.flush() (query, hit, evalue, sum_score, query_length, hmmfrom, hmmto, seqfrom, seqto, q_coverage) = map(str.strip, line.split('\t')) if hit not in ['ERROR', '-']: hitname = cleanup_og_name(hit) level, nm, desc, cats = annota.get_og_annotations(hitname) print >> OUT, '\t'.join( map(str, [ query, hitname, level, evalue, sum_score, query_length, hmmfrom, hmmto, seqfrom, seqto, q_coverage, nm, desc, cats ])) else: print >> OUT, '\t'.join([query] + [hit] * (len(hits_annot_header) - 1)) elapsed_time = time.time() - t1 if not args.no_file_comments: print >> OUT, '# %d queries scanned' % (qn) print >> OUT, '# Total time (seconds):', elapsed_time print >> OUT, '# Rate:', "%0.2f q/s" % ((float(qn) / elapsed_time)) OUT.close() print colorify(" Processed queries:%s total_time:%s rate:%s" %\ (qn, elapsed_time, "%0.2f q/s" % ((float(qn) / elapsed_time))), 'lblue')
def annotate_hmm_matches(hits_file, hits_annot_file, args): hits_annot_header = map(str.strip, '''#query_name, hit, level, evalue, sum_score, query_length, hmmfrom, hmmto, seqfrom, seqto, query_coverage, members_in_og, og_description, og_COG_categories'''.split(',')) annota.connect() print colorify("Functional annotation of hits starts now", 'green') start_time = time.time() if pexists(hits_file): OUT = open(hits_annot_file, "w") if not args.no_file_comments: print >>OUT, get_call_info() print >>OUT, '\t'.join(hits_annot_header) qn = 0 t1 = time.time() for line in open(hits_file): if not line.strip() or line.startswith('#'): continue if qn and (qn % 10000 == 0): total_time = time.time() - start_time print >>sys.stderr, qn+1, total_time, "%0.2f q/s (refinement)" %\ ((float(qn + 1) / total_time)) sys.stderr.flush() qn += 1 (query, hit, evalue, sum_score, query_length, hmmfrom, hmmto, seqfrom, seqto, q_coverage) = map(str.strip, line.split('\t')) if hit not in ['ERROR', '-']: hitname = cleanup_og_name(hit) level, nm, desc, cats = annota.get_og_annotations(hitname) print >>OUT, '\t'.join(map( str, [query, hitname, level, evalue, sum_score, query_length, hmmfrom, hmmto, seqfrom, seqto, q_coverage, nm, desc, cats])) else: print >>OUT, '\t'.join( [query] + [hit] * (len(hits_annot_header) - 1)) elapsed_time = time.time() - t1 if not args.no_file_comments: print >>OUT, '# %d queries scanned' % (qn + 1) print >>OUT, '# Total time (seconds):', elapsed_time print >>OUT, '# Rate:', "%0.2f q/s" % ((float(qn + 1) / elapsed_time)) OUT.close() print colorify(" Processed queries:%s total_time:%s rate:%s" %\ (qn+1, elapsed_time, "%0.2f q/s" % ((float(qn+1) / elapsed_time))), 'lblue')
def refine_matches(fasta_file, refine_file, hits_file, args): refine_header = map(str.strip, '''#query_name, best_hit_eggNOG_ortholog, best_hit_evalue, best_hit_score'''.split(',')) print colorify("Hit refinement starts now", 'green') start_time = time.time() og2level = dict([tuple(map(str.strip, line.split('\t'))) for line in gopen(OGLEVELS_FILE)]) OUT = open(refine_file, "w") if not args.no_file_comments: print >>OUT, get_call_info() print >>OUT, '\t'.join(refine_header) qn = 0 for qn, r in enumerate(process_nog_hits_file(hits_file, fasta_file, og2level, translate=args.translate, cpu=args.cpu, excluded_taxa=args.excluded_taxa)): if qn and (qn % 25 == 0): total_time = time.time() - start_time print >>sys.stderr, qn + \ 1, total_time, "%0.2f q/s (refinement)" % ( (float(qn + 1) / total_time)) sys.stderr.flush() query_name = r[0] best_hit_name = r[1] if best_hit_name == '-' or best_hit_name == 'ERROR': continue best_hit_evalue = float(r[2]) best_hit_score = float(r[3]) print >>OUT, '\t'.join(map(str, (query_name, best_hit_name, best_hit_evalue, best_hit_score))) #OUT.flush() elapsed_time = time.time() - start_time if not args.no_file_comments: print >>OUT, '# %d queries scanned' % (qn + 1) print >>OUT, '# Total time (seconds):', elapsed_time print >>OUT, '# Rate:', "%0.2f q/s" % ((float(qn + 1) / elapsed_time)) OUT.close() print colorify(" Processed queries:%s total_time:%s rate:%s" %\ (qn+1, elapsed_time, "%0.2f q/s" % ((float(qn+1) / elapsed_time))), 'lblue')
def get_seq_hmm_matches(hits_file): annota.connect() print colorify("Reading HMM matches", 'green') seq2oginfo = {} start_time = time.time() hitnames = set() if pexists(hits_file): for line in open(hits_file): if not line.strip() or line.startswith('#'): continue (query, hit, evalue, sum_score, query_length, hmmfrom, hmmto, seqfrom, seqto, q_coverage) = map(str.strip, line.split('\t')) if query not in seq2oginfo and hit not in ['ERROR', '-']: hitname = cleanup_og_name(hit) seq2oginfo[query] = [hitname, evalue, sum_score, query_length, hmmfrom, hmmto, seqfrom, seqto, q_coverage] return seq2oginfo
action="store_true", dest='quiet', help='quiet_mode') parser.add_argument("--data_dir", metavar='', type=existing_dir, help='Directory to use for DATA_PATH.') args = parser.parse_args() if args.data_dir: set_data_path(args.data_dir) if args.force or not pexists(pjoin(get_data_path(), 'og2level.tsv.gz')): print colorify('Downloading "og2level.tsv.gz" at %s' % get_data_path(), 'green') download_og2level() if 'all' in args.dbs: args.dbs = EGGNOG_DATABASES if args.force or not pexists(pjoin(get_data_path(), 'eggnog.db')): if args.allyes or ask("Download main annotation database?") == 'y': print colorify( 'Downloading "eggnog.db" at %s...' % get_data_path(), 'green') download_annotations() else: print 'Skipping' else: if not args.quiet:
def parse_args(parser): args = parser.parse_args() if args.version: print get_version() sys.exit(0) if not args.no_annot and not pexists(EGGNOGDB_FILE): print colorify( 'Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red') raise emapperException() if args.mode == 'diamond' and not pexists(EGGNOG_DMND_DB): print colorify( 'DIAMOND database data/eggnog_proteins.dmnd not present. Use download_eggnog_database.py to fetch it', 'red') raise emapperException() if args.cpu == 0: args.cpu = multiprocessing.cpu_count() # No --servermode available for diamond if args.mode == 'diamond' and args.servermode: parser.error( '--mode [diamond] and --servermode are mutually exclusive') # Output file required unless running in servermode if not args.servermode and not args.output: parser.error('An output project name is required (-o)') # Servermode implies using mem-based databases if args.servermode: args.usemem = True # Direct annotation implies no searches if args.annotate_hits_table: args.no_search = True args.no_annot = False # Check inputs for running sequence searches if not args.no_search and not args.servermode: if not args.input: parser.error('An input fasta file is required (-i)') # HMM if args.mode == 'hmmer': if not args.db and not args.guessdb: parser.error( 'HMMER mode requires specifying a target database (i.e. -d, --guessdb ))' ) if args.db and args.guessdb: parser.error('-d and --guessdb options are mutually exclusive') if args.guessdb: from ete3 import NCBITaxa ncbi = NCBITaxa() lineage = ncbi.get_lineage(args.guessdb) for tid in reversed(lineage): if tid in TAXID2LEVEL: print tid, TAXID2LEVEL[tid] args.db = TAXID2LEVEL[tid] break # DIAMOND elif args.mode == 'diamond': #if args.db or args.guessdb: # parser.error('diamond mode does not require -d or --guessdb options') pass return args
help='assume "yes" to all questions') parser.add_argument( '-s', action="store_true", dest='simulate', help='simulate and print commands. Nothing is downloaded') ## args = parser.parse_args() if args.dbname is None or args.dbname == "": print( colorify( f'A prefix name for the DB to be created is required. Use the --dbname option.', 'red')) sys.exit(1) if (args.taxids is None or args.taxids == "") and (args.taxa is None or args.taxa == ""): print( colorify(f'Either --taxids or --taxa parameter is required', 'red')) sys.exit(1) if (args.taxids is not None and args.taxids != "") and (args.taxa is not None and args.taxa != ""): print(colorify(f'Use either --taxids or --taxa, not both', 'red')) sys.exit(1)
def parse_args(parser):
    """Parse the command line and validate or normalise option combinations.

    Besides calling ``parser.parse_args()``, this function:

    * sets the data path from ``EGGNOG_DATA_DIR`` and/or ``--data_dir``
      (the command-line option is applied last);
    * handles ``--version`` and ``--list_taxa``, both of which print and exit;
    * stores the call description in ``args.call_info``;
    * replaces ``cpu == 0`` with the number of available cores;
    * validates the options required by the selected search mode;
    * copies the generic thresholds (``evalue``, ``score``, ``query_cover``)
      to the per-tool attributes;
    * normalises the taxonomic scope, target/excluded taxa, GO evidence and
      PFAM options;
    * derives ``args.cpus_per_worker`` from ``cpu``, ``num_workers`` and
      ``num_servers``.

    Returns:
        The parsed (and modified) argparse namespace.

    Raises:
        EmapperException: if a required database file is missing.
        ValueError: on an invalid ``--go_evidence``, ``--pfam_transfer`` or
            ``--pfam_realign`` value.
        SystemExit: through ``parser.error`` on invalid option combinations,
            or after ``--version`` / ``--list_taxa``.
    """
    args = parser.parse_args()

    if "EGGNOG_DATA_DIR" in os.environ:
        set_data_path(os.environ["EGGNOG_DATA_DIR"])

    if args.data_dir:
        set_data_path(args.data_dir)

    if args.version:
        # Fall back to the short version string if the full one cannot be built.
        try:
            version = get_full_version_info()
        except Exception:
            version = get_version()
        print(version)
        sys.exit(0)

    args.call_info = get_call_info()

    if args.list_taxa:
        from eggnogmapper.vars import LEVEL_DEPTH, LEVEL_DICT, LEVEL_NAMES, LEVEL_PARENTS
        print("tax_name\ttax_id\tdepth\tparents\tparents_names")
        for tax_name, tax_id in LEVEL_DICT.items():
            depth = LEVEL_DEPTH.get(tax_id, "-")
            parents = LEVEL_PARENTS.get(tax_id, "-")
            parents_names = [LEVEL_NAMES.get(x, "-") for x in parents]
            print(f"{tax_name}\t{tax_id}\t{depth}\t{','.join(parents)}\t{','.join(parents_names)}")
        sys.exit(0)

    if args.cpu == 0:
        args.cpu = multiprocessing.cpu_count()

    # translate
    if args.itype in [ITYPE_GENOME, ITYPE_META, ITYPE_PROTS] and args.translate == True:
        parser.error('"--translate" only can be used with "--itype CDS"')

    # Gene prediction
    if args.training_genome is not None and args.training_file is None:
        parser.error('"--training_genome requires --training_file"')

    if args.training_genome is None and args.training_file is not None:
        if not os.path.isfile(args.training_file):
            parser.error('"--training_file must point to an existing file, if no --training_genome is provided."')

    # Search modes
    if args.mode == SEARCH_MODE_DIAMOND:
        dmnd_db = args.dmnd_db if args.dmnd_db else get_eggnog_dmnd_db()
        if not pexists(dmnd_db):
            print(colorify('DIAMOND database %s not present. Use download_eggnog_database.py to fetch it' % dmnd_db, 'red'))
            raise EmapperException()

        if args.input is not None:
            if args.annotate_hits_table is not None:
                print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_DIAMOND}", 'blue'))
                args.annotate_hits_table = None
        else:
            # the default -m is diamond, but we will consider -m no_search as default when
            # --annotate_hits_table has been provided and -i has not been provided
            if args.annotate_hits_table is not None:
                print(colorify(f"Assuming -m {SEARCH_MODE_NO_SEARCH}", 'blue'))
                args.mode = SEARCH_MODE_NO_SEARCH
            else:
                parser.error('An input fasta file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        if args.resume == True:
            print(colorify("Diamond jobs cannot be resumed. --resume will be ignored.", 'blue'))
            args.resume = False

    elif args.mode == SEARCH_MODE_MMSEQS2:
        mmseqs_db = args.mmseqs_db if args.mmseqs_db else get_eggnog_mmseqs_db()
        if not pexists(mmseqs_db):
            print(colorify('MMseqs2 database %s not present. Use download_eggnog_database.py to fetch it' % mmseqs_db, 'red'))
            raise EmapperException()

        if not args.input:
            parser.error('An input fasta file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        if args.resume == True:
            print(colorify("MMseqs2 jobs cannot be resumed. --resume will be ignored.", 'blue'))
            args.resume = False

        if args.annotate_hits_table is not None:
            print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_MMSEQS2}", 'blue'))
            args.annotate_hits_table = None

    elif args.mode == SEARCH_MODE_HMMER:
        # The cpu / worker consistency check is done once for every mode,
        # just before returning.
        if not args.input:
            parser.error('An input file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        # Hmmer database
        # NOTE: hmmer database format, name and checking if exists is done within hmmer module
        if not args.db:
            parser.error('HMMER mode requires a target database (-d, --database).')

        if args.itype == ITYPE_CDS:
            args.translate = True

        if (args.itype == ITYPE_GENOME or args.itype == ITYPE_META) and args.genepred == GENEPRED_MODE_SEARCH:
            parser.error('HMMER mode is not compatible with "--genepred search" option.')

        if args.annotate_hits_table is not None:
            print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_HMMER}", 'blue'))
            args.annotate_hits_table = None

        # The literal option value "none" means no overlap cleaning.
        if args.clean_overlaps is not None:
            if args.clean_overlaps == "none":
                args.clean_overlaps = None

    elif args.mode == SEARCH_MODE_CACHE:
        if args.cache_file is None:
            parser.error('A file with annotations and md5 of queries is required (-c FILE)')
        if args.decorate_gff != DECORATE_GFF_NONE:
            print(colorify("WARNING: no GFF will be created for cache-based annotations. It is not implemented yet, sorry.", 'red'))

        if args.no_annot == True:
            parser.error(f'Cache mode (-m {SEARCH_MODE_CACHE}) should be used to annotate.')

    elif args.mode == SEARCH_MODE_NO_SEARCH:
        if args.no_annot == False and not args.annotate_hits_table:
            parser.error(f'No search mode (-m {SEARCH_MODE_NO_SEARCH}) requires a hits table to annotate (--annotate_hits_table FILE.seed_orthologs)')
        if args.md5 == True and args.input is None:
            parser.error('--md5 requires an input FASTA file (-i FASTA).')

    else:
        parser.error(f'unrecognized search mode (-m {args.mode})')

    # Search thresholds
    args.dmnd_evalue = args.mmseqs_evalue = args.hmm_evalue = args.evalue
    # `args.hmm_score` used to be misspelled `args_hmm_score`, which only
    # bound a throwaway local and left the HMMER score threshold unset.
    args.dmnd_score = args.mmseqs_score = args.hmm_score = args.score
    args.qcov = args.query_cover

    # Annotation options
    if args.no_annot == False or args.report_orthologs == True:
        if not pexists(get_eggnogdb_file()):
            print(colorify('Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red'))
            raise EmapperException()

    args.tax_scope_mode, args.tax_scope_id = __parse_tax_scope(args.tax_scope)

    # Comma-separated option strings become lists.
    if args.target_taxa is not None:
        args.target_taxa = args.target_taxa.split(",")
    if args.excluded_taxa is not None:
        args.excluded_taxa = args.excluded_taxa.split(",")

    # Sets GO evidence bases
    if args.go_evidence == 'experimental':
        args.go_evidence = set(["EXP", "IDA", "IPI", "IMP", "IGI", "IEP"])
        args.go_excluded = set(["ND", "IEA"])
    elif args.go_evidence == 'non-electronic':
        args.go_evidence = None
        args.go_excluded = set(["ND", "IEA"])
    elif args.go_evidence == 'all':
        args.go_evidence = None
        args.go_excluded = None
    else:
        raise ValueError('Invalid --go_evidence value')

    # PFAM annotation options
    if args.pfam_transfer not in [PFAM_TRANSFER_BEST_OG, PFAM_TRANSFER_NARROWEST_OG, PFAM_TRANSFER_SEED_ORTHOLOG]:
        raise ValueError(f'Invalid --pfam_transfer option {args.pfam_transfer}')

    if args.pfam_realign == PFAM_REALIGN_NONE:
        pass
    elif args.pfam_realign == PFAM_REALIGN_REALIGN or args.pfam_realign == PFAM_REALIGN_DENOVO:
        if not args.input:
            parser.error(f'An input fasta file is required (-i) for --pfam_realign {args.pfam_realign}')
    else:
        raise ValueError(f'Invalid --pfam_realign option {args.pfam_realign}')

    # CPUs are split evenly among all workers of all servers.
    total_workers = args.num_workers * args.num_servers
    if args.cpu < total_workers:
        parser.error(f"Less cpus ({args.cpu}) than total workers ({total_workers}) were specified.")
    if args.cpu % total_workers != 0:
        parser.error(f"Number of cpus ({args.cpu}) must be a multiple of total workers ({total_workers}).")
    args.cpus_per_worker = int(args.cpu / total_workers)

    return args
def setup_hmm_search(args): host = 'localhost' idmap = None if args.usemem: scantype = 'mem' else: scantype = 'disk' connecting_to_server = False # If searching against a predefined database name if args.db in EGGNOG_DATABASES: dbpath, port = get_db_info(args.db) print dbpath db_present = [ pexists(dbpath + "." + ext) for ext in 'h3f h3i h3m h3p idmap'.split() ] if False in db_present: print db_present print colorify( 'Database %s not present. Use download_eggnog_database.py to fetch it' % (args.db), 'red') raise ValueError('Database not found') if not args.no_refine: if not pexists(pjoin(get_data_path(), 'OG_fasta')): print colorify( 'Database data/OG_fasta/ not present. Use download_eggnog_database.py to fetch it', 'red') raise ValueError('Database not found') if scantype == 'mem': idmap_file = dbpath + '.idmap' end_port = 53200 # If searching against a custom hmm database elif os.path.isfile(args.db + '.h3f'): dbpath = args.db if scantype == 'mem': idmap_file = args.db + ".idmap" if not pexists(idmap_file): if generate_idmap(args.db): idmap_file = args.db + ".idmap" print >> sys.stderr, "idmap succesfully created!" 
else: raise ValueError("idmap could not be created!") port = 53000 end_port = 53200 else: idmap_file = None port = None # If searching against a emapper hmm server elif ":" in args.db: dbname, host, port = map(str.strip, args.db.split(":")) scantype = 'mem' port = int(port) if dbname in EGGNOG_DATABASES: dbfile, port = get_db_info(dbname) args.db = dbname else: dbfile = dbname idmap_file = dbfile + '.idmap' if not pexists(idmap_file): raise ValueError("idmap file not found: %s" % idmap_file) dbpath = host if not server_functional(host, port, args.dbtype): print colorify( "eggnog-mapper server not found at %s:%s" % (host, port), 'red') exit(1) connecting_to_server = True else: raise ValueError('Invalid database name/server') # If memory based searches requested, start server if scantype == "mem" and not connecting_to_server: master_db, worker_db = None, None for try_port in range(port, end_port, 2): print colorify( "Loading server at localhost, port %s-%s" % (try_port, try_port + 1), 'lblue') dbpath, master_db, worker_db = load_server(dbpath, try_port, try_port + 1, args.cpu) port = try_port ready = False for _ in xrange(TIMEOUT_LOAD_SERVER): print "Waiting for server to become ready...", host, try_port time.sleep(1) if not master_db.is_alive() or not worker_db.is_alive(): master_db.terminate() master_db.join() worker_db.terminate() worker_db.join() break elif server_functional(host, port, args.dbtype): ready = True break if ready: dbpath = host break elif scantype == "mem": print colorify("DB Server already running or not needed!", 'yellow') dbpath = host # Preload seqid map to translate hits from hmmpgmd if scantype == "mem": print colorify("Reading idmap %s" % idmap_file, color='lblue') idmap = {} for _lnum, _line in enumerate(open(idmap_file)): if not _line.strip(): continue try: _seqid, _seqname = map(str, _line.strip().split(' ')) except ValueError: if _lnum == 0: # idmap generated by esl_reformat has info line at beginning continue else: raise _seqid = 
int(_seqid) idmap[_seqid] = [_seqname] print len(idmap), "names loaded" # If server mode, just listen for connections and exit when interrupted if args.servermode: while True: print colorify( "Server ready listening at %s:%s and using %d CPU cores" % (host, port, args.cpu), 'green') print colorify( "Use `emapper.py -d %s:%s:%s (...)` to search against this server" % (args.db, host, port), 'lblue') time.sleep(10) raise emapperException() else: return host, port, dbpath, scantype, idmap
args = parser.parse_args() if "EGGNOG_DATA_DIR" in os.environ: set_data_path(os.environ["EGGNOG_DATA_DIR"]) if args.data_dir: set_data_path(args.data_dir) data_path = get_data_path() ## # Annotation DB if args.force or not pexists(get_eggnogdb_file()): if args.allyes or ask("Download main annotation database?") == 'y': print(colorify(f'Downloading "eggnog.db" at {data_path}...', 'green')) download_annotations(data_path) else: print('Skipping') else: if not args.quiet: print(colorify('Skipping eggnog.db database (already present). Use -f to force download', 'lblue')) ## # NCBI taxa if args.force or not pexists(get_ncbitaxadb_file()): if args.allyes or ask("Download taxa database?") == 'y': print(colorify(f'Downloading "eggnog.taxa.db" at {data_path}...', 'green')) download_taxa(data_path) else:
dbpath, host, idmap_file = setup_custom_db(args.db, scantype=SCANTYPE_MEM, dbtype=args.dbtype) host = 'localhost' port = args.port end_port = args.end_port wport = args.wport dbpath, host, port, servers = create_servers(args.dbtype, dbpath, host, port, end_port, args.num_servers, args.num_workers, args.cpus_per_worker) print(colorify("All servers ready and listening", 'green')) if args.output_servers_list is not None: print(f"Creating servers list file: {args.output_servers_list}") with open(args.output_servers_list, 'w') as outfn: for server in servers: print(f"{server[0]}:{server[1]}", file=outfn) print(f"File {args.output_servers_list} created successfully.") print( colorify( f"Use `emapper.py (-d db:host:port or --servers_list {args.output_servers_list}) to search against these servers", 'lblue')) else: print( colorify( "Use `emapper.py (-d db:host:port or --servers_list FILE) to search against these servers",
def run(cmd): print colorify(cmd, 'cyan') if not args.simulate: os.system(cmd)
def annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args): annot_header = ( "#query_name", "seed_eggNOG_ortholog", "seed_ortholog_evalue", "seed_ortholog_score", "predicted_gene_name", "GO_terms", "KEGG_KOs", "BiGG_reactions", "Annotation_tax_scope", "OGs", "bestOG|evalue|score", "COG cat", "eggNOG annot", ) start_time = time.time() seq2bestOG = {} if pexists(hmm_hits_file): seq2bestOG = get_seq_hmm_matches(hmm_hits_file) seq2annotOG = annota.get_ogs_annotations( set([v[0] for v in seq2bestOG.itervalues()])) print colorify("Functional annotation of refined hits starts now", 'green') OUT = open(annot_file, "w") if args.report_orthologs: ORTHOLOGS = open(annot_file + ".orthologs", "w") if not args.no_file_comments: print >> OUT, '# emapper version:', get_version( ), 'emapper DB:', get_db_version() print >> OUT, '# command: ./emapper.py ', ' '.join(sys.argv[1:]) print >> OUT, '# time: ' + time.ctime() print >> OUT, '\t'.join(annot_header) qn = 0 pool = multiprocessing.Pool(args.cpu) for result in pool.imap(annotate_hit_line, iter_hit_lines(seed_orthologs_file, args)): qn += 1 if qn and (qn % 500 == 0): total_time = time.time() - start_time print >> sys.stderr, qn, total_time, "%0.2f q/s (refinement)" % ( (float(qn) / total_time)) sys.stderr.flush() if result: (query_name, best_hit_name, best_hit_evalue, best_hit_score, best_name, gos, kegg, bigg, annot_level_max, match_nogs, orthologs) = result if query_name in seq2bestOG: (hitname, evalue, score, qlength, hmmfrom, hmmto, seqfrom, seqto, q_coverage) = seq2bestOG[query_name] bestOG = '%s|%s|%s' % (hitname, evalue, score) og_cat, og_desc = seq2annotOG.get(hitname, ['', '']) else: bestOG = 'NA|NA|NA' og_cat, og_desc = annota.get_best_og_description(match_nogs) if args.report_orthologs: print >> ORTHOLOGS, '\t'.join( map(str, (query_name, ','.join(orthologs)))) print >> OUT, '\t'.join( map(str, ( query_name, best_hit_name, best_hit_evalue, best_hit_score, best_name, ','.join(sorted(gos)), 
','.join(sorted(kegg)), ','.join(sorted(bigg)), annot_level_max, ','.join(match_nogs), bestOG, og_cat.replace('\n', ''), og_desc.replace('\n', ' '), ))) OUT.flush() pool.terminate() elapsed_time = time.time() - start_time if not args.no_file_comments: print >> OUT, '# %d queries scanned' % (qn) print >> OUT, '# Total time (seconds):', elapsed_time print >> OUT, '# Rate:', "%0.2f q/s" % ((float(qn) / elapsed_time)) OUT.close() if args.report_orthologs: ORTHOLOGS.close() print colorify(" Processed queries:%s total_time:%s rate:%s" %\ (qn, elapsed_time, "%0.2f q/s" % ((float(qn) / elapsed_time))), 'lblue')
def setup_hmm_search(args): host = 'localhost' idmap = None if args.usemem: scantype = 'mem' else: scantype = 'disk' connecting_to_server = False # If searching against a predefined database name if args.db in EGGNOG_DATABASES: dbpath, port = get_db_info(args.db) db_present = [pexists(dbpath + "." + ext) for ext in 'h3f h3i h3m h3p idmap'.split()] if False in db_present: print db_present print colorify('Database %s not present. Use download_eggnog_database.py to fetch it' % (args.db), 'red') raise ValueError('Database not found') if not args.no_refine: if not pexists(pjoin(DATA_PATH, 'OG_fasta')): print colorify('Database data/OG_fasta/ not present. Use download_eggnog_database.py to fetch it', 'red') raise ValueError('Database not found') if scantype == 'mem': idmap_file = dbpath + '.idmap' end_port = 53200 # If searching against a custom hmm database elif os.path.isfile(args.db + '.h3f'): dbpath = args.db if scantype == 'mem': idmap_file = args.db + ".idmap" if not pexists(idmap_file): if generate_idmap(args.db): idmap_file = args.db + ".idmap" print >>sys.stderr, "idmap succesfully created!" 
else: raise ValueError("idmap could not be created!") port = 53000 end_port = 53200 else: idmap_file = None port = None # If searching against a emapper hmm server elif ":" in args.db: dbname, host, port = map(str.strip, args.db.split(":")) scantype = 'mem' port = int(port) if dbname in EGGNOG_DATABASES: dbfile, port = get_db_info(dbname) args.db = dbname else: dbfile = dbname idmap_file = dbfile + '.idmap' if not pexists(idmap_file): raise ValueError("idmap file not found: %s" % idmap_file) dbpath = host if not server_functional(host, port, args.dbtype): print colorify("eggnog-mapper server not found at %s:%s" % (host, port), 'red') exit(1) connecting_to_server = True else: raise ValueError('Invalid database name/server') # If memory based searches requested, start server if scantype == "mem" and not connecting_to_server: master_db, worker_db = None, None for try_port in range(port, end_port, 2): print colorify("Loading server at localhost, port %s-%s" % (try_port, try_port + 1), 'lblue') dbpath, master_db, worker_db = load_server( dbpath, try_port, try_port + 1, args.cpu) port = try_port ready = False for _ in xrange(TIMEOUT_LOAD_SERVER): print "Waiting for server to become ready...", host, try_port time.sleep(1) if not master_db.is_alive() or not worker_db.is_alive(): master_db.terminate() master_db.join() worker_db.terminate() worker_db.join() break elif server_functional(host, port, args.dbtype): ready = True break if ready: dbpath = host break elif scantype == "mem": print colorify("DB Server already running or not needed!", 'yellow') dbpath = host # Preload seqid map to translate hits from hmmpgmd if scantype == "mem": print colorify("Reading idmap %s" % idmap_file, color='lblue') idmap = {} for _lnum, _line in enumerate(open(idmap_file)): if not _line.strip(): continue try: _seqid, _seqname = map(str, _line.strip().split(' ')) except ValueError: if _lnum == 0: # idmap generated by esl_reformat has info line at beginning continue else: raise _seqid = 
int(_seqid) idmap[_seqid] = [_seqname] print len(idmap), "names loaded" # If server mode, just listen for connections and exit when interrupted if args.servermode: while True: print colorify("Server ready listening at %s:%s and using %d CPU cores" % (host, port, args.cpu), 'green') print colorify("Use `emapper.py -d %s:%s:%s (...)` to search against this server" % (args.db, host, port), 'lblue') time.sleep(10) raise emapperException() else: return host, port, dbpath, scantype, idmap
return __author__+" "+__license__+" : "+__description__ if __name__ == "__main__": parser = create_arg_parser() args = parse_args(parser) _total_time = time.time() try: print('# ', get_version()) print('# hmm_worker.py ', ' '.join(sys.argv[1:])) worker_db = None print(colorify(f"Loading worker at localhost, port {args.port}, connecting to {args.host}", 'green')) worker_db = load_worker(args.host, args.port, args.cpu) ready = False for _ in range(TIMEOUT_LOAD_SERVER): print(f"Waiting for worker to become ready at localhost:{args.port} ...") time.sleep(1) if worker_db.is_alive(): break else: worker_db.terminate() worker_db.join() print(colorify("worker not alive"), 'red') break print(colorify("Worker of master %s ready listening at localhost:%s and using %d CPU cores" % (args.host, args.port, args.cpu), 'lblue'))
def dump_hmm_matches(fasta_file, hits_file, dbpath, port, scantype, idmap, args): hits_header = ("#query_name", "hit", "evalue", "sum_score", "query_length", "hmmfrom", "hmmto", "seqfrom", "seqto", "query_coverage") # Cache previous results if resuming is enabled VISITED = set() if args.resume and pexists(hits_file): print colorify("Resuming previous run. Reading computed output from %s" % hits_file, 'yellow') VISITED = set([line.split('\t')[0].strip() for line in open(hits_file) if not line.startswith('#')]) print len(VISITED), 'queries skipped' OUT = open(hits_file, 'a') else: OUT = open(hits_file, 'w') print colorify("Sequence mapping starts now!", 'green') if not args.no_file_comments: print >>OUT, get_call_info() print >>OUT, '# ' + '\t'.join(hits_header) total_time = 0 last_time = time.time() start_time = time.time() qn = 0 for qn, (name, elapsed, hits, querylen, seq) in enumerate(search.iter_hits( fasta_file, args.translate, args.qtype, args.dbtype, scantype, dbpath, port, evalue_thr=args.evalue, score_thr=args.score, qcov_thr=args.qcov, fixed_Z=args.Z, max_hits=args.maxhits, skip=VISITED, maxseqlen=args.maxseqlen, cpus=args.cpu)): if elapsed == -1: # error occurred print >>OUT, '\t'.join( [name] + ['ERROR'] * (len(hits_header) - 1)) elif not hits: print >>OUT, '\t'.join([name] + ['-'] * (len(hits_header) - 1)) else: for hitindex, (hid, heval, hscore, hmmfrom, hmmto, sqfrom, sqto, domscore) in enumerate(hits): hitname = hid if idmap: hitname = idmap[hid][0] print >>OUT, '\t'.join(map(str, [name, hitname, heval, hscore, int(querylen), int(hmmfrom), int(hmmto), int(sqfrom), int(sqto), float(sqto - sqfrom) / querylen])) OUT.flush() # monitoring total_time += time.time() - last_time last_time = time.time() if qn and (qn % 25 == 0): print >>sys.stderr, qn + \ 1, total_time, "%0.2f q/s" % ((float(qn + 1) / total_time)) sys.stderr.flush() # Writes final stats elapsed_time = time.time() - start_time if not args.no_file_comments: print >>OUT, '# %d queries scanned' % 
(qn + 1) print >>OUT, '# Total time (seconds):', elapsed_time print >>OUT, '# Rate:', "%0.2f q/s" % ((float(qn + 1) / elapsed_time)) OUT.close() print colorify(" Processed queries:%s total_time:%s rate:%s" %\ (qn+1, elapsed_time, "%0.2f q/s" % ((float(qn+1) / elapsed_time))), 'lblue')
def annotate_hits_file_sequential(seed_orthologs_file, annot_file, hmm_hits_file, args): annot_header = ( "#query_name", "seed_eggNOG_ortholog", "seed_ortholog_evalue", "seed_ortholog_score", "predicted_gene_name", "GO_terms", "KEGG_pathways", "Annotation_tax_scope", "OGs", "bestOG|evalue|score", "COG cat", "eggNOG annot", ) start_time = time.time() seq2bestOG = {} if pexists(hmm_hits_file): seq2bestOG = get_seq_hmm_matches(hmm_hits_file) seq2annotOG = annota.get_ogs_annotations( set([v[0] for v in seq2bestOG.itervalues()])) print colorify("Functional annotation of refined hits starts now", 'green') OUT = open(annot_file, "w") if not args.no_file_comments: print >> OUT, '# ' + time.ctime() print >> OUT, '# ' + ' '.join(sys.argv) print >> OUT, '\t'.join(annot_header) qn = 0 for line in open(seed_orthologs_file): if not line.strip() or line.startswith('#'): continue if qn and (qn % 500 == 0): total_time = time.time() - start_time print >> sys.stderr, qn + 1, total_time, "%0.2f q/s (refinement)" % ( (float(qn + 1) / total_time)) sys.stderr.flush() qn += 1 r = map(str.strip, line.split('\t')) query_name = r[0] best_hit_name = r[1] if best_hit_name == '-' or best_hit_name == 'ERROR': continue best_hit_evalue = float(r[2]) best_hit_score = float(r[3]) if best_hit_score < args.seed_ortholog_score or best_hit_evalue > args.seed_ortholog_evalue: continue match_nogs = annota.get_member_ogs(best_hit_name) if not match_nogs: continue match_levels = set([nog.split("@")[1] for nog in match_nogs]) if args.tax_scope == "auto": for level in TAXONOMIC_RESOLUTION: if level in match_levels: annot_levels = set(LEVEL_CONTENT.get(level, [level])) annot_levels.add(level) annot_level_max = "%s[%d]" % (level, len(annot_levels)) break else: annot_levels = set( LEVEL_CONTENT.get(args.tax_scope, [args.tax_scope])) annot_levels.add(args.tax_scope) annot_level_max = "%s[%d]" % (args.tax_scope, len(annot_levels)) all_orthologies = annota.get_member_orthologs( best_hit_name, 
target_levels=annot_levels) orthologs = sorted(all_orthologies[args.target_orthologs]) if args.excluded_taxa: orthologs = [ o for o in orthologs if not o.startswith("%s." % args.excluded_taxa) ] if orthologs: pname, gos, keggs = annota.get_member_annotations( orthologs, target_go_ev=args.go_evidence, excluded_go_ev=args.go_excluded) best_name = '' if pname: name_candidate, freq = pname.most_common(1)[0] if freq >= 2: best_name = name_candidate else: pname = [] best_name = '' gos = set() keggs = set() if query_name in seq2bestOG: (hitname, evalue, score, qlength, hmmfrom, hmmto, seqfrom, seqto, q_coverage) = seq2bestOG[query_name] bestOG = '%s|%s|%s' % (hitname, evalue, score) og_cat, og_desc = seq2annotOG.get(hitname, ['', '']) else: bestOG = 'NA|NA|NA' og_cat, og_desc = '', '' print >> OUT, '\t'.join( map(str, ( query_name, best_hit_name, best_hit_evalue, best_hit_score, best_name, ','.join(sorted(gos)), ','.join(sorted(map(lambda x: "map%05d" % x, map(int, keggs)))), annot_level_max, ','.join(match_nogs), bestOG, og_cat.replace('\n', ''), og_desc.replace('\n', ' '), ))) OUT.flush() elapsed_time = time.time() - start_time if not args.no_file_comments: print >> OUT, '# %d queries scanned' % (qn + 1) print >> OUT, '# Total time (seconds):', elapsed_time print >> OUT, '# Rate:', "%0.2f q/s" % ((float(qn + 1) / elapsed_time)) OUT.close() print colorify(" Processed queries:%s total_time:%s rate:%s" %\ (qn+1, elapsed_time, "%0.2f q/s" % ((float(qn+1) / elapsed_time))), 'lblue')
def main(args): # Output and intermediate files hmm_hits_file = "%s.emapper.hmm_hits" % args.output seed_orthologs_file = "%s.emapper.seed_orthologs" % args.output annot_file = "%s.emapper.annotations" % args.output orthologs_file = "%s.emapper.predict_orthologs" % args.output if args.no_search: output_files = [annot_file] elif args.no_annot: output_files = [hmm_hits_file, seed_orthologs_file] else: output_files = [hmm_hits_file, seed_orthologs_file, annot_file] # convert to absolute path before changing directory if args.annotate_hits_table: args.annotate_hits_table = os.path.abspath(args.annotate_hits_table) # force user to decide what to do with existing files os.chdir(args.output_dir) files_present = set([pexists(fname) for fname in output_files]) if True in files_present and not args.resume and not args.override: print "Output files detected in disk. Use --resume or --override to continue" raise emapperException() if args.override: for outf in output_files: silent_rm(outf) print '# ', get_version() print '# ./emapper.py ', ' '.join(sys.argv[1:]) if args.scratch_dir: # If resuming in and using --scratch_dir, transfer existing files. if args.resume and args.scratch_dir: for f in output_files: if pexists(f): print " Copying input file %s to scratch dir %s" % ( f, args.scratch_dir) shutil.copy(f, args.scratch_dir) # Change working dir os.chdir(args.scratch_dir) # Step 1. 
Sequence search if not args.no_search: if args.mode == 'diamond' and not args.no_search: dump_diamond_matches(args.input, seed_orthologs_file, args) elif args.mode == 'hmmer' and not args.no_search: host, port, dbpath, scantype, idmap = setup_hmm_search(args) # Start HMM SCANNING sequences (if requested) if not pexists(hmm_hits_file) or args.override: dump_hmm_matches(args.input, hmm_hits_file, dbpath, port, scantype, idmap, args) if not args.no_refine and (not pexists(seed_orthologs_file) or args.override): if args.db == 'viruses': print 'Skipping seed ortholog detection in "viruses" database' elif args.db in EGGNOG_DATABASES: refine_matches(args.input, seed_orthologs_file, hmm_hits_file, args) else: print 'refined hits not available for custom hmm databases.' # Step 2. Annotation if not args.no_annot: annota.connect() if args.annotate_hits_table: if not os.path.exists(args.annotate_hits_table): raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), args.annotate_hits_table) annotate_hits_file(args.annotate_hits_table, annot_file, hmm_hits_file, args) elif args.db == 'viruses': annotate_hmm_matches(hmm_hits_file, hmm_hits_file + '.annotations', args) OUT = open(annot_file, 'w') for line in open(hmm_hits_file + '.annotations'): if line.startswith('#') or not line.strip(): continue (query, hitname, level, evalue, sum_score, query_length, hmmfrom, hmmto, seqfrom, seqto, q_coverage, nm, desc, cats) = line.split("\t") if hitname != '-' and hitname != 'ERROR': print >> OUT, '\t'.join( map(str, (query, hitname, evalue, sum_score, '', '', '', 'viruses', hitname + "@viruses", "%s|%s|%s" % (hitname, evalue, sum_score), cats.replace('\n', ''), desc.replace('\n', ' ')))) OUT.close() else: annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args) if args.predict_ortho: orthology.connect() dump_orthologs(seed_orthologs_file, orthologs_file, args) # If running in scratch, move files to real output dir and clean up if args.scratch_dir: for fname in output_files: if 
pexists(fname): print " Copying result file %s from scratch to %s" % ( fname, args.output_dir) shutil.copy(annot_file, args.output_dir) print " Cleaning result file %s from scratch dir" % (fname) # Finalize and exit print colorify('Done', 'green') for f in output_files: colorify('Result files:', 'yellow') if pexists(f): print " %s" % (f) print 'Total time: %g secs' % (time.time() - _total_time) if args.mode == 'hmmer': print get_citation(['hmmer']) elif args.mode == 'diamond': print get_citation(['diamond']) shutdown_server()
def dump_hmm_matches(fasta_file, hits_file, dbpath, port, scantype, idmap, args): hits_header = ("#query_name", "hit", "evalue", "sum_score", "query_length", "hmmfrom", "hmmto", "seqfrom", "seqto", "query_coverage") # Cache previous results if resuming is enabled VISITED = set() if args.resume and pexists(hits_file): print colorify( "Resuming previous run. Reading computed output from %s" % hits_file, 'yellow') VISITED = set([ line.split('\t')[0].strip() for line in open(hits_file) if not line.startswith('#') ]) print len(VISITED), 'queries skipped' OUT = open(hits_file, 'a') else: OUT = open(hits_file, 'w') print colorify("Sequence mapping starts now!", 'green') if not args.no_file_comments: print >> OUT, get_call_info() print >> OUT, '# ' + '\t'.join(hits_header) total_time = 0 last_time = time.time() start_time = time.time() qn = 0 # in case nothing to loop bellow for qn, (name, elapsed, hits, querylen, seq) in enumerate( search.iter_hits(fasta_file, args.translate, args.qtype, args.dbtype, scantype, dbpath, port, evalue_thr=args.evalue, score_thr=args.score, qcov_thr=args.qcov, fixed_Z=args.Z, max_hits=args.maxhits, skip=VISITED, maxseqlen=args.maxseqlen, cpus=args.cpu, base_tempdir=args.temp_dir)): if elapsed == -1: # error occurred print >> OUT, '\t'.join([name] + ['ERROR'] * (len(hits_header) - 1)) elif not hits: print >> OUT, '\t'.join([name] + ['-'] * (len(hits_header) - 1)) else: for hitindex, (hid, heval, hscore, hmmfrom, hmmto, sqfrom, sqto, domscore) in enumerate(hits): hitname = hid if idmap: hitname = idmap[hid][0] print >> OUT, '\t'.join( map(str, [ name, hitname, heval, hscore, int(querylen), int(hmmfrom), int(hmmto), int(sqfrom), int(sqto), float(sqto - sqfrom) / querylen ])) OUT.flush() # monitoring total_time += time.time() - last_time last_time = time.time() if qn and (qn % 25 == 0): print >>sys.stderr, qn + \ 1, total_time, "%0.2f q/s" % ((float(qn + 1) / total_time)) sys.stderr.flush() # Writes final stats elapsed_time = time.time() - 
start_time if not args.no_file_comments: print >> OUT, '# %d queries scanned' % (qn + 1) print >> OUT, '# Total time (seconds):', elapsed_time print >> OUT, '# Rate:', "%0.2f q/s" % ((float(qn + 1) / elapsed_time)) OUT.close() print colorify(" Processed queries:%s total_time:%s rate:%s" %\ (qn+1, elapsed_time, "%0.2f q/s" % ((float(qn+1) / elapsed_time))), 'lblue')
def dump_diamond_matches(fasta_file, seed_orthologs_file, args): cpu = args.cpu score_thr = args.seed_ortholog_score evalue_thr = args.seed_ortholog_evalue excluded_taxa = args.excluded_taxa if args.excluded_taxa else None if args.translate: tool = 'blastx' else: tool = 'blastp' dmnd_db = args.dmnd_db if args.dmnd_db else get_eggnog_dmnd_db() query_cov = args.query_cover subject_cov = args.subject_cover dmnd_opts = '' if args.matrix is not None: dmnd_opts += ' --matrix %s' % args.matrix if args.gapopen is not None: dmnd_opts += ' --gapopen %d' % args.gapopen if args.gapextend is not None: dmnd_opts += ' --gapextend %d' % args.gapextend if not DIAMOND: raise ValueError("diamond not found in path") tempdir = mkdtemp(prefix='emappertmp_dmdn_', dir=args.temp_dir) raw_output_file = pjoin(tempdir, uuid.uuid4().hex) if excluded_taxa: cmd = '%s %s -d %s -q %s --more-sensitive --threads %s -e %f -o %s --max-target-seqs 25 --query-cover %s --subject-cover %s' %\ (DIAMOND, tool, dmnd_db, fasta_file, cpu, evalue_thr, raw_output_file, query_cov, subject_cov) else: cmd = '%s %s -d %s -q %s --more-sensitive --threads %s -e %f -o %s --top 3 --query-cover %s --subject-cover %s' %\ (DIAMOND, tool, dmnd_db, fasta_file, cpu, evalue_thr, raw_output_file, query_cov, subject_cov) #diamond blastp --threads "${GALAXY_SLOTS:-12}" --db ./database --query '/panfs/roc/galaxy/GALAXYP/files/000/164/dataset_164640.dat' --query-gencode '1' --outfmt '6' qseqid sseqid sallseqid qlen slen pident length nident mismatch positive gapopen gaps ppos qstart qend sstart send qseq sseq evalue bitscore score qframe stitle salltitles qcovhsp --out '/panfs/roc/galaxy/GALAXYP/files/000/164/dataset_164759.dat' --compress '0' --gapopen '10' --gapextend '1' --matrix 'PAM30' --seg 'yes' --max-target-seqs '25' --evalue '0.001' --id '0' --query-cover '0' --block-size '2.0' print colorify(' ' + cmd, 'yellow') try: subprocess.check_call(cmd, shell=True, stdout=subprocess.PIPE) OUT = open('%s' % seed_orthologs_file, 'w') 
if not args.no_file_comments: print >> OUT, get_call_info() print >> OUT, '#', cmd visited = set() for line in open(raw_output_file): if not line.strip() or line.startswith('#'): continue fields = map(str.strip, line.split('\t')) query = fields[0] hit = fields[1] evalue = float(fields[10]) score = float(fields[11]) if query in visited: continue if evalue > evalue_thr or score < score_thr: continue if excluded_taxa and hit.startswith("%s." % excluded_taxa): continue visited.add(query) print >> OUT, '\t'.join(map(str, [query, hit, evalue, score])) OUT.close() except subprocess.CalledProcessError as e: raise e finally: shutil.rmtree(tempdir)
def main(args): # Output and intermediate files hmm_hits_file = "%s.emapper.hmm_hits" % args.output seed_orthologs_file = "%s.emapper.seed_orthologs" % args.output annot_file = "%s.emapper.annotations" % args.output if args.no_search: output_files = [annot_file] elif args.no_annot: output_files = [hmm_hits_file, seed_orthologs_file] else: output_files = [hmm_hits_file, seed_orthologs_file, annot_file] # force user to decide what to do with existing files os.chdir(args.output_dir) files_present = set([pexists(fname) for fname in output_files]) if True in files_present and not args.resume and not args.override: print "Output files detected in disk. Use --resume or --override to continue" raise emapperException() if args.override: for outf in output_files: silent_rm(outf) print '# ', get_version() print '# ./emapper.py ', ' '.join(sys.argv[1:]) if args.scratch_dir: # If resuming in and using --scratch_dir, transfer existing files. if args.resume and args.scratch_dir: for f in output_files: if pexists(f): print " Copying input file %s to scratch dir %s" % (f, args.scratch_dir) shutil.copy(f, args.scratch_dir) # Change working dir os.chdir(args.scratch_dir) # Step 1. Sequence search if not args.no_search: if args.mode == 'diamond' and not args.no_search: dump_diamond_matches(args.input, seed_orthologs_file, args) elif args.mode == 'hmmer' and not args.no_search: host, port, dbpath, scantype, idmap = setup_hmm_search(args) # Start HMM SCANNING sequences (if requested) if not pexists(hmm_hits_file) or args.override: dump_hmm_matches(args.input, hmm_hits_file, dbpath, port, scantype, idmap, args) if not args.no_refine and (not pexists(seed_orthologs_file) or args.override): if args.db == 'viruses': print 'Skipping seed ortholog detection in "viruses" database' elif args.db in EGGNOG_DATABASES: refine_matches(args.input, seed_orthologs_file, hmm_hits_file, args) else: print 'refined hits not available for custom hmm databases.' # Step 2. 
Annotation if not args.no_annot: annota.connect() if args.annotate_hits_table: annotate_hits_file(args.annotate_hits_table, annot_file, hmm_hits_file, args) elif args.db == 'viruses': annotate_hmm_matches(hmm_hits_file, hmm_hits_file+'.annotations', args) OUT = open(annot_file, 'w') for line in open(hmm_hits_file+'.annotations'): if line.startswith('#') or not line.strip(): continue (query, hitname, level, evalue, sum_score, query_length, hmmfrom, hmmto, seqfrom, seqto, q_coverage, nm, desc, cats) = line.split("\t") if hitname != '-' and hitname != 'ERROR': print >>OUT, '\t'.join(map(str, (query, hitname, evalue, sum_score, '', '', '', 'viruses', hitname+"@viruses", "%s|%s|%s" %(hitname, evalue, sum_score), cats.replace('\n', ''), desc.replace('\n', ' ')))) OUT.close() else: annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args) # If running in scratch, move files to real output dir and clean up if args.scratch_dir: for fname in output_files: if pexists(fname): print " Copying result file %s from scratch to %s" % (fname, args.output_dir) shutil.copy(annot_file, args.output_dir) print " Cleaning result file %s from scratch dir" %(fname) # Finalize and exit print colorify('Done', 'green') for f in output_files: colorify('Result files:', 'yellow') if pexists(f): print " %s" % (f) print 'Total time: %g secs' % (time.time()-_total_time) if args.mode == 'hmmer': print get_citation(['hmmer']) elif args.mode == 'diamond': print get_citation(['diamond']) shutdown_server()
help='assume "yes" to all questions') parser.add_argument('-f', action="store_true", dest='force', help='forces download even if the files exist') parser.add_argument('-s', action="store_true", dest='simulate', help='simulate and print commands. Nothing is downloaded') args = parser.parse_args() if 'all' in args.dbs: args.dbs = EGGNOG_DATABASES if args.force or not pexists(pjoin(DATA_PATH, 'eggnog.db')): if args.allyes or ask("Download main annotation database?") == 'y': print colorify('Downloading "eggnog.db" at %s...' %DATA_PATH, 'green') download_annotations() else: print 'Skipping' else: print colorify('Skipping eggnog.db database (already present). Use -f to force download', 'lblue') if args.force or not pexists(pjoin(DATA_PATH, 'OG_fasta')): if args.allyes or ask("Download OG fasta files for annotation refinement (~20GB after decompression)?") == 'y': print colorify('Downloading fasta files " at %s/OG_fasta...' %DATA_PATH, 'green') download_groups() else: print 'Skipping' else:
def annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args): annot_header = ("#query_name", "seed_eggNOG_ortholog", "seed_ortholog_evalue", "seed_ortholog_score", "predicted_gene_name", "GO_terms", "KEGG_pathways", "Annotation_tax_scope", "OGs", "bestOG|evalue|score", "COG cat", "eggNOG annot", ) start_time = time.time() seq2bestOG = {} if pexists(hmm_hits_file): seq2bestOG = get_seq_hmm_matches(hmm_hits_file) seq2annotOG = annota.get_ogs_annotations(set([v[0] for v in seq2bestOG.itervalues()])) print colorify("Functional annotation of refined hits starts now", 'green') OUT = open(annot_file, "w") if not args.no_file_comments: print >>OUT, '# ' + time.ctime() print >>OUT, '# ' + ' '.join(sys.argv) print >>OUT, '\t'.join(annot_header) qn = 0 for line in open(seed_orthologs_file): if not line.strip() or line.startswith('#'): continue if qn and (qn % 500 == 0): total_time = time.time() - start_time print >>sys.stderr, qn+1, total_time, "%0.2f q/s (refinement)" % ( (float(qn + 1) / total_time)) sys.stderr.flush() qn += 1 r = map(str.strip, line.split('\t')) query_name = r[0] best_hit_name = r[1] if best_hit_name == '-' or best_hit_name == 'ERROR': continue best_hit_evalue = float(r[2]) best_hit_score = float(r[3]) if best_hit_score < args.seed_ortholog_score or best_hit_evalue > args.seed_ortholog_evalue: continue match_nogs = annota.get_member_ogs(best_hit_name) if not match_nogs: continue match_levels = set([nog.split("@")[1] for nog in match_nogs]) if args.tax_scope == "auto": for level in TAXONOMIC_RESOLUTION: if level in match_levels: annot_levels = set(LEVEL_CONTENT.get(level, [level])) annot_levels.add(level) annot_level_max = "%s[%d]" %(level, len(annot_levels)) break else: annot_levels = set(LEVEL_CONTENT.get(args.tax_scope, [args.tax_scope])) annot_levels.add(args.tax_scope) annot_level_max = "%s[%d]" %(args.tax_scope, len(annot_levels)) all_orthologies = annota.get_member_orthologs(best_hit_name, target_levels=annot_levels) orthologs = 
sorted(all_orthologies[args.target_orthologs]) if args.excluded_taxa: orthologs = [o for o in orthologs if not o.startswith("%s." %args.excluded_taxa)] if orthologs: pname, gos, keggs = annota.get_member_annotations(orthologs, excluded_gos=set(["IEA", "ND"])) best_name = '' if pname: name_candidate, freq = pname.most_common(1)[0] if freq >= 2: best_name = name_candidate else: pname = [] best_name = '' gos = set() keggs = set() if query_name in seq2bestOG: (hitname, evalue, score, qlength, hmmfrom, hmmto, seqfrom, seqto, q_coverage) = seq2bestOG[query_name] bestOG = '%s|%s|%s' %(hitname, evalue, score) og_cat, og_desc = seq2annotOG.get(hitname, ['', '']) else: bestOG = 'NA|NA|NA' og_cat, og_desc = '', '' print >>OUT, '\t'.join(map(str, (query_name, best_hit_name, best_hit_evalue, best_hit_score, best_name, ','.join(sorted(gos)), ','.join(sorted(map(lambda x: "map%05d"%x, map(int, keggs)))), annot_level_max, ','.join(match_nogs), bestOG, og_cat.replace('\n', ''), og_desc.replace('\n', ' '), ))) OUT.flush() elapsed_time = time.time() - start_time if not args.no_file_comments: print >>OUT, '# %d queries scanned' % (qn + 1) print >>OUT, '# Total time (seconds):', elapsed_time print >>OUT, '# Rate:', "%0.2f q/s" % ((float(qn + 1) / elapsed_time)) OUT.close() print colorify(" Processed queries:%s total_time:%s rate:%s" %\ (qn+1, elapsed_time, "%0.2f q/s" % ((float(qn+1) / elapsed_time))), 'lblue')
def parse_args(parser): args = parser.parse_args() if args.version: print get_version() sys.exit(0) if "EGGNOG_DATA_DIR" in os.environ: set_data_path(os.environ["EGGNOG_DATA_DIR"]) if args.data_dir: set_data_path(args.data_dir) if not args.no_annot and not pexists(get_eggnogdb_file()): print colorify( 'Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red') raise emapperException() if args.mode == 'diamond': dmnd_db = args.dmnd_db if args.dmnd_db else get_eggnog_dmnd_db() if not pexists(dmnd_db): print colorify( 'DIAMOND database %s not present. Use download_eggnog_database.py to fetch it' % dmnd_db, 'red') raise emapperException() if args.cpu == 0: args.cpu = multiprocessing.cpu_count() # No --servermode available for diamond if args.mode == 'diamond' and args.servermode: parser.error( '--mode [diamond] and --servermode are mutually exclusive') # Output file required unless running in servermode if not args.servermode and not args.output: parser.error('An output project name is required (-o)') # Servermode implies using mem-based databases if args.servermode: args.usemem = True # Direct annotation implies no searches if args.annotate_hits_table: args.no_search = True args.no_annot = False # Sets GO evidence bases if args.go_evidence == 'experimental': args.go_evidence = set(["EXP", "IDA", "IPI", "IMP", "IGI", "IEP"]) args.go_excluded = set(["ND", "IEA"]) elif args.go_evidence == 'non-electronic': args.go_evidence = None args.go_excluded = set(["ND", "IEA"]) else: raise ValueError('Invalid --go_evidence value') # Check inputs for running sequence searches if not args.no_search and not args.servermode: if not args.input: parser.error('An input fasta file is required (-i)') # HMM if args.mode == 'hmmer': if not args.db and not args.guessdb: parser.error( 'HMMER mode requires specifying a target database (i.e. 
-d, --guessdb ))' ) if args.db and args.guessdb: parser.error('-d and --guessdb options are mutually exclusive') if args.guessdb: from ete3 import NCBITaxa ncbi = NCBITaxa() lineage = ncbi.get_lineage(args.guessdb) for tid in reversed(lineage): if tid in TAXID2LEVEL: print tid, TAXID2LEVEL[tid] args.db = TAXID2LEVEL[tid] break # DIAMOND elif args.mode == 'diamond': #if args.db or args.guessdb: # parser.error('diamond mode does not require -d or --guessdb options') pass return args
def parse_args(parser): args = parser.parse_args() if args.version: print get_version() sys.exit(0) if not args.no_annot and not pexists(EGGNOGDB_FILE): print colorify('Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red') raise emapperException() if args.mode == 'diamond' and not pexists(EGGNOG_DMND_DB): print colorify('DIAMOND database data/eggnog_proteins.dmnd not present. Use download_eggnog_database.py to fetch it', 'red') raise emapperException() if args.cpu == 0: args.cpu = multiprocessing.cpu_count() # No --servermode available for diamond if args.mode == 'diamond' and args.servermode: parser.error('--mode [diamond] and --servermode are mutually exclusive') # Output file required unless running in servermode if not args.servermode and not args.output: parser.error('An output project name is required (-o)') # Servermode implies using mem-based databases if args.servermode: args.usemem = True # Direct annotation implies no searches if args.annotate_hits_table: args.no_search = True args.no_annot = False # Check inputs for running sequence searches if not args.no_search and not args.servermode: if not args.input: parser.error('An input fasta file is required (-i)') # HMM if args.mode == 'hmmer': if not args.db and not args.guessdb: parser.error('HMMER mode requires specifying a target database (i.e. -d, --guessdb ))') if args.db and args.guessdb: parser.error('-d and --guessdb options are mutually exclusive') if args.guessdb: from ete3 import NCBITaxa ncbi = NCBITaxa() lineage = ncbi.get_lineage(args.guessdb) for tid in reversed(lineage): if tid in TAXID2LEVEL: print tid, TAXID2LEVEL[tid] args.db = TAXID2LEVEL[tid] break # DIAMOND elif args.mode == 'diamond': #if args.db or args.guessdb: # parser.error('diamond mode does not require -d or --guessdb options') pass return args
def dump_diamond_matches(fasta_file, seed_orthologs_file, args):
    """Run DIAMOND on the queries and write one seed ortholog per query.

    The raw DIAMOND output is written to a temporary directory, filtered,
    and the first accepted hit of every query is written to
    ``seed_orthologs_file`` as ``query<TAB>hit<TAB>evalue<TAB>score``.

    Args:
        fasta_file: path of the query sequences passed to DIAMOND with ``-q``.
        seed_orthologs_file: path of the output table to create.
        args: parsed options. Reads ``cpu``, ``seed_ortholog_score``,
            ``seed_ortholog_evalue``, ``excluded_taxa``, ``translate``,
            ``dmnd_db``, ``query_cover``, ``subject_cover``, ``matrix``,
            ``gapopen``, ``gapextend``, ``temp_dir`` and ``no_file_comments``.

    Raises:
        ValueError: if the DIAMOND executable was not located.
        subprocess.CalledProcessError: if DIAMOND exits with a non-zero code.
    """
    cpu = args.cpu
    score_thr = args.seed_ortholog_score
    evalue_thr = args.seed_ortholog_evalue
    excluded_taxa = args.excluded_taxa if args.excluded_taxa else None
    tool = 'blastx' if args.translate else 'blastp'
    dmnd_db = args.dmnd_db if args.dmnd_db else get_eggnog_dmnd_db()
    query_cov = args.query_cover
    subject_cov = args.subject_cover

    # Optional scoring parameters, only passed when set explicitly.
    dmnd_opts = ''
    if args.matrix is not None:
        dmnd_opts += ' --matrix %s' % args.matrix
    if args.gapopen is not None:
        dmnd_opts += ' --gapopen %d' % args.gapopen
    if args.gapextend is not None:
        dmnd_opts += ' --gapextend %d' % args.gapextend

    if not DIAMOND:
        raise ValueError("diamond not found in path")

    tempdir = mkdtemp(prefix='emappertmp_dmdn_', dir=args.temp_dir)
    raw_output_file = pjoin(tempdir, uuid.uuid4().hex)

    # When taxa are excluded more candidates are requested, because hits are
    # discarded afterwards by the taxon filter below.
    hit_limit = '--max-target-seqs 25' if excluded_taxa else '--top 3'

    # The e-value is formatted with %s rather than %f: %f keeps only six
    # decimals, which turns thresholds such as 1e-10 into "0.000000".
    cmd = '%s %s -d %s -q %s --more-sensitive --threads %s -e %s -o %s %s --query-cover %s --subject-cover %s' % (
        DIAMOND, tool, dmnd_db, fasta_file, cpu, evalue_thr,
        raw_output_file, hit_limit, query_cov, subject_cov)
    # Previously these options were computed but never reached the command.
    cmd += dmnd_opts

    print(colorify(' ' + cmd, 'yellow'))

    try:
        with open(raw_output_file + '.stdout', 'w') as STDOUT:
            subprocess.check_call(cmd, shell=True, stdout=STDOUT)

        with open('%s' % seed_orthologs_file, 'w') as OUT:
            if not args.no_file_comments:
                # NOTE(review): the command line is treated as part of the
                # optional file comments - confirm the intended nesting.
                OUT.write(str(get_call_info()) + '\n')
                OUT.write('# ' + str(cmd) + '\n')

            visited = set()
            with open(raw_output_file) as raw_hits:
                for line in raw_hits:
                    if not line.strip() or line.startswith('#'):
                        continue

                    fields = [field.strip() for field in line.split('\t')]
                    query = fields[0]
                    hit = fields[1]
                    evalue = float(fields[10])
                    score = float(fields[11])

                    # Only the first accepted hit of each query is reported.
                    if query in visited:
                        continue
                    if evalue > evalue_thr or score < score_thr:
                        continue
                    if excluded_taxa and hit.startswith("%s." % excluded_taxa):
                        continue

                    visited.add(query)
                    OUT.write('\t'.join(map(str, [query, hit, evalue, score])) + '\n')
    finally:
        # The temporary directory is removed whether or not DIAMOND succeeded.
        shutil.rmtree(tempdir)
# Select the data directory: the EGGNOG_DATA_DIR environment variable is
# applied first and the data_dir argument, when given, is applied after it.
env_data_dir = os.environ.get("EGGNOG_DATA_DIR")
if env_data_dir is not None:
    set_data_path(env_data_dir)
if args.data_dir:
    set_data_path(args.data_dir)

# Disabled steps, kept for reference:
# if args.force or not pexists(pjoin(get_data_path(), 'og2level.tsv.gz')):
#     print colorify('Downloading "og2level.tsv.gz" at %s' %get_data_path(), 'green')
#     download_og2level()

# if 'all' in args.dbs:
#     args.dbs = EGGNOG_DATABASES

# Main annotation database: fetched when forced or not yet present, after
# confirmation unless the allyes option was given.
must_fetch_annotations = args.force or not pexists(pjoin(get_data_path(), 'eggnog.db'))
if not must_fetch_annotations:
    if not args.quiet:
        print(colorify('Skipping eggnog.db database (already present). Use -f to force download', 'lblue'))
elif args.allyes or ask("Download main annotation database?") == 'y':
    print(colorify('Downloading "eggnog.db" at %s...' % get_data_path(), 'green'))
    download_annotations()
else:
    print('Skipping')

# Disabled step, kept for reference:
# if args.force or not pexists(pjoin(get_data_path(), 'OG_fasta')):
#     if args.allyes or ask("Download OG fasta files for annotation refinement (~20GB after decompression)?") == 'y':
#         print colorify('Downloading fasta files " at %s/OG_fasta...' %get_data_path(), 'green')
#         download_groups()
#     else:
def annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args):
    """Annotate every seed ortholog hit and write the annotation table.

    Hit lines from ``seed_orthologs_file`` are annotated in a worker pool and
    written to ``annot_file`` as tab-separated rows. When
    ``args.report_orthologs`` is set, the orthologs of each query are also
    written to ``annot_file + ".orthologs"``.

    Args:
        seed_orthologs_file: path of the seed orthologs table to annotate.
        annot_file: path of the annotation table to create.
        hmm_hits_file: path of an HMM hits file; when it exists, its best OG
            per query is reported instead of the description derived from
            the matching OGs.
        args: parsed options. Reads ``cpu``, ``report_orthologs`` and
            ``no_file_comments`` here and is forwarded to ``iter_hit_lines``.
    """
    HIT_HEADER = [
        "#query_name",
        "seed_eggNOG_ortholog",
        "seed_ortholog_evalue",
        "seed_ortholog_score",
        "best_tax_level",
    ]
    HIT_OG_HEADER = [
        "taxonomic scope",
        "eggNOG OGs",
        "best eggNOG OG",
        "COG Functional cat.",
        "eggNOG free text desc.",
    ]

    start_time = time.time()

    seq2bestOG = {}
    # Initialised so the lookup below is always defined, even though it is
    # only consulted for queries found in seq2bestOG.
    seq2annotOG = {}
    if pexists(hmm_hits_file):
        seq2bestOG = get_seq_hmm_matches(hmm_hits_file)
        seq2annotOG = annota.get_ogs_annotations(
            set(match[0] for match in seq2bestOG.values()))

    print(colorify("Functional annotation of refined hits starts now", 'green'))

    OUT = open(annot_file, "w")
    ORTHOLOGS = None
    try:
        if args.report_orthologs:
            ORTHOLOGS = open(annot_file + ".orthologs", "w")

        if not args.no_file_comments:
            OUT.write('# emapper version: %s emapper DB: %s\n'
                      % (get_version(), get_db_version()))
            # The literal keeps its trailing space and a separating space is
            # added, matching what a two-item print statement emits.
            OUT.write(' '.join(['# command: ./emapper.py ',
                                ' '.join(sys.argv[1:])]) + '\n')
            OUT.write('# time: ' + time.ctime() + '\n')
            # NOTE(review): the header row (it starts with '#') is treated as
            # part of the optional file comments - confirm intended nesting.
            OUT.write('\t'.join(HIT_HEADER + ANNOTATIONS_HEADER + HIT_OG_HEADER) + '\n')

        qn = 0
        pool = multiprocessing.Pool(args.cpu)
        try:
            for result in pool.imap(annotate_hit_line,
                                    iter_hit_lines(seed_orthologs_file, args)):
                qn += 1
                # Progress report on stderr every 500 processed lines.
                if qn % 500 == 0:
                    total_time = time.time() - start_time
                    sys.stderr.write(
                        '%s %s %s\n' % (
                            qn, total_time,
                            "%0.2f q/s (func. annotation)" % (float(qn) / total_time)))
                    sys.stderr.flush()

                if not result:
                    continue

                (query_name, best_hit_name, best_hit_evalue, best_hit_score,
                 annotations, annot_level_max, swallowest_level, match_nogs,
                 orthologs) = result

                if query_name in seq2bestOG:
                    (hitname, evalue, score, qlength, hmmfrom, hmmto,
                     seqfrom, seqto, q_coverage) = seq2bestOG[query_name]
                    bestOG = '%s|%s|%s' % (hitname, evalue, score)
                    og_cat, og_desc = seq2annotOG.get(hitname, ['', ''])
                else:
                    bestOG = 'NA|NA|NA'
                    og_cat, og_desc = annota.get_best_og_description(match_nogs)

                if ORTHOLOGS is not None:
                    ORTHOLOGS.write('\t'.join(
                        map(str, (query_name, ','.join(orthologs)))) + '\n')

                # prepare annotations for printing
                annot_columns = [
                    query_name,
                    best_hit_name,
                    str(best_hit_evalue),
                    str(best_hit_score),
                    LEVEL_NAMES[swallowest_level],
                ]
                for h in ANNOTATIONS_HEADER:
                    if h in annotations:
                        annot_columns.append(','.join(sorted(annotations[h])))
                    else:
                        annot_columns.append('')

                annot_columns.extend([
                    annot_level_max,
                    ','.join(match_nogs),
                    bestOG,
                    og_cat.replace('\n', ''),
                    og_desc.replace('\n', ' '),
                ])

                OUT.write('\t'.join(annot_columns) + '\n')
        finally:
            # Worker processes are stopped even if annotation fails midway.
            pool.terminate()

        elapsed_time = time.time() - start_time
        if not args.no_file_comments:
            OUT.write('# %d queries scanned' % (qn) + '\n')
            OUT.write('# Total time (seconds): %s\n' % str(elapsed_time))
            OUT.write('# Rate: %s\n' % ("%0.2f q/s" % (float(qn) / elapsed_time)))
    finally:
        # Output files are closed on every exit path.
        OUT.close()
        if ORTHOLOGS is not None:
            ORTHOLOGS.close()

    print(colorify(" Processed queries:%s total_time:%s rate:%s" %
                   (qn, elapsed_time, "%0.2f q/s" % (float(qn) / elapsed_time)),
                   'lblue'))
def parse_args(parser):
    """Parse the command line and validate/normalise the options.

    Args:
        parser: an argparse-style parser exposing ``parse_args()`` and
            ``error(message)``.

    Returns:
        The parsed namespace with derived attributes added, among them
        ``call_info``, the per-tool ``*_evalue``/``*_score`` thresholds,
        ``qcov``, ``tax_scope_ids``, ``go_excluded`` and ``cpus_per_worker``.

    Raises:
        EmapperException: if a required database file is not present.
        ValueError: for an invalid ``--go_evidence`` or ``--pfam_realign``.
        SystemExit: via ``sys.exit`` for ``--version``/``--list_taxa`` and via
            ``parser.error`` for invalid option combinations.
    """
    args = parser.parse_args()

    # The environment variable is applied first, the data_dir option after it.
    if "EGGNOG_DATA_DIR" in os.environ:
        set_data_path(os.environ["EGGNOG_DATA_DIR"])
    if args.data_dir:
        set_data_path(args.data_dir)

    if args.version:
        # Fall back to the short version string if the full report fails.
        try:
            version = get_full_version_info()
        except Exception:
            version = get_version()
        print(version)
        sys.exit(0)

    args.call_info = get_call_info()

    if args.list_taxa:
        print_taxa()
        sys.exit(0)

    # A cpu value of 0 means "use every available core".
    if args.cpu == 0:
        args.cpu = multiprocessing.cpu_count()
    multiprocessing.set_start_method(args.mp_start_method)

    if args.resume == True and args.override == True:
        parser.error('Only one of --resume or --override is allowed.')

    # Gene prediction
    if args.training_genome is not None and args.training_file is None:
        parser.error('"--training_genome requires --training_file"')

    if args.training_genome is None and args.training_file is not None:
        if not os.path.isfile(args.training_file):
            parser.error('"--training_file must point to an existing file, if no --training_genome is provided."')

    # Search modes
    if args.mode == SEARCH_MODE_DIAMOND:
        dmnd_db = args.dmnd_db if args.dmnd_db else get_eggnog_dmnd_db()
        if not pexists(dmnd_db):
            print(colorify('DIAMOND database %s not present. Use download_eggnog_database.py to fetch it' % dmnd_db, 'red'))
            raise EmapperException()

        if args.input is not None:
            if args.annotate_hits_table is not None:
                print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_DIAMOND}", 'blue'))
                args.annotate_hits_table = None
        else:
            # the default -m is diamond, but we will consider -m no_search as default when
            # --annotate_hits_table has been provided and -i has not been provided
            if args.annotate_hits_table is not None:
                print(colorify(f"Assuming -m {SEARCH_MODE_NO_SEARCH}", 'blue'))
                args.mode = SEARCH_MODE_NO_SEARCH
            else:
                parser.error('An input fasta file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

    elif args.mode == SEARCH_MODE_MMSEQS2:
        mmseqs_db = args.mmseqs_db if args.mmseqs_db else get_eggnog_mmseqs_db()
        if not pexists(mmseqs_db):
            print(colorify('MMseqs2 database %s not present. Use download_eggnog_database.py to fetch it' % mmseqs_db, 'red'))
            raise EmapperException()

        if not args.input:
            parser.error('An input fasta file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        if args.annotate_hits_table is not None:
            print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_MMSEQS2}", 'blue'))
            args.annotate_hits_table = None

    elif args.mode == SEARCH_MODE_HMMER:
        if not args.input:
            parser.error('An input file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        # Hmmer database
        # NOTE: hmmer database format, name and checking if exists is done within hmmer module
        if not args.db:
            parser.error('HMMER mode requires a target database (-d, --database).')

        if args.itype == ITYPE_CDS:
            args.translate = True

        if (args.itype == ITYPE_GENOME or args.itype == ITYPE_META) and args.genepred == GENEPRED_MODE_SEARCH:
            parser.error('HMMER mode is not compatible with "--genepred search" option.')

        if args.annotate_hits_table is not None:
            print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_HMMER}", 'blue'))
            args.annotate_hits_table = None

        # The literal "none" is normalised to None.
        if args.clean_overlaps is not None:
            if args.clean_overlaps == "none":
                args.clean_overlaps = None

    elif args.mode == SEARCH_MODE_CACHE:
        if args.cache_file is None:
            parser.error('A file with annotations and md5 of queries is required (-c FILE)')
        if args.decorate_gff != DECORATE_GFF_NONE:
            print(colorify("WARNING: no GFF will be created for cache-based annotations. It is not implemented yet, sorry.", 'red'))

        if args.no_annot == True:
            parser.error(f'Cache mode (-m {SEARCH_MODE_CACHE}) should be used to annotate.')

    elif args.mode == SEARCH_MODE_NO_SEARCH:
        if args.no_annot == False and not args.annotate_hits_table:
            parser.error(f'No search mode (-m {SEARCH_MODE_NO_SEARCH}) requires a hits table to annotate (--annotate_hits_table FILE.seed_orthologs)')
        if args.md5 == True and args.input is None:
            parser.error('--md5 requires an input FASTA file (-i FASTA).')

    else:
        parser.error(f'unrecognized search mode (-m {args.mode})')

    # Search thresholds: the generic options are copied to every search tool.
    args.dmnd_evalue = args.mmseqs_evalue = args.hmm_evalue = args.evalue
    # hmm_score is set on args; it was previously bound to a stray local
    # name (args_hmm_score), leaving args.hmm_score unset.
    args.dmnd_score = args.mmseqs_score = args.hmm_score = args.score
    args.qcov = args.query_cover

    # Annotation options
    if args.no_annot == False or args.report_orthologs == True:
        if not pexists(get_eggnogdb_file()):
            print(colorify('Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red'))
            raise EmapperException()

        # NOTE(review): the taxa parsing below is grouped under the
        # annotation check - confirm this is the intended nesting.
        args.tax_scope_ids = parse_tax_scope(args.tax_scope)

        if args.target_taxa is not None:
            args.target_taxa = args.target_taxa.split(",")
        if args.excluded_taxa is not None:
            args.excluded_taxa = args.excluded_taxa.split(",")

    # Sets GO evidence bases
    if args.go_evidence == 'experimental':
        args.go_evidence = set(["EXP","IDA","IPI","IMP","IGI","IEP"])
        args.go_excluded = set(["ND", "IEA"])

    elif args.go_evidence == 'non-electronic':
        args.go_evidence = None
        args.go_excluded = set(["ND", "IEA"])

    elif args.go_evidence == 'all':
        args.go_evidence = None
        args.go_excluded = None

    else:
        raise ValueError('Invalid --go_evidence value')

    # PFAM annotation options
    if args.pfam_realign == PFAM_REALIGN_NONE:
        pass
    elif args.pfam_realign == PFAM_REALIGN_REALIGN or args.pfam_realign == PFAM_REALIGN_DENOVO:
        if not args.input:
            parser.error(f'An input fasta file is required (-i) for --pfam_realign {args.pfam_realign}')
    else:
        raise ValueError(f'Invalid --pfam_realign option {args.pfam_realign}')

    # CPUs are split evenly among all workers of all servers.
    total_workers = args.num_workers * args.num_servers
    if total_workers < 1:
        # Rejected explicitly; the modulo below would otherwise divide by zero.
        parser.error(f"Total workers ({total_workers}) must be at least 1.")
    if args.cpu < total_workers:
        parser.error(f"Less cpus ({args.cpu}) than total workers ({total_workers}) were specified.")
    if args.cpu % total_workers != 0:
        parser.error(f"Number of cpus ({args.cpu}) must be a multiple of total workers ({total_workers}).")

    args.cpus_per_worker = args.cpu // total_workers

    return args