Example #1
0
def dump_diamond_matches(fasta_file, seed_orthologs_file, args):
    """Search ``fasta_file`` with DIAMOND and write one seed ortholog per query.

    DIAMOND runs as ``blastx`` when ``args.translate`` is set and as
    ``blastp`` otherwise, against ``EGGNOG_DMND_DB``.  Its tabular output goes
    to a scratch directory created under ``args.temp_dir`` and is then
    filtered: for every query, the first line (in DIAMOND's output order)
    that passes both thresholds and whose hit name does not start with
    ``"<excluded_taxa>."`` is written to ``seed_orthologs_file`` as
    ``query<TAB>hit<TAB>evalue<TAB>score``.

    Args:
        fasta_file: path handed to ``diamond -q``.
        seed_orthologs_file: path of the output table (overwritten).
        args: options object; reads ``cpu``, ``seed_ortholog_score``,
            ``seed_ortholog_evalue``, ``excluded_taxa``, ``translate``,
            ``temp_dir`` and ``no_file_comments``.

    Raises:
        ValueError: if ``DIAMOND`` is unset or the command exits non-zero.
    """
    import os  # local import: only os.devnull is needed here

    cpu = args.cpu
    score_thr = args.seed_ortholog_score
    evalue_thr = args.seed_ortholog_evalue
    excluded_taxa = args.excluded_taxa if args.excluded_taxa else None
    tool = 'blastx' if args.translate else 'blastp'

    if not DIAMOND:
        raise ValueError("diamond not found in path")

    tempdir = mkdtemp(prefix='emappertmp_dmdn_', dir=args.temp_dir)
    try:
        raw_output_file = pjoin(tempdir, uuid.uuid4().hex)

        # NOTE(review): presumably more targets are requested when taxa are
        # excluded so that some hits survive the taxon filter below - confirm.
        hit_limit = '--max-target-seqs 25' if excluded_taxa else '--top 3'
        cmd = '%s %s -d %s -q %s --more-sensitive --threads %s -e %f -o %s %s' % (
            DIAMOND, tool, EGGNOG_DMND_DB, fasta_file, cpu, evalue_thr,
            raw_output_file, hit_limit)
        print(colorify('  ' + cmd, 'yellow'))

        # The command is a shell string built from caller-supplied paths, so
        # paths containing shell metacharacters are not safe here.
        # Output is discarded via os.devnull rather than PIPE: nothing reads
        # the pipes, so a chatty child could block once a pipe buffer fills.
        with open(os.devnull, 'w') as devnull:
            status = subprocess.call(cmd, shell=True,
                                     stdout=devnull, stderr=devnull)
        if status != 0:
            print(cmd)
            raise ValueError('Error running diamond')

        with open(seed_orthologs_file, 'w') as OUT, \
                open(raw_output_file) as raw_hits:
            if not args.no_file_comments:
                OUT.write('%s\n' % (get_call_info(),))
                OUT.write('# %s\n' % (cmd,))

            visited = set()  # queries that already got their seed ortholog
            for line in raw_hits:
                if not line.strip() or line.startswith('#'):
                    continue
                fields = [field.strip() for field in line.split('\t')]
                query = fields[0]
                hit = fields[1]
                evalue = float(fields[10])
                score = float(fields[11])

                if query in visited:
                    continue

                if evalue > evalue_thr or score < score_thr:
                    continue

                if excluded_taxa and hit.startswith("%s." % excluded_taxa):
                    continue

                visited.add(query)
                OUT.write('\t'.join(map(str, [query, hit, evalue, score])) + '\n')
    finally:
        # Always drop the scratch directory, including when diamond fails.
        shutil.rmtree(tempdir)
Example #2
0
def refine_matches(fasta_file, refine_file, hits_file, args):
    """Write the refined best hit of every query to ``refine_file``.

    Loads the two-column, tab-separated ``OGLEVELS_FILE`` into a dict, then
    iterates over the results yielded by ``process_nog_hits_file``.  Each
    result is indexed as ``(query, best_hit, evalue, score, ...)``; results
    whose best hit is ``'-'`` or ``'ERROR'`` are skipped, the rest are written
    as one tab-separated row.  Progress is reported on stderr every 25
    results.

    Args:
        fasta_file: query sequences, forwarded to ``process_nog_hits_file``.
        refine_file: path of the output table (overwritten).
        hits_file: hits table, forwarded to ``process_nog_hits_file``.
        args: options object; reads ``no_file_comments``, ``translate``,
            ``cpu``, ``excluded_taxa`` and ``temp_dir``.
    """
    refine_header = ['#query_name', 'best_hit_eggNOG_ortholog',
                     'best_hit_evalue', 'best_hit_score']

    print(colorify("Hit refinement starts now", 'green'))
    start_time = time.time()
    og2level = dict(
        tuple(field.strip() for field in line.split('\t'))
        for line in gopen(OGLEVELS_FILE))

    # Number of results consumed so far; stays 0 when nothing is yielded, so
    # an empty input is no longer reported as one scanned query.
    scanned = 0
    with open(refine_file, "w") as OUT:
        if not args.no_file_comments:
            OUT.write('%s\n' % (get_call_info(),))
            OUT.write('\t'.join(refine_header) + '\n')

        for qn, r in enumerate(
                process_nog_hits_file(hits_file,
                                      fasta_file,
                                      og2level,
                                      translate=args.translate,
                                      cpu=args.cpu,
                                      excluded_taxa=args.excluded_taxa,
                                      base_tempdir=args.temp_dir)):
            scanned = qn + 1
            if qn and (qn % 25 == 0):
                total_time = time.time() - start_time
                sys.stderr.write("%s %s %0.2f q/s (refinement)\n" % (
                    scanned, total_time, float(scanned) / total_time))
                sys.stderr.flush()

            query_name = r[0]
            best_hit_name = r[1]
            if best_hit_name == '-' or best_hit_name == 'ERROR':
                continue
            best_hit_evalue = float(r[2])
            best_hit_score = float(r[3])
            OUT.write('\t'.join(
                map(str, (query_name, best_hit_name,
                          best_hit_evalue, best_hit_score))) + '\n')

        elapsed_time = time.time() - start_time
        if not args.no_file_comments:
            OUT.write('# %d queries scanned\n' % scanned)
            OUT.write('# Total time (seconds): %s\n' % (elapsed_time,))
            OUT.write('# Rate: %0.2f q/s\n' % (float(scanned) / elapsed_time))

    print(colorify(" Processed queries:%s total_time:%s rate:%s" %
                   (scanned, elapsed_time,
                    "%0.2f q/s" % (float(scanned) / elapsed_time)), 'lblue'))
Example #3
0
def dump_diamond_matches(fasta_file, seed_orthologs_file, args):
    """Run ``diamond blastp`` on ``fasta_file`` and keep one hit per query.

    DIAMOND's tabular output is written to a scratch directory created under
    ``TEMPDIR`` and then filtered: for every query, the first line (in
    DIAMOND's output order) that passes both thresholds and whose hit name
    does not start with ``"<excluded_taxa>."`` is written to
    ``seed_orthologs_file`` as ``query<TAB>hit<TAB>evalue<TAB>score``.

    Args:
        fasta_file: path handed to ``diamond -q``.
        seed_orthologs_file: path of the output table (overwritten).
        args: options object; reads ``cpu``, ``seed_ortholog_score``,
            ``seed_ortholog_evalue``, ``excluded_taxa`` and
            ``no_file_comments``.

    Raises:
        ValueError: if ``DIAMOND`` is unset or the command exits non-zero.
    """
    import os  # local import: only os.devnull is needed here

    cpu = args.cpu
    score_thr = args.seed_ortholog_score
    evalue_thr = args.seed_ortholog_evalue
    excluded_taxa = args.excluded_taxa if args.excluded_taxa else None

    if not DIAMOND:
        raise ValueError("diamond not found in path")

    tempdir = mkdtemp(prefix='emappertmp_dmdn_', dir=TEMPDIR)
    try:
        raw_output_file = pjoin(tempdir, uuid.uuid4().hex)
        cmd = '%s blastp -d %s -q %s --more-sensitive --threads %s -e %f -o %s --top 3' % (
            DIAMOND, EGGNOG_DMND_DB, fasta_file, cpu, evalue_thr,
            raw_output_file)
        print(colorify('  ' + cmd, 'yellow'))

        # Output is discarded via os.devnull rather than PIPE: nothing reads
        # the pipes, so a chatty child could block once a pipe buffer fills.
        with open(os.devnull, 'w') as devnull:
            status = subprocess.call(cmd, shell=True,
                                     stdout=devnull, stderr=devnull)
        if status != 0:
            print(cmd)
            raise ValueError('Error running diamond')

        with open(seed_orthologs_file, 'w') as OUT, \
                open(raw_output_file) as raw_hits:
            if not args.no_file_comments:
                OUT.write('%s\n' % (get_call_info(),))
                OUT.write('# %s\n' % (cmd,))

            visited = set()  # queries that already got their seed ortholog
            for line in raw_hits:
                if not line.strip() or line.startswith('#'):
                    continue
                fields = [field.strip() for field in line.split('\t')]
                query = fields[0]
                hit = fields[1]
                evalue = float(fields[10])
                score = float(fields[11])

                if query in visited:
                    continue

                if evalue > evalue_thr or score < score_thr:
                    continue

                # Must be `and`: with `or`, every hit was discarded whenever
                # a taxon was excluded, and hits were matched against the
                # literal prefix "None." when no taxon was given.
                if excluded_taxa and hit.startswith("%s." % excluded_taxa):
                    continue

                visited.add(query)
                OUT.write('\t'.join(map(str, [query, hit, evalue, score])) + '\n')
    finally:
        # Always drop the scratch directory, including when diamond fails.
        shutil.rmtree(tempdir)
Example #4
0
def annotate_hmm_matches(hits_file, hits_annot_file, args):
    """Append orthologous-group annotations to every row of ``hits_file``.

    Connects ``annota``, then (only if ``hits_file`` exists) rewrites each
    non-comment, non-blank row into ``hits_annot_file``.  Rows with a real hit
    get the cleaned hit name plus the four values returned by
    ``annota.get_og_annotations``; rows whose hit is ``'ERROR'`` or ``'-'``
    are written with that marker repeated in every column after the query.
    A progress line goes to stderr every 10000 rows.

    Args:
        hits_file: tab-separated table with exactly ten columns per row.
        hits_annot_file: path of the annotated table (overwritten).
        args: options object; only ``no_file_comments`` is read.
    """
    column_names = '''#query_name, hit, level, evalue,
                         sum_score, query_length, hmmfrom, hmmto, seqfrom, seqto, query_coverage,
                         members_in_og, og_description, og_COG_categories'''
    hits_annot_header = [name.strip() for name in column_names.split(',')]

    annota.connect()
    print(colorify("Functional annotation of hits starts now", 'green'))
    start_time = time.time()
    if not pexists(hits_file):
        return

    OUT = open(hits_annot_file, "w")
    if not args.no_file_comments:
        OUT.write('%s\n' % (get_call_info(),))
        OUT.write('\t'.join(hits_annot_header) + '\n')

    qn = 0
    t1 = time.time()
    for line in open(hits_file):
        if line.startswith('#') or not line.strip():
            continue

        qn += 1
        # qn is at least 1 here, so a plain modulo test is sufficient.
        if qn % 10000 == 0:
            total_time = time.time() - start_time
            sys.stderr.write("%s %s %0.2f q/s (refinement)\n" %
                             (qn, total_time, float(qn) / total_time))
            sys.stderr.flush()

        columns = [value.strip() for value in line.split('\t')]
        (query, hit, evalue, sum_score, query_length, hmmfrom, hmmto,
         seqfrom, seqto, q_coverage) = columns

        if hit in ['ERROR', '-']:
            # Propagate the marker to every column after the query name.
            row = [query] + [hit] * (len(hits_annot_header) - 1)
        else:
            hitname = cleanup_og_name(hit)
            level, nm, desc, cats = annota.get_og_annotations(hitname)
            row = map(str, [query, hitname, level, evalue, sum_score,
                            query_length, hmmfrom, hmmto, seqfrom, seqto,
                            q_coverage, nm, desc, cats])
        OUT.write('\t'.join(row) + '\n')

    elapsed_time = time.time() - t1
    rate = "%0.2f q/s" % (float(qn) / elapsed_time)
    if not args.no_file_comments:
        OUT.write('# %d queries scanned\n' % (qn))
        OUT.write('# Total time (seconds): %s\n' % (elapsed_time,))
        OUT.write('# Rate: %s\n' % (rate,))
    OUT.close()
    print(colorify(" Processed queries:%s total_time:%s rate:%s" %
                   (qn, elapsed_time, rate), 'lblue'))
Example #5
0
def annotate_hmm_matches(hits_file, hits_annot_file, args):
    """Append orthologous-group annotations to every row of ``hits_file``.

    Connects ``annota``, then (only if ``hits_file`` exists) rewrites each
    non-comment, non-blank row into ``hits_annot_file``.  Rows with a real hit
    get the cleaned hit name plus the four values returned by
    ``annota.get_og_annotations``; rows whose hit is ``'ERROR'`` or ``'-'``
    are written with that marker repeated in every column after the query.
    A progress line goes to stderr every 10000 rows.

    Args:
        hits_file: tab-separated table with exactly ten columns per row.
        hits_annot_file: path of the annotated table (overwritten).
        args: options object; only ``no_file_comments`` is read.
    """
    hits_annot_header = [
        '#query_name', 'hit', 'level', 'evalue', 'sum_score', 'query_length',
        'hmmfrom', 'hmmto', 'seqfrom', 'seqto', 'query_coverage',
        'members_in_og', 'og_description', 'og_COG_categories',
    ]

    annota.connect()
    print(colorify("Functional annotation of hits starts now", 'green'))
    start_time = time.time()
    if not pexists(hits_file):
        return

    with open(hits_annot_file, "w") as OUT, open(hits_file) as hits:
        if not args.no_file_comments:
            OUT.write('%s\n' % (get_call_info(),))
            OUT.write('\t'.join(hits_annot_header) + '\n')

        qn = 0  # number of data rows processed so far
        t1 = time.time()
        for line in hits:
            if not line.strip() or line.startswith('#'):
                continue

            # Count the row first so that qn is always the exact number of
            # rows seen; the reports below previously used qn + 1 after the
            # increment and therefore overcounted by one.
            qn += 1
            if qn % 10000 == 0:
                total_time = time.time() - start_time
                sys.stderr.write("%s %s %0.2f q/s (refinement)\n" %
                                 (qn, total_time, float(qn) / total_time))
                sys.stderr.flush()

            (query, hit, evalue, sum_score, query_length, hmmfrom, hmmto,
             seqfrom, seqto, q_coverage) = [
                 value.strip() for value in line.split('\t')]

            if hit not in ['ERROR', '-']:
                hitname = cleanup_og_name(hit)
                level, nm, desc, cats = annota.get_og_annotations(hitname)
                row = map(str, [query, hitname, level, evalue, sum_score,
                                query_length, hmmfrom, hmmto, seqfrom, seqto,
                                q_coverage, nm, desc, cats])
            else:
                # Propagate the marker to every column after the query name.
                row = [query] + [hit] * (len(hits_annot_header) - 1)
            OUT.write('\t'.join(row) + '\n')

        elapsed_time = time.time() - t1
        rate = "%0.2f q/s" % (float(qn) / elapsed_time)
        if not args.no_file_comments:
            OUT.write('# %d queries scanned\n' % qn)
            OUT.write('# Total time (seconds): %s\n' % (elapsed_time,))
            OUT.write('# Rate: %s\n' % (rate,))

    print(colorify(" Processed queries:%s total_time:%s rate:%s" %
                   (qn, elapsed_time, rate), 'lblue'))
Example #6
0
def refine_matches(fasta_file, refine_file, hits_file, args):
    """Write the refined best hit of every query to ``refine_file``.

    Loads the two-column, tab-separated ``OGLEVELS_FILE`` into a dict, then
    iterates over the results yielded by ``process_nog_hits_file``.  Each
    result is indexed as ``(query, best_hit, evalue, score, ...)``; results
    whose best hit is ``'-'`` or ``'ERROR'`` are skipped, the rest are written
    as one tab-separated row.  Progress is reported on stderr every 25
    results.

    Args:
        fasta_file: query sequences, forwarded to ``process_nog_hits_file``.
        refine_file: path of the output table (overwritten).
        hits_file: hits table, forwarded to ``process_nog_hits_file``.
        args: options object; reads ``no_file_comments``, ``translate``,
            ``cpu`` and ``excluded_taxa``.
    """
    refine_header = ['#query_name', 'best_hit_eggNOG_ortholog',
                     'best_hit_evalue', 'best_hit_score']

    print(colorify("Hit refinement starts now", 'green'))
    start_time = time.time()
    og2level = dict(
        tuple(field.strip() for field in line.split('\t'))
        for line in gopen(OGLEVELS_FILE))

    # Results consumed so far.  It stays 0 when nothing is yielded; the
    # previous `qn + 1` bookkeeping reported one query for an empty input.
    n_scanned = 0
    with open(refine_file, "w") as OUT:
        if not args.no_file_comments:
            OUT.write('%s\n' % (get_call_info(),))
            OUT.write('\t'.join(refine_header) + '\n')

        results = process_nog_hits_file(hits_file, fasta_file, og2level,
                                        translate=args.translate,
                                        cpu=args.cpu,
                                        excluded_taxa=args.excluded_taxa)
        for qn, r in enumerate(results):
            n_scanned = qn + 1
            if qn and (qn % 25 == 0):
                total_time = time.time() - start_time
                sys.stderr.write("%s %s %0.2f q/s (refinement)\n" % (
                    n_scanned, total_time, float(n_scanned) / total_time))
                sys.stderr.flush()

            query_name, best_hit_name = r[0], r[1]
            if best_hit_name in ('-', 'ERROR'):
                continue
            best_hit_evalue = float(r[2])
            best_hit_score = float(r[3])
            OUT.write('\t'.join(
                map(str, (query_name, best_hit_name,
                          best_hit_evalue, best_hit_score))) + '\n')

        elapsed_time = time.time() - start_time
        rate = "%0.2f q/s" % (float(n_scanned) / elapsed_time)
        if not args.no_file_comments:
            OUT.write('# %d queries scanned\n' % n_scanned)
            OUT.write('# Total time (seconds): %s\n' % (elapsed_time,))
            OUT.write('# Rate: %s\n' % (rate,))

    print(colorify(" Processed queries:%s total_time:%s rate:%s" %
                   (n_scanned, elapsed_time, rate), 'lblue'))
Example #7
0
def get_seq_hmm_matches(hits_file):
    """Return the first usable HMM hit recorded for each query.

    Connects ``annota`` and, if ``hits_file`` exists, reads its
    tab-separated rows (blank lines and lines starting with ``#`` are
    skipped).  Only the first row per query whose hit is neither ``'ERROR'``
    nor ``'-'`` is kept.

    Args:
        hits_file: path of a table with exactly ten columns per row.

    Returns:
        dict mapping each query name to the list ``[hitname, evalue,
        sum_score, query_length, hmmfrom, hmmto, seqfrom, seqto,
        q_coverage]``, where ``hitname`` is ``cleanup_og_name(hit)`` and the
        other items are the stripped column strings.  Empty when the file
        does not exist.

    Raises:
        ValueError: if a data row does not have exactly ten columns.
    """
    annota.connect()
    print(colorify("Reading HMM matches", 'green'))
    seq2oginfo = {}
    if not pexists(hits_file):
        return seq2oginfo

    # Context manager so the handle is released even if a row is malformed.
    with open(hits_file) as hits:
        for line in hits:
            if not line.strip() or line.startswith('#'):
                continue

            (query, hit, evalue, sum_score, query_length, hmmfrom, hmmto,
             seqfrom, seqto, q_coverage) = [
                 value.strip() for value in line.split('\t')]

            if query not in seq2oginfo and hit not in ['ERROR', '-']:
                hitname = cleanup_og_name(hit)
                seq2oginfo[query] = [hitname, evalue, sum_score, query_length,
                                     hmmfrom, hmmto, seqfrom, seqto,
                                     q_coverage]
    return seq2oginfo
Example #8
0
def get_seq_hmm_matches(hits_file):
    """Collect, per query, the first HMM hit that is not an error marker.

    ``annota.connect()`` is called first.  When ``hits_file`` exists, every
    non-blank line that does not start with ``#`` is split on tabs into ten
    stripped columns; a query is recorded the first time it appears with a
    hit other than ``'ERROR'`` or ``'-'``, and later rows for it are ignored.

    Args:
        hits_file: path of the tab-separated hits table.

    Returns:
        dict of ``query -> [hitname, evalue, sum_score, query_length,
        hmmfrom, hmmto, seqfrom, seqto, q_coverage]``; ``hitname`` comes from
        ``cleanup_og_name(hit)``, all other items are the column strings.
        The dict is empty if ``hits_file`` does not exist.

    Raises:
        ValueError: if a data row does not split into exactly ten columns.
    """
    annota.connect()
    print(colorify("Reading HMM matches", 'green'))
    seq2oginfo = {}

    if pexists(hits_file):
        # `with` closes the file on every exit path; the unused timing and
        # hit-name locals of the earlier version are gone.
        with open(hits_file) as handle:
            for line in handle:
                if not line.strip() or line.startswith('#'):
                    continue

                columns = [value.strip() for value in line.split('\t')]
                (query, hit, evalue, sum_score, query_length, hmmfrom, hmmto,
                 seqfrom, seqto, q_coverage) = columns

                if query in seq2oginfo or hit in ['ERROR', '-']:
                    continue
                seq2oginfo[query] = [cleanup_og_name(hit), evalue, sum_score,
                                     query_length, hmmfrom, hmmto, seqfrom,
                                     seqto, q_coverage]
    return seq2oginfo
                        action="store_true",
                        dest='quiet',
                        help='quiet_mode')

    parser.add_argument("--data_dir",
                        metavar='',
                        type=existing_dir,
                        help='Directory to use for DATA_PATH.')

    args = parser.parse_args()

    if args.data_dir:
        set_data_path(args.data_dir)

    if args.force or not pexists(pjoin(get_data_path(), 'og2level.tsv.gz')):
        print colorify('Downloading "og2level.tsv.gz" at %s' % get_data_path(),
                       'green')
        download_og2level()

    if 'all' in args.dbs:
        args.dbs = EGGNOG_DATABASES

    if args.force or not pexists(pjoin(get_data_path(), 'eggnog.db')):
        if args.allyes or ask("Download main annotation database?") == 'y':
            print colorify(
                'Downloading "eggnog.db" at %s...' % get_data_path(), 'green')
            download_annotations()
        else:
            print 'Skipping'

    else:
        if not args.quiet:
Example #10
0
def parse_args(parser):
    """Parse the command line and validate the combination of options.

    Besides validation, a few options are normalised in place:
    ``cpu == 0`` becomes the machine's CPU count, ``servermode`` forces
    ``usemem``, ``annotate_hits_table`` forces ``no_search`` and clears
    ``no_annot``, and ``guessdb`` is resolved into ``db``.

    Args:
        parser: the ``argparse.ArgumentParser`` holding the option definitions.

    Returns:
        The parsed (and possibly adjusted) arguments namespace.

    Raises:
        emapperException: if a required database file is missing.
        SystemExit: after printing the version, or through ``parser.error``
            on an invalid option combination.
    """
    args = parser.parse_args()

    if args.version:
        print(get_version())
        sys.exit(0)

    if not args.no_annot and not pexists(EGGNOGDB_FILE):
        print(colorify(
            'Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it',
            'red'))
        raise emapperException()

    if args.mode == 'diamond' and not pexists(EGGNOG_DMND_DB):
        print(colorify(
            'DIAMOND database data/eggnog_proteins.dmnd not present. Use download_eggnog_database.py to fetch it',
            'red'))
        raise emapperException()

    if args.cpu == 0:
        args.cpu = multiprocessing.cpu_count()

    # No --servermode available for diamond
    if args.mode == 'diamond' and args.servermode:
        parser.error(
            '--mode [diamond] and --servermode are mutually exclusive')

    # Output file required unless running in servermode
    if not args.servermode and not args.output:
        parser.error('An output project name is required (-o)')

    # Servermode implies using mem-based databases
    if args.servermode:
        args.usemem = True

    # Direct annotation implies no searches
    if args.annotate_hits_table:
        args.no_search = True
        args.no_annot = False

    # Check inputs for running sequence searches
    if not args.no_search and not args.servermode:
        if not args.input:
            parser.error('An input fasta file is required (-i)')

        # HMM
        if args.mode == 'hmmer':
            if not args.db and not args.guessdb:
                parser.error(
                    'HMMER mode requires specifying a target database (i.e. -d, --guessdb ))'
                )
            if args.db and args.guessdb:
                parser.error('-d and --guessdb options are mutually exclusive')

            if args.guessdb:
                from ete3 import NCBITaxa
                ncbi = NCBITaxa()
                lineage = ncbi.get_lineage(args.guessdb)
                # Walk the lineage from the last entry backwards and take the
                # first taxid that has a known level database.
                for tid in reversed(lineage):
                    if tid in TAXID2LEVEL:
                        print("%s %s" % (tid, TAXID2LEVEL[tid]))
                        args.db = TAXID2LEVEL[tid]
                        break
                else:
                    # Without this, args.db silently stayed unset and the
                    # failure only surfaced later, far from its cause.
                    parser.error(
                        'Could not guess a target database for taxid %s' %
                        args.guessdb)
        # DIAMOND
        elif args.mode == 'diamond':
            # -d / --guessdb are currently tolerated (and ignored) here.
            pass

    return args
Example #11
0
                        help='assume "yes" to all questions')

    parser.add_argument(
        '-s',
        action="store_true",
        dest='simulate',
        help='simulate and print commands. Nothing is downloaded')

    ##

    args = parser.parse_args()

    if args.dbname is None or args.dbname == "":
        print(
            colorify(
                f'A prefix name for the DB to be created is required. Use the --dbname option.',
                'red'))
        sys.exit(1)

    if (args.taxids is None or args.taxids == "") and (args.taxa is None
                                                       or args.taxa == ""):
        print(
            colorify(f'Either --taxids or --taxa parameter is required',
                     'red'))
        sys.exit(1)

    if (args.taxids is not None and
            args.taxids != "") and (args.taxa is not None and args.taxa != ""):
        print(colorify(f'Use either --taxids or --taxa, not both', 'red'))
        sys.exit(1)
Example #12
0
def parse_args(parser):
    """Parse the command line, validate it and derive secondary settings.

    Steps, in order: apply the data directory (``EGGNOG_DATA_DIR`` first,
    then ``--data_dir``), handle ``--version`` and ``--list_taxa`` (both
    exit), validate the options of the selected search mode, copy the generic
    thresholds onto the per-tool attributes, prepare annotation options,
    expand ``--go_evidence`` into evidence/excluded sets, validate the PFAM
    options and finally compute ``args.cpus_per_worker``.

    Args:
        parser: the ``argparse.ArgumentParser`` holding the option definitions.

    Returns:
        The parsed arguments namespace with the derived attributes set.

    Raises:
        EmapperException: if a required database file is missing.
        ValueError: on an invalid ``--go_evidence``, ``--pfam_transfer`` or
            ``--pfam_realign`` value.
        SystemExit: after ``--version`` / ``--list_taxa``, or through
            ``parser.error`` on an invalid option combination.
    """
    args = parser.parse_args()

    # The command-line option takes precedence over the environment variable
    # because it is applied last.
    if "EGGNOG_DATA_DIR" in os.environ:
        set_data_path(os.environ["EGGNOG_DATA_DIR"])

    if args.data_dir:
        set_data_path(args.data_dir)

    if args.version:
        version = ""
        try:
            version = get_full_version_info()
        except Exception:
            # Fall back to the short version string if the full report fails.
            version = get_version()
        print(version)
        sys.exit(0)

    args.call_info = get_call_info()

    if args.list_taxa:
        from eggnogmapper.vars import LEVEL_DEPTH, LEVEL_DICT, LEVEL_NAMES, LEVEL_PARENTS
        print("tax_name\ttax_id\tdepth\tparents\tparents_names")
        for tax_name, tax_id in LEVEL_DICT.items():
            depth = LEVEL_DEPTH.get(tax_id, "-")
            parents = LEVEL_PARENTS.get(tax_id, "-")
            parents_names = [LEVEL_NAMES.get(x, "-") for x in parents]
            print(f"{tax_name}\t{tax_id}\t{depth}\t{','.join(parents)}\t{','.join(parents_names)}")
        sys.exit(0)

    if args.cpu == 0:
        args.cpu = multiprocessing.cpu_count()

    # translate
    if args.itype in [ITYPE_GENOME, ITYPE_META, ITYPE_PROTS] and args.translate == True:
        parser.error('"--translate" only can be used with "--itype CDS"')

    # Gene prediction
    if args.training_genome is not None and args.training_file is None:
        parser.error('"--training_genome requires --training_file"')

    if args.training_genome is None and args.training_file is not None:
        if not os.path.isfile(args.training_file):
            parser.error('"--training_file must point to an existing file, if no --training_genome is provided."')

    # Search modes
    if args.mode == SEARCH_MODE_DIAMOND:
        dmnd_db = args.dmnd_db if args.dmnd_db else get_eggnog_dmnd_db()
        if not pexists(dmnd_db):
            print(colorify('DIAMOND database %s not present. Use download_eggnog_database.py to fetch it' % dmnd_db, 'red'))
            raise EmapperException()

        if args.input is not None:
            if args.annotate_hits_table is not None:
                print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_DIAMOND}", 'blue'))
                args.annotate_hits_table = None
        else:
            # the default -m is diamond, but we will consider -m no_search as default when
            # --annotate_hits_table has been provided and -i has not been provided
            if args.annotate_hits_table is not None:
                print(colorify(f"Assuming -m {SEARCH_MODE_NO_SEARCH}", 'blue'))
                args.mode = SEARCH_MODE_NO_SEARCH
            else:
                parser.error('An input fasta file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        if args.resume == True:
            print(colorify("Diamond jobs cannot be resumed. --resume will be ignored.", 'blue'))
            args.resume = False

    elif args.mode == SEARCH_MODE_MMSEQS2:
        mmseqs_db = args.mmseqs_db if args.mmseqs_db else get_eggnog_mmseqs_db()
        if not pexists(mmseqs_db):
            print(colorify('MMseqs2 database %s not present. Use download_eggnog_database.py to fetch it' % mmseqs_db, 'red'))
            raise EmapperException()

        if not args.input:
            parser.error('An input fasta file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        if args.resume == True:
            print(colorify("MMseqs2 jobs cannot be resumed. --resume will be ignored.", 'blue'))
            args.resume = False

        if args.annotate_hits_table is not None:
            print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_MMSEQS2}", 'blue'))
            args.annotate_hits_table = None

    elif args.mode == SEARCH_MODE_HMMER:
        # The cpus/workers consistency check is done once, for every mode, at
        # the end of this function.

        if not args.input:
            parser.error('An input file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        # Hmmer database
        # NOTE: hmmer database format, name and checking if exists is done within hmmer module
        if not args.db:
            parser.error('HMMER mode requires a target database (-d, --database).')

        if args.itype == ITYPE_CDS:
            args.translate = True

        if (args.itype == ITYPE_GENOME or args.itype == ITYPE_META) and args.genepred == GENEPRED_MODE_SEARCH:
            parser.error('HMMER mode is not compatible with "--genepred search" option.')

        if args.annotate_hits_table is not None:
            print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_HMMER}", 'blue'))
            args.annotate_hits_table = None

        # The literal string "none" is normalised to None.
        if args.clean_overlaps is not None:
            if args.clean_overlaps == "none":
                args.clean_overlaps = None

    elif args.mode == SEARCH_MODE_CACHE:
        if args.cache_file is None:
            parser.error('A file with annotations and md5 of queries is required (-c FILE)')
        if args.decorate_gff != DECORATE_GFF_NONE:
            print(colorify("WARNING: no GFF will be created for cache-based annotations. It is not implemented yet, sorry.", 'red'))

        if args.no_annot == True:
            parser.error(f'Cache mode (-m {SEARCH_MODE_CACHE}) should be used to annotate.')

    elif args.mode == SEARCH_MODE_NO_SEARCH:
        if args.no_annot == False and not args.annotate_hits_table:
            parser.error(f'No search mode (-m {SEARCH_MODE_NO_SEARCH}) requires a hits table to annotate (--annotate_hits_table FILE.seed_orthologs)')
        if args.md5 == True and args.input is None:
            parser.error(f'--md5 requires an input FASTA file (-i FASTA).')
        # Running with --no_annot and without --report_orthologs is accepted
        # here even though it leaves nothing to do.

    else:
        parser.error(f'unrecognized search mode (-m {args.mode})')

    # Search thresholds: the generic options feed every search tool.
    args.dmnd_evalue = args.mmseqs_evalue = args.hmm_evalue = args.evalue
    # `args.hmm_score` was previously misspelled `args_hmm_score`, which
    # created a throwaway local and left the attribute unset.
    args.dmnd_score = args.mmseqs_score = args.hmm_score = args.score
    args.qcov = args.query_cover

    # Annotation options
    if args.no_annot == False or args.report_orthologs == True:
        if not pexists(get_eggnogdb_file()):
            print(colorify('Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red'))
            raise EmapperException()

        args.tax_scope_mode, args.tax_scope_id = __parse_tax_scope(args.tax_scope)
        if args.target_taxa is not None:
            args.target_taxa = args.target_taxa.split(",")
        if args.excluded_taxa is not None:
            args.excluded_taxa = args.excluded_taxa.split(",")

    # Sets GO evidence bases
    if args.go_evidence == 'experimental':
        args.go_evidence = set(["EXP","IDA","IPI","IMP","IGI","IEP"])
        args.go_excluded = set(["ND", "IEA"])

    elif args.go_evidence == 'non-electronic':
        args.go_evidence = None
        args.go_excluded = set(["ND", "IEA"])

    elif args.go_evidence == 'all':
        args.go_evidence = None
        args.go_excluded = None

    else:
        raise ValueError('Invalid --go_evidence value')

    # PFAM annotation options
    if args.pfam_transfer not in [PFAM_TRANSFER_BEST_OG, PFAM_TRANSFER_NARROWEST_OG, PFAM_TRANSFER_SEED_ORTHOLOG]:
        raise ValueError(f'Invalid --pfam_transfer option {args.pfam_transfer}')

    if args.pfam_realign == PFAM_REALIGN_NONE:
        pass
    elif args.pfam_realign == PFAM_REALIGN_REALIGN or args.pfam_realign == PFAM_REALIGN_DENOVO:
        if not args.input:
            parser.error(f'An input fasta file is required (-i) for --pfam_realign {args.pfam_realign}')
    else:
        raise ValueError(f'Invalid --pfam_realign option {args.pfam_realign}')

    # CPUs are split evenly between all workers of all servers.  Reject
    # non-positive counts up front: a zero product would otherwise raise
    # ZeroDivisionError in the modulo below.
    if args.num_workers < 1 or args.num_servers < 1:
        parser.error(f"Number of workers ({args.num_workers}) and servers ({args.num_servers}) must both be at least 1.")
    total_workers = args.num_workers * args.num_servers
    if args.cpu < total_workers:
        parser.error(f"Less cpus ({args.cpu}) than total workers ({total_workers}) were specified.")
    if args.cpu % total_workers != 0:
        parser.error(f"Number of cpus ({args.cpu}) must be a multiple of total workers ({total_workers}).")

    args.cpus_per_worker = int(args.cpu / total_workers)

    return args
Example #13
0
def setup_hmm_search(args):
    """Resolve the HMM database to search and, if needed, start a local server.

    ``args.db`` is interpreted in one of three ways, tried in this order:
    a name listed in ``EGGNOG_DATABASES``, the path prefix of a custom
    database (a ``<db>.h3f`` file must exist), or a ``dbname:host:port``
    string pointing at an already running server.

    When memory-based scanning is used and no external server was given, a
    server is started on the first port pair that becomes functional, and the
    idmap file is loaded into a dict.

    Reads ``args.usemem``, ``args.db``, ``args.no_refine``, ``args.dbtype``,
    ``args.cpu`` and ``args.servermode``; may overwrite ``args.db``.

    Returns:
        ``(host, port, dbpath, scantype, idmap)``; ``idmap`` is ``None`` for
        disk scans.  In server mode the function never returns.

    Raises:
        ValueError: missing database files, missing/uncreatable idmap, or an
            ``args.db`` value matching none of the three forms.
    """
    host = 'localhost'
    idmap = None
    if args.usemem:
        scantype = 'mem'
    else:
        scantype = 'disk'

    connecting_to_server = False
    # If searching against a predefined database name
    if args.db in EGGNOG_DATABASES:
        dbpath, port = get_db_info(args.db)
        print dbpath
        # One flag per required file extension of the database
        db_present = [
            pexists(dbpath + "." + ext)
            for ext in 'h3f h3i h3m h3p idmap'.split()
        ]

        if False in db_present:
            print db_present
            print colorify(
                'Database %s not present. Use download_eggnog_database.py to fetch it'
                % (args.db), 'red')
            raise ValueError('Database not found')

        if not args.no_refine:
            if not pexists(pjoin(get_data_path(), 'OG_fasta')):
                print colorify(
                    'Database data/OG_fasta/ not present. Use download_eggnog_database.py to fetch it',
                    'red')
                raise ValueError('Database not found')

        # idmap_file and end_port are only defined (and only used) for
        # memory-based scans
        if scantype == 'mem':
            idmap_file = dbpath + '.idmap'
            end_port = 53200

    # If searching against a custom hmm database
    elif os.path.isfile(args.db + '.h3f'):
        dbpath = args.db
        if scantype == 'mem':
            idmap_file = args.db + ".idmap"
            if not pexists(idmap_file):
                if generate_idmap(args.db):
                    idmap_file = args.db + ".idmap"
                    print >> sys.stderr, "idmap succesfully created!"
                else:
                    raise ValueError("idmap could not be created!")
            port = 53000
            end_port = 53200
        else:
            idmap_file = None
            port = None

    # If searching against a emapper hmm server
    elif ":" in args.db:
        dbname, host, port = map(str.strip, args.db.split(":"))
        scantype = 'mem'
        port = int(port)
        if dbname in EGGNOG_DATABASES:
            # NOTE(review): this rebinds `port`, discarding the port parsed
            # from args.db just above - confirm this is intended.
            dbfile, port = get_db_info(dbname)
            args.db = dbname
        else:
            dbfile = dbname

        idmap_file = dbfile + '.idmap'
        if not pexists(idmap_file):
            raise ValueError("idmap file not found: %s" % idmap_file)

        dbpath = host
        if not server_functional(host, port, args.dbtype):
            print colorify(
                "eggnog-mapper server not found at %s:%s" % (host, port),
                'red')
            exit(1)
        connecting_to_server = True
    else:
        raise ValueError('Invalid database name/server')

    # If memory based searches requested, start server
    if scantype == "mem" and not connecting_to_server:
        master_db, worker_db = None, None
        # Ports are tried in pairs: try_port and try_port + 1 are both passed
        # to load_server
        for try_port in range(port, end_port, 2):
            print colorify(
                "Loading server at localhost, port %s-%s" %
                (try_port, try_port + 1), 'lblue')
            dbpath, master_db, worker_db = load_server(dbpath, try_port,
                                                       try_port + 1, args.cpu)
            port = try_port
            ready = False
            # Poll once per second, at most TIMEOUT_LOAD_SERVER times
            for _ in xrange(TIMEOUT_LOAD_SERVER):
                print "Waiting for server to become ready...", host, try_port
                time.sleep(1)
                if not master_db.is_alive() or not worker_db.is_alive():
                    # One of the two processes died: stop both and move on to
                    # the next port pair
                    master_db.terminate()
                    master_db.join()
                    worker_db.terminate()
                    worker_db.join()
                    break
                elif server_functional(host, port, args.dbtype):
                    ready = True
                    break
            if ready:
                dbpath = host
                break
        # NOTE(review): if no port pair ever becomes ready, execution simply
        # continues below without raising - confirm this is intended.
    elif scantype == "mem":
        print colorify("DB Server already running or not needed!", 'yellow')
        dbpath = host

    # Preload seqid map to translate hits from hmmpgmd
    if scantype == "mem":
        print colorify("Reading idmap %s" % idmap_file, color='lblue')
        idmap = {}
        for _lnum, _line in enumerate(open(idmap_file)):
            if not _line.strip():
                continue
            try:
                # Each line must hold exactly two space-separated fields
                _seqid, _seqname = map(str, _line.strip().split(' '))
            except ValueError:
                if _lnum == 0:
                    # idmap generated by esl_reformat has info line at beginning
                    continue
                else:
                    raise
            _seqid = int(_seqid)
            idmap[_seqid] = [_seqname]
        print len(idmap), "names loaded"

    # If server mode, just listen for connections and exit when interrupted
    if args.servermode:
        while True:
            print colorify(
                "Server ready listening at %s:%s and using %d CPU cores" %
                (host, port, args.cpu), 'green')
            print colorify(
                "Use `emapper.py -d %s:%s:%s (...)` to search against this server"
                % (args.db, host, port), 'lblue')
            time.sleep(10)
        # Not reachable by normal flow: the loop above has no break
        raise emapperException()
    else:
        return host, port, dbpath, scantype, idmap
    args = parser.parse_args()

    if "EGGNOG_DATA_DIR" in os.environ:
        set_data_path(os.environ["EGGNOG_DATA_DIR"])

    if args.data_dir:
        set_data_path(args.data_dir)

    data_path = get_data_path()

    ##
    # Annotation DB
    
    if args.force or not pexists(get_eggnogdb_file()):
        if args.allyes or ask("Download main annotation database?") == 'y':
            print(colorify(f'Downloading "eggnog.db" at {data_path}...', 'green'))
            download_annotations(data_path)
        else:
            print('Skipping')
    else:
        if not args.quiet:
            print(colorify('Skipping eggnog.db database (already present). Use -f to force download', 'lblue'))

    ##
    # NCBI taxa
    
    if args.force or not pexists(get_ncbitaxadb_file()):
        if args.allyes or ask("Download taxa database?") == 'y':
            print(colorify(f'Downloading "eggnog.taxa.db" at {data_path}...', 'green'))
            download_taxa(data_path)
        else:
Example #15
0
        dbpath, host, idmap_file = setup_custom_db(args.db,
                                                   scantype=SCANTYPE_MEM,
                                                   dbtype=args.dbtype)

        host = 'localhost'
        port = args.port
        end_port = args.end_port
        wport = args.wport

        dbpath, host, port, servers = create_servers(args.dbtype, dbpath, host,
                                                     port, end_port,
                                                     args.num_servers,
                                                     args.num_workers,
                                                     args.cpus_per_worker)

        print(colorify("All servers ready and listening", 'green'))
        if args.output_servers_list is not None:
            print(f"Creating servers list file: {args.output_servers_list}")
            with open(args.output_servers_list, 'w') as outfn:
                for server in servers:
                    print(f"{server[0]}:{server[1]}", file=outfn)
            print(f"File {args.output_servers_list} created successfully.")

            print(
                colorify(
                    f"Use `emapper.py (-d db:host:port or --servers_list {args.output_servers_list}) to search against these servers",
                    'lblue'))
        else:
            print(
                colorify(
                    "Use `emapper.py (-d db:host:port or --servers_list FILE) to search against these servers",
Example #16
0
def run(cmd):
    print colorify(cmd, 'cyan')
    if not args.simulate:
        os.system(cmd)
Example #17
0
def annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args):
    annot_header = (
        "#query_name",
        "seed_eggNOG_ortholog",
        "seed_ortholog_evalue",
        "seed_ortholog_score",
        "predicted_gene_name",
        "GO_terms",
        "KEGG_KOs",
        "BiGG_reactions",
        "Annotation_tax_scope",
        "OGs",
        "bestOG|evalue|score",
        "COG cat",
        "eggNOG annot",
    )
    start_time = time.time()
    seq2bestOG = {}
    if pexists(hmm_hits_file):
        seq2bestOG = get_seq_hmm_matches(hmm_hits_file)

    seq2annotOG = annota.get_ogs_annotations(
        set([v[0] for v in seq2bestOG.itervalues()]))

    print colorify("Functional annotation of refined hits starts now", 'green')

    OUT = open(annot_file, "w")

    if args.report_orthologs:
        ORTHOLOGS = open(annot_file + ".orthologs", "w")

    if not args.no_file_comments:
        print >> OUT, '# emapper version:', get_version(
        ), 'emapper DB:', get_db_version()
        print >> OUT, '# command: ./emapper.py ', ' '.join(sys.argv[1:])
        print >> OUT, '# time: ' + time.ctime()
        print >> OUT, '\t'.join(annot_header)
    qn = 0
    pool = multiprocessing.Pool(args.cpu)
    for result in pool.imap(annotate_hit_line,
                            iter_hit_lines(seed_orthologs_file, args)):
        qn += 1
        if qn and (qn % 500 == 0):
            total_time = time.time() - start_time
            print >> sys.stderr, qn, total_time, "%0.2f q/s (refinement)" % (
                (float(qn) / total_time))
            sys.stderr.flush()

        if result:
            (query_name, best_hit_name, best_hit_evalue, best_hit_score,
             best_name, gos, kegg, bigg, annot_level_max, match_nogs,
             orthologs) = result

            if query_name in seq2bestOG:
                (hitname, evalue, score, qlength, hmmfrom, hmmto, seqfrom,
                 seqto, q_coverage) = seq2bestOG[query_name]
                bestOG = '%s|%s|%s' % (hitname, evalue, score)
                og_cat, og_desc = seq2annotOG.get(hitname, ['', ''])
            else:
                bestOG = 'NA|NA|NA'
                og_cat, og_desc = annota.get_best_og_description(match_nogs)

            if args.report_orthologs:
                print >> ORTHOLOGS, '\t'.join(
                    map(str, (query_name, ','.join(orthologs))))

            print >> OUT, '\t'.join(
                map(str, (
                    query_name,
                    best_hit_name,
                    best_hit_evalue,
                    best_hit_score,
                    best_name,
                    ','.join(sorted(gos)),
                    ','.join(sorted(kegg)),
                    ','.join(sorted(bigg)),
                    annot_level_max,
                    ','.join(match_nogs),
                    bestOG,
                    og_cat.replace('\n', ''),
                    og_desc.replace('\n', ' '),
                )))

        OUT.flush()

    pool.terminate()

    elapsed_time = time.time() - start_time
    if not args.no_file_comments:
        print >> OUT, '# %d queries scanned' % (qn)
        print >> OUT, '# Total time (seconds):', elapsed_time
        print >> OUT, '# Rate:', "%0.2f q/s" % ((float(qn) / elapsed_time))
    OUT.close()

    if args.report_orthologs:
        ORTHOLOGS.close()

    print colorify(" Processed queries:%s total_time:%s rate:%s" %\
                   (qn, elapsed_time, "%0.2f q/s" % ((float(qn) / elapsed_time))), 'lblue')
Example #18
0
def setup_hmm_search(args):
    host = 'localhost'
    idmap = None
    if args.usemem:
        scantype = 'mem'
    else:
       scantype = 'disk'

    connecting_to_server = False
    # If searching against a predefined database name
    if args.db in EGGNOG_DATABASES:
        dbpath, port = get_db_info(args.db)
        db_present = [pexists(dbpath + "." + ext)
                      for ext in 'h3f h3i h3m h3p idmap'.split()]

        if False in db_present:
            print db_present
            print colorify('Database %s not present. Use download_eggnog_database.py to fetch it' % (args.db), 'red')
            raise ValueError('Database not found')

        if not args.no_refine:
            if not pexists(pjoin(DATA_PATH, 'OG_fasta')):
                print colorify('Database data/OG_fasta/ not present. Use download_eggnog_database.py to fetch it', 'red')
                raise ValueError('Database not found')

        if scantype == 'mem':
            idmap_file = dbpath + '.idmap'
            end_port = 53200

    # If searching against a custom hmm database
    elif os.path.isfile(args.db + '.h3f'):
        dbpath = args.db
        if scantype == 'mem':
            idmap_file = args.db + ".idmap"
            if not pexists(idmap_file):
                if generate_idmap(args.db):
                    idmap_file = args.db + ".idmap"
                    print >>sys.stderr, "idmap succesfully created!"
                else:
                    raise ValueError("idmap could not be created!")
            port = 53000
            end_port = 53200
        else:
            idmap_file = None
            port = None

    # If searching against a emapper hmm server
    elif ":" in args.db:
        dbname, host, port = map(str.strip, args.db.split(":"))
        scantype = 'mem'
        port = int(port)
        if dbname in EGGNOG_DATABASES:
            dbfile, port = get_db_info(dbname)
            args.db = dbname
        else:
            dbfile = dbname

        idmap_file = dbfile + '.idmap'
        if not pexists(idmap_file):
            raise ValueError("idmap file not found: %s" % idmap_file)

        dbpath = host
        if not server_functional(host, port, args.dbtype):
            print colorify("eggnog-mapper server not found at %s:%s" % (host, port), 'red')
            exit(1)
        connecting_to_server = True
    else:
        raise ValueError('Invalid database name/server')


    # If memory based searches requested, start server
    if scantype == "mem" and not connecting_to_server:
        master_db, worker_db = None, None
        for try_port in range(port, end_port, 2):
            print colorify("Loading server at localhost, port %s-%s" %
                           (try_port, try_port + 1), 'lblue')
            dbpath, master_db, worker_db = load_server(
                dbpath, try_port, try_port + 1, args.cpu)
            port = try_port
            ready = False
            for _ in xrange(TIMEOUT_LOAD_SERVER):
                print "Waiting for server to become ready...", host, try_port
                time.sleep(1)
                if not master_db.is_alive() or not worker_db.is_alive():
                    master_db.terminate()
                    master_db.join()
                    worker_db.terminate()
                    worker_db.join()
                    break
                elif server_functional(host, port, args.dbtype):
                    ready = True
                    break
            if ready:
                dbpath = host
                break
    elif scantype == "mem":
        print colorify("DB Server already running or not needed!", 'yellow')
        dbpath = host

    # Preload seqid map to translate hits from hmmpgmd
    if scantype == "mem":
        print colorify("Reading idmap %s" % idmap_file, color='lblue')
        idmap = {}
        for _lnum, _line in enumerate(open(idmap_file)):
            if not _line.strip():
                continue
            try:
                _seqid, _seqname = map(str, _line.strip().split(' '))
            except ValueError:
                if _lnum == 0:
                    # idmap generated by esl_reformat has info line at beginning
                    continue  
                else:
                    raise
            _seqid = int(_seqid)
            idmap[_seqid] = [_seqname]
        print len(idmap), "names loaded"

    # If server mode, just listen for connections and exit when interrupted
    if args.servermode:
        while True:
            print colorify("Server ready listening at %s:%s and using %d CPU cores" % (host, port, args.cpu), 'green')
            print colorify("Use `emapper.py -d %s:%s:%s (...)` to search against this server" % (args.db, host, port), 'lblue')
            time.sleep(10)
        raise emapperException()
    else:
        return host, port, dbpath, scantype, idmap
Example #19
0
    return __author__+" "+__license__+" : "+__description__

if __name__ == "__main__":

    parser = create_arg_parser()
    args = parse_args(parser)

    _total_time = time.time()
    try:
        
        print('# ', get_version())
        print('# hmm_worker.py ', ' '.join(sys.argv[1:]))

        worker_db = None
    
        print(colorify(f"Loading worker at localhost, port {args.port}, connecting to {args.host}", 'green'))
        worker_db = load_worker(args.host, args.port, args.cpu)
    
        ready = False
        for _ in range(TIMEOUT_LOAD_SERVER):
            print(f"Waiting for worker to become ready at localhost:{args.port} ...")
            time.sleep(1)
            if worker_db.is_alive():
                break
            else:
                worker_db.terminate()
                worker_db.join()
                print(colorify("worker not alive"), 'red')
                break
        
        print(colorify("Worker of master %s ready listening at localhost:%s and using %d CPU cores" % (args.host, args.port, args.cpu), 'lblue'))
Example #20
0
def dump_hmm_matches(fasta_file, hits_file, dbpath, port, scantype, idmap, args):
    hits_header = ("#query_name", "hit", "evalue", "sum_score", "query_length",
                   "hmmfrom", "hmmto", "seqfrom", "seqto", "query_coverage")

    # Cache previous results if resuming is enabled
    VISITED = set()
    if args.resume and pexists(hits_file):
        print colorify("Resuming previous run. Reading computed output from %s" % hits_file, 'yellow')
        VISITED = set([line.split('\t')[0].strip()
                       for line in open(hits_file) if not line.startswith('#')])
        print len(VISITED), 'queries skipped'
        OUT = open(hits_file, 'a')
    else:
        OUT = open(hits_file, 'w')

    print colorify("Sequence mapping starts now!", 'green')
    if not args.no_file_comments:
        print >>OUT, get_call_info()
        print >>OUT, '# ' + '\t'.join(hits_header)
    total_time = 0
    last_time = time.time()
    start_time = time.time()
    qn = 0

    for qn, (name, elapsed, hits, querylen, seq) in enumerate(search.iter_hits(
                                                        fasta_file,
                                                        args.translate,
                                                        args.qtype,
                                                        args.dbtype,
                                                        scantype,
                                                        dbpath,
                                                        port,
                                                        evalue_thr=args.evalue,
                                                        score_thr=args.score,
                                                        qcov_thr=args.qcov,
                                                        fixed_Z=args.Z,
                                                        max_hits=args.maxhits,
                                                        skip=VISITED,
                                                        maxseqlen=args.maxseqlen,
                                                        cpus=args.cpu)):

        if elapsed == -1:
            # error occurred
            print >>OUT, '\t'.join(
                [name] + ['ERROR'] * (len(hits_header) - 1))
        elif not hits:
            print >>OUT, '\t'.join([name] + ['-'] * (len(hits_header) - 1))
        else:
            for hitindex, (hid, heval, hscore, hmmfrom, hmmto, sqfrom, sqto, domscore) in enumerate(hits):
                hitname = hid
                if idmap:
                    hitname = idmap[hid][0]

                print >>OUT, '\t'.join(map(str, [name, hitname, heval, hscore,
                                                 int(querylen), int(hmmfrom),
                                                 int(hmmto), int(sqfrom),
                                                 int(sqto),
                                                 float(sqto - sqfrom) / querylen]))
        OUT.flush()

        # monitoring
        total_time += time.time() - last_time
        last_time = time.time()
        if qn and (qn % 25 == 0):
            print >>sys.stderr, qn + \
                1, total_time, "%0.2f q/s" % ((float(qn + 1) / total_time))
            sys.stderr.flush()

    # Writes final stats
    elapsed_time = time.time() - start_time
    if not args.no_file_comments:
        print >>OUT, '# %d queries scanned' % (qn + 1)
        print >>OUT, '# Total time (seconds):', elapsed_time
        print >>OUT, '# Rate:', "%0.2f q/s" % ((float(qn + 1) / elapsed_time))
    OUT.close()
    print colorify(" Processed queries:%s total_time:%s rate:%s" %\
                   (qn+1, elapsed_time, "%0.2f q/s" % ((float(qn+1) / elapsed_time))), 'lblue')
Example #21
0
def annotate_hits_file_sequential(seed_orthologs_file, annot_file,
                                  hmm_hits_file, args):
    annot_header = (
        "#query_name",
        "seed_eggNOG_ortholog",
        "seed_ortholog_evalue",
        "seed_ortholog_score",
        "predicted_gene_name",
        "GO_terms",
        "KEGG_pathways",
        "Annotation_tax_scope",
        "OGs",
        "bestOG|evalue|score",
        "COG cat",
        "eggNOG annot",
    )
    start_time = time.time()
    seq2bestOG = {}
    if pexists(hmm_hits_file):
        seq2bestOG = get_seq_hmm_matches(hmm_hits_file)

    seq2annotOG = annota.get_ogs_annotations(
        set([v[0] for v in seq2bestOG.itervalues()]))

    print colorify("Functional annotation of refined hits starts now", 'green')
    OUT = open(annot_file, "w")
    if not args.no_file_comments:
        print >> OUT, '# ' + time.ctime()
        print >> OUT, '# ' + ' '.join(sys.argv)
        print >> OUT, '\t'.join(annot_header)

    qn = 0
    for line in open(seed_orthologs_file):
        if not line.strip() or line.startswith('#'):
            continue
        if qn and (qn % 500 == 0):
            total_time = time.time() - start_time
            print >> sys.stderr, qn + 1, total_time, "%0.2f q/s (refinement)" % (
                (float(qn + 1) / total_time))
            sys.stderr.flush()
        qn += 1
        r = map(str.strip, line.split('\t'))

        query_name = r[0]
        best_hit_name = r[1]
        if best_hit_name == '-' or best_hit_name == 'ERROR':
            continue

        best_hit_evalue = float(r[2])
        best_hit_score = float(r[3])
        if best_hit_score < args.seed_ortholog_score or best_hit_evalue > args.seed_ortholog_evalue:
            continue

        match_nogs = annota.get_member_ogs(best_hit_name)
        if not match_nogs:
            continue

        match_levels = set([nog.split("@")[1] for nog in match_nogs])
        if args.tax_scope == "auto":
            for level in TAXONOMIC_RESOLUTION:
                if level in match_levels:
                    annot_levels = set(LEVEL_CONTENT.get(level, [level]))
                    annot_levels.add(level)
                    annot_level_max = "%s[%d]" % (level, len(annot_levels))
                    break
        else:
            annot_levels = set(
                LEVEL_CONTENT.get(args.tax_scope, [args.tax_scope]))
            annot_levels.add(args.tax_scope)
            annot_level_max = "%s[%d]" % (args.tax_scope, len(annot_levels))

        all_orthologies = annota.get_member_orthologs(
            best_hit_name, target_levels=annot_levels)
        orthologs = sorted(all_orthologies[args.target_orthologs])
        if args.excluded_taxa:
            orthologs = [
                o for o in orthologs
                if not o.startswith("%s." % args.excluded_taxa)
            ]

        if orthologs:
            pname, gos, keggs = annota.get_member_annotations(
                orthologs,
                target_go_ev=args.go_evidence,
                excluded_go_ev=args.go_excluded)
            best_name = ''
            if pname:
                name_candidate, freq = pname.most_common(1)[0]
                if freq >= 2:
                    best_name = name_candidate
        else:
            pname = []
            best_name = ''
            gos = set()
            keggs = set()

        if query_name in seq2bestOG:
            (hitname, evalue, score, qlength, hmmfrom, hmmto, seqfrom, seqto,
             q_coverage) = seq2bestOG[query_name]
            bestOG = '%s|%s|%s' % (hitname, evalue, score)
            og_cat, og_desc = seq2annotOG.get(hitname, ['', ''])
        else:
            bestOG = 'NA|NA|NA'
            og_cat, og_desc = '', ''

        print >> OUT, '\t'.join(
            map(str, (
                query_name,
                best_hit_name,
                best_hit_evalue,
                best_hit_score,
                best_name,
                ','.join(sorted(gos)),
                ','.join(sorted(map(lambda x: "map%05d" % x, map(int,
                                                                 keggs)))),
                annot_level_max,
                ','.join(match_nogs),
                bestOG,
                og_cat.replace('\n', ''),
                og_desc.replace('\n', ' '),
            )))
        OUT.flush()
    elapsed_time = time.time() - start_time
    if not args.no_file_comments:
        print >> OUT, '# %d queries scanned' % (qn + 1)
        print >> OUT, '# Total time (seconds):', elapsed_time
        print >> OUT, '# Rate:', "%0.2f q/s" % ((float(qn + 1) / elapsed_time))
    OUT.close()
    print colorify(" Processed queries:%s total_time:%s rate:%s" %\
                   (qn+1, elapsed_time, "%0.2f q/s" % ((float(qn+1) / elapsed_time))), 'lblue')
Example #22
0
def main(args):
    # Output and intermediate files
    hmm_hits_file = "%s.emapper.hmm_hits" % args.output
    seed_orthologs_file = "%s.emapper.seed_orthologs" % args.output
    annot_file = "%s.emapper.annotations" % args.output
    orthologs_file = "%s.emapper.predict_orthologs" % args.output

    if args.no_search:
        output_files = [annot_file]
    elif args.no_annot:
        output_files = [hmm_hits_file, seed_orthologs_file]
    else:
        output_files = [hmm_hits_file, seed_orthologs_file, annot_file]

    # convert to absolute path before changing directory
    if args.annotate_hits_table:
        args.annotate_hits_table = os.path.abspath(args.annotate_hits_table)
    # force user to decide what to do with existing files
    os.chdir(args.output_dir)
    files_present = set([pexists(fname) for fname in output_files])
    if True in files_present and not args.resume and not args.override:
        print "Output files detected in disk. Use --resume or --override to continue"
        raise emapperException()

    if args.override:
        for outf in output_files:
            silent_rm(outf)

    print '# ', get_version()
    print '# ./emapper.py ', ' '.join(sys.argv[1:])

    if args.scratch_dir:
        # If resuming in and using --scratch_dir, transfer existing files.
        if args.resume and args.scratch_dir:
            for f in output_files:
                if pexists(f):
                    print "   Copying input file %s to scratch dir %s" % (
                        f, args.scratch_dir)
                    shutil.copy(f, args.scratch_dir)

        # Change working dir
        os.chdir(args.scratch_dir)

    # Step 1. Sequence search
    if not args.no_search:
        if args.mode == 'diamond' and not args.no_search:
            dump_diamond_matches(args.input, seed_orthologs_file, args)

        elif args.mode == 'hmmer' and not args.no_search:
            host, port, dbpath, scantype, idmap = setup_hmm_search(args)
            # Start HMM SCANNING sequences (if requested)
            if not pexists(hmm_hits_file) or args.override:
                dump_hmm_matches(args.input, hmm_hits_file, dbpath, port,
                                 scantype, idmap, args)

            if not args.no_refine and (not pexists(seed_orthologs_file)
                                       or args.override):
                if args.db == 'viruses':
                    print 'Skipping seed ortholog detection in "viruses" database'
                elif args.db in EGGNOG_DATABASES:
                    refine_matches(args.input, seed_orthologs_file,
                                   hmm_hits_file, args)
                else:
                    print 'refined hits not available for custom hmm databases.'

    # Step 2. Annotation
    if not args.no_annot:
        annota.connect()
        if args.annotate_hits_table:
            if not os.path.exists(args.annotate_hits_table):
                raise IOError(errno.ENOENT, os.strerror(errno.ENOENT),
                              args.annotate_hits_table)
            annotate_hits_file(args.annotate_hits_table, annot_file,
                               hmm_hits_file, args)
        elif args.db == 'viruses':
            annotate_hmm_matches(hmm_hits_file, hmm_hits_file + '.annotations',
                                 args)
            OUT = open(annot_file, 'w')
            for line in open(hmm_hits_file + '.annotations'):
                if line.startswith('#') or not line.strip():
                    continue
                (query, hitname, level, evalue, sum_score, query_length,
                 hmmfrom, hmmto, seqfrom, seqto, q_coverage, nm, desc,
                 cats) = line.split("\t")

                if hitname != '-' and hitname != 'ERROR':
                    print >> OUT, '\t'.join(
                        map(str,
                            (query, hitname, evalue, sum_score, '', '', '',
                             'viruses', hitname + "@viruses", "%s|%s|%s" %
                             (hitname, evalue, sum_score),
                             cats.replace('\n', ''), desc.replace('\n', ' '))))
            OUT.close()
        else:
            annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file,
                               args)

    if args.predict_ortho:
        orthology.connect()
        dump_orthologs(seed_orthologs_file, orthologs_file, args)

    # If running in scratch, move files to real output dir and clean up

    if args.scratch_dir:
        for fname in output_files:
            if pexists(fname):
                print " Copying result file %s from scratch to %s" % (
                    fname, args.output_dir)
                shutil.copy(annot_file, args.output_dir)
                print "  Cleaning result file %s from scratch dir" % (fname)

    # Finalize and exit
    print colorify('Done', 'green')
    for f in output_files:
        colorify('Result files:', 'yellow')
        if pexists(f):
            print "   %s" % (f)

    print 'Total time: %g secs' % (time.time() - _total_time)

    if args.mode == 'hmmer':
        print get_citation(['hmmer'])
    elif args.mode == 'diamond':
        print get_citation(['diamond'])

    shutdown_server()
Example #23
0
def dump_hmm_matches(fasta_file, hits_file, dbpath, port, scantype, idmap,
                     args):
    hits_header = ("#query_name", "hit", "evalue", "sum_score", "query_length",
                   "hmmfrom", "hmmto", "seqfrom", "seqto", "query_coverage")

    # Cache previous results if resuming is enabled
    VISITED = set()
    if args.resume and pexists(hits_file):
        print colorify(
            "Resuming previous run. Reading computed output from %s" %
            hits_file, 'yellow')
        VISITED = set([
            line.split('\t')[0].strip() for line in open(hits_file)
            if not line.startswith('#')
        ])
        print len(VISITED), 'queries skipped'
        OUT = open(hits_file, 'a')
    else:
        OUT = open(hits_file, 'w')

    print colorify("Sequence mapping starts now!", 'green')
    if not args.no_file_comments:
        print >> OUT, get_call_info()
        print >> OUT, '# ' + '\t'.join(hits_header)
    total_time = 0
    last_time = time.time()
    start_time = time.time()
    qn = 0  # in case nothing to loop bellow
    for qn, (name, elapsed, hits, querylen, seq) in enumerate(
            search.iter_hits(fasta_file,
                             args.translate,
                             args.qtype,
                             args.dbtype,
                             scantype,
                             dbpath,
                             port,
                             evalue_thr=args.evalue,
                             score_thr=args.score,
                             qcov_thr=args.qcov,
                             fixed_Z=args.Z,
                             max_hits=args.maxhits,
                             skip=VISITED,
                             maxseqlen=args.maxseqlen,
                             cpus=args.cpu,
                             base_tempdir=args.temp_dir)):

        if elapsed == -1:
            # error occurred
            print >> OUT, '\t'.join([name] + ['ERROR'] *
                                    (len(hits_header) - 1))
        elif not hits:
            print >> OUT, '\t'.join([name] + ['-'] * (len(hits_header) - 1))
        else:
            for hitindex, (hid, heval, hscore, hmmfrom, hmmto, sqfrom, sqto,
                           domscore) in enumerate(hits):
                hitname = hid
                if idmap:
                    hitname = idmap[hid][0]

                print >> OUT, '\t'.join(
                    map(str, [
                        name, hitname, heval, hscore,
                        int(querylen),
                        int(hmmfrom),
                        int(hmmto),
                        int(sqfrom),
                        int(sqto),
                        float(sqto - sqfrom) / querylen
                    ]))
        OUT.flush()

        # monitoring
        total_time += time.time() - last_time
        last_time = time.time()
        if qn and (qn % 25 == 0):
            print >>sys.stderr, qn + \
                1, total_time, "%0.2f q/s" % ((float(qn + 1) / total_time))
            sys.stderr.flush()

    # Writes final stats
    elapsed_time = time.time() - start_time
    if not args.no_file_comments:
        print >> OUT, '# %d queries scanned' % (qn + 1)
        print >> OUT, '# Total time (seconds):', elapsed_time
        print >> OUT, '# Rate:', "%0.2f q/s" % ((float(qn + 1) / elapsed_time))
    OUT.close()
    print colorify(" Processed queries:%s total_time:%s rate:%s" %\
                   (qn+1, elapsed_time, "%0.2f q/s" % ((float(qn+1) / elapsed_time))), 'lblue')
Example #24
0
def dump_diamond_matches(fasta_file, seed_orthologs_file, args):
    cpu = args.cpu
    score_thr = args.seed_ortholog_score
    evalue_thr = args.seed_ortholog_evalue
    excluded_taxa = args.excluded_taxa if args.excluded_taxa else None

    if args.translate:
        tool = 'blastx'
    else:
        tool = 'blastp'

    dmnd_db = args.dmnd_db if args.dmnd_db else get_eggnog_dmnd_db()
    query_cov = args.query_cover
    subject_cov = args.subject_cover
    dmnd_opts = ''

    if args.matrix is not None:
        dmnd_opts += ' --matrix %s' % args.matrix
    if args.gapopen is not None:
        dmnd_opts += ' --gapopen %d' % args.gapopen
    if args.gapextend is not None:
        dmnd_opts += ' --gapextend %d' % args.gapextend

    if not DIAMOND:
        raise ValueError("diamond not found in path")

    tempdir = mkdtemp(prefix='emappertmp_dmdn_', dir=args.temp_dir)

    raw_output_file = pjoin(tempdir, uuid.uuid4().hex)

    if excluded_taxa:
        cmd = '%s %s -d %s -q %s --more-sensitive --threads %s -e %f -o %s --max-target-seqs 25 --query-cover %s --subject-cover %s' %\
          (DIAMOND, tool, dmnd_db, fasta_file, cpu, evalue_thr, raw_output_file, query_cov, subject_cov)
    else:
        cmd = '%s %s -d %s -q %s --more-sensitive --threads %s -e %f -o %s --top 3 --query-cover %s --subject-cover %s' %\
          (DIAMOND, tool, dmnd_db, fasta_file, cpu, evalue_thr, raw_output_file, query_cov, subject_cov)

    #diamond blastp --threads "${GALAXY_SLOTS:-12}" --db ./database --query '/panfs/roc/galaxy/GALAXYP/files/000/164/dataset_164640.dat' --query-gencode '1'  --outfmt '6' qseqid sseqid sallseqid qlen slen pident length nident mismatch positive gapopen gaps ppos qstart qend sstart send qseq sseq evalue bitscore score qframe stitle salltitles qcovhsp --out '/panfs/roc/galaxy/GALAXYP/files/000/164/dataset_164759.dat'  --compress '0'   --gapopen '10' --gapextend '1' --matrix 'PAM30' --seg 'yes'  --max-target-seqs '25'  --evalue '0.001'  --id '0' --query-cover '0' --block-size '2.0'

    print colorify('  ' + cmd, 'yellow')

    try:
        subprocess.check_call(cmd, shell=True, stdout=subprocess.PIPE)
        OUT = open('%s' % seed_orthologs_file, 'w')

        if not args.no_file_comments:
            print >> OUT, get_call_info()
            print >> OUT, '#', cmd

        visited = set()
        for line in open(raw_output_file):
            if not line.strip() or line.startswith('#'):
                continue
            fields = map(str.strip, line.split('\t'))
            query = fields[0]
            hit = fields[1]
            evalue = float(fields[10])
            score = float(fields[11])

            if query in visited:
                continue

            if evalue > evalue_thr or score < score_thr:
                continue

            if excluded_taxa and hit.startswith("%s." % excluded_taxa):
                continue

            visited.add(query)
            print >> OUT, '\t'.join(map(str, [query, hit, evalue, score]))
        OUT.close()

    except subprocess.CalledProcessError as e:
        raise e
    finally:
        shutil.rmtree(tempdir)
Example #25
0
def main(args):
    # Output and intermediate files
    hmm_hits_file = "%s.emapper.hmm_hits" % args.output
    seed_orthologs_file = "%s.emapper.seed_orthologs" % args.output
    annot_file = "%s.emapper.annotations" % args.output

    if args.no_search:
        output_files = [annot_file]
    elif args.no_annot:
        output_files = [hmm_hits_file, seed_orthologs_file]
    else:
        output_files = [hmm_hits_file, seed_orthologs_file, annot_file]

    # force user to decide what to do with existing files
    os.chdir(args.output_dir)
    files_present = set([pexists(fname) for fname in output_files])
    if True in files_present and not args.resume and not args.override:
        print "Output files detected in disk. Use --resume or --override to continue"
        raise emapperException()

    if args.override:
        for outf in output_files:
            silent_rm(outf)

    print '# ', get_version()
    print '# ./emapper.py ', ' '.join(sys.argv[1:])

    if args.scratch_dir:
        # If resuming in and using --scratch_dir, transfer existing files.
        if args.resume and args.scratch_dir:
            for f in output_files:
                if pexists(f):
                    print "   Copying input file %s to scratch dir %s" % (f, args.scratch_dir)
                    shutil.copy(f, args.scratch_dir)

        # Change working dir
        os.chdir(args.scratch_dir)

    # Step 1. Sequence search
    if not args.no_search:
        if args.mode == 'diamond' and not args.no_search:
            dump_diamond_matches(args.input, seed_orthologs_file, args)

        elif args.mode == 'hmmer' and not args.no_search:
            host, port, dbpath, scantype, idmap = setup_hmm_search(args)
            # Start HMM SCANNING sequences (if requested)
            if not pexists(hmm_hits_file) or args.override:
                dump_hmm_matches(args.input, hmm_hits_file, dbpath, port, scantype, idmap, args)

            if not args.no_refine and (not pexists(seed_orthologs_file) or args.override):
                if args.db == 'viruses':
                    print 'Skipping seed ortholog detection in "viruses" database'
                elif args.db in EGGNOG_DATABASES:
                    refine_matches(args.input, seed_orthologs_file, hmm_hits_file, args)
                else:
                    print 'refined hits not available for custom hmm databases.'

    # Step 2. Annotation
    if not args.no_annot:
        annota.connect()
        if args.annotate_hits_table:
            annotate_hits_file(args.annotate_hits_table, annot_file, hmm_hits_file, args)
        elif args.db == 'viruses':
            annotate_hmm_matches(hmm_hits_file, hmm_hits_file+'.annotations', args)
            OUT = open(annot_file, 'w')
            for line in open(hmm_hits_file+'.annotations'):
                if line.startswith('#') or not line.strip():
                    continue
                (query, hitname, level, evalue, sum_score, query_length,
                 hmmfrom, hmmto, seqfrom, seqto, q_coverage, nm, desc, cats) = line.split("\t")

                if hitname != '-' and hitname != 'ERROR':
                    print >>OUT, '\t'.join(map(str, (query,
                                                     hitname,
                                                     evalue,
                                                     sum_score,
                                                     '',
                                                     '',
                                                     '',
                                                     'viruses',
                                                     hitname+"@viruses",
                                                     "%s|%s|%s" %(hitname, evalue, sum_score),
                                                     cats.replace('\n', ''),
                                                     desc.replace('\n', ' '))))
            OUT.close()
        else:
            annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args)

    # If running in scratch, move files to real output dir and clean up
    if args.scratch_dir:
        for fname in output_files:
            if pexists(fname):
                print " Copying result file %s from scratch to %s" % (fname, args.output_dir)
                shutil.copy(annot_file, args.output_dir)
                print "  Cleaning result file %s from scratch dir" %(fname)

    # Finalize and exit
    print colorify('Done', 'green')
    for f in output_files:
        colorify('Result files:', 'yellow')
        if pexists(f):
            print "   %s" % (f)

    print 'Total time: %g secs' % (time.time()-_total_time)

    if args.mode == 'hmmer':
        print get_citation(['hmmer'])
    elif args.mode == 'diamond':
        print get_citation(['diamond'])

    shutdown_server()
Example #26
0
                        help='assume "yes" to all questions')

    parser.add_argument('-f', action="store_true", dest='force',
                        help='forces download even if the files exist')

    parser.add_argument('-s', action="store_true", dest='simulate',
                        help='simulate and print commands. Nothing is downloaded')


    args = parser.parse_args()
    if 'all' in args.dbs:
        args.dbs = EGGNOG_DATABASES

    if args.force or not pexists(pjoin(DATA_PATH, 'eggnog.db')):
        if args.allyes or ask("Download main annotation database?") == 'y':
            print colorify('Downloading "eggnog.db" at %s...' %DATA_PATH, 'green')
            download_annotations()
        else:
            print 'Skipping'

    else:
        print colorify('Skipping eggnog.db database (already present). Use -f to force download', 'lblue')

    if args.force or not pexists(pjoin(DATA_PATH, 'OG_fasta')):
        if args.allyes or ask("Download OG fasta files for annotation refinement (~20GB after decompression)?") == 'y':
            print colorify('Downloading fasta files " at %s/OG_fasta...' %DATA_PATH, 'green')
            download_groups()
        else:
            print 'Skipping'

    else:
Example #27
0
def annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args):
    annot_header = ("#query_name",
                    "seed_eggNOG_ortholog",
                    "seed_ortholog_evalue",
                    "seed_ortholog_score",
                    "predicted_gene_name",
                    "GO_terms",
                    "KEGG_pathways",
                    "Annotation_tax_scope",
                    "OGs",
                    "bestOG|evalue|score",
                    "COG cat",
                    "eggNOG annot",
                    )
    start_time = time.time()
    seq2bestOG = {}
    if pexists(hmm_hits_file):
        seq2bestOG = get_seq_hmm_matches(hmm_hits_file)

    seq2annotOG = annota.get_ogs_annotations(set([v[0] for v in seq2bestOG.itervalues()]))

    print colorify("Functional annotation of refined hits starts now", 'green')
    OUT = open(annot_file, "w")
    if not args.no_file_comments:
        print >>OUT, '# ' + time.ctime()
        print >>OUT, '# ' + ' '.join(sys.argv)
        print >>OUT, '\t'.join(annot_header)

    qn = 0
    for line in open(seed_orthologs_file):
        if not line.strip() or line.startswith('#'):
            continue
        if qn and (qn % 500 == 0):
            total_time = time.time() - start_time
            print >>sys.stderr, qn+1, total_time, "%0.2f q/s (refinement)" % (
                (float(qn + 1) / total_time))
            sys.stderr.flush()
        qn += 1
        r = map(str.strip, line.split('\t'))

        query_name = r[0]
        best_hit_name = r[1]
        if best_hit_name == '-' or best_hit_name == 'ERROR':
            continue

        best_hit_evalue = float(r[2])
        best_hit_score = float(r[3])
        if best_hit_score < args.seed_ortholog_score or best_hit_evalue > args.seed_ortholog_evalue:
            continue

        match_nogs = annota.get_member_ogs(best_hit_name)
        if not match_nogs:
            continue

        match_levels = set([nog.split("@")[1] for nog in match_nogs])
        if args.tax_scope == "auto":
            for level in TAXONOMIC_RESOLUTION:
                if level in match_levels:
                    annot_levels = set(LEVEL_CONTENT.get(level, [level]))
                    annot_levels.add(level)
                    annot_level_max = "%s[%d]" %(level, len(annot_levels))
                    break
        else:
            annot_levels = set(LEVEL_CONTENT.get(args.tax_scope, [args.tax_scope]))
            annot_levels.add(args.tax_scope)
            annot_level_max = "%s[%d]" %(args.tax_scope, len(annot_levels))

        all_orthologies = annota.get_member_orthologs(best_hit_name, target_levels=annot_levels)
        orthologs = sorted(all_orthologies[args.target_orthologs])
        if args.excluded_taxa:
            orthologs = [o for o in orthologs if not o.startswith("%s." %args.excluded_taxa)]

        if orthologs:
            pname, gos, keggs = annota.get_member_annotations(orthologs,
                                                              excluded_gos=set(["IEA", "ND"]))
            best_name = ''
            if pname:
                name_candidate, freq = pname.most_common(1)[0]
                if freq >= 2:
                    best_name = name_candidate
        else:
            pname = []
            best_name = ''
            gos = set()
            keggs = set()

        if query_name in seq2bestOG:
            (hitname, evalue, score, qlength, hmmfrom, hmmto, seqfrom,
             seqto, q_coverage) = seq2bestOG[query_name]
            bestOG = '%s|%s|%s' %(hitname, evalue, score)
            og_cat, og_desc = seq2annotOG.get(hitname, ['', ''])
        else:
            bestOG = 'NA|NA|NA'
            og_cat, og_desc = '', ''


        print >>OUT, '\t'.join(map(str, (query_name,
                                         best_hit_name,
                                         best_hit_evalue,
                                         best_hit_score,
                                         best_name,
                                         ','.join(sorted(gos)),
                                         ','.join(sorted(map(lambda x: "map%05d"%x, map(int, keggs)))),
                                         annot_level_max,
                                         ','.join(match_nogs),
                                         bestOG,
                                         og_cat.replace('\n', ''),
                                         og_desc.replace('\n', ' '),
                                         )))
        OUT.flush()
    elapsed_time = time.time() - start_time
    if not args.no_file_comments:
        print >>OUT, '# %d queries scanned' % (qn + 1)
        print >>OUT, '# Total time (seconds):', elapsed_time
        print >>OUT, '# Rate:', "%0.2f q/s" % ((float(qn + 1) / elapsed_time))
    OUT.close()
    print colorify(" Processed queries:%s total_time:%s rate:%s" %\
                   (qn+1, elapsed_time, "%0.2f q/s" % ((float(qn+1) / elapsed_time))), 'lblue')
Example #28
0
def parse_args(parser):
    args = parser.parse_args()

    if args.version:
        print get_version()
        sys.exit(0)

    if "EGGNOG_DATA_DIR" in os.environ:
        set_data_path(os.environ["EGGNOG_DATA_DIR"])

    if args.data_dir:
        set_data_path(args.data_dir)

    if not args.no_annot and not pexists(get_eggnogdb_file()):
        print colorify(
            'Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it',
            'red')
        raise emapperException()

    if args.mode == 'diamond':
        dmnd_db = args.dmnd_db if args.dmnd_db else get_eggnog_dmnd_db()
        if not pexists(dmnd_db):
            print colorify(
                'DIAMOND database %s not present. Use download_eggnog_database.py to fetch it'
                % dmnd_db, 'red')
            raise emapperException()

    if args.cpu == 0:
        args.cpu = multiprocessing.cpu_count()

    # No --servermode available for diamond
    if args.mode == 'diamond' and args.servermode:
        parser.error(
            '--mode [diamond] and --servermode are mutually exclusive')

    # Output file required unless running in servermode
    if not args.servermode and not args.output:
        parser.error('An output project name is required (-o)')

    # Servermode implies using mem-based databases
    if args.servermode:
        args.usemem = True

    # Direct annotation implies no searches
    if args.annotate_hits_table:
        args.no_search = True
        args.no_annot = False

    # Sets GO evidence bases
    if args.go_evidence == 'experimental':
        args.go_evidence = set(["EXP", "IDA", "IPI", "IMP", "IGI", "IEP"])
        args.go_excluded = set(["ND", "IEA"])

    elif args.go_evidence == 'non-electronic':
        args.go_evidence = None
        args.go_excluded = set(["ND", "IEA"])
    else:
        raise ValueError('Invalid --go_evidence value')

    # Check inputs for running sequence searches
    if not args.no_search and not args.servermode:
        if not args.input:
            parser.error('An input fasta file is required (-i)')

        # HMM
        if args.mode == 'hmmer':
            if not args.db and not args.guessdb:
                parser.error(
                    'HMMER mode requires specifying a target database (i.e. -d, --guessdb ))'
                )
            if args.db and args.guessdb:
                parser.error('-d and --guessdb options are mutually exclusive')

            if args.guessdb:
                from ete3 import NCBITaxa
                ncbi = NCBITaxa()
                lineage = ncbi.get_lineage(args.guessdb)
                for tid in reversed(lineage):
                    if tid in TAXID2LEVEL:
                        print tid, TAXID2LEVEL[tid]
                        args.db = TAXID2LEVEL[tid]
                        break
        # DIAMOND
        elif args.mode == 'diamond':
            #if args.db or args.guessdb:
            #    parser.error('diamond mode does not require -d or --guessdb options')
            pass

    return args
Example #29
0
def parse_args(parser):
    args = parser.parse_args()

    if args.version:
        print get_version()
        sys.exit(0)

    if not args.no_annot and not pexists(EGGNOGDB_FILE):
        print colorify('Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red')
        raise emapperException()

    if args.mode == 'diamond' and not pexists(EGGNOG_DMND_DB):
        print colorify('DIAMOND database data/eggnog_proteins.dmnd not present. Use download_eggnog_database.py to fetch it', 'red')
        raise emapperException()

    if args.cpu == 0:
        args.cpu = multiprocessing.cpu_count()

    # No --servermode available for diamond
    if args.mode == 'diamond' and args.servermode:
        parser.error('--mode [diamond] and --servermode are mutually exclusive')

    # Output file required unless running in servermode
    if not args.servermode and not args.output:
        parser.error('An output project name is required (-o)')

    # Servermode implies using mem-based databases
    if args.servermode:
        args.usemem = True

    # Direct annotation implies no searches
    if args.annotate_hits_table:
        args.no_search = True
        args.no_annot = False

    # Check inputs for running sequence searches
    if not args.no_search and not args.servermode:
        if not args.input:
            parser.error('An input fasta file is required (-i)')

        # HMM
        if args.mode == 'hmmer':
            if not args.db and not args.guessdb:
                parser.error('HMMER mode requires specifying a target database (i.e. -d, --guessdb ))')
            if args.db and args.guessdb:
                parser.error('-d and --guessdb options are mutually exclusive')

            if args.guessdb:
                from ete3 import NCBITaxa
                ncbi = NCBITaxa()
                lineage = ncbi.get_lineage(args.guessdb)
                for tid in reversed(lineage):
                    if tid in TAXID2LEVEL:
                        print tid, TAXID2LEVEL[tid]
                        args.db = TAXID2LEVEL[tid]
                        break
        # DIAMOND
        elif args.mode == 'diamond':
            #if args.db or args.guessdb:
            #    parser.error('diamond mode does not require -d or --guessdb options')
            pass

    return args
Example #30
0
def dump_diamond_matches(fasta_file, seed_orthologs_file, args):
    cpu = args.cpu
    score_thr = args.seed_ortholog_score
    evalue_thr = args.seed_ortholog_evalue
    excluded_taxa = args.excluded_taxa if args.excluded_taxa else None

    if args.translate:
        tool = 'blastx'
    else:
        tool = 'blastp'

    dmnd_db = args.dmnd_db if args.dmnd_db else get_eggnog_dmnd_db()
    query_cov = args.query_cover
    subject_cov = args.subject_cover
    dmnd_opts = ''

    if args.matrix is not None:
        dmnd_opts += ' --matrix %s' % args.matrix
    if args.gapopen is not None:
        dmnd_opts += ' --gapopen %d' % args.gapopen
    if args.gapextend is not None:
        dmnd_opts += ' --gapextend %d' % args.gapextend

    if not DIAMOND:
        raise ValueError("diamond not found in path")

    tempdir = mkdtemp(prefix='emappertmp_dmdn_', dir=args.temp_dir)

    raw_output_file = pjoin(tempdir, uuid.uuid4().hex)

    if excluded_taxa:
        cmd = '%s %s -d %s -q %s --more-sensitive --threads %s -e %f -o %s --max-target-seqs 25 --query-cover %s --subject-cover %s' %\
          (DIAMOND, tool, dmnd_db, fasta_file, cpu, evalue_thr, raw_output_file, query_cov, subject_cov)
    else:
        cmd = '%s %s -d %s -q %s --more-sensitive --threads %s -e %f -o %s --top 3 --query-cover %s --subject-cover %s' %\
          (DIAMOND, tool, dmnd_db, fasta_file, cpu, evalue_thr, raw_output_file, query_cov, subject_cov)

    print colorify('  ' + cmd, 'yellow')

    try:
        with open(raw_output_file + '.stdout', 'w') as STDOUT:
            subprocess.check_call(cmd, shell=True, stdout=STDOUT)

        OUT = open('%s' % seed_orthologs_file, 'w')

        if not args.no_file_comments:
            print >> OUT, get_call_info()
            print >> OUT, '#', cmd

        visited = set()
        for line in open(raw_output_file):
            if not line.strip() or line.startswith('#'):
                continue
            fields = map(str.strip, line.split('\t'))
            query = fields[0]
            hit = fields[1]
            evalue = float(fields[10])
            score = float(fields[11])

            if query in visited:
                continue

            if evalue > evalue_thr or score < score_thr:
                continue

            if excluded_taxa and hit.startswith("%s." % excluded_taxa):
                continue

            visited.add(query)
            print >> OUT, '\t'.join(map(str, [query, hit, evalue, score]))
        OUT.close()

    except subprocess.CalledProcessError as e:
        raise e
    finally:
        shutil.rmtree(tempdir)
                        help='assume "yes" to all questions')

    parser.add_argument('-f', action="store_true", dest='force',
                        help='forces download even if the files exist')

    parser.add_argument('-s', action="store_true", dest='simulate',
                        help='simulate and print commands. Nothing is downloaded')


    args = parser.parse_args()
    if 'all' in args.dbs:
        args.dbs = EGGNOG_DATABASES

    if args.force or not pexists(pjoin(DATA_PATH, 'eggnog.db')):
        if args.allyes or ask("Download main annotation database?") == 'y':
            print colorify('Downloading "eggnog.db" at %s...' %DATA_PATH, 'green')
            download_annotations()
        else:
            print 'Skipping'

    else:
        print colorify('Skipping eggnog.db database (already present). Use -f to force download', 'lblue')

    if args.force or not pexists(pjoin(DATA_PATH, 'OG_fasta')):
        if args.allyes or ask("Download OG fasta files for annotation refinement (~20GB after decompression)?") == 'y':
            print colorify('Downloading fasta files " at %s/OG_fasta...' %DATA_PATH, 'green')
            download_groups()
        else:
            print 'Skipping'

    else:
def run(cmd):
    print colorify(cmd, 'cyan')
    if not args.simulate:
        os.system(cmd)
    if "EGGNOG_DATA_DIR" in os.environ:
        set_data_path(os.environ["EGGNOG_DATA_DIR"])

    if args.data_dir:
        set_data_path(args.data_dir)

    # if args.force or not pexists(pjoin(get_data_path(), 'og2level.tsv.gz')):
    #     print colorify('Downloading "og2level.tsv.gz" at %s' %get_data_path(), 'green')
    #     download_og2level()

    # if 'all' in args.dbs:
    #     args.dbs = EGGNOG_DATABASES

    if args.force or not pexists(pjoin(get_data_path(), 'eggnog.db')):
        if args.allyes or ask("Download main annotation database?") == 'y':
            print colorify(
                'Downloading "eggnog.db" at %s...' % get_data_path(), 'green')
            download_annotations()
        else:
            print 'Skipping'

    else:
        if not args.quiet:
            print colorify(
                'Skipping eggnog.db database (already present). Use -f to force download',
                'lblue')

    # if args.force or not pexists(pjoin(get_data_path(), 'OG_fasta')):
    #     if args.allyes or ask("Download OG fasta files for annotation refinement (~20GB after decompression)?") == 'y':
    #         print colorify('Downloading fasta files " at %s/OG_fasta...' %get_data_path(), 'green')
    #         download_groups()
    #     else:
Example #34
0
def annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args):
    HIT_HEADER = [
        "#query_name",
        "seed_eggNOG_ortholog",
        "seed_ortholog_evalue",
        "seed_ortholog_score",
        "best_tax_level",
    ]

    HIT_OG_HEADER = [
        "taxonomic scope", "eggNOG OGs", "best eggNOG OG",
        "COG Functional cat.", "eggNOG free text desc."
    ]

    start_time = time.time()
    seq2bestOG = {}
    if pexists(hmm_hits_file):
        seq2bestOG = get_seq_hmm_matches(hmm_hits_file)

    seq2annotOG = annota.get_ogs_annotations(
        set([v[0] for v in seq2bestOG.itervalues()]))

    print colorify("Functional annotation of refined hits starts now", 'green')

    OUT = open(annot_file, "w")

    if args.report_orthologs:
        ORTHOLOGS = open(annot_file + ".orthologs", "w")

    if not args.no_file_comments:
        print >> OUT, '# emapper version:', get_version(
        ), 'emapper DB:', get_db_version()
        print >> OUT, '# command: ./emapper.py ', ' '.join(sys.argv[1:])
        print >> OUT, '# time: ' + time.ctime()
        print >> OUT, '\t'.join(HIT_HEADER + ANNOTATIONS_HEADER +
                                HIT_OG_HEADER)
    qn = 0

    pool = multiprocessing.Pool(args.cpu)

    for result in pool.imap(annotate_hit_line,
                            iter_hit_lines(seed_orthologs_file, args)):
        qn += 1
        if qn and (qn % 500 == 0):
            total_time = time.time() - start_time
            print >> sys.stderr, qn, total_time, "%0.2f q/s (func. annotation)" % (
                (float(qn) / total_time))
            sys.stderr.flush()

        if result:
            (query_name, best_hit_name, best_hit_evalue, best_hit_score,
             annotations, annot_level_max, swallowest_level, match_nogs,
             orthologs) = result
            if query_name in seq2bestOG:
                (hitname, evalue, score, qlength, hmmfrom, hmmto, seqfrom,
                 seqto, q_coverage) = seq2bestOG[query_name]
                bestOG = '%s|%s|%s' % (hitname, evalue, score)
                og_cat, og_desc = seq2annotOG.get(hitname, ['', ''])
            else:
                bestOG = 'NA|NA|NA'
                og_cat, og_desc = annota.get_best_og_description(match_nogs)

            if args.report_orthologs:
                print >> ORTHOLOGS, '\t'.join(
                    map(str, (query_name, ','.join(orthologs))))

            # prepare annotations for printing
            annot_columns = [
                query_name, best_hit_name,
                str(best_hit_evalue),
                str(best_hit_score), LEVEL_NAMES[swallowest_level]
            ]

            for h in ANNOTATIONS_HEADER:
                if h in annotations:
                    annot_columns.append(','.join(sorted(annotations[h])))
                else:
                    annot_columns.append('')

            annot_columns.extend([
                annot_level_max, ','.join(match_nogs), bestOG,
                og_cat.replace('\n', ''),
                og_desc.replace('\n', ' ')
            ])

            print >> OUT, '\t'.join(annot_columns)

        #OUT.flush()

    pool.terminate()

    elapsed_time = time.time() - start_time
    if not args.no_file_comments:
        print >> OUT, '# %d queries scanned' % (qn)
        print >> OUT, '# Total time (seconds):', elapsed_time
        print >> OUT, '# Rate:', "%0.2f q/s" % ((float(qn) / elapsed_time))
    OUT.close()

    if args.report_orthologs:
        ORTHOLOGS.close()

    print colorify(" Processed queries:%s total_time:%s rate:%s" %\
                   (qn, elapsed_time, "%0.2f q/s" % ((float(qn) / elapsed_time))), 'lblue')
Example #35
0
def parse_args(parser):
    """Parse the command line and validate/normalize the resulting options.

    The function parses the arguments with ``parser.parse_args()``, handles
    the early-exit options (``--version``, ``--list_taxa``), checks that the
    options required by the selected search mode are present, and derives a
    few convenience attributes on the returned namespace (``call_info``,
    per-tool evalue/score thresholds, ``qcov``, ``tax_scope_ids``,
    ``go_excluded``, ``cpus_per_worker``).

    Args:
        parser: the argument parser; it must provide ``parse_args()`` and
            ``error()`` (presumably an ``argparse.ArgumentParser``).

    Returns:
        The parsed namespace, with the normalized and derived attributes set.

    Raises:
        EmapperException: if a required DIAMOND, MMseqs2 or annotation
            database is not present.
        ValueError: on an invalid ``--go_evidence`` or ``--pfam_realign``
            value.

    Invalid option combinations are reported through ``parser.error``;
    ``--version`` and ``--list_taxa`` end the program with ``sys.exit(0)``.
    """
    args = parser.parse_args()

    # The environment variable is applied first, so that an explicit
    # --data_dir takes precedence over it.
    if "EGGNOG_DATA_DIR" in os.environ:
        set_data_path(os.environ["EGGNOG_DATA_DIR"])

    if args.data_dir:
        set_data_path(args.data_dir)

    if args.version:
        version = ""
        try:
            version = get_full_version_info()
        except Exception:
            # Fall back to the basic version string when the full version
            # information cannot be obtained.
            version = get_version()
        print(version)
        sys.exit(0)

    args.call_info = get_call_info()

    if args.list_taxa:
        print_taxa()
        sys.exit(0)

    # --cpu 0 means "use every available core".
    if args.cpu == 0:
        args.cpu = multiprocessing.cpu_count()
    multiprocessing.set_start_method(args.mp_start_method)

    if args.resume == True and args.override == True:
        parser.error('Only one of --resume or --override is allowed.')

    # Gene prediction
    if args.training_genome is not None and args.training_file is None:
        parser.error('"--training_genome requires --training_file"')

    if args.training_genome is None and args.training_file is not None:
        if not os.path.isfile(args.training_file):
            parser.error('"--training_file must point to an existing file, if no --training_genome is provided."')

    # Search modes
    if args.mode == SEARCH_MODE_DIAMOND:
        dmnd_db = args.dmnd_db if args.dmnd_db else get_eggnog_dmnd_db()
        if not pexists(dmnd_db):
            print(colorify('DIAMOND database %s not present. Use download_eggnog_database.py to fetch it' % dmnd_db, 'red'))
            raise EmapperException()

        if args.input is not None:
            if args.annotate_hits_table is not None:
                print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_DIAMOND}", 'blue'))
                args.annotate_hits_table = None
        else:
            # the default -m is diamond, but we will consider -m no_search as default when
            # --annotate_hits_table has been provided and -i has not been provided
            if args.annotate_hits_table is not None:
                print(colorify(f"Assuming -m {SEARCH_MODE_NO_SEARCH}", 'blue'))
                args.mode = SEARCH_MODE_NO_SEARCH
            else:
                parser.error('An input fasta file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

    elif args.mode == SEARCH_MODE_MMSEQS2:
        mmseqs_db = args.mmseqs_db if args.mmseqs_db else get_eggnog_mmseqs_db()
        if not pexists(mmseqs_db):
            print(colorify('MMseqs2 database %s not present. Use download_eggnog_database.py to fetch it' % mmseqs_db, 'red'))
            raise EmapperException()

        if not args.input:
            parser.error('An input fasta file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        if args.annotate_hits_table is not None:
            print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_MMSEQS2}", 'blue'))
            args.annotate_hits_table = None

    elif args.mode == SEARCH_MODE_HMMER:

        if not args.input:
            parser.error('An input file is required (-i)')

        # Output file required
        if not args.output:
            parser.error('An output project name is required (-o)')

        # Hmmer database
        # NOTE: hmmer database format, name and checking if exists is done within hmmer module
        if not args.db:
            parser.error('HMMER mode requires a target database (-d, --database).')

        if args.itype == ITYPE_CDS:
            args.translate = True

        if (args.itype == ITYPE_GENOME or args.itype == ITYPE_META) and args.genepred == GENEPRED_MODE_SEARCH:
            parser.error('HMMER mode is not compatible with "--genepred search" option.')

        if args.annotate_hits_table is not None:
            print(colorify(f"--annotate_hits_table will be ignored, due to -m {SEARCH_MODE_HMMER}", 'blue'))
            args.annotate_hits_table = None

        # The literal "none" is normalized to None.
        if args.clean_overlaps is not None:
            if args.clean_overlaps == "none":
                args.clean_overlaps = None

    elif args.mode == SEARCH_MODE_CACHE:
        if args.cache_file is None:
            parser.error('A file with annotations and md5 of queries is required (-c FILE)')
        if args.decorate_gff != DECORATE_GFF_NONE:
            print(colorify("WARNING: no GFF will be created for cache-based annotations. It is not implemented yet, sorry.", 'red'))

        if args.no_annot == True:
            parser.error(f'Cache mode (-m {SEARCH_MODE_CACHE}) should be used to annotate.')

    elif args.mode == SEARCH_MODE_NO_SEARCH:
        if args.no_annot == False and not args.annotate_hits_table:
            parser.error(f'No search mode (-m {SEARCH_MODE_NO_SEARCH}) requires a hits table to annotate (--annotate_hits_table FILE.seed_orthologs)')
        if args.md5 == True and args.input is None:
            parser.error('--md5 requires an input FASTA file (-i FASTA).')

    else:
        parser.error(f'unrecognized search mode (-m {args.mode})')

    # Search thresholds: the generic --evalue/--score values are propagated
    # to every search tool. (hmm_score is set on ``args`` like the others;
    # it used to be assigned to a stray local variable by mistake.)
    args.dmnd_evalue = args.mmseqs_evalue = args.hmm_evalue = args.evalue
    args.dmnd_score = args.mmseqs_score = args.hmm_score = args.score
    args.qcov = args.query_cover

    # Annotation options
    if args.no_annot == False or args.report_orthologs == True:
        if not pexists(get_eggnogdb_file()):
            print(colorify('Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it', 'red'))
            raise EmapperException()

        args.tax_scope_ids = parse_tax_scope(args.tax_scope)

        # Comma-separated taxa strings are turned into lists.
        if args.target_taxa is not None:
            args.target_taxa = args.target_taxa.split(",")
        if args.excluded_taxa is not None:
            args.excluded_taxa = args.excluded_taxa.split(",")

    # Sets GO evidence bases
    if args.go_evidence == 'experimental':
        args.go_evidence = set(["EXP", "IDA", "IPI", "IMP", "IGI", "IEP"])
        args.go_excluded = set(["ND", "IEA"])

    elif args.go_evidence == 'non-electronic':
        args.go_evidence = None
        args.go_excluded = set(["ND", "IEA"])

    elif args.go_evidence == 'all':
        args.go_evidence = None
        args.go_excluded = None

    else:
        raise ValueError('Invalid --go_evidence value')

    # PFAM annotation options

    if args.pfam_realign == PFAM_REALIGN_NONE:
        pass
    elif args.pfam_realign == PFAM_REALIGN_REALIGN or args.pfam_realign == PFAM_REALIGN_DENOVO:
        if not args.input:
            parser.error(f'An input fasta file is required (-i) for --pfam_realign {args.pfam_realign}')
    else:
        raise ValueError(f'Invalid --pfam_realign option {args.pfam_realign}')

    # Distribution of cpus among workers
    total_workers = args.num_workers * args.num_servers
    if total_workers < 1:
        # Avoids a ZeroDivisionError (or a negative cpus_per_worker) below.
        parser.error(f"Total workers ({total_workers}) must be at least 1.")
    if args.cpu < total_workers:
        parser.error(f"Less cpus ({args.cpu}) than total workers ({total_workers}) were specified.")
    if args.cpu % total_workers != 0:
        parser.error(f"Number of cpus ({args.cpu}) must be a multiple of total workers ({total_workers}).")

    # Exact division: cpu is a multiple of total_workers at this point.
    args.cpus_per_worker = args.cpu // total_workers

    return args