Example #1
0
def annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args):
    HIT_HEADER = [
        "#query_name",
        "seed_eggNOG_ortholog",
        "seed_ortholog_evalue",
        "seed_ortholog_score",
        "best_tax_level",
    ]

    HIT_OG_HEADER = [
        "taxonomic scope", "eggNOG OGs", "best eggNOG OG",
        "COG Functional cat.", "eggNOG free text desc."
    ]

    start_time = time.time()
    seq2bestOG = {}
    if pexists(hmm_hits_file):
        seq2bestOG = get_seq_hmm_matches(hmm_hits_file)

    seq2annotOG = annota.get_ogs_annotations(
        set([v[0] for v in seq2bestOG.itervalues()]))

    print colorify("Functional annotation of refined hits starts now", 'green')

    OUT = open(annot_file, "w")

    if args.report_orthologs:
        ORTHOLOGS = open(annot_file + ".orthologs", "w")

    if not args.no_file_comments:
        print >> OUT, '# emapper version:', get_version(
        ), 'emapper DB:', get_db_version()
        print >> OUT, '# command: ./emapper.py ', ' '.join(sys.argv[1:])
        print >> OUT, '# time: ' + time.ctime()
        print >> OUT, '\t'.join(HIT_HEADER + ANNOTATIONS_HEADER +
                                HIT_OG_HEADER)
    qn = 0

    pool = multiprocessing.Pool(args.cpu)

    for result in pool.imap(annotate_hit_line,
                            iter_hit_lines(seed_orthologs_file, args)):
        qn += 1
        if qn and (qn % 500 == 0):
            total_time = time.time() - start_time
            print >> sys.stderr, qn, total_time, "%0.2f q/s (func. annotation)" % (
                (float(qn) / total_time))
            sys.stderr.flush()

        if result:
            (query_name, best_hit_name, best_hit_evalue, best_hit_score,
             annotations, annot_level_max, swallowest_level, match_nogs,
             orthologs) = result
            if query_name in seq2bestOG:
                (hitname, evalue, score, qlength, hmmfrom, hmmto, seqfrom,
                 seqto, q_coverage) = seq2bestOG[query_name]
                bestOG = '%s|%s|%s' % (hitname, evalue, score)
                og_cat, og_desc = seq2annotOG.get(hitname, ['', ''])
            else:
                bestOG = 'NA|NA|NA'
                og_cat, og_desc = annota.get_best_og_description(match_nogs)

            if args.report_orthologs:
                print >> ORTHOLOGS, '\t'.join(
                    map(str, (query_name, ','.join(orthologs))))

            # prepare annotations for printing
            annot_columns = [
                query_name, best_hit_name,
                str(best_hit_evalue),
                str(best_hit_score), LEVEL_NAMES[swallowest_level]
            ]

            for h in ANNOTATIONS_HEADER:
                if h in annotations:
                    annot_columns.append(','.join(sorted(annotations[h])))
                else:
                    annot_columns.append('')

            annot_columns.extend([
                annot_level_max, ','.join(match_nogs), bestOG,
                og_cat.replace('\n', ''),
                og_desc.replace('\n', ' ')
            ])

            print >> OUT, '\t'.join(annot_columns)

        #OUT.flush()

    pool.terminate()

    elapsed_time = time.time() - start_time
    if not args.no_file_comments:
        print >> OUT, '# %d queries scanned' % (qn)
        print >> OUT, '# Total time (seconds):', elapsed_time
        print >> OUT, '# Rate:', "%0.2f q/s" % ((float(qn) / elapsed_time))
    OUT.close()

    if args.report_orthologs:
        ORTHOLOGS.close()

    print colorify(" Processed queries:%s total_time:%s rate:%s" %\
                   (qn, elapsed_time, "%0.2f q/s" % ((float(qn) / elapsed_time))), 'lblue')
Example #2
0
def annotate_hits_file_sequential(seed_orthologs_file, annot_file,
                                  hmm_hits_file, args):
    annot_header = (
        "#query_name",
        "seed_eggNOG_ortholog",
        "seed_ortholog_evalue",
        "seed_ortholog_score",
        "predicted_gene_name",
        "GO_terms",
        "KEGG_pathways",
        "Annotation_tax_scope",
        "OGs",
        "bestOG|evalue|score",
        "COG cat",
        "eggNOG annot",
    )
    start_time = time.time()
    seq2bestOG = {}
    if pexists(hmm_hits_file):
        seq2bestOG = get_seq_hmm_matches(hmm_hits_file)

    seq2annotOG = annota.get_ogs_annotations(
        set([v[0] for v in seq2bestOG.itervalues()]))

    print colorify("Functional annotation of refined hits starts now", 'green')
    OUT = open(annot_file, "w")
    if not args.no_file_comments:
        print >> OUT, '# ' + time.ctime()
        print >> OUT, '# ' + ' '.join(sys.argv)
        print >> OUT, '\t'.join(annot_header)

    qn = 0
    for line in open(seed_orthologs_file):
        if not line.strip() or line.startswith('#'):
            continue
        if qn and (qn % 500 == 0):
            total_time = time.time() - start_time
            print >> sys.stderr, qn + 1, total_time, "%0.2f q/s (refinement)" % (
                (float(qn + 1) / total_time))
            sys.stderr.flush()
        qn += 1
        r = map(str.strip, line.split('\t'))

        query_name = r[0]
        best_hit_name = r[1]
        if best_hit_name == '-' or best_hit_name == 'ERROR':
            continue

        best_hit_evalue = float(r[2])
        best_hit_score = float(r[3])
        if best_hit_score < args.seed_ortholog_score or best_hit_evalue > args.seed_ortholog_evalue:
            continue

        match_nogs = annota.get_member_ogs(best_hit_name)
        if not match_nogs:
            continue

        match_levels = set([nog.split("@")[1] for nog in match_nogs])
        if args.tax_scope == "auto":
            for level in TAXONOMIC_RESOLUTION:
                if level in match_levels:
                    annot_levels = set(LEVEL_CONTENT.get(level, [level]))
                    annot_levels.add(level)
                    annot_level_max = "%s[%d]" % (level, len(annot_levels))
                    break
        else:
            annot_levels = set(
                LEVEL_CONTENT.get(args.tax_scope, [args.tax_scope]))
            annot_levels.add(args.tax_scope)
            annot_level_max = "%s[%d]" % (args.tax_scope, len(annot_levels))

        all_orthologies = annota.get_member_orthologs(
            best_hit_name, target_levels=annot_levels)
        orthologs = sorted(all_orthologies[args.target_orthologs])
        if args.excluded_taxa:
            orthologs = [
                o for o in orthologs
                if not o.startswith("%s." % args.excluded_taxa)
            ]

        if orthologs:
            pname, gos, keggs = annota.get_member_annotations(
                orthologs,
                target_go_ev=args.go_evidence,
                excluded_go_ev=args.go_excluded)
            best_name = ''
            if pname:
                name_candidate, freq = pname.most_common(1)[0]
                if freq >= 2:
                    best_name = name_candidate
        else:
            pname = []
            best_name = ''
            gos = set()
            keggs = set()

        if query_name in seq2bestOG:
            (hitname, evalue, score, qlength, hmmfrom, hmmto, seqfrom, seqto,
             q_coverage) = seq2bestOG[query_name]
            bestOG = '%s|%s|%s' % (hitname, evalue, score)
            og_cat, og_desc = seq2annotOG.get(hitname, ['', ''])
        else:
            bestOG = 'NA|NA|NA'
            og_cat, og_desc = '', ''

        print >> OUT, '\t'.join(
            map(str, (
                query_name,
                best_hit_name,
                best_hit_evalue,
                best_hit_score,
                best_name,
                ','.join(sorted(gos)),
                ','.join(sorted(map(lambda x: "map%05d" % x, map(int,
                                                                 keggs)))),
                annot_level_max,
                ','.join(match_nogs),
                bestOG,
                og_cat.replace('\n', ''),
                og_desc.replace('\n', ' '),
            )))
        OUT.flush()
    elapsed_time = time.time() - start_time
    if not args.no_file_comments:
        print >> OUT, '# %d queries scanned' % (qn + 1)
        print >> OUT, '# Total time (seconds):', elapsed_time
        print >> OUT, '# Rate:', "%0.2f q/s" % ((float(qn + 1) / elapsed_time))
    OUT.close()
    print colorify(" Processed queries:%s total_time:%s rate:%s" %\
                   (qn+1, elapsed_time, "%0.2f q/s" % ((float(qn+1) / elapsed_time))), 'lblue')
Example #3
0
def annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args):
    annot_header = (
        "#query_name",
        "seed_eggNOG_ortholog",
        "seed_ortholog_evalue",
        "seed_ortholog_score",
        "predicted_gene_name",
        "GO_terms",
        "KEGG_KOs",
        "BiGG_reactions",
        "Annotation_tax_scope",
        "OGs",
        "bestOG|evalue|score",
        "COG cat",
        "eggNOG annot",
    )
    start_time = time.time()
    seq2bestOG = {}
    if pexists(hmm_hits_file):
        seq2bestOG = get_seq_hmm_matches(hmm_hits_file)

    seq2annotOG = annota.get_ogs_annotations(
        set([v[0] for v in seq2bestOG.itervalues()]))

    print colorify("Functional annotation of refined hits starts now", 'green')

    OUT = open(annot_file, "w")

    if args.report_orthologs:
        ORTHOLOGS = open(annot_file + ".orthologs", "w")

    if not args.no_file_comments:
        print >> OUT, '# emapper version:', get_version(
        ), 'emapper DB:', get_db_version()
        print >> OUT, '# command: ./emapper.py ', ' '.join(sys.argv[1:])
        print >> OUT, '# time: ' + time.ctime()
        print >> OUT, '\t'.join(annot_header)
    qn = 0
    pool = multiprocessing.Pool(args.cpu)
    for result in pool.imap(annotate_hit_line,
                            iter_hit_lines(seed_orthologs_file, args)):
        qn += 1
        if qn and (qn % 500 == 0):
            total_time = time.time() - start_time
            print >> sys.stderr, qn, total_time, "%0.2f q/s (refinement)" % (
                (float(qn) / total_time))
            sys.stderr.flush()

        if result:
            (query_name, best_hit_name, best_hit_evalue, best_hit_score,
             best_name, gos, kegg, bigg, annot_level_max, match_nogs,
             orthologs) = result

            if query_name in seq2bestOG:
                (hitname, evalue, score, qlength, hmmfrom, hmmto, seqfrom,
                 seqto, q_coverage) = seq2bestOG[query_name]
                bestOG = '%s|%s|%s' % (hitname, evalue, score)
                og_cat, og_desc = seq2annotOG.get(hitname, ['', ''])
            else:
                bestOG = 'NA|NA|NA'
                og_cat, og_desc = annota.get_best_og_description(match_nogs)

            if args.report_orthologs:
                print >> ORTHOLOGS, '\t'.join(
                    map(str, (query_name, ','.join(orthologs))))

            print >> OUT, '\t'.join(
                map(str, (
                    query_name,
                    best_hit_name,
                    best_hit_evalue,
                    best_hit_score,
                    best_name,
                    ','.join(sorted(gos)),
                    ','.join(sorted(kegg)),
                    ','.join(sorted(bigg)),
                    annot_level_max,
                    ','.join(match_nogs),
                    bestOG,
                    og_cat.replace('\n', ''),
                    og_desc.replace('\n', ' '),
                )))

        OUT.flush()

    pool.terminate()

    elapsed_time = time.time() - start_time
    if not args.no_file_comments:
        print >> OUT, '# %d queries scanned' % (qn)
        print >> OUT, '# Total time (seconds):', elapsed_time
        print >> OUT, '# Rate:', "%0.2f q/s" % ((float(qn) / elapsed_time))
    OUT.close()

    if args.report_orthologs:
        ORTHOLOGS.close()

    print colorify(" Processed queries:%s total_time:%s rate:%s" %\
                   (qn, elapsed_time, "%0.2f q/s" % ((float(qn) / elapsed_time))), 'lblue')
Example #4
0
def annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file, args):
    annot_header = ("#query_name",
                    "seed_eggNOG_ortholog",
                    "seed_ortholog_evalue",
                    "seed_ortholog_score",
                    "predicted_gene_name",
                    "GO_terms",
                    "KEGG_pathways",
                    "Annotation_tax_scope",
                    "OGs",
                    "bestOG|evalue|score",
                    "COG cat",
                    "eggNOG annot",
                    )
    start_time = time.time()
    seq2bestOG = {}
    if pexists(hmm_hits_file):
        seq2bestOG = get_seq_hmm_matches(hmm_hits_file)

    seq2annotOG = annota.get_ogs_annotations(set([v[0] for v in seq2bestOG.itervalues()]))

    print colorify("Functional annotation of refined hits starts now", 'green')
    OUT = open(annot_file, "w")
    if not args.no_file_comments:
        print >>OUT, '# ' + time.ctime()
        print >>OUT, '# ' + ' '.join(sys.argv)
        print >>OUT, '\t'.join(annot_header)

    qn = 0
    for line in open(seed_orthologs_file):
        if not line.strip() or line.startswith('#'):
            continue
        if qn and (qn % 500 == 0):
            total_time = time.time() - start_time
            print >>sys.stderr, qn+1, total_time, "%0.2f q/s (refinement)" % (
                (float(qn + 1) / total_time))
            sys.stderr.flush()
        qn += 1
        r = map(str.strip, line.split('\t'))

        query_name = r[0]
        best_hit_name = r[1]
        if best_hit_name == '-' or best_hit_name == 'ERROR':
            continue

        best_hit_evalue = float(r[2])
        best_hit_score = float(r[3])
        if best_hit_score < args.seed_ortholog_score or best_hit_evalue > args.seed_ortholog_evalue:
            continue

        match_nogs = annota.get_member_ogs(best_hit_name)
        if not match_nogs:
            continue

        match_levels = set([nog.split("@")[1] for nog in match_nogs])
        if args.tax_scope == "auto":
            for level in TAXONOMIC_RESOLUTION:
                if level in match_levels:
                    annot_levels = set(LEVEL_CONTENT.get(level, [level]))
                    annot_levels.add(level)
                    annot_level_max = "%s[%d]" %(level, len(annot_levels))
                    break
        else:
            annot_levels = set(LEVEL_CONTENT.get(args.tax_scope, [args.tax_scope]))
            annot_levels.add(args.tax_scope)
            annot_level_max = "%s[%d]" %(args.tax_scope, len(annot_levels))

        all_orthologies = annota.get_member_orthologs(best_hit_name, target_levels=annot_levels)
        orthologs = sorted(all_orthologies[args.target_orthologs])
        if args.excluded_taxa:
            orthologs = [o for o in orthologs if not o.startswith("%s." %args.excluded_taxa)]

        if orthologs:
            pname, gos, keggs = annota.get_member_annotations(orthologs,
                                                              excluded_gos=set(["IEA", "ND"]))
            best_name = ''
            if pname:
                name_candidate, freq = pname.most_common(1)[0]
                if freq >= 2:
                    best_name = name_candidate
        else:
            pname = []
            best_name = ''
            gos = set()
            keggs = set()

        if query_name in seq2bestOG:
            (hitname, evalue, score, qlength, hmmfrom, hmmto, seqfrom,
             seqto, q_coverage) = seq2bestOG[query_name]
            bestOG = '%s|%s|%s' %(hitname, evalue, score)
            og_cat, og_desc = seq2annotOG.get(hitname, ['', ''])
        else:
            bestOG = 'NA|NA|NA'
            og_cat, og_desc = '', ''


        print >>OUT, '\t'.join(map(str, (query_name,
                                         best_hit_name,
                                         best_hit_evalue,
                                         best_hit_score,
                                         best_name,
                                         ','.join(sorted(gos)),
                                         ','.join(sorted(map(lambda x: "map%05d"%x, map(int, keggs)))),
                                         annot_level_max,
                                         ','.join(match_nogs),
                                         bestOG,
                                         og_cat.replace('\n', ''),
                                         og_desc.replace('\n', ' '),
                                         )))
        OUT.flush()
    elapsed_time = time.time() - start_time
    if not args.no_file_comments:
        print >>OUT, '# %d queries scanned' % (qn + 1)
        print >>OUT, '# Total time (seconds):', elapsed_time
        print >>OUT, '# Rate:', "%0.2f q/s" % ((float(qn + 1) / elapsed_time))
    OUT.close()
    print colorify(" Processed queries:%s total_time:%s rate:%s" %\
                   (qn+1, elapsed_time, "%0.2f q/s" % ((float(qn+1) / elapsed_time))), 'lblue')