def find_orthologs_per_hit(arguments):
    """Look up the orthologs of the best hit found on one hits-table line.

    ``arguments`` is a ``(line, args)`` pair.  ``line`` is a tab-separated
    record whose first four columns are read as query name, best hit name,
    e-value and score; ``args`` must provide ``orthology_type``.

    Returns ``None`` for blank lines, ``#`` comment lines and records whose
    hit is ``'-'`` or ``'ERROR'``.  Otherwise returns the tuple
    ``(query_name, [], taxid, orthologs)``, where ``taxid`` is the part of
    the query name before its first ``.`` and ``orthologs`` is the sorted
    ``args.orthology_type`` entry of ``annota.get_member_orthologs(hit)``.
    """
    annota.connect()
    line, args = arguments

    if line.startswith('#') or not line.strip():
        return None

    columns = [str.strip(field) for field in line.split('\t')]
    query_name = columns[0]
    best_hit_name = columns[1]
    if best_hit_name in ('-', 'ERROR'):
        return None

    # The e-value and score are parsed but not used for filtering: the
    # seed_ortholog_score / seed_ortholog_evalue cut-offs are not applied
    # here (whether they should be is an open question).  The conversions
    # still reject records whose columns are not numeric.
    best_hit_evalue = float(columns[2])
    best_hit_score = float(columns[3])

    member_orthologs = annota.get_member_orthologs(best_hit_name)
    orthologs = sorted(member_orthologs[args.orthology_type])
    # target species and taxid to be added
    taxid = query_name.partition(".")[0]
    return (query_name, [], taxid, orthologs)
def main(args):
    """Search the input sequences with HMMER and dump their orthologs.

    Reads ``args.input``, ``args.override``, ``args.no_refine`` and
    ``args.db``.  The result file is named ``<file_id>.orthologs``, where
    ``file_id`` is the input file name without directories and without
    anything from the first ``.fa`` onwards.  The intermediate files
    ``tmp.emapper.hmm_hits`` and ``tmp.emapper.seed_orthologs`` are created
    in the current directory and removed at the end.
    """
    file_id = args.input.split("/")[-1].split(".fa")[0]
    orthologs_file = "%s.orthologs" % file_id
    print(orthologs_file)
    hmm_hits_file = "tmp.emapper.hmm_hits"
    seed_orthologs_file = "tmp.emapper.seed_orthologs"

    # Sequence search with hmmer
    host, port, dbpath, scantype, idmap = setup_hmm_search(args)
    # Start HMM SCANNING sequences (if requested)
    if not pexists(hmm_hits_file) or args.override:
        dump_hmm_matches(args.input, hmm_hits_file, dbpath, port, scantype,
                         idmap, args)

        # NOTE(review): refinement is nested under the scan, so it only runs
        # when the scan itself ran in this call - confirm this is intended.
        if not args.no_refine and (not pexists(seed_orthologs_file)
                                   or args.override):
            if args.db == 'viruses':
                print('Skipping seed ortholog detection in "viruses" database')
            elif args.db in EGGNOG_DATABASES:
                refine_matches(args.input, seed_orthologs_file, hmm_hits_file,
                               args)
            else:
                print('refined hits not available for custom hmm databases.')

    # Orthologs search
    annota.connect()
    find_orthologs(seed_orthologs_file, orthologs_file, hmm_hits_file, args)

    # Remove the intermediate files directly instead of spawning a shell.
    # A file that was never created (e.g. no refinement step) is skipped,
    # and a failed removal is reported without aborting the finished run.
    for tmp_file in (hmm_hits_file, seed_orthologs_file):
        if not pexists(tmp_file):
            continue
        try:
            os.remove(tmp_file)
        except OSError as err:
            print("Could not remove temporary file %s: %s" % (tmp_file, err))

    print("done")
Ejemplo n.º 3
0
def annotate_hit_line(arguments):
    """Annotate the query described by one seed-ortholog line.

    ``arguments`` is a ``(line, args)`` pair.  ``line`` is a tab-separated
    record whose first four columns are query name, best hit name, e-value
    and score.  ``args`` provides ``seed_ortholog_score``,
    ``seed_ortholog_evalue``, ``tax_scope``, ``target_orthologs``,
    ``excluded_taxa``, ``go_evidence`` and ``go_excluded``.

    Returns ``None`` when the query cannot be annotated:

    * blank or ``#`` comment line;
    * hit is ``'-'`` or ``'ERROR'``;
    * score below ``args.seed_ortholog_score`` or e-value above
      ``args.seed_ortholog_evalue``;
    * ``annota.get_member_ogs`` returns nothing for the hit;
    * ``args.tax_scope`` is ``"auto"`` and none of the
      ``TAXONOMIC_RESOLUTION`` levels occurs among the hit's OG levels.

    Otherwise returns the tuple ``(query_name, best_hit_name,
    best_hit_evalue, best_hit_score, best_name, gos, kegg, bigg,
    annot_level_max, match_nogs, orthologs)``.
    """
    annota.connect()
    line, args = arguments

    if not line.strip() or line.startswith('#'):
        return None
    r = [str.strip(field) for field in line.split('\t')]

    query_name = r[0]
    best_hit_name = r[1]
    if best_hit_name == '-' or best_hit_name == 'ERROR':
        return None

    best_hit_evalue = float(r[2])
    best_hit_score = float(r[3])
    if (best_hit_score < args.seed_ortholog_score
            or best_hit_evalue > args.seed_ortholog_evalue):
        return None

    match_nogs = annota.get_member_ogs(best_hit_name)
    if not match_nogs:
        return None

    # OG names are split on "@"; the second part is used as the level.
    match_levels = set(nog.split("@")[1] for nog in match_nogs)
    if args.tax_scope == "auto":
        # Use the first level of TAXONOMIC_RESOLUTION the hit belongs to.
        for level in TAXONOMIC_RESOLUTION:
            if level in match_levels:
                scope = level
                break
        else:
            # No level matched: there is no scope to annotate at, so report
            # "no annotation" instead of failing on unbound variables below.
            return None
    else:
        scope = args.tax_scope

    annot_levels = set(LEVEL_CONTENT.get(scope, [scope]))
    annot_levels.add(scope)
    annot_level_max = "%s[%d]" % (scope, len(annot_levels))

    all_orthologies = annota.get_member_orthologs(best_hit_name,
                                                  target_levels=annot_levels)
    orthologs = sorted(all_orthologies[args.target_orthologs])
    if args.excluded_taxa:
        excluded_prefix = "%s." % args.excluded_taxa
        orthologs = [o for o in orthologs if not o.startswith(excluded_prefix)]

    # Defaults used when no ortholog is left to summarise.
    best_name = ''
    gos, kegg, bigg = set(), set(), set()
    if orthologs:
        pname, gos, kegg, bigg = annota.summarize_annotations(
            orthologs,
            target_go_ev=args.go_evidence,
            excluded_go_ev=args.go_excluded)
        if pname:
            # Only accept the most common name when it occurs at least twice.
            name_candidate, freq = pname.most_common(1)[0]
            if freq >= 2:
                best_name = name_candidate

    return (query_name, best_hit_name, best_hit_evalue, best_hit_score,
            best_name, gos, kegg, bigg, annot_level_max, match_nogs, orthologs)
Ejemplo n.º 4
0
def annotate_hmm_matches(hits_file, hits_annot_file, args):
    """Write an annotated copy of an HMM hits table.

    Every non-blank, non-comment line of ``hits_file`` must hold ten
    tab-separated fields: query, hit, evalue, sum_score, query_length,
    hmmfrom, hmmto, seqfrom, seqto and query coverage.  For each such line
    one row is written to ``hits_annot_file``:

    * for a regular hit, the input fields (with the hit name passed through
      ``cleanup_og_name``) plus the level and the three annotation values
      returned by ``annota.get_og_annotations``;
    * for a ``'-'`` or ``'ERROR'`` hit, the query followed by that marker
      repeated in every remaining column.

    Unless ``args.no_file_comments`` is set, the output also gets the call
    info and a header line at the top and a three-line summary at the end.
    Nothing is written when ``hits_file`` does not exist.  A rate of
    ``0.00 q/s`` is reported if no measurable time has elapsed.
    """
    hits_annot_header = [
        '#query_name', 'hit', 'level', 'evalue', 'sum_score', 'query_length',
        'hmmfrom', 'hmmto', 'seqfrom', 'seqto', 'query_coverage',
        'members_in_og', 'og_description', 'og_COG_categories',
    ]

    annota.connect()
    print(colorify("Functional annotation of hits starts now", 'green'))
    start_time = time.time()
    if not pexists(hits_file):
        return

    # Both files are closed even if a malformed line raises half-way.
    with open(hits_annot_file, "w") as OUT, open(hits_file) as hits:
        if not args.no_file_comments:
            OUT.write(str(get_call_info()) + '\n')
            OUT.write('\t'.join(hits_annot_header) + '\n')
        qn = 0
        t1 = time.time()
        for line in hits:
            if not line.strip() or line.startswith('#'):
                continue
            qn += 1
            if qn % 10000 == 0:
                # Progress report on stderr every 10000 queries.
                total_time = time.time() - start_time
                rate = qn / total_time if total_time else 0.0
                sys.stderr.write("%s %s %0.2f q/s (refinement)\n" %
                                 (qn, total_time, rate))
                sys.stderr.flush()

            (query, hit, evalue, sum_score, query_length, hmmfrom, hmmto,
             seqfrom, seqto, q_coverage) = [
                 str.strip(field) for field in line.split('\t')]
            if hit not in ('ERROR', '-'):
                hitname = cleanup_og_name(hit)
                level, nm, desc, cats = annota.get_og_annotations(hitname)
                row = map(str, [
                    query, hitname, level, evalue, sum_score, query_length,
                    hmmfrom, hmmto, seqfrom, seqto, q_coverage, nm, desc,
                    cats
                ])
            else:
                row = [query] + [hit] * (len(hits_annot_header) - 1)
            OUT.write('\t'.join(row) + '\n')

        elapsed_time = time.time() - t1
        # Guard the division: a tiny or empty input can finish in "0" seconds.
        rate_text = "%0.2f q/s" % (qn / elapsed_time if elapsed_time else 0.0)
        if not args.no_file_comments:
            OUT.write('# %d queries scanned\n' % qn)
            OUT.write('# Total time (seconds): %s\n' % elapsed_time)
            OUT.write('# Rate: %s\n' % rate_text)

    print(colorify(" Processed queries:%s total_time:%s rate:%s" %
                   (qn, elapsed_time, rate_text), 'lblue'))
Ejemplo n.º 5
0
def annotate_hmm_matches(hits_file, hits_annot_file, args):
    """Annotate each line of an HMM hits table and write the result.

    ``hits_file`` is read line by line; blank lines and ``#`` comments are
    skipped, every other line must split into ten tab-separated fields
    (query, hit, evalue, sum_score, query_length, hmmfrom, hmmto, seqfrom,
    seqto, query coverage).  One output row per line goes to
    ``hits_annot_file``: either the fields extended with the values from
    ``annota.get_og_annotations`` for the cleaned hit name, or - for
    ``'-'`` / ``'ERROR'`` hits - the query followed by that marker in all
    remaining columns.

    When ``args.no_file_comments`` is false the output starts with the call
    info and a header line and ends with a summary.  The summary and the
    final console message report the number of lines actually processed.
    If ``hits_file`` does not exist nothing is written.
    """
    hits_annot_header = [
        '#query_name', 'hit', 'level', 'evalue', 'sum_score', 'query_length',
        'hmmfrom', 'hmmto', 'seqfrom', 'seqto', 'query_coverage',
        'members_in_og', 'og_description', 'og_COG_categories',
    ]

    annota.connect()
    print(colorify("Functional annotation of hits starts now", 'green'))
    start_time = time.time()
    if not pexists(hits_file):
        return

    # Context managers guarantee both handles are closed on any exit path.
    with open(hits_annot_file, "w") as OUT, open(hits_file) as hits:
        if not args.no_file_comments:
            OUT.write(str(get_call_info()) + '\n')
            OUT.write('\t'.join(hits_annot_header) + '\n')
        qn = 0
        t1 = time.time()
        for line in hits:
            if not line.strip() or line.startswith('#'):
                continue
            # Count the line first so qn is always "queries seen so far";
            # every figure reported below uses qn itself, never qn + 1.
            qn += 1
            if qn % 10000 == 0:
                total_time = time.time() - start_time
                rate = qn / total_time if total_time else 0.0
                sys.stderr.write("%s %s %0.2f q/s (refinement)\n" %
                                 (qn, total_time, rate))
                sys.stderr.flush()

            (query, hit, evalue, sum_score, query_length, hmmfrom, hmmto,
             seqfrom, seqto, q_coverage) = [
                 str.strip(field) for field in line.split('\t')]
            if hit not in ('ERROR', '-'):
                hitname = cleanup_og_name(hit)
                level, nm, desc, cats = annota.get_og_annotations(hitname)
                row = map(str, [query, hitname, level, evalue,
                                sum_score, query_length,
                                hmmfrom, hmmto, seqfrom,
                                seqto, q_coverage, nm, desc,
                                cats])
            else:
                row = [query] + [hit] * (len(hits_annot_header) - 1)
            OUT.write('\t'.join(row) + '\n')

        elapsed_time = time.time() - t1
        # Avoid ZeroDivisionError when no measurable time has elapsed.
        rate_text = "%0.2f q/s" % (qn / elapsed_time if elapsed_time else 0.0)
        if not args.no_file_comments:
            OUT.write('# %d queries scanned\n' % qn)
            OUT.write('# Total time (seconds): %s\n' % elapsed_time)
            OUT.write('# Rate: %s\n' % rate_text)

    print(colorify(" Processed queries:%s total_time:%s rate:%s" %
                   (qn, elapsed_time, rate_text), 'lblue'))
Ejemplo n.º 6
0
def get_seq_hmm_matches(hits_file):
    """Return the first usable HMM hit of every query listed in ``hits_file``.

    Blank lines and ``#`` comments are skipped; every other line must hold
    ten tab-separated fields (query, hit, evalue, sum_score, query_length,
    hmmfrom, hmmto, seqfrom, seqto, query coverage).  Hits named ``'-'`` or
    ``'ERROR'`` are ignored, and only the first remaining line of each
    query is kept.

    Returns a dict mapping the query name to ``[hitname, evalue, sum_score,
    query_length, hmmfrom, hmmto, seqfrom, seqto, q_coverage]``, where
    ``hitname`` is the hit passed through ``cleanup_og_name`` and all other
    values are the stripped strings read from the file.  The dict is empty
    when ``hits_file`` does not exist.
    """
    annota.connect()
    print(colorify("Reading HMM matches", 'green'))
    seq2oginfo = {}
    if not pexists(hits_file):
        return seq2oginfo

    # "with" closes the table even if a malformed line aborts the loop.
    with open(hits_file) as hits:
        for line in hits:
            if not line.strip() or line.startswith('#'):
                continue

            (query, hit, evalue, sum_score, query_length, hmmfrom, hmmto,
             seqfrom, seqto, q_coverage) = [
                 str.strip(field) for field in line.split('\t')]

            if query not in seq2oginfo and hit not in ('ERROR', '-'):
                hitname = cleanup_og_name(hit)
                seq2oginfo[query] = [hitname, evalue, sum_score, query_length,
                                     hmmfrom, hmmto, seqfrom, seqto,
                                     q_coverage]
    return seq2oginfo
Ejemplo n.º 7
0
def get_seq_hmm_matches(hits_file):
    """Collect, per query, the first valid hit found in an HMM hits table.

    Each data line of ``hits_file`` (blank lines and ``#`` comments are
    skipped) must split into ten tab-separated fields: query, hit, evalue,
    sum_score, query_length, hmmfrom, hmmto, seqfrom, seqto and query
    coverage.  Lines whose hit is ``'-'`` or ``'ERROR'`` are ignored; for
    every query only its first remaining line is recorded.

    Returns a dict ``{query: [hitname, evalue, sum_score, query_length,
    hmmfrom, hmmto, seqfrom, seqto, q_coverage]}`` with ``hitname`` being
    ``cleanup_og_name(hit)`` and the other entries the stripped field
    strings.  A missing ``hits_file`` yields an empty dict.
    """
    annota.connect()
    print(colorify("Reading HMM matches", 'green'))
    seq2oginfo = {}
    if not pexists(hits_file):
        return seq2oginfo

    # The file handle is released deterministically, also on errors.
    with open(hits_file) as hits:
        for line in hits:
            if not line.strip() or line.startswith('#'):
                continue

            (query, hit, evalue, sum_score, query_length, hmmfrom, hmmto,
             seqfrom, seqto, q_coverage) = [
                 str.strip(field) for field in line.split('\t')]

            # Keep only the first non-error hit seen for each query.
            if query in seq2oginfo or hit in ('ERROR', '-'):
                continue
            seq2oginfo[query] = [cleanup_og_name(hit), evalue, sum_score,
                                 query_length, hmmfrom, hmmto, seqfrom,
                                 seqto, q_coverage]
    return seq2oginfo
Ejemplo n.º 8
0
def _annotate_hit_line(arguments):
    """Annotate the query described by one seed-ortholog line.

    ``arguments`` is a ``(line, args)`` pair.  ``line`` is a tab-separated
    record whose first four columns are query name, best hit name, e-value
    and score.  ``args`` provides ``seed_ortholog_score``,
    ``seed_ortholog_evalue``, ``tax_scope``, ``target_taxa``,
    ``target_orthologs``, ``excluded_taxa``, ``go_evidence`` and
    ``go_excluded``.

    Returns ``None`` when the query cannot be annotated:

    * blank or ``#`` comment line;
    * hit is ``'-'`` or ``'ERROR'``;
    * score below ``args.seed_ortholog_score`` or e-value above
      ``args.seed_ortholog_evalue``;
    * ``annota.get_member_ogs`` returns nothing for the hit;
    * ``args.tax_scope`` is ``"auto"`` and none of the
      ``TAXONOMIC_RESOLUTION`` levels occurs among the hit's levels.

    Otherwise returns the tuple ``(query_name, best_hit_name,
    best_hit_evalue, best_hit_score, annotations, annot_level_max,
    swallowest_level, match_nogs, orthologs)``.  ``orthologs`` is ``None``
    if the ortholog lookup raised, and ``annotations`` is ``{}`` whenever
    there are no orthologs to summarise.
    """
    annota.connect()
    line, args = arguments

    if not line.strip() or line.startswith('#'):
        return None
    r = [str.strip(field) for field in line.split('\t')]

    query_name = r[0]
    best_hit_name = r[1]
    if best_hit_name == '-' or best_hit_name == 'ERROR':
        return None

    best_hit_evalue = float(r[2])
    best_hit_score = float(r[3])
    if (best_hit_score < args.seed_ortholog_score
            or best_hit_evalue > args.seed_ortholog_evalue):
        return None

    match_nogs = annota.get_member_ogs(best_hit_name)
    if not match_nogs:
        return None

    # Union of the LEVEL_PARENTS entries of every level the hit's OGs
    # belong to (the level is the part of the OG name after "@").
    match_levels = set()
    for nog in match_nogs:
        match_levels.update(LEVEL_PARENTS[nog.split("@")[1]])

    # Among the matched levels known to LEVEL_DEPTH, pick the one with the
    # largest depth value.
    swallowest_level = sorted(match_levels & set(LEVEL_DEPTH),
                              key=lambda x: LEVEL_DEPTH[x],
                              reverse=True)[0]

    if args.tax_scope == "auto":
        # Use the first level of TAXONOMIC_RESOLUTION the hit belongs to.
        for level in TAXONOMIC_RESOLUTION:
            if level in match_levels:
                scope = level
                break
        else:
            # No level matched: bail out before the ortholog lookup instead
            # of failing on an unbound annot_level_max at the return below.
            return None
    else:
        scope = args.tax_scope

    annot_levels = set([scope])
    annot_level_max = LEVEL_NAMES.get(scope, scope)

    if args.target_taxa != 'all':
        target_taxa = orthology.normalize_target_taxa(args.target_taxa)
    else:
        target_taxa = None

    try:
        all_orthologies = annota.get_member_orthologs(
            best_hit_name, target_taxa=target_taxa, target_levels=annot_levels)
    except Exception:
        # Best effort kept from the original: a failed lookup is reported
        # as "no orthologs" (None) rather than aborting the whole run.
        orthologs = None
    else:
        orthologs = sorted(all_orthologies[args.target_orthologs])
        if args.excluded_taxa:
            excluded_prefix = "%s." % args.excluded_taxa
            orthologs = [
                o for o in orthologs if not o.startswith(excluded_prefix)
            ]

    if orthologs:
        annotations = annota.summarize_annotations(
            orthologs,
            target_go_ev=args.go_evidence,
            excluded_go_ev=args.go_excluded)
    else:
        annotations = {}

    return (query_name, best_hit_name, best_hit_evalue, best_hit_score,
            annotations, annot_level_max, swallowest_level, match_nogs,
            orthologs)
Ejemplo n.º 9
0
def main(args):
    """Run the emapper pipeline: sequence search, annotation, orthologs.

    File names are derived from ``args.output``; work happens inside
    ``args.output_dir`` or, when given, ``args.scratch_dir``.

    Steps:

    1. Refuse to continue (``emapperException``) if expected output files
       already exist and neither ``args.resume`` nor ``args.override`` is
       set; with ``args.override`` they are removed first.
    2. Unless ``args.no_search``: run the diamond or hmmer search selected
       by ``args.mode`` (hmmer hits are refined unless ``args.no_refine``).
    3. Unless ``args.no_annot``: annotate ``args.annotate_hits_table`` if
       given, the raw HMM hits for the ``viruses`` database, or the seed
       orthologs otherwise.
    4. With ``args.predict_ortho``: dump the predicted orthologs.
    5. With ``args.scratch_dir``: copy every existing output file back to
       ``args.output_dir``.

    Raises ``IOError`` if ``args.annotate_hits_table`` is set but missing.
    """
    # Output and intermediate files
    hmm_hits_file = "%s.emapper.hmm_hits" % args.output
    seed_orthologs_file = "%s.emapper.seed_orthologs" % args.output
    annot_file = "%s.emapper.annotations" % args.output
    orthologs_file = "%s.emapper.predict_orthologs" % args.output

    if args.no_search:
        output_files = [annot_file]
    elif args.no_annot:
        output_files = [hmm_hits_file, seed_orthologs_file]
    else:
        output_files = [hmm_hits_file, seed_orthologs_file, annot_file]

    # convert to absolute path before changing directory
    if args.annotate_hits_table:
        args.annotate_hits_table = os.path.abspath(args.annotate_hits_table)

    # force user to decide what to do with existing files
    os.chdir(args.output_dir)
    if (any(pexists(fname) for fname in output_files)
            and not args.resume and not args.override):
        print("Output files detected in disk. "
              "Use --resume or --override to continue")
        raise emapperException()

    if args.override:
        for outf in output_files:
            silent_rm(outf)

    print('#  ' + str(get_version()))
    print('# ./emapper.py  ' + ' '.join(sys.argv[1:]))

    if args.scratch_dir:
        # If resuming and using --scratch_dir, transfer existing files.
        if args.resume:
            for f in output_files:
                if pexists(f):
                    print("   Copying input file %s to scratch dir %s" %
                          (f, args.scratch_dir))
                    shutil.copy(f, args.scratch_dir)

        # Change working dir
        os.chdir(args.scratch_dir)

    # Step 1. Sequence search
    if not args.no_search:
        if args.mode == 'diamond':
            dump_diamond_matches(args.input, seed_orthologs_file, args)

        elif args.mode == 'hmmer':
            host, port, dbpath, scantype, idmap = setup_hmm_search(args)
            # Start HMM SCANNING sequences (if requested)
            if not pexists(hmm_hits_file) or args.override:
                dump_hmm_matches(args.input, hmm_hits_file, dbpath, port,
                                 scantype, idmap, args)

            if not args.no_refine and (not pexists(seed_orthologs_file)
                                       or args.override):
                if args.db == 'viruses':
                    print('Skipping seed ortholog detection in "viruses" '
                          'database')
                elif args.db in EGGNOG_DATABASES:
                    refine_matches(args.input, seed_orthologs_file,
                                   hmm_hits_file, args)
                else:
                    print('refined hits not available for custom hmm '
                          'databases.')

    # Step 2. Annotation
    if not args.no_annot:
        annota.connect()
        if args.annotate_hits_table:
            if not os.path.exists(args.annotate_hits_table):
                raise IOError(errno.ENOENT, os.strerror(errno.ENOENT),
                              args.annotate_hits_table)
            annotate_hits_file(args.annotate_hits_table, annot_file,
                               hmm_hits_file, args)
        elif args.db == 'viruses':
            # No seed orthologs for viruses: annotate the raw HMM hits and
            # reshape them into the column layout of the annotations file.
            hits_annot_file = hmm_hits_file + '.annotations'
            annotate_hmm_matches(hmm_hits_file, hits_annot_file, args)
            with open(annot_file, 'w') as OUT, \
                    open(hits_annot_file) as hits_annot:
                for line in hits_annot:
                    if line.startswith('#') or not line.strip():
                        continue
                    (query, hitname, level, evalue, sum_score, query_length,
                     hmmfrom, hmmto, seqfrom, seqto, q_coverage, nm, desc,
                     cats) = line.split("\t")

                    if hitname != '-' and hitname != 'ERROR':
                        OUT.write('\t'.join(
                            (query, hitname, evalue, sum_score, '', '', '',
                             'viruses', hitname + "@viruses",
                             "%s|%s|%s" % (hitname, evalue, sum_score),
                             cats.replace('\n', ''),
                             desc.replace('\n', ' '))) + '\n')
        else:
            annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file,
                               args)

    if args.predict_ortho:
        orthology.connect()
        dump_orthologs(seed_orthologs_file, orthologs_file, args)

    # If running in scratch, copy files to the real output dir
    if args.scratch_dir:
        for fname in output_files:
            if pexists(fname):
                print(" Copying result file %s from scratch to %s" %
                      (fname, args.output_dir))
                # Copy the file being reported, not annot_file every time.
                shutil.copy(fname, args.output_dir)
                # NOTE(review): the message below announces a clean-up but
                # nothing is removed from the scratch dir - confirm intent.
                print("  Cleaning result file %s from scratch dir" % (fname))

    # Finalize and exit
    print(colorify('Done', 'green'))
    # Print the heading once; previously its text was built and discarded.
    print(colorify('Result files:', 'yellow'))
    for f in output_files:
        if pexists(f):
            print("   %s" % (f))

    print('Total time: %g secs' % (time.time() - _total_time))

    if args.mode == 'hmmer':
        print(get_citation(['hmmer']))
    elif args.mode == 'diamond':
        print(get_citation(['diamond']))

    shutdown_server()
Ejemplo n.º 10
0
def main(args):
    """Run the emapper pipeline: sequence search followed by annotation.

    File names are derived from ``args.output``; work happens inside
    ``args.output_dir`` or, when given, ``args.scratch_dir``.

    Steps:

    1. Refuse to continue (``emapperException``) if expected output files
       already exist and neither ``args.resume`` nor ``args.override`` is
       set; with ``args.override`` they are removed first.
    2. Unless ``args.no_search``: run the diamond or hmmer search selected
       by ``args.mode`` (hmmer hits are refined unless ``args.no_refine``).
    3. Unless ``args.no_annot``: annotate ``args.annotate_hits_table`` if
       given, the raw HMM hits for the ``viruses`` database, or the seed
       orthologs otherwise.
    4. With ``args.scratch_dir``: copy every existing output file back to
       ``args.output_dir``.

    A relative ``args.annotate_hits_table`` is resolved against the
    directory the program was started from, i.e. before any ``chdir``.
    """
    # Output and intermediate files
    hmm_hits_file = "%s.emapper.hmm_hits" % args.output
    seed_orthologs_file = "%s.emapper.seed_orthologs" % args.output
    annot_file = "%s.emapper.annotations" % args.output

    if args.no_search:
        output_files = [annot_file]
    elif args.no_annot:
        output_files = [hmm_hits_file, seed_orthologs_file]
    else:
        output_files = [hmm_hits_file, seed_orthologs_file, annot_file]

    # Make the hits table path absolute before changing directory;
    # otherwise a relative path would be looked up inside the output or
    # scratch dir instead of where the user pointed to.
    if args.annotate_hits_table:
        args.annotate_hits_table = os.path.abspath(args.annotate_hits_table)

    # force user to decide what to do with existing files
    os.chdir(args.output_dir)
    if (any(pexists(fname) for fname in output_files)
            and not args.resume and not args.override):
        print("Output files detected in disk. "
              "Use --resume or --override to continue")
        raise emapperException()

    if args.override:
        for outf in output_files:
            silent_rm(outf)

    print('#  ' + str(get_version()))
    print('# ./emapper.py  ' + ' '.join(sys.argv[1:]))

    if args.scratch_dir:
        # If resuming and using --scratch_dir, transfer existing files.
        if args.resume:
            for f in output_files:
                if pexists(f):
                    print("   Copying input file %s to scratch dir %s" %
                          (f, args.scratch_dir))
                    shutil.copy(f, args.scratch_dir)

        # Change working dir
        os.chdir(args.scratch_dir)

    # Step 1. Sequence search
    if not args.no_search:
        if args.mode == 'diamond':
            dump_diamond_matches(args.input, seed_orthologs_file, args)

        elif args.mode == 'hmmer':
            host, port, dbpath, scantype, idmap = setup_hmm_search(args)
            # Start HMM SCANNING sequences (if requested)
            if not pexists(hmm_hits_file) or args.override:
                dump_hmm_matches(args.input, hmm_hits_file, dbpath, port,
                                 scantype, idmap, args)

            if not args.no_refine and (not pexists(seed_orthologs_file)
                                       or args.override):
                if args.db == 'viruses':
                    print('Skipping seed ortholog detection in "viruses" '
                          'database')
                elif args.db in EGGNOG_DATABASES:
                    refine_matches(args.input, seed_orthologs_file,
                                   hmm_hits_file, args)
                else:
                    print('refined hits not available for custom hmm '
                          'databases.')

    # Step 2. Annotation
    if not args.no_annot:
        annota.connect()
        if args.annotate_hits_table:
            annotate_hits_file(args.annotate_hits_table, annot_file,
                               hmm_hits_file, args)
        elif args.db == 'viruses':
            # No seed orthologs for viruses: annotate the raw HMM hits and
            # reshape them into the column layout of the annotations file.
            hits_annot_file = hmm_hits_file + '.annotations'
            annotate_hmm_matches(hmm_hits_file, hits_annot_file, args)
            with open(annot_file, 'w') as OUT, \
                    open(hits_annot_file) as hits_annot:
                for line in hits_annot:
                    if line.startswith('#') or not line.strip():
                        continue
                    (query, hitname, level, evalue, sum_score, query_length,
                     hmmfrom, hmmto, seqfrom, seqto, q_coverage, nm, desc,
                     cats) = line.split("\t")

                    if hitname != '-' and hitname != 'ERROR':
                        OUT.write('\t'.join(
                            (query,
                             hitname,
                             evalue,
                             sum_score,
                             '',
                             '',
                             '',
                             'viruses',
                             hitname + "@viruses",
                             "%s|%s|%s" % (hitname, evalue, sum_score),
                             cats.replace('\n', ''),
                             desc.replace('\n', ' '))) + '\n')
        else:
            annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file,
                               args)

    # If running in scratch, copy files to the real output dir
    if args.scratch_dir:
        for fname in output_files:
            if pexists(fname):
                print(" Copying result file %s from scratch to %s" %
                      (fname, args.output_dir))
                # Copy the file being reported, not annot_file every time.
                shutil.copy(fname, args.output_dir)
                # NOTE(review): the message below announces a clean-up but
                # nothing is removed from the scratch dir - confirm intent.
                print("  Cleaning result file %s from scratch dir" % (fname))

    # Finalize and exit
    print(colorify('Done', 'green'))
    # Print the heading once; previously its text was built and discarded.
    print(colorify('Result files:', 'yellow'))
    for f in output_files:
        if pexists(f):
            print("   %s" % (f))

    print('Total time: %g secs' % (time.time() - _total_time))

    if args.mode == 'hmmer':
        print(get_citation(['hmmer']))
    elif args.mode == 'diamond':
        print(get_citation(['diamond']))

    shutdown_server()