# Example no. 1
# 0
def find_orthologs_per_hit(arguments):
    """Predict orthologs for the best hit reported on one hits-table line.

    Copied from predict_orthologs.py.

    Args:
        arguments: a ``(line, args)`` pair. ``line`` is one tab-separated
            row whose first four fields are read as query name, best hit
            name, best hit e-value and best hit score. ``args`` must
            provide ``seed_ortholog_score``, ``seed_ortholog_evalue`` and
            ``_expanded_target_taxa``.

    Returns:
        ``(query_name, best_hit_name, orthologs_pred)`` where
        ``orthologs_pred`` is whatever
        ``orthology.predict_orthologs_by_seed`` returns, or ``None`` when
        the line is blank, starts with ``#``, reports ``-``/``ERROR`` as
        best hit, or does not pass the score/e-value thresholds.

    Raises:
        IndexError: if the line has fewer than four tab-separated fields.
        ValueError: if the e-value or score field is not a valid float.
    """
    # NOTE(review): connect() runs on every call, even for lines that are
    # skipped below -- kept first to preserve the original call order.
    orthology.connect()
    line, args = arguments

    if not line.strip() or line.startswith('#'):
        return None

    # Build a real list: map() returns a non-indexable iterator on
    # Python 3, which would make the field lookups below fail.
    fields = [field.strip() for field in line.split('\t')]

    query_name = fields[0]
    best_hit_name = fields[1]
    if best_hit_name in ('-', 'ERROR'):
        return None

    best_hit_evalue = float(fields[2])
    best_hit_score = float(fields[3])

    # Discard hits that are too weak to be used as seed orthologs.
    if (best_hit_score < args.seed_ortholog_score
            or best_hit_evalue > args.seed_ortholog_evalue):
        return None

    orthologs_pred = orthology.predict_orthologs_by_seed(
        best_hit_name,
        target_taxa=args._expanded_target_taxa,
        target_levels=None)
    return (query_name, best_hit_name, orthologs_pred)
# Example no. 2
# 0
def main(args):
    """Run the emapper workflow: sequence search, annotation, clean-up.

    The function changes the working directory to ``args.output_dir`` (and
    then to ``args.scratch_dir`` when one is given), so all output file
    names below are relative to the current working directory.

    Args:
        args: parsed command-line options. Attributes read here:
            ``output``, ``output_dir``, ``scratch_dir``, ``input``,
            ``mode``, ``db``, ``no_search``, ``no_annot``, ``no_refine``,
            ``resume``, ``override``, ``annotate_hits_table`` and
            ``predict_ortho``. ``annotate_hits_table`` is rewritten in
            place to an absolute path.

    Raises:
        emapperException: if output files already exist and neither
            ``--resume`` nor ``--override`` was requested.
        IOError: if ``args.annotate_hits_table`` is set but the file does
            not exist when the annotation step starts.

    Messages are printed with the single-argument, parenthesised form of
    ``print`` so the emitted text is the same on Python 2 and Python 3.
    """
    # Output and intermediate files
    hmm_hits_file = "%s.emapper.hmm_hits" % args.output
    seed_orthologs_file = "%s.emapper.seed_orthologs" % args.output
    annot_file = "%s.emapper.annotations" % args.output
    orthologs_file = "%s.emapper.predict_orthologs" % args.output

    if args.no_search:
        output_files = [annot_file]
    elif args.no_annot:
        output_files = [hmm_hits_file, seed_orthologs_file]
    else:
        output_files = [hmm_hits_file, seed_orthologs_file, annot_file]

    # convert to absolute path before changing directory
    if args.annotate_hits_table:
        args.annotate_hits_table = os.path.abspath(args.annotate_hits_table)

    # force user to decide what to do with existing files
    os.chdir(args.output_dir)
    if (any(pexists(fname) for fname in output_files)
            and not args.resume and not args.override):
        print("Output files detected in disk. Use --resume or --override to continue")
        raise emapperException()

    if args.override:
        for outf in output_files:
            silent_rm(outf)

    print('%s %s' % ('# ', get_version()))
    print('%s %s' % ('# ./emapper.py ', ' '.join(sys.argv[1:])))

    if args.scratch_dir:
        # If resuming and using --scratch_dir, transfer existing files.
        if args.resume:
            for f in output_files:
                if pexists(f):
                    print("   Copying input file %s to scratch dir %s" % (
                        f, args.scratch_dir))
                    shutil.copy(f, args.scratch_dir)

        # Change working dir
        os.chdir(args.scratch_dir)

    # Step 1. Sequence search
    if not args.no_search:
        if args.mode == 'diamond':
            dump_diamond_matches(args.input, seed_orthologs_file, args)

        elif args.mode == 'hmmer':
            host, port, dbpath, scantype, idmap = setup_hmm_search(args)
            # Start HMM SCANNING sequences (if requested)
            if not pexists(hmm_hits_file) or args.override:
                dump_hmm_matches(args.input, hmm_hits_file, dbpath, port,
                                 scantype, idmap, args)

            if not args.no_refine and (not pexists(seed_orthologs_file)
                                       or args.override):
                if args.db == 'viruses':
                    print('Skipping seed ortholog detection in "viruses" database')
                elif args.db in EGGNOG_DATABASES:
                    refine_matches(args.input, seed_orthologs_file,
                                   hmm_hits_file, args)
                else:
                    print('refined hits not available for custom hmm databases.')

    # Step 2. Annotation
    if not args.no_annot:
        annota.connect()
        if args.annotate_hits_table:
            if not os.path.exists(args.annotate_hits_table):
                raise IOError(errno.ENOENT, os.strerror(errno.ENOENT),
                              args.annotate_hits_table)
            annotate_hits_file(args.annotate_hits_table, annot_file,
                               hmm_hits_file, args)
        elif args.db == 'viruses':
            # No seed orthologs for viruses: annotate the HMM hits directly
            # and reformat them into the annotations file.
            hmm_annot_file = hmm_hits_file + '.annotations'
            annotate_hmm_matches(hmm_hits_file, hmm_annot_file, args)
            # Context managers guarantee both handles are closed, even if
            # a malformed row makes the unpacking below raise.
            with open(annot_file, 'w') as out, open(hmm_annot_file) as hits:
                for line in hits:
                    if line.startswith('#') or not line.strip():
                        continue
                    (query, hitname, level, evalue, sum_score, query_length,
                     hmmfrom, hmmto, seqfrom, seqto, q_coverage, nm, desc,
                     cats) = line.split("\t")

                    if hitname != '-' and hitname != 'ERROR':
                        row = (query, hitname, evalue, sum_score, '', '', '',
                               'viruses', hitname + "@viruses",
                               "%s|%s|%s" % (hitname, evalue, sum_score),
                               cats.replace('\n', ''),
                               desc.replace('\n', ' '))
                        out.write('\t'.join(map(str, row)) + '\n')
        else:
            annotate_hits_file(seed_orthologs_file, annot_file, hmm_hits_file,
                               args)

    if args.predict_ortho:
        orthology.connect()
        dump_orthologs(seed_orthologs_file, orthologs_file, args)

    # If running in scratch, move files to real output dir and clean up
    if args.scratch_dir:
        for fname in output_files:
            if pexists(fname):
                print(" Copying result file %s from scratch to %s" % (
                    fname, args.output_dir))
                # Copy the file being reported, not always annot_file.
                shutil.copy(fname, args.output_dir)
                # NOTE(review): the message below is printed but no file is
                # removed from the scratch dir here -- confirm intent.
                print("  Cleaning result file %s from scratch dir" % (fname))

    # Finalize and exit
    print(colorify('Done', 'green'))
    # Print the header once; previously its value was built and discarded.
    print(colorify('Result files:', 'yellow'))
    for f in output_files:
        if pexists(f):
            print("   %s" % (f))

    print('Total time: %g secs' % (time.time() - _total_time))

    if args.mode == 'hmmer':
        print(get_citation(['hmmer']))
    elif args.mode == 'diamond':
        print(get_citation(['diamond']))

    shutdown_server()