Ejemplo n.º 1
0
def nucmer(args):
    """
    %prog nucmer ref.fasta query.fasta

    Run NUCMER using query against reference. Parallel implementation derived
    from: <https://github.com/fritzsedlazeck/sge_mummer>
    """
    from itertools import product

    from jcvi.apps.grid import MakeManager
    from jcvi.formats.base import split

    p = OptionParser(nucmer.__doc__)
    p.add_option("--chunks",
                 type="int",
                 help="Split both query and subject into chunks")
    p.set_params(prog="nucmer", params="-g 5000 -l 24 -c 500")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    cpus = opts.cpus
    nrefs = nqueries = opts.chunks or int(cpus**.5)
    refdir = ref.split(".")[0] + "-outdir"
    querydir = query.split(".")[0] + "-outdir"
    reflist = split([ref, refdir, str(nrefs)]).names
    querylist = split([query, querydir, str(nqueries)]).names

    mm = MakeManager()
    for i, (r, q) in enumerate(product(reflist, querylist)):
        pf = "{0:04d}".format(i)
        cmd = "nucmer -maxmatch"
        cmd += " {0}".format(opts.extra)
        cmd += " {0} {1} -p {2}".format(r, q, pf)
        deltafile = pf + ".delta"
        mm.add((r, q), deltafile, cmd)
        print cmd

    mm.write()
Ejemplo n.º 2
0
def nucmer(args):
    """
    %prog nucmer ref.fasta query.fasta

    Run NUCMER using query against reference. Parallel implementation derived
    from: <https://github.com/fritzsedlazeck/sge_mummer>
    """
    from itertools import product

    from jcvi.apps.grid import MakeManager
    from jcvi.formats.base import split

    p = OptionParser(nucmer.__doc__)
    p.add_option("--chunks", type="int",
                 help="Split both query and subject into chunks")
    p.set_params(prog="nucmer", params="-l 100 -c 500")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    cpus = opts.cpus
    nrefs = nqueries = opts.chunks or int(cpus ** .5)
    refdir = ref.split(".")[0] + "-outdir"
    querydir = query.split(".")[0] + "-outdir"
    reflist = split([ref, refdir, str(nrefs)]).names
    querylist = split([query, querydir, str(nqueries)]).names

    mm = MakeManager()
    for i, (r, q) in enumerate(product(reflist, querylist)):
        pf = "{0:04d}".format(i)
        cmd = "nucmer -maxmatch"
        cmd += " {0}".format(opts.extra)
        cmd += " {0} {1} -p {2}".format(r, q, pf)
        deltafile = pf + ".delta"
        mm.add((r, q), deltafile, cmd)
        print cmd

    mm.write()
Ejemplo n.º 3
0
def augustus(args):
    """
    %prog augustus fastafile

    Run parallel AUGUSTUS. Final results can be reformatted using
    annotation.reformat.augustus().
    """
    p = OptionParser(augustus.__doc__)
    p.add_option("--species",
                 default="maize",
                 help="Use species model for prediction")
    p.add_option("--hintsfile", help="Hint-guided AUGUSTUS")
    p.add_option("--nogff3",
                 default=False,
                 action="store_true",
                 help="Turn --gff3=off")
    p.set_home("augustus")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    cpus = opts.cpus
    mhome = opts.augustus_home
    gff3 = not opts.nogff3
    suffix = ".gff3" if gff3 else ".out"
    cfgfile = op.join(mhome, "config/extrinsic/extrinsic.M.RM.E.W.cfg")

    outdir = mkdtemp(dir=".")
    fs = split([fastafile, outdir, str(cpus)])

    augustuswrap_params = partial(
        augustuswrap,
        species=opts.species,
        gff3=gff3,
        cfgfile=cfgfile,
        hintsfile=opts.hintsfile,
    )
    g = Jobs(augustuswrap_params, fs.names)
    g.run()

    gff3files = [x.rsplit(".", 1)[0] + suffix for x in fs.names]
    outfile = fastafile.rsplit(".", 1)[0] + suffix
    FileMerger(gff3files, outfile=outfile).merge()
    shutil.rmtree(outdir)

    if gff3:
        from jcvi.annotation.reformat import augustus as reformat_augustus

        reformat_outfile = outfile.replace(".gff3", ".reformat.gff3")
        reformat_augustus([outfile, "--outfile={0}".format(reformat_outfile)])
Ejemplo n.º 4
0
def parallel_musclewrap(clustfile, cpus, minsamp=0):
    musclewrap_minsamp = partial(musclewrap, minsamp=minsamp)
    if cpus == 1:
        return musclewrap_minsamp(clustfile)

    from jcvi.apps.grid import Jobs

    outdir = mkdtemp(dir=".")
    fs = split([clustfile, outdir, str(cpus), "--format=clust"])
    g = Jobs(musclewrap_minsamp, fs.names)
    g.run()

    clustnames = [x.replace(".clust", ".clustS") for x in fs.names]
    clustSfile = clustfile.replace(".clust", ".clustS")
    FileMerger(clustnames, outfile=clustSfile).merge()
    shutil.rmtree(outdir)
Ejemplo n.º 5
0
def augustus(args):
    """
    %prog augustus fastafile

    Run parallel AUGUSTUS. Final results can be reformatted using
    annotation.reformat.augustus().
    """
    p = OptionParser(augustus.__doc__)
    p.add_option("--species", default="maize",
                 help="Use species model for prediction")
    p.add_option("--hintsfile", help="Hint-guided AUGUSTUS")
    p.add_option("--nogff3", default=False, action="store_true",
                 help="Turn --gff3=off")
    p.set_home("augustus")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    cpus = opts.cpus
    mhome = opts.augustus_home
    gff3 = not opts.nogff3
    suffix = ".gff3" if gff3 else ".out"
    cfgfile = op.join(mhome, "config/extrinsic/extrinsic.M.RM.E.W.cfg")

    outdir = mkdtemp(dir=".")
    fs = split([fastafile, outdir, str(cpus)])

    augustuswrap_params = partial(augustuswrap, species=opts.species,
                            gff3=gff3, cfgfile=cfgfile,
                            hintsfile=opts.hintsfile)
    g = Jobs(augustuswrap_params, fs.names)
    g.run()

    gff3files = [x.rsplit(".", 1)[0] + suffix for x in fs.names]
    outfile = fastafile.rsplit(".", 1)[0] + suffix
    FileMerger(gff3files, outfile=outfile).merge()
    shutil.rmtree(outdir)

    if gff3:
        from jcvi.annotation.reformat import augustus as reformat_augustus
        reformat_outfile = outfile.replace(".gff3", ".reformat.gff3")
        reformat_augustus([outfile, "--outfile={0}".format(reformat_outfile)])
Ejemplo n.º 6
0
def parallel(args):
    """
    %prog parallel genome.fasta N

    Partition the genome into parts and run separately. This is useful if MAKER
    is to be run on the grid.
    """
    from jcvi.formats.base import split

    p = OptionParser(parallel.__doc__)
    p.set_home("maker")
    p.set_tmpdir(tmpdir="tmp")
    p.set_grid_opts(array=True)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    genome, NN = args
    threaded = opts.threaded or 1
    tmpdir = opts.tmpdir

    mkdir(tmpdir)
    tmpdir = get_abs_path(tmpdir)

    N = int(NN)
    assert 1 <= N < 1000, "Required: 1 < N < 1000!"

    outdir = "outdir"
    fs = split([genome, outdir, NN])

    c = CTLFile("maker_opts.ctl")
    c.update_abs_path()
    if threaded > 1:
        c.update_tag("cpus", threaded)

    cwd = os.getcwd()
    dirs = []
    for name in fs.names:
        fn = get_abs_path(name)
        bn = op.basename(name)
        dirs.append(bn)
        c.update_tag("genome", fn)
        mkdir(bn)
        sh("cp *.ctl {0}".format(bn))

        os.chdir(bn)
        c.write_file("maker_opts.ctl")
        os.chdir(cwd)

    jobs = "jobs"
    fw = open(jobs, "w")
    print("\n".join(dirs), file=fw)
    fw.close()

    # Submit to grid
    ncmds = len(dirs)
    runfile = "array.sh"
    cmd = op.join(opts.maker_home, "bin/maker")
    if tmpdir:
        cmd += " -TMP {0}".format(tmpdir)

    engine = get_grid_engine()
    contents = arraysh.format(jobs, cmd) if engine == "SGE" \
                else arraysh_ua.format(N, threaded, jobs, cmd)
    write_file(runfile, contents)

    if engine == "PBS":
        return

    # qsub script
    outfile = "maker.\$TASK_ID.out"
    p = GridProcess(runfile,
                    outfile=outfile,
                    errfile=outfile,
                    arr=ncmds,
                    grid_opts=opts)
    qsubfile = "qsub.sh"
    qsub = p.build()
    write_file(qsubfile, qsub)
Ejemplo n.º 7
0
def parallel(args):
    """
    %prog parallel genome.fasta N

    Partition the genome into parts and run separately. This is useful if MAKER
    is to be run on the grid.
    """
    from jcvi.formats.base import split

    p = OptionParser(parallel.__doc__)
    p.set_home("maker")
    p.set_tmpdir(tmpdir="tmp")
    p.set_grid_opts(array=True)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    genome, NN = args
    threaded = opts.threaded or 1
    tmpdir = opts.tmpdir

    mkdir(tmpdir)
    tmpdir = get_abs_path(tmpdir)

    N = int(NN)
    assert 1 <= N < 1000, "Required: 1 < N < 1000!"

    outdir = "outdir"
    fs = split([genome, outdir, NN])

    c = CTLFile("maker_opts.ctl")
    c.update_abs_path()
    if threaded > 1:
        c.update_tag("cpus", threaded)

    cwd = os.getcwd()
    dirs = []
    for name in fs.names:
        fn = get_abs_path(name)
        bn = op.basename(name)
        dirs.append(bn)
        c.update_tag("genome", fn)
        mkdir(bn)
        sh("cp *.ctl {0}".format(bn))

        os.chdir(bn)
        c.write_file("maker_opts.ctl")
        os.chdir(cwd)

    jobs = "jobs"
    fw = open(jobs, "w")
    print("\n".join(dirs), file=fw)
    fw.close()

    # Submit to grid
    ncmds = len(dirs)
    runfile = "array.sh"
    cmd = op.join(opts.maker_home, "bin/maker")
    if tmpdir:
        cmd += " -TMP {0}".format(tmpdir)

    engine = get_grid_engine()
    contents = arraysh.format(jobs, cmd) if engine == "SGE" \
                else arraysh_ua.format(N, threaded, jobs, cmd)
    write_file(runfile, contents)

    if engine == "PBS":
        return

    # qsub script
    outfile = "maker.\$TASK_ID.out"
    p = GridProcess(runfile, outfile=outfile, errfile=outfile,
                    arr=ncmds, grid_opts=opts)
    qsubfile = "qsub.sh"
    qsub = p.build()
    write_file(qsubfile, qsub)
Ejemplo n.º 8
0
def main():
    """
    %prog database.fa query.fa [options]

    Wrapper for NCBI BLAST+.
    """
    p = OptionParser(main.__doc__)

    p.add_option("--format", default=" \'6 qseqid sseqid pident length " \
            "mismatch gapopen qstart qend sstart send evalue bitscore\' ",
            help="0-11, learn more with \"blastp -help\". [default: %default]")
    p.add_option("--path", dest="blast_path", default=None,
            help="specify BLAST+ path including the program name")
    p.add_option("--prog", dest="blast_program", default="blastp",
            help="specify BLAST+ program to use. See complete list here: " \
            "http://www.ncbi.nlm.nih.gov/books/NBK52640/#chapter1.Installation"
            " [default: %default]")
    p.set_align(evalue=.01)
    p.add_option("--best", default=1, type="int",
            help="Only look for best N hits [default: %default]")
    p.set_cpus()
    p.add_option("--nprocs", default=1, type="int",
            help="number of BLAST processes to run in parallel. " + \
            "split query.fa into `nprocs` chunks, " + \
            "each chunk uses -num_threads=`cpus`")
    p.set_params()
    p.set_outfile()
    opts, args = p.parse_args()

    if len(args) != 2 or opts.blast_program is None:
        sys.exit(not p.print_help())

    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    extra = opts.extra
    blast_path = opts.blast_path
    blast_program = opts.blast_program

    blast_bin = blast_path or blast_program
    if op.basename(blast_bin) != blast_program:
        blast_bin = op.join(blast_bin, blast_program)

    nprocs, cpus = opts.nprocs, opts.cpus
    if nprocs > 1:
        logging.debug("Dispatch job to %d processes" % nprocs)
        outdir = "outdir"
        fs = split([afasta_fn, outdir, str(nprocs)])
        queries = fs.names
    else:
        queries = [afasta_fn]

    dbtype = "prot" if op.basename(blast_bin) in ("blastp", "blastx") \
        else "nucl"

    db = bfasta_fn
    if dbtype == "prot":
        nin = db + ".pin"
    else:
        nin = db + ".nin"
        nin00 = db + ".00.nin"
        nin = nin00 if op.exists(nin00) else (db + ".nin")

    run_formatdb(infile=db, outfile=nin, dbtype=dbtype)

    lock = Lock()

    blastplus_template = "{0} -db {1} -outfmt {2}"
    blast_cmd = blastplus_template.format(blast_bin, bfasta_fn, opts.format)
    blast_cmd += " -evalue {0} -max_target_seqs {1}".\
        format(opts.evalue, opts.best)
    blast_cmd += " -num_threads {0}".format(cpus)
    if extra:
        blast_cmd += " " + extra.strip()

    args = [(out_fh, blast_cmd, query, lock) for query in queries]
    g = Jobs(target=blastplus, args=args)
    g.run()