Example #1
0
def last(args):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB and LASTAL. LAST program available:
    <http://last.cbrc.jp>

    Works with LAST-719.
    """
    # The docstring above is handed to OptionParser as the usage text, so it
    # is kept verbatim; implementation notes live in comments instead.
    p = OptionParser(last.__doc__)
    p.add_option("--path", help="Specify LAST path")
    p.add_option("--mask", default=False, action="store_true",
                 help="Invoke -c in lastdb")
    p.add_option("--format", default="BlastTab",
                 choices=("TAB", "MAF", "BlastTab", "BlastTab+"),
                 help="Output format")
    p.add_option("--minlen", default=0, type="int",
                 help="Filter alignments by how many bases match")
    p.add_option("--minid", default=0, type="int",
                 help="Minimum sequence identity")
    p.set_cpus()
    p.set_params()

    opts, args = p.parse_args(args)

    # Exactly two positionals are required: subject FASTA, then query FASTA.
    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    path = opts.path
    cpus = opts.cpus

    def getpath(program):
        # Use the binary under --path when one was given, else the bare name.
        return op.join(path, program) if path else program

    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")

    # The database prefix is the subject file name minus its last extension.
    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj", mask=opts.mask,
               lastdb_bin=lastdb_bin)

    # lastal receives -u 2 when --mask is set and -u 0 otherwise.
    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    cmd += " -P {0} -i3G".format(cpus)
    cmd += " -f {0}".format(opts.format)
    cmd += " {0} {1}".format(subjectdb, query)

    minlen = opts.minlen
    minid = opts.minid
    extra = opts.extra
    # minid == 100 would make the divisor below zero.
    assert minid != 100, "Perfect match not yet supported"
    # Floor division keeps the value an integer on Python 3 too; plain `/`
    # only truncated on Python 2 and would yield e.g. 2.3333 on Python 3,
    # which then ends up verbatim in the -q/-a/-b arguments.
    mm = minid // (100 - minid)

    if minlen:
        extra += " -e{0}".format(minlen)
    if minid:
        extra += " -r1 -q{0} -a{0} -b{0}".format(mm)
    if extra:
        cmd += " " + extra.strip()

    lastfile = get_outfile(subject, query, suffix="last")
    sh(cmd, outfile=lastfile)
Example #2
0
def array(args):
    """
    %prog array commands.list

    Parallelize a set of commands on grid using array jobs.
    """
    # The docstring above is the usage text given to OptionParser; keep it
    # verbatim.
    p = OptionParser(array.__doc__)
    p.set_grid_opts(array=True)
    p.set_params(prog="grid")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (cmds, ) = args
    # N is the number of lines in the commands file. The context manager
    # guarantees the handle is closed even if reading fails part-way.
    with open(cmds) as fp:
        N = sum(1 for _ in fp)

    # The generated runner script shares the commands file's prefix, so the
    # commands file itself must not already end in `.sh`.
    pf = cmds.rsplit(".", 1)[0]
    runfile = pf + ".sh"
    assert runfile != cmds, "Commands list file should not have a `.sh` extension"

    engine = get_grid_engine()
    threaded = opts.threaded or 1
    # SGE uses the `arraysh` template; every other engine uses `arraysh_ua`.
    if engine == "SGE":
        contents = arraysh.format(cmds)
    else:
        contents = arraysh_ua.format(N, threaded, cmds)
    write_file(runfile, contents)

    # For PBS only the runner script is written; nothing is submitted here.
    if engine == "PBS":
        return

    outfile = "{0}.{1}.out".format(pf, r"\$TASK_ID")
    errfile = "{0}.{1}.err".format(pf, r"\$TASK_ID")
    # Use a separate name instead of rebinding the option parser `p`.
    proc = GridProcess(
        "sh {0}".format(runfile),
        outfile=outfile,
        errfile=errfile,
        arr=N,
        extra_opts=opts.extra,
        grid_opts=opts,
    )
    proc.start()
Example #3
0
def nucmer(args):
    """
    %prog nucmer ref.fasta query.fasta

    Run NUCMER using query against reference. Parallel implementation derived
    from: <https://github.com/fritzsedlazeck/sge_mummer>
    """
    # The docstring above is the usage text given to OptionParser; keep it
    # verbatim.
    from itertools import product

    from jcvi.apps.grid import MakeManager
    from jcvi.formats.base import split

    p = OptionParser(nucmer.__doc__)
    p.add_option("--chunks",
                 type="int",
                 help="Split both query and subject into chunks")
    p.set_params(prog="nucmer", params="-g 5000 -l 24 -c 500")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    cpus = opts.cpus
    # Without --chunks, both inputs are cut into int(sqrt(cpus)) pieces, so
    # the number of ref x query pairs does not exceed the cpu count.
    nrefs = nqueries = opts.chunks or int(cpus ** .5)
    refdir = ref.split(".")[0] + "-outdir"
    querydir = query.split(".")[0] + "-outdir"
    reflist = split([ref, refdir, str(nrefs)]).names
    querylist = split([query, querydir, str(nqueries)]).names

    mm = MakeManager()
    # One nucmer invocation per (reference chunk, query chunk) pair; the
    # zero-padded running index is the output prefix.
    for i, (r, q) in enumerate(product(reflist, querylist)):
        pf = "{0:04d}".format(i)
        cmd = "nucmer -maxmatch"
        cmd += " {0}".format(opts.extra)
        cmd += " {0} {1} -p {2}".format(r, q, pf)
        deltafile = pf + ".delta"
        mm.add((r, q), deltafile, cmd)
        # Function-call form prints the same text on Python 2 and is valid
        # syntax on Python 3 (the bare print statement is not).
        print(cmd)

    mm.write()
Example #4
0
def nucmer(args):
    """
    %prog nucmer ref.fasta query.fasta

    Run NUCMER using query against reference. Parallel implementation derived
    from: <https://github.com/fritzsedlazeck/sge_mummer>
    """
    # The docstring above is the usage text given to OptionParser; keep it
    # verbatim.
    from itertools import product

    from jcvi.apps.grid import MakeManager
    from jcvi.formats.base import split

    p = OptionParser(nucmer.__doc__)
    p.add_option("--chunks", type="int",
                 help="Split both query and subject into chunks")
    p.set_params(prog="nucmer", params="-l 100 -c 500")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    cpus = opts.cpus
    # Default chunk count is int(sqrt(cpus)) per input, which keeps the
    # number of chunk pairs at or below the cpu count.
    nrefs = nqueries = opts.chunks or int(cpus ** .5)
    refdir = ref.split(".")[0] + "-outdir"
    querydir = query.split(".")[0] + "-outdir"
    reflist = split([ref, refdir, str(nrefs)]).names
    querylist = split([query, querydir, str(nqueries)]).names

    mm = MakeManager()
    # Register one nucmer command for every (ref chunk, query chunk) pair.
    for i, (r, q) in enumerate(product(reflist, querylist)):
        pf = "{0:04d}".format(i)
        cmd = "nucmer -maxmatch"
        cmd += " {0}".format(opts.extra)
        cmd += " {0} {1} -p {2}".format(r, q, pf)
        deltafile = pf + ".delta"
        mm.add((r, q), deltafile, cmd)
        # print(...) with a single argument behaves identically on Python 2
        # and, unlike the print statement, also parses on Python 3.
        print(cmd)

    mm.write()
Example #5
0
File: grid.py Project: xuanblo/jcvi
def array(args):
    """
    %prog array commands.list

    Parallelize a set of commands on grid using array jobs.
    """
    # The docstring above is the usage text given to OptionParser; keep it
    # verbatim.
    p = OptionParser(array.__doc__)
    p.set_grid_opts(array=True)
    p.set_params(prog="grid")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    cmds, = args
    # Count the lines of the commands file; `with` closes the handle even
    # when iteration raises.
    with open(cmds) as fp:
        N = sum(1 for _ in fp)

    # The runner script is named after the commands file's prefix, hence the
    # commands file may not itself be `<prefix>.sh`.
    pf = cmds.rsplit(".", 1)[0]
    runfile = pf + ".sh"
    assert runfile != cmds, \
            "Commands list file should not have a `.sh` extension"

    engine = get_grid_engine()
    threaded = opts.threaded or 1
    # `arraysh` is used for SGE, `arraysh_ua` for any other engine.
    contents = arraysh.format(cmds) if engine == "SGE" \
                else arraysh_ua.format(N, threaded, cmds)
    write_file(runfile, contents)

    # On PBS the function stops after writing the runner script.
    if engine == "PBS":
        return

    # Raw strings: "\$" is not a valid escape sequence, so the non-raw form
    # triggers a warning on modern Python. The resulting text is unchanged.
    outfile = "{0}.{1}.out".format(pf, r"\$TASK_ID")
    errfile = "{0}.{1}.err".format(pf, r"\$TASK_ID")
    # Separate name so the option parser `p` is not shadowed.
    proc = GridProcess("sh {0}".format(runfile), outfile=outfile,
                       errfile=errfile, arr=N, extra_opts=opts.extra,
                       grid_opts=opts)
    proc.start()
Example #6
0
def last(args):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB and LASTAL. LAST program available:
    <http://last.cbrc.jp>

    Works with LAST-719.
    """
    # The docstring above is passed to OptionParser as the usage text and is
    # therefore left untouched.
    p = OptionParser(last.__doc__)
    p.add_option("--path", help="Specify LAST path")
    p.add_option("--mask",
                 default=False,
                 action="store_true",
                 help="Invoke -c in lastdb")
    p.add_option("--format",
                 default="BlastTab",
                 choices=("TAB", "MAF", "BlastTab", "BlastTab+"),
                 help="Output format")
    p.add_option("--minlen",
                 default=0,
                 type="int",
                 help="Filter alignments by how many bases match")
    p.add_option("--minid",
                 default=0,
                 type="int",
                 help="Minimum sequence identity")
    p.set_cpus()
    p.set_params()

    opts, args = p.parse_args(args)

    # Two positionals are mandatory: the subject FASTA and the query FASTA.
    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    path = opts.path
    cpus = opts.cpus

    def getpath(program):
        # Join with --path when supplied; otherwise return the bare name.
        return op.join(path, program) if path else program

    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")

    # Database prefix: subject file name with its final extension removed.
    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj", mask=opts.mask,
               lastdb_bin=lastdb_bin)

    # -u is 2 with --mask and 0 without it.
    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    cmd += " -P {0} -i3G".format(cpus)
    cmd += " -f {0}".format(opts.format)
    cmd += " {0} {1}".format(subjectdb, query)

    minlen = opts.minlen
    minid = opts.minid
    extra = opts.extra
    # A value of 100 would make the divisor below zero.
    assert minid != 100, "Perfect match not yet supported"
    # `//` preserves the integer result Python 2's `/` gave for ints; on
    # Python 3 plain `/` would put a float into the -q/-a/-b arguments.
    mm = minid // (100 - minid)

    if minlen:
        extra += " -e{0}".format(minlen)
    if minid:
        extra += " -r1 -q{0} -a{0} -b{0}".format(mm)
    if extra:
        cmd += " " + extra.strip()

    lastfile = get_outfile(subject, query, suffix="last")
    sh(cmd, outfile=lastfile)
Example #7
0
def main(args):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB, LASTAL and LASTEX.
    """
    # The docstring above is the usage text given to OptionParser; keep it
    # verbatim.
    p = OptionParser(main.__doc__)
    p.add_option("--path", help="specify LAST path")
    p.add_option("--mask", default=False, action="store_true",
                 help="invoke -c in lastdb [default: %default]")
    p.add_option("--format", default="blast", choices=supported_formats,
                 help="Output format [default: %default]")
    p.add_option("--eval", default=False, action="store_true",
                 help="Use lastex to recalculate E-value [default: %default]")
    p.set_cpus(cpus=32)
    p.set_params()
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    if opts.eval and opts.cpus > 1:
        # Call-style raise is valid on both Python 2 and 3; the old
        # `raise Exception, "..."` form is a SyntaxError on Python 3.
        raise Exception("Option --eval cannnot work with multiple threads")

    path = opts.path

    def getpath(program):
        # Join with --path when supplied; otherwise return the bare name.
        return op.join(path, program) if path else program

    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")
    lastex_bin = getpath("lastex")

    # Database prefix: subject file name with its final extension removed.
    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj", mask=opts.mask,
               lastdb_bin=lastdb_bin)

    cpus = opts.cpus
    logging.debug("Dispatch job to {0} cpus".format(cpus))

    # For MAF output the header line is written first and the output file is
    # subsequently opened with oappend=True.
    oappend = False
    if opts.format == "maf":
        cmd = 'echo "##maf version=1"'
        sh(cmd, outfile=opts.outfile)
        oappend = True

    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    # lastal's -f receives the position of the chosen format within
    # `supported_formats`.
    f = supported_formats.index(opts.format)
    cmd += " -f {0}".format(f)
    # The trailing "-" is part of the command as written; the query itself is
    # handed to each job separately (see the tuples below).
    cmd += " {0} -".format(subjectdb)

    extra = opts.extra
    if extra:
        cmd += " " + extra

    if opts.eval:
        querydb = query.rsplit(".", 1)[0]
        run_lastdb(infile=query, outfile=querydb + ".prj")

        cmd += " | {0} {1}.prj {2}.prj -".format(lastex_bin, subjectdb, querydb)

    out_fh = must_open(opts.outfile, "w", checkexists=True, oappend=oappend)

    # must_open returned nothing to write to, so there is no work to do.
    if out_fh is None:
        return

    lock = Lock()
    # One job tuple per cpu, numbered from 1. `range` replaces the
    # Python 2-only `xrange`; the comprehension consumes it immediately.
    args = [(k + 1, cpus, out_fh, cmd, query, lock)
            for k in range(cpus)]
    g = Jobs(target=last, args=args)
    g.run()
Example #8
0
File: clc.py Project: radaniba/jcvi
def align(args):
    """
    %prog align reference fastqfiles

    Use `clc_ref_assemble` to map the read files to a reference. Use a non-zero
    -s option to turn on paired end mode.
    """
    # The docstring above is the usage text given to OptionParser; keep it
    # verbatim.
    p = OptionParser(align.__doc__)
    p.add_option("-o", dest="outfile", default=None,
            help="Output prefix.cas file [default: %default]")
    p.add_option("-s", dest="size", default=0, type="int",
            help="Use paired end mapping with insert [default: %default]")
    p.add_option("--short", default=False, action="store_true",
            help="Use `clc_ref_assemble_short` as the mapper [default: %default]")
    p.add_option("--orientations", default="fb",
            help="The reads have the orientations [default: %default]")
    p.add_option("--fraction", default=0.5,
            help="Fraction of the read that must match [default: %default]")
    p.add_option("--similarity", default=0.95,
            help="Similarity of the matching region [default: %default]")
    p.set_params()
    p.set_cpus()
    opts, args = p.parse_args(args)

    # Needs the reference plus at least one read file.
    if len(args) < 2:
        sys.exit(not p.print_help())

    write_file("license.properties", CLCLICENSE, skipcheck=True)

    ref = args[0]
    assert op.exists(ref)
    fastqfiles = args[1:]
    size = opts.size
    orientations = opts.orientations
    assert orientations in ("fb", "bf", "ff", "bb")

    cmd = "clc_ref_assemble_short" if opts.short else "clc_ref_assemble_long"
    # Default output name is "<first read file prefix>.<reference prefix>",
    # each prefix being the basename up to its first dot.
    readprefix = op.basename(fastqfiles[0]).split(".", 1)[0]
    refprefix = op.basename(ref).split(".", 1)[0]
    outfile = opts.outfile or "{0}.{1}".format(readprefix, refprefix)
    if not outfile.endswith(".cas"):
        outfile += ".cas"

    cmd += " --cpus {0}".format(opts.cpus)
    cmd += " -d {0} -o {1} -q ".format(ref, outfile)
    fastqs = " ".join(fastqfiles)
    if size == 0:
        # Unpaired mode: just list the read files.
        cmd += fastqs
    else:
        # Paired mode requires exactly two read files.
        assert len(fastqfiles) == 2
        # Floor division keeps the insert bounds integral on Python 3, as
        # Python 2's `/` did for ints; `/` would emit e.g. "375.0".
        stddev = size // 4
        lb, ub = size - stddev, size + stddev
        cmd += " -p {0} ss {1} {2} -i {3} ".format(orientations, lb, ub, fastqs)

    if opts.extra:
        cmd += " " + opts.extra

    # -l / -s are only appended for the long-read mapper.
    if not opts.short:
        cmd += " -l {0} -s {1}".format(opts.fraction, opts.similarity)

    sh(cmd)
    return outfile, None
Example #9
0
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # The docstring above is the usage text given to OptionParser; keep it
    # verbatim.
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in \
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "\
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("--format", default="BLASTN-", choices=supported_formats,
            help="Ooutput format [default: %default]")
    p.add_option("--path", dest="lastz_path", default=None,
            help="specify LASTZ path")
    p.add_option("--mask", dest="mask", default=False, action="store_true",
            help="treat lower-case letters as mask info [default: %default]")
    p.add_option("--similar", default=False, action="store_true",
            help="Use options tuned for close comparison [default: %default]")
    p.set_cpus(cpus=32)
    p.set_params()
    p.set_outfile()
    opts, args = p.parse_args()

    if len(args) != 2:
        # `not p.print_help()` gives a non-zero exit status on a usage error,
        # matching the other entry points; print_help() returns None, so the
        # previous `sys.exit(p.print_help())` exited with status 0.
        sys.exit(not p.print_help())

    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    extra = opts.extra
    if opts.similar:
        extra += similarOptions

    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith("lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    # `fmt` rather than `format`, to avoid shadowing the builtin.
    fmt = opts.format
    blastline = (fmt == "BLASTN-")

    # The axt, maf, etc. formats can only be run on a split database (i.e. one
    # FASTA record per file). The split files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        jobs = []
        # Tuple layout:
        # bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format
        for seqid in bids:
            bfasta = "/".join((bfasta_2bit, seqid))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, seqid, fmt))
            jobs.append((bfasta, afasta_fn, outfile,
                         lastz_bin, extra, mask, fmt))

        pool = Pool(cpus)
        pool.map(lastz_2bit, jobs)

        return

    lock = Lock()

    # One job tuple per cpu, numbered from 1. `range` replaces the
    # Python 2-only `xrange`; the comprehension consumes it immediately.
    args = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
            lock, lastz_bin, extra, mask) for k in range(cpus)]
    g = Jobs(target=lastz, args=args)
    g.run()
Example #10
0
def run(args):
    """
    %prog run command ::: file1 file2

    Parallelize a set of commands on grid. The syntax is modeled after GNU
    parallel <http://www.gnu.org/s/parallel/man.html#options>

    {}   - input line
    {.}  - input line without extension
    {_}  - input line first part
    {/}  - basename of input line
    {/.} - basename of input line without extension
    {/_} - basename of input line first part
    {#}  - sequence number of job to run
    :::  - Use arguments from the command line as input source instead of stdin
    (standard input).

    If file name is `t/example.tar.gz`, then,
    {} is "t/example.tar.gz", {.} is "t/example.tar", {_} is "t/example"
    {/} is "example.tar.gz", {/.} is "example.tar", {/_} is "example"

    A few examples:
    ls -1 *.fastq | %prog run process {} {.}.pdf  # use stdin
    %prog run process {} {.}.pdf ::: *fastq  # use :::
    %prog run "zcat {} > {.}" ::: *.gz  # quote redirection
    %prog run < commands.list  # run a list of commands
    """
    # The docstring above doubles as the usage text for OptionParser.
    parser = OptionParser(run.__doc__)
    parser.set_grid_opts()
    parser.set_params(prog="grid")
    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    # Input names come from whatever follows ":::", or from stdin when it is
    # not a terminal; a single empty name is used when neither yields input.
    sep = ":::"
    if sep not in args:
        filenames = [""] if sys.stdin.isatty() else sys.stdin
    else:
        sepidx = args.index(sep)
        filenames = args[sepidx + 1:] or [""]
        args = args[:sepidx]

    cmd = " ".join(args)

    jobs = []
    if not filenames:
        jobs.append((cmd, None))

    for seq, raw_name in enumerate(filenames):
        filename = raw_name.strip()
        prefix, basename = op.split(filename)
        basefirstname = basename.split(".")[0]

        # Placeholders are substituted strictly in this order.
        substitutions = (
            ("{.}", filename.rsplit(".", 1)[0]),
            ("{_}", op.join(prefix, basefirstname)),
            ("{/}", basename),
            ("{/.}", basename.rsplit(".", 1)[0]),
            ("{/_}", basefirstname),
            ("{#}", str(seq)),
        )

        # A template without any brace gets the name appended instead.
        if "{" in cmd:
            ncmd = cmd.replace("{}", filename)
        else:
            ncmd = cmd + " " + filename

        for token, value in substitutions:
            ncmd = ncmd.replace(token, value)

        # Everything after the first ">" becomes the job's output file.
        outfile = None
        if ">" in ncmd:
            ncmd, outfile = [part.strip() for part in ncmd.split(">", 1)]

        jobs.append((ncmd.strip(), outfile))

    # Submit only after every command line has been expanded.
    for ncmd, outfile in jobs:
        proc = GridProcess(ncmd,
                           outfile=outfile,
                           extra_opts=opts.extra,
                           grid_opts=opts)
        proc.start()
Example #11
0
def align(args):
    """
    %prog align reference fastqfiles

    Use `clc_ref_assemble` to map the read files to a reference. Use a non-zero
    -s option to turn on paired end mode.
    """
    # The docstring above is passed to OptionParser as the usage text and is
    # therefore left untouched.
    p = OptionParser(align.__doc__)
    p.add_option("-o", dest="outfile", default=None,
            help="Output prefix.cas file [default: %default]")
    p.add_option("-s", dest="size", default=0, type="int",
            help="Use paired end mapping with insert [default: %default]")
    p.add_option("--short", default=False, action="store_true",
            help="Use `clc_ref_assemble_short` as the mapper [default: %default]")
    p.add_option("--orientations", default="fb",
            help="The reads have the orientations [default: %default]")
    p.add_option("--fraction", default=0.5,
            help="Fraction of the read that must match [default: %default]")
    p.add_option("--similarity", default=0.95,
            help="Similarity of the matching region [default: %default]")
    p.set_params()
    p.set_cpus()
    opts, args = p.parse_args(args)

    # A reference and at least one read file are required.
    if len(args) < 2:
        sys.exit(not p.print_help())

    write_file("license.properties", CLCLICENSE, skipcheck=True)

    ref = args[0]
    assert op.exists(ref)
    fastqfiles = args[1:]
    size = opts.size
    orientations = opts.orientations
    assert orientations in ("fb", "bf", "ff", "bb")

    cmd = "clc_ref_assemble_short" if opts.short else "clc_ref_assemble_long"
    # When -o is absent the output is named after the first read file and the
    # reference, using each basename up to its first dot.
    readprefix = op.basename(fastqfiles[0]).split(".", 1)[0]
    refprefix = op.basename(ref).split(".", 1)[0]
    outfile = opts.outfile or "{0}.{1}".format(readprefix, refprefix)
    if not outfile.endswith(".cas"):
        outfile += ".cas"

    cmd += " --cpus {0}".format(opts.cpus)
    cmd += " -d {0} -o {1} -q ".format(ref, outfile)
    fastqs = " ".join(fastqfiles)
    if size == 0:
        # No insert size given: reads are passed as-is.
        cmd += fastqs
    else:
        # Paired-end mapping expects exactly two read files.
        assert len(fastqfiles) == 2
        # `//` keeps lb/ub integers on Python 3; with `/` they would be
        # rendered as floats (e.g. "375.0") in the command line.
        stddev = size // 4
        lb, ub = size - stddev, size + stddev
        cmd += " -p {0} ss {1} {2} -i {3} ".format(orientations, lb, ub, fastqs)

    if opts.extra:
        cmd += " " + opts.extra

    # The -l / -s thresholds are added only when --short is not set.
    if not opts.short:
        cmd += " -l {0} -s {1}".format(opts.fraction, opts.similarity)

    sh(cmd)
    return outfile, None
Example #12
0
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # The docstring above is passed to OptionParser as the usage text and is
    # therefore left untouched.
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in \
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "\
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("--format", default="BLASTN-", choices=supported_formats,
            help="Ooutput format [default: %default]")
    p.add_option("--path", dest="lastz_path", default=None,
            help="specify LASTZ path")
    p.add_option("--mask", dest="mask", default=False, action="store_true",
            help="treat lower-case letters as mask info [default: %default]")
    p.add_option("--similar", default=False, action="store_true",
            help="Use options tuned for close comparison [default: %default]")
    p.set_cpus(cpus=32)
    p.set_params()
    p.set_outfile()
    opts, args = p.parse_args()

    if len(args) != 2:
        # print_help() returns None, so `sys.exit(p.print_help())` reported
        # success (status 0) on a usage error. Negating it yields a non-zero
        # status, as the other commands in this code base do.
        sys.exit(not p.print_help())

    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    extra = opts.extra
    if opts.similar:
        extra += similarOptions

    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith("lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    # Named `fmt` so the builtin `format` is not shadowed.
    fmt = opts.format
    blastline = (fmt == "BLASTN-")

    # Formats other than BLASTN- (axt, maf, ...) can only be run on a split
    # database (one FASTA record per file). Those per-record jobs are what
    # gets parallelized, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        # Each entry is:
        # (bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format)
        jobs = []
        for seqid in bids:
            bfasta = "/".join((bfasta_2bit, seqid))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, seqid, fmt))
            jobs.append((bfasta, afasta_fn, outfile,
                         lastz_bin, extra, mask, fmt))

        pool = Pool(cpus)
        pool.map(lastz_2bit, jobs)

        return

    lock = Lock()

    # `range` works on both Python 2 and 3 (`xrange` exists only on 2); the
    # list comprehension consumes it immediately either way.
    args = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
            lock, lastz_bin, extra, mask) for k in range(cpus)]
    g = Jobs(target=lastz, args=args)
    g.run()
Example #13
0
File: grid.py Project: xuanblo/jcvi
def run(args):
    """
    %prog run command ::: file1 file2

    Parallelize a set of commands on grid. The syntax is modeled after GNU
    parallel <http://www.gnu.org/s/parallel/man.html#options>

    {}   - input line
    {.}  - input line without extension
    {_}  - input line first part
    {/}  - basename of input line
    {/.} - basename of input line without extension
    {/_} - basename of input line first part
    {#}  - sequence number of job to run
    :::  - Use arguments from the command line as input source instead of stdin
    (standard input).

    If file name is `t/example.tar.gz`, then,
    {} is "t/example.tar.gz", {.} is "t/example.tar", {_} is "t/example"
    {/} is "example.tar.gz", {/.} is "example.tar", {/_} is "example"

    A few examples:
    ls -1 *.fastq | %prog run process {} {.}.pdf  # use stdin
    %prog run process {} {.}.pdf ::: *fastq  # use :::
    %prog run "zcat {} > {.}" ::: *.gz  # quote redirection
    %prog run < commands.list  # run a list of commands
    """
    # The docstring above is also the usage text shown by OptionParser.
    parser = OptionParser(run.__doc__)
    parser.set_grid_opts()
    parser.set_params(prog="grid")
    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    # Names after ":::" take precedence; otherwise read piped stdin. With
    # neither, fall back to one empty name so the command runs once.
    sep = ":::"
    if sep not in args:
        filenames = [""] if sys.stdin.isatty() else sys.stdin
    else:
        sepidx = args.index(sep)
        filenames = args[sepidx + 1:] or [""]
        args = args[:sepidx]

    cmd = " ".join(args)

    pending = []
    if not filenames:
        pending.append((cmd, None))

    for index, line in enumerate(filenames):
        filename = line.strip()
        prefix, basename = op.split(filename)
        basefirstname = basename.split(".")[0]

        # Ordered (placeholder, replacement) pairs, applied one after another.
        replacements = [
            ("{.}", filename.rsplit(".", 1)[0]),
            ("{_}", op.join(prefix, basefirstname)),
            ("{/}", basename),
            ("{/.}", basename.rsplit(".", 1)[0]),
            ("{/_}", basefirstname),
            ("{#}", str(index)),
        ]

        # With no brace at all in the template, append the name to it.
        if "{" in cmd:
            ncmd = cmd.replace("{}", filename)
        else:
            ncmd = cmd + " " + filename

        for placeholder, replacement in replacements:
            ncmd = ncmd.replace(placeholder, replacement)

        # Split off a redirection target at the first ">".
        outfile = None
        if ">" in ncmd:
            ncmd, outfile = [piece.strip() for piece in ncmd.split(">", 1)]

        pending.append((ncmd.strip(), outfile))

    # All commands are expanded first, then submitted in order.
    for ncmd, outfile in pending:
        proc = GridProcess(ncmd, outfile=outfile, extra_opts=opts.extra,
                           grid_opts=opts)
        proc.start()
Example #14
0
def main():
    """
    %prog database.fa query.fa [options]

    Wrapper for NCBI BLAST+.
    """
    # The docstring above is also the usage text shown by OptionParser.
    parser = OptionParser(main.__doc__)

    parser.add_option("--format", default=" \'6 qseqid sseqid pident length " \
            "mismatch gapopen qstart qend sstart send evalue bitscore\' ",
            help="0-11, learn more with \"blastp -help\". [default: %default]")
    parser.add_option("--path", dest="blast_path", default=None,
            help="specify BLAST+ path including the program name")
    parser.add_option("--prog", dest="blast_program", default="blastp",
            help="specify BLAST+ program to use. See complete list here: " \
            "http://www.ncbi.nlm.nih.gov/books/NBK52640/#chapter1.Installation"
            " [default: %default]")
    parser.set_align(evalue=.01)
    parser.add_option("--best", default=1, type="int",
            help="Only look for best N hits [default: %default]")
    parser.set_cpus()
    parser.add_option("--nprocs", default=1, type="int",
            help="number of BLAST processes to run in parallel. " + \
            "split query.fa into `nprocs` chunks, " + \
            "each chunk uses -num_threads=`cpus`")
    parser.set_params()
    parser.set_outfile()
    opts, args = parser.parse_args()

    if len(args) != 2 or opts.blast_program is None:
        sys.exit(not parser.print_help())

    # First positional is the database FASTA, second the query FASTA.
    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    extra = opts.extra
    blast_program = opts.blast_program

    # --path may name the executable itself or the directory holding it; in
    # the latter case the program name is appended.
    blast_bin = opts.blast_path or blast_program
    if op.basename(blast_bin) != blast_program:
        blast_bin = op.join(blast_bin, blast_program)

    nprocs = opts.nprocs
    cpus = opts.cpus
    if nprocs > 1:
        # Split the query into `nprocs` chunks, one BLAST process per chunk.
        logging.debug("Dispatch job to %d processes" % nprocs)
        outdir = "outdir"
        queries = split([afasta_fn, outdir, str(nprocs)]).names
    else:
        queries = [afasta_fn]

    if op.basename(blast_bin) in ("blastp", "blastx"):
        dbtype = "prot"
    else:
        dbtype = "nucl"

    # Pick the index file run_formatdb is pointed at: `.pin` for protein
    # databases; for nucleotide ones `.00.nin` if that file exists, else
    # `.nin`.
    db = bfasta_fn
    if dbtype == "prot":
        index_file = db + ".pin"
    else:
        volume_index = db + ".00.nin"
        index_file = volume_index if op.exists(volume_index) else db + ".nin"

    run_formatdb(infile=db, outfile=index_file, dbtype=dbtype)

    lock = Lock()

    # Assemble the shared command line; each job adds its own query.
    pieces = [
        "{0} -db {1} -outfmt {2}".format(blast_bin, bfasta_fn, opts.format),
        " -evalue {0} -max_target_seqs {1}".format(opts.evalue, opts.best),
        " -num_threads {0}".format(cpus),
    ]
    if extra:
        pieces.append(" " + extra.strip())
    blast_cmd = "".join(pieces)

    job_args = [(out_fh, blast_cmd, query, lock) for query in queries]
    runner = Jobs(target=blastplus, args=job_args)
    runner.run()
Example #15
0
def assemble(args):
    """
    Run `cap3` on a single multi FASTA file containing reads or a folder containing several
    multi FASTA files. Allows for tweaking of `cap3` parameters max_gap_len, ovl_pct_id, etc.
    """
    # The docstring above is the usage text given to OptionParser; keep it
    # verbatim.
    p = OptionParser(assemble.__doc__)
    g1 = OptionGroup(
        p, "Input file options (required)",
        "Note: Please choose from and provide values for one of the following parameters"
    )
    g1.add_option("--input_file",
                  default=None,
                  help="input file of reads [default: %default]")
    g1.add_option(
        "--input_folder",
        default=None,
        help=
        "input folder containing multi FASTA files of reads [default: %default]"
    )
    g1.add_option(
        "--input_file_list",
        default=None,
        help=
        "list file containing paths to multi FASTA files of reads [default: %default]"
    )
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters",
                     "Note: If not specified, `cap3` defaults will be used")
    g2.add_option("-f", "--max_gap_len", default=20, type="int",
            help="maximum gap length in any overlap [default: %default]\n" +\
                 "Same as cap3 `-f` parameter.")
    g2.add_option("-p", "--ovl_pct_id", default=90, type="int",
            help="overlap percent identity cutoff [default: %default]\n" +\
                 "Same as cap3 `-p` parameter.")
    g2.add_option("-s", "--ovl_sim_score", default=900, type="int",
            help="overlap similarity score cutoff [default: %default]\n" +\
                 "Same as cap3 `-s` parameter.")
    g2.add_option(
        "-x",
        "--prefix",
        dest="prefix",
        default="cap3",
        help="prefix string for output file name [default: %default]")
    p.add_option_group(g2)

    p.set_params()

    opts, args = p.parse_args(args)

    # Range checks. Compare against None rather than relying on truthiness:
    # an explicit 0 is falsy and previously slipped past all three checks
    # even though it violates each lower bound.
    if opts.max_gap_len is not None and opts.max_gap_len <= 1:
        logging.error("--max_gap_len should be > 1")
        sys.exit()
    elif opts.ovl_pct_id is not None and opts.ovl_pct_id <= 65:
        logging.error("--ovl_pct_id should be > 65")
        sys.exit()
    elif opts.ovl_sim_score is not None and opts.ovl_sim_score <= 250:
        logging.error("--ovl_sim_score should be > 250")
        sys.exit()

    # Input sources are tried in this order of precedence:
    # --input_file_list, then --input_folder, then --input_file.
    file_list = []
    if opts.input_file_list:
        if not op.isfile(opts.input_file_list):
            logging.error("Input file list {0} does not exist".format(
                opts.input_file_list))
            sys.exit()
        with open(opts.input_file_list, 'r') as f:
            file_list = f.read().splitlines()
    elif opts.input_folder:
        if not op.isdir(opts.input_folder):
            logging.error("Input folder {0} does not exist".format(
                opts.input_folder))
            sys.exit()

        # Keep only *.fa / *.fasta entries (case-insensitive) and prefix each
        # with the folder, normalised to have no trailing slash.
        folder = opts.input_folder.rstrip('/')
        file_list = [folder + "/" + name
                     for name in os.listdir(opts.input_folder)
                     if name.lower().endswith(('.fa', '.fasta'))]
    elif opts.input_file:
        file_list.append(opts.input_file)
    else:
        logging.error("Please specify one of the options for input files")
        sys.exit(not p.print_help())

    if len(file_list) == 0:
        logging.warning(
            "List of files to process is empty. Please check your input!")
        sys.exit()

    # Run cap3 once per existing input; missing files are reported and skipped.
    for fasta in file_list:
        if not op.isfile(fasta):
            logging.warning("Input file {0} does not exist".format(fasta))
            continue

        cmd = "cap3 {0} -f {1} -p {2} -s {3} -x {4}".format(
            fasta, opts.max_gap_len, opts.ovl_pct_id, opts.ovl_sim_score,
            opts.prefix)
        if opts.extra:
            cmd += " {0}".format(opts.extra)
        logfile = "{0}.{1}.log".format(fasta, opts.prefix)

        sh(cmd, outfile=logfile)
Example #16
0
def main(args):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB, LASTAL and LASTEX.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so it is runtime-visible; keep explanations in comments.
    #
    # `args` is the list of command-line tokens. Exactly two positionals are
    # expected (subject/database FASTA and query FASTA). The function builds
    # a lastal command line and dispatches it through `Jobs`, one job per
    # requested cpu; it returns None.
    #
    # Raises Exception when --eval is combined with more than one cpu.

    # Order matters: the index of the chosen format is passed to `lastal -f`.
    supported_formats = ("tab", "maf", "blast")

    p = OptionParser(main.__doc__)
    p.add_option("--path", help="specify LAST path")
    p.add_option("--mask",
                 default=False,
                 action="store_true",
                 help="invoke -c in lastdb [default: %default]")
    p.add_option("--format",
                 default="blast",
                 choices=supported_formats,
                 help="Output format [default: %default]")
    p.add_option("--eval",
                 default=False,
                 action="store_true",
                 help="Use lastex to recalculate E-value [default: %default]")
    p.set_cpus(cpus=32)
    p.set_params()
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    if opts.eval and opts.cpus > 1:
        # Call form of `raise` is valid on both Python 2 and Python 3; the
        # old `raise Exception, "msg"` statement is a SyntaxError on 3.
        raise Exception("Option --eval cannot work with multiple threads")

    # Resolve the LAST executables, optionally relative to --path.
    path = opts.path
    getpath = lambda x: op.join(path, x) if path else x
    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")
    lastex_bin = getpath("lastex")

    # Database name is the subject file name minus its last extension.
    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj", mask=opts.mask,
               lastdb_bin=lastdb_bin)

    cpus = opts.cpus
    logging.debug("Dispatch job to {0} cpus".format(cpus))

    # For MAF output the header line is written first, and the output file
    # is then opened with oappend=True so the jobs add to it.
    oappend = False
    if opts.format == "maf":
        cmd = 'echo "##maf version=1"'
        sh(cmd, outfile=opts.outfile)
        oappend = True

    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    f = supported_formats.index(opts.format)
    cmd += " -f {0}".format(f)
    # Trailing "-": presumably tells lastal to read the query from stdin,
    # fed by the `last` worker -- TODO confirm against the worker.
    cmd += " {0} -".format(subjectdb)

    extra = opts.extra
    if extra:
        cmd += " " + extra

    if opts.eval:
        # lastex is given both .prj files, so the query needs a db as well.
        querydb = query.rsplit(".", 1)[0]
        run_lastdb(infile=query, outfile=querydb + ".prj")

        cmd += " | {0} {1}.prj {2}.prj -".format(lastex_bin, subjectdb,
                                                 querydb)

    out_fh = must_open(opts.outfile, "w", checkexists=True, oappend=oappend)

    # must_open may hand back None (presumably when the output already
    # exists and is not to be overwritten -- verify); nothing to do then.
    if out_fh is None:
        return

    # One argument tuple per worker: 1-based worker index, total workers,
    # shared output handle, command, query file and a shared lock.
    lock = Lock()
    args = [(k + 1, cpus, out_fh, cmd, query, lock) for k in range(cpus)]
    g = Jobs(target=last, args=args)
    g.run()
Example #17
0
def assemble(args):
    """
    Run `cap3` on a single multi FASTA file containing reads or a folder containing several
    multi FASTA files. Allows for tweaking of `cap3` parameters max_gap_len, ovl_pct_id, etc.
    """
    # NOTE: the docstring above is handed to OptionParser as usage text, so
    # it is runtime-visible; keep explanations in comments.
    #
    # `args` is the list of command-line tokens. Input files come from
    # exactly one of --input_file_list, --input_folder or --input_file
    # (checked in that order). For every existing input file one `cap3`
    # command is run through `sh`, with output captured in
    # "<file>.<prefix>.log". Returns None; exits via sys.exit() on invalid
    # options or when there is nothing to process.
    p = OptionParser(assemble.__doc__)
    g1 = OptionGroup(p, "Input file options (required)",
            "Note: Please choose from and provide values for one of the following parameters")
    g1.add_option("--input_file", default=None,
            help="input file of reads [default: %default]")
    g1.add_option("--input_folder", default=None,
            help="input folder containing multi FASTA files of reads [default: %default]")
    g1.add_option("--input_file_list", default=None,
            help="list file containing paths to multi FASTA files of reads [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters",
            "Note: If not specified, `cap3` defaults will be used")
    g2.add_option("-f", "--max_gap_len", default=20, type="int",
            help="maximum gap length in any overlap [default: %default]\n" +\
                 "Same as cap3 `-f` parameter.")
    g2.add_option("-p", "--ovl_pct_id", default=90, type="int",
            help="overlap percent identity cutoff [default: %default]\n" +\
                 "Same as cap3 `-p` parameter.")
    g2.add_option("-s", "--ovl_sim_score", default=900, type="int",
            help="overlap similarity score cutoff [default: %default]\n" +\
                 "Same as cap3 `-s` parameter.")
    g2.add_option("-x", "--prefix", dest="prefix", default="cap3",
            help="prefix string for output file name [default: %default]")
    p.add_option_group(g2)

    p.set_params()

    opts, args = p.parse_args(args)

    # Each numeric option must be strictly greater than its floor. The test
    # is `is not None` rather than truthiness so that an explicit 0 is
    # rejected too (0 is falsy and used to slip through unchecked).
    lower_bounds = (
        ("max_gap_len", opts.max_gap_len, 1),
        ("ovl_pct_id", opts.ovl_pct_id, 65),
        ("ovl_sim_score", opts.ovl_sim_score, 250),
    )
    for opt_name, value, floor in lower_bounds:
        if value is not None and value <= floor:
            logging.error("--{0} should be > {1}".format(opt_name, floor))
            # NOTE(review): exits with status 0 even though this is an
            # error; left unchanged in case wrappers rely on it.
            sys.exit()

    file_list = []
    if opts.input_file_list:
        if not op.isfile(opts.input_file_list):
            logging.error("Input file list {0} does not exist".format(opts.input_file_list))
            sys.exit()
        # One path per line.
        with open(opts.input_file_list, 'r') as f:
            file_list = f.read().splitlines()
    elif opts.input_folder:
        if not op.isdir(opts.input_folder):
            logging.error("Input folder {0} does not exist".format(opts.input_folder))
            sys.exit()

        # Pick up *.fa / *.fasta (case-insensitive) and prefix each name
        # with the folder, normalised to a single trailing slash.
        folder = opts.input_folder.rstrip('/')
        file_list = ["{0}/{1}".format(folder, name)
                     for name in os.listdir(opts.input_folder)
                     if name.lower().endswith(('.fa', '.fasta'))]
    elif opts.input_file:
        file_list.append(opts.input_file)
    else:
        logging.error("Please specify one of the options for input files")
        sys.exit(not p.print_help())

    if not file_list:
        logging.warning("List of files to process is empty. Please check your input!")
        sys.exit()

    for fasta in file_list:
        # Missing inputs are reported and skipped rather than aborting the
        # whole batch.
        if not op.isfile(fasta):
            logging.warning("Input file {0} does not exist".format(fasta))
            continue

        cmd = "cap3 {0} -f {1} -p {2} -s {3} -x {4}".format(
            fasta, opts.max_gap_len, opts.ovl_pct_id, opts.ovl_sim_score,
            opts.prefix)
        if opts.extra:
            cmd += " {0}".format(opts.extra)
        logfile = "{0}.{1}.log".format(fasta, opts.prefix)

        sh(cmd, outfile=logfile)