def last(args):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB and LASTAL. LAST program available:
    <http://last.cbrc.jp>

    Works with LAST-719.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is runtime output and is kept verbatim.
    #
    # args: list of command-line tokens (options plus subject and query FASTA).
    # Side effects: runs lastdb through run_lastdb(), then runs lastal through
    # sh() with its output redirected to the file named by get_outfile().
    p = OptionParser(last.__doc__)
    p.add_option("--path", help="Specify LAST path")
    p.add_option("--mask", default=False, action="store_true",
                 help="Invoke -c in lastdb")
    p.add_option("--format", default="BlastTab",
                 choices=("TAB", "MAF", "BlastTab", "BlastTab+"),
                 help="Output format")
    p.add_option("--minlen", default=0, type="int",
                 help="Filter alignments by how many bases match")
    p.add_option("--minid", default=0, type="int",
                 help="Minimum sequence identity")
    p.set_cpus()
    p.set_params()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    path = opts.path
    cpus = opts.cpus

    def getpath(x):
        # Prefix the binary name with --path when one was given.
        return op.join(path, x) if path else x

    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")

    # The database prefix is the subject file name minus its last extension.
    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj", mask=opts.mask,
               lastdb_bin=lastdb_bin)

    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    cmd += " -P {0} -i3G".format(cpus)
    cmd += " -f {0}".format(opts.format)
    cmd += " {0} {1}".format(subjectdb, query)

    minlen = opts.minlen
    minid = opts.minid
    # Guard against a missing extra-params value before concatenating to it.
    extra = opts.extra or ""
    assert minid != 100, "Perfect match not yet supported"
    # Explicit floor division: the value is spliced into -q/-a/-b, and true
    # division would render it as e.g. "19.0".
    mm = minid // (100 - minid)

    if minlen:
        extra += " -e{0}".format(minlen)
    if minid:
        extra += " -r1 -q{0} -a{0} -b{0}".format(mm)
    if extra:
        cmd += " " + extra.strip()

    lastfile = get_outfile(subject, query, suffix="last")
    sh(cmd, outfile=lastfile)
def array(args):
    """
    %prog array commands.list

    Parallelize a set of commands on grid using array jobs.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is runtime output and is kept verbatim.
    #
    # args: list of command-line tokens; exactly one positional argument, the
    # commands list file, is expected.
    # Side effects: writes a `<prefix>.sh` runner script and, unless the grid
    # engine is "PBS", starts a GridProcess array job over it.
    p = OptionParser(array.__doc__)
    p.set_grid_opts(array=True)
    p.set_params(prog="grid")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (cmds,) = args
    # Count the commands (one per line); `with` closes the handle even if
    # reading fails part-way.
    with open(cmds) as fp:
        N = sum(1 for _ in fp)

    pf = cmds.rsplit(".", 1)[0]
    runfile = pf + ".sh"
    assert runfile != cmds, "Commands list file should not have a `.sh` extension"

    engine = get_grid_engine()
    threaded = opts.threaded or 1
    if engine == "SGE":
        contents = arraysh.format(cmds)
    else:
        contents = arraysh_ua.format(N, threaded, cmds)
    write_file(runfile, contents)

    # For PBS only the runner script is written; nothing is submitted here.
    if engine == "PBS":
        return

    outfile = "{0}.{1}.out".format(pf, r"\$TASK_ID")
    errfile = "{0}.{1}.err".format(pf, r"\$TASK_ID")
    proc = GridProcess(
        "sh {0}".format(runfile),
        outfile=outfile,
        errfile=errfile,
        arr=N,
        extra_opts=opts.extra,
        grid_opts=opts,
    )
    proc.start()
def nucmer(args):
    """
    %prog nucmer ref.fasta query.fasta

    Run NUCMER using query against reference. Parallel implementation derived
    from: <https://github.com/fritzsedlazeck/sge_mummer>
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is runtime output and is kept verbatim.
    #
    # args: list of command-line tokens (options plus reference and query).
    # Side effects: splits both inputs into chunks via split(), prints each
    # generated nucmer command, and writes the job list via MakeManager.
    from itertools import product

    from jcvi.apps.grid import MakeManager
    from jcvi.formats.base import split

    p = OptionParser(nucmer.__doc__)
    p.add_option("--chunks", type="int",
                 help="Split both query and subject into chunks")
    p.set_params(prog="nucmer", params="-g 5000 -l 24 -c 500")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    cpus = opts.cpus
    # Without --chunks, use about sqrt(cpus) pieces per side so the number of
    # (ref, query) pairs is close to the cpu count.
    nrefs = nqueries = opts.chunks or int(cpus ** .5)
    refdir = ref.split(".")[0] + "-outdir"
    querydir = query.split(".")[0] + "-outdir"
    reflist = split([ref, refdir, str(nrefs)]).names
    querylist = split([query, querydir, str(nqueries)]).names

    mm = MakeManager()
    for i, (r, q) in enumerate(product(reflist, querylist)):
        pf = "{0:04d}".format(i)
        cmd = "nucmer -maxmatch"
        cmd += " {0}".format(opts.extra)
        cmd += " {0} {1} -p {2}".format(r, q, pf)
        deltafile = pf + ".delta"
        mm.add((r, q), deltafile, cmd)
        # Call form is valid on Python 2 and 3 and prints the same text for a
        # single string argument.
        print(cmd)

    mm.write()
def nucmer(args):
    """
    %prog nucmer ref.fasta query.fasta

    Run NUCMER using query against reference. Parallel implementation derived
    from: <https://github.com/fritzsedlazeck/sge_mummer>
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is runtime output and is kept verbatim.
    #
    # args: list of command-line tokens (options plus reference and query).
    # Side effects: splits both inputs into chunks via split(), prints each
    # generated nucmer command, and writes the job list via MakeManager.
    from itertools import product

    from jcvi.apps.grid import MakeManager
    from jcvi.formats.base import split

    p = OptionParser(nucmer.__doc__)
    p.add_option("--chunks", type="int",
                 help="Split both query and subject into chunks")
    p.set_params(prog="nucmer", params="-l 100 -c 500")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    cpus = opts.cpus
    # Without --chunks, use about sqrt(cpus) pieces per side so the number of
    # (ref, query) pairs is close to the cpu count.
    nrefs = nqueries = opts.chunks or int(cpus ** .5)
    refdir = ref.split(".")[0] + "-outdir"
    querydir = query.split(".")[0] + "-outdir"
    reflist = split([ref, refdir, str(nrefs)]).names
    querylist = split([query, querydir, str(nqueries)]).names

    mm = MakeManager()
    for i, (r, q) in enumerate(product(reflist, querylist)):
        pf = "{0:04d}".format(i)
        cmd = "nucmer -maxmatch"
        cmd += " {0}".format(opts.extra)
        cmd += " {0} {1} -p {2}".format(r, q, pf)
        deltafile = pf + ".delta"
        mm.add((r, q), deltafile, cmd)
        # Call form is valid on Python 2 and 3 and prints the same text for a
        # single string argument.
        print(cmd)

    mm.write()
def array(args):
    """
    %prog array commands.list

    Parallelize a set of commands on grid using array jobs.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is runtime output and is kept verbatim.
    #
    # args: list of command-line tokens; exactly one positional argument, the
    # commands list file, is expected.
    # Side effects: writes a `<prefix>.sh` runner script and, unless the grid
    # engine is "PBS", starts a GridProcess array job over it.
    p = OptionParser(array.__doc__)
    p.set_grid_opts(array=True)
    p.set_params(prog="grid")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    cmds, = args
    # Count the commands (one per line); `with` closes the handle even if
    # reading fails part-way.
    with open(cmds) as fp:
        N = sum(1 for _ in fp)

    pf = cmds.rsplit(".", 1)[0]
    runfile = pf + ".sh"
    assert runfile != cmds, \
        "Commands list file should not have a `.sh` extension"

    engine = get_grid_engine()
    threaded = opts.threaded or 1
    contents = arraysh.format(cmds) if engine == "SGE" \
        else arraysh_ua.format(N, threaded, cmds)
    write_file(runfile, contents)

    # For PBS only the runner script is written; nothing is submitted here.
    if engine == "PBS":
        return

    # Raw string: "\$" is not a valid escape sequence. The resulting value
    # (backslash, dollar, TASK_ID) is the same as before.
    task_id = r"\$TASK_ID"
    outfile = "{0}.{1}.out".format(pf, task_id)
    errfile = "{0}.{1}.err".format(pf, task_id)
    proc = GridProcess("sh {0}".format(runfile),
                       outfile=outfile, errfile=errfile,
                       arr=N, extra_opts=opts.extra, grid_opts=opts)
    proc.start()
def last(args):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB and LASTAL. LAST program available:
    <http://last.cbrc.jp>

    Works with LAST-719.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is runtime output and is kept verbatim.
    #
    # args: list of command-line tokens (options plus subject and query FASTA).
    # Side effects: runs lastdb through run_lastdb(), then runs lastal through
    # sh() with its output redirected to the file named by get_outfile().
    p = OptionParser(last.__doc__)
    p.add_option("--path", help="Specify LAST path")
    p.add_option("--mask", default=False, action="store_true",
                 help="Invoke -c in lastdb")
    p.add_option("--format", default="BlastTab",
                 choices=("TAB", "MAF", "BlastTab", "BlastTab+"),
                 help="Output format")
    p.add_option("--minlen", default=0, type="int",
                 help="Filter alignments by how many bases match")
    p.add_option("--minid", default=0, type="int",
                 help="Minimum sequence identity")
    p.set_cpus()
    p.set_params()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    path = opts.path
    cpus = opts.cpus

    def getpath(x):
        # Prefix the binary name with --path when one was given.
        return op.join(path, x) if path else x

    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")

    # The database prefix is the subject file name minus its last extension.
    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj", mask=opts.mask,
               lastdb_bin=lastdb_bin)

    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    cmd += " -P {0} -i3G".format(cpus)
    cmd += " -f {0}".format(opts.format)
    cmd += " {0} {1}".format(subjectdb, query)

    minlen = opts.minlen
    minid = opts.minid
    # Guard against a missing extra-params value before concatenating to it.
    extra = opts.extra or ""
    assert minid != 100, "Perfect match not yet supported"
    # Explicit floor division: the value is spliced into -q/-a/-b, and true
    # division would render it as e.g. "19.0".
    mm = minid // (100 - minid)

    if minlen:
        extra += " -e{0}".format(minlen)
    if minid:
        extra += " -r1 -q{0} -a{0} -b{0}".format(mm)
    if extra:
        cmd += " " + extra.strip()

    lastfile = get_outfile(subject, query, suffix="last")
    sh(cmd, outfile=lastfile)
def main(args):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB, LASTAL and LASTEX.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is runtime output and is kept verbatim.
    #
    # args: list of command-line tokens (options plus subject and query FASTA).
    # `supported_formats` is read from the enclosing module; the position of
    # the chosen format in it is what gets passed to `lastal -f`.
    p = OptionParser(main.__doc__)
    p.add_option("--path", help="specify LAST path")
    p.add_option("--mask", default=False, action="store_true",
                 help="invoke -c in lastdb [default: %default]")
    p.add_option("--format", default="blast", choices=supported_formats,
                 help="Output format [default: %default]")
    p.add_option("--eval", default=False, action="store_true",
                 help="Use lastex to recalculate E-value [default: %default]")
    p.set_cpus(cpus=32)
    p.set_params()
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    if opts.eval and opts.cpus > 1:
        # Call form of raise: valid on both Python 2 and 3.
        raise Exception("Option --eval cannnot work with multiple threads")

    path = opts.path

    def getpath(x):
        # Prefix the binary name with --path when one was given.
        return op.join(path, x) if path else x

    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")
    lastex_bin = getpath("lastex")

    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj", mask=opts.mask,
               lastdb_bin=lastdb_bin)

    cpus = opts.cpus
    logging.debug("Dispatch job to {0} cpus".format(cpus))

    oappend = False
    if opts.format == "maf":
        # Write the MAF header first; the output handle opened below is then
        # asked to append rather than overwrite.
        cmd = 'echo "##maf version=1"'
        sh(cmd, outfile=opts.outfile)
        oappend = True

    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    f = supported_formats.index(opts.format)
    cmd += " -f {0}".format(f)
    # "-" as the query: each worker is expected to feed its share on stdin.
    cmd += " {0} -".format(subjectdb)

    extra = opts.extra
    if extra:
        cmd += " " + extra

    if opts.eval:
        querydb = query.rsplit(".", 1)[0]
        run_lastdb(infile=query, outfile=querydb + ".prj")

        cmd += " | {0} {1}.prj {2}.prj -".format(lastex_bin, subjectdb, querydb)

    out_fh = must_open(opts.outfile, "w", checkexists=True, oappend=oappend)

    if out_fh is None:
        return

    lock = Lock()

    # One argument tuple per worker, numbered from 1; range() is valid on both
    # Python 2 and 3.
    args = [(k + 1, cpus, out_fh, cmd, query, lock)
            for k in range(cpus)]
    g = Jobs(target=last, args=args)
    g.run()
def align(args):
    """
    %prog align reference fastqfiles

    Use `clc_ref_assemble` to map the read files to a reference. Use a non-zero
    -s option to turn on paired end mode.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is runtime output and is kept verbatim.
    #
    # args: list of command-line tokens (options, reference, >= 1 read file).
    # Returns: the tuple (outfile, None), where outfile is the .cas path.
    # Side effects: writes "license.properties" and runs the mapper via sh().
    p = OptionParser(align.__doc__)
    p.add_option("-o", dest="outfile", default=None,
                 help="Output prefix.cas file [default: %default]")
    p.add_option("-s", dest="size", default=0, type="int",
                 help="Use paired end mapping with insert [default: %default]")
    p.add_option("--short", default=False, action="store_true",
                 help="Use `clc_ref_assemble_short` as the mapper [default: %default]")
    p.add_option("--orientations", default="fb",
                 help="The reads have the orientations [default: %default]")
    p.add_option("--fraction", default=0.5,
                 help="Fraction of the read that must match [default: %default]")
    p.add_option("--similarity", default=0.95,
                 help="Similarity of the matching region [default: %default]")
    p.set_params()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    write_file("license.properties", CLCLICENSE, skipcheck=True)

    ref = args[0]
    assert op.exists(ref)
    fastqfiles = args[1:]
    size = opts.size
    orientations = opts.orientations
    assert orientations in ("fb", "bf", "ff", "bb")

    cmd = "clc_ref_assemble_short" if opts.short else "clc_ref_assemble_long"
    # Default output name is "<reads prefix>.<reference prefix>.cas".
    readprefix = op.basename(fastqfiles[0]).split(".", 1)[0]
    refprefix = op.basename(ref).split(".", 1)[0]
    outfile = opts.outfile or "{0}.{1}".format(readprefix, refprefix)
    if not outfile.endswith(".cas"):
        outfile += ".cas"

    cmd += " --cpus {0}".format(opts.cpus)
    cmd += " -d {0} -o {1} -q ".format(ref, outfile)
    fastqs = " ".join(fastqfiles)

    if size == 0:
        cmd += fastqs
    else:
        assert len(fastqfiles) == 2
        # Explicit floor division keeps the insert-size bounds integral under
        # true division too.
        stddev = size // 4
        lb, ub = size - stddev, size + stddev
        cmd += " -p {0} ss {1} {2} -i {3} ".format(orientations, lb, ub, fastqs)

    if opts.extra:
        cmd += " " + opts.extra

    if not opts.short:
        cmd += " -l {0} -s {1}".format(opts.fraction, opts.similarity)

    sh(cmd)
    return outfile, None
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is runtime output and is kept verbatim.
    #
    # Reads its arguments from the process command line (parse_args() is
    # called without arguments). With --format BLASTN- (the default) the work
    # goes to Jobs/lastz workers sharing one output handle; any other format
    # runs lastz_2bit once per database record and writes under "outdir".
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in \
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "\
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("--format", default="BLASTN-", choices=supported_formats,
                 help="Ooutput format [default: %default]")
    p.add_option("--path", dest="lastz_path", default=None,
                 help="specify LASTZ path")
    p.add_option("--mask", dest="mask", default=False, action="store_true",
                 help="treat lower-case letters as mask info [default: %default]")
    p.add_option("--similar", default=False, action="store_true",
                 help="Use options tuned for close comparison [default: %default]")
    p.set_cpus(cpus=32)
    p.set_params()
    p.set_outfile()

    opts, args = p.parse_args()

    if len(args) != 2:
        # `not p.print_help()` gives a non-zero exit status on a usage error,
        # matching the other entry points in this file.
        sys.exit(not p.print_help())

    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    extra = opts.extra
    if opts.similar:
        extra += similarOptions

    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith("lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    fmt = opts.format
    blastline = (fmt == "BLASTN-")

    # The axt, maf, etc. formats can only be run on a split database (i.e. one
    # FASTA record per file). The split files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        args = []
        # bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format
        for seqid in bids:
            bfasta = "/".join((bfasta_2bit, seqid))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, seqid, fmt))
            args.append((bfasta, afasta_fn, outfile,
                         lastz_bin, extra, mask, fmt))

        pool = Pool(cpus)
        pool.map(lastz_2bit, args)

        return

    lock = Lock()

    # One argument tuple per worker, numbered from 1; range() is valid on both
    # Python 2 and 3.
    args = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
             lock, lastz_bin, extra, mask) for k in range(cpus)]
    g = Jobs(target=lastz, args=args)
    g.run()
def run(args):
    """
    %prog run command ::: file1 file2

    Parallelize a set of commands on grid. The syntax is modeled after GNU
    parallel <http://www.gnu.org/s/parallel/man.html#options>

    {}   - input line
    {.}  - input line without extension
    {_}  - input line first part
    {/}  - basename of input line
    {/.} - basename of input line without extension
    {/_} - basename of input line first part
    {#}  - sequence number of job to run
    :::  - Use arguments from the command line as input source instead of stdin
    (standard input).

    If file name is `t/example.tar.gz`, then,
    {} is "t/example.tar.gz", {.} is "t/example.tar", {_} is "t/example"
    {/} is "example.tar.gz", {/.} is "example.tar", {/_} is "example"

    A few examples:
    ls -1 *.fastq | %prog run process {} {.}.pdf  # use stdin
    %prog run process {} {.}.pdf ::: *fastq  # use :::
    %prog run "zcat {} > {.}" ::: *.gz  # quote redirection
    %prog run < commands.list  # run a list of commands
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is runtime output and is kept verbatim.
    #
    # args: list of command-line tokens; everything before ":::" forms the
    # command template, everything after it is the list of inputs.
    # Side effects: starts one GridProcess per expanded command.
    parser = OptionParser(run.__doc__)
    parser.set_grid_opts()
    parser.set_params(prog="grid")
    opts, args = parser.parse_args(args)

    if len(args) == 0:
        sys.exit(not parser.print_help())

    sep = ":::"
    if sep not in args:
        # No explicit inputs: read them from piped stdin, or fall back to a
        # single empty input when attached to a terminal.
        filenames = [""] if sys.stdin.isatty() else sys.stdin
    else:
        sepidx = args.index(sep)
        filenames = args[sepidx + 1:]
        args = args[:sepidx]
        if not filenames:
            filenames = [""]

    cmd = " ".join(args)

    if filenames:
        cmds = []
    else:
        cmds = [(cmd, None)]

    for seqno, line in enumerate(filenames):
        filename = line.strip()
        prefix, basename = op.split(filename)
        basefirstname = basename.split(".")[0]

        # Placeholders other than "{}", applied in this exact order.
        substitutions = (
            ("{.}", filename.rsplit(".", 1)[0]),
            ("{_}", op.join(prefix, basefirstname)),
            ("{/}", basename),
            ("{/.}", basename.rsplit(".", 1)[0]),
            ("{/_}", basefirstname),
            ("{#}", str(seqno)),
        )

        # A template without any brace gets the input appended instead.
        if "{" in cmd:
            ncmd = cmd.replace("{}", filename)
        else:
            ncmd = cmd + " " + filename

        for placeholder, value in substitutions:
            ncmd = ncmd.replace(placeholder, value)

        # Everything after the first ">" becomes the job's output file.
        head, redirect, tail = ncmd.partition(">")
        outfile = tail.strip() if redirect else None
        ncmd = head.strip()

        cmds.append((ncmd, outfile))

    for ncmd, outfile in cmds:
        proc = GridProcess(ncmd, outfile=outfile,
                           extra_opts=opts.extra, grid_opts=opts)
        proc.start()
def align(args):
    """
    %prog align reference fastqfiles

    Use `clc_ref_assemble` to map the read files to a reference. Use a non-zero
    -s option to turn on paired end mode.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is runtime output and is kept verbatim.
    #
    # args: list of command-line tokens (options, reference, >= 1 read file).
    # Returns: the tuple (outfile, None), where outfile is the .cas path.
    # Side effects: writes "license.properties" and runs the mapper via sh().
    p = OptionParser(align.__doc__)
    p.add_option("-o", dest="outfile", default=None,
                 help="Output prefix.cas file [default: %default]")
    p.add_option("-s", dest="size", default=0, type="int",
                 help="Use paired end mapping with insert [default: %default]")
    p.add_option("--short", default=False, action="store_true",
                 help="Use `clc_ref_assemble_short` as the mapper [default: %default]")
    p.add_option("--orientations", default="fb",
                 help="The reads have the orientations [default: %default]")
    p.add_option("--fraction", default=0.5,
                 help="Fraction of the read that must match [default: %default]")
    p.add_option("--similarity", default=0.95,
                 help="Similarity of the matching region [default: %default]")
    p.set_params()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    write_file("license.properties", CLCLICENSE, skipcheck=True)

    ref = args[0]
    assert op.exists(ref)
    fastqfiles = args[1:]
    size = opts.size
    orientations = opts.orientations
    assert orientations in ("fb", "bf", "ff", "bb")

    cmd = "clc_ref_assemble_short" if opts.short else "clc_ref_assemble_long"
    # Default output name is "<reads prefix>.<reference prefix>.cas".
    readprefix = op.basename(fastqfiles[0]).split(".", 1)[0]
    refprefix = op.basename(ref).split(".", 1)[0]
    outfile = opts.outfile or "{0}.{1}".format(readprefix, refprefix)
    if not outfile.endswith(".cas"):
        outfile += ".cas"

    cmd += " --cpus {0}".format(opts.cpus)
    cmd += " -d {0} -o {1} -q ".format(ref, outfile)
    fastqs = " ".join(fastqfiles)

    if size == 0:
        cmd += fastqs
    else:
        assert len(fastqfiles) == 2
        # Explicit floor division keeps the insert-size bounds integral under
        # true division too.
        stddev = size // 4
        lb, ub = size - stddev, size + stddev
        cmd += " -p {0} ss {1} {2} -i {3} ".format(orientations, lb, ub, fastqs)

    if opts.extra:
        cmd += " " + opts.extra

    if not opts.short:
        cmd += " -l {0} -s {1}".format(opts.fraction, opts.similarity)

    sh(cmd)
    return outfile, None
def main():
    """
    %prog database.fa query.fa [options]

    Run LASTZ similar to the BLAST interface, and generates -m8 tabular format
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is runtime output and is kept verbatim.
    #
    # Reads its arguments from the process command line (parse_args() is
    # called without arguments). With --format BLASTN- (the default) the work
    # goes to Jobs/lastz workers sharing one output handle; any other format
    # runs lastz_2bit once per database record and writes under "outdir".
    p = OptionParser(main.__doc__)

    supported_formats = tuple(x.strip() for x in \
        "lav, lav+text, axt, axt+, maf, maf+, maf-, sam, softsam, "\
        "sam-, softsam-, cigar, BLASTN, BLASTN-, differences, rdotplot, text".split(','))

    p.add_option("--format", default="BLASTN-", choices=supported_formats,
                 help="Ooutput format [default: %default]")
    p.add_option("--path", dest="lastz_path", default=None,
                 help="specify LASTZ path")
    p.add_option("--mask", dest="mask", default=False, action="store_true",
                 help="treat lower-case letters as mask info [default: %default]")
    p.add_option("--similar", default=False, action="store_true",
                 help="Use options tuned for close comparison [default: %default]")
    p.set_cpus(cpus=32)
    p.set_params()
    p.set_outfile()

    opts, args = p.parse_args()

    if len(args) != 2:
        # `not p.print_help()` gives a non-zero exit status on a usage error,
        # matching the other entry points in this file.
        sys.exit(not p.print_help())

    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    extra = opts.extra
    if opts.similar:
        extra += similarOptions

    lastz_bin = opts.lastz_path or "lastz"
    assert lastz_bin.endswith("lastz"), "You need to include lastz in your path"

    mask = opts.mask
    cpus = opts.cpus
    logging.debug("Dispatch job to %d cpus" % cpus)
    fmt = opts.format
    blastline = (fmt == "BLASTN-")

    # The axt, maf, etc. formats can only be run on a split database (i.e. one
    # FASTA record per file). The split files are then parallelized for the
    # computation, as opposed to splitting queries through "subsample".
    outdir = "outdir"
    if not blastline:
        from jcvi.formats.fasta import Fasta
        from jcvi.formats.chain import faToTwoBit

        mkdir(outdir)

        bfasta_2bit = faToTwoBit(bfasta_fn)
        bids = list(Fasta(bfasta_fn, lazy=True).iterkeys_ordered())

        apf = op.basename(afasta_fn).split(".")[0]
        args = []
        # bfasta_fn, afasta_fn, outfile, lastz_bin, extra, mask, format
        for seqid in bids:
            bfasta = "/".join((bfasta_2bit, seqid))
            outfile = op.join(outdir, "{0}.{1}.{2}".format(apf, seqid, fmt))
            args.append((bfasta, afasta_fn, outfile,
                         lastz_bin, extra, mask, fmt))

        pool = Pool(cpus)
        pool.map(lastz_2bit, args)

        return

    lock = Lock()

    # One argument tuple per worker, numbered from 1; range() is valid on both
    # Python 2 and 3.
    args = [(k + 1, cpus, bfasta_fn, afasta_fn, out_fh,
             lock, lastz_bin, extra, mask) for k in range(cpus)]
    g = Jobs(target=lastz, args=args)
    g.run()
def run(args):
    """
    %prog run command ::: file1 file2

    Parallelize a set of commands on grid. The syntax is modeled after GNU
    parallel <http://www.gnu.org/s/parallel/man.html#options>

    {}   - input line
    {.}  - input line without extension
    {_}  - input line first part
    {/}  - basename of input line
    {/.} - basename of input line without extension
    {/_} - basename of input line first part
    {#}  - sequence number of job to run
    :::  - Use arguments from the command line as input source instead of stdin
    (standard input).

    If file name is `t/example.tar.gz`, then,
    {} is "t/example.tar.gz", {.} is "t/example.tar", {_} is "t/example"
    {/} is "example.tar.gz", {/.} is "example.tar", {/_} is "example"

    A few examples:
    ls -1 *.fastq | %prog run process {} {.}.pdf  # use stdin
    %prog run process {} {.}.pdf ::: *fastq  # use :::
    %prog run "zcat {} > {.}" ::: *.gz  # quote redirection
    %prog run < commands.list  # run a list of commands
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is runtime output and is kept verbatim.
    #
    # args: list of command-line tokens; everything before ":::" forms the
    # command template, everything after it is the list of inputs.
    # Side effects: starts one GridProcess per expanded command.
    parser = OptionParser(run.__doc__)
    parser.set_grid_opts()
    parser.set_params(prog="grid")
    opts, args = parser.parse_args(args)

    if len(args) == 0:
        sys.exit(not parser.print_help())

    sep = ":::"
    if sep not in args:
        # No explicit inputs: read them from piped stdin, or fall back to a
        # single empty input when attached to a terminal.
        filenames = [""] if sys.stdin.isatty() else sys.stdin
    else:
        sepidx = args.index(sep)
        filenames = args[sepidx + 1:]
        args = args[:sepidx]
        if not filenames:
            filenames = [""]

    cmd = " ".join(args)

    if filenames:
        cmds = []
    else:
        cmds = [(cmd, None)]

    for seqno, line in enumerate(filenames):
        filename = line.strip()
        prefix, basename = op.split(filename)
        basefirstname = basename.split(".")[0]

        # Placeholders other than "{}", applied in this exact order.
        substitutions = (
            ("{.}", filename.rsplit(".", 1)[0]),
            ("{_}", op.join(prefix, basefirstname)),
            ("{/}", basename),
            ("{/.}", basename.rsplit(".", 1)[0]),
            ("{/_}", basefirstname),
            ("{#}", str(seqno)),
        )

        # A template without any brace gets the input appended instead.
        if "{" in cmd:
            ncmd = cmd.replace("{}", filename)
        else:
            ncmd = cmd + " " + filename

        for placeholder, value in substitutions:
            ncmd = ncmd.replace(placeholder, value)

        # Everything after the first ">" becomes the job's output file.
        head, redirect, tail = ncmd.partition(">")
        outfile = tail.strip() if redirect else None
        ncmd = head.strip()

        cmds.append((ncmd, outfile))

    for ncmd, outfile in cmds:
        proc = GridProcess(ncmd, outfile=outfile,
                           extra_opts=opts.extra, grid_opts=opts)
        proc.start()
def main():
    """
    %prog database.fa query.fa [options]

    Wrapper for NCBI BLAST+.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is runtime output and is kept verbatim.
    #
    # Reads its arguments from the process command line, makes sure the
    # database index exists via run_formatdb(), then hands one blastplus job
    # per query chunk to Jobs, all sharing one output handle and lock.
    parser = OptionParser(main.__doc__)

    default_outfmt = (
        " '6 qseqid sseqid pident length "
        "mismatch gapopen qstart qend sstart send evalue bitscore' "
    )
    parser.add_option(
        "--format",
        default=default_outfmt,
        help='0-11, learn more with "blastp -help". [default: %default]',
    )
    parser.add_option(
        "--path",
        dest="blast_path",
        default=None,
        help="specify BLAST+ path including the program name",
    )
    parser.add_option(
        "--prog",
        dest="blast_program",
        default="blastp",
        help="specify BLAST+ program to use. See complete list here: "
             "http://www.ncbi.nlm.nih.gov/books/NBK52640/#chapter1.Installation"
             " [default: %default]",
    )
    parser.set_align(evalue=.01)
    parser.add_option(
        "--best",
        default=1,
        type="int",
        help="Only look for best N hits [default: %default]",
    )
    parser.set_cpus()
    parser.add_option(
        "--nprocs",
        default=1,
        type="int",
        help="number of BLAST processes to run in parallel. "
             "split query.fa into `nprocs` chunks, "
             "each chunk uses -num_threads=`cpus`",
    )
    parser.set_params()
    parser.set_outfile()

    opts, args = parser.parse_args()

    if len(args) != 2 or opts.blast_program is None:
        sys.exit(not parser.print_help())

    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    extra = opts.extra
    blast_program = opts.blast_program

    # --path may name either the binary itself or the directory holding it.
    blast_bin = opts.blast_path or blast_program
    if op.basename(blast_bin) != blast_program:
        blast_bin = op.join(blast_bin, blast_program)

    nprocs, cpus = opts.nprocs, opts.cpus
    if nprocs > 1:
        logging.debug("Dispatch job to %d processes" % nprocs)
        outdir = "outdir"
        queries = split([afasta_fn, outdir, str(nprocs)]).names
    else:
        queries = [afasta_fn]

    is_protein = op.basename(blast_bin) in ("blastp", "blastx")
    dbtype = "prot" if is_protein else "nucl"

    db = bfasta_fn
    if is_protein:
        index_file = db + ".pin"
    else:
        # Prefer the ".00.nin" index when it exists, else the plain ".nin".
        volume_index = db + ".00.nin"
        index_file = volume_index if op.exists(volume_index) else (db + ".nin")

    run_formatdb(infile=db, outfile=index_file, dbtype=dbtype)

    lock = Lock()

    blast_cmd = "{0} -db {1} -outfmt {2}".format(
        blast_bin, bfasta_fn, opts.format)
    blast_cmd += " -evalue {0} -max_target_seqs {1}".format(
        opts.evalue, opts.best)
    blast_cmd += " -num_threads {0}".format(cpus)
    if extra:
        blast_cmd += " " + extra.strip()

    args = [(out_fh, blast_cmd, query, lock) for query in queries]
    jobs = Jobs(target=blastplus, args=args)
    jobs.run()
def assemble(args):
    """
    Run `cap3` on a single multi FASTA file containing reads or a folder containing several
    multi FASTA files. Allows for tweaking of `cap3` parameters max_gap_len, ovl_pct_id, etc.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is runtime output and is kept verbatim.
    #
    # args: list of command-line tokens.
    # Side effects: runs `cap3` through sh() once per existing input file and
    # writes its output to "<input>.<prefix>.log". Invalid options or missing
    # inputs are logged and end the process via sys.exit().
    p = OptionParser(assemble.__doc__)
    g1 = OptionGroup(
        p, "Input file options (required)",
        "Note: Please choose from and provide values for one of the following parameters"
    )
    g1.add_option("--input_file", default=None,
                  help="input file of reads [default: %default]")
    g1.add_option(
        "--input_folder", default=None,
        help="input folder containing multi FASTA files of reads [default: %default]"
    )
    g1.add_option(
        "--input_file_list", default=None,
        help="list file containing paths to multi FASTA files of reads [default: %default]"
    )
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters",
                     "Note: If not specified, `cap3` defaults will be used")
    g2.add_option("-f", "--max_gap_len", default=20, type="int",
                  help="maximum gap length in any overlap [default: %default]\n" +
                       "Same as cap3 `-f` parameter.")
    g2.add_option("-p", "--ovl_pct_id", default=90, type="int",
                  help="overlap percent identity cutoff [default: %default]\n" +
                       "Same as cap3 `-p` parameter.")
    g2.add_option("-s", "--ovl_sim_score", default=900, type="int",
                  help="overlap similarity score cutoff [default: %default]\n" +
                       "Same as cap3 `-s` parameter.")
    g2.add_option("-x", "--prefix", dest="prefix", default="cap3",
                  help="prefix string for output file name [default: %default]")
    p.add_option_group(g2)

    p.set_params()

    opts, args = p.parse_args(args)

    # All three options are ints with int defaults, so compare them directly:
    # the former `opts.x and opts.x <= limit` test let a value of 0 through.
    # Errors now exit with a non-zero status.
    if opts.max_gap_len <= 1:
        logging.error("--max_gap_len should be > 1")
        sys.exit(1)
    elif opts.ovl_pct_id <= 65:
        logging.error("--ovl_pct_id should be > 65")
        sys.exit(1)
    elif opts.ovl_sim_score <= 250:
        logging.error("--ovl_sim_score should be > 250")
        sys.exit(1)

    file_list = []
    if opts.input_file_list:
        if not op.isfile(opts.input_file_list):
            logging.error("Input file list {0} does not exist".format(
                opts.input_file_list))
            sys.exit(1)
        with open(opts.input_file_list, 'r') as f:
            file_list = f.read().splitlines()
    elif opts.input_folder:
        if not op.isdir(opts.input_folder):
            logging.error("Input folder {0} does not exist".format(
                opts.input_folder))
            sys.exit(1)

        # Keep only FASTA-looking names, each prefixed with the folder (any
        # trailing slashes on the folder are removed first).
        folder = opts.input_folder.rstrip('/')
        file_list = [folder + "/" + name
                     for name in os.listdir(opts.input_folder)
                     if name.lower().endswith(('.fa', '.fasta'))]
    elif opts.input_file:
        file_list.append(opts.input_file)
    else:
        logging.error("Please specify one of the options for input files")
        sys.exit(not p.print_help())

    if len(file_list) == 0:
        logging.warning(
            "List of files to process is empty. Please check your input!")
        sys.exit()

    for fasta in file_list:
        # A missing input is reported and skipped; the rest still run.
        if not op.isfile(fasta):
            logging.warning("Input file {0} does not exist".format(fasta))
        else:
            cmd = "cap3 {0} -f {1} -p {2} -s {3} -x {4}".format(
                fasta, opts.max_gap_len, opts.ovl_pct_id,
                opts.ovl_sim_score, opts.prefix)
            if opts.extra:
                cmd += " {0}".format(opts.extra)
            logfile = "{0}.{1}.log".format(fasta, opts.prefix)

            sh(cmd, outfile=logfile)
def main(args):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB, LASTAL and LASTEX.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is runtime output and is kept verbatim.
    #
    # args: list of command-line tokens (options plus subject and query FASTA).
    # The position of the chosen format in `supported_formats` is what gets
    # passed to `lastal -f`.
    supported_formats = ("tab", "maf", "blast")

    p = OptionParser(main.__doc__)
    p.add_option("--path", help="specify LAST path")
    p.add_option("--mask", default=False, action="store_true",
                 help="invoke -c in lastdb [default: %default]")
    p.add_option("--format", default="blast", choices=supported_formats,
                 help="Output format [default: %default]")
    p.add_option("--eval", default=False, action="store_true",
                 help="Use lastex to recalculate E-value [default: %default]")
    p.set_cpus(cpus=32)
    p.set_params()
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    if opts.eval and opts.cpus > 1:
        # Call form of raise: valid on both Python 2 and 3.
        raise Exception("Option --eval cannnot work with multiple threads")

    path = opts.path

    def getpath(x):
        # Prefix the binary name with --path when one was given.
        return op.join(path, x) if path else x

    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")
    lastex_bin = getpath("lastex")

    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj", mask=opts.mask,
               lastdb_bin=lastdb_bin)

    cpus = opts.cpus
    logging.debug("Dispatch job to {0} cpus".format(cpus))

    oappend = False
    if opts.format == "maf":
        # Write the MAF header first; the output handle opened below is then
        # asked to append rather than overwrite.
        cmd = 'echo "##maf version=1"'
        sh(cmd, outfile=opts.outfile)
        oappend = True

    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    f = supported_formats.index(opts.format)
    cmd += " -f {0}".format(f)
    # "-" as the query: each worker is expected to feed its share on stdin.
    cmd += " {0} -".format(subjectdb)

    extra = opts.extra
    if extra:
        cmd += " " + extra

    if opts.eval:
        querydb = query.rsplit(".", 1)[0]
        run_lastdb(infile=query, outfile=querydb + ".prj")

        cmd += " | {0} {1}.prj {2}.prj -".format(lastex_bin, subjectdb, querydb)

    out_fh = must_open(opts.outfile, "w", checkexists=True, oappend=oappend)

    if out_fh is None:
        return

    lock = Lock()

    # One argument tuple per worker, numbered from 1; range() is valid on both
    # Python 2 and 3.
    args = [(k + 1, cpus, out_fh, cmd, query, lock)
            for k in range(cpus)]
    g = Jobs(target=last, args=args)
    g.run()
def assemble(args):
    """
    Run `cap3` on a single multi FASTA file containing reads or a folder containing several
    multi FASTA files. Allows for tweaking of `cap3` parameters max_gap_len, ovl_pct_id, etc.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is runtime output and is kept verbatim.
    #
    # args: list of command-line tokens.
    # Side effects: runs `cap3` through sh() once per existing input file and
    # writes its output to "<input>.<prefix>.log". Invalid options or missing
    # inputs are logged and end the process via sys.exit().
    p = OptionParser(assemble.__doc__)
    g1 = OptionGroup(p, "Input file options (required)",
                     "Note: Please choose from and provide values for one of the following parameters")
    g1.add_option("--input_file", default=None,
                  help="input file of reads [default: %default]")
    g1.add_option("--input_folder", default=None,
                  help="input folder containing multi FASTA files of reads [default: %default]")
    g1.add_option("--input_file_list", default=None,
                  help="list file containing paths to multi FASTA files of reads [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters",
                     "Note: If not specified, `cap3` defaults will be used")
    g2.add_option("-f", "--max_gap_len", default=20, type="int",
                  help="maximum gap length in any overlap [default: %default]\n" +
                       "Same as cap3 `-f` parameter.")
    g2.add_option("-p", "--ovl_pct_id", default=90, type="int",
                  help="overlap percent identity cutoff [default: %default]\n" +
                       "Same as cap3 `-p` parameter.")
    g2.add_option("-s", "--ovl_sim_score", default=900, type="int",
                  help="overlap similarity score cutoff [default: %default]\n" +
                       "Same as cap3 `-s` parameter.")
    g2.add_option("-x", "--prefix", dest="prefix", default="cap3",
                  help="prefix string for output file name [default: %default]")
    p.add_option_group(g2)

    p.set_params()

    opts, args = p.parse_args(args)

    # All three options are ints with int defaults, so compare them directly:
    # the former `opts.x and opts.x <= limit` test let a value of 0 through.
    # Errors now exit with a non-zero status.
    if opts.max_gap_len <= 1:
        logging.error("--max_gap_len should be > 1")
        sys.exit(1)
    elif opts.ovl_pct_id <= 65:
        logging.error("--ovl_pct_id should be > 65")
        sys.exit(1)
    elif opts.ovl_sim_score <= 250:
        logging.error("--ovl_sim_score should be > 250")
        sys.exit(1)

    file_list = []
    if opts.input_file_list:
        if not op.isfile(opts.input_file_list):
            logging.error("Input file list {0} does not exist".format(opts.input_file_list))
            sys.exit(1)
        with open(opts.input_file_list, 'r') as f:
            file_list = f.read().splitlines()
    elif opts.input_folder:
        if not op.isdir(opts.input_folder):
            logging.error("Input folder {0} does not exist".format(opts.input_folder))
            sys.exit(1)

        # Keep only FASTA-looking names, each prefixed with the folder (any
        # trailing slashes on the folder are removed first).
        folder = opts.input_folder.rstrip('/')
        file_list = [folder + "/" + name
                     for name in os.listdir(opts.input_folder)
                     if name.lower().endswith('.fa') or name.lower().endswith('.fasta')]
    elif opts.input_file:
        file_list.append(opts.input_file)
    else:
        logging.error("Please specify one of the options for input files")
        sys.exit(not p.print_help())

    if len(file_list) == 0:
        logging.warning("List of files to process is empty. Please check your input!")
        sys.exit()

    for fasta in file_list:
        # A missing input is reported and skipped; the rest still run.
        if not op.isfile(fasta):
            logging.warning("Input file {0} does not exist".format(fasta))
        else:
            cmd = "cap3 {0} -f {1} -p {2} -s {3} -x {4}".format(
                fasta, opts.max_gap_len, opts.ovl_pct_id,
                opts.ovl_sim_score, opts.prefix)
            if opts.extra:
                cmd += " {0}".format(opts.extra)
            logfile = "{0}.{1}.log".format(fasta, opts.prefix)

            sh(cmd, outfile=logfile)