def nucmer(args):
    """
    %prog nucmer ref.fasta query.fasta

    Run NUCMER using query against reference. Parallel implementation derived
    from: <https://github.com/fritzsedlazeck/sge_mummer>
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept in "%prog ..." form rather than a conventional docstring.
    from itertools import product

    from jcvi.apps.grid import MakeManager
    from jcvi.formats.base import split

    p = OptionParser(nucmer.__doc__)
    p.add_option("--chunks", type="int",
                 help="Split both query and subject into chunks")
    p.set_params(prog="nucmer", params="-g 5000 -l 24 -c 500")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    cpus = opts.cpus
    # Without --chunks, cut each input into floor(sqrt(cpus)) pieces, so the
    # number of (ref, query) pairs never exceeds the CPU count.
    nrefs = nqueries = opts.chunks or int(cpus ** .5)
    # Output folders are named after the text before the FIRST "." in the
    # given path.
    refdir = ref.split(".")[0] + "-outdir"
    querydir = query.split(".")[0] + "-outdir"
    reflist = split([ref, refdir, str(nrefs)]).names
    querylist = split([query, querydir, str(nqueries)]).names

    mm = MakeManager()
    # One nucmer command per (reference chunk, query chunk) pair; the
    # zero-padded pair index serves as the output prefix.
    for i, (r, q) in enumerate(product(reflist, querylist)):
        pf = "{0:04d}".format(i)
        cmd = "nucmer -maxmatch"
        cmd += " {0}".format(opts.extra)
        cmd += " {0} {1} -p {2}".format(r, q, pf)
        deltafile = pf + ".delta"
        mm.add((r, q), deltafile, cmd)
        # print() call form: the statement form `print cmd` is a syntax
        # error on Python 3.
        print(cmd)

    mm.write()
def nucmer(args):
    """
    %prog nucmer ref.fasta query.fasta

    Run NUCMER using query against reference. Parallel implementation derived
    from: <https://github.com/fritzsedlazeck/sge_mummer>
    """
    # NOTE: the docstring is passed to OptionParser below and therefore acts
    # as the command's usage message.
    from itertools import product

    from jcvi.apps.grid import MakeManager
    from jcvi.formats.base import split

    p = OptionParser(nucmer.__doc__)
    p.add_option("--chunks", type="int",
                 help="Split both query and subject into chunks")
    p.set_params(prog="nucmer", params="-l 100 -c 500")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    cpus = opts.cpus
    # Default chunk count is floor(sqrt(cpus)) per input, which keeps the
    # ref x query job count at or below the number of CPUs.
    nrefs = nqueries = opts.chunks or int(cpus ** .5)
    # Chunk folders take their name from the part of the path that precedes
    # the first ".".
    refdir = ref.split(".")[0] + "-outdir"
    querydir = query.split(".")[0] + "-outdir"
    reflist = split([ref, refdir, str(nrefs)]).names
    querylist = split([query, querydir, str(nqueries)]).names

    mm = MakeManager()
    # Pair every reference chunk with every query chunk; each pair gets a
    # four-digit prefix used for its .delta target.
    for i, (r, q) in enumerate(product(reflist, querylist)):
        pf = "{0:04d}".format(i)
        cmd = "nucmer -maxmatch"
        cmd += " {0}".format(opts.extra)
        cmd += " {0} {1} -p {2}".format(r, q, pf)
        deltafile = pf + ".delta"
        mm.add((r, q), deltafile, cmd)
        # Function-call print: `print cmd` does not parse on Python 3.
        print(cmd)

    mm.write()
def augustus(args):
    """
    %prog augustus fastafile

    Run parallel AUGUSTUS. Final results can be reformatted using
    annotation.reformat.augustus().
    """
    # The docstring above is reused verbatim as the OptionParser usage text.
    parser = OptionParser(augustus.__doc__)
    parser.add_option("--species", default="maize",
                      help="Use species model for prediction")
    parser.add_option("--hintsfile", help="Hint-guided AUGUSTUS")
    parser.add_option("--nogff3", default=False, action="store_true",
                      help="Turn --gff3=off")
    parser.set_home("augustus")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    fastafile = args[0]
    want_gff3 = not opts.nogff3
    ext = ".gff3" if want_gff3 else ".out"

    # Keyword arguments shared by every per-chunk augustuswrap call.
    wrap_kwargs = {
        "species": opts.species,
        "gff3": want_gff3,
        "cfgfile": op.join(opts.augustus_home,
                           "config/extrinsic/extrinsic.M.RM.E.W.cfg"),
        "hintsfile": opts.hintsfile,
    }

    # Split the input into one piece per CPU inside a scratch folder created
    # under the current directory.
    workdir = mkdtemp(dir=".")
    fs = split([fastafile, workdir, str(opts.cpus)])

    runner = Jobs(partial(augustuswrap, **wrap_kwargs), fs.names)
    runner.run()

    # Per-chunk results are expected beside each chunk, with the chunk's last
    # extension swapped for `ext`.
    partfiles = []
    for chunk in fs.names:
        partfiles.append(chunk.rsplit(".", 1)[0] + ext)

    outfile = fastafile.rsplit(".", 1)[0] + ext
    FileMerger(partfiles, outfile=outfile).merge()
    shutil.rmtree(workdir)

    if not want_gff3:
        return

    # GFF3 output gets a second, reformatted copy next to the merged file.
    from jcvi.annotation.reformat import augustus as reformat_augustus

    reformat_outfile = outfile.replace(".gff3", ".reformat.gff3")
    reformat_augustus([outfile, "--outfile={0}".format(reformat_outfile)])
def parallel_musclewrap(clustfile, cpus, minsamp=0):
    """
    Run musclewrap on `clustfile`, fanning out over `cpus` chunks.

    With cpus == 1 the file is processed directly and musclewrap's return
    value is passed back. Otherwise the file is split into `cpus` pieces in a
    scratch folder, each piece is processed through Jobs, the per-piece
    ".clustS" files are merged into the ".clustS" counterpart of `clustfile`,
    and the scratch folder is removed (nothing is returned in that case).
    """
    worker = partial(musclewrap, minsamp=minsamp)

    # Single CPU: no splitting or merging needed.
    if cpus == 1:
        return worker(clustfile)

    from jcvi.apps.grid import Jobs

    workdir = mkdtemp(dir=".")
    fs = split([clustfile, workdir, str(cpus), "--format=clust"])

    runner = Jobs(worker, fs.names)
    runner.run()

    # Each chunk's result is expected under the same name with ".clust"
    # replaced by ".clustS".
    pieces = []
    for chunk in fs.names:
        pieces.append(chunk.replace(".clust", ".clustS"))

    merged = clustfile.replace(".clust", ".clustS")
    FileMerger(pieces, outfile=merged).merge()
    shutil.rmtree(workdir)
def augustus(args):
    """
    %prog augustus fastafile

    Run parallel AUGUSTUS. Final results can be reformatted using
    annotation.reformat.augustus().
    """
    # NOTE: OptionParser receives the docstring above as its usage message.
    p = OptionParser(augustus.__doc__)
    p.add_option("--species", default="maize",
                 help="Use species model for prediction")
    p.add_option("--hintsfile", help="Hint-guided AUGUSTUS")
    p.add_option("--nogff3", default=False, action="store_true",
                 help="Turn --gff3=off")
    p.set_home("augustus")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    emit_gff3 = not opts.nogff3
    if emit_gff3:
        suffix = ".gff3"
    else:
        suffix = ".out"

    def with_suffix(path):
        # Drop the last ".ext" (if any) and append the output suffix.
        return path.rsplit(".", 1)[0] + suffix

    cfgfile = op.join(opts.augustus_home,
                      "config/extrinsic/extrinsic.M.RM.E.W.cfg")

    # Scratch folder under the current directory holds one chunk per CPU.
    scratch = mkdtemp(dir=".")
    fs = split([fastafile, scratch, str(opts.cpus)])

    per_chunk = partial(augustuswrap, species=opts.species, gff3=emit_gff3,
                        cfgfile=cfgfile, hintsfile=opts.hintsfile)
    jobs = Jobs(per_chunk, fs.names)
    jobs.run()

    # Collect the per-chunk outputs into one file, then drop the scratch
    # folder.
    chunk_outputs = list(map(with_suffix, fs.names))
    outfile = with_suffix(fastafile)
    merger = FileMerger(chunk_outputs, outfile=outfile)
    merger.merge()
    shutil.rmtree(scratch)

    if emit_gff3:
        from jcvi.annotation.reformat import augustus as reformat_augustus

        # Write a reformatted copy alongside the merged GFF3.
        reformat_outfile = outfile.replace(".gff3", ".reformat.gff3")
        reformat_args = [outfile, "--outfile={0}".format(reformat_outfile)]
        reformat_augustus(reformat_args)
def parallel(args):
    """
    %prog parallel genome.fasta N

    Partition the genome into parts and run separately. This is useful if MAKER
    is to be run on the grid.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text.
    from jcvi.formats.base import split

    p = OptionParser(parallel.__doc__)
    p.set_home("maker")
    p.set_tmpdir(tmpdir="tmp")
    p.set_grid_opts(array=True)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    genome, NN = args
    threaded = opts.threaded or 1
    tmpdir = opts.tmpdir

    mkdir(tmpdir)
    tmpdir = get_abs_path(tmpdir)

    N = int(NN)
    # Message now matches the check: N == 1 is accepted.
    assert 1 <= N < 1000, "Required: 1 <= N < 1000!"

    outdir = "outdir"
    fs = split([genome, outdir, NN])

    c = CTLFile("maker_opts.ctl")
    c.update_abs_path()
    if threaded > 1:
        c.update_tag("cpus", threaded)

    cwd = os.getcwd()
    dirs = []
    # One working folder per genome chunk, named after the chunk file; each
    # receives a copy of the *.ctl files plus a maker_opts.ctl whose "genome"
    # tag points at that chunk.
    for name in fs.names:
        fn = get_abs_path(name)
        bn = op.basename(name)
        dirs.append(bn)
        c.update_tag("genome", fn)
        mkdir(bn)
        sh("cp *.ctl {0}".format(bn))

        os.chdir(bn)
        try:
            c.write_file("maker_opts.ctl")
        finally:
            # Always return to the starting directory, even if the write
            # fails, so later relative paths are not resolved inside `bn`.
            os.chdir(cwd)

    jobs = "jobs"
    # Context manager closes the job list even if the write raises.
    with open(jobs, "w") as fw:
        print("\n".join(dirs), file=fw)

    # Submit to grid
    ncmds = len(dirs)
    runfile = "array.sh"
    cmd = op.join(opts.maker_home, "bin/maker")
    if tmpdir:
        cmd += " -TMP {0}".format(tmpdir)

    engine = get_grid_engine()
    # SGE uses the `arraysh` template; every other engine uses `arraysh_ua`.
    if engine == "SGE":
        contents = arraysh.format(jobs, cmd)
    else:
        contents = arraysh_ua.format(N, threaded, jobs, cmd)
    write_file(runfile, contents)

    # For PBS only the array script is written; no qsub wrapper is generated.
    if engine == "PBS":
        return

    # qsub script
    # "\\$" spells out the literal backslash explicitly; the former "\$" was
    # an invalid escape sequence that produced the same characters.
    outfile = "maker.\\$TASK_ID.out"
    p = GridProcess(runfile, outfile=outfile, errfile=outfile,
                    arr=ncmds, grid_opts=opts)
    qsubfile = "qsub.sh"
    qsub = p.build()
    write_file(qsubfile, qsub)
def main():
    """
    %prog database.fa query.fa [options]

    Wrapper for NCBI BLAST+.
    """
    # The docstring above is reused as the OptionParser usage text.
    parser = OptionParser(main.__doc__)
    parser.add_option(
        "--format",
        default=" '6 qseqid sseqid pident length "
                "mismatch gapopen qstart qend sstart send evalue bitscore' ",
        help='0-11, learn more with "blastp -help". [default: %default]',
    )
    parser.add_option(
        "--path", dest="blast_path", default=None,
        help="specify BLAST+ path including the program name",
    )
    parser.add_option(
        "--prog", dest="blast_program", default="blastp",
        help="specify BLAST+ program to use. See complete list here: "
             "http://www.ncbi.nlm.nih.gov/books/NBK52640/#chapter1.Installation"
             " [default: %default]",
    )
    parser.set_align(evalue=.01)
    parser.add_option(
        "--best", default=1, type="int",
        help="Only look for best N hits [default: %default]",
    )
    parser.set_cpus()
    parser.add_option(
        "--nprocs", default=1, type="int",
        help="number of BLAST processes to run in parallel. "
             "split query.fa into `nprocs` chunks, "
             "each chunk uses -num_threads=`cpus`",
    )
    parser.set_params()
    parser.set_outfile()

    opts, args = parser.parse_args()

    if len(args) != 2 or opts.blast_program is None:
        sys.exit(not parser.print_help())

    # Positional order is database first, query second.
    bfasta_fn, afasta_fn = args
    for fn in (afasta_fn, bfasta_fn):
        assert op.exists(fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    # --path may name either the executable itself or the folder holding it;
    # in the latter case the program name is appended.
    blast_program = opts.blast_program
    blast_bin = opts.blast_path or blast_program
    if op.basename(blast_bin) != blast_program:
        blast_bin = op.join(blast_bin, blast_program)

    nprocs = opts.nprocs
    cpus = opts.cpus
    if nprocs > 1:
        logging.debug("Dispatch job to %d processes", nprocs)
        outdir = "outdir"
        fs = split([afasta_fn, outdir, str(nprocs)])
        queries = fs.names
    else:
        queries = [afasta_fn]

    if op.basename(blast_bin) in ("blastp", "blastx"):
        dbtype = "prot"
    else:
        dbtype = "nucl"

    # Pick the index file passed to run_formatdb as `outfile`: ".pin" for
    # protein; for nucleotide, ".00.nin" when it exists on disk, else ".nin".
    db = bfasta_fn
    if dbtype == "prot":
        nin = db + ".pin"
    else:
        nin00 = db + ".00.nin"
        nin = nin00 if op.exists(nin00) else (db + ".nin")

    run_formatdb(infile=db, outfile=nin, dbtype=dbtype)

    # A single lock is shared by all workers, which also share `out_fh`.
    lock = Lock()

    cmd_parts = [
        "{0} -db {1} -outfmt {2}".format(blast_bin, bfasta_fn, opts.format),
        " -evalue {0} -max_target_seqs {1}".format(opts.evalue, opts.best),
        " -num_threads {0}".format(cpus),
    ]
    extra = opts.extra
    if extra:
        cmd_parts.append(" " + extra.strip())
    blast_cmd = "".join(cmd_parts)

    # One (handle, command, query, lock) tuple per query chunk.
    job_args = []
    for query in queries:
        job_args.append((out_fh, blast_cmd, query, lock))

    runner = Jobs(target=blastplus, args=job_args)
    runner.run()