def blat(args):
    """
    %prog blat old.fasta new.fasta

    Generate psl file using blat.
    """
    p = OptionParser(blat.__doc__)
    p.add_option("--minscore", default=100, type="int",
                 help="Matches minus mismatches gap penalty [default: %default]")
    p.add_option("--minid", default=98, type="int",
                 help="Minimum sequence identity [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    oldfasta, newfasta = args
    # blat runs faster against 2bit-compressed references; convert both inputs
    twobitfiles = []
    for fastafile in args:
        tbfile = faToTwoBit(fastafile)
        twobitfiles.append(tbfile)
    oldtwobit, newtwobit = twobitfiles
    # NOTE(review): newtwobit is never used below — the query side is the raw
    # FASTA (newfasta). Confirm this is intentional.

    # Prefer multi-threaded pblat when installed; fall back to single-threaded blat
    cmd = "pblat -threads={0}".format(opts.cpus) if which("pblat") else "blat"
    cmd += " {0} {1}".format(oldtwobit, newfasta)
    cmd += " -tileSize=12 -minScore={0} -minIdentity={1} ".\
            format(opts.minscore, opts.minid)
    # Output name: <new-prefix>.<old-prefix>.psl (prefix = basename up to first dot)
    pslfile = "{0}.{1}.psl".format(*(op.basename(x).split('.')[0] \
                for x in (newfasta, oldfasta)))
    cmd += pslfile
    sh(cmd)
def get_cookies(cookies=PHYTOZOME_COOKIES):
    """Create (or reuse) a JGI/Phytozome signon cookies file via curl.

    Returns the cookies file path on success, or None when curl is missing
    or the signon request failed to produce the cookies file.
    """
    from jcvi.utils.console import console

    # Check if cookies is still good (written less than an hour ago)
    if op.exists(cookies) and last_updated(cookies) < 3600:
        return cookies

    # NOTE(review): the credential-prompt block was redacted ("******") in the
    # source; reconstructed from surrounding code — confirm against upstream.
    if not console.is_terminal:
        logging.error("No terminal attached; cannot prompt for Phytozome login. Aborting.")
        return None
    username = console.input("[bold green]Phytozome Login: ")
    pw = console.input("[bold green]Phytozome Password: ", password=True)

    curlcmd = which("curl")
    if curlcmd is None:
        logging.error("curl command not installed. Aborting.")
        return None

    # POST credentials to the JGI signon endpoint; -b/-c read & write cookies
    cmd = "{} https://signon.jgi.doe.gov/signon/create".format(curlcmd)
    cmd += " --data-urlencode 'login={0}' --data-urlencode 'password={1}' -b {2} -c {2}".format(
        username, pw, cookies
    )
    sh(cmd, outfile="/dev/null", errfile="/dev/null", log=False)

    if not op.exists(cookies):
        logging.error("Cookies file `{}` not created. Aborting.".format(cookies))
        return None

    return cookies
def test_get_cookies(mock_username, mock_password):
    """get_cookies() should yield the cookies path when curl exists, else None."""
    from jcvi.apps.fetch import get_cookies, PHYTOZOME_COOKIES
    from jcvi.apps.base import remove_if_exists, which

    # Start from a clean slate so a cached cookies file cannot short-circuit
    remove_if_exists(PHYTOZOME_COOKIES)

    expected = PHYTOZOME_COOKIES if which("curl") else None
    # Without curl, get_cookies errors out ("curl not found") and returns None
    assert get_cookies() == expected
def run_blat(infile=None, outfile=None, db="UniVec_Core", pctid=95, hitlen=50,
             cpus=16, overwrite=True):
    """Align `infile` against `db` with (p)blat in blast8 format, then filter
    the hits by percent identity and alignment length.

    When `overwrite` is True the filtered file replaces the raw blat output.
    """
    # Multi-threaded pblat when available, otherwise plain blat
    aligner = "pblat -threads={0}".format(cpus) if which("pblat") else "blat"
    sh(aligner + " {0} {1} -out=blast8 {2}".format(db, infile, outfile))

    blatfile = outfile
    # Filtered output is tagged with the thresholds, e.g. ".P95L50"
    filtered_blatfile = outfile + ".P{0}L{1}".format(pctid, hitlen)
    run_blast_filter(infile=blatfile, outfile=filtered_blatfile,
                     pctid=pctid, hitlen=hitlen)
    if overwrite:
        shutil.move(filtered_blatfile, blatfile)
def run_concorde(self, tspfile, seed=666):
    """Invoke the `concorde` TSP solver on `tspfile`.

    Returns (retcode, solution_file).
    """
    solfile = op.join(self.work_dir, "data.sol")
    # Drop any stale solution so a failed run cannot be mistaken for success
    if op.exists(solfile):
        os.remove(solfile)

    cc = "concorde"
    assert which(cc), ("You must install `concorde` on your PATH"
                       " [http://www.math.uwaterloo.ca/tsp/concorde.html]")

    sink = None if self.verbose else "/dev/null"
    retcode = sh("{0} -s {1} -x -o {2} {3}".format(cc, seed, solfile, tspfile),
                 outfile=sink, errfile=sink)
    return retcode, solfile
def run_blat(infile=None, outfile=None, db="UniVec_Core", pctid=95, hitlen=50,
             cpus=16, overwrite=True):
    """Run (p)blat of `infile` against `db`, then filter hits by pctid/hitlen."""
    # Use multi-threaded pblat when available, otherwise plain blat
    cmd = "pblat -threads={0}".format(cpus) if which("pblat") else "blat"
    cmd += ' {0} {1} -out=blast8 {2}'.format(db, infile, outfile)
    sh(cmd)

    blatfile = outfile
    # Filtered output is tagged with the thresholds, e.g. `.P95L50`
    filtered_blatfile = outfile + ".P{0}L{1}".format(pctid, hitlen)
    run_blast_filter(infile=blatfile, outfile=filtered_blatfile,
                     pctid=pctid, hitlen=hitlen)
    if overwrite:
        # Replace the raw blat output with the filtered version
        shutil.move(filtered_blatfile, blatfile)
def run_concorde(self, tspfile, seed=666):
    """Run the `concorde` TSP solver on `tspfile`.

    Returns (retcode, outfile) where outfile is the solution file path.
    """
    outfile = op.join(self.work_dir, "data.sol")
    # Remove a stale solution so a failed run is not mistaken for success
    if op.exists(outfile):
        os.remove(outfile)

    cc = "concorde"
    assert which(cc), ("You must install `concorde` on your PATH" +
                       " [http://www.math.uwaterloo.ca/tsp/concorde.html]")
    cmd = "{0} -s {1} -x -o {2} {3}".format(cc, seed, outfile, tspfile)

    # Silence solver chatter unless self.verbose is set
    outf = None if self.verbose else "/dev/null"
    retcode = sh(cmd, outfile=outf, errfile=outf)
    return retcode, outfile
def getpath(cmd, name=None, url=None, cfg="~/.jcvirc"):
    """
    Get install locations of common binaries
    First, check ~/.jcvirc file to get the full path
    If not present, ask on the console and and store
    """
    p = which(cmd)  # if in PATH, just returns it
    if p:
        return p

    PATH = "Path"
    config = ConfigParser.RawConfigParser()
    cfg = op.expanduser(cfg)
    changed = False

    if op.exists(cfg):
        config.read(cfg)

    assert name is not None, "Need a program name"

    try:
        fullpath = config.get(PATH, name)
    except ConfigParser.NoSectionError:
        # First time this config is used: create the [Path] section
        config.add_section(PATH)
        changed = True
    except ConfigParser.NoOptionError:
        # Fix: was a bare `except: pass` that silently swallowed everything;
        # a missing option is the only other expected outcome here
        pass

    try:
        fullpath = config.get(PATH, name)
    except ConfigParser.NoOptionError:
        # Fix: .format() was passed a stray second argument (cfg) with only
        # one placeholder in the template
        msg = "=== Configure path for {0} ===\n".format(name)
        if url:
            msg += "URL: {0}\n".format(url)
        msg += "[Directory that contains `{0}`]: ".format(cmd)
        fullpath = raw_input(msg).strip()
        config.set(PATH, name, fullpath)
        changed = True

    path = op.join(op.expanduser(fullpath), cmd)
    assert is_exe(path), \
        "Cannot execute binary `{0}`. Please verify and rerun.".format(path)

    if changed:
        # Fix: file handle was previously never closed
        with open(cfg, "w") as configfile:
            config.write(configfile)
        logging.debug("Configuration written to `{0}`.".format(cfg))

    return path
def fromsra(args):
    """
    %prog fromsra srafile

    Convert sra file to fastq using the sratoolkit `fastq-dump`
    """
    p = OptionParser(fromsra.__doc__)
    p.add_option(
        "--paired",
        default=False,
        action="store_true",
        help="Specify if library layout is paired-end",
    )
    p.add_option(
        "--compress",
        default=None,
        choices=["gzip", "bzip2"],
        help="Compress output fastq files",
    )
    p.set_outdir()
    p.set_grid()

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    (srafile, ) = args
    paired = opts.paired
    compress = opts.compress
    outdir = opts.outdir

    script_path = which("fastq-dump")
    if not script_path:
        logging.error("Cannot find `fastq-dump` in the PATH")
        sys.exit()

    # Assemble the fastq-dump command as a list of flags
    cmd = [script_path]
    if compress:
        cmd.append("--{0}".format(compress))
    if paired:
        # Emit mates into separate _1/_2 fastq files
        cmd.append("--split-files")
    if outdir:
        cmd.append("--outdir {0}".format(outdir))
    cmd.append(srafile)

    outcmd = " ".join(cmd)
    sh(outcmd, grid=opts.grid)
def __init__(self, filename, select=None): assert op.exists(filename), "File `{0}` not found".format(filename) # filename can be both .sizes file or FASTA formatted file sizesname = filename if not filename.endswith(".sizes"): sizesname = filename + ".sizes" filename = get_abs_path(filename) if need_update(filename, sizesname): cmd = "faSize" if which(cmd): cmd += " -detailed {0}".format(filename) sh(cmd, outfile=sizesname) else: from jcvi.formats.fasta import Fasta f = Fasta(filename) fw = open(sizesname, "w") for k, size in f.itersizes_ordered(): print >> fw, "\t".join((k, str(size))) fw.close() filename = sizesname assert filename.endswith(".sizes") super(Sizes, self).__init__(filename) self.fp = open(filename) self.filename = filename # get sizes for individual contigs, both in list and dict # this is to preserve the input order in the sizes file sizes = list(self.iter_sizes()) if select: assert select > 0 sizes = [x for x in sizes if x[1] >= select] self.sizes_mapping = dict(sizes) # get cumulative sizes, both in list and dict ctgs, sizes = zip(*sizes) self.sizes = sizes cumsizes = np.cumsum([0] + list(sizes)) self.ctgs = ctgs self.cumsizes = cumsizes self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
def __init__(self, filename, select=None): assert op.exists(filename), "File `{0}` not found".format(filename) # filename can be both .sizes file or FASTA formatted file sizesname = filename if not filename.endswith(".sizes"): sizesname = filename + ".sizes" filename = get_abs_path(filename) if need_update(filename, sizesname): cmd = "faSize" if which(cmd): cmd += " -detailed {0}".format(filename) sh(cmd, outfile=sizesname) else: from jcvi.formats.fasta import Fasta f = Fasta(filename) fw = open(sizesname, "w") for k, size in f.itersizes_ordered(): print("\t".join((k, str(size))), file=fw) fw.close() filename = sizesname assert filename.endswith(".sizes") super(Sizes, self).__init__(filename) self.fp = open(filename) self.filename = filename # get sizes for individual contigs, both in list and dict # this is to preserve the input order in the sizes file sizes = list(self.iter_sizes()) if select: assert select > 0 sizes = [x for x in sizes if x[1] >= select] self.sizes_mapping = dict(sizes) # get cumulative sizes, both in list and dict ctgs, sizes = zip(*sizes) self.sizes = sizes cumsizes = np.cumsum([0] + list(sizes)) self.ctgs = ctgs self.cumsizes = cumsizes self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
def fromsra(args):
    """
    %prog fromsra srafile

    Convert sra file to fastq using the sratoolkit `fastq-dump`
    """
    p = OptionParser(fromsra.__doc__)
    p.add_option(
        "--paired",
        default=False,
        action="store_true",
        help="Specify if library layout is paired-end " + "[default: %default]",
    )
    p.add_option(
        "--compress",
        default=None,
        choices=["gzip", "bzip2"],
        help="Compress output fastq files [default: %default]"
    )
    p.set_outdir()
    p.set_grid()

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    srafile, = args
    paired = opts.paired
    compress = opts.compress
    outdir = opts.outdir

    script_path = which("fastq-dump")
    if not script_path:
        logging.error("Cannot find `fastq-dump` in the PATH")
        sys.exit()

    # Assemble the fastq-dump command as a list of flags
    cmd = [script_path]
    if compress:
        cmd.append("--{0}".format(compress))
    if paired:
        # --split-3 puts mates in _1/_2 files and orphan reads in a third file
        cmd.append("--split-3")
    if outdir:
        cmd.append("--outdir {0}".format(outdir))
    cmd.append(srafile)

    outcmd = " ".join(cmd)
    sh(outcmd, grid=opts.grid)
def get_cookies(cookies=PHYTOZOME_COOKIES):
    """Create (or reuse) a JGI/Phytozome signon cookies file via curl.

    Returns the cookies file path on success, or None when curl is missing
    or the signon request failed to produce the cookies file.
    """
    from getpass import getpass

    # Check if cookies is still good (written less than an hour ago)
    if op.exists(cookies) and last_updated(cookies) < 3600:
        return cookies

    # NOTE(review): the credential-prompt lines were redacted ("******") in
    # the source; reconstructed from surrounding code — confirm against upstream.
    username = input("Phytozome Login: ")
    pw = getpass("Phytozome Password: ")

    curlcmd = which("curl")
    if curlcmd is None:
        print("curl command not installed. Aborting.", file=sys.stderr)
        return None

    # POST credentials to the JGI signon endpoint; -b/-c read & write cookies
    cmd = "{} https://signon.jgi.doe.gov/signon/create".format(curlcmd)
    cmd += " --data-urlencode 'login={0}' --data-urlencode 'password={1}' -b {2} -c {2}".format(
        username, pw, cookies)
    sh(cmd, outfile="/dev/null", errfile="/dev/null", log=False)

    if not op.exists(cookies):
        print("Cookies file `{}` not created. Aborting.".format(cookies),
              file=sys.stderr)
        return None

    return cookies
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta", default=False, action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual", default=False, action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    cpus = opts.cpus
    cmd = op.join(opts.pbjelly_home, "setup.sh")
    setup = "source {0}".format(cmd)
    # Source the PBJelly environment only if its scripts are not yet on PATH
    if not which("fakeQuals.py"):
        sh(setup)

    pf = ref.rsplit(".", 1)[0]
    pr, px = reads.rsplit(".", 1)
    # Remove description line
    if opts.cleanfasta:
        oref = pf + ".f.fasta"
        oreads = pr + ".f.fasta"
        format([ref, oref])
        format([reads, oreads])
        ref, reads = oref, oreads

    # Check if the FASTA has qual
    ref, refq = fake_quals(ref)
    convert_reads = not px in ("fq", "fastq", "txt")
    if convert_reads:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))
    else:
        readsfiles = reads

    # Make directory structure expected by PBJelly
    dref, dreads = "data/reference", "data/reads"
    cwd = os.getcwd()
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    if not op.exists(reference):
        sh("mkdir -p {0}".format(dref))
        sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    if not op.exists(reads):
        sh("mkdir -p {0}".format(dreads))
        sh("cp {0} {1}/".format(readsfiles, dreads))

    outputDir = cwd
    p = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    p.write_xml()

    # Build the pipeline: each PBJelly stage as a line of run.sh
    runsh = [setup]
    for action in "setup|mapping|support|extraction".split("|"):
        runsh.append("Jelly.py {0} Protocol.xml".format(action))
    runsh.append(
        'Jelly.py assembly Protocol.xml -x "--nproc={0}"'.format(cpus))
    runsh.append("Jelly.py output Protocol.xml")

    runfile = "run.sh"
    contents = "\n".join(runsh)
    write_file(runfile, contents)
def compare(args):
    """
    %prog compare pasa_db_name genome.fasta transcripts.fasta [annotation.gff]

    Run the PASA annotation comparison pipeline

    If annotation.gff file is provided, the PASA database is loaded with the annotations
    first before starting annotation comparison. Otherwise, it uses previously
    loaded annotation data.

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    p = OptionParser(compare.__doc__)
    p.set_pasa_opts(action="compare")
    p.add_option("--prepare", default=False, action="store_true",
                 help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, transcripts, = args[:3]
    annotation = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    launch_pasa = which(op.join(PASA_HOME, "scripts", \
            "Launch_PASA_pipeline.pl"))

    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"

    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "")  # initialize run script

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    # Write the annotCompare config from the CLI thresholds
    acfw = must_open(acconf, "w")
    print >> acfw, annotCompare_conf.format("{0}_pasa".format(pasa_db), \
            opts.pctovl, opts.pct_coding, opts.pctid_prot, opts.pctlen_FL, \
            opts.pctlen_nonFL, opts.orf_size, opts.pct_aln, opts.pctovl_gene, \
            opts.stompovl, opts.trust_FL, opts.utr_exons)
    acfw.close()

    # Prefer seqclean-ed transcripts when available
    if op.exists("{0}.clean".format(transcripts)):
        transcripts = "{0}.clean".format(transcripts)

    accmd = "{0} -c {1} -A -g {2} -t {3} --GENETIC_CODE {4}".format(launch_pasa, \
            acconf, genome, transcripts, opts.genetic_code)
    if annotation:
        accmd += " -L --annots_gff3 {0}".format(annotation)
    if prepare:
        write_file(runfile, accmd, append=True)
    else:
        sh(accmd, grid=grid, grid_opts=opts)
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta", default=False, action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual", default=False, action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    cmd = op.join(opts.pbjelly_home, "setup.sh")
    # Fix: `setup` is used unconditionally when building run.sh below, but was
    # previously assigned only inside this branch — raising NameError whenever
    # fakeQuals.py was already on the PATH.
    setup = "source {0}".format(cmd)
    if not which("fakeQuals.py"):
        sh(setup)

    # Check environment: PBJelly requires networkx 1.1 and argparse
    try:
        import networkx
        version = networkx.version
    except (ImportError, AttributeError):
        # Fix: was a bare `except:`; only a missing module/attribute is expected
        logging.error("You need networkx==1.1 to run PBJELLY")
        return

    try:
        import argparse
    except ImportError:
        logging.error("You need Python2.7 or at least argparse lib")
        return

    pf = ref.rsplit(".", 1)[0]
    pr, px = reads.rsplit(".", 1)
    # Remove description line
    if opts.cleanfasta:
        oref = pf + ".f.fasta"
        oreads = pr + ".f.fasta"
        format([ref, oref])
        format([reads, oreads])
        ref, reads = oref, oreads

    # Check if the FASTA has qual
    ref, refq = fake_quals(ref)
    convert_reads = not px in ("fq", "fastq", "txt")
    if convert_reads:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))
    else:
        readsfiles = reads

    # Make directory structure expected by PBJelly
    dref, dreads = "data/reference", "data/reads"
    sh("mkdir -p {0}".format(dref))
    sh("mkdir -p {0}".format(dreads))
    sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    sh("cp {0} {1}/".format(readsfiles, dreads))

    cwd = os.getcwd()
    outputDir = cwd
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    p = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    p.write_xml()

    # Build the pipeline: one shell line per PBJelly stage
    runsh = [setup]
    for action in "setup|mapping|support|extraction".split("|"):
        runsh.append("Jelly.py {0} Protocol.xml".format(action))
    runsh.append("Jelly.py assembly Protocol.xml")
    runsh.append("cp assembly/assembly_chunk0.sh commands.list")
    runsh.append("parallel < commands.list")
    runsh.append("Jelly.py output Protocol.xml")

    runfile = "run.sh"
    contents = "\n".join(runsh)
    write_file(runfile, contents)
def is_usetex():
    """Check if latex command is available
    """
    # Both the latex binary and lp must be present on PATH
    return all(bool(which(prog)) for prog in ("latex", "lp"))
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta", default=False, action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual", default=False, action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    cpus = opts.cpus
    cmd = op.join(opts.pbjelly_home, "setup.sh")
    setup = "source {0}".format(cmd)
    # Source the PBJelly environment only when its scripts are not on PATH yet
    if not which("fakeQuals.py"):
        sh(setup)

    pf = ref.rsplit(".", 1)[0]
    pr, px = reads.rsplit(".", 1)
    # Remove description line
    if opts.cleanfasta:
        oref = pf + ".f.fasta"
        oreads = pr + ".f.fasta"
        format([ref, oref])
        format([reads, oreads])
        ref, reads = oref, oreads

    # Check if the FASTA has qual
    ref, refq = fake_quals(ref)
    convert_reads = not px in ("fq", "fastq", "txt")
    if convert_reads:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))
    else:
        readsfiles = reads

    # Make directory structure expected by PBJelly
    dref, dreads = "data/reference", "data/reads"
    cwd = os.getcwd()
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    if not op.exists(reference):
        sh("mkdir -p {0}".format(dref))
        sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    if not op.exists(reads):
        sh("mkdir -p {0}".format(dreads))
        sh("cp {0} {1}/".format(readsfiles, dreads))

    outputDir = cwd
    p = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    p.write_xml()

    # Build the pipeline: one shell line per PBJelly stage
    runsh = [setup]
    for action in "setup|mapping|support|extraction".split("|"):
        runsh.append("Jelly.py {0} Protocol.xml".format(action))
    runsh.append('Jelly.py assembly Protocol.xml -x "--nproc={0}"'.format(cpus))
    runsh.append("Jelly.py output Protocol.xml")

    runfile = "run.sh"
    contents = "\n".join(runsh)
    write_file(runfile, contents)
def assemble(args):
    """
    %prog assemble pasa_db_name genome.fasta transcripts-dn.fasta [transcript-gg.fasta]

    Run the PASA alignment assembly pipeline

    If two transcript fasta files (Trinity denovo and genome guided) are provided
    and the `--compreh` param is enabled, the PASA Comprehensive Transcriptome DB
    protocol is followed <http://pasa.sourceforge.net/#A_ComprehensiveTranscriptome>

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    p = OptionParser(assemble.__doc__)
    p.set_pasa_opts()
    p.add_option("--prepare", default=False, action="store_true",
                 help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, dnfasta, = args[:3]
    ggfasta = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    # Validate the requested splice aligners against the allowed set
    aligners = opts.aligners.split(",")
    for aligner in aligners:
        if aligner not in ALLOWED_ALIGNERS:
            logging.error("Error: Unknown aligner `{0}`".format(aligner))
            logging.error("Can be any of {0}, ".format("|".join(ALLOWED_ALIGNERS)) + \
                    "combine multiple aligners in list separated by comma")
            sys.exit()

    clean = opts.clean
    seqclean = op.join(opts.tgi_home, "seqclean")

    # Locate PASA helper scripts
    accn_extract = which(op.join(PASA_HOME, "misc_utilities", \
            "accession_extractor.pl"))
    launch_pasa = which(op.join(PASA_HOME, "scripts", \
            "Launch_PASA_pipeline.pl"))
    build_compreh_trans = which(op.join(PASA_HOME, "scripts", \
            "build_comprehensive_transcriptome.dbi"))

    fl_accs = opts.fl_accs
    cpus = opts.cpus
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"
    pctcov, pctid = opts.pctcov, opts.pctid
    compreh_pctid = opts.compreh_pctid
    compreh_pctcov, bpsplice = opts.compreh_pctcov, opts.bpsplice

    cmds = []

    # set PASAHOME env variable if preparing shell script
    if prepare:
        env_cmd = 'export PASAHOME="{0}"'.format(PASA_HOME)
        cmds.append(env_cmd)

    if ggfasta:
        # Comprehensive protocol: merge denovo + genome-guided transcripts and
        # record the denovo accessions (TDN)
        transcripts = FileMerger([dnfasta, ggfasta], tfasta).merge()
        accn_extract_cmd = "cat {0} | {1} > {2}".format(dnfasta, accn_extract, tdn)
        cmds.append(accn_extract_cmd)
        if not prepare:
            sh(accn_extract_cmd)
    else:
        symlink(dnfasta, tfasta)
        transcripts = tfasta

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    prjobid = None
    if clean:
        # seqclean caps at 16 CPUs
        ccpus = 16 if cpus >= 16 else cpus
        cleancmd = "{0} {1} -c {2} -l 60".format(seqclean, transcripts, ccpus)
        if prepare:
            cmds.append(cleancmd)
        else:
            prjobid = sh(cleancmd, grid=grid, grid_opts=opts)

    # Write the alignAssembly config from the CLI thresholds
    aafw = must_open(aaconf, "w")
    print(alignAssembly_conf.format("{0}_pasa".format(pasa_db), \
            pctcov, pctid, bpsplice), file=aafw)
    aafw.close()

    symlink(genome, gfasta)

    aacmd = "{0} -c {1} -C -R -g {2}".format(launch_pasa, aaconf, gfasta)
    aacmd += " -t {0}.clean -T -u {0}".format(transcripts) if clean else \
            " -t {0}".format(transcripts)
    if fl_accs:
        symlink(fl_accs, flaccs)
        aacmd += " -f {0}".format(flaccs)
    if ggfasta:
        aacmd += " --TDN {0}".format(tdn)
    aacmd += " --ALIGNERS {0} -I {1} --CPU {2}".format(",".join(aligners), \
            opts.intron, cpus)

    if prepare:
        cmds.append(aacmd)
    else:
        # Chain grid jobs: alignment assembly waits on seqclean
        opts.hold_jid = prjobid
        prjobid = sh(aacmd, grid=grid, grid_opts=opts)

    if opts.compreh and ggfasta:
        comprehcmd = "{0} -c {1} -t {2}".format(build_compreh_trans, aaconf, transcripts)
        comprehcmd += " --min_per_ID {0} --min_per_aligned {1}".format(compreh_pctid, compreh_pctcov)

        if prepare:
            cmds.append(comprehcmd)
        else:
            opts.hold_jid = prjobid
            prjobid = sh(comprehcmd, grid=grid, grid_opts=opts)

    if prepare:
        write_file(runfile, "\n".join(cmds))  # initialize run script
def compare(args):
    """
    %prog compare pasa_db_name [--annots_gff3=annotation.gff3]

    Run the PASA annotation comparison pipeline

    This assumes that PASA alignment assembly has alredy been completed and
    run directory contains `genome.fasta` and `transcript.fasta` files.

    If `--annots_gff3` is specified, the PASA database is loaded with the annotations
    first before starting annotation comparison. Otherwise, it uses previously
    loaded annotation data.

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    p = OptionParser(compare.__doc__)
    p.set_pasa_opts(action="compare")
    p.add_option("--prepare", default=False, action="store_true",
                 help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    pasa_db, = args

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    launch_pasa = which(op.join(PASA_HOME, "scripts", \
            "Launch_PASA_pipeline.pl"))

    annots_gff3 = opts.annots_gff3
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"

    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "", append=True, skipcheck=True)  # initialize run script

    # Write the annotCompare config from the CLI thresholds
    acfw = must_open(acconf, "w")
    print(annotCompare_conf.format("{0}_pasa".format(pasa_db), \
            opts.pctovl, opts.pct_coding, opts.pctid_prot, opts.pctlen_FL, \
            opts.pctlen_nonFL, opts.orf_size, opts.pct_aln, opts.pctovl_gene, \
            opts.stompovl, opts.trust_FL, opts.utr_exons), file=acfw)
    acfw.close()

    # Inputs from the earlier alignment-assembly step must already exist here
    if not op.exists(gfasta):
        sys.exit("Genome fasta file `{0}` does not exist".format(gfasta))

    transcripts = tfasta
    if not op.exists(transcripts):
        sys.exit("Transcript fasta file `{0}` does not exist".format(transcripts))

    # Prefer seqclean-ed transcripts when available
    if op.exists("{0}.clean".format(transcripts)):
        transcripts = "{0}.clean".format(transcripts)

    accmd = "{0} -c {1} -A -g {2} -t {3} --GENETIC_CODE {4}".format(launch_pasa, \
            acconf, gfasta, transcripts, opts.genetic_code)

    if annots_gff3:
        if not op.exists(annots_gff3):
            sys.exit("Annotation gff3 file `{0}` does not exist".format(annots_gff3))
        symlink(annots_gff3, annotation)
        accmd += " -L --annots_gff3 {0}".format(annotation)

    if prepare:
        write_file(runfile, accmd, append=True)
    else:
        sh(accmd, grid=grid, grid_opts=opts)
def compare(args):
    """
    %prog compare pasa_db_name genome.fasta transcripts.fasta [annotation.gff]

    Run the PASA annotation comparison pipeline

    If annotation.gff file is provided, the PASA database is loaded with the annotations
    first before starting annotation comparison. Otherwise, it uses previously
    loaded annotation data.

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    p = OptionParser(compare.__doc__)
    p.set_pasa_opts(action="compare")
    p.add_option(
        "--prepare", default=False, action="store_true",
        help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, transcripts, = args[:3]
    annotation = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error(
            "PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    launch_pasa = which(op.join(PASA_HOME, "scripts", \
            "Launch_PASA_pipeline.pl"))

    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"

    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "")  # initialize run script

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    # Write the annotCompare config from the CLI thresholds
    acfw = must_open(acconf, "w")
    print >> acfw, annotCompare_conf.format("{0}_pasa".format(pasa_db), \
            opts.pctovl, opts.pct_coding, opts.pctid_prot, opts.pctlen_FL, \
            opts.pctlen_nonFL, opts.orf_size, opts.pct_aln, opts.pctovl_gene, \
            opts.stompovl, opts.trust_FL, opts.utr_exons)
    acfw.close()

    # Prefer seqclean-ed transcripts when available
    if op.exists("{0}.clean".format(transcripts)):
        transcripts = "{0}.clean".format(transcripts)

    accmd = "{0} -c {1} -A -g {2} -t {3} --GENETIC_CODE {4}".format(launch_pasa, \
            acconf, genome, transcripts, opts.genetic_code)
    if annotation:
        accmd += " -L --annots_gff3 {0}".format(annotation)
    if prepare:
        write_file(runfile, accmd, append=True)
    else:
        sh(accmd, grid=grid, grid_opts=opts)
def assemble(args):
    """
    %prog assemble pasa_db_name genome.fasta transcripts-dn.fasta [transcript-gg.fasta]

    Run the PASA alignment assembly pipeline

    If two transcript fasta files (Trinity denovo and genome guided) are provided,
    the PASA Comprehensive Transcriptome protocol is followed
    <http://pasa.sourceforge.net/#A_ComprehensiveTranscriptome>

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    p = OptionParser(assemble.__doc__)
    p.set_home("pasa")
    p.set_align(pctid=95, pctcov=90, intron=2000, bpsplice=3, compreh_pctcov=30)
    p.add_option("--aligners", default="blat,gmap",
                 help="Specify splice aligners to use for mapping [default: %default]")
    p.add_option("--clean", default=False, action="store_true",
                 help="Clean transcripts using tgi seqclean [default: %default]")
    p.set_cpus()
    p.set_grid()
    p.set_grid_opts()
    p.add_option("--prepare", default=False, action="store_true",
                 help="Prepare PASA run script with commands [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, dnfasta, = args[:3]
    ggfasta = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    # Validate the requested splice aligners against the allowed set
    aligners = opts.aligners.split(",")
    for aligner in aligners:
        if aligner not in ALLOWED_ALIGNERS:
            logging.error("Error: Unknown aligner `{0}`".format(aligner))
            logging.error("Can be any of {0}, ".format("|".join(ALLOWED_ALIGNERS)) +
                          "combine multiple aligners in list separated by comma")
            sys.exit()

    clean = opts.clean
    seqclean = which("seqclean")
    if clean and not seqclean:
        logging.error("Cannot find tgi seqclean in PATH")
        sys.exit()

    # Locate PASA helper scripts
    accn_extract = which(op.join(PASA_HOME, "misc_utilities", "accession_extractor.pl"))
    launch_pasa = which(op.join(PASA_HOME, "scripts", "Launch_PASA_pipeline.pl"))
    build_compreh_trans = which(op.join(PASA_HOME, "scripts", "build_comprehensive_transcriptome.dbi"))

    cpus = opts.cpus
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"
    pctcov, pctid = opts.pctcov, opts.pctid
    compreh_pctcov, bpsplice = opts.compreh_pctcov, opts.bpsplice

    mkdir(pasa_db)
    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "")  # initialize run script

    if ggfasta:
        # Comprehensive protocol: merge denovo + genome-guided transcripts and
        # record the denovo accessions (TDN)
        transcripts = FileMerger([dnfasta, ggfasta], tfasta).merge()
        accn_extract_cmd = "cat {0} | {1} > {2}".format(dnfasta, accn_extract, tdn)
        if prepare:
            write_file(runfile, accn_extract_cmd, append=True)
        else:
            sh(accn_extract_cmd)
    else:
        transcripts = dnfasta

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    prjobid = None
    if clean:
        cleancmd = "{0} {1} -c {2} -l 60".format(seqclean, transcripts, cpus)
        if prepare:
            write_file(runfile, cleancmd, append=True)
        else:
            prjobid = sh(cleancmd, grid=grid, grid_opts=opts)

    # Write the alignAssembly config from the CLI thresholds
    aafw = must_open(aaconf, "w")
    print >> aafw, alignAssembly_conf.format("{0}_pasa".format(pasa_db), pctcov, pctid, bpsplice)
    aafw.close()

    aacmd = "{0} -c {1} -C -R -g {2}".format(launch_pasa, aaconf, genome)
    aacmd += " -t {0}.clean -T -u {0} ".format(transcripts) if clean else \
            " -t {0} ".format(transcripts)
    if ggfasta:
        aacmd += " --TDN {0} ".format(tdn)
    aacmd += " --ALIGNERS {0} -I {1}".format(",".join(aligners), opts.intron)
    if prepare:
        write_file(runfile, aacmd, append=True)
    else:
        # Chain grid jobs: alignment assembly waits on seqclean
        opts.hold_jid = prjobid
        prjobid = sh(aacmd, grid=grid, grid_opts=opts)

    if ggfasta:
        comprehcmd = "{0} -c {1} -t {2}".format(build_compreh_trans, aaconf, transcripts)
        # Fix: a leading space was missing before --min_per_ID, fusing the flag
        # to the transcripts filename and producing a malformed command
        comprehcmd += " --min_per_ID {0} --min_per_aligned {1}".format(pctid, pctcov)
        if prepare:
            write_file(runfile, comprehcmd, append=True)
        else:
            opts.hold_jid = prjobid
            prjobid = sh(comprehcmd, grid=grid, grid_opts=opts)
def patch(args):
    """
    %prog patch reference.fasta reads.fasta

    Run PBJelly with reference and reads.
    """
    from jcvi.formats.base import write_file
    from jcvi.formats.fasta import format

    p = OptionParser(patch.__doc__)
    p.add_option("--cleanfasta", default=False, action="store_true",
                 help="Clean FASTA to remove description [default: %default]")
    p.add_option("--highqual", default=False, action="store_true",
                 help="Reads are of high quality [default: %default]")
    p.set_home("pbjelly")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, reads = args
    cmd = op.join(opts.pbjelly_home, "setup.sh")
    # Fix: `setup` is used unconditionally when building run.sh below, but was
    # previously assigned only inside this branch — raising NameError whenever
    # fakeQuals.py was already on the PATH.
    setup = "source {0}".format(cmd)
    if not which("fakeQuals.py"):
        sh(setup)

    # Check environment: PBJelly requires networkx 1.1 and argparse
    try:
        import networkx
        version = networkx.version
    except (ImportError, AttributeError):
        # Fix: was a bare `except:`; only a missing module/attribute is expected
        logging.error("You need networkx==1.1 to run PBJELLY")
        return

    try:
        import argparse
    except ImportError:
        logging.error("You need Python2.7 or at least argparse lib")
        return

    pf = ref.rsplit(".", 1)[0]
    pr, px = reads.rsplit(".", 1)
    # Remove description line
    if opts.cleanfasta:
        oref = pf + ".f.fasta"
        oreads = pr + ".f.fasta"
        format([ref, oref])
        format([reads, oreads])
        ref, reads = oref, oreads

    # Check if the FASTA has qual
    ref, refq = fake_quals(ref)
    convert_reads = not px in ("fq", "fastq", "txt")
    if convert_reads:
        reads, readsq = fake_quals(reads)
        readsfiles = " ".join((reads, readsq))
    else:
        readsfiles = reads

    # Make directory structure expected by PBJelly
    dref, dreads = "data/reference", "data/reads"
    sh("mkdir -p {0}".format(dref))
    sh("mkdir -p {0}".format(dreads))
    sh("cp {0} {1}/".format(" ".join((ref, refq)), dref))
    sh("cp {0} {1}/".format(readsfiles, dreads))

    cwd = os.getcwd()
    outputDir = cwd
    reference = op.join(cwd, "{0}/{1}".format(dref, ref))
    reads = op.join(cwd, "{0}/{1}".format(dreads, reads))
    p = Protocol(outputDir, reference, reads, highqual=opts.highqual)
    p.write_xml()

    # Build the pipeline: one shell line per PBJelly stage
    runsh = [setup]
    for action in "setup|mapping|support|extraction".split("|"):
        runsh.append("Jelly.py {0} Protocol.xml".format(action))
    runsh.append("Jelly.py assembly Protocol.xml")
    runsh.append("cp assembly/assembly_chunk0.sh commands.list")
    runsh.append("parallel < commands.list")
    runsh.append("Jelly.py output Protocol.xml")

    runfile = "run.sh"
    contents = "\n".join(runsh)
    write_file(runfile, contents, meta="run script")