def fromsra(args):
    """
    %prog fromsra srafile

    Convert sra file to fastq using the sratoolkit `fastq-dump`
    """
    # NOTE: the docstring above is handed to OptionParser and serves as the
    # command-line usage text.
    p = OptionParser(fromsra.__doc__)
    p.add_option(
        "--paired",
        default=False,
        action="store_true",
        help="Specify if library layout is paired-end",
    )
    p.add_option(
        "--compress",
        default=None,
        choices=["gzip", "bzip2"],
        help="Compress output fastq files",
    )
    p.set_outdir()
    p.set_grid()

    opts, args = p.parse_args(args)

    # Exactly one positional argument (the .sra file) is accepted.
    if len(args) != 1:
        sys.exit(not p.print_help())

    (srafile,) = args
    paired = opts.paired
    compress = opts.compress
    outdir = opts.outdir

    script_path = which("fastq-dump")
    if not script_path:
        logging.error("Cannot find `fastq-dump` in the PATH")
        # Exit with a non-zero status so that calling shells / schedulers
        # can tell that the conversion did not run.
        sys.exit(1)

    # Assemble the `fastq-dump` command line piece by piece.
    cmd = [script_path]
    if compress:
        # The option value doubles as the fastq-dump flag name
        # (--gzip / --bzip2).
        cmd.append("--{0}".format(compress))
    if paired:
        cmd.append("--split-files")
    if outdir:
        cmd.append("--outdir {0}".format(outdir))
    cmd.append(srafile)

    outcmd = " ".join(cmd)
    sh(outcmd, grid=opts.grid)
def fromsra(args):
    """
    %prog fromsra srafile

    Convert sra file to fastq using the sratoolkit `fastq-dump`
    """
    # NOTE: the docstring above is handed to OptionParser and serves as the
    # command-line usage text.
    p = OptionParser(fromsra.__doc__)
    p.add_option(
        "--paired",
        default=False,
        action="store_true",
        help="Specify if library layout is paired-end [default: %default]",
    )
    p.add_option(
        "--compress",
        default=None,
        choices=["gzip", "bzip2"],
        help="Compress output fastq files [default: %default]",
    )
    p.set_outdir()
    p.set_grid()

    opts, args = p.parse_args(args)

    # Exactly one positional argument (the .sra file) is accepted.
    if len(args) != 1:
        sys.exit(not p.print_help())

    (srafile,) = args
    paired = opts.paired
    compress = opts.compress
    outdir = opts.outdir

    script_path = which("fastq-dump")
    if not script_path:
        logging.error("Cannot find `fastq-dump` in the PATH")
        # Exit with a non-zero status so that calling shells / schedulers
        # can tell that the conversion did not run.
        sys.exit(1)

    # Assemble the `fastq-dump` command line piece by piece.
    cmd = [script_path]
    if compress:
        # The option value doubles as the fastq-dump flag name
        # (--gzip / --bzip2).
        cmd.append("--{0}".format(compress))
    if paired:
        cmd.append("--split-3")
    if outdir:
        cmd.append("--outdir {0}".format(outdir))
    cmd.append(srafile)

    outcmd = " ".join(cmd)
    sh(outcmd, grid=opts.grid)
def assemble(args):
    """
    %prog assemble pasa_db_name genome.fasta transcripts-dn.fasta [transcript-gg.fasta]

    Run the PASA alignment assembly pipeline

    If two transcript fasta files (Trinity denovo and genome guided) are provided
    and the `--compreh` param is enabled, the PASA Comprehensive Transcriptome DB
    protocol is followed <http://pasa.sourceforge.net/#A_ComprehensiveTranscriptome>

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    # NOTE: the docstring above is handed to OptionParser and serves as the
    # command-line usage text.
    p = OptionParser(assemble.__doc__)
    p.set_pasa_opts()
    p.add_option("--prepare", default=False, action="store_true",
                 help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, dnfasta = args[:3]
    # The genome-guided transcript file is optional (4th positional).
    ggfasta = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        # Non-zero status: this is a failure, not a normal termination.
        sys.exit(1)

    aligners = opts.aligners.split(",")
    for aligner in aligners:
        if aligner not in ALLOWED_ALIGNERS:
            logging.error("Error: Unknown aligner `{0}`".format(aligner))
            logging.error("Can be any of {0}, ".format("|".join(ALLOWED_ALIGNERS)) +
                          "combine multiple aligners in list separated by comma")
            sys.exit(1)

    clean = opts.clean
    seqclean = op.join(opts.tgi_home, "seqclean")

    accn_extract = which(op.join(PASA_HOME, "misc_utilities",
                                 "accession_extractor.pl"))
    launch_pasa = which(op.join(PASA_HOME, "scripts",
                                "Launch_PASA_pipeline.pl"))
    build_compreh_trans = which(op.join(PASA_HOME, "scripts",
                                        "build_comprehensive_transcriptome.dbi"))

    fl_accs = opts.fl_accs
    cpus = opts.cpus
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"
    pctcov, pctid = opts.pctcov, opts.pctid
    compreh_pctid = opts.compreh_pctid
    compreh_pctcov, bpsplice = opts.compreh_pctcov, opts.bpsplice

    # Commands are collected here and written to `runfile` when --prepare
    # is given.
    cmds = []

    # set PASAHOME env variable if preparing shell script
    if prepare:
        env_cmd = 'export PASAHOME="{0}"'.format(PASA_HOME)
        cmds.append(env_cmd)

    if ggfasta:
        # Merge both transcript sets and record the de novo accessions.
        transcripts = FileMerger([dnfasta, ggfasta], tfasta).merge()
        accn_extract_cmd = "cat {0} | {1} > {2}".format(dnfasta, accn_extract, tdn)
        cmds.append(accn_extract_cmd)
        if not prepare:
            sh(accn_extract_cmd)
    else:
        symlink(dnfasta, tfasta)
        transcripts = tfasta

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    # Value returned by the previous sh() call; it is passed on through
    # opts.hold_jid before each following step (presumably a grid job id
    # used to chain jobs -- confirm against sh()).
    prjobid = None
    if clean:
        # seqclean is never given more than 16 CPUs.
        ccpus = 16 if cpus >= 16 else cpus
        cleancmd = "{0} {1} -c {2} -l 60".format(seqclean, transcripts, ccpus)
        if prepare:
            cmds.append(cleancmd)
        else:
            prjobid = sh(cleancmd, grid=grid, grid_opts=opts)

    # Write the alignAssembly configuration file.
    aafw = must_open(aaconf, "w")
    print(alignAssembly_conf.format("{0}_pasa".format(pasa_db),
                                    pctcov, pctid, bpsplice), file=aafw)
    aafw.close()

    symlink(genome, gfasta)

    aacmd = "{0} -c {1} -C -R -g {2}".format(launch_pasa, aaconf, gfasta)
    if clean:
        aacmd += " -t {0}.clean -T -u {0}".format(transcripts)
    else:
        aacmd += " -t {0}".format(transcripts)
    if fl_accs:
        symlink(fl_accs, flaccs)
        aacmd += " -f {0}".format(flaccs)
    if ggfasta:
        aacmd += " --TDN {0}".format(tdn)
    aacmd += " --ALIGNERS {0} -I {1} --CPU {2}".format(",".join(aligners),
                                                       opts.intron, cpus)

    if prepare:
        cmds.append(aacmd)
    else:
        opts.hold_jid = prjobid
        prjobid = sh(aacmd, grid=grid, grid_opts=opts)

    if opts.compreh and ggfasta:
        comprehcmd = "{0} -c {1} -t {2}".format(build_compreh_trans, aaconf, transcripts)
        comprehcmd += " --min_per_ID {0} --min_per_aligned {1}".format(compreh_pctid, compreh_pctcov)

        if prepare:
            cmds.append(comprehcmd)
        else:
            opts.hold_jid = prjobid
            prjobid = sh(comprehcmd, grid=grid, grid_opts=opts)

    if prepare:
        write_file(runfile, "\n".join(cmds))  # initialize run script
def compare(args):
    """
    %prog compare pasa_db_name [--annots_gff3=annotation.gff3]

    Run the PASA annotation comparison pipeline

    This assumes that PASA alignment assembly has already been completed and
    run directory contains `genome.fasta` and `transcript.fasta` files.

    If `--annots_gff3` is specified, the PASA database is loaded with the annotations
    first before starting annotation comparison. Otherwise, it uses previously
    loaded annotation data.

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    # NOTE: the docstring above is handed to OptionParser and serves as the
    # command-line usage text.
    p = OptionParser(compare.__doc__)
    p.set_pasa_opts(action="compare")
    p.add_option("--prepare", default=False, action="store_true",
                 help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    # Exactly one positional is unpacked below; any other count (including
    # surplus arguments) shows the usage instead of raising ValueError.
    if len(args) != 1:
        sys.exit(not p.print_help())

    pasa_db, = args

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        # Non-zero status: this is a failure, not a normal termination.
        sys.exit(1)

    launch_pasa = which(op.join(PASA_HOME, "scripts",
                                "Launch_PASA_pipeline.pl"))

    annots_gff3 = opts.annots_gff3
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"

    # Everything below runs inside the PASA run directory.
    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "", append=True, skipcheck=True)  # initialize run script

    # Write the annotCompare configuration file.
    acfw = must_open(acconf, "w")
    print(annotCompare_conf.format("{0}_pasa".format(pasa_db),
          opts.pctovl, opts.pct_coding, opts.pctid_prot, opts.pctlen_FL,
          opts.pctlen_nonFL, opts.orf_size, opts.pct_aln, opts.pctovl_gene,
          opts.stompovl, opts.trust_FL, opts.utr_exons), file=acfw)
    acfw.close()

    if not op.exists(gfasta):
        sys.exit("Genome fasta file `{0}` does not exist".format(gfasta))

    transcripts = tfasta
    if not op.exists(transcripts):
        sys.exit("Transcript fasta file `{0}` does not exist".format(transcripts))

    # Prefer the `.clean` transcripts when such a file is present.
    if op.exists("{0}.clean".format(transcripts)):
        transcripts = "{0}.clean".format(transcripts)

    accmd = "{0} -c {1} -A -g {2} -t {3} --GENETIC_CODE {4}".format(
        launch_pasa, acconf, gfasta, transcripts, opts.genetic_code)

    if annots_gff3:
        if not op.exists(annots_gff3):
            sys.exit("Annotation gff3 file `{0}` does not exist".format(annots_gff3))
        symlink(annots_gff3, annotation)
        accmd += " -L --annots_gff3 {0}".format(annotation)

    if prepare:
        write_file(runfile, accmd, append=True)
    else:
        sh(accmd, grid=grid, grid_opts=opts)
def prepare(args):
    """
    %prog prepare [--options] folder [genome.fasta]

    Run Trinity on a folder of reads. When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare script for DN

    If genome.fasta is provided, prepare script for GG-Trinity.
    If coord-sorted BAM is provided, then it will use it as starting point.

    Since GG-Trinity jobs are partitioned DN-Trinity jobs run on
    relatively small regions, lesser amount of CPU can be specified
    for each DN job using `--gg_cpu`
    In such cases, the `--cpu` should be set to a larger value to help
    speedup upstream steps such as GSNAP read mapping or coordinate sorting
    of BAM files.

    Newer versions of trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before assembling
    """
    # The docstring is the usage text given to OptionParser, so its wording
    # is kept as is; implementation notes live in comments instead.
    parser = OptionParser(prepare.__doc__)
    parser.add_option("--paired", default=False, action="store_true",
                      help="Paired-end mode [default: %default]")
    parser.add_option("--merge", default=False, action="store_true",
                      help="Merge individual input fastq's into left/right/single"
                           " file(s) [default: %default]")
    parser.set_trinity_opts()
    parser.set_grid()

    opts, args = parser.parse_args(args)

    if not 1 <= len(args) <= 2:
        sys.exit(not parser.print_help())

    inparam = args[0]
    genome = args[1] if len(args) == 2 else None
    # A genome argument switches from de novo (DN) to genome-guided (GG).
    method = "DN" if genome is None else "GG"

    paired, merge = opts.paired, opts.merge
    thome = opts.trinity_home
    use_bam = opts.use_bam
    gg_cpu = opts.gg_cpu

    prefix = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(prefix, method)

    # Work inside the run folder; the starting directory is restored at
    # the end of the function.
    startdir = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)

    flist = iglob("../" + inparam, "*.fq", "*.fastq", "*.fq.gz", "*.fastq.gz")
    if paired:
        # Split the file list into mates based on filename patterns.
        f1 = [x for x in flist if any(tag in x for tag in ("_1_", ".1.", "_1."))]
        f2 = [x for x in flist if any(tag in x for tag in ("_2_", ".2.", "_2."))]
        assert len(f1) == len(f2)
        reads = ((f1, "left.fastq"), (f2, "right.fastq"))
    else:
        reads = ((flist, "single.fastq"),)

    if merge:
        for sources, merged in reads:
            FileMerger(sources, merged).merge(checkexists=True)

    # Collect the command-line fragments; they are joined with single
    # spaces once everything is known.
    parts = [
        op.join(thome, "Trinity"),
        "--seqType fq --JM {0} --CPU {1}".format(opts.JM, opts.cpus),
        "--min_contig_length {0}".format(opts.min_contig_length),
    ]

    if opts.bflyGCThreads:
        parts.append("--bflyGCThreads {0}".format(opts.bflyGCThreads))

    if method == "GG":
        parts.append("--genome {0} --genome_guided_max_intron {1}".format(
            genome, opts.max_intron))
        if use_bam:
            parts.append("--genome_guided_use_bam {0}".format(use_bam))
        if gg_cpu:
            parts.append("--genome_guided_CPU {0}".format(gg_cpu))

    if opts.grid and opts.grid_conf_file:
        parts.append("--grid_conf_file={0}".format(opts.grid_conf_file))

    # Read inputs: merged files when --merge is on, otherwise every file
    # gets its own flag.
    if paired and merge:
        parts.append("--left {0} --right {1}".format(reads[0][-1], reads[1][-1]))
    elif paired:
        for left, right in zip(f1, f2):
            parts.append("--left {0}".format(left))
            parts.append("--right {0}".format(right))
    elif merge:
        parts.append("--single {0}".format(reads[0][-1]))
    else:
        parts.extend("--single {0}".format(f) for f in flist)

    if opts.extra:
        parts.append("{0}".format(opts.extra))

    write_file("run.sh", " ".join(parts))
    os.chdir(startdir)
def prepare(args):
    """
    %prog prepare [--options] folder [--bam rnaseq.coordSorted.bam]

    Run Trinity on a folder of reads. When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare script for DN-Trinity.

    If coord-sorted BAM is provided, prepare script for GG-Trinity, using BAM
    as starting point.

    Newer versions of trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before assembling
    """
    # NOTE: the docstring above is handed to OptionParser and serves as the
    # command-line usage text.
    p = OptionParser(prepare.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.add_option("--merge", default=False, action="store_true",
                 help="Merge individual input fastq's into left/right/single" +
                      " file(s) [default: %default]")
    p.set_trinity_opts()
    p.set_fastq_names()
    p.set_grid()

    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    inparam, = args[:1]

    paired = opts.paired
    merge = opts.merge
    trinity_home = opts.trinity_home
    hpc_grid_runner_home = opts.hpcgridrunner_home

    # Genome-guided mode is selected only when the given BAM file exists;
    # otherwise the run stays de novo.
    method = "DN"
    bam = opts.bam
    if bam and op.exists(bam):
        bam = op.abspath(bam)
        method = "GG"

    pf = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(pf, method)

    # Work inside the run folder; the starting directory is restored at
    # the end of the function.
    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)

    # Lines of the generated run script.
    cmds = []

    # set TRINITY_HOME env variable when preparing shell script
    env_cmd = 'export TRINITY_HOME="{0}"'.format(trinity_home)
    cmds.append(env_cmd)

    if method == "DN":
        assert op.exists("../" + inparam)

        flist = iglob("../" + inparam, opts.names)
        if paired:
            # Split the file list into mates based on filename patterns.
            f1 = [x for x in flist
                  if "_1_" in x or ".1." in x or "_1." in x or "_R1" in x]
            f2 = [x for x in flist
                  if "_2_" in x or ".2." in x or "_2." in x or "_R2" in x]
            assert len(f1) == len(f2)
            if merge:
                r1, r2 = "left.fastq", "right.fastq"
                reads = ((f1, r1), (f2, r2))
        else:
            if merge:
                r = "single.fastq"
                reads = ((flist, r), )

        if merge:
            for fl, r in reads:
                fm = FileMerger(fl, r)
                fm.merge(checkexists=True)

    cmd = op.join(trinity_home, "Trinity")
    cmd += " --seqType fq --max_memory {0} --CPU {1}".format(opts.max_memory, opts.cpus)
    cmd += " --min_contig_length {0}".format(opts.min_contig_length)

    if opts.bflyGCThreads:
        cmd += " --bflyGCThreads {0}".format(opts.bflyGCThreads)

    if method == "GG":
        cmd += " --genome_guided_bam {0}".format(bam)
        cmd += " --genome_guided_max_intron {0}".format(opts.max_intron)
    else:
        if paired:
            if merge:
                cmd += " --left {0} --right {1}".format(reads[0][-1], reads[1][-1])
            else:
                cmd += " --left {0}".format(",".join(f1))
                cmd += " --right {0}".format(",".join(f2))
        else:
            if merge:
                cmd += " --single {0}".format(reads[0][-1])
            else:
                for f in flist:
                    cmd += " --single {0}".format(f)

    if opts.grid and opts.grid_conf_file:
        hpc_grid_runner = op.join(hpc_grid_runner_home, "hpc_cmds_GridRunner.pl")
        hpc_grid_conf_file = op.join(hpc_grid_runner_home, "hpc_conf", opts.grid_conf_file)
        assert op.exists(hpc_grid_conf_file), \
            "HpcGridRunner conf file does not exist: {0}".format(hpc_grid_conf_file)

        cmd += ' --grid_exec "{0} --grid_conf {1} -c"'.format(hpc_grid_runner, hpc_grid_conf_file)

    if opts.extra:
        cmd += " {0}".format(opts.extra)

    cmds.append(cmd)

    if opts.cleanup:
        # Keep only the final Trinity outputs; the file names depend on
        # the run mode.
        if method == "DN":
            cleanup_cmd = 'rm -rf !("Trinity.fasta"|"Trinity.gene_trans_map"|"Trinity.timing")'
        else:
            cleanup_cmd = 'rm -rf !("Trinity-GG.fasta"|"Trinity-GG.gene_trans_map"|"Trinity.timing")'
        # Add to the script's command list (`cmd` is a plain string and
        # has no append()).
        cmds.append(cleanup_cmd)

    runfile = "run.sh"
    write_file(runfile, "\n".join(cmds))
    os.chdir(cwd)
def assemble(args):
    """
    %prog assemble pasa_db_name genome.fasta transcripts-dn.fasta [transcript-gg.fasta]

    Run the PASA alignment assembly pipeline

    If two transcript fasta files (Trinity denovo and genome guided) are provided,
    the PASA Comprehensive Transcriptome protocol is followed
    <http://pasa.sourceforge.net/#A_ComprehensiveTranscriptome>

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    # NOTE: the docstring above is handed to OptionParser and serves as the
    # command-line usage text.
    p = OptionParser(assemble.__doc__)
    p.set_home("pasa")
    p.set_align(pctid=95, pctcov=90, intron=2000, bpsplice=3, compreh_pctcov=30)
    p.add_option("--aligners", default="blat,gmap",
                 help="Specify splice aligners to use for mapping [default: %default]")
    p.add_option("--clean", default=False, action="store_true",
                 help="Clean transcripts using tgi seqclean [default: %default]")
    p.set_cpus()
    p.set_grid()
    p.set_grid_opts()
    p.add_option("--prepare", default=False, action="store_true",
                 help="Prepare PASA run script with commands [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, dnfasta = args[:3]
    # The genome-guided transcript file is optional (4th positional).
    ggfasta = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        # Non-zero status: this is a failure, not a normal termination.
        sys.exit(1)

    aligners = opts.aligners.split(",")
    for aligner in aligners:
        if aligner not in ALLOWED_ALIGNERS:
            logging.error("Error: Unknown aligner `{0}`".format(aligner))
            logging.error("Can be any of {0}, ".format("|".join(ALLOWED_ALIGNERS)) +
                          "combine multiple aligners in list separated by comma")
            sys.exit(1)

    clean = opts.clean
    seqclean = which("seqclean")
    if clean and not seqclean:
        logging.error("Cannot find tgi seqclean in PATH")
        sys.exit(1)

    accn_extract = which(op.join(PASA_HOME, "misc_utilities",
                                 "accession_extractor.pl"))
    launch_pasa = which(op.join(PASA_HOME, "scripts",
                                "Launch_PASA_pipeline.pl"))
    build_compreh_trans = which(op.join(PASA_HOME, "scripts",
                                        "build_comprehensive_transcriptome.dbi"))

    cpus = opts.cpus
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"
    pctcov, pctid = opts.pctcov, opts.pctid
    compreh_pctcov, bpsplice = opts.compreh_pctcov, opts.bpsplice

    # Everything below runs inside the PASA run directory.
    mkdir(pasa_db)
    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "")  # initialize run script

    if ggfasta:
        # Merge both transcript sets and record the de novo accessions.
        transcripts = FileMerger([dnfasta, ggfasta], tfasta).merge()
        accn_extract_cmd = "cat {0} | {1} > {2}".format(dnfasta, accn_extract, tdn)
        if prepare:
            write_file(runfile, accn_extract_cmd, append=True)
        else:
            sh(accn_extract_cmd)
    else:
        transcripts = dnfasta

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    # Value returned by the previous sh() call; it is passed on through
    # opts.hold_jid before each following step (presumably a grid job id
    # used to chain jobs -- confirm against sh()).
    prjobid = None
    if clean:
        cleancmd = "{0} {1} -c {2} -l 60".format(seqclean, transcripts, cpus)
        if prepare:
            write_file(runfile, cleancmd, append=True)
        else:
            prjobid = sh(cleancmd, grid=grid, grid_opts=opts)

    # Write the alignAssembly configuration file.  A plain write() is used
    # because the `print >> handle` statement form fails wherever print is
    # a function.
    aafw = must_open(aaconf, "w")
    aafw.write(alignAssembly_conf.format("{0}_pasa".format(pasa_db),
                                         pctcov, pctid, bpsplice) + "\n")
    aafw.close()

    aacmd = "{0} -c {1} -C -R -g {2}".format(launch_pasa, aaconf, genome)
    if clean:
        aacmd += " -t {0}.clean -T -u {0} ".format(transcripts)
    else:
        aacmd += " -t {0} ".format(transcripts)
    if ggfasta:
        aacmd += " --TDN {0} ".format(tdn)
    aacmd += " --ALIGNERS {0} -I {1}".format(",".join(aligners), opts.intron)

    if prepare:
        write_file(runfile, aacmd, append=True)
    else:
        opts.hold_jid = prjobid
        prjobid = sh(aacmd, grid=grid, grid_opts=opts)

    if ggfasta:
        comprehcmd = "{0} -c {1} -t {2}".format(build_compreh_trans, aaconf, transcripts)
        # The leading space keeps the flag separate from the transcripts
        # path.  --min_per_aligned takes the dedicated `compreh_pctcov`
        # option, which was parsed above but previously never used.
        comprehcmd += " --min_per_ID {0} --min_per_aligned {1}".format(pctid, compreh_pctcov)

        if prepare:
            write_file(runfile, comprehcmd, append=True)
        else:
            opts.hold_jid = prjobid
            prjobid = sh(comprehcmd, grid=grid, grid_opts=opts)
def compare(args):
    """
    %prog compare pasa_db_name genome.fasta transcripts.fasta [annotation.gff]

    Run the PASA annotation comparison pipeline

    If annotation.gff file is provided, the PASA database is loaded with the annotations
    first before starting annotation comparison. Otherwise, it uses previously
    loaded annotation data.

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    # NOTE: the docstring above is handed to OptionParser and serves as the
    # command-line usage text.
    p = OptionParser(compare.__doc__)
    p.set_pasa_opts(action="compare")
    p.add_option("--prepare", default=False, action="store_true",
                 help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, transcripts = args[:3]
    # The annotation file is optional (4th positional).
    annotation = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error("PASA_HOME={0} directory does not exist".format(PASA_HOME))
        # Non-zero status: this is a failure, not a normal termination.
        sys.exit(1)

    launch_pasa = which(op.join(PASA_HOME, "scripts",
                                "Launch_PASA_pipeline.pl"))

    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"

    # Everything below runs inside the PASA run directory.
    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "")  # initialize run script

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    # Write the annotCompare configuration file.  A plain write() is used
    # because the `print >> handle` statement form fails wherever print is
    # a function.
    acfw = must_open(acconf, "w")
    acfw.write(annotCompare_conf.format("{0}_pasa".format(pasa_db),
               opts.pctovl, opts.pct_coding, opts.pctid_prot, opts.pctlen_FL,
               opts.pctlen_nonFL, opts.orf_size, opts.pct_aln, opts.pctovl_gene,
               opts.stompovl, opts.trust_FL, opts.utr_exons) + "\n")
    acfw.close()

    # Prefer the `.clean` transcripts when such a file is present.
    if op.exists("{0}.clean".format(transcripts)):
        transcripts = "{0}.clean".format(transcripts)

    accmd = "{0} -c {1} -A -g {2} -t {3} --GENETIC_CODE {4}".format(
        launch_pasa, acconf, genome, transcripts, opts.genetic_code)
    if annotation:
        accmd += " -L --annots_gff3 {0}".format(annotation)

    if prepare:
        write_file(runfile, accmd, append=True)
    else:
        sh(accmd, grid=grid, grid_opts=opts)
def prepare(args):
    """
    %prog prepare [--options] folder [genome.fasta]

    Run Trinity on a folder of reads. When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare script for DN

    If genome.fasta is provided, prepare script for GG-Trinity.
    If coord-sorted BAM is provided, then it will use it as starting point.

    Since GG-Trinity jobs are partitioned DN-Trinity jobs run on
    relatively small regions, lesser amount of CPU can be specified
    for each DN job using `--gg_cpu`
    In such cases, the `--cpu` should be set to a larger value to help
    speedup upstream steps such as GSNAP read mapping or coordinate sorting
    of BAM files.

    Newer versions of trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before assembling
    """
    # The docstring is the usage text given to OptionParser, so its wording
    # is kept as is; implementation notes live in comments instead.
    parser = OptionParser(prepare.__doc__)
    parser.add_option("--paired", default=False, action="store_true",
                      help="Paired-end mode [default: %default]")
    parser.add_option("--merge", default=False, action="store_true",
                      help="Merge individual input fastq's into left/right/single"
                           " file(s) [default: %default]")
    parser.set_trinity_opts()
    parser.set_grid()

    opts, args = parser.parse_args(args)

    if not 1 <= len(args) <= 2:
        sys.exit(not parser.print_help())

    inparam = args[0]
    assert op.exists(inparam)

    genome = args[1] if len(args) == 2 else None
    # A genome argument switches from de novo (DN) to genome-guided (GG).
    method = "DN" if genome is None else "GG"

    paired, merge = opts.paired, opts.merge
    thome = opts.trinity_home
    use_bam = opts.use_bam
    gg_cpu = opts.gg_cpu

    prefix = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(prefix, method)

    # Work inside the run folder; the starting directory is restored at
    # the end of the function.
    startdir = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)

    flist = iglob("../" + inparam, opts.names)
    if paired:
        # Split the file list into mates based on filename patterns.
        f1 = [x for x in flist if any(tag in x for tag in ("_1_", ".1.", "_1."))]
        f2 = [x for x in flist if any(tag in x for tag in ("_2_", ".2.", "_2."))]
        assert len(f1) == len(f2)
        reads = ((f1, "left.fastq"), (f2, "right.fastq"))
    else:
        reads = ((flist, "single.fastq"),)

    if merge:
        for sources, merged in reads:
            FileMerger(sources, merged).merge(checkexists=True)

    # Collect the command-line fragments; they are joined with single
    # spaces once everything is known.
    parts = [
        op.join(thome, "Trinity"),
        "--seqType fq --max_memory {0} --CPU {1}".format(opts.max_memory, opts.cpus),
        "--min_contig_length {0}".format(opts.min_contig_length),
    ]

    if opts.bflyGCThreads:
        parts.append("--bflyGCThreads {0}".format(opts.bflyGCThreads))

    if method == "GG":
        parts.append("--genome {0} --genome_guided_max_intron {1}".format(
            genome, opts.max_intron))
        if use_bam:
            parts.append("--genome_guided_use_bam {0}".format(use_bam))
        if gg_cpu:
            parts.append("--genome_guided_CPU {0}".format(gg_cpu))

    if opts.grid and opts.grid_conf_file:
        parts.append("--grid_conf_file={0}".format(opts.grid_conf_file))

    # Read inputs: merged files when --merge is on, otherwise every file
    # gets its own flag.
    if paired and merge:
        parts.append("--left {0} --right {1}".format(reads[0][-1], reads[1][-1]))
    elif paired:
        for left, right in zip(f1, f2):
            parts.append("--left {0}".format(left))
            parts.append("--right {0}".format(right))
    elif merge:
        parts.append("--single {0}".format(reads[0][-1]))
    else:
        parts.extend("--single {0}".format(f) for f in flist)

    if opts.extra:
        parts.append("{0}".format(opts.extra))

    parts.append("--bypass_java_version_check")

    write_file("run.sh", " ".join(parts))
    os.chdir(startdir)
def compare(args):
    """
    %prog compare pasa_db_name genome.fasta transcripts.fasta [annotation.gff]

    Run the PASA annotation comparison pipeline

    If annotation.gff file is provided, the PASA database is loaded with the annotations
    first before starting annotation comparison. Otherwise, it uses previously
    loaded annotation data.

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    # NOTE: the docstring above is handed to OptionParser and serves as the
    # command-line usage text.
    p = OptionParser(compare.__doc__)
    p.set_pasa_opts(action="compare")
    p.add_option(
        "--prepare",
        default=False,
        action="store_true",
        help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, transcripts = args[:3]
    # The annotation file is optional (4th positional).
    annotation = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error(
            "PASA_HOME={0} directory does not exist".format(PASA_HOME))
        # Non-zero status: this is a failure, not a normal termination.
        sys.exit(1)

    launch_pasa = which(op.join(PASA_HOME, "scripts",
                                "Launch_PASA_pipeline.pl"))

    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"

    # Everything below runs inside the PASA run directory.
    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "")  # initialize run script

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    # Write the annotCompare configuration file.  A plain write() is used
    # because the `print >> handle` statement form fails wherever print is
    # a function.
    acfw = must_open(acconf, "w")
    acfw.write(annotCompare_conf.format("{0}_pasa".format(pasa_db),
               opts.pctovl, opts.pct_coding, opts.pctid_prot, opts.pctlen_FL,
               opts.pctlen_nonFL, opts.orf_size, opts.pct_aln, opts.pctovl_gene,
               opts.stompovl, opts.trust_FL, opts.utr_exons) + "\n")
    acfw.close()

    # Prefer the `.clean` transcripts when such a file is present.
    if op.exists("{0}.clean".format(transcripts)):
        transcripts = "{0}.clean".format(transcripts)

    accmd = "{0} -c {1} -A -g {2} -t {3} --GENETIC_CODE {4}".format(
        launch_pasa, acconf, genome, transcripts, opts.genetic_code)
    if annotation:
        accmd += " -L --annots_gff3 {0}".format(annotation)

    if prepare:
        write_file(runfile, accmd, append=True)
    else:
        sh(accmd, grid=grid, grid_opts=opts)