def alignextend(args):
    """
    %prog alignextend ref.fasta read.1.fastq read.2.fastq

    Wrapper around AMOS alignextend.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    parser = OptionParser(alignextend.__doc__)
    parser.add_option(
        "--nosuffix", default=False, action="store_true",
        help="Do not add /1/2 suffix to the read [default: %default]")
    parser.set_home("amos")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    ref, r1, r2 = args
    pf = op.basename(r1).split(".")[0]

    # Collect the command fragments; they are joined with single spaces below.
    pieces = [op.join(opts.amos_home, "src/Experimental/alignextend.pl")]
    if not opts.nosuffix:
        pieces.append("-suffix")

    # Skip re-indexing when the index file is up to date w.r.t. the reference.
    bwa_idx = "{0}.ref.fa.sa".format(pf)
    if not need_update(ref, bwa_idx):
        pieces.append("-noindex")

    pieces.append("-threads {0}".format(opts.cpus))

    # Pass -I when the guessed quality offset of the first read file is 64.
    if guessoffset([r1]) == 64:
        pieces.append("-I")

    pieces.extend((pf, ref, r1, r2))
    sh(" ".join(pieces))
def align(args):
    """
    %prog align database.fasta read1.fq read2.fq

    Wrapper for `gsnap` single-end or paired-end, depending on the number of
    args.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.formats.fasta import join
    from jcvi.formats.fastq import guessoffset
    from jcvi.projects.tgbs import snp

    parser = OptionParser(align.__doc__)
    parser.add_option("--join", default=False, action="store_true",
                      help="Join sequences with padded 50Ns")
    parser.add_option("--rnaseq", default=False, action="store_true",
                      help="Input is RNA-seq reads, turn splicing on")
    parser.add_option("--snp", default=False, action="store_true",
                      help="Call SNPs after GSNAP")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    # Two args means single-end, three means paired-end; anything else is a
    # usage error.
    nargs = len(args)
    if nargs not in (2, 3):
        sys.exit(not parser.print_help())
    if nargs == 2:
        logging.debug("Single-end alignment")
    else:
        logging.debug("Paired-end alignment")

    dbfile, readfile = args[0:2]
    if opts.join:
        dbfile = join([dbfile, "--gapsize=50", "--newid=chr1"])

    assert op.exists(dbfile) and op.exists(readfile)
    prefix = get_prefix(readfile, dbfile)
    logfile = prefix + ".log"
    gsnapfile = prefix + ".gsnap"

    if need_update((dbfile, readfile), gsnapfile):
        dbdir, dbname = check_index(dbfile)
        cmd = "gsnap -D {0} -d {1}".format(dbdir, dbname)
        cmd += " -B 5 -m 0.1 -i 2 -n 3"  # memory, mismatch, indel penalty, nhits
        if opts.rnaseq:
            cmd += " -N 1"
        cmd += " -t {0}".format(opts.cpus)
        cmd += " --gmap-mode none --nofails"
        if readfile.endswith(".gz"):
            cmd += " --gunzip"
        # Best effort: when guessoffset() trips an assertion, the quality
        # protocol switch is simply left off.
        try:
            if guessoffset([readfile]) == 33:
                offset = "sanger"
            else:
                offset = "illumina"
            cmd += " --quality-protocol {0}".format(offset)
        except AssertionError:
            pass
        cmd += " " + " ".join(args[1:])
        sh(cmd, outfile=gsnapfile, errfile=logfile)
    else:
        logging.error("`{0}` exists. `gsnap` already run.".format(gsnapfile))

    if opts.snp:
        snp([gsnapfile, "--cpus={0}".format(opts.cpus)])

    return gsnapfile, logfile
def correct(args):
    """
    %prog correct *.fastq

    Correct the fastqfile and generated corrected fastqfiles. This calls
    assembly.allpaths.prepare() to generate input files for ALLPATHS-LG. The
    naming convention for your fastqfiles are important, and are listed below.

    By default, this will correct all PE reads, and remove duplicates of all MP
    reads, and results will be placed in `frag_reads.corr.{pairs,frags}.fastq`
    and `jump_reads.corr.{pairs,frags}.fastq`.
    """
    # NOTE: the docstring (plus FastqNamings) is the usage text for OptionParser.
    from jcvi.assembly.allpaths import prepare
    from jcvi.assembly.base import FastqNamings

    parser = OptionParser(correct.__doc__ + FastqNamings)
    parser.add_option(
        "--nofragsdedup", default=False, action="store_true",
        help="Don't deduplicate the fragment reads [default: %default]")
    parser.add_option(
        "--cpus", default=32, type="int",
        help="Number of threads to run [default: %default]")
    opts, args = parser.parse_args(args)

    if len(args) < 1:
        sys.exit(not parser.print_help())

    fastq = args
    frag_tag = "frag_reads"
    jump_tag = "jump_reads"

    # Generate the ALLPATHS input tables without writing the run script.
    prepare(["Unknown"] + fastq + ["--norun"])

    datadir = "data"
    mkdir(datadir)
    fullpath = op.join(os.getcwd(), datadir)
    nthreads = " NUM_THREADS={0}".format(opts.cpus)
    phred64 = guessoffset([args[0]]) == 64

    frag_fastb = "{0}/{1}_orig.fastb".format(datadir, frag_tag)
    if need_update(fastq, frag_fastb):
        cmd = "PrepareAllPathsInputs.pl DATA_DIR={0} HOSTS='{1}'".format(
            fullpath, opts.cpus)
        if phred64:
            cmd += " PHRED_64=True"
        sh(cmd)

    # Each correction step only runs when its input .fastb file exists.
    if op.exists(frag_fastb):
        correct_frag(datadir, frag_tag, frag_fastb, nthreads,
                     dedup=not opts.nofragsdedup)

    jump_fastb = "{0}/{1}_orig.fastb".format(datadir, jump_tag)
    if op.exists(jump_fastb):
        correct_jump(datadir, jump_tag, jump_fastb, nthreads)
def fastq(args):
    """
    %prog fastq fastqfile

    Convert reads formatted as FASTQ file, and convert to CA frg file.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(fastq.__doc__)
    p.add_option(
        "--outtie",
        dest="outtie",
        default=False,
        action="store_true",
        help="Are these outie reads?",
    )
    p.set_phred()
    p.set_size()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        # print_help() returns None, so `not ...` makes the exit status
        # non-zero on a usage error, matching the other commands here.
        sys.exit(not p.print_help())

    fastqfiles = [get_abs_path(x) for x in args]
    size = opts.size
    outtie = opts.outtie
    if size > 1000 and (not outtie):
        logging.debug(
            "[warn] long insert size {0} but not outtie".format(size))

    # A zero insert size means the library is treated as unmated.
    mated = size != 0
    libname = op.basename(args[0]).split(".")[0]
    libname = libname.replace("_1_sequence", "")

    frgfile = libname + ".frg"
    mean, sv = get_mean_sv(opts.size)

    cmd = "fastqToCA"
    cmd += " -libraryname {0} ".format(libname)
    fastqs = " ".join("-reads {0}".format(x) for x in fastqfiles)
    if mated:
        assert len(args) in (
            1, 2), "you need one or two fastq files for mated library"
        fastqs = "-mates {0}".format(",".join(fastqfiles))
        cmd += "-insertsize {0} {1} ".format(mean, sv)
    cmd += fastqs

    # An explicit --phred wins; otherwise guess from the first file.
    offset = int(opts.phred) if opts.phred else guessoffset([fastqfiles[0]])
    illumina = offset == 64
    if illumina:
        cmd += " -type illumina"
    if outtie:
        cmd += " -outtie"

    sh(cmd, outfile=frgfile)
def tophat(args):
    """
    %prog tophat folder reference

    Run tophat on a folder of reads.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.apps.bowtie import check_index
    from jcvi.formats.fastq import guessoffset

    parser = OptionParser(tophat.__doc__)
    parser.add_option("--gtf", help="Reference annotation [default: %default]")
    parser.add_option("--single", default=False, action="store_true",
                      help="Single end mapping")
    parser.add_option("--intron", default=15000, type="int",
                      help="Max intron size [default: %default]")
    parser.add_option("--dist", default=-50, type="int",
                      help="Mate inner distance [default: %default]")
    parser.add_option("--stdev", default=50, type="int",
                      help="Mate standard deviation [default: %default]")
    parser.set_phred()
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    num = 1 if opts.single else 2
    folder, reference = args
    reference = check_index(reference)

    for reads, prefix in iter_project(folder, n=num):
        outdir = "{0}_tophat".format(prefix)
        outfile = op.join(outdir, "accepted_hits.bam")
        if op.exists(outfile):
            logging.debug("File `{0}` found. Skipping.".format(outfile))
            continue

        cmd = "tophat -p {0}".format(opts.cpus)
        if opts.gtf:
            cmd += " -G {0}".format(opts.gtf)
        cmd += " -o {0}".format(outdir)

        if num == 1:  # Single-end
            (first,) = reads
        else:  # Paired-end
            first, _second = reads
            cmd += " --max-intron-length {0}".format(opts.intron)
            cmd += " --mate-inner-dist {0}".format(opts.dist)
            cmd += " --mate-std-dev {0}".format(opts.stdev)

        # An explicit --phred wins; otherwise guess from the first read file.
        phred = opts.phred or str(guessoffset([first]))
        if phred == "64":
            cmd += " --phred64-quals"
        cmd += " {0} {1}".format(reference, " ".join(reads))

        sh(cmd)
def correct(args):
    """
    %prog correct *.fastq

    Correct the fastqfile and generated corrected fastqfiles. This calls
    assembly.allpaths.prepare() to generate input files for ALLPATHS-LG. The
    naming convention for your fastqfiles are important, and are listed below.

    By default, this will correct all PE reads, and remove duplicates of all MP
    reads, and results will be placed in `frag_reads.corr.{pairs,frags}.fastq`
    and `jump_reads.corr.{pairs,frags}.fastq`.
    """
    # NOTE: the docstring (plus FastqNamings) is the usage text for OptionParser.
    from jcvi.assembly.allpaths import prepare
    from jcvi.assembly.base import FastqNamings

    op_parser = OptionParser(correct.__doc__ + FastqNamings)
    op_parser.add_option("--nofragsdedup", default=False, action="store_true",
                         help="Don't deduplicate the fragment reads [default: %default]")
    op_parser.add_option("--cpus", default=32, type="int",
                         help="Number of threads to run [default: %default]")
    opts, args = op_parser.parse_args(args)

    if not args:
        sys.exit(not op_parser.print_help())

    fastq = args
    tag, tagj = "frag_reads", "jump_reads"

    # Build the ALLPATHS input tables; --norun suppresses the run script.
    prepare(["Unknown"] + fastq + ["--norun"])

    datadir = "data"
    mkdir(datadir)
    fullpath = op.join(os.getcwd(), datadir)
    nthreads = " NUM_THREADS={0}".format(opts.cpus)
    is_phred64 = guessoffset([args[0]]) == 64

    origfastb = datadir + "/" + tag + "_orig" + ".fastb"
    if need_update(fastq, origfastb):
        prep = "PrepareAllPathsInputs.pl DATA_DIR={0} HOSTS='{1}'".format(
            fullpath, opts.cpus)
        prep += " PHRED_64=True" if is_phred64 else ""
        sh(prep)

    # Fragment reads: deduplicate unless --nofragsdedup was given.
    if op.exists(origfastb):
        wants_dedup = not opts.nofragsdedup
        correct_frag(datadir, tag, origfastb, nthreads, dedup=wants_dedup)

    # Jump reads are only processed when their .fastb file is present.
    origjfastb = datadir + "/" + tagj + "_orig" + ".fastb"
    if op.exists(origjfastb):
        correct_jump(datadir, tagj, origjfastb, nthreads)
def fastq(args):
    """
    %prog fastq fastqfile

    Convert reads formatted as FASTQ file, and convert to CA frg file.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(fastq.__doc__)
    p.add_option(
        "--outtie", dest="outtie", default=False, action="store_true",
        help="Are these outie reads? [default: %default]"
    )
    p.set_phred()
    p.set_size()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        # print_help() returns None; negating it yields a non-zero exit status
        # for a usage error, consistent with the sibling commands.
        sys.exit(not p.print_help())

    fastqfiles = [get_abs_path(x) for x in args]
    size = opts.size
    outtie = opts.outtie
    if size > 1000 and (not outtie):
        logging.debug("[warn] long insert size {0} but not outtie".format(size))

    # A zero insert size means the library is treated as unmated.
    mated = size != 0
    libname = op.basename(args[0]).split(".")[0]
    libname = libname.replace("_1_sequence", "")

    frgfile = libname + ".frg"
    mean, sv = get_mean_sv(opts.size)

    cmd = "fastqToCA"
    cmd += " -libraryname {0} ".format(libname)
    fastqs = " ".join("-reads {0}".format(x) for x in fastqfiles)
    if mated:
        assert len(args) in (1, 2), "you need one or two fastq files for mated library"
        fastqs = "-mates {0}".format(",".join(fastqfiles))
        cmd += "-insertsize {0} {1} ".format(mean, sv)
    cmd += fastqs

    # An explicit --phred wins; otherwise guess from the first file.
    offset = int(opts.phred) if opts.phred else guessoffset([fastqfiles[0]])
    illumina = offset == 64
    if illumina:
        cmd += " -type illumina"
    if outtie:
        cmd += " -outtie"

    sh(cmd, outfile=frgfile)
def check_aln(dbfile, readfile, cpus=32):
    # Run `bwa aln` for readfile against dbfile unless the .sai output is
    # already up to date; returns the .sai file name either way.
    from jcvi.formats.fastq import guessoffset

    stem = readfile.rsplit(".", 1)[0]
    saifile = stem + ".sai"

    if not need_update((dbfile, readfile), saifile):
        logging.error("`{0}` exists. `bwa aln` already run.".format(saifile))
        return saifile

    offset = guessoffset([readfile])
    cmd = "bwa aln {0} {1}".format(dbfile, readfile)
    cmd += " -t {0}".format(cpus)
    # Pass -I when the guessed quality offset is 64.
    if offset == 64:
        cmd += " -I"
    sh(cmd, outfile=saifile)

    return saifile
def fastq(args):
    """
    %prog fastq fastqfile

    Convert reads formatted as FASTQ file, and convert to CA frg file.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(fastq.__doc__)
    phdchoices = ("33", "64")
    p.add_option("--outtie", dest="outtie", default=False, action="store_true",
                 help="Are these outie reads? [default: %default]")
    p.add_option("--phred", default=None, choices=phdchoices,
                 help="Phred score offset {0} [default: guess]".format(phdchoices))
    add_size_option(p)

    opts, args = p.parse_args(args)

    if len(args) < 1:
        # print_help() returns None; negating it yields a non-zero exit status
        # for a usage error, consistent with the sibling commands.
        sys.exit(not p.print_help())

    fastqfiles = [get_abs_path(x) for x in args]

    # A zero insert size means the library is treated as unmated.
    mated = (opts.size != 0)
    outtie = opts.outtie
    libname = op.basename(args[0]).split(".")[0]
    libname = libname.replace("_1_sequence", "")

    frgfile = libname + ".frg"
    mean, sv = get_mean_sv(opts.size)

    cmd = CAPATH("fastqToCA")
    cmd += " -libraryname {0} ".format(libname)
    fastqs = " ".join("-reads {0}".format(x) for x in fastqfiles)
    if mated:
        assert len(args) in (1, 2), "you need one or two fastq files for mated library"
        fastqs = "-mates {0}".format(",".join(fastqfiles))
        cmd += "-insertsize {0} {1} ".format(mean, sv)
    cmd += fastqs

    # An explicit --phred wins; otherwise guess from the first file.
    offset = int(opts.phred) if opts.phred else guessoffset([fastqfiles[0]])
    illumina = (offset == 64)
    if illumina:
        cmd += " -type illumina"
    if outtie:
        cmd += " -outtie"

    sh(cmd, outfile=frgfile)
def alignextend(args):
    """
    %prog alignextend ref.fasta read.1.fastq read.2.fastq

    Wrapper around AMOS alignextend.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    stages = ["prepare", "align", "filter", "rmdup", "genreads"]
    parser = OptionParser(alignextend.__doc__)
    parser.add_option(
        "--nosuffix", default=False, action="store_true",
        help="Do not add /1/2 suffix to the read [default: %default]")
    parser.add_option(
        "--rc", default=False, action="store_true",
        help="Reverse complement the reads before alignment")
    parser.add_option(
        "--len", default=100, type="int",
        help="Extend to this length")
    parser.add_option(
        "--stage", default="prepare", choices=stages,
        help="Start from certain stage")
    parser.add_option(
        "--dup", default=10, type="int",
        help="Filter duplicates with coordinates within this distance")
    parser.add_option(
        "--maxdiff", default=1, type="int",
        help="Maximum number of differences")
    parser.set_home("amos")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    ref, r1, r2 = args
    pf = op.basename(r1).split(".")[0]

    # Collect the command fragments; they are joined with single spaces below.
    pieces = [op.join(opts.amos_home, "src/Experimental/alignextend.pl")]
    if not opts.nosuffix:
        pieces.append("-suffix")

    # Skip re-indexing when the index file is up to date w.r.t. the reference.
    bwa_idx = "{0}.ref.fa.sa".format(pf)
    if not need_update(ref, bwa_idx):
        pieces.append("-noindex")

    pieces.append("-threads {0}".format(opts.cpus))

    # Pass -I when the guessed quality offset of the first read file is 64.
    if guessoffset([r1]) == 64:
        pieces.append("-I")
    if opts.rc:
        pieces.append("-rc")

    extend_len = opts.len
    pieces.append("-allow -len {0} -dup {1}".format(extend_len, opts.dup))
    # The -min/-max bounds are 2x and 20x the extension length.
    pieces.append("-min {0} -max {1}".format(2 * extend_len, 20 * extend_len))
    pieces.append("-maxdiff {0}".format(opts.maxdiff))
    pieces.append("-stage {0}".format(opts.stage))
    pieces.extend((pf, ref, r1, r2))

    sh(" ".join(pieces))
def check_aln(dbfile, readfile, grid=False, cpus=32):
    # Run `bwa aln` for readfile against dbfile unless the .sai output already
    # exists; returns the .sai file name either way.
    from jcvi.formats.fastq import guessoffset

    saifile = readfile.rsplit(".", 1)[0] + ".sai"

    if not op.exists(saifile):
        offset = guessoffset([readfile])
        # Pass -I when the guessed quality offset is 64.
        illumina_flag = " -I" if offset == 64 else ""
        cmd = "bwa aln -t {0}".format(cpus) + illumina_flag
        cmd += " {0} {1}".format(dbfile, readfile)
        sh(cmd, grid=grid, outfile=saifile)
    else:
        logging.error("`{0}` exists. `bwa aln` already run.".format(saifile))

    return saifile
def clean(args):
    """
    %prog clean 1.fastq 2.fastq [insertsize]

    Clean and dedup paired FASTQ files.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    parser = OptionParser(clean.__doc__)
    parser.add_option("-a", default=0, type="int",
                      help="Trim length at 5' end [default: %default]")
    parser.add_option("-b", default=50, type="int",
                      help="Trim length at 3' end [default: %default]")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    # The insert size is either given as a third argument or looked up from
    # the first file via get_size().
    nargs = len(args)
    if nargs == 2:
        p1, p2 = args
        size = get_size(p1)
    elif nargs == 3:
        p1, p2 = args[:2]
        size = int(args[2])
    else:
        sys.exit(not parser.print_help())

    pf = p1.split(".")[0]
    cpus = opts.cpus
    offset = guessoffset([p1])
    a, b = opts.a, opts.b

    p1_clean = p1 + ".clean"
    p2_clean = p2 + ".clean"
    p1_cleangz = p1_clean + ".gz"
    p2_cleangz = p2_clean + ".gz"

    # Nothing to do when both gzipped outputs are already up to date.
    if not need_update([p1, p2], [p1_cleangz, p2_cleangz]):
        return

    cmd = " ".join((
        "SOAPfilter_v2.0 -t {0} -m 2000000 -p -y -z -g".format(cpus),
        "-q {0} -w 10 -B 50 -f 0".format(offset),
        # The same 5'/3' trim lengths are applied via -a/-b and -c/-d.
        "-l {0} -a {1} -b {2} -c {1} -d {2}".format(size, a, b),
        "{0} {1} {2}.clean.stat {3} {4}".format(p1, p2, pf, p1_clean, p2_clean),
    ))
    sh(cmd)
def correct(args):
    """
    %prog correct *.fastq

    Correct reads using ErrorCorrection. Only PE will be used to build the K-mer
    table.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(correct.__doc__)
    # Every other command in this file registers --cpus via set_cpus().
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    # First pass: only files whose name starts with "PE" feed the K-mer table.
    # file.write() is used instead of `print >> fw` so this runs on Python 2
    # and 3 alike; `with` closes the handle even if the write fails.
    lstfile = "reads2cor.lst"
    with open(lstfile, "w") as fw:
        fw.write("\n".join(x for x in args if x[:2] == "PE") + "\n")

    p1 = args[0]
    offset = guessoffset([p1])
    cpus = opts.cpus

    freq = "output.freq.cz"
    freqlen = freq + ".len"
    if need_update(args, (freq, freqlen)):
        cmd = "KmerFreq_AR_v2.0 -k 17 -c -1 -q {0}".format(offset)
        cmd += " -m 1 -t {0}".format(cpus)
        cmd += " -p output {0}".format(lstfile)
        sh(cmd)

    # Second pass: the list file is rewritten with all inputs for correction.
    with open(lstfile, "w") as fw:
        fw.write("\n".join(args) + "\n")

    cmd = "Corrector_AR_v2.0 -k 17 -l 3 -m 5 -c 5 -a 0 -e 1 -w 0 -r 45"
    cmd += " -Q {0} -q 30 -x 8 -t {1} -o 1 ".format(offset, cpus)
    cmd += " {0} {1} {2}".format(freq, freqlen, lstfile)
    sh(cmd)
def correct(args):
    """
    %prog correct *.fastq

    Correct reads using ErrorCorrection. Only PE will be used to build the K-mer
    table.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    p = OptionParser(correct.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    # First pass: only files whose name starts with "PE" feed the K-mer table.
    # `with` guarantees the handle is closed even if the write fails.
    lstfile = "reads2cor.lst"
    with open(lstfile, "w") as fw:
        print("\n".join(x for x in args if x[:2] == "PE"), file=fw)

    p1 = args[0]
    offset = guessoffset([p1])
    cpus = opts.cpus

    freq = "output.freq.cz"
    freqlen = freq + ".len"
    if need_update(args, (freq, freqlen)):
        cmd = "KmerFreq_AR_v2.0 -k 17 -c -1 -q {0}".format(offset)
        cmd += " -m 1 -t {0}".format(cpus)
        cmd += " -p output {0}".format(lstfile)
        sh(cmd)

    # Second pass: the list file is rewritten with all inputs for correction.
    with open(lstfile, "w") as fw:
        print("\n".join(args), file=fw)

    cmd = "Corrector_AR_v2.0 -k 17 -l 3 -m 5 -c 5 -a 0 -e 1 -w 0 -r 45"
    cmd += " -Q {0} -q 30 -x 8 -t {1} -o 1 ".format(offset, cpus)
    cmd += " {0} {1} {2}".format(freq, freqlen, lstfile)
    sh(cmd)
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    version = "0.20"
    jarname = "trimmomatic-{0}.jar".format(version)
    phdchoices = ("33", "64")

    parser = OptionParser(trim.__doc__)
    parser.add_option("--path", default=op.join("~/bin", jarname),
                      help="Path to trimmomatic [default: %default]")
    parser.add_option("--phred", default=None, choices=phdchoices,
                      help="Phred score offset {0} [default: guess]".format(phdchoices))
    parser.add_option("--nofrags", default=False, action="store_true",
                      help="Discard frags file in PE mode [default: %default]")
    parser.add_option("--minqv", default=10, type="int",
                      help="Average qv after trimming [default: %default]")
    parser.add_option("--minlen", default=30, type="int",
                      help="Minimum length after trimming [default: %default]")
    parser.add_option("--nogz", default=False, action="store_true",
                      help="Do not write to gzipped files [default: %default]")
    set_grid(parser)

    opts, args = parser.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not parser.print_help())

    path = op.expanduser(opts.path)
    url = ("http://www.usadellab.org/cms/uploads/supplementary/"
           "Trimmomatic/Trimmomatic-{0}.zip").format(version)

    # Jar not at the configured path: download the zip, unpack it (unless the
    # unpacked folder is already there), delete the download and point at the
    # jar inside the unpacked folder.
    if not op.exists(path):
        path = download(url)
        unzipped = "Trimmomatic-" + version
        if not op.exists(unzipped):
            sh("unzip " + path)
        os.remove(path)
        path = op.join(unzipped, jarname)

    assert op.exists(path)

    adaptersfile = "adapters.fasta"
    if not op.exists(adaptersfile):
        write_file(adaptersfile, Adapters)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".\
        format(adaptersfile)

    # An explicit --phred wins; otherwise guess from the first input file.
    if opts.phred is not None:
        offset = int(opts.phred)
    else:
        offset = guessoffset([args[0]])

    phredflag = " -phred{0}".format(offset)

    cmd = JAVAPATH("java-1.6.0")
    cmd += " -Xmx4g -cp {0} org.usadellab.trimmomatic".format(path)

    gz = "" if opts.nogz else ".gz"
    frags = ".frags.fastq" + gz
    pairs = ".pairs.fastq" + gz

    def stem(fname):
        # Base name with ".gz" removed and the last extension stripped.
        return op.basename(fname).replace(".gz", "").rsplit(".", 1)[0]

    if len(args) == 1:
        cmd += ".TrimmomaticSE"
        cmd += phredflag
        (fastqfile,) = args
        frags1 = stem(fastqfile) + frags
        cmd += " {0}".format(" ".join((fastqfile, frags1)))
    else:
        cmd += ".TrimmomaticPE"
        cmd += phredflag
        fastqfile1, fastqfile2 = args
        prefix1 = stem(fastqfile1)
        prefix2 = stem(fastqfile2)
        pairs1 = prefix1 + pairs
        pairs2 = prefix2 + pairs
        frags1 = prefix1 + frags
        frags2 = prefix2 + frags
        if opts.nofrags:
            frags1 = frags2 = "/dev/null"
        outputs = (fastqfile1, fastqfile2, pairs1, frags1, pairs2, frags2)
        cmd += " {0}".format(" ".join(outputs))

    cmd += " ILLUMINACLIP:{0}:2:40:12".format(adaptersfile)
    cmd += " LEADING:3 TRAILING:3"
    cmd += " SLIDINGWINDOW:4:{0} MINLEN:{1}".format(opts.minqv, opts.minlen)

    # Add the TOPHRED33 step whenever the input offset is not already 33.
    if offset != 33:
        cmd += " TOPHRED33"
    sh(cmd, grid=opts.grid)
def prepare(args):
    """
    %prog prepare "B. oleracea" *.fastq

    Scan input fastq files (see below) and create `in_groups.csv` and
    `in_libs.csv`. The species name does not really matter.
    """
    # NOTE: the docstring (plus FastqNamings) is the usage text for OptionParser.
    from jcvi.utils.table import write_csv
    from jcvi.formats.base import write_file
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(prepare.__doc__ + FastqNamings)
    p.add_option("--corr", default=False, action="store_true",
                 help="Extra parameters for corrected data [default: %default]")
    p.add_option("--norun", default=False, action="store_true",
                 help="Don't write `run.sh` script [default: %default]")
    p.add_option("--ploidy", default="2", choices=("1", "2"),
                 help="Ploidy [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    organism_name = args[0]
    # Project name is the upper-cased initials of the organism name.
    project_name = "".join(x[0] for x in organism_name.split()).upper()
    # With only the organism given, fall back to globbing the working dir.
    fnames = sorted(glob("*.fastq*") if len(args) == 1 else args[1:])
    if not fnames:
        # Without this guard the offset check below dies on fnames[0] with a
        # bare IndexError.
        logging.error("No fastq files found.")
        sys.exit(1)
    for x in fnames:
        assert op.exists(x), "File `{0}` not found.".format(x)

    # All inputs must share the quality offset guessed for the first file.
    offset = guessoffset([fnames[0]])
    phred64 = offset == 64

    assert all(guessoffset([x]) == offset for x in fnames[1:])

    groupheader = "group_name library_name file_name".split()
    libheader = "library_name project_name organism_name type paired "\
        "frag_size frag_stddev insert_size insert_stddev read_orientation "\
        "genomic_start genomic_end".split()
    groupcontents = []
    libs = []
    for file_name in fnames:
        group_name = op.basename(file_name).split(".")[0]
        library_name = "-".join(group_name.split("-")[:2])

        # Handle paired files and convert to wildcard
        if ".1." in file_name:
            file_name = file_name.replace(".1.", ".?.")
        elif ".2." in file_name:
            continue

        groupcontents.append((group_name, library_name, file_name))
        if library_name not in libs:
            libs.append(library_name)

    libcontents = []
    for library_name in libs:
        L = Library(library_name)
        size = L.size or ""
        stddev = L.stddev or ""
        libtype = L.type  # renamed from `type` to stop shadowing the builtin
        paired = L.paired
        read_orientation = L.read_orientation

        # Size/stddev go into the frag_* columns for "fragment" libraries and
        # into the insert_* columns for everything else.
        is_fragment = libtype == "fragment"
        frag_size = size if is_fragment else ""
        frag_stddev = stddev if is_fragment else ""
        insert_size = "" if is_fragment else size
        insert_stddev = "" if is_fragment else stddev
        genomic_start, genomic_end = "", ""
        libcontents.append((library_name, project_name, organism_name,
                            libtype, paired, frag_size, frag_stddev,
                            insert_size, insert_stddev, read_orientation,
                            genomic_start, genomic_end))

    write_csv(groupheader, groupcontents, filename="in_groups.csv", tee=True)
    # Log message now names the file that is actually written.
    logging.debug("`in_groups.csv` created (# of groups = {0}).".
                  format(len(groupcontents)))

    write_csv(libheader, libcontents, filename="in_libs.csv", tee=True)
    logging.debug("`in_libs.csv` created (# of libs = {0}).".
                  format(len(libcontents)))

    runfile = "run.sh"

    extra = ""
    if opts.corr:
        extra += "FE_NUM_CYCLES=1 EC_K=28 FE_QUAL_CEIL_RADIUS=0"
        extra += " REMOVE_DODGY_READS_FRAG=False FE_MAX_KMER_FREQ_TO_MARK=1"

    if not opts.norun:
        contents = ALLPATHSRUN.format(opts.ploidy, opts.cpus, phred64, extra)
        write_file(runfile, contents)
def align(args):
    """
    %prog align database.fasta read1.fq read2.fq

    Wrapper for `gsnap` single-end or paired-end, depending on the number of
    args.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.formats.fastq import guessoffset

    parser = OptionParser(align.__doc__)
    parser.add_option("--rnaseq", default=False, action="store_true",
                      help="Input is RNA-seq reads, turn splicing on")
    parser.add_option("--native", default=False, action="store_true",
                      help="Convert GSNAP output to NATIVE format")
    parser.set_home("eddyyeh")
    parser.set_outdir()
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    # Two args means single-end, three means paired-end; anything else is a
    # usage error.
    nargs = len(args)
    if nargs not in (2, 3):
        sys.exit(not parser.print_help())
    if nargs == 2:
        logging.debug("Single-end alignment")
    else:
        logging.debug("Paired-end alignment")

    dbfile, readfile = args[:2]
    outdir = opts.outdir
    assert op.exists(dbfile) and op.exists(readfile)

    prefix = get_prefix(readfile, dbfile)
    logfile = op.join(outdir, prefix + ".log")
    gsnapfile = op.join(outdir, prefix + ".gsnap")
    nativefile = gsnapfile.rsplit(".", 1)[0] + ".unique.native"

    if need_update((dbfile, readfile), gsnapfile):
        dbdir, dbname = check_index(dbfile)
        cmd = "gsnap -D {0} -d {1}".format(dbdir, dbname)
        cmd += " -B 5 -m 0.1 -i 2 -n 3"  # memory, mismatch, indel penalty, nhits
        if opts.rnaseq:
            cmd += " -N 1"
        cmd += " -t {0}".format(opts.cpus)
        cmd += " --gmap-mode none --nofails"
        if readfile.endswith(".gz"):
            cmd += " --gunzip"
        # Best effort: when guessoffset() trips an assertion, the quality
        # protocol switch is simply left off.
        try:
            if guessoffset([readfile]) == 33:
                offset = "sanger"
            else:
                offset = "illumina"
            cmd += " --quality-protocol {0}".format(offset)
        except AssertionError:
            pass
        cmd += " " + " ".join(args[1:])
        sh(cmd, outfile=gsnapfile, errfile=logfile)
    else:
        logging.error("`{0}` exists. `gsnap` already run.".format(gsnapfile))

    # Optional conversion step, itself skipped when the native file is current.
    if opts.native and need_update(gsnapfile, nativefile):
        converter = op.join(opts.eddyyeh_home, "convert2native.pl")
        converter += " --gsnap {0} -o {1}".format(gsnapfile, nativefile)
        converter += " -proc {0}".format(opts.cpus)
        sh(converter)

    return gsnapfile, logfile
def correct(args):
    """
    %prog correct *.fastq

    Correct the fastqfile and generated corrected fastqfiles. This calls
    assembly.allpaths.prepare() to generate input files for ALLPATHS-LG. The
    naming convention for your fastqfiles are important, and are listed below.

    By default, this will correct all PE reads, and remove duplicates of all MP
    reads, and results will be placed in `frag_reads.corr.{pairs,frags}.fastq`
    and `jump_reads.corr.{pairs,frags}.fastq`.
    """
    # NOTE: the docstring (plus FastqNamings) is the usage text for OptionParser.
    from jcvi.assembly.allpaths import prepare
    from jcvi.assembly.base import FastqNamings

    p = OptionParser(correct.__doc__ + FastqNamings)
    p.add_option("--dir", default="data",
                 help="Working directory [default: %default]")
    # The flag is forwarded as `dedup=` below, so setting it turns fragment
    # deduplication ON; the help text previously claimed the opposite.
    p.add_option("--fragsdedup", default=False, action="store_true",
                 help="Deduplicate the fragment reads [default: %default]")
    p.add_option("--ploidy", default="2", choices=("1", "2"),
                 help="Ploidy [default: %default]")
    p.add_option("--haploidify", default=False, action="store_true",
                 help="Set HAPLOIDIFY=True [default: %default]")
    p.add_option("--suffix", default=False, action="store_true",
                 help="Add suffix /1, /2 to read names")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastq = args
    tag, tagj, taglj = "frag_reads", "jump_reads", "long_jump_reads"

    ploidy = opts.ploidy
    haploidify = opts.haploidify
    suffix = opts.suffix
    # --haploidify is only accepted together with ploidy 2.
    assert (not haploidify) or ploidy == "2"

    # Build the ALLPATHS input tables; --norun suppresses the run script.
    prepare(["Unknown"] + fastq + ["--norun"])

    datadir = opts.dir
    mkdir(datadir)
    fullpath = op.join(os.getcwd(), datadir)
    nthreads = " NUM_THREADS={0}".format(opts.cpus)
    phred64 = (guessoffset([args[0]]) == 64)

    orig = datadir + "/{0}_orig".format(tag)
    origfastb = orig + ".fastb"
    if need_update(fastq, origfastb):
        cmd = "PrepareAllPathsInputs.pl DATA_DIR={0} HOSTS='{1}' PLOIDY={2}".\
            format(fullpath, opts.cpus, ploidy)
        if phred64:
            cmd += " PHRED_64=True"
        sh(cmd)

    # Each correction step only runs when its input .fastb file exists.
    if op.exists(origfastb):
        correct_frag(datadir, tag, origfastb, nthreads,
                     dedup=opts.fragsdedup, haploidify=haploidify,
                     suffix=suffix)

    origj = datadir + "/{0}_orig".format(tagj)
    origjfastb = origj + ".fastb"
    if op.exists(origjfastb):
        correct_jump(datadir, tagj, origjfastb, nthreads, suffix=suffix)

    origlj = datadir + "/{0}_orig".format(taglj)
    origljfastb = origlj + ".fastb"
    if op.exists(origljfastb):
        correct_jump(datadir, taglj, origljfastb, nthreads, suffix=suffix)
def align(args):
    """
    %prog align database.fasta read1.fq [read2.fq]

    Wrapper for `bowtie2` single-end or paired-end, depending on the number of
    args.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(align.__doc__)
    p.set_firstN(firstN=0)
    p.add_option("--full", default=False, action="store_true",
                 help="Enforce end-to-end alignment [default: local]")
    p.add_option("--reorder", default=False, action="store_true",
                 help="Keep the input read order [default: %default]")
    p.add_option("--null", default=False, action="store_true",
                 help="Do not write to SAM/BAM output")
    p.add_option("--fasta", default=False, action="store_true",
                 help="Query reads are FASTA")
    p.set_cutoff(cutoff=800)
    p.set_mateorientation(mateorientation="+-")
    p.set_sam_options(bowtie=True)

    opts, args = p.parse_args(args)
    extra = opts.extra
    mo = opts.mateorientation
    # "+-" adds no switch. The others are appended with a leading space so
    # they are not glued onto user-supplied --extra text (e.g. "-k 5--rf").
    if mo == "-+":
        extra += " --rf"
    elif mo != "+-":
        extra += " --ff"

    # Two args means single-end, three means paired-end; anything else is a
    # usage error.
    PE = True
    if len(args) == 2:
        logging.debug("Single-end alignment")
        PE = False
    elif len(args) == 3:
        logging.debug("Paired-end alignment")
    else:
        sys.exit(not p.print_help())

    firstN = opts.firstN
    mapped = opts.mapped
    unmapped = opts.unmapped
    fasta = opts.fasta
    gl = "--end-to-end" if opts.full else "--local"

    dbfile, readfile = args[0:2]
    dbfile = check_index(dbfile)
    prefix = get_prefix(readfile, dbfile)
    samfile, mapped, unmapped = get_samfile(readfile, dbfile, bowtie=True,
                                            mapped=mapped, unmapped=unmapped,
                                            bam=opts.bam)
    logfile = prefix + ".log"
    # The quality offset is only guessed (and only used) for FASTQ input.
    if not fasta:
        offset = guessoffset([readfile])

    if not need_update(dbfile, samfile):
        logging.error("`{0}` exists. `bowtie2` already run.".format(samfile))
        return samfile, logfile

    cmd = "bowtie2 -x {0}".format(dbfile)
    if PE:
        r1, r2 = args[1:3]
        cmd += " -1 {0} -2 {1}".format(r1, r2)
        cmd += " --maxins {0}".format(opts.cutoff)
        mtag, utag = "--al-conc", "--un-conc"
    else:
        cmd += " -U {0}".format(readfile)
        mtag, utag = "--al", "--un"

    if mapped:
        cmd += " {0} {1}".format(mtag, mapped)
    if unmapped:
        cmd += " {0} {1}".format(utag, unmapped)

    if firstN:
        cmd += " --upto {0}".format(firstN)
    cmd += " -p {0}".format(opts.cpus)
    if fasta:
        cmd += " -f"
    else:
        cmd += " --phred{0}".format(offset)
    cmd += " {0}".format(gl)
    if opts.reorder:
        cmd += " --reorder"

    cmd += " {0}".format(extra)
    # Finally the log
    cmd += " 2> {0}".format(logfile)

    if opts.null:
        samfile = "/dev/null"

    cmd = output_bam(cmd, samfile)
    sh(cmd)
    # Echo the captured stderr log; `with` closes the handle promptly.
    with open(logfile) as fp:
        print(fp.read(), file=sys.stderr)

    return samfile, logfile
def align(args):
    """
    %prog align database.fasta read1.fq [read2.fq]

    Wrapper for `bowtie2` single-end or paired-end, depending on the number of
    args.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    from jcvi.formats.fastq import guessoffset

    p = OptionParser(align.__doc__)
    p.set_firstN(firstN=0)
    p.add_option("--full", default=False, action="store_true",
                 help="Enforce end-to-end alignment [default: local]")
    p.add_option("--reorder", default=False, action="store_true",
                 help="Keep the input read order [default: %default]")
    p.set_cutoff(cutoff=800)
    p.set_mateorientation(mateorientation="+-")
    p.set_sam_options(bowtie=True)

    opts, args = p.parse_args(args)
    extra = opts.extra
    mo = opts.mateorientation
    # "+-" adds no switch. The others are appended with a leading space so
    # they are not glued onto user-supplied --extra text (e.g. "-k 5--rf").
    if mo == "-+":
        extra += " --rf"
    elif mo != "+-":
        extra += " --ff"

    # Two args means single-end, three means paired-end; anything else is a
    # usage error.
    PE = True
    if len(args) == 2:
        logging.debug("Single-end alignment")
        PE = False
    elif len(args) == 3:
        logging.debug("Paired-end alignment")
    else:
        sys.exit(not p.print_help())

    firstN = opts.firstN
    mapped = opts.mapped
    unmapped = opts.unmapped
    gl = "--end-to-end" if opts.full else "--local"

    dbfile, readfile = args[0:2]
    dbfile = get_abs_path(dbfile)
    safile = check_index(dbfile)
    prefix = get_prefix(readfile, dbfile)
    samfile, mapped, unmapped = get_samfile(
        readfile, dbfile, bowtie=True, mapped=mapped, unmapped=unmapped,
        bam=opts.bam
    )
    logfile = prefix + ".log"
    offset = guessoffset([readfile])

    if not need_update(safile, samfile):
        logging.error("`{0}` exists. `bowtie2` already run.".format(samfile))
        return samfile, logfile

    cmd = "bowtie2 -x {0}".format(dbfile)
    if PE:
        r1, r2 = args[1:3]
        cmd += " -1 {0} -2 {1}".format(r1, r2)
        cmd += " --maxins {0}".format(opts.cutoff)
        mtag, utag = "--al-conc", "--un-conc"
    else:
        cmd += " -U {0}".format(readfile)
        mtag, utag = "--al", "--un"

    if mapped:
        cmd += " {0} {1}".format(mtag, mapped)
    if unmapped:
        cmd += " {0} {1}".format(utag, unmapped)

    if firstN:
        cmd += " --upto {0}".format(firstN)
    cmd += " -p {0}".format(opts.cpus)
    cmd += " --phred{0}".format(offset)
    cmd += " {0}".format(gl)
    if opts.reorder:
        cmd += " --reorder"

    cmd += " {0}".format(extra)
    # Finally the log
    cmd += " 2> {0}".format(logfile)

    cmd = output_bam(cmd, samfile)
    sh(cmd)
    # Echo the captured stderr log. write() replaces the Python-2-only
    # `print >>sys.stderr` form (the trailing newline matches what print
    # added); `with` closes the handle promptly.
    with open(logfile) as fp:
        sys.stderr.write(fp.read() + "\n")

    return samfile, logfile
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    # NOTE: the docstring above doubles as the command-line usage text (it is
    # handed to OptionParser), so developer notes live in comments instead.
    #
    # args    -- list of command-line tokens: one (SE) or two (PE) FASTQ files
    #            plus options.
    # returns -- None; the assembled Trimmomatic command is run through sh().
    # exits   -- prints the help and calls sys.exit() unless 1 or 2 positional
    #            arguments are supplied.
    tv = "0.32"
    TrimJar = "trimmomatic-{0}.jar".format(tv)
    phdchoices = ("33", "64")
    p = OptionParser(trim.__doc__)
    p.add_option("--path", default=op.join("~/bin", TrimJar),
                 help="Path to trimmomatic jar file [default: %default]")
    p.add_option("--phred", default=None, choices=phdchoices,
                 help="Phred score offset [default: guess]")
    p.add_option("--nofrags", default=False, action="store_true",
                 help="Discard frags file in PE mode [default: %default]")
    p.add_option("--minqv", default=15, type="int",
                 help="Average qv after trimming [default: %default]")
    p.add_option("--minlen", default=36, type="int",
                 help="Minimum length after trimming [default: %default]")
    p.add_option("--adapteronly", default=False, action="store_true",
                 help="Only trim adapters with no qv trimming [default: %default]")
    p.add_option("--nogz", default=False, action="store_true",
                 help="Do not write to gzipped files [default: %default]")
    p.add_option("--log", default=None, dest="trimlog",
                 help="Specify a `trimlog` file [default: %default]")
    p.set_cpus(cpus=4)
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    path = op.expanduser(opts.path)
    url = \
        "http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-{0}.zip"\
        .format(tv)

    # Fall back to downloading and unpacking the release when the jar is not
    # at the configured location.
    if not op.exists(path):
        path = download(url)
        TrimUnzipped = "Trimmomatic-" + tv
        if not op.exists(TrimUnzipped):
            sh("unzip " + path)
        os.remove(path)
        path = op.join(TrimUnzipped, TrimJar)

    assert op.exists(path), \
        "Couldn't find Trimmomatic jar file at `{0}`".\
        format(path)

    # Copy the bundled adapter sequences into the working directory.
    adaptersfile = "adapters.fasta"
    Adapters = must_open(op.join(datadir, adaptersfile)).read()
    write_file(adaptersfile, Adapters, skipcheck=True)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".\
        format(adaptersfile)

    if opts.phred is None:
        offset = guessoffset([args[0]])
    else:
        offset = int(opts.phred)

    frags = ".frags.fastq"
    pairs = ".pairs.fastq"
    if not opts.nogz:
        frags += ".gz"
        pairs += ".gz"

    def fastq_prefix(fastqfile):
        # Strip only a trailing ".gz" (not every ".gz" occurrence inside the
        # name), then drop the last remaining extension.
        name = op.basename(fastqfile)
        if name.endswith(".gz"):
            name = name[:-len(".gz")]
        return name.rsplit(".", 1)[0]

    if len(args) == 1:
        mode = " SE"
        fastqfile, = args
        files = (fastqfile, fastq_prefix(fastqfile) + frags)
    else:
        mode = " PE"
        fastqfile1, fastqfile2 = args
        prefix1 = fastq_prefix(fastqfile1)
        prefix2 = fastq_prefix(fastqfile2)
        pairs1 = prefix1 + pairs
        pairs2 = prefix2 + pairs
        frags1 = prefix1 + frags
        frags2 = prefix2 + frags
        if opts.nofrags:
            frags1 = "/dev/null"
            frags2 = "/dev/null"
        files = (fastqfile1, fastqfile2, pairs1, frags1, pairs2, frags2)

    # Mode first, then the shared flags, then input/output files, then steps.
    cmd = "java -Xmx4g -jar {0}".format(path)
    cmd += mode
    cmd += " -phred{0}".format(offset)
    cmd += " -threads {0}".format(opts.cpus)
    if opts.trimlog:
        cmd += " -trimlog {0}".format(opts.trimlog)
    cmd += " {0}".format(" ".join(files))

    cmd += " ILLUMINACLIP:{0}:2:30:10".format(adaptersfile)

    # Quality-based steps are skipped when only adapter clipping is requested.
    if not opts.adapteronly:
        cmd += " LEADING:3 TRAILING:3"
        cmd += " SLIDINGWINDOW:4:{0}".format(opts.minqv)

    cmd += " MINLEN:{0}".format(opts.minlen)

    # Add the TOPHRED33 step whenever the input offset is not already 33.
    if offset != 33:
        cmd += " TOPHRED33"
    sh(cmd)
def trim(args):
    """
    %prog trim fastqfiles

    Trim reads using TRIMMOMATIC. If two fastqfiles are given, then it invokes
    the paired reads mode. See manual:

    <http://www.usadellab.org/cms/index.php?page=trimmomatic>
    """
    # NOTE: the docstring above doubles as the command-line usage text (it is
    # handed to OptionParser), so developer notes live in comments instead.
    #
    # args    -- list of command-line tokens: one (SE) or two (PE) FASTQ files
    #            plus options.
    # returns -- None; the assembled Trimmomatic command is run through sh().
    # exits   -- prints the help and calls sys.exit() unless 1 or 2 positional
    #            arguments are supplied.
    tv = "0.20"
    TrimJar = "trimmomatic-{0}.jar".format(tv)
    phdchoices = ("33", "64")
    p = OptionParser(trim.__doc__)
    p.add_option("--path", default=op.join("~/bin", TrimJar),
                 help="Path to trimmomatic [default: %default]")
    p.add_option("--phred", default=None, choices=phdchoices,
                 help="Phred score offset {0} [default: guess]".format(phdchoices))
    p.add_option("--nofrags", default=False, action="store_true",
                 help="Discard frags file in PE mode [default: %default]")
    p.add_option("--minqv", default=10, type="int",
                 help="Average qv after trimming [default: %default]")
    p.add_option("--minlen", default=30, type="int",
                 help="Minimum length after trimming [default: %default]")
    p.add_option("--nogz", default=False, action="store_true",
                 help="Do not write to gzipped files [default: %default]")
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    path = op.expanduser(opts.path)
    url = \
        "http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-{0}.zip"\
        .format(tv)

    # Fall back to downloading and unpacking the release when the jar is not
    # at the configured location.
    if not op.exists(path):
        path = download(url)
        TrimUnzipped = "Trimmomatic-" + tv
        if not op.exists(TrimUnzipped):
            sh("unzip " + path)
        os.remove(path)
        path = op.join(TrimUnzipped, TrimJar)

    assert op.exists(path), \
        "Couldn't find Trimmomatic jar file at `{0}`".\
        format(path)

    # Write the adapter sequences only if the working directory lacks them.
    adaptersfile = "adapters.fasta"
    if not op.exists(adaptersfile):
        write_file(adaptersfile, Adapters)

    assert op.exists(adaptersfile), \
        "Please place the illumina adapter sequence in `{0}`".\
        format(adaptersfile)

    if opts.phred is None:
        offset = guessoffset([args[0]])
    else:
        offset = int(opts.phred)

    frags = ".frags.fastq"
    pairs = ".pairs.fastq"
    if not opts.nogz:
        frags += ".gz"
        pairs += ".gz"

    def fastq_prefix(fastqfile):
        # Strip only a trailing ".gz" (not every ".gz" occurrence inside the
        # name), then drop the last remaining extension.
        name = op.basename(fastqfile)
        if name.endswith(".gz"):
            name = name[:-len(".gz")]
        return name.rsplit(".", 1)[0]

    if len(args) == 1:
        main_class = ".TrimmomaticSE"
        fastqfile, = args
        files = (fastqfile, fastq_prefix(fastqfile) + frags)
    else:
        main_class = ".TrimmomaticPE"
        fastqfile1, fastqfile2 = args
        prefix1 = fastq_prefix(fastqfile1)
        prefix2 = fastq_prefix(fastqfile2)
        pairs1 = prefix1 + pairs
        pairs2 = prefix2 + pairs
        frags1 = prefix1 + frags
        frags2 = prefix2 + frags
        if opts.nofrags:
            frags1 = "/dev/null"
            frags2 = "/dev/null"
        files = (fastqfile1, fastqfile2, pairs1, frags1, pairs2, frags2)

    # The SE/PE suffix completes the Java class name started on the -cp line.
    cmd = JAVAPATH("java-1.6.0")
    cmd += " -Xmx4g -cp {0} org.usadellab.trimmomatic".format(path)
    cmd += main_class
    cmd += " -phred{0}".format(offset)
    cmd += " {0}".format(" ".join(files))

    cmd += " ILLUMINACLIP:{0}:2:40:12".format(adaptersfile)
    cmd += " LEADING:3 TRAILING:3"
    cmd += " SLIDINGWINDOW:4:{0} MINLEN:{1}".format(opts.minqv, opts.minlen)
    # Add the TOPHRED33 step whenever the input offset is not already 33.
    if offset != 33:
        cmd += " TOPHRED33"
    sh(cmd, grid=opts.grid)