def wgsim(args):
    """
    %prog wgsim fastafile

    Run dwgsim on fastafile.
    """
    # NOTE: the docstring above is passed to OptionParser (presumably as the
    # usage text), so its wording is part of the program's output.
    p = OptionParser(wgsim.__doc__)
    p.add_option("--erate", default=.02, type="float",
                 help="Base error rate of the read [default: %default]")
    p.add_option("--distance", default=500, type="int",
                 help="Outer distance between the two ends [default: %default]")
    p.add_option("--genomesize", type="int",
                 help="Genome size in Mb [default: estimate from data]")
    p.add_option("--readlen", default=100, type="int",
                 help="Length of the read [default: %default]")
    p.add_option("--noerrors", default=False, action="store_true",
                 help="Simulate reads with no errors [default: %default]")
    p.set_depth(depth=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    pf = fastafile.split(".")[0]

    # Genome size is given in Mb on the command line; otherwise it is read
    # from the FASTA file itself.
    genomesize = opts.genomesize
    size = genomesize * 1000000 if genomesize else Fasta(fastafile).totalsize
    depth = opts.depth
    readlen = opts.readlen

    # Number of read *pairs*. Floor division plus int() keeps this an integer
    # regardless of the division semantics in effect or of `depth` being a
    # float, so dwgsim never receives something like `-N 50000.0`.
    readnum = int(size * depth // (2 * readlen))

    distance = opts.distance
    # The standard deviation is derived from the OUTER distance, so it must be
    # computed before `distance` is converted to the inner distance below.
    stdev = distance // 5

    outpf = "{0}.{1}bp.{2}x".format(pf, distance, depth)
    distance -= 2 * readlen  # Outer distance => Inner distance
    assert distance >= 0, "Outer distance must be >= 2 * readlen"

    logging.debug("Total genome size: {0} bp".format(size))
    logging.debug("Target depth: {0}x".format(depth))
    logging.debug("Number of read pairs (2x{0}): {1}".format(readlen, readnum))

    # --noerrors overrides any --erate and also zeroes the additional
    # -r/-R/-X/-y rates passed to dwgsim.
    erate = 0 if opts.noerrors else opts.erate
    cmd = "dwgsim -e {0} -E {0}".format(erate)
    if opts.noerrors:
        cmd += " -r 0 -R 0 -X 0 -y 0"

    cmd += " -d {0} -s {1}".format(distance, stdev)
    cmd += " -N {0} -1 {1} -2 {1}".format(readnum, readlen)
    cmd += " {0} {1}".format(fastafile, outpf)
    sh(cmd)
def shred(args):
    """
    %prog shred fastafile

    Similar to the method of `shredContig` in runCA script. The contigs are
    shredded into pseudo-reads with certain length and depth.
    """
    # NOTE: the docstring above is passed to OptionParser (presumably as the
    # usage text), so its wording is part of the program's output.
    p = OptionParser(shred.__doc__)
    p.set_depth(depth=2)
    p.add_option("--readlen", default=1000, type="int",
                 help="Desired length of the reads [default: %default]")
    p.add_option("--minctglen", default=0, type="int",
                 help="Ignore contig sequence less than [default: %default]")
    p.add_option("--shift", default=50, type="int",
                 help="Overlap between reads must be at least [default: %default]")
    p.add_option("--fasta", default=False, action="store_true",
                 help="Output shredded reads as FASTA sequences [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    libID = fastafile.split(".")[0]
    depth = opts.depth
    readlen = opts.readlen
    shift = opts.shift

    outfile = libID + ".depth{0}".format(depth)
    if opts.fasta:
        outfile += ".fasta"
    else:
        outfile += ".frg"
    f = Fasta(fastafile, lazy=True)

    fw = must_open(outfile, "w", checkexists=True)
    try:
        if not opts.fasta:
            print >> fw, headerTemplate.format(libID=libID)

        # Taken from runCA:
        #
        #                 |*********|
        #                 |###################|
        # |--------------------------------------------------|
        #  ---------------1---------------
        #            ---------------2---------------
        #                      ---------------3---------------
        # *** - center_increments
        # ### - center_range_width

        for ctgID, (name, rec) in enumerate(f.iteritems_ordered()):
            seq = rec.seq
            seqlen = len(seq)
            if seqlen < opts.minctglen:
                continue

            shredlen = min(seqlen - shift, readlen)
            ranges = []
            if shredlen <= 0:
                # Contig is no longer than the required overlap. Dividing by
                # `shredlen` would fail (== 0) or produce a truncated negative
                # slice (< 0), so emit the whole contig as one pseudo-read.
                ranges.append((0, seqlen))
            elif depth == 1:
                # Simple tiling: consecutive reads overlap by `shift` bases.
                if seqlen < readlen:
                    ranges.append((0, seqlen))
                else:
                    for begin in xrange(0, seqlen, readlen - shift):
                        end = min(seqlen, begin + readlen)
                        ranges.append((begin, end))
            else:
                # int() + floor division keep the read count usable by xrange.
                numreads = max(int(seqlen * depth // shredlen), 1)
                center_range_width = seqlen - shredlen
                if numreads == 1:
                    ranges.append((0, shredlen))
                else:
                    prev_begin = -1
                    # Spread the read starts evenly over the available range.
                    center_increments = center_range_width * 1.0 / (numreads - 1)
                    for i in xrange(numreads):
                        begin = center_increments * i
                        end = begin + shredlen
                        begin, end = int(begin), int(end)

                        # Rounding can collapse neighbouring starts; skip
                        # duplicates of the previous read.
                        if begin == prev_begin:
                            continue

                        ranges.append((begin, end))
                        prev_begin = begin

            for shredID, (begin, end) in enumerate(ranges):
                shredded_seq = seq[begin:end]
                fragID = "{0}.{1}.frag{2}.{3}-{4}".format(libID, ctgID, shredID, begin, end)
                emitFragment(fw, fragID, libID, shredded_seq, fasta=opts.fasta)
    finally:
        # Close the output even if shredding fails part-way through.
        fw.close()

    logging.debug("Shredded reads are written to `{0}`.".format(outfile))
    return outfile
def novo(args):
    """
    %prog novo reads.fastq

    Reference-free tGBS pipeline.
    """
    # The docstring above is handed to OptionParser, so it is kept verbatim.
    from jcvi.assembly.kmer import jellyfish, histogram
    from jcvi.assembly.preprocess import diginorm
    from jcvi.formats.fasta import filter as fasta_filter, format as fasta_format
    from jcvi.apps.cdhit import filter as cdhit_filter

    parser = OptionParser(novo.__doc__)
    parser.add_option("--technology", default="iontorrent",
                      choices=("illumina", "454", "iontorrent"),
                      help="Sequencing platform")
    parser.add_option("--dedup", default="cdhit",
                      choices=("uclust", "cdhit"),
                      help="Dedup algorithm")
    parser.set_depth(depth=50)
    parser.set_align(pctid=96)
    parser.set_home("cdhit", default="/usr/local/bin/")
    parser.set_home("fiona", default="/usr/local/bin/")
    parser.set_home("jellyfish", default="/usr/local/bin/")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    fastqfile = args[0]
    ncpus = opts.cpus
    depth = opts.depth
    prefix, suffix = fastqfile.rsplit(".", 1)

    # Normalize the reads, then expose the result under the expected name.
    diginormfile = prefix + ".diginorm." + suffix
    if need_update(fastqfile, diginormfile):
        diginorm([fastqfile, "--single", "--depth={0}".format(depth)])
        keepabund = fastqfile + ".keep.abundfilt"
        sh("cp -s {0} {1}".format(keepabund, diginormfile))

    # K-mer histogram (K=23); `histogram` supplies the genome size used below.
    histfile = prefix + "-K23.histogram"
    if need_update(diginormfile, histfile):
        jellyfish_args = [
            diginormfile,
            "--prefix={0}".format(prefix),
            "--cpus={0}".format(ncpus),
            "--jellyfish_home={0}".format(opts.jellyfish_home),
        ]
        jellyfish(jellyfish_args)

    genomesize = histogram([histfile, prefix, "23"])

    # Run fiona on the normalized reads, logging stdout and stderr together.
    fiona = prefix + ".fiona.fa"
    if need_update(diginormfile, fiona):
        fiona_cmd = op.join(opts.fiona_home, "fiona")
        fiona_cmd += " -g {0} -nt {1} --sequencing-technology {2}".\
            format(genomesize, ncpus, opts.technology)
        fiona_cmd += " -vv {0} {1}".format(diginormfile, fiona)
        fiona_log = prefix + ".fiona.log"
        sh(fiona_cmd, outfile=fiona_log, errfile=fiona_log)

    # Deduplicate with the selected algorithm.
    dedup = opts.dedup
    pctid = opts.pctid
    consensus = fiona + ".P{0}.{1}.consensus.fasta".format(pctid, dedup)
    if need_update(fiona, consensus):
        pctid_arg = "--pctid={0}".format(pctid)
        if dedup != "cdhit":
            uclust([fiona, pctid_arg])
        else:
            deduplicate([fiona, "--consensus", "--reads", pctid_arg,
                         "--cdhit_home={0}".format(opts.cdhit_home)])

    # Filter by cluster size, then by the "50" threshold given to fasta filter.
    filteredfile = prefix + ".filtered.fasta"
    if need_update(consensus, filteredfile):
        covfile = prefix + ".cov.fasta"
        cdhit_filter([consensus, "--outfile={0}".format(covfile),
                      "--minsize={0}".format(depth / 5)])
        fasta_filter([covfile, "50", "--outfile={0}".format(filteredfile)])

    # Final renaming pass over the filtered sequences.
    finalfile = prefix + ".final.fasta"
    if need_update(filteredfile, finalfile):
        fasta_format([filteredfile, finalfile, "--sequential=replace",
                      "--prefix={0}_".format(prefix)])
def diginorm(args):
    """
    %prog diginorm fastqfile

    Run K-mer based normalization. Based on tutorial:
    <http://ged.msu.edu/angus/diginorm-2012/tutorial.html>

    Assume input is either an interleaved pairs file, or two separate files.

    To set up khmer:
    $ git clone git://github.com/ged-lab/screed.git
    $ git clone git://github.com/ged-lab/khmer.git
    $ cd screed
    $ python setup.py install
    $ cd ../khmer
    $ make test
    $ export PYTHONPATH=~/export/khmer
    """
    # NOTE: the docstring above is passed to OptionParser (presumably as the
    # usage text), so its wording is part of the program's output.
    from jcvi.formats.fastq import shuffle, pairinplace, split
    from jcvi.apps.base import getfilesize

    p = OptionParser(diginorm.__doc__)
    p.add_option("--single", default=False, action="store_true",
                 help="Single end reads")
    p.add_option("--tablesize", help="Memory size")
    p.add_option("--npass", default="1", choices=("1", "2"),
                 help="How many passes of normalization")
    p.set_depth(depth=50)
    p.set_home("khmer", default="/usr/local/bin/")
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    # Two input files are merged into one by `shuffle`.
    if len(args) == 2:
        fastq = shuffle(args + ["--tag"])
    else:
        fastq, = args

    kh = opts.khmer_home
    depth = opts.depth
    PE = not opts.single
    sys.path.insert(0, op.join(kh, "python"))

    pf = fastq.rsplit(".", 1)[0]
    keepfile = fastq + ".keep"
    hashfile = pf + ".kh"

    # Table size: user-supplied value verbatim, otherwise 1/16 of the input
    # file size rounded up to the next multiple of `mints`.
    mints = 10000000
    ts = opts.tablesize
    if not ts:
        ts = (getfilesize(fastq) // 16 // mints + 1) * mints

    norm_cmd = op.join(kh, "normalize-by-median.py")
    filt_cmd = op.join(kh, "filter-abund.py")
    if need_update(fastq, (hashfile, keepfile)):
        cmd = norm_cmd
        cmd += " -C {0} -k 20 -N 4 -x {1}".format(depth, ts)
        if PE:
            cmd += " -p"
        cmd += " -s {0} {1}".format(hashfile, fastq)
        sh(cmd)

    abundfiltfile = keepfile + ".abundfilt"
    if need_update((hashfile, keepfile), abundfiltfile):
        cmd = filt_cmd
        cmd += " {0} {1}".format(hashfile, keepfile)
        sh(cmd)

    if opts.npass == "1":
        seckeepfile = abundfiltfile
    else:
        seckeepfile = abundfiltfile + ".keep"
        if need_update(abundfiltfile, seckeepfile):
            # --tablesize declares no `type`, so a user-supplied value is a
            # string (e.g. "2.5e8"); parse it before halving rather than
            # dividing the string itself.
            half_ts = int(float(ts)) // 2
            cmd = norm_cmd
            cmd += " -C {0} -k 20 -N 4 -x {1}".format(depth - 10, half_ts)
            cmd += " {0}".format(abundfiltfile)
            sh(cmd)

    # Re-pair and split the surviving reads for paired-end input only.
    if PE:
        pairsfile = pairinplace([seckeepfile,
                                 "--base={0}".format(pf + "_norm"), "--rclip=2"])
        split([pairsfile])
def expand(args):
    """
    %prog expand bes.fasta reads.fastq

    Expand sequences using short reads. Useful, for example for getting BAC-end
    sequences. The template to use, in `bes.fasta` may just contain the junction
    sequences, then align the reads to get the 'flanks' for such sequences.
    """
    # NOTE: the docstring above is passed to OptionParser (presumably as the
    # usage text), so its wording is part of the program's output.
    import math

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast

    p = OptionParser(expand.__doc__)
    p.set_depth(depth=200)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bes, reads = args
    size = Fasta(bes).totalsize
    rl = readlen([reads])
    # Template plus one read length of flank on each side.
    expected_size = size + 2 * rl
    nreads = expected_size * opts.depth / rl
    # Round the read count up to the next multiple of 1000.
    nreads = int(math.ceil(nreads / 1000.)) * 1000

    # Attract reads
    samfile, logfile = align([bes, reads, "--reorder", "--mapped",
                              "--firstN={0}".format(opts.firstN)])

    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))

    pf = mapped.split(".")[0]
    pf = pf.split("-")[0]
    bespf = bes.split(".")[0]
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])

    # Perform mini-assembly
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])

    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        cmd = "runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile)
        sh(cmd)
    assert op.exists(contigs)

    # Annotate contigs
    blastfile = blast([bes, contigs])
    mapping = dict(Blast(blastfile).iter_best_hit())

    f = Fasta(contigs, lazy=True)
    annotatedfasta = ".".join((pf, bespf, "fasta"))
    keys = list(Fasta(bes).iterkeys_ordered())  # keep an ordered list
    recs = []
    for key, v in f.iteritems_ordered():
        vid = v.id
        if vid not in mapping:
            continue
        b = mapping[vid]
        subject = b.subject
        rec = v.reverse_complement() if b.orientation == '-' else v
        rec.id = rid = "_".join((pf, vid, subject))
        rec.description = ""
        recs.append((keys.index(subject), rid, rec))

    # Order by position of the subject in `bes`, then by record id. Sorting on
    # those two fields only means the record objects are never compared.
    recs = [x[-1] for x in sorted(recs, key=lambda x: x[:2])]

    # Open the output only once the records are ready, and let the context
    # manager close it even if writing fails.
    with open(annotatedfasta, "w") as fw:
        SeqIO.write(recs, fw, "fasta")

    # Hand the intermediate files to FileShredder.
    FileShredder([samfile, logfile, mapped, reads, fastafile, qualfile,
                  blastfile, pf])
    logging.debug("Annotated seqs (n={0}) written to `{1}`.".\
                  format(len(recs), annotatedfasta))

    return annotatedfasta
def diginorm(args):
    """
    %prog diginorm fastqfile

    Run K-mer based normalization. Based on tutorial:
    <http://ged.msu.edu/angus/diginorm-2012/tutorial.html>

    Assume input is either an interleaved pairs file, or two separate files.

    To set up khmer:
    $ git clone git://github.com/ged-lab/screed.git
    $ git clone git://github.com/ged-lab/khmer.git
    $ cd screed
    $ python setup.py install
    $ cd ../khmer
    $ make test
    $ export PYTHONPATH=/root/khmer/python
    """
    # The docstring above is handed to OptionParser, so it is kept verbatim.
    from jcvi.formats.fastq import shuffle, pairinplace, split

    parser = OptionParser(diginorm.__doc__)
    parser.set_depth()
    parser.set_home("khmer")
    opts, args = parser.parse_args(args)

    nargs = len(args)
    if nargs < 1 or nargs > 2:
        sys.exit(not parser.print_help())

    # Two input files are merged into one by `shuffle`; one file is used as-is.
    fastq = shuffle(args + ["--tag"]) if nargs == 2 else args[0]

    khmer_home = opts.khmer_home
    depth = opts.depth
    sys.path.insert(0, op.join(khmer_home, "python"))

    prefix = fastq.rsplit(".", 1)[0]
    hashfile = prefix + ".kh"
    keepfile = fastq + ".keep"
    normalize = op.join(khmer_home, "scripts/normalize-by-median.py")
    filter_abund = op.join(khmer_home, "scripts/filter-abund.py")

    # Pass 1: normalize to the requested depth and save the hash table.
    if need_update(fastq, (hashfile, keepfile)):
        sh(normalize
           + " -C {0} -k 20 -N 4 -x 2.5e8 -p".format(depth)
           + " --savehash {0} {1}".format(hashfile, fastq))

    # Abundance filtering using the saved hash table.
    abundfiltfile = keepfile + ".abundfilt"
    if need_update((hashfile, keepfile), abundfiltfile):
        sh(filter_abund + " {0} {1}".format(hashfile, keepfile))

    # Pass 2: normalize again at a slightly lower depth.
    seckeepfile = abundfiltfile + ".keep"
    if need_update(abundfiltfile, seckeepfile):
        sh(normalize
           + " -C {0} -k 20 -N 4 -x 1e8".format(depth - 5)
           + " {0}".format(abundfiltfile))

    # Re-pair the surviving reads and split the result.
    norm_base = "--base={0}".format(prefix + "_norm")
    pairsfile = pairinplace([seckeepfile, norm_base, "--rclip=2"])
    split([pairsfile])
def novo(args):
    """
    %prog novo reads.fastq

    Reference-free tGBS pipeline.
    """
    # The docstring above is handed to OptionParser, so it is kept verbatim.
    from jcvi.assembly.kmer import jellyfish, histogram
    from jcvi.assembly.preprocess import diginorm
    from jcvi.formats.fasta import filter as fasta_filter, format as fasta_format
    from jcvi.apps.cdhit import filter as cdhit_filter

    parser = OptionParser(novo.__doc__)
    parser.add_option("--technology", default="iontorrent",
                      choices=("illumina", "454", "iontorrent"),
                      help="Sequencing platform")
    parser.add_option("--dedup", default="cdhit",
                      choices=("uclust", "cdhit"),
                      help="Dedup algorithm")
    parser.set_depth(depth=50)
    parser.set_align(pctid=96)
    parser.set_home("cdhit")
    parser.set_home("fiona")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    fastqfile = args[0]
    ncpus = opts.cpus
    depth = opts.depth
    prefix, suffix = fastqfile.rsplit(".", 1)

    # Normalize the reads, then expose the result under the expected name.
    diginormfile = prefix + ".diginorm." + suffix
    if need_update(fastqfile, diginormfile):
        diginorm([fastqfile, "--single", "--depth={0}".format(depth)])
        keepabund = fastqfile + ".keep.abundfilt"
        sh("cp -s {0} {1}".format(keepabund, diginormfile))

    # K-mer histogram (K=23); `histogram` supplies the genome size used below.
    histfile = prefix + "-K23.histogram"
    if need_update(diginormfile, histfile):
        jellyfish_args = [
            diginormfile,
            "--prefix={0}".format(prefix),
            "--cpus={0}".format(ncpus),
        ]
        jellyfish(jellyfish_args)

    genomesize = histogram([histfile, prefix, "23"])

    # Run fiona on the normalized reads, logging stdout and stderr together.
    fiona = prefix + ".fiona.fa"
    if need_update(diginormfile, fiona):
        fiona_cmd = op.join(opts.fiona_home, "bin/fiona")
        fiona_cmd += " -g {0} -nt {1} --sequencing-technology {2}".\
            format(genomesize, ncpus, opts.technology)
        fiona_cmd += " -vv {0} {1}".format(diginormfile, fiona)
        fiona_log = prefix + ".fiona.log"
        sh(fiona_cmd, outfile=fiona_log, errfile=fiona_log)

    # Deduplicate with the selected algorithm.
    dedup = opts.dedup
    pctid = opts.pctid
    consensus = fiona + ".P{0}.{1}.consensus.fasta".format(pctid, dedup)
    if need_update(fiona, consensus):
        pctid_arg = "--pctid={0}".format(pctid)
        if dedup != "cdhit":
            uclust([fiona, pctid_arg])
        else:
            deduplicate([fiona, "--consensus", "--reads", pctid_arg,
                         "--cdhit_home={0}".format(opts.cdhit_home)])

    # Filter by cluster size, then by the "50" threshold given to fasta filter.
    filteredfile = prefix + ".filtered.fasta"
    if need_update(consensus, filteredfile):
        covfile = prefix + ".cov.fasta"
        cdhit_filter([consensus, "--outfile={0}".format(covfile),
                      "--minsize={0}".format(depth / 5)])
        fasta_filter([covfile, "50", "--outfile={0}".format(filteredfile)])

    # Final renaming pass over the filtered sequences.
    finalfile = prefix + ".final.fasta"
    if need_update(filteredfile, finalfile):
        fasta_format([filteredfile, finalfile, "--sequential=replace",
                      "--prefix={0}_".format(prefix)])
def shred(args):
    """
    %prog shred fastafile

    Similar to the method of `shredContig` in runCA script. The contigs are
    shredded into pseudo-reads with certain length and depth.
    """
    # NOTE: the docstring above is passed to OptionParser (presumably as the
    # usage text), so its wording is part of the program's output.
    p = OptionParser(shred.__doc__)
    p.set_depth(depth=2)
    p.add_option("--readlen", default=1000, type="int",
                 help="Desired length of the reads [default: %default]")
    p.add_option("--minctglen", default=0, type="int",
                 help="Ignore contig sequence less than [default: %default]")
    p.add_option("--shift", default=50, type="int",
                 help="Overlap between reads must be at least [default: %default]")
    p.add_option("--fasta", default=False, action="store_true",
                 help="Output shredded reads as FASTA sequences [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    libID = fastafile.split(".")[0]
    depth = opts.depth
    readlen = opts.readlen
    shift = opts.shift

    outfile = libID + ".depth{0}".format(depth)
    if opts.fasta:
        outfile += ".fasta"
    else:
        outfile += ".frg"
    f = Fasta(fastafile, lazy=True)

    fw = must_open(outfile, "w", checkexists=True)
    try:
        if not opts.fasta:
            print >> fw, headerTemplate.format(libID=libID)

        # Taken from runCA:
        #
        #                 |*********|
        #                 |###################|
        # |--------------------------------------------------|
        #  ---------------1---------------
        #            ---------------2---------------
        #                      ---------------3---------------
        # *** - center_increments
        # ### - center_range_width

        for ctgID, (name, rec) in enumerate(f.iteritems_ordered()):
            seq = rec.seq
            seqlen = len(seq)
            if seqlen < opts.minctglen:
                continue

            shredlen = min(seqlen - shift, readlen)
            ranges = []
            if shredlen <= 0:
                # Contig is no longer than the required overlap. Dividing by
                # `shredlen` would fail (== 0) or produce a truncated negative
                # slice (< 0), so emit the whole contig as one pseudo-read.
                ranges.append((0, seqlen))
            elif depth == 1:
                # Simple tiling: consecutive reads overlap by `shift` bases.
                if seqlen < readlen:
                    ranges.append((0, seqlen))
                else:
                    for begin in xrange(0, seqlen, readlen - shift):
                        end = min(seqlen, begin + readlen)
                        ranges.append((begin, end))
            else:
                # int() + floor division keep the read count usable by xrange.
                numreads = max(int(seqlen * depth // shredlen), 1)
                center_range_width = seqlen - shredlen
                if numreads == 1:
                    ranges.append((0, shredlen))
                else:
                    prev_begin = -1
                    # Spread the read starts evenly over the available range.
                    center_increments = center_range_width * 1.0 / (numreads - 1)
                    for i in xrange(numreads):
                        begin = center_increments * i
                        end = begin + shredlen
                        begin, end = int(begin), int(end)

                        # Rounding can collapse neighbouring starts; skip
                        # duplicates of the previous read.
                        if begin == prev_begin:
                            continue

                        ranges.append((begin, end))
                        prev_begin = begin

            for shredID, (begin, end) in enumerate(ranges):
                shredded_seq = seq[begin:end]
                fragID = "{0}.{1}.frag{2}.{3}-{4}".format(libID, ctgID, shredID, begin, end)
                emitFragment(fw, fragID, libID, shredded_seq, fasta=opts.fasta)
    finally:
        # Close the output even if shredding fails part-way through.
        fw.close()

    logging.debug("Shredded reads are written to `{0}`.".format(outfile))
    return outfile