def cluster(args):
    """
    %prog cluster prefix fastqfiles

    Use `vsearch` to remove duplicate reads. This routine is heavily influenced
    by PyRAD: <https://github.com/dereneaton/pyrad>.
    """
    # The docstring is passed to OptionParser at runtime, so its wording is
    # left untouched.
    parser = OptionParser(cluster.__doc__)
    add_consensus_options(parser)
    parser.set_align(pctid=95)
    parser.set_outdir()
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    # A prefix and at least one FASTQ file are required.
    if len(args) < 2:
        sys.exit(not parser.print_help())

    prefix, fastqfiles = args[0], args[1:]
    cpus = opts.cpus
    pctid = opts.pctid
    mindepth = opts.mindepth
    minlength = opts.minlength

    # Convert the FASTQ input into a single FASTA file under the output dir.
    fasta_args = fastqfiles + [
        "--seqtk",
        "--outdir={0}".format(opts.outdir),
        "--outfile={0}".format(prefix + ".fasta"),
    ]
    fastafile, _qualfile = fasta(fasta_args)

    # `base` names identity-independent outputs; `tagged` embeds the pctid.
    base = op.join(opts.outdir, prefix)
    tagged = base + ".P{0}".format(pctid)

    # Each stage below only runs when need_update() says its output is stale.
    derepfile = base + ".derep"
    if need_update(fastafile, derepfile):
        derep(fastafile, derepfile, minlength, cpus)

    userfile = tagged + ".u"
    notmatchedfile = tagged + ".notmatched"
    if need_update(derepfile, userfile):
        cluster_smallmem(
            derepfile, userfile, notmatchedfile, minlength, pctid, cpus
        )

    clustfile = tagged + ".clust"
    clust_inputs = (derepfile, userfile, notmatchedfile)
    if need_update(clust_inputs, clustfile):
        makeclust(
            derepfile, userfile, notmatchedfile, clustfile, mindepth=mindepth
        )

    clustSfile = tagged + ".clustS"
    if need_update(clustfile, clustSfile):
        parallel_musclewrap(clustfile, cpus)

    statsfile = tagged + ".stats"
    if need_update(clustSfile, statsfile):
        makestats(clustSfile, statsfile, mindepth=mindepth)
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    # The docstring is passed to OptionParser at runtime, so its wording is
    # left untouched.
    parser = OptionParser(deduplicate.__doc__)
    parser.set_align(pctid=96, pctcov=0)
    parser.add_option(
        "--fast",
        default=False,
        action="store_true",
        help="Place sequence in the first cluster",
    )
    parser.add_option(
        "--consensus",
        default=False,
        action="store_true",
        help="Compute consensus sequences",
    )
    parser.add_option(
        "--reads",
        default=False,
        action="store_true",
        help="Use `cd-hit-454` to deduplicate [default: %default]",
    )
    parser.add_option(
        "--samestrand",
        default=False,
        action="store_true",
        help="Enforce same strand alignment",
    )
    parser.set_home("cdhit")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    # Exactly one input file is expected.
    if len(args) != 1:
        sys.exit(not parser.print_help())

    fastafile = args[0]
    identity = opts.pctid / 100.0
    fastafile, _qualfile = fasta([fastafile, "--seqtk"])

    # Collect the command-line fragments and join them once at the end.
    program = "cd-hit-454" if opts.reads else "cd-hit-est"
    fragments = [
        op.join(opts.cdhit_home, program),
        " -c {0}".format(identity),
    ]
    if program == "cd-hit-est":
        fragments.append(" -d 0")  # include complete defline
        if opts.samestrand:
            fragments.append(" -r 0")
    if not opts.fast:
        fragments.append(" -g 1")
    if opts.pctcov != 0:
        fragments.append(" -aL {0} -aS {0}".format(opts.pctcov / 100.0))

    deduped = fastafile + ".P{0}.cdhit".format(opts.pctid)
    clstr = deduped + ".clstr"
    fragments.append(
        " -M 0 -T {0} -i {1} -o {2}".format(opts.cpus, fastafile, deduped)
    )

    if need_update(fastafile, (deduped, clstr)):
        sh("".join(fragments))

    # Without --consensus there is nothing left to do.
    if not opts.consensus:
        return deduped

    cons = deduped + ".consensus"
    consensus_cmd = op.join(opts.cdhit_home, "cdhit-cluster-consensus")
    consensus_cmd += " clustfile={0} fastafile={1} output={2} maxlen=1".format(
        clstr, fastafile, cons
    )
    if need_update((clstr, fastafile), cons):
        sh(consensus_cmd)

    return deduped
def cluster(args):
    """
    %prog cluster prefix fastqfiles

    Use `vsearch` to remove duplicate reads. This routine is heavily influenced
    by PyRAD: <https://github.com/dereneaton/pyrad>.
    """
    # The docstring is passed to OptionParser at runtime, so its wording is
    # left untouched.
    p = OptionParser(cluster.__doc__)
    add_consensus_options(p)
    p.set_align(pctid=95)
    p.set_outdir()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    prefix = args[0]
    fastqfiles = args[1:]
    cpus, pctid = opts.cpus, opts.pctid
    mindepth, minlength = opts.mindepth, opts.minlength

    fastafile, qualfile = fasta(
        fastqfiles
        + [
            "--seqtk",
            "--outdir={0}".format(opts.outdir),
            "--outfile={0}".format(prefix + ".fasta"),
        ]
    )

    # Work out every intermediate file name up front.
    prefix = op.join(opts.outdir, prefix)
    pf = prefix + ".P{0}".format(pctid)
    derepfile = prefix + ".derep"
    userfile = pf + ".u"
    notmatchedfile = pf + ".notmatched"
    clustfile = pf + ".clust"
    clustSfile = pf + ".clustS"
    statsfile = pf + ".stats"

    # (inputs, output, action) per stage; stages run in order and each is
    # skipped unless need_update() reports its output as out of date.
    stages = (
        (
            fastafile,
            derepfile,
            lambda: derep(fastafile, derepfile, minlength, cpus),
        ),
        (
            derepfile,
            userfile,
            lambda: cluster_smallmem(
                derepfile, userfile, notmatchedfile, minlength, pctid, cpus
            ),
        ),
        (
            (derepfile, userfile, notmatchedfile),
            clustfile,
            lambda: makeclust(
                derepfile, userfile, notmatchedfile, clustfile, mindepth=mindepth
            ),
        ),
        (
            clustfile,
            clustSfile,
            lambda: parallel_musclewrap(clustfile, cpus),
        ),
        (
            clustSfile,
            statsfile,
            lambda: makestats(clustSfile, statsfile, mindepth=mindepth),
        ),
    )
    for inputs, output, run_stage in stages:
        if need_update(inputs, output):
            run_stage()
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively.  Also adding
    number of reads, average/median, N50, and total length.
    """
    # The docstring is passed to OptionParser at runtime, so its wording is
    # left untouched.
    from jcvi.utils.cbook import human_size, thousands, SUFFIXES
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import (
        plt,
        markup,
        human_formatter,
        human_base_formatter,
        savefig,
        set2,
        set_ticklabels_helvetica,
    )

    p = OptionParser(histogram.__doc__)
    p.set_histogram(vmax=50000, bins=100, xlabel="Read length",
                    title="Read length distribution")
    p.add_option("--ylabel1", default="Counts",
                 help="Label of y-axis on the left")
    p.add_option("--color", default='0', choices=[str(x) for x in range(8)],
                 help="Color of bars, which is an index 0-7 in brewer set2")
    opts, args, iopts = p.set_image_options(args, figsize="6x6", style="dark")

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    fastafile, qualfile = fasta([fastafile, "--seqtk"])
    sizes = Sizes(fastafile)
    all_sizes = sorted(sizes.sizes)
    xmin, xmax, bins = opts.vmin, opts.vmax, opts.bins
    left, height = stem_leaf_plot(all_sizes, xmin, xmax, bins)

    # Left axis: bar chart of read counts per length bin.
    plt.figure(1, (iopts.w, iopts.h))
    ax1 = plt.gca()

    # Bars take half of a bin's span.
    width = (xmax - xmin) * .5 / bins
    color = set2[int(opts.color)]
    ax1.bar(left, height, width=width, linewidth=0, fc=color, align="center")
    ax1.set_xlabel(markup(opts.xlabel))
    ax1.set_ylabel(opts.ylabel1)

    # Right axis: cumulative curve sharing the same x axis.
    ax2 = ax1.twinx()
    cur_size = 0
    total_size, l50, n50 = sizes.summary
    cdf = {}
    hsize = human_size(total_size)
    # The last two characters of the human-readable size serve as the unit
    # tag; its position in SUFFIXES[1000] gives the power of 1000 to scale by.
    tag = hsize[-2:]
    unit = 1000 ** SUFFIXES[1000].index(tag)

    # Walking the lengths in ascending order, record for the first occurrence
    # of each length the amount not yet consumed (total minus running sum).
    # NOTE(review): presumably total_size is the sum of all read lengths, which
    # would make this "bases in reads at or above this length" - confirm.
    for x in all_sizes:
        if x not in cdf:
            cdf[x] = (total_size - cur_size) * 1. / unit
        cur_size += x
    x, y = zip(*sorted(cdf.items()))
    ax2.plot(x, y, '-', color="darkslategray")
    ylabel2 = "{0} above read length".format(tag)
    ax2.set_ylabel(ylabel2)

    for ax in (ax1, ax2):
        set_ticklabels_helvetica(ax)
        ax.set_xlim((xmin - width / 2, xmax + width / 2))

    # Summary lines, right-aligned near the top-right corner of the plot and
    # stacked downwards in steps of .05 in axes coordinates.
    tc = "gray"
    axt = ax1.transAxes
    xx, yy = .95, .95
    ma = "Total bases: {0}".format(hsize)
    mb = "Total reads: {0}".format(thousands(len(sizes)))
    mc = "Average read length: {0}bp".format(thousands(np.mean(all_sizes)))
    md = "Median read length: {0}bp".format(thousands(np.median(all_sizes)))
    me = "N50 read length: {0}bp".format(thousands(l50))
    for t in (ma, mb, mc, md, me):
        # print() call form: the Python 2 `print >> stream, value` statement
        # raises TypeError at runtime under Python 3.
        print(t, file=sys.stderr)
        ax1.text(xx, yy, t, color=tc, transform=axt, ha="right")
        yy -= .05

    ax1.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    ax1.tick_params(axis="x", direction="out", length=3,
                    left=False, right=False, top=False, bottom=True)
    ax1.xaxis.set_major_formatter(human_base_formatter)
    ax1.yaxis.set_major_formatter(human_formatter)
    figname = sizes.filename + ".pdf"
    savefig(figname)
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively.  Also adding
    number of reads, average/median, N50, and total length.
    """
    # The docstring is passed to OptionParser at runtime, so its wording is
    # left untouched.
    from jcvi.utils.cbook import human_size, thousands, SUFFIXES
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import (
        plt,
        markup,
        human_formatter,
        human_base_formatter,
        savefig,
        set2,
        set_ticklabels_helvetica,
    )

    parser = OptionParser(histogram.__doc__)
    parser.set_histogram(
        vmax=50000,
        bins=100,
        xlabel="Read length",
        title="Read length distribution",
    )
    parser.add_option(
        "--ylabel1", default="Counts", help="Label of y-axis on the left"
    )
    parser.add_option(
        "--color",
        default="0",
        choices=[str(x) for x in range(8)],
        help="Color of bars, which is an index 0-7 in brewer set2",
    )
    opts, args, iopts = parser.set_image_options(
        args, figsize="6x6", style="dark"
    )

    if len(args) != 1:
        sys.exit(not parser.print_help())

    fastafile = args[0]
    fastafile, _qualfile = fasta([fastafile, "--seqtk"])
    read_sizes = Sizes(fastafile)
    lengths = sorted(read_sizes.sizes)
    xmin, xmax, bins = opts.vmin, opts.vmax, opts.bins
    left, height = stem_leaf_plot(lengths, xmin, xmax, bins)

    # Left axis: bar chart of counts per bin; bars take half of a bin's span.
    plt.figure(1, (iopts.w, iopts.h))
    count_ax = plt.gca()
    bar_width = (xmax - xmin) * 0.5 / bins
    bar_color = set2[int(opts.color)]
    count_ax.bar(
        left, height, width=bar_width, linewidth=0, fc=bar_color, align="center"
    )
    count_ax.set_xlabel(markup(opts.xlabel))
    count_ax.set_ylabel(opts.ylabel1)

    # Right axis: cumulative curve sharing the same x axis.
    cumul_ax = count_ax.twinx()
    total_size, l50, n50 = read_sizes.summary
    hsize = human_size(total_size)
    # Unit tag is the last two characters of the human-readable size; its
    # position in SUFFIXES[1000] gives the power of 1000 to scale by.
    tag = hsize[-2:]
    unit = 1000 ** SUFFIXES[1000].index(tag)

    # setdefault() keeps only the value seen at the first occurrence of each
    # length while the running sum keeps growing for every read.
    consumed = 0
    cdf = {}
    for length in lengths:
        cdf.setdefault(length, (total_size - consumed) * 1.0 / unit)
        consumed += length
    curve_x, curve_y = zip(*sorted(cdf.items()))
    cumul_ax.plot(curve_x, curve_y, "-", color="darkslategray")
    cumul_ax.set_ylabel("{0} above read length".format(tag))

    half_bar = bar_width / 2
    for axis in (count_ax, cumul_ax):
        set_ticklabels_helvetica(axis)
        axis.set_xlim((xmin - half_bar, xmax + half_bar))

    # Summary lines, echoed to stderr and stacked downwards from the top-right
    # corner of the plot (axes coordinates).
    summary_lines = (
        "Total bases: {0}".format(hsize),
        "Total reads: {0}".format(thousands(len(read_sizes))),
        "Average read length: {0}bp".format(thousands(np.mean(lengths))),
        "Median read length: {0}bp".format(thousands(np.median(lengths))),
        "N50 read length: {0}bp".format(thousands(l50)),
    )
    text_x, text_y = 0.95, 0.95
    for line in summary_lines:
        print(line, file=sys.stderr)
        count_ax.text(
            text_x,
            text_y,
            line,
            color="gray",
            transform=count_ax.transAxes,
            ha="right",
        )
        text_y -= 0.05

    count_ax.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    count_ax.tick_params(
        axis="x",
        direction="out",
        length=3,
        left=False,
        right=False,
        top=False,
        bottom=True,
    )
    count_ax.xaxis.set_major_formatter(human_base_formatter)
    count_ax.yaxis.set_major_formatter(human_formatter)
    savefig(read_sizes.filename + ".pdf")
def expand(args):
    """
    %prog expand bes.fasta reads.fastq

    Expand sequences using short reads. Useful, for example for getting BAC-end
    sequences. The template to use, in `bes.fasta` may just contain the junction
    sequences, then align the reads to get the 'flanks' for such sequences.
    """
    # The docstring is passed to OptionParser at runtime, so its wording is
    # left untouched.
    import math

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast

    p = OptionParser(expand.__doc__)
    p.set_depth(depth=200)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bes, reads = args
    size = Fasta(bes).totalsize
    rl = readlen([reads])
    # Template size plus one read length on each side, converted into a read
    # count at the requested depth and rounded up to a multiple of 1000.
    expected_size = size + 2 * rl
    nreads = expected_size * opts.depth / rl
    nreads = int(math.ceil(nreads / 1000.)) * 1000

    # Attract reads
    samfile, logfile = align([bes, reads, "--reorder", "--mapped",
                              "--firstN={0}".format(opts.firstN)])

    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))

    # Derive short prefixes from the file names (text before the first "."
    # and, for the reads, before the first "-").
    pf = mapped.split(".")[0]
    pf = pf.split("-")[0]
    bespf = bes.split(".")[0]
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])

    # Perform mini-assembly
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])

    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        cmd = "runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile)
        sh(cmd)
    assert op.exists(contigs)

    # Annotate contigs
    blastfile = blast([bes, contigs])
    mapping = {query: b for query, b in Blast(blastfile).iter_best_hit()}

    f = Fasta(contigs, lazy=True)
    annotatedfasta = ".".join((pf, bespf, "fasta"))
    keys = list(Fasta(bes).iterkeys_ordered())  # keep an ordered list
    recs = []
    for _, v in f.iteritems_ordered():
        vid = v.id
        if vid not in mapping:
            continue
        b = mapping[vid]
        subject = b.subject
        # Flip contigs whose best hit is on the minus strand.
        rec = v.reverse_complement() if b.orientation == '-' else v
        rec.id = rid = "_".join((pf, vid, subject))
        rec.description = ""
        # The leading index lets the sort below follow the order of `keys`.
        recs.append((keys.index(subject), rid, rec))

    recs = [x[-1] for x in sorted(recs)]
    # Context manager closes the output handle even when writing fails.
    with open(annotatedfasta, "w") as fw:
        SeqIO.write(recs, fw, "fasta")

    # Hand the intermediate files to FileShredder.
    FileShredder([samfile, logfile, mapped, reads, fastafile, qualfile,
                  blastfile, pf])
    logging.debug(
        "Annotated seqs (n={0}) written to `{1}`.".format(
            len(recs), annotatedfasta
        )
    )

    return annotatedfasta
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    # The docstring is passed to OptionParser at runtime, so its wording is
    # left untouched.
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=96, pctcov=0)

    # All four switches are plain store_true flags defaulting to False.
    switches = (
        ("--fast", "Place sequence in the first cluster"),
        ("--consensus", "Compute consensus sequences"),
        ("--reads", "Use `cd-hit-454` to deduplicate"),
        ("--samestrand", "Enforce same strand alignment"),
    )
    for flag, description in switches:
        p.add_option(flag, default=False, action="store_true", help=description)

    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    source = args[0]
    identity = opts.pctid / 100.0
    fastafile, _qualfile = fasta([source, "--seqtk"])

    # Pick the executable, then append its switches one by one.
    if opts.reads:
        ocmd = "cd-hit-454"
    else:
        ocmd = "cd-hit-est"
    cmd = op.join(opts.cdhit_home, ocmd)
    cmd += " -c {0}".format(identity)
    if ocmd == "cd-hit-est":
        cmd += " -d 0"  # include complete defline
        if opts.samestrand:
            cmd += " -r 0"
    if not opts.fast:
        cmd += " -g 1"
    if opts.pctcov != 0:
        coverage = opts.pctcov / 100.0
        cmd += " -aL {0} -aS {0}".format(coverage)

    dd = fastafile + ".P{0}.cdhit".format(opts.pctid)
    clstr = dd + ".clstr"
    cmd += " -M 0 -T {0} -i {1} -o {2}".format(opts.cpus, fastafile, dd)

    # Only run when need_update() reports the outputs as out of date.
    if need_update(fastafile, (dd, clstr)):
        sh(cmd)

    if opts.consensus:
        cons = dd + ".consensus"
        consensus_args = " clustfile={0} fastafile={1} output={2} maxlen=1".format(
            clstr, fastafile, cons
        )
        cmd = op.join(opts.cdhit_home, "cdhit-cluster-consensus") + consensus_args
        if need_update((clstr, fastafile), cons):
            sh(cmd)

    return dd