def main():
    """
    %prog numbers1.txt number2.txt ...

    Print histogram of the data files. The data files contain one number per
    line. If more than one file is inputted, the program will combine the
    histograms into the same plot.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so editing it changes what the command line prints.
    allowed_format = ("emf", "eps", "pdf", "png", "ps", "raw", "rgba", "svg", "svgz")

    p = OptionParser(main.__doc__)
    p.add_option(
        "--skip",
        default=0,
        type="int",
        help="skip the first several lines [default: %default]",
    )
    # Presumably registers the vmin/vmax/bins/xlabel/title options read below.
    p.set_histogram()
    p.add_option(
        "--tags",
        dest="tags",
        default=None,
        help="tags for data if multiple input files, comma sep",
    )
    p.add_option(
        "--ascii",
        default=False,
        action="store_true",
        help="print ASCII text stem-leaf plot [default: %default]",
    )
    p.add_option(
        "--base",
        default="0",
        choices=("0", "2", "10"),
        help="use logarithm axis with base, 0 to disable [default: %default]",
    )
    p.add_option(
        "--facet",
        default=False,
        action="store_true",
        help="place multiple histograms side-by-side [default: %default]",
    )
    p.add_option(
        "--fill", default="white", help="color of the bin [default: %default]"
    )
    p.add_option(
        "--format",
        default="pdf",
        choices=allowed_format,
        help="Generate image of format [default: %default]",
    )
    p.add_option(
        "--quick",
        default=False,
        action="store_true",
        help="Use quick plot, assuming bins are already counted",
    )
    # The flag *disables* the stats (print_stats=not opts.noprintstats below),
    # so the help text says so.
    p.add_option(
        "--noprintstats",
        default=False,
        action="store_true",
        help="Do not write basic stats when using --quick",
    )
    opts, args = p.parse_args()

    # At least one data file is required; otherwise show usage and exit.
    if len(args) < 1:
        sys.exit(not p.print_help())

    skip = opts.skip
    vmin, vmax = opts.vmin, opts.vmax
    bins = opts.bins
    xlabel, title = opts.xlabel, opts.title
    # Fall back to the first file name when no title was given.
    title = title or args[0]
    base = int(opts.base)
    fileno = len(args)

    if opts.quick:
        # Explicit check instead of `assert`, which is stripped under
        # `python -O` and would let several files through silently.
        if fileno != 1:
            sys.exit("Single input file expected using --quick")
        filename = args[0]
        # NOTE(review): the quick path always writes a .pdf and does not
        # consult --format; confirm whether that is intended.
        figname = filename.rsplit(".", 1)[0] + ".pdf"
        data = DictFile(filename, keycast=int, cast=int)
        quickplot(
            data,
            vmin,
            vmax,
            xlabel,
            title,
            figname=figname,
            print_stats=(not opts.noprintstats),
        )
        return

    if fileno == 1:
        histogram(
            args[0],
            vmin,
            vmax,
            xlabel,
            title,
            outfmt=opts.format,
            bins=bins,
            skip=skip,
            ascii=opts.ascii,
            base=base,
            fill=opts.fill,
        )
    else:
        # Several inputs are drawn together; --tags and --facet only apply here.
        histogram_multiple(
            args,
            vmin,
            vmax,
            xlabel,
            title,
            outfmt=opts.format,
            tags=opts.tags,
            bins=bins,
            skip=skip,
            ascii=opts.ascii,
            facet=opts.facet,
            fill=opts.fill,
        )
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively. Also adding
    number of reads, average/median, N50, and total length.
    """
    # The docstring above is passed to OptionParser as the usage text, so its
    # wording is part of the program's output.
    from jcvi.utils.cbook import SUFFIXES, human_size, thousands
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import (
        human_base_formatter,
        human_formatter,
        markup,
        plt,
        savefig,
        set2,
        set_ticklabels_helvetica,
    )

    parser = OptionParser(histogram.__doc__)
    parser.set_histogram(
        vmax=50000, bins=100, xlabel="Read length", title="Read length distribution"
    )
    parser.add_option(
        "--ylabel1", default="Counts", help="Label of y-axis on the left"
    )
    color_choices = [str(idx) for idx in range(8)]
    parser.add_option(
        "--color",
        default="0",
        choices=color_choices,
        help="Color of bars, which is an index 0-7 in brewer set2",
    )
    opts, args, iopts = parser.set_image_options(args, figsize="6x6", style="dark")

    # Exactly one reads file is expected; otherwise show usage and exit.
    if len(args) != 1:
        sys.exit(not parser.print_help())

    # fasta() hands back a (fasta, qual) pair; only the first is used here.
    reads_fasta, _qualfile = fasta([args[0], "--seqtk"])
    sizes = Sizes(reads_fasta)
    lengths = sorted(sizes.sizes)
    lo, hi, nbins = opts.vmin, opts.vmax, opts.bins
    bar_x, bar_h = stem_leaf_plot(lengths, lo, hi, nbins)

    # Left axis: bar chart of read counts per bin.
    plt.figure(1, (iopts.w, iopts.h))
    count_ax = plt.gca()
    bar_width = (hi - lo) * 0.5 / nbins
    count_ax.bar(
        bar_x,
        bar_h,
        width=bar_width,
        linewidth=0,
        fc=set2[int(opts.color)],
        align="center",
    )
    count_ax.set_xlabel(markup(opts.xlabel))
    count_ax.set_ylabel(opts.ylabel1)

    # Right axis: bases contained in reads at least as long as x.
    cum_ax = count_ax.twinx()
    total_bases, l50, _n50 = sizes.summary
    total_label = human_size(total_bases)
    # The last two characters of the human-readable size are used as the unit.
    suffix = total_label[-2:]
    scale = 1000 ** SUFFIXES[1000].index(suffix)

    bases_above = {}
    consumed = 0
    for length in lengths:
        # lengths is ascending, so on the first sighting of a length,
        # `consumed` holds the bases of all strictly shorter reads.
        if length not in bases_above:
            bases_above[length] = (total_bases - consumed) * 1.0 / scale
        consumed += length
    curve_x, curve_y = zip(*sorted(bases_above.items()))
    cum_ax.plot(curve_x, curve_y, "-", color="darkslategray")
    cum_ax.set_ylabel("{0} above read length".format(suffix))

    # Pad the x-range by half a bar so the edge bars are not clipped.
    half_bar = bar_width / 2
    x_limits = (lo - half_bar, hi + half_bar)
    for axis in (count_ax, cum_ax):
        set_ticklabels_helvetica(axis)
        axis.set_xlim(x_limits)

    # Summary lines are echoed to stderr and stacked in the top-right corner.
    stat_lines = (
        "Total bases: {0}".format(total_label),
        "Total reads: {0}".format(thousands(len(sizes))),
        "Average read length: {0}bp".format(thousands(np.mean(lengths))),
        "Median read length: {0}bp".format(thousands(np.median(lengths))),
        "N50 read length: {0}bp".format(thousands(l50)),
    )
    text_x, text_y = 0.95, 0.95
    for line in stat_lines:
        print(line, file=sys.stderr)
        count_ax.text(
            text_x,
            text_y,
            line,
            color="gray",
            transform=count_ax.transAxes,
            ha="right",
        )
        text_y -= 0.05

    count_ax.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    count_ax.tick_params(
        axis="x",
        direction="out",
        length=3,
        left=False,
        right=False,
        top=False,
        bottom=True,
    )
    count_ax.xaxis.set_major_formatter(human_base_formatter)
    count_ax.yaxis.set_major_formatter(human_formatter)

    savefig(sizes.filename + ".pdf")
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively. Also adding
    number of reads, average/median, N50, and total length.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so editing it changes what the command line prints.
    from jcvi.utils.cbook import human_size, thousands, SUFFIXES
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import (
        plt,
        markup,
        human_formatter,
        human_base_formatter,
        savefig,
        set2,
        set_ticklabels_helvetica,
    )

    p = OptionParser(histogram.__doc__)
    p.set_histogram(
        vmax=50000, bins=100, xlabel="Read length", title="Read length distribution"
    )
    p.add_option("--ylabel1", default="Counts", help="Label of y-axis on the left")
    p.add_option(
        "--color",
        default="0",
        choices=[str(x) for x in range(8)],
        help="Color of bars, which is an index 0-7 in brewer set2",
    )
    opts, args, iopts = p.set_image_options(args, figsize="6x6", style="dark")

    # Exactly one reads file is expected; otherwise show usage and exit.
    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    # fasta() hands back a (fasta, qual) pair; the second item is not needed.
    fastafile, _ = fasta([fastafile, "--seqtk"])
    sizes = Sizes(fastafile)
    all_sizes = sorted(sizes.sizes)
    xmin, xmax, bins = opts.vmin, opts.vmax, opts.bins
    left, height = stem_leaf_plot(all_sizes, xmin, xmax, bins)

    # Left axis: bar chart of read counts per bin.
    plt.figure(1, (iopts.w, iopts.h))
    ax1 = plt.gca()

    width = (xmax - xmin) * 0.5 / bins
    color = set2[int(opts.color)]
    ax1.bar(left, height, width=width, linewidth=0, fc=color, align="center")
    ax1.set_xlabel(markup(opts.xlabel))
    ax1.set_ylabel(opts.ylabel1)

    # Right axis: bases contained in reads at least as long as x.
    ax2 = ax1.twinx()
    cur_size = 0
    total_size, l50, _ = sizes.summary
    cdf = {}
    hsize = human_size(total_size)
    # The last two characters of the human-readable size are used as the unit.
    tag = hsize[-2:]
    unit = 1000 ** SUFFIXES[1000].index(tag)

    for x in all_sizes:
        # all_sizes is ascending, so on the first sighting of a length,
        # cur_size holds the bases of all strictly shorter reads.
        if x not in cdf:
            cdf[x] = (total_size - cur_size) * 1.0 / unit
        cur_size += x
    x, y = zip(*sorted(cdf.items()))
    ax2.plot(x, y, "-", color="darkslategray")
    ylabel2 = "{0} above read length".format(tag)
    ax2.set_ylabel(ylabel2)

    # Pad the x-range by half a bar so the edge bars are not clipped.
    for ax in (ax1, ax2):
        set_ticklabels_helvetica(ax)
        ax.set_xlim((xmin - width / 2, xmax + width / 2))

    tc = "gray"
    axt = ax1.transAxes
    xx, yy = 0.95, 0.95
    ma = "Total bases: {0}".format(hsize)
    mb = "Total reads: {0}".format(thousands(len(sizes)))
    mc = "Average read length: {0}bp".format(thousands(np.mean(all_sizes)))
    md = "Median read length: {0}bp".format(thousands(np.median(all_sizes)))
    me = "N50 read length: {0}bp".format(thousands(l50))
    for t in (ma, mb, mc, md, me):
        # print() call form: the Python 2 `print >> sys.stderr, t` statement
        # raises TypeError at runtime on Python 3.
        print(t, file=sys.stderr)
        ax1.text(xx, yy, t, color=tc, transform=axt, ha="right")
        yy -= 0.05

    ax1.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    ax1.tick_params(
        axis="x",
        direction="out",
        length=3,
        left=False,
        right=False,
        top=False,
        bottom=True,
    )
    ax1.xaxis.set_major_formatter(human_base_formatter)
    ax1.yaxis.set_major_formatter(human_formatter)

    figname = sizes.filename + ".pdf"
    savefig(figname)
def main():
    """
    %prog numbers1.txt number2.txt ...

    Print histogram of the data files. The data files contain one number per
    line. If more than one file is inputted, the program will combine the
    histograms into the same plot.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so editing it changes what the command line prints.
    allowed_format = ("emf", "eps", "pdf", "png", "ps", "raw", "rgba", "svg", "svgz")

    p = OptionParser(main.__doc__)
    p.add_option("--skip", default=0, type="int", help="skip the first several lines")
    p.add_option("--col", default=0, type="int", help="Get the n-th column")
    # Presumably registers the vmin/vmax/bins/xlabel/title options read below.
    p.set_histogram()
    p.add_option(
        "--tags",
        dest="tags",
        default=None,
        help="tags for data if multiple input files, comma sep",
    )
    p.add_option(
        "--ascii",
        default=False,
        action="store_true",
        help="print ASCII text stem-leaf plot",
    )
    p.add_option(
        "--base",
        default="0",
        choices=("0", "2", "10"),
        help="use logarithm axis with base, 0 to disable",
    )
    p.add_option(
        "--facet",
        default=False,
        action="store_true",
        help="place multiple histograms side-by-side",
    )
    p.add_option("--fill", default="white", help="color of the bin")
    p.add_option(
        "--format",
        default="pdf",
        choices=allowed_format,
        help="Generate image of format",
    )
    p.add_option(
        "--quick",
        default=False,
        action="store_true",
        help="Use quick plot, assuming bins are already counted",
    )
    # The flag *disables* the stats (print_stats=not opts.noprintstats below),
    # so the help text says so.
    p.add_option(
        "--noprintstats",
        default=False,
        action="store_true",
        help="Do not write basic stats when using --quick",
    )
    opts, args = p.parse_args()

    # At least one data file is required; otherwise show usage and exit.
    if len(args) < 1:
        sys.exit(not p.print_help())

    skip = opts.skip
    vmin, vmax = opts.vmin, opts.vmax
    bins = opts.bins
    xlabel, title = opts.xlabel, opts.title
    # Fall back to the first file name when no title was given.
    title = title or args[0]
    base = int(opts.base)
    fileno = len(args)

    if opts.quick:
        # Explicit check instead of `assert`, which is stripped under
        # `python -O` and would let several files through silently.
        if fileno != 1:
            sys.exit("Single input file expected using --quick")
        filename = args[0]
        # NOTE(review): the quick path always writes a .pdf and does not
        # consult --format; confirm whether that is intended.
        figname = filename.rsplit(".", 1)[0] + ".pdf"
        data = DictFile(filename, keycast=int, cast=int)
        quickplot(
            data,
            vmin,
            vmax,
            xlabel,
            title,
            figname=figname,
            print_stats=(not opts.noprintstats),
        )
        return

    if fileno == 1:
        # --col is only forwarded on the single-file path.
        histogram(
            args[0],
            vmin,
            vmax,
            xlabel,
            title,
            outfmt=opts.format,
            bins=bins,
            skip=skip,
            ascii=opts.ascii,
            base=base,
            fill=opts.fill,
            col=opts.col,
        )
    else:
        # Several inputs are drawn together; --tags and --facet only apply here.
        histogram_multiple(
            args,
            vmin,
            vmax,
            xlabel,
            title,
            outfmt=opts.format,
            tags=opts.tags,
            bins=bins,
            skip=skip,
            ascii=opts.ascii,
            facet=opts.facet,
            fill=opts.fill,
        )