def loghistogram(data, base=2, ascii=True, title="Counts", summary=False):
    """
    Plot an ASCII histogram of `data` binned by order of magnitude.

    Each value d is counted in the bin keyed by int(log(d, base)); the bin is
    labelled with the pair (base ** key, base ** (key + 1)). When `summary` is
    set, a "Unique" line built from len(data) and sum(data) is written to
    stderr before the plot.

    `ascii` is accepted for backward compatibility but is not consulted here;
    the plot always goes through asciiplot().
    """
    from jcvi.utils.cbook import percentage

    if summary:
        unique = len(data)
        total = sum(data)

        # Print out a distribution
        print("Unique: {0}".format(percentage(unique, total)), file=sys.stderr)

    bins = defaultdict(int)
    for d in data:
        logd = int(log(d, base))
        # log() can land just below an exact power (log(1000, 10) evaluates to
        # 2.9999999999999996), which would file the value one bin too low and
        # contradict the (lb, ub) label computed below.
        if base ** (logd + 1) <= d:
            logd += 1
        bins[logd] += 1

    x, y = [], []
    for size, number in sorted(bins.items()):
        lb, ub = base ** size, base ** (size + 1)
        x.append((lb, ub))
        y.append(number)

    asciiplot(x, y, title=title)
def stem_leaf_plot(data, vmin, vmax, bins, digit=1, title=None):
    """
    Generate stem and leaf plot given a collection of numbers.

    The interval from `vmin` to `vmax` is divided into about `bins` steps
    (whole-number steps when vmax - vmin is an int, never a zero step), the
    data are counted with np.histogram, and the counts are drawn with
    asciiplot(). Returns a tuple (left bin edges, counts).
    """
    assert bins > 0
    # `span` rather than `range`, so the builtin is not shadowed
    span = vmax - vmin
    step = span * 1.0 / bins
    if isinstance(span, int):
        step = int(ceil(step))

    # A zero-width span would otherwise give np.arange a zero step
    step = step or 1

    bins = np.arange(vmin, vmax + step, step)
    hist, bin_edges = np.histogram(data, bins=bins)
    # np.histogram returns len(hist) + 1 edges; keep only the left edge of
    # each bin so the x and y series handed to asciiplot() have equal length.
    bin_edges = bin_edges[: len(hist)]
    asciiplot(bin_edges, hist, digit=digit, title=title)
    print("Last bin ends in {0}, inclusive.".format(vmax), file=sys.stderr)

    return bin_edges, hist
def stem_leaf_plot(data, vmin, vmax, bins, digit=1, title=None):
    """
    Draw an ASCII stem and leaf plot for a collection of numbers.

    The span vmax - vmin is cut into `bins` steps; for an int span the step is
    rounded up to a whole number, and a zero step is replaced by 1. The data
    are counted with np.histogram and plotted through asciiplot(). Returns
    the tuple (left bin edges, counts).
    """
    assert bins > 0
    span = vmax - vmin
    width = span * 1.0 / bins
    if isinstance(span, int):
        # Integer spans get whole-number bin widths
        width = int(ceil(width))
    if not width:
        width = 1

    edges = np.arange(vmin, vmax + width, width)
    counts, edges = np.histogram(data, bins=edges)
    # By default np.histogram yields one more edge than counts; drop the last
    left_edges = edges[: len(counts)]
    asciiplot(left_edges, counts, digit=digit, title=title)
    print("Last bin ends in {0}, inclusive.".format(vmax), file=sys.stderr)

    return left_edges, counts
def model(args):
    """
    %prog model erate

    Model kmer distribution given error rate. See derivation in FIONA paper:
    <http://bioinformatics.oxfordjournals.org/content/30/17/i356.full>
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from scipy.stats import binom, poisson

    p = OptionParser(model.__doc__)
    p.add_option("-k", default=23, type="int", help="Kmer size")
    p.add_option("--cov", default=50, type="int", help="Expected coverage")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (erate,) = args
    erate = float(erate)
    cov = opts.cov
    k = opts.k

    # Neither term depends on the observed coverage c, so evaluate them once
    # per error count i instead of once per (c, i) pair:
    #   pi_i - probability of having exactly i errors
    #   mu_i - expected coverage of kmer with exactly i errors
    error_terms = [
        (binom.pmf(i, k, erate), cov * (erate / 3) ** i * (1 - erate) ** (k - i))
        for i in range(k + 1)
    ]

    xy = []
    # Range include c although it is unclear what it means to have c=0
    for c in range(0, cov * 2 + 1):
        Prob_Yk = 0
        # Sum i over 0, 1, ... up to k errors
        for pi_i, mu_i in error_terms:
            # poisson.pmf: probability of seeing coverage of c
            Prob_Yk += pi_i * poisson.pmf(c, mu_i)
        xy.append((c, Prob_Yk))

    x, y = zip(*xy)
    asciiplot(x, y, title="Model")
def model(args):
    """
    %prog model erate

    Model kmer distribution given error rate. See derivation in FIONA paper:
    <http://bioinformatics.oxfordjournals.org/content/30/17/i356.full>
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from scipy.stats import binom, poisson

    p = OptionParser(model.__doc__)
    p.add_option("-k", default=23, type="int", help="Kmer size")
    p.add_option("--cov", default=50, type="int", help="Expected coverage")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (erate,) = args
    erate = float(erate)
    cov = opts.cov
    k = opts.k

    # Per-error-count terms are independent of the coverage c, so they are
    # computed once up front (range() replaces the Python-2-only xrange()):
    #   pi_i - probability of having exactly i errors
    #   mu_i - expected coverage of kmer with exactly i errors
    per_error = []
    for i in range(k + 1):
        pi_i = binom.pmf(i, k, erate)
        mu_i = cov * (erate / 3) ** i * (1 - erate) ** (k - i)
        per_error.append((pi_i, mu_i))

    xy = []
    # Range include c although it is unclear what it means to have c=0
    for c in range(0, cov * 2 + 1):
        Prob_Yk = 0
        # Sum i over 0, 1, ... up to k errors
        for pi_i, mu_i in per_error:
            # Probability of seeing coverage of c
            Prob_Yk_i = poisson.pmf(c, mu_i)
            Prob_Yk += pi_i * Prob_Yk_i
        xy.append((c, Prob_Yk))

    x, y = zip(*xy)
    asciiplot(x, y, title="Model")
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution. species is only used
    to annotate the graphic; K is the K-mer size passed to the spectrum
    analysis and shown in the messages.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    import os.path as op

    p = OptionParser(histogram.__doc__)
    p.add_option(
        "--vmin",
        dest="vmin",
        default=1,
        type="int",
        help="minimum value, inclusive",
    )
    p.add_option(
        "--vmax",
        dest="vmax",
        default=100,
        type="int",
        help="maximum value, inclusive",
    )
    p.add_option(
        "--pdf",
        default=False,
        action="store_true",
        help="Print PDF instead of ASCII plot",
    )
    p.add_option(
        "--method",
        choices=("nbinom", "allpaths"),
        default="nbinom",
        help="'nbinom' - slow but more accurate for het or polyploid genome; 'allpaths' - fast and works for homozygous genomes",
    )
    p.add_option(
        "--maxiter",
        default=100,
        type="int",
        help="Max iterations for optimization. Only used with --method nbinom",
    )
    p.add_option("--coverage", default=0, type="int", help="Kmer coverage [default: auto]")
    p.add_option(
        "--nopeaks",
        default=False,
        action="store_true",
        help="Do not annotate K-mer peaks",
    )
    opts, args, iopts = p.set_image_options(args, figsize="7x7")

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    method = opts.method
    vmin, vmax = opts.vmin, opts.vmax
    ascii = not opts.pdf
    peaks = not opts.nopeaks and method == "allpaths"
    N = int(N)

    # A meryl index is converted to a histogram file first
    if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = merylhistogram(histfile)

    ks = KmerSpectrum(histfile)
    method_info = ks.analyze(K=N, maxiter=opts.maxiter, method=method)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # A user-supplied --coverage overrides the estimate from the analysis
    Kmer_coverage = ks.lambda_ if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1:.1f}x".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f} Mb".format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    x, y = ks.get_xy(vmin, vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (iopts.w, iopts.h))
    plt.bar(x, y, fc="#b2df8a", lw=0)
    # Plot the negative binomial fit
    if method == "nbinom":
        generative_model = method_info["generative_model"]
        GG = method_info["Gbins"]
        ll = method_info["lambda"]
        rr = method_info["rho"]
        kf_range = method_info["kf_range"]
        stacked = generative_model(GG, ll, rr)
        plt.plot(
            kf_range,
            stacked,
            ":",
            color="#6a3d9a",
            lw=2,
        )

    ax = plt.gca()

    if peaks:  # Only works for method 'allpaths'
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(cov, cnt) for cov, cnt in ks.counts if cov in t]
        if tcounts:
            peak_x, peak_y = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(peak_x, peak_y, "ko", lw=3, mec="k", mfc="w")
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak")
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    ymin, ymax = ax.get_ylim()
    # Leave headroom above the tallest bar for the text annotations
    ymax = ymax * 7 / 6
    if method == "nbinom":
        # Plot multiple CN locations, CN1, CN2, ... up to ploidy
        cn_color = "#a6cee3"
        for i in range(1, ks.ploidy + 1):
            cn_x = i * ks.lambda_
            plt.plot((cn_x, cn_x), (0, ymax), "-.", color=cn_color)
            plt.text(
                cn_x,
                ymax * 0.95,
                "CN{}".format(i),
                ha="right",
                va="center",
                color=cn_color,
                rotation=90,
            )

    messages = [
        Total_Kmers_msg,
        Kmer_coverage_msg,
        Genome_size_msg,
        Repetitive_msg,
        SNPrate_msg,
    ]
    if method == "nbinom":
        messages += [ks.ploidy_message] + ks.copy_messages
    write_messages(ax, messages)

    ax.set_title(markup(title))
    ax.set_xlim((0, vmax))
    ax.set_ylim((0, ymax))
    adjust_spines(ax, ["left", "bottom"], outward=True)
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    set_human_axis(ax)

    # Strip extensions from the file name only; splitting the whole path at
    # its first "." turns "./x.histogram" into an empty prefix.
    folder, filename = op.split(histfile)
    imagename = op.join(folder, filename.split(".")[0] + "." + iopts.format)
    savefig(imagename, dpi=100)

    return Genome_size
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution. species is only used
    to annotate the graphic; K is the K-mer size passed to the spectrum
    analysis and shown in the messages. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    import os.path as op

    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf", default=False, action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage", default=0, type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii = not opts.pdf
    peaks = not opts.nopeaks
    N = int(N)

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # A user-supplied --coverage overrides the main peak found by analyze()
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    if peaks:
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(cov, cnt) for cov, cnt in ks.counts if cov in t]
        # zip(*[]) cannot be unpacked into two names, so skip the markers
        # altogether when none of the turning points is present
        if tcounts:
            peak_x, peak_y = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(peak_x, peak_y, "ko", lw=2, mec="k", mfc="w")
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    # Stack the summary lines in the top-right corner, in axes coordinates
    tc = "gray"
    axt = ax.transAxes
    messages = (
        Total_Kmers_msg,
        Kmer_coverage_msg,
        Genome_size_msg,
        Repetitive_msg,
        SNPrate_msg,
    )
    for ypos, msg in zip((0.95, 0.9, 0.85, 0.8, 0.75), messages):
        ax.text(0.95, ypos, msg, color=tc, transform=axt, ha="right")

    ymin, ymax = ax.get_ylim()
    # Leave headroom above the curve for the text annotations
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color="r")
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color="r")
    ax.set_ylabel(ylabel, color="r")
    set_human_axis(ax)

    # Strip extensions from the file name only; splitting the whole path at
    # its first "." turns "./x.histogram" into an empty prefix.
    folder, filename = op.split(histfile)
    imagename = op.join(folder, filename.split(".")[0] + ".pdf")
    savefig(imagename, dpi=100)

    return Genome_size
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution. species is only used
    to annotate the graphic; K is the K-mer size passed to the spectrum
    analysis and shown in the messages. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    import os.path as op

    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf", default=False, action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage", default=0, type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii = not opts.pdf
    peaks = not opts.nopeaks
    N = int(N)

    # A meryl index is converted to a histogram file first
    if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = meryl([histfile])

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # A user-supplied --coverage overrides the main peak found by analyze()
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    if peaks:
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(cov, cnt) for cov, cnt in ks.counts if cov in t]
        if tcounts:
            peak_x, peak_y = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(peak_x, peak_y, "ko", lw=2, mec="k", mfc="w")
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    messages = [
        Total_Kmers_msg,
        Kmer_coverage_msg,
        Genome_size_msg,
        Repetitive_msg,
        SNPrate_msg,
    ]
    write_messages(ax, messages)

    ymin, ymax = ax.get_ylim()
    # Leave headroom above the curve for the text annotations
    ymax = ymax * 7 / 6

    ax.set_title(markup(title))
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    set_human_axis(ax)

    # Strip extensions from the file name only; splitting the whole path at
    # its first "." turns "./x.histogram" into an empty prefix.
    folder, filename = op.split(histfile)
    imagename = op.join(folder, filename.split(".")[0] + ".pdf")
    savefig(imagename, dpi=100)

    return Genome_size
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution. species is only used
    to annotate the graphic; K is the K-mer size passed to the spectrum
    analysis and shown in the messages. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    import os.path as op

    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf", default=False, action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage", default=0, type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    N = int(N)
    KMERYL, KSOAP, KALLPATHS = range(3)
    kformats = ("Meryl", "Soap", "AllPaths")
    kformat = KMERYL

    ascii = not opts.pdf
    peaks = not opts.nopeaks
    hist = {}
    totalKmers = 0
    data = []

    # The context manager closes the handle even if parsing fails
    with open(histfile) as fp:
        # Guess the format of the Kmer histogram
        for row in fp:
            if row.startswith("# 1:"):
                kformat = KALLPATHS
                break
            if len(row.split()) == 1:
                kformat = KSOAP
                break
        fp.seek(0)

        logging.debug("Guessed format: {0}".format(kformats[kformat]))

        for rowno, row in enumerate(fp):
            # Comment lines and blank lines (e.g. a trailing newline) carry
            # no counts; enumerate() still advances so the row-number based
            # K of the Soap format is unaffected.
            if row.startswith("#") or not row.strip():
                continue
            if kformat == KSOAP:
                K = rowno + 1
                counts = int(row.strip())
            else:  # meryl histogram
                K, counts = row.split()[:2]
                K, counts = int(K), int(counts)

            Kcounts = K * counts
            totalKmers += Kcounts
            hist[K] = Kcounts
            data.append((K, counts))

    covmax = 1000000
    ks = KmerSpectrum(data)
    ks.analyze(K=N, covmax=covmax)

    Total_Kmers = int(totalKmers)
    coverage = opts.coverage
    # A user-supplied --coverage overrides the main peak found by analyze()
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    counts = sorted((a, b) for a, b in hist.items() if opts.vmin <= a <= opts.vmax)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
    tcounts = [(a, b) for a, b in counts if a in t]
    # `counts` is restricted to [vmin, vmax], so some or all turning points
    # may be missing; zip(*[]) cannot be unpacked into two names.
    if tcounts:
        peak_x, peak_y = zip(*tcounts)
        plt.plot(peak_x, peak_y, "ko", lw=2, mec="k", mfc="w")
    tcounts = dict(tcounts)

    if peaks:
        # Only label the peaks that survived the [vmin, vmax] filter
        if ks.max1 in tcounts:
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
        if ks.max2 in tcounts:
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    # Stack the summary lines in the top-right corner, in axes coordinates
    tc = "gray"
    axt = ax.transAxes
    messages = (
        Total_Kmers_msg,
        Kmer_coverage_msg,
        Genome_size_msg,
        Repetitive_msg,
        SNPrate_msg,
    )
    for ypos, msg in zip((0.95, 0.9, 0.85, 0.8, 0.75), messages):
        ax.text(0.95, ypos, msg, color=tc, transform=axt, ha="right")

    ymin, ymax = ax.get_ylim()
    # Leave headroom above the curve for the text annotations
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color="r")
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color="r")
    ax.set_ylabel(ylabel, color="r")
    set_human_axis(ax)

    # Strip extensions from the file name only; splitting the whole path at
    # its first "." turns "./x.histogram" into an empty prefix.
    folder, filename = op.split(histfile)
    imagename = op.join(folder, filename.split(".")[0] + ".pdf")
    savefig(imagename, dpi=100)

    return Genome_size
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution. species is only used
    to annotate the graphic; K is the K-mer size passed to the spectrum
    analysis and shown in the messages.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    import os.path as op

    p = OptionParser(histogram.__doc__)
    p.add_option(
        "--vmin",
        dest="vmin",
        default=1,
        type="int",
        help="minimum value, inclusive",
    )
    p.add_option(
        "--vmax",
        dest="vmax",
        default=100,
        type="int",
        help="maximum value, inclusive",
    )
    p.add_option(
        "--pdf",
        default=False,
        action="store_true",
        help="Print PDF instead of ASCII plot",
    )
    p.add_option("--coverage", default=0, type="int", help="Kmer coverage [default: auto]")
    p.add_option(
        "--nopeaks",
        default=False,
        action="store_true",
        help="Do not annotate K-mer peaks",
    )
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii = not opts.pdf
    peaks = not opts.nopeaks
    N = int(N)

    # A meryl index is converted to a histogram file first
    if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = merylhistogram(histfile)

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # A user-supplied --coverage overrides the main peak found by analyze()
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    if peaks:
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(cov, cnt) for cov, cnt in ks.counts if cov in t]
        if tcounts:
            peak_x, peak_y = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(peak_x, peak_y, "ko", lw=2, mec="k", mfc="w")
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    messages = [
        Total_Kmers_msg,
        Kmer_coverage_msg,
        Genome_size_msg,
        Repetitive_msg,
        SNPrate_msg,
    ]
    write_messages(ax, messages)

    ymin, ymax = ax.get_ylim()
    # Leave headroom above the curve for the text annotations
    ymax = ymax * 7 / 6

    ax.set_title(markup(title))
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    set_human_axis(ax)

    # Strip extensions from the file name only; splitting the whole path at
    # its first "." turns "./x.histogram" into an empty prefix.
    folder, filename = op.split(histfile)
    imagename = op.join(folder, filename.split(".")[0] + ".pdf")
    savefig(imagename, dpi=100)

    return Genome_size
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution. species is only used
    to annotate the graphic; K is the K-mer size passed to the spectrum
    analysis and shown in the messages. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    # In ASCII mode the function returns whatever asciiplot() returns; in PDF
    # mode it saves the image and returns None.
    import os.path as op

    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf", default=False, action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage", default=0, type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    N = int(N)
    KMERYL, KSOAP, KALLPATHS = range(3)
    kformats = ("Meryl", "Soap", "AllPaths")
    kformat = KMERYL

    ascii = not opts.pdf
    peaks = not opts.nopeaks
    hist = {}
    totalKmers = 0
    data = []

    # The context manager closes the handle even if parsing fails
    with open(histfile) as fp:
        # Guess the format of the Kmer histogram
        for row in fp:
            if row.startswith("# 1:"):
                kformat = KALLPATHS
                break
            if len(row.split()) == 1:
                kformat = KSOAP
                break
        fp.seek(0)

        logging.debug("Guessed format: {0}".format(kformats[kformat]))

        for rowno, row in enumerate(fp):
            # Comment lines and blank lines (e.g. a trailing newline) carry
            # no counts; enumerate() still advances so the row-number based
            # K of the Soap format is unaffected.
            if row.startswith("#") or not row.strip():
                continue
            if kformat == KSOAP:
                K = rowno + 1
                counts = int(row.strip())
            else:  # meryl histogram
                K, counts = row.split()[:2]
                K, counts = int(K), int(counts)

            Kcounts = K * counts
            totalKmers += Kcounts
            hist[K] = Kcounts
            data.append((K, counts))

    covmax = 1000000
    ks = KmerSpectrum(data)
    ks.analyze(K=N, covmax=covmax)

    Total_Kmers = int(totalKmers)
    coverage = opts.coverage
    # A user-supplied --coverage overrides the main peak found by analyze()
    Kmer_coverage = ks.max2 if not coverage else coverage
    # Genome size is kept in megabases here
    Genome_size = Total_Kmers * 1.0 / Kmer_coverage / 1e6

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    counts = sorted((a, b) for a, b in hist.items() if opts.vmin <= a <= opts.vmax)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii:
        return asciiplot(x, y, title=title)

    plt.figure(1, (6, 6))
    plt.plot(x, y, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
    tcounts = [(a, b) for a, b in counts if a in t]
    # `counts` is restricted to [vmin, vmax], so some or all turning points
    # may be missing; zip(*[]) cannot be unpacked into two names.
    if tcounts:
        peak_x, peak_y = zip(*tcounts)
        plt.plot(peak_x, peak_y, "ko", lw=2, mec="k", mfc="w")
    tcounts = dict(tcounts)

    if peaks:
        # Only label the peaks that survived the [vmin, vmax] filter
        if ks.max1 in tcounts:
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
        if ks.max2 in tcounts:
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    # Stack the summary lines in the top-right corner, in axes coordinates
    tc = "gray"
    axt = ax.transAxes
    messages = (
        Total_Kmers_msg,
        Kmer_coverage_msg,
        Genome_size_msg,
        Repetitive_msg,
        SNPrate_msg,
    )
    for ypos, msg in zip((0.95, 0.9, 0.85, 0.8, 0.75), messages):
        ax.text(0.95, ypos, msg, color=tc, transform=axt, ha="right")

    ymin, ymax = ax.get_ylim()
    # Leave headroom above the curve for the text annotations
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color="r")
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color="r")
    ax.set_ylabel(ylabel, color="r")
    set_human_axis(ax)

    # Strip extensions from the file name only; splitting the whole path at
    # its first "." turns "./x.histogram" into an empty prefix.
    folder, filename = op.split(histfile)
    imagename = op.join(folder, filename.split(".")[0] + ".pdf")
    savefig(imagename, dpi=100)
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and K are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    # In ASCII mode the function returns whatever asciiplot() returns; in PDF
    # mode it saves the image and returns None.
    import os.path as op

    p = OptionParser(histogram.__doc__)
    p.add_option("--pdf", default=False, action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii = not opts.pdf
    hist = {}
    totalKmers = 0

    # The context manager closes the handle even if parsing fails
    with open(histfile) as fp:
        # Guess the format of the Kmer histogram: a single-column row means
        # the counts are listed one per line (soap), otherwise "K counts"
        soap = False
        for row in fp:
            if len(row.split()) == 1:
                soap = True
                break
        fp.seek(0)

        for rowno, row in enumerate(fp):
            if soap:
                K = rowno + 1
                counts = int(row.strip())
            else:  # meryl histogram
                K, counts = row.split()[:2]
                K, counts = int(K), int(counts)

            Kcounts = K * counts
            totalKmers += Kcounts
            hist[K] = counts

    # Walk the spectrum until the counts have dropped, risen and dropped
    # again; Ka is then the multiplicity at which the second drop starts.
    Ka = None
    history = ["drop"]
    for a, b in pairwise(sorted(hist.items())):
        Ka, ca = a
        Kb, cb = b
        status = "rise" if ca <= cb else "drop"
        if history[-1] != status:
            history.append(status)
        if history == ["drop", "rise", "drop"]:
            break

    if Ka is None:
        # Fewer than two rows: there is no pair to compare, so no coverage
        # estimate can be derived
        raise ValueError(
            "Need at least two histogram rows in `{0}`".format(histfile)
        )

    Total_Kmers = int(totalKmers)
    Kmer_coverage = Ka
    # Genome size is kept in megabases here
    Genome_size = Total_Kmers * 1.0 / Ka / 1e6

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size)

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    counts = sorted((a, b) for a, b in hist.items() if a <= 100)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii:
        return asciiplot(x, y, title=title)

    plt.figure(1, (6, 6))
    plt.plot(x, y, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    # Summary lines, centred and stacked downwards in axes coordinates
    messages = (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg)
    for ypos, msg in zip((0.9, 0.8, 0.7), messages):
        ax.text(0.5, ypos, _(msg), ha="center", color="b", transform=ax.transAxes)

    ax.set_title(_(title), color="r")
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(_(xlabel), color="r")
    ax.set_ylabel(_(ylabel), color="r")
    set_human_axis(ax)

    # Strip extensions from the file name only; splitting the whole path at
    # its first "." turns "./x.histogram" into an empty prefix.
    folder, filename = op.split(histfile)
    imagename = op.join(folder, filename.split(".")[0] + ".pdf")
    plt.savefig(imagename, dpi=100)
    print("Image saved to `{0}`.".format(imagename), file=sys.stderr)