Esempio n. 1
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin",
                 dest="vmin",
                 default=1,
                 type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax",
                 dest="vmax",
                 default=100,
                 type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf",
                 default=False,
                 action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage",
                 default=0,
                 type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks",
                 default=False,
                 action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii = not opts.pdf
    peaks = not opts.nopeaks
    N = int(N)

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1. / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".\
                        format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print >> sys.stderr, msg

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    if peaks:
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(x, y) for x, y in ks.counts if x in t]
        x, y = zip(*tcounts)
        tcounts = dict(tcounts)
        plt.plot(x, y, 'ko', lw=2, mec='k', mfc='w')
        ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
        ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    tc = "gray"
    axt = ax.transAxes
    ax.text(.95, .95, Total_Kmers_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .9, Kmer_coverage_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .85, Genome_size_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .8, Repetitive_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .75, SNPrate_msg, color=tc, transform=axt, ha="right")

    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
Esempio n. 2
0
def plot_breaks_and_labels(fig, root, ax, gx, gy, xsize, ysize,
                           qbreaks, sbreaks, sep=True, chrlw=.1,
                           sepcolor="g", minfont=5, stdpf=True):
    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    # Tag to mark whether to plot chr name (skip small ones)
    xchr_labels, ychr_labels = [], []
    th = TextHandler(fig)

    # plot the chromosome breaks
    for (seqid, beg, end) in qbreaks:
        xsize_ratio = abs(end - beg) * .8 / xsize
        fontsize = th.select_fontsize(xsize_ratio)
        seqid = "".join(seqid_parse(seqid, stdpf=stdpf)[:2])

        xchr_labels.append((seqid, (beg + end) / 2, fontsize))
        if sep:
            ax.plot([beg, beg], ylim, "-", lw=chrlw, color=sepcolor)

    for (seqid, beg, end) in sbreaks:
        ysize_ratio = abs(end - beg) * .8 / ysize
        fontsize = th.select_fontsize(ysize_ratio)
        seqid = "".join(seqid_parse(seqid, stdpf=stdpf)[:2])

        ychr_labels.append((seqid, (beg + end) / 2, fontsize))
        if sep:
            ax.plot(xlim, [beg, beg], "-", lw=chrlw, color=sepcolor)

    # plot the chromosome labels
    for label, pos, fontsize in xchr_labels:
        pos = .1 + pos * .8 / xsize
        if fontsize >= minfont:
            root.text(pos, .91, latex(label), size=fontsize,
                ha="center", va="bottom", rotation=45, color="grey")

    # remember y labels are inverted
    for label, pos, fontsize in ychr_labels:
        pos = .9 - pos * .8 / ysize
        if fontsize >= minfont:
            root.text(.91, pos, latex(label), size=fontsize,
                va="center", color="grey")

    # Plot the frame
    ax.plot(xlim, [0, 0], "-", lw=chrlw, color=sepcolor)
    ax.plot(xlim, [ysize, ysize], "-", lw=chrlw, color=sepcolor)
    ax.plot([0, 0], ylim, "-", lw=chrlw, color=sepcolor)
    ax.plot([xsize, xsize], ylim, "-", lw=chrlw, color=sepcolor)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    ax.set_xlabel(gx, size=16)
    ax.set_ylabel(gy, size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
            color='gray', size=10)

    return xlim, ylim
Esempio n. 3
0
def dotplot(anchorfile, qbed, sbed, fig, root, ax, vmin=0, vmax=1,
        is_self=False, synteny=False, cmap_text=None, genomenames=None,
        sample_number=10000, ignore=.005, palette=None, chrlw=.01, title=None):

    fp = open(anchorfile)

    qorder = qbed.order
    sorder = sbed.order

    data = []
    if cmap_text:
        logging.debug("Normalize values to [%.1f, %.1f]" % (vmin, vmax))

    block_id = 0
    for row in fp:
        atoms = row.split()
        block_color = None
        if row[0] == "#":
            block_id += 1
            if palette:
                block_color = palette.get(block_id, "k")
            continue

        # first two columns are query and subject, and an optional third column
        if len(atoms) < 2:
            continue

        query, subject = atoms[:2]
        value = atoms[-1]

        try:
            value = float(value)
        except ValueError:
            value = vmax

        if value < vmin:
            value = vmin
        if value > vmax:
            value = vmax

        if query not in qorder:
            #logging.warning("ignore %s" % query)
            continue
        if subject not in sorder:
            #logging.warning("ignore %s" % subject)
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        nv = vmax - value if block_color is None else block_color
        data.append((qi, si, nv))
        if is_self:  # Mirror image
            data.append((si, qi, nv))

    # only show random subset, default to sample_number = 5000
    if len(data) > sample_number:
        logging.debug("Showing a random subset of %s data points (total %s) " \
                      "for clarity." % (sample_number, len(data)))
        data = sample(data, sample_number)

    # the data are plotted in this order, the least value are plotted
    # last for aesthetics
    if not palette:
        data.sort(key=lambda x: -x[2])

    default_cm = cm.copper
    x, y, c = zip(*data)

    if palette:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)

    else:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0, cmap=default_cm,
                vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=default_cm, reverse=True)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))
    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    xchr_labels, ychr_labels = [], []
    # Tag to mark whether to plot chr name (skip small ones)
    ignore_size_x = ignore_size_y = 0
    if ignore:
        ignore_size_x = xsize * ignore
        ignore_size_y = ysize * ignore

    # plot the chromosome breaks
    for (seqid, beg, end) in qbed.get_breaks():
        ignore = abs(end - beg) < ignore_size_x
        seqid = seqid.split("_")[-1]
        try:
            seqid = int(seqid)
            seqid = "c%d" % seqid
        except:
            pass

        xchr_labels.append((seqid, (beg + end) / 2, ignore))
        ax.plot([beg, beg], ylim, "g-", lw=chrlw)

    for (seqid, beg, end) in sbed.get_breaks():
        ignore = abs(end - beg) < ignore_size_y
        seqid = seqid.split("_")[-1]
        try:
            seqid = int(seqid)
            seqid = "c%d" % seqid
        except:
            pass

        ychr_labels.append((seqid, (beg + end) / 2, ignore))
        ax.plot(xlim, [beg, beg], "g-", lw=chrlw)

    # plot the chromosome labels
    for label, pos, ignore in xchr_labels:
        pos = .1 + pos * .8 / xsize
        if not ignore:
            root.text(pos, .91, label,
                ha="center", va="bottom", rotation=45, color="grey")

    # remember y labels are inverted
    for label, pos, ignore in ychr_labels:
        pos = .9 - pos * .8 / ysize
        if not ignore:
            root.text(.91, pos, label,
                va="center", color="grey")

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        to_ax_label = lambda fname: op.basename(fname).split(".")[0]
        gx, gy = [to_ax_label(x.filename) for x in (qbed, sbed)]
    ax.set_xlabel(gx, size=16)
    ax.set_ylabel(gy, size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
            color='gray', size=10)

    if palette:  # bottom-left has the palette, if available
        colors = palette.colors
        xstart, ystart = .1, .05
        for category, c in sorted(colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0, fc=c))
            root.text(xstart + .04, ystart, category, color=c)
            xstart += .1

    if title:
        fig.suptitle(title, x=.05, y=.98, color="k")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
Esempio n. 4
0
def dotplot(anchorfile,
            qbed,
            sbed,
            fig,
            root,
            ax,
            vmin=0,
            vmax=1,
            is_self=False,
            synteny=False,
            cmap_text=None,
            genomenames=None,
            sample_number=10000,
            minfont=5,
            palette=None,
            chrlw=.01,
            title=None,
            sepcolor="gainsboro"):

    fp = open(anchorfile)

    qorder = qbed.order
    sorder = sbed.order

    data = []
    if cmap_text:
        logging.debug("Normalize values to [%.1f, %.1f]" % (vmin, vmax))

    block_id = 0
    for row in fp:
        atoms = row.split()
        block_color = None
        if row[0] == "#":
            block_id += 1
            if palette:
                block_color = palette.get(block_id, "k")
            continue

        # first two columns are query and subject, and an optional third column
        if len(atoms) < 2:
            continue

        query, subject = atoms[:2]
        value = atoms[-1]

        try:
            value = float(value)
        except ValueError:
            value = vmax

        if value < vmin:
            value = vmin
        if value > vmax:
            value = vmax

        if query not in qorder:
            continue
        if subject not in sorder:
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        nv = vmax - value if block_color is None else block_color
        data.append((qi, si, nv))
        if is_self:  # Mirror image
            data.append((si, qi, nv))

    npairs = len(data)
    # Only show random subset
    if npairs > sample_number:
        logging.debug("Showing a random subset of {0} data points (total {1}) " \
                      "for clarity.".format(sample_number, npairs))
        data = sample(data, sample_number)

    # the data are plotted in this order, the least value are plotted
    # last for aesthetics
    if not palette:
        data.sort(key=lambda x: -x[2])

    default_cm = cm.copper
    x, y, c = zip(*data)

    if palette:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)

    else:
        ax.scatter(x,
                   y,
                   c=c,
                   edgecolors="none",
                   s=2,
                   lw=0,
                   cmap=default_cm,
                   vmin=vmin,
                   vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=default_cm, reverse=True)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))
    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    # Tag to mark whether to plot chr name (skip small ones)
    xchr_labels, ychr_labels = [], []
    th = TextHandler(fig)

    # plot the chromosome breaks
    for (seqid, beg, end) in qbed.get_breaks():
        xsize_ratio = abs(end - beg) * .8 / xsize
        fontsize = th.select_fontsize(xsize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        xchr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot([beg, beg], ylim, "-", lw=chrlw, color=sepcolor)

    for (seqid, beg, end) in sbed.get_breaks():
        ysize_ratio = abs(end - beg) * .8 / ysize
        fontsize = th.select_fontsize(ysize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        ychr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot(xlim, [beg, beg], "-", lw=chrlw, color=sepcolor)

    # plot the chromosome labels
    for label, pos, fontsize in xchr_labels:
        pos = .1 + pos * .8 / xsize
        if fontsize >= minfont:
            root.text(pos,
                      .91,
                      latex(label),
                      size=fontsize,
                      ha="center",
                      va="bottom",
                      rotation=45,
                      color="grey")

    # remember y labels are inverted
    for label, pos, fontsize in ychr_labels:
        pos = .9 - pos * .8 / ysize
        if fontsize >= minfont:
            root.text(.91,
                      pos,
                      latex(label),
                      size=fontsize,
                      va="center",
                      color="grey")

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        to_ax_label = lambda fname: op.basename(fname).split(".")[0]
        gx, gy = [to_ax_label(x.filename) for x in (qbed, sbed)]
    ax.set_xlabel(gx, size=16)
    ax.set_ylabel(gy, size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
             color='gray',
             size=10)

    if palette:  # bottom-left has the palette, if available
        colors = palette.colors
        xstart, ystart = .1, .05
        for category, c in sorted(colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0, fc=c))
            root.text(xstart + .04, ystart, category, color=c)
            xstart += .1

    if not title:
        title = "Inter-genomic comparison: {0} vs {1}".format(gx, gy)
        if is_self:
            title = "Intra-genomic comparison within {0}".format(gx)
            npairs /= 2
        title += " ({0} gene pairs)".format(thousands(npairs))
    root.set_title(title, x=.5, y=.96, color="k")
    logging.debug(title)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
Esempio n. 5
0
def multihistogram(args):
    """
    %prog multihistogram *.histogram species

    Plot the histogram based on a set of K-mer hisotograms. The method is based
    on Star et al.'s method (Atlantic Cod genome paper).
    """
    p = OptionParser(multihistogram.__doc__)
    p.add_option("--kmin", default=15, type="int", help="Minimum K-mer size, inclusive")
    p.add_option("--kmax", default=30, type="int", help="Maximum K-mer size, inclusive")
    p.add_option("--vmin", default=2, type="int", help="Minimum value, inclusive")
    p.add_option("--vmax", default=100, type="int", help="Maximum value, inclusive")
    opts, args, iopts = p.set_image_options(args, figsize="10x5", dpi=300)

    histfiles = args[:-1]
    species = args[-1]
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    A = fig.add_axes([0.08, 0.12, 0.38, 0.76])
    B = fig.add_axes([0.58, 0.12, 0.38, 0.76])

    lines = []
    legends = []
    genomesizes = []
    for histfile in histfiles:
        ks = KmerSpectrum(histfile)
        x, y = ks.get_xy(opts.vmin, opts.vmax)
        K = get_number(op.basename(histfile).split(".")[0].split("-")[-1])
        if not opts.kmin <= K <= opts.kmax:
            continue

        line, = A.plot(x, y, "-", lw=1)
        lines.append(line)
        legends.append("K = {0}".format(K))
        ks.analyze(K=K)
        genomesizes.append((K, ks.genomesize / 1e6))

    leg = A.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)

    title = "{0} genome K-mer histogram".format(species)
    A.set_title(markup(title))
    xlabel, ylabel = "Coverage (X)", "Counts"
    A.set_xlabel(xlabel)
    A.set_ylabel(ylabel)
    set_human_axis(A)

    title = "{0} genome size estimate".format(species)
    B.set_title(markup(title))
    x, y = zip(*genomesizes)
    B.plot(x, y, "ko", mfc="w")
    t = np.linspace(opts.kmin - 0.5, opts.kmax + 0.5, 100)
    p = np.poly1d(np.polyfit(x, y, 2))
    B.plot(t, p(t), "r:")

    xlabel, ylabel = "K-mer size", "Estimated genome size (Mb)"
    B.set_xlabel(xlabel)
    B.set_ylabel(ylabel)
    set_ticklabels_helvetica(B)

    labels = ((0.04, 0.96, "A"), (0.54, 0.96, "B"))
    panel_labels(root, labels)

    normalize_axes(root)
    imagename = species + ".multiK.pdf"
    savefig(imagename, dpi=iopts.dpi, iopts=iopts)
Esempio n. 6
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int", help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int", help="maximum value, inclusive [default: %default]")
    p.add_option(
        "--pdf", default=False, action="store_true", help="Print PDF instead of ASCII plot [default: %default]"
    )
    p.add_option("--coverage", default=0, type="int", help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true", help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii = not opts.pdf
    peaks = not opts.nopeaks
    N = int(N)

    if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = meryl([histfile])

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print >> sys.stderr, msg

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    if peaks:
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(x, y) for x, y in ks.counts if x in t]
        if tcounts:
            x, y = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(x, y, "ko", lw=2, mec="k", mfc="w")
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    messages = [Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg, Repetitive_msg, SNPrate_msg]
    write_messages(ax, messages)

    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title))
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
Esempio n. 7
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
            help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
            help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf", default=False, action="store_true",
            help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage", default=0, type="int",
            help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
            help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    N = int(N)
    KMERYL, KSOAP, KALLPATHS = range(3)
    kformats = ("Meryl", "Soap", "AllPaths")
    kformat = KMERYL

    ascii = not opts.pdf
    peaks = not opts.nopeaks
    fp = open(histfile)
    hist = {}
    totalKmers = 0

    # Guess the format of the Kmer histogram
    for row in fp:
        if row.startswith("# 1:"):
            kformat = KALLPATHS
            break
        if len(row.split()) == 1:
            kformat = KSOAP
            break
    fp.seek(0)

    logging.debug("Guessed format: {0}".format(kformats[kformat]))

    data = []
    for rowno, row in enumerate(fp):
        if row[0] == '#':
            continue
        if kformat == KSOAP:
            K = rowno + 1
            counts = int(row.strip())
        else:  # meryl histogram
            K, counts = row.split()[:2]
            K, counts = int(K), int(counts)

        Kcounts = K * counts
        totalKmers += Kcounts
        hist[K] = Kcounts
        data.append((K, counts))

    covmax = 1000000
    ks = KmerSpectrum(data)
    ks.analyze(K=N, covmax=covmax)

    Total_Kmers = int(totalKmers)
    coverage = opts.coverage
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1. / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".\
                        format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print >> sys.stderr, msg

    counts = sorted((a, b) for a, b in hist.items() \
                    if opts.vmin <= a <= opts.vmax)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
    tcounts = [(x, y) for x, y in counts if x in t]
    x, y = zip(*tcounts)
    plt.plot(x, y, 'ko', lw=2, mec='k', mfc='w')
    tcounts = dict(tcounts)

    if peaks:
        ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
        ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    tc = "gray"
    axt = ax.transAxes
    ax.text(.95, .95, Total_Kmers_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .9, Kmer_coverage_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .85, Genome_size_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .8, Repetitive_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .75, SNPrate_msg, color=tc, transform=axt, ha="right")

    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
def plot_breaks_and_labels(
    fig,
    root,
    ax,
    gx,
    gy,
    xsize,
    ysize,
    qbreaks,
    sbreaks,
    sep=True,
    chrlw=0.1,
    sepcolor="g",
    minfont=5,
    stdpf=True,
    chpf=True,
):
    xlim = (0, 47724.0) # hard-coding xlim maximum
    ylim = (ysize, 0)  # invert the y-axis

    # Tag to mark whether to plot chr name (skip small ones)
    xchr_labels, ychr_labels = [], []
    th = TextHandler(fig)

    # plot the chromosome breaks
    for (seqid, beg, end) in qbreaks:
        xsize_ratio = abs(end - beg) * 0.8 / xsize
        fontsize = th.select_fontsize(xsize_ratio)
        if chpf:
            seqid = "".join(seqid_parse(seqid, stdpf=stdpf)[:2])

        xchr_labels.append((seqid, (beg + end) / 2, fontsize))
        if sep:
            ax.plot([beg, beg], ylim, "-", lw=chrlw, color=sepcolor)
    for (seqid, beg, end) in sbreaks:
        ysize_ratio = abs(end - beg) * 0.8 / ysize
        fontsize = th.select_fontsize(ysize_ratio)
        if chpf:
            seqid = "".join(seqid_parse(seqid, stdpf=stdpf)[:2])
        ychr_labels.append((seqid, (beg + end) / 2, fontsize*0.85))
        if sep:
            ax.plot(xlim, [beg, beg], "-", lw=chrlw, color=sepcolor)

    # plot the chromosome labels
    xchr_labels = [('chr\ 1', 1997.5, 12), ('chr\ 2', 5944.5, 12), ('chr\ 3', 9014.0, 12), ('chr\ 4', 11351.5, 12), ('chr\ 5', 13639.0, 12), ('chr\ 6', 17657.5, 12), ('chr\ 7', 22329.0, 12), ('chr\ 8', 25466.0, 12), ('chr\ 9', 28092.0, 12), ('chr\ 10', 31361.5, 12), ('chr\ 11', 34457.0, 12), ('chr\ 12', 37234.0, 12), ('chr\ 13', 41112.5, 12), ('chr\ 14', 43851.0, 12), ('chr\ 15', 45258.5, 12), ('scf\ 16', 46740.5, 12), ('scf\ 458', 47724.0, 12)]
    for label, pos, fontsize in xchr_labels:
        #print(xchr_labels)
        pos = 0.1 + pos * 0.8 / xsize
        if fontsize >= minfont:
            root.text(
                pos,
                0.91,
                latex(label),
                size=fontsize*0.85,
                ha="center",
                va="bottom",
                rotation=45,
                color="black",
            )

    # remember y labels are inverted
    ychr_labels = [('chr\ 1', 2672.0, 10.2), ('chr\ 2', 7532.0, 10.2), ('chr\ 3', 12035.0, 10.2), ('chr\ 4', 16228.0, 10.2), ('chr\ 5', 19784.5, 10.2), ('chr\ 6', 23211.0, 10.2), ('chr\ 7', 26612.5, 10.2), ('chr\ 8', 29773.0, 10.2), ('chr\ 9', 32518.0, 10.2), ('chr\ 10', 35004.5, 10.2), ('chr\ 11', 37760.0, 10.2), ('chr\ 12', 40635.5, 10.2), ('ChrSy', 42048.0, 0), ('ChrUn', 42140.5, 0)]
    for label, pos, fontsize in ychr_labels:
        #print(ychr_labels)
        pos = 0.9 - pos * 0.8 / ysize
        if fontsize >= minfont:
            root.text(0.91, pos, latex(label), size=fontsize*0.85, va="center", color="black")

    # Plot the frame
    ax.plot(xlim, [0, 0], "-", lw=chrlw, color=sepcolor)
    ax.plot(xlim, [ysize, ysize], "-", lw=chrlw, color=sepcolor)
    ax.plot([0, 0], ylim, "-", lw=chrlw, color=sepcolor)
    ax.plot([xsize, xsize], ylim, "-", lw=chrlw, color=sepcolor)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    
    # The axis labels have been hardcoded (vs. gx gy as in original) so taht we can get the species names spelled out in italics, rather than the BED file name.
    ax.set_xlabel('$\it{Zizania\ palustris}$', size=16)
    ax.set_ylabel('$\it{Oryza\ sativa}$', size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(), color="black", size=10)

    return xlim, ylim
Esempio n. 9
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin",
                 dest="vmin",
                 default=1,
                 type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax",
                 dest="vmax",
                 default=100,
                 type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf",
                 default=False,
                 action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage",
                 default=0,
                 type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks",
                 default=False,
                 action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    N = int(N)
    KMERYL, KSOAP, KALLPATHS = range(3)
    kformats = ("Meryl", "Soap", "AllPaths")
    kformat = KMERYL

    ascii = not opts.pdf
    peaks = not opts.nopeaks
    fp = open(histfile)
    hist = {}
    totalKmers = 0

    # Guess the format of the Kmer histogram
    for row in fp:
        if row.startswith("# 1:"):
            kformat = KALLPATHS
            break
        if len(row.split()) == 1:
            kformat = KSOAP
            break
    fp.seek(0)

    logging.debug("Guessed format: {0}".format(kformats[kformat]))

    data = []
    for rowno, row in enumerate(fp):
        if row[0] == '#':
            continue
        if kformat == KSOAP:
            K = rowno + 1
            counts = int(row.strip())
        else:  # meryl histogram
            K, counts = row.split()[:2]
            K, counts = int(K), int(counts)

        Kcounts = K * counts
        totalKmers += Kcounts
        hist[K] = Kcounts
        data.append((K, counts))

    covmax = 1000000
    ks = KmerSpectrum(data)
    ks.analyze(K=N, covmax=covmax)

    Total_Kmers = int(totalKmers)
    coverage = opts.coverage
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = Total_Kmers * 1. / Kmer_coverage / 1e6

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print >> sys.stderr, msg

    counts = sorted((a, b) for a, b in hist.items() \
                    if opts.vmin <= a <= opts.vmax)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii:
        return asciiplot(x, y, title=title)

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
    tcounts = [(x, y) for x, y in counts if x in t]
    x, y = zip(*tcounts)
    plt.plot(x, y, 'ko', lw=2, mec='k', mfc='w')
    tcounts = dict(tcounts)

    if peaks:
        ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
        ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    tc = "gray"
    axt = ax.transAxes
    ax.text(.95, .95, Total_Kmers_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .9, Kmer_coverage_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .85, Genome_size_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .8, Repetitive_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .75, SNPrate_msg, color=tc, transform=axt, ha="right")

    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)
Esempio n. 10
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic.
    """
    p = OptionParser(histogram.__doc__)
    p.add_option(
        "--vmin",
        dest="vmin",
        default=1,
        type="int",
        help="minimum value, inclusive",
    )
    p.add_option(
        "--vmax",
        dest="vmax",
        default=100,
        type="int",
        help="maximum value, inclusive",
    )
    p.add_option(
        "--pdf",
        default=False,
        action="store_true",
        help="Print PDF instead of ASCII plot",
    )
    p.add_option("--coverage",
                 default=0,
                 type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option(
        "--nopeaks",
        default=False,
        action="store_true",
        help="Do not annotate K-mer peaks",
    )
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii = not opts.pdf
    peaks = not opts.nopeaks
    N = int(N)

    if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = merylhistogram(histfile)

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size /
                                                                1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    if peaks:
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(x, y) for x, y in ks.counts if x in t]
        if tcounts:
            x, y = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(x, y, "ko", lw=2, mec="k", mfc="w")
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    messages = [
        Total_Kmers_msg,
        Kmer_coverage_msg,
        Genome_size_msg,
        Repetitive_msg,
        SNPrate_msg,
    ]
    write_messages(ax, messages)

    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title))
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
Esempio n. 11
0
File: kmer.py Progetto: bennyyu/jcvi
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    p = OptionParser(histogram.__doc__)
    p.add_option("--pdf", default=False, action="store_true",
            help="Print PDF instead of ASCII plot [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii = not opts.pdf
    fp = open(histfile)
    hist = {}
    totalKmers = 0

    # Guess the format of the Kmer histogram
    soap = False
    for row in fp:
        if len(row.split()) == 1:
            soap = True
            break
    fp.seek(0)

    for rowno, row in enumerate(fp):
        if soap:
            K = rowno + 1
            counts = int(row.strip())
        else:  # meryl histogram
            K, counts = row.split()[:2]
            K, counts = int(K), int(counts)

        Kcounts = K * counts
        totalKmers += Kcounts
        hist[K] = counts

    history = ["drop"]
    for a, b in pairwise(sorted(hist.items())):
        Ka, ca = a
        Kb, cb = b
        if ca <= cb:
            status = "rise"
        else:
            status = "drop"
        if history[-1] != status:
            history.append(status)
        if history == ["drop", "rise", "drop"]:
            break

    Total_Kmers = int(totalKmers)
    Kmer_coverage = Ka
    Genome_size = Total_Kmers * 1. / Ka / 1e6

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size)

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print >> sys.stderr, msg

    counts = sorted((a, b) for a, b in hist.items() if a <= 100)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii:
        return asciiplot(x, y, title=title)

    fig = plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)

    ax = plt.gca()
    ax.text(.5, .9, _(Total_Kmers_msg),
            ha="center", color='b', transform=ax.transAxes)
    ax.text(.5, .8, _(Kmer_coverage_msg),
            ha="center", color='b', transform=ax.transAxes)
    ax.text(.5, .7, _(Genome_size_msg),
            ha="center", color='b', transform=ax.transAxes)

    ax.set_title(_(title), color='r')
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(_(xlabel), color='r')
    ax.set_ylabel(_(ylabel), color='r')
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    plt.savefig(imagename, dpi=100)
    print >> sys.stderr, "Image saved to `{0}`.".format(imagename)
Esempio n. 12
0
def draw_depth(
    root,
    ax,
    bed,
    chrinfo={},
    defaultcolor="k",
    sepcolor="w",
    ylim=100,
    title=None,
    subtitle=None,
):
    """ Draw depth plot on the given axes, using data from bed

    Args:
        root (matplotlib.Axes): Canvas axes
        ax (matplotlib.Axes): Axes to plot data on
        bed (Bed): Bed data from mosdepth
        chrinfo (ChrInfoFile): seqid => color, new name
        defaultcolor (str): matplotlib-compatible color for data points
        sepcolor (str): matplotlib-compatible color for chromosome breaks
        ylim (int): Upper limit of the y-axis (depth)
        title (str): Title of the figure, to the right of the axis
        subtitle (str): Subtitle of the figure, just below title
    """
    if chrinfo is None:
        chrinfo = {}
    sizes = bed.max_bp_in_chr
    seqids = chrinfo.keys() if chrinfo else sizes.keys()
    starts = {}
    ends = {}
    label_positions = []
    start = 0
    for seqid in seqids:
        starts[seqid] = start
        end = start + sizes[seqid]
        ends[seqid] = end
        label_positions.append((seqid, (start + end) / 2))
        start = end
    xsize = end

    # Extract plotting data
    data = []
    data_by_seqid = defaultdict(list)
    for b in bed:
        seqid = b.seqid
        if seqid not in starts:
            continue
        # chr01A  2000000 3000000 113.00
        x = starts[seqid] + (b.start + b.end) / 2
        y = float(b.accn)
        c = chrinfo[seqid].color if seqid in chrinfo else "k"
        data.append((x, y, c))
        data_by_seqid[seqid].append(y)

    x, y, c = zip(*data)
    ax.scatter(
        x,
        y,
        c=c,
        edgecolors="none",
        s=8,
        lw=0,
    )
    logging.debug("Obtained {} data points with depth data".format(len(data)))

    # Per seqid median
    medians = {}
    for seqid, values in data_by_seqid.items():
        c = chrinfo[seqid].color if seqid in chrinfo else defaultcolor
        seqid_start = starts[seqid]
        seqid_end = ends[seqid]
        seqid_median = np.median(values)
        medians[seqid] = seqid_median
        ax.plot(
            (seqid_start, seqid_end),
            (seqid_median, seqid_median),
            "-",
            lw=4,
            color=c,
            alpha=0.5,
        )

    # vertical lines for all the breaks
    for pos in starts.values():
        ax.plot((pos, pos), (0, ylim), "-", lw=1, color=sepcolor)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    median_depth_y = 0.88
    chr_label_y = 0.08
    for seqid, position in label_positions:
        xpos = 0.1 + position * 0.8 / xsize
        c = chrinfo[seqid].color if seqid in chrinfo else defaultcolor
        newseqid = chrinfo[seqid].new_name if seqid in chrinfo else seqid
        root.text(xpos,
                  chr_label_y,
                  newseqid,
                  color=c,
                  ha="center",
                  va="center",
                  rotation=20)
        seqid_median = medians[seqid]
        root.text(
            xpos,
            median_depth_y,
            str(int(seqid_median)),
            color=c,
            ha="center",
            va="center",
        )

    if title:
        root.text(
            0.95,
            0.5,
            markup(title),
            color="darkslategray",
            ha="center",
            va="center",
            size=15,
        )
    if subtitle:
        root.text(
            0.95,
            0.375,
            markup(subtitle),
            color="darkslategray",
            ha="center",
            va="center",
            size=15,
        )

    ax.set_xticks([])
    ax.set_xlim(0, xsize)
    ax.set_ylim(0, ylim)
    ax.set_ylabel("Depth")

    set_human_axis(ax)
    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
             color="gray",
             size=10)
    normalize_axes(root)
Esempio n. 13
0
def dotplot(anchorfile, qbed, sbed, image_name, vmin, vmax, iopts,
        is_self=False, synteny=False, cmap_text=None):

    fp = open(anchorfile)

    qorder = qbed.order
    sorder = sbed.order

    data = []
    if cmap_text:
        logging.debug("Normalize values to [%.1f, %.1f]" % (vmin, vmax))

    for row in fp:
        atoms = row.split()
        # first two columns are query and subject, and an optional third column
        if len(atoms) < 2:
            continue
        query, subject = atoms[:2]
        value = atoms[-1]

        try:
            value = float(value)
        except ValueError:
            value = vmax

        if value < vmin:
            value = vmin
        if value > vmax:
            value = vmax

        if query not in qorder:
            #logging.warning("ignore %s" % query)
            continue
        if subject not in sorder:
            #logging.warning("ignore %s" % subject)
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        nv = vmax - value
        data.append((qi, si, nv))
        if is_self:  # Mirror image
            data.append((si, qi, nv))

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([.1, .1, .8, .8])  # the dot plot

    sample_number = 5000  # only show random subset
    if len(data) > sample_number:
        data = sample(data, sample_number)

    # the data are plotted in this order, the least value are plotted
    # last for aesthetics
    data.sort(key=lambda x: -x[2])

    default_cm = cm.copper
    x, y, c = zip(*data)
    ax.scatter(x, y, c=c, s=2, lw=0, cmap=default_cm,
            vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=default_cm, reverse=True)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))
    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    xchr_labels, ychr_labels = [], []
    ignore = True  # tag to mark whether to plot chr name (skip small ones)
    ignore_size_x = xsize * .005
    ignore_size_y = ysize * .005

    # plot the chromosome breaks
    for (seqid, beg, end) in qbed.get_breaks():
        ignore = abs(end - beg) < ignore_size_x
        seqid = seqid.split("_")[-1]
        try:
            seqid = int(seqid)
            seqid = "c%d" % seqid
        except:
            pass

        xchr_labels.append((seqid, (beg + end) / 2, ignore))
        ax.plot([beg, beg], ylim, "g-", lw=1)

    for (seqid, beg, end) in sbed.get_breaks():
        ignore = abs(end - beg) < ignore_size_y
        seqid = seqid.split("_")[-1]
        try:
            seqid = int(seqid)
            seqid = "c%d" % seqid
        except:
            pass

        ychr_labels.append((seqid, (beg + end) / 2, ignore))
        ax.plot(xlim, [beg, beg], "g-", lw=1)

    # plot the chromosome labels
    for label, pos, ignore in xchr_labels:
        pos = .1 + pos * .8 / xsize
        if not ignore:
            root.text(pos, .91, label,
                ha="center", va="bottom", rotation=45, color="grey")

    # remember y labels are inverted
    for label, pos, ignore in ychr_labels:
        pos = .9 - pos * .8 / ysize
        if not ignore:
            root.text(.91, pos, label,
                va="center", color="grey")

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # add genome names
    to_ax_label = lambda fname: _(op.basename(fname).split(".")[0])
    gx, gy = [to_ax_label(x.filename) for x in (qbed, sbed)]
    ax.set_xlabel(gx, size=16)
    ax.set_ylabel(gy, size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
            color='gray', size=10)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
    logging.debug("Print image to `{0}` {1}".format(image_name, iopts))
    plt.savefig(image_name, dpi=iopts.dpi)
Esempio n. 14
0
def multihistogram(args):
    """
    %prog multihistogram *.histogram species

    Plot the histogram based on a set of K-mer hisotograms. The method is based
    on Star et al.'s method (Atlantic Cod genome paper).
    """
    p = OptionParser(multihistogram.__doc__)
    p.add_option("--kmin",
                 default=15,
                 type="int",
                 help="Minimum K-mer size, inclusive")
    p.add_option("--kmax",
                 default=30,
                 type="int",
                 help="Maximum K-mer size, inclusive")
    p.add_option("--vmin",
                 default=2,
                 type="int",
                 help="Minimum value, inclusive")
    p.add_option("--vmax",
                 default=100,
                 type="int",
                 help="Maximum value, inclusive")
    opts, args, iopts = p.set_image_options(args, figsize="10x5", dpi=300)

    if len(args) < 1:
        sys.exit(not p.print_help())

    histfiles = args[:-1]
    species = args[-1]
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    A = fig.add_axes([0.08, 0.12, 0.38, 0.76])
    B = fig.add_axes([0.58, 0.12, 0.38, 0.76])

    lines = []
    legends = []
    genomesizes = []
    for histfile in histfiles:
        ks = KmerSpectrum(histfile)
        x, y = ks.get_xy(opts.vmin, opts.vmax)
        K = get_number(op.basename(histfile).split(".")[0].split("-")[-1])
        if not opts.kmin <= K <= opts.kmax:
            continue

        (line, ) = A.plot(x, y, "-", lw=1)
        lines.append(line)
        legends.append("K = {0}".format(K))
        ks.analyze(K=K, method="allpaths")
        genomesizes.append((K, ks.genomesize / 1e6))

    leg = A.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)

    title = "{0} genome K-mer histogram".format(species)
    A.set_title(markup(title))
    xlabel, ylabel = "Coverage (X)", "Counts"
    A.set_xlabel(xlabel)
    A.set_ylabel(ylabel)
    set_human_axis(A)

    title = "{0} genome size estimate".format(species)
    B.set_title(markup(title))
    x, y = zip(*genomesizes)
    B.plot(x, y, "ko", mfc="w")
    t = np.linspace(opts.kmin - 0.5, opts.kmax + 0.5, 100)
    p = np.poly1d(np.polyfit(x, y, 2))
    B.plot(t, p(t), "r:")

    xlabel, ylabel = "K-mer size", "Estimated genome size (Mb)"
    B.set_xlabel(xlabel)
    B.set_ylabel(ylabel)
    set_ticklabels_helvetica(B)

    labels = ((0.04, 0.96, "A"), (0.54, 0.96, "B"))
    panel_labels(root, labels)

    normalize_axes(root)
    imagename = species + ".multiK.pdf"
    savefig(imagename, dpi=iopts.dpi, iopts=iopts)
Esempio n. 15
0
def dotplot(anchorfile, qbed, sbed, fig, root, ax, vmin=0, vmax=1,
        is_self=False, synteny=False, cmap_text=None, cmap="copper",
        genomenames=None, sample_number=10000, minfont=5, palette=None,
        chrlw=.01, title=None, sepcolor="gainsboro"):

    fp = open(anchorfile)

    qorder = qbed.order
    sorder = sbed.order

    data = []
    if cmap_text:
        logging.debug("Capping values within [{0:.1f}, {1:.1f}]"\
                        .format(vmin, vmax))

    block_id = 0
    for row in fp:
        atoms = row.split()
        block_color = None
        if row[0] == "#":
            block_id += 1
            if palette:
                block_color = palette.get(block_id, "k")
            continue

        # first two columns are query and subject, and an optional third column
        if len(atoms) < 2:
            continue

        query, subject = atoms[:2]
        value = atoms[-1]

        if cmap_text:
            try:
                value = float(value)
            except ValueError:
                value = vmax

            if value < vmin:
                continue
            if value > vmax:
                continue
        else:
            value = 0

        if query not in qorder:
            continue
        if subject not in sorder:
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        nv = value if block_color is None else block_color
        data.append((qi, si, nv))
        if is_self:  # Mirror image
            data.append((si, qi, nv))

    npairs = len(data)
    # Only show random subset
    if npairs > sample_number:
        logging.debug("Showing a random subset of {0} data points (total {1}) " \
                      "for clarity.".format(sample_number, npairs))
        data = sample(data, sample_number)

    # the data are plotted in this order, the least value are plotted
    # last for aesthetics
    #if not palette:
    #    data.sort(key=lambda x: -x[2])

    x, y, c = zip(*data)

    if palette:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)
    else:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0, cmap=cmap,
                vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=cmap)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))
    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    # Tag to mark whether to plot chr name (skip small ones)
    xchr_labels, ychr_labels = [], []
    th = TextHandler(fig)

    # plot the chromosome breaks
    for (seqid, beg, end) in qbed.get_breaks():
        xsize_ratio = abs(end - beg) * .8 / xsize
        fontsize = th.select_fontsize(xsize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        xchr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot([beg, beg], ylim, "-", lw=chrlw, color=sepcolor)

    for (seqid, beg, end) in sbed.get_breaks():
        ysize_ratio = abs(end - beg) * .8 / ysize
        fontsize = th.select_fontsize(ysize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        ychr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot(xlim, [beg, beg], "-", lw=chrlw, color=sepcolor)

    # plot the chromosome labels
    for label, pos, fontsize in xchr_labels:
        pos = .1 + pos * .8 / xsize
        if fontsize >= minfont:
            root.text(pos, .91, latex(label), size=fontsize,
                ha="center", va="bottom", rotation=45, color="grey")

    # remember y labels are inverted
    for label, pos, fontsize in ychr_labels:
        pos = .9 - pos * .8 / ysize
        if fontsize >= minfont:
            root.text(.91, pos, latex(label), size=fontsize,
                va="center", color="grey")

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        to_ax_label = lambda fname: op.basename(fname).split(".")[0]
        gx, gy = [to_ax_label(x.filename) for x in (qbed, sbed)]
    ax.set_xlabel(markup(gx), size=16)
    ax.set_ylabel(markup(gy), size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
            color='gray', size=10)

    if palette:  # bottom-left has the palette, if available
        colors = palette.colors
        xstart, ystart = .1, .05
        for category, c in sorted(colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0, fc=c))
            root.text(xstart + .04, ystart, category, color=c)
            xstart += .1

    if not title:
        title = "Inter-genomic comparison: {0} vs {1}".format(gx, gy)
        if is_self:
            title = "Intra-genomic comparison within {0}".format(gx)
            npairs /= 2
        title += " ({0} gene pairs)".format(thousands(npairs))
    root.set_title(markup(title), x=.5, y=.96, color="k")
    logging.debug(title)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
Esempio n. 16
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic.
    """
    p = OptionParser(histogram.__doc__)
    p.add_option(
        "--vmin",
        dest="vmin",
        default=1,
        type="int",
        help="minimum value, inclusive",
    )
    p.add_option(
        "--vmax",
        dest="vmax",
        default=100,
        type="int",
        help="maximum value, inclusive",
    )
    p.add_option(
        "--pdf",
        default=False,
        action="store_true",
        help="Print PDF instead of ASCII plot",
    )
    p.add_option(
        "--method",
        choices=("nbinom", "allpaths"),
        default="nbinom",
        help=
        "'nbinom' - slow but more accurate for het or polyploid genome; 'allpaths' - fast and works for homozygous enomes",
    )
    p.add_option(
        "--maxiter",
        default=100,
        type="int",
        help="Max iterations for optimization. Only used with --method nbinom",
    )
    p.add_option("--coverage",
                 default=0,
                 type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option(
        "--nopeaks",
        default=False,
        action="store_true",
        help="Do not annotate K-mer peaks",
    )
    opts, args, iopts = p.set_image_options(args, figsize="7x7")

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    method = opts.method
    vmin, vmax = opts.vmin, opts.vmax
    ascii = not opts.pdf
    peaks = not opts.nopeaks and method == "allpaths"
    N = int(N)

    if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = merylhistogram(histfile)

    ks = KmerSpectrum(histfile)
    method_info = ks.analyze(K=N, maxiter=opts.maxiter, method=method)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    Kmer_coverage = ks.lambda_ if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1:.1f}x".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f} Mb".format(Genome_size /
                                                                 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    x, y = ks.get_xy(vmin, vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (iopts.w, iopts.h))
    plt.bar(x, y, fc="#b2df8a", lw=0)
    # Plot the negative binomial fit
    if method == "nbinom":
        generative_model = method_info["generative_model"]
        GG = method_info["Gbins"]
        ll = method_info["lambda"]
        rr = method_info["rho"]
        kf_range = method_info["kf_range"]
        stacked = generative_model(GG, ll, rr)
        plt.plot(
            kf_range,
            stacked,
            ":",
            color="#6a3d9a",
            lw=2,
        )

    ax = plt.gca()

    if peaks:  # Only works for method 'allpaths'
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(x, y) for x, y in ks.counts if x in t]
        if tcounts:
            x, y = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(x, y, "ko", lw=3, mec="k", mfc="w")
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak")
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6
    if method == "nbinom":
        # Plot multiple CN locations, CN1, CN2, ... up to ploidy
        cn_color = "#a6cee3"
        for i in range(1, ks.ploidy + 1):
            x = i * ks.lambda_
            plt.plot((x, x), (0, ymax), "-.", color=cn_color)
            plt.text(
                x,
                ymax * 0.95,
                "CN{}".format(i),
                ha="right",
                va="center",
                color=cn_color,
                rotation=90,
            )

    messages = [
        Total_Kmers_msg,
        Kmer_coverage_msg,
        Genome_size_msg,
        Repetitive_msg,
        SNPrate_msg,
    ]
    if method == "nbinom":
        messages += [ks.ploidy_message] + ks.copy_messages
    write_messages(ax, messages)

    ax.set_title(markup(title))
    ax.set_xlim((0, vmax))
    ax.set_ylim((0, ymax))
    adjust_spines(ax, ["left", "bottom"], outward=True)
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + "." + iopts.format
    savefig(imagename, dpi=100)

    return Genome_size
Esempio n. 17
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    p = OptionParser(histogram.__doc__)
    p.add_option("--pdf",
                 default=False,
                 action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii = not opts.pdf
    fp = open(histfile)
    hist = {}
    totalKmers = 0

    # Guess the format of the Kmer histogram
    soap = False
    for row in fp:
        if len(row.split()) == 1:
            soap = True
            break
    fp.seek(0)

    for rowno, row in enumerate(fp):
        if soap:
            K = rowno + 1
            counts = int(row.strip())
        else:  # meryl histogram
            K, counts = row.split()[:2]
            K, counts = int(K), int(counts)

        Kcounts = K * counts
        totalKmers += Kcounts
        hist[K] = counts

    history = ["drop"]
    for a, b in pairwise(sorted(hist.items())):
        Ka, ca = a
        Kb, cb = b
        if ca <= cb:
            status = "rise"
        else:
            status = "drop"
        if history[-1] != status:
            history.append(status)
        if history == ["drop", "rise", "drop"]:
            break

    Total_Kmers = int(totalKmers)
    Kmer_coverage = Ka
    Genome_size = Total_Kmers * 1. / Ka / 1e6

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size)

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print >> sys.stderr, msg

    counts = sorted((a, b) for a, b in hist.items() if a <= 100)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii:
        return asciiplot(x, y, title=title)

    fig = plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)

    ax = plt.gca()
    ax.text(.5,
            .9,
            _(Total_Kmers_msg),
            ha="center",
            color='b',
            transform=ax.transAxes)
    ax.text(.5,
            .8,
            _(Kmer_coverage_msg),
            ha="center",
            color='b',
            transform=ax.transAxes)
    ax.text(.5,
            .7,
            _(Genome_size_msg),
            ha="center",
            color='b',
            transform=ax.transAxes)

    ax.set_title(_(title), color='r')
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(_(xlabel), color='r')
    ax.set_ylabel(_(ylabel), color='r')
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    plt.savefig(imagename, dpi=100)
    print >> sys.stderr, "Image saved to `{0}`.".format(imagename)