Beispiel #1
0
def subplot(ax,
            data,
            xlabel,
            ylabel,
            xlim=None,
            ylim=1.1,
            xcast=float,
            ycast=float,
            legend=None):
    columned_data = zip(*data)
    x, yy = columned_data[0], columned_data[1:]
    lines = []
    for y, m in zip(yy, "o^x"):
        line, = ax.plot(x, y, "k:", marker=m, mec="k", mfc="w", ms=4)
        lines.append(line)
    if legend:
        assert len(lines) == len(legend)
        ax.legend(lines, legend, loc='best')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if xlim:
        ax.set_xlim(0, xlim)
    if ylim:
        ax.set_ylim(0, ylim)
    set_ticklabels_helvetica(ax, xcast=xcast, ycast=ycast)
Beispiel #2
0
def subplot(ax, data, xlabel, ylabel, xlim=None, ylim=1.1,
                      xcast=float, ycast=float, legend=None):
    columned_data = zip(*data)
    x, yy = columned_data[0], columned_data[1:]
    lines = []
    for y, m in zip(yy, "o^x"):
        line, = ax.plot(x, y, "k:", marker=m, mec="k", mfc="w", ms=4)
        lines.append(line)
    if legend:
        assert len(lines) == len(legend)
        ax.legend(lines, legend, loc='best')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if xlim:
        ax.set_xlim(0, xlim)
    if ylim:
        ax.set_ylim(0, ylim)
    set_ticklabels_helvetica(ax, xcast=xcast, ycast=ycast)
Beispiel #3
0
def multihistogram(args):
    """
    %prog multihistogram *.histogram species

    Plot the histogram based on a set of K-mer hisotograms. The method is based
    on Star et al.'s method (Atlantic Cod genome paper).
    """
    p = OptionParser(multihistogram.__doc__)
    p.add_option("--kmin",
                 default=15,
                 type="int",
                 help="Minimum K-mer size, inclusive")
    p.add_option("--kmax",
                 default=30,
                 type="int",
                 help="Maximum K-mer size, inclusive")
    p.add_option("--vmin",
                 default=2,
                 type="int",
                 help="Minimum value, inclusive")
    p.add_option("--vmax",
                 default=100,
                 type="int",
                 help="Maximum value, inclusive")
    opts, args, iopts = p.set_image_options(args, figsize="10x5", dpi=300)

    if len(args) < 1:
        sys.exit(not p.print_help())

    histfiles = args[:-1]
    species = args[-1]
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    A = fig.add_axes([0.08, 0.12, 0.38, 0.76])
    B = fig.add_axes([0.58, 0.12, 0.38, 0.76])

    lines = []
    legends = []
    genomesizes = []
    for histfile in histfiles:
        ks = KmerSpectrum(histfile)
        x, y = ks.get_xy(opts.vmin, opts.vmax)
        K = get_number(op.basename(histfile).split(".")[0].split("-")[-1])
        if not opts.kmin <= K <= opts.kmax:
            continue

        (line, ) = A.plot(x, y, "-", lw=1)
        lines.append(line)
        legends.append("K = {0}".format(K))
        ks.analyze(K=K, method="allpaths")
        genomesizes.append((K, ks.genomesize / 1e6))

    leg = A.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)

    title = "{0} genome K-mer histogram".format(species)
    A.set_title(markup(title))
    xlabel, ylabel = "Coverage (X)", "Counts"
    A.set_xlabel(xlabel)
    A.set_ylabel(ylabel)
    set_human_axis(A)

    title = "{0} genome size estimate".format(species)
    B.set_title(markup(title))
    x, y = zip(*genomesizes)
    B.plot(x, y, "ko", mfc="w")
    t = np.linspace(opts.kmin - 0.5, opts.kmax + 0.5, 100)
    p = np.poly1d(np.polyfit(x, y, 2))
    B.plot(t, p(t), "r:")

    xlabel, ylabel = "K-mer size", "Estimated genome size (Mb)"
    B.set_xlabel(xlabel)
    B.set_ylabel(ylabel)
    set_ticklabels_helvetica(B)

    labels = ((0.04, 0.96, "A"), (0.54, 0.96, "B"))
    panel_labels(root, labels)

    normalize_axes(root)
    imagename = species + ".multiK.pdf"
    savefig(imagename, dpi=iopts.dpi, iopts=iopts)
Beispiel #4
0
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively.  Also adding
    number of reads, average/median, N50, and total length.
    """
    from jcvi.utils.cbook import human_size, thousands, SUFFIXES
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import plt, markup, human_formatter, \
                human_base_formatter, savefig, set2, set_ticklabels_helvetica

    p = OptionParser(histogram.__doc__)
    p.set_histogram(vmax=50000, bins=100, xlabel="Read length",
                    title="Read length distribution")
    p.add_option("--ylabel1", default="Counts",
                 help="Label of y-axis on the left")
    p.add_option("--color", default='0', choices=[str(x) for x in range(8)],
                 help="Color of bars, which is an index 0-7 in brewer set2")
    opts, args, iopts = p.set_image_options(args, figsize="6x6", style="dark")

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    fastafile, qualfile = fasta([fastafile, "--seqtk"])
    sizes = Sizes(fastafile)
    all_sizes = sorted(sizes.sizes)
    xmin, xmax, bins = opts.vmin, opts.vmax, opts.bins
    left, height = stem_leaf_plot(all_sizes, xmin, xmax, bins)

    plt.figure(1, (iopts.w, iopts.h))
    ax1 = plt.gca()

    width = (xmax - xmin) * .5 / bins
    color = set2[int(opts.color)]
    ax1.bar(left, height, width=width, linewidth=0, fc=color, align="center")
    ax1.set_xlabel(markup(opts.xlabel))
    ax1.set_ylabel(opts.ylabel1)

    ax2 = ax1.twinx()
    cur_size = 0
    total_size, l50, n50 = sizes.summary
    cdf = {}
    hsize = human_size(total_size)
    tag = hsize[-2:]
    unit = 1000 ** SUFFIXES[1000].index(tag)

    for x in all_sizes:
        if x not in cdf:
            cdf[x] = (total_size - cur_size) * 1. / unit
        cur_size += x
    x, y = zip(*sorted(cdf.items()))
    ax2.plot(x, y, '-', color="darkslategray")
    ylabel2 = "{0} above read length".format(tag)
    ax2.set_ylabel(ylabel2)

    for ax in (ax1, ax2):
        set_ticklabels_helvetica(ax)
        ax.set_xlim((xmin - width / 2, xmax + width / 2))

    tc = "gray"
    axt = ax1.transAxes
    xx, yy = .95, .95
    ma = "Total bases: {0}".format(hsize)
    mb = "Total reads: {0}".format(thousands(len(sizes)))
    mc = "Average read length: {0}bp".format(thousands(np.mean(all_sizes)))
    md = "Median read length: {0}bp".format(thousands(np.median(all_sizes)))
    me = "N50 read length: {0}bp".format(thousands(l50))
    for t in (ma, mb, mc, md, me):
        print >> sys.stderr, t
        ax1.text(xx, yy, t, color=tc, transform=axt, ha="right")
        yy -= .05

    ax1.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    ax1.tick_params(axis="x", direction="out", length=3,
                    left=False, right=False, top=False, bottom=True)
    ax1.xaxis.set_major_formatter(human_base_formatter)
    ax1.yaxis.set_major_formatter(human_formatter)
    figname = sizes.filename + ".pdf"
    savefig(figname)
Beispiel #5
0
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively.  Also adding
    number of reads, average/median, N50, and total length.
    """
    from jcvi.utils.cbook import human_size, thousands, SUFFIXES
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import (
        plt,
        markup,
        human_formatter,
        human_base_formatter,
        savefig,
        set2,
        set_ticklabels_helvetica,
    )

    p = OptionParser(histogram.__doc__)
    p.set_histogram(vmax=50000,
                    bins=100,
                    xlabel="Read length",
                    title="Read length distribution")
    p.add_option("--ylabel1",
                 default="Counts",
                 help="Label of y-axis on the left")
    p.add_option(
        "--color",
        default="0",
        choices=[str(x) for x in range(8)],
        help="Color of bars, which is an index 0-7 in brewer set2",
    )
    opts, args, iopts = p.set_image_options(args, figsize="6x6", style="dark")

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    fastafile, qualfile = fasta([fastafile, "--seqtk"])
    sizes = Sizes(fastafile)
    all_sizes = sorted(sizes.sizes)
    xmin, xmax, bins = opts.vmin, opts.vmax, opts.bins
    left, height = stem_leaf_plot(all_sizes, xmin, xmax, bins)

    plt.figure(1, (iopts.w, iopts.h))
    ax1 = plt.gca()

    width = (xmax - xmin) * 0.5 / bins
    color = set2[int(opts.color)]
    ax1.bar(left, height, width=width, linewidth=0, fc=color, align="center")
    ax1.set_xlabel(markup(opts.xlabel))
    ax1.set_ylabel(opts.ylabel1)

    ax2 = ax1.twinx()
    cur_size = 0
    total_size, l50, n50 = sizes.summary
    cdf = {}
    hsize = human_size(total_size)
    tag = hsize[-2:]
    unit = 1000**SUFFIXES[1000].index(tag)

    for x in all_sizes:
        if x not in cdf:
            cdf[x] = (total_size - cur_size) * 1.0 / unit
        cur_size += x
    x, y = zip(*sorted(cdf.items()))
    ax2.plot(x, y, "-", color="darkslategray")
    ylabel2 = "{0} above read length".format(tag)
    ax2.set_ylabel(ylabel2)

    for ax in (ax1, ax2):
        set_ticklabels_helvetica(ax)
        ax.set_xlim((xmin - width / 2, xmax + width / 2))

    tc = "gray"
    axt = ax1.transAxes
    xx, yy = 0.95, 0.95
    ma = "Total bases: {0}".format(hsize)
    mb = "Total reads: {0}".format(thousands(len(sizes)))
    mc = "Average read length: {0}bp".format(thousands(np.mean(all_sizes)))
    md = "Median read length: {0}bp".format(thousands(np.median(all_sizes)))
    me = "N50 read length: {0}bp".format(thousands(l50))
    for t in (ma, mb, mc, md, me):
        print(t, file=sys.stderr)
        ax1.text(xx, yy, t, color=tc, transform=axt, ha="right")
        yy -= 0.05

    ax1.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    ax1.tick_params(
        axis="x",
        direction="out",
        length=3,
        left=False,
        right=False,
        top=False,
        bottom=True,
    )
    ax1.xaxis.set_major_formatter(human_base_formatter)
    ax1.yaxis.set_major_formatter(human_formatter)
    figname = sizes.filename + ".pdf"
    savefig(figname)
Beispiel #6
0
def variation(args):
    """
    %prog variation P1.bed P2.bed F1.bed

    Associate IES in parents and progeny.
    """
    p = OptionParser(variation.__doc__)
    p.add_option("--diversity",
                 choices=("breakpoint", "variant"),
                 default="variant",
                 help="Plot diversity")
    opts, args, iopts = p.set_image_options(args, figsize="6x6")

    if len(args) != 3:
        sys.exit(not p.print_help())

    pfs = [op.basename(x).split('-')[0] for x in args]
    P1, P2, F1 = pfs
    newbedfile = "-".join(pfs) + ".bed"
    if need_update(args, newbedfile):
        newbed = Bed()
        for pf, filename in zip(pfs, args):
            bed = Bed(filename)
            for b in bed:
                b.accn = "-".join((pf, b.accn))
                b.score = None
                newbed.append(b)
        newbed.print_to_file(newbedfile, sorted=True)

    neworder = Bed(newbedfile).order
    mergedbedfile = mergeBed(newbedfile, nms=True)
    bed = Bed(mergedbedfile)
    valid = 0
    total_counts = Counter()
    F1_counts = []
    bp_diff = []
    novelbedfile = "novel.bed"
    fw = open(novelbedfile, "w")
    for b in bed:
        accns = b.accn.split(',')
        pfs_accns = [x.split("-")[0] for x in accns]
        pfs_counts = Counter(pfs_accns)
        if len(pfs_counts) != 3:
            print(b, file=fw)
            continue

        valid += 1
        total_counts += pfs_counts
        F1_counts.append(pfs_counts[F1])

        # Collect breakpoint positions between P1 and F1
        P1_accns = [x for x in accns if x.split("-")[0] == P1]
        F1_accns = [x for x in accns if x.split("-")[0] == F1]
        if len(P1_accns) != 1:
            continue

        ri, ref = neworder[P1_accns[0]]
        P1_accns = [neworder[x][-1] for x in F1_accns]
        bp_diff.extend(x.start - ref.start for x in P1_accns)
        bp_diff.extend(x.end - ref.end for x in P1_accns)

    print("A total of {0} sites show consistent deletions across samples.".\
                    format(percentage(valid, len(bed))), file=sys.stderr)
    for pf, count in total_counts.items():
        print("{0:>9}: {1:.2f} deletions/site".\
                    format(pf, count * 1. / valid), file=sys.stderr)

    F1_counts = Counter(F1_counts)

    # Plot the IES variant number diversity
    from jcvi.graphics.base import plt, savefig, set_ticklabels_helvetica

    fig = plt.figure(1, (iopts.w, iopts.h))
    if opts.diversity == "variant":
        left, height = zip(*sorted(F1_counts.items()))
        for l, h in zip(left, height):
            print("{0:>9} variants: {1}".format(l, h), file=sys.stderr)
            plt.text(l,
                     h + 5,
                     str(h),
                     color="darkslategray",
                     size=8,
                     ha="center",
                     va="bottom",
                     rotation=90)

        plt.bar(left, height, align="center")
        plt.xlabel("Identified number of IES per site")
        plt.ylabel("Counts")
        plt.title("IES variation in progeny pool")
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".counts.pdf")

    # Plot the IES breakpoint position diversity
    else:
        bp_diff = Counter(bp_diff)
        bp_diff_abs = Counter()
        for k, v in bp_diff.items():
            bp_diff_abs[abs(k)] += v
        plt.figure(1, (iopts.w, iopts.h))
        left, height = zip(*sorted(bp_diff_abs.items()))
        for l, h in zip(left, height)[:21]:
            plt.text(l,
                     h + 50,
                     str(h),
                     color="darkslategray",
                     size=8,
                     ha="center",
                     va="bottom",
                     rotation=90)

        plt.bar(left, height, align="center")
        plt.xlabel("Progeny breakpoint relative to SB210")
        plt.ylabel("Counts")
        plt.xlim(-.5, 20.5)
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".breaks.pdf")
        # Serialize the data to a file
        fw = open("Breakpoint-offset-histogram.csv", "w")
        for k, v in sorted(bp_diff.items()):
            print("{0},{1}".format(k, v), file=fw)
        fw.close()

        total = sum(height)
        zeros = bp_diff[0]
        within_20 = sum([v for i, v in bp_diff.items() if -20 <= i <= 20])
        print("No deviation: {0}".format(percentage(zeros, total)),
              file=sys.stderr)
        print(" Within 20bp: {0}".format(percentage(within_20, total)),
              file=sys.stderr)
Beispiel #7
0
def multihistogram(args):
    """
    %prog multihistogram *.histogram species

    Plot the histogram based on a set of K-mer hisotograms. The method is based
    on Star et al.'s method (Atlantic Cod genome paper).
    """
    p = OptionParser(multihistogram.__doc__)
    p.add_option("--kmin", default=15, type="int", help="Minimum K-mer size, inclusive")
    p.add_option("--kmax", default=30, type="int", help="Maximum K-mer size, inclusive")
    p.add_option("--vmin", default=2, type="int", help="Minimum value, inclusive")
    p.add_option("--vmax", default=100, type="int", help="Maximum value, inclusive")
    opts, args, iopts = p.set_image_options(args, figsize="10x5", dpi=300)

    histfiles = args[:-1]
    species = args[-1]
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    A = fig.add_axes([0.08, 0.12, 0.38, 0.76])
    B = fig.add_axes([0.58, 0.12, 0.38, 0.76])

    lines = []
    legends = []
    genomesizes = []
    for histfile in histfiles:
        ks = KmerSpectrum(histfile)
        x, y = ks.get_xy(opts.vmin, opts.vmax)
        K = get_number(op.basename(histfile).split(".")[0].split("-")[-1])
        if not opts.kmin <= K <= opts.kmax:
            continue

        line, = A.plot(x, y, "-", lw=1)
        lines.append(line)
        legends.append("K = {0}".format(K))
        ks.analyze(K=K)
        genomesizes.append((K, ks.genomesize / 1e6))

    leg = A.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)

    title = "{0} genome K-mer histogram".format(species)
    A.set_title(markup(title))
    xlabel, ylabel = "Coverage (X)", "Counts"
    A.set_xlabel(xlabel)
    A.set_ylabel(ylabel)
    set_human_axis(A)

    title = "{0} genome size estimate".format(species)
    B.set_title(markup(title))
    x, y = zip(*genomesizes)
    B.plot(x, y, "ko", mfc="w")
    t = np.linspace(opts.kmin - 0.5, opts.kmax + 0.5, 100)
    p = np.poly1d(np.polyfit(x, y, 2))
    B.plot(t, p(t), "r:")

    xlabel, ylabel = "K-mer size", "Estimated genome size (Mb)"
    B.set_xlabel(xlabel)
    B.set_ylabel(ylabel)
    set_ticklabels_helvetica(B)

    labels = ((0.04, 0.96, "A"), (0.54, 0.96, "B"))
    panel_labels(root, labels)

    normalize_axes(root)
    imagename = species + ".multiK.pdf"
    savefig(imagename, dpi=iopts.dpi, iopts=iopts)
Beispiel #8
0
def variation(args):
    """
    %prog variation P1.bed P2.bed F1.bed

    Associate IES in parents and progeny.
    """
    p = OptionParser(variation.__doc__)
    p.add_option("--diversity", choices=("breakpoint", "variant"),
                 default="variant", help="Plot diversity")
    opts, args, iopts = p.set_image_options(args, figsize="6x6")

    if len(args) != 3:
        sys.exit(not p.print_help())

    pfs = [op.basename(x).split('-')[0] for x in args]
    P1, P2, F1 = pfs
    newbedfile = "-".join(pfs) + ".bed"
    if need_update(args, newbedfile):
        newbed = Bed()
        for pf, filename in zip(pfs, args):
            bed = Bed(filename)
            for b in bed:
                b.accn = "-".join((pf, b.accn))
                b.score = None
                newbed.append(b)
        newbed.print_to_file(newbedfile, sorted=True)

    neworder = Bed(newbedfile).order
    mergedbedfile = mergeBed(newbedfile, nms=True)
    bed = Bed(mergedbedfile)
    valid = 0
    total_counts = Counter()
    F1_counts = []
    bp_diff = []
    novelbedfile = "novel.bed"
    fw = open(novelbedfile, "w")
    for b in bed:
        accns = b.accn.split(',')
        pfs_accns = [x.split("-")[0] for x in accns]
        pfs_counts = Counter(pfs_accns)
        if len(pfs_counts) != 3:
            print >> fw, b
            continue

        valid += 1
        total_counts += pfs_counts
        F1_counts.append(pfs_counts[F1])

        # Collect breakpoint positions between P1 and F1
        P1_accns = [x for x in accns if x.split("-")[0] == P1]
        F1_accns = [x for x in accns if x.split("-")[0] == F1]
        if len(P1_accns) != 1:
            continue

        ri, ref = neworder[P1_accns[0]]
        P1_accns = [neworder[x][-1] for x in F1_accns]
        bp_diff.extend(x.start - ref.start for x in P1_accns)
        bp_diff.extend(x.end - ref.end for x in P1_accns)

    print >> sys.stderr, \
            "A total of {0} sites show consistent deletions across samples.".\
                    format(percentage(valid, len(bed)))
    for pf, count in total_counts.items():
        print >> sys.stderr, "{0:>9}: {1:.2f} deletions/site".\
                    format(pf, count * 1. / valid)

    F1_counts = Counter(F1_counts)

    # Plot the IES variant number diversity
    from jcvi.graphics.base import plt, savefig, set_ticklabels_helvetica

    fig = plt.figure(1, (iopts.w, iopts.h))
    if opts.diversity == "variant":
        left, height = zip(*sorted(F1_counts.items()))
        for l, h in zip(left, height):
            print >> sys.stderr, "{0:>9} variants: {1}".format(l, h)
            plt.text(l, h + 5, str(h), color="darkslategray", size=8,
                     ha="center", va="bottom", rotation=90)

        plt.bar(left, height, align="center")
        plt.xlabel("Identified number of IES per site")
        plt.ylabel("Counts")
        plt.title("IES variation in progeny pool")
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".counts.pdf")

    # Plot the IES breakpoint position diversity
    else:
        bp_diff = Counter(bp_diff)
        bp_diff_abs = Counter()
        for k, v in bp_diff.items():
            bp_diff_abs[abs(k)] += v
        plt.figure(1, (iopts.w, iopts.h))
        left, height = zip(*sorted(bp_diff_abs.items()))
        for l, h in zip(left, height)[:21]:
            plt.text(l, h + 50, str(h), color="darkslategray", size=8,
                     ha="center", va="bottom", rotation=90)

        plt.bar(left, height, align="center")
        plt.xlabel("Progeny breakpoint relative to SB210")
        plt.ylabel("Counts")
        plt.xlim(-.5, 20.5)
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".breaks.pdf")
        # Serialize the data to a file
        fw = open("Breakpoint-offset-histogram.csv", "w")
        for k, v in sorted(bp_diff.items()):
            print >> fw, "{0},{1}".format(k, v)
        fw.close()

        total = sum(height)
        zeros = bp_diff[0]
        within_20 = sum([v for i, v in bp_diff.items() if -20 <= i <= 20])
        print >> sys.stderr, "No deviation: {0}".format(percentage(zeros, total))
        print >> sys.stderr, " Within 20bp: {0}".format(percentage(within_20, total))