Example #1
0
    def draw(self, roundrect=False, plot_label=True, plot_circles=True,
             pad=.03, vpad=.09):
        """Draw the chromosomes of this track, their circles and the label.

        Each seqid in ``self.seqids`` becomes a HorizontalChromosome whose
        width is ``self.ratio * self.sizes[seqid]``; consecutive chromosomes
        are separated by ``self.gap``.  Nothing is drawn when ``self.empty``
        is set.

        Args:
            roundrect: passed through to HorizontalChromosome.
            plot_label: whether to write the track label.
            plot_circles: whether to draw the numbered circles.
            pad: vertical distance between the track and the circles; the
                circles go below the track when ``self.va`` is "bottom".
            vpad: distance between the track and its label.
        """
        if self.empty:
            return

        y = self.y
        color = self.color
        ax = self.ax
        xstart = self.xstart
        gap = self.gap
        va = self.va
        nseqids = len(self.seqids)
        tr = self.tr

        def make_circle_name(sid):
            """Return the short text shown in the circle for seqid `sid`."""
            short = sid.rsplit("_", 1)[-1]
            # Previously only the shortened id was tested against self.rev, so
            # a reversed seqid containing "_" could never match by its full
            # name.  Both forms are accepted to stay compatible with either
            # way of keying self.rev.
            reverse = sid in self.rev or short in self.rev
            digits = "".join(x for x in short if x in string.digits)
            if digits:
                si = str(int(digits))  # int() drops leading zeros
            else:
                # No digit in the name (e.g. "chrX"): int("") would raise and
                # abort the whole figure, so fall back to one letter instead.
                letters = short.replace("chr", "").replace("Chr", "")
                si = (letters or short)[:1].upper()
            if reverse:
                si += '-'
            return si

        for i, sid in enumerate(self.seqids):
            size = self.sizes[sid]
            rsize = self.ratio * size
            xend = xstart + rsize
            hc = HorizontalChromosome(ax, xstart, xend, y,
                                      height=self.height, lw=self.lw, fc=color,
                                      roundrect=roundrect)
            hc.set_transform(tr)
            si = make_circle_name(sid)
            xx = (xstart + xend) / 2  # circle sits at the chromosome midpoint
            xstart = xend + gap

            # On crowded tracks only every `step`-th chromosome gets a circle;
            # tracks with fewer than 5 seqids get none at all.
            step = 2 if nseqids <= 40 else 10
            if nseqids >= 2 * MaxSeqids and (i + 1) % step != 0:
                continue
            if nseqids < 5:
                continue

            hpad = -pad if va == "bottom" else pad
            if plot_circles:
                TextCircle(ax, xx, y + hpad, si, radius=.01,
                           fc="w", color=color, size=10, transform=tr)

        label = markup(self.label)
        # "gainsboro" tracks get a black label instead of the track colour.
        c = color if color != "gainsboro" else "k"
        if plot_label:
            if self.label_va == "top":
                x, y = self.x, self.y + vpad
            elif self.label_va == "bottom":
                x, y = self.x, self.y - vpad
            else:  # "center"
                x, y = self.xstart - vpad, self.y
            ax.text(x, y, label, ha="center", va="center", color=c, transform=tr)
Example #2
0
File: ks.py Project: ascendo/jcvi
    def draw(self, title="*Ks* distribution", filename="Ks_plot.pdf"):
        """Decorate the Ks axes and write the figure to `filename`.

        A legend is added only when more than one line is registered in
        ``self.lines``.  `title` is passed through markup() before use.
        """
        axes = self.ax
        upper = self.ks_max
        handles, names = self.lines, self.labels
        where = self.legendp

        if len(handles) > 1:
            legend = axes.legend(handles, names, loc=where, shadow=True,
                                 fancybox=True, prop={"size": 10})
            legend.get_frame().set_alpha(.5)

        axes.set_xlim((0, upper - self.interval))
        axes.set_title(markup(title), fontweight="bold")
        axes.set_xlabel(markup('Synonymous substitutions per site (*Ks*)'))
        axes.set_ylabel('Percentage of gene pairs')

        # Reuse the current tick positions as tick labels, set in Helvetica.
        for relabel, ticks in ((axes.set_xticklabels, axes.get_xticks),
                               (axes.set_yticklabels, axes.get_yticks)):
            relabel(ticks(), family='Helvetica')

        savefig(filename, dpi=300)
Example #3
0
File: ks.py Project: ascendo/jcvi
    def add_data(self, data, components=1, label="Ks",
                 color='r', marker='.', fill=False, fitted=True):
        """Plot one Ks data set on ``self.ax`` and record its legend entries.

        plot_ks_dist() returns the data line and the mixture line.  The data
        line is always recorded; the mixture line is recorded, with
        " (fitted)" appended to its label, only when `fitted` is true.

        Only the labels added by this call go through markup().  The earlier
        code re-ran markup() over every stored label on each call, so labels
        from previous calls were processed again and again.
        """
        line, line_mixture = plot_ks_dist(self.ax, data, self.interval,
                                          components, self.ks_max,
                                          color=color, marker=marker,
                                          fill=fill, fitted=fitted)
        self.lines.append(line)
        self.labels.append(markup(label))

        if fitted:
            self.lines.append(line_mixture)
            self.labels.append(markup(label + " (fitted)"))
Example #4
0
    def draw(self, roundrect=False, plot_label=True):
        """Draw the chromosomes of this track, numbered circles and the label.

        Each seqid in ``self.seqids`` is drawn as a HorizontalChromosome of
        width ``self.ratio * self.sizes[seqid]``, separated by ``self.gap``.
        Nothing is drawn when ``self.empty`` is set.

        Args:
            roundrect: passed through to HorizontalChromosome.
            plot_label: whether to write the track label.
        """
        if self.empty:
            return

        y = self.y
        color = self.color
        ax = self.ax
        xstart = self.xstart
        gap = self.gap
        va = self.va
        nseqids = len(self.seqids)
        tr = self.tr
        # Circles go below the track when it is bottom-aligned.
        pad = -.02 if va == "bottom" else .02
        for i, sid in enumerate(self.seqids):
            size = self.sizes[sid]
            rsize = self.ratio * size
            xend = xstart + rsize
            hc = HorizontalChromosome(ax, xstart, xend, y,
                                      height=self.height, lw=self.lw, fc=color,
                                      roundrect=roundrect)
            hc.set_transform(tr)
            sid = sid.rsplit("_", 1)[-1]
            # string.letters exists only on Python 2; string.ascii_letters is
            # available on both Python 2 and Python 3.
            si = "".join(x for x in sid if x not in string.ascii_letters)
            try:
                si = str(int(si))  # int() drops leading zeros
            except ValueError:
                # Nothing numeric is left (e.g. "chrX"); use one letter rather
                # than aborting the whole figure.
                si = sid[:1].upper()
            xx = (xstart + xend) / 2  # circle sits at the chromosome midpoint
            xstart = xend + gap

            # Crowded tracks label every tenth chromosome only; tracks with
            # fewer than 5 seqids get no circles.
            if nseqids > 2 * MaxSeqids and (i + 1) % 10 != 0:
                continue
            if nseqids < 5:
                continue

            TextCircle(ax, xx, y + pad, si, radius=.01,
                       fc="w", color=color, size=10, transform=tr)

        # The label goes on the left when the track midpoint is in the left
        # half of the canvas, otherwise on the right.
        xp = min(self.xstart / 2, .1) if (self.xstart + self.xend) / 2 <= .5 \
                                      else max(1 - self.xend / 2, .92)
        label = markup(self.label)
        # "gainsboro" tracks get a black label instead of the track colour.
        c = color if color != "gainsboro" else "k"
        if plot_label:
            ax.text(xp, y + self.height * .6, label,
                    ha="center", color=c, transform=tr)
Example #5
0
def cartoon(args):
    """
    %prog synteny.py

    Generate cartoon illustration of SynFind.
    """
    # The docstring above is handed to OptionParser below, so its wording is
    # runtime text and must not be edited casually.
    p = OptionParser(cartoon.__doc__)
    # iopts carries the image settings read further down: w, h, format, dpi.
    opts, args, iopts = p.set_image_options(args, figsize="10x7")

    fig = plt.figure(1, (iopts.w, iopts.h))
    # One axes covering the whole figure; its limits are fixed to [0, 1] at
    # the end, so the coordinates used below are fractions of the figure.
    root = fig.add_axes([0, 0, 1, 1])

    # Panel A
    A = CartoonRegion(41)
    A.draw(root, .35, .85, strip=False, color=False)
    x1, x2 = A.x1, A.x2
    lsg = "lightslategray"
    pad = .01
    xc, yc = .35, .88
    # Each arrow runs from just beside the query marker to the region edge.
    arrowlen = x2 - xc - pad
    arrowprops = dict(length_includes_head=True, width=.01, fc=lsg, lw=0,
                      head_length=arrowlen * .15, head_width=.03)
    # NOTE: `p` is rebound here; from now on it holds arrow patches, not the
    # option parser.
    p = FancyArrow(xc - pad, yc, -arrowlen, 0, shape="left", **arrowprops)
    root.add_patch(p)
    p = FancyArrow(xc + pad, yc, arrowlen, 0, shape="right", **arrowprops)
    root.add_patch(p)

    yt = yc + 4 * pad
    root.text((x1 + xc) / 2, yt, "20 genes upstream", ha="center")
    root.text((x2 + xc) / 2, yt, "20 genes downstream", ha="center")
    # White-filled marker at the query gene position.
    root.plot((xc,), (yc,), "o", mfc='w', mec=lsg, mew=2, lw=2, color=lsg)
    root.text(xc, yt, "Query gene", ha="center")

    # Panel B
    A.draw(root, .35, .7, strip=False)

    RoundRect(root, (.07, .49), .56, .14, fc='y', alpha=.2)
    # a, b and c are independent deep copies of A, each evolved with its own
    # mode and target (presumably evolve() changes the copy in place - confirm
    # against CartoonRegion).
    a = deepcopy(A)
    a.evolve(mode='S', target=10)
    a.draw(root, .35, .6)
    b = deepcopy(A)
    b.evolve(mode='F', target=8)
    b.draw(root, .35, .56)
    c = deepcopy(A)
    c.evolve(mode='G', target=6)
    c.draw(root, .35, .52)

    # Score label to the right of each evolved region, at that region's y.
    for x in (a, b, c):
        root.text(.64, x.y, "Score={0}".format(x.nonwhites), va="center")

    # Panel C
    A.truncate_between_flankers()
    a.truncate_between_flankers()
    b.truncate_between_flankers()
    c.truncate_between_flankers(target=6)

    plot_diagram(root, .14, .2, A, a, "S", "syntenic")
    plot_diagram(root, .37, .2, A, b, "F", "missing, with both flankers")
    plot_diagram(root, .6, .2, A, c, "G", "missing, with one flanker")

    labels = ((.04, .95, 'A'), (.04, .75, 'B'), (.04, .4, 'C'))
    panel_labels(root, labels)

    # Descriptions
    xt = .85
    desc = ("Extract neighborhood",
            "of *window* size",
            "Count gene pairs within *window*",
            "Find regions above *score* cutoff",
            "Identify flankers",
            "Annotate syntelog class"
            )
    # One y position per description line; `yt` is reused as the loop variable.
    for yt, t in zip((.88, .84, .64, .6, .3, .26), desc):
        root.text(xt, yt, markup(t), ha="center", va="center")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    # Output file is always named "cartoon.<format>".
    pf = "cartoon"
    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Example #6
0
def multihistogram(args):
    """
    %prog multihistogram *.histogram species

    Plot the histogram based on a set of K-mer hisotograms. The method is based
    on Star et al.'s method (Atlantic Cod genome paper).
    """
    # NOTE: the docstring is also handed to OptionParser, so it is left
    # exactly as it was.
    parser = OptionParser(multihistogram.__doc__)
    int_options = (
        ("--kmin", 15, "Minimum K-mer size, inclusive"),
        ("--kmax", 30, "Maximum K-mer size, inclusive"),
        ("--vmin", 2, "Minimum value, inclusive"),
        ("--vmax", 100, "Maximum value, inclusive"),
    )
    for flag, default, helptext in int_options:
        parser.add_option(flag, default=default, type="int", help=helptext)
    opts, args, iopts = parser.set_image_options(args, figsize="10x5", dpi=300)

    # Every positional argument but the last is a histogram file.
    histfiles, species = args[:-1], args[-1]
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    A = fig.add_axes([0.08, 0.12, 0.38, 0.76])
    B = fig.add_axes([0.58, 0.12, 0.38, 0.76])

    curves, captions, estimates = [], [], []
    for histfile in histfiles:
        spectrum = KmerSpectrum(histfile)
        cov, cnt = spectrum.get_xy(opts.vmin, opts.vmax)
        # K is read from the file name: the text before the first ".", then
        # the piece after the last "-".
        stem = op.basename(histfile).split(".")[0]
        K = get_number(stem.split("-")[-1])
        if not opts.kmin <= K <= opts.kmax:
            continue

        (curve,) = A.plot(cov, cnt, "-", lw=1)
        curves.append(curve)
        captions.append("K = {0}".format(K))
        spectrum.analyze(K=K)
        estimates.append((K, spectrum.genomesize / 1e6))

    legend = A.legend(curves, captions, shadow=True, fancybox=True)
    legend.get_frame().set_alpha(0.5)

    # Left panel: the K-mer spectra themselves.
    A.set_title(markup("{0} genome K-mer histogram".format(species)))
    A.set_xlabel("Coverage (X)")
    A.set_ylabel("Counts")
    set_human_axis(A)

    # Right panel: genome size estimate per K plus a quadratic fit.
    B.set_title(markup("{0} genome size estimate".format(species)))
    kvalues, sizes_mb = zip(*estimates)
    B.plot(kvalues, sizes_mb, "ko", mfc="w")
    grid = np.linspace(opts.kmin - 0.5, opts.kmax + 0.5, 100)
    trend = np.poly1d(np.polyfit(kvalues, sizes_mb, 2))
    B.plot(grid, trend(grid), "r:")

    B.set_xlabel("K-mer size")
    B.set_ylabel("Estimated genome size (Mb)")
    set_ticklabels_helvetica(B)

    panel_labels(root, ((0.04, 0.96, "A"), (0.54, 0.96, "B")))

    normalize_axes(root)
    savefig(species + ".multiK.pdf", dpi=iopts.dpi, iopts=iopts)
Example #7
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # The docstring above is also handed to OptionParser, so it is kept
    # verbatim.  Returns the estimated genome size (an int); exits through
    # sys.exit() when the number of positional arguments is not three.
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int", help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int", help="maximum value, inclusive [default: %default]")
    p.add_option(
        "--pdf", default=False, action="store_true", help="Print PDF instead of ASCII plot [default: %default]"
    )
    p.add_option("--coverage", default=0, type="int", help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true", help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    # Renamed from `ascii` so the builtin of that name is not shadowed.
    plot_ascii = not opts.pdf
    peaks = not opts.nopeaks
    N = int(N)

    # A meryl index is first converted by meryl(); its return value replaces
    # the input path.
    if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = meryl([histfile])

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # --coverage overrides the automatically detected value (ks.max2).
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    # `print >> sys.stderr, msg` only works as a Python 2 statement; on
    # Python 3 it raises TypeError.  Writing to the stream directly behaves
    # the same on both.
    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        sys.stderr.write(msg + "\n")

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if plot_ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    if peaks:
        # Mark the landmark coverages that actually occur in ks.counts.
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(cov, cnt) for cov, cnt in ks.counts if cov in t]
        if tcounts:
            x, y = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(x, y, "ko", lw=2, mec="k", mfc="w")
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    messages = [Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg, Repetitive_msg, SNPrate_msg]
    write_messages(ax, messages)

    # Stretch the y range by one sixth at the top.
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title))
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    set_human_axis(ax)

    # Output name: everything before the first "." of histfile, plus ".pdf".
    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
Example #8
0
def dotplot(anchorfile, qbed, sbed, fig, root, ax, vmin=0, vmax=1,
        is_self=False, synteny=False, cmap_text=None, cmap="copper",
        genomenames=None, sample_number=10000, minfont=5, palette=None,
        chrlw=.1, title=None, sep=True, sepcolor="g", stdpf=True):
    """Scatter the gene pairs listed in `anchorfile` onto `ax`.

    Rows starting with "#" open a new block; other rows hold a query name, a
    subject name and, as last column, a value.  Pairs whose names are missing
    from ``qbed.order`` / ``sbed.order`` are skipped.

    Args:
        anchorfile: path of the anchor file to read.
        qbed, sbed: query and subject beds; their ``order``, ``filename``,
            ``get_breaks()`` and length are used.
        fig, root, ax: figure, canvas axes and plotting axes.
        vmin, vmax: with `cmap_text`, values outside [vmin, vmax] are dropped.
        is_self: mirror every pair and draw a diagonal.
        synteny: additionally box the clusters found by batch_scan().
        cmap_text: when set, the last column is read as a float and a colour
            bar is drawn; a non-numeric value counts as `vmax`.
        cmap: colour map for the scatter (used when no palette is given).
        genomenames: "x_y" names for the axes; defaults to the bed file names.
        sample_number: forwarded to downsample().
        palette: optional per-block colours, looked up by block number.
        title: figure title; a default one is composed when None.
        minfont, chrlw, sep, sepcolor, stdpf: forwarded to
            plot_breaks_and_labels().
    """
    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        to_ax_label = lambda fname: op.basename(fname).split(".")[0]
        gx, gy = [to_ax_label(x.filename) for x in (qbed, sbed)]
    gx, gy = markup(gx), markup(gy)

    qorder = qbed.order
    sorder = sbed.order

    data = []
    if cmap_text:
        logging.debug("Capping values within [{0:.1f}, {1:.1f}]"\
                        .format(vmin, vmax))

    block_id = 0
    # The block colour must survive across the rows of a block.  It used to
    # be reset to None at the top of every iteration, which discarded the
    # palette colour before any pair of that block was stored.
    block_color = None
    # `with` closes the anchor file, which was previously left open.
    with open(anchorfile) as fp:
        for row in fp:
            atoms = row.split()
            if row[0] == "#":
                block_id += 1
                block_color = palette.get(block_id, "k") if palette else None
                continue

            # first two columns are query and subject, and an optional third column
            if len(atoms) < 2:
                continue

            query, subject = atoms[:2]
            value = atoms[-1]

            if cmap_text:
                try:
                    value = float(value)
                except ValueError:
                    value = vmax

                if value < vmin:
                    continue
                if value > vmax:
                    continue
            else:
                value = 0

            if query not in qorder:
                continue
            if subject not in sorder:
                continue

            qi, q = qorder[query]
            si, s = sorder[subject]

            nv = value if block_color is None else block_color
            data.append((qi, si, nv))
            if is_self:  # Mirror image
                data.append((si, qi, nv))

    npairs = downsample(data, sample_number=sample_number)
    x, y, c = zip(*data)

    if palette:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)
    else:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0, cmap=cmap,
                vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=cmap)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))
    qbreaks = qbed.get_breaks()
    sbreaks = sbed.get_breaks()
    xlim, ylim = plot_breaks_and_labels(fig, root, ax, gx, gy, xsize, ysize,
                           qbreaks, sbreaks, sep=sep, chrlw=chrlw,
                           sepcolor=sepcolor, minfont=minfont, stdpf=stdpf)

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    if palette:  # bottom-left has the palette, if available
        colors = palette.colors
        xstart, ystart = .1, .05
        for category, c in sorted(colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0, fc=c))
            root.text(xstart + .04, ystart, category, color=c)
            xstart += .1

    if title is None:
        title = "Inter-genomic comparison: {0} vs {1}".format(gx, gy)
        if is_self:
            title = "Intra-genomic comparison within {0}".format(gx)
            # Mirrored pairs were counted twice.  Floor division keeps the
            # count an integer on Python 3 as well ("/" would give a float).
            npairs //= 2
        title += " ({0} gene pairs)".format(thousands(npairs))
    root.set_title(title, x=.5, y=.96, color="k")
    if title:
        logging.debug("Dot plot title: {}".format(title))
    normalize_axes(root)
Example #9
0
    def __init__(self, ax, ext, layout, bed, scale, switch=None, chr_label=True,
                 pad=.04, vpad=.012):
        """Draw one region (backbone, gene glyphs, labels) and record positions.

        Args:
            ax: axes to draw on.
            ext: 7-tuple unpacked below as
                (start, end, si, ei, chr, orientation, span).
            layout: supplies x, y, ratio, rotation, hidden, ha, va and color.
            bed: indexable collection; ``bed[si: ei + 1]`` gives the genes.
            scale: divisor turning base pairs into axes units; it is first
                divided by ``layout.ratio``.
            switch: optional mapping used to replace the chromosome name.
            chr_label: whether to write the chromosome and range labels.
            pad: vertical label offset for va "top" / "bottom".
            vpad: half the vertical distance between the two label lines.

        Sets ``self.y``, ``self.xstart``, ``self.xend``, ``self.genes`` and
        ``self.gg`` (gene accession -> pair of end points).
        """
        x, y = layout.x, layout.y
        ratio = layout.ratio
        scale /= ratio
        self.y = y
        lr = layout.rotation
        # tr: rotate by `lr` degrees around the region centre, then map axes
        # fractions to display; inv maps display back to axes fractions.
        tr = Affine2D().rotate_deg_around(x, y, lr) + ax.transAxes
        inv = ax.transAxes.inverted()

        start, end, si, ei, chr, orientation, span = ext
        # The region is centred on x and is span / scale wide.
        flank = span / scale / 2
        xstart, xend = x - flank, x + flank
        self.xstart, self.xend = xstart, xend

        # Base-pair coordinate -> x position.  `startbp` is only assigned
        # further down; that works because the lambda looks it up when it is
        # called, and it is first called inside the gene loop.
        cv = lambda t: xstart + abs(t - startbp) / scale
        hidden = layout.hidden

        # Chromosome
        if not hidden:
            ax.plot((xstart, xend), (y, y), color="gray", transform=tr, \
                    lw=2, zorder=1)

        self.genes = genes = bed[si: ei + 1]
        startbp, endbp = start.start, end.end
        # A "-" region is anchored at its end coordinate instead.
        if orientation == '-':
            startbp, endbp = endbp, startbp

        if switch:
            chr = switch.get(chr, chr)
        # The last two characters of the first size are dropped before
        # joining (presumably the unit suffix - confirm against human_size).
        label = "-".join((human_size(startbp, target="Mb")[:-2],
                          human_size(endbp, target="Mb")))

        height = .012
        self.gg = {}
        # Genes
        for g in genes:
            gstart, gend = g.start, g.end
            strand = g.strand
            if strand == '-':
                gstart, gend = gend, gstart
            # Strand is flipped when the whole region is reversed.
            if orientation == '-':
                strand = "+" if strand == "-" else "-"

            x1, x2 = cv(gstart), cv(gend)
            # Gene end points after rotation, expressed in axes fractions.
            a, b = tr.transform((x1, y)), tr.transform((x2, y))
            a, b = inv.transform(a), inv.transform(b)
            self.gg[g.accn] = (a, b)

            color = "b" if strand == "+" else "g"
            if not hidden:
                gp = Glyph(ax, x1, x2, y, height, gradient=False, fc=color, zorder=3)
                gp.set_transform(tr)

        ha, va = layout.ha, layout.va

        # layout.ha names the side of the region the label goes on; the text
        # alignment is the opposite so the text grows away from the region.
        hpad = .02
        if ha == "left":
            xx = xstart - hpad
            ha = "right"
        elif ha == "right":
            xx = xend + hpad
            ha = "left"
        else:
            xx = x
            ha = "center"

        # Tentative solution to labels stick into glyph
        magic = 40.
        # Rotations beyond `magic` degrees scale the vertical padding up.
        cc = abs(lr) / magic if abs(lr) > magic else 1
        if va == "top":
            yy = y + cc * pad
        elif va == "bottom":
            yy = y - cc * pad
        else:
            yy = y

        l = np.array((xx, yy))
        # Text rotation angle for `lr` as given by transform_angles at the
        # label position.
        trans_angle = ax.transAxes.transform_angles(np.array((lr, )),
                                                    l.reshape((1, 2)))[0]
        lx, ly = l
        # Two stacked lines: chromosome name above, base-pair range below.
        # Neither call passes a transform; only the text angle is rotated.
        if not hidden and chr_label:
            ax.text(lx, ly + vpad, markup(chr), color=layout.color,
                        ha=ha, va="center", rotation=trans_angle)
            ax.text(lx, ly - vpad, label, color="k",
                        ha=ha, va="center", rotation=trans_angle)
Example #10
0
    def draw(self, roundrect=False, plot_label=True, pad=.03, vpad=.09):
        """Draw the chromosomes of this track, numbered circles and the label.

        Each seqid in ``self.seqids`` is drawn as a HorizontalChromosome of
        width ``self.ratio * self.sizes[seqid]``, separated by ``self.gap``.
        Nothing is drawn when ``self.empty`` is set.

        Args:
            roundrect: passed through to HorizontalChromosome.
            plot_label: whether to write the track label.
            pad: vertical distance between the track and the circles; the
                circles go below the track when ``self.va`` is "bottom".
            vpad: distance between the track and its label.
        """
        if self.empty:
            return

        y = self.y
        color = self.color
        ax = self.ax
        xstart = self.xstart
        gap = self.gap
        nseqids = len(self.seqids)
        tr = self.tr
        hpad = -pad if self.va == "bottom" else pad
        # Only applied on crowded tracks, see the first `continue` below.
        step = 2 if nseqids <= 40 else 10
        for i, sid in enumerate(self.seqids):
            size = self.sizes[sid]
            rsize = self.ratio * size
            xend = xstart + rsize
            hc = HorizontalChromosome(ax,
                                      xstart,
                                      xend,
                                      y,
                                      height=self.height,
                                      lw=self.lw,
                                      fc=color,
                                      roundrect=roundrect)
            hc.set_transform(tr)
            sid = sid.rsplit("_", 1)[-1]
            si = "".join(x for x in sid if x in string.digits)
            if si:
                si = str(int(si))  # int() drops leading zeros
            else:
                # No digit in the name (e.g. "chrX"): int("") would raise and
                # abort the whole figure, so fall back to one letter instead.
                letters = sid.replace("chr", "").replace("Chr", "")
                si = (letters or sid)[:1].upper()
            xx = (xstart + xend) / 2  # circle sits at the chromosome midpoint
            xstart = xend + gap

            if nseqids >= 2 * MaxSeqids and (i + 1) % step != 0:
                continue
            if nseqids < 5:
                continue

            TextCircle(ax,
                       xx,
                       y + hpad,
                       si,
                       radius=.01,
                       fc="w",
                       color=color,
                       size=10,
                       transform=tr)

        label = markup(self.label)
        # "gainsboro" tracks get a black label instead of the track colour.
        c = color if color != "gainsboro" else "k"
        if plot_label:
            # The former per-branch `va = ...` assignments were never read:
            # the text is always centred vertically, so they were removed.
            if self.label_va == "top":
                x, y = self.x, self.y + vpad
            elif self.label_va == "bottom":
                x, y = self.x, self.y - vpad
            else:  # "center"
                x, y = self.xstart - vpad, self.y
            ax.text(x,
                    y,
                    label,
                    ha="center",
                    va="center",
                    color=c,
                    transform=tr)
Example #11
0
def multihistogram(args):
    """
    %prog multihistogram *.histogram species

    Plot the histogram based on a set of K-mer hisotograms. The method is based
    on Star et al.'s method (Atlantic Cod genome paper).
    """
    # NOTE: the docstring doubles as the text given to OptionParser and is
    # therefore reproduced unchanged.
    parser = OptionParser(multihistogram.__doc__)
    parser.add_option("--kmin", default=15, type="int",
                      help="Minimum K-mer size, inclusive")
    parser.add_option("--kmax", default=30, type="int",
                      help="Maximum K-mer size, inclusive")
    parser.add_option("--vmin", default=2, type="int",
                      help="Minimum value, inclusive")
    parser.add_option("--vmax", default=100, type="int",
                      help="Maximum value, inclusive")
    opts, args, iopts = parser.set_image_options(args, figsize="10x5", dpi=300)

    # The species name comes last; everything before it is a histogram file.
    histfiles, species = args[:-1], args[-1]
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    A = fig.add_axes([.08, .12, .38, .76])
    B = fig.add_axes([.58, .12, .38, .76])

    plotted, names, size_by_k = [], [], []
    for histfile in histfiles:
        spectrum = KmerSpectrum(histfile)
        coverage, counts = spectrum.get_xy(opts.vmin, opts.vmax)
        # K comes from the file name: text before the first ".", then the
        # piece after the last "-".
        prefix = op.basename(histfile).split(".")[0]
        K = get_number(prefix.split("-")[-1])
        if not opts.kmin <= K <= opts.kmax:
            continue

        (handle,) = A.plot(coverage, counts, '-', lw=1)
        plotted.append(handle)
        names.append("K = {0}".format(K))
        spectrum.analyze(K=K)
        size_by_k.append((K, spectrum.genomesize / 1e6))

    frame = A.legend(plotted, names, shadow=True, fancybox=True).get_frame()
    frame.set_alpha(.5)

    # Panel A: one spectrum per accepted K.
    A.set_title(markup("{0} genome K-mer histogram".format(species)))
    A.set_xlabel("Coverage (X)")
    A.set_ylabel("Counts")
    set_human_axis(A)

    # Panel B: size estimates against K with a second-degree polynomial fit.
    B.set_title(markup("{0} genome size estimate".format(species)))
    ks_seen, megabases = zip(*size_by_k)
    B.plot(ks_seen, megabases, "ko", mfc='w')
    sample_points = np.linspace(opts.kmin - .5, opts.kmax + .5, 100)
    fit = np.poly1d(np.polyfit(ks_seen, megabases, 2))
    B.plot(sample_points, fit(sample_points), "r:")

    B.set_xlabel("K-mer size")
    B.set_ylabel("Estimated genome size (Mb)")
    set_ticklabels_helvetica(B)

    panel_labels(root, ((.04, .96, 'A'), (.54, .96, 'B')))

    normalize_axes(root)
    savefig(species + ".multiK.pdf", dpi=iopts.dpi, iopts=iopts)
Example #12
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic.
    """
    # The docstring is also handed to OptionParser, hence left untouched.
    # The function returns the estimated genome size as an int.
    parser = OptionParser(histogram.__doc__)
    parser.add_option("--vmin", dest="vmin", default=1, type="int",
                      help="minimum value, inclusive")
    parser.add_option("--vmax", dest="vmax", default=100, type="int",
                      help="maximum value, inclusive")
    parser.add_option("--pdf", default=False, action="store_true",
                      help="Print PDF instead of ASCII plot")
    parser.add_option("--coverage", default=0, type="int",
                      help="Kmer coverage [default: auto]")
    parser.add_option("--nopeaks", default=False, action="store_true",
                      help="Do not annotate K-mer peaks")
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    histfile, species, N = args
    ascii_only = not opts.pdf
    annotate_peaks = not opts.nopeaks
    N = int(N)

    # A meryl index is converted first; the result replaces the input path.
    extension = histfile.rsplit(".", 1)[-1]
    if extension in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = merylhistogram(histfile)

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    # An explicit --coverage wins over the detected value (ks.max2).
    Kmer_coverage = opts.coverage if opts.coverage else ks.max2
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    summary = [
        "Total {0}-mers: {1}".format(N, thousands(Total_Kmers)),
        "{0}-mer coverage: {1}".format(N, Kmer_coverage),
        "Estimated genome size: {0:.1f}Mb".format(Genome_size / 1e6),
    ]
    extras = [ks.repetitive, ks.snprate]

    # Only the three summary lines are echoed to stderr.
    for line in summary:
        print(line, file=sys.stderr)

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if ascii_only:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, "g-", lw=2, alpha=0.5)
    ax = plt.gca()

    if annotate_peaks:
        # Landmark coverages that actually occur in ks.counts get a marker.
        landmarks = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        marked = [(cov, cnt) for cov, cnt in ks.counts if cov in landmarks]
        if marked:
            marked_x, marked_y = zip(*marked)
            height_at = dict(marked)
            plt.plot(marked_x, marked_y, "ko", lw=2, mec="k", mfc="w")
            ax.text(ks.max1, height_at[ks.max1], "SNP peak", va="top")
            ax.text(ks.max2, height_at[ks.max2], "Main peak")

    write_messages(ax, summary + extras)

    # Stretch the y range by one sixth at the top.
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title))
    ax.set_ylim((ymin, ymax))
    ax.set_xlabel("Coverage (X)")
    ax.set_ylabel("Counts")
    set_human_axis(ax)

    # Output name: everything before the first "." of histfile, plus ".pdf".
    savefig(histfile.split(".")[0] + ".pdf", dpi=100)

    return Genome_size
Example #13
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # The docstring above is also handed to OptionParser, so it is kept
    # verbatim.  Returns the estimated genome size (an int); exits through
    # sys.exit() when the number of positional arguments is not three.
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin", dest="vmin", default=1, type="int",
            help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax", dest="vmax", default=100, type="int",
            help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf", default=False, action="store_true",
            help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage", default=0, type="int",
            help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks", default=False, action="store_true",
            help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    N = int(N)
    KMERYL, KSOAP, KALLPATHS = range(3)
    kformats = ("Meryl", "Soap", "AllPaths")
    kformat = KMERYL

    # Renamed from `ascii` so the builtin of that name is not shadowed.
    plot_ascii = not opts.pdf
    peaks = not opts.nopeaks
    hist = {}
    totalKmers = 0
    data = []

    # `with` closes the histogram file, which was previously left open.
    with open(histfile) as fp:
        # Guess the format of the Kmer histogram
        for row in fp:
            if row.startswith("# 1:"):
                kformat = KALLPATHS
                break
            if len(row.split()) == 1:
                kformat = KSOAP
                break
        fp.seek(0)

        logging.debug("Guessed format: {0}".format(kformats[kformat]))

        for rowno, row in enumerate(fp):
            if row[0] == '#':
                continue
            if kformat == KSOAP:
                # One count per line; the multiplicity is the line number.
                K = rowno + 1
                counts = int(row.strip())
            else:  # meryl histogram
                K, counts = row.split()[:2]
                K, counts = int(K), int(counts)

            Kcounts = K * counts
            totalKmers += Kcounts
            hist[K] = Kcounts
            data.append((K, counts))

    covmax = 1000000
    ks = KmerSpectrum(data)
    ks.analyze(K=N, covmax=covmax)

    Total_Kmers = int(totalKmers)
    coverage = opts.coverage
    # --coverage overrides the automatically detected value (ks.max2).
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1. / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".\
                        format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    # `print >> sys.stderr, msg` only works as a Python 2 statement; on
    # Python 3 it raises TypeError.  Writing to the stream directly behaves
    # the same on both.
    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        sys.stderr.write(msg + "\n")

    # Plotted values are K * counts, restricted to [vmin, vmax].
    counts = sorted((a, b) for a, b in hist.items() \
                    if opts.vmin <= a <= opts.vmax)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if plot_ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    # Mark the landmark coverages that fall inside the plotted range.
    t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
    tcounts = [(cov, cnt) for cov, cnt in counts if cov in t]
    x, y = zip(*tcounts)
    plt.plot(x, y, 'ko', lw=2, mec='k', mfc='w')
    tcounts = dict(tcounts)

    if peaks:
        ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
        ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    # Summary lines, right-aligned in the top-right corner of the axes.
    tc = "gray"
    axt = ax.transAxes
    ax.text(.95, .95, Total_Kmers_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .9, Kmer_coverage_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .85, Genome_size_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .8, Repetitive_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .75, SNPrate_msg, color=tc, transform=axt, ha="right")

    # Stretch the y range by one sixth at the top.
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    # Output name: everything before the first "." of histfile, plus ".pdf".
    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
Example #14
0
def draw_chromosomes(
    root,
    bedfile,
    sizes,
    iopts,
    mergedist,
    winsize,
    imagemap,
    mappingfile=None,
    gauge=False,
    legend=True,
    empty=False,
    title=None,
):
    """
    Paint the features of a BED file onto vertically drawn chromosomes.

    Args:
        root: canvas that receives the `text()` / `add_patch()` calls and is
            handed to the chromosome and gauge glyphs.
        bedfile: path of the BED file. Its name minus the last extension is
            used as prefix of the image-map file and as id of the `<map>` tag.
        sizes: optional sizes file giving the chromosome lengths. When falsy,
            the length of a seqid is the largest feature end seen on it.
        iopts: image options; `w`, `h` and `dpi` are forwarded to
            `write_ImageMapLine`.
        mergedist: a feature that starts within this distance of the previous
            feature's end, and has the same class, is drawn from that end.
        winsize: size of the windows that become image-map areas.
        imagemap: when truthy, write `<prefix>.map`.
        mappingfile: optional tab-delimited file mapping accession to class; a
            third column, when present, supplies preset colors per class.
        gauge: when truthy, add a `Gauge` next to the chromosomes.
        legend: when truthy, draw one colored box per class.
        empty: label of an extra unfilled legend box; falsy disables it.
        title: optional title text.
    """
    bed = Bed(bedfile)
    prefix = bedfile.rsplit(".", 1)[0]

    # Image-map lines are buffered and written in one go once all areas are
    # known, so that no file handle stays open if drawing fails half-way.
    map_lines = []
    if imagemap:
        imgmapfile = prefix + ".map"
        map_lines.append('<map id="' + prefix + '">')

    if mappingfile:
        mappings = DictFile(mappingfile, delimiter="\t")
        classes = sorted(set(mappings.values()))
        preset_colors = (DictFile(
            mappingfile, keypos=1, valuepos=2, delimiter="\t")
                         if DictFile.num_columns(mappingfile) >= 3 else {})
    else:
        classes = sorted(set(x.accn for x in bed))
        mappings = dict((x, x) for x in classes)
        preset_colors = {}

    logging.debug("A total of {} classes found: {}".format(
        len(classes), ",".join(classes)))

    # Assign colors to classes
    ncolors = max(3, min(len(classes), 12))
    palette = set1_n if ncolors <= 8 else set3_n
    colorset = palette(number=ncolors)
    colorset = sample_N(colorset, len(classes))
    class_colors = dict(zip(classes, colorset))
    class_colors.update(preset_colors)
    logging.debug("Assigned colors: {}".format(class_colors))

    chr_lens = {}
    centromeres = {}
    if sizes:
        chr_lens = Sizes(sizes).sizes_mapping
    else:
        for seqid, blines in groupby(bed, key=(lambda x: x.seqid)):
            chr_lens[seqid] = max(x.end for x in blines)

    # Record centromere positions and translate accessions into classes;
    # accessions without a mapping fall into the "-" class.
    for b in bed:
        accn = b.accn
        if accn == "centromere":
            centromeres[b.seqid] = b.start
        if accn in mappings:
            b.accn = mappings[accn]
        else:
            b.accn = "-"

    chr_number = len(chr_lens)
    if centromeres:
        assert chr_number == len(
            centromeres), "chr_number = {}, centromeres = {}".format(
                chr_number, centromeres)

    r = 0.7  # width and height of the whole chromosome set
    xstart, ystart = 0.15, 0.85
    xinterval = r / chr_number
    xwidth = xinterval * 0.5  # chromosome width
    max_chr_len = max(chr_lens.values())
    ratio = r / max_chr_len  # canvas / base

    def add_map_area(xx, seqid, bacs, seg_start, seg_end):
        # Buffer one image-map line covering bases seg_start..seg_end.
        map_lines.append("\t" + write_ImageMapLine(
            xx,
            (1 - ystart) + seg_start * ratio,
            xx + xwidth,
            (1 - ystart) + seg_end * ratio,
            iopts.w,
            iopts.h,
            iopts.dpi,
            seqid + ":" + ",".join(bacs),
            seg_start,
            seg_end,
        ))

    # first the chromosomes
    for a, (seqid, clen) in enumerate(sorted(chr_lens.items())):
        xx = xstart + a * xinterval + 0.5 * xwidth
        root.text(xx, ystart + 0.01, str(get_number(seqid)), ha="center")
        if centromeres:
            cen_y = ystart - centromeres[seqid] * ratio
            ChromosomeWithCentromere(root,
                                     xx,
                                     ystart,
                                     cen_y,
                                     ystart - clen * ratio,
                                     width=xwidth)
        else:
            Chromosome(root, xx, ystart, ystart - clen * ratio, width=xwidth)

    alpha = 1
    # color the regions
    for idx, seqid in enumerate(sorted(chr_lens)):
        excess = 0
        bac_list = []
        prev_end, prev_klass = 0, None
        xx = xstart + idx * xinterval  # left edge of this chromosome
        for b in bed.sub_bed(seqid):
            klass = b.accn
            if klass == "centromere":
                continue
            start, end = b.start, b.end
            if start < prev_end + mergedist and klass == prev_klass:
                start = prev_end
            yystart = ystart - end * ratio
            yyend = ystart - start * ratio
            root.add_patch(
                Rectangle(
                    (xx, yystart),
                    xwidth,
                    yyend - yystart,
                    fc=class_colors.get(klass, "lightslategray"),
                    lw=0,
                    alpha=alpha,
                ))
            prev_end, prev_klass = b.end, klass

            if not imagemap:
                continue

            # `segment` : size of current BAC being investigated + `excess`
            # `excess`  : left-over bases from the previous BAC, as a result
            #             of iterating over `winsize` regions of `segment`
            if excess == 0:
                segment_start = start
            segment = (end - start + 1) + excess
            while segment >= winsize:
                add_map_area(xx, seqid, bac_list, segment_start,
                             segment_start + winsize - 1)
                segment_start += winsize
                segment -= winsize
                bac_list = []
            bac_list.append(b.accn)
            excess = segment

        # Flush the trailing partial window of this chromosome
        if imagemap and excess > 0:
            bac_list.append(b.accn)
            add_map_area(xx, seqid, bac_list, segment_start, end)

    if imagemap:
        map_lines.append("</map>")
        with open(imgmapfile, "w") as mapfh:
            mapfh.write("\n".join(map_lines) + "\n")
        logging.debug("Image map written to `{0}`".format(imgmapfile))

    if gauge:
        gauge_x, gauge_top = 0.9, 0.85
        Gauge(root, gauge_x, gauge_top - r, gauge_top, max_chr_len)

    class_colors.pop("centromere", None)

    # Legend layout is set up unconditionally: the `empty` box below needs it
    # even when the class legend itself is switched off.
    legend_x, legend_y, legend_box = 0.1, 0.08, 0.04

    # class legends, spread evenly over one row
    if legend:
        legend_step = 0.8 / len(class_colors) if class_colors else 0
        for klass, cc in sorted(class_colors.items()):
            if klass == "-":
                continue
            root.add_patch(
                Rectangle((legend_x, legend_y),
                          legend_box,
                          legend_box,
                          fc=cc,
                          lw=0,
                          alpha=alpha))
            root.text(legend_x + legend_box + 0.01, legend_y, latex(klass),
                      fontsize=10)
            legend_x += legend_step

    if empty:
        root.add_patch(
            Rectangle((legend_x, legend_y), legend_box, legend_box,
                      fill=False, lw=1))
        root.text(legend_x + legend_box + 0.01, legend_y, empty, fontsize=10)

    if title:
        root.text(0.5, 0.95, markup(title), ha="center", va="center")
Example #15
0
def draw_tree(
    ax,
    t,
    hpd=None,
    margin=0.1,
    rmargin=0.2,
    tip=0.01,
    treecolor="k",
    supportcolor="k",
    internal=True,
    outgroup=None,
    dashedoutgroup=False,
    reroot=True,
    gffdir=None,
    sizes=None,
    trunc_name=None,
    SH=None,
    scutoff=0,
    leafcolor="k",
    leaffont=12,
    leafinfo=None,
    wgdinfo=None,
    geoscale=False,
):
    """
    main function for drawing phylogenetic tree

    Args:
        ax: axes to draw on; normalized with `normalize_axes` at the end.
        t: tree to draw (rerooted in place when `reroot` is set).
        hpd: optional mapping of node name -> (a, b) divergence interval,
            drawn as a horizontal bar at the node.
        margin, rmargin: left/bottom margin and right margin of the canvas.
        tip: small offset used for leaf labels and scale-bar ticks.
        treecolor, supportcolor: colors of branches and support values; a
            falsy `supportcolor` suppresses the support values.
        internal: when truthy, label named internal nodes with a TextCircle.
        outgroup: leaf names whose common ancestor becomes the outgroup; when
            falsy the midpoint outgroup is used.
        dashedoutgroup: draw the root split dashed and move the split so that
            the first child gets 90% of the root branch length.
        reroot: whether to reroot the tree at all.
        gffdir: directory globbed for `*.gff*` files to draw exon structures.
        sizes: sizes file; when given, lengths are printed next to the leaves.
        trunc_name: rule handed to `truncate_name` for leaf names.
        SH: value reported in the "SH test against ref tree" text, if not None.
        scutoff: support values (in percent) must exceed this to be printed.
        leafcolor, leaffont: default color and font size of leaf labels.
        leafinfo: optional mapping of leaf name -> record with `color` and
            `new_name` overriding the defaults.
        wgdinfo: forwarded to `draw_wgd` for every node.
        geoscale: draw a geological scale instead of the plain scale bar.
    """

    if reroot:
        if outgroup:
            R = t.get_common_ancestor(*outgroup)
        else:
            # Calculate the midpoint node
            R = t.get_midpoint_outgroup()

        if R is not t:
            t.set_outgroup(R)

        # By default, the distance to outgroup and non-outgroup is the same
        # we re-adjust the distances so that the outgroups will appear
        # farthest from everything else
        if dashedoutgroup:
            a, b = t.children
            # Avoid even split
            total = a.dist + b.dist
            a.dist = 0.9 * total
            b.dist = total - a.dist

    _, max_dist = t.get_farthest_leaf()
    print("max_dist = {}".format(max_dist), file=sys.stderr)

    xstart = margin
    ystart = 2 * margin
    # scale the tree
    scale = (1 - margin - rmargin) / max_dist

    def rescale(dist):
        return xstart + scale * dist

    def rescale_divergence(divergence):
        return rescale(max_dist - divergence)

    num_leaves = len(t.get_leaf_names())
    yinterval = (1 - ystart) / num_leaves

    # get exons structures, if any
    structures = {}
    if gffdir:
        gffiles = glob("{0}/*.gff*".format(gffdir))
        setups, ratio = get_setups(gffiles, canvas=rmargin / 2, noUTR=True)
        structures = dict((a, (b, c)) for a, b, c in setups)

    if sizes:
        sizes = Sizes(sizes).mapping

    coords = {}
    i = 0
    # Post-order guarantees the children of a node are placed before the node
    for n in t.traverse("postorder"):
        dist = n.get_distance(t)
        xx = rescale(dist)

        if n.is_leaf():
            yy = ystart + i * yinterval
            i += 1

            if trunc_name:
                name = truncate_name(n.name, rule=trunc_name)
            else:
                name = n.name

            if leafinfo and n.name in leafinfo:
                line = leafinfo[n.name]
                lc = line.color
                sname = line.new_name
            else:
                lc = leafcolor
                sname = None
            lc = lc or "k"
            sname = sname or name.replace("_", "-")
            # if color is given as "R,G,B"
            if "," in lc:
                lc = [float(x) for x in lc.split(",")]

            ax.text(
                xx + tip,
                yy,
                markup(sname),
                va="center",
                fontstyle="italic",
                size=leaffont,
                color=lc,
            )

            gname = n.name.split("_")[0]
            if gname in structures:
                mrnabed, cdsbeds = structures[gname]
                ExonGlyph(
                    ax,
                    1 - rmargin / 2,
                    yy,
                    mrnabed,
                    cdsbeds,
                    align="right",
                    ratio=ratio,
                )
            if sizes and gname in sizes:
                size = sizes[gname]
                # base pair converted to amino acid; floor division keeps the
                # label integral ("99aa", not "99.0aa") under true division
                size = size // 3 - 1
                size = "{0}aa".format(size)
                ax.text(1 - rmargin / 2 + tip, yy, size, size=leaffont)

        else:
            linestyle = "--" if (dashedoutgroup and n is t) else "-"
            children = [coords[x] for x in n.get_children()]
            children_x, children_y = zip(*children)
            min_y, max_y = min(children_y), max(children_y)
            # plot the vertical bar
            ax.plot((xx, xx), (min_y, max_y), linestyle, color=treecolor)
            # plot the horizontal bar
            for cx, cy in children:
                ax.plot((xx, cx), (cy, cy), linestyle, color=treecolor)
            yy = sum(children_y) * 1.0 / len(children_y)
            # plot HPD if exists
            if hpd and n.name in hpd:
                a, b = hpd[n.name]
                ax.plot(
                    (rescale_divergence(a), rescale_divergence(b)),
                    (yy, yy),
                    "-",
                    color="darkslategray",
                    alpha=0.4,
                    lw=2,
                )
            support = n.support
            # Values above 1 are treated as percentages
            if support > 1:
                support = support / 100.0
            if not n.is_root() and supportcolor:
                if support > scutoff / 100.0:
                    ax.text(
                        xx,
                        yy + 0.005,
                        "{0:d}".format(int(abs(support * 100))),
                        ha="right",
                        size=leaffont,
                        color=supportcolor,
                    )
            if internal and n.name:
                TextCircle(ax, xx, yy, n.name, size=9)

        coords[n] = (xx, yy)
        # WGD info
        draw_wgd(ax, yy, rescale_divergence, n.name, wgdinfo)

    # Anchor of the scale bar and of the SH text; defined up front so that the
    # SH text can also be placed when the geological scale replaces the bar.
    x1 = xstart + 0.1
    yy = margin

    # scale bar
    if geoscale:
        draw_geoscale(ax,
                      margin=margin,
                      rmargin=rmargin,
                      yy=margin,
                      max_dist=max_dist)
    else:
        br = 0.1
        x2 = x1 + br * scale
        ax.plot([x1, x1], [yy - tip, yy + tip], "-", color=treecolor)
        ax.plot([x2, x2], [yy - tip, yy + tip], "-", color=treecolor)
        ax.plot([x1, x2], [yy, yy], "-", color=treecolor)
        ax.text(
            (x1 + x2) / 2,
            yy - tip,
            "{0:g}".format(br),
            va="top",
            ha="center",
            size=leaffont,
            color=treecolor,
        )

    if SH is not None:
        xs = x1
        ys = (margin + yy) / 2.0
        ax.text(
            xs,
            ys,
            "SH test against ref tree: {0}".format(SH),
            ha="left",
            size=leaffont,
            color="g",
        )

    normalize_axes(ax)
Example #16
0
    def __init__(self, ax, ext, layout, bed, scale, switch=None,
                 chr_label=True, pad=.04, vpad=.012, extra_features=None):
        """
        Draw one region track on `ax`: the chromosome line, one glyph per
        gene, optional extra features, and the two text labels.

        `ext` is unpacked as (start, end, si, ei, chr, orientation, span);
        `layout` supplies x, y, ratio, rotation, hidden, ha, va and color.
        `bed[si: ei + 1]` are the genes shown, `switch` optionally renames the
        chromosome, and `pad` / `vpad` offset the labels from the track.
        """
        x, y = layout.x, layout.y
        scale /= layout.ratio
        self.y = y
        rotation = layout.rotation
        # Axes coordinates, rotated around the anchor point of the track
        tr = mpl.transforms.Affine2D().\
                    rotate_deg_around(x, y, rotation) + ax.transAxes
        inv = ax.transAxes.inverted()

        first, last, si, ei, chrom, orientation, span = ext
        half_width = span / scale / 2
        xstart = x - half_width
        xend = x + half_width
        self.xstart, self.xend = xstart, xend

        hidden = layout.hidden

        # Chromosome
        if not hidden:
            ax.plot((xstart, xend), (y, y), color="gray", transform=tr,
                    lw=2, zorder=1)

        self.genes = genes = bed[si: ei + 1]
        startbp, endbp = first.start, last.end
        if orientation == '-':
            startbp, endbp = endbp, startbp

        def cv(bp):
            # base-pair position -> x coordinate along the track
            return xstart + abs(bp - startbp) / scale

        if switch:
            chrom = switch.get(chrom, chrom)
        label = "-".join((human_size(startbp, target="Mb")[:-2],
                          human_size(endbp, target="Mb")))

        height = .012
        self.gg = {}
        # Genes
        for gene in genes:
            strand = gene.strand
            if strand == '-':
                gstart, gend = gene.end, gene.start
            else:
                gstart, gend = gene.start, gene.end
            # A reversed region flips the displayed strand
            if orientation == '-':
                strand = "+" if strand == "-" else "-"

            x1, x2, a, b = self.get_coordinates(gstart, gend, y, cv, tr, inv)
            self.gg[gene.accn] = (a, b)

            color = forward if strand == "+" else backward
            if hidden:
                continue
            glyph = Glyph(ax, x1, x2, y, height, gradient=False, fc=color,
                          zorder=3)
            glyph.set_transform(tr)

        # Extra features (like repeats)
        for feat in extra_features or ():
            x1, x2, a, b = self.get_coordinates(feat.start, feat.end, y,
                                                cv, tr, inv)
            glyph = Glyph(ax, x1, x2, y, height * 3 / 4, gradient=False,
                          fc='#ff7f00', zorder=2)
            glyph.set_transform(tr)

        # Labels sit on the side named by the layout, aligned towards the track
        hpad = .02
        side = layout.ha
        if side == "left":
            xx, ha = xstart - hpad, "right"
        elif side == "right":
            xx, ha = xend + hpad, "left"
        else:
            xx, ha = x, "center"

        # Tentative solution to labels stick into glyph
        magic = 40.
        cc = 1
        if abs(rotation) > magic:
            cc = abs(rotation) / magic

        va = layout.va
        if va == "top":
            yy = y + cc * pad
        elif va == "bottom":
            yy = y - cc * pad - .01
        else:
            yy = y

        anchor = np.array((xx, yy))
        trans_angle = ax.transAxes.transform_angles(
            np.array((rotation, )), anchor.reshape((1, 2)))[0]
        lx, ly = anchor
        if hidden or not chr_label:
            return

        text_kw = dict(ha=ha, va="center", rotation=trans_angle,
                       bbox=dict(boxstyle="round", fc='w', ec='w', alpha=.5),
                       zorder=10)
        ax.text(lx, ly + vpad, markup(chrom), color=layout.color, **text_kw)
        ax.text(lx, ly - vpad, label, color="lightslategrey", size=10,
                **text_kw)
Example #17
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above doubles as the OptionParser usage text.
    import os.path

    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin",
                 dest="vmin",
                 default=1,
                 type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax",
                 dest="vmax",
                 default=100,
                 type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf",
                 default=False,
                 action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage",
                 default=0,
                 type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks",
                 default=False,
                 action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    N = int(N)
    KMERYL, KSOAP, KALLPATHS = range(3)
    kformats = ("Meryl", "Soap", "AllPaths")
    kformat = KMERYL

    ascii_plot = not opts.pdf
    peaks = not opts.nopeaks
    hist = {}
    totalKmers = 0
    data = []

    with open(histfile) as fp:
        # Guess the format of the Kmer histogram
        for row in fp:
            if row.startswith("# 1:"):
                kformat = KALLPATHS
                break
            if len(row.split()) == 1:
                kformat = KSOAP
                break
        fp.seek(0)

        logging.debug("Guessed format: {0}".format(kformats[kformat]))

        for rowno, row in enumerate(fp):
            if row[0] == '#':
                continue
            if kformat == KSOAP:
                # Single column: the row number is the multiplicity
                K = rowno + 1
                counts = int(row.strip())
            else:  # meryl histogram
                K, counts = row.split()[:2]
                K, counts = int(K), int(counts)

            Kcounts = K * counts
            totalKmers += Kcounts
            hist[K] = Kcounts
            data.append((K, counts))

    covmax = 1000000
    ks = KmerSpectrum(data)
    ks.analyze(K=N, covmax=covmax)

    Total_Kmers = int(totalKmers)
    coverage = opts.coverage
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = Total_Kmers * 1. / Kmer_coverage / 1e6

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, Total_Kmers)
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".format(Genome_size)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    # stderr.write() behaves the same on Python 2 and 3, unlike `print >>`
    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        sys.stderr.write(msg + "\n")

    counts = sorted((a, b) for a, b in hist.items()
                    if opts.vmin <= a <= opts.vmax)
    x, y = zip(*counts)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii_plot:
        return asciiplot(x, y, title=title)

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    # Turning points of the spectrum; only those inside [vmin, vmax] survive
    # the filtering above, so every use below has to tolerate missing ones.
    t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
    tcounts = [(a, b) for a, b in counts if a in t]
    if tcounts:
        tx, ty = zip(*tcounts)
        plt.plot(tx, ty, 'ko', lw=2, mec='k', mfc='w')
    tcounts = dict(tcounts)

    if peaks:
        if ks.max1 in tcounts:
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
        if ks.max2 in tcounts:
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    tc = "gray"
    axt = ax.transAxes
    ax.text(.95, .95, Total_Kmers_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .9, Kmer_coverage_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .85, Genome_size_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .8, Repetitive_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .75, SNPrate_msg, color=tc, transform=axt, ha="right")

    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    # Strip the extensions from the file name only, so that dots in the
    # directory part (e.g. "./x.histogram") do not truncate the path.
    dirname, basename = os.path.split(histfile)
    imagename = os.path.join(dirname, basename.split(".")[0] + ".pdf")
    savefig(imagename, dpi=100)
Example #18
0
def dotplot(anchorfile, qbed, sbed, fig, root, ax, vmin=0, vmax=1,
        is_self=False, synteny=False, cmap_text=None, cmap="copper",
        genomenames=None, sample_number=10000, minfont=5, palette=None,
        chrlw=.01, title=None, sepcolor="gainsboro"):
    """
    Draw a dot plot of the anchor pairs in `anchorfile`.

    Each data row holds a query and a subject name in its first two columns;
    the last column is read as the value when `cmap_text` is set. Rows starting
    with "#" open a new block. Pairs are placed using `qbed.order` /
    `sbed.order`; pairs with an unknown member are skipped.

    Args:
        anchorfile: path of the anchors file.
        qbed, sbed: query / subject beds (x and y axis, respectively).
        fig, root, ax: figure, full-canvas axes (labels, title, palette
            legend) and the axes holding the dots.
        vmin, vmax: with `cmap_text`, values outside this range are dropped.
        is_self: mirror every pair and draw the diagonal.
        synteny: box the clusters found by `batch_scan`.
        cmap_text, cmap: colormap legend text and colormap name.
        genomenames: "x_y" axis labels; defaults to the bed file names.
        sample_number: maximum number of points drawn (random subset beyond).
        minfont: chromosome labels with a smaller font size are not drawn.
        palette: optional block id -> color lookup (`get`, `colors`).
        chrlw, sepcolor: line width and color of chromosome separators.
        title: plot title; generated when not given.
    """
    qorder = qbed.order
    sorder = sbed.order

    data = []
    if cmap_text:
        logging.debug("Capping values within [{0:.1f}, {1:.1f}]"\
                        .format(vmin, vmax))

    block_id = 0
    # The color of a block is set on its "#" header and must persist for the
    # data rows that follow, so it lives outside the loop.
    block_color = None
    with open(anchorfile) as fp:
        for row in fp:
            atoms = row.split()
            if row[0] == "#":
                block_id += 1
                block_color = palette.get(block_id, "k") if palette else None
                continue

            # first two columns are query and subject, and an optional third
            # column
            if len(atoms) < 2:
                continue

            query, subject = atoms[:2]
            value = atoms[-1]

            if cmap_text:
                try:
                    value = float(value)
                except ValueError:
                    value = vmax

                if value < vmin:
                    continue
                if value > vmax:
                    continue
            else:
                value = 0

            if query not in qorder:
                continue
            if subject not in sorder:
                continue

            qi, q = qorder[query]
            si, s = sorder[subject]

            nv = value if block_color is None else block_color
            data.append((qi, si, nv))
            if is_self:  # Mirror image
                data.append((si, qi, nv))

    npairs = len(data)
    # Only show random subset
    if npairs > sample_number:
        logging.debug("Showing a random subset of {0} data points (total {1}) " \
                      "for clarity.".format(sample_number, npairs))
        data = sample(data, sample_number)

    x, y, c = zip(*data)

    if palette:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)
    else:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0, cmap=cmap,
                vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=cmap)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))
    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    # Tag to mark whether to plot chr name (skip small ones)
    xchr_labels, ychr_labels = [], []
    th = TextHandler(fig)

    # plot the chromosome breaks
    for (seqid, beg, end) in qbed.get_breaks():
        xsize_ratio = abs(end - beg) * .8 / xsize
        fontsize = th.select_fontsize(xsize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        xchr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot([beg, beg], ylim, "-", lw=chrlw, color=sepcolor)

    for (seqid, beg, end) in sbed.get_breaks():
        ysize_ratio = abs(end - beg) * .8 / ysize
        fontsize = th.select_fontsize(ysize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        ychr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot(xlim, [beg, beg], "-", lw=chrlw, color=sepcolor)

    # plot the chromosome labels
    for label, pos, fontsize in xchr_labels:
        pos = .1 + pos * .8 / xsize
        if fontsize >= minfont:
            root.text(pos, .91, latex(label), size=fontsize,
                ha="center", va="bottom", rotation=45, color="grey")

    # remember y labels are inverted
    for label, pos, fontsize in ychr_labels:
        pos = .9 - pos * .8 / ysize
        if fontsize >= minfont:
            root.text(.91, pos, latex(label), size=fontsize,
                va="center", color="grey")

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        gx, gy = [op.basename(bed.filename).split(".")[0]
                  for bed in (qbed, sbed)]
    ax.set_xlabel(markup(gx), size=16)
    ax.set_ylabel(markup(gy), size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
            color='gray', size=10)

    if palette:  # bottom-left has the palette, if available
        colors = palette.colors
        xstart, ystart = .1, .05
        for category, c in sorted(colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0, fc=c))
            root.text(xstart + .04, ystart, category, color=c)
            xstart += .1

    if not title:
        title = "Inter-genomic comparison: {0} vs {1}".format(gx, gy)
        if is_self:
            title = "Intra-genomic comparison within {0}".format(gx)
            # Mirrored pairs were counted twice; keep the count integral
            npairs //= 2
        title += " ({0} gene pairs)".format(thousands(npairs))
    root.set_title(markup(title), x=.5, y=.96, color="k")
    logging.debug(title)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
Example #19
0
    def __init__(
        self,
        ax,
        ext,
        layout,
        bed,
        scale,
        switch=None,
        chr_label=True,
        loc_label=True,
        genelabelsize=0,
        pad=0.05,
        vpad=0.015,
        extra_features=None,
        glyphstyle="box",
        glyphcolor: BasePalette = OrientationPalette(),
    ):
        """
        Draw one region track on `ax`: the chromosome line, one glyph per
        gene (optionally with its name), extra features, and the labels.

        `ext` is unpacked as (start, end, si, ei, chr, orientation, span);
        `layout` supplies x, y, ratio, rotation, hidden, label, ha, va and
        color. `bed[si : ei + 1]` are the genes shown. `glyphcolor` is asked
        for (color, zorder) by strand when it is an OrientationPalette, by gene
        name otherwise. The location label is only drawn together with the
        chromosome label.
        """
        x, y = layout.x, layout.y
        scale /= layout.ratio
        self.y = y
        rotation = layout.rotation
        # Axes coordinates, rotated around the anchor point of the track
        tr = mpl.transforms.Affine2D().rotate_deg_around(x, y, rotation) + ax.transAxes
        inv = ax.transAxes.inverted()

        first, last, si, ei, chrom, orientation, span = ext
        half_width = span / scale / 2
        xstart = x - half_width
        xend = x + half_width
        self.xstart, self.xend = xstart, xend

        hidden = layout.hidden

        # Chromosome
        if not hidden:
            ax.plot((xstart, xend), (y, y), color="gray", transform=tr, lw=2, zorder=1)

        self.genes = genes = bed[si : ei + 1]
        startbp, endbp = first.start, last.end
        if orientation == "-":
            startbp, endbp = endbp, startbp

        def cv(bp):
            # base-pair position -> x coordinate along the track
            return xstart + abs(bp - startbp) / scale

        if switch:
            chrom = switch.get(chrom, chrom)
        if layout.label:
            chrom = layout.label

        label = "-".join(
            (
                human_size(startbp, target="Mb", precision=2)[:-2],
                human_size(endbp, target="Mb", precision=2),
            )
        )

        height = 0.012
        self.gg = {}
        color_by_strand = isinstance(glyphcolor, OrientationPalette)
        # Genes
        for gene in genes:
            strand = gene.strand
            if strand == "-":
                gstart, gend = gene.end, gene.start
            else:
                gstart, gend = gene.start, gene.end
            # A reversed region flips the displayed strand
            if orientation == "-":
                strand = "+" if strand == "-" else "-"

            x1, x2, a, b = self.get_coordinates(gstart, gend, y, cv, tr, inv)
            gene_name = gene.accn
            self.gg[gene_name] = (a, b)

            palette_key = strand if color_by_strand else gene_name
            color, zorder = glyphcolor.get_color_and_zorder(palette_key)

            if not hidden:
                glyph = Glyph(
                    ax,
                    x1,
                    x2,
                    y,
                    height,
                    gradient=False,
                    fc=color,
                    style=glyphstyle,
                    zorder=zorder,
                )
                glyph.set_transform(tr)
                if genelabelsize:
                    ax.text(
                        (x1 + x2) / 2,
                        y + height / 2 + genelabelsize * vpad / 3,
                        markup(gene_name),
                        size=genelabelsize,
                        rotation=25,
                        ha="left",
                        va="center",
                        color="lightslategray",
                    )

        # Extra features (like repeats)
        for feat in extra_features or ():
            x1, x2, a, b = self.get_coordinates(feat.start, feat.end, y, cv, tr, inv)
            glyph = Glyph(
                ax,
                x1,
                x2,
                y,
                height * 3 / 4,
                gradient=False,
                fc="#ff7f00",
                style=glyphstyle,
                zorder=2,
            )
            glyph.set_transform(tr)

        # Labels sit on the side named by the layout, aligned towards the track
        hpad = 0.02
        side = layout.ha
        if side == "left":
            xx, ha = xstart - hpad, "right"
        elif side == "right":
            xx, ha = xend + hpad, "left"
        else:
            xx, ha = x, "center"

        # Tentative solution to labels stick into glyph
        magic = 40.0
        cc = 1
        if abs(rotation) > magic:
            cc = abs(rotation) / magic

        va = layout.va
        if va == "top":
            yy = y + cc * pad
        elif va == "bottom":
            yy = y - cc * pad
        else:
            yy = y

        anchor = np.array((xx, yy))
        trans_angle = ax.transAxes.transform_angles(
            np.array((rotation,)), anchor.reshape((1, 2))
        )[0]
        lx, ly = anchor
        if hidden:
            return

        text_kw = dict(
            ha=ha,
            va="center",
            rotation=trans_angle,
            bbox=dict(boxstyle="round", fc="w", ec="w", alpha=0.5),
            zorder=10,
        )

        # NOTE: the two labels are separate text objects because styling each
        # line of a single multi-line label differently proved impractical.
        chr_text = markup(chrom) if chr_label else None
        loc_text = label if loc_label else None
        if not chr_text:
            return
        if loc_text:
            ax.text(lx, ly + vpad, chr_text, color=layout.color, **text_kw)
            ax.text(
                lx, ly - vpad, loc_text, color="lightslategrey", size=10, **text_kw
            )
        else:
            ax.text(lx, ly, chr_text, color=layout.color, **text_kw)
Example #20
0
    def draw(self, roundrect=False, plot_label=True):
        """Draw every sequence of this track as a horizontal chromosome bar.

        Bars are laid out left to right starting at ``self.xstart``; each one
        is ``self.ratio * size`` wide and followed by ``self.gap``.  Tracks
        holding at least 5 sequences also get a numbered ``TextCircle`` per
        bar, thinned to every 10th bar once the track holds more than
        ``2 * MaxSeqids`` sequences.

        Args:
            roundrect: forwarded to ``HorizontalChromosome``.
            plot_label: when true, write ``self.label`` to the left of the
                track.

        Raises:
            ValueError: if a sequence id has no numeric part to show in the
                circle (``int`` conversion fails).
        """
        if self.empty:
            return

        y = self.y
        color = self.color
        ax = self.ax
        xstart = self.xstart
        gap = self.gap
        va = self.va
        nseqids = len(self.seqids)
        tr = self.tr
        for i, sid in enumerate(self.seqids):
            size = self.sizes[sid]
            rsize = self.ratio * size
            xend = xstart + rsize
            hc = HorizontalChromosome(ax,
                                      xstart,
                                      xend,
                                      y,
                                      height=self.height,
                                      lw=self.lw,
                                      fc=color,
                                      roundrect=roundrect)
            hc.set_transform(tr)

            # Circle text: the part after the last "_" with letters removed,
            # normalised through int() so that "01" is shown as "1".
            # string.ascii_letters exists on both Python 2 and 3, whereas
            # string.letters was removed in Python 3.
            sid = sid.rsplit("_", 1)[-1]
            si = "".join(x for x in sid if x not in string.ascii_letters)
            si = str(int(si))
            xx = (xstart + xend) / 2
            # Advance the cursor before the `continue`s below so that skipped
            # circles never shift the following bars.
            xstart = xend + gap

            if nseqids > 2 * MaxSeqids and (i + 1) % 10 != 0:
                continue
            if nseqids < 5:
                continue

            pad = .02
            if va == "bottom":
                pad = -pad
            TextCircle(ax,
                       xx,
                       y + pad,
                       si,
                       radius=.01,
                       fc="w",
                       color=color,
                       size=10,
                       transform=tr)

        # Label goes halfway between the canvas edge and the track start,
        # but never further right than x = .1
        xp = min(self.xstart / 2, .1)
        label = markup(self.label)
        # "gainsboro" is too pale to read as text, fall back to black
        c = color if color != "gainsboro" else "k"
        if plot_label:
            ax.text(xp,
                    y + self.height * .6,
                    label,
                    ha="center",
                    color=c,
                    transform=tr)
Example #21
0
    def __init__(self, ax, ext, layout, bed, scale, switch=None,
                 chr_label=True, loc_label=True,
                 pad=.05, vpad=.015, extra_features=None):
        """Render one region on *ax* and remember where its genes landed.

        Draws a gray backbone, one ``Glyph`` per gene in ``bed[si:ei + 1]``,
        optional extra-feature glyphs and, unless ``layout.hidden`` is set,
        the chromosome / location labels.  Everything goes through a
        transform that rotates by ``layout.rotation`` degrees around
        ``(layout.x, layout.y)`` on top of ``ax.transAxes``.

        Sets ``self.y``, ``self.xstart``, ``self.xend``, ``self.genes`` and
        ``self.gg`` (gene accn => last two values of ``get_coordinates``).

        Args:
            ax: axes to draw on.
            ext: ``(start, end, si, ei, chr, orientation, span)``.
            layout: provides x, y, ratio, rotation, hidden, label, ha, va
                and color.
            bed: sliceable collection of gene features.
            scale: base pairs per canvas unit, divided by ``layout.ratio``
                before use.
            switch: optional mapping used to rename the chromosome.
            chr_label: print the chromosome name.
            loc_label: print the start-end span next to the name.
            pad: vertical label offset used for va "top" / "bottom".
            vpad: half the distance between the two label lines.
            extra_features: optional features (``start`` / ``end``) drawn as
                thinner orange glyphs.
        """
        cx, cy = layout.x, layout.y
        scale /= layout.ratio
        self.y = cy
        rotation = layout.rotation
        tr = mpl.transforms.Affine2D().rotate_deg_around(cx, cy, rotation) \
            + ax.transAxes
        inv = ax.transAxes.inverted()

        first, last, si, ei, seqname, orientation, span = ext
        half_width = span / scale / 2
        xstart, xend = cx - half_width, cx + half_width
        self.xstart, self.xend = xstart, xend

        is_hidden = layout.hidden

        # Backbone of the region
        if not is_hidden:
            ax.plot((xstart, xend), (cy, cy), color="gray", transform=tr,
                    lw=2, zorder=1)

        genes = bed[si: ei + 1]
        self.genes = genes

        # On a reversed region the right-most base is the drawing origin
        flipped = orientation == '-'
        if flipped:
            startbp, endbp = last.end, first.start
        else:
            startbp, endbp = first.start, last.end

        def to_canvas(bp):
            # base-pair position => x position on the unrotated backbone
            return xstart + abs(bp - startbp) / scale

        if switch:
            seqname = switch.get(seqname, seqname)
        if layout.label:
            seqname = layout.label

        span_text = "-".join((
            human_size(startbp, target="Mb", precision=2)[:-2],
            human_size(endbp, target="Mb", precision=2),
        ))

        height = .012
        self.gg = {}
        # Genes
        for gene in genes:
            strand = gene.strand
            if strand == '-':
                gstart, gend = gene.end, gene.start
            else:
                gstart, gend = gene.start, gene.end
            if flipped:
                strand = "+" if strand == "-" else "-"

            x1, x2, a, b = self.get_coordinates(gstart, gend, cy, to_canvas,
                                                tr, inv)
            self.gg[gene.accn] = (a, b)

            fill = forward if strand == "+" else backward
            if is_hidden:
                continue
            glyph = Glyph(ax, x1, x2, cy, height, gradient=False, fc=fill,
                          zorder=3)
            glyph.set_transform(tr)

        # Extra features (like repeats), drawn below the genes
        for feature in extra_features or ():
            x1, x2, _, _ = self.get_coordinates(feature.start, feature.end,
                                                cy, to_canvas, tr, inv)
            glyph = Glyph(ax, x1, x2, cy, height * 3 / 4, gradient=False,
                          fc='#ff7f00', zorder=2)
            glyph.set_transform(tr)

        # A label placed "left" of the region is right-aligned against its
        # start, and vice versa; anything else is centered on the region.
        hpad = .02
        label_x, halign = {
            "left": (xstart - hpad, "right"),
            "right": (xend + hpad, "left"),
        }.get(layout.ha, (cx, "center"))

        # Tentative solution to labels stick into glyph: stretch the vertical
        # offset once the rotation exceeds 40 degrees
        magic = 40.
        stretch = abs(rotation) / magic if abs(rotation) > magic else 1
        valign = layout.va
        if valign == "top":
            label_y = cy + stretch * pad
        elif valign == "bottom":
            label_y = cy - stretch * pad
        else:
            label_y = cy

        anchor = np.array((label_x, label_y))
        trans_angle = ax.transAxes.transform_angles(
            np.array((rotation, )), anchor.reshape((1, 2)))[0]
        lx, ly = anchor
        if is_hidden:
            return

        text_kw = {
            "ha": halign,
            "va": "center",
            "rotation": trans_angle,
            "bbox": {"boxstyle": "round", "fc": 'w', "ec": 'w', "alpha": .5},
            "zorder": 10,
        }

        # TODO: a single two-line label with a different style per line is
        # hard to get in matplotlib, hence the two separate text objects.
        name_text = markup(seqname) if chr_label else None
        where_text = span_text if loc_label else None
        if not name_text:
            return
        if where_text:
            ax.text(lx, ly + vpad, name_text, color=layout.color, **text_kw)
            ax.text(lx, ly - vpad, where_text, color="lightslategrey",
                    size=10, **text_kw)
        else:
            ax.text(lx, ly, name_text, color=layout.color, **text_kw)
Example #22
0
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively.  Also adding
    number of reads, average/median, N50, and total length.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.utils.cbook import human_size, thousands, SUFFIXES
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import (plt, markup, human_formatter,
                                    human_base_formatter, savefig, set2,
                                    set_ticklabels_helvetica)

    p = OptionParser(histogram.__doc__)
    p.set_histogram(vmax=50000, bins=100, xlabel="Read length",
                    title="Read length distribution")
    p.add_option("--ylabel1", default="Counts",
                 help="Label of y-axis on the left")
    p.add_option("--color", default='0', choices=[str(x) for x in range(8)],
                 help="Color of bars, which is an index 0-7 in brewer set2")
    opts, args, iopts = p.set_image_options(args, figsize="6x6", style="dark")

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    # Second return value (the quality file) is not needed here
    fastafile, _ = fasta([fastafile, "--seqtk"])
    sizes = Sizes(fastafile)
    all_sizes = sorted(sizes.sizes)
    xmin, xmax, bins = opts.vmin, opts.vmax, opts.bins
    left, height = stem_leaf_plot(all_sizes, xmin, xmax, bins)

    plt.figure(1, (iopts.w, iopts.h))
    ax1 = plt.gca()

    # Left axis: read counts per length bin
    width = (xmax - xmin) * .5 / bins
    color = set2[int(opts.color)]
    ax1.bar(left, height, width=width, linewidth=0, fc=color, align="center")
    ax1.set_xlabel(markup(opts.xlabel))
    ax1.set_ylabel(opts.ylabel1)

    # Right axis: total bases contained in reads at least this long
    ax2 = ax1.twinx()
    cur_size = 0
    total_size, l50, _ = sizes.summary
    cdf = {}
    hsize = human_size(total_size)
    # Express the curve in the same unit human_size() picked for the total
    tag = hsize[-2:]
    unit = 1000 ** SUFFIXES[1000].index(tag)

    for x in all_sizes:
        if x not in cdf:
            cdf[x] = (total_size - cur_size) * 1. / unit
        cur_size += x
    x, y = zip(*sorted(cdf.items()))
    ax2.plot(x, y, '-', color="darkslategray")
    ylabel2 = "{0} above read length".format(tag)
    ax2.set_ylabel(ylabel2)

    for ax in (ax1, ax2):
        set_ticklabels_helvetica(ax)
        ax.set_xlim((xmin - width / 2, xmax + width / 2))

    tc = "gray"
    axt = ax1.transAxes
    xx, yy = .95, .95
    ma = "Total bases: {0}".format(hsize)
    mb = "Total reads: {0}".format(thousands(len(sizes)))
    mc = "Average read length: {0}bp".format(thousands(np.mean(all_sizes)))
    md = "Median read length: {0}bp".format(thousands(np.median(all_sizes)))
    me = "N50 read length: {0}bp".format(thousands(l50))
    for t in (ma, mb, mc, md, me):
        # sys.stderr.write() instead of the Python-2-only `print >>` statement
        # so that the module also compiles under Python 3
        sys.stderr.write(t + "\n")
        ax1.text(xx, yy, t, color=tc, transform=axt, ha="right")
        yy -= .05

    ax1.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    ax1.tick_params(axis="x", direction="out", length=3,
                    left=False, right=False, top=False, bottom=True)
    ax1.xaxis.set_major_formatter(human_base_formatter)
    ax1.yaxis.set_major_formatter(human_formatter)
    figname = sizes.filename + ".pdf"
    savefig(figname)
Example #23
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    import os.path as op

    p = OptionParser(histogram.__doc__)
    p.add_option(
        "--vmin",
        dest="vmin",
        default=1,
        type="int",
        help="minimum value, inclusive",
    )
    p.add_option(
        "--vmax",
        dest="vmax",
        default=100,
        type="int",
        help="maximum value, inclusive",
    )
    p.add_option(
        "--pdf",
        default=False,
        action="store_true",
        help="Print PDF instead of ASCII plot",
    )
    p.add_option(
        "--method",
        choices=("nbinom", "allpaths"),
        default="nbinom",
        help=
        "'nbinom' - slow but more accurate for het or polyploid genome; 'allpaths' - fast and works for homozygous genomes",
    )
    p.add_option(
        "--maxiter",
        default=100,
        type="int",
        help="Max iterations for optimization. Only used with --method nbinom",
    )
    p.add_option("--coverage",
                 default=0,
                 type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option(
        "--nopeaks",
        default=False,
        action="store_true",
        help="Do not annotate K-mer peaks",
    )
    opts, args, iopts = p.set_image_options(args, figsize="7x7")

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    method = opts.method
    vmin, vmax = opts.vmin, opts.vmax
    ascii_plot = not opts.pdf
    # Peak annotation relies on min/max positions set by the allpaths method
    peaks = not opts.nopeaks and method == "allpaths"
    N = int(N)

    if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = merylhistogram(histfile)

    ks = KmerSpectrum(histfile)
    method_info = ks.analyze(K=N, maxiter=opts.maxiter, method=method)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # An explicit --coverage overrides the fitted value
    Kmer_coverage = ks.lambda_ if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1:.1f}x".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f} Mb".format(Genome_size /
                                                                 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    x, y = ks.get_xy(vmin, vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if ascii_plot:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (iopts.w, iopts.h))
    plt.bar(x, y, fc="#b2df8a", lw=0)
    # Plot the negative binomial fit
    if method == "nbinom":
        generative_model = method_info["generative_model"]
        GG = method_info["Gbins"]
        ll = method_info["lambda"]
        rr = method_info["rho"]
        kf_range = method_info["kf_range"]
        stacked = generative_model(GG, ll, rr)
        plt.plot(
            kf_range,
            stacked,
            ":",
            color="#6a3d9a",
            lw=2,
        )

    ax = plt.gca()

    if peaks:  # Only works for method 'allpaths'
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(x, y) for x, y in ks.counts if x in t]
        if tcounts:
            x, y = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(x, y, "ko", lw=3, mec="k", mfc="w")
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak")
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    # Leave headroom above the tallest bar for the annotations
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6
    if method == "nbinom":
        # Plot multiple CN locations, CN1, CN2, ... up to ploidy
        cn_color = "#a6cee3"
        for i in range(1, ks.ploidy + 1):
            x = i * ks.lambda_
            plt.plot((x, x), (0, ymax), "-.", color=cn_color)
            plt.text(
                x,
                ymax * 0.95,
                "CN{}".format(i),
                ha="right",
                va="center",
                color=cn_color,
                rotation=90,
            )

    messages = [
        Total_Kmers_msg,
        Kmer_coverage_msg,
        Genome_size_msg,
        Repetitive_msg,
        SNPrate_msg,
    ]
    if method == "nbinom":
        messages += [ks.ploidy_message] + ks.copy_messages
    write_messages(ax, messages)

    ax.set_title(markup(title))
    ax.set_xlim((0, vmax))
    ax.set_ylim((0, ymax))
    adjust_spines(ax, ["left", "bottom"], outward=True)
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    set_human_axis(ax)

    # Strip extensions from the file name only: splitting the whole path at
    # its first "." would truncate inputs such as "./run/x.histogram" or
    # directories that contain a dot.
    folder, basename = op.split(histfile)
    imagename = op.join(folder, basename.split(".")[0] + "." + iopts.format)
    savefig(imagename, dpi=100)

    return Genome_size
Example #24
0
    def draw(self,
             roundrect=False,
             plot_label=True,
             plot_circles=True,
             pad=0.03,
             vpad=0.09):
        """Lay out the track's sequences as chromosome bars, with annotations.

        Args:
            roundrect: forwarded to ``HorizontalChromosome``.
            plot_label: write ``self.label`` at the position selected by
                ``self.label_va``.
            plot_circles: put a ``TextCircle`` with the sequence name next to
                the bars.
            pad: distance between a bar and its circle (mirrored when
                ``self.va`` is "bottom").
            vpad: distance between the track and its label.
        """
        if self.empty:
            return

        ax, tr = self.ax, self.tr
        base_y, color = self.y, self.color
        gap = self.gap
        total = len(self.seqids)

        # Neither of these changes from one bar to the next
        stride = 10 if total > 40 else 2
        circle_y = base_y + (-pad if self.va == "bottom" else pad)

        cursor = self.xstart
        for index, seqid in enumerate(self.seqids):
            right = cursor + self.ratio * self.sizes[seqid]
            bar = HorizontalChromosome(
                ax,
                cursor,
                right,
                base_y,
                height=self.height,
                lw=self.lw,
                fc=color,
                roundrect=roundrect,
            )
            bar.set_transform(tr)
            name = make_circle_name(seqid, self.rev)
            middle = (cursor + right) / 2
            cursor = right + gap

            # Crowded tracks only annotate every `stride`-th bar, tiny tracks
            # get no circles at all
            crowded = total >= 2 * MaxSeqids
            if crowded and (index + 1) % stride != 0:
                continue
            if total < 5 or not plot_circles:
                continue

            TextCircle(
                ax,
                middle,
                circle_y,
                name,
                fc="w",
                color=color,
                size=10,
                transform=tr,
            )

        text = markup(self.label)
        # "gainsboro" bars get a black label
        text_color = color if color != "gainsboro" else "k"
        if not plot_label:
            return

        if self.label_va == "top":
            label_x, label_y = self.x, self.y + vpad
        elif self.label_va == "bottom":
            label_x, label_y = self.x, self.y - vpad
        else:  # "center"
            label_x, label_y = self.xstart - vpad / 2, self.y
        ax.text(label_x,
                label_y,
                text,
                ha="center",
                va="center",
                color=text_color,
                transform=tr)
Example #25
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and N are
    only used to annotate the graphic. Find out totalKmers when running
    kmer.meryl().
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    import os.path as op

    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin",
                 dest="vmin",
                 default=1,
                 type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax",
                 dest="vmax",
                 default=100,
                 type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf",
                 default=False,
                 action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage",
                 default=0,
                 type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks",
                 default=False,
                 action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    ascii_plot = not opts.pdf
    peaks = not opts.nopeaks
    N = int(N)

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # An explicit --coverage overrides the detected main peak
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1. / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".\
                        format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        # sys.stderr.write() instead of the Python-2-only `print >>` statement
        # so that the module also compiles under Python 3
        sys.stderr.write(msg + "\n")

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} genome {1}-mer histogram".format(species, N)

    if ascii_plot:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    if peaks:
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(x, y) for x, y in ks.counts if x in t]
        # zip(*[]) cannot be unpacked into two names, so skip the annotation
        # when none of the peak positions occurs in the counts
        if tcounts:
            x, y = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(x, y, 'ko', lw=2, mec='k', mfc='w')
            ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
            ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    # Summary lines in the top-right corner, in axes coordinates
    tc = "gray"
    axt = ax.transAxes
    ax.text(.95, .95, Total_Kmers_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .9, Kmer_coverage_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .85, Genome_size_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .8, Repetitive_msg, color=tc, transform=axt, ha="right")
    ax.text(.95, .75, SNPrate_msg, color=tc, transform=axt, ha="right")

    # Leave headroom above the curve for the summary lines
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title), color='r')
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel, color='r')
    ax.set_ylabel(ylabel, color='r')
    set_human_axis(ax)

    # Strip extensions from the file name only: splitting the whole path at
    # its first "." would truncate inputs such as "./run/x.histogram" or
    # directories that contain a dot.
    folder, basename = op.split(histfile)
    imagename = op.join(folder, basename.split(".")[0] + ".pdf")
    savefig(imagename, dpi=100)

    return Genome_size
Example #26
0
def cartoon(args):
    """
    %prog synteny.py

    Generate cartoon illustration of SynFind.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    parser = OptionParser(cartoon.__doc__)
    opts, args, iopts = parser.set_image_options(args, figsize="10x7")

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    # Panel A: the query region with arrows spanning both flanks
    A = CartoonRegion(41)
    A.draw(root, .35, .85, strip=False, color=False)
    x1, x2 = A.x1, A.x2
    lsg = "lightslategray"
    pad = .01
    xc, yc = .35, .88
    arrowlen = x2 - xc - pad
    arrowprops = {
        "length_includes_head": True,
        "width": .01,
        "fc": lsg,
        "lw": 0,
        "head_length": arrowlen * .15,
        "head_width": .03,
    }
    for tail_x, dx, shape in ((xc - pad, -arrowlen, "left"),
                              (xc + pad, arrowlen, "right")):
        arrow = FancyArrow(tail_x, yc, dx, 0, shape=shape, **arrowprops)
        root.add_patch(arrow)

    yt = yc + 4 * pad
    root.text((x1 + xc) / 2, yt, "20 genes upstream", ha="center")
    root.text((x2 + xc) / 2, yt, "20 genes downstream", ha="center")
    root.plot((xc,), (yc,), "o", mfc='w', mec=lsg, mew=2, lw=2, color=lsg)
    root.text(xc, yt, "Query gene", ha="center")

    # Panel B: the original region plus three evolved copies
    A.draw(root, .35, .7, strip=False)

    RoundRect(root, (.07, .49), .56, .14, fc='y', alpha=.2)
    evolved = []
    for mode, target, ypos in (('S', 10, .6), ('F', 8, .56), ('G', 6, .52)):
        region = deepcopy(A)
        region.evolve(mode=mode, target=target)
        region.draw(root, .35, ypos)
        evolved.append(region)
    a, b, c = evolved

    for region in evolved:
        root.text(.64, region.y, "Score={0}".format(region.nonwhites),
                  va="center")

    # Panel C: one diagram per syntelog class
    A.truncate_between_flankers()
    a.truncate_between_flankers()
    b.truncate_between_flankers()
    c.truncate_between_flankers(target=6)

    diagrams = (
        (.14, a, "S", "syntenic"),
        (.37, b, "F", "missing, with both flankers"),
        (.6, c, "G", "missing, with one flanker"),
    )
    for xpos, region, tag, caption in diagrams:
        plot_diagram(root, xpos, .2, A, region, tag, caption)

    labels = ((.04, .95, 'A'), (.04, .75, 'B'), (.04, .4, 'C'))
    panel_labels(root, labels)

    # Descriptions, right-hand column
    xt = .85
    captions = (
        (.88, "Extract neighborhood"),
        (.84, "of *window* size"),
        (.64, "Count gene pairs within *window*"),
        (.6, "Find regions above *score* cutoff"),
        (.3, "Identify flankers"),
        (.26, "Annotate syntelog class"),
    )
    for ypos, caption in captions:
        root.text(xt, ypos, markup(caption), ha="center", va="center")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = "cartoon" + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Example #27
0
def histogram(args):
    """
    %prog histogram [reads.fasta|reads.fastq]

    Plot read length distribution for reads. The plot would be similar to the
    one generated by SMRT-portal, for example:

    http://blog.pacificbiosciences.com/2013/10/data-release-long-read-shotgun.html

    Plot has two axes - corresponding to pdf and cdf, respectively.  Also adding
    number of reads, average/median, N50, and total length.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.utils.cbook import human_size, thousands, SUFFIXES
    from jcvi.formats.fastq import fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.graphics.base import (
        plt,
        markup,
        human_formatter,
        human_base_formatter,
        savefig,
        set2,
        set_ticklabels_helvetica,
    )

    parser = OptionParser(histogram.__doc__)
    parser.set_histogram(vmax=50000,
                         bins=100,
                         xlabel="Read length",
                         title="Read length distribution")
    parser.add_option("--ylabel1",
                      default="Counts",
                      help="Label of y-axis on the left")
    parser.add_option(
        "--color",
        default="0",
        choices=list(map(str, range(8))),
        help="Color of bars, which is an index 0-7 in brewer set2",
    )
    opts, args, iopts = parser.set_image_options(args,
                                                 figsize="6x6",
                                                 style="dark")

    if len(args) != 1:
        sys.exit(not parser.print_help())

    fastafile, qualfile = fasta([args[0], "--seqtk"])
    sizes = Sizes(fastafile)
    read_lengths = sorted(sizes.sizes)
    xmin, xmax, bins = opts.vmin, opts.vmax, opts.bins
    left, height = stem_leaf_plot(read_lengths, xmin, xmax, bins)

    plt.figure(1, (iopts.w, iopts.h))
    ax1 = plt.gca()

    # Left axis: number of reads per length bin
    width = (xmax - xmin) * 0.5 / bins
    bar_color = set2[int(opts.color)]
    ax1.bar(left, height, width=width, linewidth=0, fc=bar_color,
            align="center")
    ax1.set_xlabel(markup(opts.xlabel))
    ax1.set_ylabel(opts.ylabel1)

    # Right axis: bases contained in reads at least as long as x
    ax2 = ax1.twinx()
    total_size, l50, n50 = sizes.summary
    hsize = human_size(total_size)
    # Express the curve in the same unit human_size() picked for the total
    tag = hsize[-2:]
    unit = 1000**SUFFIXES[1000].index(tag)

    remaining = {}
    consumed = 0
    for length in read_lengths:
        # only the first read of each length defines the curve at that x
        remaining.setdefault(length, (total_size - consumed) * 1.0 / unit)
        consumed += length
    xs, ys = zip(*sorted(remaining.items()))
    ax2.plot(xs, ys, "-", color="darkslategray")
    ax2.set_ylabel("{0} above read length".format(tag))

    xlim = (xmin - width / 2, xmax + width / 2)
    for axis in (ax1, ax2):
        set_ticklabels_helvetica(axis)
        axis.set_xlim(xlim)

    # Summary lines, echoed to stderr and stacked in the top-right corner
    summary = (
        "Total bases: {0}".format(hsize),
        "Total reads: {0}".format(thousands(len(sizes))),
        "Average read length: {0}bp".format(thousands(np.mean(read_lengths))),
        "Median read length: {0}bp".format(
            thousands(np.median(read_lengths))),
        "N50 read length: {0}bp".format(thousands(l50)),
    )
    text_x, text_y = 0.95, 0.95
    for line in summary:
        print(line, file=sys.stderr)
        ax1.text(text_x, text_y, line, color="gray",
                 transform=ax1.transAxes, ha="right")
        text_y -= 0.05

    ax1.set_title(markup(opts.title))
    # Seaborn removes ticks for all styles except 'ticks'. Now add them back:
    ax1.tick_params(
        axis="x",
        direction="out",
        length=3,
        left=False,
        right=False,
        top=False,
        bottom=True,
    )
    ax1.xaxis.set_major_formatter(human_base_formatter)
    ax1.yaxis.set_major_formatter(human_formatter)
    savefig(sizes.filename + ".pdf")
Example #28
0
def draw_depth(
    root,
    ax,
    bed,
    chrinfo=None,
    defaultcolor="k",
    sepcolor="w",
    ylim=100,
    title=None,
    subtitle=None,
):
    """ Draw depth plot on the given axes, using data from bed

    Sequences are concatenated along the x-axis in the order given by
    `chrinfo` (or by `bed.max_bp_in_chr` when `chrinfo` is empty); bed lines
    on other sequences are ignored. Every bed line becomes one point at the
    midpoint of its interval, and each sequence additionally gets a
    horizontal line at its median depth.

    Args:
        root (matplotlib.Axes): Canvas axes
        ax (matplotlib.Axes): Axes to plot data on
        bed (Bed): Bed data from mosdepth
        chrinfo (ChrInfoFile): seqid => color, new name; None or empty to
            plot every sequence in `bed` with `defaultcolor`
        defaultcolor (str): matplotlib-compatible color for data points of
            sequences that are not listed in `chrinfo`
        sepcolor (str): matplotlib-compatible color for chromosome breaks
        ylim (int): Upper limit of the y-axis (depth)
        title (str): Title of the figure, to the right of the axis
        subtitle (str): Subtitle of the figure, just below title
    """
    if chrinfo is None:
        chrinfo = {}

    def color_of(seqid):
        # Colour configured for the sequence, else the caller's default
        return chrinfo[seqid].color if seqid in chrinfo else defaultcolor

    sizes = bed.max_bp_in_chr
    seqids = chrinfo.keys() if chrinfo else sizes.keys()
    starts = {}
    ends = {}
    label_positions = []
    start = 0
    for seqid in seqids:
        starts[seqid] = start
        end = start + sizes[seqid]
        ends[seqid] = end
        label_positions.append((seqid, (start + end) / 2))
        start = end
    xsize = end

    # Extract plotting data
    data = []
    data_by_seqid = defaultdict(list)
    for b in bed:
        seqid = b.seqid
        if seqid not in starts:
            continue
        # chr01A  2000000 3000000 113.00
        x = starts[seqid] + (b.start + b.end) / 2
        y = float(b.accn)
        # Honour `defaultcolor` here as well (this used to be a fixed "k")
        data.append((x, y, color_of(seqid)))
        data_by_seqid[seqid].append(y)

    x, y, c = zip(*data)
    ax.scatter(
        x,
        y,
        c=c,
        edgecolors="none",
        s=8,
        lw=0,
    )
    logging.debug("Obtained {} data points with depth data".format(len(data)))

    # Per seqid median
    medians = {}
    for seqid, values in data_by_seqid.items():
        seqid_start = starts[seqid]
        seqid_end = ends[seqid]
        seqid_median = np.median(values)
        medians[seqid] = seqid_median
        ax.plot(
            (seqid_start, seqid_end),
            (seqid_median, seqid_median),
            "-",
            lw=4,
            color=color_of(seqid),
            alpha=0.5,
        )

    # vertical lines for all the breaks
    for pos in starts.values():
        ax.plot((pos, pos), (0, ylim), "-", lw=1, color=sepcolor)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    # Sequence names below and median depths above the plot, in canvas
    # coordinates
    median_depth_y = 0.88
    chr_label_y = 0.08
    for seqid, position in label_positions:
        # NOTE(review): 0.1 + 0.8 * fraction presumably matches where the
        # caller placed `ax` on the canvas - confirm against the caller
        xpos = 0.1 + position * 0.8 / xsize
        c = color_of(seqid)
        newseqid = chrinfo[seqid].new_name if seqid in chrinfo else seqid
        root.text(xpos,
                  chr_label_y,
                  newseqid,
                  color=c,
                  ha="center",
                  va="center",
                  rotation=20)
        seqid_median = medians[seqid]
        root.text(
            xpos,
            median_depth_y,
            str(int(seqid_median)),
            color=c,
            ha="center",
            va="center",
        )

    if title:
        root.text(
            0.95,
            0.5,
            markup(title),
            color="darkslategray",
            ha="center",
            va="center",
            size=15,
        )
    if subtitle:
        root.text(
            0.95,
            0.375,
            markup(subtitle),
            color="darkslategray",
            ha="center",
            va="center",
            size=15,
        )

    ax.set_xticks([])
    ax.set_xlim(0, xsize)
    ax.set_ylim(0, ylim)
    ax.set_ylabel("Depth")

    set_human_axis(ax)
    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
             color="gray",
             size=10)
    normalize_axes(root)
Example #29
0
def dotplot(anchorfile, qbed, sbed, fig, root, ax, vmin=0, vmax=1,
        is_self=False, synteny=False, cmap_text=None, cmap="copper",
        genomenames=None, sample_number=10000, minfont=5, palette=None,
        chrlw=.01, title=None, sepcolor="gainsboro"):
    """Draw a dot plot of anchor pairs between two gene orders.

    Each non-comment line of `anchorfile` is split on whitespace; the first
    two columns are the query and subject names, and the last column is used
    as the dot value when `cmap_text` is set. Lines starting with "#" open a
    new block and are not plotted themselves.

    Args:
        anchorfile: path of the anchors file to read.
        qbed, sbed: query / subject objects; this function reads `.order`
            (name -> (index, feature)), `.get_breaks()`, `.filename` and
            `len()` from them.
        fig: figure handed to `TextHandler` for font size selection.
        root: axes used for labels, title, legend and colormap (0-1 coords).
        ax: axes receiving the scatter plot and chromosome separators.
        vmin, vmax: when `cmap_text` is set, pairs whose value falls outside
            [vmin, vmax] are dropped; also passed to `scatter`.
        is_self: mirror every pair across the diagonal and draw the diagonal.
        synteny: if true, run `batch_scan` on the data and box the clusters.
        cmap_text: label for the colormap; also enables value parsing.
        cmap: colormap name used when no palette is given.
        genomenames: "X_Y" string giving the axis labels; defaults to the
            basenames of the bed filenames up to the first ".".
        sample_number: maximum number of points plotted (random subset).
        minfont: chromosome labels with a smaller selected font are skipped.
        palette: optional mapping of block id -> color (queried via `.get`),
            whose `.colors` mapping is drawn as a legend.
        chrlw: line width of the chromosome separators.
        title: plot title; a default including the pair count is built when
            empty.
        sepcolor: color of the chromosome separators.
    """
    qorder = qbed.order
    sorder = sbed.order

    data = []
    if cmap_text:
        logging.debug("Capping values within [{0:.1f}, {1:.1f}]"\
                        .format(vmin, vmax))

    block_id = 0
    # The color must persist across the rows of a block: it is assigned on the
    # "#" header line and applies to every following data row until the next
    # header. Resetting it per row would discard it before it is ever used.
    block_color = None
    with open(anchorfile) as fp:
        for row in fp:
            atoms = row.split()
            if row[0] == "#":
                block_id += 1
                if palette:
                    block_color = palette.get(block_id, "k")
                continue

            # first two columns are query and subject, and an optional
            # third column
            if len(atoms) < 2:
                continue

            query, subject = atoms[:2]
            value = atoms[-1]

            if cmap_text:
                try:
                    value = float(value)
                except ValueError:
                    # unparsable values are pinned to the upper cap
                    value = vmax

                if value < vmin:
                    continue
                if value > vmax:
                    continue
            else:
                value = 0

            if query not in qorder:
                continue
            if subject not in sorder:
                continue

            qi, q = qorder[query]
            si, s = sorder[subject]

            nv = value if block_color is None else block_color
            data.append((qi, si, nv))
            if is_self:  # Mirror image
                data.append((si, qi, nv))

    npairs = len(data)
    # Only show random subset
    if npairs > sample_number:
        logging.debug("Showing a random subset of {0} data points (total {1}) " \
                      "for clarity.".format(sample_number, npairs))
        data = sample(data, sample_number)

    if data:
        x, y, c = zip(*data)

        if palette:
            ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)
        else:
            ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0, cmap=cmap,
                    vmin=vmin, vmax=vmax)

        if synteny:
            clusters = batch_scan(data, qbed, sbed)
            draw_box(clusters, ax)
    else:
        # zip(*[]) cannot be unpacked into three names; draw empty axes instead
        logging.debug("No anchor pairs to plot from {0}".format(anchorfile))

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=cmap)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d", xsize, ysize)
    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    # Tag to mark whether to plot chr name (skip small ones)
    xchr_labels, ychr_labels = [], []
    th = TextHandler(fig)

    # plot the chromosome breaks
    for (seqid, beg, end) in qbed.get_breaks():
        xsize_ratio = abs(end - beg) * .8 / xsize
        fontsize = th.select_fontsize(xsize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        xchr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot([beg, beg], ylim, "-", lw=chrlw, color=sepcolor)

    for (seqid, beg, end) in sbed.get_breaks():
        ysize_ratio = abs(end - beg) * .8 / ysize
        fontsize = th.select_fontsize(ysize_ratio)
        seqid = "".join(seqid_parse(seqid)[:2])

        ychr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot(xlim, [beg, beg], "-", lw=chrlw, color=sepcolor)

    # plot the chromosome labels; data positions are mapped into the
    # [.1, .9] band of the root axes
    for label, pos, fontsize in xchr_labels:
        pos = .1 + pos * .8 / xsize
        if fontsize >= minfont:
            root.text(pos, .91, latex(label), size=fontsize,
                ha="center", va="bottom", rotation=45, color="grey")

    # remember y labels are inverted
    for label, pos, fontsize in ychr_labels:
        pos = .9 - pos * .8 / ysize
        if fontsize >= minfont:
            root.text(.91, pos, latex(label), size=fontsize,
                va="center", color="grey")

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        gx, gy = [op.basename(b.filename).split(".")[0]
                  for b in (qbed, sbed)]
    ax.set_xlabel(markup(gx), size=16)
    ax.set_ylabel(markup(gy), size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
            color='gray', size=10)

    if palette:  # bottom-left has the palette, if available
        colors = palette.colors
        xstart, ystart = .1, .05
        for category, c in sorted(colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0, fc=c))
            root.text(xstart + .04, ystart, category, color=c)
            xstart += .1

    if not title:
        title = "Inter-genomic comparison: {0} vs {1}".format(gx, gy)
        if is_self:
            title = "Intra-genomic comparison within {0}".format(gx)
            # every pair was stored twice (mirror image); keep the count an int
            npairs //= 2
        title += " ({0} gene pairs)".format(thousands(npairs))
    root.set_title(markup(title), x=.5, y=.96, color="k")
    logging.debug(title)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
Example #30
0
    def __init__(self,
                 ax,
                 ext,
                 layout,
                 bed,
                 scale,
                 switch=None,
                 chr_label=True,
                 pad=.04,
                 vpad=.012):
        """Draw one region (backbone, gene glyphs, labels) on `ax`.

        `ext` is unpacked into seven items: the first and last features (their
        `.start` / `.end` give the base-pair extent), the slice bounds `si`
        and `ei` into `bed`, the sequence name, the orientation ("-" flips the
        region) and the span. `layout` supplies x, y, ratio, rotation, hidden,
        ha, va and color. `switch`, when given, maps the sequence name to a
        display name. Nothing is drawn when `layout.hidden` is true, but the
        gene coordinates are still recorded.

        Sets `self.y`, `self.xstart`, `self.xend`, `self.genes` and `self.gg`
        (gene accession -> pair of end points in axes coordinates).
        """
        center_x, center_y = layout.x, layout.y
        scale /= layout.ratio
        self.y = center_y
        angle = layout.rotation
        visible = not layout.hidden

        # Rotate around the region center, then map into axes coordinates
        axes_tr = ax.transAxes
        rotated = mpl.transforms.Affine2D().rotate_deg_around(
            center_x, center_y, angle) + axes_tr
        to_axes = axes_tr.inverted()

        first, last, si, ei, seqid, orientation, span = ext
        half_width = span / scale / 2
        left = center_x - half_width
        right = center_x + half_width
        self.xstart = left
        self.xend = right

        # Chromosome
        if visible:
            ax.plot((left, right), (center_y, center_y), color="gray",
                    transform=rotated, lw=2, zorder=1)

        self.genes = bed[si:ei + 1]
        flipped = orientation == '-'
        if flipped:
            startbp, endbp = last.end, first.start
        else:
            startbp, endbp = first.start, last.end

        def to_x(bp):
            # Distance from the region start in bp, converted to plot units
            return left + abs(bp - startbp) / scale

        if switch:
            seqid = switch.get(seqid, seqid)
        begin_text = human_size(startbp, target="Mb")[:-2]
        end_text = human_size(endbp, target="Mb")
        label = "-".join([begin_text, end_text])

        glyph_height = .012
        self.gg = {}
        # Genes
        for gene in self.genes:
            strand = gene.strand
            if strand == '-':
                gstart, gend = gene.end, gene.start
            else:
                gstart, gend = gene.start, gene.end
            if flipped:
                strand = "+" if strand == "-" else "-"

            x1 = to_x(gstart)
            x2 = to_x(gend)
            # Gene end points after rotation, expressed in axes coordinates
            self.gg[gene.accn] = tuple(
                to_axes.transform(rotated.transform((px, center_y)))
                for px in (x1, x2))

            if visible:
                glyph = Glyph(ax,
                              x1,
                              x2,
                              center_y,
                              glyph_height,
                              gradient=False,
                              fc={"+": "b"}.get(strand, "g"),
                              zorder=3)
                glyph.set_transform(rotated)

        # The label sits on the requested side, so its own alignment is the
        # opposite of the layout's horizontal anchor
        hpad = .02
        side = layout.ha
        va = layout.va
        if side == "left":
            xx, ha = left - hpad, "right"
        elif side == "right":
            xx, ha = right + hpad, "left"
        else:
            xx, ha = center_x, "center"

        # Tentative solution to labels stick into glyph
        magic = 40.
        steepness = abs(angle)
        if steepness > magic:
            cc = steepness / magic
        else:
            cc = 1

        yy = center_y
        if va == "top":
            yy = center_y + cc * pad
        elif va == "bottom":
            yy = center_y - cc * pad

        anchor = np.array((xx, yy))
        trans_angle = ax.transAxes.transform_angles(
            np.array((angle, )), anchor.reshape((1, 2)))[0]
        lx, ly = anchor
        if visible and chr_label:
            captions = (
                (ly + vpad, markup(seqid), layout.color),
                (ly - vpad, label, "k"),
            )
            for text_y, text, text_color in captions:
                ax.text(lx,
                        text_y,
                        text,
                        color=text_color,
                        ha=ha,
                        va="center",
                        rotation=trans_angle)