def dotplot(anchorfile, qbed, sbed, fig, root, ax, vmin=0, vmax=1,
            is_self=False, synteny=False, cmap_text=None, cmap="copper",
            genomenames=None, sample_number=10000, minfont=5, palette=None,
            chrlw=.1, title=None, sep=True, sepcolor="g", stdpf=True):
    """Draw a dot plot of the anchor pairs listed in `anchorfile`.

    Each non-header line of `anchorfile` is split on whitespace; the first
    two columns are the query and subject names and the last column is an
    optional value. Lines starting with "#" open a new block of pairs.

    Args:
        anchorfile: path of the anchors file to read.
        qbed, sbed: query/subject objects exposing `.order` (name ->
            (index, entry)), `.filename`, `len()` and `.get_breaks()`.
        fig, root, ax: figure, canvas axes used for legend/title, and the
            axes that receive the scatter plot.
        vmin, vmax: accepted value range; pairs outside it are dropped when
            `cmap_text` is set.
        is_self: also plot the mirrored pair and draw a diagonal.
        synteny: run `batch_scan` on the points and box the clusters.
        cmap_text: label for the colour bar; also switches on value parsing.
            When unset every pair gets the value 0.
        cmap: colormap handed to `scatter` and `draw_cmap`.
        genomenames: "x_y" string overriding the names taken from the
            bed file names.
        sample_number: forwarded to `downsample`.
        minfont, chrlw, sep, sepcolor, stdpf: forwarded unchanged to
            `plot_breaks_and_labels`.
        palette: object exposing `.get(block_id, default)` and `.colors`;
            when given, points are coloured per block instead of by value.
        title: plot title; a default one with the pair count is built when
            this is None.
    """
    # Genome names: explicit "x_y" string, or the bed file base names.
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        gx, gy = [op.basename(bed.filename).split(".")[0]
                  for bed in (qbed, sbed)]
    gx, gy = markup(gx), markup(gy)

    qorder = qbed.order
    sorder = sbed.order

    if cmap_text:
        logging.debug("Capping values within [{0:.1f}, {1:.1f}]"
                      .format(vmin, vmax))

    data = []
    block_id = 0
    # The block colour must survive from a "#" header to the data rows that
    # follow it, so it lives outside the loop. Rows seen before any header
    # fall back to "k" (the same default used for unknown blocks) so that a
    # palette plot never mixes numbers and colour names.
    block_color = "k" if palette else None
    with open(anchorfile) as fp:
        for row in fp:
            if row.startswith("#"):
                block_id += 1
                if palette:
                    block_color = palette.get(block_id, "k")
                continue

            # first two columns are query and subject, and an optional
            # third column
            atoms = row.split()
            if len(atoms) < 2:
                continue

            query, subject = atoms[:2]
            value = atoms[-1]

            if cmap_text:
                try:
                    value = float(value)
                except ValueError:
                    # No numeric column: treat the pair as having value vmax.
                    value = vmax
                if value < vmin or value > vmax:
                    continue
            else:
                value = 0

            if query not in qorder or subject not in sorder:
                continue

            qi, _ = qorder[query]
            si, _ = sorder[subject]

            nv = value if block_color is None else block_color
            data.append((qi, si, nv))
            if is_self:  # Mirror image
                data.append((si, qi, nv))

    npairs = downsample(data, sample_number=sample_number)
    x, y, c = zip(*data)

    if palette:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)
    else:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0, cmap=cmap,
                   vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=cmap)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d", xsize, ysize)
    qbreaks = qbed.get_breaks()
    sbreaks = sbed.get_breaks()
    xlim, ylim = plot_breaks_and_labels(fig, root, ax, gx, gy, xsize, ysize,
                                        qbreaks, sbreaks, sep=sep,
                                        chrlw=chrlw, sepcolor=sepcolor,
                                        minfont=minfont, stdpf=stdpf)

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    if palette:  # bottom-left has the palette, if available
        xstart, ystart = .1, .05
        for category, color in sorted(palette.colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0,
                                     fc=color))
            root.text(xstart + .04, ystart, category, color=color)
            xstart += .1

    if title is None:
        title = "Inter-genomic comparison: {0} vs {1}".format(gx, gy)
        if is_self:
            title = "Intra-genomic comparison within {0}".format(gx)
            # Mirrored pairs were counted twice. Floor division keeps the
            # count integral (assumes downsample returns an integer count
            # -- TODO confirm).
            npairs //= 2
        title += " ({0} gene pairs)".format(thousands(npairs))
    root.set_title(title, x=.5, y=.96, color="k")
    if title:
        logging.debug("Dot plot title: {}".format(title))
    normalize_axes(root)
def dotplot(anchorfile, qbed, sbed, fig, root, ax, vmin=0, vmax=1,
            is_self=False, synteny=False, cmap_text=None, genomenames=None,
            sample_number=10000, ignore=.005, palette=None, chrlw=.01,
            title=None):
    """Draw a dot plot of the anchor pairs listed in `anchorfile`.

    Each non-header line of `anchorfile` is split on whitespace; the first
    two columns are the query and subject names and the last column is an
    optional value. Lines starting with "#" open a new block of pairs.

    Args:
        anchorfile: path of the anchors file to read.
        qbed, sbed: query/subject objects exposing `.order` (name ->
            (index, entry)), `.filename`, `len()` and `.get_breaks()`.
        fig, root, ax: figure (receives the title), canvas axes in 0-1
            coordinates (labels, legend), and the scatter axes.
        vmin, vmax: values are clamped into this range; the plotted value
            is `vmax - value`.
        is_self: also plot the mirrored pair and draw a diagonal.
        synteny: run `batch_scan` on the points and box the clusters.
        cmap_text: label for the colour bar; no bar is drawn when unset.
        genomenames: "x_y" string overriding the axis labels taken from
            the bed file names.
        sample_number: maximum number of points plotted; a random subset
            of this size is drawn when there are more.
        ignore: sequences shorter than this fraction of the axis get no
            label; a falsy value labels everything.
        palette: object exposing `.get(block_id, default)` and `.colors`;
            when given, points are coloured per block instead of by value.
        chrlw: line width of the sequence separators.
        title: optional figure title.
    """
    def short_label(seqid):
        # Keep the part after the last "_"; purely numeric ids become "cN".
        seqid = seqid.split("_")[-1]
        try:
            return "c%d" % int(seqid)
        except ValueError:
            return seqid

    qorder = qbed.order
    sorder = sbed.order

    if cmap_text:
        logging.debug("Normalize values to [%.1f, %.1f]", vmin, vmax)

    data = []
    block_id = 0
    # The block colour must survive from a "#" header to the data rows that
    # follow it, so it lives outside the loop. Rows seen before any header
    # fall back to "k" (the same default used for unknown blocks) so that a
    # palette plot never mixes numbers and colour names.
    block_color = "k" if palette else None
    with open(anchorfile) as fp:
        for row in fp:
            if row.startswith("#"):
                block_id += 1
                if palette:
                    block_color = palette.get(block_id, "k")
                continue

            # first two columns are query and subject, and an optional
            # third column
            atoms = row.split()
            if len(atoms) < 2:
                continue

            query, subject = atoms[:2]
            try:
                value = float(atoms[-1])
            except ValueError:
                # No numeric column: treat the pair as having value vmax.
                value = vmax

            # Clamp into [vmin, vmax], lower bound first as before.
            if value < vmin:
                value = vmin
            if value > vmax:
                value = vmax

            if query not in qorder or subject not in sorder:
                continue

            qi, _ = qorder[query]
            si, _ = sorder[subject]

            nv = vmax - value if block_color is None else block_color
            data.append((qi, si, nv))
            if is_self:  # Mirror image
                data.append((si, qi, nv))

    # only show a random subset of at most sample_number points
    if len(data) > sample_number:
        logging.debug("Showing a random subset of %s data points (total %s) "
                      "for clarity.", sample_number, len(data))
        data = sample(data, sample_number)

    # the data are plotted in this order, the least value are plotted
    # last for aesthetics
    if not palette:
        data.sort(key=lambda point: -point[2])

    default_cm = cm.copper
    x, y, c = zip(*data)
    if palette:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)
    else:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0, cmap=default_cm,
                   vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=default_cm, reverse=True)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d", xsize, ysize)
    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    # Length below which a sequence gets no label (skip small ones)
    ignore_size_x = ignore_size_y = 0
    if ignore:
        ignore_size_x = xsize * ignore
        ignore_size_y = ysize * ignore

    # plot the chromosome breaks
    xchr_labels, ychr_labels = [], []
    for seqid, beg, end in qbed.get_breaks():
        skip = abs(end - beg) < ignore_size_x
        xchr_labels.append((short_label(seqid), (beg + end) / 2, skip))
        ax.plot([beg, beg], ylim, "g-", lw=chrlw)

    for seqid, beg, end in sbed.get_breaks():
        skip = abs(end - beg) < ignore_size_y
        ychr_labels.append((short_label(seqid), (beg + end) / 2, skip))
        ax.plot(xlim, [beg, beg], "g-", lw=chrlw)

    # plot the chromosome labels; the plot occupies .1-.9 of the canvas
    for label, pos, skip in xchr_labels:
        if not skip:
            root.text(.1 + pos * .8 / xsize, .91, label, ha="center",
                      va="bottom", rotation=45, color="grey")

    # remember y labels are inverted
    for label, pos, skip in ychr_labels:
        if not skip:
            root.text(.91, .9 - pos * .8 / ysize, label, va="center",
                      color="grey")

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        gx, gy = [op.basename(bed.filename).split(".")[0]
                  for bed in (qbed, sbed)]
    ax.set_xlabel(gx, size=16)
    ax.set_ylabel(gy, size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
             color='gray', size=10)

    if palette:  # bottom-left has the palette, if available
        xstart, ystart = .1, .05
        for category, color in sorted(palette.colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0,
                                     fc=color))
            root.text(xstart + .04, ystart, category, color=color)
            xstart += .1

    if title:
        fig.suptitle(title, x=.05, y=.98, color="k")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
def dotplot(anchorfile, qbed, sbed, fig, root, ax, vmin=0, vmax=1,
            is_self=False, synteny=False, cmap_text=None, genomenames=None,
            sample_number=10000, minfont=5, palette=None, chrlw=.01,
            title=None, sepcolor="gainsboro"):
    """Draw a dot plot of the anchor pairs listed in `anchorfile`.

    Each non-header line of `anchorfile` is split on whitespace; the first
    two columns are the query and subject names and the last column is an
    optional value. Lines starting with "#" open a new block of pairs.

    Args:
        anchorfile: path of the anchors file to read.
        qbed, sbed: query/subject objects exposing `.order` (name ->
            (index, entry)), `.filename`, `len()` and `.get_breaks()`.
        fig, root, ax: figure (used to size label fonts), canvas axes in
            0-1 coordinates (labels, legend, title), and the scatter axes.
        vmin, vmax: values are clamped into this range; the plotted value
            is `vmax - value`.
        is_self: also plot the mirrored pair and draw a diagonal.
        synteny: run `batch_scan` on the points and box the clusters.
        cmap_text: label for the colour bar; no bar is drawn when unset.
        genomenames: "x_y" string overriding the axis labels taken from
            the bed file names.
        sample_number: maximum number of points plotted; a random subset
            of this size is drawn when there are more.
        minfont: sequence labels whose selected font size is below this
            are not drawn.
        palette: object exposing `.get(block_id, default)` and `.colors`;
            when given, points are coloured per block instead of by value.
        chrlw, sepcolor: line width and colour of the sequence separators.
        title: plot title; when empty a default one including the total
            pair count is built.
    """
    qorder = qbed.order
    sorder = sbed.order

    if cmap_text:
        logging.debug("Normalize values to [%.1f, %.1f]", vmin, vmax)

    data = []
    block_id = 0
    # The block colour must survive from a "#" header to the data rows that
    # follow it, so it lives outside the loop. Rows seen before any header
    # fall back to "k" (the same default used for unknown blocks) so that a
    # palette plot never mixes numbers and colour names.
    block_color = "k" if palette else None
    with open(anchorfile) as fp:
        for row in fp:
            if row.startswith("#"):
                block_id += 1
                if palette:
                    block_color = palette.get(block_id, "k")
                continue

            # first two columns are query and subject, and an optional
            # third column
            atoms = row.split()
            if len(atoms) < 2:
                continue

            query, subject = atoms[:2]
            try:
                value = float(atoms[-1])
            except ValueError:
                # No numeric column: treat the pair as having value vmax.
                value = vmax

            # Clamp into [vmin, vmax], lower bound first as before.
            if value < vmin:
                value = vmin
            if value > vmax:
                value = vmax

            if query not in qorder or subject not in sorder:
                continue

            qi, _ = qorder[query]
            si, _ = sorder[subject]

            nv = vmax - value if block_color is None else block_color
            data.append((qi, si, nv))
            if is_self:  # Mirror image
                data.append((si, qi, nv))

    # npairs keeps the total count for the title even when we subsample.
    npairs = len(data)
    # Only show random subset
    if npairs > sample_number:
        logging.debug("Showing a random subset of {0} data points (total {1}) "
                      "for clarity.".format(sample_number, npairs))
        data = sample(data, sample_number)

    # the data are plotted in this order, the least value are plotted
    # last for aesthetics
    if not palette:
        data.sort(key=lambda point: -point[2])

    default_cm = cm.copper
    x, y, c = zip(*data)
    if palette:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)
    else:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0, cmap=default_cm,
                   vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=default_cm, reverse=True)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d", xsize, ysize)
    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    # Each label carries a font size chosen from the fraction of the canvas
    # its sequence spans (the plot occupies .8 of the canvas); labels that
    # come out smaller than minfont are skipped below.
    xchr_labels, ychr_labels = [], []
    th = TextHandler(fig)

    # plot the chromosome breaks
    for seqid, beg, end in qbed.get_breaks():
        fontsize = th.select_fontsize(abs(end - beg) * .8 / xsize)
        seqid = "".join(seqid_parse(seqid)[:2])
        xchr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot([beg, beg], ylim, "-", lw=chrlw, color=sepcolor)

    for seqid, beg, end in sbed.get_breaks():
        fontsize = th.select_fontsize(abs(end - beg) * .8 / ysize)
        seqid = "".join(seqid_parse(seqid)[:2])
        ychr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot(xlim, [beg, beg], "-", lw=chrlw, color=sepcolor)

    # plot the chromosome labels
    for label, pos, fontsize in xchr_labels:
        if fontsize >= minfont:
            root.text(.1 + pos * .8 / xsize, .91, latex(label),
                      size=fontsize, ha="center", va="bottom", rotation=45,
                      color="grey")

    # remember y labels are inverted
    for label, pos, fontsize in ychr_labels:
        if fontsize >= minfont:
            root.text(.91, .9 - pos * .8 / ysize, latex(label),
                      size=fontsize, va="center", color="grey")

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        gx, gy = [op.basename(bed.filename).split(".")[0]
                  for bed in (qbed, sbed)]
    ax.set_xlabel(gx, size=16)
    ax.set_ylabel(gy, size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
             color='gray', size=10)

    if palette:  # bottom-left has the palette, if available
        xstart, ystart = .1, .05
        for category, color in sorted(palette.colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0,
                                     fc=color))
            root.text(xstart + .04, ystart, category, color=color)
            xstart += .1

    if not title:
        title = "Inter-genomic comparison: {0} vs {1}".format(gx, gy)
        if is_self:
            title = "Intra-genomic comparison within {0}".format(gx)
            # Every pair was appended twice (mirror image); floor division
            # keeps the count an integer under true division as well.
            npairs //= 2
        title += " ({0} gene pairs)".format(thousands(npairs))
    root.set_title(title, x=.5, y=.96, color="k")
    logging.debug(title)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()
def dotplot(anchorfile, qbed, sbed, fig, root, ax, vmin=0, vmax=1,
            is_self=False, synteny=False, cmap_text=None, cmap="copper",
            genomenames=None, sample_number=10000, minfont=5, palette=None,
            chrlw=.01, title=None, sepcolor="gainsboro"):
    """Draw a dot plot of the anchor pairs listed in `anchorfile`.

    Each non-header line of `anchorfile` is split on whitespace; the first
    two columns are the query and subject names and the last column is an
    optional value. Lines starting with "#" open a new block of pairs.

    Args:
        anchorfile: path of the anchors file to read.
        qbed, sbed: query/subject objects exposing `.order` (name ->
            (index, entry)), `.filename`, `len()` and `.get_breaks()`.
        fig, root, ax: figure (used to size label fonts), canvas axes in
            0-1 coordinates (labels, legend, title), and the scatter axes.
        vmin, vmax: accepted value range; pairs outside it are dropped when
            `cmap_text` is set.
        is_self: also plot the mirrored pair and draw a diagonal.
        synteny: run `batch_scan` on the points and box the clusters.
        cmap_text: label for the colour bar; also switches on value parsing.
            When unset every pair gets the value 0.
        cmap: colormap handed to `scatter` and `draw_cmap`.
        genomenames: "x_y" string overriding the axis labels taken from
            the bed file names.
        sample_number: maximum number of points plotted; a random subset
            of this size is drawn when there are more.
        minfont: sequence labels whose selected font size is below this
            are not drawn.
        palette: object exposing `.get(block_id, default)` and `.colors`;
            when given, points are coloured per block instead of by value.
        chrlw, sepcolor: line width and colour of the sequence separators.
        title: plot title; when empty a default one including the total
            pair count is built.
    """
    qorder = qbed.order
    sorder = sbed.order

    if cmap_text:
        logging.debug("Capping values within [{0:.1f}, {1:.1f}]"
                      .format(vmin, vmax))

    data = []
    block_id = 0
    # The block colour must survive from a "#" header to the data rows that
    # follow it, so it lives outside the loop. Rows seen before any header
    # fall back to "k" (the same default used for unknown blocks) so that a
    # palette plot never mixes numbers and colour names.
    block_color = "k" if palette else None
    with open(anchorfile) as fp:
        for row in fp:
            if row.startswith("#"):
                block_id += 1
                if palette:
                    block_color = palette.get(block_id, "k")
                continue

            # first two columns are query and subject, and an optional
            # third column
            atoms = row.split()
            if len(atoms) < 2:
                continue

            query, subject = atoms[:2]
            value = atoms[-1]

            if cmap_text:
                try:
                    value = float(value)
                except ValueError:
                    # No numeric column: treat the pair as having value vmax.
                    value = vmax
                if value < vmin or value > vmax:
                    continue
            else:
                value = 0

            if query not in qorder or subject not in sorder:
                continue

            qi, _ = qorder[query]
            si, _ = sorder[subject]

            nv = value if block_color is None else block_color
            data.append((qi, si, nv))
            if is_self:  # Mirror image
                data.append((si, qi, nv))

    # npairs keeps the total count for the title even when we subsample.
    npairs = len(data)
    # Only show random subset
    if npairs > sample_number:
        logging.debug("Showing a random subset of {0} data points (total {1}) "
                      "for clarity.".format(sample_number, npairs))
        data = sample(data, sample_number)

    x, y, c = zip(*data)
    if palette:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0)
    else:
        ax.scatter(x, y, c=c, edgecolors="none", s=2, lw=0, cmap=cmap,
                   vmin=vmin, vmax=vmax)

    if synteny:
        clusters = batch_scan(data, qbed, sbed)
        draw_box(clusters, ax)

    if cmap_text:
        draw_cmap(root, cmap_text, vmin, vmax, cmap=cmap)

    xsize, ysize = len(qbed), len(sbed)
    logging.debug("xsize=%d ysize=%d", xsize, ysize)
    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    # Each label carries a font size chosen from the fraction of the canvas
    # its sequence spans (the plot occupies .8 of the canvas); labels that
    # come out smaller than minfont are skipped below.
    xchr_labels, ychr_labels = [], []
    th = TextHandler(fig)

    # plot the chromosome breaks
    for seqid, beg, end in qbed.get_breaks():
        fontsize = th.select_fontsize(abs(end - beg) * .8 / xsize)
        seqid = "".join(seqid_parse(seqid)[:2])
        xchr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot([beg, beg], ylim, "-", lw=chrlw, color=sepcolor)

    for seqid, beg, end in sbed.get_breaks():
        fontsize = th.select_fontsize(abs(end - beg) * .8 / ysize)
        seqid = "".join(seqid_parse(seqid)[:2])
        ychr_labels.append((seqid, (beg + end) / 2, fontsize))
        ax.plot(xlim, [beg, beg], "-", lw=chrlw, color=sepcolor)

    # plot the chromosome labels
    for label, pos, fontsize in xchr_labels:
        if fontsize >= minfont:
            root.text(.1 + pos * .8 / xsize, .91, latex(label),
                      size=fontsize, ha="center", va="bottom", rotation=45,
                      color="grey")

    # remember y labels are inverted
    for label, pos, fontsize in ychr_labels:
        if fontsize >= minfont:
            root.text(.91, .9 - pos * .8 / ysize, latex(label),
                      size=fontsize, va="center", color="grey")

    # create a diagonal to separate mirror image for self comparison
    if is_self:
        ax.plot(xlim, (0, ysize), 'm-', alpha=.5, lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # add genome names
    if genomenames:
        gx, gy = genomenames.split("_")
    else:
        gx, gy = [op.basename(bed.filename).split(".")[0]
                  for bed in (qbed, sbed)]
    ax.set_xlabel(markup(gx), size=16)
    ax.set_ylabel(markup(gy), size=16)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
             color='gray', size=10)

    if palette:  # bottom-left has the palette, if available
        xstart, ystart = .1, .05
        for category, color in sorted(palette.colors.items()):
            root.add_patch(Rectangle((xstart, ystart), .03, .02, lw=0,
                                     fc=color))
            root.text(xstart + .04, ystart, category, color=color)
            xstart += .1

    if not title:
        title = "Inter-genomic comparison: {0} vs {1}".format(gx, gy)
        if is_self:
            title = "Intra-genomic comparison within {0}".format(gx)
            # Every pair was appended twice (mirror image); floor division
            # keeps the count an integer under true division as well.
            npairs //= 2
        title += " ({0} gene pairs)".format(thousands(npairs))
    root.set_title(markup(title), x=.5, y=.96, color="k")
    logging.debug(title)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()