Ejemplo n.º 1
0
def _make_col_palette(palette_name, num_bars, symmetric):
    """Return a list of num_bars hex colors for the col argument of R's hist.

    palette_name  Name handed to get_palette_fn to look up the palette
                  function.
    num_bars      Number of colors to generate (one per bar).
    symmetric     If true, the palette is mirrored around the middle bar(s):
                  the first color of the palette ends up in the center and
                  the last color on both outer edges.

    Each color is converted with colorlib.rgb2hex and its "0x" prefix is
    replaced by "#".
    """
    from genomicode import colorlib

    color_fn = get_palette_fn(palette_name)
    if not symmetric:
        colors = color_fn(num_bars)
    else:
        # Color order goes from left to right.  The colors that were on
        # the left should now be in the middle.
        half = color_fn((num_bars + 1) // 2)
        mirrored = list(reversed(half))
        if num_bars % 2:
            # Odd.  The middle bar is shared by both halves, so skip the
            # first color of the right half instead of repeating it.
            colors = mirrored + half[1:]
        else:
            colors = mirrored + half
    colors = [colorlib.rgb2hex(rgb) for rgb in colors]
    colors = [hexcode.replace("0x", "#") for hexcode in colors]
    assert len(colors) == num_bars
    return colors
Ejemplo n.º 2
0
def main():
    """Command-line entry point: draw a line plot of gene expression with R.

    Reads the expression file with arrayio, keeps the rows selected by
    --gene_names (or every row with --all_genes), optionally saves them
    with write_prism_file, and then draws one line with points per
    selected row across the columns of the matrix into plot_file.
    """
    import argparse
    import math

    import arrayio
    from genomicode import config
    from genomicode import colorlib
    from genomicode import jmath
    from genomicode.jmath import R_fn, R_var, R_equals

    # ------------------------------------------------------------------
    # Command line.
    # ------------------------------------------------------------------
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("expression_file", help="Gene expression file.")
    parser.add_argument(
        "plot_file",
        help="Name of image file, e.g. outfile.png.  "
        "Will generate PNG format by default.  If this file name ends with "
        ".pdf, will generate a PDF file instead.")
    parser.add_argument("-v", "--verbose", action="store_true", help="")
    parser.add_argument(
        "--prism_file", help="Save result in Prism-formatted file.")

    gene_group = parser.add_argument_group(title="Genes")
    gene_group.add_argument(
        "--gene_names", default=[], action="append",
        help="Comma-separated list of IDs (e.g. probes, gene names) "
        "to include.")
    gene_group.add_argument(
        "--all_genes", default=False, action="store_true",
        help="Plot all genes in the file.")

    plot_group = parser.add_argument_group(title="Plot")
    plot_group.add_argument(
        "--title", default=None, help="Put a title on the plot.")
    plot_group.add_argument(
        "--height", default=None, type=int,
        help="Height (in pixels) of the plot.")
    plot_group.add_argument(
        "--width", default=None, type=int,
        help="Width (in pixels) of the plot.")
    plot_group.add_argument(
        "--mar_left", default=1.0, type=float,
        help="Scale margin at left of plot.  Default 1.0 (no scaling).")
    plot_group.add_argument(
        "--mar_bottom", default=1.0, type=float,
        help="Scale margin at bottom of plot.  Default 1.0 (no scaling).")
    plot_group.add_argument(
        "--xlabel_size", default=1.0, type=float,
        help="Scale the size of the labels on X-axis.  Default 1.0.")
    plot_group.add_argument(
        "--xlabel_off", default=False, action="store_true",
        help="Turn off the X labels.")
    plot_group.add_argument("--ylabel", help="Label the Y axis.")
    plot_group.add_argument(
        "--gene_name_header",
        help="Header for gene names to be used in the legend.")
    plot_group.add_argument(
        "--yaxis_starts_at_0", action="store_true",
        help="Y-axis should start at 0.")
    plot_group.add_argument(
        "--legend_off", action="store_true", help="Do not draw legend.")
    plot_group.add_argument(
        "--horizontal_lines", action="store_true",
        help="Draw horizontal lines.")

    # ------------------------------------------------------------------
    # Validate the arguments.
    # ------------------------------------------------------------------
    args = parser.parse_args()
    if not os.path.exists(args.expression_file):
        parser.error("I could not find file %s." % args.expression_file)
    for dim_name in ("width", "height"):
        pixels = getattr(args, dim_name)
        if pixels is None:
            continue
        assert pixels > 10, "too small"
        assert pixels < 4096 * 16, "%s too big" % dim_name
    assert args.gene_names or args.all_genes, \
           "Please specify some genes to plot."
    for scale in (args.mar_bottom, args.mar_left, args.xlabel_size):
        assert 0 < scale < 10

    plot_height = args.height or 1600
    plot_width = args.width or 1600

    # ------------------------------------------------------------------
    # Load the matrix and keep only the requested rows.
    # ------------------------------------------------------------------
    matrix = arrayio.read(args.expression_file)
    assert matrix.nrow() and matrix.ncol(), "Empty matrix."

    if args.gene_names:
        row_indexes = find_gene_names(matrix, args.gene_names)
    elif args.all_genes:
        row_indexes = range(matrix.nrow())
    else:
        row_indexes = None
    assert row_indexes, "No genes found."
    assert len(row_indexes) < 50, "Too many genes."
    matrix = matrix.matrix(row_indexes, None)

    # Names shown in the legend, one per selected row.
    header = args.gene_name_header
    if header:
        assert header in matrix.row_names(), "Missing header: %s" % header
        gene_names = matrix.row_names(header)
    else:
        gene_names = [
            get_pretty_gene_name(matrix, row) for row in range(matrix.nrow())
        ]
    assert len(gene_names) == matrix.nrow()

    if args.prism_file:
        write_prism_file(args.prism_file, matrix, gene_names)

    # ------------------------------------------------------------------
    # Start R and load the plotting library.
    # ------------------------------------------------------------------
    # The handle returned by start_R is not needed; every R call below
    # goes through the jmath module functions.
    jmath.start_R()
    plotlib = os.path.join(config.changlab_Rlib, "plotlib.R")
    assert os.path.exists(plotlib), "I cannot find: %s" % plotlib
    R_fn("source", plotlib)

    # ------------------------------------------------------------------
    # Work out the plot parameters.
    # ------------------------------------------------------------------
    main_title = args.title if args.title else R_var("NA")
    sub_title = ""
    x_axis_label = ""
    y_axis_label = args.ylabel if args.ylabel else ""

    # One extra unit of room to the right of the last column.
    xlim = [1, matrix.ncol() + 1]
    y_max = jmath.max(jmath.max(matrix._X))
    y_min = jmath.min(jmath.min(matrix._X))
    ylim = [y_min - 1, y_max + 1]
    if args.yaxis_starts_at_0:
        assert y_max > 0
        ylim[0] = 0

    if args.xlabel_off:
        labels = jmath.R_var("FALSE")
    else:
        labels = matrix.col_names(arrayio.COL_ID)

    line_width = 2
    label_orientation = 3  # vertical labels
    tick_positions = R_var("NULL")
    if labels != jmath.R_var("FALSE"):
        tick_positions = range(1, len(labels) + 1)
    cex_labels = args.xlabel_size
    cex_lab = 1.5
    cex_sub = 1.5

    # One color per gene, as "#"-prefixed hex strings.
    gene_colors = [
        colorlib.rgb2hex(rgb).replace("0x", "#")
        for rgb in colorlib.bild_colors(len(gene_names))
    ]

    R_equals(matrix._X, "X")
    R_equals(labels, "labels")
    R_equals(tick_positions, "at")

    # ------------------------------------------------------------------
    # Open the output device and draw the empty frame.
    # ------------------------------------------------------------------
    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    else:
        bm_type = "png16m"
    R_fn("bitmap",
         args.plot_file,
         type=bm_type,
         height=plot_height,
         width=plot_width,
         units="px",
         res=300)

    # Set the margins.
    base_margins = (5 * 1.2 * args.mar_bottom, 4 * 1.2 * args.mar_left, 4, 2)
    mar = [margin + 0.1 for margin in base_margins]
    R_fn("par", mar=mar, RETVAL="op")

    R_fn("plot",
         R_var("NA"),
         type="n",
         axes=R_var("FALSE"),
         xlab="",
         ylab="",
         xlim=xlim,
         ylim=ylim)
    # Paint the plot region white before drawing anything on it.
    jmath.R('usr <- par("usr")')
    jmath.R('rect(usr[1], usr[3], usr[2], usr[4], col="#FFFFFF")')
    R_fn("box", lwd=line_width)
    R_fn("axis",
         1,
         lwd=line_width,
         labels=R_var("labels"),
         at=R_var("at"),
         las=label_orientation,
         **{"cex.axis": cex_labels})
    R_fn("axis", 2, lwd=line_width, **{"cex.axis": 1.5})
    R_fn("title",
         main=main_title,
         sub=sub_title,
         xlab=x_axis_label,
         ylab=y_axis_label,
         **{
             "cex.lab": cex_lab,
             "cex.main": 2.0,
             "cex.sub": cex_sub
         })

    # ------------------------------------------------------------------
    # Draw the data: a line plus points for every gene.
    # ------------------------------------------------------------------
    for row in range(matrix.nrow()):
        y_values = matrix._X[row]
        x_values = range(1, len(y_values) + 1)
        R_fn("lines", x_values, y_values, lwd=line_width,
             col=gene_colors[row])
        R_fn("points", x_values, y_values, pch=19, cex=1,
             col=gene_colors[row])

    if args.horizontal_lines:
        # One dotted line at every integer inside the Y range.
        lowest = int(math.ceil(ylim[0]))
        highest = int(math.floor(ylim[1]))
        for level in range(lowest, highest + 1):
            R_fn("lines", (1, matrix.ncol() + 1), (level, level),
                 lty=3, col="#A0A0A0")

    if not args.legend_off:
        R_fn("legend",
             "bottomleft",
             legend=gene_names,
             fill=gene_colors,
             cex=1,
             inset=0.05,
             **{"box.lwd": 1.5})

    # Restore the graphics parameters and close the device.
    R_fn("par", R_var("op"))
    R_fn("dev.off")
Ejemplo n.º 3
0
def main():
    """Command-line entry point: draw a scatter plot from a tab-delimited file.

    Every --series option names a pair of columns (help text format:
    <x_header>;<y_header>).  For each series, the rows where either value
    is empty are dropped and the remaining values are converted to
    floats.  All series are merged and drawn on a single R plot written
    to plot_file.  Optional extras are connecting lines, an identity
    line, point labels, a regression line and a legend.

    The correlation of the merged X and Y values and its p-value are
    always printed to stdout.
    """
    import os
    import argparse

    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import colorlib
    from genomicode import pcalib

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("datafile", help="Tab-delimited data file.")
    parser.add_argument(
        "plot_file",
        help="Name of image file, e.g. outfile.png.  "
        "Will generate PNG format by default.  If this file name ends with "
        ".pdf, will generate a PDF file instead.")

    group = parser.add_argument_group(title="Data Series")
    group.add_argument(
        "--series",
        action="append",
        help="Add a data series to the plot.  At least one series must be "
        "plotted.  Format: <x_header>;<y_header>")

    group = parser.add_argument_group(title="General Appearance")
    group.add_argument("--no_box",
                       action="store_true",
                       help="Turn off the box around the plot.")
    group.add_argument("--height",
                       type=int,
                       help="Height (in pixels) of the plot.")
    group.add_argument("--width",
                       type=int,
                       help="Width (in pixels) of the plot.")
    group.add_argument(
        "--mar_left",
        default=1.0,
        type=float,
        help="Scale margin at left of plot.  Default 1.0 (no scaling).")
    group.add_argument("--mar_bottom",
                       default=1.0,
                       type=float,
                       help="Scale margin at bottom of plot.  Default 1.0.")
    group.add_argument("--log_x",
                       action="store_true",
                       help="Plot the X-axis on a log scale.")
    group.add_argument("--log_y",
                       action="store_true",
                       help="Plot the Y-axis on a log scale.")
    group.add_argument(
        "--qq",
        action="store_true",
        help="Make a QQ-plot.  Will sort the values to be plotted.")

    group = parser.add_argument_group(title="Plot Labels")
    group.add_argument("--title", help="Put a title on the plot.")
    group.add_argument("--xlab", help="Label the X-axis.")
    group.add_argument("--ylab", help="Label the Y-axis.")
    group.add_argument("--add_regression",
                       action="store_true",
                       help="Put a regression line on the plot.")

    group = parser.add_argument_group(title="Legend")
    group.add_argument("--add_legend",
                       action="store_true",
                       help="Add a legend to the plot.")
    group.add_argument("--legend_inset", type=float, default=0.05, help="")
    LEGEND_LOCATIONS = [
        "bottomright",
        "bottom",
        "bottomleft",
        "left",
        "topleft",
        "top",
        "topright",
        "right",
        "center",
    ]
    group.add_argument("--legend_loc",
                       choices=LEGEND_LOCATIONS,
                       help="Where to draw the legend.")

    group = parser.add_argument_group(title="Point Appearance")
    group.add_argument("--scale_points",
                       default=1.0,
                       type=float,
                       help="Scale the size of the points.  Default 1.0")
    group.add_argument("--label_header",
                       help="Label each point with the values in this column.")
    group.add_argument("--label_size",
                       type=float,
                       help="Scale the size of the labels by this value.")
    group.add_argument("--label_pos",
                       default="top",
                       choices=["top", "bottom", "left", "right"],
                       help="Where to label the points.")

    group = parser.add_argument_group(title="Line Appearance")
    group.add_argument("--add_lines",
                       action="store_true",
                       help="Add lines that connect the points.")
    group.add_argument("--scale_lines",
                       default=1.0,
                       type=float,
                       help="Scale the thickness of the lines.  Default 1.0")

    group = parser.add_argument_group(title="Identity Line")
    group.add_argument("--add_identity_line",
                       action="store_true",
                       help="Add an identity line to the plot.")

    group = parser.add_argument_group(title="Colors")
    group.add_argument(
        "-c",
        "--cluster",
        action="append",
        help="Group samples into a cluster (e.g. -c 1-5); 1-based, inclusive.")
    group.add_argument(
        "--indexes_include_headers",
        "--iih",
        action="store_true",
        help="If not given (default), then index 1 is the first row "
        "with data.  If given, then index 1 is the very first row "
        "in the file, including the headers.")
    group.add_argument("--default_color",
                       help="Default color of points.  Format: #000000.")

    # Parse the input arguments.
    args = parser.parse_args()
    if not os.path.exists(args.datafile):
        parser.error("File not found: %s" % args.datafile)
    if args.width is not None:
        assert args.width > 10, "too small"
        assert args.width < 4096 * 16, "width too big"
    if args.height is not None:
        assert args.height > 10, "too small"
        assert args.height < 4096 * 16, "height too big"
    assert args.mar_bottom > 0 and args.mar_bottom < 10
    assert args.mar_left > 0 and args.mar_left < 10

    assert args.legend_inset >= 0 and args.legend_inset < 10
    if args.legend_loc is None:
        args.legend_loc = "bottomright"

    if args.default_color:
        assert len(args.default_color) == 7
        assert args.default_color[0] == "#"

    MATRIX = AnnotationMatrix.read(args.datafile, False)
    assert MATRIX.num_headers() and MATRIX.num_annots(), "Empty matrix."
    assert args.series, "Need to add a data --series to plot."
    if args.label_header:
        assert args.label_header in MATRIX.headers, \
               "header not found: %s" % args.label_header
    if args.label_size is not None:
        assert args.label_size > 0 and args.label_size <= 20
    assert args.scale_points > 0 and args.scale_points < 20
    assert args.scale_lines > 0 and args.scale_lines < 20

    series = _parse_series(MATRIX, args.series)
    cluster = None
    if args.cluster:
        cluster = _parse_cluster(args.cluster, args.indexes_include_headers,
                                 MATRIX)

    if len(series) > 1:
        assert not cluster, "Series and cluster not implemented."

    height = args.height or 2400
    width = args.width or 3200

    # Pull out the values and colors for the plot.
    default_color = "#000000"
    if args.default_color:
        default_color = args.default_color

    assert len(series) < len(colorlib.BREWER_QUALITATIVE_SET1)
    series_data = []    # list of (x_values, y_values, colors) per series
    series_colors = []  # one color per series, used for the legend
    for i, (x_header, y_header) in enumerate(series):
        x = MATRIX[x_header]
        y = MATRIX[y_header]
        # Keep only the rows where both the X and the Y value are present.
        I = [j for (j, (a, b)) in enumerate(zip(x, y)) if a and b]
        x = [float(x[j]) for j in I]
        y = [float(y[j]) for j in I]
        assert len(x) == len(y)
        # A single series uses the default color.  Multiple series each
        # get their own color from the Brewer palette.
        c = default_color
        if len(series) > 1:
            rgb = colorlib.BREWER_QUALITATIVE_SET1[i]
            c = colorlib.rgb2hex(rgb, prefix="#")
        series_colors.append(c)
        series_data.append((x, y, [c] * len(x)))

    # Merge all the data point for each series.
    x_values = []
    y_values = []
    col = []
    for (x, y, c) in series_data:
        x_values.extend(x)
        y_values.extend(y)
        col.extend(c)
    assert len(x_values) == len(y_values)
    assert len(x_values) == len(col)

    if args.qq:
        # Sort the points by their X value.
        O = jmath.order(x_values)
        x_values = [x_values[i] for i in O]
        y_values = [y_values[i] for i in O]
        col = [col[i] for i in O]

    if cluster is not None:
        # Color the points by cluster.  Points for which choose_colors
        # returns None keep the default color.
        col_rgb = pcalib.choose_colors(cluster)
        col = [default_color] * len(col_rgb)
        for i in range(len(col_rgb)):
            if col_rgb[i] is None:
                continue
            col[i] = colorlib.rgb2hex(col_rgb[i], prefix="#")
        assert len(col) == len(x_values)

    # Start R and set up the environment.
    R = jmath.start_R()

    main = jmath.R_var("NA")
    if args.title:
        main = args.title
    sub = ""
    # With a single series, the axis labels default to the column
    # headers.  Explicit --xlab / --ylab always take precedence, each
    # one independently of the other.
    xlab = ""
    ylab = ""
    if len(series) == 1:
        xlab, ylab = series[0]
    if args.xlab:
        xlab = args.xlab
    if args.ylab:
        ylab = args.ylab

    lwd_box = 2
    lwd_axis = 2
    lwd_regr = 3
    cex = 1.0 * args.scale_points
    cex_lab = 1.5
    cex_main = 2.0
    cex_sub = 1.0

    plot_log = ""
    if args.log_x:
        plot_log += "x"
    if args.log_y:
        plot_log += "y"

    assert x_values
    assert y_values
    jmath.R_equals(x_values, "X")
    jmath.R_equals(y_values, "Y")

    bm_type = "png16m"
    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    jmath.R_fn("bitmap",
               args.plot_file,
               type=bm_type,
               height=height,
               width=width,
               units="px",
               res=300)

    # Set the margins.
    x = 5 * 1.2 * args.mar_bottom, 4 * 1.2 * args.mar_left, 4, 2
    mar = [v + 0.1 for v in x]
    jmath.R_fn("par", mar=mar, RETVAL="op")

    # Axes, box and titles are drawn at the end, after the optional
    # decorations.
    jmath.R_fn("plot",
               jmath.R_var("X"),
               jmath.R_var("Y"),
               main="",
               xlab="",
               ylab="",
               pch=19,
               cex=cex,
               log=plot_log,
               col=col,
               axes=jmath.R_var("FALSE"),
               RETVAL="x")

    if args.add_lines:
        lwd = 4 * args.scale_lines
        i = 0
        for (x, y, c) in series_data:
            # Cannot use c for the color.  It might've been changed by
            # --cluster.
            assert col and i < len(col)
            c = col[i:i + len(x)]
            i += len(x)

            # The "lines" function takes a scalar for col (except for
            # type=h, histogram vertical lines).  If there are
            # multiple colors, then split up the points based on the
            # colors.
            l_x, l_y, l_c = [], [], None
            for j in range(len(x)):
                if c[j] != l_c:
                    if l_x:
                        jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c)
                    # Add the previous point so that the points will
                    # connect.
                    if l_x:
                        l_x = [l_x[-1]]
                        l_y = [l_y[-1]]
                    else:
                        l_x, l_y, l_c = [], [], None
                l_x.append(x[j])
                l_y.append(y[j])
                l_c = c[j]
            if l_x:
                jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c)

    if args.add_identity_line:
        lwd = 4

        x_min, x_max = min(x_values), max(x_values)
        y_min, y_max = min(y_values), max(y_values)

        # Only draw the part of y=x covered by both the X and Y ranges.
        iden_min = max(x_min, y_min)
        iden_max = min(x_max, y_max)

        l_x = [iden_min, iden_max]
        l_y = l_x
        l_c = "#FF0000"
        jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c)

    if args.label_header:
        cex = 1
        if args.label_size is not None:
            cex = args.label_size
        pos2specifier = {
            "top": 3,
            "bottom": 1,
            "left": 2,
            "right": 4,
        }
        pos = pos2specifier[args.label_pos]
        # NOTE(review): the labels are the whole column, while X and Y
        # had rows with empty values dropped (and are reordered by
        # --qq), so the labels may not line up with the points in those
        # cases -- confirm.
        point_labels = MATRIX[args.label_header]
        jmath.R_fn("text",
                   jmath.R_var("X"),
                   jmath.R_var("Y"),
                   labels=point_labels,
                   cex=cex,
                   pos=pos)

    # Calculate correlation, and other statistics.
    # TODO: Should calculate this for each series.
    r = jmath.R("cor(X, Y)")
    p_value = jmath.R("cor.test(X, Y)$p.value")
    r = r[0]
    p_value = p_value[0]
    print("R = %.2f" % r)
    print("p = %.2g" % p_value)

    # Add a regression line.
    if args.add_regression:
        jmath.R("fit <- lm(Y ~ X)")
        coef = jmath.R("fit$coefficients")
        assert len(coef) == 2
        b, m = coef  # intercept, slope
        x1 = min(x_values)
        y1 = x1 * m + b
        x2 = max(x_values)
        y2 = x2 * m + b
        jmath.R_fn("lines", [x1, x2], [y1, y2],
                   lwd=lwd_regr,
                   lty=2,
                   col="#C63F31")
        sub = "R=%.2f (p=%.2g)" % (r, p_value)
        header = "X", "Y", "R", "p"
        print("\t".join(header))
        x = xlab, ylab, r, p_value
        print("\t".join(map(str, x)))

    if args.add_legend:
        # One legend entry per series: named after its Y header and
        # filled with the color assigned to that series.
        leg = [s[1] for s in series]
        fill = series_colors
        # alpha does not seem to be supported here.
        jmath.R_fn("legend",
                   args.legend_loc,
                   legend=leg,
                   fill=fill,
                   inset=args.legend_inset)

    if not args.no_box:
        jmath.R_fn("box", lwd=lwd_box)
    jmath.R_fn("axis", 1, lwd=lwd_axis, **{"cex.axis": 1.5})
    jmath.R_fn("axis", 2, lwd=lwd_axis, **{"cex.axis": 1.5})
    jmath.R_fn("title",
               main=main,
               sub=sub,
               xlab=xlab,
               ylab=ylab,
               **{
                   "cex.lab": cex_lab,
                   "cex.main": cex_main,
                   "cex.sub": cex_sub
               })
    # Restore the graphics parameters and close the device.
    R("par(op)")
    jmath.R_fn("dev.off")
Ejemplo n.º 4
0
def plot_boxplot(filename,
                 group_names,
                 group2values,
                 height=None,
                 width=None,
                 cluster=None,
                 title="",
                 subtitle="",
                 sub="",
                 xlab="",
                 ylab="",
                 subtitle_size=1.0,
                 subtitle_line=0.5,
                 subtitle_col="#000000",
                 xlabel_size=1.0,
                 xlabel_off=False,
                 mar_bottom=1.0,
                 mar_left=1.0,
                 mar_top=1.0):
    """Draw a box plot with R and write it to filename.

    filename       Image file to write.  A name ending in ".pdf"
                   (case-insensitive) selects the "pdfwrite" device,
                   anything else "png16m".
    group_names    List of the names for each group, in plotting order.
    group2values   Dictionary of group_name -> list of values (None
                   values are dropped, missing groups are plotted as
                   empty).  A plain list is handed to R unchanged; per
                   the original notes it is a matrix (values x groups).
    height, width  Size handed to the R bitmap device (units "px").
    cluster        If not None, passed to pcalib.choose_colors to pick
                   the box colors.
    title          Main title.  subtitle goes under title.  sub goes
                   under plot.
    xlab, ylab     Axis titles.
    subtitle_size, subtitle_line, subtitle_col
                   Size scale, margin line and color of the subtitle.
    xlabel_size    Scale for the size of the group labels on the X-axis.
    xlabel_off     If true, the group labels are not drawn.
    mar_bottom, mar_left, mar_top
                   Scale factors for the plot margins.
    """
    from genomicode import config
    from genomicode import jmath
    from genomicode import colorlib
    from genomicode import pcalib

    # Start R and set up the environment.
    R = jmath.start_R()
    plotlib = os.path.join(config.changlab_Rlib, "plotlib.R")
    assert os.path.exists(plotlib), "I cannot find: %s" % plotlib
    jmath.R_fn("source", plotlib)

    main_title = title if title else jmath.R_var("NA")
    tick_labels = jmath.R_var("FALSE") if xlabel_off else group_names

    box_colors = jmath.R_var("NULL")
    if cluster is not None:
        box_colors = [
            colorlib.rgb2hex(rgb).replace("0x", "#")
            for rgb in pcalib.choose_colors(cluster)
        ]

    line_width = 2
    label_orientation = 3  # vertical labels
    tick_positions = jmath.R_var("NULL")
    # Compare against a freshly built R_var each time, exactly as the
    # comparison was originally written.
    if tick_labels != jmath.R_var("FALSE"):
        tick_positions = range(1, len(tick_labels) + 1)
    cex_labels = 1.25 * xlabel_size
    cex_xlab = 1.5
    cex_sub = 1.5

    # Send the data to R as the variable X.
    if type(group2values) is list:
        # Is matrix.  Should do more checking here.
        jmath.R_equals(group2values, "X")
    else:
        # Build an R list with one element per group, in group_names order.
        R("X <- list()")
        for position, name in enumerate(group_names, 1):
            values = [v for v in group2values.get(name, []) if v is not None]
            jmath.R_equals(values, "s")
            R("X[[%d]] <- s" % position)

    jmath.R_equals(tick_labels, "labels")
    jmath.R_equals(tick_positions, "at")

    if filename.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    else:
        bm_type = "png16m"
    jmath.R_fn("bitmap",
               filename,
               type=bm_type,
               height=height,
               width=width,
               units="px",
               res=300)

    # Set the margins.
    # default is 5.1, 4.1, 4.1, 2.1
    # Shrink the bottom margin when there are no labels to fit in it.
    label_adjust = 1.0
    if tick_labels == jmath.R_var("FALSE"):
        label_adjust = 0.2
    base_margins = (5 * 2.0 * mar_bottom * label_adjust,
                    4 * 1.2 * mar_left,
                    4 * mar_top,
                    2)
    mar = [margin + 0.1 for margin in base_margins]
    jmath.R_fn("par", mar=mar, RETVAL="op")

    def draw_boxes(**extra):
        # Issue the boxplot call; extra holds any additional R arguments.
        jmath.R_fn("boxplot",
                   jmath.R_var("X"),
                   col=box_colors,
                   main="",
                   xlab="",
                   ylab="",
                   axes=jmath.R_var("FALSE"),
                   pch=19,
                   cex=1,
                   ylim=jmath.R_var("NULL"),
                   **extra)

    # The first call sets up the plot region, which is then painted
    # solid white.  The second call draws the boxes on top of it.
    draw_boxes()
    jmath.R('usr <- par("usr")')
    jmath.R('rect(usr[1], usr[3], usr[2], usr[4], col="#FFFFFF")')
    draw_boxes(add=jmath.R_var("TRUE"))

    jmath.R_fn("box", lwd=line_width)
    jmath.R_fn("axis",
               1,
               lwd=line_width,
               labels=jmath.R_var("labels"),
               at=jmath.R_var("at"),
               las=label_orientation,
               **{"cex.axis": cex_labels})
    jmath.R_fn("axis", 2, lwd=line_width, **{"cex.axis": 1.5})
    jmath.R_fn("title",
               main=main_title,
               sub=sub,
               xlab=xlab,
               ylab=ylab,
               **{
                   "cex.lab": cex_xlab,
                   "cex.main": 2.0,
                   "cex.sub": cex_sub,
                   "col.sub": "#A60400"
               })
    if subtitle:
        jmath.R_fn("mtext",
                   subtitle,
                   cex=1.0 * subtitle_size,
                   line=subtitle_line,
                   col=subtitle_col)
    # Restore the graphics parameters and close the device.
    R("par(op)")
    jmath.R_fn("dev.off")
Ejemplo n.º 5
0
def main():
    """Command-line entry point: PCA on the columns of a matrix, plotted in 2D.

    Positional arguments are the input matrix file and the output image
    file.  The matrix is read with read_matrix(), optionally reduced to
    the --genes rows picked by pcalib.select_genes_var(), and its columns
    are projected onto their principal components with
    pcalib.svd_project_cols().  The first two components are plotted with
    pcalib.plot_scatter().

    Optional outputs:
      --prism_file   scatter plot data, one series per cluster.
      --col_pc_file  tab-delimited principal components of the columns.
      --row_pc_file  tab-delimited principal components of the rows.

    Bad arguments or missing input files are reported through
    parser.error(); range checks on option values are assertions.
    """
    from optparse import OptionParser, OptionGroup
    import numpy
    import arrayio
    from genomicode import jmath
    from genomicode import pcalib
    from genomicode import colorlib
    from genomicode import prismlib

    # Does a PCA on the columns.
    usage = "usage: %prog [options] filename outfile.png"
    parser = OptionParser(usage=usage, version="%prog 01")

    #parser.add_option(
    #    "-l", "--log_transform", default=False,
    #    action="store_true",
    #    help="Log transform the data first.")

    parser.add_option(
        "--num_header_cols",
        type=int,
        help="This number of columns are headers.  If not given, will guess.")
    parser.add_option("-g",
                      "--genes",
                      default=None,
                      type="int",
                      help="Number of genes to use.")
    parser.add_option(
        "--prism_file",
        help="Write the column principal components to a prism-formatted "
        "file.")
    parser.add_option(
        "--row_pc_file",
        help="Write the principal components of the rows to this file.")
    parser.add_option(
        "--col_pc_file",
        help="Write the principal components of the cols to this file.")
    #parser.add_option(
    #    "-v", "--verbose", default=False, action="store_true",
    #    help="")

    group = OptionGroup(parser, "Clustering")
    parser.add_option_group(group)
    group.add_option(
        "-c",
        "--cluster",
        default=[],
        action="append",
        help="Group samples into a cluster (e.g. -c 1-5); 1-based.")
    group.add_option(
        "--indexes_include_headers",
        "--iih",
        action="store_true",
        help="If not given (default), then index 1 is the first column "
        "with data.  If given, then index 1 is the very first column "
        "in the file, including the headers.")
    group.add_option(
        "--cluster_file",
        help="A KGG format file of the clusters for the samples.  "
        "Clusters in this file can be 0-based or 1-based.")

    group = OptionGroup(parser, "Visualization")
    parser.add_option_group(group)
    group.add_option("--title", help="Put a title on the plot.")
    group.add_option("--width",
                     default=None,
                     type="int",
                     help="Width (in pixels) of the plot.")
    group.add_option("--label",
                     default=False,
                     action="store_true",
                     help="Label the samples.")
    group.add_option("--label_axes",
                     default=False,
                     action="store_true",
                     help="Label the axes.")
    group.add_option("--scale_label",
                     type=float,
                     default=1.0,
                     help="Scale the size of the labels.")

    # Parse the input arguments.
    options, args = parser.parse_args()
    if len(args) < 2:
        parser.error("Please specify an infile and an outfile.")
    elif len(args) > 2:
        parser.error("Too many input parameters (%d)." % len(args))
    filename, outfile = args
    if not os.path.exists(filename):
        parser.error("I could not find file %s." % filename)
    if options.num_header_cols is not None:
        assert options.num_header_cols > 0 and options.num_header_cols < 100
    if options.width is not None:
        assert options.width > 10, "too small"
        assert options.width < 4096 * 16, "width too big"
    assert options.scale_label > 0.01 and options.scale_label < 100
    # The --log_transform option is commented out above, so the feature
    # is forced off here.
    options.log_transform = False

    num_genes = options.genes
    #K = 10  # number of dimensions

    MATRIX = read_matrix(filename, options.num_header_cols)
    if options.log_transform:
        MATRIX._X = jmath.log(MATRIX._X, base=2, safe=1)
    assert MATRIX.nrow() and MATRIX.ncol(), "Empty matrix."

    # Clusters can come from the command line or from a file, not both.
    cluster = None
    if options.cluster and options.cluster_file:
        parser.error("Cannot specify clusters and a cluster file.")
    if options.cluster:
        cluster = _parse_cluster(options.cluster,
                                 options.indexes_include_headers, MATRIX)
    if options.cluster_file:
        if not os.path.exists(options.cluster_file):
            parser.error("I could not find cluster file: %s" %
                         options.cluster_file)
        cluster = _parse_cluster_file(options.cluster_file, MATRIX)

    # Select a subset of the genes.
    if num_genes:
        assert MATRIX.ncol() > 1, "Not enough samples to select genes."
        I = pcalib.select_genes_var(MATRIX._X, num_genes)
        MATRIX = MATRIX.matrix(I, None)

    # Calculate the principal components and plot them.
    K = min(MATRIX.nrow(), MATRIX.ncol())
    principal_components, perc_var = pcalib.svd_project_cols(MATRIX._X, K)
    X = [pc[0] for pc in principal_components]
    Y = [pc[1] for pc in principal_components]
    color = None
    if cluster is not None:
        color = pcalib.choose_colors(cluster)
    LABEL = None
    if options.label:
        LABEL = MATRIX.col_names(arrayio.COL_ID)
    assert not LABEL or len(LABEL) == len(X), "%d %d" % (len(X), len(LABEL))
    height = width = None
    if options.width is not None:
        # Height is derived from the width (3/4 of it).
        height, width = int(options.width * 0.75), options.width
    pcalib.plot_scatter(X,
                        Y,
                        outfile,
                        group=cluster,
                        color=color,
                        title=options.title,
                        label=LABEL,
                        xlabel=options.label_axes,
                        ylabel=options.label_axes,
                        scale_label=options.scale_label,
                        height=height,
                        width=width)

    # Write out the scatter plot in Prism format.
    if options.prism_file:
        # One series per cluster; a single series if there are no clusters.
        num_series = 1
        if cluster:
            num_series = max(cluster) + 1
        names = ["CLUSTER-%d" % (i + 1) for i in range(num_series)]
        # Look the sample names up once rather than once per point.
        sample_names = MATRIX.col_names(arrayio.COL_ID)
        DATA = {}
        rownames = {}
        for i in range(num_series):
            xy = []
            n = []
            for j, pc in enumerate(principal_components):
                if cluster and cluster[j] != i:
                    continue
                xy.append([pc[0], pc[1]])
                n.append(sample_names[j])
            # Series without any points are left out of the file.
            if xy:
                DATA[names[i]] = xy
                rownames[names[i]] = n

        prismlib.write_scatterplot(options.prism_file, DATA, rownames)

    if options.col_pc_file:
        # Write out the principal components.
        assert cluster is None or len(cluster) == len(principal_components)
        pc_header = [
            "PC%02d (%.2f%%)" % (i, 100 * perc_var[i]) for i in range(K)]
        header = ["Index", "Sample", "Cluster", "Color"] + pc_header
        sample_names = MATRIX.col_names(arrayio.COL_ID)
        # "with" closes the file even if an assertion below fails.
        with open(options.col_pc_file, 'w') as handle:
            handle.write("\t".join(header) + "\n")
            for i, pc in enumerate(principal_components):
                c = ""
                if color and color[i] is not None:
                    c = colorlib.rgb2hex(color[i])
                clust = ""
                if cluster is not None and cluster[i] is not None:
                    clust = cluster[i]
                row = [i + 1, sample_names[i], clust, c] + pc
                assert len(row) == len(header)
                handle.write("\t".join(map(str, row)) + "\n")

    # Look at the principal components on the rows.
    if options.row_pc_file:
        row_names = MATRIX.row_names()
        pc_header = [
            "PC%02d (%.2f%%)" % (i, 100 * perc_var[i]) for i in range(K)]
        header = ["Index"] + row_names + pc_header
        with open(options.row_pc_file, 'w') as handle:
            handle.write("\t".join(header) + "\n")

            # U  nrow x k  columns are principal components
            # V  k x ncol  rows are principal components
            U, s, V = numpy.linalg.svd(MATRIX._X, full_matrices=False)
            # Fetch each row annotation column once, not once per row.
            row_annots = [MATRIX.row_names(name) for name in row_names]
            for i in range(len(U)):
                assert len(U[i]) == K, "%d %d %d" % (len(U), len(U[i]), K)
                n = [annot[i] for annot in row_annots]
                row = [i + 1] + n + list(U[i])
                assert len(row) == len(header)
                handle.write("\t".join(map(str, row)) + "\n")