Esempio n. 1
0
def t_test(X, Y, exact=True):
    """Run a row-by-row two-sample t-test in R on two matrix slices.

    X and Y are equal-length sequences of rows (lists of numbers).  Row i
    of X is tested against row i of Y with R's t.test.  NaN entries are
    handed to R as NA.

    Returns a tuple (list of t-values, list of p-values) with one entry
    per row.  Each entry is a float, or None for NA, inf, nan, or when
    t.test raised an error for that row.

    Raises AssertionError if X and Y differ in length.

    NOTE(review): `exact` is never assigned in the R session and t.test
    does not document an `exact` argument, so it appears to have no
    effect -- confirm with callers before removing it.
    """
    assert len(X) == len(Y), 'X and Y should be equal length'

    R = jmath.start_R()
    t_value = []
    p_value = []
    for x_row, y_row in zip(X, Y):
        # Build new lists instead of assigning back into X[i] / Y[i], so
        # the caller's matrix is left untouched.
        x_clean = [jmath.R_var('NA') if numpy.isnan(v) else v for v in x_row]
        y_clean = [jmath.R_var('NA') if numpy.isnan(v) else v for v in y_row]
        jmath.R_equals(x_clean, 'x')
        jmath.R_equals(y_clean, 'y')
        R('a<-try(t.test(x,y,exact=exact), silent=TRUE)')
        R('if (is(a, "try-error")) p=NA else p=a$p.value')
        # The t statistic lives in the "statistic" component of the htest
        # object; "a$t" matches no component and always yields NULL.
        # as.numeric() drops the name so t has the same shape as p.
        R('if (is(a, "try-error")) t=NA else t=as.numeric(a$statistic)')
        R('if (is.null(t)) t=NA')
        p = R["p"]
        t = R["t"]
        if str(p) in ["NA", "inf", "nan"]:
            p = None
        else:
            p = float(p[0])
        if str(t) in ["NA", "inf", "nan"]:
            t = None
        else:
            t = float(t[0])
        t_value.append(t)
        p_value.append(p)
    return t_value, p_value
Esempio n. 2
0
def convert_gene_list_platform(genes, platform):
    """Translate a list of probe-set IDs to another array platform.

    The source platform is guessed from `genes`; the target `platform`
    must be the name of one of arrayplatformlib.platforms.  The mapping
    is done in R with biomaRt's getLDS.

    Returns a list of strings taken from the second column of the getLDS
    result.

    Raises AssertionError if `platform` is unknown or the source
    platform cannot be guessed.
    """
    from genomicode import jmath
    from genomicode import arrayplatformlib

    known_platforms = [p.name for p in arrayplatformlib.platforms]
    assert platform in known_platforms, (
        'we cannot convert to the platform %s' % platform)
    source_chip = arrayplatformlib.guess_chip_from_probesets(genes)
    assert source_chip, 'we cannot guess the platform for the input file'

    # BioMart attribute and organism (dataset) for both ends of the mapping.
    src_attribute = arrayplatformlib.get_bm_attribute(source_chip)
    src_mart = arrayplatformlib.get_bm_organism(source_chip)
    dst_attribute = arrayplatformlib.get_bm_attribute(platform)
    dst_mart = arrayplatformlib.get_bm_organism(platform)

    session = jmath.start_R()
    jmath.R_equals_vector(genes, 'gene_id')
    session('library(biomaRt)')

    # Source side: the input attribute doubles as the query filter.
    jmath.R_equals(src_attribute, 'in_attribute')
    jmath.R_equals(src_attribute, 'filters')
    jmath.R_equals(src_mart, 'in_mart')
    session('old=useMart("ensembl",in_mart)')

    # Target side.
    jmath.R_equals(dst_attribute, 'out_attribute')
    jmath.R_equals(dst_mart, 'out_mart')
    session('new=useMart("ensembl",out_mart)')

    session(
        'homolog = getLDS(attributes=in_attribute,'
        'filters=filters,values=gene_id,mart=old,'
        'attributesL=out_attribute,martL=new)')
    linked = session['homolog']
    # Column 0 holds the input IDs, column 1 the converted IDs.
    converted = []
    for entry in linked[1]:
        converted.append(str(entry))
    return converted
Esempio n. 3
0
def main():
    """Run a GenePattern module from the command line and fetch its output.

    Command-line arguments:
      module_name        name of the module to run
      --parameters K:V   module parameter, may be repeated
      --id_and_version   lsid and version; used instead of module_name
      -o OUTPATH         directory that receives the result files

    Raises AssertionError if a parameter is malformed, if the output
    directory is missing after the download, or if the downloaded
    results contain a "stderr.txt" file.
    """
    parser = argparse.ArgumentParser(description='run the gene pattern module')

    parser.add_argument('--parameters',
                        default=[],
                        action='append',
                        help='key:value')
    parser.add_argument('-o',
                        dest='outpath',
                        default=".",
                        help='Directory to the save the results.')
    parser.add_argument("module_name", nargs=1)
    parser.add_argument(
        "--id_and_version",
        help='specify the lsid and version in id:verison format')
    args = parser.parse_args()
    module_name = args.module_name[0]

    # Each parameter must contain exactly one ':' separating key and value.
    parameters = dict()
    for i in args.parameters:
        assert ':' in i, 'parameters should be in key:value format'
        key, value = i.split(':', 1)
        assert ':' not in value, 'parameters should be in key:value format'
        parameters[key] = value

    # given the module_name and the module parameters in dict, call
    # module in Genepattern
    R = jmath.start_R()
    jmath.R_equals(config.gp_user, 'username')
    jmath.R_equals(config.gp_passwd, 'password')
    jmath.R_equals(config.gp_server, 'servername')
    R('library(GenePattern)')
    R('gp.client <- gp.login(servername, username, password)')

    # Assemble the argument list of the R call run.analysis(...).
    params = []
    params.append("gp.client")
    if args.id_and_version:
        params.append(
            "'urn:lsid:broad.mit.edu:cancer.software.genepattern.module.analysis:%s'"
            % args.id_and_version)
    else:
        params.append("'%s'" % module_name)
    # items() behaves the same as iteritems() here and also exists on
    # Python 3.
    # NOTE(review): values are spliced into R source unescaped; a quote
    # in a value will break the generated call.
    for (key, value) in parameters.items():
        params.append("%s='%s'" % (key, value))
    params_str = ", ".join(params)
    x = "result <- run.analysis(%s)" % params_str
    R(x)

    # Download the files to outpath.
    jmath.R_equals(args.outpath, 'outpath')
    R('job.result.download.files(result, outpath)')
    assert os.path.exists(args.outpath), \
           "Missing output directory for: %s" % module_name

    # Look for "stderr.txt".
    result_files = os.listdir(args.outpath)
    if 'stderr.txt' in result_files:
        # Read the error text through a context manager so the handle is
        # closed before the failure is reported.
        with open(os.path.join(args.outpath, 'stderr.txt')) as handle:
            error_text = handle.read()
        raise AssertionError(
            "Run failed.  GenePattern generated an error:\n%s" % error_text)
Esempio n. 4
0
def _start_R():
    """Return the shared R session, creating it on first use.

    The first call starts R through jmath, loads the biomaRt library and
    stores the session in the module-level GLOBAL_R.  Later calls return
    that stored session.
    """
    global GLOBAL_R
    from genomicode import jmath

    if GLOBAL_R is not None:
        return GLOBAL_R

    session = jmath.start_R()
    session('library(biomaRt)')
    GLOBAL_R = session
    return GLOBAL_R
Esempio n. 5
0
 def run(
     self, network, antecedents, out_attributes, user_options, num_cores,
     outfile):
     """Normalize the Agilent files in a directory and write a signal table.

     `antecedents.identifier` is the directory holding the Agilent files.
     They are read with read.Agilent, normalized with maNorm (loess) and
     maNormScale in R, and dumped to a temporary text file.  From that
     dump, the "ProbeName" column and every column after "Sequence" (or
     after "ControlType" when there is no "Sequence" column) are written
     to `outfile`, tab-separated.

     Raises AssertionError if `outfile` is missing or empty afterwards.
     """
     import os
     from genomicode import jmath
     from genomicode import filelib
     in_data = antecedents
     cwd = os.getcwd()
     R = jmath.start_R()
     R('require(limma,quietly=TRUE)')
     R('library(marray)')
     # The R calls below list the files in R's working directory, so
     # switch into the data directory and always switch back.
     os.chdir(in_data.identifier)
     try:
         R('dir<-getwd()')
         R('files<-list.files(dir)')
         R('x.read<-read.Agilent(files)')
     finally:
         os.chdir(cwd)

     R('xnorm.loc <- maNorm(x.read, norm = "loess")')
     R('x.norm <- maNormScale(xnorm.loc, norm = "p")')
     tmpfile = 'tmp.txt'
     jmath.R_equals(tmpfile, 'tmpfile')
     try:
         R('write.marray(x.norm,tmpfile)')
         with open(tmpfile, 'r') as handle:
             text = handle.readlines()
     finally:
         # Do not leave the scratch file behind, even when R or the read
         # fails.
         if os.path.exists(tmpfile):
             os.remove(tmpfile)

     firstline = text[0].split()
     firstindex = firstline.index('"ProbeName"')
     if '"Sequence"' in firstline:
         secondindex = firstline.index('"Sequence"')
     else:
         secondindex = firstline.index('"ControlType"')

     # Sample columns are everything to the right of the marker column.
     sample = range(secondindex + 1, len(firstline))
     with open(outfile, 'w') as out:
         for raw in text:
             line = raw.split()
             fields = [line[firstindex]] + [line[j] for j in sample]
             # Every field, including the last one, is followed by a tab;
             # this keeps the output identical to the original format.
             out.write('\t'.join(fields) + '\t\n')

     assert filelib.exists_nz(outfile), (
         'the output file %s for preprocess_agilent fails' % outfile
     )
Esempio n. 6
0
 def run(
     self, network, antecedents, out_attributes, user_options, num_cores,
     outfile):
     """Train a random forest in R and write predictions for the test samples.

     `antecedents` is a pair (class label node, data node).  The first
     len(label_line) columns of the data matrix are used for training and
     the remaining columns for prediction.  The result is written to
     `outfile` as a tab-separated table with the columns Sample_name,
     Predicted_class and Confidence (Confidence is always left empty).
     """
     import arrayio
     from Betsy import read_label_file
     from genomicode import jmath

     cls_node_train, data_node = antecedents
     # The first value returned by read() is not used here.
     _, label_line, second_line = read_label_file.read(
         cls_node_train.identifier)
     y = [second_line[int(i)] for i in label_line]
     R = jmath.start_R()
     M = arrayio.read(data_node.identifier)
     M_train = M.matrix(None, range(0, len(label_line)))
     M_test = M.matrix(None, range(len(label_line), M.dim()[1]))
     # Both matrices are transposed before being handed to R.
     M1 = M_train.slice()
     M_train = jmath.transpose(M1)
     jmath.R_equals_matrix(M_train, 'data')
     M2 = M_test.slice()
     M2 = jmath.transpose(M2)
     jmath.R_equals_matrix(M2, 'test')
     jmath.R_equals(y, 'y')
     R('y<-as.factor(y)')
     R('require(randomForest, quietly=TRUE)')
     R('library(randomForest)')
     R('model <- randomForest(data,y=y,importance=TRUE)')
     R('predict_result <- predict(model, test)')
     predict_result = R['predict_result']
     # The prediction is an R factor: 1-based integer codes into `levels`.
     levels = predict_result.levels
     predict_labels = predict_result[:]
     predict_labels = [levels[i - 1] for i in predict_labels]
     # list() keeps the lookup working when keys() returns a view
     # instead of a list.
     name = list(M_test._col_names.keys())[0]
     sample_name = M_test._col_names[name]
     result = [['Sample_name', 'Predicted_class', 'Confidence']]
     for i in range(len(sample_name)):
         result.append([str(sample_name[i]), predict_labels[i], ''])

     # The context manager closes the handle even if a write fails.
     with open(outfile, 'w') as f:
         for row in result:
             f.write('\t'.join(row))
             f.write('\n')
Esempio n. 7
0
def start_and_init_R():
    """Return the shared R session, initializing it on the first call.

    On first use this checks that the R library directory and the three
    helper scripts exist, starts R, loads `splines` and sources the
    scripts.  The session is stored in the module-level GLOBAL_R and
    returned unchanged on later calls.

    Raises AssertionError if the directory or one of the scripts is
    missing.
    """
    global GLOBAL_R
    import os
    from genomicode import jmath
    from genomicode import config

    if GLOBAL_R is not None:
        return GLOBAL_R

    assert os.path.exists(config.changlab_Rlib)
    # Check every script before starting R, in the order they are sourced.
    lib_files = [
        os.path.join(config.changlab_Rlib, filename)
        for filename in ("kaplanmeierlib.R", "statlib.R", "prismlib.R")
    ]
    for lib_file in lib_files:
        assert os.path.exists(lib_file), "File not found: %s" % lib_file

    session = jmath.start_R()
    session('require(splines, quietly=TRUE)')
    for lib_file in lib_files:
        session('source("%s")' % lib_file)
    GLOBAL_R = session
    return GLOBAL_R
Esempio n. 8
0
def main():
    """Plot the expression of selected genes across samples as a line plot.

    Reads a gene expression file, selects rows either by --gene_names or
    with --all_genes, and draws one line (plus points) per gene through
    R, writing a PNG or, when the file name ends with ".pdf", a PDF.
    Optionally also writes a Prism-formatted file.

    NOTE(review): `os` is used below but is not imported in this
    function; it is presumably imported at module level -- confirm.
    """
    import argparse
    import math

    import arrayio
    from genomicode import config
    from genomicode import colorlib
    from genomicode import jmath
    from genomicode.jmath import R_fn, R_var, R_equals

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("expression_file", help="Gene expression file.")
    parser.add_argument(
        "plot_file",
        help="Name of image file, e.g. outfile.png.  "
        "Will generate PNG format by default.  If this file name ends with "
        ".pdf, will generate a PDF file instead.")
    parser.add_argument("-v", "--verbose", action="store_true", help="")
    parser.add_argument("--prism_file",
                        help="Save result in Prism-formatted file.")

    group = parser.add_argument_group(title="Genes")
    group.add_argument(
        "--gene_names",
        default=[],
        action="append",
        help="Comma-separated list of IDs (e.g. probes, gene names) "
        "to include.")
    group.add_argument("--all_genes",
                       default=False,
                       action="store_true",
                       help="Plot all genes in the file.")

    group = parser.add_argument_group(title="Plot")
    group.add_argument("--title",
                       default=None,
                       help="Put a title on the plot.")
    group.add_argument("--height",
                       default=None,
                       type=int,
                       help="Height (in pixels) of the plot.")
    group.add_argument("--width",
                       default=None,
                       type=int,
                       help="Width (in pixels) of the plot.")
    group.add_argument(
        "--mar_left",
        default=1.0,
        type=float,
        help="Scale margin at left of plot.  Default 1.0 (no scaling).")
    group.add_argument(
        "--mar_bottom",
        default=1.0,
        type=float,
        help="Scale margin at bottom of plot.  Default 1.0 (no scaling).")
    group.add_argument(
        "--xlabel_size",
        default=1.0,
        type=float,
        help="Scale the size of the labels on X-axis.  Default 1.0.")
    group.add_argument("--xlabel_off",
                       default=False,
                       action="store_true",
                       help="Turn off the X labels.")
    group.add_argument("--ylabel", help="Label the Y axis.")
    group.add_argument("--gene_name_header",
                       help="Header for gene names to be used in the legend.")
    group.add_argument("--yaxis_starts_at_0",
                       action="store_true",
                       help="Y-axis should start at 0.")
    group.add_argument("--legend_off",
                       action="store_true",
                       help="Do not draw legend.")
    group.add_argument("--horizontal_lines",
                       action="store_true",
                       help="Draw horizontal lines.")

    # Parse the input arguments.
    args = parser.parse_args()
    if not os.path.exists(args.expression_file):
        parser.error("I could not find file %s." % args.expression_file)
    if args.width is not None:
        assert args.width > 10, "too small"
        assert args.width < 4096 * 16, "width too big"
    if args.height is not None:
        assert args.height > 10, "too small"
        assert args.height < 4096 * 16, "height too big"
    assert args.gene_names or args.all_genes, \
           "Please specify some genes to plot."
    assert args.mar_bottom > 0 and args.mar_bottom < 10
    assert args.mar_left > 0 and args.mar_left < 10
    assert args.xlabel_size > 0 and args.xlabel_size < 10

    # Default to a 1600 x 1600 pixel image when no size is given.
    height = args.height or 1600
    width = args.width or 1600

    MATRIX = arrayio.read(args.expression_file)
    assert MATRIX.nrow() and MATRIX.ncol(), "Empty matrix."

    # I holds the row indexes of the genes to plot; --gene_names takes
    # precedence over --all_genes.
    I = None
    if args.gene_names:
        I = find_gene_names(MATRIX, args.gene_names)
    elif args.all_genes:
        I = range(MATRIX.nrow())
    assert I, "No genes found."
    assert len(I) < 50, "Too many genes."
    MATRIX = MATRIX.matrix(I, None)

    # Find the gene names for the legend.
    if args.gene_name_header:
        h = args.gene_name_header
        assert h in MATRIX.row_names(), "Missing header: %s" % h
        gene_names = MATRIX.row_names(h)
    else:
        gene_names = [
            get_pretty_gene_name(MATRIX, i) for i in range(MATRIX.nrow())
        ]
    assert len(gene_names) == MATRIX.nrow()

    if args.prism_file:
        write_prism_file(args.prism_file, MATRIX, gene_names)

    # Start R and set up the environment.
    # NOTE(review): the returned R object is not used below; all R calls
    # go through the jmath helpers.
    R = jmath.start_R()
    path = config.changlab_Rlib
    plotlib = os.path.join(path, "plotlib.R")
    assert os.path.exists(plotlib), "I cannot find: %s" % plotlib
    R_fn("source", plotlib)

    main = R_var("NA")
    if args.title:
        main = args.title
    sub = ""
    xlab = ""
    #ylab = "Gene Expression"
    ylab = ""
    if args.ylabel:
        ylab = args.ylabel
    labels = jmath.R_var("FALSE")
    #labels = MATRIX.col_names(arrayio.COL_ID)
    col = R_var("NULL")
    xlim = [1, MATRIX.ncol() + 1]
    # Pad the Y range by one unit on each side of the data range.
    y_max = jmath.max(jmath.max(MATRIX._X))
    y_min = jmath.min(jmath.min(MATRIX._X))
    ylim = [y_min - 1, y_max + 1]
    if args.yaxis_starts_at_0:
        assert y_max > 0
        ylim[0] = 0

    if not args.xlabel_off:
        labels = MATRIX.col_names(arrayio.COL_ID)

    lwd = 2
    las = 3  # vertical labels
    # Tick positions are only set when there are labels to draw.
    at = R_var("NULL")
    if labels != jmath.R_var("FALSE"):
        at = range(1, len(labels) + 1)
    cex_labels = 1 * args.xlabel_size
    # NOTE(review): cex_legend is assigned but not used below.
    cex_legend = 1
    cex_lab = 1.5
    cex_sub = 1.5
    # One color per gene, converted to "#"-prefixed hex strings.
    x = colorlib.bild_colors(len(gene_names))
    x = [colorlib.rgb2hex(x) for x in x]
    x = [x.replace("0x", "#") for x in x]
    col = x

    # Copy the data and axis settings into R variables.
    R_equals(MATRIX._X, "X")
    R_equals(labels, "labels")
    R_equals(at, "at")

    # Pick the bitmap device type from the output file extension.
    bm_type = "png16m"
    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    R_fn("bitmap",
         args.plot_file,
         type=bm_type,
         height=height,
         width=width,
         units="px",
         res=300)

    # Set the margins.
    x = 5 * 1.2 * args.mar_bottom, 4 * 1.2 * args.mar_left, 4, 2
    mar = [x + 0.1 for x in x]
    # The previous par() settings are saved in the R variable "op" and
    # restored at the end.
    R_fn("par", mar=mar, RETVAL="op")

    # Open an empty plot, then draw the background, box, axes and titles
    # by hand.
    R_fn("plot",
         R_var("NA"),
         type="n",
         axes=R_var("FALSE"),
         xlab="",
         ylab="",
         xlim=xlim,
         ylim=ylim)
    jmath.R('usr <- par("usr")')
    jmath.R('rect(usr[1], usr[3], usr[2], usr[4], col="#FFFFFF")')
    jmath.R_fn("box", lwd=lwd)
    jmath.R_fn("axis",
               1,
               lwd=lwd,
               labels=R_var("labels"),
               at=R_var("at"),
               las=las,
               **{"cex.axis": cex_labels})
    jmath.R_fn("axis", 2, lwd=lwd, **{"cex.axis": 1.5})
    jmath.R_fn("title",
               main=main,
               sub=sub,
               xlab=xlab,
               ylab=ylab,
               **{
                   "cex.lab": cex_lab,
                   "cex.main": 2.0,
                   "cex.sub": cex_sub
               })

    # Draw one line with points per gene, at X positions 1..number of
    # samples.
    for i in range(MATRIX.nrow()):
        y = MATRIX._X[i]
        x = range(1, len(y) + 1)
        R_fn("lines", x, y, lwd=lwd, col=col[i])
        R_fn("points", x, y, pch=19, cex=1, col=col[i])

    # Optional dotted guide lines at every integer Y value inside ylim.
    if args.horizontal_lines:
        y1 = int(math.ceil(ylim[0]))
        y2 = int(math.floor(ylim[1]))
        for y in range(y1, y2 + 1):
            R_fn("lines", (1, MATRIX.ncol() + 1), (y, y), lty=3, col="#A0A0A0")

    if not args.legend_off:
        R_fn("legend",
             "bottomleft",
             legend=gene_names,
             fill=col,
             cex=1,
             inset=0.05,
             **{"box.lwd": 1.5})

    # Restore the saved graphics parameters and close the device.
    R_fn("par", R_var("op"))
    R_fn("dev.off")
Esempio n. 9
0
def main():
    """Draw a histogram of one column of a tab-delimited file using R.

    Reads the column named by `header` from `datafile`, converts the
    values to floats, and plots them with R's hist() into a PNG or, when
    the file name ends with ".pdf", a PDF.  The breakpoints, colors,
    labels and margins are controlled by command-line options.
    Optionally writes the histogram result to a Prism-formatted file.
    """
    import os
    import argparse
    
    from genomicode import jmath
    from genomicode import AnnotationMatrix

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("datafile", help="Tab-delimited data file.")
    parser.add_argument("header", help="Which column contains data to plot.")
    parser.add_argument(
        "plot_file", help="Name of image file, e.g. outfile.png.  "
        "Will generate PNG format by default.  If this file name ends with "
        ".pdf, will generate a PDF file instead.")
    parser.add_argument(
        "--prism_file", help="Write Prism-formatted results to this file.")
    parser.add_argument(
        "--ignore_missing_values", action="store_true",
        help="Ignore missing values in the file.")

    group = parser.add_argument_group(title="Calculations")
    group.add_argument(
        "--breaks_seq",
        help="Set the breakpoints.  Format: <start>,<stop>,<skip>.")
    group.add_argument(
        "--num_breaks", type=int, help="Number of breakpoints.")
    group.add_argument(
        "--ymax", type=int,
        help="Set the maximum value for the Y axis.")
    
    group = parser.add_argument_group(title="Plot Labels")
    group.add_argument("--title", help="Put a title on the plot.")
    group.add_argument("--xlab", help="Label the X-axis.")
    group.add_argument(
        "--xlabel_size", default=1.0, type=float,
        help="Scale the size of the labels on X-axis.  Default 1.0.")
    group.add_argument(
        "--xlabel_off", action="store_true", help="Do not label the X axis.")
    group.add_argument(
        "--ylabel_off", action="store_true", help="Do not label the Y axis.")
    group.add_argument(
        "--xtick_label_off", action="store_true",
        help="Do not draw the tick labels on the X axis.")

    group = parser.add_argument_group(title="Colors")
    group.add_argument(
        "--bar_color",  help="Set the color of the bars.  Default #FFFFFF")
    x = _fmt_palettes()
    group.add_argument(
        "--bar_palette", help="Color the bars according to a palette: %s." % x)
    group.add_argument(
        "--symmetric_palette", action="store_true",
        help="Make the color symmetric.")

    group = parser.add_argument_group(title="Appearance")
    group.add_argument(
        "--height", type=int, help="Height (in pixels) of the plot.")
    group.add_argument(
        "--width", type=int, help="Width (in pixels) of the plot.")
    group.add_argument(
        "--mar_left", default=1.0, type=float,
        help="Scale margin at left of plot.  Default 1.0 (no scaling).")
    group.add_argument(
        "--mar_bottom", default=1.0, type=float,
        help="Scale margin at bottom of plot.  Default 1.0.")
    group.add_argument(
        "--xaxis_off", action="store_true", help="Do not show the X axis.")
    group.add_argument(
        "--yaxis_off", action="store_true", help="Do not show the Y axis.")


    # Parse the input arguments.
    args = parser.parse_args()
    if not os.path.exists(args.datafile):
        parser.error("File not found: %s" % args.datafile)
    # --breaks_seq and --num_breaks are mutually exclusive.
    assert not (args.breaks_seq and args.num_breaks)
    if args.num_breaks:
        assert args.num_breaks >= 2 and args.num_breaks <= 1000
    if args.width is not None:
        assert args.width > 10, "too small"
        assert args.width < 4096*16, "width too big"
    if args.height is not None:
        assert args.height > 10, "too small"
        assert args.height < 4096*16, "height too big"
    assert args.mar_bottom > 0 and args.mar_bottom < 10
    assert args.mar_left > 0 and args.mar_left < 10
    assert args.xlabel_size > 0 and args.xlabel_size < 10
    # A single bar color and a palette are mutually exclusive, and
    # --symmetric_palette requires --bar_palette.
    assert not (args.bar_color and args.bar_palette)
    assert not args.symmetric_palette or args.bar_palette
    assert args.ymax is None or args.ymax > 0


    # Default to a 3200 x 2400 pixel image when no size is given.
    height = args.height or 2400
    width = args.width or 3200

    MATRIX = AnnotationMatrix.read(args.datafile, False)
    assert MATRIX.num_headers() and MATRIX.num_annots(), "Empty matrix."
    assert args.header in MATRIX.headers, "header not found: %s" % args.header

    # Pull out the values for the histogram.
    x = MATRIX[args.header]
    if args.ignore_missing_values:
        x = [x for x in x if x.strip()]
    values = map(float, x)

    value_min = value_max = None

    # Start R and set up the environment.
    R = jmath.start_R()

    main = jmath.R_var("NA")
    if args.title:
        main = args.title
    sub = ""
    xlab = ""
    if args.xlab:
        xlab = args.xlab
    ylab = "Frequency"
    xtick_labels = jmath.R_var("TRUE")
    ytick_labels = jmath.R_var("TRUE")

    if args.xlabel_off:
        xlab = ""
    if args.ylabel_off:
        ylab = ""
    if args.xtick_label_off:
        xtick_labels = jmath.R_var("FALSE")

    # `breaks` is passed to hist(): the string "Sturges" by default, a
    # reference to the R variable "breaks" for --breaks_seq, or an
    # integer for --num_breaks.
    breaks = "Sturges"
    if args.breaks_seq:
        breaks = _parse_breaks_seq(args.breaks_seq)
        value_min, value_max = min(breaks), max(breaks)
        jmath.R_equals(breaks, "breaks")
        breaks = jmath.R_var("breaks")
    if args.num_breaks:
        breaks = args.num_breaks

    # With explicit breakpoints, keep only the values inside
    # [value_min, value_max).
    if value_min is not None:
        values = [x for x in values if x >= value_min]
    if value_max is not None:
        values = [x for x in values if x < value_max]

    lwd = 2
    cex_lab = 1.5
    cex_main = 2.0
    cex_sub = 1.5
    ylim = jmath.R_var("NULL")
    if args.ymax is not None:
        ylim = [0, args.ymax]

    assert values
    jmath.R_equals(values, "X")

    # Figure out the colors.  Do it after X is assigned.
    col = jmath.R_var("NULL")
    if args.bar_color:
        assert args.bar_color.startswith("#")
        col = args.bar_color
    elif args.bar_palette:
        # Figure out how many breaks there are.  Number of bars is num
        # breaks + 1.
        # NOTE(review): a histogram with k breakpoints usually has k - 1
        # bars, so this looks like it asks for two colors too many --
        # confirm against _make_col_palette before changing.
        jmath.R_fn(
            "hist", jmath.R_var("X"), breaks=breaks, plot=jmath.R_var("FALSE"),
            RETVAL="x")
        breaks = [x for x in R["x"].rx2("breaks")]
        num_bars = len(breaks) + 1
        col = _make_col_palette(
            args.bar_palette, num_bars, args.symmetric_palette)

    # Pick the bitmap device type from the output file extension.
    bm_type = "png16m"
    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    jmath.R_fn(
        "bitmap", args.plot_file, type=bm_type, 
        height=height, width=width, units="px", res=300)
    
    # Set the margins.
    x = 5*1.2*args.mar_bottom, 4*1.2*args.mar_left, 4, 2
    mar = [x+0.1 for x in x]
    # The previous par() settings are saved in the R variable "op" and
    # restored at the end.
    jmath.R_fn("par", mar=mar, RETVAL="op")

    # Draw the bars without axes or labels; those are added below.  The
    # result is stored in the R variable "x".
    jmath.R_fn(
        "hist", jmath.R_var("X"), breaks=breaks, main="", xlab="", ylab="",
        ylim=ylim, axes=jmath.R_var("FALSE"), col=col, RETVAL="x")
    # Make plot area solid white.
    #jmath.R('usr <- par("usr")')
    #jmath.R('rect(usr[1], usr[3], usr[2], usr[4], col="#FFFFFF")')
    #jmath.R_fn(
    #    "hist", jmath.R_var("X"), plot=jmath.R_var("FALSE"),
    #    main=main, xlab="", ylab="", axes=jmath.R_var("FALSE"),
    #    add=jmath.R_var("TRUE"))
    
    #jmath.R_fn("box", lwd=lwd)
    # x-axis
    if not args.xaxis_off:
        jmath.R_fn(
            "axis", 1, lwd=lwd, labels=xtick_labels, **{ "cex.axis" : 1.5 })
    # y-axis
    if not args.yaxis_off:
        jmath.R_fn(
            "axis", 2, lwd=lwd, labels=ytick_labels, **{ "cex.axis" : 1.5 })
    jmath.R_fn(
        "title", main=main, sub=sub, xlab=xlab, ylab=ylab,
        **{ "cex.lab" : cex_lab, "cex.main" : cex_main, "cex.sub" : cex_sub })
    # Restore the saved graphics parameters and close the device.
    R("par(op)")
    jmath.R_fn("dev.off")

    if args.prism_file:
        write_prism_file(args.prism_file, R["x"])
Esempio n. 10
0
def find_diffexp_genes(outfile, gmt_file, algorithm, paired, MATRIX,
                       geneid_header, genename_header, genename_delim, name1,
                       name2, classes, filter_fold_change, fold_change,
                       p_cutoff, fdr_cutoff, bonf_cutoff, sam_DELTA,
                       sam_qq_file, edger_tagwise_dispersion, num_procs):
    """Find differentially expressed genes between two classes using R.

    Runs the R function that corresponds to `algorithm` (from diffexp.R)
    on the columns of MATRIX whose entry in `classes` is 0 or 1, sorts
    and filters the resulting table, and writes it with _write_matrix.

    outfile             passed to _write_matrix as the output target.
    gmt_file            if true, gene sets are written here in GMT format.
    algorithm           one of fold_change, ttest, sam, ebayes, deseq2,
                        edger.  Only ebayes has a paired version.
    paired              use the paired version of the algorithm.
    classes             one entry per column; must be 0, 1 or None.
    name1, name2        names of class 0 and class 1.
    filter_fold_change  if not None, passed to R as FOLD.CHANGE.
    fold_change, p_cutoff, fdr_cutoff, bonf_cutoff
                        if not None, rows are filtered on the matching
                        column after the R call.
    sam_DELTA, sam_qq_file
                        SAM only: delta value and optional QQ plot file.

    Raises AssertionError for an unknown algorithm, when one of the
    classes is missing or too small, or when an expected column is
    missing from the result.
    """
    # classes must be 0, 1, None.
    import os
    import sys
    import math
    import StringIO
    import warnings

    from rpy2 import rinterface

    from genomicode import config
    from genomicode import jmath
    from genomicode import genesetlib

    algorithm2function_unpaired = {
        "fold_change": "find.de.genes.fc",
        "ttest": "find.de.genes.ttest",
        "sam": "find.de.genes.sam",
        "ebayes": "find.de.genes.ebayes",
        "deseq2": "find.de.genes.deseq2",
        "edger": "find.de.genes.edgeR",
    }
    algorithm2function_paired = {
        "ebayes": "find.de.genes.paired.ebayes",
    }
    algorithm2function = algorithm2function_unpaired
    if paired:
        algorithm2function = algorithm2function_paired
        assert algorithm in algorithm2function_paired, \
               "No paired version of %s" % algorithm
    assert algorithm in algorithm2function, "Unknown algorithm: %s" % algorithm

    # Select the relevant columns from MATRIX.
    I = [i for (i, x) in enumerate(classes) if x in [0, 1]]
    assert len(I)
    MATRIX = MATRIX.matrix(None, I)
    classes = [classes[i] for i in I]

    # All algorithms except "fold_change" and "deseq2" need at least 2
    # samples of each class.
    counts = {}
    for x in classes:
        counts[x] = counts.get(x, 0) + 1
    assert sorted(counts) == [0, 1], "Only one class represented."

    if algorithm not in ["fold_change", "deseq2"]:
        assert counts[0] >= 2, "There must be at least 2 of each class."
        assert counts[1] >= 2, "There must be at least 2 of each class."

    names = [name1, name2]
    X = MATRIX._X
    Y = [names[x] for x in classes]
    sample_name = None
    if MATRIX.col_names():
        sample_name = MATRIX.col_names(MATRIX.col_names()[0])

    # Fall back to automatically chosen headers when none were given.
    x = choose_gene_names(MATRIX)
    if not geneid_header:
        geneid_header = x[0]
    if not genename_header:
        genename_header = x[1]
    assert not geneid_header or geneid_header in MATRIX.row_names()
    assert not genename_header or genename_header in MATRIX.row_names()

    R = jmath.start_R()
    de_lib = os.path.join(config.changlab_Rlib, "diffexp.R")
    stat_lib = os.path.join(config.changlab_Rlib, "statlib.R")
    assert os.path.exists(de_lib), "I could not find file: %s" % de_lib
    assert os.path.exists(stat_lib), "I could not find file: %s" % stat_lib
    R('source("%s")' % de_lib)
    R('source("%s")' % stat_lib)

    jmath.R_equals(X, "X")
    jmath.R_equals(Y, "Y")
    if sample_name:
        jmath.R_equals(sample_name, "sample.name")
        jmath.R('colnames(X) <- sample.name')

    geneid = genenames = None
    if geneid_header:
        geneid = MATRIX.row_names(geneid_header)
        jmath.R_equals(geneid, "geneid")
    if genename_header:
        genenames = MATRIX.row_names(genename_header)
        jmath.R_equals(genenames, "genenames")

    # Set up the arguments.
    args = ["X", "Y"]
    if algorithm == "sam":
        args.append("%g" % sam_DELTA)
    if geneid:
        args.append("geneid=geneid")
    if genenames:
        args.append("genenames=genenames")
    # Pass the fold change to the algorithm, because it can affect the
    # multiple hypothesis correction.
    if filter_fold_change is not None:
        args.append("FOLD.CHANGE=%g" % filter_fold_change)
    if algorithm in ["ttest", "deseq2"]:
        args.append("NPROCS=%d" % num_procs)
    #if show_all_genes and algorithm != "sam":
    if algorithm not in ["sam", "fold_change"]:
        args.append("filter.p05=FALSE")
    if algorithm == "edger":
        if edger_tagwise_dispersion:
            args.append("tagwise.dispersion=TRUE")
        else:
            args.append("tagwise.dispersion=FALSE")

    # Prevent SAM from writing junk to the screen.
    handle = StringIO.StringIO()
    old_stdout = sys.stdout
    sys.stdout = handle
    try:
        # Call the proper R function.  DESeq2 throws off a lot of
        # warnings.  Turn them off temporarily.
        fn = algorithm2function[algorithm]
        x = ", ".join(args)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            R("x <- %s(%s)" % (fn, x))
        R("DATA <- x$DATA")
        DATA_R = R["DATA"]
    finally:
        # Always restore stdout, even when the R call fails; otherwise
        # all later output of the process is swallowed.
        sys.stdout = old_stdout

    # Write out a QQ file for SAM.
    if algorithm == "sam" and sam_qq_file:
        R('S <- x$S')
        jmath.R_fn("bitmap",
                   sam_qq_file,
                   type="png256",
                   height=1600,
                   width=1600,
                   units="px",
                   res=300)
        jmath.R_fn("samr.plot", jmath.R_var("S"), sam_DELTA)
        jmath.R_fn("dev.off")

    # Convert this DataFrame into a Python object.  Columns of floats
    # can be StrVector objects if there are NA embedded within them.
    # NA are special objects of either type
    # rpy2.rinterface.NACharacterType or type
    # rpy2.rinterface.NARealType.
    tDATA_py = []
    header = [DATA_R.colnames[i] for i in range(DATA_R.ncol)]
    for col_R in DATA_R:  # iterate over columns
        col_py = [col_R[i] for i in range(len(col_R))]

        if col_R.__class__.__name__ == "StrVector":
            pass
        elif col_R.__class__.__name__ == "FloatVector":
            col_py = [float(x) for x in col_py]
        elif col_R.__class__.__name__ == "IntVector":
            col_py = [int(x) for x in col_py]
        tDATA_py.append(col_py)
    # tDATA_py is a list of columns; transpose it into a list of rows.
    DATA_py = jmath.transpose(tDATA_py)

    #handle = open('test01.txt', 'w')
    #for x in DATA_py:
    #    print >>handle, "\t".join(map(str, x))

    # Convert NA to None.
    for i in range(len(DATA_py)):
        for j in range(len(DATA_py[i])):
            if type(DATA_py[i][j]) in [
                    rinterface.NACharacterType, rinterface.NARealType
            ]:
                DATA_py[i][j] = None

    # Sort by increasing p-value, then decreasing fold change.
    name = "p.value"
    direction = 1
    #if algorithm == "sam":
    #    name = "Score(d)"
    if name not in header:
        name = "Log_2 Fold Change"
        direction = -1
    assert name in header, 'I could not find the "%s" column.' % name

    I = header.index(name)
    #schwartz = [(direction*float(x[I]), x) for x in DATA_py]
    values = [x[I] for x in DATA_py]
    for i in range(len(values)):
        if values[i] is None:
            # Missing values get a large placeholder as their sort key.
            values[i] = direction * 1E10
        else:
            values[i] = direction * float(values[i])
    schwartz = zip(values, DATA_py)
    schwartz.sort()
    DATA_py = [x[-1] for x in schwartz]

    # Filter based on user criteria.
    if fold_change is not None:
        log_2_fc = math.log(fold_change, 2)
        name = "Log_2 Fold Change"
        assert name in header, 'I could not find the "%s" column.' % name
        I = header.index(name)
        DATA_py = [
            x for x in DATA_py if x[I] is not None and abs(x[I]) >= log_2_fc
        ]
    if p_cutoff is not None:
        name = "p.value"
        assert name in header, 'I could not find the "%s" column.' % name
        I = header.index(name)
        DATA_py = [
            x for x in DATA_py if x[I] is not None and float(x[I]) < p_cutoff
        ]
    if fdr_cutoff is not None:
        name = "FDR"
        # This might be missing if all the genes have already been
        # filtered.
        #assert name in header, 'I could not find the "%s" column.' % name
        if name in header:
            I = header.index(name)
            DATA_py = [
                x for x in DATA_py
                if x[I] is not None and float(x[I]) < fdr_cutoff
            ]
    if bonf_cutoff is not None:
        name = "Bonf"
        assert name in header, 'I could not find the "%s" column.' % name
        I = header.index(name)
        DATA_py = [
            x for x in DATA_py
            if x[I] is not None and float(x[I]) < bonf_cutoff
        ]

    ## If no significant genes, then don't produce any output.
    ##if not DATA_py:
    ##    return

    # Write to the outhandle.
    _write_matrix(outfile, header, DATA_py)
    # Don't close someone else's file handle.
    #outhandle.close()

    # Write out the gene sets in GMT format, if requested.
    if not gmt_file:
        return
    assert "Direction" in header, 'I could not find the "Direction" column.'
    assert "Gene ID" in header, 'I could not find the "Gene ID" column.'
    assert "Gene Name" in header, 'I could not find the "Gene Name" column.'
    I_direction = header.index("Direction")
    I_geneid = header.index("Gene ID")
    I_genename = header.index("Gene Name")

    # "Higher in <name1>"
    # "Higher in <name2>"
    # "SAME"
    possible_directions = [
        "Higher in %s" % name1,
        "Higher in %s" % name2, "SAME"
    ]
    direction = [x[I_direction] for x in DATA_py]
    for x in direction:
        assert x.startswith("Higher in ") or x == "SAME"
        assert x in possible_directions
    samples = [x.replace("Higher in ", "") for x in direction]

    genesets = []  # list of (<SAMPLE>, [UP|DN])
    for s in samples:
        if s == "SAME":
            continue
        assert s in [name1, name2]
        # Make genesets relative to name2.  (Assume name1 is control).
        d = "UP"
        if s == name1:
            s, d = name2, "DN"
        genesets.append((s, d))
    genesets_all = sorted({}.fromkeys(genesets))

    # The context manager closes the GMT file even if a write fails.
    with open(gmt_file, 'w') as outhandle:
        for geneset in genesets_all:
            sample, direct = geneset
            I = [i for (i, gs) in enumerate(genesets) if gs == geneset]
            gid = [DATA_py[i][I_geneid] for i in I]
            gn = [DATA_py[i][I_genename] for i in I]
            # gn might be float.  genesetlib expects array of strings.
            #import sys; sys.exit(0)
            gid = genesetlib.clean_genes(gid)
            gn = genesetlib.clean_genes(gn, delim=genename_delim)
            # <SAMPLE>_[ID|NAME]_[UP|DN]
            if gid:
                x = "%s_%s_%s" % (sample, "ID", direct)
                x = [x, "na"] + gid
                print >> outhandle, "\t".join(x)
            if gn:
                x = "%s_%s_%s" % (sample, "NAME", direct)
                x = [x, "na"] + gn
                print >> outhandle, "\t".join(x)
Esempio n. 11
0
def calc_association(phenotypes, scores, ignore_insufficient_groups):
    """Test whether scores differ between phenotype groups.

    Samples whose phenotype is a false value, or whose score is the
    empty string, are dropped.  The remaining scores are modelled
    against a 0/1 group indicator matrix with R's aov().

    Return a dictionary with keys:
    n                    Number of samples.
    m                    Number of groups.
    scores               n-list of <float>
    delta                None or <float>
    phenotypes           n-list of <string>
    groups               n-list of <int>  [0, length(group_names)-1]
    group_names          m-list of <string>  (unique list of pheno)
    num_samples          dict of <group (int)> : <int>
    mean_score           dict of <group (int)> : <float>
    p_value              <float>
    relationship         <string>

    May return None if there is only 1 group, and
    ignore_insufficient_groups is a true value.

    Raises AssertionError if no sample has both a phenotype and a
    score, or if there are fewer than 2 groups and
    ignore_insufficient_groups is false.
    """
    from genomicode import jmath
    from genomicode import sortlib

    # Select only the samples with phenotype and score information.
    I = [i for (i, (p, s)) in enumerate(zip(phenotypes, scores))
         if p and s != ""]
    assert I, "No valid samples."

    phenotypes = [phenotypes[i] for i in I]
    scores = [float(scores[i]) for i in I]

    # Figure out the groupings.
    group_names = sortlib.sort_natural({}.fromkeys(phenotypes))
    if len(group_names) < 2 and ignore_insufficient_groups:
        return None
    assert len(group_names) >= 2, "Need at least 2 groups (%s)." % \
           str(group_names)
    # Map each name to its position once, instead of searching the
    # list of names for every sample.
    name2group = {}
    for i, name in enumerate(group_names):
        name2group.setdefault(name, i)
    groups = [name2group[p] for p in phenotypes]

    # Collect the scores that belong to each group.
    group2scores = {}  # group -> list of scores
    for g, s in zip(groups, scores):
        group2scores.setdefault(g, []).append(s)

    # Calculate the association.  x is an indicator matrix with one
    # row per sample and one column per group.
    y = scores
    x = [[0] * len(group_names) for _ in y]
    for i, g in enumerate(groups):
        x[i][g] = 1
    jmath.start_R()
    jmath.R_equals(x, "x")
    jmath.R_equals(y, "y")
    jmath.R("m <- aov(y~x)")
    p_value = jmath.R('summary(m)[[1]][["Pr(>F)"]][1]')[0]

    # Count other things.
    num_samples = {}
    mean_score = {}
    for g in group2scores:
        num_samples[g] = len(group2scores[g])
        mean_score[g] = jmath.mean(group2scores[g])

    # If there are exactly 2 groups, then find the difference between
    # the two groups.
    delta = None
    if len(group_names) == 2:
        delta = mean_score[1] - mean_score[0]

    # Figure out the relationship.  Groups are visited in index order
    # so that, on a tie, the first group in group_names wins no matter
    # how the dictionary happens to be ordered.
    assert len(group_names) >= 2
    x1 = "Higher"
    if len(group_names) > 2:
        x1 = "Highest"
    relationship = ""
    high_score = None
    for g in sorted(mean_score):
        score = mean_score[g]
        if high_score is not None and score <= high_score:
            continue
        high_score = score
        relationship = "%s in %s" % (x1, group_names[g])

    SCORE = {}
    SCORE["n"] = len(scores)
    SCORE["m"] = len(group_names)
    SCORE["scores"] = scores
    SCORE["phenotypes"] = phenotypes
    SCORE["groups"] = groups
    SCORE["group_names"] = group_names
    SCORE["num_samples"] = num_samples
    SCORE["mean_score"] = mean_score
    SCORE["delta"] = delta
    SCORE["p_value"] = p_value
    SCORE["relationship"] = relationship
    return SCORE
Esempio n. 12
0
def main():
    """Command-line entry point: draw a beeswarm plot with R.

    Reads a Prism-format file (one series per column) with
    read_prism_file, loads each series into an R list, and plots it
    with the R "beeswarm" package to a PNG file, or to a PDF file if
    the output name ends with ".pdf".
    """
    import os
    import argparse

    from genomicode import jmath
    #from genomicode import AnnotationMatrix
    #from genomicode import colorlib
    #from genomicode import pcalib
    from genomicode import hashlib

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("datafile",
                        help="Tab-delimited text file in Prism format.  "
                        "Each column is a series.  First row is header.")
    parser.add_argument(
        "plot_file",
        help="Name of image file, e.g. outfile.png.  "
        "Will generate PNG format by default.  If this file name ends with "
        ".pdf, will generate a PDF file instead.")

    group = parser.add_argument_group(title="General Appearance")
    group.add_argument("--no_box",
                       action="store_true",
                       help="Turn off the box around the plot.")
    group.add_argument("--height",
                       type=int,
                       help="Height (in pixels) of the plot.")
    group.add_argument("--width",
                       type=int,
                       help="Width (in pixels) of the plot.")
    group.add_argument(
        "--mar_left",
        default=1.0,
        type=float,
        help="Scale margin at left of plot.  Default 1.0 (no scaling).")
    group.add_argument("--mar_bottom",
                       default=1.0,
                       type=float,
                       help="Scale margin at bottom of plot.  Default 1.0.")

    group = parser.add_argument_group(title="Plot Labels")
    group.add_argument("--title", help="Put a title on the plot.")
    group.add_argument("--xlab", help="Label the X-axis.")
    group.add_argument("--ylab", help="Label the Y-axis.")

    # NOTE(review): the legend options are parsed and validated, but
    # no legend is drawn anywhere in this function.
    group = parser.add_argument_group(title="Legend")
    group.add_argument("--add_legend",
                       action="store_true",
                       help="Add a legend to the plot.")
    group.add_argument("--legend_inset", type=float, default=0.05, help="")
    LEGEND_LOCATIONS = [
        "bottomright",
        "bottom",
        "bottomleft",
        "left",
        "topleft",
        "top",
        "topright",
        "right",
        "center",
    ]
    group.add_argument("--legend_loc",
                       choices=LEGEND_LOCATIONS,
                       help="Where to draw the legend.")

    group = parser.add_argument_group(title="Point Appearance")
    group.add_argument("--scale_points",
                       default=1.0,
                       type=float,
                       help="Scale the size of the points.  Default 1.0")
    group.add_argument("--default_color",
                       help="Default color of points.  Format: #000000.")

    # Parse the input arguments.
    args = parser.parse_args()
    if not os.path.exists(args.datafile):
        parser.error("File not found: %s" % args.datafile)
    if args.width is not None:
        assert args.width > 10, "too small"
        assert args.width < 4096 * 16, "width too big"
    if args.height is not None:
        assert args.height > 10, "too small"
        assert args.height < 4096 * 16, "height too big"
    assert args.mar_bottom > 0 and args.mar_bottom < 10
    assert args.mar_left > 0 and args.mar_left < 10

    assert args.legend_inset >= 0 and args.legend_inset < 10
    if args.legend_loc is None:
        args.legend_loc = "bottomright"
    assert args.scale_points > 0 and args.scale_points < 20

    if args.default_color:
        assert len(args.default_color) == 7
        assert args.default_color[0] == "#"

    # Read the data file.
    # List of (name, values).
    MATRIX = read_prism_file(args.datafile)

    height = args.height or 2400
    width = args.width or 3200

    # Start R and set up the environment.
    R = jmath.start_R()
    R("library(beeswarm)")

    main = jmath.R_var("NA")
    if args.title:
        main = args.title
    sub = ""
    xlab = ""
    if args.xlab:
        xlab = args.xlab
    ylab = ""
    # Test --ylab itself.  Testing --xlab here dropped the Y label
    # unless an X label was also given.
    if args.ylab:
        ylab = args.ylab

    lwd_box = 2
    #lwd_regr = 3
    cex = 1.0 * args.scale_points
    cex_lab = 1.5
    cex_main = 2.0
    cex_sub = 1.0

    bm_type = "png16m"
    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    jmath.R_fn("bitmap",
               args.plot_file,
               type=bm_type,
               height=height,
               width=width,
               units="px",
               res=300)

    # Set the margins (bottom, left, top, right).
    margins = 5 * 1.2 * args.mar_bottom, 4 * 1.2 * args.mar_left, 4, 2
    mar = [m + 0.1 for m in margins]
    jmath.R_fn("par", mar=mar, RETVAL="op")

    # Build an R list with one named element per series.
    R("X <- list()")
    for title, values in MATRIX:
        title_h = hashlib.hash_var(title)
        jmath.R_equals(values, "x")
        R('X[["%s"]] <- x' % title_h)

    keywds = {
        "cex.axis": cex_lab,  # Y-axis
        "cex.names": cex_lab,  # X-axis
    }
    # Only override the point color when the user asked for one, so
    # the plot is unchanged when --default_color is not given.
    if args.default_color:
        keywds["col"] = args.default_color
    jmath.R_fn(
        "beeswarm",
        jmath.R_var("X"),
        main="",
        xlab="",
        ylab="",
        pch=19,
        cex=cex,
        #axes=jmath.R_var("FALSE"),
        RETVAL="x",
        **keywds)

    if not args.no_box:
        jmath.R_fn("box", lwd=lwd_box)
    jmath.R_fn("title",
               main=main,
               sub=sub,
               xlab=xlab,
               ylab=ylab,
               **{
                   "cex.lab": cex_lab,
                   "cex.main": cex_main,
                   "cex.sub": cex_sub
               })
    R("par(op)")
    jmath.R_fn("dev.off")
Esempio n. 13
0
def sam(X, Y, genenames, delta, foldchange, pngfig):
    """Run a two-class unpaired SAM analysis with the R "samr" package.

    X is a matrix slice, Y is the label list (one label per column of
    X).  genenames is passed to samr as the gene names.  delta and
    foldchange are passed to samr as the DELTA cutoff and the minimum
    fold change.  The SAM plot is written to the file pngfig.

    Returns (gene_ids, [scores, numerators, denominators, foldchanges,
    q_values]).  Each list holds the entries for the up genes followed
    by the entries for the lo genes.  The q-values reported by samr
    (percentages) are divided by 100.

    Raises AssertionError if the first row of X and Y differ in length.
    """
    assert len(X[0]) == len(Y), 'X and Y should be equal length'
    R = jmath.start_R()
    jmath.R_equals_matrix(X, 'x')
    jmath.R_equals(genenames, 'genenames')
    jmath.R_equals(Y, 'y')
    jmath.R_equals(foldchange, 'foldchange')
    jmath.R_equals(delta, 'DELTA')
    R('library(samr)')
    # NOTE(review): length(x) of a matrix counts every cell, so geneid
    # is longer than the number of rows.  Confirm whether nrow(x) was
    # intended.
    R('D<-list(x=x,y=y,logged2=TRUE,geneid = 1:length(x),genenames=genenames)')
    R('S<-samr(D,resp.type="Two class unpaired",nperms=100)')
    R('DTAB<-samr.compute.delta.table(S,min.foldchange=foldchange)')
    R('SIG<-samr.compute.siggenes.table(S,DELTA,D,DTAB,min.foldchange=foldchange)'
      )
    R('up<-SIG$ngenes.up')
    R('lo<-SIG$ngenes.lo')
    R('library(R.utils)')

    # Let jmath build the call so the file name is passed as an
    # argument instead of being pasted into R source code.
    jmath.R_fn("bitmap", pngfig, type="png256")
    try:
        R('samr.plot(S,DELTA)')
        R('title(main=paste("DELTA=",DELTA))')
    finally:
        # Always close the graphics device, even if plotting fails.
        R('dev.off()')

    import rpy2.robjects as robjects
    R = robjects.r
    up = R['up']
    lo = R['lo']

    # (R variable prefix, column of the significant genes table).
    COLUMNS = [
        ("geneID", "Gene ID"),
        ("Score", "Score(d)"),
        ("Numerator", "Numerator(r)"),
        ("Denominator", "Denominator(s+s0)"),
        ("foldchange", "Fold Change"),
        ("q", "q-value(%)"),
    ]

    gene_ids = []
    scores = []
    numerators = []
    denominators = []
    foldchanges = []
    q_values = []
    # The up genes are copied into R variables ending in "1", the lo
    # genes into variables ending in "2".
    for ngenes, table, suffix in [(up, "genes.up", "1"),
                                  (lo, "genes.lo", "2")]:
        if not ngenes[0] > 0:
            continue
        for prefix, column in COLUMNS:
            R('%s%s<-SIG$%s[,"%s"]' % (prefix, suffix, table, column))
        gene_ids.extend(R['geneID' + suffix])
        scores.extend(R['Score' + suffix])
        numerators.extend(R['Numerator' + suffix])
        denominators.extend(R['Denominator' + suffix])
        foldchanges.extend(R['foldchange' + suffix])
        q_values.extend([float(q) / 100 for q in R['q' + suffix]])

    return gene_ids, [scores, numerators, denominators, foldchanges, q_values]
Esempio n. 14
0
def plot_boxplot(filename,
                 group_names,
                 group2values,
                 height=None,
                 width=None,
                 cluster=None,
                 title="",
                 subtitle="",
                 sub="",
                 xlab="",
                 ylab="",
                 subtitle_size=1.0,
                 subtitle_line=0.5,
                 subtitle_col="#000000",
                 xlabel_size=1.0,
                 xlabel_off=False,
                 mar_bottom=1.0,
                 mar_left=1.0,
                 mar_top=1.0):
    """Draw a boxplot with R and write it to filename.

    filename       Image file.  PNG, unless the name ends with ".pdf".
    group_names    List of the names for each group.
    group2values   Dictionary of group_name -> list of values (None
                   values are dropped).  Also, can be a list, which is
                   passed to R as a matrix (values x groups).
    height, width  Size of the image, in pixels.
    cluster        If given, passed to pcalib.choose_colors to pick
                   the colors of the boxes.
    title          Main title.
    subtitle       Goes under the title.
    sub            Goes under the plot.
    xlab, ylab     Axis labels.
    subtitle_size, subtitle_line, subtitle_col
                   Scale, margin line, and color of the subtitle.
    xlabel_size    Scales the size of the group labels on the X-axis.
    xlabel_off     If true, do not label the groups on the X-axis.
    mar_bottom, mar_left, mar_top
                   Scale the margins of the plot.
    """
    import os

    from genomicode import config
    from genomicode import jmath
    from genomicode import colorlib
    from genomicode import pcalib

    # Start R and set up the environment.
    R = jmath.start_R()
    path = config.changlab_Rlib
    plotlib = os.path.join(path, "plotlib.R")
    assert os.path.exists(plotlib), "I cannot find: %s" % plotlib
    jmath.R_fn("source", plotlib)

    main = jmath.R_var("NA")
    if title:
        main = title

    # Decide on the X-axis labels from the xlabel_off flag itself, so
    # nothing depends on how a list compares with an R_var object.
    if xlabel_off:
        xlabel = jmath.R_var("FALSE")
        at = jmath.R_var("NULL")
    else:
        xlabel = group_names
        at = list(range(1, len(group_names) + 1))

    col = jmath.R_var("NULL")
    if cluster is not None:
        col = [colorlib.rgb2hex(rgb).replace("0x", "#")
               for rgb in pcalib.choose_colors(cluster)]

    lwd = 2
    las = 3  # vertical labels
    cex_labels = 1.25 * xlabel_size
    #cex_legend = 1
    cex_xlab = 1.5
    cex_sub = 1.5

    if isinstance(group2values, list):
        # Is matrix.  Should do more checking here.
        jmath.R_equals(group2values, "X")
    else:
        # Build an R list with one element per group, in the order of
        # group_names.  Groups without values become empty elements.
        R("X <- list()")
        for i, n in enumerate(group_names):
            values = [v for v in group2values.get(n, []) if v is not None]
            jmath.R_equals(values, "s")
            R("X[[%d]] <- s" % (i + 1))

    jmath.R_equals(xlabel, "labels")
    jmath.R_equals(at, "at")

    bm_type = "png16m"
    if filename.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    jmath.R_fn("bitmap",
               filename,
               type=bm_type,
               height=height,
               width=width,
               units="px",
               res=300)

    # Set the margins.
    # default is 5.1, 4.1, 4.1, 2.1
    # Without group labels, the bottom margin can be much smaller.
    label_adjust = 1.0
    if xlabel_off:
        label_adjust = 0.2
    margins = (5 * 2.0 * mar_bottom * label_adjust, 4 * 1.2 * mar_left,
               4 * mar_top, 2)
    mar = [m + 0.1 for m in margins]
    jmath.R_fn("par", mar=mar, RETVAL="op")

    # The boxplot is drawn twice with the same settings: once to set
    # up the plot region, and again on top of the white background.
    boxplot_args = {
        "col": col,
        "main": "",
        "xlab": "",
        "ylab": "",
        "axes": jmath.R_var("FALSE"),
        "pch": 19,
        "cex": 1,
        "ylim": jmath.R_var("NULL"),
    }
    jmath.R_fn("boxplot", jmath.R_var("X"), **boxplot_args)
    # Make plot area solid white.
    jmath.R('usr <- par("usr")')
    jmath.R('rect(usr[1], usr[3], usr[2], usr[4], col="#FFFFFF")')
    jmath.R_fn("boxplot",
               jmath.R_var("X"),
               add=jmath.R_var("TRUE"),
               **boxplot_args)

    jmath.R_fn("box", lwd=lwd)
    jmath.R_fn("axis",
               1,
               lwd=lwd,
               labels=jmath.R_var("labels"),
               at=jmath.R_var("at"),
               las=las,
               **{"cex.axis": cex_labels})
    jmath.R_fn("axis", 2, lwd=lwd, **{"cex.axis": 1.5})
    jmath.R_fn("title",
               main=main,
               sub=sub,
               xlab=xlab,
               ylab=ylab,
               **{
                   "cex.lab": cex_xlab,
                   "cex.main": 2.0,
                   "cex.sub": cex_sub,
                   "col.sub": "#A60400"
               })
    if subtitle:
        jmath.R_fn("mtext",
                   subtitle,
                   cex=1.0 * subtitle_size,
                   line=subtitle_line,
                   col=subtitle_col)
    R("par(op)")
    jmath.R_fn("dev.off")
Esempio n. 15
0
def main():
    """Command-line entry point: draw a scatter plot with R.

    Reads a tab-delimited annotation file, pulls out one or more
    (X, Y) data series named by column header, and plots them to a PNG
    file, or to a PDF file if the output name ends with ".pdf".  Can
    also draw connecting lines, an identity line, point labels, a
    regression line, and a legend.  Prints the correlation of X and Y
    and its p-value.
    """
    import os
    import argparse

    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import colorlib
    from genomicode import pcalib

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("datafile", help="Tab-delimited data file.")
    #parser.add_argument("x_header", help="Which column for X values.")
    #parser.add_argument("y_header", help="Which column for Y values.")
    parser.add_argument(
        "plot_file",
        help="Name of image file, e.g. outfile.png.  "
        "Will generate PNG format by default.  If this file name ends with "
        ".pdf, will generate a PDF file instead.")

    group = parser.add_argument_group(title="Data Series")
    group.add_argument(
        "--series",
        action="append",
        help="Add a data series to the plot.  At least one series must be "
        "plotted.  Format: <x_header>;<y_header>")

    group = parser.add_argument_group(title="General Appearance")
    group.add_argument("--no_box",
                       action="store_true",
                       help="Turn off the box around the plot.")
    group.add_argument("--height",
                       type=int,
                       help="Height (in pixels) of the plot.")
    group.add_argument("--width",
                       type=int,
                       help="Width (in pixels) of the plot.")
    group.add_argument(
        "--mar_left",
        default=1.0,
        type=float,
        help="Scale margin at left of plot.  Default 1.0 (no scaling).")
    group.add_argument("--mar_bottom",
                       default=1.0,
                       type=float,
                       help="Scale margin at bottom of plot.  Default 1.0.")
    #group.add_argument(
    #    "--xlabel_size", default=1.0, type=float,
    #    help="Scale the size of the labels on X-axis.  Default 1.0.")
    group.add_argument("--log_x",
                       action="store_true",
                       help="Plot the X-axis on a log scale.")
    group.add_argument("--log_y",
                       action="store_true",
                       help="Plot the Y-axis on a log scale.")
    group.add_argument(
        "--qq",
        action="store_true",
        help="Make a QQ-plot.  Will sort the values to be plotted.")

    group = parser.add_argument_group(title="Plot Labels")
    group.add_argument("--title", help="Put a title on the plot.")
    group.add_argument("--xlab", help="Label the X-axis.")
    group.add_argument("--ylab", help="Label the Y-axis.")
    group.add_argument("--add_regression",
                       action="store_true",
                       help="Put a regression line on the plot.")

    group = parser.add_argument_group(title="Legend")
    group.add_argument("--add_legend",
                       action="store_true",
                       help="Add a legend to the plot.")
    group.add_argument("--legend_inset", type=float, default=0.05, help="")
    LEGEND_LOCATIONS = [
        "bottomright",
        "bottom",
        "bottomleft",
        "left",
        "topleft",
        "top",
        "topright",
        "right",
        "center",
    ]
    group.add_argument("--legend_loc",
                       choices=LEGEND_LOCATIONS,
                       help="Where to draw the legend.")

    group = parser.add_argument_group(title="Point Appearance")
    group.add_argument("--scale_points",
                       default=1.0,
                       type=float,
                       help="Scale the size of the points.  Default 1.0")
    group.add_argument("--label_header",
                       help="Label each point with the values in this column.")
    group.add_argument("--label_size",
                       type=float,
                       help="Scale the size of the labels by this value.")
    group.add_argument("--label_pos",
                       default="top",
                       choices=["top", "bottom", "left", "right"],
                       help="Where to label the points.")

    group = parser.add_argument_group(title="Line Appearance")
    group.add_argument("--add_lines",
                       action="store_true",
                       help="Add lines that connect the points.")
    group.add_argument("--scale_lines",
                       default=1.0,
                       type=float,
                       help="Scale the thickness of the lines.  Default 1.0")

    group = parser.add_argument_group(title="Identity Line")
    group.add_argument("--add_identity_line",
                       action="store_true",
                       help="Add an identity line to the plot.")

    group = parser.add_argument_group(title="Colors")
    group.add_argument(
        "-c",
        "--cluster",
        action="append",
        help="Group samples into a cluster (e.g. -c 1-5); 1-based, inclusive.")
    group.add_argument(
        "--indexes_include_headers",
        "--iih",
        action="store_true",
        help="If not given (default), then index 1 is the first row "
        "with data.  If given, then index 1 is the very first row "
        "in the file, including the headers.")
    group.add_argument("--default_color",
                       help="Default color of points.  Format: #000000.")

    # Parse the input arguments.
    args = parser.parse_args()
    if not os.path.exists(args.datafile):
        parser.error("File not found: %s" % args.datafile)
    if args.width is not None:
        assert args.width > 10, "too small"
        assert args.width < 4096 * 16, "width too big"
    if args.height is not None:
        assert args.height > 10, "too small"
        assert args.height < 4096 * 16, "height too big"
    assert args.mar_bottom > 0 and args.mar_bottom < 10
    assert args.mar_left > 0 and args.mar_left < 10
    #assert args.xlabel_size > 0 and args.xlabel_size < 10

    assert args.legend_inset >= 0 and args.legend_inset < 10
    if args.legend_loc is None:
        args.legend_loc = "bottomright"

    if args.default_color:
        assert len(args.default_color) == 7
        assert args.default_color[0] == "#"

    MATRIX = AnnotationMatrix.read(args.datafile, False)
    assert MATRIX.num_headers() and MATRIX.num_annots(), "Empty matrix."
    assert args.series, "Need to add a data --series to plot."
    if args.label_header:
        assert args.label_header in MATRIX.headers, \
               "header not found: %s" % args.label_header
    if args.label_size is not None:
        assert args.label_size > 0 and args.label_size <= 20
    assert args.scale_points > 0 and args.scale_points < 20
    assert args.scale_lines > 0 and args.scale_lines < 20

    series = _parse_series(MATRIX, args.series)
    cluster = None
    if args.cluster:
        cluster = _parse_cluster(args.cluster, args.indexes_include_headers,
                                 MATRIX)

    if len(series) > 1:
        assert not cluster, "Series and cluster not implemented."

    height = args.height or 2400
    width = args.width or 3200

    # Pull out the values and colors for the plot.
    default_color = "#000000"
    if args.default_color:
        default_color = args.default_color

    assert len(series) < len(colorlib.BREWER_QUALITATIVE_SET1)
    series_data = []  # list of (x_values, y_values, col) for each series
    series_colors = []  # one color per series, used for the legend
    for i, (x_header, y_header) in enumerate(series):
        x = MATRIX[x_header]
        y = MATRIX[y_header]
        # Keep only the rows that have both an X and a Y value.
        I = [j for (j, (a, b)) in enumerate(zip(x, y)) if a and b]
        x = [float(x[j]) for j in I]
        y = [float(y[j]) for j in I]
        c = default_color
        if len(series) > 1:
            rgb = colorlib.BREWER_QUALITATIVE_SET1[i]
            c = colorlib.rgb2hex(rgb, prefix="#")
        series_colors.append(c)
        series_data.append((x, y, [c] * len(x)))

    # Merge all the data point for each series.
    x_values = []
    y_values = []
    col = []
    for (x, y, c) in series_data:
        x_values.extend(x)
        y_values.extend(y)
        col.extend(c)
    assert len(x_values) == len(y_values)
    assert len(x_values) == len(col)

    if args.qq:
        O = jmath.order(x_values)
        x_values = [x_values[i] for i in O]
        y_values = [y_values[i] for i in O]
        col = [col[i] for i in O]

    # NOTE(review): the cluster colors are applied after the --qq
    # reordering.  Confirm whether cluster indexes are meant to follow
    # the order of the file or the sorted order.
    if cluster is not None:
        col_rgb = pcalib.choose_colors(cluster)
        col = [default_color] * len(col_rgb)
        for i in range(len(col_rgb)):
            if col_rgb[i] is None:
                continue
            col[i] = colorlib.rgb2hex(col_rgb[i], prefix="#")
        assert len(col) == len(x_values)

    # Start R and set up the environment.
    R = jmath.start_R()

    main = jmath.R_var("NA")
    if args.title:
        main = args.title
    sub = ""
    # With a single series, the axis labels default to the headers.
    xlab = ""
    if len(series) == 1:
        xlab = x_header
    if args.xlab:
        xlab = args.xlab
    ylab = ""
    if len(series) == 1:
        ylab = y_header
    # Test --ylab itself.  Testing --xlab here ignored --ylab unless
    # --xlab was given, and blanked the Y label when only --xlab was.
    if args.ylab:
        ylab = args.ylab

    lwd_box = 2
    lwd_axis = 2
    lwd_regr = 3
    cex = 1.0 * args.scale_points
    cex_lab = 1.5
    cex_main = 2.0
    cex_sub = 1.0

    plot_log = ""
    if args.log_x:
        plot_log += "x"
    if args.log_y:
        plot_log += "y"

    assert x_values
    assert y_values
    jmath.R_equals(x_values, "X")
    jmath.R_equals(y_values, "Y")

    bm_type = "png16m"
    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    jmath.R_fn("bitmap",
               args.plot_file,
               type=bm_type,
               height=height,
               width=width,
               units="px",
               res=300)

    # Set the margins (bottom, left, top, right).
    margins = 5 * 1.2 * args.mar_bottom, 4 * 1.2 * args.mar_left, 4, 2
    mar = [m + 0.1 for m in margins]
    jmath.R_fn("par", mar=mar, RETVAL="op")

    jmath.R_fn("plot",
               jmath.R_var("X"),
               jmath.R_var("Y"),
               main="",
               xlab="",
               ylab="",
               pch=19,
               cex=cex,
               log=plot_log,
               col=col,
               axes=jmath.R_var("FALSE"),
               RETVAL="x")

    if args.add_lines:
        lwd = 4 * args.scale_lines
        i = 0
        for (x, y, c) in series_data:
            # Cannot use c for the color.  It might've been changed by
            # --cluster.
            assert col and i < len(col)
            c = col[i:i + len(x)]
            i += len(x)

            # The "lines" function takes a scalar for col (except for
            # type=h, histogram vertical lines).  If there are
            # multiple colors, then split up the points based on the
            # colors.
            l_x, l_y, l_c = [], [], None
            for j in range(len(x)):
                if c[j] != l_c and l_x:
                    jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c)
                    # Start the next segment at the previous point so
                    # that the points will connect.
                    l_x = [l_x[-1]]
                    l_y = [l_y[-1]]
                l_x.append(x[j])
                l_y.append(y[j])
                l_c = c[j]
            if l_x:
                jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c)

    if args.add_identity_line:
        lwd = 4

        x_min, x_max = min(x_values), max(x_values)
        y_min, y_max = min(y_values), max(y_values)

        # Draw the line over the range covered by both X and Y.
        iden_min = max(x_min, y_min)
        iden_max = min(x_max, y_max)

        l_x = [iden_min, iden_max]
        l_y = l_x
        l_c = "#FF0000"
        jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c)

    if args.label_header:
        cex = 1
        if args.label_size is not None:
            cex = args.label_size
        pos2specifier = {
            "top": 3,
            "bottom": 1,
            "left": 2,
            "right": 4,
        }
        pos = pos2specifier[args.label_pos]
        # NOTE(review): the labels cover every row of the file, while
        # X and Y leave out rows with blank values and may have been
        # reordered by --qq.  Confirm the labels line up.
        point_labels = MATRIX[args.label_header]
        jmath.R_fn("text",
                   jmath.R_var("X"),
                   jmath.R_var("Y"),
                   labels=point_labels,
                   cex=cex,
                   pos=pos)

    # Calculate correlation, and other statistics.
    # TODO: Should calculate this for each series.
    r = jmath.R("cor(X, Y)")
    p_value = jmath.R("cor.test(X, Y)$p.value")
    r = r[0]
    p_value = p_value[0]
    print("R = %.2f" % r)
    print("p = %.2g" % p_value)

    # Add a regression line.
    if args.add_regression:
        jmath.R("fit <- lm(Y ~ X)")
        coef = jmath.R("fit$coefficients")
        assert len(coef) == 2
        b, m = coef  # intercept, slope
        x1 = min(x_values)
        y1 = x1 * m + b
        x2 = max(x_values)
        y2 = x2 * m + b
        jmath.R_fn("lines", [x1, x2], [y1, y2],
                   lwd=lwd_regr,
                   lty=2,
                   col="#C63F31")
        sub = "R=%.2f (p=%.2g)" % (r, p_value)
        header = "X", "Y", "R", "p"
        print("\t".join(header))
        x = xlab, ylab, r, p_value
        print("\t".join(map(str, x)))

    if args.add_legend:
        leg = [x[1] for x in series]
        # One fill color per series.  The third element of each
        # series_data entry is a per-point list of colors, which
        # cannot be used as the fill.
        fill = series_colors
        #jmath.R("x <- rgb(0.5, 0.5, 0.5, 0.5)")
        # alpha does not seem to be supported here.
        jmath.R_fn("legend",
                   args.legend_loc,
                   legend=leg,
                   fill=fill,
                   inset=args.legend_inset)

    if not args.no_box:
        jmath.R_fn("box", lwd=lwd_box)
    jmath.R_fn("axis", 1, lwd=lwd_axis, **{"cex.axis": 1.5})
    jmath.R_fn("axis", 2, lwd=lwd_axis, **{"cex.axis": 1.5})
    jmath.R_fn("title",
               main=main,
               sub=sub,
               xlab=xlab,
               ylab=ylab,
               **{
                   "cex.lab": cex_lab,
                   "cex.main": cex_main,
                   "cex.sub": cex_sub
               })
    R("par(op)")
    jmath.R_fn("dev.off")
Esempio n. 16
0
def draw_venn1(filename, all_names, name2genes, all_labels, args_margin,
               args_title, args_title_size, args_title_y, args_label_size,
               args_count_size):
    """Draw a Venn diagram using the R VennDiagram package.

    Only 2 to 5 areas are supported.  No Euler plots.  Generates TIFF
    files.

    filename         Passed to venn.diagram as the output file name.
    all_names        List of gene set names.  Names that are not keys of
                     name2genes are dropped before plotting.
    name2genes       Dictionary of gene set name -> list of genes.
    all_labels       Labels shown for each gene set; parallel to
                     all_names.
    args_margin      Scales the space around the plot (bigger margin is
                     smaller figure).
    args_title       Title of the plot.  No title is drawn if false.
    args_title_size  Scales the size of the title.
    args_title_y     Vertical position of the title.  0 is the bottom,
                     1 is the top.  The title is centered horizontally.
    args_label_size  Scales the size of the category labels.
    args_count_size  Scales the size of the number in each area.

    Raises an AssertionError if more than 5 gene sets remain after
    dropping the missing ones, and NotImplementedError if fewer than 2
    remain.

    """
    import sys
    import StringIO
    from genomicode import jmath
    R_fn = jmath.R_fn
    R_var = jmath.R_var
    R_equals = jmath.R_equals

    R = jmath.start_R()

    # Prevent R from writing junk to the screen.  Restore the streams
    # even if the library fails to load, so that the caller's error
    # messages are not swallowed.
    old_stdout, old_stderr = sys.stdout, sys.stderr
    sys.stdout, sys.stderr = StringIO.StringIO(), StringIO.StringIO()
    try:
        R_fn('library', R_var('VennDiagram'))
    finally:
        sys.stdout, sys.stderr = old_stdout, old_stderr

    # venn diagram can't handle missing gene sets.  Get rid of them.
    I = [i for (i, x) in enumerate(all_names) if x in name2genes]
    all_names = [all_names[i] for i in I]
    all_labels = [all_labels[i] for i in I]
    num_sets = len(all_names)

    # Five is the maximum supported by package.
    assert num_sets <= 5, "Can't draw venn diagram with %d circles." % \
           num_sets

    # Assign each gene set to its own R variable (A, B, C, ...).
    varnames = ["A", "B", "C", "D", "E"][:num_sets]
    for varname, name in zip(varnames, all_names):
        R_equals(name2genes[name], varname)
    if num_sets < 2:
        raise NotImplementedError(
            "Need at least 2 gene sets to draw a venn diagram (found %d)." %
            num_sets)
    # e.g. x <- list(A=A, B=B, C=C)
    R('x <- list(%s)' % ", ".join(["%s=%s" % (v, v) for v in varnames]))
    for i, label in enumerate(all_labels):
        # The label is pasted into an R string literal.  Escape
        # backslashes and double quotes so they cannot break the
        # statement.
        label = ("%s" % label).replace("\\", "\\\\").replace('"', '\\"')
        R('names(x)[%d] <- "%s"' % (i + 1, label))

    # Bigger margin is smaller figure.
    margin = 0.10 * args_margin
    cat_cex = 0.75 * args_label_size  # Size of category labels.
    cex = 0.65 * args_count_size  # Size of number in each circle.

    # The length of fill needs to match the number of gene sets drawn.
    num_sets2fill = {
        2: ["cornflowerblue", "darkorchid1"],
        3: ["cornflowerblue", "green", "yellow"],
        4: ["dodgerblue", "goldenrod1", "seagreen3", "orchid3"],
        5: [
            "dodgerblue", "goldenrod1", "darkorange1", "seagreen3", "orchid3"
        ],
    }
    fill = num_sets2fill[num_sets]

    # main.pos
    # (0, 0) is lower left.
    # (1, 1) is upper right.
    # (0.5, 1) is middle top.
    main = R_var("NULL")
    if args_title:
        main = args_title
    main_cex = 2.0 * args_title_size

    font_family = "Helvetica"

    params = {
        "main": main,
        "main.pos": (0.5, args_title_y),
        "main.fontfamily": font_family,
        "main.cex": main_cex,
        "col": "#000000",  # color of outer lines
        "lty": 2,  # dashed line
        "fill": fill,  # color of circles
        "alpha": 0.50,

        # Number of items.
        "cex": cex,
        "fontfamily": font_family,

        # Category labels
        "cat.cex": cat_cex,
        "cat.col": "#333333",
        "cat.fontfamily": font_family,
        "cat.default.pos": "outer",
        "margin": margin,
    }
    R_fn("venn.diagram", R_var("x"), filename=filename, **params)
Esempio n. 17
0
def draw_venn2(filename, all_names, name2genes, all_labels, args_margin,
               args_title, args_title_size, args_title_y, args_label_size,
               args_count_size):
    """Draw a Venn diagram using the R venneuler package.

    Generates PDF files.  Not good, because it doesn't provide a way to
    label the intersection.

    filename         Name of the PDF file to write.
    all_names        List of gene set names.  Names missing from
                     name2genes are treated as empty gene sets.
    name2genes       Dictionary of gene set name -> list of genes.
    args_title       Title of the plot.  No title is drawn if false.
    args_title_size  Scales the size of the title.

    Accepted for compatibility with draw_venn1, but not implemented
    yet: all_labels, args_margin, args_title_y, args_label_size,
    args_count_size.

    Raises an AssertionError if there are too many gene sets to name
    with single uppercase letters, or if a gene is assigned to the same
    gene set more than once.

    """
    import sys
    import string
    import StringIO
    from genomicode import jmath

    R_fn = jmath.R_fn
    R_var = jmath.R_var
    R_equals = jmath.R_equals

    R = jmath.start_R()

    # Prevent R from writing junk to the screen.  Restore the streams
    # even if the library fails to load, so that the caller's error
    # messages are not swallowed.
    old_stdout, old_stderr = sys.stdout, sys.stderr
    sys.stdout, sys.stderr = StringIO.StringIO(), StringIO.StringIO()
    try:
        R_fn('library', R_var('venneuler'))
    finally:
        sys.stdout, sys.stderr = old_stdout, old_stderr

    # For figuring out the colors for the legend.
    x = [
        "col.fn <- function(col, alpha=0.3) {",
        "  col<- hcl(col * 360, 130, 60)",
        "  col <- col2rgb(col)/255",
        "  col <- rgb(col[1, ], col[2, ], col[3, ], alpha)",
        "  col",
        "}",
    ]
    x = "\n".join(x)
    R(x)

    assert len(all_names) < len(string.ascii_uppercase)
    varnames = list(string.ascii_uppercase[:len(all_names)])

    # Look up the variable name of each gene set once, rather than
    # searching all_names for every gene.  If a name is repeated, the
    # first occurrence wins (same as list.index).
    name2varname = {}
    for name, varname in zip(all_names, varnames):
        name2varname.setdefault(name, varname)

    # Assign each of the genes to a specific list.
    gene2names = {}  # gene -> list of names, in the order of all_names
    for name in all_names:
        for gene in name2genes.get(name, []):
            names = gene2names.setdefault(gene, [])
            assert name not in names
            names.append(name)

    # Count the genes that belong to exactly each combination of gene
    # sets.
    combo2count = {}  # (varname1, varname2[, ...]) -> number of genes
    for names in gene2names.values():
        combo = tuple([name2varname[n] for n in names])
        combo2count[combo] = combo2count.get(combo, 0) + 1
    sets = []
    weights = []
    for combo in sorted(combo2count):
        sets.append("&".join(combo))
        weights.append(combo2count[combo])
    R_equals(weights, "M")
    R_equals(sets, "n")
    R("names(M) <- n")

    # Maybe can draw legend instead.
    # http://stackoverflow.com/questions/9121956/legend-venn-diagram-in-venneuler

    R_fn("venneuler", R_var("M"), RETVAL="v")

    main = R_var("NULL")
    if args_title:
        main = args_title
    main_cex = 2.0 * args_title_size

    font_family = "Helvetica"

    # bitmap with type="pdfwrite" somehow doesn't work with
    # transparency, so use the pdf device.
    R_fn("pdf", filename)
    try:
        R("COL <- col.fn(v$colors)")
        R("LABS <- v$labels")
        # Blank out the labels that venneuler would draw.
        for i in range(len(all_names)):
            R('v$labels[%d] <- ""' % (i + 1))
        R_fn("plot", R_var("v"))
        R_fn("title", main=main, fontfamily=font_family,
             **{"cex.main": main_cex})
        # Writing the counts myself (to control the text size) doesn't
        # work.  Hard to find right place to label.

        # For drawing a legend so I know what is what.  The legend
        # itself is not drawn yet; it would use legend=all_labels and
        # fill=COL[O].
        R_fn("match", varnames, R_var("LABS"), RETVAL="O")
    finally:
        # Always close the device, or a failed plot leaves it open.
        R_fn("dev.off")
Esempio n. 18
0
def plot_waterfall(
    filename, scores, phenotypes, group_names, sample_names, p_value, gene_id,
    mar_bottom, mar_left, mar_top, xlabel_off):
    """Draw a waterfall (sorted bar) plot of the scores using R.

    filename      File to write.  Written as PDF if the name ends with
                  ".pdf" (case insensitive), otherwise as PNG.
    scores        List of numbers, one per sample.  Bars are drawn in
                  the order given by jmath.order(scores).
    phenotypes    Group of each sample; parallel to scores.  Each one
                  must be in group_names.
    group_names   List of the groups (at least 2).  Shown in the legend.
    sample_names  Labels for the bars; parallel to scores.
    p_value       Currently not used.
    gene_id       Used as the title of the plot.
    mar_bottom, mar_left, mar_top
                  Scale the bottom, left, and top margins.
    xlabel_off    If true, the sample names are not drawn and the bottom
                  margin is made smaller.

    """
    import os
    from genomicode import jmath
    from genomicode.jmath import R_fn, R_var, R_equals
    from genomicode import config
    from genomicode import colorlib
    import analyze_clinical_outcome as aco

    # Sort by increasing score.
    order = jmath.order(scores)
    scores = [scores[i] for i in order]
    phenotypes = [phenotypes[i] for i in order]
    sample_names = [sample_names[i] for i in order]

    # Pick one color per group.  Two groups get fixed colors; more
    # groups get a generated palette.
    assert len(group_names) >= 2
    if len(group_names) > 2:
        palette = colorlib.bild_colors(len(group_names))
        colors = [aco.colortuple2hex(*rgb) for rgb in palette]
    else:
        colors = ['#1533AD', '#FFB300']

    R = jmath.start_R()
    plotlib = os.path.join(config.changlab_Rlib, "plotlib.R")
    assert os.path.exists(plotlib), "I cannot find: %s" % plotlib
    R_fn("source", plotlib)

    # Color of each bar is the color of the group of the sample.
    bar_colors = [colors[group_names.index(p)] for p in phenotypes]

    # Leave 10% of the range of the scores as padding above and below.
    highest = max(scores)
    lowest = min(scores)
    padding = (highest - lowest) * 0.10
    ylim = (lowest - padding, highest + padding)

    # Sizes of the elements in the plot.
    xlabel_size = 1.0
    line_width = 2
    vertical_labels = 3  # las=3 is vertical labels
    cex_labels = 1.25 * xlabel_size
    cex_ytick = 1.5
    cex_xlab = 2.0
    cex_ylab = 2.0
    cex_sub = 2.0

    R_equals(sample_names, "labels")
    R_equals(scores, "y")

    if filename.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    else:
        bm_type = "png16m"
    R_fn(
        "bitmap", filename, type=bm_type,
        height=1600, width=1600, units="px", res=300)

    # Set the margins.  Without sample names, less room is needed at
    # the bottom.
    if xlabel_off:
        R_equals(R_var("FALSE"), "labels")
        xlabel_bottom = 0.5
    else:
        xlabel_bottom = 2.0
    base_margins = 5*mar_bottom*xlabel_bottom, 5*mar_left, 4*mar_top, 2
    mar = [m+0.1 for m in base_margins]
    R_fn("par", mar=mar, RETVAL="op")

    # Draw the bars once to set up the coordinates.  They are painted
    # over and drawn again (in color) below.
    R_fn(
        "barplot", R_var("y"), xlab="", ylab="",
        axes=R_var("FALSE"), ylim=ylim, xpd=R_var("FALSE"),
        RETVAL="mp")
    # Make plot area solid white.
    jmath.R('usr <- par("usr")')
    jmath.R('rect(usr[1], usr[3], usr[2], usr[4], col="#FFFFFF")')
    R_fn("box", lwd=line_width)

    # X-axis, with mgp changed only for this axis.
    R_fn("par", mgp=(3, 1.5, 0), RETVAL="op2")
    R_fn(
        "axis", 1, lwd=line_width, labels=R_var("labels"),
        at=R_var("mp"), las=vertical_labels, **{ "cex.axis" : cex_labels })
    R("par(op2)")

    # Y-axis and titles.
    R_fn("axis", 2, lwd=line_width, **{ "cex.axis" : cex_ytick })
    R_fn(
        "title", main=gene_id, sub="", xlab="", ylab="",
        **{ "cex.lab" : cex_xlab, "cex.main" : 2.0, "cex.sub" : cex_sub,
            "col.sub" : "#A60400" })
    R_fn("title", ylab="Gene Expression", **{ "cex.lab" : cex_ylab } )

    # Colored bars on top of the white background.
    R_fn(
        "barplot", R_var("y"), col=bar_colors, xlab="", ylab="",
        axes=R_var("FALSE"), ylim=ylim, add=R_var("TRUE"), xpd=R_var("FALSE"))
    R_fn(
        "legend", "topleft", legend=group_names, fill=colors, inset=0.05,
        bg="#FFFFFF")
    R("par(op)")
    R_fn("dev.off")