def t_test(X, Y, exact=True):
    """Run R's t.test on each pair of corresponding rows of X and Y.

    X, Y   matrix slices: equal-length lists of rows of numbers.  NaN
           entries are sent to R as NA.  The inputs are not modified.
    exact  kept for backward compatibility.
           NOTE(review): this value is never transferred to R; the R
           expression below refers to an R-side variable named `exact`.
           Confirm the intent before relying on it.

    Return tuple (list of t-values, list of p-values).  t-values and
    p-values are floats or None (for NA, inf, nan, or a failed test).
    """
    assert len(X) == len(Y), 'X and Y should be equal length'
    R = jmath.start_R()

    def _nan_to_NA(row):
        # Build a new list so that the caller's matrix is left untouched.
        return [jmath.R_var('NA') if numpy.isnan(v) else v for v in row]

    def _as_float_or_none(r_value):
        if str(r_value) in ["NA", "inf", "nan"]:
            return None
        return float(r_value[0])

    t_value = []
    p_value = []
    for x_row, y_row in zip(X, Y):
        jmath.R_equals(_nan_to_NA(x_row), 'x')
        jmath.R_equals(_nan_to_NA(y_row), 'y')
        R('a<-try(t.test(x,y,exact=exact), silent=TRUE)')
        R('if (is(a, "try-error")) p=NA else p=a$p.value')
        # The t statistic of an htest object is stored in "statistic".
        # "a$t" matches no component, so it was always NULL (-> NA).
        R('if (is(a, "try-error")) t=NA else t=a$statistic')
        R('if (is.null(t)) t=NA')
        p_value.append(_as_float_or_none(R["p"]))
        t_value.append(_as_float_or_none(R["t"]))
    return t_value, p_value
def convert_gene_list_platform(genes, platform):
    """Map a list of gene/probe IDs onto another platform with biomaRt.

    genes     list of IDs; the source chip is guessed from them.
    platform  name of the target platform; must be one of the names in
              arrayplatformlib.platforms.

    Returns the second column of the getLDS() result as a list of
    strings.  Raises AssertionError if the target platform is unknown
    or the source chip cannot be guessed.
    """
    from genomicode import jmath
    from genomicode import arrayplatformlib

    known_platforms = [p.name for p in arrayplatformlib.platforms]
    assert platform in known_platforms, (
        'we cannot convert to the platform %s' % platform)

    chip = arrayplatformlib.guess_chip_from_probesets(genes)
    assert chip, 'we cannot guess the platform for the input file'

    # BioMart attribute and organism for the source and the target.
    in_attribute = arrayplatformlib.get_bm_attribute(chip)
    in_mart = arrayplatformlib.get_bm_organism(chip)
    out_attribute = arrayplatformlib.get_bm_attribute(platform)
    out_mart = arrayplatformlib.get_bm_organism(platform)

    R = jmath.start_R()
    jmath.R_equals_vector(genes, 'gene_id')
    R('library(biomaRt)')

    # Source mart.  The input attribute is also used as the filter.
    jmath.R_equals(in_attribute, 'in_attribute')
    jmath.R_equals(in_attribute, 'filters')
    jmath.R_equals(in_mart, 'in_mart')
    R('old=useMart("ensembl",in_mart)')

    # Target mart.
    jmath.R_equals(out_attribute, 'out_attribute')
    jmath.R_equals(out_mart, 'out_mart')
    R('new=useMart("ensembl",out_mart)')

    query = (
        'homolog = getLDS(attributes=in_attribute,'
        'filters=filters,values=gene_id,mart=old,'
        'attributesL=out_attribute,martL=new)')
    R(query)

    # Column 0 holds the original IDs, column 1 the converted ones.
    converted = R['homolog'][1]
    return [str(gene) for gene in converted]
def main():
    """Run a GenePattern module through the R GenePattern client.

    Command line arguments:
      module_name            Name of the module to run.
      --parameters key:value Module parameter; may be given repeatedly.
      --id_and_version       If given, it is appended to the GenePattern
                             LSID prefix and used instead of module_name.
      -o outpath             Directory that receives the result files.

    Raises AssertionError if a parameter is not in key:value format, if
    the output directory does not exist after the run, or if the run
    produced a stderr.txt file (its content is put in the message).
    """
    parser = argparse.ArgumentParser(description='run the gene pattern module')
    parser.add_argument('--parameters', default=[], action='append',
                        help='key:value')
    parser.add_argument('-o', dest='outpath', default=".",
                        help='Directory to the save the results.')
    parser.add_argument("module_name", nargs=1)
    parser.add_argument(
        "--id_and_version",
        help='specify the lsid and version in id:verison format')
    args = parser.parse_args()

    module_name = args.module_name[0]
    parameters = dict()
    for i in args.parameters:
        assert ':' in i, 'parameters should be in key:value format'
        key, value = i.split(':', 1)
        assert ':' not in value, 'parameters should be in key:value format'
        parameters[key] = value

    # given the module_name and the module parameters in dict, call
    # module in Genepattern
    R = jmath.start_R()
    jmath.R_equals(config.gp_user, 'username')
    jmath.R_equals(config.gp_passwd, 'password')
    jmath.R_equals(config.gp_server, 'servername')
    R('library(GenePattern)')
    R('gp.client <- gp.login(servername, username, password)')

    params = ["gp.client"]
    if args.id_and_version:
        params.append(
            "'urn:lsid:broad.mit.edu:cancer.software.genepattern.module.analysis:%s'"
            % args.id_and_version)
    else:
        params.append("'%s'" % module_name)
    # NOTE(review): keys and values are pasted into R source without
    # escaping, so a quote in a value breaks (or alters) the R call.
    for key, value in parameters.items():
        params.append("%s='%s'" % (key, value))
    params_str = ", ".join(params)
    R("result <- run.analysis(%s)" % params_str)

    # Download the files to outpath.
    jmath.R_equals(args.outpath, 'outpath')
    R('job.result.download.files(result, outpath)')
    assert os.path.exists(args.outpath), \
        "Missing output directory for: %s" % module_name

    # Look for "stderr.txt".  Read it with a context manager so that the
    # handle is closed, and raise explicitly so that the check is not
    # stripped when Python runs with -O.
    result_files = os.listdir(args.outpath)
    if 'stderr.txt' in result_files:
        with open(os.path.join(args.outpath, 'stderr.txt')) as handle:
            error_text = handle.read()
        raise AssertionError(
            "Run failed. GenePattern generated an error:\n%s" % error_text)
def _start_R():
    """Return the shared R session, creating it on first use.

    The first call starts R through jmath, loads the biomaRt library
    and stores the session in the module-level GLOBAL_R.  Later calls
    return that stored session.
    """
    global GLOBAL_R
    from genomicode import jmath

    if GLOBAL_R is not None:
        return GLOBAL_R

    session = jmath.start_R()
    session('library(biomaRt)')
    GLOBAL_R = session
    return GLOBAL_R
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    outfile):
    """Normalize a directory of Agilent files with R and write a matrix.

    antecedents  node whose `identifier` is the directory of Agilent
                 files (every file in it is given to read.Agilent).
    outfile      path of the tab-delimited file to write.
    network, out_attributes, user_options, num_cores are not used here.

    Each output line holds the "ProbeName" column followed by every
    column after "Sequence" (or after "ControlType" when there is no
    "Sequence" column).  Every field, including the last, is followed
    by a tab.

    Raises AssertionError if the output file is missing or empty.
    """
    import os
    from genomicode import jmath
    from genomicode import filelib

    in_data = antecedents
    cwd = os.getcwd()
    R = jmath.start_R()
    R('require(limma,quietly=TRUE)')
    R('library(marray)')

    # read.Agilent is given bare file names, so run it from inside the
    # data directory and always change back afterwards.
    os.chdir(in_data.identifier)
    try:
        R('dir<-getwd()')
        R('files<-list.files(dir)')
        R('x.read<-read.Agilent(files)')
    finally:
        os.chdir(cwd)
    R('xnorm.loc <- maNorm(x.read, norm = "loess")')
    R('x.norm <- maNormScale(xnorm.loc, norm = "p")')

    # Let R dump the normalized data to a scratch file, read it back,
    # and remove the scratch file even if something fails on the way.
    tmpfile = 'tmp.txt'
    jmath.R_equals(tmpfile, 'tmpfile')
    try:
        R('write.marray(x.norm,tmpfile)')
        with open(tmpfile, 'r') as handle:
            text = handle.readlines()
    finally:
        if os.path.exists(tmpfile):
            os.remove(tmpfile)

    # Header fields are still quoted in the file written by R.
    firstline = text[0].split()
    firstindex = firstline.index('"ProbeName"')
    if '"Sequence"' in firstline:
        secondindex = firstline.index('"Sequence"')
    else:
        secondindex = firstline.index('"ControlType"')
    sample = range(secondindex + 1, len(firstline))

    with open(outfile, 'w') as out:
        for row in text:
            fields = row.split()
            cells = [fields[firstindex]] + [fields[j] for j in sample]
            # Keep the historical format: a tab after every field.
            out.write('\t'.join(cells) + '\t\n')

    assert filelib.exists_nz(outfile), (
        'the output file %s for preprocess_agilent fails' % outfile
    )
def run(
    self, network, antecedents, out_attributes, user_options, num_cores,
    outfile):
    """Train a random forest in R and predict the class of test samples.

    antecedents  pair (cls_node_train, data_node).  cls_node_train's
                 `identifier` is a label file for the training samples;
                 data_node's `identifier` is the expression matrix.  The
                 first len(label_line) columns of the matrix are used
                 for training, the remaining columns are predicted.
    outfile      path of the tab-delimited result file with columns
                 Sample_name, Predicted_class, Confidence (Confidence
                 is always left empty).
    network, out_attributes, user_options, num_cores are not used here.
    """
    import arrayio
    from Betsy import read_label_file
    from genomicode import jmath

    cls_node_train, data_node = antecedents
    _, label_line, second_line = read_label_file.read(
        cls_node_train.identifier)
    y = [second_line[int(i)] for i in label_line]

    R = jmath.start_R()
    M = arrayio.read(data_node.identifier)
    num_train = len(label_line)
    M_train = M.matrix(None, range(0, num_train))
    M_test = M.matrix(None, range(num_train, M.dim()[1]))

    # The matrices are transposed before they are handed to R.
    jmath.R_equals_matrix(jmath.transpose(M_train.slice()), 'data')
    jmath.R_equals_matrix(jmath.transpose(M_test.slice()), 'test')
    jmath.R_equals(y, 'y')
    R('y<-as.factor(y)')
    R('require(randomForest, quietly=TRUE)')
    R('library(randomForest)')
    R('model <- randomForest(data,y=y,importance=TRUE)')
    R('predict_result <- predict(model, test)')

    # The prediction is an R factor: 1-based codes into `levels`.
    predict_result = R['predict_result']
    levels = predict_result.levels
    predict_labels = [levels[code - 1] for code in predict_result[:]]

    # Use the first available set of column names as the sample names.
    name = list(M_test._col_names.keys())[0]
    sample_name = M_test._col_names[name]

    rows = [['Sample_name', 'Predicted_class', 'Confidence']]
    for i in range(len(sample_name)):
        rows.append([str(sample_name[i]), predict_labels[i], ''])

    # The context manager closes the file even if a write fails.
    with open(outfile, 'w') as handle:
        for row in rows:
            handle.write('\t'.join(row))
            handle.write('\n')
def start_and_init_R():
    """Return the shared R session, starting and initializing it once.

    On the first call this checks that config.changlab_Rlib and the
    three R libraries in it (kaplanmeierlib.R, statlib.R, prismlib.R)
    exist, starts R, loads the splines package, sources the libraries
    and caches the session in the module-level GLOBAL_R.  Later calls
    return the cached session.

    Raises AssertionError if the directory or a library is missing.
    """
    global GLOBAL_R
    import os
    from genomicode import jmath
    from genomicode import config

    if GLOBAL_R is not None:
        return GLOBAL_R

    assert os.path.exists(config.changlab_Rlib)
    lib_names = ["kaplanmeierlib.R", "statlib.R", "prismlib.R"]
    lib_files = [
        os.path.join(config.changlab_Rlib, name) for name in lib_names]

    # Check every library before R is started.
    for filename in lib_files:
        assert os.path.exists(filename), "File not found: %s" % filename

    session = jmath.start_R()
    session('require(splines, quietly=TRUE)')
    for filename in lib_files:
        session('source("%s")' % filename)

    GLOBAL_R = session
    return GLOBAL_R
def main():
    """Draw a line plot of the expression of selected genes using R.

    Reads the expression file named on the command line with arrayio,
    keeps the rows selected by --gene_names (or all rows with
    --all_genes; fewer than 50 rows are allowed), optionally writes a
    Prism-formatted file, and plots one line per gene to plot_file
    through R's bitmap device (PNG, or PDF if the name ends with .pdf).

    Invalid argument values are reported with parser.error or with
    AssertionError.

    NOTE(review): `os` is used below but is not imported in this
    function; presumably it is imported at module level -- confirm.
    """
    import argparse
    import math

    import arrayio
    from genomicode import config
    from genomicode import colorlib
    from genomicode import jmath
    from genomicode.jmath import R_fn, R_var, R_equals

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("expression_file", help="Gene expression file.")
    parser.add_argument(
        "plot_file", help="Name of image file, e.g. outfile.png. "
        "Will generate PNG format by default. If this file name ends with "
        ".pdf, will generate a PDF file instead.")
    parser.add_argument("-v", "--verbose", action="store_true", help="")
    parser.add_argument("--prism_file",
                        help="Save result in Prism-formatted file.")

    group = parser.add_argument_group(title="Genes")
    group.add_argument(
        "--gene_names", default=[], action="append",
        help="Comma-separated list of IDs (e.g. probes, gene names) "
        "to include.")
    group.add_argument("--all_genes", default=False, action="store_true",
                       help="Plot all genes in the file.")

    group = parser.add_argument_group(title="Plot")
    group.add_argument("--title", default=None,
                       help="Put a title on the plot.")
    group.add_argument("--height", default=None, type=int,
                       help="Height (in pixels) of the plot.")
    group.add_argument("--width", default=None, type=int,
                       help="Width (in pixels) of the plot.")
    group.add_argument(
        "--mar_left", default=1.0, type=float,
        help="Scale margin at left of plot. Default 1.0 (no scaling).")
    group.add_argument(
        "--mar_bottom", default=1.0, type=float,
        help="Scale margin at bottom of plot. Default 1.0 (no scaling).")
    group.add_argument(
        "--xlabel_size", default=1.0, type=float,
        help="Scale the size of the labels on X-axis. Default 1.0.")
    group.add_argument("--xlabel_off", default=False, action="store_true",
                       help="Turn off the X labels.")
    group.add_argument("--ylabel", help="Label the Y axis.")
    group.add_argument("--gene_name_header",
                       help="Header for gene names to be used in the legend.")
    group.add_argument("--yaxis_starts_at_0", action="store_true",
                       help="Y-axis should start at 0.")
    group.add_argument("--legend_off", action="store_true",
                       help="Do not draw legend.")
    group.add_argument("--horizontal_lines", action="store_true",
                       help="Draw horizontal lines.")

    # Parse the input arguments.
    args = parser.parse_args()
    if not os.path.exists(args.expression_file):
        parser.error("I could not find file %s." % args.expression_file)
    if args.width is not None:
        assert args.width > 10, "too small"
        assert args.width < 4096 * 16, "width too big"
    if args.height is not None:
        assert args.height > 10, "too small"
        assert args.height < 4096 * 16, "height too big"
    assert args.gene_names or args.all_genes, \
        "Please specify some genes to plot."
    assert args.mar_bottom > 0 and args.mar_bottom < 10
    assert args.mar_left > 0 and args.mar_left < 10
    assert args.xlabel_size > 0 and args.xlabel_size < 10

    # Default image size, in pixels.
    height = args.height or 1600
    width = args.width or 1600

    MATRIX = arrayio.read(args.expression_file)
    assert MATRIX.nrow() and MATRIX.ncol(), "Empty matrix."

    # I holds the indexes of the rows (genes) to plot.  --gene_names
    # takes precedence over --all_genes.
    I = None
    if args.gene_names:
        I = find_gene_names(MATRIX, args.gene_names)
    elif args.all_genes:
        I = range(MATRIX.nrow())
    assert I, "No genes found."
    assert len(I) < 50, "Too many genes."
    MATRIX = MATRIX.matrix(I, None)

    # Find the gene names for the legend.
    if args.gene_name_header:
        h = args.gene_name_header
        assert h in MATRIX.row_names(), "Missing header: %s" % h
        gene_names = MATRIX.row_names(h)
    else:
        gene_names = [
            get_pretty_gene_name(MATRIX, i) for i in range(MATRIX.nrow())
        ]
    assert len(gene_names) == MATRIX.nrow()

    if args.prism_file:
        write_prism_file(args.prism_file, MATRIX, gene_names)

    # Start R and set up the environment.
    R = jmath.start_R()
    path = config.changlab_Rlib
    plotlib = os.path.join(path, "plotlib.R")
    assert os.path.exists(plotlib), "I cannot find: %s" % plotlib
    R_fn("source", plotlib)

    # Titles and axis labels.  R's NA / "" mean "draw nothing".
    main = R_var("NA")
    if args.title:
        main = args.title
    sub = ""
    xlab = ""
    #ylab = "Gene Expression"
    ylab = ""
    if args.ylabel:
        ylab = args.ylabel
    labels = jmath.R_var("FALSE")
    #labels = MATRIX.col_names(arrayio.COL_ID)
    col = R_var("NULL")

    # Axis limits: one unit of padding around the data on the Y axis
    # and one extra position on the right of the X axis.
    xlim = [1, MATRIX.ncol() + 1]
    y_max = jmath.max(jmath.max(MATRIX._X))
    y_min = jmath.min(jmath.min(MATRIX._X))
    ylim = [y_min - 1, y_max + 1]
    if args.yaxis_starts_at_0:
        assert y_max > 0
        ylim[0] = 0
    if not args.xlabel_off:
        labels = MATRIX.col_names(arrayio.COL_ID)

    lwd = 2
    las = 3  # vertical labels
    # Tick positions are only needed when there are labels to draw.
    at = R_var("NULL")
    if labels != jmath.R_var("FALSE"):
        at = range(1, len(labels) + 1)
    cex_labels = 1 * args.xlabel_size
    cex_legend = 1
    cex_lab = 1.5
    cex_sub = 1.5

    # One color per gene, converted to "#RRGGBB" strings for R.
    x = colorlib.bild_colors(len(gene_names))
    x = [colorlib.rgb2hex(x) for x in x]
    x = [x.replace("0x", "#") for x in x]
    col = x

    # Copy the data into the R session.
    R_equals(MATRIX._X, "X")
    R_equals(labels, "labels")
    R_equals(at, "at")

    # Open the output device.
    bm_type = "png16m"
    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    R_fn("bitmap", args.plot_file, type=bm_type,
         height=height, width=width, units="px", res=300)

    # Set the margins.
    x = 5 * 1.2 * args.mar_bottom, 4 * 1.2 * args.mar_left, 4, 2
    mar = [x + 0.1 for x in x]
    # The previous graphics parameters are saved in the R variable
    # "op" and restored just before the device is closed.
    R_fn("par", mar=mar, RETVAL="op")

    # Draw an empty plot, paint the plot area white, then add the box,
    # the axes and the titles.
    R_fn("plot", R_var("NA"), type="n", axes=R_var("FALSE"),
         xlab="", ylab="", xlim=xlim, ylim=ylim)
    jmath.R('usr <- par("usr")')
    jmath.R('rect(usr[1], usr[3], usr[2], usr[4], col="#FFFFFF")')
    jmath.R_fn("box", lwd=lwd)
    jmath.R_fn("axis", 1, lwd=lwd, labels=R_var("labels"), at=R_var("at"),
               las=las, **{"cex.axis": cex_labels})
    jmath.R_fn("axis", 2, lwd=lwd, **{"cex.axis": 1.5})
    jmath.R_fn("title", main=main, sub=sub, xlab=xlab, ylab=ylab, **{
        "cex.lab": cex_lab,
        "cex.main": 2.0,
        "cex.sub": cex_sub
    })

    # One line with point markers per gene.
    for i in range(MATRIX.nrow()):
        y = MATRIX._X[i]
        x = range(1, len(y) + 1)
        R_fn("lines", x, y, lwd=lwd, col=col[i])
        R_fn("points", x, y, pch=19, cex=1, col=col[i])

    # Optional dotted gray line at every integer inside the Y limits.
    if args.horizontal_lines:
        y1 = int(math.ceil(ylim[0]))
        y2 = int(math.floor(ylim[1]))
        for y in range(y1, y2 + 1):
            R_fn("lines", (1, MATRIX.ncol() + 1), (y, y),
                 lty=3, col="#A0A0A0")

    if not args.legend_off:
        R_fn("legend", "bottomleft", legend=gene_names, fill=col, cex=1,
             inset=0.05, **{"box.lwd": 1.5})

    # Restore the graphics parameters and close the device.
    R_fn("par", R_var("op"))
    R_fn("dev.off")
def main():
    """Draw a histogram of one column of a tab-delimited file using R.

    Reads datafile with AnnotationMatrix, converts the values of the
    column named by `header` to floats, and draws their histogram to
    plot_file through R's bitmap device (PNG, or PDF if the name ends
    with .pdf).  With --prism_file, the object returned by R's hist()
    is passed to write_prism_file.

    Invalid argument values are reported with parser.error or with
    AssertionError.
    """
    import os
    import argparse
    from genomicode import jmath
    from genomicode import AnnotationMatrix

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("datafile", help="Tab-delimited data file.")
    parser.add_argument("header", help="Which column contains data to plot.")
    parser.add_argument(
        "plot_file", help="Name of image file, e.g. outfile.png. "
        "Will generate PNG format by default. If this file name ends with "
        ".pdf, will generate a PDF file instead.")
    parser.add_argument(
        "--prism_file", help="Write Prism-formatted results to this file.")
    parser.add_argument(
        "--ignore_missing_values", action="store_true",
        help="Ignore missing values in the file.")

    group = parser.add_argument_group(title="Calculations")
    group.add_argument(
        "--breaks_seq",
        help="Set the breakpoints. Format: <start>,<stop>,<skip>.")
    group.add_argument(
        "--num_breaks", type=int, help="Number of breakpoints.")
    group.add_argument(
        "--ymax", type=int, help="Set the maximum value for the Y axis.")

    group = parser.add_argument_group(title="Plot Labels")
    group.add_argument("--title", help="Put a title on the plot.")
    group.add_argument("--xlab", help="Label the X-axis.")
    group.add_argument(
        "--xlabel_size", default=1.0, type=float,
        help="Scale the size of the labels on X-axis. Default 1.0.")
    group.add_argument(
        "--xlabel_off", action="store_true", help="Do not label the X axis.")
    group.add_argument(
        "--ylabel_off", action="store_true", help="Do not label the Y axis.")
    group.add_argument(
        "--xtick_label_off", action="store_true",
        help="Do not draw the tick labels on the X axis.")

    group = parser.add_argument_group(title="Colors")
    group.add_argument(
        "--bar_color", help="Set the color of the bars. Default #FFFFFF")
    x = _fmt_palettes()
    group.add_argument(
        "--bar_palette",
        help="Color the bars according to a palette: %s." % x)
    group.add_argument(
        "--symmetric_palette", action="store_true",
        help="Make the color symmetric.")

    group = parser.add_argument_group(title="Appearance")
    group.add_argument(
        "--height", type=int, help="Height (in pixels) of the plot.")
    group.add_argument(
        "--width", type=int, help="Width (in pixels) of the plot.")
    group.add_argument(
        "--mar_left", default=1.0, type=float,
        help="Scale margin at left of plot. Default 1.0 (no scaling).")
    group.add_argument(
        "--mar_bottom", default=1.0, type=float,
        help="Scale margin at bottom of plot. Default 1.0.")
    group.add_argument(
        "--xaxis_off", action="store_true", help="Do not show the X axis.")
    group.add_argument(
        "--yaxis_off", action="store_true", help="Do not show the Y axis.")

    # Parse the input arguments.
    args = parser.parse_args()
    if not os.path.exists(args.datafile):
        parser.error("File not found: %s" % args.datafile)
    # The breakpoints are given either explicitly or as a count.
    assert not (args.breaks_seq and args.num_breaks)
    if args.num_breaks:
        assert args.num_breaks >= 2 and args.num_breaks <= 1000
    if args.width is not None:
        assert args.width > 10, "too small"
        assert args.width < 4096*16, "width too big"
    if args.height is not None:
        assert args.height > 10, "too small"
        assert args.height < 4096*16, "height too big"
    assert args.mar_bottom > 0 and args.mar_bottom < 10
    assert args.mar_left > 0 and args.mar_left < 10
    assert args.xlabel_size > 0 and args.xlabel_size < 10
    # A single bar color and a palette are mutually exclusive, and a
    # symmetric palette requires a palette.
    assert not (args.bar_color and args.bar_palette)
    assert not args.symmetric_palette or args.bar_palette
    assert args.ymax is None or args.ymax > 0

    # Default image size, in pixels.
    height = args.height or 2400
    width = args.width or 3200

    MATRIX = AnnotationMatrix.read(args.datafile, False)
    assert MATRIX.num_headers() and MATRIX.num_annots(), "Empty matrix."
    assert args.header in MATRIX.headers, "header not found: %s" % args.header

    # Pull out the values for the histogram.
    x = MATRIX[args.header]
    if args.ignore_missing_values:
        # Drop blank entries; otherwise float() below fails on them.
        x = [x for x in x if x.strip()]
    values = map(float, x)
    value_min = value_max = None

    # Start R and set up the environment.
    R = jmath.start_R()

    # Titles and axis labels.  R's NA / "" mean "draw nothing".
    main = jmath.R_var("NA")
    if args.title:
        main = args.title
    sub = ""
    xlab = ""
    if args.xlab:
        xlab = args.xlab
    ylab = "Frequency"
    xtick_labels = jmath.R_var("TRUE")
    ytick_labels = jmath.R_var("TRUE")
    if args.xlabel_off:
        xlab = ""
    if args.ylabel_off:
        ylab = ""
    if args.xtick_label_off:
        xtick_labels = jmath.R_var("FALSE")

    # "breaks" is the name of an algorithm, the R variable that holds
    # the explicit breakpoints, or the requested number of breaks.
    breaks = "Sturges"
    if args.breaks_seq:
        breaks = _parse_breaks_seq(args.breaks_seq)
        value_min, value_max = min(breaks), max(breaks)
        jmath.R_equals(breaks, "breaks")
        breaks = jmath.R_var("breaks")
    if args.num_breaks:
        breaks = args.num_breaks

    # With explicit breakpoints, keep only the values that fall in
    # [value_min, value_max).
    if value_min is not None:
        values = [x for x in values if x >= value_min]
    if value_max is not None:
        values = [x for x in values if x < value_max]

    lwd = 2
    cex_lab = 1.5
    cex_main = 2.0
    cex_sub = 1.5
    ylim = jmath.R_var("NULL")
    if args.ymax is not None:
        ylim = [0, args.ymax]

    assert values
    jmath.R_equals(values, "X")

    # Figure out the colors.  Do it after X is assigned.
    col = jmath.R_var("NULL")
    if args.bar_color:
        assert args.bar_color.startswith("#")
        col = args.bar_color
    elif args.bar_palette:
        # Figure out how many breaks there are.  Number of bars is num
        # breaks + 1.
        # NOTE(review): R's hist() returns one more break than it draws
        # bars, so len(breaks) + 1 looks like two colors too many --
        # confirm against _make_col_palette before changing.
        jmath.R_fn(
            "hist", jmath.R_var("X"), breaks=breaks,
            plot=jmath.R_var("FALSE"), RETVAL="x")
        breaks = [x for x in R["x"].rx2("breaks")]
        num_bars = len(breaks) + 1
        col = _make_col_palette(
            args.bar_palette, num_bars, args.symmetric_palette)

    # Open the output device.
    bm_type = "png16m"
    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    jmath.R_fn(
        "bitmap", args.plot_file, type=bm_type,
        height=height, width=width, units="px", res=300)

    # Set the margins.
    x = 5*1.2*args.mar_bottom, 4*1.2*args.mar_left, 4, 2
    mar = [x+0.1 for x in x]
    # The previous graphics parameters are saved in the R variable
    # "op" and restored just before the device is closed.
    jmath.R_fn("par", mar=mar, RETVAL="op")

    # Draw the bars without axes or labels; they are added below.  The
    # returned histogram object is kept in the R variable "x".
    jmath.R_fn(
        "hist", jmath.R_var("X"), breaks=breaks, main="", xlab="", ylab="",
        ylim=ylim, axes=jmath.R_var("FALSE"), col=col, RETVAL="x")
    # Make plot area solid white.
    #jmath.R('usr <- par("usr")')
    #jmath.R('rect(usr[1], usr[3], usr[2], usr[4], col="#FFFFFF")')
    #jmath.R_fn(
    #    "hist", jmath.R_var("X"), plot=jmath.R_var("FALSE"),
    #    main=main, xlab="", ylab="", axes=jmath.R_var("FALSE"),
    #    add=jmath.R_var("TRUE"))

    #jmath.R_fn("box", lwd=lwd)
    # x-axis
    if not args.xaxis_off:
        jmath.R_fn(
            "axis", 1, lwd=lwd, labels=xtick_labels, **{ "cex.axis" : 1.5 })
    # y-axis
    if not args.yaxis_off:
        jmath.R_fn(
            "axis", 2, lwd=lwd, labels=ytick_labels, **{ "cex.axis" : 1.5 })
    jmath.R_fn(
        "title", main=main, sub=sub, xlab=xlab, ylab=ylab,
        **{ "cex.lab" : cex_lab, "cex.main" : cex_main, "cex.sub" : cex_sub })

    # Restore the graphics parameters and close the device.
    R("par(op)")
    jmath.R_fn("dev.off")

    if args.prism_file:
        write_prism_file(args.prism_file, R["x"])
def find_diffexp_genes(
    outfile, gmt_file, algorithm, paired, MATRIX, geneid_header,
    genename_header, genename_delim, name1, name2, classes,
    filter_fold_change, fold_change, p_cutoff, fdr_cutoff, bonf_cutoff,
    sam_DELTA, sam_qq_file, edger_tagwise_dispersion, num_procs):
    """Find differentially expressed genes with R and write the results.

    outfile             Passed to _write_matrix along with the header
                        and the filtered, sorted result rows.
    gmt_file            If true, path of a GMT file of gene sets to write.
    algorithm           "fold_change", "ttest", "sam", "ebayes",
                        "deseq2" or "edger".
    paired              If true, use the paired version of the algorithm
                        (only available for "ebayes").
    MATRIX              Expression matrix.
    geneid_header       Row headers to use for the gene ID and gene
    genename_header     name.  If false, choose_gene_names picks them.
    genename_delim      Delimiter given to genesetlib.clean_genes for
                        the gene names.
    name1, name2        Names of class 0 and class 1.
    classes             One entry per column of MATRIX: 0, 1, or None.
                        Columns whose class is not 0 or 1 are dropped.
    filter_fold_change  If not None, given to the R function as
                        FOLD.CHANGE.
    fold_change         If not None, keep only the rows with
                        abs(log_2 fold change) >= log_2(fold_change).
    p_cutoff, fdr_cutoff, bonf_cutoff
                        If not None, keep only the rows whose value in
                        the "p.value", "FDR" or "Bonf" column is below
                        the cutoff.
    sam_DELTA           DELTA for SAM.
    sam_qq_file         If true (and algorithm is "sam"), file for the
                        SAM plot.
    edger_tagwise_dispersion
                        Sets tagwise.dispersion for edgeR.
    num_procs           NPROCS for "ttest" and "deseq2".

    Raises AssertionError for unknown algorithms, too few samples, or
    missing result columns.
    """
    # classes must be 0, 1, None.
    import os
    import sys
    import math
    import StringIO
    import warnings

    from rpy2 import rinterface

    from genomicode import config
    from genomicode import jmath
    from genomicode import genesetlib

    algorithm2function_unpaired = {
        "fold_change": "find.de.genes.fc",
        "ttest": "find.de.genes.ttest",
        "sam": "find.de.genes.sam",
        "ebayes": "find.de.genes.ebayes",
        "deseq2": "find.de.genes.deseq2",
        "edger": "find.de.genes.edgeR",
    }
    algorithm2function_paired = {
        "ebayes": "find.de.genes.paired.ebayes",
    }
    algorithm2function = algorithm2function_unpaired
    if paired:
        algorithm2function = algorithm2function_paired
        assert algorithm in algorithm2function_paired, \
            "No paired version of %s" % algorithm
    assert algorithm in algorithm2function, "Unknown algorithm: %s" % algorithm

    # Select the relevant columns from MATRIX.
    I = [i for (i, x) in enumerate(classes) if x in [0, 1]]
    assert len(I)
    MATRIX = MATRIX.matrix(None, I)
    classes = [classes[i] for i in I]

    # All algorithms except "fold_change" and "deseq2" need at least 2
    # samples of each class.
    counts = {}
    for x in classes:
        counts[x] = counts.get(x, 0) + 1
    assert sorted(counts) == [0, 1], "Only one class represented."
    if algorithm not in ["fold_change", "deseq2"]:
        assert counts[0] >= 2, "There must be at least 2 of each class."
        assert counts[1] >= 2, "There must be at least 2 of each class."

    names = [name1, name2]
    X = MATRIX._X
    Y = [names[x] for x in classes]
    sample_name = None
    if MATRIX.col_names():
        sample_name = MATRIX.col_names(MATRIX.col_names()[0])

    x = choose_gene_names(MATRIX)
    if not geneid_header:
        geneid_header = x[0]
    if not genename_header:
        genename_header = x[1]
    assert not geneid_header or geneid_header in MATRIX.row_names()
    assert not genename_header or genename_header in MATRIX.row_names()

    R = jmath.start_R()
    de_lib = os.path.join(config.changlab_Rlib, "diffexp.R")
    stat_lib = os.path.join(config.changlab_Rlib, "statlib.R")
    assert os.path.exists(de_lib), "I could not find file: %s" % de_lib
    assert os.path.exists(stat_lib), "I could not find file: %s" % stat_lib
    R('source("%s")' % de_lib)
    R('source("%s")' % stat_lib)

    jmath.R_equals(X, "X")
    jmath.R_equals(Y, "Y")
    if sample_name:
        jmath.R_equals(sample_name, "sample.name")
        jmath.R('colnames(X) <- sample.name')

    geneid = genenames = None
    if geneid_header:
        geneid = MATRIX.row_names(geneid_header)
        jmath.R_equals(geneid, "geneid")
    if genename_header:
        genenames = MATRIX.row_names(genename_header)
        jmath.R_equals(genenames, "genenames")

    # Set up the arguments.
    args = ["X", "Y"]
    if algorithm == "sam":
        args.append("%g" % sam_DELTA)
    if geneid:
        args.append("geneid=geneid")
    if genenames:
        args.append("genenames=genenames")
    # Pass the fold change to the algorithm, because it can affect the
    # multiple hypothesis correction.
    if filter_fold_change is not None:
        args.append("FOLD.CHANGE=%g" % filter_fold_change)
    if algorithm in ["ttest", "deseq2"]:
        args.append("NPROCS=%d" % num_procs)
    if algorithm not in ["sam", "fold_change"]:
        args.append("filter.p05=FALSE")
    if algorithm == "edger":
        if edger_tagwise_dispersion:
            args.append("tagwise.dispersion=TRUE")
        else:
            args.append("tagwise.dispersion=FALSE")

    # Prevent SAM from writing junk to the screen.  Restore stdout in a
    # finally clause so that a failure in R does not leave the whole
    # process with a redirected sys.stdout.
    handle = StringIO.StringIO()
    old_stdout = sys.stdout
    sys.stdout = handle
    try:
        # Call the proper R function.  DESeq2 throws off a lot of
        # warnings.  Turn them off temporarily.
        fn = algorithm2function[algorithm]
        x = ", ".join(args)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            R("x <- %s(%s)" % (fn, x))
            R("DATA <- x$DATA")
            DATA_R = R["DATA"]
    finally:
        sys.stdout = old_stdout

    # Write out a QQ file for SAM.
    if algorithm == "sam" and sam_qq_file:
        R('S <- x$S')
        jmath.R_fn("bitmap", sam_qq_file, type="png256",
                   height=1600, width=1600, units="px", res=300)
        jmath.R_fn("samr.plot", jmath.R_var("S"), sam_DELTA)
        jmath.R_fn("dev.off")

    # Convert this DataFrame into a Python object.  Columns of floats
    # can be StrVector objects if there are NA embedded within them.
    # NA are special objects of either type
    # rpy2.rinterface.NACharacterType or type
    # rpy2.rinterface.NARealType.
    tDATA_py = []
    header = [DATA_R.colnames[i] for i in range(DATA_R.ncol)]
    for col_R in DATA_R:  # iterate over columns
        col_py = [col_R[i] for i in range(len(col_R))]
        col_class = col_R.__class__.__name__
        if col_class == "StrVector":
            pass
        elif col_class == "FloatVector":
            col_py = [float(x) for x in col_py]
        elif col_class == "IntVector":
            col_py = [int(x) for x in col_py]
        tDATA_py.append(col_py)
    DATA_py = jmath.transpose(tDATA_py)

    # Convert NA to None.
    na_types = [rinterface.NACharacterType, rinterface.NARealType]
    for row in DATA_py:
        for j in range(len(row)):
            if type(row[j]) in na_types:
                row[j] = None

    # Sort by increasing p-value, then decreasing fold change.
    name = "p.value"
    sort_direction = 1
    if name not in header:
        name = "Log_2 Fold Change"
        sort_direction = -1
    assert name in header, 'I could not find the "%s" column.' % name
    I = header.index(name)
    values = [x[I] for x in DATA_py]
    for i in range(len(values)):
        if values[i] is None:
            # Missing values get a large placeholder magnitude.
            values[i] = sort_direction * 1E10
        else:
            values[i] = sort_direction * float(values[i])
    schwartz = list(zip(values, DATA_py))
    schwartz.sort()
    DATA_py = [x[-1] for x in schwartz]

    # Filter based on user criteria.
    if fold_change is not None:
        log_2_fc = math.log(fold_change, 2)
        name = "Log_2 Fold Change"
        assert name in header, 'I could not find the "%s" column.' % name
        I = header.index(name)
        DATA_py = [
            x for x in DATA_py
            if x[I] is not None and abs(x[I]) >= log_2_fc]
    if p_cutoff is not None:
        name = "p.value"
        assert name in header, 'I could not find the "%s" column.' % name
        I = header.index(name)
        DATA_py = [
            x for x in DATA_py
            if x[I] is not None and float(x[I]) < p_cutoff]
    if fdr_cutoff is not None:
        name = "FDR"
        # This might be missing if all the genes have already been
        # filtered.
        if name in header:
            I = header.index(name)
            DATA_py = [
                x for x in DATA_py
                if x[I] is not None and float(x[I]) < fdr_cutoff]
    if bonf_cutoff is not None:
        name = "Bonf"
        assert name in header, 'I could not find the "%s" column.' % name
        I = header.index(name)
        DATA_py = [
            x for x in DATA_py
            if x[I] is not None and float(x[I]) < bonf_cutoff]

    # Write the results (even when no gene passed the filters).
    _write_matrix(outfile, header, DATA_py)

    # Write out the gene sets in GMT format, if requested.
    if not gmt_file:
        return

    assert "Direction" in header, 'I could not find the "Direction" column.'
    assert "Gene ID" in header, 'I could not find the "Gene ID" column.'
    assert "Gene Name" in header, 'I could not find the "Gene Name" column.'
    I_direction = header.index("Direction")
    I_geneid = header.index("Gene ID")
    I_genename = header.index("Gene Name")

    # "Higher in <name1>"
    # "Higher in <name2>"
    # "SAME"
    prefix = "Higher in "
    possible_directions = [
        "Higher in %s" % name1, "Higher in %s" % name2, "SAME"]
    directions = [x[I_direction] for x in DATA_py]
    for x in directions:
        assert x.startswith(prefix) or x == "SAME"
        assert x in possible_directions

    # List of (row index in DATA_py, (<SAMPLE>, [UP|DN])).  The row
    # index is stored explicitly because "SAME" rows are skipped; the
    # position in this list therefore does not identify the row.
    row_genesets = []
    for i, x in enumerate(directions):
        if x == "SAME":
            continue
        # Strip only the leading prefix (a name may contain it too).
        s = x[len(prefix):]
        assert s in [name1, name2]
        # Make genesets relative to name2.  (Assume name1 is control).
        d = "UP"
        if s == name1:
            s, d = name2, "DN"
        row_genesets.append((i, (s, d)))
    genesets_all = sorted(set([gs for (i, gs) in row_genesets]))

    with open(gmt_file, 'w') as outhandle:
        for geneset in genesets_all:
            sample, direct = geneset
            I = [i for (i, gs) in row_genesets if gs == geneset]
            gid = [DATA_py[i][I_geneid] for i in I]
            gn = [DATA_py[i][I_genename] for i in I]
            # gn might be float.  genesetlib expects array of strings.
            gid = genesetlib.clean_genes(gid)
            gn = genesetlib.clean_genes(gn, delim=genename_delim)
            # <SAMPLE>_[ID|NAME]_[UP|DN]
            if gid:
                x = "%s_%s_%s" % (sample, "ID", direct)
                x = [x, "na"] + gid
                outhandle.write("\t".join(x) + "\n")
            if gn:
                x = "%s_%s_%s" % (sample, "NAME", direct)
                x = [x, "na"] + gn
                outhandle.write("\t".join(x) + "\n")
def calc_association(phenotypes, scores, ignore_insufficient_groups):
    # Test whether the scores differ between phenotype groups (R aov).
    #
    # Only samples with both a phenotype and a score are used.
    # Returns a dictionary with the keys:
    # n             Number of samples used.
    # m             Number of groups.
    # scores        n-list of <float>
    # delta         None, or <float> (mean of group 1 - mean of group
    #               0) when there are exactly 2 groups.
    # phenotypes    n-list of <string>
    # groups        n-list of <int>, indexes into group_names
    # group_names   m-list of <string> (unique, naturally sorted)
    # num_samples   dict of <group (int)> : <int>
    # mean_score    dict of <group (int)> : <float>
    # p_value       <float>
    # relationship  <string>, e.g. "Higher in <group name>"
    #
    # Returns None instead if there are fewer than 2 groups and
    # ignore_insufficient_groups is a true value.
    from genomicode import jmath
    from genomicode import sortlib

    # Indexes of the samples that have both pieces of information.
    with_phenotype = set(
        [i for (i, pheno) in enumerate(phenotypes) if pheno])
    with_score = set(
        [i for (i, score) in enumerate(scores) if score != ""])
    keep = sorted(with_phenotype & with_score)
    assert keep, "No valid samples."
    phenotypes = [phenotypes[i] for i in keep]
    scores = [float(scores[i]) for i in keep]

    # Each unique phenotype is a group.
    group_names = sortlib.sort_natural({}.fromkeys(phenotypes))
    if len(group_names) < 2 and ignore_insufficient_groups:
        return None
    assert len(group_names) >= 2, "Need at least 2 groups (%s)." % \
        str(group_names)
    num_groups = len(group_names)
    groups = [group_names.index(pheno) for pheno in phenotypes]

    # Collect the scores of each group.
    group2scores = {}  # group -> list of scores
    for group, score in zip(groups, scores):
        if group not in group2scores:
            group2scores[group] = []
        group2scores[group].append(score)

    # Indicator matrix: one row per sample, one column per group.
    indicators = []
    for group in groups:
        row = [0] * num_groups
        row[group] = 1
        indicators.append(row)

    jmath.start_R()
    jmath.R_equals(indicators, "x")
    jmath.R_equals(scores, "y")
    jmath.R("m <- aov(y~x)")
    p_value = jmath.R('summary(m)[[1]][["Pr(>F)"]][1]')[0]

    # Summary statistics for each group.
    num_samples = {}
    for group in group2scores:
        num_samples[group] = len(group2scores[group])
    mean_score = {}
    for group in group2scores:
        mean_score[group] = jmath.mean(group2scores[group])

    # The difference is only defined for exactly two groups.
    delta = None
    if num_groups == 2:
        delta = mean_score[1] - mean_score[0]

    # Name the group with the highest mean score.
    assert num_groups >= 2
    comparative = "Highest" if num_groups > 2 else "Higher"
    relationship = ""
    high_score = None
    for group in mean_score:
        score = mean_score[group]
        if high_score is None or not score <= high_score:
            high_score = score
            relationship = "%s in %s" % (comparative, group_names[group])

    SCORE = {}
    SCORE["n"] = len(scores)
    SCORE["m"] = num_groups
    SCORE["scores"] = scores
    SCORE["phenotypes"] = phenotypes
    SCORE["groups"] = groups
    SCORE["group_names"] = group_names
    SCORE["num_samples"] = num_samples
    SCORE["mean_score"] = mean_score
    SCORE["delta"] = delta
    SCORE["p_value"] = p_value
    SCORE["relationship"] = relationship
    return SCORE
def main():
    """Command-line entry point: draw a beeswarm plot of a Prism file.

    Reads the series from the data file with read_prism_file, draws
    them with the R "beeswarm" package, and writes the image to
    plot_file (PNG by default, PDF if the name ends with ".pdf").
    Out-of-range option values fail an assertion.
    """
    import os
    import argparse

    from genomicode import jmath
    from genomicode import hashlib

    parser = argparse.ArgumentParser(description="")
    parser.add_argument(
        "datafile",
        help="Tab-delimited text file in Prism format. "
        "Each column is a series. First row is header.")
    parser.add_argument(
        "plot_file",
        help="Name of image file, e.g. outfile.png. "
        "Will generate PNG format by default. If this file name ends with "
        ".pdf, will generate a PDF file instead.")

    group = parser.add_argument_group(title="General Appearance")
    group.add_argument(
        "--no_box", action="store_true",
        help="Turn off the box around the plot.")
    group.add_argument(
        "--height", type=int, help="Height (in pixels) of the plot.")
    group.add_argument(
        "--width", type=int, help="Width (in pixels) of the plot.")
    group.add_argument(
        "--mar_left", default=1.0, type=float,
        help="Scale margin at left of plot. Default 1.0 (no scaling).")
    group.add_argument(
        "--mar_bottom", default=1.0, type=float,
        help="Scale margin at bottom of plot. Default 1.0.")

    group = parser.add_argument_group(title="Plot Labels")
    group.add_argument("--title", help="Put a title on the plot.")
    group.add_argument("--xlab", help="Label the X-axis.")
    group.add_argument("--ylab", help="Label the Y-axis.")

    # NOTE(review): the legend options are parsed and validated, but
    # no legend is drawn below.
    group = parser.add_argument_group(title="Legend")
    group.add_argument(
        "--add_legend", action="store_true",
        help="Add a legend to the plot.")
    group.add_argument("--legend_inset", type=float, default=0.05, help="")
    LEGEND_LOCATIONS = [
        "bottomright", "bottom", "bottomleft", "left", "topleft", "top",
        "topright", "right", "center",
    ]
    group.add_argument(
        "--legend_loc", choices=LEGEND_LOCATIONS,
        help="Where to draw the legend.")

    group = parser.add_argument_group(title="Point Appearance")
    group.add_argument(
        "--scale_points", default=1.0, type=float,
        help="Scale the size of the points. Default 1.0")
    # NOTE(review): --default_color is validated but is not passed on
    # to the plot.
    group.add_argument(
        "--default_color", help="Default color of points. Format: #000000.")

    # Parse the input arguments.
    args = parser.parse_args()
    if not os.path.exists(args.datafile):
        parser.error("File not found: %s" % args.datafile)
    if args.width is not None:
        assert args.width > 10, "too small"
        assert args.width < 4096 * 16, "width too big"
    if args.height is not None:
        assert args.height > 10, "too small"
        assert args.height < 4096 * 16, "height too big"
    assert args.mar_bottom > 0 and args.mar_bottom < 10
    assert args.mar_left > 0 and args.mar_left < 10
    assert args.legend_inset >= 0 and args.legend_inset < 10
    if args.legend_loc is None:
        args.legend_loc = "bottomright"
    assert args.scale_points > 0 and args.scale_points < 20
    if args.default_color:
        assert len(args.default_color) == 7
        assert args.default_color[0] == "#"

    # Read the data file.
    # List of (name, values).
    MATRIX = read_prism_file(args.datafile)

    height = args.height or 2400
    width = args.width or 3200

    # Start R and set up the environment.
    R = jmath.start_R()
    R("library(beeswarm)")

    plot_title = jmath.R_var("NA")
    if args.title:
        plot_title = args.title
    sub = ""
    xlab = ""
    if args.xlab:
        xlab = args.xlab
    # Each axis label depends only on its own option.
    ylab = ""
    if args.ylab:
        ylab = args.ylab

    lwd_box = 2
    cex = 1.0 * args.scale_points
    cex_lab = 1.5
    cex_main = 2.0
    cex_sub = 1.0

    bm_type = "png16m"
    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    jmath.R_fn(
        "bitmap", args.plot_file, type=bm_type, height=height, width=width,
        units="px", res=300)

    # Set the margins.
    margins = 5 * 1.2 * args.mar_bottom, 4 * 1.2 * args.mar_left, 4, 2
    mar = [m + 0.1 for m in margins]
    jmath.R_fn("par", mar=mar, RETVAL="op")

    # Collect the series into an R list keyed by the hashed title.
    R("X <- list()")
    for title, values in MATRIX:
        title_h = hashlib.hash_var(title)
        jmath.R_equals(values, "x")
        R('X[["%s"]] <- x' % title_h)

    keywds = {
        "cex.axis": cex_lab,   # Y-axis
        "cex.names": cex_lab,  # X-axis
    }
    jmath.R_fn(
        "beeswarm", jmath.R_var("X"), main="", xlab="", ylab="",
        pch=19, cex=cex, RETVAL="x", **keywds)

    if not args.no_box:
        jmath.R_fn("box", lwd=lwd_box)
    jmath.R_fn(
        "title", main=plot_title, sub=sub, xlab=xlab, ylab=ylab,
        **{"cex.lab": cex_lab, "cex.main": cex_main, "cex.sub": cex_sub})
    R("par(op)")
    jmath.R_fn("dev.off")
def sam(X, Y, genenames, delta, foldchange, pngfig):
    """Run a two-class unpaired SAM analysis with the R samr package.

    X is a matrix slice, Y is the label list (one label per column of
    X).  delta and foldchange are handed to samr as the delta value and
    the minimum fold change.  The SAM plot is written to pngfig.

    Returns (gene_ids, [scores, numerators, denominators, foldchanges,
    q_values]).  Up-regulated genes come first, followed by the
    down-regulated ones.  The q-values reported by samr are divided by
    100.
    """
    assert len(X[0]) == len(Y), 'X and Y should be equal length'
    R = jmath.start_R()
    jmath.R_equals_matrix(X, 'x')
    jmath.R_equals(genenames, 'genenames')
    jmath.R_equals(Y, 'y')
    jmath.R_equals(foldchange, 'foldchange')
    jmath.R_equals(delta, 'DELTA')

    # NOTE(review): length(x) of an R matrix is rows*cols -- confirm
    # that nrow(x) was not intended for geneid.
    analysis = [
        'library(samr)',
        'D<-list(x=x,y=y,logged2=TRUE,geneid = 1:length(x),'
        'genenames=genenames)',
        'S<-samr(D,resp.type="Two class unpaired",nperms=100)',
        'DTAB<-samr.compute.delta.table(S,min.foldchange=foldchange)',
        'SIG<-samr.compute.siggenes.table(S,DELTA,D,DTAB,'
        'min.foldchange=foldchange)',
        'up<-SIG$ngenes.up',
        'lo<-SIG$ngenes.lo',
        'library(R.utils)',
    ]
    for command in analysis:
        R(command)

    # Draw the SAM plot.
    R('bitmap("' + pngfig + '",type="png256")')
    R('samr.plot(S,DELTA)')
    R('title(main=paste("DELTA=",DELTA))')
    R('dev.off()')

    import rpy2.robjects as robjects
    R = robjects.r
    num_up = R['up']
    num_lo = R['lo']

    # (prefix of the R variable, column of the significant genes table)
    columns = [
        ('geneID', 'Gene ID'),
        ('Score', 'Score(d)'),
        ('Numerator', 'Numerator(r)'),
        ('Denominator', 'Denominator(s+s0)'),
        ('foldchange', 'Fold Change'),
        ('q', 'q-value(%)'),
    ]

    gene_ids = []
    scores = []
    numerators = []
    denominators = []
    foldchanges = []
    q_values = []
    for count, direction, suffix in [(num_up, 'up', '1'),
                                     (num_lo, 'lo', '2')]:
        if count[0] > 0:
            # Pull each column of the table into its own R variable.
            for prefix, column in columns:
                R('%s%s<-SIG$genes.%s[,"%s"]' % (
                    prefix, suffix, direction, column))
            gene_ids.extend(R['geneID' + suffix])
            scores.extend(R['Score' + suffix])
            numerators.extend(R['Numerator' + suffix])
            denominators.extend(R['Denominator' + suffix])
            foldchanges.extend(R['foldchange' + suffix])
            q_values.extend([float(q) / 100 for q in R['q' + suffix]])
    return gene_ids, [scores, numerators, denominators, foldchanges,
                      q_values]
def plot_boxplot(filename, group_names, group2values, height=None,
                 width=None, cluster=None, title="", subtitle="", sub="",
                 xlab="", ylab="", subtitle_size=1.0, subtitle_line=0.5,
                 subtitle_col="#000000", xlabel_size=1.0, xlabel_off=False,
                 mar_bottom=1.0, mar_left=1.0, mar_top=1.0):
    """Draw a box plot with R and write it to filename.

    group_names   List of the names for each group.
    group2values  Dictionary of group_name -> list of values (None
                  values are skipped).  Can also be a list, which is
                  handed to R as a matrix (values x groups).
    cluster       If given, used to choose the color of each box.
    subtitle      Goes under the title.
    sub           Goes under the plot.
    xlabel_off    If true, no labels are drawn on the X-axis and the
                  bottom margin is reduced.

    The image is PNG by default, PDF if filename ends with ".pdf".
    """
    from genomicode import config
    from genomicode import jmath
    from genomicode import colorlib
    from genomicode import pcalib

    # Start R and set up the environment.
    R = jmath.start_R()
    plotlib = os.path.join(config.changlab_Rlib, "plotlib.R")
    assert os.path.exists(plotlib), "I cannot find: %s" % plotlib
    jmath.R_fn("source", plotlib)

    plot_title = title if title else jmath.R_var("NA")

    xlabel = jmath.R_var("FALSE") if xlabel_off else group_names

    box_colors = jmath.R_var("NULL")
    if cluster is not None:
        box_colors = [
            colorlib.rgb2hex(rgb).replace("0x", "#")
            for rgb in pcalib.choose_colors(cluster)]

    lwd = 2
    las = 3   # vertical labels
    cex_labels = 1.25 * xlabel_size
    cex_xlab = 1.5
    cex_sub = 1.5

    at = jmath.R_var("NULL")
    if xlabel != jmath.R_var("FALSE"):
        at = range(1, len(xlabel) + 1)

    if type(group2values) is list:
        # Is matrix.  Should do more checking here.
        jmath.R_equals(group2values, "X")
    else:
        # Missing values are handled here by dropping the None entries.
        R("X <- list()")
        for index, name in enumerate(group_names):
            values = [v for v in group2values.get(name, [])
                      if v is not None]
            jmath.R_equals(values, "s")
            R("X[[%d]] <- s" % (index + 1))

    jmath.R_equals(xlabel, "labels")
    jmath.R_equals(at, "at")

    if filename.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    else:
        bm_type = "png16m"
    jmath.R_fn(
        "bitmap", filename, type=bm_type, height=height, width=width,
        units="px", res=300)

    # Set the margins.
    # default is 5.1, 4.1, 4.1, 2.1
    label_adjust = 1.0
    if xlabel == jmath.R_var("FALSE"):
        label_adjust = 0.2
    margins = (
        5 * 2.0 * mar_bottom * label_adjust, 4 * 1.2 * mar_left,
        4 * mar_top, 2)
    jmath.R_fn("par", mar=[m + 0.1 for m in margins], RETVAL="op")

    def draw_boxes(**extra):
        # Both passes use the same settings, apart from "add".
        jmath.R_fn(
            "boxplot", jmath.R_var("X"), col=box_colors, main="", xlab="",
            ylab="", axes=jmath.R_var("FALSE"), pch=19, cex=1,
            ylim=jmath.R_var("NULL"), **extra)

    # The first pass sets up the coordinates.  Then make the plot area
    # solid white and draw the boxes again on top of it.
    draw_boxes()
    jmath.R('usr <- par("usr")')
    jmath.R('rect(usr[1], usr[3], usr[2], usr[4], col="#FFFFFF")')
    draw_boxes(add=jmath.R_var("TRUE"))

    jmath.R_fn("box", lwd=lwd)
    jmath.R_fn(
        "axis", 1, lwd=lwd, labels=jmath.R_var("labels"),
        at=jmath.R_var("at"), las=las, **{"cex.axis": cex_labels})
    jmath.R_fn("axis", 2, lwd=lwd, **{"cex.axis": 1.5})
    title_options = {
        "cex.lab": cex_xlab,
        "cex.main": 2.0,
        "cex.sub": cex_sub,
        "col.sub": "#A60400",
    }
    jmath.R_fn(
        "title", main=plot_title, sub=sub, xlab=xlab, ylab=ylab,
        **title_options)
    if subtitle:
        jmath.R_fn(
            "mtext", subtitle, cex=1.0 * subtitle_size, line=subtitle_line,
            col=subtitle_col)
    R("par(op)")
    jmath.R_fn("dev.off")
def main():
    """Command-line entry point: draw a scatter plot from a data file.

    Reads a tab-delimited annotation matrix, plots one or more
    <x_header>;<y_header> series with R, and writes the image to
    plot_file (PNG by default, PDF if the name ends with ".pdf").
    Rows with an empty X or Y value are skipped.  The correlation of
    all plotted points (R's cor and cor.test) is printed to stdout.
    Out-of-range option values fail an assertion.
    """
    import os
    import argparse

    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import colorlib
    from genomicode import pcalib

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("datafile", help="Tab-delimited data file.")
    parser.add_argument(
        "plot_file",
        help="Name of image file, e.g. outfile.png. "
        "Will generate PNG format by default. If this file name ends with "
        ".pdf, will generate a PDF file instead.")

    group = parser.add_argument_group(title="Data Series")
    group.add_argument(
        "--series", action="append",
        help="Add a data series to the plot. At least one series must be "
        "plotted. Format: <x_header>;<y_header>")

    group = parser.add_argument_group(title="General Appearance")
    group.add_argument(
        "--no_box", action="store_true",
        help="Turn off the box around the plot.")
    group.add_argument(
        "--height", type=int, help="Height (in pixels) of the plot.")
    group.add_argument(
        "--width", type=int, help="Width (in pixels) of the plot.")
    group.add_argument(
        "--mar_left", default=1.0, type=float,
        help="Scale margin at left of plot. Default 1.0 (no scaling).")
    group.add_argument(
        "--mar_bottom", default=1.0, type=float,
        help="Scale margin at bottom of plot. Default 1.0.")
    group.add_argument(
        "--log_x", action="store_true",
        help="Plot the X-axis on a log scale.")
    group.add_argument(
        "--log_y", action="store_true",
        help="Plot the Y-axis on a log scale.")
    group.add_argument(
        "--qq", action="store_true",
        help="Make a QQ-plot. Will sort the values to be plotted.")

    group = parser.add_argument_group(title="Plot Labels")
    group.add_argument("--title", help="Put a title on the plot.")
    group.add_argument("--xlab", help="Label the X-axis.")
    group.add_argument("--ylab", help="Label the Y-axis.")
    group.add_argument(
        "--add_regression", action="store_true",
        help="Put a regression line on the plot.")

    group = parser.add_argument_group(title="Legend")
    group.add_argument(
        "--add_legend", action="store_true",
        help="Add a legend to the plot.")
    group.add_argument("--legend_inset", type=float, default=0.05, help="")
    LEGEND_LOCATIONS = [
        "bottomright", "bottom", "bottomleft", "left", "topleft", "top",
        "topright", "right", "center",
    ]
    group.add_argument(
        "--legend_loc", choices=LEGEND_LOCATIONS,
        help="Where to draw the legend.")

    group = parser.add_argument_group(title="Point Appearance")
    group.add_argument(
        "--scale_points", default=1.0, type=float,
        help="Scale the size of the points. Default 1.0")
    group.add_argument(
        "--label_header",
        help="Label each point with the values in this column.")
    group.add_argument(
        "--label_size", type=float,
        help="Scale the size of the labels by this value.")
    group.add_argument(
        "--label_pos", default="top",
        choices=["top", "bottom", "left", "right"],
        help="Where to label the points.")

    group = parser.add_argument_group(title="Line Appearance")
    group.add_argument(
        "--add_lines", action="store_true",
        help="Add lines that connect the points.")
    group.add_argument(
        "--scale_lines", default=1.0, type=float,
        help="Scale the thickness of the lines. Default 1.0")

    group = parser.add_argument_group(title="Identity Line")
    group.add_argument(
        "--add_identity_line", action="store_true",
        help="Add an identity line to the plot.")

    group = parser.add_argument_group(title="Colors")
    group.add_argument(
        "-c", "--cluster", action="append",
        help="Group samples into a cluster (e.g. -c 1-5); 1-based, "
        "inclusive.")
    group.add_argument(
        "--indexes_include_headers", "--iih", action="store_true",
        help="If not given (default), then index 1 is the first row "
        "with data. If given, then index 1 is the very first row "
        "in the file, including the headers.")
    group.add_argument(
        "--default_color", help="Default color of points. Format: #000000.")

    # Parse the input arguments.
    args = parser.parse_args()
    if not os.path.exists(args.datafile):
        parser.error("File not found: %s" % args.datafile)
    if args.width is not None:
        assert args.width > 10, "too small"
        assert args.width < 4096 * 16, "width too big"
    if args.height is not None:
        assert args.height > 10, "too small"
        assert args.height < 4096 * 16, "height too big"
    assert args.mar_bottom > 0 and args.mar_bottom < 10
    assert args.mar_left > 0 and args.mar_left < 10
    assert args.legend_inset >= 0 and args.legend_inset < 10
    if args.legend_loc is None:
        args.legend_loc = "bottomright"
    if args.default_color:
        assert len(args.default_color) == 7
        assert args.default_color[0] == "#"

    MATRIX = AnnotationMatrix.read(args.datafile, False)
    assert MATRIX.num_headers() and MATRIX.num_annots(), "Empty matrix."
    assert args.series, "Need to add a data --series to plot."
    if args.label_header:
        assert args.label_header in MATRIX.headers, \
            "header not found: %s" % args.label_header
    if args.label_size is not None:
        assert args.label_size > 0 and args.label_size <= 20
    assert args.scale_points > 0 and args.scale_points < 20
    assert args.scale_lines > 0 and args.scale_lines < 20

    series = _parse_series(MATRIX, args.series)
    cluster = None
    if args.cluster:
        cluster = _parse_cluster(
            args.cluster, args.indexes_include_headers, MATRIX)
    if len(series) > 1:
        assert not cluster, "Series and cluster not implemented."

    height = args.height or 2400
    width = args.width or 3200

    # Pull out the values and colors for the plot.
    default_color = "#000000"
    if args.default_color:
        default_color = args.default_color

    assert len(series) < len(colorlib.BREWER_QUALITATIVE_SET1)
    series_data = []  # list of (x_values, y_values, col) for each series
    # x_header and y_header of the last series are used further down
    # as the default axis labels.
    for i, (x_header, y_header) in enumerate(series):
        x = MATRIX[x_header]
        y = MATRIX[y_header]
        # Keep only the rows that have both an X and a Y value.
        I = [j for (j, (a, b)) in enumerate(zip(x, y)) if a and b]
        x = [float(x[j]) for j in I]
        y = [float(y[j]) for j in I]
        assert len(x) == len(y)
        c = default_color
        if len(series) > 1:
            rgb = colorlib.BREWER_QUALITATIVE_SET1[i]
            c = colorlib.rgb2hex(rgb, prefix="#")
        c = [c] * len(x)
        series_data.append((x, y, c))

    # Merge all the data point for each series.
    x_values = []
    y_values = []
    col = []
    for (x, y, c) in series_data:
        x_values.extend(x)
        y_values.extend(y)
        col.extend(c)
    assert len(x_values) == len(y_values)
    assert len(x_values) == len(col)

    if args.qq:
        O = jmath.order(x_values)
        x_values = [x_values[i] for i in O]
        y_values = [y_values[i] for i in O]
        col = [col[i] for i in O]

    if cluster is not None:
        col_rgb = pcalib.choose_colors(cluster)
        col = [default_color] * len(col_rgb)
        for i in range(len(col_rgb)):
            if col_rgb[i] is None:
                continue
            col[i] = colorlib.rgb2hex(col_rgb[i], prefix="#")
        assert len(col) == len(x_values)

    # Start R and set up the environment.
    R = jmath.start_R()

    plot_title = jmath.R_var("NA")
    if args.title:
        plot_title = args.title
    sub = ""
    # With a single series, the headers are the default axis labels.
    xlab = ""
    if len(series) == 1:
        xlab = x_header
    if args.xlab:
        xlab = args.xlab
    # Each axis label depends only on its own option.
    ylab = ""
    if len(series) == 1:
        ylab = y_header
    if args.ylab:
        ylab = args.ylab

    lwd_box = 2
    lwd_axis = 2
    lwd_regr = 3
    cex = 1.0 * args.scale_points
    cex_lab = 1.5
    cex_main = 2.0
    cex_sub = 1.0
    plot_log = ""
    if args.log_x:
        plot_log += "x"
    if args.log_y:
        plot_log += "y"

    assert x_values
    assert y_values
    jmath.R_equals(x_values, "X")
    jmath.R_equals(y_values, "Y")

    bm_type = "png16m"
    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    jmath.R_fn(
        "bitmap", args.plot_file, type=bm_type, height=height, width=width,
        units="px", res=300)

    # Set the margins.
    margins = 5 * 1.2 * args.mar_bottom, 4 * 1.2 * args.mar_left, 4, 2
    mar = [m + 0.1 for m in margins]
    jmath.R_fn("par", mar=mar, RETVAL="op")

    jmath.R_fn(
        "plot", jmath.R_var("X"), jmath.R_var("Y"), main="", xlab="",
        ylab="", pch=19, cex=cex, log=plot_log, col=col,
        axes=jmath.R_var("FALSE"), RETVAL="x")

    if args.add_lines:
        lwd = 4 * args.scale_lines
        i = 0
        for (x, y, c) in series_data:
            # Cannot use c for the color.  It might've been changed by
            # --cluster.
            assert col and i < len(col)
            c = col[i:i + len(x)]
            i += len(x)

            # The "lines" function takes a scalar for col (except for
            # type=h, histogram vertical lines).  If there are
            # multiple colors, then split up the points based on the
            # colors.
            l_x, l_y, l_c = [], [], None
            for j in range(len(x)):
                if c[j] != l_c and l_x:
                    jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c)
                    # Keep the previous point so that the points will
                    # connect.
                    l_x = [l_x[-1]]
                    l_y = [l_y[-1]]
                l_x.append(x[j])
                l_y.append(y[j])
                l_c = c[j]
            if l_x:
                jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c)

    if args.add_identity_line:
        lwd = 4
        # Draw the line over the range shared by X and Y.
        iden_min = max(min(x_values), min(y_values))
        iden_max = min(max(x_values), max(y_values))
        l_x = [iden_min, iden_max]
        jmath.R_fn("lines", l_x, l_x, lwd=lwd, col="#FF0000")

    if args.label_header:
        label_cex = 1
        if args.label_size is not None:
            label_cex = args.label_size
        pos2specifier = {
            "top": 3,
            "bottom": 1,
            "left": 2,
            "right": 4,
        }
        pos = pos2specifier[args.label_pos]
        # NOTE(review): the labels come from every row of the column,
        # while X and Y skip rows with a missing value (and are
        # reordered by --qq) -- confirm that they line up.
        point_labels = MATRIX[args.label_header]
        jmath.R_fn(
            "text", jmath.R_var("X"), jmath.R_var("Y"),
            labels=point_labels, cex=label_cex, pos=pos)

    # Calculate correlation, and other statistics.
    # TODO: Should calculate this for each series.
    r = jmath.R("cor(X, Y)")[0]
    p_value = jmath.R("cor.test(X, Y)$p.value")[0]
    print("R = %.2f" % r)
    print("p = %.2g" % p_value)

    # Add a regression line.
    if args.add_regression:
        jmath.R("fit <- lm(Y ~ X)")
        coef = jmath.R("fit$coefficients")
        assert len(coef) == 2
        b, m = coef
        x1 = min(x_values)
        y1 = x1 * m + b
        x2 = max(x_values)
        y2 = x2 * m + b
        jmath.R_fn(
            "lines", [x1, x2], [y1, y2], lwd=lwd_regr, lty=2,
            col="#C63F31")
        sub = "R=%.2f (p=%.2g)" % (r, p_value)
        header = "X", "Y", "R", "p"
        print("\t".join(header))
        row = xlab, ylab, r, p_value
        print("\t".join(map(str, row)))

    if args.add_legend:
        leg = [s[1] for s in series]
        # NOTE(review): each entry is the per-point color list of a
        # series, not a single color -- confirm that legend() gets
        # what it expects.
        fill = [s[-1] for s in series_data]
        # alpha does not seem to be supported here.
        jmath.R_fn(
            "legend", args.legend_loc, legend=leg, fill=fill,
            inset=args.legend_inset)

    if not args.no_box:
        jmath.R_fn("box", lwd=lwd_box)
    jmath.R_fn("axis", 1, lwd=lwd_axis, **{"cex.axis": 1.5})
    jmath.R_fn("axis", 2, lwd=lwd_axis, **{"cex.axis": 1.5})
    jmath.R_fn(
        "title", main=plot_title, sub=sub, xlab=xlab, ylab=ylab,
        **{"cex.lab": cex_lab, "cex.main": cex_main, "cex.sub": cex_sub})
    R("par(op)")
    jmath.R_fn("dev.off")
def draw_venn1(filename, all_names, name2genes, all_labels, args_margin,
               args_title, args_title_size, args_title_y, args_label_size,
               args_count_size):
    """Draw a Venn diagram using the VennDiagram package.

    Only up to 5 areas.  No Euler plots.  Generates TIFF files.

    all_names and all_labels are parallel lists.  Names that are not in
    name2genes (name -> list of genes) are dropped together with their
    labels.  The args_* values scale the margin and the text sizes;
    args_title_y is the vertical position of the title.

    Fails an assertion if more than 5 gene sets remain, and raises
    NotImplementedError if fewer than 2 remain.
    """
    import sys
    import StringIO
    from genomicode import jmath

    R_fn = jmath.R_fn
    R_var = jmath.R_var
    R_equals = jmath.R_equals
    R = jmath.start_R()

    # Prevent R from writing junk to the screen.  Restore the streams
    # even if loading the library fails.
    old_stdout, old_stderr = sys.stdout, sys.stderr
    sys.stdout, sys.stderr = StringIO.StringIO(), StringIO.StringIO()
    try:
        R_fn('library', R_var('VennDiagram'))
    finally:
        sys.stdout, sys.stderr = old_stdout, old_stderr

    # venn diagram can't handle missing gene sets.  Get rid of them.
    I = [i for (i, x) in enumerate(all_names) if x in name2genes]
    all_names = [all_names[i] for i in I]
    all_labels = [all_labels[i] for i in I]

    # Five is the maximum supported by package.
    assert len(all_names) <= 5, "Can't draw venn diagram with %d circles." % \
        len(all_names)
    varnames = ["A", "B", "C", "D", "E"][:len(all_names)]
    for name, varname in zip(all_names, varnames):
        R_equals(name2genes[name], varname)

    if len(all_names) < 2:
        raise NotImplementedError
    # e.g. x <- list(A=A, B=B, C=C)
    R('x <- list(%s)' % ", ".join("%s=%s" % (v, v) for v in varnames))
    for i, label in enumerate(all_labels):
        R('names(x)[%d] <- "%s"' % (i + 1, label))

    # Bigger margin is smaller figure.
    margin = 0.10 * args_margin
    cat_cex = 0.75 * args_label_size   # Size of category labels.
    cex = 0.65 * args_count_size       # Size of number in each circle.

    # The length of fill needs to match the number of gene sets.
    num2fill = {
        2: ["cornflowerblue", "darkorchid1"],
        3: ["cornflowerblue", "green", "yellow"],
        4: ["dodgerblue", "goldenrod1", "seagreen3", "orchid3"],
        5: ["dodgerblue", "goldenrod1", "darkorange1", "seagreen3",
            "orchid3"],
    }
    if len(all_names) not in num2fill:
        raise NotImplementedError
    fill = num2fill[len(all_names)]

    # main.pos
    # (0, 0) is lower left.
    # (1, 1) is upper right.
    # (0.5, 1) is middle top.
    main = R_var("NULL")
    if args_title:
        main = args_title
    main_cex = 2.0 * args_title_size
    font_family = "Helvetica"

    params = {
        "main": main,
        "main.pos": (0.5, args_title_y),
        "main.fontfamily": font_family,
        "main.cex": main_cex,
        "col": "#000000",   # color of outer lines
        "lty": 2,           # dashed line
        "fill": fill,       # color of circles
        "alpha": 0.50,
        # Number of items.
        "cex": cex,
        "fontfamily": font_family,
        # Category labels
        "cat.cex": cat_cex,
        "cat.col": "#333333",
        "cat.fontfamily": font_family,
        "cat.default.pos": "outer",
        "margin": margin,
    }
    R_fn("venn.diagram", R_var("x"), filename=filename, **params)
def draw_venn2(filename, all_names, name2genes, all_labels, args_margin,
               args_title, args_title_size, args_title_y, args_label_size,
               args_count_size):
    """Draw a Venn diagram using the venneuler package.

    Generates PDF files.  Each gene is assigned to the combination of
    gene sets (from all_names) that contain it, and the number of genes
    in each combination is handed to venneuler as its weight.

    Not used by this function: all_labels, args_margin, args_title_y,
    args_label_size, args_count_size.  The circle labels are blanked
    out and no legend is drawn.

    Not good, because it doesn't provide a way to label the
    intersection.  Writing the counts by hand was tried, but it is hard
    to find the right place to label.
    """
    import sys
    import string
    import StringIO
    from genomicode import jmath

    R_fn = jmath.R_fn
    R_var = jmath.R_var
    R_equals = jmath.R_equals
    R = jmath.start_R()

    # Prevent R from writing junk to the screen.  Restore the streams
    # even if loading the library fails.
    old_stdout, old_stderr = sys.stdout, sys.stderr
    sys.stdout, sys.stderr = StringIO.StringIO(), StringIO.StringIO()
    try:
        R_fn('library', R_var('venneuler'))
    finally:
        sys.stdout, sys.stderr = old_stdout, old_stderr

    # For figuring out the colors for the legend.
    col_fn = [
        "col.fn <- function(col, alpha=0.3) {",
        "  col<- hcl(col * 360, 130, 60)",
        "  col <- col2rgb(col)/255",
        "  col <- rgb(col[1, ], col[2, ], col[3, ], alpha)",
        "  col",
        "}",
    ]
    R("\n".join(col_fn))

    assert len(all_names) < len(string.ascii_uppercase)
    varnames = list(string.ascii_uppercase[:len(all_names)])

    # Assign each of the genes to a specific list.
    gene2names = {}  # gene -> list of names
    for name in all_names:
        for gene in name2genes.get(name, []):
            names = gene2names.setdefault(gene, [])
            assert name not in names
            names.append(name)

    # Calculate the intersection between each of the lists.
    combo2genes = {}  # (gs1, gs2[, ...]) -> list of genes
    for gene, names in gene2names.items():
        combo = tuple(varnames[all_names.index(n)] for n in names)
        combo2genes.setdefault(combo, []).append(gene)

    sets = []
    weights = []
    for combo in sorted(combo2genes):
        sets.append("&".join(combo))
        weights.append(len(combo2genes[combo]))
    R_equals(weights, "M")
    R_equals(sets, "n")
    R("names(M) <- n")

    # Maybe can draw legend instead.
    # http://stackoverflow.com/questions/9121956/legend-venn-diagram-in-venneuler
    R_fn("venneuler", R_var("M"), RETVAL="v")

    main = R_var("NULL")
    if args_title:
        main = args_title
    main_cex = 2.0 * args_title_size
    font_family = "Helvetica"

    # bitmap with type="pdfwrite" somehow doesn't work with
    # transparency, so use the pdf device.
    R_fn("pdf", filename)
    R("COL <- col.fn(v$colors)")
    R("LABS <- v$labels")
    # Blank out the labels that venneuler would draw.
    for i in range(len(all_names)):
        R('v$labels[%d] <- ""' % (i + 1))
    R_fn("plot", R_var("v"))
    R_fn("title", main=main, fontfamily=font_family,
         **{"cex.main": main_cex})

    # O maps each variable name to its position in the venneuler
    # labels.  It would be needed to draw a legend (fill=COL[O]), which
    # is currently disabled.
    R_fn("match", varnames, R_var("LABS"), RETVAL="O")
    R_fn("dev.off")
def plot_waterfall(
    filename, scores, phenotypes, group_names, sample_names, p_value,
    gene_id, mar_bottom, mar_left, mar_top, xlabel_off):
    """Draw a waterfall (sorted bar) plot of the scores with R.

    scores, phenotypes, and sample_names are parallel lists.  The bars
    are sorted by increasing score and colored by the group of their
    phenotype.  Needs at least 2 group_names.  gene_id is used as the
    title of the plot.  mar_bottom, mar_left, and mar_top scale the
    margins.  If xlabel_off is true, the sample names are not drawn and
    the bottom margin is reduced.  p_value is not used.

    The image is 1600x1600 pixels; PNG by default, PDF if filename
    ends with ".pdf".
    """
    import os
    from genomicode import jmath
    from genomicode.jmath import R_fn, R_var, R_equals
    from genomicode import config
    from genomicode import colorlib
    import analyze_clinical_outcome as aco

    # Sort by increasing score.
    order = jmath.order(scores)
    scores = [scores[i] for i in order]
    phenotypes = [phenotypes[i] for i in order]
    sample_names = [sample_names[i] for i in order]

    # Choose one color per group.
    assert len(group_names) >= 2
    if len(group_names) > 2:
        colors = [
            aco.colortuple2hex(*rgb)
            for rgb in colorlib.bild_colors(len(group_names))]
    else:
        colors = ['#1533AD', '#FFB300']

    R = jmath.start_R()
    plotlib = os.path.join(config.changlab_Rlib, "plotlib.R")
    assert os.path.exists(plotlib), "I cannot find: %s" % plotlib
    R_fn("source", plotlib)

    # Fixed appearance settings.
    height = 1600
    width = 1600
    xlabel_size = 1.0
    lwd = 2
    las = 3   # vertical labels
    cex_labels = 1.25*xlabel_size
    cex_ytick = 1.5
    cex_xlab = 2.0
    cex_ylab = 2.0
    cex_sub = 2.0
    legend_x = "topleft"

    bar_colors = [colors[group_names.index(p)] for p in phenotypes]

    # Leave 10% of the range of the scores free above and below.
    y = scores
    pad = (max(y)-min(y))*0.10
    ylim = (min(y)-pad, max(y)+pad)

    R_equals(sample_names, "labels")
    R_equals(y, "y")

    if filename.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    else:
        bm_type = "png16m"
    R_fn(
        "bitmap", filename, type=bm_type, height=height, width=width,
        units="px", res=300)

    # Set the margins.
    xlabel_bottom = 2.0
    if xlabel_off:
        R_equals(R_var("FALSE"), "labels")
        xlabel_bottom = 0.5
    margins = 5*mar_bottom*xlabel_bottom, 5*mar_left, 4*mar_top, 2
    R_fn("par", mar=[m+0.1 for m in margins], RETVAL="op")

    # The first pass sets up the coordinates and saves the positions
    # of the bars in mp.
    R_fn(
        "barplot", R_var("y"), xlab="", ylab="",
        axes=R_var("FALSE"), ylim=ylim, xpd=R_var("FALSE"), RETVAL="mp")
    # Make plot area solid white.
    jmath.R('usr <- par("usr")')
    jmath.R('rect(usr[1], usr[3], usr[2], usr[4], col="#FFFFFF")')
    R_fn("box", lwd=lwd)

    # Change mgp only while drawing the X-axis.
    R_fn("par", mgp=(3, 1.5, 0), RETVAL="op2")
    R_fn(
        "axis", 1, lwd=lwd, labels=R_var("labels"),
        at=R_var("mp"), las=las, **{ "cex.axis" : cex_labels })
    R("par(op2)")
    R_fn("axis", 2, lwd=lwd, **{ "cex.axis" : cex_ytick })

    title_options = {
        "cex.lab" : cex_xlab,
        "cex.main" : 2.0,
        "cex.sub" : cex_sub,
        "col.sub" : "#A60400",
    }
    R_fn("title", main=gene_id, sub="", xlab="", ylab="", **title_options)
    # The Y label is drawn separately with its own size.
    R_fn("title", ylab="Gene Expression", **{ "cex.lab" : cex_ylab })

    # Draw the colored bars on top of the white background.
    R_fn(
        "barplot", R_var("y"), col=bar_colors, xlab="", ylab="",
        axes=R_var("FALSE"), ylim=ylim, add=R_var("TRUE"),
        xpd=R_var("FALSE"))
    R_fn(
        "legend", legend_x, legend=group_names, fill=colors, inset=0.05,
        bg="#FFFFFF")

    R("par(op)")
    R_fn("dev.off")