def main(argv=None):
    """Script entry point: read a table, compute statistics and plot it via R.

    The table is read from ``--file`` (or the single positional argument,
    or stdin), loaded into R by ``readTable`` and then processed by every
    requested ``--stats`` method and ``--plot`` type.  When no hardcopy is
    requested and the data came from a file, an interactive Python console
    is started after plotting.

    :param argv: command line; defaults to ``sys.argv``.
        NOTE(review): *argv* is not forwarded to ``E.Start`` in this
        function -- confirm that ``E.Start`` picks up ``sys.argv`` itself.
    :raises IOError: if the input is empty and ``--fail-on-empty`` is active
        (the default).
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_table2scatter.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take from table. Choices are 'all', 'all-but-first' or a ','-separated list of columns.")

    # fixed: '%Default' is not expanded by optparse, only '%default' is.
    parser.add_option("--logscale", dest="logscale", type="string",
                      help="log-transform one or both axes [default=%default].")

    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file [default=%default].",
                      metavar="FILE")

    parser.add_option("-f", "--file", dest="input_filename", type="string",
                      help="filename with table data [default=%default].",
                      metavar="FILE")

    parser.add_option("-2", "--file2", dest="input_filename2", type="string",
                      help="additional data file [default=%default].",
                      metavar="FILE")

    parser.add_option("-s", "--stats", dest="statistics", type="choice",
                      choices=("correlation", "spearman", "pearson", "count"),
                      help="statistical quantities to compute [default=%default]",
                      action="append")

    parser.add_option("-p", "--plot", dest="plot", type="choice",
                      choices=("scatter", "pairs", "panel", "bar", "bar-stacked",
                               "bar-besides", "1_vs_x", "matched", "boxplot",
                               "scatter+marginal", "scatter-regression"),
                      help="plots to plot [default=%default]",
                      action="append")

    parser.add_option("-t", "--threshold", dest="threshold", type="float",
                      help="min threshold to use for counting method [default=%default].")

    parser.add_option("-o", "--colours", dest="colours", type="int",
                      help="column with colour information [default=%default].")

    parser.add_option("-l", "--plot-labels", dest="labels", type="string",
                      help="column labels for x and y in matched plots [default=%default].")

    parser.add_option("-d", "--add-diagonal", dest="add_diagonal",
                      action="store_true",
                      help="add diagonal to plot [default=%default].")

    parser.add_option("-e", "--plot-legend", dest="legend", type="int",
                      help="column with legend [default=%default].")

    parser.add_option("-r", "--options", dest="r_options", type="string",
                      help="R plotting options [default=%default].")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("full", "sparse"),
                      help="output format [default=%default].")

    parser.add_option("--title", dest="title", type="string",
                      help="""plot title [default=%default].""")

    parser.add_option("", "--xrange", dest="xrange", type="string",
                      help="x viewing range of plot [default=%default].")

    parser.add_option("", "--yrange", dest="yrange", type="string",
                      help="y viewing range of plot[default=%default].")

    parser.add_option("--allow-empty-file", dest="fail_on_empty",
                      action="store_false",
                      help="do not fail on empty input [default=%default].")

    parser.add_option("--fail-on-empty", dest="fail_on_empty",
                      action="store_true",
                      help="fail on empty input [default=%default].")

    parser.set_defaults(
        hardcopy=None,
        input_filename="",
        input_filename2=None,
        columns="all",
        logscale=None,
        statistics=[],
        plot=[],
        threshold=0.0,
        labels="x,y",
        colours=None,
        # 'diagonal' is kept for backward compatibility; the option itself
        # stores into 'add_diagonal', which previously had no default.
        diagonal=False,
        add_diagonal=False,
        legend=None,
        title=None,
        xrange=None,
        yrange=None,
        r_options="",
        fail_on_empty=True,
        format="full")

    (options, args) = E.Start(parser)

    if len(args) == 1 and not options.input_filename:
        options.input_filename = args[0]

    # column numbers on the command line are 1-based; shift to 0-based
    if options.columns not in ("all", "all-but-first"):
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    # fixed: compare against None instead of relying on truthiness, so that
    # column 1 (index 0 after the shift) is not mistaken for "not set" below.
    if options.colours is not None:
        options.colours -= 1
    if options.legend is not None:
        options.legend -= 1

    # read data matrix
    if options.input_filename:
        infile = IOTools.openFile(options.input_filename, "r")
        try:
            lines = infile.readlines()
        finally:
            infile.close()
    else:
        # note: this will not work for interactive viewing, but
        # creating hardcopy plots works.
        lines = sys.stdin.readlines()

    # drop comment lines
    lines = [x for x in lines if not x.startswith("#")]

    if len(lines) == 0:
        if options.fail_on_empty:
            raise IOError("no input")
        E.warn("empty input")
        E.Stop()
        return

    matrix, headers, colours, legend = readTable(lines,
                                                 "matrix",
                                                 take_columns=options.columns,
                                                 headers=True,
                                                 colours=options.colours,
                                                 row_names=options.legend)

    if options.input_filename2:
        # read another matrix (should be of the same format).
        # fixed: the second table used to be parsed from the lines of the
        # first file instead of from --file2.
        infile2 = IOTools.openFile(options.input_filename2, "r")
        try:
            lines2 = [x for x in infile2.readlines()
                      if not x.startswith("#")]
        finally:
            infile2.close()
        matrix2, headers2, colours2, legend2 = readTable(
            lines2,
            "matrix2",
            take_columns=options.columns,
            headers=True,
            colours=options.colours,
            row_names=options.legend)

    R.assign("headers", headers)

    ndata = R("""length( matrix[,1] )""")[0]

    if options.loglevel >= 1:
        options.stdlog.write("# read matrix: %ix%i\n" % (len(headers), ndata))

    if colours:
        R.assign("colours", colours)

    for method in options.statistics:

        if method == "correlation":
            cor = R.cor(matrix, use="pairwise.complete.obs")
            writeMatrix(sys.stdout, cor, headers=headers, format="%5.2f")

        elif method == "pearson":
            options.stdout.write("\t".join(("var1",
                                            "var2",
                                            "coeff",
                                            "passed",
                                            "pvalue",
                                            "n",
                                            "method",
                                            "alternative")) + "\n")
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    try:
                        result = R(
                            """cor.test( matrix[,%i], matrix[,%i] )""" % (x + 1, y + 1))
                    except rpy.RPyException as msg:
                        E.warn("correlation not computed for columns %i(%s) and %i(%s): %s" % (
                            x, headers[x], y, headers[y], msg))
                        options.stdout.write("%s\t%s\t%s\t%s\t%s\t%i\t%s\t%s\n" %
                                             (headers[x], headers[y],
                                              "na",
                                              "na",
                                              "na",
                                              0,
                                              "na",
                                              "na"))
                    else:
                        options.stdout.write(
                            "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" %
                            (headers[x], headers[y],
                             result.rx2('estimate').rx2('cor')[0],
                             Stats.getSignificance(
                                 float(result.rx2('p.value')[0])),
                             result.rx2('p.value')[0],
                             result.rx2('parameter').rx2('df')[0],
                             result.rx2('method')[0],
                             result.rx2('alternative')[0]))

        elif method == "spearman":
            # NOTE(review): the header below has 7 columns while each row
            # writes 8 fields (it includes a 'df' value), and the result is
            # accessed dict-style here but via rx2() in the pearson branch.
            # Left unchanged -- confirm against the R bridge in use.
            options.stdout.write("\t".join(("var1", "var2",
                                            "coeff",
                                            "passed",
                                            "pvalue",
                                            "method",
                                            "alternative")) + "\n")
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    result = R(
                        """cor.test( matrix[,%i], matrix[,%i], method='spearman')""" % (x + 1, y + 1))
                    options.stdout.write(
                        "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" %
                        (headers[x], headers[y],
                         result['estimate']['rho'],
                         Stats.getSignificance(float(result['p.value'])),
                         result['p.value'],
                         result['parameter']['df'],
                         result['method'],
                         result['alternative']))

        elif method == "count":
            # number of shared elements > threshold
            # (requires a real input file; the table is re-read from disk)
            with open(options.input_filename, "r") as count_file:
                m, r, c = MatlabTools.ReadMatrix(count_file,
                                                 take=options.columns,
                                                 headers=True)
            mask = numpy.greater(m, options.threshold)
            counts = numpy.dot(numpy.transpose(mask), mask)
            writeMatrix(options.stdout, counts, headers=c, format="%i")

    if options.plot:

        # remove columns that are completely empty
        if "pairs" in options.plot:
            colsums = R('''colSums( is.na(matrix ))''')
            ncols = len(colsums)
            take = [x for x in range(ncols) if colsums[x] != ndata]
            removed = [x for x in range(ncols) if colsums[x] == ndata]
            if take:
                # fixed: only warn when something is removed, and report
                # the removed columns rather than the retained ones.
                if removed:
                    E.warn("removing empty columns %s before plotting" %
                           str(removed))
                matrix = R.subset(matrix, select=[x + 1 for x in take])
                R.assign("""matrix""", matrix)
                headers = [headers[x] for x in take]
                if legend:
                    # fixed: the legend used to be re-indexed against the
                    # already filtered headers, which raised IndexError as
                    # soon as a non-trailing column was dropped.
                    legend = list(headers)

        if options.r_options:
            extra_options = ", %s" % options.r_options
        else:
            extra_options = ""

        if options.legend is not None and len(legend):
            extra_options += ", legend=c('%s')" % "','".join(legend)

        if options.labels:
            xlabel, ylabel = options.labels.split(",")
            extra_options += ", xlab='%s', ylab='%s'" % (xlabel, ylabel)
        else:
            xlabel, ylabel = "", ""

        if options.colours is not None:
            extra_options += ", col=colours"

        if options.logscale:
            extra_options += ", log='%s'" % options.logscale

        if options.xrange:
            extra_options += ", xlim=c(%f,%f)" % tuple(
                map(float, options.xrange.split(",")))

        if options.yrange:
            extra_options += ", ylim=c(%f,%f)" % tuple(
                map(float, options.yrange.split(",")))

        if options.hardcopy:
            if options.hardcopy.endswith(".eps"):
                R.postscript(options.hardcopy)
            elif options.hardcopy.endswith(".png"):
                R.png(options.hardcopy, width=1024, height=768, type="cairo")
            elif options.hardcopy.endswith((".jpg", ".jpeg")):
                # fixed: the R graphics device is called jpeg(), not jpg()
                R.jpeg(options.hardcopy, width=1024, height=768, type="cairo")

        for method in options.plot:

            # fixed: work on a per-plot copy so that options appended for
            # one plot type (main=, col=, names.arg=, ...) do not leak into
            # the next requested plot.
            plot_options = extra_options

            if ndata < 1000:
                point_size = "1"
                pch = "o"
            else:
                point_size = "0.5"
                pch = "."

            if method == "scatter":
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" % (
                    point_size, plot_options))

            elif method == "scatter-regression":
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" % (
                    point_size, plot_options))
                R("""dat <- data.frame(x = matrix[,1], y = matrix[,2])""")
                R("""new <- data.frame(x = seq( min(matrix[,1]), max(matrix[,1]), (max(matrix[,1]) - min(matrix[,1])) / 100))""")
                mod = R("""mod <- lm( y ~ x, dat)""")
                R("""predict(mod, new, se.fit = TRUE)""")
                R("""pred.w.plim <- predict(mod, new, interval="prediction")""")
                R("""pred.w.clim <- predict(mod, new, interval="confidence")""")
                R("""matpoints(new$x,cbind(pred.w.clim, pred.w.plim[,-1]), lty=c(1,2,2,3,3), type="l")""")
                R.mtext(
                    "y = %f * x + %f, r=%6.4f, n=%i" % (
                        mod["coefficients"]["x"],
                        mod["coefficients"]["(Intercept)"],
                        R("""cor( dat )[2]"""),
                        ndata),
                    3,
                    cex=1.0)

            elif method == "pairs":
                if options.add_diagonal:
                    R("""panel.hist <- function( x,y,... ) { points(x,y,...); abline(0,1); }""")
                else:
                    R("""panel.hist <- function( x,y,... ) { points(x,y,...); }""")

                # There used to be a argument na_action="na.omit", but
                # removed this as there appeared error messages saying
                # "na.action is not a graphical parameter" and the
                # plots showed occasionally the wrong scale.
                # cex=point_size also caused trouble (error message:
                # "X11 used font size 8 when 2 was requested" or
                # similar)
                if options.colours is not None:
                    R.pairs(matrix,
                            pch=pch,
                            col=colours,
                            main=options.title,
                            panel="panel.hist",
                            labels=headers,
                            cex_labels=2.0)
                else:
                    R.pairs(matrix,
                            pch=pch,
                            panel="panel.hist",
                            main=options.title,
                            labels=headers,
                            cex_labels=2.0)

            elif method == "boxplot":
                # fixed: without --title the plot used to be labelled 'None'
                if options.title:
                    plot_options += ",main='%s'" % options.title

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    plot_options = re.sub(", xlab='[^']+'", "", plot_options)
                    plot_options += ", names.arg=headers, las=2"
                    # the 10 allows the names.arg below the barplot
                    R("""op <- par(mar=c(11,4,4,2))""")

                R("""boxplot( matrix %s)""" % plot_options)

            elif method == "bar" or method == "bar-stacked":
                if options.colours is None:
                    plot_options += ", col=rainbow(5)"

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    plot_options = re.sub(", xlab='[^']+'", "", plot_options)
                    plot_options += ", names.arg=headers, las=2"
                    # the 10 allows the names.arg below the barplot
                    R("""op <- par(mar=c(11,4,4,2))""")

                R("""barplot(as.matrix(matrix), %s)""" % plot_options)

            elif method == "bar-besides":
                if options.colours is None:
                    plot_options += ", col=rainbow(%i)" % ndata

                # set vertical orientation
                if max([len(x) for x in headers]) > 40 / len(headers):
                    # remove xlabel:
                    plot_options = re.sub(", xlab='[^']+'", "", plot_options)
                    plot_options += ", names.arg=headers, las=2"
                    # the 10 allows the names.arg below the barplot
                    R("""op <- par(mar=c(11,4,4,2))""")

                R("""barplot(as.matrix(matrix), beside=TRUE %s)""" %
                  plot_options)

            elif method == "scatter+marginal":

                if options.title:
                    # set the size of the outer margins - the title needs to be added at the end
                    # after plots have been created
                    R.par(oma=R.c(0, 0, 4, 0))

                R("""matrix""")
                R("""
                x <- matrix[,1];
                y <- matrix[,2];
                xhist <- hist(x, breaks=20, plot=FALSE);
                yhist <- hist(y, breaks=20, plot=FALSE);
                top <- max(c(xhist$counts, yhist$counts));
                nf <- layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), c(3,1), c(1,3), respect=TRUE );
                par(mar=c(3,3,1,1)) ;
                plot(x, y, cex=%s, pch="o" %s) ;
                par(mar=c(0,3,1,1)) ;
                barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0 ) ;
                par(mar=c(3,0,1,1)) ;
                title(main='%s');
                barplot(yhist$counts, axes=FALSE, xlim=c(0, top), space=0, horiz=TRUE ) ;
                title(main='%s');
                """ % (point_size, plot_options, xlabel, ylabel))

                if options.title:
                    R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

            elif method in ("panel", "1_vs_x", "matched"):

                pairs = []
                if method == "panel":
                    # every column against every later column
                    for x in range(len(headers) - 1):
                        for y in range(x + 1, len(headers)):
                            pairs.append((x, y))

                elif method == "1_vs_x":
                    # first column against all others
                    for x in range(1, len(headers)):
                        pairs.append((0, x))

                elif method == "matched":
                    # print matching columns: pair each column with the
                    # first later column carrying the same header
                    for x in range(len(headers) - 1):
                        for y in range(x + 1, len(headers)):
                            if headers[x] == headers[y]:
                                pairs.append((x, y))
                                break

                if not pairs:
                    # fixed: an empty pair list used to end in a
                    # ZeroDivisionError when computing the panel layout.
                    E.warn("no column pairs to plot for method %s" % method)
                    continue

                # arrange the panels on a near-square grid
                w = int(math.ceil(math.sqrt(len(pairs))))
                h = int(math.ceil(float(len(pairs)) / w))

                xlabel, ylabel = options.labels.split(",")

                R("""layout(matrix(seq(1,%i), %i, %i, byrow = TRUE))""" %
                  (w * h, w, h))

                for a, b in pairs:
                    try:
                        R("""plot(matrix[,%i], matrix[,%i], main='%s versus %s', cex=0.5, pch=".", xlab='%s', ylab='%s' )""" % (
                            a + 1, b + 1, headers[b], headers[a], xlabel, ylabel))
                    except rpy.RException as msg:
                        print("could not plot %s versus %s: %s" %
                              (headers[b], headers[a], msg))

        if options.hardcopy:
            R['dev.off']()

    E.info("matrix added as >matrix< in R.")

    if not options.hardcopy:
        if options.input_filename:
            interpreter = code.InteractiveConsole(globals())
            interpreter.interact()
        else:
            E.info(
                "can not start new interactive session as input has come from stdin.")

    E.Stop()
def main(argv=None):
    """Script entry point: compare two distributions with a test run in R.

    Values are read from ``--infile1`` (or stdin) and ``--infile2``; with
    ``--norm-test`` the second sample is drawn from a normal distribution
    whose mean and standard deviation are taken from the first sample.
    Positional arguments are handed to the R test function: ``key=value``
    arguments as keywords, all others positionally.  Unless ``--skip-plot``
    is given, a 2x2 panel (boxplot, qq-plot, relative and absolute
    histograms) is drawn.  Test results and summary statistics of both
    samples are written to ``options.stdout``.

    :param argv: command line; defaults to ``sys.argv``.
        NOTE(review): *argv* is not forwarded to ``E.Start`` in this
        function -- confirm that ``E.Start`` picks up ``sys.argv`` itself.
    :raises ValueError: if no second input is available, or if a paired
        test is requested for samples of different size.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      help="method to use: ks=Kolmogorov-Smirnov, mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test [default=%default]",
                      choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t"))

    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file.", metavar="FILE")

    parser.add_option("-1", "--infile1", dest="filename_input1", type="string",
                      help="input filename for distribution 1.")

    parser.add_option("-2", "--infile2", dest="filename_input2", type="string",
                      help="input filename for distribution 2.")

    parser.add_option("--plot-legend", dest="legend", type="string",
                      help="legend for histograms.")

    parser.add_option("-f", "--infile-map", dest="filename_input_map",
                      type="string",
                      help="input filename for mapping categories to values.")

    parser.add_option("-n", "--norm-test", dest="norm_test",
                      action="store_true",
                      help="test if a set of values is normally distributed. "
                           "Mean and variance are calculated from the data.")

    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="""number of bins (for plotting purposes only).""")

    parser.add_option("--bin-size", dest="bin_size", type="float",
                      help="""bin size for plot.""")

    parser.add_option("--min-value", dest="min_value", type="float",
                      help="""minimum_value for plot.""")

    parser.add_option("--max-value", dest="max_value", type="float",
                      help="""maximum_value for plot.""")

    parser.add_option("--skip-plot", dest="plot", action="store_false",
                      help="""skipping plotting.""")

    parser.add_option("--header-names", dest="header", type="string",
                      help="""header of value column [default=%default].""")

    parser.add_option("--title", dest="title", type="string",
                      help="""plot title [default=%default].""")

    parser.set_defaults(
        method="ks",
        hardcopy=None,
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
        legend=None,
        norm_test=False,
        num_bins=0,
        legend_range="2,2",
        bin_size=None,
        min_value=None,
        max_value=None,
        plot=True,
        header="value",
        title=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # split positional arguments into keyword and plain arguments for the
    # R test function
    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            # fixed: split only on the first '=' so values may contain '='
            key, value = arg.split("=", 1)
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.legend:
        options.legend = options.legend.split(",")

    map_category2value = {}
    if options.filename_input_map:
        # fixed: close the map file once it has been read
        with open(options.filename_input_map, "r") as map_file:
            map_category2value = IOTools.ReadMap(map_file,
                                                 map_functions=(str, float))
        f = str
    else:
        f = float

    if options.filename_input1:
        infile1 = IOTools.openFile(options.filename_input1, "r")
    else:
        infile1 = sys.stdin

    try:
        values1, errors1 = IOTools.ReadList(infile1,
                                            map_function=f,
                                            map_category=map_category2value)
    finally:
        # never close stdin, only files opened here
        if options.filename_input1:
            infile1.close()

    if errors1 and options.loglevel >= 3:
        options.stdlog.write("# errors in input1: %s\n" %
                             ";".join(map(str, errors1)))

    if options.norm_test:
        mean = R.mean(values1)
        stddev = R.sd(values1)
        options.stdlog.write("# creating %i samples from normal distribution with mean %f and stddev %f\n" % (
            len(values1), mean, stddev))

        values2 = R.rnorm(len(values1), mean, stddev)
        errors2 = ()
    else:
        if not options.filename_input2:
            # fixed: previously this ended in an opaque TypeError raised
            # by open(None)
            raise ValueError(
                "second input is required: use --infile2 or --norm-test.")
        # fixed: close the second input once it has been read
        with open(options.filename_input2, "r") as infile2:
            values2, errors2 = IOTools.ReadList(
                infile2,
                map_function=f,
                map_category=map_category2value)

    if errors2 and options.loglevel >= 3:
        options.stdlog.write("# errors in input2: %s\n" %
                             ";".join(map(str, errors2)))

    if options.loglevel >= 1:
        options.stdlog.write("# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n" % (
            len(values1), len(errors1),
            len(values2), len(errors2)))

    if options.method in ("paired-mwu", "paired-t"):
        if len(values1) != len(values2):
            raise ValueError(
                "number of values must be equal for paired tests.")

    # fixed: only open the graphics device when something is plotted; it
    # used to be opened (and never closed) with --skip-plot as well.
    if options.plot and options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2, *xargs, **kwargs)
    elif options.method == "mwu":
        result = R.wilcox_test(
            values1, values2, paired=False, correct=True, *xargs, **kwargs)
    elif options.method == "paired-mwu":
        result = R.wilcox_test(
            values1, values2, paired=True, correct=True, *xargs, **kwargs)
    elif options.method == "paired-t":
        result = R.t_test(values1, values2, paired=True, *xargs, **kwargs)
    elif options.method == "shapiro":
        if len(values1) > 5000:
            E.warn(
                "shapiro-wilk test only accepts < 5000 values, a random sample has been created.")
            values1 = random.sample(values1, 5000)
        result = R.shapiro_test(values1, *xargs, **kwargs)

    if options.plot:
        R.assign("v1", values1)
        R.assign("v2", values2)

        if options.title:
            # set the size of the outer margins - the title needs to be added at the end
            # after plots have been created
            R.par(oma=R.c(0, 0, 4, 0))

        R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

        R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
        R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""")

        # compute breaks: the plotted range covers the data and is widened
        # by --min-value / --max-value if given
        min_value = min(min(values1), min(values2))
        if options.min_value is not None:
            min_value = min(min_value, options.min_value)

        max_value = max(max(values1), max(values2))
        if options.max_value is not None:
            max_value = max(max_value, options.max_value)

        # fixed: an explicit 0 for --min-value/--max-value is a valid
        # setting and must not be treated as "not given"
        has_range = (options.min_value is not None or
                     options.max_value is not None)

        extra_options = ""
        if options.num_bins and not has_range:
            extra_options += ", breaks=%i" % options.num_bins

        elif options.num_bins and has_range:
            # fixed: the breaks used to stop short of max_value (num_bins
            # points with step range/(num_bins+1)), which R's hist()
            # rejects when data lie beyond the last break.  Build
            # num_bins + 1 break points, the last one exactly at max_value.
            bin_size = float(max_value - min_value) / options.num_bins
            breaks = [min_value + x * bin_size
                      for x in range(options.num_bins)]
            breaks.append(max_value)
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        elif options.bin_size is not None:
            num_bins = int(((max_value - min_value) / options.bin_size)) + 1
            breaks = [min_value + x * options.bin_size
                      for x in range(num_bins + 1)]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        R("""h1 <- hist( v1, freq=FALSE, density=20, main='Relative frequency histogram' %s)""" %
          extra_options)
        R("""h2 <- hist( v2, freq=FALSE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s)""" %
          extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$density), max(h2$density)) / 2, c('%s'), fill=c('white','red'))""" % (
                "','".join(options.legend)))

        R("""h1 <- hist( v1, freq=TRUE, density=20, main='Absolute frequency histogram' %s)""" %
          extra_options)
        R("""h2 <- hist( v2, freq=TRUE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s )""" %
          extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$counts), max(h2$counts)) / 2, c('%s'), fill=c('white','red'))""" % (
                "','".join(options.legend)))

        if options.title:
            R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

    if options.loglevel >= 1:
        options.stdout.write("## Results for %s\n" % result['method'])

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key in list(result.keys()):
        if key == "data.name":
            continue
        options.stdout.write("\t".join((key, str(result[key]))) + "\n")

    # summary statistics; keys get the suffix 1/2 for sample 1/2
    stat = Stats.Summary(values1)
    for key, value in list(stat.items()):
        options.stdout.write("%s1\t%s\n" % (str(key), str(value)))

    stat = Stats.Summary(values2)
    for key, value in list(stat.items()):
        options.stdout.write("%s2\t%s\n" % (str(key), str(value)))

    if options.plot:
        if options.hardcopy:
            R.dev_off()

    E.Stop()
def main(argv=None):
    """Script entry point: compare two distributions with a test run in R.

    Values are read from ``--infile1`` (or stdin) and ``--infile2``; with
    ``--norm-test`` the second sample is drawn from a normal distribution
    whose mean and standard deviation are taken from the first sample.
    Positional arguments are handed to the R test function: ``key=value``
    arguments as keywords, all others positionally.  Unless ``--skip-plot``
    is given, a 2x2 panel (boxplot, qq-plot, relative and absolute
    histograms) is drawn.  Test results and summary statistics of both
    samples are written to ``options.stdout``.

    :param argv: command line; defaults to ``sys.argv``.
        NOTE(review): *argv* is not forwarded to ``E.Start`` in this
        function -- confirm that ``E.Start`` picks up ``sys.argv`` itself.
    :raises ValueError: if no second input is available, or if a paired
        test is requested for samples of different size.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        help="method to use: ks=Kolmogorov-Smirnov, mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test [default=%default]",
        choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t"))

    parser.add_option(
        "-a", "--hardcopy", dest="hardcopy", type="string",
        help="write hardcopy to file.", metavar="FILE")

    parser.add_option(
        "-1", "--infile1", dest="filename_input1", type="string",
        help="input filename for distribution 1.")

    parser.add_option(
        "-2", "--infile2", dest="filename_input2", type="string",
        help="input filename for distribution 2.")

    parser.add_option(
        "--plot-legend", dest="legend", type="string",
        help="legend for histograms.")

    parser.add_option(
        "-f", "--infile-map", dest="filename_input_map", type="string",
        help="input filename for mapping categories to values.")

    parser.add_option(
        "-n", "--norm-test", dest="norm_test", action="store_true",
        help="test if a set of values is normally distributed. "
             "Mean and variance are calculated from the data.")

    parser.add_option(
        "-b", "--num-bins", dest="num_bins", type="int",
        help="""number of bins (for plotting purposes only).""")

    parser.add_option(
        "--bin-size", dest="bin_size", type="float",
        help="""bin size for plot.""")

    parser.add_option(
        "--min-value", dest="min_value", type="float",
        help="""minimum_value for plot.""")

    parser.add_option(
        "--max-value", dest="max_value", type="float",
        help="""maximum_value for plot.""")

    parser.add_option(
        "--skip-plot", dest="plot", action="store_false",
        help="""skipping plotting.""")

    parser.add_option(
        "--header-names", dest="header", type="string",
        help="""header of value column [default=%default].""")

    parser.add_option(
        "--title", dest="title", type="string",
        help="""plot title [default=%default].""")

    parser.set_defaults(
        method="ks",
        hardcopy=None,
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
        legend=None,
        norm_test=False,
        num_bins=0,
        legend_range="2,2",
        bin_size=None,
        min_value=None,
        max_value=None,
        plot=True,
        header="value",
        title=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # split positional arguments into keyword and plain arguments for the
    # R test function
    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            # fixed: split only on the first '=' so values may contain '='
            key, value = arg.split("=", 1)
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.legend:
        options.legend = options.legend.split(",")

    map_category2value = {}
    if options.filename_input_map:
        # fixed: close the map file once it has been read
        with open(options.filename_input_map, "r") as map_file:
            map_category2value = IOTools.ReadMap(
                map_file, map_functions=(str, float))
        f = str
    else:
        f = float

    if options.filename_input1:
        infile1 = IOTools.openFile(options.filename_input1, "r")
    else:
        infile1 = sys.stdin

    try:
        values1, errors1 = IOTools.ReadList(
            infile1, map_function=f, map_category=map_category2value)
    finally:
        # never close stdin, only files opened here
        if options.filename_input1:
            infile1.close()

    if errors1 and options.loglevel >= 3:
        options.stdlog.write(
            "# errors in input1: %s\n" % ";".join(map(str, errors1)))

    if options.norm_test:
        mean = R.mean(values1)
        stddev = R.sd(values1)
        options.stdlog.write(
            "# creating %i samples from normal distribution with mean %f and stddev %f\n"
            % (len(values1), mean, stddev))

        values2 = R.rnorm(len(values1), mean, stddev)
        errors2 = ()
    else:
        if not options.filename_input2:
            # fixed: previously this ended in an opaque TypeError raised
            # by open(None)
            raise ValueError(
                "second input is required: use --infile2 or --norm-test.")
        # fixed: close the second input once it has been read
        with open(options.filename_input2, "r") as infile2:
            values2, errors2 = IOTools.ReadList(
                infile2, map_function=f, map_category=map_category2value)

    if errors2 and options.loglevel >= 3:
        options.stdlog.write(
            "# errors in input2: %s\n" % ";".join(map(str, errors2)))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n"
            % (len(values1), len(errors1), len(values2), len(errors2)))

    if options.method in ("paired-mwu", "paired-t"):
        if len(values1) != len(values2):
            raise ValueError(
                "number of values must be equal for paired tests.")

    # fixed: only open the graphics device when something is plotted; it
    # used to be opened (and never closed) with --skip-plot as well.
    if options.plot and options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2, *xargs, **kwargs)
    elif options.method == "mwu":
        result = R.wilcox_test(
            values1, values2, paired=False, correct=True, *xargs, **kwargs)
    elif options.method == "paired-mwu":
        result = R.wilcox_test(
            values1, values2, paired=True, correct=True, *xargs, **kwargs)
    elif options.method == "paired-t":
        result = R.t_test(values1, values2, paired=True, *xargs, **kwargs)
    elif options.method == "shapiro":
        if len(values1) > 5000:
            E.warn(
                "shapiro-wilk test only accepts < 5000 values, a random sample has been created.")
            values1 = random.sample(values1, 5000)
        result = R.shapiro_test(values1, *xargs, **kwargs)

    if options.plot:
        R.assign("v1", values1)
        R.assign("v2", values2)

        if options.title:
            # set the size of the outer margins - the title needs to be added at the end
            # after plots have been created
            R.par(oma=R.c(0, 0, 4, 0))

        R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

        R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
        R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""")

        # compute breaks: the plotted range covers the data and is widened
        # by --min-value / --max-value if given
        min_value = min(min(values1), min(values2))
        if options.min_value is not None:
            min_value = min(min_value, options.min_value)

        max_value = max(max(values1), max(values2))
        if options.max_value is not None:
            max_value = max(max_value, options.max_value)

        # fixed: an explicit 0 for --min-value/--max-value is a valid
        # setting and must not be treated as "not given"
        has_range = (options.min_value is not None or
                     options.max_value is not None)

        extra_options = ""
        if options.num_bins and not has_range:
            extra_options += ", breaks=%i" % options.num_bins

        elif options.num_bins and has_range:
            # fixed: the breaks used to stop short of max_value (num_bins
            # points with step range/(num_bins+1)), which R's hist()
            # rejects when data lie beyond the last break.  Build
            # num_bins + 1 break points, the last one exactly at max_value.
            bin_size = float(max_value - min_value) / options.num_bins
            breaks = [
                min_value + x * bin_size for x in range(options.num_bins)
            ]
            breaks.append(max_value)
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        elif options.bin_size is not None:
            num_bins = int(((max_value - min_value) / options.bin_size)) + 1
            breaks = [
                min_value + x * options.bin_size for x in range(num_bins + 1)
            ]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        R("""h1 <- hist( v1, freq=FALSE, density=20, main='Relative frequency histogram' %s)"""
          % extra_options)
        R("""h2 <- hist( v2, freq=FALSE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s)"""
          % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$density), max(h2$density)) / 2, c('%s'), fill=c('white','red'))"""
              % ("','".join(options.legend)))

        R("""h1 <- hist( v1, freq=TRUE, density=20, main='Absolute frequency histogram' %s)"""
          % extra_options)
        R("""h2 <- hist( v2, freq=TRUE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s )"""
          % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$counts), max(h2$counts)) / 2, c('%s'), fill=c('white','red'))"""
              % ("','".join(options.legend)))

        if options.title:
            R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

    if options.loglevel >= 1:
        options.stdout.write("## Results for %s\n" % result['method'])

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key in list(result.keys()):
        if key == "data.name":
            continue
        options.stdout.write("\t".join((key, str(result[key]))) + "\n")

    # summary statistics; keys get the suffix 1/2 for sample 1/2
    stat = Stats.Summary(values1)
    for key, value in list(stat.items()):
        options.stdout.write("%s1\t%s\n" % (str(key), str(value)))

    stat = Stats.Summary(values2)
    for key, value in list(stat.items()):
        options.stdout.write("%s2\t%s\n" % (str(key), str(value)))

    if options.plot:
        if options.hardcopy:
            R.dev_off()

    E.Stop()
def main(argv=None):
    """Read a table, compute pairwise statistics and draw plots through R.

    The table is taken from ``--file`` (or a single positional argument, or
    stdin when neither is given).  Lines starting with ``#`` are dropped and
    the remainder is handed to ``readTable`` under the name ``matrix``.  Each
    ``--stats`` method then writes a table to ``options.stdout`` and each
    ``--plot`` method issues the corresponding R plotting commands, either to
    the ``--hardcopy`` device or to an interactive session.

    Parameters
    ----------
    argv : list of str, optional
        Command line; defaults to ``sys.argv``.
        NOTE(review): argv is normalised here but not forwarded to
        ``E.Start`` - confirm whether ``E.Start`` should receive it.

    Raises
    ------
    IOError
        If the input contains no data lines and ``fail_on_empty`` is set.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_table2scatter.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option(
        "-c", "--columns", dest="columns", type="string",
        help="columns to take from table. Choices are 'all', 'all-but-first' or a ','-separated list of columns.")

    # fixed: the placeholder was spelled "%Default" and therefore never
    # expanded in the help output.
    parser.add_option(
        "--logscale", dest="logscale", type="string",
        help="log-transform one or both axes [default=%default].")

    parser.add_option(
        "-a", "--hardcopy", dest="hardcopy", type="string",
        help="write hardcopy to file [default=%default].",
        metavar="FILE")

    parser.add_option(
        "-f", "--file", dest="input_filename", type="string",
        help="filename with table data [default=%default].",
        metavar="FILE")

    parser.add_option(
        "-2", "--file2", dest="input_filename2", type="string",
        help="additional data file [default=%default].",
        metavar="FILE")

    parser.add_option(
        "-s", "--stats", dest="statistics", type="choice",
        choices=("correlation", "spearman", "pearson", "count"),
        help="statistical quantities to compute [default=%default]",
        action="append")

    parser.add_option(
        "-p", "--plot", dest="plot", type="choice",
        choices=("scatter", "pairs", "panel", "bar", "bar-stacked",
                 "bar-besides", "1_vs_x", "matched", "boxplot",
                 "scatter+marginal", "scatter-regression"),
        help="plots to plot [default=%default]",
        action="append")

    parser.add_option(
        "-t", "--threshold", dest="threshold", type="float",
        help="min threshold to use for counting method [default=%default].")

    parser.add_option(
        "-o", "--colours", dest="colours", type="int",
        help="column with colour information [default=%default].")

    parser.add_option(
        "-l", "--plot-labels", dest="labels", type="string",
        help="column labels for x and y in matched plots [default=%default].")

    parser.add_option(
        "-d", "--add-diagonal", dest="add_diagonal", action="store_true",
        help="add diagonal to plot [default=%default].")

    parser.add_option(
        "-e", "--plot-legend", dest="legend", type="int",
        help="column with legend [default=%default].")

    parser.add_option(
        "-r", "--options", dest="r_options", type="string",
        help="R plotting options [default=%default].")

    parser.add_option(
        "--format", dest="format", type="choice",
        choices=("full", "sparse"),
        help="output format [default=%default].")

    parser.add_option(
        "--title", dest="title", type="string",
        help="""plot title [default=%default].""")

    parser.add_option(
        "", "--xrange", dest="xrange", type="string",
        help="x viewing range of plot [default=%default].")

    parser.add_option(
        "", "--yrange", dest="yrange", type="string",
        help="y viewing range of plot[default=%default].")

    parser.add_option(
        "--allow-empty-file", dest="fail_on_empty", action="store_false",
        help="do not fail on empty input [default=%default].")

    parser.add_option(
        "--fail-on-empty", dest="fail_on_empty", action="store_true",
        help="fail on empty input [default=%default].")

    # fixed: the default used to be registered as "diagonal", which is not
    # the dest of --add-diagonal.
    parser.set_defaults(
        hardcopy=None,
        input_filename="",
        input_filename2=None,
        columns="all",
        logscale=None,
        statistics=[],
        plot=[],
        threshold=0.0,
        labels="x,y",
        colours=None,
        add_diagonal=False,
        legend=None,
        title=None,
        xrange=None,
        yrange=None,
        r_options="",
        fail_on_empty=True,
        format="full")

    (options, args) = E.Start(parser)

    if len(args) == 1 and not options.input_filename:
        options.input_filename = args[0]

    # column numbers are 1-based on the command line, 0-based internally
    if options.columns not in ("all", "all-but-first"):
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    if options.colours:
        options.colours -= 1
    if options.legend:
        options.legend -= 1

    # read data matrix
    if options.input_filename:
        infile = IOTools.openFile(options.input_filename, "r")
        try:
            lines = infile.readlines()
        finally:
            infile.close()
    else:
        # note: this will not work for interactive viewing, but
        # creating hardcopy plots works.
        lines = sys.stdin.readlines()

    lines = [x for x in lines if not x.startswith("#")]

    if not lines:
        if options.fail_on_empty:
            raise IOError("no input")
        E.warn("empty input")
        E.Stop()
        return

    matrix, headers, colours, legend = readTable(
        lines,
        "matrix",
        take_columns=options.columns,
        headers=True,
        colours=options.colours,
        row_names=options.legend)

    if options.input_filename2:
        # read another matrix (should be of the same format).
        # fixed: this used to parse the lines of the first file a second
        # time, so the additional file was never read.
        infile = IOTools.openFile(options.input_filename2, "r")
        try:
            lines2 = [x for x in infile.readlines()
                      if not x.startswith("#")]
        finally:
            infile.close()
        # The return values are not used below.
        # NOTE(review): presumably readTable registers the table in R
        # under the given name ("matrix2") - confirm.
        readTable(
            lines2,
            "matrix2",
            take_columns=options.columns,
            headers=True,
            colours=options.colours,
            row_names=options.legend)

    R.assign("headers", headers)

    ndata = R("""length( matrix[,1] )""")[0]

    if options.loglevel >= 1:
        options.stdlog.write("# read matrix: %ix%i\n" % (len(headers), ndata))

    if colours:
        R.assign("colours", colours)

    for method in options.statistics:

        if method == "correlation":
            cor = R.cor(matrix, use="pairwise.complete.obs")
            # written to options.stdout like every other statistic
            writeMatrix(options.stdout, cor, headers=headers, format="%5.2f")

        elif method == "pearson":
            options.stdout.write("\t".join(("var1",
                                            "var2",
                                            "coeff",
                                            "passed",
                                            "pvalue",
                                            "n",
                                            "method",
                                            "alternative")) + "\n")
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    try:
                        result = R(
                            """cor.test( matrix[,%i], matrix[,%i] )""" %
                            (x + 1, y + 1))
                    except rpy.RPyException as msg:
                        E.warn(
                            "correlation not computed for columns %i(%s) and %i(%s): %s" %
                            (x, headers[x], y, headers[y], msg))
                        options.stdout.write(
                            "%s\t%s\t%s\t%s\t%s\t%i\t%s\t%s\n" %
                            (headers[x], headers[y],
                             "na", "na", "na", 0, "na", "na"))
                    else:
                        options.stdout.write(
                            "%s\t%s\t%6.4f\t%s\t%e\t%i\t%s\t%s\n" %
                            (headers[x], headers[y],
                             result.rx2('estimate').rx2('cor')[0],
                             Stats.getSignificance(
                                 float(result.rx2('p.value')[0])),
                             result.rx2('p.value')[0],
                             result.rx2('parameter').rx2('df')[0],
                             result.rx2('method')[0],
                             result.rx2('alternative')[0]))

        elif method == "spearman":
            options.stdout.write("\t".join(("var1",
                                            "var2",
                                            "coeff",
                                            "passed",
                                            "pvalue",
                                            "method",
                                            "alternative")) + "\n")
            for x in range(len(headers) - 1):
                for y in range(x + 1, len(headers)):
                    result = R(
                        """cor.test( matrix[,%i], matrix[,%i], method='spearman')""" %
                        (x + 1, y + 1))
                    # fixed: one value per header column (the header has no
                    # "n" column and Spearman's cor.test reports no degrees
                    # of freedom); values are read with rx2 as in the
                    # pearson branch.
                    options.stdout.write(
                        "%s\t%s\t%6.4f\t%s\t%e\t%s\t%s\n" %
                        (headers[x], headers[y],
                         result.rx2('estimate').rx2('rho')[0],
                         Stats.getSignificance(
                             float(result.rx2('p.value')[0])),
                         result.rx2('p.value')[0],
                         result.rx2('method')[0],
                         result.rx2('alternative')[0]))

        elif method == "count":
            # number of shared elements > threshold
            with open(options.input_filename, "r") as infile:
                m, r, c = MatlabTools.ReadMatrix(
                    infile,
                    take=options.columns,
                    headers=True)
            mask = numpy.greater(m, options.threshold)
            counts = numpy.dot(numpy.transpose(mask), mask)
            writeMatrix(options.stdout, counts, headers=c, format="%i")

    if options.plot:

        # remove columns that are completely empty
        if "pairs" in options.plot:
            colsums = R('''colSums( is.na(matrix ))''')
            take = [x for x in range(len(colsums)) if colsums[x] != ndata]
            empty = [x for x in range(len(colsums)) if colsums[x] == ndata]
            if empty:
                # fixed: warn only when something is dropped and report the
                # dropped columns, not the retained ones.
                E.warn("removing empty columns %s before plotting" %
                       str(empty))
            if take:
                matrix = R.subset(matrix, select=[x + 1 for x in take])
                R.assign("""matrix""", matrix)
                # fixed: index the original headers once; the legend used
                # to be re-indexed against the already shortened list.
                kept_headers = [headers[x] for x in take]
                headers = kept_headers
                if legend:
                    legend = list(kept_headers)

        if options.r_options:
            extra_options = ", %s" % options.r_options
        else:
            extra_options = ""

        if options.legend is not None and len(legend):
            extra_options += ", legend=c('%s')" % "','".join(legend)

        if options.labels:
            xlabel, ylabel = options.labels.split(",")
            extra_options += ", xlab='%s', ylab='%s'" % (xlabel, ylabel)
        else:
            xlabel, ylabel = "", ""

        # fixed: test the colour values that were assigned in R rather than
        # the column index, which is 0 (falsy) for the first column.
        if colours:
            extra_options += ", col=colours"

        if options.logscale:
            extra_options += ", log='%s'" % options.logscale

        if options.xrange:
            extra_options += ", xlim=c(%f,%f)" % tuple(
                map(float, options.xrange.split(",")))

        if options.yrange:
            extra_options += ", ylim=c(%f,%f)" % tuple(
                map(float, options.yrange.split(",")))

        if options.hardcopy:
            if options.hardcopy.endswith(".eps"):
                R.postscript(options.hardcopy)
            elif options.hardcopy.endswith(".png"):
                R.png(options.hardcopy, width=1024, height=768, type="cairo")
            elif options.hardcopy.endswith(".jpg"):
                # fixed: the R device is called jpeg; there is no jpg().
                R.jpeg(options.hardcopy, width=1024, height=768, type="cairo")

        # symbol and size depend only on the number of rows
        if ndata < 1000:
            point_size, pch = "1", "o"
        else:
            point_size, pch = "0.5", "."

        def rotate_long_labels(plot_options):
            """Switch to vertical axis labels when the headers are long."""
            if max([len(x) for x in headers]) > 40 / len(headers):
                # remove xlabel:
                plot_options = re.sub(", xlab='[^']+'", "", plot_options)
                plot_options += ", names.arg=headers, las=2"
                # the enlarged bottom margin leaves room for the names.arg
                # below the plot
                R("""op <- par(mar=c(11,4,4,2))""")
            return plot_options

        for method in options.plot:

            # fixed: work on a per-plot copy so that additions made for one
            # plot type do not leak into the next requested plot.
            plot_options = extra_options

            if method == "scatter":
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" %
                  (point_size, plot_options))

            elif method == "scatter-regression":
                R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" %
                  (point_size, plot_options))
                R("""dat <- data.frame(x = matrix[,1], y = matrix[,2])""")
                R("""new <- data.frame(x = seq( min(matrix[,1]), max(matrix[,1]), (max(matrix[,1]) - min(matrix[,1])) / 100))""")
                mod = R("""mod <- lm( y ~ x, dat)""")
                R("""predict(mod, new, se.fit = TRUE)""")
                R("""pred.w.plim <- predict(mod, new, interval="prediction")""")
                R("""pred.w.clim <- predict(mod, new, interval="confidence")""")
                R("""matpoints(new$x,cbind(pred.w.clim, pred.w.plim[,-1]), lty=c(1,2,2,3,3), type="l")""")
                # values are read with rx2 as in the pearson branch
                coefficients = mod.rx2("coefficients")
                R.mtext(
                    "y = %f * x + %f, r=%6.4f, n=%i" %
                    (coefficients.rx2("x")[0],
                     coefficients.rx2("(Intercept)")[0],
                     R("""cor( dat )[2]""")[0],
                     ndata),
                    3, cex=1.0)

            elif method == "pairs":
                if options.add_diagonal:
                    R("""panel.hist <- function( x,y,...  ) { points(x,y,...); abline(0,1); }""")
                else:
                    R("""panel.hist <- function( x,y,...  ) { points(x,y,...); }""")

                # There used to be a argument na_action="na.omit", but
                # removed this as there appeared error messages saying
                # "na.action is not a graphical parameter" and the
                # plots showed occasionally the wrong scale.
                # cex=point_size also caused trouble (error message:
                # "X11 used font size 8 when 2 was requested" or
                # similar)
                if colours:
                    R.pairs(matrix,
                            pch=pch,
                            col=colours,
                            main=options.title,
                            panel="panel.hist",
                            labels=headers,
                            cex_labels=2.0)
                else:
                    R.pairs(matrix,
                            pch=pch,
                            panel="panel.hist",
                            main=options.title,
                            labels=headers,
                            cex_labels=2.0)

            elif method == "boxplot":
                # fixed: without --title this used to emit main='None'
                if options.title:
                    plot_options += ",main='%s'" % options.title

                # set vertical orientation
                plot_options = rotate_long_labels(plot_options)

                R("""boxplot( matrix %s)""" % plot_options)

            elif method == "bar" or method == "bar-stacked":
                if not colours:
                    plot_options += ", col=rainbow(5)"

                # set vertical orientation
                plot_options = rotate_long_labels(plot_options)

                # fixed: plot_options already starts with a comma; the
                # template used to add a second one.
                R("""barplot(as.matrix(matrix) %s)""" % plot_options)

            elif method == "bar-besides":
                if not colours:
                    plot_options += ", col=rainbow(%i)" % ndata

                # set vertical orientation
                plot_options = rotate_long_labels(plot_options)

                R("""barplot(as.matrix(matrix), beside=TRUE %s)""" %
                  plot_options)

            elif method == "scatter+marginal":

                if options.title:
                    # set the size of the outer margins - the title needs to
                    # be added at the end after plots have been created
                    R.par(oma=R.c(0, 0, 4, 0))

                R("""matrix""")
                R("""
                x <- matrix[,1];
                y <- matrix[,2];
                xhist <- hist(x, breaks=20, plot=FALSE);
                yhist <- hist(y, breaks=20, plot=FALSE);
                top <- max(c(xhist$counts, yhist$counts));
                nf <- layout(matrix(c(2,0,1,3),2,2,byrow=TRUE), c(3,1), c(1,3), respect=TRUE );
                par(mar=c(3,3,1,1)) ;
                plot(x, y, cex=%s, pch="o" %s) ;
                par(mar=c(0,3,1,1)) ;
                barplot(xhist$counts, axes=FALSE, ylim=c(0, top), space=0 ) ;
                par(mar=c(3,0,1,1)) ;
                title(main='%s');
                barplot(yhist$counts, axes=FALSE, xlim=c(0, top), space=0, horiz=TRUE ) ;
                title(main='%s');
                """ % (point_size, plot_options, xlabel, ylabel))

                if options.title:
                    R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

            elif method in ("panel", "1_vs_x", "matched"):

                if method == "panel":
                    # every column against every later column
                    pairs = [(x, y)
                             for x in range(len(headers) - 1)
                             for y in range(x + 1, len(headers))]

                elif method == "1_vs_x":
                    # first column against each other column
                    pairs = [(0, x) for x in range(1, len(headers))]

                else:
                    # matched: each column against the first later column
                    # carrying the same header
                    pairs = []
                    for x in range(len(headers) - 1):
                        for y in range(x + 1, len(headers)):
                            if headers[x] == headers[y]:
                                pairs.append((x, y))
                                break

                if not pairs:
                    # the grid dimensions below would divide by zero
                    E.warn("no column pairs to plot for method %s" % method)
                    continue

                w = int(math.ceil(math.sqrt(len(pairs))))
                h = int(math.ceil(float(len(pairs)) / w))

                R("""layout(matrix(seq(1,%i), %i, %i, byrow = TRUE))""" %
                  (w * h, w, h))

                for a, b in pairs:
                    # NOTE(review): this handler names rpy.RException while
                    # the pearson handler names rpy.RPyException - confirm
                    # which class the imported rpy module provides.
                    try:
                        R("""plot(matrix[,%i], matrix[,%i], main='%s versus %s', cex=0.5, pch=".", xlab='%s', ylab='%s' )""" % (
                            a + 1, b + 1, headers[b], headers[a],
                            xlabel, ylabel))
                    except rpy.RException as msg:
                        print("could not plot %s versus %s: %s" %
                              (headers[b], headers[a], msg))

        # close the hardcopy device that was opened above
        if options.hardcopy:
            R['dev.off']()

    E.info("matrix added as >matrix< in R.")

    if not options.hardcopy:
        if options.input_filename:
            interpreter = code.InteractiveConsole(globals())
            interpreter.interact()
        else:
            E.info(
                "can not start new interactive session as input has come from stdin.")

    E.Stop()
if method == "scatter": R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" % (point_size, extra_options) ) if method == "scatter-regression": R("""plot( matrix[,1], matrix[,2], cex=%s, pch="o" %s)""" % (point_size, extra_options) ) dat = R("""dat <- data.frame(x = matrix[,1], y = matrix[,2])""") R("""new <- data.frame(x = seq( min(matrix[,1]), max(matrix[,1]), (max(matrix[,1]) - min(matrix[,1])) / 100))""") mod = R("""mod <- lm( y ~ x, dat)""") R("""predict(mod, new, se.fit = TRUE)""") R("""pred.w.plim <- predict(mod, new, interval="prediction")""") R("""pred.w.clim <- predict(mod, new, interval="confidence")""") R("""matpoints(new$x,cbind(pred.w.clim, pred.w.plim[,-1]), lty=c(1,2,2,3,3), type="l")""") R.mtext( "y = %f * x + %f, r=%6.4f, n=%i" % (mod["coefficients"]["x"], mod["coefficients"]["(Intercept)"], R("""cor( dat )[2]"""), ndata ), 3, cex = 1.0) elif method == "pairs": if options.add_diagonal: R( """panel.hist <- function( x,y,... ) { points(x,y,...); abline(0,1); }""" ) else: R( """panel.hist <- function( x,y,... ) { points(x,y,...); }""" ) # There used to be a argument na_action="na.omit", but removed this # as there appeared error messages saying "na.action is not a graphical parameter" # and the plots showed occasionally the wrong scale. # cex=point_size also caused trouble (error message: "X11 used font size 8 when 2 was requested" or similar) if options.colours:
(point_size, extra_options)) dat = R( """dat <- data.frame(x = matrix[,1], y = matrix[,2])""") R("""new <- data.frame(x = seq( min(matrix[,1]), max(matrix[,1]), (max(matrix[,1]) - min(matrix[,1])) / 100))""" ) mod = R("""mod <- lm( y ~ x, dat)""") R("""predict(mod, new, se.fit = TRUE)""") R("""pred.w.plim <- predict(mod, new, interval="prediction")""" ) R("""pred.w.clim <- predict(mod, new, interval="confidence")""" ) R("""matpoints(new$x,cbind(pred.w.clim, pred.w.plim[,-1]), lty=c(1,2,2,3,3), type="l")""" ) R.mtext("y = %f * x + %f, r=%6.4f, n=%i" % (mod["coefficients"]["x"], mod["coefficients"]["(Intercept)"], R("""cor( dat )[2]"""), ndata), 3, cex=1.0) elif method == "pairs": if options.add_diagonal: R("""panel.hist <- function( x,y,... ) { points(x,y,...); abline(0,1); }""" ) else: R("""panel.hist <- function( x,y,... ) { points(x,y,...); }""" ) # There used to be a argument na_action="na.omit", but # removed this as there appeared error messages saying # "na.action is not a graphical parameter" and the # plots showed occasionally the wrong scale.