def main(argv=None):
    """Compare two value distributions with a two-sample test run in R.

    Parses command line options in sys.argv, unless *argv* is given.

    Two lists of values are read (optionally translating categories to
    numbers through a mapping file), a Kolmogorov-Smirnov (``ks``) or
    Mann-Whitney U (``mwu``) test is run through ``R``, a boxplot, a
    quantile-quantile plot and two pairs of overlaid histograms are drawn,
    and the test result is printed to stdout.

    Raises:
        ValueError: if one of the two input files is missing or the method
            is not one of ``ks``/``mwu``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_mann_whitney_u.py 2782 2009-09-10 11:40:29Z andreas $")

    # Restricting the option to the implemented methods rejects typos at
    # parse time instead of failing after all plots have been drawn.
    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        choices=("ks", "mwu"),
        help="method to use [ks=Kolmogorov-Smirnov,mwu=Mann-WhitneyU]")

    parser.add_option(
        "-a", "--hardcopy", dest="hardcopy", type="string",
        help="write hardcopy to file.", metavar="FILE")

    parser.add_option(
        "-1", "--infile1", dest="filename_input1", type="string",
        help="input filename for distribution 1.")

    parser.add_option(
        "-2", "--infile2", dest="filename_input2", type="string",
        help="input filename for distribution 2.")

    parser.add_option(
        "-p", "--infile-map", dest="filename_input_map", type="string",
        help="input filename for mapping categories to values.")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
    )

    # NOTE(review): argv is accepted but not forwarded to E.start here;
    # confirm that E.start falls back to sys.argv on its own.
    (options, args) = E.start(parser, add_pipe_options=True)

    if not options.filename_input1 or not options.filename_input2:
        raise ValueError(
            "please supply both --infile1 and --infile2")

    # Context managers make sure every input file is closed again.
    map_category2value = {}
    if options.filename_input_map:
        with open(options.filename_input_map, "r") as infile:
            map_category2value = IOTools.ReadMap(
                infile, map_functions=(str, float))

    with open(options.filename_input1, "r") as infile:
        values1, errors1 = IOTools.ReadList(
            infile, map_category=map_category2value)

    with open(options.filename_input2, "r") as infile:
        values2, errors2 = IOTools.ReadList(
            infile, map_category=map_category2value)

    E.info("ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i" %
           (len(values1), len(errors1), len(values2), len(errors2)))

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2)
    elif options.method == "mwu":
        result = R.wilcox_test(values1, values2, paired=False)
    else:
        # Defensive: only reachable if the defaults are changed to a
        # method that is not implemented.
        raise ValueError("unknown method %s" % options.method)

    # The raw R snippets below refer to the data as v1 and v2.
    R.assign("v1", values1)
    R.assign("v2", values2)

    # 2 x 2 grid of plots, filled row by row.
    R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

    R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")

    R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""")

    R("""hist( v1, freq=FALSE, width=0.5, density=10, main='Relative frequency histogram')""")
    R("""hist( v2, freq=FALSE, add=TRUE, width=0.5, col='red', offset=0.5, density=20, angle=135)""")
    R("""hist( v1, freq=TRUE, width=0.5, density=10, main='Absolute frequency histogram')""")
    R("""hist( v2, freq=TRUE, add=TRUE, width=0.5, col='red', offset=0.5, density=20, angle=135)""")

    print("## Results for %s" % result['method'])
    for x in ['p.value', 'statistic', 'alternative', 'method']:
        print(x, result[x])

    E.stop()
def main(argv=None):
    """Compare the genes and transcripts found in two identifier lists.

    Parses command line options in sys.argv, unless *argv* is given.

    The two positional arguments are filenames with one identifier per
    entry. Identifiers are grouped per species by ``countGenesTranscripts``
    and, for every species, the sizes of both sets and of their
    intersection, union and differences are written as a tab-separated
    table to ``options.stdout``. Counts that cannot be computed because a
    species is missing from one input are reported as ``na``.

    Raises:
        ValueError: if the number of positional arguments is not two.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: diff_transcript_sets.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option(
        "-p", "--add-percent", dest="add_percent", action="store_true",
        help="add percent columns")

    parser.add_option(
        "-d", "--dump-sets", dest="dump_sets", action="append",
        type="choice",
        choices=("rest_genes1", "rest_genes2", "intersection", "union"),
        help="dump sets of transcripts/genes")

    parser.add_option(
        "-o", "--output-filename-pattern", dest="output_pattern",
        type="string",
        help="output pattern to use for dumped sets. Should contain one %s.")

    # NOTE(review): the add_percent default is the *string* "False", which
    # is truthy, so the percent columns are always written (and the header
    # always lists them). Kept as is so the table layout does not change;
    # confirm whether the columns are meant to be optional.
    parser.set_defaults(
        separator="|",
        add_percent="False",
        dump_sets=[],
        output_pattern="%s",
    )

    (options, args) = E.start(parser)

    if len(args) != 2:
        raise ValueError(
            "expected two filenames as arguments, got %i" % len(args))
    options.filename1, options.filename2 = args

    with open(options.filename1, "r") as infile:
        ids1, nerrors1 = IOTools.ReadList(infile)
    with open(options.filename2, "r") as infile:
        ids2, nerrors2 = IOTools.ReadList(infile)

    genes1, transcripts1 = countGenesTranscripts(ids1, options)
    genes2, transcripts2 = countGenesTranscripts(ids2, options)

    def percent(part, whole):
        """Format len(part) as a percentage of len(whole), 'na' if empty."""
        if not whole:
            return "na"
        return "%5.2f" % (100.0 * len(part) / len(whole))

    # Column names follow the order in which the values are written below:
    # transcript count before gene count, and the gene intersection is
    # labelled g_inter (it used to be a second tr_inter).
    header = (
        "species",
        "ntranscripts1", "ngenes1", "ntranscripts2", "ngenes2",
        "tr_inter", "tr_union", "tr_rest1", "tr_rest2",
        "g_inter", "g_union", "g_rest1", "g_rest2",
        # percent columns
        "tr_rest1", "tr_rest2", "g_rest1", "g_rest2",
    )
    options.stdout.write("\t".join(header))
    options.stdout.write("\n")

    for species in set(genes1.keys()).union(set(genes2.keys())):
        nt1, nt2, ng1, ng2 = "na", "na", "na", "na"

        if species in genes1:
            g1 = genes1[species]
            t1 = transcripts1[species]
            nt1 = "%i" % len(t1)
            ng1 = "%i" % len(g1)
        else:
            t1, g1 = None, None

        if species in genes2:
            g2 = genes2[species]
            t2 = transcripts2[species]
            nt2 = "%i" % len(t2)
            ng2 = "%i" % len(g2)
        else:
            t2, g2 = None, None

        # Set arithmetic is only possible if the species occurs in both
        # inputs. The previous test ("species in x1 and x2") only checked
        # that the second dictionary was non-empty.
        have_transcripts = t1 is not None and t2 is not None
        have_genes = g1 is not None and g2 is not None

        if have_transcripts:
            ct = "%i" % len(t1.intersection(t2))
            ut = "%i" % len(t2.union(t1))
            rt1 = "%i" % len(t1.difference(t2))
            rt2 = "%i" % len(t2.difference(t1))
        else:
            ct, ut, rt1, rt2 = ["na"] * 4

        if have_genes:
            cg = "%i" % len(g1.intersection(g2))
            ug = "%i" % len(g2.union(g1))
            rg1 = "%i" % len(g1.difference(g2))
            rg2 = "%i" % len(g2.difference(g1))
        else:
            cg, ug, rg1, rg2 = ["na"] * 4

        options.stdout.write("\t".join((species, nt1, ng1, nt2, ng2)))
        options.stdout.write("\t")
        options.stdout.write("\t".join((ct, ut, rt1, rt2)))
        options.stdout.write("\t")
        options.stdout.write("\t".join((cg, ug, rg1, rg2)))

        if options.add_percent:
            # Where a percentage cannot be computed the absolute value
            # from above ("na" or a count) is repeated, as before.
            if have_genes:
                rg1 = percent(g1.difference(g2), g1)
                rg2 = percent(g2.difference(g1), g2)
            if have_transcripts:
                rt1 = percent(t1.difference(t2), t1)
                rt2 = percent(t2.difference(t1), t2)
            options.stdout.write("\t")
            options.stdout.write("\t".join((rt1, rt2, rg1, rg2)))

        options.stdout.write("\n")

        # NOTE(review): the output filename depends only on the choice, so
        # with several species each one overwrites the previous dump.
        for choice in options.dump_sets:
            output_set = None

            if choice == "rest_genes1" and g1 and g2:
                output_set = getTranscriptsForGenes(
                    g1.difference(g2), ids1, options)
            elif choice == "rest_genes2" and g1 and g2:
                output_set = getTranscriptsForGenes(
                    g2.difference(g1), ids2, options)

            if output_set:
                outfile = IOTools.open_file(
                    options.output_pattern % (choice), "w")
                try:
                    for x in output_set:
                        outfile.write("%s\n" % (x, ))
                finally:
                    outfile.close()

    E.stop()
def main(argv=None):
    """Run a t-test or Wilcoxon test in R on one or two vectors of values.

    Parses command line options in sys.argv, unless *argv* is given.

    The first vector is read from ``--infile`` or, if that is not given,
    from stdin; an optional second vector is read from ``--infile2``.
    Positional arguments are handed on to the R test function: arguments
    of the form ``key=value`` become keyword arguments, everything else
    becomes a positional argument. The test result and summary statistics
    of the first vector are written as a key/value table to
    ``options.stdout``. For the one-sample t-test the power of the test
    is reported as well.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_test.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        help="method to use [t-test=t-test,wilcox=wilcox]",
        choices=("t-test", "wilcox"))

    parser.add_option(
        "-1", "--infile", dest="filename_input", type="string",
        help="input filename with vector of values.")

    parser.add_option(
        "-2", "--infile2", dest="filename_input2", type="string",
        help="input filename with vector of values.")

    parser.add_option(
        "--header-names", dest="header", type="string",
        help="""header of value column [default=%default].""")

    parser.set_defaults(
        method="t-test",
        filename_input=None,
        header="value",
    )

    (options, args) = E.start(parser, add_pipe_options=True)

    if options.filename_input:
        infile = IOTools.open_file(options.filename_input, "r")
    else:
        infile = sys.stdin

    try:
        values, errors = IOTools.ReadList(infile, map_function=float)
    finally:
        # Close what was opened here even if parsing fails; stdin is left
        # alone.
        if infile is not sys.stdin:
            infile.close()

    if errors:
        E.warn("errors in input: %s" % ";".join(map(str, errors)))

    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            # Split on the first "=" only so that values may contain "=".
            key, _, value = arg.partition("=")
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.filename_input2:
        infile = IOTools.open_file(options.filename_input2, "r")
        try:
            values2, errors2 = IOTools.ReadList(infile, map_function=float)
        finally:
            infile.close()
        if errors2:
            E.warn("errors in second input: %s" %
                   ";".join(map(str, errors2)))
    else:
        values2 = None

    stat = Stats.Summary(values)

    power, diff_at_power95 = None, None
    if options.method == "t-test":
        if values2:
            result = R.t_test(values, values2, *xargs, **kwargs)
        else:
            result = R.t_test(values, *xargs, **kwargs)
            # compute power of test; this uses the mean and standard
            # deviation of the single input vector only
            power = R.power_t_test(n=len(values),
                                   delta=abs(stat["mean"]),
                                   sd=stat["stddev"],
                                   sig_level=0.05)['power']
            diff_at_power95 = R.power_t_test(n=len(values),
                                             power=0.95,
                                             sd=stat["stddev"],
                                             sig_level=0.05)['delta']

    if options.method == "wilcox":
        # Use the second sample when one was supplied, mirroring the
        # t-test branch; it used to be silently ignored.
        if values2:
            result = R.wilcox_test(values, values2, *xargs, **kwargs)
        else:
            result = R.wilcox_test(values, *xargs, **kwargs)

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key, value in sorted(result.items()):
        if key == "data.name":
            continue
        if key == "p.value":
            options.stdout.write("%s\t%5.2e\n" % (str(key), value))
        else:
            options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    for key, value in stat.items():
        options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    # Compare against None: a power of exactly 0.0 is a valid result.
    if power is not None:
        options.stdout.write("1-power\t%5.2e\n" % (1.0 - power))
        options.stdout.write("diff_at_power95\t%f\n" % diff_at_power95)

    E.stop()
def main(argv=None):
    """Apply a chain of transformations to matrices read from stdin.

    Parses command line options in sys.argv, unless *argv* is given.

    Lines starting with ``#`` are ignored. Lines starting with ``>``
    separate several matrices in the input; each matrix is read,
    transformed by every ``--method`` in the order given and written to
    ``options.stdout``. Methods that need extra input (a second matrix
    file, a boundary, ...) consume their values from the comma-separated
    ``--parameters`` list from left to right. Matrices that cannot be
    parsed are skipped with a warning.

    Several implemented methods (``add-matrix``, ``minmax``, ``mul``,
    ``angle``, ``euclid``, ``sub``, ``normalize-by-row``) are not listed
    in the choices of ``--method``.

    Raises:
        IOError: if stdin contains no data lines.
        ValueError: if a method is applied to a matrix of unsuitable
            shape, or a mask method is used without its name file.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: matrix2matrix.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=(
            "normalize-by-min-diagonal", "normalize-by-column",
            "log", "ln", "negzero2value",
            "set-diagonal",
            "subtract-matrix", "mix-matrix", "normalize-by-matrix",
            "normalize-by-column-max", "normalize-by-row-max",
            "normalize-by-column-min", "normalize-by-row-min",
            "normalize-by-column-median", "normalize-by-row-median",
            "normalize-by-column-mean", "normalize-by-row-mean",
            "normalize-by-column-total", "normalize-by-row-total",
            "correspondence-analysis",
            "normalize-by-value",
            "add-value",
            "sort-rows", "sort-columns",
            "transpose",
            "upper-bound", "lower-bound",
            "subtract-first-col", "multiply-by-value", "divide-by-value",
            "mask-rows", "mask-columns", "mask-rows-and-columns",
            "symmetrize-mean", "symmetrize-max", "symmetrize-min",
        ),
        help="""method to use [default=%default]""")

    parser.add_option(
        "-s", "--scale", dest="scale", type="float",
        help="factor to scale matrix by [default=%default].")

    parser.add_option(
        "-f", "--format", dest="format", type="string",
        help="output number format [default=%default].")

    parser.add_option(
        "--rows-tsv-file", dest="filename_rows", type="string",
        help="filename with rows to mask [default=%default].")

    parser.add_option(
        "--columns-tsv-file", dest="filename_columns", type="string",
        help="filename with columns to mask [default=%default].")

    parser.add_option(
        "-p", "--parameters", dest="parameters", type="string",
        help="Parameters for various functions.")

    parser.add_option(
        "-t", "--header-names", dest="headers", action="store_true",
        help="matrix has row/column headers.")

    parser.add_option(
        "--no-headers", dest="headers", action="store_false",
        help="matrix has no row/column headers.")

    parser.add_option(
        "-a", "--value", dest="value", type="float",
        help="value to use for various algorithms.")

    parser.add_option(
        "-i", "--input-format", dest="input_format", type="choice",
        choices=("full", "sparse", "phylip"),
        help="""input format for matrix.""")

    parser.add_option(
        "-o", "--output-format", dest="output_format", type="choice",
        choices=("full", "sparse", "phylip"),
        help="""output format for matrix.""")

    parser.add_option(
        "--missing-value", dest="missing", type="float",
        help="value to use for missing values. If not set, missing values will cause the script to fail [default=%default].")

    parser.set_defaults(
        methods=[],
        scale=1.0,
        headers=True,
        format="%6.4f",
        output_format="full",
        input_format="full",
        value=0.0,
        parameters="",
        write_separators=True,
        filename_rows=None,
        filename_columns=None,
        missing=None,
    )

    (options, args) = E.start(parser)

    options.parameters = options.parameters.split(",")

    lines = [x for x in sys.stdin if not x.startswith("#")]

    if not lines:
        raise IOError("no input")

    # Indices of the ">" separator lines; matrix number i occupies the
    # lines between chunks[i] and chunks[i + 1].
    chunks = [i for i, line in enumerate(lines) if line.startswith(">")]

    if not chunks:
        # A single matrix without separator line.
        options.write_separators = False
        chunks = [-1]

    chunks.append(len(lines))

    # None marks "no file given" so that the mask methods can report the
    # missing option instead of failing on an undefined name.
    row_names = None
    column_names = None
    if options.filename_rows:
        with open(options.filename_rows, "r") as infile:
            row_names, n = IOTools.ReadList(infile)
    if options.filename_columns:
        with open(options.filename_columns, "r") as infile:
            column_names, n = IOTools.ReadList(infile)

    # Methods that combine the matrix with a second one read from the
    # file named by the next entry in --parameters.
    second_matrix_methods = (
        "normalize-by-matrix", "subtract-matrix", "mix-matrix",
        "add-matrix")

    # Suffix of the method name -> reduction applied to a row or column.
    # numpy.median/numpy.mean replace the scipy aliases of the same name,
    # which are no longer part of SciPy.
    aggregates = (
        ("max", max),
        ("min", min),
        ("median", numpy.median),
        ("mean", numpy.mean),
        ("total", sum),
    )

    def select_aggregate(method):
        """Return the reduction selected by the suffix of *method*."""
        for suffix, function in aggregates:
            if method.endswith(suffix):
                return function
        raise ValueError("unknown normalization method %s" % method)

    for chunk in range(len(chunks) - 1):

        try:
            raw_matrix, row_headers, col_headers = MatlabTools.readMatrix(
                StringIO("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]])),
                format=options.input_format,
                headers=options.headers,
                missing=options.missing)
        except ValueError as msg:
            E.warn("matrix could not be read: %s" % msg)
            continue

        nrows, ncols = raw_matrix.shape

        E.debug("read matrix: %i x %i, %i row titles, %i colum titles" %
                (nrows, ncols, len(row_headers), len(col_headers)))

        # index of the next unused entry in options.parameters
        parameter = 0

        for method in options.methods:

            # Work on a copy; raw_matrix keeps the input of this step.
            matrix = numpy.reshape(numpy.array(raw_matrix), raw_matrix.shape)

            # Loading the second matrix is a separate step. It used to be
            # the first arm of the chain below, which made the arms that
            # actually apply the second matrix unreachable.
            if method in second_matrix_methods:
                with open(options.parameters[parameter], "r") as infile:
                    other_matrix, other_row_headers, other_col_headers = \
                        MatlabTools.readMatrix(infile,
                                               headers=options.headers)

                other_nrows, other_ncols = other_matrix.shape

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# read second matrix from %s: %i x %i, %i row titles, %i colum titles.\n" %
                        (options.parameters[parameter],
                         other_nrows, other_ncols,
                         len(other_row_headers), len(other_col_headers)))

                parameter += 1

            if method == "normalize-by-min-diagonal":
                for x in range(nrows):
                    for y in range(ncols):
                        m = min(raw_matrix[x, x], raw_matrix[y, y])
                        if m > 0:
                            matrix[x, y] = raw_matrix[x, y] / m

            elif method == "normalize-by-column":
                if nrows != ncols:
                    raise ValueError("only supported for symmeric matrices")

                for x in range(nrows):
                    for y in range(ncols):
                        if raw_matrix[y, y] > 0:
                            matrix[x, y] = raw_matrix[x, y] / raw_matrix[y, y]

            elif method == "normalize-by-value":
                matrix = raw_matrix / float(options.parameters[parameter])
                parameter += 1

            elif method == "normalize-by-row":
                if nrows != ncols:
                    raise ValueError("only supported for symmeric matrices")

                for x in range(nrows):
                    for y in range(ncols):
                        # Guard the element that is used as divisor (the
                        # check used to look at raw_matrix[y, y]).
                        if raw_matrix[x, x] > 0:
                            matrix[x, y] = raw_matrix[x, y] / raw_matrix[x, x]

            elif method == "subtract-first-col":
                for x in range(nrows):
                    for y in range(ncols):
                        matrix[x, y] -= raw_matrix[x, 0]

            elif method.startswith("normalize-by-column"):
                f = select_aggregate(method)
                for y in range(ncols):
                    m = f(matrix[:, y])
                    if m != 0:
                        for x in range(nrows):
                            matrix[x, y] = matrix[x, y] / m

            elif method.startswith("normalize-by-row"):
                f = select_aggregate(method)
                for x in range(nrows):
                    m = f(matrix[x, :])
                    if m != 0:
                        for y in range(ncols):
                            matrix[x, y] = raw_matrix[x, y] / m

            elif method == "negzero2value":
                # set zero/negative values to a value
                for x in range(nrows):
                    for y in range(ncols):
                        if matrix[x, y] <= 0:
                            matrix[x, y] = options.value

            elif method == "minmax":
                # store the smaller of two mirrored cells in one and the
                # larger in the other
                for x in range(nrows):
                    for y in range(ncols):
                        matrix[x, y], matrix[y, x] = \
                            min(matrix[x, y], matrix[y, x]), \
                            max(matrix[x, y], matrix[y, x])

            elif method == "log":
                # apply log to all values; non-positive values are kept
                for x in range(nrows):
                    for y in range(ncols):
                        if matrix[x, y] > 0:
                            matrix[x, y] = math.log10(matrix[x, y])

            elif method == "ln":
                for x in range(nrows):
                    for y in range(ncols):
                        if matrix[x, y] > 0:
                            matrix[x, y] = math.log(matrix[x, y])

            elif method == "transpose":
                matrix = numpy.transpose(matrix)
                row_headers, col_headers = col_headers, row_headers
                nrows, ncols = ncols, nrows

            elif method == "mul":
                matrix = numpy.dot(matrix, numpy.transpose(matrix))
                col_headers = row_headers
                # the product is nrows x nrows
                ncols = nrows

            elif method == "multiply-by-value":
                matrix *= options.value

            elif method == "divide-by-value":
                matrix /= options.value

            elif method == "add-value":
                matrix += options.value

            elif method == "angle":
                # write angles between col vectors
                v1 = numpy.sqrt(numpy.sum(numpy.power(matrix, 2), 0))
                matrix = numpy.dot(numpy.transpose(matrix), matrix)
                row_headers = col_headers
                nrows = ncols
                for x in range(nrows):
                    for y in range(ncols):
                        matrix[x, y] /= v1[x] * v1[y]

            elif method == "euclid":
                # convert to euclidean distance matrix between columns;
                # the builtin float replaces the removed numpy.float alias
                matrix = numpy.zeros((ncols, ncols), float)
                for c1 in range(0, ncols - 1):
                    for c2 in range(c1 + 1, ncols):
                        for r in range(0, nrows):
                            d = raw_matrix[r][c1] - raw_matrix[r][c2]
                            matrix[c1, c2] += (d * d)
                        matrix[c2, c1] = matrix[c1, c2]
                matrix = numpy.sqrt(matrix)
                row_headers = col_headers
                nrows = ncols

            elif method.startswith("symmetrize"):
                f = method.split("-")[1]
                if f == "max":
                    f = max
                elif f == "min":
                    f = min
                elif f == "mean":
                    f = lambda x, y: float(x + y) / 2

                if nrows != ncols:
                    raise ValueError(
                        "symmetrize only available for symmetric matrices")
                if row_headers != col_headers:
                    raise ValueError(
                        "symmetrize not available for permuted matrices")
                for x in range(nrows):
                    for y in range(ncols):
                        matrix[x, y] = matrix[y, x] = \
                            f(matrix[x, y], matrix[y, x])

            elif method == "sub":
                matrix = options.value - matrix

            elif method in ("lower-bound", "upper-bound"):
                boundary = float(options.parameters[parameter])
                new_value = float(options.parameters[parameter + 1])
                parameter += 2
                if method == "upper-bound":
                    for x in range(nrows):
                        for y in range(ncols):
                            if matrix[x, y] > boundary:
                                matrix[x, y] = new_value
                else:
                    for x in range(nrows):
                        for y in range(ncols):
                            if matrix[x, y] < boundary:
                                matrix[x, y] = new_value

            elif method == "subtract-matrix":
                matrix = matrix - other_matrix

            elif method == "add-matrix":
                matrix = matrix + other_matrix

            elif method == "normalize-by-matrix":
                # set 0s to 1 in the other matrix
                for x in range(nrows):
                    for y in range(ncols):
                        if other_matrix[x, y] == 0:
                            other_matrix[x, y] = 1.0

                matrix = matrix / other_matrix

            elif method == "mix-matrix":
                # copy the upper triangle of the other matrix
                for x in range(len(other_row_headers) - 1):
                    for y in range(x + 1, len(other_col_headers)):
                        matrix[x, y] = other_matrix[x, y]

            elif method == "set-diagonal":
                value = float(options.parameters[parameter])
                for x in range(min(nrows, ncols)):
                    matrix[x, x] = value
                parameter += 1

            elif method == "correspondence-analysis":
                row_indices, col_indices = \
                    CorrespondenceAnalysis.GetIndices(raw_matrix)

                map_row_new2old = numpy.argsort(row_indices)
                map_col_new2old = numpy.argsort(col_indices)

                matrix, row_headers, col_headers = \
                    CorrespondenceAnalysis.GetPermutatedMatrix(
                        raw_matrix,
                        map_row_new2old,
                        map_col_new2old,
                        row_headers=row_headers,
                        col_headers=col_headers)

            elif method == "mask-rows":
                if row_names is None:
                    raise ValueError("mask-rows requires --rows-tsv-file")
                r = set(row_names)
                for x in range(len(row_headers)):
                    if row_headers[x] in r:
                        matrix[x, :] = options.value

            elif method == "mask-columns":
                if column_names is None:
                    raise ValueError(
                        "mask-columns requires --columns-tsv-file")
                r = set(column_names)
                for x in range(len(col_headers)):
                    if col_headers[x] in r:
                        matrix[:, x] = options.value

            elif method == "mask-rows-and-columns":
                if row_names is None or column_names is None:
                    raise ValueError(
                        "mask-rows-and-columns requires --rows-tsv-file "
                        "and --columns-tsv-file")
                r = set(row_names)
                c = set(column_names)
                for x in range(len(row_headers)):
                    for y in range(len(col_headers)):
                        if row_headers[x] in r and col_headers[y] in c:
                            matrix[x, y] = options.value

            # The result of this step is the input of the next one.
            raw_matrix = numpy.reshape(numpy.array(matrix), matrix.shape)

        # raw_matrix holds the final result; without any method this is
        # the unchanged input (simple re-formatting jobs).
        matrix = raw_matrix

        if options.write_separators:
            options.stdout.write(lines[chunks[chunk]])

        # Write to the same stream as the separator lines.
        MatlabTools.writeMatrix(options.stdout, matrix,
                                value_format=options.format,
                                format=options.output_format,
                                row_headers=row_headers,
                                col_headers=col_headers)

    E.stop()