def check(self, method):
    """Compare R's ``p.adjust`` against ``Stats.adjustPValues``.

    Runs both implementations on ``self.pvalues`` with the given
    *method* and asserts equal length and elementwise (approximate)
    equality of the adjusted p-values.
    """
    expected = R['p.adjust'](self.pvalues, method=method)
    observed = Stats.adjustPValues(self.pvalues, method=method)
    self.assertEqual(len(expected), len(observed))
    for exp, obs in zip(expected, observed):
        self.assertAlmostEqual(exp, obs)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: table2table.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("transpose", "normalize-by-max",
                               "normalize-by-value", "multiply-by-value",
                               "percentile", "remove-header",
                               "normalize-by-table", "upper-bound",
                               "lower-bound", "kullback-leibler",
                               "expand", "compress", "fdr", "grep"),
                      help="""actions to perform on table.""")

    parser.add_option("-s", "--scale", dest="scale", type="float",
                      help="factor to scale matrix by.")

    parser.add_option("-f", "--format", dest="format", type="string",
                      help="output number format.")

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="Parameters for various functions.")

    parser.add_option("-t", "--headers", dest="has_headers",
                      action="store_true",
                      help="matrix has row/column headers.")

    parser.add_option("--transpose", dest="transpose", action="store_true",
                      help="transpose table.")

    parser.add_option("--set-transpose-field", dest="set_transpose_field",
                      type="string",
                      help="set first field (row 1 and col 1) to this value [%default].")

    parser.add_option("--transpose-format", dest="transpose_format",
                      type="choice",
                      choices=("default", "separated", ),
                      help="input format of un-transposed table")

    parser.add_option("--expand", dest="expand_table", action="store_true",
                      help="expand table - multi-value cells with be expanded over several rows.")

    parser.add_option("--no-headers", dest="has_headers",
                      action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("--columns", dest="columns", type="string",
                      help="columns to use.")

    parser.add_option("--file", dest="file", type="string",
                      help="columns to test from table.",
                      metavar="FILE")

    parser.add_option("-d", "--delimiter", dest="delimiter", type="string",
                      help="delimiter of columns.",
                      metavar="DELIM")

    parser.add_option("-V", "--invert-match", dest="invert_match",
                      action="store_true",
                      help="invert match.")

    parser.add_option("--sort-by-rows", dest="sort_rows", type="string",
                      help="output order for rows.")

    parser.add_option("-a", "--value", dest="value", type="float",
                      help="value to use for various algorithms.")

    parser.add_option("--group", dest="group_column", type="int",
                      help="group values by column. Supply an integer column [default=%default]")

    parser.add_option("--group-function", dest="group_function",
                      type="choice",
                      choices=(
                          "min", "max", "sum", "mean", "stats", "cat", "uniq"),
                      help="function to group values by.")

    parser.add_option("--join-table", dest="join_column", type="int",
                      help="join rows in a table by columns.")

    parser.add_option("--collapse-table", dest="collapse_table",
                      type="string",
                      help="collapse a table. Value determines the missing variable [%default].")

    parser.add_option("--join-column-name", dest="join_column_name",
                      type="int",
                      help="use this column as a prefix.")

    parser.add_option("--flatten-table", dest="flatten_table",
                      action="store_true",
                      help="flatten a table [%default].")

    parser.add_option("--as-column", dest="as_column", action="store_true",
                      help="output table as a single column.")

    parser.add_option("--split-fields", dest="split_fields",
                      action="store_true",
                      help="split fields.")

    parser.add_option("--separator", dest="separator", type="string",
                      help="separator for multi-valued fields [default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=(
                          "BH", "bonferroni", "holm", "hommel", "hochberg", "BY"),
                      help="method to perform multiple testing correction by controlling the fdr [default=%default].")

    parser.add_option("--fdr-add-column", dest="fdr_add_column",
                      type="string",
                      help="add new column instead of replacing existing columns. "
                      "The value of the option will be used as prefix if there are multiple columns [%default]")

    # IMS: add option to use a column as the row id in flatten
    parser.add_option("--id-column", dest="id_column", type="string",
                      help="list of column(s) to use as the row id when flattening the table. "
                      "If None, then row number is used. [default=%default].")

    parser.add_option("--variable-name", dest="variable_name", type="string",
                      help="the column header for the 'variable' column when flattening [default=%default].")

    parser.add_option("--value-name", dest="value_name", type="string",
                      help="the column header for the 'value' column when flattening [default=%default].")

    parser.set_defaults(
        methods=[],
        scale=1.0,
        has_headers=True,
        format="%5.2f",
        value=0.0,
        parameters="",
        columns="all",
        transpose=False,
        set_transpose_field=None,
        transpose_format="default",
        group=False,
        group_column=0,
        group_function="mean",
        missing_value="na",
        sort_rows=None,
        flatten_table=False,
        collapse_table=None,
        separator=";",
        expand=False,
        join_column=None,
        join_column_name=None,
        compute_fdr=None,
        as_column=False,
        fdr_method="BH",
        fdr_add_column=None,
        id_column=None,
        variable_name="column",
        value_name="value",
        file=None,
        delimiter="\t",
        invert_match=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    options.parameters = options.parameters.split(",")

    if options.group_column:
        options.group = True
        # convert 1-based user column to 0-based index
        options.group_column -= 1

    ######################################################################
    ######################################################################
    ######################################################################
    # if only to remove header, do this quickly
    if options.methods == ["remove-header"]:

        first = True
        for line in options.stdin:
            if line[0] == "#":
                continue
            if first:
                first = False
                continue
            options.stdout.write(line)

    elif options.transpose or "transpose" in options.methods:

        readAndTransposeTable(options.stdin, options)

    elif options.flatten_table:
        # IMS: bug fixed to make work. Also added options for keying on a
        # particular column and adding custom column headings
        fields, table = CSV.ReadTable(
            options.stdin, with_header=options.has_headers, as_rows=True)

        options.columns = getColumns(fields, options.columns)

        if options.id_column:
            id_columns = map(
                lambda x: int(x) - 1, options.id_column.split(","))
            id_header = "\t".join([fields[id_column]
                                   for id_column in id_columns])
            options.columns = [
                x for x in options.columns if x not in id_columns]
        else:
            id_header = "row"

        options.stdout.write(
            "%s\t%s\t%s\n" %
            (id_header, options.variable_name, options.value_name))

        # NOTE: loop variable renamed from `x` — in Python 2 the inner list
        # comprehension over the id columns leaked into (and shadowed) the
        # enumerate variable of the same name.
        for row_index, row in enumerate(table):
            if options.id_column:
                row_id = "\t".join([row[int(col) - 1]
                                    for col in options.id_column.split(",")])
            else:
                row_id = str(row_index)

            for y in options.columns:
                options.stdout.write(
                    "%s\t%s\t%s\n" % (row_id, fields[y], row[y]))

    elif options.as_column:

        fields, table = CSV.ReadTable(
            options.stdin, with_header=options.has_headers, as_rows=True)
        options.columns = getColumns(fields, options.columns)
        table = zip(*table)

        options.stdout.write("value\n")

        for column in options.columns:
            options.stdout.write("\n".join(table[column]) + "\n")

    elif options.split_fields:

        # split comma separated fields
        fields, table = CSV.ReadTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=True)

        options.stdout.write("%s\n" % ("\t".join(fields)))

        for row in table:
            row = [x.split(options.separator) for x in row]
            for d in itertools.product(*row):
                options.stdout.write("%s\n" % "\t".join(d))

    elif options.group:
        readAndGroupTable(options.stdin, options)

    elif options.join_column:
        readAndJoinTable(options.stdin, options)

    elif options.expand_table:
        readAndExpandTable(options.stdin, options)

    elif options.collapse_table is not None:
        readAndCollapseTable(options.stdin, options, options.collapse_table)

    elif "grep" in options.methods:

        options.columns = map(lambda x: int(x) - 1,
                              options.columns.split(","))

        patterns = []

        if options.file:
            # BUG FIX: the pattern file was opened and never closed; use a
            # context manager so the handle is released.
            with open(options.file, "r") as infile:
                for line in infile:
                    if line[0] == "#":
                        continue
                    patterns.append(line[:-1].split(options.delimiter)[0])
        else:
            patterns = args

        for line in options.stdin:

            data = line[:-1].split(options.delimiter)
            found = False

            for c in options.columns:
                if data[c] in patterns:
                    found = True
                    break

            if (not found and options.invert_match) or \
                    (found and not options.invert_match):
                print(line[:-1])

    else:

        ######################################################################
        ######################################################################
        ######################################################################
        # Apply remainder of transformations
        fields, table = CSV.ReadTable(
            options.stdin, with_header=options.has_headers, as_rows=False)
        # convert columns to list
        table = [list(x) for x in table]

        ncols = len(fields)
        if len(table) == 0:
            raise ValueError("table is empty")

        nrows = len(table[0])

        E.info("processing table with %i rows and %i columns" %
               (nrows, ncols))

        options.columns = getColumns(fields, options.columns)

        # convert all values to float; non-numeric cells stay strings
        for c in options.columns:
            for r in range(nrows):
                try:
                    table[c][r] = float(table[c][r])
                except ValueError:
                    continue

        for method in options.methods:

            if method == "normalize-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = map(lambda x: x / value, table[c])

            elif method == "multiply-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = map(lambda x: x * value, table[c])

            elif method == "normalize-by-max":

                for c in options.columns:
                    m = max(table[c])
                    table[c] = map(lambda x: x / m, table[c])

            elif method == "kullback-leibler":
                options.stdout.write("category1\tcategory2\tkl1\tkl2\tmean\n")
                for x in range(0, len(options.columns) - 1):
                    for y in range(x + 1, len(options.columns)):
                        c1 = options.columns[x]
                        c2 = options.columns[y]
                        e1 = 0
                        e2 = 0
                        for z in range(nrows):
                            p = table[c1][z]
                            q = table[c2][z]
                            e1 += p * math.log(p / q)
                            e2 += q * math.log(q / p)

                        options.stdout.write(
                            "%s\t%s\t%s\t%s\t%s\n" %
                            (fields[c1], fields[c2],
                             options.format % e1,
                             options.format % e2,
                             options.format % ((e1 + e2) / 2)))
                # kullback-leibler writes its own output and terminates here
                E.Stop()
                sys.exit(0)

            elif method == "rank":

                for c in options.columns:
                    tt = table[c]
                    t = zip(tt, range(nrows))
                    t.sort()
                    for i, n in zip(map(lambda x: x[1], t), range(nrows)):
                        tt[i] = n

            elif method in ("lower-bound", "upper-bound"):

                boundary = float(options.parameters[0])
                del options.parameters[0]
                new_value = float(options.parameters[0])
                del options.parameters[0]

                if method == "upper-bound":
                    for c in options.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] > boundary:
                                table[c][r] = new_value
                else:
                    for c in options.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] < boundary:
                                table[c][r] = new_value

            elif method == "fdr":
                pvalues = []
                for c in options.columns:
                    pvalues.extend(table[c])

                assert max(pvalues) <= 1.0, "pvalues > 1 in table: max=%s" % \
                    str(max(pvalues))
                assert min(pvalues) >= 0, "pvalue < 0 in table: min=%s" % \
                    str(min(pvalues))

                # convert to str to avoid test for float downstream
                qvalues = map(
                    str, Stats.adjustPValues(pvalues,
                                             method=options.fdr_method))

                if options.fdr_add_column is None:
                    x = 0
                    for c in options.columns:
                        table[c] = qvalues[x:x + nrows]
                        x += nrows
                else:
                    # add new column headers
                    if len(options.columns) == 1:
                        fields.append(options.fdr_add_column)
                    else:
                        for co in options.columns:
                            # BUG FIX: was fields[c] — a stale loop variable
                            # from the p-value collection loop, so every new
                            # header got the same (last) column's name.
                            fields.append(options.fdr_add_column + fields[co])

                    x = 0
                    for c in options.columns:
                        # add a new column
                        table.append(qvalues[x:x + nrows])
                        x += nrows
                    ncols += len(options.columns)

            elif method == "normalize-by-table":

                other_table_name = options.parameters[0]
                del options.parameters[0]
                # NOTE(review): CSV.ReadTable appears to consume the stream
                # eagerly (it returns fully-built columns), so closing after
                # the call should be safe — confirm against CSV module.
                with open(other_table_name, "r") as other_file:
                    other_fields, other_table = CSV.ReadTable(
                        other_file,
                        with_header=options.has_headers, as_rows=False)

                # convert all values to float
                for c in options.columns:
                    for r in range(nrows):
                        try:
                            other_table[c][r] = float(other_table[c][r])
                        except ValueError:
                            continue

                # divide by the other matrix where both cells are numeric and
                # the divisor is non-zero; otherwise substitute the missing
                # value (original comment "set 0s to 1" was stale)
                for c in options.columns:
                    for r in range(nrows):
                        if isinstance(table[c][r], float) and \
                                isinstance(other_table[c][r], float) and \
                                other_table[c][r] != 0:
                            table[c][r] /= other_table[c][r]
                        else:
                            table[c][r] = options.missing_value

        # convert floats back to formatted strings
        for c in options.columns:
            for r in range(nrows):
                if isinstance(table[c][r], float):
                    table[c][r] = options.format % table[c][r]

        options.stdout.write("\t".join(fields) + "\n")
        if options.sort_rows:
            old2new = {}
            for r in range(nrows):
                old2new[table[0][r]] = r
            for x in options.sort_rows.split(","):
                if x not in old2new:
                    continue
                r = old2new[x]
                options.stdout.write(
                    "\t".join([table[c][r] for c in range(ncols)]) + "\n")
        else:
            for r in range(nrows):
                options.stdout.write(
                    "\t".join([table[c][r] for c in range(ncols)]) + "\n")

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=("transpose", "normalize-by-max", "normalize-by-value",
                 "multiply-by-value", "percentile", "remove-header",
                 "normalize-by-table", "upper-bound", "lower-bound",
                 "kullback-leibler", "expand", "compress", "fdr", "grep"),
        help="""actions to perform on table.""")

    parser.add_option("-s", "--scale", dest="scale", type="float",
                      help="factor to scale matrix by.")

    parser.add_option("-f", "--format", dest="format", type="string",
                      help="output number format [default]")

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="Parameters for various functions.")

    parser.add_option("-t", "--header-names", dest="has_headers",
                      action="store_true",
                      help="matrix has row/column headers.")

    parser.add_option("--transpose", dest="transpose", action="store_true",
                      help="transpose table.")

    parser.add_option(
        "--set-transpose-field", dest="set_transpose_field", type="string",
        help="set first field (row 1 and col 1) to this value [%default].")

    parser.add_option("--transpose-format", dest="transpose_format",
                      type="choice",
                      choices=("default", "separated", ),
                      help="input format of un-transposed table")

    parser.add_option(
        "--expand", dest="expand_table", action="store_true",
        help="expand table - multi-value cells with be expanded over "
        "several rows.")

    parser.add_option("--no-headers", dest="has_headers",
                      action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("--columns", dest="columns", type="string",
                      help="columns to use.")

    parser.add_option("--file", dest="file", type="string",
                      help="columns to test from table.",
                      metavar="FILE")

    parser.add_option("-d", "--delimiter", dest="delimiter", type="string",
                      help="delimiter of columns.",
                      metavar="DELIM")

    parser.add_option("-V", "--invert-match", dest="invert_match",
                      action="store_true",
                      help="invert match.")

    parser.add_option("--sort-by-rows", dest="sort_rows", type="string",
                      help="output order for rows.")

    parser.add_option("-a", "--value", dest="value", type="float",
                      help="value to use for various algorithms.")

    parser.add_option("--group", dest="group_column", type="int",
                      help="group values by column. Supply an integer column "
                      "[default=%default]")

    parser.add_option("--group-function", dest="group_function",
                      type="choice",
                      choices=("min", "max", "sum", "mean",
                               "stats", "cat", "uniq"),
                      help="function to group values by.")

    parser.add_option("--join-table", dest="join_column", type="int",
                      help="join rows in a table by columns.")

    parser.add_option(
        "--collapse-table", dest="collapse_table", type="string",
        help="collapse a table. Value determines the missing variable "
        "[%default].")

    parser.add_option("--join-column-name", dest="join_column_name",
                      type="int",
                      help="use this column as a prefix.")

    parser.add_option("--flatten-table", dest="flatten_table",
                      action="store_true",
                      help="flatten a table [%default].")

    parser.add_option("--as-column", dest="as_column", action="store_true",
                      help="output table as a single column.")

    parser.add_option("--split-fields", dest="split_fields",
                      action="store_true",
                      help="split fields.")

    parser.add_option(
        "--separator", dest="separator", type="string",
        help="separator for multi-valued fields [default=%default].")

    parser.add_option(
        "--fdr-method", dest="fdr_method", type="choice",
        choices=("BH", "bonferroni", "holm", "hommel", "hochberg", "BY"),
        help="method to perform multiple testing correction by controlling "
        "the fdr [default=%default].")

    parser.add_option(
        "--fdr-add-column", dest="fdr_add_column", type="string",
        help="add new column instead of replacing existing columns. "
        "The value of the option will be used as prefix if there are "
        "multiple columns [%default]")

    # IMS: add option to use a column as the row id in flatten
    parser.add_option(
        "--id-column", dest="id_column", type="string",
        help="list of column(s) to use as the row id when flattening "
        "the table. If None, then row number is used. [default=%default].")

    parser.add_option(
        "--variable-name", dest="variable_name", type="string",
        help="the column header for the 'variable' column when flattening "
        "[default=%default].")

    parser.add_option(
        "--value-name", dest="value_name", type="string",
        help="the column header for the 'value' column when flattening "
        "[default=%default].")

    parser.set_defaults(
        methods=[],
        scale=1.0,
        has_headers=True,
        format=None,
        value=0.0,
        parameters="",
        columns="all",
        transpose=False,
        set_transpose_field=None,
        transpose_format="default",
        group=False,
        group_column=0,
        group_function="mean",
        missing_value="na",
        sort_rows=None,
        flatten_table=False,
        collapse_table=None,
        separator=";",
        expand=False,
        join_column=None,
        join_column_name=None,
        compute_fdr=None,
        as_column=False,
        fdr_method="BH",
        fdr_add_column=None,
        id_column=None,
        variable_name="column",
        value_name="value",
        file=None,
        delimiter="\t",
        invert_match=False,
    )

    (options, args) = E.start(parser, add_pipe_options=True)

    options.parameters = options.parameters.split(",")

    if options.group_column:
        options.group = True
        # convert 1-based user column to 0-based index
        options.group_column -= 1

    ######################################################################
    ######################################################################
    ######################################################################
    # if only to remove header, do this quickly
    if options.methods == ["remove-header"]:

        first = True
        for line in options.stdin:
            if line[0] == "#":
                continue
            if first:
                first = False
                continue
            options.stdout.write(line)

    elif options.transpose or "transpose" in options.methods:

        readAndTransposeTable(options.stdin, options)

    elif options.flatten_table:
        # IMS: bug fixed to make work. Also added options for keying
        # on a particular column and adding custom column headings
        fields, table = CSV.readTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=True)

        options.columns = getColumns(fields, options.columns)

        if options.id_column:
            id_columns = [int(x) - 1 for x in options.id_column.split(",")]
            id_header = "\t".join(
                [fields[id_column] for id_column in id_columns])
            options.columns = [
                x for x in options.columns if x not in id_columns
            ]
        else:
            id_header = "row"

        options.stdout.write(
            "%s\t%s\t%s\n" %
            (id_header, options.variable_name, options.value_name))

        # loop variable renamed from `x` to avoid shadowing by the inner
        # comprehension variable of the same name
        for row_index, row in enumerate(table):
            if options.id_column:
                row_id = "\t".join(
                    [row[int(col) - 1]
                     for col in options.id_column.split(",")])
            else:
                row_id = str(row_index)

            for y in options.columns:
                options.stdout.write("%s\t%s\t%s\n" %
                                     (row_id, fields[y], row[y]))

    elif options.as_column:

        fields, table = CSV.readTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=True)
        options.columns = getColumns(fields, options.columns)
        table = list(zip(*table))

        options.stdout.write("value\n")

        for column in options.columns:
            options.stdout.write("\n".join(table[column]) + "\n")

    elif options.split_fields:

        # split comma separated fields
        fields, table = CSV.readTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=True)

        options.stdout.write("%s\n" % ("\t".join(fields)))

        for row in table:
            row = [x.split(options.separator) for x in row]
            for d in itertools.product(*row):
                options.stdout.write("%s\n" % "\t".join(d))

    elif options.group:
        readAndGroupTable(options.stdin, options)

    elif options.join_column:
        readAndJoinTable(options.stdin, options)

    elif options.expand_table:
        readAndExpandTable(options.stdin, options)

    elif options.collapse_table is not None:
        readAndCollapseTable(options.stdin, options, options.collapse_table)

    elif "grep" in options.methods:

        options.columns = [int(x) - 1 for x in options.columns.split(",")]

        patterns = []

        if options.file:
            infile = IOTools.open_file(options.file, "r")
            for line in infile:
                if line[0] == "#":
                    continue
                patterns.append(line[:-1].split(options.delimiter)[0])
            # BUG FIX: the pattern file handle was never closed
            infile.close()
        else:
            patterns = args

        for line in options.stdin:

            data = line[:-1].split(options.delimiter)
            found = False

            for c in options.columns:
                if data[c] in patterns:
                    found = True
                    break

            if (not found and options.invert_match) or (
                    found and not options.invert_match):
                print(line[:-1])

    else:

        ######################################################################
        ######################################################################
        ######################################################################
        # Apply remainder of transformations
        fields, table = CSV.readTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=False)
        # convert columns to list
        table = [list(x) for x in table]

        ncols = len(fields)
        if len(table) == 0:
            raise ValueError("table is empty")

        nrows = len(table[0])

        E.info("processing table with %i rows and %i columns" %
               (nrows, ncols))

        options.columns = getColumns(fields, options.columns)

        # convert all values to float; non-numeric cells stay strings
        for c in options.columns:
            for r in range(nrows):
                try:
                    table[c][r] = float(table[c][r])
                except ValueError:
                    continue

        for method in options.methods:

            if method == "normalize-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = [x / value for x in table[c]]

            elif method == "multiply-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = [x * value for x in table[c]]

            elif method == "normalize-by-max":

                for c in options.columns:
                    m = max(table[c])
                    table[c] = [x / m for x in table[c]]

            elif method == "kullback-leibler":
                options.stdout.write("category1\tcategory2\tkl1\tkl2\tmean\n")
                # renamed from `format` to avoid shadowing the builtin
                fmt = options.format
                if fmt is None:
                    fmt = "%f"
                for x in range(0, len(options.columns) - 1):
                    for y in range(x + 1, len(options.columns)):
                        c1 = options.columns[x]
                        c2 = options.columns[y]
                        e1 = 0
                        e2 = 0
                        for z in range(nrows):
                            p = table[c1][z]
                            q = table[c2][z]
                            e1 += p * math.log(p / q)
                            e2 += q * math.log(q / p)

                        options.stdout.write(
                            "%s\t%s\t%s\t%s\t%s\n" %
                            (fields[c1], fields[c2],
                             fmt % e1,
                             fmt % e2,
                             fmt % ((e1 + e2) / 2)))
                # kullback-leibler writes its own output and terminates here
                E.stop()
                sys.exit(0)

            elif method == "rank":

                for c in options.columns:
                    tt = table[c]
                    t = list(zip(tt, list(range(nrows))))
                    t.sort()
                    for i, n in zip([x[1] for x in t], list(range(nrows))):
                        tt[i] = n

            elif method in ("lower-bound", "upper-bound"):

                boundary = float(options.parameters[0])
                del options.parameters[0]
                new_value = float(options.parameters[0])
                del options.parameters[0]

                if method == "upper-bound":
                    for c in options.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] > boundary:
                                table[c][r] = new_value
                else:
                    for c in options.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] < boundary:
                                table[c][r] = new_value

            elif method == "fdr":
                pvalues = []
                for c in options.columns:
                    pvalues.extend(table[c])

                assert max(pvalues) <= 1.0, "pvalues > 1 in table: max=%s" % \
                    str(max(pvalues))
                assert min(pvalues) >= 0, "pvalue < 0 in table: min=%s" % \
                    str(min(pvalues))

                # convert to str to avoid test for float downstream
                qvalues = list(
                    map(
                        str,
                        Stats.adjustPValues(pvalues,
                                            method=options.fdr_method)))

                if options.fdr_add_column is None:
                    x = 0
                    for c in options.columns:
                        table[c] = qvalues[x:x + nrows]
                        x += nrows
                else:
                    # add new column headers
                    if len(options.columns) == 1:
                        fields.append(options.fdr_add_column)
                    else:
                        for co in options.columns:
                            # BUG FIX: was fields[c] — a stale loop variable
                            # from the p-value collection loop, so every new
                            # header got the same (last) column's name.
                            fields.append(options.fdr_add_column + fields[co])

                    x = 0
                    for c in options.columns:
                        # add a new column
                        table.append(qvalues[x:x + nrows])
                        x += nrows
                    ncols += len(options.columns)

            elif method == "normalize-by-table":

                other_table_name = options.parameters[0]
                del options.parameters[0]
                other_file = IOTools.open_file(other_table_name, "r")
                other_fields, other_table = CSV.readTable(
                    other_file,
                    with_header=options.has_headers,
                    as_rows=False)
                # BUG FIX: close the handle once the table is read
                other_file.close()

                # convert all values to float
                for c in options.columns:
                    for r in range(nrows):
                        try:
                            other_table[c][r] = float(other_table[c][r])
                        except ValueError:
                            continue

                # divide by the other matrix where both cells are numeric and
                # the divisor is non-zero; otherwise substitute the missing
                # value (original comment "set 0s to 1" was stale)
                for c in options.columns:
                    for r in range(nrows):
                        if isinstance(table[c][r], float) and \
                                isinstance(other_table[c][r], float) and \
                                other_table[c][r] != 0:
                            table[c][r] /= other_table[c][r]
                        else:
                            table[c][r] = options.missing_value

        # convert floats back to formatted strings
        if options.format is not None:
            for c in options.columns:
                for r in range(nrows):
                    if isinstance(table[c][r], float):
                        # BUG FIX: was bare `format` — only ever bound inside
                        # the kullback-leibler branch (which exits early), so
                        # this applied the *builtin* format function to `%`
                        # and raised TypeError for any other method.
                        table[c][r] = options.format % table[c][r]

        options.stdout.write("\t".join(fields) + "\n")
        if options.sort_rows:
            old2new = {}
            for r in range(nrows):
                old2new[table[0][r]] = r
            for x in options.sort_rows.split(","):
                if x not in old2new:
                    continue
                r = old2new[x]
                options.stdout.write(
                    "\t".join(map(str, [table[c][r]
                                        for c in range(ncols)])) + "\n")
        else:
            for r in range(nrows):
                options.stdout.write(
                    "\t".join(map(str, [table[c][r]
                                        for c in range(ncols)])) + "\n")

    E.stop()
else: for c in options.columns: for r in range(nrows): if type(table[c][r]) == types.FloatType and \ table[c][r] < boundary: table[c][r] = new_value elif method == "fdr": pvalues = [] for c in options.columns: pvalues.extend( table[c] ) assert max(pvalues) <= 1.0, "pvalues > 1 in table" assert min(pvalues) >= 0, "pvalue < 0 in table" # convert to str to avoid test for float downstream qvalues = map(str, Stats.adjustPValues( pvalues, method = options.fdr_method )) x = 0 for c in options.columns: table[c] = qvalues[x:x+nrows] x += nrows elif method == "normalize-by-table": other_table_name = options.parameters[0] del options.parameters[0] other_fields, other_table = CSV.ReadTable( open(other_table_name, "r"), with_header = options.has_headers, as_rows = False ) # convert all values to float for c in options.columns: for r in range(nrows):