def read_and_expand_table(infile, args):
    '''expand multi-valued cells of a table over several rows.

    Each cell is split at ``args.separator``. A row in which some cell
    holds several values is written as that many rows; cells with fewer
    values are padded with empty strings so that every column stays
    aligned. The header line is written unchanged.

    The table is read from *infile* via ``CSV.readTable`` and the result
    is written tab-separated to ``args.stdout``.
    '''
    fields, table = CSV.readTable(infile,
                                  with_header=args.has_headers,
                                  as_rows=True)

    out = args.stdout
    out.write("\t".join(fields) + "\n")

    ncolumns = len(fields)
    for row in table:
        # one list of values per column
        cells = [row[col].split(args.separator) for col in range(ncolumns)]

        # the cell with the most values determines the number of output rows
        depth = max(len(values) for values in cells)
        for values in cells:
            values.extend([""] * (depth - len(values)))

        for level in range(depth):
            out.write("\t".join([values[level] for values in cells]) + "\n")
def read_and_join_table(infile, args):
    '''join rows of a table that share a key into a single wide row.

    The table is read from *infile* via ``CSV.readTable``. Two 1-based
    column numbers select the layout:

    * ``args.join_column`` - the column whose values identify an output row.
    * ``args.join_column_name`` - the column whose values are used as
      prefix for the output column names.

    All remaining columns are value columns. For every distinct key one
    output row is written; it contains, for every distinct name (sorted),
    the value columns of the input row with that key/name combination.
    Combinations that do not occur in the input are filled with ``na``.
    If a combination occurs more than once, the last occurrence wins.

    Output column names are ``<name><args.separator><field>``.

    Raises
    ------
    ValueError
        If ``args.join_column`` or ``args.join_column_name`` is not set.
    '''
    # Both options default to None; without this check the arithmetic
    # below fails with an uninformative TypeError.
    if args.join_column is None or args.join_column_name is None:
        raise ValueError(
            "joining a table requires both --join-table and "
            "--join-column-name")

    fields, table = CSV.readTable(infile,
                                  with_header=args.has_headers,
                                  as_rows=True)

    # command line columns are 1-based
    join_column = args.join_column - 1
    join_name = args.join_column_name - 1

    join_rows = sorted({row[join_column] for row in table})
    join_names = sorted({row[join_name] for row in table})
    join_columns = sorted(
        set(range(len(fields))).difference((join_column, join_name)))

    nvalues = len(join_columns)

    # first output column (after the key in column 0) of each name's block
    map_name2start = {name: 1 + index * nvalues
                      for index, name in enumerate(join_names)}
    row_width = nvalues * len(join_names)

    map_old2new = {}
    new_table = []
    for key in join_rows:
        map_old2new[key] = len(new_table)
        new_table.append([key] + ["na"] * row_width)

    for row in table:
        target = new_table[map_old2new[row[join_column]]]
        start = map_name2start[row[join_name]]
        target[start:start + nvalues] = [row[c] for c in join_columns]

    # print new table
    args.stdout.write(fields[join_column])
    for name in join_names:
        for column in join_columns:
            args.stdout.write("\t%s%s%s" %
                              (name, args.separator, fields[column]))
    args.stdout.write("\n")

    for row in new_table:
        args.stdout.write("\t".join(row) + "\n")
def read_and_collapse_table(infile, args, missing_value=""):
    '''collapse a table.

    Collapse a table of two columns with row names in the first column.
    Outputs a table with multiple columns for each row name.

    The row name of the first input row acts as separator: every time it
    is seen again a new output column is started. Row names that did not
    occur since the previous separator receive *missing_value* for that
    column.

    Output rows are written in a reproducible order: a row name appears
    at the position at which its first value (real or missing) was
    recorded, and missing values are recorded in order of first
    appearance in the input.

    Arguments
    ---------
    infile : file
        Open file with the input table; read through ``CSV.readTable``.
    args : object
        Parsed options; ``has_headers`` and ``stdout`` are used.
    missing_value : string
        Value written for row names without a value in a column.

    Raises
    ------
    NotImplementedError
        If the table does not have exactly two columns.
    ValueError
        If the table contains no rows.
    '''
    fields, table = CSV.readTable(infile,
                                  with_header=args.has_headers,
                                  as_rows=True)
    if len(fields) != 2:
        raise NotImplementedError("can only work on tables with two columns")

    if len(table) == 0:
        # table[0] below would otherwise fail with an IndexError
        raise ValueError("table is empty")

    values = collections.defaultdict(list)

    # column header after which to add
    separator = table[0][0]

    # Unique row names in order of first appearance. A plain set would make
    # the insertion order into ``values`` - and hence the order of the
    # output rows - depend on string hashing, which differs between runs.
    row_names = list(dict.fromkeys(row[0] for row in table))

    added = set()

    def pad_missing():
        # give every row name without a value in the current column a
        # missing value
        for name in row_names:
            if name not in added:
                values[name].append(missing_value)

    for index, (row_name, value) in enumerate(table):
        # the very first row opens the first column and must not close one
        if index > 0 and row_name == separator:
            pad_missing()
            added.clear()
        values[row_name].append(value)
        added.add(row_name)

    # close the last column
    pad_missing()

    sizes = set(len(column_values) for column_values in values.values())
    # fails if a row name occurs more than once between two separators
    assert len(sizes) == 1, "unequal number of row_names"
    size = list(sizes)[0]

    args.stdout.write(
        "row\t%s\n" % ("\t".join(["column_%i" % x for x in range(size)])))

    for key, row in values.items():
        args.stdout.write("%s\t%s\n" % (key, "\t".join(row)))
def getGODescriptions(infile):
    '''build dictionary mapping GOids to types and descriptions.

    The table is read column-wise and must contain the columns
    ``go_type``, ``go_id`` and ``description``.

    Arguments
    ---------
    infile : string
        Filename of table with GO assignments

    Returns
    -------
    mapping : dict
        Dictionary mapping GOid to a tuple of (GOtype, GOdescription).
        If a GOid occurs several times, the last occurrence is kept.
    '''
    with iotools.open_file(infile) as inf:
        fields, table = csvutils.readTable(inf, as_rows=False)

        go_types = table[fields.index("go_type")]
        go_ids = table[fields.index("go_id")]
        descriptions = table[fields.index("description")]

        mapping = {}
        for go_type, go_id, description in zip(go_types, go_ids,
                                               descriptions):
            mapping[go_id] = (go_type, description)
        return mapping
def computeFDR(infile, args):
    '''write a table with multi-valued cells expanded over several rows.

    NOTE: despite its name, this function does not compute any FDR. Its
    body splits each cell at ``args.separator`` and writes one output row
    per value, padding shorter cells with empty strings.

    The table is read from *infile* via ``CSV.readTable`` and the result
    is written tab-separated to ``args.stdout``, preceded by the header.
    '''
    fields, table = CSV.readTable(infile,
                                  with_header=args.has_headers,
                                  as_rows=True)

    args.stdout.write("\t".join(fields) + "\n")

    width = len(fields)
    for row in table:
        split_cells = [row[i].split(args.separator) for i in range(width)]
        longest = max(map(len, split_cells))

        # pad every cell to the length of the longest one
        padded = [cell + [""] * (longest - len(cell))
                  for cell in split_cells]

        for k in range(longest):
            line = "\t".join([cell[k] for cell in padded])
            args.stdout.write(line + "\n")
def main(argv=None):
    """script main.

    Builds the command line parser, hands it to ``E.start`` and then
    attempts to join two tables whose file names are given as positional
    arguments, writing rows through a ``csv.DictWriter``.

    *argv* defaults to ``sys.argv``; note that it is not passed on to
    ``E.start`` in this function.

    NOTE(review): this function reads several names that are never
    assigned inside it (``fields1``, ``lines``, ``input_fields``) and reads
    ``fields`` before the only assignments to it. It looks like an
    incomplete port - confirm against the rest of the module before use.
    """
    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-u", "--unique", dest="unique", action="store_true",
                        help="output rows are uniq.")

    parser.set_defaults(
        remove=False,
        unique=False,
    )

    (args, unknown) = E.start(parser, add_csv_options=True, unknowns=True)

    # NOTE(review): the two file names are unpacked from ``unknown`` below,
    # while this check measures ``args`` (the parsed options). Presumably
    # ``unknown`` was meant - confirm.
    if len(args) != 2:
        raise ValueError("please specify two files to join")

    args.filename1, args.filename2 = unknown

    # NOTE(review): the opened files are not closed here, and table1/table2
    # are not used again in this function.
    table1 = readTable(iotools.open_file(args.filename1, "r"))
    table2 = readTable(iotools.open_file(args.filename2, "r"))

    # rows are written either through a UniqueBuffer wrapping sys.stdout or
    # directly to the stream set up by E.start
    if args.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = args.stdout

    # build new field list
    # NOTE(review): no --join-fields1/--join-fields2 options are added to
    # the parser in this function; presumably they are expected to come
    # from elsewhere - confirm.
    new_fields = []

    for x in args.join_fields1:
        new_fields.append(x)

    # NOTE(review): ``fields1`` is not assigned in this function. The
    # nesting of the second ``if`` was reconstructed from flattened source
    # and should be checked.
    for x in fields1:
        if x not in args.join_fields1:
            new_fields.append(x)
        if x not in args.join_fields2:
            new_fields.append(x)

    # NOTE(review): ``fields`` is a local name (assigned further down) and
    # is read here before any assignment; ``new_fields`` is built above but
    # never used - verify which one was intended.
    writer = csv.DictWriter(outfile,
                            fields,
                            dialect=args.csv_dialect,
                            lineterminator=args.csv_lineterminator,
                            extrasaction='ignore')

    # NOTE(review): ``lines`` and ``input_fields`` are not assigned in this
    # function.
    if len(lines) > 0:

        # column names of the first line, without its trailing character
        old_fields = lines[0][:-1].split("\t")

        if args.remove:
            # keep all columns except those listed in input_fields
            fields = []
            for x in old_fields:
                if x not in input_fields:
                    fields.append(x)
        else:
            fields = input_fields

        reader = csv.DictReader(lines, dialect=args.csv_dialect)

        # the header is printed to sys.stdout, rows go through ``writer``
        print("\t".join(fields))

        first_row = True
        for row in reader:
            row = iotools.convertDictionary(row)
            writer.writerow(row)

    E.stop()
def read_and_group_table(infile, args):
    """read table from infile and group.

    Rows are grouped by the column ``args.group_column`` and the values of
    the columns in ``args.columns`` are aggregated per group with the
    function selected by ``args.group_function`` (``min``, ``max``,
    ``sum``, ``mean``, ``cat``, ``uniq`` or ``stats``). The grouping itself
    is delegated to ``CSV.groupTable``.

    Values are converted to float, except for ``cat`` and ``uniq`` which
    work on strings. Cells equal to ``args.missing_value`` are passed
    through unconverted; rows containing any other value that can not be
    converted are dropped.

    The result is written tab-separated to ``args.stdout``.
    """
    fields, table = CSV.readTable(infile,
                                  with_header=args.has_headers,
                                  as_rows=True)
    args.columns = get_columns(fields, args.columns)
    assert args.group_column not in args.columns

    group_name = fields[args.group_column]
    method = args.group_function

    converter = float
    new_fields = [group_name] + [fields[x] for x in args.columns]

    if method == "min":
        f = min
    elif method == "max":
        f = max
    elif method == "sum":
        def f(z):
            return reduce(lambda a, b: a + b, z)
    elif method == "mean":
        f = numpy.mean
    elif method == "cat":
        converter = str

        def f(x):
            return ";".join([y for y in x if y != ""])
    elif method == "uniq":
        converter = str

        def f(x):
            return ";".join([y for y in set(x) if y != ""])
    elif method == "stats":
        # Stats lives in cgat-apps/CGAT
        def f(x):
            return str(Stats.DistributionalParameters(x))

        # update headers: one output column per statistic and input column
        new_fields = [group_name]
        for c in args.columns:
            new_fields.extend(
                ["%s_%s" % (fields[c], header)
                 for header in Stats.DistributionalParameters().getHeaders()])

    # convert values (except for group_column) and restrict rows to the
    # group column plus args.columns. Rows with unconvertable values are
    # dropped.
    converted_table = []
    for row in table:
        new_row = [row[args.group_column]]
        for c in args.columns:
            cell = row[c]
            if cell != args.missing_value:
                try:
                    cell = converter(cell)
                except ValueError:
                    # abandon this row; the else clause below is skipped
                    break
            new_row.append(cell)
        else:
            converted_table.append(new_row)
    table = converted_table

    new_rows = CSV.groupTable(table, group_column=0, group_function=f)

    args.stdout.write("\t".join(new_fields) + "\n")
    for row in new_rows:
        args.stdout.write("\t".join([str(value) for value in row]) + "\n")
def main(argv=None):
    """script main.

    Parses the command line and applies one operation to the table read
    from ``args.stdin``, writing the result to ``args.stdout``.

    Exactly one of the following modes is executed, tested in this order:
    remove-header only, transpose, flatten, as-column, split-fields,
    group, join, expand, collapse, randomize-rows, grep. If none of them
    is selected, the table is read column-wise, the selected columns are
    converted to float where possible and the methods given with
    ``--method`` are applied one after the other before the table is
    written out again.

    *argv* defaults to ``sys.argv``.

    Raises
    ------
    ValueError
        If the table is empty in the generic (method) mode.
    """
    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser()

    parser.add_argument("--version", action='version',
                        version='%(prog)s {version}'.format(version="1.0"))

    # "rank" is implemented below and therefore has to be selectable here
    parser.add_argument("-m", "--method", dest="methods", type=str,
                        action="append",
                        choices=("transpose", "normalize-by-max",
                                 "normalize-by-value", "multiply-by-value",
                                 "percentile", "remove-header",
                                 "normalize-by-table", "upper-bound",
                                 "lower-bound", "kullback-leibler",
                                 "expand", "compress", "fdr", "grep",
                                 "randomize-rows", "rank"),
                        help="""actions to perform on table.""")

    parser.add_argument("-s", "--scale", dest="scale", type=float,
                        help="factor to scale matrix by.")

    parser.add_argument("-f", "--format", dest="format", type=str,
                        help="output number format")

    parser.add_argument("-p", "--parameters", dest="parameters", type=str,
                        help="Parameters for various functions.")

    parser.add_argument("-t", "--header-names", dest="has_headers",
                        action="store_true",
                        help="matrix has row/column headers.")

    parser.add_argument("--transpose", dest="transpose", action="store_true",
                        help="transpose table.")

    # argparse interpolates help strings with %-formatting; the optparse
    # style "%default" makes --help fail, "%(default)s" is the argparse form.
    parser.add_argument(
        "--set-transpose-field", dest="set_transpose_field", type=str,
        help="set first field (row 1 and col 1) to this value "
        "[%(default)s].")

    parser.add_argument("--transpose-format", dest="transpose_format",
                        type=str,
                        choices=(
                            "default",
                            "separated",
                        ),
                        help="input format of un-transposed table")

    parser.add_argument(
        "--expand", dest="expand_table", action="store_true",
        help="expand table - multi-value cells with be expanded over "
        "several rows.")

    parser.add_argument("--no-headers", dest="has_headers",
                        action="store_false",
                        help="matrix has no row/column headers.")

    parser.add_argument("--columns", dest="columns", type=str,
                        help="columns to use.")

    parser.add_argument("--file", dest="file", type=str,
                        help="columns to test from table.",
                        metavar="FILE")

    parser.add_argument("-d", "--delimiter", dest="delimiter", type=str,
                        help="delimiter of columns.",
                        metavar="DELIM")

    parser.add_argument("-V", "--invert-match", dest="invert_match",
                        action="store_true",
                        help="invert match.")

    parser.add_argument("--sort-by-rows", dest="sort_rows", type=str,
                        help="output order for rows.")

    parser.add_argument("-a", "--value", dest="value", type=float,
                        help="value to use for various algorithms.")

    parser.add_argument(
        "--group", dest="group_column", type=int,
        help="group values by column. Supply an integer column ")

    parser.add_argument("--group-function", dest="group_function", type=str,
                        choices=("min", "max", "sum", "mean", "stats",
                                 "cat", "uniq"),
                        help="function to group values by.")

    parser.add_argument("--join-table", dest="join_column", type=int,
                        help="join rows in a table by columns.")

    parser.add_argument(
        "--collapse-table", dest="collapse_table", type=str,
        help="collapse a table. Value determines the missing variable ")

    parser.add_argument("--join-column-name", dest="join_column_name",
                        type=int,
                        help="use this column as a prefix.")

    parser.add_argument("--flatten-table", dest="flatten_table",
                        action="store_true",
                        help="flatten a table.")

    parser.add_argument("--as-column", dest="as_column", action="store_true",
                        help="output table as a single column.")

    parser.add_argument("--split-fields", dest="split_fields",
                        action="store_true",
                        help="split fields.")

    parser.add_argument("--separator", dest="separator", type=str,
                        help="separator for multi-valued fields.")

    parser.add_argument(
        "--fdr-method", dest="fdr_method", type=str,
        choices=("BH", "bonferroni", "holm", "hommel", "hochberg", "BY"),
        help="method to perform multiple testing correction by controlling "
        "the fdr.")

    parser.add_argument(
        "--fdr-add-column", dest="fdr_add_column", type=str,
        help="add new column instead of replacing existing columns. "
        "The value of the option will be used as prefix if there are "
        "multiple columns")

    # IMS: add option to use a column as the row id in flatten
    parser.add_argument(
        "--id-column", dest="id_column", type=str,
        help="list of column(s) to use as the row id when flattening "
        "the table. If None, then row number is used.")

    parser.add_argument(
        "--variable-name", dest="variable_name", type=str,
        help="the column header for the 'variable' column when flattening ")

    parser.add_argument(
        "--value-name", dest="value_name", type=str,
        help="the column header for the 'value' column when flattening ")

    parser.set_defaults(
        methods=[],
        scale=1.0,
        has_headers=True,
        format=None,
        value=0.0,
        parameters="",
        columns="all",
        transpose=False,
        set_transpose_field=None,
        transpose_format="default",
        group=False,
        group_column=0,
        group_function="mean",
        missing_value="na",
        sort_rows=None,
        flatten_table=False,
        collapse_table=None,
        separator=";",
        expand=False,
        join_column=None,
        join_column_name=None,
        compute_fdr=None,
        as_column=False,
        fdr_method="BH",
        fdr_add_column=None,
        id_column=None,
        variable_name="column",
        value_name="value",
        file=None,
        delimiter="\t",
        invert_match=False,
    )

    (args, unknown) = E.start(parser, unknowns=True)

    # parameters are consumed from the front by the methods that need them
    args.parameters = args.parameters.split(",")

    # --group takes a 1-based column; 0 (the default) means "no grouping"
    if args.group_column:
        args.group = True
        args.group_column -= 1

    ######################################################################
    ######################################################################
    ######################################################################
    # if only to remove header, do this quickly
    if args.methods == ["remove-header"]:

        # skip comment lines and the first non-comment line
        first = True
        for line in args.stdin:
            if line[0] == "#":
                continue
            if first:
                first = False
                continue
            args.stdout.write(line)

    elif args.transpose or "transpose" in args.methods:

        read_and_transpose_table(args.stdin, args)

    elif args.flatten_table:
        # IMS: bug fixed to make work. Also added options for keying
        # on a particular and adding custom column headings

        fields, table = CSV.readTable(args.stdin,
                                      with_header=args.has_headers,
                                      as_rows=True)

        args.columns = get_columns(fields, args.columns)

        if args.id_column:
            # 1-based column numbers on the command line
            id_columns = [int(x) - 1 for x in args.id_column.split(",")]
            id_header = "\t".join(
                [fields[id_column] for id_column in id_columns])
            # id columns are not reported as variables
            args.columns = [x for x in args.columns if x not in id_columns]
        else:
            id_columns = None
            id_header = "row"

        args.stdout.write(
            "%s\t%s\t%s\n" % (id_header, args.variable_name, args.value_name))

        for row_number, row in enumerate(table):

            if id_columns is not None:
                row_id = "\t".join([row[i] for i in id_columns])
            else:
                row_id = str(row_number)

            for y in args.columns:
                args.stdout.write(
                    "%s\t%s\t%s\n" % (row_id, fields[y], row[y]))

    elif args.as_column:

        fields, table = CSV.readTable(args.stdin,
                                      with_header=args.has_headers,
                                      as_rows=True)
        args.columns = get_columns(fields, args.columns)

        # rows -> columns
        table = list(zip(*table))

        args.stdout.write("value\n")

        for column in args.columns:
            args.stdout.write("\n".join(table[column]) + "\n")

    elif args.split_fields:

        # split comma separated fields
        fields, table = CSV.readTable(args.stdin,
                                      with_header=args.has_headers,
                                      as_rows=True)

        args.stdout.write("%s\n" % ("\t".join(fields)))

        # one output row per combination of values
        for row in table:
            row = [x.split(args.separator) for x in row]
            for d in itertools.product(*row):
                args.stdout.write("%s\n" % "\t".join(d))

    elif args.group:
        read_and_group_table(args.stdin, args)

    elif args.join_column:
        read_and_join_table(args.stdin, args)

    elif args.expand_table:
        read_and_expand_table(args.stdin, args)

    elif args.collapse_table is not None:
        read_and_collapse_table(args.stdin, args, args.collapse_table)

    elif "randomize-rows" in args.methods:
        read_and_randomize_rows(args.stdin, args)

    elif "grep" in args.methods:

        args.columns = [int(x) - 1 for x in args.columns.split(",")]

        if args.file:
            # patterns are the first column of the given file
            patterns = []
            with iotools.open_file(args.file, "r") as infile:
                for line in infile:
                    if line[0] == "#":
                        continue
                    patterns.append(line[:-1].split(args.delimiter)[0])
        else:
            # patterns are the positional arguments. (``args`` is the
            # options object, not the list of positional arguments.)
            patterns = unknown

        # set membership instead of scanning a list for every cell
        patterns = set(patterns)

        for line in args.stdin:

            data = line[:-1].split(args.delimiter)
            found = False

            for c in args.columns:
                if data[c] in patterns:
                    found = True
                    break

            # print matching lines, or non-matching ones with --invert-match
            if found != bool(args.invert_match):
                print(line[:-1])
    else:

        ######################################################################
        ######################################################################
        ######################################################################
        # Apply remainder of transformations
        fields, table = CSV.readTable(args.stdin,
                                      with_header=args.has_headers,
                                      as_rows=False)
        # convert columns to list
        table = [list(x) for x in table]

        ncols = len(fields)
        if len(table) == 0:
            raise ValueError("table is empty")

        nrows = len(table[0])

        E.info("processing table with %i rows and %i columns" %
               (nrows, ncols))

        args.columns = get_columns(fields, args.columns)

        # convert all values to float; values that are not numbers are
        # left as they are
        for c in args.columns:
            for r in range(nrows):
                try:
                    table[c][r] = float(table[c][r])
                except ValueError:
                    continue

        for method in args.methods:

            if method == "normalize-by-value":

                value = float(args.parameters[0])
                del args.parameters[0]

                for c in args.columns:
                    table[c] = [x / value for x in table[c]]

            elif method == "multiply-by-value":

                value = float(args.parameters[0])
                del args.parameters[0]

                for c in args.columns:
                    table[c] = [x * value for x in table[c]]

            elif method == "normalize-by-max":

                for c in args.columns:
                    m = max(table[c])
                    table[c] = [x / m for x in table[c]]

            elif method == "kullback-leibler":
                # pairwise comparison of all selected columns; this method
                # writes its own output and terminates the script
                args.stdout.write("category1\tcategory2\tkl1\tkl2\tmean\n")

                number_format = args.format
                if number_format is None:
                    number_format = "%f"

                for x in range(0, len(args.columns) - 1):
                    for y in range(x + 1, len(args.columns)):
                        c1 = args.columns[x]
                        c2 = args.columns[y]
                        e1 = 0
                        e2 = 0
                        for z in range(nrows):
                            p = table[c1][z]
                            q = table[c2][z]
                            e1 += p * math.log(p / q)
                            e2 += q * math.log(q / p)

                        args.stdout.write(
                            "%s\t%s\t%s\t%s\t%s\n" %
                            (fields[c1], fields[c2],
                             number_format % e1,
                             number_format % e2,
                             number_format % ((e1 + e2) / 2)))
                E.stop()
                sys.exit(0)

            elif method == "rank":

                # replace each value by its 0-based rank within its column;
                # ties are broken by row order
                for c in args.columns:
                    tt = table[c]
                    ordered = sorted(zip(tt, range(nrows)))
                    for rank, (_, i) in enumerate(ordered):
                        tt[i] = rank

            elif method in ("lower-bound", "upper-bound"):

                boundary = float(args.parameters[0])
                del args.parameters[0]
                new_value = float(args.parameters[0])
                del args.parameters[0]

                # only numeric cells are compared and replaced
                if method == "upper-bound":
                    for c in args.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] > boundary:
                                table[c][r] = new_value
                else:
                    for c in args.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] < boundary:
                                table[c][r] = new_value

            elif method == "fdr":
                # the p-values of all selected columns are corrected together
                pvalues = []
                for c in args.columns:
                    pvalues.extend(table[c])

                assert max(pvalues) <= 1.0, "pvalues > 1 in table: max=%s" % \
                    str(max(pvalues))
                assert min(pvalues) >= 0, "pvalue < 0 in table: min=%s" % \
                    str(min(pvalues))

                # convert to str to avoid test for float downstream
                qvalues = list(
                    map(str,
                        Stats.adjustPValues(pvalues, method=args.fdr_method)))

                if args.fdr_add_column is None:
                    # replace the p-values in place
                    x = 0
                    for c in args.columns:
                        table[c] = qvalues[x:x + nrows]
                        x += nrows
                else:
                    # add new column headers
                    if len(args.columns) == 1:
                        fields.append(args.fdr_add_column)
                    else:
                        # prefix the name of each corrected column
                        for co in args.columns:
                            fields.append(args.fdr_add_column + fields[co])

                    x = 0
                    for c in args.columns:
                        # add a new column
                        table.append(qvalues[x:x + nrows])
                        x += nrows
                    ncols += len(args.columns)

            elif method == "normalize-by-table":

                other_table_name = args.parameters[0]
                del args.parameters[0]

                with iotools.open_file(other_table_name, "r") as other_file:
                    other_fields, other_table = CSV.readTable(
                        other_file,
                        with_header=args.has_headers,
                        as_rows=False)
                    # columns must be lists to permit the in-place
                    # conversion below (as for the main table)
                    other_table = [list(x) for x in other_table]

                # convert all values to float
                for c in args.columns:
                    for r in range(nrows):
                        try:
                            other_table[c][r] = float(other_table[c][r])
                        except ValueError:
                            continue

                # divide cell by cell; cells that are not numeric in both
                # tables or have a zero divisor become the missing value
                for c in args.columns:
                    for r in range(nrows):
                        if isinstance(table[c][r], float) and \
                           isinstance(other_table[c][r], float) and \
                           other_table[c][r] != 0:
                            table[c][r] /= other_table[c][r]
                        else:
                            table[c][r] = args.missing_value

        # convert back
        if args.format is not None:
            for c in args.columns:
                for r in range(nrows):
                    if isinstance(table[c][r], float):
                        table[c][r] = args.format % table[c][r]

        args.stdout.write("\t".join(fields) + "\n")
        if args.sort_rows:
            # output rows in the requested order, keyed on the first
            # column; unknown row names are skipped
            old2new = {}
            for r in range(nrows):
                old2new[table[0][r]] = r
            for x in args.sort_rows.split(","):
                if x not in old2new:
                    continue
                r = old2new[x]
                args.stdout.write(
                    "\t".join(map(str, [table[c][r]
                                        for c in range(ncols)])) + "\n")
        else:
            for r in range(nrows):
                args.stdout.write(
                    "\t".join(map(str, [table[c][r]
                                        for c in range(ncols)])) + "\n")

    E.stop()
def main(argv=None):
    """script main.

    Performs a set operation on the rows of two tables given as
    positional file names. Rows are matched on the join fields given
    with ``--join-fields1`` and ``--join-fields2``:

    * ``intersection`` - rows of the second table with a matching row in
      the first table; output is the row of the first table followed by
      the non-join columns of the second table.
    * ``rest`` - rows of the second table without a matching row in the
      first table.

    ``union`` is accepted on the command line but no output is produced
    for it by this function.

    If several rows of the first table share the same key, only the last
    one is used.

    *argv* defaults to ``sys.argv``.

    Raises
    ------
    ValueError
        If not exactly two files are given or join fields are missing.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_set.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="output rows are uniq.")

    parser.add_option("-1", "--join-fields1", dest="join_fields1",
                      type="string",
                      help="join fields in first table.")

    parser.add_option("-2", "--join-fields2", dest="join_fields2",
                      type="string",
                      help="join fields in second table.")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      help="set operation to perform.",
                      choices=("intersection", "rest", "union"))

    parser.set_defaults(
        remove=False,
        unique=False,
        join_fields1=None,
        join_fields2=None,
        method="intersection",
    )

    (options, args) = E.start(parser, add_csv_options=True)

    if len(args) != 2:
        raise ValueError("please specify two files to join")

    if not options.join_fields1 or not options.join_fields2:
        raise ValueError("please specify at least one join field per table")

    options.join_fields1 = options.join_fields1.split(",")
    options.join_fields2 = options.join_fields2.split(",")

    options.filename1, options.filename2 = args

    fields1, table1 = readTable(open(options.filename1, "r"))
    fields2, table2 = readTable(open(options.filename2, "r"))

    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    # column indices of the join fields, in column order of each table
    nfields1 = [index for index, name in enumerate(fields1)
                if name in options.join_fields1]
    nfields2 = [index for index, name in enumerate(fields2)
                if name in options.join_fields2]

    # calculate row indices: double keys are not taken care of here
    #
    # The tuple of join values is used directly as key. Hashing the
    # concatenated values with md5 fails on str input under Python 3 and
    # would treat ("ab", "c") and ("a", "bc") as the same key.
    keys = {}
    for row1 in table1:
        keys[tuple(row1[x] for x in nfields1)] = row1

    if options.method == "intersection":
        # build new field list: all columns of table 1 followed by the
        # non-join columns of table 2 (indices into the concatenated row)
        take = list(range(len(fields1)))
        offset = len(fields1)
        for position, name in enumerate(fields2):
            if name not in options.join_fields2:
                take.append(offset + position)

        t = fields1 + fields2

        new_fields = [t[x] for x in take]

        print("\t".join(new_fields))

        for row2 in table2:
            key = tuple(row2[x] for x in nfields2)
            if key in keys:
                new_row = keys[key] + row2
                outfile.write("\t".join([new_row[x] for x in take]) + "\n")

    elif options.method == "rest":

        new_fields = fields2
        print("\t".join(new_fields))

        for row2 in table2:
            key = tuple(row2[x] for x in nfields2)
            if key not in keys:
                outfile.write("\t".join(row2) + "\n")

    E.stop()