def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: codonbias_weights2tsv.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("--methods", dest="methods", type="string",
                      help="methods to apply.")

    parser.add_option("--is-frequencies", dest="is_frequencies",
                      action="store_true",
                      help="data is frequencies (default: weights).")

    parser.add_option("-s", "--sort", dest="sort", type="choice",
                      choices=("percent-difference", "aa"),
                      help="sort order of output table.")

    parser.add_option("-g", "--global-sort", dest="global_sort",
                      action="store_true",
                      help="globally sort results (otherwise: by species pair).")

    parser.set_defaults(
        methods="",
        is_frequencies=False,
        sort="percent-difference",
        global_sort=False,
    )

    (options, args) = E.Start(parser)

    if options.methods:
        options.methods = options.methods.split(",")

    fields, table = CSV.ReadTable(sys.stdin)

    # convert weights to floats
    table = CSV.getConvertedTable(table, range(1, len(fields)))

    for method in options.methods:
        if method == "overview":
            if options.is_frequencies:
                WriteOverviewFrequencies(fields, table, options)
            else:
                WriteOverviewWeights(fields, table, options)
def readAndExpandTable(infile, options):
    '''split multi-valued fields in a table at a separator.

    If a field in a row contains multiple values, the row is expanded
    into multiple rows so that each value appears on its own row.
    '''

    fields, table = CSV.ReadTable(
        infile, with_header=options.has_headers, as_rows=True)

    options.stdout.write("\t".join(fields) + "\n")

    for row in table:

        data = []
        for x in range(len(fields)):
            data.append(row[x].split(options.separator))

        nrows = max([len(d) for d in data])

        # pad fields with fewer values so all columns have equal depth
        for d in data:
            d += [""] * (nrows - len(d))

        for n in range(nrows):
            options.stdout.write("\t".join([d[n] for d in data]) + "\n")
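
# A minimal, self-contained sketch of the expansion performed by
# readAndExpandTable, without the CSV/options machinery. The sample rows
# and the ";" separator are illustrative assumptions, not part of the
# original script.
def _demo_expand():
    fields = ["gene", "ids"]
    rows = [["g1", "a;b;c"], ["g2", "d"]]
    print("\t".join(fields))
    for row in rows:
        data = [x.split(";") for x in row]
        nrows = max(len(d) for d in data)
        data = [d + [""] * (nrows - len(d)) for d in data]
        for n in range(nrows):
            print("\t".join(d[n] for d in data))
    # g1 expands to three rows (a, b, c); g2 stays a single row.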
def readAndGroupTable(infile, options):
    """read table from infile and group values by a column."""

    fields, table = CSV.ReadTable(
        infile, with_header=options.has_headers, as_rows=True)
    options.columns = getColumns(fields, options.columns)
    assert options.group_column not in options.columns

    converter = float
    new_fields = [fields[options.group_column]] + \
        [fields[x] for x in options.columns]

    if options.group_function == "min":
        f = min
    elif options.group_function == "max":
        f = max
    elif options.group_function == "sum":
        f = lambda z: reduce(lambda x, y: x + y, z)
    elif options.group_function == "mean":
        f = scipy.mean
    elif options.group_function == "cat":
        f = lambda x: ";".join([y for y in x if y != ""])
        converter = str
    elif options.group_function == "uniq":
        f = lambda x: ";".join([y for y in set(x) if y != ""])
        converter = str
    elif options.group_function == "stats":
        f = lambda x: str(Stats.DistributionalParameters(x))
        # update headers: one column per distributional parameter
        new_fields = [fields[options.group_column]]
        for c in options.columns:
            new_fields += list(map(lambda x: "%s_%s" % (fields[c], x),
                                   Stats.DistributionalParameters().getHeaders()))

    # convert values to floats (except for group_column);
    # delete rows with unconvertible values in options.columns
    new_table = []
    for row in table:
        skip = False
        new_row = [row[options.group_column]]

        for c in options.columns:
            if row[c] == options.missing_value:
                new_row.append(row[c])
            else:
                try:
                    new_row.append(converter(row[c]))
                except ValueError:
                    skip = True
                    break
        if not skip:
            new_table.append(new_row)
    table = new_table

    new_rows = CSV.GroupTable(table, group_column=0, group_function=f)

    options.stdout.write("\t".join(new_fields) + "\n")
    for row in new_rows:
        options.stdout.write("\t".join(map(str, row)) + "\n")
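
# Hedged sketch of what CSV.GroupTable is expected to do for the simple
# aggregators above, shown with plain dictionaries. The column layout and
# the sample aggregator (sum) are assumptions for illustration.
import collections

def _demo_group(rows, group_function=sum):
    groups = collections.defaultdict(list)
    for row in rows:
        groups[row[0]].append(row[1:])   # key on the group column
    for key in sorted(groups):
        columns = zip(*groups[key])      # regroup values per column
        yield [key] + [group_function(c) for c in columns]

# list(_demo_group([["a", 1], ["a", 2], ["b", 5]])) -> [['a', 3], ['b', 5]]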
def getGODescriptions(infile):
    '''return dictionary mapping GO category to description and namespace.
    '''

    with IOTools.openFile(infile) as inf:
        fields, table = CSV.ReadTable(inf, as_rows=False)

    return dict([(y, (x, z)) for x, y, z in zip(
        table[fields.index("go_type")],
        table[fields.index("go_id")],
        table[fields.index("description")])])
def readAndJoinTable(infile, options):

    fields, table = CSV.ReadTable(
        infile, with_header=options.has_headers, as_rows=True)

    join_column = options.join_column - 1
    join_name = options.join_column_name - 1

    join_rows = list(set(map(lambda x: x[join_column], table)))
    join_rows.sort()

    join_names = list(set(map(lambda x: x[join_name], table)))
    join_names.sort()

    join_columns = list(
        set(range(len(fields))).difference(set((join_column, join_name))))
    join_columns.sort()

    new_table = []
    map_old2new = {}

    map_name2start = {}
    x = 1
    for name in join_names:
        map_name2start[name] = x
        x += len(join_columns)

    row_width = len(join_columns) * len(join_names)
    for x in join_rows:
        map_old2new[x] = len(map_old2new)
        new_row = [x, ] + ["na"] * row_width
        new_table.append(new_row)

    for row in table:
        row_index = map_old2new[row[join_column]]
        start = map_name2start[row[join_name]]
        for x in join_columns:
            new_table[row_index][start] = row[x]
            start += 1

    # print new table
    options.stdout.write(fields[join_column])
    for name in join_names:
        for column in join_columns:
            options.stdout.write(
                "\t%s%s%s" % (name, options.separator, fields[column]))
    options.stdout.write("\n")

    for row in new_table:
        options.stdout.write("\t".join(row) + "\n")
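
# Hedged sketch of the pivot that readAndJoinTable performs: rows keyed by
# one column are widened so that each distinct value of a second column
# contributes its own block of columns. The (key, name, value) sample data
# is an assumption for illustration.
def _demo_pivot(rows):
    keys = sorted(set(r[0] for r in rows))
    names = sorted(set(r[1] for r in rows))
    table = dict((k, dict.fromkeys(names, "na")) for k in keys)
    for key, name, value in rows:
        table[key][name] = value
    print("\t".join(["key"] + names))
    for k in keys:
        print("\t".join([k] + [table[k][n] for n in names]))

# _demo_pivot([("r1", "A", "1"), ("r1", "B", "2"), ("r2", "A", "3")])
# prints r2's missing "B" cell as "na".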
def readAndCollapseTable(infile, options, missing_value=""):
    '''collapse a table.

    Collapse a table of two columns with row names in the first column.
    Outputs a table with multiple columns for each row name.
    '''

    fields, table = CSV.ReadTable(
        infile, with_header=options.has_headers, as_rows=True)

    if len(fields) != 2:
        raise NotImplementedError("can only work on tables with two columns")

    values = collections.defaultdict(list)

    # the first row name marks the start of each new block of values
    separator = table[0][0]
    row_names = set([x[0] for x in table])

    row_name, value = table[0]

    values[row_name].append(value)
    added = set([row_name])

    for row_name, value in table[1:]:
        if row_name == separator:
            # a new block starts: pad row names missing from the last block
            for r in row_names:
                if r not in added:
                    values[r].append(missing_value)
            added = set()

        values[row_name].append(value)
        added.add(row_name)

    # pad the final block
    for r in row_names:
        if r not in added:
            values[r].append(missing_value)

    sizes = set([len(x) for x in values.values()])
    assert len(sizes) == 1, "unequal number of row_names"
    size = list(sizes)[0]

    options.stdout.write(
        "row\t%s\n" % ("\t".join(["column_%i" % x for x in range(size)])))

    for key, row in values.items():
        options.stdout.write("%s\t%s\n" % (key, "\t".join(row)))
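
# Hedged illustration of the collapse: blocks are delimited by the
# reappearance of the first row name, and each block becomes one output
# column. The sample two-column table below is an assumption.
#
#   input rows            output
#   a  1                  row  column_0  column_1
#   b  2         ->       a    1         3
#   a  3                  b    2         <missing_value>
#
# Row names absent from a block are padded with the missing value.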
def computeFDR(infile, options):
    '''compute FDR on a table.
    '''

    fields, table = CSV.ReadTable(
        infile, with_header=options.has_headers, as_rows=True)

    options.stdout.write("\t".join(fields) + "\n")

    # NOTE: the body below duplicates readAndExpandTable and performs no
    # FDR computation; the original FDR code appears to have been lost.
    # See the sketch after this function for what a Benjamini-Hochberg
    # correction could look like.
    for row in table:

        data = []
        for x in range(len(fields)):
            data.append(row[x].split(options.separator))

        nrows = max([len(d) for d in data])

        for d in data:
            d += [""] * (nrows - len(d))

        for n in range(nrows):
            options.stdout.write("\t".join([d[n] for d in data]) + "\n")
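
# A minimal sketch of a Benjamini-Hochberg correction, assuming p-values
# arrive as a flat list of floats. This is illustrative only and is not
# the lost body of computeFDR above; the cumulative-minimum step mirrors
# what Stats.adjustPValues(..., method="BH") is used for elsewhere.
def _bh_qvalues(pvalues):
    n = len(pvalues)
    order = sorted(range(n), key=lambda i: pvalues[i], reverse=True)
    qvalues = [0.0] * n
    running_min = 1.0
    for rank, i in enumerate(order):
        # rank 0 is the largest p-value, so its BH factor is n / n
        q = pvalues[i] * n / (n - rank)
        running_min = min(running_min, q)
        qvalues[i] = running_min
    return qvalues

# _bh_qvalues([0.01, 0.04, 0.03, 0.005]) -> [0.02, 0.04, 0.04, 0.02]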
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/analyze_sites_slr.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("--method", dest="method", type="choice",
                      choices=("summary-slr", "summary-filtered",
                               "over-representation",
                               "positive-site-table", "negative-site-table",
                               "neutral-site-table",
                               "positive-site-list", "negative-site-list",
                               "neutral-site-list"),
                      help="method to apply.")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix for rows.")

    parser.add_option("-s", "--filename-sites", dest="filename_sites",
                      type="string",
                      help="filename with sites information.")

    parser.add_option("-l", "--filename-log", dest="filename_log",
                      type="string",
                      help="filename with logging information.")

    parser.add_option("-m", "--filename-mali", dest="filename_mali",
                      type="string",
                      help="filename of multiple alignment that was input to "
                      "SLR. If given, it is used to filter indels.")

    parser.add_option("--filter-probability", dest="filter_probability",
                      type="float",
                      help="threshold for probability above which to include "
                      "positive sites.")

    parser.add_option("--no-header", dest="write_header", action="store_false",
                      help="do not output the header.")

    parser.add_option("--only-header", dest="only_header", action="store_true",
                      help="only output the header.")

    parser.add_option("--significance-threshold",
                      dest="significance_threshold", type="float",
                      help="threshold for significance tests [%default].")

    parser.add_option("--use-adjusted", dest="use_adjusted",
                      action="store_true",
                      help="use SLR adjusted probability values.")

    parser.add_option("--truncate-sites-list", dest="truncate_sites_list",
                      type="int",
                      help="truncate sites list after ## entries (0 for all).")

    parser.add_option("--context-size", dest="context_size", type="int",
                      help="size of left/right context around a selected "
                      "residue.")

    parser.set_defaults(
        prefix=None,
        filter_probability=0,
        filter_omega=0,
        filename_sites="-",
        filename_log=None,
        filename_mali=None,
        significance_threshold=0.05,
        write_header=True,
        only_header=False,
        use_adjusted=False,
        context_size=0,
        truncate_sites_list=0,
    )

    (options, args) = E.Start(parser)

    slr = WrapperSlr.Slr()

    # write headers
    if "%s" in options.filename_sites:
        options.prefix = True

    if options.method == "summary-slr":

        if options.write_header or options.only_header:
            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Numbers of positive/neutral/negative sites according to SLR
#
# This uses the thresholds as set in SLR. Use "counts" for filtering
# residues based on your own thresholds
""")
            thresholds = "95%", "99%", "95% corrected", "99% corrected"

            if options.prefix:
                options.stdout.write("prefix\t")

            options.stdout.write(
                "ltree\tomega\tkappa\tlnL\tnsites\tnsyn\tngap\t")
            options.stdout.write("\t".join(
                map(lambda x: "npos_" + x.replace(" ", "_"), thresholds)))
            options.stdout.write("\t")
            options.stdout.write("\t".join(
                map(lambda x: "nneg_" + x.replace(" ", "_"), thresholds)))
            options.stdout.write("\n")

    elif options.method == "summary-filtered":

        if options.write_header or options.only_header:
            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Numbers of positive/neutral/negative sites according to SLR
#
# This method uses the supplied threshold and the multiple alignment to filter.
# All positions that are above the threshold (P-Value) and which are located in
# indels (>= 1 sequence missing from a column) are removed.
""")

            if options.prefix:
                options.stdout.write("prefix\t")

            options.stdout.write(
                "ltree\tomega\tkappa\tlnL\tnsites\tnfiltered\tntotal\tnsyn\tnneg\tnpos\n")

    elif options.method in ("positive-site-table", "negative-site-table",
                            "neutral-site-table"):

        if options.write_header or options.only_header:
            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Numbers of positive/neutral/negative sites according to SLR
#
# Note: sequence positions are 1-based, but mali positions are 0-based.
# Residues in indel positions have been removed and significance was
# determined with a threshold of %5.2e
""" % options.significance_threshold)

            if options.prefix:
                options.stdout.write("prefix\t")

            options.stdout.write("cluster\tnsites\tp-value\tsites\n")

    elif options.method in ("positive-site-list", "negative-site-list",
                            "neutral-site-list"):

        if options.write_header or options.only_header:
            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Sites under positive/neutral/negative selection according to SLR
#
# Note: sequence positions are 1-based, but mali positions are 0-based.
# Residues in indel positions have been removed and significance was
# determined with a threshold of %5.2e
""" % options.significance_threshold)

            if options.prefix:
                options.stdout.write("prefix\t")

            options.stdout.write(
                "sequence\tn\taa\tseq_pos\tmali_pos\tcontext\n")

    elif options.method == "over-representation":

        if options.write_header or options.only_header:
            if options.loglevel >= 1:
                options.stdlog.write("""# Genes with over-represented sites.
#
# This method uses as input the output of summary-filtered.
""")

    if options.only_header:
        sys.exit(0)

    if options.method in ("summary-slr", "summary-filtered",
                          "positive-site-table", "negative-site-table",
                          "neutral-site-table",
                          "positive-site-list", "negative-site-list",
                          "neutral-site-list"):

        ninput, noutput, nskipped = 0, 0, 0

        if "%s" in options.filename_sites:

            headers, table = CSV.ReadTable(sys.stdin)

            fprefix = headers.index("prefix")

            try:
                fsignificance = headers.index("p")
            except ValueError:
                fsignificance = None

            for row in table:

                id = row[fprefix]
                if fsignificance is not None:
                    p_value = row[fsignificance]
                else:
                    p_value = None

                ninput += 1

                fn = re.sub("%s", id, options.filename_sites)
                if not os.path.exists(fn):
                    nskipped += 1
                    continue

                lines_sites = open(fn, "r").readlines()
                if options.filename_log:
                    lines_log = open(
                        re.sub("%s", id, options.filename_log),
                        "r").readlines()

                result = slr.parseOutput(lines_sites, lines_log)

                if options.method in ("summary-filtered",
                                      "positive-site-table",
                                      "negative-site-table",
                                      "neutral-site-table"):
                    mali = Mali.Mali()
                    mali.readFromFile(
                        open(re.sub("%s", id, options.filename_mali), "r"))
                else:
                    mali = None

                ProcessResult(result, options, mali,
                              prefix=id, p_value=p_value)
                noutput += 1
        else:
            if options.filename_sites == "-":
                lines_sites = sys.stdin.readlines()
            else:
                lines_sites = open(options.filename_sites, "r").readlines()

            ninput += 1

            if options.filename_log:
                lines_log = open(options.filename_log, "r").readlines()

            result = slr.parseOutput(lines_sites, lines_log)

            if options.filename_mali:
                mali = Mali.Mali()
                mali.readFromFile(open(options.filename_mali, "r"))
            else:
                if options.method == "summary-filtered":
                    raise ValueError(
                        "please supply a multiple alignment for filtering.")
                mali = None

            ProcessResult(result, options, mali, prefix=options.prefix)
            noutput += 1

        if options.loglevel >= 1:
            options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i.\n" %
                                 (ninput, noutput, nskipped))
    else:
        if options.method == "over-representation":

            results = []
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                data = line[:-1].split("\t")
                if data[0] == "prefix":
                    continue
                results.append(Result(data[0],
                                      int(data[6]), int(data[7]),
                                      int(data[8]), int(data[9]),
                                      int(data[10])))

            # probability of a single site being positive
            ntotal = sum(map(lambda x: x.mNTotal, results))
            npositives = sum(map(lambda x: x.mNPositive, results))
            p = float(npositives) / float(ntotal)

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sites: total=%i, positive=%i, p=%f\n" %
                    (ntotal, npositives, p))

            new_results = []
            for result in results:
                if result.mNTotal == 0:
                    continue

                # use -1, because we need P(x >= X):
                # sf = 1 - cdf and cdf = P(x <= X),
                # thus sf = 1 - P(x <= X) = P(x > X).
                r = scipy.stats.binom.sf(
                    result.mNPositive - 1, result.mNTotal, p)

                result.mSignificance = r

                if r < options.significance_threshold:
                    new_results.append(result)

            new_results.sort(
                lambda x, y: cmp(x.mSignificance, y.mSignificance))

            options.stdlog.write(Result().getHeader() + "\n")
            for result in new_results:
                options.stdout.write(str(result) + "\n")

            if options.loglevel >= 1:
                options.stdlog.write("# ntotal=%i, npos=%i\n" %
                                     (len(results), len(new_results)))

    E.Stop()
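
# Hedged sketch of the over-representation test above: the probability of
# seeing at least k positive sites among n, given a genome-wide per-site
# rate p, via the binomial survival function. The numbers are illustrative
# assumptions.
from scipy.stats import binom

def _demo_overrepresentation(k=8, n=100, p=0.02):
    # sf(k - 1, n, p) = P(X >= k) for X ~ Binomial(n, p)
    return binom.sf(k - 1, n, p)

# _demo_overrepresentation() is roughly 1e-3: 8/100 positive sites is very
# unlikely under a 2% background rate, so the gene would pass a 0.05 cutoff.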
def main():

    parser = E.OptionParser(
        version="%prog version: $Id: analyze_readpositions.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--output-filename-pattern",
                      dest="output_filename_pattern", type="string",
                      help="pattern for additional output files [%default].")

    parser.set_defaults(
        length=1000,
        minimum_coverage=0.90,
        maximum_reads=[1, 10, 20, 50, 100],
        output_filename_pattern="%s",
        normalize=True,
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    fields, table = CSV.ReadTable(sys.stdin, dictreader=CSV.DictReaderLarge)

    map_fields2column = {}
    for x in fields:
        map_fields2column[x] = len(map_fields2column)

    coverage_5prime = numpy.zeros(options.length, numpy.float)
    coverage_3prime = numpy.zeros(options.length, numpy.float)

    coverage_maxreads5prime = numpy.zeros(options.length, numpy.float)
    coverage_maxreads3prime = numpy.zeros(options.length, numpy.float)

    coverage_full5prime = numpy.zeros(options.length, numpy.float)
    coverage_full3prime = numpy.zeros(options.length, numpy.float)

    coverage_min5prime = numpy.zeros(options.length, numpy.float)
    coverage_min3prime = numpy.zeros(options.length, numpy.float)

    histograms = []
    for x in range(len(options.maximum_reads)):
        histograms.append([numpy.zeros(options.length, numpy.float),
                           numpy.zeros(options.length, numpy.float),
                           0])

    ninput, noutput, nfull, nmincov, nskipped, nlength, nmaxreads = \
        0, 0, 0, 0, 0, 0, 0

    for row in table:
        length, covered, meancov, data, nreads = (
            int(row["cov_nval"]),
            float(row["cov_covered"]),
            float(row["cov_mean"]),
            row["cov_values"],
            int(row["nover2"]))

        ninput += 1
        if length < options.length:
            nlength += 1
            continue

        if data == "na":
            nskipped += 1
            continue

        noutput += 1
        mincov = covered / length
        values = map(float, data.split(";"))
        m = max(values)
        values = [x / m for x in values]
        coverage_5prime += values[0:1000]
        coverage_3prime += values[-1000:]

        if mincov >= 1.0:
            coverage_full5prime += values[0:1000]
            coverage_full3prime += values[-1000:]
            nfull += 1

        if meancov >= options.minimum_coverage:
            coverage_min5prime += values[0:1000]
            coverage_min3prime += values[-1000:]
            nmincov += 1

        for maxreads in range(len(options.maximum_reads)):
            if nreads <= options.maximum_reads[maxreads]:
                histograms[maxreads][0] += values[0:1000]
                histograms[maxreads][1] += values[-1000:]
                histograms[maxreads][2] += 1

    if options.normalize:
        for x5, x3 in ((coverage_5prime, coverage_3prime),
                       (coverage_min5prime, coverage_min3prime),
                       (coverage_full5prime, coverage_full3prime)):
            m = max((max(x5), max(x3)))
            x3 /= m
            x5 /= m

        for x5, x3, c in histograms:
            m = max((max(x5), max(x3)))
            x5 /= m
            x3 /= m

    outfile = options.stdout
    outfile.write("\t".join(("distance",
                             "minlen-5'", "minlen-3'",
                             "mincov-5'", "mincov-3'",
                             "full-5'", "full-3'")) + "\n")

    for x in range(0, options.length):
        outfile.write("\t".join(
            ["%6.4f" % v for v in
             (x,
              coverage_5prime[x], coverage_3prime[x],
              coverage_min5prime[x], coverage_min3prime[x],
              coverage_full5prime[x], coverage_full3prime[x])]) + "\n")

    outfile5 = open(options.output_filename_pattern % "reads5", "w")
    outfile3 = open(options.output_filename_pattern % "reads3", "w")

    outfile5.write("\t".join(["distance", ] +
                             ["reads%i" % options.maximum_reads[y]
                              for y in range(len(options.maximum_reads))]) + "\n")
    outfile3.write("\t".join(["distance", ] +
                             ["reads%i" % options.maximum_reads[y]
                              for y in range(len(options.maximum_reads))]) + "\n")

    for x in range(0, options.length):
        outfile5.write("%i\t%s\n" % (x, "\t".join(
            ["%6.4f" % histograms[y][0][x]
             for y in range(len(options.maximum_reads))])))
        outfile3.write("%i\t%s\n" % (x, "\t".join(
            ["%6.4f" % histograms[y][1][x]
             for y in range(len(options.maximum_reads))])))

    E.info("ninput=%i, noutput=%i, nmaxreads=%i, nfull=%i, nmincov=%i, "
           "nskipped=%i, nlength=%i" %
           (ninput, noutput, nmaxreads, nfull, nmincov, nskipped, nlength))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_intersection.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="output rows are unique.")

    parser.set_defaults(
        remove=False,
        unique=False,
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    if len(args) != 2:
        raise ValueError("please specify two files to join.")

    options.filename1, options.filename2 = args

    table1 = CSV.ReadTable(open(options.filename1, "r"))
    table2 = CSV.ReadTable(open(options.filename2, "r"))

    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    # WARNING: the remainder of this function references names that are
    # never defined here (options.join_fields1, options.join_fields2,
    # fields1, lines, input_fields) and appears to have been spliced in
    # from another script; it will raise NameError if reached.

    # build new field list
    new_fields = []

    for x in options.join_fields1:
        new_fields.append(x)

    for x in fields1:
        if x not in options.join_fields1:
            new_fields.append(x)
        if x not in options.join_fields2:
            new_fields.append(x)

    writer = csv.DictWriter(outfile,
                            fields,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    if len(lines) > 0:

        old_fields = lines[0][:-1].split("\t")

        if options.remove:
            fields = []
            for x in old_fields:
                if x not in input_fields:
                    fields.append(x)
        else:
            fields = input_fields

        reader = csv.DictReader(lines, dialect=options.csv_dialect)

        print "\t".join(fields)

        first_row = True
        for row in reader:
            row = CSV.ConvertDictionary(row)
            writer.writerow(row)

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_set.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="output rows are unique.")

    parser.add_option("-1", "--join-fields1", dest="join_fields1",
                      type="string",
                      help="join fields in first table.")

    parser.add_option("-2", "--join-fields2", dest="join_fields2",
                      type="string",
                      help="join fields in second table.")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      help="set operation to perform.",
                      choices=("intersection", "rest", "union"))

    parser.set_defaults(
        remove=False,
        unique=False,
        join_fields1=None,
        join_fields2=None,
        method="intersection",
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    if len(args) != 2:
        raise ValueError("please specify two files to join.")

    if not options.join_fields1 or not options.join_fields2:
        raise ValueError("please specify at least one join field per table.")

    options.join_fields1 = options.join_fields1.split(",")
    options.join_fields2 = options.join_fields2.split(",")

    options.filename1, options.filename2 = args

    fields1, table1 = CSV.ReadTable(open(options.filename1, "r"))
    fields2, table2 = CSV.ReadTable(open(options.filename2, "r"))

    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    nfields1 = []
    for x in range(len(fields1)):
        if fields1[x] in options.join_fields1:
            nfields1.append(x)
    nfields2 = []
    for x in range(len(fields2)):
        if fields2[x] in options.join_fields2:
            nfields2.append(x)

    # calculate row indices: double keys are not taken care of here
    keys = {}
    for row1 in table1:
        v = map(lambda x: row1[x], nfields1)
        key = hashlib.md5("".join(v)).digest()
        keys[key] = row1

    if options.method == "intersection":
        # build new field list
        take = range(len(fields1))
        c = len(take)
        for x in fields2:
            if x not in options.join_fields2:
                take.append(c)
            c += 1

        t = fields1 + fields2

        new_fields = map(lambda x: t[x], take)
        print "\t".join(new_fields)

        for row2 in table2:
            v = map(lambda x: row2[x], nfields2)
            key = hashlib.md5("".join(v)).digest()
            if key in keys:
                new_row = keys[key] + row2
                outfile.write(
                    "\t".join(map(lambda x: new_row[x], take)) + "\n")

    elif options.method == "rest":
        new_fields = fields2
        print "\t".join(new_fields)

        for row2 in table2:
            v = map(lambda x: row2[x], nfields2)
            key = hashlib.md5("".join(v)).digest()
            if key not in keys:
                outfile.write("\t".join(row2) + "\n")

    E.Stop()
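
# Hedged sketch of the same keyed set operations using plain tuples as
# dictionary keys. Tuples are hashable, so the md5 digest above is not
# strictly needed; joining values with "" can also collide ("ab" + "c"
# vs "a" + "bc"), which tuple keys avoid. Sample rows are assumptions.
def _demo_set_ops(table1, table2, key_columns):
    key = lambda row: tuple(row[c] for c in key_columns)
    keys1 = dict((key(row), row) for row in table1)
    intersection = [keys1[key(row)] + row
                    for row in table2 if key(row) in keys1]
    rest = [row for row in table2 if key(row) not in keys1]
    return intersection, rest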
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: data2bins.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--column", dest="column", type="int",
                      help="column to split on.")

    parser.add_option("--num-bins", dest="num_bins", type="int",
                      help="number of bins to create.")

    parser.add_option("--method", dest="method", type="choice",
                      choices=("equal-sized-bins", ),
                      help="method to use to bin data.")

    parser.add_option("--no-headers", dest="has_headers",
                      action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("-p", "--output-filename-pattern",
                      dest="output_filename_pattern", type="string",
                      help="OUTPUT filename with histogram information on "
                      "aggregate coverages [%default].")

    parser.set_defaults(
        has_headers=True,
        method="equal-sized-bins",
        column=1,
        num_bins=4,
        output_filename_pattern="bin%i",
    )

    (options, args) = E.Start(parser)
    options.column -= 1

    if args:
        if args[0] == "-":
            infile = sys.stdin
        else:
            infile = open(args[0], "r")
    else:
        infile = sys.stdin

    fields, data = CSV.ReadTable(infile)

    c = options.column

    values = [float(x[c]) for x in data]

    bins = []
    if options.method == "equal-sized-bins":
        increment = int(math.floor(float(len(values)) / options.num_bins))
        indices = range(0, len(values))
        indices.sort(key=lambda x: values[x])
        # replace each value by its rank
        for x in xrange(len(values)):
            values[indices[x]] = x
        bins = range(0, len(values) - increment, increment)

    elif options.method == "pass":
        pass

    E.debug("bins=%s" % str(bins))

    outputters = []
    for x in xrange(0, len(bins)):
        outputters.append(
            Outputter(options.output_filename_pattern % x, fields))

    # output tables
    for x in xrange(0, len(data)):
        bin = bisect.bisect(bins, values[x]) - 1
        outputters[bin].write(data[x])

    # stats
    if options.loglevel >= 1:
        options.stdlog.write("# bin\tstart\tcounts\tfilename\n")
        for x in xrange(0, len(bins)):
            options.stdlog.write("# %i\t%f\t%i\t%s\n" %
                                 (x, bins[x],
                                  outputters[x].mCounts,
                                  outputters[x].mFilename))

    E.info("ninput=%i, noutput=%i" %
           (len(data), sum((x.mCounts for x in outputters))))

    E.Stop()
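
# Hedged sketch of the equal-sized ("quantile") binning above: values are
# replaced by their ranks, and bin boundaries are placed every
# len(values) // num_bins ranks, so each bin receives roughly the same
# number of rows (the last bin absorbs any remainder). Sample data is an
# assumption.
import bisect

def _demo_equal_sized_bins(values, num_bins=4):
    ranks = [0] * len(values)
    order = sorted(range(len(values)), key=lambda i: values[i])
    for rank, i in enumerate(order):
        ranks[i] = rank
    increment = len(values) // num_bins
    bins = range(0, len(values) - increment, increment)
    return [bisect.bisect(bins, r) - 1 for r in ranks]

# _demo_equal_sized_bins([5.0, 1.0, 3.0, 2.0, 4.0, 6.0, 0.0, 7.0])
# assigns each row a bin index by the rank of its value.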
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: table2table.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("transpose", "normalize-by-max",
                               "normalize-by-value", "multiply-by-value",
                               "percentile", "remove-header",
                               "normalize-by-table", "upper-bound",
                               "lower-bound", "kullback-leibler",
                               "expand", "compress", "fdr", "grep"),
                      help="""actions to perform on table.""")

    parser.add_option("-s", "--scale", dest="scale", type="float",
                      help="factor to scale matrix by.")

    parser.add_option("-f", "--format", dest="format", type="string",
                      help="output number format.")

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="parameters for various functions.")

    parser.add_option("-t", "--headers", dest="has_headers",
                      action="store_true",
                      help="matrix has row/column headers.")

    parser.add_option("--transpose", dest="transpose", action="store_true",
                      help="transpose table.")

    parser.add_option("--set-transpose-field", dest="set_transpose_field",
                      type="string",
                      help="set first field (row 1 and col 1) to this value "
                      "[%default].")

    parser.add_option("--transpose-format", dest="transpose_format",
                      type="choice",
                      choices=("default", "separated", ),
                      help="input format of un-transposed table.")

    parser.add_option("--expand", dest="expand_table", action="store_true",
                      help="expand table - multi-value cells will be expanded "
                      "over several rows.")

    parser.add_option("--no-headers", dest="has_headers",
                      action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("--columns", dest="columns", type="string",
                      help="columns to use.")

    parser.add_option("--file", dest="file", type="string",
                      help="columns to test from table.",
                      metavar="FILE")

    parser.add_option("-d", "--delimiter", dest="delimiter", type="string",
                      help="delimiter of columns.",
                      metavar="DELIM")

    parser.add_option("-V", "--invert-match", dest="invert_match",
                      action="store_true",
                      help="invert match.")

    parser.add_option("--sort-by-rows", dest="sort_rows", type="string",
                      help="output order for rows.")

    parser.add_option("-a", "--value", dest="value", type="float",
                      help="value to use for various algorithms.")

    parser.add_option("--group", dest="group_column", type="int",
                      help="group values by column. Supply an integer column "
                      "[default=%default]")

    parser.add_option("--group-function", dest="group_function",
                      type="choice",
                      choices=("min", "max", "sum", "mean", "stats", "cat",
                               "uniq"),
                      help="function to group values by.")

    parser.add_option("--join-table", dest="join_column", type="int",
                      help="join rows in a table by columns.")

    parser.add_option("--collapse-table", dest="collapse_table",
                      type="string",
                      help="collapse a table. Value determines the missing "
                      "variable [%default].")

    parser.add_option("--join-column-name", dest="join_column_name",
                      type="int",
                      help="use this column as a prefix.")

    parser.add_option("--flatten-table", dest="flatten_table",
                      action="store_true",
                      help="flatten a table [%default].")

    parser.add_option("--as-column", dest="as_column", action="store_true",
                      help="output table as a single column.")

    parser.add_option("--split-fields", dest="split_fields",
                      action="store_true",
                      help="split fields.")

    parser.add_option("--separator", dest="separator", type="string",
                      help="separator for multi-valued fields "
                      "[default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=("BH", "bonferroni", "holm", "hommel",
                               "hochberg", "BY"),
                      help="method to perform multiple testing correction by "
                      "controlling the fdr [default=%default].")

    parser.add_option("--fdr-add-column", dest="fdr_add_column",
                      type="string",
                      help="add new column instead of replacing existing "
                      "columns. The value of the option will be used as "
                      "prefix if there are multiple columns [%default]")

    # IMS: add option to use a column as the row id in flatten
    parser.add_option("--id-column", dest="id_column", type="string",
                      help="list of column(s) to use as the row id when "
                      "flattening the table. If None, then row number is "
                      "used. [default=%default].")

    parser.add_option("--variable-name", dest="variable_name", type="string",
                      help="the column header for the 'variable' column when "
                      "flattening [default=%default].")

    parser.add_option("--value-name", dest="value_name", type="string",
                      help="the column header for the 'value' column when "
                      "flattening [default=%default].")

    parser.set_defaults(
        methods=[],
        scale=1.0,
        has_headers=True,
        format="%5.2f",
        value=0.0,
        parameters="",
        columns="all",
        transpose=False,
        set_transpose_field=None,
        transpose_format="default",
        group=False,
        group_column=0,
        group_function="mean",
        missing_value="na",
        sort_rows=None,
        flatten_table=False,
        collapse_table=None,
        separator=";",
        expand=False,
        join_column=None,
        join_column_name=None,
        compute_fdr=None,
        as_column=False,
        fdr_method="BH",
        fdr_add_column=None,
        id_column=None,
        variable_name="column",
        value_name="value",
        file=None,
        delimiter="\t",
        invert_match=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    options.parameters = options.parameters.split(",")

    if options.group_column:
        options.group = True
        options.group_column -= 1

    # if only removing the header, do this quickly
    if options.methods == ["remove-header"]:

        first = True
        for line in options.stdin:
            if line[0] == "#":
                continue
            if first:
                first = False
                continue
            options.stdout.write(line)

    elif options.transpose or "transpose" in options.methods:

        readAndTransposeTable(options.stdin, options)

    elif options.flatten_table:
        # IMS: bug fixed to make work. Also added options for keying on a
        # particular column and adding custom column headings

        fields, table = CSV.ReadTable(
            options.stdin, with_header=options.has_headers, as_rows=True)

        options.columns = getColumns(fields, options.columns)

        if options.id_column:
            id_columns = map(lambda x: int(x) - 1,
                             options.id_column.split(","))
            id_header = "\t".join([fields[id_column]
                                   for id_column in id_columns])
            options.columns = [x for x in options.columns
                               if x not in id_columns]
        else:
            id_header = "row"

        options.stdout.write("%s\t%s\t%s\n" %
                             (id_header, options.variable_name,
                              options.value_name))

        for x, row in enumerate(table):
            if options.id_column:
                row_id = "\t".join([row[int(c) - 1]
                                    for c in options.id_column.split(",")])
            else:
                row_id = str(x)

            for y in options.columns:
                options.stdout.write("%s\t%s\t%s\n" %
                                     (row_id, fields[y], row[y]))

    elif options.as_column:

        fields, table = CSV.ReadTable(
            options.stdin, with_header=options.has_headers, as_rows=True)
        options.columns = getColumns(fields, options.columns)
        table = zip(*table)

        options.stdout.write("value\n")

        for column in options.columns:
            options.stdout.write("\n".join(table[column]) + "\n")

    elif options.split_fields:

        # split comma-separated fields
        fields, table = CSV.ReadTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=True)

        options.stdout.write("%s\n" % ("\t".join(fields)))

        for row in table:
            row = [x.split(options.separator) for x in row]
            for d in itertools.product(*row):
                options.stdout.write("%s\n" % "\t".join(d))

    elif options.group:
        readAndGroupTable(options.stdin, options)

    elif options.join_column:
        readAndJoinTable(options.stdin, options)

    elif options.expand_table:
        readAndExpandTable(options.stdin, options)

    elif options.collapse_table is not None:
        readAndCollapseTable(options.stdin, options, options.collapse_table)

    elif "grep" in options.methods:

        options.columns = map(lambda x: int(x) - 1,
                              options.columns.split(","))

        patterns = []

        if options.file:
            infile = open(options.file, "r")
            for line in infile:
                if line[0] == "#":
                    continue
                patterns.append(line[:-1].split(options.delimiter)[0])
        else:
            patterns = args

        for line in options.stdin:

            data = line[:-1].split(options.delimiter)
            found = False

            for c in options.columns:

                if data[c] in patterns:
                    found = True
                    break

            if (not found and options.invert_match) or \
                    (found and not options.invert_match):
                print line[:-1]
    else:

        # apply the remainder of the transformations
        fields, table = CSV.ReadTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=False)

        # convert columns to lists
        table = [list(x) for x in table]

        ncols = len(fields)
        if len(table) == 0:
            raise ValueError("table is empty")

        nrows = len(table[0])

        E.info("processing table with %i rows and %i columns" %
               (nrows, ncols))

        options.columns = getColumns(fields, options.columns)

        # convert all values to float
        for c in options.columns:
            for r in range(nrows):
                try:
                    table[c][r] = float(table[c][r])
                except ValueError:
                    continue

        for method in options.methods:

            if method == "normalize-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = map(lambda x: x / value, table[c])

            elif method == "multiply-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = map(lambda x: x * value, table[c])

            elif method == "normalize-by-max":

                for c in options.columns:
                    m = max(table[c])
                    table[c] = map(lambda x: x / m, table[c])

            elif method == "kullback-leibler":
                options.stdout.write("category1\tcategory2\tkl1\tkl2\tmean\n")
                for x in range(0, len(options.columns) - 1):
                    for y in range(x + 1, len(options.columns)):
                        c1 = options.columns[x]
                        c2 = options.columns[y]
                        e1 = 0
                        e2 = 0
                        for z in range(nrows):
                            p = table[c1][z]
                            q = table[c2][z]
                            e1 += p * math.log(p / q)
                            e2 += q * math.log(q / p)

                        options.stdout.write(
                            "%s\t%s\t%s\t%s\t%s\n" %
                            (fields[c1], fields[c2],
                             options.format % e1,
                             options.format % e2,
                             options.format % ((e1 + e2) / 2)))
                E.Stop()
                sys.exit(0)

            elif method == "rank":

                for c in options.columns:
                    tt = table[c]
                    t = zip(tt, range(nrows))
                    t.sort()
                    for i, n in zip(map(lambda x: x[1], t), range(nrows)):
                        tt[i] = n

            elif method in ("lower-bound", "upper-bound"):

                boundary = float(options.parameters[0])
                del options.parameters[0]
                new_value = float(options.parameters[0])
                del options.parameters[0]

                if method == "upper-bound":
                    for c in options.columns:
                        for r in range(nrows):
                            if type(table[c][r]) == types.FloatType and \
                                    table[c][r] > boundary:
                                table[c][r] = new_value
                else:
                    for c in options.columns:
                        for r in range(nrows):
                            if type(table[c][r]) == types.FloatType and \
                                    table[c][r] < boundary:
                                table[c][r] = new_value

            elif method == "fdr":
                pvalues = []
                for c in options.columns:
                    pvalues.extend(table[c])

                assert max(pvalues) <= 1.0, \
                    "pvalues > 1 in table: max=%s" % str(max(pvalues))
                assert min(pvalues) >= 0, \
                    "pvalue < 0 in table: min=%s" % str(min(pvalues))

                # convert to str to avoid test for float downstream
                qvalues = map(str,
                              Stats.adjustPValues(
                                  pvalues, method=options.fdr_method))

                if options.fdr_add_column is None:
                    x = 0
                    for c in options.columns:
                        table[c] = qvalues[x:x + nrows]
                        x += nrows
                else:
                    # add new column headers
                    if len(options.columns) == 1:
                        fields.append(options.fdr_add_column)
                    else:
                        for co in options.columns:
                            # bug fix: use the loop variable co, not the
                            # stale c from an earlier loop
                            fields.append(options.fdr_add_column + fields[co])

                    x = 0
                    for c in options.columns:
                        # add a new column
                        table.append(qvalues[x:x + nrows])
                        x += nrows
                    ncols += len(options.columns)

            elif method == "normalize-by-table":

                other_table_name = options.parameters[0]
                del options.parameters[0]
                other_fields, other_table = CSV.ReadTable(
                    open(other_table_name, "r"),
                    with_header=options.has_headers,
                    as_rows=False)

                # convert all values to float
                for c in options.columns:
                    for r in range(nrows):
                        try:
                            other_table[c][r] = float(other_table[c][r])
                        except ValueError:
                            continue

                # divide by the other matrix where it is non-zero,
                # otherwise set the cell to the missing value
                for c in options.columns:
                    for r in range(nrows):
                        if type(table[c][r]) == types.FloatType and \
                                type(other_table[c][r]) == types.FloatType and \
                                other_table[c][r] != 0:
                            table[c][r] /= other_table[c][r]
                        else:
                            table[c][r] = options.missing_value

        # convert back
        for c in options.columns:
            for r in range(nrows):
                if type(table[c][r]) == types.FloatType:
                    table[c][r] = options.format % table[c][r]

        options.stdout.write("\t".join(fields) + "\n")
        if options.sort_rows:
            old2new = {}
            for r in range(nrows):
                old2new[table[0][r]] = r
            for x in options.sort_rows.split(","):
                if x not in old2new:
                    continue
                r = old2new[x]
                options.stdout.write(
                    "\t".join([table[c][r] for c in range(ncols)]) + "\n")
        else:
            for r in range(nrows):
                options.stdout.write(
                    "\t".join([table[c][r] for c in range(ncols)]) + "\n")

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/analyze_ribosomes.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s", "--schemas", dest="schemas", type="string",
                      help="schemas in the set.")

    parser.add_option("-e", "--field-extract", dest="field_extract",
                      type="string",
                      help="pattern for the field to extract.")

    parser.add_option("-c", "--field-compare", dest="field_compare",
                      type="string",
                      help="pattern for the field to compare.")

    parser.add_option("-i", "--filename-identifiers",
                      dest="filename_identifiers", type="string",
                      help="identifiers in the positive set.")

    parser.add_option("-u", "--filename-subset", dest="filename_subset",
                      type="string",
                      help="subset in the positive set.")

    parser.add_option("--filter-min-ratio", dest="filter_min_ratio",
                      type="float",
                      help="minimum boundary for filter.")

    parser.add_option("--filter-max-ratio", dest="filter_max_ratio",
                      type="float",
                      help="maximum boundary for filter.")

    parser.add_option("-o", "--output-fields", dest="output_fields",
                      type="string",
                      help="output fields, choices are: zscore, val, nvals, "
                      "sum, min, max, stddev, mean, median.")

    parser.add_option("--output-pattern", dest="output_pattern",
                      type="string",
                      help="pattern for table headers, should contain %s for "
                      "schema and %s for field name.")

    parser.add_option("-f", "--output-format", dest="output_format",
                      type="choice",
                      choices=("table", "list", "values"),
                      help="output format. Tabular form (one row per "
                      "ortholog) or list form.")

    parser.add_option("--format", dest="format", type="string",
                      help="output format for numbers.")

    parser.add_option("--remove-na", dest="remove_na", action="store_true",
                      help="remove entries with any na values.")

    parser.set_defaults(
        field_extract="%s_length",
        field_compare="%s_length",
        filename_identifiers=None,
        filename_subset=None,
        filter_min_ratio=0.00,
        filter_max_ratio=0.00,
        schemas="",
        output_fields="",
        output_pattern="%s_%s",
        output_format="table",
        format="%6.4f",
        remove_na=False,
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    options.schemas = options.schemas.split(",")
    if not options.schemas:
        raise ValueError("please supply schemas.")

    if options.output_fields:
        options.output_fields = options.output_fields.split(",")
    else:
        options.output_fields = ()

    fields, table = CSV.ReadTable(sys.stdin)

    map_fields2column = {}
    for x in fields:
        map_fields2column[x] = len(map_fields2column)

    if options.loglevel >= 1:
        options.stdlog.write("# read a %i x %i table.\n" %
                             (len(table), len(fields)))

    if options.filename_subset:
        subset, nerrors = IOTools.ReadList(
            open(options.filename_subset, "r"))
        subset = set(subset)

        table = filter(lambda x: x[0] in subset, table)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# subset of %i entries reduced table to a %i x %i table.\n" %
                (len(subset), len(table), len(fields)))

    if options.filename_identifiers:
        identifiers, nerrors = IOTools.ReadList(
            open(options.filename_identifiers, "r"))
    else:
        identifiers = []

    identifiers = set(identifiers)

    # extract rows with positive identifiers
    positive_rows = filter(lambda x: x[0] in identifiers, table)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# subset of %i identifiers gives %i positive entries.\n" %
            (len(identifiers), len(positive_rows)))

    if options.output_format == "table":
        options.stdout.write("id")
        for schema in options.schemas:
            if options.output_fields:
                for field in options.output_fields:
                    options.stdout.write("\t" + options.output_pattern %
                                         (schema, field))
            else:
                options.stdout.write("\t%s" % (schema))
        options.stdout.write("\n")
    else:
        options.stdout.write("schema\tvalue\n")

    if identifiers:
        for row in positive_rows:

            if options.output_format == "table":
                options.stdout.write(row[0])

            for schema in options.schemas:

                # set fields for extraction
                f_extract = map_fields2column[options.field_extract % schema]
                f_compare = map_fields2column[options.field_compare % schema]

                # get region for extraction
                if row[f_compare] != "na":

                    r = float(row[f_compare])

                    if options.filter_min_ratio or options.filter_max_ratio:
                        mi = r * options.filter_min_ratio
                        ma = r * options.filter_max_ratio
                        f = lambda x: x[f_compare] != "na" and \
                            float(x[f_compare]) >= mi and \
                            float(x[f_compare]) <= ma and \
                            x[0] not in identifiers and \
                            x[f_extract] != "na"
                    else:
                        f = lambda x: x[0] not in identifiers and \
                            x[f_extract] != "na"

                    # extract values: filter by minimum and maximum range
                    # and remove positive identifiers.
                    v = float(row[f_extract])
                    values = map(lambda x: float(x[f_extract]),
                                 filter(f, table))

                    stats = Stats.DistributionalParameters(values)
                else:
                    v = None

                for field in options.output_fields:

                    if v is not None:
                        if field == "zscore":
                            f = options.format % stats.getZScore(v)
                        elif field == "diff":
                            f = options.format % (v - stats["mean"])
                        elif field == "reldiff":
                            f = options.format % ((v - stats["mean"]) /
                                                  stats["mean"])
                        elif field == "val":
                            f = options.format % v
                        else:
                            f = options.format % stats[field]
                    else:
                        f = "na"

                    if options.output_format == "table":
                        options.stdout.write("\t%s" % f)
                    elif options.output_format == "list":
                        options.stdout.write("%s\t%s\n" % (schema, f))
                    elif options.output_format == "values":
                        options.stdout.write(
                            "%s\t%s\t%5.2f\t%s\n" %
                            (row[0], schema, v,
                             ",".join(map(lambda x: options.format % x,
                                          values))))

            if options.output_format == "table":
                options.stdout.write("\n")

    else:

        extract_columns = []
        for schema in options.schemas:
            extract_columns.append(
                map_fields2column[options.field_extract % schema])

        # simply dump a subset of values
        for row in table:

            skip = False

            if options.filter_min_ratio or options.filter_max_ratio:

                master = options.schemas[0]

                v = row[map_fields2column[options.field_compare % master]]

                if v == "na":
                    continue

                v = float(v)

                mi = v * options.filter_min_ratio
                ma = v * options.filter_max_ratio

                for schema in options.schemas[1:]:

                    r = row[map_fields2column[
                        options.field_compare % schema]]

                    if r == "na":
                        if options.remove_na:
                            skip = True
                        continue

                    r = float(r)

                    if r < mi or r > ma:
                        skip = True

                        if options.loglevel >= 3:
                            if options.format == "table":
                                options.stdout.write("* ")
                            options.stdout.write("%s\t" % row[0])
                            options.stdout.write("\t".join(
                                [row[y] for y in extract_columns]))
                            options.stdout.write("\n")
                        break

            if skip:
                continue

            if options.output_format == "table":
                options.stdout.write("%s\t" % row[0])
                options.stdout.write("\t".join(
                    [row[y] for y in extract_columns]))
                options.stdout.write("\n")

            elif options.output_format == "list":
                has_na = False
                for x in range(len(options.schemas)):
                    v = row[extract_columns[x]]
                    if v == "na":
                        has_na = True

                if has_na and options.remove_na:
                    continue

                for x in range(len(options.schemas)):
                    options.stdout.write(
                        "%s\t%s\n" %
                        (options.schemas[x], row[extract_columns[x]]))

    E.Stop()
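
# Hedged sketch of the z-score comparison used above: a gene's value is
# compared against the distribution of background values, as
# Stats.DistributionalParameters().getZScore() is used for. The sample
# numbers are assumptions, and a population (not sample) variance is used.
import math

def _demo_zscore(value, background):
    mean = sum(background) / float(len(background))
    var = sum((x - mean) ** 2 for x in background) / float(len(background))
    return (value - mean) / math.sqrt(var)

# _demo_zscore(12.0, [8.0, 10.0, 9.0, 11.0, 10.0]) -> ~2.35, i.e. the
# value sits well above the background mean.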