def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv == None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: codonbias_weights2tsv.py 2781 2009-09-10 11:33:14Z andreas $" ) parser.add_option("--methods", dest="methods", type="string", help="methods to apply.") parser.add_option("--is-frequencies", dest="is_frequencies", action="store_true", help="data is frequencies (default: weights).") parser.add_option("-s", "--sort", dest="sort", type="choice", choices=("percent-difference", "aa"), help="sort order of output table.") parser.add_option( "-g", "--global-sort", dest="global_sort", action="store_true", help="globally sort results (otherwise: by species pair).") parser.set_defaults( \ methods = "", is_frequencies = False, sort = "percent-difference", global_sort= False, ) (options, args) = E.Start(parser) if options.methods: options.methods = options.methods.split(",") fields, table = CSV.ReadTable(sys.stdin) ## convert weights to floats table = CSV.getConvertedTable(table, range(1, len(fields))) for method in options.methods: if method == "overview": if options.is_frequencies: WriteOverviewFrequencies(fields, table, options) else: WriteOverviewWeights(fields, table, options)
def ReadGeneLists(filename_genes, gene_pattern=None):
    """read gene lists from filename in matrix.

    returns a tuple (list of all genes, dictionary of gene lists)
    """

    if filename_genes == "-":
        infile = sys.stdin
    else:
        infile = IOTools.openFile(filename_genes, "r")

    headers, table = CSV.readTable(infile.readlines(), as_rows=False)

    if filename_genes != "-":
        infile.close()

    all_genes = table[0]

    # if there is only a single column, add a dummy column
    if len(table) == 1:
        table.append([1] * len(table[0]))
        headers.append("foreground")

    E.info("read %i genes from %s" % (len(all_genes), filename_genes))

    if gene_pattern:
        rx = re.compile(gene_pattern)
        all_genes = [rx.search(x).groups()[0] for x in all_genes]

    gene_lists = collections.OrderedDict()
    for header, col in zip(headers[1:], table[1:]):
        s = list(set([x for x, y in zip(all_genes, col) if y != "0"]))
        gene_lists[header] = set(s)

    return all_genes, gene_lists
def readAndExpandTable(infile, options):
    '''splits fields in table at separator.

    If a field in a row contains multiple values,
    the row is expanded into multiple rows such
    that all values have space.
    '''

    fields, table = CSV.readTable(
        infile, with_header=options.has_headers, as_rows=True)

    options.stdout.write("\t".join(fields) + "\n")

    for row in table:

        data = []
        for x in range(len(fields)):
            data.append(row[x].split(options.separator))

        nrows = max([len(d) for d in data])

        for d in data:
            d += [""] * (nrows - len(d))

        for n in range(nrows):
            options.stdout.write("\t".join([d[n] for d in data]) + "\n")
def getGODescriptions(infile): """build dictionary mapping GOids to types and descriptions. Arguments --------- infile : string Filename of table with GO assignments Returns ------- mapping : dict Dictionary mapping GOid to GOtype and GOdescription. """ with IOTools.openFile(infile) as inf: fields, table = CSV.readTable(inf, as_rows=False) return dict( [ (y, (x, z)) for x, y, z in zip( table[fields.index("go_type")], table[fields.index("go_id")], table[fields.index("description")] ) ] )
def buildSelectStatementfromPed(filter_type, pedfile, template):
    '''Build a select statement from a template and a pedigree file'''

    pedigree = csv.DictReader(
        IOTools.openFile(pedfile), delimiter='\t',
        fieldnames=['family', 'sample', 'father', 'mother', 'sex', 'status'])

    affecteds = []
    unaffecteds = []
    parents = []
    select = None

    # loop over pedigree file and establish relationships
    for row in pedigree:
        if row['status'] == '2':
            if filter_type == "denovo":
                father = row['father']
                mother = row['mother']
                proband = row['sample']
            elif filter_type == "dominant" or filter_type == "recessive":
                affecteds += [row['sample']]
            if filter_type == "recessive":
                parents += [row['father'], row['mother']]
        if row['status'] == '1':
            if filter_type == "dominant":
                unaffecteds += [row['sample']]
            elif filter_type == "recessive":
                if row['sample'] not in parents:
                    unaffecteds += [row['sample']]

    # Build select statement from template
    if filter_type == "denovo":
        select = template.replace("father", father)
        select = select.replace("mother", mother)
        select = select.replace("proband", proband)
    elif filter_type == "dominant":
        affecteds_exp = '").getPL().1==0&&vc.getGenotype("'.join(affecteds)
        if len(unaffecteds) == 0:
            unaffecteds_exp = ''
        else:
            unaffecteds_exp = '&&vc.getGenotype("' + \
                ('").isHomRef()&&vc.getGenotype("'.join(unaffecteds)) + \
                '").isHomRef()'
        select = template.replace("affecteds_exp", affecteds_exp)
        select = select.replace("unaffecteds_exp", unaffecteds_exp)
    elif filter_type == "recessive":
        affecteds_exp = '").getPL().2==0&&vc.getGenotype("'.join(affecteds)
        unaffecteds_exp = '").getPL().2!=0&&vc.getGenotype("'.join(unaffecteds)
        if len(parents) == 0:
            parents_exp = ''
        else:
            parents_exp = '&&vc.getGenotype("' + \
                ('").getPL().1==0&&vc.getGenotype("'.join(parents)) + \
                '").getPL().1==0'
        select = template.replace("affecteds_exp", affecteds_exp)
        select = select.replace("unaffecteds_exp", unaffecteds_exp)
        select = select.replace("parents_exp", parents_exp)

    return select
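# A small, self-contained sketch of how the "denovo" branch fills in a
# template. The template text and sample IDs below are made up for
# illustration; the real templates are GATK-JEXL-style expressions supplied
# by the caller, and the function simply substitutes the literal words
# "father", "mother" and "proband" with sample names from the pedigree.
template = ('vc.getGenotype("proband").getPL().1==0'
            '&&vc.getGenotype("father").isHomRef()'
            '&&vc.getGenotype("mother").isHomRef()')

# one affected child (status 2) with two named parents, as it would be
# read from a tab-separated pedigree file
row = {'family': 'FAM1', 'sample': 'CHILD1', 'father': 'DAD1',
       'mother': 'MUM1', 'sex': '1', 'status': '2'}

select = template.replace("father", row['father'])
select = select.replace("mother", row['mother'])
select = select.replace("proband", row['sample'])
print(select)
# vc.getGenotype("CHILD1").getPL().1==0&&vc.getGenotype("DAD1").isHomRef()
# &&vc.getGenotype("MUM1").isHomRef()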
def readAndJoinTable(infile, options):

    fields, table = CSV.readTable(
        infile, with_header=options.has_headers, as_rows=True)

    join_column = options.join_column - 1
    join_name = options.join_column_name - 1

    join_rows = list(set([x[join_column] for x in table]))
    join_rows.sort()

    join_names = list(set([x[join_name] for x in table]))
    join_names.sort()

    join_columns = list(
        set(range(len(fields))).difference(set((join_column, join_name))))
    join_columns.sort()

    new_table = []
    map_old2new = {}

    map_name2start = {}
    x = 1
    for name in join_names:
        map_name2start[name] = x
        x += len(join_columns)

    row_width = len(join_columns) * len(join_names)
    for x in join_rows:
        map_old2new[x] = len(map_old2new)
        new_row = [x, ] + ["na"] * row_width
        new_table.append(new_row)

    for row in table:
        row_index = map_old2new[row[join_column]]
        start = map_name2start[row[join_name]]
        for x in join_columns:
            new_table[row_index][start] = row[x]
            start += 1

    # print new table
    options.stdout.write(fields[join_column])
    for name in join_names:
        for column in join_columns:
            options.stdout.write(
                "\t%s%s%s" % (name, options.separator, fields[column]))
    options.stdout.write("\n")

    for row in new_table:
        options.stdout.write("\t".join(row) + "\n")
def row_iter(rows, reader):
    for row in rows:
        yield quoteRow(row, take, map_column2type,
                       options.missing_values, null=options.null,
                       string_value=options.string_value)
    for data in reader:
        yield quoteRow(CSV.ConvertDictionary(data, map=options.map),
                       take, map_column2type,
                       options.missing_values,
                       null=options.null,
                       string_value=options.string_value)
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version="%prog version: $Id: codonbias_weights2tsv.py 2781 2009-09-10 11:33:14Z andreas $") parser.add_option("--methods", dest="methods", type="string", help="methods to apply.") parser.add_option("--is-frequencies", dest="is_frequencies", action="store_true", help="data is frequencies (default: weights).") parser.add_option("-s", "--sort", dest="sort", type="choice", choices=("percent-difference", "aa"), help="sort order of output table.") parser.add_option("-g", "--global-sort", dest="global_sort", action="store_true", help="globally sort results (otherwise: by species pair).") parser.set_defaults( methods="", is_frequencies=False, sort="percent-difference", global_sort=False, ) (options, args) = E.Start(parser) if options.methods: options.methods = options.methods.split(",") fields, table = CSV.ReadTable(sys.stdin) # convert weights to floats table = CSV.getConvertedTable(table, range(1, len(fields))) for method in options.methods: if method == "overview": if options.is_frequencies: WriteOverviewFrequencies(fields, table, options) else: WriteOverviewWeights(fields, table, options)
def readAndCollapseTable(infile, options, missing_value=""): '''collapse a table. Collapse a table of two columns with row names in the first column. Outputs a table with multiple columns for each row name. ''' fields, table = CSV.readTable(infile, with_header=options.has_headers, as_rows=True) if len(fields) != 2: raise NotImplementedError("can only work on tables with two columns") values = collections.defaultdict(list) # column header after which to add separator = table[0][0] row_names = set([x[0] for x in table]) row_name, value = table[0] values[row_name].append(value) added = set([row_name]) for row_name, value in table[1:]: if row_name == separator: for r in row_names: if r not in added: values[r].append(missing_value) added = set() values[row_name].append(value) added.add(row_name) for r in row_names: if r not in added: values[r].append(missing_value) sizes = set([len(x) for x in list(values.values())]) assert len(sizes) == 1, "unequal number of row_names" size = list(sizes)[0] options.stdout.write("row\t%s\n" % ("\t".join(["column_%i" % x for x in range(size)]))) for key, row in list(values.items()): options.stdout.write("%s\t%s\n" % (key, "\t".join(row)))
def readAndCollapseTable(infile, options, missing_value=""): '''collapse a table. Collapse a table of two columns with row names in the first column. Outputs a table with multiple columns for each row name. ''' fields, table = CSV.readTable( infile, with_header=options.has_headers, as_rows=True) if len(fields) != 2: raise NotImplementedError("can only work on tables with two columns") values = collections.defaultdict(list) # column header after which to add separator = table[0][0] row_names = set([x[0] for x in table]) row_name, value = table[0] values[row_name].append(value) added = set([row_name]) for row_name, value in table[1:]: if row_name == separator: for r in row_names: if r not in added: values[r].append(missing_value) added = set() values[row_name].append(value) added.add(row_name) for r in row_names: if r not in added: values[r].append(missing_value) sizes = set([len(x) for x in values.values()]) assert len(sizes) == 1, "unequal number of row_names" size = list(sizes)[0] options.stdout.write( "row\t%s\n" % ("\t".join(["column_%i" % x for x in range(size)]))) for key, row in values.items(): options.stdout.write("%s\t%s\n" % (key, "\t".join(row)))
def iterateMacs2Peaks(infile):
    '''iterate over peaks.xls file and return parsed data.

    pvalues and fdr are converted to values between 0 and 1
    from their -log10 values.
    '''

    for row in CSV.DictReader(infile, dialect='excel-tab'):
        # these are 1-based coordinates
        # macs can have negative start coordinates
        # start
        try:
            yield Macs2Peak._make(
                (row['chr'],
                 max(int(row['start']) - 1, 0),
                 int(row['end']),
                 int(row['length']),
                 float(row['pileup']),
                 math.pow(10, -float(row['-log10(pvalue)'])),
                 float(row['fold_enrichment']),
                 math.pow(10, -float(row['-log10(qvalue)'])),
                 row['name']))
        except KeyError as msg:
            raise KeyError("%s: %s" % (msg, row))
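# MACS2 reports significance as -log10 values, so the conversion back to a
# plain probability is just 10 ** -x. A quick check, assuming a peak row
# that reports -log10(pvalue) = 5.3 (the value is made up for the example):
import math

neg_log10_pvalue = 5.3                      # as found in the peaks.xls column
pvalue = math.pow(10, -neg_log10_pvalue)
print(pvalue)                               # ~5.0e-06, back on the 0..1 scale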
def computeFDR(infile, options):
    '''compute FDR on a table.
    '''

    fields, table = CSV.readTable(
        infile, with_header=options.has_headers, as_rows=True)

    options.stdout.write("\t".join(fields) + "\n")

    for row in table:

        data = []
        for x in range(len(fields)):
            data.append(row[x].split(options.separator))

        nrows = max([len(d) for d in data])

        for d in data:
            d += [""] * (nrows - len(d))

        for n in range(nrows):
            options.stdout.write("\t".join([d[n] for d in data]) + "\n")
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: data2bins.py 2782 2009-09-10 11:40:29Z andreas $", usage=globals()["__doc__"]) parser.add_option("--column", dest="column", type="int", help="column to split on.") parser.add_option("--num-bins", dest="num_bins", type="int", help="number of bins to create.") parser.add_option("--method", dest="method", type="choice", choices=("equal-sized-bins", ), help="method to use to bin data.") parser.add_option("--no-headers", dest="has_headers", action="store_false", help="matrix has no row/column headers.") parser.add_option( "-p", "--output-filename-pattern", dest="output_filename_pattern", type="string", help= "OUTPUT filename with histogram information on aggregate coverages [%default]." ) parser.set_defaults( has_headers=True, method="equal-sized-bins", column=1, num_bins=4, output_filename_pattern="bin%i", ) (options, args) = E.Start(parser) options.column -= 1 if args: if args[0] == "-": infile = sys.stdin else: infile = open(args[0], "r") else: infile = sys.stdin fields, data = CSV.ReadTable(infile) c = options.column values = [float(x[c]) for x in data] bins = [] if options.method == "equal-sized-bins": increment = int(math.floor(float(len(values)) / options.num_bins)) indices = range(0, len(values)) indices.sort(key=lambda x: values[x]) for x in xrange(len(values)): values[indices[x]] = x bins = range(0, len(values) - increment, increment) elif options.method == "pass": pass E.debug("bins=%s" % str(bins)) outputters = [] for x in xrange(0, len(bins)): outputters.append( Outputter(options.output_filename_pattern % x, fields)) # output tables for x in xrange(0, len(data)): bin = bisect.bisect(bins, values[x]) - 1 outputters[bin].write(data[x]) # stats if options.loglevel >= 1: options.stdlog.write("# bin\tstart\tcounts\tfilename\n") for x in xrange(0, len(bins)): options.stdlog.write( "# %i\t%f\t%i\t%s\n" % (x, bins[x], outputters[x].mCounts, outputters[x].mFilename)) E.info("ninput=%i, noutput=%i" % (len(data), sum((x.mCounts for x in outputters)))) E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-m", "--method", dest="methods", type="choice", action="append", choices=("transpose", "normalize-by-max", "normalize-by-value", "multiply-by-value", "percentile", "remove-header", "normalize-by-table", "upper-bound", "lower-bound", "kullback-leibler", "expand", "compress", "fdr", "grep"), help="""actions to perform on table.""") parser.add_option("-s", "--scale", dest="scale", type="float", help="factor to scale matrix by.") parser.add_option("-f", "--format", dest="format", type="string", help="output number format [default]") parser.add_option("-p", "--parameters", dest="parameters", type="string", help="Parameters for various functions.") parser.add_option("-t", "--header-names", dest="has_headers", action="store_true", help="matrix has row/column headers.") parser.add_option("--transpose", dest="transpose", action="store_true", help="transpose table.") parser.add_option( "--set-transpose-field", dest="set_transpose_field", type="string", help="set first field (row 1 and col 1) to this value [%default].") parser.add_option("--transpose-format", dest="transpose_format", type="choice", choices=( "default", "separated", ), help="input format of un-transposed table") parser.add_option( "--expand", dest="expand_table", action="store_true", help="expand table - multi-value cells with be expanded over " "several rows.") parser.add_option("--no-headers", dest="has_headers", action="store_false", help="matrix has no row/column headers.") parser.add_option("--columns", dest="columns", type="string", help="columns to use.") parser.add_option("--file", dest="file", type="string", help="columns to test from table.", metavar="FILE") parser.add_option("-d", "--delimiter", dest="delimiter", type="string", help="delimiter of columns.", metavar="DELIM") parser.add_option("-V", "--invert-match", dest="invert_match", action="store_true", help="invert match.") parser.add_option("--sort-by-rows", dest="sort_rows", type="string", help="output order for rows.") parser.add_option("-a", "--value", dest="value", type="float", help="value to use for various algorithms.") parser.add_option("--group", dest="group_column", type="int", help="group values by column. Supply an integer column " "[default=%default]") parser.add_option("--group-function", dest="group_function", type="choice", choices=("min", "max", "sum", "mean", "stats", "cat", "uniq"), help="function to group values by.") parser.add_option("--join-table", dest="join_column", type="int", help="join rows in a table by columns.") parser.add_option( "--collapse-table", dest="collapse_table", type="string", help="collapse a table. 
Value determines the missing variable " "[%default].") parser.add_option("--join-column-name", dest="join_column_name", type="int", help="use this column as a prefix.") parser.add_option("--flatten-table", dest="flatten_table", action="store_true", help="flatten a table [%default].") parser.add_option("--as-column", dest="as_column", action="store_true", help="output table as a single column.") parser.add_option("--split-fields", dest="split_fields", action="store_true", help="split fields.") parser.add_option( "--separator", dest="separator", type="string", help="separator for multi-valued fields [default=%default].") parser.add_option( "--fdr-method", dest="fdr_method", type="choice", choices=("BH", "bonferroni", "holm", "hommel", "hochberg", "BY"), help="method to perform multiple testing correction by controlling " "the fdr [default=%default].") parser.add_option( "--fdr-add-column", dest="fdr_add_column", type="string", help="add new column instead of replacing existing columns. " "The value of the option will be used as prefix if there are " "multiple columns [%default]") # IMS: add option to use a column as the row id in flatten parser.add_option( "--id-column", dest="id_column", type="string", help="list of column(s) to use as the row id when flattening " "the table. If None, then row number is used. [default=%default].") parser.add_option( "--variable-name", dest="variable_name", type="string", help="the column header for the 'variable' column when flattening " "[default=%default].") parser.add_option( "--value-name", dest="value_name", type="string", help="the column header for the 'value' column when flattening " "[default=%default].") parser.set_defaults( methods=[], scale=1.0, has_headers=True, format=None, value=0.0, parameters="", columns="all", transpose=False, set_transpose_field=None, transpose_format="default", group=False, group_column=0, group_function="mean", missing_value="na", sort_rows=None, flatten_table=False, collapse_table=None, separator=";", expand=False, join_column=None, join_column_name=None, compute_fdr=None, as_column=False, fdr_method="BH", fdr_add_column=None, id_column=None, variable_name="column", value_name="value", file=None, delimiter="\t", invert_match=False, ) (options, args) = E.Start(parser, add_pipe_options=True) options.parameters = options.parameters.split(",") if options.group_column: options.group = True options.group_column -= 1 ###################################################################### ###################################################################### ###################################################################### # if only to remove header, do this quickly if options.methods == ["remove-header"]: first = True for line in options.stdin: if line[0] == "#": continue if first: first = False continue options.stdout.write(line) elif options.transpose or "transpose" in options.methods: readAndTransposeTable(options.stdin, options) elif options.flatten_table: # IMS: bug fixed to make work. 
Also added options for keying # on a particular and adding custom column headings fields, table = CSV.readTable(options.stdin, with_header=options.has_headers, as_rows=True) options.columns = getColumns(fields, options.columns) if options.id_column: id_columns = [int(x) - 1 for x in options.id_column.split(",")] id_header = "\t".join( [fields[id_column] for id_column in id_columns]) options.columns = [ x for x in options.columns if x not in id_columns ] else: id_header = "row" options.stdout.write( "%s\t%s\t%s\n" % (id_header, options.variable_name, options.value_name)) for x, row in enumerate(table): if options.id_column: row_id = "\t".join( [row[int(x) - 1] for x in options.id_column.split(",")]) else: row_id = str(x) for y in options.columns: options.stdout.write("%s\t%s\t%s\n" % (row_id, fields[y], row[y])) elif options.as_column: fields, table = CSV.readTable(options.stdin, with_header=options.has_headers, as_rows=True) options.columns = getColumns(fields, options.columns) table = list(zip(*table)) options.stdout.write("value\n") for column in options.columns: options.stdout.write("\n".join(table[column]) + "\n") elif options.split_fields: # split comma separated fields fields, table = CSV.readTable(options.stdin, with_header=options.has_headers, as_rows=True) options.stdout.write("%s\n" % ("\t".join(fields))) for row in table: row = [x.split(options.separator) for x in row] for d in itertools.product(*row): options.stdout.write("%s\n" % "\t".join(d)) elif options.group: readAndGroupTable(options.stdin, options) elif options.join_column: readAndJoinTable(options.stdin, options) elif options.expand_table: readAndExpandTable(options.stdin, options) elif options.collapse_table is not None: readAndCollapseTable(options.stdin, options, options.collapse_table) elif "grep" in options.methods: options.columns = [int(x) - 1 for x in options.columns.split(",")] patterns = [] if options.file: infile = IOTools.openFile(options.file, "r") for line in infile: if line[0] == "#": continue patterns.append(line[:-1].split(options.delimiter)[0]) else: patterns = args for line in options.stdin: data = line[:-1].split(options.delimiter) found = False for c in options.columns: if data[c] in patterns: found = True break if (not found and options.invert_match) or ( found and not options.invert_match): print(line[:-1]) else: ###################################################################### ###################################################################### ###################################################################### # Apply remainder of transformations fields, table = CSV.readTable(options.stdin, with_header=options.has_headers, as_rows=False) # convert columns to list table = [list(x) for x in table] ncols = len(fields) if len(table) == 0: raise ValueError("table is empty") nrows = len(table[0]) E.info("processing table with %i rows and %i columns" % (nrows, ncols)) options.columns = getColumns(fields, options.columns) # convert all values to float for c in options.columns: for r in range(nrows): try: table[c][r] = float(table[c][r]) except ValueError: continue for method in options.methods: if method == "normalize-by-value": value = float(options.parameters[0]) del options.parameters[0] for c in options.columns: table[c] = [x / value for x in table[c]] elif method == "multiply-by-value": value = float(options.parameters[0]) del options.parameters[0] for c in options.columns: table[c] = [x * value for x in table[c]] elif method == "normalize-by-max": for c in options.columns: m = 
max(table[c]) table[c] = [x / m for x in table[c]] elif method == "kullback-leibler": options.stdout.write("category1\tcategory2\tkl1\tkl2\tmean\n") format = options.format if format is None: format = "%f" for x in range(0, len(options.columns) - 1): for y in range(x + 1, len(options.columns)): c1 = options.columns[x] c2 = options.columns[y] e1 = 0 e2 = 0 for z in range(nrows): p = table[c1][z] q = table[c2][z] e1 += p * math.log(p / q) e2 += q * math.log(q / p) options.stdout.write( "%s\t%s\t%s\t%s\t%s\n" % (fields[c1], fields[c2], format % e1, format % e2, format % ((e1 + e2) / 2))) E.Stop() sys.exit(0) elif method == "rank": for c in options.columns: tt = table[c] t = list(zip(tt, list(range(nrows)))) t.sort() for i, n in zip([x[1] for x in t], list(range(nrows))): tt[i] = n elif method in ("lower-bound", "upper-bound"): boundary = float(options.parameters[0]) del options.parameters[0] new_value = float(options.parameters[0]) del options.parameters[0] if method == "upper-bound": for c in options.columns: for r in range(nrows): if isinstance(table[c][r], float) and \ table[c][r] > boundary: table[c][r] = new_value else: for c in options.columns: for r in range(nrows): if isinstance(table[c][r], float) and \ table[c][r] < boundary: table[c][r] = new_value elif method == "fdr": pvalues = [] for c in options.columns: pvalues.extend(table[c]) assert max(pvalues) <= 1.0, "pvalues > 1 in table: max=%s" % \ str(max(pvalues)) assert min(pvalues) >= 0, "pvalue < 0 in table: min=%s" % \ str(min(pvalues)) # convert to str to avoid test for float downstream qvalues = list( map( str, Stats.adjustPValues(pvalues, method=options.fdr_method))) if options.fdr_add_column is None: x = 0 for c in options.columns: table[c] = qvalues[x:x + nrows] x += nrows else: # add new column headers if len(options.columns) == 1: fields.append(options.fdr_add_column) else: for co in options.columns: fields.append(options.fdr_add_column + fields[c]) x = 0 for c in options.columns: # add a new column table.append(qvalues[x:x + nrows]) x += nrows ncols += len(options.columns) elif method == "normalize-by-table": other_table_name = options.parameters[0] del options.parameters[0] other_fields, other_table = CSV.readTable( IOTools.openFile(other_table_name, "r"), with_header=options.has_headers, as_rows=False) # convert all values to float for c in options.columns: for r in range(nrows): try: other_table[c][r] = float(other_table[c][r]) except ValueError: continue # set 0s to 1 in the other matrix for c in options.columns: for r in range(nrows): if isinstance(table[c][r], float) and \ isinstance(other_table[c][r], float) and \ other_table[c][r] != 0: table[c][r] /= other_table[c][r] else: table[c][r] = options.missing_value # convert back if options.format is not None: for c in options.columns: for r in range(nrows): if isinstance(table[c][r], float): table[c][r] = format % table[c][r] options.stdout.write("\t".join(fields) + "\n") if options.sort_rows: old2new = {} for r in range(nrows): old2new[table[0][r]] = r for x in options.sort_rows.split(","): if x not in old2new: continue r = old2new[x] options.stdout.write( "\t".join(map(str, [table[c][r] for c in range(ncols)])) + "\n") else: for r in range(nrows): options.stdout.write( "\t".join(map(str, [table[c][r] for c in range(ncols)])) + "\n") E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id: data2bins.py 2782 2009-09-10 11:40:29Z andreas $", usage=globals()["__doc__"]) parser.add_option("--column", dest="column", type="int", help="column to split on.") parser.add_option("--num-bins", dest="num_bins", type="int", help="number of bins to create.") parser.add_option("--method", dest="method", type="choice", choices=("equal-sized-bins",), help="method to use to bin data.") parser.add_option("--no-headers", dest="has_headers", action="store_false", help="matrix has no row/column headers.") parser.add_option("-p", "--output-filename-pattern", dest="output_filename_pattern", type="string", help="OUTPUT filename with histogram information on aggregate coverages [%default].") parser.set_defaults( has_headers=True, method="equal-sized-bins", column=1, num_bins=4, output_filename_pattern="bin%i", ) (options, args) = E.Start(parser) options.column -= 1 if args: if args[0] == "-": infile = sys.stdin else: infile = IOTools.openFile(args[0], "r") else: infile = sys.stdin fields, data = CSV.readTable(infile) c = options.column values = [float(x[c]) for x in data] bins = [] if options.method == "equal-sized-bins": increment = int(math.floor(float(len(values)) / options.num_bins)) indices = list(range(0, len(values))) indices.sort(key=lambda x: values[x]) for x in range(len(values)): values[indices[x]] = x bins = list(range(0, len(values) - increment, increment)) elif options.method == "pass": pass E.debug("bins=%s" % str(bins)) outputters = [] for x in range(0, len(bins)): outputters.append( Outputter(options.output_filename_pattern % x, fields)) # output tables for x in range(0, len(data)): bin = bisect.bisect(bins, values[x]) - 1 outputters[bin].write(data[x]) # stats if options.loglevel >= 1: options.stdlog.write("# bin\tstart\tcounts\tfilename\n") for x in range(0, len(bins)): options.stdlog.write("# %i\t%f\t%i\t%s\n" % ( x, bins[x], outputters[x].mCounts, outputters[x].mFilename)) E.info("ninput=%i, noutput=%i" % (len(data), sum((x.mCounts for x in outputters)))) E.Stop()
choices=("percent-difference", "aa"), help="sort order of output table.") parser.add_option( "-g", "--global-sort", dest="global_sort", action="store_true", help="globally sort results (otherwise: by species pair).") parser.set_defaults( \ methods = "", is_frequencies = False, sort = "percent-difference", global_sort= False, ) (options, args) = E.Start( parser ) if options.methods: options.methods = options.methods.split(",") fields, table = CSV.ReadTable(sys.stdin) ## convert weights to floats table = CSV.getConvertedTable( table, range( 1, len(fields) ) ) for method in options.methods: if method == "overview": if options.is_frequencies: WriteOverviewFrequencies( fields, table, options ) else: WriteOverviewWeights( fields, table, options )
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: csv_cut.py 2782 2009-09-10 11:40:29Z andreas $", usage=globals()["__doc__"]) parser.add_option("-r", "--remove", dest="remove", action="store_true", help="remove specified columns, keep all others.") parser.add_option("-u", "--unique", dest="unique", action="store_true", help="output rows are uniq.") parser.add_option( "-l", "--large", dest="large", action="store_true", help= "large columns. Do not use native python CSV module [default=%default]." ) parser.add_option("-f", "--filename-fields", dest="filename_fields", type="string", help="filename with field information.") parser.set_defaults( remove=False, unique=False, filename_fields=None, ) (options, args) = E.Start(parser, add_csv_options=True, quiet=True) input_fields = args if options.filename_fields: input_fields = map( lambda x: x[:-1].split("\t")[0], filter(lambda x: x[0] != "#", open(options.filename_fields, "r").readlines())) if options.unique: outfile = UniqueBuffer(sys.stdout) else: outfile = options.stdout while 1: line = sys.stdin.readline() if not line: E.Stop() sys.exit(0) if line[0] == "#": continue first_line = line break old_fields = first_line[:-1].split("\t") fields = [] for f in input_fields: # do pattern search if f[0] == "%" and f[-1] == "%": pattern = re.compile(f[1:-1]) for o in old_fields: if pattern.search(o) and o not in fields: fields.append(o) else: if f in old_fields: fields.append(f) if options.remove: fields = set(fields) fields = [x for x in old_fields if x not in fields] if options.large: reader = CSV.DictReaderLarge(CommentStripper(sys.stdin), fieldnames=old_fields, dialect=options.csv_dialect) else: reader = csv.DictReader(CommentStripper(sys.stdin), fieldnames=old_fields, dialect=options.csv_dialect) writer = csv.DictWriter(outfile, fields, dialect=options.csv_dialect, lineterminator=options.csv_lineterminator, extrasaction='ignore') print "\t".join(fields) first_row = True ninput, noutput, nerrors = 0, 0, 0 while 1: ninput += 1 try: row = reader.next() except _csv.Error, msg: options.stderr.write("# error while parsing: %s\n" % (msg)) nerrors += 1 continue except StopIteration: break
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id: csv_cut.py 2782 2009-09-10 11:40:29Z andreas $", usage=globals()["__doc__"]) parser.add_option("-r", "--remove", dest="remove", action="store_true", help="remove specified columns, keep all others.") parser.add_option("-u", "--unique", dest="unique", action="store_true", help="output rows are uniq.") parser.add_option("-l", "--large", dest="large", action="store_true", help="large columns. Do not use native python CSV module [default=%default].") parser.add_option("-f", "--filename-fields", dest="filename_fields", type="string", help="filename with field information.") parser.set_defaults( remove=False, unique=False, filename_fields=None, ) (options, args) = E.Start(parser, add_csv_options=True, quiet=True) statement = " ".join(args) if options.large: reader = CSV.DictReaderLarge(CommentStripper(sys.stdin), dialect=options.csv_dialect) else: reader = csv.DictReader(CommentStripper(sys.stdin), dialect=options.csv_dialect) exec "f = lambda r: %s" % statement in locals() counter = E.Counter() writer = csv.DictWriter(options.stdout, reader.fieldnames, dialect=options.csv_dialect, lineterminator=options.csv_lineterminator) writer.writerow(dict((fn, fn) for fn in reader.fieldnames)) while 1: counter.input += 1 try: row = reader.next() except _csv.Error, msg: options.stderr.write("# error while parsing: %s\n" % (msg)) counter.errors += 1 continue except StopIteration: break
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version="%prog version: $Id: csv_intersection.py 2782 2009-09-10 11:40:29Z andreas $") parser.add_option("-u", "--unique", dest="unique", action="store_true", help="output rows are uniq.") parser.set_defaults( remove=False, unique=False, ) (options, args) = E.Start(parser, add_csv_options=True) if len(args) != 2: raise ValueError("please specify two files to join") options.filename1, options.filename2 = args table1 = CSV.readTable(IOTools.openFile(options.filename1, "r")) table2 = CSV.readTable(IOTools.openFile(options.filename2, "r")) if options.unique: outfile = UniqueBuffer(sys.stdout) else: outfile = options.stdout # build new field list new_fields = [] for x in options.join_fields1: new_fields.append(x) for x in fields1: if x not in options.join_fields1: new_fields.append(x) if x not in options.join_fields2: new_fields.append(x) writer = csv.DictWriter(outfile, fields, dialect=options.csv_dialect, lineterminator=options.csv_lineterminator, extrasaction='ignore') if len(lines) > 0: old_fields = lines[0][:-1].split("\t") if options.remove: fields = [] for x in old_fields: if x not in input_fields: fields.append(x) else: fields = input_fields reader = csv.DictReader(lines, dialect=options.csv_dialect) print("\t".join(fields)) first_row = True for row in reader: row = IOTools.convertDictionary(row) writer.writerow(row) E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id", usage=globals()["__doc__"]) parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome", type="string", help="UCSC genome identifier [default=%default].") parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="filename with genome [default=%default].") parser.add_option("--extend", dest="extension", type="int", help="extend tags by this number of bases " "[default=%default].") parser.add_option("--shift-size", dest="shift", type="int", help="shift tags by this number of bases " "[default=%default].") parser.add_option("--window-size", dest="window_size", type="int", help="window size to be used in the analysis" "[default=%default].") parser.add_option("--saturation-iterations", dest="saturation_iterations", type="int", help="iterations for saturation analysis " "[default=%default].") parser.add_option("-t", "--toolset", dest="toolset", type="choice", action="append", choices=("saturation", "coverage", "enrichment", "dmr", "rms", "rpm", "all", "convert"), help="actions to perform [default=%default].") parser.add_option("-w", "--bigwig-file", dest="bigwig", action="store_true", help="store wig files as bigwig files - requires a " "genome file [default=%default]") parser.add_option("--treatment", dest="treatment_files", type="string", action="append", help="BAM files for treatment. At least one is required " "[%default]") parser.add_option("--control", dest="control_files", type="string", action="append", help="BAM files for control for differential " "methylation analysis. Optional [%default].") parser.add_option("--input", dest="input_files", type="string", action="append", help="BAM files for input correction. " "Optional [%default].") parser.add_option("--is-not-medip", dest="is_medip", action="store_false", help="data is not MeDIP data and is not expected " "to fit the calibration model. No CpG " "density normalized rms data is computed" "[default=%default].") parser.add_option("--output-rdata", dest="output_rdata", action="store_true", help="in dmr analysis, write R session to file. " "The file name " "is given by --ouptut-filename-pattern [%default].") parser.add_option("--rdata-file", dest="input_rdata", type="string", help="in dmr analysis, read saved R session from " "file. 
This can be used to apply different " "filters [%default]") parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float", help="FDR threshold to apply for selecting DMR " "[default=%default].") parser.add_option("--fdr-method", dest="fdr_method", type="choice", choices=("bonferroni", "BH", "holm", "hochberg", "hommel", "BY", "fdr", "none"), help="FDR method to apply for selecting DMR " "[default=%default].") parser.add_option("--bwa", dest="bwa", action="store_true", help="alignment generated with bwa" "[default=%default].") parser.add_option("--unique", dest="unique", type="float", help="Threshold p-value to determine which read pile\ ups are the result of PCR overamplification" "[default=%default].") parser.add_option("--chroms", dest="chroms", type="str", help="Comma delimited list of chromosomes to include" "[default=%default].") parser.set_defaults(input_format="bam", ucsc_genome="Hsapiens.UCSC.hg19", genome_file=None, extend=0, shift=0, window_size=300, saturation_iterations=10, toolset=[], bigwig=False, treatment_files=[], control_files=[], input_files=[], output_rdata=False, input_rdata=None, is_medip=True, fdr_threshold=0.1, fdr_method="BH", bwa=False, unique=0.001, chroms=None) # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv, add_output_options=True) if "convert" in options.toolset: results = [] for line in CSV.DictReader(options.stdin, dialect="excel-tab"): if line['edgeR.p.value'] == "NA": continue # assumes only a single treatment/control treatment_name = options.treatment_files[0] control_name = options.control_files[0] status = "OK" try: results.append( Expression.GeneExpressionResult._make(( "%s:%i-%i" % (line['chr'], int(line['start']), int(line['stop'])), treatment_name, float(line['MSets1.rpkm.mean']), 0, control_name, float(line['MSets2.rpkm.mean']), 0, float(line['edgeR.p.value']), float(line['edgeR.adj.p.value']), float(line['edgeR.logFC']), math.pow(2.0, float(line['edgeR.logFC'])), float(line['edgeR.logFC']), # no transform ["0", "1"][float(line['edgeR.adj.p.value']) < options.fdr_threshold], status))) except ValueError as msg: raise ValueError("parsing error %s in line: %s" % (msg, line)) Expression.writeExpressionResults(options.stdout, results) return if len(options.treatment_files) < 1: raise ValueError("please specify a filename with sample data") if options.bigwig and not options.genome_file: raise ValueError("please provide a genome file when outputting bigwig") if options.genome_file: fasta = IndexedFasta.IndexedFasta(options.genome_file) contig_sizes = fasta.getContigSizes() if len(options.toolset) == 0: options.toolset = ["all"] do_all = "all" in options.toolset if options.chroms is None: chrstring = "" else: chroms = options.chroms.split(",") chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms) # load MEDIPS R.library('MEDIPS') genome_file = 'BSgenome.%s' % options.ucsc_genome R.library(genome_file) window_size = options.window_size extend = options.extend shift = options.shift saturation_iterations = options.saturation_iterations uniq = float(options.unique) if options.bwa is True: BWA = "TRUE" else: BWA = "FALSE" if "saturation" in options.toolset or do_all: E.info("saturation analysis") for fn in options.treatment_files + options.control_files: paired = isPaired(fn) R('''sr = MEDIPS.saturation( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, window_size=%(window_size)i, uniq=%(uniq)s, nit = %(saturation_iterations)i, paired = %(paired)s, bwa = 
%(BWA)s, %(chrstring)s nrit = 1)''' % locals()) R.png(E.getOutputFile("%s_saturation.png" % fn)) R('''MEDIPS.plotSaturation(sr)''') R('''dev.off()''') R('''write.table(sr$estimation, file ='%s', sep='\t')''' % E.getOutputFile("%s_saturation_estimation.tsv" % fn)) outfile = IOTools.openFile( E.getOutputFile("%s_saturation.tsv" % fn), "w") outfile.write("category\tvalues\n") outfile.write("estimated_correlation\t%s\n" % ",".join(["%f" % x for x in R('''sr$maxEstCor''')])) outfile.write("true_correlation\t%s\n" % ",".join(["%f" % x for x in R('''sr$maxTruCor''')])) outfile.write("nreads\t%s\n" % ",".join(["%i" % x for x in R('''sr$numberReads''')])) outfile.close() if "coverage" in options.toolset or do_all: E.info("CpG coverage analysis") for fn in options.treatment_files + options.control_files: paired = isPaired(fn) R('''cr = MEDIPS.seqCoverage( file='%(fn)s', BSgenome='%(genome_file)s', pattern='CG', shift=%(shift)i, extend=%(extend)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals()) R.png(E.getOutputFile("%s_cpg_coverage_pie.png" % fn)) R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr, type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''') R('''dev.off()''') R.png(E.getOutputFile("%s_cpg_coverage_hist.png" % fn)) R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr, type = "hist", t=15)''') R('''dev.off()''') # note: this file is large R('''write.table(cr$cov.res, file=gzfile('%s','w'), sep='\t')''' % E.getOutputFile("%s_saturation_coveredpos.tsv.gz" % fn)) if 'enrichment' in options.toolset or do_all: E.info("CpG enrichment analysis") outfile = IOTools.openFile(E.getOutputFile("enrichment.tsv.gz"), "w") slotnames = (("regions.CG", "regions_CG", "%i"), ("regions.C", "regions_C", "%s"), ("regions.G", "regions_G", "%f"), ("regions.relH", "regions_relH", "%i"), ("regions.GoGe", "regions_GoGe", "%i"), ("genome.CG", "genome_CG", "%s"), ("genome.C", "genome_C", "%s"), ("genome.G", "genome_G", "%i"), ("genome.relH", "genome_relH", "%i"), ("enrichment.score.relH", "enrichment_relH", "%s"), ("enrichment.score.GoGe", "enrichment_GoGe", "%s")) outfile.write("\t".join(['sample'] + [x[1] for x in slotnames]) + "\n") for fn in options.treatment_files + options.control_files: paired = isPaired(fn) R('''ce = MEDIPS.CpGenrich( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals()) outfile.write("%s" % fn) for slotname, label, pattern in slotnames: value = tuple(R('''ce$%s''' % slotname)) if len(value) == 0: value = "" outfile.write("\t%s" % pattern % value[0]) outfile.write("\n") outfile.close() if options.input_rdata: E.info("reading R session info from '%s'" % options.input_rdata) R('''load('%s')''' % options.input_rdata) else: if "dmr" in options.toolset or "correlation" in options.toolset \ or do_all: # build four sets for x, fn in enumerate(options.treatment_files): paired = isPaired(fn) E.info("loading '%s'" % fn) R('''treatment_R%(x)i = MEDIPS.createSet( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, window_size=%(window_size)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals()) R('''treatment_set = c(%s)''' % ",".join([ "treatment_R%i" % x for x in range(len(options.treatment_files)) ])) if options.control_files: for x, fn in enumerate(options.control_files): paired = isPaired(fn) E.info("loading '%s'" % fn) R('''control_R%(x)i = MEDIPS.createSet( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, 
window_size=%(window_size)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals()) R('''control_set = c(%s)''' % ",".join([ "control_R%i" % x for x in range(len(options.control_files)) ])) # build coupling vector R('''CS = MEDIPS.couplingVector(pattern="CG", refObj = treatment_set[[1]])''') if "correlation" in options.toolset or do_all: R('''cor.matrix = MEDIPS.correlation( c(treatment_set, control_set))''') R('''write.table(cor.matrix, file='%s', sep="\t")''' % E.getOutputFile("correlation")) if "dmr" in options.toolset or do_all: # Data that does not fit the model causes # "Error in 1:max_signal_index : argument of length 0" # The advice is to set MeDIP=FALSE # See: http://comments.gmane.org/ # gmane.science.biology.informatics.conductor/52319 if options.is_medip: medip = "TRUE" else: medip = "FALSE" fdr_method = options.fdr_method E.info("applying test for differential methylation") R('''meth = MEDIPS.meth( MSet1 = treatment_set, MSet2 = control_set, CSet = CS, ISet1 = NULL, ISet2 = NULL, p.adj = "%(fdr_method)s", diff.method = "edgeR", MeDIP = %(medip)s, CNV = F, minRowSum = 1)''' % locals()) # Note: several Gb in size # Output full methylation data table R('''write.table(meth, file=gzfile('%s', 'w'), sep="\t", row.names=F, quote=F)''' % E.getOutputFile("data.tsv.gz")) # save R session if options.output_rdata: R('''save.image(file='%s', safe=FALSE)''' % E.getOutputFile("session.RData")) # DMR analysis - test for windows and output if "dmr" in options.toolset: E.info("selecting differentially methylated windows") # test windows for differential methylation fdr_threshold = options.fdr_threshold R('''tested = MEDIPS.selectSig(meth, adj=T, ratio=NULL, p.value=%(fdr_threshold)f, bg.counts=NULL, CNV=F)''' % locals()) R('''write.table(tested, file=gzfile('%s', 'w'), sep="\t", quote=F)''' % E.getOutputFile("significant_windows.gz")) # select gain and merge adjacent windows try: R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),]; gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''') E.info('gain output: %s, merged: %s' % (str(R('''dim(gain)''')), str(R('''dim(gain_merged)''')))) R('''of=gzfile('%s', 'w'); write.table(gain_merged, file=of, sep="\t", quote=F, row.names=FALSE, col.names=FALSE); close(of)''' % E.getOutputFile("gain.bed.gz")) except rpy2.rinterface.RRuntimeError as msg: E.warn("could not compute gain windows: msg=%s" % msg) # select loss and merge adjacent windows try: R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),]; loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''') E.info('loss output: %s, merged: %s' % (str(R('''dim(loss)''')), str(R('''dim(loss_merged)''')))) R('''of=gzfile('%s', 'w'); write.table(loss_merged, file=of, sep="\t", quote=F, row.names=F, col.names=F); close(of)''' % E.getOutputFile("loss.bed.gz")) except rpy2.rinterface.RRuntimeError as msg: E.warn("could not compute loss windows: msg=%s" % msg) # if "rpm" in options.toolset or do_all: # outputfile = E.getOutputFile("rpm.wig") # R('''MEDIPS.exportWIG(file = '%(outputfile)s', # data = CONTROL.SET, raw = T, descr = "rpm")''' % # locals()) # if options.bigwig: # bigwig(outputfile, contig_sizes) # else: # compress(outputfile) # if "rms" in options.toolset or do_all: # outputfile = E.getOutputFile("rms.wig") # R('''MEDIPS.exportWIG(file = '%(outputfile)s', # data = CONTROL.SET, raw = F, descr = "rms")''' % # locals()) # if options.bigwig: # bigwig(outputfile, contig_sizes) # else: # compress(outputfile) # write footer and output 
benchmark information. E.Stop()
def run(infile, options, report_step=10000): options.tablename = quoteTableName(options.tablename, backend=options.backend) if options.map: m = {} for x in options.map: f, t = x.split(":") m[f] = t options.map = m else: options.map = {} existing_tables = set() quick_import_separator = "\t" if options.database_backend == "postgres": import psycopg2 raise NotImplementedError("needs refactoring for commandline options") dbhandle = psycopg2.connect(options.psql_connection) error = psycopg2.Error options.null = "NULL" options.string_value = "'%s'" options.text = "TEXT" options.index = "TEXT" if options.insert_quick: raise ValueError("quick import not implemented.") elif options.database_backend == "mysql": import MySQLdb dbhandle = MySQLdb.connect(host=options.database_host, user=options.database_username, passwd=options.database_password, port=options.database_port, db=options.database_name) error = Exception options.null = "NULL" options.string_value = "%s" options.text = "TEXT" options.index = "VARCHAR(40)" if options.insert_quick: raise ValueError("quick import not implemented.") elif options.backend == "sqlite": import sqlite3 dbhandle = sqlite3.connect(options.database_name) try: os.chmod(options.database_name, 0o664) except OSError as msg: E.warn("could not change permissions of database: %s" % msg) # Avoid the following error: # sqlite3.ProgrammingError: You must not use 8-bit bytestrings # unless you use a text_factory that can interpret 8-bit # bytestrings (like text_factory = str). It is highly # recommended that you instead just switch your application # to Unicode strings # Note: might be better to make csv2db unicode aware. dbhandle.text_factory = str error = sqlite3.OperationalError options.insert_many = True # False options.null = None # "NULL" options.text = "TEXT" options.index = "TEXT" options.string_value = "%s" # "'%s'" statement = "SELECT name FROM sqlite_master WHERE type='table'" cc = executewait(dbhandle, statement, error, options.retry) existing_tables = set([x[0] for x in cc]) cc.close() # use , as separator quick_import_statement = \ "sqlite3 %s '.import %%s %s'" % \ (options.database_name, options.tablename) quick_import_separator = "|" if options.header is not None: options.header = [x.strip() for x in options.header.split(",")] if options.utf: reader = CSV.UnicodeDictReader(infile, dialect=options.dialect, fieldnames=options.header) else: reader = csv.DictReader(CSV.CommentStripper(infile), dialect=options.dialect, fieldnames=options.header) if options.replace_header: try: next(reader) except StopIteration: pass E.info("reading %i columns to guess column types" % options.guess_size) rows = [] for row in reader: if None in row: raise ValueError("undefined columns in input file at row: %s" % row) try: rows.append(IOTools.convertDictionary(row, map=options.map)) except TypeError as msg: E.warn("incomplete line? Type error in conversion: " "'%s' with data: %s" % (msg, str(row))) except ValueError as msg: E.warn("incomplete line? 
Type error in conversion: " "'%s' with data: %s" % (msg, str(row))) if len(rows) >= options.guess_size: break E.info("read %i rows for type guessing" % len(rows)) E.info("creating table") if len(rows) == 0: if options.allow_empty: if not reader.fieldnames: E.warn("no data - no table created") else: # create empty table and exit take, map_column2type, ignored = createTable( dbhandle, error, options.tablename, options, retry=options.retry, headers=reader.fieldnames, ignore_empty=options.ignore_empty, ignore_columns=options.ignore_columns, rename_columns=options.rename_columns, lowercase=options.lowercase, ignore_duplicates=options.ignore_duplicates, indices=options.indices, first_column=options.first_column, existing_tables=existing_tables, append=options.append) E.info("empty table created") return else: raise ValueError("empty table") else: take, map_column2type, ignored = createTable( dbhandle, error, options.tablename, options, rows=rows, retry=options.retry, headers=reader.fieldnames, ignore_empty=options.ignore_empty, ignore_columns=options.ignore_columns, rename_columns=options.rename_columns, lowercase=options.lowercase, ignore_duplicates=options.ignore_duplicates, indices=options.indices, first_column=options.first_column, existing_tables=existing_tables, append=options.append) def row_iter(rows, reader): for row in rows: yield quoteRow(row, take, map_column2type, options.missing_values, null=options.null, string_value=options.string_value) for data in reader: yield quoteRow(IOTools.convertDictionary(data, map=options.map), take, map_column2type, options.missing_values, null=options.null, string_value=options.string_value) ninput = 0 E.info("inserting data") if options.insert_quick: E.info("using quick insert") outfile, filename = tempfile.mkstemp() E.debug("dumping data into %s" % filename) for d in row_iter(rows, reader): ninput += 1 os.write( outfile, quick_import_separator.join([str(d[x]) for x in take]) + "\n") if ninput % report_step == 0: E.info("iteration %i\n" % ninput) os.close(outfile) statement = quick_import_statement % filename E.debug(statement) # infinite loop possible while 1: retcode = E.run(statement, cwd=os.getcwd(), close_fds=True) if retcode != 0: E.warn("import error using statement: %s" % statement) if not options.retry: raise ValueError("import error using statement: %s" % statement) time.sleep(5) continue break os.remove(filename) # there is no way to insert NULL values into sqlite. The only # solution is to update all colums. for column in take: executewait( dbhandle, "UPDATE %s SET %s = NULL WHERE %s = 'None'" % (options.tablename, column, column), error, options.retry) elif options.insert_many: data = [] for d in row_iter(rows, reader): ninput += 1 data.append([d[x] for x in take]) if ninput % report_step == 0: E.info("iteration %i" % ninput) statement = "INSERT INTO %s VALUES (%s)" % (options.tablename, ",".join("?" 
* len(take))) E.info("inserting %i rows" % len(data)) E.debug("multiple insert:\n# %s" % statement) while 1: try: dbhandle.executemany(statement, data) except error as msg: E.warn("import failed: msg=%s, statement=\n %s" % (msg, statement)) # TODO: check for database locked msg if not options.retry: raise error(msg) if not re.search("locked", str(msg)): raise error(msg) time.sleep(5) continue break else: # insert line by line (could not figure out how to do bulk loading with # subprocess and COPY FROM STDIN) statement = "INSERT INTO %s VALUES (%%(%s)s)" % (options.tablename, ')s, %('.join(take)) # output data used for guessing: for d in row_iter(rows, reader): ninput += 1 E.debug("single insert:\n# %s" % (statement % d)) cc = executewait(dbhandle, statement, error, retry=options.retry, args=d) cc.close() if ninput % report_step == 0: E.info("iteration %i" % ninput) E.info("building indices") nindex = 0 for index in options.indices: nindex += 1 try: statement = "CREATE INDEX %s_index%i ON %s (%s)" % ( options.tablename, nindex, options.tablename, index) cc = executewait(dbhandle, statement, error, options.retry) cc.close() E.info("added index on column %s" % (index)) except error as msg: E.info("adding index on column %s failed: %s" % (index, msg)) statement = "SELECT COUNT(*) FROM %s" % (options.tablename) cc = executewait(dbhandle, statement, error, options.retry) result = cc.fetchone() cc.close() noutput = result[0] E.info("ninput=%i, noutput=%i, nskipped_columns=%i" % (ninput, noutput, len(ignored))) dbhandle.commit()
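# A minimal, self-contained sketch of the load strategy used in run() above (guess
# column types from a sample of rows, create the table, then bulk-insert with
# executemany). It does not use the CSV/E helper modules; guess_sqlite_type() and
# load_tsv() are illustrative names only, and well-formed column names are assumed.
import csv
import sqlite3


def guess_sqlite_type(values):
    """Return an SQLite column type for a list of string values."""
    for caster, sql_type in ((int, "INTEGER"), (float, "FLOAT")):
        try:
            for v in values:
                if v != "":
                    caster(v)
            return sql_type
        except ValueError:
            continue
    return "TEXT"


def load_tsv(filename, tablename, dbname, guess_size=1000):
    with open(filename, newline="") as infile:
        reader = csv.DictReader(infile, dialect="excel-tab")
        sample = [row for _, row in zip(range(guess_size), reader)]
        remainder = list(reader)
    if not sample:
        return 0
    headers = list(sample[0].keys())
    types = {h: guess_sqlite_type([r[h] for r in sample]) for h in headers}
    db = sqlite3.connect(dbname)
    columns = ", ".join("%s %s" % (h, types[h]) for h in headers)
    db.execute("CREATE TABLE %s (%s)" % (tablename, columns))
    statement = "INSERT INTO %s VALUES (%s)" % (
        tablename, ",".join("?" * len(headers)))
    data = [[r[h] for h in headers] for r in sample + remainder]
    db.executemany(statement, data)
    db.commit()
    return len(data)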
def readAndGroupTable(infile, options): """read table from infile and group. """ fields, table = CSV.readTable( infile, with_header=options.has_headers, as_rows=True) options.columns = getColumns(fields, options.columns) assert options.group_column not in options.columns converter = float new_fields = [fields[options.group_column]] + [fields[x] for x in options.columns] if options.group_function == "min": f = min elif options.group_function == "max": f = max elif options.group_function == "sum": f = lambda z: reduce(lambda x, y: x + y, z) elif options.group_function == "mean": f = scipy.mean elif options.group_function == "cat": f = lambda x: ";".join([y for y in x if y != ""]) converter = str elif options.group_function == "uniq": f = lambda x: ";".join([y for y in set(x) if y != ""]) converter = str elif options.group_function == "stats": f = lambda x: str(Stats.DistributionalParameters(x)) # update headers new_fields = [fields[options.group_column]] for c in options.columns: new_fields += list(map(lambda x: "%s_%s" % (fields[c], x), Stats.DistributionalParameters().getHeaders())) # convert values to floats (except for group_column) # Delete rows with unconvertable values and not in options.columns new_table = [] for row in table: skip = False new_row = [row[options.group_column]] for c in options.columns: if row[c] == options.missing_value: new_row.append(row[c]) else: try: new_row.append(converter(row[c])) except ValueError: skip = True break if not skip: new_table.append(new_row) table = new_table new_rows = CSV.groupTable(table, group_column=0, group_function=f) options.stdout.write("\t".join(new_fields) + "\n") for row in new_rows: options.stdout.write("\t".join(map(str, row)) + "\n")
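# A small sketch of the grouping step performed by readAndGroupTable: rows are
# collected under their group key and one aggregation function is applied per
# value column. The group_rows() helper is illustrative and not part of the
# original module.
import collections


def group_rows(rows, group_column=0, group_function=min):
    """Group rows (lists) by the value in group_column and aggregate the
    remaining columns with group_function."""
    groups = collections.OrderedDict()
    for row in rows:
        groups.setdefault(row[group_column], []).append(row)
    result = []
    for key, members in groups.items():
        aggregated = [key]
        for c in range(len(members[0])):
            if c == group_column:
                continue
            aggregated.append(group_function([m[c] for m in members]))
        result.append(aggregated)
    return result


# example: group by the first column, taking the per-group maximum
# group_rows([["a", 1.0], ["a", 3.0], ["b", 2.0]], group_function=max)
# -> [["a", 3.0], ["b", 2.0]]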
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: csv_set.py 2782 2009-09-10 11:40:29Z andreas $") parser.add_option("-u", "--unique", dest="unique", action="store_true", help="output rows are uniq.") parser.add_option("-1", "--join-fields1", dest="join_fields1", type="string", help="join fields in first table.") parser.add_option("-2", "--join-fields2", dest="join_fields2", type="string", help="join fields in second table.") parser.add_option("-m", "--method", dest="method", type="choice", help="set operation to perform.", choices=("intersection", "rest", "union")) parser.set_defaults( remove=False, unique=False, join_fields1=None, join_fields2=None, method="intersection", ) (options, args) = E.Start(parser, add_csv_options=True) if len(args) != 2: raise "please specify two files to join." if not options.join_fields1 or not options.join_fields2: raise "please specify at least one join field per table." options.join_fields1 = options.join_fields1.split(",") options.join_fields2 = options.join_fields2.split(",") options.filename1, options.filename2 = args fields1, table1 = CSV.readTable(open(options.filename1, "r")) fields2, table2 = CSV.readTable(open(options.filename2, "r")) if options.unique: outfile = UniqueBuffer(sys.stdout) else: outfile = options.stdout nfields1 = [] for x in range(len(fields1)): if fields1[x] in options.join_fields1: nfields1.append(x) nfields2 = [] for x in range(len(fields2)): if fields2[x] in options.join_fields2: nfields2.append(x) # calculate row indices: double keys are not taken care of here keys = {} for row1 in table1: v = map(lambda x: row1[x], nfields1) key = hashlib.md5("".join(v)).digest() keys[key] = row1 if options.method == "intersection": # build new field list take = range(len(fields1)) c = len(take) for x in fields2: if x not in options.join_fields2: take.append(c) c += 1 t = fields1 + fields2 new_fields = map(lambda x: t[x], take) print "\t".join(new_fields) for row2 in table2: v = map(lambda x: row2[x], nfields2) key = hashlib.md5("".join(v)).digest() if key in keys: new_row = keys[key] + row2 outfile.write("\t".join(map(lambda x: new_row[x], take)) + "\n") elif options.method == "rest": new_fields = fields2 print "\t".join(new_fields) for row2 in table2: v = map(lambda x: row2[x], nfields2) key = hashlib.md5("".join(v)).digest() if key not in keys: outfile.write("\t".join(row2) + "\n") E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-m", "--method", dest="methods", type="choice", action="append", choices=("transpose", "normalize-by-max", "normalize-by-value", "multiply-by-value", "percentile", "remove-header", "normalize-by-table", "upper-bound", "lower-bound", "kullback-leibler", "expand", "compress", "fdr", "grep"), help="""actions to perform on table.""") parser.add_option("-s", "--scale", dest="scale", type="float", help="factor to scale matrix by.") parser.add_option("-f", "--format", dest="format", type="string", help="output number format [default]") parser.add_option("-p", "--parameters", dest="parameters", type="string", help="Parameters for various functions.") parser.add_option( "-t", "--header-names", dest="has_headers", action="store_true", help="matrix has row/column headers.") parser.add_option("--transpose", dest="transpose", action="store_true", help="transpose table.") parser.add_option( "--set-transpose-field", dest="set_transpose_field", type="string", help="set first field (row 1 and col 1) to this value [%default].") parser.add_option( "--transpose-format", dest="transpose_format", type="choice", choices=("default", "separated", ), help="input format of un-transposed table") parser.add_option( "--expand", dest="expand_table", action="store_true", help="expand table - multi-value cells with be expanded over " "several rows.") parser.add_option("--no-headers", dest="has_headers", action="store_false", help="matrix has no row/column headers.") parser.add_option("--columns", dest="columns", type="string", help="columns to use.") parser.add_option("--file", dest="file", type="string", help="columns to test from table.", metavar="FILE") parser.add_option("-d", "--delimiter", dest="delimiter", type="string", help="delimiter of columns.", metavar="DELIM") parser.add_option( "-V", "--invert-match", dest="invert_match", action="store_true", help="invert match.") parser.add_option("--sort-by-rows", dest="sort_rows", type="string", help="output order for rows.") parser.add_option("-a", "--value", dest="value", type="float", help="value to use for various algorithms.") parser.add_option( "--group", dest="group_column", type="int", help="group values by column. Supply an integer column " "[default=%default]") parser.add_option("--group-function", dest="group_function", type="choice", choices=( "min", "max", "sum", "mean", "stats", "cat", "uniq"), help="function to group values by.") parser.add_option("--join-table", dest="join_column", type="int", help="join rows in a table by columns.") parser.add_option( "--collapse-table", dest="collapse_table", type="string", help="collapse a table. 
Value determines the missing variable " "[%default].") parser.add_option( "--join-column-name", dest="join_column_name", type="int", help="use this column as a prefix.") parser.add_option( "--flatten-table", dest="flatten_table", action="store_true", help="flatten a table [%default].") parser.add_option("--as-column", dest="as_column", action="store_true", help="output table as a single column.") parser.add_option( "--split-fields", dest="split_fields", action="store_true", help="split fields.") parser.add_option( "--separator", dest="separator", type="string", help="separator for multi-valued fields [default=%default].") parser.add_option( "--fdr-method", dest="fdr_method", type="choice", choices=( "BH", "bonferroni", "holm", "hommel", "hochberg", "BY"), help="method to perform multiple testing correction by controlling " "the fdr [default=%default].") parser.add_option( "--fdr-add-column", dest="fdr_add_column", type="string", help="add new column instead of replacing existing columns. " "The value of the option will be used as prefix if there are " "multiple columns [%default]") # IMS: add option to use a column as the row id in flatten parser.add_option( "--id-column", dest="id_column", type="string", help="list of column(s) to use as the row id when flattening " "the table. If None, then row number is used. [default=%default].") parser.add_option( "--variable-name", dest="variable_name", type="string", help="the column header for the 'variable' column when flattening " "[default=%default].") parser.add_option( "--value-name", dest="value_name", type="string", help="the column header for the 'value' column when flattening " "[default=%default].") parser.set_defaults( methods=[], scale=1.0, has_headers=True, format=None, value=0.0, parameters="", columns="all", transpose=False, set_transpose_field=None, transpose_format="default", group=False, group_column=0, group_function="mean", missing_value="na", sort_rows=None, flatten_table=False, collapse_table=None, separator=";", expand=False, join_column=None, join_column_name=None, compute_fdr=None, as_column=False, fdr_method="BH", fdr_add_column=None, id_column=None, variable_name="column", value_name="value", file=None, delimiter="\t", invert_match=False, ) (options, args) = E.Start(parser, add_pipe_options=True) options.parameters = options.parameters.split(",") if options.group_column: options.group = True options.group_column -= 1 ###################################################################### ###################################################################### ###################################################################### # if only to remove header, do this quickly if options.methods == ["remove-header"]: first = True for line in options.stdin: if line[0] == "#": continue if first: first = False continue options.stdout.write(line) elif options.transpose or "transpose" in options.methods: readAndTransposeTable(options.stdin, options) elif options.flatten_table: # IMS: bug fixed to make work. 
Also added options for keying # on a particular and adding custom column headings fields, table = CSV.readTable( options.stdin, with_header=options.has_headers, as_rows=True) options.columns = getColumns(fields, options.columns) if options.id_column: id_columns = map( lambda x: int(x) - 1, options.id_column.split(",")) id_header = "\t".join([fields[id_column] for id_column in id_columns]) options.columns = [ x for x in options.columns if x not in id_columns] else: id_header = "row" options.stdout.write( "%s\t%s\t%s\n" % (id_header, options.variable_name, options.value_name)) for x, row in enumerate(table): if options.id_column: row_id = "\t".join([row[int(x) - 1] for x in options.id_column.split(",")]) else: row_id = str(x) for y in options.columns: options.stdout.write( "%s\t%s\t%s\n" % (row_id, fields[y], row[y])) elif options.as_column: fields, table = CSV.readTable( options.stdin, with_header=options.has_headers, as_rows=True) options.columns = getColumns(fields, options.columns) table = zip(*table) options.stdout.write("value\n") for column in options.columns: options.stdout.write("\n".join(table[column]) + "\n") elif options.split_fields: # split comma separated fields fields, table = CSV.readTable(options.stdin, with_header=options.has_headers, as_rows=True) options.stdout.write("%s\n" % ("\t".join(fields))) for row in table: row = [x.split(options.separator) for x in row] for d in itertools.product(*row): options.stdout.write("%s\n" % "\t".join(d)) elif options.group: readAndGroupTable(options.stdin, options) elif options.join_column: readAndJoinTable(options.stdin, options) elif options.expand_table: readAndExpandTable(options.stdin, options) elif options.collapse_table is not None: readAndCollapseTable(options.stdin, options, options.collapse_table) elif "grep" in options.methods: options.columns = map(lambda x: int(x) - 1, options.columns.split(",")) patterns = [] if options.file: infile = open(options.file, "r") for line in infile: if line[0] == "#": continue patterns.append(line[:-1].split(options.delimiter)[0]) else: patterns = args for line in options.stdin: data = line[:-1].split(options.delimiter) found = False for c in options.columns: if data[c] in patterns: found = True break if (not found and options.invert_match) or (found and not options.invert_match): print line[:-1] else: ###################################################################### ###################################################################### ###################################################################### # Apply remainder of transformations fields, table = CSV.readTable( options.stdin, with_header=options.has_headers, as_rows=False) # convert columns to list table = [list(x) for x in table] ncols = len(fields) if len(table) == 0: raise ValueError("table is empty") nrows = len(table[0]) E.info("processing table with %i rows and %i columns" % (nrows, ncols)) options.columns = getColumns(fields, options.columns) # convert all values to float for c in options.columns: for r in range(nrows): try: table[c][r] = float(table[c][r]) except ValueError: continue for method in options.methods: if method == "normalize-by-value": value = float(options.parameters[0]) del options.parameters[0] for c in options.columns: table[c] = map(lambda x: x / value, table[c]) elif method == "multiply-by-value": value = float(options.parameters[0]) del options.parameters[0] for c in options.columns: table[c] = map(lambda x: x * value, table[c]) elif method == "normalize-by-max": for c in options.columns: m = 
max(table[c]) table[c] = map(lambda x: x / m, table[c]) elif method == "kullback-leibler": options.stdout.write("category1\tcategory2\tkl1\tkl2\tmean\n") format = options.format if format is None: format = "%f" for x in range(0, len(options.columns) - 1): for y in range(x + 1, len(options.columns)): c1 = options.columns[x] c2 = options.columns[y] e1 = 0 e2 = 0 for z in range(nrows): p = table[c1][z] q = table[c2][z] e1 += p * math.log(p / q) e2 += q * math.log(q / p) options.stdout.write("%s\t%s\t%s\t%s\t%s\n" % ( fields[c1], fields[c2], format % e1, format % e2, format % ((e1 + e2) / 2))) E.Stop() sys.exit(0) elif method == "rank": for c in options.columns: tt = table[c] t = zip(tt, range(nrows)) t.sort() for i, n in zip(map(lambda x: x[1], t), range(nrows)): tt[i] = n elif method in ("lower-bound", "upper-bound"): boundary = float(options.parameters[0]) del options.parameters[0] new_value = float(options.parameters[0]) del options.parameters[0] if method == "upper-bound": for c in options.columns: for r in range(nrows): if isinstance(table[c][r], float) and \ table[c][r] > boundary: table[c][r] = new_value else: for c in options.columns: for r in range(nrows): if isinstance(table[c][r], float) and \ table[c][r] < boundary: table[c][r] = new_value elif method == "fdr": pvalues = [] for c in options.columns: pvalues.extend(table[c]) assert max(pvalues) <= 1.0, "pvalues > 1 in table: max=%s" % \ str(max(pvalues)) assert min(pvalues) >= 0, "pvalue < 0 in table: min=%s" % \ str(min(pvalues)) # convert to str to avoid test for float downstream qvalues = map( str, Stats.adjustPValues(pvalues, method=options.fdr_method)) if options.fdr_add_column is None: x = 0 for c in options.columns: table[c] = qvalues[x:x + nrows] x += nrows else: # add new column headers if len(options.columns) == 1: fields.append(options.fdr_add_column) else: for co in options.columns: fields.append(options.fdr_add_column + fields[c]) x = 0 for c in options.columns: # add a new column table.append(qvalues[x:x + nrows]) x += nrows ncols += len(options.columns) elif method == "normalize-by-table": other_table_name = options.parameters[0] del options.parameters[0] other_fields, other_table = CSV.readTable( open(other_table_name, "r"), with_header=options.has_headers, as_rows=False) # convert all values to float for c in options.columns: for r in range(nrows): try: other_table[c][r] = float(other_table[c][r]) except ValueError: continue # set 0s to 1 in the other matrix for c in options.columns: for r in range(nrows): if isinstance(table[c][r], float) and \ isinstance(other_table[c][r], float) and \ other_table[c][r] != 0: table[c][r] /= other_table[c][r] else: table[c][r] = options.missing_value # convert back if options.format is not None: for c in options.columns: for r in range(nrows): if isinstance(table[c][r], float): table[c][r] = format % table[c][r] options.stdout.write("\t".join(fields) + "\n") if options.sort_rows: old2new = {} for r in range(nrows): old2new[table[0][r]] = r for x in options.sort_rows.split(","): if x not in old2new: continue r = old2new[x] options.stdout.write( "\t".join(map(str, [table[c][r] for c in range(ncols)])) + "\n") else: for r in range(nrows): options.stdout.write( "\t".join(map(str, [table[c][r] for c in range(ncols)])) + "\n") E.Stop()
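# A sketch of the "kullback-leibler" computation applied above to each pair of
# columns: the two directed divergences and their mean. It assumes both columns
# contain strictly positive probabilities; the function name is illustrative.
import math


def kullback_leibler(p_values, q_values):
    """Return (KL(p||q), KL(q||p), symmetrised mean) for two columns."""
    kl_pq = sum(p * math.log(p / q) for p, q in zip(p_values, q_values))
    kl_qp = sum(q * math.log(q / p) for p, q in zip(p_values, q_values))
    return kl_pq, kl_qp, (kl_pq + kl_qp) / 2.0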
def buildUTRExtension(infile, outfile): '''build new UTRs by building and fitting an HMM to reads upstream and downstream of known genes. Works on output of buildGeneLevelReadExtension. Known problems * the size of the extension is limited by the window size * introns within UTRs are ignored. * UTR extension might be underestimated for highly expressed genes as relative read counts drop off quickly, even though there is a good amount of reads still present in the UTR. The model The model is a three-state model:: UTR --|--> notUTR --|--> otherTranscript --| ^---| ^------| ^-------| ^-----------------------------| The chain starts in UTR and ends in notUTR or otherTranscript. The otherTranscript state models peaks within the upstream/ downstream region of a gene. These peaks might correspond to additional exons or unknown transcripts. Without this state, the UTR might be artificially extended to include these peaks. Emissions are modelled with beta distributions. These distributions permit both bimodal (UTR) and unimodal (notUTR) distributions of counts. Parameter estimation Parameters are derived from known UTRs within full length territories. Transitions and emissions for the otherTranscript state are set heuristically: * low probability of remaining in state "otherTranscript". * these transcripts should be short. * emissions biased towards high counts - only strong signals will be considered. * these could be estimated from known UTRs, but I am worried UTR extensions then will be diluted. Alternatives The method could be improved. * base level resolution? * longer chains result in more data and longer running times. * the averaging in windows smoothes the data, which might have a beneficial effect. * raw counts instead of scaled counts? * better model, as highly expressed genes should give more confident predictions. 
''' # the bin size , see gtf2table - can be cleaned from column names # or better set as options in .ini file binsize = 100 territory_size = 15000 # read gene coordinates geneinfos = {} for x in CSV.DictReader(IOTools.openFile(infile), dialect='excel-tab'): contig, strand, start, end = x['contig'], x['strand'], int( x['start']), int(x['end']) geneinfos[x['gene_id']] = (contig, strand, start, end) infiles = [ infile + ".readextension_upstream_sense.tsv.gz", infile + ".readextension_downstream_sense.tsv.gz" ] outdir = os.path.join(PARAMS["exportdir"], "utr_extension") R('''suppressMessages(library(RColorBrewer))''') R('''suppressMessages(library(MASS))''') R('''suppressMessages(library(HiddenMarkov))''') # for upstream, downstream upstream_utrs, downstream_utrs = {}, {} all_genes = set() for filename, new_utrs in zip(infiles, (upstream_utrs, downstream_utrs)): E.info("processing %s" % filename) parts = os.path.basename(filename).split(".") data = R( '''data = read.table( gzfile( "%(filename)s"), header=TRUE, fill=TRUE, row.names=1)''' % locals()) ########################################## ########################################## ########################################## ## estimation ########################################## # take only those with a 'complete' territory R('''d = data[-which( apply( data,1,function(x)any(is.na(x)))),]''') # save UTR R('''utrs = d$utr''') # remove length and utr column R('''d = d[-c(1,2)]''') # remove those which are completely empty, logtransform or scale data and export R('''lraw = log10( d[-which( apply(d,1,function(x)all(x==0))),] + 1 )''' ) utrs = R('''utrs = utrs[-which( apply(d,1,function(x)all(x==0)))]''') scaled = R( '''lscaled = t(scale(t(lraw), center=FALSE, scale=apply(lraw,1,max) ))''' ) exons = R('''lraw[,1]''') ####################################################### ####################################################### ####################################################### # do the estimation: E.debug("estimation: utrs=%i, exons=%i, vals=%i, dim=%s" % (len(utrs), len(exons), len(scaled), R.dim(scaled))) # counts within and outside UTRs within_utr, outside_utr, otherTranscript = [], [], [] # number of transitions between utrs transitions = numpy.zeros((3, 3), numpy.int) for x in xrange(len(utrs)): utr, exon = utrs[x], exons[x] # only consider genes with expression coverage # note: expression level is logscaled here, 10^1 = 10 if exon < 0.1: continue # first row is column names, so x + 1 values = list(scaled.rx(x + 1, True)) utr_bins = utr // binsize nonutr_bins = (territory_size - utr) // binsize # build transition matrix transitions[0][0] += utr_bins transitions[0][1] += 1 transitions[1][1] += nonutr_bins outside_utr.extend([x for x in values[utr_bins:] if x <= 0.5]) # ignore exon and zero counts within_utr.extend([x for x in values[1:utr_bins] if x > 0.1]) # add only high counts to otherTranscript emissions otherTranscript.extend([x for x in values[utr_bins:] if x > 0.5]) # estimation for # 5% chance of transiting to otherTranscript transitions[1][2] = transitions[1][1] * 0.05 # 10% chance of remaining in otherTranscript transitions[2][1] = 900 transitions[2][2] = 100 E.info( "counting: (n,mean): within utr=%i,%f, outside utr=%i,%f, otherTranscript=%i,%f" % \ ( len(within_utr), numpy.mean(within_utr), len(outside_utr), numpy.mean(outside_utr), len(otherTranscript), numpy.mean(otherTranscript)) ) ro.globalenv['transitions'] = R.matrix(transitions, nrow=3, ncol=3) R('''transitions = transitions / rowSums( transitions )''') 
ro.globalenv['within_utr'] = ro.FloatVector(within_utr[:10000]) ro.globalenv['outside_utr'] = ro.FloatVector(outside_utr[:10000]) ro.globalenv['otherTranscript'] = ro.FloatVector( otherTranscript[:10000]) # estimate beta distribution parameters R('''doFit = function( data ) { data[data == 0] = data[data == 0] + 0.001 data[data == 1] = data[data == 1] - 0.001 f = fitdistr( data, dbeta, list( shape1=0.5, shape2=0.5 ) ) return (f) }''') fit_within_utr = R( '''fit_within_utr = suppressMessages(doFit( within_utr))''') fit_outside_utr = R( '''fit_outside_utr = suppressMessages(doFit( outside_utr))''') fit_other = R( '''fit_otherTranscript = suppressMessages(doFit( otherTranscript))''' ) within_a, within_b = list(fit_within_utr.rx("estimate"))[0] outside_a, outside_b = list(fit_outside_utr.rx("estimate"))[0] other_a, other_b = list(fit_other.rx("estimate"))[0] E.info( "beta estimates: within_utr=%f,%f outside=%f,%f, other=%f,%f" % \ (within_a, within_b, outside_a, outside_b, other_a, other_b)) fn = ".".join((parts[0], parts[4], "fit", "png")) outfilename = os.path.join(outdir, fn) R.png(outfilename, height=1000, width=1000) R('''par(mfrow=c(3,1))''') R('''x=seq(0,1,0.02)''') R('''hist( within_utr, 50, col=rgb( 0,0,1,0.2) )''') R('''par(new=TRUE)''') R('''plot( x, dbeta( x, fit_within_utr$estimate['shape1'], fit_within_utr$estimate['shape2']), type='l', col='blue')''' ) R('''hist( outside_utr, 50, col=rgb( 1,0,0,0.2 ) )''') R('''par(new=TRUE)''') R('''plot( x, dbeta( x, fit_outside_utr$estimate['shape1'], fit_outside_utr$estimate['shape2']), type='l', col='red')''' ) R('''hist( otherTranscript, 50, col=rgb( 0,1,0,0.2 ) )''') R('''par(new=TRUE)''') R('''plot( x, dbeta( x, fit_otherTranscript$estimate['shape1'], fit_otherTranscript$estimate['shape2']), type='l', col='green')''' ) R['dev.off']() ##################################################### ##################################################### ##################################################### # build hmm # state 1 = UTR # state 2 = notUTR # state 3 = other transcript p = R('''betaparams = list( shape1=c(fit_within_utr$estimate['shape1'], fit_outside_utr$estimate['shape1'], fit_otherTranscript$estimate['shape1']), shape2=c(fit_within_utr$estimate['shape2'], fit_outside_utr$estimate['shape2'], fit_otherTranscript$estimate['shape2'])) ''' ) R('''hmm = dthmm(NULL, transitions, c(1,0,0), "beta", betaparams )''') E.info("fitting starts") ##################################################### ##################################################### ##################################################### # fit to every sequence genes = R('''rownames(data)''') all_genes.update(set(genes)) utrs = R('''data$utr''') exons = R('''data$exon''') nseqs = len(utrs) counter = E.Counter() for idx in xrange(len(utrs)): gene_id = genes[idx] old_utr = utrs[idx] if idx % 100 == 0: E.debug("processing gene %i/%i" % (idx, len(utrs))) counter.input += 1 # do not predict if terminal exon not expressed if exons[idx] < 1: counter.skipped_notexpressed += 1 new_utrs[gene_id] = Utr._make( (old_utr, None, None, "notexpressed")) continue R('''obs = data[%i,][-c(1,2)]''' % (idx + 1)) # remove na obs = R('''obs = obs[!is.na(obs)]''') if len(obs) <= 1 or max(obs) == 0: new_utrs[gene_id] = Utr._make( (old_utr, None, None, "no observations")) continue # normalize R('''obs = obs / max(obs)''') # add small epsilon to 0 and 1 values R('''obs[obs==0] = obs[obs==0] + 0.001 ''') R('''obs[obs==1] = obs[obs==1] - 0.001 ''') R('''hmm$x = obs''') states = None try: states = list(R('''states = 
Viterbi( hmm )''')) except ri.RRuntimeError, msg: counter.skipped_error += 1 new_utrs[gene_id] = Utr._make((old_utr, None, None, "fail")) continue max_utr = binsize * (len(states) - 1) # subtract 1 for last exon try: new_utr = binsize * (states.index(2) - 1) new_utrs[gene_id] = Utr._make( (old_utr, new_utr, max_utr, "ok")) counter.success += 1 except ValueError: new_utrs[gene_id] = Utr._make( (old_utr, max_utr, max_utr, "max")) counter.maxutr += 1
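# A rough Python analogue of the fitdistr() step used above: values exactly at
# 0 or 1 are nudged off the boundary and beta shape parameters are fitted by
# maximum likelihood. scipy.stats.beta stands in here for R's MASS::fitdistr;
# this is a sketch, not part of the pipeline code.
import numpy
from scipy import stats


def fit_beta(values, epsilon=0.001):
    """Fit beta shape parameters to values scaled into [0, 1]."""
    data = numpy.asarray(values, dtype=float)
    data[data == 0.0] += epsilon
    data[data == 1.0] -= epsilon
    shape1, shape2, _, _ = stats.beta.fit(data, floc=0, fscale=1)
    return shape1, shape2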
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: csv_intersection.py 2782 2009-09-10 11:40:29Z andreas $" ) parser.add_option("-u", "--unique", dest="unique", action="store_true", help="output rows are uniq.") parser.set_defaults( remove=False, unique=False, ) (options, args) = E.Start(parser, add_csv_options=True) if len(args) != 2: raise "please specify two files to join." options.filename1, options.filename2 = args table1 = CSV.ReadTable(open(options.filename1, "r")) table2 = CSV.ReadTable(open(options.filename2, "r")) if options.unique: outfile = UniqueBuffer(sys.stdout) else: outfile = options.stdout # build new field list new_fields = [] for x in options.join_fields1: new_fields.append(x) for x in fields1: if x not in options.join_fields1: new_fields.append(x) if x not in options.join_fields2: new_fields.append(x) writer = csv.DictWriter(outfile, fields, dialect=options.csv_dialect, lineterminator=options.csv_lineterminator, extrasaction='ignore') if len(lines) > 0: old_fields = lines[0][:-1].split("\t") if options.remove: fields = [] for x in old_fields: if x not in input_fields: fields.append(x) else: fields = input_fields reader = csv.DictReader(lines, dialect=options.csv_dialect) print "\t".join(fields) first_row = True for row in reader: row = CSV.ConvertDictionary(row) writer.writerow(row) E.Stop()
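# The main() above refers to names (fields1, lines, input_fields) that are never
# defined in this version of the script. Below is a self-contained sketch of one
# plausible reading - intersect two tab-delimited tables on their shared columns.
# intersect_tables() is illustrative only and not the original implementation.
import csv


def intersect_tables(filename1, filename2):
    """Yield rows of table1 whose values in the shared columns also occur in table2."""
    with open(filename1, newline="") as f1, open(filename2, newline="") as f2:
        reader1 = csv.DictReader(f1, dialect="excel-tab")
        reader2 = csv.DictReader(f2, dialect="excel-tab")
        rows2 = list(reader2)
        shared = [c for c in reader1.fieldnames if c in reader2.fieldnames]
        keys2 = set(tuple(row[c] for c in shared) for row in rows2)
        for row in reader1:
            if tuple(row[c] for c in shared) in keys2:
                yield row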
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version="%prog version: $Id: csv_set.py 2782 2009-09-10 11:40:29Z andreas $") parser.add_option("-u", "--unique", dest="unique", action="store_true", help="output rows are uniq.") parser.add_option("-1", "--join-fields1", dest="join_fields1", type="string", help="join fields in first table.") parser.add_option("-2", "--join-fields2", dest="join_fields2", type="string", help="join fields in second table.") parser.add_option("-m", "--method", dest="method", type="choice", help="set operation to perform.", choices=("intersection", "rest", "union")) parser.set_defaults( remove=False, unique=False, join_fields1=None, join_fields2=None, method="intersection", ) (options, args) = E.Start(parser, add_csv_options=True) if len(args) != 2: raise ValueError("please specify two files to join") if not options.join_fields1 or not options.join_fields2: raise ValueError("please specify at least one join field per table") options.join_fields1 = options.join_fields1.split(",") options.join_fields2 = options.join_fields2.split(",") options.filename1, options.filename2 = args fields1, table1 = CSV.readTable(open(options.filename1, "r")) fields2, table2 = CSV.readTable(open(options.filename2, "r")) if options.unique: outfile = UniqueBuffer(sys.stdout) else: outfile = options.stdout nfields1 = [] for x in range(len(fields1)): if fields1[x] in options.join_fields1: nfields1.append(x) nfields2 = [] for x in range(len(fields2)): if fields2[x] in options.join_fields2: nfields2.append(x) # calculate row indices: double keys are not taken care of here keys = {} for row1 in table1: v = [row1[x] for x in nfields1] key = hashlib.md5("".join(v)).digest() keys[key] = row1 if options.method == "intersection": # build new field list take = list(range(len(fields1))) c = len(take) for x in fields2: if x not in options.join_fields2: take.append(c) c += 1 t = fields1 + fields2 new_fields = [t[x] for x in take] print("\t".join(new_fields)) for row2 in table2: v = [row2[x] for x in nfields2] key = hashlib.md5("".join(v)).digest() if key in keys: new_row = keys[key] + row2 outfile.write( "\t".join([new_row[x] for x in take]) + "\n") elif options.method == "rest": new_fields = fields2 print("\t".join(new_fields)) for row2 in table2: v = [row2[x] for x in nfields2] key = hashlib.md5("".join(v)).digest() if key not in keys: outfile.write("\t".join(row2) + "\n") E.Stop()
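# Note on the row keys built above: under Python 3, hashlib.md5() requires bytes,
# so the joined field values have to be encoded first (or a plain tuple can be
# used as the dictionary key). A small sketch; row_key() is an illustrative name.
import hashlib


def row_key(row, field_indices):
    """Return a hashable key for the selected join fields of a row."""
    joined = "".join(row[x] for x in field_indices)
    return hashlib.md5(joined.encode("utf-8")).digest()


# alternatively, tuple(row[x] for x in field_indices) avoids hashing entirely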
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser( version="%prog version: $Id", usage=globals()["__doc__"]) parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome", type="string", help="UCSC genome identifier [default=%default].") parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="filename with genome [default=%default].") parser.add_option("--extend", dest="extension", type="int", help="extend tags by this number of bases " "[default=%default].") parser.add_option("--shift-size", dest="shift", type="int", help="shift tags by this number of bases " "[default=%default].") parser.add_option("--window-size", dest="window_size", type="int", help="window size to be used in the analysis" "[default=%default].") parser.add_option("--saturation-iterations", dest="saturation_iterations", type="int", help="iterations for saturation analysis " "[default=%default].") parser.add_option("-t", "--toolset", dest="toolset", type="choice", action="append", choices=("saturation", "coverage", "enrichment", "dmr", "rms", "rpm", "all", "convert"), help="actions to perform [default=%default].") parser.add_option("-w", "--bigwig-file", dest="bigwig", action="store_true", help="store wig files as bigwig files - requires a " "genome file [default=%default]") parser.add_option("--treatment", dest="treatment_files", type="string", action="append", help="BAM files for treatment. At least one is required " "[%default]") parser.add_option("--control", dest="control_files", type="string", action="append", help="BAM files for control for differential " "methylation analysis. Optional [%default].") parser.add_option("--input", dest="input_files", type="string", action="append", help="BAM files for input correction. " "Optional [%default].") parser.add_option("--is-not-medip", dest="is_medip", action="store_false", help="data is not MeDIP data and is not expected " "to fit the calibration model. No CpG " "density normalized rms data is computed" "[default=%default].") parser.add_option("--output-rdata", dest="output_rdata", action="store_true", help="in dmr analysis, write R session to file. " "The file name " "is given by --ouptut-filename-pattern [%default].") parser.add_option("--rdata-file", dest="input_rdata", type="string", help="in dmr analysis, read saved R session from " "file. 
This can be used to apply different " "filters [%default]") parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float", help="FDR threshold to apply for selecting DMR " "[default=%default].") parser.add_option("--fdr-method", dest="fdr_method", type="choice", choices=("bonferroni", "BH", "holm", "hochberg", "hommel", "BY", "fdr", "none"), help="FDR method to apply for selecting DMR " "[default=%default].") parser.add_option("--bwa", dest="bwa", action="store_true", help="alignment generated with bwa" "[default=%default].") parser.add_option("--unique", dest="unique", type="float", help="Threshold p-value to determine which read pile\ ups are the result of PCR overamplification" "[default=%default].") parser.add_option("--chroms", dest="chroms", type="str", help="Comma delimited list of chromosomes to include" "[default=%default].") parser.set_defaults( input_format="bam", ucsc_genome="Hsapiens.UCSC.hg19", genome_file=None, extend=0, shift=0, window_size=300, saturation_iterations=10, toolset=[], bigwig=False, treatment_files=[], control_files=[], input_files=[], output_rdata=False, input_rdata=None, is_medip=True, fdr_threshold=0.1, fdr_method="BH", bwa=False, unique=0.001, chroms=None ) # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv, add_output_options=True) if "convert" in options.toolset: results = [] for line in CSV.DictReader(options.stdin, dialect="excel-tab"): if line['edgeR.p.value'] == "NA": continue # assumes only a single treatment/control treatment_name = options.treatment_files[0] control_name = options.control_files[0] status = "OK" try: results.append( Expression.GeneExpressionResult._make(( "%s:%i-%i" % (line['chr'], int(line['start']), int(line['stop'])), treatment_name, float(line['MSets1.rpkm.mean']), 0, control_name, float(line['MSets2.rpkm.mean']), 0, float(line['edgeR.p.value']), float(line['edgeR.adj.p.value']), float(line['edgeR.logFC']), math.pow(2.0, float(line['edgeR.logFC'])), float(line['edgeR.logFC']), # no transform ["0", "1"][float(line['edgeR.adj.p.value']) < options.fdr_threshold], status))) except ValueError, msg: raise ValueError("parsing error %s in line: %s" % (msg, line)) Expression.writeExpressionResults(options.stdout, results) return
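# A sketch of the per-line conversion performed in the "convert" branch above: an
# edgeR logFC is turned into a linear fold change and the adjusted p-value is
# compared against the chosen FDR threshold. Column names follow the input format
# expected above; the helper itself is illustrative.
import math


def convert_edger_line(line, fdr_threshold=0.1):
    """Return (interval, fold_change, significant) for one edgeR result row."""
    interval = "%s:%i-%i" % (line["chr"], int(line["start"]), int(line["stop"]))
    log_fc = float(line["edgeR.logFC"])
    fold_change = math.pow(2.0, log_fc)
    significant = float(line["edgeR.adj.p.value"]) < fdr_threshold
    return interval, fold_change, significant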
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version="%prog version: $Id: csv2xls.py 2782 2009-09-10 11:40:29Z andreas $") parser.add_option("-o", "--outfile", dest="output_filename", type="string", help="write to output filename.") parser.set_defaults( output_filename=None, ) (options, args) = E.Start(parser, add_csv_options=True) if not options.output_filename: raise ValueError("please specify an output filename.") w = openpyxl.Workbook(write_only=True) # create styles header_style = GetHeaderStyle() data_style = GetDataStyle() for filename in args: lines = [x for x in open(filename, "r").readlines() if x[0] != "#"] if len(lines) == 0: continue if options.loglevel >= 2: E.info("read %i rows" % len(lines)) headers = lines[0][:-1].split("\t") # openpyxl uses create_sheet(), not add_sheet() ws = w.create_sheet(title=os.path.basename(filename)) ws.append(headers) reader = csv.DictReader(lines, dialect=options.csv_dialect) for row in reader: row = CSV.ConvertDictionary(row) data = [row.get(headers[x], "") for x in range(len(headers))] ws.append(data) w.save(options.output_filename) E.Stop()
def main(): parser = E.OptionParser( version= "%prog version: $Id: analyze_readpositions.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"]) parser.add_option("--output-filename-pattern", dest="output_filename_pattern", type="string", help="pattern for additional output files [%default].") parser.set_defaults( length=1000, minimum_coverage=0.90, maximum_reads=[1, 10, 20, 50, 100], output_filename_pattern="%s", normalize=True, ) (options, args) = E.Start(parser, add_csv_options=True) fields, table = CSV.ReadTable(sys.stdin, dictreader=CSV.DictReaderLarge) map_fields2column = {} for x in fields: map_fields2column[x] = len(map_fields2column) coverage_5prime = numpy.zeros(options.length, numpy.float) coverage_3prime = numpy.zeros(options.length, numpy.float) coverage_maxreads5prime = numpy.zeros(options.length, numpy.float) coverage_maxreads3prime = numpy.zeros(options.length, numpy.float) coverage_full5prime = numpy.zeros(options.length, numpy.float) coverage_full3prime = numpy.zeros(options.length, numpy.float) coverage_min5prime = numpy.zeros(options.length, numpy.float) coverage_min3prime = numpy.zeros(options.length, numpy.float) histograms = [] for x in range(len(options.maximum_reads)): histograms.append([ numpy.zeros(options.length, numpy.float), numpy.zeros(options.length, numpy.float), 0 ]) ninput, noutput, nfull, nmincov, nskipped, nlength, nmaxreads = 0, 0, 0, 0, 0, 0, 0 for row in table: length, covered, meancov, data, nreads = (int(row["cov_nval"]), float(row["cov_covered"]), float(row["cov_mean"]), row["cov_values"], int(row["nover2"])) ninput += 1 if length < options.length: nlength += 1 continue if data == "na": nskipped += 1 continue noutput += 1 mincov = covered / length values = map(float, data.split(";")) m = max(values) values = [x / m for x in values] coverage_5prime += values[0:1000] coverage_3prime += values[-1000:] if mincov >= 1.0: coverage_full5prime += values[0:1000] coverage_full3prime += values[-1000:] nfull += 1 if meancov >= options.minimum_coverage: coverage_min5prime += values[0:1000] coverage_min3prime += values[-1000:] nmincov += 1 for maxreads in range(len(options.maximum_reads)): if nreads <= options.maximum_reads[maxreads]: histograms[maxreads][0] += values[0:1000] histograms[maxreads][1] += values[-1000:] histograms[maxreads][2] += 1 if options.normalize: for x5, x3 in ((coverage_5prime, coverage_3prime), (coverage_min5prime, coverage_min3prime), (coverage_full5prime, coverage_full3prime)): m = max((max(x5), max(x3))) x3 /= m x5 /= m for x5, x3, c in histograms: m = max((max(x5), max(x3))) x5 /= m x3 /= m outfile = options.stdout outfile.write("\t".join(("distance", "minlen-5'", "minlen-3'", "mincov-5'", "mincov-3'", "full-5'", "full-3'")) + "\n") for x in range(0, options.length): outfile.write( "\t".join( [ "%6.4f" % x for x in \ (x, coverage_5prime[x], coverage_3prime[x], coverage_min5prime[x], coverage_min3prime[x], coverage_full5prime[x], coverage_full3prime[x] ) ] ) + "\n" ) outfile5 = open(options.output_filename_pattern % "reads5", "w") outfile3 = open(options.output_filename_pattern % "reads3", "w") outfile5.write("\t".join([ "distance", ] + [ "reads%i" % options.maximum_reads[y] for y in range(len(options.maximum_reads)) ]) + "\n") outfile3.write("\t".join([ "distance", ] + [ "reads%i" % options.maximum_reads[y] for y in range(len(options.maximum_reads)) ]) + "\n") for x in range(0, options.length): outfile5.write("%i\t%s\n" % (x, "\t".join([ "%6.4f" % histograms[y][0][x] for y in range(len(options.maximum_reads)) ]))) 
outfile3.write("%i\t%s\n" % (x, "\t".join([ "%6.4f" % histograms[y][1][x] for y in range(len(options.maximum_reads)) ]))) E.info( "ninput=%i, noutput=%i, nmaxreads=%i, nfull=%i, nmincov=%i, nskipped=%i, nlength=%i" %\ (ninput, noutput, nmaxreads, nfull, nmincov, nskipped, nlength) ) E.Stop()
('none', 'all' ), ('kappa', 'all' ), ('omega', 'all' ), ('ds', 'all'), ) map_model2params = { 'none' : 8, 'ds' : 7, 'omega' : 6, 'kappa' : 6, 'omega-ds' : 5, 'kappa-ds' : 5, 'all' : 4 } reader = CSV.DictReader( sys.stdin, dialect=options.csv_dialect ) stats = {} options.stdout.write( "id" ) for a, b in tests: options.stdout.write( "\t%s:%s\tp%s:%s" % (a, b, a, b)) stats[(a,b)] = 0 options.stdout.write( "\n" ) ninput, noutput, nskipped, nerrors, ntests = 0, 0, 0, 0, 0 for row in reader: ninput += 1 if int(row['N:len']) <= options.min_length or int(row['C:len']) <= options.min_length :
statement = "SELECT name FROM sqlite_master WHERE type='table'" cc = executewait(dbhandle, statement, error, options.retry) existing_tables = set([x[0] for x in cc]) cc.close() quick_import_statement = \ "sqlite3 -header -csv -separator '\t' %s '.import %%s %s'" % \ (options.database, options.tablename) if options.header is not None: options.header = [x.strip() for x in options.header.split(",")] if options.utf: reader = CSV.UnicodeDictReader(infile, dialect=options.dialect, fieldnames=options.header) else: reader = CSV.DictReader(infile, dialect=options.dialect, fieldnames=options.header) if options.replace_header: reader.next() E.info("reading %i columns to guess column types" % options.guess_size) rows = [] for row in reader: if None in row: raise ValueError("undefined columns in input file at row: %s" %
def main(argv=None): parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "--output-filename-pattern", dest="output_filename_pattern", type="string", help="pattern for additional output files [%default].", ) parser.set_defaults( length=1000, minimum_coverage=0.90, maximum_reads=[1, 10, 20, 50, 100], output_filename_pattern="%s", normalize=True, ) (options, args) = E.Start(parser, add_csv_options=True) fields, table = CSV.readTable(sys.stdin, dictreader=CSV.DictReaderLarge) map_fields2column = {} for x in fields: map_fields2column[x] = len(map_fields2column) coverage_5prime = numpy.zeros(options.length, numpy.float) coverage_3prime = numpy.zeros(options.length, numpy.float) coverage_maxreads5prime = numpy.zeros(options.length, numpy.float) coverage_maxreads3prime = numpy.zeros(options.length, numpy.float) coverage_full5prime = numpy.zeros(options.length, numpy.float) coverage_full3prime = numpy.zeros(options.length, numpy.float) coverage_min5prime = numpy.zeros(options.length, numpy.float) coverage_min3prime = numpy.zeros(options.length, numpy.float) histograms = [] for x in range(len(options.maximum_reads)): histograms.append([numpy.zeros(options.length, numpy.float), numpy.zeros(options.length, numpy.float), 0]) ninput, noutput, nfull, nmincov, nskipped, nlength, nmaxreads = 0, 0, 0, 0, 0, 0, 0 for row in table: length, covered, meancov, data, nreads = ( int(row["cov_nval"]), float(row["cov_covered"]), float(row["cov_mean"]), row["cov_values"], int(row["nover2"]), ) ninput += 1 if length < options.length: nlength += 1 continue if data == "na": nskipped += 1 continue noutput += 1 mincov = covered / length values = list(map(float, data.split(";"))) m = max(values) values = [x / m for x in values] coverage_5prime += values[0:1000] coverage_3prime += values[-1000:] if mincov >= 1.0: coverage_full5prime += values[0:1000] coverage_full3prime += values[-1000:] nfull += 1 if meancov >= options.minimum_coverage: coverage_min5prime += values[0:1000] coverage_min3prime += values[-1000:] nmincov += 1 for maxreads in range(len(options.maximum_reads)): if nreads <= options.maximum_reads[maxreads]: histograms[maxreads][0] += values[0:1000] histograms[maxreads][1] += values[-1000:] histograms[maxreads][2] += 1 if options.normalize: for x5, x3 in ( (coverage_5prime, coverage_3prime), (coverage_min5prime, coverage_min3prime), (coverage_full5prime, coverage_full3prime), ): m = max((max(x5), max(x3))) x3 /= m x5 /= m for x5, x3, c in histograms: m = max((max(x5), max(x3))) x5 /= m x3 /= m outfile = options.stdout outfile.write( "\t".join(("distance", "minlen-5'", "minlen-3'", "mincov-5'", "mincov-3'", "full-5'", "full-3'")) + "\n" ) for x in range(0, options.length): outfile.write( "\t".join( [ "%6.4f" % x for x in ( x, coverage_5prime[x], coverage_3prime[x], coverage_min5prime[x], coverage_min3prime[x], coverage_full5prime[x], coverage_full3prime[x], ) ] ) + "\n" ) outfile5 = IOTools.openFile(options.output_filename_pattern % "reads5", "w") outfile3 = IOTools.openFile(options.output_filename_pattern % "reads3", "w") outfile5.write( "\t".join(["distance"] + ["reads%i" % options.maximum_reads[y] for y in range(len(options.maximum_reads))]) + "\n" ) outfile3.write( "\t".join(["distance"] + ["reads%i" % options.maximum_reads[y] for y in range(len(options.maximum_reads))]) + "\n" ) for x in range(0, options.length): outfile5.write( "%i\t%s\n" % (x, "\t".join(["%6.4f" % histograms[y][0][x] for y in range(len(options.maximum_reads))])) ) outfile3.write( "%i\t%s\n" % 
(x, "\t".join(["%6.4f" % histograms[y][1][x] for y in range(len(options.maximum_reads))])) ) E.info( "ninput=%i, noutput=%i, nmaxreads=%i, nfull=%i, nmincov=%i, nskipped=%i, nlength=%i" % (ninput, noutput, nmaxreads, nfull, nmincov, nskipped, nlength) ) E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: optic/analyze_sites_slr.py 2781 2009-09-10 11:33:14Z andreas $" ) parser.add_option("--method", dest="method", type="choice", choices=("summary-slr", "summary-filtered", "over-representation", "positive-site-table", "negative-site-table", "neutral-site-table", "positive-site-list", "negative-site-list", "neutral-site-list"), help="method to apply.") parser.add_option("--prefix", dest="prefix", type="string", help="prefix for rows.") parser.add_option("-s", "--filename-sites", dest="filename_sites", type="string", help="filename with sites information.") parser.add_option("-l", "--filename-log", dest="filename_log", type="string", help="filename with logging information.") parser.add_option( "-m", "--filename-mali", dest="filename_mali", type="string", help= "filename of multiple alignment, that was input to SLR. If given, is used to filter indels." ) parser.add_option( "--filter-probability", dest="filter_probability", type="float", help="threshold for probability above which to include positive sites." ) parser.add_option("--no-header", dest="write_header", action="store_false", help="only output header.") parser.add_option("--only-header", dest="only_header", action="store_true", help="only output header.") parser.add_option("--significance-threshold", dest="significance_threshold", type="float", help="threshold for significance tests [%default].") parser.add_option("--use-adjusted", dest="use_adjusted", action="store_true", help="use SLR adjusted probability values.") parser.add_option("--truncate-sites-list", dest="truncate_sites_list", type="int", help="truncate sites list after ## entries (0 for all).") parser.add_option( "--context-size", dest="context_size", type="int", help="size of left/right context around a selected residue.") parser.set_defaults( prefix=None, filter_probability=0, filter_omega=0, filename_sites="-", filename_log=None, filename_mali=None, significance_threshold=0.05, write_header=True, only_header=False, use_adjusted=False, context_size=0, truncate_sites_list=0, ) (options, args) = E.Start(parser) slr = WrapperSlr.Slr() # write headers if "%s" in options.filename_sites: options.prefix = True if options.method == "summary-slr": # write header if options.write_header or options.only_header: if options.loglevel >= 1: options.stdlog.write( """# Numbers of positive/neutral/negative sites according to SLR # # This uses the thresholds as set in SLR. Use "counts" for filtering # residues based on your own thresholds """) thresholds = "95%", "99%", "95% corrected", "99% corrected" if options.prefix: options.stdout.write("prefix\t") options.stdout.write( "ltree\tomega\tkappa\tlnL\tnsites\tnsyn\tngap\t") options.stdout.write("\t".join( map(lambda x: "npos_" + x.replace(" ", "_"), thresholds))) options.stdout.write("\t") options.stdout.write("\t".join( map(lambda x: "nneg_" + x.replace(" ", "_"), thresholds))) options.stdout.write("\n") elif options.method == "summary-filtered": # write header if options.write_header or options.only_header: if options.loglevel >= 1: options.stdlog.write( """# Numbers of positive/neutral/negative sites according to SLR # # This method uses the supplied threshold and the multiple alignment to filter. # All positions that are above the threshold (P-Value) and which are located in # indels: >= 1 sequence missing from column, are removed. 
""") if options.prefix: options.stdout.write("prefix\t") options.stdout.write( "ltree\tomega\tkappa\tlnL\tnsites\tnfiltered\tntotal\tnsyn\tnneg\tnpos\n" ) elif options.method in ("positive-site-table", "negative-site-table", "neutral-site-table"): # write header if options.write_header or options.only_header: if options.loglevel >= 1: options.stdlog.write( """# Numbers of positive/neutral/negative sites according to SLR # # Note: sequence positions are 1-based, but mali positions are 0-based. # Residues in indel positions have been removed and signifnicance was determined according # with a threshold of %5.2e """ % options.significance_threshold) if options.prefix: options.stdout.write("prefix\t") options.stdout.write("cluster\tnsites\tp-value\tsites\n") elif options.method in ("positive-site-list", "negative-site-list", "neutral-site-list"): # write header if options.write_header or options.only_header: if options.loglevel >= 1: options.stdlog.write( """# Sites under positive/neutral/negative selection according to SLR # # Note: sequence positions are 1-based, but mali positions are 0-based. # Residues in indel positions have been removed and signifnicance was determined according # with a threshold of %5.2e """ % options.significance_threshold) if options.prefix: options.stdout.write("prefix\t") options.stdout.write( "sequence\tn\taa\tseq_pos\tmali_pos\tcontext\n") elif options.method == "over-representation": # write header if options.write_header or options.only_header: if options.loglevel >= 1: options.stdlog.write("""# Genes with over-represented sites. # # This method uses as input the output of summary-filtered. """) if options.only_header: sys.exit(0) if options.method in ("summary-slr", "summary-filtered", "positive-site-table", "negative-site-table", "neutral-site-table", "positive-site-list", "negative-site-list", "neutral-site-list"): ninput, noutput, nskipped = 0, 0, 0 if "%s" in options.filename_sites: headers, table = CSV.ReadTable(sys.stdin) fprefix = headers.index("prefix") try: fsignificance = headers.index("p") except ValueError: fsignificance = None for row in table: id = row[fprefix] if fsignificance is not None: p_value = row[fsignificance] else: p_value = None ninput += 1 fn = re.sub("%s", id, options.filename_sites) if not os.path.exists(fn): nskipped += 1 continue lines_sites = open(fn, "r").readlines() if options.filename_log: lines_log = open(re.sub("%s", id, options.filename_log), "r").readlines() result = slr.parseOutput(lines_sites, lines_log) if options.method in ("summary-filtered", "positive-site-table", "negative-site-table", "neutral-site-table"): mali = Mali.Mali() mali.readFromFile( open(re.sub("%s", id, options.filename_mali), "r")) else: mali = None ProcessResult(result, options, mali, prefix=id, p_value=p_value) noutput += 1 else: if options.filename_sites == "-": lines_sites = sys.stdin.readlines() else: lines_sites = open(options.filename_sites, "r").readlines() ninput += 1 if options.filename_log: lines_log = open(options.filename_log, "r").readlines() result = slr.parseOutput(lines_sites, lines_log) if options.filename_mali: mali = Mali.Mali() mali.readFromFile(open(options.filename_mali, "r")) else: if options.method == "summary-filtered": raise "please supply a multiple alignment for filtering." 
mali = None ProcessResult(result, options, mali, prefix=options.prefix) noutput += 1 if options.loglevel >= 1: options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i.\n" % (ninput, noutput, nskipped)) else: if options.method == "over-representation": results = [] for line in sys.stdin: if line[0] == "#": continue data = line[:-1].split("\t") if data[0] == "prefix": continue results.append( Result(data[0], int(data[6]), int(data[7]), int(data[8]), int(data[9]), int(data[10]))) # probability of a single site being positive ntotal = sum(map(lambda x: x.mNTotal, results)) npositives = sum(map(lambda x: x.mNPositive, results)) p = float(npositives) / float(ntotal) if options.loglevel >= 1: options.stdlog.write("# sites: total=%i, positive=%i, p=%f\n" % (ntotal, npositives, p)) new_results = [] for result in results: if result.mNTotal == 0: continue # use -1, because I need P( x >= X) # sf = 1 - cdf and cdf = P( x <= X ), thus sf = 1 - P( x <= X ) # = P (x > X ). r = scipy.stats.binom.sf(result.mNPositive - 1, result.mNTotal, p) result.mSignificance = r if r < options.significance_threshold: new_results.append(result) new_results.sort( lambda x, y: cmp(x.mSignificance, y.mSignificance)) options.stdlog.write(Result().getHeader() + "\n") for result in new_results: options.stdout.write(str(result) + "\n") if options.loglevel >= 1: options.stdlog.write("# ntotal=%i, npos=%i\n" % (len(results), len(new_results))) E.Stop()
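# A sketch of the over-representation test used above: with p estimated as the
# global fraction of positive sites, the survival function of the binomial
# distribution gives P(X >= observed) per gene; the "- 1" converts the strict
# inequality of sf() into ">=". Function name is illustrative.
from scipy import stats


def over_representation_pvalue(n_positive, n_total, p):
    """Probability of observing at least n_positive positive sites out of n_total."""
    return stats.binom.sf(n_positive - 1, n_total, p)


# example: 5 positive sites out of 100 at a background rate of 1%
# over_representation_pvalue(5, 100, 0.01) -> about 0.0035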
def createTable(dbhandle, error, options, rows=None, headers=None, first_column=None, existing_tables=[]): # create table by guessing column types from data type. if rows: map_column2type, ignored, max_values = CSV.GetMapColumn2Type( rows, ignore_empty=options.ignore_empty, get_max_values=True) if ignored: E.info("ignored columns: %s" % str(ignored)) headers = map_column2type.keys() headers.sort() elif headers: map_column2type = dict(zip(headers, [ None, ] * len(headers))) ignored = 0 columns_to_ignore = set([x.lower() for x in options.ignore_columns]) columns_to_rename = dict( [x.lower().split(":") for x in options.rename_columns]) take = [] # associate headers to field names columns = [] present = {} for header_index, h in enumerate(headers): hh = h if options.lowercase: hh = string.lower(h) if hh in columns_to_ignore: continue if hh in present: if options.ignore_duplicates: continue else: raise ValueError("duplicate column %s" % hh) present[hh] = 1 take.append(h) if map_column2type[h] == int: max_value = max_values[h] if max_value > 2147483647: t = "BIGINT DEFAULT '0'" elif max_value > 32767: t = "INTEGER DEFAULT '0'" else: t = "SMALLINT DEFAULT '0'" elif map_column2type[h] == float: t = "FLOAT DEFAULT '0'" else: t = "TEXT" # remove special characters from column names if hh == "": if first_column is not None and header_index == 0: hh = first_column else: raise ValueError("column '%s' without header " % h) hh = columns_to_rename.get(hh, hh) hh = re.sub('''['"]''', "", hh) hh = re.sub("[,;.:\-\+/ ()%?]", "_", hh) if hh[0] in "0123456789": hh = "_" + hh columns.append("%s %s" % (hh, t)) # delete old table if it exists while 1: try: cc = dbhandle.cursor() cc.execute("DROP TABLE IF EXISTS '%s'" % options.tablename) dbhandle.commit() cc.close() E.info("existing table %s deleted" % options.tablename) except sqlite3.OperationalError, msg: E.warn(msg) time.sleep(5) continue except error, msg: E.warn("could not delete existing table %s: %s" % (options.tablename, str(msg))) dbhandle.rollback() if not options.retry: raise error, msg elif options.tablename in existing_tables: # table exists, but drop did not work (e.g. database lock) time.sleep(5) continue else: # table might not have existed break
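# The drop/create loops above retry when sqlite reports that the database is
# locked. A stripped-down version of that retry pattern (names illustrative):
import re
import sqlite3
import time


def execute_with_retry(dbhandle, statement, retry=True, pause=5):
    """Execute a statement, retrying while the sqlite database is locked."""
    while True:
        try:
            cc = dbhandle.cursor()
            cc.execute(statement)
            dbhandle.commit()
            cc.close()
            return
        except sqlite3.OperationalError as msg:
            if not retry or not re.search("locked", str(msg)):
                raise
            time.sleep(pause)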
def createTable(dbhandle, error, tablename, options,
                retry=True,
                ignore_empty=True,
                ignore_columns=[],
                rename_columns=[],
                lowercase=False,
                ignore_duplicates=True,
                indices=[],
                rows=None,
                headers=None,
                first_column=None,
                existing_tables=set(),
                append=False):

    # create table by guessing column types from data type.
    if rows:
        map_column2type, ignored, max_values = CSV.getMapColumn2Type(
            rows,
            ignore_empty=ignore_empty,
            get_max_values=True)
        if ignored:
            E.info("ignored columns: %s" % str(ignored))

        headers = list(map_column2type.keys())
        headers.sort()

    elif headers:
        map_column2type = dict(list(zip(headers, [None] * len(headers))))
        ignored = 0

    columns_to_ignore = set([x.lower() for x in ignore_columns])
    columns_to_rename = dict([x.lower().split(":") for x in rename_columns])

    take = []
    # associate headers to field names
    columns = []
    present = {}
    for header_index, h in enumerate(headers):
        hh = h
        if lowercase:
            hh = string.lower(h)

        if hh in columns_to_ignore:
            continue

        if hh in present:
            if ignore_duplicates:
                continue
            else:
                raise ValueError("duplicate column %s" % hh)

        present[hh] = 1
        take.append(h)

        if map_column2type[h] == int:
            max_value = max_values[h]
            if max_value > 2147483647:
                t = "BIGINT DEFAULT '0'"
            elif max_value > 32767:
                t = "INTEGER DEFAULT '0'"
            else:
                t = "SMALLINT DEFAULT '0'"
        elif map_column2type[h] == float:
            t = "FLOAT DEFAULT '0'"
        else:
            if h in options.indices:
                t = options.index
            else:
                t = options.text

        # remove special characters from column names
        if hh == "":
            if first_column is not None and header_index == 0:
                hh = first_column
            else:
                raise ValueError("column '%s' without header " % h)

        hh = columns_to_rename.get(hh, hh)
        hh = re.sub("""['"]""", "", hh)
        hh = re.sub("[,;.:\-\+/ ()%?]", "_", hh)
        if hh[0] in "0123456789":
            hh = "_" + hh
        columns.append("%s %s" % (hh, t))

    if not options.append:
        # delete old table if it exists
        while 1:
            try:
                cc = dbhandle.cursor()
                # mysql: removed '' around table name
                statement = "DROP TABLE IF EXISTS %s" % tablename
                E.debug(statement)
                cc.execute(statement)
                dbhandle.commit()
                cc.close()
                E.info("existing table %s deleted" % tablename)
            except sqlite3.OperationalError as msg:
                E.warn(msg)
                time.sleep(5)
                continue
            except error as msg:
                E.warn("could not delete existing table %s: %s" %
                       (tablename, str(msg)))
                dbhandle.rollback()
                if not retry:
                    raise error(msg)
                elif tablename in existing_tables:
                    # table exists, but drop did not work (e.g. database lock)
                    time.sleep(5)
                    continue
                else:
                    # table might not have existed
                    break
            break

    # create new table
    statement = "CREATE TABLE %s ( %s );" % (tablename, ", ".join(columns))

    E.debug("table create:\n# %s" % (statement))

    while 1:
        try:
            cc = dbhandle.cursor()
            cc.execute(statement)
            cc.close()
            dbhandle.commit()
        except error as msg:
            E.warn("table creation failed: msg=%s, statement=\n %s" %
                   (msg, statement))
            # TODO: check for database locked msg
            if not retry:
                raise error(msg)
            if not re.search("locked", str(msg)):
                raise error("%s: %s" % (msg, statement))
            time.sleep(5)
            continue
        break

    E.info("table %s created successfully." % tablename)

    return take, map_column2type, ignored
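# A hedged usage sketch for createTable above. The option object and the
# sample headers are hypothetical; in the real script the options come from
# its command-line parser, and the module-level imports it relies on
# (E, CSV, re, string, time) are assumed to be available. With headers but no
# rows, no column types are guessed and every column falls back to the
# configured text type.

import sqlite3
import argparse

# stand-in for the script's option object; only the attributes createTable
# actually reads are provided (indices, index, text, append).
opts = argparse.Namespace(indices=[], index="TEXT", text="TEXT", append=False)

dbhandle = sqlite3.connect(":memory:")

take, column_types, ignored = createTable(
    dbhandle,
    sqlite3.OperationalError,          # exception class used for retries
    tablename="example_table",
    options=opts,
    headers=["gene_id", "length", "GC%"],
)
# expected statement:
#   CREATE TABLE example_table ( gene_id TEXT, length TEXT, GC_ TEXT );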