def ReadGeneLists(filename_genes, gene_pattern=None):
    """read gene lists from a file in matrix format.

    Returns a tuple (list of all genes, dictionary of gene lists).
    """
    if filename_genes == "-":
        infile = sys.stdin
    else:
        infile = IOTools.open_file(filename_genes, "r")

    headers, table = CSV.readTable(infile.readlines(), as_rows=False)

    if filename_genes != "-":
        infile.close()

    all_genes = table[0]

    # if there is only a single column, add a dummy column
    if len(table) == 1:
        table.append([1] * len(table[0]))
        headers.append("foreground")

    E.info("read %i genes from %s" % (len(all_genes), filename_genes))

    if gene_pattern:
        rx = re.compile(gene_pattern)
        all_genes = [rx.search(x).groups()[0] for x in all_genes]

    gene_lists = collections.OrderedDict()
    for header, col in zip(headers[1:], table[1:]):
        # a gene belongs to a list if its value in that column is not "0"
        gene_lists[header] = set(
            [x for x, y in zip(all_genes, col) if y != "0"])

    return all_genes, gene_lists
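# Usage sketch (illustrative only): calling ReadGeneLists on a hypothetical
# tab-separated file "genes.tsv" whose first column holds gene ids and whose
# remaining 0/1 columns define list membership. The filename and layout are
# assumptions, not part of this module.
def _example_read_gene_lists():
    all_genes, gene_lists = ReadGeneLists("genes.tsv")
    for name, genes in gene_lists.items():
        E.info("list %s contains %i of %i genes" %
               (name, len(genes), len(all_genes)))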
def readAndExpandTable(infile, options):
    '''split multi-valued fields in a table at a separator.

    If a field in a row contains multiple values, the row is expanded
    into multiple rows so that each value gets its own row.
    '''
    fields, table = CSV.readTable(
        infile, with_header=options.has_headers, as_rows=True)

    options.stdout.write("\t".join(fields) + "\n")

    for row in table:
        # split each field into its values
        data = []
        for x in range(len(fields)):
            data.append(row[x].split(options.separator))

        # pad all fields to the same number of values
        nrows = max([len(d) for d in data])
        for d in data:
            d += [""] * (nrows - len(d))

        for n in range(nrows):
            options.stdout.write("\t".join([d[n] for d in data]) + "\n")
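# Minimal standalone sketch of the expand technique above, using only the
# standard library. The sample row and the ";" separator are assumptions.
def _example_expand_row():
    row = ["geneA", "1;2;3", "x;y"]
    data = [field.split(";") for field in row]
    # pad every field to the maximum number of values
    depth = max(len(d) for d in data)
    for d in data:
        d += [""] * (depth - len(d))
    for n in range(depth):
        print("\t".join(d[n] for d in data))
    # prints:
    # geneA <tab> 1 <tab> x
    #       <tab> 2 <tab> y
    #       <tab> 3 <tab>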
def buildSelectStatementfromPed(filter_type, pedfile, template):
    '''Build a select statement from a template and a pedigree file'''
    pedigree = csv.DictReader(
        IOTools.open_file(pedfile), delimiter='\t',
        fieldnames=['family', 'sample', 'father', 'mother', 'sex', 'status'])

    affecteds = []
    unaffecteds = []
    parents = []
    select = None

    # loop over the pedigree file and establish relationships
    for row in pedigree:
        if row['status'] == '2':
            if filter_type == "denovo":
                father = row['father']
                mother = row['mother']
                proband = row['sample']
            elif filter_type in ("dominant", "recessive"):
                affecteds += [row['sample']]
                if filter_type == "recessive":
                    parents += [row['father'], row['mother']]
        if row['status'] == '1':
            if filter_type == "dominant":
                unaffecteds += [row['sample']]
            elif filter_type == "recessive":
                if row['sample'] not in parents:
                    unaffecteds += [row['sample']]

    # build the select statement from the template
    if filter_type == "denovo":
        select = template.replace("father", father)
        select = select.replace("mother", mother)
        select = select.replace("proband", proband)
    elif filter_type == "dominant":
        affecteds_exp = '").getPL().1==0&&vc.getGenotype("'.join(affecteds)
        if len(unaffecteds) == 0:
            unaffecteds_exp = ''
        else:
            unaffecteds_exp = '&&vc.getGenotype("' + \
                ('").isHomRef()&&vc.getGenotype("'.join(unaffecteds)) + \
                '").isHomRef()'
        select = template.replace("affecteds_exp", affecteds_exp)
        select = select.replace("unaffecteds_exp", unaffecteds_exp)
    elif filter_type == "recessive":
        affecteds_exp = '").getPL().2==0&&vc.getGenotype("'.join(affecteds)
        unaffecteds_exp = '").getPL().2!=0&&vc.getGenotype("'.join(
            unaffecteds)
        if len(parents) == 0:
            parents_exp = ''
        else:
            parents_exp = '&&vc.getGenotype("' + \
                ('").getPL().1==0&&vc.getGenotype("'.join(parents)) + \
                '").getPL().1==0'
        select = template.replace("affecteds_exp", affecteds_exp)
        select = select.replace("unaffecteds_exp", unaffecteds_exp)
        select = select.replace("parents_exp", parents_exp)

    return select
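# Illustrative sketch of the template substitution used above. The template
# string is a hypothetical GATK JEXL fragment with "father"/"mother"/"proband"
# placeholders; it is not one shipped with this module.
def _example_denovo_template():
    template = ('vc.getGenotype("father").isHomRef()'
                '&&vc.getGenotype("mother").isHomRef()'
                '&&vc.getGenotype("proband").isHet()')
    select = template.replace("father", "SAMPLE_F") \
                     .replace("mother", "SAMPLE_M") \
                     .replace("proband", "SAMPLE_P")
    print(select)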
def readAndJoinTable(infile, options):
    '''unstack a table.

    Rows sharing the value in the join column are merged into a single
    row; the merged columns are prefixed by the value of the name column.
    '''
    fields, table = CSV.readTable(
        infile, with_header=options.has_headers, as_rows=True)

    join_column = options.join_column - 1
    join_name = options.join_column_name - 1

    join_rows = list(set([x[join_column] for x in table]))
    join_rows.sort()

    join_names = list(set([x[join_name] for x in table]))
    join_names.sort()

    join_columns = list(
        set(range(len(fields))).difference(set((join_column, join_name))))
    join_columns.sort()

    new_table = []
    map_old2new = {}

    map_name2start = {}
    x = 1
    for name in join_names:
        map_name2start[name] = x
        x += len(join_columns)

    row_width = len(join_columns) * len(join_names)
    for x in join_rows:
        map_old2new[x] = len(map_old2new)
        new_row = [x, ] + ["na"] * row_width
        new_table.append(new_row)

    for row in table:
        row_index = map_old2new[row[join_column]]
        start = map_name2start[row[join_name]]
        for x in join_columns:
            new_table[row_index][start] = row[x]
            start += 1

    # print new table
    options.stdout.write(fields[join_column])
    for name in join_names:
        for column in join_columns:
            options.stdout.write(
                "\t%s%s%s" % (name, options.separator, fields[column]))
    options.stdout.write("\n")

    for row in new_table:
        options.stdout.write("\t".join(row) + "\n")
def readAndCollapseTable(infile, options, missing_value=""):
    '''collapse a table.

    Collapse a table of two columns with row names in the first
    column. Outputs a table with multiple columns for each row name.
    '''
    fields, table = CSV.readTable(
        infile, with_header=options.has_headers, as_rows=True)

    if len(fields) != 2:
        raise NotImplementedError("can only work on tables with two columns")

    values = collections.defaultdict(list)

    # the first row name; its reappearance marks the start of a new block
    separator = table[0][0]
    row_names = set([x[0] for x in table])

    row_name, value = table[0]
    values[row_name].append(value)
    added = set([row_name])
    for row_name, value in table[1:]:
        if row_name == separator:
            # a new block starts: pad row names missing from the last block
            for r in row_names:
                if r not in added:
                    values[r].append(missing_value)
            added = set()
        values[row_name].append(value)
        added.add(row_name)

    # pad row names missing from the final block
    for r in row_names:
        if r not in added:
            values[r].append(missing_value)

    sizes = set([len(x) for x in list(values.values())])
    assert len(sizes) == 1, "unequal number of row_names"
    size = list(sizes)[0]

    options.stdout.write(
        "row\t%s\n" % ("\t".join(["column_%i" % x for x in range(size)])))
    for key, row in list(values.items()):
        options.stdout.write("%s\t%s\n" % (key, "\t".join(row)))
def getGODescriptions(infile):
    '''build dictionary mapping GOids to types and descriptions.

    Arguments
    ---------
    infile : string
        Filename of table with GO assignments

    Returns
    -------
    mapping : dict
        Dictionary mapping GOid to GOtype and GOdescription.
    '''
    with IOTools.open_file(infile) as inf:
        fields, table = CSV.readTable(inf, as_rows=False)

    return dict([(y, (x, z)) for x, y, z in
                 zip(table[fields.index("go_type")],
                     table[fields.index("go_id")],
                     table[fields.index("description")])])
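# Minimal standalone sketch of the column-zipping technique above: three
# parallel columns are zipped into a dictionary keyed by the middle one.
# The toy values are assumptions.
def _example_zip_columns():
    go_types = ["biological_process", "molecular_function"]
    go_ids = ["GO:0008150", "GO:0003674"]
    descriptions = ["biological process", "molecular function"]
    mapping = {y: (x, z)
               for x, y, z in zip(go_types, go_ids, descriptions)}
    # ('biological_process', 'biological process')
    print(mapping["GO:0008150"])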
def computeFDR(infile, options):
    '''compute FDR on a table.

    NOTE: the body below currently duplicates readAndExpandTable and
    performs no FDR computation; the "fdr" method in main() adjusts
    p-values via Stats.adjustPValues instead.
    '''
    fields, table = CSV.readTable(
        infile, with_header=options.has_headers, as_rows=True)

    options.stdout.write("\t".join(fields) + "\n")

    for row in table:
        data = []
        for x in range(len(fields)):
            data.append(row[x].split(options.separator))

        nrows = max([len(d) for d in data])
        for d in data:
            d += [""] * (nrows - len(d))

        for n in range(nrows):
            options.stdout.write("\t".join([d[n] for d in data]) + "\n")
def readAndGroupTable(infile, options):
    """read table from infile and group rows by a column."""
    fields, table = CSV.readTable(
        infile, with_header=options.has_headers, as_rows=True)
    options.columns = getColumns(fields, options.columns)
    assert options.group_column not in options.columns

    converter = float
    new_fields = [fields[options.group_column]] + \
        [fields[x] for x in options.columns]

    if options.group_function == "min":
        f = min
    elif options.group_function == "max":
        f = max
    elif options.group_function == "sum":
        f = lambda z: reduce(lambda x, y: x + y, z)
    elif options.group_function == "mean":
        f = scipy.mean
    elif options.group_function == "cat":
        f = lambda x: ";".join([y for y in x if y != ""])
        converter = str
    elif options.group_function == "uniq":
        f = lambda x: ";".join([y for y in set(x) if y != ""])
        converter = str
    elif options.group_function == "stats":
        f = lambda x: str(Stats.DistributionalParameters(x))
        # update headers
        new_fields = [fields[options.group_column]]
        for c in options.columns:
            new_fields += list(
                ["%s_%s" % (fields[c], x) for x in
                 Stats.DistributionalParameters().getHeaders()])

    # convert values to floats (except for group_column)
    # Delete rows with unconvertable values and not in options.columns
    new_table = []
    for row in table:
        skip = False
        new_row = [row[options.group_column]]

        for c in options.columns:
            if row[c] == options.missing_value:
                new_row.append(row[c])
            else:
                try:
                    new_row.append(converter(row[c]))
                except ValueError:
                    skip = True
                    break
        if not skip:
            new_table.append(new_row)
    table = new_table

    new_rows = CSV.groupTable(table, group_column=0, group_function=f)

    options.stdout.write("\t".join(new_fields) + "\n")
    for row in new_rows:
        options.stdout.write("\t".join(map(str, row)) + "\n")
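# Standalone sketch of grouping rows by a key column and applying an
# aggregate. CSV.groupTable's internals are not shown here; sorting plus
# itertools.groupby is an assumed stand-in for illustration. Toy rows are
# assumptions.
def _example_group_rows():
    import itertools
    rows = [("a", 1.0), ("b", 4.0), ("a", 3.0)]
    rows.sort(key=lambda r: r[0])
    for key, group in itertools.groupby(rows, key=lambda r: r[0]):
        vals = [r[1] for r in group]
        print(key, sum(vals) / len(vals))  # mean per group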
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--output-filename-pattern",
                      dest="output_filename_pattern", type="string",
                      help="pattern for additional output files [%default].")

    parser.set_defaults(
        length=1000,
        minimum_coverage=0.90,
        maximum_reads=[1, 10, 20, 50, 100],
        output_filename_pattern="%s",
        normalize=True,
    )

    (options, args) = E.start(parser, add_csv_options=True)

    fields, table = CSV.readTable(sys.stdin, dictreader=CSV.DictReaderLarge)

    map_fields2column = {}
    for x in fields:
        map_fields2column[x] = len(map_fields2column)

    coverage_5prime = numpy.zeros(options.length, numpy.float64)
    coverage_3prime = numpy.zeros(options.length, numpy.float64)

    coverage_maxreads5prime = numpy.zeros(options.length, numpy.float64)
    coverage_maxreads3prime = numpy.zeros(options.length, numpy.float64)

    coverage_full5prime = numpy.zeros(options.length, numpy.float64)
    coverage_full3prime = numpy.zeros(options.length, numpy.float64)

    coverage_min5prime = numpy.zeros(options.length, numpy.float64)
    coverage_min3prime = numpy.zeros(options.length, numpy.float64)

    histograms = []
    for x in range(len(options.maximum_reads)):
        histograms.append([numpy.zeros(options.length, numpy.float64),
                           numpy.zeros(options.length, numpy.float64),
                           0])

    ninput, noutput, nfull, nmincov, nskipped, nlength, nmaxreads = \
        0, 0, 0, 0, 0, 0, 0

    for row in table:
        length, covered, meancov, data, nreads = (int(row["cov_nval"]),
                                                  float(row["cov_covered"]),
                                                  float(row["cov_mean"]),
                                                  row["cov_values"],
                                                  int(row["nover2"]))
        ninput += 1
        if length < options.length:
            nlength += 1
            continue

        if data == "na":
            nskipped += 1
            continue

        noutput += 1
        mincov = covered / length
        values = list(map(float, data.split(";")))
        m = max(values)
        values = [x / m for x in values]

        coverage_5prime += values[0:1000]
        coverage_3prime += values[-1000:]

        if mincov >= 1.0:
            coverage_full5prime += values[0:1000]
            coverage_full3prime += values[-1000:]
            nfull += 1

        if meancov >= options.minimum_coverage:
            coverage_min5prime += values[0:1000]
            coverage_min3prime += values[-1000:]
            nmincov += 1

        for maxreads in range(len(options.maximum_reads)):
            if nreads <= options.maximum_reads[maxreads]:
                histograms[maxreads][0] += values[0:1000]
                histograms[maxreads][1] += values[-1000:]
                histograms[maxreads][2] += 1

    if options.normalize:
        for x5, x3 in ((coverage_5prime, coverage_3prime),
                       (coverage_min5prime, coverage_min3prime),
                       (coverage_full5prime, coverage_full3prime)):
            m = max((max(x5), max(x3)))
            x3 /= m
            x5 /= m

        for x5, x3, c in histograms:
            m = max((max(x5), max(x3)))
            x5 /= m
            x3 /= m

    outfile = options.stdout
    outfile.write("\t".join(("distance", "minlen-5'", "minlen-3'",
                             "mincov-5'", "mincov-3'",
                             "full-5'", "full-3'")) + "\n")

    for x in range(0, options.length):
        outfile.write("\t".join(
            ["%6.4f" % v for v in
             (x,
              coverage_5prime[x], coverage_3prime[x],
              coverage_min5prime[x], coverage_min3prime[x],
              coverage_full5prime[x], coverage_full3prime[x])]) + "\n")

    outfile5 = IOTools.open_file(
        options.output_filename_pattern % "reads5", "w")
    outfile3 = IOTools.open_file(
        options.output_filename_pattern % "reads3", "w")

    outfile5.write("\t".join(["distance"] +
                             ["reads%i" % options.maximum_reads[y]
                              for y in range(len(options.maximum_reads))]) +
                   "\n")
    outfile3.write("\t".join(["distance"] +
                             ["reads%i" % options.maximum_reads[y]
                              for y in range(len(options.maximum_reads))]) +
                   "\n")

    for x in range(0, options.length):
        outfile5.write("%i\t%s\n" % (x, "\t".join(
            ["%6.4f" % histograms[y][0][x]
             for y in range(len(options.maximum_reads))])))
        outfile3.write("%i\t%s\n" % (x, "\t".join(
            ["%6.4f" % histograms[y][1][x]
             for y in range(len(options.maximum_reads))])))

    E.info("ninput=%i, noutput=%i, nmaxreads=%i, nfull=%i, nmincov=%i, "
           "nskipped=%i, nlength=%i" %
           (ninput, noutput, nmaxreads, nfull, nmincov, nskipped, nlength))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_intersection.py 2782 "
        "2009-09-10 11:40:29Z andreas $")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="output rows are unique.")

    parser.set_defaults(
        remove=False,
        unique=False,
    )

    (options, args) = E.start(parser, add_csv_options=True)

    if len(args) != 2:
        raise ValueError("please specify two files to join")

    options.filename1, options.filename2 = args

    # CSV.readTable returns a (fields, table) tuple
    fields1, table1 = CSV.readTable(
        IOTools.open_file(options.filename1, "r"))
    fields2, table2 = CSV.readTable(
        IOTools.open_file(options.filename2, "r"))

    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    # build new field list
    # FIXME: options.join_fields1 and options.join_fields2 are never
    # defined by this parser (they belong to csv_set.py).
    new_fields = []
    for x in options.join_fields1:
        new_fields.append(x)

    for x in fields1:
        if x not in options.join_fields1:
            new_fields.append(x)
        if x not in options.join_fields2:
            new_fields.append(x)

    # FIXME: "fields", "lines" and "input_fields" below are also
    # undefined; this block appears to have been copied from csv_cut.py.
    writer = csv.DictWriter(outfile,
                            fields,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    if len(lines) > 0:
        old_fields = lines[0][:-1].split("\t")

        if options.remove:
            fields = []
            for x in old_fields:
                if x not in input_fields:
                    fields.append(x)
        else:
            fields = input_fields

        reader = csv.DictReader(lines, dialect=options.csv_dialect)

        print("\t".join(fields))

        first_row = True
        for row in reader:
            row = IOTools.convertDictionary(row)
            writer.writerow(row)

    E.stop()
def run(infile, options, report_step=10000):

    options.tablename = quoteTableName(options.tablename,
                                       backend=options.database_backend)

    if options.map:
        m = {}
        for x in options.map:
            f, t = x.split(":")
            m[f] = t
        options.map = m
    else:
        options.map = {}

    existing_tables = set()

    quick_import_separator = "\t"

    if options.database_backend == "postgres":
        import psycopg2
        raise NotImplementedError("needs refactoring for commandline options")
        dbhandle = psycopg2.connect(options.psql_connection)
        error = psycopg2.Error
        options.null = "NULL"
        options.string_value = "'%s'"
        options.text = "TEXT"
        options.index = "TEXT"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    elif options.database_backend == "mysql":
        import MySQLdb
        dbhandle = MySQLdb.connect(host=options.database_host,
                                   user=options.database_username,
                                   passwd=options.database_password,
                                   port=options.database_port,
                                   db=options.database_name)
        error = Exception
        options.null = "NULL"
        options.string_value = "%s"
        options.text = "TEXT"
        options.index = "VARCHAR(40)"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    elif options.database_backend == "sqlite":
        import sqlite3
        dbhandle = sqlite3.connect(options.database_name)
        try:
            os.chmod(options.database_name, 0o664)
        except OSError as msg:
            E.warn("could not change permissions of database: %s" % msg)

        # Avoid the following error:
        # sqlite3.ProgrammingError: You must not use 8-bit bytestrings
        # unless you use a text_factory that can interpret 8-bit
        # bytestrings (like text_factory = str). It is highly
        # recommended that you instead just switch your application
        # to Unicode strings
        # Note: might be better to make csv2db unicode aware.
        dbhandle.text_factory = str

        error = sqlite3.OperationalError
        options.insert_many = True  # False
        options.null = None  # "NULL"
        options.text = "TEXT"
        options.index = "TEXT"
        options.string_value = "%s"  # "'%s'"

        statement = "SELECT name FROM sqlite_master WHERE type='table'"
        cc = executewait(dbhandle, statement, error, options.retry)
        existing_tables = set([x[0] for x in cc])
        cc.close()

        # use , as separator
        quick_import_statement = \
            "sqlite3 %s '.import %%s %s'" % \
            (options.database_name, options.tablename)

        quick_import_separator = "|"

    if options.header is not None:
        options.header = [x.strip() for x in options.header.split(",")]

    if options.utf:
        reader = CSV.UnicodeDictReader(infile,
                                       dialect=options.dialect,
                                       fieldnames=options.header)
    else:
        reader = csv.DictReader(CSV.CommentStripper(infile),
                                dialect=options.dialect,
                                fieldnames=options.header)

    if options.replace_header:
        try:
            next(reader)
        except StopIteration:
            pass

    E.info("reading %i columns to guess column types" % options.guess_size)

    rows = []
    for row in reader:
        if None in row:
            raise ValueError("undefined columns in input file at row: %s" %
                             row)

        try:
            rows.append(IOTools.convertDictionary(row, map=options.map))
        except TypeError as msg:
            E.warn("incomplete line? Type error in conversion: "
                   "'%s' with data: %s" % (msg, str(row)))
        except ValueError as msg:
            E.warn("incomplete line? Value error in conversion: "
                   "'%s' with data: %s" % (msg, str(row)))

        if len(rows) >= options.guess_size:
            break

    E.info("read %i rows for type guessing" % len(rows))
    E.info("creating table")

    if len(rows) == 0:
        if options.allow_empty:
            if not reader.fieldnames:
                E.warn("no data - no table created")
            else:
                # create empty table and exit
                take, map_column2type, ignored = createTable(
                    dbhandle,
                    error,
                    options.tablename,
                    options,
                    retry=options.retry,
                    headers=reader.fieldnames,
                    ignore_empty=options.ignore_empty,
                    ignore_columns=options.ignore_columns,
                    rename_columns=options.rename_columns,
                    lowercase=options.lowercase,
                    ignore_duplicates=options.ignore_duplicates,
                    indices=options.indices,
                    first_column=options.first_column,
                    existing_tables=existing_tables,
                    append=options.append)
                E.info("empty table created")
            return
        else:
            raise ValueError("empty table")
    else:
        take, map_column2type, ignored = createTable(
            dbhandle,
            error,
            options.tablename,
            options,
            rows=rows,
            retry=options.retry,
            headers=reader.fieldnames,
            ignore_empty=options.ignore_empty,
            ignore_columns=options.ignore_columns,
            rename_columns=options.rename_columns,
            lowercase=options.lowercase,
            ignore_duplicates=options.ignore_duplicates,
            indices=options.indices,
            first_column=options.first_column,
            existing_tables=existing_tables,
            append=options.append)

    def row_iter(rows, reader):
        for row in rows:
            yield quoteRow(row, take, map_column2type,
                           options.missing_values,
                           null=options.null,
                           string_value=options.string_value)
        for data in reader:
            yield quoteRow(IOTools.convertDictionary(data, map=options.map),
                           take, map_column2type,
                           options.missing_values,
                           null=options.null,
                           string_value=options.string_value)

    ninput = 0

    E.info("inserting data")

    if options.insert_quick:
        E.info("using quick insert")

        outfile, filename = tempfile.mkstemp()

        E.debug("dumping data into %s" % filename)

        for d in row_iter(rows, reader):

            ninput += 1
            os.write(outfile,
                     (quick_import_separator.join(
                         [str(d[x]) for x in take]) + "\n").encode("utf-8"))

            if ninput % report_step == 0:
                E.info("iteration %i\n" % ninput)

        os.close(outfile)

        statement = quick_import_statement % filename
        E.debug(statement)

        # infinite loop possible
        while 1:

            retcode = E.run(statement, cwd=os.getcwd(), close_fds=True)

            if retcode != 0:
                E.warn("import error using statement: %s" % statement)

                if not options.retry:
                    raise ValueError(
                        "import error using statement: %s" % statement)

                time.sleep(5)
                continue

            break

        os.remove(filename)

        # there is no way to insert NULL values into sqlite. The only
        # solution is to update all columns.
        for column in take:
            executewait(dbhandle,
                        "UPDATE %s SET %s = NULL WHERE %s = 'None'" %
                        (options.tablename, column, column),
                        error, options.retry)

    elif options.insert_many:
        data = []
        for d in row_iter(rows, reader):
            ninput += 1

            data.append([d[x] for x in take])

            if ninput % report_step == 0:
                E.info("iteration %i" % ninput)

        statement = "INSERT INTO %s VALUES (%s)" % (
            options.tablename, ",".join("?" * len(take)))

        E.info("inserting %i rows" % len(data))
        E.debug("multiple insert:\n# %s" % statement)

        while 1:
            try:
                dbhandle.executemany(statement, data)
            except error as msg:
                E.warn("import failed: msg=%s, statement=\n  %s" %
                       (msg, statement))
                # TODO: check for database locked msg
                if not options.retry:
                    raise error(msg)
                if not re.search("locked", str(msg)):
                    raise error(msg)
                time.sleep(5)
                continue
            break

    else:
        # insert line by line (could not figure out how to do bulk
        # loading with subprocess and COPY FROM STDIN)
        statement = "INSERT INTO %s VALUES (%%(%s)s)" % (
            options.tablename, ')s, %('.join(take))

        # output data used for guessing:
        for d in row_iter(rows, reader):

            ninput += 1
            E.debug("single insert:\n# %s" % (statement % d))
            cc = executewait(dbhandle, statement, error,
                             retry=options.retry,
                             args=d)
            cc.close()

            if ninput % report_step == 0:
                E.info("iteration %i" % ninput)

    E.info("building indices")
    nindex = 0
    for index in options.indices:

        nindex += 1
        try:
            statement = "CREATE INDEX %s_index%i ON %s (%s)" % (
                options.tablename, nindex, options.tablename, index)
            cc = executewait(dbhandle, statement, error, options.retry)
            cc.close()
            E.info("added index on column %s" % (index))
        except error as msg:
            E.info("adding index on column %s failed: %s" % (index, msg))

    statement = "SELECT COUNT(*) FROM %s" % (options.tablename)
    cc = executewait(dbhandle, statement, error, options.retry)
    result = cc.fetchone()
    cc.close()

    noutput = result[0]

    E.info("ninput=%i, noutput=%i, nskipped_columns=%i" %
           (ninput, noutput, len(ignored)))

    dbhandle.commit()
def createTable(dbhandle,
                error,
                tablename,
                options,
                retry=True,
                ignore_empty=True,
                ignore_columns=[],
                rename_columns=[],
                lowercase=False,
                ignore_duplicates=True,
                indices=[],
                rows=None,
                headers=None,
                first_column=None,
                existing_tables=set(),
                append=False):

    # create table by guessing column types from data type.
    if rows:
        map_column2type, ignored, max_values = CSV.getMapColumn2Type(
            rows,
            ignore_empty=ignore_empty,
            get_max_values=True)
        if ignored:
            E.info("ignored columns: %s" % str(ignored))

        headers = list(map_column2type.keys())
        headers.sort()

    elif headers:
        map_column2type = dict(list(zip(headers, [None, ] * len(headers))))
        ignored = 0

    columns_to_ignore = set([x.lower() for x in ignore_columns])
    columns_to_rename = dict([x.lower().split(":") for x in rename_columns])

    take = []
    # associate headers to field names
    columns = []
    present = {}
    for header_index, h in enumerate(headers):
        hh = h
        if lowercase:
            hh = h.lower()

        if hh in columns_to_ignore:
            continue

        if hh in present:
            if ignore_duplicates:
                continue
            else:
                raise ValueError("duplicate column %s" % hh)

        present[hh] = 1
        take.append(h)

        if map_column2type[h] == int:
            max_value = max_values[h]
            if max_value > 2147483647:
                t = "BIGINT DEFAULT '0'"
            elif max_value > 32767:
                t = "INTEGER DEFAULT '0'"
            else:
                t = "SMALLINT DEFAULT '0'"
        elif map_column2type[h] == float:
            t = "FLOAT DEFAULT '0'"
        else:
            if h in options.indices:
                t = options.index
            else:
                t = options.text

        # remove special characters from column names
        if hh == "":
            if first_column is not None and header_index == 0:
                hh = first_column
            else:
                raise ValueError("column '%s' without header " % h)
        hh = columns_to_rename.get(hh, hh)
        hh = re.sub(r'''['"]''', "", hh)
        hh = re.sub(r"[,;.:\-\+/ ()%?]", "_", hh)
        if hh[0] in "0123456789":
            hh = "_" + hh
        columns.append("%s %s" % (hh, t))

    if not append:

        # delete old table if it exists
        while 1:
            try:
                cc = dbhandle.cursor()
                # mysql: removed '' around table name
                statement = "DROP TABLE IF EXISTS %s" % tablename
                E.debug(statement)
                cc.execute(statement)
                dbhandle.commit()
                cc.close()
                E.info("existing table %s deleted" % tablename)
            except sqlite3.OperationalError as msg:
                E.warn(msg)
                time.sleep(5)
                continue
            except error as msg:
                E.warn("could not delete existing table %s: %s" %
                       (tablename, str(msg)))
                dbhandle.rollback()
                if not retry:
                    raise error(msg)
                elif tablename in existing_tables:
                    # table exists, but drop did not work
                    # (e.g. database lock)
                    time.sleep(5)
                    continue
                else:
                    # table might not have existed
                    break
            break

        # create new table
        statement = "CREATE TABLE %s ( %s );" % (
            tablename, ", ".join(columns))

        E.debug("table create:\n# %s" % (statement))

        while 1:
            try:
                cc = dbhandle.cursor()
                cc.execute(statement)
                cc.close()
                dbhandle.commit()
            except error as msg:
                E.warn("table creation failed: msg=%s, statement=\n  %s" %
                       (msg, statement))
                # TODO: check for database locked msg
                if not retry:
                    raise error(msg)
                if not re.search("locked", str(msg)):
                    raise error("%s: %s" % (msg, statement))
                time.sleep(5)
                continue
            break

        E.info("table %s created successfully." % tablename)

    return take, map_column2type, ignored
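# Standalone sketch of the column-name sanitization above: quotes are
# stripped, punctuation becomes "_", and names starting with a digit get a
# leading "_". The sample headers are assumptions.
def _example_sanitize_column_names():
    import re
    for h in ['p-value', '95% CI', '"quoted"', '2fold']:
        hh = re.sub(r'''['"]''', "", h)
        hh = re.sub(r"[,;.:\-\+/ ()%?]", "_", hh)
        if hh[0] in "0123456789":
            hh = "_" + hh
        print(h, "->", hh)
    # p-value -> p_value, 95% CI -> _95__CI, "quoted" -> quoted, 2fold -> _2fold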
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: data2bins.py 2782 2009-09-10 "
        "11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--column", dest="column", type="int",
                      help="column to split on.")

    parser.add_option("--num-bins", dest="num_bins", type="int",
                      help="number of bins to create.")

    parser.add_option("--method", dest="method", type="choice",
                      choices=("equal-sized-bins",),
                      help="method to use to bin data.")

    parser.add_option("--no-headers", dest="has_headers",
                      action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("-p", "--output-filename-pattern",
                      dest="output_filename_pattern", type="string",
                      help="OUTPUT filename with histogram information "
                      "on aggregate coverages [%default].")

    parser.set_defaults(
        has_headers=True,
        method="equal-sized-bins",
        column=1,
        num_bins=4,
        output_filename_pattern="bin%i",
    )

    (options, args) = E.start(parser)
    options.column -= 1

    if args:
        if args[0] == "-":
            infile = sys.stdin
        else:
            infile = IOTools.open_file(args[0], "r")
    else:
        infile = sys.stdin

    fields, data = CSV.readTable(infile)

    c = options.column
    values = [float(x[c]) for x in data]

    bins = []
    if options.method == "equal-sized-bins":
        increment = int(math.floor(float(len(values)) / options.num_bins))
        indices = list(range(0, len(values)))
        indices.sort(key=lambda x: values[x])
        # replace each value by its rank
        for x in range(len(values)):
            values[indices[x]] = x
        bins = list(range(0, len(values) - increment, increment))

    elif options.method == "pass":
        pass

    E.debug("bins=%s" % str(bins))

    outputters = []
    for x in range(0, len(bins)):
        outputters.append(
            Outputter(options.output_filename_pattern % x, fields))

    # output tables
    for x in range(0, len(data)):
        bin = bisect.bisect(bins, values[x]) - 1
        outputters[bin].write(data[x])

    # stats
    if options.loglevel >= 1:
        options.stdlog.write("# bin\tstart\tcounts\tfilename\n")
        for x in range(0, len(bins)):
            options.stdlog.write("# %i\t%f\t%i\t%s\n" % (
                x, bins[x],
                outputters[x].mCounts,
                outputters[x].mFilename))

    E.info("ninput=%i, noutput=%i" %
           (len(data), sum((x.mCounts for x in outputters))))

    E.stop()
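# Standalone sketch of the equal-sized binning above: values are rank
# transformed, bin boundaries are placed every `increment` ranks, and
# bisect assigns each rank to a bin. The toy data are assumptions.
def _example_equal_sized_bins():
    import bisect
    import math
    values = [5.0, 1.0, 9.0, 3.0, 7.0, 2.0, 8.0]
    num_bins = 3
    increment = int(math.floor(float(len(values)) / num_bins))
    indices = sorted(range(len(values)), key=lambda i: values[i])
    ranks = [0] * len(values)
    for rank, i in enumerate(indices):
        ranks[i] = rank
    bins = list(range(0, len(values) - increment, increment))  # [0, 2, 4]
    for v, r in zip(values, ranks):
        print(v, "-> bin", bisect.bisect(bins, r) - 1)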
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_cut.py 2782 2009-09-10 "
        "11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-r", "--remove", dest="remove", action="store_true",
                      help="remove specified columns, keep all others.")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="output rows are unique.")

    parser.add_option("-l", "--large", dest="large", action="store_true",
                      help="large columns. Do not use native python CSV "
                      "module [default=%default].")

    parser.add_option("-f", "--filename-fields", dest="filename_fields",
                      type="string",
                      help="filename with field information.")

    parser.set_defaults(
        remove=False,
        unique=False,
        filename_fields=None,
    )

    (options, args) = E.start(parser, add_csv_options=True, quiet=True)

    statement = " ".join(args)

    if options.large:
        reader = CSV.DictReaderLarge(CSV.CommentStripper(sys.stdin),
                                     dialect=options.csv_dialect)
    else:
        reader = csv.DictReader(CSV.CommentStripper(sys.stdin),
                                dialect=options.csv_dialect)

    # build the row filter from the command line statement
    exec("f = lambda r: %s" % statement, globals())

    counter = E.Counter()
    writer = csv.DictWriter(options.stdout,
                            reader.fieldnames,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator)

    writer.writerow(dict((fn, fn) for fn in reader.fieldnames))

    while 1:
        counter.input += 1
        try:
            row = next(reader)
        except _csv.Error as msg:
            options.stderr.write("# error while parsing: %s\n" % (msg))
            counter.errors += 1
            continue
        except StopIteration:
            break

        if not row:
            break

        if f(row):
            writer.writerow(row)
            counter.output += 1
        else:
            counter.filtered += 1

    E.info("%s" % counter)

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_set.py 2782 2009-09-10 "
        "11:40:29Z andreas $")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="output rows are unique.")

    parser.add_option("-1", "--join-fields1", dest="join_fields1",
                      type="string",
                      help="join fields in first table.")

    parser.add_option("-2", "--join-fields2", dest="join_fields2",
                      type="string",
                      help="join fields in second table.")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      help="set operation to perform.",
                      choices=("intersection", "rest", "union"))

    parser.set_defaults(
        remove=False,
        unique=False,
        join_fields1=None,
        join_fields2=None,
        method="intersection",
    )

    (options, args) = E.start(parser, add_csv_options=True)

    if len(args) != 2:
        raise ValueError("please specify two files to join")

    if not options.join_fields1 or not options.join_fields2:
        raise ValueError("please specify at least one join field per table")

    options.join_fields1 = options.join_fields1.split(",")
    options.join_fields2 = options.join_fields2.split(",")

    options.filename1, options.filename2 = args

    fields1, table1 = CSV.readTable(open(options.filename1, "r"))
    fields2, table2 = CSV.readTable(open(options.filename2, "r"))

    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    nfields1 = []
    for x in range(len(fields1)):
        if fields1[x] in options.join_fields1:
            nfields1.append(x)
    nfields2 = []
    for x in range(len(fields2)):
        if fields2[x] in options.join_fields2:
            nfields2.append(x)

    # calculate row indices: double keys are not taken care of here
    keys = {}
    for row1 in table1:
        v = [row1[x] for x in nfields1]
        key = hashlib.md5("".join(v).encode("utf-8")).digest()
        keys[key] = row1

    if options.method == "intersection":
        # build new field list
        take = list(range(len(fields1)))
        c = len(take)
        for x in fields2:
            if x not in options.join_fields2:
                take.append(c)
            c += 1

        t = fields1 + fields2

        new_fields = [t[x] for x in take]
        print("\t".join(new_fields))

        for row2 in table2:
            v = [row2[x] for x in nfields2]
            key = hashlib.md5("".join(v).encode("utf-8")).digest()
            if key in keys:
                new_row = keys[key] + row2
                outfile.write(
                    "\t".join([new_row[x] for x in take]) + "\n")

    elif options.method == "rest":
        new_fields = fields2
        print("\t".join(new_fields))

        for row2 in table2:
            v = [row2[x] for x in nfields2]
            key = hashlib.md5("".join(v).encode("utf-8")).digest()
            if key not in keys:
                outfile.write("\t".join(row2) + "\n")

    E.stop()
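# Standalone sketch of the md5 key join above: rows are keyed by the
# concatenated join-field values and matched via a dictionary. Toy tables
# are assumptions; note that concatenation without a delimiter can collide
# ("ab"+"c" == "a"+"bc").
def _example_md5_join():
    import hashlib

    def make_key(values):
        return hashlib.md5("".join(values).encode("utf-8")).digest()

    table1 = [["gene1", "chr1"], ["gene2", "chr2"]]
    table2 = [["gene2", "100"], ["gene3", "200"]]
    keys = {make_key(row[:1]): row for row in table1}
    for row in table2:
        if make_key(row[:1]) in keys:
            # ['gene2', 'chr2', 'gene2', '100']
            print(keys[make_key(row[:1])] + row)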
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=("transpose", "normalize-by-max", "normalize-by-value",
                 "multiply-by-value", "percentile", "remove-header",
                 "normalize-by-table", "upper-bound", "lower-bound",
                 "kullback-leibler", "rank", "expand", "compress", "fdr",
                 "grep"),
        help="""actions to perform on table.""")

    parser.add_option("-s", "--scale", dest="scale", type="float",
                      help="factor to scale matrix by.")

    parser.add_option("-f", "--format", dest="format", type="string",
                      help="output number format [default=%default].")

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="parameters for various functions.")

    parser.add_option("-t", "--header-names", dest="has_headers",
                      action="store_true",
                      help="matrix has row/column headers.")

    parser.add_option("--transpose", dest="transpose", action="store_true",
                      help="transpose table.")

    parser.add_option(
        "--set-transpose-field", dest="set_transpose_field", type="string",
        help="set first field (row 1 and col 1) to this value [%default].")

    parser.add_option("--transpose-format", dest="transpose_format",
                      type="choice",
                      choices=("default", "separated", ),
                      help="input format of un-transposed table.")

    parser.add_option(
        "--expand", dest="expand_table", action="store_true",
        help="expand table - multi-value cells will be expanded over "
        "several rows.")

    parser.add_option("--no-headers", dest="has_headers",
                      action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("--columns", dest="columns", type="string",
                      help="columns to use.")

    parser.add_option("--file", dest="file", type="string",
                      help="columns to test from table.",
                      metavar="FILE")

    parser.add_option("-d", "--delimiter", dest="delimiter", type="string",
                      help="delimiter of columns.",
                      metavar="DELIM")

    parser.add_option("-V", "--invert-match", dest="invert_match",
                      action="store_true",
                      help="invert match.")

    parser.add_option("--sort-by-rows", dest="sort_rows", type="string",
                      help="output order for rows.")

    parser.add_option("-a", "--value", dest="value", type="float",
                      help="value to use for various algorithms.")

    parser.add_option(
        "--group", dest="group_column", type="int",
        help="group values by column. Supply an integer column "
        "[default=%default]")

    parser.add_option("--group-function", dest="group_function",
                      type="choice",
                      choices=("min", "max", "sum", "mean", "stats", "cat",
                               "uniq"),
                      help="function to group values by.")

    parser.add_option("--join-table", dest="join_column", type="int",
                      help="join rows in a table by columns.")

    parser.add_option(
        "--collapse-table", dest="collapse_table", type="string",
        help="collapse a table. Value determines the missing variable "
        "[%default].")

    parser.add_option("--join-column-name", dest="join_column_name",
                      type="int",
                      help="use this column as a prefix.")

    parser.add_option("--flatten-table", dest="flatten_table",
                      action="store_true",
                      help="flatten a table [%default].")

    parser.add_option("--as-column", dest="as_column", action="store_true",
                      help="output table as a single column.")

    parser.add_option("--split-fields", dest="split_fields",
                      action="store_true",
                      help="split fields.")

    parser.add_option(
        "--separator", dest="separator", type="string",
        help="separator for multi-valued fields [default=%default].")

    parser.add_option(
        "--fdr-method", dest="fdr_method", type="choice",
        choices=("BH", "bonferroni", "holm", "hommel", "hochberg", "BY"),
        help="method to perform multiple testing correction by controlling "
        "the fdr [default=%default].")

    parser.add_option(
        "--fdr-add-column", dest="fdr_add_column", type="string",
        help="add new column instead of replacing existing columns. "
        "The value of the option will be used as prefix if there are "
        "multiple columns [%default]")

    # IMS: add option to use a column as the row id in flatten
    parser.add_option(
        "--id-column", dest="id_column", type="string",
        help="list of column(s) to use as the row id when flattening "
        "the table. If None, then row number is used. [default=%default].")

    parser.add_option(
        "--variable-name", dest="variable_name", type="string",
        help="the column header for the 'variable' column when flattening "
        "[default=%default].")

    parser.add_option(
        "--value-name", dest="value_name", type="string",
        help="the column header for the 'value' column when flattening "
        "[default=%default].")

    parser.set_defaults(
        methods=[],
        scale=1.0,
        has_headers=True,
        format=None,
        value=0.0,
        parameters="",
        columns="all",
        transpose=False,
        set_transpose_field=None,
        transpose_format="default",
        group=False,
        group_column=0,
        group_function="mean",
        missing_value="na",
        sort_rows=None,
        flatten_table=False,
        collapse_table=None,
        separator=";",
        expand_table=False,
        join_column=None,
        join_column_name=None,
        compute_fdr=None,
        as_column=False,
        fdr_method="BH",
        fdr_add_column=None,
        id_column=None,
        variable_name="column",
        value_name="value",
        file=None,
        delimiter="\t",
        invert_match=False,
    )

    (options, args) = E.start(parser, add_pipe_options=True)

    options.parameters = options.parameters.split(",")

    if options.group_column:
        options.group = True
        options.group_column -= 1

    ######################################################################
    # if only to remove header, do this quickly
    if options.methods == ["remove-header"]:

        first = True
        for line in options.stdin:
            if line[0] == "#":
                continue
            if first:
                first = False
                continue
            options.stdout.write(line)

    elif options.transpose or "transpose" in options.methods:
        readAndTransposeTable(options.stdin, options)

    elif options.flatten_table:
        # IMS: bug fixed to make work. Also added options for keying on a
        # particular column and adding custom column headings

        fields, table = CSV.readTable(
            options.stdin, with_header=options.has_headers, as_rows=True)

        options.columns = getColumns(fields, options.columns)

        if options.id_column:
            id_columns = [int(x) - 1 for x in options.id_column.split(",")]
            id_header = "\t".join(
                [fields[id_column] for id_column in id_columns])
            options.columns = [
                x for x in options.columns if x not in id_columns]
        else:
            id_header = "row"

        options.stdout.write(
            "%s\t%s\t%s\n" %
            (id_header, options.variable_name, options.value_name))

        for x, row in enumerate(table):

            if options.id_column:
                row_id = "\t".join(
                    [row[int(x) - 1]
                     for x in options.id_column.split(",")])
            else:
                row_id = str(x)

            for y in options.columns:
                options.stdout.write(
                    "%s\t%s\t%s\n" % (row_id, fields[y], row[y]))

    elif options.as_column:

        fields, table = CSV.readTable(
            options.stdin, with_header=options.has_headers, as_rows=True)
        options.columns = getColumns(fields, options.columns)
        table = list(zip(*table))

        options.stdout.write("value\n")

        for column in options.columns:
            options.stdout.write("\n".join(table[column]) + "\n")

    elif options.split_fields:

        # split comma separated fields
        fields, table = CSV.readTable(
            options.stdin, with_header=options.has_headers, as_rows=True)

        options.stdout.write("%s\n" % ("\t".join(fields)))

        for row in table:
            row = [x.split(options.separator) for x in row]
            for d in itertools.product(*row):
                options.stdout.write("%s\n" % "\t".join(d))

    elif options.group:
        readAndGroupTable(options.stdin, options)

    elif options.join_column:
        readAndJoinTable(options.stdin, options)

    elif options.expand_table:
        readAndExpandTable(options.stdin, options)

    elif options.collapse_table is not None:
        readAndCollapseTable(options.stdin, options, options.collapse_table)

    elif "grep" in options.methods:

        options.columns = [int(x) - 1 for x in options.columns.split(",")]

        patterns = []

        if options.file:
            infile = IOTools.open_file(options.file, "r")
            for line in infile:
                if line[0] == "#":
                    continue
                patterns.append(line[:-1].split(options.delimiter)[0])
        else:
            patterns = args

        for line in options.stdin:

            data = line[:-1].split(options.delimiter)
            found = False

            for c in options.columns:
                if data[c] in patterns:
                    found = True
                    break

            if (not found and options.invert_match) or \
                    (found and not options.invert_match):
                print(line[:-1])
    else:

        ######################################################################
        # Apply remainder of transformations
        fields, table = CSV.readTable(
            options.stdin, with_header=options.has_headers, as_rows=False)
        # convert columns to lists
        table = [list(x) for x in table]

        ncols = len(fields)
        if len(table) == 0:
            raise ValueError("table is empty")

        nrows = len(table[0])

        E.info("processing table with %i rows and %i columns" %
               (nrows, ncols))

        options.columns = getColumns(fields, options.columns)

        # convert all values to float
        for c in options.columns:
            for r in range(nrows):
                try:
                    table[c][r] = float(table[c][r])
                except ValueError:
                    continue

        for method in options.methods:

            if method == "normalize-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = [x / value for x in table[c]]

            elif method == "multiply-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = [x * value for x in table[c]]

            elif method == "normalize-by-max":

                for c in options.columns:
                    m = max(table[c])
                    table[c] = [x / m for x in table[c]]

            elif method == "kullback-leibler":
                options.stdout.write("category1\tcategory2\tkl1\tkl2\tmean\n")
                format = options.format
                if format is None:
                    format = "%f"

                for x in range(0, len(options.columns) - 1):
                    for y in range(x + 1, len(options.columns)):
                        c1 = options.columns[x]
                        c2 = options.columns[y]
                        e1 = 0
                        e2 = 0
                        for z in range(nrows):
                            p = table[c1][z]
                            q = table[c2][z]
                            e1 += p * math.log(p / q)
                            e2 += q * math.log(q / p)

                        options.stdout.write("%s\t%s\t%s\t%s\t%s\n" % (
                            fields[c1], fields[c2],
                            format % e1,
                            format % e2,
                            format % ((e1 + e2) / 2)))
                E.stop()
                sys.exit(0)

            elif method == "rank":

                for c in options.columns:
                    tt = table[c]
                    t = list(zip(tt, list(range(nrows))))
                    t.sort()
                    for i, n in zip([x[1] for x in t], list(range(nrows))):
                        tt[i] = n

            elif method in ("lower-bound", "upper-bound"):

                boundary = float(options.parameters[0])
                del options.parameters[0]
                new_value = float(options.parameters[0])
                del options.parameters[0]

                if method == "upper-bound":
                    for c in options.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] > boundary:
                                table[c][r] = new_value
                else:
                    for c in options.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] < boundary:
                                table[c][r] = new_value

            elif method == "fdr":
                pvalues = []
                for c in options.columns:
                    pvalues.extend(table[c])

                assert max(pvalues) <= 1.0, \
                    "pvalues > 1 in table: max=%s" % str(max(pvalues))
                assert min(pvalues) >= 0, \
                    "pvalue < 0 in table: min=%s" % str(min(pvalues))

                # convert to str to avoid test for float downstream
                qvalues = list(map(
                    str, Stats.adjustPValues(pvalues,
                                             method=options.fdr_method)))

                if options.fdr_add_column is None:
                    x = 0
                    for c in options.columns:
                        table[c] = qvalues[x:x + nrows]
                        x += nrows
                else:
                    # add new column headers
                    if len(options.columns) == 1:
                        fields.append(options.fdr_add_column)
                    else:
                        for co in options.columns:
                            fields.append(options.fdr_add_column + fields[co])

                    x = 0
                    for c in options.columns:
                        # add a new column
                        table.append(qvalues[x:x + nrows])
                        x += nrows
                    ncols += len(options.columns)

            elif method == "normalize-by-table":

                other_table_name = options.parameters[0]
                del options.parameters[0]
                other_fields, other_table = CSV.readTable(
                    IOTools.open_file(other_table_name, "r"),
                    with_header=options.has_headers,
                    as_rows=False)

                # convert all values to float
                for c in options.columns:
                    for r in range(nrows):
                        try:
                            other_table[c][r] = float(other_table[c][r])
                        except ValueError:
                            continue

                # divide by the other matrix where it is non-zero,
                # otherwise set the cell to the missing value
                for c in options.columns:
                    for r in range(nrows):
                        if isinstance(table[c][r], float) and \
                                isinstance(other_table[c][r], float) and \
                                other_table[c][r] != 0:
                            table[c][r] /= other_table[c][r]
                        else:
                            table[c][r] = options.missing_value

        # convert back
        if options.format is not None:
            for c in options.columns:
                for r in range(nrows):
                    if isinstance(table[c][r], float):
                        table[c][r] = options.format % table[c][r]

        options.stdout.write("\t".join(fields) + "\n")
        if options.sort_rows:
            old2new = {}
            for r in range(nrows):
                old2new[table[0][r]] = r
            for x in options.sort_rows.split(","):
                if x not in old2new:
                    continue
                r = old2new[x]
                options.stdout.write(
                    "\t".join(map(str, [table[c][r]
                                        for c in range(ncols)])) + "\n")
        else:
            for r in range(nrows):
                options.stdout.write(
                    "\t".join(map(str, [table[c][r]
                                        for c in range(ncols)])) + "\n")

    E.stop()
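# Stats.adjustPValues above applies R-style p.adjust corrections; this is a
# minimal standalone Benjamini-Hochberg sketch for illustration only, not
# the implementation the script uses. The toy p-values are assumptions.
def _example_bh_adjust():
    pvalues = [0.01, 0.04, 0.03, 0.20]
    n = len(pvalues)
    # walk from the largest p-value down, enforcing monotonicity
    order = sorted(range(n), key=lambda i: pvalues[i], reverse=True)
    qvalues = [0.0] * n
    running_min = 1.0
    for rank, i in zip(range(n, 0, -1), order):
        running_min = min(running_min, pvalues[i] * n / rank)
        qvalues[i] = running_min
    print(qvalues)  # [0.04, 0.0533..., 0.0533..., 0.2]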
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("--extend", dest="extend", type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size", dest="shift", type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size", dest="window_size", type="int",
                      help="window size to be used in the analysis "
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations", type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment",
                               "dmr", "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w", "--bigwig-file", dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment", dest="treatment_files", type="string",
                      action="append",
                      help="BAM files for treatment. At least one is "
                      "required [%default]")

    parser.add_option("--control", dest="control_files", type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input", dest="input_files", type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip", dest="is_medip",
                      action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed "
                      "[default=%default].")

    parser.add_option("--output-rdata", dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name is given by "
                      "--output-filename-pattern [%default].")

    parser.add_option("--rdata-file", dest="input_rdata", type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa", dest="bwa", action="store_true",
                      help="alignment generated with bwa "
                      "[default=%default].")

    parser.add_option("--unique", dest="unique", type="float",
                      help="threshold p-value to determine which read "
                      "pile-ups are the result of PCR overamplification "
                      "[default=%default].")

    parser.add_option("--chroms", dest="chroms", type="string",
                      help="comma delimited list of chromosomes to include "
                      "[default=%default].")

    parser.set_defaults(input_format="bam",
                        ucsc_genome="Hsapiens.UCSC.hg19",
                        genome_file=None,
                        extend=0,
                        shift=0,
                        window_size=300,
                        saturation_iterations=10,
                        toolset=[],
                        bigwig=False,
                        treatment_files=[],
                        control_files=[],
                        input_files=[],
                        output_rdata=False,
                        input_rdata=None,
                        is_medip=True,
                        fdr_threshold=0.1,
                        fdr_method="BH",
                        bwa=False,
                        unique=0.001,
                        chroms=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in CSV.DictReader(options.stdin, dialect="excel-tab"):
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]

            status = "OK"
            try:
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" % (line['chr'],
                                      int(line['start']),
                                      int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        float(line['edgeR.adj.p.value']),
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        ["0", "1"][float(line['edgeR.adj.p.value']) <
                                   options.fdr_threshold],
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" %
                                 (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return

    if len(options.treatment_files) < 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting "
                         "bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    if options.chroms is None:
        chrstring = ""
    else:
        chroms = options.chroms.split(",")
        chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms)

    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.%s' % options.ucsc_genome
    R.library(genome_file)

    window_size = options.window_size
    extend = options.extend
    shift = options.shift
    saturation_iterations = options.saturation_iterations

    uniq = float(options.unique)

    if options.bwa is True:
        BWA = "TRUE"
    else:
        BWA = "FALSE"

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''sr = MEDIPS.saturation(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            window_size=%(window_size)i,
            uniq=%(uniq)s,
            nit = %(saturation_iterations)i,
            paired = %(paired)s,
            bwa = %(BWA)s,
            %(chrstring)s
            nrit = 1)''' % locals())

            R.png(E.get_output_file("%s_saturation.png" % fn))
            R('''MEDIPS.plotSaturation(sr)''')
            R('''dev.off()''')
            R('''write.table(sr$estimation, file ='%s', sep='\t')''' %
              E.get_output_file("%s_saturation_estimation.tsv" % fn))

            outfile = IOTools.open_file(
                E.get_output_file("%s_saturation.tsv" % fn), "w")
            outfile.write("category\tvalues\n")
            outfile.write(
                "estimated_correlation\t%s\n" %
                ",".join(["%f" % x for x in R('''sr$maxEstCor''')]))
            outfile.write(
                "true_correlation\t%s\n" %
                ",".join(["%f" % x for x in R('''sr$maxTruCor''')]))
            outfile.write(
                "nreads\t%s\n" %
                ",".join(["%i" % x for x in R('''sr$numberReads''')]))
            outfile.close()

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''cr = MEDIPS.seqCoverage(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            pattern='CG',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            R.png(E.get_output_file("%s_cpg_coverage_pie.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''')
            R('''dev.off()''')

            R.png(E.get_output_file("%s_cpg_coverage_hist.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "hist", t=15)''')
            R('''dev.off()''')

            # note: this file is large
            R('''write.table(cr$cov.res, file=gzfile('%s','w'),
            sep='\t')''' %
              E.get_output_file("%s_saturation_coveredpos.tsv.gz" % fn))

    if 'enrichment' in options.toolset or do_all:
        E.info("CpG enrichment analysis")
        outfile = IOTools.open_file(
            E.get_output_file("enrichment.tsv.gz"), "w")
        slotnames = (("regions.CG", "regions_CG", "%i"),
                     ("regions.C", "regions_C", "%s"),
                     ("regions.G", "regions_G", "%f"),
                     ("regions.relH", "regions_relH", "%i"),
                     ("regions.GoGe", "regions_GoGe", "%i"),
                     ("genome.CG", "genome_CG", "%s"),
                     ("genome.C", "genome_C", "%s"),
                     ("genome.G", "genome_G", "%i"),
                     ("genome.relH", "genome_relH", "%i"),
                     ("enrichment.score.relH", "enrichment_relH", "%s"),
                     ("enrichment.score.GoGe", "enrichment_GoGe", "%s"))

        outfile.write("\t".join(['sample'] +
                                [x[1] for x in slotnames]) + "\n")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''ce = MEDIPS.CpGenrich(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            outfile.write("%s" % fn)
            for slotname, label, pattern in slotnames:
                value = tuple(R('''ce$%s''' % slotname))
                if len(value) == 0:
                    outfile.write("\t")
                else:
                    outfile.write("\t%s" % (pattern % value[0]))
            outfile.write("\n")
        outfile.close()

    if options.input_rdata:
        E.info("reading R session info from '%s'" % options.input_rdata)
        R('''load('%s')''' % options.input_rdata)

    else:
        if "dmr" in options.toolset or "correlation" in options.toolset \
           or do_all:
            # build four sets
            for x, fn in enumerate(options.treatment_files):
                paired = isPaired(fn)
                E.info("loading '%s'" % fn)
                R('''treatment_R%(x)i = MEDIPS.createSet(
                file='%(fn)s',
                BSgenome='%(genome_file)s',
                shift=%(shift)i,
                extend=%(extend)i,
                window_size=%(window_size)i,
                paired=%(paired)s,
                bwa=%(BWA)s,
                %(chrstring)s
                uniq=%(uniq)s)''' % locals())
            R('''treatment_set = c(%s)''' %
              ",".join(["treatment_R%i" % x
                        for x in range(len(options.treatment_files))]))

            if options.control_files:
                for x, fn in enumerate(options.control_files):
                    paired = isPaired(fn)
                    E.info("loading '%s'" % fn)
                    R('''control_R%(x)i = MEDIPS.createSet(
                    file='%(fn)s',
                    BSgenome='%(genome_file)s',
                    shift=%(shift)i,
                    extend=%(extend)i,
                    window_size=%(window_size)i,
                    paired=%(paired)s,
                    bwa=%(BWA)s,
                    %(chrstring)s
                    uniq=%(uniq)s)''' % locals())
                R('''control_set = c(%s)''' %
                  ",".join(["control_R%i" % x
                            for x in range(len(options.control_files))]))

            # build coupling vector
            R('''CS = MEDIPS.couplingVector(pattern="CG",
            refObj = treatment_set[[1]])''')

            if "correlation" in options.toolset or do_all:
                R('''cor.matrix = MEDIPS.correlation(
                c(treatment_set, control_set))''')

                R('''write.table(cor.matrix,
                file='%s',
                sep="\t")''' % E.get_output_file("correlation"))

            if "dmr" in options.toolset or do_all:
                # Data that does not fit the model causes
                # "Error in 1:max_signal_index : argument of length 0"
                # The advice is to set MeDIP=FALSE
                # See: http://comments.gmane.org/
                # gmane.science.biology.informatics.conductor/52319

                if options.is_medip:
                    medip = "TRUE"
                else:
                    medip = "FALSE"
                fdr_method = options.fdr_method

                E.info("applying test for differential methylation")
                R('''meth = MEDIPS.meth(
                MSet1 = treatment_set,
                MSet2 = control_set,
                CSet = CS,
                ISet1 = NULL,
                ISet2 = NULL,
                p.adj = "%(fdr_method)s",
                diff.method = "edgeR",
                MeDIP = %(medip)s,
                CNV = F,
                minRowSum = 1)''' % locals())

                # Note: several Gb in size
                # Output full methylation data table
                R('''write.table(meth,
                file=gzfile('%s', 'w'),
                sep="\t",
                row.names=F,
                quote=F)''' % E.get_output_file("data.tsv.gz"))

                # save R session
                if options.output_rdata:
                    R('''save.image(file='%s', safe=FALSE)''' %
                      E.get_output_file("session.RData"))

    # DMR analysis - test for windows and output
    if "dmr" in options.toolset:

        E.info("selecting differentially methylated windows")

        # test windows for differential methylation
        fdr_threshold = options.fdr_threshold
        R('''tested = MEDIPS.selectSig(meth,
        adj=T,
        ratio=NULL,
        p.value=%(fdr_threshold)f,
        bg.counts=NULL,
        CNV=F)''' % locals())

        R('''write.table(tested,
        file=gzfile('%s', 'w'),
        sep="\t",
        quote=F)''' % E.get_output_file("significant_windows.gz"))

        # select gain and merge adjacent windows
        try:
            R('''gain = tested[which(tested[, grep("logFC",
            colnames(tested))] > 0),];
            gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''')
            E.info('gain output: %s, merged: %s' %
                   (str(R('''dim(gain)''')),
                    str(R('''dim(gain_merged)'''))))
            R('''of=gzfile('%s', 'w');
            write.table(gain_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=FALSE,
            col.names=FALSE); close(of)''' %
              E.get_output_file("gain.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute gain windows: msg=%s" % msg)

        # select loss and merge adjacent windows
        try:
            R('''loss = tested[which(tested[, grep("logFC",
            colnames(tested))] < 0),];
            loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''')
            E.info('loss output: %s, merged: %s' %
                   (str(R('''dim(loss)''')),
                    str(R('''dim(loss_merged)'''))))

            R('''of=gzfile('%s', 'w');
            write.table(loss_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=F,
            col.names=F); close(of)''' %
              E.get_output_file("loss.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute loss windows: msg=%s" % msg)

    # if "rpm" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rpm.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = T, descr = "rpm")''' % locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # if "rms" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rms.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = F, descr = "rms")''' % locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # write footer and output benchmark information.
    E.stop()
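# Minimal standalone sketch of the rpy2 string-templating pattern used
# above: Python values are interpolated into an R snippet via "%" and
# locals(). Requires rpy2 and R; the variable names are assumptions.
def _example_rpy2_template():
    from rpy2.robjects import r as R
    window_size = 300
    R('''x = seq(1, %(window_size)i)''' % locals())
    mean_x = R('''mean(x)''')[0]
    print(mean_x)  # 150.5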
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_cut.py 2782 2009-09-10 "
        "11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-r", "--remove", dest="remove", action="store_true",
                      help="remove specified columns, keep all others.")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="output rows are unique.")

    parser.add_option("-l", "--large", dest="large", action="store_true",
                      help="large columns. Do not use native python CSV "
                      "module [default=%default].")

    parser.add_option("-f", "--filename-fields", dest="filename_fields",
                      type="string",
                      help="filename with field information.")

    parser.set_defaults(
        remove=False,
        unique=False,
        large=False,
        filename_fields=None,
    )

    (options, args) = E.start(parser, add_csv_options=True, quiet=True)

    input_fields = args

    if options.filename_fields:
        input_fields = [
            x[:-1].split("\t")[0]
            for x in [
                x for x in
                IOTools.open_file(options.filename_fields, "r").readlines()
                if x[0] != "#"]]

    if options.unique:
        outfile = UniqueBuffer(options.stdout)
    else:
        outfile = options.stdout

    while 1:
        line = options.stdin.readline()

        if not line:
            E.stop()
            sys.exit(0)

        if line[0] == "#":
            continue

        first_line = line
        break

    old_fields = first_line[:-1].split("\t")

    fields = []
    for f in input_fields:
        # do pattern search for fields of the form %pattern%
        if f[0] == "%" and f[-1] == "%":
            pattern = re.compile(f[1:-1])
            for o in old_fields:
                if pattern.search(o) and o not in fields:
                    fields.append(o)
        else:
            if f in old_fields:
                fields.append(f)

    if options.remove:
        fields = set(fields)
        fields = [x for x in old_fields if x not in fields]

    if options.large:
        reader = CSV.DictReaderLarge(CSV.CommentStripper(options.stdin),
                                     fieldnames=old_fields,
                                     dialect=options.csv_dialect)
    else:
        reader = csv.DictReader(CSV.CommentStripper(options.stdin),
                                fieldnames=old_fields,
                                dialect=options.csv_dialect)

    writer = csv.DictWriter(outfile,
                            fields,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    print("\t".join(fields))

    first_row = True
    ninput, noutput, nerrors = 0, 0, 0

    while 1:
        ninput += 1
        try:
            row = next(reader)
        except _csv.Error as msg:
            options.stderr.write("# error while parsing: %s\n" % (msg))
            nerrors += 1
            continue
        except StopIteration:
            break

        if not row:
            break

        writer.writerow(row)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nerrors=%i" % (ninput, noutput, nerrors))

    E.stop()
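# Standalone sketch of the %regex% field selection above: arguments wrapped
# in "%" are treated as regular expressions matched against the header,
# everything else as literal field names. Toy headers are assumptions.
def _example_select_fields():
    import re
    old_fields = ["gene_id", "fpkm_rep1", "fpkm_rep2", "status"]
    wanted = ["gene_id", "%fpkm_%"]
    fields = []
    for f in wanted:
        if f[0] == "%" and f[-1] == "%":
            pattern = re.compile(f[1:-1])
            for o in old_fields:
                if pattern.search(o) and o not in fields:
                    fields.append(o)
        elif f in old_fields:
            fields.append(f)
    print(fields)  # ['gene_id', 'fpkm_rep1', 'fpkm_rep2']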