Example #1
def ReadGeneLists(filename_genes, gene_pattern=None):
    """read gene lists from a file in matrix format.

    returns a tuple (list of all genes, dictionary of gene lists)
    """

    if filename_genes == "-":
        infile = sys.stdin
    else:
        infile = IOTools.openFile(filename_genes, "r")

    headers, table = CSV.readTable(infile.readlines(), as_rows=False)

    if filename_genes != "-":
        infile.close()

    all_genes = table[0]

    # if there is only a single column, add a dummy column
    if len(table) == 1:
        table.append([1] * len(table[0]))
        headers.append("foreground")

    E.info("read %i genes from %s" % (len(all_genes), filename_genes))

    if gene_pattern:
        rx = re.compile(gene_pattern)
        all_genes = [rx.search(x).groups()[0] for x in all_genes]

    gene_lists = collections.OrderedDict()
    for header, col in zip(headers[1:], table[1:]):
        gene_lists[header] = set(
            [x for x, y in zip(all_genes, col) if y != "0"])

    return all_genes, gene_lists
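
A usage sketch (the filename, column names and identifiers below are made up): the function expects a tab-separated table whose first column holds gene identifiers and whose remaining columns flag list membership with 0/1 values.

# hypothetical input genes.tsv (tab-separated):
# gene_id   foreground  background
# ENSG01    1           0
# ENSG02    0           1
all_genes, gene_lists = ReadGeneLists("genes.tsv")
print(gene_lists["foreground"])  # {'ENSG01'}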
Example #2
def readAndExpandTable(infile, options):
    '''split fields in a table at a separator.

    If a field in a row contains multiple values, the row is expanded
    into as many rows as needed so that each value gets its own row;
    shorter fields are padded with empty strings.
    '''

    fields, table = CSV.readTable(infile,
                                  with_header=options.has_headers,
                                  as_rows=True)

    options.stdout.write("\t".join(fields) + "\n")

    for row in table:

        data = []
        for x in range(len(fields)):
            data.append(row[x].split(options.separator))

        nrows = max([len(d) for d in data])

        for d in data:
            d += [""] * (nrows - len(d))

        for n in range(nrows):
            options.stdout.write("\t".join([d[n] for d in data]) + "\n")
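
A self-contained sketch of the expansion step above, independent of CSV and options (the row data is made up):

row = ["geneA", "1;2;3", "x;y"]
data = [field.split(";") for field in row]
nrows = max(len(d) for d in data)
data = [d + [""] * (nrows - len(d)) for d in data]
for n in range(nrows):
    print("\t".join(d[n] for d in data))
# geneA   1   x
#         2   y
#         3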
Example #3
def buildSelectStatementfromPed(filter_type, pedfile, template):
    '''Build a select statement from a template and a pedigree file'''
    pedigree = csv.DictReader(
        IOTools.open_file(pedfile),
        delimiter='\t',
        fieldnames=['family', 'sample', 'father', 'mother', 'sex', 'status'])
    affecteds = []
    unaffecteds = []
    parents = []
    select = None
    # loop over pedigree file and establish relationships
    for row in pedigree:
        if row['status'] == '2':
            if filter_type == "denovo":
                father = row['father']
                mother = row['mother']
                proband = row['sample']
            elif filter_type == "dominant" or filter_type == "recessive":
                affecteds += [row['sample']]
            if filter_type == "recessive":
                parents += [row['father'], row['mother']]
        if row['status'] == '1':
            if filter_type == "dominant":
                unaffecteds += [row['sample']]
            elif filter_type == "recessive":
                if row['sample'] not in parents:
                    unaffecteds += [row['sample']]

    # Build select statement from template
    if filter_type == "denovo":
        select = template.replace("father", father)
        select = select.replace("mother", mother)
        select = select.replace("proband", proband)
    elif filter_type == "dominant":
        affecteds_exp = '").getPL().1==0&&vc.getGenotype("'.join(affecteds)
        if len(unaffecteds) == 0:
            unaffecteds_exp = ''
        else:
            unaffecteds_exp = '&&vc.getGenotype("' + \
                ('").isHomRef()&&vc.getGenotype("'.join(unaffecteds)) + \
                '").isHomRef()'
        select = template.replace("affecteds_exp", affecteds_exp)
        select = select.replace("unaffecteds_exp", unaffecteds_exp)
    elif filter_type == "recessive":
        affecteds_exp = '").getPL().2==0&&vc.getGenotype("'.join(affecteds)
        unaffecteds_exp = '").getPL().2!=0&&vc.getGenotype("'.join(unaffecteds)
        if len(parents) == 0:
            parents_exp = ''
        else:
            parents_exp = '&&vc.getGenotype("' + \
                ('").getPL().1==0&&vc.getGenotype("'.join(parents)) + \
                '").getPL().1==0'
        select = template.replace("affecteds_exp", affecteds_exp)
        select = select.replace("unaffecteds_exp", unaffecteds_exp)
        select = select.replace("parents_exp", parents_exp)

    return select
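
A hypothetical call, assuming a tab-separated pedigree file and a GATK JEXL-style template that uses the literal tokens "father", "mother" and "proband" as placeholders (filename and template are made up):

template = ('vc.getGenotype("proband").getPL().1==0'
            '&&vc.getGenotype("father").isHomRef()'
            '&&vc.getGenotype("mother").isHomRef()')
select = buildSelectStatementfromPed("denovo", "family.ped", template)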
Example #4
def readAndJoinTable(infile, options):
    '''reshape a long table into a wide table.

    Rows are keyed on `join_column`; for each distinct value in the
    `join_column_name` column, a block of the remaining columns is
    appended, so that each key ends up on a single output row. Missing
    cells are filled with "na".
    '''

    fields, table = CSV.readTable(infile,
                                  with_header=options.has_headers,
                                  as_rows=True)

    join_column = options.join_column - 1
    join_name = options.join_column_name - 1

    join_rows = list(set([x[join_column] for x in table]))
    join_rows.sort()

    join_names = list(set([x[join_name] for x in table]))
    join_names.sort()

    join_columns = list(
        set(range(len(fields))).difference(set((join_column, join_name))))
    join_columns.sort()

    new_table = []
    map_old2new = {}

    map_name2start = {}
    x = 1
    for name in join_names:
        map_name2start[name] = x
        x += len(join_columns)

    row_width = len(join_columns) * len(join_names)
    for x in join_rows:
        map_old2new[x] = len(map_old2new)
        new_row = [
            x,
        ] + ["na"] * row_width
        new_table.append(new_row)

    for row in table:
        row_index = map_old2new[row[join_column]]
        start = map_name2start[row[join_name]]
        for x in join_columns:
            new_table[row_index][start] = row[x]
            start += 1

    # print new table
    options.stdout.write(fields[join_column])
    for name in join_names:
        for column in join_columns:
            options.stdout.write("\t%s%s%s" %
                                 (name, options.separator, fields[column]))
    options.stdout.write("\n")

    for row in new_table:
        options.stdout.write("\t".join(row) + "\n")
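
A pure-Python sketch of the same reshape on made-up (key, name, value) triples:

table = [["x", "A", "1"], ["x", "B", "2"], ["y", "A", "3"]]
names = sorted(set(row[1] for row in table))
wide = {}
for key, name, value in table:
    wide.setdefault(key, {n: "na" for n in names})[name] = value
for key in sorted(wide):
    print("\t".join([key] + [wide[key][n] for n in names]))
# x  1   2
# y  3   na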
Example #5
def readAndCollapseTable(infile, options, missing_value=""):
    '''collapse a two-column table.

    The input has row names in the first column and values in the
    second; names may repeat in blocks, each block starting with the
    first row name. The output has one row per name and one column per
    block, with `missing_value` filling in names absent from a block.
    '''

    fields, table = CSV.readTable(infile,
                                  with_header=options.has_headers,
                                  as_rows=True)

    if len(fields) != 2:
        raise NotImplementedError("can only work on tables with two columns")

    values = collections.defaultdict(list)

    # the first row name marks the start of each new block
    separator = table[0][0]
    row_names = set([x[0] for x in table])

    row_name, value = table[0]

    values[row_name].append(value)
    added = set([row_name])
    for row_name, value in table[1:]:
        if row_name == separator:
            for r in row_names:
                if r not in added:
                    values[r].append(missing_value)
            added = set()

        values[row_name].append(value)
        added.add(row_name)

    for r in row_names:
        if r not in added:
            values[r].append(missing_value)

    sizes = set([len(x) for x in list(values.values())])
    assert len(sizes) == 1, "unequal number of row_names"
    size = list(sizes)[0]

    options.stdout.write("row\t%s\n" %
                         ("\t".join(["column_%i" % x for x in range(size)])))

    for key, row in list(values.items()):
        options.stdout.write("%s\t%s\n" % (key, "\t".join(row)))
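
For illustration, a made-up two-column input and the collapsed output; the second block starts when the first row name, gene1, reappears:

# input (row name, value)        output
# gene1  0.1                     row    column_0  column_1
# gene2  0.2                     gene1  0.1       0.3
# gene1  0.3                     gene2  0.2       <missing_value>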
Example #6
def getGODescriptions(infile):
    '''build dictionary mapping GOids to types and descriptions.

    Arguments
    ---------
    infile : string
        Filename of table with GO assignments

    Returns
    -------
    mapping : dict
        Dictionary mapping GOid to GOtype and GOdescription.
    '''

    with IOTools.open_file(infile) as inf:
        fields, table = CSV.readTable(inf, as_rows=False)

    return {go_id: (go_type, description)
            for go_type, go_id, description in zip(
                table[fields.index("go_type")],
                table[fields.index("go_id")],
                table[fields.index("description")])}
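
Hypothetical usage, assuming a tab-separated file with columns go_type, go_id and description (the filename and GO id are made up):

map_go = getGODescriptions("go_assignments.tsv")
go_type, description = map_go["GO:0008150"]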
Example #7
def computeFDR(infile, options):
    '''compute FDR on a table.
    '''
    # NB: as given, this body duplicates readAndExpandTable above and
    # performs no p-value adjustment; in this codebase the adjustment
    # proper is done via Stats.adjustPValues (see the "fdr" method in
    # the table2table main below).

    fields, table = CSV.readTable(infile,
                                  with_header=options.has_headers,
                                  as_rows=True)

    options.stdout.write("\t".join(fields) + "\n")

    for row in table:

        data = []
        for x in range(len(fields)):
            data.append(row[x].split(options.separator))

        nrows = max([len(d) for d in data])

        for d in data:
            d += [""] * (nrows - len(d))

        for n in range(nrows):
            options.stdout.write("\t".join([d[n] for d in data]) + "\n")
Example #8
def readAndGroupTable(infile, options):
    """read a table from infile and group rows by a column.
    """
    fields, table = CSV.readTable(infile,
                                  with_header=options.has_headers,
                                  as_rows=True)
    options.columns = getColumns(fields, options.columns)
    assert options.group_column not in options.columns

    converter = float
    new_fields = [fields[options.group_column]
                  ] + [fields[x] for x in options.columns]

    if options.group_function == "min":
        f = min
    elif options.group_function == "max":
        f = max
    elif options.group_function == "sum":
        f = sum
    elif options.group_function == "mean":
        f = scipy.mean
    elif options.group_function == "cat":
        f = lambda x: ";".join([y for y in x if y != ""])
        converter = str
    elif options.group_function == "uniq":
        f = lambda x: ";".join([y for y in set(x) if y != ""])
        converter = str
    elif options.group_function == "stats":
        f = lambda x: str(Stats.DistributionalParameters(x))
        # update headers
        new_fields = [fields[options.group_column]]
        for c in options.columns:
            new_fields += list([
                "%s_%s" % (fields[c], x)
                for x in Stats.DistributionalParameters().getHeaders()
            ])

    # convert values to floats (except for the group column) and skip
    # rows with values that cannot be converted
    new_table = []
    for row in table:
        skip = False
        new_row = [row[options.group_column]]

        for c in options.columns:
            if row[c] == options.missing_value:
                new_row.append(row[c])
            else:
                try:
                    new_row.append(converter(row[c]))
                except ValueError:
                    skip = True
                    break
        if not skip:
            new_table.append(new_row)
    table = new_table

    new_rows = CSV.groupTable(table, group_column=0, group_function=f)

    options.stdout.write("\t".join(new_fields) + "\n")
    for row in new_rows:
        options.stdout.write("\t".join(map(str, row)) + "\n")
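
A pure-Python sketch of the grouping step with the "mean" group function, on made-up rows:

import collections
rows = [("a", 1.0), ("a", 3.0), ("b", 2.0)]
groups = collections.defaultdict(list)
for key, value in rows:
    groups[key].append(value)
for key in sorted(groups):
    print(key, sum(groups[key]) / len(groups[key]))
# a 2.0
# b 2.0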
Example #9
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--output-filename-pattern",
                      dest="output_filename_pattern",
                      type="string",
                      help="pattern for additional output files [%default].")

    parser.set_defaults(
        length=1000,
        minimum_coverage=0.90,
        maximum_reads=[1, 10, 20, 50, 100],
        output_filename_pattern="%s",
        normalize=True,
    )

    (options, args) = E.start(parser, add_csv_options=True)

    fields, table = CSV.readTable(sys.stdin, dictreader=CSV.DictReaderLarge)

    map_fields2column = {}
    for x in fields:
        map_fields2column[x] = len(map_fields2column)

    coverage_5prime = numpy.zeros(options.length, numpy.float64)
    coverage_3prime = numpy.zeros(options.length, numpy.float64)

    coverage_maxreads5prime = numpy.zeros(options.length, numpy.float64)
    coverage_maxreads3prime = numpy.zeros(options.length, numpy.float64)

    coverage_full5prime = numpy.zeros(options.length, numpy.float64)
    coverage_full3prime = numpy.zeros(options.length, numpy.float64)

    coverage_min5prime = numpy.zeros(options.length, numpy.float64)
    coverage_min3prime = numpy.zeros(options.length, numpy.float64)

    histograms = []
    for x in range(len(options.maximum_reads)):
        histograms.append([
            numpy.zeros(options.length, numpy.float64),
            numpy.zeros(options.length, numpy.float64), 0
        ])

    ninput, noutput, nfull, nmincov, nskipped, nlength, nmaxreads = 0, 0, 0, 0, 0, 0, 0
    for row in table:
        length, covered, meancov, data, nreads = (int(row["cov_nval"]),
                                                  float(row["cov_covered"]),
                                                  float(row["cov_mean"]),
                                                  row["cov_values"],
                                                  int(row["nover2"]))
        ninput += 1
        if length < options.length:
            nlength += 1
            continue

        if data == "na":
            nskipped += 1
            continue

        noutput += 1
        mincov = covered / length
        values = list(map(float, data.split(";")))
        m = max(values)
        values = [x / m for x in values]
        coverage_5prime += values[:options.length]
        coverage_3prime += values[-options.length:]

        if mincov >= 1.0:
            coverage_full5prime += values[:options.length]
            coverage_full3prime += values[-options.length:]
            nfull += 1

        if meancov >= options.minimum_coverage:
            coverage_min5prime += values[:options.length]
            coverage_min3prime += values[-options.length:]
            nmincov += 1

        for maxreads in range(len(options.maximum_reads)):
            if nreads <= options.maximum_reads[maxreads]:
                histograms[maxreads][0] += values[:options.length]
                histograms[maxreads][1] += values[-options.length:]
                histograms[maxreads][2] += 1

    if options.normalize:
        for x5, x3 in ((coverage_5prime, coverage_3prime),
                       (coverage_min5prime, coverage_min3prime),
                       (coverage_full5prime, coverage_full3prime)):
            m = max((max(x5), max(x3)))
            x3 /= m
            x5 /= m

        for x5, x3, c in histograms:
            m = max((max(x5), max(x3)))
            x5 /= m
            x3 /= m

    outfile = options.stdout
    outfile.write("\t".join(("distance", "minlen-5'", "minlen-3'", "mincov-5'",
                             "mincov-3'", "full-5'", "full-3'")) + "\n")

    for x in range(0, options.length):
        outfile.write("\t".join(["%i" % x] + [
            "%6.4f" % v
            for v in (coverage_5prime[x], coverage_3prime[x],
                      coverage_min5prime[x], coverage_min3prime[x],
                      coverage_full5prime[x], coverage_full3prime[x])
        ]) + "\n")

    outfile5 = IOTools.open_file(options.output_filename_pattern % "reads5",
                                 "w")
    outfile3 = IOTools.open_file(options.output_filename_pattern % "reads3",
                                 "w")

    outfile5.write("\t".join([
        "distance",
    ] + [
        "reads%i" % options.maximum_reads[y]
        for y in range(len(options.maximum_reads))
    ]) + "\n")
    outfile3.write("\t".join([
        "distance",
    ] + [
        "reads%i" % options.maximum_reads[y]
        for y in range(len(options.maximum_reads))
    ]) + "\n")
    for x in range(0, options.length):
        outfile5.write("%i\t%s\n" % (x, "\t".join([
            "%6.4f" % histograms[y][0][x]
            for y in range(len(options.maximum_reads))
        ])))
        outfile3.write("%i\t%s\n" % (x, "\t".join([
            "%6.4f" % histograms[y][1][x]
            for y in range(len(options.maximum_reads))
        ])))

    E.info(
        "ninput=%i, noutput=%i, nmaxreads=%i, nfull=%i, nmincov=%i, nskipped=%i, nlength=%i"
        % (ninput, noutput, nmaxreads, nfull, nmincov, nskipped, nlength))

    E.stop()
Example #10
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_intersection.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="output rows are unique.")

    parser.set_defaults(
        remove=False,
        unique=False,
    )

    (options, args) = E.start(parser, add_csv_options=True)

    if len(args) != 2:
        raise ValueError("please specify two files to join")

    options.filename1, options.filename2 = args

    fields1, table1 = CSV.readTable(IOTools.open_file(options.filename1, "r"))
    fields2, table2 = CSV.readTable(IOTools.open_file(options.filename2, "r"))

    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    # build new field list: fields shared by both tables first, then
    # the fields found only in table1 (this intersection logic is a
    # reconstruction; the original code referenced undefined names)
    shared_fields = [x for x in fields1 if x in fields2]
    new_fields = shared_fields + \
        [x for x in fields1 if x not in shared_fields]

    outfile.write("\t".join(new_fields) + "\n")

    # index rows of table2 by their values in the shared fields
    indices2 = [fields2.index(x) for x in shared_fields]
    keys = set()
    for row2 in table2:
        keys.add(tuple([row2[i] for i in indices2]))

    # output rows of table1 whose key also occurs in table2
    indices1 = [fields1.index(x) for x in shared_fields]
    take = indices1 + [i for i in range(len(fields1))
                       if i not in indices1]
    for row1 in table1:
        if tuple([row1[i] for i in indices1]) in keys:
            outfile.write("\t".join([row1[i] for i in take]) + "\n")

    E.stop()
Example #11
def run(infile, options, report_step=10000):
    '''read a table from infile and load it into a database table.'''

    options.tablename = quoteTableName(options.tablename,
                                       backend=options.backend)

    if options.map:
        m = {}
        for x in options.map:
            f, t = x.split(":")
            m[f] = t
        options.map = m
    else:
        options.map = {}

    existing_tables = set()

    quick_import_separator = "\t"

    if options.database_backend == "postgres":
        import psycopg2
        raise NotImplementedError("needs refactoring for commandline options")
        dbhandle = psycopg2.connect(options.psql_connection)
        error = psycopg2.Error
        options.null = "NULL"
        options.string_value = "'%s'"
        options.text = "TEXT"
        options.index = "TEXT"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    elif options.database_backend == "mysql":
        import MySQLdb
        dbhandle = MySQLdb.connect(host=options.database_host,
                                   user=options.database_username,
                                   passwd=options.database_password,
                                   port=options.database_port,
                                   db=options.database_name)
        error = Exception
        options.null = "NULL"
        options.string_value = "%s"
        options.text = "TEXT"
        options.index = "VARCHAR(40)"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    elif options.backend == "sqlite":
        import sqlite3
        dbhandle = sqlite3.connect(options.database_name)
        try:
            os.chmod(options.database_name, 0o664)
        except OSError as msg:
            E.warn("could not change permissions of database: %s" % msg)

        # Avoid the following error:
        # sqlite3.ProgrammingError: You must not use 8-bit bytestrings
        # unless you use a text_factory that can interpret 8-bit
        # bytestrings (like text_factory = str). It is highly
        # recommended that you instead just switch your application
        # to Unicode strings
        # Note: might be better to make csv2db unicode aware.
        dbhandle.text_factory = str

        error = sqlite3.OperationalError
        options.insert_many = True  # False
        options.null = None  # "NULL"
        options.text = "TEXT"
        options.index = "TEXT"
        options.string_value = "%s"  # "'%s'"

        statement = "SELECT name FROM sqlite_master WHERE type='table'"
        cc = executewait(dbhandle, statement, error, options.retry)
        existing_tables = set([x[0] for x in cc])
        cc.close()

        # use | as separator for the quick import
        quick_import_statement = \
            "sqlite3 %s '.import %%s %s'" % \
            (options.database_name, options.tablename)

        quick_import_separator = "|"

    if options.header is not None:
        options.header = [x.strip() for x in options.header.split(",")]

    if options.utf:
        reader = CSV.UnicodeDictReader(infile,
                                       dialect=options.dialect,
                                       fieldnames=options.header)
    else:
        reader = csv.DictReader(CSV.CommentStripper(infile),
                                dialect=options.dialect,
                                fieldnames=options.header)

    if options.replace_header:
        try:
            next(reader)
        except StopIteration:
            pass

    E.info("reading %i rows to guess column types" % options.guess_size)

    rows = []
    for row in reader:
        if None in row:
            raise ValueError("undefined columns in input file at row: %s" %
                             row)

        try:
            rows.append(IOTools.convertDictionary(row, map=options.map))
        except TypeError as msg:
            E.warn("incomplete line? Type error in conversion: "
                   "'%s' with data: %s" % (msg, str(row)))
        except ValueError as msg:
            E.warn("incomplete line? Value error in conversion: "
                   "'%s' with data: %s" % (msg, str(row)))

        if len(rows) >= options.guess_size:
            break

    E.info("read %i rows for type guessing" % len(rows))
    E.info("creating table")

    if len(rows) == 0:
        if options.allow_empty:
            if not reader.fieldnames:
                E.warn("no data - no table created")
            else:
                # create empty table and exit
                take, map_column2type, ignored = createTable(
                    dbhandle,
                    error,
                    options.tablename,
                    options,
                    retry=options.retry,
                    headers=reader.fieldnames,
                    ignore_empty=options.ignore_empty,
                    ignore_columns=options.ignore_columns,
                    rename_columns=options.rename_columns,
                    lowercase=options.lowercase,
                    ignore_duplicates=options.ignore_duplicates,
                    indices=options.indices,
                    first_column=options.first_column,
                    existing_tables=existing_tables,
                    append=options.append)
                E.info("empty table created")
            return
        else:
            raise ValueError("empty table")
    else:
        take, map_column2type, ignored = createTable(
            dbhandle,
            error,
            options.tablename,
            options,
            rows=rows,
            retry=options.retry,
            headers=reader.fieldnames,
            ignore_empty=options.ignore_empty,
            ignore_columns=options.ignore_columns,
            rename_columns=options.rename_columns,
            lowercase=options.lowercase,
            ignore_duplicates=options.ignore_duplicates,
            indices=options.indices,
            first_column=options.first_column,
            existing_tables=existing_tables,
            append=options.append)

    def row_iter(rows, reader):
        for row in rows:
            yield quoteRow(row,
                           take,
                           map_column2type,
                           options.missing_values,
                           null=options.null,
                           string_value=options.string_value)
        for data in reader:
            yield quoteRow(IOTools.convertDictionary(data, map=options.map),
                           take,
                           map_column2type,
                           options.missing_values,
                           null=options.null,
                           string_value=options.string_value)

    ninput = 0

    E.info("inserting data")

    if options.insert_quick:
        E.info("using quick insert")

        outfile, filename = tempfile.mkstemp()

        E.debug("dumping data into %s" % filename)

        for d in row_iter(rows, reader):

            ninput += 1
            os.write(
                outfile,
                (quick_import_separator.join([str(d[x]) for x in take]) +
                 "\n").encode("utf-8"))

            if ninput % report_step == 0:
                E.info("iteration %i\n" % ninput)

        os.close(outfile)

        statement = quick_import_statement % filename
        E.debug(statement)

        # infinite loop possible
        while 1:

            retcode = E.run(statement, cwd=os.getcwd(), close_fds=True)

            if retcode != 0:
                E.warn("import error using statement: %s" % statement)

                if not options.retry:
                    raise ValueError("import error using statement: %s" %
                                     statement)

                time.sleep(5)
                continue

            break

        os.remove(filename)

        # sqlite's .import cannot load NULL values directly; as a
        # workaround, update all columns and replace the string 'None'
        # with NULL.
        for column in take:
            executewait(
                dbhandle, "UPDATE %s SET %s = NULL WHERE %s = 'None'" %
                (options.tablename, column, column), error, options.retry)

    elif options.insert_many:
        data = []
        for d in row_iter(rows, reader):
            ninput += 1

            data.append([d[x] for x in take])

            if ninput % report_step == 0:
                E.info("iteration %i" % ninput)

        statement = "INSERT INTO %s VALUES (%s)" % (options.tablename,
                                                    ",".join("?" * len(take)))

        E.info("inserting %i rows" % len(data))
        E.debug("multiple insert:\n# %s" % statement)

        while 1:
            try:
                dbhandle.executemany(statement, data)
            except error as msg:
                E.warn("import failed: msg=%s, statement=\n  %s" %
                       (msg, statement))
                # TODO: check for database locked msg
                if not options.retry:
                    raise error(msg)
                if not re.search("locked", str(msg)):
                    raise error(msg)
                time.sleep(5)
                continue
            break

    else:
        # insert line by line (could not figure out how to do bulk loading with
        # subprocess and COPY FROM STDIN)
        statement = "INSERT INTO %s VALUES (%%(%s)s)" % (options.tablename,
                                                         ')s, %('.join(take))
        # output data used for guessing:
        for d in row_iter(rows, reader):

            ninput += 1
            E.debug("single insert:\n# %s" % (statement % d))
            cc = executewait(dbhandle,
                             statement,
                             error,
                             retry=options.retry,
                             args=d)
            cc.close()

            if ninput % report_step == 0:
                E.info("iteration %i" % ninput)

    E.info("building indices")
    nindex = 0
    for index in options.indices:

        nindex += 1
        try:
            statement = "CREATE INDEX %s_index%i ON %s (%s)" % (
                options.tablename, nindex, options.tablename, index)
            cc = executewait(dbhandle, statement, error, options.retry)
            cc.close()
            E.info("added index on column %s" % (index))
        except error as msg:
            E.info("adding index on column %s failed: %s" % (index, msg))

    statement = "SELECT COUNT(*) FROM %s" % (options.tablename)
    cc = executewait(dbhandle, statement, error, options.retry)
    result = cc.fetchone()
    cc.close()

    noutput = result[0]

    E.info("ninput=%i, noutput=%i, nskipped_columns=%i" %
           (ninput, noutput, len(ignored)))

    dbhandle.commit()
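
A minimal self-contained sketch of the "insert_many" path above using sqlite3 (the table name and data are made up):

import sqlite3
dbhandle = sqlite3.connect(":memory:")
dbhandle.execute("CREATE TABLE t (a INTEGER, b TEXT)")
data = [(1, "x"), (2, "y")]
statement = "INSERT INTO t VALUES (%s)" % ",".join("?" * 2)
dbhandle.executemany(statement, data)
dbhandle.commit()
print(dbhandle.execute("SELECT COUNT(*) FROM t").fetchone()[0])  # 2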
Example #12
def createTable(dbhandle,
                error,
                tablename,
                options,
                retry=True,
                ignore_empty=True,
                ignore_columns=[],
                rename_columns=[],
                lowercase=False,
                ignore_duplicates=True,
                indices=[],
                rows=None,
                headers=None,
                first_column=None,
                existing_tables=set(),
                append=False):

    # create the table; guess column types from the data if rows are
    # supplied.
    if rows:
        map_column2type, ignored, max_values = CSV.getMapColumn2Type(
            rows, ignore_empty=ignore_empty, get_max_values=True)
        if ignored:
            E.info("ignored columns: %s" % str(ignored))

        headers = list(map_column2type.keys())
        headers.sort()

    elif headers:
        map_column2type = dict(list(zip(headers, [
            None,
        ] * len(headers))))
        ignored = []

    columns_to_ignore = set([x.lower() for x in ignore_columns])
    columns_to_rename = dict([x.lower().split(":") for x in rename_columns])

    take = []
    # associate headers to field names
    columns = []
    present = {}
    for header_index, h in enumerate(headers):
        hh = h
        if lowercase:
            hh = h.lower()

        if hh in columns_to_ignore:
            continue

        if hh in present:
            if ignore_duplicates:
                continue
            else:
                raise ValueError("duplicate column %s" % hh)

        present[hh] = 1
        take.append(h)
        if map_column2type[h] == int:
            max_value = max_values[h]
            if max_value > 2147483647:
                t = "BIGINT DEFAULT '0'"
            elif max_value > 32767:
                t = "INTEGER DEFAULT '0'"
            else:
                t = "SMALLINT DEFAULT '0'"

        elif map_column2type[h] == float:
            t = "FLOAT DEFAULT '0'"
        else:
            if h in options.indices:
                t = options.index
            else:
                t = options.text

        # remove special characters from column names
        if hh == "":
            if first_column is not None and header_index == 0:
                hh = first_column
            else:
                raise ValueError("column '%s' without header " % h)
        hh = columns_to_rename.get(hh, hh)
        hh = re.sub('''['"]''', "", hh)
        hh = re.sub(r"[,;.:\-\+/ ()%?]", "_", hh)
        if hh[0] in "0123456789":
            hh = "_" + hh
        columns.append("%s %s" % (hh, t))

    if not options.append:
        # delete old table if it exists
        while 1:
            try:
                cc = dbhandle.cursor()
                # mysql: removed '' around table name
                statement = "DROP TABLE IF EXISTS %s" % tablename
                E.debug(statement)
                cc.execute(statement)
                dbhandle.commit()
                cc.close()
                E.info("existing table %s deleted" % tablename)
            except sqlite3.OperationalError as msg:
                E.warn(msg)
                time.sleep(5)
                continue
            except error as msg:
                E.warn("could not delete existing table %s: %s" %
                       (tablename, str(msg)))
                dbhandle.rollback()
                if not retry:
                    raise error(msg)
                elif tablename in existing_tables:
                    # table exists, but drop did not work (e.g. database lock)
                    time.sleep(5)
                    continue
                else:
                    # table might not have existed
                    break
            break

        # create new table
        statement = "CREATE TABLE %s ( %s );" % (tablename, ", ".join(columns))

        E.debug("table create:\n# %s" % (statement))

        while 1:
            try:
                cc = dbhandle.cursor()
                cc.execute(statement)
                cc.close()
                dbhandle.commit()
            except error as msg:
                E.warn("table creation failed: msg=%s, statement=\n  %s" %
                       (msg, statement))
                # TODO: check for database locked msg
                if not retry:
                    raise error(msg)
                if not re.search("locked", str(msg)):
                    raise error("%s: %s" % (msg, statement))
                time.sleep(5)
                continue
            break

        E.info("table %s created successfully." % tablename)

    return take, map_column2type, ignored
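
A sketch of the integer-width rule used above: the guessed maximum value selects the SQL integer type, with the same thresholds as in the code.

def integer_type(max_value):
    if max_value > 2147483647:
        return "BIGINT DEFAULT '0'"
    elif max_value > 32767:
        return "INTEGER DEFAULT '0'"
    return "SMALLINT DEFAULT '0'"

print(integer_type(100))          # SMALLINT DEFAULT '0'
print(integer_type(3000000000))   # BIGINT DEFAULT '0'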
Example #13
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: data2bins.py 2782 2009-09-10 11:40:29Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("--column", dest="column", type="int",
                      help="column to split on.")

    parser.add_option("--num-bins", dest="num_bins", type="int",
                      help="number of bins to create.")

    parser.add_option("--method", dest="method", type="choice",
                      choices=("equal-sized-bins",),
                      help="method to use to bin data.")

    parser.add_option("--no-headers", dest="has_headers", action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("-p", "--output-filename-pattern", dest="output_filename_pattern", type="string",
                      help="pattern for the per-bin output filenames [%default].")

    parser.set_defaults(
        has_headers=True,
        method="equal-sized-bins",
        column=1,
        num_bins=4,
        output_filename_pattern="bin%i",
    )

    (options, args) = E.start(parser)
    options.column -= 1

    if args:
        if args[0] == "-":
            infile = sys.stdin
        else:
            infile = IOTools.open_file(args[0], "r")
    else:
        infile = sys.stdin

    fields, data = CSV.readTable(infile)

    c = options.column
    values = [float(x[c]) for x in data]

    bins = []

    if options.method == "equal-sized-bins":
        increment = int(math.floor(float(len(values)) / options.num_bins))
        indices = list(range(0, len(values)))
        indices.sort(key=lambda x: values[x])
        for x in range(len(values)):
            values[indices[x]] = x
        bins = list(range(0, len(values) - increment, increment))

    elif options.method == "pass":
        pass

    E.debug("bins=%s" % str(bins))

    outputters = []
    for x in range(0, len(bins)):
        outputters.append(
            Outputter(options.output_filename_pattern % x, fields))

    # output tables
    for x in range(0, len(data)):
        bin = bisect.bisect(bins, values[x]) - 1
        outputters[bin].write(data[x])

    # stats
    if options.loglevel >= 1:
        options.stdlog.write("# bin\tstart\tcounts\tfilename\n")
        for x in range(0, len(bins)):
            options.stdlog.write("# %i\t%f\t%i\t%s\n" % (
                x, bins[x], outputters[x].mCounts, outputters[x].mFilename))

    E.info("ninput=%i, noutput=%i" %
           (len(data), sum((x.mCounts for x in outputters))))

    E.stop()
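
A sketch of the equal-sized-bins step above: each value is replaced by its rank, and bin boundaries are then placed every `increment` ranks (toy data):

values = [0.3, 0.1, 0.4, 0.2]
indices = sorted(range(len(values)), key=lambda i: values[i])
for rank, i in enumerate(indices):
    values[i] = rank
print(values)  # [2, 0, 3, 1]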
Example #14
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: csv_cut.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-r",
                      "--remove",
                      dest="remove",
                      action="store_true",
                      help="remove specified columns, keep all others.")

    parser.add_option("-u",
                      "--unique",
                      dest="unique",
                      action="store_true",
                      help="output rows are unique.")

    parser.add_option(
        "-l",
        "--large",
        dest="large",
        action="store_true",
        help=
        "large columns. Do not use native python CSV module [default=%default]."
    )

    parser.add_option("-f",
                      "--filename-fields",
                      dest="filename_fields",
                      type="string",
                      help="filename with field information.")

    parser.set_defaults(
        remove=False,
        unique=False,
        filename_fields=None,
    )

    (options, args) = E.start(parser, add_csv_options=True, quiet=True)

    statement = " ".join(args)

    if options.large:
        reader = CSV.DictReaderLarge(CSV.CommentStripper(sys.stdin),
                                     dialect=options.csv_dialect)
    else:
        reader = csv.DictReader(CSV.CommentStripper(sys.stdin),
                                dialect=options.csv_dialect)

    exec("f = lambda r: %s" % statement, globals())
    counter = E.Counter()
    writer = csv.DictWriter(options.stdout,
                            reader.fieldnames,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator)

    writer.writeheader()

    while 1:
        counter.input += 1
        try:
            row = next(reader)
        except _csv.Error as msg:
            options.stderr.write("# error while parsing: %s\n" % (msg))
            counter.errors += 1
            continue
        except StopIteration:
            break

        if not row:
            break

        if f(row):
            writer.writerow(row)
            counter.output += 1
        else:
            counter.filtered += 1

    E.info("%s" % counter)

    E.stop()
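
A sketch of the row filter built via exec above: the positional arguments are joined into a Python expression that is compiled into a per-row predicate (the expression and field name are made up):

statement = "int(r['count']) > 10"
exec("f = lambda r: %s" % statement, globals())
print(f({"count": "42"}))  # True
print(f({"count": "3"}))   # False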
Example #15
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: csv_set.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-u",
                      "--unique",
                      dest="unique",
                      action="store_true",
                      help="output rows are unique.")

    parser.add_option("-1",
                      "--join-fields1",
                      dest="join_fields1",
                      type="string",
                      help="join fields in first table.")
    parser.add_option("-2",
                      "--join-fields2",
                      dest="join_fields2",
                      type="string",
                      help="join fields in second table.")
    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      help="set operation to perform.",
                      choices=("intersection", "rest", "union"))

    parser.set_defaults(
        remove=False,
        unique=False,
        join_fields1=None,
        join_fields2=None,
        method="intersection",
    )

    (options, args) = E.start(parser, add_csv_options=True)

    if len(args) != 2:
        raise ValueError("please specify two files to join")

    if not options.join_fields1 or not options.join_fields2:
        raise ValueError("please specify at least one join field per table")

    options.join_fields1 = options.join_fields1.split(",")
    options.join_fields2 = options.join_fields2.split(",")

    options.filename1, options.filename2 = args

    fields1, table1 = CSV.readTable(open(options.filename1, "r"))
    fields2, table2 = CSV.readTable(open(options.filename2, "r"))

    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    nfields1 = []
    for x in range(len(fields1)):
        if fields1[x] in options.join_fields1:
            nfields1.append(x)
    nfields2 = []
    for x in range(len(fields2)):
        if fields2[x] in options.join_fields2:
            nfields2.append(x)

    # index rows by their key; duplicate keys are not handled here
    # (later rows overwrite earlier ones)
    keys = {}
    for row1 in table1:
        v = [row1[x] for x in nfields1]
        key = hashlib.md5("".join(v).encode("utf-8")).digest()
        keys[key] = row1

    if options.method == "intersection":
        # build new field list
        take = list(range(len(fields1)))
        c = len(take)
        for x in fields2:
            if x not in options.join_fields2:
                take.append(c)
            c += 1

        t = fields1 + fields2

        new_fields = [t[x] for x in take]

        print("\t".join(new_fields))

        for row2 in table2:
            v = [row2[x] for x in nfields2]
            key = hashlib.md5("".join(v).encode("utf-8")).digest()
            if key in keys:
                new_row = keys[key] + row2
                outfile.write("\t".join([new_row[x] for x in take]) + "\n")

    elif options.method == "rest":

        new_fields = fields2
        print("\t".join(new_fields))

        for row2 in table2:
            v = [row2[x] for x in nfields2]
            key = hashlib.md5("".join(v).encode("utf-8")).digest()
            if key not in keys:
                outfile.write("\t".join(row2) + "\n")

    E.stop()
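
A sketch of the row key used above, including the Python 3 encode step (the row values are made up):

import hashlib
row = ["geneA", "chr1"]
key = hashlib.md5("".join(row).encode("utf-8")).digest()
print(len(key))  # 16 bytes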
Example #16
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="choice",
        action="append",
        choices=("transpose", "normalize-by-max", "normalize-by-value",
                 "multiply-by-value", "percentile", "rank", "remove-header",
                 "normalize-by-table", "upper-bound", "lower-bound",
                 "kullback-leibler", "expand", "compress", "fdr", "grep"),
        help="""actions to perform on table.""")

    parser.add_option("-s",
                      "--scale",
                      dest="scale",
                      type="float",
                      help="factor to scale matrix by.")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="string",
                      help="output number format [%default].")

    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="Parameters for various functions.")

    parser.add_option("-t",
                      "--header-names",
                      dest="has_headers",
                      action="store_true",
                      help="matrix has row/column headers.")

    parser.add_option("--transpose",
                      dest="transpose",
                      action="store_true",
                      help="transpose table.")

    parser.add_option(
        "--set-transpose-field",
        dest="set_transpose_field",
        type="string",
        help="set first field (row 1 and col 1) to this value [%default].")

    parser.add_option("--transpose-format",
                      dest="transpose_format",
                      type="choice",
                      choices=(
                          "default",
                          "separated",
                      ),
                      help="input format of un-transposed table")

    parser.add_option(
        "--expand",
        dest="expand_table",
        action="store_true",
        help="expand table - multi-value cells with be expanded over "
        "several rows.")

    parser.add_option("--no-headers",
                      dest="has_headers",
                      action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("--columns",
                      dest="columns",
                      type="string",
                      help="columns to use.")

    parser.add_option("--file",
                      dest="file",
                      type="string",
                      help="columns to test from table.",
                      metavar="FILE")

    parser.add_option("-d",
                      "--delimiter",
                      dest="delimiter",
                      type="string",
                      help="delimiter of columns.",
                      metavar="DELIM")

    parser.add_option("-V",
                      "--invert-match",
                      dest="invert_match",
                      action="store_true",
                      help="invert match.")

    parser.add_option("--sort-by-rows",
                      dest="sort_rows",
                      type="string",
                      help="output order for rows.")

    parser.add_option("-a",
                      "--value",
                      dest="value",
                      type="float",
                      help="value to use for various algorithms.")

    parser.add_option("--group",
                      dest="group_column",
                      type="int",
                      help="group values by column. Supply an integer column "
                      "[default=%default]")

    parser.add_option("--group-function",
                      dest="group_function",
                      type="choice",
                      choices=("min", "max", "sum", "mean", "stats", "cat",
                               "uniq"),
                      help="function to group values by.")

    parser.add_option("--join-table",
                      dest="join_column",
                      type="int",
                      help="join rows in a table by columns.")

    parser.add_option(
        "--collapse-table",
        dest="collapse_table",
        type="string",
        help="collapse a table. Value determines the missing variable "
        "[%default].")

    parser.add_option("--join-column-name",
                      dest="join_column_name",
                      type="int",
                      help="use this column as a prefix.")

    parser.add_option("--flatten-table",
                      dest="flatten_table",
                      action="store_true",
                      help="flatten a table [%default].")

    parser.add_option("--as-column",
                      dest="as_column",
                      action="store_true",
                      help="output table as a single column.")

    parser.add_option("--split-fields",
                      dest="split_fields",
                      action="store_true",
                      help="split fields.")

    parser.add_option(
        "--separator",
        dest="separator",
        type="string",
        help="separator for multi-valued fields [default=%default].")

    parser.add_option(
        "--fdr-method",
        dest="fdr_method",
        type="choice",
        choices=("BH", "bonferroni", "holm", "hommel", "hochberg", "BY"),
        help="method to perform multiple testing correction by controlling "
        "the fdr [default=%default].")

    parser.add_option(
        "--fdr-add-column",
        dest="fdr_add_column",
        type="string",
        help="add new column instead of replacing existing columns. "
        "The value of the option will be used as prefix if there are "
        "multiple columns [%default]")

    # IMS: add option to use a column as the row id in flatten
    parser.add_option(
        "--id-column",
        dest="id_column",
        type="string",
        help="list of column(s) to use as the row id when flattening "
        "the table. If None, then row number is used. [default=%default].")

    parser.add_option(
        "--variable-name",
        dest="variable_name",
        type="string",
        help="the column header for the 'variable' column when flattening "
        "[default=%default].")

    parser.add_option(
        "--value-name",
        dest="value_name",
        type="string",
        help="the column header for the 'value' column when flattening "
        "[default=%default].")

    parser.set_defaults(
        methods=[],
        scale=1.0,
        has_headers=True,
        format=None,
        value=0.0,
        parameters="",
        columns="all",
        transpose=False,
        set_transpose_field=None,
        transpose_format="default",
        group=False,
        group_column=0,
        group_function="mean",
        missing_value="na",
        sort_rows=None,
        flatten_table=False,
        collapse_table=None,
        separator=";",
        expand_table=False,
        join_column=None,
        join_column_name=None,
        compute_fdr=None,
        as_column=False,
        fdr_method="BH",
        fdr_add_column=None,
        id_column=None,
        variable_name="column",
        value_name="value",
        file=None,
        delimiter="\t",
        invert_match=False,
    )

    (options, args) = E.start(parser, add_pipe_options=True)

    options.parameters = options.parameters.split(",")

    if options.group_column:
        options.group = True
        options.group_column -= 1

    ######################################################################
    ######################################################################
    ######################################################################
    # if only to remove header, do this quickly
    if options.methods == ["remove-header"]:

        first = True
        for line in options.stdin:
            if line[0] == "#":
                continue
            if first:
                first = False
                continue
            options.stdout.write(line)

    elif options.transpose or "transpose" in options.methods:

        readAndTransposeTable(options.stdin, options)

    elif options.flatten_table:
        # IMS: bug fixed to make work. Also added options for keying
        # on a particular column and adding custom column headings

        fields, table = CSV.readTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=True)

        options.columns = getColumns(fields, options.columns)

        if options.id_column:
            id_columns = [int(x) - 1 for x in options.id_column.split(",")]
            id_header = "\t".join(
                [fields[id_column] for id_column in id_columns])
            options.columns = [
                x for x in options.columns if x not in id_columns
            ]
        else:
            id_header = "row"

        options.stdout.write(
            "%s\t%s\t%s\n" %
            (id_header, options.variable_name, options.value_name))

        for x, row in enumerate(table):

            if options.id_column:
                row_id = "\t".join(
                    [row[int(x) - 1] for x in options.id_column.split(",")])
            else:
                row_id = str(x)

            for y in options.columns:
                options.stdout.write("%s\t%s\t%s\n" %
                                     (row_id, fields[y], row[y]))

    elif options.as_column:

        fields, table = CSV.readTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=True)
        options.columns = getColumns(fields, options.columns)
        table = list(zip(*table))

        options.stdout.write("value\n")

        for column in options.columns:
            options.stdout.write("\n".join(table[column]) + "\n")

    elif options.split_fields:

        # split comma separated fields
        fields, table = CSV.readTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=True)

        options.stdout.write("%s\n" % ("\t".join(fields)))

        for row in table:
            row = [x.split(options.separator) for x in row]
            for d in itertools.product(*row):
                options.stdout.write("%s\n" % "\t".join(d))

    elif options.group:
        readAndGroupTable(options.stdin, options)

    elif options.join_column:
        readAndJoinTable(options.stdin, options)

    elif options.expand_table:
        readAndExpandTable(options.stdin, options)

    elif options.collapse_table is not None:
        readAndCollapseTable(options.stdin, options, options.collapse_table)

    elif "grep" in options.methods:

        options.columns = [int(x) - 1 for x in options.columns.split(",")]

        patterns = []

        if options.file:
            infile = IOTools.open_file(options.file, "r")
            for line in infile:
                if line[0] == "#":
                    continue
                patterns.append(line[:-1].split(options.delimiter)[0])
        else:
            patterns = args

        for line in options.stdin:

            data = line[:-1].split(options.delimiter)
            found = False

            for c in options.columns:

                if data[c] in patterns:
                    found = True
                    break

            if (not found and options.invert_match) or (
                    found and not options.invert_match):
                print(line[:-1])
    else:

        ######################################################################
        ######################################################################
        ######################################################################
        # Apply remainder of transformations
        fields, table = CSV.readTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=False)
        # convert columns to list
        table = [list(x) for x in table]

        ncols = len(fields)
        if len(table) == 0:
            raise ValueError("table is empty")

        nrows = len(table[0])

        E.info("processing table with %i rows and %i columns" % (nrows, ncols))

        options.columns = getColumns(fields, options.columns)

        # convert all values to float
        for c in options.columns:
            for r in range(nrows):
                try:
                    table[c][r] = float(table[c][r])
                except ValueError:
                    continue

        for method in options.methods:

            if method == "normalize-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = [x / value for x in table[c]]

            elif method == "multiply-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = [x * value for x in table[c]]

            elif method == "normalize-by-max":

                for c in options.columns:
                    m = max(table[c])
                    table[c] = [x / m for x in table[c]]

            elif method == "kullback-leibler":
                options.stdout.write("category1\tcategory2\tkl1\tkl2\tmean\n")
                format = options.format
                if format is None:
                    format = "%f"

                for x in range(0, len(options.columns) - 1):
                    for y in range(x + 1, len(options.columns)):
                        c1 = options.columns[x]
                        c2 = options.columns[y]
                        e1 = 0
                        e2 = 0
                        for z in range(nrows):
                            p = table[c1][z]
                            q = table[c2][z]
                            e1 += p * math.log(p / q)
                            e2 += q * math.log(q / p)

                        options.stdout.write(
                            "%s\t%s\t%s\t%s\t%s\n" %
                            (fields[c1], fields[c2], format % e1, format % e2,
                             format % ((e1 + e2) / 2)))
                E.stop()
                sys.exit(0)
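                # This method writes its own output and exits here rather
                # than falling through to the generic table writer.
                # Note the sums above are D(P||Q) and D(Q||P); they assume
                # each selected column is a strictly positive, normalised
                # probability vector (a zero in either column makes
                # math.log fail). E.g. p=[0.2, 0.8], q=[0.3, 0.7]:
                #   D(P||Q) = 0.2*log(0.2/0.3) + 0.8*log(0.8/0.7) ~= 0.026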

            elif method == "rank":

                for c in options.columns:
                    tt = table[c]
                    t = list(zip(tt, list(range(nrows))))
                    t.sort()
                    for i, n in zip([x[1] for x in t], list(range(nrows))):
                        tt[i] = n
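                # Worked example of the idiom above: for tt = [0.3, 0.1, 0.2],
                # t sorts to [(0.1, 1), (0.2, 2), (0.3, 0)]; writing ranks
                # back through the original indices yields tt = [2, 0, 1].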

            elif method in ("lower-bound", "upper-bound"):

                boundary = float(options.parameters[0])
                del options.parameters[0]
                new_value = float(options.parameters[0])
                del options.parameters[0]

                if method == "upper-bound":
                    for c in options.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] > boundary:
                                table[c][r] = new_value
                else:
                    for c in options.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] < boundary:
                                table[c][r] = new_value

            elif method == "fdr":
                pvalues = []
                for c in options.columns:
                    pvalues.extend(table[c])

                assert max(pvalues) <= 1.0, "pvalues > 1 in table: max=%s" % \
                    str(max(pvalues))
                assert min(pvalues) >= 0, "pvalue < 0 in table: min=%s" % \
                    str(min(pvalues))

                # convert to str to avoid test for float downstream
                qvalues = list(
                    map(
                        str,
                        Stats.adjustPValues(pvalues,
                                            method=options.fdr_method)))

                if options.fdr_add_column is None:
                    x = 0
                    for c in options.columns:
                        table[c] = qvalues[x:x + nrows]
                        x += nrows
                else:
                    # add new column headers
                    if len(options.columns) == 1:
                        fields.append(options.fdr_add_column)
                    else:
                        for co in options.columns:
                            fields.append(options.fdr_add_column + fields[co])

                    x = 0
                    for c in options.columns:
                        # add a new column
                        table.append(qvalues[x:x + nrows])
                        x += nrows
                    ncols += len(options.columns)
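                # For reference (a sketch, assuming Stats.adjustPValues
                # mirrors R's p.adjust): Benjamini-Hochberg sorts the m
                # p-values ascending and takes the running minimum of
                # p_(j) * m / j from the largest rank down, e.g.
                # p = [0.01, 0.04, 0.03] -> q = [0.03, 0.04, 0.04].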

            elif method == "normalize-by-table":

                other_table_name = options.parameters[0]
                del options.parameters[0]
                other_fields, other_table = CSV.readTable(
                    IOTools.open_file(other_table_name, "r"),
                    with_header=options.has_headers,
                    as_rows=False)

                # convert the other table's values to float
                # (assumes it has the same shape as the main table)
                for c in options.columns:
                    for r in range(nrows):
                        try:
                            other_table[c][r] = float(other_table[c][r])
                        except ValueError:
                            continue

                # divide by the other table, masking cells where either
                # value is non-numeric or the denominator is zero
                for c in options.columns:
                    for r in range(nrows):
                        if isinstance(table[c][r], float) and \
                                isinstance(other_table[c][r], float) and \
                                other_table[c][r] != 0:
                            table[c][r] /= other_table[c][r]
                        else:
                            table[c][r] = options.missing_value

        # convert floats back to formatted strings
        if options.format is not None:
            for c in options.columns:
                for r in range(nrows):
                    if isinstance(table[c][r], float):
                        table[c][r] = options.format % table[c][r]

        options.stdout.write("\t".join(fields) + "\n")
        if options.sort_rows:
            # rows are keyed by their first-column value; rows not listed
            # in --sort-rows are silently dropped
            old2new = {}
            for r in range(nrows):
                old2new[table[0][r]] = r
            for x in options.sort_rows.split(","):
                if x not in old2new:
                    continue
                r = old2new[x]
                options.stdout.write(
                    "\t".join(map(str, [table[c][r]
                                        for c in range(ncols)])) + "\n")
        else:
            for r in range(nrows):
                options.stdout.write(
                    "\t".join(map(str, [table[c][r]
                                        for c in range(ncols)])) + "\n")

    E.stop()
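
For context, a toy end-to-end sketch of the column-major pattern used in
the branch above, with only the standard library (CSV.readTable and the
other CGAT helpers are assumed unavailable here). Cells that fail float()
stay strings; the sketch therefore guards the numeric steps explicitly:

import sys

fields = ["gene", "score"]
table = [["g1", "g2", "g3"], ["4.0", "NA", "8.0"]]  # columns, not rows

# convert the numeric column, leaving non-numeric cells as strings
for r in range(len(table[1])):
    try:
        table[1][r] = float(table[1][r])
    except ValueError:
        continue

# normalize-by-max over the numeric cells only
m = max(x for x in table[1] if isinstance(x, float))
table[1] = [x / m if isinstance(x, float) else x for x in table[1]]

sys.stdout.write("\t".join(fields) + "\n")
for r in range(len(table[0])):
    sys.stdout.write("\t".join(map(str, [col[r] for col in table])) + "\n")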
Exemple #17
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("-u",
                      "--ucsc-genome",
                      dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("--extend",
                      dest="extension",
                      type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size",
                      dest="shift",
                      type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size",
                      dest="window_size",
                      type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations",
                      type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t",
                      "--toolset",
                      dest="toolset",
                      type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment", "dmr",
                               "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w",
                      "--bigwig-file",
                      dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment",
                      dest="treatment_files",
                      type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control",
                      dest="control_files",
                      type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input",
                      dest="input_files",
                      type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip",
                      action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata",
                      dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --ouptut-filename-pattern [%default].")

    parser.add_option("--rdata-file",
                      dest="input_rdata",
                      type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold",
                      dest="fdr_threshold",
                      type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method",
                      dest="fdr_method",
                      type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa",
                      dest="bwa",
                      action="store_true",
                      help="alignment generated with bwa"
                      "[default=%default].")

    parser.add_option("--unique",
                      dest="unique",
                      type="float",
                      help="Threshold p-value to determine which read pile\
                      ups are the result of PCR overamplification"
                      "[default=%default].")

    parser.add_option("--chroms",
                      dest="chroms",
                      type="str",
                      help="Comma delimited list of chromosomes to include"
                      "[default=%default].")

    parser.set_defaults(input_format="bam",
                        ucsc_genome="Hsapiens.UCSC.hg19",
                        genome_file=None,
                        extend=0,
                        shift=0,
                        window_size=300,
                        saturation_iterations=10,
                        toolset=[],
                        bigwig=False,
                        treatment_files=[],
                        control_files=[],
                        input_files=[],
                        output_rdata=False,
                        input_rdata=None,
                        is_medip=True,
                        fdr_threshold=0.1,
                        fdr_method="BH",
                        bwa=False,
                        unique=0.001,
                        chroms=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in CSV.DictReader(options.stdin, dialect="excel-tab"):
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" %
                        (line['chr'], int(line['start']), int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        float(line['edgeR.adj.p.value']),
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        ["0", "1"][float(line['edgeR.adj.p.value']) <
                                   options.fdr_threshold],
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" % (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return

    if len(options.treatment_files) < 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    if options.chroms is None:
        chrstring = ""
    else:
        chroms = options.chroms.split(",")
        chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms)
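    # e.g. --chroms=chr1,chr19 yields
    #   chrstring = ' chr.select=c("chr1","chr19"), '
    # which is spliced verbatim into the R calls below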
    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.%s' % options.ucsc_genome
    R.library(genome_file)

    window_size = options.window_size
    extend = options.extend
    shift = options.shift
    saturation_iterations = options.saturation_iterations

    uniq = float(options.unique)

    if options.bwa is True:
        BWA = "TRUE"
    else:
        BWA = "FALSE"

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''sr = MEDIPS.saturation(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            window_size=%(window_size)i,
            uniq=%(uniq)s,
            nit = %(saturation_iterations)i,
            paired = %(paired)s,
            bwa = %(BWA)s,
            %(chrstring)s
            nrit = 1)''' % locals())

            R.png(E.get_output_file("%s_saturation.png" % fn))
            R('''MEDIPS.plotSaturation(sr)''')
            R('''dev.off()''')
            R('''write.table(sr$estimation, file ='%s', sep='\t')''' %
              E.get_output_file("%s_saturation_estimation.tsv" % fn))

            outfile = IOTools.open_file(
                E.get_output_file("%s_saturation.tsv" % fn), "w")
            outfile.write("category\tvalues\n")
            outfile.write("estimated_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxEstCor''')]))
            outfile.write("true_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxTruCor''')]))
            outfile.write("nreads\t%s\n" %
                          ",".join(["%i" % x
                                    for x in R('''sr$numberReads''')]))
            outfile.close()

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''cr = MEDIPS.seqCoverage(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            pattern='CG',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            R.png(E.get_output_file("%s_cpg_coverage_pie.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''')
            R('''dev.off()''')

            R.png(E.get_output_file("%s_cpg_coverage_hist.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "hist", t=15)''')
            R('''dev.off()''')

            # note: this file is large
            R('''write.table(cr$cov.res, file=gzfile('%s','w'),
            sep='\t')''' %
              E.get_output_file("%s_saturation_coveredpos.tsv.gz" % fn))

    if 'enrichment' in options.toolset or do_all:
        E.info("CpG enrichment analysis")
        outfile = IOTools.open_file(E.get_output_file("enrichment.tsv.gz"),
                                    "w")
        slotnames = (("regions.CG", "regions_CG",
                      "%i"), ("regions.C", "regions_C",
                              "%s"), ("regions.G", "regions_G", "%f"),
                     ("regions.relH", "regions_relH",
                      "%i"), ("regions.GoGe", "regions_GoGe",
                              "%i"), ("genome.CG", "genome_CG",
                                      "%s"), ("genome.C", "genome_C", "%s"),
                     ("genome.G", "genome_G", "%i"), ("genome.relH",
                                                      "genome_relH", "%i"),
                     ("enrichment.score.relH", "enrichment_relH", "%s"),
                     ("enrichment.score.GoGe", "enrichment_GoGe", "%s"))

        outfile.write("\t".join(['sample'] + [x[1] for x in slotnames]) + "\n")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''ce = MEDIPS.CpGenrich(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            outfile.write("%s" % fn)
            for slotname, label, pattern in slotnames:
                value = tuple(R('''ce$%s''' % slotname))
                if len(value) == 0:
                    # slot missing from the R result - leave the cell empty
                    outfile.write("\t")
                else:
                    outfile.write("\t%s" % (pattern % value[0]))
            outfile.write("\n")
        outfile.close()

    if options.input_rdata:
        E.info("reading R session info from '%s'" % options.input_rdata)
        R('''load('%s')''' % options.input_rdata)

    else:
        if "dmr" in options.toolset or "correlation" in options.toolset \
           or do_all:
            # build the treatment and control MEDIPS set objects
            for x, fn in enumerate(options.treatment_files):
                paired = isPaired(fn)
                E.info("loading '%s'" % fn)
                R('''treatment_R%(x)i = MEDIPS.createSet(
                file='%(fn)s',
                BSgenome='%(genome_file)s',
                shift=%(shift)i,
                extend=%(extend)i,
                window_size=%(window_size)i,
                paired=%(paired)s,
                bwa=%(BWA)s,
                %(chrstring)s
                uniq=%(uniq)s)''' % locals())
            R('''treatment_set = c(%s)''' % ",".join([
                "treatment_R%i" % x
                for x in range(len(options.treatment_files))
            ]))

            if options.control_files:
                for x, fn in enumerate(options.control_files):
                    paired = isPaired(fn)
                    E.info("loading '%s'" % fn)
                    R('''control_R%(x)i = MEDIPS.createSet(
                    file='%(fn)s',
                    BSgenome='%(genome_file)s',
                    shift=%(shift)i,
                    extend=%(extend)i,
                    window_size=%(window_size)i,
                    paired=%(paired)s,
                    bwa=%(BWA)s,
                    %(chrstring)s
                    uniq=%(uniq)s)''' % locals())
                R('''control_set = c(%s)''' % ",".join([
                    "control_R%i" % x
                    for x in range(len(options.control_files))
                ]))

            # build coupling vector
            R('''CS = MEDIPS.couplingVector(pattern="CG",
            refObj = treatment_set[[1]])''')

            if "correlation" in options.toolset or do_all:
                R('''cor.matrix = MEDIPS.correlation(
                c(treatment_set, control_set))''')

                R('''write.table(cor.matrix,
                file='%s',
                sep="\t")''' % E.get_output_file("correlation"))

            if "dmr" in options.toolset or do_all:
                # Data that does not fit the model causes
                # "Error in 1:max_signal_index : argument of length 0"
                # The advice is to set MeDIP=FALSE
                # See: http://comments.gmane.org/
                # gmane.science.biology.informatics.conductor/52319

                if options.is_medip:
                    medip = "TRUE"
                else:
                    medip = "FALSE"
                fdr_method = options.fdr_method

                E.info("applying test for differential methylation")
                R('''meth = MEDIPS.meth(
                MSet1 = treatment_set,
                MSet2 = control_set,
                CSet = CS,
                ISet1 = NULL,
                ISet2 = NULL,
                p.adj = "%(fdr_method)s",
                diff.method = "edgeR",
                MeDIP = %(medip)s,
                CNV = F,
                minRowSum = 1)''' % locals())

                # Note: several Gb in size
                # Output full methylation data table
                R('''write.table(meth,
                file=gzfile('%s', 'w'),
                sep="\t",
                row.names=F,
                quote=F)''' % E.get_output_file("data.tsv.gz"))

                # save R session
                if options.output_rdata:
                    R('''save.image(file='%s', safe=FALSE)''' %
                      E.get_output_file("session.RData"))

    # DMR analysis - test for windows and output
    if "dmr" in options.toolset:

        E.info("selecting differentially methylated windows")

        # test windows for differential methylation
        fdr_threshold = options.fdr_threshold
        R('''tested = MEDIPS.selectSig(meth,
        adj=T,
        ratio=NULL,
        p.value=%(fdr_threshold)f,
        bg.counts=NULL,
        CNV=F)''' % locals())

        R('''write.table(tested,
        file=gzfile('%s', 'w'),
        sep="\t",
        quote=F)''' % E.get_output_file("significant_windows.gz"))

        # select gain and merge adjacent windows
        try:
            R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),];
            gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''')
            E.info('gain output: %s, merged: %s' %
                   (str(R('''dim(gain)''')), str(R('''dim(gain_merged)'''))))
            R('''of=gzfile('%s', 'w');
            write.table(gain_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=FALSE,
            col.names=FALSE); close(of)''' % E.get_output_file("gain.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute gain windows: msg=%s" % msg)
        # select loss and merge adjacent windows
        try:
            R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),];
            loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''')
            E.info('loss output: %s, merged: %s' %
                   (str(R('''dim(loss)''')), str(R('''dim(loss_merged)'''))))

            R('''of=gzfile('%s', 'w');
            write.table(loss_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=F,
            col.names=F); close(of)''' % E.get_output_file("loss.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute loss windows: msg=%s" % msg)

    # if "rpm" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rpm.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = T, descr = "rpm")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # if "rms" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rms.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = F, descr = "rms")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # write footer and output benchmark information.
    E.stop()
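
The R snippets above are ordinary Python %-templates filled from locals()
before rpy2 evaluates them. A minimal sketch of the pattern (no rpy2 needed
to see the substitution; note that file paths are single-quoted, so a quote
inside a path would break the generated R code):

shift, extend, window_size, uniq = 0, 0, 300, 0.001
fn = "treatment.bam"
code = '''sr = MEDIPS.saturation(
file='%(fn)s',
shift=%(shift)i,
extend=%(extend)i,
window_size=%(window_size)i,
uniq=%(uniq)s)''' % locals()
print(code)  # the string that would be handed to R()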
Exemple #18
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_cut.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-r",
                      "--remove",
                      dest="remove",
                      action="store_true",
                      help="remove specified columns, keep all others.")

    parser.add_option("-u",
                      "--unique",
                      dest="unique",
                      action="store_true",
                      help="output rows are uniq.")

    parser.add_option("-l",
                      "--large",
                      dest="large",
                      action="store_true",
                      help="large columns. Do not use native python CSV "
                      "module [default=%default].")

    parser.add_option("-f",
                      "--filename-fields",
                      dest="filename_fields",
                      type="string",
                      help="filename with field information.")

    parser.set_defaults(
        remove=False,
        unique=False,
        large=False,
        filename_fields=None,
    )

    (options, args) = E.start(parser, add_csv_options=True, quiet=True)

    input_fields = args

    if options.filename_fields:
        input_fields = [
            line[:-1].split("\t")[0]
            for line in IOTools.open_file(options.filename_fields, "r")
            if not line.startswith("#")
        ]

    if options.unique:
        outfile = UniqueBuffer(options.stdout)
    else:
        outfile = options.stdout

    # skip comment lines; the first real line provides the header
    while 1:
        line = options.stdin.readline()

        if not line:
            E.stop()
            sys.exit(0)

        if line[0] == "#":
            continue

        first_line = line
        break

    old_fields = first_line[:-1].split("\t")

    fields = []
    for f in input_fields:
        # do pattern search
        if f[0] == "%" and f[-1] == "%":
            pattern = re.compile(f[1:-1])
            for o in old_fields:
                if pattern.search(o) and o not in fields:
                    fields.append(o)
        else:
            if f in old_fields:
                fields.append(f)
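    # e.g. with old_fields = ["name", "score1", "score2"], the argument
    # %score% selects score1 and score2, while a plain score1 selects
    # only that column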

    if options.remove:
        fields = set(fields)
        fields = [x for x in old_fields if x not in fields]

    if options.large:
        reader = CSV.DictReaderLarge(CSV.CommentStripper(options.stdin),
                                     fieldnames=old_fields,
                                     dialect=options.csv_dialect)
    else:
        reader = csv.DictReader(CSV.CommentStripper(options.stdin),
                                fieldnames=old_fields,
                                dialect=options.csv_dialect)

    writer = csv.DictWriter(outfile,
                            fields,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    print("\t".join(fields))

    ninput, noutput, nerrors = 0, 0, 0

    while 1:
        try:
            row = six.next(reader)
        except _csv.Error as msg:
            # malformed row: count it, report, and keep going
            ninput += 1
            options.stderr.write("# error while parsing: %s\n" % (msg))
            nerrors += 1
            continue
        except StopIteration:
            break
        if not row:
            break
        ninput += 1
        writer.writerow(row)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nerrors=%i" % (ninput, noutput, nerrors))

    E.stop()
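
UniqueBuffer is referenced above but not defined in this excerpt. A minimal
sketch of what --unique plausibly wraps around stdout (the class name matches
the call site; the body below is an assumption, not the original code):

class UniqueBuffer:
    """File-like wrapper that drops duplicate lines on write."""

    def __init__(self, outfile):
        self.outfile = outfile
        self.seen = set()

    def write(self, line):
        # forward only lines that have not been written before
        if line not in self.seen:
            self.seen.add(line)
            self.outfile.write(line)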