Example #1
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: codonbias_weights2tsv.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("--methods",
                      dest="methods",
                      type="string",
                      help="methods to apply.")

    parser.add_option("--is-frequencies",
                      dest="is_frequencies",
                      action="store_true",
                      help="data is frequencies (default: weights).")

    parser.add_option("-s",
                      "--sort",
                      dest="sort",
                      type="choice",
                      choices=("percent-difference", "aa"),
                      help="sort order of output table.")

    parser.add_option(
        "-g",
        "--global-sort",
        dest="global_sort",
        action="store_true",
        help="globally sort results (otherwise: by species pair).")

    parser.set_defaults(
        methods="",
        is_frequencies=False,
        sort="percent-difference",
        global_sort=False,
    )

    (options, args) = E.Start(parser)
    if options.methods:
        options.methods = options.methods.split(",")

    fields, table = CSV.ReadTable(sys.stdin)

    # convert weights to floats
    table = CSV.getConvertedTable(table, range(1, len(fields)))

    for method in options.methods:

        if method == "overview":
            if options.is_frequencies:
                WriteOverviewFrequencies(fields, table, options)
            else:
                WriteOverviewWeights(fields, table, options)
Example #2
def readAndGroupTable(infile, options):
    """read table from infile and group.
    """
    fields, table = CSV.ReadTable(infile, with_header=options.has_headers,
                                  as_rows=True)
    options.columns = getColumns(fields, options.columns)
    assert options.group_column not in options.columns

    converter = float
    new_fields = [fields[options.group_column]] + \
        [fields[x] for x in options.columns]

    if options.group_function == "min":
        f = min
    elif options.group_function == "max":
        f = max
    elif options.group_function == "sum":
        f = lambda z: reduce(lambda x, y: x + y, z)
    elif options.group_function == "mean":
        f = scipy.mean
    elif options.group_function == "cat":
        f = lambda x: ";".join([y for y in x if y != ""])
        converter = str
    elif options.group_function == "uniq":
        f = lambda x: ";".join([y for y in set(x) if y != ""])
        converter = str
    elif options.group_function == "stats":
        f = lambda x: str(Stats.DistributionalParameters(x))
        # update headers
        new_fields = [fields[options.group_column]]
        for c in options.columns:
            new_fields += ["%s_%s" % (fields[c], x)
                           for x in Stats.DistributionalParameters().getHeaders()]

    # convert values in options.columns to floats (except for group_column)
    # and drop rows containing values that cannot be converted
    new_table = []
    for row in table:
        skip = False
        new_row = [row[options.group_column]]

        for c in options.columns:
            if row[c] == options.missing_value:
                new_row.append(row[c])
            else:
                try:
                    new_row.append(converter(row[c]))
                except ValueError:
                    skip = True
                    break
        if not skip:
            new_table.append(new_row)
    table = new_table

    new_rows = CSV.GroupTable(table,
                              group_column=0,
                              group_function=f)

    options.stdout.write("\t".join(new_fields) + "\n")
    for row in new_rows:
        options.stdout.write("\t".join(map(str, row)) + "\n")
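
CSV.GroupTable above is a CGAT helper. As a rough illustration of what the grouping step does, here is a minimal pure-Python stand-in (group_rows is a hypothetical name, and the sketch assumes the grouping key sits in column 0):

import collections

def group_rows(rows, group_function=max):
    """Group rows on column 0 and reduce every other column with
    *group_function* (a simplified stand-in for CSV.GroupTable)."""
    groups = collections.OrderedDict()
    for row in rows:
        groups.setdefault(row[0], []).append(row)
    for key, members in groups.items():
        columns = list(zip(*members))  # transpose the member rows
        yield [key] + [group_function(col) for col in columns[1:]]

rows = [["geneA", 1.0, 2.0], ["geneB", 5.0, 0.5], ["geneA", 3.0, 4.0]]
for grouped in group_rows(rows):
    print("\t".join(map(str, grouped)))
# geneA  3.0  4.0
# geneB  5.0  0.5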
Example #3
def ReadGeneLists(filename_genes, gene_pattern=None):
    """read gene lists from filename in matrix.

    returns a tuple (list of all genes, dictionary of gene lists)
    """

    if filename_genes == "-":
        infile = sys.stdin
    else:
        infile = IOTools.openFile(filename_genes, "r")

    headers, table = CSV.readTable(infile.readlines(), as_rows=False)

    if filename_genes != "-":
        infile.close()

    all_genes = table[0]

    # if there is only a single column, add a dummy column
    if len(table) == 1:
        table.append([1] * len(table[0]))
        headers.append("foreground")

    E.info("read %i genes from %s" % (len(all_genes), filename_genes))

    if gene_pattern:
        rx = re.compile(gene_pattern)
        all_genes = [rx.search(x).groups()[0] for x in all_genes]

    gene_lists = collections.OrderedDict()
    for header, col in zip(headers[1:], table[1:]):
        s = list(set([x for x, y in zip(all_genes, col) if y != "0"]))
        gene_lists[header] = set(s)

    return all_genes, gene_lists
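
Conceptually, the final loop turns a 0/1 membership matrix into one gene set per column. A small self-contained sketch of just that step, with made-up toy data and no CGAT imports:

import collections

headers = ["gene_id", "up", "down"]
columns = [["g1", "g2", "g3"],   # gene identifiers
           ["1", "0", "1"],      # membership flags for list "up"
           ["0", "1", "0"]]      # membership flags for list "down"

all_genes = columns[0]
gene_lists = collections.OrderedDict()
for header, col in zip(headers[1:], columns[1:]):
    gene_lists[header] = set(g for g, flag in zip(all_genes, col)
                             if flag != "0")

print(gene_lists)  # e.g. OrderedDict([('up', {'g1', 'g3'}), ('down', {'g2'})])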
Example #4
def ReadGeneLists(filename_genes, gene_pattern=None):
    """read gene lists from filename in matrix.

    returns a tuple (list of all genes, dictionary of gene lists) 
    """

    if filename_genes == "-":
        infile = sys.stdin
    else:
        infile = IOTools.openFile(filename_genes, "r")

    headers, table = CSV.readTable(infile.readlines(), as_rows=False)

    if filename_genes != "-":
        infile.close()

    all_genes = table[0]

    # if there is only a single column, add a dummy column
    if len(table) == 1:
        table.append([1] * len(table[0]))
        headers.append("foreground")

    E.info("read %i genes from %s" % (len(all_genes), filename_genes))

    if gene_pattern:
        rx = re.compile(gene_pattern)
        all_genes = [rx.search(x).groups()[0] for x in all_genes]

    gene_lists = collections.OrderedDict()
    for header, col in zip(headers[1:], table[1:]):
        s = list(set([x for x, y in zip(all_genes, col) if y != "0"]))
        gene_lists[header] = set(s)

    return all_genes, gene_lists
Example #5
def readAndExpandTable(infile, options):
    '''splits fields in table at separator. 

    If a field in a row contains multiple values,
    the row is expanded into multiple rows such
    that all values have space.
    '''

    fields, table = CSV.readTable(
        infile, with_header=options.has_headers, as_rows=True)

    options.stdout.write("\t".join(fields) + "\n")

    for row in table:

        data = []
        for x in range(len(fields)):
            data.append(row[x].split(options.separator))

        nrows = max([len(d) for d in data])

        for d in data:
            d += [""] * (nrows - len(d))

        for n in range(nrows):
            options.stdout.write("\t".join([d[n] for d in data]) + "\n")
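
The expansion logic itself is independent of the CSV module; a compact standard-library sketch of the same idea (split multi-valued cells, pad the shorter ones with empty strings, emit one row per value) could read:

def expand_row(row, separator=";"):
    """Split each cell at *separator* and yield one padded row per value."""
    cells = [cell.split(separator) for cell in row]
    depth = max(len(c) for c in cells)
    for c in cells:
        c += [""] * (depth - len(c))
    for n in range(depth):
        yield [c[n] for c in cells]

for out in expand_row(["id1", "2;3", "a;b;c"]):
    print(out)
# ['id1', '2', 'a']
# ['', '3', 'b']
# ['', '', 'c']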
Example #6
def getGODescriptions(infile):
    """build dictionary mapping GOids to types and descriptions.

    Arguments
    ---------
    infile : string
        Filename of table with GO assignments

    Returns
    -------
    mapping : dict
        Dictionary mapping GOid to GOtype and GOdescription.
    """

    with IOTools.openFile(infile) as inf:
        fields, table = CSV.readTable(inf, as_rows=False)

    return dict(
        [
            (y, (x, z))
            for x, y, z in zip(
                table[fields.index("go_type")], table[fields.index("go_id")], table[fields.index("description")]
            )
        ]
    )
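
The return statement zips three parallel columns (read column-wise via as_rows=False) into a dictionary keyed on go_id. The same construction with plain toy lists, for illustration:

go_type = ["biol_process", "mol_function"]
go_id = ["GO:0008150", "GO:0003674"]
description = ["biological_process", "molecular_function"]

mapping = {i: (t, d) for t, i, d in zip(go_type, go_id, description)}
print(mapping["GO:0008150"])  # ('biol_process', 'biological_process')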
Example #7
def readAndExpandTable(infile, options):
    '''splits fields in table at separator.

    If a field in a row contains multiple values,
    the row is expanded into multiple rows such
    that all values have space.
    '''

    fields, table = CSV.readTable(infile,
                                  with_header=options.has_headers,
                                  as_rows=True)

    options.stdout.write("\t".join(fields) + "\n")

    for row in table:

        data = []
        for x in range(len(fields)):
            data.append(row[x].split(options.separator))

        nrows = max([len(d) for d in data])

        for d in data:
            d += [""] * (nrows - len(d))

        for n in range(nrows):
            options.stdout.write("\t".join([d[n] for d in data]) + "\n")
Example #8
def buildSelectStatementfromPed(filter_type, pedfile, template):
    '''Build a select statement from a template and a pedigree file'''
    pedigree = csv.DictReader(
        IOTools.openFile(pedfile),
        delimiter='\t',
        fieldnames=['family', 'sample', 'father', 'mother', 'sex', 'status'])
    affecteds = []
    unaffecteds = []
    parents = []
    select = None
    # loop over pedigree file and establish relationships
    for row in pedigree:
        if row['status'] == '2':
            if filter_type == "denovo":
                father = row['father']
                mother = row['mother']
                proband = row['sample']
            elif filter_type == "dominant" or filter_type == "recessive":
                affecteds += [row['sample']]
            if filter_type == "recessive":
                parents += [row['father'], row['mother']]
        if row['status'] == '1':
            if filter_type == "dominant":
                unaffecteds += [row['sample']]
            elif filter_type == "recessive":
                if row['sample'] not in parents:
                    unaffecteds += [row['sample']]

    # Build select statement from template
    if filter_type == "denovo":
        select = template.replace("father", father)
        select = select.replace("mother", mother)
        select = select.replace("proband", proband)
    elif filter_type == "dominant":
        affecteds_exp = '").getPL().1==0&&vc.getGenotype("'.join(affecteds)
        if len(unaffecteds) == 0:
            unaffecteds_exp = ''
        else:
            unaffecteds_exp = '&&vc.getGenotype("' + \
                ('").isHomRef()&&vc.getGenotype("'.join(unaffecteds)) + \
                '").isHomRef()'
        select = template.replace("affecteds_exp", affecteds_exp)
        select = select.replace("unaffecteds_exp", unaffecteds_exp)
    elif filter_type == "recessive":
        affecteds_exp = '").getPL().2==0&&vc.getGenotype("'.join(affecteds)
        unaffecteds_exp = '").getPL().2!=0&&vc.getGenotype("'.join(unaffecteds)
        if len(parents) == 0:
            parents_exp = ''
        else:
            parents_exp = '&&vc.getGenotype("' + \
                ('").getPL().1==0&&vc.getGenotype("'.join(parents)) + \
                '").getPL().1==0'
        select = template.replace("affecteds_exp", affecteds_exp)
        select = select.replace("unaffecteds_exp", unaffecteds_exp)
        select = select.replace("parents_exp", parents_exp)

    return select
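
In the denovo branch, the placeholders father, mother and proband in the caller-supplied template are replaced literally with sample identifiers from the pedigree. A toy illustration with an invented template string (not the actual expression used by the pipeline):

template = ('vc.getGenotype("proband").isHet()&&'
            'vc.getGenotype("father").isHomRef()&&'
            'vc.getGenotype("mother").isHomRef()')

father, mother, proband = "FAM1-dad", "FAM1-mum", "FAM1-child"

select = template.replace("father", father)
select = select.replace("mother", mother)
select = select.replace("proband", proband)
print(select)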
Example #9
def getGODescriptions(infile):
    '''return dictionary mapping GO category to description
    and namespace.
    '''

    with IOTools.openFile(infile) as inf:
        fields, table = CSV.ReadTable(inf, as_rows=False)

    return dict([
        (y, (x, z)) for x, y, z in zip(table[fields.index("go_type")], table[
            fields.index("go_id")], table[fields.index("description")])
    ])
Example #10
def readAndJoinTable(infile, options):

    fields, table = CSV.readTable(infile,
                                  with_header=options.has_headers,
                                  as_rows=True)

    join_column = options.join_column - 1
    join_name = options.join_column_name - 1

    join_rows = list(set([x[join_column] for x in table]))
    join_rows.sort()

    join_names = list(set([x[join_name] for x in table]))
    join_names.sort()

    join_columns = list(
        set(range(len(fields))).difference(set((join_column, join_name))))
    join_columns.sort()

    new_table = []
    map_old2new = {}

    map_name2start = {}
    x = 1
    for name in join_names:
        map_name2start[name] = x
        x += len(join_columns)

    row_width = len(join_columns) * len(join_names)
    for x in join_rows:
        map_old2new[x] = len(map_old2new)
        new_row = [
            x,
        ] + ["na"] * row_width
        new_table.append(new_row)

    for row in table:
        row_index = map_old2new[row[join_column]]
        start = map_name2start[row[join_name]]
        for x in join_columns:
            new_table[row_index][start] = row[x]
            start += 1

    # print new table
    options.stdout.write(fields[join_column])
    for name in join_names:
        for column in join_columns:
            options.stdout.write("\t%s%s%s" %
                                 (name, options.separator, fields[column]))
    options.stdout.write("\n")

    for row in new_table:
        options.stdout.write("\t".join(row) + "\n")
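
The function pivots a long table (one row per key/name pair) into a wide one (one row per key, one block of columns per name). A tiny stand-alone sketch of that reshaping with invented data:

# long format: (key, name, value)
long_rows = [("geneA", "liver", "1.2"),
             ("geneA", "brain", "0.4"),
             ("geneB", "liver", "2.5")]

keys = sorted(set(r[0] for r in long_rows))
names = sorted(set(r[1] for r in long_rows))

wide = {k: ["na"] * len(names) for k in keys}
for key, name, value in long_rows:
    wide[key][names.index(name)] = value

print("\t".join(["id"] + names))
for k in keys:
    print("\t".join([k] + wide[k]))
# id     brain  liver
# geneA  0.4    1.2
# geneB  na     2.5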
Example #11
def row_iter(rows, reader):
    for row in rows:
        yield quoteRow(row, take, map_column2type,
                       options.missing_values,
                       null=options.null,
                       string_value=options.string_value)
    for data in reader:
        yield quoteRow(CSV.ConvertDictionary(data, map=options.map),
                       take,
                       map_column2type,
                       options.missing_values,
                       null=options.null,
                       string_value=options.string_value)
Example #12
def readAndJoinTable(infile, options):

    fields, table = CSV.readTable(
        infile, with_header=options.has_headers, as_rows=True)

    join_column = options.join_column - 1
    join_name = options.join_column_name - 1

    join_rows = list(set(map(lambda x: x[join_column], table)))
    join_rows.sort()

    join_names = list(set(map(lambda x: x[join_name], table)))
    join_names.sort()

    join_columns = list(
        set(range(len(fields))).difference(set((join_column, join_name))))
    join_columns.sort()

    new_table = []
    map_old2new = {}

    map_name2start = {}
    x = 1
    for name in join_names:
        map_name2start[name] = x
        x += len(join_columns)

    row_width = len(join_columns) * len(join_names)
    for x in join_rows:
        map_old2new[x] = len(map_old2new)
        new_row = [x, ] + ["na"] * row_width
        new_table.append(new_row)

    for row in table:
        row_index = map_old2new[row[join_column]]
        start = map_name2start[row[join_name]]
        for x in join_columns:
            new_table[row_index][start] = row[x]
            start += 1

    # print new table
    options.stdout.write(fields[join_column])
    for name in join_names:
        for column in join_columns:
            options.stdout.write(
                "\t%s%s%s" % (name, options.separator, fields[column]))
    options.stdout.write("\n")

    for row in new_table:
        options.stdout.write("\t".join(row) + "\n")
Example #13
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: codonbias_weights2tsv.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("--methods", dest="methods", type="string",
                      help="methods to apply.")

    parser.add_option("--is-frequencies", dest="is_frequencies", action="store_true",
                      help="data is frequencies (default: weights).")

    parser.add_option("-s", "--sort", dest="sort", type="choice",
                      choices=("percent-difference", "aa"),
                      help="sort order of output table.")

    parser.add_option("-g", "--global-sort", dest="global_sort", action="store_true",
                      help="globally sort results (otherwise: by species pair).")

    parser.set_defaults(
        methods="",
        is_frequencies=False,
        sort="percent-difference",
        global_sort=False,
    )

    (options, args) = E.Start(parser)
    if options.methods:
        options.methods = options.methods.split(",")

    fields, table = CSV.ReadTable(sys.stdin)

    # convert weights to floats
    table = CSV.getConvertedTable(table, range(1, len(fields)))

    for method in options.methods:

        if method == "overview":
            if options.is_frequencies:
                WriteOverviewFrequencies(fields, table, options)
            else:
                WriteOverviewWeights(fields, table, options)
Example #14
def readAndCollapseTable(infile, options, missing_value=""):
    '''collapse a table.

    Collapse a table of two columns with row names in the first
    column. Outputs a table with multiple columns for each row name.
    '''

    fields, table = CSV.readTable(infile,
                                  with_header=options.has_headers,
                                  as_rows=True)

    if len(fields) != 2:
        raise NotImplementedError("can only work on tables with two columns")

    values = collections.defaultdict(list)

    # column header after which to add
    separator = table[0][0]
    row_names = set([x[0] for x in table])

    row_name, value = table[0]

    values[row_name].append(value)
    added = set([row_name])
    for row_name, value in table[1:]:
        if row_name == separator:
            for r in row_names:
                if r not in added:
                    values[r].append(missing_value)
            added = set()

        values[row_name].append(value)
        added.add(row_name)

    for r in row_names:
        if r not in added:
            values[r].append(missing_value)

    sizes = set([len(x) for x in list(values.values())])
    assert len(sizes) == 1, "unequal number of row_names"
    size = list(sizes)[0]

    options.stdout.write("row\t%s\n" %
                         ("\t".join(["column_%i" % x for x in range(size)])))

    for key, row in list(values.items()):
        options.stdout.write("%s\t%s\n" % (key, "\t".join(row)))
Example #15
def readAndCollapseTable(infile, options, missing_value=""):
    '''collapse a table.

    Collapse a table of two columns with row names in the first
    column. Outputs a table with multiple columns for each row name.
    '''

    fields, table = CSV.readTable(
        infile, with_header=options.has_headers, as_rows=True)

    if len(fields) != 2:
        raise NotImplementedError("can only work on tables with two columns")

    values = collections.defaultdict(list)

    # column header after which to add
    separator = table[0][0]
    row_names = set([x[0] for x in table])

    row_name, value = table[0]

    values[row_name].append(value)
    added = set([row_name])
    for row_name, value in table[1:]:
        if row_name == separator:
            for r in row_names:
                if r not in added:
                    values[r].append(missing_value)
            added = set()

        values[row_name].append(value)
        added.add(row_name)

    for r in row_names:
        if r not in added:
            values[r].append(missing_value)

    sizes = set([len(x) for x in values.values()])
    assert len(sizes) == 1, "unequal number of row_names"
    size = list(sizes)[0]

    options.stdout.write(
        "row\t%s\n" % ("\t".join(["column_%i" % x for x in range(size)])))

    for key, row in values.items():
        options.stdout.write("%s\t%s\n" % (key, "\t".join(row)))
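
A simplified stand-alone sketch of the collapse (assuming every block contains every row name, so the missing-value padding above is not needed):

import collections

# two-column input; a repeat of the first row name starts a new block
table = [("geneA", "1.0"), ("geneB", "2.0"),
         ("geneA", "3.0"), ("geneB", "4.0")]

values = collections.defaultdict(list)
for row_name, value in table:
    values[row_name].append(value)

size = len(table) // len(values)
print("row\t" + "\t".join("column_%i" % i for i in range(size)))
for key, row in values.items():
    print("%s\t%s" % (key, "\t".join(row)))
# row    column_0  column_1
# geneA  1.0       3.0
# geneB  2.0       4.0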
Example #16
def iterateMacs2Peaks(infile):
    '''iterate over peaks.xls file and return parsed data.

    pvalues and fdr are converted to values between 0 and 1
    from their -log10 values.
    '''

    for row in CSV.DictReader(infile, dialect='excel-tab'):
        # these are 1-based coordinates
        # macs can have negative start coordinates
        # start
        try:
            yield Macs2Peak._make(
                (row['chr'], max(int(row['start']) - 1, 0), int(row['end']),
                 int(row['length']), float(row['pileup']),
                 math.pow(10, -float(row['-log10(pvalue)'])),
                 float(row['fold_enrichment']),
                 math.pow(10, -float(row['-log10(qvalue)'])), row['name']))
        except KeyError as msg:
            raise KeyError("%s: %s" % (msg, row))
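
MACS2 reports p-values and q-values as -log10 scores; recovering the raw probability is a single power of ten, for example:

import math

neg_log10_pvalue = 7.3  # as reported in the -log10(pvalue) column
pvalue = math.pow(10, -neg_log10_pvalue)
print("%.2e" % pvalue)  # 5.01e-08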
Example #17
def getGODescriptions(infile):
    '''build dictionary mapping GOids to types and descriptions.

    Arguments
    ---------
    infile : string
        Filename of table with GO assignments

    Returns
    -------
    mapping : dict
        Dictionary mapping GOid to GOtype and GOdescription.
    '''

    with IOTools.openFile(infile) as inf:
        fields, table = CSV.readTable(inf, as_rows=False)

    return dict([
        (y, (x, z)) for x, y, z in zip(table[fields.index("go_type")], table[
            fields.index("go_id")], table[fields.index("description")])
    ])
Example #18
def computeFDR(infile, options):
    '''compute FDR on a table.
    '''

    fields, table = CSV.ReadTable(infile, with_header=options.has_headers,
                                  as_rows=True)

    options.stdout.write("\t".join(fields) + "\n")

    for row in table:

        data = []
        for x in range(len(fields)):
            data.append(row[x].split(options.separator))

        nrows = max([len(d) for d in data])

        for d in data:
            d += [""] * (nrows - len(d))

        for n in range(nrows):
            options.stdout.write("\t".join([d[n] for d in data]) + "\n")
Example #19
def computeFDR(infile, options):
    '''compute FDR on a table.
    '''

    fields, table = CSV.readTable(
        infile, with_header=options.has_headers, as_rows=True)

    options.stdout.write("\t".join(fields) + "\n")

    for row in table:

        data = []
        for x in range(len(fields)):
            data.append(row[x].split(options.separator))

        nrows = max([len(d) for d in data])

        for d in data:
            d += [""] * (nrows - len(d))

        for n in range(nrows):
            options.stdout.write("\t".join([d[n] for d in data]) + "\n")
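
The actual multiple-testing correction in this codebase is delegated to Stats.adjustPValues (see the fdr method in Example #21). A self-contained sketch of the Benjamini-Hochberg procedure, written from the textbook definition rather than taken from the CGAT source:

def bh_adjust(pvalues):
    """Benjamini-Hochberg adjusted p-values."""
    n = len(pvalues)
    order = sorted(range(n), key=lambda i: pvalues[i])
    adjusted = [0.0] * n
    running_min = 1.0
    # walk from the largest p-value down, keeping adjusted values monotone
    for rank, i in reversed(list(enumerate(order, start=1))):
        running_min = min(running_min, pvalues[i] * n / rank)
        adjusted[i] = running_min
    return adjusted

print(bh_adjust([0.01, 0.04, 0.03, 0.005]))  # [0.02, 0.04, 0.04, 0.02]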
Example #20
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: data2bins.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--column",
                      dest="column",
                      type="int",
                      help="column to split on.")

    parser.add_option("--num-bins",
                      dest="num_bins",
                      type="int",
                      help="number of bins to create.")

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("equal-sized-bins", ),
                      help="method to use to bin data.")

    parser.add_option("--no-headers",
                      dest="has_headers",
                      action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option(
        "-p",
        "--output-filename-pattern",
        dest="output_filename_pattern",
        type="string",
        help=
        "OUTPUT filename with histogram information on aggregate coverages [%default]."
    )

    parser.set_defaults(
        has_headers=True,
        method="equal-sized-bins",
        column=1,
        num_bins=4,
        output_filename_pattern="bin%i",
    )

    (options, args) = E.Start(parser)
    options.column -= 1

    if args:
        if args[0] == "-":
            infile = sys.stdin
        else:
            infile = open(args[0], "r")
    else:
        infile = sys.stdin

    fields, data = CSV.ReadTable(infile)

    c = options.column
    values = [float(x[c]) for x in data]

    bins = []

    if options.method == "equal-sized-bins":
        increment = int(math.floor(float(len(values)) / options.num_bins))
        indices = range(0, len(values))
        indices.sort(key=lambda x: values[x])
        for x in xrange(len(values)):
            values[indices[x]] = x
        bins = range(0, len(values) - increment, increment)

    elif options.method == "pass":
        pass

    E.debug("bins=%s" % str(bins))

    outputters = []
    for x in xrange(0, len(bins)):
        outputters.append(
            Outputter(options.output_filename_pattern % x, fields))

    # output tables
    for x in xrange(0, len(data)):
        bin = bisect.bisect(bins, values[x]) - 1
        outputters[bin].write(data[x])

    # stats
    if options.loglevel >= 1:
        options.stdlog.write("# bin\tstart\tcounts\tfilename\n")
        for x in xrange(0, len(bins)):
            options.stdlog.write(
                "# %i\t%f\t%i\t%s\n" %
                (x, bins[x], outputters[x].mCounts, outputters[x].mFilename))

    E.info("ninput=%i, noutput=%i" %
           (len(data), sum((x.mCounts for x in outputters))))

    E.Stop()
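
The equal-sized-bins method replaces each value by its rank and then cuts the rank range into bins of roughly equal membership. A compact stand-alone version of the same idea, with slightly simpler boundary handling than the script above:

values = [5.0, 1.0, 9.0, 3.0, 7.0, 2.0, 8.0, 4.0]
num_bins = 4

# rank each value, then map ranks onto num_bins equally populated bins
order = sorted(range(len(values)), key=lambda i: values[i])
ranks = [0] * len(values)
for rank, index in enumerate(order):
    ranks[index] = rank

for value, rank in zip(values, ranks):
    print(value, "-> bin", rank * num_bins // len(values))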
Example #21
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="choice",
        action="append",
        choices=("transpose", "normalize-by-max", "normalize-by-value",
                 "multiply-by-value", "percentile", "remove-header",
                 "normalize-by-table", "upper-bound", "lower-bound",
                 "kullback-leibler", "expand", "compress", "fdr", "grep"),
        help="""actions to perform on table.""")

    parser.add_option("-s",
                      "--scale",
                      dest="scale",
                      type="float",
                      help="factor to scale matrix by.")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="string",
                      help="output number format [default]")

    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="Parameters for various functions.")

    parser.add_option("-t",
                      "--header-names",
                      dest="has_headers",
                      action="store_true",
                      help="matrix has row/column headers.")

    parser.add_option("--transpose",
                      dest="transpose",
                      action="store_true",
                      help="transpose table.")

    parser.add_option(
        "--set-transpose-field",
        dest="set_transpose_field",
        type="string",
        help="set first field (row 1 and col 1) to this value [%default].")

    parser.add_option("--transpose-format",
                      dest="transpose_format",
                      type="choice",
                      choices=(
                          "default",
                          "separated",
                      ),
                      help="input format of un-transposed table")

    parser.add_option(
        "--expand",
        dest="expand_table",
        action="store_true",
        help="expand table - multi-value cells with be expanded over "
        "several rows.")

    parser.add_option("--no-headers",
                      dest="has_headers",
                      action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("--columns",
                      dest="columns",
                      type="string",
                      help="columns to use.")

    parser.add_option("--file",
                      dest="file",
                      type="string",
                      help="columns to test from table.",
                      metavar="FILE")

    parser.add_option("-d",
                      "--delimiter",
                      dest="delimiter",
                      type="string",
                      help="delimiter of columns.",
                      metavar="DELIM")

    parser.add_option("-V",
                      "--invert-match",
                      dest="invert_match",
                      action="store_true",
                      help="invert match.")

    parser.add_option("--sort-by-rows",
                      dest="sort_rows",
                      type="string",
                      help="output order for rows.")

    parser.add_option("-a",
                      "--value",
                      dest="value",
                      type="float",
                      help="value to use for various algorithms.")

    parser.add_option("--group",
                      dest="group_column",
                      type="int",
                      help="group values by column. Supply an integer column "
                      "[default=%default]")

    parser.add_option("--group-function",
                      dest="group_function",
                      type="choice",
                      choices=("min", "max", "sum", "mean", "stats", "cat",
                               "uniq"),
                      help="function to group values by.")

    parser.add_option("--join-table",
                      dest="join_column",
                      type="int",
                      help="join rows in a table by columns.")

    parser.add_option(
        "--collapse-table",
        dest="collapse_table",
        type="string",
        help="collapse a table. Value determines the missing variable "
        "[%default].")

    parser.add_option("--join-column-name",
                      dest="join_column_name",
                      type="int",
                      help="use this column as a prefix.")

    parser.add_option("--flatten-table",
                      dest="flatten_table",
                      action="store_true",
                      help="flatten a table [%default].")

    parser.add_option("--as-column",
                      dest="as_column",
                      action="store_true",
                      help="output table as a single column.")

    parser.add_option("--split-fields",
                      dest="split_fields",
                      action="store_true",
                      help="split fields.")

    parser.add_option(
        "--separator",
        dest="separator",
        type="string",
        help="separator for multi-valued fields [default=%default].")

    parser.add_option(
        "--fdr-method",
        dest="fdr_method",
        type="choice",
        choices=("BH", "bonferroni", "holm", "hommel", "hochberg", "BY"),
        help="method to perform multiple testing correction by controlling "
        "the fdr [default=%default].")

    parser.add_option(
        "--fdr-add-column",
        dest="fdr_add_column",
        type="string",
        help="add new column instead of replacing existing columns. "
        "The value of the option will be used as prefix if there are "
        "multiple columns [%default]")

    # IMS: add option to use a column as the row id in flatten
    parser.add_option(
        "--id-column",
        dest="id_column",
        type="string",
        help="list of column(s) to use as the row id when flattening "
        "the table. If None, then row number is used. [default=%default].")

    parser.add_option(
        "--variable-name",
        dest="variable_name",
        type="string",
        help="the column header for the 'variable' column when flattening "
        "[default=%default].")

    parser.add_option(
        "--value-name",
        dest="value_name",
        type="string",
        help="the column header for the 'value' column when flattening "
        "[default=%default].")

    parser.set_defaults(
        methods=[],
        scale=1.0,
        has_headers=True,
        format=None,
        value=0.0,
        parameters="",
        columns="all",
        transpose=False,
        set_transpose_field=None,
        transpose_format="default",
        group=False,
        group_column=0,
        group_function="mean",
        missing_value="na",
        sort_rows=None,
        flatten_table=False,
        collapse_table=None,
        separator=";",
        expand=False,
        join_column=None,
        join_column_name=None,
        compute_fdr=None,
        as_column=False,
        fdr_method="BH",
        fdr_add_column=None,
        id_column=None,
        variable_name="column",
        value_name="value",
        file=None,
        delimiter="\t",
        invert_match=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    options.parameters = options.parameters.split(",")

    if options.group_column:
        options.group = True
        options.group_column -= 1

    ######################################################################
    ######################################################################
    ######################################################################
    # if only to remove header, do this quickly
    if options.methods == ["remove-header"]:

        first = True
        for line in options.stdin:
            if line[0] == "#":
                continue
            if first:
                first = False
                continue
            options.stdout.write(line)

    elif options.transpose or "transpose" in options.methods:

        readAndTransposeTable(options.stdin, options)

    elif options.flatten_table:
        # IMS: bug fixed to make work. Also added options for keying
        # on a particular and adding custom column headings

        fields, table = CSV.readTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=True)

        options.columns = getColumns(fields, options.columns)

        if options.id_column:
            id_columns = [int(x) - 1 for x in options.id_column.split(",")]
            id_header = "\t".join(
                [fields[id_column] for id_column in id_columns])
            options.columns = [
                x for x in options.columns if x not in id_columns
            ]
        else:
            id_header = "row"

        options.stdout.write(
            "%s\t%s\t%s\n" %
            (id_header, options.variable_name, options.value_name))

        for x, row in enumerate(table):

            if options.id_column:
                row_id = "\t".join(
                    [row[int(x) - 1] for x in options.id_column.split(",")])
            else:
                row_id = str(x)

            for y in options.columns:
                options.stdout.write("%s\t%s\t%s\n" %
                                     (row_id, fields[y], row[y]))

    elif options.as_column:

        fields, table = CSV.readTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=True)
        options.columns = getColumns(fields, options.columns)
        table = list(zip(*table))

        options.stdout.write("value\n")

        for column in options.columns:
            options.stdout.write("\n".join(table[column]) + "\n")

    elif options.split_fields:

        # split comma separated fields
        fields, table = CSV.readTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=True)

        options.stdout.write("%s\n" % ("\t".join(fields)))

        for row in table:
            row = [x.split(options.separator) for x in row]
            for d in itertools.product(*row):
                options.stdout.write("%s\n" % "\t".join(d))

    elif options.group:
        readAndGroupTable(options.stdin, options)

    elif options.join_column:
        readAndJoinTable(options.stdin, options)

    elif options.expand_table:
        readAndExpandTable(options.stdin, options)

    elif options.collapse_table is not None:
        readAndCollapseTable(options.stdin, options, options.collapse_table)

    elif "grep" in options.methods:

        options.columns = [int(x) - 1 for x in options.columns.split(",")]

        patterns = []

        if options.file:
            infile = IOTools.openFile(options.file, "r")
            for line in infile:
                if line[0] == "#":
                    continue
                patterns.append(line[:-1].split(options.delimiter)[0])
        else:
            patterns = args

        for line in options.stdin:

            data = line[:-1].split(options.delimiter)
            found = False

            for c in options.columns:

                if data[c] in patterns:
                    found = True
                    break

            if (not found and options.invert_match) or (
                    found and not options.invert_match):
                print(line[:-1])
    else:

        ######################################################################
        ######################################################################
        ######################################################################
        # Apply remainder of transformations
        fields, table = CSV.readTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=False)
        # convert columns to list
        table = [list(x) for x in table]

        ncols = len(fields)
        if len(table) == 0:
            raise ValueError("table is empty")

        nrows = len(table[0])

        E.info("processing table with %i rows and %i columns" % (nrows, ncols))

        options.columns = getColumns(fields, options.columns)

        # convert all values to float
        for c in options.columns:
            for r in range(nrows):
                try:
                    table[c][r] = float(table[c][r])
                except ValueError:
                    continue

        for method in options.methods:

            if method == "normalize-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = [x / value for x in table[c]]

            elif method == "multiply-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = [x * value for x in table[c]]

            elif method == "normalize-by-max":

                for c in options.columns:
                    m = max(table[c])
                    table[c] = [x / m for x in table[c]]

            elif method == "kullback-leibler":
                options.stdout.write("category1\tcategory2\tkl1\tkl2\tmean\n")
                format = options.format
                if format is None:
                    format = "%f"

                for x in range(0, len(options.columns) - 1):
                    for y in range(x + 1, len(options.columns)):
                        c1 = options.columns[x]
                        c2 = options.columns[y]
                        e1 = 0
                        e2 = 0
                        for z in range(nrows):
                            p = table[c1][z]
                            q = table[c2][z]
                            e1 += p * math.log(p / q)
                            e2 += q * math.log(q / p)

                        options.stdout.write(
                            "%s\t%s\t%s\t%s\t%s\n" %
                            (fields[c1], fields[c2], format % e1, format % e2,
                             format % ((e1 + e2) / 2)))
                E.Stop()
                sys.exit(0)

            elif method == "rank":

                for c in options.columns:
                    tt = table[c]
                    t = list(zip(tt, list(range(nrows))))
                    t.sort()
                    for i, n in zip([x[1] for x in t], list(range(nrows))):
                        tt[i] = n

            elif method in ("lower-bound", "upper-bound"):

                boundary = float(options.parameters[0])
                del options.parameters[0]
                new_value = float(options.parameters[0])
                del options.parameters[0]

                if method == "upper-bound":
                    for c in options.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] > boundary:
                                table[c][r] = new_value
                else:
                    for c in options.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] < boundary:
                                table[c][r] = new_value

            elif method == "fdr":
                pvalues = []
                for c in options.columns:
                    pvalues.extend(table[c])

                assert max(pvalues) <= 1.0, "pvalues > 1 in table: max=%s" % \
                    str(max(pvalues))
                assert min(pvalues) >= 0, "pvalue < 0 in table: min=%s" % \
                    str(min(pvalues))

                # convert to str to avoid test for float downstream
                qvalues = list(
                    map(
                        str,
                        Stats.adjustPValues(pvalues,
                                            method=options.fdr_method)))

                if options.fdr_add_column is None:
                    x = 0
                    for c in options.columns:
                        table[c] = qvalues[x:x + nrows]
                        x += nrows
                else:
                    # add new column headers
                    if len(options.columns) == 1:
                        fields.append(options.fdr_add_column)
                    else:
                        for co in options.columns:
                            fields.append(options.fdr_add_column + fields[co])

                    x = 0
                    for c in options.columns:
                        # add a new column
                        table.append(qvalues[x:x + nrows])
                        x += nrows
                    ncols += len(options.columns)

            elif method == "normalize-by-table":

                other_table_name = options.parameters[0]
                del options.parameters[0]
                other_fields, other_table = CSV.readTable(
                    IOTools.openFile(other_table_name, "r"),
                    with_header=options.has_headers,
                    as_rows=False)

                # convert all values to float
                for c in options.columns:
                    for r in range(nrows):
                        try:
                            other_table[c][r] = float(other_table[c][r])
                        except ValueError:
                            continue

                # set 0s to 1 in the other matrix
                for c in options.columns:
                    for r in range(nrows):
                        if isinstance(table[c][r], float) and \
                                isinstance(other_table[c][r], float) and \
                                other_table[c][r] != 0:
                            table[c][r] /= other_table[c][r]
                        else:
                            table[c][r] = options.missing_value

        # convert back
        if options.format is not None:
            for c in options.columns:
                for r in range(nrows):
                    if isinstance(table[c][r], float):
                        table[c][r] = options.format % table[c][r]

        options.stdout.write("\t".join(fields) + "\n")
        if options.sort_rows:
            old2new = {}
            for r in range(nrows):
                old2new[table[0][r]] = r
            for x in options.sort_rows.split(","):
                if x not in old2new:
                    continue
                r = old2new[x]
                options.stdout.write(
                    "\t".join(map(str, [table[c][r]
                                        for c in range(ncols)])) + "\n")
        else:
            for r in range(nrows):
                options.stdout.write(
                    "\t".join(map(str, [table[c][r]
                                        for c in range(ncols)])) + "\n")

    E.Stop()
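
For reference, the kullback-leibler method above computes, for every pair of value columns p and q, the two directed divergences sum(p*log(p/q)) and sum(q*log(q/p)) plus their mean. In isolation, with made-up distributions:

import math

p = [0.1, 0.4, 0.5]
q = [0.2, 0.3, 0.5]

kl_pq = sum(pi * math.log(pi / qi) for pi, qi in zip(p, q))
kl_qp = sum(qi * math.log(qi / pi) for pi, qi in zip(p, q))
print(kl_pq, kl_qp, (kl_pq + kl_qp) / 2)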
Example #22
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: data2bins.py 2782 2009-09-10 11:40:29Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("--column", dest="column", type="int",
                      help="column to split on.")

    parser.add_option("--num-bins", dest="num_bins", type="int",
                      help="number of bins to create.")

    parser.add_option("--method", dest="method", type="choice",
                      choices=("equal-sized-bins",),
                      help="method to use to bin data.")

    parser.add_option("--no-headers", dest="has_headers", action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("-p", "--output-filename-pattern", dest="output_filename_pattern", type="string",
                      help="OUTPUT filename with histogram information on aggregate coverages [%default].")

    parser.set_defaults(
        has_headers=True,
        method="equal-sized-bins",
        column=1,
        num_bins=4,
        output_filename_pattern="bin%i",
    )

    (options, args) = E.Start(parser)
    options.column -= 1

    if args:
        if args[0] == "-":
            infile = sys.stdin
        else:
            infile = IOTools.openFile(args[0], "r")
    else:
        infile = sys.stdin

    fields, data = CSV.readTable(infile)

    c = options.column
    values = [float(x[c]) for x in data]

    bins = []

    if options.method == "equal-sized-bins":
        increment = int(math.floor(float(len(values)) / options.num_bins))
        indices = list(range(0, len(values)))
        indices.sort(key=lambda x: values[x])
        for x in range(len(values)):
            values[indices[x]] = x
        bins = list(range(0, len(values) - increment, increment))

    elif options.method == "pass":
        pass

    E.debug("bins=%s" % str(bins))

    outputters = []
    for x in range(0, len(bins)):
        outputters.append(
            Outputter(options.output_filename_pattern % x, fields))

    # output tables
    for x in range(0, len(data)):
        bin = bisect.bisect(bins, values[x]) - 1
        outputters[bin].write(data[x])

    # stats
    if options.loglevel >= 1:
        options.stdlog.write("# bin\tstart\tcounts\tfilename\n")
        for x in range(0, len(bins)):
            options.stdlog.write("# %i\t%f\t%i\t%s\n" % (
                x, bins[x], outputters[x].mCounts, outputters[x].mFilename))

    E.info("ninput=%i, noutput=%i" %
           (len(data), sum((x.mCounts for x in outputters))))

    E.Stop()
Example #23
                       choices=("percent-difference", "aa"),
                       help="sort order of output table.")

    parser.add_option("-g", "--global-sort", dest="global_sort",
                      action="store_true",
                      help="globally sort results (otherwise: by species pair).")

    parser.set_defaults(
        methods="",
        is_frequencies=False,
        sort="percent-difference",
        global_sort=False,
    )

    (options, args) = E.Start(parser)
    if options.methods:
        options.methods = options.methods.split(",")

    fields, table = CSV.ReadTable(sys.stdin)

    # convert weights to floats
    table = CSV.getConvertedTable(table, range(1, len(fields)))

    for method in options.methods:

        if method == "overview":
            if options.is_frequencies:
                WriteOverviewFrequencies(fields, table, options)
            else:
                WriteOverviewWeights(fields, table, options)
Example #24
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: csv_cut.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-r",
                      "--remove",
                      dest="remove",
                      action="store_true",
                      help="remove specified columns, keep all others.")

    parser.add_option("-u",
                      "--unique",
                      dest="unique",
                      action="store_true",
                      help="output rows are unique.")

    parser.add_option(
        "-l",
        "--large",
        dest="large",
        action="store_true",
        help=
        "large columns. Do not use native python CSV module [default=%default]."
    )

    parser.add_option("-f",
                      "--filename-fields",
                      dest="filename_fields",
                      type="string",
                      help="filename with field information.")

    parser.set_defaults(
        remove=False,
        unique=False,
        filename_fields=None,
    )

    (options, args) = E.Start(parser, add_csv_options=True, quiet=True)

    input_fields = args

    if options.filename_fields:
        input_fields = map(
            lambda x: x[:-1].split("\t")[0],
            filter(lambda x: x[0] != "#",
                   open(options.filename_fields, "r").readlines()))

    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    while 1:
        line = sys.stdin.readline()

        if not line:
            E.Stop()
            sys.exit(0)

        if line[0] == "#":
            continue

        first_line = line
        break

    old_fields = first_line[:-1].split("\t")

    fields = []
    for f in input_fields:
        # do pattern search
        if f[0] == "%" and f[-1] == "%":
            pattern = re.compile(f[1:-1])
            for o in old_fields:
                if pattern.search(o) and o not in fields:
                    fields.append(o)
        else:
            if f in old_fields:
                fields.append(f)

    if options.remove:
        fields = set(fields)
        fields = [x for x in old_fields if x not in fields]

    if options.large:
        reader = CSV.DictReaderLarge(CommentStripper(sys.stdin),
                                     fieldnames=old_fields,
                                     dialect=options.csv_dialect)
    else:
        reader = csv.DictReader(CommentStripper(sys.stdin),
                                fieldnames=old_fields,
                                dialect=options.csv_dialect)

    writer = csv.DictWriter(outfile,
                            fields,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    print "\t".join(fields)

    first_row = True
    ninput, noutput, nerrors = 0, 0, 0

    while 1:
        ninput += 1
        try:
            row = reader.next()
        except _csv.Error, msg:
            options.stderr.write("# error while parsing: %s\n" % (msg))
            nerrors += 1
            continue
        except StopIteration:
            break
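
Field arguments wrapped in % are treated as regular expressions against the existing header, everything else as a literal column name. A minimal stand-alone illustration of that selection logic (standard library only, toy headers):

import re

old_fields = ["gene_id", "exp_liver", "exp_brain", "length"]
input_fields = ["gene_id", "%exp_%"]

fields = []
for f in input_fields:
    if f.startswith("%") and f.endswith("%"):
        pattern = re.compile(f[1:-1])
        for o in old_fields:
            if pattern.search(o) and o not in fields:
                fields.append(o)
    elif f in old_fields:
        fields.append(f)

print(fields)  # ['gene_id', 'exp_liver', 'exp_brain']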
Example #25
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: csv_cut.py 2782 2009-09-10 11:40:29Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-r", "--remove", dest="remove", action="store_true",
                      help="remove specified columns, keep all others.")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="output rows are unique.")

    parser.add_option("-l", "--large", dest="large", action="store_true",
                      help="large columns. Do not use native python CSV module [default=%default].")

    parser.add_option("-f", "--filename-fields", dest="filename_fields", type="string",
                      help="filename with field information.")

    parser.set_defaults(
        remove=False,
        unique=False,
        filename_fields=None,
    )

    (options, args) = E.Start(parser,
                              add_csv_options=True,
                              quiet=True)

    statement = " ".join(args)

    if options.large:
        reader = CSV.DictReaderLarge(CommentStripper(sys.stdin),
                                     dialect=options.csv_dialect)
    else:
        reader = csv.DictReader(CommentStripper(sys.stdin),
                                dialect=options.csv_dialect)

    exec "f = lambda r: %s" % statement in locals()

    counter = E.Counter()
    writer = csv.DictWriter(options.stdout,
                            reader.fieldnames,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator)

    writer.writerow(dict((fn, fn) for fn in reader.fieldnames))

    while 1:
        counter.input += 1
        try:
            row = reader.next()
        except _csv.Error, msg:
            options.stderr.write("# error while parsing: %s\n" % (msg))
            counter.errors += 1
            continue
        except StopIteration:
            break
Example #26
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_intersection.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="output rows are unique.")

    parser.set_defaults(
        remove=False,
        unique=False,
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    if len(args) != 2:
        raise ValueError("please specify two files to join")

    options.filename1, options.filename2 = args

    table1 = CSV.readTable(IOTools.openFile(options.filename1, "r"))
    table2 = CSV.readTable(IOTools.openFile(options.filename2, "r"))

    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    # build new field list
    new_fields = []

    for x in options.join_fields1:
        new_fields.append(x)

    for x in fields1:
        if x not in options.join_fields1:
            new_fields.append(x)
        if x not in options.join_fields2:
            new_fields.append(x)

        writer = csv.DictWriter(outfile,
                                fields,
                                dialect=options.csv_dialect,
                                lineterminator=options.csv_lineterminator,
                                extrasaction='ignore')

    if len(lines) > 0:

        old_fields = lines[0][:-1].split("\t")

        if options.remove:
            fields = []
            for x in old_fields:
                if x not in input_fields:
                    fields.append(x)
        else:
            fields = input_fields

        reader = csv.DictReader(lines,
                                dialect=options.csv_dialect)

        print("\t".join(fields))

        first_row = True
        for row in reader:
            row = IOTools.convertDictionary(row)
            writer.writerow(row)

    E.Stop()
Example #27
        existing_tables = set([x[0] for x in cc])
        cc.close()

        # use , as separator
        quick_import_statement = \
            "sqlite3 %s '.import %%s %s'" % \
            (options.database_name, options.tablename)

        quick_import_separator = "|"

    if options.header is not None:
        options.header = [x.strip() for x in options.header.split(",")]

    if options.utf:
        reader = CSV.UnicodeDictReader(infile,
                                       dialect=options.dialect,
                                       fieldnames=options.header)
    else:
        reader = CSV.DictReader(infile,
                                dialect=options.dialect,
                                fieldnames=options.header)

    if options.replace_header:
        try:
            next(reader)
        except StopIteration:
            pass

    E.info("reading %i columns to guess column types" % options.guess_size)

    rows = []
Ejemplo n.º 28
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("-u",
                      "--ucsc-genome",
                      dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("--extend",
                      dest="extension",
                      type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size",
                      dest="shift",
                      type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size",
                      dest="window_size",
                      type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations",
                      type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t",
                      "--toolset",
                      dest="toolset",
                      type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment", "dmr",
                               "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w",
                      "--bigwig-file",
                      dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment",
                      dest="treatment_files",
                      type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control",
                      dest="control_files",
                      type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input",
                      dest="input_files",
                      type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip",
                      action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata",
                      dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --ouptut-filename-pattern [%default].")

    parser.add_option("--rdata-file",
                      dest="input_rdata",
                      type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold",
                      dest="fdr_threshold",
                      type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method",
                      dest="fdr_method",
                      type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa",
                      dest="bwa",
                      action="store_true",
                      help="alignment generated with bwa"
                      "[default=%default].")

    parser.add_option("--unique",
                      dest="unique",
                      type="float",
                      help="Threshold p-value to determine which read pile\
                      ups are the result of PCR overamplification"
                      "[default=%default].")

    parser.add_option("--chroms",
                      dest="chroms",
                      type="str",
                      help="Comma delimited list of chromosomes to include"
                      "[default=%default].")

    parser.set_defaults(input_format="bam",
                        ucsc_genome="Hsapiens.UCSC.hg19",
                        genome_file=None,
                        extend=0,
                        shift=0,
                        window_size=300,
                        saturation_iterations=10,
                        toolset=[],
                        bigwig=False,
                        treatment_files=[],
                        control_files=[],
                        input_files=[],
                        output_rdata=False,
                        input_rdata=None,
                        is_medip=True,
                        fdr_threshold=0.1,
                        fdr_method="BH",
                        bwa=False,
                        unique=0.001,
                        chroms=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in CSV.DictReader(options.stdin, dialect="excel-tab"):
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" %
                        (line['chr'], int(line['start']), int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        float(line['edgeR.adj.p.value']),
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        ["0", "1"][float(line['edgeR.adj.p.value']) <
                                   options.fdr_threshold],
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" % (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return

    if len(options.treatment_files) < 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    if options.chroms is None:
        chrstring = ""
    else:
        chroms = options.chroms.split(",")
        chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms)
    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.%s' % options.ucsc_genome
    R.library(genome_file)

    window_size = options.window_size
    extend = options.extend
    shift = options.shift
    saturation_iterations = options.saturation_iterations

    uniq = float(options.unique)

    if options.bwa is True:
        BWA = "TRUE"
    else:
        BWA = "FALSE"

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''sr = MEDIPS.saturation(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            window_size=%(window_size)i,
            uniq=%(uniq)s,
            nit = %(saturation_iterations)i,
            paired = %(paired)s,
            bwa = %(BWA)s,
            %(chrstring)s
            nrit = 1)''' % locals())

            R.png(E.getOutputFile("%s_saturation.png" % fn))
            R('''MEDIPS.plotSaturation(sr)''')
            R('''dev.off()''')
            R('''write.table(sr$estimation, file ='%s', sep='\t')''' %
              E.getOutputFile("%s_saturation_estimation.tsv" % fn))

            outfile = IOTools.openFile(
                E.getOutputFile("%s_saturation.tsv" % fn), "w")
            outfile.write("category\tvalues\n")
            outfile.write("estimated_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxEstCor''')]))
            outfile.write("true_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxTruCor''')]))
            outfile.write("nreads\t%s\n" %
                          ",".join(["%i" % x
                                    for x in R('''sr$numberReads''')]))
            outfile.close()

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''cr = MEDIPS.seqCoverage(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            pattern='CG',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            R.png(E.getOutputFile("%s_cpg_coverage_pie.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''')
            R('''dev.off()''')

            R.png(E.getOutputFile("%s_cpg_coverage_hist.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "hist", t=15)''')
            R('''dev.off()''')

            # note: this file is large
            R('''write.table(cr$cov.res, file=gzfile('%s','w'),
            sep='\t')''' %
              E.getOutputFile("%s_saturation_coveredpos.tsv.gz" % fn))

    if 'enrichment' in options.toolset or do_all:
        E.info("CpG enrichment analysis")
        outfile = IOTools.openFile(E.getOutputFile("enrichment.tsv.gz"), "w")
        slotnames = (("regions.CG", "regions_CG",
                      "%i"), ("regions.C", "regions_C",
                              "%s"), ("regions.G", "regions_G", "%f"),
                     ("regions.relH", "regions_relH",
                      "%i"), ("regions.GoGe", "regions_GoGe",
                              "%i"), ("genome.CG", "genome_CG",
                                      "%s"), ("genome.C", "genome_C", "%s"),
                     ("genome.G", "genome_G", "%i"), ("genome.relH",
                                                      "genome_relH", "%i"),
                     ("enrichment.score.relH", "enrichment_relH", "%s"),
                     ("enrichment.score.GoGe", "enrichment_GoGe", "%s"))

        outfile.write("\t".join(['sample'] + [x[1] for x in slotnames]) + "\n")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''ce = MEDIPS.CpGenrich(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            outfile.write("%s" % fn)
            for slotname, label, pattern in slotnames:
                value = tuple(R('''ce$%s''' % slotname))
                if len(value) == 0:
                    # slot missing from the result: leave the field empty
                    outfile.write("\t")
                else:
                    outfile.write("\t%s" % pattern % value[0])
            outfile.write("\n")
        outfile.close()

    if options.input_rdata:
        E.info("reading R session info from '%s'" % options.input_rdata)
        R('''load('%s')''' % options.input_rdata)

    else:
        if "dmr" in options.toolset or "correlation" in options.toolset \
           or do_all:
            # build four sets
            for x, fn in enumerate(options.treatment_files):
                paired = isPaired(fn)
                E.info("loading '%s'" % fn)
                R('''treatment_R%(x)i = MEDIPS.createSet(
                file='%(fn)s',
                BSgenome='%(genome_file)s',
                shift=%(shift)i,
                extend=%(extend)i,
                window_size=%(window_size)i,
                paired=%(paired)s,
                bwa=%(BWA)s,
                %(chrstring)s
                uniq=%(uniq)s)''' % locals())
            R('''treatment_set = c(%s)''' % ",".join([
                "treatment_R%i" % x
                for x in range(len(options.treatment_files))
            ]))

            if options.control_files:
                for x, fn in enumerate(options.control_files):
                    paired = isPaired(fn)
                    E.info("loading '%s'" % fn)
                    R('''control_R%(x)i = MEDIPS.createSet(
                    file='%(fn)s',
                    BSgenome='%(genome_file)s',
                    shift=%(shift)i,
                    extend=%(extend)i,
                    window_size=%(window_size)i,
                    paired=%(paired)s,
                    bwa=%(BWA)s,
                    %(chrstring)s
                    uniq=%(uniq)s)''' % locals())
                R('''control_set = c(%s)''' % ",".join([
                    "control_R%i" % x
                    for x in range(len(options.control_files))
                ]))

            # build coupling vector
            R('''CS = MEDIPS.couplingVector(pattern="CG",
            refObj = treatment_set[[1]])''')

            if "correlation" in options.toolset or do_all:
                R('''cor.matrix = MEDIPS.correlation(
                c(treatment_set, control_set))''')

                R('''write.table(cor.matrix,
                file='%s',
                sep="\t")''' % E.getOutputFile("correlation"))

            if "dmr" in options.toolset or do_all:
                # Data that does not fit the model causes
                # "Error in 1:max_signal_index : argument of length 0"
                # The advice is to set MeDIP=FALSE
                # See: http://comments.gmane.org/
                # gmane.science.biology.informatics.conductor/52319

                if options.is_medip:
                    medip = "TRUE"
                else:
                    medip = "FALSE"
                fdr_method = options.fdr_method

                E.info("applying test for differential methylation")
                R('''meth = MEDIPS.meth(
                MSet1 = treatment_set,
                MSet2 = control_set,
                CSet = CS,
                ISet1 = NULL,
                ISet2 = NULL,
                p.adj = "%(fdr_method)s",
                diff.method = "edgeR",
                MeDIP = %(medip)s,
                CNV = F,
                minRowSum = 1)''' % locals())

                # Note: several Gb in size
                # Output full methylation data table
                R('''write.table(meth,
                file=gzfile('%s', 'w'),
                sep="\t",
                row.names=F,
                quote=F)''' % E.getOutputFile("data.tsv.gz"))

                # save R session
                if options.output_rdata:
                    R('''save.image(file='%s', safe=FALSE)''' %
                      E.getOutputFile("session.RData"))

    # DMR analysis - test for windows and output
    if "dmr" in options.toolset:

        E.info("selecting differentially methylated windows")

        # test windows for differential methylation
        fdr_threshold = options.fdr_threshold
        R('''tested = MEDIPS.selectSig(meth,
        adj=T,
        ratio=NULL,
        p.value=%(fdr_threshold)f,
        bg.counts=NULL,
        CNV=F)''' % locals())

        R('''write.table(tested,
        file=gzfile('%s', 'w'),
        sep="\t",
        quote=F)''' % E.getOutputFile("significant_windows.gz"))

        # select gain and merge adjacent windows
        try:
            R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),];
            gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''')
            E.info('gain output: %s, merged: %s' %
                   (str(R('''dim(gain)''')), str(R('''dim(gain_merged)'''))))
            R('''of=gzfile('%s', 'w');
            write.table(gain_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=FALSE,
            col.names=FALSE); close(of)''' % E.getOutputFile("gain.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute gain windows: msg=%s" % msg)
        # select loss and merge adjacent windows
        try:
            R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),];
            loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''')
            E.info('loss output: %s, merged: %s' %
                   (str(R('''dim(loss)''')), str(R('''dim(loss_merged)'''))))

            R('''of=gzfile('%s', 'w');
            write.table(loss_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=F,
            col.names=F); close(of)''' % E.getOutputFile("loss.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute loss windows: msg=%s" % msg)

    # if "rpm" in options.toolset or do_all:
    #     outputfile = E.getOutputFile("rpm.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = T, descr = "rpm")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # if "rms" in options.toolset or do_all:
    #     outputfile = E.getOutputFile("rms.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = F, descr = "rms")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # write footer and output benchmark information.
    E.Stop()
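
The R calls in the example above are assembled by interpolating local Python
variables into R code templates with "% locals()". The snippet below is a
standalone sketch of that templating step only (the file name and parameter
values are invented for illustration); it prints the generated R command
instead of executing it through rpy2.

def build_createset_call(fn, genome_file, shift=0, extend=0,
                         window_size=300, paired=False, uniq=0.001):
    # booleans have to be rendered as R literals before interpolation
    paired = "TRUE" if paired else "FALSE"
    return '''MEDIPS.createSet(
    file='%(fn)s',
    BSgenome='%(genome_file)s',
    shift=%(shift)i,
    extend=%(extend)i,
    window_size=%(window_size)i,
    paired=%(paired)s,
    uniq=%(uniq)s)''' % locals()

print(build_createset_call("treatment.bam", "BSgenome.Hsapiens.UCSC.hg19"))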
Ejemplo n.º 29
0
def run(infile, options, report_step=10000):

    options.tablename = quoteTableName(options.tablename,
                                       backend=options.database_backend)

    if options.map:
        m = {}
        for x in options.map:
            f, t = x.split(":")
            m[f] = t
        options.map = m
    else:
        options.map = {}

    existing_tables = set()

    quick_import_separator = "\t"

    if options.database_backend == "postgres":
        import psycopg2
        raise NotImplementedError("needs refactoring for commandline options")
        dbhandle = psycopg2.connect(options.psql_connection)
        error = psycopg2.Error
        options.null = "NULL"
        options.string_value = "'%s'"
        options.text = "TEXT"
        options.index = "TEXT"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    elif options.database_backend == "mysql":
        import MySQLdb
        dbhandle = MySQLdb.connect(host=options.database_host,
                                   user=options.database_username,
                                   passwd=options.database_password,
                                   port=options.database_port,
                                   db=options.database_name)
        error = Exception
        options.null = "NULL"
        options.string_value = "%s"
        options.text = "TEXT"
        options.index = "VARCHAR(40)"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    elif options.backend == "sqlite":
        import sqlite3
        dbhandle = sqlite3.connect(options.database_name)
        try:
            os.chmod(options.database_name, 0o664)
        except OSError as msg:
            E.warn("could not change permissions of database: %s" % msg)

        # Avoid the following error:
        # sqlite3.ProgrammingError: You must not use 8-bit bytestrings
        # unless you use a text_factory that can interpret 8-bit
        # bytestrings (like text_factory = str). It is highly
        # recommended that you instead just switch your application
        # to Unicode strings
        # Note: might be better to make csv2db unicode aware.
        dbhandle.text_factory = str

        error = sqlite3.OperationalError
        options.insert_many = True  # False
        options.null = None  # "NULL"
        options.text = "TEXT"
        options.index = "TEXT"
        options.string_value = "%s"  # "'%s'"

        statement = "SELECT name FROM sqlite_master WHERE type='table'"
        cc = executewait(dbhandle, statement, error, options.retry)
        existing_tables = set([x[0] for x in cc])
        cc.close()

        # use , as separator
        quick_import_statement = \
            "sqlite3 %s '.import %%s %s'" % \
            (options.database_name, options.tablename)

        quick_import_separator = "|"

    if options.header is not None:
        options.header = [x.strip() for x in options.header.split(",")]

    if options.utf:
        reader = CSV.UnicodeDictReader(infile,
                                       dialect=options.dialect,
                                       fieldnames=options.header)
    else:
        reader = csv.DictReader(CSV.CommentStripper(infile),
                                dialect=options.dialect,
                                fieldnames=options.header)

    if options.replace_header:
        try:
            next(reader)
        except StopIteration:
            pass

    E.info("reading %i columns to guess column types" % options.guess_size)

    rows = []
    for row in reader:
        if None in row:
            raise ValueError("undefined columns in input file at row: %s" %
                             row)

        try:
            rows.append(IOTools.convertDictionary(row, map=options.map))
        except (TypeError, ValueError) as msg:
            E.warn("incomplete line? Type error in conversion: "
                   "'%s' with data: %s" % (msg, str(row)))

        if len(rows) >= options.guess_size:
            break

    E.info("read %i rows for type guessing" % len(rows))
    E.info("creating table")

    if len(rows) == 0:
        if options.allow_empty:
            if not reader.fieldnames:
                E.warn("no data - no table created")
            else:
                # create empty table and exit
                take, map_column2type, ignored = createTable(
                    dbhandle,
                    error,
                    options.tablename,
                    options,
                    retry=options.retry,
                    headers=reader.fieldnames,
                    ignore_empty=options.ignore_empty,
                    ignore_columns=options.ignore_columns,
                    rename_columns=options.rename_columns,
                    lowercase=options.lowercase,
                    ignore_duplicates=options.ignore_duplicates,
                    indices=options.indices,
                    first_column=options.first_column,
                    existing_tables=existing_tables,
                    append=options.append)
                E.info("empty table created")
            return
        else:
            raise ValueError("empty table")
    else:
        take, map_column2type, ignored = createTable(
            dbhandle,
            error,
            options.tablename,
            options,
            rows=rows,
            retry=options.retry,
            headers=reader.fieldnames,
            ignore_empty=options.ignore_empty,
            ignore_columns=options.ignore_columns,
            rename_columns=options.rename_columns,
            lowercase=options.lowercase,
            ignore_duplicates=options.ignore_duplicates,
            indices=options.indices,
            first_column=options.first_column,
            existing_tables=existing_tables,
            append=options.append)

    def row_iter(rows, reader):
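        # first replay the rows already consumed for type guessing,
        # then stream the remaining input rows from the reader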
        for row in rows:
            yield quoteRow(row,
                           take,
                           map_column2type,
                           options.missing_values,
                           null=options.null,
                           string_value=options.string_value)
        for data in reader:
            yield quoteRow(IOTools.convertDictionary(data, map=options.map),
                           take,
                           map_column2type,
                           options.missing_values,
                           null=options.null,
                           string_value=options.string_value)

    ninput = 0

    E.info("inserting data")

    if options.insert_quick:
        E.info("using quick insert")

        outfile, filename = tempfile.mkstemp()

        E.debug("dumping data into %s" % filename)

        for d in row_iter(rows, reader):

            ninput += 1
            os.write(
                outfile,
                quick_import_separator.join([str(d[x]) for x in take]) + "\n")

            if ninput % report_step == 0:
                E.info("iteration %i\n" % ninput)

        os.close(outfile)

        statement = quick_import_statement % filename
        E.debug(statement)

        # infinite loop possible
        while 1:

            retcode = E.run(statement, cwd=os.getcwd(), close_fds=True)

            if retcode != 0:
                E.warn("import error using statement: %s" % statement)

                if not options.retry:
                    raise ValueError("import error using statement: %s" %
                                     statement)

                time.sleep(5)
                continue

            break

        os.remove(filename)

        # the quick import can not insert NULL values; missing values end up
        # as the string 'None', so reset those cells to NULL afterwards.
        for column in take:
            executewait(
                dbhandle, "UPDATE %s SET %s = NULL WHERE %s = 'None'" %
                (options.tablename, column, column), error, options.retry)

    elif options.insert_many:
        data = []
        for d in row_iter(rows, reader):
            ninput += 1

            data.append([d[x] for x in take])

            if ninput % report_step == 0:
                E.info("iteration %i" % ninput)

        statement = "INSERT INTO %s VALUES (%s)" % (options.tablename,
                                                    ",".join("?" * len(take)))

        E.info("inserting %i rows" % len(data))
        E.debug("multiple insert:\n# %s" % statement)

        while 1:
            try:
                dbhandle.executemany(statement, data)
            except error as msg:
                E.warn("import failed: msg=%s, statement=\n  %s" %
                       (msg, statement))
                # TODO: check for database locked msg
                if not options.retry:
                    raise error(msg)
                if not re.search("locked", str(msg)):
                    raise error(msg)
                time.sleep(5)
                continue
            break

    else:
        # insert line by line (could not figure out how to do bulk loading with
        # subprocess and COPY FROM STDIN)
        statement = "INSERT INTO %s VALUES (%%(%s)s)" % (options.tablename,
                                                         ')s, %('.join(take))
        # output data used for guessing:
        for d in row_iter(rows, reader):

            ninput += 1
            E.debug("single insert:\n# %s" % (statement % d))
            cc = executewait(dbhandle,
                             statement,
                             error,
                             retry=options.retry,
                             args=d)
            cc.close()

            if ninput % report_step == 0:
                E.info("iteration %i" % ninput)

    E.info("building indices")
    nindex = 0
    for index in options.indices:

        nindex += 1
        try:
            statement = "CREATE INDEX %s_index%i ON %s (%s)" % (
                options.tablename, nindex, options.tablename, index)
            cc = executewait(dbhandle, statement, error, options.retry)
            cc.close()
            E.info("added index on column %s" % (index))
        except error as msg:
            E.info("adding index on column %s failed: %s" % (index, msg))

    statement = "SELECT COUNT(*) FROM %s" % (options.tablename)
    cc = executewait(dbhandle, statement, error, options.retry)
    result = cc.fetchone()
    cc.close()

    noutput = result[0]

    E.info("ninput=%i, noutput=%i, nskipped_columns=%i" %
           (ninput, noutput, len(ignored)))

    dbhandle.commit()
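
The insert_many branch above feeds batched rows to cursor.executemany. As a
rough, self-contained usage illustration (table name and rows invented for
this sketch), the same pattern can be exercised against an in-memory SQLite
database:

import sqlite3

rows = [{"id": "a", "value": 1.0}, {"id": "b", "value": 2.5}]
take = ["id", "value"]

dbhandle = sqlite3.connect(":memory:")
dbhandle.execute("CREATE TABLE data (id TEXT, value FLOAT)")

# one placeholder per inserted column, as in the insert_many branch above
statement = "INSERT INTO data VALUES (%s)" % ",".join("?" * len(take))
dbhandle.executemany(statement, [[d[x] for x in take] for d in rows])
dbhandle.commit()

print(dbhandle.execute("SELECT COUNT(*) FROM data").fetchone()[0])  # 2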
Ejemplo n.º 30
0
def readAndGroupTable(infile, options):
    """read table from infile and group.
    """
    fields, table = CSV.readTable(
        infile, with_header=options.has_headers, as_rows=True)
    options.columns = getColumns(fields, options.columns)
    assert options.group_column not in options.columns

    converter = float
    new_fields = [fields[options.group_column]] + [fields[x]
                                                   for x in options.columns]

    if options.group_function == "min":
        f = min
    elif options.group_function == "max":
        f = max
    elif options.group_function == "sum":
        f = lambda z: reduce(lambda x, y: x + y, z)
    elif options.group_function == "mean":
        f = scipy.mean
    elif options.group_function == "cat":
        f = lambda x: ";".join([y for y in x if y != ""])
        converter = str
    elif options.group_function == "uniq":
        f = lambda x: ";".join([y for y in set(x) if y != ""])
        converter = str
    elif options.group_function == "stats":
        f = lambda x: str(Stats.DistributionalParameters(x))
        # update headers
        new_fields = [fields[options.group_column]]
        for c in options.columns:
            new_fields += list(map(lambda x: "%s_%s" %
                                   (fields[c], x), Stats.DistributionalParameters().getHeaders()))

    # convert values to floats (except for group_column)
    # Delete rows with unconvertable values and not in options.columns
    new_table = []
    for row in table:
        skip = False
        new_row = [row[options.group_column]]

        for c in options.columns:
            if row[c] == options.missing_value:
                new_row.append(row[c])
            else:
                try:
                    new_row.append(converter(row[c]))
                except ValueError:
                    skip = True
                    break
        if not skip:
            new_table.append(new_row)
    table = new_table

    new_rows = CSV.groupTable(table,
                              group_column=0,
                              group_function=f)

    options.stdout.write("\t".join(new_fields) + "\n")
    for row in new_rows:
        options.stdout.write("\t".join(map(str, row)) + "\n")
Ejemplo n.º 31
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: csv_set.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-u",
                      "--unique",
                      dest="unique",
                      action="store_true",
                      help="output rows are uniq.")

    parser.add_option("-1",
                      "--join-fields1",
                      dest="join_fields1",
                      type="string",
                      help="join fields in first table.")
    parser.add_option("-2",
                      "--join-fields2",
                      dest="join_fields2",
                      type="string",
                      help="join fields in second table.")
    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      help="set operation to perform.",
                      choices=("intersection", "rest", "union"))

    parser.set_defaults(
        remove=False,
        unique=False,
        join_fields1=None,
        join_fields2=None,
        method="intersection",
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    if len(args) != 2:
        raise ValueError("please specify two files to join")

    if not options.join_fields1 or not options.join_fields2:
        raise ValueError("please specify at least one join field per table")

    options.join_fields1 = options.join_fields1.split(",")
    options.join_fields2 = options.join_fields2.split(",")

    options.filename1, options.filename2 = args

    fields1, table1 = CSV.readTable(open(options.filename1, "r"))
    fields2, table2 = CSV.readTable(open(options.filename2, "r"))

    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    nfields1 = []
    for x in range(len(fields1)):
        if fields1[x] in options.join_fields1:
            nfields1.append(x)
    nfields2 = []
    for x in range(len(fields2)):
        if fields2[x] in options.join_fields2:
            nfields2.append(x)

    # calculate row indices: duplicate keys are not taken care of here
    keys = {}
    for row1 in table1:
        v = [row1[x] for x in nfields1]
        key = hashlib.md5("".join(v).encode("utf-8")).digest()
        keys[key] = row1

    if options.method == "intersection":
        # build new field list
        take = range(len(fields1))
        c = len(take)
        for x in fields2:
            if x not in options.join_fields2:
                take.append(c)
            c += 1

        t = fields1 + fields2

        new_fields = map(lambda x: t[x], take)

        print "\t".join(new_fields)

        for row2 in table2:
            v = map(lambda x: row2[x], nfields2)
            key = hashlib.md5("".join(v)).digest()
            if key in keys:
                new_row = keys[key] + row2
                outfile.write("\t".join(map(lambda x: new_row[x], take)) +
                              "\n")

    elif options.method == "rest":

        new_fields = fields2
        print "\t".join(new_fields)

        for row2 in table2:
            v = map(lambda x: row2[x], nfields2)
            key = hashlib.md5("".join(v)).digest()
            if key not in keys:
                outfile.write("\t".join(row2) + "\n")

    E.Stop()
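
A compact, self-contained sketch of the key-based intersection used above, on
made-up rows (the join-field values are concatenated and hashed, and only rows
of the second table whose key also occurs in the first table are kept):

import hashlib

def make_key(row, columns):
    # hash the concatenated join-field values, as in the example above
    return hashlib.md5("".join(row[c] for c in columns).encode("utf-8")).digest()

table1 = [["id1", "x"], ["id2", "y"]]
table2 = [["id2", "kept"], ["id3", "dropped"]]

keys = {make_key(row, [0]): row for row in table1}
print([row for row in table2 if make_key(row, [0]) in keys])  # [['id2', 'kept']]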
Ejemplo n.º 32
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=("transpose", "normalize-by-max", "normalize-by-value",
                 "multiply-by-value",
                 "percentile", "remove-header", "normalize-by-table",
                 "upper-bound", "lower-bound", "kullback-leibler",
                 "expand", "compress", "fdr", "grep"),
        help="""actions to perform on table.""")

    parser.add_option("-s", "--scale", dest="scale", type="float",
                      help="factor to scale matrix by.")

    parser.add_option("-f", "--format", dest="format", type="string",
                      help="output number format [default]")

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="Parameters for various functions.")

    parser.add_option(
        "-t", "--header-names", dest="has_headers", action="store_true",
        help="matrix has row/column headers.")

    parser.add_option("--transpose", dest="transpose", action="store_true",
                      help="transpose table.")

    parser.add_option(
        "--set-transpose-field", dest="set_transpose_field", type="string",
        help="set first field (row 1 and col 1) to this value [%default].")

    parser.add_option(
        "--transpose-format", dest="transpose_format", type="choice",
        choices=("default", "separated", ),
        help="input format of un-transposed table")

    parser.add_option(
        "--expand", dest="expand_table", action="store_true",
        help="expand table - multi-value cells with be expanded over "
        "several rows.")

    parser.add_option("--no-headers", dest="has_headers", action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("--columns", dest="columns", type="string",
                      help="columns to use.")

    parser.add_option("--file", dest="file", type="string",
                      help="columns to test from table.",
                      metavar="FILE")

    parser.add_option("-d", "--delimiter", dest="delimiter", type="string",
                      help="delimiter of columns.",
                      metavar="DELIM")

    parser.add_option(
        "-V", "--invert-match", dest="invert_match",
        action="store_true",
        help="invert match.")

    parser.add_option("--sort-by-rows", dest="sort_rows", type="string",
                      help="output order for rows.")

    parser.add_option("-a", "--value", dest="value", type="float",
                      help="value to use for various algorithms.")

    parser.add_option(
        "--group", dest="group_column", type="int",
        help="group values by column. Supply an integer column "
        "[default=%default]")

    parser.add_option("--group-function", dest="group_function", type="choice",
                      choices=(
                          "min", "max", "sum", "mean", "stats", "cat", "uniq"),
                      help="function to group values by.")

    parser.add_option("--join-table", dest="join_column", type="int",
                      help="join rows in a table by columns.")

    parser.add_option(
        "--collapse-table", dest="collapse_table", type="string",
        help="collapse a table. Value determines the missing variable "
        "[%default].")

    parser.add_option(
        "--join-column-name", dest="join_column_name", type="int",
        help="use this column as a prefix.")

    parser.add_option(
        "--flatten-table", dest="flatten_table", action="store_true",
        help="flatten a table [%default].")

    parser.add_option("--as-column", dest="as_column", action="store_true",
                      help="output table as a single column.")

    parser.add_option(
        "--split-fields", dest="split_fields", action="store_true",
        help="split fields.")

    parser.add_option(
        "--separator", dest="separator", type="string",
        help="separator for multi-valued fields [default=%default].")

    parser.add_option(
        "--fdr-method", dest="fdr_method", type="choice",
        choices=(
            "BH", "bonferroni", "holm", "hommel", "hochberg", "BY"),
        help="method to perform multiple testing correction by controlling "
        "the fdr [default=%default].")

    parser.add_option(
        "--fdr-add-column", dest="fdr_add_column", type="string",
        help="add new column instead of replacing existing columns. "
        "The value of the option will be used as prefix if there are "
        "multiple columns [%default]")

    # IMS: add option to use a column as the row id in flatten
    parser.add_option(
        "--id-column", dest="id_column", type="string",
        help="list of column(s) to use as the row id when flattening "
        "the table. If None, then row number is used. [default=%default].")

    parser.add_option(
        "--variable-name", dest="variable_name", type="string",
        help="the column header for the 'variable' column when flattening "
        "[default=%default].")

    parser.add_option(
        "--value-name", dest="value_name", type="string",
        help="the column header for the 'value' column when flattening "
        "[default=%default].")

    parser.set_defaults(
        methods=[],
        scale=1.0,
        has_headers=True,
        format=None,
        value=0.0,
        parameters="",
        columns="all",
        transpose=False,
        set_transpose_field=None,
        transpose_format="default",
        group=False,
        group_column=0,
        group_function="mean",
        missing_value="na",
        sort_rows=None,
        flatten_table=False,
        collapse_table=None,
        separator=";",
        expand=False,
        join_column=None,
        join_column_name=None,
        compute_fdr=None,
        as_column=False,
        fdr_method="BH",
        fdr_add_column=None,
        id_column=None,
        variable_name="column",
        value_name="value",
        file=None,
        delimiter="\t",
        invert_match=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    options.parameters = options.parameters.split(",")

    if options.group_column:
        options.group = True
        options.group_column -= 1

    ######################################################################
    ######################################################################
    ######################################################################
    # if only to remove header, do this quickly
    if options.methods == ["remove-header"]:

        first = True
        for line in options.stdin:
            if line[0] == "#":
                continue
            if first:
                first = False
                continue
            options.stdout.write(line)

    elif options.transpose or "transpose" in options.methods:

        readAndTransposeTable(options.stdin, options)

    elif options.flatten_table:
        # IMS: bug fixed to make work. Also added options for keying
        # on a particular and adding custom column headings

        fields, table = CSV.readTable(
            options.stdin, with_header=options.has_headers, as_rows=True)

        options.columns = getColumns(fields, options.columns)

        if options.id_column:
            id_columns = [int(x) - 1
                          for x in options.id_column.split(",")]
            id_header = "\t".join([fields[id_column]
                                   for id_column in id_columns])
            options.columns = [
                x for x in options.columns if x not in id_columns]
        else:
            id_header = "row"

        options.stdout.write(
            "%s\t%s\t%s\n" % (id_header, options.variable_name,
                              options.value_name))

        for x, row in enumerate(table):

            if options.id_column:
                row_id = "\t".join([row[int(x) - 1]
                                    for x in options.id_column.split(",")])
            else:
                row_id = str(x)

            for y in options.columns:
                options.stdout.write(
                    "%s\t%s\t%s\n" % (row_id, fields[y], row[y]))

    elif options.as_column:

        fields, table = CSV.readTable(
            options.stdin, with_header=options.has_headers, as_rows=True)
        options.columns = getColumns(fields, options.columns)
        table = list(zip(*table))

        options.stdout.write("value\n")

        for column in options.columns:
            options.stdout.write("\n".join(table[column]) + "\n")

    elif options.split_fields:

        # split comma separated fields
        fields, table = CSV.readTable(options.stdin,
                                      with_header=options.has_headers,
                                      as_rows=True)

        options.stdout.write("%s\n" % ("\t".join(fields)))

        for row in table:
            row = [x.split(options.separator) for x in row]
            for d in itertools.product(*row):
                options.stdout.write("%s\n" % "\t".join(d))

    elif options.group:
        readAndGroupTable(options.stdin, options)

    elif options.join_column:
        readAndJoinTable(options.stdin, options)

    elif options.expand_table:
        readAndExpandTable(options.stdin, options)

    elif options.collapse_table is not None:
        readAndCollapseTable(options.stdin, options, options.collapse_table)

    elif "grep" in options.methods:

        options.columns = [int(x) - 1 for x in options.columns.split(",")]

        patterns = []

        if options.file:
            infile = open(options.file, "r")
            for line in infile:
                if line[0] == "#":
                    continue
                patterns.append(line[:-1].split(options.delimiter)[0])
        else:
            patterns = args

        for line in options.stdin:

            data = line[:-1].split(options.delimiter)
            found = False

            for c in options.columns:

                if data[c] in patterns:
                    found = True
                    break

            if (not found and options.invert_match) or (found and not options.invert_match):
                print(line[:-1])
    else:

        ######################################################################
        ######################################################################
        ######################################################################
        # Apply remainder of transformations
        fields, table = CSV.readTable(
            options.stdin, with_header=options.has_headers, as_rows=False)
        # convert columns to list
        table = [list(x) for x in table]

        ncols = len(fields)
        if len(table) == 0:
            raise ValueError("table is empty")

        nrows = len(table[0])

        E.info("processing table with %i rows and %i columns" % (nrows, ncols))

        options.columns = getColumns(fields, options.columns)

        # convert all values to float
        for c in options.columns:
            for r in range(nrows):
                try:
                    table[c][r] = float(table[c][r])
                except ValueError:
                    continue

        for method in options.methods:

            if method == "normalize-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = [x / value for x in table[c]]

            elif method == "multiply-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = [x * value for x in table[c]]

            elif method == "normalize-by-max":

                for c in options.columns:
                    m = max(table[c])
                    table[c] = [x / m for x in table[c]]

            elif method == "kullback-leibler":
                options.stdout.write("category1\tcategory2\tkl1\tkl2\tmean\n")
                format = options.format
                if format is None:
                    format = "%f"
                    
                for x in range(0, len(options.columns) - 1):
                    for y in range(x + 1, len(options.columns)):
                        c1 = options.columns[x]
                        c2 = options.columns[y]
                        e1 = 0
                        e2 = 0
                        for z in range(nrows):
                            p = table[c1][z]
                            q = table[c2][z]
                            e1 += p * math.log(p / q)
                            e2 += q * math.log(q / p)

                        options.stdout.write("%s\t%s\t%s\t%s\t%s\n" % (
                            fields[c1], fields[c2],
                            format % e1,
                            format % e2,
                            format % ((e1 + e2) / 2)))
                E.Stop()
                sys.exit(0)

            elif method == "rank":

                for c in options.columns:
                    tt = table[c]
                    ranked = sorted(zip(tt, range(nrows)))
                    for new_rank, (_, i) in enumerate(ranked):
                        tt[i] = new_rank

            elif method in ("lower-bound", "upper-bound"):

                boundary = float(options.parameters[0])
                del options.parameters[0]
                new_value = float(options.parameters[0])
                del options.parameters[0]

                if method == "upper-bound":
                    for c in options.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] > boundary:
                                table[c][r] = new_value
                else:
                    for c in options.columns:
                        for r in range(nrows):
                            if isinstance(table[c][r], float) and \
                                    table[c][r] < boundary:
                                table[c][r] = new_value

            elif method == "fdr":
                pvalues = []
                for c in options.columns:
                    pvalues.extend(table[c])

                assert max(pvalues) <= 1.0, "pvalues > 1 in table: max=%s" % \
                    str(max(pvalues))
                assert min(pvalues) >= 0, "pvalue < 0 in table: min=%s" % \
                    str(min(pvalues))

                # convert to str to avoid test for float downstream
                qvalues = [str(x) for x in
                           Stats.adjustPValues(pvalues,
                                               method=options.fdr_method)]

                if options.fdr_add_column is None:
                    x = 0
                    for c in options.columns:
                        table[c] = qvalues[x:x + nrows]
                        x += nrows
                else:
                    # add new column headers
                    if len(options.columns) == 1:
                        fields.append(options.fdr_add_column)
                    else:
                        for co in options.columns:
                            fields.append(options.fdr_add_column + fields[co])

                    x = 0
                    for c in options.columns:
                        # add a new column
                        table.append(qvalues[x:x + nrows])
                        x += nrows
                    ncols += len(options.columns)

            elif method == "normalize-by-table":

                other_table_name = options.parameters[0]
                del options.parameters[0]
                other_fields, other_table = CSV.readTable(
                    open(other_table_name, "r"),
                    with_header=options.has_headers,
                    as_rows=False)

                # convert all values to float
                for c in options.columns:
                    for r in range(nrows):
                        try:
                            other_table[c][r] = float(other_table[c][r])
                        except ValueError:
                            continue

                # divide by the other table, marking cells as missing where
                # the divisor is zero or either value is not numeric
                for c in options.columns:
                    for r in range(nrows):
                        if isinstance(table[c][r], float) and \
                                isinstance(other_table[c][r], float) and \
                                other_table[c][r] != 0:
                            table[c][r] /= other_table[c][r]
                        else:
                            table[c][r] = options.missing_value

        # convert back
        if options.format is not None:
            for c in options.columns:
                for r in range(nrows):
                    if isinstance(table[c][r], float):
                        table[c][r] = options.format % table[c][r]

        options.stdout.write("\t".join(fields) + "\n")
        if options.sort_rows:
            old2new = {}
            for r in range(nrows):
                old2new[table[0][r]] = r
            for x in options.sort_rows.split(","):
                if x not in old2new:
                    continue
                r = old2new[x]
                options.stdout.write(
                    "\t".join(map(str,
                                  [table[c][r] for c in range(ncols)])) + "\n")
        else:
            for r in range(nrows):
                options.stdout.write(
                    "\t".join(map(str,
                                  [table[c][r] for c in range(ncols)])) + "\n")

    E.Stop()
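
For reference, the per-column-pair quantity written out by the
"kullback-leibler" method above reduces to the following toy computation
(the column values are invented for illustration):

import math

column1 = [0.2, 0.3, 0.5]
column2 = [0.1, 0.4, 0.5]

# directed divergences in both directions and their mean, as in the method above
kl1 = sum(p * math.log(p / q) for p, q in zip(column1, column2))
kl2 = sum(q * math.log(q / p) for p, q in zip(column1, column2))
print("kl1=%f\tkl2=%f\tmean=%f" % (kl1, kl2, (kl1 + kl2) / 2))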
Ejemplo n.º 33
0
def buildUTRExtension(infile, outfile):
    '''build new utrs by building and fitting an HMM 
    to reads upstream and downstream of known genes.

    Works on output of buildGeneLevelReadExtension.

    Known problems

    * the size of the extension is limited by the window size

    * introns within UTRs are ignored.

    * UTR extension might be underestimated for highly expressed genes
      as relative read counts drop off quickly, even though there is
      a good amount of reads still present in the UTR.

    The model

    The model is a three-state model::

        UTR --|--> notUTR --|--> otherTranscript --|
          ^---|      ^------|              ^-------|
                     ^-----------------------------|

    The chain starts in UTR and ends in notUTR or otherTranscript.

    The otherTranscript state models peaks within the upstream/
    downstream region of a gene. These peaks might correspond to
    additional exons or unknown transcripts. Without this state,
    the UTR might be artificially extended to include these peaks.

    Emissions are modelled with beta distributions. These
    distributions permit both bimodal (UTR) and unimodal (notUTR)
    distributions of counts.

    Parameter estimation

    Parameters are derived from known UTRs within full length 
    territories.
    
    Transitions and emissions for the otherTranscript state
    are set heuristically:

       * low probability of remaining in state "otherTranscript".
           * these transcripts should be short.

       * emissions biased towards high counts - only strong signals
           will be considered.

       * these could be estimated from known UTRs, but I am worried
           UTR extensions then will be diluted.
    

    Alternatives

    The method could be improved.

        * base level resolution? 
            * longer chains result in more data and longer running times.
            * the averaging in windows smoothes the data, which might have
                a beneficial effect.

        * raw counts instead of scaled counts?
            * better model, as highly expressed genes should give more
                confident predictions.

    '''
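    # Note (added): with the heuristics used below, the otherTranscript row
    # of the transition matrix comes out as roughly (0, 0.9, 0.1) after
    # row-normalisation (a 10% chance of remaining in otherTranscript), the
    # notUTR row gets about a 5% chance of moving to otherTranscript, and
    # the UTR row is estimated from known UTRs in the data.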

    # the bin size, see gtf2table - this could be derived from the column names
    # or, better, set as an option in the .ini file
    binsize = 100
    territory_size = 15000

    # read gene coordinates
    geneinfos = {}
    for x in CSV.DictReader(IOTools.openFile(infile), dialect='excel-tab'):
        contig, strand, start, end = x['contig'], x['strand'], int(
            x['start']), int(x['end'])
        geneinfos[x['gene_id']] = (contig, strand, start, end)

    infiles = [
        infile + ".readextension_upstream_sense.tsv.gz",
        infile + ".readextension_downstream_sense.tsv.gz"
    ]

    outdir = os.path.join(PARAMS["exportdir"], "utr_extension")

    R('''suppressMessages(library(RColorBrewer))''')
    R('''suppressMessages(library(MASS))''')
    R('''suppressMessages(library(HiddenMarkov))''')

    # for upstream, downstream
    upstream_utrs, downstream_utrs = {}, {}

    all_genes = set()

    for filename, new_utrs in zip(infiles, (upstream_utrs, downstream_utrs)):

        E.info("processing %s" % filename)

        parts = os.path.basename(filename).split(".")

        data = R(
            '''data = read.table( gzfile( "%(filename)s"), header=TRUE, fill=TRUE, row.names=1)'''
            % locals())

        ##########################################
        ##########################################
        ##########################################
        ## estimation
        ##########################################
        # take only those with a 'complete' territory
        R('''d = data[-which( apply( data,1,function(x)any(is.na(x)))),]''')
        # save UTR
        R('''utrs = d$utr''')
        # remove length and utr column
        R('''d = d[-c(1,2)]''')
        # remove rows that are completely empty, then log-transform; scaled
        # data is computed below
        R('''lraw = log10( d[-which( apply(d,1,function(x)all(x==0))),] + 1 )'''
          )

        utrs = R('''utrs = utrs[-which( apply(d,1,function(x)all(x==0)))]''')
        scaled = R(
            '''lscaled = t(scale(t(lraw), center=FALSE, scale=apply(lraw,1,max) ))'''
        )
        exons = R('''lraw[,1]''')

        #######################################################
        #######################################################
        #######################################################
        # do the estimation:
        E.debug("estimation: utrs=%i, exons=%i, vals=%i, dim=%s" %
                (len(utrs), len(exons), len(scaled), R.dim(scaled)))
        # counts within and outside UTRs
        within_utr, outside_utr, otherTranscript = [], [], []
        # counts of transitions between the three states
        transitions = numpy.zeros((3, 3), numpy.int)

        for x in xrange(len(utrs)):
            utr, exon = utrs[x], exons[x]

            # only consider genes with expression coverage
            # note: expression level is logscaled here, 10^1 = 10
            if exon < 0.1: continue

            # R indexing is 1-based, hence x + 1
            values = list(scaled.rx(x + 1, True))

            utr_bins = utr // binsize
            nonutr_bins = (territory_size - utr) // binsize

            # build transition matrix
            transitions[0][0] += utr_bins
            transitions[0][1] += 1
            transitions[1][1] += nonutr_bins

            outside_utr.extend([x for x in values[utr_bins:] if x <= 0.5])

            # ignore exon and zero counts
            within_utr.extend([x for x in values[1:utr_bins] if x > 0.1])

            # add only high counts to otherTranscript emissions
            otherTranscript.extend([x for x in values[utr_bins:] if x > 0.5])

        # heuristic settings for the remaining transitions:
        # 5% chance of transiting to otherTranscript
        transitions[1][2] = transitions[1][1] * 0.05
        # 10% chance of remaining in otherTranscript
        transitions[2][1] = 900
        transitions[2][2] = 100

        E.info( "counting: (n,mean): within utr=%i,%f, outside utr=%i,%f, otherTranscript=%i,%f" % \
                    ( len(within_utr), numpy.mean(within_utr),
                      len(outside_utr), numpy.mean(outside_utr),
                      len(otherTranscript), numpy.mean(otherTranscript)) )

        ro.globalenv['transitions'] = R.matrix(transitions, nrow=3, ncol=3)
        R('''transitions = transitions / rowSums( transitions )''')
        ro.globalenv['within_utr'] = ro.FloatVector(within_utr[:10000])
        ro.globalenv['outside_utr'] = ro.FloatVector(outside_utr[:10000])
        ro.globalenv['otherTranscript'] = ro.FloatVector(
            otherTranscript[:10000])

        # estimate beta distribution parameters
        R('''doFit = function( data ) {
                   data[data == 0] = data[data == 0] + 0.001
                   data[data == 1] = data[data == 1] - 0.001
                   f = fitdistr( data, dbeta, list( shape1=0.5, shape2=0.5 ) )
                   return (f) }''')
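        # Note (added): doFit nudges exact 0 and 1 values into the open
        # interval (0, 1) and then fits the two beta shape parameters by
        # maximum likelihood; a rough Python-side equivalent would be
        # scipy.stats.beta.fit(data, floc=0, fscale=1).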

        fit_within_utr = R(
            '''fit_within_utr = suppressMessages(doFit( within_utr))''')
        fit_outside_utr = R(
            '''fit_outside_utr = suppressMessages(doFit( outside_utr))''')
        fit_other = R(
            '''fit_otherTranscript = suppressMessages(doFit( otherTranscript))'''
        )

        within_a, within_b = list(fit_within_utr.rx("estimate"))[0]
        outside_a, outside_b = list(fit_outside_utr.rx("estimate"))[0]
        other_a, other_b = list(fit_other.rx("estimate"))[0]

        E.info( "beta estimates: within_utr=%f,%f outside=%f,%f, other=%f,%f" % \
                    (within_a, within_b, outside_a, outside_b, other_a, other_b))

        fn = ".".join((parts[0], parts[4], "fit", "png"))
        outfilename = os.path.join(outdir, fn)
        R.png(outfilename, height=1000, width=1000)

        R('''par(mfrow=c(3,1))''')
        R('''x=seq(0,1,0.02)''')
        R('''hist( within_utr, 50, col=rgb( 0,0,1,0.2) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_within_utr$estimate['shape1'], fit_within_utr$estimate['shape2']), type='l', col='blue')'''
          )

        R('''hist( outside_utr, 50, col=rgb( 1,0,0,0.2 ) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_outside_utr$estimate['shape1'], fit_outside_utr$estimate['shape2']), type='l', col='red')'''
          )

        R('''hist( otherTranscript, 50, col=rgb( 0,1,0,0.2 ) )''')
        R('''par(new=TRUE)''')
        R('''plot( x, dbeta( x, fit_otherTranscript$estimate['shape1'], fit_otherTranscript$estimate['shape2']), type='l', col='green')'''
          )
        R['dev.off']()

        #####################################################
        #####################################################
        #####################################################
        # build hmm
        # state 1 = UTR
        # state 2 = notUTR
        # state 3 = other transcript
        p = R('''betaparams = list( shape1=c(fit_within_utr$estimate['shape1'],
                                         fit_outside_utr$estimate['shape1'],
                                         fit_otherTranscript$estimate['shape1']),
                                shape2=c(fit_within_utr$estimate['shape2'],
                                         fit_outside_utr$estimate['shape2'],
                                         fit_otherTranscript$estimate['shape2'])) '''
              )
        R('''hmm = dthmm(NULL, transitions, c(1,0,0), "beta", betaparams )''')
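        # Note (added): the initial state distribution c(1,0,0) forces the
        # chain to start in the UTR state, as described in the docstring;
        # the per-gene observations are attached to hmm$x further down.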

        E.info("fitting starts")
        #####################################################
        #####################################################
        #####################################################
        # fit to every sequence
        genes = R('''rownames(data)''')
        all_genes.update(set(genes))
        utrs = R('''data$utr''')
        exons = R('''data$exon''')
        nseqs = len(utrs)

        counter = E.Counter()

        for idx in xrange(len(utrs)):

            gene_id = genes[idx]

            old_utr = utrs[idx]

            if idx % 100 == 0:
                E.debug("processing gene %i/%i" % (idx, len(utrs)))

            counter.input += 1

            # do not predict if terminal exon not expressed
            if exons[idx] < 1:
                counter.skipped_notexpressed += 1
                new_utrs[gene_id] = Utr._make(
                    (old_utr, None, None, "notexpressed"))
                continue

            R('''obs = data[%i,][-c(1,2)]''' % (idx + 1))
            # remove na
            obs = R('''obs = obs[!is.na(obs)]''')
            if len(obs) <= 1 or max(obs) == 0:
                new_utrs[gene_id] = Utr._make(
                    (old_utr, None, None, "no observations"))
                continue

            # normalize
            R('''obs = obs / max(obs)''')
            # add small epsilon to 0 and 1 values
            R('''obs[obs==0] = obs[obs==0] + 0.001 ''')
            R('''obs[obs==1] = obs[obs==1] - 0.001 ''')
            R('''hmm$x = obs''')

            states = None
            try:
                states = list(R('''states = Viterbi( hmm )'''))
            except ri.RRuntimeError, msg:
                counter.skipped_error += 1
                new_utrs[gene_id] = Utr._make((old_utr, None, None, "fail"))
                continue

            max_utr = binsize * (len(states) - 1)

            # subtract 1 for last exon
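            # Note (added): the UTR is taken to end at the first bin decoded
            # as state 2 (notUTR); multiplying the bin index by binsize
            # converts bins back to bases, and one bin is subtracted because
            # the first bin corresponds to the terminal exon.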
            try:
                new_utr = binsize * (states.index(2) - 1)
                new_utrs[gene_id] = Utr._make(
                    (old_utr, new_utr, max_utr, "ok"))
                counter.success += 1
            except ValueError:
                new_utrs[gene_id] = Utr._make(
                    (old_utr, max_utr, max_utr, "max"))
                counter.maxutr += 1
Ejemplo n.º 34
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: csv_intersection.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-u",
                      "--unique",
                      dest="unique",
                      action="store_true",
                      help="output rows are uniq.")

    parser.set_defaults(
        remove=False,
        unique=False,
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    if len(args) != 2:
        raise "please specify two files to join."

    options.filename1, options.filename2 = args

    fields1, table1 = CSV.ReadTable(open(options.filename1, "r"))
    fields2, table2 = CSV.ReadTable(open(options.filename2, "r"))

    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    # build new field list
    new_fields = []

    for x in options.join_fields1:
        new_fields.append(x)

    for x in fields1:
        if x not in options.join_fields1:
            new_fields.append(x)

    for x in fields2:
        if x not in options.join_fields2:
            new_fields.append(x)

    writer = csv.DictWriter(outfile,
                            new_fields,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    if len(lines) > 0:

        old_fields = lines[0][:-1].split("\t")

        if options.remove:
            fields = []
            for x in old_fields:
                if x not in input_fields:
                    fields.append(x)
        else:
            fields = input_fields

        reader = csv.DictReader(lines, dialect=options.csv_dialect)

        print "\t".join(fields)

        first_row = True
        for row in reader:
            row = CSV.ConvertDictionary(row)
            writer.writerow(row)

    E.Stop()
Ejemplo n.º 35
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_set.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="output rows are uniq.")

    parser.add_option("-1", "--join-fields1", dest="join_fields1", type="string",
                      help="join fields in first table.")
    parser.add_option("-2", "--join-fields2", dest="join_fields2", type="string",
                      help="join fields in second table.")
    parser.add_option("-m", "--method", dest="method", type="choice",
                      help="set operation to perform.", choices=("intersection", "rest", "union"))

    parser.set_defaults(
        remove=False,
        unique=False,
        join_fields1=None,
        join_fields2=None,
        method="intersection",
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    if len(args) != 2:
        raise ValueError("please specify two files to join")

    if not options.join_fields1 or not options.join_fields2:
        raise ValueError("please specify at least one join field per table")

    options.join_fields1 = options.join_fields1.split(",")
    options.join_fields2 = options.join_fields2.split(",")

    options.filename1, options.filename2 = args

    fields1, table1 = CSV.readTable(open(options.filename1, "r"))
    fields2, table2 = CSV.readTable(open(options.filename2, "r"))

    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    nfields1 = []
    for x in range(len(fields1)):
        if fields1[x] in options.join_fields1:
            nfields1.append(x)
    nfields2 = []
    for x in range(len(fields2)):
        if fields2[x] in options.join_fields2:
            nfields2.append(x)

    # calculate row indices: double keys are not taken care of here
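    # Note (added): each row of table1 is keyed by the md5 digest of its
    # concatenated join-field values; rows of table2 with an identical digest
    # are treated as matches. Joining values without a separator can in
    # principle produce collisions (e.g. ("ab", "c") vs ("a", "bc")).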
    keys = {}
    for row1 in table1:
        v = [row1[x] for x in nfields1]
        key = hashlib.md5("".join(v)).digest()
        keys[key] = row1

    if options.method == "intersection":
        # build new field list
        take = list(range(len(fields1)))
        c = len(take)
        for x in fields2:
            if x not in options.join_fields2:
                take.append(c)
            c += 1

        t = fields1 + fields2

        new_fields = [t[x] for x in take]

        print("\t".join(new_fields))

        for row2 in table2:
            v = [row2[x] for x in nfields2]
            key = hashlib.md5("".join(v)).digest()
            if key in keys:
                new_row = keys[key] + row2
                outfile.write(
                    "\t".join([new_row[x] for x in take]) + "\n")

    elif options.method == "rest":

        new_fields = fields2
        print("\t".join(new_fields))

        for row2 in table2:
            v = [row2[x] for x in nfields2]
            key = hashlib.md5("".join(v)).digest()
            if key not in keys:
                outfile.write("\t".join(row2) + "\n")

    E.Stop()
Ejemplo n.º 36
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id",
        usage=globals()["__doc__"])

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome", type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("--extend", dest="extension", type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size", dest="shift", type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size", dest="window_size", type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations", type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment",
                               "dmr", "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w", "--bigwig-file", dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment", dest="treatment_files", type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control", dest="control_files", type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input", dest="input_files", type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip", action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata", dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --ouptut-filename-pattern [%default].")

    parser.add_option("--rdata-file", dest="input_rdata",
                      type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa", dest="bwa", action="store_true",
                      help="alignment generated with bwa"
                      "[default=%default].")

    parser.add_option("--unique", dest="unique", type="float",
                      help="Threshold p-value to determine which read pile\
                      ups are the result of PCR overamplification"
                      "[default=%default].")

    parser.add_option("--chroms", dest="chroms", type="str",
                      help="Comma delimited list of chromosomes to include"
                      "[default=%default].")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="Hsapiens.UCSC.hg19",
        genome_file=None,
        extension=0,
        shift=0,
        window_size=300,
        saturation_iterations=10,
        toolset=[],
        bigwig=False,
        treatment_files=[],
        control_files=[],
        input_files=[],
        output_rdata=False,
        input_rdata=None,
        is_medip=True,
        fdr_threshold=0.1,
        fdr_method="BH",
        bwa=False,
        unique=0.001,
        chroms=None
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in CSV.DictReader(options.stdin,
                                   dialect="excel-tab"):
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" % (line['chr'],
                                      int(line['start']),
                                      int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        float(line['edgeR.adj.p.value']),
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        ["0", "1"][float(line['edgeR.adj.p.value']) <
                                   options.fdr_threshold],
                        status)))
            except ValueError, msg:
                raise ValueError("parsing error %s in line: %s" % (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return
Ejemplo n.º 37
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv2xls.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-o", "--outfile=", dest="output_filename", type="string",
                      help="write to output filename.")

    parser.set_defaults(
        output_filename=None,
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    if not options.output_filename:
        raise ValueError("please specify an output filename.")

    w = openpyxl.Workbook(optimized_write=True)

    # create styles
    header_style = GetHeaderStyle()
    data_style = GetDataStyle()

    for filename in args:

        lines = filter(lambda x: x[0] != "#", open(filename, "r").readlines())

        if len(lines) == 0:
            continue

        if options.loglevel >= 2:
            print "# read %i rows" % len(lines)
            sys.stdout.flush()

        headers = lines[0][:-1].split("\t")

        ws = w.create_sheet(title=os.path.basename(filename))

        cur_row = 0

        ws.append(headers)

        cur_row += 1

        reader = csv.DictReader(lines, dialect=options.csv_dialect)

        for row in reader:
            row = CSV.ConvertDictionary(row)

            data = [row.get(headers[x], "") for x in range(len(headers))]
            ws.append(data)

            cur_row += 1

    w.save(options.output_filename)

    E.Stop()
Ejemplo n.º 38
0
def main():

    parser = E.OptionParser(
        version=
        "%prog version: $Id: analyze_readpositions.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--output-filename-pattern",
                      dest="output_filename_pattern",
                      type="string",
                      help="pattern for additional output files [%default].")

    parser.set_defaults(
        length=1000,
        minimum_coverage=0.90,
        maximum_reads=[1, 10, 20, 50, 100],
        output_filename_pattern="%s",
        normalize=True,
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    fields, table = CSV.ReadTable(sys.stdin, dictreader=CSV.DictReaderLarge)

    map_fields2column = {}
    for x in fields:
        map_fields2column[x] = len(map_fields2column)

    coverage_5prime = numpy.zeros(options.length, numpy.float)
    coverage_3prime = numpy.zeros(options.length, numpy.float)

    coverage_maxreads5prime = numpy.zeros(options.length, numpy.float)
    coverage_maxreads3prime = numpy.zeros(options.length, numpy.float)

    coverage_full5prime = numpy.zeros(options.length, numpy.float)
    coverage_full3prime = numpy.zeros(options.length, numpy.float)

    coverage_min5prime = numpy.zeros(options.length, numpy.float)
    coverage_min3prime = numpy.zeros(options.length, numpy.float)

    histograms = []
    for x in range(len(options.maximum_reads)):
        histograms.append([
            numpy.zeros(options.length, numpy.float),
            numpy.zeros(options.length, numpy.float), 0
        ])

    ninput, noutput, nfull, nmincov, nskipped, nlength, nmaxreads = 0, 0, 0, 0, 0, 0, 0
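    # Note (added): for each entry the per-base coverage profile is rescaled
    # to a maximum of 1 and its first/last `length` values are accumulated
    # into aggregate 5' and 3' profiles; separate aggregates are kept for
    # fully covered entries, entries above the minimum mean coverage, and
    # for each maximum-read-count class.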
    for row in table:
        length, covered, meancov, data, nreads = (int(row["cov_nval"]),
                                                  float(row["cov_covered"]),
                                                  float(row["cov_mean"]),
                                                  row["cov_values"],
                                                  int(row["nover2"]))
        ninput += 1
        if length < options.length:
            nlength += 1
            continue

        if data == "na":
            nskipped += 1
            continue

        noutput += 1
        mincov = covered / length
        values = map(float, data.split(";"))
        m = max(values)
        values = [x / m for x in values]
        coverage_5prime += values[0:1000]
        coverage_3prime += values[-1000:]

        if mincov >= 1.0:
            coverage_full5prime += values[0:1000]
            coverage_full3prime += values[-1000:]
            nfull += 1

        if meancov >= options.minimum_coverage:
            coverage_min5prime += values[0:1000]
            coverage_min3prime += values[-1000:]
            nmincov += 1

        for maxreads in range(len(options.maximum_reads)):
            if nreads <= options.maximum_reads[maxreads]:
                histograms[maxreads][0] += values[0:1000]
                histograms[maxreads][1] += values[-1000:]
                histograms[maxreads][2] += 1

    if options.normalize:
        for x5, x3 in ((coverage_5prime, coverage_3prime),
                       (coverage_min5prime, coverage_min3prime),
                       (coverage_full5prime, coverage_full3prime)):
            m = max((max(x5), max(x3)))
            x3 /= m
            x5 /= m

        for x5, x3, c in histograms:
            m = max((max(x5), max(x3)))
            x5 /= m
            x3 /= m

    outfile = options.stdout
    outfile.write("\t".join(("distance", "minlen-5'", "minlen-3'", "mincov-5'",
                             "mincov-3'", "full-5'", "full-3'")) + "\n")

    for x in range(0, options.length):
        outfile.write( "\t".join( [ "%6.4f" % x for x in \
                                        (x,
                                         coverage_5prime[x],
                                         coverage_3prime[x],
                                         coverage_min5prime[x],
                                         coverage_min3prime[x],
                                         coverage_full5prime[x],
                                         coverage_full3prime[x] ) ] ) + "\n" )

    outfile5 = open(options.output_filename_pattern % "reads5", "w")
    outfile3 = open(options.output_filename_pattern % "reads3", "w")

    outfile5.write("\t".join([
        "distance",
    ] + [
        "reads%i" % options.maximum_reads[y]
        for y in range(len(options.maximum_reads))
    ]) + "\n")
    outfile3.write("\t".join([
        "distance",
    ] + [
        "reads%i" % options.maximum_reads[y]
        for y in range(len(options.maximum_reads))
    ]) + "\n")
    for x in range(0, options.length):
        outfile5.write("%i\t%s\n" % (x, "\t".join([
            "%6.4f" % histograms[y][0][x]
            for y in range(len(options.maximum_reads))
        ])))
        outfile3.write("%i\t%s\n" % (x, "\t".join([
            "%6.4f" % histograms[y][1][x]
            for y in range(len(options.maximum_reads))
        ])))

    E.info( "ninput=%i, noutput=%i, nmaxreads=%i, nfull=%i, nmincov=%i, nskipped=%i, nlength=%i" %\
                (ninput, noutput, nmaxreads, nfull, nmincov, nskipped, nlength) )

    E.Stop()
Ejemplo n.º 39
0
              ('none', 'all' ), 
              ('kappa', 'all' ),
              ('omega', 'all' ),
              ('ds', 'all'),
              )

    map_model2params = {
        'none' : 8,
        'ds' : 7,
        'omega' : 6,
        'kappa' : 6,
        'omega-ds' : 5,
        'kappa-ds' : 5,
        'all' : 4 }

    reader = CSV.DictReader( sys.stdin,
                             dialect=options.csv_dialect )

    stats = {}
    options.stdout.write( "id" )
    for a, b in tests:
        options.stdout.write( "\t%s:%s\tp%s:%s" % (a, b, a, b))
        stats[(a,b)] = 0

    options.stdout.write( "\n" )

    ninput, noutput, nskipped, nerrors, ntests = 0, 0, 0, 0, 0

    for row in reader:
        ninput += 1

        if int(row['N:len']) <= options.min_length or int(row['C:len']) <= options.min_length :
Ejemplo n.º 40
0
        statement = "SELECT name FROM sqlite_master WHERE type='table'"
        cc = executewait(dbhandle, statement, error, options.retry)
        existing_tables = set([x[0] for x in cc])
        cc.close()

        quick_import_statement = \
            "sqlite3 -header -csv -separator '\t' %s '.import %%s %s'" % \
            (options.database, options.tablename)

    if options.header is not None:
        options.header = [x.strip() for x in options.header.split(",")]

    if options.utf:
        reader = CSV.UnicodeDictReader(infile,
                                       dialect=options.dialect,
                                       fieldnames=options.header)
    else:
        reader = CSV.DictReader(infile,
                                dialect=options.dialect,
                                fieldnames=options.header)

    if options.replace_header:
        reader.next()

    E.info("reading %i columns to guess column types" % options.guess_size)

    rows = []
    for row in reader:
        if None in row:
            raise ValueError("undefined columns in input file at row: %s" %
Ejemplo n.º 41
0
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"])

    parser.add_option(
        "--output-filename-pattern",
        dest="output_filename_pattern",
        type="string",
        help="pattern for additional output files [%default].",
    )

    parser.set_defaults(
        length=1000,
        minimum_coverage=0.90,
        maximum_reads=[1, 10, 20, 50, 100],
        output_filename_pattern="%s",
        normalize=True,
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    fields, table = CSV.readTable(sys.stdin, dictreader=CSV.DictReaderLarge)

    map_fields2column = {}
    for x in fields:
        map_fields2column[x] = len(map_fields2column)

    coverage_5prime = numpy.zeros(options.length, numpy.float)
    coverage_3prime = numpy.zeros(options.length, numpy.float)

    coverage_maxreads5prime = numpy.zeros(options.length, numpy.float)
    coverage_maxreads3prime = numpy.zeros(options.length, numpy.float)

    coverage_full5prime = numpy.zeros(options.length, numpy.float)
    coverage_full3prime = numpy.zeros(options.length, numpy.float)

    coverage_min5prime = numpy.zeros(options.length, numpy.float)
    coverage_min3prime = numpy.zeros(options.length, numpy.float)

    histograms = []
    for x in range(len(options.maximum_reads)):
        histograms.append([numpy.zeros(options.length, numpy.float), numpy.zeros(options.length, numpy.float), 0])

    ninput, noutput, nfull, nmincov, nskipped, nlength, nmaxreads = 0, 0, 0, 0, 0, 0, 0
    for row in table:
        length, covered, meancov, data, nreads = (
            int(row["cov_nval"]),
            float(row["cov_covered"]),
            float(row["cov_mean"]),
            row["cov_values"],
            int(row["nover2"]),
        )
        ninput += 1
        if length < options.length:
            nlength += 1
            continue

        if data == "na":
            nskipped += 1
            continue

        noutput += 1
        mincov = covered / length
        values = list(map(float, data.split(";")))
        m = max(values)
        values = [x / m for x in values]
        coverage_5prime += values[0:1000]
        coverage_3prime += values[-1000:]

        if mincov >= 1.0:
            coverage_full5prime += values[0:1000]
            coverage_full3prime += values[-1000:]
            nfull += 1

        if meancov >= options.minimum_coverage:
            coverage_min5prime += values[0:1000]
            coverage_min3prime += values[-1000:]
            nmincov += 1

        for maxreads in range(len(options.maximum_reads)):
            if nreads <= options.maximum_reads[maxreads]:
                histograms[maxreads][0] += values[0:1000]
                histograms[maxreads][1] += values[-1000:]
                histograms[maxreads][2] += 1

    if options.normalize:
        for x5, x3 in (
            (coverage_5prime, coverage_3prime),
            (coverage_min5prime, coverage_min3prime),
            (coverage_full5prime, coverage_full3prime),
        ):
            m = max((max(x5), max(x3)))
            x3 /= m
            x5 /= m

        for x5, x3, c in histograms:
            m = max((max(x5), max(x3)))
            x5 /= m
            x3 /= m

    outfile = options.stdout
    outfile.write(
        "\t".join(("distance", "minlen-5'", "minlen-3'", "mincov-5'", "mincov-3'", "full-5'", "full-3'")) + "\n"
    )

    for x in range(0, options.length):
        outfile.write(
            "\t".join(
                [
                    "%6.4f" % x
                    for x in (
                        x,
                        coverage_5prime[x],
                        coverage_3prime[x],
                        coverage_min5prime[x],
                        coverage_min3prime[x],
                        coverage_full5prime[x],
                        coverage_full3prime[x],
                    )
                ]
            )
            + "\n"
        )

    outfile5 = IOTools.openFile(options.output_filename_pattern % "reads5", "w")
    outfile3 = IOTools.openFile(options.output_filename_pattern % "reads3", "w")

    outfile5.write(
        "\t".join(["distance"] + ["reads%i" % options.maximum_reads[y] for y in range(len(options.maximum_reads))])
        + "\n"
    )
    outfile3.write(
        "\t".join(["distance"] + ["reads%i" % options.maximum_reads[y] for y in range(len(options.maximum_reads))])
        + "\n"
    )
    for x in range(0, options.length):
        outfile5.write(
            "%i\t%s\n" % (x, "\t".join(["%6.4f" % histograms[y][0][x] for y in range(len(options.maximum_reads))]))
        )
        outfile3.write(
            "%i\t%s\n" % (x, "\t".join(["%6.4f" % histograms[y][1][x] for y in range(len(options.maximum_reads))]))
        )

    E.info(
        "ninput=%i, noutput=%i, nmaxreads=%i, nfull=%i, nmincov=%i, nskipped=%i, nlength=%i"
        % (ninput, noutput, nmaxreads, nfull, nmincov, nskipped, nlength)
    )

    E.Stop()
Ejemplo n.º 42
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/analyze_sites_slr.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("summary-slr", "summary-filtered",
                               "over-representation", "positive-site-table",
                               "negative-site-table", "neutral-site-table",
                               "positive-site-list", "negative-site-list",
                               "neutral-site-list"),
                      help="method to apply.")

    parser.add_option("--prefix",
                      dest="prefix",
                      type="string",
                      help="prefix for rows.")

    parser.add_option("-s",
                      "--filename-sites",
                      dest="filename_sites",
                      type="string",
                      help="filename with sites information.")

    parser.add_option("-l",
                      "--filename-log",
                      dest="filename_log",
                      type="string",
                      help="filename with logging information.")

    parser.add_option(
        "-m",
        "--filename-mali",
        dest="filename_mali",
        type="string",
        help=
        "filename of multiple alignment, that was input to SLR. If given, is used to filter indels."
    )

    parser.add_option(
        "--filter-probability",
        dest="filter_probability",
        type="float",
        help="threshold for probability above which to include positive sites."
    )

    parser.add_option("--no-header",
                      dest="write_header",
                      action="store_false",
                      help="only output header.")

    parser.add_option("--only-header",
                      dest="only_header",
                      action="store_true",
                      help="only output header.")

    parser.add_option("--significance-threshold",
                      dest="significance_threshold",
                      type="float",
                      help="threshold for significance tests [%default].")

    parser.add_option("--use-adjusted",
                      dest="use_adjusted",
                      action="store_true",
                      help="use SLR adjusted probability values.")

    parser.add_option("--truncate-sites-list",
                      dest="truncate_sites_list",
                      type="int",
                      help="truncate sites list after ## entries (0 for all).")

    parser.add_option(
        "--context-size",
        dest="context_size",
        type="int",
        help="size of left/right context around a selected residue.")

    parser.set_defaults(
        prefix=None,
        filter_probability=0,
        filter_omega=0,
        filename_sites="-",
        filename_log=None,
        filename_mali=None,
        significance_threshold=0.05,
        write_header=True,
        only_header=False,
        use_adjusted=False,
        context_size=0,
        truncate_sites_list=0,
    )

    (options, args) = E.Start(parser)

    slr = WrapperSlr.Slr()

    # write headers
    if "%s" in options.filename_sites:
        options.prefix = True

    if options.method == "summary-slr":

        # write header
        if options.write_header or options.only_header:

            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Numbers of positive/neutral/negative sites according to SLR
#
# This uses the thresholds as set in SLR. Use "counts" for filtering
# residues based on your own thresholds
""")
            thresholds = "95%", "99%", "95% corrected", "99% corrected"

            if options.prefix:
                options.stdout.write("prefix\t")
            options.stdout.write(
                "ltree\tomega\tkappa\tlnL\tnsites\tnsyn\tngap\t")
            options.stdout.write("\t".join(
                map(lambda x: "npos_" + x.replace(" ", "_"), thresholds)))
            options.stdout.write("\t")
            options.stdout.write("\t".join(
                map(lambda x: "nneg_" + x.replace(" ", "_"), thresholds)))
            options.stdout.write("\n")

    elif options.method == "summary-filtered":

        # write header
        if options.write_header or options.only_header:
            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Numbers of positive/neutral/negative sites according to SLR
#
# This method uses the supplied threshold and the multiple alignment to filter.
# All positions that are above the threshold (P-Value) and which are located in
# indels: >= 1 sequence missing from column, are removed.
""")

            if options.prefix:
                options.stdout.write("prefix\t")
            options.stdout.write(
                "ltree\tomega\tkappa\tlnL\tnsites\tnfiltered\tntotal\tnsyn\tnneg\tnpos\n"
            )

    elif options.method in ("positive-site-table", "negative-site-table",
                            "neutral-site-table"):

        # write header
        if options.write_header or options.only_header:
            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Numbers of positive/neutral/negative sites according to SLR
#
# Note: sequence positions are 1-based, but mali positions are 0-based.
# Residues in indel positions have been removed and significance was determined
# with a threshold of %5.2e
""" % options.significance_threshold)

            if options.prefix:
                options.stdout.write("prefix\t")
            options.stdout.write("cluster\tnsites\tp-value\tsites\n")

    elif options.method in ("positive-site-list", "negative-site-list",
                            "neutral-site-list"):

        # write header
        if options.write_header or options.only_header:
            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Sites under positive/neutral/negative selection according to SLR
#
# Note: sequence positions are 1-based, but mali positions are 0-based.
# Residues in indel positions have been removed and significance was determined
# with a threshold of %5.2e
""" % options.significance_threshold)

            if options.prefix:
                options.stdout.write("prefix\t")

            options.stdout.write(
                "sequence\tn\taa\tseq_pos\tmali_pos\tcontext\n")

    elif options.method == "over-representation":

        # write header
        if options.write_header or options.only_header:
            if options.loglevel >= 1:
                options.stdlog.write("""# Genes with over-represented sites.
#
# This method uses as input the output of summary-filtered.
""")

    if options.only_header:
        sys.exit(0)

    if options.method in ("summary-slr", "summary-filtered",
                          "positive-site-table", "negative-site-table",
                          "neutral-site-table", "positive-site-list",
                          "negative-site-list", "neutral-site-list"):

        ninput, noutput, nskipped = 0, 0, 0

        if "%s" in options.filename_sites:

            headers, table = CSV.ReadTable(sys.stdin)

            fprefix = headers.index("prefix")

            try:
                fsignificance = headers.index("p")
            except ValueError:
                fsignificance = None

            for row in table:

                id = row[fprefix]
                if fsignificance is not None:
                    p_value = row[fsignificance]
                else:
                    p_value = None

                ninput += 1

                fn = re.sub("%s", id, options.filename_sites)
                if not os.path.exists(fn):
                    nskipped += 1
                    continue

                lines_sites = open(fn, "r").readlines()
                if options.filename_log:
                    lines_log = open(re.sub("%s", id, options.filename_log),
                                     "r").readlines()

                result = slr.parseOutput(lines_sites, lines_log)

                if options.method in ("summary-filtered",
                                      "positive-site-table",
                                      "negative-site-table",
                                      "neutral-site-table"):
                    mali = Mali.Mali()
                    mali.readFromFile(
                        open(re.sub("%s", id, options.filename_mali), "r"))
                else:
                    mali = None

                ProcessResult(result,
                              options,
                              mali,
                              prefix=id,
                              p_value=p_value)
                noutput += 1
        else:
            if options.filename_sites == "-":
                lines_sites = sys.stdin.readlines()
            else:
                lines_sites = open(options.filename_sites, "r").readlines()

            ninput += 1
            if options.filename_log:
                lines_log = open(options.filename_log, "r").readlines()

            result = slr.parseOutput(lines_sites, lines_log)

            if options.filename_mali:
                mali = Mali.Mali()
                mali.readFromFile(open(options.filename_mali, "r"))
            else:
                if options.method == "summary-filtered":
                    raise "please supply a multiple alignment for filtering."

                mali = None

            ProcessResult(result, options, mali, prefix=options.prefix)
            noutput += 1

        if options.loglevel >= 1:
            options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i.\n" %
                                 (ninput, noutput, nskipped))

    else:
        if options.method == "over-representation":

            results = []
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                data = line[:-1].split("\t")
                if data[0] == "prefix":
                    continue

                results.append(
                    Result(data[0], int(data[6]), int(data[7]), int(data[8]),
                           int(data[9]), int(data[10])))

            # probability of a single site being positive
            ntotal = sum(map(lambda x: x.mNTotal, results))
            npositives = sum(map(lambda x: x.mNPositive, results))
            p = float(npositives) / float(ntotal)
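            # Note (added): p is the overall per-site rate of positive
            # selection and serves as the null; each gene is then tested
            # below for having more positive sites than expected under
            # Binomial(nTotal, p).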

            if options.loglevel >= 1:
                options.stdlog.write("# sites: total=%i, positive=%i, p=%f\n" %
                                     (ntotal, npositives, p))

            new_results = []
            for result in results:
                if result.mNTotal == 0:
                    continue

                # use mNPositive - 1 because we need P(X >= k):
                # sf(k) = 1 - cdf(k) = P(X > k), hence sf(k - 1) = P(X >= k)
                r = scipy.stats.binom.sf(result.mNPositive - 1, result.mNTotal,
                                         p)

                result.mSignificance = r

                if r < options.significance_threshold:
                    new_results.append(result)

            new_results.sort(
                lambda x, y: cmp(x.mSignificance, y.mSignificance))

            options.stdlog.write(Result().getHeader() + "\n")

            for result in new_results:
                options.stdout.write(str(result) + "\n")

            if options.loglevel >= 1:
                options.stdlog.write("# ntotal=%i, npos=%i\n" %
                                     (len(results), len(new_results)))

    E.Stop()
Ejemplo n.º 43
0
def createTable(dbhandle,
                error,
                options,
                rows=None,
                headers=None,
                first_column=None,
                existing_tables=[]):

    # create table by guessing column types from data type.
    if rows:
        map_column2type, ignored, max_values = CSV.GetMapColumn2Type(
            rows, ignore_empty=options.ignore_empty, get_max_values=True)
        if ignored:
            E.info("ignored columns: %s" % str(ignored))

        headers = map_column2type.keys()
        headers.sort()

    elif headers:
        map_column2type = dict(zip(headers, [
            None,
        ] * len(headers)))
        ignored = 0

    columns_to_ignore = set([x.lower() for x in options.ignore_columns])
    columns_to_rename = dict(
        [x.lower().split(":") for x in options.rename_columns])

    take = []
    # associate headers to field names
    columns = []
    present = {}
    for header_index, h in enumerate(headers):
        hh = h
        if options.lowercase:
            hh = string.lower(h)

        if hh in columns_to_ignore:
            continue

        if hh in present:
            if options.ignore_duplicates:
                continue
            else:
                raise ValueError("duplicate column %s" % hh)

        present[hh] = 1
        take.append(h)
        if map_column2type[h] == int:
            max_value = max_values[h]
            if max_value > 2147483647:
                t = "BIGINT DEFAULT '0'"
            elif max_value > 32767:
                t = "INTEGER DEFAULT '0'"
            else:
                t = "SMALLINT DEFAULT '0'"

        elif map_column2type[h] == float:
            t = "FLOAT DEFAULT '0'"
        else:
            t = "TEXT"

        # remove special characters from column names
        if hh == "":
            if first_column is not None and header_index == 0:
                hh = first_column
            else:
                raise ValueError("column '%s' without header " % h)
        hh = columns_to_rename.get(hh, hh)
        hh = re.sub('''['"]''', "", hh)
        hh = re.sub("[,;.:\-\+/ ()%?]", "_", hh)
        if hh[0] in "0123456789":
            hh = "_" + hh
        columns.append("%s %s" % (hh, t))

    # delete old table if it exists
    while 1:
        try:
            cc = dbhandle.cursor()
            cc.execute("DROP TABLE IF EXISTS '%s'" % options.tablename)
            dbhandle.commit()
            cc.close()
            E.info("existing table %s deleted" % options.tablename)
        except sqlite3.OperationalError, msg:
            E.warn(msg)
            time.sleep(5)
            continue
        except error, msg:
            E.warn("could not delete existing table %s: %s" %
                   (options.tablename, str(msg)))
            dbhandle.rollback()
            if not options.retry:
                raise error, msg
            elif options.tablename in existing_tables:
                # table exists, but drop did not work (e.g. database lock)
                time.sleep(5)
                continue
            else:
                # table might not have existed
                break
Ejemplo n.º 44
0
def createTable(
    dbhandle,
    error,
    tablename,
    options,
    retry=True,
    ignore_empty=True,
    ignore_columns=[],
    rename_columns=[],
    lowercase=False,
    ignore_duplicates=True,
    indices=[],
    rows=None,
    headers=None,
    first_column=None,
    existing_tables=set(),
    append=False,
):

    # create table by guessing column types from data type.
    if rows:
        map_column2type, ignored, max_values = CSV.getMapColumn2Type(
            rows, ignore_empty=ignore_empty, get_max_values=True
        )
        if ignored:
            E.info("ignored columns: %s" % str(ignored))

        headers = list(map_column2type.keys())
        headers.sort()

    elif headers:
        map_column2type = dict(list(zip(headers, [None] * len(headers))))
        ignored = 0

    columns_to_ignore = set([x.lower() for x in ignore_columns])
    columns_to_rename = dict([x.lower().split(":") for x in rename_columns])

    take = []
    # associate headers to field names
    columns = []
    present = {}
    for header_index, h in enumerate(headers):
        hh = h
        if lowercase:
            hh = string.lower(h)

        if hh in columns_to_ignore:
            continue

        if hh in present:
            if ignore_duplicates:
                continue
            else:
                raise ValueError("duplicate column %s" % hh)

        present[hh] = 1
        take.append(h)
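        # Note (added): the guessed Python type is mapped to an SQL column
        # type; integer columns are sized by the largest observed value
        # (SMALLINT/INTEGER/BIGINT), floats become FLOAT, and everything else
        # falls back to the configured text/index column type.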
        if map_column2type[h] == int:
            max_value = max_values[h]
            if max_value > 2147483647:
                t = "BIGINT DEFAULT '0'"
            elif max_value > 32767:
                t = "INTEGER DEFAULT '0'"
            else:
                t = "SMALLINT DEFAULT '0'"

        elif map_column2type[h] == float:
            t = "FLOAT DEFAULT '0'"
        else:
            if h in options.indices:
                t = options.index
            else:
                t = options.text

        # remove special characters from column names
        if hh == "":
            if first_column is not None and header_index == 0:
                hh = first_column
            else:
                raise ValueError("column '%s' without header " % h)
        hh = columns_to_rename.get(hh, hh)
        hh = re.sub("""['"]""", "", hh)
        hh = re.sub("[,;.:\-\+/ ()%?]", "_", hh)
        if hh[0] in "0123456789":
            hh = "_" + hh
        columns.append("%s %s" % (hh, t))

    if not options.append:
        # delete old table if it exists
        while 1:
            try:
                cc = dbhandle.cursor()
                # mysql: removed '' around table name
                statement = "DROP TABLE IF EXISTS %s" % tablename
                E.debug(statement)
                cc.execute(statement)
                dbhandle.commit()
                cc.close()
                E.info("existing table %s deleted" % tablename)
            except sqlite3.OperationalError as msg:
                E.warn(msg)
                time.sleep(5)
                continue
            except error as msg:
                E.warn("could not delete existing table %s: %s" % (tablename, str(msg)))
                dbhandle.rollback()
                if not retry:
                    raise error(msg)
                elif tablename in existing_tables:
                    # table exists, but drop did not work (e.g. database lock)
                    time.sleep(5)
                    continue
                else:
                    # table might not have existed
                    break
            break

        # create new table
        statement = "CREATE TABLE %s ( %s );" % (tablename, ", ".join(columns))

        E.debug("table create:\n# %s" % (statement))

        while 1:
            try:
                cc = dbhandle.cursor()
                cc.execute(statement)
                cc.close()
                dbhandle.commit()
            except error as msg:
                E.warn("table creation failed: msg=%s, statement=\n  %s" % (msg, statement))
                # TODO: check for database locked msg
                if not retry:
                    raise error(msg)
                if not re.search("locked", str(msg)):
                    raise error("%s: %s" % (msg, statement))
                time.sleep(5)
                continue
            break

        E.info("table %s created successfully." % tablename)

    return take, map_column2type, ignored
Ejemplo n.º 45
0
def createTable(dbhandle,
                error,
                tablename,
                options,
                retry=True,
                ignore_empty=True,
                ignore_columns=[],
                rename_columns=[],
                lowercase=False,
                ignore_duplicates=True,
                indices=[],
                rows=None,
                headers=None,
                first_column=None,
                existing_tables=set(),
                append=False):

    # create table by guessing column types from data type.
    if rows:
        map_column2type, ignored, max_values = CSV.getMapColumn2Type(
            rows, ignore_empty=ignore_empty, get_max_values=True)
        if ignored:
            E.info("ignored columns: %s" % str(ignored))

        headers = list(map_column2type.keys())
        headers.sort()

    elif headers:
        map_column2type = dict(list(zip(headers, [
            None,
        ] * len(headers))))
        ignored = 0

    columns_to_ignore = set([x.lower() for x in ignore_columns])
    columns_to_rename = dict([x.lower().split(":") for x in rename_columns])

    take = []
    # associate headers to field names
    columns = []
    present = {}
    for header_index, h in enumerate(headers):
        hh = h
        if lowercase:
            hh = string.lower(h)

        if hh in columns_to_ignore:
            continue

        if hh in present:
            if ignore_duplicates:
                continue
            else:
                raise ValueError("duplicate column %s" % hh)

        present[hh] = 1
        take.append(h)
        if map_column2type[h] == int:
            max_value = max_values[h]
            if max_value > 2147483647:
                t = "BIGINT DEFAULT '0'"
            elif max_value > 32767:
                t = "INTEGER DEFAULT '0'"
            else:
                t = "SMALLINT DEFAULT '0'"

        elif map_column2type[h] == float:
            t = "FLOAT DEFAULT '0'"
        else:
            if h in options.indices:
                t = options.index
            else:
                t = options.text

        # remove special characters from column names
        if hh == "":
            if first_column is not None and header_index == 0:
                hh = first_column
            else:
                raise ValueError("column '%s' without header " % h)
        hh = columns_to_rename.get(hh, hh)
        hh = re.sub('''['"]''', "", hh)
        hh = re.sub("[,;.:\-\+/ ()%?]", "_", hh)
        if hh[0] in "0123456789":
            hh = "_" + hh
        columns.append("%s %s" % (hh, t))

    if not options.append:
        # delete old table if it exists
        while 1:
            try:
                cc = dbhandle.cursor()
                # mysql: removed '' around table name
                statement = "DROP TABLE IF EXISTS %s" % tablename
                E.debug(statement)
                cc.execute(statement)
                dbhandle.commit()
                cc.close()
                E.info("existing table %s deleted" % tablename)
            except sqlite3.OperationalError as msg:
                E.warn(msg)
                time.sleep(5)
                continue
            except error as msg:
                E.warn("could not delete existing table %s: %s" %
                       (tablename, str(msg)))
                dbhandle.rollback()
                if not retry:
                    raise error(msg)
                elif tablename in existing_tables:
                    # table exists, but drop did not work (e.g. database lock)
                    time.sleep(5)
                    continue
                else:
                    # table might not have existed
                    break
            break

        # create new table
        statement = "CREATE TABLE %s ( %s );" % (tablename, ", ".join(columns))

        E.debug("table create:\n# %s" % (statement))

        while 1:
            try:
                cc = dbhandle.cursor()
                cc.execute(statement)
                cc.close()
                dbhandle.commit()
            except error as msg:
                E.warn("table creation failed: msg=%s, statement=\n  %s" %
                       (msg, statement))
                # TODO: check for database locked msg
                if not retry:
                    raise error(msg)
                if not re.search("locked", str(msg)):
                    raise error("%s: %s" % (msg, statement))
                time.sleep(5)
                continue
            break

        E.info("table %s created successfully." % tablename)

    return take, map_column2type, ignored