Example #1
0
def chunk_iterator_column(infile, args, prefix, use_header=False):
    """Split a tab-separated table into files according to a column value.

    The table need not be sorted by this column.

    Lines starting with ``#`` are skipped. If *use_header* is true, the
    first remaining line is handed to the file pool via ``setHeader`` and
    is not treated as a data row.

    If a maximum number of files is given, each distinct column value is
    mapped, in order of first appearance, onto one of ``max_files``
    zero-padded numeric tags in round-robin fashion. All rows sharing a
    column value end up in the same file. Without a maximum, the column
    value itself is used as the tag.

    Each row is written to ``<prefix>/<tag>.in``.

    Arguments
    ---------
    infile : iterable of str
        Lines of the input table.
    args : tuple
        ``(column, max_files)`` - index of the column to split at and
        the maximum number of output files (a false value means no limit).
    prefix : str
        Directory part of the output filenames.
    use_header : bool
        Treat the first non-comment line as a header.

    Yields
    ------
    filename
        One item per entry reported by the file pool after all rows have
        been written.
    """

    column, max_files = args
    files = IOTools.FilePool()
    header_seen = False
    map_tag2file = {}

    for line in infile:
        # startswith() is safe on empty strings, unlike line[0]
        if line.startswith("#"):
            continue

        if use_header and not header_seen:
            files.setHeader(line)
            header_seen = True
            continue

        # Only strip the line terminator: a plain line[:-1] would chop a
        # character off the key if the last line has no trailing newline.
        key = line.rstrip("\n").split("\t")[column]

        if max_files:
            if key not in map_tag2file:
                # round-robin assignment in order of first appearance
                map_tag2file[key] = "%010i" % (len(map_tag2file) % max_files)
            key = map_tag2file[key]

        files.write("%s/%s.in" % (prefix, key), line)

    # materialize the items before yielding control back to the caller
    for filename, count in list(files.items()):
        E.info("created file %s with %i items" % (filename, count))
        yield filename
Example #2
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a table of per-window test results from ``options.stdin``,
    extracts contig/start/end from each ``test_id`` using the
    ``--pattern-window`` regular expression, sorts the windows by contig
    and start and merges neighbouring windows into groups. Two consecutive
    windows are kept in the same group if they are on the same contig, at
    most ``--min-overlap`` bases apart, and agree in status, significance
    and sign of the log2 fold change.

    One summary row per group is written to ``options.stdout``. For
    significant groups an additional line is written to the output file
    of either the treatment or the control sample, depending on the sign
    of the fold change and ``--invert``.

    Raises
    ------
    ValueError
        If a test id does not match the window pattern, or if the data
        passed to the grouping step is not sorted by start.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-o",
                      "--min-overlap",
                      dest="min_overlap",
                      type="int",
                      help="minimum overlap")

    parser.add_option(
        "-w",
        "--pattern-window",
        dest="pattern_window",
        type="string",
        help="regular expression to extract window coordinates from "
        "test id [%default]")

    parser.add_option("-i",
                      "--invert",
                      dest="invert",
                      action="store_true",
                      help="invert direction of fold change [%default]")

    # raw string: \S and \d are regex escapes, not string escapes
    parser.set_defaults(min_overlap=10,
                        invert=False,
                        pattern_window=r"(\S+):(\d+)-(\d+)")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    outfiles = IOTools.FilePool(options.output_filename_pattern)

    # predicate deciding whether a significant group is reported under
    # the treatment sample (True) or the control sample (False)
    if options.invert:
        def test_f(l2fold):
            return l2fold < 0
    else:
        def test_f(l2fold):
            return l2fold > 0

    def read():
        """Yield one DATA record per input row with parsed coordinates."""

        rx_window = re.compile(options.pattern_window)

        for data in IOTools.iterate(options.stdin):

            match = rx_window.match(data.test_id)
            if match is None:
                raise ValueError(
                    "could not extract window coordinates from test id "
                    "'%s' using pattern '%s'" %
                    (data.test_id, options.pattern_window))

            contig, start, end = match.groups()
            start, end = int(start), int(end)

            # the last field (number of merged windows) is filled in
            # when groups are built
            yield DATA._make(
                (data.test_id, contig, start, end, data.treatment_name,
                 float(data.treatment_mean),
                 float(data.treatment_std), data.control_name,
                 float(data.control_mean), float(data.control_std),
                 float(data.pvalue), float(data.qvalue), float(data.l2fold),
                 float(data.fold), int(data.significant), data.status, 0))

    def grouper(data, distance=10):
        """Yield lists of consecutive, compatible records.

        *data* must be sorted by contig and start. Nothing is yielded
        for empty input.
        """

        entries = []
        last = None

        # A plain for-loop terminates cleanly at the end of the input.
        # Calling next() inside a generator would turn the StopIteration
        # into a RuntimeError (PEP 479).
        for d in data:
            if last is not None:
                if d.contig == last.contig and d.start < last.start:
                    raise ValueError("error not sorted by start")

                if ((d.contig != last.contig)
                        or (d.start - last.end > distance)
                        or (d.status != last.status)
                        or (d.significant != last.significant)
                        or (d.l2fold * last.l2fold < 0)):
                    yield entries
                    entries = []

            entries.append(d)
            last = d

        if entries:
            yield entries

    counter = E.Counter()

    options.stdout.write("\t".join(DATA._fields) + "\n")

    # set of all sample names - used to create empty files
    samples = set()

    # need to sort by coordinate
    all_data = list(read())
    all_data.sort(key=lambda x: (x.contig, x.start))

    group_id = 0

    for group in grouper(iter(all_data), distance=options.min_overlap):
        group_id += 1

        start, end = group[0].start, group[-1].end
        assert start < end, 'start > end: %s' % str(group)
        # the means below are sums of floats, so an integer count is
        # sufficient for the divisions
        n = len(group)
        counter.input += n

        g = group[0]

        # report the fold change closest to zero within the group
        if g.l2fold < 0:
            l2fold = max(x.l2fold for x in group)
            fold = max(x.fold for x in group)
        else:
            l2fold = min(x.l2fold for x in group)
            fold = min(x.fold for x in group)

        treatment_mean = sum(x.treatment_mean for x in group) / n
        control_mean = sum(x.control_mean for x in group) / n

        outdata = DATA._make(
            (str(group_id), g.contig, start, end, g.treatment_name,
             treatment_mean,
             max(x.treatment_std for x in group), g.control_name,
             control_mean, max(x.control_std for x in group),
             max(x.pvalue for x in group), max(x.qvalue for x in group),
             l2fold, fold, g.significant, g.status, n))

        samples.add(g.treatment_name)
        samples.add(g.control_name)
        if g.significant:
            # NOTE(review): the per-sample lines use the coordinates of
            # the first window (g.start, g.end), not the merged
            # start/end of the group - confirm this is intended.
            if test_f(g.l2fold):
                outfiles.write(
                    g.treatment_name, "%s\t%i\t%i\t%i\t%f\n" %
                    (g.contig, g.start, g.end, group_id, treatment_mean))

            else:
                outfiles.write(
                    g.control_name, "%s\t%i\t%i\t%i\t%f\n" %
                    (g.contig, g.start, g.end, group_id, control_mean))

        options.stdout.write("\t".join(map(str, outdata)) + "\n")

        counter.output += 1

    # create empty files
    for sample in samples:
        outfiles.write(sample, "")

    outfiles.close()
    E.info("%s" % counter)

    # write footer and output benchmark information.
    E.stop()