Example #1
import os
from itertools import imap  # Python 2; the builtin map is already lazy on Python 3

# MRCounter and read_files are helpers from the surrounding library.


def combine_counters(work_dir, n_map_shards, n_reduce_shards):
    # Collect the counter files written by every map, combine and reduce shard.
    filenames = [os.path.join(work_dir, 'map-%d.counters' % shard)
                 for shard in range(n_map_shards)]
    filenames += [os.path.join(work_dir, 'combine-%d.counters' % shard)
                  for shard in range(n_map_shards)]
    filenames += [os.path.join(work_dir, 'reduce-%d.counters' % shard)
                  for shard in range(n_reduce_shards)]
    # Not every shard necessarily wrote counters, so keep only existing files.
    return MRCounter.sum(
        imap(MRCounter.deserialize,
             read_files(filter(os.path.exists, filenames))))
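A hedged sketch of a call site, with an illustrative work directory and shard counts (none of these values come from the library):

totals = combine_counters('/tmp/job0', n_map_shards=8, n_reduce_shards=2)
print(totals)  # presumably an aggregated MRCounter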
Example #2
import heapq
import json
import logging
from glob import glob
from itertools import imap  # Python 2; use the builtin map on Python 3
from os.path import join as path_join  # assumed alias, matching the calls below

# parse_args, get_step and read_files are helpers from the surrounding library.
logger = logging.getLogger(__name__)  # assumed; the source module defines its own


def main():

    args = parse_args()

    # count exactly how many input lines we have so we can balance work.
    glob_pattern = path_join(args.work_dir,
                             args.input_prefix + '_count.[0-9]*')
    count_ff = glob(glob_pattern)
    if not count_ff:
        raise RuntimeError("Step {} shuffler: no input files found matching "
                           "pattern {}".format(args.step_idx, glob_pattern))
    logger.info("Step {} shuffler: counting entries from {}"
                .format(args.step_idx, count_ff))
    num_entries = sum(imap(int, read_files(count_ff)))

    in_ff = sorted(glob(path_join(args.work_dir,
                                  args.input_prefix + '.[0-9]*')))
    sources = [open(f, 'r') for f in in_ff]

    step = get_step(args)
    n_output_files = step.n_reducers

    out_format = path_join(args.work_dir, args.output_prefix + '.%d')
    outputs = [open(out_format % i, 'w') for i in range(n_output_files)]

    # To cleanly separate reducer outputs by key groups we need to unpack
    # values while shuffling and compare keys. Every index change has to be
    # accompanied by a key change; otherwise the index change is postponed.
    old_key = None
    old_index = 0
    lines_written = 0
    # heapq.merge requires pre-sorted inputs; assuming each shard is sorted,
    # the merged stream yields lines in global key order.
    for count, line in enumerate(heapq.merge(*sources)):
        key = json.loads(line)[0]
        index = count * n_output_files // num_entries

        # postpone switching to the new index until a key change is also observed
        if old_index != index and old_key != key:
            old_index = index
        outputs[old_index].write(line)
        lines_written += 1

        old_key = key

    for source in sources:
        source.close()

    for output in outputs:
        output.close()

    logger.info("Step {} shuffler: lines written: {}"
                .format(args.step_idx, lines_written))
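The comment above the shuffle loop is the crux: the output index may only advance together with a key change, so one key group never straddles two reducer files. Below is a minimal, self-contained sketch of that rule, with in-memory lists standing in for the merged sources and the output files; split_by_key and its inputs are illustrative, not part of the library.

import json

def split_by_key(lines, n_outputs):
    # lines must arrive sorted by key; each line is a JSON array whose
    # first element is the key, mirroring the shufflers in this section
    outputs = [[] for _ in range(n_outputs)]
    old_key, old_index = None, 0
    for count, line in enumerate(lines):
        key = json.loads(line)[0]
        index = count * n_outputs // len(lines)
        # advance the output index only when the key has also changed
        if old_index != index and old_key != key:
            old_index = index
        outputs[old_index].append(line)
        old_key = key
    return outputs

print(split_by_key(['["a", 1]', '["a", 2]', '["a", 3]', '["b", 4]'], 2))
# -> [['["a", 1]', '["a", 2]', '["a", 3]'], ['["b", 4]']]; the index change
# at the third "a" line is postponed until the key flips to "b"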
Example #3
import heapq
import json
import logging
import re
from glob import glob
from itertools import imap  # Python 2; use the builtin map on Python 3
from os.path import join as path_join  # assumed alias, matching the calls below

# open_gz and read_files are helpers from the surrounding library.
logger = logging.getLogger(__name__)  # assumed; the source module defines its own


def run_shuffle(args):

    # count exactly how many input lines we have so we can balance work.
    glob_pattern = path_join(args.work_dir,
                             (args.input_prefix % '[0-9]*') + '.count')
    count_ff = glob(glob_pattern)
    if not count_ff:
        raise RuntimeError("Step {} shuffler: no input files found matching "
                           "pattern {}".format(args.step_idx, glob_pattern))
    logger.info("Step {} shuffler: counting entries from {}"
                .format(args.step_idx, count_ff))
    num_entries = sum(imap(int, read_files(count_ff)))

    in_pattern = path_join(args.work_dir, args.input_prefix % '[0-9]*')
    in_pattern_re = re.compile(in_pattern)
    logger.info('Looking for files that match %s', in_pattern)

    # since Python's glob does not support extended patterns, filter out bad
    # matches with a regex
    in_ff = sorted([f for f in glob(in_pattern)
                    if in_pattern_re.match(f) is not None])
    logger.info('Found files: {}'.format(in_ff))
    sources = [open_gz(f, 'r') for f in in_ff]

    n_output_files = args.n_reducers

    out_format = path_join(args.work_dir, args.output_prefix % '%d')
    outputs = [open_gz(out_format % i, 'w') for i in range(n_output_files)]

    # To cleanly separate reducer outputs by key groups we need to unpack
    # values while shuffling and compare keys. Every index change has to be
    # accompanied by a key change; otherwise the index change is postponed.
    old_key = None
    old_index = 0
    lines_written = 0
    for count, line in enumerate(heapq.merge(*sources)):
        key = json.loads(line)[0]
        index = count * n_output_files // num_entries

        # postpone switching to the new index until a key change is also observed
        if old_index != index and old_key != key:
            old_index = index
        outputs[old_index].write(line)
        lines_written += 1

        old_key = key

    for source in sources:
        source.close()

    for output in outputs:
        output.close()

    # touch empty sentinel files so downstream steps can tell shuffling is done
    done_pattern = path_join(args.work_dir, "shuffle-%d.done")
    done_names = [done_pattern % i for i in range(n_output_files)]
    for name in done_names:
        with open(name, 'w') as fhandle:
            fhandle.write('')

    logger.info("Step {} shuffler: lines written: {}"
                .format(args.step_idx, lines_written))
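The empty shuffle-%d.done files act as sentinels marking that shuffling has finished. A sketch of how a downstream step might wait for them; wait_for_done_files is a hypothetical helper written under that assumption, not part of the library.

import os
import time

def wait_for_done_files(work_dir, n_output_files, poll_secs=1.0):
    # hypothetical: block until every shuffle sentinel file exists
    done_names = [os.path.join(work_dir, 'shuffle-%d.done' % i)
                  for i in range(n_output_files)]
    while not all(os.path.exists(name) for name in done_names):
        time.sleep(poll_secs)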