def main():
    args = parse_args()

    shards = list(map(int, args.shards.split(',')))
    logger.info("Scheduling shards {} on one reducer node".format(shards))
    pool = Pool(processes=len(shards))

    # Note: we pack the arguments to do_shard into a tuple, since
    # multiprocessing's Pool.map only supports single-argument functions and a
    # lambda would raise a pickling error (pickle cannot serialize lambdas).
    # A standalone toy sketch of this pattern follows this function.
    pool.map(do_shard, [(args, shard) for shard in shards])
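
# A standalone toy sketch (not part of mrdomino) of the tuple-packing pattern
# used above: Pool.map passes exactly one argument per call, so the extra
# arguments are packed into a tuple here and unpacked inside the worker, just
# as do_shard does further below.
def _toy_worker(t):
    base, shard = t          # unwrap the packed arguments
    return base + shard

# Pool(processes=2).map(_toy_worker, [(100, s) for s in (0, 1, 2)])
# -> [100, 101, 102]
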
def main():

    args = parse_args()

    # count exactly how many input lines we have so we can balance work.
    glob_pattern = path_join(args.work_dir,
                             args.input_prefix + '_count.[0-9]*')
    count_ff = glob(glob_pattern)
    if not count_ff:
        raise RuntimeError("Step {} shuffler: not input files found matching "
                           "pattern {}".format(args.step_idx, glob_pattern))
    logger.info("Step {} shuffler: counting entries from {}"
                .format(args.step_idx, count_ff))
    num_entries = sum(imap(int, read_files(count_ff)))

    in_ff = sorted(glob(path_join(args.work_dir,
                                  args.input_prefix + '.[0-9]*')))
    sources = [open(f, 'r') for f in in_ff]

    step = get_step(args)
    n_output_files = step.n_reducers

    out_format = path_join(args.work_dir, args.output_prefix + '.%d')
    outputs = [open(out_format % i, 'w') for i in range(n_output_files)]

    # To cleanly separate reducer outputs by key group, we unpack each line
    # while shuffling and compare keys: the output index may only advance when
    # the key also changes; otherwise the switch is postponed. A toy
    # illustration of this rule follows this function.
    old_key = None
    old_index = 0
    lines_written = 0
    for count, line in enumerate(heapq.merge(*sources)):
        key = json.loads(line)[0]
        index = count * n_output_files // num_entries

        # postpone switching to the new index until the key also changes
        if old_index != index and old_key != key:
            old_index = index
        outputs[old_index].write(line)
        lines_written += 1

        old_key = key

    for source in sources:
        source.close()

    for output in outputs:
        output.close()

    logger.info("Step {} shuffler: lines written: {}"
                .format(args.step_idx, lines_written))
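
# A toy illustration (not part of the pipeline) of the boundary rule used in
# the shuffler above: the target output index advances with the running line
# count, but the switch is applied only when the key changes, so all lines
# sharing a key land in the same output shard.
def _toy_assign(keys, n_outputs):
    total = len(keys)
    assignment = []
    old_key, old_index = None, 0
    for count, key in enumerate(keys):
        index = count * n_outputs // total
        if old_index != index and old_key != key:
            old_index = index
        assignment.append((key, old_index))
        old_key = key
    return assignment

# _toy_assign(['a', 'a', 'a', 'b', 'b', 'c'], 2)
# -> [('a', 0), ('a', 0), ('a', 0), ('b', 1), ('b', 1), ('c', 1)]
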
def do_shard(t):
    # Uses a workaround to log the traceback of uncaught exceptions (which
    # python's multiprocessing module fails to surface by default):
    # http://seasonofcode.com/posts/python-multiprocessing-and-exceptions.html
    try:
        args, shard = t  # unwrap argument
        with MRTimer() as timer:
            reduce_one_shard.reduce(shard, args)
        logger.info("Shard {} reduced: {}".format(shard, str(timer)))
    except Exception:
        exc_buffer = StringIO()
        traceback.print_exc(file=exc_buffer)
        logger.error('Uncaught exception while reducing shard {}. {}'
                     .format(shard, exc_buffer.getvalue()))
        # re-raise with a bare `raise` so the original traceback is preserved
        raise
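
# The same workaround in reusable form (a sketch, not part of mrdomino): a
# decorator that logs the full traceback of any exception raised inside a
# Pool worker before letting it propagate back to the parent process.
def _log_worker_exceptions(func):
    def wrapper(*fargs, **fkwargs):
        try:
            return func(*fargs, **fkwargs)
        except Exception:
            logger.error('Uncaught exception in worker:\n%s',
                         traceback.format_exc())
            raise
    return wrapper
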
def main():
    args = parse_args()

    # find the combine function.
    job = get_instance(args)
    step = job.get_step(args.step_idx)
    combine_func = step.combiner

    shard = args.shard
    in_fh = args.input
    out_fn = path_join(args.work_dir, args.output_prefix + '.%d' % shard)
    logger.info("combiner {}: output -> {}".format(shard, out_fn))

    last_key = None
    values = []

    count_written = 0
    count_seen = 0
    with open(out_fn, 'w') as out_fh:
        for line in in_fh:
            count_seen += 1
            key, value = json.loads(line)
            if key == last_key:
                # extend previous run
                values.append(value)
            else:
                # end previous run
                if values:
                    for kv in combine_func(last_key, values):
                        count_written += 1
                        out_fh.write(json.dumps(kv) + '\n')

                # start new run
                last_key = key
                values = [value]
        # dump any remaining values
        if values:
            for kv in combine_func(last_key, values):
                count_written += 1
                out_fh.write(json.dumps(kv) + '\n')

    counters = job._counters
    counters.incr("combiner", "seen", count_seen)
    counters.incr("combiner", "written", count_written)

    # write out the counters to file.
    f = path_join(args.work_dir, 'combine.counters.%d' % shard)
    logger.info("combiner {}: counters -> {}".format(shard, f))
    with open(f, 'w') as fh:
        fh.write(counters.serialize())

    # write how many entries were written for reducer balancing purposes.
    f = path_join(args.work_dir, args.output_prefix + '_count.%d' % shard)
    logger.info("combiner {}: lines written -> {}".format(shard, f))
    with open(f, 'w') as fh:
        fh.write(str(count_written))
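
# For reference, a sketch (not part of mrdomino) of the same consecutive-key
# grouping written with itertools.groupby. It relies on the same property as
# the loop above: the combiner's input arrives sorted by key, so equal keys
# are adjacent.
def _toy_grouped(json_lines):
    from itertools import groupby
    from operator import itemgetter
    pairs = (json.loads(line) for line in json_lines)
    for key, run in groupby(pairs, key=itemgetter(0)):
        yield key, [value for _, value in run]

# for key, values in _toy_grouped(in_fh):
#     for kv in combine_func(key, values):
#         out_fh.write(json.dumps(kv) + '\n')
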
def schedule_machines(args, command, done_file_pattern, n_shards):

    def wrap_cmd(command, use_domino):
        if use_domino:
            pre = 'domino run %s ' % EXEC_SCRIPT
            post = ''
        else:
            pre = '%s ' % EXEC_SCRIPT
            post = ' &'
        return '%s%s%s' % (pre, command, post)

    shard2state = {shard: ShardState.NOT_STARTED for shard in range(n_shards)}

    while True:
        # go to disk and look for shard done files.
        update_shards_done(args, done_file_pattern, n_shards, args.use_domino,
                           shard2state)

        logger.info(show_shard_state(shard2state, args.n_shards_per_machine))

        if are_all_shards_done(shard2state):
            break

        # if we can start any more domino jobs (per n_concurrent_machines
        # restriction), get the ones to start.
        start_me = get_shard_groups_to_start(
            args.n_concurrent_machines, args.n_shards_per_machine, shard2state)

        # start the jobs.
        if start_me:
            logger.info('Starting shard groups: %s', start_me)
        for shards in start_me:
            # execute command.
            cmd = command % ','.join(map(str, shards))
            cmd = wrap_cmd(cmd, args.use_domino)
            logger.info("Starting process: {}".format(cmd))
            os.system(cmd)

            # note them as started.
            for shard in shards:
                shard2state[shard] = ShardState.IN_PROGRESS

        # wait to poll.
        time.sleep(args.poll_done_interval_sec)
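
# A stripped-down sketch (not the real update_shards_done) of the done-file
# protocol polled above: each worker drops an empty '<name>.done.<shard>'
# file when it finishes, and the scheduler simply re-globs the work directory
# until every shard has one.
def _toy_wait_for_done_files(work_dir, name, n_shards, poll_sec=5):
    pattern = os.path.join(work_dir, '%s.done.[0-9]*' % name)
    while len(glob(pattern)) < n_shards:
        time.sleep(poll_sec)
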
def map(shard, args):

    # find the map function.
    job = get_instance(args)
    step = job.get_step(args.step_idx)
    map_func = step.mapper
    n_shards = step.n_mappers
    combine_func = step.combiner

    assert 0 <= shard < n_shards

    if combine_func is None:
        out_fn = path_join(args.work_dir, args.output_prefix + '.%d' % shard)
        logger.info("mapper {}: output -> {}".format(shard, out_fn))
        proc_sort = Popen(['sort', '-o', out_fn], bufsize=4096, stdin=PIPE)
        proc = proc_sort
    else:
        cmd_opts = ['python', '-m', 'mrdomino.combine',
                    '--job_module', args.job_module,
                    '--job_class', args.job_class,
                    '--step_idx', str(args.step_idx),
                    '--work_dir', args.work_dir,
                    '--output_prefix', args.output_prefix,
                    '--shard', str(shard)]
        logger.info("mapper {}: starting combiner: {}"
                    .format(shard, create_cmd(cmd_opts)))
        proc_combine = Popen(cmd_opts, bufsize=4096, stdin=PIPE)
        proc_sort = Popen(['sort'], bufsize=4096, stdin=PIPE,
                          stdout=proc_combine.stdin)
        proc = proc_combine

    if args.step_idx == 0:
        # first step
        if job.INPUT_PROTOCOL == protocol.JSONProtocol:
            unpack_tuple = True
        elif job.INPUT_PROTOCOL == protocol.JSONValueProtocol:
            unpack_tuple = False
        else:
            raise ValueError("unsupported protocol: {}"
                             .format(job.INPUT_PROTOCOL))
    elif args.step_idx > 0:
        # intermediate step
        if job.INTERNAL_PROTOCOL == protocol.JSONProtocol:
            unpack_tuple = True
        elif job.INTERNAL_PROTOCOL == protocol.JSONValueProtocol:
            unpack_tuple = False
        else:
            raise ValueError("unsupported protocol: {}"
                             .format(job.INTERNAL_PROTOCOL))
    else:
        raise ValueError("step_idx={} cannot be negative"
                         .format(args.step_idx))

    # process each line of input and feed it to sort for the merge step.
    # the with block ensures that proc_sort.stdin is closed on exit, so it
    # won't block the pipeline.
    count_written = 0
    count_seen = 0
    with proc_sort.stdin as in_fh:
        for line in each_input_line(args.input_files, shard, n_shards):
            count_seen += 1
            kv = json.loads(line)
            k, v = kv if unpack_tuple else (None, kv)
            for kv in map_func(k, v):
                in_fh.write(json.dumps(kv) + '\n')
                count_written += 1

    counters = job._counters
    counters.incr("mapper", "seen", count_seen)
    counters.incr("mapper", "written", count_written)

    # write out the counters to file.
    f = path_join(args.work_dir, 'map.counters.%d' % shard)
    logger.info("mapper {}: counters -> {}".format(shard, f))
    with open(f, 'w') as fh:
        fh.write(counters.serialize())

    # write how many entries were written for reducer balancing purposes.
    # note that if combiner is present, we delegate this responsibility to it.
    if combine_func is None:
        f = path_join(args.work_dir, args.output_prefix + '_count.%d' % shard)
        logger.info("mapper {}: lines written -> {}".format(shard, f))
        with open(f, 'w') as fh:
            fh.write(str(count_written))

    # communicate() waits for the subprocess (the combiner if present,
    # otherwise sort) to terminate, closing its stdin first.
    proc.communicate()

    # finally note that we are done.
    f = path_join(args.work_dir, 'map.done.%d' % shard)
    logger.info("mapper {}: done -> {}".format(shard, f))
    with open(f, 'w') as fh:
        fh.write('')
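
# A minimal standalone sketch (not part of mrdomino) of the Popen pipeline
# built above: lines written to the sort process come out sorted on the
# downstream consumer's stdin. The consumer here is plain `cat`; in the real
# mapper it is the mrdomino.combine subprocess.
def _toy_sort_pipeline(lines):
    consumer = Popen(['cat'], bufsize=4096, stdin=PIPE)
    sorter = Popen(['sort'], bufsize=4096, stdin=PIPE, stdout=consumer.stdin)
    with sorter.stdin as fh:
        for line in lines:
            fh.write(line)
    sorter.wait()
    # communicate() drops our handle on the consumer's stdin so it sees EOF
    consumer.communicate()

# _toy_sort_pipeline([b'b\n', b'a\n', b'c\n'])   # prints a, b, c in order
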
def main():

    args = parse_args()
    logger.info('Mapreduce step: %s', args)

    logger.info('%d input files.', len(args.input_files))

    work_dir = args.work_dir
    logger.info('Working directory: %s', work_dir)

    job = get_instance(args)
    step = job.get_step(args.step_idx)
    logger.info('Starting %d mappers.', step.n_mappers)

    # create map command
    cmd_opts = [
        'mrdomino.map_one_machine',
        '--step_idx', args.step_idx,
        '--shards', '%s',
        '--input_files', ' '.join(args.input_files),
        '--job_module', args.job_module,
        '--job_class', args.job_class,
        '--work_dir', work_dir
    ]
    cmd = create_cmd(cmd_opts)

    schedule_machines(
        args,
        command=cmd,
        done_file_pattern=os.path.join(work_dir, 'map.done.%d'),
        n_shards=step.n_mappers)

    counter = combine_counters(
        work_dir, step.n_mappers, step.n_reducers)

    # shuffle mapper outputs to reducer inputs.
    cmd = create_cmd([EXEC_SCRIPT, 'mrdomino.shuffle',
                      '--work_dir', work_dir,
                      '--input_prefix', 'map.out',
                      '--output_prefix', 'reduce.in',
                      '--job_module', args.job_module,
                      '--job_class', args.job_class,
                      '--step_idx', args.step_idx])
    wait_cmd(cmd, logger, "Shuffling")

    logger.info('Starting %d reducers.', step.n_reducers)
    cmd = create_cmd(['mrdomino.reduce_one_machine',
                      '--step_idx', args.step_idx,
                      '--shards', '%s',
                      '--job_module', args.job_module,
                      '--job_class', args.job_class,
                      '--input_prefix', 'reduce.in',
                      '--work_dir', work_dir])
    schedule_machines(
        args,
        command=cmd,
        done_file_pattern=os.path.join(work_dir, 'reduce.done.%d'),
        n_shards=step.n_reducers)

    counter = combine_counters(
        work_dir, step.n_mappers, step.n_reducers)
    logger.info(('Step %d counters:\n' % args.step_idx) + counter.show())

    if args.step_idx == args.total_steps - 1:

        logger.info('Joining reduce outputs')

        if job.INTERNAL_PROTOCOL == protocol.JSONProtocol and \
                job.OUTPUT_PROTOCOL == protocol.JSONValueProtocol:
            unpack_tuple = True
        elif job.INTERNAL_PROTOCOL == protocol.JSONValueProtocol and \
                job.OUTPUT_PROTOCOL == protocol.JSONProtocol:
            raise RuntimeError("if internal protocol is value-based, "
                               "output protocol must also be so")
        elif job.INTERNAL_PROTOCOL == protocol.JSONProtocol and \
                job.OUTPUT_PROTOCOL == protocol.JSONProtocol:
            unpack_tuple = False
        elif job.INTERNAL_PROTOCOL == protocol.JSONValueProtocol and \
                job.OUTPUT_PROTOCOL == protocol.JSONValueProtocol:
            unpack_tuple = False
        else:
            raise ValueError(
                "unsupported protocol combination: internal={}, output={}"
                .format(job.INTERNAL_PROTOCOL, job.OUTPUT_PROTOCOL))

        # make sure that files are sorted by shard number
        glob_prefix = 'reduce.out'
        filenames = glob(path_join(work_dir, glob_prefix + '.[0-9]*'))
        prefix_match = re.compile(
            r'.*\b' + re.escape(glob_prefix) + r'\.(\d+)$')
        presorted = []
        for filename in filenames:
            match = prefix_match.match(filename)
            if match is not None:
                presorted.append((int(match.group(1)), filename))
        filenames = [filename for _, filename in sorted(presorted)]
        out_f = path_join(args.output_dir, 'reduce.out')
        with open(out_f, 'w') as out_fh:
            for kv in read_lines(filenames):
                if unpack_tuple:
                    _, v = json.loads(kv)
                    v = json.dumps(v) + "\n"
                else:
                    v = kv
                out_fh.write(v)

    # done.
    logger.info('Mapreduce step done.')
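
# The four INTERNAL/OUTPUT protocol combinations handled in the final join
# above, restated as a small lookup table (a sketch, not used by the code):
# True means the join strips the key and writes only the JSON value.
_UNPACK_FOR_JOIN = {
    (protocol.JSONProtocol, protocol.JSONValueProtocol): True,
    (protocol.JSONProtocol, protocol.JSONProtocol): False,
    (protocol.JSONValueProtocol, protocol.JSONValueProtocol): False,
    # (JSONValueProtocol, JSONProtocol) is rejected outright above.
}
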
def reduce(shard, args):

    # find the reduce function.
    job = get_instance(args)
    step = job.get_step(args.step_idx)
    reduce_func = step.reducer

    # default to work_dir if output_dir is not set
    work_dir = args.work_dir
    output_dir = args.output_dir
    if output_dir is None:
        output_dir = work_dir

    # process each (key, value) pair.
    out_fn = path_join(output_dir, args.output_prefix + '.%d' % shard)
    logger.info("reducer {}: output -> {}".format(shard, out_fn))

    assert args.input_prefix is not None
    in_f = path_join(work_dir, args.input_prefix + '.%d' % shard)
    logger.info("reducer {}: input <- {}".format(shard, in_f))
    input_stream = partial(open, in_f, 'r')

    if args.step_idx >= 0:
        if job.INTERNAL_PROTOCOL == protocol.JSONProtocol:
            unpack_tuple = False
        elif job.INTERNAL_PROTOCOL == protocol.JSONValueProtocol:
            unpack_tuple = True
        else:
            raise ValueError("unsupported protocol: {}"
                             .format(job.INTERNAL_PROTOCOL))
    else:
        raise ValueError("step_idx={} cannot be negative"
                         .format(args.step_idx))

    count_written = 0
    count_seen = 0
    with nested_context(input_stream(), open(out_fn, 'w')) as (in_fh, out_fh):
        last_key = None
        values = []
        for line in in_fh:
            count_seen += 1
            key, value = json.loads(line)
            if key == last_key:
                # extend previous run
                values.append(value)
            else:
                # end previous run
                if values:
                    for kv in reduce_func(last_key, values):
                        k, v = kv if unpack_tuple else (None, kv)
                        count_written += 1
                        out_fh.write(json.dumps(v) + '\n')

                # start new run
                last_key = key
                values = [value]
        # dump any remaining values
        if values:
            for kv in reduce_func(last_key, values):
                k, v = kv if unpack_tuple else (None, kv)
                count_written += 1
                out_fh.write(json.dumps(v) + '\n')

    counters = job._counters
    counters.incr("reducer", "seen", count_seen)
    counters.incr("reducer", "written", count_written)

    # write out the counters to file.
    f = path_join(output_dir, 'reduce.counters.%d' % shard)
    logger.info("reducer {}: counters -> {}".format(shard, f))
    with open(f, 'w') as fh:
        fh.write(counters.serialize())

    # finally note that we are done.
    f = path_join(output_dir, 'reduce.done.%d' % shard)
    logger.info("reducer {}: done -> {}".format(shard, f))
    with open(f, 'w') as fh:
        fh.write('')