def main():
    args = parse_args()
    shards = map(int, args.shards.split(','))
    logger.info("Scheduling shards {} on one reducer node".format(shards))
    pool = Pool(processes=len(shards))
    # Note: wrapping arguments to do_shard into a tuple since multiprocessing
    # does not support map functions with >1 argument, and using a lambda
    # will result in a pickling error since python's pickle is horrible
    pool.map(do_shard, [(args, shard) for shard in shards])
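# Illustrative sketch, not part of the module: why the (args, shard) tuple
# wrapping above is needed. Pool.map passes exactly one positional argument
# per call, and wrapping do_shard in a lambda instead fails because lambdas
# cannot be pickled for transfer to the worker processes. The helper below
# and the sample values are hypothetical.
def _example_worker(t):
    config, shard = t  # unpack the single tuple argument, as do_shard does
    return '%s:%d' % (config, shard)

# Usage (expected results shown as comments):
#   Pool(processes=2).map(_example_worker, [('cfg', 0), ('cfg', 1)])
#       -> ['cfg:0', 'cfg:1']
#   pickle.dumps(lambda shard: _example_worker(('cfg', shard)))
#       -> raises pickle.PicklingError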
def main():
    args = parse_args()

    # count exactly how many input lines we have so we can balance work.
    glob_pattern = path_join(args.work_dir, args.input_prefix + '_count.[0-9]*')
    count_ff = glob(glob_pattern)
    if not count_ff:
        raise RuntimeError("Step {} shuffler: no input files found matching "
                           "pattern {}".format(args.step_idx, glob_pattern))
    logger.info("Step {} shuffler: counting entries from {}"
                .format(args.step_idx, count_ff))
    num_entries = sum(imap(int, read_files(count_ff)))

    in_ff = sorted(glob(path_join(args.work_dir,
                                  args.input_prefix + '.[0-9]*')))
    sources = [open(f, 'r') for f in in_ff]

    step = get_step(args)
    n_output_files = step.n_reducers
    out_format = path_join(args.work_dir, args.output_prefix + '.%d')
    outputs = [open(out_format % i, 'w') for i in range(n_output_files)]

    # To cleanly separate reducer outputs by key groups we need to unpack
    # values on shuffling and compare keys. Every index change has to be
    # accompanied by a key change; otherwise the index change is postponed.
    old_key = None
    old_index = 0
    lines_written = 0
    for count, line in enumerate(heapq.merge(*sources)):
        key = json.loads(line)[0]
        index = count * n_output_files / num_entries
        # postpone switching to the new index until a change in key is also
        # observed
        if old_index != index and old_key != key:
            old_index = index
        outputs[old_index].write(line)
        lines_written += 1
        old_key = key

    for source in sources:
        source.close()
    for output in outputs:
        output.close()

    logger.info("Step {} shuffler: lines written: {}"
                .format(args.step_idx, lines_written))
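# Illustrative sketch, not part of the module: how the balancing formula
# above spreads merged lines across reducer files while keeping every line
# that shares a key in the same file. `_example_shuffle` and the sample data
# are hypothetical; // mirrors the integer division that `/` performs on
# Python 2 ints above.
def _example_shuffle(sorted_pairs, n_output_files):
    num_entries = len(sorted_pairs)
    assignments = []
    old_key = None
    old_index = 0
    for count, (key, _value) in enumerate(sorted_pairs):
        index = count * n_output_files // num_entries
        # switch output files only once the key has also changed
        if old_index != index and old_key != key:
            old_index = index
        assignments.append((key, old_index))
        old_key = key
    return assignments

# Usage: four sorted entries over two reducers. The third 'a' stays in file 0
# even though the formula already points at file 1, so all 'a' values reach
# the same reducer:
#   _example_shuffle([('a', 1), ('a', 2), ('a', 3), ('b', 4)], 2)
#       -> [('a', 0), ('a', 0), ('a', 0), ('b', 1)]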
def do_shard(t):
    # Uses workaround to show traceback of uncaught exceptions (which by
    # default python's multiprocessing module fails to provide):
    # http://seasonofcode.com/posts/python-multiprocessing-and-exceptions.html
    try:
        args, shard = t  # unwrap argument
        with MRTimer() as timer:
            reduce_one_shard.reduce(shard, args)
        logger.info("Shard {} reduced: {}".format(shard, str(timer)))
    except Exception as e:
        exc_buffer = StringIO()
        traceback.print_exc(file=exc_buffer)
        logger.error('Uncaught exception while reducing shard {}. {}'
                     .format(shard, exc_buffer.getvalue()))
        raise e
def main():
    args = parse_args()

    # find the combine function.
    job = get_instance(args)
    step = job.get_step(args.step_idx)
    combine_func = step.combiner

    shard = args.shard
    in_fh = args.input
    out_fn = path_join(args.work_dir, args.output_prefix + '.%d' % shard)
    logger.info("combiner {}: output -> {}".format(shard, out_fn))

    last_key = None
    values = []
    count_written = 0
    count_seen = 0
    with open(out_fn, 'w') as out_fh:
        for line in in_fh:
            count_seen += 1
            key, value = json.loads(line)
            if key == last_key:
                # extend previous run
                values.append(value)
            else:
                # end previous run
                if values:
                    for kv in combine_func(last_key, values):
                        count_written += 1
                        out_fh.write(json.dumps(kv) + '\n')
                # start new run
                last_key = key
                values = [value]
        # dump any remaining values
        if values:
            for kv in combine_func(last_key, values):
                count_written += 1
                out_fh.write(json.dumps(kv) + '\n')

    counters = job._counters
    counters.incr("combiner", "seen", count_seen)
    counters.incr("combiner", "written", count_written)

    # write out the counters to file.
    f = path_join(args.work_dir, 'combine.counters.%d' % shard)
    logger.info("combiner {}: counters -> {}".format(shard, f))
    with open(f, 'w') as fh:
        fh.write(counters.serialize())

    # write how many entries were written for reducer balancing purposes.
    f = path_join(args.work_dir, args.output_prefix + '_count.%d' % shard)
    logger.info("combiner {}: lines written -> {}".format(shard, f))
    with open(f, 'w') as fh:
        fh.write(str(count_written))
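# Illustrative sketch, not part of the module: the run-grouping loop above
# relies on the combiner input being sorted by key, so that all values for a
# key arrive consecutively. A toy combiner that sums values shows the effect;
# `_example_sum_combiner`, `_example_combine_lines` and the sample lines are
# hypothetical (uses the module's json import).
def _example_sum_combiner(key, values):
    yield [key, sum(values)]

def _example_combine_lines(lines, combine_func):
    out = []
    last_key, values = None, []
    for line in lines:
        key, value = json.loads(line)
        if key == last_key:
            values.append(value)      # extend the current run
        else:
            if values:                # close the previous run
                out.extend(combine_func(last_key, values))
            last_key, values = key, [value]
    if values:                        # close the final run
        out.extend(combine_func(last_key, values))
    return out

# Usage:
#   _example_combine_lines(['["a", 1]', '["a", 2]', '["b", 5]'],
#                          _example_sum_combiner)
#       -> [['a', 3], ['b', 5]]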
def schedule_machines(args, command, done_file_pattern, n_shards):

    def wrap_cmd(command, use_domino):
        if use_domino:
            pre = 'domino run %s ' % EXEC_SCRIPT
            post = ''
        else:
            pre = '%s ' % EXEC_SCRIPT
            post = ' &'
        return '%s%s%s' % (pre, command, post)

    shard2state = dict(zip(
        range(n_shards),
        [ShardState.NOT_STARTED] * n_shards))

    while True:
        # go to disk and look for shard done files.
        update_shards_done(args, done_file_pattern, n_shards,
                           args.use_domino, shard2state)

        logger.info(show_shard_state(shard2state, args.n_shards_per_machine))

        if are_all_shards_done(shard2state):
            break

        # if we can start any more domino jobs (per n_concurrent_machines
        # restriction), get the ones to start.
        start_me = get_shard_groups_to_start(
            args.n_concurrent_machines, args.n_shards_per_machine, shard2state)

        # start the jobs.
        if start_me:
            logger.info('Starting shard groups: %s', start_me)
            for shards in start_me:
                # execute command.
                cmd = command % ','.join(map(str, shards))
                cmd = wrap_cmd(cmd, args.use_domino)
                logger.info("Starting process: {}".format(cmd))
                os.system(cmd)

                # note them as started.
                for shard in shards:
                    shard2state[shard] = ShardState.IN_PROGRESS

        # wait to poll.
        time.sleep(args.poll_done_interval_sec)
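# Illustrative sketch, not part of the module: the two command shapes
# wrap_cmd above produces. The './exec.sh' value is only an example; the
# real EXEC_SCRIPT is defined elsewhere in the package.
#   use_domino=True  -> 'domino run ./exec.sh <command>'
#   use_domino=False -> './exec.sh <command> &'   (backgrounded local run)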
def map(shard, args):
    # find the map function.
    job = get_instance(args)
    step = job.get_step(args.step_idx)
    map_func = step.mapper
    n_shards = step.n_mappers
    combine_func = step.combiner
    assert 0 <= shard < n_shards

    if combine_func is None:
        out_fn = path_join(args.work_dir, args.output_prefix + '.%d' % shard)
        logger.info("mapper {}: output -> {}".format(shard, out_fn))
        proc_sort = Popen(['sort', '-o', out_fn], bufsize=4096, stdin=PIPE)
        proc = proc_sort
    else:
        cmd_opts = ['python', '-m', 'mrdomino.combine',
                    '--job_module', args.job_module,
                    '--job_class', args.job_class,
                    '--step_idx', str(args.step_idx),
                    '--work_dir', args.work_dir,
                    '--output_prefix', args.output_prefix,
                    '--shard', str(shard)]
        logger.info("mapper {}: starting combiner: {}"
                    .format(shard, create_cmd(cmd_opts)))
        proc_combine = Popen(cmd_opts, bufsize=4096, stdin=PIPE)
        proc_sort = Popen(['sort'], bufsize=4096,
                          stdin=PIPE, stdout=proc_combine.stdin)
        proc = proc_combine

    if args.step_idx == 0:
        # first step
        if job.INPUT_PROTOCOL == protocol.JSONProtocol:
            unpack_tuple = True
        elif job.INPUT_PROTOCOL == protocol.JSONValueProtocol:
            unpack_tuple = False
        else:
            raise ValueError("unsupported protocol: {}"
                             .format(job.INPUT_PROTOCOL))
    elif args.step_idx > 0:
        # intermediate step
        if job.INTERNAL_PROTOCOL == protocol.JSONProtocol:
            unpack_tuple = True
        elif job.INTERNAL_PROTOCOL == protocol.JSONValueProtocol:
            unpack_tuple = False
        else:
            raise ValueError("unsupported protocol: {}"
                             .format(job.INTERNAL_PROTOCOL))
    else:
        raise ValueError("step_idx={} cannot be negative"
                         .format(args.step_idx))

    # process each line of input and sort for the merge step.
    # using a with block here ensures that proc_sort.stdin is closed on exit
    # and that it won't block the pipeline
    count_written = 0
    count_seen = 0
    with proc_sort.stdin as in_fh:
        for line in each_input_line(args.input_files, shard, n_shards):
            count_seen += 1
            kv = json.loads(line)
            k, v = kv if unpack_tuple else (None, kv)
            for kv in map_func(k, v):
                in_fh.write(json.dumps(kv) + '\n')
                count_written += 1

    counters = job._counters
    counters.incr("mapper", "seen", count_seen)
    counters.incr("mapper", "written", count_written)

    # write out the counters to file.
    f = path_join(args.work_dir, 'map.counters.%d' % shard)
    logger.info("mapper {}: counters -> {}".format(shard, f))
    with open(f, 'w') as fh:
        fh.write(counters.serialize())

    # write how many entries were written for reducer balancing purposes.
    # note that if combiner is present, we delegate this responsibility to it.
    if combine_func is None:
        f = path_join(args.work_dir, args.output_prefix + '_count.%d' % shard)
        logger.info("mapper {}: lines written -> {}".format(shard, f))
        with open(f, 'w') as fh:
            fh.write(str(count_written))

    # `communicate' will wait for subprocess to terminate
    comb_stdout, comb_stderr = proc.communicate()

    # finally note that we are done.
    f = path_join(args.work_dir, 'map.done.%d' % shard)
    logger.info("mapper {}: done -> {}".format(shard, f))
    with open(f, 'w') as fh:
        fh.write('')
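# Illustrative sketch, not part of the module: the line formats the
# unpack_tuple flag above distinguishes. With a JSON key/value protocol a
# line decodes to a [key, value] pair; with a JSON value protocol the whole
# decoded line is the value and the mapper receives key=None. The helper and
# sample lines are hypothetical (uses the module's json import).
def _example_decode_line(line, unpack_tuple):
    kv = json.loads(line)
    k, v = kv if unpack_tuple else (None, kv)
    return k, v

# Usage:
#   _example_decode_line('["user42", {"clicks": 3}]', unpack_tuple=True)
#       -> ('user42', {'clicks': 3})
#   _example_decode_line('{"clicks": 3}', unpack_tuple=False)
#       -> (None, {'clicks': 3})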
def main():
    args = parse_args()
    logger.info('Mapreduce step: %s', args)
    logger.info('%d input files.', len(args.input_files))

    work_dir = args.work_dir
    logger.info('Working directory: %s', work_dir)

    job = get_instance(args)
    step = job.get_step(args.step_idx)

    logger.info('Starting %d mappers.', step.n_mappers)

    # create map command
    cmd_opts = [
        'mrdomino.map_one_machine',
        '--step_idx', args.step_idx,
        '--shards', '%s',
        '--input_files', ' '.join(args.input_files),
        '--job_module', args.job_module,
        '--job_class', args.job_class,
        '--work_dir', work_dir
    ]
    cmd = create_cmd(cmd_opts)
    schedule_machines(
        args,
        command=cmd,
        done_file_pattern=os.path.join(work_dir, 'map.done.%d'),
        n_shards=step.n_mappers)

    counter = combine_counters(
        work_dir, step.n_mappers, step.n_reducers)

    # shuffle mapper outputs to reducer inputs.
    cmd = create_cmd([EXEC_SCRIPT, 'mrdomino.shuffle',
                      '--work_dir', work_dir,
                      '--input_prefix', 'map.out',
                      '--output_prefix', 'reduce.in',
                      '--job_module', args.job_module,
                      '--job_class', args.job_class,
                      '--step_idx', args.step_idx])
    wait_cmd(cmd, logger, "Shuffling")

    logger.info('Starting %d reducers.', step.n_reducers)
    cmd = create_cmd(['mrdomino.reduce_one_machine',
                      '--step_idx', args.step_idx,
                      '--shards', '%s',
                      '--job_module', args.job_module,
                      '--job_class', args.job_class,
                      '--input_prefix', 'reduce.in',
                      '--work_dir', work_dir])
    schedule_machines(
        args,
        command=cmd,
        done_file_pattern=os.path.join(work_dir, 'reduce.done.%d'),
        n_shards=step.n_reducers)

    counter = combine_counters(
        work_dir, step.n_mappers, step.n_reducers)

    logger.info(('Step %d counters:\n' % args.step_idx) + counter.show())

    if args.step_idx == args.total_steps - 1:
        logger.info('Joining reduce outputs')

        if job.INTERNAL_PROTOCOL == protocol.JSONProtocol and \
                job.OUTPUT_PROTOCOL == protocol.JSONValueProtocol:
            unpack_tuple = True
        elif job.INTERNAL_PROTOCOL == protocol.JSONValueProtocol and \
                job.OUTPUT_PROTOCOL == protocol.JSONProtocol:
            raise RuntimeError("if internal protocol is value-based, "
                               "output protocol must also be so")
        elif job.INTERNAL_PROTOCOL == protocol.JSONProtocol and \
                job.OUTPUT_PROTOCOL == protocol.JSONProtocol:
            unpack_tuple = False
        elif job.INTERNAL_PROTOCOL == protocol.JSONValueProtocol and \
                job.OUTPUT_PROTOCOL == protocol.JSONValueProtocol:
            unpack_tuple = False
        else:
            raise ValueError("unsupported output protocol: {}"
                             .format(job.OUTPUT_PROTOCOL))

        # make sure that files are sorted by shard number
        glob_prefix = 'reduce.out'
        filenames = glob(path_join(work_dir, glob_prefix + '.[0-9]*'))
        prefix_match = re.compile('.*\\b' + glob_prefix + '\\.(\\d+)$')
        presorted = []
        for filename in filenames:
            match = prefix_match.match(filename)
            if match is not None:
                presorted.append((int(match.group(1)), filename))
        filenames = [filename[1] for filename in sorted(presorted)]

        out_f = path_join(args.output_dir, 'reduce.out')
        with open(out_f, 'w') as out_fh:
            for kv in read_lines(filenames):
                if unpack_tuple:
                    _, v = json.loads(kv)
                    v = json.dumps(v) + "\n"
                else:
                    v = kv
                out_fh.write(v)

    # done.
    logger.info('Mapreduce step done.')
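# Illustrative summary, not part of the module: the protocol combinations the
# final join step above accepts and the resulting unpack_tuple setting.
#   INTERNAL=JSONProtocol,      OUTPUT=JSONValueProtocol -> unpack_tuple=True
#   INTERNAL=JSONProtocol,      OUTPUT=JSONProtocol      -> unpack_tuple=False
#   INTERNAL=JSONValueProtocol, OUTPUT=JSONValueProtocol -> unpack_tuple=False
#   INTERNAL=JSONValueProtocol, OUTPUT=JSONProtocol      -> RuntimeError
#   anything else                                        -> ValueError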
def reduce(shard, args):
    # find the reduce function.
    job = get_instance(args)
    step = job.get_step(args.step_idx)
    reduce_func = step.reducer

    # default to work_dir if output_dir is not set
    work_dir = args.work_dir
    output_dir = args.output_dir
    if output_dir is None:
        output_dir = work_dir

    # process each (key, value) pair.
    out_fn = path_join(output_dir, args.output_prefix + '.%d' % shard)
    logger.info("reducer {}: output -> {}".format(shard, out_fn))
    assert args.input_prefix is not None
    in_f = path_join(work_dir, args.input_prefix + '.%d' % shard)
    logger.info("reducer {}: input <- {}".format(shard, in_f))
    input_stream = partial(open, in_f, 'r')

    if args.step_idx >= 0:
        if job.INTERNAL_PROTOCOL == protocol.JSONProtocol:
            unpack_tuple = False
        elif job.INTERNAL_PROTOCOL == protocol.JSONValueProtocol:
            unpack_tuple = True
        else:
            raise ValueError("unsupported protocol: {}"
                             .format(job.INTERNAL_PROTOCOL))
    else:
        raise ValueError("step_idx={} cannot be negative"
                         .format(args.step_idx))

    count_written = 0
    count_seen = 0
    with nested_context(input_stream(), open(out_fn, 'w')) as (in_fh, out_fh):
        last_key = None
        values = []
        for line in in_fh:
            count_seen += 1
            key, value = json.loads(line)
            if key == last_key:
                # extend previous run
                values.append(value)
            else:
                # end previous run
                if values:
                    for kv in reduce_func(last_key, values):
                        k, v = kv if unpack_tuple else (None, kv)
                        count_written += 1
                        out_fh.write(json.dumps(v) + '\n')
                # start new run
                last_key = key
                values = [value]
        # dump any remaining values
        if values:
            for kv in reduce_func(last_key, values):
                k, v = kv if unpack_tuple else (None, kv)
                count_written += 1
                out_fh.write(json.dumps(v) + '\n')

    counters = job._counters
    counters.incr("reducer", "seen", count_seen)
    counters.incr("reducer", "written", count_written)

    # write out the counters to file.
    f = path_join(output_dir, 'reduce.counters.%d' % shard)
    logger.info("reducer {}: counters -> {}".format(shard, f))
    with open(f, 'w') as fh:
        fh.write(counters.serialize())

    # finally note that we are done.
    f = path_join(output_dir, 'reduce.done.%d' % shard)
    logger.info("reducer {}: done -> {}".format(shard, f))
    with open(f, 'w') as fh:
        fh.write('')