from argparse import ArgumentParser
from glob import glob
from os.path import join as path_join
import heapq
import json
import logging

logger = logging.getLogger(__name__)

# get_step() and read_files() are assumed to be provided elsewhere in this
# package (they resolve the current job step and stream lines from files).


def parse_args():
    parser = ArgumentParser()
    parser.add_argument('--input_files', type=str, nargs='+',
                        help='list of input files to mappers')
    # --input_prefix and --output_prefix are consumed by main() below.
    parser.add_argument('--input_prefix', type=str, required=True,
                        help='filename prefix of the sorted mapper output '
                             'shards read by this shuffler')
    parser.add_argument('--output_prefix', type=str, required=True,
                        help='filename prefix of the reducer input shards '
                             'written by this shuffler')
    parser.add_argument('--output_dir', type=str, default='out',
                        help='directory to write output files to')
    parser.add_argument('--work_dir', type=str, required=True,
                        help='temporary working directory')
    parser.add_argument('--job_module', type=str, required=True)
    parser.add_argument('--job_class', type=str, required=True)
    parser.add_argument('--step_idx', type=int, required=True,
                        help='index of this step (zero-based)')
    parser.add_argument('--total_steps', type=int, required=True,
                        help='total number of steps')
    parser.add_argument('--use_domino', type=int, default=1,
                        help='which platform to run on (1 for Domino, '
                             '0 for local)')
    parser.add_argument('--n_concurrent_machines', type=int, default=2,
                        help='maximum number of Domino jobs to be running '
                             'at once')
    parser.add_argument('--n_shards_per_machine', type=int, default=1,
                        help='number of processes to spawn per Domino job '
                             '(-1 for all)')
    parser.add_argument('--poll_done_interval_sec', type=int, default=45,
                        help='interval between successive checks that we '
                             'are done')
    args = parser.parse_args()

    # Verify that the mapper and reducer functions for this step exist.
    step = get_step(args)
    assert step.mapper is not None
    assert step.reducer is not None

    return args
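
# Hypothetical invocation of this shuffler step, purely illustrative: the
# flag names come from parse_args() above, but the module name (shuffle.py)
# and every value shown are made-up placeholders.
#
#   python shuffle.py \
#       --work_dir /tmp/job0 \
#       --input_prefix map.out --output_prefix reduce.in \
#       --job_module my_jobs --job_class WordCount \
#       --step_idx 0 --total_steps 1 \
#       --use_domino 0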
def main():
    args = parse_args()

    # Count exactly how many input lines we have so we can balance work.
    glob_pattern = path_join(args.work_dir,
                             args.input_prefix + '_count.[0-9]*')
    count_ff = glob(glob_pattern)
    if not count_ff:
        raise RuntimeError("Step {} shuffler: no input files found matching "
                           "pattern {}".format(args.step_idx, glob_pattern))
    logger.info("Step {} shuffler: counting entries from {}"
                .format(args.step_idx, count_ff))
    num_entries = sum(int(n) for n in read_files(count_ff))

    in_ff = sorted(glob(path_join(args.work_dir,
                                  args.input_prefix + '.[0-9]*')))
    sources = [open(f, 'r') for f in in_ff]

    step = get_step(args)
    n_output_files = step.n_reducers
    out_format = path_join(args.work_dir, args.output_prefix + '.%d')
    outputs = [open(out_format % i, 'w') for i in range(n_output_files)]

    # To cleanly separate reducer outputs by key group, we unpack the key of
    # each shuffled line and compare keys: every change of output index must
    # be accompanied by a change of key, otherwise the index change is
    # postponed.
    old_key = None
    old_index = 0
    lines_written = 0
    for count, line in enumerate(heapq.merge(*sources)):
        key = json.loads(line)[0]
        # Integer division spreads lines evenly across the output shards.
        index = count * n_output_files // num_entries
        # Postpone switching to the new index until a change in key is also
        # observed, so a run of equal keys never straddles two shards.
        if old_index != index and old_key != key:
            old_index = index
        outputs[old_index].write(line)
        lines_written += 1
        old_key = key

    for source in sources:
        source.close()
    for output in outputs:
        output.close()

    logger.info("Step {} shuffler: lines written: {}"
                .format(args.step_idx, lines_written))
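
# A minimal sketch of the postponed-index-switch behaviour above, on made-up
# toy data (this helper is illustrative only and is never called by the job
# runner): with six lines split across two shards, the naive count-based cut
# after line 3 would separate the two 'b' records; waiting for a key change
# keeps them in the same shard.
def _partition_sketch():
    records = [('a', 1), ('a', 2), ('b', 3), ('b', 4), ('c', 5), ('c', 6)]
    lines = [json.dumps([k, v]) + '\n' for k, v in records]
    n_shards = 2
    shards = [[] for _ in range(n_shards)]
    old_key = None
    old_index = 0
    for count, line in enumerate(lines):
        key = json.loads(line)[0]
        index = count * n_shards // len(lines)
        if old_index != index and old_key != key:
            old_index = index
        shards[old_index].append(key)
        old_key = key
    # -> [['a', 'a', 'b', 'b'], ['c', 'c']]: the 'b' group stays together.
    return shards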