Example 1
from argparse import ArgumentParser


def parse_args():
    parser = ArgumentParser()
    parser.add_argument('--input_files', type=str, nargs='+',
                        help='list of input files to mappers')
    parser.add_argument('--output_dir', type=str, default='out',
                        help='directory to write output files to')
    parser.add_argument('--work_dir', type=str, required=True,
                        help='temporary working directory')
    parser.add_argument('--job_module', type=str, required=True)
    parser.add_argument('--job_class', type=str, required=True)
    parser.add_argument('--step_idx', type=int, required=True,
                        help='index of this step (zero-based)')
    parser.add_argument('--total_steps', type=int, required=True,
                        help='total number of steps')
    parser.add_argument('--use_domino', type=int, default=1,
                        help='run on Domino (1) or locally (0)')
    parser.add_argument('--n_concurrent_machines', type=int, default=2,
                        help='maximum number of domino jobs to be running '
                        'at once')
    parser.add_argument('--n_shards_per_machine', type=int, default=1,
                        help='number of processes to spawn per domino job '
                        '(-1 for all)')
    parser.add_argument('--poll_done_interval_sec', type=int, default=45,
                        help='interval between successive checks that we '
                        'are done')
    args = parser.parse_args()

    # Verify that the step's mapper and reducer functions exist.
    step = get_step(args)
    assert step.mapper is not None
    assert step.reducer is not None

    return args
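
Both examples call a get_step helper that is defined elsewhere in the module. As a rough sketch of what it might do, assuming the job class named on the command line exposes a steps() method returning a list of step objects (the steps() method and its return shape are assumptions, not shown in the original code):

from importlib import import_module

def get_step(args):
    # Hypothetical sketch: import the job module named on the command
    # line, look up the job class, and return the step object for the
    # current step index. Assumes the job class takes no constructor
    # arguments and has a steps() method.
    module = import_module(args.job_module)
    job_class = getattr(module, args.job_class)
    return job_class().steps()[args.step_idx]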
Example 2
import heapq
import json
from glob import glob
from os.path import join as path_join

# get_step, read_files, and logger are module-level helpers defined elsewhere.


def main():

    args = parse_args()

    # count exactly how many input lines we have so we can balance work.
    glob_pattern = path_join(args.work_dir,
                             args.input_prefix + '_count.[0-9]*')
    count_ff = glob(glob_pattern)
    if not count_ff:
        raise RuntimeError("Step {} shuffler: not input files found matching "
                           "pattern {}".format(args.step_idx, glob_pattern))
    logger.info("Step {} shuffler: counting entries from {}"
                .format(args.step_idx, count_ff))
    num_entries = sum(map(int, read_files(count_ff)))

    in_ff = sorted(glob(path_join(args.work_dir,
                                  args.input_prefix + '.[0-9]*')))
    sources = [open(f, 'r') for f in in_ff]

    step = get_step(args)
    n_output_files = step.n_reducers

    out_format = path_join(args.work_dir, args.output_prefix + '.%d')
    outputs = [open(out_format % i, 'w') for i in range(n_output_files)]

    # To cleanly separate reducer outputs by key group, we unpack each
    # line's key while shuffling and compare keys: we only switch to a new
    # output index once the key also changes, so all lines that share a
    # key land in the same output file.
    old_key = None
    old_index = 0
    lines_written = 0
    for count, line in enumerate(heapq.merge(*sources)):
        key = json.loads(line)[0]
        # Integer division spreads line positions evenly across the shards.
        index = count * n_output_files // num_entries

        # postpone switching to new index until a change in key also observed
        if old_index != index and old_key != key:
            old_index = index
        outputs[old_index].write(line)
        lines_written += 1

        old_key = key

    for source in sources:
        source.close()

    for output in outputs:
        output.close()

    logger.info("Step {} shuffler: lines written: {}"
                .format(args.step_idx, lines_written))
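
The counting step above sums integers yielded by a read_files helper that is not shown. A plausible minimal version, assuming each _count file written by the mappers holds one integer per line, is:

def read_files(filenames):
    # Hypothetical sketch: yield every line from each file in turn, so
    # sum(map(int, read_files(count_ff))) totals the per-shard line
    # counts produced by the mappers.
    for filename in filenames:
        with open(filename, 'r') as f:
            for line in f:
                yield line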