Example #1
def sort_main():
    N = args.num_parts
    partitions = _load_manifest(constants.INPUT_MANIFEST_FILE)
    boundaries = sortlib.get_boundaries(N)
    mapper_results = np.empty((N, N), dtype=object)
    for part_id, node, path in partitions:
        if not args.skip_input:
            opt = {
                # Pin the mapper to the node that holds its input partition.
                "resources": {
                    f"node:{node}": 1 / args.num_parts
                },
                "memory": args.part_size * 2,
            }
        else:
            opt = {}
        # One ObjectRef per reducer block, so the whole row can be filled.
        opt["num_returns"] = N
        mapper_results[part_id, :] = mapper.options(**opt).remote(
            boundaries, part_id, path)

    reducer_results = []
    for r in range(N):
        # Reserve memory for the merge regardless of skip_output.
        opt = {"memory": args.part_size * 2}
        blocks = mapper_results[:, r].tolist()
        ret = reducer.options(**opt).remote(r, *blocks)
        reducer_results.append(ret)

    reducer_results = ray.get(reducer_results)
    if not args.skip_output:
        with open(constants.OUTPUT_MANIFEST_FILE, "w") as fout:
            writer = csv.writer(fout)
            writer.writerows(reducer_results)
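The example invokes `mapper` and `reducer` without showing their definitions. Below is a minimal sketch of what such tasks could look like, using plain NumPy in place of sortlib's native routines; the uint64 key format, the output path, and the helper logic are illustrative assumptions, not the benchmark's actual code:

import numpy as np
import ray

@ray.remote
def mapper(boundaries, part_id, path):
    # Load one input partition, sort it, and cut it into one block per
    # reducer at the given key boundaries. (Sketch only: the real mapper
    # uses sortlib's native partitioning.)
    keys = np.sort(np.fromfile(path, dtype=np.uint64))
    cuts = np.searchsorted(keys, boundaries[1:])
    return np.split(keys, cuts)

@ray.remote
def reducer(reducer_id, *blocks):
    # Merge the pre-sorted blocks from every mapper into one sorted output
    # partition and return a manifest row (id, path) for the CSV writer.
    merged = np.sort(np.concatenate(blocks))
    path = f"/tmp/output-{reducer_id:04d}.bin"  # hypothetical location
    merged.tofile(path)
    return reducer_id, path

Because the driver submits `mapper` with `num_returns=N`, the N-element list it returns is exposed as N separate ObjectRefs, one per reducer column.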
Example #2
def sort_main():
    partitions = _load_manifest(constants.INPUT_MANIFEST_FILE)
    boundaries = sortlib.get_boundaries(args.num_reducers)
    mapper_results = np.empty((args.num_mappers, args.num_reducers),
                              dtype=object)
    for part_id, node, path in partitions:
        opt = {} if args.skip_input else {
            # Run each mapper on the node that stores its input partition.
            "resources": {
                f"node:{node}": 1 / args.num_mappers
            },
            "memory": args.input_part_size * 1.2,
        }
        opt.update(num_returns=args.num_reducers)
        mapper_results[part_id, :] = mapper.options(**opt).remote(
            boundaries, part_id, path)

    reducer_results = []
    for r in range(args.num_reducers):
        opt = {
            "memory": args.output_part_size * 1.0,
        }
        blocks = mapper_results[:, r].tolist()
        ret = reducer.options(**opt).remote(r, *blocks)
        reducer_results.append(ret)

    reducer_results = ray.get(reducer_results)
    if not args.skip_output:
        with open(constants.OUTPUT_MANIFEST_FILE, "w") as fout:
            writer = csv.writer(fout)
            writer.writerows(reducer_results)
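The detail that makes this version work is `opt.update(num_returns=args.num_reducers)`: when a task is submitted with `num_returns=k`, Ray returns a list of k ObjectRefs rather than a single one, which is what lets one `.remote()` call fill a whole row of `mapper_results`. A self-contained sketch of that mechanic (the toy `split` task and the sizes are illustrative only):

import numpy as np
import ray

ray.init()

@ray.remote
def split(x, k):
    # With num_returns=k, Ray exposes this k-element list as k ObjectRefs.
    return [x * i for i in range(k)]

k = 4
results = np.empty((2, k), dtype=object)
for row in range(2):
    # .options(num_returns=k) makes .remote() hand back k ObjectRefs,
    # which NumPy assigns element-wise across the row.
    results[row, :] = split.options(num_returns=k).remote(row + 1, k)

# Gathering a column mirrors the reducer loop: one block from each mapper.
print(ray.get(results[:, 2].tolist()))  # -> [2, 4]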
Example #3
def sort_main(args: Args):
    parts = _load_manifest(args, constants.INPUT_MANIFEST_FILE)
    assert len(parts) == args.num_mappers
    boundaries = sortlib.get_boundaries(args.num_reducers)

    mapper_opt = {
        "num_returns": args.num_reducers,
        "num_cpus": os.cpu_count() / args.num_concurrent_rounds,
    }  # Load balance across worker nodes by setting `num_cpus`.
    merge_results = np.empty((args.num_rounds, args.num_reducers),
                             dtype=object)

    part_id = 0
    with worker_placement_groups(args) as pgs:
        for round in range(args.num_rounds):
            # Limit the number of in-flight rounds: ray.wait blocks until
            # num_returns of the submitted merge tasks have finished, i.e.
            # until the oldest extra rounds have fully completed.
            num_extra_rounds = round - args.num_concurrent_rounds + 1
            if num_extra_rounds > 0:
                ray.wait(
                    [f for f in merge_results.flatten() if f is not None],
                    num_returns=num_extra_rounds * args.num_reducers,
                )

            # Submit map tasks.
            mapper_results = np.empty(
                (args.num_mappers_per_round, args.num_reducers), dtype=object)
            for _ in range(args.num_mappers_per_round):
                _, node, path = parts[part_id]
                m = part_id % args.num_mappers_per_round
                mapper_results[m, :] = mapper.options(**mapper_opt).remote(
                    args, part_id, boundaries, path)
                part_id += 1

            # Submit merge tasks.
            merge_results[round, :] = [
                merge_mapper_blocks.options(placement_group=pgs[r]).remote(
                    args, r, round, *mapper_results[:, r].tolist())
                for r in range(args.num_reducers)
            ]

            # Delete local references to mapper results.
            mapper_results = None

        # Submit second-stage reduce tasks.
        reducer_results = [
            final_merge.options(placement_group=pgs[r]).remote(
                args, r, *merge_results[:, r].tolist())
            for r in range(args.num_reducers)
        ]
        reducer_results = ray.get(reducer_results)

    if not args.skip_output:
        with open(constants.OUTPUT_MANIFEST_FILE, "w") as fout:
            writer = csv.writer(fout)
            writer.writerows(reducer_results)

    logging.info(ray.internal.internal_api.memory_summary(stats_only=True))
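Example #3 depends on a `worker_placement_groups` context manager that is not shown. One plausible sketch, assuming one single-CPU bundle per reducer (the bundle shape is an assumption; the real benchmark may reserve memory or lay bundles out differently):

import contextlib

import ray
from ray.util.placement_group import placement_group, remove_placement_group

@contextlib.contextmanager
def worker_placement_groups(args):
    # One placement group per reducer, so that every merge round and the
    # final merge for reducer r are scheduled into the same reserved slot.
    pgs = [placement_group([{"CPU": 1}]) for _ in range(args.num_reducers)]
    # Block until the cluster has actually reserved every bundle.
    ray.get([pg.ready() for pg in pgs])
    try:
        yield pgs
    finally:
        for pg in pgs:
            remove_placement_group(pg)

Keeping reducer r's merge tasks inside one placement group keeps its intermediate blocks on a single node, so `final_merge` reads them locally instead of pulling them over the network.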