Code example #1
File: compute.py  Project: DmitriGekhtman/ray
    def apply(self, fn: Any, remote_args: dict,
              blocks: BlockList[Any]) -> BlockList[Any]:
        map_bar = ProgressBar("Map Progress", total=len(blocks))

        kwargs = remote_args.copy()
        # Each task returns (new_block, new_metadata) as two separate ObjectRefs.
        kwargs["num_returns"] = 2

        @ray.remote(**kwargs)
        def wrapped_fn(block: Block, meta: BlockMetadata):
            new_block = fn(block)
            accessor = BlockAccessor.for_block(new_block)
            new_meta = BlockMetadata(num_rows=accessor.num_rows(),
                                     size_bytes=accessor.size_bytes(),
                                     schema=accessor.schema(),
                                     input_files=meta.input_files)
            return new_block, new_meta

        refs = [
            wrapped_fn.remote(b, m)
            for b, m in zip(blocks, blocks.get_metadata())
        ]
        new_blocks, new_metadata = zip(*refs)

        map_bar.block_until_complete(list(new_blocks))
        new_metadata = ray.get(list(new_metadata))
        return BlockList(list(new_blocks), list(new_metadata))
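The (block, metadata) pair returned by wrapped_fn relies on num_returns=2: each .remote() call hands back two ObjectRefs, so the list of pairs can be unzipped into block refs and metadata refs before anything is fetched. A minimal, standalone sketch of that pattern with a toy function (not the Datasets internals above):

import ray

ray.init(ignore_reinit_error=True)

@ray.remote(num_returns=2)
def process(x):
    # Return a (value, metadata) pair; Ray exposes it as two ObjectRefs.
    return x * 2, {"input": x}

refs = [process.remote(i) for i in range(4)]
value_refs, meta_refs = zip(*refs)
print(ray.get(list(value_refs)))  # [0, 2, 4, 6]
print(ray.get(list(meta_refs)))   # [{'input': 0}, ..., {'input': 3}]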
Code example #2
    def write_datasource(self, datasource: Datasource[T],
                         **write_args) -> None:
        """Write the dataset to a custom datasource.

        Examples:
            >>> ds.write_datasource(CustomDatasourceImpl(...))

        Time complexity: O(dataset size / parallelism)

        Args:
            datasource: The datasource to write to.
            write_args: Additional write args to pass to the datasource.
        """

        write_tasks = datasource.prepare_write(self._blocks, **write_args)
        progress = ProgressBar("Write Progress", len(write_tasks))

        @ray.remote
        def remote_write(task: WriteTask) -> Any:
            return task()

        write_task_outputs = [remote_write.remote(w) for w in write_tasks]
        try:
            progress.block_until_complete(write_task_outputs)
            datasource.on_write_complete(write_tasks,
                                         ray.get(write_task_outputs))
        except Exception as e:
            datasource.on_write_failed(write_tasks, e)
            raise
        finally:
            progress.close()
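write_datasource drives three datasource hooks: prepare_write builds the write tasks, on_write_complete receives their results, and on_write_failed receives the exception before it is re-raised. The sketch below is a hypothetical stand-in (plain Python, not Ray's Datasource base class) that only illustrates that control flow:

class LoggingDatasource:
    """Hypothetical stand-in: records which hooks fire instead of writing data."""

    def prepare_write(self, blocks, **write_args):
        # One trivial "write task" per block; a real datasource returns WriteTask objects.
        return [lambda b=b: len(str(b)) for b in blocks]

    def on_write_complete(self, write_tasks, write_results):
        print("wrote", len(write_tasks), "tasks ->", write_results)

    def on_write_failed(self, write_tasks, error):
        print("write failed:", error)

ds = LoggingDatasource()
tasks = ds.prepare_write(["block-a", "block-b"])
try:
    results = [task() for task in tasks]  # stands in for the remote_write calls
    ds.on_write_complete(tasks, results)
except Exception as e:
    ds.on_write_failed(tasks, e)
    raise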
Code example #3
def sample_boundaries(blocks: BlockList[T], key: SortKeyT,
                      num_reducers: int) -> List[T]:
    """
    Return (num_reducers - 1) items in ascending order from the blocks that
    partition the domain into ranges with approximately equally many elements.
    """
    n_samples = int(num_reducers * 10 / len(blocks))

    @ray.remote
    def sample_block(block: Block[T]) -> np.ndarray:
        return BlockAccessor.for_block(block).sample(n_samples, key)

    sample_results = [sample_block.remote(block) for block in blocks]
    sample_bar = ProgressBar("Sort Sample", len(sample_results))
    sample_bar.block_until_complete(sample_results)
    sample_bar.close()

    samples = ray.get(sample_results)
    sample_items = np.concatenate(samples)
    sample_items.sort()
    ret = [
        np.quantile(sample_items, q, interpolation="nearest")
        for q in np.arange(0, 1, 1 / num_reducers)
    ]
    return ret[1:]
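The boundary computation is an empirical-quantile cut: pool the per-block samples, sort them, and keep the (num_reducers - 1) interior quantiles so each range holds roughly the same number of elements. A numpy-only sketch of the same idea on toy data (names and values are illustrative):

import numpy as np

num_reducers = 4
samples = np.random.default_rng(0).integers(0, 1000, size=200)
samples.sort()

boundaries = [
    np.quantile(samples, q)
    for q in np.arange(0, 1, 1 / num_reducers)
][1:]  # drop the 0-quantile: 3 boundaries split the domain into 4 ranges

# Count how many samples land in each range; the counts are roughly equal.
bucket_ids = np.searchsorted(np.asarray(boundaries), samples)
print(boundaries, np.bincount(bucket_ids, minlength=num_reducers))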
Code example #4
def simple_shuffle(input_blocks: BlockList[T],
                   output_num_blocks: int) -> BlockList[T]:
    input_num_blocks = len(input_blocks)

    @ray.remote(num_returns=output_num_blocks)
    def shuffle_map(block: Block[T]) -> List[Block[T]]:
        block = BlockAccessor.for_block(block)
        slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks))
        slices = []
        for i in range(output_num_blocks):
            slices.append(
                block.slice(i * slice_sz, (i + 1) * slice_sz, copy=True))
        num_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices)
        assert num_rows == block.num_rows(), (num_rows, block.num_rows())
        # Needed to handle num_returns=1 edge case in Ray API.
        if len(slices) == 1:
            return slices[0]
        else:
            return slices

    @ray.remote(num_returns=2)
    def shuffle_reduce(
            *mapper_outputs: List[Block[T]]) -> Tuple[Block[T], BlockMetadata]:
        builder = DelegatingArrowBlockBuilder()
        assert len(mapper_outputs) == input_num_blocks
        for block in mapper_outputs:
            builder.add_block(block)
        new_block = builder.build()
        accessor = BlockAccessor.for_block(new_block)
        new_metadata = BlockMetadata(num_rows=accessor.num_rows(),
                                     size_bytes=accessor.size_bytes(),
                                     schema=accessor.schema(),
                                     input_files=None)
        return new_block, new_metadata

    map_bar = ProgressBar("Shuffle Map", position=0, total=input_num_blocks)

    shuffle_map_out = [shuffle_map.remote(block) for block in input_blocks]
    if output_num_blocks == 1:
        # Handle the num_returns=1 edge case which doesn't return a list.
        shuffle_map_out = [[x] for x in shuffle_map_out]
    map_bar.block_until_complete([x[0] for x in shuffle_map_out])
    map_bar.close()

    reduce_bar = ProgressBar("Shuffle Reduce",
                             position=0,
                             total=output_num_blocks)
    shuffle_reduce_out = [
        shuffle_reduce.remote(
            *[shuffle_map_out[i][j] for i in range(input_num_blocks)])
        for j in range(output_num_blocks)
    ]
    new_blocks, new_metadata = zip(*shuffle_reduce_out)
    reduce_bar.block_until_complete(list(new_blocks))
    new_metadata = ray.get(list(new_metadata))
    reduce_bar.close()

    return BlockList(list(new_blocks), list(new_metadata))
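The reduce step's indexing, shuffle_map_out[i][j], is the whole shuffle: mapper i's j-th slice goes to reducer j, so the reducers consume the transposed slice matrix. A plain-Python sketch of that transposition with lists standing in for ObjectRefs:

num_mappers, num_reducers = 3, 2

def toy_shuffle_map(mapper_id):
    # Each mapper emits one slice per reducer.
    return [f"m{mapper_id}-slice{j}" for j in range(num_reducers)]

map_out = [toy_shuffle_map(i) for i in range(num_mappers)]
reduce_in = [
    [map_out[i][j] for i in range(num_mappers)]  # reducer j gathers slice j from every mapper
    for j in range(num_reducers)
]
print(reduce_in)
# [['m0-slice0', 'm1-slice0', 'm2-slice0'],
#  ['m0-slice1', 'm1-slice1', 'm2-slice1']]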
Code example #5
File: compute.py  Project: vinamrabenara/ray
    def apply(self, fn: Any, remote_args: dict,
              blocks: List[Block[T]]) -> List[ObjectRef[Block]]:
        map_bar = ProgressBar("Map Progress", total=len(blocks))

        if remote_args:
            fn = ray.remote(**remote_args)(fn)
        else:
            fn = ray.remote(fn)
        blocks = [fn.remote(b) for b in blocks]

        map_bar.block_until_complete(blocks)
        return blocks
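The only branch here is how the user function gets turned into a Ray task: ray.remote(**remote_args)(fn) when resource options are supplied, bare ray.remote(fn) otherwise. A toy sketch of the same conditional decoration (illustrative function and arguments):

import ray

ray.init(ignore_reinit_error=True)

def square(x):
    return x * x

remote_args = {"num_cpus": 1}
remote_square = (ray.remote(**remote_args)(square)
                 if remote_args else ray.remote(square))
print(ray.get([remote_square.remote(i) for i in range(4)]))  # [0, 1, 4, 9]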
Code example #6
def sort_impl(blocks: BlockList[T],
              key: SortKeyT,
              descending: bool = False) -> BlockList[T]:
    if len(blocks) == 0:
        return BlockList([], [])

    if isinstance(key, str):
        key = [(key, "descending" if descending else "ascending")]

    if isinstance(key, list):
        descending = key[0][1] == "descending"

    num_mappers = len(blocks)
    num_reducers = num_mappers
    boundaries = sample_boundaries(blocks, key, num_reducers)
    if descending:
        boundaries.reverse()

    @ray.remote(num_returns=num_reducers)
    def sort_block(block, boundaries):
        return BlockAccessor.for_block(block).sort_and_partition(
            boundaries, key, descending)

    @ray.remote(num_returns=2)
    def merge_sorted_blocks(
            *blocks: List[Block[T]]) -> Tuple[Block[T], BlockMetadata]:
        if len(blocks) == 1:
            blocks = blocks[0]  # Python weirdness
        return BlockAccessor.for_block(blocks[0]).merge_sorted_blocks(
            list(blocks), key, descending)

    map_results = np.empty((num_mappers, num_reducers), dtype=object)
    for i, block in enumerate(blocks):
        map_results[i, :] = sort_block.remote(block, boundaries)
    map_bar = ProgressBar("Sort Map", len(map_results))
    map_bar.block_until_complete([ret[0] for ret in map_results])
    map_bar.close()

    reduce_results = []
    for j in range(num_reducers):
        ret = merge_sorted_blocks.remote(*map_results[:, j].tolist())
        reduce_results.append(ret)
    merge_bar = ProgressBar("Sort Merge", len(reduce_results))
    merge_bar.block_until_complete([ret[0] for ret in reduce_results])
    merge_bar.close()

    blocks = [b for b, _ in reduce_results]
    metadata = ray.get([m for _, m in reduce_results])
    return BlockList(blocks, metadata)
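sort_impl is a distributed sample sort: every block is sorted and split at the shared boundaries, then reducer j merges the j-th partition from every mapper. A local sketch of those two phases using bisect and heapq instead of Ray tasks (toy data, illustrative names):

import bisect
import heapq

blocks = [[5, 1, 9], [7, 3, 2], [8, 6, 4]]
boundaries = [4, 7]  # 2 boundaries -> 3 reducers

def sort_and_partition(block):
    block = sorted(block)
    cuts = [0] + [bisect.bisect_left(block, b) for b in boundaries] + [len(block)]
    return [block[cuts[k]:cuts[k + 1]] for k in range(len(cuts) - 1)]

map_results = [sort_and_partition(b) for b in blocks]
merged = [
    list(heapq.merge(*(map_results[i][j] for i in range(len(blocks)))))
    for j in range(len(boundaries) + 1)
]
print(merged)  # [[1, 2, 3], [4, 5, 6], [7, 8, 9]]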
Code example #7
File: compute.py  Project: nikitavemuri/ray
    def apply(self, fn: Any, remote_args: dict,
              blocks: BlockList[Any]) -> BlockList[Any]:
        map_bar = ProgressBar("Map Progress", total=len(blocks))

        kwargs = remote_args.copy()
        kwargs["num_returns"] = 2

        # Lazy init to avoid circular import. TODO(ekl) move these into a
        # separate remote functions file.
        global _remote_fn
        if _remote_fn is None:
            _remote_fn = ray.remote(map_block)

        refs = [
            _remote_fn.options(**kwargs).remote(b, m, fn)
            for b, m in zip(blocks, blocks.get_metadata())
        ]
        new_blocks, new_metadata = zip(*refs)

        map_bar.block_until_complete(list(new_blocks))
        new_metadata = ray.get(list(new_metadata))
        return BlockList(list(new_blocks), list(new_metadata))
Code example #8
File: shuffle.py  Project: vinamrabenara/ray
def simple_shuffle(input_blocks: List[ObjectRef[Block[T]]],
                   output_num_blocks: int) -> List[ObjectRef[Block[T]]]:
    input_num_blocks = len(input_blocks)

    @ray.remote(num_returns=output_num_blocks)
    def shuffle_map(block: Block[T]) -> List[Block[T]]:
        # Round up so the last slice keeps any remainder rows.
        slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks))
        slices = []
        for i in range(output_num_blocks):
            slices.append(block.slice(i * slice_sz, (i + 1) * slice_sz))
        return slices

    @ray.remote
    def shuffle_reduce(*mapper_outputs: List[Block[T]]) -> Block[T]:
        builder = DelegatingArrowBlockBuilder()
        assert len(mapper_outputs) == input_num_blocks
        for block in mapper_outputs:
            builder.add_block(block)
        return builder.build()

    map_bar = ProgressBar("Shuffle Map", position=0, total=input_num_blocks)
    reduce_bar = ProgressBar("Shuffle Reduce",
                             position=1,
                             total=output_num_blocks)

    shuffle_map_out = [shuffle_map.remote(block) for block in input_blocks]
    map_bar.block_until_complete([x[0] for x in shuffle_map_out])

    shuffle_reduce_out = [
        shuffle_reduce.remote(
            *[shuffle_map_out[i][j] for i in range(input_num_blocks)])
        for j in range(output_num_blocks)
    ]
    reduce_bar.block_until_complete(shuffle_reduce_out)

    map_bar.close()
    reduce_bar.close()
    return shuffle_reduce_out