Example 1
def test_memory_release_pipeline(shutdown_only, lazy_input):
    context = DatasetContext.get_current()
    # Disable stage fusion so reads and maps are not fused together, since
    # we're testing multi-stage memory releasing here.
    context.optimize_fuse_stages = False
    # This object store allocation can hold at most 1 copy of the transformed
    # dataset; the same budget is used for both the lazy and eager inputs.
    object_store_memory = 3000e6

    n = 10
    info = ray.init(num_cpus=n, object_store_memory=object_store_memory)
    if lazy_input:
        ds = ray.data.read_datasource(
            OnesSource(),
            parallelism=n,
            n_per_block=100 * 1024 * 1024,
        )
    else:
        ds = ray.data.from_items(list(range(n)), parallelism=n)

    # Create a single-window pipeline.
    pipe = ds.window(blocks_per_window=n)

    # Round 1.
    def gen(x):
        import time

        # TODO(Clark): Remove this sleep once we have fixed memory pressure handling.
        time.sleep(2)
        if isinstance(x, np.ndarray):
            return x
        else:
            return np.ones(100 * 1024 * 1024, dtype=np.uint8)

    pipe = pipe.map(gen)

    def inc(x):
        import time

        # TODO(Clark): Remove this sleep once we have fixed memory pressure handling.
        time.sleep(2)
        return x + 1

    num_rounds = 10
    for _ in range(num_rounds):
        pipe = pipe.map(inc)

    for block in pipe.iter_batches(batch_size=None):
        for arr in block:
            np.testing.assert_equal(
                arr,
                np.ones(100 * 1024 * 1024, dtype=np.uint8) + num_rounds,
            )
    meminfo = memory_summary(info["address"], stats_only=True)
    assert "Spilled" not in meminfo, meminfo
Example 2
 def prepare_read(self, parallelism: int):
     value = DatasetContext.get_current().foo
     meta = BlockMetadata(
         num_rows=1,
         size_bytes=8,
         schema=None,
         input_files=None,
         exec_stats=None)
     return [ReadTask(lambda: [[value]], meta)]
Example 3
def test_dataset_pipeline_stats_basic(ray_start_regular_shared):
    context = DatasetContext.get_current()
    context.optimize_fuse_stages = True
    ds = ray.data.range(1000, parallelism=10)
    ds = ds.map_batches(lambda x: x)
    pipe = ds.repeat(5)
    pipe = pipe.map(lambda x: x)
    for batch in pipe.iter_batches():
        pass
    stats = canonicalize(pipe.stats())
    assert (stats == """== Pipeline Window N ==
Stage N read->map_batches: N/N blocks executed in T
* Remote wall time: T min, T max, T mean, T total
* Remote cpu time: T min, T max, T mean, T total
* Output num rows: N min, N max, N mean, N total
* Output size bytes: N min, N max, N mean, N total
* Tasks per node: N min, N max, N mean; N nodes used

Stage N map: N/N blocks executed in T
* Remote wall time: T min, T max, T mean, T total
* Remote cpu time: T min, T max, T mean, T total
* Output num rows: N min, N max, N mean, N total
* Output size bytes: N min, N max, N mean, N total
* Tasks per node: N min, N max, N mean; N nodes used

== Pipeline Window N ==
Stage N read->map_batches: [execution cached]

Stage N map: N/N blocks executed in T
* Remote wall time: T min, T max, T mean, T total
* Remote cpu time: T min, T max, T mean, T total
* Output num rows: N min, N max, N mean, N total
* Output size bytes: N min, N max, N mean, N total
* Tasks per node: N min, N max, N mean; N nodes used

== Pipeline Window N ==
Stage N read->map_batches: [execution cached]

Stage N map: N/N blocks executed in T
* Remote wall time: T min, T max, T mean, T total
* Remote cpu time: T min, T max, T mean, T total
* Output num rows: N min, N max, N mean, N total
* Output size bytes: N min, N max, N mean, N total
* Tasks per node: N min, N max, N mean; N nodes used

##### Overall Pipeline Time Breakdown #####
* Time stalled waiting for next dataset: T min, T max, T mean, T total

DatasetPipeline iterator time breakdown:
* Waiting for next dataset: T
* In ray.wait(): T
* In ray.get(): T
* In format_batch(): T
* In user code: T
* Total time: T
""")
Example 4
def test_repeat_forever(ray_start_regular_shared):
    context = DatasetContext.get_current()
    context.optimize_fuse_stages = True
    ds = ray.data.range(10)
    pipe = ds.repeat()
    assert str(pipe) == "DatasetPipeline(num_windows=inf, num_stages=2)"
    for i, v in enumerate(pipe.iter_rows()):
        assert v == i % 10, (v, i, i % 10)
        if i > 1000:
            break
Example 5
    def __next__(self):
        output = None

        while output is None:
            if all(s is None for s in self._stages):
                raise StopIteration

            # Wait for any completed stages.
            pending = [s for s in self._stages if s is not None]
            ready, _ = ray.wait(pending, timeout=0.1, num_returns=len(pending))

            # Bubble elements down the pipeline as they become ready.
            for i in range(len(self._stages))[::-1]:
                is_last = i + 1 >= len(self._stages)
                next_slot_free = is_last or self._stages[i + 1] is None
                if not next_slot_free:
                    continue

                slot_ready = self._stages[i] in ready
                if not slot_ready:
                    continue

                # Bubble.
                result = ray.get(self._stages[i])
                if self._bars:
                    self._bars[i].update(1)
                self._stages[i] = None
                if is_last:
                    output = result
                else:
                    fn = self._pipeline._stages[i]
                    self._stages[i + 1] = pipeline_stage.remote(
                        lambda: fn(result), DatasetContext.get_current())

            # Pull a new element for the initial slot if possible.
            if self._stages[0] is None:
                try:
                    self._stages[0] = pipeline_stage.remote(
                        next(self._iter), DatasetContext.get_current())
                except StopIteration:
                    pass

        return output
Example 6
    def iter_blocks_with_metadata(
        self,
        block_for_metadata: bool = False,
    ) -> Iterator[Tuple[ObjectRef[Block], BlockMetadata]]:
        """Iterate over the blocks along with their metadata.

        Note that if block_for_metadata is False (the default), this iterator returns
        pre-read metadata from the ReadTasks given to this LazyBlockList, so it doesn't
        have to block on the execution of the read tasks. The metadata may therefore be
        under-specified, e.g. missing the schema or number of rows. If fully-specified
        block metadata is required, pass block_for_metadata=True.

        The length of this iterator is not known until execution.

        Args:
            block_for_metadata: Whether we should block on the execution of read tasks
                in order to obtain fully-specified block metadata.

        Returns:
            An iterator of block references and the corresponding block metadata.
        """
        context = DatasetContext.get_current()
        outer = self

        class Iter:
            def __init__(self):
                self._base_iter = outer._iter_block_partition_refs()
                self._pos = -1
                self._buffer = []

            def __iter__(self):
                return self

            def __next__(self):
                while not self._buffer:
                    self._pos += 1
                    if context.block_splitting_enabled:
                        part_ref, _ = next(self._base_iter)
                        partition = ray.get(part_ref)
                    else:
                        block_ref, metadata_ref = next(self._base_iter)
                        if block_for_metadata:
                            # This blocks until the read task completes, returning
                            # fully-specified block metadata.
                            metadata = ray.get(metadata_ref)
                        else:
                            # This does not block, returning (possibly under-specified)
                            # pre-read block metadata.
                            metadata = outer._tasks[self._pos].get_metadata()
                        partition = [(block_ref, metadata)]
                    for block_ref, metadata in partition:
                        self._buffer.append((block_ref, metadata))
                return self._buffer.pop(0)

        return Iter()
Example 7
 def is_read_stage_equivalent(self) -> bool:
     """Return whether this plan can be executed as only a read stage."""
     context = DatasetContext.get_current()
     remaining_stages = self._stages_after_snapshot
     if (context.optimize_fuse_stages and remaining_stages
             and isinstance(remaining_stages[0], RandomizeBlocksStage)):
         remaining_stages = remaining_stages[1:]
     return (self.has_lazy_input() and not self._stages_before_snapshot
             and not remaining_stages
             and (not self._snapshot_blocks
                  or isinstance(self._snapshot_blocks, LazyBlockList)))
Example 8
def test_window_randomize_fusion(ray_start_regular_shared):
    context = DatasetContext.get_current()
    context.optimize_fuse_stages = True
    context.optimize_fuse_read_stages = True
    context.optimize_reorder_stages = True

    pipe = ray.data.range(100).randomize_block_order().window().map_batches(
        lambda x: x)
    pipe.take()
    stats = pipe.stats()
    assert "read->randomize_block_order->map_batches" in stats, stats
Example 9
def from_pandas_refs(
    dfs: Union[ObjectRef["pandas.DataFrame"], List[ObjectRef["pandas.DataFrame"]]]
) -> Dataset[ArrowRow]:
    """Create a dataset from a list of Ray object references to Pandas
    dataframes.

    Args:
        dfs: A Ray object references to pandas dataframe, or a list of
             Ray object references to pandas dataframes.

    Returns:
        Dataset holding Arrow records read from the dataframes.
    """
    if isinstance(dfs, ray.ObjectRef):
        dfs = [dfs]
    elif isinstance(dfs, list):
        for df in dfs:
            if not isinstance(df, ray.ObjectRef):
                raise ValueError(
                    "Expected list of Ray object refs, "
                    f"got list containing {type(df)}"
                )
    else:
        raise ValueError(
            "Expected Ray object ref or list of Ray object refs, " f"got {type(df)}"
        )

    context = DatasetContext.get_current()
    if context.enable_pandas_block:
        get_metadata = cached_remote_fn(_get_metadata)
        metadata = ray.get([get_metadata.remote(df) for df in dfs])
        return Dataset(
            ExecutionPlan(
                BlockList(dfs, metadata),
                DatasetStats(stages={"from_pandas_refs": metadata}, parent=None),
            ),
            0,
            False,
        )

    df_to_block = cached_remote_fn(_df_to_block, num_returns=2)

    res = [df_to_block.remote(df) for df in dfs]
    blocks, metadata = map(list, zip(*res))
    metadata = ray.get(metadata)
    return Dataset(
        ExecutionPlan(
            BlockList(blocks, metadata),
            DatasetStats(stages={"from_pandas_refs": metadata}, parent=None),
        ),
        0,
        False,
    )
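
For reference, a brief usage sketch of the API above; the dataframe contents below
are assumptions made up for illustration:

import pandas as pd
import ray

ray.init()
# A single object ref is accepted and wrapped into a list internally.
df_ref = ray.put(pd.DataFrame({"a": [1, 2, 3]}))
ds = ray.data.from_pandas_refs(df_ref)
# A list of refs produces one block per dataframe.
ds2 = ray.data.from_pandas_refs([df_ref, ray.put(pd.DataFrame({"a": [4, 5]}))])
print(ds.take(3), ds2.count())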
Example 10
def from_items(items: List[Any], *, parallelism: int = -1) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> import ray
        >>> ds = ray.data.from_items([1, 2, 3, 4, 5]) # doctest: +SKIP
        >>> ds # doctest: +SKIP
        Dataset(num_blocks=5, num_rows=5, schema=<class 'int'>)
        >>> ds.take(2) # doctest: +SKIP
        [1, 2]

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.
            Parallelism may be limited by the number of items.

    Returns:
        Dataset holding the items.
    """

    detected_parallelism, _ = _autodetect_parallelism(
        parallelism,
        ray.util.get_current_placement_group(),
        DatasetContext.get_current(),
    )
    block_size = max(
        1,
        len(items) // detected_parallelism,
    )

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        stats = BlockExecStats.builder()
        builder = DelegatingBlockBuilder()
        for item in items[i:i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(
                input_files=None, exec_stats=stats.build()))
        i += block_size

    return Dataset(
        ExecutionPlan(
            BlockList(blocks, metadata),
            DatasetStats(stages={"from_items": metadata}, parent=None),
        ),
        0,
        False,
    )
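
The block-size arithmetic above determines how many blocks the items end up in; a
minimal standalone sketch with assumed values:

# With 5 items and a detected parallelism of 2, block_size is 2, so the items
# are split into 3 blocks of sizes 2, 2 and 1.
items = list(range(5))
detected_parallelism = 2
block_size = max(1, len(items) // detected_parallelism)
chunks = [items[i:i + block_size] for i in range(0, len(items), block_size)]
print(block_size, chunks)  # 2 [[0, 1], [2, 3], [4]]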
Example 11
def _map_block_split(block: Block, fn: Any,
                     input_files: List[str]) -> BlockPartition:
    output = []
    for new_block in fn(block):
        accessor = BlockAccessor.for_block(new_block)
        new_meta = BlockMetadata(num_rows=accessor.num_rows(),
                                 size_bytes=accessor.size_bytes(),
                                 schema=accessor.schema(),
                                 input_files=input_files)
        owner = DatasetContext.get_current().block_owner
        output.append((ray.put(new_block, _owner=owner), new_meta))
    return output
Example 12
    def apply(self, fn: Any, remote_args: dict,
              blocks: BlockList) -> BlockList:
        context = DatasetContext.get_current()

        # Handle empty datasets.
        if blocks.initial_num_blocks() == 0:
            return blocks

        blocks = blocks.get_blocks_with_metadata()
        map_bar = ProgressBar("Map Progress", total=len(blocks))

        if context.block_splitting_enabled:
            map_block = cached_remote_fn(_map_block_split).options(
                **remote_args)
            refs = [map_block.remote(b, fn, m.input_files) for b, m in blocks]
        else:
            map_block = cached_remote_fn(_map_block_nosplit).options(
                **dict(remote_args, num_returns=2))
            all_refs = [
                map_block.remote(b, fn, m.input_files) for b, m in blocks
            ]
            data_refs = [r[0] for r in all_refs]
            refs = [r[1] for r in all_refs]

        # Common wait for non-data refs.
        try:
            results = map_bar.fetch_until_complete(refs)
        except (ray.exceptions.RayTaskError, KeyboardInterrupt) as e:
            # One or more mapper tasks failed, or we received a SIGINT signal
            # while waiting; either way, we cancel all map tasks.
            for ref in refs:
                ray.cancel(ref)
            # Wait until all tasks have failed or been cancelled.
            for ref in refs:
                try:
                    ray.get(ref)
                except (ray.exceptions.RayTaskError,
                        ray.exceptions.TaskCancelledError):
                    pass
            # Reraise the original task failure exception.
            raise e from None

        new_blocks, new_metadata = [], []
        if context.block_splitting_enabled:
            for result in results:
                for block, metadata in result:
                    new_blocks.append(block)
                    new_metadata.append(metadata)
        else:
            for block, metadata in zip(data_refs, results):
                new_blocks.append(block)
                new_metadata.append(metadata)
        return BlockList(list(new_blocks), list(new_metadata))
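
The try/except block above implements a cancel-on-failure pattern: if any map task
fails (or the user hits Ctrl-C), all outstanding tasks are cancelled and drained
before the original error is re-raised. A generic, self-contained sketch of the same
pattern; the task and its failure condition are made up for illustration:

import ray

ray.init()

@ray.remote
def work(i):
    if i == 3:
        raise ValueError("boom")
    return i

refs = [work.remote(i) for i in range(8)]
try:
    results = ray.get(refs)
except (ray.exceptions.RayTaskError, KeyboardInterrupt):
    # Cancel everything still in flight, then wait for each task to finish
    # or be cancelled before re-raising the original failure.
    for ref in refs:
        ray.cancel(ref)
    for ref in refs:
        try:
            ray.get(ref)
        except (ray.exceptions.RayTaskError, ray.exceptions.TaskCancelledError):
            pass
    raise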
Example 13
        def read_pieces(serialized_pieces: str) -> Iterator[pa.Table]:
            # Implicitly trigger S3 subsystem initialization by importing
            # pyarrow.fs.
            import pyarrow.fs  # noqa: F401

            # Deserialize after loading the filesystem class.
            try:
                _register_parquet_file_fragment_serialization()
                pieces: List[
                    "pyarrow._dataset.ParquetFileFragment"
                ] = cloudpickle.loads(serialized_pieces)
            finally:
                _deregister_parquet_file_fragment_serialization()

            # Ensure that we're reading at least one dataset fragment.
            assert len(pieces) > 0

            from pyarrow.dataset import _get_partition_keys

            ctx = DatasetContext.get_current()
            output_buffer = BlockOutputBuffer(
                block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size
            )

            logger.debug(f"Reading {len(pieces)} parquet pieces")
            use_threads = reader_args.pop("use_threads", False)
            for piece in pieces:
                part = _get_partition_keys(piece.partition_expression)
                batches = piece.to_batches(
                    use_threads=use_threads,
                    columns=columns,
                    schema=schema,
                    batch_size=PARQUET_READER_ROW_BATCH_SIZE,
                    **reader_args,
                )
                for batch in batches:
                    table = pyarrow.Table.from_batches([batch], schema=schema)
                    if part:
                        for col, value in part.items():
                            table = table.set_column(
                                table.schema.get_field_index(col),
                                col,
                                pa.array([value] * len(table)),
                            )
                    # If the table is empty, drop it.
                    if table.num_rows > 0:
                        output_buffer.add_block(table)
                        if output_buffer.has_next():
                            yield output_buffer.next()
            output_buffer.finalize()
            if output_buffer.has_next():
                yield output_buffer.next()
Example 14
def test_autodetect_parallelism(avail_cpus, data_size, expected):
    class MockReader:
        def estimate_inmemory_data_size(self):
            return data_size

    result, _ = _autodetect_parallelism(
        parallelism=-1,
        cur_pg=None,
        ctx=DatasetContext.get_current(),
        reader=MockReader(),
        avail_cpus=avail_cpus,
    )
    assert result == expected, (result, expected)
Example 15
def test_read(ray_start_regular_shared):
    class CustomDatasource(Datasource):
        def prepare_read(self, parallelism: int):
            value = DatasetContext.get_current().foo
            meta = BlockMetadata(num_rows=1,
                                 size_bytes=8,
                                 schema=None,
                                 input_files=None)
            return [ReadTask(lambda: [[value]], meta)]

    context = DatasetContext.get_current()
    context.foo = 12345
    assert ray.data.read_datasource(CustomDatasource()).take_all()[0] == 12345
Example 16
 def merge_sorted_blocks(
     blocks: List[Block[T]], key: "SortKeyT", _descending: bool
 ) -> Tuple[Block[T], BlockMetadata]:
     stats = BlockExecStats.builder()
     blocks = [b for b in blocks if b.num_rows > 0]
     if len(blocks) == 0:
         ret = ArrowBlockAccessor._empty_table()
     else:
         concat_and_sort = get_concat_and_sort_transform(
             DatasetContext.get_current()
         )
         ret = concat_and_sort(blocks, key, _descending)
     return ret, ArrowBlockAccessor(ret).get_metadata(None, exec_stats=stats.build())
Example 17
    def _split(self, n: int, splitter: Callable[[Dataset],
                                                "DatasetPipeline[T]"]):

        coordinator = PipelineSplitExecutorCoordinator.remote(
            self, n, splitter, DatasetContext.get_current())
        if self._executed[0]:
            raise RuntimeError("Pipeline cannot be read multiple times.")
        self._executed[0] = True

        class SplitIterator:
            def __init__(self, split_index, coordinator):
                self.split_index = split_index
                self.coordinator = coordinator
                self.warn_threshold = 100
                self.wait_delay_s = 0.1

            def __iter__(self):
                return self

            def __next__(self):
                ds = None
                tries = 0
                while ds is None:
                    ds = ray.get(
                        self.coordinator.next_dataset_if_ready.remote(
                            self.split_index))
                    # Wait for other shards to catch up reading.
                    if not ds:
                        time.sleep(self.wait_delay_s)
                        tries += 1
                    if tries > self.warn_threshold:
                        print("Warning: reader on shard {} of the pipeline "
                              "has been blocked more than {}s waiting for "
                              "other readers to catch up. All pipeline shards "
                              "must be read from concurrently.".format(
                                  self.split_index,
                                  self.wait_delay_s * self.warn_threshold,
                              ))
                        self.warn_threshold *= 2
                return lambda: ds

        return [
            # Disable progress bars for the split readers since they would
            # overwhelm the console.
            DatasetPipeline(
                SplitIterator(idx, coordinator),
                length=self._length,
                progress_bars=False,
            ) for idx in range(n)
        ]
Example 18
File: plan.py Project: alipay/ray
 def can_fuse(self, prev: Stage):
     context = DatasetContext.get_current()
     # TODO(ekl) also support fusing shuffle stages to subsequent 1:1 stages.
     if not context.optimize_fuse_shuffle_stages:
         return False
     if not self.supports_block_udf:
         return False
     if not isinstance(prev, OneToOneStage):
         return False
     if prev.compute != "tasks":
         return False
     if any(k not in INHERITABLE_REMOTE_ARGS for k in prev.ray_remote_args):
         return False
     return True
Example 19
def test_dataset_stats_read_parquet(ray_start_regular_shared, tmp_path):
    context = DatasetContext.get_current()
    context.optimize_fuse_stages = True
    ds = ray.data.range(1000, parallelism=10)
    ds.write_parquet(str(tmp_path))
    ds = ray.data.read_parquet(str(tmp_path)).map(lambda x: x)
    stats = canonicalize(ds.stats())
    assert (stats == """Stage N read->map: N/N blocks executed in T
* Remote wall time: T min, T max, T mean, T total
* Remote cpu time: T min, T max, T mean, T total
* Output num rows: N min, N max, N mean, N total
* Output size bytes: N min, N max, N mean, N total
* Tasks per node: N min, N max, N mean; N nodes used
""")
Example 20
def test_auto_parallelism_basic(shutdown_only):
    ray.init(num_cpus=8)
    context = DatasetContext.get_current()
    context.min_parallelism = 1
    # Datasource bound.
    ds = ray.data.range_tensor(5, shape=(100,), parallelism=-1)
    assert ds.num_blocks() == 5, ds
    # CPU bound. TODO(ekl) we should fix range datasource to respect parallelism more
    # properly, currently it can go a little over.
    ds = ray.data.range_tensor(10000, shape=(100,), parallelism=-1)
    assert ds.num_blocks() == 16, ds
    # Block size bound.
    ds = ray.data.range_tensor(100000000, shape=(100,), parallelism=-1)
    assert ds.num_blocks() == 150, ds
Example 21
def test_dataset_pipeline_split_stats_basic(ray_start_regular_shared):
    context = DatasetContext.get_current()
    context.optimize_fuse_stages = True
    ds = ray.data.range(1000, parallelism=10)
    pipe = ds.repeat(2)

    @ray.remote
    def consume(split):
        for batch in split.iter_batches():
            pass
        return split.stats()

    s0, s1 = pipe.split(2)
    stats = ray.get([consume.remote(s0), consume.remote(s1)])
    assert (canonicalize(stats[0]) == """== Pipeline Window Z ==
Stage N read: N/N blocks executed in T
* Remote wall time: T min, T max, T mean, T total
* Remote cpu time: T min, T max, T mean, T total
* Output num rows: N min, N max, N mean, N total
* Output size bytes: N min, N max, N mean, N total
* Tasks per node: N min, N max, N mean; N nodes used

Dataset iterator time breakdown:
* In ray.wait(): T
* In ray.get(): T
* In format_batch(): T
* In user code: T
* Total time: T

== Pipeline Window N ==
Stage N read: N/N blocks executed in T
* Remote wall time: T min, T max, T mean, T total
* Remote cpu time: T min, T max, T mean, T total
* Output num rows: N min, N max, N mean, N total
* Output size bytes: N min, N max, N mean, N total
* Tasks per node: N min, N max, N mean; N nodes used

Dataset iterator time breakdown:
* In ray.wait(): T
* In ray.get(): T
* In format_batch(): T
* In user code: T
* Total time: T

##### Overall Pipeline Time Breakdown #####
* Time stalled waiting for next dataset: T min, T max, T mean, T total
* Time in dataset iterator: T
* Time in user code: T
* Total time: T
""")
Example 22
def _read_pieces(
        block_udf, reader_args, columns, schema,
        serialized_pieces: List[_SerializedPiece]
) -> Iterator["pyarrow.Table"]:
    # Deserialize after loading the filesystem class.
    pieces: List[
        "pyarrow._dataset.ParquetFileFragment"] = _deserialize_pieces_with_retry(
            serialized_pieces)

    # Ensure that we're reading at least one dataset fragment.
    assert len(pieces) > 0

    import pyarrow as pa
    from pyarrow.dataset import _get_partition_keys

    ctx = DatasetContext.get_current()
    output_buffer = BlockOutputBuffer(
        block_udf=block_udf,
        target_max_block_size=ctx.target_max_block_size,
    )

    logger.debug(f"Reading {len(pieces)} parquet pieces")
    use_threads = reader_args.pop("use_threads", False)
    for piece in pieces:
        part = _get_partition_keys(piece.partition_expression)
        batches = piece.to_batches(
            use_threads=use_threads,
            columns=columns,
            schema=schema,
            batch_size=PARQUET_READER_ROW_BATCH_SIZE,
            **reader_args,
        )
        for batch in batches:
            table = pa.Table.from_batches([batch], schema=schema)
            if part:
                for col, value in part.items():
                    table = table.set_column(
                        table.schema.get_field_index(col),
                        col,
                        pa.array([value] * len(table)),
                    )
            # If the table is empty, drop it.
            if table.num_rows > 0:
                output_buffer.add_block(table)
                if output_buffer.has_next():
                    yield output_buffer.next()
    output_buffer.finalize()
    if output_buffer.has_next():
        yield output_buffer.next()
Example 23
        def read_files(
            read_paths: List[str],
            fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper],
        ) -> Iterable[Block]:
            logger.debug(f"Reading {len(read_paths)} files.")
            if isinstance(fs, _S3FileSystemWrapper):
                fs = fs.unwrap()
            ctx = DatasetContext.get_current()
            output_buffer = BlockOutputBuffer(
                block_udf=_block_udf,
                target_max_block_size=ctx.target_max_block_size)
            for read_path in read_paths:
                compression = open_stream_args.pop("compression", None)
                if compression is None:
                    import pyarrow as pa

                    try:
                        # If no compression manually given, try to detect
                        # compression codec from path.
                        compression = pa.Codec.detect(read_path).name
                    except (ValueError, TypeError):
                        # Arrow's compression inference on the file path
                        # doesn't work for Snappy, so we double-check ourselves.
                        import pathlib

                        suffix = pathlib.Path(read_path).suffix
                        if suffix and suffix[1:] == "snappy":
                            compression = "snappy"
                        else:
                            compression = None
                if compression == "snappy":
                    # Pass Snappy compression as a reader arg, so datasource subclasses
                    # can manually handle streaming decompression in
                    # self._read_stream().
                    reader_args["compression"] = compression
                    reader_args["filesystem"] = fs
                elif compression is not None:
                    # Non-Snappy compression, pass as open_input_stream() arg so Arrow
                    # can take care of streaming decompression for us.
                    open_stream_args["compression"] = compression
                with self._open_input_source(fs, read_path,
                                             **open_stream_args) as f:
                    for data in read_stream(f, read_path, **reader_args):
                        output_buffer.add_block(data)
                        if output_buffer.has_next():
                            yield output_buffer.next()
            output_buffer.finalize()
            if output_buffer.has_next():
                yield output_buffer.next()
Example 24
def test_memory_release_lazy(shutdown_only):
    context = DatasetContext.get_current()
    # Ensure that stage fusion is enabled.
    context.optimize_fuse_stages = True
    info = ray.init(num_cpus=1, object_store_memory=1500e6)
    ds = ray.data.range(10)

    # Should get fused into single stage.
    ds = ds.experimental_lazy()
    ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
    ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
    ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
    ds.fully_executed()
    meminfo = memory_summary(info.address_info["address"], stats_only=True)
    assert "Spilled" not in meminfo, meminfo
Example 25
 def _optimize(self) -> Tuple[BlockList, DatasetStats, List[Stage]]:
     """Apply stage fusion optimizations, returning an updated source block list and
     associated stats, and a set of optimized stages.
     """
     context = DatasetContext.get_current()
     blocks, stats, stages = self._get_source_blocks_and_stages()
     if context.optimize_fuse_stages:
         if context.optimize_fuse_read_stages:
             # If using a lazy datasource, rewrite read stage into one-to-one stage
             # so it can be fused into downstream stages.
             blocks, stats, stages = _rewrite_read_stages(
                 blocks, stats, stages, self._dataset_uuid)
         stages = _fuse_one_to_one_stages(stages)
         self._last_optimized_stages = stages
     return blocks, stats, stages
Example 26
 def process_block(
     self, block: Block, input_files: List[str]
 ) -> Iterable[Tuple[Block, BlockMetadata]]:
     output = []
     for new_block in fn(block):
         accessor = BlockAccessor.for_block(new_block)
         new_metadata = BlockMetadata(
             num_rows=accessor.num_rows(),
             size_bytes=accessor.size_bytes(),
             schema=accessor.schema(),
             input_files=input_files)
         owner = DatasetContext.get_current().block_owner
         output.append((ray.put(new_block,
                                _owner=owner), new_metadata))
     return output
Example 27
def test_optimize_equivalent_remote_args(ray_start_regular_shared):
    context = DatasetContext.get_current()
    context.optimize_fuse_stages = True
    context.optimize_fuse_read_stages = True
    context.optimize_fuse_shuffle_stages = True

    equivalent_kwargs = [
        {},
        {
            "resources": {
                "blah": 0
            }
        },
        {
            "resources": {
                "blah": None
            }
        },
        {
            "num_cpus": None
        },
        {
            "num_cpus": 1
        },
        {
            "num_cpus": 1,
            "num_gpus": 0
        },
        {
            "num_cpus": 1,
            "num_gpus": None
        },
    ]

    for kwa in equivalent_kwargs:
        for kwb in equivalent_kwargs:
            print("CHECKING", kwa, kwb)
            pipe = ray.data.range(3).repeat(2)
            pipe = pipe.map_batches(lambda x: x, compute="tasks", **kwa)
            pipe = pipe.map_batches(lambda x: x, compute="tasks", **kwb)
            pipe.take()
            expect_stages(
                pipe,
                1,
                [
                    "read->map_batches->map_batches",
                ],
            )
Example 28
def test_dataset_stats_shuffle(ray_start_regular_shared):
    context = DatasetContext.get_current()
    context.optimize_fuse_stages = True
    ds = ray.data.range(1000, parallelism=10)
    ds = ds.random_shuffle().repartition(1, shuffle=True)
    stats = canonicalize(ds.stats())
    assert (
        stats
        == """Stage N read->random_shuffle: executed in T

    Substage Z read->random_shuffle_map: N/N blocks executed
    * Remote wall time: T min, T max, T mean, T total
    * Remote cpu time: T min, T max, T mean, T total
    * Peak heap memory usage (MiB): N min, N max, N mean
    * Output num rows: N min, N max, N mean, N total
    * Output size bytes: N min, N max, N mean, N total
    * Tasks per node: N min, N max, N mean; N nodes used

    Substage N random_shuffle_reduce: N/N blocks executed
    * Remote wall time: T min, T max, T mean, T total
    * Remote cpu time: T min, T max, T mean, T total
    * Peak heap memory usage (MiB): N min, N max, N mean
    * Output num rows: N min, N max, N mean, N total
    * Output size bytes: N min, N max, N mean, N total
    * Tasks per node: N min, N max, N mean; N nodes used

Stage N repartition: executed in T

    Substage Z repartition_map: N/N blocks executed
    * Remote wall time: T min, T max, T mean, T total
    * Remote cpu time: T min, T max, T mean, T total
    * Peak heap memory usage (MiB): N min, N max, N mean
    * Output num rows: N min, N max, N mean, N total
    * Output size bytes: N min, N max, N mean, N total
    * Tasks per node: N min, N max, N mean; N nodes used

    Substage N repartition_reduce: N/N blocks executed
    * Remote wall time: T min, T max, T mean, T total
    * Remote cpu time: T min, T max, T mean, T total
    * Peak heap memory usage (MiB): N min, N max, N mean
    * Output num rows: N min, N max, N mean, N total
    * Output size bytes: N min, N max, N mean, N total
    * Tasks per node: N min, N max, N mean; N nodes used
"""
    )
Example 29
def _get_or_create_stats_actor():
    # Need to re-create it if Ray restarts (mostly for unit tests).
    if (not _stats_actor[0] or not ray.is_initialized()
            or _stats_actor[1] != ray.get_runtime_context().job_id.hex()):
        ctx = DatasetContext.get_current()
        _stats_actor[0] = _StatsActor.options(
            name="datasets_stats_actor",
            get_if_exists=True,
            scheduling_strategy=ctx.scheduling_strategy,
        ).remote()
        _stats_actor[1] = ray.get_runtime_context().job_id.hex()

        # Clear the actor handle after Ray reinits since it's no longer valid.
        def clear_actor():
            _stats_actor[0] = None

        ray.worker._post_init_hooks.append(clear_actor)
    return _stats_actor[0]
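
The options(name=..., get_if_exists=True) call above is Ray's get-or-create pattern
for named actors: the first caller creates the singleton and later callers get a
handle to the existing instance. A minimal standalone sketch; the actor class and
name are made up for illustration:

import ray

ray.init()

@ray.remote
class Counter:
    def __init__(self):
        self.n = 0

    def incr(self):
        self.n += 1
        return self.n

# Both calls resolve to the same named actor instance.
a = Counter.options(name="demo_counter", get_if_exists=True).remote()
b = Counter.options(name="demo_counter", get_if_exists=True).remote()
print(ray.get(a.incr.remote()), ray.get(b.incr.remote()))  # 1 2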
Example 30
 def __call__(self) -> BlockPartition:
     context = DatasetContext.get_current()
     result = self._read_fn()
     if not hasattr(result, "__iter__"):
         import warnings

         warnings.warn(
             "Read function must return Iterable[Block], got {}. "
             "Probably you need to return `[block]` instead of "
             "`block`.".format(result),
             DeprecationWarning)
     partition: BlockPartition = []
     for block in result:
         metadata = BlockAccessor.for_block(block).get_metadata(
             input_files=self._metadata.input_files)
         assert context.block_owner
         partition.append((ray.put(block, _owner=context.block_owner),
                           metadata))
     if len(partition) == 0:
         raise ValueError("Read task must return non-empty list.")
     return partition
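
A small sketch of the return-shape convention that the hasattr check above enforces:
read functions should return an iterable of blocks (e.g. a one-element list), not a
bare block. The function names and Arrow table contents are made up for illustration:

import pyarrow as pa

def good_read_fn():
    block = pa.table({"value": [1, 2, 3]})
    return [block]  # Iterable[Block]: one block per element.

def bad_read_fn():
    # Returning a bare block instead of [block] is the mistake the
    # warning message above describes.
    return pa.table({"value": [1, 2, 3]})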