Example 1
    def remote_read(i: int, task: ReadTask) -> MaybeBlockPartition:
        # `context`, `stats_actor`, and `stats_uuid` are captured from the
        # enclosing scope. Propagate the driver's DatasetContext so that
        # code inside the task sees the same settings.
        DatasetContext._set_current(context)
        stats = BlockExecStats.builder()

        # Execute the read task.
        block = task()

        if context.block_splitting_enabled:
            # The task already carries per-block metadata; attach the stats.
            metadata = task.get_metadata()
            metadata.exec_stats = stats.build()
        else:
            # Derive row count, size, and schema from the block itself.
            metadata = BlockAccessor.for_block(block).get_metadata(
                input_files=task.get_metadata().input_files,
                exec_stats=stats.build())
        stats_actor.record_task.remote(stats_uuid, i, metadata)
        return block
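
`remote_read` closes over `context`, `stats_actor`, and `stats_uuid` and is meant to run as a Ray task. A minimal sketch of how a driver might fan it out, assuming `read_tasks` is the list returned by `prepare_read`; the wrapping shown here is illustrative, not the exact Ray internals:

    # Hypothetical fan-out; names other than `remote_read` are assumptions.
    remote_fn = ray.remote(remote_read)
    block_refs = [remote_fn.remote(i, task) for i, task in enumerate(read_tasks)]
    blocks = ray.get(block_refs)  # one block (or partition) per ReadTask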
Example 2
    def prepare_read(self, parallelism: int):
        # Read a custom attribute off the current DatasetContext to verify
        # that the driver's context is visible here.
        value = DatasetContext.get_current().foo
        meta = BlockMetadata(num_rows=1,
                             size_bytes=8,
                             schema=None,
                             input_files=None)
        return [ReadTask(lambda: [[value]], meta)]
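
A minimal usage sketch for this `prepare_read`, assuming it sits on a custom Datasource subclass; the class name `FooDatasource` and the `foo` attribute are illustrative:

    import ray
    from ray.data.datasource import Datasource

    class FooDatasource(Datasource):  # hypothetical wrapper class
        def prepare_read(self, parallelism: int):
            ...  # body as above

    # The attribute set on the current DatasetContext is what the
    # prepare_read above reads back via DatasetContext.get_current().foo.
    DatasetContext.get_current().foo = 8
    ds = ray.data.read_datasource(FooDatasource(), parallelism=1)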
Example 3
    def prepare_read(
        self,
        parallelism: int,
        dataset_factory: Callable[[], "tf.data.Dataset"],
    ) -> List[ReadTask]:
        """Return a read task that loads a TensorFlow dataset.

        Arguments:
            parallelism: This argument isn't used.
            dataset_factory: A no-argument function that returns the TensorFlow dataset
                to be read.
        """
        if parallelism > 1:
            logger.warning(
                "`SimpleTensorFlowDatasource` doesn't support parallel reads. The "
                "`parallelism` argument will be ignored."
            )

        def read_fn() -> Iterator[Block]:
            # Load the entire dataset into memory.
            block = list(dataset_factory())
            # Store the data in a single block.
            yield block

        metadata = BlockMetadata(
            num_rows=None,
            size_bytes=None,
            schema=None,
            input_files=None,
            exec_stats=None,
        )
        return [ReadTask(read_fn, metadata)]
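
This method appears to belong to `SimpleTensorFlowDatasource`. A usage sketch in the spirit of Ray's documented pattern; the `tensorflow_datasets` dataset chosen here ("cifar10") is purely illustrative:

    import ray
    import tensorflow_datasets as tfds
    from ray.data.datasource import SimpleTensorFlowDatasource

    def dataset_factory():
        # Any no-argument callable that returns a tf.data.Dataset works.
        return tfds.load("cifar10", split=["train"], as_supervised=True)[0]

    ds = ray.data.read_datasource(
        SimpleTensorFlowDatasource(),
        parallelism=1,
        dataset_factory=dataset_factory,
    )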
Example 4
    def remote_read(i: int, task: ReadTask) -> MaybeBlockPartition:
        DatasetContext._set_current(context)
        # Snapshot wall-clock and CPU time by hand; Examples 1 and 7 get the
        # same numbers via BlockExecStats.builder().
        start_time, start_cpu = time.perf_counter(), time.process_time()
        exec_stats = BlockExecStats()

        # Execute the read task.
        block = task()

        exec_stats.cpu_time_s = time.process_time() - start_cpu
        exec_stats.wall_time_s = time.perf_counter() - start_time
        if context.block_splitting_enabled:
            metadata = task.get_metadata()
            metadata.exec_stats = exec_stats
        else:
            metadata = BlockAccessor.for_block(block).get_metadata(
                input_files=task.get_metadata().input_files,
                exec_stats=exec_stats)
        stats_actor.add.remote(stats_uuid, i, metadata)
        return block
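
Examples 1 and 7 obtain the same timings through `BlockExecStats.builder()` instead of the manual snapshots above. A hypothetical sketch of what such a builder would encapsulate, assuming it only captures the two clocks shown here:

    import time

    class _BlockExecStatsBuilder:
        """Hypothetical builder: snapshot the clocks now, diff in build()."""

        def __init__(self):
            self.start_time = time.perf_counter()
            self.start_cpu = time.process_time()

        def build(self) -> "BlockExecStats":
            stats = BlockExecStats()
            stats.wall_time_s = time.perf_counter() - self.start_time
            stats.cpu_time_s = time.process_time() - self.start_cpu
            return stats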
Example 5
    def prepare_read(
        self,
        parallelism: int,
        n_per_block: int,
    ) -> List[ReadTask]:
        read_tasks: List[ReadTask] = []
        meta = BlockMetadata(
            num_rows=1,
            size_bytes=n_per_block,
            schema=None,
            input_files=None,
            exec_stats=None,
        )

        for _ in range(parallelism):
            # `n_per_block` never changes inside the loop, so capturing it in
            # the lambda is safe; all tasks also share the same metadata.
            read_tasks.append(
                ReadTask(lambda: [[np.ones(n_per_block, dtype=np.uint8)]],
                         meta))
        return read_tasks
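
A usage sketch, assuming the method above belongs to a datasource whose blocks are `n_per_block`-byte NumPy arrays; the class name is hypothetical. With parallelism=4 and n_per_block=1024 this yields four one-row blocks of 1024 bytes each:

    import ray

    ds = ray.data.read_datasource(
        OnesDatasource(),  # hypothetical owner of the prepare_read above
        parallelism=4,
        n_per_block=1024,
    )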
Example 6
    def prepare_read(
        self, parallelism: int, n: int, num_columns: int
    ) -> List[ReadTask]:
        _check_pyarrow_version()
        import pyarrow

        read_tasks: List[ReadTask] = []
        block_size = max(1, n // parallelism)

        def make_block(count: int, num_columns: int) -> Block:
            return pyarrow.Table.from_arrays(
                np.random.randint(
                    np.iinfo(np.int64).max, size=(num_columns, count), dtype=np.int64
                ),
                names=[f"c_{i}" for i in range(num_columns)],
            )

        schema = pyarrow.Table.from_pydict(
            {f"c_{i}": [0] for i in range(num_columns)}
        ).schema

        i = 0
        while i < n:
            count = min(block_size, n - i)
            meta = BlockMetadata(
                num_rows=count,
                size_bytes=8 * count * num_columns,
                schema=schema,
                input_files=None,
                exec_stats=None,
            )
            read_tasks.append(
                ReadTask(
                    # Bind `count` and `num_columns` as lambda defaults so
                    # each task captures this iteration's values rather than
                    # the loop's final ones.
                    lambda count=count, num_columns=num_columns: [
                        make_block(count, num_columns)
                    ],
                    meta,
                )
            )
            i += block_size

        return read_tasks
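
This matches the shape of a random-integer-row datasource (Ray ships a `RandomIntRowDatasource` along these lines). A hedged usage sketch producing `n` rows split across roughly `parallelism` blocks:

    import ray

    ds = ray.data.read_datasource(
        RandomIntRowDatasource(),  # assumed owner of the prepare_read above
        parallelism=2,
        n=10,
        num_columns=2,
    )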
Example 7
def _execute_read_task(
    i: int,
    task: ReadTask,
    context: DatasetContext,
    stats_uuid: str,
    stats_actor: ray.actor.ActorHandle,
) -> Tuple[MaybeBlockPartition, BlockPartitionMetadata]:
    DatasetContext._set_current(context)
    stats = BlockExecStats.builder()

    # Execute the task.
    block = task()

    metadata = task.get_metadata()
    if context.block_splitting_enabled:
        metadata.exec_stats = stats.build()
    else:
        metadata = BlockAccessor.for_block(block).get_metadata(
            input_files=metadata.input_files, exec_stats=stats.build())
    stats_actor.record_task.remote(stats_uuid, i, metadata)
    # Unlike `remote_read` in Examples 1 and 4, the metadata is also returned
    # to the caller instead of only being recorded on the stats actor.
    return block, metadata
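
Because `_execute_read_task` returns a `(block, metadata)` tuple, the driver can ask Ray for two object refs per call and fetch metadata without pulling the blocks themselves. A dispatch sketch; the wrapping is an assumption, not the exact Ray internals:

    # Hypothetical dispatch: num_returns=2 splits the tuple into two refs.
    execute = ray.remote(num_returns=2)(_execute_read_task)
    refs = [
        execute.remote(i, task, context, stats_uuid, stats_actor)
        for i, task in enumerate(read_tasks)
    ]
    block_refs, meta_refs = map(list, zip(*refs))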