Code Example #1
    def prepare_read(self, parallelism: int, n: int,
                     use_arrow: bool) -> List[ReadTask]:
        read_tasks: List[ReadTask] = []
        block_size = max(1, n // parallelism)

        # Example of a read task. In a real datasource, this would pull data
        # from an external system instead of generating dummy data.
        def make_block(start: int, count: int) -> Block:
            if use_arrow:
                # Local imports: the `import pyarrow` in the loop below is
                # scoped to prepare_read, so import what make_block needs here.
                import numpy as np
                import pyarrow
                return pyarrow.Table.from_arrays(
                    [np.arange(start, start + count)], names=["value"])
            else:
                return list(builtins.range(start, start + count))

        i = 0
        while i < n:
            count = min(block_size, n - i)
            if use_arrow:
                import pyarrow
                schema = pyarrow.Table.from_pydict({"value": [0]}).schema
            else:
                schema = int
            read_tasks.append(
                ReadTask(
                    lambda i=i, count=count: make_block(i, count),
                    BlockMetadata(
                        num_rows=count,
                        size_bytes=8 * count,
                        schema=schema,
                        input_files=None)))
            i += block_size

        return read_tasks
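Each ReadTask above pairs a zero-argument read callable with a precomputed BlockMetadata. The sketch below (an illustration, not part of the source; the helper name check_read_tasks is hypothetical) runs the tasks eagerly in the local process and checks that the metadata matches the blocks they produce, using the same task() and task.get_metadata() calls that read_datasource in Code Example #6 relies on.

# Hypothetical helper: execute each ReadTask locally and verify its
# precomputed BlockMetadata against the block it actually produces.
def check_read_tasks(datasource, n: int, parallelism: int = 4) -> None:
    tasks = datasource.prepare_read(parallelism, n=n, use_arrow=False)
    total_rows = 0
    for task in tasks:
        block = task()                      # run the read callable
        meta = task.get_metadata()          # metadata built at prepare time
        assert len(block) == meta.num_rows  # list blocks: len() == num_rows
        total_rows += meta.num_rows
    assert total_rows == n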
Code Example #2
File: compute.py  Project: DmitriGekhtman/ray
def wrapped_fn(block: Block, meta: BlockMetadata):
    new_block = fn(block)
    accessor = BlockAccessor.for_block(new_block)
    new_meta = BlockMetadata(num_rows=accessor.num_rows(),
                             size_bytes=accessor.size_bytes(),
                             schema=accessor.schema(),
                             input_files=meta.input_files)
    return new_block, new_meta
Code Example #3
File: compute.py  Project: ddworak94/ray
def process_block(
        self, block: Block[T],
        meta: BlockMetadata) -> (Block[U], BlockMetadata):
    new_block = fn(block)
    accessor = BlockAccessor.for_block(new_block)
    new_metadata = BlockMetadata(num_rows=accessor.num_rows(),
                                 size_bytes=accessor.size_bytes(),
                                 schema=accessor.schema(),
                                 input_files=meta.input_files)
    return new_block, new_metadata
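Code Examples #2 and #3 share the same pattern: apply a transform to a block, then rebuild its BlockMetadata from a BlockAccessor while carrying over the original input_files. As a sketch only, that shared step could be factored into a helper such as the one below (the name rebuild_metadata is not from the source).

# Sketch of the metadata-rebuild step shared by examples #2 and #3.
# The helper name is hypothetical; the field values mirror the code above.
def rebuild_metadata(new_block: Block, old_meta: BlockMetadata) -> BlockMetadata:
    accessor = BlockAccessor.for_block(new_block)
    return BlockMetadata(num_rows=accessor.num_rows(),
                         size_bytes=accessor.size_bytes(),
                         schema=accessor.schema(),
                         input_files=old_meta.input_files)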
Code Example #4
File: dataset.py  Project: DmitriGekhtman/ray
def truncate(block: Block, meta: BlockMetadata,
             count: int) -> (Block, BlockMetadata):
    block = BlockAccessor.for_block(block)
    logger.debug("Truncating last block to size: {}".format(count))
    new_block = block.slice(0, count, copy=True)
    accessor = BlockAccessor.for_block(new_block)
    new_meta = BlockMetadata(num_rows=accessor.num_rows(),
                             size_bytes=accessor.size_bytes(),
                             schema=meta.schema,
                             input_files=meta.input_files)
    return new_block, new_meta
Code Example #5
def shuffle_reduce(*mapper_outputs: List[Block]) -> (Block, BlockMetadata):
    builder = DelegatingArrowBlockBuilder()
    assert len(mapper_outputs) == input_num_blocks
    for block in mapper_outputs:
        builder.add_block(block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    new_metadata = BlockMetadata(num_rows=accessor.num_rows(),
                                 size_bytes=accessor.size_bytes(),
                                 schema=accessor.schema(),
                                 input_files=None)
    return new_block, new_metadata
Code Example #6
def read_datasource(datasource: Datasource[T],
                    *,
                    parallelism: int = 200,
                    **read_args) -> Dataset[T]:
    """Read a dataset from a custom data source.

    Args:
        datasource: The datasource to read data from.
        parallelism: The requested parallelism of the read.
        read_args: Additional kwargs to pass to the datasource impl.

    Returns:
        Dataset holding the data read from the datasource.
    """

    read_tasks = datasource.prepare_read(parallelism, **read_args)

    @ray.remote
    def remote_read(task: ReadTask) -> Block:
        return task()

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []

    for task in read_tasks:
        calls.append(lambda task=task: remote_read.remote(task))
        metadata.append(task.get_metadata())

    block_list = LazyBlockList(calls, metadata)

    # Get the schema from the first block synchronously.
    if metadata and metadata[0].schema is None:

        @ray.remote
        def get_schema(block: Block) -> Any:
            return BlockAccessor.for_block(block).schema()

        schema0 = ray.get(get_schema.remote(next(iter(block_list))))
        block_list.set_metadata(
            0,
            BlockMetadata(
                num_rows=metadata[0].num_rows,
                size_bytes=metadata[0].size_bytes,
                schema=schema0,
                input_files=metadata[0].input_files,
            ))

    return Dataset(block_list)
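A hedged usage sketch of read_datasource: assuming the prepare_read() from Code Example #1 belongs to a datasource class named RangeDatasource (the class name is an assumption here), the extra keyword arguments are forwarded to that method through **read_args.

# Usage sketch (RangeDatasource is assumed to wrap the prepare_read() shown
# in Code Example #1). n and use_arrow are forwarded through **read_args.
ds = read_datasource(RangeDatasource(), parallelism=4, n=1000, use_arrow=True)
# Blocks are read lazily via LazyBlockList; ds is a Dataset of 1000 rows.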
Code Example #7
File: read_api.py  Project: AmeerHajAli/ray
def read_parquet(paths: Union[str, List[str]],
                 filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                 columns: Optional[List[str]] = None,
                 parallelism: int = 200,
                 **arrow_parquet_args) -> Dataset[ArrowRow]:
    """Create an Arrow dataset from parquet files.

    Examples:
        >>> # Read a directory of files in remote storage.
        >>> ray.data.read_parquet("s3://bucket/path")

        >>> # Read multiple local files.
        >>> ray.data.read_parquet(["/path/to/file1", "/path/to/file2"])

    Args:
        paths: A single file path or a list of file paths (or directories).
        filesystem: The filesystem implementation to read from.
        columns: A list of column names to read.
        parallelism: The amount of parallelism to use for the dataset.
        arrow_parquet_args: Other parquet read options to pass to pyarrow.

    Returns:
        Dataset holding Arrow records read from the specified paths.
    """
    import pyarrow.parquet as pq

    if filesystem is None:
        filesystem, paths = _parse_paths(paths)
    pq_ds = pq.ParquetDataset(paths,
                              **arrow_parquet_args,
                              filesystem=filesystem)
    pieces = pq_ds.pieces

    read_tasks = [[] for _ in builtins.range(parallelism)]
    # TODO(ekl) support reading row groups (maybe as an option)
    for i, piece in enumerate(pq_ds.pieces):
        read_tasks[i % len(read_tasks)].append(piece)
    nonempty_tasks = [r for r in read_tasks if r]

    @ray.remote
    def gen_read(pieces: List["pyarrow._dataset.ParquetFileFragment"]):
        import pyarrow
        logger.debug("Reading {} parquet pieces".format(len(pieces)))
        tables = [piece.to_table() for piece in pieces]
        if len(tables) > 1:
            table = pyarrow.concat_tables(tables)
        else:
            table = tables[0]
        return table

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []
    for pieces in nonempty_tasks:
        calls.append(lambda pieces=pieces: gen_read.remote(pieces))
        piece_metadata = []
        for p in pieces:
            try:
                piece_metadata.append(p.metadata)
            except AttributeError:
                break
        input_files = [p.path for p in pieces]
        if len(piece_metadata) == len(pieces):
            # Piece metadata was available, construct a normal BlockMetadata.
            block_metadata = BlockMetadata(
                num_rows=sum(m.num_rows for m in piece_metadata),
                size_bytes=sum(
                    sum(
                        m.row_group(i).total_byte_size
                        for i in builtins.range(m.num_row_groups))
                    for m in piece_metadata),
                schema=piece_metadata[0].schema.to_arrow_schema(),
                input_files=input_files)
        else:
            # Piece metadata was not available, construct an empty
            # BlockMetadata.
            block_metadata = BlockMetadata(num_rows=None,
                                           size_bytes=None,
                                           schema=None,
                                           input_files=input_files)
        metadata.append(block_metadata)

    return Dataset(LazyBlockList(calls, metadata))
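A short hedged usage sketch of read_parquet with an explicit filesystem and column projection; the paths and the column name are placeholders, and pyarrow.fs.LocalFileSystem is a standard pyarrow filesystem class.

# Usage sketch: project a single column while reading local parquet files.
# The paths and the "value" column name are placeholders.
import pyarrow.fs

ds = read_parquet(["/path/to/file1", "/path/to/file2"],
                  filesystem=pyarrow.fs.LocalFileSystem(),
                  columns=["value"],
                  parallelism=8)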