Example #1
def read_datasource(datasource: Datasource[T],
                    *,
                    parallelism: int = 200,
                    **read_args) -> Dataset[T]:
    """Read a dataset from a custom data source.

    Args:
        datasource: The datasource to read data from.
        parallelism: The requested parallelism of the read.
        read_args: Additional kwargs to pass to the datasource impl.

    Returns:
        Dataset holding the data read from the datasource.
    """

    read_tasks = datasource.prepare_read(parallelism, **read_args)

    @ray.remote
    def remote_read(task: ReadTask) -> Block:
        return task()

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []

    for task in read_tasks:
        calls.append(lambda task=task: remote_read.remote(task))
        metadata.append(task.get_metadata())

    block_list = LazyBlockList(calls, metadata)

    # Get the schema from the first block synchronously.
    if metadata and metadata[0].schema is None:

        @ray.remote
        def get_schema(block: Block) -> Any:
            return BlockAccessor.for_block(block).schema()

        schema0 = ray.get(get_schema.remote(next(iter(block_list))))
        block_list.set_metadata(
            0,
            BlockMetadata(
                num_rows=metadata[0].num_rows,
                size_bytes=metadata[0].size_bytes,
                schema=schema0,
                input_files=metadata[0].input_files,
            ))

    return Dataset(block_list)
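
A minimal usage sketch for the function above. It assumes the RangeDatasource that ships with Ray Datasets; its read arguments (n, block_format) and its import path may differ across Ray versions, so treat the exact names as assumptions.

import ray
from ray.data.datasource import RangeDatasource  # path assumed; varies by Ray version

ray.init()
# Read 1000 synthetic rows through the lazy read path defined above.
ds = read_datasource(RangeDatasource(), parallelism=4, n=1000, block_format="list")
print(ds.count())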
Example #2
def read_datasource(datasource: Datasource[T],
                    parallelism: int = 200,
                    **read_args) -> Dataset[T]:
    """Read a dataset from a custom data source.

    Args:
        datasource: The datasource to read data from.
        parallelism: The requested parallelism of the read.
        read_args: Additional kwargs to pass to the datasource impl.

    Returns:
        Dataset holding the data read from the datasource.
    """

    read_tasks = datasource.prepare_read(parallelism, **read_args)

    @ray.remote
    def remote_read(task: ReadTask) -> Block:
        return task()

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []

    for task in read_tasks:
        calls.append(lambda task=task: remote_read.remote(task))
        metadata.append(task.get_metadata())

    return Dataset(LazyBlockList(calls, metadata))
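
Both versions above bind the loop variable through a default argument (lambda task=task: ...). A short standalone sketch of why that is needed:

# Without the default argument, every closure sees the final value of the
# loop variable; binding it as a default captures the value at definition time.
late = [lambda: i for i in range(3)]
bound = [lambda i=i: i for i in range(3)]
print([f() for f in late])   # [2, 2, 2]
print([f() for f in bound])  # [0, 1, 2]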
Example #3
def read_parquet(paths: Union[str, List[str]],
                 filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                 columns: Optional[List[str]] = None,
                 parallelism: int = 200,
                 **arrow_parquet_args) -> Dataset[ArrowRow]:
    """Create an Arrow dataset from parquet files.

    Examples:
        # Read a directory of files in remote storage.
        >>> ds.read_parquet("s3://bucket/path")

        # Read multiple local files.
        >>> ds.read_parquet(["/path/to/file1", "/path/to/file2"])

    Args:
        paths: A single file path or a list of file paths (or directories).
        filesystem: The filesystem implementation to read from.
        columns: A list of column names to read.
        parallelism: The amount of parallelism to use for the dataset.
        arrow_parquet_args: Other parquet read options to pass to pyarrow.

    Returns:
        Dataset holding Arrow records read from the specified paths.
    """
    import pyarrow.parquet as pq

    pq_ds = pq.ParquetDataset(paths, **arrow_parquet_args)

    read_tasks = [[] for _ in builtins.range(parallelism)]
    # TODO(ekl) support reading row groups (maybe as an option)
    for i, piece in enumerate(pq_ds.pieces):
        read_tasks[i % len(read_tasks)].append(piece)
    nonempty_tasks = [r for r in read_tasks if r]
    partitions = pq_ds.partitions

    @ray.remote
    def gen_read(pieces: List[pq.ParquetDatasetPiece]):
        import pyarrow
        logger.debug("Reading {} parquet pieces".format(len(pieces)))
        tables = [
            piece.read(columns=columns,
                       use_threads=False,
                       partitions=partitions) for piece in pieces
        ]
        if len(tables) > 1:
            table = pyarrow.concat_tables(tables)
        else:
            table = tables[0]
        return ArrowBlock(table)

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []
    for pieces in nonempty_tasks:
        calls.append(lambda pieces=pieces: gen_read.remote(pieces))
        piece_metadata = [p.get_metadata() for p in pieces]
        metadata.append(
            BlockMetadata(num_rows=sum(m.num_rows for m in piece_metadata),
                          size_bytes=sum(
                              sum(
                                  m.row_group(i).total_byte_size
                                  for i in builtins.range(m.num_row_groups))
                              for m in piece_metadata),
                          schema=piece_metadata[0].schema.to_arrow_schema(),
                          input_files=[p.path for p in pieces]))

    return Dataset(LazyBlockList(calls, metadata))
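
The BlockMetadata fields above come straight from the parquet file footers. A small standalone sketch of the same computation with plain pyarrow (the temporary file is created only for illustration):

import pyarrow as pa
import pyarrow.parquet as pq

pq.write_table(pa.table({"x": list(range(1000))}), "/tmp/example.parquet")
md = pq.ParquetFile("/tmp/example.parquet").metadata
num_rows = md.num_rows
size_bytes = sum(md.row_group(i).total_byte_size
                 for i in range(md.num_row_groups))
print(num_rows, size_bytes, md.schema.to_arrow_schema())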
Example #4
def read_parquet(paths: Union[str, List[str]],
                 filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                 columns: Optional[List[str]] = None,
                 parallelism: int = 200,
                 **arrow_parquet_args) -> Dataset[ArrowRow]:
    """Create an Arrow dataset from parquet files.

    Examples:
        >>> # Read a directory of files in remote storage.
        >>> ray.data.read_parquet("s3://bucket/path")

        >>> # Read multiple local files.
        >>> ray.data.read_parquet(["/path/to/file1", "/path/to/file2"])

    Args:
        paths: A single file path or a list of file paths (or directories).
        filesystem: The filesystem implementation to read from.
        columns: A list of column names to read.
        parallelism: The amount of parallelism to use for the dataset.
        arrow_parquet_args: Other parquet read options to pass to pyarrow.

    Returns:
        Dataset holding Arrow records read from the specified paths.
    """
    import pyarrow.parquet as pq

    if filesystem is None:
        filesystem, paths = _parse_paths(paths)
    pq_ds = pq.ParquetDataset(paths,
                              **arrow_parquet_args,
                              filesystem=filesystem)
    pieces = pq_ds.pieces

    read_tasks = [[] for _ in builtins.range(parallelism)]
    # TODO(ekl) support reading row groups (maybe as an option)
    for i, piece in enumerate(pq_ds.pieces):
        read_tasks[i % len(read_tasks)].append(piece)
    nonempty_tasks = [r for r in read_tasks if r]

    @ray.remote
    def gen_read(pieces: List["pyarrow._dataset.ParquetFileFragment"]):
        import pyarrow
        logger.debug("Reading {} parquet pieces".format(len(pieces)))
        tables = [piece.to_table() for piece in pieces]
        if len(tables) > 1:
            table = pyarrow.concat_tables(tables)
        else:
            table = tables[0]
        return table

    calls: List[Callable[[], ObjectRef[Block]]] = []
    metadata: List[BlockMetadata] = []
    for pieces in nonempty_tasks:
        calls.append(lambda pieces=pieces: gen_read.remote(pieces))
        piece_metadata = []
        for p in pieces:
            try:
                piece_metadata.append(p.metadata)
            except AttributeError:
                break
        input_files = [p.path for p in pieces]
        if len(piece_metadata) == len(pieces):
            # Piece metadata was available, construct a normal BlockMetadata.
            block_metadata = BlockMetadata(
                num_rows=sum(m.num_rows for m in piece_metadata),
                size_bytes=sum(
                    sum(
                        m.row_group(i).total_byte_size
                        for i in builtins.range(m.num_row_groups))
                    for m in piece_metadata),
                schema=piece_metadata[0].schema.to_arrow_schema(),
                input_files=input_files)
        else:
            # Piece metadata was not available, construct an empty
            # BlockMetadata.
            block_metadata = BlockMetadata(num_rows=None,
                                           size_bytes=None,
                                           schema=None,
                                           input_files=input_files)
        metadata.append(block_metadata)

    return Dataset(LazyBlockList(calls, metadata))
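
A usage sketch matching the docstring above. The bucket path is the placeholder from the docstring and the column name is an assumption; in recent Ray releases the public entry point is ray.data.read_parquet.

import ray

ray.init()
# "s3://bucket/path" and "col1" are illustrative only.
ds = ray.data.read_parquet("s3://bucket/path", columns=["col1"], parallelism=64)
print(ds.schema())
print(ds.take(5))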