Example #1
def _get_metadata(pieces: List["pyarrow._dataset.ParquetFileFragment"],
                  schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None):
    piece_metadata = []
    for p in pieces:
        try:
            piece_metadata.append(p.metadata)
        except AttributeError:
            # Fragment metadata is unavailable; stop collecting and fall
            # through to the empty-metadata branch below.
            break
    input_files = [p.path for p in pieces]
    if len(piece_metadata) == len(pieces):
        # Piece metadata was available, construct a normal
        # BlockMetadata.
        block_metadata = BlockMetadata(
            num_rows=sum(m.num_rows for m in piece_metadata),
            size_bytes=sum(
                sum(
                    m.row_group(i).total_byte_size
                    for i in range(m.num_row_groups)) for m in piece_metadata),
            schema=schema,
            input_files=input_files)
    else:
        # Piece metadata was not available, construct an empty
        # BlockMetadata.
        block_metadata = BlockMetadata(num_rows=None,
                                       size_bytes=None,
                                       schema=schema,
                                       input_files=input_files)
    return block_metadata
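The helper above sums `num_rows` and the per-row-group `total_byte_size` across the collected pyarrow `FileMetaData` objects. Below is a minimal, standalone sketch of that same aggregation using plain pyarrow; the file paths are placeholders, not part of the original source.

# Illustrative sketch: aggregate row counts and on-disk sizes the same way
# _get_metadata does, but read the metadata with plain pyarrow.
import pyarrow.parquet as pq

paths = ["part-0.parquet", "part-1.parquet"]  # hypothetical input files
metas = [pq.ParquetFile(p).metadata for p in paths]  # pyarrow FileMetaData

total_rows = sum(m.num_rows for m in metas)
total_bytes = sum(
    m.row_group(i).total_byte_size
    for m in metas
    for i in range(m.num_row_groups)
)
print(total_rows, total_bytes)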
Example #2
    def _get_block_metadata(
        self,
        paths: List[str],
        schema: Optional[Union[type, "pyarrow.lib.Schema"]],
        *,
        pieces: List["pyarrow.dataset.ParquetFileFragment"],
        prefetched_metadata: Optional[List["pyarrow.parquet.FileMetaData"]],
    ) -> BlockMetadata:
        if prefetched_metadata is not None and len(prefetched_metadata) == len(pieces):
            # Piece metadata was available, construct a normal
            # BlockMetadata.
            block_metadata = BlockMetadata(
                num_rows=sum(m.num_rows for m in prefetched_metadata),
                size_bytes=sum(
                    sum(m.row_group(i).total_byte_size for i in range(m.num_row_groups))
                    for m in prefetched_metadata
                ),
                schema=schema,
                input_files=paths,
                exec_stats=None,
            )  # Exec stats filled in later.
        else:
            # Piece metadata was not available, construct an empty
            # BlockMetadata.
            block_metadata = BlockMetadata(
                num_rows=None, size_bytes=None, schema=schema, input_files=paths
            )
        return block_metadata
Example #3
def _build_block_metadata(
        pieces: List["pyarrow.dataset.ParquetFileFragment"],
        metadata: List["pyarrow.parquet.FileMetaData"],
        schema: Optional[Union[type, "pyarrow.lib.Schema"]]) -> BlockMetadata:
    input_files = [p.path for p in pieces]
    if len(metadata) == len(pieces):
        # Piece metadata was available, construct a normal
        # BlockMetadata.
        block_metadata = BlockMetadata(
            num_rows=sum(m.num_rows for m in metadata),
            size_bytes=sum(
                sum(
                    m.row_group(i).total_byte_size
                    for i in range(m.num_row_groups)) for m in metadata),
            schema=schema,
            input_files=input_files,
            exec_stats=BlockExecStats.TODO)
    else:
        # Piece metadata was not available, construct an empty
        # BlockMetadata.
        block_metadata = BlockMetadata(num_rows=None,
                                       size_bytes=None,
                                       schema=schema,
                                       input_files=input_files)
    return block_metadata
Example #4
    def prepare_read(self,
                     parallelism: int,
                     paths: Union[str, List[str]],
                     filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                     schema: Optional[Union[type,
                                            "pyarrow.lib.Schema"]] = None,
                     **reader_args) -> List[ReadTask]:
        """Creates and returns read tasks for a file-based datasource.
        """
        import pyarrow as pa
        import numpy as np

        paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
        paths, file_infos = _expand_paths(paths, filesystem)
        file_sizes = [file_info.size for file_info in file_infos]

        read_file = self._read_file

        filesystem = _wrap_s3_serialization_workaround(filesystem)

        def read_files(read_paths: List[str],
                       fs: Union["pyarrow.fs.FileSystem",
                                 _S3FileSystemWrapper]):
            logger.debug(f"Reading {len(read_paths)} files.")
            if isinstance(fs, _S3FileSystemWrapper):
                fs = fs.unwrap()
            builder = DelegatingArrowBlockBuilder()
            for read_path in read_paths:
                with fs.open_input_stream(read_path) as f:
                    data = read_file(f, read_path, **reader_args)
                    if isinstance(data, pa.Table) or isinstance(
                            data, np.ndarray):
                        builder.add_block(data)
                    else:
                        builder.add(data)
            return builder.build()

        read_tasks = []
        for read_paths, file_sizes in zip(
                np.array_split(paths, parallelism),
                np.array_split(file_sizes, parallelism)):
            if len(read_paths) <= 0:
                continue

            if self._rows_per_file() is None:
                num_rows = None
            else:
                num_rows = len(read_paths) * self._rows_per_file()
            read_task = ReadTask(lambda read_paths=read_paths: read_files(
                read_paths, filesystem),
                                 BlockMetadata(num_rows=num_rows,
                                               size_bytes=sum(file_sizes),
                                               schema=schema,
                                               input_files=read_paths))
            read_tasks.append(read_task)

        return read_tasks
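`prepare_read` shards the expanded path list (and the matching file sizes) with `np.array_split` and builds one `ReadTask` per non-empty shard. Below is a small sketch of just that partitioning step, using hypothetical paths and sizes; it is not tied to Ray.

# Sketch of the sharding step only: split paths and their sizes into
# `parallelism` roughly equal chunks and skip empty ones.
import numpy as np

paths = [f"file-{i}.csv" for i in range(10)]     # hypothetical paths
file_sizes = [100 * (i + 1) for i in range(10)]  # hypothetical sizes in bytes
parallelism = 4

for read_paths, sizes in zip(
    np.array_split(paths, parallelism), np.array_split(file_sizes, parallelism)
):
    if len(read_paths) == 0:
        continue
    # One read task would be built per chunk; here we just report it.
    print(list(read_paths), int(sum(sizes)))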
Example #5
    def _get_block_metadata(
        self,
        paths: List[str],
        schema: Optional[Union[type, "pyarrow.lib.Schema"]],
        *,
        rows_per_file: Optional[int],
        file_sizes: List[Optional[int]],
    ) -> BlockMetadata:
        if rows_per_file is None:
            num_rows = None
        else:
            num_rows = len(paths) * rows_per_file
        return BlockMetadata(
            num_rows=num_rows,
            size_bytes=None if None in file_sizes else sum(file_sizes),
            schema=schema,
            input_files=paths,
            exec_stats=None,
        )  # Exec stats filled in later.
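This variant estimates block metadata without opening any files: `num_rows` comes from a fixed `rows_per_file`, and `size_bytes` is the sum of the file sizes unless any size is unknown. A hedged sketch of gathering such sizes with `pyarrow.fs` follows; the local filesystem and paths are assumptions for illustration.

# Sketch: collect file sizes with pyarrow.fs, then apply the same
# "None if any size is unknown" rule used above.
import pyarrow.fs as pafs

fs = pafs.LocalFileSystem()                   # assumed filesystem
paths = ["data/a.parquet", "data/b.parquet"]  # hypothetical paths
infos = fs.get_file_info(paths)               # one FileInfo per path
file_sizes = [info.size for info in infos]    # size may be None if unknown

size_bytes = None if None in file_sizes else sum(file_sizes)
print(size_bytes)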
Example #6
    def prepare_read(
        self,
        parallelism: int,
        paths: Union[str, List[str]],
        filesystem: Optional["pyarrow.fs.FileSystem"] = None,
        schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
        open_stream_args: Optional[Dict[str, Any]] = None,
        _block_udf: Optional[Callable[[Block], Block]] = None,
        **reader_args,
    ) -> List[ReadTask]:
        """Creates and returns read tasks for a file-based datasource."""
        _check_pyarrow_version()
        import numpy as np

        paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
        paths, file_infos = _expand_paths(paths, filesystem)
        file_sizes = [file_info.size for file_info in file_infos]

        read_stream = self._read_stream

        filesystem = _wrap_s3_serialization_workaround(filesystem)

        if open_stream_args is None:
            open_stream_args = {}

        def read_files(
            read_paths: List[str],
            fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper],
        ) -> Iterable[Block]:
            logger.debug(f"Reading {len(read_paths)} files.")
            if isinstance(fs, _S3FileSystemWrapper):
                fs = fs.unwrap()
            ctx = DatasetContext.get_current()
            output_buffer = BlockOutputBuffer(
                block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size
            )
            for read_path in read_paths:
                with fs.open_input_stream(read_path, **open_stream_args) as f:
                    for data in read_stream(f, read_path, **reader_args):
                        output_buffer.add_block(data)
                        if output_buffer.has_next():
                            yield output_buffer.next()
            output_buffer.finalize()
            if output_buffer.has_next():
                yield output_buffer.next()

        read_tasks = []
        for read_paths, file_sizes in zip(
            np.array_split(paths, parallelism), np.array_split(file_sizes, parallelism)
        ):
            if len(read_paths) <= 0:
                continue

            if self._rows_per_file() is None:
                num_rows = None
            else:
                num_rows = len(read_paths) * self._rows_per_file()
            meta = BlockMetadata(
                num_rows=num_rows,
                size_bytes=sum(file_sizes),
                schema=schema,
                input_files=read_paths,
                exec_stats=None,
            )  # Exec stats filled in later.
            read_task = ReadTask(
                lambda read_paths=read_paths: read_files(read_paths, filesystem), meta
            )
            read_tasks.append(read_task)

        return read_tasks
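The streaming variant above pushes each chunk returned by `read_stream` through a `BlockOutputBuffer` and yields a block whenever the buffer reaches the context's `target_max_block_size`, flushing the remainder after `finalize()`. A simplified sketch of that buffering pattern is shown below; it is illustrative only, not Ray's `BlockOutputBuffer` implementation.

# Simplified sketch of the output-buffer pattern: accumulate incoming chunks
# and yield a combined block whenever a size threshold is crossed, flushing
# whatever is left at the end (the "finalize" step above).
from typing import Iterable, Iterator, List

def buffered_blocks(
    chunks: Iterable[List[int]], target_max_block_size: int
) -> Iterator[List[int]]:
    buffer: List[int] = []
    for chunk in chunks:
        buffer.extend(chunk)
        if len(buffer) >= target_max_block_size:
            yield buffer
            buffer = []
    if buffer:
        yield buffer

# Usage: three small chunks are combined into two yielded blocks.
for block in buffered_blocks([[1, 2], [3, 4, 5], [6]], target_max_block_size=4):
    print(block)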