def _read_parquet_metadata_file(
    path: str,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: Union[bool, int],
    version_id: Optional[str] = None,
    ignore_null: bool = False,
    pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> Optional[Dict[str, str]]:
    pyarrow_args = _set_default_pyarrow_additional_kwargs(pyarrow_additional_kwargs)
    with open_s3_object(
        path=path,
        mode="rb",
        version_id=version_id,
        use_threads=use_threads,
        s3_block_size=131_072,  # 128 KB (128 * 2**10)
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    ) as f:
        pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
            source=f, coerce_int96_timestamp_unit=pyarrow_args["coerce_int96_timestamp_unit"]
        )
        if pq_file is None:
            return None
        return _data_types.athena_types_from_pyarrow_schema(
            schema=pq_file.schema.to_arrow_schema(), partitions=None, ignore_null=ignore_null
        )[0]

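# `_pyarrow_parquet_file_wrapper` and `_set_default_pyarrow_additional_kwargs` are called
# above but not defined in this section. The function below is only a minimal sketch of a
# plausible shape for the wrapper, assuming its purpose is to build a
# `pyarrow.parquet.ParquetFile` and return None for objects pyarrow cannot read (e.g.
# zero-byte files) so callers can skip them. It is an illustration, not necessarily the
# library's actual implementation, and it reuses the module-level names already visible
# above (pyarrow, _logger, Optional, List, Any).
def _pyarrow_parquet_file_wrapper_sketch(
    source: Any,
    read_dictionary: Optional[List[str]] = None,
    coerce_int96_timestamp_unit: Optional[str] = None,
) -> Optional[pyarrow.parquet.ParquetFile]:
    try:
        return pyarrow.parquet.ParquetFile(
            source=source,
            read_dictionary=read_dictionary,
            coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
        )
    except pyarrow.ArrowInvalid as ex:
        # Treat unreadable/empty objects as "nothing to read" instead of failing the whole job.
        if "Parquet file size is 0 bytes" in str(ex):
            _logger.warning("Ignoring unreadable Parquet object: %s", ex)
            return None
        raise
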
def _read_parquet_metadata_file(
    path: str, use_threads: bool, boto3_session: boto3.Session
) -> Dict[str, str]:
    data: pyarrow.parquet.ParquetDataset = _read_parquet_init(
        path=path, filters=None, dataset=False, use_threads=use_threads, boto3_session=boto3_session
    )
    return _data_types.athena_types_from_pyarrow_schema(schema=data.schema.to_arrow_schema(), partitions=None)[0]

def _read_parquet_metadata_file(
    path: str,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> Dict[str, str]:
    with open_s3_object(
        path=path,
        mode="rb",
        use_threads=use_threads,
        s3_block_size=131_072,  # 128 KB (128 * 2**10)
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    ) as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f)
        return _data_types.athena_types_from_pyarrow_schema(schema=pq_file.schema.to_arrow_schema(), partitions=None)[0]

def _read_parquet_metadata_file(
    path: str,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> Dict[str, str]:
    fs: s3fs.S3FileSystem = _utils.get_fs(
        s3fs_block_size=4_194_304,  # 4 MB (4 * 2**20)
        session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    with _utils.open_file(fs=fs, path=path, mode="rb") as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f)
        return _data_types.athena_types_from_pyarrow_schema(
            schema=pq_file.schema.to_arrow_schema(), partitions=None
        )[0]

def _read_parquet_chunked(
    paths: List[str],
    chunked: Union[bool, int],
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    validate_schema: bool,
    safe: bool,
    boto3_session: boto3.Session,
    dataset: bool,
    path_root: Optional[str],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> Iterator[pd.DataFrame]:
    next_slice: Optional[pd.DataFrame] = None  # leftover rows waiting to complete the next chunk
    last_schema: Optional[Dict[str, str]] = None
    last_path: str = ""
    for path in paths:
        with open_s3_object(
            path=path,
            mode="rb",
            use_threads=use_threads,
            s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
            s3_additional_kwargs=s3_additional_kwargs,
            boto3_session=boto3_session,
        ) as f:
            pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
            schema: Dict[str, str] = _data_types.athena_types_from_pyarrow_schema(
                schema=pq_file.schema.to_arrow_schema(), partitions=None
            )[0]
            if validate_schema is True and last_schema is not None:
                if schema != last_schema:
                    raise exceptions.InvalidSchemaConvergence(
                        f"Detected at least 2 different schemas:\n"
                        f"    - {last_path} -> {last_schema}\n"
                        f"    - {path} -> {schema}"
                    )
            last_schema = schema
            last_path = path
            num_row_groups: int = pq_file.num_row_groups
            _logger.debug("num_row_groups: %s", num_row_groups)
            for i in range(num_row_groups):
                _logger.debug("Reading Row Group %s...", i)
                df: pd.DataFrame = _arrowtable2df(
                    table=pq_file.read_row_group(
                        i=i, columns=columns, use_threads=use_threads, use_pandas_metadata=False
                    ),
                    categories=categories,
                    safe=safe,
                    use_threads=use_threads,
                    dataset=dataset,
                    path=path,
                    path_root=path_root,
                )
                if chunked is True:
                    yield df
                elif isinstance(chunked, int) and chunked > 0:
                    if next_slice is not None:
                        df = _union(dfs=[next_slice, df], ignore_index=None)
                    while len(df.index) >= chunked:
                        yield df.iloc[:chunked]
                        df = df.iloc[chunked:]
                    if df.empty:
                        next_slice = None
                    else:
                        next_slice = df
                else:
                    raise exceptions.InvalidArgument(f"chunked: {chunked}")
    # Flush any remaining rows that did not fill a complete chunk.
    if next_slice is not None:
        yield next_slice

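# Illustration only: when `chunked` is an int, the slicing loop above buffers leftover rows
# in `next_slice` so that every yielded DataFrame except possibly the last has exactly
# `chunked` rows, even though row groups rarely align with that size. Below is a minimal,
# self-contained sketch of that buffering behaviour with hypothetical names (plain pandas,
# no S3 involved), not part of the reader itself.
def _rechunk_frames_sketch(frames: Iterator[pd.DataFrame], chunked: int) -> Iterator[pd.DataFrame]:
    buffer: Optional[pd.DataFrame] = None
    for df in frames:
        if buffer is not None:
            # Prepend the leftover rows from the previous frame.
            df = pd.concat([buffer, df], ignore_index=True)
        while len(df.index) >= chunked:
            yield df.iloc[:chunked, :].copy()
            df = df.iloc[chunked:, :]
        buffer = None if df.empty else df
    if buffer is not None:
        yield buffer  # trailing partial chunk
# Example: list(_rechunk_frames_sketch(iter([pd.DataFrame({"c": range(5)})] * 2), chunked=3))
# yields DataFrames with 3, 3 and 4 rows... wait: 10 rows total -> 3, 3, 3 and a final 1-row frame.
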
def _read_parquet_chunked(  # pylint: disable=too-many-branches
    paths: List[str],
    chunked: Union[bool, int],
    validate_schema: bool,
    ignore_index: Optional[bool],
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    safe: bool,
    map_types: bool,
    boto3_session: boto3.Session,
    dataset: bool,
    path_root: Optional[str],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: Union[bool, int],
) -> Iterator[pd.DataFrame]:
    next_slice: Optional[pd.DataFrame] = None  # leftover rows waiting to complete the next chunk
    last_schema: Optional[Dict[str, str]] = None
    last_path: str = ""
    for path in paths:
        with open_s3_object(
            path=path,
            mode="rb",
            use_threads=use_threads,
            s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
            s3_additional_kwargs=s3_additional_kwargs,
            boto3_session=boto3_session,
        ) as f:
            pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
                source=f, read_dictionary=categories
            )
            if pq_file is None:
                continue
            if validate_schema is True:
                schema: Dict[str, str] = _data_types.athena_types_from_pyarrow_schema(
                    schema=pq_file.schema.to_arrow_schema(), partitions=None
                )[0]
                if last_schema is not None:
                    if schema != last_schema:
                        raise exceptions.InvalidSchemaConvergence(
                            f"Detected at least 2 different schemas:\n"
                            f"    - {last_path} -> {last_schema}\n"
                            f"    - {path} -> {schema}"
                        )
                last_schema = schema
                last_path = path
            num_row_groups: int = pq_file.num_row_groups
            _logger.debug("num_row_groups: %s", num_row_groups)
            use_threads_flag: bool = use_threads if isinstance(use_threads, bool) else bool(use_threads > 1)
            # iter_batches is only available for pyarrow >= 3.0.0
            if callable(getattr(pq_file, "iter_batches", None)):
                chunk_generator = _pyarrow_chunk_generator(
                    pq_file=pq_file, chunked=chunked, columns=columns, use_threads_flag=use_threads_flag
                )
            else:
                chunk_generator = _row_group_chunk_generator(
                    pq_file=pq_file, columns=columns, use_threads_flag=use_threads_flag, num_row_groups=num_row_groups
                )
            for chunk in chunk_generator:
                df: pd.DataFrame = _arrowtable2df(
                    table=chunk,
                    categories=categories,
                    safe=safe,
                    map_types=map_types,
                    use_threads=use_threads,
                    dataset=dataset,
                    path=path,
                    path_root=path_root,
                )
                if chunked is True:
                    yield df
                elif isinstance(chunked, int) and chunked > 0:
                    if next_slice is not None:
                        df = _union(dfs=[next_slice, df], ignore_index=ignore_index)
                    while len(df.index) >= chunked:
                        yield df.iloc[:chunked, :].copy()
                        df = df.iloc[chunked:, :]
                    if df.empty:
                        next_slice = None
                    else:
                        next_slice = df
                else:
                    raise exceptions.InvalidArgument(f"chunked: {chunked}")
    # Flush any remaining rows that did not fill a complete chunk.
    if next_slice is not None:
        yield next_slice
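
# `_pyarrow_chunk_generator` and `_row_group_chunk_generator` are called above but not
# defined in this section. The two functions below are minimal sketches of what they might
# look like given how they are called; the names with the `_sketch` suffix, and the batch
# size choice, are assumptions for illustration. The pyarrow calls used
# (`ParquetFile.iter_batches`, `ParquetFile.read_row_group`, `pyarrow.Table.from_batches`)
# are real APIs.
def _pyarrow_chunk_generator_sketch(
    pq_file: pyarrow.parquet.ParquetFile,
    chunked: Union[bool, int],
    columns: Optional[List[str]],
    use_threads_flag: bool,
) -> Iterator[pyarrow.Table]:
    # When `chunked` is an int it can double as the batch-size hint; otherwise fall back
    # to pyarrow's default batch size.
    batch_size = chunked if isinstance(chunked, int) and chunked > 0 else 65_536
    for batch in pq_file.iter_batches(batch_size=batch_size, columns=columns, use_threads=use_threads_flag):
        yield pyarrow.Table.from_batches([batch])


def _row_group_chunk_generator_sketch(
    pq_file: pyarrow.parquet.ParquetFile,
    columns: Optional[List[str]],
    use_threads_flag: bool,
    num_row_groups: int,
) -> Iterator[pyarrow.Table]:
    # Fallback for older pyarrow (< 3.0.0) without iter_batches: read one row group at a time.
    for i in range(num_row_groups):
        yield pq_file.read_row_group(i=i, columns=columns, use_threads=use_threads_flag, use_pandas_metadata=False)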