def extract_partitions_metadata_from_paths(
    path: str, paths: List[str]
) -> Tuple[Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]:
    """Extract partitions metadata from Amazon S3 paths."""
    path = path if path.endswith("/") else f"{path}/"
    partitions_types: Dict[str, str] = {}
    partitions_values: Dict[str, List[str]] = {}
    for p in paths:
        if path not in p:
            raise exceptions.InvalidArgumentValue(
                f"Object {p} is not under the root path ({path})."
            )  # pragma: no cover
        # Strip the file name, keeping only the directory portion of the key.
        path_wo_filename: str = p.rpartition("/")[0] + "/"
        if path_wo_filename not in partitions_values:
            path_wo_prefix: str = path_wo_filename.replace(f"{path}/", "")
            # Keep only Hive-style "column=value" directory levels.
            dirs: List[str] = [x for x in path_wo_prefix.split("/") if (x != "") and ("=" in x)]
            if dirs:
                values_tups: List[Tuple[str, str]] = [tuple(x.split("=")[:2]) for x in dirs]  # type: ignore
                values_dics: Dict[str, str] = dict(values_tups)
                p_values: List[str] = list(values_dics.values())
                p_types: Dict[str, str] = {x: "string" for x in values_dics.keys()}
                if not partitions_types:
                    partitions_types = p_types
                if p_values:
                    partitions_types = p_types
                    partitions_values[path_wo_filename] = p_values
                elif p_types != partitions_types:  # pragma: no cover
                    raise exceptions.InvalidSchemaConvergence(
                        f"At least two different partitions schema detected: {partitions_types} and {p_types}"
                    )
    if not partitions_types:
        return None, None
    return partitions_types, partitions_values

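# Illustrative sketch only (not part of the original module): shows the shape of the
# result produced by extract_partitions_metadata_from_paths() for Hive-style
# "column=value" directories. The bucket and prefix names below are hypothetical,
# and every partition column is inferred as "string".
def _example_extract_partitions_metadata() -> None:
    types, values = extract_partitions_metadata_from_paths(
        path="s3://bucket/dataset/",
        paths=[
            "s3://bucket/dataset/year=2020/month=1/file0.parquet",
            "s3://bucket/dataset/year=2020/month=2/file1.parquet",
        ],
    )
    # types  == {"year": "string", "month": "string"}
    # values == {
    #     "s3://bucket/dataset/year=2020/month=1/": ["2020", "1"],
    #     "s3://bucket/dataset/year=2020/month=2/": ["2020", "2"],
    # }
    assert types == {"year": "string", "month": "string"}
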
def read_parquet_metadata_internal(
    path: Union[str, List[str]],
    dtype: Optional[Dict[str, str]],
    sampling: float,
    dataset: bool,
    use_threads: bool,
    boto3_session: Optional[boto3.Session],
) -> Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]:
    """Handle wr.s3.read_parquet_metadata internally."""
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if dataset is True:
        if isinstance(path, str):
            _path: Optional[str] = path if path.endswith("/") else f"{path}/"
            paths: List[str] = path2list(path=_path, boto3_session=session)
        else:  # pragma: no cover
            raise exceptions.InvalidArgumentType("Argument <path> must be str if dataset=True.")
    else:
        if isinstance(path, str):
            _path = None
            paths = path2list(path=path, boto3_session=session)
        elif isinstance(path, list):
            _path = None
            paths = path
        else:  # pragma: no cover
            raise exceptions.InvalidArgumentType(f"Argument path must be str or List[str] instead of {type(path)}.")
    # Read the footer of a sample of the objects, extracting one schema per file.
    schemas: List[Dict[str, str]] = [
        _read_parquet_metadata_file(path=x, use_threads=use_threads, boto3_session=session)
        for x in _utils.list_sampling(lst=paths, sampling=sampling)
    ]
    _logger.debug("schemas: %s", schemas)
    # Merge the sampled schemas, failing if the same column appears with two different types.
    columns_types: Dict[str, str] = {}
    for schema in schemas:
        for column, _dtype in schema.items():
            if (column in columns_types) and (columns_types[column] != _dtype):  # pragma: no cover
                raise exceptions.InvalidSchemaConvergence(
                    f"Detected at least 2 different types in column {column} ({columns_types[column]} and {_dtype})."
                )
            columns_types[column] = _dtype
    partitions_types: Optional[Dict[str, str]] = None
    partitions_values: Optional[Dict[str, List[str]]] = None
    if (dataset is True) and (_path is not None):
        partitions_types, partitions_values = _utils.extract_partitions_metadata_from_paths(path=_path, paths=paths)
    # Explicit dtypes provided by the caller override the inferred ones.
    if dtype:
        for k, v in dtype.items():
            if columns_types and k in columns_types:
                columns_types[k] = v
            if partitions_types and k in partitions_types:
                partitions_types[k] = v
    _logger.debug("columns_types: %s", columns_types)
    return columns_types, partitions_types, partitions_values

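# Hypothetical usage sketch (not part of the original module): reads footer metadata
# for a partitioned dataset and shows how explicit dtypes override inferred ones.
# The S3 path is made up; boto3 is assumed to be imported at module level.
def _example_read_parquet_metadata() -> None:
    columns_types, partitions_types, partitions_values = read_parquet_metadata_internal(
        path="s3://bucket/dataset/",
        dtype={"id": "bigint"},   # override the inferred type of "id" if it is present
        sampling=1.0,             # read every file's footer (no sampling)
        dataset=True,             # treat the prefix as a partitioned dataset
        use_threads=True,
        boto3_session=None,       # fall back to the default session
    )
    # columns_types maps column name -> Athena type (e.g. {"id": "bigint", "name": "string"});
    # partitions_types/partitions_values describe the Hive-style partitions, or are None
    # when path is a list of objects (dataset=False).
    _logger.debug("%s %s %s", columns_types, partitions_types, partitions_values)
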
def _validate_schemas(schemas: Tuple[Dict[str, str], ...]) -> None:
    if len(schemas) < 2:
        return None
    first: Dict[str, str] = schemas[0]
    for schema in schemas[1:]:
        if first != schema:
            raise exceptions.InvalidSchemaConvergence(
                f"Detected at least 2 different schemas:\n    1 - {first}\n    2 - {schema}."
            )
    return None

def _merge_schemas(schemas: Tuple[Dict[str, str], ...]) -> Dict[str, str]:
    columns_types: Dict[str, str] = {}
    for schema in schemas:
        for column, dtype in schema.items():
            if (column in columns_types) and (columns_types[column] != dtype):
                raise exceptions.InvalidSchemaConvergence(
                    f"Detected at least 2 different types in column {column} ({columns_types[column]} and {dtype})."
                )
            columns_types[column] = dtype
    return columns_types

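# Minimal sketch (not part of the original module) contrasting the two helpers above:
# _validate_schemas() requires every per-file schema to be identical, while
# _merge_schemas() unions the columns and only fails when the same column appears
# with two different types. The column names are made up.
def _example_schema_helpers() -> None:
    a = {"id": "bigint", "name": "string"}
    b = {"id": "bigint", "value": "double"}
    _validate_schemas(schemas=(a, a))        # OK: identical schemas
    merged = _merge_schemas(schemas=(a, b))  # union of columns, types must agree
    assert merged == {"id": "bigint", "name": "string", "value": "double"}
    # _validate_schemas(schemas=(a, b)) would raise InvalidSchemaConvergence, and so
    # would _merge_schemas() if "id" were "string" in one file and "bigint" in another.
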
def _read_parquet_chunked(
    paths: List[str],
    chunked: Union[bool, int],
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    validate_schema: bool,
    safe: bool,
    boto3_session: boto3.Session,
    dataset: bool,
    path_root: Optional[str],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> Iterator[pd.DataFrame]:
    next_slice: Optional[pd.DataFrame] = None
    last_schema: Optional[Dict[str, str]] = None
    last_path: str = ""
    for path in paths:
        with open_s3_object(
            path=path,
            mode="rb",
            use_threads=use_threads,
            s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
            s3_additional_kwargs=s3_additional_kwargs,
            boto3_session=boto3_session,
        ) as f:
            pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
            schema: Dict[str, str] = _data_types.athena_types_from_pyarrow_schema(
                schema=pq_file.schema.to_arrow_schema(), partitions=None
            )[0]
            if validate_schema is True and last_schema is not None:
                if schema != last_schema:
                    raise exceptions.InvalidSchemaConvergence(
                        f"Detected at least 2 different schemas:\n"
                        f"    - {last_path} -> {last_schema}\n"
                        f"    - {path} -> {schema}"
                    )
            last_schema = schema
            last_path = path
            num_row_groups: int = pq_file.num_row_groups
            _logger.debug("num_row_groups: %s", num_row_groups)
            for i in range(num_row_groups):
                _logger.debug("Reading Row Group %s...", i)
                df: pd.DataFrame = _arrowtable2df(
                    table=pq_file.read_row_group(
                        i=i, columns=columns, use_threads=use_threads, use_pandas_metadata=False
                    ),
                    categories=categories,
                    safe=safe,
                    use_threads=use_threads,
                    dataset=dataset,
                    path=path,
                    path_root=path_root,
                )
                if chunked is True:
                    # chunked=True: one DataFrame per row group, no rebatching.
                    yield df
                elif isinstance(chunked, int) and chunked > 0:
                    # chunked=<int>: accumulate leftover rows and emit fixed-size slices.
                    if next_slice is not None:
                        df = _union(dfs=[next_slice, df], ignore_index=None)
                    while len(df.index) >= chunked:
                        yield df.iloc[:chunked]
                        df = df.iloc[chunked:]
                    if df.empty:
                        next_slice = None
                    else:
                        next_slice = df
                else:
                    raise exceptions.InvalidArgument(f"chunked: {chunked}")
    if next_slice is not None:
        # Flush any remaining rows smaller than the requested chunk size.
        yield next_slice

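# Standalone sketch (hypothetical, not part of the original module) of the rebatching
# logic used above when chunked is an int: leftover rows from one batch are prepended
# to the next, fixed-size slices are yielded, and the final partial slice is flushed
# at the end. Plain in-memory DataFrames stand in for S3 row groups; pandas is assumed
# to be imported as pd.
def _example_rebatch(dfs: List[pd.DataFrame], chunked: int) -> Iterator[pd.DataFrame]:
    next_slice: Optional[pd.DataFrame] = None
    for df in dfs:
        if next_slice is not None:
            df = pd.concat([next_slice, df], ignore_index=True)
        while len(df.index) >= chunked:
            yield df.iloc[:chunked]
            df = df.iloc[chunked:]
        next_slice = None if df.empty else df
    if next_slice is not None:
        yield next_slice  # remainder smaller than `chunked`
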
def _read_parquet_chunked(  # pylint: disable=too-many-branches
    paths: List[str],
    chunked: Union[bool, int],
    validate_schema: bool,
    ignore_index: Optional[bool],
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    safe: bool,
    map_types: bool,
    boto3_session: boto3.Session,
    dataset: bool,
    path_root: Optional[str],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: Union[bool, int],
) -> Iterator[pd.DataFrame]:
    next_slice: Optional[pd.DataFrame] = None
    last_schema: Optional[Dict[str, str]] = None
    last_path: str = ""
    for path in paths:
        with open_s3_object(
            path=path,
            mode="rb",
            use_threads=use_threads,
            s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
            s3_additional_kwargs=s3_additional_kwargs,
            boto3_session=boto3_session,
        ) as f:
            pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
                source=f, read_dictionary=categories
            )
            if pq_file is None:
                # Empty or unreadable object: skip it.
                continue
            if validate_schema is True:
                schema: Dict[str, str] = _data_types.athena_types_from_pyarrow_schema(
                    schema=pq_file.schema.to_arrow_schema(), partitions=None
                )[0]
                if last_schema is not None:
                    if schema != last_schema:
                        raise exceptions.InvalidSchemaConvergence(
                            f"Detected at least 2 different schemas:\n"
                            f"    - {last_path} -> {last_schema}\n"
                            f"    - {path} -> {schema}"
                        )
                last_schema = schema
                last_path = path
            num_row_groups: int = pq_file.num_row_groups
            _logger.debug("num_row_groups: %s", num_row_groups)
            use_threads_flag: bool = use_threads if isinstance(use_threads, bool) else bool(use_threads > 1)
            # iter_batches is only available for pyarrow >= 3.0.0
            if callable(getattr(pq_file, "iter_batches", None)):
                chunk_generator = _pyarrow_chunk_generator(
                    pq_file=pq_file, chunked=chunked, columns=columns, use_threads_flag=use_threads_flag
                )
            else:
                chunk_generator = _row_group_chunk_generator(
                    pq_file=pq_file, columns=columns, use_threads_flag=use_threads_flag, num_row_groups=num_row_groups
                )
            for chunk in chunk_generator:
                df: pd.DataFrame = _arrowtable2df(
                    table=chunk,
                    categories=categories,
                    safe=safe,
                    map_types=map_types,
                    use_threads=use_threads,
                    dataset=dataset,
                    path=path,
                    path_root=path_root,
                )
                if chunked is True:
                    yield df
                elif isinstance(chunked, int) and chunked > 0:
                    if next_slice is not None:
                        df = _union(dfs=[next_slice, df], ignore_index=ignore_index)
                    while len(df.index) >= chunked:
                        yield df.iloc[:chunked, :].copy()
                        df = df.iloc[chunked:, :]
                    if df.empty:
                        next_slice = None
                    else:
                        next_slice = df
                else:
                    raise exceptions.InvalidArgument(f"chunked: {chunked}")
    if next_slice is not None:
        # Flush any remaining rows smaller than the requested chunk size.
        yield next_slice

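# Hypothetical usage sketch (not part of the original module): drives the generator
# above to read a couple of objects in ~100_000-row batches. The S3 paths are made up
# and a default boto3 session is assumed.
def _example_read_parquet_chunked() -> None:
    session = boto3.Session()
    frames = _read_parquet_chunked(
        paths=["s3://bucket/dataset/file0.parquet", "s3://bucket/dataset/file1.parquet"],
        chunked=100_000,       # int -> fixed-size batches; True -> one DataFrame per Parquet batch
        validate_schema=True,  # fail fast if the files disagree on the schema
        ignore_index=True,
        columns=None,
        categories=None,
        safe=True,
        map_types=True,
        boto3_session=session,
        dataset=False,
        path_root=None,
        s3_additional_kwargs=None,
        use_threads=True,
    )
    total_rows = sum(len(df.index) for df in frames)
    _logger.debug("total rows read: %s", total_rows)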