def _read_schemas_from_files(
    paths: List[str],
    sampling: float,
    use_threads: bool,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> Tuple[Dict[str, str], ...]:
    """Read the Parquet schema of every sampled file.

    Down-samples *paths* via ``_utils.list_sampling`` first, then reads the
    metadata of each remaining file either serially (threading disabled or a
    single file) or fanned out over a thread pool.

    Parameters
    ----------
    paths : List[str]
        Candidate S3 object paths.
    sampling : float
        Fraction of paths to actually inspect.
    use_threads : bool
        ``False`` forces the serial path; otherwise a pool sized by
        ``_utils.ensure_cpu_count`` is used when more than one file remains.
    boto3_session : boto3.Session
        Session used for the S3 reads.
    s3_additional_kwargs : Optional[Dict[str, str]]
        Extra arguments forwarded to the underlying S3 calls.

    Returns
    -------
    Tuple[Dict[str, str], ...]
        One ``{column: dtype}`` mapping per sampled file (empty tuple when
        nothing was sampled).
    """
    paths = _utils.list_sampling(lst=paths, sampling=sampling)
    schemas: Tuple[Dict[str, str], ...] = tuple()
    n_paths: int = len(paths)
    if use_threads is False or n_paths == 1:
        # Serial path: read each file's metadata one after another.
        collected = []
        for file_path in paths:
            collected.append(
                _read_parquet_metadata_file(
                    path=file_path,
                    boto3_session=boto3_session,
                    s3_additional_kwargs=s3_additional_kwargs,
                    use_threads=use_threads,
                )
            )
        schemas = tuple(collected)
    elif n_paths > 1:
        max_workers: int = _utils.ensure_cpu_count(use_threads=use_threads)
        # Workers receive the session reduced to primitives
        # (see _utils.boto3_to_primitives) rather than the live Session object.
        session_primitives = _utils.boto3_to_primitives(boto3_session=boto3_session)
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            schemas = tuple(
                executor.map(
                    _read_parquet_metadata_file,
                    paths,
                    itertools.repeat(session_primitives),
                    itertools.repeat(s3_additional_kwargs),
                    itertools.repeat(use_threads),
                )
            )
    _logger.debug("schemas: %s", schemas)
    return schemas
def read_parquet_metadata_internal(
    path: Union[str, List[str]],
    dtype: Optional[Dict[str, str]],
    sampling: float,
    dataset: bool,
    use_threads: bool,
    boto3_session: Optional[boto3.Session],
) -> Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[
        str, List[str]]]]:
    """Handle wr.s3.read_parquet_metadata internally.

    Resolves *path* into a concrete list of S3 objects, reads a sample of
    their Parquet schemas, merges them into a single column->dtype mapping,
    and (for datasets rooted at a single prefix) extracts partition metadata
    from the object paths.

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (must be ``str`` when ``dataset=True``) or explicit list
        of object paths.
    dtype : Optional[Dict[str, str]]
        Per-column type overrides applied on top of the inferred types.
    sampling : float
        Fraction of the discovered files whose metadata is actually read.
    dataset : bool
        Treat *path* as a dataset root prefix with Hive-style partitions.
    use_threads : bool
        Forwarded to the per-file metadata reader.
    boto3_session : Optional[boto3.Session]
        Session to use; a default one is created when ``None``.

    Returns
    -------
    Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]
        ``(columns_types, partitions_types, partitions_values)`` — the last
        two are ``None`` unless ``dataset=True`` with a string prefix.

    Raises
    ------
    exceptions.InvalidArgumentType
        When *path* has the wrong type for the chosen mode.
    exceptions.InvalidSchemaConvergence
        When two sampled files disagree on a column's type.
    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if dataset is True:
        if isinstance(path, str):
            # Normalize the dataset root so it always ends with "/".
            _path: Optional[str] = path if path.endswith("/") else f"{path}/"
            paths: List[str] = path2list(path=_path, boto3_session=session)
        else:  # pragma: no cover
            raise exceptions.InvalidArgumentType(
                "Argument <path> must be str if dataset=True.")
    else:
        if isinstance(path, str):
            _path = None
            paths = path2list(path=path, boto3_session=session)
        elif isinstance(path, list):
            _path = None
            paths = path
        else:  # pragma: no cover
            raise exceptions.InvalidArgumentType(
                f"Argument path must be str or List[str] instead of {type(path)}."
            )
    schemas: List[Dict[str, str]] = [
        _read_parquet_metadata_file(path=x,
                                    use_threads=use_threads,
                                    boto3_session=session)
        for x in _utils.list_sampling(lst=paths, sampling=sampling)
    ]
    _logger.debug("schemas: %s", schemas)
    columns_types: Dict[str, str] = {}
    for schema in schemas:
        for column, _dtype in schema.items():
            if (column in columns_types) and (columns_types[column] !=
                                              _dtype):  # pragma: no cover
                # BUG FIX: the message previously interpolated `dtype` (the
                # caller-supplied override dict) instead of `_dtype` (the
                # conflicting type actually found in this file's schema).
                raise exceptions.InvalidSchemaConvergence(
                    f"Was detect at least 2 different types in column {column} ({columns_types[column]} and {_dtype})."
                )
            columns_types[column] = _dtype
    partitions_types: Optional[Dict[str, str]] = None
    partitions_values: Optional[Dict[str, List[str]]] = None
    if (dataset is True) and (_path is not None):
        partitions_types, partitions_values = _utils.extract_partitions_metadata_from_paths(
            path=_path, paths=paths)
    if dtype:
        # Apply explicit overrides to both regular and partition columns.
        for k, v in dtype.items():
            if columns_types and k in columns_types:
                columns_types[k] = v
            if partitions_types and k in partitions_types:
                partitions_types[k] = v
    _logger.debug("columns_types: %s", columns_types)
    return columns_types, partitions_types, partitions_values
def _read_schemas_from_files(
    paths: List[str],
    sampling: float,
    use_threads: Union[bool, int],
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
    version_ids: Optional[Dict[str, str]] = None,
    ignore_null: bool = False,
    pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[Dict[str, str], ...]:
    """Read the Parquet schema of every sampled file.

    Samples *paths* with ``_utils.list_sampling``, reads each remaining
    file's metadata (serially for one worker/one file, otherwise via a
    thread pool), drops files whose schema came back as ``None``, and
    returns the surviving ``{column: dtype}`` mappings.

    Parameters
    ----------
    paths : List[str]
        Candidate S3 object paths.
    sampling : float
        Fraction of paths to actually inspect.
    use_threads : Union[bool, int]
        Thread count hint, resolved by ``_utils.ensure_cpu_count``.
    boto3_session : boto3.Session
        Session used for the S3 reads.
    s3_additional_kwargs : Optional[Dict[str, str]]
        Extra arguments forwarded to the underlying S3 calls.
    version_ids : Optional[Dict[str, str]]
        Optional mapping of path -> S3 object version id.
    ignore_null : bool
        Forwarded to the per-file metadata reader.
    pyarrow_additional_kwargs : Optional[Dict[str, Any]]
        Forwarded to the per-file metadata reader.

    Returns
    -------
    Tuple[Dict[str, str], ...]
        One schema mapping per sampled file that yielded a schema.
    """
    paths = _utils.list_sampling(lst=paths, sampling=sampling)

    def _version_of(object_path: str) -> Optional[str]:
        # version_ids may be absent entirely; only index it when it is a dict.
        return version_ids.get(object_path) if isinstance(version_ids, dict) else None

    schemas: Tuple[Optional[Dict[str, str]], ...] = tuple()
    n_paths: int = len(paths)
    cpus: int = _utils.ensure_cpu_count(use_threads)
    if cpus == 1 or n_paths == 1:
        # Serial path: one metadata read per file, in order.
        collected = []
        for object_path in paths:
            collected.append(
                _read_parquet_metadata_file(
                    path=object_path,
                    boto3_session=boto3_session,
                    s3_additional_kwargs=s3_additional_kwargs,
                    use_threads=use_threads,
                    version_id=_version_of(object_path),
                    ignore_null=ignore_null,
                    pyarrow_additional_kwargs=pyarrow_additional_kwargs,
                )
            )
        schemas = tuple(collected)
    elif n_paths > 1:
        versions = [_version_of(object_path) for object_path in paths]
        # Workers receive the session reduced to primitives
        # (see _utils.boto3_to_primitives) rather than the live Session object.
        session_primitives = _utils.boto3_to_primitives(boto3_session=boto3_session)
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            schemas = tuple(
                executor.map(
                    _read_parquet_metadata_file,
                    paths,
                    itertools.repeat(session_primitives),
                    itertools.repeat(s3_additional_kwargs),
                    itertools.repeat(use_threads),
                    versions,
                    itertools.repeat(ignore_null),
                    itertools.repeat(pyarrow_additional_kwargs),
                )
            )
    # Files without a readable schema return None — drop them.
    schemas = cast(Tuple[Dict[str, str], ...], tuple(s for s in schemas if s is not None))
    _logger.debug("schemas: %s", schemas)
    return schemas