import math
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import boto3
import pandas as pd
import pyarrow as pa

# Internal collaborators (_WriteProxy, _write_chunk, _get_file_path,
# _get_bucket_number, delete_objects, lakeformation) live elsewhere in the
# package; hypothetical sketches of some of them follow the functions below.


def _to_parquet_chunked(
    file_path: str,
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
    compression: Optional[str],
    pyarrow_additional_kwargs: Optional[Dict[str, Any]],
    table: pa.Table,
    max_rows_by_file: int,
    num_of_rows: int,
    cpus: int,
) -> List[str]:
    # Split the table into ceil(num_of_rows / max_rows_by_file) row slices and
    # write each slice to its own file, fanning the writes out via the proxy.
    chunks: int = math.ceil(num_of_rows / max_rows_by_file)
    use_threads: Union[bool, int] = cpus > 1
    proxy: _WriteProxy = _WriteProxy(use_threads=use_threads)
    for chunk in range(chunks):
        offset: int = chunk * max_rows_by_file
        write_path: str = _get_file_path(chunk, file_path)
        proxy.write(
            func=_write_chunk,
            file_path=write_path,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
            compression=compression,
            pyarrow_additional_kwargs=pyarrow_additional_kwargs,
            table=table,
            offset=offset,
            chunk_size=max_rows_by_file,
            use_threads=use_threads,
        )
    return proxy.close()  # blocking
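# `_get_file_path` is not defined in this excerpt. The sketch below is a
# hypothetical reconstruction inferred from the call site above: it derives a
# per-chunk key from the base path, assuming the chunk counter is inserted
# right before the first extension of the last path segment.
def _get_file_path(file_counter: int, file_path: str) -> str:
    # e.g. "s3://bkt/key.snappy.parquet" + 3 -> "s3://bkt/key_3.snappy.parquet"
    slash_index = file_path.rfind("/")
    dot_index = file_path.find(".", slash_index + 1)
    suffix = f"_{file_counter}"
    if dot_index >= 0:
        return file_path[:dot_index] + suffix + file_path[dot_index:]
    return file_path + suffix  # no extension: just append the counter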
def _to_buckets(
    func: Callable[..., List[str]],
    df: pd.DataFrame,
    path_root: str,
    bucketing_info: Tuple[List[str], int],
    filename_prefix: str,
    boto3_session: boto3.Session,
    use_threads: Union[bool, int],
    proxy: Optional[_WriteProxy] = None,
    **func_kwargs: Any,
) -> List[str]:
    _proxy: _WriteProxy = proxy if proxy else _WriteProxy(use_threads=False)
    # Cast to object so each row keeps native Python values while hashing.
    bucket_number_series = df.astype("O").apply(
        lambda row: _get_bucket_number(
            bucketing_info[1], [row[col_name] for col_name in bucketing_info[0]]
        ),
        axis="columns",
    )
    # A categorical dtype over range(n) plus observed=False guarantees the
    # groupby yields every bucket number, including empty buckets.
    bucket_number_series = bucket_number_series.astype(
        pd.CategoricalDtype(range(bucketing_info[1]))
    )
    for bucket_number, subgroup in df.groupby(by=bucket_number_series, observed=False):
        _proxy.write(
            func=func,
            df=subgroup,
            path_root=path_root,
            filename_prefix=f"{filename_prefix}_bucket-{bucket_number:05d}",
            boto3_session=boto3_session,
            use_threads=use_threads,
            **func_kwargs,
        )
    # When an outer proxy was passed in, the caller owns the blocking close()
    # and will collect the paths itself.
    if proxy:
        return []
    paths: List[str] = _proxy.close()  # blocking
    return paths
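# `_get_bucket_number` is likewise external to this excerpt. A plausible
# sketch, assuming a Hive-style scheme: combine the hashes of the
# bucketing-column values and reduce modulo the bucket count. The hashing
# below is illustrative only; real bucketing must hash each dtype exactly as
# the target query engine does, or the buckets will not align.
def _get_bucket_number(number_of_buckets: int, values: List[Any]) -> int:
    combined = 0
    for value in values:
        combined = 31 * combined + _hash_value(value)  # Java/Hive-style mix
    return combined % number_of_buckets


def _hash_value(value: Any) -> int:
    # Hypothetical helper: map any value to a non-negative integer hash.
    if value is None:
        return 0
    return hash(str(value)) & 0x7FFFFFFF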
def _to_partitions(
    func: Callable[..., List[str]],
    concurrent_partitioning: bool,
    df: pd.DataFrame,
    path_root: str,
    use_threads: Union[bool, int],
    mode: str,
    partition_cols: List[str],
    partitions_types: Optional[Dict[str, str]],
    catalog_id: Optional[str],
    database: Optional[str],
    table: Optional[str],
    table_type: Optional[str],
    transaction_id: Optional[str],
    bucketing_info: Optional[Tuple[List[str], int]],
    filename_prefix: str,
    boto3_session: boto3.Session,
    **func_kwargs: Any,
) -> Tuple[List[str], Dict[str, List[str]]]:
    partitions_values: Dict[str, List[str]] = {}
    proxy: _WriteProxy = _WriteProxy(use_threads=concurrent_partitioning)
    for keys, subgroup in df.groupby(by=partition_cols, observed=True):
        # Partition columns become Hive-style path segments, so drop them
        # from the data before writing.
        subgroup = subgroup.drop(partition_cols, axis="columns")
        keys = (keys,) if not isinstance(keys, tuple) else keys
        subdir = "/".join([f"{name}={val}" for name, val in zip(partition_cols, keys)])
        prefix: str = f"{path_root}{subdir}/"
        if mode == "overwrite_partitions":
            if (table_type == "GOVERNED") and (table is not None) and (database is not None):
                # Governed tables: deregister the partition's objects through
                # Lake Formation inside the transaction instead of deleting
                # straight from S3.
                del_objects: List[Dict[str, Any]] = lakeformation._get_table_objects(  # pylint: disable=protected-access
                    catalog_id=catalog_id,
                    database=database,
                    table=table,
                    transaction_id=transaction_id,  # type: ignore
                    partition_cols=partition_cols,
                    partitions_values=keys,
                    partitions_types=partitions_types,
                    boto3_session=boto3_session,
                )
                if del_objects:
                    lakeformation._update_table_objects(  # pylint: disable=protected-access
                        catalog_id=catalog_id,
                        database=database,
                        table=table,
                        transaction_id=transaction_id,  # type: ignore
                        del_objects=del_objects,
                        boto3_session=boto3_session,
                    )
            else:
                delete_objects(
                    path=prefix,
                    use_threads=use_threads,
                    boto3_session=boto3_session,
                    s3_additional_kwargs=func_kwargs.get("s3_additional_kwargs"),
                )
        if bucketing_info:
            # Bucketed writes share this proxy, so _to_buckets returns [] and
            # the paths are collected by the blocking close() below.
            _to_buckets(
                func=func,
                df=subgroup,
                path_root=prefix,
                bucketing_info=bucketing_info,
                boto3_session=boto3_session,
                use_threads=use_threads,
                proxy=proxy,
                filename_prefix=filename_prefix,
                **func_kwargs,
            )
        else:
            proxy.write(
                func=func,
                df=subgroup,
                path_root=prefix,
                filename_prefix=filename_prefix,
                boto3_session=boto3_session,
                use_threads=use_threads,
                **func_kwargs,
            )
        partitions_values[prefix] = [str(k) for k in keys]
    paths: List[str] = proxy.close()  # blocking
    return paths, partitions_values
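# All three functions funnel writes through `_WriteProxy`, which is defined
# elsewhere in the package. Below is a minimal sketch of the contract the
# call sites rely on, assuming a thread-pool fan-out: write() runs func
# inline when use_threads is falsy, otherwise submits it to the pool;
# close() blocks until every submitted call finishes and returns the
# flattened list of written paths.
import concurrent.futures
import itertools


class _WriteProxy:
    def __init__(self, use_threads: Union[bool, int]):
        self._results: List[str] = []
        self._futures: List[concurrent.futures.Future] = []
        # An int (but not bool) use_threads is treated as a worker count.
        workers = (
            use_threads
            if isinstance(use_threads, int) and not isinstance(use_threads, bool)
            else None
        )
        self._exec = (
            concurrent.futures.ThreadPoolExecutor(max_workers=workers) if use_threads else None
        )

    def write(self, func: Callable[..., List[str]], **kwargs: Any) -> None:
        if self._exec is None:
            self._results += func(**kwargs)  # sequential fallback
        else:
            self._futures.append(self._exec.submit(func, **kwargs))

    def close(self) -> List[str]:
        # Blocking: gather every future's paths, then shut the pool down.
        if self._exec is not None:
            self._results += list(
                itertools.chain.from_iterable(f.result() for f in self._futures)
            )
            self._exec.shutdown()
        return self._results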