def delete_all_partitions(
    table: str, database: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
) -> List[List[str]]:
    """Delete all partitions in an AWS Glue Catalog table.

    Parameters
    ----------
    table : str
        Table name.
    database : str
        Database name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    List[List[str]]
        Partitions values.

    Examples
    --------
    >>> import awswrangler as wr
    >>> partitions = wr.catalog.delete_all_partitions(
    ...     table='my_table',
    ...     database='awswrangler_test',
    ... )

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    _logger.debug("Fetching existing partitions...")
    partitions_values: List[List[str]] = list(
        _get_partitions(database=database, table=table, boto3_session=session, catalog_id=catalog_id).values()
    )
    _logger.debug("Number of old partitions: %s", len(partitions_values))
    _logger.debug("Deleting existing partitions...")
    delete_partitions(
        table=table,
        database=database,
        catalog_id=catalog_id,
        partitions_values=partitions_values,
        boto3_session=boto3_session,
    )
    return partitions_values
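
# Illustrative note (hypothetical table partitioned by ``year`` and ``month``, not taken from the library
# docs): the value returned by ``delete_all_partitions`` is the list of partition value lists that were
# removed from the catalog, e.g.
#
#     >>> import awswrangler as wr
#     >>> wr.catalog.delete_all_partitions(table="my_table", database="awswrangler_test")
#     [['2020', '1'], ['2020', '2'], ['2021', '1']]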


def _create_table(  # pylint: disable=too-many-branches,too-many-statements
    database: str,
    table: str,
    description: Optional[str],
    parameters: Optional[Dict[str, str]],
    mode: str,
    catalog_versioning: bool,
    boto3_session: Optional[boto3.Session],
    table_input: Dict[str, Any],
    table_exist: bool,
    projection_enabled: bool,
    partitions_types: Optional[Dict[str, str]],
    columns_comments: Optional[Dict[str, str]],
    projection_types: Optional[Dict[str, str]],
    projection_ranges: Optional[Dict[str, str]],
    projection_values: Optional[Dict[str, str]],
    projection_intervals: Optional[Dict[str, str]],
    projection_digits: Optional[Dict[str, str]],
    catalog_id: Optional[str],
) -> None:
    # Description
    mode = _update_if_necessary(dic=table_input, key="Description", value=description, mode=mode)

    # Parameters
    parameters = parameters if parameters else {}
    for k, v in parameters.items():
        mode = _update_if_necessary(dic=table_input["Parameters"], key=k, value=v, mode=mode)

    # Projection
    if projection_enabled is True:
        table_input["Parameters"]["projection.enabled"] = "true"
        partitions_types = partitions_types if partitions_types else {}
        projection_types = projection_types if projection_types else {}
        projection_ranges = projection_ranges if projection_ranges else {}
        projection_values = projection_values if projection_values else {}
        projection_intervals = projection_intervals if projection_intervals else {}
        projection_digits = projection_digits if projection_digits else {}
        projection_types = {sanitize_column_name(k): v for k, v in projection_types.items()}
        projection_ranges = {sanitize_column_name(k): v for k, v in projection_ranges.items()}
        projection_values = {sanitize_column_name(k): v for k, v in projection_values.items()}
        projection_intervals = {sanitize_column_name(k): v for k, v in projection_intervals.items()}
        projection_digits = {sanitize_column_name(k): v for k, v in projection_digits.items()}
        for k, v in projection_types.items():
            dtype: Optional[str] = partitions_types.get(k)
            if dtype is None:
                raise exceptions.InvalidArgumentCombination(
                    f"Column {k} appears as projected column but not as partitioned column."
                )
            if dtype == "date":
                table_input["Parameters"][f"projection.{k}.format"] = "yyyy-MM-dd"
            elif dtype == "timestamp":
                table_input["Parameters"][f"projection.{k}.format"] = "yyyy-MM-dd HH:mm:ss"
                table_input["Parameters"][f"projection.{k}.interval.unit"] = "SECONDS"
                table_input["Parameters"][f"projection.{k}.interval"] = "1"
        for k, v in projection_types.items():
            mode = _update_if_necessary(dic=table_input["Parameters"], key=f"projection.{k}.type", value=v, mode=mode)
        for k, v in projection_ranges.items():
            mode = _update_if_necessary(dic=table_input["Parameters"], key=f"projection.{k}.range", value=v, mode=mode)
        for k, v in projection_values.items():
            mode = _update_if_necessary(dic=table_input["Parameters"], key=f"projection.{k}.values", value=v, mode=mode)
        for k, v in projection_intervals.items():
            mode = _update_if_necessary(
                dic=table_input["Parameters"], key=f"projection.{k}.interval", value=str(v), mode=mode
            )
        for k, v in projection_digits.items():
            mode = _update_if_necessary(
                dic=table_input["Parameters"], key=f"projection.{k}.digits", value=str(v), mode=mode
            )
    else:
        table_input["Parameters"]["projection.enabled"] = "false"

    # Column comments
    columns_comments = columns_comments if columns_comments else {}
    columns_comments = {sanitize_column_name(k): v for k, v in columns_comments.items()}
    if columns_comments:
        for col in table_input["StorageDescriptor"]["Columns"]:
            name: str = col["Name"]
            if name in columns_comments:
                mode = _update_if_necessary(dic=col, key="Comment", value=columns_comments[name], mode=mode)
        for par in table_input["PartitionKeys"]:
            name = par["Name"]
            if name in columns_comments:
                mode = _update_if_necessary(dic=par, key="Comment", value=columns_comments[name], mode=mode)

    _logger.debug("table_input: %s", table_input)

    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client_glue: boto3.client = _utils.client(service_name="glue", session=session)
    skip_archive: bool = not catalog_versioning
    if mode not in ("overwrite", "append", "overwrite_partitions", "update"):
        raise exceptions.InvalidArgument(
            f"{mode} is not a valid mode. It must be 'overwrite', 'append', 'overwrite_partitions' or 'update'."
        )
    if table_exist is True and mode == "overwrite":
        _logger.debug("Fetching existing partitions...")
        partitions_values: List[List[str]] = list(
            _get_partitions(database=database, table=table, boto3_session=session, catalog_id=catalog_id).values()
        )
        _logger.debug("Number of old partitions: %s", len(partitions_values))
        _logger.debug("Deleting existing partitions...")
        client_glue.batch_delete_partition(
            **_catalog_id(
                catalog_id=catalog_id,
                DatabaseName=database,
                TableName=table,
                PartitionsToDelete=[{"Values": v} for v in partitions_values],
            )
        )
        _logger.debug("Updating table...")
        client_glue.update_table(
            **_catalog_id(
                catalog_id=catalog_id, DatabaseName=database, TableInput=table_input, SkipArchive=skip_archive
            )
        )
    elif (table_exist is True) and (mode in ("append", "overwrite_partitions", "update")):
        if mode == "update":
            client_glue.update_table(
                **_catalog_id(
                    catalog_id=catalog_id, DatabaseName=database, TableInput=table_input, SkipArchive=skip_archive
                )
            )
    elif table_exist is False:
        try:
            client_glue.create_table(
                **_catalog_id(catalog_id=catalog_id, DatabaseName=database, TableInput=table_input)
            )
        except client_glue.exceptions.AlreadyExistsException as ex:
            if mode == "overwrite":
                delete_table_if_exists(database=database, table=table, boto3_session=session, catalog_id=catalog_id)
                client_glue.create_table(
                    **_catalog_id(catalog_id=catalog_id, DatabaseName=database, TableInput=table_input)
                )
            else:
                raise ex
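
# Illustrative sketch (hypothetical column name and values, not taken from the library): for a table
# partitioned by an integer ``year`` column with partition projection enabled, the projection loops in
# ``_create_table`` above would leave ``table_input["Parameters"]`` looking roughly like:
#
#     {
#         "projection.enabled": "true",
#         "projection.year.type": "integer",
#         "projection.year.range": "2015,2025",
#     }
#
# A ``date`` or ``timestamp`` partition column additionally receives a ``projection.<col>.format`` key
# (plus ``projection.<col>.interval`` and ``projection.<col>.interval.unit`` for timestamps), as set by
# the dtype branch above.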


def read_parquet_table(
    table: str,
    database: str,
    filename_suffix: Union[str, List[str], None] = None,
    filename_ignore_suffix: Union[str, List[str], None] = None,
    catalog_id: Optional[str] = None,
    partition_filter: Optional[Callable[[Dict[str, str]], bool]] = None,
    columns: Optional[List[str]] = None,
    validate_schema: bool = True,
    categories: Optional[List[str]] = None,
    safe: bool = True,
    map_types: bool = True,
    chunked: Union[bool, int] = False,
    use_threads: Union[bool, int] = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Read an Apache Parquet table registered in the AWS Glue Catalog.

    Note
    ----
    ``Batching`` (`chunked` argument) (Memory Friendly):

    Will enable the function to return an Iterable of DataFrames instead of a regular DataFrame.

    There are two batching strategies on Wrangler:

    - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset.

    - If **chunked=INTEGER**, Wrangler will paginate through files slicing and concatenating
      to return DataFrames with the number of rows equal to the received INTEGER.

    `P.S.` `chunked=True` is faster and uses less memory while `chunked=INTEGER` is more precise
    in the number of rows for each DataFrame.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned is obtained from os.cpu_count().

    Parameters
    ----------
    table : str
        AWS Glue Catalog table name.
    database : str
        AWS Glue Catalog database name.
    filename_suffix : Union[str, List[str], None]
        Suffix or List of suffixes to be read (e.g. [".gz.parquet", ".snappy.parquet"]).
        If None, will try to read all files. (default)
    filename_ignore_suffix : Union[str, List[str], None]
        Suffix or List of suffixes for S3 keys to be ignored (e.g. [".csv", "_SUCCESS"]).
        If None, will try to read all files. (default)
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    partition_filter : Optional[Callable[[Dict[str, str]], bool]]
        Callback Function filters to apply on PARTITION columns (PUSH-DOWN filter).
        This function MUST receive a single argument (Dict[str, str]) where keys are partitions names
        and values are partitions values. Partitions values will be always strings extracted from S3.
        This function MUST return a bool, True to read the partition or False to ignore it.
        E.g. ``lambda x: True if x["year"] == "2020" and x["month"] == "1" else False``
        https://aws-data-wrangler.readthedocs.io/en/2.13.0/tutorials/023%20-%20Flexible%20Partitions%20Filter.html
    columns : List[str], optional
        Names of columns to read from the file(s).
    validate_schema : bool
        Check that individual file schemas are all the same / compatible. Schemas within a
        folder prefix should all be the same. Disable if you have schemas that are different
        and want to disable this check.
    categories : Optional[List[str]], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    safe : bool, default True
        For certain data types, a cast is needed in order to store the
        data in a pandas DataFrame or Series (e.g. timestamps are always
        stored as nanoseconds in pandas). This option controls whether it
        is a safe cast or not.
    map_types : bool, default True
        True to convert pyarrow DataTypes to pandas ExtensionDtypes. It is
        used to override the default pandas type for conversion of built-in
        pyarrow types or in absence of pandas_metadata in the Table schema.
    chunked : Union[bool, int]
        If True, will break the data into smaller DataFrames (non-deterministic number of rows).
        Otherwise return a single DataFrame with the whole data.
    use_threads : Union[bool, int]
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
        If integer is provided, specified number is used.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered.

    Returns
    -------
    Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]]
        Pandas DataFrame or a Generator in case of `chunked=True`.

    Examples
    --------
    Reading Parquet Table

    >>> import awswrangler as wr
    >>> df = wr.s3.read_parquet_table(database='...', table='...')

    Reading Parquet Table encrypted

    >>> import awswrangler as wr
    >>> df = wr.s3.read_parquet_table(
    ...     database='...',
    ...     table='...',
    ...     s3_additional_kwargs={
    ...         'ServerSideEncryption': 'aws:kms',
    ...         'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'
    ...     }
    ... )

    Reading Parquet Table in chunks (Chunk by file)

    >>> import awswrangler as wr
    >>> dfs = wr.s3.read_parquet_table(database='...', table='...', chunked=True)
    >>> for df in dfs:
    >>>     print(df)  # Smaller Pandas DataFrame

    Reading Parquet Table with PUSH-DOWN filter over partitions

    >>> import awswrangler as wr
    >>> my_filter = lambda x: True if x["city"].startswith("new") else False
    >>> df = wr.s3.read_parquet_table(database='...', table='...', partition_filter=my_filter)

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    args: Dict[str, Any] = {"DatabaseName": database, "Name": table}
    if catalog_id is not None:
        args["CatalogId"] = catalog_id
    res: Dict[str, Any] = client_glue.get_table(**args)
    try:
        location: str = res["Table"]["StorageDescriptor"]["Location"]
        path: str = location if location.endswith("/") else f"{location}/"
    except KeyError as ex:
        raise exceptions.InvalidTable(f"Missing s3 location for {database}.{table}.") from ex
    path_root: Optional[str] = None
    paths: Union[str, List[str]] = path
    # If a partition filter is available, fetch the partitions and filter them,
    # then list objects and process individual object keys under path_root.
    if partition_filter is not None:
        available_partitions_dict = _get_partitions(
            database=database,
            table=table,
            catalog_id=catalog_id,
            boto3_session=boto3_session,
        )
        available_partitions = list(available_partitions_dict.keys())
        if available_partitions:
            paths = []
            path_root = path
            partitions: Union[str, List[str]] = _apply_partition_filter(
                path_root=path_root, paths=available_partitions, filter_func=partition_filter
            )
            for partition in partitions:
                paths += _path2list(
                    path=partition,
                    boto3_session=boto3_session,
                    suffix=filename_suffix,
                    ignore_suffix=_get_path_ignore_suffix(path_ignore_suffix=filename_ignore_suffix),
                    s3_additional_kwargs=s3_additional_kwargs,
                )
    df = read_parquet(
        path=paths,
        path_root=path_root,
        path_suffix=filename_suffix if path_root is None else None,
        path_ignore_suffix=filename_ignore_suffix if path_root is None else None,
        columns=columns,
        validate_schema=validate_schema,
        categories=categories,
        safe=safe,
        map_types=map_types,
        chunked=chunked,
        dataset=True,
        use_threads=use_threads,
        boto3_session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    partial_cast_function = functools.partial(
        _data_types.cast_pandas_with_athena_types, dtype=_extract_partitions_dtypes_from_table_details(response=res)
    )
    if isinstance(df, pd.DataFrame):
        return partial_cast_function(df)
    # df is a generator, so map is needed for casting dtypes
    return map(partial_cast_function, df)
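
# Usage sketch (hypothetical database/table names): reading a Glue table in batches of a fixed number
# of rows via ``chunked=INTEGER``, which the docstring above describes but does not exemplify:
#
#     >>> import awswrangler as wr
#     >>> dfs = wr.s3.read_parquet_table(database="my_db", table="my_table", chunked=100_000)
#     >>> for df in dfs:
#     ...     print(len(df))  # at most 100,000 rows per DataFrame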