def _update_table_objects(
    catalog_id: Optional[str],
    database: str,
    table: str,
    transaction_id: str,
    boto3_session: Optional[boto3.Session],
    add_objects: Optional[List[Dict[str, Any]]] = None,
    del_objects: Optional[List[Dict[str, Any]]] = None,
) -> None:
    """Register Governed Table Object changes (adds/deletes) with the Lake Formation engine."""
    boto3_sess: boto3.Session = _utils.ensure_session(session=boto3_session)
    lf_client: boto3.client = _utils.client(service_name="lakeformation", session=boto3_sess)
    update_args: Dict[str, Union[str, int, List[Dict[str, Dict[str, Any]]]]] = _catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(transaction_id=transaction_id, DatabaseName=database, TableName=table),
    )
    operations: List[Dict[str, Dict[str, Any]]] = []
    for obj in add_objects or []:
        operations.append({"AddObject": obj})
    for obj in del_objects or []:
        # "Size" is not accepted by the DeleteObject operation, so strip it.
        operations.append({"DeleteObject": _without_keys(obj, ["Size"])})
    update_args["WriteOperations"] = operations
    lf_client.update_table_objects(**update_args)
def _get_table_input(
    database: str,
    table: str,
    boto3_session: Optional[boto3.Session],
    transaction_id: Optional[str] = None,
    catalog_id: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    """Fetch a Glue table and keep only the fields reusable as a TableInput, or None if the table is missing."""
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    get_args: Dict[str, Any] = _catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(transaction_id=transaction_id, DatabaseName=database, Name=table),
    )
    try:
        res: Dict[str, Any] = client_glue.get_table(**get_args)
    except client_glue.exceptions.EntityNotFoundException:
        return None
    # Only these keys are valid members of a Glue TableInput structure.
    allowed_keys = (
        "Name",
        "Description",
        "Owner",
        "LastAccessTime",
        "LastAnalyzedTime",
        "Retention",
        "StorageDescriptor",
        "PartitionKeys",
        "ViewOriginalText",
        "ViewExpandedText",
        "TableType",
        "Parameters",
        "TargetTable",
    )
    return {k: v for k, v in res["Table"].items() if k in allowed_keys}
def _overwrite_table(
    client_glue: boto3.client,
    catalog_id: Optional[str],
    database: str,
    table: str,
    table_input: Dict[str, Any],
    transaction_id: Optional[str],
    boto3_session: boto3.Session,
) -> None:
    """Drop the table if it exists, then recreate it from ``table_input``."""
    delete_table_if_exists(
        database=database,
        table=table,
        transaction_id=transaction_id,
        boto3_session=boto3_session,
        catalog_id=catalog_id,
    )
    create_args: Dict[str, Any] = _catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(transaction_id=transaction_id, DatabaseName=database, TableInput=table_input),
    )
    client_glue.create_table(**create_args)
def get_columns_comments(
    database: str,
    table: str,
    transaction_id: Optional[str] = None,
    query_as_of_time: Optional[str] = None,
    catalog_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, str]:
    """Get all columns comments.

    Note
    ----
    If reading from a governed table, pass only one of `transaction_id` or `query_as_of_time`.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    query_as_of_time: str, optional
        The time as of when to read the table contents. Must be a valid Unix epoch timestamp.
        Cannot be specified alongside transaction_id.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Dict[str, str]
        Columns comments. e.g. {"col1": "foo boo bar"}.

    Examples
    --------
    >>> import awswrangler as wr
    >>> pars = wr.catalog.get_columns_comments(database="...", table="...")

    """
    glue_client: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    res: Dict[str, Any] = glue_client.get_table(
        **_catalog_id(
            catalog_id=catalog_id,
            **_transaction_id(
                transaction_id=transaction_id, query_as_of_time=query_as_of_time, DatabaseName=database, Name=table
            ),
        )
    )
    tbl: Dict[str, Any] = res["Table"]
    # Regular columns first; partition keys with the same name overwrite their entry.
    comments: Dict[str, str] = {col["Name"]: col.get("Comment") for col in tbl["StorageDescriptor"]["Columns"]}
    for key in tbl.get("PartitionKeys", []):
        comments[key["Name"]] = key.get("Comment")
    return comments
def get_table_location(
    database: str,
    table: str,
    transaction_id: Optional[str] = None,
    query_as_of_time: Optional[str] = None,
    catalog_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Get table's location on Glue catalog.

    Note
    ----
    If reading from a governed table, pass only one of `transaction_id` or `query_as_of_time`.

    Parameters
    ----------
    database: str
        Database name.
    table: str
        Table name.
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    query_as_of_time: str, optional
        The time as of when to read the table contents. Must be a valid Unix epoch timestamp.
        Cannot be specified alongside transaction_id.
    catalog_id: str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session: boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    str
        Table's location.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.get_table_location(database='default', table='my_table')
    's3://bucket/prefix/'

    """
    glue_client: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    get_table_args: Dict[str, Any] = _catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(
            transaction_id=transaction_id, query_as_of_time=query_as_of_time, DatabaseName=database, Name=table
        ),
    )
    response: Dict[str, Any] = glue_client.get_table(**get_table_args)
    try:
        location = response["Table"]["StorageDescriptor"]["Location"]
    except KeyError as ex:
        # Tables without a registered S3 location (e.g. some view types) are not usable here.
        raise exceptions.InvalidTable(f"{database}.{table}") from ex
    return cast(str, location)
def get_table_types(
    database: str,
    table: str,
    transaction_id: Optional[str] = None,
    query_as_of_time: Optional[str] = None,
    catalog_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Optional[Dict[str, str]]:
    """Get all columns and types from a table.

    Note
    ----
    If reading from a governed table, pass only one of `transaction_id` or `query_as_of_time`.

    Parameters
    ----------
    database: str
        Database name.
    table: str
        Table name.
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    query_as_of_time: str, optional
        The time as of when to read the table contents. Must be a valid Unix epoch timestamp.
        Cannot be specified alongside transaction_id.
    catalog_id: str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session: boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Optional[Dict[str, str]]
        If table exists, a dictionary like {'col name': 'col data type'}. Otherwise None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.get_table_types(database='default', table='my_table')
    {'col0': 'int', 'col1': double}

    """
    glue_client: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    get_table_args: Dict[str, Any] = _catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(
            transaction_id=transaction_id, query_as_of_time=query_as_of_time, DatabaseName=database, Name=table
        ),
    )
    try:
        res: Dict[str, Any] = glue_client.get_table(**get_table_args)
    except glue_client.exceptions.EntityNotFoundException:
        # Missing tables are signalled with None rather than an exception.
        return None
    return _extract_dtypes_from_table_details(response=res)
def get_table_description(
    database: str,
    table: str,
    transaction_id: Optional[str] = None,
    query_as_of_time: Optional[str] = None,
    catalog_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Optional[str]:
    """Get table description.

    Note
    ----
    If reading from a governed table, pass only one of `transaction_id` or `query_as_of_time`.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    query_as_of_time: str, optional
        The time as of when to read the table contents. Must be a valid Unix epoch timestamp.
        Cannot be specified alongside transaction_id.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Optional[str]
        Description if exists.

    Examples
    --------
    >>> import awswrangler as wr
    >>> desc = wr.catalog.get_table_description(database="...", table="...")

    """
    glue_client: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    res: Dict[str, Any] = glue_client.get_table(
        **_catalog_id(
            catalog_id=catalog_id,
            **_transaction_id(
                transaction_id=transaction_id, query_as_of_time=query_as_of_time, DatabaseName=database, Name=table
            ),
        )
    )
    description: Optional[str] = res["Table"].get("Description", None)
    return description
def delete_table_if_exists(
    database: str,
    table: str,
    transaction_id: Optional[str] = None,
    catalog_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> bool:
    """Delete Glue table if exists.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    bool
        True if deleted, otherwise False.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.delete_table_if_exists(database='default', table='my_table')  # deleted
    True
    >>> wr.catalog.delete_table_if_exists(database='default', table='my_table')  # Nothing to be deleted
    False

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    try:
        # Fix: `catalog_id` must be handled by `_catalog_id` (which maps it to the Glue
        # `CatalogId` parameter). Previously it was passed into `_transaction_id`'s kwargs,
        # leaking an invalid lowercase `catalog_id` key into the delete_table call and
        # never applying the requested catalog. Every sibling helper nests it this way.
        client_glue.delete_table(
            **_catalog_id(
                catalog_id=catalog_id,
                **_transaction_id(transaction_id=transaction_id, DatabaseName=database, Name=table),
            )
        )
        return True
    except client_glue.exceptions.EntityNotFoundException:
        # Table was already absent: report False instead of raising.
        return False
def _get_table_objects(
    catalog_id: Optional[str],
    database: str,
    table: str,
    transaction_id: str,
    boto3_session: Optional[boto3.Session],
    partition_cols: Optional[List[str]] = None,
    partitions_types: Optional[Dict[str, str]] = None,
    partitions_values: Optional[List[str]] = None,
) -> List[Dict[str, Any]]:
    """Get Governed Table Objects from Lake Formation Engine.

    Pages through ``get_table_objects`` for the given transaction and returns a flat
    list of table objects; each object is tagged with its partition values (when any).
    A partition predicate is applied only when all three partition arguments are given.
    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client_lakeformation: boto3.client = _utils.client(service_name="lakeformation", session=session)
    # Base scan arguments; MaxResults bounds each page, NextToken is added while paging.
    scan_kwargs: Dict[str, Union[str, int]] = _catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(transaction_id=transaction_id, DatabaseName=database, TableName=table, MaxResults=100),
    )
    if partition_cols and partitions_types and partitions_values:
        scan_kwargs["PartitionPredicate"] = _build_partition_predicate(
            partition_cols=partition_cols, partitions_types=partitions_types, partitions_values=partitions_values
        )
    next_token: str = "init_token"  # Dummy token so the while loop runs at least once.
    table_objects: List[Dict[str, Any]] = []
    while next_token:
        # Retry on ResourceNotReadyException: the engine may still be materializing
        # the transaction's view of the table.
        response = _utils.try_it(
            f=client_lakeformation.get_table_objects,
            ex=botocore.exceptions.ClientError,
            ex_code="ResourceNotReadyException",
            base=1.0,
            max_num_tries=5,
            **scan_kwargs,
        )
        # Flatten: each partition bucket carries its own list of objects.
        for objects in response["Objects"]:
            for table_object in objects["Objects"]:
                if objects["PartitionValues"]:
                    # Propagate the bucket's partition values onto each object.
                    table_object["PartitionValues"] = objects["PartitionValues"]
                table_objects.append(table_object)
        # Absent NextToken ends the loop before the stale token is ever sent.
        next_token = response.get("NextToken", None)
        scan_kwargs["NextToken"] = next_token
    return table_objects
def _overwrite_table_parameters(
    parameters: Dict[str, str],
    database: str,
    transaction_id: Optional[str],
    catalog_versioning: bool,
    catalog_id: Optional[str],
    table_input: Dict[str, Any],
    boto3_session: Optional[boto3.Session],
) -> Dict[str, str]:
    """Replace the table's Parameters with ``parameters`` and push the update to Glue."""
    table_input["Parameters"] = parameters
    glue_client: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    update_args: Dict[str, Any] = _catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(
            transaction_id=transaction_id,
            DatabaseName=database,
            TableInput=table_input,
            # With catalog versioning enabled we keep archived versions (SkipArchive=False).
            SkipArchive=not catalog_versioning,
        ),
    )
    glue_client.update_table(**update_args)
    return parameters
def read_sql_query(
    sql: str,
    database: str,
    transaction_id: Optional[str] = None,
    query_as_of_time: Optional[str] = None,
    catalog_id: Optional[str] = None,
    categories: Optional[List[str]] = None,
    safe: bool = True,
    map_types: bool = True,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    params: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
    """Execute PartiQL query on AWS Glue Table (Transaction ID or time travel timestamp). Return Pandas DataFrame.

    Note
    ----
    ORDER BY operations are not honoured.
    i.e. sql="SELECT * FROM my_table ORDER BY my_column" is NOT valid

    Note
    ----
    The database must NOT be explicitely defined in the PartiQL statement.
    i.e. sql="SELECT * FROM my_table" is valid
    but sql="SELECT * FROM my_db.my_table" is NOT valid

    Note
    ----
    Pass one of `transaction_id` or `query_as_of_time`, not both.

    Parameters
    ----------
    sql : str
        partiQL query.
    database : str
        AWS Glue database name
    transaction_id : str, optional
        The ID of the transaction at which to read the table contents.
        Cannot be specified alongside query_as_of_time
    query_as_of_time : str, optional
        The time as of when to read the table contents. Must be a valid Unix epoch timestamp.
        Cannot be specified alongside transaction_id
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    categories: Optional[List[str]], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    safe : bool, default True
        For certain data types, a cast is needed in order to store the
        data in a pandas DataFrame or Series (e.g. timestamps are always
        stored as nanoseconds in pandas). This option controls whether it
        is a safe cast or not.
    map_types : bool, default True
        True to convert pyarrow DataTypes to pandas ExtensionDtypes. It is
        used to override the default pandas type for conversion of built-in
        pyarrow types or in absence of pandas_metadata in the Table schema.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        When enabled, os.cpu_count() is used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session is used if boto3_session receives None.
    params: Dict[str, any], optional
        Dict of parameters used to format the partiQL query. Only named parameters are supported.
        The dict must contain the information in the form {"name": "value"} and the SQL query must contain
        `:name`.

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df = wr.lakeformation.read_sql_query(
    ...     sql="SELECT * FROM my_table;",
    ...     database="my_db",
    ...     catalog_id="111111111111"
    ... )

    >>> import awswrangler as wr
    >>> df = wr.lakeformation.read_sql_query(
    ...     sql="SELECT * FROM my_table LIMIT 10;",
    ...     database="my_db",
    ...     transaction_id="1b62811fa3e02c4e5fdbaa642b752030379c4a8a70da1f8732ce6ccca47afdc9"
    ... )

    >>> import awswrangler as wr
    >>> df = wr.lakeformation.read_sql_query(
    ...     sql="SELECT * FROM my_table WHERE name=:name; AND city=:city;",
    ...     database="my_db",
    ...     query_as_of_time="1611142914",
    ...     params={"name": "'filtered_name'", "city": "'filtered_city'"}
    ... )

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client_lakeformation: boto3.client = _utils.client(service_name="lakeformation", session=session)
    # Tracks whether WE started the transaction (and therefore must commit it).
    commit_trans: bool = False
    if params is None:
        params = {}
    # Named parameters are substituted textually: ':name;' -> str(value).
    # Note the trailing semicolon is part of the placeholder syntax.
    for key, value in params.items():
        sql = sql.replace(f":{key};", str(value))
    if not any([transaction_id, query_as_of_time]):
        # No explicit read point supplied: open a read-only transaction for the query.
        _logger.debug("Neither `transaction_id` nor `query_as_of_time` were specified, starting transaction")
        transaction_id = start_transaction(read_only=True, boto3_session=session)
        commit_trans = True
    args: Dict[str, Optional[str]] = _catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(transaction_id=transaction_id, query_as_of_time=query_as_of_time, DatabaseName=database),
    )
    query_id: str = client_lakeformation.start_query_planning(QueryString=sql, QueryPlanningContext=args)["QueryId"]
    # NOTE(review): if _resolve_sql_query raises, an auto-started transaction is never
    # committed nor cancelled — potential transaction leak; confirm upstream handling.
    df = _resolve_sql_query(
        query_id=query_id,
        categories=categories,
        safe=safe,
        map_types=map_types,
        use_threads=use_threads,
        boto3_session=session,
    )
    if commit_trans:
        commit_transaction(transaction_id=transaction_id)  # type: ignore
    return df
def table(
    database: str,
    table: str,
    transaction_id: Optional[str] = None,
    query_as_of_time: Optional[str] = None,
    catalog_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> pd.DataFrame:
    """Get table details as Pandas DataFrame.

    Note
    ----
    If reading from a governed table, pass only one of `transaction_id` or `query_as_of_time`.

    Parameters
    ----------
    database: str
        Database name.
    table: str
        Table name.
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    query_as_of_time: str, optional
        The time as of when to read the table contents. Must be a valid Unix epoch timestamp.
        Cannot be specified alongside transaction_id.
    catalog_id: str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session: boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    pandas.DataFrame
        Pandas DataFrame filled by formatted infos.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df_table = wr.catalog.table(database='default', table='my_table')

    """
    glue_client: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    tbl = glue_client.get_table(
        **_catalog_id(
            catalog_id=catalog_id,
            **_transaction_id(
                transaction_id=transaction_id, query_as_of_time=query_as_of_time, DatabaseName=database, Name=table
            ),
        )
    )["Table"]
    df_dict: Dict[str, List[Union[str, bool]]] = {"Column Name": [], "Type": [], "Partition": [], "Comment": []}

    def _append_row(col: Dict[str, Any], partition: bool) -> None:
        # One row per column; partition keys are flagged via the Partition column.
        df_dict["Column Name"].append(col["Name"])
        df_dict["Type"].append(col["Type"])
        df_dict["Partition"].append(partition)
        df_dict["Comment"].append(col.get("Comment", ""))

    if "StorageDescriptor" in tbl:
        for col in tbl["StorageDescriptor"].get("Columns", {}):
            _append_row(col, False)
    for col in tbl.get("PartitionKeys", []):
        _append_row(col, True)
    return pd.DataFrame(data=df_dict)
def get_tables(
    catalog_id: Optional[str] = None,
    database: Optional[str] = None,
    transaction_id: Optional[str] = None,
    name_contains: Optional[str] = None,
    name_prefix: Optional[str] = None,
    name_suffix: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Iterator[Dict[str, Any]]:
    """Get an iterator of tables.

    Note
    ----
    Please, does not filter using name_contains and name_prefix/name_suffix at the same time. Only
    name_prefix and name_suffix can be combined together.

    Parameters
    ----------
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    database : str, optional
        Database name.
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    name_contains : str, optional
        Select by a specific string on table name
    name_prefix : str, optional
        Select by a specific prefix on table name
    name_suffix : str, optional
        Select by a specific suffix on table name
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Iterator[Dict[str, Any]]
        Iterator of tables.

    Raises
    ------
    exceptions.InvalidArgumentCombination
        If name_contains is combined with name_prefix and/or name_suffix.

    Examples
    --------
    >>> import awswrangler as wr
    >>> tables = wr.catalog.get_tables()

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    paginator = client_glue.get_paginator("get_tables")
    args: Dict[str, str] = {}
    # Fix: the previous check only raised when ALL THREE filters were supplied, so
    # name_contains + name_prefix (or + name_suffix) silently ignored the prefix/suffix,
    # contradicting the documented contract. Reject any mix of name_contains with either.
    if (name_contains is not None) and ((name_prefix is not None) or (name_suffix is not None)):
        raise exceptions.InvalidArgumentCombination(
            "Please, does not filter using name_contains and "
            "name_prefix/name_suffix at the same time. Only "
            "name_prefix and name_suffix can be combined together."
        )
    # Build the Glue wildcard Expression from whichever filter(s) were supplied.
    if (name_prefix is not None) and (name_suffix is not None):
        args["Expression"] = f"{name_prefix}*{name_suffix}"
    elif name_contains is not None:
        args["Expression"] = f"*{name_contains}*"
    elif name_prefix is not None:
        args["Expression"] = f"{name_prefix}*"
    elif name_suffix is not None:
        args["Expression"] = f"*{name_suffix}"
    if database is not None:
        dbs: List[str] = [database]
    else:
        # No database given: scan every database in the catalog.
        dbs = [x["Name"] for x in get_databases(catalog_id=catalog_id)]
    for db in dbs:
        args["DatabaseName"] = db
        response_iterator = paginator.paginate(
            **_catalog_id(catalog_id=catalog_id, **_transaction_id(transaction_id=transaction_id, **args))
        )
        try:
            for page in response_iterator:
                for tbl in page["TableList"]:
                    yield tbl
        except client_glue.exceptions.EntityNotFoundException:
            # Database disappeared (or is inaccessible): skip it and keep iterating.
            continue
def delete_column(
    database: str,
    table: str,
    column_name: str,
    transaction_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
    catalog_id: Optional[str] = None,
) -> None:
    """Delete a column in a AWS Glue Catalog table.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    column_name : str
        Column name
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.

    Returns
    -------
    None
        None

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.delete_column(
    ...     database='my_db',
    ...     table='my_table',
    ...     column_name='my_col',
    ... )
    """
    glue_client: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    table_res: Dict[str, Any] = glue_client.get_table(
        **_catalog_id(
            catalog_id=catalog_id,
            **_transaction_id(transaction_id=transaction_id, DatabaseName=database, Name=table),
        )
    )
    table_input: Dict[str, Any] = _update_table_definition(table_res)
    # Drop the column from the storage descriptor before pushing the new definition.
    remaining = [c for c in table_input["StorageDescriptor"]["Columns"] if c["Name"] != column_name]
    table_input["StorageDescriptor"]["Columns"] = remaining
    res: Dict[str, Any] = glue_client.update_table(
        **_catalog_id(
            catalog_id=catalog_id,
            **_transaction_id(transaction_id=transaction_id, DatabaseName=database, TableInput=table_input),
        )
    )
    # Surface any structured errors reported by Glue as a hard failure.
    for err in res.get("Errors") or []:
        if "ErrorCode" in err.get("ErrorDetail", {}):
            raise exceptions.ServiceApiError(str(res["Errors"]))
def _create_table(  # pylint: disable=too-many-branches,too-many-statements,too-many-locals
    database: str,
    table: str,
    description: Optional[str],
    parameters: Optional[Dict[str, str]],
    mode: str,
    catalog_versioning: bool,
    boto3_session: Optional[boto3.Session],
    table_input: Dict[str, Any],
    table_type: Optional[str],
    table_exist: bool,
    projection_enabled: bool,
    partitions_types: Optional[Dict[str, str]],
    columns_comments: Optional[Dict[str, str]],
    transaction_id: Optional[str],
    projection_types: Optional[Dict[str, str]],
    projection_ranges: Optional[Dict[str, str]],
    projection_values: Optional[Dict[str, str]],
    projection_intervals: Optional[Dict[str, str]],
    projection_digits: Optional[Dict[str, str]],
    projection_storage_location_template: Optional[str],
    catalog_id: Optional[str],
) -> None:
    """Create or update a Glue catalog table according to ``mode``.

    Mutates ``table_input`` in place (description, parameters, partition-projection
    settings, column comments) and then issues create_table/update_table.
    ``_update_if_necessary`` may escalate ``mode`` when a value actually changes.

    Raises
    ------
    exceptions.InvalidArgument
        If ``mode`` is not one of 'overwrite', 'append', 'overwrite_partitions', 'update'.
    exceptions.InvalidArgumentCombination
        If a projected column is not a partition column and no storage location template is set.
    """
    # Description
    mode = _update_if_necessary(dic=table_input, key="Description", value=description, mode=mode)
    if "Parameters" not in table_input:
        table_input["Parameters"] = {}
    # Parameters
    parameters = parameters if parameters else {}
    for k, v in parameters.items():
        mode = _update_if_necessary(dic=table_input["Parameters"], key=k, value=v, mode=mode)
    # Projection
    if projection_enabled is True:
        table_input["Parameters"]["projection.enabled"] = "true"
        partitions_types = partitions_types if partitions_types else {}
        projection_types = projection_types if projection_types else {}
        projection_ranges = projection_ranges if projection_ranges else {}
        projection_values = projection_values if projection_values else {}
        projection_intervals = projection_intervals if projection_intervals else {}
        projection_digits = projection_digits if projection_digits else {}
        # Projection settings are keyed by the sanitized column name.
        projection_types = {sanitize_column_name(k): v for k, v in projection_types.items()}
        projection_ranges = {sanitize_column_name(k): v for k, v in projection_ranges.items()}
        projection_values = {sanitize_column_name(k): v for k, v in projection_values.items()}
        projection_intervals = {sanitize_column_name(k): v for k, v in projection_intervals.items()}
        projection_digits = {sanitize_column_name(k): v for k, v in projection_digits.items()}
        for k, v in projection_types.items():
            dtype: Optional[str] = partitions_types.get(k)
            if dtype is None and projection_storage_location_template is None:
                raise exceptions.InvalidArgumentCombination(
                    f"Column {k} appears as projected column but not as partitioned column."
                )
            # Athena needs an explicit format (and interval for timestamps) on temporal projections.
            if dtype == "date":
                table_input["Parameters"][f"projection.{k}.format"] = "yyyy-MM-dd"
            elif dtype == "timestamp":
                table_input["Parameters"][f"projection.{k}.format"] = "yyyy-MM-dd HH:mm:ss"
                table_input["Parameters"][f"projection.{k}.interval.unit"] = "SECONDS"
                table_input["Parameters"][f"projection.{k}.interval"] = "1"
        for k, v in projection_types.items():
            mode = _update_if_necessary(dic=table_input["Parameters"], key=f"projection.{k}.type", value=v, mode=mode)
        for k, v in projection_ranges.items():
            mode = _update_if_necessary(dic=table_input["Parameters"], key=f"projection.{k}.range", value=v, mode=mode)
        for k, v in projection_values.items():
            mode = _update_if_necessary(dic=table_input["Parameters"], key=f"projection.{k}.values", value=v, mode=mode)
        for k, v in projection_intervals.items():
            mode = _update_if_necessary(
                dic=table_input["Parameters"], key=f"projection.{k}.interval", value=str(v), mode=mode
            )
        for k, v in projection_digits.items():
            mode = _update_if_necessary(
                dic=table_input["Parameters"], key=f"projection.{k}.digits", value=str(v), mode=mode
            )
        mode = _update_if_necessary(
            table_input["Parameters"],
            key="storage.location.template",
            value=projection_storage_location_template,
            mode=mode,
        )
    else:
        table_input["Parameters"]["projection.enabled"] = "false"
    # Column comments
    columns_comments = columns_comments if columns_comments else {}
    columns_comments = {sanitize_column_name(k): v for k, v in columns_comments.items()}
    if columns_comments:
        for col in table_input["StorageDescriptor"]["Columns"]:
            name: str = col["Name"]
            if name in columns_comments:
                mode = _update_if_necessary(dic=col, key="Comment", value=columns_comments[name], mode=mode)
        for par in table_input["PartitionKeys"]:
            name = par["Name"]
            if name in columns_comments:
                mode = _update_if_necessary(dic=par, key="Comment", value=columns_comments[name], mode=mode)
    _logger.debug("table_input: %s", table_input)
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client_glue: boto3.client = _utils.client(service_name="glue", session=session)
    skip_archive: bool = not catalog_versioning
    if mode not in ("overwrite", "append", "overwrite_partitions", "update"):
        # Fix: the message previously omitted 'update' although the check accepts it.
        raise exceptions.InvalidArgument(
            f"{mode} is not a valid mode. It must be 'overwrite', 'append', 'overwrite_partitions' or 'update'."
        )
    args: Dict[str, Any] = _catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(
            transaction_id=transaction_id,
            DatabaseName=database,
            TableInput=table_input,
        ),
    )
    if table_exist:
        _logger.debug("Updating table (%s)...", mode)
        args["SkipArchive"] = skip_archive
        if mode == "overwrite":
            if table_type != "GOVERNED":
                # Non-governed overwrite: drop all partitions before replacing the definition.
                delete_all_partitions(table=table, database=database, catalog_id=catalog_id, boto3_session=session)
            client_glue.update_table(**args)
        elif mode == "update":
            client_glue.update_table(**args)
    else:
        try:
            _logger.debug("Creating table (%s)...", mode)
            client_glue.create_table(**args)
        except client_glue.exceptions.AlreadyExistsException:
            if mode == "overwrite":
                # Another writer may race the delete/create; retry on AlreadyExists.
                _utils.try_it(
                    f=_overwrite_table,
                    ex=client_glue.exceptions.AlreadyExistsException,
                    client_glue=client_glue,
                    catalog_id=catalog_id,
                    database=database,
                    table=table,
                    table_input=table_input,
                    transaction_id=transaction_id,
                    boto3_session=boto3_session,
                )
    _logger.debug("Leaving table as is (%s)...", mode)