def create_database(
    name: str,
    description: Optional[str] = None,
    catalog_id: Optional[str] = None,
    exist_ok: bool = False,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Create a database in AWS Glue Catalog.

    Parameters
    ----------
    name : str
        Database name.
    description : str, optional
        A description for the Database.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    exist_ok : bool
        If set to True will not raise an Exception if a Database with the same name already exists.
        In this case the description will be updated if it is different from the current one.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    None
        None.

    Raises
    ------
    exceptions.AlreadyExists
        If the database already exists and ``exist_ok`` is False.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.create_database(
    ...     name='awswrangler_test'
    ... )
    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    args: Dict[str, str] = {"Name": name}
    if description is not None:
        args["Description"] = description

    try:
        # The existence check must target the same catalog as the update/create
        # calls below, otherwise a database in another catalog is looked up.
        r = client_glue.get_database(**_catalog_id(catalog_id=catalog_id, Name=name))
        if not exist_ok:
            raise exceptions.AlreadyExists(f"Database {name} already exists and <exist_ok> is set to False.")
        # Only touch the existing database when a non-empty description differs.
        if description and description != r["Database"].get("Description", ""):
            client_glue.update_database(**_catalog_id(catalog_id=catalog_id, Name=name, DatabaseInput=args))
    except client_glue.exceptions.EntityNotFoundException:
        client_glue.create_database(**_catalog_id(catalog_id=catalog_id, DatabaseInput=args))
def _update_table_objects(
    catalog_id: Optional[str],
    database: str,
    table: str,
    transaction_id: str,
    boto3_session: Optional[boto3.Session],
    add_objects: Optional[List[Dict[str, Any]]] = None,
    del_objects: Optional[List[Dict[str, Any]]] = None,
) -> None:
    """Register Governed Table Objects changes to Lake Formation Engine.

    Every entry of ``add_objects`` is sent as an ``AddObject`` write operation and
    every entry of ``del_objects`` as a ``DeleteObject`` write operation (after
    being passed through ``_without_keys(obj, ["Size"])``), in a single
    ``update_table_objects`` call.
    """
    lf_session: boto3.Session = _utils.ensure_session(session=boto3_session)
    lakeformation: boto3.client = _utils.client(service_name="lakeformation", session=lf_session)

    request: Dict[str, Union[str, int, List[Dict[str, Dict[str, Any]]]]] = _catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(transaction_id=transaction_id, DatabaseName=database, TableName=table),
    )

    # Additions come first, deletions second; a missing/empty list contributes nothing.
    operations: List[Dict[str, Dict[str, Any]]] = [{"AddObject": item} for item in add_objects or []]
    operations += [{"DeleteObject": _without_keys(item, ["Size"])} for item in del_objects or []]

    request["WriteOperations"] = operations
    lakeformation.update_table_objects(**request)
def _overwrite_table(
    client_glue: boto3.client,
    catalog_id: Optional[str],
    database: str,
    table: str,
    table_input: Dict[str, Any],
    transaction_id: Optional[str],
    boto3_session: boto3.Session,
) -> None:
    """Replace a Glue table: delete it if it exists, then create it from ``table_input``."""
    # Step 1: remove any existing definition.
    delete_table_if_exists(
        database=database,
        table=table,
        transaction_id=transaction_id,
        boto3_session=boto3_session,
        catalog_id=catalog_id,
    )

    # Step 2: create the table again from the supplied input.
    transaction_kwargs: Dict[str, Any] = _transaction_id(
        transaction_id=transaction_id,
        DatabaseName=database,
        TableInput=table_input,
    )
    create_kwargs: Dict[str, Any] = _catalog_id(catalog_id=catalog_id, **transaction_kwargs)
    client_glue.create_table(**create_kwargs)
def get_table_number_of_versions(
    database: str, table: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
) -> int:
    """Get total number of versions.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    int
        Total number of versions.

    Examples
    --------
    >>> import awswrangler as wr
    >>> num = wr.catalog.get_table_number_of_versions(database="...", table="...")

    """
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    pages = glue.get_paginator("get_table_versions").paginate(
        **_catalog_id(DatabaseName=database, TableName=table, catalog_id=catalog_id)
    )
    # Add up the number of versions found on each page.
    return sum(len(page["TableVersions"]) for page in pages)
def _add_partitions(
    database: str,
    table: str,
    boto3_session: Optional[boto3.Session],
    inputs: List[Dict[str, Any]],
    catalog_id: Optional[str] = None,
) -> None:
    """Create partitions in a Glue table through ``batch_create_partition``.

    ``inputs`` is split with ``_utils.chunkify(max_length=100)`` and one request
    is issued per chunk. A reported error whose ``ErrorDetail.ErrorCode`` is
    anything other than ``AlreadyExistsException`` raises
    ``exceptions.ServiceApiError``; errors without that field are ignored.
    """
    batches: List[List[Dict[str, Any]]] = _utils.chunkify(lst=inputs, max_length=100)
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    for batch in batches:
        response: Dict[str, Any] = glue.batch_create_partition(
            **_catalog_id(catalog_id=catalog_id, DatabaseName=database, TableName=table, PartitionInputList=batch)
        )
        for failure in response.get("Errors") or []:
            if "ErrorDetail" not in failure:
                continue
            detail = failure["ErrorDetail"]
            if "ErrorCode" not in detail:
                continue
            # Partitions that already exist are tolerated; everything else is fatal.
            if detail["ErrorCode"] != "AlreadyExistsException":
                raise exceptions.ServiceApiError(str(response["Errors"]))
def get_databases(
    catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
) -> Iterator[Dict[str, Any]]:
    """Get an iterator of databases.

    Parameters
    ----------
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Iterator[Dict[str, Any]]
        Iterator of Databases.

    Examples
    --------
    >>> import awswrangler as wr
    >>> dbs = wr.catalog.get_databases()

    """
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    pages = glue.get_paginator("get_databases").paginate(**_catalog_id(catalog_id=catalog_id))
    for page in pages:
        # Each page carries its databases under "DatabaseList".
        yield from page["DatabaseList"]
def get_table_description(
    database: str, table: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
) -> Optional[str]:
    """Get table description.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Optional[str]
        Description if exists.

    Examples
    --------
    >>> import awswrangler as wr
    >>> desc = wr.catalog.get_table_description(database="...", table="...")

    """
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    request: Dict[str, Any] = _catalog_id(catalog_id=catalog_id, DatabaseName=database, Name=table)
    table_details: Dict[str, Any] = glue.get_table(**request)["Table"]
    # None is returned when the table carries no "Description" entry.
    description: Optional[str] = table_details.get("Description")
    return description
def delete_database(
    name: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
) -> None:
    """Delete a database in AWS Glue Catalog.

    Parameters
    ----------
    name : str
        Database name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.delete_database(
    ...     name='awswrangler_test'
    ... )
    """
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    request = _catalog_id(Name=name, catalog_id=catalog_id)
    glue.delete_database(**request)
def _get_partitions(
    database: str,
    table: str,
    expression: Optional[str] = None,
    catalog_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, List[str]]:
    """Collect the partitions of a Glue table, following ``NextToken`` pagination.

    Each ``get_partitions`` response is handed to ``_append_partitions``, which
    fills the returned dictionary; paging continues until that helper returns
    ``None``. ``expression``, when given, is forwarded as the ``Expression``
    request argument.
    """
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    request: Dict[str, Any] = _catalog_id(
        catalog_id=catalog_id,
        DatabaseName=database,
        TableName=table,
        MaxResults=1_000,
        Segment={"SegmentNumber": 0, "TotalSegments": 1},
    )
    if expression is not None:
        request["Expression"] = expression

    collected: Dict[str, List[str]] = {}
    _logger.debug("Starting pagination...")
    while True:
        page: Dict[str, Any] = glue.get_partitions(**request)
        token: Optional[str] = _append_partitions(partitions_values=collected, response=page)
        if token is None:
            break
        # Ask for the next page on the following iteration.
        request["NextToken"] = page["NextToken"]
    _logger.debug("Pagination done.")
    return collected
def _get_table_input(
    database: str,
    table: str,
    boto3_session: Optional[boto3.Session],
    transaction_id: Optional[str] = None,
    catalog_id: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    """Fetch a Glue table and keep only a fixed set of its top-level fields.

    Returns ``None`` when ``get_table`` raises ``EntityNotFoundException``;
    otherwise a dictionary with the whitelisted keys found in the response's
    ``Table`` entry.
    """
    kept_fields = (
        "Name",
        "Description",
        "Owner",
        "LastAccessTime",
        "LastAnalyzedTime",
        "Retention",
        "StorageDescriptor",
        "PartitionKeys",
        "ViewOriginalText",
        "ViewExpandedText",
        "TableType",
        "Parameters",
        "TargetTable",
    )
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    request: Dict[str, Any] = _catalog_id(
        catalog_id=catalog_id, **_transaction_id(transaction_id=transaction_id, DatabaseName=database, Name=table)
    )
    try:
        response: Dict[str, Any] = glue.get_table(**request)
    except glue.exceptions.EntityNotFoundException:
        return None
    # Every other key of the response's table entry is dropped.
    return {field: value for field, value in response["Table"].items() if field in kept_fields}
def get_columns_comments(
    database: str,
    table: str,
    transaction_id: Optional[str] = None,
    query_as_of_time: Optional[str] = None,
    catalog_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, str]:
    """Get all columns comments.

    Note
    ----
    If reading from a governed table, pass only one of `transaction_id` or `query_as_of_time`.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    query_as_of_time: str, optional
        The time as of when to read the table contents. Must be a valid Unix epoch timestamp.
        Cannot be specified alongside transaction_id.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Dict[str, str]
        Columns comments. e.g. {"col1": "foo boo bar"}.
        A column without a comment maps to None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> pars = wr.catalog.get_columns_comments(database="...", table="...")

    """
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    request: Dict[str, Any] = _catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(
            transaction_id=transaction_id, query_as_of_time=query_as_of_time, DatabaseName=database, Name=table
        ),
    )
    details: Dict[str, Any] = glue.get_table(**request)["Table"]

    # Regular columns first, then partition keys (when the table has any).
    comments: Dict[str, str] = {
        column["Name"]: column.get("Comment") for column in details["StorageDescriptor"]["Columns"]
    }
    comments.update({key["Name"]: key.get("Comment") for key in details.get("PartitionKeys", [])})
    return comments
def table(
    database: str, table: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
) -> pd.DataFrame:
    """Get table details as Pandas DataFrame.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    pandas.DataFrame
        Pandas DataFrame filled by formatted infos: one row per column with
        "Column Name", "Type", "Partition" and "Comment".

    Examples
    --------
    >>> import awswrangler as wr
    >>> df_table = wr.catalog.table(database='default', table='my_table')

    """
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    details = glue.get_table(**_catalog_id(catalog_id=catalog_id, DatabaseName=database, Name=table))["Table"]

    names: List[Union[str, bool]] = []
    dtypes: List[Union[str, bool]] = []
    partition_flags: List[Union[str, bool]] = []
    comments: List[Union[str, bool]] = []

    # Regular columns are listed first, partition keys (if any) afterwards.
    groups = (
        (details["StorageDescriptor"]["Columns"], False),
        (details.get("PartitionKeys", []), True),
    )
    for columns, is_partition in groups:
        for column in columns:
            names.append(column["Name"])
            dtypes.append(column["Type"])
            partition_flags.append(is_partition)
            # A missing comment is reported as an empty string.
            comments.append(column.get("Comment", ""))

    df_dict: Dict[str, List[Union[str, bool]]] = {
        "Column Name": names,
        "Type": dtypes,
        "Partition": partition_flags,
        "Comment": comments,
    }
    return pd.DataFrame(data=df_dict)
def get_table_types(
    database: str,
    table: str,
    transaction_id: Optional[str] = None,
    query_as_of_time: Optional[str] = None,
    catalog_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Optional[Dict[str, str]]:
    """Get all columns and types from a table.

    Note
    ----
    If reading from a governed table, pass only one of `transaction_id` or `query_as_of_time`.

    Parameters
    ----------
    database: str
        Database name.
    table: str
        Table name.
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    query_as_of_time: str, optional
        The time as of when to read the table contents. Must be a valid Unix epoch timestamp.
        Cannot be specified alongside transaction_id.
    catalog_id: str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session: boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Optional[Dict[str, str]]
        If table exists, a dictionary like {'col name': 'col data type'}. Otherwise None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.get_table_types(database='default', table='my_table')
    {'col0': 'int', 'col1': double}

    """
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    request: Dict[str, Any] = _catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(
            transaction_id=transaction_id, query_as_of_time=query_as_of_time, DatabaseName=database, Name=table
        ),
    )
    try:
        details: Dict[str, Any] = glue.get_table(**request)
    except glue.exceptions.EntityNotFoundException:
        # A missing table is reported as None rather than as an error.
        return None
    return _extract_dtypes_from_table_details(response=details)
def get_table_location(
    database: str,
    table: str,
    transaction_id: Optional[str] = None,
    query_as_of_time: Optional[str] = None,
    catalog_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Get table's location on Glue catalog.

    Note
    ----
    If reading from a governed table, pass only one of `transaction_id` or `query_as_of_time`.

    Parameters
    ----------
    database: str
        Database name.
    table: str
        Table name.
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    query_as_of_time: str, optional
        The time as of when to read the table contents. Must be a valid Unix epoch timestamp.
        Cannot be specified alongside transaction_id.
    catalog_id: str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session: boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    str
        Table's location.

    Raises
    ------
    exceptions.InvalidTable
        If the response has no ``Table.StorageDescriptor.Location`` entry.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.get_table_location(database='default', table='my_table')
    's3://bucket/prefix/'

    """
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    request: Dict[str, Any] = _catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(
            transaction_id=transaction_id, query_as_of_time=query_as_of_time, DatabaseName=database, Name=table
        ),
    )
    details: Dict[str, Any] = glue.get_table(**request)
    try:
        location = details["Table"]["StorageDescriptor"]["Location"]
    except KeyError as ex:
        raise exceptions.InvalidTable(f"{database}.{table}") from ex
    return cast(str, location)
def get_table_description(
    database: str,
    table: str,
    transaction_id: Optional[str] = None,
    query_as_of_time: Optional[str] = None,
    catalog_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Optional[str]:
    """Get table description.

    Note
    ----
    If reading from a governed table, pass only one of `transaction_id` or `query_as_of_time`.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    query_as_of_time: str, optional
        The time as of when to read the table contents. Must be a valid Unix epoch timestamp.
        Cannot be specified alongside transaction_id.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Optional[str]
        Description if exists.

    Examples
    --------
    >>> import awswrangler as wr
    >>> desc = wr.catalog.get_table_description(database="...", table="...")

    """
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    request: Dict[str, Any] = _catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(
            transaction_id=transaction_id, query_as_of_time=query_as_of_time, DatabaseName=database, Name=table
        ),
    )
    table_details: Dict[str, Any] = glue.get_table(**request)["Table"]
    # None is returned when the table carries no "Description" entry.
    description: Optional[str] = table_details.get("Description")
    return description
def delete_column(
    database: str,
    table: str,
    column_name: str,
    boto3_session: Optional[boto3.Session] = None,
    catalog_id: Optional[str] = None,
) -> None:
    """Delete a column in a AWS Glue Catalog table.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    column_name : str
        Column name
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.

    Returns
    -------
    None
        None

    Raises
    ------
    exceptions.ServiceApiError
        If the ``update_table`` response reports an error carrying an error code.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.delete_column(
    ...     database='my_db',
    ...     table='my_table',
    ...     column_name='my_col',
    ... )
    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    # Read the definition from the same catalog that is updated below; without
    # the catalog id the table would be looked up in the default catalog.
    table_res: Dict[str, Any] = client_glue.get_table(
        **_catalog_id(catalog_id=catalog_id, DatabaseName=database, Name=table)
    )
    table_input: Dict[str, Any] = _update_table_definition(table_res)
    # Keep every column except the one being deleted.
    table_input["StorageDescriptor"]["Columns"] = [
        column for column in table_input["StorageDescriptor"]["Columns"] if column["Name"] != column_name
    ]
    res: Dict[str, Any] = client_glue.update_table(
        **_catalog_id(catalog_id=catalog_id, DatabaseName=database, TableInput=table_input)
    )
    for error in res.get("Errors") or []:
        if "ErrorDetail" in error and "ErrorCode" in error["ErrorDetail"]:
            raise exceptions.ServiceApiError(str(res["Errors"]))
def delete_partitions(
    table: str,
    database: str,
    partitions_values: List[List[str]],
    catalog_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Delete specified partitions in a AWS Glue Catalog table.

    Parameters
    ----------
    table : str
        Table name.
    database : str
        Database name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    partitions_values : List[List[str]]
        List of lists of partitions values as strings.
        (e.g. [['2020', '10', '25'], ['2020', '11', '16'], ['2020', '12', '19']]).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.delete_partitions(
    ...     table='my_table',
    ...     database='awswrangler_test',
    ...     partitions_values=[['2020', '10', '25'], ['2020', '11', '16'], ['2020', '12', '19']]
    ... )
    """
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    batches: List[List[List[str]]] = _utils.chunkify(lst=partitions_values, max_length=25)
    for batch in batches:
        # One request per chunk, each partition identified by its values.
        to_delete = [{"Values": values} for values in batch]
        request = _catalog_id(
            catalog_id=catalog_id,
            DatabaseName=database,
            TableName=table,
            PartitionsToDelete=to_delete,
        )
        glue.batch_delete_partition(**request)
def delete_table_if_exists(
    database: str,
    table: str,
    transaction_id: Optional[str] = None,
    catalog_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> bool:
    """Delete Glue table if exists.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    bool
        True if deleted, otherwise False.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.delete_table_if_exists(database='default', table='my_table')  # deleted
    True
    >>> wr.catalog.delete_table_if_exists(database='default', table='my_table')  # Nothing to be deleted
    False

    """
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    try:
        # catalog_id deliberately travels through _transaction_id before reaching _catalog_id.
        transaction_kwargs = _transaction_id(
            transaction_id=transaction_id, DatabaseName=database, Name=table, catalog_id=catalog_id
        )
        glue.delete_table(**_catalog_id(**transaction_kwargs))
    except glue.exceptions.EntityNotFoundException:
        # Nothing to delete.
        return False
    else:
        return True
def _overwrite_table_parameters(
    parameters: Dict[str, str],
    database: str,
    catalog_versioning: bool,
    catalog_id: Optional[str],
    table_input: Dict[str, Any],
    boto3_session: Optional[boto3.Session],
) -> Dict[str, str]:
    """Replace the ``Parameters`` of ``table_input`` and push the table to Glue.

    ``table_input`` is modified in place. ``SkipArchive`` is sent as the negation
    of ``catalog_versioning``. The ``parameters`` argument is returned as-is.
    """
    table_input["Parameters"] = parameters
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    request: Dict[str, Any] = _catalog_id(
        catalog_id=catalog_id,
        DatabaseName=database,
        TableInput=table_input,
        SkipArchive=not catalog_versioning,
    )
    glue.update_table(**request)
    return parameters
def _get_table_objects(
    catalog_id: Optional[str],
    database: str,
    table: str,
    transaction_id: str,
    boto3_session: Optional[boto3.Session],
    partition_cols: Optional[List[str]] = None,
    partitions_types: Optional[Dict[str, str]] = None,
    partitions_values: Optional[List[str]] = None,
) -> List[Dict[str, Any]]:
    """Get Governed Table Objects from Lake Formation Engine.

    Pages through ``get_table_objects`` (each call wrapped in ``_utils.try_it``
    for ``ResourceNotReadyException``) and returns the objects as one flat list.
    When a group of objects has non-empty ``PartitionValues``, those values are
    copied onto each object of the group. A partition predicate is added to the
    request only when all three partition arguments are provided and non-empty.
    """
    lf_session: boto3.Session = _utils.ensure_session(session=boto3_session)
    lakeformation: boto3.client = _utils.client(service_name="lakeformation", session=lf_session)

    scan_kwargs: Dict[str, Union[str, int]] = _catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(transaction_id=transaction_id, DatabaseName=database, TableName=table, MaxResults=100),
    )
    if partition_cols and partitions_types and partitions_values:
        scan_kwargs["PartitionPredicate"] = _build_partition_predicate(
            partition_cols=partition_cols, partitions_types=partitions_types, partitions_values=partitions_values
        )

    collected: List[Dict[str, Any]] = []
    while True:
        response = _utils.try_it(
            f=lakeformation.get_table_objects,
            ex=botocore.exceptions.ClientError,
            ex_code="ResourceNotReadyException",
            base=1.0,
            max_num_tries=5,
            **scan_kwargs,
        )
        for group in response["Objects"]:
            for entry in group["Objects"]:
                if group["PartitionValues"]:
                    entry["PartitionValues"] = group["PartitionValues"]
                collected.append(entry)
        next_token = response.get("NextToken", None)
        scan_kwargs["NextToken"] = next_token
        # A missing or empty token means the last page has been read.
        if not next_token:
            break
    return collected
def get_connection(
    name: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
) -> Dict[str, Any]:
    """Get Glue connection details.

    If the connection properties contain ``ENCRYPTED_PASSWORD``, it is decrypted
    through KMS and the result is stored under ``PASSWORD`` in the same properties.

    Parameters
    ----------
    name : str
        Connection name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Dict[str, Any]
        API Response for:
        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue.html#Glue.Client.get_connection

    Examples
    --------
    >>> import awswrangler as wr
    >>> res = wr.catalog.get_connection(name='my_connection')

    """
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    response = _utils.try_it(
        f=glue.get_connection,
        ex=botocore.exceptions.ClientError,
        ex_code="ThrottlingException",
        max_num_tries=3,
        **_catalog_id(catalog_id=catalog_id, Name=name, HidePassword=False),
    )
    connection = response["Connection"]
    properties = connection["ConnectionProperties"]
    if "ENCRYPTED_PASSWORD" in properties:
        kms = _utils.client(service_name="kms", session=boto3_session)
        # The encrypted password is base64 text; KMS expects the raw bytes.
        ciphertext = base64.b64decode(properties["ENCRYPTED_PASSWORD"])
        plaintext = kms.decrypt(CiphertextBlob=ciphertext)["Plaintext"]
        properties["PASSWORD"] = plaintext.decode("utf-8")
    return cast(Dict[str, Any], connection)
def search_tables(
    text: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
) -> Iterator[Dict[str, Any]]:
    """Get an iterator of tables filtered by a search string.

    Note
    ----
    Search feature is not supported for Governed tables.

    Parameters
    ----------
    text : str
        Select only tables with the given string in table's properties.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Iterator[Dict[str, Any]]
        Iterator of tables.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df_tables = wr.catalog.search_tables(text='my_property')

    """
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    request: Dict[str, Any] = _catalog_id(catalog_id=catalog_id, SearchText=text)
    while True:
        page: Dict[str, Any] = glue.search_tables(**request)
        yield from page["TableList"]
        if "NextToken" not in page:
            break
        # More results are available: carry the token into the next request.
        request["NextToken"] = page["NextToken"]
def get_table_versions(
    database: str, table: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
) -> List[Dict[str, Any]]:
    """Get all versions.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    List[Dict[str, Any]]
        List of table inputs:
        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue.html#Glue.Client.get_table_versions

    Examples
    --------
    >>> import awswrangler as wr
    >>> tables_versions = wr.catalog.get_table_versions(database="...", table="...")

    """
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    pages = glue.get_paginator("get_table_versions").paginate(
        **_catalog_id(DatabaseName=database, TableName=table, catalog_id=catalog_id)
    )
    collected: List[Dict[str, Any]] = []
    for page in pages:
        # Keep the versions in the order the pages deliver them.
        collected.extend(page["TableVersions"])
    return collected
def get_columns_comments(
    database: str, table: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
) -> Dict[str, str]:
    """Get all columns comments.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Dict[str, str]
        Columns comments. e.g. {"col1": "foo boo bar"}.
        A column without a comment maps to None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> pars = wr.catalog.get_columns_comments(database="...", table="...")

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    response: Dict[str, Any] = client_glue.get_table(
        **_catalog_id(catalog_id=catalog_id, DatabaseName=database, Name=table)
    )
    comments: Dict[str, str] = {}
    # "Comment" may be absent from a column definition, so use .get() instead
    # of indexing, which would raise KeyError for uncommented columns.
    for c in response["Table"]["StorageDescriptor"]["Columns"]:
        comments[c["Name"]] = c.get("Comment")
    if "PartitionKeys" in response["Table"]:
        for p in response["Table"]["PartitionKeys"]:
            comments[p["Name"]] = p.get("Comment")
    return comments
def get_connection(
    name: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
) -> Dict[str, Any]:
    """Get Glue connection details.

    Parameters
    ----------
    name : str
        Connection name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Dict[str, Any]
        API Response for:
        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue.html#Glue.Client.get_connection

    Examples
    --------
    >>> import awswrangler as wr
    >>> res = wr.catalog.get_connection(name='my_connection')

    """
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    # HidePassword=False is part of the request, as in every call of this function.
    request = _catalog_id(catalog_id=catalog_id, Name=name, HidePassword=False)
    connection = glue.get_connection(**request)["Connection"]
    return cast(Dict[str, Any], connection)
def add_column(
    database: str,
    table: str,
    column_name: str,
    column_type: str = "string",
    column_comment: Optional[str] = None,
    transaction_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
    catalog_id: Optional[str] = None,
) -> None:
    """Add a column in a AWS Glue Catalog table.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    column_name : str
        Column name
    column_type : str
        Column type.
    column_comment : str, optional
        Column Comment. When None, the column is added without a comment.
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.

    Returns
    -------
    None
        None

    Raises
    ------
    exceptions.ServiceApiError
        If the ``update_table`` response reports an error carrying an error code.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.add_column(
    ...     database='my_db',
    ...     table='my_table',
    ...     column_name='my_col',
    ...     column_type='int'
    ... )
    """
    # Nothing is done unless the column type passes the check.
    if not _check_column_type(column_type):
        return

    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    table_res: Dict[str, Any] = client_glue.get_table(
        **_catalog_id(
            catalog_id=catalog_id,
            **_transaction_id(transaction_id=transaction_id, DatabaseName=database, Name=table),
        )
    )
    table_input: Dict[str, Any] = _update_table_definition(table_res)

    new_column: Dict[str, str] = {"Name": column_name, "Type": column_type}
    # The Comment field only accepts strings: sending None is rejected by the
    # client-side parameter validation, so the key is omitted when no comment
    # was supplied.
    if column_comment is not None:
        new_column["Comment"] = column_comment
    table_input["StorageDescriptor"]["Columns"].append(new_column)

    res: Dict[str, Any] = client_glue.update_table(
        **_catalog_id(
            catalog_id=catalog_id,
            **_transaction_id(transaction_id=transaction_id, DatabaseName=database, TableInput=table_input),
        )
    )
    for error in res.get("Errors") or []:
        if "ErrorDetail" in error and "ErrorCode" in error["ErrorDetail"]:
            raise exceptions.ServiceApiError(str(res["Errors"]))
def read_sql_query(
    sql: str,
    database: str,
    transaction_id: Optional[str] = None,
    query_as_of_time: Optional[str] = None,
    catalog_id: Optional[str] = None,
    categories: Optional[List[str]] = None,
    safe: bool = True,
    map_types: bool = True,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    params: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
    """Execute PartiQL query on AWS Glue Table (Transaction ID or time travel timestamp). Return Pandas DataFrame.

    Note
    ----
    ORDER BY operations are not honoured.
    i.e. sql="SELECT * FROM my_table ORDER BY my_column" is NOT valid

    Note
    ----
    The database must NOT be explicitly defined in the PartiQL statement.
    i.e. sql="SELECT * FROM my_table" is valid
    but sql="SELECT * FROM my_db.my_table" is NOT valid

    Note
    ----
    Pass one of `transaction_id` or `query_as_of_time`, not both.
    When neither is given, a read-only transaction is started for the query
    and committed once the result has been read.

    Parameters
    ----------
    sql : str
        partiQL query.
    database : str
        AWS Glue database name
    transaction_id : str, optional
        The ID of the transaction at which to read the table contents.
        Cannot be specified alongside query_as_of_time
    query_as_of_time : str, optional
        The time as of when to read the table contents. Must be a valid Unix epoch timestamp.
        Cannot be specified alongside transaction_id
    catalog_id : str, optional
        The ID of the Data Catalog that holds the database.
        If none is provided, the AWS account ID is used by default.
    categories: Optional[List[str]], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    safe : bool, default True
        For certain data types, a cast is needed in order to store the
        data in a pandas DataFrame or Series (e.g. timestamps are always
        stored as nanoseconds in pandas). This option controls whether it
        is a safe cast or not.
    map_types : bool, default True
        True to convert pyarrow DataTypes to pandas ExtensionDtypes. It is
        used to override the default pandas type for conversion of built-in
        pyarrow types or in absence of pandas_metadata in the Table schema.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        When enabled, os.cpu_count() is used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session is used if boto3_session receives None.
    params: Dict[str, any], optional
        Dict of parameters used to format the partiQL query. Only named parameters are supported.
        The dict must contain the information in the form {"name": "value"} and the SQL query must contain
        `:name;` (colon, name, trailing semicolon). Each placeholder is replaced by `str(value)`.

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df = wr.lakeformation.read_sql_query(
    ...     sql="SELECT * FROM my_table;",
    ...     database="my_db",
    ...     catalog_id="111111111111"
    ... )

    >>> import awswrangler as wr
    >>> df = wr.lakeformation.read_sql_query(
    ...     sql="SELECT * FROM my_table LIMIT 10;",
    ...     database="my_db",
    ...     transaction_id="1b62811fa3e02c4e5fdbaa642b752030379c4a8a70da1f8732ce6ccca47afdc9"
    ... )

    >>> import awswrangler as wr
    >>> df = wr.lakeformation.read_sql_query(
    ...     sql="SELECT * FROM my_table WHERE name=:name; AND city=:city;",
    ...     database="my_db",
    ...     query_as_of_time="1611142914",
    ...     params={"name": "'filtered_name'", "city": "'filtered_city'"}
    ... )

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client_lakeformation: boto3.client = _utils.client(service_name="lakeformation", session=session)
    commit_trans: bool = False

    # Plain textual substitution of every ":<key>;" placeholder.
    for key, value in (params or {}).items():
        sql = sql.replace(f":{key};", str(value))

    if not any([transaction_id, query_as_of_time]):
        _logger.debug("Neither `transaction_id` nor `query_as_of_time` were specified, starting transaction")
        transaction_id = start_transaction(read_only=True, boto3_session=session)
        commit_trans = True

    args: Dict[str, Optional[str]] = _catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(transaction_id=transaction_id, query_as_of_time=query_as_of_time, DatabaseName=database),
    )
    query_id: str = client_lakeformation.start_query_planning(QueryString=sql, QueryPlanningContext=args)["QueryId"]
    df = _resolve_sql_query(
        query_id=query_id,
        categories=categories,
        safe=safe,
        map_types=map_types,
        use_threads=use_threads,
        boto3_session=session,
    )
    if commit_trans:
        # Commit with the same session that started the transaction; otherwise
        # the commit would be issued through the default session/credentials.
        commit_transaction(transaction_id=transaction_id, boto3_session=session)  # type: ignore
    return df
def table(
    database: str,
    table: str,
    transaction_id: Optional[str] = None,
    query_as_of_time: Optional[str] = None,
    catalog_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> pd.DataFrame:
    """Describe the columns of a Glue table as a Pandas DataFrame.

    Note
    ----
    If reading from a governed table, pass only one of `transaction_id` or `query_as_of_time`.

    Parameters
    ----------
    database: str
        Database name.
    table: str
        Table name.
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    query_as_of_time: str, optional
        The time as of when to read the table contents. Must be a valid Unix epoch timestamp.
        Cannot be specified alongside transaction_id.
    catalog_id: str, optional
        The ID of the Data Catalog that holds the table.
        If none is provided, the AWS account ID is used by default.
    boto3_session: boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    pandas.DataFrame
        One row per column with the fields "Column Name", "Type", "Partition"
        and "Comment". Regular columns come first (Partition=False), followed
        by the partition keys (Partition=True). A missing comment is reported
        as an empty string.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df_table = wr.catalog.table(database='default', table='my_table')

    """
    glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    lookup = _transaction_id(
        transaction_id=transaction_id, query_as_of_time=query_as_of_time, DatabaseName=database, Name=table
    )
    tbl = glue.get_table(**_catalog_id(catalog_id=catalog_id, **lookup))["Table"]

    names: List[Union[str, bool]] = []
    types: List[Union[str, bool]] = []
    partition_flags: List[Union[str, bool]] = []
    comments: List[Union[str, bool]] = []

    # (column definitions, is-partition flag) — regular columns first, then partition keys.
    sections = (
        (tbl.get("StorageDescriptor", {}).get("Columns", {}), False),
        (tbl.get("PartitionKeys", []), True),
    )
    for definitions, is_partition in sections:
        for col in definitions:
            names.append(col["Name"])
            types.append(col["Type"])
            partition_flags.append(is_partition)
            comments.append(col.get("Comment", ""))

    df_dict: Dict[str, List[Union[str, bool]]] = {
        "Column Name": names,
        "Type": types,
        "Partition": partition_flags,
        "Comment": comments,
    }
    return pd.DataFrame(data=df_dict)
def get_tables(
    catalog_id: Optional[str] = None,
    database: Optional[str] = None,
    transaction_id: Optional[str] = None,
    name_contains: Optional[str] = None,
    name_prefix: Optional[str] = None,
    name_suffix: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Iterator[Dict[str, Any]]:
    """Get an iterator of tables.

    Note
    ----
    Please, do not filter using name_contains and name_prefix/name_suffix at the same time.
    Only name_prefix and name_suffix can be combined together.

    Note
    ----
    This is a generator: nothing is requested (and no argument error is raised)
    until the returned iterator is consumed.

    Parameters
    ----------
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve the tables.
        If none is provided, the AWS account ID is used by default.
    database : str, optional
        Database name. When omitted, every database returned by `get_databases` is scanned.
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    name_contains : str, optional
        Select by a specific string on table name
    name_prefix : str, optional
        Select by a specific prefix on table name
    name_suffix : str, optional
        Select by a specific suffix on table name
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Iterator[Dict[str, Any]]
        Iterator of tables.

    Raises
    ------
    exceptions.InvalidArgumentCombination
        If name_contains is combined with name_prefix and/or name_suffix.

    Examples
    --------
    >>> import awswrangler as wr
    >>> tables = wr.catalog.get_tables()

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    paginator = client_glue.get_paginator("get_tables")

    # name_contains is exclusive with prefix/suffix. Previously only the
    # three-way combination was rejected and a two-way combination silently
    # dropped one of the filters.
    if (name_contains is not None) and ((name_prefix is not None) or (name_suffix is not None)):
        raise exceptions.InvalidArgumentCombination(
            "Please, does not filter using name_contains and "
            "name_prefix/name_suffix at the same time. Only "
            "name_prefix and name_suffix can be combined together."
        )

    args: Dict[str, str] = {}
    if name_contains is not None:
        args["Expression"] = f"*{name_contains}*"
    elif (name_prefix is not None) and (name_suffix is not None):
        args["Expression"] = f"{name_prefix}*{name_suffix}"
    elif name_prefix is not None:
        args["Expression"] = f"{name_prefix}*"
    elif name_suffix is not None:
        args["Expression"] = f"*{name_suffix}"

    if database is not None:
        dbs: List[str] = [database]
    else:
        # List the databases with the caller's session, not the default one.
        dbs = [x["Name"] for x in get_databases(catalog_id=catalog_id, boto3_session=boto3_session)]

    for db in dbs:
        args["DatabaseName"] = db
        response_iterator = paginator.paginate(
            **_catalog_id(catalog_id=catalog_id, **_transaction_id(transaction_id=transaction_id, **args))
        )
        try:
            for page in response_iterator:
                yield from page["TableList"]
        except client_glue.exceptions.EntityNotFoundException:
            # Skip this database and move on to the next one.
            continue
def _create_table(  # pylint: disable=too-many-branches,too-many-statements
    database: str,
    table: str,
    description: Optional[str],
    parameters: Optional[Dict[str, str]],
    mode: str,
    catalog_versioning: bool,
    boto3_session: Optional[boto3.Session],
    table_input: Dict[str, Any],
    table_exist: bool,
    projection_enabled: bool,
    partitions_types: Optional[Dict[str, str]],
    columns_comments: Optional[Dict[str, str]],
    projection_types: Optional[Dict[str, str]],
    projection_ranges: Optional[Dict[str, str]],
    projection_values: Optional[Dict[str, str]],
    projection_intervals: Optional[Dict[str, str]],
    projection_digits: Optional[Dict[str, str]],
    catalog_id: Optional[str],
) -> None:
    """Finalize `table_input` and create or update the Glue table accordingly.

    `table_input` is modified in place: description, parameters, partition
    projection settings and column comments are merged into it through
    `_update_if_necessary`, whose return value replaces `mode` each time
    (presumably switching to "update" when something changed — confirm
    against that helper). The resulting `mode` and `table_exist` then decide
    the Glue call:

    - table exists, mode "overwrite": all partitions are deleted, then the table is updated;
    - table exists, mode "update": the table is updated;
    - table exists, mode "append"/"overwrite_partitions": nothing is sent;
    - table does not exist: the table is created; if Glue reports that it
      already exists and mode is "overwrite", `_overwrite_table` is run
      through `_utils.try_it`, otherwise the table is left untouched.

    Raises
    ------
    exceptions.InvalidArgumentCombination
        If a column has a projection type but is not listed in `partitions_types`.
    exceptions.InvalidArgument
        If the final mode is not one of the supported modes.
    """
    # Description
    mode = _update_if_necessary(dic=table_input, key="Description", value=description, mode=mode)

    # Parameters
    for k, v in (parameters or {}).items():
        mode = _update_if_necessary(dic=table_input["Parameters"], key=k, value=v, mode=mode)

    # Projection
    if projection_enabled is True:
        table_input["Parameters"]["projection.enabled"] = "true"
        partitions_types = partitions_types if partitions_types else {}
        projection_types = {sanitize_column_name(k): v for k, v in (projection_types or {}).items()}
        projection_ranges = {sanitize_column_name(k): v for k, v in (projection_ranges or {}).items()}
        projection_values = {sanitize_column_name(k): v for k, v in (projection_values or {}).items()}
        projection_intervals = {sanitize_column_name(k): v for k, v in (projection_intervals or {}).items()}
        projection_digits = {sanitize_column_name(k): v for k, v in (projection_digits or {}).items()}

        # Defaults derived from the partition dtype; explicit intervals given
        # by the caller are applied afterwards through the loops below.
        for k in projection_types:
            dtype: Optional[str] = partitions_types.get(k)
            if dtype is None:
                raise exceptions.InvalidArgumentCombination(
                    f"Column {k} appears as projected column but not as partitioned column."
                )
            if dtype == "date":
                table_input["Parameters"][f"projection.{k}.format"] = "yyyy-MM-dd"
            elif dtype == "timestamp":
                table_input["Parameters"][f"projection.{k}.format"] = "yyyy-MM-dd HH:mm:ss"
                table_input["Parameters"][f"projection.{k}.interval.unit"] = "SECONDS"
                table_input["Parameters"][f"projection.{k}.interval"] = "1"

        for k, v in projection_types.items():
            mode = _update_if_necessary(
                dic=table_input["Parameters"], key=f"projection.{k}.type", value=v, mode=mode
            )
        for k, v in projection_ranges.items():
            mode = _update_if_necessary(
                dic=table_input["Parameters"], key=f"projection.{k}.range", value=v, mode=mode
            )
        for k, v in projection_values.items():
            mode = _update_if_necessary(
                dic=table_input["Parameters"], key=f"projection.{k}.values", value=v, mode=mode
            )
        for k, v in projection_intervals.items():
            mode = _update_if_necessary(
                dic=table_input["Parameters"], key=f"projection.{k}.interval", value=str(v), mode=mode
            )
        for k, v in projection_digits.items():
            mode = _update_if_necessary(
                dic=table_input["Parameters"], key=f"projection.{k}.digits", value=str(v), mode=mode
            )
    else:
        table_input["Parameters"]["projection.enabled"] = "false"

    # Column comments
    columns_comments = {sanitize_column_name(k): v for k, v in (columns_comments or {}).items()}
    if columns_comments:
        for col in table_input["StorageDescriptor"]["Columns"]:
            name: str = col["Name"]
            if name in columns_comments:
                mode = _update_if_necessary(dic=col, key="Comment", value=columns_comments[name], mode=mode)
        for par in table_input["PartitionKeys"]:
            name = par["Name"]
            if name in columns_comments:
                mode = _update_if_necessary(dic=par, key="Comment", value=columns_comments[name], mode=mode)

    _logger.debug("table_input: %s", table_input)

    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client_glue: boto3.client = _utils.client(service_name="glue", session=session)
    skip_archive: bool = not catalog_versioning
    # Validated here (not at the top) because the helper calls above may have replaced `mode`.
    if mode not in ("overwrite", "append", "overwrite_partitions", "update"):
        raise exceptions.InvalidArgument(
            f"{mode} is not a valid mode. It must be 'overwrite', 'append', 'overwrite_partitions' or 'update'."
        )
    if table_exist is True and mode == "overwrite":
        delete_all_partitions(table=table, database=database, catalog_id=catalog_id, boto3_session=session)
        _logger.debug("Updating table (%s)...", mode)
        client_glue.update_table(
            **_catalog_id(
                catalog_id=catalog_id, DatabaseName=database, TableInput=table_input, SkipArchive=skip_archive
            )
        )
    elif (table_exist is True) and (mode in ("append", "overwrite_partitions", "update")):
        if mode == "update":
            _logger.debug("Updating table (%s)...", mode)
            client_glue.update_table(
                **_catalog_id(
                    catalog_id=catalog_id, DatabaseName=database, TableInput=table_input, SkipArchive=skip_archive
                )
            )
    elif table_exist is False:
        try:
            _logger.debug("Creating table (%s)...", mode)
            client_glue.create_table(
                **_catalog_id(catalog_id=catalog_id, DatabaseName=database, TableInput=table_input)
            )
        except client_glue.exceptions.AlreadyExistsException:
            # The table showed up between the caller's existence check and create_table.
            if mode == "overwrite":
                _utils.try_it(
                    f=_overwrite_table,
                    ex=client_glue.exceptions.AlreadyExistsException,
                    client_glue=client_glue,
                    catalog_id=catalog_id,
                    database=database,
                    table=table,
                    table_input=table_input,
                    boto3_session=boto3_session,
                )
            else:
                # Only report "left as is" when the table really was not touched.
                _logger.debug("Leaving table as is (%s)...", mode)