def _delete_objects(bucket: str, keys: List[str], boto3_session: boto3.Session, attempt: int = 1) -> None:
    """Delete *keys* from *bucket*, retrying keys that fail with ``InternalError``.

    S3 DeleteObjects can report per-key errors. Keys whose error code is
    ``InternalError`` are retried recursively with a linearly growing delay
    (1s, 2s, ... up to 5 attempts); any other error code aborts immediately
    with ``ServiceApiError``.
    """
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    _logger.debug("len(keys): %s", len(keys))
    response = client_s3.delete_objects(
        Bucket=bucket, Delete={"Objects": [{"Key": key} for key in keys]}
    )
    for entry in response.get("Deleted", []):
        _logger.debug("s3://%s/%s has been deleted.", bucket, entry.get("Key"))
    errors: List[Dict[str, Any]] = response.get("Errors", [])
    retry_keys: List[str] = []
    for err in errors:
        _logger.debug("error: %s", err)
        # Anything other than a transient InternalError is fatal.
        if err.get("Code") != "InternalError":
            raise exceptions.ServiceApiError(errors)
        retry_keys.append(_unquote_plus(err["Key"]))
    if retry_keys:
        if attempt > 5:  # Maximum of 5 attempts (Total of 15 seconds)
            raise exceptions.ServiceApiError(errors)
        time.sleep(attempt)  # Incremental delay (linear)
        _delete_objects(bucket=bucket, keys=retry_keys, boto3_session=boto3_session, attempt=attempt + 1)
def _add_partitions(
    database: str,
    table: str,
    boto3_session: Optional[boto3.Session],
    inputs: List[Dict[str, Any]],
    catalog_id: Optional[str] = None,
) -> None:
    """Register partition metadata in the Glue Catalog, 100 partitions per call.

    Per-partition errors whose code is ``AlreadyExistsException`` are
    tolerated (re-adding an existing partition is a no-op); the first error
    of any other kind raises ``ServiceApiError`` with the full error list.
    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    for batch in _utils.chunkify(lst=inputs, max_length=100):
        res: Dict[str, Any] = client_glue.batch_create_partition(
            **_catalog_id(catalog_id=catalog_id, DatabaseName=database, TableName=table, PartitionInputList=batch)
        )
        for error in res.get("Errors") or []:
            if (
                "ErrorDetail" in error
                and "ErrorCode" in error["ErrorDetail"]
                and error["ErrorDetail"]["ErrorCode"] != "AlreadyExistsException"
            ):
                raise exceptions.ServiceApiError(str(res["Errors"]))
def add_parquet_partitions(
    database: str,
    table: str,
    partitions_values: Dict[str, List[str]],
    compression: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
    catalog_id: Optional[str] = None,
) -> None:
    """Add partitions (metadata) to a Parquet Table in the AWS Glue Catalog.

    Delegates batching and error handling to ``_add_partitions``, so the call
    is idempotent (``AlreadyExistsException`` errors are tolerated) and
    supports an explicit Data Catalog ID.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    partitions_values: Dict[str, List[str]]
        Dictionary with keys as S3 path locations and values as a list of partitions values as str
        (e.g. {'s3://bucket/prefix/y=2020/m=10/': ['2020', '10']}).
    compression: str, optional
        Compression style (``None``, ``snappy``, ``gzip``, etc).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
    catalog_id : str, optional
        The ID of the Data Catalog in which to create the partitions.
        If none is provided, the AWS account ID is used by default.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.add_parquet_partitions(
    ...     database='default',
    ...     table='my_table',
    ...     partitions_values={
    ...         's3://bucket/prefix/y=2020/m=10/': ['2020', '10'],
    ...         's3://bucket/prefix/y=2020/m=11/': ['2020', '11'],
    ...         's3://bucket/prefix/y=2020/m=12/': ['2020', '12']
    ...     }
    ... )

    """
    inputs: List[Dict[str, Any]] = [
        _parquet_partition_definition(location=k, values=v, compression=compression)
        for k, v in partitions_values.items()
    ]
    # Reuse the shared helper instead of duplicating its chunking/error logic.
    _add_partitions(database=database, table=table, boto3_session=boto3_session, inputs=inputs, catalog_id=catalog_id)
def delete_column(
    database: str,
    table: str,
    column_name: str,
    boto3_session: Optional[boto3.Session] = None,
    catalog_id: Optional[str] = None,
) -> None:
    """Delete a column in a AWS Glue Catalog table.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    column_name : str
        Column name
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.

    Returns
    -------
    None
        None

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.delete_column(
    ...     database='my_db',
    ...     table='my_table',
    ...     column_name='my_col',
    ... )
    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    table_res: Dict[str, Any] = client_glue.get_table(DatabaseName=database, Name=table)
    table_input: Dict[str, Any] = _update_table_definition(table_res)
    # Drop the target column from the storage descriptor before updating.
    remaining = [col for col in table_input["StorageDescriptor"]["Columns"] if col["Name"] != column_name]
    table_input["StorageDescriptor"]["Columns"] = remaining
    res: Dict[str, Any] = client_glue.update_table(
        **_catalog_id(catalog_id=catalog_id, DatabaseName=database, TableInput=table_input)
    )
    for error in res.get("Errors") or []:
        if "ErrorDetail" in error and "ErrorCode" in error["ErrorDetail"]:
            raise exceptions.ServiceApiError(str(res["Errors"]))
def _delete_objects(bucket: str, keys: List[str], client_s3: boto3.client) -> None:
    """Delete *keys* from *bucket* with a single DeleteObjects call.

    NOTE(review): a function with the same name but a different signature
    appears earlier in this source — confirm the two live in different
    modules; otherwise this later definition shadows the earlier one.

    Raises ``ServiceApiError`` if the response carries an ``Errors`` entry.
    """
    _logger.debug("len(keys): %s", len(keys))
    objects: List[Dict[str, str]] = [{"Key": k} for k in keys]
    res = client_s3.delete_objects(Bucket=bucket, Delete={"Objects": objects})
    for obj in res.get("Deleted") or []:
        _logger.debug("s3://%s/%s has been deleted.", bucket, obj.get("Key"))
    errors = res.get("Errors")
    if errors is not None:  # pragma: no cover
        raise exceptions.ServiceApiError(errors)
def add_column(
    database: str,
    table: str,
    column_name: str,
    column_type: str = "string",
    column_comment: Optional[str] = None,
    transaction_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
    catalog_id: Optional[str] = None,
) -> None:
    """Add a column in a AWS Glue Catalog table.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    column_name : str
        Column name
    column_type : str
        Column type.
    column_comment : str
        Column Comment
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.

    Returns
    -------
    None
        None

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.add_column(
    ...     database='my_db',
    ...     table='my_table',
    ...     column_name='my_col',
    ...     column_type='int'
    ... )
    """
    # Guard clause: bail out unless the requested type passes validation.
    if not _check_column_type(column_type):
        return
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    table_res: Dict[str, Any] = client_glue.get_table(
        **_catalog_id(
            catalog_id=catalog_id,
            **_transaction_id(transaction_id=transaction_id, DatabaseName=database, Name=table),
        )
    )
    table_input: Dict[str, Any] = _update_table_definition(table_res)
    new_column = {"Name": column_name, "Type": column_type, "Comment": column_comment}
    table_input["StorageDescriptor"]["Columns"].append(new_column)
    res: Dict[str, Any] = client_glue.update_table(
        **_catalog_id(
            catalog_id=catalog_id,
            **_transaction_id(transaction_id=transaction_id, DatabaseName=database, TableInput=table_input),
        )
    )
    for error in res.get("Errors") or []:
        if "ErrorDetail" in error and "ErrorCode" in error["ErrorDetail"]:
            raise exceptions.ServiceApiError(str(res["Errors"]))