Beispiel #1
0
def _delete_objects(bucket: str,
                    keys: List[str],
                    boto3_session: boto3.Session,
                    attempt: int = 1) -> None:
    """Delete a batch of objects from one S3 bucket, retrying internal errors.

    Keys that S3 reports with the ``InternalError`` code are sent again after
    a linearly growing delay. Any other reported error aborts immediately.

    Parameters
    ----------
    bucket : str
        Bucket name.
    keys : List[str]
        Object keys to delete. An empty list results in no request at all.
    boto3_session : boto3.Session
        Session used to build the S3 client.
    attempt : int
        Number of the first attempt. It is also used as the delay (in seconds)
        slept before the following retry.

    Returns
    -------
    None
        None.

    Raises
    ------
    exceptions.ServiceApiError
        If S3 reports an error whose code is not ``InternalError``, or if keys
        still fail with ``InternalError`` once ``attempt`` is greater than 5.
    """
    # Built once and reused for every retry.
    client_s3: boto3.client = _utils.client(service_name="s3",
                                            session=boto3_session)
    pending: List[str] = keys
    # An empty "Objects" list is not a valid DeleteObjects request, so the
    # loop is skipped entirely when there is nothing to delete.
    while pending:
        _logger.debug("len(keys): %s", len(pending))
        batch: List[Dict[str, str]] = [{"Key": key} for key in pending]
        res = client_s3.delete_objects(Bucket=bucket,
                                       Delete={"Objects": batch})
        for obj in res.get("Deleted", []):
            _logger.debug("s3://%s/%s has been deleted.", bucket,
                          obj.get("Key"))
        errors: List[Dict[str, Any]] = res.get("Errors", [])
        internal_errors: List[str] = []
        for error in errors:
            _logger.debug("error: %s", error)
            if error.get("Code") != "InternalError":
                raise exceptions.ServiceApiError(errors)
            internal_errors.append(_unquote_plus(error["Key"]))
        if not internal_errors:
            return
        # Starting from attempt=1 this allows 5 retries after the first
        # request, i.e. 1 + 2 + 3 + 4 + 5 = 15 seconds of waiting in total.
        if attempt > 5:
            raise exceptions.ServiceApiError(errors)
        time.sleep(attempt)  # Incremental delay (linear)
        attempt += 1
        pending = internal_errors
Beispiel #2
0
def _add_partitions(
    database: str,
    table: str,
    boto3_session: Optional[boto3.Session],
    inputs: List[Dict[str, Any]],
    catalog_id: Optional[str] = None,
) -> None:
    """Create Glue partitions from ``inputs``, one request per chunk.

    ``inputs`` is split by ``_utils.chunkify`` with ``max_length=100`` and
    each chunk is submitted through ``batch_create_partition``. Reported
    errors carrying the ``AlreadyExistsException`` code are ignored; the first
    error with any other code raises ``exceptions.ServiceApiError`` with the
    string form of the whole error list. Errors without an
    ``ErrorDetail``/``ErrorCode`` entry are skipped.
    """
    batches: List[List[Dict[str, Any]]] = _utils.chunkify(lst=inputs,
                                                          max_length=100)
    glue: boto3.client = _utils.client(service_name="glue",
                                       session=boto3_session)
    for batch in batches:
        request: Dict[str, Any] = _catalog_id(catalog_id=catalog_id,
                                              DatabaseName=database,
                                              TableName=table,
                                              PartitionInputList=batch)
        response: Dict[str, Any] = glue.batch_create_partition(**request)
        failures = response.get("Errors")
        if not failures:
            continue
        for failure in failures:
            if "ErrorDetail" not in failure:
                continue
            detail = failure["ErrorDetail"]
            if "ErrorCode" not in detail:
                continue
            if detail["ErrorCode"] != "AlreadyExistsException":
                raise exceptions.ServiceApiError(str(response["Errors"]))
Beispiel #3
0
def add_parquet_partitions(
    database: str,
    table: str,
    partitions_values: Dict[str, List[str]],
    compression: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Register partitions (metadata only) for a Parquet table in the AWS Glue Catalog.

    Parameters
    ----------
    database : str
        Name of the database.
    table : str
        Name of the table.
    partitions_values: Dict[str, List[str]]
        Mapping from the S3 path of each partition to its list of partition values as str
        (e.g. {'s3://bucket/prefix/y=2020/m=10/': ['2020', '10']}).
    compression: str, optional
        Compression style (``None``, ``snappy``, ``gzip``, etc).
    boto3_session : boto3.Session(), optional
        Boto3 Session. When None is received the default boto3 session is used.

    Returns
    -------
    None
        None.

    Raises
    ------
    exceptions.ServiceApiError
        If a ``batch_create_partition`` response reports any error.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.add_parquet_partitions(
    ...     database='default',
    ...     table='my_table',
    ...     partitions_values={
    ...         's3://bucket/prefix/y=2020/m=10/': ['2020', '10'],
    ...         's3://bucket/prefix/y=2020/m=11/': ['2020', '11'],
    ...         's3://bucket/prefix/y=2020/m=12/': ['2020', '12']
    ...     }
    ... )

    """
    definitions: List[Dict[str, Any]] = [
        _parquet_partition_definition(location=location,
                                      values=values,
                                      compression=compression)
        for location, values in partitions_values.items()
    ]
    # One request is issued per chunk of at most 100 definitions.
    batches: List[List[Dict[str, Any]]] = _utils.chunkify(lst=definitions,
                                                          max_length=100)
    glue: boto3.client = _utils.client(service_name="glue",
                                       session=boto3_session)
    for batch in batches:
        response: Dict[str, Any] = glue.batch_create_partition(
            DatabaseName=database,
            TableName=table,
            PartitionInputList=batch,
        )
        failures = response.get("Errors")
        if failures:  # pragma: no cover
            raise exceptions.ServiceApiError(str(failures))
Beispiel #4
0
def delete_column(
    database: str,
    table: str,
    column_name: str,
    boto3_session: Optional[boto3.Session] = None,
    catalog_id: Optional[str] = None,
) -> None:
    """Delete a column in a AWS Glue Catalog table.

    The current table definition is fetched, every column named
    ``column_name`` is dropped from its storage descriptor, and the result is
    written back with ``update_table``.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    column_name : str
        Column name
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
    catalog_id : str, optional
        The ID of the Data Catalog that holds the table. It is applied to both
        the lookup and the update.
        If none is provided, the AWS account ID is used by default.

    Returns
    -------
    None
        None

    Raises
    ------
    exceptions.ServiceApiError
        If the ``update_table`` response reports an error carrying an error code.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.delete_column(
    ...     database='my_db',
    ...     table='my_table',
    ...     column_name='my_col',
    ... )
    """
    client_glue: boto3.client = _utils.client(service_name="glue",
                                              session=boto3_session)
    # Look the table up with the same catalog arguments used for the update
    # below, so the definition is read from and written to the same catalog.
    table_res: Dict[str, Any] = client_glue.get_table(**_catalog_id(
        catalog_id=catalog_id, DatabaseName=database, Name=table))
    table_input: Dict[str, Any] = _update_table_definition(table_res)
    storage_descriptor: Dict[str, Any] = table_input["StorageDescriptor"]
    storage_descriptor["Columns"] = [
        column for column in storage_descriptor["Columns"]
        if column["Name"] != column_name
    ]
    res: Dict[str, Any] = client_glue.update_table(**_catalog_id(
        catalog_id=catalog_id, DatabaseName=database, TableInput=table_input))
    errors = res.get("Errors")
    # Only errors that carry an ErrorDetail with an ErrorCode trigger a raise.
    if errors and any("ErrorDetail" in error
                      and "ErrorCode" in error["ErrorDetail"]
                      for error in errors):
        raise exceptions.ServiceApiError(str(errors))
Beispiel #5
0
def _delete_objects(bucket: str, keys: List[str],
                    client_s3: boto3.client) -> None:
    """Delete ``keys`` from ``bucket`` with a single ``delete_objects`` call.

    Every entry of the response's ``Deleted`` list is logged at debug level.
    If the response contains an ``Errors`` entry, it is raised wrapped in
    ``exceptions.ServiceApiError``.
    """
    _logger.debug("len(keys): %s", len(keys))
    response = client_s3.delete_objects(
        Bucket=bucket,
        Delete={"Objects": [{"Key": name} for name in keys]},
    )
    removed_entries = response.get("Deleted")
    if removed_entries is None:
        removed_entries = []
    for entry in removed_entries:
        _logger.debug("s3://%s/%s has been deleted.", bucket,
                      entry.get("Key"))
    failures = response.get("Errors")
    if failures is not None:  # pragma: no cover
        raise exceptions.ServiceApiError(failures)
Beispiel #6
0
def add_column(
    database: str,
    table: str,
    column_name: str,
    column_type: str = "string",
    column_comment: Optional[str] = None,
    transaction_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
    catalog_id: Optional[str] = None,
) -> None:
    """Add a column in a AWS Glue Catalog table.

    The current table definition is fetched, the new column is appended to
    its storage descriptor, and the result is written back with
    ``update_table``. Nothing is done when ``_check_column_type`` returns a
    falsy value for ``column_type``.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    column_name : str
        Column name
    column_type : str
        Column type.
    column_comment : str, optional
        Column Comment. When None, the column is created without a comment.
    transaction_id: str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.

    Returns
    -------
    None
        None

    Raises
    ------
    exceptions.ServiceApiError
        If the ``update_table`` response reports an error carrying an error code.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.add_column(
    ...     database='my_db',
    ...     table='my_table',
    ...     column_name='my_col',
    ...     column_type='int'
    ... )
    """
    if not _check_column_type(column_type):
        return
    client_glue: boto3.client = _utils.client(service_name="glue",
                                              session=boto3_session)
    table_res: Dict[str, Any] = client_glue.get_table(**_catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(transaction_id=transaction_id,
                          DatabaseName=database,
                          Name=table),
    ))
    table_input: Dict[str, Any] = _update_table_definition(table_res)
    new_column: Dict[str, str] = {"Name": column_name, "Type": column_type}
    # "Comment" must be a string in the request, so the key is left out
    # entirely when no comment was given instead of sending None.
    if column_comment is not None:
        new_column["Comment"] = column_comment
    table_input["StorageDescriptor"]["Columns"].append(new_column)
    res: Dict[str, Any] = client_glue.update_table(**_catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(transaction_id=transaction_id,
                          DatabaseName=database,
                          TableInput=table_input),
    ))
    errors = res.get("Errors")
    # Only errors that carry an ErrorDetail with an ErrorCode trigger a raise.
    if errors and any("ErrorDetail" in error
                      and "ErrorCode" in error["ErrorDetail"]
                      for error in errors):
        raise exceptions.ServiceApiError(str(errors))