Example #1
def delete_all_partitions(
        table: str,
        database: str,
        catalog_id: Optional[str] = None,
        boto3_session: Optional[boto3.Session] = None) -> List[List[str]]:
    """Delete all partitions in a AWS Glue Catalog table.

    Parameters
    ----------
    table : str
        Table name.
    database : str
        Database name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    List[List[str]]
        Partition values.

    Examples
    --------
    >>> import awswrangler as wr
    >>> partitions = wr.catalog.delete_all_partitions(
    ...     table='my_table',
    ...     database='awswrangler_test',
    ... )
    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    _logger.debug("Fetching existing partitions...")
    partitions_values: List[List[str]] = list(
        _get_partitions(database=database,
                        table=table,
                        boto3_session=session,
                        catalog_id=catalog_id).values())
    _logger.debug("Number of old partitions: %s", len(partitions_values))
    _logger.debug("Deleting existing partitions...")
    delete_partitions(
        table=table,
        database=database,
        catalog_id=catalog_id,
        partitions_values=partitions_values,
        boto3_session=boto3_session,
    )
    return partitions_values
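
For orientation, a minimal sketch of the intermediate shapes involved, assuming `_get_partitions` returns a dict that maps each partition's S3 location to its list of partition values (the bucket and table names below are placeholders):

# Illustration only: assumed shape of the _get_partitions result.
partitions = {
    "s3://bucket/table/year=2020/month=1/": ["2020", "1"],
    "s3://bucket/table/year=2020/month=2/": ["2020", "2"],
}
partitions_values = list(partitions.values())
# -> [["2020", "1"], ["2020", "2"]]: the value passed to delete_partitions
#    and returned to the caller.
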
Example #2
def _create_table(  # pylint: disable=too-many-branches,too-many-statements
    database: str,
    table: str,
    description: Optional[str],
    parameters: Optional[Dict[str, str]],
    mode: str,
    catalog_versioning: bool,
    boto3_session: Optional[boto3.Session],
    table_input: Dict[str, Any],
    table_exist: bool,
    projection_enabled: bool,
    partitions_types: Optional[Dict[str, str]],
    columns_comments: Optional[Dict[str, str]],
    projection_types: Optional[Dict[str, str]],
    projection_ranges: Optional[Dict[str, str]],
    projection_values: Optional[Dict[str, str]],
    projection_intervals: Optional[Dict[str, str]],
    projection_digits: Optional[Dict[str, str]],
    catalog_id: Optional[str],
) -> None:
    # Description
    mode = _update_if_necessary(dic=table_input, key="Description", value=description, mode=mode)

    # Parameters
    parameters = parameters if parameters else {}
    for k, v in parameters.items():
        mode = _update_if_necessary(dic=table_input["Parameters"], key=k, value=v, mode=mode)

    # Projection
    if projection_enabled is True:
        table_input["Parameters"]["projection.enabled"] = "true"
        partitions_types = partitions_types if partitions_types else {}
        projection_types = projection_types if projection_types else {}
        projection_ranges = projection_ranges if projection_ranges else {}
        projection_values = projection_values if projection_values else {}
        projection_intervals = projection_intervals if projection_intervals else {}
        projection_digits = projection_digits if projection_digits else {}
        projection_types = {sanitize_column_name(k): v for k, v in projection_types.items()}
        projection_ranges = {sanitize_column_name(k): v for k, v in projection_ranges.items()}
        projection_values = {sanitize_column_name(k): v for k, v in projection_values.items()}
        projection_intervals = {sanitize_column_name(k): v for k, v in projection_intervals.items()}
        projection_digits = {sanitize_column_name(k): v for k, v in projection_digits.items()}
        for k, v in projection_types.items():
            dtype: Optional[str] = partitions_types.get(k)
            if dtype is None:
                raise exceptions.InvalidArgumentCombination(
                    f"Column {k} appears as projected column but not as partitioned column."
                )
            if dtype == "date":
                table_input["Parameters"][f"projection.{k}.format"] = "yyyy-MM-dd"
            elif dtype == "timestamp":
                table_input["Parameters"][f"projection.{k}.format"] = "yyyy-MM-dd HH:mm:ss"
                table_input["Parameters"][f"projection.{k}.interval.unit"] = "SECONDS"
                table_input["Parameters"][f"projection.{k}.interval"] = "1"
        for k, v in projection_types.items():
            mode = _update_if_necessary(dic=table_input["Parameters"], key=f"projection.{k}.type", value=v, mode=mode)
        for k, v in projection_ranges.items():
            mode = _update_if_necessary(dic=table_input["Parameters"], key=f"projection.{k}.range", value=v, mode=mode)
        for k, v in projection_values.items():
            mode = _update_if_necessary(dic=table_input["Parameters"], key=f"projection.{k}.values", value=v, mode=mode)
        for k, v in projection_intervals.items():
            mode = _update_if_necessary(
                dic=table_input["Parameters"], key=f"projection.{k}.interval", value=str(v), mode=mode
            )
        for k, v in projection_digits.items():
            mode = _update_if_necessary(
                dic=table_input["Parameters"], key=f"projection.{k}.digits", value=str(v), mode=mode
            )
    else:
        table_input["Parameters"]["projection.enabled"] = "false"

    # Column comments
    columns_comments = columns_comments if columns_comments else {}
    columns_comments = {sanitize_column_name(k): v for k, v in columns_comments.items()}
    if columns_comments:
        for col in table_input["StorageDescriptor"]["Columns"]:
            name: str = col["Name"]
            if name in columns_comments:
                mode = _update_if_necessary(dic=col, key="Comment", value=columns_comments[name], mode=mode)
        for par in table_input["PartitionKeys"]:
            name = par["Name"]
            if name in columns_comments:
                mode = _update_if_necessary(dic=par, key="Comment", value=columns_comments[name], mode=mode)

    _logger.debug("table_input: %s", table_input)

    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client_glue: boto3.client = _utils.client(service_name="glue", session=session)
    skip_archive: bool = not catalog_versioning
    if mode not in ("overwrite", "append", "overwrite_partitions", "update"):
        raise exceptions.InvalidArgument(
            f"{mode} is not a valid mode. It must be 'overwrite', 'append' or 'overwrite_partitions'."
        )
    if table_exist is True and mode == "overwrite":
        _logger.debug("Fetching existing partitions...")
        partitions_values: List[List[str]] = list(
            _get_partitions(database=database, table=table, boto3_session=session, catalog_id=catalog_id).values()
        )
        _logger.debug("Number of old partitions: %s", len(partitions_values))
        _logger.debug("Deleting existing partitions...")
        client_glue.batch_delete_partition(
            **_catalog_id(
                catalog_id=catalog_id,
                DatabaseName=database,
                TableName=table,
                PartitionsToDelete=[{"Values": v} for v in partitions_values],
            )
        )
        _logger.debug("Updating table...")
        client_glue.update_table(
            **_catalog_id(
                catalog_id=catalog_id, DatabaseName=database, TableInput=table_input, SkipArchive=skip_archive
            )
        )
    elif (table_exist is True) and (mode in ("append", "overwrite_partitions", "update")):
        if mode == "update":
            client_glue.update_table(
                **_catalog_id(
                    catalog_id=catalog_id, DatabaseName=database, TableInput=table_input, SkipArchive=skip_archive
                )
            )
    elif table_exist is False:
        try:
            client_glue.create_table(
                **_catalog_id(catalog_id=catalog_id, DatabaseName=database, TableInput=table_input)
            )
        except client_glue.exceptions.AlreadyExistsException as ex:
            if mode == "overwrite":
                delete_table_if_exists(database=database, table=table, boto3_session=session, catalog_id=catalog_id)
                client_glue.create_table(
                    **_catalog_id(catalog_id=catalog_id, DatabaseName=database, TableInput=table_input)
                )
            else:
                raise ex
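
As a reference for the projection block above, a minimal sketch of the `table_input["Parameters"]` keys it writes; the `year` column, its projection type, and its range are assumptions chosen for illustration:

# Illustration only: resulting Glue table Parameters for a single projected
# partition column "year" with projection type "integer".
table_input = {"Parameters": {}}
projection_types = {"year": "integer"}
projection_ranges = {"year": "2015,2025"}

table_input["Parameters"]["projection.enabled"] = "true"
for k, v in projection_types.items():
    table_input["Parameters"][f"projection.{k}.type"] = v
for k, v in projection_ranges.items():
    table_input["Parameters"][f"projection.{k}.range"] = v

# table_input["Parameters"] ==
# {"projection.enabled": "true",
#  "projection.year.type": "integer",
#  "projection.year.range": "2015,2025"}
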
Example #3
def read_parquet_table(
    table: str,
    database: str,
    filename_suffix: Union[str, List[str], None] = None,
    filename_ignore_suffix: Union[str, List[str], None] = None,
    catalog_id: Optional[str] = None,
    partition_filter: Optional[Callable[[Dict[str, str]], bool]] = None,
    columns: Optional[List[str]] = None,
    validate_schema: bool = True,
    categories: Optional[List[str]] = None,
    safe: bool = True,
    map_types: bool = True,
    chunked: Union[bool, int] = False,
    use_threads: Union[bool, int] = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Read Apache Parquet table registered on AWS Glue Catalog.

    Note
    ----
    ``Batching`` (`chunked` argument) (Memory Friendly):

    Enables the function to return an Iterable of DataFrames instead of a regular DataFrame.

    There are two batching strategies in Wrangler:

    - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset.

    - If **chunked=INTEGER**, Wrangler will paginate through files, slicing and concatenating
      them to return DataFrames with the number of rows equal to the received INTEGER.

    `P.S.` `chunked=True` is faster and uses less memory, while `chunked=INTEGER` is more precise
    in the number of rows for each DataFrame.


    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    table : str
        AWS Glue Catalog table name.
    database : str
        AWS Glue Catalog database name.
    filename_suffix: Union[str, List[str], None]
        Suffix or List of suffixes to be read (e.g. [".gz.parquet", ".snappy.parquet"]).
        If None, will try to read all files. (default)
    filename_ignore_suffix: Union[str, List[str], None]
        Suffix or List of suffixes for S3 keys to be ignored (e.g. [".csv", "_SUCCESS"]).
        If None, will try to read all files. (default)
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    partition_filter: Optional[Callable[[Dict[str, str]], bool]]
        Callback Function filters to apply on PARTITION columns (PUSH-DOWN filter).
        This function MUST receive a single argument (Dict[str, str]) where keys are partition
        names and values are partition values. Partition values will always be strings extracted from S3.
        This function MUST return a bool, True to read the partition or False to ignore it.
        Ignored if `dataset=False`.
        E.g. ``lambda x: True if x["year"] == "2020" and x["month"] == "1" else False``
        https://aws-data-wrangler.readthedocs.io/en/2.13.0/tutorials/023%20-%20Flexible%20Partitions%20Filter.html
    columns : List[str], optional
        Names of columns to read from the file(s).
    validate_schema : bool, default True
        Check that individual file schemas are all the same / compatible. Schemas within a
        folder prefix should all be the same. Disable if you have differing schemas and
        want to skip this check.
    categories: Optional[List[str]], optional
        List of column names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    safe : bool, default True
        For certain data types, a cast is needed in order to store the
        data in a pandas DataFrame or Series (e.g. timestamps are always
        stored as nanoseconds in pandas). This option controls whether it
        is a safe cast or not.
    map_types : bool, default True
        True to convert pyarrow DataTypes to pandas ExtensionDtypes. It is
        used to override the default pandas type for conversion of built-in
        pyarrow types or in absence of pandas_metadata in the Table schema.
    chunked : Union[bool, int]
        If True, break the data into smaller DataFrames (non-deterministic number of rows).
        Otherwise return a single DataFrame with the whole data.
    use_threads : Union[bool, int]
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
        If integer is provided, specified number is used.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forwarded to botocore requests; only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered.

    Returns
    -------
    Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]]
        Pandas DataFrame or a Generator in case of `chunked=True`.

    Examples
    --------
    Reading Parquet Table

    >>> import awswrangler as wr
    >>> df = wr.s3.read_parquet_table(database='...', table='...')

    Reading an encrypted Parquet Table

    >>> import awswrangler as wr
    >>> df = wr.s3.read_parquet_table(
    ...     database='...',
    ...     table='...',
    ...     s3_additional_kwargs={
    ...         'ServerSideEncryption': 'aws:kms',
    ...         'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'
    ...     }
    ... )

    Reading Parquet Table in chunks (Chunk by file)

    >>> import awswrangler as wr
    >>> dfs = wr.s3.read_parquet_table(database='...', table='...', chunked=True)
    >>> for df in dfs:
    ...     print(df)  # Smaller Pandas DataFrame

    Reading Parquet Dataset with PUSH-DOWN filter over partitions

    >>> import awswrangler as wr
    >>> my_filter = lambda x: True if x["city"].startswith("new") else False
    >>> df = wr.s3.read_parquet_table(database='...', table='...', partition_filter=my_filter)

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    args: Dict[str, Any] = {"DatabaseName": database, "Name": table}
    if catalog_id is not None:
        args["CatalogId"] = catalog_id
    res: Dict[str, Any] = client_glue.get_table(**args)
    try:
        location: str = res["Table"]["StorageDescriptor"]["Location"]
        path: str = location if location.endswith("/") else f"{location}/"
    except KeyError as ex:
        raise exceptions.InvalidTable(f"Missing s3 location for {database}.{table}.") from ex
    path_root: Optional[str] = None
    paths: Union[str, List[str]] = path
    # If filter is available, fetch & filter out partitions
    # Then list objects & process individual object keys under path_root
    if partition_filter is not None:
        available_partitions_dict = _get_partitions(
            database=database,
            table=table,
            catalog_id=catalog_id,
            boto3_session=boto3_session,
        )
        available_partitions = list(available_partitions_dict.keys())
        if available_partitions:
            paths = []
            path_root = path
            partitions: Union[str, List[str]] = _apply_partition_filter(
                path_root=path_root, paths=available_partitions, filter_func=partition_filter
            )
            for partition in partitions:
                paths += _path2list(
                    path=partition,
                    boto3_session=boto3_session,
                    suffix=filename_suffix,
                    ignore_suffix=_get_path_ignore_suffix(path_ignore_suffix=filename_ignore_suffix),
                    s3_additional_kwargs=s3_additional_kwargs,
                )
    df = read_parquet(
        path=paths,
        path_root=path_root,
        path_suffix=filename_suffix if path_root is None else None,
        path_ignore_suffix=filename_ignore_suffix if path_root is None else None,
        columns=columns,
        validate_schema=validate_schema,
        categories=categories,
        safe=safe,
        map_types=map_types,
        chunked=chunked,
        dataset=True,
        use_threads=use_threads,
        boto3_session=boto3_session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    partial_cast_function = functools.partial(
        _data_types.cast_pandas_with_athena_types, dtype=_extract_partitions_dtypes_from_table_details(response=res)
    )

    if isinstance(df, pd.DataFrame):
        return partial_cast_function(df)

    # df is a generator, so map is needed for casting dtypes
    return map(partial_cast_function, df)
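
To close, a hedged usage sketch combining the push-down partition filter and integer chunking described in the docstring above; the database and table names are placeholders, not part of the original example:

import awswrangler as wr

# Partition values arrive as strings, so compare against strings.
def only_jan_2020(partition):
    return partition["year"] == "2020" and partition["month"] == "1"

# chunked=100_000 paginates the matched files and yields DataFrames of
# roughly 100,000 rows each instead of loading everything at once.
for df in wr.s3.read_parquet_table(
    database="my_database",        # placeholder
    table="my_table",              # placeholder
    partition_filter=only_jan_2020,
    chunked=100_000,
):
    print(df.shape)                # each chunk is a regular Pandas DataFrame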