Example #1
def add_parquet_partitions(
    database: str,
    table: str,
    partitions_values: Dict[str, List[str]],
    catalog_id: Optional[str] = None,
    compression: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Add partitions (metadata) to a Parquet Table in the AWS Glue Catalog.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    partitions_values : Dict[str, List[str]]
        Dictionary mapping S3 path locations to lists of partition values as strings
        (e.g. {'s3://bucket/prefix/y=2020/m=10/': ['2020', '10']}).
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    compression : str, optional
        Compression style (``None``, ``snappy``, ``gzip``, etc.).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.add_parquet_partitions(
    ...     database='default',
    ...     table='my_table',
    ...     partitions_values={
    ...         's3://bucket/prefix/y=2020/m=10/': ['2020', '10'],
    ...         's3://bucket/prefix/y=2020/m=11/': ['2020', '11'],
    ...         's3://bucket/prefix/y=2020/m=12/': ['2020', '12']
    ...     }
    ... )

    """
    table = sanitize_table_name(table=table)
    if partitions_values:
        inputs: List[Dict[str, Any]] = [
            _parquet_partition_definition(location=k,
                                          values=v,
                                          compression=compression)
            for k, v in partitions_values.items()
        ]
        _add_partitions(database=database,
                        table=table,
                        boto3_session=boto3_session,
                        inputs=inputs,
                        catalog_id=catalog_id)
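
The `_add_partitions` helper is not shown in this example. For context, below is a minimal sketch of the Glue call it presumably wraps; the function name and error handling are assumptions, while the boto3 `batch_create_partition` API and its 100-item `PartitionInputList` limit come from the Glue service itself.

import boto3
from typing import Any, Dict, List, Optional

def _add_partitions_sketch(
    database: str,
    table: str,
    inputs: List[Dict[str, Any]],
    boto3_session: Optional[boto3.Session] = None,
    catalog_id: Optional[str] = None,
) -> None:
    session = boto3_session if boto3_session is not None else boto3.Session()
    client = session.client("glue")
    for i in range(0, len(inputs), 100):  # BatchCreatePartition accepts at most 100 inputs per call
        args: Dict[str, Any] = {
            "DatabaseName": database,
            "TableName": table,
            "PartitionInputList": inputs[i : i + 100],
        }
        if catalog_id is not None:
            args["CatalogId"] = catalog_id
        response = client.batch_create_partition(**args)
        if response.get("Errors"):  # surface partial failures (e.g. AlreadyExistsException)
            raise RuntimeError(f"Failed to add partitions: {response['Errors']}")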
Example #2
def _create_csv_table(
    database: str,
    table: str,
    path: str,
    columns_types: Dict[str, str],
    partitions_types: Optional[Dict[str, str]],
    bucketing_info: Optional[Tuple[List[str], int]],
    description: Optional[str],
    compression: Optional[str],
    parameters: Optional[Dict[str, str]],
    columns_comments: Optional[Dict[str, str]],
    mode: str,
    catalog_versioning: bool,
    sep: str,
    skip_header_line_count: Optional[int],
    boto3_session: Optional[boto3.Session],
    projection_enabled: bool,
    projection_types: Optional[Dict[str, str]],
    projection_ranges: Optional[Dict[str, str]],
    projection_values: Optional[Dict[str, str]],
    projection_intervals: Optional[Dict[str, str]],
    projection_digits: Optional[Dict[str, str]],
    catalog_table_input: Optional[Dict[str, Any]],
    catalog_id: Optional[str],
) -> None:
    table = sanitize_table_name(table=table)
    partitions_types = {} if partitions_types is None else partitions_types
    _logger.debug("catalog_table_input: %s", catalog_table_input)
    table_input: Dict[str, Any]
    if (catalog_table_input
            is not None) and (mode in ("append", "overwrite_partitions")):
        table_input = catalog_table_input
        catalog_cols: Dict[str, str] = {
            x["Name"]: x["Type"]
            for x in table_input["StorageDescriptor"]["Columns"]
        }
        for c, t in columns_types.items():
            if c not in catalog_cols:
                _logger.debug("New column %s with type %s.", c, t)
                raise exceptions.InvalidArgumentValue(
                    f"Schema change detected - New column {c}. Schema evolution is not supported for CSV tables."
                )
    else:
        table_input = _csv_table_definition(
            table=table,
            path=path,
            columns_types=columns_types,
            partitions_types=partitions_types,
            bucketing_info=bucketing_info,
            compression=compression,
            sep=sep,
            skip_header_line_count=skip_header_line_count,
        )
    table_exist: bool = catalog_table_input is not None
    _logger.debug("table_exist: %s", table_exist)
    _create_table(
        database=database,
        table=table,
        description=description,
        parameters=parameters,
        columns_comments=columns_comments,
        mode=mode,
        catalog_versioning=catalog_versioning,
        boto3_session=boto3_session,
        table_input=table_input,
        table_exist=table_exist,
        partitions_types=partitions_types,
        projection_enabled=projection_enabled,
        projection_types=projection_types,
        projection_ranges=projection_ranges,
        projection_values=projection_values,
        projection_intervals=projection_intervals,
        projection_digits=projection_digits,
        catalog_id=catalog_id,
    )
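
In awswrangler this private helper is reached through the public `wr.catalog.create_csv_table` wrapper. A hedged usage sketch follows; the bucket path and column layout are placeholders. Note that, per the append branch above, adding a column that is not already in the catalog raises `InvalidArgumentValue`: schema evolution is not supported for CSV tables in this version.

import awswrangler as wr

# Register a partitioned, gzip-compressed CSV table in the Glue Catalog.
wr.catalog.create_csv_table(
    database="default",
    table="my_csv_table",
    path="s3://bucket/prefix/",  # placeholder location
    columns_types={"col0": "bigint", "col1": "double"},
    partitions_types={"y": "string", "m": "string"},
    compression="gzip",
    sep=",",
    skip_header_line_count=1,
    mode="overwrite",
)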
Example #3
def _create_parquet_table(
    database: str,
    table: str,
    path: str,
    columns_types: Dict[str, str],
    partitions_types: Optional[Dict[str, str]],
    bucketing_info: Optional[Tuple[List[str], int]],
    catalog_id: Optional[str],
    compression: Optional[str],
    description: Optional[str],
    parameters: Optional[Dict[str, str]],
    columns_comments: Optional[Dict[str, str]],
    mode: str,
    catalog_versioning: bool,
    projection_enabled: bool,
    projection_types: Optional[Dict[str, str]],
    projection_ranges: Optional[Dict[str, str]],
    projection_values: Optional[Dict[str, str]],
    projection_intervals: Optional[Dict[str, str]],
    projection_digits: Optional[Dict[str, str]],
    boto3_session: Optional[boto3.Session],
    catalog_table_input: Optional[Dict[str, Any]],
) -> None:
    table = sanitize_table_name(table=table)
    partitions_types = {} if partitions_types is None else partitions_types
    _logger.debug("catalog_table_input: %s", catalog_table_input)
    table_input: Dict[str, Any]
    if (catalog_table_input
            is not None) and (mode in ("append", "overwrite_partitions")):
        table_input = catalog_table_input
        catalog_cols: Dict[str, str] = {
            x["Name"]: x["Type"]
            for x in table_input["StorageDescriptor"]["Columns"]
        }
        for c, t in columns_types.items():
            if c not in catalog_cols:
                _logger.debug("New column %s with type %s.", c, t)
                table_input["StorageDescriptor"]["Columns"].append({
                    "Name": c,
                    "Type": t
                })
                mode = "update"
            elif t != catalog_cols[c]:  # Data type change detected!
                raise exceptions.InvalidArgumentValue(
                    f"Data type change detected on column {c} (Old type: {catalog_cols[c]} / New type {t})."
                )
    else:
        table_input = _parquet_table_definition(
            table=table,
            path=path,
            columns_types=columns_types,
            partitions_types=partitions_types,
            bucketing_info=bucketing_info,
            compression=compression,
        )
    table_exist: bool = catalog_table_input is not None
    _logger.debug("table_exist: %s", table_exist)
    _create_table(
        database=database,
        table=table,
        description=description,
        parameters=parameters,
        columns_comments=columns_comments,
        mode=mode,
        catalog_versioning=catalog_versioning,
        boto3_session=boto3_session,
        table_input=table_input,
        table_exist=table_exist,
        partitions_types=partitions_types,
        projection_enabled=projection_enabled,
        projection_types=projection_types,
        projection_ranges=projection_ranges,
        projection_values=projection_values,
        projection_intervals=projection_intervals,
        projection_digits=projection_digits,
        catalog_id=catalog_id,
    )
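
The append branch above is where Parquet schema evolution happens: a previously unseen column is appended to the catalog's `StorageDescriptor` and the call is downgraded to an in-place "update", while a type change on an existing column raises. A hedged sketch of how this surfaces through the public `wr.catalog.create_parquet_table` wrapper (table name, path, and columns are placeholders):

import awswrangler as wr

# Initial definition.
wr.catalog.create_parquet_table(
    database="default",
    table="my_parquet_table",
    path="s3://bucket/prefix/",
    columns_types={"col0": "bigint"},
)

# Appending with an extra column succeeds: the helper above adds col1 to the
# existing StorageDescriptor and rewrites the table definition in place.
wr.catalog.create_parquet_table(
    database="default",
    table="my_parquet_table",
    path="s3://bucket/prefix/",
    columns_types={"col0": "bigint", "col1": "double"},
    mode="append",
)

# Redefining col0 as, say, 'string' instead would raise InvalidArgumentValue.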
Example #4
def add_parquet_partitions(
    database: str,
    table: str,
    partitions_values: Dict[str, List[str]],
    bucketing_info: Optional[Tuple[List[str], int]] = None,
    catalog_id: Optional[str] = None,
    compression: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
    columns_types: Optional[Dict[str, str]] = None,
) -> None:
    """Add partitions (metadata) to a Parquet Table in the AWS Glue Catalog.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    partitions_values : Dict[str, List[str]]
        Dictionary mapping S3 path locations to lists of partition values as strings
        (e.g. {'s3://bucket/prefix/y=2020/m=10/': ['2020', '10']}).
    bucketing_info: Tuple[List[str], int], optional
        Tuple consisting of the column names used for bucketing as the first element and the number of buckets as the
        second element.
        Only `str`, `int` and `bool` are supported as column data types for bucketing.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    compression : str, optional
        Compression style (``None``, ``snappy``, ``gzip``, etc.).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    columns_types : Optional[Dict[str, str]]
        Only required for Hive compatibility.
        Dictionary with keys as column names and values as data types (e.g. {'col0': 'bigint', 'col1': 'double'}).
        Note that only materialized columns should be listed here, not partition columns.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.add_parquet_partitions(
    ...     database='default',
    ...     table='my_table',
    ...     partitions_values={
    ...         's3://bucket/prefix/y=2020/m=10/': ['2020', '10'],
    ...         's3://bucket/prefix/y=2020/m=11/': ['2020', '11'],
    ...         's3://bucket/prefix/y=2020/m=12/': ['2020', '12']
    ...     }
    ... )

    """
    table = sanitize_table_name(table=table)
    if partitions_values:
        inputs: List[Dict[str, Any]] = [
            _parquet_partition_definition(
                location=k,
                values=v,
                bucketing_info=bucketing_info,
                compression=compression,
                columns_types=columns_types,
            ) for k, v in partitions_values.items()
        ]
        _add_partitions(database=database,
                        table=table,
                        boto3_session=boto3_session,
                        inputs=inputs,
                        catalog_id=catalog_id)
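
Each entry in `inputs` is a Glue `PartitionInput` document. `_parquet_partition_definition` itself is not shown; the sketch below is an assumed shape, based on Glue's PartitionInput schema and the standard Hive Parquet format and SerDe classes.

from typing import Any, Dict, List, Optional

def _parquet_partition_definition_sketch(
    location: str,
    values: List[str],
    compression: Optional[str],
    columns_types: Optional[Dict[str, str]] = None,
) -> Dict[str, Any]:
    """Assumed shape of a Glue PartitionInput for a Parquet partition."""
    definition: Dict[str, Any] = {
        "Values": values,  # e.g. ['2020', '10']
        "StorageDescriptor": {
            "Location": location,  # e.g. 's3://bucket/prefix/y=2020/m=10/'
            "InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
            "OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
            "Compressed": compression is not None,
            "SerdeInfo": {
                "SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
            },
        },
    }
    if columns_types is not None:  # only needed for Hive compatibility
        definition["StorageDescriptor"]["Columns"] = [
            {"Name": name, "Type": dtype} for name, dtype in columns_types.items()
        ]
    return definition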
Example #5
def _create_json_table(  # pylint: disable=too-many-arguments
    database: str,
    table: str,
    path: str,
    columns_types: Dict[str, str],
    table_type: Optional[str],
    partitions_types: Optional[Dict[str, str]],
    bucketing_info: Optional[Tuple[List[str], int]],
    description: Optional[str],
    compression: Optional[str],
    parameters: Optional[Dict[str, str]],
    columns_comments: Optional[Dict[str, str]],
    mode: str,
    catalog_versioning: bool,
    schema_evolution: bool,
    transaction_id: Optional[str],
    serde_library: Optional[str],
    serde_parameters: Optional[Dict[str, str]],
    boto3_session: Optional[boto3.Session],
    projection_enabled: bool,
    projection_types: Optional[Dict[str, str]],
    projection_ranges: Optional[Dict[str, str]],
    projection_values: Optional[Dict[str, str]],
    projection_intervals: Optional[Dict[str, str]],
    projection_digits: Optional[Dict[str, str]],
    projection_storage_location_template: Optional[str],
    catalog_table_input: Optional[Dict[str, Any]],
    catalog_id: Optional[str],
) -> None:
    table = sanitize_table_name(table=table)
    partitions_types = {} if partitions_types is None else partitions_types
    _logger.debug("catalog_table_input: %s", catalog_table_input)
    table_input: Dict[str, Any]
    if schema_evolution is False:
        _utils.check_schema_changes(columns_types=columns_types, table_input=catalog_table_input, mode=mode)
    if (catalog_table_input is not None) and (mode in ("append", "overwrite_partitions")):
        table_input = catalog_table_input
    else:
        table_input = _json_table_definition(
            table=table,
            path=path,
            columns_types=columns_types,
            table_type=table_type,
            partitions_types=partitions_types,
            bucketing_info=bucketing_info,
            compression=compression,
            serde_library=serde_library,
            serde_parameters=serde_parameters,
        )
    table_exist: bool = catalog_table_input is not None
    _logger.debug("table_exist: %s", table_exist)
    _create_table(
        database=database,
        table=table,
        description=description,
        parameters=parameters,
        columns_comments=columns_comments,
        mode=mode,
        catalog_versioning=catalog_versioning,
        transaction_id=transaction_id,
        boto3_session=boto3_session,
        table_input=table_input,
        table_type=table_type,
        table_exist=table_exist,
        partitions_types=partitions_types,
        projection_enabled=projection_enabled,
        projection_types=projection_types,
        projection_ranges=projection_ranges,
        projection_values=projection_values,
        projection_intervals=projection_intervals,
        projection_digits=projection_digits,
        projection_storage_location_template=projection_storage_location_template,
        catalog_id=catalog_id,
    )
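
A hedged sketch of the public `wr.catalog.create_json_table` wrapper this helper presumably backs; the path and columns are placeholders, and the explicit SerDe shown is the OpenX JSON SerDe commonly used with Athena (whether it is the default here is an assumption):

import awswrangler as wr

wr.catalog.create_json_table(
    database="default",
    table="my_json_table",
    path="s3://bucket/prefix/",
    columns_types={"col0": "bigint", "col1": "string"},
    partitions_types={"y": "string"},
    serde_library="org.openx.data.jsonserde.JsonSerDe",  # assumption: shown explicitly, may be the default
    mode="overwrite",
)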
Example #6
def add_csv_partitions(
    database: str,
    table: str,
    partitions_values: Dict[str, List[str]],
    bucketing_info: Optional[Tuple[List[str], int]] = None,
    catalog_id: Optional[str] = None,
    compression: Optional[str] = None,
    sep: str = ",",
    serde_library: Optional[str] = None,
    serde_parameters: Optional[Dict[str, str]] = None,
    boto3_session: Optional[boto3.Session] = None,
    columns_types: Optional[Dict[str, str]] = None,
    partitions_parameters: Optional[Dict[str, str]] = None,
) -> None:
    r"""Add partitions (metadata) to a CSV Table in the AWS Glue Catalog.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    partitions_values : Dict[str, List[str]]
        Dictionary mapping S3 path locations to lists of partition values as strings
        (e.g. {'s3://bucket/prefix/y=2020/m=10/': ['2020', '10']}).
    bucketing_info: Tuple[List[str], int], optional
        Tuple consisting of the column names used for bucketing as the first element and the number of buckets as the
        second element.
        Only `str`, `int` and `bool` are supported as column data types for bucketing.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    compression : str, optional
        Compression style (``None``, ``gzip``, etc.).
    sep : str
        String of length 1. Field delimiter for the output file.
    serde_library : Optional[str]
        Specifies the SerDe serialization library to use, given as the fully qualified class name
        (a string).
        If no library is provided the default is `org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe`.
    serde_parameters : Optional[Dict[str, str]]
        Dictionary of initialization parameters for the SerDe.
        The default is `{"field.delim": sep, "escape.delim": "\\"}`.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    columns_types : Optional[Dict[str, str]]
        Only required for Hive compatibility.
        Dictionary with keys as column names and values as data types (e.g. {'col0': 'bigint', 'col1': 'double'}).
        Note that only materialized columns should be listed here, not partition columns.
    partitions_parameters : Optional[Dict[str, str]]
        Dictionary with key-value pairs defining partition parameters.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.add_csv_partitions(
    ...     database='default',
    ...     table='my_table',
    ...     partitions_values={
    ...         's3://bucket/prefix/y=2020/m=10/': ['2020', '10'],
    ...         's3://bucket/prefix/y=2020/m=11/': ['2020', '11'],
    ...         's3://bucket/prefix/y=2020/m=12/': ['2020', '12']
    ...     }
    ... )

    """
    table = sanitize_table_name(table=table)
    inputs: List[Dict[str, Any]] = [
        _csv_partition_definition(
            location=k,
            values=v,
            bucketing_info=bucketing_info,
            compression=compression,
            sep=sep,
            columns_types=columns_types,
            serde_library=serde_library,
            serde_parameters=serde_parameters,
            partitions_parameters=partitions_parameters,
        ) for k, v in partitions_values.items()
    ]
    _add_partitions(database=database,
                    table=table,
                    boto3_session=boto3_session,
                    inputs=inputs,
                    catalog_id=catalog_id)
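
As the docstring notes, partition definitions default to LazySimpleSerDe. A hedged sketch of registering partitions with Athena's OpenCSVSerde instead (the bucket path is a placeholder):

import awswrangler as wr

wr.catalog.add_csv_partitions(
    database="default",
    table="my_table",
    partitions_values={"s3://bucket/prefix/y=2020/m=10/": ["2020", "10"]},
    serde_library="org.apache.hadoop.hive.serde2.OpenCSVSerde",
    serde_parameters={"separatorChar": ",", "quoteChar": '"', "escapeChar": "\\"},
)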