def _create_csv_table(
    database: str,
    table: str,
    path: str,
    columns_types: Dict[str, str],
    partitions_types: Optional[Dict[str, str]],
    bucketing_info: Optional[Tuple[List[str], int]],
    description: Optional[str],
    compression: Optional[str],
    parameters: Optional[Dict[str, str]],
    columns_comments: Optional[Dict[str, str]],
    mode: str,
    catalog_versioning: bool,
    sep: str,
    skip_header_line_count: Optional[int],
    boto3_session: Optional[boto3.Session],
    projection_enabled: bool,
    projection_types: Optional[Dict[str, str]],
    projection_ranges: Optional[Dict[str, str]],
    projection_values: Optional[Dict[str, str]],
    projection_intervals: Optional[Dict[str, str]],
    projection_digits: Optional[Dict[str, str]],
    catalog_table_input: Optional[Dict[str, Any]],
    catalog_id: Optional[str],
) -> None:
    table = sanitize_table_name(table=table)
    partitions_types = {} if partitions_types is None else partitions_types
    _logger.debug("catalog_table_input: %s", catalog_table_input)
    table_input: Dict[str, Any]
    if (catalog_table_input is not None) and (mode in ("append", "overwrite_partitions")):
        # Reuse the existing catalog entry. CSV tables do not support schema
        # evolution, so any column missing from the catalog is an error.
        table_input = catalog_table_input
        catalog_cols: Dict[str, str] = {x["Name"]: x["Type"] for x in table_input["StorageDescriptor"]["Columns"]}
        for c, t in columns_types.items():
            if c not in catalog_cols:
                _logger.debug("New column %s with type %s.", c, t)
                raise exceptions.InvalidArgumentValue(
                    f"Schema change detected - New column {c}. Schema evolution is not supported for CSV tables."
                )
    else:
        table_input = _csv_table_definition(
            table=table,
            path=path,
            columns_types=columns_types,
            partitions_types=partitions_types,
            bucketing_info=bucketing_info,
            compression=compression,
            sep=sep,
            skip_header_line_count=skip_header_line_count,
        )
    table_exist: bool = catalog_table_input is not None
    _logger.debug("table_exist: %s", table_exist)
    _create_table(
        database=database,
        table=table,
        description=description,
        parameters=parameters,
        columns_comments=columns_comments,
        mode=mode,
        catalog_versioning=catalog_versioning,
        boto3_session=boto3_session,
        table_input=table_input,
        table_exist=table_exist,
        partitions_types=partitions_types,
        projection_enabled=projection_enabled,
        projection_types=projection_types,
        projection_ranges=projection_ranges,
        projection_values=projection_values,
        projection_intervals=projection_intervals,
        projection_digits=projection_digits,
        catalog_id=catalog_id,
    )
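# Hedged usage sketch (not part of the library): because `_create_csv_table`
# rejects schema evolution, appending a schema with a new column raises
# `InvalidArgumentValue`. It assumes the public wrapper
# `wr.catalog.create_csv_table` with the parameter subset shown; adapt to the
# installed version.
def _example_csv_schema_change_rejected() -> None:
    import awswrangler as wr

    wr.catalog.create_csv_table(
        database="default",
        table="my_csv_table",
        path="s3://bucket/prefix/",
        columns_types={"col0": "bigint", "col1": "double"},
        mode="overwrite",
    )
    try:
        # Appending with an extra column trips the schema-change guard above.
        wr.catalog.create_csv_table(
            database="default",
            table="my_csv_table",
            path="s3://bucket/prefix/",
            columns_types={"col0": "bigint", "col1": "double", "new_col": "string"},
            mode="append",
        )
    except wr.exceptions.InvalidArgumentValue as ex:
        print(f"Rejected as expected: {ex}")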
def _create_parquet_table(
    database: str,
    table: str,
    path: str,
    columns_types: Dict[str, str],
    partitions_types: Optional[Dict[str, str]],
    bucketing_info: Optional[Tuple[List[str], int]],
    catalog_id: Optional[str],
    compression: Optional[str],
    description: Optional[str],
    parameters: Optional[Dict[str, str]],
    columns_comments: Optional[Dict[str, str]],
    mode: str,
    catalog_versioning: bool,
    projection_enabled: bool,
    projection_types: Optional[Dict[str, str]],
    projection_ranges: Optional[Dict[str, str]],
    projection_values: Optional[Dict[str, str]],
    projection_intervals: Optional[Dict[str, str]],
    projection_digits: Optional[Dict[str, str]],
    boto3_session: Optional[boto3.Session],
    catalog_table_input: Optional[Dict[str, Any]],
) -> None:
    table = sanitize_table_name(table=table)
    partitions_types = {} if partitions_types is None else partitions_types
    _logger.debug("catalog_table_input: %s", catalog_table_input)
    table_input: Dict[str, Any]
    if (catalog_table_input is not None) and (mode in ("append", "overwrite_partitions")):
        table_input = catalog_table_input
        catalog_cols: Dict[str, str] = {x["Name"]: x["Type"] for x in table_input["StorageDescriptor"]["Columns"]}
        for c, t in columns_types.items():
            if c not in catalog_cols:
                # Schema evolution: append the new column to the catalog
                # definition and force a table update.
                _logger.debug("New column %s with type %s.", c, t)
                table_input["StorageDescriptor"]["Columns"].append({"Name": c, "Type": t})
                mode = "update"
            elif t != catalog_cols[c]:  # Data type change detected!
                raise exceptions.InvalidArgumentValue(
                    f"Data type change detected on column {c} (Old type: {catalog_cols[c]} / New type {t})."
                )
    else:
        table_input = _parquet_table_definition(
            table=table,
            path=path,
            columns_types=columns_types,
            partitions_types=partitions_types,
            bucketing_info=bucketing_info,
            compression=compression,
        )
    table_exist: bool = catalog_table_input is not None
    _logger.debug("table_exist: %s", table_exist)
    _create_table(
        database=database,
        table=table,
        description=description,
        parameters=parameters,
        columns_comments=columns_comments,
        mode=mode,
        catalog_versioning=catalog_versioning,
        boto3_session=boto3_session,
        table_input=table_input,
        table_exist=table_exist,
        partitions_types=partitions_types,
        projection_enabled=projection_enabled,
        projection_types=projection_types,
        projection_ranges=projection_ranges,
        projection_values=projection_values,
        projection_intervals=projection_intervals,
        projection_digits=projection_digits,
        catalog_id=catalog_id,
    )
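# Hedged sketch: unlike the CSV path, the Parquet branch above *does* evolve
# the schema, appending any new column to the catalog definition, while a type
# change on an existing column raises. The public wrapper
# `wr.catalog.create_parquet_table` and the parameter subset shown are
# assumptions based on this module.
def _example_parquet_schema_evolution() -> None:
    import awswrangler as wr

    wr.catalog.create_parquet_table(
        database="default",
        table="my_parquet_table",
        path="s3://bucket/prefix/",
        columns_types={"col0": "bigint"},
        mode="overwrite",
    )
    # Appending with an extra column updates the catalog entry in place
    # (mode flips to "update" internally).
    wr.catalog.create_parquet_table(
        database="default",
        table="my_parquet_table",
        path="s3://bucket/prefix/",
        columns_types={"col0": "bigint", "col1": "string"},
        mode="append",
    )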
def add_parquet_partitions(
    database: str,
    table: str,
    partitions_values: Dict[str, List[str]],
    bucketing_info: Optional[Tuple[List[str], int]] = None,
    catalog_id: Optional[str] = None,
    compression: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
    columns_types: Optional[Dict[str, str]] = None,
) -> None:
    """Add partitions (metadata) to a Parquet Table in the AWS Glue Catalog.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    partitions_values: Dict[str, List[str]]
        Dictionary with keys as S3 path locations and values as a list of partition values as str
        (e.g. {'s3://bucket/prefix/y=2020/m=10/': ['2020', '10']}).
    bucketing_info: Tuple[List[str], int], optional
        Tuple consisting of the column names used for bucketing as the first element and the
        number of buckets as the second element.
        Only `str`, `int` and `bool` are supported as column data types for bucketing.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    compression: str, optional
        Compression style (``None``, ``snappy``, ``gzip``, etc).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    columns_types: Optional[Dict[str, str]]
        Only required for Hive compatibility.
        Dictionary with keys as column names and values as data types
        (e.g. {'col0': 'bigint', 'col1': 'double'}).
        P.S. Only materialized columns please, not partition columns.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.add_parquet_partitions(
    ...     database='default',
    ...     table='my_table',
    ...     partitions_values={
    ...         's3://bucket/prefix/y=2020/m=10/': ['2020', '10'],
    ...         's3://bucket/prefix/y=2020/m=11/': ['2020', '11'],
    ...         's3://bucket/prefix/y=2020/m=12/': ['2020', '12']
    ...     }
    ... )

    """
    table = sanitize_table_name(table=table)
    if partitions_values:
        inputs: List[Dict[str, Any]] = [
            _parquet_partition_definition(
                location=k,
                values=v,
                bucketing_info=bucketing_info,
                compression=compression,
                columns_types=columns_types,
            )
            for k, v in partitions_values.items()
        ]
        _add_partitions(database=database, table=table, boto3_session=boto3_session, inputs=inputs, catalog_id=catalog_id)
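# Hedged sketch: passing `columns_types` makes each partition definition carry
# the materialized (non-partition) columns, which the docstring notes is only
# needed for Hive compatibility. All parameters come from the signature above;
# the bucket, path, and types are illustrative only.
def _example_add_parquet_partitions_hive() -> None:
    import awswrangler as wr

    wr.catalog.add_parquet_partitions(
        database="default",
        table="my_table",
        partitions_values={"s3://bucket/prefix/y=2021/m=01/": ["2021", "01"]},
        compression="snappy",
        columns_types={"col0": "bigint", "col1": "double"},  # materialized columns only
    )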
def _create_json_table(  # pylint: disable=too-many-arguments
    database: str,
    table: str,
    path: str,
    columns_types: Dict[str, str],
    table_type: Optional[str],
    partitions_types: Optional[Dict[str, str]],
    bucketing_info: Optional[Tuple[List[str], int]],
    description: Optional[str],
    compression: Optional[str],
    parameters: Optional[Dict[str, str]],
    columns_comments: Optional[Dict[str, str]],
    mode: str,
    catalog_versioning: bool,
    schema_evolution: bool,
    transaction_id: Optional[str],
    serde_library: Optional[str],
    serde_parameters: Optional[Dict[str, str]],
    boto3_session: Optional[boto3.Session],
    projection_enabled: bool,
    projection_types: Optional[Dict[str, str]],
    projection_ranges: Optional[Dict[str, str]],
    projection_values: Optional[Dict[str, str]],
    projection_intervals: Optional[Dict[str, str]],
    projection_digits: Optional[Dict[str, str]],
    projection_storage_location_template: Optional[str],
    catalog_table_input: Optional[Dict[str, Any]],
    catalog_id: Optional[str],
) -> None:
    table = sanitize_table_name(table=table)
    partitions_types = {} if partitions_types is None else partitions_types
    _logger.debug("catalog_table_input: %s", catalog_table_input)
    table_input: Dict[str, Any]
    if schema_evolution is False:
        _utils.check_schema_changes(columns_types=columns_types, table_input=catalog_table_input, mode=mode)
    if (catalog_table_input is not None) and (mode in ("append", "overwrite_partitions")):
        table_input = catalog_table_input
    else:
        table_input = _json_table_definition(
            table=table,
            path=path,
            columns_types=columns_types,
            table_type=table_type,
            partitions_types=partitions_types,
            bucketing_info=bucketing_info,
            compression=compression,
            serde_library=serde_library,
            serde_parameters=serde_parameters,
        )
    table_exist: bool = catalog_table_input is not None
    _logger.debug("table_exist: %s", table_exist)
    _create_table(
        database=database,
        table=table,
        description=description,
        parameters=parameters,
        columns_comments=columns_comments,
        mode=mode,
        catalog_versioning=catalog_versioning,
        transaction_id=transaction_id,
        boto3_session=boto3_session,
        table_input=table_input,
        table_type=table_type,
        table_exist=table_exist,
        partitions_types=partitions_types,
        projection_enabled=projection_enabled,
        projection_types=projection_types,
        projection_ranges=projection_ranges,
        projection_values=projection_values,
        projection_intervals=projection_intervals,
        projection_digits=projection_digits,
        projection_storage_location_template=projection_storage_location_template,
        catalog_id=catalog_id,
    )
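# Hedged sketch: with `schema_evolution=False` the helper above validates the
# incoming columns against the existing catalog entry (via
# `_utils.check_schema_changes`) before reusing it. It assumes the public
# wrapper `wr.catalog.create_json_table` exposes a matching `schema_evolution`
# flag; verify against the installed version.
def _example_json_table_no_evolution() -> None:
    import awswrangler as wr

    wr.catalog.create_json_table(
        database="default",
        table="my_json_table",
        path="s3://bucket/prefix/",
        columns_types={"col0": "bigint", "col1": "string"},
        mode="append",
        schema_evolution=False,  # raise instead of silently changing the schema
    )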
def add_csv_partitions(
    database: str,
    table: str,
    partitions_values: Dict[str, List[str]],
    bucketing_info: Optional[Tuple[List[str], int]] = None,
    catalog_id: Optional[str] = None,
    compression: Optional[str] = None,
    sep: str = ",",
    serde_library: Optional[str] = None,
    serde_parameters: Optional[Dict[str, str]] = None,
    boto3_session: Optional[boto3.Session] = None,
    columns_types: Optional[Dict[str, str]] = None,
    partitions_parameters: Optional[Dict[str, str]] = None,
) -> None:
    r"""Add partitions (metadata) to a CSV Table in the AWS Glue Catalog.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    partitions_values: Dict[str, List[str]]
        Dictionary with keys as S3 path locations and values as a list of partition values as str
        (e.g. {'s3://bucket/prefix/y=2020/m=10/': ['2020', '10']}).
    bucketing_info: Tuple[List[str], int], optional
        Tuple consisting of the column names used for bucketing as the first element and the
        number of buckets as the second element.
        Only `str`, `int` and `bool` are supported as column data types for bucketing.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    compression: str, optional
        Compression style (``None``, ``gzip``, etc).
    sep : str
        String of length 1. Field delimiter for the output file.
    serde_library : Optional[str]
        Specifies the SerDe serialization library to be used. You need to provide the class
        library name as a string.
        If no library is provided the default is `org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe`.
    serde_parameters : Optional[Dict[str, str]]
        Dictionary of initialization parameters for the SerDe.
        The default is `{"field.delim": sep, "escape.delim": "\\"}`.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    columns_types: Optional[Dict[str, str]]
        Only required for Hive compatibility.
        Dictionary with keys as column names and values as data types
        (e.g. {'col0': 'bigint', 'col1': 'double'}).
        P.S. Only materialized columns please, not partition columns.
    partitions_parameters: Optional[Dict[str, str]]
        Dictionary with key-value pairs defining partition parameters.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.add_csv_partitions(
    ...     database='default',
    ...     table='my_table',
    ...     partitions_values={
    ...         's3://bucket/prefix/y=2020/m=10/': ['2020', '10'],
    ...         's3://bucket/prefix/y=2020/m=11/': ['2020', '11'],
    ...         's3://bucket/prefix/y=2020/m=12/': ['2020', '12']
    ...     }
    ... )

    """
    table = sanitize_table_name(table=table)
    inputs: List[Dict[str, Any]] = [
        _csv_partition_definition(
            location=k,
            values=v,
            bucketing_info=bucketing_info,
            compression=compression,
            sep=sep,
            columns_types=columns_types,
            serde_library=serde_library,
            serde_parameters=serde_parameters,
            partitions_parameters=partitions_parameters,
        )
        for k, v in partitions_values.items()
    ]
    _add_partitions(database=database, table=table, boto3_session=boto3_session, inputs=inputs, catalog_id=catalog_id)
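# Hedged sketch: registering partitions written as quoted CSV with Hive's
# OpenCSVSerde instead of the LazySimpleSerDe default noted in the docstring.
# The SerDe class and its separatorChar/quoteChar parameters are standard Hive
# names; the bucket, path, and values are illustrative only.
def _example_add_csv_partitions_opencsv() -> None:
    import awswrangler as wr

    wr.catalog.add_csv_partitions(
        database="default",
        table="my_csv_table",
        partitions_values={"s3://bucket/prefix/y=2021/m=01/": ["2021", "01"]},
        serde_library="org.apache.hadoop.hive.serde2.OpenCSVSerde",
        serde_parameters={"separatorChar": ",", "quoteChar": '"'},
    )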