def overwrite_table_parameters( parameters: Dict[str, str], database: str, table: str, transaction_id: Optional[str] = None, catalog_versioning: bool = False, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ) -> Dict[str, str]: """Overwrite all existing parameters. Parameters ---------- parameters : Dict[str, str] e.g. {"source": "mysql", "destination": "datalake"} database : str Database name. table : str Table name. transaction_id: str, optional The ID of the transaction (i.e. used with GOVERNED tables). catalog_versioning : bool If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. Returns ------- Dict[str, str] All parameters after the overwrite (The same received). Examples -------- >>> import awswrangler as wr >>> pars = wr.catalog.overwrite_table_parameters( ... parameters={"source": "mysql", "destination": "datalake"}, ... database="...", ... table="...") """ session: boto3.Session = _utils.ensure_session(session=boto3_session) table_input: Optional[Dict[str, Any]] = _get_table_input( database=database, table=table, transaction_id=transaction_id, catalog_id=catalog_id, boto3_session=session ) if table_input is None: raise exceptions.InvalidTable(f"Table {table} does not exist on database {database}.") return _overwrite_table_parameters( parameters=parameters, database=database, catalog_id=catalog_id, transaction_id=transaction_id, table_input=table_input, boto3_session=session, catalog_versioning=catalog_versioning, )
def create_csv_table( database: str, table: str, path: str, columns_types: Dict[str, str], partitions_types: Optional[Dict[str, str]] = None, bucketing_info: Optional[Tuple[List[str], int]] = None, compression: Optional[str] = None, description: Optional[str] = None, parameters: Optional[Dict[str, str]] = None, columns_comments: Optional[Dict[str, str]] = None, mode: str = "overwrite", catalog_versioning: bool = False, sep: str = ",", skip_header_line_count: Optional[int] = None, boto3_session: Optional[boto3.Session] = None, projection_enabled: bool = False, projection_types: Optional[Dict[str, str]] = None, projection_ranges: Optional[Dict[str, str]] = None, projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, catalog_id: Optional[str] = None, ) -> None: """Create a CSV Table (Metadata Only) in the AWS Glue Catalog. 'https://docs.aws.amazon.com/athena/latest/ug/data-types.html' Parameters ---------- database : str Database name. table : str Table name. path : str Amazon S3 path (e.g. s3://bucket/prefix/). columns_types: Dict[str, str] Dictionary with keys as column names and values as data types (e.g. {'col0': 'bigint', 'col1': 'double'}). partitions_types: Dict[str, str], optional Dictionary with keys as partition names and values as data types (e.g. {'col2': 'date'}). bucketing_info: Tuple[List[str], int], optional Tuple consisting of the column names used for bucketing as the first element and the number of buckets as the second element. Only `str`, `int` and `bool` are supported as column data types for bucketing. compression : str, optional Compression style (``None``, ``gzip``, etc). description : str, optional Table description parameters : Dict[str, str], optional Key/value pairs to tag the table. columns_comments: Dict[str, str], optional Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). mode : str 'overwrite' to recreate any possible axisting table or 'append' to keep any possible axisting table. catalog_versioning : bool If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. sep : str String of length 1. Field delimiter for the output file. skip_header_line_count : Optional[int] Number of Lines to skip regarding to the header. projection_enabled : bool Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) projection_types : Optional[Dict[str, str]] Dictionary of partitions names and Athena projections types. Valid types: "enum", "integer", "date", "injected" https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) projection_ranges: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections ranges. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) projection_values: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections values. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) projection_intervals: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections intervals. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '5'}) projection_digits: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. Returns ------- None None. Examples -------- >>> import awswrangler as wr >>> wr.catalog.create_csv_table( ... database='default', ... table='my_table', ... path='s3://bucket/prefix/', ... columns_types={'col0': 'bigint', 'col1': 'double'}, ... partitions_types={'col2': 'date'}, ... compression='gzip', ... description='My own table!', ... parameters={'source': 'postgresql'}, ... columns_comments={'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'} ... ) """ session: boto3.Session = _utils.ensure_session(session=boto3_session) catalog_table_input: Optional[Dict[str, Any]] = _get_table_input( database=database, table=table, boto3_session=session, catalog_id=catalog_id) _create_csv_table( database=database, table=table, path=path, columns_types=columns_types, partitions_types=partitions_types, bucketing_info=bucketing_info, catalog_id=catalog_id, compression=compression, description=description, parameters=parameters, columns_comments=columns_comments, mode=mode, catalog_versioning=catalog_versioning, projection_enabled=projection_enabled, projection_types=projection_types, projection_ranges=projection_ranges, projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, boto3_session=boto3_session, catalog_table_input=catalog_table_input, sep=sep, skip_header_line_count=skip_header_line_count, )
def create_json_table( # pylint: disable=too-many-arguments database: str, table: str, path: str, columns_types: Dict[str, str], table_type: Optional[str] = None, partitions_types: Optional[Dict[str, str]] = None, bucketing_info: Optional[Tuple[List[str], int]] = None, compression: Optional[str] = None, description: Optional[str] = None, parameters: Optional[Dict[str, str]] = None, columns_comments: Optional[Dict[str, str]] = None, mode: str = "overwrite", catalog_versioning: bool = False, schema_evolution: bool = False, serde_library: Optional[str] = None, serde_parameters: Optional[Dict[str, str]] = None, transaction_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, projection_enabled: bool = False, projection_types: Optional[Dict[str, str]] = None, projection_ranges: Optional[Dict[str, str]] = None, projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, projection_storage_location_template: Optional[str] = None, catalog_id: Optional[str] = None, ) -> None: r"""Create a JSON Table (Metadata Only) in the AWS Glue Catalog. 'https://docs.aws.amazon.com/athena/latest/ug/data-types.html' Parameters ---------- database : str Database name. table : str Table name. path : str Amazon S3 path (e.g. s3://bucket/prefix/). columns_types: Dict[str, str] Dictionary with keys as column names and values as data types (e.g. {'col0': 'bigint', 'col1': 'double'}). table_type: str, optional The type of the Glue Table (EXTERNAL_TABLE, GOVERNED...). Set to EXTERNAL_TABLE if None partitions_types: Dict[str, str], optional Dictionary with keys as partition names and values as data types (e.g. {'col2': 'date'}). bucketing_info: Tuple[List[str], int], optional Tuple consisting of the column names used for bucketing as the first element and the number of buckets as the second element. Only `str`, `int` and `bool` are supported as column data types for bucketing. compression : str, optional Compression style (``None``, ``gzip``, etc). description : str, optional Table description parameters : Dict[str, str], optional Key/value pairs to tag the table. columns_comments: Dict[str, str], optional Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). mode : str 'overwrite' to recreate any possible axisting table or 'append' to keep any possible axisting table. catalog_versioning : bool If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. schema_evolution : bool If True allows schema evolution (new or missing columns), otherwise a exception will be raised. (Only considered if dataset=True and mode in ("append", "overwrite_partitions")) Related tutorial: https://aws-data-wrangler.readthedocs.io/en/2.11.0/tutorials/014%20-%20Schema%20Evolution.html serde_library : Optional[str] Specifies the SerDe Serialization library which will be used. You need to provide the Class library name as a string. If no library is provided the default is `org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe`. serde_parameters : Optional[str] Dictionary of initialization parameters for the SerDe. The default is `{"field.delim": sep, "escape.delim": "\\"}`. transaction_id: str, optional The ID of the transaction (i.e. used with GOVERNED tables). projection_enabled : bool Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) projection_types : Optional[Dict[str, str]] Dictionary of partitions names and Athena projections types. Valid types: "enum", "integer", "date", "injected" https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) projection_ranges: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections ranges. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) projection_values: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections values. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) projection_intervals: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections intervals. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '5'}) projection_digits: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) projection_storage_location_template: Optional[str] Value which is allows Athena to properly map partition values if the S3 file locations do not follow a typical `.../column=value/...` pattern. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html (e.g. s3://bucket/table_root/a=${a}/${b}/some_static_subdirectory/${c}/) boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. Returns ------- None None. Examples -------- >>> import awswrangler as wr >>> wr.catalog.create_json_table( ... database='default', ... table='my_table', ... path='s3://bucket/prefix/', ... columns_types={'col0': 'bigint', 'col1': 'double'}, ... partitions_types={'col2': 'date'}, ... description='My very own JSON table!', ... parameters={'source': 'postgresql'}, ... columns_comments={'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'} ... ) """ session: boto3.Session = _utils.ensure_session(session=boto3_session) catalog_table_input: Optional[Dict[str, Any]] = _get_table_input( database=database, table=table, boto3_session=session, catalog_id=catalog_id ) _create_json_table( database=database, table=table, path=path, columns_types=columns_types, table_type=table_type, partitions_types=partitions_types, bucketing_info=bucketing_info, catalog_id=catalog_id, compression=compression, description=description, parameters=parameters, columns_comments=columns_comments, mode=mode, catalog_versioning=catalog_versioning, transaction_id=transaction_id, schema_evolution=schema_evolution, projection_enabled=projection_enabled, projection_types=projection_types, projection_ranges=projection_ranges, projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, projection_storage_location_template=projection_storage_location_template, boto3_session=boto3_session, catalog_table_input=catalog_table_input, serde_library=serde_library, serde_parameters=serde_parameters, )
def create_parquet_table( database: str, table: str, path: str, columns_types: Dict[str, str], table_type: Optional[str] = None, partitions_types: Optional[Dict[str, str]] = None, bucketing_info: Optional[Tuple[List[str], int]] = None, catalog_id: Optional[str] = None, compression: Optional[str] = None, description: Optional[str] = None, parameters: Optional[Dict[str, str]] = None, columns_comments: Optional[Dict[str, str]] = None, mode: str = "overwrite", catalog_versioning: bool = False, transaction_id: Optional[str] = None, projection_enabled: bool = False, projection_types: Optional[Dict[str, str]] = None, projection_ranges: Optional[Dict[str, str]] = None, projection_values: Optional[Dict[str, str]] = None, projection_intervals: Optional[Dict[str, str]] = None, projection_digits: Optional[Dict[str, str]] = None, projection_storage_location_template: Optional[str] = None, boto3_session: Optional[boto3.Session] = None, ) -> None: """Create a Parquet Table (Metadata Only) in the AWS Glue Catalog. 'https://docs.aws.amazon.com/athena/latest/ug/data-types.html' Parameters ---------- database : str Database name. table : str Table name. path : str Amazon S3 path (e.g. s3://bucket/prefix/). columns_types: Dict[str, str] Dictionary with keys as column names and values as data types (e.g. {'col0': 'bigint', 'col1': 'double'}). table_type: str, optional The type of the Glue Table (EXTERNAL_TABLE, GOVERNED...). Set to EXTERNAL_TABLE if None partitions_types: Dict[str, str], optional Dictionary with keys as partition names and values as data types (e.g. {'col2': 'date'}). bucketing_info: Tuple[List[str], int], optional Tuple consisting of the column names used for bucketing as the first element and the number of buckets as the second element. Only `str`, `int` and `bool` are supported as column data types for bucketing. catalog_id : str, optional The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default. compression: str, optional Compression style (``None``, ``snappy``, ``gzip``, etc). description: str, optional Table description parameters: Dict[str, str], optional Key/value pairs to tag the table. columns_comments: Dict[str, str], optional Columns names and the related comments (e.g. {'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'}). mode: str 'overwrite' to recreate any possible existing table or 'append' to keep any possible existing table. catalog_versioning : bool If True and `mode="overwrite"`, creates an archived version of the table catalog before updating it. transaction_id: str, optional The ID of the transaction (i.e. used with GOVERNED tables). projection_enabled : bool Enable Partition Projection on Athena (https://docs.aws.amazon.com/athena/latest/ug/partition-projection.html) projection_types : Optional[Dict[str, str]] Dictionary of partitions names and Athena projections types. Valid types: "enum", "integer", "date", "injected" https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'enum', 'col2_name': 'integer'}) projection_ranges: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections ranges. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '0,10', 'col2_name': '-1,8675309'}) projection_values: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections values. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': 'A,B,Unknown', 'col2_name': 'foo,boo,bar'}) projection_intervals: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections intervals. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '5'}) projection_digits: Optional[Dict[str, str]] Dictionary of partitions names and Athena projections digits. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html (e.g. {'col_name': '1', 'col2_name': '2'}) projection_storage_location_template: Optional[str] Value which is allows Athena to properly map partition values if the S3 file locations do not follow a typical `.../column=value/...` pattern. https://docs.aws.amazon.com/athena/latest/ug/partition-projection-setting-up.html (e.g. s3://bucket/table_root/a=${a}/${b}/some_static_subdirectory/${c}/) boto3_session : boto3.Session(), optional Boto3 Session. The default boto3 session will be used if boto3_session receive None. Returns ------- None None. Examples -------- >>> import awswrangler as wr >>> wr.catalog.create_parquet_table( ... database='default', ... table='my_table', ... path='s3://bucket/prefix/', ... columns_types={'col0': 'bigint', 'col1': 'double'}, ... partitions_types={'col2': 'date'}, ... compression='snappy', ... description='My own table!', ... parameters={'source': 'postgresql'}, ... columns_comments={'col0': 'Column 0.', 'col1': 'Column 1.', 'col2': 'Partition.'} ... ) """ session: boto3.Session = _utils.ensure_session(session=boto3_session) catalog_table_input: Optional[Dict[str, Any]] = _get_table_input( database=database, table=table, boto3_session=session, transaction_id=transaction_id, catalog_id=catalog_id ) _create_parquet_table( database=database, table=table, path=path, columns_types=columns_types, table_type=table_type, partitions_types=partitions_types, bucketing_info=bucketing_info, catalog_id=catalog_id, compression=compression, description=description, parameters=parameters, columns_comments=columns_comments, mode=mode, catalog_versioning=catalog_versioning, transaction_id=transaction_id, projection_enabled=projection_enabled, projection_types=projection_types, projection_ranges=projection_ranges, projection_values=projection_values, projection_intervals=projection_intervals, projection_digits=projection_digits, projection_storage_location_template=projection_storage_location_template, boto3_session=boto3_session, catalog_table_input=catalog_table_input, )