def delete_database(
    database: str,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Delete a given Timestream database. This is an irreversible operation.

    After a database is deleted, the time series data from its tables cannot be recovered.

    All tables in the database must be deleted first, or a ValidationException error will be thrown.

    Due to the nature of distributed retries,
    the operation can return either success or a ResourceNotFoundException.
    Clients should consider them equivalent.

    Parameters
    ----------
    database : str
        Database name.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    Deleting a database

    >>> import awswrangler as wr
    >>> wr.timestream.delete_database("MyDatabase")

    """
    client: boto3.client = _utils.client(service_name="timestream-write", session=boto3_session)
    client.delete_database(DatabaseName=database)
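# Usage sketch (hypothetical database/table names; assumes AWS credentials with
# Timestream write permissions are configured). Tables must be dropped before the
# database itself, otherwise the service raises a ValidationException:
import awswrangler as wr

wr.timestream.delete_table(database="MyDatabase", table="MyTable")
wr.timestream.delete_database(database="MyDatabase")  # irreversible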
def get_table_types(
    database: str, table: str, boto3_session: Optional[boto3.Session] = None
) -> Dict[str, str]:
    """Get all columns and types from a table.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, str]
        A dictionary as {'col name': 'col data type'}.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.get_table_types(database='default', table='my_table')
    {'col0': 'int', 'col1': 'double'}

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    response: Dict[str, Any] = client_glue.get_table(DatabaseName=database, Name=table)
    dtypes: Dict[str, str] = {}
    for col in response["Table"]["StorageDescriptor"]["Columns"]:
        dtypes[col["Name"]] = col["Type"]
    for par in response["Table"]["PartitionKeys"]:
        dtypes[par["Name"]] = par["Type"]
    return dtypes
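# Usage sketch (hypothetical database/table names): the returned mapping merges
# regular and partition columns, so it can drive simple schema inspections.
import awswrangler as wr

dtypes = wr.catalog.get_table_types(database="default", table="my_table")
for name, dtype in dtypes.items():
    print(f"{name}: {dtype}")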
def _get_table_input(
    database: str,
    table: str,
    boto3_session: Optional[boto3.Session],
    transaction_id: Optional[str] = None,
    catalog_id: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    args: Dict[str, Any] = _catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(transaction_id=transaction_id, DatabaseName=database, Name=table),
    )
    try:
        response: Dict[str, Any] = client_glue.get_table(**args)
    except client_glue.exceptions.EntityNotFoundException:
        return None
    table_input: Dict[str, Any] = {}
    for k, v in response["Table"].items():
        if k in [
            "Name",
            "Description",
            "Owner",
            "LastAccessTime",
            "LastAnalyzedTime",
            "Retention",
            "StorageDescriptor",
            "PartitionKeys",
            "ViewOriginalText",
            "ViewExpandedText",
            "TableType",
            "Parameters",
            "TargetTable",
        ]:
            table_input[k] = v
    return table_input
def get_table_versions(
    database: str, table: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
) -> List[Dict[str, Any]]:
    """Get all versions of a table.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    List[Dict[str, Any]]
        List of table inputs:
        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue.html#Glue.Client.get_table_versions

    Examples
    --------
    >>> import awswrangler as wr
    >>> tables_versions = wr.catalog.get_table_versions(database="...", table="...")

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    paginator = client_glue.get_paginator("get_table_versions")
    versions: List[Dict[str, Any]] = []
    response_iterator = paginator.paginate(
        **_catalog_id(DatabaseName=database, TableName=table, catalog_id=catalog_id)
    )
    for page in response_iterator:
        for tbl in page["TableVersions"]:
            versions.append(tbl)
    return versions
def get_query_columns_types(query_execution_id: str, boto3_session: Optional[boto3.Session] = None) -> Dict[str, str]:
    """Get the data type of all columns queried.

    https://docs.aws.amazon.com/athena/latest/ug/data-types.html

    Parameters
    ----------
    query_execution_id : str
        Athena query execution ID.
    boto3_session : boto3.Session(), optional
        Boto3 Session. If none, the default boto3 session is used.

    Returns
    -------
    Dict[str, str]
        Dictionary with all data types.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.athena.get_query_columns_types('query-execution-id')
    {'col0': 'int', 'col1': 'double'}

    """
    client_athena: boto3.client = _utils.client(
        service_name="athena", session=_utils.ensure_session(session=boto3_session)
    )
    response: Dict[str, Any] = client_athena.get_query_results(QueryExecutionId=query_execution_id, MaxResults=1)
    col_info: List[Dict[str, str]] = response["ResultSet"]["ResultSetMetadata"]["ColumnInfo"]
    return dict(
        (c["Name"], f"{c['Type']}({c['Precision']},{c.get('Scale', 0)})")
        if c["Type"] in ["decimal"]
        else (c["Name"], c["Type"])
        for c in col_info
    )
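# Usage sketch (hypothetical execution ID): note that decimal columns come back
# with precision and scale baked into the type string, e.g. 'decimal(10,2)'.
import awswrangler as wr

types = wr.athena.get_query_columns_types("query-execution-id")
decimals = {k: v for k, v in types.items() if v.startswith("decimal")}
print(decimals)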
def delete_table_if_exists(
    database: str, table: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
) -> bool:
    """Delete Glue table if exists.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    bool
        True if deleted, otherwise False.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.delete_table_if_exists(database='default', table='my_table')  # deleted
    True
    >>> wr.catalog.delete_table_if_exists(database='default', table='my_table')  # Nothing to be deleted
    False

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    try:
        client_glue.delete_table(**_catalog_id(DatabaseName=database, Name=table, catalog_id=catalog_id))
        return True
    except client_glue.exceptions.EntityNotFoundException:
        return False
def get_connection(
    name: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
) -> Dict[str, Any]:
    """Get Glue connection details.

    Parameters
    ----------
    name : str
        Connection name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, Any]
        API Response for:
        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue.html#Glue.Client.get_connection

    Examples
    --------
    >>> import awswrangler as wr
    >>> res = wr.catalog.get_connection(name='my_connection')

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    if catalog_id is None:
        return client_glue.get_connection(Name=name, HidePassword=False)["Connection"]
    return client_glue.get_connection(CatalogId=catalog_id, Name=name, HidePassword=False)["Connection"]
def delete_table_if_exists(database: str, table: str, boto3_session: Optional[boto3.Session] = None) -> bool:
    """Delete Glue table if exists.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    bool
        True if deleted, otherwise False.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.delete_table_if_exists(database='default', table='my_table')  # deleted
    True
    >>> wr.catalog.delete_table_if_exists(database='default', table='my_table')  # Nothing to be deleted
    False

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    try:
        client_glue.delete_table(DatabaseName=database, Name=table)
        return True
    except client_glue.exceptions.EntityNotFoundException:
        return False
def get_table_description(
    database: str, table: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
) -> Optional[str]:
    """Get table description.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Optional[str]
        Description if exists.

    Examples
    --------
    >>> import awswrangler as wr
    >>> desc = wr.catalog.get_table_description(database="...", table="...")

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    response: Dict[str, Any] = client_glue.get_table(
        **_catalog_id(catalog_id=catalog_id, DatabaseName=database, Name=table)
    )
    desc: Optional[str] = response["Table"].get("Description", None)
    return desc
def search_tables(
    text: str, catalog_id: Optional[str] = None, boto3_session: Optional[boto3.Session] = None
) -> Iterator[Dict[str, Any]]:
    """Get an iterator of tables filtered by a search string.

    Parameters
    ----------
    text : str, optional
        Select only tables with the given string in table's properties.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Iterator[Dict[str, Any]]
        Iterator of tables.

    Examples
    --------
    >>> import awswrangler as wr
    >>> tables = wr.catalog.search_tables(text='my_property')

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    args: Dict[str, Any] = {"SearchText": text}
    if catalog_id is not None:
        args["CatalogId"] = catalog_id
    response: Dict[str, Any] = client_glue.search_tables(**args)
    for tbl in response["TableList"]:
        yield tbl
    while "NextToken" in response:  # pragma: no cover
        args["NextToken"] = response["NextToken"]
        response = client_glue.search_tables(**args)
        for tbl in response["TableList"]:
            yield tbl
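# Usage sketch (hypothetical search text): the function is a generator, so the
# results can be consumed lazily or materialized with list().
import awswrangler as wr

for tbl in wr.catalog.search_tables(text="my_property"):
    print(tbl["Name"])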
def get_account_id(boto3_session: Optional[boto3.Session] = None) -> str:
    """Get Account ID.

    Parameters
    ----------
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Account ID.

    Examples
    --------
    >>> import awswrangler as wr
    >>> account_id = wr.sts.get_account_id()

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    return cast(str, _utils.client(service_name="sts", session=session).get_caller_identity().get("Account"))
def terminate_cluster(cluster_id: str, boto3_session: Optional[boto3.Session] = None) -> None:
    """Terminate EMR cluster.

    Parameters
    ----------
    cluster_id : str
        Cluster ID.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.emr.terminate_cluster("cluster-id")

    """
    client_emr: boto3.client = _utils.client(service_name="emr", session=boto3_session)
    response: Dict[str, Any] = client_emr.terminate_job_flows(JobFlowIds=[cluster_id])
    _logger.debug("response: \n%s", pprint.pformat(response))
def _list(
    func_name: str,
    attr_name: str,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
    **kwargs,
) -> List[Dict[str, Any]]:
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)  # pragma: no cover
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    func: Callable = getattr(client, func_name)
    response = func(AwsAccountId=account_id, **kwargs)
    next_token: str = response.get("NextToken", None)
    result: List[Dict[str, Any]] = response[attr_name]
    while next_token is not None:  # pragma: no cover
        response = func(AwsAccountId=account_id, NextToken=next_token, **kwargs)
        next_token = response.get("NextToken", None)
        result += response[attr_name]
    return result
def wait_query(query_execution_id: str, boto3_session: Optional[boto3.Session] = None) -> Dict[str, Any]:
    """Wait for the query to end.

    Parameters
    ----------
    query_execution_id : str
        Athena query execution ID.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, Any]
        Dictionary with the get_query_execution response.

    Examples
    --------
    >>> import awswrangler as wr
    >>> res = wr.athena.wait_query(query_execution_id='query-execution-id')

    """
    final_states: List[str] = ["FAILED", "SUCCEEDED", "CANCELLED"]
    client_athena: boto3.client = _utils.client(service_name="athena", session=boto3_session)
    response: Dict[str, Any] = client_athena.get_query_execution(QueryExecutionId=query_execution_id)
    state: str = response["QueryExecution"]["Status"]["State"]
    while state not in final_states:
        time.sleep(_QUERY_WAIT_POLLING_DELAY)
        response = client_athena.get_query_execution(QueryExecutionId=query_execution_id)
        state = response["QueryExecution"]["Status"]["State"]
    _logger.debug("state: %s", state)
    _logger.debug("StateChangeReason: %s", response["QueryExecution"]["Status"].get("StateChangeReason"))
    if state == "FAILED":
        raise exceptions.QueryFailed(response["QueryExecution"]["Status"].get("StateChangeReason"))
    if state == "CANCELLED":
        raise exceptions.QueryCancelled(response["QueryExecution"]["Status"].get("StateChangeReason"))
    return response
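# Usage sketch (hypothetical query ID): failures and cancellations surface as
# exceptions, so a caller only has to handle the success path inline.
import awswrangler as wr

try:
    res = wr.athena.wait_query(query_execution_id="query-execution-id")
    print(res["QueryExecution"]["Status"]["State"])  # "SUCCEEDED"
except wr.exceptions.QueryFailed as ex:
    print(f"Query failed: {ex}")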
def read_sql_query(
    sql: str,
    database: str,
    transaction_id: Optional[str] = None,
    query_as_of_time: Optional[str] = None,
    catalog_id: Optional[str] = None,
    categories: Optional[List[str]] = None,
    safe: bool = True,
    map_types: bool = True,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    params: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
    """Execute PartiQL query on AWS Glue Table (Transaction ID or time travel timestamp). Return Pandas DataFrame.

    Note
    ----
    ORDER BY operations are not honoured.
    i.e. sql="SELECT * FROM my_table ORDER BY my_column" is NOT valid.

    Note
    ----
    The database must NOT be explicitly defined in the PartiQL statement.
    i.e. sql="SELECT * FROM my_table" is valid
    but sql="SELECT * FROM my_db.my_table" is NOT valid.

    Note
    ----
    Pass one of `transaction_id` or `query_as_of_time`, not both.

    Parameters
    ----------
    sql : str
        PartiQL query.
    database : str
        AWS Glue database name.
    transaction_id : str, optional
        The ID of the transaction at which to read the table contents.
        Cannot be specified alongside query_as_of_time.
    query_as_of_time : str, optional
        The time as of when to read the table contents. Must be a valid Unix epoch timestamp.
        Cannot be specified alongside transaction_id.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    categories : Optional[List[str]], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    safe : bool, default True
        For certain data types, a cast is needed in order to store the
        data in a pandas DataFrame or Series (e.g. timestamps are always
        stored as nanoseconds in pandas). This option controls whether it
        is a safe cast or not.
    map_types : bool, default True
        True to convert pyarrow DataTypes to pandas ExtensionDtypes. It is
        used to override the default pandas type for conversion of built-in
        pyarrow types or in absence of pandas_metadata in the Table schema.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        When enabled, os.cpu_count() is used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session is used if boto3_session receives None.
    params : Dict[str, Any], optional
        Dict of parameters used to format the PartiQL query. Only named parameters are supported.
        The dict must contain the information in the form {"name": "value"} and the SQL query must contain
        `:name`.

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df = wr.lakeformation.read_sql_query(
    ...     sql="SELECT * FROM my_table;",
    ...     database="my_db",
    ...     catalog_id="111111111111"
    ... )

    >>> import awswrangler as wr
    >>> df = wr.lakeformation.read_sql_query(
    ...     sql="SELECT * FROM my_table LIMIT 10;",
    ...     database="my_db",
    ...     transaction_id="1b62811fa3e02c4e5fdbaa642b752030379c4a8a70da1f8732ce6ccca47afdc9"
    ... )

    >>> import awswrangler as wr
    >>> df = wr.lakeformation.read_sql_query(
    ...     sql="SELECT * FROM my_table WHERE name=:name; AND city=:city;",
    ...     database="my_db",
    ...     query_as_of_time="1611142914",
    ...     params={"name": "'filtered_name'", "city": "'filtered_city'"}
    ... )

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client_lakeformation: boto3.client = _utils.client(service_name="lakeformation", session=session)
    commit_trans: bool = False
    if params is None:
        params = {}
    for key, value in params.items():
        sql = sql.replace(f":{key};", str(value))
    if not any([transaction_id, query_as_of_time]):
        _logger.debug("Neither `transaction_id` nor `query_as_of_time` were specified, starting transaction")
        transaction_id = start_transaction(read_only=True, boto3_session=session)
        commit_trans = True
    args: Dict[str, Optional[str]] = _catalog_id(
        catalog_id=catalog_id,
        **_transaction_id(transaction_id=transaction_id, query_as_of_time=query_as_of_time, DatabaseName=database),
    )
    query_id: str = client_lakeformation.start_query_planning(QueryString=sql, QueryPlanningContext=args)["QueryId"]
    df = _resolve_sql_query(
        query_id=query_id,
        categories=categories,
        safe=safe,
        map_types=map_types,
        use_threads=use_threads,
        boto3_session=session,
    )
    if commit_trans:
        commit_transaction(transaction_id=transaction_id)  # type: ignore
    return df
def start_query(
    query: str,
    log_group_names: List[str],
    start_time: datetime.datetime = datetime.datetime(year=1970, month=1, day=1, tzinfo=datetime.timezone.utc),
    end_time: datetime.datetime = datetime.datetime.now(),  # NOTE: default is evaluated once at import time
    limit: Optional[int] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Run a query against AWS CloudWatchLogs Insights.

    https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/CWL_QuerySyntax.html

    Parameters
    ----------
    query : str
        The query string.
    log_group_names : List[str]
        The list of log groups to be queried. You can include up to 20 log groups.
    start_time : datetime.datetime
        The beginning of the time range to query.
    end_time : datetime.datetime
        The end of the time range to query.
    limit : Optional[int]
        The maximum number of log events to return in the query.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Query ID.

    Examples
    --------
    >>> import awswrangler as wr
    >>> query_id = wr.cloudwatch.start_query(
    ...     log_group_names=["loggroup"],
    ...     query="fields @timestamp, @message | sort @timestamp desc | limit 5",
    ... )

    """
    _logger.debug("log_group_names: %s", log_group_names)
    start_timestamp: int = int(1000 * start_time.timestamp())
    end_timestamp: int = int(1000 * end_time.timestamp())
    _logger.debug("start_timestamp: %s", start_timestamp)
    _logger.debug("end_timestamp: %s", end_timestamp)
    _validate_args(start_timestamp=start_timestamp, end_timestamp=end_timestamp)
    args: Dict[str, Any] = {
        "logGroupNames": log_group_names,
        "startTime": start_timestamp,
        "endTime": end_timestamp,
        "queryString": query,
    }
    if limit is not None:
        args["limit"] = limit
    client_logs: boto3.client = _utils.client(service_name="logs", session=boto3_session)
    response: Dict[str, Any] = client_logs.start_query(**args)
    return cast(str, response["queryId"])
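# Usage sketch (hypothetical log group name): pass end_time explicitly rather than
# relying on the default, since the default is captured once at import time.
import datetime

import awswrangler as wr

query_id = wr.cloudwatch.start_query(
    log_group_names=["loggroup"],
    query="fields @timestamp, @message | sort @timestamp desc | limit 5",
    start_time=datetime.datetime(2021, 1, 1, tzinfo=datetime.timezone.utc),
    end_time=datetime.datetime.now(datetime.timezone.utc),
)
print(query_id)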
def __init__(
    self,
    path: str,
    s3_block_size: int,
    mode: str,
    use_threads: bool,
    s3_additional_kwargs: Optional[Dict[str, str]],
    boto3_session: Optional[boto3.Session],
    newline: Optional[str],
    encoding: Optional[str],
) -> None:
    self.closed: bool = False
    self._use_threads = use_threads
    self._newline: str = "\n" if newline is None else newline
    self._encoding: str = "utf-8" if encoding is None else encoding
    self._bucket, self._key = _utils.parse_path(path=path)
    self._boto3_session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if mode not in {"rb", "wb", "r", "w"}:
        raise NotImplementedError("File mode must be {'rb', 'wb', 'r', 'w'}, not %s" % mode)
    self._mode: str = "rb" if mode is None else mode
    self._one_shot_download: bool = False
    if 0 < s3_block_size < 3:
        raise exceptions.InvalidArgumentValue(
            "s3_block_size MUST be > 2 to define a valid size or "
            "< 1 to avoid blocks and always execute one shot downloads."
        )
    if s3_block_size <= 0:
        _logger.debug("s3_block_size of %d, enabling one_shot_download.", s3_block_size)
        self._one_shot_download = True
    self._s3_block_size: int = s3_block_size
    self._s3_half_block_size: int = s3_block_size // 2
    self._s3_additional_kwargs: Dict[str, str] = {} if s3_additional_kwargs is None else s3_additional_kwargs
    self._client: boto3.client = _utils.client(service_name="s3", session=self._boto3_session)
    self._loc: int = 0
    if self.readable() is True:
        self._cache: bytes = b""
        self._start: int = 0
        self._end: int = 0
        size: Optional[int] = size_objects(path=[path], use_threads=False, boto3_session=self._boto3_session)[path]
        if size is None:
            raise exceptions.InvalidArgumentValue(f"S3 object w/o defined size: {path}")
        self._size: int = size
        _logger.debug("self._size: %s", self._size)
        _logger.debug("self._s3_block_size: %s", self._s3_block_size)
    elif self.writable() is True:
        self._mpu: Dict[str, Any] = {}
        self._buffer: io.BytesIO = io.BytesIO()
        self._parts_count: int = 0
        self._size = 0
        self._upload_proxy: _UploadProxy = _UploadProxy(use_threads=self._use_threads)
    else:
        raise RuntimeError(f"Invalid mode: {self._mode}")
def create_table(
    database: str,
    table: str,
    memory_retention_hours: int,
    magnetic_retention_days: int,
    tags: Optional[Dict[str, str]] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Create a new Timestream table.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    memory_retention_hours : int
        The duration for which data must be stored in the memory store.
    magnetic_retention_days : int
        The duration for which data must be stored in the magnetic store.
    tags : Optional[Dict[str, str]]
        Key/Value dict to put on the table.
        Tags enable you to categorize databases and/or tables,
        for example, by purpose, owner, or environment.
        e.g. {"foo": "boo", "bar": "xoo"}
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receives None.

    Returns
    -------
    str
        The Amazon Resource Name (ARN) that uniquely identifies this table.

    Examples
    --------
    Creating a table.

    >>> import awswrangler as wr
    >>> arn = wr.timestream.create_table(
    ...     database="MyDatabase",
    ...     table="MyTable",
    ...     memory_retention_hours=3,
    ...     magnetic_retention_days=7
    ... )

    """
    client: boto3.client = _utils.client(service_name="timestream-write", session=boto3_session)
    args: Dict[str, Any] = {
        "DatabaseName": database,
        "TableName": table,
        "RetentionProperties": {
            "MemoryStoreRetentionPeriodInHours": memory_retention_hours,
            "MagneticStoreRetentionPeriodInDays": magnetic_retention_days,
        },
    }
    if tags is not None:
        args["Tags"] = [{"Key": k, "Value": v} for k, v in tags.items()]
    response: Dict[str, Dict[str, Any]] = client.create_table(**args)
    return cast(str, response["Table"]["Arn"])
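# Usage sketch (hypothetical names): a database must exist before its tables, and
# retention is split between the memory store and the magnetic store.
import awswrangler as wr

wr.timestream.create_database("MyDatabase")
arn = wr.timestream.create_table(
    database="MyDatabase",
    table="MyTable",
    memory_retention_hours=3,
    magnetic_retention_days=7,
    tags={"env": "dev"},
)
print(arn)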
def _extract_ctas_manifest_paths(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]:
    """Get the list of paths of the generated files."""
    bucket_name, key_path = _utils.parse_path(path)
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    body: bytes = client_s3.get_object(Bucket=bucket_name, Key=key_path)["Body"].read()
    return [x for x in body.decode("utf-8").split("\n") if x != ""]
def _list_objects(  # pylint: disable=too-many-branches
    path: str,
    delimiter: Optional[str] = None,
    suffix: Union[str, List[str], None] = None,
    ignore_suffix: Union[str, List[str], None] = None,
    last_modified_begin: Optional[datetime.datetime] = None,
    last_modified_end: Optional[datetime.datetime] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> List[str]:
    bucket: str
    prefix_original: str
    bucket, prefix_original = _utils.parse_path(path=path)
    prefix: str = _prefix_cleanup(prefix=prefix_original)
    _suffix: Union[List[str], None] = [suffix] if isinstance(suffix, str) else suffix
    _ignore_suffix: Union[List[str], None] = [ignore_suffix] if isinstance(ignore_suffix, str) else ignore_suffix
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    paginator = client_s3.get_paginator("list_objects_v2")
    args: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix, "PaginationConfig": {"PageSize": 1000}}
    if delimiter is not None:
        args["Delimiter"] = delimiter
    response_iterator = paginator.paginate(**args)
    paths: List[str] = []
    _validate_datetimes(last_modified_begin=last_modified_begin, last_modified_end=last_modified_end)
    for page in response_iterator:  # pylint: disable=too-many-nested-blocks
        if delimiter is None:
            contents: Optional[List[Dict[str, Any]]] = page.get("Contents")
            if contents is not None:
                for content in contents:
                    # Check for the "Key" entry before accessing it.
                    if (content is not None) and ("Key" in content):
                        key: str = content["Key"]
                        if (_suffix is None) or key.endswith(tuple(_suffix)):
                            if last_modified_begin is not None:
                                if content["LastModified"] < last_modified_begin:
                                    continue
                            if last_modified_end is not None:
                                if content["LastModified"] > last_modified_end:
                                    continue
                            paths.append(f"s3://{bucket}/{key}")
        else:
            prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get("CommonPrefixes")
            if prefixes is not None:
                for pfx in prefixes:
                    if (pfx is not None) and ("Prefix" in pfx):
                        key = pfx["Prefix"]
                        paths.append(f"s3://{bucket}/{key}")
    if prefix != prefix_original:
        paths = fnmatch.filter(paths, path)
    if _ignore_suffix is not None:
        paths = [p for p in paths if p.endswith(tuple(_ignore_suffix)) is False]
    return paths
def _create_table(  # pylint: disable=too-many-branches,too-many-statements
    database: str,
    table: str,
    description: Optional[str],
    parameters: Optional[Dict[str, str]],
    mode: str,
    catalog_versioning: bool,
    boto3_session: Optional[boto3.Session],
    table_input: Dict[str, Any],
    table_exist: bool,
    projection_enabled: bool,
    partitions_types: Optional[Dict[str, str]],
    columns_comments: Optional[Dict[str, str]],
    projection_types: Optional[Dict[str, str]],
    projection_ranges: Optional[Dict[str, str]],
    projection_values: Optional[Dict[str, str]],
    projection_intervals: Optional[Dict[str, str]],
    projection_digits: Optional[Dict[str, str]],
    catalog_id: Optional[str],
) -> None:
    # Description
    mode = _update_if_necessary(dic=table_input, key="Description", value=description, mode=mode)

    # Parameters
    parameters = parameters if parameters else {}
    for k, v in parameters.items():
        mode = _update_if_necessary(dic=table_input["Parameters"], key=k, value=v, mode=mode)

    # Projection
    if projection_enabled is True:
        table_input["Parameters"]["projection.enabled"] = "true"
        partitions_types = partitions_types if partitions_types else {}
        projection_types = projection_types if projection_types else {}
        projection_ranges = projection_ranges if projection_ranges else {}
        projection_values = projection_values if projection_values else {}
        projection_intervals = projection_intervals if projection_intervals else {}
        projection_digits = projection_digits if projection_digits else {}
        projection_types = {sanitize_column_name(k): v for k, v in projection_types.items()}
        projection_ranges = {sanitize_column_name(k): v for k, v in projection_ranges.items()}
        projection_values = {sanitize_column_name(k): v for k, v in projection_values.items()}
        projection_intervals = {sanitize_column_name(k): v for k, v in projection_intervals.items()}
        projection_digits = {sanitize_column_name(k): v for k, v in projection_digits.items()}
        for k, v in projection_types.items():
            dtype: Optional[str] = partitions_types.get(k)
            if dtype is None:
                raise exceptions.InvalidArgumentCombination(
                    f"Column {k} appears as projected column but not as partitioned column."
                )
            if dtype == "date":
                table_input["Parameters"][f"projection.{k}.format"] = "yyyy-MM-dd"
            elif dtype == "timestamp":
                table_input["Parameters"][f"projection.{k}.format"] = "yyyy-MM-dd HH:mm:ss"
                table_input["Parameters"][f"projection.{k}.interval.unit"] = "SECONDS"
                table_input["Parameters"][f"projection.{k}.interval"] = "1"
        for k, v in projection_types.items():
            mode = _update_if_necessary(dic=table_input["Parameters"], key=f"projection.{k}.type", value=v, mode=mode)
        for k, v in projection_ranges.items():
            mode = _update_if_necessary(dic=table_input["Parameters"], key=f"projection.{k}.range", value=v, mode=mode)
        for k, v in projection_values.items():
            mode = _update_if_necessary(dic=table_input["Parameters"], key=f"projection.{k}.values", value=v, mode=mode)
        for k, v in projection_intervals.items():
            mode = _update_if_necessary(
                dic=table_input["Parameters"], key=f"projection.{k}.interval", value=str(v), mode=mode
            )
        for k, v in projection_digits.items():
            mode = _update_if_necessary(
                dic=table_input["Parameters"], key=f"projection.{k}.digits", value=str(v), mode=mode
            )
    else:
        table_input["Parameters"]["projection.enabled"] = "false"

    # Column comments
    columns_comments = columns_comments if columns_comments else {}
    columns_comments = {sanitize_column_name(k): v for k, v in columns_comments.items()}
    if columns_comments:
        for col in table_input["StorageDescriptor"]["Columns"]:
            name: str = col["Name"]
            if name in columns_comments:
                mode = _update_if_necessary(dic=col, key="Comment", value=columns_comments[name], mode=mode)
        for par in table_input["PartitionKeys"]:
            name = par["Name"]
            if name in columns_comments:
                mode = _update_if_necessary(dic=par, key="Comment", value=columns_comments[name], mode=mode)

    _logger.debug("table_input: %s", table_input)
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client_glue: boto3.client = _utils.client(service_name="glue", session=session)
    skip_archive: bool = not catalog_versioning
    if mode not in ("overwrite", "append", "overwrite_partitions", "update"):
        raise exceptions.InvalidArgument(
            f"{mode} is not a valid mode. It must be 'overwrite', 'append', 'overwrite_partitions' or 'update'."
        )
    if table_exist is True and mode == "overwrite":
        delete_all_partitions(table=table, database=database, catalog_id=catalog_id, boto3_session=session)
        _logger.debug("Updating table (%s)...", mode)
        client_glue.update_table(
            **_catalog_id(
                catalog_id=catalog_id, DatabaseName=database, TableInput=table_input, SkipArchive=skip_archive
            )
        )
    elif (table_exist is True) and (mode in ("append", "overwrite_partitions", "update")):
        if mode == "update":
            _logger.debug("Updating table (%s)...", mode)
            client_glue.update_table(
                **_catalog_id(
                    catalog_id=catalog_id, DatabaseName=database, TableInput=table_input, SkipArchive=skip_archive
                )
            )
    elif table_exist is False:
        try:
            _logger.debug("Creating table (%s)...", mode)
            client_glue.create_table(
                **_catalog_id(catalog_id=catalog_id, DatabaseName=database, TableInput=table_input)
            )
        except client_glue.exceptions.AlreadyExistsException:
            if mode == "overwrite":
                _utils.try_it(
                    f=_overwrite_table,
                    ex=client_glue.exceptions.AlreadyExistsException,
                    client_glue=client_glue,
                    catalog_id=catalog_id,
                    database=database,
                    table=table,
                    table_input=table_input,
                    boto3_session=boto3_session,
                )
    _logger.debug("Leaving table as is (%s)...", mode)
def _resolve_sql_query(
    query_id: str,
    categories: Optional[List[str]],
    safe: bool,
    map_types: bool,
    use_threads: bool,
    boto3_session: boto3.Session,
) -> pd.DataFrame:
    client_lakeformation: boto3.client = _utils.client(service_name="lakeformation", session=boto3_session)
    wait_query(query_id=query_id, boto3_session=boto3_session)

    # The LF Query Engine distributes the load across workers
    # Retrieve the tokens and their associated work units until NextToken is ''
    # One Token can span multiple work units
    # PageSize determines the size of the "Units" array in each call
    scan_kwargs: Dict[str, Union[str, int]] = {"QueryId": query_id, "PageSize": 10}
    next_token: str = "init_token"  # Dummy token
    token_work_units: List[Tuple[str, int]] = []
    while next_token:
        response = client_lakeformation.get_work_units(**scan_kwargs)
        token_work_units.extend(  # [(Token0, WorkUnitId0), (Token0, WorkUnitId1), (Token1, WorkUnitId2) ... ]
            [
                (unit["WorkUnitToken"], unit_id)
                for unit in response["WorkUnitRanges"]
                for unit_id in range(unit["WorkUnitIdMin"], unit["WorkUnitIdMax"] + 1)  # Max is inclusive
            ]
        )
        next_token = response.get("NextToken", None)
        scan_kwargs["NextToken"] = next_token

    tables: List[Table] = []
    if use_threads is False:
        tables = list(
            _get_work_unit_results(
                query_id=query_id,
                token_work_unit=token_work_unit,
                client_lakeformation=client_lakeformation,
            )
            for token_work_unit in token_work_units
        )
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            tables = list(
                executor.map(
                    _get_work_unit_results,
                    itertools.repeat(query_id),
                    token_work_units,
                    itertools.repeat(client_lakeformation),
                )
            )
    table = concat_tables(tables)
    args = {
        "use_threads": use_threads,
        "split_blocks": True,
        "self_destruct": True,
        "integer_object_nulls": False,
        "date_as_object": True,
        "ignore_metadata": True,
        "strings_to_categorical": False,
        "categories": categories,
        "safe": safe,
        "types_mapper": _data_types.pyarrow2pandas_extension if map_types else None,
    }
    return _utils.ensure_df_is_mutable(df=table.to_pandas(**args))
def create_athena_dataset(
    name: str,
    database: Optional[str] = None,
    table: Optional[str] = None,
    sql: Optional[str] = None,
    sql_name: str = "CustomSQL",
    data_source_name: Optional[str] = None,
    data_source_arn: Optional[str] = None,
    import_mode: str = "DIRECT_QUERY",
    allowed_to_use: Optional[List[str]] = None,
    allowed_to_manage: Optional[List[str]] = None,
    logical_table_alias: str = "LogicalTable",
    rename_columns: Optional[Dict[str, str]] = None,
    cast_columns_types: Optional[Dict[str, str]] = None,
    tags: Optional[Dict[str, str]] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    """Create a QuickSight dataset.

    Note
    ----
    You will not be able to see the dataset in the console
    if you do not pass your user to one of the ``allowed_*`` arguments.

    Note
    ----
    You must pass ``database``/``table`` OR ``sql`` argument.

    Note
    ----
    You must pass ``data_source_name`` OR ``data_source_arn`` argument.

    Parameters
    ----------
    name : str
        Dataset name.
    database : str
        Athena's database name.
    table : str
        Athena's table name.
    sql : str
        Use a SQL query to define your table.
    sql_name : str
        Query name.
    data_source_name : str, optional
        QuickSight data source name.
    data_source_arn : str, optional
        QuickSight data source ARN.
    import_mode : str
        Indicates whether you want to import the data into SPICE.
        'SPICE'|'DIRECT_QUERY'
    tags : Dict[str, str], optional
        Key/Value collection to put on the Cluster.
        e.g. {"foo": "boo", "bar": "xoo"}
    allowed_to_use : optional
        List of principals that will be allowed to see and use the data source.
        e.g. ["john", "Mary"]
    allowed_to_manage : optional
        List of principals that will be allowed to see, use, update and delete the data source.
        e.g. ["Mary"]
    logical_table_alias : str
        A display name for the logical table.
    rename_columns : Dict[str, str], optional
        Dictionary to map column renames. e.g. {"old_name": "new_name", "old_name2": "new_name2"}
    cast_columns_types : Dict[str, str], optional
        Dictionary to map column casts. e.g. {"col_name": "STRING", "col_name2": "DECIMAL"}
        Valid types: 'STRING'|'INTEGER'|'DECIMAL'|'DATETIME'
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Dataset ID.

    Examples
    --------
    >>> import awswrangler as wr
    >>> dataset_id = wr.quicksight.create_athena_dataset(
    ...     name="...",
    ...     database="...",
    ...     table="...",
    ...     data_source_name="...",
    ...     allowed_to_manage=["Mary"]
    ... )

    """
    if (data_source_name is None) and (data_source_arn is None):
        raise exceptions.InvalidArgument("You must pass a not None data_source_name or data_source_arn argument.")
    if ((database is None) and (table is None)) and (sql is None):
        raise exceptions.InvalidArgument("You must pass database/table OR sql argument.")
    if (database is not None) and (sql is not None):
        raise exceptions.InvalidArgument(
            "If you provide the sql argument, please include the database name inside the sql statement. "
            "Do NOT pass it in the database argument."
        )
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    if (data_source_arn is None) and (data_source_name is not None):
        data_source_arn = get_data_source_arn(name=data_source_name, account_id=account_id, boto3_session=session)
    if sql is not None:
        physical_table: Dict[str, Dict[str, Any]] = {
            "CustomSql": {
                "DataSourceArn": data_source_arn,
                "Name": sql_name,
                "SqlQuery": sql,
                "Columns": extract_athena_query_columns(
                    sql=sql,
                    data_source_arn=data_source_arn,  # type: ignore
                    account_id=account_id,
                    boto3_session=session,
                ),
            }
        }
    else:
        physical_table = {
            "RelationalTable": {
                "DataSourceArn": data_source_arn,
                "Schema": database,
                "Name": table,
                "InputColumns": extract_athena_table_columns(
                    database=database,  # type: ignore
                    table=table,  # type: ignore
                    boto3_session=session,
                ),
            }
        }
    table_uuid: str = uuid.uuid4().hex
    dataset_id: str = uuid.uuid4().hex
    args: Dict[str, Any] = {
        "AwsAccountId": account_id,
        "DataSetId": dataset_id,
        "Name": name,
        "ImportMode": import_mode,
        "PhysicalTableMap": {table_uuid: physical_table},
        "LogicalTableMap": {table_uuid: {"Alias": logical_table_alias, "Source": {"PhysicalTableId": table_uuid}}},
    }
    trans: List[Dict[str, Dict[str, Any]]] = _generate_transformations(
        rename_columns=rename_columns, cast_columns_types=cast_columns_types
    )
    if trans:
        args["LogicalTableMap"][table_uuid]["DataTransforms"] = trans
    permissions: List[Dict[str, Union[str, List[str]]]] = _generate_permissions(
        resource="dataset",
        account_id=account_id,
        boto3_session=session,
        allowed_to_use=allowed_to_use,
        allowed_to_manage=allowed_to_manage,
    )
    if permissions:
        args["Permissions"] = permissions
    if tags is not None:
        _tags: List[Dict[str, str]] = [{"Key": k, "Value": v} for k, v in tags.items()]
        args["Tags"] = _tags
    client.create_data_set(**args)
    return dataset_id
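# Usage sketch (hypothetical names): a SQL-backed dataset with a column rename,
# granting manage rights to a single QuickSight user. Since the database argument
# is omitted, the SQL statement carries the database name itself.
import awswrangler as wr

dataset_id = wr.quicksight.create_athena_dataset(
    name="my-dataset",
    sql="SELECT col0 AS id, col1 FROM my_db.my_table",
    data_source_name="my-data-source",
    rename_columns={"col1": "value"},
    allowed_to_manage=["Mary"],
)
print(dataset_id)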
def get_redshift_temp_engine(
    cluster_identifier: str,
    user: str,
    database: Optional[str] = None,
    duration: int = 900,
    auto_create: bool = True,
    db_groups: Optional[List[str]] = None,
    boto3_session: Optional[boto3.Session] = None,
    **sqlalchemy_kwargs: Any,
) -> sqlalchemy.engine.Engine:
    """Get a SQLAlchemy Engine for a Redshift cluster using temporary credentials.

    Parameters
    ----------
    cluster_identifier : str
        The unique identifier of a cluster. This parameter is case sensitive.
    user : str
        The name of a database user.
    database : str, optional
        Database name. If None, the default Database is used.
    duration : int, optional
        The number of seconds until the returned temporary password expires.
        Constraint: minimum 900, maximum 3600. Default: 900
    auto_create : bool
        Create a database user with the name specified for the user named in user if one does not exist.
    db_groups : List[str], optional
        A list of the names of existing database groups that the user named in DbUser will join for the current
        session, in addition to any group memberships for an existing user.
        If not specified, a new user is added only to PUBLIC.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    sqlalchemy_kwargs
        Keyword arguments forwarded to sqlalchemy.create_engine().
        https://docs.sqlalchemy.org/en/13/core/engines.html

    Returns
    -------
    sqlalchemy.engine.Engine
        SQLAlchemy Engine.

    Examples
    --------
    >>> import awswrangler as wr
    >>> engine = wr.db.get_redshift_temp_engine('my_cluster', 'my_user')

    """
    client_redshift: boto3.client = _utils.client(service_name="redshift", session=boto3_session)
    args: Dict[str, Any] = {
        "DbUser": user,
        "ClusterIdentifier": cluster_identifier,
        "DurationSeconds": duration,
        "AutoCreate": auto_create,
    }
    if db_groups is not None:
        args["DbGroups"] = db_groups
    res: Dict[str, Any] = client_redshift.get_cluster_credentials(**args)
    _user: str = _quote_plus(res["DbUser"])
    password: str = _quote_plus(res["DbPassword"])
    cluster: Dict[str, Any] = client_redshift.describe_clusters(ClusterIdentifier=cluster_identifier)["Clusters"][0]
    host: str = cluster["Endpoint"]["Address"]
    port: int = cluster["Endpoint"]["Port"]
    if database is None:
        database = cluster["DBName"]
    conn_str: str = f"redshift+psycopg2://{_user}:{password}@{host}:{port}/{database}"
    sqlalchemy_kwargs["executemany_mode"] = "values"
    sqlalchemy_kwargs["executemany_values_page_size"] = 100_000
    return sqlalchemy.create_engine(conn_str, **sqlalchemy_kwargs)
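# Usage sketch (hypothetical cluster/user): the engine embeds short-lived
# credentials from GetClusterCredentials, so recreate it once `duration` expires.
import pandas as pd

import awswrangler as wr

engine = wr.db.get_redshift_temp_engine("my_cluster", "my_user", duration=900)
df = pd.read_sql("SELECT 1 AS one", con=engine)
print(df)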
def write_redshift_copy_manifest(
    manifest_path: str,
    paths: List[str],
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
) -> Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]:
    """Write Redshift copy manifest and return its structure.

    Only Parquet files are supported.

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    manifest_path : str
        Amazon S3 manifest path (e.g. s3://...)
    paths : List[str]
        List of S3 paths (Parquet files) to be copied.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    s3_additional_kwargs:
        Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass",
        "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging".
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}

    Returns
    -------
    Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]
        Manifest content.

    Examples
    --------
    Copying two files to Redshift cluster.

    >>> import awswrangler as wr
    >>> wr.db.write_redshift_copy_manifest(
    ...     manifest_path="s3://bucket/my.manifest",
    ...     paths=["s3://...parquet", "s3://...parquet"]
    ... )

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    objects_sizes: Dict[str, Optional[int]] = s3.size_objects(
        path=paths, use_threads=use_threads, boto3_session=session
    )
    manifest: Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]] = {"entries": []}
    path: str
    size: Optional[int]
    for path, size in objects_sizes.items():
        if size is not None:
            entry: Dict[str, Union[str, bool, Dict[str, int]]] = {
                "url": path,
                "mandatory": True,
                "meta": {"content_length": size},
            }
            manifest["entries"].append(entry)
    payload: str = json.dumps(manifest)
    bucket: str
    bucket, key = _utils.parse_path(manifest_path)
    additional_kwargs: Dict[str, str] = {} if s3_additional_kwargs is None else s3_additional_kwargs
    _logger.debug("payload: %s", payload)
    client_s3: boto3.client = _utils.client(service_name="s3", session=session)
    _logger.debug("bucket: %s", bucket)
    _logger.debug("key: %s", key)
    client_s3.put_object(Body=payload, Bucket=bucket, Key=key, **additional_kwargs)
    return manifest
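# Usage sketch (hypothetical S3 paths): the manifest lists each Parquet file with
# its content length, which lets Redshift COPY validate the objects it loads.
import awswrangler as wr

manifest = wr.db.write_redshift_copy_manifest(
    manifest_path="s3://bucket/my.manifest",
    paths=["s3://bucket/file0.parquet", "s3://bucket/file1.parquet"],
)
print(len(manifest["entries"]))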
def _start_query_execution(
    sql: str,
    wg_config: _WorkGroupConfig,
    database: Optional[str] = None,
    data_source: Optional[str] = None,
    s3_output: Optional[str] = None,
    workgroup: Optional[str] = None,
    encryption: Optional[str] = None,
    kms_key: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> str:
    args: Dict[str, Any] = {"QueryString": sql}
    session: boto3.Session = _utils.ensure_session(session=boto3_session)

    # s3_output
    args["ResultConfiguration"] = {
        "OutputLocation": _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=session)
    }

    # encryption
    if wg_config.enforced is True:
        if wg_config.encryption is not None:
            args["ResultConfiguration"]["EncryptionConfiguration"] = {"EncryptionOption": wg_config.encryption}
            if wg_config.kms_key is not None:
                args["ResultConfiguration"]["EncryptionConfiguration"]["KmsKey"] = wg_config.kms_key
    else:
        if encryption is not None:
            args["ResultConfiguration"]["EncryptionConfiguration"] = {"EncryptionOption": encryption}
            if kms_key is not None:
                args["ResultConfiguration"]["EncryptionConfiguration"]["KmsKey"] = kms_key

    # database
    if database is not None:
        args["QueryExecutionContext"] = {"Database": database}
        if data_source is not None:
            args["QueryExecutionContext"]["Catalog"] = data_source

    # workgroup
    if workgroup is not None:
        args["WorkGroup"] = workgroup

    client_athena: boto3.client = _utils.client(service_name="athena", session=session)
    _logger.debug("args: \n%s", pprint.pformat(args))
    response: Dict[str, Any] = _utils.try_it(
        f=client_athena.start_query_execution,
        ex=botocore.exceptions.ClientError,
        ex_code="ThrottlingException",
        max_num_tries=5,
        **args,
    )
    return cast(str, response["QueryExecutionId"])
def create_athena_data_source(
    name: str,
    workgroup: str = "primary",
    allowed_to_use: Optional[List[str]] = None,
    allowed_to_manage: Optional[List[str]] = None,
    tags: Optional[Dict[str, str]] = None,
    account_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Create a QuickSight data source pointing to an Athena/Workgroup.

    Note
    ----
    You will not be able to see the data source in the console
    if you do not pass your user to one of the ``allowed_*`` arguments.

    Parameters
    ----------
    name : str
        Data source name.
    workgroup : str
        Athena workgroup.
    tags : Dict[str, str], optional
        Key/Value collection to put on the Cluster.
        e.g. {"foo": "boo", "bar": "xoo"}
    allowed_to_use : optional
        List of principals that will be allowed to see and use the data source.
        e.g. ["John"]
    allowed_to_manage : optional
        List of principals that will be allowed to see, use, update and delete the data source.
        e.g. ["Mary"]
    account_id : str, optional
        If None, the account ID will be inferred from your boto3 session.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.quicksight.create_athena_data_source(
    ...     name="...",
    ...     allowed_to_manage=["john"]
    ... )

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client: boto3.client = _utils.client(service_name="quicksight", session=session)
    if account_id is None:
        account_id = sts.get_account_id(boto3_session=session)
    args: Dict[str, Any] = {
        "AwsAccountId": account_id,
        "DataSourceId": name,
        "Name": name,
        "Type": "ATHENA",
        "DataSourceParameters": {"AthenaParameters": {"WorkGroup": workgroup}},
        "SslProperties": {"DisableSsl": True},
    }
    permissions: List[Dict[str, Union[str, List[str]]]] = _generate_permissions(
        resource="data_source",
        account_id=account_id,
        boto3_session=session,
        allowed_to_use=allowed_to_use,
        allowed_to_manage=allowed_to_manage,
    )
    if permissions:
        args["Permissions"] = permissions
    if tags is not None:
        _tags: List[Dict[str, str]] = [{"Key": k, "Value": v} for k, v in tags.items()]
        args["Tags"] = _tags
    client.create_data_source(**args)
def get_tables(
    catalog_id: Optional[str] = None,
    database: Optional[str] = None,
    transaction_id: Optional[str] = None,
    name_contains: Optional[str] = None,
    name_prefix: Optional[str] = None,
    name_suffix: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> Iterator[Dict[str, Any]]:
    """Get an iterator of tables.

    Note
    ----
    Please do not filter using name_contains and name_prefix/name_suffix at the same time.
    Only name_prefix and name_suffix can be combined together.

    Parameters
    ----------
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    database : str, optional
        Database name.
    transaction_id : str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    name_contains : str, optional
        Select by a specific string on table name.
    name_prefix : str, optional
        Select by a specific prefix on table name.
    name_suffix : str, optional
        Select by a specific suffix on table name.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Iterator[Dict[str, Any]]
        Iterator of tables.

    Examples
    --------
    >>> import awswrangler as wr
    >>> tables = wr.catalog.get_tables()

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    paginator = client_glue.get_paginator("get_tables")
    args: Dict[str, str] = {}
    if (name_prefix is not None) and (name_suffix is not None) and (name_contains is not None):
        raise exceptions.InvalidArgumentCombination(
            "Please do not filter using name_contains and name_prefix/name_suffix at the same time. "
            "Only name_prefix and name_suffix can be combined together."
        )
    if (name_prefix is not None) and (name_suffix is not None):
        args["Expression"] = f"{name_prefix}*{name_suffix}"
    elif name_contains is not None:
        args["Expression"] = f"*{name_contains}*"
    elif name_prefix is not None:
        args["Expression"] = f"{name_prefix}*"
    elif name_suffix is not None:
        args["Expression"] = f"*{name_suffix}"
    if database is not None:
        dbs: List[str] = [database]
    else:
        dbs = [x["Name"] for x in get_databases(catalog_id=catalog_id)]
    for db in dbs:
        args["DatabaseName"] = db
        response_iterator = paginator.paginate(
            **_catalog_id(catalog_id=catalog_id, **_transaction_id(transaction_id=transaction_id, **args))
        )
        try:
            for page in response_iterator:
                for tbl in page["TableList"]:
                    yield tbl
        except client_glue.exceptions.EntityNotFoundException:
            continue
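# Usage sketch (hypothetical prefix/suffix): name_prefix and name_suffix may be
# combined, but neither may be mixed with name_contains.
import awswrangler as wr

for tbl in wr.catalog.get_tables(database="default", name_prefix="sales_", name_suffix="_daily"):
    print(tbl["Name"])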
def read_parquet_table(
    table: str,
    database: str,
    catalog_id: Optional[str] = None,
    partition_filter: Optional[Callable[[Dict[str, str]], bool]] = None,
    columns: Optional[List[str]] = None,
    validate_schema: bool = True,
    categories: Optional[List[str]] = None,
    safe: bool = True,
    chunked: Union[bool, int] = False,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Read Apache Parquet table registered on AWS Glue Catalog.

    Note
    ----
    ``Batching`` (`chunked` argument) (Memory Friendly):

    Will enable the function to return an Iterable of DataFrames instead of a regular DataFrame.

    There are two batching strategies on Wrangler:

    - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset.

    - If **chunked=INTEGER**, Wrangler will paginate through files slicing and concatenating
      to return DataFrames with the number of rows equal to the received INTEGER.

    `P.S.` `chunked=True` is faster and uses less memory while `chunked=INTEGER` is more precise
    in the number of rows for each DataFrame.

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    table : str
        AWS Glue Catalog table name.
    database : str
        AWS Glue Catalog database name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    partition_filter : Optional[Callable[[Dict[str, str]], bool]]
        Callback Function filters to apply on PARTITION columns (PUSH-DOWN filter).
        This function MUST receive a single argument (Dict[str, str]) where keys are partitions
        names and values are partitions values. Partitions values will be always strings extracted from S3.
        This function MUST return a bool, True to read the partition or False to ignore it.
        Ignored if `dataset=False`.
        E.g ``lambda x: True if x["year"] == "2020" and x["month"] == "1" else False``
        https://github.com/awslabs/aws-data-wrangler/blob/master/tutorials/023%20-%20Flexible%20Partitions%20Filter.ipynb
    columns : List[str], optional
        Names of columns to read from the file(s).
    validate_schema : bool
        Check that individual file schemas are all the same / compatible. Schemas within a
        folder prefix should all be the same. Disable if you have schemas that are different
        and want to disable this check.
    categories : Optional[List[str]], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    safe : bool, default True
        For certain data types, a cast is needed in order to store the
        data in a pandas DataFrame or Series (e.g. timestamps are always
        stored as nanoseconds in pandas). This option controls whether it
        is a safe cast or not.
    chunked : Union[bool, int]
        If True, breaks the data into smaller DataFrames (non-deterministic number of lines).
        Otherwise, returns a single DataFrame with the whole data.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered.

    Returns
    -------
    Union[pandas.DataFrame, Generator[pandas.DataFrame, None, None]]
        Pandas DataFrame or a Generator in case of `chunked=True`.

    Examples
    --------
    Reading Parquet Table

    >>> import awswrangler as wr
    >>> df = wr.s3.read_parquet_table(database='...', table='...')

    Reading Parquet Table encrypted

    >>> import awswrangler as wr
    >>> df = wr.s3.read_parquet_table(
    ...     database='...',
    ...     table='...',
    ...     s3_additional_kwargs={
    ...         'ServerSideEncryption': 'aws:kms',
    ...         'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'
    ...     }
    ... )

    Reading Parquet Table in chunks (Chunk by file)

    >>> import awswrangler as wr
    >>> dfs = wr.s3.read_parquet_table(database='...', table='...', chunked=True)
    >>> for df in dfs:
    >>>     print(df)  # Smaller Pandas DataFrame

    Reading Parquet Table with PUSH-DOWN filter over partitions

    >>> import awswrangler as wr
    >>> my_filter = lambda x: True if x["city"].startswith("new") else False
    >>> df = wr.s3.read_parquet_table(database='...', table='...', partition_filter=my_filter)

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    args: Dict[str, Any] = {"DatabaseName": database, "Name": table}
    if catalog_id is not None:
        args["CatalogId"] = catalog_id
    res: Dict[str, Any] = client_glue.get_table(**args)
    try:
        path: str = res["Table"]["StorageDescriptor"]["Location"]
    except KeyError as ex:
        raise exceptions.InvalidTable(f"Missing s3 location for {database}.{table}.") from ex
    return _data_types.cast_pandas_with_athena_types(
        df=read_parquet(
            path=path,
            partition_filter=partition_filter,
            columns=columns,
            validate_schema=validate_schema,
            categories=categories,
            safe=safe,
            chunked=chunked,
            dataset=True,
            use_threads=use_threads,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
        ),
        dtype=_extract_partitions_dtypes_from_table_details(response=res),
    )
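# Usage sketch (hypothetical database/table names): chunked=True yields one
# DataFrame per file, keeping peak memory bounded for large tables.
import awswrangler as wr

for df in wr.s3.read_parquet_table(database="my_db", table="my_table", chunked=True):
    print(len(df))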
def table(
    database: str,
    table: str,
    transaction_id: Optional[str] = None,
    query_as_of_time: Optional[str] = None,
    catalog_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> pd.DataFrame:
    """Get table details as Pandas DataFrame.

    Note
    ----
    If reading from a governed table, pass only one of `transaction_id` or `query_as_of_time`.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    transaction_id : str, optional
        The ID of the transaction (i.e. used with GOVERNED tables).
    query_as_of_time : str, optional
        The time as of when to read the table contents. Must be a valid Unix epoch timestamp.
        Cannot be specified alongside transaction_id.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    pandas.DataFrame
        Pandas DataFrame filled with formatted table information.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df_table = wr.catalog.table(database='default', table='my_table')

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    tbl = client_glue.get_table(
        **_catalog_id(
            catalog_id=catalog_id,
            **_transaction_id(
                transaction_id=transaction_id, query_as_of_time=query_as_of_time, DatabaseName=database, Name=table
            ),
        )
    )["Table"]
    df_dict: Dict[str, List[Union[str, bool]]] = {"Column Name": [], "Type": [], "Partition": [], "Comment": []}
    if "StorageDescriptor" in tbl:
        for col in tbl["StorageDescriptor"].get("Columns", {}):
            df_dict["Column Name"].append(col["Name"])
            df_dict["Type"].append(col["Type"])
            df_dict["Partition"].append(False)
            if "Comment" in col:
                df_dict["Comment"].append(col["Comment"])
            else:
                df_dict["Comment"].append("")
    if "PartitionKeys" in tbl:
        for col in tbl["PartitionKeys"]:
            df_dict["Column Name"].append(col["Name"])
            df_dict["Type"].append(col["Type"])
            df_dict["Partition"].append(True)
            if "Comment" in col:
                df_dict["Comment"].append(col["Comment"])
            else:
                df_dict["Comment"].append("")
    return pd.DataFrame(data=df_dict)
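# Usage sketch (hypothetical database/table names): the returned frame flags
# partition columns, which makes it easy to separate them from regular columns.
import awswrangler as wr

df = wr.catalog.table(database="default", table="my_table")
print(df[df["Partition"]]["Column Name"].tolist())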