def read_sql_table(
    table: str,
    database: str,
    ctas_approach: bool = True,
    categories: Optional[List[str]] = None,
    chunksize: Optional[Union[int, bool]] = None,
    s3_output: Optional[str] = None,
    workgroup: Optional[str] = None,
    encryption: Optional[str] = None,
    kms_key: Optional[str] = None,
    keep_files: bool = True,
    ctas_temp_table_name: Optional[str] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    max_cache_seconds: int = 0,
    max_cache_query_inspections: int = 50,
    max_remote_cache_entries: int = 50,
    max_local_cache_entries: int = 100,
    data_source: Optional[str] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Extract the full table from AWS Athena and return the results as a Pandas DataFrame.

    **Related tutorial:**

    - `Amazon Athena <https://aws-data-wrangler.readthedocs.io/en/2.5.0/
      tutorials/006%20-%20Amazon%20Athena.html>`_
    - `Athena Cache <https://aws-data-wrangler.readthedocs.io/en/2.5.0/
      tutorials/019%20-%20Athena%20Cache.html>`_
    - `Global Configurations <https://aws-data-wrangler.readthedocs.io/en/2.5.0/
      tutorials/021%20-%20Global%20Configurations.html>`_

    **There are two approaches to be defined through ctas_approach parameter:**

    **1** - ctas_approach=True (Default):

    Wrap the query with a CTAS and then reads the table data as parquet directly from s3.

    PROS:

    - Faster for mid and big result sizes.
    - Can handle some level of nested types.

    CONS:

    - Requires create/delete table permissions on Glue.
    - Does not support timestamp with time zone.
    - Does not support columns with repeated names.
    - Does not support columns with undefined data types.
    - A temporary table will be created and then deleted immediately.

    **2** - ctas_approach=False:

    Does a regular query on Athena and parse the regular CSV result on s3.

    PROS:

    - Faster for small result sizes (less latency).
    - Does not require create/delete table permissions on Glue.
    - Supports timestamp with time zone.

    CONS:

    - Slower for big results (But still faster than other libraries that use the regular Athena's API).
    - Does not handle nested types at all.

    Note
    ----
    The resulting DataFrame (or every DataFrame in the returned Iterator for chunked queries)
    have a `query_metadata` attribute, which brings the query result metadata returned by
    `Boto3/Athena <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services
    /athena.html#Athena.Client.get_query_execution>`_ .

    For a practical example check out the
    `related tutorial <https://aws-data-wrangler.readthedocs.io/en/2.5.0/
    tutorials/024%20-%20Athena%20Query%20Metadata.html>`_!

    Note
    ----
    Valid encryption modes: [None, 'SSE_S3', 'SSE_KMS'].

    `P.S. 'CSE_KMS' is not supported.`

    Note
    ----
    Create the default Athena bucket if it doesn't exist and s3_output is None.

    (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)

    Note
    ----
    `chunksize` argument (Memory Friendly) (i.e batching):

    Return an Iterable of DataFrames instead of a regular DataFrame.

    There are two batching strategies:

    - If **chunksize=True**, a new DataFrame will be returned for each file in the query result.

    - If **chunksize=INTEGER**, Wrangler will iterate on the data by number of rows equal to the
      received INTEGER.

    `P.S.` `chunksize=True` is faster and uses less memory while `chunksize=INTEGER` is more
    precise in number of rows for each DataFrame.

    `P.P.S.` If `ctas_approach=False` and `chunksize=True`, you will always receive an iterator
    with a single DataFrame because regular Athena queries only produce a single output file.

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    table : str
        Table name.
    database : str
        AWS Glue/Athena database name.
    ctas_approach: bool
        Wraps the query using a CTAS, and read the resulted parquet data on S3.
        If false, read the regular CSV on S3.
    categories: List[str], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    chunksize : Union[int, bool], optional
        If passed will split the data in a Iterable of DataFrames (Memory friendly).
        If `True` wrangler will iterate on the data by files in the most efficient way
        without guarantee of chunksize.
        If an `INTEGER` is passed Wrangler will iterate on the data by number of rows
        equal to the received INTEGER.
    s3_output : str, optional
        AWS S3 path.
    workgroup : str, optional
        Athena workgroup.
    encryption : str, optional
        Valid values: [None, 'SSE_S3', 'SSE_KMS']. Notice: 'CSE_KMS' is not supported.
    kms_key : str, optional
        For SSE-KMS, this is the KMS key ARN or ID.
    keep_files : bool
        Should Wrangler delete or keep the staging files produced by Athena?
    ctas_temp_table_name : str, optional
        The name of the temporary table and also the directory name on S3 where the CTAS result is stored.
        If None, it will use the follow random pattern: `f"temp_table_{uuid.uuid4().hex}"`.
        On S3 this directory will be under the pattern: `f"{s3_output}/{ctas_temp_table_name}/"`.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    max_cache_seconds: int
        Wrangler can look up in Athena's history if this table has been read before.
        If so, and its completion time is less than `max_cache_seconds` before now, wrangler
        skips query execution and just returns the same results as last time.
        If cached results are valid, wrangler ignores the `ctas_approach`, `s3_output`,
        `encryption`, `kms_key`, `keep_files` and `ctas_temp_table_name` params.
        If reading cached data fails for any reason, execution falls back to the usual query run path.
    max_cache_query_inspections : int
        Max number of queries that will be inspected from the history to try to find some result to reuse.
        The bigger the number of inspection, the bigger will be the latency for not cached queries.
        Only takes effect if max_cache_seconds > 0.
    max_remote_cache_entries : int
        Max number of queries that will be retrieved from AWS for cache inspection.
        The bigger the number of inspection, the bigger will be the latency for not cached queries.
        Only takes effect if max_cache_seconds > 0 and default value is 50.
    max_local_cache_entries : int
        Max number of queries for which metadata will be cached locally. This will reduce the latency and also
        enables keeping more than `max_remote_cache_entries` available for the cache. This value should not be
        smaller than max_remote_cache_entries. Only takes effect if max_cache_seconds > 0 and default value is 100.
    data_source : str, optional
        Data Source / Catalog name. If None, 'AwsDataCatalog' will be used by default.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forward to botocore requests. Valid parameters: "RequestPayer", "ExpectedBucketOwner".
        e.g. s3_additional_kwargs={'RequestPayer': 'requester'}

    Returns
    -------
    Union[pd.DataFrame, Iterator[pd.DataFrame]]
        Pandas DataFrame or Generator of Pandas DataFrames if chunksize is passed.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df = wr.athena.read_sql_table(table="...", database="...")
    >>> scanned_bytes = df.query_metadata["Statistics"]["DataScannedInBytes"]

    """
    # Sanitize the table name into a valid Athena identifier, then delegate
    # everything (caching, CTAS handling, chunking) to `read_sql_query`.
    table = catalog.sanitize_table_name(table=table)
    return read_sql_query(
        sql=f'SELECT * FROM "{table}"',
        database=database,
        data_source=data_source,
        ctas_approach=ctas_approach,
        categories=categories,
        chunksize=chunksize,
        s3_output=s3_output,
        workgroup=workgroup,
        encryption=encryption,
        kms_key=kms_key,
        keep_files=keep_files,
        ctas_temp_table_name=ctas_temp_table_name,
        use_threads=use_threads,
        boto3_session=boto3_session,
        max_cache_seconds=max_cache_seconds,
        max_cache_query_inspections=max_cache_query_inspections,
        max_remote_cache_entries=max_remote_cache_entries,
        max_local_cache_entries=max_local_cache_entries,
        s3_additional_kwargs=s3_additional_kwargs,
    )
def read_sql_table(
    table: str,
    database: str,
    transaction_id: Optional[str] = None,
    query_as_of_time: Optional[str] = None,
    catalog_id: Optional[str] = None,
    categories: Optional[List[str]] = None,
    safe: bool = True,
    map_types: bool = True,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> pd.DataFrame:
    """Load every row of an AWS Glue table into a Pandas DataFrame.

    The read can be pinned either to a transaction (`transaction_id`) or to a
    point in time (`query_as_of_time`).

    Note
    ----
    ORDER BY operations are not honoured.
    i.e. sql="SELECT * FROM my_table ORDER BY my_column" is NOT valid

    Note
    ----
    Pass one of `transaction_id` or `query_as_of_time`, not both.

    Parameters
    ----------
    table : str
        AWS Glue table name.
    database : str
        AWS Glue database name.
    transaction_id : str, optional
        The ID of the transaction at which to read the table contents.
        Cannot be specified alongside query_as_of_time.
    query_as_of_time : str, optional
        The time as of when to read the table contents. Must be a valid Unix epoch timestamp.
        Cannot be specified alongside transaction_id.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    categories: Optional[List[str]], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    safe : bool, default True
        For certain data types, a cast is needed in order to store the
        data in a pandas DataFrame or Series (e.g. timestamps are always
        stored as nanoseconds in pandas). This option controls whether it
        is a safe cast or not.
    map_types : bool, default True
        True to convert pyarrow DataTypes to pandas ExtensionDtypes. It is
        used to override the default pandas type for conversion of built-in
        pyarrow types or in absence of pandas_metadata in the Table schema.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        When enabled, os.cpu_count() is used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session is used if boto3_session receives None.

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df = wr.lakeformation.read_sql_table(
    ...     table="my_table",
    ...     database="my_db",
    ...     catalog_id="111111111111",
    ... )

    >>> import awswrangler as wr
    >>> df = wr.lakeformation.read_sql_table(
    ...     table="my_table",
    ...     database="my_db",
    ...     transaction_id="1b62811fa3e02c4e5fdbaa642b752030379c4a8a70da1f8732ce6ccca47afdc9",
    ... )

    >>> import awswrangler as wr
    >>> df = wr.lakeformation.read_sql_table(
    ...     table="my_table",
    ...     database="my_db",
    ...     query_as_of_time="1611142914",
    ...     use_threads=True,
    ... )

    """
    # Make the table name a valid identifier before embedding it in the query,
    # then hand the full-scan SELECT off to `read_sql_query`.
    sanitized_name: str = catalog.sanitize_table_name(table=sanitize_input(table) if False else table)
    return read_sql_query(
        sql=f"SELECT * FROM {sanitized_name}",
        database=database,
        transaction_id=transaction_id,
        query_as_of_time=query_as_of_time,
        safe=safe,
        map_types=map_types,
        catalog_id=catalog_id,
        categories=categories,
        use_threads=use_threads,
        boto3_session=boto3_session,
    )
def _resolve_query_without_cache(  # pylint: disable=too-many-branches,too-many-locals,too-many-return-statements,too-many-statements
    sql: str,
    database: str,
    data_source: Optional[str],
    ctas_approach: bool,
    categories: Optional[List[str]],
    chunksize: Union[int, bool, None],
    s3_output: Optional[str],
    workgroup: Optional[str],
    encryption: Optional[str],
    kms_key: Optional[str],
    keep_files: bool,
    ctas_temp_table_name: Optional[str],
    use_threads: bool,
    s3_additional_kwargs: Optional[Dict[str, Any]],
    boto3_session: boto3.Session,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """
    Execute a query in Athena and returns results as DataFrame, back to `read_sql_query`.

    Usually called by `read_sql_query` when using cache is not possible.
    Dispatches to the CTAS strategy or the regular-query strategy depending on `ctas_approach`.
    """
    # Resolve workgroup settings and the S3 staging location, dropping a single
    # trailing slash so paths can be composed with "/" safely.
    wg_config: _WorkGroupConfig = _get_workgroup_config(session=boto3_session, workgroup=workgroup)
    output_location: str = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=boto3_session)
    if output_location[-1] == "/":
        output_location = output_location[:-1]
    # Arguments shared by both resolution strategies.
    common_args: Dict[str, Any] = dict(
        sql=sql,
        database=database,
        data_source=data_source,
        s3_output=output_location,
        keep_files=keep_files,
        chunksize=chunksize,
        categories=categories,
        encryption=encryption,
        workgroup=workgroup,
        kms_key=kms_key,
        wg_config=wg_config,
        use_threads=use_threads,
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    )
    if ctas_approach is not True:
        return _resolve_query_without_cache_regular(**common_args)
    # CTAS path: pick (or generate) the temporary table name up front so the
    # finally-clause can always clean it up, even if the read fails.
    if ctas_temp_table_name is not None:
        temp_name: str = catalog.sanitize_table_name(ctas_temp_table_name)
    else:
        temp_name = f"temp_table_{uuid.uuid4().hex}"
    try:
        return _resolve_query_without_cache_ctas(name=temp_name, **common_args)
    finally:
        catalog.delete_table_if_exists(database=database, table=temp_name, boto3_session=boto3_session)
def read_sql_table(
    table: str,
    database: str,
    ctas_approach: bool = True,
    categories: Optional[List[str]] = None,
    chunksize: Optional[Union[int, bool]] = None,
    s3_output: Optional[str] = None,
    workgroup: Optional[str] = None,
    encryption: Optional[str] = None,
    kms_key: Optional[str] = None,
    keep_files: bool = True,
    ctas_temp_table_name: Optional[str] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    max_cache_seconds: int = 0,
    max_cache_query_inspections: int = 50,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Extract the full table from AWS Athena and return the results as a Pandas DataFrame.

    There are two approaches to be defined through ctas_approach parameter:

    1 - `ctas_approach=True` (`Default`):
    Wrap the query with a CTAS and then reads the table data as parquet directly from s3.
    PROS: Faster and can handle some level of nested types.
    CONS: Requires create/delete table permissions on Glue and
    does not support timestamp with time zone
    (A temporary table will be created and then deleted immediately).

    2 - `ctas_approach=False`:
    Does a regular query on Athena and parse the regular CSV result on s3.
    PROS: Does not require create/delete table permissions on Glue and
    supports timestamp with time zone.
    CONS: Slower (But still faster than other libraries that use the regular Athena API)
    and does not handle nested types at all.

    Note
    ----
    Valid encryption modes: [None, 'SSE_S3', 'SSE_KMS'].

    `P.S. 'CSE_KMS' is not supported.`

    Note
    ----
    Create the default Athena bucket if it doesn't exist and s3_output is None.

    (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)

    Note
    ----
    ``Batching`` (`chunksize` argument) (Memory Friendly):

    Will enable the function to return an Iterable of DataFrames instead of a regular DataFrame.

    There are two batching strategies on Wrangler:

    - If **chunksize=True**, a new DataFrame will be returned for each file in the query result.

    - If **chunksize=INTEGER**, Wrangler will iterate on the data by number of rows equal to the
      received INTEGER.

    `P.S.` `chunksize=True` is faster and uses less memory while `chunksize=INTEGER` is more
    precise in number of rows for each DataFrame.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained
    from os.cpu_count().

    Parameters
    ----------
    table : str
        Table name.
    database : str
        AWS Glue/Athena database name.
    ctas_approach: bool
        Wraps the query using a CTAS, and read the resulted parquet data on S3.
        If false, read the regular CSV on S3.
    categories: List[str], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    chunksize : Union[int, bool], optional
        If passed will split the data in a Iterable of DataFrames (Memory friendly).
        If `True` wrangler will iterate on the data by files in the most efficient way
        without guarantee of chunksize.
        If an `INTEGER` is passed Wrangler will iterate on the data by number of rows
        equal to the received INTEGER.
    s3_output : str, optional
        AWS S3 path.
    workgroup : str, optional
        Athena workgroup.
    encryption : str, optional
        Valid values: [None, 'SSE_S3', 'SSE_KMS']. Notice: 'CSE_KMS' is not supported.
    kms_key : str, optional
        For SSE-KMS, this is the KMS key ARN or ID.
    keep_files : bool
        Should Wrangler delete or keep the staging files produced by Athena?
    ctas_temp_table_name : str, optional
        The name of the temporary table and also the directory name on S3 where the CTAS result is stored.
        If None, it will use the follow random pattern: `f"temp_table_{pyarrow.compat.guid()}"`.
        On S3 this directory will be under the pattern: `f"{s3_output}/{ctas_temp_table_name}/"`.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session.
        The default boto3 session will be used if boto3_session receives None.
    max_cache_seconds: int
        Wrangler can look up in Athena's history if this table has been read before.
        If so, and its completion time is less than `max_cache_seconds` before now, wrangler
        skips query execution and just returns the same results as last time.
        If cached results are valid, wrangler ignores the `ctas_approach`, `s3_output`,
        `encryption`, `kms_key`, `keep_files` and `ctas_temp_table_name` params.
        If reading cached data fails for any reason, execution falls back to the usual query run path.
    max_cache_query_inspections : int
        Max number of queries that will be inspected from the history to try to find some result to reuse.
        The bigger the number of inspection, the bigger will be the latency for not cached queries.
        Only takes effect if max_cache_seconds > 0.

    Returns
    -------
    Union[pd.DataFrame, Iterator[pd.DataFrame]]
        Pandas DataFrame or Generator of Pandas DataFrames if chunksize is passed.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df = wr.athena.read_sql_table(table='...', database='...')

    """
    # Sanitize the table name into a valid Athena identifier, then delegate the
    # full-scan SELECT (caching, CTAS handling, chunking) to `read_sql_query`.
    table = catalog.sanitize_table_name(table=table)
    return read_sql_query(
        sql=f'SELECT * FROM "{table}"',
        database=database,
        ctas_approach=ctas_approach,
        categories=categories,
        chunksize=chunksize,
        s3_output=s3_output,
        workgroup=workgroup,
        encryption=encryption,
        kms_key=kms_key,
        keep_files=keep_files,
        ctas_temp_table_name=ctas_temp_table_name,
        use_threads=use_threads,
        boto3_session=boto3_session,
        max_cache_seconds=max_cache_seconds,
        max_cache_query_inspections=max_cache_query_inspections,
    )
def _resolve_query_without_cache(  # pylint: disable=too-many-branches,too-many-locals,too-many-return-statements,too-many-statements
    sql: str,
    database: str,
    ctas_approach: bool,
    categories: Optional[List[str]],
    chunksize: Optional[Union[int, bool]],
    s3_output: Optional[str],
    workgroup: Optional[str],
    encryption: Optional[str],
    kms_key: Optional[str],
    keep_files: bool,
    ctas_temp_table_name: Optional[str],
    use_threads: bool,
    session: Optional[boto3.Session],
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """
    Execute any query in Athena and return the results as a DataFrame, back to `read_sql_query`.

    Usually called by `read_sql_query` when using cache is not possible.
    """
    # Resolve workgroup settings and the S3 staging location; drop a single
    # trailing slash so paths can be composed with "/" below.
    wg_config: Dict[str, Union[bool, Optional[str]]] = _get_workgroup_config(session=session, workgroup=workgroup)
    _s3_output: str = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=session)
    _s3_output = _s3_output[:-1] if _s3_output[-1] == "/" else _s3_output
    name: str = ""
    if ctas_approach is True:
        # CTAS path: wrap the user's query so the result lands on S3 as Parquet.
        if ctas_temp_table_name is not None:
            name = catalog.sanitize_table_name(ctas_temp_table_name)
        else:
            name = f"temp_table_{pa.compat.guid()}"
        path: str = f"{_s3_output}/{name}"
        # A workgroup that enforces its own output location forbids external_location.
        ext_location: str = "\n" if wg_config["enforced"] is True else f",\n    external_location = '{path}'\n"
        sql = (
            f'CREATE TABLE "{name}"\n'
            f"WITH(\n"
            f"    format = 'Parquet',\n"
            f"    parquet_compression = 'SNAPPY'"
            f"{ext_location}"
            f") AS\n"
            f"{sql}"
        )
    _logger.debug("sql: %s", sql)
    query_id: str = _start_query_execution(
        sql=sql,
        wg_config=wg_config,
        database=database,
        s3_output=_s3_output,
        workgroup=workgroup,
        encryption=encryption,
        kms_key=kms_key,
        boto3_session=session,
    )
    _logger.debug("query_id: %s", query_id)
    try:
        query_response: Dict[str, Any] = wait_query(query_execution_id=query_id, boto3_session=session)
    except exceptions.QueryFailed as ex:
        # Translate the two CTAS-specific failure modes into friendlier errors.
        if ctas_approach is True:
            if "Column name not specified" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns names in your query. (E.g. 'SELECT MAX(col1) AS max_col1, ...')"
                )
            if "Column type is unknown" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns types in your query. "
                    "(E.g. 'SELECT CAST(NULL AS INTEGER) AS MY_COL, ...')"
                )
        raise ex  # pragma: no cover
    if query_response["QueryExecution"]["Status"]["State"] in ["FAILED", "CANCELLED"]:  # pragma: no cover
        reason: str = query_response["QueryExecution"]["Status"]["StateChangeReason"]
        message_error: str = f"Query error: {reason}"
        raise exceptions.AthenaQueryError(message_error)
    ret: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    if ctas_approach is True:
        # CTAS result: drop the temporary Glue table, then read the Parquet
        # files listed in the query manifest.
        catalog.delete_table_if_exists(database=database, table=name, boto3_session=session)
        manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv"
        metadata_path: str = f"{_s3_output}/tables/{query_id}.metadata"
        _logger.debug("manifest_path: %s", manifest_path)
        _logger.debug("metadata_path: %s", metadata_path)
        s3.wait_objects_exist(paths=[manifest_path, metadata_path], use_threads=False, boto3_session=session)
        paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session)
        chunked: Union[bool, int] = False if chunksize is None else chunksize
        _logger.debug("chunked: %s", chunked)
        if not paths:
            # Empty result set: return an empty DataFrame (or generator when chunking).
            if chunked is False:
                return pd.DataFrame()
            return _utils.empty_generator()
        s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
        ret = s3.read_parquet(
            path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories
        )
        paths_delete: List[str] = paths + [manifest_path, metadata_path]
        _logger.debug(type(ret))
        if chunked is False:
            if keep_files is False:
                s3.delete_objects(path=paths_delete, use_threads=use_threads, boto3_session=session)
            return ret
        if keep_files is False:
            # Chunked read: staging files can only be removed after full iteration.
            return _delete_after_iterate(dfs=ret, paths=paths_delete, use_threads=use_threads, boto3_session=session)
        return ret
    # Regular path: parse the CSV output, casting types per the query metadata.
    dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata(
        query_execution_id=query_id, categories=categories, boto3_session=session
    )
    path = f"{_s3_output}/{query_id}.csv"
    s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session)
    _logger.debug("Start CSV reading from %s", path)
    _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None
    _logger.debug("_chunksize: %s", _chunksize)
    ret = s3.read_csv(
        path=[path],
        dtype=dtype,
        parse_dates=parse_timestamps,
        converters=converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        na_values=[""],
        chunksize=_chunksize,
        skip_blank_lines=False,
        use_threads=False,
        boto3_session=session,
    )
    _logger.debug("Start type casting...")
    _logger.debug(type(ret))
    if chunksize is None:
        df = _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries)
        if keep_files is False:
            s3.delete_objects(path=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session)
        return df
    dfs = _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries)
    if keep_files is False:
        return _delete_after_iterate(
            dfs=dfs, paths=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session
        )
    return dfs
def read_sql_query(  # pylint: disable=too-many-branches,too-many-locals,too-many-return-statements,too-many-statements
    sql: str,
    database: str,
    ctas_approach: bool = True,
    categories: Optional[List[str]] = None,
    chunksize: Optional[Union[int, bool]] = None,
    s3_output: Optional[str] = None,
    workgroup: Optional[str] = None,
    encryption: Optional[str] = None,
    kms_key: Optional[str] = None,
    keep_files: bool = True,
    ctas_temp_table_name: Optional[str] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Execute any SQL query on AWS Athena and return the results as a Pandas DataFrame.

    There are two approaches to be defined through ctas_approach parameter:

    1 - `ctas_approach=True` (`Default`):
    Wrap the query with a CTAS and then reads the table data as parquet directly from s3.
    PROS: Faster and can handle some level of nested types.
    CONS: Requires create/delete table permissions on Glue and
    does not support timestamp with time zone
    (A temporary table will be created and then deleted immediately).

    2 - `ctas_approach=False`:
    Does a regular query on Athena and parse the regular CSV result on s3.
    PROS: Does not require create/delete table permissions on Glue and
    supports timestamp with time zone.
    CONS: Slower (But still faster than other libraries that use the regular Athena API)
    and does not handle nested types at all.

    Note
    ----
    Valid encryption modes: [None, 'SSE_S3', 'SSE_KMS'].

    `P.S. 'CSE_KMS' is not supported.`

    Note
    ----
    Create the default Athena bucket if it doesn't exist and s3_output is None.

    (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)

    Note
    ----
    ``Batching`` (`chunksize` argument) (Memory Friendly):

    Will enable the function to return an Iterable of DataFrames instead of a regular DataFrame.

    There are two batching strategies on Wrangler:

    - If **chunksize=True**, a new DataFrame will be returned for each file in the query result.

    - If **chunksize=INTEGER**, Wrangler will iterate on the data by number of rows equal to the
      received INTEGER.

    `P.S.` `chunksize=True` is faster and uses less memory while `chunksize=INTEGER` is more
    precise in number of rows for each DataFrame.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained
    from os.cpu_count().

    Parameters
    ----------
    sql : str
        SQL query.
    database : str
        AWS Glue/Athena database name.
    ctas_approach: bool
        Wraps the query using a CTAS, and read the resulted parquet data on S3.
        If false, read the regular CSV on S3.
    categories: List[str], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    chunksize : Union[int, bool], optional
        If passed will split the data in a Iterable of DataFrames (Memory friendly).
        If `True` wrangler will iterate on the data by files in the most efficient way
        without guarantee of chunksize.
        If an `INTEGER` is passed Wrangler will iterate on the data by number of rows
        equal to the received INTEGER.
    s3_output : str, optional
        AWS S3 path.
    workgroup : str, optional
        Athena workgroup.
    encryption : str, optional
        Valid values: [None, 'SSE_S3', 'SSE_KMS']. Notice: 'CSE_KMS' is not supported.
    kms_key : str, optional
        For SSE-KMS, this is the KMS key ARN or ID.
    keep_files : bool
        Should Wrangler delete or keep the staging files produced by Athena?
    ctas_temp_table_name : str, optional
        The name of the temporary table and also the directory name on S3 where the CTAS result is stored.
        If None, it will use the follow random pattern: `f"temp_table_{pyarrow.compat.guid()}"`.
        On S3 this directory will be under the pattern: `f"{s3_output}/{ctas_temp_table_name}/"`.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session.
        The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Union[pd.DataFrame, Iterator[pd.DataFrame]]
        Pandas DataFrame or Generator of Pandas DataFrames if chunksize is passed.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df = wr.athena.read_sql_query(sql='...', database='...')

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    # Resolve workgroup settings and the S3 staging location; drop a single
    # trailing slash so paths can be composed with "/" below.
    wg_config: Dict[str, Union[bool, Optional[str]]] = _get_workgroup_config(session=session, workgroup=workgroup)
    _s3_output: str = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=session)
    _s3_output = _s3_output[:-1] if _s3_output[-1] == "/" else _s3_output
    name: str = ""
    if ctas_approach is True:
        # CTAS path: wrap the user's query so the result lands on S3 as Parquet.
        if ctas_temp_table_name is not None:
            name = catalog.sanitize_table_name(ctas_temp_table_name)
        else:
            name = f"temp_table_{pa.compat.guid()}"
        path: str = f"{_s3_output}/{name}"
        # A workgroup that enforces its own output location forbids external_location.
        ext_location: str = "\n" if wg_config["enforced"] is True else f",\n    external_location = '{path}'\n"
        sql = (
            f'CREATE TABLE "{name}"\n'
            f"WITH(\n"
            f"    format = 'Parquet',\n"
            f"    parquet_compression = 'SNAPPY'"
            f"{ext_location}"
            f") AS\n"
            f"{sql}"
        )
    _logger.debug("sql: %s", sql)
    query_id: str = _start_query_execution(
        sql=sql,
        wg_config=wg_config,
        database=database,
        s3_output=_s3_output,
        workgroup=workgroup,
        encryption=encryption,
        kms_key=kms_key,
        boto3_session=session,
    )
    _logger.debug("query_id: %s", query_id)
    try:
        query_response: Dict[str, Any] = wait_query(query_execution_id=query_id, boto3_session=session)
    except exceptions.QueryFailed as ex:
        # Translate the two CTAS-specific failure modes into friendlier errors.
        if ctas_approach is True:
            if "Column name not specified" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns names in your query. (E.g. 'SELECT MAX(col1) AS max_col1, ...')"
                )
            if "Column type is unknown" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns types in your query. "
                    "(E.g. 'SELECT CAST(NULL AS INTEGER) AS MY_COL, ...')"
                )
        raise ex  # pragma: no cover
    if query_response["QueryExecution"]["Status"]["State"] in ["FAILED", "CANCELLED"]:  # pragma: no cover
        reason: str = query_response["QueryExecution"]["Status"]["StateChangeReason"]
        message_error: str = f"Query error: {reason}"
        raise exceptions.AthenaQueryError(message_error)
    ret: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    if ctas_approach is True:
        # CTAS result: drop the temporary Glue table, then read the Parquet
        # files listed in the query manifest.
        catalog.delete_table_if_exists(database=database, table=name, boto3_session=session)
        manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv"
        metadata_path: str = f"{_s3_output}/tables/{query_id}.metadata"
        _logger.debug("manifest_path: %s", manifest_path)
        _logger.debug("metadata_path: %s", metadata_path)
        s3.wait_objects_exist(paths=[manifest_path, metadata_path], use_threads=False, boto3_session=session)
        paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session)
        chunked: Union[bool, int] = False if chunksize is None else chunksize
        _logger.debug("chunked: %s", chunked)
        if not paths:
            # Empty result set: return an empty DataFrame (or generator when chunking).
            if chunked is False:
                return pd.DataFrame()
            return _utils.empty_generator()
        s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
        ret = s3.read_parquet(
            path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories
        )
        paths_delete: List[str] = paths + [manifest_path, metadata_path]
        _logger.debug(type(ret))
        if chunked is False:
            if keep_files is False:
                s3.delete_objects(path=paths_delete, use_threads=use_threads, boto3_session=session)
            return ret
        if keep_files is False:
            # Chunked read: staging files can only be removed after full iteration.
            return _delete_after_iterate(dfs=ret, paths=paths_delete, use_threads=use_threads, boto3_session=session)
        return ret
    # Regular path: parse the CSV output, casting types per the query metadata.
    dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata(
        query_execution_id=query_id, categories=categories, boto3_session=session
    )
    path = f"{_s3_output}/{query_id}.csv"
    s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session)
    _logger.debug("Start CSV reading from %s", path)
    _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None
    _logger.debug("_chunksize: %s", _chunksize)
    ret = s3.read_csv(
        path=[path],
        dtype=dtype,
        parse_dates=parse_timestamps,
        converters=converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        na_values=[""],
        chunksize=_chunksize,
        skip_blank_lines=False,
        use_threads=False,
        boto3_session=session,
    )
    _logger.debug("Start type casting...")
    _logger.debug(type(ret))
    if chunksize is None:
        df = _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries)
        if keep_files is False:
            s3.delete_objects(path=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session)
        return df
    dfs = _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries)
    if keep_files is False:
        return _delete_after_iterate(
            dfs=dfs, paths=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session
        )
    return dfs
def create_ctas_table(  # pylint: disable=too-many-locals
    sql: str,
    database: str,
    ctas_table: Optional[str] = None,
    ctas_database: Optional[str] = None,
    s3_output: Optional[str] = None,
    storage_format: Optional[str] = None,
    write_compression: Optional[str] = None,
    partitioning_info: Optional[List[str]] = None,
    bucketing_info: Optional[Tuple[List[str], int]] = None,
    field_delimiter: Optional[str] = None,
    schema_only: bool = False,
    workgroup: Optional[str] = None,
    data_source: Optional[str] = None,
    encryption: Optional[str] = None,
    kms_key: Optional[str] = None,
    categories: Optional[List[str]] = None,
    wait: bool = False,
    boto3_session: Optional[boto3.Session] = None,
) -> Dict[str, Union[str, _QueryMetadata]]:
    """Create a new table populated with the results of a SELECT query.

    https://docs.aws.amazon.com/athena/latest/ug/create-table-as.html

    Parameters
    ----------
    sql : str
        SELECT SQL query.
    database : str
        The name of the database where the original table is stored.
    ctas_table : Optional[str], optional
        The name of the CTAS table.
        If None, a name with a random string is used.
    ctas_database : Optional[str], optional
        The name of the alternative database where the CTAS table should be stored.
        If None, `database` is used, that is the CTAS table is stored in the same
        database as the original table.
    s3_output : Optional[str], optional
        The output Amazon S3 path.
        If None, either the Athena workgroup or client-side location setting is used.
        If a workgroup enforces a query results location, then it overrides this argument.
    storage_format : Optional[str], optional
        The storage format for the CTAS query results, such as ORC, PARQUET, AVRO, JSON, or TEXTFILE.
        PARQUET by default.
    write_compression : Optional[str], optional
        The compression type to use for any storage format that allows compression to be specified.
    partitioning_info : Optional[List[str]], optional
        A list of columns by which the CTAS table will be partitioned.
    bucketing_info : Optional[Tuple[List[str], int]], optional
        Tuple consisting of the column names used for bucketing as the first element and
        the number of buckets as the second element.
        Only `str`, `int` and `bool` are supported as column data types for bucketing.
    field_delimiter : Optional[str], optional
        The single-character field delimiter for files in CSV, TSV, and text files.
    schema_only : bool, optional
        If True, the query is wrapped with `WITH NO DATA`, so only the table schema
        is created and no rows are written. False by default.
    workgroup : Optional[str], optional
        Athena workgroup.
    data_source : Optional[str], optional
        Data Source / Catalog name. If None, 'AwsDataCatalog' is used.
    encryption : str, optional
        Valid values: [None, 'SSE_S3', 'SSE_KMS']. Note: 'CSE_KMS' is not supported.
    kms_key : str, optional
        For SSE-KMS, this is the KMS key ARN or ID.
    categories: List[str], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    wait : bool, default False
        Whether to wait for the query to finish and return a dictionary with the Query metadata.
    boto3_session : Optional[boto3.Session], optional
        Boto3 Session. The default boto3 session is used if boto3_session is None.

    Returns
    -------
    Dict[str, Union[str, _QueryMetadata]]
        A dictionary with the CTAS database and table names.
        If `wait` is `False`, the query ID is included, otherwise a Query metadata object is added instead.

    Examples
    --------
    Select all into a new table and encrypt the results

    >>> import awswrangler as wr
    >>> wr.athena.create_ctas_table(
    ...     sql="select * from table",
    ...     database="default",
    ...     encryption="SSE_KMS",
    ...     kms_key="1234abcd-12ab-34cd-56ef-1234567890ab",
    ... )
    {'ctas_database': 'default', 'ctas_table': 'temp_table_5669340090094....', 'ctas_query_id': 'cc7dfa81-831d-...'}

    Create a table with schema only

    >>> wr.athena.create_ctas_table(
    ...     sql="select col1, col2 from table",
    ...     database="default",
    ...     ctas_table="my_ctas_table",
    ...     schema_only=True,
    ...     wait=True,
    ... )

    Partition data and save to alternative CTAS database

    >>> wr.athena.create_ctas_table(
    ...     sql="select * from table",
    ...     database="default",
    ...     ctas_database="my_ctas_db",
    ...     storage_format="avro",
    ...     write_compression="snappy",
    ...     partitioning_info=["par0", "par1"],
    ...     wait=True,
    ... )

    """
    # Resolve the destination table/database names; a random suffix avoids collisions
    # when the caller does not supply a table name.
    ctas_table = catalog.sanitize_table_name(ctas_table) if ctas_table else f"temp_table_{uuid.uuid4().hex}"
    ctas_database = ctas_database if ctas_database else database
    fully_qualified_name = f'"{ctas_database}"."{ctas_table}"'

    wg_config: _WorkGroupConfig = _get_workgroup_config(session=boto3_session, workgroup=workgroup)
    s3_output = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=boto3_session)
    # Strip a single trailing slash. BUG FIX: use endswith() so an empty string
    # cannot raise IndexError on s3_output[-1].
    s3_output = s3_output[:-1] if s3_output.endswith("/") else s3_output
    # If the workgroup enforces an external location, then it overrides the user supplied argument
    external_location_str: str = (
        f"    external_location = '{s3_output}/{ctas_table}',\n" if (not wg_config.enforced) and (s3_output) else ""
    )

    # At least one property must be specified within `WITH()` in the query.
    # We default to `PARQUET` for `storage_format`.
    storage_format_str: str = f"""    format = '{storage_format.upper() if storage_format else "PARQUET"}'"""
    write_compression_str: str = (
        f"    write_compression = '{write_compression.upper()}',\n" if write_compression else ""
    )
    partitioning_str: str = f"    partitioned_by = ARRAY{partitioning_info},\n" if partitioning_info else ""
    bucketing_str: str = (
        f"    bucketed_by = ARRAY{bucketing_info[0]},\n    bucket_count = {bucketing_info[1]},\n"
        if bucketing_info
        else ""
    )
    field_delimiter_str: str = f"    field_delimiter = '{field_delimiter}',\n" if field_delimiter else ""
    schema_only_str: str = "\nWITH NO DATA" if schema_only else ""

    ctas_sql = (
        f"CREATE TABLE {fully_qualified_name}\n"
        f"WITH(\n"
        f"{external_location_str}"
        f"{partitioning_str}"
        f"{bucketing_str}"
        f"{field_delimiter_str}"
        f"{write_compression_str}"
        f"{storage_format_str}"
        f")\n"
        f"AS {sql}"
        f"{schema_only_str}"
    )
    _logger.debug("ctas sql: %s", ctas_sql)

    try:
        query_execution_id: str = _start_query_execution(
            sql=ctas_sql,
            wg_config=wg_config,
            database=database,
            data_source=data_source,
            s3_output=s3_output,
            workgroup=workgroup,
            encryption=encryption,
            kms_key=kms_key,
            boto3_session=boto3_session,
        )
    except botocore.exceptions.ClientError as ex:
        error: Dict[str, Any] = ex.response["Error"]
        # Both parse failures mean the SQL cannot be wrapped into a CTAS statement.
        # (Merged from two branches that raised the identical message.)
        if error["Code"] == "InvalidRequestException" and (
            "Exception parsing query" in error["Message"] or "extraneous input" in error["Message"]
        ):
            raise exceptions.InvalidCtasApproachQuery(
                f"It is not possible to wrap this query into a CTAS statement. Root error message: {error['Message']}"
            ) from ex
        raise ex

    response: Dict[str, Union[str, _QueryMetadata]] = {"ctas_database": ctas_database, "ctas_table": ctas_table}
    if wait:
        try:
            response["ctas_query_metadata"] = _get_query_metadata(
                query_execution_id=query_execution_id,
                boto3_session=boto3_session,
                categories=categories,
                metadata_cache_manager=_cache_manager,
            )
        except exceptions.QueryFailed as ex:
            # Translate common CTAS failure modes into actionable errors,
            # preserving the original exception as the cause.
            msg: str = str(ex)
            if "Column name" in msg and "specified more than once" in msg:
                raise exceptions.InvalidCtasApproachQuery(
                    f"Please, define distinct names for your columns. Root error message: {msg}"
                ) from ex
            if "Column name not specified" in msg:
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns names in your query. (E.g. 'SELECT MAX(col1) AS max_col1, ...')"
                ) from ex
            if "Column type is unknown" in msg:
                raise exceptions.InvalidArgumentValue(
                    "Please, don't leave undefined columns types in your query. You can cast to ensure it. "
                    "(E.g. 'SELECT CAST(NULL AS INTEGER) AS MY_COL, ...')"
                ) from ex
            raise ex
    else:
        response["ctas_query_id"] = query_execution_id
    return response