def generate_from_meta(
    self,
    metadata: Union[Metadata, str, dict],
    database_name: str = None,
    table_location: str = None,
    run_msck_repair=False,
):
    """
    Creates a glue table from metadata.

    arguments:
        - metadata: Metadata object, string path, or dictionary metadata.
        - database_name (optional): name of the glue database the table is to
          be created in. Can also be a property of the metadata.
        - table_location (optional): the s3 location of the table. Can also be
          a property of the metadata.
        - run_msck_repair (optional): run msck repair table on the created
          table; should be set to True for tables with partitions.

    Raises:
        - ValueError if run_msck_repair is False, metadata has partitions,
          and options.ignore_warnings is set to False
    """
    # BUG FIX: normalise the metadata input FIRST. The signature accepts a
    # string path or dict, and the attribute reads below would raise
    # AttributeError on those before Metadata.from_infer converted them.
    metadata = Metadata.from_infer(metadata)

    # Fall back to metadata-level properties when not supplied explicitly.
    database_name = database_name if database_name else metadata.database_name
    table_location = table_location if table_location else metadata.table_location

    glue_client = boto3.client(
        "glue",
        region_name=os.getenv("AWS_REGION", os.getenv("AWS_DEFAULT_REGION", "eu-west-1")),
    )

    boto_dict = self.gc.generate_from_meta(
        metadata, database_name=database_name, table_location=table_location
    )

    # Drop any stale definition so create_table starts from a clean slate.
    delete_table_if_exists(database=database_name, table=metadata.name)
    glue_client.create_table(**boto_dict)

    if not run_msck_repair and metadata.partitions and not self.options.ignore_warnings:
        # Partitioned tables won't return data until partitions are loaded;
        # warn unless the caller opted out of warnings.
        w = (
            "metadata has partitions and run_msck_repair is set to False. "
            "To suppress these warnings set this converter's "
            "options.ignore_warnings = True"
        )
        warnings.warn(w)
    elif run_msck_repair:
        # Register the table's partitions with the Glue catalog.
        pydb.read_sql_query(f"msck repair table {database_name}.{metadata.name}")
def _resolve_query_without_cache(  # pylint: disable=too-many-branches,too-many-locals,too-many-return-statements,too-many-statements
    sql: str,
    database: str,
    data_source: Optional[str],
    ctas_approach: bool,
    categories: Optional[List[str]],
    chunksize: Union[int, bool, None],
    s3_output: Optional[str],
    workgroup: Optional[str],
    encryption: Optional[str],
    kms_key: Optional[str],
    keep_files: bool,
    ctas_temp_table_name: Optional[str],
    use_threads: bool,
    s3_additional_kwargs: Optional[Dict[str, Any]],
    boto3_session: boto3.Session,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """
    Run the query on Athena and hand the result back to `read_sql_query` as a DataFrame.

    Invoked by `read_sql_query` whenever a cached result cannot be reused.
    Dispatches to the CTAS strategy or the plain-CSV strategy depending on
    ``ctas_approach``.
    """
    # Resolve workgroup settings and the S3 staging location for results.
    wg_config: _WorkGroupConfig = _get_workgroup_config(session=boto3_session, workgroup=workgroup)
    _s3_output: str = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=boto3_session)
    if _s3_output[-1] == "/":
        _s3_output = _s3_output[:-1]

    # Arguments shared by both resolution strategies.
    common_args: Dict[str, Any] = dict(
        sql=sql,
        database=database,
        data_source=data_source,
        s3_output=_s3_output,
        keep_files=keep_files,
        chunksize=chunksize,
        categories=categories,
        encryption=encryption,
        workgroup=workgroup,
        kms_key=kms_key,
        wg_config=wg_config,
        use_threads=use_threads,
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    )

    if ctas_approach is True:
        # Pick (or generate) the temporary table name for the CTAS wrapper.
        if ctas_temp_table_name is None:
            temp_table: str = f"temp_table_{uuid.uuid4().hex}"
        else:
            temp_table = catalog.sanitize_table_name(ctas_temp_table_name)
        try:
            return _resolve_query_without_cache_ctas(name=temp_table, **common_args)
        finally:
            # The temp table must be removed even when the CTAS path raises.
            catalog.delete_table_if_exists(
                database=database, table=temp_table, boto3_session=boto3_session
            )

    return _resolve_query_without_cache_regular(**common_args)
def _resolve_query_without_cache(  # pylint: disable=too-many-branches,too-many-locals,too-many-return-statements,too-many-statements
    sql: str,
    database: str,
    ctas_approach: bool,
    categories: Optional[List[str]],
    chunksize: Optional[Union[int, bool]],
    s3_output: Optional[str],
    workgroup: Optional[str],
    encryption: Optional[str],
    kms_key: Optional[str],
    keep_files: bool,
    ctas_temp_table_name: Optional[str],
    use_threads: bool,
    session: Optional[boto3.Session],
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """
    Execute any query in Athena and returns results as Dataframe, back to `read_sql_query`.

    Usually called by `read_sql_query` when using cache is not possible.
    Handles both the CTAS (Parquet) path and the plain-CSV path in one body.
    """
    # Resolve workgroup settings and the S3 staging location for results.
    wg_config: Dict[str, Union[bool, Optional[str]]] = _get_workgroup_config(session=session, workgroup=workgroup)
    _s3_output: str = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=session)
    _s3_output = _s3_output[:-1] if _s3_output[-1] == "/" else _s3_output
    name: str = ""
    if ctas_approach is True:
        # Wrap the user's SQL in a CTAS so Athena writes the result as Parquet.
        if ctas_temp_table_name is not None:
            name = catalog.sanitize_table_name(ctas_temp_table_name)
        else:
            name = f"temp_table_{pa.compat.guid()}"
        path: str = f"{_s3_output}/{name}"
        # When the workgroup config is "enforced", omit external_location and
        # let the workgroup's own output location apply.
        ext_location: str = "\n" if wg_config["enforced"] is True else f",\n external_location = '{path}'\n"
        sql = (
            f'CREATE TABLE "{name}"\n'
            f"WITH(\n"
            f" format = 'Parquet',\n"
            f" parquet_compression = 'SNAPPY'"
            f"{ext_location}"
            f") AS\n"
            f"{sql}"
        )
    _logger.debug("sql: %s", sql)
    query_id: str = _start_query_execution(
        sql=sql,
        wg_config=wg_config,
        database=database,
        s3_output=_s3_output,
        workgroup=workgroup,
        encryption=encryption,
        kms_key=kms_key,
        boto3_session=session,
    )
    _logger.debug("query_id: %s", query_id)
    try:
        query_response: Dict[str, Any] = wait_query(query_execution_id=query_id, boto3_session=session)
    except exceptions.QueryFailed as ex:
        # The CTAS wrapper requires every projected column to have an explicit
        # name and a known type; translate those Athena errors into friendlier ones.
        if ctas_approach is True:
            if "Column name not specified" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns names in your query. (E.g. 'SELECT MAX(col1) AS max_col1, ...')"
                )
            if "Column type is unknown" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns types in your query. "
                    "(E.g. 'SELECT CAST(NULL AS INTEGER) AS MY_COL, ...')"
                )
        raise ex  # pragma: no cover
    if query_response["QueryExecution"]["Status"]["State"] in ["FAILED", "CANCELLED"]:  # pragma: no cover
        reason: str = query_response["QueryExecution"]["Status"]["StateChangeReason"]
        message_error: str = f"Query error: {reason}"
        raise exceptions.AthenaQueryError(message_error)
    ret: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    if ctas_approach is True:
        # The temp table can be dropped immediately; the result data stays on
        # S3 and is read back via the manifest the CTAS produced.
        catalog.delete_table_if_exists(database=database, table=name, boto3_session=session)
        manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv"
        metadata_path: str = f"{_s3_output}/tables/{query_id}.metadata"
        _logger.debug("manifest_path: %s", manifest_path)
        _logger.debug("metadata_path: %s", metadata_path)
        s3.wait_objects_exist(paths=[manifest_path, metadata_path], use_threads=False, boto3_session=session)
        paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session)
        chunked: Union[bool, int] = False if chunksize is None else chunksize
        _logger.debug("chunked: %s", chunked)
        if not paths:
            # Query wrote no files at all -> empty result (frame or generator).
            if chunked is False:
                return pd.DataFrame()
            return _utils.empty_generator()
        s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
        ret = s3.read_parquet(
            path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories
        )
        paths_delete: List[str] = paths + [manifest_path, metadata_path]
        _logger.debug(type(ret))
        if chunked is False:
            if keep_files is False:
                s3.delete_objects(path=paths_delete, use_threads=use_threads, boto3_session=session)
            return ret
        if keep_files is False:
            # Generator case: defer deletion until iteration is complete.
            return _delete_after_iterate(dfs=ret, paths=paths_delete, use_threads=use_threads, boto3_session=session)
        return ret
    # Non-CTAS path: parse the regular CSV result Athena wrote to S3.
    dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata(
        query_execution_id=query_id, categories=categories, boto3_session=session
    )
    path = f"{_s3_output}/{query_id}.csv"
    s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session)
    _logger.debug("Start CSV reading from %s", path)
    # read_csv only accepts an int chunksize; chunksize=True falls back to None here.
    _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None
    _logger.debug("_chunksize: %s", _chunksize)
    ret = s3.read_csv(
        path=[path],
        dtype=dtype,
        parse_dates=parse_timestamps,
        converters=converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        na_values=[""],
        chunksize=_chunksize,
        skip_blank_lines=False,
        use_threads=False,
        boto3_session=session,
    )
    _logger.debug("Start type casting...")
    _logger.debug(type(ret))
    if chunksize is None:
        # Single DataFrame: cast types, optionally clean up, return.
        df = _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries)
        if keep_files is False:
            s3.delete_objects(path=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session)
        return df
    # Chunked: wrap the iterator so each chunk is type-cast lazily.
    dfs = _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries)
    if keep_files is False:
        return _delete_after_iterate(
            dfs=dfs, paths=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session
        )
    return dfs
def read_sql_query(  # pylint: disable=too-many-branches,too-many-locals
    sql: str,
    database: str,
    ctas_approach: bool = True,
    categories: List[str] = None,
    chunksize: Optional[int] = None,
    s3_output: Optional[str] = None,
    workgroup: Optional[str] = None,
    encryption: Optional[str] = None,
    kms_key: Optional[str] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Execute any SQL query on AWS Athena and return the results as a Pandas DataFrame.

    There are two approaches to be defined through ctas_approach parameter:

    1 - `ctas_approach=True` (`Default`):
    Wrap the query with a CTAS and then reads the table data as parquet directly from s3.
    PROS: Faster and can handle some level of nested types.
    CONS: Requires create/delete table permissions on Glue and does not support
    timestamp with time zone
    (A temporary table will be created and then deleted immediately).

    2 - `ctas_approach=False`:
    Does a regular query on Athena and parse the regular CSV result on s3.
    PROS: Does not require create/delete table permissions on Glue and supports
    timestamp with time zone.
    CONS: Slower (but still faster than other libraries that use the regular
    Athena API) and does not handle nested types at all.

    Note
    ----
    If `chunksize` is passed, then a Generator of DataFrames is returned.

    Note
    ----
    If `ctas_approach` is True, `chunksize` will return non deterministic chunks sizes,
    but it is still useful to overcome memory limitation.

    Note
    ----
    Create the default Athena bucket if it doesn't exist and s3_output is None.
    (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)

    Note
    ----
    In case of `use_threads=True` the number of processes that will be spawned
    will be gotten from os.cpu_count().

    Parameters
    ----------
    sql : str
        SQL query.
    database : str
        AWS Glue/Athena database name.
    ctas_approach: bool
        Wraps the query using a CTAS, and read the resulted parquet data on S3.
        If false, read the regular CSV on S3.
    categories: List[str], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    chunksize: int, optional
        If specified, return a generator where chunksize is the number of rows
        to include in each chunk.
    s3_output : str, optional
        AWS S3 path.
    workgroup : str, optional
        Athena workgroup.
    encryption : str, optional
        None, 'SSE_S3', 'SSE_KMS', 'CSE_KMS'.
    kms_key : str, optional
        For SSE-KMS and CSE-KMS, this is the KMS key ARN or ID.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session
        receives None.

    Returns
    -------
    Union[pd.DataFrame, Iterator[pd.DataFrame]]
        Pandas DataFrame or Generator of Pandas DataFrames if chunksize is passed.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df = wr.athena.read_sql_query(sql='...', database='...')

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    # Prefer explicit s3_output, then the workgroup's output, then the default bucket.
    wg_s3_output, _, _ = _ensure_workgroup(session=session, workgroup=workgroup)
    if s3_output is None:
        if wg_s3_output is None:
            _s3_output: str = create_athena_bucket(boto3_session=session)
        else:
            _s3_output = wg_s3_output
    else:
        _s3_output = s3_output
    name: str = ""
    if ctas_approach is True:
        # Wrap the user's SQL in a CTAS so Athena writes the result as Parquet.
        name = f"temp_table_{pa.compat.guid()}"
        _s3_output = _s3_output[:-1] if _s3_output[-1] == "/" else _s3_output
        path: str = f"{_s3_output}/{name}"
        sql = (
            f"CREATE TABLE {name}\n"
            f"WITH(\n"
            f" format = 'Parquet',\n"
            f" parquet_compression = 'SNAPPY',\n"
            f" external_location = '{path}'\n"
            f") AS\n"
            f"{sql}"
        )
    _logger.debug(f"sql: {sql}")
    query_id: str = start_query_execution(
        sql=sql,
        database=database,
        s3_output=_s3_output,
        workgroup=workgroup,
        encryption=encryption,
        kms_key=kms_key,
        boto3_session=session,
    )
    _logger.debug(f"query_id: {query_id}")
    # Block until Athena finishes, then surface failures as AthenaQueryError.
    query_response: Dict[str, Any] = wait_query(query_execution_id=query_id, boto3_session=session)
    if query_response["QueryExecution"]["Status"]["State"] in ["FAILED", "CANCELLED"]:  # pragma: no cover
        reason: str = query_response["QueryExecution"]["Status"]["StateChangeReason"]
        message_error: str = f"Query error: {reason}"
        raise exceptions.AthenaQueryError(message_error)
    dfs: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    if ctas_approach is True:
        # The temp table can be dropped right away; the data stays on S3 and
        # is read back through the manifest the CTAS produced.
        catalog.delete_table_if_exists(database=database, table=name, boto3_session=session)
        manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv"
        paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session)
        chunked: bool = chunksize is not None
        _logger.debug(f"chunked: {chunked}")
        if not paths:
            # Query wrote no files at all -> empty result (frame or generator).
            if chunked is False:
                dfs = pd.DataFrame()
            else:
                dfs = _utils.empty_generator()
        else:
            s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
            dfs = s3.read_parquet(
                path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories
            )
        return dfs
    # Non-CTAS path: parse the regular CSV result Athena wrote to S3.
    dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata(
        query_execution_id=query_id, categories=categories, boto3_session=session
    )
    path = f"{_s3_output}/{query_id}.csv"
    s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session)
    _logger.debug(f"Start CSV reading from {path}")
    ret = s3.read_csv(
        path=[path],
        dtype=dtype,
        parse_dates=parse_timestamps,
        converters=converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        na_values=[""],
        chunksize=chunksize,
        skip_blank_lines=False,
        use_threads=False,
        boto3_session=session,
    )
    _logger.debug("Start type casting...")
    if chunksize is None:
        return _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries)
    _logger.debug(type(ret))
    # Chunked: wrap the iterator so each chunk is type-cast lazily.
    return _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries)
def read_sql_query(  # pylint: disable=too-many-branches,too-many-locals,too-many-return-statements,too-many-statements
    sql: str,
    database: str,
    ctas_approach: bool = True,
    categories: List[str] = None,
    chunksize: Optional[Union[int, bool]] = None,
    s3_output: Optional[str] = None,
    workgroup: Optional[str] = None,
    encryption: Optional[str] = None,
    kms_key: Optional[str] = None,
    keep_files: bool = True,
    ctas_temp_table_name: Optional[str] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Execute any SQL query on AWS Athena and return the results as a Pandas DataFrame.

    There are two approaches to be defined through ctas_approach parameter:

    1 - `ctas_approach=True` (`Default`):
    Wrap the query with a CTAS and then reads the table data as parquet directly from s3.
    PROS: Faster and can handle some level of nested types.
    CONS: Requires create/delete table permissions on Glue and does not support
    timestamp with time zone
    (A temporary table will be created and then deleted immediately).

    2 - `ctas_approach=False`:
    Does a regular query on Athena and parse the regular CSV result on s3.
    PROS: Does not require create/delete table permissions on Glue and supports
    timestamp with time zone.
    CONS: Slower (but still faster than other libraries that use the regular
    Athena API) and does not handle nested types at all.

    Note
    ----
    Valid encryption modes: [None, 'SSE_S3', 'SSE_KMS'].

    `P.S. 'CSE_KMS' is not supported.`

    Note
    ----
    Create the default Athena bucket if it doesn't exist and s3_output is None.

    (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)

    Note
    ----
    ``Batching`` (`chunksize` argument) (Memory Friendly):

    Will enable the function to return an Iterable of DataFrames instead of a
    regular DataFrame.

    There are two batching strategies on Wrangler:

    - If **chunksize=True**, a new DataFrame will be returned for each file in
      the query result.

    - If **chunksize=INTEGER**, Wrangler will iterate on the data by number of
      rows equal to the received INTEGER.

    `P.S.` `chunksize=True` is faster and uses less memory while
    `chunksize=INTEGER` is more precise in the number of rows for each DataFrame.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned
    will be gotten from os.cpu_count().

    Parameters
    ----------
    sql : str
        SQL query.
    database : str
        AWS Glue/Athena database name.
    ctas_approach: bool
        Wraps the query using a CTAS, and read the resulted parquet data on S3.
        If false, read the regular CSV on S3.
    categories: List[str], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    chunksize : Union[int, bool], optional
        If passed will split the data in an Iterable of DataFrames (Memory friendly).
        If `True` wrangler will iterate on the data by files in the most efficient
        way without guarantee of chunksize. If an `INTEGER` is passed Wrangler
        will iterate on the data by number of rows equal to the received INTEGER.
    s3_output : str, optional
        AWS S3 path.
    workgroup : str, optional
        Athena workgroup.
    encryption : str, optional
        Valid values: [None, 'SSE_S3', 'SSE_KMS']. Notice: 'CSE_KMS' is not supported.
    kms_key : str, optional
        For SSE-KMS, this is the KMS key ARN or ID.
    keep_files : bool
        Should Wrangler delete or keep the staging files produced by Athena?
    ctas_temp_table_name : str, optional
        The name of the temporary table and also the directory name on S3 where
        the CTAS result is stored. If None, it will use the follow random pattern:
        `f"temp_table_{pyarrow.compat.guid()}"`.
        On S3 this directory will be under the pattern: `f"{s3_output}/{ctas_temp_table_name}/"`.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session
        receives None.

    Returns
    -------
    Union[pd.DataFrame, Iterator[pd.DataFrame]]
        Pandas DataFrame or Generator of Pandas DataFrames if chunksize is passed.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df = wr.athena.read_sql_query(sql='...', database='...')

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    # Resolve workgroup settings and the S3 staging location for results.
    wg_config: Dict[str, Union[bool, Optional[str]]] = _get_workgroup_config(session=session, workgroup=workgroup)
    _s3_output: str = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=session)
    _s3_output = _s3_output[:-1] if _s3_output[-1] == "/" else _s3_output
    name: str = ""
    if ctas_approach is True:
        # Wrap the user's SQL in a CTAS so Athena writes the result as Parquet.
        if ctas_temp_table_name is not None:
            name = catalog.sanitize_table_name(ctas_temp_table_name)
        else:
            name = f"temp_table_{pa.compat.guid()}"
        path: str = f"{_s3_output}/{name}"
        # When the workgroup config is "enforced", omit external_location and
        # let the workgroup's own output location apply.
        ext_location: str = "\n" if wg_config["enforced"] is True else f",\n external_location = '{path}'\n"
        sql = (
            f'CREATE TABLE "{name}"\n'
            f"WITH(\n"
            f" format = 'Parquet',\n"
            f" parquet_compression = 'SNAPPY'"
            f"{ext_location}"
            f") AS\n"
            f"{sql}"
        )
    _logger.debug("sql: %s", sql)
    query_id: str = _start_query_execution(
        sql=sql,
        wg_config=wg_config,
        database=database,
        s3_output=_s3_output,
        workgroup=workgroup,
        encryption=encryption,
        kms_key=kms_key,
        boto3_session=session,
    )
    _logger.debug("query_id: %s", query_id)
    try:
        query_response: Dict[str, Any] = wait_query(query_execution_id=query_id, boto3_session=session)
    except exceptions.QueryFailed as ex:
        # The CTAS wrapper requires every projected column to have an explicit
        # name and a known type; translate those Athena errors into friendlier ones.
        if ctas_approach is True:
            if "Column name not specified" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns names in your query. (E.g. 'SELECT MAX(col1) AS max_col1, ...')"
                )
            if "Column type is unknown" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns types in your query. "
                    "(E.g. 'SELECT CAST(NULL AS INTEGER) AS MY_COL, ...')"
                )
        raise ex  # pragma: no cover
    if query_response["QueryExecution"]["Status"]["State"] in ["FAILED", "CANCELLED"]:  # pragma: no cover
        reason: str = query_response["QueryExecution"]["Status"]["StateChangeReason"]
        message_error: str = f"Query error: {reason}"
        raise exceptions.AthenaQueryError(message_error)
    ret: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    if ctas_approach is True:
        # The temp table can be dropped immediately; the result data stays on
        # S3 and is read back via the manifest the CTAS produced.
        catalog.delete_table_if_exists(database=database, table=name, boto3_session=session)
        manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv"
        metadata_path: str = f"{_s3_output}/tables/{query_id}.metadata"
        _logger.debug("manifest_path: %s", manifest_path)
        _logger.debug("metadata_path: %s", metadata_path)
        s3.wait_objects_exist(paths=[manifest_path, metadata_path], use_threads=False, boto3_session=session)
        paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session)
        chunked: Union[bool, int] = False if chunksize is None else chunksize
        _logger.debug("chunked: %s", chunked)
        if not paths:
            # Query wrote no files at all -> empty result (frame or generator).
            if chunked is False:
                return pd.DataFrame()
            return _utils.empty_generator()
        s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
        ret = s3.read_parquet(
            path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories
        )
        paths_delete: List[str] = paths + [manifest_path, metadata_path]
        _logger.debug(type(ret))
        if chunked is False:
            if keep_files is False:
                s3.delete_objects(path=paths_delete, use_threads=use_threads, boto3_session=session)
            return ret
        if keep_files is False:
            # Generator case: defer deletion until iteration is complete.
            return _delete_after_iterate(dfs=ret, paths=paths_delete, use_threads=use_threads, boto3_session=session)
        return ret
    # Non-CTAS path: parse the regular CSV result Athena wrote to S3.
    dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata(
        query_execution_id=query_id, categories=categories, boto3_session=session
    )
    path = f"{_s3_output}/{query_id}.csv"
    s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session)
    _logger.debug("Start CSV reading from %s", path)
    # read_csv only accepts an int chunksize; chunksize=True falls back to None here.
    _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None
    _logger.debug("_chunksize: %s", _chunksize)
    ret = s3.read_csv(
        path=[path],
        dtype=dtype,
        parse_dates=parse_timestamps,
        converters=converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        na_values=[""],
        chunksize=_chunksize,
        skip_blank_lines=False,
        use_threads=False,
        boto3_session=session,
    )
    _logger.debug("Start type casting...")
    _logger.debug(type(ret))
    if chunksize is None:
        # Single DataFrame: cast types, optionally clean up, return.
        df = _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries)
        if keep_files is False:
            s3.delete_objects(path=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session)
        return df
    # Chunked: wrap the iterator so each chunk is type-cast lazily.
    dfs = _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries)
    if keep_files is False:
        return _delete_after_iterate(
            dfs=dfs, paths=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session
        )
    return dfs