def _resolve_query_with_cache(  # pylint: disable=too-many-return-statements
    cache_info,
    categories: Optional[List[str]],
    chunksize: Optional[Union[int, bool]],
    use_threads: bool,
    session: Optional[boto3.Session],
):
    """Fetch cached data and return it as a pandas DataFrame (or list of DataFrames)."""
    if cache_info["data_type"] == "parquet":
        manifest_path = cache_info["query_execution_info"]["Statistics"]["DataManifestLocation"]
        # this is needed just so we can access boto's modeled exceptions
        client_s3: boto3.client = _utils.client(service_name="s3", session=session)
        try:
            paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session)
        except (client_s3.exceptions.NoSuchBucket, client_s3.exceptions.NoSuchKey):  # pragma: no cover
            return None
        if all([s3.does_object_exist(path) for path in paths]):
            chunked: Union[bool, int] = False if chunksize is None else chunksize
            _logger.debug("chunked: %s", chunked)
            if not paths:  # pragma: no cover
                if chunked is False:
                    return pd.DataFrame()
                return _utils.empty_generator()
            ret = s3.read_parquet(
                path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories
            )
            _logger.debug(type(ret))
            return ret
    elif cache_info["data_type"] == "csv":
        dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata(
            query_execution_id=cache_info["query_execution_info"]["QueryExecutionId"],
            categories=categories,
            boto3_session=session,
        )
        path = cache_info["query_execution_info"]["ResultConfiguration"]["OutputLocation"]
        if s3.does_object_exist(path=path, boto3_session=session):
            _logger.debug("Start CSV reading from %s", path)
            _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None
            _logger.debug("_chunksize: %s", _chunksize)
            ret = s3.read_csv(
                path=[path],
                dtype=dtype,
                parse_dates=parse_timestamps,
                converters=converters,
                quoting=csv.QUOTE_ALL,
                keep_default_na=False,
                na_values=[""],
                chunksize=_chunksize,
                skip_blank_lines=False,
                use_threads=False,
                boto3_session=session,
            )
            _logger.debug("Start type casting...")
            _logger.debug(type(ret))
            if chunksize is None:
                df = _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries)
                return df
            dfs = _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries)
            return dfs
    raise exceptions.InvalidArgumentValue(f"Invalid data type: {cache_info['data_type']}.")  # pragma: no cover
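

# Note (comment only, not part of the original module): the `cache_info` argument above is
# expected to look roughly like the sketch below. The keys are inferred from the accesses
# inside the function; the concrete values are hypothetical placeholders.
#
#     {
#         "data_type": "parquet",  # or "csv"
#         "query_execution_info": {
#             "QueryExecutionId": "11111111-2222-3333-4444-555555555555",
#             "Statistics": {"DataManifestLocation": "s3://bucket/path/xyz-manifest.csv"},
#             "ResultConfiguration": {"OutputLocation": "s3://bucket/path/xyz.csv"},
#         },
#     }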
def _fetch_txt_result(
    query_metadata: _QueryMetadata,
    keep_files: bool,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> pd.DataFrame:
    if query_metadata.output_location is None or query_metadata.output_location.endswith(".txt") is False:
        return pd.DataFrame()
    path: str = query_metadata.output_location
    _logger.debug("Start TXT reading from %s", path)
    df = s3.read_csv(
        path=[path],
        dtype=query_metadata.dtype,
        parse_dates=query_metadata.parse_timestamps,
        converters=query_metadata.converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        skip_blank_lines=True,
        na_values=[],
        use_threads=False,
        boto3_session=boto3_session,
        names=list(query_metadata.dtype.keys()),
        sep="\t",
    )
    if keep_files is False:
        s3.delete_objects(
            path=[path, f"{path}.metadata"],
            use_threads=False,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
        )
    return df
def _fetch_csv_result(
    query_metadata: _QueryMetadata,
    keep_files: bool,
    chunksize: Optional[int],
    use_threads: bool,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, Any]],
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None
    _logger.debug("_chunksize: %s", _chunksize)
    if query_metadata.output_location is None or query_metadata.output_location.endswith(".csv") is False:
        chunked = _chunksize is not None
        return _empty_dataframe_response(chunked, query_metadata)
    path: str = query_metadata.output_location
    _logger.debug("Start CSV reading from %s", path)
    ret = s3.read_csv(
        path=[path],
        dtype=query_metadata.dtype,
        parse_dates=query_metadata.parse_timestamps,
        converters=query_metadata.converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        na_values=["", "NaN"],
        chunksize=_chunksize,
        skip_blank_lines=False,
        use_threads=False,
        boto3_session=boto3_session,
    )
    _logger.debug("Start type casting...")
    _logger.debug(type(ret))
    if _chunksize is None:
        df = _fix_csv_types(df=ret, parse_dates=query_metadata.parse_dates, binaries=query_metadata.binaries)
        df = _apply_query_metadata(df=df, query_metadata=query_metadata)
        if keep_files is False:
            s3.delete_objects(
                path=[path, f"{path}.metadata"],
                use_threads=use_threads,
                boto3_session=boto3_session,
                s3_additional_kwargs=s3_additional_kwargs,
            )
        return df
    dfs = _fix_csv_types_generator(dfs=ret, parse_dates=query_metadata.parse_dates, binaries=query_metadata.binaries)
    dfs = _add_query_metadata_generator(dfs=dfs, query_metadata=query_metadata)
    if keep_files is False:
        return _delete_after_iterate(
            dfs=dfs,
            paths=[path, f"{path}.metadata"],
            use_threads=use_threads,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
        )
    return dfs
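

# Note (comment only, not part of the original module): both fetchers above rely on a small
# set of `_QueryMetadata` attributes, inferred from the accesses in their bodies:
#   - output_location: S3 path of the Athena result object (.txt / .csv)
#   - dtype, converters, parse_timestamps: passed straight through to s3.read_csv()
#   - parse_dates, binaries: consumed by _fix_csv_types() / _fix_csv_types_generator()
# plus whatever _apply_query_metadata() / _add_query_metadata_generator() attach to the
# resulting DataFrames.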
def _resolve_query_without_cache(  # pylint: disable=too-many-branches,too-many-locals,too-many-return-statements,too-many-statements
    sql: str,
    database: str,
    ctas_approach: bool,
    categories: Optional[List[str]],
    chunksize: Optional[Union[int, bool]],
    s3_output: Optional[str],
    workgroup: Optional[str],
    encryption: Optional[str],
    kms_key: Optional[str],
    keep_files: bool,
    ctas_temp_table_name: Optional[str],
    use_threads: bool,
    session: Optional[boto3.Session],
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """
    Execute any query on Athena and return the results as a DataFrame, back to `read_sql_query`.

    Usually called by `read_sql_query` when using the cache is not possible.
    """
    wg_config: Dict[str, Union[bool, Optional[str]]] = _get_workgroup_config(session=session, workgroup=workgroup)
    _s3_output: str = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=session)
    _s3_output = _s3_output[:-1] if _s3_output[-1] == "/" else _s3_output
    name: str = ""
    if ctas_approach is True:
        if ctas_temp_table_name is not None:
            name = catalog.sanitize_table_name(ctas_temp_table_name)
        else:
            name = f"temp_table_{pa.compat.guid()}"
        path: str = f"{_s3_output}/{name}"
        ext_location: str = "\n" if wg_config["enforced"] is True else f",\n    external_location = '{path}'\n"
        sql = (
            f'CREATE TABLE "{name}"\n'
            f"WITH(\n"
            f"    format = 'Parquet',\n"
            f"    parquet_compression = 'SNAPPY'"
            f"{ext_location}"
            f") AS\n"
            f"{sql}"
        )
    _logger.debug("sql: %s", sql)
    query_id: str = _start_query_execution(
        sql=sql,
        wg_config=wg_config,
        database=database,
        s3_output=_s3_output,
        workgroup=workgroup,
        encryption=encryption,
        kms_key=kms_key,
        boto3_session=session,
    )
    _logger.debug("query_id: %s", query_id)
    try:
        query_response: Dict[str, Any] = wait_query(query_execution_id=query_id, boto3_session=session)
    except exceptions.QueryFailed as ex:
        if ctas_approach is True:
            if "Column name not specified" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns names in your query. (E.g. 'SELECT MAX(col1) AS max_col1, ...')"
                )
            if "Column type is unknown" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns types in your query. "
                    "(E.g. 'SELECT CAST(NULL AS INTEGER) AS MY_COL, ...')"
                )
        raise ex  # pragma: no cover
    if query_response["QueryExecution"]["Status"]["State"] in ["FAILED", "CANCELLED"]:  # pragma: no cover
        reason: str = query_response["QueryExecution"]["Status"]["StateChangeReason"]
        message_error: str = f"Query error: {reason}"
        raise exceptions.AthenaQueryError(message_error)
    ret: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    if ctas_approach is True:
        catalog.delete_table_if_exists(database=database, table=name, boto3_session=session)
        manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv"
        metadata_path: str = f"{_s3_output}/tables/{query_id}.metadata"
        _logger.debug("manifest_path: %s", manifest_path)
        _logger.debug("metadata_path: %s", metadata_path)
        s3.wait_objects_exist(paths=[manifest_path, metadata_path], use_threads=False, boto3_session=session)
        paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session)
        chunked: Union[bool, int] = False if chunksize is None else chunksize
        _logger.debug("chunked: %s", chunked)
        if not paths:
            if chunked is False:
                return pd.DataFrame()
            return _utils.empty_generator()
        s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
        ret = s3.read_parquet(
            path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories
        )
        paths_delete: List[str] = paths + [manifest_path, metadata_path]
        _logger.debug(type(ret))
        if chunked is False:
            if keep_files is False:
                s3.delete_objects(path=paths_delete, use_threads=use_threads, boto3_session=session)
            return ret
        if keep_files is False:
            return _delete_after_iterate(dfs=ret, paths=paths_delete, use_threads=use_threads, boto3_session=session)
        return ret
    dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata(
        query_execution_id=query_id, categories=categories, boto3_session=session
    )
    path = f"{_s3_output}/{query_id}.csv"
    s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session)
    _logger.debug("Start CSV reading from %s", path)
    _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None
    _logger.debug("_chunksize: %s", _chunksize)
    ret = s3.read_csv(
        path=[path],
        dtype=dtype,
        parse_dates=parse_timestamps,
        converters=converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        na_values=[""],
        chunksize=_chunksize,
        skip_blank_lines=False,
        use_threads=False,
        boto3_session=session,
    )
    _logger.debug("Start type casting...")
    _logger.debug(type(ret))
    if chunksize is None:
        df = _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries)
        if keep_files is False:
            s3.delete_objects(path=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session)
        return df
    dfs = _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries)
    if keep_files is False:
        return _delete_after_iterate(
            dfs=dfs, paths=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session
        )
    return dfs
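

# Illustration (comment only, not part of the original module): with ctas_approach=True and a
# workgroup that does not enforce its own output location, the wrapping above turns a user query
# into roughly the following CTAS statement (names and paths are hypothetical placeholders):
#
#     CREATE TABLE "temp_table_abc123"
#     WITH(
#         format = 'Parquet',
#         parquet_compression = 'SNAPPY',
#         external_location = 's3://my-athena-results/temp_table_abc123'
#     ) AS
#     SELECT ...
#
# When the workgroup configuration is enforced, the external_location line is omitted and Athena
# writes to the workgroup's own output location instead.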
def read_sql_query(  # pylint: disable=too-many-branches,too-many-locals
    sql: str,
    database: str,
    ctas_approach: bool = True,
    categories: List[str] = None,
    chunksize: Optional[int] = None,
    s3_output: Optional[str] = None,
    workgroup: Optional[str] = None,
    encryption: Optional[str] = None,
    kms_key: Optional[str] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Execute any SQL query on AWS Athena and return the results as a Pandas DataFrame.

    There are two approaches, defined through the ctas_approach parameter:

    1 - `ctas_approach=True` (`Default`):
    Wrap the query with a CTAS and then read the table data as Parquet directly from S3.
    PROS: Faster and can handle some level of nested types.
    CONS: Requires create/delete table permissions on Glue and does not support timestamp with time zone
    (a temporary table will be created and then deleted immediately).

    2 - `ctas_approach=False`:
    Does a regular query on Athena and parses the regular CSV result on S3.
    PROS: Does not require create/delete table permissions on Glue and supports timestamp with time zone.
    CONS: Slower (but still faster than other libraries that use the regular Athena API)
    and does not handle nested types at all.

    Note
    ----
    If `chunksize` is passed, then a generator of DataFrames is returned.

    Note
    ----
    If `ctas_approach` is True, `chunksize` will return non-deterministic chunk sizes,
    but it is still useful to overcome memory limitations.

    Note
    ----
    Create the default Athena bucket if it doesn't exist and s3_output is None.
    (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)

    Note
    ----
    In case of `use_threads=True` the number of processes that will be spawned will be obtained
    from os.cpu_count().

    Parameters
    ----------
    sql : str
        SQL query.
    database : str
        AWS Glue/Athena database name.
    ctas_approach : bool
        Wraps the query using a CTAS and reads the resulting Parquet data on S3.
        If false, reads the regular CSV on S3.
    categories : List[str], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    chunksize : int, optional
        If specified, return a generator where chunksize is the number of rows to include in each chunk.
    s3_output : str, optional
        AWS S3 path.
    workgroup : str, optional
        Athena workgroup.
    encryption : str, optional
        None, 'SSE_S3', 'SSE_KMS', 'CSE_KMS'.
    kms_key : str, optional
        For SSE-KMS and CSE-KMS, this is the KMS key ARN or ID.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled, os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Union[pd.DataFrame, Iterator[pd.DataFrame]]
        Pandas DataFrame or Generator of Pandas DataFrames if chunksize is passed.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df = wr.athena.read_sql_query(sql='...', database='...')

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    wg_s3_output, _, _ = _ensure_workgroup(session=session, workgroup=workgroup)
    if s3_output is None:
        if wg_s3_output is None:
            _s3_output: str = create_athena_bucket(boto3_session=session)
        else:
            _s3_output = wg_s3_output
    else:
        _s3_output = s3_output
    name: str = ""
    if ctas_approach is True:
        name = f"temp_table_{pa.compat.guid()}"
        _s3_output = _s3_output[:-1] if _s3_output[-1] == "/" else _s3_output
        path: str = f"{_s3_output}/{name}"
        sql = (
            f"CREATE TABLE {name}\n"
            f"WITH(\n"
            f"    format = 'Parquet',\n"
            f"    parquet_compression = 'SNAPPY',\n"
            f"    external_location = '{path}'\n"
            f") AS\n"
            f"{sql}"
        )
    _logger.debug(f"sql: {sql}")
    query_id: str = start_query_execution(
        sql=sql,
        database=database,
        s3_output=_s3_output,
        workgroup=workgroup,
        encryption=encryption,
        kms_key=kms_key,
        boto3_session=session,
    )
    _logger.debug(f"query_id: {query_id}")
    query_response: Dict[str, Any] = wait_query(query_execution_id=query_id, boto3_session=session)
    if query_response["QueryExecution"]["Status"]["State"] in ["FAILED", "CANCELLED"]:  # pragma: no cover
        reason: str = query_response["QueryExecution"]["Status"]["StateChangeReason"]
        message_error: str = f"Query error: {reason}"
        raise exceptions.AthenaQueryError(message_error)
    dfs: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    if ctas_approach is True:
        catalog.delete_table_if_exists(database=database, table=name, boto3_session=session)
        manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv"
        paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session)
        chunked: bool = chunksize is not None
        _logger.debug(f"chunked: {chunked}")
        if not paths:
            if chunked is False:
                dfs = pd.DataFrame()
            else:
                dfs = _utils.empty_generator()
        else:
            s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
            dfs = s3.read_parquet(
                path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories
            )
        return dfs
    dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata(
        query_execution_id=query_id, categories=categories, boto3_session=session
    )
    path = f"{_s3_output}/{query_id}.csv"
    s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session)
    _logger.debug(f"Start CSV reading from {path}")
    ret = s3.read_csv(
        path=[path],
        dtype=dtype,
        parse_dates=parse_timestamps,
        converters=converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        na_values=[""],
        chunksize=chunksize,
        skip_blank_lines=False,
        use_threads=False,
        boto3_session=session,
    )
    _logger.debug("Start type casting...")
    if chunksize is None:
        return _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries)
    _logger.debug(type(ret))
    return _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries)
def read_sql_query(  # pylint: disable=too-many-branches,too-many-locals,too-many-return-statements,too-many-statements
    sql: str,
    database: str,
    ctas_approach: bool = True,
    categories: List[str] = None,
    chunksize: Optional[Union[int, bool]] = None,
    s3_output: Optional[str] = None,
    workgroup: Optional[str] = None,
    encryption: Optional[str] = None,
    kms_key: Optional[str] = None,
    keep_files: bool = True,
    ctas_temp_table_name: Optional[str] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Execute any SQL query on AWS Athena and return the results as a Pandas DataFrame.

    There are two approaches, defined through the ctas_approach parameter:

    1 - `ctas_approach=True` (`Default`):
    Wrap the query with a CTAS and then read the table data as Parquet directly from S3.
    PROS: Faster and can handle some level of nested types.
    CONS: Requires create/delete table permissions on Glue and does not support timestamp with time zone
    (a temporary table will be created and then deleted immediately).

    2 - `ctas_approach=False`:
    Does a regular query on Athena and parses the regular CSV result on S3.
    PROS: Does not require create/delete table permissions on Glue and supports timestamp with time zone.
    CONS: Slower (but still faster than other libraries that use the regular Athena API)
    and does not handle nested types at all.

    Note
    ----
    Valid encryption modes: [None, 'SSE_S3', 'SSE_KMS'].

    `P.S. 'CSE_KMS' is not supported.`

    Note
    ----
    Create the default Athena bucket if it doesn't exist and s3_output is None.
    (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)

    Note
    ----
    ``Batching`` (`chunksize` argument) (Memory Friendly):

    Will enable the function to return an Iterable of DataFrames instead of a regular DataFrame.

    There are two batching strategies on Wrangler:

    - If **chunksize=True**, a new DataFrame will be returned for each file in the query result.

    - If **chunksize=INTEGER**, Wrangler will iterate on the data by number of rows equal to the received INTEGER.

    `P.S.` `chunksize=True` is faster and uses less memory while `chunksize=INTEGER` is more precise
    in the number of rows for each DataFrame.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained
    from os.cpu_count().

    Parameters
    ----------
    sql : str
        SQL query.
    database : str
        AWS Glue/Athena database name.
    ctas_approach : bool
        Wraps the query using a CTAS and reads the resulting Parquet data on S3.
        If false, reads the regular CSV on S3.
    categories : List[str], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    chunksize : Union[int, bool], optional
        If passed, will split the data into an Iterable of DataFrames (Memory friendly).
        If `True`, Wrangler will iterate on the data by files in the most efficient way
        without guarantee of chunksize.
        If an `INTEGER` is passed, Wrangler will iterate on the data by number of rows
        equal to the received INTEGER.
    s3_output : str, optional
        AWS S3 path.
    workgroup : str, optional
        Athena workgroup.
    encryption : str, optional
        Valid values: [None, 'SSE_S3', 'SSE_KMS']. Notice: 'CSE_KMS' is not supported.
    kms_key : str, optional
        For SSE-KMS, this is the KMS key ARN or ID.
    keep_files : bool
        Should Wrangler delete or keep the staging files produced by Athena?
    ctas_temp_table_name : str, optional
        The name of the temporary table and also the directory name on S3 where the CTAS result is stored.
        If None, it will use the following random pattern: `f"temp_table_{pyarrow.compat.guid()}"`.
        On S3 this directory will be under the pattern: `f"{s3_output}/{ctas_temp_table_name}/"`.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled, os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Union[pd.DataFrame, Iterator[pd.DataFrame]]
        Pandas DataFrame or Generator of Pandas DataFrames if chunksize is passed.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df = wr.athena.read_sql_query(sql='...', database='...')

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    wg_config: Dict[str, Union[bool, Optional[str]]] = _get_workgroup_config(session=session, workgroup=workgroup)
    _s3_output: str = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=session)
    _s3_output = _s3_output[:-1] if _s3_output[-1] == "/" else _s3_output
    name: str = ""
    if ctas_approach is True:
        if ctas_temp_table_name is not None:
            name = catalog.sanitize_table_name(ctas_temp_table_name)
        else:
            name = f"temp_table_{pa.compat.guid()}"
        path: str = f"{_s3_output}/{name}"
        ext_location: str = "\n" if wg_config["enforced"] is True else f",\n    external_location = '{path}'\n"
        sql = (
            f'CREATE TABLE "{name}"\n'
            f"WITH(\n"
            f"    format = 'Parquet',\n"
            f"    parquet_compression = 'SNAPPY'"
            f"{ext_location}"
            f") AS\n"
            f"{sql}"
        )
    _logger.debug("sql: %s", sql)
    query_id: str = _start_query_execution(
        sql=sql,
        wg_config=wg_config,
        database=database,
        s3_output=_s3_output,
        workgroup=workgroup,
        encryption=encryption,
        kms_key=kms_key,
        boto3_session=session,
    )
    _logger.debug("query_id: %s", query_id)
    try:
        query_response: Dict[str, Any] = wait_query(query_execution_id=query_id, boto3_session=session)
    except exceptions.QueryFailed as ex:
        if ctas_approach is True:
            if "Column name not specified" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns names in your query. (E.g. 'SELECT MAX(col1) AS max_col1, ...')"
                )
            if "Column type is unknown" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns types in your query. "
                    "(E.g. 'SELECT CAST(NULL AS INTEGER) AS MY_COL, ...')"
                )
        raise ex  # pragma: no cover
    if query_response["QueryExecution"]["Status"]["State"] in ["FAILED", "CANCELLED"]:  # pragma: no cover
        reason: str = query_response["QueryExecution"]["Status"]["StateChangeReason"]
        message_error: str = f"Query error: {reason}"
        raise exceptions.AthenaQueryError(message_error)
    ret: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    if ctas_approach is True:
        catalog.delete_table_if_exists(database=database, table=name, boto3_session=session)
        manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv"
        metadata_path: str = f"{_s3_output}/tables/{query_id}.metadata"
        _logger.debug("manifest_path: %s", manifest_path)
        _logger.debug("metadata_path: %s", metadata_path)
        s3.wait_objects_exist(paths=[manifest_path, metadata_path], use_threads=False, boto3_session=session)
        paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session)
        chunked: Union[bool, int] = False if chunksize is None else chunksize
        _logger.debug("chunked: %s", chunked)
        if not paths:
            if chunked is False:
                return pd.DataFrame()
            return _utils.empty_generator()
        s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
        ret = s3.read_parquet(
            path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories
        )
        paths_delete: List[str] = paths + [manifest_path, metadata_path]
        _logger.debug(type(ret))
        if chunked is False:
            if keep_files is False:
                s3.delete_objects(path=paths_delete, use_threads=use_threads, boto3_session=session)
            return ret
        if keep_files is False:
            return _delete_after_iterate(dfs=ret, paths=paths_delete, use_threads=use_threads, boto3_session=session)
        return ret
    dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata(
        query_execution_id=query_id, categories=categories, boto3_session=session
    )
    path = f"{_s3_output}/{query_id}.csv"
    s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session)
    _logger.debug("Start CSV reading from %s", path)
    _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None
    _logger.debug("_chunksize: %s", _chunksize)
    ret = s3.read_csv(
        path=[path],
        dtype=dtype,
        parse_dates=parse_timestamps,
        converters=converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        na_values=[""],
        chunksize=_chunksize,
        skip_blank_lines=False,
        use_threads=False,
        boto3_session=session,
    )
    _logger.debug("Start type casting...")
    _logger.debug(type(ret))
    if chunksize is None:
        df = _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries)
        if keep_files is False:
            s3.delete_objects(path=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session)
        return df
    dfs = _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries)
    if keep_files is False:
        return _delete_after_iterate(
            dfs=dfs, paths=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session
        )
    return dfs
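

# Usage sketch (illustrative helper, not part of the original module). It exercises the two
# batching strategies described in the `read_sql_query` docstring above; the database, table,
# and temporary table names are hypothetical placeholders.
def _example_read_sql_query_batching() -> None:
    # Single DataFrame via the default CTAS approach, deleting the staging files afterwards.
    df = read_sql_query(sql="SELECT * FROM my_table", database="my_db", keep_files=False)
    print(df.shape)

    # chunksize=True: one DataFrame per Parquet file produced by the CTAS query
    # (fast and memory friendly, with no guarantee on the number of rows per chunk).
    for chunk in read_sql_query(
        sql="SELECT * FROM my_table",
        database="my_db",
        chunksize=True,
        ctas_temp_table_name="my_temp_table",
    ):
        print(len(chunk.index))

    # chunksize=<int>: chunks of roughly that many rows, here via the regular CSV result path.
    for chunk in read_sql_query(
        sql="SELECT * FROM my_table",
        database="my_db",
        ctas_approach=False,
        chunksize=100_000,
    ):
        print(len(chunk.index))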