def _resolve_query_with_cache(  # pylint: disable=too-many-return-statements
    cache_info,
    categories: Optional[List[str]],
    chunksize: Optional[Union[int, bool]],
    use_threads: bool,
    session: Optional[boto3.Session],
):
    """Fetch cached data and return it as a pandas DataFrame (or list of DataFrames)."""
    if cache_info["data_type"] == "parquet":
        manifest_path = cache_info["query_execution_info"]["Statistics"]["DataManifestLocation"]
        # this is needed just so we can access boto's modeled exceptions
        client_s3: boto3.client = _utils.client(service_name="s3", session=session)
        try:
            paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session)
        except (client_s3.exceptions.NoSuchBucket, client_s3.exceptions.NoSuchKey):  # pragma: no cover
            return None
        if all([s3.does_object_exist(path) for path in paths]):
            chunked: Union[bool, int] = False if chunksize is None else chunksize
            _logger.debug("chunked: %s", chunked)
            if not paths:  # pragma: no cover
                if chunked is False:
                    return pd.DataFrame()
                return _utils.empty_generator()
            ret = s3.read_parquet(
                path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories
            )
            _logger.debug(type(ret))
            return ret
    elif cache_info["data_type"] == "csv":
        dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata(
            query_execution_id=cache_info["query_execution_info"]["QueryExecutionId"],
            categories=categories,
            boto3_session=session,
        )
        path = cache_info["query_execution_info"]["ResultConfiguration"]["OutputLocation"]
        if s3.does_object_exist(path=path, boto3_session=session):
            _logger.debug("Start CSV reading from %s", path)
            _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None
            _logger.debug("_chunksize: %s", _chunksize)
            ret = s3.read_csv(
                path=[path],
                dtype=dtype,
                parse_dates=parse_timestamps,
                converters=converters,
                quoting=csv.QUOTE_ALL,
                keep_default_na=False,
                na_values=[""],
                chunksize=_chunksize,
                skip_blank_lines=False,
                use_threads=False,
                boto3_session=session,
            )
            _logger.debug("Start type casting...")
            _logger.debug(type(ret))
            if chunksize is None:
                df = _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries)
                return df
            dfs = _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries)
            return dfs
    raise exceptions.InvalidArgumentValue(f"Invalid data type: {cache_info['data_type']}.")  # pragma: no cover
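

# Note (comment only, not part of the original module): the `cache_info` argument above is
# expected to look roughly like the sketch below. The keys are inferred from the accesses
# inside the function; the concrete values are hypothetical placeholders.
#
#     {
#         "data_type": "parquet",  # or "csv"
#         "query_execution_info": {
#             "QueryExecutionId": "11111111-2222-3333-4444-555555555555",
#             "Statistics": {"DataManifestLocation": "s3://bucket/path/xyz-manifest.csv"},
#             "ResultConfiguration": {"OutputLocation": "s3://bucket/path/xyz.csv"},
#         },
#     }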
def _fetch_txt_result(
    query_metadata: _QueryMetadata,
    keep_files: bool,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> pd.DataFrame:
    if query_metadata.output_location is None or query_metadata.output_location.endswith(".txt") is False:
        return pd.DataFrame()
    path: str = query_metadata.output_location
    _logger.debug("Start TXT reading from %s", path)
    df = s3.read_csv(
        path=[path],
        dtype=query_metadata.dtype,
        parse_dates=query_metadata.parse_timestamps,
        converters=query_metadata.converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        skip_blank_lines=True,
        na_values=[],
        use_threads=False,
        boto3_session=boto3_session,
        names=list(query_metadata.dtype.keys()),
        sep="\t",
    )
    if keep_files is False:
        s3.delete_objects(
            path=[path, f"{path}.metadata"],
            use_threads=False,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
        )
    return df
def _fetch_csv_result(
    query_metadata: _QueryMetadata,
    keep_files: bool,
    chunksize: Optional[int],
    use_threads: bool,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, Any]],
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None
    _logger.debug("_chunksize: %s", _chunksize)
    if query_metadata.output_location is None or query_metadata.output_location.endswith(".csv") is False:
        chunked = _chunksize is not None
        return _empty_dataframe_response(chunked, query_metadata)
    path: str = query_metadata.output_location
    _logger.debug("Start CSV reading from %s", path)
    ret = s3.read_csv(
        path=[path],
        dtype=query_metadata.dtype,
        parse_dates=query_metadata.parse_timestamps,
        converters=query_metadata.converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        na_values=["", "NaN"],
        chunksize=_chunksize,
        skip_blank_lines=False,
        use_threads=False,
        boto3_session=boto3_session,
    )
    _logger.debug("Start type casting...")
    _logger.debug(type(ret))
    if _chunksize is None:
        df = _fix_csv_types(df=ret, parse_dates=query_metadata.parse_dates, binaries=query_metadata.binaries)
        df = _apply_query_metadata(df=df, query_metadata=query_metadata)
        if keep_files is False:
            s3.delete_objects(
                path=[path, f"{path}.metadata"],
                use_threads=use_threads,
                boto3_session=boto3_session,
                s3_additional_kwargs=s3_additional_kwargs,
            )
        return df
    dfs = _fix_csv_types_generator(dfs=ret, parse_dates=query_metadata.parse_dates, binaries=query_metadata.binaries)
    dfs = _add_query_metadata_generator(dfs=dfs, query_metadata=query_metadata)
    if keep_files is False:
        return _delete_after_iterate(
            dfs=dfs,
            paths=[path, f"{path}.metadata"],
            use_threads=use_threads,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
        )
    return dfs
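

# Note (comment only, not part of the original module): both fetchers above rely on a small
# set of `_QueryMetadata` attributes, inferred from the accesses in their bodies:
#   - output_location: S3 path of the Athena result object (.txt / .csv)
#   - dtype, converters, parse_timestamps: passed straight through to s3.read_csv()
#   - parse_dates, binaries: consumed by _fix_csv_types() / _fix_csv_types_generator()
# plus whatever _apply_query_metadata() / _add_query_metadata_generator() attach to the
# resulting DataFrames.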
def _resolve_query_without_cache(  # pylint: disable=too-many-branches,too-many-locals,too-many-return-statements,too-many-statements
    sql: str,
    database: str,
    ctas_approach: bool,
    categories: Optional[List[str]],
    chunksize: Optional[Union[int, bool]],
    s3_output: Optional[str],
    workgroup: Optional[str],
    encryption: Optional[str],
    kms_key: Optional[str],
    keep_files: bool,
    ctas_temp_table_name: Optional[str],
    use_threads: bool,
    session: Optional[boto3.Session],
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """
    Execute any query on Athena and return the results as a DataFrame, back to `read_sql_query`.

    Usually called by `read_sql_query` when using the cache is not possible.
    """
    wg_config: Dict[str, Union[bool, Optional[str]]] = _get_workgroup_config(session=session, workgroup=workgroup)
    _s3_output: str = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=session)
    _s3_output = _s3_output[:-1] if _s3_output[-1] == "/" else _s3_output
    name: str = ""
    if ctas_approach is True:
        if ctas_temp_table_name is not None:
            name = catalog.sanitize_table_name(ctas_temp_table_name)
        else:
            name = f"temp_table_{pa.compat.guid()}"
        path: str = f"{_s3_output}/{name}"
        ext_location: str = "\n" if wg_config["enforced"] is True else f",\n    external_location = '{path}'\n"
        sql = (
            f'CREATE TABLE "{name}"\n'
            f"WITH(\n"
            f"    format = 'Parquet',\n"
            f"    parquet_compression = 'SNAPPY'"
            f"{ext_location}"
            f") AS\n"
            f"{sql}"
        )
    _logger.debug("sql: %s", sql)
    query_id: str = _start_query_execution(
        sql=sql,
        wg_config=wg_config,
        database=database,
        s3_output=_s3_output,
        workgroup=workgroup,
        encryption=encryption,
        kms_key=kms_key,
        boto3_session=session,
    )
    _logger.debug("query_id: %s", query_id)
    try:
        query_response: Dict[str, Any] = wait_query(query_execution_id=query_id, boto3_session=session)
    except exceptions.QueryFailed as ex:
        if ctas_approach is True:
            if "Column name not specified" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns names in your query. (E.g. 'SELECT MAX(col1) AS max_col1, ...')"
                )
            if "Column type is unknown" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns types in your query. "
                    "(E.g. 'SELECT CAST(NULL AS INTEGER) AS MY_COL, ...')"
                )
        raise ex  # pragma: no cover
    if query_response["QueryExecution"]["Status"]["State"] in ["FAILED", "CANCELLED"]:  # pragma: no cover
        reason: str = query_response["QueryExecution"]["Status"]["StateChangeReason"]
        message_error: str = f"Query error: {reason}"
        raise exceptions.AthenaQueryError(message_error)
    ret: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    if ctas_approach is True:
        catalog.delete_table_if_exists(database=database, table=name, boto3_session=session)
        manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv"
        metadata_path: str = f"{_s3_output}/tables/{query_id}.metadata"
        _logger.debug("manifest_path: %s", manifest_path)
        _logger.debug("metadata_path: %s", metadata_path)
        s3.wait_objects_exist(paths=[manifest_path, metadata_path], use_threads=False, boto3_session=session)
        paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session)
        chunked: Union[bool, int] = False if chunksize is None else chunksize
        _logger.debug("chunked: %s", chunked)
        if not paths:
            if chunked is False:
                return pd.DataFrame()
            return _utils.empty_generator()
        s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
        ret = s3.read_parquet(
            path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories
        )
        paths_delete: List[str] = paths + [manifest_path, metadata_path]
        _logger.debug(type(ret))
        if chunked is False:
            if keep_files is False:
                s3.delete_objects(path=paths_delete, use_threads=use_threads, boto3_session=session)
            return ret
        if keep_files is False:
            return _delete_after_iterate(dfs=ret, paths=paths_delete, use_threads=use_threads, boto3_session=session)
        return ret
    dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata(
        query_execution_id=query_id, categories=categories, boto3_session=session
    )
    path = f"{_s3_output}/{query_id}.csv"
    s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session)
    _logger.debug("Start CSV reading from %s", path)
    _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None
    _logger.debug("_chunksize: %s", _chunksize)
    ret = s3.read_csv(
        path=[path],
        dtype=dtype,
        parse_dates=parse_timestamps,
        converters=converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        na_values=[""],
        chunksize=_chunksize,
        skip_blank_lines=False,
        use_threads=False,
        boto3_session=session,
    )
    _logger.debug("Start type casting...")
    _logger.debug(type(ret))
    if chunksize is None:
        df = _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries)
        if keep_files is False:
            s3.delete_objects(path=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session)
        return df
    dfs = _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries)
    if keep_files is False:
        return _delete_after_iterate(
            dfs=dfs, paths=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session
        )
    return dfs
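

# Illustration (comment only, not part of the original module): with ctas_approach=True and a
# workgroup that does not enforce its own output location, the wrapping above turns a user query
# into roughly the following CTAS statement (names and paths are hypothetical placeholders):
#
#     CREATE TABLE "temp_table_abc123"
#     WITH(
#         format = 'Parquet',
#         parquet_compression = 'SNAPPY',
#         external_location = 's3://my-athena-results/temp_table_abc123'
#     ) AS
#     SELECT ...
#
# When the workgroup configuration is enforced, the external_location line is omitted and Athena
# writes to the workgroup's own output location instead.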
def read_sql_query(  # pylint: disable=too-many-branches,too-many-locals
    sql: str,
    database: str,
    ctas_approach: bool = True,
    categories: List[str] = None,
    chunksize: Optional[int] = None,
    s3_output: Optional[str] = None,
    workgroup: Optional[str] = None,
    encryption: Optional[str] = None,
    kms_key: Optional[str] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Execute any SQL query on AWS Athena and return the results as a Pandas DataFrame.

    There are two approaches, defined through the ctas_approach parameter:

    1 - `ctas_approach=True` (`Default`):
    Wrap the query with a CTAS and then read the table data as Parquet directly from S3.
    PROS: Faster and can handle some level of nested types.
    CONS: Requires create/delete table permissions on Glue and does not support timestamp with time zone
    (a temporary table will be created and then deleted immediately).

    2 - `ctas_approach=False`:
    Does a regular query on Athena and parses the regular CSV result on S3.
    PROS: Does not require create/delete table permissions on Glue and supports timestamp with time zone.
    CONS: Slower (but still faster than other libraries that use the regular Athena API)
    and does not handle nested types at all.

    Note
    ----
    If `chunksize` is passed, then a generator of DataFrames is returned.

    Note
    ----
    If `ctas_approach` is True, `chunksize` will return non-deterministic chunk sizes,
    but it is still useful to overcome memory limitations.

    Note
    ----
    Create the default Athena bucket if it doesn't exist and s3_output is None.
    (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)

    Note
    ----
    In case of `use_threads=True` the number of processes that will be spawned will be obtained
    from os.cpu_count().

    Parameters
    ----------
    sql : str
        SQL query.
    database : str
        AWS Glue/Athena database name.
    ctas_approach : bool
        Wraps the query using a CTAS and reads the resulting Parquet data on S3.
        If false, reads the regular CSV on S3.
    categories : List[str], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    chunksize : int, optional
        If specified, return a generator where chunksize is the number of rows to include in each chunk.
    s3_output : str, optional
        AWS S3 path.
    workgroup : str, optional
        Athena workgroup.
    encryption : str, optional
        None, 'SSE_S3', 'SSE_KMS', 'CSE_KMS'.
    kms_key : str, optional
        For SSE-KMS and CSE-KMS, this is the KMS key ARN or ID.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled, os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Union[pd.DataFrame, Iterator[pd.DataFrame]]
        Pandas DataFrame or Generator of Pandas DataFrames if chunksize is passed.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df = wr.athena.read_sql_query(sql='...', database='...')

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    wg_s3_output, _, _ = _ensure_workgroup(session=session, workgroup=workgroup)
    if s3_output is None:
        if wg_s3_output is None:
            _s3_output: str = create_athena_bucket(boto3_session=session)
        else:
            _s3_output = wg_s3_output
    else:
        _s3_output = s3_output
    name: str = ""
    if ctas_approach is True:
        name = f"temp_table_{pa.compat.guid()}"
        _s3_output = _s3_output[:-1] if _s3_output[-1] == "/" else _s3_output
        path: str = f"{_s3_output}/{name}"
        sql = (
            f"CREATE TABLE {name}\n"
            f"WITH(\n"
            f"    format = 'Parquet',\n"
            f"    parquet_compression = 'SNAPPY',\n"
            f"    external_location = '{path}'\n"
            f") AS\n"
            f"{sql}"
        )
    _logger.debug(f"sql: {sql}")
    query_id: str = start_query_execution(
        sql=sql,
        database=database,
        s3_output=_s3_output,
        workgroup=workgroup,
        encryption=encryption,
        kms_key=kms_key,
        boto3_session=session,
    )
    _logger.debug(f"query_id: {query_id}")
    query_response: Dict[str, Any] = wait_query(query_execution_id=query_id, boto3_session=session)
    if query_response["QueryExecution"]["Status"]["State"] in ["FAILED", "CANCELLED"]:  # pragma: no cover
        reason: str = query_response["QueryExecution"]["Status"]["StateChangeReason"]
        message_error: str = f"Query error: {reason}"
        raise exceptions.AthenaQueryError(message_error)
    dfs: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    if ctas_approach is True:
        catalog.delete_table_if_exists(database=database, table=name, boto3_session=session)
        manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv"
        paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session)
        chunked: bool = chunksize is not None
        _logger.debug(f"chunked: {chunked}")
        if not paths:
            if chunked is False:
                dfs = pd.DataFrame()
            else:
                dfs = _utils.empty_generator()
        else:
            s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
            dfs = s3.read_parquet(
                path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories
            )
        return dfs
    dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata(
        query_execution_id=query_id, categories=categories, boto3_session=session
    )
    path = f"{_s3_output}/{query_id}.csv"
    s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session)
    _logger.debug(f"Start CSV reading from {path}")
    ret = s3.read_csv(
        path=[path],
        dtype=dtype,
        parse_dates=parse_timestamps,
        converters=converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        na_values=[""],
        chunksize=chunksize,
        skip_blank_lines=False,
        use_threads=False,
        boto3_session=session,
    )
    _logger.debug("Start type casting...")
    if chunksize is None:
        return _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries)
    _logger.debug(type(ret))
    return _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries)
def read_sql_query(  # pylint: disable=too-many-branches,too-many-locals,too-many-return-statements,too-many-statements
    sql: str,
    database: str,
    ctas_approach: bool = True,
    categories: List[str] = None,
    chunksize: Optional[Union[int, bool]] = None,
    s3_output: Optional[str] = None,
    workgroup: Optional[str] = None,
    encryption: Optional[str] = None,
    kms_key: Optional[str] = None,
    keep_files: bool = True,
    ctas_temp_table_name: Optional[str] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Execute any SQL query on AWS Athena and return the results as a Pandas DataFrame.

    There are two approaches, defined through the ctas_approach parameter:

    1 - `ctas_approach=True` (`Default`):
    Wrap the query with a CTAS and then read the table data as Parquet directly from S3.
    PROS: Faster and can handle some level of nested types.
    CONS: Requires create/delete table permissions on Glue and does not support timestamp with time zone
    (a temporary table will be created and then deleted immediately).

    2 - `ctas_approach=False`:
    Does a regular query on Athena and parses the regular CSV result on S3.
    PROS: Does not require create/delete table permissions on Glue and supports timestamp with time zone.
    CONS: Slower (but still faster than other libraries that use the regular Athena API)
    and does not handle nested types at all.

    Note
    ----
    Valid encryption modes: [None, 'SSE_S3', 'SSE_KMS'].

    `P.S. 'CSE_KMS' is not supported.`

    Note
    ----
    Create the default Athena bucket if it doesn't exist and s3_output is None.
    (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)

    Note
    ----
    ``Batching`` (`chunksize` argument) (Memory Friendly):

    Will enable the function to return an Iterable of DataFrames instead of a regular DataFrame.

    There are two batching strategies on Wrangler:

    - If **chunksize=True**, a new DataFrame will be returned for each file in the query result.

    - If **chunksize=INTEGER**, Wrangler will iterate on the data by number of rows equal to the received INTEGER.

    `P.S.` `chunksize=True` is faster and uses less memory while `chunksize=INTEGER` is more precise
    in the number of rows for each DataFrame.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained
    from os.cpu_count().

    Parameters
    ----------
    sql : str
        SQL query.
    database : str
        AWS Glue/Athena database name.
    ctas_approach : bool
        Wraps the query using a CTAS and reads the resulting Parquet data on S3.
        If false, reads the regular CSV on S3.
    categories : List[str], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    chunksize : Union[int, bool], optional
        If passed, will split the data into an Iterable of DataFrames (Memory friendly).
        If `True`, Wrangler will iterate on the data by files in the most efficient way
        without guarantee of chunksize.
        If an `INTEGER` is passed, Wrangler will iterate on the data by number of rows
        equal to the received INTEGER.
    s3_output : str, optional
        AWS S3 path.
    workgroup : str, optional
        Athena workgroup.
    encryption : str, optional
        Valid values: [None, 'SSE_S3', 'SSE_KMS']. Notice: 'CSE_KMS' is not supported.
    kms_key : str, optional
        For SSE-KMS, this is the KMS key ARN or ID.
    keep_files : bool
        Should Wrangler delete or keep the staging files produced by Athena?
    ctas_temp_table_name : str, optional
        The name of the temporary table and also the directory name on S3 where the CTAS result is stored.
        If None, it will use the following random pattern: `f"temp_table_{pyarrow.compat.guid()}"`.
        On S3 this directory will be under the pattern: `f"{s3_output}/{ctas_temp_table_name}/"`.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled, os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Union[pd.DataFrame, Iterator[pd.DataFrame]]
        Pandas DataFrame or Generator of Pandas DataFrames if chunksize is passed.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df = wr.athena.read_sql_query(sql='...', database='...')

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    wg_config: Dict[str, Union[bool, Optional[str]]] = _get_workgroup_config(session=session, workgroup=workgroup)
    _s3_output: str = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=session)
    _s3_output = _s3_output[:-1] if _s3_output[-1] == "/" else _s3_output
    name: str = ""
    if ctas_approach is True:
        if ctas_temp_table_name is not None:
            name = catalog.sanitize_table_name(ctas_temp_table_name)
        else:
            name = f"temp_table_{pa.compat.guid()}"
        path: str = f"{_s3_output}/{name}"
        ext_location: str = "\n" if wg_config["enforced"] is True else f",\n    external_location = '{path}'\n"
        sql = (
            f'CREATE TABLE "{name}"\n'
            f"WITH(\n"
            f"    format = 'Parquet',\n"
            f"    parquet_compression = 'SNAPPY'"
            f"{ext_location}"
            f") AS\n"
            f"{sql}"
        )
    _logger.debug("sql: %s", sql)
    query_id: str = _start_query_execution(
        sql=sql,
        wg_config=wg_config,
        database=database,
        s3_output=_s3_output,
        workgroup=workgroup,
        encryption=encryption,
        kms_key=kms_key,
        boto3_session=session,
    )
    _logger.debug("query_id: %s", query_id)
    try:
        query_response: Dict[str, Any] = wait_query(query_execution_id=query_id, boto3_session=session)
    except exceptions.QueryFailed as ex:
        if ctas_approach is True:
            if "Column name not specified" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns names in your query. (E.g. 'SELECT MAX(col1) AS max_col1, ...')"
                )
            if "Column type is unknown" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns types in your query. "
                    "(E.g. 'SELECT CAST(NULL AS INTEGER) AS MY_COL, ...')"
                )
        raise ex  # pragma: no cover
    if query_response["QueryExecution"]["Status"]["State"] in ["FAILED", "CANCELLED"]:  # pragma: no cover
        reason: str = query_response["QueryExecution"]["Status"]["StateChangeReason"]
        message_error: str = f"Query error: {reason}"
        raise exceptions.AthenaQueryError(message_error)
    ret: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    if ctas_approach is True:
        catalog.delete_table_if_exists(database=database, table=name, boto3_session=session)
        manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv"
        metadata_path: str = f"{_s3_output}/tables/{query_id}.metadata"
        _logger.debug("manifest_path: %s", manifest_path)
        _logger.debug("metadata_path: %s", metadata_path)
        s3.wait_objects_exist(paths=[manifest_path, metadata_path], use_threads=False, boto3_session=session)
        paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session)
        chunked: Union[bool, int] = False if chunksize is None else chunksize
        _logger.debug("chunked: %s", chunked)
        if not paths:
            if chunked is False:
                return pd.DataFrame()
            return _utils.empty_generator()
        s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
        ret = s3.read_parquet(
            path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories
        )
        paths_delete: List[str] = paths + [manifest_path, metadata_path]
        _logger.debug(type(ret))
        if chunked is False:
            if keep_files is False:
                s3.delete_objects(path=paths_delete, use_threads=use_threads, boto3_session=session)
            return ret
        if keep_files is False:
            return _delete_after_iterate(dfs=ret, paths=paths_delete, use_threads=use_threads, boto3_session=session)
        return ret
    dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata(
        query_execution_id=query_id, categories=categories, boto3_session=session
    )
    path = f"{_s3_output}/{query_id}.csv"
    s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session)
    _logger.debug("Start CSV reading from %s", path)
    _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None
    _logger.debug("_chunksize: %s", _chunksize)
    ret = s3.read_csv(
        path=[path],
        dtype=dtype,
        parse_dates=parse_timestamps,
        converters=converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        na_values=[""],
        chunksize=_chunksize,
        skip_blank_lines=False,
        use_threads=False,
        boto3_session=session,
    )
    _logger.debug("Start type casting...")
    _logger.debug(type(ret))
    if chunksize is None:
        df = _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries)
        if keep_files is False:
            s3.delete_objects(path=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session)
        return df
    dfs = _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries)
    if keep_files is False:
        return _delete_after_iterate(
            dfs=dfs, paths=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session
        )
    return dfs
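

# Usage sketch (illustrative helper, not part of the original module). It exercises the two
# batching strategies described in the `read_sql_query` docstring above; the database, table,
# and temporary table names are hypothetical placeholders.
def _example_read_sql_query_batching() -> None:
    # Single DataFrame via the default CTAS approach, deleting the staging files afterwards.
    df = read_sql_query(sql="SELECT * FROM my_table", database="my_db", keep_files=False)
    print(df.shape)

    # chunksize=True: one DataFrame per Parquet file produced by the CTAS query
    # (fast and memory friendly, with no guarantee on the number of rows per chunk).
    for chunk in read_sql_query(
        sql="SELECT * FROM my_table",
        database="my_db",
        chunksize=True,
        ctas_temp_table_name="my_temp_table",
    ):
        print(len(chunk.index))

    # chunksize=<int>: chunks of roughly that many rows, here via the regular CSV result path.
    for chunk in read_sql_query(
        sql="SELECT * FROM my_table",
        database="my_db",
        ctas_approach=False,
        chunksize=100_000,
    ):
        print(len(chunk.index))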