def _add_query_metadata_generator(
    dfs: Iterator[pd.DataFrame], query_metadata: _QueryMetadata
) -> Iterator[pd.DataFrame]:
    """Add Query Execution metadata to every DF in iterator."""
    for df in dfs:
        df = _apply_query_metadata(df=df, query_metadata=query_metadata)
        yield df
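# Hedged usage sketch (not part of the library): _add_query_metadata_generator is a thin
# wrapper used when a chunked read returns an iterator of DataFrames, so that every chunk
# is annotated the same way as a single-DataFrame result. ``chunks``, ``metadata`` and
# ``session`` below are hypothetical stand-ins for values built elsewhere in the module.
#
#     chunks = s3.read_csv(path=[metadata.output_location], chunksize=100_000, boto3_session=session)
#     for chunk in _add_query_metadata_generator(dfs=chunks, query_metadata=metadata):
#         ...  # each chunk now carries the query execution metadata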
def _fetch_csv_result(
    query_metadata: _QueryMetadata,
    keep_files: bool,
    chunksize: Optional[int],
    use_threads: bool,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, Any]],
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None
    _logger.debug("_chunksize: %s", _chunksize)
    if query_metadata.output_location is None or query_metadata.output_location.endswith(".csv") is False:
        chunked = _chunksize is not None
        return _empty_dataframe_response(chunked, query_metadata)
    path: str = query_metadata.output_location
    _logger.debug("Start CSV reading from %s", path)
    ret = s3.read_csv(
        path=[path],
        dtype=query_metadata.dtype,
        parse_dates=query_metadata.parse_timestamps,
        converters=query_metadata.converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        na_values=["", "NaN"],
        chunksize=_chunksize,
        skip_blank_lines=False,
        use_threads=False,
        boto3_session=boto3_session,
    )
    _logger.debug("Start type casting...")
    _logger.debug(type(ret))
    if _chunksize is None:
        # Single DataFrame: cast CSV types, attach query metadata and optionally clean up S3.
        df = _fix_csv_types(df=ret, parse_dates=query_metadata.parse_dates, binaries=query_metadata.binaries)
        df = _apply_query_metadata(df=df, query_metadata=query_metadata)
        if keep_files is False:
            s3.delete_objects(
                path=[path, f"{path}.metadata"],
                use_threads=use_threads,
                boto3_session=boto3_session,
                s3_additional_kwargs=s3_additional_kwargs,
            )
        return df
    # Chunked read: wrap the iterator so each chunk is cast and annotated lazily.
    dfs = _fix_csv_types_generator(dfs=ret, parse_dates=query_metadata.parse_dates, binaries=query_metadata.binaries)
    dfs = _add_query_metadata_generator(dfs=dfs, query_metadata=query_metadata)
    if keep_files is False:
        return _delete_after_iterate(
            dfs=dfs,
            paths=[path, f"{path}.metadata"],
            use_threads=use_threads,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
        )
    return dfs
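# Hedged usage sketch (not part of the library): how _fetch_csv_result might be driven by a
# caller that has already started and waited for an Athena query. ``metadata`` is a
# hypothetical _QueryMetadata describing the finished execution; whether a single DataFrame
# or an iterator of chunks comes back depends solely on ``chunksize``.
#
#     result = _fetch_csv_result(
#         query_metadata=metadata,
#         keep_files=False,          # delete the .csv and .csv.metadata objects afterwards
#         chunksize=None,            # None -> single DataFrame; an int -> iterator of chunks
#         use_threads=True,
#         boto3_session=boto3.Session(),
#         s3_additional_kwargs=None,
#     )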
def _fetch_parquet_result(
    query_metadata: _QueryMetadata,
    keep_files: bool,
    categories: Optional[List[str]],
    chunksize: Optional[int],
    use_threads: bool,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, Any]],
    temp_table_fqn: Optional[str] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    ret: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    chunked: Union[bool, int] = False if chunksize is None else chunksize
    _logger.debug("chunked: %s", chunked)
    if query_metadata.manifest_location is None:
        return _empty_dataframe_response(bool(chunked), query_metadata)
    manifest_path: str = query_metadata.manifest_location
    metadata_path: str = manifest_path.replace("-manifest.csv", ".metadata")
    _logger.debug("manifest_path: %s", manifest_path)
    _logger.debug("metadata_path: %s", metadata_path)
    paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=boto3_session)
    if not paths:
        # Empty result set: rebuild a typed, empty DataFrame from the temp table schema if available.
        if not temp_table_fqn:
            raise exceptions.EmptyDataFrame("Query would return untyped, empty dataframe.")
        database, temp_table_name = map(lambda x: x.replace('"', ""), temp_table_fqn.split("."))
        dtype_dict = catalog.get_table_types(database=database, table=temp_table_name)
        df = pd.DataFrame(columns=list(dtype_dict.keys()))
        df = cast_pandas_with_athena_types(df=df, dtype=dtype_dict)
        df = _apply_query_metadata(df=df, query_metadata=query_metadata)
        return df
    ret = s3.read_parquet(
        path=paths,
        use_threads=use_threads,
        boto3_session=boto3_session,
        chunked=chunked,
        categories=categories,
        ignore_index=True,
    )
    if chunked is False:
        ret = _apply_query_metadata(df=ret, query_metadata=query_metadata)
    else:
        ret = _add_query_metadata_generator(dfs=ret, query_metadata=query_metadata)
    paths_delete: List[str] = paths + [manifest_path, metadata_path]
    _logger.debug("type(ret): %s", type(ret))
    if chunked is False:
        # Single DataFrame: optionally delete the data files plus the manifest and metadata objects.
        if keep_files is False:
            s3.delete_objects(
                path=paths_delete,
                use_threads=use_threads,
                boto3_session=boto3_session,
                s3_additional_kwargs=s3_additional_kwargs,
            )
        return ret
    # Chunked read: defer deletion until the iterator has been fully consumed.
    if keep_files is False:
        return _delete_after_iterate(
            dfs=ret,
            paths=paths_delete,
            use_threads=use_threads,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
        )
    return ret
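# Hedged usage sketch (not part of the library): the CTAS/Parquet path. ``metadata`` is a
# hypothetical _QueryMetadata whose manifest_location points at the "-manifest.csv" file
# written by the CTAS query; passing the temporary table's fully qualified name lets an
# empty result come back as a typed (but empty) DataFrame instead of raising EmptyDataFrame.
#
#     result = _fetch_parquet_result(
#         query_metadata=metadata,
#         keep_files=False,
#         categories=None,
#         chunksize=None,
#         use_threads=True,
#         boto3_session=boto3.Session(),
#         s3_additional_kwargs=None,
#         temp_table_fqn='"my_database"."temp_table_xyz"',  # hypothetical temp table name
#     )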