def _fetch_txt_result(
    query_metadata: _QueryMetadata, keep_files: bool, boto3_session: boto3.Session,
) -> pd.DataFrame:
    if query_metadata.output_location is None or query_metadata.output_location.endswith(".txt") is False:
        return pd.DataFrame()
    path: str = query_metadata.output_location
    s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=boto3_session)
    _logger.debug("Start TXT reading from %s", path)
    df = s3.read_csv(
        path=[path],
        dtype=query_metadata.dtype,
        parse_dates=query_metadata.parse_timestamps,
        converters=query_metadata.converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        skip_blank_lines=True,
        na_values=[],
        use_threads=False,
        boto3_session=boto3_session,
        names=list(query_metadata.dtype.keys()),
        sep="\t",
    )
    if keep_files is False:
        s3.delete_objects(path=[path, f"{path}.metadata"], use_threads=False, boto3_session=boto3_session)
    return df
def _fetch_csv_result(
    query_metadata: _QueryMetadata,
    keep_files: bool,
    chunksize: Optional[int],
    use_threads: bool,
    boto3_session: boto3.Session,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None
    _logger.debug("_chunksize: %s", _chunksize)
    if query_metadata.output_location is None or query_metadata.output_location.endswith(".csv") is False:
        chunked = _chunksize is not None
        return _empty_dataframe_response(chunked, query_metadata)
    path: str = query_metadata.output_location
    s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=boto3_session)
    _logger.debug("Start CSV reading from %s", path)
    ret = s3.read_csv(
        path=[path],
        dtype=query_metadata.dtype,
        parse_dates=query_metadata.parse_timestamps,
        converters=query_metadata.converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        na_values=["", "NaN"],
        chunksize=_chunksize,
        skip_blank_lines=False,
        use_threads=False,
        boto3_session=boto3_session,
    )
    _logger.debug("Start type casting...")
    _logger.debug(type(ret))
    if _chunksize is None:
        df = _fix_csv_types(df=ret, parse_dates=query_metadata.parse_dates, binaries=query_metadata.binaries)
        df = _apply_query_metadata(df=df, query_metadata=query_metadata)
        if keep_files is False:
            s3.delete_objects(path=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=boto3_session)
        return df
    dfs = _fix_csv_types_generator(dfs=ret, parse_dates=query_metadata.parse_dates, binaries=query_metadata.binaries)
    dfs = _add_query_metadata_generator(dfs=dfs, query_metadata=query_metadata)
    if keep_files is False:
        return _delete_after_iterate(
            dfs=dfs, paths=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=boto3_session
        )
    return dfs
def _fetch_parquet_result(
    query_metadata: _QueryMetadata,
    keep_files: bool,
    categories: Optional[List[str]],
    chunksize: Optional[int],
    use_threads: bool,
    boto3_session: boto3.Session,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    ret: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    chunked: Union[bool, int] = False if chunksize is None else chunksize
    _logger.debug("chunked: %s", chunked)
    if query_metadata.manifest_location is None:
        return _empty_dataframe_response(bool(chunked), query_metadata)
    manifest_path: str = query_metadata.manifest_location
    metadata_path: str = manifest_path.replace("-manifest.csv", ".metadata")
    _logger.debug("manifest_path: %s", manifest_path)
    _logger.debug("metadata_path: %s", metadata_path)
    s3.wait_objects_exist(paths=[manifest_path], use_threads=False, boto3_session=boto3_session)
    paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=boto3_session)
    if not paths:
        return _empty_dataframe_response(bool(chunked), query_metadata)
    s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=boto3_session)
    ret = s3.read_parquet(
        path=paths, use_threads=use_threads, boto3_session=boto3_session, chunked=chunked, categories=categories
    )
    if chunked is False:
        ret = _apply_query_metadata(df=ret, query_metadata=query_metadata)
    else:
        ret = _add_query_metadata_generator(dfs=ret, query_metadata=query_metadata)
    paths_delete: List[str] = paths + [manifest_path, metadata_path]
    _logger.debug("type(ret): %s", type(ret))
    if chunked is False:
        if keep_files is False:
            s3.delete_objects(path=paths_delete, use_threads=use_threads, boto3_session=boto3_session)
        return ret
    if keep_files is False:
        return _delete_after_iterate(dfs=ret, paths=paths_delete, use_threads=use_threads, boto3_session=boto3_session)
    return ret
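# Usage sketch (illustrative, not part of the library code): the three fetchers above
# share one chunking contract (no chunksize -> a single DataFrame, chunksize/chunked
# set -> an iterator of DataFrames), consumed here through the public
# wr.athena.read_sql_query entry point documented later in this section.
# The database/table names are placeholders.
def _chunked_consumption_sketch() -> None:
    import awswrangler as wr

    # Whole result materialized at once (chunksize=None).
    df = wr.athena.read_sql_query(sql="SELECT * FROM my_table", database="my_db")
    print(len(df.index))

    # Iterator of DataFrames with roughly bounded memory (integer chunksize).
    for chunk in wr.athena.read_sql_query(sql="SELECT * FROM my_table", database="my_db", chunksize=100_000):
        print(len(chunk.index))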
def copy_files_to_redshift(  # pylint: disable=too-many-locals,too-many-arguments
    path: Union[str, List[str]],
    manifest_directory: str,
    con: sqlalchemy.engine.Engine,
    table: str,
    schema: str,
    iam_role: str,
    parquet_infer_sampling: float = 1.0,
    mode: str = "append",
    diststyle: str = "AUTO",
    distkey: Optional[str] = None,
    sortstyle: str = "COMPOUND",
    sortkey: Optional[List[str]] = None,
    primary_keys: Optional[List[str]] = None,
    varchar_lengths_default: int = 256,
    varchar_lengths: Optional[Dict[str, int]] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
) -> None:
    """Load Parquet files from S3 to a Table on Amazon Redshift (through the COPY command).

    https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html

    This function accepts Unix shell-style wildcards in the path argument.
    * (matches everything), ? (matches any single character),
    [seq] (matches any character in seq), [!seq] (matches any character not in seq).

    Note
    ----
    If the table does not exist yet, it will be automatically created for you
    using the Parquet metadata to infer the columns data types.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (accepts Unix shell-style wildcards) (e.g. s3://bucket/prefix) or
        list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
    manifest_directory : str
        S3 prefix (e.g. s3://bucket/prefix)
    con : sqlalchemy.engine.Engine
        SQLAlchemy Engine. Please use,
        wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine()
    table : str
        Table name
    schema : str
        Schema name
    iam_role : str
        AWS IAM role with the related permissions.
    parquet_infer_sampling : float
        Random sample ratio of files that will have the metadata inspected.
        Must be `0.0 < sampling <= 1.0`.
        The higher, the more accurate.
        The lower, the faster.
    mode : str
        Append, overwrite or upsert.
    diststyle : str
        Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"].
        https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
    distkey : str, optional
        Specifies a column name or positional number for the distribution key.
    sortstyle : str
        Sorting can be "COMPOUND" or "INTERLEAVED".
        https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html
    sortkey : List[str], optional
        List of columns to be sorted.
    primary_keys : List[str], optional
        Primary keys.
    varchar_lengths_default : int
        The size that will be set for all VARCHAR columns not specified with varchar_lengths.
    varchar_lengths : Dict[str, int], optional
        Dict of VARCHAR length by columns. (e.g. {"col1": 10, "col5": 200}).
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
    s3_additional_kwargs:
        Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass",
        "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging".
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.db.copy_files_to_redshift(
    ...     path="s3://bucket/my_parquet_files/",
    ...     con=wr.catalog.get_engine(connection="my_glue_conn_name"),
    ...     table="my_table",
    ...     schema="public",
    ...     iam_role="arn:aws:iam::XXX:role/XXX"
    ... )

    """
    _varchar_lengths: Dict[str, int] = {} if varchar_lengths is None else varchar_lengths
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    paths: List[str] = _path2list(path=path, boto3_session=session)  # pylint: disable=protected-access
    manifest_directory = manifest_directory if manifest_directory.endswith("/") else f"{manifest_directory}/"
    manifest_path: str = f"{manifest_directory}manifest.json"
    write_redshift_copy_manifest(
        manifest_path=manifest_path,
        paths=paths,
        use_threads=use_threads,
        boto3_session=session,
        s3_additional_kwargs=s3_additional_kwargs,
    )
    s3.wait_objects_exist(paths=paths + [manifest_path], use_threads=False, boto3_session=session)
    athena_types, _ = s3.read_parquet_metadata(
        path=paths, sampling=parquet_infer_sampling, dataset=False, use_threads=use_threads, boto3_session=session
    )
    _logger.debug("athena_types: %s", athena_types)
    redshift_types: Dict[str, str] = {}
    for col_name, col_type in athena_types.items():
        length: int = _varchar_lengths[col_name] if col_name in _varchar_lengths else varchar_lengths_default
        redshift_types[col_name] = _data_types.athena2redshift(dtype=col_type, varchar_length=length)
    with con.begin() as _con:
        created_table, created_schema = _rs_create_table(
            con=_con,
            table=table,
            schema=schema,
            redshift_types=redshift_types,
            mode=mode,
            diststyle=diststyle,
            sortstyle=sortstyle,
            distkey=distkey,
            sortkey=sortkey,
            primary_keys=primary_keys,
        )
        _rs_copy(
            con=_con,
            table=created_table,
            schema=created_schema,
            manifest_path=manifest_path,
            iam_role=iam_role,
            num_files=len(paths),
        )
        if table != created_table:  # upsert
            _rs_upsert(con=_con, schema=schema, table=table, temp_table=created_table, primary_keys=primary_keys)
    s3.delete_objects(path=[manifest_path], use_threads=use_threads, boto3_session=session)
def unload_redshift(
    sql: str,
    path: str,
    con: sqlalchemy.engine.Engine,
    iam_role: str,
    region: Optional[str] = None,
    max_file_size: Optional[float] = None,
    kms_key_id: Optional[str] = None,
    categories: Optional[List[str]] = None,
    chunked: Union[bool, int] = False,
    keep_files: bool = False,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Load a Pandas DataFrame from an Amazon Redshift query result, using Parquet files on S3 as a stage.

    This is a **HIGH** latency and **HIGH** throughput alternative to
    `wr.db.read_sql_query()`/`wr.db.read_sql_table()` to extract large
    Amazon Redshift data into a Pandas DataFrame through the **UNLOAD command**.

    This strategy has more overhead and requires more IAM privileges
    than the regular `wr.db.read_sql_query()`/`wr.db.read_sql_table()` function,
    so it is only recommended to fetch +1MM rows at once.

    https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html

    Note
    ----
    ``Batching`` (`chunked` argument) (Memory Friendly):

    Will enable the function to return an Iterable of DataFrames instead of a regular DataFrame.

    There are two batching strategies on Wrangler:

    - If **chunked=True**, a new DataFrame will be returned for each file in your path/dataset.

    - If **chunked=INTEGER**, Wrangler will iterate on the data by number of rows equal to the received INTEGER.

    `P.S.` `chunked=True` is faster and uses less memory while `chunked=INTEGER` is more precise
    in the number of rows for each DataFrame.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    sql: str
        SQL query.
    path : Union[str, List[str]]
        S3 path to write stage files (e.g. s3://bucket_name/any_name/)
    con : sqlalchemy.engine.Engine
        SQLAlchemy Engine. Please use,
        wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine()
    iam_role : str
        AWS IAM role with the related permissions.
    region : str, optional
        Specifies the AWS Region where the target Amazon S3 bucket is located.
        REGION is required for UNLOAD to an Amazon S3 bucket that isn't in the
        same AWS Region as the Amazon Redshift cluster. By default, UNLOAD
        assumes that the target Amazon S3 bucket is located in the same AWS
        Region as the Amazon Redshift cluster.
    max_file_size : float, optional
        Specifies the maximum size (MB) of files that UNLOAD creates in Amazon S3.
        Specify a decimal value between 5.0 MB and 6200.0 MB.
        If None, the default maximum file size is 6200.0 MB.
    kms_key_id : str, optional
        Specifies the key ID for an AWS Key Management Service (AWS KMS) key to be
        used to encrypt data files on Amazon S3.
    categories: List[str], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    keep_files : bool
        Should keep the stage files?
    chunked : Union[int, bool]
        If passed will split the data in an Iterable of DataFrames (Memory friendly).
        If `True` Wrangler will iterate on the data by files in the most efficient way without guarantee of chunksize.
        If an `INTEGER` is passed Wrangler will iterate on the data by number of rows equal to the received INTEGER.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
    s3_additional_kwargs:
        Forward to botocore requests, only "SSECustomerAlgorithm" and "SSECustomerKey" arguments will be considered.

    Returns
    -------
    Union[pandas.DataFrame, Iterator[pandas.DataFrame]]
        Result as Pandas DataFrame(s).

    Examples
    --------
    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> df = wr.db.unload_redshift(
    ...     sql="SELECT * FROM public.mytable",
    ...     path="s3://bucket/extracted_parquet_files/",
    ...     con=wr.catalog.get_engine(connection="my_glue_connection"),
    ...     iam_role="arn:aws:iam::XXX:role/XXX"
    ... )

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    paths: List[str] = unload_redshift_to_files(
        sql=sql,
        path=path,
        con=con,
        iam_role=iam_role,
        region=region,
        max_file_size=max_file_size,
        kms_key_id=kms_key_id,
        use_threads=use_threads,
        boto3_session=session,
    )
    s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
    if chunked is False:
        if not paths:
            return pd.DataFrame()
        df: pd.DataFrame = s3.read_parquet(
            path=paths,
            categories=categories,
            chunked=chunked,
            dataset=False,
            use_threads=use_threads,
            boto3_session=session,
            s3_additional_kwargs=s3_additional_kwargs,
        )
        if keep_files is False:
            s3.delete_objects(path=paths, use_threads=use_threads, boto3_session=session)
        return df
    if not paths:
        return _utils.empty_generator()
    return _read_parquet_iterator(
        paths=paths,
        categories=categories,
        chunked=chunked,
        use_threads=use_threads,
        boto3_session=session,
        s3_additional_kwargs=s3_additional_kwargs,
        keep_files=keep_files,
    )
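# Usage sketch (illustrative placeholders only): chunked=True yields one DataFrame per
# unloaded Parquet file (fastest, chunk size not guaranteed); stage files are removed
# after iteration because keep_files defaults to False. Connection and bucket names
# are assumptions.
def _unload_redshift_chunked_sketch() -> None:
    import awswrangler as wr

    engine = wr.catalog.get_engine(connection="my_glue_connection")  # placeholder Glue connection
    dfs = wr.db.unload_redshift(
        sql="SELECT * FROM public.mytable",
        path="s3://bucket/extracted_parquet_files/",
        con=engine,
        iam_role="arn:aws:iam::XXX:role/XXX",
        chunked=True,
    )
    for df in dfs:
        print(len(df.index))  # stand-in for real per-chunk processing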
    path = path if path.endswith("/") else f"{path}/"
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    paths: List[str] = s3.to_parquet(
        df=df,
        path=path,
        index=index,
        dataset=True,
        mode="append",
        dtype=dtype,
        use_threads=use_threads,
        boto3_session=session,
        s3_additional_kwargs=s3_additional_kwargs,
        max_rows_by_file=max_rows_by_file,
    )["paths"]
    s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
    copy_files_to_redshift(
        path=paths,
        manifest_directory=_utils.get_directory(path=path),
        con=con,
        table=table,
        schema=schema,
        iam_role=iam_role,
        mode=mode,
        diststyle=diststyle,
        distkey=distkey,
        sortstyle=sortstyle,
        sortkey=sortkey,
        primary_keys=primary_keys,
        varchar_lengths_default=varchar_lengths_default,
def _resolve_query_without_cache(  # pylint: disable=too-many-branches,too-many-locals,too-many-return-statements,too-many-statements
    sql: str,
    database: str,
    ctas_approach: bool,
    categories: Optional[List[str]],
    chunksize: Optional[Union[int, bool]],
    s3_output: Optional[str],
    workgroup: Optional[str],
    encryption: Optional[str],
    kms_key: Optional[str],
    keep_files: bool,
    ctas_temp_table_name: Optional[str],
    use_threads: bool,
    session: Optional[boto3.Session],
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """
    Execute any query in Athena and return the results as a DataFrame, back to `read_sql_query`.

    Usually called by `read_sql_query` when using the cache is not possible.
    """
    wg_config: Dict[str, Union[bool, Optional[str]]] = _get_workgroup_config(session=session, workgroup=workgroup)
    _s3_output: str = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=session)
    _s3_output = _s3_output[:-1] if _s3_output[-1] == "/" else _s3_output
    name: str = ""
    if ctas_approach is True:
        if ctas_temp_table_name is not None:
            name = catalog.sanitize_table_name(ctas_temp_table_name)
        else:
            name = f"temp_table_{pa.compat.guid()}"
        path: str = f"{_s3_output}/{name}"
        ext_location: str = "\n" if wg_config["enforced"] is True else f",\n    external_location = '{path}'\n"
        sql = (
            f'CREATE TABLE "{name}"\n'
            f"WITH(\n"
            f"    format = 'Parquet',\n"
            f"    parquet_compression = 'SNAPPY'"
            f"{ext_location}"
            f") AS\n"
            f"{sql}"
        )
    _logger.debug("sql: %s", sql)
    query_id: str = _start_query_execution(
        sql=sql,
        wg_config=wg_config,
        database=database,
        s3_output=_s3_output,
        workgroup=workgroup,
        encryption=encryption,
        kms_key=kms_key,
        boto3_session=session,
    )
    _logger.debug("query_id: %s", query_id)
    try:
        query_response: Dict[str, Any] = wait_query(query_execution_id=query_id, boto3_session=session)
    except exceptions.QueryFailed as ex:
        if ctas_approach is True:
            if "Column name not specified" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns names in your query. (E.g. 'SELECT MAX(col1) AS max_col1, ...')"
                )
            if "Column type is unknown" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns types in your query. "
                    "(E.g. 'SELECT CAST(NULL AS INTEGER) AS MY_COL, ...')"
                )
        raise ex  # pragma: no cover
    if query_response["QueryExecution"]["Status"]["State"] in ["FAILED", "CANCELLED"]:  # pragma: no cover
        reason: str = query_response["QueryExecution"]["Status"]["StateChangeReason"]
        message_error: str = f"Query error: {reason}"
        raise exceptions.AthenaQueryError(message_error)
    ret: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    if ctas_approach is True:
        catalog.delete_table_if_exists(database=database, table=name, boto3_session=session)
        manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv"
        metadata_path: str = f"{_s3_output}/tables/{query_id}.metadata"
        _logger.debug("manifest_path: %s", manifest_path)
        _logger.debug("metadata_path: %s", metadata_path)
        s3.wait_objects_exist(paths=[manifest_path, metadata_path], use_threads=False, boto3_session=session)
        paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session)
        chunked: Union[bool, int] = False if chunksize is None else chunksize
        _logger.debug("chunked: %s", chunked)
        if not paths:
            if chunked is False:
                return pd.DataFrame()
            return _utils.empty_generator()
        s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
        ret = s3.read_parquet(
            path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories
        )
        paths_delete: List[str] = paths + [manifest_path, metadata_path]
        _logger.debug(type(ret))
        if chunked is False:
            if keep_files is False:
                s3.delete_objects(path=paths_delete, use_threads=use_threads, boto3_session=session)
            return ret
        if keep_files is False:
            return _delete_after_iterate(dfs=ret, paths=paths_delete, use_threads=use_threads, boto3_session=session)
        return ret
    dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata(
        query_execution_id=query_id, categories=categories, boto3_session=session
    )
    path = f"{_s3_output}/{query_id}.csv"
    s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session)
    _logger.debug("Start CSV reading from %s", path)
    _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None
    _logger.debug("_chunksize: %s", _chunksize)
    ret = s3.read_csv(
        path=[path],
        dtype=dtype,
        parse_dates=parse_timestamps,
        converters=converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        na_values=[""],
        chunksize=_chunksize,
        skip_blank_lines=False,
        use_threads=False,
        boto3_session=session,
    )
    _logger.debug("Start type casting...")
    _logger.debug(type(ret))
    if chunksize is None:
        df = _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries)
        if keep_files is False:
            s3.delete_objects(path=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session)
        return df
    dfs = _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries)
    if keep_files is False:
        return _delete_after_iterate(
            dfs=dfs, paths=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session
        )
    return dfs
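# Illustrative sketch: what the CTAS wrapper built by _resolve_query_without_cache
# looks like for a workgroup that does not enforce its output location. The table
# name, S3 location and user query below are made up; the string template mirrors
# the code above.
def _ctas_wrapper_sketch() -> str:
    name = "temp_table_0123456789abcdef"                     # illustrative temp table name
    path = f"s3://aws-athena-query-results-ACCOUNT-REGION/{name}"
    user_sql = "SELECT col1, col2 FROM my_db.my_table"       # illustrative user query
    ext_location = f",\n    external_location = '{path}'\n"  # omitted when the workgroup enforces the location
    return (
        f'CREATE TABLE "{name}"\n'
        f"WITH(\n"
        f"    format = 'Parquet',\n"
        f"    parquet_compression = 'SNAPPY'"
        f"{ext_location}"
        f") AS\n"
        f"{user_sql}"
    )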
def unload_redshift(
    sql: str,
    path: str,
    con: sqlalchemy.engine.Engine,
    iam_role: str,
    categories: List[str] = None,
    chunked: bool = False,
    keep_files: bool = False,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Load a Pandas DataFrame from an Amazon Redshift query result, using Parquet files on S3 as a stage.

    This is a **HIGH** latency and **HIGH** throughput alternative to
    `wr.db.read_sql_query()`/`wr.db.read_sql_table()` to extract large
    Amazon Redshift data into a Pandas DataFrame through the **UNLOAD command**.

    This strategy has more overhead and requires more IAM privileges
    than the regular `wr.db.read_sql_query()`/`wr.db.read_sql_table()` function,
    so it is only recommended to fetch +1MM rows at once.

    https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    sql: str
        SQL query.
    path : Union[str, List[str]]
        S3 path to write stage files (e.g. s3://bucket_name/any_name/)
    con : sqlalchemy.engine.Engine
        SQLAlchemy Engine. Please use,
        wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine()
    iam_role : str
        AWS IAM role with the related permissions.
    categories: List[str], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    keep_files : bool
        Should keep the stage files?
    chunked : bool
        If True will break the data in smaller DataFrames (Non-deterministic number of lines).
        Otherwise return a single DataFrame with the whole data.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
    s3_additional_kwargs:
        Forward to s3fs, useful for server side encryption
        https://s3fs.readthedocs.io/en/latest/#serverside-encryption

    Returns
    -------
    pandas.DataFrame
        Pandas DataFrame

    Examples
    --------
    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> df = wr.db.unload_redshift(
    ...     sql="SELECT * FROM public.mytable",
    ...     path="s3://bucket/extracted_parquet_files/",
    ...     con=wr.catalog.get_engine(connection="my_glue_connection"),
    ...     iam_role="arn:aws:iam::XXX:role/XXX"
    ... )

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    paths: List[str] = unload_redshift_to_files(
        sql=sql, path=path, con=con, iam_role=iam_role, use_threads=use_threads, boto3_session=session
    )
    s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
    if chunked is False:
        if not paths:  # pragma: no cover
            return pd.DataFrame()
        df: pd.DataFrame = s3.read_parquet(
            path=paths,
            categories=categories,
            chunked=chunked,
            dataset=False,
            use_threads=use_threads,
            boto3_session=session,
            s3_additional_kwargs=s3_additional_kwargs,
        )
        if keep_files is False:
            s3.delete_objects(path=paths, use_threads=use_threads, boto3_session=session)
        return df
    if not paths:  # pragma: no cover
        return _utils.empty_generator()
    return _read_parquet_iterator(
        paths=paths,
        categories=categories,
        use_threads=use_threads,
        boto3_session=session,
        s3_additional_kwargs=s3_additional_kwargs,
        keep_files=keep_files,
    )
def copy_to_redshift(  # pylint: disable=too-many-arguments
    df: pd.DataFrame,
    path: str,
    con: sqlalchemy.engine.Engine,
    table: str,
    schema: str,
    iam_role: str,
    index: bool = False,
    dtype: Optional[Dict[str, str]] = None,
    mode: str = "append",
    diststyle: str = "AUTO",
    distkey: Optional[str] = None,
    sortstyle: str = "COMPOUND",
    sortkey: Optional[str] = None,
    primary_keys: Optional[List[str]] = None,
    varchar_lengths_default: int = 256,
    varchar_lengths: Optional[Dict[str, int]] = None,
    keep_files: bool = False,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
) -> None:
    """Load a Pandas DataFrame as a Table on Amazon Redshift using Parquet files on S3 as a stage.

    This is a **HIGH** latency and **HIGH** throughput alternative to `wr.db.to_sql()` to load large
    DataFrames into Amazon Redshift through the **SQL COPY command**.

    This strategy has more overhead and requires more IAM privileges
    than the regular `wr.db.to_sql()` function, so it is only recommended
    for inserting +1MM rows at once.

    https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html

    Note
    ----
    If the table does not exist yet, it will be automatically created for you
    using the Parquet metadata to infer the columns data types.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    df: pandas.DataFrame
        Pandas DataFrame.
    path : Union[str, List[str]]
        S3 path to write stage files (e.g. s3://bucket_name/any_name/)
    con : sqlalchemy.engine.Engine
        SQLAlchemy Engine. Please use,
        wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine()
    table : str
        Table name
    schema : str
        Schema name
    iam_role : str
        AWS IAM role with the related permissions.
    index : bool
        True to store the DataFrame index in file, otherwise False to ignore it.
    dtype: Dict[str, str], optional
        Dictionary of columns names and Athena/Glue types to be casted.
        Useful when you have columns with undetermined or mixed data types.
        Only takes effect if dataset=True.
        (e.g. {'col name': 'bigint', 'col2 name': 'int'})
    mode : str
        Append, overwrite or upsert.
    diststyle : str
        Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"].
        https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
    distkey : str, optional
        Specifies a column name or positional number for the distribution key.
    sortstyle : str
        Sorting can be "COMPOUND" or "INTERLEAVED".
        https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html
    sortkey : str, optional
        List of columns to be sorted.
    primary_keys : List[str], optional
        Primary keys.
    varchar_lengths_default : int
        The size that will be set for all VARCHAR columns not specified with varchar_lengths.
    varchar_lengths : Dict[str, int], optional
        Dict of VARCHAR length by columns. (e.g. {"col1": 10, "col5": 200}).
    keep_files : bool
        Should keep the stage files?
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
    s3_additional_kwargs:
        Forward to s3fs, useful for server side encryption
        https://s3fs.readthedocs.io/en/latest/#serverside-encryption

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.db.copy_to_redshift(
    ...     df=pd.DataFrame({'col': [1, 2, 3]}),
    ...     path="s3://bucket/my_parquet_files/",
    ...     con=wr.catalog.get_engine(connection="my_glue_conn_name"),
    ...     table="my_table",
    ...     schema="public",
    ...     iam_role="arn:aws:iam::XXX:role/XXX"
    ... )

    """
    path = path if path.endswith("/") else f"{path}/"
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    paths: List[str] = s3.to_parquet(  # type: ignore
        df=df,
        path=path,
        index=index,
        dataset=True,
        mode="append",
        dtype=dtype,
        use_threads=use_threads,
        boto3_session=session,
        s3_additional_kwargs=s3_additional_kwargs,
    )["paths"]
    s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
    copy_files_to_redshift(
        path=paths,
        manifest_directory=_utils.get_directory(path=path),
        con=con,
        table=table,
        schema=schema,
        iam_role=iam_role,
        mode=mode,
        diststyle=diststyle,
        distkey=distkey,
        sortstyle=sortstyle,
        sortkey=sortkey,
        primary_keys=primary_keys,
        varchar_lengths_default=varchar_lengths_default,
        varchar_lengths=varchar_lengths,
        use_threads=use_threads,
        boto3_session=session,
    )
    if keep_files is False:
        s3.delete_objects(path=paths, use_threads=use_threads, boto3_session=session)
def read_sql_query(  # pylint: disable=too-many-branches,too-many-locals
    sql: str,
    database: str,
    ctas_approach: bool = True,
    categories: List[str] = None,
    chunksize: Optional[int] = None,
    s3_output: Optional[str] = None,
    workgroup: Optional[str] = None,
    encryption: Optional[str] = None,
    kms_key: Optional[str] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Execute any SQL query on AWS Athena and return the results as a Pandas DataFrame.

    There are two approaches to be defined through the ctas_approach parameter:

    1 - `ctas_approach=True` (`Default`):
    Wrap the query with a CTAS and then read the table data as Parquet directly from S3.
    PROS: Faster and can handle some level of nested types.
    CONS: Requires create/delete table permissions on Glue and does not support timestamp with time zone
    (A temporary table will be created and then deleted immediately).

    2 - `ctas_approach=False`:
    Does a regular query on Athena and parses the regular CSV result on S3.
    PROS: Does not require create/delete table permissions on Glue and supports timestamp with time zone.
    CONS: Slower (but still faster than other libraries that use the regular Athena API)
    and does not handle nested types at all.

    Note
    ----
    If `chunksize` is passed, then a Generator of DataFrames is returned.

    Note
    ----
    If `ctas_approach` is True, `chunksize` will return non-deterministic chunk sizes,
    but it is still useful to overcome memory limitations.

    Note
    ----
    Create the default Athena bucket if it doesn't exist and s3_output is None.
    (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    sql : str
        SQL query.
    database : str
        AWS Glue/Athena database name.
    ctas_approach: bool
        Wraps the query using a CTAS, and read the resulted parquet data on S3.
        If false, read the regular CSV on S3.
    categories: List[str], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    chunksize: int, optional
        If specified, return a generator where chunksize is the number of rows to include in each chunk.
    s3_output : str, optional
        AWS S3 path.
    workgroup : str, optional
        Athena workgroup.
    encryption : str, optional
        None, 'SSE_S3', 'SSE_KMS', 'CSE_KMS'.
    kms_key : str, optional
        For SSE-KMS and CSE-KMS, this is the KMS key ARN or ID.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Union[pd.DataFrame, Iterator[pd.DataFrame]]
        Pandas DataFrame or Generator of Pandas DataFrames if chunksize is passed.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df = wr.athena.read_sql_query(sql='...', database='...')

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    wg_s3_output, _, _ = _ensure_workgroup(session=session, workgroup=workgroup)
    if s3_output is None:
        if wg_s3_output is None:
            _s3_output: str = create_athena_bucket(boto3_session=session)
        else:
            _s3_output = wg_s3_output
    else:
        _s3_output = s3_output
    name: str = ""
    if ctas_approach is True:
        name = f"temp_table_{pa.compat.guid()}"
        _s3_output = _s3_output[:-1] if _s3_output[-1] == "/" else _s3_output
        path: str = f"{_s3_output}/{name}"
        sql = (
            f"CREATE TABLE {name}\n"
            f"WITH(\n"
            f"    format = 'Parquet',\n"
            f"    parquet_compression = 'SNAPPY',\n"
            f"    external_location = '{path}'\n"
            f") AS\n"
            f"{sql}"
        )
    _logger.debug(f"sql: {sql}")
    query_id: str = start_query_execution(
        sql=sql,
        database=database,
        s3_output=_s3_output,
        workgroup=workgroup,
        encryption=encryption,
        kms_key=kms_key,
        boto3_session=session,
    )
    _logger.debug(f"query_id: {query_id}")
    query_response: Dict[str, Any] = wait_query(query_execution_id=query_id, boto3_session=session)
    if query_response["QueryExecution"]["Status"]["State"] in ["FAILED", "CANCELLED"]:  # pragma: no cover
        reason: str = query_response["QueryExecution"]["Status"]["StateChangeReason"]
        message_error: str = f"Query error: {reason}"
        raise exceptions.AthenaQueryError(message_error)
    dfs: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    if ctas_approach is True:
        catalog.delete_table_if_exists(database=database, table=name, boto3_session=session)
        manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv"
        paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session)
        chunked: bool = chunksize is not None
        _logger.debug(f"chunked: {chunked}")
        if not paths:
            if chunked is False:
                dfs = pd.DataFrame()
            else:
                dfs = _utils.empty_generator()
        else:
            s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
            dfs = s3.read_parquet(
                path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories
            )
        return dfs
    dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata(
        query_execution_id=query_id, categories=categories, boto3_session=session
    )
    path = f"{_s3_output}/{query_id}.csv"
    s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session)
    _logger.debug(f"Start CSV reading from {path}")
    ret = s3.read_csv(
        path=[path],
        dtype=dtype,
        parse_dates=parse_timestamps,
        converters=converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        na_values=[""],
        chunksize=chunksize,
        skip_blank_lines=False,
        use_threads=False,
        boto3_session=session,
    )
    _logger.debug("Start type casting...")
    if chunksize is None:
        return _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries)
    _logger.debug(type(ret))
    return _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries)
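# Usage sketch (illustrative placeholders only): ctas_approach=False skips the CTAS
# wrapper above, so no Glue create/delete-table permissions are needed and timestamp
# with time zone columns survive, at the cost of parsing Athena's CSV output.
def _read_sql_query_csv_path_sketch() -> None:
    import awswrangler as wr

    df = wr.athena.read_sql_query(
        sql="SELECT col_tz, col2 FROM my_table",  # placeholder query/column names
        database="my_db",
        ctas_approach=False,
    )
    print(df.dtypes)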
def read_sql_query(  # pylint: disable=too-many-branches,too-many-locals,too-many-return-statements,too-many-statements
    sql: str,
    database: str,
    ctas_approach: bool = True,
    categories: List[str] = None,
    chunksize: Optional[Union[int, bool]] = None,
    s3_output: Optional[str] = None,
    workgroup: Optional[str] = None,
    encryption: Optional[str] = None,
    kms_key: Optional[str] = None,
    keep_files: bool = True,
    ctas_temp_table_name: Optional[str] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    """Execute any SQL query on AWS Athena and return the results as a Pandas DataFrame.

    There are two approaches to be defined through the ctas_approach parameter:

    1 - `ctas_approach=True` (`Default`):
    Wrap the query with a CTAS and then read the table data as Parquet directly from S3.
    PROS: Faster and can handle some level of nested types.
    CONS: Requires create/delete table permissions on Glue and does not support timestamp with time zone
    (A temporary table will be created and then deleted immediately).

    2 - `ctas_approach=False`:
    Does a regular query on Athena and parses the regular CSV result on S3.
    PROS: Does not require create/delete table permissions on Glue and supports timestamp with time zone.
    CONS: Slower (but still faster than other libraries that use the regular Athena API)
    and does not handle nested types at all.

    Note
    ----
    Valid encryption modes: [None, 'SSE_S3', 'SSE_KMS'].

    `P.S. 'CSE_KMS' is not supported.`

    Note
    ----
    Create the default Athena bucket if it doesn't exist and s3_output is None.
    (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)

    Note
    ----
    ``Batching`` (`chunksize` argument) (Memory Friendly):

    Will enable the function to return an Iterable of DataFrames instead of a regular DataFrame.

    There are two batching strategies on Wrangler:

    - If **chunksize=True**, a new DataFrame will be returned for each file in the query result.

    - If **chunksize=INTEGER**, Wrangler will iterate on the data by number of rows equal to the received INTEGER.

    `P.S.` `chunksize=True` is faster and uses less memory while `chunksize=INTEGER` is more precise
    in the number of rows for each DataFrame.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    sql : str
        SQL query.
    database : str
        AWS Glue/Athena database name.
    ctas_approach: bool
        Wraps the query using a CTAS, and read the resulted parquet data on S3.
        If false, read the regular CSV on S3.
    categories: List[str], optional
        List of columns names that should be returned as pandas.Categorical.
        Recommended for memory restricted environments.
    chunksize : Union[int, bool], optional
        If passed will split the data in an Iterable of DataFrames (Memory friendly).
        If `True` Wrangler will iterate on the data by files in the most efficient way without guarantee of chunksize.
        If an `INTEGER` is passed Wrangler will iterate on the data by number of rows equal to the received INTEGER.
    s3_output : str, optional
        AWS S3 path.
    workgroup : str, optional
        Athena workgroup.
    encryption : str, optional
        Valid values: [None, 'SSE_S3', 'SSE_KMS']. Notice: 'CSE_KMS' is not supported.
    kms_key : str, optional
        For SSE-KMS, this is the KMS key ARN or ID.
    keep_files : bool
        Should Wrangler delete or keep the staging files produced by Athena?
    ctas_temp_table_name : str, optional
        The name of the temporary table and also the directory name on S3 where the CTAS result is stored.
        If None, it will use the following random pattern: `f"temp_table_{pyarrow.compat.guid()}"`.
        On S3 this directory will be under the pattern: `f"{s3_output}/{ctas_temp_table_name}/"`.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Union[pd.DataFrame, Iterator[pd.DataFrame]]
        Pandas DataFrame or Generator of Pandas DataFrames if chunksize is passed.

    Examples
    --------
    >>> import awswrangler as wr
    >>> df = wr.athena.read_sql_query(sql='...', database='...')

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    wg_config: Dict[str, Union[bool, Optional[str]]] = _get_workgroup_config(session=session, workgroup=workgroup)
    _s3_output: str = _get_s3_output(s3_output=s3_output, wg_config=wg_config, boto3_session=session)
    _s3_output = _s3_output[:-1] if _s3_output[-1] == "/" else _s3_output
    name: str = ""
    if ctas_approach is True:
        if ctas_temp_table_name is not None:
            name = catalog.sanitize_table_name(ctas_temp_table_name)
        else:
            name = f"temp_table_{pa.compat.guid()}"
        path: str = f"{_s3_output}/{name}"
        ext_location: str = "\n" if wg_config["enforced"] is True else f",\n    external_location = '{path}'\n"
        sql = (
            f'CREATE TABLE "{name}"\n'
            f"WITH(\n"
            f"    format = 'Parquet',\n"
            f"    parquet_compression = 'SNAPPY'"
            f"{ext_location}"
            f") AS\n"
            f"{sql}"
        )
    _logger.debug("sql: %s", sql)
    query_id: str = _start_query_execution(
        sql=sql,
        wg_config=wg_config,
        database=database,
        s3_output=_s3_output,
        workgroup=workgroup,
        encryption=encryption,
        kms_key=kms_key,
        boto3_session=session,
    )
    _logger.debug("query_id: %s", query_id)
    try:
        query_response: Dict[str, Any] = wait_query(query_execution_id=query_id, boto3_session=session)
    except exceptions.QueryFailed as ex:
        if ctas_approach is True:
            if "Column name not specified" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns names in your query. (E.g. 'SELECT MAX(col1) AS max_col1, ...')"
                )
            if "Column type is unknown" in str(ex):
                raise exceptions.InvalidArgumentValue(
                    "Please, define all columns types in your query. "
                    "(E.g. 'SELECT CAST(NULL AS INTEGER) AS MY_COL, ...')"
                )
        raise ex  # pragma: no cover
    if query_response["QueryExecution"]["Status"]["State"] in ["FAILED", "CANCELLED"]:  # pragma: no cover
        reason: str = query_response["QueryExecution"]["Status"]["StateChangeReason"]
        message_error: str = f"Query error: {reason}"
        raise exceptions.AthenaQueryError(message_error)
    ret: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    if ctas_approach is True:
        catalog.delete_table_if_exists(database=database, table=name, boto3_session=session)
        manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv"
        metadata_path: str = f"{_s3_output}/tables/{query_id}.metadata"
        _logger.debug("manifest_path: %s", manifest_path)
        _logger.debug("metadata_path: %s", metadata_path)
        s3.wait_objects_exist(paths=[manifest_path, metadata_path], use_threads=False, boto3_session=session)
        paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session)
        chunked: Union[bool, int] = False if chunksize is None else chunksize
        _logger.debug("chunked: %s", chunked)
        if not paths:
            if chunked is False:
                return pd.DataFrame()
            return _utils.empty_generator()
        s3.wait_objects_exist(paths=paths, use_threads=False, boto3_session=session)
        ret = s3.read_parquet(
            path=paths, use_threads=use_threads, boto3_session=session, chunked=chunked, categories=categories
        )
        paths_delete: List[str] = paths + [manifest_path, metadata_path]
        _logger.debug(type(ret))
        if chunked is False:
            if keep_files is False:
                s3.delete_objects(path=paths_delete, use_threads=use_threads, boto3_session=session)
            return ret
        if keep_files is False:
            return _delete_after_iterate(dfs=ret, paths=paths_delete, use_threads=use_threads, boto3_session=session)
        return ret
    dtype, parse_timestamps, parse_dates, converters, binaries = _get_query_metadata(
        query_execution_id=query_id, categories=categories, boto3_session=session
    )
    path = f"{_s3_output}/{query_id}.csv"
    s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session)
    _logger.debug("Start CSV reading from %s", path)
    _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None
    _logger.debug("_chunksize: %s", _chunksize)
    ret = s3.read_csv(
        path=[path],
        dtype=dtype,
        parse_dates=parse_timestamps,
        converters=converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        na_values=[""],
        chunksize=_chunksize,
        skip_blank_lines=False,
        use_threads=False,
        boto3_session=session,
    )
    _logger.debug("Start type casting...")
    _logger.debug(type(ret))
    if chunksize is None:
        df = _fix_csv_types(df=ret, parse_dates=parse_dates, binaries=binaries)
        if keep_files is False:
            s3.delete_objects(path=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session)
        return df
    dfs = _fix_csv_types_generator(dfs=ret, parse_dates=parse_dates, binaries=binaries)
    if keep_files is False:
        return _delete_after_iterate(
            dfs=dfs, paths=[path, f"{path}.metadata"], use_threads=use_threads, boto3_session=session
        )
    return dfs
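# Usage sketch (illustrative placeholders only): a named CTAS temp table makes the
# staging directory predictable (f"{s3_output}/my_ctas_temp/"), and keep_files=False
# removes the staged Parquet, manifest and metadata objects once the DataFrame is built.
def _read_sql_query_named_ctas_sketch() -> None:
    import awswrangler as wr

    df = wr.athena.read_sql_query(
        sql="SELECT * FROM my_table",  # placeholder query
        database="my_db",
        ctas_temp_table_name="my_ctas_temp",
        keep_files=False,
    )
    print(len(df.index))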
def copy_files_to_redshift(  # pylint: disable=too-many-locals,too-many-arguments
    path: Union[str, List[str]],
    manifest_directory: str,
    con: sqlalchemy.engine.Engine,
    table: str,
    schema: str,
    iam_role: str,
    mode: str = "append",
    diststyle: str = "AUTO",
    distkey: Optional[str] = None,
    sortstyle: str = "COMPOUND",
    sortkey: Optional[str] = None,
    primary_keys: Optional[List[str]] = None,
    varchar_lengths_default: int = 256,
    varchar_lengths: Optional[Dict[str, int]] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Load Parquet files from S3 to a Table on Amazon Redshift (through the COPY command).

    https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html

    Note
    ----
    If the table does not exist yet, it will be automatically created for you
    using the Parquet metadata to infer the columns data types.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
    manifest_directory : str
        S3 prefix (e.g. s3://bucket/prefix)
    con : sqlalchemy.engine.Engine
        SQLAlchemy Engine. Please use,
        wr.db.get_engine(), wr.db.get_redshift_temp_engine() or wr.catalog.get_engine()
    table : str
        Table name
    schema : str
        Schema name
    iam_role : str
        AWS IAM role with the related permissions.
    mode : str
        Append, overwrite or upsert.
    diststyle : str
        Redshift distribution styles. Must be in ["AUTO", "EVEN", "ALL", "KEY"].
        https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html
    distkey : str, optional
        Specifies a column name or positional number for the distribution key.
    sortstyle : str
        Sorting can be "COMPOUND" or "INTERLEAVED".
        https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html
    sortkey : str, optional
        List of columns to be sorted.
    primary_keys : List[str], optional
        Primary keys.
    varchar_lengths_default : int
        The size that will be set for all VARCHAR columns not specified with varchar_lengths.
    varchar_lengths : Dict[str, int], optional
        Dict of VARCHAR length by columns. (e.g. {"col1": 10, "col5": 200}).
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.db.copy_files_to_redshift(
    ...     path="s3://bucket/my_parquet_files/",
    ...     con=wr.catalog.get_engine(connection="my_glue_conn_name"),
    ...     table="my_table",
    ...     schema="public",
    ...     iam_role="arn:aws:iam::XXX:role/XXX"
    ... )

    """
    _varchar_lengths: Dict[str, int] = {} if varchar_lengths is None else varchar_lengths
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    paths: List[str] = s3._path2list(path=path, boto3_session=session)  # pylint: disable=protected-access
    manifest_directory = manifest_directory if manifest_directory.endswith("/") else f"{manifest_directory}/"
    manifest_path: str = f"{manifest_directory}manifest.json"
    write_redshift_copy_manifest(
        manifest_path=manifest_path, paths=paths, use_threads=use_threads, boto3_session=session
    )
    s3.wait_objects_exist(paths=paths + [manifest_path], use_threads=False, boto3_session=session)
    athena_types, _ = s3.read_parquet_metadata(
        path=paths, dataset=False, use_threads=use_threads, boto3_session=session
    )
    _logger.debug(f"athena_types: {athena_types}")
    redshift_types: Dict[str, str] = {}
    for col_name, col_type in athena_types.items():
        length: int = _varchar_lengths[col_name] if col_name in _varchar_lengths else varchar_lengths_default
        redshift_types[col_name] = _data_types.athena2redshift(dtype=col_type, varchar_length=length)
    with con.begin() as _con:
        created_table, created_schema = _rs_create_table(
            con=_con,
            table=table,
            schema=schema,
            redshift_types=redshift_types,
            mode=mode,
            diststyle=diststyle,
            sortstyle=sortstyle,
            distkey=distkey,
            sortkey=sortkey,
            primary_keys=primary_keys,
        )
        _rs_copy(
            con=_con,
            table=created_table,
            schema=created_schema,
            manifest_path=manifest_path,
            iam_role=iam_role,
            num_files=len(paths),
        )
        if table != created_table:  # upsert
            _rs_upsert(con=_con, schema=schema, table=table, temp_table=created_table, primary_keys=primary_keys)
    s3.delete_objects(path=[manifest_path], use_threads=use_threads, boto3_session=session)