def _add_partitions(
    database: str,
    table: str,
    boto3_session: Optional[boto3.Session],
    inputs: List[Dict[str, Any]],
    catalog_id: Optional[str] = None,
) -> None:
    chunks: List[List[Dict[str, Any]]] = _utils.chunkify(lst=inputs, max_length=100)
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    for chunk in chunks:  # pylint: disable=too-many-nested-blocks
        res: Dict[str, Any] = client_glue.batch_create_partition(
            **_catalog_id(catalog_id=catalog_id, DatabaseName=database, TableName=table, PartitionInputList=chunk)
        )
        if ("Errors" in res) and res["Errors"]:
            for error in res["Errors"]:
                if "ErrorDetail" in error:
                    if "ErrorCode" in error["ErrorDetail"]:
                        if error["ErrorDetail"]["ErrorCode"] != "AlreadyExistsException":
                            raise exceptions.ServiceApiError(str(res["Errors"]))
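# The `_catalog_id(...)` helper used above is defined elsewhere in the package and not shown
# here. A minimal sketch of its assumed behavior (merge an optional CatalogId into the Glue
# call kwargs before unpacking with **) is given below for illustration only; the name
# `_catalog_id_sketch` is hypothetical and the real helper may differ.
def _catalog_id_sketch(catalog_id: Optional[str] = None, **kwargs: Any) -> Dict[str, Any]:
    """Illustrative only: add CatalogId to the keyword arguments when one is provided."""
    if catalog_id is not None:
        kwargs["CatalogId"] = catalog_id
    return kwargs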
def add_parquet_partitions(
    database: str,
    table: str,
    partitions_values: Dict[str, List[str]],
    compression: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Add partitions (metadata) to a Parquet Table in the AWS Glue Catalog.

    Parameters
    ----------
    database : str
        Database name.
    table : str
        Table name.
    partitions_values : Dict[str, List[str]]
        Dictionary with keys as S3 path locations and values as a list of partitions values as str
        (e.g. {'s3://bucket/prefix/y=2020/m=10/': ['2020', '10']}).
    compression : str, optional
        Compression style (``None``, ``snappy``, ``gzip``, etc).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.add_parquet_partitions(
    ...     database='default',
    ...     table='my_table',
    ...     partitions_values={
    ...         's3://bucket/prefix/y=2020/m=10/': ['2020', '10'],
    ...         's3://bucket/prefix/y=2020/m=11/': ['2020', '11'],
    ...         's3://bucket/prefix/y=2020/m=12/': ['2020', '12']
    ...     }
    ... )

    """
    inputs: List[Dict[str, Any]] = [
        _parquet_partition_definition(location=k, values=v, compression=compression)
        for k, v in partitions_values.items()
    ]
    chunks: List[List[Dict[str, Any]]] = _utils.chunkify(lst=inputs, max_length=100)
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    for chunk in chunks:
        res: Dict[str, Any] = client_glue.batch_create_partition(
            DatabaseName=database, TableName=table, PartitionInputList=chunk
        )
        if ("Errors" in res) and res["Errors"]:  # pragma: no cover
            raise exceptions.ServiceApiError(str(res["Errors"]))
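# `_parquet_partition_definition(...)` builds the Glue PartitionInput dicts consumed by
# batch_create_partition above. It is not shown in this section; the sketch below only
# illustrates the general shape of a Parquet PartitionInput (S3 location, Parquet SerDe,
# partition values). The name `_parquet_partition_definition_sketch` is hypothetical and the
# real helper may set additional StorageDescriptor fields.
def _parquet_partition_definition_sketch(
    location: str, values: List[str], compression: Optional[str]
) -> Dict[str, Any]:
    """Illustrative only: one PartitionInput entry for a Parquet partition."""
    return {
        "StorageDescriptor": {
            "InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
            "Location": location,
            "Compressed": compression is not None,
            "SerdeInfo": {
                "SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
            },
        },
        "Values": values,
    }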
def delete_partitions(
    table: str,
    database: str,
    partitions_values: List[List[str]],
    catalog_id: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Delete specified partitions in an AWS Glue Catalog table.

    Parameters
    ----------
    table : str
        Table name.
    database : str
        Database name.
    catalog_id : str, optional
        The ID of the Data Catalog from which to retrieve Databases.
        If none is provided, the AWS account ID is used by default.
    partitions_values : List[List[str]]
        List of lists of partitions values as strings
        (e.g. [['2020', '10', '25'], ['2020', '11', '16'], ['2020', '12', '19']]).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.catalog.delete_partitions(
    ...     table='my_table',
    ...     database='awswrangler_test',
    ...     partitions_values=[['2020', '10', '25'], ['2020', '11', '16'], ['2020', '12', '19']]
    ... )

    """
    client_glue: boto3.client = _utils.client(service_name="glue", session=boto3_session)
    chunks: List[List[List[str]]] = _utils.chunkify(lst=partitions_values, max_length=25)
    for chunk in chunks:
        client_glue.batch_delete_partition(
            **_catalog_id(
                catalog_id=catalog_id,
                DatabaseName=database,
                TableName=table,
                PartitionsToDelete=[{"Values": v} for v in chunk],
            )
        )
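# `_utils.chunkify(...)` is used throughout these functions to respect batch API limits
# (100 partitions per batch_create_partition, 25 per batch_delete_partition, 1,000 keys per
# S3 delete request). A minimal sketch of its assumed behavior, under a hypothetical name:
def _chunkify_sketch(lst: List[Any], max_length: int) -> List[List[Any]]:
    """Illustrative only: split a list into consecutive sublists no longer than max_length."""
    return [lst[i : i + max_length] for i in range(0, len(lst), max_length)]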
def delete_objects(
    path: Union[str, List[str]], use_threads: bool = True, boto3_session: Optional[boto3.Session] = None
) -> None:
    """Delete Amazon S3 objects from a received S3 prefix or list of S3 objects paths.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.s3.delete_objects(['s3://bucket/key0', 's3://bucket/key1'])  # Delete both objects
    >>> wr.s3.delete_objects('s3://bucket/prefix')  # Delete all objects under the received prefix

    """
    paths: List[str] = path2list(path=path, boto3_session=boto3_session)
    if len(paths) < 1:
        return
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    buckets: Dict[str, List[str]] = _split_paths_by_bucket(paths=paths)
    for bucket, keys in buckets.items():
        chunks: List[List[str]] = _utils.chunkify(lst=keys, max_length=1_000)
        if use_threads is False:
            for chunk in chunks:
                _delete_objects(bucket=bucket, keys=chunk, client_s3=client_s3)
        else:
            cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
            with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
                list(executor.map(_delete_objects, itertools.repeat(bucket), chunks, itertools.repeat(client_s3)))
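# `_split_paths_by_bucket(...)` groups the listed S3 object paths by bucket so each delete call
# stays within a single bucket. It is defined elsewhere; a hedged sketch of the assumed behavior,
# under a hypothetical name:
def _split_paths_by_bucket_sketch(paths: List[str]) -> Dict[str, List[str]]:
    """Illustrative only: map bucket name -> list of object keys."""
    buckets: Dict[str, List[str]] = {}
    for p in paths:
        bucket, key = p.replace("s3://", "").split("/", 1)
        buckets.setdefault(bucket, []).append(key)
    return buckets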
def write(
    df: pd.DataFrame,
    database: str,
    table: str,
    time_col: str,
    measure_col: str,
    dimensions_cols: List[str],
    num_threads: int = 32,
    boto3_session: Optional[boto3.Session] = None,
) -> List[Dict[str, str]]:
    """Store a Pandas DataFrame into an Amazon Timestream table.

    Parameters
    ----------
    df : pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    database : str
        Amazon Timestream database name.
    table : str
        Amazon Timestream table name.
    time_col : str
        DataFrame column name to be used as time. MUST be a timestamp column.
    measure_col : str
        DataFrame column name to be used as measure.
    dimensions_cols : List[str]
        List of DataFrame column names to be used as dimensions.
    num_threads : int
        Number of threads to be used for concurrent writing.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receives None.

    Returns
    -------
    List[Dict[str, str]]
        Rejected records.

    Examples
    --------
    Store a Pandas DataFrame into an Amazon Timestream table.

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> df = pd.DataFrame(
    >>>     {
    >>>         "time": [datetime.now(), datetime.now(), datetime.now()],
    >>>         "dim0": ["foo", "boo", "bar"],
    >>>         "dim1": [1, 2, 3],
    >>>         "measure": [1.0, 1.1, 1.2],
    >>>     }
    >>> )
    >>> rejected_records = wr.timestream.write(
    >>>     df=df,
    >>>     database="sampleDB",
    >>>     table="sampleTable",
    >>>     time_col="time",
    >>>     measure_col="measure",
    >>>     dimensions_cols=["dim0", "dim1"],
    >>> )
    >>> assert len(rejected_records) == 0

    """
    measure_type: str = _data_types.timestream_type_from_pandas(df[[measure_col]])
    _logger.debug("measure_type: %s", measure_type)
    cols_names: List[str] = [time_col, measure_col] + dimensions_cols
    _logger.debug("cols_names: %s", cols_names)
    batches: List[List[Any]] = _utils.chunkify(lst=_df2list(df=df[cols_names]), max_length=100)
    _logger.debug("len(batches): %s", len(batches))
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        res: List[List[Any]] = list(
            executor.map(
                _write_batch,
                itertools.repeat(database),
                itertools.repeat(table),
                itertools.repeat(cols_names),
                itertools.repeat(measure_type),
                batches,
                itertools.repeat(_utils.boto3_to_primitives(boto3_session=boto3_session)),
            )
        )
    return [item for sublist in res for item in sublist]
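# `_df2list(...)` converts the selected DataFrame columns into plain Python rows so they can be
# chunked (max 100 records per Timestream write batch) and handed to the worker threads. The
# session is passed to the workers as primitives via `_utils.boto3_to_primitives` because a
# boto3.Session should not be shared across threads. A minimal sketch of the assumed row
# conversion, under a hypothetical name:
def _df2list_sketch(df: pd.DataFrame) -> List[List[Any]]:
    """Illustrative only: one Python list per DataFrame row, column order preserved."""
    return df.values.tolist()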
def delete_objects(
    path: Union[str, List[str]],
    use_threads: bool = True,
    last_modified_begin: Optional[datetime.datetime] = None,
    last_modified_end: Optional[datetime.datetime] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    """Delete Amazon S3 objects from a received S3 prefix or list of S3 objects paths.

    This function accepts Unix shell-style wildcards in the path argument.
    * (matches everything), ? (matches any single character),
    [seq] (matches any character in seq), [!seq] (matches any character not in seq).

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().

    Note
    ----
    The filter by last_modified_begin and last_modified_end is applied after listing all S3 files.

    Parameters
    ----------
    path : Union[str, List[str]]
        S3 prefix (accepts Unix shell-style wildcards)
        (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    last_modified_begin : datetime, optional
        Filter the s3 files by the Last modified date of the object.
        The filter is applied only after listing all s3 files.
    last_modified_end : datetime, optional
        Filter the s3 files by the Last modified date of the object.
        The filter is applied only after listing all s3 files.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    None
        None.

    Examples
    --------
    >>> import awswrangler as wr
    >>> wr.s3.delete_objects(['s3://bucket/key0', 's3://bucket/key1'])  # Delete both objects
    >>> wr.s3.delete_objects('s3://bucket/prefix')  # Delete all objects under the received prefix

    """
    paths: List[str] = _path2list(
        path=path,
        boto3_session=boto3_session,
        last_modified_begin=last_modified_begin,
        last_modified_end=last_modified_end,
    )
    if len(paths) < 1:
        return
    buckets: Dict[str, List[str]] = _split_paths_by_bucket(paths=paths)
    for bucket, keys in buckets.items():
        chunks: List[List[str]] = _utils.chunkify(lst=keys, max_length=1_000)
        if len(chunks) == 1:
            _delete_objects(bucket=bucket, keys=chunks[0], boto3_session=boto3_session)
        elif use_threads is False:
            for chunk in chunks:
                _delete_objects(bucket=bucket, keys=chunk, boto3_session=boto3_session)
        else:
            cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
            with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
                list(
                    executor.map(
                        _delete_objects_concurrent,
                        itertools.repeat(bucket),
                        chunks,
                        itertools.repeat(_utils.boto3_to_primitives(boto3_session=boto3_session)),
                    )
                )
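# The function above delegates the actual deletion to a private `_delete_objects` helper that is
# not shown in this section. A hedged sketch of the assumed core call is given below (hypothetical
# name; the real helper may add retries and error handling). S3 accepts at most 1,000 keys per
# delete_objects request, which is why the keys are chunked with max_length=1_000 above.
def _delete_objects_sketch(bucket: str, keys: List[str], boto3_session: Optional[boto3.Session]) -> None:
    """Illustrative only: delete up to 1,000 keys from a single bucket in one API call."""
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    batch: List[Dict[str, str]] = [{"Key": key} for key in keys]
    client_s3.delete_objects(Bucket=bucket, Delete={"Objects": batch})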