def _copy_objects(
    batch: List[Tuple[str, str]],
    use_threads: bool,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, Any]],
) -> None:
    _logger.debug("len(batch): %s", len(batch))
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    resource_s3: boto3.resource = _utils.resource(service_name="s3", session=boto3_session)
    if s3_additional_kwargs is None:
        boto3_kwargs: Optional[Dict[str, Any]] = None
    else:
        boto3_kwargs = get_botocore_valid_kwargs(
            function_name="copy_object", s3_additional_kwargs=s3_additional_kwargs
        )
    for source, target in batch:
        source_bucket, source_key = _utils.parse_path(path=source)
        copy_source: Dict[str, str] = {"Bucket": source_bucket, "Key": source_key}
        target_bucket, target_key = _utils.parse_path(path=target)
        resource_s3.meta.client.copy(
            CopySource=copy_source,
            Bucket=target_bucket,
            Key=target_key,
            SourceClient=client_s3,
            ExtraArgs=boto3_kwargs,
            Config=TransferConfig(num_download_attempts=10, use_threads=use_threads),
        )
def _get_connection_attributes_from_catalog(
    connection: str, catalog_id: Optional[str], dbname: Optional[str], boto3_session: Optional[boto3.Session]
) -> ConnectionAttributes:
    details: Dict[str, Any] = get_connection(name=connection, catalog_id=catalog_id, boto3_session=boto3_session)[
        "ConnectionProperties"
    ]
    if ";databaseName=" in details["JDBC_CONNECTION_URL"]:
        database_sep = ";databaseName="
    else:
        database_sep = "/"
    port, database = details["JDBC_CONNECTION_URL"].split(":")[3].split(database_sep)
    ssl_context: Optional[ssl.SSLContext] = None
    if details.get("JDBC_ENFORCE_SSL") == "true":
        ssl_cert_path: Optional[str] = details.get("CUSTOM_JDBC_CERT")
        ssl_cadata: Optional[str] = None
        if ssl_cert_path:
            bucket_name, key_path = _utils.parse_path(ssl_cert_path)
            client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
            try:
                ssl_cadata = client_s3.get_object(Bucket=bucket_name, Key=key_path)["Body"].read().decode("utf-8")
            except client_s3.exceptions.NoSuchKey:
                raise exceptions.NoFilesFound(  # pylint: disable=raise-missing-from
                    f"No CA certificate found at {ssl_cert_path}."
                )
        ssl_context = ssl.create_default_context(cadata=ssl_cadata)
    return ConnectionAttributes(
        kind=details["JDBC_CONNECTION_URL"].split(":")[1].lower(),
        user=details["USERNAME"],
        password=details["PASSWORD"],
        host=details["JDBC_CONNECTION_URL"].split(":")[2].replace("/", ""),
        port=int(port),
        database=dbname if dbname is not None else database,
        ssl_context=ssl_context,
    )
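# A minimal sketch (not part of the library) of how the Glue JDBC URL is split above.
# The URL below is hypothetical and only illustrates the positional parsing.
url = "jdbc:postgresql://my-host.example.com:5432/mydb"
kind = url.split(":")[1].lower()               # "postgresql"
host = url.split(":")[2].replace("/", "")      # "my-host.example.com"
port, database = url.split(":")[3].split("/")  # ("5432", "mydb")
print(kind, host, int(port), database)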
def _describe_object(
    path: str,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, Any]],
    version_id: Optional[str] = None,
) -> Tuple[str, Dict[str, Any]]:
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    bucket: str
    key: str
    bucket, key = _utils.parse_path(path=path)
    if s3_additional_kwargs:
        extra_kwargs: Dict[str, Any] = _fs.get_botocore_valid_kwargs(
            function_name="head_object", s3_additional_kwargs=s3_additional_kwargs
        )
    else:
        extra_kwargs = {}
    desc: Dict[str, Any]
    if version_id:
        extra_kwargs["VersionId"] = version_id
    desc = _utils.try_it(
        f=client_s3.head_object, ex=client_s3.exceptions.NoSuchKey, Bucket=bucket, Key=key, **extra_kwargs
    )
    return path, desc
def _list_objects(
    path: str,
    delimiter: Optional[str] = None,
    suffix: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> List[str]:
    bucket: str
    prefix: str
    bucket, prefix = _utils.parse_path(path=path)
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    paginator = client_s3.get_paginator("list_objects_v2")
    args: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix, "PaginationConfig": {"PageSize": 1000}}
    if delimiter is not None:
        args["Delimiter"] = delimiter
    response_iterator = paginator.paginate(**args)
    paths: List[str] = []
    for page in response_iterator:  # pylint: disable=too-many-nested-blocks
        if delimiter is None:
            contents: Optional[List] = page.get("Contents")
            if contents is not None:
                for content in contents:
                    if (content is not None) and ("Key" in content):
                        key: str = content["Key"]
                        if (suffix is None) or key.endswith(suffix):
                            paths.append(f"s3://{bucket}/{key}")
        else:
            prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get("CommonPrefixes")
            if prefixes is not None:
                for pfx in prefixes:
                    if (pfx is not None) and ("Prefix" in pfx):
                        key = pfx["Prefix"]
                        paths.append(f"s3://{bucket}/{key}")
    return paths
def does_object_exist(
    path: str,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> bool:
    """Check if object exists on S3.

    Parameters
    ----------
    path: str
        S3 path (e.g. s3://bucket/key).
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forwarded to botocore requests.
        Valid parameters: "RequestPayer", "ExpectedBucketOwner".
        e.g. s3_additional_kwargs={'RequestPayer': 'requester'}
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    bool
        True if exists, False otherwise.

    Examples
    --------
    Using the default boto3 session

    >>> import awswrangler as wr
    >>> wr.s3.does_object_exist('s3://bucket/key_real')
    True
    >>> wr.s3.does_object_exist('s3://bucket/key_unreal')
    False

    Using a custom boto3 session

    >>> import boto3
    >>> import awswrangler as wr
    >>> wr.s3.does_object_exist('s3://bucket/key_real', boto3_session=boto3.Session())
    True
    >>> wr.s3.does_object_exist('s3://bucket/key_unreal', boto3_session=boto3.Session())
    False

    """
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    bucket: str
    key: str
    bucket, key = _utils.parse_path(path=path)
    if s3_additional_kwargs:
        extra_kwargs: Dict[str, Any] = _fs.get_botocore_valid_kwargs(
            function_name="head_object", s3_additional_kwargs=s3_additional_kwargs
        )
    else:
        extra_kwargs = {}
    try:
        client_s3.head_object(Bucket=bucket, Key=key, **extra_kwargs)
        return True
    except botocore.exceptions.ClientError as ex:
        if ex.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
            return False
        raise ex
def __init__(
    self,
    path: str,
    s3_block_size: int,
    mode: str,
    use_threads: Union[bool, int],
    s3_additional_kwargs: Optional[Dict[str, str]],
    boto3_session: Optional[boto3.Session],
    newline: Optional[str],
    encoding: Optional[str],
) -> None:
    super().__init__()
    self._use_threads = use_threads
    self._newline: str = "\n" if newline is None else newline
    self._encoding: str = "utf-8" if encoding is None else encoding
    self._bucket, self._key = _utils.parse_path(path=path)
    self._boto3_session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if mode not in {"rb", "wb", "r", "w"}:
        raise NotImplementedError("File mode must be {'rb', 'wb', 'r', 'w'}, not %s" % mode)
    self._mode: str = "rb" if mode is None else mode
    self._one_shot_download: bool = False
    if 0 < s3_block_size < 3:
        raise exceptions.InvalidArgumentValue(
            "s3_block_size MUST > 2 to define a valid size or "
            "< 1 to avoid blocks and always execute one shot downloads."
        )
    if s3_block_size <= 0:
        _logger.debug("s3_block_size of %d, enabling one_shot_download.", s3_block_size)
        self._one_shot_download = True
    self._s3_block_size: int = s3_block_size
    self._s3_half_block_size: int = s3_block_size // 2
    self._s3_additional_kwargs: Dict[str, str] = {} if s3_additional_kwargs is None else s3_additional_kwargs
    self._client: boto3.client = _utils.client(service_name="s3", session=self._boto3_session)
    self._loc: int = 0
    if self.readable() is True:
        self._cache: bytes = b""
        self._start: int = 0
        self._end: int = 0
        size: Optional[int] = size_objects(
            path=[path],
            use_threads=False,
            boto3_session=self._boto3_session,
            s3_additional_kwargs=self._s3_additional_kwargs,
        )[path]
        if size is None:
            raise exceptions.InvalidArgumentValue(f"S3 object w/o defined size: {path}")
        self._size: int = size
        _logger.debug("self._size: %s", self._size)
        _logger.debug("self._s3_block_size: %s", self._s3_block_size)
    elif self.writable() is True:
        self._mpu: Dict[str, Any] = {}
        self._buffer: io.BytesIO = io.BytesIO()
        self._parts_count: int = 0
        self._size = 0
        self._upload_proxy: _UploadProxy = _UploadProxy(use_threads=self._use_threads)
    else:
        raise RuntimeError(f"Invalid mode: {self._mode}")
def _describe_object(path: str, boto3_session: boto3.Session) -> Tuple[str, Dict[str, Any]]:
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    bucket: str
    key: str
    bucket, key = _utils.parse_path(path=path)
    desc: Dict[str, Any] = _utils.try_it(
        f=client_s3.head_object, ex=client_s3.exceptions.NoSuchKey, Bucket=bucket, Key=key
    )
    return path, desc
def _extract_ctas_manifest_paths(path: str, boto3_session: Optional[boto3.Session] = None) -> List[str]:
    """Get the list of paths of the generated files."""
    bucket_name, key_path = _utils.parse_path(path)
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    body: bytes = client_s3.get_object(Bucket=bucket_name, Key=key_path)["Body"].read()
    return [x for x in body.decode("utf-8").split("\n") if x != ""]
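# Illustration only: how the manifest body above is turned into a list of paths.
# The body content here is made up; a real CTAS manifest lists one S3 path per line.
body = b"s3://bucket/ctas/part-0000\ns3://bucket/ctas/part-0001\n"
paths = [x for x in body.decode("utf-8").split("\n") if x != ""]
assert paths == ["s3://bucket/ctas/part-0000", "s3://bucket/ctas/part-0001"]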
def _split_paths_by_bucket(paths: List[str]) -> Dict[str, List[str]]:
    buckets: Dict[str, List[str]] = {}
    bucket: str
    key: str
    for path in paths:
        bucket, key = _utils.parse_path(path=path)
        if bucket not in buckets:
            buckets[bucket] = []
        buckets[bucket].append(key)
    return buckets
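# Self-contained sketch of the grouping above; parse_path is a simplified stand-in
# for _utils.parse_path and the bucket/key names are illustrative only.
from typing import Dict, List, Tuple

def parse_path(path: str) -> Tuple[str, str]:
    bucket, _, key = path[len("s3://"):].partition("/")
    return bucket, key

paths = ["s3://logs/2021/a.parquet", "s3://logs/2021/b.parquet", "s3://data/x.csv"]
buckets: Dict[str, List[str]] = {}
for p in paths:
    bucket, key = parse_path(p)
    buckets.setdefault(bucket, []).append(key)
print(buckets)  # {'logs': ['2021/a.parquet', '2021/b.parquet'], 'data': ['x.csv']}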
def submit_ecr_credentials_refresh(
    cluster_id: str, path: str, action_on_failure: str = "CONTINUE", boto3_session: Optional[boto3.Session] = None
) -> str:
    """Update internal ECR credentials.

    Parameters
    ----------
    cluster_id : str
        Cluster ID.
    path : str
        Amazon S3 path where Wrangler will stage the script ecr_credentials_refresh.py (e.g. s3://bucket/emr/)
    action_on_failure : str
        'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE'
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Step ID.

    Examples
    --------
    >>> import awswrangler as wr
    >>> step_id = wr.emr.submit_ecr_credentials_refresh("cluster_id", "s3://bucket/emr/")

    """
    path = path[:-1] if path.endswith("/") else path
    path_script: str = f"{path}/ecr_credentials_refresh.py"
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client_s3: boto3.client = _utils.client(service_name="s3", session=session)
    bucket, key = _utils.parse_path(path=path_script)
    region: str = _utils.get_region_from_session(boto3_session=boto3_session)
    client_s3.put_object(
        Body=_get_ecr_credentials_refresh_content(region).encode(encoding="utf-8"), Bucket=bucket, Key=key
    )
    command: str = f"spark-submit --deploy-mode cluster {path_script}"
    name: str = "ECR Credentials Refresh"
    step: Dict[str, Any] = build_step(
        name=name, command=command, action_on_failure=action_on_failure, script=False, boto3_session=session
    )
    client_emr: boto3.client = _utils.client(service_name="emr", session=session)
    response: Dict[str, Any] = client_emr.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step])
    _logger.debug("response: \n%s", pprint.pformat(response))
    return response["StepIds"][0]
def _list_objects(  # pylint: disable=too-many-branches
    path: str,
    delimiter: Optional[str] = None,
    suffix: Union[str, List[str], None] = None,
    ignore_suffix: Union[str, List[str], None] = None,
    last_modified_begin: Optional[datetime.datetime] = None,
    last_modified_end: Optional[datetime.datetime] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> List[str]:
    bucket: str
    prefix_original: str
    bucket, prefix_original = _utils.parse_path(path=path)
    prefix: str = _prefix_cleanup(prefix=prefix_original)
    _suffix: Union[List[str], None] = [suffix] if isinstance(suffix, str) else suffix
    _ignore_suffix: Union[List[str], None] = [ignore_suffix] if isinstance(ignore_suffix, str) else ignore_suffix
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    paginator = client_s3.get_paginator("list_objects_v2")
    args: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix, "PaginationConfig": {"PageSize": 1000}}
    if delimiter is not None:
        args["Delimiter"] = delimiter
    response_iterator = paginator.paginate(**args)
    paths: List[str] = []
    _validate_datetimes(last_modified_begin=last_modified_begin, last_modified_end=last_modified_end)
    for page in response_iterator:  # pylint: disable=too-many-nested-blocks
        if delimiter is None:
            contents: Optional[List[Dict[str, Any]]] = page.get("Contents")
            if contents is not None:
                for content in contents:
                    key: str = content["Key"]
                    if (content is not None) and ("Key" in content):
                        if (_suffix is None) or key.endswith(tuple(_suffix)):
                            if last_modified_begin is not None:
                                if content["LastModified"] < last_modified_begin:
                                    continue
                            if last_modified_end is not None:
                                if content["LastModified"] > last_modified_end:
                                    continue
                            paths.append(f"s3://{bucket}/{key}")
        else:
            prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get("CommonPrefixes")
            if prefixes is not None:
                for pfx in prefixes:
                    if (pfx is not None) and ("Prefix" in pfx):
                        key = pfx["Prefix"]
                        paths.append(f"s3://{bucket}/{key}")
    if prefix != prefix_original:
        paths = fnmatch.filter(paths, path)
    if _ignore_suffix is not None:
        paths = [p for p in paths if p.endswith(tuple(_ignore_suffix)) is False]
    return paths
def _copy_objects(batch: List[Tuple[str, str]], use_threads: bool, boto3_session: boto3.Session) -> None:
    _logger.debug("len(batch): %s", len(batch))
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    resource_s3: boto3.resource = _utils.resource(service_name="s3", session=boto3_session)
    for source, target in batch:
        source_bucket, source_key = _utils.parse_path(path=source)
        copy_source: Dict[str, str] = {"Bucket": source_bucket, "Key": source_key}
        target_bucket, target_key = _utils.parse_path(path=target)
        resource_s3.meta.client.copy(
            CopySource=copy_source,
            Bucket=target_bucket,
            Key=target_key,
            SourceClient=client_s3,
            Config=TransferConfig(num_download_attempts=15, use_threads=use_threads),
        )
def does_object_exist(path: str, boto3_session: Optional[boto3.Session] = None) -> bool:
    """Check if object exists on S3.

    Parameters
    ----------
    path: str
        S3 path (e.g. s3://bucket/key).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    bool
        True if exists, False otherwise.

    Examples
    --------
    Using the default boto3 session

    >>> import awswrangler as wr
    >>> wr.s3.does_object_exist('s3://bucket/key_real')
    True
    >>> wr.s3.does_object_exist('s3://bucket/key_unreal')
    False

    Using a custom boto3 session

    >>> import boto3
    >>> import awswrangler as wr
    >>> wr.s3.does_object_exist('s3://bucket/key_real', boto3_session=boto3.Session())
    True
    >>> wr.s3.does_object_exist('s3://bucket/key_unreal', boto3_session=boto3.Session())
    False

    """
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    bucket: str
    key: str
    bucket, key = _utils.parse_path(path=path)
    try:
        client_s3.head_object(Bucket=bucket, Key=key)
        return True
    except botocore.exceptions.ClientError as ex:
        if ex.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
            return False
        raise ex
def _wait_objects(
    waiter_name: str,
    paths: List[str],
    delay: Optional[Union[int, float]] = None,
    max_attempts: Optional[int] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    delay = 5 if delay is None else delay
    max_attempts = 20 if max_attempts is None else max_attempts
    _delay: int = int(delay) if isinstance(delay, float) else delay
    if len(paths) < 1:
        return None
    _paths: List[Tuple[str, str]] = [_utils.parse_path(path=p) for p in paths]
    if len(_paths) == 1:
        _wait_object(
            path=_paths[0],
            waiter_name=waiter_name,
            delay=_delay,
            max_attempts=max_attempts,
            boto3_session=boto3_session,
        )
    elif use_threads is False:
        for path in _paths:
            _wait_object(
                path=path,
                waiter_name=waiter_name,
                delay=_delay,
                max_attempts=max_attempts,
                boto3_session=boto3_session,
            )
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            list(
                executor.map(
                    _wait_object_concurrent,
                    _paths,
                    itertools.repeat(waiter_name),
                    itertools.repeat(_delay),
                    itertools.repeat(max_attempts),
                    itertools.repeat(_utils.boto3_to_primitives(boto3_session=boto3_session)),
                )
            )
    return None
def _describe_object(
    path: str, wait_time: Optional[Union[int, float]], client_s3: boto3.client
) -> Tuple[str, Dict[str, Any]]:
    wait_time = int(wait_time) if isinstance(wait_time, float) else wait_time
    tries: int = wait_time if (wait_time is not None) and (wait_time > 0) else 1
    bucket: str
    key: str
    bucket, key = _utils.parse_path(path=path)
    desc: Dict[str, Any] = {}
    for i in range(tries, 0, -1):
        try:
            desc = client_s3.head_object(Bucket=bucket, Key=key)
            break
        except botocore.exceptions.ClientError as e:  # pragma: no cover
            if e.response["ResponseMetadata"]["HTTPStatusCode"] == 404:  # Not Found
                _logger.debug("Object not found. %s seconds remaining to wait.", i)
                if i == 1:  # Last try, there is no more need to sleep
                    break
                time.sleep(1)
            else:
                raise e
    return path, desc
def _wait_objects(
    waiter_name: str,
    paths: List[str],
    delay: Optional[Union[int, float]] = None,
    max_attempts: Optional[int] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    delay = 5 if delay is None else delay
    max_attempts = 20 if max_attempts is None else max_attempts
    _delay: int = int(delay) if isinstance(delay, float) else delay
    if len(paths) < 1:
        return None
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    _paths: List[Tuple[str, str]] = [_utils.parse_path(path=p) for p in paths]
    if use_threads is False:
        waiter = client_s3.get_waiter(waiter_name)
        for bucket, key in _paths:
            waiter.wait(Bucket=bucket, Key=key, WaiterConfig={"Delay": _delay, "MaxAttempts": max_attempts})
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            list(
                executor.map(
                    _wait_objects_concurrent,
                    _paths,
                    itertools.repeat(waiter_name),
                    itertools.repeat(client_s3),
                    itertools.repeat(_delay),
                    itertools.repeat(max_attempts),
                )
            )
    return None
def write_redshift_copy_manifest(
    manifest_path: str,
    paths: List[str],
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
) -> Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]:
    """Write Redshift copy manifest and return its structure.

    Only Parquet files are supported.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    manifest_path : str
        Amazon S3 manifest path (e.g. s3://...)
    paths: List[str]
        List of S3 paths (Parquet Files) to be copied.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    s3_additional_kwargs:
        Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass",
        "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging".
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}

    Returns
    -------
    Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]
        Manifest content.

    Examples
    --------
    Copying two files to Redshift cluster.

    >>> import awswrangler as wr
    >>> wr.db.write_redshift_copy_manifest(
    ...     path="s3://bucket/my.manifest",
    ...     paths=["s3://...parquet", "s3://...parquet"]
    ... )

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    objects_sizes: Dict[str, Optional[int]] = s3.size_objects(
        path=paths, use_threads=use_threads, boto3_session=session
    )
    manifest: Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]] = {"entries": []}
    path: str
    size: Optional[int]
    for path, size in objects_sizes.items():
        if size is not None:
            entry: Dict[str, Union[str, bool, Dict[str, int]]] = {
                "url": path,
                "mandatory": True,
                "meta": {"content_length": size},
            }
            manifest["entries"].append(entry)
    payload: str = json.dumps(manifest)
    bucket: str
    bucket, key = _utils.parse_path(manifest_path)
    additional_kwargs: Dict[str, str] = {} if s3_additional_kwargs is None else s3_additional_kwargs
    _logger.debug("payload: %s", payload)
    client_s3: boto3.client = _utils.client(service_name="s3", session=session)
    _logger.debug("bucket: %s", bucket)
    _logger.debug("key: %s", key)
    client_s3.put_object(Body=payload, Bucket=bucket, Key=key, **additional_kwargs)
    return manifest
def write_redshift_copy_manifest(
    manifest_path: str, paths: List[str], use_threads: bool = True, boto3_session: Optional[boto3.Session] = None
) -> Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]:
    """Write Redshift copy manifest and return its structure.

    Only Parquet files are supported.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    manifest_path : str
        Amazon S3 manifest path (e.g. s3://...)
    paths: List[str]
        List of S3 paths (Parquet Files) to be copied.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]
        Manifest content.

    Examples
    --------
    Copying two files to Redshift cluster.

    >>> import awswrangler as wr
    >>> wr.db.write_redshift_copy_manifest(
    ...     path="s3://bucket/my.manifest",
    ...     paths=["s3://...parquet", "s3://...parquet"]
    ... )

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    objects_sizes: Dict[str, Optional[int]] = s3.size_objects(
        path=paths, use_threads=use_threads, boto3_session=session
    )
    manifest: Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]] = {"entries": []}
    path: str
    size: Optional[int]
    for path, size in objects_sizes.items():
        if size is not None:
            entry: Dict[str, Union[str, bool, Dict[str, int]]] = {
                "url": path,
                "mandatory": True,
                "meta": {"content_length": size},
            }
            manifest["entries"].append(entry)
    payload: str = json.dumps(manifest)
    bucket: str
    bucket, key = _utils.parse_path(manifest_path)
    _logger.debug("payload: %s", payload)
    client_s3: boto3.client = _utils.client(service_name="s3", session=session)
    _logger.debug("bucket: %s", bucket)
    _logger.debug("key: %s", key)
    client_s3.put_object(Body=payload, Bucket=bucket, Key=key)
    return manifest
def _list_objects(  # pylint: disable=too-many-branches
    path: str,
    s3_additional_kwargs: Optional[Dict[str, Any]],
    delimiter: Optional[str] = None,
    suffix: Union[str, List[str], None] = None,
    ignore_suffix: Union[str, List[str], None] = None,
    last_modified_begin: Optional[datetime.datetime] = None,
    last_modified_end: Optional[datetime.datetime] = None,
    boto3_session: Optional[boto3.Session] = None,
    ignore_empty: bool = False,
) -> Iterator[List[str]]:
    bucket: str
    prefix_original: str
    bucket, prefix_original = _utils.parse_path(path=path)
    prefix: str = _prefix_cleanup(prefix=prefix_original)
    _suffix: Union[List[str], None] = [suffix] if isinstance(suffix, str) else suffix
    _ignore_suffix: Union[List[str], None] = [ignore_suffix] if isinstance(ignore_suffix, str) else ignore_suffix
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    default_pagination: Dict[str, int] = {"PageSize": 1000}
    extra_kwargs: Dict[str, Any] = {"PaginationConfig": default_pagination}
    if s3_additional_kwargs:
        extra_kwargs = _fs.get_botocore_valid_kwargs(
            function_name="list_objects_v2", s3_additional_kwargs=s3_additional_kwargs
        )
        extra_kwargs["PaginationConfig"] = (
            s3_additional_kwargs["PaginationConfig"]
            if "PaginationConfig" in s3_additional_kwargs
            else default_pagination
        )
    paginator = client_s3.get_paginator("list_objects_v2")
    args: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix, **extra_kwargs}
    if delimiter is not None:
        args["Delimiter"] = delimiter
    _logger.debug("args: %s", args)
    response_iterator = paginator.paginate(**args)
    paths: List[str] = []
    _validate_datetimes(last_modified_begin=last_modified_begin, last_modified_end=last_modified_end)
    for page in response_iterator:  # pylint: disable=too-many-nested-blocks
        if delimiter is None:
            contents: Optional[List[Dict[str, Any]]] = page.get("Contents")
            if contents is not None:
                for content in contents:
                    key: str = content["Key"]
                    if ignore_empty and content.get("Size", 0) == 0:
                        _logger.debug("Skipping empty file: %s", f"s3://{bucket}/{key}")
                    elif (content is not None) and ("Key" in content):
                        if (_suffix is None) or key.endswith(tuple(_suffix)):
                            if last_modified_begin is not None:
                                if content["LastModified"] < last_modified_begin:
                                    continue
                            if last_modified_end is not None:
                                if content["LastModified"] > last_modified_end:
                                    continue
                            paths.append(f"s3://{bucket}/{key}")
        else:
            prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get("CommonPrefixes")
            if prefixes is not None:
                for pfx in prefixes:
                    if (pfx is not None) and ("Prefix" in pfx):
                        key = pfx["Prefix"]
                        paths.append(f"s3://{bucket}/{key}")
        if prefix != prefix_original:
            paths = fnmatch.filter(paths, path)
        if _ignore_suffix is not None:
            paths = [p for p in paths if p.endswith(tuple(_ignore_suffix)) is False]
        if paths:
            yield paths
        paths = []
def select_query(
    sql: str,
    path: str,
    input_serialization: str,
    input_serialization_params: Dict[str, Union[bool, str]],
    compression: Optional[str] = None,
    use_threads: Union[bool, int] = False,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
    r"""Filter contents of an Amazon S3 object based on SQL statement.

    Note: Scan ranges are only supported for uncompressed CSV/JSON, CSV (without quoted delimiters)
    and JSON objects (in LINES mode only). It means scanning cannot be split across threads if the latter
    conditions are not met, leading to lower performance.

    Parameters
    ----------
    sql: str
        SQL statement used to query the object.
    path: str
        S3 path to the object (e.g. s3://bucket/key).
    input_serialization: str,
        Format of the S3 object queried. Valid values: "CSV", "JSON", or "Parquet". Case sensitive.
    input_serialization_params: Dict[str, Union[bool, str]]
        Dictionary describing the serialization of the S3 object.
    compression: Optional[str]
        Compression type of the S3 object. Valid values: None, "gzip", or "bzip2".
        gzip and bzip2 are only valid for CSV and JSON objects.
    use_threads : Union[bool, int]
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() is used as the max number of threads.
        If integer is provided, specified number is used.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session is used if none is provided.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forwarded to botocore requests.
        Valid values: "SSECustomerAlgorithm", "SSECustomerKey", "ExpectedBucketOwner".
        e.g. s3_additional_kwargs={'SSECustomerAlgorithm': 'md5'}

    Returns
    -------
    pandas.DataFrame
        Pandas DataFrame with results from query.

    Examples
    --------
    Reading a gzip compressed JSON document

    >>> import awswrangler as wr
    >>> df = wr.s3.select_query(
    ...     sql='SELECT * FROM s3object[*][*]',
    ...     path='s3://bucket/key.json.gzip',
    ...     input_serialization='JSON',
    ...     input_serialization_params={
    ...         'Type': 'Document',
    ...     },
    ...     compression="gzip",
    ... )

    Reading an entire CSV object using threads

    >>> import awswrangler as wr
    >>> df = wr.s3.select_query(
    ...     sql='SELECT * FROM s3object',
    ...     path='s3://bucket/key.csv',
    ...     input_serialization='CSV',
    ...     input_serialization_params={
    ...         'FileHeaderInfo': 'Use',
    ...         'RecordDelimiter': '\r\n'
    ...     },
    ...     use_threads=True,
    ... )

    Reading a single column from Parquet object with pushdown filter

    >>> import awswrangler as wr
    >>> df = wr.s3.select_query(
    ...     sql='SELECT s.\"id\" FROM s3object s where s.\"id\" = 1.0',
    ...     path='s3://bucket/key.snappy.parquet',
    ...     input_serialization='Parquet',
    ... )
    """
    if path.endswith("/"):
        raise exceptions.InvalidArgumentValue("<path> argument should be an S3 key, not a prefix.")
    if input_serialization not in ["CSV", "JSON", "Parquet"]:
        raise exceptions.InvalidArgumentValue("<input_serialization> argument must be 'CSV', 'JSON' or 'Parquet'")
    if compression not in [None, "gzip", "bzip2"]:
        raise exceptions.InvalidCompression(f"Invalid {compression} compression, please use None, 'gzip' or 'bzip2'.")
    if compression and (input_serialization not in ["CSV", "JSON"]):
        raise exceptions.InvalidArgumentCombination(
            "'gzip' or 'bzip2' are only valid for input 'CSV' or 'JSON' objects."
        )
    bucket, key = _utils.parse_path(path)
    args: Dict[str, Any] = {
        "Bucket": bucket,
        "Key": key,
        "Expression": sql,
        "ExpressionType": "SQL",
        "RequestProgress": {"Enabled": False},
        "InputSerialization": {
            input_serialization: input_serialization_params,
            "CompressionType": compression.upper() if compression else "NONE",
        },
        "OutputSerialization": {
            "JSON": {},
        },
    }
    if s3_additional_kwargs:
        args.update(s3_additional_kwargs)
    _logger.debug("args:\n%s", pprint.pformat(args))
    if any(
        [
            compression,
            input_serialization_params.get("AllowQuotedRecordDelimiter"),
            input_serialization_params.get("Type") == "Document",
        ]
    ):
        # Scan range is only supported for uncompressed CSV/JSON, CSV (without quoted delimiters)
        # and JSON objects (in LINES mode only)
        _logger.debug("Scan ranges are not supported given provided input.")
        return pd.DataFrame(_select_object_content(args=args, boto3_session=boto3_session))
    return _paginate_stream(args=args, path=path, use_threads=use_threads, boto3_session=boto3_session)
def index_json(
    client: OpenSearch,
    path: str,
    index: str,
    doc_type: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = boto3.Session(),
    json_path: Optional[str] = None,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Index all documents from JSON file to OpenSearch index.

    The JSON file should be in a JSON-Lines text format (newline-delimited JSON) - https://jsonlines.org/
    OR if it is a single large JSON please provide `json_path`.

    Parameters
    ----------
    client : OpenSearch
        instance of opensearchpy.OpenSearch to use.
    path : str
        s3 or local path to the JSON file which contains the documents.
    index : str
        Name of the index.
    doc_type : str, optional
        Name of the document type (for Elasticsearch versions 5.x and earlier).
    json_path : str, optional
        JsonPath expression to specify explicit path to a single name element
        in a JSON hierarchical data structure.
        Read more about `JsonPath <https://jsonpath.com>`_
    boto3_session : boto3.Session(), optional
        Boto3 Session to be used to access s3 if s3 path is provided.
        The default boto3 Session will be used if boto3_session receives None.
    **kwargs :
        KEYWORD arguments forwarded to :func:`~awswrangler.opensearch.index_documents`
        which is used to execute the operation

    Returns
    -------
    Dict[str, Any]
        Response payload
        https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#response.

    Examples
    --------
    Writing contents of JSON file

    >>> import awswrangler as wr
    >>> client = wr.opensearch.connect(host='DOMAIN-ENDPOINT')
    >>> wr.opensearch.index_json(
    ...     client=client,
    ...     path='docs.json',
    ...     index='sample-index1'
    ... )
    """
    _logger.debug("indexing %s from %s", index, path)

    if boto3_session is None:
        raise ValueError("boto3_session cannot be None")

    if path.startswith("s3://"):
        bucket, key = parse_path(path)
        s3 = boto3_session.client("s3")
        obj = s3.get_object(Bucket=bucket, Key=key)
        body = obj["Body"].read()
        lines = body.splitlines()
        documents = [json.loads(line) for line in lines]
        if json_path:
            documents = _get_documents_w_json_path(documents, json_path)
    else:  # local path
        documents = list(_file_line_generator(path, is_json=True))
        if json_path:
            documents = _get_documents_w_json_path(documents, json_path)
    return index_documents(client=client, documents=documents, index=index, doc_type=doc_type, **kwargs)