Example 1
def _copy_objects(
    batch: List[Tuple[str, str]],
    use_threads: bool,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, Any]],
) -> None:
    _logger.debug("len(batch): %s", len(batch))
    client_s3: boto3.client = _utils.client(service_name="s3",
                                            session=boto3_session)
    resource_s3: boto3.resource = _utils.resource(service_name="s3",
                                                  session=boto3_session)
    if s3_additional_kwargs is None:
        boto3_kwargs: Optional[Dict[str, Any]] = None
    else:
        boto3_kwargs = get_botocore_valid_kwargs(
            function_name="copy_object",
            s3_additional_kwargs=s3_additional_kwargs)
    for source, target in batch:
        source_bucket, source_key = _utils.parse_path(path=source)
        copy_source: Dict[str, str] = {
            "Bucket": source_bucket,
            "Key": source_key
        }
        target_bucket, target_key = _utils.parse_path(path=target)
        resource_s3.meta.client.copy(
            CopySource=copy_source,
            Bucket=target_bucket,
            Key=target_key,
            SourceClient=client_s3,
            ExtraArgs=boto3_kwargs,
            Config=TransferConfig(num_download_attempts=10,
                                  use_threads=use_threads),
        )
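A minimal usage sketch for the helper above, assuming the surrounding module context (_utils, _logger, TransferConfig, get_botocore_valid_kwargs); the bucket names, keys, and KMS key ARN are hypothetical placeholders.

import boto3

# Hypothetical (source, target) pairs; both buckets must already exist and be accessible.
batch = [
    ("s3://source-bucket/data/part-0.parquet", "s3://target-bucket/data/part-0.parquet"),
    ("s3://source-bucket/data/part-1.parquet", "s3://target-bucket/data/part-1.parquet"),
]
_copy_objects(
    batch=batch,
    use_threads=True,
    boto3_session=boto3.Session(),
    s3_additional_kwargs={"ServerSideEncryption": "aws:kms", "SSEKMSKeyId": "YOUR_KMS_KEY_ARN"},
)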
Example 2
def _get_connection_attributes_from_catalog(
    connection: str, catalog_id: Optional[str], dbname: Optional[str], boto3_session: Optional[boto3.Session]
) -> ConnectionAttributes:
    details: Dict[str, Any] = get_connection(name=connection, catalog_id=catalog_id, boto3_session=boto3_session)[
        "ConnectionProperties"
    ]
    if ";databaseName=" in details["JDBC_CONNECTION_URL"]:
        database_sep = ";databaseName="
    else:
        database_sep = "/"
    port, database = details["JDBC_CONNECTION_URL"].split(":")[3].split(database_sep)
    ssl_context: Optional[ssl.SSLContext] = None
    if details.get("JDBC_ENFORCE_SSL") == "true":
        ssl_cert_path: Optional[str] = details.get("CUSTOM_JDBC_CERT")
        ssl_cadata: Optional[str] = None
        if ssl_cert_path:
            bucket_name, key_path = _utils.parse_path(ssl_cert_path)
            client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
            try:
                ssl_cadata = client_s3.get_object(Bucket=bucket_name, Key=key_path)["Body"].read().decode("utf-8")
            except client_s3.exceptions.NoSuchKey:
                raise exceptions.NoFilesFound(  # pylint: disable=raise-missing-from
                    f"No CA certificate found at {ssl_cert_path}."
                )
        ssl_context = ssl.create_default_context(cadata=ssl_cadata)
    return ConnectionAttributes(
        kind=details["JDBC_CONNECTION_URL"].split(":")[1].lower(),
        user=details["USERNAME"],
        password=details["PASSWORD"],
        host=details["JDBC_CONNECTION_URL"].split(":")[2].replace("/", ""),
        port=int(port),
        database=dbname if dbname is not None else database,
        ssl_context=ssl_context,
    )
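To make the string slicing above concrete, here is a small standalone sketch of how a JDBC URL like the one stored in JDBC_CONNECTION_URL would be decomposed (the URL itself is a hypothetical example):

url = "jdbc:postgresql://my-host.example.com:5432/mydb"
kind = url.split(":")[1].lower()                        # "postgresql"
host = url.split(":")[2].replace("/", "")               # "my-host.example.com"
database_sep = ";databaseName=" if ";databaseName=" in url else "/"
port, database = url.split(":")[3].split(database_sep)  # ("5432", "mydb")
# SQL Server style URLs use ";databaseName=" instead of "/":
# "jdbc:sqlserver://my-host.example.com:1433;databaseName=mydb"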
Example 3
def _describe_object(
    path: str,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, Any]],
    version_id: Optional[str] = None,
) -> Tuple[str, Dict[str, Any]]:
    client_s3: boto3.client = _utils.client(service_name="s3",
                                            session=boto3_session)
    bucket: str
    key: str
    bucket, key = _utils.parse_path(path=path)
    if s3_additional_kwargs:
        extra_kwargs: Dict[str, Any] = _fs.get_botocore_valid_kwargs(
            function_name="head_object",
            s3_additional_kwargs=s3_additional_kwargs)
    else:
        extra_kwargs = {}
    desc: Dict[str, Any]
    if version_id:
        extra_kwargs["VersionId"] = version_id
    desc = _utils.try_it(f=client_s3.head_object,
                         ex=client_s3.exceptions.NoSuchKey,
                         Bucket=bucket,
                         Key=key,
                         **extra_kwargs)
    return path, desc
Example 4
def _list_objects(
    path: str,
    delimiter: Optional[str] = None,
    suffix: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> List[str]:
    bucket: str
    prefix: str
    bucket, prefix = _utils.parse_path(path=path)
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    paginator = client_s3.get_paginator("list_objects_v2")
    args: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix, "PaginationConfig": {"PageSize": 1000}}
    if delimiter is not None:
        args["Delimiter"] = delimiter
    response_iterator = paginator.paginate(**args)
    paths: List[str] = []
    for page in response_iterator:  # pylint: disable=too-many-nested-blocks
        if delimiter is None:
            contents: Optional[List] = page.get("Contents")
            if contents is not None:
                for content in contents:
                    if (content is not None) and ("Key" in content):
                        key: str = content["Key"]
                        if (suffix is None) or key.endswith(suffix):
                            paths.append(f"s3://{bucket}/{key}")
        else:
            prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get("CommonPrefixes")
            if prefixes is not None:
                for pfx in prefixes:
                    if (pfx is not None) and ("Prefix" in pfx):
                        key = pfx["Prefix"]
                        paths.append(f"s3://{bucket}/{key}")
    return paths
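A hedged usage sketch, assuming the module context above; the bucket, prefix, and suffix values are placeholders.

import boto3

# List only Parquet objects under a prefix (the suffix filter is applied client-side).
parquet_paths = _list_objects(
    path="s3://my-bucket/datasets/sales/",
    suffix=".parquet",
    boto3_session=boto3.Session(),
)
# List the "directories" one level below a prefix instead of the objects themselves.
subprefixes = _list_objects(path="s3://my-bucket/datasets/", delimiter="/")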
Example 5
def does_object_exist(path: str,
                      s3_additional_kwargs: Optional[Dict[str, Any]] = None,
                      boto3_session: Optional[boto3.Session] = None) -> bool:
    """Check if object exists on S3.

    Parameters
    ----------
    path: str
        S3 path (e.g. s3://bucket/key).
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forward to botocore requests. Valid parameters: "RequestPayer", "ExpectedBucketOwner".
        e.g. s3_additional_kwargs={'RequestPayer': 'requester'}
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    bool
        True if exists, False otherwise.

    Examples
    --------
    Using the default boto3 session

    >>> import awswrangler as wr
    >>> wr.s3.does_object_exist('s3://bucket/key_real')
    True
    >>> wr.s3.does_object_exist('s3://bucket/key_unreal')
    False

    Using a custom boto3 session

    >>> import boto3
    >>> import awswrangler as wr
    >>> wr.s3.does_object_exist('s3://bucket/key_real', boto3_session=boto3.Session())
    True
    >>> wr.s3.does_object_exist('s3://bucket/key_unreal', boto3_session=boto3.Session())
    False

    """
    client_s3: boto3.client = _utils.client(service_name="s3",
                                            session=boto3_session)
    bucket: str
    key: str
    bucket, key = _utils.parse_path(path=path)
    if s3_additional_kwargs:
        extra_kwargs: Dict[str, Any] = _fs.get_botocore_valid_kwargs(
            function_name="head_object",
            s3_additional_kwargs=s3_additional_kwargs)
    else:
        extra_kwargs = {}
    try:
        client_s3.head_object(Bucket=bucket, Key=key, **extra_kwargs)
        return True
    except botocore.exceptions.ClientError as ex:
        if ex.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
            return False
        raise ex
Example 6
    def __init__(
        self,
        path: str,
        s3_block_size: int,
        mode: str,
        use_threads: Union[bool, int],
        s3_additional_kwargs: Optional[Dict[str, str]],
        boto3_session: Optional[boto3.Session],
        newline: Optional[str],
        encoding: Optional[str],
    ) -> None:
        super().__init__()
        self._use_threads = use_threads
        self._newline: str = "\n" if newline is None else newline
        self._encoding: str = "utf-8" if encoding is None else encoding
        self._bucket, self._key = _utils.parse_path(path=path)
        self._boto3_session: boto3.Session = _utils.ensure_session(session=boto3_session)
        if mode not in {"rb", "wb", "r", "w"}:
            raise NotImplementedError("File mode must be {'rb', 'wb', 'r', 'w'}, not %s" % mode)
        self._mode: str = "rb" if mode is None else mode
        self._one_shot_download: bool = False
        if 0 < s3_block_size < 3:
            raise exceptions.InvalidArgumentValue(
                "s3_block_size MUST > 2 to define a valid size or "
                "< 1 to avoid blocks and always execute one shot downloads."
            )
        if s3_block_size <= 0:
            _logger.debug("s3_block_size of %d, enabling one_shot_download.", s3_block_size)
            self._one_shot_download = True
        self._s3_block_size: int = s3_block_size
        self._s3_half_block_size: int = s3_block_size // 2
        self._s3_additional_kwargs: Dict[str, str] = {} if s3_additional_kwargs is None else s3_additional_kwargs
        self._client: boto3.client = _utils.client(service_name="s3", session=self._boto3_session)
        self._loc: int = 0

        if self.readable() is True:
            self._cache: bytes = b""
            self._start: int = 0
            self._end: int = 0
            size: Optional[int] = size_objects(
                path=[path],
                use_threads=False,
                boto3_session=self._boto3_session,
                s3_additional_kwargs=self._s3_additional_kwargs,
            )[path]
            if size is None:
                raise exceptions.InvalidArgumentValue(f"S3 object w/o defined size: {path}")
            self._size: int = size
            _logger.debug("self._size: %s", self._size)
            _logger.debug("self._s3_block_size: %s", self._s3_block_size)
        elif self.writable() is True:
            self._mpu: Dict[str, Any] = {}
            self._buffer: io.BytesIO = io.BytesIO()
            self._parts_count: int = 0
            self._size = 0
            self._upload_proxy: _UploadProxy = _UploadProxy(use_threads=self._use_threads)
        else:
            raise RuntimeError(f"Invalid mode: {self._mode}")
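The s3_block_size handling above can be restated as a small standalone sketch (this mirrors the constructor's validation and is not part of the class): values below 1 force one-shot downloads, 1 and 2 are rejected, and 3 or more is used as the block size.

def _classify_block_size(s3_block_size: int) -> str:
    # Mirrors the constructor: 0 < size < 3 is invalid, size <= 0 means one-shot download.
    if 0 < s3_block_size < 3:
        raise ValueError("s3_block_size must be > 2, or < 1 for one-shot downloads.")
    return "one_shot_download" if s3_block_size <= 0 else f"block size of {s3_block_size} bytes"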
Example 7
def _describe_object(path: str, boto3_session: boto3.Session) -> Tuple[str, Dict[str, Any]]:
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    bucket: str
    key: str
    bucket, key = _utils.parse_path(path=path)
    desc: Dict[str, Any] = _utils.try_it(
        f=client_s3.head_object, ex=client_s3.exceptions.NoSuchKey, Bucket=bucket, Key=key
    )
    return path, desc
Example 8
def _extract_ctas_manifest_paths(path: str,
                                 boto3_session: Optional[boto3.Session] = None
                                 ) -> List[str]:
    """Get the list of paths of the generated files."""
    bucket_name, key_path = _utils.parse_path(path)
    client_s3: boto3.client = _utils.client(service_name="s3",
                                            session=boto3_session)
    body: bytes = client_s3.get_object(Bucket=bucket_name,
                                       Key=key_path)["Body"].read()
    return [x for x in body.decode("utf-8").split("\n") if x != ""]
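The parsing step above is just a newline split that drops empty lines; a tiny self-contained illustration with a hypothetical manifest body:

body = b"s3://bucket/athena-results/tmp/part-0.csv\ns3://bucket/athena-results/tmp/part-1.csv\n"
paths = [x for x in body.decode("utf-8").split("\n") if x != ""]
# ['s3://bucket/athena-results/tmp/part-0.csv', 's3://bucket/athena-results/tmp/part-1.csv']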
Example 9
def _split_paths_by_bucket(paths: List[str]) -> Dict[str, List[str]]:
    buckets: Dict[str, List[str]] = {}
    bucket: str
    key: str
    for path in paths:
        bucket, key = _utils.parse_path(path=path)
        if bucket not in buckets:
            buckets[bucket] = []
        buckets[bucket].append(key)
    return buckets
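Because this helper is a pure function over path strings, its behaviour is easy to show directly (the paths are hypothetical; _utils.parse_path is assumed to split "s3://bucket/key" into ("bucket", "key")):

paths = [
    "s3://bucket-a/data/file1.parquet",
    "s3://bucket-a/data/file2.parquet",
    "s3://bucket-b/logs/file3.json",
]
_split_paths_by_bucket(paths)
# {'bucket-a': ['data/file1.parquet', 'data/file2.parquet'],
#  'bucket-b': ['logs/file3.json']}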
Example 10
def submit_ecr_credentials_refresh(
        cluster_id: str,
        path: str,
        action_on_failure: str = "CONTINUE",
        boto3_session: Optional[boto3.Session] = None) -> str:
    """Update internal ECR credentials.

    Parameters
    ----------
    cluster_id : str
        Cluster ID.
    path : str
        Amazon S3 path where Wrangler will stage the script ecr_credentials_refresh.py (e.g. s3://bucket/emr/)
    action_on_failure : str
        'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER', 'CANCEL_AND_WAIT', 'CONTINUE'
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    str
        Step ID.

    Examples
    --------
    >>> import awswrangler as wr
    >>> step_id = wr.emr.submit_ecr_credentials_refresh("cluster_id", "s3://bucket/emr/")

    """
    path = path[:-1] if path.endswith("/") else path
    path_script: str = f"{path}/ecr_credentials_refresh.py"
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    client_s3: boto3.client = _utils.client(service_name="s3", session=session)
    bucket, key = _utils.parse_path(path=path_script)
    region: str = _utils.get_region_from_session(boto3_session=boto3_session)
    client_s3.put_object(
        Body=_get_ecr_credentials_refresh_content(region).encode(
            encoding="utf-8"),
        Bucket=bucket,
        Key=key)
    command: str = f"spark-submit --deploy-mode cluster {path_script}"
    name: str = "ECR Credentials Refresh"
    step: Dict[str, Any] = build_step(name=name,
                                      command=command,
                                      action_on_failure=action_on_failure,
                                      script=False,
                                      boto3_session=session)
    client_emr: boto3.client = _utils.client(service_name="emr",
                                             session=session)
    response: Dict[str, Any] = client_emr.add_job_flow_steps(
        JobFlowId=cluster_id, Steps=[step])
    _logger.debug("response: \n%s", pprint.pformat(response))
    return response["StepIds"][0]
Example 11
def _list_objects(  # pylint: disable=too-many-branches
    path: str,
    delimiter: Optional[str] = None,
    suffix: Union[str, List[str], None] = None,
    ignore_suffix: Union[str, List[str], None] = None,
    last_modified_begin: Optional[datetime.datetime] = None,
    last_modified_end: Optional[datetime.datetime] = None,
    boto3_session: Optional[boto3.Session] = None,
) -> List[str]:
    bucket: str
    prefix_original: str
    bucket, prefix_original = _utils.parse_path(path=path)
    prefix: str = _prefix_cleanup(prefix=prefix_original)
    _suffix: Union[List[str], None] = [suffix] if isinstance(suffix, str) else suffix
    _ignore_suffix: Union[List[str], None] = [ignore_suffix] if isinstance(ignore_suffix, str) else ignore_suffix
    client_s3: boto3.client = _utils.client(service_name="s3", session=boto3_session)
    paginator = client_s3.get_paginator("list_objects_v2")
    args: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix, "PaginationConfig": {"PageSize": 1000}}
    if delimiter is not None:
        args["Delimiter"] = delimiter
    response_iterator = paginator.paginate(**args)
    paths: List[str] = []
    _validate_datetimes(last_modified_begin=last_modified_begin, last_modified_end=last_modified_end)

    for page in response_iterator:  # pylint: disable=too-many-nested-blocks
        if delimiter is None:
            contents: Optional[List[Dict[str, Any]]] = page.get("Contents")
            if contents is not None:
                for content in contents:
                    if (content is not None) and ("Key" in content):
                        key: str = content["Key"]
                        if (_suffix is None) or key.endswith(tuple(_suffix)):
                            if last_modified_begin is not None:
                                if content["LastModified"] < last_modified_begin:
                                    continue
                            if last_modified_end is not None:
                                if content["LastModified"] > last_modified_end:
                                    continue
                            paths.append(f"s3://{bucket}/{key}")
        else:
            prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get("CommonPrefixes")
            if prefixes is not None:
                for pfx in prefixes:
                    if (pfx is not None) and ("Prefix" in pfx):
                        key = pfx["Prefix"]
                        paths.append(f"s3://{bucket}/{key}")

    if prefix != prefix_original:
        paths = fnmatch.filter(paths, path)

    if _ignore_suffix is not None:
        paths = [p for p in paths if p.endswith(tuple(_ignore_suffix)) is False]

    return paths
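A hedged usage sketch of the filtered listing, assuming the module context above; the bucket, suffixes, and dates are placeholders.

import datetime

recent_csvs = _list_objects(
    path="s3://my-bucket/exports/",
    suffix=[".csv", ".csv.gz"],
    ignore_suffix="_SUCCESS",
    last_modified_begin=datetime.datetime(2021, 1, 1, tzinfo=datetime.timezone.utc),
)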
Example 12
def _copy_objects(batch: List[Tuple[str, str]], use_threads: bool,
                  boto3_session: boto3.Session) -> None:
    _logger.debug("len(batch): %s", len(batch))
    client_s3: boto3.client = _utils.client(service_name="s3",
                                            session=boto3_session)
    resource_s3: boto3.resource = _utils.resource(service_name="s3",
                                                  session=boto3_session)
    for source, target in batch:
        source_bucket, source_key = _utils.parse_path(path=source)
        copy_source: Dict[str, str] = {
            "Bucket": source_bucket,
            "Key": source_key
        }
        target_bucket, target_key = _utils.parse_path(path=target)
        resource_s3.meta.client.copy(
            CopySource=copy_source,
            Bucket=target_bucket,
            Key=target_key,
            SourceClient=client_s3,
            Config=TransferConfig(num_download_attempts=15,
                                  use_threads=use_threads),
        )
Example 13
def does_object_exist(path: str,
                      boto3_session: Optional[boto3.Session] = None) -> bool:
    """Check if object exists on S3.

    Parameters
    ----------
    path: str
        S3 path (e.g. s3://bucket/key).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    bool
        True if exists, False otherwise.

    Examples
    --------
    Using the default boto3 session

    >>> import awswrangler as wr
    >>> wr.s3.does_object_exist('s3://bucket/key_real')
    True
    >>> wr.s3.does_object_exist('s3://bucket/key_unreal')
    False

    Using a custom boto3 session

    >>> import boto3
    >>> import awswrangler as wr
    >>> wr.s3.does_object_exist('s3://bucket/key_real', boto3_session=boto3.Session())
    True
    >>> wr.s3.does_object_exist('s3://bucket/key_unreal', boto3_session=boto3.Session())
    False

    """
    client_s3: boto3.client = _utils.client(service_name="s3",
                                            session=boto3_session)
    bucket: str
    key: str
    bucket, key = _utils.parse_path(path=path)
    try:
        client_s3.head_object(Bucket=bucket, Key=key)
        return True
    except botocore.exceptions.ClientError as ex:
        if ex.response["ResponseMetadata"]["HTTPStatusCode"] == 404:
            return False
        raise ex
Example 14
def _wait_objects(
    waiter_name: str,
    paths: List[str],
    delay: Optional[Union[int, float]] = None,
    max_attempts: Optional[int] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    delay = 5 if delay is None else delay
    max_attempts = 20 if max_attempts is None else max_attempts
    _delay: int = int(delay) if isinstance(delay, float) else delay
    if len(paths) < 1:
        return None
    _paths: List[Tuple[str, str]] = [_utils.parse_path(path=p) for p in paths]
    if len(_paths) == 1:
        _wait_object(
            path=_paths[0],
            waiter_name=waiter_name,
            delay=_delay,
            max_attempts=max_attempts,
            boto3_session=boto3_session,
        )
    elif use_threads is False:
        for path in _paths:
            _wait_object(path=path,
                         waiter_name=waiter_name,
                         delay=_delay,
                         max_attempts=max_attempts,
                         boto3_session=boto3_session)
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=cpus) as executor:
            list(
                executor.map(
                    _wait_object_concurrent,
                    _paths,
                    itertools.repeat(waiter_name),
                    itertools.repeat(_delay),
                    itertools.repeat(max_attempts),
                    itertools.repeat(
                        _utils.boto3_to_primitives(
                            boto3_session=boto3_session)),
                ))
    return None
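The thread fan-out above relies on executor.map zipping the per-path iterable with itertools.repeat for the shared arguments; a minimal standalone sketch of that pattern:

import concurrent.futures
import itertools

def _work(item: str, shared_delay: int, shared_name: str) -> str:
    return f"{item}:{shared_delay}:{shared_name}"

items = ["a", "b", "c"]
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    results = list(executor.map(_work, items, itertools.repeat(5), itertools.repeat("waiter")))
# ['a:5:waiter', 'b:5:waiter', 'c:5:waiter']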
Example 15
def _describe_object(
    path: str, wait_time: Optional[Union[int, float]], client_s3: boto3.client
) -> Tuple[str, Dict[str, Any]]:
    wait_time = int(wait_time) if isinstance(wait_time, float) else wait_time
    tries: int = wait_time if (wait_time is not None) and (wait_time > 0) else 1
    bucket: str
    key: str
    bucket, key = _utils.parse_path(path=path)
    desc: Dict[str, Any] = {}
    for i in range(tries, 0, -1):
        try:
            desc = client_s3.head_object(Bucket=bucket, Key=key)
            break
        except botocore.exceptions.ClientError as e:  # pragma: no cover
            if e.response["ResponseMetadata"]["HTTPStatusCode"] == 404:  # Not Found
                _logger.debug("Object not found. %s seconds remaining to wait.", i)
                if i == 1:  # Last try, there is no more need to sleep
                    break
                time.sleep(1)
            else:
                raise e
    return path, desc
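A hedged usage sketch, assuming an existing S3 client; wait_time is interpreted as roughly the number of one-second retries while the object is not yet visible.

import boto3

client_s3 = boto3.client("s3")
path, desc = _describe_object(
    path="s3://my-bucket/just-written/key.parquet",  # hypothetical path
    wait_time=10,
    client_s3=client_s3,
)
content_length = desc.get("ContentLength")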
Example 16
def _wait_objects(
    waiter_name: str,
    paths: List[str],
    delay: Optional[Union[int, float]] = None,
    max_attempts: Optional[int] = None,
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
) -> None:
    delay = 5 if delay is None else delay
    max_attempts = 20 if max_attempts is None else max_attempts
    _delay: int = int(delay) if isinstance(delay, float) else delay
    if len(paths) < 1:
        return None
    client_s3: boto3.client = _utils.client(service_name="s3",
                                            session=boto3_session)
    _paths: List[Tuple[str, str]] = [_utils.parse_path(path=p) for p in paths]
    if use_threads is False:
        waiter = client_s3.get_waiter(waiter_name)
        for bucket, key in _paths:
            waiter.wait(Bucket=bucket,
                        Key=key,
                        WaiterConfig={
                            "Delay": _delay,
                            "MaxAttempts": max_attempts
                        })
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=cpus) as executor:
            list(
                executor.map(
                    _wait_objects_concurrent,
                    _paths,
                    itertools.repeat(waiter_name),
                    itertools.repeat(client_s3),
                    itertools.repeat(_delay),
                    itertools.repeat(max_attempts),
                ))
    return None
Example 17
def write_redshift_copy_manifest(
    manifest_path: str,
    paths: List[str],
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, str]] = None,
) -> Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]:
    """Write Redshift copy manifest and return its structure.

    Only Parquet files are supported.

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    manifest_path : str
        Amazon S3 manifest path (e.g. s3://...)
    paths: List[str]
        List of S3 paths (Parquet Files) to be copied.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    s3_additional_kwargs : Optional[Dict[str, str]]
        Forward to botocore requests. Valid parameters: "ACL", "Metadata", "ServerSideEncryption", "StorageClass",
        "SSECustomerAlgorithm", "SSECustomerKey", "SSEKMSKeyId", "SSEKMSEncryptionContext", "Tagging".
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}

    Returns
    -------
    Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]
        Manifest content.

    Examples
    --------
    Copying two files to Redshift cluster.

    >>> import awswrangler as wr
    >>> wr.db.write_redshift_copy_manifest(
    ...     path="s3://bucket/my.manifest",
    ...     paths=["s3://...parquet", "s3://...parquet"]
    ... )

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    objects_sizes: Dict[str, Optional[int]] = s3.size_objects(
        path=paths, use_threads=use_threads, boto3_session=session)
    manifest: Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]] = {
        "entries": []
    }
    path: str
    size: Optional[int]
    for path, size in objects_sizes.items():
        if size is not None:
            entry: Dict[str, Union[str, bool, Dict[str, int]]] = {
                "url": path,
                "mandatory": True,
                "meta": {
                    "content_length": size
                },
            }
            manifest["entries"].append(entry)
    payload: str = json.dumps(manifest)
    bucket: str
    bucket, key = _utils.parse_path(manifest_path)
    additional_kwargs: Dict[str, str] = (
        {} if s3_additional_kwargs is None else s3_additional_kwargs)
    _logger.debug("payload: %s", payload)
    client_s3: boto3.client = _utils.client(service_name="s3", session=session)
    _logger.debug("bucket: %s", bucket)
    _logger.debug("key: %s", key)
    client_s3.put_object(Body=payload,
                         Bucket=bucket,
                         Key=key,
                         **additional_kwargs)
    return manifest
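For reference, the manifest this function builds and uploads has the shape below (the paths and sizes are hypothetical):

{
    "entries": [
        {"url": "s3://bucket/data/part-0.parquet", "mandatory": True, "meta": {"content_length": 262144}},
        {"url": "s3://bucket/data/part-1.parquet", "mandatory": True, "meta": {"content_length": 262657}},
    ]
}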
Example 18
def write_redshift_copy_manifest(
    manifest_path: str,
    paths: List[str],
    use_threads: bool = True,
    boto3_session: Optional[boto3.Session] = None
) -> Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]:
    """Write Redshift copy manifest and return its structure.

    Only Parquet files are supported.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().

    Parameters
    ----------
    manifest_path : str
        Amazon S3 manifest path (e.g. s3://...)
    paths: List[str]
        List of S3 paths (Parquet Files) to be copied.
    use_threads : bool
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.

    Returns
    -------
    Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]]
        Manifest content.

    Examples
    --------
    Copying two files to Redshift cluster.

    >>> import awswrangler as wr
    >>> wr.db.write_redshift_copy_manifest(
    ...     path="s3://bucket/my.manifest",
    ...     paths=["s3://...parquet", "s3://...parquet"]
    ... )

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    objects_sizes: Dict[str, Optional[int]] = s3.size_objects(
        path=paths, use_threads=use_threads, boto3_session=session)
    manifest: Dict[str, List[Dict[str, Union[str, bool, Dict[str, int]]]]] = {
        "entries": []
    }
    path: str
    size: Optional[int]
    for path, size in objects_sizes.items():
        if size is not None:
            entry: Dict[str, Union[str, bool, Dict[str, int]]] = {
                "url": path,
                "mandatory": True,
                "meta": {
                    "content_length": size
                },
            }
            manifest["entries"].append(entry)
    payload: str = json.dumps(manifest)
    bucket: str
    bucket, key = _utils.parse_path(manifest_path)
    _logger.debug(f"payload: {payload}")
    client_s3: boto3.client = _utils.client(service_name="s3", session=session)
    _logger.debug(f"bucket: {bucket}")
    _logger.debug(f"key: {key}")
    client_s3.put_object(Body=payload, Bucket=bucket, Key=key)
    return manifest
Example 19
def _list_objects(  # pylint: disable=too-many-branches
    path: str,
    s3_additional_kwargs: Optional[Dict[str, Any]],
    delimiter: Optional[str] = None,
    suffix: Union[str, List[str], None] = None,
    ignore_suffix: Union[str, List[str], None] = None,
    last_modified_begin: Optional[datetime.datetime] = None,
    last_modified_end: Optional[datetime.datetime] = None,
    boto3_session: Optional[boto3.Session] = None,
    ignore_empty: bool = False,
) -> Iterator[List[str]]:
    bucket: str
    prefix_original: str
    bucket, prefix_original = _utils.parse_path(path=path)
    prefix: str = _prefix_cleanup(prefix=prefix_original)
    _suffix: Union[List[str], None] = [suffix] if isinstance(suffix, str) else suffix
    _ignore_suffix: Union[List[str], None] = [ignore_suffix] if isinstance(
        ignore_suffix, str) else ignore_suffix
    client_s3: boto3.client = _utils.client(service_name="s3",
                                            session=boto3_session)
    default_pagination: Dict[str, int] = {"PageSize": 1000}
    extra_kwargs: Dict[str, Any] = {"PaginationConfig": default_pagination}
    if s3_additional_kwargs:
        extra_kwargs = _fs.get_botocore_valid_kwargs(
            function_name="list_objects_v2",
            s3_additional_kwargs=s3_additional_kwargs)
        extra_kwargs["PaginationConfig"] = (
            s3_additional_kwargs["PaginationConfig"] if "PaginationConfig"
            in s3_additional_kwargs else default_pagination)
    paginator = client_s3.get_paginator("list_objects_v2")
    args: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix, **extra_kwargs}
    if delimiter is not None:
        args["Delimiter"] = delimiter
    _logger.debug("args: %s", args)
    response_iterator = paginator.paginate(**args)
    paths: List[str] = []
    _validate_datetimes(last_modified_begin=last_modified_begin,
                        last_modified_end=last_modified_end)

    for page in response_iterator:  # pylint: disable=too-many-nested-blocks
        if delimiter is None:
            contents: Optional[List[Dict[str, Any]]] = page.get("Contents")
            if contents is not None:
                for content in contents:
                    if (content is None) or ("Key" not in content):
                        continue
                    key: str = content["Key"]
                    if ignore_empty and content.get("Size", 0) == 0:
                        _logger.debug("Skipping empty file: %s",
                                      f"s3://{bucket}/{key}")
                    else:
                        if (_suffix is None) or key.endswith(tuple(_suffix)):
                            if last_modified_begin is not None:
                                if content[
                                        "LastModified"] < last_modified_begin:
                                    continue
                            if last_modified_end is not None:
                                if content["LastModified"] > last_modified_end:
                                    continue
                            paths.append(f"s3://{bucket}/{key}")
        else:
            prefixes: Optional[List[Optional[Dict[str, str]]]] = page.get(
                "CommonPrefixes")
            if prefixes is not None:
                for pfx in prefixes:
                    if (pfx is not None) and ("Prefix" in pfx):
                        key = pfx["Prefix"]
                        paths.append(f"s3://{bucket}/{key}")

        if prefix != prefix_original:
            paths = fnmatch.filter(paths, path)

        if _ignore_suffix is not None:
            paths = [
                p for p in paths if p.endswith(tuple(_ignore_suffix)) is False
            ]

        if paths:
            yield paths
        paths = []
Example 20
def select_query(
    sql: str,
    path: str,
    input_serialization: str,
    input_serialization_params: Dict[str, Union[bool, str]],
    compression: Optional[str] = None,
    use_threads: Union[bool, int] = False,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
    r"""Filter contents of an Amazon S3 object based on SQL statement.

    Note: Scan ranges are only supported for uncompressed CSV (without quoted delimiters)
    and JSON objects (in LINES mode only). If these conditions are not met, the scan cannot be
    split across threads, which lowers performance.

    Parameters
    ----------
    sql: str
        SQL statement used to query the object.
    path: str
        S3 path to the object (e.g. s3://bucket/key).
    input_serialization: str
        Format of the S3 object queried.
        Valid values: "CSV", "JSON", or "Parquet". Case sensitive.
    input_serialization_params: Dict[str, Union[bool, str]]
        Dictionary describing the serialization of the S3 object.
    compression: Optional[str]
        Compression type of the S3 object.
        Valid values: None, "gzip", or "bzip2". gzip and bzip2 are only valid for CSV and JSON objects.
    use_threads : Union[bool, int]
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() is used as the max number of threads.
        If integer is provided, specified number is used.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session is used if none is provided.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forwarded to botocore requests.
        Valid values: "SSECustomerAlgorithm", "SSECustomerKey", "ExpectedBucketOwner".
        e.g. s3_additional_kwargs={'SSECustomerAlgorithm': 'md5'}

    Returns
    -------
    pandas.DataFrame
        Pandas DataFrame with results from query.

    Examples
    --------
    Reading a gzip compressed JSON document

    >>> import awswrangler as wr
    >>> df = wr.s3.select_query(
    ...     sql='SELECT * FROM s3object[*][*]',
    ...     path='s3://bucket/key.json.gzip',
    ...     input_serialization='JSON',
    ...     input_serialization_params={
    ...         'Type': 'Document',
    ...     },
    ...     compression="gzip",
    ... )

    Reading an entire CSV object using threads

    >>> import awswrangler as wr
    >>> df = wr.s3.select_query(
    ...     sql='SELECT * FROM s3object',
    ...     path='s3://bucket/key.csv',
    ...     input_serialization='CSV',
    ...     input_serialization_params={
    ...         'FileHeaderInfo': 'Use',
    ...         'RecordDelimiter': '\r\n'
    ...     },
    ...     use_threads=True,
    ... )

    Reading a single column from Parquet object with pushdown filter

    >>> import awswrangler as wr
    >>> df = wr.s3.select_query(
    ...     sql='SELECT s.\"id\" FROM s3object s where s.\"id\" = 1.0',
    ...     path='s3://bucket/key.snappy.parquet',
    ...     input_serialization='Parquet',
    ... )
    """
    if path.endswith("/"):
        raise exceptions.InvalidArgumentValue("<path> argument should be an S3 key, not a prefix.")
    if input_serialization not in ["CSV", "JSON", "Parquet"]:
        raise exceptions.InvalidArgumentValue("<input_serialization> argument must be 'CSV', 'JSON' or 'Parquet'")
    if compression not in [None, "gzip", "bzip2"]:
        raise exceptions.InvalidCompression(f"Invalid {compression} compression, please use None, 'gzip' or 'bzip2'.")
    if compression and (input_serialization not in ["CSV", "JSON"]):
        raise exceptions.InvalidArgumentCombination(
            "'gzip' or 'bzip2' are only valid for input 'CSV' or 'JSON' objects."
        )
    bucket, key = _utils.parse_path(path)

    args: Dict[str, Any] = {
        "Bucket": bucket,
        "Key": key,
        "Expression": sql,
        "ExpressionType": "SQL",
        "RequestProgress": {"Enabled": False},
        "InputSerialization": {
            input_serialization: input_serialization_params,
            "CompressionType": compression.upper() if compression else "NONE",
        },
        "OutputSerialization": {
            "JSON": {},
        },
    }
    if s3_additional_kwargs:
        args.update(s3_additional_kwargs)
    _logger.debug("args:\n%s", pprint.pformat(args))

    if any(
        [
            compression,
            input_serialization_params.get("AllowQuotedRecordDelimiter"),
            input_serialization_params.get("Type") == "Document",
        ]
    ):  # Scan range is only supported for uncompressed CSV/JSON, CSV (without quoted delimiters)
        # and JSON objects (in LINES mode only)
        _logger.debug("Scan ranges are not supported given provided input.")
        return pd.DataFrame(_select_object_content(args=args, boto3_session=boto3_session))
    return _paginate_stream(args=args, path=path, use_threads=use_threads, boto3_session=boto3_session)
Example 21
def index_json(
    client: OpenSearch,
    path: str,
    index: str,
    doc_type: Optional[str] = None,
    boto3_session: Optional[boto3.Session] = boto3.Session(),
    json_path: Optional[str] = None,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Index all documents from JSON file to OpenSearch index.

    The JSON file should be in JSON Lines text format (newline-delimited JSON) - https://jsonlines.org/ -
    or, if it is a single large JSON document, please provide `json_path`.

    Parameters
    ----------
    client : OpenSearch
        instance of opensearchpy.OpenSearch to use.
    path : str
        s3 or local path to the JSON file which contains the documents.
    index : str
        Name of the index.
    doc_type : str, optional
        Name of the document type (for Elasticsearch versions 5.x and earlier).
    json_path : str, optional
        JsonPath expression to specify explicit path to a single name element
        in a JSON hierarchical data structure.
        Read more about `JsonPath <https://jsonpath.com>`_
    boto3_session : boto3.Session(), optional
        Boto3 Session to be used to access S3 if an S3 path is provided.
        The default boto3 session will be used if boto3_session receives None.
    **kwargs :
        KEYWORD arguments forwarded to :func:`~awswrangler.opensearch.index_documents`
        which is used to execute the operation

    Returns
    -------
    Dict[str, Any]
        Response payload
        https://opensearch.org/docs/opensearch/rest-api/document-apis/bulk/#response.

    Examples
    --------
    Writing contents of JSON file

    >>> import awswrangler as wr
    >>> client = wr.opensearch.connect(host='DOMAIN-ENDPOINT')
    >>> wr.opensearch.index_json(
    ...     client=client,
    ...     path='docs.json',
    ...     index='sample-index1'
    ... )
    """
    _logger.debug("indexing %s from %s", index, path)

    if boto3_session is None:
        raise ValueError("boto3_session cannot be None")

    if path.startswith("s3://"):
        bucket, key = parse_path(path)
        s3 = boto3_session.client("s3")
        obj = s3.get_object(Bucket=bucket, Key=key)
        body = obj["Body"].read()
        lines = body.splitlines()
        documents = [json.loads(line) for line in lines]
        if json_path:
            documents = _get_documents_w_json_path(documents, json_path)
    else:  # local path
        documents = list(_file_line_generator(path, is_json=True))
        if json_path:
            documents = _get_documents_w_json_path(documents, json_path)
    return index_documents(client=client,
                           documents=documents,
                           index=index,
                           doc_type=doc_type,
                           **kwargs)