Example #1
    def execute(self, context: 'Context') -> List[str]:
        # list all files in the Google Cloud Storage bucket
        hook = GCSHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            impersonation_chain=self.google_impersonation_chain,
        )

        self.log.info(
            'Getting list of the files. Bucket: %s; Delimiter: %s; Prefix: %s',
            self.bucket,
            self.delimiter,
            self.prefix,
        )

        files = hook.list(bucket_name=self.bucket,
                          prefix=self.prefix,
                          delimiter=self.delimiter)

        s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id,
                         verify=self.dest_verify,
                         extra_args=self.dest_s3_extra_args)

        if not self.replace:
            # if we are not replacing, list all files in the S3 bucket
            # and keep only those files which are present in
            # Google Cloud Storage but not in S3
            bucket_name, prefix = S3Hook.parse_s3_url(self.dest_s3_key)
            # restrict the listing to the bucket and prefix to avoid
            # looking into parent directories/keys
            existing_files = s3_hook.list_keys(bucket_name, prefix=prefix)
            # in case no files exist, fall back to an empty list to avoid errors
            existing_files = existing_files if existing_files is not None else []
            # strip the prefix from the existing files to allow the match
            existing_files = [
                file.replace(prefix, '', 1) for file in existing_files
            ]
            files = list(set(files) - set(existing_files))

        if files:
            for file in files:
                with hook.provide_file(
                        object_name=file,
                        bucket_name=self.bucket) as local_tmp_file:
                    dest_key = self.dest_s3_key + file
                    self.log.info("Saving file to %s", dest_key)

                    s3_hook.load_file(
                        filename=local_tmp_file.name,
                        key=dest_key,
                        replace=self.replace,
                        acl_policy=self.s3_acl_policy,
                    )

            self.log.info("All done, uploaded %d files to S3", len(files))
        else:
            self.log.info("In sync, no files needed to be uploaded to S3")

        return files
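For context, this execute method comes from Airflow's GCS-to-S3 transfer operator. A minimal usage sketch, assuming the operator is GCSToS3Operator from airflow.providers.amazon.aws.transfers.gcs_to_s3 and using placeholder connection IDs and bucket names:

from datetime import datetime

from airflow import DAG
from airflow.providers.amazon.aws.transfers.gcs_to_s3 import GCSToS3Operator

with DAG(dag_id="gcs_to_s3_sync", start_date=datetime(2021, 1, 1), schedule_interval=None) as dag:
    # bucket and key names below are placeholders, not values from the example above
    sync_gcs_to_s3 = GCSToS3Operator(
        task_id="sync_gcs_to_s3",
        bucket="my-gcs-bucket",                 # source GCS bucket
        prefix="data/",                         # only copy objects under this prefix
        dest_aws_conn_id="aws_default",
        dest_s3_key="s3://my-s3-bucket/data/",  # split by S3Hook.parse_s3_url
        replace=False,                          # triggers the diff against existing S3 keys
    )

Note that dest_s3_key is a full s3:// URL: with replace=False, the operator splits it with parse_s3_url and uploads only the GCS objects whose keys are not already present in S3, as the method above shows.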
Example #2
 def _resolve_bucket_and_key(self, key):
     """If key is URI, parse bucket"""
     if self.bucket_name is None:
         return S3Hook.parse_s3_url(key)
     else:
         parsed_url = urlparse(key)
         if parsed_url.scheme != '' or parsed_url.netloc != '':
             raise AirflowException('If bucket_name provided, bucket_key must be relative path, not URI.')
         return self.bucket_name, key
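The branching above is easy to exercise without Airflow. A self-contained sketch of the same resolution logic (the standalone function name and example paths are ours, for illustration only):

from urllib.parse import urlparse

def resolve_bucket_and_key(bucket_name, key):
    # mirrors the method above: a full URI is split into (bucket, key),
    # while an explicit bucket_name requires a relative key
    if bucket_name is None:
        parsed = urlparse(key)  # e.g. s3://my-bucket/path/file.txt
        return parsed.netloc, parsed.path.lstrip('/')
    if urlparse(key).scheme or urlparse(key).netloc:
        raise ValueError('If bucket_name provided, bucket_key must be relative path, not URI.')
    return bucket_name, key

print(resolve_bucket_and_key(None, 's3://my-bucket/path/file.txt'))  # ('my-bucket', 'path/file.txt')
print(resolve_bucket_and_key('my-bucket', 'path/file.txt'))          # ('my-bucket', 'path/file.txt')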
Example #3
    def execute(self, context):
        # use the parent operator to list all files in the Google Cloud Storage bucket
        files = super().execute(context)
        s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id,
                         verify=self.dest_verify)

        if not self.replace:
            # if we are not replacing, list all files in the S3 bucket
            # and keep only those files which are present in
            # Google Cloud Storage but not in S3
            bucket_name, prefix = S3Hook.parse_s3_url(self.dest_s3_key)
            # restrict the listing to the bucket and prefix to avoid
            # looking into parent directories/keys
            existing_files = s3_hook.list_keys(bucket_name, prefix=prefix)
            # in case no files exist, fall back to an empty list to avoid errors
            existing_files = existing_files if existing_files is not None else []
            # strip the prefix from the existing files to allow the match
            existing_files = [
                file.replace(prefix, '', 1) for file in existing_files
            ]
            files = list(set(files) - set(existing_files))

        if files:
            hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.gcp_conn_id,
                delegate_to=self.delegate_to)

            for file in files:
                file_bytes = hook.download(self.bucket, file)

                dest_key = self.dest_s3_key + file
                self.log.info("Saving file to %s", dest_key)

                s3_hook.load_bytes(file_bytes,
                                   key=dest_key,
                                   replace=self.replace)

            self.log.info("All done, uploaded %d files to S3", len(files))
        else:
            self.log.info("In sync, no files needed to be uploaded to S3")

        return files
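The replace=False branch shared by Examples 1 and 3 reduces to a set difference after stripping the destination prefix. A self-contained illustration of that diff, with made-up bucket contents:

# relative object names from GCS vs. S3 keys that still carry the prefix
gcs_files = {'a.csv', 'b.csv', 'c.csv'}
prefix = 'data/'
s3_keys = ['data/a.csv', 'data/c.csv']

# strip the prefix so the S3 keys are comparable with the GCS names,
# exactly as both operators do with file.replace(prefix, '', 1)
existing = {key.replace(prefix, '', 1) for key in s3_keys}
to_upload = sorted(gcs_files - existing)
print(to_upload)  # ['b.csv'], the only file missing from S3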
Example #4
    def check_s3_url(self, s3url: str) -> bool:
        """
        Check if an S3 URL exists

        :param s3url: S3 url
        :rtype: bool
        """
        bucket, key = S3Hook.parse_s3_url(s3url)
        if not self.s3_hook.check_for_bucket(bucket_name=bucket):
            raise AirflowException(
                f"The input S3 Bucket {bucket} does not exist")
        if (key and not self.s3_hook.check_for_key(key=key, bucket_name=bucket)
                and not self.s3_hook.check_for_prefix(
                    prefix=key, bucket_name=bucket, delimiter='/')):
            # check if s3 key exists in the case user provides a single file
            # or if s3 prefix exists in the case user provides multiple files in
            # a prefix
            raise AirflowException(
                f"The input S3 Key or Prefix {s3url} does not exist in the Bucket {bucket}"
            )
        return True
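For reference, parse_s3_url splits a full s3:// URL into a (bucket, key) tuple, which is what the check above relies on. A rough sketch of that behaviour (illustration only, not Airflow's actual implementation, which also validates the URL format):

from urllib.parse import urlparse

def parse_s3_url_sketch(s3url):
    # illustration only: S3Hook.parse_s3_url additionally raises
    # AirflowException when no bucket name can be extracted
    parsed = urlparse(s3url)
    return parsed.netloc, parsed.path.lstrip('/')

print(parse_s3_url_sketch('s3://my-bucket/models/output/'))  # ('my-bucket', 'models/output/')
print(parse_s3_url_sketch('s3://my-bucket'))                 # ('my-bucket', '')

An empty key, as in the second call, makes check_s3_url skip the key/prefix existence check and validate only the bucket.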
Example #5
 def test_parse_s3_url(self):
     parsed = S3Hook.parse_s3_url(self.s3_test_url)
     self.assertEqual(parsed,
                      ("test", "this/is/not/a-real-key.txt"),
                      "Incorrect parsing of the s3 url")
Example #6
 def test_parse_s3_url(self):
     parsed = S3Hook.parse_s3_url("s3://test/this/is/not/a-real-key.txt")
     assert parsed == (
         "test",
         "this/is/not/a-real-key.txt"), "Incorrect parsing of the s3 url"
Example #7
import pytest

from airflow.providers.amazon.aws.example_dags.example_google_api_to_s3_transfer_advanced import (
    S3_DESTINATION_KEY as ADVANCED_S3_DESTINATION_KEY,
)
from airflow.providers.amazon.aws.example_dags.example_google_api_to_s3_transfer_basic import (
    S3_DESTINATION_KEY as BASIC_S3_DESTINATION_KEY,
)
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from tests.providers.google.cloud.utils.gcp_authenticator import GMP_KEY
from tests.test_utils.amazon_system_helpers import (
    AWS_DAG_FOLDER,
    AmazonSystemTest,
    provide_aws_context,
    provide_aws_s3_bucket,
)
from tests.test_utils.gcp_system_helpers import GoogleSystemTest, provide_gcp_context

BASIC_BUCKET, _ = S3Hook.parse_s3_url(BASIC_S3_DESTINATION_KEY)
ADVANCED_BUCKET, _ = S3Hook.parse_s3_url(ADVANCED_S3_DESTINATION_KEY)


@pytest.fixture
def provide_s3_bucket_basic():
    with provide_aws_s3_bucket(BASIC_BUCKET):
        yield


@pytest.fixture
def provide_s3_bucket_advanced():
    with provide_aws_s3_bucket(ADVANCED_BUCKET):
        yield
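These fixtures are typically consumed via pytest.mark.usefixtures so the bucket exists for the duration of a system test. A sketch of such a test, reusing the helpers imported above; the class name and test body are illustrative, and run_dag is assumed to be the helper those system-test base classes provide:

@pytest.mark.usefixtures("provide_s3_bucket_basic")
class TestGoogleApiToS3TransferBasic(GoogleSystemTest, AmazonSystemTest):
    @provide_gcp_context(GMP_KEY)
    @provide_aws_context()
    def test_run_example_dag_basic(self):
        # the fixture above guarantees BASIC_BUCKET exists while the DAG runs
        self.run_dag("example_google_api_to_s3_transfer_basic", AWS_DAG_FOLDER)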