def test_is_s3_path(self):
        s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util(
            ["app", "log"], ["landing", "lake"])

        # case 1: point to application bucket
        assert s3_util.is_s3_path("s3a://app/s3/path/o.obj") is True
        self._assert_calls_formatted(
            FakeS3Util.calls[0],
            Invocation(AWSCredentials("", ""), "is_s3_path",
                       ("s3a://app/s3/path/o.obj", ), {}))

        # case 2: point to data bucket
        assert s3_util.is_s3_path(obj="/landing/s3/path/o.obj") is False
        self._assert_calls_formatted(
            FakeS3Util.calls[1],
            Invocation(AWSCredentials("", ""), "is_s3_path", (),
                       {"obj": "/landing/s3/path/o.obj"}))

        # case 3: point to un-managed bucket
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="is_s3_path", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.is_s3_path("s3://unknown/s3/path/o.obj")
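# Note: the assertions above compare recorded calls against Invocation records.
# The real helper is not shown in this listing; a minimal sketch of its assumed
# shape, sufficient to make the equality checks above meaningful, could be:
class Invocation(object):
    def __init__(self, credentials, method_name, args, kwargs):
        self.credentials = credentials
        self.method_name = method_name
        self.args = args
        self.kwargs = kwargs

    def __eq__(self, other):
        # Two recorded calls are equal when all four fields match.
        return (self.credentials, self.method_name, self.args, self.kwargs) == \
               (other.credentials, other.method_name, other.args, other.kwargs)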
Example #2
    def __init__(self, hql_validation_function, hql_validation_error=None):
        self.hql_validation_function = hql_validation_function
        self.hql_validation_error = hql_validation_error
        self.bucket_landing = TestS3Table.LANDING_SPEC.bucket
        self.bucket_lake = TestS3Table.LAKE_SPEC.bucket
        self.statements = []

        self.s3_util = AWSS3CredentialsWrapper(
            [], [self.bucket_landing, self.bucket_lake],
            AWSCredentials("aws_access_key_api", "aws_secret_key_api"),
            AWSCredentials("aws_access_key_s3_put", "aws_secret_key_s3_put"),
            AWSCredentials("aws_access_key_s3_del", "aws_secret_key_s3_del"))
Example #3
    def test_move_object(self):
        test_src_bucket_name = "test_src_bucket"
        test_destination_bucket_name = "test_destination_bucket"
        test_src_key = "test_src_key"
        test_destination_key = "test_destination_key"
        test_content = "aaa1"

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=test_src_bucket_name)
        s3_resource.create_bucket(Bucket=test_destination_bucket_name)
        s3_resource.Bucket(test_src_bucket_name).put_object(Key=test_src_key,
                                                            Body=test_content)

        s3_util = S3Util(AWSCredentials("", ""))
        s3_util.move_object(
            ("s3://" + test_src_bucket_name + "/" + test_src_key),
            ("s3://" + test_destination_bucket_name + "/" +
             test_destination_key))

        destination_objects = list(
            s3_resource.Bucket(test_destination_bucket_name).objects.all())
        assert len(destination_objects) == 1
        assert destination_objects[0].key == test_destination_key

        src_objects = list(
            s3_resource.Bucket(test_src_bucket_name).objects.all())
        assert len(src_objects) == 0
Example #4
    def is_s3_path(*args, **kwargs):
        """
        This single function is used to test the invocation of static methods.
        """
        FakeS3Util.calls.append(
            Invocation(AWSCredentials("", ""), "is_s3_path", args, kwargs))
        return S3Util.is_s3_path(*args, **kwargs)
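# For context: FakeS3Util is a recording stub. A minimal sketch consistent with
# the static method above and with the tests in this listing (assumed, not the
# original implementation) could be:
class FakeS3Util(object):
    calls = []  # class-level list, shared so tests can inspect every invocation

    def __init__(self, credentials):
        self.credentials = credentials

    def __getattr__(self, name):
        # Record the call together with this instance's credentials instead of
        # touching S3.
        def _record(*args, **kwargs):
            FakeS3Util.calls.append(
                Invocation(self.credentials, name, args, kwargs))
        return _record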
Example #5
    def test_wait_for_file_availability(self):
        bucket = "cur_bucket"
        key = "stdout.txt"
        data = "no output"

        s3_full_path = "s3://{}/{}".format(bucket, key)

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=bucket)

        def create_file():
            s3_resource.Bucket(bucket).put_object(Key=key, Body=data)

        s3_util = S3Util(AWSCredentials("", ""))

        polling_interval = 0.02
        timeout = 0.5

        with ConcurrentExecutor(create_file, 0.2):
            s3_util.wait_for_file_availability(s3_full_path, polling_interval,
                                               timeout)

        s3_util.delete_object(s3_full_path)

        err_msg = "File {} failed to be available after {} seconds.".format(
            s3_full_path, timeout)

        with pytest.raises(M3DAWSAPIException, match=err_msg):
            s3_util.wait_for_file_availability(s3_full_path, polling_interval,
                                               timeout)
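# The behaviour exercised above is a simple poll-until-deadline loop. A sketch
# of such a loop (assumed shape; the real S3Util implementation is not part of
# this listing) could be:
import time

def wait_for_file_availability(s3_util, s3_path, polling_interval_seconds,
                               timeout_seconds):
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        # object_exists is the check the wrapper tests below also exercise.
        if s3_util.object_exists(s3_path):
            return
        time.sleep(polling_interval_seconds)
    # M3DAWSAPIException is the project's own exception type.
    raise M3DAWSAPIException(
        "File {} failed to be available after {} seconds.".format(
            s3_path, timeout_seconds))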
Example #6
    def test_delete_object(self):
        test_bucket_name = "test_bucket"
        test_key = "test_dir/test_key"

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=test_bucket_name)
        s3_resource.Bucket(test_bucket_name).put_object(Key=test_key, Body="")

        s3_util = S3Util(AWSCredentials("", ""))
        s3_util.delete_object("s3://" + test_bucket_name + "/" + test_key)

        remaining_objects = list(
            s3_resource.Bucket(test_bucket_name).objects.all())
        assert len(remaining_objects) == 0
Example #7
    def test_upload_object(self):
        test_bucket_name = "test_bucket"
        test_key = "test_s3_util/tconx-bdp-emr_test-dev-bi_test101.json"
        file_name = "test/resources/test_s3_util/tconx-bdp-emr_test-dev-bi_test101.json"

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=test_bucket_name)

        s3_util = S3Util(AWSCredentials("", ""))
        s3_util.upload_object(file_name,
                              "s3://" + test_bucket_name + "/" + test_key)

        s3_objects = list(s3_resource.Bucket(test_bucket_name).objects.all())
        assert len(s3_objects) == 1
        assert s3_objects[0].key == test_key
Example #8
    def create_s3_resource(aws_credentials=None):
        """
        Initialize and return boto3 resource for S3.
        :param aws_credentials: AWS credentials. Empty values will be used if it is None.
        :return: initialized boto3 resource object for S3
        """
        if not aws_credentials:
            aws_credentials = AWSCredentials("", "")

        s3_resource = boto3.resource(
            "s3",
            aws_access_key_id=aws_credentials.access_key_id,
            aws_secret_access_key=aws_credentials.secret_access_key)

        return s3_resource
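    # Usage sketch: the tests in this listing call this without credentials, so
    # boto3 receives empty keys, which is what the moto-mocked tests rely on.
    # With explicit credentials (hypothetical values):
    #
    #   s3 = Boto3Util.create_s3_resource(AWSCredentials("my_key", "my_secret"))
    #   s3.create_bucket(Bucket="my-bucket")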
Example #9
    def create_emr_client(aws_region, aws_credentials=None):
        """
        Initialize and return boto3 client for EMR.
        :param aws_region: AWS region
        :param aws_credentials: AWS credentials. Empty values will be used if it is None.
        :return: initialized boto3 client object for EMR
        """
        if not aws_credentials:
            aws_credentials = AWSCredentials("", "")

        emr_client = boto3.client(
            'emr',
            region_name=aws_region,
            aws_access_key_id=aws_credentials.access_key_id,
            aws_secret_access_key=aws_credentials.secret_access_key)

        return emr_client
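    # Usage sketch (hypothetical region; empty credentials as in the tests):
    #
    #   emr = Boto3Util.create_emr_client("eu-west-1")
    #   clusters = emr.list_clusters()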
Example #10
    def create_output_file(self, step, dummy_text):
        logging.info("step={{id: {}, state: {}}}".format(step.id, step.state))
        step_id = step.id
        s3_log_file_location = "s3://{}/log/{}/steps/{}/stdout.gz" \
            .format(self.default_log_bucket, self.emr_cluster_id, step_id)

        local_log_file_location = self.local_run_dir.join("stdout.gz")

        logging.info(
            "local_log_file_location={}".format(local_log_file_location))
        logging.info("s3_log_file_location={}".format(
            str(s3_log_file_location)))

        with gzip.open(str(local_log_file_location), 'wb') as f:
            f.write(dummy_text.encode("utf-8"))

        s3_util = S3Util(AWSCredentials("", ""))
        s3_util.upload_object(str(local_log_file_location),
                              str(s3_log_file_location))
Example #11
    def test_delete_objects(self):
        test_bucket_name = "test_bucket"
        test_prefix = "test_dir"
        test_keys = [
            "test_key1", "{}/test_key2".format(test_prefix),
            "{}/test_key3".format(test_prefix),
            "{}/test_key4".format(test_prefix)
        ]

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=test_bucket_name)
        for key in test_keys:
            s3_resource.Bucket(test_bucket_name).put_object(Key=key, Body="")

        s3_util = S3Util(AWSCredentials("", ""))
        s3_util.delete_objects("s3://" + test_bucket_name + "/" + test_prefix)

        remaining_objects = list(
            s3_resource.Bucket(test_bucket_name).objects.all())
        assert len(remaining_objects) == 1
        assert remaining_objects[0].key == test_keys[0]
Example #12
    def test_move_objects(self):
        test_src_bucket_name = "test_src_bucket"
        test_destination_bucket_name = "test_destination_bucket"
        test_src_prefix = "test_src_dir"
        test_destination_prefix = "test_destination_dir"
        test_src_keys = [
            "test_key1", "{}/test_key2".format(test_src_prefix),
            "{}/test_key3".format(test_src_prefix),
            "{}/test_key4".format(test_src_prefix)
        ]
        test_destination_keys = [
            "{}/test_key2".format(test_destination_prefix),
            "{}/test_key3".format(test_destination_prefix),
            "{}/test_key4".format(test_destination_prefix)
        ]

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=test_src_bucket_name)
        s3_resource.create_bucket(Bucket=test_destination_bucket_name)

        for key in test_src_keys:
            s3_resource.Bucket(test_src_bucket_name).put_object(Key=key,
                                                                Body="")

        s3_util = S3Util(AWSCredentials("", ""))
        s3_util.move_objects(
            ("s3://" + test_src_bucket_name + "/" + test_src_prefix),
            ("s3://" + test_destination_bucket_name + "/" +
             test_destination_prefix))

        src_objects = list(
            s3_resource.Bucket(test_src_bucket_name).objects.all())
        assert len(src_objects) == 1
        assert src_objects[0].key == test_src_keys[0]

        destination_objects = s3_resource.Bucket(
            test_destination_bucket_name).objects.all()
        assert sorted(map(lambda x: x.key,
                          destination_objects)) == test_destination_keys
Example #13
    def test_list_objects_in_bucket(self):
        test_bucket_name = "test_bucket"
        test_prefix = "test_dir"
        test_keys = [
            "test_key1", "{}/test_key2".format(test_prefix),
            "{}/test_key3".format(test_prefix),
            "{}/test_key4".format(test_prefix)
        ]
        test_resources = [
            "s3://{}/".format(test_bucket_name) + key for key in test_keys
        ]

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket=test_bucket_name)
        for key in test_keys:
            s3_resource.Bucket(test_bucket_name).put_object(Key=key, Body="")

        s3_util = S3Util(AWSCredentials("", ""))
        keys = s3_util.list_objects("s3://" + test_bucket_name + "/" +
                                    test_prefix)

        assert keys == test_resources[1:4]
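        # Note: list_objects returns full "s3://bucket/key" paths (hence the
        # comparison against test_resources), and only the keys under the
        # given prefix, so "test_key1" is excluded.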
Example #14
    def test_parses_basic_attributes_from_system_config_file(self, _):
        """
        Test case checks that all relevant key-values are extracted from sconx file and assigned to correct member
        variables of EMRSystem object.
        """
        aws_api_credentials = AWSCredentials("fake_aws_api_access_key",
                                             "fake_aws_api_secret_key")
        aws_api_credentials_file = self.local_run_dir.join(
            "aws-credentials-emr-api.json")
        self.dump_aws_credentials(aws_api_credentials,
                                  str(aws_api_credentials_file))

        aws_s3_put_credentials = AWSCredentials("fake_aws_s3_put_access_key",
                                                "fake_aws_s3_put_secret_key")
        aws_s3_put_credentials_file = self.local_run_dir.join(
            "aws-credentials-emr-s3_put.json")
        self.dump_aws_credentials(aws_s3_put_credentials,
                                  str(aws_s3_put_credentials_file))

        aws_s3_del_credentials = AWSCredentials("fake_aws_s3_del_access_key",
                                                "fake_aws_s3_del_secret_key")
        aws_s3_del_credentials_file = self.local_run_dir.join(
            "aws-credentials-emr-s3_del.json")
        self.dump_aws_credentials(aws_s3_del_credentials,
                                  str(aws_s3_del_credentials_file))

        test_scon_json = TestEMRSystem.test_scon_json_template.format(
            aws_api_credentials=str(aws_api_credentials_file),
            aws_s3_put_credentials=str(aws_s3_put_credentials_file),
            aws_s3_del_credentials=str(aws_s3_del_credentials_file))

        s3_scon_file = self.local_run_dir.join("scon-emr-emr-test.json")
        s3_scon_file.write(test_scon_json)

        MockConfigService.scon_path = str(s3_scon_file)

        emr_system = EMRSystem(*self.test_emr_system_arguments)

        expected_system_params = {
            "bucket_landing": "m3d-da-bdp-test-landing",
            "bucket_lake": "m3d-da-bdp-test-lake",
            "bucket_mart_cal": "m3d-da-bdp-test-mart-cal",
            "bucket_log": "io.3stripes.factory.test.ireland.infrastructure-logs",
            "default_ebs_size": "128",
            "default_emr_version": "emr-5.17.0",
            "aws_api_credentials": aws_api_credentials,
            "aws_s3_put_credentials": aws_s3_put_credentials,
            "aws_s3_del_credentials": aws_s3_del_credentials,
            "api_action_timeout_seconds": 120,
            "api_action_polling_interval_seconds": 3,
            "api_long_timeout_seconds": 300,
            "aws_region": "eu-west-1",
            "packages_to_deploy": ["hadoop"],
            "configs_to_deploy": ["test_config_1", "test_config_2"],
            "subdir_archive": "test_archive/",
            "subdir_header": "test_header/",
            "subdir_config": "test_config/",
            "subdir_data": "test_data/",
            "subdir_delta_table": "delta_table/",
            "subdir_data_backup": "data_backup/",
            "subdir_error": "test_error/",
            "subdir_work": "test_work/",
            "subdir_log": "test_log/",
            "subdir_apps": "test_apps/",
            "subdir_m3d_engine": "test_m3d_engine/",
            "subdir_loading": "test_loading/",
            "subdir_full_load": "test_full_load/",
            "subdir_delta_load": "test_delta_load/",
            "subdir_delta_lake_load": "test_delta_lake_load/",
            "subdir_append_load": "test_append_load/",
            "subdir_black_whole": "test_black_whole/",
            "subdir_credentials": "test_credentials/",
            "subdir_keytab": "test_keytab/",
            "subdir_tmp": "test_tmp/",
            "subdir_code": "m3d",
            "subdir_metadata": "metadata",
            "spark_jar_name": "test_jar.jar",
            "dir_apps":
                "s3://m3d-da-landing-application/m3d-test/test_environment/test_apps/",
            "dir_apps_algorithm":
                "s3://m3d-da-landing-application/m3d-test/test_environment/test_apps/test_m3d_engine/",
            "dir_apps_loading":
                "s3://m3d-da-landing-application/m3d-test/test_environment/test_apps/test_loading/",
            "dir_tmp_s3":
                "s3://m3d-da-landing-application/m3d-test/test_environment/test_tmp/",
            "dir_tmp_local": "/test_tmp/",
            "spark_jar_path":
                "s3://m3d-da-landing-application/m3d-test/test_environment/m3d/"
                "test_subdir_projects_m3d_api/test_jar.jar",
            "dir_m3d_api_deployment":
                "s3://m3d-da-landing-application/m3d-test/test_environment/m3d/test_subdir_projects_m3d_api",
            "dir_metadata_deployment":
                "s3://m3d-da-landing-application/m3d-test/test_environment/metadata/test_subdir_projects_m3d_api"
        }

        for param in expected_system_params.keys():
            assert getattr(emr_system, param) == expected_system_params[param]
Example #15
class TestEMRClusterClient(UnitTestBase):
    emr_cluster_name = "test_cluster"
    aws_region = "us-east-1"
    aws_credentials = AWSCredentials("test_access_key", "test_secret_key")
    timeout_seconds = 0.5
    retry_seconds = 0.1
    long_timeout_seconds = 3.0

    @staticmethod
    def env_setup(emr_cluster_name, aws_region, aws_credentials,
                  timeout_seconds, retry_seconds, long_timeout_seconds):
        run_job_flow_args = dict(
            Instances={
                'InstanceCount': 3,
                'KeepJobFlowAliveWhenNoSteps': True,
                'MasterInstanceType': 'c3.medium',
                'Placement': {'AvailabilityZone': 'test_zone'},
                'SlaveInstanceType': 'c3.xlarge',
            },
            JobFlowRole='EMR_EC2_DefaultRole',
            LogUri='s3://mybucket/log/',
            Name=emr_cluster_name,
            ServiceRole='EMR_DefaultRole',
            VisibleToAllUsers=True)

        emr_client = Boto3Util.create_emr_client(aws_region)
        emr_cluster_id = emr_client.run_job_flow(
            **run_job_flow_args)['JobFlowId']

        emr_cluster_client = EMRClusterClient(
            emr_cluster_id, aws_region, aws_credentials,
            timeout_seconds, retry_seconds, long_timeout_seconds)

        return emr_cluster_client, emr_cluster_id

    @staticmethod
    def _compress_string(s):
        out = io.BytesIO()
        with gzip.GzipFile(fileobj=out, mode="w") as gzip_s:
            gzip_s.write(s.encode())

        compressed_str = out.getvalue()
        return compressed_str
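
    # Round-trip sketch: bytes produced by _compress_string decompress back to
    # the original text, mirroring what read_gzip_file_content returns later:
    #
    #   assert gzip.decompress(TestEMRClusterClient._compress_string("abc")) == b"abc"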

    @pytest.mark.emr
    @moto.mock_emr
    def test_get_cluster_state(self):
        emr_cluster_client, _ = self.env_setup(self.emr_cluster_name,
                                               self.aws_region,
                                               self.aws_credentials,
                                               self.timeout_seconds,
                                               self.retry_seconds,
                                               self.long_timeout_seconds)

        cluster_state = emr_cluster_client.get_cluster_state()

        assert cluster_state == "WAITING"

    @pytest.mark.emr
    @moto.mock_emr
    def test_wait_for_cluster_startup(self):
        emr_cluster_client, _ = self.env_setup(self.emr_cluster_name,
                                               self.aws_region,
                                               self.aws_credentials,
                                               self.timeout_seconds,
                                               self.retry_seconds,
                                               self.long_timeout_seconds)

        cluster_state = emr_cluster_client.wait_for_cluster_startup()

        assert cluster_state == "WAITING"

    @pytest.mark.emr
    @moto.mock_emr
    def test_wait_for_bootstrapping_cluster(self):
        mock_emr_obj = moto.mock_emr()
        with mock_emr_obj:
            emr_cluster_client, emr_cluster_id = self.env_setup(
                self.emr_cluster_name, self.aws_region, self.aws_credentials,
                self.timeout_seconds, self.retry_seconds,
                self.long_timeout_seconds)

            # Change cluster state to BOOTSTRAPPING so that the wait times out
            emr_backend = mock_emr_obj.backends[self.aws_region]
            fake_cluster = emr_backend.clusters[emr_cluster_id]
            fake_cluster.state = "BOOTSTRAPPING"

            err_msg = "Cluster {} failed to start after {} seconds.".format(
                emr_cluster_id, self.timeout_seconds)

            with pytest.raises(M3DAWSAPIException, match=err_msg):
                emr_cluster_client.wait_for_cluster_startup()

    @pytest.mark.emr
    @moto.mock_emr
    def test_add_step(self):
        # Expected response is of the format s-XXXXXXXXXXXXX
        emr_cluster_client, _ = self.env_setup(self.emr_cluster_name,
                                               self.aws_region,
                                               self.aws_credentials,
                                               self.timeout_seconds,
                                               self.retry_seconds,
                                               self.long_timeout_seconds)

        step_name = "Test_Step"
        command_str = "/usr/bin/spark-submit --class spark.job.main.class"

        emr_step_id = emr_cluster_client.add_step(step_name, command_str)

        assert str(emr_step_id).startswith("s-")
        assert len(emr_step_id) == 15

    @pytest.mark.emr
    @moto.mock_emr
    def test_get_step_status(self):
        emr_cluster_client, _ = self.env_setup(self.emr_cluster_name,
                                               self.aws_region,
                                               self.aws_credentials,
                                               self.timeout_seconds,
                                               self.retry_seconds,
                                               self.long_timeout_seconds)

        step_name = "Test_Step"
        command_str = "/usr/bin/spark-submit --class spark.job.main.class"

        emr_step_id = emr_cluster_client.add_step(step_name, command_str)
        emr_step_status, emr_step_failure_details = emr_cluster_client.get_step_status(
            emr_step_id)

        assert str(emr_step_id).startswith("s-")
        assert emr_step_status == "STARTING"
        assert emr_step_failure_details is None

    @pytest.mark.emr
    @moto.mock_emr
    def test_wait_for_step_completion_without_state_change(self):
        with pytest.raises(M3DAWSAPIException):
            # We expect an exception here because the step will remain in the STARTING state
            emr_cluster_client, _ = self.env_setup(self.emr_cluster_name,
                                                   self.aws_region,
                                                   self.aws_credentials,
                                                   self.timeout_seconds,
                                                   self.retry_seconds,
                                                   self.long_timeout_seconds)

            step_name = "Test_Step"
            command_str = "/usr/bin/spark-submit --class spark.job.main.class"

            emr_step_id = emr_cluster_client.add_step(step_name, command_str)

            emr_cluster_client.wait_for_step_completion(
                emr_step_id, self.long_timeout_seconds)

    @pytest.mark.emr
    def test_add_step_to_cluster_with_state_change(self):
        mock_emr_obj = moto.mock_emr()
        with mock_emr_obj:
            emr_cluster_client, emr_cluster_id = self.env_setup(
                self.emr_cluster_name, self.aws_region, self.aws_credentials,
                self.timeout_seconds, self.retry_seconds,
                self.long_timeout_seconds)

            step_name = "Test_Step"
            command_str = "/usr/bin/spark-submit --class spark.job.main.class"

            emr_step_id = emr_cluster_client.add_step(step_name, command_str)

            logging.info(str(emr_step_id))

            cluster_steps = emr_cluster_client.get_list_of_steps()
            assert 1 == len(cluster_steps)
            assert cluster_steps[0] == emr_step_id

            emr_step_status, _ = emr_cluster_client.get_step_status(
                emr_step_id)
            assert emr_step_status == "STARTING"

            # "STARTING" is not a valid EMR Step state, so we will change it to "RUNNING"
            emr_backend = mock_emr_obj.backends[self.aws_region]
            fake_cluster = emr_backend.clusters[emr_cluster_id]
            fake_cluster.steps[0].state = "RUNNING"

            def complete_step():
                # Invoked with a delay by ConcurrentExecutor below, so that
                # EMRClusterClient gets to poll a few times first.
                fake_cluster.steps[0].state = "COMPLETED"

            with ConcurrentExecutor(complete_step, 0.2):
                emr_cluster_client.wait_for_step_completion(
                    emr_step_id, self.long_timeout_seconds)

    @pytest.mark.emr
    @moto.mock_s3
    def test_add_step_to_cluster_fail_without_output(self):
        mock_emr_obj = moto.mock_emr()
        with mock_emr_obj:
            emr_cluster_client, emr_cluster_id = self.env_setup(
                self.emr_cluster_name, self.aws_region, self.aws_credentials,
                self.timeout_seconds, self.retry_seconds,
                self.long_timeout_seconds)

            s3_resource = Boto3Util.create_s3_resource()
            s3_resource.create_bucket(Bucket="mybucket")

            step_name = "Test_Step"
            command_str = "/usr/bin/spark-submit --class spark.job.main.class"

            emr_step_id = emr_cluster_client.add_step(step_name, command_str)

            cluster_steps = emr_cluster_client.get_list_of_steps()
            assert 1 == len(cluster_steps)
            assert cluster_steps[0] == emr_step_id

            emr_step_status, _ = emr_cluster_client.get_step_status(
                emr_step_id)
            assert emr_step_status == "STARTING"

            # "STARTING" is not a valid EMR Step state, so we will change it to "RUNNING"
            emr_backend = mock_emr_obj.backends[self.aws_region]
            fake_cluster = emr_backend.clusters[emr_cluster_id]
            fake_step = fake_cluster.steps[0]
            fake_step.state = "RUNNING"

            def fail_step():
                fake_step.state = "FAILED"

            # Make sure that we do not wait for 300 seconds for gz file to be available.
            EMRClusterClient.AWSConstants.S3_FILE_AVAILABILITY_TIMEOUT_SECONDS = self.timeout_seconds

            # Required for correct log path generation in MockedMethod.
            MockedMethod.emr_cluster_id = emr_cluster_id

            stderr_gz_path = MockedMethod.log_file_template.format(
                emr_cluster_id=emr_cluster_id, emr_step_id=emr_step_id)

            err_msg = "File {} failed to be available after {} seconds.".\
                format(stderr_gz_path, self.timeout_seconds)

            with pytest.raises(M3DAWSAPIException, match=err_msg):
                # Wait for some time to let EMRClusterClient poll a few times.
                with ConcurrentExecutor(fail_step, 0.4):
                    with patch(
                            "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient.get_step_status",
                            side_effect=MockedMethod.get_step_status_mocked):
                        emr_cluster_client.wait_for_step_completion(
                            emr_step_id, self.long_timeout_seconds)

    @pytest.mark.emr
    @moto.mock_s3
    def test_add_step_to_cluster_fail_with_output(self):
        mock_emr_obj = moto.mock_emr()
        with mock_emr_obj:
            emr_cluster_client, emr_cluster_id = self.env_setup(
                self.emr_cluster_name, self.aws_region, self.aws_credentials,
                self.timeout_seconds, self.retry_seconds,
                self.long_timeout_seconds)

            s3_resource = Boto3Util.create_s3_resource()
            s3_resource.create_bucket(Bucket="mybucket")

            step_name = "Test_Step"
            command_str = "/usr/bin/spark-submit --class spark.job.main.class"

            emr_step_id = emr_cluster_client.add_step(step_name, command_str)

            cluster_steps = emr_cluster_client.get_list_of_steps()
            assert 1 == len(cluster_steps)
            assert cluster_steps[0] == emr_step_id

            emr_step_status, _ = emr_cluster_client.get_step_status(
                emr_step_id)
            assert emr_step_status == "STARTING"

            # "STARTING" is not a valid EMR Step state, so we will change it to "RUNNING"
            emr_backend = mock_emr_obj.backends[self.aws_region]
            fake_cluster = emr_backend.clusters[emr_cluster_id]
            fake_step = fake_cluster.steps[0]
            fake_step.state = "RUNNING"

            # Make sure that we do not wait for 300 seconds for gz file to be available.
            EMRClusterClient.AWSConstants.S3_FILE_AVAILABILITY_TIMEOUT_SECONDS = self.timeout_seconds

            # Required for correct log path generation in MockedMethod.
            MockedMethod.emr_cluster_id = emr_cluster_id

            stderr_gz_path = MockedMethod.log_file_template.format(
                emr_cluster_id=emr_cluster_id, emr_step_id=emr_step_id)

            expected_content = "Lots of content here!!!"

            def fail_step_and_write_output():
                fake_step.state = "FAILED"

                time.sleep(0.3)

                compressed_content = TestEMRClusterClient._compress_string(
                    expected_content)

                bucket, key = emr_cluster_client.s3_util.get_bucket_and_key(
                    stderr_gz_path)
                s3_resource.Bucket(bucket).put_object(Key=key,
                                                      Body=compressed_content)

            with pytest.raises(M3DAWSAPIException) as exc:
                # Wait for some time to let EMRClusterClient poll a few times.
                with ConcurrentExecutor(fail_step_and_write_output, 0.3):
                    with patch(
                            "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient.get_step_status",
                            side_effect=MockedMethod.get_step_status_mocked):
                        emr_cluster_client.wait_for_step_completion(
                            emr_step_id, self.long_timeout_seconds)

            err_msg = "EMR Step with cluster_id='{}' and step_id='{}' failed to complete".\
                format(emr_cluster_id, emr_step_id)

            assert err_msg in str(exc.value)
            assert stderr_gz_path in str(exc.value)

            resulting_content = emr_cluster_client.s3_util.read_gzip_file_content(
                stderr_gz_path)
            assert expected_content == resulting_content

    @pytest.mark.emr
    @moto.mock_emr
    def test_wait_for_spark_step_completion(self):
        with pytest.raises(M3DAWSAPIException):
            # We expect an exception here because the step will remain in the STARTING state
            emr_cluster_client, _ = self.env_setup(self.emr_cluster_name,
                                                   self.aws_region,
                                                   self.aws_credentials,
                                                   self.timeout_seconds,
                                                   self.retry_seconds,
                                                   self.long_timeout_seconds)

            step_name = "Test_Step"
            command_str = "/usr/bin/spark-submit --class spark.job.main.class"

            emr_step_id = emr_cluster_client.add_step(step_name, command_str)

            emr_cluster_client.wait_for_spark_step_completion(emr_step_id)

    @pytest.mark.emr
    @moto.mock_emr
    def test_get_list_of_steps(self):
        emr_cluster_client, _ = self.env_setup(self.emr_cluster_name,
                                               self.aws_region,
                                               self.aws_credentials,
                                               self.timeout_seconds,
                                               self.retry_seconds,
                                               self.long_timeout_seconds)

        step_name = "Test_Step_{}"
        command_str_0 = "/usr/bin/spark-submit --class spark.job.main.class"
        command_str_1 = "spark-submit --class totally.different.main.class config.json"
        command_str_2 = "hive --silent -f s3://app.bucket/path/to/query.hql"

        emr_step_id_0 = emr_cluster_client.add_step(step_name.format(0),
                                                    command_str_0)
        emr_step_id_1 = emr_cluster_client.add_step(step_name.format(1),
                                                    command_str_1)
        emr_step_id_2 = emr_cluster_client.add_step(step_name.format(2),
                                                    command_str_2)

        step_ids = emr_cluster_client.get_list_of_steps()

        assert len(step_ids) == 3

        # Because of a bug in moto, step ids are returned in the same order as submission.
        # In reality the order should be reversed, so we have to compare in reverse order here.
        # ToDo: Change comparison order once the bug in moto is fixed, https://github.com/spulec/moto/issues/1866
        assert step_ids[2] == emr_step_id_0
        assert step_ids[1] == emr_step_id_1
        assert step_ids[0] == emr_step_id_2

    @pytest.mark.emr
    def test_get_step_output_path(self):
        mock_emr_obj = moto.mock_emr()
        with mock_emr_obj:
            emr_cluster_client, emr_cluster_id = self.env_setup(
                self.emr_cluster_name, self.aws_region, self.aws_credentials,
                self.timeout_seconds, self.retry_seconds,
                self.long_timeout_seconds)

            step_name = "Test_Step"
            command_str = "/usr/bin/spark-submit --class spark.job.main.class"

            emr_step_id = emr_cluster_client.add_step(step_name, command_str)

            # Change step state to COMPLETED
            emr_backend = mock_emr_obj.backends[self.aws_region]
            fake_cluster = emr_backend.clusters[emr_cluster_id]
            fake_cluster.steps[0].state = "COMPLETED"

            emr_cluster_client.wait_for_step_completion(
                emr_step_id, self.long_timeout_seconds)

            output_file = emr_cluster_client.get_step_output_path(emr_step_id)

            expected_output_file = "s3://mybucket/log/{}/steps/{}/stdout.gz".format(
                emr_cluster_client.emr_cluster_id, emr_step_id)

            assert output_file == expected_output_file

    @pytest.mark.emr
    @moto.mock_emr
    def test_get_step_err_path(self):
        emr_cluster_client, emr_cluster_id = self.env_setup(
            self.emr_cluster_name, self.aws_region, self.aws_credentials,
            self.timeout_seconds, self.retry_seconds,
            self.long_timeout_seconds)

        step_name = "Test_Step"
        command_str = "/usr/bin/spark-submit --class spark.job.main.class"

        emr_step_id = emr_cluster_client.add_step(step_name, command_str)

        MockedMethod.emr_cluster_id = emr_cluster_id
        with patch(
                "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient.get_step_status",
                side_effect=MockedMethod.get_step_status_mocked):
            err_file = emr_cluster_client.get_step_err_path(emr_step_id)

        expected_err_file = "s3://mybucket/log/{}/steps/{}/stderr.gz".format(
            emr_cluster_client.emr_cluster_id, emr_step_id)

        assert err_file == expected_err_file
Example #16
class TestAWSS3CredentialsWrapper(object):

    aws_credentials_api = AWSCredentials("access_key_id-api",
                                         "secret_access_key-api")
    aws_credentials_put = AWSCredentials("access_key_id-put",
                                         "secret_access_key-put")
    aws_credentials_del = AWSCredentials("access_key_id-del",
                                         "secret_access_key-del")

    ERROR_METHOD_NOT_SUPPORTED_TEMPLATE = \
        "The following method is not supported for AWSFactoryS3Wrapper: {attr}()"
    ERROR_UNKNOWN_ATTRIBUTE_TEMPLATE = \
        "{attr} is not an attribute of S3Util"

    ERROR_UNMANAGED_BUCKET_TEMPLATE = \
        "AWSFactoryS3Wrapper.{attr}() has been called with arguments pointing to S3 buckets which are " \
        "not managed by AWSFactoryS3Wrapper: {buckets}"

    ERROR_CROSS_BUCKET_ACCESS_TEMPLATE = \
        "AWSFactoryS3Wrapper.{attr}() has been called with arguments pointing to both" \
        " data and applications S3 buckets, this is not supported."

    @staticmethod
    def _create_aws_factory_s3_util(application_buckets, data_buckets):
        s3_util = AWSS3CredentialsWrapper.__new__(AWSS3CredentialsWrapper)
        s3_util.app_buckets = application_buckets
        s3_util.data_buckets = data_buckets

        s3_util.s3_util_api = FakeS3Util(
            TestAWSS3CredentialsWrapper.aws_credentials_api)
        s3_util.s3_util_put = FakeS3Util(
            TestAWSS3CredentialsWrapper.aws_credentials_put)
        s3_util.s3_util_del = FakeS3Util(
            TestAWSS3CredentialsWrapper.aws_credentials_del)

        return s3_util

    @staticmethod
    def _reset():
        FakeS3Util.calls = []

    def setup_method(self, _):
        TestAWSS3CredentialsWrapper._reset()

    @staticmethod
    def _assert_calls_formatted(call_0, call_1):
        not_equal_msg = "<{}> is not equal to <{}>".format(call_0, call_1)
        assert call_0 == call_1, not_equal_msg

    @pytest.mark.emr
    def test_upload_object(self):
        s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util(
            ["app"], ["data"])

        # case 1: point to application bucket
        s3_util.upload_object("local/path", "s3://app/s3/path")
        self._assert_calls_formatted(
            FakeS3Util.calls[0],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api,
                       "upload_object", ("local/path", "s3://app/s3/path"),
                       {}))

        # case 2: point to data bucket
        s3_util.upload_object("local/path", "s3://data/s3/path")
        self._assert_calls_formatted(
            FakeS3Util.calls[1],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_put,
                       "upload_object", ("local/path", "s3://data/s3/path"),
                       {}))

        # case 3: point to un-managed bucket
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="upload_object", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.upload_object("local/path", "s3://unknown/s3/path")

    @pytest.mark.emr
    def test_upload_child_objects(self):
        s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util(
            ["app", "log"], ["landing", "lake"])

        # case 1: point to application bucket
        s3_util.upload_child_objects("local/path",
                                     "s3://app/s3/path",
                                     recursive=True)
        self._assert_calls_formatted(
            FakeS3Util.calls[0],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api,
                       "upload_child_objects",
                       ("local/path", "s3://app/s3/path"),
                       {"recursive": True}))

        # case 2: point to data bucket
        s3_util.upload_child_objects("local/path",
                                     "s3://lake/s3/path",
                                     fn_pattern="*.py")
        self._assert_calls_formatted(
            FakeS3Util.calls[1],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_put,
                       "upload_child_objects",
                       ("local/path", "s3://lake/s3/path"),
                       {"fn_pattern": "*.py"}))

        # case 3: point to un-managed bucket
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="upload_child_objects", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.upload_child_objects("local/path", "s3://unknown/s3/path")

    @pytest.mark.emr
    def test_delete_object(self):
        s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util(
            ["app", "log"], ["landing", "lake"])

        # case 1: point to application bucket
        s3_util.delete_object("s3://log/s3/path/some.log")
        self._assert_calls_formatted(
            FakeS3Util.calls[0],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api,
                       "delete_object", ("s3://log/s3/path/some.log", ), {}))

        # case 2: point to data bucket
        s3_util.delete_object("s3://landing/s3/path/some.log")
        self._assert_calls_formatted(
            FakeS3Util.calls[1],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_del,
                       "delete_object", ("s3://landing/s3/path/some.log", ),
                       {}))

        # case 3: point to un-managed bucket
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="delete_object", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.delete_object("s3://unknown/s3/path/some.log")

    @pytest.mark.emr
    def test_delete_objects(self):
        s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util(
            ["app", "log"], ["landing", "lake"])

        # case 1: point to application bucket
        s3_util.delete_objects("s3://log/s3/path/")
        self._assert_calls_formatted(
            FakeS3Util.calls[0],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api,
                       "delete_objects", ("s3://log/s3/path/", ), {}))

        # case 2: point to data bucket
        s3_util.delete_objects("s3://landing/s3/path/")
        self._assert_calls_formatted(
            FakeS3Util.calls[1],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_del,
                       "delete_objects", ("s3://landing/s3/path/", ), {}))

        # case 3: point to un-managed bucket
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="delete_objects", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.delete_objects("s3://unknown/s3/path/")

    @pytest.mark.emr
    def test_delete_child_objects(self):
        s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util(
            ["app", "log"], ["landing", "lake"])

        # case 1: point to application bucket
        s3_util.delete_child_objects("s3://log/s3/path")
        self._assert_calls_formatted(
            FakeS3Util.calls[0],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api,
                       "delete_child_objects", ("s3://log/s3/path", ), {}))

        # case 2: point to data bucket
        s3_util.delete_child_objects("s3://landing/s3/path")
        self._assert_calls_formatted(
            FakeS3Util.calls[1],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_del,
                       "delete_child_objects", ("s3://landing/s3/path", ), {}))

        # case 3: point to un-managed bucket
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="delete_child_objects", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.delete_child_objects("s3://unknown/s3/path/")

    @pytest.mark.emr
    def test_list_objects(self):
        s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util(
            ["app", "log"], ["landing", "lake"])

        # case 1: point to application bucket
        s3_util.list_objects("s3://log/s3/path/")
        self._assert_calls_formatted(
            FakeS3Util.calls[0],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api,
                       "list_objects", ("s3://log/s3/path/", ), {}))

        # case 2: point to data bucket
        s3_util.list_objects("s3://landing/s3/path/")
        self._assert_calls_formatted(
            FakeS3Util.calls[1],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_put,
                       "list_objects", ("s3://landing/s3/path/", ), {}))

        # case 3: point to un-managed bucket
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="list_objects", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.list_objects("s3://unknown/s3/path/")

    @pytest.mark.emr
    def test_list_child_objects(self):
        s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util(
            ["app", "log"], ["landing", "lake"])

        # case 1: point to application bucket
        s3_util.list_child_objects("s3://app/s3/path/")
        self._assert_calls_formatted(
            FakeS3Util.calls[0],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api,
                       "list_child_objects", ("s3://app/s3/path/", ), {}))

        # case 2: point to data bucket
        s3_util.list_child_objects("s3://landing/s3/path/")
        self._assert_calls_formatted(
            FakeS3Util.calls[1],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_put,
                       "list_child_objects", ("s3://landing/s3/path/", ), {}))

        # case 3: point to un-managed bucket
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="list_child_objects", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.list_child_objects("s3://unknown/s3/path/")

    @pytest.mark.emr
    def test_move_object(self):
        s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util(
            ["app", "log"], ["landing", "lake"])

        # case 1: point to application buckets
        s3_util.move_object("s3://app/s3/path/some.obj",
                            destination_s3_path="s3://log/s3/path/some.obj")
        self._assert_calls_formatted(
            FakeS3Util.calls[0],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api,
                       "move_object", ("s3://app/s3/path/some.obj", ),
                       {"destination_s3_path": "s3://log/s3/path/some.obj"}))

        # case 2: point to data buckets
        err_msg = TestAWSS3CredentialsWrapper.ERROR_METHOD_NOT_SUPPORTED_TEMPLATE.format(
            attr="move_object")
        err_msg = re.escape(err_msg)

        with pytest.raises(AttributeError, match=err_msg):
            s3_util.move_object(
                src_s3_path="s3://landing/s3/path/some.obj",
                destination_s3_path="s3://lake/s3/path/some.obj")

        # case 3: point to un-managed bucket
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="move_object", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.move_object("s3://unknown/d1/o.obj",
                                "s3://unknown/d2/o.obj")

        # case 4: try to copy from app to landing
        err_msg = TestAWSS3CredentialsWrapper.ERROR_CROSS_BUCKET_ACCESS_TEMPLATE.format(
            attr="move_object")
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.move_object("s3://app/d/o.obj", "s3://landing/d/o.obj")

        # case 5: try to copy from app to unmanaged
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="move_object", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.move_object("s3://app/d1/o.obj", "s3://unknown/d2/o.obj")

        # case 6: try to copy from landing to unmanaged
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="move_object", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.move_object("s3://landing/d1/o.obj",
                                "s3://unknown/d2/o.obj")

    @pytest.mark.emr
    def test_move_objects(self):
        s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util(
            ["app", "log"], ["landing", "lake"])

        # case 1: point to application buckets
        s3_util.move_objects("s3://app/s3/path/",
                             destination_s3_prefix_path="s3://log/s3/path/")
        self._assert_calls_formatted(
            FakeS3Util.calls[0],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api,
                       "move_objects", ("s3://app/s3/path/", ),
                       {"destination_s3_prefix_path": "s3://log/s3/path/"}))

        # case 2: point to data buckets
        err_msg = TestAWSS3CredentialsWrapper.ERROR_METHOD_NOT_SUPPORTED_TEMPLATE.format(
            attr="move_objects")
        err_msg = re.escape(err_msg)

        with pytest.raises(AttributeError, match=err_msg):
            s3_util.move_objects("s3://landing/s3/path/", "s3://lake/s3/path/")

        # case 3: point to un-managed bucket
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="move_objects", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.move_objects("s3://unknown/d1/o.obj",
                                 "s3://unknown/d2/o.obj")

        # case 4: try to copy from app to landing
        err_msg = TestAWSS3CredentialsWrapper.ERROR_CROSS_BUCKET_ACCESS_TEMPLATE.format(
            attr="move_objects")
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.move_objects("s3://app/d/", "s3://landing/d/")

        # case 5: try to copy from app to unmanaged
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="move_objects", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.move_objects("s3://app/d1/o.obj", "s3://unknown/d2/o.obj")

        # case 6: try to copy from landing to unmanaged
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="move_objects", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.move_objects("s3://landing/d1/o.obj",
                                 "s3://unknown/d2/o.obj")

    @pytest.mark.emr
    def test_move_child_objects(self):
        s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util(
            ["app", "log"], ["landing", "lake"])

        # case 1: point to application buckets
        s3_util.move_child_objects("s3://app/s3/path/", "s3://log/s3/path/")
        self._assert_calls_formatted(
            FakeS3Util.calls[0],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api,
                       "move_child_objects",
                       ("s3://app/s3/path/", "s3://log/s3/path/"), {}))

        # case 2: point to data buckets
        err_msg = TestAWSS3CredentialsWrapper.ERROR_METHOD_NOT_SUPPORTED_TEMPLATE.format(
            attr="move_child_objects")
        err_msg = re.escape(err_msg)

        with pytest.raises(AttributeError, match=err_msg):
            s3_util.move_child_objects(
                src_s3_prefix_path="s3://landing/s3/path/",
                destination_s3_prefix_path="s3://lake/s3/path/")

        # case 3: point to un-managed bucket
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="move_child_objects", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.move_child_objects("s3://unknown/d1/o.obj",
                                       "s3://unknown/d2/o.obj")

        # case 4: try to copy from app to landing
        err_msg = TestAWSS3CredentialsWrapper.ERROR_CROSS_BUCKET_ACCESS_TEMPLATE.format(
            attr="move_child_objects")
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.move_child_objects("s3://app/d/", "s3://landing/d/")

        # case 5: try to copy from app to unmanaged
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="move_child_objects", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.move_child_objects("s3://app/d1/o.obj",
                                       "s3://unknown/d2/o.obj")

        # case 6: try to copy from landing to unmanaged
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="move_child_objects", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.move_child_objects("s3://landing/d1/o.obj",
                                       "s3://unknown/d2/o.obj")

    @pytest.mark.emr
    def test_object_exists(self):
        s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util(
            ["app", "log"], ["landing", "lake"])

        # case 1: point to application bucket
        s3_util.object_exists("s3://app/s3/path/o.obj")
        self._assert_calls_formatted(
            FakeS3Util.calls[0],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api,
                       "object_exists", ("s3://app/s3/path/o.obj", ), {}))

        # case 2: point to data bucket
        s3_util.object_exists(s3_path="s3://landing/s3/path/o.obj")
        self._assert_calls_formatted(
            FakeS3Util.calls[1],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_put,
                       "object_exists", (),
                       {"s3_path": "s3://landing/s3/path/o.obj"}))

        # case 3: point to un-managed bucket
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="object_exists", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.object_exists("s3://unknown/s3/path/o.obj")

    @pytest.mark.emr
    def test_read_gzip_file_content(self):
        s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util(
            ["app", "log"], ["landing", "lake"])

        # case 1: point to application bucket
        s3_util.read_gzip_file_content("s3://app/s3/path/o.obj")
        self._assert_calls_formatted(
            FakeS3Util.calls[0],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api,
                       "read_gzip_file_content", ("s3://app/s3/path/o.obj", ),
                       {}))

        # case 2: point to data bucket
        err_msg = TestAWSS3CredentialsWrapper.ERROR_METHOD_NOT_SUPPORTED_TEMPLATE.format(
            attr="read_gzip_file_content")
        err_msg = re.escape(err_msg)

        with pytest.raises(AttributeError, match=err_msg):
            s3_util.read_gzip_file_content(
                s3_path="s3://landing/s3/path/o.obj")

        # case 3: point to un-managed bucket
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="read_gzip_file_content", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.read_gzip_file_content("s3://unknown/s3/path/o.obj")

    @pytest.mark.emr
    def test_get_bucket_and_key(self):
        s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util(
            ["app", "log"], ["landing", "lake"])

        # case 1: point to application bucket
        s3_util.get_bucket_and_key("s3://app/s3/path/o.obj")
        self._assert_calls_formatted(
            FakeS3Util.calls[0],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api,
                       "get_bucket_and_key", ("s3://app/s3/path/o.obj", ), {}))

        # case 2: point to data bucket
        s3_util.get_bucket_and_key(object_key="s3://landing/s3/path/o.obj")
        self._assert_calls_formatted(
            FakeS3Util.calls[1],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_put,
                       "get_bucket_and_key", (),
                       {"object_key": "s3://landing/s3/path/o.obj"}))

        # case 3: point to un-managed bucket
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="get_bucket_and_key", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.get_bucket_and_key("s3://unknown/s3/path/o.obj")

    @pytest.mark.emr
    def test_wait_for_file_availability(self):
        s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util(
            ["app", "log"], ["landing", "lake"])

        # case 1: point to application bucket
        s3_util.wait_for_file_availability(
            s3_file_location="s3://app/s3/path/o.obj",
            polling_interval_seconds=3,
            timeout_seconds=10)
        self._assert_calls_formatted(
            FakeS3Util.calls[0],
            Invocation(
                TestAWSS3CredentialsWrapper.aws_credentials_api,
                "wait_for_file_availability", (), {
                    "s3_file_location": "s3://app/s3/path/o.obj",
                    "polling_interval_seconds": 3,
                    "timeout_seconds": 10
                }))

        # case 2: point to data bucket
        s3_util.wait_for_file_availability("s3://landing/s3/path/o.obj", 3000,
                                           10000)
        self._assert_calls_formatted(
            FakeS3Util.calls[1],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_put,
                       "wait_for_file_availability",
                       ("s3://landing/s3/path/o.obj", 3000, 10000), {}))

        # case 3: point to un-managed bucket
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="wait_for_file_availability", buckets=["unknown"])
        err_msg = re.escape(err_msg)

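        # Note: polling_interval_seconds and timeout_seconds are omitted here;
        # the wrapper rejects the un-managed bucket before the call would be
        # delegated to S3Util.wait_for_file_availability.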
        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.wait_for_file_availability("s3://unknown/s3/path/o.obj")

    @pytest.mark.emr
    def test_normalize_s3_path(self):
        s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util(
            ["app", "log"], ["landing", "lake"])

        # case 1: point to application bucket
        s3_util.normalize_s3_path("s3a://app/s3/path/o.obj")
        self._assert_calls_formatted(
            FakeS3Util.calls[0],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api,
                       "normalize_s3_path", ("s3a://app/s3/path/o.obj", ), {}))

        # case 2: point to data bucket
        s3_util.normalize_s3_path(s3_path="s3n://landing/s3/path/o.obj")
        self._assert_calls_formatted(
            FakeS3Util.calls[1],
            Invocation(TestAWSS3CredentialsWrapper.aws_credentials_put,
                       "normalize_s3_path", (),
                       {"s3_path": "s3n://landing/s3/path/o.obj"}))

        # case 3: point to un-managed bucket
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="normalize_s3_path", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.normalize_s3_path("s3://unknown/s3/path/o.obj")

    @pytest.mark.emr
    def test_is_s3_path(self):
        s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util(
            ["app", "log"], ["landing", "lake"])

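        # is_s3_path is a static method on S3Util, so the wrapper routes it
        # without any real credentials; hence AWSCredentials("", "") in the
        # expected invocations below.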
        # case 1: point to application bucket
        assert s3_util.is_s3_path("s3a://app/s3/path/o.obj") is True
        self._assert_calls_formatted(
            FakeS3Util.calls[0],
            Invocation(AWSCredentials("", ""), "is_s3_path",
                       ("s3a://app/s3/path/o.obj", ), {}))

        # case 2: point to data bucket
        assert s3_util.is_s3_path(obj="/landing/s3/path/o.obj") is False
        self._assert_calls_formatted(
            FakeS3Util.calls[1],
            Invocation(AWSCredentials("", ""), "is_s3_path", (),
                       {"obj": "/landing/s3/path/o.obj"}))

        # case 3: point to un-managed bucket
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format(
            attr="is_s3_path", buckets=["unknown"])
        err_msg = re.escape(err_msg)

        with pytest.raises(M3DIllegalArgumentException, match=err_msg):
            s3_util.is_s3_path("s3://unknown/s3/path/o.obj")

    @pytest.mark.emr
    def test_unknown_method(self):
        s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util(
            ["app", "log"], ["landing", "lake"])

        # case 1: point to application bucket. s3_resource is not an attribute of S3Util.
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNKNOWN_ATTRIBUTE_TEMPLATE.format(
            attr="s3_resource")
        err_msg = re.escape(err_msg)

        with pytest.raises(AttributeError, match=err_msg):
            s3_util.s3_resource("s3://log/s3/path/")

        # case 2: point to data bucket. unknown_method is not an attribute of S3Util.
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNKNOWN_ATTRIBUTE_TEMPLATE.format(
            attr="unknown_method")
        err_msg = re.escape(err_msg)

        with pytest.raises(AttributeError, match=err_msg):
            s3_util.unknown_method("s3://landing/s3/path/")

        # case 3: point to un-managed bucket. unknown_method is still not an attribute of S3Util.
        err_msg = TestAWSS3CredentialsWrapper.ERROR_UNKNOWN_ATTRIBUTE_TEMPLATE.format(
            attr="unknown_method")
        err_msg = re.escape(err_msg)

        with pytest.raises(AttributeError, match=err_msg):
            s3_util.unknown_method("s3://unknown/s3/path/")
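
# ---------------------------------------------------------------------------
# For orientation only: a minimal, hypothetical sketch of the routing idea the
# tests above exercise. It is NOT the actual AWSS3CredentialsWrapper
# implementation from m3d-api. The class name, the helper `_collect_buckets`,
# and the error messages are illustrative; the static-method and
# delete-credentials cases shown in the tests are omitted for brevity.
# Assumes `S3Util` and `M3DIllegalArgumentException` are importable as in the
# tests above.
# ---------------------------------------------------------------------------
import re


def _collect_buckets(args, kwargs):
    """Collect bucket names from any s3://, s3a:// or s3n:// path arguments."""
    buckets = set()
    for value in list(args) + list(kwargs.values()):
        if isinstance(value, str):
            match = re.match(r"^s3[an]?://([^/]+)/", value)
            if match:
                buckets.add(match.group(1))
    return buckets


class CredentialsRoutingSketch(object):
    def __init__(self, app_buckets, data_buckets, api_creds, put_creds):
        self.app_buckets = set(app_buckets)    # e.g. application + log buckets
        self.data_buckets = set(data_buckets)  # e.g. landing + lake buckets
        self.api_util = S3Util(api_creds)      # serves application buckets
        self.put_util = S3Util(put_creds)      # serves writes to data buckets

    def __getattr__(self, attr):
        if not hasattr(S3Util, attr):
            raise AttributeError("S3Util has no attribute '{}'".format(attr))

        def dispatch(*args, **kwargs):
            buckets = _collect_buckets(args, kwargs)
            if buckets and buckets <= self.app_buckets:
                return getattr(self.api_util, attr)(*args, **kwargs)
            if buckets and buckets <= self.data_buckets:
                return getattr(self.put_util, attr)(*args, **kwargs)
            raise M3DIllegalArgumentException(
                "Cannot serve '{}' for buckets: {}".format(
                    attr, sorted(buckets)))

        return dispatch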
Example #17
    def __init__(
            self,
            config,
            source_system,
            database,
            environment,
            emr_cluster_id=None,
            spark_params=None
    ):
        """
        Initialize Hadoop system

        :param config: system config file
        :param source_system: destination system code
        :param database: destination database code
        :param environment: destination schema code
        :param emr_cluster_id: id EMR cluster
        :param spark_params: spark specific parameters
        """

        # call super constructor
        super(EMRSystem, self).__init__(config, source_system, database, environment)

        self.scon_full_path = self.config_service.get_scon_path(source_system, database)

        # system config files
        with open(self.scon_full_path) as data_file:
            params_system = json.load(data_file)

        # S3 buckets
        self.bucket_landing = params_system["environments"][self.environment]["s3_buckets"]["landing"]
        self.bucket_lake = params_system["environments"][self.environment]["s3_buckets"]["lake"]
        self.bucket_mart_cal = params_system["environments"][self.environment]["s3_buckets"]["mart_cal"]
        self.bucket_application = params_system["environments"][self.environment]["s3_buckets"]["application"]
        self.bucket_log = params_system["environments"][self.environment]["s3_buckets"]["log"]

        # EMR default configuration
        self.default_emr_version = params_system["emr"]["default_emr_version"]
        self.default_ebs_size = params_system["emr"]["default_ebs_size"]

        # base M3D application deployment directory
        self.s3_deployment_dir_base = params_system["environments"][self.environment]["s3_deployment_dir_base"]

        # AWS credentials
        self.aws_api_credentials = AWSCredentials.from_file(params_system["aws_api_credentials"])
        self.aws_s3_put_credentials = AWSCredentials.from_file(params_system["aws_s3_put_credentials"])
        self.aws_s3_del_credentials = AWSCredentials.from_file(params_system["aws_s3_del_credentials"])

        # configurations
        self.api_action_timeout_seconds = params_system["api_action_timeout_seconds"]
        self.api_action_polling_interval_seconds = params_system["api_action_polling_interval_seconds"]
        self.api_long_timeout_seconds = params_system["api_long_timeout_seconds"]
        self.aws_region = params_system["aws_region"]
        self.packages_to_deploy = params_system["packages_to_deploy"]
        self.configs_to_deploy = params_system["configs_to_deploy"]

        # base directories
        self.s3_dir_base = params_system["s3_dir_base"]

        # defined sub-directories
        self.subdir_archive = params_system["subdir"]["archive"]
        self.subdir_header = params_system["subdir"]["header"]
        self.subdir_config = params_system["subdir"]["config"]
        self.subdir_data = params_system["subdir"]["data"]
        self.subdir_delta_table = params_system["subdir"]["delta_table"]
        self.subdir_data_backup = DataSystem.DirectoryName.DATA_BACKUP
        self.subdir_error = params_system["subdir"]["error"]
        self.subdir_work = params_system["subdir"]["work"]
        self.subdir_log = params_system["subdir"]["log"]
        self.subdir_apps = params_system["subdir"]["apps"]
        self.subdir_m3d_engine = params_system["subdir"]["m3d_engine"]
        self.subdir_loading = params_system["subdir"]["loading"]
        self.subdir_full_load = params_system["subdir"]["full_load"]
        self.subdir_delta_load = params_system["subdir"]["delta_load"]
        self.subdir_delta_lake_load = params_system["subdir"]["delta_lake_load"]
        self.subdir_append_load = params_system["subdir"]["append_load"]
        self.subdir_black_whole = params_system["subdir"]["black_whole"]
        self.subdir_credentials = params_system["subdir"]["credentials"]
        self.subdir_keytab = params_system["subdir"]["keytab"]
        self.subdir_tmp = params_system["subdir"]["tmp"]

        # deployment directories of M3D application and metadata (tconx)
        self.subdir_code = params_system["subdir"]["m3d"]
        self.subdir_metadata = params_system["subdir"]["metadata"]

        # spark arguments
        self.spark_main_class = params_system["spark"]["main_class"]
        self.spark_jar_name = params_system["spark"]["jar_name"]

        self.spark_params = spark_params

        self.version_filename = "version.txt"

        s3_deployment_dir = "{protocol}{bucket}{base_dir}".format(
            protocol=ConfigService.Protocols.S3,
            bucket=self.bucket_application,
            base_dir=self.s3_deployment_dir_base
        )
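        # e.g. "s3://dev-application-bucket/m3d"
        # (assuming ConfigService.Protocols.S3 is the "s3://" prefix)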

        # derived directories: apps
        self.dir_apps = os.path.join(s3_deployment_dir, self.environment, self.subdir_apps)
        self.dir_apps_algorithm = os.path.join(self.dir_apps, self.subdir_m3d_engine)
        self.dir_apps_loading = os.path.join(self.dir_apps, self.subdir_loading)

        # tmp directory in S3
        self.dir_tmp_s3 = os.path.join(s3_deployment_dir, self.environment, self.subdir_tmp)

        # tmp directory in local filesystem
        self.dir_tmp_local = os.path.join("/", self.subdir_tmp)

        self.dir_m3d_api_deployment = os.path.join(
            s3_deployment_dir,
            self.environment,
            self.subdir_code,
            self.config_service.subdir_projects_m3d_api
        )

        self.dir_metadata_deployment = os.path.join(
            s3_deployment_dir,
            self.environment,
            self.subdir_metadata,
            self.config_service.subdir_projects_m3d_api
        )

        self.spark_jar_path = os.path.join(
            self.dir_m3d_api_deployment,
            self.spark_jar_name
        )

        # AWSS3CredentialsWrapper will do the routing of methods to correct underlying S3Util object.
        self.s3_util = AWSS3CredentialsWrapper(
            [self.bucket_application, self.bucket_log],
            [self.bucket_landing, self.bucket_lake, self.bucket_mart_cal],
            self.aws_api_credentials,
            self.aws_s3_put_credentials,
            self.aws_s3_del_credentials
        )

        # Set up EMRClusterClient
        self.emr_cluster_id = emr_cluster_id

        if emr_cluster_id is not None:
            self.emr_cluster_client = self._create_emr_cluster_client(emr_cluster_id)
        else:
            self.emr_cluster_client = None
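
# ---------------------------------------------------------------------------
# For illustration only: a minimal, hypothetical fragment of the scon_emr
# JSON consumed by the constructor above. Only the keys mirror the lookups in
# __init__; every value (bucket names, paths, versions) is a placeholder.
# ---------------------------------------------------------------------------
SCON_EMR_SKETCH = {
    "aws_region": "eu-west-1",
    "aws_api_credentials": "/path/to/credentials-api.json",
    "aws_s3_put_credentials": "/path/to/credentials-s3_put.json",
    "aws_s3_del_credentials": "/path/to/credentials-s3_del.json",
    "api_action_timeout_seconds": 10,
    "api_action_polling_interval_seconds": 0.2,
    "api_long_timeout_seconds": 20,
    "s3_dir_base": "m3d",
    "packages_to_deploy": ["m3d-engine"],
    "configs_to_deploy": ["config"],
    "emr": {
        "default_emr_version": "emr-5.17.0",
        "default_ebs_size": "128"
    },
    "spark": {
        "main_class": "com.example.SparkMain",
        "jar_name": "m3d-engine-assembly.jar"
    },
    "subdir": {
        "archive": "archive", "header": "header", "config": "config",
        "data": "data", "delta_table": "delta_table", "error": "error",
        "work": "work", "log": "log", "apps": "apps",
        "m3d_engine": "m3d_engine", "loading": "loading",
        "full_load": "full_load", "delta_load": "delta_load",
        "delta_lake_load": "delta_lake_load", "append_load": "append_load",
        "black_whole": "black_whole", "credentials": "credentials",
        "keytab": "keytab", "tmp": "tmp", "m3d": "m3d",
        "metadata": "metadata"
    },
    "environments": {
        "dev": {
            "s3_deployment_dir_base": "/m3d",
            "s3_buckets": {
                "landing": "dev-landing-bucket",
                "lake": "dev-lake-bucket",
                "mart_cal": "dev-mart-cal-bucket",
                "application": "dev-application-bucket",
                "log": "dev-log-bucket"
            }
        }
    }
}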
Example #18
    def env_setup(self, tmpdir, destination_system, destination_database,
                  destination_environment):
        """
        This functions creates test specific config.json, scon_emr configuration file in provided tmpdir.
        Directory structure will resemble that of config directory found in root folder of this repository.

            - config.json will be changed to point to a test specific configuration files.
            - scon_emr will be changed to point to a test case specific root directory in HDFS and connect to HTTP
              server on localhost.
            - scon_mdp will be changed to point to a dummy credentials configuration.

        :param tmpdir: test case specific temporary directory where configuration files will be created.
        :param destination_system: destination system code
        :param destination_database: destination database code
        :param destination_environment: destination environment code

        :return: Function will return several parameters:

                     m3d_config_path: paths of test-specific config.json. Should be passed to M3D API calls.
                     scon_emr_path: paths of test-specific scon_emr
                     m3d_config_dict: contents of test-specific config.json as dict
                     scon_emr_dict: contents of test-specific scon_emr as dict
        """
        m3d_config_dict = util.Util.load_dict(self.default_m3d_config)
        tag_config = m3d_config_dict["tags"]["config"]
        tag_system = m3d_config_dict["tags"]["system"]

        tag_credentials = "credentials"

        config_dir = tmpdir.mkdir(tag_config)
        config_system_dir = config_dir.mkdir(tag_system)
        config_credentials_dir = config_dir.mkdir(tag_credentials)

        m3d_config_dict["tags"]["config"] = str(config_dir)
        m3d_config_dict["dir_exec"] = str(self.local_run_dir.mkdir("tmp"))
        m3d_config_file = config_dir.mkdir("m3d").join("config.json")
        m3d_config_file.write(json.dumps(m3d_config_dict, indent=4))

        aws_api_credentials_file = config_credentials_dir.join(
            "credentials-{}-{}-api.json".format(destination_system,
                                                destination_database))

        aws_s3_put_credentials_file = config_credentials_dir.join(
            "credentials-{}-{}-s3_put.json".format(destination_system,
                                                   destination_database))

        aws_s3_del_credentials_file = config_credentials_dir.join(
            "credentials-{}-{}-s3_del.json".format(destination_system,
                                                   destination_database))

        self.dump_aws_credentials(
            AWSCredentials("test-aws-access-key-api",
                           "test-aws-secret-key-api"),
            str(aws_api_credentials_file))

        self.dump_aws_credentials(
            AWSCredentials("test-aws-access-key-s3_put",
                           "test-aws-secret-key-s3_put"),
            str(aws_s3_put_credentials_file))

        self.dump_aws_credentials(
            AWSCredentials("test-aws-access-key-s3_del",
                           "test-aws-secret-key-s3_del"),
            str(aws_s3_del_credentials_file))

        scon_emr_filename = \
            ConfigService.Prefixes.SCON + "-" + destination_system + "-" + destination_database + ".json"
        scon_emr_file = config_system_dir.join(scon_emr_filename)
        scon_emr_dict = util.Util.load_dict(self.default_scon_emr)

        scon_emr_dict["name_service"] = "localhost:9000"
        scon_emr_dict["credentials"] = "<placeholder_for_AWS_credentials_file>"
        scon_emr_dict["aws_region"] = self.default_aws_region

        scon_emr_dict["aws_api_credentials"] = str(aws_api_credentials_file)
        scon_emr_dict["aws_s3_put_credentials"] = str(
            aws_s3_put_credentials_file)
        scon_emr_dict["aws_s3_del_credentials"] = str(
            aws_s3_del_credentials_file)

        scon_emr_dict["api_gateway"] = self.default_server_url
        scon_emr_dict["api_action_timeout_seconds"] = 10
        scon_emr_dict["api_action_polling_interval_seconds"] = 0.2
        scon_emr_dict["api_long_timeout_seconds"] = 20

        scon_emr_dict["emr"]["default_emr_version"] = "emr-5.17.0"
        scon_emr_dict["emr"]["default_ebs_size"] = "128"

        scon_emr_dict["environments"][destination_environment]["s3_buckets"]["landing"] = \
            self.default_dev_landing_bucket
        scon_emr_dict["environments"][destination_environment]["s3_buckets"]["lake"] = \
            self.default_dev_lake_bucket
        scon_emr_dict["environments"][destination_environment]["s3_buckets"]["mart_cal"] = \
            self.default_dev_mart_cal_bucket
        scon_emr_dict["environments"][destination_environment]["s3_buckets"]["metadata"] = \
            self.default_dev_metadata_bucket
        scon_emr_dict["environments"][destination_environment]["s3_buckets"]["inbound"] = \
            self.default_dev_inbound_bucket
        scon_emr_dict["environments"][destination_environment]["s3_buckets"]["application"] = \
            self.default_dev_application_bucket
        scon_emr_dict["environments"][destination_environment]["s3_buckets"][
            "log"] = self.default_log_bucket

        scon_emr_file.write(json.dumps(scon_emr_dict, indent=4))

        logging.debug(
            "test case configuration is saved in \"{}\" directory".format(
                str(config_dir)))

        return str(m3d_config_file), str(
            scon_emr_file), m3d_config_dict, scon_emr_dict
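
    # Hypothetical usage sketch (not part of the original suite): shows how a
    # test method could consume env_setup's return values. The destination
    # codes below are placeholders, and `os` is assumed to be imported at
    # module level.
    def test_env_setup_writes_configs(self, tmpdir):
        m3d_config_file, scon_emr_file, m3d_config_dict, scon_emr_dict = \
            self.env_setup(tmpdir, "bdp", "test101", "dev")

        # both JSON files were written to disk
        assert os.path.isfile(m3d_config_file)
        assert os.path.isfile(scon_emr_file)

        # the returned dicts mirror the values written above
        assert scon_emr_dict["emr"]["default_emr_version"] == "emr-5.17.0"
        assert scon_emr_dict["aws_region"] == self.default_aws_region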