def test_is_s3_path(self): s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util( ["app", "log"], ["landing", "lake"]) # case 1: point to application bucket assert s3_util.is_s3_path("s3a://app/s3/path/o.obj") is True self._assert_calls_formatted( FakeS3Util.calls[0], Invocation(AWSCredentials("", ""), "is_s3_path", ("s3a://app/s3/path/o.obj", ), {})) # case 2: point to data bucket assert s3_util.is_s3_path(obj="/landing/s3/path/o.obj") is False self._assert_calls_formatted( FakeS3Util.calls[1], Invocation(AWSCredentials("", ""), "is_s3_path", (), {"obj": "/landing/s3/path/o.obj"})) # case 3: point to un-managed bucket err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="is_s3_path", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.is_s3_path("s3://unknown/s3/path/o.obj")
    def __init__(self, hql_validation_function, hql_validation_error=None):
        """
        Test double that records HQL statements and validates them via a callback.

        :param hql_validation_function: callable applied to each HQL statement
                                        (exact contract defined by the caller — confirm at use site)
        :param hql_validation_error: error to surface when validation fails, or None
        """
        self.hql_validation_function = hql_validation_function
        self.hql_validation_error = hql_validation_error

        # Bucket names are taken from the shared table specs so they match
        # the rest of the S3 table tests.
        self.bucket_landing = TestS3Table.LANDING_SPEC.bucket
        self.bucket_lake = TestS3Table.LAKE_SPEC.bucket

        # Statements observed so far, in order of execution.
        self.statements = []

        # Wrapper with distinct API / put / delete credentials; no application
        # buckets, only the two data buckets above.
        self.s3_util = AWSS3CredentialsWrapper(
            [], [self.bucket_landing, self.bucket_lake],
            AWSCredentials("aws_access_key_api", "aws_secret_key_api"),
            AWSCredentials("aws_access_key_s3_put", "aws_secret_key_s3_put"),
            AWSCredentials("aws_access_key_s3_del", "aws_secret_key_s3_del"))
def test_move_object(self): test_src_bucket_name = "test_src_bucket" test_destination_bucket_name = "test_destination_bucket" test_src_key = "test_src_key" test_destination_key = "test_destination_key" test_content = "aaa1" s3_resource = Boto3Util.create_s3_resource() s3_resource.create_bucket(Bucket=test_src_bucket_name) s3_resource.create_bucket(Bucket=test_destination_bucket_name) s3_resource.Bucket(test_src_bucket_name).put_object(Key=test_src_key, Body=test_content) s3_util = S3Util(AWSCredentials("", "")) s3_util.move_object( ("s3://" + test_src_bucket_name + "/" + test_src_key), ("s3://" + test_destination_bucket_name + "/" + test_destination_key)) destination_objects = list( s3_resource.Bucket(test_destination_bucket_name).objects.all()) assert len(destination_objects) == 1 assert destination_objects[0].key == test_destination_key src_objects = list( s3_resource.Bucket(test_src_bucket_name).objects.all()) assert len(src_objects) == 0
def is_s3_path(*args, **kwargs): """ With this one function we will test invocation of static methods """ FakeS3Util.calls.append( Invocation(AWSCredentials("", ""), "is_s3_path", args, kwargs)) return S3Util.is_s3_path(*args, **kwargs)
def test_wait_for_file_availability(self): bucket = "cur_bucket" key = "stdout.txt" data = "no output" s3_full_path = "s3://{}/{}".format(bucket, key) s3_resource = Boto3Util.create_s3_resource() s3_resource.create_bucket(Bucket=bucket) def create_file(): s3_resource.Bucket(bucket).put_object(Key=key, Body=data) s3_util = S3Util(AWSCredentials("", "")) polling_interval = 0.02 timeout = 0.5 with ConcurrentExecutor(create_file, 0.2): s3_util.wait_for_file_availability(s3_full_path, polling_interval, timeout) s3_util.delete_object(s3_full_path) err_msg = "File {} failed to be available after {} seconds.".format( s3_full_path, timeout) with pytest.raises(M3DAWSAPIException, match=err_msg): s3_util.wait_for_file_availability(s3_full_path, polling_interval, timeout)
def test_delete_object(self): test_bucket_name = "test_bucket" test_key = "test_dir/test_key" s3_resource = Boto3Util.create_s3_resource() s3_resource.create_bucket(Bucket=test_bucket_name) s3_resource.Bucket(test_bucket_name).put_object(Key=test_key, Body="") s3_util = S3Util(AWSCredentials("", "")) s3_util.delete_object("s3://" + test_bucket_name + "/" + test_key) remaining_objects = list( s3_resource.Bucket(test_bucket_name).objects.all()) assert len(remaining_objects) == 0
def test_upload_object(self): test_bucket_name = "test_bucket" test_key = "test_s3_util/tconx-bdp-emr_test-dev-bi_test101.json" file_name = "test/resources/test_s3_util/tconx-bdp-emr_test-dev-bi_test101.json" s3_resource = Boto3Util.create_s3_resource() s3_resource.create_bucket(Bucket=test_bucket_name) s3_util = S3Util(AWSCredentials("", "")) s3_util.upload_object(file_name, "s3://" + test_bucket_name + "/" + test_key) s3_objects = list(s3_resource.Bucket(test_bucket_name).objects.all()) assert len(s3_objects) == 1 assert s3_objects[0].key == test_key
def create_s3_resource(aws_credentials=None): """ Initialize and return boto3 resource for S3. :param aws_credentials: AWS credentials. Empty values will be used if it is None. :return: initialized boto3 resource object for S3 """ if not aws_credentials: aws_credentials = AWSCredentials("", "") s3_resource = boto3.resource( "s3", aws_access_key_id=aws_credentials.access_key_id, aws_secret_access_key=aws_credentials.secret_access_key) return s3_resource
def create_emr_client(aws_region, aws_credentials=None): """ Initialize and return boto3 client for EMR. :param aws_region: AWS region :param aws_credentials: AWS credentials. Empty values will be used if it is None. :return: initialized boto3 client object for EMR """ if not aws_credentials: aws_credentials = AWSCredentials("", "") emr_client = boto3.client( 'emr', region_name=aws_region, aws_access_key_id=aws_credentials.access_key_id, aws_secret_access_key=aws_credentials.secret_access_key) return emr_client
def create_output_file(self, step, dummy_text): logging.info("step={{id: {}, state: {}}}".format(step.id, step.state)) step_id = step.id s3_log_file_location = "s3://{}/log/{}/steps/{}/stdout.gz" \ .format(self.default_log_bucket, self.emr_cluster_id, step_id) local_log_file_location = self.local_run_dir.join("stdout.gz") logging.info( "local_log_file_location={}".format(local_log_file_location)) logging.info("s3_log_file_location={}".format( str(s3_log_file_location))) with gzip.open(str(local_log_file_location), 'wb') as f: f.write(dummy_text.encode("utf-8")) s3_util = S3Util(AWSCredentials("", "")) s3_util.upload_object(str(local_log_file_location), str(s3_log_file_location))
def test_delete_objects(self): test_bucket_name = "test_bucket" test_prefix = "test_dir" test_keys = [ "test_key1", "{}/test_key2".format(test_prefix), "{}/test_key3".format(test_prefix), "{}/test_key4".format(test_prefix) ] s3_resource = Boto3Util.create_s3_resource() s3_resource.create_bucket(Bucket=test_bucket_name) for key in test_keys: s3_resource.Bucket(test_bucket_name).put_object(Key=key, Body="") s3_util = S3Util(AWSCredentials("", "")) s3_util.delete_objects("s3://" + test_bucket_name + "/" + test_prefix) remaining_objects = list( s3_resource.Bucket(test_bucket_name).objects.all()) assert len(remaining_objects) == 1 assert remaining_objects[0].key == test_keys[0]
def test_move_objects(self): test_src_bucket_name = "test_src_bucket" test_destination_bucket_name = "test_destination_bucket" test_src_prefix = "test_src_dir" test_destination_prefix = "test_destination_dir" test_src_keys = [ "test_key1", "{}/test_key2".format(test_src_prefix), "{}/test_key3".format(test_src_prefix), "{}/test_key4".format(test_src_prefix) ] test_destination_keys = [ "{}/test_key2".format(test_destination_prefix), "{}/test_key3".format(test_destination_prefix), "{}/test_key4".format(test_destination_prefix) ] s3_resource = Boto3Util.create_s3_resource() s3_resource.create_bucket(Bucket=test_src_bucket_name) s3_resource.create_bucket(Bucket=test_destination_bucket_name) for key in test_src_keys: s3_resource.Bucket(test_src_bucket_name).put_object(Key=key, Body="") s3_util = S3Util(AWSCredentials("", "")) s3_util.move_objects( ("s3://" + test_src_bucket_name + "/" + test_src_prefix), ("s3://" + test_destination_bucket_name + "/" + test_destination_prefix)) src_objects = list( s3_resource.Bucket(test_src_bucket_name).objects.all()) assert len(src_objects) == 1 assert src_objects[0].key == test_src_keys[0] destination_objects = s3_resource.Bucket( test_destination_bucket_name).objects.all() assert sorted(map(lambda x: x.key, destination_objects)) == test_destination_keys
def test_list_objects_in_bucket(self): test_bucket_name = "test_bucket" test_prefix = "test_dir" test_keys = [ "test_key1", "{}/test_key2".format(test_prefix), "{}/test_key3".format(test_prefix), "{}/test_key4".format(test_prefix) ] test_resources = [ "s3://{}/".format(test_bucket_name) + key for key in test_keys ] s3_resource = Boto3Util.create_s3_resource() s3_resource.create_bucket(Bucket=test_bucket_name) for key in test_keys: s3_resource.Bucket(test_bucket_name).put_object(Key=key, Body="") s3_util = S3Util(AWSCredentials("", "")) keys = s3_util.list_objects("s3://" + test_bucket_name + "/" + test_prefix) assert keys == test_resources[1:4]
    def test_parses_basic_attributes_from_system_config_file(self, _):
        """
        Test case checks that all relevant key-values are extracted from sconx
        file and assigned to correct member variables of EMRSystem object.
        """
        # Write three fake credential files (API / s3 put / s3 delete) into the
        # local run dir; their paths are substituted into the scon template.
        aws_api_credentials = AWSCredentials("fake_aws_api_access_key",
                                             "fake_aws_api_secret_key")
        aws_api_credentials_file = self.local_run_dir.join(
            "aws-credentials-emr-api.json")
        self.dump_aws_credentials(aws_api_credentials,
                                  str(aws_api_credentials_file))

        aws_s3_put_credentials = AWSCredentials("fake_aws_s3_put_access_key",
                                                "fake_aws_s3_put_secret_key")
        aws_s3_put_credentials_file = self.local_run_dir.join(
            "aws-credentials-emr-s3_put.json")
        self.dump_aws_credentials(aws_s3_put_credentials,
                                  str(aws_s3_put_credentials_file))

        aws_s3_del_credentials = AWSCredentials("fake_aws_s3_del_access_key",
                                                "fake_aws_s3_del_secret_key")
        aws_s3_del_credentials_file = self.local_run_dir.join(
            "aws-credentials-emr-s3_del.json")
        self.dump_aws_credentials(aws_s3_del_credentials,
                                  str(aws_s3_del_credentials_file))

        # Materialize the system config (scon) file and point the mocked
        # config service at it before constructing the EMRSystem.
        test_scon_json = TestEMRSystem.test_scon_json_template.format(
            aws_api_credentials=str(aws_api_credentials_file),
            aws_s3_put_credentials=str(aws_s3_put_credentials_file),
            aws_s3_del_credentials=str(aws_s3_del_credentials_file))
        s3_scon_file = self.local_run_dir.join("scon-emr-emr-test.json")
        s3_scon_file.write(test_scon_json)

        MockConfigService.scon_path = str(s3_scon_file)

        emr_system = EMRSystem(*self.test_emr_system_arguments)

        # Every attribute the scon file should populate, with its expected value.
        expected_system_params = {
            "bucket_landing": "m3d-da-bdp-test-landing",
            "bucket_lake": "m3d-da-bdp-test-lake",
            "bucket_mart_cal": "m3d-da-bdp-test-mart-cal",
            "bucket_log": "io.3stripes.factory.test.ireland.infrastructure-logs",
            "default_ebs_size": "128",
            "default_emr_version": "emr-5.17.0",
            "aws_api_credentials": aws_api_credentials,
            "aws_s3_put_credentials": aws_s3_put_credentials,
            "aws_s3_del_credentials": aws_s3_del_credentials,
            "api_action_timeout_seconds": 120,
            "api_action_polling_interval_seconds": 3,
            "api_long_timeout_seconds": 300,
            "aws_region": "eu-west-1",
            "packages_to_deploy": ["hadoop"],
            "configs_to_deploy": ["test_config_1", "test_config_2"],
            "subdir_archive": "test_archive/",
            "subdir_header": "test_header/",
            "subdir_config": "test_config/",
            "subdir_data": "test_data/",
            "subdir_delta_table": "delta_table/",
            "subdir_data_backup": "data_backup/",
            "subdir_error": "test_error/",
            "subdir_work": "test_work/",
            "subdir_log": "test_log/",
            "subdir_apps": "test_apps/",
            "subdir_m3d_engine": "test_m3d_engine/",
            "subdir_loading": "test_loading/",
            "subdir_full_load": "test_full_load/",
            "subdir_delta_load": "test_delta_load/",
            "subdir_delta_lake_load": "test_delta_lake_load/",
            "subdir_append_load": "test_append_load/",
            "subdir_black_whole": "test_black_whole/",
            "subdir_credentials": "test_credentials/",
            "subdir_keytab": "test_keytab/",
            "subdir_tmp": "test_tmp/",
            "subdir_code": "m3d",
            "subdir_metadata": "metadata",
            "spark_jar_name": "test_jar.jar",
            "dir_apps": "s3://m3d-da-landing-application/m3d-test/test_environment/test_apps/",
            "dir_apps_algorithm": "s3://m3d-da-landing-application/m3d-test/"
                                  "test_environment/test_apps/test_m3d_engine/",
            "dir_apps_loading": "s3://m3d-da-landing-application/m3d-test/test_environment/"
                                "test_apps/test_loading/",
            "dir_tmp_s3": "s3://m3d-da-landing-application/m3d-test/test_environment/test_tmp/",
            "dir_tmp_local": "/test_tmp/",
            "spark_jar_path": "s3://m3d-da-landing-application/m3d-test/test_environment/m3d/" +
                              "test_subdir_projects_m3d_api/test_jar.jar",
            "dir_m3d_api_deployment":
                "s3://m3d-da-landing-application/m3d-test/test_environment/m3d/test_subdir_projects_m3d_api",
            "dir_metadata_deployment":
                "s3://m3d-da-landing-application/m3d-test/test_environment/metadata/test_subdir_projects_m3d_api"
        }

        # Compare each expected attribute against the constructed EMRSystem.
        for param in expected_system_params.keys():
            assert getattr(emr_system, param) == expected_system_params[param]
class TestEMRClusterClient(UnitTestBase):
    """Tests for EMRClusterClient against moto-mocked EMR and S3 backends."""

    # Shared fixture parameters; timeouts are deliberately short so the
    # polling/timeout tests run quickly.
    emr_cluster_name = "test_cluster"
    aws_region = "us-east-1"
    aws_credentials = AWSCredentials("test_access_key", "test_secret_key")

    timeout_seconds = 0.5
    retry_seconds = 0.1
    long_timeout_seconds = 3.0

    @staticmethod
    def env_setup(emr_cluster_name, aws_region, aws_credentials, timeout_ms,
                  retry_ms, long_timeout_ms):
        """
        Launch a mocked EMR job flow and return (EMRClusterClient, cluster id).

        Must be called inside an active moto EMR mock.
        """
        run_job_flow_args = dict(Instances={
            'InstanceCount': 3,
            'KeepJobFlowAliveWhenNoSteps': True,
            'MasterInstanceType': 'c3.medium',
            'Placement': {
                'AvailabilityZone': 'test_zone'
            },
            'SlaveInstanceType': 'c3.xlarge',
        },
                                 JobFlowRole='EMR_EC2_DefaultRole',
                                 LogUri='s3://mybucket/log/',
                                 Name=emr_cluster_name,
                                 ServiceRole='EMR_DefaultRole',
                                 VisibleToAllUsers=True)

        emr_client = Boto3Util.create_emr_client(aws_region)
        emr_cluster_id = emr_client.run_job_flow(
            **run_job_flow_args)['JobFlowId']

        emr_cluster_client = EMRClusterClient(emr_cluster_id, aws_region,
                                              aws_credentials, timeout_ms,
                                              retry_ms, long_timeout_ms)

        return emr_cluster_client, emr_cluster_id

    @staticmethod
    def _compress_string(s):
        """Gzip-compress *s* (encoded with the default codec) and return the bytes."""
        out = io.BytesIO()
        with gzip.GzipFile(fileobj=out, mode="w") as gzip_s:
            gzip_s.write(s.encode())
        compressed_str = out.getvalue()
        return compressed_str

    @pytest.mark.emr
    @moto.mock_emr
    def test_get_cluster_state(self):
        # A freshly mocked cluster reports WAITING.
        emr_cluster_client, _ = self.env_setup(self.emr_cluster_name,
                                               self.aws_region,
                                               self.aws_credentials,
                                               self.timeout_seconds,
                                               self.retry_seconds,
                                               self.long_timeout_seconds)
        cluster_state = emr_cluster_client.get_cluster_state()
        assert cluster_state == "WAITING"

    @pytest.mark.emr
    @moto.mock_emr
    def test_wait_for_cluster_startup(self):
        # Startup wait should return the WAITING state without timing out.
        emr_cluster_client, _ = self.env_setup(self.emr_cluster_name,
                                               self.aws_region,
                                               self.aws_credentials,
                                               self.timeout_seconds,
                                               self.retry_seconds,
                                               self.long_timeout_seconds)
        cluster_state = emr_cluster_client.wait_for_cluster_startup()
        assert cluster_state == "WAITING"

    @pytest.mark.emr
    @moto.mock_emr
    def test_wait_for_bootstrapping_cluster(self):
        # Uses an explicit mock object so the fake backend can be mutated.
        mock_emr_obj = moto.mock_emr()
        with mock_emr_obj:
            emr_cluster_client, emr_cluster_id = self.env_setup(
                self.emr_cluster_name, self.aws_region, self.aws_credentials,
                self.timeout_seconds, self.retry_seconds,
                self.long_timeout_seconds)

            # Change step state to BOOTSTRAPPING so that wait times out
            emr_backend = mock_emr_obj.backends[self.aws_region]
            fake_cluster = emr_backend.clusters[emr_cluster_id]
            fake_cluster.state = "BOOTSTRAPPING"

            err_msg = "Cluster {} failed to start after {} seconds.".format(
                emr_cluster_id, self.timeout_seconds)
            with pytest.raises(M3DAWSAPIException, match=err_msg):
                emr_cluster_client.wait_for_cluster_startup()

    @pytest.mark.emr
    @moto.mock_emr
    def test_add_step(self):
        # Expected response is of the format s-XXXXXXXXXXXXX
        emr_cluster_client, _ = self.env_setup(self.emr_cluster_name,
                                               self.aws_region,
                                               self.aws_credentials,
                                               self.timeout_seconds,
                                               self.retry_seconds,
                                               self.long_timeout_seconds)

        step_name = "Test_Step"
        command_str = "/usr/bin/spark-submit --class spark.job.main.class"

        emr_step_id = emr_cluster_client.add_step(step_name, command_str)

        assert str(emr_step_id).startswith("s-")
        assert len(emr_step_id) == 15

    @pytest.mark.emr
    @moto.mock_emr
    def test_get_step_status(self):
        # A newly added step reports STARTING with no failure details.
        emr_cluster_client, _ = self.env_setup(self.emr_cluster_name,
                                               self.aws_region,
                                               self.aws_credentials,
                                               self.timeout_seconds,
                                               self.retry_seconds,
                                               self.long_timeout_seconds)

        step_name = "Test_Step"
        command_str = "/usr/bin/spark-submit --class spark.job.main.class"

        emr_step_id = emr_cluster_client.add_step(step_name, command_str)

        emr_step_status, emr_step_failure_details = emr_cluster_client.get_step_status(
            emr_step_id)

        assert str(emr_step_id).startswith("s-")
        assert emr_step_status == "STARTING"
        assert emr_step_failure_details is None

    @pytest.mark.emr
    @moto.mock_emr
    def test_wait_for_step_completion_without_state_change(self):
        with pytest.raises(M3DAWSAPIException):
            # In this test we expect exception because the state of the step will be STARTING
            emr_cluster_client, _ = self.env_setup(self.emr_cluster_name,
                                                   self.aws_region,
                                                   self.aws_credentials,
                                                   self.timeout_seconds,
                                                   self.retry_seconds,
                                                   self.long_timeout_seconds)

            step_name = "Test_Step"
            command_str = "/usr/bin/spark-submit --class spark.job.main.class"

            emr_step_id = emr_cluster_client.add_step(step_name, command_str)

            emr_cluster_client.wait_for_step_completion(
                emr_step_id, self.long_timeout_seconds)

    @pytest.mark.emr
    def test_add_step_to_cluster_with_state_change(self):
        mock_emr_obj = moto.mock_emr()
        with mock_emr_obj:
            emr_cluster_client, emr_cluster_id = self.env_setup(
                self.emr_cluster_name, self.aws_region, self.aws_credentials,
                self.timeout_seconds, self.retry_seconds,
                self.long_timeout_seconds)

            step_name = "Test_Step"
            command_str = "/usr/bin/spark-submit --class spark.job.main.class"

            emr_step_id = emr_cluster_client.add_step(step_name, command_str)
            logging.info(str(emr_step_id))

            cluster_steps = emr_cluster_client.get_list_of_steps()
            assert 1 == len(cluster_steps)
            assert cluster_steps[0] == emr_step_id

            emr_step_status, _ = emr_cluster_client.get_step_status(
                emr_step_id)
            assert emr_step_status == "STARTING"

            # "STARTING" is not a valid EMR Step state, so we will change it to "RUNNING"
            emr_backend = mock_emr_obj.backends[self.aws_region]
            fake_cluster = emr_backend.clusters[emr_cluster_id]
            fake_cluster.steps[0].state = "RUNNING"

            def complete_step():
                # Flip the fake step to COMPLETED while the client is polling.
                fake_cluster.steps[0].state = "COMPLETED"

            # Wait for some time to let EMRClusterClient poll a few times.
            with ConcurrentExecutor(complete_step, 0.2):
                emr_cluster_client.wait_for_step_completion(
                    emr_step_id, self.long_timeout_seconds)

    @pytest.mark.emr
    @moto.mock_s3
    def test_add_step_to_cluster_fail_without_output(self):
        mock_emr_obj = moto.mock_emr()
        with mock_emr_obj:
            emr_cluster_client, emr_cluster_id = self.env_setup(
                self.emr_cluster_name, self.aws_region, self.aws_credentials,
                self.timeout_seconds, self.retry_seconds,
                self.long_timeout_seconds)

            s3_resource = Boto3Util.create_s3_resource()
            s3_resource.create_bucket(Bucket="mybucket")

            step_name = "Test_Step"
            command_str = "/usr/bin/spark-submit --class spark.job.main.class"

            emr_step_id = emr_cluster_client.add_step(step_name, command_str)

            cluster_steps = emr_cluster_client.get_list_of_steps()
            assert 1 == len(cluster_steps)
            assert cluster_steps[0] == emr_step_id

            emr_step_status, _ = emr_cluster_client.get_step_status(
                emr_step_id)
            assert emr_step_status == "STARTING"

            # "STARTING" is not a valid EMR Step state, so we will change it to "RUNNING"
            emr_backend = mock_emr_obj.backends[self.aws_region]
            fake_cluster = emr_backend.clusters[emr_cluster_id]
            fake_step = fake_cluster.steps[0]
            fake_step.state = "RUNNING"

            def fail_step():
                # Step fails, but no stderr.gz is ever written to S3.
                fake_step.state = "FAILED"

            # Make sure that we do not wait for 300 seconds for gz file to be available.
            EMRClusterClient.AWSConstants.S3_FILE_AVAILABILITY_TIMEOUT_SECONDS = self.timeout_seconds

            # Required for correct log path generation in MockedMethod.
            MockedMethod.emr_cluster_id = emr_cluster_id

            stderr_gz_path = MockedMethod.log_file_template.format(
                emr_cluster_id=emr_cluster_id, emr_step_id=emr_step_id)

            err_msg = "File {} failed to be available after {} seconds.".\
                format(stderr_gz_path, self.timeout_seconds)
            with pytest.raises(M3DAWSAPIException, match=err_msg):
                # Wait for some time to let EMRClusterClient poll a few times.
                with ConcurrentExecutor(fail_step, 0.4):
                    with patch(
                            "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient.get_step_status",
                            side_effect=MockedMethod.get_step_status_mocked):
                        emr_cluster_client.wait_for_step_completion(
                            emr_step_id, self.long_timeout_seconds)

    @pytest.mark.emr
    @moto.mock_s3
    def test_add_step_to_cluster_fail_with_output(self):
        mock_emr_obj = moto.mock_emr()
        with mock_emr_obj:
            emr_cluster_client, emr_cluster_id = self.env_setup(
                self.emr_cluster_name, self.aws_region, self.aws_credentials,
                self.timeout_seconds, self.retry_seconds,
                self.long_timeout_seconds)

            s3_resource = Boto3Util.create_s3_resource()
            s3_resource.create_bucket(Bucket="mybucket")

            step_name = "Test_Step"
            command_str = "/usr/bin/spark-submit --class spark.job.main.class"

            emr_step_id = emr_cluster_client.add_step(step_name, command_str)

            cluster_steps = emr_cluster_client.get_list_of_steps()
            assert 1 == len(cluster_steps)
            assert cluster_steps[0] == emr_step_id

            emr_step_status, _ = emr_cluster_client.get_step_status(
                emr_step_id)
            assert emr_step_status == "STARTING"

            # "STARTING" is not a valid EMR Step state, so we will change it to "RUNNING"
            emr_backend = mock_emr_obj.backends[self.aws_region]
            fake_cluster = emr_backend.clusters[emr_cluster_id]
            fake_step = fake_cluster.steps[0]
            fake_step.state = "RUNNING"

            # Make sure that we do not wait for 300 seconds for gz file to be available.
            EMRClusterClient.AWSConstants.S3_FILE_AVAILABILITY_TIMEOUT_SECONDS = self.timeout_seconds

            # Required for correct log path generation in MockedMethod.
            MockedMethod.emr_cluster_id = emr_cluster_id

            stderr_gz_path = MockedMethod.log_file_template.format(
                emr_cluster_id=emr_cluster_id, emr_step_id=emr_step_id)

            expected_content = "Lots of content here!!!"

            def fail_step_and_write_output():
                # Fail the step, then (after a short delay) publish the gzipped
                # stderr output so the client can read the failure details.
                fake_step.state = "FAILED"
                time.sleep(0.3)
                compressed_content = TestEMRClusterClient._compress_string(
                    expected_content)
                bucket, key = emr_cluster_client.s3_util.get_bucket_and_key(
                    stderr_gz_path)
                s3_resource.Bucket(bucket).put_object(Key=key,
                                                      Body=compressed_content)

            with pytest.raises(M3DAWSAPIException) as exc:
                # Wait for some time to let EMRClusterClient poll a few times.
                with ConcurrentExecutor(fail_step_and_write_output, 0.3):
                    with patch(
                            "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient.get_step_status",
                            side_effect=MockedMethod.get_step_status_mocked):
                        emr_cluster_client.wait_for_step_completion(
                            emr_step_id, self.long_timeout_seconds)

            err_msg = "EMR Step with cluster_id='{}' and step_id='{}' failed to complete".\
                format(emr_cluster_id, emr_step_id)

            # The exception must reference the step and the stderr log path,
            # and the uploaded content must be readable back from S3.
            assert err_msg in str(exc.value)
            assert stderr_gz_path in str(exc.value)

            resulting_content = emr_cluster_client.s3_util.read_gzip_file_content(
                stderr_gz_path)

            assert expected_content == resulting_content

    @pytest.mark.emr
    @moto.mock_emr
    def test_wait_for_spark_step_completion(self):
        with pytest.raises(M3DAWSAPIException):
            # In this test we expect exception because the state of the step will be starting
            emr_cluster_client, _ = self.env_setup(self.emr_cluster_name,
                                                   self.aws_region,
                                                   self.aws_credentials,
                                                   self.timeout_seconds,
                                                   self.retry_seconds,
                                                   self.long_timeout_seconds)

            step_name = "Test_Step"
            command_str = "/usr/bin/spark-submit --class spark.job.main.class"

            emr_step_id = emr_cluster_client.add_step(step_name, command_str)

            emr_cluster_client.wait_for_spark_step_completion(emr_step_id)

    @pytest.mark.emr
    @moto.mock_emr
    def test_get_list_of_steps(self):
        emr_cluster_client, _ = self.env_setup(self.emr_cluster_name,
                                               self.aws_region,
                                               self.aws_credentials,
                                               self.timeout_seconds,
                                               self.retry_seconds,
                                               self.long_timeout_seconds)

        step_name = "Test_Step_{}"
        command_str_0 = "/usr/bin/spark-submit --class spark.job.main.class"
        command_str_1 = "spark-submit --class totally.different.main.class config.json"
        command_str_2 = "hive --silent -f s3://app.bucket/path/to/query.hql"

        emr_step_id_0 = emr_cluster_client.add_step(step_name.format(0),
                                                    command_str_0)
        emr_step_id_1 = emr_cluster_client.add_step(step_name.format(1),
                                                    command_str_1)
        emr_step_id_2 = emr_cluster_client.add_step(step_name.format(2),
                                                    command_str_2)

        step_ids = emr_cluster_client.get_list_of_steps()

        assert len(step_ids) == 3

        # Because of a bug in moto step ids are returned in the same order as submission.
        # In reality the order should be reversed. So no because of this we have to compare with reverse order.
        # ToDo: Change comparison order once bug in moto is fixed, https://github.com/spulec/moto/issues/1866
        assert step_ids[2] == emr_step_id_0
        assert step_ids[1] == emr_step_id_1
        assert step_ids[0] == emr_step_id_2

    @pytest.mark.emr
    def test_get_step_output_path(self):
        mock_emr_obj = moto.mock_emr()
        with mock_emr_obj:
            emr_cluster_client, emr_cluster_id = self.env_setup(
                self.emr_cluster_name, self.aws_region, self.aws_credentials,
                self.timeout_seconds, self.retry_seconds,
                self.long_timeout_seconds)

            step_name = "Test_Step"
            command_str = "/usr/bin/spark-submit --class spark.job.main.class"

            emr_step_id = emr_cluster_client.add_step(step_name, command_str)

            # Change step state to COMPLETED
            emr_backend = mock_emr_obj.backends[self.aws_region]
            fake_cluster = emr_backend.clusters[emr_cluster_id]
            fake_cluster.steps[0].state = "COMPLETED"

            emr_cluster_client.wait_for_step_completion(
                emr_step_id, self.long_timeout_seconds)

            output_file = emr_cluster_client.get_step_output_path(emr_step_id)

            expected_output_file = "s3://mybucket/log/{}/steps/{}/stdout.gz".format(
                emr_cluster_client.emr_cluster_id, emr_step_id)
            assert output_file == expected_output_file

    @pytest.mark.emr
    @moto.mock_emr
    def test_get_step_err_path(self):
        emr_cluster_client, emr_cluster_id = self.env_setup(
            self.emr_cluster_name, self.aws_region, self.aws_credentials,
            self.timeout_seconds, self.retry_seconds,
            self.long_timeout_seconds)

        step_name = "Test_Step"
        command_str = "/usr/bin/spark-submit --class spark.job.main.class"

        emr_step_id = emr_cluster_client.add_step(step_name, command_str)

        # Patch get_step_status so the mocked log path generation is used.
        MockedMethod.emr_cluster_id = emr_cluster_id

        with patch(
                "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient.get_step_status",
                side_effect=MockedMethod.get_step_status_mocked):
            err_file = emr_cluster_client.get_step_err_path(emr_step_id)

        expected_err_file = "s3://mybucket/log/{}/steps/{}/stderr.gz".format(
            emr_cluster_client.emr_cluster_id, emr_step_id)
        assert err_file == expected_err_file
class TestAWSS3CredentialsWrapper(object): aws_credentials_api = AWSCredentials("access_key_id-api", "secret_access_key-api") aws_credentials_put = AWSCredentials("access_key_id-put", "secret_access_key-put") aws_credentials_del = AWSCredentials("access_key_id-del", "secret_access_key-del") ERROR_METHOD_NOT_SUPPORTED_TEMPLATE = \ "The following method is not supported for AWSFactoryS3Wrapper: {attr}()" ERROR_UNKNOWN_ATTRIBUTE_TEMPLATE = \ "{attr} is not an attribute of S3Util" ERROR_UNMANAGED_BUCKET_TEMPLATE = \ "AWSFactoryS3Wrapper.{attr}() has been called with arguments pointing to S3 buckets which are " \ "not managed by AWSFactoryS3Wrapper: {buckets}" ERROR_CROSS_BUCKET_ACCESS_TEMPLATE = \ "AWSFactoryS3Wrapper.{attr}() has been called with arguments pointing to both" \ " data and applications S3 buckets, this is not supported." @staticmethod def _create_aws_factory_s3_util(application_buckets, data_buckets): s3_util = AWSS3CredentialsWrapper.__new__(AWSS3CredentialsWrapper) s3_util.app_buckets = application_buckets s3_util.data_buckets = data_buckets s3_util.s3_util_api = FakeS3Util( TestAWSS3CredentialsWrapper.aws_credentials_api) s3_util.s3_util_put = FakeS3Util( TestAWSS3CredentialsWrapper.aws_credentials_put) s3_util.s3_util_del = FakeS3Util( TestAWSS3CredentialsWrapper.aws_credentials_del) return s3_util @staticmethod def _reset(): FakeS3Util.calls = [] def setup_method(self, _): TestAWSS3CredentialsWrapper._reset() @staticmethod def _assert_calls_formatted(call_0, call_1): not_equal_msg = "<{}> is not equal to <{}>".format(call_0, call_1) assert call_0 == call_1, not_equal_msg @pytest.mark.emr def test_upload_object(self): s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util( ["app"], ["data"]) # case 1: point to application bucket s3_util.upload_object("local/path", "s3://app/s3/path") self._assert_calls_formatted( FakeS3Util.calls[0], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api, "upload_object", ("local/path", 
"s3://app/s3/path"), {})) # case 2: point to data bucket s3_util.upload_object("local/path", "s3://data/s3/path") self._assert_calls_formatted( FakeS3Util.calls[1], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_put, "upload_object", ("local/path", "s3://data/s3/path"), {})) # case 3: point to un-managed bucket err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="upload_object", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.upload_object("local/path", "s3://unknown/s3/path") @pytest.mark.emr def test_upload_child_objects(self): s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util( ["app", "log"], ["landing", "lake"]) # case 1: point to application bucket s3_util.upload_child_objects("local/path", "s3://app/s3/path", recursive=True) self._assert_calls_formatted( FakeS3Util.calls[0], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api, "upload_child_objects", ("local/path", "s3://app/s3/path"), {"recursive": True})) # case 2: point to data bucket s3_util.upload_child_objects("local/path", "s3://lake/s3/path", fn_pattern="*.py") self._assert_calls_formatted( FakeS3Util.calls[1], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_put, "upload_child_objects", ("local/path", "s3://lake/s3/path"), {"fn_pattern": "*.py"})) # case 3: point to un-managed bucket err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="upload_child_objects", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.upload_child_objects("local/path", "s3://unknown/s3/path") @pytest.mark.emr def test_delete_object(self): s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util( ["app", "log"], ["landing", "lake"]) # case 1: point to application bucket s3_util.delete_object("s3://log/s3/path/some.log") self._assert_calls_formatted( FakeS3Util.calls[0], 
Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api, "delete_object", ("s3://log/s3/path/some.log", ), {})) # case 2: point to data bucket s3_util.delete_object("s3://landing/s3/path/some.log") self._assert_calls_formatted( FakeS3Util.calls[1], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_del, "delete_object", ("s3://landing/s3/path/some.log", ), {})) # case 3: point to un-managed bucket err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="delete_object", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.delete_object("s3://unknown/s3/path/some.log") @pytest.mark.emr def test_delete_objects(self): s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util( ["app", "log"], ["landing", "lake"]) # case 1: point to application bucket s3_util.delete_objects("s3://log/s3/path/") self._assert_calls_formatted( FakeS3Util.calls[0], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api, "delete_objects", ("s3://log/s3/path/", ), {})) # case 2: point to data bucket s3_util.delete_objects("s3://landing/s3/path/") self._assert_calls_formatted( FakeS3Util.calls[1], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_del, "delete_objects", ("s3://landing/s3/path/", ), {})) # case 3: point to un-managed bucket err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="delete_objects", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.delete_objects("s3://unknown/s3/path/") @pytest.mark.emr def test_delete_child_objects(self): s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util( ["app", "log"], ["landing", "lake"]) # case 1: point to application bucket s3_util.delete_child_objects("s3://log/s3/path") self._assert_calls_formatted( FakeS3Util.calls[0], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api, "delete_child_objects", 
("s3://log/s3/path", ), {})) # case 2: point to data bucket s3_util.delete_child_objects("s3://landing/s3/path") self._assert_calls_formatted( FakeS3Util.calls[1], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_del, "delete_child_objects", ("s3://landing/s3/path", ), {})) # case 3: point to un-managed bucket err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="delete_child_objects", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.delete_child_objects("s3://unknown/s3/path/") @pytest.mark.emr def test_list_objects(self): s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util( ["app", "log"], ["landing", "lake"]) # case 1: point to application bucket s3_util.list_objects("s3://log/s3/path/") self._assert_calls_formatted( FakeS3Util.calls[0], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api, "list_objects", ("s3://log/s3/path/", ), {})) # case 2: point to data bucket s3_util.list_objects("s3://landing/s3/path/") self._assert_calls_formatted( FakeS3Util.calls[1], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_put, "list_objects", ("s3://landing/s3/path/", ), {})) # case 3: point to un-managed bucket err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="list_objects", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.list_objects("s3://unknown/s3/path/") @pytest.mark.emr def test_list_child_objects(self): s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util( ["app", "log"], ["landing", "lake"]) # case 1: point to application bucket s3_util.list_child_objects("s3://app/s3/path/") self._assert_calls_formatted( FakeS3Util.calls[0], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api, "list_child_objects", ("s3://app/s3/path/", ), {})) # case 2: point to data bucket 
s3_util.list_child_objects("s3://landing/s3/path/") self._assert_calls_formatted( FakeS3Util.calls[1], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_put, "list_child_objects", ("s3://landing/s3/path/", ), {})) # case 3: point to un-managed bucket err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="list_child_objects", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.list_child_objects("s3://unknown/s3/path/") @pytest.mark.emr def test_move_object(self): s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util( ["app", "log"], ["landing", "lake"]) # case 1: point to application buckets s3_util.move_object("s3://app/s3/path/some.obj", destination_s3_path="s3://log/s3/path/some.obj") self._assert_calls_formatted( FakeS3Util.calls[0], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api, "move_object", ("s3://app/s3/path/some.obj", ), {"destination_s3_path": "s3://log/s3/path/some.obj"})) # case 2: point to data buckets err_msg = TestAWSS3CredentialsWrapper.ERROR_METHOD_NOT_SUPPORTED_TEMPLATE.format( attr="move_object") err_msg = re.escape(err_msg) with pytest.raises(AttributeError, match=err_msg): s3_util.move_object( src_s3_path="s3://landing/s3/path/some.obj", destination_s3_path="s3://lake/s3/path/some.obj") # case 3: point to un-managed bucket err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="move_object", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.move_object("s3://unknown/d1/o.obj", "s3://unknown/d2/o.obj") # case 4: try to copy from app to landing err_msg = TestAWSS3CredentialsWrapper.ERROR_CROSS_BUCKET_ACCESS_TEMPLATE.format( attr="move_object") err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.move_object("s3://app/d/o.obj", "s3://landing/d/o.obj") # case 5: try to copy 
from app to unmanaged err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="move_object", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.move_object("s3://app/d1/o.obj", "s3://unknown/d2/o.obj") # case 6: try to copy from landing to unmanaged err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="move_object", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.move_object("s3://landing/d1/o.obj", "s3://unknown/d2/o.obj") @pytest.mark.emr def test_move_objects(self): s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util( ["app", "log"], ["landing", "lake"]) # case 1: point to application buckets s3_util.move_objects("s3://app/s3/path/", destination_s3_prefix_path="s3://log/s3/path/") self._assert_calls_formatted( FakeS3Util.calls[0], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api, "move_objects", ("s3://app/s3/path/", ), {"destination_s3_prefix_path": "s3://log/s3/path/"})) # case 2: point to data buckets err_msg = TestAWSS3CredentialsWrapper.ERROR_METHOD_NOT_SUPPORTED_TEMPLATE.format( attr="move_objects") err_msg = re.escape(err_msg) with pytest.raises(AttributeError, match=err_msg): s3_util.move_objects("s3://landing/s3/path/", "s3://lake/s3/path/") # case 3: point to un-managed bucket err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="move_objects", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.move_objects("s3://unknown/d1/o.obj", "s3://unknown/d2/o.obj") # case 4: try to copy from app to landing err_msg = TestAWSS3CredentialsWrapper.ERROR_CROSS_BUCKET_ACCESS_TEMPLATE.format( attr="move_objects") err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.move_objects("s3://app/d/", 
"s3://landing/d/") # case 5: try to copy from app to unmanaged err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="move_objects", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.move_objects("s3://app/d1/o.obj", "s3://unknown/d2/o.obj") # case 6: try to copy from landing to unmanaged err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="move_objects", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.move_objects("s3://landing/d1/o.obj", "s3://unknown/d2/o.obj") @pytest.mark.emr def test_move_child_objects(self): s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util( ["app", "log"], ["landing", "lake"]) # case 1: point to application buckets s3_util.move_child_objects("s3://app/s3/path/", "s3://log/s3/path/") self._assert_calls_formatted( FakeS3Util.calls[0], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api, "move_child_objects", ("s3://app/s3/path/", "s3://log/s3/path/"), {})) # case 2: point to data buckets err_msg = TestAWSS3CredentialsWrapper.ERROR_METHOD_NOT_SUPPORTED_TEMPLATE.format( attr="move_child_objects") err_msg = re.escape(err_msg) with pytest.raises(AttributeError, match=err_msg): s3_util.move_child_objects( src_s3_prefix_path="s3://landing/s3/path/", destination_s3_prefix_path="s3://lake/s3/path/") # case 3: point to un-managed bucket err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="move_child_objects", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.move_child_objects("s3://unknown/d1/o.obj", "s3://unknown/d2/o.obj") # case 4: try to copy from app to landing err_msg = TestAWSS3CredentialsWrapper.ERROR_CROSS_BUCKET_ACCESS_TEMPLATE.format( attr="move_child_objects") err_msg = re.escape(err_msg) with 
pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.move_child_objects("s3://app/d/", "s3://landing/d/") # case 5: try to copy from app to unmanaged err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="move_child_objects", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.move_child_objects("s3://app/d1/o.obj", "s3://unknown/d2/o.obj") # case 6: try to copy from landing to unmanaged err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="move_child_objects", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.move_child_objects("s3://landing/d1/o.obj", "s3://unknown/d2/o.obj") @pytest.mark.emr def test_object_exists(self): s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util( ["app", "log"], ["landing", "lake"]) # case 1: point to application bucket s3_util.object_exists("s3://app/s3/path/o.obj") self._assert_calls_formatted( FakeS3Util.calls[0], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api, "object_exists", ("s3://app/s3/path/o.obj", ), {})) # case 2: point to data bucket s3_util.object_exists(s3_path="s3://landing/s3/path/o.obj") self._assert_calls_formatted( FakeS3Util.calls[1], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_put, "object_exists", (), {"s3_path": "s3://landing/s3/path/o.obj"})) # case 3: point to un-managed bucket err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="object_exists", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.object_exists("s3://unknown/s3/path/o.obj") @pytest.mark.emr def test_read_gzip_file_content(self): s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util( ["app", "log"], ["landing", "lake"]) # case 1: point to application bucket 
s3_util.read_gzip_file_content("s3://app/s3/path/o.obj") self._assert_calls_formatted( FakeS3Util.calls[0], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api, "read_gzip_file_content", ("s3://app/s3/path/o.obj", ), {})) # case 2: point to data bucket err_msg = TestAWSS3CredentialsWrapper.ERROR_METHOD_NOT_SUPPORTED_TEMPLATE.format( attr="read_gzip_file_content") err_msg = re.escape(err_msg) with pytest.raises(AttributeError, match=err_msg): s3_util.read_gzip_file_content( s3_path="s3://landing/s3/path/o.obj") # case 3: point to un-managed bucket err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="read_gzip_file_content", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.read_gzip_file_content("s3://unknown/s3/path/o.obj") @pytest.mark.emr def test_get_bucket_and_key(self): s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util( ["app", "log"], ["landing", "lake"]) # case 1: point to application bucket s3_util.get_bucket_and_key("s3://app/s3/path/o.obj") self._assert_calls_formatted( FakeS3Util.calls[0], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api, "get_bucket_and_key", ("s3://app/s3/path/o.obj", ), {})) # case 2: point to data bucket s3_util.get_bucket_and_key(object_key="s3://landing/s3/path/o.obj") self._assert_calls_formatted( FakeS3Util.calls[1], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_put, "get_bucket_and_key", (), {"object_key": "s3://landing/s3/path/o.obj"})) # case 3: point to un-managed bucket err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="get_bucket_and_key", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.get_bucket_and_key("s3://unknown/s3/path/o.obj") @pytest.mark.emr def test_wait_for_file_availability(self): s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util( ["app", "log"], 
["landing", "lake"]) # case 1: point to application bucket s3_util.wait_for_file_availability( s3_file_location="s3://app/s3/path/o.obj", polling_interval_seconds=3, timeout_seconds=10) self._assert_calls_formatted( FakeS3Util.calls[0], Invocation( TestAWSS3CredentialsWrapper.aws_credentials_api, "wait_for_file_availability", (), { "s3_file_location": "s3://app/s3/path/o.obj", "polling_interval_seconds": 3, "timeout_seconds": 10 })) # case 2: point to data bucket s3_util.wait_for_file_availability("s3://landing/s3/path/o.obj", 3000, 10000) self._assert_calls_formatted( FakeS3Util.calls[1], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_put, "wait_for_file_availability", ("s3://landing/s3/path/o.obj", 3000, 10000), {})) # case 3: point to un-managed bucket err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="wait_for_file_availability", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.wait_for_file_availability("s3://unknown/s3/path/o.obj") @pytest.mark.emr def test_normalize_s3_path(self): s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util( ["app", "log"], ["landing", "lake"]) # case 1: point to application bucket s3_util.normalize_s3_path("s3a://app/s3/path/o.obj") self._assert_calls_formatted( FakeS3Util.calls[0], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_api, "normalize_s3_path", ("s3a://app/s3/path/o.obj", ), {})) # case 2: point to data bucket s3_util.normalize_s3_path(s3_path="s3n://landing/s3/path/o.obj") self._assert_calls_formatted( FakeS3Util.calls[1], Invocation(TestAWSS3CredentialsWrapper.aws_credentials_put, "normalize_s3_path", (), {"s3_path": "s3n://landing/s3/path/o.obj"})) # case 3: point to un-managed bucket err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="normalize_s3_path", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, 
match=err_msg): s3_util.normalize_s3_path("s3://unknown/s3/path/o.obj") @pytest.mark.emr def test_is_s3_path(self): s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util( ["app", "log"], ["landing", "lake"]) # case 1: point to application bucket assert s3_util.is_s3_path("s3a://app/s3/path/o.obj") is True self._assert_calls_formatted( FakeS3Util.calls[0], Invocation(AWSCredentials("", ""), "is_s3_path", ("s3a://app/s3/path/o.obj", ), {})) # case 2: point to data bucket assert s3_util.is_s3_path(obj="/landing/s3/path/o.obj") is False self._assert_calls_formatted( FakeS3Util.calls[1], Invocation(AWSCredentials("", ""), "is_s3_path", (), {"obj": "/landing/s3/path/o.obj"})) # case 3: point to un-managed bucket err_msg = TestAWSS3CredentialsWrapper.ERROR_UNMANAGED_BUCKET_TEMPLATE.format( attr="is_s3_path", buckets=["unknown"]) err_msg = re.escape(err_msg) with pytest.raises(M3DIllegalArgumentException, match=err_msg): s3_util.is_s3_path("s3://unknown/s3/path/o.obj") @pytest.mark.emr def test_unknown_method(self): s3_util = TestAWSS3CredentialsWrapper._create_aws_factory_s3_util( ["app", "log"], ["landing", "lake"]) # case 1: point to application bucket. s3_resource is not an attribute of S3Util. err_msg = TestAWSS3CredentialsWrapper.ERROR_UNKNOWN_ATTRIBUTE_TEMPLATE.format( attr="s3_resource") err_msg = re.escape(err_msg) with pytest.raises(AttributeError, match=err_msg): s3_util.s3_resource("s3://log/s3/path/") # case 2: point to data bucket. unknown_method is not an attribute of S3Util. err_msg = TestAWSS3CredentialsWrapper.ERROR_UNKNOWN_ATTRIBUTE_TEMPLATE.format( attr="unknown_method") err_msg = re.escape(err_msg) with pytest.raises(AttributeError, match=err_msg): s3_util.unknown_method("s3://landing/s3/path/") # case 3: point to un-managed bucket. unknown_method is still not an attribute of S3Util. 
err_msg = TestAWSS3CredentialsWrapper.ERROR_UNKNOWN_ATTRIBUTE_TEMPLATE.format( attr="unknown_method") err_msg = re.escape(err_msg) with pytest.raises(AttributeError, match=err_msg): s3_util.unknown_method("s3://unknown/s3/path/")
def __init__(self, config, source_system, database, environment,
             emr_cluster_id=None, spark_params=None):
    """
    Initialize the EMR system from its scon configuration file.

    :param config: system config file
    :param source_system: destination system code
    :param database: destination database code
    :param environment: destination schema code
    :param emr_cluster_id: id of the EMR cluster (optional)
    :param spark_params: spark specific parameters (optional)
    """
    super(EMRSystem, self).__init__(config, source_system, database, environment)

    self.scon_full_path = self.config_service.get_scon_path(source_system, database)

    # read the system configuration (scon) file
    with open(self.scon_full_path) as scon_file:
        params_system = json.load(scon_file)

    env_params = params_system["environments"][self.environment]
    s3_buckets = env_params["s3_buckets"]

    # S3 buckets
    self.bucket_landing = s3_buckets["landing"]
    self.bucket_lake = s3_buckets["lake"]
    self.bucket_mart_cal = s3_buckets["mart_cal"]
    self.bucket_application = s3_buckets["application"]
    self.bucket_log = s3_buckets["log"]

    # EMR default configuration
    self.default_emr_version = params_system["emr"]["default_emr_version"]
    self.default_ebs_size = params_system["emr"]["default_ebs_size"]

    # base M3D application deployment directory
    self.s3_deployment_dir_base = env_params["s3_deployment_dir_base"]

    # AWS credentials
    self.aws_api_credentials = AWSCredentials.from_file(params_system["aws_api_credentials"])
    self.aws_s3_put_credentials = AWSCredentials.from_file(params_system["aws_s3_put_credentials"])
    self.aws_s3_del_credentials = AWSCredentials.from_file(params_system["aws_s3_del_credentials"])

    # API timing configuration
    self.api_action_timeout_seconds = params_system["api_action_timeout_seconds"]
    self.api_action_polling_interval_seconds = params_system["api_action_polling_interval_seconds"]
    self.api_long_timeout_seconds = params_system["api_long_timeout_seconds"]

    self.aws_region = params_system["aws_region"]
    self.packages_to_deploy = params_system["packages_to_deploy"]
    self.configs_to_deploy = params_system["configs_to_deploy"]

    # base directories
    self.s3_dir_base = params_system["s3_dir_base"]

    # defined sub-directories
    subdirs = params_system["subdir"]
    self.subdir_archive = subdirs["archive"]
    self.subdir_header = subdirs["header"]
    self.subdir_config = subdirs["config"]
    self.subdir_data = subdirs["data"]
    self.subdir_delta_table = subdirs["delta_table"]
    self.subdir_data_backup = DataSystem.DirectoryName.DATA_BACKUP
    self.subdir_error = subdirs["error"]
    self.subdir_work = subdirs["work"]
    self.subdir_log = subdirs["log"]
    self.subdir_apps = subdirs["apps"]
    self.subdir_m3d_engine = subdirs["m3d_engine"]
    self.subdir_loading = subdirs["loading"]
    self.subdir_full_load = subdirs["full_load"]
    self.subdir_delta_load = subdirs["delta_load"]
    self.subdir_delta_lake_load = subdirs["delta_lake_load"]
    self.subdir_append_load = subdirs["append_load"]
    self.subdir_black_whole = subdirs["black_whole"]
    self.subdir_credentials = subdirs["credentials"]
    self.subdir_keytab = subdirs["keytab"]
    self.subdir_tmp = subdirs["tmp"]

    # deployment directories of M3D application and metadata (tconx)
    self.subdir_code = subdirs["m3d"]
    self.subdir_metadata = subdirs["metadata"]

    # spark arguments
    self.spark_main_class = params_system["spark"]["main_class"]
    self.spark_jar_name = params_system["spark"]["jar_name"]
    self.spark_params = spark_params

    self.version_filename = "version.txt"

    s3_deployment_dir = "{protocol}{bucket}{base_dir}".format(
        protocol=ConfigService.Protocols.S3,
        bucket=self.bucket_application,
        base_dir=self.s3_deployment_dir_base)

    # everything deployed for this system lives under <deployment dir>/<environment>
    deployment_root = os.path.join(s3_deployment_dir, self.environment)

    # derived directories: apps
    self.dir_apps = os.path.join(deployment_root, self.subdir_apps)
    self.dir_apps_algorithm = os.path.join(self.dir_apps, self.subdir_m3d_engine)
    self.dir_apps_loading = os.path.join(self.dir_apps, self.subdir_loading)

    # tmp directories in S3 and in the local filesystem
    self.dir_tmp_s3 = os.path.join(deployment_root, self.subdir_tmp)
    self.dir_tmp_local = os.path.join("/", self.subdir_tmp)

    self.dir_m3d_api_deployment = os.path.join(
        deployment_root,
        self.subdir_code,
        self.config_service.subdir_projects_m3d_api)

    self.dir_metadata_deployment = os.path.join(
        deployment_root,
        self.subdir_metadata,
        self.config_service.subdir_projects_m3d_api)

    self.spark_jar_path = os.path.join(self.dir_m3d_api_deployment, self.spark_jar_name)

    # AWSS3CredentialsWrapper routes each method call to the S3Util that
    # holds the right credentials for the addressed bucket.
    self.s3_util = AWSS3CredentialsWrapper(
        [self.bucket_application, self.bucket_log],
        [self.bucket_landing, self.bucket_lake, self.bucket_mart_cal],
        self.aws_api_credentials,
        self.aws_s3_put_credentials,
        self.aws_s3_del_credentials)

    # set up an EMRClusterClient only when a cluster id was provided
    self.emr_cluster_id = emr_cluster_id
    if emr_cluster_id is not None:
        self.emr_cluster_client = self._create_emr_cluster_client(emr_cluster_id)
    else:
        self.emr_cluster_client = None
def env_setup(self, tmpdir, destination_system, destination_database,
              destination_environment):
    """
    Create test-specific config.json and scon_emr configuration files in tmpdir.

    The directory structure resembles the config directory in the repository
    root. config.json points at the test-specific configuration files;
    scon_emr points at a test-case-specific HDFS root and a local HTTP
    server; the AWS credential files contain dummy credentials.

    :param tmpdir: test-case-specific temporary directory for the config files
    :param destination_system: destination system code
    :param destination_database: destination database code
    :param destination_environment: destination environment code
    :return: tuple of (path of test-specific config.json to pass to M3D API
             calls, path of test-specific scon_emr, contents of config.json
             as dict, contents of scon_emr as dict)
    """
    m3d_config_dict = util.Util.load_dict(self.default_m3d_config)
    tag_config = m3d_config_dict["tags"]["config"]
    tag_system = m3d_config_dict["tags"]["system"]

    # build the test-local config directory tree
    config_dir = tmpdir.mkdir(tag_config)
    config_system_dir = config_dir.mkdir(tag_system)
    config_credentials_dir = config_dir.mkdir("credentials")

    m3d_config_dict["tags"]["config"] = str(config_dir)
    m3d_config_dict["dir_exec"] = str(self.local_run_dir.mkdir("tmp"))

    m3d_config_file = config_dir.mkdir("m3d").join("config.json")
    m3d_config_file.write(json.dumps(m3d_config_dict, indent=4))

    # dummy AWS credential files for api / s3_put / s3_del roles
    aws_api_credentials_file = config_credentials_dir.join(
        "credentials-{}-{}-api.json".format(destination_system,
                                            destination_database))
    aws_s3_put_credentials_file = config_credentials_dir.join(
        "credentials-{}-{}-s3_put.json".format(destination_system,
                                               destination_database))
    aws_s3_del_credentials_file = config_credentials_dir.join(
        "credentials-{}-{}-s3_del.json".format(destination_system,
                                               destination_database))

    self.dump_aws_credentials(
        AWSCredentials("test-aws-access-key-api", "test-aws-secret-key-api"),
        str(aws_api_credentials_file))
    self.dump_aws_credentials(
        AWSCredentials("test-aws-access-key-s3_put", "test-aws-secret-key-s3_put"),
        str(aws_s3_put_credentials_file))
    self.dump_aws_credentials(
        AWSCredentials("test-aws-access-key-s3_del", "test-aws-secret-key-s3_del"),
        str(aws_s3_del_credentials_file))

    scon_emr_file = config_system_dir.join(
        ConfigService.Prefixes.SCON + "-" + destination_system + "-" +
        destination_database + ".json")

    # patch the default scon_emr with test-local endpoints and credentials
    scon_emr_dict = util.Util.load_dict(self.default_scon_emr)
    scon_emr_dict["name_service"] = "localhost:9000"
    scon_emr_dict["credentials"] = "<placeholder_for_AWS_credentials_file>"
    scon_emr_dict["aws_region"] = self.default_aws_region
    scon_emr_dict["aws_api_credentials"] = str(aws_api_credentials_file)
    scon_emr_dict["aws_s3_put_credentials"] = str(aws_s3_put_credentials_file)
    scon_emr_dict["aws_s3_del_credentials"] = str(aws_s3_del_credentials_file)
    scon_emr_dict["api_gateway"] = self.default_server_url
    scon_emr_dict["api_action_timeout_seconds"] = 10
    scon_emr_dict["api_action_polling_interval_seconds"] = 0.2
    scon_emr_dict["api_long_timeout_seconds"] = 20
    scon_emr_dict["emr"]["default_emr_version"] = "emr-5.17.0"
    scon_emr_dict["emr"]["default_ebs_size"] = "128"

    # point every bucket of the destination environment at a test bucket
    buckets = scon_emr_dict["environments"][destination_environment]["s3_buckets"]
    buckets["landing"] = self.default_dev_landing_bucket
    buckets["lake"] = self.default_dev_lake_bucket
    buckets["mart_cal"] = self.default_dev_mart_cal_bucket
    buckets["metadata"] = self.default_dev_metadata_bucket
    buckets["inbound"] = self.default_dev_inbound_bucket
    buckets["application"] = self.default_dev_application_bucket
    buckets["log"] = self.default_log_bucket

    scon_emr_file.write(json.dumps(scon_emr_dict, indent=4))

    logging.debug(
        "test case configuration is saved in \"{}\" directory".format(
            str(config_dir)))

    return str(m3d_config_file), str(scon_emr_file), m3d_config_dict, scon_emr_dict