def test_move_object(self):
    test_src_bucket_name = "test_src_bucket"
    test_destination_bucket_name = "test_destination_bucket"
    test_src_key = "test_src_key"
    test_destination_key = "test_destination_key"
    test_content = "aaa1"

    s3_resource = Boto3Util.create_s3_resource()
    s3_resource.create_bucket(Bucket=test_src_bucket_name)
    s3_resource.create_bucket(Bucket=test_destination_bucket_name)
    s3_resource.Bucket(test_src_bucket_name).put_object(Key=test_src_key, Body=test_content)

    s3_util = S3Util(AWSCredentials("", ""))
    s3_util.move_object(
        "s3://" + test_src_bucket_name + "/" + test_src_key,
        "s3://" + test_destination_bucket_name + "/" + test_destination_key)

    destination_objects = list(s3_resource.Bucket(test_destination_bucket_name).objects.all())
    assert len(destination_objects) == 1
    assert destination_objects[0].key == test_destination_key

    src_objects = list(s3_resource.Bucket(test_src_bucket_name).objects.all())
    assert len(src_objects) == 0
def test_wait_for_file_availability(self):
    bucket = "cur_bucket"
    key = "stdout.txt"
    data = "no output"

    s3_full_path = "s3://{}/{}".format(bucket, key)

    s3_resource = Boto3Util.create_s3_resource()
    s3_resource.create_bucket(Bucket=bucket)

    def create_file():
        s3_resource.Bucket(bucket).put_object(Key=key, Body=data)

    s3_util = S3Util(AWSCredentials("", ""))

    polling_interval = 0.02
    timeout = 0.5

    # The file is created concurrently after a short delay, so the wait should succeed.
    with ConcurrentExecutor(create_file, 0.2):
        s3_util.wait_for_file_availability(s3_full_path, polling_interval, timeout)

    s3_util.delete_object(s3_full_path)

    # Once the file is gone, waiting should time out and raise.
    err_msg = "File {} failed to be available after {} seconds.".format(s3_full_path, timeout)
    with pytest.raises(M3DAWSAPIException, match=err_msg):
        s3_util.wait_for_file_availability(s3_full_path, polling_interval, timeout)
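# ConcurrentExecutor is a test utility defined elsewhere in this repo. Below is
# a minimal sketch of its assumed semantics, inferred purely from the usage
# above: a context manager that fires the given callable in a background thread
# after a delay, so the body of the `with` block can poll concurrently. The
# class name ConcurrentExecutorSketch and its internals are hypothetical; only
# the (callable, delay_seconds) call shape is taken from the tests.
import threading
import time

class ConcurrentExecutorSketch(object):
    def __init__(self, func, delay_seconds):
        self._func = func
        self._delay_seconds = delay_seconds
        self._thread = None

    def _run(self):
        # Sleep first, then invoke the callable, mimicking a delayed side effect.
        time.sleep(self._delay_seconds)
        self._func()

    def __enter__(self):
        self._thread = threading.Thread(target=self._run)
        self._thread.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always join so no background thread outlives the `with` block.
        self._thread.join()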
def env_setup(emr_cluster_name, aws_region, aws_credentials,
              timeout_seconds, retry_seconds, long_timeout_seconds):
    run_job_flow_args = dict(
        Instances={
            'InstanceCount': 3,
            'KeepJobFlowAliveWhenNoSteps': True,
            'MasterInstanceType': 'c3.medium',
            'Placement': {'AvailabilityZone': 'test_zone'},
            'SlaveInstanceType': 'c3.xlarge',
        },
        JobFlowRole='EMR_EC2_DefaultRole',
        LogUri='s3://mybucket/log/',
        Name=emr_cluster_name,
        ServiceRole='EMR_DefaultRole',
        VisibleToAllUsers=True)

    emr_client = Boto3Util.create_emr_client(aws_region)
    emr_cluster_id = emr_client.run_job_flow(**run_job_flow_args)['JobFlowId']

    emr_cluster_client = EMRClusterClient(emr_cluster_id, aws_region, aws_credentials,
                                          timeout_seconds, retry_seconds, long_timeout_seconds)

    return emr_cluster_client, emr_cluster_id
def test_drop_tables_one_hive_table_fails_to_drop_2(self):
    s3_resource = Boto3Util.create_s3_resource()
    s3_table = self._create_s3_table(s3_resource, lambda x: self.LAKE_SPEC.table in x)

    s3_table.drop_tables()

    assert len(s3_table.emr_system.statements) == 1

    # nothing should be deleted by drop_tables() call
    assert self.list_objects_in_bucket(self.LAKE_SPEC.bucket) == self.LAKE_SPEC.keys
    assert self.list_objects_in_bucket(self.LANDING_SPEC.bucket) == self.LANDING_SPEC.keys
def test_drop_tables_successful_execution(self):
    s3_resource = Boto3Util.create_s3_resource()
    s3_table = self._create_s3_table(s3_resource, lambda x: True)

    s3_table.drop_tables()

    assert len(s3_table.emr_system.statements) == 2

    # nothing should be deleted by drop_tables() call
    assert self.list_objects_in_bucket(self.LAKE_SPEC.bucket) == self.LAKE_SPEC.keys
    assert self.list_objects_in_bucket(self.LANDING_SPEC.bucket) == self.LANDING_SPEC.keys
def setup_method(self, method):
    super(EMRSystemUnitTestBase, self).setup_method(method)

    # Setup EMR mock
    self.mock_emr = moto.mock_emr()
    self.mock_emr.start()

    self.emr_cluster_name = "test cluster for unit and integration tests"
    run_job_flow_args = dict(
        Instances={
            'InstanceCount': 3,
            'KeepJobFlowAliveWhenNoSteps': True,
            'MasterInstanceType': 'c3.medium',
            'Placement': {'AvailabilityZone': 'test_zone'},
            'SlaveInstanceType': 'c3.xlarge',
        },
        JobFlowRole='EMR_EC2_DefaultRole',
        LogUri='s3://{}/log/'.format(self.default_log_bucket),
        Name=self.emr_cluster_name,
        ServiceRole='EMR_DefaultRole',
        VisibleToAllUsers=True)

    emr_client = Boto3Util.create_emr_client(self.default_aws_region)
    self.emr_cluster_id = emr_client.run_job_flow(**run_job_flow_args)['JobFlowId']
    logging.debug("Test case specific EMR cluster id is {}".format(self.emr_cluster_id))

    # Setup S3 mock
    self.mock_s3 = moto.mock_s3()
    self.mock_s3.start()

    self.s3_resource = Boto3Util.create_s3_resource()
    self.s3_resource.create_bucket(Bucket=self.default_dev_landing_bucket)
    self.s3_resource.create_bucket(Bucket=self.default_dev_lake_bucket)
    self.s3_resource.create_bucket(Bucket=self.default_dev_mart_cal_bucket)
    self.s3_resource.create_bucket(Bucket=self.default_dev_application_bucket)
    self.s3_resource.create_bucket(Bucket=self.default_log_bucket)
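# Not shown in the original: moto mocks started in setup_method would normally
# be stopped in a matching teardown_method so mock state does not leak across
# tests. A minimal sketch, assuming the base class also defines a
# teardown_method hook (an assumption mirroring the setup_method call above):
def teardown_method(self, method):
    self.mock_s3.stop()   # stop the S3 mock started in setup_method
    self.mock_emr.stop()  # stop the EMR mock started in setup_method
    super(EMRSystemUnitTestBase, self).teardown_method(method)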
def test_drop_tables_both_tables_fail_to_drop(self):
    s3_resource = Boto3Util.create_s3_resource()
    s3_table = self._create_s3_table(s3_resource, lambda x: False)

    with pytest.raises(Exception, match="^Unable to drop any of the following tables.+"):
        s3_table.drop_tables()

    assert len(s3_table.emr_system.statements) == 0

    # nothing should be deleted by drop_tables() call
    assert self.list_objects_in_bucket(self.LAKE_SPEC.bucket) == self.LAKE_SPEC.keys
    assert self.list_objects_in_bucket(self.LANDING_SPEC.bucket) == self.LANDING_SPEC.keys
def test_truncate_tables_both_repairs_fail_unexpectedly(self):
    s3_resource = Boto3Util.create_s3_resource()
    s3_table = self._create_s3_table(s3_resource, lambda x: False)

    with pytest.raises(M3DException, match="^Failed to truncate any of the following tables: .+"):
        s3_table.truncate_tables()

    assert len(s3_table.emr_system.statements) == 0

    assert not self.list_objects_in_bucket(self.LAKE_SPEC.bucket)
    assert not self.list_objects_in_bucket(self.LANDING_SPEC.bucket)
def test_add_step_to_cluster_fail_without_output(self):
    mock_emr_obj = moto.mock_emr()
    with mock_emr_obj:
        emr_cluster_client, emr_cluster_id = self.env_setup(
            self.emr_cluster_name,
            self.aws_region,
            self.aws_credentials,
            self.timeout_seconds,
            self.retry_seconds,
            self.long_timeout_seconds)

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket="mybucket")

        step_name = "Test_Step"
        command_str = "/usr/bin/spark-submit --class spark.job.main.class"

        emr_step_id = emr_cluster_client.add_step(step_name, command_str)

        cluster_steps = emr_cluster_client.get_list_of_steps()
        assert 1 == len(cluster_steps)
        assert cluster_steps[0] == emr_step_id

        emr_step_status, _ = emr_cluster_client.get_step_status(emr_step_id)
        assert emr_step_status == "STARTING"

        # "STARTING" is not a valid EMR Step state, so we will change it to "RUNNING"
        emr_backend = mock_emr_obj.backends[self.aws_region]
        fake_cluster = emr_backend.clusters[emr_cluster_id]
        fake_step = fake_cluster.steps[0]
        fake_step.state = "RUNNING"

        def fail_step():
            fake_step.state = "FAILED"

        # Make sure that we do not wait for 300 seconds for gz file to be available.
        EMRClusterClient.AWSConstants.S3_FILE_AVAILABILITY_TIMEOUT_SECONDS = self.timeout_seconds

        # Required for correct log path generation in MockedMethod.
        MockedMethod.emr_cluster_id = emr_cluster_id

        stderr_gz_path = MockedMethod.log_file_template.format(
            emr_cluster_id=emr_cluster_id,
            emr_step_id=emr_step_id)

        err_msg = "File {} failed to be available after {} seconds.".format(
            stderr_gz_path, self.timeout_seconds)
        with pytest.raises(M3DAWSAPIException, match=err_msg):
            # Wait for some time to let EMRClusterClient poll a few times.
            with ConcurrentExecutor(fail_step, 0.4):
                with patch(
                        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient.get_step_status",
                        side_effect=MockedMethod.get_step_status_mocked):
                    emr_cluster_client.wait_for_step_completion(
                        emr_step_id, self.long_timeout_seconds)
def test_truncate_tables_both_repairs_fail_expectedly(self):
    s3_resource = Boto3Util.create_s3_resource()
    s3_table = self._create_s3_table(
        s3_resource,
        lambda x: False,
        M3DEMRStepException("", "", "Table not found"))

    s3_table.truncate_tables()

    assert len(s3_table.emr_system.statements) == 0

    assert not self.list_objects_in_bucket(self.LAKE_SPEC.bucket)
    assert not self.list_objects_in_bucket(self.LANDING_SPEC.bucket)
def test_delete_object(self):
    test_bucket_name = "test_bucket"
    test_key = "test_dir/test_key"

    s3_resource = Boto3Util.create_s3_resource()
    s3_resource.create_bucket(Bucket=test_bucket_name)
    s3_resource.Bucket(test_bucket_name).put_object(Key=test_key, Body="")

    s3_util = S3Util(AWSCredentials("", ""))
    s3_util.delete_object("s3://" + test_bucket_name + "/" + test_key)

    remaining_objects = list(s3_resource.Bucket(test_bucket_name).objects.all())
    assert len(remaining_objects) == 0
def test_upload_object(self):
    test_bucket_name = "test_bucket"
    test_key = "test_s3_util/tconx-bdp-emr_test-dev-bi_test101.json"
    file_name = "test/resources/test_s3_util/tconx-bdp-emr_test-dev-bi_test101.json"

    s3_resource = Boto3Util.create_s3_resource()
    s3_resource.create_bucket(Bucket=test_bucket_name)

    s3_util = S3Util(AWSCredentials("", ""))
    s3_util.upload_object(file_name, "s3://" + test_bucket_name + "/" + test_key)

    s3_objects = list(s3_resource.Bucket(test_bucket_name).objects.all())
    assert len(s3_objects) == 1
    assert s3_objects[0].key == test_key
def test_truncate_tables_one_repair_fails_unexpectedly_1(self):
    s3_resource = Boto3Util.create_s3_resource()
    s3_table = self._create_s3_table(s3_resource, lambda x: self.LANDING_SPEC.table in x)

    s3_table.truncate_tables()

    assert len(s3_table.emr_system.statements) == 1

    landing_dir = "s3://{}/{}".format(self.LANDING_SPEC.bucket, self.LANDING_SPEC.data_dir)
    assert s3_table.emr_system.statements == [
        'ALTER TABLE {} SET LOCATION "{}";'.format(self.LANDING_SPEC.table, landing_dir)
    ]

    assert not self.list_objects_in_bucket(self.LAKE_SPEC.bucket)
    assert not self.list_objects_in_bucket(self.LANDING_SPEC.bucket)
def test_truncate_tables_one_repair_fails_unexpectedly_2(self):
    s3_resource = Boto3Util.create_s3_resource()
    s3_table = self._create_s3_table(s3_resource, lambda x: self.LAKE_SPEC.table in x)

    s3_table.truncate_tables()

    assert len(s3_table.emr_system.statements) == 1

    lake_dir = "s3://{}/{}".format(self.LAKE_SPEC.bucket, self.LAKE_SPEC.data_dir)
    assert s3_table.emr_system.statements == [
        'DROP TABLE {};\n'.format(self.LAKE_SPEC.table) +
        TestS3Table._get_table_ddl_lake(self.LAKE_SPEC.table, s3_table.columns_lake, lake_dir) +
        "\n" +
        'MSCK REPAIR TABLE {};'.format(self.LAKE_SPEC.table)
    ]

    assert not self.list_objects_in_bucket(self.LAKE_SPEC.bucket)
    assert not self.list_objects_in_bucket(self.LANDING_SPEC.bucket)
def test_truncate_tables_wrong_files_not_deleted(self):
    s3_resource = Boto3Util.create_s3_resource()
    s3_table = self._create_s3_table(s3_resource, lambda x: True)

    landing_extra_keys = sorted(["test_key1", "test_key2"])
    lake_extra_keys = sorted(["test_key1", "test_dir/test_key2"])

    for k in landing_extra_keys:
        s3_resource.Bucket(self.LANDING_SPEC.bucket).put_object(Key=k, Body="")

    for k in lake_extra_keys:
        s3_resource.Bucket(self.LAKE_SPEC.bucket).put_object(Key=k, Body="")

    s3_table.truncate_tables()

    assert len(s3_table.emr_system.statements) == 2

    assert self.list_objects_in_bucket(self.LAKE_SPEC.bucket) == lake_extra_keys
    assert self.list_objects_in_bucket(self.LANDING_SPEC.bucket) == landing_extra_keys
def test_truncate_tables_everything_deleted_successfully(self):
    s3_resource = Boto3Util.create_s3_resource()
    s3_table = self._create_s3_table(s3_resource, lambda x: True)

    s3_table.truncate_tables()

    assert len(s3_table.emr_system.statements) == 2

    landing_dir = "s3://{}/{}".format(self.LANDING_SPEC.bucket, self.LANDING_SPEC.data_dir)
    lake_dir = "s3://{}/{}".format(self.LAKE_SPEC.bucket, self.LAKE_SPEC.data_dir)

    expected_statements = [
        'ALTER TABLE {} SET LOCATION "{}";'.format(self.LANDING_SPEC.table, landing_dir),
        'DROP TABLE {};\n'.format(self.LAKE_SPEC.table) +
        TestS3Table._get_table_ddl_lake(self.LAKE_SPEC.table, s3_table.columns_lake, lake_dir) +
        "\n" +
        'MSCK REPAIR TABLE {};'.format(self.LAKE_SPEC.table)
    ]

    assert s3_table.emr_system.statements == expected_statements

    assert not self.list_objects_in_bucket(self.LANDING_SPEC.bucket)
    assert not self.list_objects_in_bucket(self.LAKE_SPEC.bucket)
def test_delete_objects(self):
    test_bucket_name = "test_bucket"
    test_prefix = "test_dir"
    test_keys = [
        "test_key1",
        "{}/test_key2".format(test_prefix),
        "{}/test_key3".format(test_prefix),
        "{}/test_key4".format(test_prefix)
    ]

    s3_resource = Boto3Util.create_s3_resource()
    s3_resource.create_bucket(Bucket=test_bucket_name)

    for key in test_keys:
        s3_resource.Bucket(test_bucket_name).put_object(Key=key, Body="")

    s3_util = S3Util(AWSCredentials("", ""))
    s3_util.delete_objects("s3://" + test_bucket_name + "/" + test_prefix)

    remaining_objects = list(s3_resource.Bucket(test_bucket_name).objects.all())
    assert len(remaining_objects) == 1
    assert remaining_objects[0].key == test_keys[0]
def test_list_objects_in_bucket(self):
    test_bucket_name = "test_bucket"
    test_prefix = "test_dir"
    test_keys = [
        "test_key1",
        "{}/test_key2".format(test_prefix),
        "{}/test_key3".format(test_prefix),
        "{}/test_key4".format(test_prefix)
    ]
    test_resources = [
        "s3://{}/".format(test_bucket_name) + key for key in test_keys
    ]

    s3_resource = Boto3Util.create_s3_resource()
    s3_resource.create_bucket(Bucket=test_bucket_name)

    for key in test_keys:
        s3_resource.Bucket(test_bucket_name).put_object(Key=key, Body="")

    s3_util = S3Util(AWSCredentials("", ""))
    keys = s3_util.list_objects("s3://" + test_bucket_name + "/" + test_prefix)

    assert keys == test_resources[1:4]
def test_move_objects(self):
    test_src_bucket_name = "test_src_bucket"
    test_destination_bucket_name = "test_destination_bucket"
    test_src_prefix = "test_src_dir"
    test_destination_prefix = "test_destination_dir"
    test_src_keys = [
        "test_key1",
        "{}/test_key2".format(test_src_prefix),
        "{}/test_key3".format(test_src_prefix),
        "{}/test_key4".format(test_src_prefix)
    ]
    test_destination_keys = [
        "{}/test_key2".format(test_destination_prefix),
        "{}/test_key3".format(test_destination_prefix),
        "{}/test_key4".format(test_destination_prefix)
    ]

    s3_resource = Boto3Util.create_s3_resource()
    s3_resource.create_bucket(Bucket=test_src_bucket_name)
    s3_resource.create_bucket(Bucket=test_destination_bucket_name)

    for key in test_src_keys:
        s3_resource.Bucket(test_src_bucket_name).put_object(Key=key, Body="")

    s3_util = S3Util(AWSCredentials("", ""))
    s3_util.move_objects(
        "s3://" + test_src_bucket_name + "/" + test_src_prefix,
        "s3://" + test_destination_bucket_name + "/" + test_destination_prefix)

    src_objects = list(s3_resource.Bucket(test_src_bucket_name).objects.all())
    assert len(src_objects) == 1
    assert src_objects[0].key == test_src_keys[0]

    destination_objects = s3_resource.Bucket(test_destination_bucket_name).objects.all()
    assert sorted(map(lambda x: x.key, destination_objects)) == test_destination_keys
def test_add_step_to_cluster_fail_with_output(self):
    mock_emr_obj = moto.mock_emr()
    with mock_emr_obj:
        emr_cluster_client, emr_cluster_id = self.env_setup(
            self.emr_cluster_name,
            self.aws_region,
            self.aws_credentials,
            self.timeout_seconds,
            self.retry_seconds,
            self.long_timeout_seconds)

        s3_resource = Boto3Util.create_s3_resource()
        s3_resource.create_bucket(Bucket="mybucket")

        step_name = "Test_Step"
        command_str = "/usr/bin/spark-submit --class spark.job.main.class"

        emr_step_id = emr_cluster_client.add_step(step_name, command_str)

        cluster_steps = emr_cluster_client.get_list_of_steps()
        assert 1 == len(cluster_steps)
        assert cluster_steps[0] == emr_step_id

        emr_step_status, _ = emr_cluster_client.get_step_status(emr_step_id)
        assert emr_step_status == "STARTING"

        # "STARTING" is not a valid EMR Step state, so we will change it to "RUNNING"
        emr_backend = mock_emr_obj.backends[self.aws_region]
        fake_cluster = emr_backend.clusters[emr_cluster_id]
        fake_step = fake_cluster.steps[0]
        fake_step.state = "RUNNING"

        # Make sure that we do not wait for 300 seconds for gz file to be available.
        EMRClusterClient.AWSConstants.S3_FILE_AVAILABILITY_TIMEOUT_SECONDS = self.timeout_seconds

        # Required for correct log path generation in MockedMethod.
        MockedMethod.emr_cluster_id = emr_cluster_id

        stderr_gz_path = MockedMethod.log_file_template.format(
            emr_cluster_id=emr_cluster_id,
            emr_step_id=emr_step_id)

        expected_content = "Lots of content here!!!"

        def fail_step_and_write_output():
            fake_step.state = "FAILED"
            time.sleep(0.3)
            compressed_content = TestEMRClusterClient._compress_string(expected_content)
            bucket, key = emr_cluster_client.s3_util.get_bucket_and_key(stderr_gz_path)
            s3_resource.Bucket(bucket).put_object(Key=key, Body=compressed_content)

        with pytest.raises(M3DAWSAPIException) as exc:
            # Wait for some time to let EMRClusterClient poll a few times.
            with ConcurrentExecutor(fail_step_and_write_output, 0.3):
                with patch(
                        "m3d.hadoop.emr.emr_cluster_client.EMRClusterClient.get_step_status",
                        side_effect=MockedMethod.get_step_status_mocked):
                    emr_cluster_client.wait_for_step_completion(
                        emr_step_id, self.long_timeout_seconds)

        err_msg = "EMR Step with cluster_id='{}' and step_id='{}' failed to complete".format(
            emr_cluster_id, emr_step_id)
        assert err_msg in str(exc.value)
        assert stderr_gz_path in str(exc.value)

        resulting_content = emr_cluster_client.s3_util.read_gzip_file_content(stderr_gz_path)
        assert expected_content == resulting_content
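# TestEMRClusterClient._compress_string is referenced above but not shown. A
# minimal sketch of what it is assumed to do, based on how its output is used
# (uploaded as a .gz object and later read back via read_gzip_file_content):
# gzip-compress a string into bytes. This implementation is an assumption, not
# the repo's actual helper.
@staticmethod
def _compress_string(content):
    import gzip
    import io

    # Write the UTF-8 encoded string through a gzip wrapper into an in-memory
    # buffer and return the compressed bytes.
    buf = io.BytesIO()
    with gzip.GzipFile(fileobj=buf, mode="wb") as gz:
        gz.write(content.encode("utf-8"))
    return buf.getvalue()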
def list_objects_in_bucket(s3_bucket):
    s3_resource = Boto3Util.create_s3_resource()
    objects = [obj.key for obj in s3_resource.Bucket(s3_bucket).objects.all()]
    return sorted(objects)