def test_emr_log_location_for_cluster(emr_cluster_config, mock_s3_bucket):
    """Check log_location_for_cluster with and without a LogUri in the cluster config."""
    ctx = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)

    # With the fixture config, the location is the mock bucket plus the
    # "elasticmapreduce/" prefix.
    logged_cluster_id = runner.run_job_flow(ctx.log, emr_cluster_config)
    expected_location = (mock_s3_bucket.name, "elasticmapreduce/")
    assert runner.log_location_for_cluster(logged_cluster_id) == expected_location

    # Should raise when the log URI is missing. Work on a deep copy so the
    # fixture's dict is not mutated.
    config_without_log_uri = copy.deepcopy(emr_cluster_config)
    del config_without_log_uri["LogUri"]
    unlogged_cluster_id = runner.run_job_flow(ctx.log, config_without_log_uri)

    with pytest.raises(EmrError) as exc_info:
        runner.log_location_for_cluster(unlogged_cluster_id)

    error_text = str(exc_info.value)
    assert "Log URI not specified, cannot retrieve step execution logs" in error_text
def test_emr_retrieve_logs(emr_cluster_config, mock_s3_bucket):
    """Upload gzipped step logs from a background thread and check they are retrieved and decoded."""
    ctx = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)
    cluster_id = runner.run_job_flow(ctx.log, emr_cluster_config)
    assert runner.log_location_for_cluster(cluster_id) == (mock_s3_bucket.name, "elasticmapreduce/")

    def create_log():
        # Runs on a background thread; the short sleep means the objects are
        # written only after the thread has been started (presumably so that
        # retrieval has to wait for them -- confirm against the runner).
        time.sleep(0.5)

        buffer = io.BytesIO()
        with gzip.GzipFile(fileobj=buffer, mode="w") as gz_file:
            gz_file.write("some log".encode())

        step_prefix = "elasticmapreduce/{cluster_id}/steps/{step_id}".format(
            cluster_id=cluster_id, step_id="s-123456123456"
        )
        # The same gzipped payload is stored under both log names.
        for log_name in ("stdout.gz", "stderr.gz"):
            key = step_prefix + "/" + log_name
            mock_s3_bucket.Object(key).put(Body=buffer.getvalue())  # pylint: disable=no-member

    uploader = threading.Thread(target=create_log, args=())
    uploader.daemon = True
    uploader.start()

    logs = runner.retrieve_logs_for_step_id(ctx.log, cluster_id, "s-123456123456")
    stdout_log, stderr_log = logs
    assert stdout_log == "some log"
    assert stderr_log == "some log"
def test_emr_describe_cluster(emr_cluster_config):
    """Launch a job flow and check the name and state returned by describe_cluster."""
    ctx = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)
    launched_id = runner.run_job_flow(ctx.log, emr_cluster_config)

    description = runner.describe_cluster(launched_id)
    cluster_info = description['Cluster']

    assert cluster_info['Name'] == 'test-emr'

    # The raw state string must be a valid EmrClusterState equal to Waiting.
    reported_state = EmrClusterState(cluster_info['Status']['State'])
    assert reported_state == EmrClusterState.Waiting
def test_emr_describe_cluster(emr_cluster_config):
    """Verify that describe_cluster reports the expected name and a Waiting state."""
    execution_context = create_test_pipeline_execution_context()
    job_runner = EmrJobRunner(region=REGION)
    job_flow_id = job_runner.run_job_flow(execution_context.log, emr_cluster_config)

    details = job_runner.describe_cluster(job_flow_id)["Cluster"]
    assert details["Name"] == "test-emr"

    # Wrap the raw state string in the enum before comparing.
    raw_state = details["Status"]["State"]
    assert EmrClusterState(raw_state) == EmrClusterState.Waiting
def test_emr_retrieve_logs(emr_cluster_config):
    """Write gzipped step logs to the 'emr-cluster-logs' bucket from a thread and check retrieval."""
    ctx = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)
    cluster_id = runner.run_job_flow(ctx.log, emr_cluster_config)
    assert runner.log_location_for_cluster(cluster_id) == ('emr-cluster-logs', 'elasticmapreduce/')

    s3_resource = boto3.resource('s3', region_name=REGION)
    s3_resource.create_bucket(Bucket='emr-cluster-logs')  # pylint: disable=no-member

    def create_log():
        # Runs on a background thread; the objects are written only after a
        # short delay (presumably so retrieval has to wait -- confirm against
        # the runner implementation).
        time.sleep(0.5)

        buffer = io.BytesIO()
        with gzip.GzipFile(fileobj=buffer, mode='w') as gz_file:
            gz_file.write('some log'.encode())

        step_prefix = 'elasticmapreduce/{cluster_id}/steps/{step_id}'.format(
            cluster_id=cluster_id, step_id='s-123456123456'
        )
        # The same gzipped payload is stored under both log names.
        for log_name in ('stdout.gz', 'stderr.gz'):
            log_object = s3_resource.Object(  # pylint: disable=no-member
                'emr-cluster-logs', step_prefix + '/' + log_name
            )
            log_object.put(Body=buffer.getvalue())

    uploader = threading.Thread(target=create_log, args=())
    uploader.daemon = True
    uploader.start()

    logs = runner.retrieve_logs_for_step_id(ctx.log, cluster_id, 's-123456123456')
    stdout_log, stderr_log = logs
    assert stdout_log == 'some log'
    assert stderr_log == 'some log'
def test_emr_log_location_for_cluster(emr_cluster_config):
    """Check log_location_for_cluster with and without a LogUri in the cluster config."""
    ctx = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)

    # NOTE(review): this variant hands the whole context (not context.log) to
    # run_job_flow -- confirm that matches the runner's signature.
    logged_cluster_id = runner.run_job_flow(ctx, emr_cluster_config)
    expected_location = ('emr-cluster-logs', 'elasticmapreduce/')
    assert runner.log_location_for_cluster(logged_cluster_id) == expected_location

    # Should raise when the log URI is missing. Work on a deep copy so the
    # fixture's dict is not mutated.
    config_without_log_uri = copy.deepcopy(emr_cluster_config)
    del config_without_log_uri['LogUri']
    unlogged_cluster_id = runner.run_job_flow(ctx, config_without_log_uri)

    with pytest.raises(EmrError) as exc_info:
        runner.log_location_for_cluster(unlogged_cluster_id)

    error_text = str(exc_info.value)
    assert 'Log URI not specified, cannot retrieve step execution logs' in error_text
def test_emr_id_from_name(emr_cluster_config):
    """Look up a cluster id by name, and check that an unknown name raises EmrError."""
    ctx = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)
    expected_id = runner.run_job_flow(ctx.log, emr_cluster_config)

    # The cluster that was just launched is found by its name.
    found_id = runner.cluster_id_from_name("test-emr")
    assert found_id == expected_id

    # A name that matches no cluster is reported as an error.
    with pytest.raises(EmrError) as exc_info:
        runner.cluster_id_from_name("cluster-doesnt-exist")

    error_text = str(exc_info.value)
    assert "cluster cluster-doesnt-exist not found in region us-west-1" in error_text
def test_is_emr_step_complete(emr_cluster_config):
    """Feed is_emr_step_complete a sequence of step states via a patched describe_step."""
    ctx = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION, check_cluster_every=1)
    cluster_id = runner.run_job_flow(ctx.log, emr_cluster_config)

    step_name = "test_step"
    step_cmd = ["ls", "/"]
    step_ids = runner.add_job_flow_steps(
        ctx.log, cluster_id, [runner.construct_step_dict_for_command(step_name, step_cmd)]
    )

    def get_step_dict(step_id, step_state):
        # Build a describe_step-style response for the given id and state.
        status = {
            "State": step_state,
            "StateChangeReason": {"Message": "everything is hosed"},
            "Timeline": {"StartDateTime": _boto3_now()},
        }
        config = {"Jar": "command-runner.jar", "Properties": {}, "Args": step_cmd}
        step = {
            "Id": step_id,
            "Name": step_name,
            "Config": config,
            "ActionOnFailure": "CONTINUE",
            "Status": status,
        }
        return {"Step": step}

    emr_step_id = step_ids[0]

    # One canned response per expected describe_step call, in call order.
    describe_step_returns = [
        get_step_dict(emr_step_id, state)
        for state in ("PENDING", "RUNNING", "COMPLETED", "FAILED")
    ]

    with mock.patch.object(EmrJobRunner, "describe_step", side_effect=describe_step_returns):
        # PENDING and RUNNING are both "not complete yet".
        for _ in range(2):
            assert not runner.is_emr_step_complete(ctx.log, cluster_id, emr_step_id)

        # COMPLETED reports completion.
        assert runner.is_emr_step_complete(ctx.log, cluster_id, emr_step_id)

        # FAILED surfaces as an EmrError.
        with pytest.raises(EmrError) as exc_info:
            runner.is_emr_step_complete(ctx.log, cluster_id, emr_step_id)
        assert "step failed" in str(exc_info.value)
def test_is_emr_step_complete(emr_cluster_config):
    """Check is_emr_step_complete against PENDING, RUNNING, COMPLETED and FAILED step states."""
    execution_context = create_test_pipeline_execution_context()
    job_runner = EmrJobRunner(region=REGION, check_cluster_every=1)
    cluster_id = job_runner.run_job_flow(execution_context.log, emr_cluster_config)

    step_name = 'test_step'
    step_cmd = ['ls', '/']
    step_ids = job_runner.add_job_flow_steps(
        execution_context.log,
        cluster_id,
        [job_runner.construct_step_dict_for_command(step_name, step_cmd)],
    )

    def get_step_dict(step_id, step_state):
        # Canned describe_step response; only the id and state vary.
        response = {
            'Step': {
                'Id': step_id,
                'Name': step_name,
                'Config': {'Jar': 'command-runner.jar', 'Properties': {}, 'Args': step_cmd},
                'ActionOnFailure': 'CONTINUE',
                'Status': {
                    'State': step_state,
                    'StateChangeReason': {'Message': 'everything is hosed'},
                    'Timeline': {'StartDateTime': _boto3_now()},
                },
            },
        }
        return response

    emr_step_id = step_ids[0]

    # Responses are consumed one per describe_step call, in this order.
    state_sequence = ['PENDING', 'RUNNING', 'COMPLETED', 'FAILED']
    describe_step_returns = [get_step_dict(emr_step_id, state) for state in state_sequence]

    patched_describe = mock.patch.object(
        EmrJobRunner, 'describe_step', side_effect=describe_step_returns
    )
    with patched_describe:
        # PENDING
        assert not job_runner.is_emr_step_complete(execution_context.log, cluster_id, emr_step_id)
        # RUNNING
        assert not job_runner.is_emr_step_complete(execution_context.log, cluster_id, emr_step_id)
        # COMPLETED
        assert job_runner.is_emr_step_complete(execution_context.log, cluster_id, emr_step_id)

        # FAILED is raised as an EmrError rather than returned.
        with pytest.raises(EmrError) as exc_info:
            job_runner.is_emr_step_complete(execution_context.log, cluster_id, emr_step_id)
        assert 'step failed' in str(exc_info.value)
def test_emr_add_tags_and_describe_cluster(emr_cluster_config):
    """Add tags to a cluster and check they appear in describe_cluster's output."""
    ctx = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)
    cluster_id = runner.run_job_flow(ctx.log, emr_cluster_config)

    new_tags = {"foobar": "v1", "baz": "123"}
    runner.add_tags(ctx.log, new_tags, cluster_id)

    # Tags come back as a list of {"Key": ..., "Value": ...} entries.
    described = runner.describe_cluster(cluster_id)
    reported_tags = described["Cluster"]["Tags"]
    assert {"Key": "baz", "Value": "123"} in reported_tags
    assert {"Key": "foobar", "Value": "v1"} in reported_tags
def test_emr_add_tags_and_describe_cluster(emr_cluster_config):
    """Tag a freshly launched cluster, then verify both tags via describe_cluster."""
    execution_context = create_test_pipeline_execution_context()
    job_runner = EmrJobRunner(region=REGION)
    job_flow_id = job_runner.run_job_flow(execution_context.log, emr_cluster_config)

    job_runner.add_tags(execution_context.log, {'foobar': 'v1', 'baz': '123'}, job_flow_id)

    cluster_description = job_runner.describe_cluster(job_flow_id)['Cluster']
    tag_entries = cluster_description['Tags']

    # Each tag is expected as a {'Key': ..., 'Value': ...} entry in the list.
    assert {'Key': 'baz', 'Value': '123'} in tag_entries
    assert {'Key': 'foobar', 'Value': 'v1'} in tag_entries
def test_pyspark_emr(mock_wait):
    """Execute example_pipe in 'prod' mode against a freshly launched job flow.

    Args:
        mock_wait: Mock injected by a patch decorator that is not visible in
            this block; the test requires it to have been called.
    """
    run_job_flow_args = dict(
        Instances={
            'InstanceCount': 1,
            'KeepJobFlowAliveWhenNoSteps': True,
            'MasterInstanceType': 'c3.medium',
            'Placement': {'AvailabilityZone': 'us-west-1a'},
            'SlaveInstanceType': 'c3.xlarge',
        },
        JobFlowRole='EMR_EC2_DefaultRole',
        LogUri='s3://mybucket/log',
        Name='cluster',
        ServiceRole='EMR_DefaultRole',
        VisibleToAllUsers=True,
    )

    # Doing cluster setup outside of a solid here, because run_job_flow is not yet plumbed through
    # to the pyspark EMR resource.
    job_runner = EmrJobRunner(region='us-west-1')
    context = create_test_pipeline_execution_context()
    cluster_id = job_runner.run_job_flow(context, run_job_flow_args)

    result = execute_pipeline(
        example_pipe,
        environment_dict={
            'solids': {'blah': {'config': {'foo': 'a string', 'bar': 123}}},
            'resources': {
                'pyspark': {
                    'config': {
                        'pipeline_file': __file__,
                        'pipeline_fn_name': 'example_pipe',
                        'cluster_id': cluster_id,
                        'staging_bucket': 'dagster-scratch-80542c2',
                        'region_name': 'us-west-1',
                    }
                }
            },
        },
        run_config=RunConfig(mode='prod'),
    )
    assert result.success

    # `called_once` is not part of the Mock API: reading it just creates a
    # truthy child mock, so the old assertion could never fail. `called` is the
    # real boolean flag recording whether the mock was invoked.
    assert mock_wait.called
def test_emr_wait_for_step(emr_cluster_config):
    """Check wait_for_emr_steps_to_complete for a step that completes and for one that fails.

    describe_step is patched with a stub that reports PENDING on its first
    call, RUNNING on its second, and a configurable final state afterwards.
    """
    context = create_test_pipeline_execution_context()
    emr = EmrJobRunner(region=REGION, check_cluster_every=1)
    cluster_id = emr.run_job_flow(context.log, emr_cluster_config)

    step_name = 'test_step'
    step_cmd = ['ls', '/']
    step_ids = emr.add_job_flow_steps(
        context.log, cluster_id, [emr.construct_step_dict_for_command(step_name, step_cmd)]
    )

    def get_step_dict(step_id, step_state):
        # Build a describe_step-style response for the given id and state.
        return {
            'Step': {
                'Id': step_id,
                'Name': step_name,
                'Config': {'Jar': 'command-runner.jar', 'Properties': {}, 'Args': step_cmd},
                'ActionOnFailure': 'CONTINUE',
                'Status': {
                    'State': step_state,
                    'StateChangeReason': {'Message': 'everything is hosed'},
                    'Timeline': {'StartDateTime': _boto3_now()},
                },
            },
        }

    # Mutable state shared with the stub: the call counter and the state to
    # report from the third call onwards.
    calls = {'num_calls': 0, 'final_state': 'COMPLETED'}

    def new_describe_step(_, cluster_id, step_id):  # pylint: disable=unused-argument
        # Every path returns a canned response. The original trailing
        # `return emr.describe_step(...)` could never execute and was removed.
        calls['num_calls'] += 1
        if calls['num_calls'] == 1:
            return get_step_dict(step_id, 'PENDING')
        if calls['num_calls'] == 2:
            return get_step_dict(step_id, 'RUNNING')
        return get_step_dict(step_id, calls['final_state'])

    # Completing step: waiting returns without raising.
    with mock.patch.object(EmrJobRunner, 'describe_step', new=new_describe_step):
        emr.wait_for_emr_steps_to_complete(context.log, cluster_id, step_ids)

    # Failing step: reset the counter and switch the final state to FAILED.
    calls['num_calls'] = 0
    calls['final_state'] = 'FAILED'
    with pytest.raises(EmrError) as exc_info:
        with mock.patch.object(EmrJobRunner, 'describe_step', new=new_describe_step):
            emr.wait_for_emr_steps_to_complete(context.log, cluster_id, step_ids)
    assert 'step failed' in str(exc_info.value)
def test_pyspark_emr(mock_is_emr_step_complete, mock_read_events, mock_s3_bucket):
    """Run the do-nothing pipeline in 'prod' mode with the EMR pyspark step launcher.

    The events that mock_read_events returns are taken from a 'local' mode
    execution of the same pipeline.
    """
    local_result = execute_pipeline(reconstructable(define_do_nothing_pipe), mode="local")
    mock_read_events.return_value = local_result.events_by_step_key["do_nothing_solid"]

    instances = {
        "InstanceCount": 1,
        "KeepJobFlowAliveWhenNoSteps": True,
        "MasterInstanceType": "c3.medium",
        "Placement": {"AvailabilityZone": "us-west-1a"},
        "SlaveInstanceType": "c3.xlarge",
    }
    run_job_flow_args = {
        "Instances": instances,
        "JobFlowRole": "EMR_EC2_DefaultRole",
        "LogUri": "s3://{bucket}/log".format(bucket=mock_s3_bucket.name),
        "Name": "cluster",
        "ServiceRole": "EMR_DefaultRole",
        "VisibleToAllUsers": True,
    }

    # Doing cluster setup outside of a solid here, because run_job_flow is not yet plumbed through
    # to the pyspark EMR resource.
    job_runner = EmrJobRunner(region="us-west-1")
    ctx = create_test_pipeline_execution_context()
    cluster_id = job_runner.run_job_flow(ctx.log, run_job_flow_args)

    # Overlay the per-test cluster id and staging bucket on the shared base config.
    launcher_config = deep_merge_dicts(
        BASE_EMR_PYSPARK_STEP_LAUNCHER_CONFIG,
        {"cluster_id": cluster_id, "staging_bucket": mock_s3_bucket.name},
    )

    result = execute_pipeline(
        pipeline=reconstructable(define_do_nothing_pipe),
        mode="prod",
        run_config={"resources": {"pyspark_step_launcher": {"config": launcher_config}}},
    )

    assert result.success
    assert mock_is_emr_step_complete.called
def test_pyspark_emr(mock_wait, mock_get_step_events):
    """Run the do-nothing pipeline in 'prod' mode with the EMR pyspark step launcher.

    Args:
        mock_wait: Mock injected by a patch decorator that is not visible in
            this block; the test requires it to have been called.
        mock_get_step_events: Second injected mock, likewise required to have
            been called.
    """
    run_job_flow_args = dict(
        Instances={
            'InstanceCount': 1,
            'KeepJobFlowAliveWhenNoSteps': True,
            'MasterInstanceType': 'c3.medium',
            'Placement': {'AvailabilityZone': 'us-west-1a'},
            'SlaveInstanceType': 'c3.xlarge',
        },
        JobFlowRole='EMR_EC2_DefaultRole',
        LogUri='s3://mybucket/log',
        Name='cluster',
        ServiceRole='EMR_DefaultRole',
        VisibleToAllUsers=True,
    )

    # Doing cluster setup outside of a solid here, because run_job_flow is not yet plumbed through
    # to the pyspark EMR resource.
    job_runner = EmrJobRunner(region='us-west-1')
    context = create_test_pipeline_execution_context()
    cluster_id = job_runner.run_job_flow(context.log, run_job_flow_args)

    pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
        define_do_nothing_pipe
    ).build_pipeline_definition()

    result = execute_pipeline(
        pipeline=pipeline_def,
        mode='prod',
        environment_dict={
            'resources': {
                'pyspark_step_launcher': {
                    'config': deep_merge_dicts(
                        BASE_EMR_PYSPARK_STEP_LAUNCHER_CONFIG, {'cluster_id': cluster_id}
                    ),
                }
            },
        },
    )
    assert result.success

    # `called_once` is not part of the Mock API: reading it just creates a
    # truthy child mock, so the old assertions could never fail. `called` is
    # the real boolean flag recording whether each mock was invoked.
    assert mock_wait.called
    assert mock_get_step_events.called
def test_emr_create_cluster(emr_cluster_config):
    """Launch a job flow and check that the returned cluster id starts with 'j-'."""
    ctx = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)

    # NOTE(review): this variant hands the whole context (not context.log) to
    # run_job_flow -- confirm that matches the runner's signature.
    new_cluster_id = runner.run_job_flow(ctx, emr_cluster_config)

    assert new_cluster_id.startswith('j-')