def test_databricks_wait_for_run(mock_submit_run, databricks_run_config):
    """Wait on a scripted Databricks run that succeeds, then on one that fails.

    ``get_run_state`` on the runner's client is patched so that it reports
    Pending, then Running, then a chosen terminal state. A Terminated/Success
    run must return normally; a Terminated/Failed run must raise
    ``DatabricksError``.
    """
    mock_submit_run.return_value = 1

    context = create_test_pipeline_execution_context()
    runner = DatabricksJobRunner(HOST, TOKEN, poll_interval_sec=0.01)
    task = databricks_run_config.pop("task")
    databricks_run_id = runner.submit_run(databricks_run_config, task)

    def scripted_get_run_state(final_state):
        # Build a fresh replacement for client.get_run_state: the two
        # in-progress states are handed out once each, then the final state
        # is returned on every later call.
        in_progress = [
            DatabricksRunState(DatabricksRunLifeCycleState.Pending, None, None),
            DatabricksRunState(DatabricksRunLifeCycleState.Running, None, None),
        ]

        def get_run_state(_run_id):
            if in_progress:
                return in_progress.pop(0)
            return final_state

        return get_run_state

    finished = DatabricksRunState(
        DatabricksRunLifeCycleState.Terminated,
        DatabricksRunResultState.Success,
        "Finished",
    )
    with mock.patch.object(
        runner.client, "get_run_state", new=scripted_get_run_state(finished)
    ):
        runner.wait_for_run_to_complete(context.log, databricks_run_id)

    failed = DatabricksRunState(
        DatabricksRunLifeCycleState.Terminated,
        DatabricksRunResultState.Failed,
        "Failed",
    )
    with pytest.raises(DatabricksError) as exc_info:
        with mock.patch.object(
            runner.client, "get_run_state", new=scripted_get_run_state(failed)
        ):
            runner.wait_for_run_to_complete(context.log, databricks_run_id)

    assert "Run 1 failed with result state" in str(exc_info.value)
def test_construct_event_record():
    """Check that a logged message converts into a ``LogMessageRecord``.

    A structured logger feeds every message through ``construct_event_record``
    and stores the result; one ``info`` call must yield exactly one record.
    """
    records = []

    def _record_message(logger_message):
        records.append(construct_event_record(logger_message))

    structured_logger = define_structured_logger(
        'some_name', _record_message, level=DEBUG
    )
    context = create_test_pipeline_execution_context(
        loggers=[structured_logger],
        tags={'pipeline': 'some_pipeline'},
    )

    context.log.info('random message')

    assert len(records) == 1
    assert isinstance(records[0], LogMessageRecord)
def test_emr_log_location_for_cluster(emr_cluster_config, mock_s3_bucket):
    """Check the log location reported for a cluster, with and without a LogUri.

    With the fixture config the location is the mock bucket's name plus the
    ``elasticmapreduce/`` prefix; with ``LogUri`` removed an ``EmrError`` is
    expected.
    """
    context = create_test_pipeline_execution_context()
    emr = EmrJobRunner(region=REGION)

    logged_cluster_id = emr.run_job_flow(context.log, emr_cluster_config)
    location = emr.log_location_for_cluster(logged_cluster_id)
    assert location == (mock_s3_bucket.name, "elasticmapreduce/")

    # Should raise when the log URI is missing. Work on a deep copy so the
    # fixture's dict is not mutated.
    config_without_log_uri = copy.deepcopy(emr_cluster_config)
    config_without_log_uri.pop("LogUri")
    unlogged_cluster_id = emr.run_job_flow(context.log, config_without_log_uri)

    with pytest.raises(EmrError) as exc_info:
        emr.log_location_for_cluster(unlogged_cluster_id)

    expected_message = "Log URI not specified, cannot retrieve step execution logs"
    assert expected_message in str(exc_info.value)
def test_emr_log_location_for_cluster(emr_cluster_config):
    """Check the log location reported for a cluster, with and without a LogUri.

    With the fixture config the location is ``('emr-cluster-logs',
    'elasticmapreduce/')``; with ``LogUri`` removed an ``EmrError`` is expected.
    """
    context = create_test_pipeline_execution_context()
    emr = EmrJobRunner(region=REGION)

    logged_cluster_id = emr.run_job_flow(context, emr_cluster_config)
    location = emr.log_location_for_cluster(logged_cluster_id)
    assert location == ('emr-cluster-logs', 'elasticmapreduce/')

    # Should raise when the log URI is missing. Work on a deep copy so the
    # fixture's dict is not mutated.
    config_without_log_uri = copy.deepcopy(emr_cluster_config)
    config_without_log_uri.pop('LogUri')
    unlogged_cluster_id = emr.run_job_flow(context, config_without_log_uri)

    with pytest.raises(EmrError) as exc_info:
        emr.log_location_for_cluster(unlogged_cluster_id)

    expected_message = 'Log URI not specified, cannot retrieve step execution logs'
    assert expected_message in str(exc_info.value)
def test_structured_logger_in_context():
    """Check the structured message produced by logging through the context.

    One ``debug`` call with an extra ``foo`` keyword must reach the callback
    once, carrying the logger name, the level, the keyword, and the original
    message text in ``meta``.
    """
    captured = []

    def _capture(logger_message):
        captured.append(logger_message)

    structured_logger = define_structured_logger('some_name', _capture, level=DEBUG)
    context = create_test_pipeline_execution_context(loggers=[structured_logger])

    context.log.debug('from_context', foo=2)

    assert len(captured) == 1
    logged = captured[0]
    assert logged.name == 'some_name'
    assert logged.level == DEBUG
    assert logged.meta['foo'] == 2
    assert logged.meta['orig_message'] == 'from_context'
def test_pyspark_emr(mock_is_emr_step_complete, mock_read_events):
    """Run the do-nothing pipeline in ``prod`` mode on an EmrJobRunner cluster.

    Both arguments are mocks (presumably injected by patch decorators outside
    this block). ``mock_read_events`` is primed with the step events of a
    local run of the same pipeline, and the test finally checks that
    ``mock_is_emr_step_complete`` was called.
    """
    # Events the mock will hand back: those of a real local-mode execution.
    local_result = execute_pipeline(
        reconstructable(define_do_nothing_pipe), mode="local"
    )
    mock_read_events.return_value = local_result.events_by_step_key[
        "do_nothing_solid.compute"
    ]

    run_job_flow_args = {
        "Instances": {
            "InstanceCount": 1,
            "KeepJobFlowAliveWhenNoSteps": True,
            "MasterInstanceType": "c3.medium",
            "Placement": {"AvailabilityZone": "us-west-1a"},
            "SlaveInstanceType": "c3.xlarge",
        },
        "JobFlowRole": "EMR_EC2_DefaultRole",
        "LogUri": "s3://mybucket/log",
        "Name": "cluster",
        "ServiceRole": "EMR_DefaultRole",
        "VisibleToAllUsers": True,
    }

    # Doing cluster setup outside of a solid here, because run_job_flow is not yet plumbed through
    # to the pyspark EMR resource.
    job_runner = EmrJobRunner(region="us-west-1")
    context = create_test_pipeline_execution_context()
    cluster_id = job_runner.run_job_flow(context.log, run_job_flow_args)

    step_launcher_config = deep_merge_dicts(
        BASE_EMR_PYSPARK_STEP_LAUNCHER_CONFIG, {"cluster_id": cluster_id}
    )
    result = execute_pipeline(
        pipeline=reconstructable(define_do_nothing_pipe),
        mode="prod",
        run_config={
            "resources": {"pyspark_step_launcher": {"config": step_launcher_config}},
        },
    )

    assert result.success
    assert mock_is_emr_step_complete.called
def test_is_emr_step_complete(emr_cluster_config):
    """Drive ``is_emr_step_complete`` through a scripted sequence of step states.

    ``describe_step`` is patched to report PENDING, RUNNING, COMPLETED and
    FAILED in turn. The first two must read as not complete, the third as
    complete, and the fourth must raise ``EmrError``.
    """
    context = create_test_pipeline_execution_context()
    emr = EmrJobRunner(region=REGION, check_cluster_every=1)
    cluster_id = emr.run_job_flow(context.log, emr_cluster_config)

    step_name = "test_step"
    step_cmd = ["ls", "/"]
    step_definition = emr.construct_step_dict_for_command(step_name, step_cmd)
    step_ids = emr.add_job_flow_steps(context.log, cluster_id, [step_definition])

    def get_step_dict(step_id, step_state):
        # Response dict of the shape this test feeds back from describe_step.
        status = {
            "State": step_state,
            "StateChangeReason": {"Message": "everything is hosed"},
            "Timeline": {"StartDateTime": _boto3_now()},
        }
        step = {
            "Id": step_id,
            "Name": step_name,
            "Config": {"Jar": "command-runner.jar", "Properties": {}, "Args": step_cmd},
            "ActionOnFailure": "CONTINUE",
            "Status": status,
        }
        return {"Step": step}

    emr_step_id = step_ids[0]
    scripted_states = ("PENDING", "RUNNING", "COMPLETED", "FAILED")
    describe_step_returns = [
        get_step_dict(emr_step_id, state) for state in scripted_states
    ]

    with mock.patch.object(
        EmrJobRunner, "describe_step", side_effect=describe_step_returns
    ):
        # PENDING, then RUNNING: not complete yet.
        assert not emr.is_emr_step_complete(context.log, cluster_id, emr_step_id)
        assert not emr.is_emr_step_complete(context.log, cluster_id, emr_step_id)
        # COMPLETED
        assert emr.is_emr_step_complete(context.log, cluster_id, emr_step_id)

        # FAILED
        with pytest.raises(EmrError) as exc_info:
            emr.is_emr_step_complete(context.log, cluster_id, emr_step_id)
        assert "step failed" in str(exc_info.value)
def test_pyspark_emr(mock_wait, mock_get_step_events):
    """Run the do-nothing pipeline in ``prod`` mode on an EmrJobRunner cluster.

    Both arguments are mocks (presumably injected by patch decorators outside
    this block). The test passes when the pipeline succeeds and both mocks
    were actually called.
    """
    run_job_flow_args = dict(
        Instances={
            'InstanceCount': 1,
            'KeepJobFlowAliveWhenNoSteps': True,
            'MasterInstanceType': 'c3.medium',
            'Placement': {'AvailabilityZone': 'us-west-1a'},
            'SlaveInstanceType': 'c3.xlarge',
        },
        JobFlowRole='EMR_EC2_DefaultRole',
        LogUri='s3://mybucket/log',
        Name='cluster',
        ServiceRole='EMR_DefaultRole',
        VisibleToAllUsers=True,
    )

    # Doing cluster setup outside of a solid here, because run_job_flow is not yet plumbed through
    # to the pyspark EMR resource.
    job_runner = EmrJobRunner(region='us-west-1')
    context = create_test_pipeline_execution_context()
    cluster_id = job_runner.run_job_flow(context.log, run_job_flow_args)

    pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
        define_do_nothing_pipe
    ).build_pipeline_definition()

    result = execute_pipeline(
        pipeline=pipeline_def,
        mode='prod',
        environment_dict={
            'resources': {
                'pyspark_step_launcher': {
                    'config': deep_merge_dicts(
                        BASE_EMR_PYSPARK_STEP_LAUNCHER_CONFIG, {'cluster_id': cluster_id}
                    ),
                }
            },
        },
    )

    assert result.success
    # `called_once` is not a Mock attribute: it used to auto-create a truthy
    # child mock (so the assert could never fail) and raises AttributeError on
    # newer Pythons. `called` is the real "was this invoked" flag.
    assert mock_wait.called
    assert mock_get_step_events.called
def test_pyspark_emr(mock_wait):
    """Run ``example_pipe`` in ``prod`` mode on an EmrJobRunner cluster.

    ``mock_wait`` is a mock (presumably injected by a patch decorator outside
    this block). The test passes when the pipeline succeeds and the mock was
    actually called.
    """
    run_job_flow_args = dict(
        Instances={
            'InstanceCount': 1,
            'KeepJobFlowAliveWhenNoSteps': True,
            'MasterInstanceType': 'c3.medium',
            'Placement': {'AvailabilityZone': 'us-west-1a'},
            'SlaveInstanceType': 'c3.xlarge',
        },
        JobFlowRole='EMR_EC2_DefaultRole',
        LogUri='s3://mybucket/log',
        Name='cluster',
        ServiceRole='EMR_DefaultRole',
        VisibleToAllUsers=True,
    )

    # Doing cluster setup outside of a solid here, because run_job_flow is not yet plumbed through
    # to the pyspark EMR resource.
    job_runner = EmrJobRunner(region='us-west-1')
    context = create_test_pipeline_execution_context()
    cluster_id = job_runner.run_job_flow(context, run_job_flow_args)

    result = execute_pipeline_with_mode(
        pipeline=example_pipe,
        mode='prod',
        environment_dict={
            'solids': {'blah': {'config': {'foo': 'a string', 'bar': 123}}},
            'resources': {
                'pyspark': {
                    'config': {
                        'pipeline_file': __file__,
                        'pipeline_fn_name': 'example_pipe',
                        'cluster_id': cluster_id,
                        'staging_bucket': 'dagster-scratch-80542c2',
                        'region_name': 'us-west-1',
                    }
                }
            },
        },
    )

    assert result.success
    # `called_once` is not a Mock attribute: it used to auto-create a truthy
    # child mock (so the assert could never fail) and raises AttributeError on
    # newer Pythons. `called` is the real "was this invoked" flag.
    assert mock_wait.called
def test_wait_for_log():
    """Check ``wait_for_log`` on a late-arriving log and on a missing one.

    A background thread uploads a gzipped log half a second after the test
    starts; ``wait_for_log`` must return its decompressed text. Waiting for a
    key that is never written must raise ``EmrError``.
    """
    bucket = 'log_bucket'
    s3 = boto3.resource('s3', region_name=REGION)
    s3.create_bucket(Bucket=bucket)  # pylint: disable=no-member

    def create_log():
        # Delay the upload so the waiter really has to wait for the object.
        time.sleep(0.5)
        buffer = io.BytesIO()
        with gzip.GzipFile(fileobj=buffer, mode='w') as gz:
            gz.write('foo bar'.encode())
        log_object = s3.Object(bucket, 'some_log_file')  # pylint: disable=no-member
        log_object.put(Body=buffer.getvalue())

    uploader = threading.Thread(target=create_log)
    uploader.daemon = True
    uploader.start()

    context = create_test_pipeline_execution_context()
    emr = EmrJobRunner(region=REGION)

    contents = emr.wait_for_log(
        context.log,
        log_bucket=bucket,
        log_key='some_log_file',
        waiter_delay=1,
        waiter_max_attempts=2,
    )
    assert contents == 'foo bar'

    with pytest.raises(EmrError) as exc_info:
        emr.wait_for_log(
            context.log,
            log_bucket=bucket,
            log_key='does_not_exist',
            waiter_delay=1,
            waiter_max_attempts=1,
        )
    assert 'EMR log file did not appear on S3 after waiting' in str(exc_info.value)
def test_emr_create_cluster(emr_cluster_config):
    """Check that ``run_job_flow`` returns a cluster id starting with ``j-``."""
    context = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)

    new_cluster_id = runner.run_job_flow(context, emr_cluster_config)

    assert new_cluster_id.startswith('j-')
def test_noarg_ctor():
    """Check that a context built with no arguments has a UUID-parseable run id."""
    run_id = create_test_pipeline_execution_context().run_id
    # uuid.UUID raises on a malformed string, so parsing is the real check.
    assert uuid.UUID(run_id)
def test_emr_wait_for_step(emr_cluster_config):
    """Wait on a scripted EMR step that completes, then on one that fails.

    ``describe_step`` is patched to report PENDING, then RUNNING, then a
    chosen final state. A COMPLETED step must let
    ``wait_for_steps_to_complete`` return; a FAILED step must raise
    ``EmrError``.
    """
    context = create_test_pipeline_execution_context()
    emr = EmrJobRunner(region=REGION, check_cluster_every=1)

    cluster_id = emr.run_job_flow(context, emr_cluster_config)

    step_name = 'test_step'
    step_cmd = ['ls', '/']
    step_ids = emr.add_job_flow_steps(
        context, cluster_id, [emr.construct_step_dict_for_command(step_name, step_cmd)]
    )

    def get_step_dict(step_id, step_state):
        # Response dict of the shape this test feeds back from describe_step.
        return {
            'Step': {
                'Id': step_id,
                'Name': step_name,
                'Config': {'Jar': 'command-runner.jar', 'Properties': {}, 'Args': step_cmd},
                'ActionOnFailure': 'CONTINUE',
                'Status': {
                    'State': step_state,
                    'StateChangeReason': {'Message': 'everything is hosed'},
                    'Timeline': {'StartDateTime': _boto3_now()},
                },
            },
        }

    # Mutable state shared with the closure below, reset between scenarios.
    calls = {'num_calls': 0, 'final_state': 'COMPLETED'}

    def new_describe_step(_, cluster_id, step_id):
        # Patched onto the class, so the first argument is the runner
        # instance. cluster_id is accepted only to match the signature.
        calls['num_calls'] += 1

        if calls['num_calls'] == 1:
            return get_step_dict(step_id, 'PENDING')
        elif calls['num_calls'] == 2:
            return get_step_dict(step_id, 'RUNNING')
        else:
            return get_step_dict(step_id, calls['final_state'])

    with mock.patch.object(EmrJobRunner, 'describe_step', new=new_describe_step):
        emr.wait_for_steps_to_complete(context, cluster_id, step_ids)

    calls['num_calls'] = 0
    calls['final_state'] = 'FAILED'
    with pytest.raises(EmrError) as exc_info:
        with mock.patch.object(EmrJobRunner, 'describe_step', new=new_describe_step):
            emr.wait_for_steps_to_complete(context, cluster_id, step_ids)
    assert 'step failed' in str(exc_info.value)
def create_sql_alchemy_context_from_engine(engine, *args, **kwargs):
    """Build a test pipeline execution context whose resources wrap ``engine``.

    Args:
        engine: Wrapped in ``SqlAlchemyResource`` and then in
            ``DefaultSqlAlchemyResources``.
        *args: Extra positional arguments forwarded to
            ``create_test_pipeline_execution_context``.
        **kwargs: Extra keyword arguments forwarded to the same call.

    Returns:
        Whatever ``check_supports_sql_alchemy_resource`` returns for the new
        context.
    """
    resources = DefaultSqlAlchemyResources(SqlAlchemyResource(engine))
    # Positionals are written first to match how Python binds them: `*args`
    # is processed before keyword arguments even when it is written after one.
    context = create_test_pipeline_execution_context(
        *args, resources=resources, **kwargs
    )
    return check_supports_sql_alchemy_resource(context)