def test_pipeline_step_key_subset_execution_wrong_step_key_in_step_output_handles():
    pipeline_def = define_addy_pipeline()
    old_run_id = str(uuid.uuid4())
    environment_dict = env_with_fs(
        {'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}}
    )
    result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(run_id=old_run_id),
    )
    assert result.success
    assert result.run_id == old_run_id

    new_run_id = str(uuid.uuid4())

    with pytest.raises(DagsterExecutionStepNotFoundError):
        execute_pipeline(
            pipeline_def,
            environment_dict=environment_dict,
            run_config=RunConfig(
                run_id=new_run_id,
                reexecution_config=ReexecutionConfig(
                    previous_run_id=result.run_id,
                    step_output_handles=[StepOutputHandle('invalid_in_step_output_handles')],
                ),
                step_keys_to_execute=['add_two.compute'],
            ),
        )

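# The reexecution tests here assume a `define_addy_pipeline` fixture and an `env_with_fs`
# helper defined elsewhere in the test module. A minimal sketch, assuming the add_one ->
# add_two shape implied by the solid config and step keys used in these tests; the pipeline
# name and exact type annotations are illustrative, not canonical:
from dagster import (
    DependencyDefinition,
    InputDefinition,
    Int,
    OutputDefinition,
    PipelineDefinition,
    lambda_solid,
)


def env_with_fs_sketch(environment_dict):
    # Layer filesystem storage onto the supplied environment dict.
    return dict(environment_dict, storage={'filesystem': {}})


def define_addy_pipeline_sketch():
    @lambda_solid(input_defs=[InputDefinition('num', Int)], output_def=OutputDefinition(Int))
    def add_one(num):
        return num + 1

    @lambda_solid(input_defs=[InputDefinition('num', Int)], output_def=OutputDefinition(Int))
    def add_two(num):
        return num + 2

    return PipelineDefinition(
        name='execution_plan_reexecution',
        solid_defs=[add_one, add_two],
        dependencies={'add_two': {'num': DependencyDefinition('add_one')}},
    )
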
def test_execution_plan_reexecution_with_in_memory():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    old_run_id = str(uuid.uuid4())
    environment_dict = {'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}}
    result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(run_id=old_run_id),
        instance=instance,
    )
    assert result.success

    ## re-execute add_two
    new_run_id = str(uuid.uuid4())

    in_memory_run_config = RunConfig(
        run_id=new_run_id,
        reexecution_config=ReexecutionConfig(
            previous_run_id=result.run_id,
            step_output_handles=[StepOutputHandle('add_one.compute')],
        ),
    )

    execution_plan = create_execution_plan(
        pipeline_def, environment_dict=environment_dict, run_config=in_memory_run_config
    )

    with pytest.raises(DagsterInvariantViolationError):
        execute_plan(
            execution_plan,
            environment_dict=environment_dict,
            run_config=in_memory_run_config,
            step_keys_to_execute=['add_two.compute'],
            instance=instance,
        )

def _do_execute_plan(graphene_info, execution_params, dauphin_pipeline):
    check.inst_param(graphene_info, 'graphene_info', ResolveInfo)
    check.inst_param(execution_params, 'execution_params', ExecutionParams)

    event_records = []

    run_config = RunConfig(
        run_id=execution_params.execution_metadata.run_id,
        mode=execution_params.mode,
        tags=execution_params.execution_metadata.tags,
        event_callback=event_records.append,
    )

    execution_plan = create_execution_plan(
        pipeline=dauphin_pipeline.get_dagster_pipeline(),
        environment_dict=execution_params.environment_dict,
        run_config=run_config,
    )

    if execution_params.step_keys:
        for step_key in execution_params.step_keys:
            if not execution_plan.has_step(step_key):
                raise UserFacingGraphQLError(
                    graphene_info.schema.type_named('InvalidStepError')(invalid_step_key=step_key)
                )

    execute_plan(
        execution_plan=execution_plan,
        environment_dict=execution_params.environment_dict,
        run_config=run_config,
        step_keys_to_execute=execution_params.step_keys,
    )

    def to_graphql_event(event_record):
        return from_dagster_event_record(
            graphene_info, event_record, dauphin_pipeline, execution_plan
        )

    return graphene_info.schema.type_named('ExecutePlanSuccess')(
        pipeline=dauphin_pipeline,
        has_failures=any(
            er
            for er in event_records
            if er.is_dagster_event
            and er.dagster_event.event_type == DagsterEventType.STEP_FAILURE
        ),
        step_events=list(
            map(to_graphql_event, filter(lambda er: er.is_dagster_event, event_records))
        ),
    )

def test_using_gcs_for_subplan(gcs_bucket):
    pipeline_def = define_inty_pipeline()

    environment_dict = {'storage': {'gcs': {'config': {'gcs_bucket': gcs_bucket}}}}

    run_id = str(uuid.uuid4())

    execution_plan = create_execution_plan(
        pipeline_def, environment_dict=environment_dict, run_config=RunConfig(run_id=run_id)
    )

    assert execution_plan.get_step_by_key('return_one.compute')

    step_keys = ['return_one.compute']
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun.create_empty_run(
        pipeline_def.name, run_id=run_id, environment_dict=environment_dict
    )

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            environment_dict=environment_dict,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, 'return_one.compute')
    with scoped_pipeline_context(pipeline_def, environment_dict, pipeline_run, instance) as context:
        store = GCSIntermediateStore(
            gcs_bucket, run_id, client=context.scoped_resources_builder.build().gcs.client
        )
        assert store.has_intermediate(context, 'return_one.compute')
        assert store.get_intermediate(context, 'return_one.compute', Int).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['add_one.compute']),
            environment_dict=environment_dict,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(add_one_step_events, 'add_one.compute')
    with scoped_pipeline_context(pipeline_def, environment_dict, pipeline_run, instance) as context:
        assert store.has_intermediate(context, 'add_one.compute')
        assert store.get_intermediate(context, 'add_one.compute', Int).obj == 2

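# `get_step_output` is assumed to be a small local test helper. A sketch of one plausible
# implementation (the default output name 'result' is an assumption): scan the emitted step
# events for the STEP_OUTPUT event produced by the given step key.
from dagster import DagsterEventType


def get_step_output_sketch(step_events, step_key, output_name='result'):
    for step_event in step_events:
        if (
            step_event.event_type == DagsterEventType.STEP_OUTPUT
            and step_event.step_key == step_key
            and step_event.step_output_data.output_name == output_name
        ):
            return step_event
    return None
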
def test_multiproc_event_sink():
    pipeline = ExecutionTargetHandle.for_pipeline_python_file(
        __file__, 'define_diamond_pipeline'
    ).build_pipeline_definition()

    sink = InMemoryEventSink()

    result = execute_pipeline(
        pipeline,
        run_config=RunConfig(event_sink=sink),
        environment_dict={'storage': {'filesystem': {}}, 'execution': {'multiprocess': {}}},
    )

    assert result.success
    assert len(result.event_list) == len(sink.dagster_event_records)

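# `define_diamond_pipeline` is referenced by handle here and by name in the in-memory
# storage test further down, which reports the pipeline name as 'diamond_execution'. A
# hedged sketch of that shape: one root solid fanning out to two parallel solids that join
# again, giving the multiprocess executor independent branches. Solid names and the
# arithmetic are illustrative only.
from dagster import DependencyDefinition, InputDefinition, PipelineDefinition, lambda_solid


def define_diamond_pipeline_sketch():
    @lambda_solid
    def emit_one():
        return 1

    @lambda_solid(input_defs=[InputDefinition('num')])
    def add_three(num):
        return num + 3

    @lambda_solid(input_defs=[InputDefinition('num')])
    def mult_three(num):
        return num * 3

    @lambda_solid(input_defs=[InputDefinition('left'), InputDefinition('right')])
    def adder(left, right):
        return left + right

    return PipelineDefinition(
        name='diamond_execution',
        solid_defs=[emit_one, add_three, mult_three, adder],
        dependencies={
            'add_three': {'num': DependencyDefinition('emit_one')},
            'mult_three': {'num': DependencyDefinition('emit_one')},
            'adder': {
                'left': DependencyDefinition('add_three'),
                'right': DependencyDefinition('mult_three'),
            },
        },
    )
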
def do_execute_command(pipeline, env_file_list, mode=None):
    check.inst_param(pipeline, 'pipeline', PipelineDefinition)
    env_file_list = check.opt_list_param(env_file_list, 'env_file_list', of_type=str)

    environment_dict = load_yaml_from_glob_list(env_file_list) if env_file_list else {}

    return execute_pipeline(
        pipeline,
        environment_dict=environment_dict,
        run_config=RunConfig(mode=mode),
        instance=DagsterInstance.get(),
    )

def do_execute_command(pipeline, env_file_list, raise_on_error):
    check.inst_param(pipeline, 'pipeline', PipelineDefinition)
    env_file_list = check.opt_list_param(env_file_list, 'env_file_list', of_type=str)

    environment_dict = load_yaml_from_glob_list(env_file_list) if env_file_list else {}

    return execute_pipeline(
        pipeline,
        environment_dict=environment_dict,
        run_config=RunConfig(
            executor_config=InProcessExecutorConfig(raise_on_error=raise_on_error)
        ),
    )

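# Illustrative call site for the `do_execute_command` variants above. The YAML file names
# and the pipeline constructor are hypothetical; the point is that fragments from the glob
# list are merged by load_yaml_from_glob_list before execution.
def run_from_cli_sketch():
    pipeline = define_addy_pipeline_sketch()  # stand-in for a real pipeline definition
    return do_execute_command(
        pipeline=pipeline,
        env_file_list=['environments/base.yaml', 'environments/local.yaml'],
        raise_on_error=True,
    )
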
def test_ingest_pipeline_fast_filesystem_storage(postgres):
    ingest_config_dict = load_yaml_from_globs(
        config_path('local_base.yaml'),
        config_path('local_fast_ingest.yaml'),
        config_path('filesystem_storage.yaml'),
    )
    ingest_config_dict = enviroment_overrides(ingest_config_dict)
    result_ingest = execute_pipeline(
        ingest_pipeline_def,
        ingest_config_dict,
        run_config=RunConfig(mode='local'),
        instance=DagsterInstance.local_temp(),
    )

    assert result_ingest.success

def test_no_repo_registration_error():
    with pytest.raises(
        DagstermillError,
        match='Error occurred during the execution of Dagstermill solid no_repo_reg',
    ) as exc:
        execute_pipeline(define_no_repo_registration_error_pipeline())

    assert (
        'If Dagstermill solids have outputs that require serialization strategies'
        in exc.value.original_exc_info[1].args[0]
    )

    with exec_for_test(
        define_no_repo_registration_error_pipeline(),
        run_config=RunConfig.nonthrowing_in_process(),
    ) as result:
        assert not result.success

def test_error_notebook():
    with pytest.raises(
        DagstermillError, match='Error occurred during the execution of Dagstermill solid'
    ) as exc:
        with exec_for_test('define_error_pipeline') as result:
            pass

    assert 'Someone set up us the bomb' in exc.value.original_exc_info[1].args[0]

    with exec_for_test(
        'define_error_pipeline', run_config=RunConfig.nonthrowing_in_process()
    ) as result:
        assert not result.success
        assert result.step_event_list[1].event_type.value == 'STEP_MATERIALIZATION'
        assert result.step_event_list[2].event_type.value == 'STEP_FAILURE'

def test_execute_multi_mode_loggers_with_single_logger():
    pipeline_def, foo_logger_captured_results, bar_logger_captured_results = (
        define_multi_mode_with_loggers_pipeline()
    )

    execute_pipeline(
        pipeline_def,
        run_config=RunConfig(mode='foo_mode'),
        environment_dict={'loggers': {'foo': {'config': {'log_level': 'DEBUG'}}}},
    )

    assert not bar_logger_captured_results

    original_messages = parse_captured_results(foo_logger_captured_results)
    assert len([x for x in original_messages if 'Here we are' in x]) == 1

def test_download_csv_locally_pipeline(mocker):
    # Setup download mocks
    mocker.patch('dagster_examples.bay_bikes.solids.requests')
    mocker.patch('dagster_examples.bay_bikes.solids._write_chunks_to_fp')
    mocker.patch('dagster_examples.bay_bikes.solids._unzip_file', side_effect=mock_unzip_csv)

    # execute tests
    result = execute_pipeline_with_preset(
        monthly_bay_bike_etl_pipeline, preset_name='dev', run_config=RunConfig(mode='local')
    )
    assert result.success
    with open('/tmp/test_bucket/key_storage.json') as fp:
        key_storage = json.load(fp)
    assert len(key_storage.items()) == 1

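# `mock_unzip_csv` is the side effect patched in for `_unzip_file` above. A hedged sketch,
# assuming the patched function takes a zip path and a target path and returns the target;
# the column names written here are invented purely so downstream solids have a file to read.
def mock_unzip_csv_sketch(zipfile_path, target):
    with open(target, 'w') as fp:
        fp.write('row_id,count\n1,2\n')
    return target
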
def test_filesystem_run_storage_from_run_config():
    @solid
    def check_run_storage(context):
        assert isinstance(context.get_system_context().run_storage, FileSystemRunStorage)

    pipeline = PipelineDefinition(name='filesystem_run_storage_test', solids=[check_run_storage])

    result = execute_pipeline(
        pipeline, run_config=RunConfig(storage_mode=RunStorageMode.FILESYSTEM)
    )
    assert result.success

    assert os.path.isdir(os.path.join(base_run_directory(), result.run_id))

def test_execution_plan_wrong_invalid_step_key():
    pipeline_def = define_addy_pipeline()

    old_run_id = str(uuid.uuid4())
    environment_dict = {'solids': {'add_one': {'inputs': {'num': {'value': 3}}}}}
    result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        run_config=RunConfig(storage_mode=RunStorageMode.FILESYSTEM, run_id=old_run_id),
    )

    new_run_id = str(uuid.uuid4())

    run_config = RunConfig(
        run_id=new_run_id,
        reexecution_config=ReexecutionConfig(
            previous_run_id=result.run_id,
            step_output_handles=[StepOutputHandle('not_valid.transform')],
        ),
        storage_mode=RunStorageMode.FILESYSTEM,
    )

    execution_plan = create_execution_plan(pipeline_def, environment_dict=environment_dict)

    with pytest.raises(DagsterExecutionStepNotFoundError) as exc_info:
        execute_plan(
            execution_plan,
            environment_dict=environment_dict,
            run_config=run_config,
            step_keys_to_execute=['add_two.transform'],
        )

    assert str(exc_info.value) == (
        'Step not_valid.transform was specified as a step from a previous run. '
        'It does not exist.'
    )

def test_injected_tags():
    called = {}

    @solid
    def check_tags(context):
        assert context.get_tag('foo') == 'bar'
        called['yup'] = True

    pipeline_def = PipelineDefinition(name='injected_run_id', solid_defs=[check_tags])
    result = execute_pipeline(pipeline_def, run_config=RunConfig(tags={'foo': 'bar'}))

    assert result.success
    assert called['yup']

def test_pyspark_emr(mock_wait):
    client = boto3.client('emr', region_name='us-west-1')

    run_job_flow_args = dict(
        Instances={
            'InstanceCount': 1,
            'KeepJobFlowAliveWhenNoSteps': True,
            'MasterInstanceType': 'c3.medium',
            'Placement': {'AvailabilityZone': 'us-west-1a'},
            'SlaveInstanceType': 'c3.xlarge',
        },
        JobFlowRole='EMR_EC2_DefaultRole',
        LogUri='s3://mybucket/log',
        Name='cluster',
        ServiceRole='EMR_DefaultRole',
        VisibleToAllUsers=True,
    )

    job_flow_id = client.run_job_flow(**run_job_flow_args)['JobFlowId']
    result = execute_pipeline(
        example_pipe,
        environment_dict={
            'solids': {'blah': {'config': {'foo': 'a string', 'bar': 123}}},
            'resources': {
                'pyspark': {
                    'config': {
                        'pipeline_file': __file__,
                        'pipeline_fn_name': 'example_pipe',
                        'job_flow_id': job_flow_id,
                        'staging_bucket': 'dagster-scratch-80542c2',
                        'region_name': 'us-west-1',
                    }
                }
            },
        },
        run_config=RunConfig(mode='prod'),
    )
    assert result.success
    # `assert mock_wait.called_once` is always truthy on a Mock; assert the call explicitly.
    mock_wait.assert_called_once()

def _launch_pipeline_execution(graphene_info, execution_params, is_reexecuted=False):
    check.inst_param(graphene_info, 'graphene_info', ResolveInfo)
    check.inst_param(execution_params, 'execution_params', ExecutionParams)

    if is_reexecuted:
        # required fields for re-execution
        execution_metadata = check.inst_param(
            execution_params.execution_metadata, 'execution_metadata', ExecutionMetadata
        )
        check.str_param(execution_metadata.root_run_id, 'root_run_id')
        check.str_param(execution_metadata.parent_run_id, 'parent_run_id')

    error_type = 'RunLauncherNotDefinedError'
    success_type = (
        'LaunchPipelineExecutionSuccess'
        if not is_reexecuted
        else 'LaunchPipelineReexecutionSuccess'
    )

    instance = graphene_info.context.instance
    run_launcher = instance.run_launcher

    if run_launcher is None:
        return graphene_info.schema.type_named(error_type)()

    pipeline_def = get_pipeline_def_from_selector(graphene_info, execution_params.selector)

    get_validated_config(
        graphene_info,
        pipeline_def,
        environment_dict=execution_params.environment_dict,
        mode=execution_params.mode,
    )

    execution_plan = create_execution_plan(
        pipeline_def,
        execution_params.environment_dict,
        run_config=RunConfig(
            mode=execution_params.mode, previous_run_id=execution_params.previous_run_id
        ),
    )

    _check_start_pipeline_execution_errors(graphene_info, execution_params, execution_plan)

    run = instance.launch_run(_create_pipeline_run(instance, pipeline_def, execution_params))

    return graphene_info.schema.type_named(success_type)(
        run=graphene_info.schema.type_named('PipelineRun')(run)
    )

def start_pipeline_execution(graphene_info, execution_params, reexecution_config):
    check.inst_param(graphene_info, 'graphene_info', ResolveInfo)
    check.inst_param(execution_params, 'execution_params', ExecutionParams)
    check.opt_inst_param(reexecution_config, 'reexecution_config', ReexecutionConfig)

    pipeline_run_storage = graphene_info.context.pipeline_runs

    dauphin_pipeline = get_dauphin_pipeline_from_selector(graphene_info, execution_params.selector)

    get_validated_config(
        graphene_info,
        dauphin_pipeline,
        environment_dict=execution_params.environment_dict,
        mode=execution_params.mode,
    )

    execution_plan = create_execution_plan(
        dauphin_pipeline.get_dagster_pipeline(),
        execution_params.environment_dict,
        run_config=RunConfig(mode=execution_params.mode),
    )

    _check_start_pipeline_execution_errors(
        graphene_info, execution_params, execution_plan, reexecution_config
    )

    run = pipeline_run_storage.create_run(
        run_id=execution_params.execution_metadata.run_id
        if execution_params.execution_metadata.run_id
        else make_new_run_id(),
        selector=execution_params.selector,
        env_config=execution_params.environment_dict,
        mode=execution_params.mode,
        reexecution_config=reexecution_config,
        step_keys_to_execute=execution_params.step_keys,
    )
    pipeline_run_storage.add_run(run)

    graphene_info.context.execution_manager.execute_pipeline(
        graphene_info.context.get_handle(),
        dauphin_pipeline.get_dagster_pipeline(),
        run,
        raise_on_error=graphene_info.context.raise_on_error,
    )

    return graphene_info.schema.type_named('StartPipelineExecutionSuccess')(
        run=graphene_info.schema.type_named('PipelineRun')(run)
    )

def test_failure_midstream():
    '''
    A
     \\
       C (fails) = D (skipped)
     //
    B
    '''
    solid_a = create_root_success_solid('A')
    solid_b = create_root_success_solid('B')

    def fail_fn(_context, inputs):
        check.failed('user error')
        return [inputs['A'], inputs['B'], {'C': 'compute_called'}]

    def success_fn(_context, inputs):
        return [inputs['C'], {'D': 'compute_called'}]

    solid_c = single_output_solid(
        name='C',
        input_defs=[InputDefinition(name='A'), InputDefinition(name='B')],
        compute_fn=fail_fn,
        output_def=OutputDefinition(),
    )
    solid_d = single_output_solid(
        name='D',
        input_defs=[InputDefinition(name='C')],
        compute_fn=success_fn,
        output_def=OutputDefinition(),
    )

    pipeline_def = PipelineDefinition(
        solid_defs=[solid_a, solid_b, solid_c, solid_d],
        dependencies={
            'C': {'A': DependencyDefinition(solid_a.name), 'B': DependencyDefinition(solid_b.name)},
            'D': {'C': DependencyDefinition(solid_c.name)},
        },
    )
    pipeline_result = execute_pipeline(pipeline_def, run_config=RunConfig.nonthrowing_in_process())

    assert pipeline_result.result_for_solid('A').success
    assert pipeline_result.result_for_solid('B').success
    assert not pipeline_result.result_for_solid('C').success
    assert pipeline_result.result_for_solid('C').failure_data.error.cls_name == 'CheckError'
    assert not pipeline_result.result_for_solid('D').success
    assert pipeline_result.result_for_solid('D').skipped

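# `create_root_success_solid` and `single_output_solid` are assumed test utilities. Hedged
# sketches consistent with how they are used above: `single_output_solid` wraps a plain
# compute function into a SolidDefinition that yields its return value on the default
# output, and `create_root_success_solid` builds a no-input solid that records that it ran.
from dagster import Output, OutputDefinition, SolidDefinition


def single_output_solid_sketch(name, input_defs, compute_fn, output_def):
    def _compute(context, inputs):
        # Delegate to the user compute function and emit its value on the solid's only output.
        yield Output(compute_fn(context, inputs), output_def.name)

    return SolidDefinition(
        name=name, input_defs=input_defs, compute_fn=_compute, output_defs=[output_def]
    )


def create_root_success_solid_sketch(name):
    def root_fn(_context, _inputs):
        return [{name: 'compute_called'}]

    return single_output_solid_sketch(
        name=name, input_defs=[], compute_fn=root_fn, output_def=OutputDefinition()
    )
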
def test_mem_storage_error_pipeline_multiprocess():
    with pytest.raises(DagsterInvariantViolationError) as exc_info:
        execute_pipeline(
            define_diamond_pipeline(),
            run_config=RunConfig(
                executor_config=MultiprocessExecutorConfig(
                    ExecutionTargetHandle.for_pipeline_fn(define_error_pipeline)
                )
            ),
        )

    assert (
        'While invoking '
        'pipeline diamond_execution. You have attempted to use the '
        'multiprocessing executor while using system storage in_memory '
        'which does not persist intermediates. This means there would '
        'be no way to move data between different processes. Please '
        'configure your pipeline in the storage config section to use '
        'persistent system storage such as the filesystem.'
    ) in str(exc_info.value)

def test_execute_multi_mode_loggers_with_multiple_loggers_single_config():
    pipeline_def, foo_logger_captured_results, bar_logger_captured_results = (
        define_multi_mode_with_loggers_pipeline()
    )

    execute_pipeline(
        pipeline_def,
        run_config=RunConfig(mode='foo_bar_mode'),
        environment_dict={'loggers': {'foo': {'config': {'log_level': 'DEBUG'}}}},
    )

    foo_original_messages = parse_captured_results(foo_logger_captured_results)

    assert len(list(filter(lambda x: x == '"Here we are"', foo_original_messages))) == 1
    assert not bar_logger_captured_results

def test_local():
    result = execute_pipeline(
        example_pipe,
        environment_dict={
            'solids': {'blah': {'config': {'foo': 'a string', 'bar': 123}}},
        },
        run_config=RunConfig(mode='local'),
    )
    assert result.success

def test_using_file_system_for_subplan_invalid_step():
    pipeline = define_inty_pipeline()

    environment_dict = {'storage': {'filesystem': {}}}

    execution_plan = create_execution_plan(pipeline, environment_dict=environment_dict)

    run_id = str(uuid.uuid4())

    with pytest.raises(DagsterExecutionStepNotFoundError):
        execute_plan(
            execution_plan,
            environment_dict=environment_dict,
            run_config=RunConfig(run_id=run_id),
            step_keys_to_execute=['nope'],
        )

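# The subplan tests here and above rely on a `define_inty_pipeline` fixture. A minimal
# sketch of the return_one -> add_one chain that the step keys imply; the real fixture may
# define additional solids, and the pipeline name here is illustrative.
from dagster import (
    DependencyDefinition,
    InputDefinition,
    Int,
    OutputDefinition,
    PipelineDefinition,
    lambda_solid,
)


def define_inty_pipeline_sketch():
    @lambda_solid
    def return_one():
        return 1

    @lambda_solid(input_defs=[InputDefinition('num', Int)], output_def=OutputDefinition(Int))
    def add_one(num):
        return num + 1

    return PipelineDefinition(
        name='inty_pipeline',
        solid_defs=[return_one, add_one],
        dependencies={'add_one': {'num': DependencyDefinition('return_one')}},
    )
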
def test_airline_pipeline_1_warehouse(docker_compose_db):
    warehouse_config_object = load_yaml_from_globs(
        script_relative_path('../../dagster_examples/airline_demo/environments/local_base.yaml'),
        script_relative_path(
            '../../dagster_examples/airline_demo/environments/local_warehouse.yaml'
        ),
    )
    warehouse_config_object = enviroment_overrides(warehouse_config_object)

    result_warehouse = execute_pipeline(
        define_airline_demo_warehouse_pipeline(),
        warehouse_config_object,
        run_config=RunConfig(mode='local'),
    )
    assert result_warehouse.success

def test_dask_cluster():
    result = execute_on_dask(
        ExecutionTargetHandle.for_pipeline_module(
            'dagster_examples.toys.hammer', 'define_hammer_pipeline'
        ),
        env_config={'storage': {'s3': {'s3_bucket': 'dagster-airflow-scratch'}}},
        run_config=RunConfig(storage_mode=RunStorageMode.S3),
        dask_config=DaskConfig(address='%s:8786' % os.getenv('DASK_ADDRESS')),
    )
    assert result.success
    assert result.result_for_solid('total').transformed_value() == 4

def test_error_resource(snapshot):
    result = execute_pipeline(
        resource_error_pipeline,
        environment_dict={'storage': {'filesystem': {}}},
        run_config=RunConfig(executor_config=InProcessExecutorConfig(raise_on_error=False)),
    )

    assert not result.success
    assert len(result.event_list) == 1

    init_failure_event = result.event_list[0]
    assert init_failure_event.event_type_value == 'PIPELINE_INIT_FAILURE'
    snapshot.assert_match(init_failure_event.message)

def test_s3_object_store_with_composite_type_storage_plugin():
    run_id = str(uuid.uuid4())

    # FIXME need a dedicated test bucket
    object_store = S3ObjectStore(
        run_id=run_id,
        s3_bucket='dagster-airflow-scratch',
        types_to_register={String.inst(): FancyStringS3TypeStoragePlugin},
    )

    with yield_pipeline_execution_context(
        PipelineDefinition([]), {}, RunConfig(run_id=run_id)
    ) as context:
        with pytest.raises(check.NotImplementedCheckError):
            object_store.set_value(
                ['hello'], context, resolve_to_runtime_type(List_(String_)), ['obj_name']
            )

def _create_pipeline_run(instance, pipeline, execution_params):
    step_keys_to_execute = execution_params.step_keys
    if not execution_params.step_keys and execution_params.previous_run_id:
        execution_plan = create_execution_plan(
            pipeline,
            execution_params.environment_dict,
            run_config=RunConfig(
                mode=execution_params.mode,
                previous_run_id=execution_params.previous_run_id,
                tags=execution_params.execution_metadata.tags,
            ),
        )
        step_keys_to_execute = get_retry_steps_from_execution_plan(instance, execution_plan)
    return pipeline_run_from_execution_params(execution_params, step_keys_to_execute)

def test_using_file_system_for_subplan_missing_input():
    pipeline = define_inty_pipeline()
    environment_dict = {'storage': {'filesystem': {}}}

    execution_plan = create_execution_plan(pipeline, environment_dict=environment_dict)

    run_id = str(uuid.uuid4())

    with pytest.raises(DagsterStepOutputNotFoundError):
        execute_plan(
            execution_plan,
            DagsterInstance.ephemeral(),
            environment_dict=environment_dict,
            run_config=RunConfig(run_id=run_id),
            step_keys_to_execute=['add_one.compute'],
        )

def get_pipeline_run_observable(graphene_info, run_id, after=None):
    check.inst_param(graphene_info, 'graphene_info', ResolveInfo)
    check.str_param(run_id, 'run_id')
    check.opt_int_param(after, 'after')
    instance = graphene_info.context.instance
    run = instance.get_run(run_id)

    if not run:

        def _get_error_observable(observer):
            observer.on_next(
                graphene_info.schema.type_named('PipelineRunLogsSubscriptionFailure')(
                    missingRunId=run_id, message='Could not load run with id {}'.format(run_id)
                )
            )

        return Observable.create(_get_error_observable)  # pylint: disable=E1101

    if not instance.can_watch_events:

        def _get_error_observable(observer):
            observer.on_next(
                graphene_info.schema.type_named('PipelineRunLogsSubscriptionFailure')(
                    message='Event log storage on current DagsterInstance is not watchable.'
                )
            )

        return Observable.create(_get_error_observable)  # pylint: disable=E1101

    pipeline = get_dauphin_pipeline_from_selector(graphene_info, run.selector)
    execution_plan = create_execution_plan(
        pipeline.get_dagster_pipeline(), run.environment_dict, RunConfig(mode=run.mode)
    )

    # pylint: disable=E1101
    return Observable.create(
        PipelineRunObservableSubscribe(instance, run_id, after_cursor=after)
    ).map(
        lambda events: graphene_info.schema.type_named('PipelineRunLogsSubscriptionSuccess')(
            runId=run_id,
            messages=[
                from_event_record(graphene_info, event, pipeline, execution_plan)
                for event in events
            ],
        )
    )
