def test_composite_execute():
    result = execute_pipeline(
        ExecutionTargetHandle.for_pipeline_python_file(
            __file__, 'dask_composite_pipeline'
        ).build_pipeline_definition(),
        environment_dict={
            'storage': {'filesystem': {}},
            'execution': {'dask': {'config': {'timeout': 30}}},
        },
    )
    assert result.success

def test_execute_on_dask():
    result = execute_pipeline(
        ExecutionTargetHandle.for_pipeline_python_file(
            __file__, 'dask_engine_pipeline'
        ).build_pipeline_definition(),
        environment_dict={
            'storage': {'filesystem': {}},
            'execution': {'dask': {'config': {'timeout': 30}}},
        },
    )
    assert result.result_for_solid('simple').output_value() == 1

def test_solid_subset():
    pipe = ExecutionTargetHandle.for_pipeline_python_file(
        __file__, 'define_diamond_pipeline'
    ).build_pipeline_definition()
    result = execute_pipeline(pipe, preset='just_adder', instance=DagsterInstance.local_temp())
    assert result.success
    assert result.result_for_solid('adder').output_value() == 2

def test_diamond_multi_execution():
    pipe = ExecutionTargetHandle.for_pipeline_python_file(
        __file__, 'define_diamond_pipeline'
    ).build_pipeline_definition()
    result = execute_pipeline(
        pipe,
        environment_dict={'storage': {'filesystem': {}}, 'execution': {'multiprocess': {}}},
        instance=DagsterInstance.local_temp(),
    )
    assert result.success
    assert result.result_for_solid('adder').output_value() == 11

def execute_eagerly_on_celery(pipeline_name):
    with seven.TemporaryDirectory() as tempdir:
        result = execute_pipeline(
            ExecutionTargetHandle.for_pipeline_python_file(
                __file__, pipeline_name
            ).build_pipeline_definition(),
            environment_dict={
                'storage': {'filesystem': {'config': {'base_dir': tempdir}}},
                'execution': {
                    'celery': {'config': {'config_source': {'task_always_eager': True}}}
                },
            },
            instance=DagsterInstance.local_temp(tempdir=tempdir),
        )
        yield result

def execute_pipeline_on_celery(pipeline_name):
    with seven.TemporaryDirectory() as tempdir:
        handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, pipeline_name)
        pipeline_def = handle.build_pipeline_definition()
        instance = DagsterInstance.local_temp(tempdir=tempdir)
        result = execute_pipeline(
            pipeline_def,
            environment_dict={
                'storage': {'filesystem': {'config': {'base_dir': tempdir}}},
                'execution': {'celery': {}},
            },
            instance=instance,
        )
        yield result

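# The two helpers above yield their result from inside a TemporaryDirectory
# block, which suggests they are wrapped with contextlib.contextmanager at
# their real definition sites and consumed roughly like this (a sketch; the
# pipeline name is a hypothetical placeholder):
#
#   with execute_pipeline_on_celery('my_pipeline') as result:
#       assert result.success
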
def test_multiprocessing_execution_for_composite_solid():
    environment_dict = {
        'solids': {
            'composite_with_nested_config_solid': {
                'solids': {'node_a': {'config': {'foo': 'baz'}}, 'node_b': {'config': {'bar': 3}}}
            }
        }
    }

    run_id = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'composite_pipeline')
    pipeline_run = InMemoryPipelineRun(
        run_id,
        ExecutionSelector('nonce'),
        environment_dict,
        mode='default',
        reexecution_config=None,
        step_keys_to_execute=None,
    )
    execution_manager = MultiprocessingExecutionManager()
    execution_manager.execute_pipeline(
        handle, composite_pipeline, pipeline_run, raise_on_error=False
    )
    execution_manager.join()
    assert pipeline_run.status == PipelineRunStatus.SUCCESS

    # run the same pipeline again, this time through the multiprocess engine
    # with filesystem storage
    environment_dict = {
        'solids': {
            'composite_with_nested_config_solid': {
                'solids': {'node_a': {'config': {'foo': 'baz'}}, 'node_b': {'config': {'bar': 3}}}
            }
        },
        'execution': {'multiprocess': {}},
        'storage': {'filesystem': {}},
    }

    run_id = make_new_run_id()
    pipeline_run = InMemoryPipelineRun(
        run_id,
        ExecutionSelector('nonce'),
        environment_dict,
        mode='default',
        reexecution_config=None,
        step_keys_to_execute=None,
    )
    execution_manager = MultiprocessingExecutionManager()
    execution_manager.execute_pipeline(
        handle, composite_pipeline, pipeline_run, raise_on_error=False
    )
    execution_manager.join()
    # presumably the second run is also expected to succeed
    assert pipeline_run.status == PipelineRunStatus.SUCCESS

def test_running():
    run_id = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'passing_pipeline')
    environment_dict = {
        'solids': {'sum_solid': {'inputs': {'num': script_relative_path('data/num.csv')}}}
    }
    selector = ExecutionSelector('csv_hello_world')
    instance = DagsterInstance.local_temp()
    pipeline_run = instance.create_run(
        PipelineRun(
            pipeline_name=passing_pipeline.name,
            run_id=run_id,
            selector=selector,
            environment_dict=environment_dict,
            mode='default',
            reexecution_config=None,
            step_keys_to_execute=None,
            tags=None,
            status=PipelineRunStatus.NOT_STARTED,
        )
    )
    execution_manager = SubprocessExecutionManager(instance)
    execution_manager.execute_pipeline(
        handle, passing_pipeline, pipeline_run, instance, raise_on_error=False
    )
    execution_manager.join()

    assert instance.get_run(run_id).status == PipelineRunStatus.SUCCESS

    events = instance.all_logs(run_id)
    assert events

    process_start_events = get_events_of_type(events, DagsterEventType.PIPELINE_PROCESS_START)
    assert len(process_start_events) == 1

    process_started_events = get_events_of_type(events, DagsterEventType.PIPELINE_PROCESS_STARTED)
    assert len(process_started_events) == 1

    process_exited_events = get_events_of_type(events, DagsterEventType.PIPELINE_PROCESS_EXITED)
    assert len(process_exited_events) == 1

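# `get_events_of_type` is used by several tests here but is not defined in this
# excerpt. A minimal sketch of what it presumably does, assuming the log
# records expose `is_dagster_event` and a nested `dagster_event.event_type`:
def get_events_of_type(events, event_type):
    # keep only the structured Dagster events of the requested type
    return [
        event
        for event in events
        if event.is_dagster_event and event.dagster_event.event_type == event_type
    ]
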
def test_bad_broker():
    with pytest.raises(check.CheckError) as exc_info:
        event_stream = execute_pipeline_iterator(
            ExecutionTargetHandle.for_pipeline_python_file(
                __file__, 'test_diamond_pipeline'
            ).build_pipeline_definition(),
            environment_dict={
                'storage': {'filesystem': {}},
                'execution': {'celery': {'config': {'broker': 'notlocal.bad'}}},
            },
            instance=DagsterInstance.local_temp(),
        )
        list(event_stream)
    assert 'Must use S3 or GCS storage with non-local Celery' in str(exc_info.value)

def test_two_runs_running():
    run_id_one = make_new_run_id()
    run_id_two = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'infinite_loop_pipeline')

    with safe_tempfile_path() as file_one, safe_tempfile_path() as file_two:
        instance = DagsterInstance.local_temp()

        execution_manager = SubprocessExecutionManager(instance)

        pipeline_run_one = instance.create_run(
            PipelineRun.create_empty_run(
                pipeline_name=infinite_loop_pipeline.name,
                run_id=run_id_one,
                environment_dict={'solids': {'loop': {'config': {'file': file_one}}}},
            )
        )
        execution_manager.execute_pipeline(
            handle, infinite_loop_pipeline, pipeline_run_one, instance
        )

        pipeline_run_two = instance.create_run(
            PipelineRun.create_empty_run(
                pipeline_name=infinite_loop_pipeline.name,
                run_id=run_id_two,
                environment_dict={'solids': {'loop': {'config': {'file': file_two}}}},
            )
        )
        execution_manager.execute_pipeline(
            handle, infinite_loop_pipeline, pipeline_run_two, instance
        )

        # ensure both runs have begun execution
        while not (os.path.exists(file_one) and os.path.exists(file_two)):
            time.sleep(0.1)

        assert execution_manager.is_process_running(run_id_one)
        assert execution_manager.is_process_running(run_id_two)

        assert execution_manager.terminate(run_id_one)

        assert not execution_manager.is_process_running(run_id_one)
        assert execution_manager.is_process_running(run_id_two)

        assert execution_manager.terminate(run_id_two)

        assert not execution_manager.is_process_running(run_id_one)
        assert not execution_manager.is_process_running(run_id_two)

def test_multiproc_event_sink():
    pipeline = ExecutionTargetHandle.for_pipeline_python_file(
        __file__, 'define_diamond_pipeline'
    ).build_pipeline_definition()

    sink = InMemoryEventSink()

    result = execute_pipeline(
        pipeline,
        run_config=RunConfig(event_sink=sink),
        environment_dict={'storage': {'filesystem': {}}, 'execution': {'multiprocess': {}}},
    )

    assert result.success
    assert len(result.event_list) == len(sink.dagster_event_records)

def test_pandas_dask():
    environment_dict = {
        'solids': {
            'pandas_solid': {
                'inputs': {'df': {'csv': {'path': file_relative_path(__file__, 'ex.csv')}}}
            }
        }
    }

    result = execute_on_dask(
        ExecutionTargetHandle.for_pipeline_python_file(__file__, pandas_pipeline.name),
        env_config={'storage': {'filesystem': {}}, **environment_dict},
        dask_config=DaskConfig(timeout=30),
    )

    assert result.success

def test_priorities_mp():
    pipe = ExecutionTargetHandle.for_pipeline_python_file(
        __file__, 'priority_test'
    ).build_pipeline_definition()
    result = execute_pipeline(
        pipe,
        {
            'execution': {'multiprocess': {'config': {'max_concurrent': 1}}},
            'storage': {'filesystem': {}},
        },
        instance=DagsterInstance.local_temp(),
    )
    assert result.success
    assert [
        str(event.solid_handle) for event in result.step_event_list if event.is_step_success
    ] == ['high', 'high_2', 'none', 'none_2', 'low', 'low_2']

def test_engine_error():
    with pytest.raises(DagsterSubprocessError):
        with seven.TemporaryDirectory() as tempdir:
            storage = os.path.join(tempdir, 'flakey_storage')
            execute_pipeline(
                ExecutionTargetHandle.for_pipeline_python_file(
                    __file__, 'engine_error'
                ).build_pipeline_definition(),
                environment_dict={
                    'storage': {'filesystem': {'config': {'base_dir': storage}}},
                    'execution': {
                        'celery': {'config': {'config_source': {'task_always_eager': True}}}
                    },
                    'solids': {'destroy': {'config': storage}},
                },
                instance=DagsterInstance.local_temp(tempdir=tempdir),
            )

def test_separate_sub_dags():
    pipe = ExecutionTargetHandle.for_pipeline_python_file(
        __file__, 'define_subdag_pipeline'
    ).build_pipeline_definition()

    with seven.TemporaryDirectory() as tempdir:
        file_one = os.path.join(tempdir, 'foo_one')
        file_two = os.path.join(tempdir, 'foo_two')
        file_three = os.path.join(tempdir, 'foo_three')

        result = execute_pipeline(
            pipe,
            environment_dict={
                'storage': {'filesystem': {}},
                'execution': {'multiprocess': {'config': {'max_concurrent': 4}}},
                'solids': {
                    'waiter': {'config': file_three},
                    'counter_1': {'config': file_one},
                    'counter_2': {'config': file_two},
                    'counter_3': {'config': file_three},
                },
            },
            instance=DagsterInstance.local_temp(),
        )

        assert result.success
        # ensure that the counters all completed before the waiter
        assert [
            str(event.solid_handle) for event in result.step_event_list if event.is_step_success
        ] == ['counter_1', 'counter_2', 'counter_3', 'waiter']

def test_diamond_multi_execution():
    pipeline = ExecutionTargetHandle.for_pipeline_python_file(
        __file__, 'define_diamond_pipeline'
    ).build_pipeline_definition()
    result = execute_pipeline(
        pipeline,
        environment_dict={'storage': {'filesystem': {}}, 'execution': {'multiprocess': {}}},
    )
    assert result.success
    assert result.result_for_solid('adder').output_value() == 11

    pids_by_solid = {}
    for solid in pipeline.solids:
        pids_by_solid[solid.name] = compute_event(result, solid.name).logging_tags['pid']

    # guarantee that all solids ran in their own process
    assert len(set(pids_by_solid.values())) == len(pipeline.solids)

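# `compute_event` is not defined in this excerpt. A plausible sketch: fetch the
# compute-step event for the named top-level solid so its logging tags
# (including the worker pid) can be inspected. The `compute_step_events`
# attribute name is an assumption.
def compute_event(result, solid_name):
    # first compute-step event recorded for the solid's execution
    return result.result_for_solid(solid_name).compute_step_events[0]
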
def test_execute_hammer_through_dagit():
    handle = ExecutionTargetHandle.for_pipeline_python_file(
        file_relative_path(__file__, '../../../../examples/dagster_examples/toys/hammer.py'),
        'hammer_pipeline',
    )
    instance = DagsterInstance.local_temp()

    execution_manager = SubprocessExecutionManager(instance)

    context = DagsterGraphQLContext(
        handle=handle, execution_manager=execution_manager, instance=instance
    )

    executor = SyncExecutor()

    variables = {
        'executionParams': {
            'environmentConfigData': {'storage': {'filesystem': {}}, 'execution': {'dask': {}}},
            'selector': {'name': handle.build_pipeline_definition().name},
            'mode': 'default',
        }
    }

    start_pipeline_result = graphql(
        request_string=START_PIPELINE_EXECUTION_MUTATION,
        schema=create_schema(),
        context=context,
        variables=variables,
        executor=executor,
    )

    run_id = start_pipeline_result.data['startPipelineExecution']['run']['runId']

    context.execution_manager.join()

    subscription = execute_dagster_graphql(context, SUBSCRIPTION_QUERY, variables={'runId': run_id})

    subscribe_results = []
    subscription.subscribe(subscribe_results.append)

    messages = [x['__typename'] for x in subscribe_results[0].data['pipelineRunLogs']['messages']]

    assert 'PipelineStartEvent' in messages
    assert 'PipelineSuccessEvent' in messages

def test_step_retry(environment):
    with seven.TemporaryDirectory() as tempdir:
        env = dict(environment)
        env['solids'] = {'fail_first_time': {'config': tempdir}}
        result = execute_pipeline(
            ExecutionTargetHandle.for_pipeline_python_file(
                __file__, 'define_step_retry_pipeline'
            ).build_pipeline_definition(),
            environment_dict=env,
            instance=DagsterInstance.local_temp(),
        )
        assert result.success
        events = defaultdict(list)
        for ev in result.event_list:
            events[ev.event_type].append(ev)
        assert len(events[DagsterEventType.STEP_START]) == 1
        assert len(events[DagsterEventType.STEP_UP_FOR_RETRY]) == 1
        assert len(events[DagsterEventType.STEP_RESTARTED]) == 1
        assert len(events[DagsterEventType.STEP_SUCCESS]) == 1

def test_failing():
    run_id = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'failing_pipeline')
    env_config = {
        'solids': {'sum_solid': {'inputs': {'num': script_relative_path('data/num.csv')}}}
    }
    selector = ExecutionSelector('csv_hello_world')
    pipeline_run = InMemoryPipelineRun(
        run_id,
        selector,
        env_config,
        mode='default',
        reexecution_config=None,
        step_keys_to_execute=None,
    )
    execution_manager = MultiprocessingExecutionManager()
    execution_manager.execute_pipeline(handle, failing_pipeline, pipeline_run, raise_on_error=False)
    execution_manager.join()
    assert pipeline_run.status == PipelineRunStatus.FAILURE
    assert pipeline_run.all_logs()

def test_running():
    run_id = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'passing_pipeline')
    env_config = {
        'solids': {'sum_solid': {'inputs': {'num': script_relative_path('data/num.csv')}}}
    }
    selector = ExecutionSelector('csv_hello_world')
    run_storage = InMemoryRunStorage()
    pipeline_run = run_storage.create_run(
        pipeline_name=passing_pipeline.name,
        run_id=run_id,
        selector=selector,
        env_config=env_config,
        mode='default',
        reexecution_config=None,
        step_keys_to_execute=None,
    )
    execution_manager = MultiprocessingExecutionManager()
    execution_manager.execute_pipeline(handle, passing_pipeline, pipeline_run, raise_on_error=False)
    execution_manager.join()

    assert pipeline_run.status == PipelineRunStatus.SUCCESS

    events = pipeline_run.all_logs()
    assert events

    process_start_events = get_events_of_type(events, DagsterEventType.PIPELINE_PROCESS_START)
    assert len(process_start_events) == 1

    process_started_events = get_events_of_type(events, DagsterEventType.PIPELINE_PROCESS_STARTED)
    assert len(process_started_events) == 1

def test_execution_crash():
    run_id = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'crashy_pipeline')
    environment_dict = {
        'solids': {'sum_solid': {'inputs': {'num': script_relative_path('data/num.csv')}}}
    }
    selector = ExecutionSelector('csv_hello_world')
    instance = DagsterInstance.local_temp()
    pipeline_run = instance.create_run(
        PipelineRun(
            pipeline_name=crashy_pipeline.name,
            run_id=run_id,
            selector=selector,
            environment_dict=environment_dict,
            mode='default',
            reexecution_config=None,
            step_keys_to_execute=None,
            tags=None,
            status=PipelineRunStatus.NOT_STARTED,
        )
    )
    execution_manager = SubprocessExecutionManager(instance)
    execution_manager.execute_pipeline(
        handle, crashy_pipeline, pipeline_run, instance, raise_on_error=False
    )
    execution_manager.join()
    assert instance.get_run(run_id).status == PipelineRunStatus.FAILURE
    last_log = instance.all_logs(run_id)[-1]
    assert last_log.message.startswith(
        'Exception: Pipeline execution process for {run_id} unexpectedly exited\n'.format(
            run_id=run_id
        )
    )

def test_multiple_outputs_only_emit_one_multiproc():
    pipe = ExecutionTargetHandle.for_pipeline_python_file(
        __file__, 'define_multi_out'
    ).build_pipeline_definition()
    result = execute_pipeline(
        pipe,
        environment_dict={'storage': {'filesystem': {}}, 'execution': {'multiprocess': {}}},
        instance=DagsterInstance.local_temp(),
    )
    assert result.success

    solid_result = result.result_for_solid('multiple_outputs')
    assert set(solid_result.output_values.keys()) == set(['output_one'])

    with pytest.raises(
        DagsterInvariantViolationError,
        match="Output 'not_defined' not defined in solid 'multiple_outputs'",
    ):
        solid_result.output_value('not_defined')

    with pytest.raises(DagsterInvariantViolationError, match='Did not find result output_two'):
        solid_result.output_value('output_two')

    with pytest.raises(
        DagsterInvariantViolationError,
        match=re.escape(
            'Tried to get result for solid \'not_present\' in '
            '\'multiple_outputs_only_emit_one_pipeline\'. No such top level solid.'
        ),
    ):
        result.result_for_solid('not_present')

    assert result.result_for_solid('downstream_two').skipped

def test_pandas_dask():
    environment_dict = {
        'solids': {
            'pandas_solid': {
                'inputs': {'df': {'csv': {'path': file_relative_path(__file__, 'ex.csv')}}}
            }
        }
    }

    result = execute_pipeline(
        ExecutionTargetHandle.for_pipeline_python_file(
            __file__, pandas_pipeline.name
        ).build_pipeline_definition(),
        environment_dict={
            'storage': {'filesystem': {}},
            'execution': {'dask': {'config': {'timeout': 30}}},
            **environment_dict,
        },
    )

    assert result.success

def test_separate_sub_dags():
    pipe = ExecutionTargetHandle.for_pipeline_python_file(
        __file__, 'define_subdag_pipeline'
    ).build_pipeline_definition()

    with safe_tempfile_path() as filename:
        result = execute_pipeline(
            pipe,
            environment_dict={
                'storage': {'filesystem': {}},
                'execution': {'multiprocess': {'config': {'max_concurrent': 2}}},
                'solids': {'waiter': {'config': filename}, 'writer': {'config': filename}},
            },
            instance=DagsterInstance.local_temp(),
        )

        assert result.success

        # ensure that the chain of noop -> noop -> noop -> writer is not blocked by waiter
        order = [
            str(event.solid_handle) for event in result.step_event_list if event.is_step_success
        ]

        # the writer and waiter may finish in either order, so only check the preceding chain
        assert order[0:3] == ['noop_1', 'noop_2', 'noop_3']

def test_has_run_query_and_terminate():
    run_id_one = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'infinite_loop_pipeline')

    instance = DagsterInstance.local_temp()

    with get_temp_file_location() as path:
        pipeline_run = instance.create_run(
            PipelineRun.create_empty_run(
                pipeline_name=infinite_loop_pipeline.name,
                run_id=run_id_one,
                environment_dict={'solids': {'loop': {'config': {'file': path}}}},
            )
        )
        execution_manager = SubprocessExecutionManager(instance)
        execution_manager.execute_pipeline(
            handle, infinite_loop_pipeline, pipeline_run, instance, raise_on_error=False
        )

        while not os.path.exists(path):
            time.sleep(0.1)

        assert os.path.exists(path)

        assert execution_manager.is_process_running(run_id_one)
        assert execution_manager.terminate(run_id_one)
        assert not execution_manager.is_process_running(run_id_one)
        assert not execution_manager.terminate(run_id_one)

    assert not os.path.exists(path)

def test_running():
    run_id = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'passing_pipeline')
    environment_dict = {
        'solids': {'sum_solid': {'inputs': {'num': file_relative_path(__file__, 'data/num.csv')}}}
    }
    selector = ExecutionSelector('csv_hello_world')
    instance = DagsterInstance.local_temp()
    pipeline_run = instance.create_run(
        PipelineRun(
            pipeline_name=passing_pipeline.name,
            run_id=run_id,
            selector=selector,
            environment_dict=environment_dict,
            mode='default',
            step_keys_to_execute=None,
            tags=None,
            status=PipelineRunStatus.NOT_STARTED,
        )
    )
    execution_manager = SubprocessExecutionManager(instance)
    execution_manager.execute_pipeline(handle, passing_pipeline, pipeline_run, instance)
    execution_manager.join()

    assert instance.get_run_by_id(run_id).status == PipelineRunStatus.SUCCESS

    events = instance.all_logs(run_id)
    assert events

    engine_events = get_events_of_type(events, DagsterEventType.ENGINE_EVENT)
    # starting, started, exit
    assert len([ev for ev in engine_events if 'SubprocessExecutionManager' in ev.message]) == 3

def test_bad_broker():
    event_stream = execute_pipeline_iterator(
        ExecutionTargetHandle.for_pipeline_python_file(
            __file__, 'test_diamond_pipeline'
        ).build_pipeline_definition(),
        environment_dict={
            'storage': {'filesystem': {}},
            'execution': {
                'celery': {'config': {'config_source': {'broker_url': '*****@*****.**'}}}
            },
        },
        instance=DagsterInstance.local_temp(),
    )

    # ensure an engine event with an error is yielded if we can't connect to the broker
    saw_engine_error = False
    try:
        for event in event_stream:
            if event.is_engine_event:
                saw_engine_error = bool(event.engine_event_data.error)
    except Exception:  # pylint: disable=broad-except
        pass

    assert saw_engine_error

def execute_pipeline_on_celery(tempdir, pipeline_name, tags=None):
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, pipeline_name)
    pipeline_def = handle.build_pipeline_definition()
    instance = DagsterInstance.local_temp(tempdir=tempdir)
    return execute_pipeline(
        pipeline_def,
        environment_dict={
            'storage': {'filesystem': {'config': {'base_dir': tempdir}}},
            'execution': {'celery': {}},
        },
        instance=instance,
        run_config=RunConfig(tags=tags),
    )

def test_interrupt_multiproc():
    with seven.TemporaryDirectory() as tempdir:
        file_1 = os.path.join(tempdir, 'file_1')
        file_2 = os.path.join(tempdir, 'file_2')
        file_3 = os.path.join(tempdir, 'file_3')
        file_4 = os.path.join(tempdir, 'file_4')

        # launch a thread that waits until all the files are written to launch an interrupt
        Thread(target=_send_kbd_int, args=([file_1, file_2, file_3, file_4],)).start()

        results = []
        try:
            # launch a pipeline that writes a file and loops infinitely;
            # the next time the launched thread wakes up it will send a keyboard interrupt
            for result in execute_pipeline_iterator(
                ExecutionTargetHandle.for_pipeline_python_file(
                    __file__, 'write_files_pipeline'
                ).build_pipeline_definition(),
                environment_dict={
                    'solids': {
                        'write_1': {'config': {'tempfile': file_1}},
                        'write_2': {'config': {'tempfile': file_2}},
                        'write_3': {'config': {'tempfile': file_3}},
                        'write_4': {'config': {'tempfile': file_4}},
                    },
                    'execution': {'multiprocess': {'config': {'max_concurrent': 4}}},
                    'storage': {'filesystem': {}},
                },
                instance=DagsterInstance.local_temp(tempdir=tempdir),
            ):
                results.append(result)
            assert False  # should never reach
        except (DagsterSubprocessError, KeyboardInterrupt):
            pass

        assert [result.event_type for result in results].count(DagsterEventType.STEP_FAILURE) == 4
        assert DagsterEventType.PIPELINE_FAILURE in [result.event_type for result in results]

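# `_send_kbd_int` is the thread target used above but is not shown in this
# excerpt. A minimal sketch, assuming it polls for the sentinel files written
# by the solids and then interrupts the main thread:
def _send_kbd_int(temp_files):
    import _thread  # stdlib; local import since this file has no import block

    # wait until every solid has started and written its sentinel file
    while not all(os.path.exists(temp_file) for temp_file in temp_files):
        time.sleep(0.1)
    # raises KeyboardInterrupt in the main thread
    _thread.interrupt_main()
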
def test_failing():
    run_id = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'failing_pipeline')
    environment_dict = {
        'solids': {'sum_solid': {'inputs': {'num': script_relative_path('data/num.csv')}}}
    }
    selector = ExecutionSelector('csv_hello_world')
    instance = DagsterInstance.local_temp()
    pipeline_run = instance.create_run(
        PipelineRun(
            pipeline_name=failing_pipeline.name,
            run_id=run_id,
            selector=selector,
            environment_dict=environment_dict,
            mode='default',
            reexecution_config=None,
            step_keys_to_execute=None,
            tags=None,
            status=PipelineRunStatus.NOT_STARTED,
        )
    )
    execution_manager = MultiprocessingExecutionManager()
    execution_manager.execute_pipeline(
        handle, failing_pipeline, pipeline_run, instance, raise_on_error=False
    )
    execution_manager.join()
    assert instance.get_run(run_id).status == PipelineRunStatus.FAILURE
    assert instance.all_logs(run_id)