def define_examples_context(raise_on_error=True):
    return DagsterGraphQLContext(
        handle=ExecutionTargetHandle.for_repo_module('dagster_examples', 'define_demo_repo'),
        pipeline_runs=PipelineRunStorage(),
        execution_manager=SynchronousExecutionManager(),
        raise_on_error=raise_on_error,
    )
def test_has_run_query_and_terminate():
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'infinite_loop_pipeline')
    instance = DagsterInstance.local_temp()

    with safe_tempfile_path() as path:
        pipeline_run = instance.create_run_for_pipeline(
            pipeline=infinite_loop_pipeline,
            environment_dict={'solids': {'loop': {'config': {'file': path}}}},
        )
        execution_manager = SubprocessExecutionManager(instance)
        execution_manager.execute_pipeline(handle, infinite_loop_pipeline, pipeline_run, instance)

        # wait for the subprocess to spin up and touch the sentinel file
        while not os.path.exists(path):
            time.sleep(0.1)

        assert os.path.exists(path)

        assert execution_manager.is_process_running(pipeline_run.run_id)
        assert execution_manager.terminate(pipeline_run.run_id)
        assert instance.get_run_by_id(pipeline_run.run_id).is_finished
        assert not execution_manager.is_process_running(pipeline_run.run_id)
        assert not execution_manager.terminate(pipeline_run.run_id)

    # safe_tempfile_path cleans up the file on exit
    assert not os.path.exists(path)
def test_running():
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'passing_pipeline')
    environment_dict = {
        'solids': {'sum_solid': {'inputs': {'num': file_relative_path(__file__, 'data/num.csv')}}}
    }
    selector = ExecutionSelector('csv_hello_world')
    instance = DagsterInstance.local_temp()
    pipeline_run = instance.create_run_for_pipeline(
        pipeline=passing_pipeline,
        selector=selector,
        environment_dict=environment_dict,
    )
    execution_manager = SubprocessExecutionManager(instance)
    execution_manager.execute_pipeline(handle, passing_pipeline, pipeline_run, instance)
    execution_manager.join()

    assert instance.get_run_by_id(pipeline_run.run_id).status == PipelineRunStatus.SUCCESS

    events = instance.all_logs(pipeline_run.run_id)
    assert events

    engine_events = get_events_of_type(events, DagsterEventType.ENGINE_EVENT)
    assert (
        len([ev for ev in engine_events if 'SubprocessExecutionManager' in ev.message]) == 3
    )  # starting, started, exit
def test_failing():
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'failing_pipeline')
    environment_dict = {
        'solids': {'sum_solid': {'inputs': {'num': file_relative_path(__file__, 'data/num.csv')}}}
    }
    selector = ExecutionSelector('csv_hello_world')
    instance = DagsterInstance.local_temp()
    pipeline_run = instance.create_run_for_pipeline(
        pipeline=failing_pipeline,
        selector=selector,
        environment_dict=environment_dict,
    )
    execution_manager = SubprocessExecutionManager(instance)
    execution_manager.execute_pipeline(handle, failing_pipeline, pipeline_run, instance)
    execution_manager.join()

    assert instance.get_run_by_id(pipeline_run.run_id).status == PipelineRunStatus.FAILURE
    assert instance.all_logs(pipeline_run.run_id)
def test_execution_crash():
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'crashy_pipeline')
    environment_dict = {
        'solids': {'sum_solid': {'inputs': {'num': file_relative_path(__file__, 'data/num.csv')}}}
    }
    selector = ExecutionSelector('csv_hello_world')
    instance = DagsterInstance.local_temp()
    pipeline_run = instance.create_run_for_pipeline(
        pipeline=crashy_pipeline,
        selector=selector,
        environment_dict=environment_dict,
    )
    execution_manager = SubprocessExecutionManager(instance)
    execution_manager.execute_pipeline(handle, crashy_pipeline, pipeline_run, instance)
    execution_manager.join()

    assert instance.get_run_by_id(pipeline_run.run_id).status == PipelineRunStatus.FAILURE

    # last message is the pipeline failure event; second to last is the crash message
    crash_log = instance.all_logs(pipeline_run.run_id)[-2]
    assert crash_log.message.startswith(
        '[SubprocessExecutionManager] Pipeline execution process for {run_id} unexpectedly exited'.format(
            run_id=pipeline_run.run_id
        )
    )
class TestAirflowPython_1WarehouseExecution(object):
    handle = ExecutionTargetHandle.for_pipeline_fn(define_airline_demo_warehouse_pipeline)
    pipeline_name = 'airline_demo_warehouse_pipeline'
    environment_yaml = [
        script_relative_path(
            os.path.join(
                '..', '..', 'dagster_examples', 'airline_demo', 'environments', 'local_base.yaml'
            )
        ),
        script_relative_path(
            os.path.join(
                '..', '..', 'dagster_examples', 'airline_demo', 'environments', 's3_storage.yaml'
            )
        ),
        script_relative_path(
            os.path.join(
                '..',
                '..',
                'dagster_examples',
                'airline_demo',
                'environments',
                'local_warehouse.yaml',
            )
        ),
    ]
    mode = 'local'

    def test_airflow_run_warehouse_pipeline(self, dagster_airflow_python_operator_pipeline):
        pass
def test_diamond_multi_execution():
    pipeline = ExecutionTargetHandle.for_pipeline_python_file(
        __file__, 'define_diamond_pipeline'
    ).build_pipeline_definition()
    result = execute_pipeline(
        pipeline,
        environment_dict={'storage': {'filesystem': {}}, 'execution': {'multiprocess': {}}},
        instance=DagsterInstance.local_temp(),
    )
    assert result.success

    assert result.result_for_solid('adder').output_value() == 11

    pids_by_solid = {}
    for solid in pipeline.solids:
        pids_by_solid[solid.name] = compute_event(result, solid.name).logging_tags['pid']

    # guarantee that all solids ran in their own process
    assert len(set(pids_by_solid.values())) == len(pipeline.solids)
def define_context(raise_on_error=True, log_dir=None):
    return DagsterGraphQLContext(
        handle=ExecutionTargetHandle.for_repo_fn(define_repository),
        pipeline_runs=PipelineRunStorage(log_dir),
        execution_manager=SynchronousExecutionManager(),
        raise_on_error=raise_on_error,
    )
def test_compute_log_to_disk_multiprocess():
    spew_pipeline = ExecutionTargetHandle.for_pipeline_python_file(
        __file__, 'define_pipeline'
    ).build_pipeline_definition()
    instance = DagsterInstance.local_temp()
    manager = instance.compute_log_manager
    result = execute_pipeline(
        spew_pipeline,
        environment_dict={'storage': {'filesystem': {}}, 'execution': {'multiprocess': {}}},
        instance=instance,
    )
    assert result.success

    compute_steps = [
        event.step_key
        for event in result.step_event_list
        if event.event_type == DagsterEventType.STEP_START
    ]
    for step_key in compute_steps:
        if step_key.startswith('spawn'):
            continue
        compute_io_path = manager.get_local_path(result.run_id, step_key, ComputeIOType.STDOUT)
        assert os.path.exists(compute_io_path)
        with open(compute_io_path, 'r') as stdout_file:
            assert normalize_file_content(stdout_file.read()) == HELLO_SOLID
class TestExecuteDagContainerizedS3Storage(object):
    handle = ExecutionTargetHandle.for_pipeline_fn(define_demo_execution_pipeline)
    pipeline_name = 'demo_pipeline'
    environment_yaml = [
        script_relative_path('test_project/env.yaml'),
        script_relative_path('test_project/env_s3.yaml'),
    ]
    run_id = str(uuid.uuid4())
    execution_date = datetime.datetime.utcnow()
    image = IMAGE

    # pylint: disable=redefined-outer-name
    def test_execute_dag_containerized(self, dagster_airflow_docker_operator_pipeline):
        for result in dagster_airflow_docker_operator_pipeline:
            assert 'data' in result
            assert 'executePlan' in result['data']
            assert '__typename' in result['data']['executePlan']
            assert result['data']['executePlan']['__typename'] == 'ExecutePlanSuccess'
            result = list(
                filter(
                    lambda x: x['__typename'] == 'ExecutionStepOutputEvent',
                    result['data']['executePlan']['stepEvents'],
                )
            )[0]
            if result['step']['kind'] == 'INPUT_THUNK':
                continue
def test_execute_on_dask():
    result = execute_on_dask(
        ExecutionTargetHandle.for_pipeline_python_file(__file__, 'dask_engine_pipeline'),
        env_config={'storage': {'filesystem': {}}},
        dask_config=DaskConfig(timeout=30),
    )
    assert result.result_for_solid('simple').result_value() == 1
def execute_eagerly_on_celery(tempdir, pipeline_name, tags=None):
    return execute_pipeline(
        ExecutionTargetHandle.for_pipeline_python_file(
            __file__, pipeline_name
        ).build_pipeline_definition(),
        environment_dict={
            'storage': {'filesystem': {'config': {'base_dir': tempdir}}},
            'execution': {'celery': {'config': {'config_source': {'task_always_eager': True}}}},
        },
        instance=DagsterInstance.local_temp(tempdir=tempdir),
        run_config=RunConfig(tags=tags),
    )
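# Hedged usage sketch, not from the source: with 'task_always_eager' set, Celery
# runs each task in-process, so no broker or worker needs to be running. The
# pipeline name 'test_diamond' and the tag key/value below are hypothetical.
from dagster import seven


def test_eager_celery_usage_sketch():
    with seven.TemporaryDirectory() as tempdir:
        result = execute_eagerly_on_celery(
            tempdir, 'test_diamond', tags={'dagster-celery/priority': '5'}  # hypothetical tag
        )
        assert result.success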
class TestExecuteDagPythonFilesystemStorage(object):
    pipeline_name = 'demo_pipeline'
    handle = ExecutionTargetHandle.for_pipeline_module(
        'dagster_airflow_tests.test_project.dagster_airflow_demo', pipeline_name
    )
    environment_yaml = [
        script_relative_path('test_project/env.yaml'),
        script_relative_path('test_project/env_filesystem.yaml'),
    ]
    run_id = str(uuid.uuid4())
    execution_date = datetime.datetime.utcnow()

    # pylint: disable=redefined-outer-name
    def test_execute_dag(self, dagster_airflow_python_operator_pipeline):
        for result in dagster_airflow_python_operator_pipeline:
            assert 'data' in result
            assert 'executePlan' in result['data']
            assert '__typename' in result['data']['executePlan']
            assert result['data']['executePlan']['__typename'] == 'ExecutePlanSuccess'
            result = list(
                filter(
                    lambda x: x['__typename'] == 'ExecutionStepOutputEvent',
                    result['data']['executePlan']['stepEvents'],
                )
            )[0]
            if result['step']['kind'] == 'INPUT_THUNK':
                continue
def make_airflow_dag(
    module_name,
    pipeline_name,
    environment_dict=None,
    mode=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
):
    check.str_param(module_name, 'module_name')

    handle = ExecutionTargetHandle.for_pipeline_module(module_name, pipeline_name)

    return _make_airflow_dag(
        handle=handle,
        pipeline_name=pipeline_name,
        environment_dict=environment_dict,
        mode=mode,
        dag_id=dag_id,
        dag_description=dag_description,
        dag_kwargs=dag_kwargs,
        op_kwargs=op_kwargs,
    )
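# Hedged usage sketch, not from the source; the module and pipeline names are
# hypothetical. make_airflow_dag returns the generated DAG together with its
# per-step operator tasks, so an Airflow definitions file can expose the DAG to
# the scheduler at import time.
import datetime

dag, tasks = make_airflow_dag(
    module_name='my_project.pipelines',  # hypothetical importable module
    pipeline_name='my_pipeline',         # hypothetical pipeline name
    environment_dict={'storage': {'filesystem': {}}},
    dag_kwargs={'default_args': {'start_date': datetime.datetime(2019, 1, 1)}},
)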
def make_airflow_dag_containerized(
    module_name,
    pipeline_name,
    image,
    environment_dict=None,
    mode=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
):
    check.str_param(module_name, 'module_name')

    handle = ExecutionTargetHandle.for_pipeline_module(module_name, pipeline_name)

    op_kwargs = check.opt_dict_param(op_kwargs, 'op_kwargs', key_type=str)
    op_kwargs['image'] = image

    return _make_airflow_dag(
        handle=handle,
        pipeline_name=pipeline_name,
        environment_dict=environment_dict,
        mode=mode,
        dag_id=dag_id,
        dag_description=dag_description,
        dag_kwargs=dag_kwargs,
        op_kwargs=op_kwargs,
        operator=DagsterDockerOperator,
    )
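# Hedged sketch of the containerized variant, not from the source; names are
# hypothetical. It differs from make_airflow_dag only in threading the Docker
# image through op_kwargs so each step runs under DagsterDockerOperator.
dag, tasks = make_airflow_dag_containerized(
    module_name='my_project.pipelines',      # hypothetical importable module
    pipeline_name='my_pipeline',             # hypothetical pipeline name
    image='my-registry/my-pipeline:latest',  # hypothetical Docker image tag
    environment_dict={'storage': {'filesystem': {}}},
)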
def execute_eagerly_on_celery(pipeline_name, instance=None):
    with seven.TemporaryDirectory() as tempdir:
        instance = instance or DagsterInstance.local_temp(tempdir=tempdir)
        result = execute_pipeline(
            ExecutionTargetHandle.for_pipeline_python_file(
                __file__, pipeline_name
            ).build_pipeline_definition(),
            environment_dict={
                'storage': {'filesystem': {'config': {'base_dir': tempdir}}},
                'execution': {
                    'celery': {'config': {'config_source': {'task_always_eager': True}}}
                },
            },
            instance=instance,
        )
        yield result
def test_pandas_dask():
    environment_dict = {
        'solids': {
            'pandas_solid': {
                'inputs': {'df': {'csv': {'path': file_relative_path(__file__, 'ex.csv')}}}
            }
        }
    }

    result = execute_pipeline(
        ExecutionTargetHandle.for_pipeline_python_file(
            __file__, pandas_pipeline.name
        ).build_pipeline_definition(),
        environment_dict={
            'storage': {'filesystem': {}},
            'execution': {'dask': {'config': {'timeout': 30}}},
            **environment_dict,
        },
        instance=DagsterInstance.local_temp(),
    )

    assert result.success
def test_failing():
    run_id = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'failing_pipeline')
    environment_dict = {
        'solids': {'sum_solid': {'inputs': {'num': file_relative_path(__file__, 'data/num.csv')}}}
    }
    selector = ExecutionSelector('csv_hello_world')
    instance = DagsterInstance.local_temp()
    pipeline_run = instance.create_run(
        PipelineRun(
            pipeline_name=failing_pipeline.name,
            run_id=run_id,
            selector=selector,
            environment_dict=environment_dict,
            mode='default',
            step_keys_to_execute=None,
            tags=None,
            status=PipelineRunStatus.NOT_STARTED,
        )
    )
    execution_manager = SubprocessExecutionManager(instance)
    execution_manager.execute_pipeline(handle, failing_pipeline, pipeline_run, instance)
    execution_manager.join()

    assert instance.get_run_by_id(run_id).status == PipelineRunStatus.FAILURE
    assert instance.all_logs(run_id)
def test_max_concurrency_zero():
    run_id = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'infinite_loop_pipeline')

    with safe_tempfile_path() as filepath:
        instance = DagsterInstance.local_temp()
        execution_manager = QueueingSubprocessExecutionManager(instance, max_concurrent_runs=0)

        pipeline_run = instance.create_run(
            PipelineRun.create_empty_run(
                pipeline_name=infinite_loop_pipeline.name,
                run_id=run_id,
                environment_dict={'solids': {'loop': {'config': {'file': filepath}}}},
            )
        )
        execution_manager.execute_pipeline(handle, infinite_loop_pipeline, pipeline_run, instance)
        assert not execution_manager.is_active(run_id)
        assert not os.path.exists(filepath)
def define_context(raise_on_error=True, instance=None):
    return DagsterGraphQLContext(
        handle=ExecutionTargetHandle.for_repo_fn(define_repository),
        instance=instance or DagsterInstance.ephemeral(),
        execution_manager=SynchronousExecutionManager(),
        raise_on_error=raise_on_error,
    )
def test_failing():
    run_id = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_fn(define_failing_pipeline)
    pipeline = define_failing_pipeline()
    env_config = {
        'solids': {'sum_solid': {'inputs': {'num': script_relative_path('data/num.csv')}}}
    }
    selector = ExecutionSelector('csv_hello_world')
    pipeline_run = InMemoryPipelineRun(
        run_id,
        selector,
        env_config,
        mode='default',
        reexecution_config=None,
        step_keys_to_execute=None,
    )
    execution_manager = MultiprocessingExecutionManager()
    execution_manager.execute_pipeline(handle, pipeline, pipeline_run, raise_on_error=False)
    execution_manager.join()

    assert pipeline_run.status == PipelineRunStatus.FAILURE
    assert pipeline_run.all_logs()
def test_execute_execute_plan_mutation_raw():
    pipeline_name = 'sleepy_pipeline'
    handle = ExecutionTargetHandle.for_pipeline_module(
        'dagster_examples.toys.sleepy', pipeline_name
    )

    run_id = make_new_run_id()
    instance = DagsterInstance.local_temp()
    instance.create_empty_run(run_id, pipeline_name)
    variables = {
        'executionParams': {
            'environmentConfigData': {},
            'mode': 'default',
            'selector': {'name': pipeline_name},
            'executionMetadata': {'runId': run_id},
        }
    }
    result = execute_execute_plan_mutation_raw(handle, variables, instance_ref=instance.get_ref())
    seen_events = set()
    for event in result:
        seen_events.add((event.dagster_event.event_type_value, event.step_key))

    assert seen_events == EXPECTED_EVENTS
def test_execution_crash():
    run_id = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'crashy_pipeline')
    env_config = {
        'solids': {'sum_solid': {'inputs': {'num': script_relative_path('data/num.csv')}}}
    }
    selector = ExecutionSelector('csv_hello_world')
    pipeline_run = InMemoryPipelineRun(
        run_id,
        selector,
        env_config,
        mode='default',
        reexecution_config=None,
        step_keys_to_execute=None,
    )
    execution_manager = MultiprocessingExecutionManager()
    execution_manager.execute_pipeline(handle, crashy_pipeline, pipeline_run, raise_on_error=False)
    execution_manager.join()

    assert pipeline_run.status == PipelineRunStatus.FAILURE

    last_log = pipeline_run.all_logs()[-1]
    print(last_log.message)
    assert last_log.message.startswith(
        'Exception: Pipeline execution process for {run_id} unexpectedly exited\n'.format(
            run_id=run_id
        )
    )
def test_engine_error():
    with pytest.raises(DagsterSubprocessError):
        with seven.TemporaryDirectory() as tempdir:
            storage = os.path.join(tempdir, 'flakey_storage')
            execute_pipeline(
                ExecutionTargetHandle.for_pipeline_python_file(
                    __file__, 'engine_error'
                ).build_pipeline_definition(),
                environment_dict={
                    'storage': {'filesystem': {'config': {'base_dir': storage}}},
                    'execution': {
                        'celery': {'config': {'config_source': {'task_always_eager': True}}}
                    },
                    'solids': {'destroy': {'config': storage}},
                },
                instance=DagsterInstance.local_temp(tempdir=tempdir),
            )
def test_two_runs_running():
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'infinite_loop_pipeline')

    with safe_tempfile_path() as file_one, safe_tempfile_path() as file_two:
        instance = DagsterInstance.local_temp()

        execution_manager = SubprocessExecutionManager(instance)

        pipeline_run_one = instance.create_run_for_pipeline(
            pipeline_def=infinite_loop_pipeline,
            environment_dict={'solids': {'loop': {'config': {'file': file_one}}}},
        )
        execution_manager.execute_pipeline(
            handle, infinite_loop_pipeline, pipeline_run_one, instance
        )

        pipeline_run_two = instance.create_run_for_pipeline(
            pipeline_def=infinite_loop_pipeline,
            environment_dict={'solids': {'loop': {'config': {'file': file_two}}}},
        )
        execution_manager.execute_pipeline(
            handle, infinite_loop_pipeline, pipeline_run_two, instance
        )

        # ensure both runs have begun execution (loop until both sentinel files exist)
        while not (os.path.exists(file_one) and os.path.exists(file_two)):
            time.sleep(0.1)

        assert execution_manager.is_process_running(pipeline_run_one.run_id)
        assert execution_manager.is_process_running(pipeline_run_two.run_id)

        assert execution_manager.terminate(pipeline_run_one.run_id)

        assert not execution_manager.is_process_running(pipeline_run_one.run_id)
        assert execution_manager.is_process_running(pipeline_run_two.run_id)

        assert execution_manager.terminate(pipeline_run_two.run_id)

        assert not execution_manager.is_process_running(pipeline_run_one.run_id)
        assert not execution_manager.is_process_running(pipeline_run_two.run_id)
def test_execute_hammer_through_dagit():
    handle = ExecutionTargetHandle.for_pipeline_python_file(
        file_relative_path(__file__, '../../../../examples/dagster_examples/toys/hammer.py'),
        'hammer_pipeline',
    )
    instance = DagsterInstance.local_temp()

    execution_manager = SubprocessExecutionManager(instance)

    context = DagsterGraphQLInProcessRepositoryContext(
        handle=handle, execution_manager=execution_manager, instance=instance
    )

    executor = SyncExecutor()

    variables = {
        'executionParams': {
            'environmentConfigData': {'storage': {'filesystem': {}}, 'execution': {'dask': {}}},
            'selector': {'name': handle.build_pipeline_definition().name},
            'mode': 'default',
        }
    }

    start_pipeline_result = graphql(
        request_string=START_PIPELINE_EXECUTION_MUTATION,
        schema=create_schema(),
        context=context,
        variables=variables,
        executor=executor,
    )

    run_id = start_pipeline_result.data['startPipelineExecution']['run']['runId']

    context.execution_manager.join()

    subscription = execute_dagster_graphql(context, SUBSCRIPTION_QUERY, variables={'runId': run_id})

    subscribe_results = []
    subscription.subscribe(subscribe_results.append)

    messages = [x['__typename'] for x in subscribe_results[0].data['pipelineRunLogs']['messages']]

    assert 'PipelineStartEvent' in messages
    assert 'PipelineSuccessEvent' in messages
def get_papermill_parameters(compute_context, inputs, output_log_path):
    check.inst_param(compute_context, 'compute_context', SystemComputeExecutionContext)
    check.param_invariant(
        isinstance(compute_context.environment_dict, dict),
        'compute_context',
        'SystemComputeExecutionContext must have valid environment_dict',
    )
    check.dict_param(inputs, 'inputs', key_type=six.string_types)

    run_id = compute_context.run_id

    marshal_dir = '/tmp/dagstermill/{run_id}/marshal'.format(run_id=run_id)
    mkdir_p(marshal_dir)

    (handle, solid_subset) = ExecutionTargetHandle.get_handle(compute_context.pipeline_def)

    if not handle:
        raise DagstermillError(
            'Can\'t execute a dagstermill solid from a pipeline that wasn\'t instantiated using '
            'an ExecutionTargetHandle'
        )

    dm_handle_kwargs = handle.data._asdict()
    dm_handle_kwargs['pipeline_name'] = compute_context.pipeline_def.name

    dm_context_dict = {
        'output_log_path': output_log_path,
        'marshal_dir': marshal_dir,
        'environment_dict': compute_context.environment_dict,
    }

    dm_solid_handle_kwargs = compute_context.solid_handle._asdict()

    parameters = {}

    input_def_dict = compute_context.solid_def.input_dict
    for input_name, input_value in inputs.items():
        assert (
            input_name not in RESERVED_INPUT_NAMES
        ), 'Dagstermill solids cannot have inputs named {input_name}'.format(input_name=input_name)
        dagster_type = input_def_dict[input_name].dagster_type
        parameter_value = write_value(
            dagster_type, input_value, os.path.join(marshal_dir, 'input-{}'.format(input_name))
        )
        parameters[input_name] = parameter_value

    parameters['__dm_context'] = dm_context_dict
    parameters['__dm_handle_kwargs'] = dm_handle_kwargs
    parameters['__dm_pipeline_run_dict'] = pack_value(compute_context.pipeline_run)
    parameters['__dm_solid_handle_kwargs'] = dm_solid_handle_kwargs
    parameters['__dm_solid_subset'] = solid_subset
    parameters['__dm_instance_ref_dict'] = pack_value(compute_context.instance.get_ref())

    return parameters
def test_multiprocessing_execution_for_composite_solid():
    environment_dict = {
        'solids': {
            'composite_with_nested_config_solid': {
                'solids': {'node_a': {'config': {'foo': 'baz'}}, 'node_b': {'config': {'bar': 3}}}
            }
        }
    }

    run_id = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_python_file(__file__, 'composite_pipeline')

    instance = DagsterInstance.local_temp()
    pipeline_run = instance.create_run(
        PipelineRun(
            pipeline_name=composite_pipeline.name,
            run_id=run_id,
            selector=ExecutionSelector('nonce'),
            environment_dict=environment_dict,
            mode='default',
            reexecution_config=None,
            step_keys_to_execute=None,
            tags=None,
            status=PipelineRunStatus.NOT_STARTED,
        )
    )
    execution_manager = SubprocessExecutionManager(instance)
    execution_manager.execute_pipeline(handle, composite_pipeline, pipeline_run, instance)
    execution_manager.join()
    assert instance.get_run_by_id(run_id).status == PipelineRunStatus.SUCCESS

    environment_dict = {
        'solids': {
            'composite_with_nested_config_solid': {
                'solids': {'node_a': {'config': {'foo': 'baz'}}, 'node_b': {'config': {'bar': 3}}}
            }
        },
        'execution': {'multiprocess': {}},
        'storage': {'filesystem': {}},
    }

    run_id = make_new_run_id()
    pipeline_run = instance.create_run(
        PipelineRun(
            pipeline_name=composite_pipeline.name,
            run_id=run_id,
            selector=ExecutionSelector('nonce'),
            environment_dict=environment_dict,
            mode='default',
            reexecution_config=None,
            step_keys_to_execute=None,
            tags=None,
            status=PipelineRunStatus.NOT_STARTED,
        )
    )

    execution_manager = SubprocessExecutionManager(instance)
    execution_manager.execute_pipeline(handle, composite_pipeline, pipeline_run, instance)
    execution_manager.join()
def test_pipelines_python_error():
    ctx = DagsterGraphQLContext(
        handle=ExecutionTargetHandle.for_repo_fn(define_error_pipeline_repo),
        pipeline_runs=InMemoryRunStorage(),
        execution_manager=SynchronousExecutionManager(),
    )
    result = execute_dagster_graphql(ctx, PIPELINES)
    assert result.data['pipelinesOrError']['__typename'] == "PythonError"
def define_context(raise_on_error=True, log_dir=None, schedule_dir=None):
    return DagsterGraphQLContext(
        handle=ExecutionTargetHandle.for_repo_fn(define_repository),
        pipeline_runs=FilesystemRunStorage(base_dir=log_dir) if log_dir else InMemoryRunStorage(),
        scheduler=TestSystemCronScheduler(schedule_dir) if schedule_dir else None,
        execution_manager=SynchronousExecutionManager(),
        raise_on_error=raise_on_error,
    )