def _build_sub_pipeline(pipeline_def, solid_names):
    '''
    Build a pipeline which is a subset of another pipeline.
    Only includes the solids which are in solid_names.
    '''
    from dagster.core.definitions.handle import ExecutionTargetHandle

    check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition)
    check.list_param(solid_names, 'solid_names', of_type=str)

    solid_name_set = set(solid_names)
    solids = list(map(pipeline_def.solid_named, solid_names))
    deps = {_dep_key_of(solid): {} for solid in solids}

    for solid in solids:
        for input_handle in solid.input_handles():
            if pipeline_def.dependency_structure.has_singular_dep(input_handle):
                output_handle = pipeline_def.dependency_structure.get_singular_dep(input_handle)
                if output_handle.solid.name in solid_name_set:
                    deps[_dep_key_of(solid)][input_handle.input_def.name] = DependencyDefinition(
                        solid=output_handle.solid.name, output=output_handle.output_def.name
                    )
            elif pipeline_def.dependency_structure.has_multi_deps(input_handle):
                output_handles = pipeline_def.dependency_structure.get_multi_deps(input_handle)
                deps[_dep_key_of(solid)][input_handle.input_def.name] = MultiDependencyDefinition(
                    [
                        DependencyDefinition(
                            solid=output_handle.solid.name, output=output_handle.output_def.name
                        )
                        for output_handle in output_handles
                        if output_handle.solid.name in solid_name_set
                    ]
                )

    sub_pipeline_def = PipelineDefinition(
        name=pipeline_def.name,  # should we change the name for subsetted pipeline?
        solid_defs=list({solid.definition for solid in solids}),
        mode_defs=pipeline_def.mode_definitions,
        dependencies=deps,
        _parent_pipeline_def=pipeline_def,
    )

    handle, _ = ExecutionTargetHandle.get_handle(pipeline_def)
    if handle:
        ExecutionTargetHandle.cache_handle(sub_pipeline_def, handle, solid_names=solid_names)

    return sub_pipeline_def
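
# A minimal sketch of how _build_sub_pipeline might be exercised. The solids and pipeline
# below ('return_two', 'add_one', 'full_pipeline') are hypothetical, not taken from this
# module, and in practice this private helper is typically reached through
# PipelineDefinition.build_sub_pipeline rather than called directly.
from dagster import execute_pipeline, lambda_solid, pipeline


@lambda_solid
def return_two():
    return 2


@lambda_solid
def add_one(num):
    return num + 1


@pipeline
def full_pipeline():
    add_one(return_two())


# Subsetting to only 'return_two' drops 'add_one' and the dependency edge between them.
sub = _build_sub_pipeline(full_pipeline, ['return_two'])
assert [s.name for s in sub.solids] == ['return_two']
assert execute_pipeline(sub).success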
def multiprocess_executor(init_context):
    '''The default multiprocess executor.

    This simple multiprocess executor is available by default on any :py:class:`ModeDefinition`
    that does not provide custom executors. To select the multiprocess executor, include a
    fragment such as the following in your config:

    .. code-block:: yaml

        execution:
          multiprocess:
            max_concurrent: 4

    The ``max_concurrent`` arg is optional and tells the execution engine how many processes may
    run concurrently. By default, or if you set ``max_concurrent`` to be 0, this is the return
    value of :py:func:`python:multiprocessing.cpu_count`.
    '''
    from dagster.core.definitions.handle import ExecutionTargetHandle
    from dagster.core.engine.init import InitExecutorContext

    check.inst_param(init_context, 'init_context', InitExecutorContext)

    handle, _ = ExecutionTargetHandle.get_handle(init_context.pipeline_def)
    return MultiprocessExecutorConfig(
        handle=handle, max_concurrent=init_context.executor_config['max_concurrent']
    )
def create_context_creation_data(
    pipeline_def, environment_dict, pipeline_run, instance, execution_plan
):
    environment_config = EnvironmentConfig.build(pipeline_def, environment_dict, pipeline_run)

    mode_def = pipeline_def.get_mode_definition(pipeline_run.mode)
    system_storage_def = system_storage_def_from_config(mode_def, environment_config)
    executor_def = executor_def_from_config(mode_def, environment_config)

    execution_target_handle, _ = ExecutionTargetHandle.get_handle(pipeline_def)

    return ContextCreationData(
        pipeline_def=pipeline_def,
        environment_config=environment_config,
        pipeline_run=pipeline_run,
        mode_def=mode_def,
        system_storage_def=system_storage_def,
        execution_target_handle=execution_target_handle,
        executor_def=executor_def,
        instance=instance,
        resource_keys_to_init=get_required_resource_keys_to_init(
            execution_plan, system_storage_def
        ),
    )
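
# A hedged sketch of assembling the arguments this helper expects, modeled on the test
# helpers elsewhere in this collection. 'define_basic_pipeline' is hypothetical, and the
# PipelineRun keyword arguments assume a dagster version where it can be constructed
# directly (as in initialize_step_context below).
import uuid

from dagster import DagsterInstance, create_execution_plan
from dagster.core.definitions.handle import ExecutionTargetHandle
from dagster.core.storage.pipeline_run import PipelineRun

pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
    define_basic_pipeline
).build_pipeline_definition()
environment_dict = {}
pipeline_run = PipelineRun(
    pipeline_name=pipeline_def.name,
    run_id=str(uuid.uuid4()),
    environment_dict=environment_dict,
    mode='default',
)
execution_plan = create_execution_plan(pipeline_def, environment_dict, mode='default')

context_creation_data = create_context_creation_data(
    pipeline_def, environment_dict, pipeline_run, DagsterInstance.ephemeral(), execution_plan
)
assert context_creation_data.mode_def.name == 'default'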
def multiprocess_executor(init_context):
    '''The default multiprocess executor.

    This simple multiprocess executor is available by default on any :py:class:`ModeDefinition`
    that does not provide custom executors. To select the multiprocess executor, include a
    fragment such as the following in your config:

    .. code-block:: yaml

        execution:
          multiprocess:
            max_concurrent: 4

    The ``max_concurrent`` arg is optional and tells the execution engine how many processes may
    run concurrently. By default, or if you set ``max_concurrent`` to be 0, this is the return
    value of :py:func:`python:multiprocessing.cpu_count`.

    Execution priority can be configured using the ``dagster/priority`` tag via solid metadata,
    where the higher the number the higher the priority. 0 is the default and both positive
    and negative numbers can be used.
    '''
    from dagster.core.definitions.handle import ExecutionTargetHandle
    from dagster.core.engine.init import InitExecutorContext

    check.inst_param(init_context, 'init_context', InitExecutorContext)
    check_cross_process_constraints(init_context)

    handle, _ = ExecutionTargetHandle.get_handle(init_context.pipeline_def)
    return MultiprocessExecutorConfig(
        handle=handle,
        max_concurrent=init_context.executor_config['max_concurrent'],
        retries=Retries.from_config(init_context.executor_config['retries']),
    )
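
# A hedged usage sketch of selecting this executor from user code. 'define_basic_pipeline'
# is a hypothetical module-level function returning a PipelineDefinition. The pipeline is
# loaded through ExecutionTargetHandle so it can be re-hydrated in child processes; depending
# on the dagster version the executor settings may need to sit under an extra 'config:' key,
# and multiprocess execution also needs a persistent instance and non-in-memory storage.
from dagster import DagsterInstance, execute_pipeline
from dagster.core.definitions.handle import ExecutionTargetHandle

pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
    define_basic_pipeline
).build_pipeline_definition()

result = execute_pipeline(
    pipeline_def,
    environment_dict={
        'storage': {'filesystem': {}},
        'execution': {'multiprocess': {'config': {'max_concurrent': 4}}},
    },
    instance=DagsterInstance.local_temp(),
)
assert result.success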
def initialize_step_context(scratch_dir):
    pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
        define_basic_pipeline
    ).build_pipeline_definition()

    pipeline_run = PipelineRun(
        pipeline_name='foo_pipeline',
        run_id=str(uuid.uuid4()),
        environment_dict=make_environment_dict(scratch_dir, 'external'),
        mode='external',
    )

    plan = create_execution_plan(pipeline_def, pipeline_run.environment_dict, mode='external')

    initialization_manager = pipeline_initialization_manager(
        plan,
        pipeline_run.environment_dict,
        pipeline_run,
        DagsterInstance.ephemeral(),
    )
    for _ in initialization_manager.generate_setup_events():
        pass
    pipeline_context = initialization_manager.get_object()

    active_execution = plan.start(retries=Retries(RetryMode.DISABLED))
    step = active_execution.get_next_step()
    step_context = pipeline_context.for_step(step)
    return step_context
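
# Sketch of how the helper above might be exercised in a test, assuming seven (dagster's
# py2/py3 compatibility shim) is imported in this module, as it is in the tests below.
with seven.TemporaryDirectory() as tmpdir:
    step_context = initialize_step_context(tmpdir)
    # A concrete execution step has been selected and bound to the pipeline context.
    assert step_context.step.key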
def multiprocess_executor(init_context):
    from dagster.core.definitions.handle import ExecutionTargetHandle
    from dagster.core.engine.init import InitExecutorContext

    check.inst_param(init_context, 'init_context', InitExecutorContext)

    handle, _ = ExecutionTargetHandle.get_handle(init_context.pipeline_def)
    return MultiprocessExecutorConfig(
        handle=handle, max_concurrent=init_context.executor_config['max_concurrent']
    )
def test_pipeline(mode):
    with seven.TemporaryDirectory() as tmpdir:
        pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
            define_basic_pipeline
        ).build_pipeline_definition()
        result = execute_pipeline(
            pipeline=pipeline_def,
            mode=mode,
            environment_dict=make_environment_dict(tmpdir, mode),
        )
        assert result.result_for_solid('return_two').output_value() == 2
        assert result.result_for_solid('add_one').output_value() == 3
def _check_pipeline_has_target_handle(pipeline_def):
    from dagster.core.definitions.handle import ExecutionTargetHandle

    handle, _ = ExecutionTargetHandle.get_handle(pipeline_def)
    if not handle:
        raise DagsterUnmetExecutorRequirementsError(
            'You have attempted to use an executor that uses multiple processes with the '
            'pipeline "{name}" that can not be re-hydrated. Pipelines must be loaded in a way '
            'that allows dagster to reconstruct them in a new process. This means: \n'
            '  * using the file, module, or repository.yaml arguments of '
            'dagit/dagster-graphql/dagster\n'
            '  * constructing an ExecutionTargetHandle directly\n'.format(name=pipeline_def.name)
        )
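
# Sketch of the failure mode this check guards against: a PipelineDefinition constructed
# inline (rather than through ExecutionTargetHandle or the dagster CLI) has no cached handle,
# so get_handle returns nothing and the check raises. The names here are hypothetical.
from dagster import PipelineDefinition, lambda_solid
from dagster.core.errors import DagsterUnmetExecutorRequirementsError


@lambda_solid
def noop():
    pass


inline_pipeline = PipelineDefinition(name='inline', solid_defs=[noop])

try:
    _check_pipeline_has_target_handle(inline_pipeline)
except DagsterUnmetExecutorRequirementsError:
    pass  # expected: the pipeline cannot be re-hydrated in a child process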
def test_launcher_requests_retry():
    mode = 'request_retry'
    with seven.TemporaryDirectory() as tmpdir:
        pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
            define_basic_pipeline
        ).build_pipeline_definition()
        result = execute_pipeline(
            pipeline=pipeline_def,
            mode=mode,
            environment_dict=make_environment_dict(tmpdir, mode),
        )
        assert result.result_for_solid('return_two').output_value() == 2
        assert result.result_for_solid('add_one').output_value() == 3
        for step_key, events in result.events_by_step_key.items():
            if step_key:
                event_types = [event.event_type for event in events]
                assert DagsterEventType.STEP_UP_FOR_RETRY in event_types
                assert DagsterEventType.STEP_RESTARTED in event_types
def create_context_creation_data(pipeline_def, environment_dict, run_config, instance):
    environment_config = create_environment_config(pipeline_def, environment_dict, run_config)

    mode_def = pipeline_def.get_mode_definition(run_config.mode)
    system_storage_def = system_storage_def_from_config(mode_def, environment_config)
    executor_def = executor_def_from_config(mode_def, environment_config)

    execution_target_handle, _ = ExecutionTargetHandle.get_handle(pipeline_def)

    return ContextCreationData(
        pipeline_def=pipeline_def,
        environment_config=environment_config,
        run_config=run_config,
        mode_def=mode_def,
        system_storage_def=system_storage_def,
        execution_target_handle=execution_target_handle,
        executor_def=executor_def,
        instance=instance,
    )
def test_pyspark_emr(mock_wait, mock_get_step_events):
    run_job_flow_args = dict(
        Instances={
            'InstanceCount': 1,
            'KeepJobFlowAliveWhenNoSteps': True,
            'MasterInstanceType': 'c3.medium',
            'Placement': {'AvailabilityZone': 'us-west-1a'},
            'SlaveInstanceType': 'c3.xlarge',
        },
        JobFlowRole='EMR_EC2_DefaultRole',
        LogUri='s3://mybucket/log',
        Name='cluster',
        ServiceRole='EMR_DefaultRole',
        VisibleToAllUsers=True,
    )

    # Doing cluster setup outside of a solid here, because run_job_flow is not yet plumbed
    # through to the pyspark EMR resource.
    job_runner = EmrJobRunner(region='us-west-1')
    context = create_test_pipeline_execution_context()
    cluster_id = job_runner.run_job_flow(context.log, run_job_flow_args)

    pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
        define_do_nothing_pipe
    ).build_pipeline_definition()
    result = execute_pipeline(
        pipeline=pipeline_def,
        mode='prod',
        environment_dict={
            'resources': {
                'pyspark_step_launcher': {
                    'config': deep_merge_dicts(
                        BASE_EMR_PYSPARK_STEP_LAUNCHER_CONFIG, {'cluster_id': cluster_id}
                    ),
                }
            },
        },
    )
    assert result.success
    assert mock_wait.called_once
    assert mock_get_step_events.called_once
def create_context_creation_data(pipeline_def, environment_dict, run_config):
    environment_config = create_environment_config(pipeline_def, environment_dict, run_config)

    mode_def = pipeline_def.get_mode_definition(run_config.mode)
    system_storage_def = system_storage_def_from_config(mode_def, environment_config)

    check_persistent_storage_requirement(pipeline_def, system_storage_def, run_config)

    return ContextCreationData(
        pipeline_def=pipeline_def,
        environment_config=environment_config,
        run_config=run_config,
        mode_def=mode_def,
        system_storage_def=system_storage_def,
        execution_target_handle=ExecutionTargetHandle.get_handle(pipeline_def),
    )
def multiprocess_executor(init_context):
    '''The default multiprocess executor.

    This simple multiprocess executor is available by default on any :py:class:`ModeDefinition`
    that does not provide custom executors. To select the multiprocess executor, include a
    fragment such as the following in your config:

    .. code-block:: yaml

        execution:
          multiprocess:
            max_concurrent: 4

    The ``max_concurrent`` arg is optional and tells the execution engine how many processes may
    run concurrently. By default, or if you set ``max_concurrent`` to be 0, this is the return
    value of :py:func:`python:multiprocessing.cpu_count`.

    Execution priority can be configured using the ``dagster/priority`` tag via solid metadata,
    where the higher the number the higher the priority. 0 is the default and both positive
    and negative numbers can be used.
    '''
    from dagster.core.definitions.handle import ExecutionTargetHandle
    from dagster.core.engine.init import InitExecutorContext

    check.inst_param(init_context, 'init_context', InitExecutorContext)
    check_cross_process_constraints(init_context)

    # ExecutionTargetHandle.get_handle returns an ExecutionTargetHandleCacheEntry, which is a
    # tuple (handle, solid_subset). Right now we are throwing away the solid_subset that we
    # store in the cache -- this is fragile and we should fix this with
    # https://github.com/dagster-io/dagster/issues/2115 and friends so there are not multiple
    # sources of truth for the solid subset
    handle, _ = ExecutionTargetHandle.get_handle(init_context.pipeline_def)
    return MultiprocessExecutorConfig(
        handle=handle,
        max_concurrent=init_context.executor_config['max_concurrent'],
        retries=Retries.from_config(init_context.executor_config['retries']),
    )
def test_do_it_live_emr():
    sync_code()

    # Retrieving the pipeline this way stores pipeline definition in the ExecutionTargetHandle
    # cache, where it can be retrieved and sent to the remote cluster at launch time.
    pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
        define_pyspark_pipe
    ).build_pipeline_definition()

    result = execute_pipeline(
        pipeline_def,
        mode='prod',
        environment_dict={
            'solids': {'blah': {'config': {'foo': 'a string', 'bar': 123}}},
            'resources': {
                'pyspark_step_launcher': {'config': BASE_EMR_PYSPARK_STEP_LAUNCHER_CONFIG},
            },
            'storage': {'s3': {'config': {'s3_bucket': S3_BUCKET, 's3_prefix': 'test_pyspark'}}},
        },
    )
    assert result.success
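
# A hedged illustration of why the EMR tests above build the pipeline through
# ExecutionTargetHandle rather than calling define_pyspark_pipe() directly: only the
# handle-built definition is registered in the cache that remote step launchers consult,
# so only it can be re-hydrated on the cluster. Assumes define_pyspark_pipe is in scope,
# as in the test above.
from dagster.core.definitions.handle import ExecutionTargetHandle

direct = define_pyspark_pipe()  # no handle is cached for this object
handle_built = ExecutionTargetHandle.for_pipeline_fn(
    define_pyspark_pipe
).build_pipeline_definition()  # handle cached, so the pipeline can be rebuilt remotely

assert ExecutionTargetHandle.get_handle(direct)[0] is None
assert ExecutionTargetHandle.get_handle(handle_built)[0] is not None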