def test_default_unmemoized_steps():
    """Resolving memoization on a pipeline with no versioned steps must raise."""
    plan = create_execution_plan(no_version_pipeline)
    expected_message = (
        "While creating a memoized pipeline run, no steps have versions. At least one step "
        "must have a version."
    )
    with pytest.raises(DagsterInvariantViolationError, match=expected_message):
        resolve_memoized_execution_plan(plan)
def test_resolve_memoized_execution_plan_yes_stored_results():
    """A stored upstream output is pruned from the plan and wired in as the downstream input."""
    io_manager = VersionedInMemoryIOManager()
    pipeline = versioned_pipeline_factory(io_manager)
    plan = create_execution_plan(pipeline)

    handle = StepOutputHandle("versioned_solid_no_input", "result")
    version = plan.resolve_step_output_versions()[handle]
    # Pre-populate the IO manager so the upstream output appears memoized.
    io_manager.values[(handle.step_key, handle.output_name, version)] = 4

    with DagsterInstance.ephemeral() as instance:
        memoized_plan = resolve_memoized_execution_plan(plan, {}, instance)
        assert memoized_plan.step_keys_to_execute == ["versioned_solid_takes_input"]

        downstream_step = memoized_plan.get_step_by_key("versioned_solid_takes_input")
        # NOTE(review): "intput" mirrors the input name declared by the pipeline factory —
        # confirm against versioned_pipeline_factory before renaming.
        source_handle = downstream_step.step_input_dict["intput"].source.step_output_handle
        assert source_handle == StepOutputHandle(
            step_key="versioned_solid_no_input", output_name="result"
        )
def test_resolve_memoized_execution_plan_partial_versioning():
    """Only the unversioned portion of a partially-versioned pipeline should re-execute."""
    io_manager = VersionedInMemoryIOManager()
    pipeline = partially_versioned_pipeline_factory(io_manager)
    plan = create_execution_plan(pipeline)
    resolved_run_config = ResolvedRunConfig.build(pipeline)

    handle = StepOutputHandle("versioned_solid_no_input", "result")
    versions = resolve_step_output_versions(pipeline, plan, resolved_run_config)
    # Seed the manager so the versioned upstream output counts as already stored.
    io_manager.values[(handle.step_key, handle.output_name, versions[handle])] = 4

    with DagsterInstance.ephemeral() as instance:
        memoized_plan = resolve_memoized_execution_plan(
            plan, pipeline, {}, instance, resolved_run_config
        )
        assert memoized_plan.step_keys_to_execute == ["solid_takes_input"]
def get_step_keys_to_execute(pipeline, run_config, mode, instance):
    """Return the step keys left to execute after memoization prunes stored outputs."""
    speculative_plan = create_execution_plan(pipeline, run_config=run_config, mode=mode)
    resolved_config = ResolvedRunConfig.build(pipeline, run_config=run_config, mode=mode)
    memoized_plan = resolve_memoized_execution_plan(
        speculative_plan, pipeline, run_config, instance, resolved_config
    )
    return memoized_plan.step_keys_to_execute
def test_resolve_memoized_execution_plan_no_stored_results():
    """With nothing stored, every versioned step must be scheduled for execution."""
    pipeline = versioned_pipeline_factory(VersionedInMemoryObjectManager())
    memoized_plan = resolve_memoized_execution_plan(create_execution_plan(pipeline))
    assert set(memoized_plan.step_keys_to_execute) == {
        "versioned_solid_no_input",
        "versioned_solid_takes_input",
    }
def test_resolve_memoized_execution_plan_no_stored_results():
    """With an empty IO manager, memoization should leave all steps to execute."""
    pipeline = versioned_pipeline_factory(VersionedInMemoryIOManager())
    speculative_plan = create_execution_plan(pipeline)
    with DagsterInstance.ephemeral() as instance:
        memoized_plan = resolve_memoized_execution_plan(speculative_plan, {}, instance)
        assert set(memoized_plan.step_keys_to_execute) == {
            "versioned_solid_no_input",
            "versioned_solid_takes_input",
        }
def test_resolve_memoized_execution_plan_partial_versioning():
    """Unversioned steps must re-run even when their versioned upstream output is stored."""
    manager = VersionedInMemoryObjectManager()
    pipeline = partially_versioned_pipeline_factory(manager)
    plan = create_execution_plan(pipeline)

    handle = StepOutputHandle("versioned_solid_no_input", "result")
    version = plan.resolve_step_output_versions()[handle]
    # Mark the versioned upstream output as already computed.
    manager.values[(handle.step_key, handle.output_name, version)] = 4

    memoized_plan = resolve_memoized_execution_plan(plan)
    assert memoized_plan.step_keys_to_execute == ["solid_takes_input"]
def execute_list_versions_command(instance, kwargs):
    """Print a table of step-output versions and whether each output is already stored.

    Builds the execution plan for the pipeline named by the CLI kwargs, resolves the
    version of every step output, and echoes a github-style table via click.
    """
    check.inst_param(instance, "instance", DagsterInstance)

    config = list(
        check.opt_tuple_param(kwargs.get("config"), "config", default=(), of_type=str)
    )
    preset = kwargs.get("preset")
    mode = kwargs.get("mode")
    # --preset and --config are mutually exclusive ways to supply run config.
    if preset and config:
        raise click.UsageError("Can not use --preset with --config.")

    pipeline = recon_pipeline_from_origin(get_pipeline_python_origin_from_kwargs(kwargs))
    run_config = get_run_config_from_file_list(config)
    environment_config = EnvironmentConfig.build(pipeline.get_definition(), run_config, mode=mode)
    execution_plan = ExecutionPlan.build(pipeline, environment_config)

    step_output_versions = resolve_step_output_versions(
        pipeline.get_definition(), execution_plan, environment_config
    )
    memoized_plan = resolve_memoized_execution_plan(
        execution_plan, pipeline.get_definition(), run_config, instance, environment_config
    )
    # the step keys that we need to execute are those which do not have their inputs populated.
    step_keys_not_stored = set(memoized_plan.step_keys_to_execute)

    rows = [
        [
            "{key}.{output}".format(key=handle.step_key, output=handle.output_name),
            version,
            "stored" if handle.step_key not in step_keys_not_stored else "to-be-recomputed",
        ]
        for handle, version in step_output_versions.items()
    ]
    click.echo(
        tabulate(rows, headers=["Step Output", "Version", "Status of Output"], tablefmt="github")
    )
def test_resolve_memoized_execution_plan_yes_stored_results():
    """A stored upstream output should be pruned and become the downstream input source."""
    manager = VersionedInMemoryObjectManager()
    pipeline = versioned_pipeline_factory(manager)
    plan = create_execution_plan(pipeline)

    handle = StepOutputHandle("versioned_solid_no_input", "result")
    version = plan.resolve_step_output_versions()[handle]
    # Seed the object manager so the upstream output is treated as memoized.
    manager.values[(handle.step_key, handle.output_name, version)] = 4

    memoized_plan = resolve_memoized_execution_plan(plan)
    assert memoized_plan.step_keys_to_execute == ["versioned_solid_takes_input"]

    step = memoized_plan.get_step_by_key("versioned_solid_takes_input")
    # NOTE(review): "intput" matches the input name declared by the pipeline factory —
    # confirm against versioned_pipeline_factory before renaming.
    assert step.step_input_dict["intput"].source.step_output_handle == StepOutputHandle(
        step_key="versioned_solid_no_input", output_name="result"
    )
def get_step_keys_to_execute(pipeline, run_config, mode):
    """Return the step keys remaining after memoized outputs are pruned from the plan."""
    speculative_plan = create_execution_plan(pipeline, run_config=run_config, mode=mode)
    return resolve_memoized_execution_plan(speculative_plan, run_config).step_keys_to_execute
def execute_run(
    pipeline: IPipeline,
    pipeline_run: PipelineRun,
    instance: DagsterInstance,
    raise_on_error: bool = False,
) -> PipelineExecutionResult:
    """Executes an existing pipeline run synchronously.

    Synchronous version of execute_run_iterator.

    Args:
        pipeline (IPipeline): The pipeline to execute.
        pipeline_run (PipelineRun): The run to execute
        instance (DagsterInstance): The instance in which the run has been created.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``False``.

    Returns:
        PipelineExecutionResult: The result of the execution.
    """
    # Reject a bare PipelineDefinition early: downstream machinery requires an
    # IPipeline so the run can be handed off to other processes.
    if isinstance(pipeline, PipelineDefinition):
        raise DagsterInvariantViolationError(
            "execute_run requires an IPipeline but received a PipelineDefinition "
            "directly instead. To support hand-off to other processes provide a "
            "ReconstructablePipeline which can be done using reconstructable(). For in "
            "process only execution you can use InMemoryPipeline.")

    check.inst_param(pipeline, "pipeline", IPipeline)
    check.inst_param(pipeline_run, "pipeline_run", PipelineRun)
    check.inst_param(instance, "instance", DagsterInstance)

    # A run canceled before launch is reported to the instance and refused.
    if pipeline_run.status == PipelineRunStatus.CANCELED:
        message = "Not starting execution since the run was canceled before execution could start"
        instance.report_engine_event(
            message,
            pipeline_run,
        )
        raise DagsterInvariantViolationError(message)

    # Only runs that have not yet started may be executed here.
    check.invariant(
        pipeline_run.status == PipelineRunStatus.NOT_STARTED
        or pipeline_run.status == PipelineRunStatus.STARTING,
        desc="Pipeline run {} ({}) in state {}, expected NOT_STARTED or STARTING"
        .format(pipeline_run.pipeline_name, pipeline_run.run_id, pipeline_run.status),
    )
    pipeline_def = pipeline.get_definition()
    if pipeline_run.solids_to_execute:
        if isinstance(pipeline_def, PipelineSubsetDefinition):
            # The pipeline was already subset; the run's selection must agree with it.
            check.invariant(
                pipeline_run.solids_to_execute == pipeline.solids_to_execute,
                "Cannot execute PipelineRun with solids_to_execute {solids_to_execute} that "
                "conflicts with pipeline subset {pipeline_solids_to_execute}.".format(
                    pipeline_solids_to_execute=str_format_set(pipeline.solids_to_execute),
                    solids_to_execute=str_format_set(pipeline_run.solids_to_execute),
                ),
            )
        else:
            # when `execute_run` is directly called, the sub pipeline hasn't been created
            # note that when we receive the solids to execute via PipelineRun, it won't support
            # solid selection query syntax
            pipeline = pipeline.subset_for_execution_from_existing_pipeline(
                pipeline_run.solids_to_execute)

    execution_plan = _get_execution_plan_from_run(pipeline, pipeline_run, instance)

    # Memoized runs swap in a plan with already-stored step outputs pruned.
    if is_memoized_run(pipeline_run.tags):
        resolved_run_config = ResolvedRunConfig.build(
            pipeline.get_definition(), pipeline_run.run_config, pipeline_run.mode)
        execution_plan = resolve_memoized_execution_plan(
            execution_plan,
            pipeline.get_definition(),
            pipeline_run.run_config,
            instance,
            resolved_run_config,
        )

    # Collects in-process step outputs; shared with the context manager and the result.
    output_capture: Optional[Dict[StepOutputHandle, Any]] = {}

    _execute_run_iterable = ExecuteRunWithPlanIterable(
        execution_plan=execution_plan,
        iterator=pipeline_execution_iterator,
        execution_context_manager=PlanOrchestrationContextManager(
            context_event_generator=orchestration_context_event_generator,
            pipeline=pipeline,
            execution_plan=execution_plan,
            pipeline_run=pipeline_run,
            instance=instance,
            run_config=pipeline_run.run_config,
            raise_on_error=raise_on_error,
            executor_defs=None,
            output_capture=output_capture,
        ),
    )
    # Draining the iterable drives the run to completion.
    event_list = list(_execute_run_iterable)

    return PipelineExecutionResult(
        pipeline.get_definition(),
        pipeline_run.run_id,
        event_list,
        # Lazily rebuilds a pipeline context for post-hoc inspection of results.
        lambda: scoped_pipeline_context(
            execution_plan,
            pipeline,
            pipeline_run.run_config,
            pipeline_run,
            instance,
        ),
        output_capture=output_capture,
    )
def execute_run(
    pipeline: IPipeline,
    pipeline_run: PipelineRun,
    instance: DagsterInstance,
    raise_on_error: bool = False,
) -> PipelineExecutionResult:
    """Executes an existing pipeline run synchronously.

    Synchronous version of execute_run_iterator.

    Args:
        pipeline (IPipeline): The pipeline to execute.
        pipeline_run (PipelineRun): The run to execute
        instance (DagsterInstance): The instance in which the run has been created.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``False``.

    Returns:
        PipelineExecutionResult: The result of the execution.
    """
    # Reject a bare PipelineDefinition early: downstream machinery requires an
    # IPipeline so the run can be handed off to other processes.
    if isinstance(pipeline, PipelineDefinition):
        raise DagsterInvariantViolationError(
            "execute_run requires an IPipeline but received a PipelineDefinition "
            "directly instead. To support hand-off to other processes provide a "
            "ReconstructablePipeline which can be done using reconstructable(). For in "
            "process only execution you can use InMemoryPipeline.")

    check.inst_param(pipeline, "pipeline", IPipeline)
    check.inst_param(pipeline_run, "pipeline_run", PipelineRun)
    check.inst_param(instance, "instance", DagsterInstance)

    # A run canceled before launch is reported to the instance and refused.
    if pipeline_run.status == PipelineRunStatus.CANCELED:
        message = "Not starting execution since the run was canceled before execution could start"
        instance.report_engine_event(
            message,
            pipeline_run,
        )
        raise DagsterInvariantViolationError(message)

    # Only runs that have not yet started may be executed here.
    check.invariant(
        pipeline_run.status == PipelineRunStatus.NOT_STARTED
        or pipeline_run.status == PipelineRunStatus.STARTING,
        desc="Pipeline run {} ({}) in state {}, expected NOT_STARTED or STARTING"
        .format(pipeline_run.pipeline_name, pipeline_run.run_id, pipeline_run.status),
    )
    pipeline_def = pipeline.get_definition()
    if pipeline_run.solids_to_execute:
        if isinstance(pipeline_def, PipelineSubsetDefinition):
            # The pipeline was already subset; the run's selection must agree with it.
            check.invariant(
                pipeline_run.solids_to_execute == pipeline.solids_to_execute,
                "Cannot execute PipelineRun with solids_to_execute {solids_to_execute} that "
                "conflicts with pipeline subset {pipeline_solids_to_execute}.".format(
                    pipeline_solids_to_execute=str_format_set(pipeline.solids_to_execute),
                    solids_to_execute=str_format_set(pipeline_run.solids_to_execute),
                ),
            )
        else:
            # when `execute_run` is directly called, the sub pipeline hasn't been created
            # note that when we receive the solids to execute via PipelineRun, it won't support
            # solid selection query syntax
            pipeline = pipeline.subset_for_execution_from_existing_pipeline(
                pipeline_run.solids_to_execute)

    execution_plan = create_execution_plan(
        pipeline,
        run_config=pipeline_run.run_config,
        mode=pipeline_run.mode,
        step_keys_to_execute=pipeline_run.step_keys_to_execute,
    )

    # Memoized runs swap in a plan with already-stored step outputs pruned.
    if is_memoized_run(pipeline_run.tags):
        execution_plan = resolve_memoized_execution_plan(execution_plan)

    _execute_run_iterable = _ExecuteRunWithPlanIterable(
        execution_plan=execution_plan,
        iterator=_pipeline_execution_iterator,
        execution_context_manager=PipelineExecutionContextManager(
            execution_plan=execution_plan,
            pipeline_run=pipeline_run,
            instance=instance,
            run_config=pipeline_run.run_config,
            raise_on_error=raise_on_error,
        ),
    )
    # Draining the iterable drives the run to completion.
    event_list = list(_execute_run_iterable)
    pipeline_context = _execute_run_iterable.pipeline_context

    # workaround for mem_io_manager to work in reconstruct_context, e.g. result.result_for_solid
    # in-memory values dict will get lost when the resource is re-initiated in reconstruct_context
    # so instead of re-initiating every single resource, we pass the resource instances to
    # reconstruct_context directly to avoid re-building from resource def.
    resource_instances_to_override = {}
    if pipeline_context:  # None if we have a pipeline failure
        for (
            key,
            resource_instance,
        ) in pipeline_context.scoped_resources_builder.resource_instance_dict.items(
        ):
            # Only in-memory IO managers need to be carried over; other resources
            # can be rebuilt from their definitions.
            if isinstance(resource_instance, InMemoryIOManager):
                resource_instances_to_override[key] = resource_instance

    return PipelineExecutionResult(
        pipeline.get_definition(),
        pipeline_run.run_id,
        event_list,
        # Lazily rebuilds a pipeline context for post-hoc inspection of results.
        # NOTE(review): this lambda dereferences pipeline_context.intermediate_storage,
        # but the guard above shows pipeline_context can be None on pipeline failure —
        # confirm callers never invoke the reconstructor after a failed run.
        lambda hardcoded_resources_arg: scoped_pipeline_context(
            execution_plan,
            pipeline_run.run_config,
            pipeline_run,
            instance,
            intermediate_storage=pipeline_context.intermediate_storage,
            resource_instances_to_override=hardcoded_resources_arg,
        ),
        resource_instances_to_override=resource_instances_to_override,
    )
def execute_run(pipeline, pipeline_run, instance, raise_on_error=False):
    """Executes an existing pipeline run synchronously.

    Synchronous version of execute_run_iterator.

    Args:
        pipeline (IPipeline): The pipeline to execute.
        pipeline_run (PipelineRun): The run to execute
        instance (DagsterInstance): The instance in which the run has been created.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``False``.

    Returns:
        PipelineExecutionResult: The result of the execution.
    """
    # Reject a bare PipelineDefinition early: downstream machinery requires an
    # IPipeline so the run can be handed off to other processes.
    if isinstance(pipeline, PipelineDefinition):
        raise DagsterInvariantViolationError(
            "execute_run requires an IPipeline but received a PipelineDefinition "
            "directly instead. To support hand-off to other processes provide a "
            "ReconstructablePipeline which can be done using reconstructable(). For in "
            "process only execution you can use InMemoryPipeline.")

    check.inst_param(pipeline, "pipeline", IPipeline)
    check.inst_param(pipeline_run, "pipeline_run", PipelineRun)
    check.inst_param(instance, "instance", DagsterInstance)

    # Only runs that have not yet started may be executed here.
    check.invariant(
        pipeline_run.status == PipelineRunStatus.NOT_STARTED
        or pipeline_run.status == PipelineRunStatus.STARTING,
        desc="Pipeline run {} ({}) in state {}, expected NOT_STARTED or STARTING"
        .format(pipeline_run.pipeline_name, pipeline_run.run_id, pipeline_run.status),
    )
    pipeline_def = pipeline.get_definition()
    if pipeline_run.solids_to_execute:
        if isinstance(pipeline_def, PipelineSubsetDefinition):
            # The pipeline was already subset; the run's selection must agree with it.
            check.invariant(
                pipeline_run.solids_to_execute == pipeline.solids_to_execute,
                "Cannot execute PipelineRun with solids_to_execute {solids_to_execute} that "
                "conflicts with pipeline subset {pipeline_solids_to_execute}.".format(
                    pipeline_solids_to_execute=str_format_set(pipeline.solids_to_execute),
                    solids_to_execute=str_format_set(pipeline_run.solids_to_execute),
                ),
            )
        else:
            # when `execute_run` is directly called, the sub pipeline hasn't been created
            # note that when we receive the solids to execute via PipelineRun, it won't support
            # solid selection query syntax
            pipeline = pipeline.subset_for_execution_from_existing_pipeline(
                pipeline_run.solids_to_execute)

    execution_plan = create_execution_plan(
        pipeline,
        run_config=pipeline_run.run_config,
        mode=pipeline_run.mode,
        step_keys_to_execute=pipeline_run.step_keys_to_execute,
    )

    # Memoized runs swap in a plan with already-stored step outputs pruned.
    if is_memoized_run(pipeline_run.tags):
        execution_plan = resolve_memoized_execution_plan(execution_plan)

    _execute_run_iterable = _ExecuteRunWithPlanIterable(
        execution_plan=execution_plan,
        iterator=_pipeline_execution_iterator,
        execution_context_manager=PipelineExecutionContextManager(
            execution_plan=execution_plan,
            pipeline_run=pipeline_run,
            instance=instance,
            run_config=pipeline_run.run_config,
            raise_on_error=raise_on_error,
        ),
    )
    # Draining the iterable drives the run to completion.
    event_list = list(_execute_run_iterable)
    pipeline_context = _execute_run_iterable.pipeline_context

    return PipelineExecutionResult(
        pipeline.get_definition(),
        pipeline_run.run_id,
        event_list,
        # Lazily rebuilds a pipeline context for post-hoc inspection of results.
        # NOTE(review): pipeline_context may be None after a pipeline failure (see the
        # sibling version of this function) — confirm the reconstructor is never
        # invoked for a failed run before relying on intermediate_storage here.
        lambda: scoped_pipeline_context(
            execution_plan,
            pipeline_run.run_config,
            pipeline_run,
            instance,
            intermediate_storage=pipeline_context.intermediate_storage,
        ),
    )