def test_using_file_system_for_subplan_missing_input():
    pipeline = define_inty_pipeline()
    run_config = {"storage": {"filesystem": {}}}

    instance = DagsterInstance.ephemeral()
    environment_config = EnvironmentConfig.build(
        pipeline,
        run_config=run_config,
    )
    execution_plan = ExecutionPlan.build(
        InMemoryPipeline(pipeline),
        environment_config,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline, execution_plan=execution_plan)

    events = execute_plan(
        execution_plan.build_subset_plan(["add_one"], pipeline,
                                         environment_config),
        InMemoryPipeline(pipeline),
        instance,
        run_config=run_config,
        pipeline_run=pipeline_run,
    )
    failures = [
        event for event in events if event.event_type_value == "STEP_FAILURE"
    ]
    assert len(failures) == 1
    assert failures[0].step_key == "add_one"
    assert "DagsterStepOutputNotFoundError" in failures[
        0].event_specific_data.error.message
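
These tests call a define_inty_pipeline helper that is not reproduced in this listing; some variants also take a using_file_system flag. Below is a minimal sketch of the two-step pipeline (return_one feeding add_one) that the assertions imply, written against the legacy solid/pipeline API; the actual helper in the Dagster test suite differs in detail:

from dagster import InputDefinition, Int, OutputDefinition, pipeline, solid


def define_inty_pipeline():
    @solid(output_defs=[OutputDefinition(Int)])
    def return_one(_context):
        # Emits the integer consumed downstream by add_one.
        return 1

    @solid(input_defs=[InputDefinition("num", Int)],
           output_defs=[OutputDefinition(Int)])
    def add_one(_context, num):
        return num + 1

    @pipeline
    def inty_pipeline():
        add_one(return_one())

    return inty_pipeline
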
Example #2
def test_compile():
    run_config = RunConfig()
    environment_config = EnvironmentConfig.build(
        composition,
        {'solids': {
            'add_four': {
                'inputs': {
                    'num': {
                        'value': 1
                    }
                }
            }
        }},
        run_config=None)

    plan = ExecutionPlan.build(
        composition, environment_config,
        composition.get_mode_definition(run_config.mode))

    res = coalesce_execution_steps(plan)

    assert set(res.keys()) == {
        'add_four.add_two.add_one',
        'add_four.add_two.add_one_2',
        'add_four.add_two_2.add_one',
        'add_four.add_two_2.add_one_2',
        'div_four.div_two',
        'div_four.div_two_2',
    }
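
The composition pipeline that the test_compile variants build an execution plan against is defined elsewhere (in Dagster's legacy examples, per the TODO in a later variant). The dotted step keys in the assertion come from nested composite solids; the following is a rough, hypothetical sketch of a structure that would produce them (the later variants also include an int_to_float solid between add_four and div_four):

from dagster import (InputDefinition, Int, OutputDefinition, composite_solid,
                     lambda_solid, pipeline)


@lambda_solid
def add_one(num):
    return num + 1


@lambda_solid
def div_two(num):
    return num / 2


@composite_solid(output_defs=[OutputDefinition()])
def add_two(num):
    # Two invocations of the same solid yield the add_one / add_one_2 step keys.
    return add_one(add_one(num))


@composite_solid(input_defs=[InputDefinition("num", Int)],
                 output_defs=[OutputDefinition()])
def add_four(num):
    return add_two(add_two(num))


@composite_solid(output_defs=[OutputDefinition()])
def div_four(num):
    return div_two(div_two(num))


@pipeline
def composition():
    # add_four's num input is left unconnected; the tests feed it via run config.
    div_four(add_four())
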
Example #3
def create_execution_plan(
    pipeline: Union[IPipeline, PipelineDefinition],
    run_config: Optional[dict] = None,
    mode: Optional[str] = None,
    step_keys_to_execute: Optional[List[str]] = None,
) -> ExecutionPlan:
    pipeline = _check_pipeline(pipeline)
    pipeline_def = pipeline.get_definition()
    check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)

    run_config = check.opt_dict_param(run_config, "run_config", key_type=str)
    mode = check.opt_str_param(mode,
                               "mode",
                               default=pipeline_def.get_default_mode_name())
    check.opt_list_param(step_keys_to_execute,
                         "step_keys_to_execute",
                         of_type=str)

    environment_config = EnvironmentConfig.build(pipeline_def,
                                                 run_config,
                                                 mode=mode)

    return ExecutionPlan.build(pipeline,
                               environment_config,
                               mode=mode,
                               step_keys_to_execute=step_keys_to_execute)
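
For reference, a call to this helper might look like the following sketch; my_pipeline and the solid/config names are placeholders rather than anything from this listing:

execution_plan = create_execution_plan(
    my_pipeline,
    run_config={"solids": {"add_one": {"inputs": {"num": {"value": 3}}}}},
    mode="default",
    step_keys_to_execute=["add_one"],
)

# Steps come back in topologically sorted order.
for step in execution_plan.topological_steps():
    print(step.key)
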
def test_using_file_system_for_subplan_invalid_step():
    pipeline = define_inty_pipeline()

    run_config = {"storage": {"filesystem": {}}}

    instance = DagsterInstance.ephemeral()

    environment_config = EnvironmentConfig.build(
        pipeline,
        run_config=run_config,
    )
    execution_plan = ExecutionPlan.build(
        InMemoryPipeline(pipeline),
        environment_config,
    )

    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline, execution_plan=execution_plan)

    with pytest.raises(DagsterExecutionStepNotFoundError):
        execute_plan(
            execution_plan.build_subset_plan(["nope.compute"], pipeline,
                                             environment_config),
            InMemoryPipeline(pipeline),
            instance,
            run_config=run_config,
            pipeline_run=pipeline_run,
        )
Example #5
def create_execution_plan(
    pipeline: Union[IPipeline, PipelineDefinition],
    run_config: Optional[dict] = None,
    mode: Optional[str] = None,
    step_keys_to_execute: Optional[List[str]] = None,
    known_state: KnownExecutionState = None,
) -> ExecutionPlan:
    pipeline = _check_pipeline(pipeline)
    pipeline_def = pipeline.get_definition()
    check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)
    run_config = check.opt_dict_param(run_config, "run_config", key_type=str)
    mode = check.opt_str_param(mode,
                               "mode",
                               default=pipeline_def.get_default_mode_name())
    check.opt_nullable_list_param(step_keys_to_execute,
                                  "step_keys_to_execute",
                                  of_type=str)

    resolved_run_config = ResolvedRunConfig.build(pipeline_def,
                                                  run_config,
                                                  mode=mode)

    return ExecutionPlan.build(
        pipeline,
        resolved_run_config,
        step_keys_to_execute=step_keys_to_execute,
        known_state=known_state,
    )
Example #6
def test_compile():
    environment_config = EnvironmentConfig.build(
        composition,
        {"solids": {
            "add_four": {
                "inputs": {
                    "num": {
                        "value": 1
                    }
                }
            }
        }},
    )

    plan = ExecutionPlan.build(InMemoryPipeline(composition),
                               environment_config)

    res = coalesce_execution_steps(plan)

    assert set(res.keys()) == {
        "add_four.add_two.add_one",
        "add_four.add_two.add_one_2",
        "add_four.add_two_2.add_one",
        "add_four.add_two_2.add_one_2",
        "div_four.div_two",
        "div_four.div_two_2",
        "int_to_float",
    }
def test_using_file_system_for_subplan_missing_input():
    pipeline = define_inty_pipeline(using_file_system=True)

    instance = DagsterInstance.ephemeral()
    resolved_run_config = ResolvedRunConfig.build(pipeline)
    execution_plan = ExecutionPlan.build(
        InMemoryPipeline(pipeline),
        resolved_run_config,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline, execution_plan=execution_plan)

    events = execute_plan(
        execution_plan.build_subset_plan(["add_one"], pipeline,
                                         resolved_run_config),
        InMemoryPipeline(pipeline),
        instance,
        pipeline_run=pipeline_run,
    )
    failures = [
        event for event in events if event.event_type_value == "STEP_FAILURE"
    ]
    assert len(failures) == 1
    assert failures[0].step_key == "add_one"
    assert "DagsterExecutionLoadInputError" in failures[
        0].event_specific_data.error.message
Example #8
def create_execution_plan(pipeline,
                          environment_dict=None,
                          mode=None,
                          step_keys_to_execute=None):
    pipeline = _check_pipeline(pipeline)
    pipeline_def = pipeline.get_definition()
    check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition)

    environment_dict = check.opt_dict_param(environment_dict,
                                            'environment_dict',
                                            key_type=str)
    mode = check.opt_str_param(mode,
                               'mode',
                               default=pipeline_def.get_default_mode_name())
    check.opt_list_param(step_keys_to_execute,
                         'step_keys_to_execute',
                         of_type=str)

    environment_config = EnvironmentConfig.build(pipeline_def,
                                                 environment_dict,
                                                 mode=mode)

    return ExecutionPlan.build(pipeline,
                               environment_config,
                               mode=mode,
                               step_keys_to_execute=step_keys_to_execute)
def test_compile():
    # TODO: remove dependency on legacy_examples
    # https://github.com/dagster-io/dagster/issues/2653
    environment_config = EnvironmentConfig.build(
        composition,
        {'solids': {
            'add_four': {
                'inputs': {
                    'num': {
                        'value': 1
                    }
                }
            }
        }},
    )

    plan = ExecutionPlan.build(InMemoryExecutablePipeline(composition),
                               environment_config)

    res = coalesce_execution_steps(plan)

    assert set(res.keys()) == {
        'add_four.add_two.add_one',
        'add_four.add_two.add_one_2',
        'add_four.add_two_2.add_one',
        'add_four.add_two_2.add_one_2',
        'div_four.div_two',
        'div_four.div_two_2',
        'int_to_float',
    }
Example #10
def test_compile():
    environment_config = EnvironmentConfig.build(
        composition,
        {'solids': {
            'add_four': {
                'inputs': {
                    'num': {
                        'value': 1
                    }
                }
            }
        }},
    )

    plan = ExecutionPlan.build(InMemoryExecutablePipeline(composition),
                               environment_config)

    res = coalesce_execution_steps(plan)

    assert set(res.keys()) == {
        'add_four.add_two.add_one',
        'add_four.add_two.add_one_2',
        'add_four.add_two_2.add_one',
        'add_four.add_two_2.add_one_2',
        'div_four.div_two',
        'div_four.div_two_2',
        'int_to_float',
    }
Example #11
def test_execution_plan_reexecution_with_in_memory():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = {"solids": {"add_one": {"inputs": {"num": {"value": 3}}}}}
    result = execute_pipeline(pipeline_def,
                              run_config=run_config,
                              instance=instance)

    assert result.success

    ## re-execute add_two

    environment_config = EnvironmentConfig.build(pipeline_def,
                                                 run_config=run_config)
    execution_plan = ExecutionPlan.build(InMemoryPipeline(pipeline_def),
                                         environment_config)

    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def,
        execution_plan=execution_plan,
        run_config=run_config,
        parent_run_id=result.run_id,
        root_run_id=result.run_id,
    )

    with pytest.raises(DagsterInvariantViolationError):
        execute_plan(
            execution_plan.build_subset_plan(["add_two"], pipeline_def,
                                             environment_config),
            InMemoryPipeline(pipeline_def),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
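
Both re-execution tests (this one and the test_execution_plan_reexecution variant in Example #29) depend on a define_addy_pipeline helper that is not shown. Judging from the run config and the asserted intermediates (add_one turns 3 into 4, add_two turns 4 into 6), it is roughly the following sketch; the decorator choices and names are assumptions:

from dagster import InputDefinition, Int, OutputDefinition, pipeline, solid


def define_addy_pipeline():
    @solid(input_defs=[InputDefinition("num", Int)],
           output_defs=[OutputDefinition(Int)])
    def add_one(_context, num):
        return num + 1

    @solid(input_defs=[InputDefinition("num", Int)],
           output_defs=[OutputDefinition(Int)])
    def add_two(_context, num):
        return num + 2

    @pipeline
    def addy_pipeline():
        add_two(add_one())

    return addy_pipeline
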
Example #12
def create_execution_plan(pipeline, environment_dict=None, mode=None):
    check.inst_param(pipeline, 'pipeline', PipelineDefinition)
    environment_dict = check.opt_dict_param(environment_dict,
                                            'environment_dict',
                                            key_type=str)
    check.opt_str_param(mode, 'mode')
    environment_config = create_environment_config(pipeline, environment_dict,
                                                   mode)
    return ExecutionPlan.build(pipeline, environment_config)
def test_using_intermediate_file_system_for_subplan_multiprocessing():
    with instance_for_test() as instance:

        run_config = {"intermediate_storage": {"filesystem": {}}}

        pipeline = reconstructable(define_inty_pipeline)

        environment_config = EnvironmentConfig.build(
            pipeline.get_definition(),
            run_config=run_config,
        )
        execution_plan = ExecutionPlan.build(
            pipeline,
            environment_config,
        )
        pipeline_run = instance.create_run_for_pipeline(
            pipeline_def=pipeline.get_definition(),
            execution_plan=execution_plan)

        assert execution_plan.get_step_by_key("return_one")

        return_one_step_events = list(
            execute_plan(
                execution_plan.build_subset_plan(["return_one"],
                                                 pipeline.get_definition(),
                                                 environment_config),
                pipeline,
                instance,
                run_config=dict(run_config, execution={"multiprocess": {}}),
                pipeline_run=pipeline_run,
            ))

        intermediate_storage = build_fs_intermediate_storage(
            instance.intermediates_directory, pipeline_run.run_id)

        assert get_step_output(return_one_step_events, "return_one")
        assert intermediate_storage.has_intermediate(
            None, StepOutputHandle("return_one"))
        assert (intermediate_storage.get_intermediate(
            None, Int, StepOutputHandle("return_one")).obj == 1)

        add_one_step_events = list(
            execute_plan(
                execution_plan.build_subset_plan(["add_one"],
                                                 pipeline.get_definition(),
                                                 environment_config),
                pipeline,
                instance,
                run_config=dict(run_config, execution={"multiprocess": {}}),
                pipeline_run=pipeline_run,
            ))

        assert get_step_output(add_one_step_events, "add_one")
        assert intermediate_storage.has_intermediate(
            None, StepOutputHandle("add_one"))
        assert (intermediate_storage.get_intermediate(
            None, Int, StepOutputHandle("add_one")).obj == 2)
Example #14
def _execute_pipeline_iterator(context_or_failure_event):
    # Due to the use of context managers, if user-land code in context or resource init fails,
    # we can get either a pipeline_context or the failure event here.
    if (isinstance(context_or_failure_event, DagsterEvent)
            and context_or_failure_event.event_type
            == DagsterEventType.PIPELINE_INIT_FAILURE):
        yield context_or_failure_event
        return

    pipeline_context = context_or_failure_event
    check.inst_param(pipeline_context, 'pipeline_context',
                     SystemPipelineExecutionContext)
    yield DagsterEvent.pipeline_start(pipeline_context)

    execution_plan = ExecutionPlan.build(
        pipeline_context.pipeline_def,
        pipeline_context.environment_config,
        pipeline_context.mode_def,
    )

    steps = execution_plan.topological_steps()

    if not steps:
        pipeline_context.log.debug(
            'Pipeline {pipeline} has no nodes and no execution will happen'.
            format(pipeline=pipeline_context.pipeline_def.display_name))
        yield DagsterEvent.pipeline_success(pipeline_context)
        return

    _setup_reexecution(pipeline_context.run_config, pipeline_context,
                       execution_plan)

    pipeline_context.log.debug(
        'About to execute the compute node graph in the following order {order}'
        .format(order=[step.key for step in steps]))

    check.invariant(
        len([
            step_input for step_input in steps[0].step_inputs
            if step_input.is_from_output
        ]) == 0)

    pipeline_success = True

    try:
        for event in invoke_executor_on_plan(
                pipeline_context, execution_plan,
                pipeline_context.run_config.step_keys_to_execute):
            if event.is_step_failure:
                pipeline_success = False
            yield event
    finally:
        if pipeline_success:
            yield DagsterEvent.pipeline_success(pipeline_context)
        else:
            yield DagsterEvent.pipeline_failure(pipeline_context)
Example #15
def create_execution_plan(pipeline, environment_dict=None, run_config=None):
    check.inst_param(pipeline, 'pipeline', PipelineDefinition)
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
    run_config = check.opt_inst_param(run_config, 'run_config', RunConfig, RunConfig())

    environment_config = EnvironmentConfig.build(pipeline, environment_dict, run_config)

    return ExecutionPlan.build(
        pipeline, environment_config, pipeline.get_mode_definition(run_config.mode)
    )
def test_using_file_system_for_subplan_multiprocessing():
    with instance_for_test() as instance:
        pipeline = reconstructable(define_reconstructable_inty_pipeline)

        resolved_run_config = ResolvedRunConfig.build(pipeline.get_definition())
        execution_plan = ExecutionPlan.build(
            pipeline,
            resolved_run_config,
        )
        pipeline_run = instance.create_run_for_pipeline(
            pipeline_def=pipeline.get_definition(),
            execution_plan=execution_plan)

        assert execution_plan.get_step_by_key("return_one")

        return_one_step_events = list(
            execute_plan(
                execution_plan.build_subset_plan(["return_one"],
                                                 pipeline.get_definition(),
                                                 resolved_run_config),
                pipeline,
                instance,
                run_config=dict(execution={"multiprocess": {}}),
                pipeline_run=pipeline_run,
            ))

        assert get_step_output(return_one_step_events, "return_one")
        with open(
                os.path.join(instance.storage_directory(), pipeline_run.run_id,
                             "return_one", "result"),
                "rb",
        ) as read_obj:
            assert pickle.load(read_obj) == 1

        add_one_step_events = list(
            execute_plan(
                execution_plan.build_subset_plan(["add_one"],
                                                 pipeline.get_definition(),
                                                 resolved_run_config),
                pipeline,
                instance,
                run_config=dict(execution={"multiprocess": {}}),
                pipeline_run=pipeline_run,
            ))

        assert get_step_output(add_one_step_events, "add_one")
        with open(
                os.path.join(instance.storage_directory(), pipeline_run.run_id,
                             "add_one", "result"),
                "rb",
        ) as read_obj:
            assert pickle.load(read_obj) == 2
def test_using_intermediates_file_system_for_subplan():
    pipeline = define_inty_pipeline()

    run_config = {"intermediate_storage": {"filesystem": {}}}

    instance = DagsterInstance.ephemeral()
    environment_config = EnvironmentConfig.build(
        pipeline,
        run_config=run_config,
    )

    execution_plan = ExecutionPlan.build(
        InMemoryPipeline(pipeline),
        environment_config,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline, execution_plan=execution_plan)
    assert execution_plan.get_step_by_key("return_one")

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["return_one"], pipeline,
                                             environment_config),
            InMemoryPipeline(pipeline),
            instance,
            run_config=run_config,
            pipeline_run=pipeline_run,
        ))

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, pipeline_run.run_id)
    assert get_step_output(return_one_step_events, "return_one")
    assert intermediate_storage.has_intermediate(
        None, StepOutputHandle("return_one"))
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("return_one")).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"], pipeline,
                                             environment_config),
            InMemoryPipeline(pipeline),
            instance,
            run_config=run_config,
            pipeline_run=pipeline_run,
        ))

    assert get_step_output(add_one_step_events, "add_one")
    assert intermediate_storage.has_intermediate(None,
                                                 StepOutputHandle("add_one"))
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one")).obj == 2
def test_using_file_system_for_subplan():
    pipeline = define_inty_pipeline(using_file_system=True)

    instance = DagsterInstance.ephemeral()

    resolved_run_config = ResolvedRunConfig.build(pipeline)
    execution_plan = ExecutionPlan.build(InMemoryPipeline(pipeline),
                                         resolved_run_config)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline, execution_plan=execution_plan)
    assert execution_plan.get_step_by_key("return_one")

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["return_one"], pipeline,
                                             resolved_run_config),
            InMemoryPipeline(pipeline),
            instance,
            pipeline_run=pipeline_run,
        ))

    assert get_step_output(return_one_step_events, "return_one")
    with open(
            os.path.join(instance.storage_directory(), pipeline_run.run_id,
                         "return_one", "result"),
            "rb",
    ) as read_obj:
        assert pickle.load(read_obj) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"], pipeline,
                                             resolved_run_config),
            InMemoryPipeline(pipeline),
            instance,
            pipeline_run=pipeline_run,
        ))

    assert get_step_output(add_one_step_events, "add_one")
    with open(
            os.path.join(instance.storage_directory(), pipeline_run.run_id,
                         "add_one", "result"),
            "rb",
    ) as read_obj:
        assert pickle.load(read_obj) == 2
Example #19
def execute_list_versions_command(instance, kwargs):
    check.inst_param(instance, "instance", DagsterInstance)

    config = list(
        check.opt_tuple_param(kwargs.get("config"),
                              "config",
                              default=(),
                              of_type=str))
    preset = kwargs.get("preset")
    mode = kwargs.get("mode")

    if preset and config:
        raise click.UsageError("Can not use --preset with --config.")

    pipeline_origin = get_pipeline_python_origin_from_kwargs(kwargs)
    pipeline = recon_pipeline_from_origin(pipeline_origin)
    run_config = get_run_config_from_file_list(config)

    environment_config = EnvironmentConfig.build(pipeline.get_definition(),
                                                 run_config,
                                                 mode=mode)
    execution_plan = ExecutionPlan.build(pipeline, environment_config)

    step_output_versions = resolve_step_output_versions(
        pipeline.get_definition(), execution_plan, environment_config)
    memoized_plan = resolve_memoized_execution_plan(execution_plan,
                                                    pipeline.get_definition(),
                                                    run_config, instance,
                                                    environment_config)
    # the step keys that we need to execute are those which do not have their inputs populated.
    step_keys_not_stored = set(memoized_plan.step_keys_to_execute)
    table = []
    for step_output_handle, version in step_output_versions.items():
        table.append([
            "{key}.{output}".format(key=step_output_handle.step_key,
                                    output=step_output_handle.output_name),
            version,
            "stored" if step_output_handle.step_key not in step_keys_not_stored
            else "to-be-recomputed",
        ])
    table_str = tabulate(
        table,
        headers=["Step Output", "Version", "Status of Output"],
        tablefmt="github")
    click.echo(table_str)
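
For a two-step pipeline like the inty examples above, the GitHub-flavored table this command echoes would look roughly like the following; the version hashes are placeholders:

| Step Output        | Version   | Status of Output |
|--------------------|-----------|------------------|
| return_one.result  | 4d3a9f    | stored           |
| add_one.result     | 9b17c2    | to-be-recomputed |
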
def test_execute_step_wrong_step_key():
    pipeline = define_inty_pipeline()
    instance = DagsterInstance.ephemeral()

    environment_config = EnvironmentConfig.build(pipeline)
    execution_plan = ExecutionPlan.build(
        InMemoryPipeline(pipeline),
        environment_config,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline, execution_plan=execution_plan)

    with pytest.raises(DagsterExecutionStepNotFoundError) as exc_info:
        execute_plan(
            execution_plan.build_subset_plan(["nope.compute"], pipeline,
                                             environment_config),
            InMemoryPipeline(pipeline),
            instance,
            pipeline_run=pipeline_run,
        )

    assert exc_info.value.step_keys == ["nope.compute"]

    assert str(exc_info.value
               ) == "Can not build subset plan from unknown step: nope.compute"

    with pytest.raises(DagsterExecutionStepNotFoundError) as exc_info:
        execute_plan(
            execution_plan.build_subset_plan(
                ["nope.compute", "nuh_uh.compute"], pipeline,
                environment_config),
            InMemoryPipeline(pipeline),
            instance,
            pipeline_run=pipeline_run,
        )

    assert exc_info.value.step_keys == ["nope.compute", "nuh_uh.compute"]

    assert (
        str(exc_info.value) ==
        "Can not build subset plan from unknown steps: nope.compute, nuh_uh.compute"
    )
Example #21
    def execute(self):
        from dagster.core.execution.api import scoped_pipeline_context

        check.inst(self.run_config.executor_config, MultiprocessExecutorConfig)
        pipeline = self.run_config.executor_config.handle.build_pipeline_definition()

        with scoped_pipeline_context(
                pipeline, self.environment_dict,
                self.run_config.with_tags(
                    pid=str(os.getpid()))) as pipeline_context:

            execution_plan = ExecutionPlan.build(
                pipeline_context.pipeline_def,
                pipeline_context.environment_config)

            for step_event in InProcessEngine.execute(
                    pipeline_context,
                    execution_plan,
                    step_keys_to_execute=[self.step_key]):
                yield step_event
def test_using_intermediates_to_override():
    pipeline = define_inty_pipeline()

    run_config = {
        "storage": {
            "filesystem": {}
        },
        "intermediate_storage": {
            "in_memory": {}
        }
    }

    instance = DagsterInstance.ephemeral()
    resolved_run_config = ResolvedRunConfig.build(
        pipeline,
        run_config=run_config,
    )
    execution_plan = ExecutionPlan.build(
        InMemoryPipeline(pipeline),
        resolved_run_config,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline, execution_plan=execution_plan)
    assert execution_plan.get_step_by_key("return_one")

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["return_one"], pipeline,
                                             resolved_run_config),
            InMemoryPipeline(pipeline),
            instance,
            run_config=run_config,
            pipeline_run=pipeline_run,
        ))

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, pipeline_run.run_id)
    assert get_step_output(return_one_step_events, "return_one")
    assert not intermediate_storage.has_intermediate(
        None, StepOutputHandle("return_one"))
def test_using_file_system_for_subplan_invalid_step():
    pipeline = define_inty_pipeline(using_file_system=True)

    instance = DagsterInstance.ephemeral()

    resolved_run_config = ResolvedRunConfig.build(pipeline)
    execution_plan = ExecutionPlan.build(
        InMemoryPipeline(pipeline),
        resolved_run_config,
    )

    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline, execution_plan=execution_plan)

    with pytest.raises(DagsterExecutionStepNotFoundError):
        execute_plan(
            execution_plan.build_subset_plan(["nope.compute"], pipeline,
                                             resolved_run_config),
            InMemoryPipeline(pipeline),
            instance,
            pipeline_run=pipeline_run,
        )
Example #24
    def reconstitute_pipeline_context(
        self,
        output_log_path=None,
        marshal_dir=None,
        run_config=None,
        executable_dict=None,
        pipeline_run_dict=None,
        solid_handle_kwargs=None,
        instance_ref_dict=None,
    ):
        """Reconstitutes a context for dagstermill-managed execution.

        You'll see this function called to reconstruct a pipeline context within the ``injected
        parameters`` cell of a dagstermill output notebook. Users should not call this function
        interactively except when debugging output notebooks.

        Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a
        context for interactive exploration and development. This call will be replaced by one to
        :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by
        dagstermill.
        """
        check.opt_str_param(output_log_path, "output_log_path")
        check.opt_str_param(marshal_dir, "marshal_dir")
        run_config = check.opt_dict_param(run_config,
                                          "run_config",
                                          key_type=str)
        check.dict_param(pipeline_run_dict, "pipeline_run_dict")
        check.dict_param(executable_dict, "executable_dict")
        check.dict_param(solid_handle_kwargs, "solid_handle_kwargs")
        check.dict_param(instance_ref_dict, "instance_ref_dict")

        pipeline = ReconstructablePipeline.from_dict(executable_dict)
        pipeline_def = pipeline.get_definition()

        try:
            instance_ref = unpack_value(instance_ref_dict)
            instance = DagsterInstance.from_ref(instance_ref)
        except Exception as err:  # pylint: disable=broad-except
            raise DagstermillError(
                "Error when attempting to resolve DagsterInstance from serialized InstanceRef"
            ) from err

        pipeline_run = unpack_value(pipeline_run_dict)

        solid_handle = SolidHandle.from_dict(solid_handle_kwargs)
        solid_def = pipeline_def.get_solid(solid_handle).definition

        self.marshal_dir = marshal_dir
        self.in_pipeline = True
        self.solid_def = solid_def
        self.pipeline = pipeline

        environment_config = EnvironmentConfig.build(pipeline_def,
                                                     run_config,
                                                     mode=pipeline_run.mode)

        execution_plan = ExecutionPlan.build(
            self.pipeline,
            environment_config,
            step_keys_to_execute=pipeline_run.step_keys_to_execute,
        )

        with scoped_pipeline_context(
                execution_plan,
                pipeline,
                run_config,
                pipeline_run,
                instance,
                scoped_resources_builder_cm=self._setup_resources,
                # Set this flag even though we're not in test for clearer error reporting
                raise_on_error=True,
        ) as pipeline_context:
            self.context = DagstermillRuntimeExecutionContext(
                pipeline_context=pipeline_context,
                pipeline_def=pipeline_def,
                solid_config=run_config.get("solids",
                                            {}).get(solid_def.name,
                                                    {}).get("config"),
                resource_keys_to_init=get_required_resource_keys_to_init(
                    execution_plan,
                    pipeline_def,
                    environment_config,
                    pipeline_context.intermediate_storage_def,
                ),
                solid_name=solid_def.name,
            )

        return self.context
Example #25
    def get_context(self, solid_config=None, mode_def=None, run_config=None):
        """Get a dagstermill execution context for interactive exploration and development.

        Args:
            solid_config (Optional[Any]): If specified, this value will be made available on the
                context as its ``solid_config`` property.
            mode_def (Optional[:class:`dagster.ModeDefinition`]): If specified, defines the mode to
                use to construct the context. Specify this if you would like a context constructed
                with specific ``resource_defs`` or ``logger_defs``. By default, an ephemeral mode
                with a console logger will be constructed.
            run_config (Optional[dict]): The environment config dict with which to construct
                the context.

        Returns:
            :py:class:`~dagstermill.DagstermillExecutionContext`
        """
        check.opt_inst_param(mode_def, "mode_def", ModeDefinition)
        run_config = check.opt_dict_param(run_config,
                                          "run_config",
                                          key_type=str)

        # If we are running non-interactively, and there is already a context reconstituted, return
        # that context rather than overwriting it.
        if self.context is not None and isinstance(
                self.context, DagstermillRuntimeExecutionContext):
            return self.context

        if not mode_def:
            mode_def = ModeDefinition(
                logger_defs={"dagstermill": colored_console_logger})
            run_config["loggers"] = {"dagstermill": {}}

        solid_def = SolidDefinition(
            name="this_solid",
            input_defs=[],
            compute_fn=lambda *args, **kwargs: None,
            output_defs=[],
            description=
            "Ephemeral solid constructed by dagstermill.get_context()",
            required_resource_keys=mode_def.resource_key_set,
        )

        pipeline_def = PipelineDefinition(
            [solid_def],
            mode_defs=[mode_def],
            name="ephemeral_dagstermill_pipeline")

        run_id = make_new_run_id()

        # construct stubbed PipelineRun for notebook exploration...
        # The actual pipeline run during pipeline execution will be serialized and reconstituted
        # in the `reconstitute_pipeline_context` call
        pipeline_run = PipelineRun(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            run_config=run_config,
            mode=mode_def.name,
            step_keys_to_execute=None,
            status=PipelineRunStatus.NOT_STARTED,
            tags=None,
        )

        self.in_pipeline = False
        self.solid_def = solid_def
        self.pipeline = pipeline_def

        environment_config = EnvironmentConfig.build(pipeline_def,
                                                     run_config,
                                                     mode=mode_def.name)

        pipeline = InMemoryPipeline(pipeline_def)
        execution_plan = ExecutionPlan.build(pipeline, environment_config)

        with scoped_pipeline_context(
                execution_plan,
                pipeline,
                run_config,
                pipeline_run,
                DagsterInstance.ephemeral(),
                scoped_resources_builder_cm=self._setup_resources,
        ) as pipeline_context:

            self.context = DagstermillExecutionContext(
                pipeline_context=pipeline_context,
                pipeline_def=pipeline_def,
                solid_config=solid_config,
                resource_keys_to_init=get_required_resource_keys_to_init(
                    execution_plan,
                    pipeline_def,
                    environment_config,
                    pipeline_context.intermediate_storage_def,
                ),
                solid_name=solid_def.name,
            )

        return self.context
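
A usage sketch for interactive notebook exploration; the solid_config value here is a placeholder:

import dagstermill

context = dagstermill.get_context(solid_config={"threshold": 0.5})
context.log.info("solid_config: {}".format(context.solid_config))
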
Example #26
def test_gcs_pickle_io_manager_execution(gcs_bucket):
    inty_job = define_inty_job()

    run_config = {
        "resources": {
            "io_manager": {
                "config": {
                    "gcs_bucket": gcs_bucket,
                }
            }
        }
    }

    run_id = make_new_run_id()

    resolved_run_config = ResolvedRunConfig.build(inty_job,
                                                  run_config=run_config)
    execution_plan = ExecutionPlan.build(InMemoryPipeline(inty_job),
                                         resolved_run_config)

    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(pipeline_name=inty_job.name,
                               run_id=run_id,
                               run_config=run_config)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys, inty_job,
                                             resolved_run_config),
            pipeline=InMemoryPipeline(inty_job),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(return_one_step_events, "return_one")

    io_manager = PickledObjectGCSIOManager(gcs_bucket, storage.Client())
    step_output_handle = StepOutputHandle("return_one")
    context = build_input_context(upstream_output=build_output_context(
        step_key=step_output_handle.step_key,
        name=step_output_handle.output_name,
        run_id=run_id,
    ))
    assert io_manager.load_input(context) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"], inty_job,
                                             resolved_run_config),
            pipeline=InMemoryPipeline(inty_job),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    step_output_handle = StepOutputHandle("add_one")
    context = build_input_context(upstream_output=build_output_context(
        step_key=step_output_handle.step_key,
        name=step_output_handle.output_name,
        run_id=run_id,
    ))

    assert get_step_output(add_one_step_events, "add_one")
    assert io_manager.load_input(context) == 2
Example #27
def test_using_gcs_for_subplan(gcs_bucket):
    pipeline_def = define_inty_pipeline()

    run_config = {"intermediate_storage": {"gcs": {"config": {"gcs_bucket": gcs_bucket}}}}

    run_id = make_new_run_id()

    environment_config = EnvironmentConfig.build(pipeline_def, run_config=run_config)
    execution_plan = ExecutionPlan.build(InMemoryPipeline(pipeline_def), environment_config)

    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config
    )

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys, pipeline_def, environment_config),
            pipeline=InMemoryPipeline(pipeline_def),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, "return_one")
    with scoped_pipeline_context(
        execution_plan.build_subset_plan(["return_one"], pipeline_def, environment_config),
        InMemoryPipeline(pipeline_def),
        run_config,
        pipeline_run,
        instance,
    ) as context:
        intermediate_storage = GCSIntermediateStorage(
            gcs_bucket,
            run_id,
            client=context.scoped_resources_builder.build(
                required_resource_keys={"gcs"},
            ).gcs,
        )
        assert intermediate_storage.has_intermediate(context, StepOutputHandle("return_one"))
        assert (
            intermediate_storage.get_intermediate(context, Int, StepOutputHandle("return_one")).obj
            == 1
        )

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"], pipeline_def, environment_config),
            pipeline=InMemoryPipeline(pipeline_def),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(add_one_step_events, "add_one")
    with scoped_pipeline_context(
        execution_plan.build_subset_plan(["return_one"], pipeline_def, environment_config),
        InMemoryPipeline(pipeline_def),
        run_config,
        pipeline_run,
        instance,
    ) as context:
        assert intermediate_storage.has_intermediate(context, StepOutputHandle("add_one"))
        assert (
            intermediate_storage.get_intermediate(context, Int, StepOutputHandle("add_one")).obj
            == 2
        )
def test_using_s3_for_subplan(mock_s3_bucket):
    pipeline_def = define_inty_pipeline()

    run_config = {
        "intermediate_storage": {
            "s3": {
                "config": {
                    "s3_bucket": mock_s3_bucket.name
                }
            }
        }
    }

    run_id = make_new_run_id()

    environment_config = EnvironmentConfig.build(pipeline_def,
                                                 run_config=run_config)
    execution_plan = ExecutionPlan.build(InMemoryPipeline(pipeline_def),
                                         environment_config)

    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(pipeline_name=pipeline_def.name,
                               run_id=run_id,
                               run_config=run_config)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys, pipeline_def,
                                             environment_config),
            pipeline=InMemoryPipeline(pipeline_def),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(return_one_step_events, "return_one")
    with scoped_pipeline_context(
            execution_plan.build_subset_plan(["return_one"], pipeline_def,
                                             environment_config),
            InMemoryPipeline(pipeline_def),
            run_config,
            pipeline_run,
            instance,
    ) as context:

        intermediates_manager = S3IntermediateStorage(
            mock_s3_bucket.name,
            run_id,
            s3_session=context.scoped_resources_builder.build(
                required_resource_keys={"s3"}, ).s3,
        )
        step_output_handle = StepOutputHandle("return_one")
        assert intermediates_manager.has_intermediate(context,
                                                      step_output_handle)
        assert intermediates_manager.get_intermediate(
            context, Int, step_output_handle).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"], pipeline_def,
                                             environment_config),
            pipeline=InMemoryPipeline(pipeline_def),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(add_one_step_events, "add_one")
    with scoped_pipeline_context(
            execution_plan.build_subset_plan(["add_one"], pipeline_def,
                                             environment_config),
            InMemoryPipeline(pipeline_def),
            run_config,
            pipeline_run,
            instance,
    ) as context:
        step_output_handle = StepOutputHandle("add_one")
        assert intermediates_manager.has_intermediate(context,
                                                      step_output_handle)
        assert intermediates_manager.get_intermediate(
            context, Int, step_output_handle).obj == 2
Example #29
def test_execution_plan_reexecution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs(
        {"solids": {
            "add_one": {
                "inputs": {
                    "num": {
                        "value": 3
                    }
                }
            }
        }})
    result = execute_pipeline(
        pipeline_def,
        run_config=run_config,
        instance=instance,
    )

    assert result.success

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one")).obj == 4
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two")).obj == 6

    ## re-execute add_two

    environment_config = EnvironmentConfig.build(
        pipeline_def,
        run_config=run_config,
    )
    execution_plan = ExecutionPlan.build(
        InMemoryPipeline(pipeline_def),
        environment_config,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def,
        execution_plan=execution_plan,
        run_config=run_config,
        parent_run_id=result.run_id,
        root_run_id=result.run_id,
    )

    step_events = execute_plan(
        execution_plan.build_subset_plan(["add_two"], pipeline_def,
                                         environment_config),
        InMemoryPipeline(pipeline_def),
        run_config=run_config,
        pipeline_run=pipeline_run,
        instance=instance,
    )

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one")).obj == 4
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two")).obj == 6

    assert not get_step_output_event(step_events, "add_one")
    assert get_step_output_event(step_events, "add_two")
Example #30
def test_s3_pickle_io_manager_execution(mock_s3_bucket):
    pipeline_def = define_inty_pipeline()

    run_config = {
        "resources": {
            "io_manager": {
                "config": {
                    "s3_bucket": mock_s3_bucket.name
                }
            }
        }
    }

    run_id = make_new_run_id()

    resolved_run_config = ResolvedRunConfig.build(pipeline_def,
                                                  run_config=run_config)
    execution_plan = ExecutionPlan.build(InMemoryPipeline(pipeline_def),
                                         resolved_run_config)

    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(pipeline_name=pipeline_def.name,
                               run_id=run_id,
                               run_config=run_config)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys, pipeline_def,
                                             resolved_run_config),
            pipeline=InMemoryPipeline(pipeline_def),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(return_one_step_events, "return_one")

    io_manager = PickledObjectS3IOManager(mock_s3_bucket.name,
                                          construct_s3_client(max_attempts=5),
                                          s3_prefix="dagster")
    step_output_handle = StepOutputHandle("return_one")
    context = build_input_context(upstream_output=build_output_context(
        step_key=step_output_handle.step_key,
        name=step_output_handle.output_name,
        run_id=run_id,
    ))
    assert io_manager.load_input(context) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"], pipeline_def,
                                             resolved_run_config),
            pipeline=InMemoryPipeline(pipeline_def),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    step_output_handle = StepOutputHandle("add_one")
    context = build_input_context(upstream_output=build_output_context(
        step_key=step_output_handle.step_key,
        name=step_output_handle.output_name,
        run_id=run_id,
    ))

    assert get_step_output(add_one_step_events, "add_one")
    assert io_manager.load_input(context) == 2