Example #1
def test_reexecution_fs_storage():
    @lambda_solid
    def return_one():
        return 1

    @lambda_solid
    def add_one(num):
        return num + 1

    pipeline_def = PipelineDefinition(
        solid_defs=[return_one, add_one],
        dependencies={'add_one': {
            'num': DependencyDefinition('return_one')
        }},
    )
    environment_dict = {'storage': {'filesystem': {}}}
    instance = DagsterInstance.ephemeral()
    pipeline_result = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        instance=instance)
    assert pipeline_result.success
    assert pipeline_result.result_for_solid('add_one').output_value() == 2

    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        parent_run_id=pipeline_result.run_id,
        root_run_id=pipeline_result.run_id,
    )

    reexecution_result = execute_run(InMemoryExecutablePipeline(pipeline_def),
                                     pipeline_run, instance)

    assert reexecution_result.success
    assert len(reexecution_result.solid_result_list) == 2
    assert reexecution_result.result_for_solid(
        'return_one').output_value() == 1
    assert reexecution_result.result_for_solid('add_one').output_value() == 2
    reexecution_run = instance.get_run_by_id(reexecution_result.run_id)
    assert reexecution_run.parent_run_id == pipeline_result.run_id
    assert reexecution_run.root_run_id == pipeline_result.run_id

    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        parent_run_id=reexecution_result.run_id,
        root_run_id=pipeline_result.run_id,
    )

    grandchild_result = execute_run(InMemoryExecutablePipeline(pipeline_def),
                                    pipeline_run, instance)

    assert grandchild_result.success
    assert len(grandchild_result.solid_result_list) == 2
    assert grandchild_result.result_for_solid('return_one').output_value() == 1
    assert grandchild_result.result_for_solid('add_one').output_value() == 2
    grandchild_run = instance.get_run_by_id(grandchild_result.run_id)
    assert grandchild_run.parent_run_id == reexecution_result.run_id
    assert grandchild_run.root_run_id == pipeline_result.run_id
Example #2
def test_in_memory_persist_one_run():
    with DagsterInstance.ephemeral() as instance:
        do_test_single_write_read(instance)
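The example above relies on a test helper; below is a minimal, hedged sketch of the same context-manager pattern with no helpers (the assertion is illustrative, not from the source):

# Hedged sketch: DagsterInstance.ephemeral() keeps all run records in memory,
# so a freshly created instance has no persisted runs and everything is
# discarded when the context exits.
from dagster import DagsterInstance

with DagsterInstance.ephemeral() as instance:
    assert list(instance.get_runs()) == []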
Example #3
def test_create_app_with_workspace():
    with load_workspace_from_yaml_paths(
        [file_relative_path(__file__, "./workspace.yaml")], ) as workspace:
        assert create_app_from_workspace(workspace,
                                         DagsterInstance.ephemeral())
Example #4
def test_create_app_with_workspace():
    with load_workspace_process_context_from_yaml_paths(
        DagsterInstance.ephemeral(),
        [file_relative_path(__file__, "./workspace.yaml")],
    ) as workspace_process_context:
        assert create_app_from_workspace_process_context(workspace_process_context)
Example #5
def test_successful_pipeline_reexecution(snapshot):
    def sanitize_result_data(result_data):
        if isinstance(result_data, dict):
            if 'path' in result_data:
                result_data['path'] = 'DUMMY_PATH'
            result_data = {
                k: sanitize_result_data(v)
                for k, v in result_data.items()
            }
        elif isinstance(result_data, list):
            for i in range(len(result_data)):
                result_data[i] = sanitize_result_data(result_data[i])
        else:
            pass
        return result_data

    run_id = str(uuid.uuid4())
    instance = DagsterInstance.ephemeral()
    result_one = execute_dagster_graphql(
        define_context(instance=instance),
        START_PIPELINE_EXECUTION_SNAPSHOT_QUERY,
        variables={
            'executionParams': {
                'selector': {
                    'name': 'csv_hello_world'
                },
                'environmentConfigData':
                csv_hello_world_solids_config_fs_storage(),
                'executionMetadata': {
                    'runId': run_id
                },
                'mode': 'default',
            }
        },
    )

    assert (result_one.data['startPipelineExecution']['__typename'] ==
            'StartPipelineExecutionSuccess')

    snapshot.assert_match(sanitize_result_data(result_one.data))

    expected_value_repr = (
        '''[OrderedDict([('num1', '1'), ('num2', '2'), ('sum', 3), '''
        '''('sum_sq', 9)]), OrderedDict([('num1', '3'), ('num2', '4'), ('sum', 7), '''
        '''('sum_sq', 49)])]''')

    store = FilesystemIntermediateStore.for_instance(instance, run_id)
    assert store.has_intermediate(None, 'sum_solid.compute')
    assert store.has_intermediate(None, 'sum_sq_solid.compute')
    assert (str(
        store.get_intermediate(None, 'sum_sq_solid.compute',
                               PoorMansDataFrame).obj) == expected_value_repr)

    new_run_id = str(uuid.uuid4())

    result_two = execute_dagster_graphql(
        define_context(instance=instance),
        START_PIPELINE_EXECUTION_SNAPSHOT_QUERY,
        variables={
            'executionParams': {
                'selector': {
                    'name': 'csv_hello_world'
                },
                'environmentConfigData':
                csv_hello_world_solids_config_fs_storage(),
                'stepKeys': ['sum_sq_solid.compute'],
                'executionMetadata': {
                    'runId': new_run_id
                },
                'mode': 'default',
            },
            'reexecutionConfig': {
                'previousRunId':
                run_id,
                'stepOutputHandles': [{
                    'stepKey': 'sum_solid.compute',
                    'outputName': 'result'
                }],
            },
        },
    )

    query_result = result_two.data['startPipelineExecution']
    assert query_result['__typename'] == 'StartPipelineExecutionSuccess'
    logs = query_result['run']['logs']['nodes']

    assert isinstance(logs, list)
    assert has_event_of_type(logs, 'PipelineStartEvent')
    assert has_event_of_type(logs, 'PipelineSuccessEvent')
    assert not has_event_of_type(logs, 'PipelineFailureEvent')

    assert not get_step_output_event(logs, 'sum_solid.compute')
    assert get_step_output_event(logs, 'sum_sq_solid.compute')

    snapshot.assert_match(sanitize_result_data(result_two.data))

    store = FilesystemIntermediateStore.for_instance(instance, new_run_id)
    assert not store.has_intermediate(None, 'sum_solid.inputs.num.read',
                                      'input_thunk_output')
    assert store.has_intermediate(None, 'sum_solid.compute')
    assert store.has_intermediate(None, 'sum_sq_solid.compute')
    assert (str(
        store.get_intermediate(None, 'sum_sq_solid.compute',
                               PoorMansDataFrame).obj) == expected_value_repr)
Example #6
def test_create_app_with_reconstructable_repo():
    recon_repo = ReconstructableRepository.from_yaml(
        file_relative_path(__file__, './repository.yaml')
    )
    assert create_app_with_reconstructable_repo(recon_repo, DagsterInstance.ephemeral())
Example #7
def test_using_adls2_for_subplan(storage_account, file_system):
    pipeline_def = define_inty_pipeline()

    run_config = {
        "resources": {
            "adls2": {
                "config": {
                    "storage_account": storage_account,
                    "credential": get_azure_credential()
                }
            }
        },
        "intermediate_storage": {
            "adls2": {
                "config": {
                    "adls2_file_system": file_system
                }
            }
        },
    }

    run_id = make_new_run_id()

    environment_config = EnvironmentConfig.build(pipeline_def,
                                                 run_config=run_config)
    execution_plan = ExecutionPlan.build(InMemoryPipeline(pipeline_def),
                                         environment_config)

    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(pipeline_name=pipeline_def.name,
                               run_id=run_id,
                               run_config=run_config)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys, pipeline_def,
                                             environment_config),
            pipeline=InMemoryPipeline(pipeline_def),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(return_one_step_events, "return_one")
    with scoped_pipeline_context(
            execution_plan.build_subset_plan(["return_one"], pipeline_def,
                                             environment_config),
            InMemoryPipeline(pipeline_def),
            run_config,
            pipeline_run,
            instance,
    ) as context:

        resource = context.scoped_resources_builder.build(
            required_resource_keys={"adls2"}).adls2
        intermediate_storage = ADLS2IntermediateStorage(
            file_system=file_system,
            run_id=run_id,
            adls2_client=resource.adls2_client,
            blob_client=resource.blob_client,
        )
        step_output_handle = StepOutputHandle("return_one")
        assert intermediate_storage.has_intermediate(context,
                                                     step_output_handle)
        assert intermediate_storage.get_intermediate(
            context, Int, step_output_handle).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"], pipeline_def,
                                             environment_config),
            pipeline=InMemoryPipeline(pipeline_def),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(add_one_step_events, "add_one")
    with scoped_pipeline_context(
            execution_plan.build_subset_plan(["add_one"], pipeline_def,
                                             environment_config),
            InMemoryPipeline(pipeline_def),
            run_config,
            pipeline_run,
            instance,
    ) as context:
        step_output_handle = StepOutputHandle("add_one")
        assert intermediate_storage.has_intermediate(context,
                                                     step_output_handle)
        assert intermediate_storage.get_intermediate(
            context, Int, step_output_handle).obj == 2
Example #8
def _check_execute_pipeline_args(fn_name, pipeline, environment_dict, mode,
                                 preset, tags, run_config, instance):
    # backcompat
    if isinstance(pipeline, PipelineDefinition):
        pipeline = InMemoryExecutablePipeline(pipeline)

    check.inst_param(pipeline, 'pipeline', ExecutablePipeline)
    pipeline_def = pipeline.get_definition()

    environment_dict = check.opt_dict_param(environment_dict,
                                            'environment_dict')

    check.opt_str_param(mode, 'mode')
    check.opt_str_param(preset, 'preset')
    check.invariant(
        not (mode is not None and preset is not None),
        'You may set only one of `mode` (got {mode}) or `preset` (got {preset}).'
        .format(mode=mode, preset=preset),
    )

    tags = check.opt_dict_param(tags, 'tags', key_type=str)

    run_config = check.opt_inst_param(run_config,
                                      'run_config',
                                      RunConfig,
                                      default=RunConfig())

    if preset is not None:
        pipeline_preset = pipeline_def.get_preset(preset)

        check.invariant(
            run_config.mode is None or pipeline_preset.mode == run_config.mode,
            'The mode set in preset \'{preset}\' (\'{preset_mode}\') does not agree with the mode '
            'set in the `run_config` (\'{run_config_mode}\')'.format(
                preset=preset,
                preset_mode=pipeline_preset.mode,
                run_config_mode=run_config.mode),
        )

        if pipeline_preset.environment_dict is not None:
            check.invariant(
                (not environment_dict)
                or (pipeline_preset.environment_dict == environment_dict),
                'The environment set in preset \'{preset}\' does not agree with the environment '
                'passed in the `environment_dict` argument.'.format(
                    preset=preset),
            )

            environment_dict = pipeline_preset.environment_dict

        if pipeline_preset.solid_subset is not None:
            pipeline = pipeline.build_sub_pipeline(
                pipeline_preset.solid_subset)

        check.invariant(
            mode is None or mode == pipeline_preset.mode,
            'Mode {mode} does not agree with the mode set in preset \'{preset}\': '
            '(\'{preset_mode}\')'.format(preset=preset,
                                         preset_mode=pipeline_preset.mode,
                                         mode=mode),
        )

        mode = pipeline_preset.mode

    if run_config.mode is not None or run_config.tags:
        warnings.warn((
            'In 0.8.0, the use of `run_config` to set pipeline mode and tags will be '
            'deprecated. Please use the `mode` and `tags` arguments to `{fn_name}` '
            'instead.').format(fn_name=fn_name))

    if run_config.mode is not None:
        if mode is not None:
            check.invariant(
                run_config.mode == mode,
                'Mode \'{mode}\' does not agree with the mode set in the `run_config`: '
                '\'{run_config_mode}\''.format(
                    mode=mode, run_config_mode=run_config.mode),
            )
        mode = run_config.mode

    if mode is not None:
        if not pipeline_def.has_mode_definition(mode):
            raise DagsterInvariantViolationError((
                'You have attempted to execute pipeline {name} with mode {mode}. '
                'Available modes: {modes}').format(
                    name=pipeline_def.name,
                    mode=mode,
                    modes=pipeline_def.available_modes,
                ))
    else:
        if not pipeline_def.is_single_mode:
            raise DagsterInvariantViolationError((
                'Pipeline {name} has multiple modes (Available modes: {modes}) and you have '
                'attempted to execute it without specifying a mode. Set '
                'mode property on the PipelineRun object.').format(
                    name=pipeline_def.name,
                    modes=pipeline_def.available_modes))
        mode = pipeline_def.get_default_mode_name()

    tags = merge_dicts(merge_dicts(pipeline_def.tags, run_config.tags or {}),
                       tags)

    check.opt_inst_param(instance, 'instance', DagsterInstance)
    instance = instance or DagsterInstance.ephemeral()

    execution_plan = create_execution_plan(
        pipeline,
        environment_dict,
        mode=mode,
        step_keys_to_execute=run_config.step_keys_to_execute,
    )

    return pipeline, environment_dict, instance, mode, tags, run_config, execution_plan
Example #9
def test_basic_execute_plan_with_materialization():
    with get_temp_file_name() as out_csv_path:

        environment_dict = {
            'solids': {
                'sum_solid': {
                    'inputs': {
                        'num': file_relative_path(__file__, '../data/num.csv')
                    },
                    'outputs': [{
                        'result': out_csv_path
                    }],
                }
            }
        }

        instance = DagsterInstance.ephemeral()

        result = execute_dagster_graphql(
            define_test_context(instance=instance),
            EXECUTION_PLAN_QUERY,
            variables={
                'pipeline': {
                    'name': 'csv_hello_world'
                },
                'environmentConfigData': environment_dict,
                'mode': 'default',
            },
        )

        steps_data = result.data['executionPlan']['steps']

        assert [step_data['key'] for step_data in steps_data] == [
            'sum_solid.compute',
            'sum_sq_solid.compute',
        ]

        run_id = make_new_run_id()
        instance.create_empty_run(run_id, 'csv_hello_world')

        result = execute_dagster_graphql(
            define_test_context(instance=instance),
            EXECUTE_PLAN_QUERY,
            variables={
                'executionParams': {
                    'selector': {
                        'name': 'csv_hello_world'
                    },
                    'environmentConfigData': environment_dict,
                    'stepKeys': ['sum_solid.compute', 'sum_sq_solid.compute'],
                    'executionMetadata': {
                        'runId': run_id
                    },
                    'mode': 'default',
                }
            },
        )

        assert result.data

        step_mat_event = None

        for message in result.data['executePlan']['stepEvents']:
            if message['__typename'] == 'StepMaterializationEvent':
                # ensure only one event
                assert step_mat_event is None
                step_mat_event = message

        # ensure a materialization event was emitted
        assert step_mat_event
        assert step_mat_event['materialization']
        assert len(step_mat_event['materialization']['metadataEntries']) == 1
        metadata_entry = step_mat_event['materialization']['metadataEntries'][0]
        assert metadata_entry['path'] == out_csv_path
Example #10
def _check_execute_pipeline_args(pipeline,
                                 run_config,
                                 mode,
                                 preset,
                                 tags,
                                 instance,
                                 solid_selection=None):
    pipeline = _check_pipeline(pipeline)
    pipeline_def = pipeline.get_definition()
    check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition)

    run_config = check.opt_dict_param(run_config, 'run_config')
    check.opt_str_param(mode, 'mode')
    check.opt_str_param(preset, 'preset')
    check.invariant(
        not (mode is not None and preset is not None),
        'You may set only one of `mode` (got {mode}) or `preset` (got {preset}).'
        .format(mode=mode, preset=preset),
    )

    tags = check.opt_dict_param(tags, 'tags', key_type=str)
    check.opt_list_param(solid_selection, 'solid_selection', of_type=str)

    if preset is not None:
        pipeline_preset = pipeline_def.get_preset(preset)

        if pipeline_preset.run_config is not None:
            check.invariant(
                (not run_config) or (pipeline_preset.run_config == run_config),
                'The environment set in preset \'{preset}\' does not agree with the environment '
                'passed in the `run_config` argument.'.format(preset=preset),
            )

            run_config = pipeline_preset.run_config

        # load solid_selection from preset
        if pipeline_preset.solid_selection is not None:
            check.invariant(
                solid_selection is None
                or solid_selection == pipeline_preset.solid_selection,
                'The solid_selection set in preset \'{preset}\', {preset_subset}, does not agree with '
                'the `solid_selection` argument: {solid_selection}'.format(
                    preset=preset,
                    preset_subset=pipeline_preset.solid_selection,
                    solid_selection=solid_selection,
                ),
            )
            solid_selection = pipeline_preset.solid_selection

        check.invariant(
            mode is None or mode == pipeline_preset.mode,
            'Mode {mode} does not agree with the mode set in preset \'{preset}\': '
            '(\'{preset_mode}\')'.format(preset=preset,
                                         preset_mode=pipeline_preset.mode,
                                         mode=mode),
        )

        mode = pipeline_preset.mode

    if mode is not None:
        if not pipeline_def.has_mode_definition(mode):
            raise DagsterInvariantViolationError((
                'You have attempted to execute pipeline {name} with mode {mode}. '
                'Available modes: {modes}').format(
                    name=pipeline_def.name,
                    mode=mode,
                    modes=pipeline_def.available_modes,
                ))
    else:
        if pipeline_def.is_multi_mode:
            raise DagsterInvariantViolationError((
                'Pipeline {name} has multiple modes (Available modes: {modes}) and you have '
                'attempted to execute it without specifying a mode. Set '
                'mode property on the PipelineRun object.').format(
                    name=pipeline_def.name,
                    modes=pipeline_def.available_modes))
        mode = pipeline_def.get_default_mode_name()

    tags = merge_dicts(pipeline_def.tags, tags)

    check.opt_inst_param(instance, 'instance', DagsterInstance)
    instance = instance or DagsterInstance.ephemeral()

    # generate pipeline subset from the given solid_selection
    if solid_selection:
        pipeline = pipeline.subset_for_execution(solid_selection)

    return (
        pipeline,
        run_config,
        instance,
        mode,
        tags,
        pipeline.solids_to_execute,
        solid_selection,
    )
Example #11
def test_all_step_events():  # pylint: disable=too-many-locals
    workspace = workspace_from_load_target(
        PythonFileTarget(__file__, define_test_events_pipeline.__name__)
    )
    pipeline_def = define_test_events_pipeline()
    mode = pipeline_def.get_default_mode_name()
    instance = DagsterInstance.ephemeral()
    execution_plan = create_execution_plan(pipeline_def, mode=mode)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def, execution_plan=execution_plan, mode=mode
    )
    step_levels = execution_plan.topological_step_levels()

    unhandled_events = STEP_EVENTS.copy()

    # Exclude types that are not step events
    ignored_events = {
        'LogMessageEvent',
        'PipelineStartEvent',
        'PipelineSuccessEvent',
        'PipelineInitFailureEvent',
        'PipelineFailureEvent',
    }

    event_counts = defaultdict(int)

    for step_level in step_levels:
        for step in step_level:

            variables = {
                'executionParams': {
                    'selector': {
                        'repositoryLocationName': 'test_events',
                        'repositoryName': '<<unnamed>>',
                        'pipelineName': pipeline_def.name,
                    },
                    'runConfigData': {'storage': {'filesystem': {}}},
                    'mode': mode,
                    'executionMetadata': {'runId': pipeline_run.run_id},
                    'stepKeys': [step.key],
                },
            }
            res = execute_query(workspace, EXECUTE_PLAN_MUTATION, variables, instance=instance,)

            # go through the same dict, decrement all the event records we've seen from the GraphQL
            # response
            if not res.get('errors'):
                assert 'data' in res, res
                assert 'executePlan' in res['data'], res
                assert 'stepEvents' in res['data']['executePlan'], res
                step_events = res['data']['executePlan']['stepEvents']

                events = [
                    dagster_event_from_dict(e, pipeline_def.name)
                    for e in step_events
                    if e['__typename'] not in ignored_events
                ]

                for event in events:
                    if event.step_key:
                        key = event.step_key + '.' + event.event_type_value
                    else:
                        key = event.event_type_value
                    event_counts[key] -= 1
                unhandled_events -= {DagsterEventType(e.event_type_value) for e in events}
            else:
                raise Exception(res['errors'])

    # build up a dict, incrementing all the event records we've produced in the run storage
    logs = instance.all_logs(pipeline_run.run_id)
    for log in logs:
        if not log.dagster_event or (
            DagsterEventType(log.dagster_event.event_type_value)
            not in STEP_EVENTS.union(set([DagsterEventType.ENGINE_EVENT]))
        ):
            continue
        if log.dagster_event.step_key:
            key = log.dagster_event.step_key + '.' + log.dagster_event.event_type_value
        else:
            key = log.dagster_event.event_type_value
        event_counts[key] += 1

    # Ensure we've processed all the events that were generated in the run storage
    assert sum(event_counts.values()) == 0

    # Ensure we've handled the universe of event types
    # Why are these retry events not handled? Because right now there is no way to configure retries
    # on executePlan -- this needs to change, and we should separate the ExecutionParams that get
    # sent to executePlan from those that get sent to startPipelineExecution and friends
    assert unhandled_events == {DagsterEventType.STEP_UP_FOR_RETRY, DagsterEventType.STEP_RESTARTED}
Example #12
def test_execution_plan_reexecution():
    pipeline_def = define_addy_pipeline(using_file_system=True)
    instance = DagsterInstance.ephemeral()
    run_config = {"solids": {"add_one": {"inputs": {"num": {"value": 3}}}}}
    result = execute_pipeline(
        pipeline_def,
        run_config=run_config,
        instance=instance,
    )

    assert result.success

    with open(
        os.path.join(instance.storage_directory(), result.run_id, "add_one", "result"),
        "rb",
    ) as read_obj:
        assert pickle.load(read_obj) == 4

    with open(
        os.path.join(instance.storage_directory(), result.run_id, "add_two", "result"),
        "rb",
    ) as read_obj:
        assert pickle.load(read_obj) == 6

    ## re-execute add_two

    resolved_run_config = ResolvedRunConfig.build(
        pipeline_def,
        run_config=run_config,
    )
    execution_plan = ExecutionPlan.build(
        InMemoryPipeline(pipeline_def),
        resolved_run_config,
    )

    subset_plan = execution_plan.build_subset_plan(["add_two"], pipeline_def, resolved_run_config)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def,
        execution_plan=subset_plan,
        run_config=run_config,
        parent_run_id=result.run_id,
        root_run_id=result.run_id,
    )

    step_events = execute_plan(
        subset_plan,
        InMemoryPipeline(pipeline_def),
        run_config=run_config,
        pipeline_run=pipeline_run,
        instance=instance,
    )
    assert not os.path.exists(
        os.path.join(instance.storage_directory(), pipeline_run.run_id, "add_one", "result")
    )
    with open(
        os.path.join(instance.storage_directory(), pipeline_run.run_id, "add_two", "result"),
        "rb",
    ) as read_obj:
        assert pickle.load(read_obj) == 6

    assert not get_step_output_event(step_events, "add_one")
    assert get_step_output_event(step_events, "add_two")
Example #13
def _check_execute_pipeline_args(pipeline,
                                 run_config,
                                 mode,
                                 preset,
                                 tags,
                                 instance,
                                 solid_selection=None):
    pipeline = _check_pipeline(pipeline)
    pipeline_def = pipeline.get_definition()
    check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)

    run_config = check.opt_dict_param(run_config, "run_config")
    check.opt_str_param(mode, "mode")
    check.opt_str_param(preset, "preset")
    check.invariant(
        not (mode is not None and preset is not None),
        "You may set only one of `mode` (got {mode}) or `preset` (got {preset})."
        .format(mode=mode, preset=preset),
    )

    tags = check.opt_dict_param(tags, "tags", key_type=str)
    check.opt_list_param(solid_selection, "solid_selection", of_type=str)

    if preset is not None:
        pipeline_preset = pipeline_def.get_preset(preset)

        if pipeline_preset.run_config is not None:
            check.invariant(
                (not run_config) or (pipeline_preset.run_config == run_config),
                "The environment set in preset '{preset}' does not agree with the environment "
                "passed in the `run_config` argument.".format(preset=preset),
            )

            run_config = pipeline_preset.run_config

        # load solid_selection from preset
        if pipeline_preset.solid_selection is not None:
            check.invariant(
                solid_selection is None
                or solid_selection == pipeline_preset.solid_selection,
                "The solid_selection set in preset '{preset}', {preset_subset}, does not agree with "
                "the `solid_selection` argument: {solid_selection}".format(
                    preset=preset,
                    preset_subset=pipeline_preset.solid_selection,
                    solid_selection=solid_selection,
                ),
            )
            solid_selection = pipeline_preset.solid_selection

        check.invariant(
            mode is None or mode == pipeline_preset.mode,
            "Mode {mode} does not agree with the mode set in preset '{preset}': "
            "('{preset_mode}')".format(preset=preset,
                                       preset_mode=pipeline_preset.mode,
                                       mode=mode),
        )

        mode = pipeline_preset.mode

        tags = merge_dicts(pipeline_preset.tags, tags)

    if mode is not None:
        if not pipeline_def.has_mode_definition(mode):
            raise DagsterInvariantViolationError((
                "You have attempted to execute pipeline {name} with mode {mode}. "
                "Available modes: {modes}").format(
                    name=pipeline_def.name,
                    mode=mode,
                    modes=pipeline_def.available_modes,
                ))
    else:
        if pipeline_def.is_multi_mode:
            raise DagsterInvariantViolationError((
                "Pipeline {name} has multiple modes (Available modes: {modes}) and you have "
                "attempted to execute it without specifying a mode. Set "
                "mode property on the PipelineRun object.").format(
                    name=pipeline_def.name,
                    modes=pipeline_def.available_modes))
        mode = pipeline_def.get_default_mode_name()

    tags = merge_dicts(pipeline_def.tags, tags)

    check.opt_inst_param(instance, "instance", DagsterInstance)
    instance = instance or DagsterInstance.ephemeral()

    # generate pipeline subset from the given solid_selection
    if solid_selection:
        pipeline = pipeline.subset_for_execution(solid_selection)

    return (
        pipeline,
        run_config,
        instance,
        mode,
        tags,
        pipeline.solids_to_execute,
        solid_selection,
    )
Example #14
def test_reexecution_fs_storage_with_subset():
    @lambda_solid
    def return_one():
        return 1

    @lambda_solid
    def add_one(num):
        return num + 1

    pipeline_def = PipelineDefinition(
        solid_defs=[return_one, add_one],
        dependencies={'add_one': {
            'num': DependencyDefinition('return_one')
        }},
    )
    environment_dict = {'storage': {'filesystem': {}}}
    instance = DagsterInstance.ephemeral()
    pipeline_result = execute_pipeline(pipeline_def,
                                       environment_dict,
                                       instance=instance)
    assert pipeline_result.success
    assert pipeline_result.result_for_solid('add_one').output_value() == 2

    # This is how this is actually done in dagster_graphql.implementation.pipeline_execution_manager
    reexecution_pipeline_run = instance.create_run_for_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        step_keys_to_execute=['return_one.compute'],
        parent_run_id=pipeline_result.run_id,
        root_run_id=pipeline_result.run_id,
    )
    reexecution_result_no_subset = execute_run(
        InMemoryExecutablePipeline(pipeline_def), reexecution_pipeline_run,
        instance)
    assert reexecution_result_no_subset.success
    assert len(reexecution_result_no_subset.solid_result_list) == 2
    assert reexecution_result_no_subset.result_for_solid('add_one').skipped
    assert reexecution_result_no_subset.result_for_solid(
        'return_one').output_value() == 1

    pipeline_result_subset = execute_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        instance=instance,
        solid_selection=['return_one'],
    )
    assert pipeline_result_subset.success
    assert len(pipeline_result_subset.solid_result_list) == 1
    with pytest.raises(DagsterInvariantViolationError):
        pipeline_result_subset.result_for_solid('add_one')
    assert pipeline_result_subset.result_for_solid(
        'return_one').output_value() == 1

    reexecution_pipeline_run = instance.create_run_for_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        parent_run_id=pipeline_result_subset.run_id,
        root_run_id=pipeline_result_subset.run_id,
        solids_to_execute={'return_one'},
        step_keys_to_execute=['return_one.compute'],
    )

    reexecution_result = execute_run(InMemoryExecutablePipeline(pipeline_def),
                                     reexecution_pipeline_run, instance)

    assert reexecution_result.success
    assert len(reexecution_result.solid_result_list) == 1
    with pytest.raises(DagsterInvariantViolationError):
        pipeline_result_subset.result_for_solid('add_one')
    assert reexecution_result.result_for_solid(
        'return_one').output_value() == 1

    with pytest.raises(
            DagsterExecutionStepNotFoundError,
            match=re.escape(
                'Execution plan does not contain step: add_one.compute'),
    ):
        instance.create_run_for_pipeline(
            pipeline_def,
            environment_dict=environment_dict,
            parent_run_id=pipeline_result_subset.run_id,
            root_run_id=pipeline_result_subset.run_id,
            solids_to_execute={'return_one'},
            step_keys_to_execute=['add_one.compute'],
        )

    re_reexecution_pipeline_run = instance.create_run_for_pipeline(
        pipeline_def,
        environment_dict=environment_dict,
        parent_run_id=reexecution_result.run_id,
        root_run_id=reexecution_result.run_id,
        solids_to_execute={'return_one'},
        step_keys_to_execute=['return_one.compute'],
    )

    re_reexecution_result = execute_run(
        InMemoryExecutablePipeline(pipeline_def), re_reexecution_pipeline_run,
        instance)

    assert re_reexecution_result.success
    assert len(re_reexecution_result.solid_result_list) == 1
    assert re_reexecution_result.result_for_solid(
        'return_one').output_value() == 1

    with pytest.raises(
            DagsterExecutionStepNotFoundError,
            match=re.escape('Execution plan does not contain step: add_one'),
    ):
        instance.create_run_for_pipeline(
            pipeline_def,
            environment_dict=environment_dict,
            parent_run_id=reexecution_result.run_id,
            root_run_id=reexecution_result.run_id,
            solids_to_execute={'return_one'},
            step_keys_to_execute=['add_one.compute'],
        )
Example #15
def test_all_step_events():  # pylint: disable=too-many-locals
    handle = ExecutionTargetHandle.for_pipeline_fn(define_test_events_pipeline)
    pipeline = handle.build_pipeline_definition()
    mode = pipeline.get_default_mode_name()
    run_config = RunConfig(mode=mode)
    execution_plan = create_execution_plan(pipeline, {}, run_config=run_config)
    step_levels = execution_plan.topological_step_levels()

    unhandled_events = STEP_EVENTS.copy()

    # Exclude types that are not step events
    ignored_events = {
        'LogMessageEvent',
        'PipelineStartEvent',
        'PipelineSuccessEvent',
        'PipelineInitFailureEvent',
        'PipelineFailureEvent',
    }

    event_counts = defaultdict(int)

    for step_level in step_levels:
        for step in step_level:

            variables = {
                'executionParams': {
                    'selector': {'name': pipeline.name},
                    'environmentConfigData': {'storage': {'filesystem': {}}},
                    'mode': mode,
                    'executionMetadata': {'runId': run_config.run_id},
                    'stepKeys': [step.key],
                }
            }
            instance = DagsterInstance.ephemeral()
            res = execute_query(
                handle, START_PIPELINE_EXECUTION_QUERY, variables, instance=instance
            )

            # go through the same dict, decrement all the event records we've seen from the GraphQL
            # response
            if not res.get('errors'):
                run_logs = res['data']['startPipelineExecution']['run']['logs']['nodes']

                events = [
                    dagster_event_from_dict(e, pipeline.name)
                    for e in run_logs
                    if e['__typename'] not in ignored_events
                ]

                for event in events:
                    if event.step_key:
                        key = event.step_key + '.' + event.event_type_value
                    else:
                        key = event.event_type_value
                    event_counts[key] -= 1
                unhandled_events -= {DagsterEventType(e.event_type_value) for e in events}
            else:
                raise Exception(res['errors'])

            # build up a dict, incrementing all the event records we've produced in the run storage
            logs = instance.all_logs(run_config.run_id)
            for log in logs:
                if not log.dagster_event or (
                    DagsterEventType(log.dagster_event.event_type_value)
                    not in STEP_EVENTS.union(set([DagsterEventType.ENGINE_EVENT]))
                ):
                    continue
                if log.dagster_event.step_key:
                    key = log.dagster_event.step_key + '.' + log.dagster_event.event_type_value
                else:
                    key = log.dagster_event.event_type_value
                event_counts[key] += 1

    # Ensure we've processed all the events that were generated in the run storage
    assert sum(event_counts.values()) == 0

    # Ensure we've handled the universe of event types
    assert not unhandled_events
Example #16
def test_reexecution_fs_storage_with_solid_selection():
    @solid
    def return_one():
        return 1

    @solid
    def add_one(num):
        return num + 1

    pipeline_def = PipelineDefinition(
        solid_defs=[return_one, add_one],
        name="test",
        dependencies={"add_one": {
            "num": DependencyDefinition("return_one")
        }},
    )
    run_config = {"storage": {"filesystem": {}}}
    instance = DagsterInstance.ephemeral()
    # Case 1: re-execute a part of a pipeline when the original pipeline doesn't have solid selection
    pipeline_result = execute_pipeline(pipeline_def,
                                       run_config,
                                       instance=instance)
    assert pipeline_result.success
    assert pipeline_result.result_for_solid("add_one").output_value() == 2

    # This is how this is actually done in dagster_graphql.implementation.pipeline_execution_manager
    reexecution_result_no_solid_selection = reexecute_pipeline(
        pipeline_def,
        parent_run_id=pipeline_result.run_id,
        run_config=run_config,
        step_selection=["return_one"],
        instance=instance,
    )
    assert reexecution_result_no_solid_selection.success
    assert len(reexecution_result_no_solid_selection.solid_result_list) == 2
    assert reexecution_result_no_solid_selection.result_for_solid(
        "add_one").skipped
    assert reexecution_result_no_solid_selection.result_for_solid(
        "return_one").output_value() == 1

    # Case 2: re-execute a pipeline when the original pipeline has solid selection
    pipeline_result_solid_selection = execute_pipeline(
        pipeline_def,
        run_config=run_config,
        instance=instance,
        solid_selection=["return_one"],
    )
    assert pipeline_result_solid_selection.success
    assert len(pipeline_result_solid_selection.solid_result_list) == 1
    with pytest.raises(DagsterInvariantViolationError):
        pipeline_result_solid_selection.result_for_solid("add_one")
    assert pipeline_result_solid_selection.result_for_solid(
        "return_one").output_value() == 1

    reexecution_result_solid_selection = reexecute_pipeline(
        pipeline_def,
        parent_run_id=pipeline_result_solid_selection.run_id,
        run_config=run_config,
        instance=instance,
    )

    assert reexecution_result_solid_selection.success
    assert len(reexecution_result_solid_selection.solid_result_list) == 1
    with pytest.raises(DagsterInvariantViolationError):
        pipeline_result_solid_selection.result_for_solid("add_one")
    assert reexecution_result_solid_selection.result_for_solid(
        "return_one").output_value() == 1

    # Case 3: re-execute a pipeline partially when the original pipeline has solid selection and
    #   re-execute a step which hasn't been included in the original pipeline
    with pytest.raises(
            DagsterExecutionStepNotFoundError,
            match="Step selection refers to unknown step: add_one",
    ):
        reexecute_pipeline(
            pipeline_def,
            parent_run_id=pipeline_result_solid_selection.run_id,
            run_config=run_config,
            step_selection=["add_one"],
            instance=instance,
        )

    # Case 4: re-execute a pipeline partially when the original pipeline has solid selection and
    #   re-execute a step which has been included in the original pipeline
    re_reexecution_result = reexecute_pipeline(
        pipeline_def,
        parent_run_id=reexecution_result_solid_selection.run_id,
        run_config=run_config,
        instance=instance,
        step_selection=["return_one"],
    )

    assert re_reexecution_result.success
    assert len(re_reexecution_result.solid_result_list) == 1
    assert re_reexecution_result.result_for_solid(
        "return_one").output_value() == 1
Example #17
def test_pipeline_step_key_subset_execution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs(
        {"solids": {
            "add_one": {
                "inputs": {
                    "num": {
                        "value": 3
                    }
                }
            }
        }})
    result = execute_pipeline(pipeline_def,
                              run_config=run_config,
                              instance=instance)

    assert result.success

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one")).obj == 4
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two")).obj == 6

    ## re-execute add_two

    pipeline_reexecution_result = reexecute_pipeline(
        pipeline_def,
        parent_run_id=result.run_id,
        run_config=run_config,
        step_selection=["add_two"],
        instance=instance,
    )

    assert pipeline_reexecution_result.success

    step_events = pipeline_reexecution_result.step_event_list
    assert step_events

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one")).obj == 4
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two")).obj == 6

    assert not get_step_output_event(step_events, "add_one")
    assert get_step_output_event(step_events, "add_two")

    with pytest.raises(
            DagsterExecutionStepNotFoundError,
            match="Can not build subset plan from unknown step: nope",
    ):
        reexecute_pipeline(
            pipeline_def,
            parent_run_id=result.run_id,
            run_config=run_config,
            step_selection=["nope"],
            instance=instance,
        )
Example #18
def define_context(repo_fn, instance=None):
    return DagsterGraphQLInProcessRepositoryContext(
        handle=ExecutionTargetHandle.for_repo_fn(repo_fn),
        instance=instance or DagsterInstance.ephemeral(),
        execution_manager=SynchronousExecutionManager(),
    )
Example #19
def test_create_app():
    handle = ExecutionTargetHandle.for_repo_yaml(file_relative_path(__file__, './repository.yaml'))
    assert create_app(handle, DagsterInstance.ephemeral())
Example #20
def define_subprocess_context_for_file(python_file, fn_name, instance=None):
    return DagsterGraphQLInProcessRepositoryContext(
        handle=ExecutionTargetHandle.for_repo_python_file(python_file, fn_name),
        instance=instance or DagsterInstance.ephemeral(),
        execution_manager=SubprocessExecutionManager(instance),
    )
Example #21
def execute_pipeline(pipeline,
                     environment_dict=None,
                     run_config=None,
                     instance=None):
    '''
    "Synchronous" version of :py:func:`execute_pipeline_iterator`.

    This is the entry point for dagster CLI and dagit execution. For the dagster-graphql entry
    point, see execute_plan() below.

    Parameters:
        pipeline (PipelineDefinition): Pipeline to run
        environment_dict (dict):
            The environment configuration that parameterizes this run
        run_config (RunConfig):
            Configuration for how this pipeline will be executed
        instance (DagsterInstance):
            The instance to execute against, defaults to ephemeral (no artifacts persisted)

    Returns:
      :py:class:`PipelineExecutionResult`
    '''

    check.inst_param(pipeline, 'pipeline', PipelineDefinition)
    environment_dict = check.opt_dict_param(environment_dict,
                                            'environment_dict')
    run_config = check_run_config_param(run_config, pipeline)

    check.opt_inst_param(instance, 'instance', DagsterInstance)
    instance = instance or DagsterInstance.ephemeral()

    execution_plan = create_execution_plan(pipeline, environment_dict,
                                           run_config)

    # run should be used and threaded through here
    # https://github.com/dagster-io/dagster/issues/1745
    _run = _create_run(instance, pipeline, run_config, environment_dict)

    with scoped_pipeline_context(pipeline, environment_dict, run_config,
                                 instance) as pipeline_context:
        event_list = list(
            _pipeline_execution_iterator(
                pipeline_context,
                execution_plan=execution_plan,
                run_config=run_config,
                step_keys_to_execute=run_config.step_keys_to_execute,
            ))

        return PipelineExecutionResult(
            pipeline,
            run_config.run_id,
            event_list,
            lambda: scoped_pipeline_context(
                pipeline,
                environment_dict,
                run_config,
                instance,
                system_storage_data=SystemStorageData(
                    intermediates_manager=pipeline_context.intermediates_manager,
                    file_manager=pipeline_context.file_manager,
                ),
            ),
        )
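Following the docstring above, here is a minimal, hedged usage sketch of this legacy execute_pipeline signature (pipeline borrowed from Example #1; the environment_dict/RunConfig API shown here was later replaced by run_config):

# Hedged usage sketch for the legacy execute_pipeline(pipeline, environment_dict, ...) API.
from dagster import (
    DependencyDefinition,
    PipelineDefinition,
    execute_pipeline,
    lambda_solid,
)


@lambda_solid
def return_one():
    return 1


@lambda_solid
def add_one(num):
    return num + 1


pipeline_def = PipelineDefinition(
    solid_defs=[return_one, add_one],
    dependencies={'add_one': {'num': DependencyDefinition('return_one')}},
)

# instance defaults to DagsterInstance.ephemeral(), so no artifacts are persisted.
result = execute_pipeline(pipeline_def, environment_dict={'storage': {'filesystem': {}}})
assert result.success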
Example #22
def command(**kwargs):
    capture_result['external_repo'] = get_external_repository_from_kwargs(
        kwargs, DagsterInstance.ephemeral())
Example #23
    def get_context(self,
                    solid_config=None,
                    mode_def=None,
                    environment_dict=None):
        '''Get a dagstermill execution context for interactive exploration and development.

        Args:
            solid_config (Optional[Any]): If specified, this value will be made available on the
                context as its ``solid_config`` property.
            mode_def (Optional[:class:`dagster.ModeDefinition`]): If specified, defines the mode to
                use to construct the context. Specify this if you would like a context constructed
                with specific ``resource_defs`` or ``logger_defs``. By default, an ephemeral mode
                with a console logger will be constructed.
            environment_dict(Optional[dict]): The environment config dict with which to construct
                the context.

        Returns:
            :py:class:`~dagstermill.DagstermillExecutionContext`
        '''
        check.opt_inst_param(mode_def, 'mode_def', ModeDefinition)
        environment_dict = check.opt_dict_param(environment_dict,
                                                'environment_dict',
                                                key_type=str)

        # If we are running non-interactively, and there is already a context reconstituted, return
        # that context rather than overwriting it.
        if self.context is not None and isinstance(
                self.context, DagstermillRuntimeExecutionContext):
            return self.context

        if not mode_def:
            mode_def = ModeDefinition(
                logger_defs={'dagstermill': colored_console_logger})
            environment_dict['loggers'] = {'dagstermill': {}}

        solid_def = SolidDefinition(
            name='this_solid',
            input_defs=[],
            compute_fn=lambda *args, **kwargs: None,
            output_defs=[],
            description='Ephemeral solid constructed by dagstermill.get_context()',
            required_resource_keys=mode_def.resource_key_set,
        )

        pipeline_def = PipelineDefinition(
            [solid_def],
            mode_defs=[mode_def],
            name='ephemeral_dagstermill_pipeline')

        run_id = make_new_run_id()

        # construct stubbed PipelineRun for notebook exploration...
        # The actual pipeline run during pipeline execution will be serialized and reconstituted
        # in the `reconstitute_pipeline_context` call
        pipeline_run = PipelineRun(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            environment_dict=environment_dict,
            mode=mode_def.name,
            step_keys_to_execute=None,
            status=PipelineRunStatus.NOT_STARTED,
            tags=None,
        )

        self.in_pipeline = False
        self.solid_def = solid_def
        self.pipeline = pipeline_def

        execution_plan = create_execution_plan(self.pipeline,
                                               environment_dict,
                                               mode=mode_def.name)
        with scoped_pipeline_context(
                execution_plan,
                environment_dict,
                pipeline_run,
                DagsterInstance.ephemeral(),
                scoped_resources_builder_cm=self._setup_resources,
        ) as pipeline_context:

            self.context = DagstermillExecutionContext(
                pipeline_context=pipeline_context,
                solid_config=solid_config,
                resource_keys_to_init=get_required_resource_keys_to_init(
                    execution_plan, pipeline_context.system_storage_def),
            )

        return self.context
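Per the docstring above, get_context is intended for interactive notebook exploration; a hedged sketch of a typical call follows (it assumes the dagstermill package exposes get_context at module level, and the solid_config value is illustrative):

# Hedged sketch only: the config value below is made up for illustration.
import dagstermill

context = dagstermill.get_context(solid_config={'threshold': 0.5})
# The value passed as solid_config is surfaced on the context, per the docstring.
assert context.solid_config == {'threshold': 0.5}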
Example #24
def test_multiprocessing_execution_for_composite_solid_with_config_mapping():
    environment_dict = {
        'solids': {
            'composite_with_nested_config_solid_and_config_mapping': {
                'config': {
                    'foo': 'baz',
                    'bar': 3
                }
            }
        }
    }

    run_id = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_python_file(
        __file__, 'composite_pipeline_with_config_mapping')

    instance = DagsterInstance.ephemeral()
    pipeline_run = instance.create_run(
        PipelineRun(
            pipeline_name=composite_pipeline_with_config_mapping.name,
            run_id=run_id,
            selector=ExecutionSelector('nonce'),
            environment_dict=environment_dict,
            mode='default',
            reexecution_config=None,
            step_keys_to_execute=None,
            status=PipelineRunStatus.NOT_STARTED,
        ))
    execution_manager = MultiprocessingExecutionManager()
    execution_manager.execute_pipeline(handle,
                                       composite_pipeline_with_config_mapping,
                                       pipeline_run,
                                       instance,
                                       raise_on_error=False)
    execution_manager.join()
    assert instance.get_run(run_id).status == PipelineRunStatus.SUCCESS

    environment_dict = {
        'solids': {
            'composite_with_nested_config_solid_and_config_mapping': {
                'config': {
                    'foo': 'baz',
                    'bar': 3
                }
            }
        },
        'execution': {
            'multiprocess': {}
        },
        'storage': {
            'filesystem': {}
        },
    }

    run_id = make_new_run_id()

    pipeline_run = instance.create_run(
        PipelineRun(
            pipeline_name=composite_pipeline.name,
            run_id=run_id,
            selector=ExecutionSelector('nonce'),
            environment_dict=environment_dict,
            mode='default',
            reexecution_config=None,
            step_keys_to_execute=None,
            status=PipelineRunStatus.NOT_STARTED,
        ))
    execution_manager = MultiprocessingExecutionManager()
    execution_manager.execute_pipeline(handle,
                                       composite_pipeline,
                                       pipeline_run,
                                       instance,
                                       raise_on_error=False)

    execution_manager.join()
    assert instance.get_run(run_id).status == PipelineRunStatus.SUCCESS
Example #25
def define_context_for_repository_yaml(path, instance=None):
    return DagsterGraphQLContext(
        handle=ExecutionTargetHandle.for_repo_yaml(path),
        instance=instance or DagsterInstance.ephemeral(),
        execution_manager=SynchronousExecutionManager(),
    )
Example #26
def test_execution_plan_reexecution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs(
        {"solids": {
            "add_one": {
                "inputs": {
                    "num": {
                        "value": 3
                    }
                }
            }
        }})
    result = execute_pipeline(
        pipeline_def,
        run_config=run_config,
        instance=instance,
    )

    assert result.success

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one")).obj == 4
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two")).obj == 6

    ## re-execute add_two

    environment_config = EnvironmentConfig.build(
        pipeline_def,
        run_config=run_config,
    )
    execution_plan = ExecutionPlan.build(
        InMemoryPipeline(pipeline_def),
        environment_config,
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline_def,
        execution_plan=execution_plan,
        run_config=run_config,
        parent_run_id=result.run_id,
        root_run_id=result.run_id,
        step_keys_to_execute=["add_two"],
    )

    step_events = execute_plan(
        execution_plan.build_subset_plan(["add_two"], pipeline_def,
                                         environment_config),
        InMemoryPipeline(pipeline_def),
        run_config=run_config,
        pipeline_run=pipeline_run,
        instance=instance,
    )

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one")).obj == 4
    assert intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two")).obj == 6

    # Only the selected step produced an output event; add_one was not re-executed.
    assert not get_step_output_event(step_events, "add_one")
    assert get_step_output_event(step_events, "add_two")
Example #27
0
def test_pipeline_step_key_subset_execution():
    pipeline_def = define_addy_pipeline()
    instance = DagsterInstance.ephemeral()
    run_config = env_with_fs(
        {"solids": {
            "add_one": {
                "inputs": {
                    "num": {
                        "value": 3
                    }
                }
            }
        }})
    result = execute_pipeline(pipeline_def,
                              run_config=run_config,
                              instance=instance)

    assert result.success

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one.compute")).obj == 4)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two.compute")).obj == 6)

    ## re-execute add_two

    pipeline_reexecution_result = reexecute_pipeline(
        pipeline_def,
        parent_run_id=result.run_id,
        run_config=run_config,
        step_selection=["add_two.compute"],
        instance=instance,
    )

    assert pipeline_reexecution_result.success

    step_events = pipeline_reexecution_result.step_event_list
    assert step_events

    intermediate_storage = build_fs_intermediate_storage(
        instance.intermediates_directory, result.run_id)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_one.compute")).obj == 4)
    assert (intermediate_storage.get_intermediate(
        None, Int, StepOutputHandle("add_two.compute")).obj == 6)

    assert not get_step_output_event(step_events, "add_one.compute")
    assert get_step_output_event(step_events, "add_two.compute")

    with pytest.raises(
            DagsterInvalidSubsetError,
            match="No qualified steps to execute found for step_selection"):
        reexecute_pipeline(
            pipeline_def,
            parent_run_id=result.run_id,
            run_config=run_config,
            step_selection=["nope.compute"],
            instance=instance,
        )
Example #28
0
def execute_pipeline(pipeline,
                     environment_dict=None,
                     run_config=None,
                     instance=None,
                     raise_on_error=True):
    '''Execute a pipeline synchronously.

    Users will typically call this API when testing pipeline execution, or running standalone
    scripts.

    Parameters:
        pipeline (PipelineDefinition): The pipeline to execute.
        environment_dict (Optional[dict]): The environment configuration that parameterizes this run,
            as a dict.
        run_config (Optional[RunConfig]): Optionally specifies additional config options for
            pipeline execution.
        instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,
            an ephemeral instance will be used, and no artifacts will be persisted from the run.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``True``, since this is the most useful behavior in tests.

    Returns:
      :py:class:`PipelineExecutionResult`: The result of pipeline execution.

    For the asynchronous version, see :py:func:`execute_pipeline_iterator`.

    This is the entrypoint for dagster CLI execution. For the dagster-graphql entrypoint, see
    ``dagster.core.execution.api.execute_plan()``.
    '''

    check.inst_param(pipeline, 'pipeline', PipelineDefinition)
    environment_dict = check.opt_dict_param(environment_dict,
                                            'environment_dict')
    run_config = check_run_config_param(run_config, pipeline)

    check.opt_inst_param(instance, 'instance', DagsterInstance)
    instance = instance or DagsterInstance.ephemeral()

    execution_plan = create_execution_plan(pipeline, environment_dict,
                                           run_config)

    pipeline_run = _create_run(instance, pipeline, run_config,
                               environment_dict)

    with scoped_pipeline_context(
            pipeline,
            environment_dict,
            pipeline_run,
            instance,
            raise_on_error=raise_on_error) as pipeline_context:
        event_list = list(
            _pipeline_execution_iterator(pipeline_context, execution_plan,
                                         pipeline_run))

        return PipelineExecutionResult(
            pipeline,
            run_config.run_id,
            event_list,
            lambda: scoped_pipeline_context(
                pipeline,
                environment_dict,
                pipeline_run,
                instance,
                system_storage_data=SystemStorageData(
                    intermediates_manager=pipeline_context.
                    intermediates_manager,
                    file_manager=pipeline_context.file_manager,
                ),
            ),
        )
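# A minimal usage sketch for the execute_pipeline API documented above, using the
# same legacy environment_dict-style config. The solids and pipeline below are
# illustrative assumptions and are not part of the original example.
from dagster import (DagsterInstance, DependencyDefinition, PipelineDefinition,
                     lambda_solid)


@lambda_solid
def return_one():
    return 1


@lambda_solid
def add_one(num):
    return num + 1


pipeline_def = PipelineDefinition(
    solid_defs=[return_one, add_one],
    dependencies={'add_one': {
        'num': DependencyDefinition('return_one')
    }},
)

# Executes synchronously against an ephemeral instance; nothing is persisted
# once the process exits.
result = execute_pipeline(
    pipeline_def,
    environment_dict={'storage': {'filesystem': {}}},
    instance=DagsterInstance.ephemeral(),
)
assert result.success
assert result.result_for_solid('add_one').output_value() == 2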
Example #29
0
def test_using_s3_for_subplan(s3_bucket):
    pipeline_def = define_inty_pipeline()

    environment_dict = {
        'storage': {
            's3': {
                'config': {
                    's3_bucket': s3_bucket
                }
            }
        }
    }

    run_id = make_new_run_id()

    execution_plan = create_execution_plan(pipeline_def,
                                           environment_dict=environment_dict,
                                           run_config=RunConfig(run_id=run_id))

    assert execution_plan.get_step_by_key('return_one.compute')

    step_keys = ['return_one.compute']
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun.create_empty_run(
        pipeline_def.name, run_id=run_id, environment_dict=environment_dict)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            environment_dict=environment_dict,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(return_one_step_events, 'return_one.compute')
    with scoped_pipeline_context(
            pipeline_def,
            environment_dict,
            pipeline_run,
            instance,
            execution_plan.build_subset_plan(['return_one.compute']),
    ) as context:

        store = S3IntermediateStore(
            s3_bucket,
            run_id,
            s3_session=context.scoped_resources_builder.build(
                required_resource_keys={'s3'}, ).s3,
        )
        assert store.has_intermediate(context, 'return_one.compute')
        assert store.get_intermediate(context, 'return_one.compute',
                                      Int).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['add_one.compute']),
            environment_dict=environment_dict,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(add_one_step_events, 'add_one.compute')
    with scoped_pipeline_context(
            pipeline_def,
            environment_dict,
            pipeline_run,
            instance,
            execution_plan.build_subset_plan(['add_one.compute']),
    ) as context:
        assert store.has_intermediate(context, 'add_one.compute')
        assert store.get_intermediate(context, 'add_one.compute', Int).obj == 2
Example #30
0
def test_using_gcs_for_subplan(gcs_bucket):
    pipeline_def = define_inty_pipeline()

    run_config = {"intermediate_storage": {"gcs": {"config": {"gcs_bucket": gcs_bucket}}}}

    run_id = make_new_run_id()

    resolved_run_config = ResolvedRunConfig.build(pipeline_def, run_config=run_config)
    execution_plan = ExecutionPlan.build(InMemoryPipeline(pipeline_def), resolved_run_config)

    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config
    )

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys, pipeline_def, resolved_run_config),
            pipeline=InMemoryPipeline(pipeline_def),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, "return_one")
    with scoped_pipeline_context(
        execution_plan.build_subset_plan(["return_one"], pipeline_def, resolved_run_config),
        InMemoryPipeline(pipeline_def),
        run_config,
        pipeline_run,
        instance,
    ) as context:
        intermediate_storage = GCSIntermediateStorage(
            gcs_bucket,
            run_id,
            client=context.scoped_resources_builder.build(
                required_resource_keys={"gcs"},
            ).gcs,
        )
        assert intermediate_storage.has_intermediate(context, StepOutputHandle("return_one"))
        assert (
            intermediate_storage.get_intermediate(context, Int, StepOutputHandle("return_one")).obj
            == 1
        )

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"], pipeline_def, resolved_run_config),
            pipeline=InMemoryPipeline(pipeline_def),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(add_one_step_events, "add_one")
    with scoped_pipeline_context(
        execution_plan.build_subset_plan(["return_one"], pipeline_def, resolved_run_config),
        InMemoryPipeline(pipeline_def),
        run_config,
        pipeline_run,
        instance,
    ) as context:
        # Reuses the intermediate_storage object constructed in the first context block above.
        assert intermediate_storage.has_intermediate(context, StepOutputHandle("add_one"))
        assert (
            intermediate_storage.get_intermediate(context, Int, StepOutputHandle("add_one")).obj
            == 2
        )