@solid
def nonce_solid(_):
    return


@pipeline
def nonce_pipeline():
    return nonce_solid()


nonce_pipeline_snapshot = nonce_pipeline.get_pipeline_snapshot()

nonce_execution_plan_snapshot = snapshot_from_execution_plan(
    create_execution_plan(nonce_pipeline), nonce_pipeline.get_pipeline_snapshot_id()
)


def test_init_modified_docker_operator(dagster_docker_image):
    dagster_operator_parameters = DagsterOperatorParameters(
        task_id='nonce',
        run_config={'storage': {'filesystem': {}}},
        pipeline_name='',
        mode='default',
        op_kwargs={'image': dagster_docker_image, 'api_version': 'auto'},
        pipeline_snapshot=nonce_pipeline_snapshot,
        execution_plan_snapshot=nonce_execution_plan_snapshot,
    )
    DagsterDockerOperator(dagster_operator_parameters)
Example #2
def test_gcs_pickle_io_manager_execution(gcs_bucket):
    pipeline_def = define_inty_pipeline()

    run_config = {
        "resources": {
            "io_manager": {
                "config": {
                    "gcs_bucket": gcs_bucket,
                }
            }
        }
    }

    run_id = make_new_run_id()

    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)

    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(pipeline_name=pipeline_def.name,
                               run_id=run_id,
                               run_config=run_config)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(return_one_step_events, "return_one")

    io_manager = PickledObjectGCSIOManager(gcs_bucket, storage.Client())
    step_output_handle = StepOutputHandle("return_one")
    context = InputContext(
        pipeline_name=pipeline_def.name,
        solid_def=pipeline_def.solid_def_named("return_one"),
        upstream_output=OutputContext(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            solid_def=pipeline_def.solid_def_named("return_one"),
        ),
        log_manager=DagsterLogManager(run_id=pipeline_run.run_id,
                                      logging_tags={},
                                      loggers=[]),
    )
    assert io_manager.load_input(context) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"]),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    step_output_handle = StepOutputHandle("add_one")
    context = InputContext(
        pipeline_name=pipeline_def.name,
        solid_def=pipeline_def.solid_def_named("add_one"),
        upstream_output=OutputContext(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            solid_def=pipeline_def.solid_def_named("add_one"),
        ),
        log_manager=DagsterLogManager(run_id=pipeline_run.run_id,
                                      logging_tags={},
                                      loggers=[]),
    )

    assert get_step_output(add_one_step_events, "add_one")
    assert io_manager.load_input(context) == 2
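
Several of the tests in this listing call a define_inty_pipeline helper that is not reproduced on this page. A minimal sketch consistent with the step keys asserted above (return_one, add_one) might look like the following; the real fixture lives in Dagster's test utilities and, for the IO-manager variants, presumably also attaches the relevant resource definitions to a mode:

from dagster import Int, OutputDefinition, pipeline, solid


@solid(output_defs=[OutputDefinition(Int)])
def return_one(_):
    return 1


@solid(output_defs=[OutputDefinition(Int)])
def add_one(_, num):
    return num + 1


def define_inty_pipeline():
    @pipeline
    def inty_pipeline():
        add_one(return_one())

    return inty_pipeline
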
Example #3
def _make_airflow_dag(
    recon_repo,
    pipeline_name,
    run_config=None,
    mode=None,
    instance=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    operator=DagsterPythonOperator,
):
    check.inst_param(recon_repo, "recon_repo", ReconstructableRepository)
    check.str_param(pipeline_name, "pipeline_name")
    run_config = check.opt_dict_param(run_config, "run_config", key_type=str)
    mode = check.opt_str_param(mode, "mode")
    # Default to use the (persistent) system temp directory rather than a TemporaryDirectory,
    # which would not be consistent between Airflow task invocations.

    if instance is None:
        if is_dagster_home_set():
            instance = DagsterInstance.get()
        else:
            instance = DagsterInstance.local_temp(
                tempdir=seven.get_system_temp_directory())

    check.inst_param(instance, "instance", DagsterInstance)

    # Only used for Airflow; internally we continue to use pipeline.name
    dag_id = check.opt_str_param(dag_id, "dag_id",
                                 _rename_for_airflow(pipeline_name))

    dag_description = check.opt_str_param(dag_description, "dag_description",
                                          _make_dag_description(pipeline_name))
    check.subclass_param(operator, "operator", BaseOperator)

    dag_kwargs = dict(
        {"default_args": DEFAULT_ARGS},
        **check.opt_dict_param(dag_kwargs, "dag_kwargs", key_type=str),
    )

    op_kwargs = check.opt_dict_param(op_kwargs, "op_kwargs", key_type=str)

    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)
    pipeline = recon_repo.get_definition().get_pipeline(pipeline_name)

    if mode is None:
        mode = pipeline.get_default_mode_name()

    execution_plan = create_execution_plan(pipeline, run_config, mode=mode)

    tasks = {}

    coalesced_plan = coalesce_execution_steps(execution_plan)

    for solid_handle, solid_steps in coalesced_plan.items():
        step_keys = [step.key for step in solid_steps]

        operator_parameters = DagsterOperatorParameters(
            recon_repo=recon_repo,
            pipeline_name=pipeline_name,
            run_config=run_config,
            mode=mode,
            task_id=solid_handle,
            step_keys=step_keys,
            dag=dag,
            instance_ref=instance.get_ref(),
            op_kwargs=op_kwargs,
            pipeline_snapshot=pipeline.get_pipeline_snapshot(),
            execution_plan_snapshot=snapshot_from_execution_plan(
                execution_plan,
                pipeline_snapshot_id=pipeline.get_pipeline_snapshot_id()),
        )
        task = operator(operator_parameters)

        tasks[solid_handle] = task

        for solid_step in solid_steps:
            for step_input in solid_step.step_inputs:
                for key in step_input.dependency_keys:
                    prev_solid_handle = execution_plan.get_step_by_key(
                        key).solid_handle.to_string()
                    if solid_handle != prev_solid_handle:
                        tasks[prev_solid_handle].set_downstream(task)

    return (dag,
            [tasks[solid_handle] for solid_handle in coalesced_plan.keys()])
Example #4
def test_all_step_events():  # pylint: disable=too-many-locals
    handle = ExecutionTargetHandle.for_pipeline_fn(define_test_events_pipeline)
    pipeline = handle.build_pipeline_definition()
    mode = pipeline.get_default_mode_name()
    execution_plan = create_execution_plan(pipeline, {}, mode=mode)
    step_levels = execution_plan.topological_step_levels()
    run_config = RunConfig(
        executor_config=InProcessExecutorConfig(raise_on_error=False),
        storage_mode=RunStorageMode.FILESYSTEM,
    )

    unhandled_events = STEP_EVENTS.copy()

    # Exclude types that are not step events
    ignored_events = {
        'LogMessageEvent',
        'PipelineStartEvent',
        'PipelineSuccessEvent',
        'PipelineInitFailureEvent',
        'PipelineFailureEvent',
    }

    step_event_fragment = get_step_event_fragment()
    log_message_event_fragment = get_log_message_event_fragment()
    query = '\n'.join(
        (
            PIPELINE_EXECUTION_QUERY_TEMPLATE.format(
                step_event_fragment=step_event_fragment.include_key,
                log_message_event_fragment=log_message_event_fragment.include_key,
            ),
            step_event_fragment.fragment,
            log_message_event_fragment.fragment,
        )
    )

    event_counts = defaultdict(int)

    for step_level in step_levels:
        for step in step_level:

            variables = {
                'executionParams': {
                    'selector': {'name': pipeline.name},
                    'environmentConfigData': {'storage': {'filesystem': {}}},
                    'mode': mode,
                    'executionMetadata': {'runId': run_config.run_id},
                    'stepKeys': [step.key],
                }
            }

            pipeline_run_storage = PipelineRunStorage()

            res = execute_query(handle, query, variables, pipeline_run_storage=pipeline_run_storage)

            # go through the same dict, decrement all the event records we've seen from the GraphQL
            # response
            if not res.get('errors'):
                run_logs = res['data']['startPipelineExecution']['run']['logs']['nodes']

                events = [
                    dagster_event_from_dict(e, pipeline.name)
                    for e in run_logs
                    if e['__typename'] not in ignored_events
                ]

                for event in events:
                    key = event.step_key + '.' + event.event_type_value
                    event_counts[key] -= 1
                unhandled_events -= {DagsterEventType(e.event_type_value) for e in events}

            # build up a dict, incrementing all the event records we've produced in the run storage
            logs = pipeline_run_storage.get_run_by_id(run_config.run_id).all_logs()
            for log in logs:
                if not log.dagster_event or (
                    DagsterEventType(log.dagster_event.event_type_value) not in STEP_EVENTS
                ):
                    continue
                key = log.dagster_event.step_key + '.' + log.dagster_event.event_type_value
                event_counts[key] += 1

    # Ensure we've processed all the events that were generated in the run storage
    assert sum(event_counts.values()) == 0

    # Ensure we've handled the universe of event types
    assert not unhandled_events
Example #5
def default_mode_output_versions(pipeline_def):
    return resolve_step_output_versions(
        create_execution_plan(pipeline_def),
        EnvironmentConfig.build(pipeline_def, {}, "default"),
        pipeline_def.get_mode_definition("default"),
    )
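
Step output versions are the basis of Dagster's memoized re-execution: each output's version is derived from the solid's declared version and the versions of its inputs, so steps whose versions match a previous run can be skipped. A minimal sketch of a pipeline this helper could be applied to, assuming solids that declare a version:

from dagster import pipeline, solid


@solid(version="v1")
def versioned_one(_):
    return 1


@pipeline
def versioned_pipeline():
    versioned_one()


# maps each step output handle to a version string (unversioned steps resolve to None)
versions = default_mode_output_versions(versioned_pipeline)
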
Example #6
    def reconstitute_pipeline_context(
        self,
        output_log_path=None,
        marshal_dir=None,
        environment_dict=None,
        handle_kwargs=None,
        pipeline_run_dict=None,
        solid_subset=None,
        solid_handle_kwargs=None,
        instance_ref_dict=None,
    ):
        '''Reconstitutes a context for dagstermill-managed execution.

        You'll see this function called to reconstruct a pipeline context within the ``injected
        parameters`` cell of a dagstermill output notebook. Users should not call this function
        interactively except when debugging output notebooks.

        Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a
        context for interactive exploration and development. This call will be replaced by one to
        :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by
        dagstermill.
        '''
        check.opt_str_param(output_log_path, 'output_log_path')
        check.opt_str_param(marshal_dir, 'marshal_dir')
        environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
        check.dict_param(pipeline_run_dict, 'pipeline_run_dict')
        check.dict_param(handle_kwargs, 'handle_kwargs')
        check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
        check.dict_param(solid_handle_kwargs, 'solid_handle_kwargs')
        check.dict_param(instance_ref_dict, 'instance_ref_dict')

        try:
            handle = load_handle.handle_for_pipeline_cli_args(
                handle_kwargs, use_default_repository_yaml=False
            )
        except (check.CheckError, load_handle.UsageError) as err:
            six.raise_from(
                DagstermillError(
                    'Cannot invoke a dagstermill solid from an in-memory pipeline that was not loaded '
                    'from an ExecutionTargetHandle. Run this pipeline using dagit, the dagster CLI, '
                    'through dagster-graphql, or in-memory after loading it through an '
                    'ExecutionTargetHandle.'
                ),
                err,
            )

        try:
            instance_ref = unpack_value(instance_ref_dict)
            instance = DagsterInstance.from_ref(instance_ref)
        except Exception as err:  # pylint: disable=broad-except
            six.raise_from(
                DagstermillError(
                    'Error when attempting to resolve DagsterInstance from serialized InstanceRef'
                ),
                err,
            )

        pipeline_def = check.inst_param(
            handle.build_pipeline_definition(),
            'pipeline_def (from handle {handle_dict})'.format(handle_dict=handle.data._asdict()),
            PipelineDefinition,
        ).build_sub_pipeline(solid_subset)

        solid_handle = SolidHandle.from_dict(solid_handle_kwargs)
        solid_def = pipeline_def.get_solid(solid_handle).definition

        pipeline_run = unpack_value(pipeline_run_dict)

        self.marshal_dir = marshal_dir
        self.in_pipeline = True
        self.solid_def = solid_def
        self.pipeline_def = pipeline_def

        execution_plan = create_execution_plan(self.pipeline_def, environment_dict, pipeline_run)

        with scoped_pipeline_context(
            self.pipeline_def,
            environment_dict,
            pipeline_run,
            instance,
            execution_plan,
            scoped_resources_builder_cm=self._setup_resources,
        ) as pipeline_context:
            self.context = DagstermillRuntimeExecutionContext(
                pipeline_context=pipeline_context,
                solid_config=None,
                resource_keys_to_init=get_required_resource_keys_to_init(
                    execution_plan, pipeline_context.system_storage_def
                ),
            )

        return self.context
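
As the docstring explains, this method is called from the injected-parameters cell that dagstermill writes into output notebooks, while interactive work goes through dagstermill.get_context. A hedged sketch of an interactive parameters cell (the config value is a placeholder, and the log property is assumed to behave as on the runtime context):

import dagstermill

# define a context for interactive exploration; dagstermill replaces this call
# with reconstitute_pipeline_context when the notebook runs as a solid
context = dagstermill.get_context(solid_config={'factor': 2})
context.log.info('solid_config is available on the context: {}'.format(context.solid_config))
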
Example #7

@solid
def nonce_solid(_):
    return


@pipeline
def nonce_pipeline():
    return nonce_solid()


nonce_pipeline_snapshot = nonce_pipeline.get_pipeline_snapshot()

nonce_execution_plan_snapshot = snapshot_from_execution_plan(
    create_execution_plan(nonce_pipeline),
    nonce_pipeline.get_pipeline_snapshot_id())


def test_init_modified_docker_operator(dagster_docker_image):
    with instance_for_test() as instance:
        dagster_operator_parameters = DagsterOperatorParameters(
            task_id="nonce",
            run_config={"storage": {
                "filesystem": {}
            }},
            pipeline_name="",
            mode="default",
            op_kwargs={
                "image": dagster_docker_image,
                "api_version": "auto",
            },
            pipeline_snapshot=nonce_pipeline_snapshot,
            execution_plan_snapshot=nonce_execution_plan_snapshot,
            # instance_ref is an assumption: the enclosing instance_for_test()
            # block suggests the parameters reference the test instance; the
            # rest of this example mirrors the docker operator example above
            instance_ref=instance.get_ref(),
        )
        DagsterDockerOperator(dagster_operator_parameters)
Example #8
def test_retries_active_execution():
    pipeline_def = define_diamond_pipeline()
    plan = create_execution_plan(pipeline_def)

    with plan.start(retry_mode=(RetryMode.ENABLED)) as active_execution:

        steps = active_execution.get_steps_to_execute()
        assert len(steps) == 1
        step_1 = steps[0]
        assert step_1.key == "return_two"

        steps = active_execution.get_steps_to_execute()
        assert len(steps) == 0  # can't progress

        active_execution.mark_up_for_retry(step_1.key)

        steps = active_execution.get_steps_to_execute()
        assert len(steps) == 1
        assert steps[0].key == "return_two"

        active_execution.mark_up_for_retry(step_1.key)

        steps = active_execution.get_steps_to_execute()
        assert len(steps) == 1
        assert steps[0].key == "return_two"

        active_execution.mark_success(step_1.key)
        active_execution.mark_step_produced_output(StepOutputHandle(step_1.key, "result"))

        steps = active_execution.get_steps_to_execute()
        assert len(steps) == 2
        step_2 = steps[0]
        step_3 = steps[1]
        assert step_2.key == "add_three"
        assert step_3.key == "mult_three"

        steps = active_execution.get_steps_to_execute()
        assert len(steps) == 0  # can't progress

        active_execution.mark_success(step_2.key)
        active_execution.mark_step_produced_output(StepOutputHandle(step_2.key, "result"))

        steps = active_execution.get_steps_to_execute()
        assert len(steps) == 0  # can't progress

        # uh oh failure
        active_execution.mark_failed(step_3.key)

        # can't progress to the 4th step
        steps = active_execution.get_steps_to_execute()
        assert len(steps) == 0

        assert not active_execution.is_complete

        steps = active_execution.get_steps_to_abandon()
        assert len(steps) == 1
        step_4 = steps[0]

        assert step_4.key == "adder"
        active_execution.mark_abandoned(step_4.key)

        assert active_execution.is_complete
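
define_diamond_pipeline is another helper that is not shown here. From the step keys the test walks through (return_two first, then add_three and mult_three in parallel, then adder), a consistent sketch would be:

from dagster import pipeline, solid


@solid
def return_two(_):
    return 2


@solid
def add_three(_, num):
    return num + 3


@solid
def mult_three(_, num):
    return num * 3


@solid
def adder(_, left, right):
    return left + right


def define_diamond_pipeline():
    @pipeline
    def diamond_pipeline():
        two = return_two()
        adder(left=add_three(two), right=mult_three(two))

    return diamond_pipeline
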
Example #9
def test_fan_in_should_skip_step():
    @lambda_solid
    def one():
        return 1

    @solid(output_defs=[OutputDefinition(is_required=False)])
    def skip(_):
        return
        yield  # pylint: disable=unreachable

    @solid
    def fan_in(_context, items):
        return items

    @composite_solid(output_defs=[OutputDefinition(is_required=False)])
    def composite_all_upstream_skip():
        return fan_in([skip(), skip()])

    @composite_solid(output_defs=[OutputDefinition(is_required=False)])
    def composite_one_upstream_skip():
        return fan_in([one(), skip()])

    @pipeline
    def optional_outputs_composite():
        composite_all_upstream_skip()
        composite_one_upstream_skip()

    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(pipeline_name="optional_outputs_composite", run_id=make_new_run_id())
    execute_plan(
        create_execution_plan(
            optional_outputs_composite,
            step_keys_to_execute=[
                "composite_all_upstream_skip.skip",
                "composite_all_upstream_skip.skip_2",
            ],
        ),
        InMemoryPipeline(optional_outputs_composite),
        instance,
        pipeline_run,
    )
    # skip when none of the step's upstream sources yielded an output
    assert should_skip_step(
        create_execution_plan(
            optional_outputs_composite,
            step_keys_to_execute=["composite_all_upstream_skip.fan_in"],
        ),
        instance,
        pipeline_run.run_id,
    )

    execute_plan(
        create_execution_plan(
            optional_outputs_composite,
            step_keys_to_execute=[
                "composite_one_upstream_skip.one",
                "composite_one_upstream_skip.skip",
            ],
        ),
        InMemoryPipeline(optional_outputs_composite),
        instance,
        pipeline_run,
    )
    # do not skip when at least one upstream source yielded an output
    assert not should_skip_step(
        create_execution_plan(
            optional_outputs_composite,
            step_keys_to_execute=["composite_one_upstream_skip.fan_in"],
        ),
        instance,
        pipeline_run.run_id,
    )
Example #10
def test_s3_asset_store_execution(mock_s3_bucket):
    pipeline_def = define_inty_pipeline()

    run_config = {
        "resources": {
            "asset_store": {
                "config": {
                    "s3_bucket": mock_s3_bucket.name
                }
            }
        }
    }

    run_id = make_new_run_id()

    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)

    assert execution_plan.get_step_by_key("return_one.compute")

    step_keys = ["return_one.compute"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(pipeline_name=pipeline_def.name,
                               run_id=run_id,
                               run_config=run_config)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(return_one_step_events, "return_one.compute")

    asset_store = PickledObjectS3AssetStore(mock_s3_bucket.name,
                                            s3_prefix="dagster")
    step_output_handle = StepOutputHandle("return_one.compute")
    context = AssetStoreContext(
        step_output_handle.step_key,
        step_output_handle.output_name,
        {},
        pipeline_def.name,
        pipeline_def.solid_def_named("return_one"),
        run_id,
    )
    assert asset_store.get_asset(context) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one.compute"]),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    step_output_handle = StepOutputHandle("add_one.compute")
    context = AssetStoreContext(
        step_output_handle.step_key,
        step_output_handle.output_name,
        {},
        pipeline_def.name,
        pipeline_def.solid_def_named("add_one"),
        run_id,
    )

    assert get_step_output(add_one_step_events, "add_one.compute")
    assert asset_store.get_asset(context) == 2
Example #11
def test_adls2_object_manager_execution(storage_account, file_system,
                                        credential):
    pipeline_def = define_inty_pipeline()

    run_config = {
        "resources": {
            "object_manager": {
                "config": {
                    "adls2_file_system": file_system
                }
            },
            "adls2": {
                "config": {
                    "storage_account": storage_account,
                    "credential": {
                        "key": credential
                    }
                }
            },
        }
    }

    run_id = make_new_run_id()

    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)

    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(pipeline_name=pipeline_def.name,
                               run_id=run_id,
                               run_config=run_config)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(return_one_step_events, "return_one")
    step_output_handle = StepOutputHandle("return_one")
    context = InputContext(
        pipeline_name=pipeline_def.name,
        solid_def=pipeline_def.solid_def_named("return_one"),
        upstream_output=OutputContext(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            solid_def=pipeline_def.solid_def_named("return_one"),
        ),
    )

    object_manager = PickledObjectADLS2ObjectManager(
        file_system=file_system,
        adls2_client=create_adls2_client(storage_account, credential),
        blob_client=create_blob_client(storage_account, credential),
    )
    assert object_manager.load_input(context) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"]),
            pipeline_run=pipeline_run,
            run_config=run_config,
            instance=instance,
        ))

    step_output_handle = StepOutputHandle("add_one")
    context = InputContext(
        pipeline_name=pipeline_def.name,
        solid_def=pipeline_def.solid_def_named("add_one"),
        upstream_output=OutputContext(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            solid_def=pipeline_def.solid_def_named("add_one"),
        ),
    )

    assert get_step_output(add_one_step_events, "add_one")
    assert object_manager.load_input(context) == 2
Example #12
def _launch_scheduled_execution(instance, schedule_def, pipeline, tick,
                                stream):
    pipeline_def = pipeline.get_definition()

    # Run should_execute and halt if it returns False
    schedule_context = ScheduleExecutionContext(instance)
    with user_code_error_boundary(
            ScheduleExecutionError,
            lambda:
            'Error occurred during the execution of should_execute for schedule '
            '{schedule_name}'.format(schedule_name=schedule_def.name),
    ):
        should_execute = schedule_def.should_execute(schedule_context)

    if not should_execute:
        # Update tick to skipped state and return
        tick.update_with_status(ScheduleTickStatus.SKIPPED)
        stream.send(ScheduledExecutionSkipped())
        return

    errors = []

    run_config = {}
    schedule_tags = {}
    try:
        with user_code_error_boundary(
                ScheduleExecutionError,
                lambda:
                'Error occurred during the execution of run_config_fn for schedule '
                '{schedule_name}'.format(schedule_name=schedule_def.name),
        ):
            run_config = schedule_def.get_run_config(schedule_context)
    except DagsterUserCodeExecutionError:
        error_data = serializable_error_info_from_exc_info(sys.exc_info())
        errors.append(error_data)

    try:
        with user_code_error_boundary(
                ScheduleExecutionError,
                lambda:
                'Error occurred during the execution of tags_fn for schedule '
                '{schedule_name}'.format(schedule_name=schedule_def.name),
        ):
            schedule_tags = schedule_def.get_tags(schedule_context)
    except DagsterUserCodeExecutionError:
        error_data = serializable_error_info_from_exc_info(sys.exc_info())
        errors.append(error_data)

    pipeline_tags = pipeline_def.tags or {}
    check_tags(pipeline_tags, 'pipeline_tags')
    tags = merge_dicts(pipeline_tags, schedule_tags)

    mode = schedule_def.mode

    execution_plan_snapshot = None
    try:
        execution_plan = create_execution_plan(
            pipeline_def,
            run_config=run_config,
            mode=mode,
        )
        execution_plan_snapshot = snapshot_from_execution_plan(
            execution_plan, pipeline_def.get_pipeline_snapshot_id())
    except DagsterInvalidConfigError:
        error_data = serializable_error_info_from_exc_info(sys.exc_info())
        errors.append(error_data)

    # Enter the run in the DB with the information we have
    possibly_invalid_pipeline_run = instance.create_run(
        pipeline_name=schedule_def.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=mode,
        solids_to_execute=pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=pipeline.solid_selection,
        status=None,
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(),
    )

    tick.update_with_status(ScheduleTickStatus.SUCCESS,
                            run_id=possibly_invalid_pipeline_run.run_id)

    # If there were errors, inject them into the event log and fail the run
    if len(errors) > 0:
        for error in errors:
            instance.report_engine_event(
                error.message,
                possibly_invalid_pipeline_run,
                EngineEventData.engine_error(error),
            )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id, errors=errors))
        return

    # Otherwise the run should be valid so lets launch it

    # Need an ExternalPipeline to launch so make one here
    recon_repo = pipeline.get_reconstructable_repository()
    repo_location = InProcessRepositoryLocation(recon_repo)
    external_pipeline = repo_location.get_repository(
        recon_repo.get_definition().name).get_full_external_pipeline(
            pipeline_def.name)

    try:
        launched_run = instance.launch_run(
            possibly_invalid_pipeline_run.run_id, external_pipeline)
    except DagsterLaunchFailedError:
        error = serializable_error_info_from_exc_info(sys.exc_info())
        instance.report_engine_event(
            error.message,
            possibly_invalid_pipeline_run,
            EngineEventData.engine_error(error),
        )
        instance.report_run_failed(possibly_invalid_pipeline_run)
        stream.send(
            ScheduledExecutionFailed(
                run_id=possibly_invalid_pipeline_run.run_id, errors=[error]))
        return

    stream.send(ScheduledExecutionSuccess(run_id=launched_run.run_id))
    return
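
The function above wraps the hooks of a ScheduleDefinition (should_execute, the run-config function, the tags function) in user-code error boundaries before creating and launching the run. A minimal sketch of a schedule it could be handed, assuming the constructor-style API of this era and a placeholder pipeline name:

from dagster import ScheduleDefinition

nightly_schedule = ScheduleDefinition(
    name='nightly',
    cron_schedule='0 0 * * *',
    pipeline_name='my_pipeline',  # placeholder; must name a pipeline in the repository
    run_config={'storage': {'filesystem': {}}},
    mode='default',
)
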
Example #13
    def _execute_plan(self, execute_step_args_packed, executable_dict):
        execute_step_args = unpack_value(
            check.dict_param(
                execute_step_args_packed,
                "execute_step_args_packed",
            ))
        check.inst_param(execute_step_args, "execute_step_args",
                         ExecuteStepArgs)

        check.dict_param(executable_dict, "executable_dict")

        instance = DagsterInstance.from_ref(execute_step_args.instance_ref)

        pipeline = ReconstructablePipeline.from_dict(executable_dict)
        retry_mode = execute_step_args.retry_mode

        pipeline_run = instance.get_run_by_id(
            execute_step_args.pipeline_run_id)
        check.invariant(
            pipeline_run,
            "Could not load run {}".format(execute_step_args.pipeline_run_id))

        step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

        execution_plan = create_execution_plan(
            pipeline,
            pipeline_run.run_config,
            mode=pipeline_run.mode,
            step_keys_to_execute=execute_step_args.step_keys_to_execute,
            known_state=execute_step_args.known_state,
        )

        engine_event = instance.report_engine_event(
            "Executing steps {} in celery worker".format(step_keys_str),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_keys_str, "step_keys"),
                    EventMetadataEntry.text(self.request.hostname,
                                            "Celery worker"),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            CeleryExecutor,
            step_key=execution_plan.step_handle_for_single_step_plans().to_key(
            ),
        )

        events = [engine_event]
        for step_event in execute_plan_iterator(
                execution_plan=execution_plan,
                pipeline=pipeline,
                pipeline_run=pipeline_run,
                instance=instance,
                retry_mode=retry_mode,
                run_config=pipeline_run.run_config,
        ):
            events.append(step_event)

        serialized_events = [
            serialize_dagster_namedtuple(event) for event in events
        ]
        return serialized_events
Example #14
def test_using_adls2_for_subplan(storage_account, file_system):
    pipeline_def = define_inty_pipeline()

    run_config = {
        "resources": {
            "adls2": {
                "config": {"storage_account": storage_account, "credential": get_azure_credential()}
            }
        },
        "storage": {"adls2": {"config": {"adls2_file_system": file_system}}},
    }

    run_id = make_new_run_id()

    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)

    assert execution_plan.get_step_by_key("return_one.compute")

    step_keys = ["return_one.compute"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(
        pipeline_name=pipeline_def.name, run_id=run_id, run_config=run_config
    )

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(return_one_step_events, "return_one.compute")
    with scoped_pipeline_context(
        execution_plan.build_subset_plan(["return_one.compute"]),
        run_config,
        pipeline_run,
        instance,
    ) as context:

        resource = context.scoped_resources_builder.build(required_resource_keys={"adls2"}).adls2
        intermediate_storage = ADLS2IntermediateStorage(
            file_system=file_system,
            run_id=run_id,
            adls2_client=resource.adls2_client,
            blob_client=resource.blob_client,
        )
        step_output_handle = StepOutputHandle("return_one.compute")
        assert intermediate_storage.has_intermediate(context, step_output_handle)
        assert intermediate_storage.get_intermediate(context, Int, step_output_handle).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one.compute"]),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        )
    )

    assert get_step_output(add_one_step_events, "add_one.compute")
    with scoped_pipeline_context(
        execution_plan.build_subset_plan(["add_one.compute"]), run_config, pipeline_run, instance,
    ) as context:
        step_output_handle = StepOutputHandle("add_one.compute")
        assert intermediate_storage.has_intermediate(context, step_output_handle)
        assert intermediate_storage.get_intermediate(context, Int, step_output_handle).obj == 2
Example #15
def test_using_s3_for_subplan(s3_bucket):
    pipeline_def = define_inty_pipeline()

    environment_dict = {
        'storage': {
            's3': {
                'config': {
                    's3_bucket': s3_bucket
                }
            }
        }
    }

    run_id = make_new_run_id()

    execution_plan = create_execution_plan(pipeline_def,
                                           environment_dict=environment_dict,
                                           run_config=RunConfig(run_id=run_id))

    assert execution_plan.get_step_by_key('return_one.compute')

    step_keys = ['return_one.compute']
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun.create_empty_run(
        pipeline_def.name, run_id=run_id, environment_dict=environment_dict)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            environment_dict=environment_dict,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(return_one_step_events, 'return_one.compute')
    with scoped_pipeline_context(
            pipeline_def,
            environment_dict,
            pipeline_run,
            instance,
            execution_plan.build_subset_plan(['return_one.compute']),
    ) as context:

        store = S3IntermediateStore(
            s3_bucket,
            run_id,
            s3_session=context.scoped_resources_builder.build(
                required_resource_keys={'s3'}, ).s3.session,
        )
        assert store.has_intermediate(context, 'return_one.compute')
        assert store.get_intermediate(context, 'return_one.compute',
                                      Int).obj == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(['add_one.compute']),
            environment_dict=environment_dict,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(add_one_step_events, 'add_one.compute')
    with scoped_pipeline_context(
            pipeline_def,
            environment_dict,
            pipeline_run,
            instance,
            execution_plan.build_subset_plan(['add_one.compute']),
    ) as context:
        assert store.has_intermediate(context, 'add_one.compute')
        assert store.get_intermediate(context, 'add_one.compute', Int).obj == 2
Example #16
def test_create_execution_plan_with_bad_inputs():
    with pytest.raises(DagsterInvalidConfigError):
        create_execution_plan(
            define_diamond_pipeline(), run_config={"solids": {"add_three": {"inputs": {"num": 3}}}},
        )
Example #17
    def get_context(self, solid_config=None, mode_def=None, environment_dict=None):
        '''Get a dagstermill execution context for interactive exploration and development.

        Args:
            solid_config (Optional[Any]): If specified, this value will be made available on the
                context as its ``solid_config`` property.
            mode_def (Optional[:class:`dagster.ModeDefinition`]): If specified, defines the mode to
                use to construct the context. Specify this if you would like a context constructed
                with specific ``resource_defs`` or ``logger_defs``. By default, an ephemeral mode
                with a console logger will be constructed.
            environment_dict(Optional[dict]): The environment config dict with which to construct
                the context.

        Returns:
            :py:class:`~dagstermill.DagstermillExecutionContext`
        '''
        check.opt_inst_param(mode_def, 'mode_def', ModeDefinition)
        environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)

        # If we are running non-interactively, and there is already a context reconstituted, return
        # that context rather than overwriting it.
        if self.context is not None and isinstance(
            self.context, DagstermillRuntimeExecutionContext
        ):
            return self.context

        if not mode_def:
            mode_def = ModeDefinition(logger_defs={'dagstermill': colored_console_logger})
            environment_dict['loggers'] = {'dagstermill': {}}

        solid_def = SolidDefinition(
            name='this_solid',
            input_defs=[],
            compute_fn=lambda *args, **kwargs: None,
            output_defs=[],
            description='Ephemeral solid constructed by dagstermill.get_context()',
            required_resource_keys=mode_def.resource_key_set,
        )

        pipeline_def = PipelineDefinition(
            [solid_def], mode_defs=[mode_def], name='ephemeral_dagstermill_pipeline'
        )

        run_id = make_new_run_id()

        # construct stubbed PipelineRun for notebook exploration...
        # The actual pipeline run during pipeline execution will be serialized and reconstituted
        # in the `reconstitute_pipeline_context` call
        pipeline_run = PipelineRun(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            environment_dict=environment_dict,
            mode=mode_def.name,
            selector=None,
            step_keys_to_execute=None,
            status=PipelineRunStatus.NOT_STARTED,
            tags=None,
        )

        self.in_pipeline = False
        self.solid_def = solid_def
        self.pipeline_def = pipeline_def

        execution_plan = create_execution_plan(self.pipeline_def, environment_dict, pipeline_run)
        with scoped_pipeline_context(
            self.pipeline_def,
            environment_dict,
            pipeline_run,
            DagsterInstance.ephemeral(),
            execution_plan,
            scoped_resources_builder_cm=self._setup_resources,
        ) as pipeline_context:

            self.context = DagstermillExecutionContext(
                pipeline_context=pipeline_context,
                solid_config=solid_config,
                resource_keys_to_init=get_required_resource_keys_to_init(
                    execution_plan, pipeline_context.system_storage_def
                ),
            )

        return self.context
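
The mode_def argument documented above is how an interactive context picks up real resource definitions rather than the ephemeral defaults. A hedged sketch (the resource and its key are invented for illustration):

import dagstermill
from dagster import ModeDefinition, resource


@resource
def fake_warehouse(_):
    return {'rows': []}


# resources declared on the mode are initialized for the ephemeral solid
context = dagstermill.get_context(
    mode_def=ModeDefinition(resource_defs={'warehouse': fake_warehouse}),
)
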
Example #18
def test_emr_pyspark_execution_plan():
    os.environ['EMR_CLUSTER_ID'] = 'some_cluster_id'
    create_execution_plan(my_pipeline,
                          mode='emr',
                          run_config=emr_preset.run_config)
Example #19
def test_can_reexecute():
    pipeline_def = define_pipeline(fs_io_manager, {})
    plan = create_execution_plan(pipeline_def)
    assert plan.artifacts_persisted
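
artifacts_persisted is what makes per-step re-execution safe: it holds only when every step output goes through a persistent IO manager (such as fs_io_manager above), so a later run can reload upstream outputs. As a hedged contrast, assuming the same define_pipeline helper accepts any IO manager:

from dagster import mem_io_manager

plan = create_execution_plan(define_pipeline(mem_io_manager, {}))
assert not plan.artifacts_persisted  # in-memory outputs can't be reloaded by a later run
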
Example #20
def _start_pipeline_execution(graphene_info,
                              execution_params,
                              is_reexecuted=False):
    check.inst_param(graphene_info, 'graphene_info', ResolveInfo)
    check.inst_param(execution_params, 'execution_params', ExecutionParams)

    if is_reexecuted:
        # required fields for re-execution
        execution_metadata = check.inst_param(
            execution_params.execution_metadata, 'execution_metadata',
            ExecutionMetadata)
        check.str_param(execution_metadata.root_run_id, 'root_run_id')
        check.str_param(execution_metadata.parent_run_id, 'parent_run_id')

    error_type = ('StartPipelineExecutionDisabledError' if not is_reexecuted
                  else 'DauphinStartPipelineReexecutionDisabledError')
    success_type = ('StartPipelineExecutionSuccess' if not is_reexecuted else
                    'StartPipelineReexecutionSuccess')

    instance = graphene_info.context.instance

    execution_manager_settings = instance.dagit_settings.get(
        'execution_manager')
    if execution_manager_settings and execution_manager_settings.get(
            'disabled'):
        return graphene_info.schema.type_named(error_type)()

    pipeline_def = get_pipeline_def_from_selector(graphene_info,
                                                  execution_params.selector)

    get_validated_config(
        graphene_info,
        pipeline_def,
        environment_dict=execution_params.environment_dict,
        mode=execution_params.mode,
    )

    execution_plan = create_execution_plan(
        pipeline_def,
        execution_params.environment_dict,
        run_config=RunConfig(
            mode=execution_params.mode,
            previous_run_id=execution_params.previous_run_id,
            tags=execution_params.execution_metadata.tags,
        ),
    )

    _check_start_pipeline_execution_errors(graphene_info, execution_params,
                                           execution_plan)

    if execution_params.execution_metadata.run_id:
        # If a run_id is provided, use the old get_or_create_run machinery
        run = instance.get_or_create_run(
            _create_pipeline_run(instance, pipeline_def, execution_params))
    else:
        # Otherwise we know we are creating a new run, and we can
        # use the new machinery that persists a pipeline snapshot
        # with the run.
        run = instance.create_run_with_snapshot(
            InstanceCreateRunArgs(
                pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
                run_id=execution_params.execution_metadata.run_id
                if execution_params.execution_metadata.run_id else
                make_new_run_id(),
                selector=execution_params.selector,
                environment_dict=execution_params.environment_dict,
                mode=execution_params.mode,
                step_keys_to_execute=get_step_keys_to_execute(
                    instance, pipeline_def, execution_params)
                or execution_params.step_keys,
                tags=execution_params.execution_metadata.tags,
                status=PipelineRunStatus.NOT_STARTED,
                root_run_id=(execution_params.execution_metadata.root_run_id
                             or execution_params.previous_run_id),
                parent_run_id=(
                    execution_params.execution_metadata.parent_run_id
                    or execution_params.previous_run_id),
            ))

    graphene_info.context.execution_manager.execute_pipeline(
        graphene_info.context.get_handle(),
        pipeline_def,
        run,
        instance=instance,
    )

    return graphene_info.schema.type_named(success_type)(
        run=graphene_info.schema.type_named('PipelineRun')(run))
Example #21
def test_all_step_events():  # pylint: disable=too-many-locals
    handle = ExecutionTargetHandle.for_pipeline_fn(define_test_events_pipeline)
    pipeline = handle.build_pipeline_definition()
    mode = pipeline.get_default_mode_name()
    instance = DagsterInstance.ephemeral()
    execution_plan = create_execution_plan(pipeline, mode=mode)
    pipeline_run = instance.create_run_for_pipeline(
        pipeline=pipeline, execution_plan=execution_plan, mode=mode)
    step_levels = execution_plan.topological_step_levels()

    unhandled_events = STEP_EVENTS.copy()

    # Exclude types that are not step events
    ignored_events = {
        'LogMessageEvent',
        'PipelineStartEvent',
        'PipelineSuccessEvent',
        'PipelineInitFailureEvent',
        'PipelineFailureEvent',
    }

    event_counts = defaultdict(int)

    for step_level in step_levels:
        for step in step_level:
            variables = {
                'executionParams': {
                    'selector': {
                        'name': pipeline.name
                    },
                    'environmentConfigData': {
                        'storage': {
                            'filesystem': {}
                        }
                    },
                    'mode': mode,
                    'executionMetadata': {
                        'runId': pipeline_run.run_id
                    },
                    'stepKeys': [step.key],
                }
            }
            res = execute_query(handle,
                                EXECUTE_PLAN_MUTATION,
                                variables,
                                instance=instance)

            # go through the same dict, decrement all the event records we've seen from the GraphQL
            # response
            if not res.get('errors'):
                assert 'data' in res, res
                assert 'executePlan' in res['data'], res
                assert 'stepEvents' in res['data']['executePlan'], res
                step_events = res['data']['executePlan']['stepEvents']

                events = [
                    dagster_event_from_dict(e, pipeline.name)
                    for e in step_events
                    if e['__typename'] not in ignored_events
                ]

                for event in events:
                    if event.step_key:
                        key = event.step_key + '.' + event.event_type_value
                    else:
                        key = event.event_type_value
                    event_counts[key] -= 1
                unhandled_events -= {
                    DagsterEventType(e.event_type_value)
                    for e in events
                }
            else:
                raise Exception(res['errors'])

    # build up a dict, incrementing all the event records we've produced in the run storage
    logs = instance.all_logs(pipeline_run.run_id)
    for log in logs:
        if not log.dagster_event or (DagsterEventType(
                log.dagster_event.event_type_value) not in STEP_EVENTS.union(
                    set([DagsterEventType.ENGINE_EVENT]))):
            continue
        if log.dagster_event.step_key:
            key = log.dagster_event.step_key + '.' + log.dagster_event.event_type_value
        else:
            key = log.dagster_event.event_type_value
        event_counts[key] += 1

    # Ensure we've processed all the events that were generated in the run storage
    assert sum(event_counts.values()) == 0

    # Ensure we've handled the universe of event types
    # Why are these retry events not handled? Because right now there is no way to configure retries
    # on executePlan -- this needs to change, and we should separate the ExecutionParams that get
    # sent to executePlan from those that get sent to startPipelineExecution and friends
    assert unhandled_events == {
        DagsterEventType.STEP_UP_FOR_RETRY, DagsterEventType.STEP_RESTARTED
    }
Example #22
def test_using_gcs_for_subplan(gcs_bucket):
    pipeline_def = define_inty_pipeline()

    run_config = {
        "intermediate_storage": {
            "gcs": {
                "config": {
                    "gcs_bucket": gcs_bucket
                }
            }
        }
    }

    run_id = make_new_run_id()

    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)

    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(pipeline_name=pipeline_def.name,
                               run_id=run_id,
                               run_config=run_config)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(return_one_step_events, "return_one")
    with scoped_pipeline_context(
            execution_plan.build_subset_plan(["return_one"]),
            run_config,
            pipeline_run,
            instance,
    ) as context:
        intermediate_storage = GCSIntermediateStorage(
            gcs_bucket,
            run_id,
            client=context.scoped_resources_builder.build(
                required_resource_keys={"gcs"}, ).gcs,
        )
        assert intermediate_storage.has_intermediate(
            context, StepOutputHandle("return_one"))
        assert (intermediate_storage.get_intermediate(
            context, Int, StepOutputHandle("return_one")).obj == 1)

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"]),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(add_one_step_events, "add_one")
    with scoped_pipeline_context(
            execution_plan.build_subset_plan(["return_one"]),
            run_config,
            pipeline_run,
            instance,
    ) as context:
        assert intermediate_storage.has_intermediate(
            context, StepOutputHandle("add_one"))
        assert (intermediate_storage.get_intermediate(
            context, Int, StepOutputHandle("add_one")).obj == 2)
Example #23
    def create_run_for_pipeline(
        self,
        pipeline_def,
        execution_plan=None,
        run_id=None,
        run_config=None,
        mode=None,
        solids_to_execute=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
        solid_selection=None,
    ):
        from dagster.core.execution.api import create_execution_plan
        from dagster.core.execution.plan.plan import ExecutionPlan
        from dagster.core.snap import snapshot_from_execution_plan

        check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition)
        check.opt_inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        # note that solids_to_execute is required to execute the solid subset, which is the
        # frozenset version of the previous solid_subset.
        # solid_selection is not required and will not be converted to solids_to_execute here.
        # i.e. this function doesn't handle solid queries.
        # solid_selection is only used to pass the user queries further down.
        check.opt_set_param(solids_to_execute, 'solids_to_execute', of_type=str)
        check.opt_list_param(solid_selection, 'solid_selection', of_type=str)

        if solids_to_execute:
            if isinstance(pipeline_def, PipelineSubsetDefinition):
                # for the case when pipeline_def is created by ExecutablePipeline or ExternalPipeline
                check.invariant(
                    solids_to_execute == pipeline_def.solids_to_execute,
                    'Cannot create a PipelineRun from pipeline subset {pipeline_solids_to_execute} '
                    'that conflicts with solids_to_execute arg {solids_to_execute}'.format(
                        pipeline_solids_to_execute=str_format_list(pipeline_def.solids_to_execute),
                        solids_to_execute=str_format_list(solids_to_execute),
                    ),
                )
            else:
                # for cases when `create_run_for_pipeline` is directly called
                pipeline_def = pipeline_def.get_pipeline_subset_def(
                    solids_to_execute=solids_to_execute
                )

        if execution_plan is None:
            execution_plan = create_execution_plan(
                pipeline_def,
                run_config=run_config,
                mode=mode,
                step_keys_to_execute=step_keys_to_execute,
            )

        return self.create_run(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            run_config=run_config,
            mode=check.opt_str_param(mode, 'mode', default=pipeline_def.get_default_mode_name()),
            solid_selection=solid_selection,
            solids_to_execute=solids_to_execute,
            step_keys_to_execute=step_keys_to_execute,
            status=status,
            tags=tags,
            root_run_id=root_run_id,
            parent_run_id=parent_run_id,
            pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
            execution_plan_snapshot=snapshot_from_execution_plan(
                execution_plan, pipeline_def.get_pipeline_snapshot_id()
            ),
            parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(),
        )
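
The comment in this method carries the key distinction: solid_selection is the raw user query and is only passed through, while solids_to_execute is the already-resolved frozenset that actually subsets the pipeline. A hypothetical call showing the pair together (the pipeline definition and instance are assumed to exist):

run = instance.create_run_for_pipeline(
    pipeline_def=my_pipeline_def,                             # assumed to be defined elsewhere
    solid_selection=['*add_one'],                             # user query, stored on the run as-is
    solids_to_execute=frozenset({'return_one', 'add_one'}),   # resolved names that subset the pipeline
)
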
Example #24
    def create_run_for_pipeline(
        self,
        pipeline_def,
        execution_plan=None,
        run_id=None,
        environment_dict=None,
        mode=None,
        solid_subset=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
    ):
        from dagster.core.execution.api import create_execution_plan
        from dagster.core.execution.plan.plan import ExecutionPlan
        from dagster.core.snap import snapshot_from_execution_plan

        check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition)
        check.opt_inst_param(execution_plan, 'execution_plan', ExecutionPlan)

        if solid_subset:
            if isinstance(pipeline_def, PipelineSubsetForExecution):
                check.invariant(
                    len(solid_subset) == len(pipeline_def.solid_subset)
                    and set(solid_subset) == set(pipeline_def.solid_subset),
                    'Cannot create a PipelineRun from pipeline subset {pipeline_solid_subset} that '
                    'conflicts with solid_subset arg {solid_subset}'.format(
                        pipeline_solid_subset=str_format_list(
                            pipeline_def.solid_subset),
                        solid_subset=str_format_list(solid_subset),
                    ),
                )
            else:
                pipeline_def = pipeline_def.subset_for_execution(
                    solid_subset=solid_subset)

        if execution_plan is None:
            execution_plan = create_execution_plan(
                pipeline_def,
                environment_dict=environment_dict,
                mode=mode,
                step_keys_to_execute=step_keys_to_execute,
            )

        return self.create_run(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            environment_dict=environment_dict,
            mode=check.opt_str_param(
                mode, 'mode', default=pipeline_def.get_default_mode_name()),
            solid_subset=solid_subset,
            step_keys_to_execute=step_keys_to_execute,
            status=status,
            tags=tags,
            root_run_id=root_run_id,
            parent_run_id=parent_run_id,
            pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
            execution_plan_snapshot=snapshot_from_execution_plan(
                execution_plan, pipeline_def.get_pipeline_snapshot_id()),
            parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(),
        )
        )
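
This legacy variant differs from the preceding one mainly in naming: environment_dict and solid_subset became run_config and solid_selection in later releases. A sketch against the legacy keywords, reusing the hypothetical noop_pipeline and instance from the sketch above:

# Sketch only: same entry point, legacy keyword names. Passing solid_subset
# exercises the subset_for_execution branch shown above.
run = instance.create_run_for_pipeline(
    pipeline_def=noop_pipeline,
    environment_dict={'storage': {'filesystem': {}}},
    solid_subset=['noop_solid'],
)
assert run.pipeline_name == 'noop_pipeline'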
Exemple #25
def test_tags_to_dynamic_plan():
    @solid(
        tags={
            USER_DEFINED_K8S_CONFIG_KEY: {
                "container_config": {
                    "resources": {
                        "requests": {
                            "cpu": "500m",
                            "memory": "128Mi"
                        },
                        "limits": {
                            "cpu": "1000m",
                            "memory": "1Gi"
                        },
                    }
                }
            }
        })
    def multiply_inputs(_, x):
        return 2 * x

    @solid(
        tags={
            USER_DEFINED_K8S_CONFIG_KEY: {
                "container_config": {
                    "resources": {
                        "requests": {
                            "cpu": "250m",
                            "memory": "64Mi"
                        },
                        "limits": {
                            "cpu": "500m",
                            "memory": "2560Mi"
                        },
                    }
                }
            }
        },
        output_defs=[DynamicOutputDefinition()],
    )
    def emit(_):
        for i in range(3):
            yield DynamicOutput(value=i, mapping_key=str(i))

    @pipeline
    def k8s_ready():
        return emit().map(multiply_inputs)

    known_state = KnownExecutionState(
        {},
        {
            emit.name: {
                "result": ["0", "1", "2"]
            },
        },
    )
    plan = create_execution_plan(k8s_ready, known_state=known_state)

    emit_step = plan.get_step_by_key(emit.name)
    user_defined_k8s_config = get_user_defined_k8s_config(emit_step.tags)

    assert user_defined_k8s_config.container_config
    assert user_defined_k8s_config.container_config["resources"]

    resources = user_defined_k8s_config.container_config["resources"]

    assert resources["requests"]["cpu"] == "250m"
    assert resources["requests"]["memory"] == "64Mi"
    assert resources["limits"]["cpu"] == "500m"
    assert resources["limits"]["memory"] == "2560Mi"

    for mapping_key in range(3):
        multiply_inputs_step = plan.get_step_by_key(
            f"{multiply_inputs.name}[{mapping_key}]"
        )
        dynamic_step_user_defined_k8s_config = get_user_defined_k8s_config(
            multiply_inputs_step.tags
        )

        assert dynamic_step_user_defined_k8s_config.container_config
        assert dynamic_step_user_defined_k8s_config.container_config["resources"]

        resources = dynamic_step_user_defined_k8s_config.container_config["resources"]

        assert resources["requests"]["cpu"] == "500m"
        assert resources["requests"]["memory"] == "128Mi"
        assert resources["limits"]["cpu"] == "1000m"
        assert resources["limits"]["memory"] == "1Gi"
Exemple #26
def _do_execute_plan(graphene_info, execution_params, pipeline_def):
    check.inst_param(graphene_info, 'graphene_info', ResolveInfo)
    check.inst_param(execution_params, 'execution_params', ExecutionParams)

    run_id = execution_params.execution_metadata.run_id

    pipeline_run = graphene_info.context.instance.get_run_by_id(run_id)
    if not pipeline_run:
        # TODO switch to raising a UserFacingError if the run_id cannot be found
        # https://github.com/dagster-io/dagster/issues/1876
        pipeline_run = PipelineRun(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            environment_dict=execution_params.environment_dict,
            mode=execution_params.mode or pipeline_def.get_default_mode_name(),
            tags=execution_params.execution_metadata.tags or {},
        )

    execution_plan = create_execution_plan(
        pipeline=pipeline_def,
        environment_dict=execution_params.environment_dict,
        run_config=pipeline_run,
    )

    if execution_params.step_keys:
        for step_key in execution_params.step_keys:
            if not execution_plan.has_step(step_key):
                raise UserFacingGraphQLError(
                    graphene_info.schema.type_named('InvalidStepError')(invalid_step_key=step_key)
                )

        execution_plan = execution_plan.build_subset_plan(execution_params.step_keys)

    event_logs = []

    def _on_event_record(record):
        if record.is_dagster_event:
            event_logs.append(record)

    graphene_info.context.instance.add_event_listener(run_id, _on_event_record)

    execute_plan(
        execution_plan=execution_plan,
        environment_dict=execution_params.environment_dict,
        pipeline_run=pipeline_run,
        instance=graphene_info.context.instance,
    )

    dauphin_pipeline = DauphinPipeline.from_pipeline_def(pipeline_def)

    def to_graphql_event(event_record):
        return from_dagster_event_record(
            graphene_info, event_record, dauphin_pipeline, execution_plan
        )

    return graphene_info.schema.type_named('ExecutePlanSuccess')(
        pipeline=dauphin_pipeline,
        has_failures=any(
            er
            for er in event_logs
            if er.is_dagster_event and er.dagster_event.event_type == DagsterEventType.STEP_FAILURE
        ),
        step_events=list(map(to_graphql_event, event_logs)),
        raw_event_records=list(map(serialize_dagster_namedtuple, event_logs)),
    )
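
The step-key validation above is the reusable part: confirm has_step for every requested key before calling build_subset_plan, so a typo fails loudly instead of silently executing nothing. A standalone sketch (the helper name is illustrative, not a dagster-graphql API):

def safe_subset_plan(execution_plan, step_keys):
    # Illustrative helper: validate every key, then subset, mirroring the
    # loop in _do_execute_plan above.
    missing = [key for key in step_keys if not execution_plan.has_step(key)]
    if missing:
        raise ValueError('Unknown step keys: {missing}'.format(missing=missing))
    return execution_plan.build_subset_plan(step_keys)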
Exemple #27
def test_all_step_events():  # pylint: disable=too-many-locals
    handle = ExecutionTargetHandle.for_pipeline_fn(define_test_events_pipeline)
    pipeline = handle.build_pipeline_definition()
    mode = pipeline.get_default_mode_name()
    run_config = RunConfig(mode=mode)
    execution_plan = create_execution_plan(pipeline, {}, run_config=run_config)
    step_levels = execution_plan.topological_step_levels()

    unhandled_events = STEP_EVENTS.copy()

    # Exclude types that are not step events
    ignored_events = {
        'LogMessageEvent',
        'PipelineStartEvent',
        'PipelineSuccessEvent',
        'PipelineInitFailureEvent',
        'PipelineFailureEvent',
    }

    event_counts = defaultdict(int)

    for step_level in step_levels:
        for step in step_level:

            variables = {
                'executionParams': {
                    'selector': {
                        'name': pipeline.name
                    },
                    'environmentConfigData': {
                        'storage': {
                            'filesystem': {}
                        }
                    },
                    'mode': mode,
                    'executionMetadata': {
                        'runId': run_config.run_id
                    },
                    'stepKeys': [step.key],
                }
            }
            instance = DagsterInstance.ephemeral()
            res = execute_query(handle,
                                START_PIPELINE_EXECUTION_MUTATION,
                                variables,
                                instance=instance)

            # go through the same dict, decrement all the event records we've seen from the GraphQL
            # response
            if not res.get('errors'):
                run_logs = res['data']['startPipelineExecution']['run']['logs']['nodes']

                events = [
                    dagster_event_from_dict(e, pipeline.name) for e in run_logs
                    if e['__typename'] not in ignored_events
                ]

                for event in events:
                    if event.step_key:
                        key = event.step_key + '.' + event.event_type_value
                    else:
                        key = event.event_type_value
                    event_counts[key] -= 1
                unhandled_events -= {
                    DagsterEventType(e.event_type_value)
                    for e in events
                }
            else:
                raise Exception(res['errors'])

            # build up a dict, incrementing all the event records we've produced in the run storage
            logs = instance.all_logs(run_config.run_id)
            for log in logs:
                if not log.dagster_event or (
                    DagsterEventType(log.dagster_event.event_type_value)
                    not in STEP_EVENTS.union({DagsterEventType.ENGINE_EVENT})
                ):
                    continue
                if log.dagster_event.step_key:
                    key = log.dagster_event.step_key + '.' + log.dagster_event.event_type_value
                else:
                    key = log.dagster_event.event_type_value
                event_counts[key] += 1

    # Ensure we've processed all the events that were generated in the run storage
    assert sum(event_counts.values()) == 0

    # Ensure we've handled the universe of event types
    assert not unhandled_events
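
The bookkeeping trick in this test generalizes: decrement each event observed over GraphQL, increment each event found in run storage, and require the counts to cancel. In miniature, with stand-in event keys:

from collections import defaultdict

# Stand-in data: the same events seen through two channels.
events_from_graphql = ['return_one.STEP_START', 'return_one.STEP_SUCCESS']
events_from_storage = ['return_one.STEP_START', 'return_one.STEP_SUCCESS']

counts = defaultdict(int)
for key in events_from_graphql:
    counts[key] -= 1
for key in events_from_storage:
    counts[key] += 1

# Any one-sided event leaves a nonzero residue (the original test checks
# the sum, as here).
assert sum(counts.values()) == 0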
Exemple #28
    def reconstitute_pipeline_context(
        self,
        output_log_path=None,
        marshal_dir=None,
        run_config=None,
        executable_dict=None,
        pipeline_run_dict=None,
        solid_handle_kwargs=None,
        instance_ref_dict=None,
    ):
        """Reconstitutes a context for dagstermill-managed execution.

        You'll see this function called to reconstruct a pipeline context within the ``injected
        parameters`` cell of a dagstermill output notebook. Users should not call this function
        interactively except when debugging output notebooks.

        Use :func:`dagstermill.get_context` in the ``parameters`` cell of your notebook to define a
        context for interactive exploration and development. This call will be replaced by one to
        :func:`dagstermill.reconstitute_pipeline_context` when the notebook is executed by
        dagstermill.
        """
        check.opt_str_param(output_log_path, "output_log_path")
        check.opt_str_param(marshal_dir, "marshal_dir")
        run_config = check.opt_dict_param(run_config,
                                          "run_config",
                                          key_type=str)
        check.dict_param(pipeline_run_dict, "pipeline_run_dict")
        check.dict_param(executable_dict, "executable_dict")
        check.dict_param(solid_handle_kwargs, "solid_handle_kwargs")
        check.dict_param(instance_ref_dict, "instance_ref_dict")

        pipeline = ReconstructablePipeline.from_dict(executable_dict)
        pipeline_def = pipeline.get_definition()

        try:
            instance_ref = unpack_value(instance_ref_dict)
            instance = DagsterInstance.from_ref(instance_ref)
        except Exception as err:  # pylint: disable=broad-except
            raise DagstermillError(
                "Error when attempting to resolve DagsterInstance from serialized InstanceRef"
            ) from err

        pipeline_run = unpack_value(pipeline_run_dict)

        solid_handle = SolidHandle.from_dict(solid_handle_kwargs)
        solid_def = pipeline_def.get_solid(solid_handle).definition

        self.marshal_dir = marshal_dir
        self.in_pipeline = True
        self.solid_def = solid_def
        self.pipeline = pipeline

        execution_plan = create_execution_plan(
            self.pipeline,
            run_config,
            mode=pipeline_run.mode,
            step_keys_to_execute=pipeline_run.step_keys_to_execute,
        )

        with scoped_pipeline_context(
                execution_plan,
                run_config,
                pipeline_run,
                instance,
                scoped_resources_builder_cm=self._setup_resources,
                # Set this flag even though we're not in test for clearer error reporting
                raise_on_error=True,
        ) as pipeline_context:
            self.context = DagstermillRuntimeExecutionContext(
                pipeline_context=pipeline_context,
                solid_config=run_config.get("solids",
                                            {}).get(solid_def.name,
                                                    {}).get("config"),
                resource_keys_to_init=get_required_resource_keys_to_init(
                    execution_plan,
                    pipeline_context.intermediate_storage_def,
                ),
                solid_name=solid_def.name,
            )

        return self.context
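
As the docstring notes, the interactive counterpart is dagstermill.get_context. A sketch of what a notebook's parameters cell might contain during development (the solid_config value is hypothetical):

import dagstermill

# Hypothetical interactive setup; dagstermill replaces this call with
# reconstitute_pipeline_context when it executes the notebook.
context = dagstermill.get_context(solid_config={'factor': 2})
context.log.info('exploring interactively')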
Exemple #29
def test_emr_pyspark_execution_plan():
    # my_pipeline and emr_preset are defined in the example's module scope.
    os.environ["EMR_CLUSTER_ID"] = "some_cluster_id"
    create_execution_plan(my_pipeline,
                          mode="emr",
                          run_config=emr_preset.run_config)


def test_s3_io_manager_execution(mock_s3_bucket):
    pipeline_def = define_inty_pipeline()

    run_config = {
        "resources": {
            "io_manager": {
                "config": {
                    "s3_bucket": mock_s3_bucket.name
                }
            }
        }
    }

    run_id = make_new_run_id()

    execution_plan = create_execution_plan(pipeline_def, run_config=run_config)

    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(pipeline_name=pipeline_def.name,
                               run_id=run_id,
                               run_config=run_config)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(return_one_step_events, "return_one")

    io_manager = PickledObjectS3IOManager(mock_s3_bucket.name,
                                          construct_s3_client(max_attempts=5),
                                          s3_prefix="dagster")
    step_output_handle = StepOutputHandle("return_one")
    context = InputContext(
        pipeline_name=pipeline_def.name,
        solid_def=pipeline_def.solid_def_named("return_one"),
        config={},
        metadata={},
        upstream_output=OutputContext(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            metadata={},
            mapping_key=None,
            config=None,
            solid_def=pipeline_def.solid_def_named("return_one"),
        ),
    )
    assert io_manager.load_input(context) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"]),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    step_output_handle = StepOutputHandle("add_one")
    context = InputContext(
        pipeline_name=pipeline_def.name,
        solid_def=pipeline_def.solid_def_named("add_one"),
        config={},
        metadata={},
        upstream_output=OutputContext(
            step_key=step_output_handle.step_key,
            name=step_output_handle.output_name,
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            metadata={},
            mapping_key=None,
            config=None,
            solid_def=pipeline_def.solid_def_named("add_one"),
        ),
    )

    assert get_step_output(add_one_step_events, "add_one")
    assert io_manager.load_input(context) == 2
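
The two load-back blocks above differ only in the step key, which invites a small helper. A hypothetical sketch using the same imports as the test, and assuming (as the test does) that each solid's single output is the default 'result':

def load_step_output(io_manager, pipeline_def, step_key, run_id):
    # Hypothetical helper mirroring the InputContext construction above.
    handle = StepOutputHandle(step_key)
    return io_manager.load_input(
        InputContext(
            pipeline_name=pipeline_def.name,
            solid_def=pipeline_def.solid_def_named(step_key),
            config={},
            metadata={},
            upstream_output=OutputContext(
                step_key=handle.step_key,
                name=handle.output_name,
                pipeline_name=pipeline_def.name,
                run_id=run_id,
                metadata={},
                mapping_key=None,
                config=None,
                solid_def=pipeline_def.solid_def_named(step_key),
            ),
        ))


# e.g. the final assertion above becomes:
# assert load_step_output(io_manager, pipeline_def, "add_one", run_id) == 2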