Example #1
def execute_pipeline(pipeline, environment_dict=None, run_config=None):
    '''
    "Synchronous" version of :py:func:`execute_pipeline_iterator`.

    This is the entry point for dagster CLI and dagit execution. For the dagster-graphql entry
    point, see execute_plan() below.

    Parameters:
      pipeline (PipelineDefinition): Pipeline to run
      environment_dict (dict): The environment configuration that parameterizes this run
      run_config (RunConfig): Configuration for how this pipeline will be executed

    Returns:
      :py:class:`PipelineExecutionResult`
    '''

    check.inst_param(pipeline, 'pipeline', PipelineDefinition)
    environment_dict = check.opt_dict_param(environment_dict,
                                            'environment_dict')
    run_config = check_run_config_param(run_config, pipeline)

    execution_plan = create_execution_plan(pipeline, environment_dict,
                                           run_config)

    with scoped_pipeline_context(pipeline, environment_dict,
                                 run_config) as pipeline_context:
        event_list = list(
            _execute_pipeline_iterator(
                pipeline_context,
                execution_plan=execution_plan,
                run_config=run_config,
                step_keys_to_execute=run_config.step_keys_to_execute,
            ))

        return PipelineExecutionResult(
            pipeline,
            run_config.run_id,
            event_list,
            lambda: scoped_pipeline_context(
                pipeline,
                environment_dict,
                run_config,
                system_storage_data=SystemStorageData(
                    run_storage=pipeline_context.run_storage,
                    intermediates_manager=pipeline_context.intermediates_manager,
                    file_manager=pipeline_context.file_manager,
                ),
            ),
        )
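
A minimal usage sketch for this version of the API, assuming a PipelineDefinition named my_pipeline built elsewhere; the solid and config keys under environment_dict are hypothetical placeholders, and only the call shape is taken from the signature above:

# Hypothetical pipeline and config names; only the call shape comes from
# the signature above.
result = execute_pipeline(
    my_pipeline,
    environment_dict={'solids': {'my_solid': {'config': {'path': 'data.csv'}}}},
)
assert result.success  # PipelineExecutionResult exposes a success property
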
Example #2
def execute_pipeline(
    pipeline, environment_dict=None, run_config=None, instance=None, raise_on_error=True
):
    '''
    "Synchronous" version of :py:func:`execute_pipeline_iterator`.

    This is the entry point for dagster CLI and dagit execution. For the dagster-graphql entry
    point, see execute_plan() below.

    Parameters:
        pipeline (PipelineDefinition): Pipeline to run
        environment_dict (dict):
            The environment configuration that parameterizes this run
        run_config (RunConfig):
            Configuration for how this pipeline will be executed
        instance (DagsterInstance):
            The instance to execute against, defaults to ephemeral (no artifacts persisted)
        raise_on_error (bool):
            Whether or not to raise exceptions when they occur. Defaults to ``True``,
            since this behavior is useful in tests, which are the most common use of this API.

    Returns:
      :py:class:`PipelineExecutionResult`
    '''

    check.inst_param(pipeline, 'pipeline', PipelineDefinition)
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict')
    run_config = check_run_config_param(run_config, pipeline)

    check.opt_inst_param(instance, 'instance', DagsterInstance)
    instance = instance or DagsterInstance.ephemeral()

    execution_plan = create_execution_plan(pipeline, environment_dict, run_config)
    step_keys_to_execute = _resolve_step_keys(execution_plan, run_config.step_keys_to_execute)

    # run should be used and threaded through here
    # https://github.com/dagster-io/dagster/issues/1745
    _run = _create_run(instance, pipeline, run_config, environment_dict)

    with scoped_pipeline_context(
        pipeline, environment_dict, run_config, instance, raise_on_error=raise_on_error
    ) as pipeline_context:
        event_list = list(
            _pipeline_execution_iterator(
                pipeline_context,
                execution_plan=execution_plan,
                run_config=run_config,
                step_keys_to_execute=step_keys_to_execute,
            )
        )

        return PipelineExecutionResult(
            pipeline,
            run_config.run_id,
            event_list,
            lambda: scoped_pipeline_context(
                pipeline,
                environment_dict,
                run_config,
                instance,
                system_storage_data=SystemStorageData(
                    intermediates_manager=pipeline_context.intermediates_manager,
                    file_manager=pipeline_context.file_manager,
                ),
            ),
        )
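
This version adds instance and raise_on_error. A sketch (my_pipeline is again a hypothetical PipelineDefinition) of executing against an explicit ephemeral instance without raising, then inspecting the collected event stream:

# Hypothetical usage; DagsterInstance is the same class checked above.
instance = DagsterInstance.ephemeral()
result = execute_pipeline(my_pipeline, instance=instance, raise_on_error=False)
if not result.success:
    for event in result.event_list:  # the events gathered by the iterator
        print(event)
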
Example #3
def execute_pipeline(pipeline,
                     environment_dict=None,
                     run_config=None,
                     instance=None,
                     raise_on_error=True):
    '''Execute a pipeline synchronously.

    Users will typically call this API when testing pipeline execution, or running standalone
    scripts.

    Parameters:
        pipeline (PipelineDefinition): The pipeline to execute.
        environment_dict (Optional[dict]): The environment configuration that parameterizes this run,
            as a dict.
        run_config (Optional[RunConfig]): Optionally specifies additional config options for
            pipeline execution.
        instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,
            an ephemeral instance will be used, and no artifacts will be persisted from the run.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``True``, since this is the most useful behavior in tests.

    Returns:
      :py:class:`PipelineExecutionResult`: The result of pipeline execution.

    For the asynchronous version, see :py:func:`execute_pipeline_iterator`.

    This is the entrypoint for dagster CLI execution. For the dagster-graphql entrypoint, see
    ``dagster.core.execution.api.execute_plan()``.
    '''

    check.inst_param(pipeline, 'pipeline', PipelineDefinition)
    environment_dict = check.opt_dict_param(environment_dict,
                                            'environment_dict')
    run_config = check_run_config_param(run_config, pipeline)

    check.opt_inst_param(instance, 'instance', DagsterInstance)
    instance = instance or DagsterInstance.ephemeral()

    execution_plan = create_execution_plan(pipeline, environment_dict,
                                           run_config)

    pipeline_run = _create_run(instance, pipeline, run_config,
                               environment_dict)

    with scoped_pipeline_context(
            pipeline,
            environment_dict,
            pipeline_run,
            instance,
            raise_on_error=raise_on_error) as pipeline_context:
        event_list = list(
            _pipeline_execution_iterator(pipeline_context, execution_plan,
                                         pipeline_run))

        return PipelineExecutionResult(
            pipeline,
            run_config.run_id,
            event_list,
            lambda: scoped_pipeline_context(
                pipeline,
                environment_dict,
                pipeline_run,
                instance,
                system_storage_data=SystemStorageData(
                    intermediates_manager=pipeline_context.intermediates_manager,
                    file_manager=pipeline_context.file_manager,
                ),
            ),
        )
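
The result object also supports per-solid inspection. A sketch, assuming the hypothetical pipeline has a solid named add_one and that this release exposes result_for_solid and output_value as contemporaneous dagster versions did:

result = execute_pipeline(my_pipeline)
add_one_result = result.result_for_solid('add_one')  # SolidExecutionResult
print(add_one_result.output_value())  # value of the solid's default output
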
Example #4
def execute_run(pipeline, pipeline_run, instance, raise_on_error=False):
    '''Executes an existing pipeline run synchronously.

    Synchronous version of execute_run_iterator.

    Args:
        pipeline (ExecutablePipeline): The pipeline to execute.
        pipeline_run (PipelineRun): The run to execute.
        instance (DagsterInstance): The instance in which the run has been created.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``False``.

    Returns:
        PipelineExecutionResult: The result of the execution.
    '''
    if isinstance(pipeline, PipelineDefinition):
        raise DagsterInvariantViolationError(
            'execute_run requires an ExecutablePipeline but received a PipelineDefinition '
            'directly instead. To support hand-off to other processes provide a '
            'ReconstructablePipeline which can be done using reconstructable(). For in '
            'process only execution you can use InMemoryExecutablePipeline.')

    check.inst_param(pipeline, 'pipeline', ExecutablePipeline)
    check.inst_param(pipeline_run, 'pipeline_run', PipelineRun)
    check.inst_param(instance, 'instance', DagsterInstance)
    check.invariant(pipeline_run.status == PipelineRunStatus.NOT_STARTED)

    pipeline_def = pipeline.get_definition()
    if pipeline_run.solids_to_execute:
        if isinstance(pipeline_def, PipelineSubsetDefinition):
            check.invariant(
                pipeline_run.solids_to_execute == pipeline.solids_to_execute,
                'Cannot execute PipelineRun with solids_to_execute {solids_to_execute} that '
                'conflicts with pipeline subset {pipeline_solids_to_execute}.'.format(
                    pipeline_solids_to_execute=str_format_set(pipeline.solids_to_execute),
                    solids_to_execute=str_format_set(pipeline_run.solids_to_execute),
                ),
            )
        else:
            # when `execute_run` is directly called, the sub pipeline hasn't been created
            # note that when we receive the solids to execute via PipelineRun, it won't support
            # solid selection query syntax
            pipeline = pipeline.subset_for_execution_from_existing_pipeline(
                pipeline_run.solids_to_execute)

    execution_plan = create_execution_plan(
        pipeline,
        environment_dict=pipeline_run.environment_dict,
        mode=pipeline_run.mode,
        step_keys_to_execute=pipeline_run.step_keys_to_execute,
    )

    _execute_run_iterable = _ExecuteRunWithPlanIterable(
        execution_plan=execution_plan,
        pipeline_run=pipeline_run,
        instance=instance,
        iterator=_pipeline_execution_iterator,
        environment_dict=pipeline_run.environment_dict,
        retries=None,
        raise_on_error=raise_on_error,
    )
    event_list = list(_execute_run_iterable)
    pipeline_context = _execute_run_iterable.pipeline_context

    return PipelineExecutionResult(
        pipeline.get_definition(),
        pipeline_run.run_id,
        event_list,
        lambda: scoped_pipeline_context(
            execution_plan,
            pipeline_run.environment_dict,
            pipeline_run,
            instance,
            system_storage_data=SystemStorageData(
                intermediates_manager=pipeline_context.intermediates_manager,
                file_manager=pipeline_context.file_manager,
            ),
        ),
    )
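
Unlike execute_pipeline, this variant takes an ExecutablePipeline plus a pre-created PipelineRun. A sketch under the assumption that the instance's create_run_for_pipeline method accepts a pipeline_def kwarg in this release; define_my_pipeline is a hypothetical module-scope function, which reconstructable() requires:

instance = DagsterInstance.ephemeral()
pipeline = reconstructable(define_my_pipeline)  # hand-off-capable ExecutablePipeline
pipeline_run = instance.create_run_for_pipeline(pipeline_def=pipeline.get_definition())
result = execute_run(pipeline, pipeline_run, instance)
assert result.run_id == pipeline_run.run_id
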
Example #5
def execute_run(pipeline, pipeline_run, instance, raise_on_error=False):
    '''Executes an existing pipeline run synchronously.

    Synchronous version of execute_run_iterator.

    Args:
        pipeline (Union[ExecutablePipeline, PipelineDefinition]): The pipeline to execute.
        pipeline_run (PipelineRun): The run to execute.
        instance (DagsterInstance): The instance in which the run has been created.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``False``.
    
    Returns:
        PipelineExecutionResult: The result of the execution.
    '''
    pipeline, pipeline_def = _check_pipeline(pipeline)

    check.inst_param(pipeline_run, 'pipeline_run', PipelineRun)
    check.inst_param(instance, 'instance', DagsterInstance)
    check.invariant(pipeline_run.status == PipelineRunStatus.NOT_STARTED)

    if pipeline_run.solid_subset:
        pipeline_def = pipeline.get_definition()
        if isinstance(pipeline_def, PipelineSubsetForExecution):
            check.invariant(
                len(pipeline_run.solid_subset) == len(pipeline_def.solid_subset)
                and set(pipeline_run.solid_subset) == set(pipeline_def.solid_subset),
                'Cannot execute PipelineRun with solid_subset {solid_subset} that conflicts with '
                'pipeline subset {pipeline_solid_subset}.'.format(
                    pipeline_solid_subset=str_format_list(pipeline_def.solid_subset),
                    solid_subset=str_format_list(pipeline_run.solid_subset),
                ),
            )
        else:
            pipeline = pipeline.subset_for_execution(pipeline_run.solid_subset)
            pipeline_def = pipeline.get_definition()

    execution_plan = create_execution_plan(
        pipeline,
        environment_dict=pipeline_run.environment_dict,
        mode=pipeline_run.mode,
        step_keys_to_execute=pipeline_run.step_keys_to_execute,
    )

    _execute_run_iterable = _ExecuteRunWithPlanIterable(
        execution_plan=execution_plan,
        pipeline_run=pipeline_run,
        instance=instance,
        iterator=_pipeline_execution_iterator,
        environment_dict=pipeline_run.environment_dict,
        retries=None,
        raise_on_error=raise_on_error,
    )
    event_list = list(_execute_run_iterable)
    pipeline_context = _execute_run_iterable.pipeline_context

    return PipelineExecutionResult(
        pipeline_def,
        pipeline_run.run_id,
        event_list,
        lambda: scoped_pipeline_context(
            execution_plan,
            pipeline_run.environment_dict,
            pipeline_run,
            instance,
            system_storage_data=SystemStorageData(
                intermediates_manager=pipeline_context.intermediates_manager,
                file_manager=pipeline_context.file_manager,
            ),
        ),
    )
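
Per the Union type in the docstring, this variant also accepts a bare PipelineDefinition (normalized by _check_pipeline), so an in-process sketch can skip reconstructable(). The create_run_for_pipeline kwarg is an assumption, as above:

instance = DagsterInstance.ephemeral()
pipeline_run = instance.create_run_for_pipeline(pipeline_def=my_pipeline)  # assumed kwarg
result = execute_run(my_pipeline, pipeline_run, instance, raise_on_error=True)
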
Example #6
def execute_pipeline(
    pipeline,
    environment_dict=None,
    mode=None,
    preset=None,
    tags=None,
    run_config=None,
    instance=None,
    raise_on_error=True,
):
    '''Execute a pipeline synchronously.

    Users will typically call this API when testing pipeline execution, or running standalone
    scripts.

    Parameters:
        pipeline (PipelineDefinition): The pipeline to execute.
        environment_dict (Optional[dict]): The environment configuration that parametrizes this run,
            as a dict.
        mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``
            and ``preset``.
        preset (Optional[str]): The name of the pipeline preset to use. You may not set both
            ``mode`` and ``preset``.
        tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline
            logs.
        run_config (Optional[RunConfig]): Optionally specifies additional config options for
            pipeline execution.

            Deprecation notice: In 0.8.0, the use of `run_config` to set mode, tags, and step keys
            will be deprecated. In the interim, if you set a mode using `run_config`, this must
            match any mode set using `mode` or `preset`. If you set tags using `run_config`, any
            tags set using `tags` will take precedence. If you set step keys, these must be
            compatible with any solid subset specified using `preset`.
        instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,
            an ephemeral instance will be used, and no artifacts will be persisted from the run.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``True``, since this is the most useful behavior in tests.

    Returns:
      :py:class:`PipelineExecutionResult`: The result of pipeline execution.

    For the asynchronous version, see :py:func:`execute_pipeline_iterator`.

    This is the entrypoint for dagster CLI execution. For the dagster-graphql entrypoint, see
    ``dagster.core.execution.api.execute_plan()``.
    '''
    (
        pipeline,
        environment_dict,
        instance,
        mode,
        tags,
        run_config,
        execution_plan,
    ) = _check_execute_pipeline_args(
        'execute_pipeline',
        pipeline=pipeline,
        environment_dict=environment_dict,
        mode=mode,
        preset=preset,
        tags=tags,
        run_config=run_config,
        instance=instance,
    )

    pipeline_run = instance.create_run_for_pipeline(
        pipeline=pipeline,
        run_id=run_config.run_id,
        environment_dict=environment_dict,
        mode=mode,
        selector=pipeline.selector,
        step_keys_to_execute=run_config.step_keys_to_execute,
        tags=tags,
        root_run_id=run_config.previous_run_id,
        parent_run_id=run_config.previous_run_id,
    )

    initialization_manager = pipeline_initialization_manager(
        pipeline,
        environment_dict,
        pipeline_run,
        instance,
        execution_plan,
        raise_on_error=raise_on_error,
    )
    event_list = list(initialization_manager.generate_setup_events())
    pipeline_context = initialization_manager.get_object()
    try:
        if pipeline_context:
            event_list.extend(
                _pipeline_execution_iterator(pipeline_context, execution_plan, pipeline_run)
            )
    finally:
        event_list.extend(initialization_manager.generate_teardown_events())
    return PipelineExecutionResult(
        pipeline,
        pipeline_run.run_id,
        event_list,
        lambda: scoped_pipeline_context(
            pipeline,
            environment_dict,
            pipeline_run,
            instance,
            execution_plan,
            system_storage_data=SystemStorageData(
                intermediates_manager=pipeline_context.intermediates_manager,
                file_manager=pipeline_context.file_manager,
            ),
        ),
    )
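
A sketch of the new mode and tags parameters; 'local' and the tag values are hypothetical, and mode and preset remain mutually exclusive per the docstring:

result = execute_pipeline(
    my_pipeline,            # hypothetical PipelineDefinition with a 'local' mode
    mode='local',           # must name a mode defined on the pipeline
    tags={'team': 'data'},  # merged into the run's tags and log metadata
)
assert result.success
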
Example #7
def execute_run(pipeline, pipeline_run, instance, raise_on_error=False):
    """Executes an existing pipeline run synchronously.

    Synchronous version of execute_run_iterator.

    Args:
        pipeline (IPipeline): The pipeline to execute.
        pipeline_run (PipelineRun): The run to execute.
        instance (DagsterInstance): The instance in which the run has been created.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``False``.

    Returns:
        PipelineExecutionResult: The result of the execution.
    """
    if isinstance(pipeline, PipelineDefinition):
        raise DagsterInvariantViolationError(
            "execute_run requires an IPipeline but received a PipelineDefinition "
            "directly instead. To support hand-off to other processes provide a "
            "ReconstructablePipeline which can be done using reconstructable(). For in "
            "process only execution you can use InMemoryPipeline.")

    check.inst_param(pipeline, "pipeline", IPipeline)
    check.inst_param(pipeline_run, "pipeline_run", PipelineRun)
    check.inst_param(instance, "instance", DagsterInstance)
    check.invariant(
        pipeline_run.status == PipelineRunStatus.NOT_STARTED,
        desc="Pipeline run {} ({}) in state {}, expected PipelineRunStatus.NOT_STARTED".format(
            pipeline_run.pipeline_name, pipeline_run.run_id, pipeline_run.status
        ),
    )
    pipeline_def = pipeline.get_definition()
    if pipeline_run.solids_to_execute:
        if isinstance(pipeline_def, PipelineSubsetDefinition):
            check.invariant(
                pipeline_run.solids_to_execute == pipeline.solids_to_execute,
                "Cannot execute PipelineRun with solids_to_execute {solids_to_execute} that "
                "conflicts with pipeline subset {pipeline_solids_to_execute}.".
                format(
                    pipeline_solids_to_execute=str_format_set(
                        pipeline.solids_to_execute),
                    solids_to_execute=str_format_set(
                        pipeline_run.solids_to_execute),
                ),
            )
        else:
            # when `execute_run` is directly called, the sub pipeline hasn't been created
            # note that when we receive the solids to execute via PipelineRun, it won't support
            # solid selection query syntax
            pipeline = pipeline.subset_for_execution_from_existing_pipeline(
                pipeline_run.solids_to_execute)

    execution_plan = create_execution_plan(
        pipeline,
        run_config=pipeline_run.run_config,
        mode=pipeline_run.mode,
        step_keys_to_execute=pipeline_run.step_keys_to_execute,
    )

    if is_memoized_run(pipeline_run.tags):
        execution_plan = instance.resolve_memoized_execution_plan(
            execution_plan,
            run_config=pipeline_run.run_config,
            mode=pipeline_run.mode)

    _execute_run_iterable = _ExecuteRunWithPlanIterable(
        execution_plan=execution_plan,
        iterator=_pipeline_execution_iterator,
        execution_context_manager=PipelineExecutionContextManager(
            execution_plan=execution_plan,
            pipeline_run=pipeline_run,
            instance=instance,
            run_config=pipeline_run.run_config,
            raise_on_error=raise_on_error,
        ),
    )
    event_list = list(_execute_run_iterable)
    pipeline_context = _execute_run_iterable.pipeline_context

    return PipelineExecutionResult(
        pipeline.get_definition(),
        pipeline_run.run_id,
        event_list,
        lambda: scoped_pipeline_context(
            execution_plan,
            pipeline_run.run_config,
            pipeline_run,
            instance,
            intermediate_storage=pipeline_context.intermediate_storage,
            system_storage_data=SystemStorageData(
                intermediate_storage=pipeline_context.intermediate_storage,
                file_manager=pipeline_context.file_manager,
            ),
        ),
    )
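
By this release the config dict is passed as run_config, and the error message above points to InMemoryPipeline for in-process execution. A closing sketch with hypothetical pipeline and config names (create_run_for_pipeline kwargs assumed as in this era of dagster):

instance = DagsterInstance.ephemeral()
pipeline_run = instance.create_run_for_pipeline(
    pipeline_def=my_pipeline,
    run_config={"solids": {"my_solid": {"config": {"path": "data.csv"}}}},
)
result = execute_run(InMemoryPipeline(my_pipeline), pipeline_run, instance)
assert result.success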