Example #1
def define_dagstermill_solid(
    name,
    notebook_path,
    input_defs=None,
    output_defs=None,
    config=None,
    required_resource_keys=None,
    output_notebook=None,
    config_schema=None,
):
    '''Wrap a Jupyter notebook in a solid.

    Arguments:
        name (str): The name of the solid.
        notebook_path (str): Path to the backing notebook.
        input_defs (Optional[List[InputDefinition]]): The solid's inputs.
        output_defs (Optional[List[OutputDefinition]]): The solid's outputs. Your notebook should
            call :py:func:`~dagstermill.yield_result` to yield each of these outputs.
        required_resource_keys (Optional[Set[str]]): The string names of any required resources.
        output_notebook (Optional[str]): If set, will be used as the name of an injected output of
            type :py:class:`~dagster.FileHandle` that will point to the executed notebook (in
            addition to the :py:class:`~dagster.Materialization` that is always created). This
            respects the :py:class:`~dagster.core.storage.file_manager.FileManager` configured on
            the pipeline system storage, so, e.g., if :py:class:`~dagster_aws.s3.s3_system_storage`
            is configured, the output will be a :py:class:`~dagster_aws.s3.S3FileHandle`.
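        config_schema: The config schema for the solid. ``config`` is the deprecated spelling of
            this argument and is slated for removal in 0.9.0 (see the
            ``canonicalize_backcompat_args`` call below).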

    Returns:
        :py:class:`~dagster.SolidDefinition`
    '''
    check.str_param(name, 'name')
    check.str_param(notebook_path, 'notebook_path')
    input_defs = check.opt_list_param(input_defs,
                                      'input_defs',
                                      of_type=InputDefinition)
    output_defs = check.opt_list_param(output_defs,
                                       'output_defs',
                                       of_type=OutputDefinition)
    required_resource_keys = check.opt_set_param(required_resource_keys,
                                                 'required_resource_keys',
                                                 of_type=str)

    return SolidDefinition(
        name=name,
        input_defs=input_defs,
        compute_fn=_dm_solid_compute(name, notebook_path, output_notebook),
        output_defs=output_defs +
        ([OutputDefinition(dagster_type=FileHandle, name=output_notebook)]
         if output_notebook else []),
        config_schema=canonicalize_backcompat_args(
            check_user_facing_opt_config_param(config_schema, 'config_schema'),
            'config_schema',
            check_user_facing_opt_config_param(config, 'config'),
            'config',
            '0.9.0',
        ),
        required_resource_keys=required_resource_keys,
        description='This solid is backed by the notebook at {path}'.format(
            path=notebook_path),
        tags={
            'notebook_path': notebook_path,
            'kind': 'ipynb'
        },
    )
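
A minimal usage sketch of the factory above; the notebook path and names here are hypothetical, and the notebook is assumed to call dagstermill.yield_result for each declared output:

import dagstermill as dm
from dagster import OutputDefinition

clean_data = dm.define_dagstermill_solid(
    name='clean_data',
    notebook_path='notebooks/clean_data.ipynb',  # hypothetical path
    output_defs=[OutputDefinition(name='result')],  # yielded via dm.yield_result in the notebook
    output_notebook='cleaned_notebook',  # extra FileHandle output pointing at the executed notebook
)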
Example #2
    def __new__(
        cls,
        pipeline_name=None,
        run_id=None,
        run_config=None,
        mode=None,
        solid_selection=None,
        solids_to_execute=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
        pipeline_snapshot_id=None,
        execution_plan_snapshot_id=None,
        ## GRAVEYARD BELOW
        # see https://github.com/dagster-io/dagster/issues/2372 for explanation
        previous_run_id=None,
        selector=None,
        solid_subset=None,
        environment_dict=None,
    ):
        # a frozenset which contains the names of the solids to execute
        check.opt_set_param(solids_to_execute,
                            'solids_to_execute',
                            of_type=str)
        # a list of solid queries provided by the user;
        # may be None when only solids_to_execute is set directly by the user
        check.opt_list_param(solid_selection, 'solid_selection', of_type=str)

        check.opt_list_param(step_keys_to_execute,
                             'step_keys_to_execute',
                             of_type=str)

        check.opt_str_param(root_run_id, 'root_run_id')
        check.opt_str_param(parent_run_id, 'parent_run_id')

        check.invariant(
            (root_run_id is not None and parent_run_id is not None)
            or (root_run_id is None and parent_run_id is None),
            ('Must set both root_run_id and parent_run_id when creating a PipelineRun that '
             'belongs to a run group'),
        )

        # Compatibility
        # ----------------------------------------------------------------------------------------
        check.invariant(
            not (run_config is not None and environment_dict is not None),
            'Cannot set both run_config and environment_dict. Use run_config parameter.',
        )
        run_config = run_config or environment_dict
        # Historical runs may have previous_run_id set, in which case
        # that previous ID becomes both the root and the parent
        if previous_run_id:
            if not (parent_run_id and root_run_id):
                parent_run_id = previous_run_id
                root_run_id = previous_run_id

        check.opt_inst_param(selector, 'selector', ExecutionSelector)
        if selector:
            check.invariant(
                pipeline_name is None or selector.name == pipeline_name,
                ('Conflicting pipeline name {pipeline_name} in arguments to PipelineRun: '
                 'selector was passed with pipeline {selector_pipeline}'.
                 format(pipeline_name=pipeline_name,
                        selector_pipeline=selector.name)),
            )
            if pipeline_name is None:
                pipeline_name = selector.name

            check.invariant(
                solids_to_execute is None
                or set(selector.solid_subset) == solids_to_execute,
                ('Conflicting solids_to_execute {solids_to_execute} in arguments to PipelineRun: '
                 'selector was passed with subset {selector_subset}'.format(
                     solids_to_execute=solids_to_execute,
                     selector_subset=selector.solid_subset)),
            )
            # for old runs that only have selector but no solids_to_execute
            if solids_to_execute is None:
                solids_to_execute = (frozenset(selector.solid_subset)
                                     if selector.solid_subset else None)

        # for old runs that specified list-type solid_subset
        check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
        if solid_subset:
            solids_to_execute = frozenset(solid_subset)
        # ----------------------------------------------------------------------------------------

        return super(PipelineRun, cls).__new__(
            cls,
            pipeline_name=check.opt_str_param(pipeline_name, 'pipeline_name'),
            run_id=check.opt_str_param(run_id,
                                       'run_id',
                                       default=make_new_run_id()),
            run_config=check.opt_dict_param(run_config,
                                            'run_config',
                                            key_type=str),
            mode=check.opt_str_param(mode, 'mode'),
            solid_selection=solid_selection,
            solids_to_execute=solids_to_execute,
            step_keys_to_execute=step_keys_to_execute,
            status=check.opt_inst_param(status, 'status', PipelineRunStatus,
                                        PipelineRunStatus.NOT_STARTED),
            tags=check.opt_dict_param(tags, 'tags', key_type=str),
            root_run_id=root_run_id,
            parent_run_id=parent_run_id,
            pipeline_snapshot_id=check.opt_str_param(pipeline_snapshot_id,
                                                     'pipeline_snapshot_id'),
            execution_plan_snapshot_id=check.opt_str_param(
                execution_plan_snapshot_id, 'execution_plan_snapshot_id'),
        )

    def execute(pipeline_context, execution_plan, step_keys_to_execute=None):
        check.inst_param(pipeline_context, 'pipeline_context',
                         SystemPipelineExecutionContext)
        check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
        check.opt_list_param(step_keys_to_execute,
                             'step_keys_to_execute',
                             of_type=str)

        step_key_set = None if step_keys_to_execute is None else set(
            step_keys_to_execute)

        yield DagsterEvent.engine_event(
            pipeline_context,
            'Executing steps in process (pid: {pid})'.format(pid=os.getpid()),
            event_specific_data=EngineEventData.in_process(
                os.getpid(), step_key_set),
        )

        with time_execution_scope() as timer_result:
            check.param_invariant(
                isinstance(pipeline_context.executor_config, ExecutorConfig),
                'pipeline_context',
                'Expected executor_config to be ExecutorConfig got {}'.format(
                    pipeline_context.executor_config),
            )

            failed_or_skipped_steps = set()

            step_levels = execution_plan.topological_step_levels()

            # It would be good to implement a reference tracking algorithm here to
            # garbage collect results that are no longer needed by any steps
            # https://github.com/dagster-io/dagster/issues/811
            for step_level in step_levels:
                for step in step_level:
                    if step_key_set and step.key not in step_key_set:
                        continue

                    step_context = pipeline_context.for_step(step)

                    failed_inputs = []
                    for step_input in step.step_inputs:
                        failed_inputs.extend(
                            failed_or_skipped_steps.intersection(
                                step_input.dependency_keys))

                    if failed_inputs:
                        step_context.log.info((
                            'Dependencies for step {step} failed: {failed_inputs}. Not executing.'
                        ).format(step=step.key, failed_inputs=failed_inputs))
                        failed_or_skipped_steps.add(step.key)
                        yield DagsterEvent.step_skipped_event(step_context)
                        continue

                    uncovered_inputs = pipeline_context.intermediates_manager.uncovered_inputs(
                        step_context, step)
                    if uncovered_inputs:
                        # In partial pipeline execution, we may end up here without having
                        # validated that the missing dependent outputs were optional
                        _assert_missing_inputs_optional(
                            uncovered_inputs, execution_plan, step.key)

                        step_context.log.info((
                            'Not all inputs covered for {step}. Not executing. Output missing for '
                            'inputs: {uncovered_inputs}').format(
                                uncovered_inputs=uncovered_inputs,
                                step=step.key))
                        failed_or_skipped_steps.add(step.key)
                        yield DagsterEvent.step_skipped_event(step_context)
                        continue

                    for step_event in check.generator(
                            dagster_event_sequence_for_step(step_context)):
                        check.inst(step_event, DagsterEvent)
                        if step_event.is_step_failure:
                            failed_or_skipped_steps.add(step.key)

                        yield step_event

        yield DagsterEvent.engine_event(
            pipeline_context,
            'Finished steps in process (pid: {pid}) in {duration_ms}'.format(
                pid=os.getpid(),
                duration_ms=format_duration(timer_result.millis)),
            event_specific_data=EngineEventData.in_process(
                os.getpid(), step_key_set),
        )
Example #3
    def __init__(
        self,
        solid_defs: Optional[List[NodeDefinition]] = None,
        name: Optional[str] = None,
        description: Optional[str] = None,
        dependencies: Optional[Dict[Union[str, NodeInvocation],
                                    Dict[str, IDependencyDefinition]]] = None,
        mode_defs: Optional[List[ModeDefinition]] = None,
        preset_defs: Optional[List[PresetDefinition]] = None,
        tags: Optional[Dict[str, Any]] = None,
        hook_defs: Optional[AbstractSet[HookDefinition]] = None,
        solid_retry_policy: Optional[RetryPolicy] = None,
        graph_def=None,
        _parent_pipeline_def=None,  # https://github.com/dagster-io/dagster/issues/2115
        version_strategy: Optional[VersionStrategy] = None,
    ):
        # If a graph is specified directly, use it
        if check.opt_inst_param(graph_def, "graph_def", GraphDefinition):
            self._graph_def = graph_def
            self._name = name or graph_def.name

        # Otherwise fallback to legacy construction
        else:
            if name is None:
                check.failed("name must be set provided")
            self._name = name

            if solid_defs is None:
                check.failed("solid_defs must be provided")

            self._graph_def = GraphDefinition(
                name=name,
                dependencies=dependencies,
                node_defs=solid_defs,
                input_mappings=None,
                output_mappings=None,
                config=None,
                description=None,
            )

        # tags and description can exist on the graph as well, but since the same
        # graph may be used in multiple pipelines/jobs, keep a separate layer here
        self._description = check.opt_str_param(description, "description")
        self._tags = validate_tags(tags)

        self._current_level_node_defs = self._graph_def.node_defs

        mode_definitions = check.opt_list_param(mode_defs,
                                                "mode_defs",
                                                of_type=ModeDefinition)

        if not mode_definitions:
            mode_definitions = [ModeDefinition()]

        self._mode_definitions = mode_definitions

        seen_modes = set()
        for mode_def in mode_definitions:
            if mode_def.name in seen_modes:
                raise DagsterInvalidDefinitionError((
                    'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                    "Modes must have unique names.").format(
                        mode_name=mode_def.name, pipeline_name=self.name))
            seen_modes.add(mode_def.name)

        self._hook_defs = check.opt_set_param(hook_defs,
                                              "hook_defs",
                                              of_type=HookDefinition)
        self._solid_retry_policy = check.opt_inst_param(
            solid_retry_policy, "solid_retry_policy", RetryPolicy)

        self._preset_defs = check.opt_list_param(preset_defs, "preset_defs",
                                                 PresetDefinition)
        self._preset_dict: Dict[str, PresetDefinition] = {}
        for preset in self._preset_defs:
            if preset.name in self._preset_dict:
                raise DagsterInvalidDefinitionError((
                    'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                    "PresetDefinitions must have unique names.").format(
                        name=preset.name, pipeline_name=self.name))
            if preset.mode not in seen_modes:
                raise DagsterInvalidDefinitionError(
                    ('PresetDefinition "{name}" in "{pipeline_name}" '
                     'references mode "{mode}" which is not defined.').format(
                         name=preset.name,
                         pipeline_name=self.name,
                         mode=preset.mode))
            self._preset_dict[preset.name] = preset

        self._resource_requirements = {
            mode_def.name: _checked_resource_reqs_for_mode(
                mode_def,
                self._current_level_node_defs,
                self._graph_def._dagster_type_dict,
                self._graph_def._node_dict,
                self._hook_defs,
                self._graph_def._dependency_structure,
            )
            for mode_def in self._mode_definitions
        }

        # Recursively explore all nodes in this pipeline
        self._all_node_defs = _build_all_node_defs(
            self._current_level_node_defs)
        self._parent_pipeline_def = check.opt_inst_param(
            _parent_pipeline_def, "_parent_pipeline_def", PipelineDefinition)
        self._cached_run_config_schemas: Dict[str, "RunConfigSchema"] = {}
        self._cached_external_pipeline = None

        self.version_strategy = check.opt_inst_param(version_strategy,
                                                     "version_strategy",
                                                     VersionStrategy)

        if self.version_strategy is not None:
            experimental_class_warning("VersionStrategy")
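
A hedged construction sketch based on the signature above (the solid, mode, and pipeline names are hypothetical); most users reach this constructor through the @pipeline decorator rather than calling it directly:

from dagster import ModeDefinition, PipelineDefinition, solid

@solid
def my_solid(_context):
    return 1

pipeline_def = PipelineDefinition(
    solid_defs=[my_solid],
    name="my_pipeline",  # required on this path, since no graph_def is passed
    mode_defs=[ModeDefinition(name="default")],
)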
Example #4
def define_python_dagster_type(
    python_type,
    name=None,
    description=None,
    input_hydration_config=None,
    output_materialization_config=None,
    serialization_strategy=None,
    auto_plugins=None,
    type_check=None,
):
    '''Core machinery for defining a Dagster type corresponding to an existing python type.

    Users should generally use the :py:func:`@dagster_type` decorator or :py:func:`as_dagster_type`,
    both of which defer to this function.

    Args:
        python_type (cls): The python type to wrap as a Dagster type.
        name (Optional[str]): Name of the new Dagster type. If ``None``, the name (``__name__``) of
            the ``python_type`` will be used.
        description (Optional[str]): A user-readable description of the type.
        input_hydration_config (Optional[InputHydrationConfig]): An instance of a class constructed
            using the :py:func:`@input_hydration_config <dagster.InputHydrationConfig>` decorator
            that can map config data to a value of this type.
        output_materialization_config (Optional[OutputMaterializationConfig]): An instance of a class
            constructed using the
            :py:func:`@output_materialization_config <dagster.output_materialization_config>`
            decorator that can persist values of this type.
        serialization_strategy (Optional[SerializationStrategy]): An instance of a class that
            inherits from :py:class:`SerializationStrategy`. The default strategy for serializing
            this value when automatically persisting it between execution steps. You should set
            this value if the ordinary serialization machinery (e.g., pickle) will not be adequate
            for this type.
        auto_plugins (Optional[List[TypeStoragePlugin]]): If types must be serialized differently
            depending on the storage being used for intermediates, they should specify this
            argument. In these cases the serialization_strategy argument is not sufficient because
            serialization requires specialized API calls, e.g. to call an S3 API directly instead
            of using a generic file object. See ``dagster_pyspark.DataFrame`` for an example.
        type_check (Optional[Callable[[Any], Union[bool, TypeCheck]]]): If specified, this function
            will be called in place of the default isinstance type check. This function should
            return ``True`` if the type check succeeds, ``False`` if it fails, or, if additional
            metadata should be emitted along with the type check success or failure, an instance of
            :py:class:`TypeCheck` with the ``success`` field set appropriately.
    '''

    check.type_param(python_type, 'python_type')
    check.opt_str_param(name, 'name', python_type.__name__)
    check.opt_str_param(description, 'description')
    check.opt_inst_param(input_hydration_config, 'input_hydration_config',
                         InputHydrationConfig)
    check.opt_inst_param(output_materialization_config,
                         'output_materialization_config',
                         OutputMaterializationConfig)
    check.opt_inst_param(
        serialization_strategy,
        'serialization_strategy',
        SerializationStrategy,
        default=PickleSerializationStrategy(),
    )

    auto_plugins = check.opt_list_param(auto_plugins,
                                        'auto_plugins',
                                        of_type=type)
    check.param_invariant(
        all(
            issubclass(auto_plugin_type, TypeStoragePlugin)
            for auto_plugin_type in auto_plugins),
        'auto_plugins',
    )

    check.opt_callable_param(type_check, 'type_check')

    class _ObjectType(PythonObjectType):
        def __init__(self):
            super(_ObjectType, self).__init__(
                python_type=python_type,
                name=name,
                description=description,
                input_hydration_config=input_hydration_config,
                output_materialization_config=output_materialization_config,
                serialization_strategy=serialization_strategy,
                auto_plugins=auto_plugins,
                type_check=type_check,
            )

    return _ObjectType
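
A usage sketch under the contract documented above (the wrapped class and names are hypothetical); note that the type_check callable replaces the default isinstance check:

class EvenNumber:
    def __init__(self, value):
        self.value = value

EvenNumberType = define_python_dagster_type(
    python_type=EvenNumber,
    name='EvenNumber',
    type_check=lambda value: isinstance(value, EvenNumber) and value.value % 2 == 0,
)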
Example #5
def reexecute_pipeline(
    pipeline: Union[IPipeline, PipelineDefinition],
    parent_run_id: str,
    run_config: Optional[dict] = None,
    step_selection: Optional[List[str]] = None,
    mode: Optional[str] = None,
    preset: Optional[str] = None,
    tags: Optional[Dict[str, Any]] = None,
    instance: DagsterInstance = None,
    raise_on_error: bool = True,
) -> PipelineExecutionResult:
    """Reexecute an existing pipeline run.

    Users will typically call this API when testing pipeline reexecution, or running standalone
    scripts.

    Parameters:
        pipeline (Union[IPipeline, PipelineDefinition]): The pipeline to execute.
        parent_run_id (str): The id of the previous run to reexecute. The run must exist in the
            instance.
        run_config (Optional[dict]): The environment configuration that parametrizes this run,
            as a dict.
        step_selection (Optional[List[str]]): A list of step selection queries (including single
            step keys) to execute. For example:
            - ['some_solid']: select the execution step "some_solid" itself.
            - ['*some_solid']: select the step "some_solid" and all its ancestors
                (upstream dependencies).
            - ['*some_solid+++']: select the step "some_solid", all its ancestors,
                and its descendants (downstream dependencies) within 3 levels down.
            - ['*some_solid', 'other_solid_a', 'other_solid_b+']: select
                "some_solid" and all its ancestors, "other_solid_a" itself, and
                "other_solid_b" and its direct child execution steps.
        mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``
            and ``preset``.
        preset (Optional[str]): The name of the pipeline preset to use. You may not set both
            ``mode`` and ``preset``.
        tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline
            logs.
        instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,
            an ephemeral instance will be used, and no artifacts will be persisted from the run.
        raise_on_error (Optional[bool]): Whether or not to raise exceptions when they occur.
            Defaults to ``True``, since this is the most useful behavior in tests.

    Returns:
      :py:class:`PipelineExecutionResult`: The result of pipeline execution.

    For the asynchronous version, see :py:func:`reexecute_pipeline_iterator`.
    """

    check.opt_list_param(step_selection, "step_selection", of_type=str)

    check.str_param(parent_run_id, "parent_run_id")

    with ephemeral_instance_if_missing(instance) as execute_instance:
        (pipeline, run_config, mode, tags, _,
         _) = _check_execute_pipeline_args(
             pipeline=pipeline,
             run_config=run_config,
             mode=mode,
             preset=preset,
             tags=tags,
         )

        parent_pipeline_run = execute_instance.get_run_by_id(parent_run_id)
        check.invariant(
            parent_pipeline_run,
            "No parent run with id {parent_run_id} found in instance.".format(
                parent_run_id=parent_run_id),
        )

        # resolve step selection DSL queries using parent execution plan snapshot
        if step_selection:
            full_plan = create_execution_plan(pipeline,
                                              parent_pipeline_run.run_config,
                                              mode)
            step_keys = parse_items_from_selection(step_selection)
            # resolve execution plan with any resolved dynamic step keys
            resolved_plan = full_plan.build_subset_plan(step_keys)
            # parse selection using all step deps
            step_keys_to_execute = parse_step_selection(
                resolved_plan.get_all_step_deps(), step_selection)
        else:
            step_keys_to_execute = None

        pipeline_run = execute_instance.create_run_for_pipeline(
            pipeline_def=pipeline.get_definition(),
            run_config=run_config,
            mode=mode,
            tags=tags,
            solid_selection=parent_pipeline_run.solid_selection,
            solids_to_execute=parent_pipeline_run.solids_to_execute,
            # convert to frozenset https://github.com/dagster-io/dagster/issues/2914
            step_keys_to_execute=list(step_keys_to_execute)
            if step_keys_to_execute else None,
            root_run_id=parent_pipeline_run.root_run_id
            or parent_pipeline_run.run_id,
            parent_run_id=parent_pipeline_run.run_id,
        )

        return execute_run(pipeline,
                           pipeline_run,
                           execute_instance,
                           raise_on_error=raise_on_error)
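
A hedged, test-style sketch of calling this API (the pipeline and step key are hypothetical); the parent run must live in the same instance that is passed to reexecute_pipeline:

from dagster import DagsterInstance, execute_pipeline

instance = DagsterInstance.ephemeral()
result = execute_pipeline(my_pipeline, instance=instance)

reexecution = reexecute_pipeline(
    my_pipeline,
    parent_run_id=result.run_id,
    step_selection=["some_solid"],  # hypothetical step key
    instance=instance,
)
assert reexecution.success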
Example #6
def _check_execute_pipeline_args(
    pipeline: Union[PipelineDefinition, IPipeline],
    run_config: Optional[dict],
    mode: Optional[str],
    preset: Optional[str],
    tags: Optional[Dict[str, Any]],
    solid_selection: Optional[List[str]] = None,
) -> Tuple[IPipeline, Optional[dict], Optional[str], Dict[str, Any],
           FrozenSet[str], Optional[List[str]], ]:
    pipeline = _check_pipeline(pipeline)
    pipeline_def = pipeline.get_definition()
    check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)

    run_config = check.opt_dict_param(run_config, "run_config")
    check.opt_str_param(mode, "mode")
    check.opt_str_param(preset, "preset")
    check.invariant(
        not (mode is not None and preset is not None),
        "You may set only one of `mode` (got {mode}) or `preset` (got {preset})."
        .format(mode=mode, preset=preset),
    )

    tags = check.opt_dict_param(tags, "tags", key_type=str)
    check.opt_list_param(solid_selection, "solid_selection", of_type=str)

    if preset is not None:
        pipeline_preset = pipeline_def.get_preset(preset)

        if pipeline_preset.run_config is not None:
            check.invariant(
                (not run_config) or (pipeline_preset.run_config == run_config),
                "The environment set in preset '{preset}' does not agree with the environment "
                "passed in the `run_config` argument.".format(preset=preset),
            )

            run_config = pipeline_preset.run_config

        # load solid_selection from preset
        if pipeline_preset.solid_selection is not None:
            check.invariant(
                solid_selection is None
                or solid_selection == pipeline_preset.solid_selection,
                "The solid_selection set in preset '{preset}', {preset_subset}, does not agree with "
                "the `solid_selection` argument: {solid_selection}".format(
                    preset=preset,
                    preset_subset=pipeline_preset.solid_selection,
                    solid_selection=solid_selection,
                ),
            )
            solid_selection = pipeline_preset.solid_selection

        check.invariant(
            mode is None or mode == pipeline_preset.mode,
            "Mode {mode} does not agree with the mode set in preset '{preset}': "
            "('{preset_mode}')".format(preset=preset,
                                       preset_mode=pipeline_preset.mode,
                                       mode=mode),
        )

        mode = pipeline_preset.mode

        tags = merge_dicts(pipeline_preset.tags, tags)

    if mode is not None:
        if not pipeline_def.has_mode_definition(mode):
            raise DagsterInvariantViolationError((
                "You have attempted to execute pipeline {name} with mode {mode}. "
                "Available modes: {modes}").format(
                    name=pipeline_def.name,
                    mode=mode,
                    modes=pipeline_def.available_modes,
                ))
    else:
        if pipeline_def.is_multi_mode:
            raise DagsterInvariantViolationError((
                "Pipeline {name} has multiple modes (Available modes: {modes}) and you have "
                "attempted to execute it without specifying a mode. Set "
                "mode property on the PipelineRun object.").format(
                    name=pipeline_def.name,
                    modes=pipeline_def.available_modes))
        mode = pipeline_def.get_default_mode_name()

    tags = merge_dicts(pipeline_def.tags, tags)

    # generate pipeline subset from the given solid_selection
    if solid_selection:
        pipeline = pipeline.subset_for_execution(solid_selection)

    return (
        pipeline,
        run_config,
        mode,
        tags,
        pipeline.solids_to_execute,
        solid_selection,
    )
Example #7
def get_step_input_source(plan_builder, solid, input_name, input_def,
                          dependency_structure, handle, parent_step_inputs):
    check.inst_param(plan_builder, "plan_builder", _PlanBuilder)
    check.inst_param(solid, "solid", Solid)
    check.str_param(input_name, "input_name")
    check.inst_param(input_def, "input_def", InputDefinition)
    check.inst_param(dependency_structure, "dependency_structure",
                     DependencyStructure)
    check.opt_inst_param(handle, "handle", SolidHandle)
    check.opt_list_param(parent_step_inputs,
                         "parent_step_inputs",
                         of_type=StepInput)

    input_handle = solid.input_handle(input_name)
    solid_config = plan_builder.environment_config.solids.get(str(handle))
    input_config = solid_config.inputs.get(
        input_name) if solid_config else None

    input_def = solid.definition.input_def_named(input_name)
    if input_def.root_manager_key and not dependency_structure.has_deps(
            input_handle):
        return FromRootInputManager(input_def=input_def,
                                    config_data=input_config)

    if dependency_structure.has_singular_dep(input_handle):
        solid_output_handle = dependency_structure.get_singular_dep(
            input_handle)
        step_output_handle = plan_builder.get_output_handle(
            solid_output_handle)
        if isinstance(step_output_handle, UnresolvedStepOutputHandle):
            return FromUnresolvedStepOutput(
                unresolved_step_output_handle=step_output_handle,
                input_def=input_def,
                config_data=input_config,
            )

        if solid_output_handle.output_def.is_dynamic:
            return FromPendingDynamicStepOutput(
                step_output_handle=step_output_handle,
                input_def=input_def,
                config_data=input_config,
            )

        return FromStepOutput(
            step_output_handle=step_output_handle,
            input_def=input_def,
            config_data=input_config,
            fan_in=False,
        )

    if dependency_structure.has_multi_deps(input_handle):
        sources = []
        for idx, handle_or_placeholder in enumerate(
                dependency_structure.get_multi_deps(input_handle)):
            if handle_or_placeholder is MappedInputPlaceholder:
                parent_name = solid.container_mapped_fan_in_input(
                    input_name, idx).definition.name
                parent_inputs = {
                    step_input.name: step_input
                    for step_input in parent_step_inputs
                }
                parent_input = parent_inputs[parent_name]
                sources.append(parent_input.source)
            else:
                sources.append(
                    FromStepOutput(
                        step_output_handle=plan_builder.get_output_handle(
                            handle_or_placeholder),
                        input_def=input_def,
                        config_data=input_config,
                        fan_in=True,
                    ))

        return FromMultipleSources(sources)

    if solid_config and input_name in solid_config.inputs:
        return FromConfig(
            solid_config.inputs[input_name],
            dagster_type=input_def.dagster_type,
            input_name=input_name,
        )

    if solid.container_maps_input(input_name):
        parent_name = solid.container_mapped_input(input_name).definition.name
        parent_inputs = {
            step_input.name: step_input
            for step_input in parent_step_inputs
        }
        if parent_name in parent_inputs:
            parent_input = parent_inputs[parent_name]
            return parent_input.source
        # else fall through to Nothing case or raise

    if solid.definition.input_has_default(input_name):
        return FromDefaultValue(
            solid.definition.default_value_for_input(input_name))

    # At this point we have an input that is not hooked up to
    # the output of another solid or provided via environment config.

    # We will allow this for "Nothing" type inputs and continue.
    if input_def.dagster_type.kind == DagsterTypeKind.NOTHING:
        return None

    # Otherwise we throw an error.
    raise DagsterInvariantViolationError(
        ("In pipeline {pipeline_name} solid {solid_name}, input {input_name} "
         "must get a value either (a) from a dependency or (b) from the "
         "inputs section of its configuration.").format(
             pipeline_name=plan_builder.pipeline_name,
             solid_name=solid.name,
             input_name=input_name))
Example #8
    def __new__(
        cls,
        pipeline_name=None,
        run_id=None,
        environment_dict=None,
        mode=None,
        solid_subset=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
        pipeline_snapshot_id=None,
        execution_plan_snapshot_id=None,
        ## GRAVEYARD BELOW
        # see https://github.com/dagster-io/dagster/issues/2372 for explanation
        previous_run_id=None,
        selector=None,
    ):
        from dagster.core.definitions.pipeline import ExecutionSelector

        check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
        check.opt_list_param(step_keys_to_execute,
                             'step_keys_to_execute',
                             of_type=str)

        check.opt_str_param(root_run_id, 'root_run_id')
        check.opt_str_param(parent_run_id, 'parent_run_id')

        check.invariant(
            (root_run_id is not None and parent_run_id is not None)
            or (root_run_id is None and parent_run_id is None),
            ('Must set both root_run_id and parent_run_id when creating a PipelineRun that '
             'belongs to a run group'),
        )

        # Compatibility
        # ----------------------------------------------------------------------------------------
        # Historical runs may have previous_run_id set, in which case
        # that previous ID becomes both the root and the parent
        if previous_run_id:
            if not (parent_run_id and root_run_id):
                parent_run_id = previous_run_id
                root_run_id = previous_run_id

        check.opt_inst_param(selector, 'selector', ExecutionSelector)
        if selector:
            check.invariant(
                pipeline_name is None or selector.name == pipeline_name,
                ('Conflicting pipeline name {pipeline_name} in arguments to PipelineRun: '
                 'selector was passed with pipeline {selector_pipeline}'.
                 format(pipeline_name=pipeline_name,
                        selector_pipeline=selector.name)),
            )
            if pipeline_name is None:
                pipeline_name = selector.name

            check.invariant(
                solid_subset is None or selector.solid_subset == solid_subset,
                ('Conflicting solid_subset {solid_subset} in arguments to PipelineRun: '
                 'selector was passed with subset {selector_subset}'.format(
                     solid_subset=solid_subset,
                     selector_subset=selector.solid_subset)),
            )
            if solid_subset is None:
                solid_subset = selector.solid_subset
        # ----------------------------------------------------------------------------------------

        return super(PipelineRun, cls).__new__(
            cls,
            pipeline_name=check.opt_str_param(pipeline_name, 'pipeline_name'),
            run_id=check.opt_str_param(run_id,
                                       'run_id',
                                       default=make_new_run_id()),
            environment_dict=check.opt_dict_param(environment_dict,
                                                  'environment_dict',
                                                  key_type=str),
            mode=check.opt_str_param(mode, 'mode'),
            solid_subset=solid_subset,
            step_keys_to_execute=step_keys_to_execute,
            status=check.opt_inst_param(status, 'status', PipelineRunStatus,
                                        PipelineRunStatus.NOT_STARTED),
            tags=check.opt_dict_param(tags, 'tags', key_type=str),
            root_run_id=root_run_id,
            parent_run_id=parent_run_id,
            pipeline_snapshot_id=check.opt_str_param(pipeline_snapshot_id,
                                                     'pipeline_snapshot_id'),
            execution_plan_snapshot_id=check.opt_str_param(
                execution_plan_snapshot_id, 'execution_plan_snapshot_id'),
        )
Example #9
    def from_pkg_resources(name,
                           pkg_resource_defs=None,
                           solid_selection=None,
                           mode=None,
                           tags=None):
        '''Load a preset from a package resource, using :py:func:`pkg_resources.resource_string`.

        Example:

        .. code-block:: python

            PresetDefinition.from_pkg_resources(
                name='local',
                mode='local',
                pkg_resource_defs=[
                    ('dagster_examples.airline_demo.environments', 'local_base.yaml'),
                    ('dagster_examples.airline_demo.environments', 'local_warehouse.yaml'),
                ],
            )


        Args:
            name (str): The name of this preset. Must be unique in the presets defined on a given
                pipeline.
            pkg_resource_defs (Optional[List[(str, str)]]): List of pkg_resource modules/files to
                load as environment config for this preset.
            solid_selection (Optional[List[str]]): A list of solid subselection (including single
                solid names) to execute with this partition. e.g.
                ``['*some_solid+', 'other_solid']``
            mode (Optional[str]): The mode to apply when executing this preset. (default:
                'default')
            tags (Optional[Dict[str, Any]]): The tags to apply when executing this preset.

        Returns:
            PresetDefinition: A PresetDefinition constructed from the provided YAML strings.

        Raises:
            DagsterInvariantViolationError: When one of the YAML documents is invalid and has a
                parse error.
        '''
        pkg_resource_defs = check.opt_list_param(pkg_resource_defs,
                                                 'pkg_resource_defs',
                                                 of_type=tuple)

        try:
            yaml_strings = [
                six.ensure_str(
                    pkg_resources.resource_string(*pkg_resource_def))
                for pkg_resource_def in pkg_resource_defs
            ]
        except (ModuleNotFoundError, FileNotFoundError,
                UnicodeDecodeError) as err:
            six.raise_from(
                DagsterInvariantViolationError(
                    'Encountered error attempting to parse yaml. Loading YAMLs from '
                    'package resources {pkg_resource_defs} '
                    'on preset "{name}".'.format(
                        pkg_resource_defs=pkg_resource_defs, name=name)),
                err,
            )

        return PresetDefinition.from_yaml_strings(name, yaml_strings,
                                                  solid_selection, mode, tags)
Example #10
    def from_files(name,
                   environment_files=None,
                   config_files=None,
                   solid_selection=None,
                   mode=None,
                   tags=None):
        '''Static constructor for presets from YAML files.

        Args:
            name (str): The name of this preset. Must be unique in the presets defined on a given
                pipeline.
            config_files (Optional[List[str]]): List of paths or glob patterns for yaml files
                to load and parse as the environment config for this preset.
            solid_selection (Optional[List[str]]): A list of solid subselection (including single
                solid names) to execute with the preset. e.g. ``['*some_solid+', 'other_solid']``
            mode (Optional[str]): The mode to apply when executing this preset. (default:
                'default')
            tags (Optional[Dict[str, Any]]): The tags to apply when executing this preset.

        Returns:
            PresetDefinition: A PresetDefinition constructed from the provided YAML files.

        Raises:
            DagsterInvariantViolationError: When one of the YAML files is invalid and has a parse
                error.
        '''
        check.str_param(name, 'name')
        config_files = canonicalize_backcompat_args(config_files,
                                                    'config_files',
                                                    environment_files,
                                                    'environment_files',
                                                    '0.9.0')
        config_files = check.opt_list_param(config_files, 'config_files')
        solid_selection = check.opt_nullable_list_param(solid_selection,
                                                        'solid_selection',
                                                        of_type=str)
        mode = check.opt_str_param(mode, 'mode', DEFAULT_MODE_NAME)

        filenames = []
        for file_glob in config_files or []:
            globbed_files = glob(file_glob)
            if not globbed_files:
                raise DagsterInvalidDefinitionError(
                    'File or glob pattern "{file_glob}" for "config_files" in preset '
                    '"{name}" produced no results.'.format(
                        name=name, file_glob=file_glob))

            filenames += [
                os.path.realpath(globbed_file)
                for globbed_file in globbed_files
            ]

        try:
            merged = merge_yamls(filenames)
        except yaml.YAMLError as err:
            six.raise_from(
                DagsterInvariantViolationError(
                    'Encountered error attempting to parse yaml. Parsing files {file_set} '
                    'loaded by file/patterns {files} on preset "{name}".'.
                    format(file_set=filenames, files=config_files, name=name)),
                err,
            )

        return PresetDefinition(name, merged, solid_selection, mode, tags)
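
A short sketch of calling this constructor (the file path and mode name are hypothetical):

local_preset = PresetDefinition.from_files(
    'local',
    config_files=['environments/local_base.yaml'],
    mode='local',
)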
Example #11
    def create_run_for_pipeline(
        self,
        pipeline_def,
        execution_plan=None,
        run_id=None,
        run_config=None,
        mode=None,
        solids_to_execute=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
        solid_selection=None,
    ):
        from dagster.core.execution.api import create_execution_plan
        from dagster.core.execution.plan.plan import ExecutionPlan
        from dagster.core.snap import snapshot_from_execution_plan

        check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)
        check.opt_inst_param(execution_plan, "execution_plan", ExecutionPlan)

        # note that solids_to_execute is required to execute the solid subset, which is the
        # frozenset version of the previous solid_subset.
        # solid_selection is not required and will not be converted to solids_to_execute here.
        # i.e. this function doesn't handle solid queries.
        # solid_selection is only used to pass the user queries further down.
        check.opt_set_param(solids_to_execute,
                            "solids_to_execute",
                            of_type=str)
        check.opt_list_param(solid_selection, "solid_selection", of_type=str)

        if solids_to_execute:
            if isinstance(pipeline_def, PipelineSubsetDefinition):
                # for the case when pipeline_def is created by IPipeline or ExternalPipeline
                check.invariant(
                    solids_to_execute == pipeline_def.solids_to_execute,
                    "Cannot create a PipelineRun from pipeline subset {pipeline_solids_to_execute} "
                    "that conflicts with solids_to_execute arg {solids_to_execute}"
                    .format(
                        pipeline_solids_to_execute=str_format_list(
                            pipeline_def.solids_to_execute),
                        solids_to_execute=str_format_list(solids_to_execute),
                    ),
                )
            else:
                # for cases when `create_run_for_pipeline` is directly called
                pipeline_def = pipeline_def.get_pipeline_subset_def(
                    solids_to_execute=solids_to_execute)

        full_execution_plan = execution_plan or create_execution_plan(
            pipeline_def,
            run_config=run_config,
            mode=mode,
        )
        check.invariant(
            len(full_execution_plan.step_keys_to_execute) == len(
                full_execution_plan.steps))

        if _is_memoized_run(tags):
            if step_keys_to_execute:
                raise DagsterInvariantViolationError(
                    "step_keys_to_execute parameter cannot be used in conjunction with memoized "
                    "pipeline runs.")

            step_keys_to_execute = self.resolve_unmemoized_steps(
                full_execution_plan,
                run_config=run_config,
                mode=mode,
            )  # TODO: tighter integration with existing step_keys_to_execute functionality

        subsetted_execution_plan = (
            full_execution_plan.build_subset_plan(step_keys_to_execute)
            if step_keys_to_execute else full_execution_plan)

        return self.create_run(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            run_config=run_config,
            mode=check.opt_str_param(
                mode, "mode", default=pipeline_def.get_default_mode_name()),
            solid_selection=solid_selection,
            solids_to_execute=solids_to_execute,
            step_keys_to_execute=step_keys_to_execute,
            status=status,
            tags=tags,
            root_run_id=root_run_id,
            parent_run_id=parent_run_id,
            pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
            execution_plan_snapshot=snapshot_from_execution_plan(
                subsetted_execution_plan,
                pipeline_def.get_pipeline_snapshot_id()),
            parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(
            ),
        )
Example #12
    def __new__(cls, partition_names=None):
        return super(ExternalPartitionNamesData, cls).__new__(
            cls,
            partition_names=check.opt_list_param(partition_names,
                                                 'partition_names', str),
        )
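
The common thread across these examples is check.opt_list_param, which normalizes None to an empty list while validating element types. A small sketch of that contract, to my understanding of the check module:

from dagster import check

assert check.opt_list_param(None, 'xs', of_type=str) == []
assert check.opt_list_param(['a', 'b'], 'xs', of_type=str) == ['a', 'b']
# check.opt_list_param(['a', 1], 'xs', of_type=str)  # raises a CheckError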
Example #13
    def __call__(self, fn: Callable[..., Any]) -> "OpDefinition":
        from ..op_definition import OpDefinition

        if self.input_defs is not None and self.ins is not None:
            check.failed(
                "Values cannot be provided for both the 'input_defs' and 'ins' arguments"
            )

        if self.output_defs is not None and self.out is not None:
            check.failed(
                "Values cannot be provided for both the 'output_defs' and 'out' arguments"
            )

        inferred_out = infer_output_props(fn)

        if self.ins is not None:
            input_defs = [
                inp.to_definition(name) for name, inp in self.ins.items()
            ]
        else:
            input_defs = check.opt_list_param(self.input_defs,
                                              "input_defs",
                                              of_type=InputDefinition)

        output_defs_from_out = _resolve_output_defs_from_outs(
            inferred_out=inferred_out, out=self.out)
        resolved_output_defs = (output_defs_from_out if output_defs_from_out
                                is not None else self.output_defs)

        if not self.name:
            self.name = fn.__name__

        if resolved_output_defs is None:
            resolved_output_defs = [
                OutputDefinition.create_from_inferred(infer_output_props(fn))
            ]
        elif len(resolved_output_defs) == 1:
            resolved_output_defs = [
                resolved_output_defs[0].combine_with_inferred(
                    infer_output_props(fn))
            ]

        compute_fn = (DecoratedSolidFunction(
            decorated_fn=fn) if self.decorator_takes_context else
                      NoContextDecoratedSolidFunction(decorated_fn=fn))

        resolved_input_defs = resolve_checked_solid_fn_inputs(
            decorator_name="@op",
            fn_name=self.name,
            compute_fn=compute_fn,
            explicit_input_defs=input_defs,
            exclude_nothing=True,
        )

        op_def = OpDefinition(
            name=self.name,
            input_defs=resolved_input_defs,
            output_defs=resolved_output_defs,
            compute_fn=compute_fn,
            config_schema=self.config_schema,
            description=self.description
            or format_docstring_for_description(fn),
            required_resource_keys=self.required_resource_keys,
            tags=self.tags,
            version=self.version,
            retry_policy=self.retry_policy,
        )
        update_wrapper(op_def, compute_fn.decorated_fn)
        return op_def
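
This __call__ is decorator machinery; a hedged sketch of the decorator usage it supports, exercising the ins/out path (the op itself is hypothetical):

from dagster import In, Out, op

@op(ins={"x": In(int)}, out=Out(int))
def double(x):
    return x * 2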
Example #14
    def __init__(self, name, solid_subset=None):
        self.name = check.str_param(name, 'name')
        if solid_subset is None:
            self.solid_subset = None
        else:
            self.solid_subset = check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
Example #15
    def __init__(
        self,
        solid_defs,
        name=None,
        description=None,
        dependencies=None,
        mode_defs=None,
        preset_defs=None,
    ):
        self.name = check.opt_str_param(name, 'name', '<<unnamed>>')
        self.description = check.opt_str_param(description, 'description')

        mode_definitions = check.opt_list_param(mode_defs,
                                                'mode_defs',
                                                of_type=ModeDefinition)

        if not mode_definitions:
            mode_definitions = [ModeDefinition()]

        self.mode_definitions = mode_definitions

        current_level_solid_defs = check.list_param(_check_solids_arg(
            self.name, solid_defs),
                                                    'solid_defs',
                                                    of_type=ISolidDefinition)

        seen_modes = set()
        for mode_def in mode_definitions:
            if mode_def.name in seen_modes:
                raise DagsterInvalidDefinitionError((
                    'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                    'Modes must have unique names.').format(
                        mode_name=mode_def.name, pipeline_name=self.name))
            seen_modes.add(mode_def.name)

        self.dependencies = validate_dependency_dict(dependencies)

        dependency_structure, pipeline_solid_dict = create_execution_structure(
            current_level_solid_defs,
            self.dependencies,
            container_definition=None)

        self._solid_dict = pipeline_solid_dict
        self._dependency_structure = dependency_structure

        self._runtime_type_dict = construct_runtime_type_dictionary(
            current_level_solid_defs)

        self._preset_dict = {}
        for preset in check.opt_list_param(preset_defs, 'preset_defs',
                                           PresetDefinition):
            if preset.name in self._preset_dict:
                raise DagsterInvalidDefinitionError((
                    'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                    'PresetDefinitions must have unique names.').format(
                        name=preset.name, pipeline_name=self.name))
            if preset.mode not in seen_modes:
                raise DagsterInvalidDefinitionError(
                    ('PresetDefinition "{name}" in "{pipeline_name}" '
                     'references mode "{mode}" which is not defined.').format(
                         name=preset.name,
                         pipeline_name=self.name,
                         mode=preset.mode))
            self._preset_dict[preset.name] = preset

        # Validate solid resource dependencies
        _validate_resource_dependencies(self.mode_definitions,
                                        current_level_solid_defs)

        self._all_solid_defs = {}
        for current_level_solid_def in current_level_solid_defs:
            for solid_def in current_level_solid_def.iterate_solid_defs():
                self._all_solid_defs[solid_def.name] = solid_def
Example #16
def _subset(recon_pipeline, solid_subset):
    check.inst_param(recon_pipeline, 'recon_pipeline', ReconstructablePipeline)
    check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
    return recon_pipeline.subset_for_execution(solid_subset) if solid_subset else recon_pipeline
Example #17
def bash_script_solid(bash_script_path,
                      name='bash_script_solid',
                      input_defs=None,
                      **kwargs):
    '''This function is a factory which constructs a solid that will execute a Bash command read
    from a script file.

    Any kwargs passed to this function will be passed along to the underlying :func:`@solid
    <dagster.solid>` decorator. However, note that overriding ``config`` or ``output_defs`` is not
    supported.

    You might consider using :func:`@composite_solid <dagster.composite_solid>` to wrap this solid
    in the cases where you'd like to configure the bash solid with different config fields.


    Examples:

    .. literalinclude:: ../../../../../python_modules/libraries/dagster-bash/dagster_bash_tests/example_bash_script_solid.py
       :language: python


    Args:
        bash_script_path (str): The script file to execute.
        name (str, optional): The name of this solid. Defaults to "bash_script_solid".
        input_defs (List[InputDefinition], optional): input definitions for the solid. Defaults to
            a single Nothing input.

    Raises:
        Failure: Raised when the shell command returns a non-zero exit code.

    Returns:
        SolidDefinition: Returns the constructed solid definition.
    '''
    check.str_param(bash_script_path, 'bash_script_path')
    name = check.str_param(name, 'name')
    check.opt_list_param(input_defs, 'input_defs', of_type=InputDefinition)

    if 'output_defs' in kwargs:
        raise TypeError(
            'Overriding output_defs for bash solid is not supported.')

    if 'config' in kwargs:
        raise TypeError('Overriding config for bash solid is not supported.')

    @solid(name=name,
           description=kwargs.pop('description',
                                  'A solid to invoke a bash command.'),
           input_defs=input_defs or [InputDefinition('start', Nothing)],
           output_defs=[OutputDefinition(str, 'result')],
           config=bash_solid_config(),
           **kwargs)
    def _bash_script_solid(context):
        output, return_code = execute_script_file(
            bash_script_path=bash_script_path,
            log=context.log,
            **context.solid_config)

        if return_code:
            raise Failure(
                description=
                'Bash command execution failed with output: {output}'.format(
                    output=output))

        return output

    return _bash_script_solid
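A usage sketch for the factory, assuming a `hello.sh` script sits next to the pipeline file and that `bash_script_solid` is exported by dagster_bash:

import os

from dagster import execute_pipeline, pipeline
from dagster_bash import bash_script_solid

hello = bash_script_solid(
    os.path.join(os.path.dirname(__file__), 'hello.sh'),  # hypothetical script path
    name='hello',
)

@pipeline
def hello_pipeline():
    hello()

result = execute_pipeline(hello_pipeline)  # the script's stdout becomes the 'result' output
assert result.success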
Example #19
def reexecute_pipeline_iterator(
    pipeline: Union[IPipeline, PipelineDefinition],
    parent_run_id: str,
    run_config: Optional[dict] = None,
    step_selection: Optional[List[str]] = None,
    mode: Optional[str] = None,
    preset: Optional[str] = None,
    tags: Optional[Dict[str, Any]] = None,
    instance: Optional[DagsterInstance] = None,
) -> Iterator[DagsterEvent]:
    """Reexecute a pipeline iteratively.

    Rather than package up the result of running a pipeline into a single object, like
    :py:func:`reexecute_pipeline`, this function yields the stream of events resulting from pipeline
    reexecution.

    This is intended to allow the caller to handle these events on a streaming basis in whatever
    way is appropriate.

    Parameters:
        pipeline (Union[IPipeline, PipelineDefinition]): The pipeline to execute.
        parent_run_id (str): The id of the previous run to reexecute. The run must exist in the
            instance.
        run_config (Optional[dict]): The environment configuration that parametrizes this run,
            as a dict.
        step_selection (Optional[List[str]]): A list of step selection queries (including single
            step keys) to execute. For example:
            - ['some_solid']: select the execution step "some_solid" itself.
            - ['*some_solid']: select the step "some_solid" and all its ancestors
                (upstream dependencies).
            - ['*some_solid+++']: select the step "some_solid", all its ancestors,
                and its descendants (downstream dependencies) within 3 levels down.
            - ['*some_solid', 'other_solid_a', 'other_solid_b+']: select
                "some_solid" and all its ancestors, "other_solid_a" itself, and
                "other_solid_b" and its direct child execution steps.
        mode (Optional[str]): The name of the pipeline mode to use. You may not set both ``mode``
            and ``preset``.
        preset (Optional[str]): The name of the pipeline preset to use. You may not set both
            ``mode`` and ``preset``.
        tags (Optional[Dict[str, Any]]): Arbitrary key-value pairs that will be added to pipeline
            logs.
        instance (Optional[DagsterInstance]): The instance to execute against. If this is ``None``,
            an ephemeral instance will be used, and no artifacts will be persisted from the run.

    Returns:
      Iterator[DagsterEvent]: The stream of events resulting from pipeline reexecution.
    """

    check.opt_list_param(step_selection, "step_selection", of_type=str)

    check.str_param(parent_run_id, "parent_run_id")

    with ephemeral_instance_if_missing(instance) as execute_instance:
        (pipeline, run_config, mode, tags, _,
         _) = _check_execute_pipeline_args(
             pipeline=pipeline,
             run_config=run_config,
             mode=mode,
             preset=preset,
             tags=tags,
             solid_selection=None,
         )
        parent_pipeline_run = execute_instance.get_run_by_id(parent_run_id)
        check.invariant(
            parent_pipeline_run,
            "No parent run with id {parent_run_id} found in instance.".format(
                parent_run_id=parent_run_id),
        )

        # resolve step selection DSL queries using parent execution plan snapshot
        if step_selection:
            parent_execution_plan_snapshot = execute_instance.get_execution_plan_snapshot(
                parent_pipeline_run.execution_plan_snapshot_id)
            step_keys_to_execute = parse_step_selection(
                parent_execution_plan_snapshot.step_deps, step_selection)
        else:
            step_keys_to_execute = None

        pipeline_run = execute_instance.create_run_for_pipeline(
            pipeline_def=pipeline.get_definition(),
            run_config=run_config,
            mode=mode,
            tags=tags,
            solid_selection=parent_pipeline_run.solid_selection,
            solids_to_execute=parent_pipeline_run.solids_to_execute,
            # convert to frozenset https://github.com/dagster-io/dagster/issues/2914
            step_keys_to_execute=list(step_keys_to_execute)
            if step_keys_to_execute else None,
            root_run_id=parent_pipeline_run.root_run_id
            or parent_pipeline_run.run_id,
            parent_run_id=parent_pipeline_run.run_id,
        )

        return execute_run_iterator(pipeline, pipeline_run, execute_instance)
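A streaming-consumption sketch, assuming `my_pipeline` and a parent run already persisted on the instance (run id hypothetical):

from dagster import DagsterInstance

instance = DagsterInstance.get()  # must be the instance that stored the parent run
for event in reexecute_pipeline_iterator(
        my_pipeline,
        parent_run_id='2f1a0b3c',  # hypothetical parent run id
        step_selection=['failed_solid*'],  # the failed step and its descendants
        instance=instance,
):
    print(event.event_type_value, event.message)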
Example #20
    def __init__(
        self,
        type_check_fn: TypeCheckFn,
        key: t.Optional[str] = None,
        name: t.Optional[str] = None,
        is_builtin: bool = False,
        description: t.Optional[str] = None,
        loader: t.Optional[DagsterTypeLoader] = None,
        materializer: t.Optional[DagsterTypeMaterializer] = None,
        required_resource_keys: t.Optional[t.Set[str]] = None,
        kind: DagsterTypeKind = DagsterTypeKind.REGULAR,
        typing_type: t.Any = None,
        metadata_entries: t.Optional[t.List[MetadataEntry]] = None,
        metadata: t.Optional[t.Dict[str, RawMetadataValue]] = None,
    ):
        check.opt_str_param(key, "key")
        check.opt_str_param(name, "name")

        check.invariant(not (name is None and key is None), "Must set key or name")
        if name is None:
            key = check.not_none(
                key,
                "If name is not provided, must provide key.",
            )
            self.key, self._name = key, None
        elif key is None:
            name = check.not_none(
                name,
                "If key is not provided, must provide name.",
            )
            self.key, self._name = name, name
        else:
            check.invariant(key and name)
            self.key, self._name = key, name

        self.description = check.opt_str_param(description, "description")
        self.loader = check.opt_inst_param(loader, "loader", DagsterTypeLoader)
        self.materializer = check.opt_inst_param(
            materializer, "materializer", DagsterTypeMaterializer
        )

        self.required_resource_keys = check.opt_set_param(
            required_resource_keys,
            "required_resource_keys",
        )

        self._type_check_fn = check.callable_param(type_check_fn, "type_check_fn")
        _validate_type_check_fn(self._type_check_fn, self._name)

        self.is_builtin = check.bool_param(is_builtin, "is_builtin")
        check.invariant(
            self.display_name is not None,
            "All types must have a valid display name, got None for key {}".format(key),
        )

        self.kind = check.inst_param(kind, "kind", DagsterTypeKind)

        self.typing_type = typing_type

        metadata_entries = check.opt_list_param(
            metadata_entries, "metadata_entries", of_type=MetadataEntry
        )
        metadata = check.opt_dict_param(metadata, "metadata", key_type=str)
        self._metadata_entries = normalize_metadata(metadata, metadata_entries)
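A minimal construction sketch against this initializer (names hypothetical):

from dagster import DagsterType, TypeCheck

def check_positive_int(_context, value):
    # Returning a TypeCheck gives the framework a structured pass/fail result.
    return TypeCheck(success=isinstance(value, int) and value > 0)

PositiveInt = DagsterType(
    name='PositiveInt',
    type_check_fn=check_positive_int,
    description='An int strictly greater than zero.',
)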
Example #21
    def __init__(
        self,
        solid_defs,
        name=None,
        description=None,
        dependencies=None,
        mode_defs=None,
        preset_defs=None,
        tags=None,
        _parent_pipeline_def=None,  # https://github.com/dagster-io/dagster/issues/2115
        _hook_defs=None,
    ):
        self._name = check.opt_str_param(name, "name", "<<unnamed>>")
        self._description = check.opt_str_param(description, "description")

        mode_definitions = check.opt_list_param(mode_defs,
                                                "mode_defs",
                                                of_type=ModeDefinition)

        if not mode_definitions:
            mode_definitions = [ModeDefinition()]

        self._mode_definitions = mode_definitions

        self._current_level_solid_defs = check.list_param(
            _check_solids_arg(self._name, solid_defs),
            "solid_defs",
            of_type=ISolidDefinition)
        self._tags = validate_tags(tags)

        seen_modes = set()
        for mode_def in mode_definitions:
            if mode_def.name in seen_modes:
                raise DagsterInvalidDefinitionError((
                    'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                    "Modes must have unique names.").format(
                        mode_name=mode_def.name, pipeline_name=self._name))
            seen_modes.add(mode_def.name)

        self._dependencies = validate_dependency_dict(dependencies)

        dependency_structure, solid_dict = create_execution_structure(
            self._current_level_solid_defs,
            self._dependencies,
            container_definition=None)

        self._solid_dict = solid_dict
        self._dependency_structure = dependency_structure

        # eager toposort solids to detect cycles
        self.solids_in_topological_order = self._solids_in_topological_order()

        self._dagster_type_dict = construct_dagster_type_dictionary(
            self._current_level_solid_defs)

        self._preset_defs = check.opt_list_param(preset_defs, "preset_defs",
                                                 PresetDefinition)
        self._preset_dict = {}
        for preset in self._preset_defs:
            if preset.name in self._preset_dict:
                raise DagsterInvalidDefinitionError((
                    'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                    "PresetDefinitions must have unique names.").format(
                        name=preset.name, pipeline_name=self._name))
            if preset.mode not in seen_modes:
                raise DagsterInvalidDefinitionError(
                    ('PresetDefinition "{name}" in "{pipeline_name}" '
                     'references mode "{mode}" which is not defined.').format(
                         name=preset.name,
                         pipeline_name=self._name,
                         mode=preset.mode))
            self._preset_dict[preset.name] = preset

        # Validate solid resource dependencies
        _validate_resource_dependencies(self._mode_definitions,
                                        self._current_level_solid_defs,
                                        self._solid_dict)

        # Validate unsatisfied inputs can be materialized from config
        _validate_inputs(self._dependency_structure, self._solid_dict)

        self._all_solid_defs = _build_all_solid_defs(
            self._current_level_solid_defs)
        self._parent_pipeline_def = check.opt_inst_param(
            _parent_pipeline_def, "_parent_pipeline_def", PipelineDefinition)
        self._cached_run_config_schemas = {}
        self._cached_external_pipeline = None

        self._hook_defs = check.opt_set_param(_hook_defs,
                                              "_hook_defs",
                                              of_type=HookDefinition)
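The preset checks above can be exercised directly. A sketch (hypothetical names) of a preset that references a mode that was never defined; with no mode_defs supplied, only the implicit "default" mode exists:

from dagster import DagsterInvalidDefinitionError, PipelineDefinition, PresetDefinition

try:
    PipelineDefinition(
        solid_defs=[],
        name='bad_preset_pipeline',  # hypothetical pipeline name
        preset_defs=[PresetDefinition(name='prod_preset', mode='prod')],
    )
except DagsterInvalidDefinitionError as exc:
    # PresetDefinition "prod_preset" in "bad_preset_pipeline" references mode "prod"
    # which is not defined.
    print(exc)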
Example #22
    def create_run_for_pipeline(
        self,
        pipeline_def,
        execution_plan=None,
        run_id=None,
        run_config=None,
        mode=None,
        solids_to_execute=None,
        step_keys_to_execute=None,
        status=None,
        tags=None,
        root_run_id=None,
        parent_run_id=None,
        solid_selection=None,
    ):
        from dagster.core.execution.api import create_execution_plan
        from dagster.core.execution.plan.plan import ExecutionPlan
        from dagster.core.snap import snapshot_from_execution_plan

        check.inst_param(pipeline_def, "pipeline_def", PipelineDefinition)
        check.opt_inst_param(execution_plan, "execution_plan", ExecutionPlan)

        # note that solids_to_execute is required to execute the solid subset, which is the
        # frozenset version of the previous solid_subset.
        # solid_selection is not required and will not be converted to solids_to_execute here.
        # i.e. this function doesn't handle solid queries.
        # solid_selection is only used to pass the user queries further down.
        check.opt_set_param(solids_to_execute,
                            "solids_to_execute",
                            of_type=str)
        check.opt_list_param(solid_selection, "solid_selection", of_type=str)

        if solids_to_execute:
            if isinstance(pipeline_def, PipelineSubsetDefinition):
                # for the case when pipeline_def is created by ExecutablePipeline or ExternalPipeline
                check.invariant(
                    solids_to_execute == pipeline_def.solids_to_execute,
                    "Cannot create a PipelineRun from pipeline subset {pipeline_solids_to_execute} "
                    "that conflicts with solids_to_execute arg {solids_to_execute}"
                    .format(
                        pipeline_solids_to_execute=str_format_list(
                            pipeline_def.solids_to_execute),
                        solids_to_execute=str_format_list(solids_to_execute),
                    ),
                )
            else:
                # for cases when `create_run_for_pipeline` is directly called
                pipeline_def = pipeline_def.get_pipeline_subset_def(
                    solids_to_execute=solids_to_execute)

        if execution_plan is None:
            execution_plan = create_execution_plan(
                pipeline_def,
                run_config=run_config,
                mode=mode,
                step_keys_to_execute=step_keys_to_execute,
            )

        return self.create_run(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            run_config=run_config,
            mode=check.opt_str_param(
                mode, "mode", default=pipeline_def.get_default_mode_name()),
            solid_selection=solid_selection,
            solids_to_execute=solids_to_execute,
            step_keys_to_execute=step_keys_to_execute,
            status=status,
            tags=tags,
            root_run_id=root_run_id,
            parent_run_id=parent_run_id,
            pipeline_snapshot=pipeline_def.get_pipeline_snapshot(),
            execution_plan_snapshot=snapshot_from_execution_plan(
                execution_plan, pipeline_def.get_pipeline_snapshot_id()),
            parent_pipeline_snapshot=pipeline_def.get_parent_pipeline_snapshot(
            ),
        )
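A sketch of the usual call sequence on an ephemeral instance, assuming `my_pipeline` is a module-level @pipeline-decorated definition (hypothetical):

from dagster import DagsterInstance, execute_run, reconstructable

instance = DagsterInstance.ephemeral()
run = instance.create_run_for_pipeline(
    pipeline_def=my_pipeline,  # hypothetical pipeline definition
    run_config={},
)
result = execute_run(reconstructable(my_pipeline), run, instance)
assert result.success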
Example #23
def _checked_input_resource_reqs_for_mode(
    dependency_structure: DependencyStructure,
    node_dict: Dict[str, Node],
    mode_def: ModeDefinition,
    outer_dependency_structures: Optional[List[DependencyStructure]] = None,
    outer_solids: Optional[List[Node]] = None,
) -> Set[str]:
    outer_dependency_structures = check.opt_list_param(
        outer_dependency_structures, "outer_dependency_structures",
        DependencyStructure)
    outer_solids = check.opt_list_param(outer_solids, "outer_solids", Node)

    resource_reqs = set()
    mode_root_input_managers = set(
        key for key, resource_def in mode_def.resource_defs.items()
        if isinstance(resource_def, RootInputManagerDefinition))

    for node in node_dict.values():
        if node.is_graph:
            graph_def = node.definition.ensure_graph_def()
            # check inner solids
            resource_reqs.update(
                _checked_input_resource_reqs_for_mode(
                    dependency_structure=graph_def.dependency_structure,
                    node_dict=graph_def.node_dict,
                    mode_def=mode_def,
                    outer_dependency_structures=outer_dependency_structures +
                    [dependency_structure],
                    outer_solids=outer_solids + [node],
                ))
        for handle in node.input_handles():
            source_output_handles = None
            if dependency_structure.has_deps(handle):
                # input is connected to outputs from the same dependency structure
                source_output_handles = dependency_structure.get_deps_list(
                    handle)
            else:
                # input is connected to outputs from outer dependency structure, e.g. first solids
                # in a composite
                curr_node = node
                curr_handle = handle
                curr_index = len(outer_solids) - 1

                # Checks to see if input is mapped to an outer dependency structure
                while curr_index >= 0 and curr_node.container_maps_input(
                        curr_handle.input_name):
                    curr_handle = SolidInputHandle(
                        solid=outer_solids[curr_index],
                        input_def=curr_node.container_mapped_input(
                            curr_handle.input_name).definition,
                    )

                    if outer_dependency_structures[curr_index].has_deps(
                            curr_handle):
                        source_output_handles = outer_dependency_structures[
                            curr_index].get_deps_list(curr_handle)
                        break

                    curr_node = outer_solids[curr_index]
                    curr_index -= 1

            if source_output_handles:
                # input is connected to source output handles within the graph
                for source_output_handle in source_output_handles:
                    output_manager_key = source_output_handle.output_def.io_manager_key
                    output_manager_def = mode_def.resource_defs[
                        output_manager_key]
                    if not isinstance(output_manager_def,
                                      IInputManagerDefinition):
                        raise DagsterInvalidDefinitionError(
                            f'Input "{handle.input_def.name}" of {node.describe_node()} is '
                            f'connected to output "{source_output_handle.output_def.name}" '
                            f"of {source_output_handle.solid.describe_node()}. That output does not "
                            "have an output "
                            f"manager that knows how to load inputs, so we don't know how "
                            f"to load the input. To address this, assign an IOManager to "
                            f"the upstream output.")
            else:
                # input is unconnected
                input_def = handle.input_def
                if (not input_def.dagster_type.loader
                        and input_def.dagster_type.kind != DagsterTypeKind.NOTHING
                        and not input_def.root_manager_key):
                    raise DagsterInvalidDefinitionError(
                        "Input '{input_name}' in {described_node} is not connected to "
                        "the output of a previous node and can not be loaded from configuration, "
                        "making it impossible to execute. "
                        "Possible solutions are:\n"
                        "  * add a dagster_type_loader for the type '{dagster_type}'\n"
                        "  * connect '{input_name}' to the output of another node\n"
                        .format(
                            described_node=node.describe_node(),
                            input_name=input_def.name,
                            dagster_type=input_def.dagster_type.display_name,
                        ))

                # If a root manager is provided, it's always used. I.e. it has priority over
                # the other ways of loading unsatisfied inputs - dagster type loaders and
                # default values.
                if input_def.root_manager_key:
                    resource_reqs.add(input_def.root_manager_key)
                    if input_def.root_manager_key not in mode_def.resource_defs:
                        error_msg = _get_missing_resource_error_msg(
                            resource_type="root input manager",
                            resource_key=input_def.root_manager_key,
                            descriptor=
                            f"unsatisfied input '{input_def.name}' of {node.describe_node()}",
                            mode_def=mode_def,
                            resource_defs_of_type=mode_root_input_managers,
                        )
                        raise DagsterInvalidDefinitionError(error_msg)

    return resource_reqs
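A hedged sketch of the definition-time error this check raises: an unconnected input whose Dagster type has no loader (all names hypothetical):

from dagster import (
    DagsterInvalidDefinitionError,
    DagsterType,
    InputDefinition,
    pipeline,
    solid,
)

Opaque = DagsterType(name='Opaque', type_check_fn=lambda _, value: True)  # no loader

@solid(input_defs=[InputDefinition('value', Opaque)])
def consume(_, value):
    return value

try:
    @pipeline
    def broken_pipeline():
        consume()
except DagsterInvalidDefinitionError as exc:
    # raises: Input 'value' ... is not connected to the output of a previous node
    # and can not be loaded from configuration ...
    print(exc)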
Example #24
    def __init__(self, sequence):
        self.sequence = check.opt_list_param(sequence, 'sequence', of_type=(int, float))
Example #25
    def __init__(self, description=None, metadata_entries=None):
        super(Failure, self).__init__(description)
        self.description = check.opt_str_param(description, "description")
        self.metadata_entries = check.opt_list_param(
            metadata_entries, "metadata_entries", of_type=EventMetadataEntry)
Example #26
    def __init__(
        self,
        solid_defs,
        name=None,
        description=None,
        dependencies=None,
        mode_defs=None,
        preset_defs=None,
    ):
        self._name = check.opt_str_param(name, 'name', '<<unnamed>>')
        self._description = check.opt_str_param(description, 'description')

        mode_definitions = check.opt_list_param(mode_defs,
                                                'mode_defs',
                                                of_type=ModeDefinition)

        if not mode_definitions:
            mode_definitions = [ModeDefinition()]

        self._mode_definitions = mode_definitions

        self._current_level_solid_defs = check.list_param(
            _check_solids_arg(self._name, solid_defs),
            'solid_defs',
            of_type=ISolidDefinition)

        seen_modes = set()
        for mode_def in mode_definitions:
            if mode_def.name in seen_modes:
                raise DagsterInvalidDefinitionError((
                    'Two modes seen with the name "{mode_name}" in "{pipeline_name}". '
                    'Modes must have unique names.').format(
                        mode_name=mode_def.name, pipeline_name=self._name))
            seen_modes.add(mode_def.name)

        self._dependencies = validate_dependency_dict(dependencies)

        dependency_structure, solid_dict = create_execution_structure(
            self._current_level_solid_defs,
            self._dependencies,
            container_definition=None)

        self._solid_dict = solid_dict
        self._dependency_structure = dependency_structure

        self._runtime_type_dict = construct_dagster_type_dictionary(
            self._current_level_solid_defs)

        self._preset_defs = check.opt_list_param(preset_defs, 'preset_defs',
                                                 PresetDefinition)
        self._preset_dict = {}
        for preset in self._preset_defs:
            if preset.name in self._preset_dict:
                raise DagsterInvalidDefinitionError((
                    'Two PresetDefinitions seen with the name "{name}" in "{pipeline_name}". '
                    'PresetDefinitions must have unique names.').format(
                        name=preset.name, pipeline_name=self._name))
            if preset.mode not in seen_modes:
                raise DagsterInvalidDefinitionError(
                    ('PresetDefinition "{name}" in "{pipeline_name}" '
                     'references mode "{mode}" which is not defined.').format(
                         name=preset.name,
                         pipeline_name=self._name,
                         mode=preset.mode))
            self._preset_dict[preset.name] = preset

        # Validate solid resource dependencies
        _validate_resource_dependencies(self._mode_definitions,
                                        self._current_level_solid_defs)

        # Validate unsatisfied inputs can be materialized from config
        _validate_inputs(self._dependency_structure, self._solid_dict)

        self._all_solid_defs = _build_all_solid_defs(
            self._current_level_solid_defs)

        self._selector = ExecutionSelector(self.name, list(solid_dict.keys()))

        self._cached_environment_schemas = {}
Example #27
def get_step_input(plan_builder, solid, input_name, input_def,
                   dependency_structure, handle, parent_step_inputs):
    check.inst_param(plan_builder, 'plan_builder', _PlanBuilder)
    check.inst_param(solid, 'solid', Solid)
    check.str_param(input_name, 'input_name')
    check.inst_param(input_def, 'input_def', InputDefinition)
    check.inst_param(dependency_structure, 'dependency_structure',
                     DependencyStructure)
    check.opt_inst_param(handle, 'handle', SolidHandle)
    check.opt_list_param(parent_step_inputs,
                         'parent_step_inputs',
                         of_type=StepInput)

    solid_config = plan_builder.environment_config.solids.get(str(handle))
    if solid_config and input_name in solid_config.inputs:
        return StepInput(
            input_name,
            input_def.runtime_type,
            StepInputSourceType.CONFIG,
            config_data=solid_config.inputs[input_name],
        )

    input_handle = solid.input_handle(input_name)
    if dependency_structure.has_singular_dep(input_handle):
        solid_output_handle = dependency_structure.get_singular_dep(
            input_handle)
        return StepInput(
            input_name,
            input_def.runtime_type,
            StepInputSourceType.SINGLE_OUTPUT,
            [plan_builder.get_output_handle(solid_output_handle)],
        )

    if dependency_structure.has_multi_deps(input_handle):
        solid_output_handles = dependency_structure.get_multi_deps(
            input_handle)
        return StepInput(
            input_name,
            input_def.runtime_type,
            StepInputSourceType.MULTIPLE_OUTPUTS,
            [
                plan_builder.get_output_handle(solid_output_handle)
                for solid_output_handle in solid_output_handles
            ],
        )

    if solid.container_maps_input(input_name):
        parent_name = solid.container_mapped_input(input_name).definition.name
        parent_inputs = {
            step_input.name: step_input
            for step_input in parent_step_inputs
        }
        if parent_name in parent_inputs:
            parent_input = parent_inputs[parent_name]
            return StepInput(
                input_name,
                input_def.runtime_type,
                parent_input.source_type,
                parent_input.source_handles,
                parent_input.config_data,
            )

    # At this point we have an input that is not hooked up to
    # the output of another solid or provided via environment config.

    # We will allow this for "Nothing" type inputs and continue.
    if input_def.runtime_type.is_nothing:
        return None

    # Otherwise we throw an error.
    raise DagsterInvariantViolationError(
        ('In pipeline {pipeline_name} solid {solid_name}, input {input_name} '
         'must get a value either (a) from a dependency or (b) from the '
         'inputs section of its configuration.').format(
             pipeline_name=plan_builder.pipeline_name,
             solid_name=solid.name,
             input_name=input_name))
Example #28
    def build_sub_pipeline(self, solid_subset):
        check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
        return self if solid_subset is None else _build_sub_pipeline(
            self, solid_subset)
Example #29
def _check_execute_pipeline_args(
    pipeline, run_config, mode, preset, tags, instance, solid_selection=None
):
    pipeline = _check_pipeline(pipeline)
    pipeline_def = pipeline.get_definition()
    check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition)

    run_config = check.opt_dict_param(run_config, 'run_config')
    check.opt_str_param(mode, 'mode')
    check.opt_str_param(preset, 'preset')
    check.invariant(
        not (mode is not None and preset is not None),
        'You may set only one of `mode` (got {mode}) or `preset` (got {preset}).'.format(
            mode=mode, preset=preset
        ),
    )

    tags = check.opt_dict_param(tags, 'tags', key_type=str)
    check.opt_list_param(solid_selection, 'solid_selection', of_type=str)

    if preset is not None:
        pipeline_preset = pipeline_def.get_preset(preset)

        if pipeline_preset.run_config is not None:
            check.invariant(
                (not run_config) or (pipeline_preset.run_config == run_config),
                'The environment set in preset \'{preset}\' does not agree with the environment '
                'passed in the `run_config` argument.'.format(preset=preset),
            )

            run_config = pipeline_preset.run_config

        # load solid_selection from preset
        if pipeline_preset.solid_selection is not None:
            check.invariant(
                solid_selection is None or solid_selection == pipeline_preset.solid_selection,
                'The solid_selection set in preset \'{preset}\', {preset_subset}, does not agree with '
                'the `solid_selection` argument: {solid_selection}'.format(
                    preset=preset,
                    preset_subset=pipeline_preset.solid_selection,
                    solid_selection=solid_selection,
                ),
            )
            solid_selection = pipeline_preset.solid_selection

        check.invariant(
            mode is None or mode == pipeline_preset.mode,
            'Mode {mode} does not agree with the mode set in preset \'{preset}\': '
            '(\'{preset_mode}\')'.format(
                preset=preset, preset_mode=pipeline_preset.mode, mode=mode
            ),
        )

        mode = pipeline_preset.mode

        tags = merge_dicts(pipeline_preset.tags, tags)

    if mode is not None:
        if not pipeline_def.has_mode_definition(mode):
            raise DagsterInvariantViolationError(
                (
                    'You have attempted to execute pipeline {name} with mode {mode}. '
                    'Available modes: {modes}'
                ).format(
                    name=pipeline_def.name, mode=mode, modes=pipeline_def.available_modes,
                )
            )
    else:
        if pipeline_def.is_multi_mode:
            raise DagsterInvariantViolationError(
                (
                    'Pipeline {name} has multiple modes (Available modes: {modes}) and you have '
                    'attempted to execute it without specifying a mode. Set '
                    'mode property on the PipelineRun object.'
                ).format(name=pipeline_def.name, modes=pipeline_def.available_modes)
            )
        mode = pipeline_def.get_default_mode_name()

    tags = merge_dicts(pipeline_def.tags, tags)

    check.opt_inst_param(instance, 'instance', DagsterInstance)
    instance = instance or DagsterInstance.ephemeral()

    # generate pipeline subset from the given solid_selection
    if solid_selection:
        pipeline = pipeline.subset_for_execution(solid_selection)

    return (
        pipeline,
        run_config,
        instance,
        mode,
        tags,
        pipeline.solids_to_execute,
        solid_selection,
    )
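From the caller's side, the invariant above makes `mode` and `preset` mutually exclusive; a sketch with hypothetical names:

from dagster import execute_pipeline

execute_pipeline(my_pipeline, preset='dev')  # ok: mode, run_config, and tags come from the preset
execute_pipeline(my_pipeline, mode='dev')    # ok: explicit mode
# Passing both trips the invariant above and raises a CheckError:
# execute_pipeline(my_pipeline, mode='dev', preset='dev')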
Example #30
    def _from_storage(
            cls,
            pipeline_name=None,
            run_id=None,
            run_config=None,
            mode=None,
            solid_selection=None,
            solids_to_execute=None,
            step_keys_to_execute=None,
            status=None,
            tags=None,
            root_run_id=None,
            parent_run_id=None,
            pipeline_snapshot_id=None,
            execution_plan_snapshot_id=None,
            # backcompat
            environment_dict=None,
            previous_run_id=None,
            selector=None,
            solid_subset=None,
            reexecution_config=None,  # pylint: disable=unused-argument
            **kwargs):

        # serdes log
        # * removed reexecution_config - serdes logic expected to strip unknown keys so no need to preserve
        # * added pipeline_snapshot_id
        # * renamed previous_run_id -> parent_run_id, added root_run_id
        # * added execution_plan_snapshot_id
        # * removed selector
        # * added solid_subset
        # * renamed solid_subset -> solid_selection, added solids_to_execute
        # * renamed environment_dict -> run_config

        # back compat for environment dict => run_config
        if environment_dict:
            check.invariant(
                not run_config,
                'Cannot set both run_config and environment_dict. Use run_config parameter.',
            )
            run_config = environment_dict

        # back compat for previous_run_id => parent_run_id, root_run_id
        if previous_run_id and not (parent_run_id and root_run_id):
            parent_run_id = previous_run_id
            root_run_id = previous_run_id

        # back compat for selector => pipeline_name, solids_to_execute
        selector = check.opt_inst_param(selector, 'selector',
                                        ExecutionSelector)
        if selector:
            check.invariant(
                pipeline_name is None or selector.name == pipeline_name,
                ('Conflicting pipeline name {pipeline_name} in arguments to PipelineRun: '
                 'selector was passed with pipeline {selector_pipeline}'.
                 format(pipeline_name=pipeline_name,
                        selector_pipeline=selector.name)),
            )
            if pipeline_name is None:
                pipeline_name = selector.name

            check.invariant(
                solids_to_execute is None
                or set(selector.solid_subset) == solids_to_execute,
                ('Conflicting solids_to_execute {solids_to_execute} in arguments to PipelineRun: '
                 'selector was passed with subset {selector_subset}'.format(
                     solids_to_execute=solids_to_execute,
                     selector_subset=selector.solid_subset)),
            )
            # for old runs that only have selector but no solids_to_execute
            if solids_to_execute is None:
                solids_to_execute = (frozenset(selector.solid_subset)
                                     if selector.solid_subset else None)

        # back compat for solid_subset => solids_to_execute
        check.opt_list_param(solid_subset, 'solid_subset', of_type=str)
        if solid_subset:
            solids_to_execute = frozenset(solid_subset)

        # warn about unused arguments
        if kwargs:
            warnings.warn(
                'Found unhandled arguments from stored PipelineRun: {args}'.
                format(args=kwargs.keys()))

        return cls.__new__(  # pylint: disable=redundant-keyword-arg
            cls,
            pipeline_name=pipeline_name,
            run_id=run_id,
            run_config=run_config,
            mode=mode,
            solid_selection=solid_selection,
            solids_to_execute=solids_to_execute,
            step_keys_to_execute=step_keys_to_execute,
            status=status,
            tags=tags,
            root_run_id=root_run_id,
            parent_run_id=parent_run_id,
            pipeline_snapshot_id=pipeline_snapshot_id,
            execution_plan_snapshot_id=execution_plan_snapshot_id,
        )
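A hedged sketch of the back-compat behavior: a run stored with the old `environment_dict` and `previous_run_id` fields comes back under the current names (ids hypothetical):

run = PipelineRun._from_storage(  # pylint: disable=protected-access
    pipeline_name='legacy_pipeline',
    run_id='abc-123',                 # hypothetical ids
    environment_dict={'solids': {}},  # old name for run_config
    previous_run_id='def-456',        # old name for parent_run_id / root_run_id
)
assert run.run_config == {'solids': {}}
assert run.parent_run_id == 'def-456'
assert run.root_run_id == 'def-456'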