Example #1
0
def test_step_handles():
    """Check that step keys parse to the expected handle types and round-trip.

    Covers three key shapes: a bare key ("foo"), an unresolved dynamic key
    ("foo[?]") and a resolved dynamic key ("foo[bar]"), then verifies that
    resolving the unresolved handle and un-resolving the resolved handle
    map onto each other.
    """
    # A bare key yields a plain StepHandle.
    assert isinstance(StepHandle.parse_from_key("foo"), StepHandle)

    # The "[?]" suffix yields an unresolved handle.
    pending_handle = StepHandle.parse_from_key("foo[?]")
    assert isinstance(pending_handle, UnresolvedStepHandle)

    # A concrete mapping key in brackets yields a resolved handle.
    mapped_handle = StepHandle.parse_from_key("foo[bar]")
    assert isinstance(mapped_handle, ResolvedFromDynamicStepHandle)

    # The two dynamic forms convert into one another.
    assert pending_handle.resolve("bar") == mapped_handle
    assert mapped_handle.unresolved_form == pending_handle
Example #2
0
    def build_subset_plan(self,
                          step_keys_to_execute: List[str]) -> "ExecutionPlan":
        """Build a new plan that executes only the given step keys.

        Args:
            step_keys_to_execute: String keys of the steps to execute.

        Returns:
            An ExecutionPlan constructed from this plan's pipeline, step
            dict, environment config and known state, with the parsed
            handles as the steps to execute.

        Raises:
            DagsterExecutionStepNotFoundError: If any key parses to a handle
                that is not present in ``self.step_dict``.
        """
        check.list_param(step_keys_to_execute,
                         "step_keys_to_execute",
                         of_type=str)

        requested_handles = list(
            map(StepHandle.parse_from_key, step_keys_to_execute))

        # Gather every unknown key first so the error reports all of them.
        bad_keys = [
            requested.to_key() for requested in requested_handles
            if requested not in self.step_dict
        ]
        if bad_keys:
            raise DagsterExecutionStepNotFoundError(
                f"Can not build subset plan from unknown step{'s' if len(bad_keys)> 1 else ''}: {', '.join(bad_keys)}",
                step_keys=bad_keys,
            )

        return ExecutionPlan(
            self.pipeline,
            self.step_dict,
            requested_handles,
            self.environment_config,
            self.known_state,
        )
Example #3
0
    def build_subset_plan(
        self,
        step_keys_to_execute: List[str],
        pipeline_def: PipelineDefinition,
        environment_config: EnvironmentConfig,
    ) -> "ExecutionPlan":
        """Build a new plan that executes only the given step keys.

        Args:
            step_keys_to_execute: String keys of the steps to execute.
            pipeline_def: Passed through to ``_compute_artifacts_persisted``.
            environment_config: Passed through to
                ``_compute_artifacts_persisted``.

        Returns:
            An ExecutionPlan sharing this plan's step dict and known state,
            with step maps and the artifacts-persisted value recomputed for
            the selected handles.

        Raises:
            DagsterExecutionStepNotFoundError: If any key parses to a handle
                that is not present in ``self.step_dict``.
        """
        check.list_param(step_keys_to_execute,
                         "step_keys_to_execute",
                         of_type=str)

        selected_handles = [
            StepHandle.parse_from_key(step_key)
            for step_key in step_keys_to_execute
        ]

        # Report every unknown key at once rather than failing on the first.
        bad_keys = [
            selected.to_key() for selected in selected_handles
            if selected not in self.step_dict
        ]
        if bad_keys:
            raise DagsterExecutionStepNotFoundError(
                f"Can not build subset plan from unknown step{'s' if len(bad_keys)> 1 else ''}: {', '.join(bad_keys)}",
                step_keys=bad_keys,
            )

        executable_map, resolvable_map = _compute_step_maps(
            self.step_dict,
            selected_handles,
            self.known_state,
        )

        # Computed from the subset's executable map, not the parent plan's.
        artifacts_persisted = _compute_artifacts_persisted(
            self.step_dict,
            selected_handles,
            pipeline_def,
            environment_config,
            executable_map,
        )

        return ExecutionPlan(
            self.step_dict,
            executable_map,
            resolvable_map,
            selected_handles,
            self.known_state,
            artifacts_persisted,
        )
Example #4
0
    def build_subset_plan(self,
                          step_keys_to_execute: List[str]) -> "ExecutionPlan":
        """Build a new plan that executes only the given step keys.

        A key naming a resolved dynamic step that is not yet in
        ``self.step_dict`` is resolved on the fly, provided its unresolved
        form is in the step dict.

        Args:
            step_keys_to_execute: String keys of the steps to execute.

        Returns:
            An ExecutionPlan constructed from this plan's pipeline, step
            dict and environment config, with the parsed handles as the
            steps to execute.

        Raises:
            DagsterExecutionStepNotFoundError: If any key is neither in the
                step dict nor resolvable from an unresolved step in it.
        """
        check.list_param(step_keys_to_execute,
                         "step_keys_to_execute",
                         of_type=str)

        handles_to_run = [
            StepHandle.parse_from_key(step_key)
            for step_key in step_keys_to_execute
        ]

        bad_keys = []
        for handle in handles_to_run:
            if handle in self.step_dict:
                continue  # already known, nothing to do

            resolvable = (isinstance(handle, ResolvedFromDynamicStepHandle)
                          and handle.unresolved_form in self.step_dict)
            if not resolvable:
                bad_keys.append(handle.to_key())
                continue

            pending_step = cast(UnresolvedExecutionStep,
                                self.step_dict[handle.unresolved_form])
            resolver_key = pending_step.resolved_by_step_key
            mapping_keys_by_output = {
                pending_step.resolved_by_output_name: [handle.mapping_key]
            }
            # self.step_dict is updated as a side effect of resolve(); the
            # invariant below verifies that this actually happened.
            self.resolve(resolver_key, mapping_keys_by_output)
            check.invariant(
                handle in self.step_dict,
                f"Handle did not resolve as expected, not found in step dict {handle}",
            )

        if bad_keys:
            raise DagsterExecutionStepNotFoundError(
                f"Can not build subset plan from unknown step{'s' if len(bad_keys)> 1 else ''}: {', '.join(bad_keys)}",
                step_keys=bad_keys,
            )

        return ExecutionPlan(
            self.pipeline,
            self.step_dict,
            handles_to_run,
            self.environment_config,
        )
Example #5
0
    def rebuild_from_snapshot(pipeline_name, execution_plan_snapshot):
        """Reconstruct an ExecutionPlan from an execution plan snapshot.

        Args:
            pipeline_name: Pipeline name passed to every rebuilt step.
            execution_plan_snapshot: Snapshot supplying the steps, the step
                keys to execute, the initial known state and the
                artifacts-persisted value.

        Returns:
            An ExecutionPlan whose step dict is rebuilt from the snapshot's
            steps and whose step maps are recomputed from it.

        Raises:
            DagsterInvariantViolationError: If the snapshot reports that it
                cannot be used to reconstruct a plan.
            Exception: If a step snapshot has a kind other than COMPUTE,
                UNRESOLVED_MAPPED or UNRESOLVED_COLLECT.
        """
        if not execution_plan_snapshot.can_reconstruct_plan:
            raise DagsterInvariantViolationError(
                "Tried to reconstruct an old ExecutionPlanSnapshot that was created before snapshots "
                "had enough information to fully reconstruct the ExecutionPlan"
            )

        step_dict = {}

        for step_snap in execution_plan_snapshot.steps:
            rebuilt_inputs = [
                ExecutionPlan.rebuild_step_input(input_snap)
                for input_snap in step_snap.inputs
            ]
            rebuilt_outputs = [
                StepOutput(
                    output_snap.solid_handle,
                    output_snap.name,
                    output_snap.dagster_type_key,
                    output_snap.properties,
                ) for output_snap in step_snap.outputs
            ]

            # All three step classes take the same constructor arguments, so
            # choose the class first and construct once.
            snap_kind = step_snap.kind
            if snap_kind == StepKind.COMPUTE:
                step_cls = ExecutionStep
            elif snap_kind == StepKind.UNRESOLVED_MAPPED:
                step_cls = UnresolvedMappedExecutionStep
            elif snap_kind == StepKind.UNRESOLVED_COLLECT:
                step_cls = UnresolvedCollectExecutionStep
            else:
                raise Exception(f"Unexpected step kind {str(step_snap.kind)}")

            rebuilt_step = step_cls(
                step_snap.step_handle,
                pipeline_name,
                rebuilt_inputs,
                rebuilt_outputs,
                step_snap.tags,
            )
            step_dict[rebuilt_step.handle] = rebuilt_step

        handles_to_run = [
            StepHandle.parse_from_key(step_key)
            for step_key in execution_plan_snapshot.step_keys_to_execute
        ]
        known_state = execution_plan_snapshot.initial_known_state

        executable_map, resolvable_map = _compute_step_maps(
            step_dict,
            handles_to_run,
            known_state,
        )

        return ExecutionPlan(
            step_dict,
            executable_map,
            resolvable_map,
            handles_to_run,
            known_state,
            execution_plan_snapshot.artifacts_persisted,
        )
Example #6
0
 def has_step(self, key):
     """Return whether ``key`` names a step in this plan's step index.

     A key that parses to a resolved dynamic handle is looked up by the key
     of its unresolved form; any other key is looked up as given.
     """
     check.str_param(key, "key")
     parsed = StepHandle.parse_from_key(key)
     lookup_key = (
         parsed.unresolved_form.to_key()
         if isinstance(parsed, ResolvedFromDynamicStepHandle)
         else key
     )
     return lookup_key in self._step_index
Example #7
0
def get_retry_steps_from_execution_plan(instance, execution_plan, parent_run_id):
    """Compute the step keys to re-execute when retrying from a parent run.

    Args:
        instance: DagsterInstance used to load the parent run and its logs.
        execution_plan: ExternalExecutionPlan providing the topological step
            order and the execution dependencies.
        parent_run_id: Optional id of the run being retried.

    Returns:
        ``execution_plan.step_keys_in_plan`` when no parent run id is given;
        otherwise a list of step keys covering steps that failed, were
        interrupted, or are missing from the parent run's logs, plus the
        steps downstream of them.
    """
    check.inst_param(instance, "instance", DagsterInstance)
    check.inst_param(execution_plan, "execution_plan", ExternalExecutionPlan)
    check.opt_str_param(parent_run_id, "parent_run_id")

    if not parent_run_id:
        return execution_plan.step_keys_in_plan

    parent_run = instance.get_run_by_id(parent_run_id)
    parent_run_logs = instance.all_logs(parent_run_id)

    # Each tracker maps:
    # * step_key -> set(step_handle) in the normal case
    # * unresolved_step_key -> set(resolved_step_handle, ...) for dynamic outputs
    seen = defaultdict(set)
    failed = defaultdict(set)
    succeeded = defaultdict(set)
    interrupted = defaultdict(set)
    skipped = defaultdict(set)

    # Event types that mark a step as finished, each paired with its tracker.
    finished_trackers = (
        (DagsterEventType.STEP_FAILURE, failed),
        (DagsterEventType.STEP_SUCCESS, succeeded),
        (DagsterEventType.STEP_SKIPPED, skipped),
    )

    for record in parent_run_logs:
        event = record.dagster_event
        if not (event and event.step_handle):
            continue

        logged_handle = event.step_handle
        _update_tracking_dict(seen, logged_handle)

        for event_type, tracker in finished_trackers:
            if record.dagster_event_type == event_type:
                _update_tracking_dict(tracker, logged_handle)

    # A step that appears in the logs without a failure, success or skip
    # event is treated as interrupted.
    for logged_handles in seen.values():
        for logged_handle in logged_handles:
            finished = any(
                _in_tracking_dict(logged_handle, tracker)
                for _, tracker in finished_trackers
            )
            if not finished:
                _update_tracking_dict(interrupted, logged_handle)

    to_retry = defaultdict(set)

    execution_deps = execution_plan.execution_deps()
    for step_snap in execution_plan.topological_steps():
        step_key = step_snap.key
        plan_handle = StepHandle.parse_from_key(step_key)

        # Skip steps outside the parent run's own step selection, if it had one.
        if parent_run.step_keys_to_execute and step_key not in parent_run.step_keys_to_execute:
            continue

        # Interrupted steps can occur when graceful cleanup from a step failure
        # fails to run, and a step failure event is not generated.
        for tracker in (failed, interrupted):
            if step_key in tracker:
                to_retry[step_key].update(tracker[step_key])

        # Missing steps did not execute, e.g. when a run was terminated.
        if step_key not in seen:
            to_retry[step_key].add(plan_handle)

        # Any dependency already scheduled for retry makes this step
        # downstream of a retried step.
        retrying_dep_keys = execution_deps[step_key].intersection(to_retry.keys())
        for retrying_key in retrying_dep_keys:
            upstream_handles = to_retry[retrying_key]

            # If this step and its ancestor are both downstream of a dynamic
            # output, add resolved instances of this step for the retrying
            # mapping keys.
            fan_out = isinstance(plan_handle, UnresolvedStepHandle) and all(
                isinstance(upstream, ResolvedFromDynamicStepHandle)
                for upstream in upstream_handles
            )
            if fan_out:
                for upstream in upstream_handles:
                    to_retry[step_key].add(plan_handle.resolve(upstream.mapping_key))
            else:
                to_retry[step_key].add(plan_handle)

    retry_keys = []
    for retry_handles in to_retry.values():
        retry_keys.extend(retry_handle.to_key() for retry_handle in retry_handles)
    return retry_keys
Example #8
0
def get_retry_steps_from_parent_run(
    instance,
    parent_run_id: str = None,
    parent_run: PipelineRun = None
) -> Tuple[List[str], Optional[KnownExecutionState]]:
    """Compute the step keys to re-execute when retrying from a parent run.

    Exactly one of ``parent_run_id`` and ``parent_run`` must be provided.

    Args:
        instance: DagsterInstance used to load the run, its logs and its
            execution plan snapshot.
        parent_run_id: Id of the run being retried.
        parent_run: The run being retried.

    Returns:
        A pair ``(steps_to_retry, known_state)``: the step keys to retry and
        the result of ``KnownExecutionState.for_reexecution`` called with the
        parent run's logs and those keys.

    Raises:
        DagsterExecutionPlanSnapshotNotFoundError: If the parent run's
            execution plan snapshot cannot be loaded.
    """
    check.inst_param(instance, "instance", DagsterInstance)

    check.invariant(
        bool(parent_run_id) != bool(parent_run),
        "Must provide one of parent_run_id or parent_run")
    check.opt_str_param(parent_run_id, "parent_run_id")
    check.opt_inst_param(parent_run, "parent_run", PipelineRun)

    if not parent_run:
        parent_run = instance.get_run_by_id(parent_run_id)
    parent_run_id = parent_run.run_id
    parent_run_logs = instance.all_logs(parent_run_id)

    execution_plan_snapshot = instance.get_execution_plan_snapshot(
        parent_run.execution_plan_snapshot_id)
    if not execution_plan_snapshot:
        raise DagsterExecutionPlanSnapshotNotFoundError(
            f"Could not load execution plan snapshot for run {parent_run_id}")

    execution_plan = ExternalExecutionPlan(
        execution_plan_snapshot=execution_plan_snapshot)

    # Each tracker maps:
    # * step_key -> set(step_handle) in the normal case
    # * unresolved_step_key -> set(resolved_step_handle, ...) for dynamic outputs
    logged: Dict[str, set] = defaultdict(set)
    failed: Dict[str, set] = defaultdict(set)
    succeeded: Dict[str, set] = defaultdict(set)
    interrupted: Dict[str, set] = defaultdict(set)
    skipped: Dict[str, set] = defaultdict(set)

    # Event types that mark a step as finished, each paired with its tracker.
    finished_trackers = (
        (DagsterEventType.STEP_FAILURE, failed),
        (DagsterEventType.STEP_SUCCESS, succeeded),
        (DagsterEventType.STEP_SKIPPED, skipped),
    )

    for record in parent_run_logs:
        event = record.dagster_event
        if not (event and event.step_handle):
            continue

        logged_handle = event.step_handle
        _update_tracking_dict(logged, logged_handle)

        for event_type, tracker in finished_trackers:
            if record.dagster_event_type == event_type:
                _update_tracking_dict(tracker, logged_handle)

    # A step that appears in the logs without a failure, success or skip
    # event is treated as interrupted.
    for logged_handles in logged.values():
        for logged_handle in logged_handles:
            finished = any(
                _in_tracking_dict(logged_handle, tracker)
                for _, tracker in finished_trackers)
            if not finished:
                _update_tracking_dict(interrupted, logged_handle)

    to_retry = defaultdict(set)

    execution_deps = execution_plan.execution_deps()
    for step_snap in execution_plan.topological_steps():
        step_key = step_snap.key
        plan_handle = StepHandle.parse_from_key(step_key)

        # Skip steps outside the parent run's own step selection, if it had one.
        if parent_run.step_keys_to_execute and step_key not in parent_run.step_keys_to_execute:
            continue

        # Interrupted steps can occur when graceful cleanup from a step failure
        # fails to run, and a step failure event is not generated.
        for tracker in (failed, interrupted):
            if step_key in tracker:
                to_retry[step_key].update(tracker[step_key])

        # Missing steps did not execute, e.g. when a run was terminated.
        if step_key not in logged:
            to_retry[step_key].add(plan_handle)

        # Any dependency already scheduled for retry makes this step
        # downstream of a retried step.
        retrying_dep_keys = execution_deps[step_key].intersection(
            to_retry.keys())
        for retrying_key in retrying_dep_keys:
            upstream_handles = to_retry[retrying_key]

            # If this step and its ancestor are both downstream of a dynamic
            # output, add resolved instances of this step for the retrying
            # mapping keys.
            fan_out = isinstance(plan_handle, UnresolvedStepHandle) and all(
                isinstance(upstream, ResolvedFromDynamicStepHandle)
                for upstream in upstream_handles)
            if fan_out:
                for upstream in upstream_handles:
                    to_retry[step_key].add(
                        plan_handle.resolve(upstream.mapping_key))
            else:
                to_retry[step_key].add(plan_handle)

    steps_to_retry = []
    for retry_handles in to_retry.values():
        steps_to_retry.extend(
            retry_handle.to_key() for retry_handle in retry_handles)

    known_state = KnownExecutionState.for_reexecution(parent_run_logs,
                                                      steps_to_retry)
    return steps_to_retry, known_state