Beispiel #1
0
def _recover_workflow_step(input_object_refs: List[ray.ObjectRef],
                           input_workflows: List[Any],
                           input_workflow_refs: List[WorkflowRef],
                           instant_workflow_inputs: Dict[int, StepID]):
    """Re-run an unfinished step from its stored function body and arguments.

    Args:
        input_object_refs: The object refs in the argument of the (original)
            step.
        input_workflows: The workflows in the argument of the (original)
            step, already resolved into physical objects (i.e. the output of
            the workflows). They come from other recover workflows that are
            constructed recursively. This list is updated in place.
        input_workflow_refs: The workflow refs handed to the storage when the
            step arguments are loaded.
        instant_workflow_inputs: Maps an index of 'input_workflows' to the ID
            of a step that has an output checkpoint. The loaded checkpoint
            overrides the entry of 'input_workflows' at that index.

    Returns:
        The output of the recovered step.
    """
    storage = workflow_storage.get_workflow_storage()

    # Entries backed by a checkpoint take precedence over whatever was
    # passed in at the same position.
    for position, checkpointed_step_id in instant_workflow_inputs.items():
        input_workflows[position] = storage.load_step_output(
            checkpointed_step_id)

    current_step_id = workflow_context.get_current_step_id()
    step_func: Callable = storage.load_step_func_body(current_step_id)
    positional, keyword = storage.load_step_args(
        current_step_id, input_workflows, input_object_refs,
        input_workflow_refs)
    return step_func(*positional, **keyword)
Beispiel #2
0
def _resolve_dynamic_workflow_refs(workflow_refs: "List[WorkflowRef]"):
    """Resolve workflow refs into the outputs of their steps at runtime.

    For every ref the output is looked up in this order:
    1. Ask the workflow manager for the cached step output and fetch the
       physical output object.
    2. If that did not yield an object, try to load the output from the
       workflow storage.
    3. If loading from storage fails too, resume the step and resolve the
       persisted output of the resumed step.

    Args:
        workflow_refs: The workflow refs to resolve.

    Returns:
        A list with one output per ref, in the order of 'workflow_refs'.
    """
    manager = get_or_create_management_actor()
    step_context = workflow_context.get_workflow_step_context()
    workflow_id = step_context.workflow_id
    storage_url = step_context.storage_url

    resolved_outputs = []
    for ref in workflow_refs:
        cached_ref = ray.get(
            manager.get_cached_step_output.remote(workflow_id, ref.step_id))

        need_fallback = True
        if cached_ref is not None:
            try:
                output, _ = _resolve_object_ref(cached_ref)
            except Exception:
                # The cached object could not be fetched; use the fallbacks.
                pass
            else:
                need_fallback = False

        if need_fallback:
            storage = workflow_storage.get_workflow_storage()
            try:
                output = storage.load_step_output(ref.step_id)
            except Exception:
                running_step_id = workflow_context.get_current_step_id()
                logger.warning(
                    "Failed to get the output of step "
                    f"{ref.step_id}. Trying to resume it. "
                    f"Current step: '{running_step_id}'"
                )
                resumed = recovery.resume_workflow_step(
                    workflow_id, ref.step_id, storage_url, None
                )
                output, _ = _resolve_object_ref(resumed.persisted_output)

        resolved_outputs.append(output)
    return resolved_outputs
Beispiel #3
0
def _recover_workflow_step(args: List[Any], kwargs: Dict[str, Any],
                           input_workflows: List[Any],
                           input_workflow_refs: List[WorkflowRef]):
    """Re-run an unfinished step using its function body from storage.

    Args:
        args: The positional arguments for the step function.
        kwargs: The keyword args for the step function.
        input_workflows: The workflows in the argument of the (original)
            step. Not read by the body of this function.
            NOTE(review): presumably accepted so that they are resolved
            before this step runs -- confirm against the caller.
        input_workflow_refs: Not read by the body of this function.

    Returns:
        The output of the recovered step.
    """
    storage = workflow_storage.get_workflow_storage()
    current_step_id = workflow_context.get_current_step_id()
    step_func: Callable = storage.load_step_func_body(current_step_id)
    output = step_func(*args, **kwargs)
    return output
Beispiel #4
0
def _workflow_step_executor(
    func: Callable,
    context: "WorkflowStepContext",
    step_id: "StepID",
    baked_inputs: "_BakedWorkflowInputs",
    runtime_options: "WorkflowStepRuntimeOptions",
) -> Tuple[Any, Any]:
    """Run a single workflow step and save its outputs.

    Args:
        func: The workflow step function.
        context: Workflow step context. Used to access correct storage etc.
        step_id: ID of the step.
        baked_inputs: The processed inputs for the step.
        runtime_options: Parameters for workflow step execution.

    Returns:
        A (persisted_output, volatile_output) pair.

    Raises:
        Exception: Whatever is raised while running the step; it is
            committed to storage for the step and then re-raised.
        TypeError: If a readonly virtual actor method returned a Workflow.
    """
    # Part 1: install the context for this step and read it back
    workflow_context.update_workflow_step_context(context, step_id)
    context = workflow_context.get_workflow_step_context()
    step_type = runtime_options.step_type

    # Part 2: resolve inputs
    args, kwargs = baked_inputs.resolve()

    # Part 3: run the step between its pre-run and post-run metadata
    store = workflow_storage.get_workflow_storage()
    try:
        prerun_metadata = {"start_time": time.time()}
        store.save_step_prerun_metadata(step_id, prerun_metadata)
        outputs = _wrap_run(func, runtime_options, *args, **kwargs)
        persisted_output, volatile_output = outputs
        postrun_metadata = {"end_time": time.time()}
        store.save_step_postrun_metadata(step_id, postrun_metadata)
    except Exception as e:
        commit_step(store, step_id, None, exception=e)
        raise e

    # Part 4: save outputs
    if step_type == StepType.READONLY_ACTOR_METHOD:
        # Readonly methods commit nothing; only validate what came back.
        if isinstance(volatile_output, Workflow):
            raise TypeError(
                "Returning a Workflow from a readonly virtual actor "
                "is not allowed.")
        assert not isinstance(persisted_output, Workflow)
    else:
        store = workflow_storage.get_workflow_storage()
        commit_step(store, step_id, persisted_output, exception=None)
        if isinstance(persisted_output, Workflow):
            outer_most_step_id = context.outer_most_step_id
            if (step_type == StepType.FUNCTION
                    and not context.outer_most_step_id):
                # A function step returned a nested workflow and has no
                # outer step itself, so it becomes the outer most step of
                # the nested workflow steps.
                outer_most_step_id = workflow_context.get_current_step_id()
            assert volatile_output is None
            # Execute the sub-workflow with "outer_most_step_id" passed down
            # so inner nested steps see the same outer most step.
            forked_context = workflow_context.fork_workflow_step_context(
                outer_most_step_id=outer_most_step_id)
            with forked_context:
                sub_result = execute_workflow(persisted_output)
            # When a virtual actor returns a workflow in the method, both
            # outputs are taken from the sub-workflow result.
            persisted_output = sub_result.persisted_output
            volatile_output = sub_result.volatile_output
        elif context.last_step_of_workflow:
            # advance the progress of the workflow
            store.advance_progress(step_id)
        _record_step_status(step_id, WorkflowStatus.SUCCESSFUL)

    logger.info(get_step_status_info(WorkflowStatus.SUCCESSFUL))
    if isinstance(volatile_output, Workflow):
        # A step method was called inside the virtual actor; run it to get
        # the final result.
        assert step_type == StepType.ACTOR_METHOD
        volatile_output = volatile_output.run_async(
            workflow_context.get_current_workflow_id())
    return persisted_output, volatile_output
Beispiel #5
0
def _wrap_run(func: Callable, runtime_options: "WorkflowStepRuntimeOptions",
              *args, **kwargs) -> Tuple[Any, Any]:
    """Call the step function with retries and split its result.

    The result has two parts, persisted_output (p-out) and volatile_output
    (v-out). P-out is the part of result to persist in a storage and pass to
    the next step. V-out is the part of result to return to the user but does
    not require persistence.

    This table describes their relationships

    +-----------------------------+-------+--------+----------------------+
    | Step Type                   | p-out | v-out  | catch exception into |
    +-----------------------------+-------+--------+----------------------+
    | Function Step               | Y     | N      | p-out                |
    +-----------------------------+-------+--------+----------------------+
    | Virtual Actor Step          | Y     | Y      | v-out                |
    +-----------------------------+-------+--------+----------------------+
    | Readonly Virtual Actor Step | N     | Y      | v-out                |
    +-----------------------------+-------+--------+----------------------+

    Args:
        func: The function body.
        runtime_options: Step execution params. This function reads
            'max_retries', 'step_type' and 'catch_exceptions' from it.
        *args: Positional arguments forwarded to 'func'.
        **kwargs: Keyword arguments forwarded to 'func'.

    Returns:
        A (persisted_output, volatile_output) pair.

    Raises:
        BaseException: The error of the last attempt, when every attempt
            failed and 'catch_exceptions' is off.
        ValueError: If the step type is not one of the handled ones.
    """
    error = None
    outcome = None
    # 'max_retries' bounds the number of attempts for application level
    # failures, i.e. errors raised by 'func' itself.
    # NOTE(review): if 'max_retries' is 0 the loop body never runs, so 'func'
    # is not called and both 'outcome' and 'error' stay None.
    for index in range(runtime_options.max_retries):
        attempt = index + 1
        logger.info(f"{get_step_status_info(WorkflowStatus.RUNNING)}"
                    f"\t[{attempt}/{runtime_options.max_retries}]")
        try:
            outcome = func(*args, **kwargs)
        except BaseException as e:
            if attempt == runtime_options.max_retries:
                retry_msg = "Maximum retry reached, stop retry."
            else:
                retry_msg = "The step will be retried."
            logger.error(
                f"{workflow_context.get_name()} failed with error message"
                f" {e}. {retry_msg}")
            error = e
        else:
            error = None
            break

    step_type = runtime_options.step_type
    if runtime_options.catch_exceptions:
        if step_type == StepType.FUNCTION:
            if not isinstance(outcome, Workflow):
                return (outcome, error), None
            # When it returns a nested workflow, catch_exception
            # should be passed recursively.
            assert error is None
            outcome.data.step_options.catch_exceptions = True
            return outcome, None
        if step_type == StepType.ACTOR_METHOD:
            # virtual actors do not persist exception
            # NOTE(review): when every attempt failed, 'outcome' is still
            # None here, so the indexing below raises TypeError.
            return outcome[0], (outcome[1], error)
        if step_type == StepType.READONLY_ACTOR_METHOD:
            return None, (outcome, error)
        raise ValueError(f"Unknown StepType '{step_type}'")

    if error is not None:
        if step_type != StepType.READONLY_ACTOR_METHOD:
            # Readonly actor methods do not get a FAILED status recorded.
            status = WorkflowStatus.FAILED
            _record_step_status(workflow_context.get_current_step_id(),
                                status)
            logger.info(get_step_status_info(status))
        raise error

    if step_type == StepType.FUNCTION:
        return outcome, None
    if step_type == StepType.ACTOR_METHOD:
        persisted_output, volatile_output = outcome
        return persisted_output, volatile_output
    if step_type == StepType.READONLY_ACTOR_METHOD:
        return None, outcome
    raise ValueError(f"Unknown StepType '{step_type}'")
Beispiel #6
0
def _workflow_step_executor(step_type: StepType, func: Callable,
                            context: workflow_context.WorkflowStepContext,
                            step_id: "StepID",
                            baked_inputs: "_BakedWorkflowInputs",
                            catch_exceptions: bool, max_retries: int) -> Any:
    """Run a single workflow step and save its outputs.

    Args:
        step_type: The type of workflow step.
        func: The workflow step function.
        context: Workflow step context. Used to access correct storage etc.
        step_id: The ID of the step.
        baked_inputs: The processed inputs for the step.
        catch_exceptions: If set to be true, return
            (Optional[Result], Optional[Error]) instead of Result.
        max_retries: Max number of retries encounter of a failure.

    Returns:
        A (persisted_output, volatile_output) pair.

    Raises:
        Exception: Whatever is raised while running the step; it is
            committed to storage for the step and then re-raised.
        TypeError: If a readonly virtual actor method returned a Workflow.
    """
    workflow_context.update_workflow_step_context(context, step_id)
    args, kwargs = _resolve_step_inputs(baked_inputs)

    store = workflow_storage.get_workflow_storage()
    try:
        outputs = _wrap_run(func, step_type, step_id, catch_exceptions,
                            max_retries, *args, **kwargs)
        persisted_output, volatile_output = outputs
    except Exception as e:
        commit_step(store, step_id, None, e)
        raise e

    if step_type == StepType.READONLY_ACTOR_METHOD:
        # Readonly methods commit nothing; only validate what came back.
        if isinstance(volatile_output, Workflow):
            raise TypeError(
                "Returning a Workflow from a readonly virtual actor "
                "is not allowed.")
        assert not isinstance(persisted_output, Workflow)
    else:
        store = workflow_storage.get_workflow_storage()
        commit_step(store, step_id, persisted_output, None)
        outer_most_step_id = context.outer_most_step_id
        if isinstance(persisted_output, Workflow):
            if (step_type == StepType.FUNCTION
                    and not context.outer_most_step_id):
                # A function step returned a nested workflow and has no
                # outer step itself, so it becomes the outer most step of
                # the nested workflow steps.
                outer_most_step_id = workflow_context.get_current_step_id()
            assert volatile_output is None
            # Execute the sub-workflow with "outer_most_step_id" passed down
            # so inner nested steps see the same outer most step.
            forked_context = workflow_context.fork_workflow_step_context(
                outer_most_step_id=outer_most_step_id)
            with forked_context:
                sub_result = execute_workflow(persisted_output)
            # When a virtual actor returns a workflow in the method, both
            # outputs are taken from the sub-workflow result.
            persisted_output = sub_result.persisted_output
            volatile_output = sub_result.volatile_output
        elif context.last_step_of_workflow:
            # advance the progress of the workflow
            store.advance_progress(step_id)
        _record_step_status(step_id, WorkflowStatus.SUCCESSFUL)

    logger.info(get_step_status_info(WorkflowStatus.SUCCESSFUL))
    if isinstance(volatile_output, Workflow):
        # A step method was called inside the virtual actor; run it to get
        # the final result.
        assert step_type == StepType.ACTOR_METHOD
        volatile_output = volatile_output.run_async(
            workflow_context.get_current_workflow_id())
    return persisted_output, volatile_output
Beispiel #7
0
def _workflow_step_executor(
    func: Callable,
    context: "WorkflowStepContext",
    step_id: "StepID",
    baked_inputs: "_BakedWorkflowInputs",
    runtime_options: "WorkflowStepRuntimeOptions",
    inplace: bool = False,
) -> Tuple[Any, Any]:
    """Executor function for workflow step.

    Resolves the inputs, runs the step function, optionally commits the
    output, and then either executes or hands back a returned sub-workflow.

    Args:
        func: The workflow step function.
        context: Workflow step context. Used to access correct storage etc.
        step_id: ID of the step.
        baked_inputs: The processed inputs for the step.
        runtime_options: Parameters for workflow step execution.
        inplace: Execute the workflow inplace. When true and the step
            returns a sub-workflow, the sub-workflow is not executed here;
            it is returned wrapped in an InplaceReturnedWorkflow.

    Returns:
        A (persisted_output, volatile_output) pair. On the inplace path the
        pair is (InplaceReturnedWorkflow, None).

    Raises:
        Exception: Whatever is raised while running the step; it is
            committed to storage for the step and then re-raised.
        TypeError: If a readonly virtual actor method returned a Workflow.
    """
    # Part 1: update the context for the step
    workflow_context.update_workflow_step_context(context, step_id)
    # Rebind 'context' to whatever the workflow context now reports.
    context = workflow_context.get_workflow_step_context()
    step_type = runtime_options.step_type
    # Copy this step's checkpoint option onto the context.
    context.checkpoint_context.checkpoint = runtime_options.checkpoint

    # Part 2: resolve inputs
    args, kwargs = baked_inputs.resolve()

    # Part 3: execute the step
    store = workflow_storage.get_workflow_storage()
    try:
        # The run is bracketed by pre-run and post-run metadata that record
        # the start and end time.
        step_prerun_metadata = {"start_time": time.time()}
        store.save_step_prerun_metadata(step_id, step_prerun_metadata)
        with workflow_context.workflow_execution():
            persisted_output, volatile_output = _wrap_run(
                func, runtime_options, *args, **kwargs)
        step_postrun_metadata = {"end_time": time.time()}
        store.save_step_postrun_metadata(step_id, step_postrun_metadata)
    except Exception as e:
        # Always checkpoint the exception.
        commit_step(store, step_id, None, exception=e)
        raise e

    # Part 4: save outputs
    if step_type == StepType.READONLY_ACTOR_METHOD:
        # Readonly methods commit nothing; only validate what came back.
        if isinstance(volatile_output, Workflow):
            raise TypeError(
                "Returning a Workflow from a readonly virtual actor is not allowed."
            )
        assert not isinstance(persisted_output, Workflow)
    else:
        # TODO(suquark): Validate checkpoint options before
        # commit the step.
        store = workflow_storage.get_workflow_storage()
        # The output is committed here only when the checkpoint option
        # resolves to SYNC.
        if CheckpointMode(runtime_options.checkpoint) == CheckpointMode.SYNC:
            commit_step(
                store,
                step_id,
                persisted_output,
                exception=None,
            )
        if isinstance(persisted_output, Workflow):
            sub_workflow = persisted_output
            outer_most_step_id = context.outer_most_step_id
            assert volatile_output is None
            if step_type == StepType.FUNCTION:
                # Passing down outer most step so inner nested steps would
                # access the same outer most step.
                if not context.outer_most_step_id:
                    # The current workflow step returns a nested workflow, and
                    # there is no outer step for the current step. So the
                    # current step is the outer most step for the inner nested
                    # workflow steps.
                    outer_most_step_id = workflow_context.get_current_step_id()
            if inplace:
                _step_options = sub_workflow.data.step_options
                # Warn when the sub-workflow asks for Ray options that differ
                # from the caller's; steps of type WAIT are exempt.
                if (_step_options.step_type != StepType.WAIT
                        and runtime_options.ray_options !=
                        _step_options.ray_options):
                    logger.warning(
                        f"Workflow step '{sub_workflow.step_id}' uses "
                        f"a Ray option different to its caller step '{step_id}' "
                        f"and will be executed inplace. Ray assumes it still "
                        f"consumes the same resource as the caller. This may result "
                        f"in oversubscribing resources.")
                # Hand the sub-workflow back instead of executing it here.
                # This early return skips the status record and the success
                # log further down.
                return (
                    InplaceReturnedWorkflow(
                        sub_workflow,
                        {"outer_most_step_id": outer_most_step_id}),
                    None,
                )
            # Execute sub-workflow. Pass down "outer_most_step_id".
            with workflow_context.fork_workflow_step_context(
                    outer_most_step_id=outer_most_step_id):
                result = execute_workflow(sub_workflow)
            # When virtual actor returns a workflow in the method,
            # the volatile_output and persisted_output will be put together
            persisted_output = result.persisted_output
            volatile_output = result.volatile_output
        elif context.last_step_of_workflow:
            # advance the progress of the workflow
            store.advance_progress(step_id)
        _record_step_status(step_id, WorkflowStatus.SUCCESSFUL)
    logger.info(get_step_status_info(WorkflowStatus.SUCCESSFUL))
    if isinstance(volatile_output, Workflow):
        # This is the case where a step method is called in the virtual actor.
        # We need to run the method to get the final result.
        assert step_type == StepType.ACTOR_METHOD
        volatile_output = volatile_output.run_async(
            workflow_context.get_current_workflow_id())
    return persisted_output, volatile_output
Beispiel #8
0
def _wrap_run(func: Callable, runtime_options: "WorkflowStepRuntimeOptions",
              *args, **kwargs) -> Tuple[Any, Any]:
    """Call the step function with retries and shape its output.

    Args:
        func: The function body.
        runtime_options: Step execution params. This function reads
            'max_retries', 'step_type' and 'catch_exceptions' from it.
        *args: Positional arguments forwarded to 'func'.
        **kwargs: Keyword arguments forwarded to 'func'.

    Returns:
        With 'catch_exceptions' on: the returned Workflow itself, or a
        (result, exception) pair for any other result. With it off: the
        result of 'func'. Note that a single object is returned even though
        the annotation says Tuple[Any, Any].

    Raises:
        BaseException: The error of the last attempt, when the retries are
            used up and 'catch_exceptions' is off.
        ValueError: If the step type is not StepType.FUNCTION.
    """
    error = None
    outcome = None
    # 'max_retries' bounds the retries of application level failures, i.e.
    # errors raised by 'func' itself. A value of -1 is reported as "inf" and
    # never matches the retry counter, so retrying does not stop.
    retries_used = 0
    while True:
        if retries_used == 0:
            logger.info(f"{get_step_status_info(WorkflowStatus.RUNNING)}")
        else:
            if runtime_options.max_retries != -1:
                total_retries = runtime_options.max_retries
            else:
                total_retries = "inf"
            logger.info(f"{get_step_status_info(WorkflowStatus.RUNNING)}"
                        f"\tretries: [{retries_used}/{total_retries}]")
        try:
            outcome = func(*args, **kwargs)
        except BaseException as e:
            exhausted = retries_used == runtime_options.max_retries
            if exhausted:
                retry_msg = "Maximum retry reached, stop retry."
                error = e
            else:
                retry_msg = "The step will be retried."
                retries_used += 1
            logger.error(
                f"{workflow_context.get_name()} failed with error message"
                f" {e}. {retry_msg}")
            if exhausted:
                break
        else:
            error = None
            break

    step_type = runtime_options.step_type
    if runtime_options.catch_exceptions:
        if step_type == StepType.FUNCTION:
            if not isinstance(outcome, Workflow):
                return (outcome, error)
            # When it returns a nested workflow, catch_exception
            # should be passed recursively.
            assert error is None
            outcome.data.step_options.catch_exceptions = True
            return outcome
        raise ValueError(f"Unknown StepType '{step_type}'")

    if error is not None:
        status = WorkflowStatus.FAILED
        _record_step_status(workflow_context.get_current_step_id(), status)
        logger.info(get_step_status_info(status))
        raise error
    if step_type == StepType.FUNCTION:
        return outcome
    raise ValueError(f"Unknown StepType '{step_type}'")