Example #1
def _ensure_workflow_initialized() -> None:
    # NOTE: Trying to get the actor has a side effect: it initializes Ray with
    # default arguments. "init()" behaves differently: it assigns a temporary
    # storage. That is why we need to check "ray.is_initialized()" first.
    if not ray.is_initialized():
        init()
    else:
        try:
            workflow_access.get_management_actor()
        except ValueError:
            init()
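
A minimal sketch of how this guard is typically used before touching the management actor; get_manager_safely is a hypothetical helper name, not part of the module:

def get_manager_safely():
    # Hypothetical helper: run the guard first, then the actor lookup is safe.
    _ensure_workflow_initialized()
    return workflow_access.get_management_actor()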
Example #2
def list_all(
        status_filter: Set[WorkflowStatus]
) -> List[Tuple[str, WorkflowStatus]]:
    try:
        workflow_manager = get_management_actor()
    except ValueError:
        workflow_manager = None

    if workflow_manager is None:
        runnings = []
    else:
        runnings = ray.get(workflow_manager.list_running_workflow.remote())
    if WorkflowStatus.RUNNING in status_filter and len(status_filter) == 1:
        return [(r, WorkflowStatus.RUNNING) for r in runnings]

    runnings = set(runnings)
    # Here we don't have a workflow id, so use an empty one instead
    store = workflow_storage.get_workflow_storage("")
    ret = []
    for (k, s) in store.list_workflow():
        if s == WorkflowStatus.RUNNING and k not in runnings:
            s = WorkflowStatus.RESUMABLE
        if s in status_filter:
            ret.append((k, s))
    return ret
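
A hedged usage sketch for this internal helper; the import path of WorkflowStatus is an assumption about the module layout:

from ray.workflow.common import WorkflowStatus  # assumed import path

# Workflows the storage still marks RUNNING but no management actor is executing.
resumable = list_all({WorkflowStatus.RESUMABLE})
for workflow_id, status in resumable:
    print(workflow_id, status)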
Example #3
def cancel(workflow_id: str) -> None:
    """Cancel a workflow. Workflow checkpoints will still be saved in storage. To
       clean up saved checkpoints, see `workflow.delete()`.

    Args:
        workflow_id: The workflow to cancel.

    Examples:
        >>> from ray import workflow
        >>> some_job = ... # doctest: +SKIP
        >>> workflow_step = some_job.step() # doctest: +SKIP
        >>> output = workflow_step.run_async(workflow_id="some_job") # doctest: +SKIP
        >>> workflow.cancel(workflow_id="some_job") # doctest: +SKIP
        >>> assert [ # doctest: +SKIP
        ...     ("some_job", workflow.CANCELED)] == workflow.list_all()

    Returns:
        None

    """
    _ensure_workflow_initialized()
    if not isinstance(workflow_id, str):
        raise TypeError("workflow_id must be a string.")
    workflow_manager = workflow_access.get_management_actor()
    ray.get(workflow_manager.cancel_workflow.remote(workflow_id))
Example #4
def cancel(workflow_id: str) -> None:
    try:
        workflow_manager = get_management_actor()
        ray.get(workflow_manager.cancel_workflow.remote(workflow_id))
    except ValueError:
        wf_store = workflow_storage.get_workflow_storage(workflow_id)
        wf_store.save_workflow_meta(WorkflowMetaData(WorkflowStatus.CANCELED))
Example #5
def resume_all(with_failed: bool) -> List[Tuple[str, ray.ObjectRef]]:
    filter_set = {WorkflowStatus.RESUMABLE}
    if with_failed:
        filter_set.add(WorkflowStatus.FAILED)
    all_failed = list_all(filter_set)

    try:
        workflow_manager = get_management_actor()
    except Exception as e:
        raise RuntimeError("Failed to get management actor") from e

    job_id = ray.get_runtime_context().job_id.hex()
    reconstructed_refs = []
    reconstructed_workflows = []
    for wid, _ in all_failed:
        context = workflow_context.WorkflowStepContext(workflow_id=wid)
        reconstructed_refs.append(
            (context,
             workflow_manager.reconstruct_workflow.remote(job_id, context)))
    for context, ref in reconstructed_refs:
        try:
            ray.get(ref)  # make sure the workflow is already reconstructed
            reconstructed_workflows.append((
                context.workflow_id,
                workflow_manager.execute_workflow.remote(job_id, context),
            ))
        except Exception:
            # TODO(suquark): Here some workflows get resumed successfully while
            #  others fail, and the user has no idea about it, which is very weird.
            #  Maybe we should raise an exception here instead?
            logger.error(f"Failed to resume workflow {context.workflow_id}")

    return reconstructed_workflows
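
A short sketch of consuming the result of this variant; each entry pairs a workflow id with the ObjectRef returned by execute_workflow, and blocking on every ref is just for illustration:

# Hypothetical caller of the function above.
resumed = resume_all(with_failed=True)
for workflow_id, output_ref in resumed:
    print(workflow_id, ray.get(output_ref))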
Example #6
def get_output_async(workflow_id: str,
                     *,
                     name: Optional[str] = None) -> ray.ObjectRef:
    """Get the output of a running workflow asynchronously.

    Args:
        workflow_id: The workflow to get the output of.
        name: If set, fetch the output of the specific step instead of the
            output of the whole workflow.

    Returns:
        An object reference that can be used to retrieve the workflow task result.
    """
    _ensure_workflow_initialized()
    try:
        workflow_manager = workflow_access.get_management_actor()
    except ValueError as e:
        raise ValueError(
            "Failed to connect to the workflow management "
            "actor. The workflow could have already failed. You can use "
            "workflow.resume() to resume the workflow.") from e

    try:
        # check storage first
        wf_store = WorkflowStorage(workflow_id)
        tid = wf_store.inspect_output(name)
        if tid is not None:
            return workflow_access.load_step_output_from_storage.remote(
                workflow_id, name)
    except ValueError:
        pass

    return workflow_manager.get_output.remote(workflow_id, name)
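
A usage sketch, assuming a workflow was started earlier with run_async; "my_job" and "fetch_data" are placeholder names:

whole_ref = get_output_async("my_job")                    # output of the whole workflow
task_ref = get_output_async("my_job", name="fetch_data")  # output of a single task
result = ray.get(whole_ref)  # blocks until the workflow result is available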
Example #7
def resume_all(with_failed: bool) -> List[Tuple[str, ray.ObjectRef]]:
    filter_set = {WorkflowStatus.RESUMABLE}
    if with_failed:
        filter_set.add(WorkflowStatus.FAILED)
    all_failed = list_all(filter_set)
    try:
        workflow_manager = get_management_actor()
    except Exception as e:
        raise RuntimeError("Failed to get management actor") from e

    async def _resume_one(wid: str) -> Tuple[str, Optional[ray.ObjectRef]]:
        try:
            result: "WorkflowExecutionResult" = (
                await workflow_manager.run_or_resume.remote(wid)
            )
            obj = flatten_workflow_output(wid, result.persisted_output)
            return wid, obj
        except Exception:
            logger.error(f"Failed to resume workflow {wid}")
            return (wid, None)

    ret = workflow_storage.asyncio_run(
        asyncio.gather(*[_resume_one(wid) for (wid, _) in all_failed])
    )
    return [(wid, obj) for (wid, obj) in ret if obj is not None]
Example #8
def get_output(workflow_id: str, name: Optional[str]) -> ray.ObjectRef:
    """Get the output of a running workflow.
    See "api.get_output()" for details.
    """
    from ray.workflow.api import _ensure_workflow_initialized

    _ensure_workflow_initialized()

    try:
        workflow_manager = get_management_actor()
    except ValueError as e:
        raise ValueError(
            "Failed to connect to the workflow management "
            "actor. The workflow could have already failed. You can use "
            "workflow.resume() to resume the workflow.") from e

    try:
        # check storage first
        wf_store = workflow_storage.WorkflowStorage(workflow_id)
        tid = wf_store.inspect_output(name)
        if tid is not None:
            return load_step_output_from_storage.remote(workflow_id, name)
    except ValueError:
        pass

    return workflow_manager.get_output.remote(workflow_id, name)
Example #9
def _record_step_status(step_id: "StepID",
                        status: "WorkflowStatus",
                        outputs: Optional[List["ObjectRef"]] = None) -> None:
    # Avoid a mutable default argument; treat "no outputs" as an empty list.
    if outputs is None:
        outputs = []
    workflow_id = workflow_context.get_current_workflow_id()
    workflow_manager = get_management_actor()
    ray.get(
        workflow_manager.update_step_status.remote(workflow_id, step_id,
                                                   status, outputs))
Example #10
def cancel(workflow_id: str) -> None:
    try:
        workflow_manager = get_management_actor()
    except ValueError:
        wf_store = workflow_storage.get_workflow_storage(workflow_id)
        # TODO(suquark): Here we update workflow status "offline", so it is likely
        # thread-safe because there is no workflow management actor updating the
        # workflow concurrently. But we should be careful if we are going to
        # update more workflow status offline in the future.
        wf_store.update_workflow_status(WorkflowStatus.CANCELED)
        return
    ray.get(workflow_manager.cancel_workflow.remote(workflow_id))
Example #11
def get_status(workflow_id: str) -> Optional[WorkflowStatus]:
    try:
        workflow_manager = get_management_actor()
        running = ray.get(
            workflow_manager.is_workflow_running.remote(workflow_id))
    except Exception:
        running = False
    if running:
        return WorkflowStatus.RUNNING
    store = workflow_storage.get_workflow_storage(workflow_id)
    meta = store.load_workflow_meta()
    if meta is None:
        raise ValueError(f"No such workflow_id {workflow_id}")
    return meta.status
Example #12
def get_output(workflow_id: str, name: Optional[str]) -> ray.ObjectRef:
    """Get the output of a running workflow.
    See "api.get_output()" for details.
    """
    assert ray.is_initialized()
    try:
        workflow_manager = get_management_actor()
    except ValueError as e:
        raise ValueError(
            "Failed to connect to the workflow management "
            "actor. The workflow could have already failed. You can use "
            "workflow.resume() to resume the workflow.") from e
    output = ray.get(workflow_manager.get_output.remote(workflow_id, name))
    return flatten_workflow_output(workflow_id, output)
Example #13
def get_status(workflow_id: str) -> Optional[WorkflowStatus]:
    try:
        workflow_manager = get_management_actor()
        running = ray.get(workflow_manager.is_workflow_running.remote(workflow_id))
    except Exception:
        running = False
    if running:
        return WorkflowStatus.RUNNING
    store = workflow_storage.get_workflow_storage(workflow_id)
    meta = store.load_workflow_meta()
    if meta is None:
        raise WorkflowNotFoundError(workflow_id)
    if meta.status == WorkflowStatus.RUNNING:
        return WorkflowStatus.RESUMABLE
    return meta.status
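
A sketch of how a caller might react to the derived status; "etl_job" is a placeholder id and resume_async refers to the API shown in a later example:

status = get_status("etl_job")
if status == WorkflowStatus.RESUMABLE:
    # Storage said RUNNING but the management actor no longer tracks it
    # (e.g. after a crash), so the workflow can be resumed.
    resume_async("etl_job")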
Example #14
def list_all(
        status_filter: Set[WorkflowStatus]
) -> List[Tuple[str, WorkflowStatus]]:
    try:
        workflow_manager = get_management_actor()
    except ValueError:
        workflow_manager = None

    if workflow_manager is None:
        runnings = []
    else:
        runnings = ray.get(workflow_manager.list_running_workflow.remote())
    if WorkflowStatus.RUNNING in status_filter and len(status_filter) == 1:
        return [(r, WorkflowStatus.RUNNING) for r in runnings]

    runnings = set(runnings)
    # Here we don't have a workflow id, so use an empty one instead
    store = workflow_storage.get_workflow_storage("")

    # This is the tricky part: the status "RESUMABLE" comes from neither
    # the workflow management actor nor the storage. It is the status where
    # the storage says the workflow is running but the workflow management
    # actor is not running it. This usually happens after a sudden crash of
    # the whole Ray runtime or of the workflow management actor (e.g. due to
    # a cluster failure). So we include the 'RUNNING' status in the storage
    # filter to get "RESUMABLE" candidates.
    storage_status_filter = status_filter.copy()
    storage_status_filter.add(WorkflowStatus.RUNNING)
    status_from_storage = store.list_workflow(storage_status_filter)
    ret = []
    for (k, s) in status_from_storage:
        if s == WorkflowStatus.RUNNING:
            if k not in runnings:
                s = WorkflowStatus.RESUMABLE
            else:
                continue
        if s in status_filter:
            ret.append((k, s))
    if WorkflowStatus.RUNNING in status_filter:
        ret.extend((k, WorkflowStatus.RUNNING) for k in runnings)
    return ret
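
To make the comment above concrete, here is a tiny illustration of the RESUMABLE derivation with made-up ids:

# Made-up data, illustration only.
runnings = {"job_a"}                        # reported by the management actor
status_from_storage = [
    ("job_a", WorkflowStatus.RUNNING),      # actor agrees -> stays RUNNING
    ("job_b", WorkflowStatus.RUNNING),      # actor lost it -> becomes RESUMABLE
    ("job_c", WorkflowStatus.FAILED),       # terminal status passes through
]
# With status_filter = {WorkflowStatus.RESUMABLE, WorkflowStatus.FAILED},
# the loop above would return [("job_b", RESUMABLE), ("job_c", FAILED)].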
Example #15
def get_status(workflow_id: str) -> WorkflowStatus:
    """Get the status for a given workflow.

    Args:
        workflow_id: The workflow to query.

    Examples:
        >>> from ray import workflow
        >>> trip = ... # doctest: +SKIP
        >>> workflow_step = trip.step() # doctest: +SKIP
        >>> output = workflow_step.run(workflow_id="trip") # doctest: +SKIP
        >>> assert workflow.SUCCESSFUL == workflow.get_status("trip") # doctest: +SKIP

    Returns:
        The status of that workflow
    """
    _ensure_workflow_initialized()
    if not isinstance(workflow_id, str):
        raise TypeError("workflow_id must be a string.")
    workflow_manager = workflow_access.get_management_actor()
    return ray.get(workflow_manager.get_workflow_status.remote(workflow_id))
Example #16
def resume_async(workflow_id: str) -> ray.ObjectRef:
    """Resume a workflow asynchronously.

    Resume a workflow and retrieve its output. If the workflow was incomplete,
    it will be re-executed from its checkpointed outputs. If the workflow was
    complete, returns the result immediately.

    Examples:
        >>> from ray import workflow
        >>> start_trip = ... # doctest: +SKIP
        >>> trip = start_trip.step() # doctest: +SKIP
        >>> res1 = trip.run_async(workflow_id="trip1") # doctest: +SKIP
        >>> res2 = workflow.resume("trip1") # doctest: +SKIP
        >>> assert ray.get(res1) == ray.get(res2) # doctest: +SKIP

    Args:
        workflow_id: The id of the workflow to resume.

    Returns:
        An object reference that can be used to retrieve the workflow result.
    """
    _ensure_workflow_initialized()
    logger.info(f'Resuming workflow [id="{workflow_id}"].')
    workflow_manager = workflow_access.get_management_actor()
    if ray.get(
            workflow_manager.is_workflow_non_terminating.remote(workflow_id)):
        raise RuntimeError(
            f"Workflow '{workflow_id}' is already running or pending.")
    # NOTE: It is important to 'ray.get' the returned output. This
    # ensures the caller of 'run()' holds a reference to the workflow
    # result. Otherwise, if the actor drops its reference to the
    # workflow output, the caller may fail to resolve the result.
    job_id = ray.get_runtime_context().job_id.hex()

    context = workflow_context.WorkflowStepContext(workflow_id=workflow_id)
    ray.get(workflow_manager.reconstruct_workflow.remote(job_id, context))
    result = workflow_manager.execute_workflow.remote(job_id, context)
    logger.info(f"Workflow job {workflow_id} resumed.")
    return result
Example #17
def list_all(
        status_filter: Set[WorkflowStatus]
) -> List[Tuple[str, WorkflowStatus]]:
    try:
        workflow_manager = get_management_actor()
    except ValueError:
        workflow_manager = None

    if workflow_manager is None:
        runnings = []
    else:
        runnings = ray.get(workflow_manager.list_running_workflow.remote())
    if WorkflowStatus.RUNNING in status_filter and len(status_filter) == 1:
        return [(r, WorkflowStatus.RUNNING) for r in runnings]

    runnings = set(runnings)
    # Here we don't have a workflow id, so use an empty one instead
    store = workflow_storage.get_workflow_storage("")

    exclude_running = False
    if (WorkflowStatus.RESUMABLE in status_filter
            and WorkflowStatus.RUNNING not in status_filter):
        # Here we have to add "RUNNING" to the status filter, because some "RESUMABLE"
        # workflows are converted from "RUNNING" workflows below. Work on a copy so
        # the caller's set is not mutated.
        exclude_running = True
        status_filter = status_filter | {WorkflowStatus.RUNNING}
    status_from_storage = store.list_workflow(status_filter)
    ret = []
    for (k, s) in status_from_storage:
        if s == WorkflowStatus.RUNNING:
            if k not in runnings:
                s = WorkflowStatus.RESUMABLE
            elif exclude_running:
                continue
        if s in status_filter:
            ret.append((k, s))
    return ret
Example #18
def run_async(
    dag: DAGNode,
    *args,
    workflow_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> ray.ObjectRef:
    """Run a workflow asynchronously.

    If the workflow with the given id already exists, it will be resumed.

    Args:
        workflow_id: A unique identifier that can be used to resume the
            workflow. If not specified, a random id will be generated.
        metadata: The metadata to add to the workflow. It has to be
            JSON-serializable.

    Returns:
       The running result as ray.ObjectRef.

    """
    _ensure_workflow_initialized()
    if not isinstance(dag, DAGNode):
        raise TypeError("Input should be a DAG.")
    input_data = DAGInputData(*args, **kwargs)
    validate_user_metadata(metadata)
    metadata = metadata or {}

    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    state = workflow_state_from_dag(dag, input_data, workflow_id)
    logger.info(f'Workflow job created. [id="{workflow_id}"].')

    context = workflow_context.WorkflowStepContext(workflow_id=workflow_id)
    with workflow_context.workflow_step_context(context):
        # checkpoint the workflow
        ws = WorkflowStorage(workflow_id)
        ws.save_workflow_user_metadata(metadata)

        job_id = ray.get_runtime_context().job_id.hex()

        try:
            ws.get_entrypoint_step_id()
            wf_exists = True
        except Exception:
            # The workflow does not exist. We must checkpoint the entry workflow.
            ws.save_workflow_execution_state("", state)
            wf_exists = False
        workflow_manager = workflow_access.get_management_actor()
        if ray.get(
                workflow_manager.is_workflow_non_terminating.remote(
                    workflow_id)):
            raise RuntimeError(
                f"Workflow '{workflow_id}' is already running or pending.")
        if wf_exists:
            return resume_async(workflow_id)
        ignore_existing = ws.load_workflow_status() == WorkflowStatus.NONE
        ray.get(
            workflow_manager.submit_workflow.remote(
                workflow_id, state, ignore_existing=ignore_existing))
        return workflow_manager.execute_workflow.remote(job_id, context)
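
The docstring above has no usage example, so here is a minimal sketch under the DAG API; it assumes this function is exposed as workflow.run_async, and the task and workflow id are placeholders:

import ray
from ray import workflow

@ray.remote
def add(a, b):
    # Placeholder task for the sketch.
    return a + b

ref = workflow.run_async(add.bind(1, 2), workflow_id="add_example")
assert ray.get(ref) == 3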
Example #19
def workflow_state_from_dag(dag_node: DAGNode,
                            input_context: Optional[DAGInputData],
                            workflow_id: str):
    """
    Transform a Ray DAG to a workflow. Map FunctionNode to workflow step with
    the workflow decorator.

    Args:
        dag_node: The DAG to be converted to a workflow.
        input_context: The input data that wraps variables for the input node of the DAG.
        workflow_id: The ID of the workflow.
    """
    if not isinstance(dag_node, FunctionNode):
        raise TypeError(
            "Currently workflow does not support classes as DAG inputs.")

    state = WorkflowExecutionState()

    # TODO(suquark): remove this cyclic import later by changing the way of
    # task ID assignment.
    from ray.workflow.workflow_access import get_management_actor

    mgr = get_management_actor()
    context = workflow_context.get_workflow_step_context()

    def _node_visitor(node: Any) -> Any:
        if isinstance(node, FunctionNode):
            bound_options = node._bound_options.copy()
            num_returns = bound_options.get("num_returns", 1)
            if num_returns is None:  # ray could use `None` as default value
                num_returns = 1
            if num_returns > 1:
                raise ValueError("Workflow steps can only have one return.")

            workflow_options = bound_options.pop("_metadata",
                                                 {}).get(WORKFLOW_OPTIONS, {})

            # If checkpoint option is not specified, inherit checkpoint
            # options from context (i.e. checkpoint options of the outer
            # step). If it is still not specified, it's True by default.
            checkpoint = workflow_options.get("checkpoint", None)
            if checkpoint is None:
                checkpoint = context.checkpoint if context is not None else True
            # When it returns a nested workflow, catch_exception
            # should be passed recursively.
            catch_exceptions = workflow_options.get("catch_exceptions", None)
            if catch_exceptions is None:
                # TODO(suquark): should we also handle exceptions from a "leaf node"
                #   in the continuation? For example, we have a workflow
                #   > @ray.remote
                #   > def A(): pass
                #   > @ray.remote
                #   > def B(x): return x
                #   > @ray.remote
                #   > def C(x): return workflow.continuation(B.bind(A.bind()))
                #   > dag = C.options(**workflow.options(catch_exceptions=True)).bind()
                #   Should C catch exceptions from A?
                if node.get_stable_uuid() == dag_node.get_stable_uuid():
                    # 'catch_exception' context should be passed down to
                    # its direct continuation task.
                    # In this case, the direct continuation is the output node.
                    catch_exceptions = (context.catch_exceptions
                                        if context is not None else False)
                else:
                    catch_exceptions = False

            max_retries = bound_options.get("max_retries", 3)
            if not isinstance(max_retries, int) or max_retries < -1:
                raise ValueError(
                    "'max_retries' only accepts 0, -1 or a positive integer.")

            step_options = WorkflowStepRuntimeOptions(
                step_type=StepType.FUNCTION,
                catch_exceptions=catch_exceptions,
                max_retries=max_retries,
                allow_inplace=False,
                checkpoint=checkpoint,
                ray_options=bound_options,
            )

            workflow_refs: List[WorkflowRef] = []
            with serialization_context.workflow_args_serialization_context(
                    workflow_refs):
                _func_signature = signature.extract_signature(node._body)
                flattened_args = signature.flatten_args(
                    _func_signature, node._bound_args, node._bound_kwargs)
                # NOTE: When calling 'ray.put', we trigger python object
                # serialization. Under our serialization context,
                # Workflows are separated from the arguments,
                # leaving a placeholder object with all other python objects.
                # Then we put the placeholder object to object store,
                # so it won't be mutated later. This guarantees correct
                # semantics. See "tests/test_variable_mutable.py" as
                # an example.
                input_placeholder: ray.ObjectRef = ray.put(flattened_args)

            name = workflow_options.get("name")
            if name is None:
                name = f"{get_module(node._body)}.{slugify(get_qualname(node._body))}"
            task_id = ray.get(mgr.gen_step_id.remote(workflow_id, name))
            state.add_dependencies(task_id, [s.task_id for s in workflow_refs])
            state.task_input_args[task_id] = input_placeholder

            user_metadata = workflow_options.pop("metadata", {})
            validate_user_metadata(user_metadata)
            state.tasks[task_id] = Task(
                name=name,
                options=step_options,
                user_metadata=user_metadata,
                func_body=node._body,
            )
            return WorkflowRef(task_id)

        if isinstance(node, InputAttributeNode):
            return node._execute_impl()  # get data from input node
        if isinstance(node, InputNode):
            return input_context  # replace input node with input data
        if not isinstance(node, DAGNode):
            return node  # return normal objects
        raise TypeError(f"Unsupported DAG node: {node}")

    output_workflow_ref = dag_node.apply_recursive(_node_visitor)
    state.output_task_id = output_workflow_ref.task_id
    return state
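
A rough sketch of what the traversal does for a two-task DAG; it assumes workflow.init() has been called so the management actor used by gen_step_id exists, and it is an illustration rather than a supported entry point:

import ray
from ray import workflow

ray.init()
workflow.init()

@ray.remote
def load():
    return 1

@ray.remote
def double(x):
    return 2 * x

# The visitor walks the DAG bottom-up: each FunctionNode gets a task id from the
# management actor, double's dependency on load is recorded via add_dependencies,
# the flattened arguments are put into the object store, and output_task_id ends
# up pointing at double's task.
state = workflow_state_from_dag(double.bind(load.bind()), None, "sketch_workflow")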
Example #20
def list_all(
    status_filter: Optional[Union[Union[WorkflowStatus, str],
                                  Set[Union[WorkflowStatus, str]]]] = None
) -> List[Tuple[str, WorkflowStatus]]:
    """List all workflows matching a given status filter. When returning "RESUMEABLE"
    workflows, the workflows that was running ranks before the workflow that was pending
    in the result list.

    Args:
        status_filter: If given, only returns workflows with that status. This can
            be a single status or set of statuses. The string form of the
            status is also acceptable, i.e.,
            "RUNNING"/"FAILED"/"SUCCESSFUL"/"CANCELED"/"RESUMABLE"/"PENDING".
    Examples:
        >>> from ray import workflow
        >>> long_running_job = ... # doctest: +SKIP
        >>> workflow_step = long_running_job.step() # doctest: +SKIP
        >>> wf = workflow_step.run_async( # doctest: +SKIP
        ...     workflow_id="long_running_job")
        >>> jobs = workflow.list_all() # doctest: +SKIP
        >>> assert jobs == [ ("long_running_job", workflow.RUNNING) ] # doctest: +SKIP
        >>> ray.get(wf) # doctest: +SKIP
        >>> jobs = workflow.list_all({workflow.RUNNING}) # doctest: +SKIP
        >>> assert jobs == [] # doctest: +SKIP
        >>> jobs = workflow.list_all(workflow.SUCCESSFUL) # doctest: +SKIP
        >>> assert jobs == [ # doctest: +SKIP
        ...     ("long_running_job", workflow.SUCCESSFUL)]

    Returns:
        A list of (workflow_id, status) tuples.
    """
    _ensure_workflow_initialized()
    if isinstance(status_filter, str):
        status_filter = set({WorkflowStatus(status_filter)})
    elif isinstance(status_filter, WorkflowStatus):
        status_filter = set({status_filter})
    elif isinstance(status_filter, set):
        if all(isinstance(s, str) for s in status_filter):
            status_filter = {WorkflowStatus(s) for s in status_filter}
        elif not all(isinstance(s, WorkflowStatus) for s in status_filter):
            raise TypeError("status_filter contains element which is not"
                            " a type of `WorkflowStatus or str`."
                            f" {status_filter}")
    elif status_filter is None:
        status_filter = set(WorkflowStatus)
        status_filter.discard(WorkflowStatus.NONE)
    else:
        raise TypeError(
            "status_filter must be WorkflowStatus or a set of WorkflowStatus.")

    try:
        workflow_manager = workflow_access.get_management_actor()
    except ValueError:
        workflow_manager = None

    if workflow_manager is None:
        non_terminating_workflows = {}
    else:
        non_terminating_workflows = ray.get(
            workflow_manager.list_non_terminating_workflows.remote())

    ret = []
    if set(non_terminating_workflows.keys()).issuperset(status_filter):
        for status, workflows in non_terminating_workflows.items():
            if status in status_filter:
                for w in workflows:
                    ret.append((w, status))
        return ret

    ret = []
    # Here we don't have a workflow id, so use an empty one instead
    store = WorkflowStorage("")
    modified_status_filter = status_filter.copy()
    # Here we have to add the non-terminating statuses to the status filter, because
    # some "RESUMABLE" workflows are converted from non-terminating workflows below.
    # This is the tricky part: the status "RESUMABLE" comes from neither
    # the workflow management actor nor the storage. It is the status where
    # the storage says the workflow is non-terminating but the workflow management
    # actor is not running it. This usually happens after a sudden crash of
    # the whole Ray runtime or of the workflow management actor (e.g. due to a
    # cluster failure). So we include the non-terminating statuses in the storage
    # filter to get "RESUMABLE" candidates.
    modified_status_filter.update(WorkflowStatus.non_terminating_status())
    status_from_storage = store.list_workflow(modified_status_filter)
    non_terminating_workflows = {
        k: set(v)
        for k, v in non_terminating_workflows.items()
    }
    resume_running = []
    resume_pending = []
    for (k, s) in status_from_storage:
        if s in non_terminating_workflows and k not in non_terminating_workflows[
                s]:
            if s == WorkflowStatus.RUNNING:
                resume_running.append(k)
            elif s == WorkflowStatus.PENDING:
                resume_pending.append(k)
            else:
                assert False, "This line of code should not be reachable."
            continue
        if s in status_filter:
            ret.append((k, s))
    if WorkflowStatus.RESUMABLE in status_filter:
        # The running workflows ranks before the pending workflows.
        for w in resume_running:
            ret.append((w, WorkflowStatus.RESUMABLE))
        for w in resume_pending:
            ret.append((w, WorkflowStatus.RESUMABLE))
    return ret
Example #21
def resume_all(
        include_failed: bool = False) -> List[Tuple[str, ray.ObjectRef]]:
    """Resume all resumable workflow jobs.

    This can be used after cluster restart to resume all tasks.

    Args:
        include_failed: Whether to resume FAILED workflows.

    Examples:
        >>> from ray import workflow
        >>> failed_job = ... # doctest: +SKIP
        >>> workflow_step = failed_job.step() # doctest: +SKIP
        >>> output = workflow_step.run_async(workflow_id="failed_job") # doctest: +SKIP
        >>> try: # doctest: +SKIP
        >>>     ray.get(output) # doctest: +SKIP
        >>> except Exception: # doctest: +SKIP
        >>>     print("JobFailed") # doctest: +SKIP
        >>> jobs = workflow.list_all() # doctest: +SKIP
        >>> assert jobs == [("failed_job", workflow.FAILED)] # doctest: +SKIP
        >>> assert workflow.resume_all( # doctest: +SKIP
        ...    include_failed=True).get("failed_job") is not None # doctest: +SKIP

    Returns:
        A list of (workflow_id, returned_obj_ref) resumed.
    """
    _ensure_workflow_initialized()
    filter_set = {WorkflowStatus.RESUMABLE}
    if include_failed:
        filter_set.add(WorkflowStatus.FAILED)
    all_failed = list_all(filter_set)

    try:
        workflow_manager = workflow_access.get_management_actor()
    except Exception as e:
        raise RuntimeError("Failed to get management actor") from e

    job_id = ray.get_runtime_context().job_id.hex()
    reconstructed_workflows = []
    for wid, _ in all_failed:
        context = workflow_context.WorkflowStepContext(workflow_id=wid)
        # TODO(suquark): This is not very efficient, but it makes sure
        #  running workflows have higher priority when getting reconstructed.
        try:
            ray.get(
                workflow_manager.reconstruct_workflow.remote(job_id, context))
        except Exception as e:
            # TODO(suquark): Here some workflows get resumed successfully while
            #  others fail, and the user has no idea about it, which is very weird.
            #  Maybe we should raise an exception here instead?
            logger.error(f"Failed to resume workflow {context.workflow_id}",
                         exc_info=e)
            raise
        reconstructed_workflows.append(context)

    results = []
    for context in reconstructed_workflows:
        results.append((
            context.workflow_id,
            workflow_manager.execute_workflow.remote(job_id, context),
        ))
    return results