Example #1
def commit_step(
    store: workflow_storage.WorkflowStorage,
    step_id: "StepID",
    ret: Union["Workflow", Any],
    *,
    exception: Optional[Exception],
):
    """Checkpoint the step output.
    Args:
        store: The storage the current workflow is using.
        step_id: The ID of the step.
        ret: The returned object of the workflow step.
        exception: The exception caught by the step.
    """
    from ray.workflow.common import Workflow

    if isinstance(ret, Workflow):
        assert not ret.executed
        tasks = []
        for w in ret._iter_workflows_in_dag():
            # If this is a reference to a workflow, do not checkpoint
            # its input (again).
            if w.ref is None:
                tasks.append(_write_step_inputs(store, w.step_id, w.data))
        asyncio.get_event_loop().run_until_complete(asyncio.gather(*tasks))

    context = workflow_context.get_workflow_step_context()
    store.save_step_output(step_id,
                           ret,
                           exception=exception,
                           outer_most_step_id=context.outer_most_step_id)
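
For context, a hypothetical call site for the helper above; a minimal sketch that assumes the step body has already run, reusing the WorkflowStorage constructor and get_global_storage helper seen in the other examples (the workflow and step ids are made up):

# Hypothetical usage sketch: checkpoint a plain (non-Workflow) step result.
from ray.workflow.storage import get_global_storage
from ray.workflow import workflow_storage

store = workflow_storage.WorkflowStorage("my_workflow", get_global_storage())
commit_step(store, "my_step", ret=42, exception=None)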
Example #2
            def step(method_name, method, *args, **kwargs):
                readonly = getattr(method, "__virtual_actor_readonly__", False)
                flattened_args = self.flatten_args(method_name, args, kwargs)
                actor_id = workflow_context.get_current_workflow_id()
                if not readonly:
                    if method_name == "__init__":
                        state_ref = None
                    else:
                        ws = WorkflowStorage(actor_id, get_global_storage())
                        state_ref = WorkflowRef(ws.get_entrypoint_step_id())
                    # This is a hack to insert a positional argument.
                    flattened_args = ([signature.DUMMY_TYPE, state_ref] +
                                      flattened_args)
                workflow_inputs = serialization_context.make_workflow_inputs(
                    flattened_args)

                if readonly:
                    _actor_method = _wrap_readonly_actor_method(
                        actor_id, self.cls, method_name)
                    step_type = StepType.READONLY_ACTOR_METHOD
                else:
                    _actor_method = _wrap_actor_method(self.cls, method_name)
                    step_type = StepType.ACTOR_METHOD
                # TODO(suquark): Support actor options.
                workflow_data = WorkflowData(
                    func_body=_actor_method,
                    step_type=step_type,
                    inputs=workflow_inputs,
                    max_retries=1,
                    catch_exceptions=False,
                    ray_options={},
                    name=None,
                )
                wf = Workflow(workflow_data)
                return wf
Example #3
def delete(workflow_id: str) -> None:
    """Delete a workflow, its checkpoints, and other information it may have
       persisted to storage. To stop a running workflow, see
       `workflow.cancel()`.

        NOTE: The caller should ensure that the workflow is not currently
        running before deleting it.

    Args:
        workflow_id: The workflow to delete.

    Examples:
        >>> from ray import workflow
        >>> some_job = ... # doctest: +SKIP
        >>> workflow_step = some_job.step() # doctest: +SKIP
        >>> output = workflow_step.run_async(workflow_id="some_job") # doctest: +SKIP
        >>> workflow.delete(workflow_id="some_job") # doctest: +SKIP
        >>> assert [] == workflow.list_all() # doctest: +SKIP

    Returns:
        None

    """

    _ensure_workflow_initialized()
    try:
        status = get_status(workflow_id)
        if status == WorkflowStatus.RUNNING:
            raise WorkflowRunningError("DELETE", workflow_id)
    except ValueError:
        raise WorkflowNotFoundError(workflow_id)

    wf_storage = WorkflowStorage(workflow_id)
    wf_storage.delete_workflow()
Example #4
 def _create(self, args: Tuple[Any], kwargs: Dict[str, Any]):
     workflow_storage = WorkflowStorage(self._actor_id, self._storage)
     workflow_storage.save_actor_class_body(self._metadata.cls)
     ref = self._actor_method_call("__init__", args, kwargs)
     workflow_manager = get_or_create_management_actor()
     # Keep the ref in a list to prevent it from being dereferenced.
     ray.get(workflow_manager.init_actor.remote(self._actor_id, [ref]))
Example #5
    def step(self, *args, **kwargs):
        flattened_args = signature.flatten_args(self._signature, args, kwargs)
        actor_id = workflow_context.get_current_workflow_id()
        if not self.readonly:
            if self._method_name == "__init__":
                state_ref = None
            else:
                ws = WorkflowStorage(actor_id, get_global_storage())
                state_ref = WorkflowRef(ws.get_entrypoint_step_id())
            # This is a hack to insert a positional argument.
            flattened_args = [signature.DUMMY_TYPE, state_ref] + flattened_args
        workflow_inputs = serialization_context.make_workflow_inputs(
            flattened_args)

        if self.readonly:
            _actor_method = _wrap_readonly_actor_method(
                actor_id, self._original_class, self._method_name)
        else:
            _actor_method = _wrap_actor_method(self._original_class,
                                               self._method_name)
        workflow_data = WorkflowData(
            func_body=_actor_method,
            inputs=workflow_inputs,
            name=self._name,
            step_options=self._options,
            user_metadata=self._user_metadata,
        )
        wf = Workflow(workflow_data)
        return wf
Example #6
def get_output_async(workflow_id: str,
                     *,
                     name: Optional[str] = None) -> ray.ObjectRef:
    """Get the output of a running workflow asynchronously.

    Args:
        workflow_id: The workflow to get the output of.
        name: If set, fetch the specific step instead of the output of the
            workflow.

    Returns:
        An object reference that can be used to retrieve the workflow task result.
    """
    _ensure_workflow_initialized()
    try:
        workflow_manager = workflow_access.get_management_actor()
    except ValueError as e:
        raise ValueError(
            "Failed to connect to the workflow management "
            "actor. The workflow could have already failed. You can use "
            "workflow.resume() to resume the workflow.") from e

    try:
        # check storage first
        wf_store = WorkflowStorage(workflow_id)
        tid = wf_store.inspect_output(name)
        if tid is not None:
            return workflow_access.load_step_output_from_storage.remote(
                workflow_id, name)
    except ValueError:
        pass

    return workflow_manager.get_output.remote(workflow_id, name)
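
A hedged usage sketch for the API above, assuming a workflow with id "some_job" was started earlier (e.g. via run_async):

# Hypothetical usage: fetch a workflow's result without blocking the caller.
ref = get_output_async("some_job")
result = ray.get(ref)  # blocks only here, when the value is actually needed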
Example #7
def _construct_resume_workflow_from_step(
        reader: workflow_storage.WorkflowStorage, step_id: StepID,
        input_map: Dict[StepID, Any]) -> Union[Workflow, StepID]:
    """Try to construct a workflow (step) that recovers the workflow step.
    If the workflow step already has an output checkpointing file, we return
    the workflow step id instead.

    Args:
        reader: The storage reader for inspecting the step.
        step_id: The ID of the step we want to recover.
        input_map: A context storing inputs that have already been loaded.
            This context is important for deduplication.

    Returns:
        A workflow that recovers the step, or the ID of a step
        that contains the output checkpoint file.
    """
    result: workflow_storage.StepInspectResult = reader.inspect_step(step_id)
    if result.output_object_valid:
        # we already have the output
        return step_id
    if isinstance(result.output_step_id, str):
        return _construct_resume_workflow_from_step(reader,
                                                    result.output_step_id,
                                                    input_map)
    # The output does not exist or is not valid; try to reconstruct it.
    if not result.is_recoverable():
        raise WorkflowStepNotRecoverableError(step_id)

    with serialization.objectref_cache():
        input_workflows = []
        for i, _step_id in enumerate(result.workflows):
            # Check whether the step has already been loaded,
            # to avoid duplication.
            if _step_id in input_map:
                r = input_map[_step_id]
            else:
                r = _construct_resume_workflow_from_step(
                    reader, _step_id, input_map)
                input_map[_step_id] = r
            if isinstance(r, Workflow):
                input_workflows.append(r)
            else:
                assert isinstance(r, StepID)
                # TODO (Alex): We should consider caching these outputs too.
                input_workflows.append(reader.load_step_output(r))
        workflow_refs = list(map(WorkflowRef, result.workflow_refs))

        args, kwargs = reader.load_step_args(step_id, input_workflows,
                                             workflow_refs)

        recovery_workflow: Workflow = _recover_workflow_step.options(
            max_retries=result.max_retries,
            catch_exceptions=result.catch_exceptions,
            **result.ray_options).step(args, kwargs, input_workflows,
                                       workflow_refs)
        recovery_workflow._step_id = step_id
        recovery_workflow.data.step_type = result.step_type
        return recovery_workflow
Example #8
 def _create(self, args: Tuple[Any], kwargs: Dict[str, Any]):
     workflow_storage = WorkflowStorage(self._actor_id)
     workflow_storage.save_actor_class_body(self._metadata.cls)
     method_helper = self._metadata.methods["__init__"]
     job_id = ray.get_runtime_context().job_id.hex()
     ref = self._actor_method_call(job_id, method_helper, args, kwargs)
     workflow_manager = get_or_create_management_actor()
     # Keep the ref in a list to prevent it from being dereferenced.
     ray.get(workflow_manager.init_actor.remote(self._actor_id, [ref]))
Example #9
def get_metadata(workflow_id: str,
                 name: Optional[str] = None) -> Dict[str, Any]:
    """Get the metadata of the workflow.

    This will return a dict of metadata for either the workflow
    (if only workflow_id is given) or a specific workflow step (if
    both workflow_id and step name are given). An exception will be
    raised if the given workflow id or step name does not exist.

    If only the workflow id is given, this will return workflow-level
    metadata, which includes the running status, workflow-level
    user metadata, and workflow-level running stats (e.g. the
    start time and end time of the workflow).

    If both the workflow id and step name are given, this will return
    step-level metadata, which includes step inputs,
    step-level user metadata, and step-level running stats (e.g.
    the start time and end time of the step).

    Args:
        workflow_id: The workflow to get the metadata of.
        name: If set, fetch the metadata of the specific step instead of
            the metadata of the workflow.

    Examples:
        >>> from ray import workflow
        >>> trip = ... # doctest: +SKIP
        >>> workflow_step = trip.options( # doctest: +SKIP
        ...     name="trip", metadata={"k1": "v1"}).step()
        >>> workflow_step.run( # doctest: +SKIP
        ...     workflow_id="trip1", metadata={"k2": "v2"})
        >>> workflow_metadata = workflow.get_metadata("trip1") # doctest: +SKIP
        >>> assert workflow_metadata["status"] == "SUCCESSFUL" # doctest: +SKIP
        >>> assert workflow_metadata["user_metadata"] == {"k2": "v2"} # doctest: +SKIP
        >>> assert "start_time" in workflow_metadata["stats"] # doctest: +SKIP
        >>> assert "end_time" in workflow_metadata["stats"] # doctest: +SKIP
        >>> step_metadata = workflow.get_metadata("trip1", "trip") # doctest: +SKIP
        >>> assert step_metadata["step_type"] == "FUNCTION" # doctest: +SKIP
        >>> assert step_metadata["user_metadata"] == {"k1": "v1"} # doctest: +SKIP
        >>> assert "start_time" in step_metadata["stats"] # doctest: +SKIP
        >>> assert "end_time" in step_metadata["stats"] # doctest: +SKIP

    Returns:
        A dictionary containing the metadata of the workflow.

    Raises:
        ValueError: If the given workflow or workflow step does not exist.
    """
    _ensure_workflow_initialized()
    store = WorkflowStorage(workflow_id)
    if name is None:
        return store.load_workflow_metadata()
    else:
        return store.load_step_metadata(name)
Example #10
def get_actor(actor_id: str, storage: Storage) -> VirtualActor:
    """Get an virtual actor.

    Args:
        actor_id: The ID of the actor.
        storage: The storage of the actor.

    Returns:
        A virtual actor.
    """
    ws = WorkflowStorage(actor_id, storage)
    cls = ws.load_actor_class_body()
    v_cls = VirtualActorClass._from_class(cls)
    return v_cls._construct(actor_id, storage)
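
A minimal usage sketch for this helper; the actor id is made up, and the .run() call style reflects the virtual-actor API of this era (an assumption, kept in comments):

# Hypothetical usage: re-attach to a persisted virtual actor by its id.
actor = get_actor("counter", get_global_storage())
# Methods on the returned handle build workflow steps; in this API era they
# were typically invoked as, e.g.: result = actor.incr.run()  (assumption)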
Example #11
def _reconstruct_wait_step(
    reader: workflow_storage.WorkflowStorage,
    result: workflow_storage.StepInspectResult,
    input_map: Dict[StepID, Any],
):
    input_workflows = []
    step_options = result.step_options
    wait_options = step_options.ray_options.get("wait_options", {})
    for i, _step_id in enumerate(result.workflows):
        # Check whether the step has already been loaded,
        # to avoid duplication.
        if _step_id in input_map:
            r = input_map[_step_id]
        else:
            r = _construct_resume_workflow_from_step(reader, _step_id, input_map)
            input_map[_step_id] = r
        if isinstance(r, Workflow):
            input_workflows.append(r)
        else:
            assert isinstance(r, StepID)
            # TODO (Alex): We should consider caching these outputs too.
            output = reader.load_step_output(r)
            # Simulate a workflow with a workflow reference so it can be
            # used directly by 'workflow.wait'.
            static_ref = WorkflowStaticRef(step_id=r, ref=ray.put(output))
            wf = Workflow.from_ref(static_ref)
            input_workflows.append(wf)

    from ray import workflow

    return workflow.wait(input_workflows, **wait_options)
Example #12
    def resolve(self, store: workflow_storage.WorkflowStorage) -> Tuple[List, Dict]:
        """
        This function resolves the inputs for the code inside
        a workflow step (works on the callee side). For outputs from other
        workflows, we resolve them into object instances in place.

        For each ObjectRef argument, the function returns both the ObjectRef
        and the object instance. If the ObjectRef is a chain of nested
        ObjectRefs, then we resolve it recursively until we get the
        object instance, and we return the *direct* ObjectRef of the
        instance. This function does not resolve ObjectRef
        inside another object (e.g. list of ObjectRefs) to give users some
        flexibility.

        Returns:
            The recovered positional and keyword arguments.
        """
        workflow_ref_mapping = []
        for r in self.workflow_refs:
            if r.ref is None:
                workflow_ref_mapping.append(store.load_step_output(r.task_id))
            else:
                workflow_ref_mapping.append(r.ref)

        with serialization_context.workflow_args_resolving_context(
            workflow_ref_mapping
        ):
            # reconstruct input arguments under correct serialization context
            flattened_args: List[Any] = ray.get(self.args)

        # dereference arguments like Ray remote functions
        flattened_args = [
            ray.get(a) if isinstance(a, ObjectRef) else a for a in flattened_args
        ]
        return signature.recover_args(flattened_args)
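
The recover_args call above is the callee-side half of the flatten/recover pairing; the caller side appears in Example #5. A small illustrative round trip (a sketch reusing the signature helpers from these snippets; method_signature, args, and kwargs are placeholders):

# Sketch: arguments are flattened for serialization on the caller side...
flattened = signature.flatten_args(method_signature, args, kwargs)
# ...and recovered into (args, kwargs) on the callee side.
args, kwargs = signature.recover_args(flattened)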
Example #13
def commit_step(store: workflow_storage.WorkflowStorage,
                step_id: "StepID",
                ret: Union["Workflow", Any],
                exception: Optional[Exception],
                outer_most_step_id: Optional[str] = None):
    """Checkpoint the step output.
    Args:
        store: The storage the current workflow is using.
        step_id: The ID of the step.
        ret: The returned object of the workflow step.
        exception: The exception caught by the step.
        outer_most_step_id: The ID of the outermost workflow step. None if it
            does not exist. See "step_executor.execute_workflow" for a
            detailed explanation.
    """
    from ray.workflow.common import Workflow
    if isinstance(ret, Workflow):
        store.save_subworkflow(ret)
    store.save_step_output(step_id, ret, exception, outer_most_step_id)
Example #14
async def _write_step_inputs(wf_storage: workflow_storage.WorkflowStorage,
                             step_id: StepID, inputs: WorkflowData) -> None:
    """Save workflow inputs."""
    metadata = inputs.to_metadata()
    with serialization_context.workflow_args_keeping_context():
        # TODO(suquark): in the future we should write to storage directly
        # with plasma store object in memory.
        args_obj = ray.get(inputs.inputs.args)
    workflow_id = wf_storage._workflow_id
    storage = wf_storage._storage
    save_tasks = [
        # TODO (Alex): Handle the json case better?
        wf_storage._put(
            wf_storage._key_step_input_metadata(step_id), metadata, True),
        serialization.dump_to_storage(
            wf_storage._key_step_function_body(step_id), inputs.func_body,
            workflow_id, storage),
        serialization.dump_to_storage(
            wf_storage._key_step_args(step_id), args_obj, workflow_id, storage)
    ]
    await asyncio.gather(*save_tasks)
Example #15
def commit_step(store: workflow_storage.WorkflowStorage, step_id: "StepID",
                ret: Union["Workflow", Any], exception: Optional[Exception]):
    """Checkpoint the step output.
    Args:
        store: The storage the current workflow is using.
        step_id: The ID of the step.
        ret: The returned object of the workflow step.
        exception: The exception caught by the step.
    """
    from ray.workflow.common import Workflow
    if isinstance(ret, Workflow):
        assert not ret.executed
        tasks = [
            _write_step_inputs(store, w.step_id, w.data)
            for w in ret._iter_workflows_in_dag()
        ]
        asyncio.get_event_loop().run_until_complete(asyncio.gather(*tasks))

    context = workflow_context.get_workflow_step_context()
    store.save_step_output(step_id,
                           ret,
                           exception=exception,
                           outer_most_step_id=context.outer_most_step_id)
Example #16
def commit_step(store: workflow_storage.WorkflowStorage,
                step_id: "StepID",
                ret: Union["Workflow", Any],
                exception: Optional[Exception],
                outer_most_step_id: Optional[str] = None):
    """Checkpoint the step output.
    Args:
        store: The storage the current workflow is using.
        step_id: The ID of the step.
        ret: The returned object of the workflow step.
        exception: The exception caught by the step.
        outer_most_step_id: The ID of the outermost workflow step. None if it
            does not exist. See "step_executor.execute_workflow" for a
            detailed explanation.
    """
    from ray.workflow.common import Workflow
    if isinstance(ret, Workflow):
        assert not ret.executed
        tasks = [
            _write_step_inputs(store, w.step_id, w.data)
            for w in ret._iter_workflows_in_dag()
        ]
        asyncio.get_event_loop().run_until_complete(asyncio.gather(*tasks))

    store.save_step_output(step_id, ret, exception, outer_most_step_id)
Example #17
def list_all(
    status_filter: Optional[Union[Union[WorkflowStatus, str],
                                  Set[Union[WorkflowStatus, str]]]] = None
) -> List[Tuple[str, WorkflowStatus]]:
    """List all workflows matching a given status filter. When returning "RESUMEABLE"
    workflows, the workflows that was running ranks before the workflow that was pending
    in the result list.

    Args:
        status_filter: If given, only returns workflows with that status. This can
            be a single status or set of statuses. The string form of the
            status is also acceptable, i.e.,
            "RUNNING"/"FAILED"/"SUCCESSFUL"/"CANCELED"/"RESUMABLE"/"PENDING".
    Examples:
        >>> from ray import workflow
        >>> long_running_job = ... # doctest: +SKIP
        >>> workflow_step = long_running_job.step() # doctest: +SKIP
        >>> wf = workflow_step.run_async( # doctest: +SKIP
        ...     workflow_id="long_running_job")
        >>> jobs = workflow.list_all() # doctest: +SKIP
        >>> assert jobs == [ ("long_running_job", workflow.RUNNING) ] # doctest: +SKIP
        >>> ray.get(wf) # doctest: +SKIP
        >>> jobs = workflow.list_all({workflow.RUNNING}) # doctest: +SKIP
        >>> assert jobs == [] # doctest: +SKIP
        >>> jobs = workflow.list_all(workflow.SUCCESSFUL) # doctest: +SKIP
        >>> assert jobs == [ # doctest: +SKIP
        ...     ("long_running_job", workflow.SUCCESSFUL)]

    Returns:
        A list of (workflow_id, workflow_status) tuples.
    """
    _ensure_workflow_initialized()
    if isinstance(status_filter, str):
        status_filter = {WorkflowStatus(status_filter)}
    elif isinstance(status_filter, WorkflowStatus):
        status_filter = {status_filter}
    elif isinstance(status_filter, set):
        if all(isinstance(s, str) for s in status_filter):
            status_filter = {WorkflowStatus(s) for s in status_filter}
        elif not all(isinstance(s, WorkflowStatus) for s in status_filter):
            raise TypeError("status_filter contains element which is not"
                            " a type of `WorkflowStatus or str`."
                            f" {status_filter}")
    elif status_filter is None:
        status_filter = set(WorkflowStatus)
        status_filter.discard(WorkflowStatus.NONE)
    else:
        raise TypeError(
            "status_filter must be WorkflowStatus or a set of WorkflowStatus.")

    try:
        workflow_manager = workflow_access.get_management_actor()
    except ValueError:
        workflow_manager = None

    if workflow_manager is None:
        non_terminating_workflows = {}
    else:
        non_terminating_workflows = ray.get(
            workflow_manager.list_non_terminating_workflows.remote())

    ret = []
    if set(non_terminating_workflows.keys()).issuperset(status_filter):
        for status, workflows in non_terminating_workflows.items():
            if status in status_filter:
                for w in workflows:
                    ret.append((w, status))
        return ret

    ret = []
    # We don't have a workflow id here, so use an empty one instead.
    store = WorkflowStorage("")
    modified_status_filter = status_filter.copy()
    # We have to add the non-terminating statuses to the status filter, because
    # some "RESUMABLE" workflows are converted from non-terminating workflows
    # below.
    # This is the tricky part: the status "RESUMABLE" comes neither from
    # the workflow management actor nor from the storage. It is the status
    # where the storage says the workflow is non-terminating but the workflow
    # management actor is not running it. This usually happens after a sudden
    # crash of the whole Ray runtime or of the workflow management actor
    # (due to a cluster failure etc.). So we include the non-terminating
    # statuses in the storage filter to get "RESUMABLE" candidates.
    modified_status_filter.update(WorkflowStatus.non_terminating_status())
    status_from_storage = store.list_workflow(modified_status_filter)
    non_terminating_workflows = {
        k: set(v)
        for k, v in non_terminating_workflows.items()
    }
    resume_running = []
    resume_pending = []
    for (k, s) in status_from_storage:
        if (s in non_terminating_workflows
                and k not in non_terminating_workflows[s]):
            if s == WorkflowStatus.RUNNING:
                resume_running.append(k)
            elif s == WorkflowStatus.PENDING:
                resume_pending.append(k)
            else:
                assert False, "This line of code should not be reachable."
            continue
        if s in status_filter:
            ret.append((k, s))
    if WorkflowStatus.RESUMABLE in status_filter:
        # The running workflows rank before the pending workflows.
        for w in resume_running:
            ret.append((w, WorkflowStatus.RESUMABLE))
        for w in resume_pending:
            ret.append((w, WorkflowStatus.RESUMABLE))
    return ret
Example #18
def _construct_resume_workflow_from_step(
    reader: workflow_storage.WorkflowStorage,
    step_id: StepID,
    input_map: Dict[StepID, Any],
) -> Union[Workflow, StepID]:
    """Try to construct a workflow (step) that recovers the workflow step.
    If the workflow step already has an output checkpointing file, we return
    the workflow step id instead.

    Args:
        reader: The storage reader for inspecting the step.
        step_id: The ID of the step we want to recover.
        input_map: A context storing inputs that have already been loaded.
            This context is important for deduplication.

    Returns:
        A workflow that recovers the step, or the ID of a step
        that contains the output checkpoint file.
    """
    result: workflow_storage.StepInspectResult = reader.inspect_step(step_id)
    if result.output_object_valid:
        # we already have the output
        return step_id
    if isinstance(result.output_step_id, str):
        return _construct_resume_workflow_from_step(
            reader, result.output_step_id, input_map
        )
    # The output does not exist or is not valid; try to reconstruct it.
    if not result.is_recoverable():
        raise WorkflowStepNotRecoverableError(step_id)

    step_options = result.step_options
    # Process the wait step as a special case.
    if step_options.step_type == StepType.WAIT:
        return _reconstruct_wait_step(reader, result, input_map)

    with serialization.objectref_cache():
        input_workflows = []
        for i, _step_id in enumerate(result.workflows):
            # Check whether the step has already been loaded,
            # to avoid duplication.
            if _step_id in input_map:
                r = input_map[_step_id]
            else:
                r = _construct_resume_workflow_from_step(reader, _step_id, input_map)
                input_map[_step_id] = r
            if isinstance(r, Workflow):
                input_workflows.append(r)
            else:
                assert isinstance(r, StepID)
                # TODO (Alex): We should consider caching these outputs too.
                input_workflows.append(reader.load_step_output(r))
        workflow_refs = list(map(WorkflowRef, result.workflow_refs))

        args, kwargs = reader.load_step_args(step_id, input_workflows, workflow_refs)
        # Note: we must unpack args and kwargs, so the refs in the args/kwargs
        # can get resolved consistently, as in Ray.
        recovery_workflow: Workflow = _recover_workflow_step.step(
            input_workflows,
            workflow_refs,
            *args,
            **kwargs,
        )
        recovery_workflow._step_id = step_id
        # override step_options
        recovery_workflow.data.step_options = step_options
        return recovery_workflow
Example #19
def _construct_resume_workflow_from_step(
        reader: workflow_storage.WorkflowStorage,
        step_id: StepID,
        objectref_cache: Optional[Dict[str, Any]] = None
) -> Union[Workflow, StepID]:
    """Try to construct a workflow (step) that recovers the workflow step.
    If the workflow step already has an output checkpointing file, we return
    the workflow step id instead.

    Args:
        reader: The storage reader for inspecting the step.
        step_id: The ID of the step we want to recover.
        objectref_cache: A cache of object refs that have already been
            loaded, used to avoid loading the same ref twice across
            recursive calls.

    Returns:
        A workflow that recovers the step, or the ID of a step
        that contains the output checkpoint file.
    """
    if objectref_cache is None:
        objectref_cache = {}
    result: workflow_storage.StepInspectResult = reader.inspect_step(step_id)
    if result.output_object_valid:
        # we already have the output
        return step_id
    if isinstance(result.output_step_id, str):
        return _construct_resume_workflow_from_step(
            reader, result.output_step_id, objectref_cache=objectref_cache)
    # The output does not exist or is not valid; try to reconstruct it.
    if not result.is_recoverable():
        raise WorkflowStepNotRecoverableError(step_id)
    input_workflows = []
    instant_workflow_outputs: Dict[int, str] = {}
    for i, _step_id in enumerate(result.workflows):
        r = _construct_resume_workflow_from_step(
            reader, _step_id, objectref_cache=objectref_cache)
        if isinstance(r, Workflow):
            input_workflows.append(r)
        else:
            input_workflows.append(None)
            instant_workflow_outputs[i] = r
    workflow_refs = list(map(WorkflowRef, result.workflow_refs))

    # TODO (Alex): Refactor to remove this special case handling of object refs
    resolved_object_refs = []
    identifiers_to_await = []
    promises_to_await = []

    for identifier in result.object_refs:
        if identifier not in objectref_cache:
            paths = reader._key_step_args(identifier)
            promise = reader._get(paths)
            promises_to_await.append(promise)
            identifiers_to_await.append(identifier)

    loop = asyncio.get_event_loop()
    object_refs_to_cache = loop.run_until_complete(
        asyncio.gather(*promises_to_await))

    for identifier, object_ref in zip(identifiers_to_await,
                                      object_refs_to_cache):
        objectref_cache[identifier] = object_ref

    for identifier in result.object_refs:
        resolved_object_refs.append(objectref_cache[identifier])

    recovery_workflow: Workflow = _recover_workflow_step.options(
        max_retries=result.max_retries,
        catch_exceptions=result.catch_exceptions,
        **result.ray_options).step(resolved_object_refs, input_workflows,
                                   workflow_refs, instant_workflow_outputs)
    recovery_workflow._step_id = step_id
    recovery_workflow.data.step_type = result.step_type
    return recovery_workflow
Example #20
def run_async(
    dag: DAGNode,
    *args,
    workflow_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> ray.ObjectRef:
    """Run a workflow asynchronously.

    If the workflow with the given id already exists, it will be resumed.

    Args:
        dag: The DAG to execute; it must be a DAGNode.
        workflow_id: A unique identifier that can be used to resume the
            workflow. If not specified, a random id will be generated.
        metadata: The metadata to add to the workflow. It must be
            serializable to JSON.

    Returns:
        The running result as a ray.ObjectRef.

    """
    _ensure_workflow_initialized()
    if not isinstance(dag, DAGNode):
        raise TypeError("Input should be a DAG.")
    input_data = DAGInputData(*args, **kwargs)
    validate_user_metadata(metadata)
    metadata = metadata or {}

    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    state = workflow_state_from_dag(dag, input_data, workflow_id)
    logger.info(f'Workflow job created. [id="{workflow_id}"].')

    context = workflow_context.WorkflowStepContext(workflow_id=workflow_id)
    with workflow_context.workflow_step_context(context):
        # checkpoint the workflow
        ws = WorkflowStorage(workflow_id)
        ws.save_workflow_user_metadata(metadata)

        job_id = ray.get_runtime_context().job_id.hex()

        try:
            ws.get_entrypoint_step_id()
            wf_exists = True
        except Exception:
            # The workflow does not exist. We must checkpoint the entry workflow.
            ws.save_workflow_execution_state("", state)
            wf_exists = False
        workflow_manager = workflow_access.get_management_actor()
        if ray.get(
                workflow_manager.is_workflow_non_terminating.remote(
                    workflow_id)):
            raise RuntimeError(
                f"Workflow '{workflow_id}' is already running or pending.")
        if wf_exists:
            return resume_async(workflow_id)
        ignore_existing = ws.load_workflow_status() == WorkflowStatus.NONE
        ray.get(
            workflow_manager.submit_workflow.remote(
                workflow_id, state, ignore_existing=ignore_existing))
        return workflow_manager.execute_workflow.remote(job_id, context)
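
The docstring above carries no usage example; here is a minimal, hedged sketch for the DAG-based API of this snippet (it assumes remote functions expose .bind(), as in the Ray DAG API; the task and workflow id are made up):

# Hypothetical usage sketch: run a one-node DAG as a workflow.
import ray
from ray import workflow

@ray.remote
def add(a: int, b: int) -> int:
    return a + b

ref = workflow.run_async(add.bind(1, 2), workflow_id="add_example")
assert ray.get(ref) == 3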