Ejemplo n.º 1
0
def test_embedded_objectrefs(workflow_start_regular):
    workflow_id = test_embedded_objectrefs.__name__

    class ObjectRefsWrapper:
        def __init__(self, refs):
            self.refs = refs

    from ray.internal.storage import _storage_uri

    wrapped = ObjectRefsWrapper([ray.put(1), ray.put(2)])

    store = workflow_storage.get_workflow_storage(workflow_id)
    serialization.dump_to_storage("key", wrapped, workflow_id, store)

    # Be extremely explicit about shutting down. We want to make sure the
    # `_get` call deserializes the full object and puts it in the object store.
    # Shutting down the cluster should guarantee we don't accidently get the
    # old object and pass the test.
    ray.shutdown()
    subprocess.check_output("ray stop --force", shell=True)

    ray.init(storage=_storage_uri)
    workflow.init()
    storage2 = workflow_storage.get_workflow_storage(workflow_id)

    result = storage2._get("key")
    assert ray.get(result.refs) == [1, 2]
Ejemplo n.º 2
0
def delete(workflow_id: str) -> None:
    """Delete a workflow, its checkpoints, and other information it may have
       persisted to storage. To stop a running workflow, see
       `workflow.cancel()`.

        NOTE: The caller should ensure that the workflow is not currently
        running before deleting it.

    Args:
        workflow_id: The workflow to delete.

    Examples:
        >>> workflow_step = some_job.step()
        >>> output = workflow_step.run_async(workflow_id="some_job")
        >>> workflow.delete(workflow_id="some_job")
        >>> assert [] == workflow.list_all()

    Returns:
        None

    """

    try:
        status = get_status(workflow_id)
        if status == WorkflowStatus.RUNNING:
            raise WorkflowRunningError("DELETE", workflow_id)
    except ValueError:
        raise WorkflowNotFoundError(workflow_id)

    wf_storage = get_workflow_storage(workflow_id)
    wf_storage.delete_workflow()
Ejemplo n.º 3
0
def list_all(
        status_filter: Set[WorkflowStatus]
) -> List[Tuple[str, WorkflowStatus]]:
    try:
        workflow_manager = get_management_actor()
    except ValueError:
        workflow_manager = None

    if workflow_manager is None:
        runnings = []
    else:
        runnings = ray.get(workflow_manager.list_running_workflow.remote())
    if WorkflowStatus.RUNNING in status_filter and len(status_filter) == 1:
        return [(r, WorkflowStatus.RUNNING) for r in runnings]

    runnings = set(runnings)
    # Here we don't have workflow id, so use empty one instead
    store = workflow_storage.get_workflow_storage("")
    ret = []
    for (k, s) in store.list_workflow():
        if s == WorkflowStatus.RUNNING and k not in runnings:
            s = WorkflowStatus.RESUMABLE
        if s in status_filter:
            ret.append((k, s))
    return ret
Ejemplo n.º 4
0
def _recover_workflow_step(input_object_refs: List[ray.ObjectRef],
                           input_workflows: List[Any],
                           input_workflow_refs: List[WorkflowRef],
                           instant_workflow_inputs: Dict[int, StepID]):
    """A workflow step that recovers the output of an unfinished step.

    Args:
        input_object_refs: The object refs in the argument of
            the (original) step.
        input_workflows: The workflows in the argument of the (original) step.
            They are resolved into physical objects (i.e. the output of the
            workflows) here. They come from other recover workflows we
            construct recursively.
        instant_workflow_inputs: Same as 'input_workflows', but they come
            point to workflow steps that have output checkpoints. They override
            corresponding workflows in 'input_workflows'.

    Returns:
        The output of the recovered step.
    """
    reader = workflow_storage.get_workflow_storage()
    for index, _step_id in instant_workflow_inputs.items():
        # override input workflows with instant workflows
        input_workflows[index] = reader.load_step_output(_step_id)

    step_id = workflow_context.get_current_step_id()
    func: Callable = reader.load_step_func_body(step_id)
    args, kwargs = reader.load_step_args(
        step_id, input_workflows, input_object_refs, input_workflow_refs)
    return func(*args, **kwargs)
Ejemplo n.º 5
0
def test_dynamic_output(workflow_start_regular_shared):
    @ray.remote
    def exponential_fail(k, n):
        if n > 0:
            if n < 3:
                raise Exception("Failed intentionally")
            return workflow.continuation(
                exponential_fail.options(**workflow.options(
                    name=f"step_{n}")).bind(k * 2, n - 1))
        return k

    # When workflow fails, the dynamic output should points to the
    # latest successful step.
    try:
        workflow.run(
            exponential_fail.options(**workflow.options(name="step_0")).bind(
                3, 10),
            workflow_id="dynamic_output",
        )
    except Exception:
        pass
    from ray.workflow.workflow_storage import get_workflow_storage

    wf_storage = get_workflow_storage(workflow_id="dynamic_output")
    result = wf_storage.inspect_step("step_0")
    assert result.output_step_id == "step_3"
Ejemplo n.º 6
0
def cancel(workflow_id: str) -> None:
    try:
        workflow_manager = get_management_actor()
        ray.get(workflow_manager.cancel_workflow.remote(workflow_id))
    except ValueError:
        wf_store = workflow_storage.get_workflow_storage(workflow_id)
        wf_store.save_workflow_meta(WorkflowMetaData(WorkflowStatus.CANCELED))
Ejemplo n.º 7
0
def test_shortcut(workflow_start_regular):
    assert recursive_chain.step(0).run(workflow_id="shortcut") == 100
    # the shortcut points to the step with output checkpoint
    store = workflow_storage.get_workflow_storage("shortcut")
    step_id = store.get_entrypoint_step_id()
    output_step_id = store.inspect_step(step_id).output_step_id
    assert store.inspect_step(output_step_id).output_object_valid
Ejemplo n.º 8
0
def test_embedded_objectrefs(workflow_start_regular):
    workflow_id = test_embedded_objectrefs.__name__
    base_storage = storage.get_global_storage()

    class ObjectRefsWrapper:
        def __init__(self, refs):
            self.refs = refs

    url = base_storage.storage_url

    wrapped = ObjectRefsWrapper([ray.put(1), ray.put(2)])

    promise = serialization.dump_to_storage(["key"], wrapped, workflow_id,
                                            base_storage)
    workflow_storage.asyncio_run(promise)

    # Be extremely explicit about shutting down. We want to make sure the
    # `_get` call deserializes the full object and puts it in the object store.
    # Shutting down the cluster should guarantee we don't accidently get the
    # old object and pass the test.
    ray.shutdown()
    subprocess.check_output("ray stop --force", shell=True)

    workflow.init(url)
    storage2 = workflow_storage.get_workflow_storage(workflow_id)

    result = workflow_storage.asyncio_run(storage2._get(["key"]))
    assert ray.get(result.refs) == [1, 2]
Ejemplo n.º 9
0
def run(entry_workflow: Workflow,
        workflow_id: Optional[str] = None,
        metadata: Optional[Dict] = None) -> ray.ObjectRef:
    """Run a workflow asynchronously.
    """
    if metadata is not None:
        if not isinstance(metadata, dict):
            raise ValueError("metadata must be a dict.")
        for k, v in metadata.items():
            try:
                json.dumps(v)
            except TypeError as e:
                raise ValueError("metadata values must be JSON serializable, "
                                 "however '{}' has a value whose {}.".format(
                                     k, e))
    metadata = metadata or {}

    store = get_global_storage()
    assert ray.is_initialized()
    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    logger.info(
        f"Workflow job created. [id=\"{workflow_id}\", storage_url="
        f"\"{store.storage_url}\"]. Type: {entry_workflow.data.step_type} ")

    with workflow_context.workflow_step_context(workflow_id,
                                                store.storage_url):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)
        ws.save_workflow_user_metadata(metadata)

        wf_exists = True
        try:
            ws.get_entrypoint_step_id()
        except Exception:
            wf_exists = False

        # We only commit for
        #  - virtual actor tasks: it's dynamic tasks, so we always add
        #  - it's a new workflow
        # TODO (yic): follow up with force rerun
        if entry_workflow.data.step_type != StepType.FUNCTION or not wf_exists:
            commit_step(ws, "", entry_workflow, exception=None)
        workflow_manager = get_or_create_management_actor()
        ignore_existing = (entry_workflow.data.step_type != StepType.FUNCTION)
        # NOTE: It is important to 'ray.get' the returned output. This
        # ensures caller of 'run()' holds the reference to the workflow
        # result. Otherwise if the actor removes the reference of the
        # workflow output, the caller may fail to resolve the result.
        result: "WorkflowExecutionResult" = ray.get(
            workflow_manager.run_or_resume.remote(workflow_id,
                                                  ignore_existing))
        if entry_workflow.data.step_type == StepType.FUNCTION:
            return flatten_workflow_output(workflow_id,
                                           result.persisted_output)
        else:
            return flatten_workflow_output(workflow_id, result.volatile_output)
Ejemplo n.º 10
0
def run(
    entry_workflow: Workflow,
    workflow_id: Optional[str] = None,
    metadata: Optional[Dict] = None,
) -> ray.ObjectRef:
    """Run a workflow asynchronously."""
    validate_user_metadata(metadata)
    metadata = metadata or {}

    from ray.workflow.api import _ensure_workflow_initialized

    _ensure_workflow_initialized()

    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"
    step_type = entry_workflow.data.step_options.step_type

    logger.info(
        f'Workflow job created. [id="{workflow_id}"]. Type: {step_type}.')

    with workflow_context.workflow_step_context(workflow_id):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)
        ws.save_workflow_user_metadata(metadata)

        wf_exists = True
        try:
            ws.get_entrypoint_step_id()
        except Exception:
            wf_exists = False

        # "Is growing" means we could adding steps to the (top-level)
        # workflow to grow the workflow dynamically at runtime.
        is_growing = step_type not in (StepType.FUNCTION, StepType.WAIT)

        # We only commit for
        #  - virtual actor tasks: it's dynamic tasks, so we always add
        #  - it's a new workflow
        # TODO (yic): follow up with force rerun
        if is_growing or not wf_exists:
            # We must checkpoint entry workflow.
            commit_step(ws, "", entry_workflow, exception=None)
        workflow_manager = get_or_create_management_actor()
        ignore_existing = is_growing
        # NOTE: It is important to 'ray.get' the returned output. This
        # ensures caller of 'run()' holds the reference to the workflow
        # result. Otherwise if the actor removes the reference of the
        # workflow output, the caller may fail to resolve the result.
        job_id = ray.get_runtime_context().job_id.hex()
        result: "WorkflowExecutionResult" = ray.get(
            workflow_manager.run_or_resume.remote(job_id, workflow_id,
                                                  ignore_existing))
        if not is_growing:
            return flatten_workflow_output(workflow_id,
                                           result.persisted_output)
        else:
            return flatten_workflow_output(workflow_id, result.volatile_output)
Ejemplo n.º 11
0
def get_metadata(workflow_id: str, name: Optional[str]) -> Dict[str, Any]:
    """Get the metadata of the workflow.
    See "api.get_metadata()" for details.
    """
    store = workflow_storage.get_workflow_storage(workflow_id)
    if name is None:
        return store.load_workflow_metadata()
    else:
        return store.load_step_metadata(name)
Ejemplo n.º 12
0
def cancel(workflow_id: str) -> None:
    try:
        workflow_manager = get_management_actor()
    except ValueError:
        wf_store = workflow_storage.get_workflow_storage(workflow_id)
        # TODO(suquark): Here we update workflow status "offline", so it is likely
        # thread-safe because there is no workflow management actor updating the
        # workflow concurrently. But we should be careful if we are going to
        # update more workflow status offline in the future.
        wf_store.update_workflow_status(WorkflowStatus.CANCELED)
        return
    ray.get(workflow_manager.cancel_workflow.remote(workflow_id))
Ejemplo n.º 13
0
def _workflow_step_executor(
    func: Callable,
    context: "WorkflowStepContext",
    task_id: "TaskID",
    baked_inputs: "_BakedWorkflowInputs",
    runtime_options: "WorkflowStepRuntimeOptions",
) -> Tuple[Any, Any]:
    """Executor function for workflow step.

    Args:
        task_id: ID of the step.
        func: The workflow step function.
        baked_inputs: The processed inputs for the step.
        context: Workflow step context. Used to access correct storage etc.
        runtime_options: Parameters for workflow step execution.

    Returns:
        Workflow step output.
    """
    with workflow_context.workflow_step_context(context):
        store = workflow_storage.get_workflow_storage()
        # Part 1: resolve inputs
        args, kwargs = baked_inputs.resolve(store)

        # Part 2: execute the step
        try:
            store.save_step_prerun_metadata(task_id,
                                            {"start_time": time.time()})
            with workflow_context.workflow_execution():
                output = _wrap_run(func, runtime_options, *args, **kwargs)
            store.save_step_postrun_metadata(task_id,
                                             {"end_time": time.time()})
        except Exception as e:
            # Always checkpoint the exception.
            store.save_step_output(task_id, None, exception=e)
            raise e

        if isinstance(output, DAGNode):
            output = workflow_state_from_dag(output, None, context.workflow_id)
            execution_metadata = WorkflowExecutionMetadata(
                is_output_workflow=True)
        else:
            execution_metadata = WorkflowExecutionMetadata()

        # Part 3: save outputs
        # TODO(suquark): Validate checkpoint options before commit the task.
        if CheckpointMode(runtime_options.checkpoint) == CheckpointMode.SYNC:
            if isinstance(output, WorkflowExecutionState):
                store.save_workflow_execution_state(task_id, output)
            else:
                store.save_step_output(task_id, output, exception=None)
        return execution_metadata, output
Ejemplo n.º 14
0
 def get_workflow_status(self, workflow_id: str) -> WorkflowStatus:
     """Get the status of the workflow."""
     if workflow_id in self._workflow_executors:
         if workflow_id in self._queued_workflows:
             return WorkflowStatus.PENDING
         return WorkflowStatus.RUNNING
     store = workflow_storage.get_workflow_storage(workflow_id)
     status = store.load_workflow_status()
     if status == WorkflowStatus.NONE:
         raise WorkflowNotFoundError(workflow_id)
     elif status in WorkflowStatus.non_terminating_status():
         return WorkflowStatus.RESUMABLE
     return status
Ejemplo n.º 15
0
def test_shortcut(workflow_start_regular):
    @ray.remote
    def recursive_chain(x):
        if x < 100:
            return workflow.continuation(recursive_chain.bind(x + 1))
        else:
            return 100

    assert workflow.create(recursive_chain.bind(0)).run(workflow_id="shortcut") == 100
    # the shortcut points to the step with output checkpoint
    store = workflow_storage.get_workflow_storage("shortcut")
    task_id = store.get_entrypoint_step_id()
    output_step_id = store.inspect_step(task_id).output_step_id
    assert store.inspect_step(output_step_id).output_object_valid
Ejemplo n.º 16
0
def get_status(workflow_id: str) -> Optional[WorkflowStatus]:
    try:
        workflow_manager = get_management_actor()
        running = ray.get(
            workflow_manager.is_workflow_running.remote(workflow_id))
    except Exception:
        running = False
    if running:
        return WorkflowStatus.RUNNING
    store = workflow_storage.get_workflow_storage(workflow_id)
    meta = store.load_workflow_meta()
    if meta is None:
        raise ValueError(f"No such workflow_id {workflow_id}")
    return meta.status
Ejemplo n.º 17
0
def get_status(workflow_id: str) -> Optional[WorkflowStatus]:
    try:
        workflow_manager = get_management_actor()
        running = ray.get(workflow_manager.is_workflow_running.remote(workflow_id))
    except Exception:
        running = False
    if running:
        return WorkflowStatus.RUNNING
    store = workflow_storage.get_workflow_storage(workflow_id)
    meta = store.load_workflow_meta()
    if meta is None:
        raise WorkflowNotFoundError(workflow_id)
    if meta.status == WorkflowStatus.RUNNING:
        return WorkflowStatus.RESUMABLE
    return meta.status
Ejemplo n.º 18
0
def run(
    dag: DAGNode,
    dag_inputs: DAGInputData,
    workflow_id: Optional[str] = None,
    metadata: Optional[Dict] = None,
) -> ray.ObjectRef:
    """Run a workflow asynchronously."""
    validate_user_metadata(metadata)
    metadata = metadata or {}

    from ray.workflow.api import _ensure_workflow_initialized

    _ensure_workflow_initialized()

    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    state = workflow_state_from_dag(dag, dag_inputs, workflow_id)
    logger.info(f'Workflow job created. [id="{workflow_id}"].')

    context = workflow_context.WorkflowStepContext(workflow_id=workflow_id)
    with workflow_context.workflow_step_context(context):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)
        ws.save_workflow_user_metadata(metadata)

        job_id = ray.get_runtime_context().job_id.hex()

        try:
            ws.get_entrypoint_step_id()
            wf_exists = True
        except Exception:
            # The workflow does not exist. We must checkpoint entry workflow.
            ws.save_workflow_execution_state("", state)
            wf_exists = False
        workflow_manager = get_or_create_management_actor()
        if ray.get(workflow_manager.is_workflow_running.remote(workflow_id)):
            raise RuntimeError(f"Workflow '{workflow_id}' is already running.")
        if wf_exists:
            return resume(workflow_id)
        ignore_existing = ws.load_workflow_status() == WorkflowStatus.NONE
        ray.get(
            workflow_manager.submit_workflow.remote(
                workflow_id, state, ignore_existing=ignore_existing))
        return workflow_manager.execute_workflow.remote(job_id, context)
Ejemplo n.º 19
0
def _put_helper(identifier: str, obj: Any, workflow_id: str) -> None:
    # TODO (Alex): This check isn't sufficient, it only works for directly
    # nested object refs.
    if isinstance(obj, ray.ObjectRef):
        raise NotImplementedError(
            "Workflow does not support checkpointing nested object references yet."
        )
    key = _obj_id_to_key(identifier)
    from ray.workflow import workflow_storage  # avoid cyclic import

    dump_to_storage(
        key,
        obj,
        workflow_id,
        workflow_storage.get_workflow_storage(workflow_id),
        update_existing=False,
    )
Ejemplo n.º 20
0
def _resolve_dynamic_workflow_refs(workflow_refs: "List[WorkflowRef]"):
    """Get the output of a workflow step with the step ID at runtime.

    We lookup the output by the following order:
    1. Query cached step output in the workflow manager. Fetch the physical
       output object.
    2. If failed to fetch the physical output object, look into the storage
       to see whether the output is checkpointed. Load the checkpoint.
    3. If failed to load the checkpoint, resume the step and get the output.
    """
    workflow_manager = get_or_create_management_actor()
    context = workflow_context.get_workflow_step_context()
    workflow_id = context.workflow_id
    storage_url = context.storage_url
    workflow_ref_mapping = []
    for workflow_ref in workflow_refs:
        step_ref = ray.get(
            workflow_manager.get_cached_step_output.remote(
                workflow_id, workflow_ref.step_id
            )
        )
        get_cached_step = False
        if step_ref is not None:
            try:
                output, _ = _resolve_object_ref(step_ref)
                get_cached_step = True
            except Exception:
                get_cached_step = False
        if not get_cached_step:
            wf_store = workflow_storage.get_workflow_storage()
            try:
                output = wf_store.load_step_output(workflow_ref.step_id)
            except Exception:
                current_step_id = workflow_context.get_current_step_id()
                logger.warning(
                    "Failed to get the output of step "
                    f"{workflow_ref.step_id}. Trying to resume it. "
                    f"Current step: '{current_step_id}'"
                )
                step_ref = recovery.resume_workflow_step(
                    workflow_id, workflow_ref.step_id, storage_url, None
                ).persisted_output
                output, _ = _resolve_object_ref(step_ref)
        workflow_ref_mapping.append(output)
    return workflow_ref_mapping
Ejemplo n.º 21
0
def run(entry_workflow: Workflow,
        workflow_id: Optional[str] = None) -> ray.ObjectRef:
    """Run a workflow asynchronously.
    """
    store = get_global_storage()
    assert ray.is_initialized()
    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    logger.info(f"Workflow job created. [id=\"{workflow_id}\", storage_url="
                f"\"{store.storage_url}\"].")

    with workflow_context.workflow_step_context(workflow_id,
                                                store.storage_url):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)

        wf_exists = True
        try:
            ws.get_entrypoint_step_id()
        except Exception:
            wf_exists = False

        # We only commit for
        #  - virtual actor tasks: it's dynamic tasks, so we always add
        #  - it's a new workflow
        # TODO (yic): follow up with force rerun
        if entry_workflow.data.step_type != StepType.FUNCTION or not wf_exists:
            commit_step(ws, "", entry_workflow, None)
        workflow_manager = get_or_create_management_actor()
        ignore_existing = (entry_workflow.data.step_type != StepType.FUNCTION)
        # NOTE: It is important to 'ray.get' the returned output. This
        # ensures caller of 'run()' holds the reference to the workflow
        # result. Otherwise if the actor removes the reference of the
        # workflow output, the caller may fail to resolve the result.
        result: "WorkflowExecutionResult" = ray.get(
            workflow_manager.run_or_resume.remote(workflow_id,
                                                  ignore_existing))
        if entry_workflow.data.step_type == StepType.FUNCTION:
            return flatten_workflow_output(workflow_id,
                                           result.persisted_output)
        else:
            return flatten_workflow_output(workflow_id, result.volatile_output)
Ejemplo n.º 22
0
def _recover_workflow_step(args: List[Any], kwargs: Dict[str, Any],
                           input_workflows: List[Any],
                           input_workflow_refs: List[WorkflowRef]):
    """A workflow step that recovers the output of an unfinished step.

    Args:
        args: The positional arguments for the step function.
        kwargs: The keyword args for the step function.
        input_workflows: The workflows in the argument of the (original) step.
            They are resolved into physical objects (i.e. the output of the
            workflows) here. They come from other recover workflows we
            construct recursively.

    Returns:
        The output of the recovered step.
    """
    reader = workflow_storage.get_workflow_storage()
    step_id = workflow_context.get_current_step_id()
    func: Callable = reader.load_step_func_body(step_id)
    return func(*args, **kwargs)
Ejemplo n.º 23
0
def _workflow_wait_executor(
    func: Callable,
    context: "WorkflowStepContext",
    job_id: str,
    step_id: "StepID",
    baked_inputs: "_BakedWorkflowInputs",
    runtime_options: "WorkflowStepRuntimeOptions",
) -> Tuple[WaitResult, None]:
    """Executor of 'workflow.wait' steps.

    It returns a tuple that contains wait result. The wait result is a list
    of result of workflows that are ready and a list of workflows that are
    pending.
    """
    # Part 1: Update the context for the step.
    workflow_context.update_workflow_step_context(context, step_id)
    context = workflow_context.get_workflow_step_context()
    step_type = runtime_options.step_type
    assert step_type == StepType.WAIT
    wait_options = runtime_options.ray_options.get("wait_options", {})

    # Part 2: Resolve any ready workflows.
    ready_workflows, remaining_workflows = baked_inputs.wait(**wait_options)
    ready_objects = [
        _resolve_static_workflow_ref(w.ref) for w in ready_workflows
    ]
    output = (ready_objects, remaining_workflows)

    # Part 3: Save the outputs.
    store = workflow_storage.get_workflow_storage()
    # TODO(suquark): Because the outputs are not generated by "workflow.wait",
    # we do not checkpoint the outputs here. Those steps that generate
    # outputs should checkpoint them.
    commit_step(store, step_id, output, exception=None)
    if context.last_step_of_workflow:
        # advance the progress of the workflow
        store.advance_progress(step_id)

    _record_step_status(step_id, WorkflowStatus.SUCCESSFUL)
    logger.info(get_step_status_info(WorkflowStatus.SUCCESSFUL))
    return output
Ejemplo n.º 24
0
def _workflow_wait_executor(
    func: Callable,
    context: "WorkflowStepContext",
    step_id: "StepID",
    baked_inputs: "_BakedWorkflowInputs",
    runtime_options: "WorkflowStepRuntimeOptions",
) -> Tuple[WaitResult, None]:
    """Executor of 'workflow.wait' steps.

    It returns a tuple that contains wait result. The wait result is a list
    of result of workflows that are ready and a list of workflows that are
    pending.
    """
    # Part 1: Update the context for the step.
    workflow_context.update_workflow_step_context(context, step_id)
    context = workflow_context.get_workflow_step_context()
    step_type = runtime_options.step_type
    assert step_type == StepType.WAIT
    wait_options = runtime_options.ray_options.get("wait_options", {})

    # Part 2: Resolve any ready workflows.
    ready_workflows, remaining_workflows = baked_inputs.wait(**wait_options)
    ready_objects = []
    for w in ready_workflows:
        (
            obj,
            _,
        ) = _resolve_object_ref(w.ref.ref)
        ready_objects.append(obj)
    persisted_output = (ready_objects, remaining_workflows)

    # Part 3: Save the outputs.
    store = workflow_storage.get_workflow_storage()
    commit_step(store, step_id, persisted_output, exception=None)
    if context.last_step_of_workflow:
        # advance the progress of the workflow
        store.advance_progress(step_id)

    _record_step_status(step_id, WorkflowStatus.SUCCESSFUL)
    logger.info(get_step_status_info(WorkflowStatus.SUCCESSFUL))
    return persisted_output, None
Ejemplo n.º 25
0
def list_all(
        status_filter: Set[WorkflowStatus]
) -> List[Tuple[str, WorkflowStatus]]:
    try:
        workflow_manager = get_management_actor()
    except ValueError:
        workflow_manager = None

    if workflow_manager is None:
        runnings = []
    else:
        runnings = ray.get(workflow_manager.list_running_workflow.remote())
    if WorkflowStatus.RUNNING in status_filter and len(status_filter) == 1:
        return [(r, WorkflowStatus.RUNNING) for r in runnings]

    runnings = set(runnings)
    # Here we don't have workflow id, so use empty one instead
    store = workflow_storage.get_workflow_storage("")

    # This is the tricky part: the status "RESUMABLE" neither come from
    # the workflow management actor nor the storage. It is the status where
    # the storage says it is running but the workflow management actor
    # is not running it. This usually happened when there was a sudden crash
    # of the whole Ray runtime or the workflow management actor
    # (due to cluster etc.). So we includes 'RUNNING' status in the storage
    # filter to get "RESUMABLE" candidates.
    storage_status_filter = status_filter.copy()
    storage_status_filter.add(WorkflowStatus.RUNNING)
    status_from_storage = store.list_workflow(storage_status_filter)
    ret = []
    for (k, s) in status_from_storage:
        if s == WorkflowStatus.RUNNING:
            if k not in runnings:
                s = WorkflowStatus.RESUMABLE
            else:
                continue
        if s in status_filter:
            ret.append((k, s))
    if WorkflowStatus.RUNNING in status_filter:
        ret.extend((k, WorkflowStatus.RUNNING) for k in runnings)
    return ret
Ejemplo n.º 26
0
def list_all(
        status_filter: Set[WorkflowStatus]
) -> List[Tuple[str, WorkflowStatus]]:
    try:
        workflow_manager = get_management_actor()
    except ValueError:
        workflow_manager = None

    if workflow_manager is None:
        runnings = []
    else:
        runnings = ray.get(workflow_manager.list_running_workflow.remote())
    if WorkflowStatus.RUNNING in status_filter and len(status_filter) == 1:
        return [(r, WorkflowStatus.RUNNING) for r in runnings]

    runnings = set(runnings)
    # Here we don't have workflow id, so use empty one instead
    store = workflow_storage.get_workflow_storage("")

    exclude_running = False
    if (WorkflowStatus.RESUMABLE in status_filter
            and WorkflowStatus.RUNNING not in status_filter):
        # Here we have to add "RUNNING" to the status filter, because some "RESUMABLE"
        # workflows are converted from "RUNNING" workflows below.
        exclude_running = True
        status_filter.add(WorkflowStatus.RUNNING)
    status_from_storage = store.list_workflow(status_filter)
    ret = []
    for (k, s) in status_from_storage:
        if s == WorkflowStatus.RUNNING:
            if k not in runnings:
                s = WorkflowStatus.RESUMABLE
            elif exclude_running:
                continue
        if s in status_filter:
            ret.append((k, s))
    return ret
Ejemplo n.º 27
0
def _workflow_step_executor(
    func: Callable,
    context: "WorkflowStepContext",
    step_id: "StepID",
    baked_inputs: "_BakedWorkflowInputs",
    runtime_options: "WorkflowStepRuntimeOptions",
) -> Tuple[Any, Any]:
    """Executor function for workflow step.

    Args:
        step_id: ID of the step.
        func: The workflow step function.
        baked_inputs: The processed inputs for the step.
        context: Workflow step context. Used to access correct storage etc.
        runtime_options: Parameters for workflow step execution.

    Returns:
        Workflow step output.
    """
    # Part 1: update the context for the step
    workflow_context.update_workflow_step_context(context, step_id)
    context = workflow_context.get_workflow_step_context()
    step_type = runtime_options.step_type

    # Part 2: resolve inputs
    args, kwargs = baked_inputs.resolve()

    # Part 3: execute the step
    store = workflow_storage.get_workflow_storage()
    try:
        step_prerun_metadata = {"start_time": time.time()}
        store.save_step_prerun_metadata(step_id, step_prerun_metadata)
        persisted_output, volatile_output = _wrap_run(func, runtime_options,
                                                      *args, **kwargs)
        step_postrun_metadata = {"end_time": time.time()}
        store.save_step_postrun_metadata(step_id, step_postrun_metadata)
    except Exception as e:
        commit_step(store, step_id, None, exception=e)
        raise e

    # Part 4: save outputs
    if step_type == StepType.READONLY_ACTOR_METHOD:
        if isinstance(volatile_output, Workflow):
            raise TypeError(
                "Returning a Workflow from a readonly virtual actor "
                "is not allowed.")
        assert not isinstance(persisted_output, Workflow)
    else:
        store = workflow_storage.get_workflow_storage()
        commit_step(store, step_id, persisted_output, exception=None)
        if isinstance(persisted_output, Workflow):
            outer_most_step_id = context.outer_most_step_id
            if step_type == StepType.FUNCTION:
                # Passing down outer most step so inner nested steps would
                # access the same outer most step.
                if not context.outer_most_step_id:
                    # The current workflow step returns a nested workflow, and
                    # there is no outer step for the current step. So the
                    # current step is the outer most step for the inner nested
                    # workflow steps.
                    outer_most_step_id = workflow_context.get_current_step_id()
            assert volatile_output is None
            # Execute sub-workflow. Pass down "outer_most_step_id".
            with workflow_context.fork_workflow_step_context(
                    outer_most_step_id=outer_most_step_id):
                result = execute_workflow(persisted_output)
            # When virtual actor returns a workflow in the method,
            # the volatile_output and persisted_output will be put together
            persisted_output = result.persisted_output
            volatile_output = result.volatile_output
        elif context.last_step_of_workflow:
            # advance the progress of the workflow
            store.advance_progress(step_id)
        _record_step_status(step_id, WorkflowStatus.SUCCESSFUL)
    logger.info(get_step_status_info(WorkflowStatus.SUCCESSFUL))
    if isinstance(volatile_output, Workflow):
        # This is the case where a step method is called in the virtual actor.
        # We need to run the method to get the final result.
        assert step_type == StepType.ACTOR_METHOD
        volatile_output = volatile_output.run_async(
            workflow_context.get_current_workflow_id())
    return persisted_output, volatile_output
Ejemplo n.º 28
0
def _workflow_step_executor(step_type: StepType, func: Callable,
                            context: workflow_context.WorkflowStepContext,
                            step_id: "StepID",
                            baked_inputs: "_BakedWorkflowInputs",
                            catch_exceptions: bool, max_retries: int) -> Any:
    """Executor function for workflow step.

    Args:
        step_type: The type of workflow step.
        func: The workflow step function.
        context: Workflow step context. Used to access correct storage etc.
        step_id: The ID of the step.
        baked_inputs: The processed inputs for the step.
        catch_exceptions: If set to be true, return
            (Optional[Result], Optional[Error]) instead of Result.
        max_retries: Max number of retries encounter of a failure.

    Returns:
        Workflow step output.
    """
    workflow_context.update_workflow_step_context(context, step_id)
    args, kwargs = _resolve_step_inputs(baked_inputs)
    store = workflow_storage.get_workflow_storage()
    try:
        persisted_output, volatile_output = _wrap_run(
            func, step_type, step_id, catch_exceptions, max_retries, *args,
            **kwargs)
    except Exception as e:
        commit_step(store, step_id, None, e)
        raise e
    if step_type == StepType.READONLY_ACTOR_METHOD:
        if isinstance(volatile_output, Workflow):
            raise TypeError(
                "Returning a Workflow from a readonly virtual actor "
                "is not allowed.")
        assert not isinstance(persisted_output, Workflow)
    else:
        store = workflow_storage.get_workflow_storage()
        commit_step(store, step_id, persisted_output, None)
        outer_most_step_id = context.outer_most_step_id
        if isinstance(persisted_output, Workflow):
            if step_type == StepType.FUNCTION:
                # Passing down outer most step so inner nested steps would
                # access the same outer most step.
                if not context.outer_most_step_id:
                    # The current workflow step returns a nested workflow, and
                    # there is no outer step for the current step. So the
                    # current step is the outer most step for the inner nested
                    # workflow steps.
                    outer_most_step_id = workflow_context.get_current_step_id()
            assert volatile_output is None
            # Execute sub-workflow. Pass down "outer_most_step_id".
            with workflow_context.fork_workflow_step_context(
                    outer_most_step_id=outer_most_step_id):
                result = execute_workflow(persisted_output)
            # When virtual actor returns a workflow in the method,
            # the volatile_output and persisted_output will be put together
            persisted_output = result.persisted_output
            volatile_output = result.volatile_output
        elif context.last_step_of_workflow:
            # advance the progress of the workflow
            store.advance_progress(step_id)
        _record_step_status(step_id, WorkflowStatus.SUCCESSFUL)
    logger.info(get_step_status_info(WorkflowStatus.SUCCESSFUL))
    if isinstance(volatile_output, Workflow):
        # This is the case where a step method is called in the virtual actor.
        # We need to run the method to get the final result.
        assert step_type == StepType.ACTOR_METHOD
        volatile_output = volatile_output.run_async(
            workflow_context.get_current_workflow_id())
    return persisted_output, volatile_output
Ejemplo n.º 29
0
def _workflow_step_executor(
    func: Callable,
    context: "WorkflowStepContext",
    step_id: "StepID",
    baked_inputs: "_BakedWorkflowInputs",
    runtime_options: "WorkflowStepRuntimeOptions",
    inplace: bool = False,
) -> Tuple[Any, Any]:
    """Executor function for workflow step.

    Args:
        step_id: ID of the step.
        func: The workflow step function.
        baked_inputs: The processed inputs for the step.
        context: Workflow step context. Used to access correct storage etc.
        runtime_options: Parameters for workflow step execution.
        inplace: Execute the workflow inplace.

    Returns:
        Workflow step output.
    """
    # Part 1: update the context for the step
    workflow_context.update_workflow_step_context(context, step_id)
    context = workflow_context.get_workflow_step_context()
    step_type = runtime_options.step_type
    context.checkpoint_context.checkpoint = runtime_options.checkpoint

    # Part 2: resolve inputs
    args, kwargs = baked_inputs.resolve()

    # Part 3: execute the step
    store = workflow_storage.get_workflow_storage()
    try:
        step_prerun_metadata = {"start_time": time.time()}
        store.save_step_prerun_metadata(step_id, step_prerun_metadata)
        with workflow_context.workflow_execution():
            persisted_output, volatile_output = _wrap_run(
                func, runtime_options, *args, **kwargs)
        step_postrun_metadata = {"end_time": time.time()}
        store.save_step_postrun_metadata(step_id, step_postrun_metadata)
    except Exception as e:
        # Always checkpoint the exception.
        commit_step(store, step_id, None, exception=e)
        raise e

    # Part 4: save outputs
    if step_type == StepType.READONLY_ACTOR_METHOD:
        if isinstance(volatile_output, Workflow):
            raise TypeError(
                "Returning a Workflow from a readonly virtual actor is not allowed."
            )
        assert not isinstance(persisted_output, Workflow)
    else:
        # TODO(suquark): Validate checkpoint options before
        # commit the step.
        store = workflow_storage.get_workflow_storage()
        if CheckpointMode(runtime_options.checkpoint) == CheckpointMode.SYNC:
            commit_step(
                store,
                step_id,
                persisted_output,
                exception=None,
            )
        if isinstance(persisted_output, Workflow):
            sub_workflow = persisted_output
            outer_most_step_id = context.outer_most_step_id
            assert volatile_output is None
            if step_type == StepType.FUNCTION:
                # Passing down outer most step so inner nested steps would
                # access the same outer most step.
                if not context.outer_most_step_id:
                    # The current workflow step returns a nested workflow, and
                    # there is no outer step for the current step. So the
                    # current step is the outer most step for the inner nested
                    # workflow steps.
                    outer_most_step_id = workflow_context.get_current_step_id()
            if inplace:
                _step_options = sub_workflow.data.step_options
                if (_step_options.step_type != StepType.WAIT
                        and runtime_options.ray_options !=
                        _step_options.ray_options):
                    logger.warning(
                        f"Workflow step '{sub_workflow.step_id}' uses "
                        f"a Ray option different to its caller step '{step_id}' "
                        f"and will be executed inplace. Ray assumes it still "
                        f"consumes the same resource as the caller. This may result "
                        f"in oversubscribing resources.")
                return (
                    InplaceReturnedWorkflow(
                        sub_workflow,
                        {"outer_most_step_id": outer_most_step_id}),
                    None,
                )
            # Execute sub-workflow. Pass down "outer_most_step_id".
            with workflow_context.fork_workflow_step_context(
                    outer_most_step_id=outer_most_step_id):
                result = execute_workflow(sub_workflow)
            # When virtual actor returns a workflow in the method,
            # the volatile_output and persisted_output will be put together
            persisted_output = result.persisted_output
            volatile_output = result.volatile_output
        elif context.last_step_of_workflow:
            # advance the progress of the workflow
            store.advance_progress(step_id)
        _record_step_status(step_id, WorkflowStatus.SUCCESSFUL)
    logger.info(get_step_status_info(WorkflowStatus.SUCCESSFUL))
    if isinstance(volatile_output, Workflow):
        # This is the case where a step method is called in the virtual actor.
        # We need to run the method to get the final result.
        assert step_type == StepType.ACTOR_METHOD
        volatile_output = volatile_output.run_async(
            workflow_context.get_current_workflow_id())
    return persisted_output, volatile_output
Ejemplo n.º 30
0
def _load_ref_helper(key: str, workflow_id: str):
    # TODO(Alex): We should stream the data directly into `cloudpickle.load`.
    from ray.workflow import workflow_storage  # avoid cyclic import

    storage = workflow_storage.get_workflow_storage(workflow_id)
    return storage._get(key)