Ejemplo n.º 1
0
 def _create(self, args: Tuple[Any], kwargs: Dict[str, Any]):
     workflow_storage = WorkflowStorage(self._actor_id, self._storage)
     workflow_storage.save_actor_class_body(self._metadata.cls)
     ref = self._actor_method_call("__init__", args, kwargs)
     workflow_manager = get_or_create_management_actor()
     # keep the ref in a list to prevent dereference
     ray.get(workflow_manager.init_actor.remote(self._actor_id, [ref]))
Ejemplo n.º 2
0
def run(entry_workflow: Workflow,
        workflow_id: Optional[str] = None,
        metadata: Optional[Dict] = None) -> ray.ObjectRef:
    """Run a workflow asynchronously.
    """
    if metadata is not None:
        if not isinstance(metadata, dict):
            raise ValueError("metadata must be a dict.")
        for k, v in metadata.items():
            try:
                json.dumps(v)
            except TypeError as e:
                raise ValueError("metadata values must be JSON serializable, "
                                 "however '{}' has a value whose {}.".format(
                                     k, e))
    metadata = metadata or {}

    store = get_global_storage()
    assert ray.is_initialized()
    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    logger.info(
        f"Workflow job created. [id=\"{workflow_id}\", storage_url="
        f"\"{store.storage_url}\"]. Type: {entry_workflow.data.step_type} ")

    with workflow_context.workflow_step_context(workflow_id,
                                                store.storage_url):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)
        ws.save_workflow_user_metadata(metadata)

        wf_exists = True
        try:
            ws.get_entrypoint_step_id()
        except Exception:
            wf_exists = False

        # We only commit for
        #  - virtual actor tasks: it's dynamic tasks, so we always add
        #  - it's a new workflow
        # TODO (yic): follow up with force rerun
        if entry_workflow.data.step_type != StepType.FUNCTION or not wf_exists:
            commit_step(ws, "", entry_workflow, exception=None)
        workflow_manager = get_or_create_management_actor()
        ignore_existing = (entry_workflow.data.step_type != StepType.FUNCTION)
        # NOTE: It is important to 'ray.get' the returned output. This
        # ensures caller of 'run()' holds the reference to the workflow
        # result. Otherwise if the actor removes the reference of the
        # workflow output, the caller may fail to resolve the result.
        result: "WorkflowExecutionResult" = ray.get(
            workflow_manager.run_or_resume.remote(workflow_id,
                                                  ignore_existing))
        if entry_workflow.data.step_type == StepType.FUNCTION:
            return flatten_workflow_output(workflow_id,
                                           result.persisted_output)
        else:
            return flatten_workflow_output(workflow_id, result.volatile_output)
Ejemplo n.º 3
0
def run(
    entry_workflow: Workflow,
    workflow_id: Optional[str] = None,
    metadata: Optional[Dict] = None,
) -> ray.ObjectRef:
    """Run a workflow asynchronously."""
    validate_user_metadata(metadata)
    metadata = metadata or {}

    from ray.workflow.api import _ensure_workflow_initialized

    _ensure_workflow_initialized()

    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"
    step_type = entry_workflow.data.step_options.step_type

    logger.info(
        f'Workflow job created. [id="{workflow_id}"]. Type: {step_type}.')

    with workflow_context.workflow_step_context(workflow_id):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)
        ws.save_workflow_user_metadata(metadata)

        wf_exists = True
        try:
            ws.get_entrypoint_step_id()
        except Exception:
            wf_exists = False

        # "Is growing" means we could adding steps to the (top-level)
        # workflow to grow the workflow dynamically at runtime.
        is_growing = step_type not in (StepType.FUNCTION, StepType.WAIT)

        # We only commit for
        #  - virtual actor tasks: it's dynamic tasks, so we always add
        #  - it's a new workflow
        # TODO (yic): follow up with force rerun
        if is_growing or not wf_exists:
            # We must checkpoint entry workflow.
            commit_step(ws, "", entry_workflow, exception=None)
        workflow_manager = get_or_create_management_actor()
        ignore_existing = is_growing
        # NOTE: It is important to 'ray.get' the returned output. This
        # ensures caller of 'run()' holds the reference to the workflow
        # result. Otherwise if the actor removes the reference of the
        # workflow output, the caller may fail to resolve the result.
        job_id = ray.get_runtime_context().job_id.hex()
        result: "WorkflowExecutionResult" = ray.get(
            workflow_manager.run_or_resume.remote(job_id, workflow_id,
                                                  ignore_existing))
        if not is_growing:
            return flatten_workflow_output(workflow_id,
                                           result.persisted_output)
        else:
            return flatten_workflow_output(workflow_id, result.volatile_output)
Ejemplo n.º 4
0
 def _create(self, args: Tuple[Any], kwargs: Dict[str, Any]):
     workflow_storage = WorkflowStorage(self._actor_id)
     workflow_storage.save_actor_class_body(self._metadata.cls)
     method_helper = self._metadata.methods["__init__"]
     job_id = ray.get_runtime_context().job_id.hex()
     ref = self._actor_method_call(job_id, method_helper, args, kwargs)
     workflow_manager = get_or_create_management_actor()
     # keep the ref in a list to prevent dereference
     ray.get(workflow_manager.init_actor.remote(self._actor_id, [ref]))
Ejemplo n.º 5
0
    def step_id(self) -> StepID:
        if self._step_id is not None:
            return self._step_id

        from ray.workflow.workflow_access import \
            get_or_create_management_actor
        mgr = get_or_create_management_actor()
        self._step_id = ray.get(
            mgr.gen_step_id.remote(self._workflow_id, self._name))
        return self._step_id
Ejemplo n.º 6
0
def resume(workflow_id: str) -> ray.ObjectRef:
    """Resume a workflow asynchronously. See "api.resume()" for details."""
    logger.info(f'Resuming workflow [id="{workflow_id}"].')
    workflow_manager = get_or_create_management_actor()
    # NOTE: It is important to 'ray.get' the returned output. This
    # ensures caller of 'run()' holds the reference to the workflow
    # result. Otherwise if the actor removes the reference of the
    # workflow output, the caller may fail to resolve the result.
    job_id = ray.get_runtime_context().job_id.hex()
    result: "WorkflowExecutionResult" = ray.get(
        workflow_manager.run_or_resume.remote(job_id,
                                              workflow_id,
                                              ignore_existing=False))
    logger.info(f"Workflow job {workflow_id} resumed.")
    return flatten_workflow_output(workflow_id, result.persisted_output)
Ejemplo n.º 7
0
def resume(workflow_id: str) -> ray.ObjectRef:
    """Resume a workflow asynchronously. See "api.resume()" for details."""
    logger.info(f'Resuming workflow [id="{workflow_id}"].')
    workflow_manager = get_or_create_management_actor()
    if ray.get(workflow_manager.is_workflow_running.remote(workflow_id)):
        raise RuntimeError(f"Workflow '{workflow_id}' is already running.")
    # NOTE: It is important to 'ray.get' the returned output. This
    # ensures caller of 'run()' holds the reference to the workflow
    # result. Otherwise if the actor removes the reference of the
    # workflow output, the caller may fail to resolve the result.
    job_id = ray.get_runtime_context().job_id.hex()

    context = workflow_context.WorkflowStepContext(workflow_id=workflow_id)
    ray.get(workflow_manager.reconstruct_workflow.remote(job_id, context))
    result = workflow_manager.execute_workflow.remote(job_id, context)
    logger.info(f"Workflow job {workflow_id} resumed.")
    return result
Ejemplo n.º 8
0
def run(
    dag: DAGNode,
    dag_inputs: DAGInputData,
    workflow_id: Optional[str] = None,
    metadata: Optional[Dict] = None,
) -> ray.ObjectRef:
    """Run a workflow asynchronously."""
    validate_user_metadata(metadata)
    metadata = metadata or {}

    from ray.workflow.api import _ensure_workflow_initialized

    _ensure_workflow_initialized()

    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    state = workflow_state_from_dag(dag, dag_inputs, workflow_id)
    logger.info(f'Workflow job created. [id="{workflow_id}"].')

    context = workflow_context.WorkflowStepContext(workflow_id=workflow_id)
    with workflow_context.workflow_step_context(context):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)
        ws.save_workflow_user_metadata(metadata)

        job_id = ray.get_runtime_context().job_id.hex()

        try:
            ws.get_entrypoint_step_id()
            wf_exists = True
        except Exception:
            # The workflow does not exist. We must checkpoint entry workflow.
            ws.save_workflow_execution_state("", state)
            wf_exists = False
        workflow_manager = get_or_create_management_actor()
        if ray.get(workflow_manager.is_workflow_running.remote(workflow_id)):
            raise RuntimeError(f"Workflow '{workflow_id}' is already running.")
        if wf_exists:
            return resume(workflow_id)
        ignore_existing = ws.load_workflow_status() == WorkflowStatus.NONE
        ray.get(
            workflow_manager.submit_workflow.remote(
                workflow_id, state, ignore_existing=ignore_existing))
        return workflow_manager.execute_workflow.remote(job_id, context)
Ejemplo n.º 9
0
def _resolve_dynamic_workflow_refs(workflow_refs: "List[WorkflowRef]"):
    """Get the output of a workflow step with the step ID at runtime.

    We lookup the output by the following order:
    1. Query cached step output in the workflow manager. Fetch the physical
       output object.
    2. If failed to fetch the physical output object, look into the storage
       to see whether the output is checkpointed. Load the checkpoint.
    3. If failed to load the checkpoint, resume the step and get the output.
    """
    workflow_manager = get_or_create_management_actor()
    context = workflow_context.get_workflow_step_context()
    workflow_id = context.workflow_id
    storage_url = context.storage_url
    workflow_ref_mapping = []
    for workflow_ref in workflow_refs:
        step_ref = ray.get(
            workflow_manager.get_cached_step_output.remote(
                workflow_id, workflow_ref.step_id
            )
        )
        get_cached_step = False
        if step_ref is not None:
            try:
                output, _ = _resolve_object_ref(step_ref)
                get_cached_step = True
            except Exception:
                get_cached_step = False
        if not get_cached_step:
            wf_store = workflow_storage.get_workflow_storage()
            try:
                output = wf_store.load_step_output(workflow_ref.step_id)
            except Exception:
                current_step_id = workflow_context.get_current_step_id()
                logger.warning(
                    "Failed to get the output of step "
                    f"{workflow_ref.step_id}. Trying to resume it. "
                    f"Current step: '{current_step_id}'"
                )
                step_ref = recovery.resume_workflow_step(
                    workflow_id, workflow_ref.step_id, storage_url, None
                ).persisted_output
                output, _ = _resolve_object_ref(step_ref)
        workflow_ref_mapping.append(output)
    return workflow_ref_mapping
Ejemplo n.º 10
0
def run(entry_workflow: Workflow,
        workflow_id: Optional[str] = None) -> ray.ObjectRef:
    """Run a workflow asynchronously.
    """
    store = get_global_storage()
    assert ray.is_initialized()
    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    logger.info(f"Workflow job created. [id=\"{workflow_id}\", storage_url="
                f"\"{store.storage_url}\"].")

    with workflow_context.workflow_step_context(workflow_id,
                                                store.storage_url):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)

        wf_exists = True
        try:
            ws.get_entrypoint_step_id()
        except Exception:
            wf_exists = False

        # We only commit for
        #  - virtual actor tasks: it's dynamic tasks, so we always add
        #  - it's a new workflow
        # TODO (yic): follow up with force rerun
        if entry_workflow.data.step_type != StepType.FUNCTION or not wf_exists:
            commit_step(ws, "", entry_workflow, None)
        workflow_manager = get_or_create_management_actor()
        ignore_existing = (entry_workflow.data.step_type != StepType.FUNCTION)
        # NOTE: It is important to 'ray.get' the returned output. This
        # ensures caller of 'run()' holds the reference to the workflow
        # result. Otherwise if the actor removes the reference of the
        # workflow output, the caller may fail to resolve the result.
        result: "WorkflowExecutionResult" = ray.get(
            workflow_manager.run_or_resume.remote(workflow_id,
                                                  ignore_existing))
        if entry_workflow.data.step_type == StepType.FUNCTION:
            return flatten_workflow_output(workflow_id,
                                           result.persisted_output)
        else:
            return flatten_workflow_output(workflow_id, result.volatile_output)
Ejemplo n.º 11
0
 def ready(self) -> "ObjectRef":
     """Return a future. If 'ray.get()' runs successfully, then the actor
     is fully initialized."""
     # TODO(suquark): should ray.get(xxx.ready()) always be true?
     workflow_manager = get_or_create_management_actor()
     return ray.get(workflow_manager.actor_ready.remote(self._actor_id))
Ejemplo n.º 12
0
def workflow_state_from_dag(dag_node: DAGNode,
                            input_context: Optional[DAGInputData],
                            workflow_id: str):
    """
    Transform a Ray DAG to a workflow. Map FunctionNode to workflow step with
    the workflow decorator.

    Args:
        dag_node: The DAG to be converted to a workflow.
        input_context: The input data that wraps varibles for the input node of the DAG.
        workflow_id: The ID of the workflow.
    """
    if not isinstance(dag_node, FunctionNode):
        raise TypeError(
            "Currently workflow does not support classes as DAG inputs.")

    state = WorkflowExecutionState()

    # TODO(suquark): remove this cyclic importing later by changing the way of
    # task ID assignment.
    from ray.workflow.workflow_access import get_or_create_management_actor

    mgr = get_or_create_management_actor()
    context = workflow_context.get_workflow_step_context()

    def _node_visitor(node: Any) -> Any:
        if isinstance(node, FunctionNode):
            bound_options = node._bound_options.copy()
            num_returns = bound_options.get("num_returns", 1)
            if num_returns is None:  # ray could use `None` as default value
                num_returns = 1
            if num_returns > 1:
                raise ValueError("Workflow steps can only have one return.")

            workflow_options = bound_options.pop("_metadata",
                                                 {}).get(WORKFLOW_OPTIONS, {})

            # If checkpoint option is not specified, inherit checkpoint
            # options from context (i.e. checkpoint options of the outer
            # step). If it is still not specified, it's True by default.
            checkpoint = workflow_options.get("checkpoint", None)
            if checkpoint is None:
                checkpoint = context.checkpoint if context is not None else True
            # When it returns a nested workflow, catch_exception
            # should be passed recursively.
            catch_exceptions = workflow_options.get("catch_exceptions", None)
            if catch_exceptions is None:
                # TODO(suquark): should we also handle exceptions from a "leaf node"
                #   in the continuation? For example, we have a workflow
                #   > @ray.remote
                #   > def A(): pass
                #   > @ray.remote
                #   > def B(x): return x
                #   > @ray.remote
                #   > def C(x): return workflow.continuation(B.bind(A.bind()))
                #   > dag = C.options(**workflow.options(catch_exceptions=True)).bind()
                #   Should C catches exceptions of A?
                if node.get_stable_uuid() == dag_node.get_stable_uuid():
                    # 'catch_exception' context should be passed down to
                    # its direct continuation task.
                    # In this case, the direct continuation is the output node.
                    catch_exceptions = (context.catch_exceptions
                                        if context is not None else False)
                else:
                    catch_exceptions = False

            max_retries = workflow_options.get("max_retries", 3)
            if not isinstance(max_retries, int) or max_retries < -1:
                raise ValueError(
                    "'max_retries' only accepts 0, -1 or a positive integer.")

            step_options = WorkflowStepRuntimeOptions(
                step_type=StepType.FUNCTION,
                catch_exceptions=catch_exceptions,
                max_retries=max_retries,
                allow_inplace=False,
                checkpoint=checkpoint,
                ray_options=bound_options,
            )

            workflow_refs: List[WorkflowRef] = []
            with serialization_context.workflow_args_serialization_context(
                    workflow_refs):
                _func_signature = signature.extract_signature(node._body)
                flattened_args = signature.flatten_args(
                    _func_signature, node._bound_args, node._bound_kwargs)
                # NOTE: When calling 'ray.put', we trigger python object
                # serialization. Under our serialization context,
                # Workflows are separated from the arguments,
                # leaving a placeholder object with all other python objects.
                # Then we put the placeholder object to object store,
                # so it won't be mutated later. This guarantees correct
                # semantics. See "tests/test_variable_mutable.py" as
                # an example.
                input_placeholder: ray.ObjectRef = ray.put(flattened_args)

            name = workflow_options.get("name")
            if name is None:
                name = f"{get_module(node._body)}.{slugify(get_qualname(node._body))}"
            task_id = ray.get(mgr.gen_step_id.remote(workflow_id, name))
            state.add_dependencies(task_id, [s.task_id for s in workflow_refs])
            state.task_input_args[task_id] = input_placeholder

            user_metadata = workflow_options.pop("metadata", {})
            validate_user_metadata(user_metadata)
            state.tasks[task_id] = Task(
                name=name,
                options=step_options,
                user_metadata=user_metadata,
                func_body=node._body,
            )
            return WorkflowRef(task_id)

        if isinstance(node, InputAttributeNode):
            return node._execute_impl()  # get data from input node
        if isinstance(node, InputNode):
            return input_context  # replace input node with input data
        if not isinstance(node, DAGNode):
            return node  # return normal objects
        raise TypeError(f"Unsupported DAG node: {node}")

    output_workflow_ref = dag_node.apply_recursive(_node_visitor)
    state.output_task_id = output_workflow_ref.task_id
    return state