def options(
    self,
    *,
    max_retries=None,
    catch_exceptions=None,
    name=None,
    metadata=None,
    **ray_options,
) -> "_VirtualActorMethodHelper":
    validate_user_metadata(metadata)
    options = WorkflowStepRuntimeOptions.make(
        step_type=self._options.step_type,
        catch_exceptions=catch_exceptions
        if catch_exceptions is not None
        else self._options.catch_exceptions,
        max_retries=max_retries
        if max_retries is not None
        else self._options.max_retries,
        ray_options={
            **self._options.ray_options,
            **(ray_options if ray_options is not None else {}),
        },
    )
    _self = _VirtualActorMethodHelper(
        self._original_class,
        self._original_method,
        self._method_name,
        runtime_options=options,
    )
    _self._name = name if name is not None else self._name
    _self._user_metadata = {
        **self._user_metadata,
        **(metadata if metadata is not None else {}),
    }
    return _self
def run(
    entry_workflow: Workflow,
    workflow_id: Optional[str] = None,
    metadata: Optional[Dict] = None,
) -> ray.ObjectRef:
    """Run a workflow asynchronously."""
    validate_user_metadata(metadata)
    metadata = metadata or {}

    from ray.workflow.api import _ensure_workflow_initialized

    _ensure_workflow_initialized()
    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    step_type = entry_workflow.data.step_options.step_type
    logger.info(f'Workflow job created. [id="{workflow_id}"]. Type: {step_type}.')

    with workflow_context.workflow_step_context(workflow_id):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)
        ws.save_workflow_user_metadata(metadata)

        wf_exists = True
        try:
            ws.get_entrypoint_step_id()
        except Exception:
            wf_exists = False

        # "Is growing" means we may add steps to the (top-level) workflow
        # to grow the workflow dynamically at runtime.
        is_growing = step_type not in (StepType.FUNCTION, StepType.WAIT)

        # We only commit the entry workflow when:
        # - it is a virtual actor task: such tasks are dynamic, so we always add them;
        # - it is a new workflow.
        # TODO (yic): follow up with force rerun
        if is_growing or not wf_exists:
            # We must checkpoint the entry workflow.
            commit_step(ws, "", entry_workflow, exception=None)
        workflow_manager = get_or_create_management_actor()
        ignore_existing = is_growing
        # NOTE: It is important to 'ray.get' the returned output. This
        # ensures the caller of 'run()' holds the reference to the workflow
        # result. Otherwise, if the actor drops the reference to the
        # workflow output, the caller may fail to resolve the result.
        job_id = ray.get_runtime_context().job_id.hex()
        result: "WorkflowExecutionResult" = ray.get(
            workflow_manager.run_or_resume.remote(job_id, workflow_id, ignore_existing)
        )
        if not is_growing:
            return flatten_workflow_output(workflow_id, result.persisted_output)
        else:
            return flatten_workflow_output(workflow_id, result.volatile_output)
def options(
    self,
    *,
    max_retries: int = None,
    catch_exceptions: bool = None,
    name: str = None,
    metadata: Dict[str, Any] = None,
    allow_inplace: bool = None,
    checkpoint: "Optional[CheckpointModeType]" = None,
    **ray_options,
) -> "WorkflowStepFunction":
    """This function sets how the step function is going to be executed.

    Args:
        max_retries: The number of retries for the step when it raises an
            application-level error.
        catch_exceptions: Whether the user wants to handle the failure
            manually. If set to True, (Optional[R], Optional[E]) will be
            returned; if set to False, the normal result will be returned.
        name: The name of this step, which will be used to generate the
            step_id of the step. The name will be used directly as the
            step id if possible, otherwise deduplicated by appending
            .N suffixes.
        metadata: Metadata to add to the step.
        allow_inplace: Execute the workflow step inplace.
        checkpoint: The option for checkpointing.
        **ray_options: All parameters in this field will be passed to the
            Ray remote function options.

    Returns:
        The step function itself.
    """
    validate_user_metadata(metadata)
    name = name if name is not None else self._name
    metadata = {
        **self._user_metadata,
        **(metadata if metadata is not None else {}),
    }
    step_options = WorkflowStepRuntimeOptions.make(
        step_type=StepType.FUNCTION,
        catch_exceptions=catch_exceptions
        if catch_exceptions is not None
        else self._step_options.catch_exceptions,
        max_retries=max_retries
        if max_retries is not None
        else self._step_options.max_retries,
        allow_inplace=allow_inplace
        if allow_inplace is not None
        else self._step_options.allow_inplace,
        checkpoint=_inherit_checkpoint_option(checkpoint),
        ray_options={
            **self._step_options.ray_options,
            **(ray_options if ray_options is not None else {}),
        },
    )
    return WorkflowStepFunction(
        self._func, step_options=step_options, name=name, metadata=metadata
    )
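# A minimal usage sketch for the options() method above, assuming the legacy
# "@workflow.step" decorator API that produces a WorkflowStepFunction. The
# function `add`, the step name, and the option values are illustrative only.
from ray import workflow


@workflow.step
def add(x: int, y: int) -> int:
    return x + y


# options() returns a new WorkflowStepFunction with the overridden runtime
# options; .step() then builds a (not yet executed) Workflow object. In this
# legacy API, calling .run() / .run_async() on it would execute the workflow.
add_with_retries = add.options(
    max_retries=3,
    catch_exceptions=True,
    name="add_step",
    metadata={"owner": "example"},
)
wf = add_with_retries.step(1, 2)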
def __init__(
    self,
    func: Callable,
    *,
    step_options: "WorkflowStepRuntimeOptions" = None,
    name: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
):
    validate_user_metadata(metadata)
    self._func = func
    self._step_options = step_options
    self._func_signature = signature.extract_signature(func)
    self._name = name or ""
    self._user_metadata = metadata or {}

    # Override signature and docstring
    @functools.wraps(func)
    def _build_workflow(*args, **kwargs) -> Workflow:
        flattened_args = signature.flatten_args(self._func_signature, args, kwargs)

        def prepare_inputs():
            from ray.workflow.api import _ensure_workflow_initialized

            _ensure_workflow_initialized()
            return serialization_context.make_workflow_inputs(flattened_args)

        nonlocal step_options
        if step_options is None:
            step_options = WorkflowStepRuntimeOptions.make(step_type=StepType.FUNCTION)
        # We could have "checkpoint=None" when we use @workflow.step
        # with arguments. Avoid this by updating it here.
        step_options.checkpoint = _inherit_checkpoint_option(step_options.checkpoint)

        workflow_data = WorkflowData(
            func_body=self._func,
            inputs=None,
            step_options=step_options,
            name=self._name,
            user_metadata=self._user_metadata,
        )
        return Workflow(workflow_data, prepare_inputs)

    self.step = _build_workflow
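# A minimal sketch of how this constructor is used: wrapping a plain function
# yields an object whose .step(...) call builds a lazy Workflow instead of
# executing the function. The import path and the `double` function are
# assumptions for illustration.
from ray.workflow.step_function import WorkflowStepFunction  # assumed module path


def double(x: int) -> int:
    return 2 * x


double_step = WorkflowStepFunction(double, name="double", metadata={"stage": "demo"})
wf = double_step.step(21)  # a Workflow object; nothing has executed yet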
def run(
    dag: DAGNode,
    dag_inputs: DAGInputData,
    workflow_id: Optional[str] = None,
    metadata: Optional[Dict] = None,
) -> ray.ObjectRef:
    """Run a workflow asynchronously."""
    validate_user_metadata(metadata)
    metadata = metadata or {}

    from ray.workflow.api import _ensure_workflow_initialized

    _ensure_workflow_initialized()

    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    state = workflow_state_from_dag(dag, dag_inputs, workflow_id)
    logger.info(f'Workflow job created. [id="{workflow_id}"].')

    context = workflow_context.WorkflowStepContext(workflow_id=workflow_id)
    with workflow_context.workflow_step_context(context):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)
        ws.save_workflow_user_metadata(metadata)
        job_id = ray.get_runtime_context().job_id.hex()

        try:
            ws.get_entrypoint_step_id()
            wf_exists = True
        except Exception:
            # The workflow does not exist. We must checkpoint the entry workflow.
            ws.save_workflow_execution_state("", state)
            wf_exists = False
        workflow_manager = get_or_create_management_actor()
        if ray.get(workflow_manager.is_workflow_running.remote(workflow_id)):
            raise RuntimeError(f"Workflow '{workflow_id}' is already running.")
        if wf_exists:
            return resume(workflow_id)
        ignore_existing = ws.load_workflow_status() == WorkflowStatus.NONE
        ray.get(
            workflow_manager.submit_workflow.remote(
                workflow_id, state, ignore_existing=ignore_existing
            )
        )
        return workflow_manager.execute_workflow.remote(job_id, context)
def __init__(self, **workflow_options: Dict[str, Any]):
    # TODO(suquark): More rigorous argument checking, like the checks for
    # @ray.remote arguments. This is fairly complex, but we should enable it later.
    valid_options = {
        "name",
        "metadata",
        "catch_exceptions",
        "allow_inplace",
        "checkpoint",
    }
    invalid_keywords = set(workflow_options.keys()) - valid_options
    if invalid_keywords:
        raise ValueError(
            f"Invalid option keywords {invalid_keywords} for workflow steps. "
            f"Valid ones are {valid_options}."
        )
    from ray.workflow.common import WORKFLOW_OPTIONS

    validate_user_metadata(workflow_options.get("metadata"))
    self.options = {"_metadata": {WORKFLOW_OPTIONS: workflow_options}}
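# A hedged sketch of how these validated options are consumed with the
# DAG-based API, mirroring the usage shown in the comments of _node_visitor
# below. The task body and option values are illustrative only.
import ray
from ray import workflow


@ray.remote
def echo(x):
    return x


# workflow.options(...) produces the "_metadata" payload built above;
# unpacking it into .options() attaches it to the Ray task's bound options,
# where _node_visitor later pops it off as workflow options.
dag = echo.options(
    **workflow.options(name="echo_step", catch_exceptions=True, checkpoint=False)
).bind("hello")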
def _node_visitor(node: Any) -> Any:
    if isinstance(node, FunctionNode):
        bound_options = node._bound_options.copy()
        num_returns = bound_options.get("num_returns", 1)
        if num_returns is None:  # Ray may use `None` as the default value.
            num_returns = 1
        if num_returns > 1:
            raise ValueError("Workflow steps can only have one return.")

        workflow_options = bound_options.pop("_metadata", {}).get(WORKFLOW_OPTIONS, {})

        # If the checkpoint option is not specified, inherit the checkpoint
        # option from the context (i.e. the checkpoint option of the outer
        # step). If it is still not specified, it defaults to True.
        checkpoint = workflow_options.get("checkpoint", None)
        if checkpoint is None:
            checkpoint = context.checkpoint if context is not None else True
        # When it returns a nested workflow, catch_exceptions
        # should be passed recursively.
        catch_exceptions = workflow_options.get("catch_exceptions", None)
        if catch_exceptions is None:
            # TODO(suquark): should we also handle exceptions from a "leaf node"
            # in the continuation? For example, we have a workflow
            # > @ray.remote
            # > def A(): pass
            # > @ray.remote
            # > def B(x): return x
            # > @ray.remote
            # > def C(x): return workflow.continuation(B.bind(A.bind()))
            # > dag = C.options(**workflow.options(catch_exceptions=True)).bind()
            # Should C catch exceptions from A?
            if node.get_stable_uuid() == dag_node.get_stable_uuid():
                # The 'catch_exceptions' context should be passed down to
                # its direct continuation task.
                # In this case, the direct continuation is the output node.
                catch_exceptions = (
                    context.catch_exceptions if context is not None else False
                )
            else:
                catch_exceptions = False

        max_retries = bound_options.get("max_retries", 3)
        if not isinstance(max_retries, int) or max_retries < -1:
            raise ValueError("'max_retries' only accepts 0, -1 or a positive integer.")

        step_options = WorkflowStepRuntimeOptions(
            step_type=StepType.FUNCTION,
            catch_exceptions=catch_exceptions,
            max_retries=max_retries,
            allow_inplace=False,
            checkpoint=checkpoint,
            ray_options=bound_options,
        )

        workflow_refs: List[WorkflowRef] = []
        with serialization_context.workflow_args_serialization_context(workflow_refs):
            _func_signature = signature.extract_signature(node._body)
            flattened_args = signature.flatten_args(
                _func_signature, node._bound_args, node._bound_kwargs
            )
            # NOTE: When calling 'ray.put', we trigger Python object
            # serialization. Under our serialization context,
            # workflows are separated from the arguments,
            # leaving a placeholder object with all other Python objects.
            # Then we put the placeholder object into the object store,
            # so it won't be mutated later. This guarantees correct
            # semantics. See "tests/test_variable_mutable.py" as
            # an example.
            input_placeholder: ray.ObjectRef = ray.put(flattened_args)

        name = workflow_options.get("name")
        if name is None:
            name = f"{get_module(node._body)}.{slugify(get_qualname(node._body))}"
        task_id = ray.get(mgr.gen_step_id.remote(workflow_id, name))
        state.add_dependencies(task_id, [s.task_id for s in workflow_refs])
        state.task_input_args[task_id] = input_placeholder

        user_metadata = workflow_options.pop("metadata", {})
        validate_user_metadata(user_metadata)
        state.tasks[task_id] = Task(
            name=name,
            options=step_options,
            user_metadata=user_metadata,
            func_body=node._body,
        )
        return WorkflowRef(task_id)

    if isinstance(node, InputAttributeNode):
        return node._execute_impl()  # get data from input node
    if isinstance(node, InputNode):
        return input_context  # replace input node with input data
    if not isinstance(node, DAGNode):
        return node  # return normal objects
    raise TypeError(f"Unsupported DAG node: {node}")
def run_async(
    dag: DAGNode,
    *args,
    workflow_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> ray.ObjectRef:
    """Run a workflow asynchronously.

    If a workflow with the given id already exists, it will be resumed.

    Args:
        workflow_id: A unique identifier that can be used to resume the
            workflow. If not specified, a random id will be generated.
        metadata: The metadata to add to the workflow. It must be
            JSON-serializable.

    Returns:
        The running result as ray.ObjectRef.
    """
    _ensure_workflow_initialized()
    if not isinstance(dag, DAGNode):
        raise TypeError("Input should be a DAG.")
    input_data = DAGInputData(*args, **kwargs)
    validate_user_metadata(metadata)
    metadata = metadata or {}

    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    state = workflow_state_from_dag(dag, input_data, workflow_id)
    logger.info(f'Workflow job created. [id="{workflow_id}"].')

    context = workflow_context.WorkflowStepContext(workflow_id=workflow_id)
    with workflow_context.workflow_step_context(context):
        # checkpoint the workflow
        ws = WorkflowStorage(workflow_id)
        ws.save_workflow_user_metadata(metadata)
        job_id = ray.get_runtime_context().job_id.hex()

        try:
            ws.get_entrypoint_step_id()
            wf_exists = True
        except Exception:
            # The workflow does not exist. We must checkpoint the entry workflow.
            ws.save_workflow_execution_state("", state)
            wf_exists = False
        workflow_manager = workflow_access.get_management_actor()
        if ray.get(workflow_manager.is_workflow_non_terminating.remote(workflow_id)):
            raise RuntimeError(
                f"Workflow '{workflow_id}' is already running or pending."
            )
        if wf_exists:
            return resume_async(workflow_id)
        ignore_existing = ws.load_workflow_status() == WorkflowStatus.NONE
        ray.get(
            workflow_manager.submit_workflow.remote(
                workflow_id, state, ignore_existing=ignore_existing
            )
        )
        return workflow_manager.execute_workflow.remote(job_id, context)
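# A minimal end-to-end sketch for run_async(), assuming the DAG API shown in
# the comments above. The workflow id and the `hello` task are illustrative only.
import ray
from ray import workflow


@ray.remote
def hello(name: str) -> str:
    return f"Hello, {name}!"


ray.init()
# run_async() returns an ObjectRef immediately; resolving it yields the
# workflow output once execution completes.
output_ref = workflow.run_async(hello.bind("workflow"), workflow_id="hello_demo")
print(ray.get(output_ref))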