def _resume_workflow_step_executor(
        job_id: str,
        workflow_id: str,
        step_id: "StepID",
        current_output: List[ray.ObjectRef],
) -> Tuple[ray.ObjectRef, ray.ObjectRef]:
    with workflow_context.workflow_logging_context(job_id):
        # TODO (yic): We need better dependency management for virtual actors.
        # The current output will always be empty for a normal workflow.
        # For a virtual actor, if it's not empty, it means the previous job
        # is still running. This is a really bad one.
        for ref in current_output:
            try:
                while isinstance(ref, ray.ObjectRef):
                    ref = ray.get(ref)
            except Exception:
                pass
        try:
            r = _construct_resume_workflow_from_step(workflow_id, step_id)
        except Exception as e:
            raise WorkflowNotResumableError(workflow_id) from e

        if not isinstance(r, Workflow):
            return r, None
        with workflow_context.workflow_step_context(
                workflow_id, last_step_of_workflow=True):
            from ray.workflow.step_executor import execute_workflow
            result = execute_workflow(job_id, r)
            return result.persisted_output, result.volatile_output
def _resume_workflow_step_executor(
        workflow_id: str,
        step_id: "StepID",
        store_url: str,
        current_output: List[ray.ObjectRef],
) -> Tuple[ray.ObjectRef, ray.ObjectRef]:
    # TODO (yic): We need better dependency management for virtual actors.
    # The current output will always be empty for a normal workflow.
    # For a virtual actor, if it's not empty, it means the previous job
    # is still running. This is a really bad one.
    for ref in current_output:
        try:
            while isinstance(ref, ray.ObjectRef):
                ref = ray.get(ref)
        except Exception:
            pass
    try:
        store = storage.create_storage(store_url)
        wf_store = workflow_storage.WorkflowStorage(workflow_id, store)
        r = _construct_resume_workflow_from_step(wf_store, step_id)
    except Exception as e:
        raise WorkflowNotResumableError(workflow_id) from e

    if isinstance(r, Workflow):
        with workflow_context.workflow_step_context(workflow_id,
                                                    store.storage_url):
            from ray.workflow.step_executor import execute_workflow
            result = execute_workflow(r, last_step_of_workflow=True)
            return result.persisted_output, result.volatile_output
    assert isinstance(r, StepID)
    return wf_store.load_step_output(r), None
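# A minimal sketch (hypothetical, assuming a local Ray instance) of the
# nested-ObjectRef resolution loop shared by both resume executors above:
# `ray.get` unwraps one level of ObjectRef at a time, so the loop keeps
# calling it until a plain value remains.
import ray

ray.init()

@ray.remote
def wrap(value):
    # Returning `ray.put(value)` nests an ObjectRef inside the task output.
    return ray.put(value)

ref = wrap.remote(42)  # resolves as ObjectRef -> ObjectRef -> 42
while isinstance(ref, ray.ObjectRef):
    ref = ray.get(ref)
assert ref == 42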
def run(entry_workflow: Workflow,
        workflow_id: Optional[str] = None,
        metadata: Optional[Dict] = None) -> ray.ObjectRef:
    """Run a workflow asynchronously."""
    if metadata is not None:
        if not isinstance(metadata, dict):
            raise ValueError("metadata must be a dict.")
        for k, v in metadata.items():
            try:
                json.dumps(v)
            except TypeError as e:
                raise ValueError("metadata values must be JSON serializable, "
                                 "however '{}' has a value whose {}.".format(
                                     k, e))
    metadata = metadata or {}

    store = get_global_storage()
    assert ray.is_initialized()
    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"
    logger.info(
        f'Workflow job created. [id="{workflow_id}", storage_url='
        f'"{store.storage_url}"]. Type: {entry_workflow.data.step_type}')

    with workflow_context.workflow_step_context(workflow_id,
                                                store.storage_url):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)
        ws.save_workflow_user_metadata(metadata)
        wf_exists = True
        try:
            ws.get_entrypoint_step_id()
        except Exception:
            wf_exists = False

        # We only commit for
        # - virtual actor tasks: they are dynamic tasks, so we always add
        # - new workflows
        # TODO (yic): follow up with force rerun
        if entry_workflow.data.step_type != StepType.FUNCTION or not wf_exists:
            commit_step(ws, "", entry_workflow, exception=None)
        workflow_manager = get_or_create_management_actor()
        ignore_existing = (entry_workflow.data.step_type != StepType.FUNCTION)
        # NOTE: It is important to 'ray.get' the returned output. This
        # ensures the caller of 'run()' holds a reference to the workflow
        # result. Otherwise, if the actor removes its reference to the
        # workflow output, the caller may fail to resolve the result.
        result: "WorkflowExecutionResult" = ray.get(
            workflow_manager.run_or_resume.remote(workflow_id,
                                                  ignore_existing))
        if entry_workflow.data.step_type == StepType.FUNCTION:
            return flatten_workflow_output(workflow_id,
                                           result.persisted_output)
        else:
            return flatten_workflow_output(workflow_id,
                                           result.volatile_output)
def run(
    entry_workflow: Workflow,
    workflow_id: Optional[str] = None,
    metadata: Optional[Dict] = None,
) -> ray.ObjectRef:
    """Run a workflow asynchronously."""
    validate_user_metadata(metadata)
    metadata = metadata or {}

    from ray.workflow.api import _ensure_workflow_initialized
    _ensure_workflow_initialized()

    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"
    step_type = entry_workflow.data.step_options.step_type

    logger.info(
        f'Workflow job created. [id="{workflow_id}"]. Type: {step_type}.')

    with workflow_context.workflow_step_context(workflow_id):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)
        ws.save_workflow_user_metadata(metadata)

        wf_exists = True
        try:
            ws.get_entrypoint_step_id()
        except Exception:
            wf_exists = False

        # "Is growing" means we could be adding steps to the (top-level)
        # workflow to grow the workflow dynamically at runtime.
        is_growing = step_type not in (StepType.FUNCTION, StepType.WAIT)

        # We only commit for
        # - virtual actor tasks: they are dynamic tasks, so we always add
        # - new workflows
        # TODO (yic): follow up with force rerun
        if is_growing or not wf_exists:
            # We must checkpoint the entry workflow.
            commit_step(ws, "", entry_workflow, exception=None)
        workflow_manager = get_or_create_management_actor()
        ignore_existing = is_growing
        # NOTE: It is important to 'ray.get' the returned output. This
        # ensures the caller of 'run()' holds a reference to the workflow
        # result. Otherwise, if the actor removes its reference to the
        # workflow output, the caller may fail to resolve the result.
        job_id = ray.get_runtime_context().job_id.hex()
        result: "WorkflowExecutionResult" = ray.get(
            workflow_manager.run_or_resume.remote(job_id, workflow_id,
                                                  ignore_existing))
        if not is_growing:
            return flatten_workflow_output(workflow_id,
                                           result.persisted_output)
        else:
            return flatten_workflow_output(workflow_id,
                                           result.volatile_output)
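# Hedged sketch of what `validate_user_metadata` might check, reconstructed
# from the inline validation in the earlier version of `run` above; the
# actual helper in ray.workflow may differ.
import json

def validate_user_metadata(metadata):
    if metadata is not None:
        if not isinstance(metadata, dict):
            raise ValueError("metadata must be a dict.")
        for k, v in metadata.items():
            try:
                json.dumps(v)
            except TypeError as e:
                raise ValueError(
                    "metadata values must be JSON serializable, "
                    "however '{}' has a value whose {}.".format(k, e))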
def _actor_method_call(self, method_helper: _VirtualActorMethodHelper,
                       args, kwargs) -> "ObjectRef":
    with workflow_context.workflow_step_context(self._actor_id,
                                                self._storage.storage_url):
        wf = method_helper.step(*args, **kwargs)
        if method_helper.readonly:
            return execute_workflow(wf).volatile_output
        else:
            return wf.run_async(self._actor_id)
def _workflow_step_executor(
    func: Callable,
    context: "WorkflowStepContext",
    task_id: "TaskID",
    baked_inputs: "_BakedWorkflowInputs",
    runtime_options: "WorkflowStepRuntimeOptions",
) -> Tuple[Any, Any]:
    """Executor function for workflow step.

    Args:
        func: The workflow step function.
        context: Workflow step context. Used to access correct storage etc.
        task_id: ID of the step.
        baked_inputs: The processed inputs for the step.
        runtime_options: Parameters for workflow step execution.

    Returns:
        Workflow step output.
    """
    with workflow_context.workflow_step_context(context):
        store = workflow_storage.get_workflow_storage()
        # Part 1: resolve inputs
        args, kwargs = baked_inputs.resolve(store)

        # Part 2: execute the step
        try:
            store.save_step_prerun_metadata(task_id,
                                            {"start_time": time.time()})
            with workflow_context.workflow_execution():
                output = _wrap_run(func, runtime_options, *args, **kwargs)
            store.save_step_postrun_metadata(task_id,
                                             {"end_time": time.time()})
        except Exception as e:
            # Always checkpoint the exception.
            store.save_step_output(task_id, None, exception=e)
            raise e

        if isinstance(output, DAGNode):
            output = workflow_state_from_dag(output, None,
                                             context.workflow_id)
            execution_metadata = WorkflowExecutionMetadata(
                is_output_workflow=True)
        else:
            execution_metadata = WorkflowExecutionMetadata()

        # Part 3: save outputs
        # TODO(suquark): Validate checkpoint options before committing
        # the task.
        if CheckpointMode(runtime_options.checkpoint) == CheckpointMode.SYNC:
            if isinstance(output, WorkflowExecutionState):
                store.save_workflow_execution_state(task_id, output)
            else:
                store.save_step_output(task_id, output, exception=None)
        return execution_metadata, output
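# Hedged sketch of a checkpoint-mode enum consistent with the comparison
# `CheckpointMode(runtime_options.checkpoint) == CheckpointMode.SYNC` above;
# the member names and values are assumptions, not necessarily what
# ray.workflow defines.
from enum import Enum

class CheckpointMode(Enum):
    # Checkpoint the step output to storage before returning.
    SYNC = True
    # Skip checkpointing; recovery re-executes the step instead.
    SKIP = False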
def _actor_method_call(self, method_name: str, args,
                       kwargs) -> "ObjectRef":
    cls = self._metadata.cls
    method = getattr(cls, method_name, None)
    if method is None:
        raise AttributeError(f"Method '{method_name}' does not exist.")
    with workflow_context.workflow_step_context(self._actor_id,
                                                self._storage.storage_url):
        wf = method.step(*args, **kwargs)
        readonly = getattr(method, "__virtual_actor_readonly__", False)
        if readonly:
            return execute_workflow(wf).volatile_output
        else:
            return wf.run_async(self._actor_id)
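# Minimal sketch of how the read-only flag consulted above could be set on
# a method; the decorator below is hypothetical, and only the attribute
# name `__virtual_actor_readonly__` comes from the dispatcher's getattr
# lookup.
def readonly(method):
    method.__virtual_actor_readonly__ = True
    return method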
def run(
    dag: DAGNode,
    dag_inputs: DAGInputData,
    workflow_id: Optional[str] = None,
    metadata: Optional[Dict] = None,
) -> ray.ObjectRef:
    """Run a workflow asynchronously."""
    validate_user_metadata(metadata)
    metadata = metadata or {}

    from ray.workflow.api import _ensure_workflow_initialized
    _ensure_workflow_initialized()

    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    state = workflow_state_from_dag(dag, dag_inputs, workflow_id)
    logger.info(f'Workflow job created. [id="{workflow_id}"].')

    context = workflow_context.WorkflowStepContext(workflow_id=workflow_id)
    with workflow_context.workflow_step_context(context):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)
        ws.save_workflow_user_metadata(metadata)
        job_id = ray.get_runtime_context().job_id.hex()

        try:
            ws.get_entrypoint_step_id()
            wf_exists = True
        except Exception:
            # The workflow does not exist. We must checkpoint the entry
            # workflow.
            ws.save_workflow_execution_state("", state)
            wf_exists = False

        workflow_manager = get_or_create_management_actor()
        if ray.get(workflow_manager.is_workflow_running.remote(workflow_id)):
            raise RuntimeError(f"Workflow '{workflow_id}' is already running.")
        if wf_exists:
            return resume(workflow_id)
        ignore_existing = ws.load_workflow_status() == WorkflowStatus.NONE
        ray.get(
            workflow_manager.submit_workflow.remote(
                workflow_id, state, ignore_existing=ignore_existing))
        return workflow_manager.execute_workflow.remote(job_id, context)
def run(entry_workflow: Workflow,
        workflow_id: Optional[str] = None) -> ray.ObjectRef:
    """Run a workflow asynchronously."""
    store = get_global_storage()
    assert ray.is_initialized()
    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"
    logger.info(f'Workflow job created. [id="{workflow_id}", storage_url='
                f'"{store.storage_url}"].')

    with workflow_context.workflow_step_context(workflow_id,
                                                store.storage_url):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)
        wf_exists = True
        try:
            ws.get_entrypoint_step_id()
        except Exception:
            wf_exists = False

        # We only commit for
        # - virtual actor tasks: they are dynamic tasks, so we always add
        # - new workflows
        # TODO (yic): follow up with force rerun
        if entry_workflow.data.step_type != StepType.FUNCTION or not wf_exists:
            commit_step(ws, "", entry_workflow, None)
        workflow_manager = get_or_create_management_actor()
        ignore_existing = (entry_workflow.data.step_type != StepType.FUNCTION)
        # NOTE: It is important to 'ray.get' the returned output. This
        # ensures the caller of 'run()' holds a reference to the workflow
        # result. Otherwise, if the actor removes its reference to the
        # workflow output, the caller may fail to resolve the result.
        result: "WorkflowExecutionResult" = ray.get(
            workflow_manager.run_or_resume.remote(workflow_id,
                                                  ignore_existing))
        if entry_workflow.data.step_type == StepType.FUNCTION:
            return flatten_workflow_output(workflow_id,
                                           result.persisted_output)
        else:
            return flatten_workflow_output(workflow_id,
                                           result.volatile_output)
def run_async(
    dag: DAGNode,
    *args,
    workflow_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> ray.ObjectRef:
    """Run a workflow asynchronously.

    If a workflow with the given id already exists, it will be resumed.

    Args:
        workflow_id: A unique identifier that can be used to resume the
            workflow. If not specified, a random id will be generated.
        metadata: The metadata to add to the workflow. It must be
            JSON-serializable.

    Returns:
        The running result as a ray.ObjectRef.
    """
    _ensure_workflow_initialized()
    if not isinstance(dag, DAGNode):
        raise TypeError("Input should be a DAG.")
    input_data = DAGInputData(*args, **kwargs)
    validate_user_metadata(metadata)
    metadata = metadata or {}

    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    state = workflow_state_from_dag(dag, input_data, workflow_id)
    logger.info(f'Workflow job created. [id="{workflow_id}"].')

    context = workflow_context.WorkflowStepContext(workflow_id=workflow_id)
    with workflow_context.workflow_step_context(context):
        # checkpoint the workflow
        ws = WorkflowStorage(workflow_id)
        ws.save_workflow_user_metadata(metadata)
        job_id = ray.get_runtime_context().job_id.hex()

        try:
            ws.get_entrypoint_step_id()
            wf_exists = True
        except Exception:
            # The workflow does not exist. We must checkpoint the entry
            # workflow.
            ws.save_workflow_execution_state("", state)
            wf_exists = False

        workflow_manager = workflow_access.get_management_actor()
        if ray.get(
                workflow_manager.is_workflow_non_terminating.remote(
                    workflow_id)):
            raise RuntimeError(
                f"Workflow '{workflow_id}' is already running or pending.")
        if wf_exists:
            return resume_async(workflow_id)
        ignore_existing = ws.load_workflow_status() == WorkflowStatus.NONE
        ray.get(
            workflow_manager.submit_workflow.remote(
                workflow_id, state, ignore_existing=ignore_existing))
        return workflow_manager.execute_workflow.remote(job_id, context)
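# Hypothetical usage sketch for the public `run_async` entry point, assuming
# Ray's DAG API (`.bind()` on a @ray.remote task); the task and workflow id
# below are illustrative only.
import ray
from ray import workflow

@ray.remote
def add(a: int, b: int) -> int:
    return a + b

ref = workflow.run_async(add.bind(1, 2), workflow_id="add_example")
assert ray.get(ref) == 3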