def resume_workflow_job(workflow_id: str,
                        store: storage.Storage) -> ray.ObjectRef:
    """Resume a workflow job from its stored state.

    The entrypoint step ID is read from storage and handed to
    ``_construct_resume_workflow_from_step``. When that yields a
    ``Workflow``, it is executed under a freshly initialized workflow step
    context, which is cleared again afterwards. Any other result is passed
    to ``load_step_output`` and the loaded value is placed in the Ray object
    store.

    Args:
        workflow_id: The ID of the workflow job. The ID is used to identify
            the workflow.
        store: The storage to access the workflow.

    Raises:
        WorkflowNotResumableError: Failed to resume the workflow. The
            underlying exception is chained as the cause.

    Returns:
        The execution result of the workflow, represented by Ray ObjectRef.
    """
    wf_storage = workflow_storage.WorkflowStorage(workflow_id, store)
    try:
        entrypoint_step_id: StepID = wf_storage.get_entrypoint_step_id()
        resumed = _construct_resume_workflow_from_step(wf_storage,
                                                       entrypoint_step_id)
    except Exception as err:
        # Any failure while rebuilding the workflow is reported uniformly.
        raise WorkflowNotResumableError(workflow_id) from err

    if not isinstance(resumed, Workflow):
        # Not a workflow to execute (presumably the ID of a step whose
        # output is already stored -- confirm against the helper); load the
        # output and wrap it in an ObjectRef.
        return ray.put(wf_storage.load_step_output(resumed))

    try:
        workflow_context.init_workflow_step_context(workflow_id,
                                                    store.storage_url)
        return execute_workflow(resumed)
    finally:
        # Always clear the step context, even if execution raised.
        workflow_context.set_workflow_step_context(None)
def run(entry_workflow: Workflow, workflow_root_dir=None,
        workflow_id=None) -> ray.ObjectRef:
    """Run a workflow asynchronously.

    Args:
        entry_workflow: The workflow to run.
        workflow_root_dir: The path of an external storage used for
            checkpointing.
        workflow_id: The ID of the workflow. The ID is used to identify
            the workflow. When omitted, an ID of the form
            ``{UUID}.{Unix time}`` is generated.

    Returns:
        The execution result of the workflow, represented by Ray ObjectRef.
    """
    if workflow_id is None:
        # TODO(suquark): include the name of the workflow in the default ID,
        # this makes the ID more readable.
        # Workflow ID format: {UUID}.{Unix time to nanoseconds}
        workflow_id = f"{uuid.uuid4().hex}.{time.time():.9f}"
    logger.info(f"Workflow job {workflow_id} created.")

    try:
        workflow_context.init_workflow_step_context(workflow_id,
                                                    workflow_root_dir)
        pending_ref = entry_workflow.execute()
        logger.info(f"Workflow job {workflow_id} started.")
        # TODO(suquark): although we do not return the resolved object to
        # user, the object was resolved temporarily to the driver script.
        # We may need a helper step for storing the resolved object
        # instead later.
        # Only the second element of the resolved result is handed back.
        return resolve_object_ref(pending_ref)[1]
    finally:
        # Runs before the return above completes; clears the step context
        # on both the success and the failure path.
        workflow_context.set_workflow_step_context(None)
def run(entry_workflow: Workflow,
        storage: Optional[Union[str, Storage]] = None,
        workflow_id: Optional[str] = None) -> ray.ObjectRef:
    """Run a workflow asynchronously. See "api.run()" for details.

    Args:
        entry_workflow: The workflow to run.
        storage: ``None`` to keep the current global storage, a string that
            is passed to ``create_storage``, or a ``Storage`` instance. A
            non-``None`` value replaces the global storage.
        workflow_id: The ID of the workflow. When omitted, it is built from
            the entry workflow's ``id`` and the current Unix time.

    Raises:
        TypeError: ``storage`` is not ``None``, ``str`` or ``Storage``.

    Returns:
        The flattened output of the workflow run.
    """
    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{entry_workflow.id}.{time.time():.9f}"

    if storage is not None:
        if isinstance(storage, str):
            backend = create_storage(storage)
        elif isinstance(storage, Storage):
            backend = storage
        else:
            raise TypeError("'storage' should be None, str, or Storage type.")
        set_global_storage(backend)

    storage_url = get_global_storage().storage_url
    logger.info(f"Workflow job created. [id=\"{workflow_id}\", storage_url="
                f"\"{storage_url}\"].")

    try:
        workflow_context.init_workflow_step_context(workflow_id, storage_url)
        commit_step(entry_workflow)
        try:
            manager = ray.get_actor(MANAGEMENT_ACTOR_NAME)
        except ValueError:
            # the actor does not exist
            manager = WorkflowManagementActor.options(
                name=MANAGEMENT_ACTOR_NAME, lifetime="detached").remote()
        # NOTE: It is important to 'ray.get' the returned output. This
        # ensures caller of 'run()' holds the reference to the workflow
        # result. Otherwise if the actor removes the reference of the
        # workflow output, the caller may fail to resolve the result.
        pending = manager.run_or_resume.remote(workflow_id, storage_url)
        return flatten_workflow_output(workflow_id, ray.get(pending))
    finally:
        # Clear the step context whether or not the run succeeded.
        workflow_context.set_workflow_step_context(None)
def run(entry_workflow: Workflow,
        storage: Optional[Union[str, Storage]] = None,
        workflow_id: Optional[str] = None) -> ray.ObjectRef:
    """Run a workflow asynchronously. See "api.run()" for details.

    The entry workflow is committed via ``commit_step`` and then started
    through ``recovery.resume_workflow_job`` against the global storage.

    Args:
        entry_workflow: The workflow to run.
        storage: ``None`` to keep the current global storage, a string that
            is passed to ``create_storage``, or a ``Storage`` instance. A
            non-``None`` value replaces the global storage.
        workflow_id: The ID of the workflow. When omitted, it is built from
            the entry workflow's ``id`` and the current Unix time.

    Raises:
        TypeError: ``storage`` is not ``None``, ``str`` or ``Storage``.

    Returns:
        Whatever ``recovery.resume_workflow_job`` returns for this workflow.
    """
    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{entry_workflow.id}.{time.time():.9f}"

    if storage is not None:
        if isinstance(storage, str):
            backend = create_storage(storage)
        elif isinstance(storage, Storage):
            backend = storage
        else:
            raise TypeError("'storage' should be None, str, or Storage type.")
        set_global_storage(backend)

    storage_url = get_global_storage().storage_url
    logger.info(f"Workflow job created. [id=\"{workflow_id}\", storage_url="
                f"\"{storage_url}\"].")

    try:
        workflow_context.init_workflow_step_context(workflow_id, storage_url)
        commit_step(entry_workflow)
        # TODO(suquark): Move this to a detached named actor,
        # so the workflow shares fate with the actor.
        # The current plan is resuming the workflow on the detached named
        # actor. This is extremely simple to implement, but I am not sure
        # of its performance.
        result = recovery.resume_workflow_job(workflow_id,
                                              get_global_storage())
        logger.info(f"Workflow job {workflow_id} started.")
        return result
    finally:
        # Clear the step context whether or not the run succeeded.
        workflow_context.set_workflow_step_context(None)
def _func(context, task_id, args, kwargs):
    """Execute the wrapped callable as a workflow step.

    Installs ``context`` as the workflow step context, appends ``task_id``
    to the current scope, resolves the step inputs and calls ``func`` with
    them. ``func`` is not defined in this block; presumably it is the
    user function captured from an enclosing scope -- confirm.

    Returns:
        The result of ``func``, or the result of calling ``execute()`` on it
        when ``func`` returned a ``Workflow``.
    """
    # NOTE: the context must be installed through 'workflow_context' to
    # ensure that we are accessing the correct global variable.
    workflow_context.set_workflow_step_context(context)
    workflow_context.get_scope().append(task_id)

    # 'args' and 'kwargs' are rebound on purpose so the unresolved inputs
    # are no longer referenced from this frame.
    args, kwargs, consumed_refs = resolve_step_inputs(args, kwargs)
    # free references to potentially save memory
    del consumed_refs

    result = func(*args, **kwargs)
    if isinstance(result, Workflow):
        # A step that returns another workflow: run it and return its
        # result instead.
        return result.execute()
    return result