Example #1
    def step_id(self) -> StepID:
        if self._step_id is not None:
            return self._step_id

        from ray.experimental.workflow.workflow_access import \
            get_or_create_management_actor
        mgr = get_or_create_management_actor()
        self._step_id = ray.get(
            mgr.gen_step_id.remote(self._workflow_id, self._name))
        return self._step_id
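The accessor above lazily asks the workflow management actor to generate a step ID and caches it locally, so repeated reads avoid the remote round trip. Below is a minimal sketch of the same memoization pattern; the generate_id callable is a hypothetical stand-in for the mgr.gen_step_id.remote(...) call and is not part of the original code.

class LazyStepId:
    def __init__(self, generate_id):
        # 'generate_id' is a hypothetical stand-in for the remote
        # mgr.gen_step_id call used in the example above.
        self._generate_id = generate_id
        self._step_id = None

    @property
    def step_id(self) -> str:
        # Pay the (possibly remote) generation cost only once;
        # reuse the cached value afterwards.
        if self._step_id is None:
            self._step_id = self._generate_id()
        return self._step_id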
Example #2
 def _create(self, args: Tuple[Any], kwargs: Dict[str, Any]):
     workflow_storage = WorkflowStorage(self._actor_id, self._storage)
     workflow_storage.save_actor_class_body(self._metadata.cls)
     # TODO(suquark): This is just a temporary solution.
     # A virtual actor writer should take the place of this solution later.
     arg_list = self._metadata.flatten_args("__init__", args, kwargs)
     init_step = _virtual_actor_init.step(self._metadata.cls, arg_list)
     init_step._step_id = self._metadata.cls.__init__.__name__
     ref = init_step.run_async(workflow_id=self._actor_id)
     workflow_manager = get_or_create_management_actor()
     # keep the ref in a list to prevent dereference
     ray.get(workflow_manager.init_actor.remote(self._actor_id, [ref]))
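The comment "keep the ref in a list to prevent dereference" relies on a Ray behavior: an ObjectRef passed directly as a task or actor-method argument is resolved to its value before the callee runs, while a ref nested inside a container is delivered as a reference the callee can hold on to. A self-contained sketch of that pattern with plain Ray actors (not the virtual actor machinery above):

import ray

ray.init()

@ray.remote
class RefHolder:
    def __init__(self):
        self._refs = []

    def hold(self, refs):
        # 'refs' arrives as a list of ObjectRefs because they were wrapped
        # in a list; a bare ObjectRef argument would have been resolved to
        # its value before this method ran.
        self._refs.extend(refs)

@ray.remote
def produce():
    return "workflow output"

ref = produce.remote()
holder = RefHolder.remote()
# Wrap the ref in a list so the actor receives and keeps the reference itself.
ray.get(holder.hold.remote([ref]))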
Example #3
def resume(workflow_id: str) -> ray.ObjectRef:
    """Resume a workflow asynchronously. See "api.resume()" for details.
    """
    storage = get_global_storage()
    logger.info(f"Resuming workflow [id=\"{workflow_id}\", storage_url="
                f"\"{storage.storage_url}\"].")
    workflow_manager = get_or_create_management_actor()
    # NOTE: It is important to 'ray.get' the returned output. This
    # ensures the caller of 'resume()' holds the reference to the workflow
    # result. Otherwise if the actor removes the reference of the
    # workflow output, the caller may fail to resolve the result.
    output = ray.get(workflow_manager.run_or_resume.remote(workflow_id))
    direct_output = flatten_workflow_output(workflow_id, output)
    logger.info(f"Workflow job {workflow_id} resumed.")
    return direct_output
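A hedged usage sketch for the resume() function above; the workflow ID is a placeholder for a previously checkpointed run, and the global workflow storage is assumed to be configured already.

import ray

ray.init()
# "my_previous_workflow_id" is a placeholder for an existing, checkpointed run.
output_ref = resume("my_previous_workflow_id")
result = ray.get(output_ref)  # block until the resumed workflow finishes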
Example #4
def run(entry_workflow: Workflow,
        workflow_id: Optional[str] = None) -> ray.ObjectRef:
    """Run a workflow asynchronously.
    """
    store = get_global_storage()
    assert ray.is_initialized()
    if workflow_id is None:
        # Workflow ID format: {random UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    logger.info(f"Workflow job created. [id=\"{workflow_id}\", storage_url="
                f"\"{store.storage_url}\"].")

    with workflow_context.workflow_step_context(workflow_id,
                                                store.storage_url):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)

        wf_exists = True
        try:
            ws.get_entrypoint_step_id()
        except Exception:
            wf_exists = False

        # We only commit the entry step when:
        #  - it is a virtual actor task (these are dynamic, so always commit), or
        #  - the workflow is new.
        # TODO (yic): follow up with force rerun
        if entry_workflow.data.step_type != StepType.FUNCTION or not wf_exists:
            commit_step(ws, "", entry_workflow, None)
        workflow_manager = get_or_create_management_actor()
        ignore_existing = (entry_workflow.data.step_type != StepType.FUNCTION)
        # NOTE: It is important to 'ray.get' the returned output. This
        # ensures caller of 'run()' holds the reference to the workflow
        # result. Otherwise if the actor removes the reference of the
        # workflow output, the caller may fail to resolve the result.
        result: "WorkflowExecutionResult" = ray.get(
            workflow_manager.run_or_resume.remote(workflow_id,
                                                  ignore_existing))
        if entry_workflow.data.step_type == StepType.FUNCTION:
            return flatten_workflow_output(workflow_id,
                                           result.persisted_output)
        else:
            return flatten_workflow_output(workflow_id, result.volatile_output)
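A hedged usage sketch for run(): it assumes that @workflow.step and .step() from ray.experimental.workflow build the Workflow object run() expects, and that global workflow storage has already been configured.

import ray
from ray.experimental import workflow

# Assumption: '@workflow.step' and '.step()' produce the Workflow object
# that run() (defined above) takes as 'entry_workflow'.
@workflow.step
def hello(name: str) -> str:
    return f"hello, {name}"

ray.init()
entry = hello.step("workflows")  # build the Workflow DAG object
output_ref = run(entry, workflow_id="hello_example")
print(ray.get(output_ref))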
Example #5
def _resolve_dynamic_workflow_refs(workflow_refs: "List[WorkflowRef]"):
    """Get the output of a workflow step with the step ID at runtime.

    We look up the output in the following order:
    1. Query the cached step output in the workflow manager and fetch the
       physical output object.
    2. If fetching the physical output object fails, check the storage to
       see whether the output is checkpointed, and load the checkpoint.
    3. If loading the checkpoint fails, resume the step and get the output.
    """
    workflow_manager = get_or_create_management_actor()
    context = workflow_context.get_workflow_step_context()
    workflow_id = context.workflow_id
    storage_url = context.storage_url
    workflow_ref_mapping = []
    for workflow_ref in workflow_refs:
        step_ref = ray.get(
            workflow_manager.get_cached_step_output.remote(
                workflow_id, workflow_ref.step_id))
        get_cached_step = False
        if step_ref is not None:
            try:
                output, _ = _resolve_object_ref(step_ref)
                get_cached_step = True
            except Exception:
                get_cached_step = False
        if not get_cached_step:
            wf_store = workflow_storage.get_workflow_storage()
            try:
                output = wf_store.load_step_output(workflow_ref.step_id)
            except DataLoadError:
                current_step_id = workflow_context.get_current_step_id()
                logger.warning("Failed to get the output of step "
                               f"{workflow_ref.step_id}. Trying to resume it. "
                               f"Current step: '{current_step_id}'")
                step_ref = recovery.resume_workflow_step(
                    workflow_id, workflow_ref.step_id,
                    storage_url).persisted_output
                output, _ = _resolve_object_ref(step_ref)
        workflow_ref_mapping.append(output)
    return workflow_ref_mapping
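The docstring's three-tier lookup (cached output, then checkpoint, then re-execution) can be summarized in a compact sketch; fetch_cached, load_checkpoint, and resume_step are hypothetical stand-ins for the manager, storage, and recovery calls used above.

import ray

def resolve_output(step_id, fetch_cached, load_checkpoint, resume_step):
    ref = fetch_cached(step_id)           # 1. cached output held by the manager
    if ref is not None:
        try:
            return ray.get(ref)           # fetch the physical object
        except Exception:
            pass                          # object lost; fall back to storage
    try:
        return load_checkpoint(step_id)   # 2. checkpointed output in storage
    except Exception:
        pass
    return ray.get(resume_step(step_id))  # 3. resume the step and wait for it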
Example #6
def run(entry_workflow: Workflow,
        workflow_id: Optional[str] = None) -> ray.ObjectRef:
    """Run a workflow asynchronously. See "api.run()" for details."""
    store = get_global_storage()
    assert ray.is_initialized()
    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{entry_workflow.id}.{time.time():.9f}"
    logger.info(f"Workflow job created. [id=\"{workflow_id}\", storage_url="
                f"\"{store.storage_url}\"].")

    # checkpoint the workflow
    ws = workflow_storage.WorkflowStorage(workflow_id, store)
    commit_step(ws, "", entry_workflow)
    workflow_manager = get_or_create_management_actor()
    # NOTE: It is important to 'ray.get' the returned output. This
    # ensures caller of 'run()' holds the reference to the workflow
    # result. Otherwise if the actor removes the reference of the
    # workflow output, the caller may fail to resolve the result.
    output = ray.get(workflow_manager.run_or_resume.remote(workflow_id))
    return flatten_workflow_output(workflow_id, output)
Example #7
 def ready(self) -> "ObjectRef":
     """Return a future. If 'ray.get()' it successfully, then the actor
     is fully initialized."""
     # TODO(suquark): should ray.get(xxx.ready()) always be true?
     workflow_manager = get_or_create_management_actor()
     return ray.get(workflow_manager.actor_ready.remote(self._actor_id))
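The ready() future follows a common Ray idiom: expose a trivial method whose completion proves __init__ has finished. A self-contained sketch of that idiom with a plain Ray actor (not the virtual actor machinery itself):

import ray
import time

ray.init()

@ray.remote
class SlowInit:
    def __init__(self):
        time.sleep(1)  # stands in for expensive initialization

    def ready(self) -> bool:
        # Any method call only runs after __init__ completes, so getting
        # this result confirms the actor is fully initialized.
        return True

actor = SlowInit.remote()
ray.get(actor.ready.remote())  # blocks until the actor is ready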