Example 1
def run(entry_workflow: Workflow,
        storage: Optional[Union[str, Storage]] = None,
        workflow_id: Optional[str] = None) -> ray.ObjectRef:
    """Run a workflow asynchronously. See "api.run()" for details."""
    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{entry_workflow.id}.{time.time():.9f}"
    if isinstance(storage, str):
        set_global_storage(create_storage(storage))
    elif isinstance(storage, Storage):
        set_global_storage(storage)
    elif storage is not None:
        raise TypeError("'storage' should be None, str, or Storage type.")
    storage_url = get_global_storage().storage_url
    logger.info(f"Workflow job created. [id=\"{workflow_id}\", storage_url="
                f"\"{storage_url}\"].")
    try:
        workflow_context.init_workflow_step_context(workflow_id, storage_url)
        commit_step(entry_workflow)
        try:
            actor = ray.get_actor(MANAGEMENT_ACTOR_NAME)
        except ValueError:
            # the actor does not exist
            actor = WorkflowManagementActor.options(
                name=MANAGEMENT_ACTOR_NAME, lifetime="detached").remote()
        # NOTE: It is important to 'ray.get' the returned output. This
        # ensures that the caller of 'run()' holds a reference to the
        # workflow result. Otherwise, if the actor drops its reference to
        # the workflow output, the caller may fail to resolve the result.
        output = ray.get(actor.run_or_resume.remote(workflow_id, storage_url))
        direct_output = flatten_workflow_output(workflow_id, output)
    finally:
        workflow_context.set_workflow_step_context(None)
    return direct_output
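A minimal usage sketch for the run() helper above, assuming entry_wf is a Workflow object built elsewhere (hypothetical name); the storage URL and workflow ID are illustrative values only:

import ray

ray.init()
# 'run()' returns an ObjectRef; resolving it yields the workflow result.
output_ref = run(entry_wf,
                 storage="file:///tmp/ray/workflow_data",
                 workflow_id="demo_workflow.0")
result = ray.get(output_ref)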
Example 2
def resume_workflow_job(workflow_id: str, store_url: str) -> ray.ObjectRef:
    """Resume a workflow job.

    Args:
        workflow_id: The ID of the workflow job. The ID is used to identify
            the workflow.
        store_url: The URL of the storage used to access the workflow.

    Raises:
        WorkflowNotResumableError: Raised when the workflow cannot be resumed.

    Returns:
        The execution result of the workflow, represented by Ray ObjectRef.
    """
    try:
        store = storage.create_storage(store_url)
        wf_store = workflow_storage.WorkflowStorage(workflow_id, store)
        entrypoint_step_id: StepID = wf_store.get_entrypoint_step_id()
        r = _construct_resume_workflow_from_step(wf_store, entrypoint_step_id)
    except Exception as e:
        raise WorkflowNotResumableError(workflow_id) from e

    if isinstance(r, Workflow):
        with workflow_context.workflow_step_context(workflow_id,
                                                    store.storage_url):
            from ray.experimental.workflow.step_executor import (
                execute_workflow)
            return execute_workflow(r)
    return wf_store.load_step_output(r)
Example 3
    def actor_ready(self, actor_id: str, storage_url: str) -> ray.ObjectRef:
        """Check if a workflow virtual actor is fully initialized.

        Args:
            actor_id: The ID of a workflow virtual actor.
            storage_url: A string that represents the storage.

        Returns:
            A future object that represents the state of the actor.
            A successful "ray.get" on the object indicates that the actor
            was initialized successfully.
        """
        store = storage.create_storage(storage_url)
        ws = workflow_storage.WorkflowStorage(actor_id, store)
        try:
            step_id = ws.get_entrypoint_step_id()
            output_exists = ws.inspect_step(step_id).output_object_valid
            if output_exists:
                return ray.put(None)
        except Exception:
            pass
        if actor_id not in self._actor_initialized:
            raise ValueError(f"Actor '{actor_id}' has not been created, or "
                             "it has failed before initialization.")
        return self._actor_initialized[actor_id]
Example 4
def resume(workflow_id: str,
           storage: Optional[Union[str, Storage]] = None) -> ray.ObjectRef:
    """Resume a workflow asynchronously. See "api.resume()" for details.
    """
    if isinstance(storage, str):
        store = create_storage(storage)
    elif isinstance(storage, Storage):
        store = storage
    elif storage is None:
        store = get_global_storage()
    else:
        raise TypeError("'storage' should be None, str, or Storage type.")
    logger.info(f"Resuming workflow [id=\"{workflow_id}\", storage_url="
                f"\"{store.storage_url}\"].")
    try:
        actor = ray.get_actor(MANAGEMENT_ACTOR_NAME)
    except ValueError:
        # the actor does not exist
        actor = WorkflowManagementActor.options(
            name=MANAGEMENT_ACTOR_NAME, lifetime="detached").remote()
    # NOTE: It is important to 'ray.get' the returned output. This
    # ensures that the caller of 'resume()' holds a reference to the
    # workflow result. Otherwise, if the actor drops its reference to
    # the workflow output, the caller may fail to resolve the result.
    output = ray.get(
        actor.run_or_resume.remote(workflow_id, store.storage_url))
    direct_output = flatten_workflow_output(workflow_id, output)
    logger.info(f"Workflow job {workflow_id} resumed.")
    return direct_output
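A matching usage sketch for resume() above; the workflow ID and storage URL are illustrative and must refer to a previously committed workflow:

import ray

# Resolves once the resumed workflow finishes.
output_ref = resume("demo_workflow.0",
                    storage="file:///tmp/ray/workflow_data")
result = ray.get(output_ref)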
Example 5
def _resume_workflow_step_executor(
        workflow_id: str, step_id: "StepID", store_url: str,
        current_output: List[ray.ObjectRef]
) -> Tuple[ray.ObjectRef, ray.ObjectRef]:
    # TODO (yic): We need better dependency management for virtual actors.
    # The current output is always empty for a normal workflow. For a
    # virtual actor, a non-empty output means the previous job is still
    # running, which is a really bad situation.
    for ref in current_output:
        try:
            while isinstance(ref, ray.ObjectRef):
                ref = ray.get(ref)
        except Exception:
            pass
    try:
        store = storage.create_storage(store_url)
        wf_store = workflow_storage.WorkflowStorage(workflow_id, store)
        r = _construct_resume_workflow_from_step(wf_store, step_id)
    except Exception as e:
        raise WorkflowNotResumableError(workflow_id) from e

    if isinstance(r, Workflow):
        with workflow_context.workflow_step_context(workflow_id,
                                                    store.storage_url):
            from ray.experimental.workflow.step_executor import (
                execute_workflow)
            result = execute_workflow(r, last_step_of_workflow=True)
            return result.persisted_output, result.volatile_output
    assert isinstance(r, StepID)
    return wf_store.load_step_output(r), None
Example 6
def run(entry_workflow: Workflow,
        storage: Optional[Union[str, Storage]] = None,
        workflow_id: Optional[str] = None) -> ray.ObjectRef:
    """Run a workflow asynchronously. See "api.run()" for details."""
    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{entry_workflow.id}.{time.time():.9f}"
    if isinstance(storage, str):
        set_global_storage(create_storage(storage))
    elif isinstance(storage, Storage):
        set_global_storage(storage)
    elif storage is not None:
        raise TypeError("'storage' should be None, str, or Storage type.")
    storage_url = get_global_storage().storage_url
    logger.info(f"Workflow job created. [id=\"{workflow_id}\", storage_url="
                f"\"{storage_url}\"].")
    try:
        workflow_context.init_workflow_step_context(workflow_id, storage_url)
        commit_step(entry_workflow)
        # TODO(suquark): Move this to a detached named actor,
        # so the workflow shares fate with the actor.
        # The current plan is to resume the workflow on the detached named
        # actor. This is extremely simple to implement, but I am not sure
        # about its performance.
        output = recovery.resume_workflow_job(workflow_id,
                                              get_global_storage())
        logger.info(f"Workflow job {workflow_id} started.")
    finally:
        workflow_context.set_workflow_step_context(None)
    return output
Example 7
def _get_storage(storage: Optional[Union[str, Storage]]) -> Storage:
    if storage is None:
        return get_global_storage()
    elif isinstance(storage, str):
        return create_storage(storage)
    elif isinstance(storage, Storage):
        return storage
    else:
        raise TypeError("'storage' should be None, str, or Storage type.")
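An illustration of the three argument forms _get_storage() accepts; the URL below is only an example, and any other type raises TypeError:

s_default = _get_storage(None)                              # global storage
s_from_url = _get_storage("file:///tmp/ray/workflow_data")  # URL string
s_passthrough = _get_storage(s_from_url)                    # Storage instance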
Example 8
def update_workflow_step_context(context: Optional[WorkflowStepContext],
                                 step_id: str):
    global _context
    _context = context
    _context.workflow_scope.append(step_id)
    # avoid cyclic import
    from ray.experimental.workflow import storage
    # TODO(suquark): [optimization] if the original storage has the same URL,
    # skip creating the new one
    storage.set_global_storage(storage.create_storage(context.storage_url))
Example 9
def s3_storage(aws_credentials, s3_server):
    with mock_s3():
        client = boto3.client("s3",
                              region_name="us-west-2",
                              endpoint_url=s3_server)
        client.create_bucket(Bucket="test_bucket")
        url = ("s3://test_bucket/workflow"
               f"?region_name=us-west-2&endpoint_url={s3_server}")
        storage.set_global_storage(storage.create_storage(url))
        yield storage.get_global_storage()
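A sketch of a test consuming the fixture above; the test name and assertions are illustrative and only check that the mocked S3 storage became the global storage:

def test_s3_storage_is_global(s3_storage):
    # The fixture yields the storage it registered globally.
    assert s3_storage.storage_url.startswith("s3://test_bucket/workflow")
    assert storage.get_global_storage() is s3_storage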
Example 10
def get_actor(
        actor_id: str,
        storage: "Optional[Union[str, Storage]]" = None) -> "VirtualActor":
    """Get an virtual actor.

    Args:
        actor_id: The ID of the actor.
        storage: The storage of the actor.

    Returns:
        A virtual actor.
    """
    if storage is None:
        storage = storage_base.get_global_storage()
    elif isinstance(storage, str):
        storage = storage_base.create_storage(storage)
    return virtual_actor_class.get_actor(actor_id, storage)
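A brief call sketch for get_actor() above; the actor ID and storage URL are illustrative:

# With storage=None the global storage is used.
actor = get_actor("demo_actor")

# Or against an explicit storage URL.
actor = get_actor("demo_actor", storage="file:///tmp/ray/workflow_data")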
Example 11
def resume(workflow_id: str,
           storage: Optional[Union[str, Storage]] = None) -> ray.ObjectRef:
    """Resume a workflow asynchronously. See "api.resume()" for details.
    """
    if isinstance(storage, str):
        store = create_storage(storage)
    elif isinstance(storage, Storage):
        store = storage
    elif storage is None:
        store = get_global_storage()
    else:
        raise TypeError("'storage' should be None, str, or Storage type.")
    logger.info(f"Resuming workflow [id=\"{workflow_id}\", storage_url="
                f"\"{store.storage_url}\"].")
    output = recovery.resume_workflow_job(workflow_id, store)
    logger.info(f"Workflow job {workflow_id} resumed.")
    return output
Example 12
def _resume_workflow_step_executor(
        workflow_id: str, step_id: "StepID",
        store_url: str) -> Tuple[ray.ObjectRef, ray.ObjectRef]:
    try:
        store = storage.create_storage(store_url)
        wf_store = workflow_storage.WorkflowStorage(workflow_id, store)
        r = _construct_resume_workflow_from_step(wf_store, step_id)
    except Exception as e:
        raise WorkflowNotResumableError(workflow_id) from e

    if isinstance(r, Workflow):
        with workflow_context.workflow_step_context(workflow_id,
                                                    store.storage_url):
            from ray.experimental.workflow.step_executor import (
                execute_workflow)
            result = execute_workflow(r, last_step_of_workflow=True)
            return result.persisted_output, result.volatile_output
    return wf_store.load_step_output(r), None
Example 13
    def run_or_resume(self, workflow_id: str,
                      storage_url: str) -> ray.ObjectRef:
        """Run or resume a workflow.

        Args:
            workflow_id: The ID of the workflow.
            storage_url: A string that represents the storage.

        Returns:
            An object reference that can be used to retrieve the
            workflow result.
        """
        if workflow_id in self._workflow_outputs:
            raise ValueError(f"The output of workflow[id={workflow_id}] "
                             "already exists.")
        store = storage.create_storage(storage_url)
        output = recovery.resume_workflow_job(workflow_id, store)
        self._workflow_outputs[workflow_id] = output
        logger.info(f"Workflow job [id={workflow_id}] started.")
        return output
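run_or_resume() is a method of the management actor and is invoked remotely, as Example 1 shows; a condensed sketch, assuming workflow_id and storage_url are already defined:

actor = ray.get_actor(MANAGEMENT_ACTOR_NAME)
# The remote call returns an ObjectRef; 'ray.get' resolves it to the
# reference holding the workflow output, mirroring Example 1.
output = ray.get(actor.run_or_resume.remote(workflow_id, storage_url))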
Example 14
def init(storage: "Optional[Union[str, Storage]]" = None) -> None:
    """Initialize workflow.

    Args:
        storage: The external storage URL or a custom storage class. If not
            specified, ``/tmp/ray/workflow_data`` will be used.
    """
    if storage is None:
        storage = os.environ.get("RAY_WORKFLOW_STORAGE")

    if storage is None:
        # We should use get_temp_dir_path, but it is not available for the
        # Ray client. We need a flag telling whether this is a client or a
        # driver so the right directory can be picked.
        # For now, just use /tmp/ray/workflow_data.
        logger.warning("Using default local dir: `/tmp/ray/workflow_data`. "
                       "This should only be used for testing purposes.")
        storage = "file:///tmp/ray/workflow_data"
    if isinstance(storage, str):
        storage = storage_base.create_storage(storage)
    elif not isinstance(storage, Storage):
        raise TypeError("'storage' should be None, str, or Storage type.")

    try:
        _storage = storage_base.get_global_storage()
    except RuntimeError:
        pass
    else:
        # Use the 'else' branch because the RuntimeError raised below
        # should not be caught by the 'except' above.
        if _storage.storage_url == storage.storage_url:
            logger.warning("Calling 'workflow.init()' again with the same "
                           "storage.")
        else:
            raise RuntimeError("Calling 'workflow.init()' again with a "
                               "different storage.")
    storage_base.set_global_storage(storage)
    workflow_access.init_management_actor()
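A minimal call sketch for init() above (the bucket name is illustrative); with no argument it falls back to RAY_WORKFLOW_STORAGE or /tmp/ray/workflow_data, and calling it again with a different URL raises RuntimeError:

# Explicit external storage; only the first init() per process may choose it.
init("s3://my-bucket/workflow_data")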
Example 15
def resume(workflow_id: str, workflow_root_dir=None) -> ray.ObjectRef:
    """
    Resume a workflow asynchronously. The workflow may have failed previously.

    Args:
        workflow_id: The ID of the workflow. The ID is used to identify
            the workflow.
        workflow_root_dir: The path of an external storage used for
            checkpointing.

    Returns:
        The execution result of the workflow, represented by Ray ObjectRef.
    """
    assert ray.is_initialized()
    if workflow_root_dir is not None:
        store = storage.create_storage(workflow_root_dir)
    else:
        store = storage.get_global_storage()
    r = recovery.resume_workflow_job(workflow_id, store)
    if isinstance(r, ray.ObjectRef):
        return r
    # skip saving the DAG of a recovery workflow
    r.skip_saving_workflow_dag = True
    return run(r, workflow_root_dir, workflow_id)
Example 16
def filesystem_storage(tmp_path):
    storage.set_global_storage(
        storage.create_storage(f"{str(tmp_path)}/workflow_data"))
    yield storage.get_global_storage()