def test_nested_workflow_no_download(workflow_start_regular):
    """Test that we _only_ load from storage on recovery. For a nested
    workflow step, we should checkpoint the input/output, but continue to
    reuse the in-memory value.
    """

    @ray.remote
    def recursive(ref, count):
        if count == 0:
            return ref
        return workflow.continuation(recursive.bind(ref, count - 1))

    with tempfile.TemporaryDirectory() as temp_dir:
        debug_store = DebugStorage(get_global_storage(), temp_dir)
        utils._alter_storage(debug_store)

        ref = ray.put("hello")
        result = workflow.create(recursive.bind([ref], 10)).run()

        ops = debug_store._logged_storage.get_op_counter()
        get_objects_count = 0
        for key in ops["get"]:
            if "objects" in key:
                get_objects_count += 1
        assert get_objects_count == 1, "We should only get once when resuming."
        put_objects_count = 0
        for key in ops["put"]:
            if "objects" in key:
                print(key)
                put_objects_count += 1
        assert (put_objects_count == 1
                ), "We should detect the object exists before uploading"
        assert ray.get(result) == ["hello"]
def step(method_name, method, *args, **kwargs):
    readonly = getattr(method, "__virtual_actor_readonly__", False)
    flattened_args = self.flatten_args(method_name, args, kwargs)
    actor_id = workflow_context.get_current_workflow_id()
    if not readonly:
        if method_name == "__init__":
            state_ref = None
        else:
            ws = WorkflowStorage(actor_id, get_global_storage())
            state_ref = WorkflowRef(ws.get_entrypoint_step_id())
        # This is a hack to insert a positional argument.
        flattened_args = [signature.DUMMY_TYPE, state_ref] + flattened_args
    workflow_inputs = serialization_context.make_workflow_inputs(
        flattened_args)

    if readonly:
        _actor_method = _wrap_readonly_actor_method(actor_id, self.cls,
                                                    method_name)
        step_type = StepType.READONLY_ACTOR_METHOD
    else:
        _actor_method = _wrap_actor_method(self.cls, method_name)
        step_type = StepType.ACTOR_METHOD
    # TODO(suquark): Support actor options.
    workflow_data = WorkflowData(
        func_body=_actor_method,
        step_type=step_type,
        inputs=workflow_inputs,
        max_retries=1,
        catch_exceptions=False,
        ray_options={},
        name=None,
    )
    wf = Workflow(workflow_data)
    return wf
def test_wait_recovery_step_id(workflow_start_regular_shared):
    # This test ensures that the workflow reuses the original directory and
    # step id for "workflow.wait" during recovery.

    @workflow.step
    def identity(x: int):
        # block the step by a global mark
        assert utils.check_global_mark()
        return x

    w = workflow.wait([identity.step(42)], num_returns=1, timeout=None)
    utils.unset_global_mark()
    with pytest.raises(RaySystemError):
        _ = w.run(workflow_id="test_wait_recovery_step_id")
    utils.set_global_mark()
    ready, unready = ray.get(workflow.resume("test_wait_recovery_step_id"))
    assert ready == [42]

    from ray.workflow import storage, workflow_storage
    global_storage = storage.get_global_storage()
    wf_storage = workflow_storage.WorkflowStorage(
        "test_wait_recovery_step_id", global_storage)
    index = wf_storage.gen_step_id("workflow.wait")
    # no new step id should have been generated during recovery
    assert index <= 1
def test_crash_after_commit(workflow_start_regular_shared):
    """Ensure that we don't re-call poll_for_event after
    `event_checkpointed` returns, even after a crash. Here we must call
    `event_checkpointed` twice, because there's no way to know if we called
    it after checkpointing.
    """
    _storage = storage.get_global_storage()

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            else:
                utils.set_global_mark("first")
                await asyncio.sleep(1000000)

    event_promise = workflow.wait_for_event(MyEventListener)
    event_promise.run_async("workflow")

    while not utils.check_global_mark("first"):
        time.sleep(0.1)

    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    ray.init(num_cpus=4)
    workflow.init(storage=_storage)
    workflow.resume("workflow")

    ray.get(workflow.get_output("workflow"))
    assert utils.check_global_mark("second")
def test_failure_with_storage(workflow_start_regular):
    with tempfile.TemporaryDirectory() as temp_dir:
        debug_store = DebugStorage(get_global_storage(), temp_dir)
        _alter_storage(debug_store)

        wf = construct_workflow(length=3)
        result = wf.run(workflow_id="complex_workflow")
        index = _locate_initial_commit(debug_store) + 1
        debug_store.log_off()

        def resume(num_records_replayed):
            key = debug_store.wrapped_storage.make_key("complex_workflow")
            asyncio_run(debug_store.wrapped_storage.delete_prefix(key))

            async def replay():
                # We need to replay one by one to avoid conflict
                for i in range(num_records_replayed):
                    await debug_store.replay(i)

            asyncio_run(replay())
            return ray.get(workflow.resume(workflow_id="complex_workflow"))

        with pytest.raises(ValueError):
            # In some cases, the replayed records are too few to resume the
            # workflow.
            resume(index - 1)

        if isinstance(debug_store.wrapped_storage, FilesystemStorageImpl):
            # filesystem is faster, so we can cover all cases
            step_len = 1
        else:
            step_len = max((len(debug_store) - index) // 5, 1)

        for j in range(index, len(debug_store), step_len):
            assert resume(j) == result
def test_embedded_objectrefs(workflow_start_regular):
    workflow_id = test_embedded_objectrefs.__name__
    base_storage = storage.get_global_storage()

    class ObjectRefsWrapper:
        def __init__(self, refs):
            self.refs = refs

    url = base_storage.storage_url

    wrapped = ObjectRefsWrapper([ray.put(1), ray.put(2)])

    promise = serialization.dump_to_storage(["key"], wrapped, workflow_id,
                                            base_storage)
    workflow_storage.asyncio_run(promise)

    # Be extremely explicit about shutting down. We want to make sure the
    # `_get` call deserializes the full object and puts it in the object
    # store. Shutting down the cluster should guarantee we don't
    # accidentally get the old object and pass the test.
    ray.shutdown()
    subprocess.check_output("ray stop --force", shell=True)

    workflow.init(url)
    storage2 = workflow_storage.get_workflow_storage(workflow_id)

    result = workflow_storage.asyncio_run(storage2._get(["key"]))
    assert ray.get(result.refs) == [1, 2]
def step(self, *args, **kwargs):
    flattened_args = signature.flatten_args(self._signature, args, kwargs)
    actor_id = workflow_context.get_current_workflow_id()
    if not self.readonly:
        if self._method_name == "__init__":
            state_ref = None
        else:
            ws = WorkflowStorage(actor_id, get_global_storage())
            state_ref = WorkflowRef(ws.get_entrypoint_step_id())
        # This is a hack to insert a positional argument.
        flattened_args = [signature.DUMMY_TYPE, state_ref] + flattened_args
    workflow_inputs = serialization_context.make_workflow_inputs(
        flattened_args)

    if self.readonly:
        _actor_method = _wrap_readonly_actor_method(
            actor_id, self._original_class, self._method_name)
    else:
        _actor_method = _wrap_actor_method(self._original_class,
                                           self._method_name)
    workflow_data = WorkflowData(
        func_body=_actor_method,
        inputs=workflow_inputs,
        name=self._name,
        step_options=self._options,
        user_metadata=self._user_metadata,
    )
    wf = Workflow(workflow_data)
    return wf
def run(entry_workflow: Workflow,
        workflow_id: Optional[str] = None,
        metadata: Optional[Dict] = None) -> ray.ObjectRef:
    """Run a workflow asynchronously."""
    if metadata is not None:
        if not isinstance(metadata, dict):
            raise ValueError("metadata must be a dict.")
        for k, v in metadata.items():
            try:
                json.dumps(v)
            except TypeError as e:
                raise ValueError("metadata values must be JSON serializable, "
                                 "however '{}' has a value whose {}.".format(
                                     k, e))
    metadata = metadata or {}

    store = get_global_storage()
    assert ray.is_initialized()
    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    logger.info(
        f"Workflow job created. [id=\"{workflow_id}\", storage_url="
        f"\"{store.storage_url}\"]. Type: {entry_workflow.data.step_type}.")

    with workflow_context.workflow_step_context(workflow_id,
                                                store.storage_url):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)
        ws.save_workflow_user_metadata(metadata)

        wf_exists = True
        try:
            ws.get_entrypoint_step_id()
        except Exception:
            wf_exists = False

        # We only commit for
        #  - virtual actor tasks: it's dynamic tasks, so we always add
        #  - it's a new workflow
        # TODO (yic): follow up with force rerun
        if entry_workflow.data.step_type != StepType.FUNCTION or not wf_exists:
            commit_step(ws, "", entry_workflow, exception=None)
        workflow_manager = get_or_create_management_actor()
        ignore_existing = (entry_workflow.data.step_type != StepType.FUNCTION)
        # NOTE: It is important to 'ray.get' the returned output. This
        # ensures caller of 'run()' holds the reference to the workflow
        # result. Otherwise if the actor removes the reference of the
        # workflow output, the caller may fail to resolve the result.
        result: "WorkflowExecutionResult" = ray.get(
            workflow_manager.run_or_resume.remote(workflow_id,
                                                  ignore_existing))
        if entry_workflow.data.step_type == StepType.FUNCTION:
            return flatten_workflow_output(workflow_id,
                                           result.persisted_output)
        else:
            return flatten_workflow_output(workflow_id,
                                           result.volatile_output)
def run(
    entry_workflow: Workflow,
    workflow_id: Optional[str] = None,
    metadata: Optional[Dict] = None,
) -> ray.ObjectRef:
    """Run a workflow asynchronously."""
    validate_user_metadata(metadata)
    metadata = metadata or {}

    store = get_global_storage()
    assert ray.is_initialized()
    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    step_type = entry_workflow.data.step_options.step_type

    logger.info(
        f'Workflow job created. [id="{workflow_id}", storage_url='
        f'"{store.storage_url}"]. Type: {step_type}.'
    )

    with workflow_context.workflow_step_context(workflow_id, store.storage_url):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)
        ws.save_workflow_user_metadata(metadata)

        wf_exists = True
        try:
            ws.get_entrypoint_step_id()
        except Exception:
            wf_exists = False

        # "Is growing" means we could be adding steps to the (top-level)
        # workflow to grow the workflow dynamically at runtime.
        is_growing = step_type not in (StepType.FUNCTION, StepType.WAIT)

        # We only commit for
        #  - virtual actor tasks: it's dynamic tasks, so we always add
        #  - it's a new workflow
        # TODO (yic): follow up with force rerun
        if is_growing or not wf_exists:
            # We must checkpoint the entry workflow.
            commit_step(ws, "", entry_workflow, exception=None)
        workflow_manager = get_or_create_management_actor()
        ignore_existing = is_growing
        # NOTE: It is important to 'ray.get' the returned output. This
        # ensures caller of 'run()' holds the reference to the workflow
        # result. Otherwise if the actor removes the reference of the
        # workflow output, the caller may fail to resolve the result.
        result: "WorkflowExecutionResult" = ray.get(
            workflow_manager.run_or_resume.remote(workflow_id, ignore_existing)
        )
        if not is_growing:
            return flatten_workflow_output(workflow_id, result.persisted_output)
        else:
            return flatten_workflow_output(workflow_id, result.volatile_output)
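# Hedged usage sketch (not part of the module): how the internal `run()`
# above is reached through the public API. It assumes the `@workflow.step`
# decorator and the `run()`/`run_async()` entry points used by the tests in
# this section; `double` and the workflow ids are illustrative.
import ray
from ray import workflow


@workflow.step
def double(x: int) -> int:
    return 2 * x


def example_run():
    workflow.init()
    # Synchronous: blocks until the workflow finishes and returns the result.
    assert double.step(21).run(workflow_id="run_example") == 42
    # Asynchronous: returns an ObjectRef immediately; the `run()` above is
    # what ultimately produces this reference.
    ref = double.step(21).run_async(workflow_id="run_async_example",
                                    metadata={"owner": "example"})
    assert ray.get(ref) == 42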
def get_actor(actor_id: str) -> "VirtualActor":
    """Get a virtual actor.

    Args:
        actor_id: The ID of the actor.

    Returns:
        A virtual actor.
    """
    ensure_ray_initialized()
    return virtual_actor_class.get_actor(actor_id,
                                         storage_base.get_global_storage())
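# Hedged sketch of the virtual actor round trip (assumes the
# `@workflow.virtual_actor` decorator and the `get_or_create()` entry point
# shown later in this section; `Counter` is illustrative).
from ray import workflow


@workflow.virtual_actor
class Counter:
    def __init__(self):
        self.count = 0

    def incr(self):
        self.count += 1
        return self.count


def example_get_actor():
    counter = Counter.get_or_create("my_counter")
    assert counter.incr.run() == 1
    # Later (possibly from another driver), re-attach by id:
    same_counter = workflow.get_actor("my_counter")
    assert same_counter.incr.run() == 2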
def get_workflow_storage(workflow_id: Optional[str] = None) -> WorkflowStorage:
    """Get the storage for the workflow.

    Args:
        workflow_id: The ID of the workflow. If None, the ID is resolved
            from the current workflow context.

    Returns:
        A workflow storage.
    """
    store = storage.get_global_storage()
    if workflow_id is None:
        workflow_id = workflow_context.get_workflow_step_context().workflow_id
    return WorkflowStorage(workflow_id, store)
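# Hedged usage note: inside a running step the id can be omitted and is
# resolved from the workflow context, as the branch above shows.
def _example_get_workflow_storage():
    ws = get_workflow_storage()  # storage of the current workflow
    ws_other = get_workflow_storage("some_workflow_id")  # explicit id
    return ws, ws_other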
def _readonly_actor_method(*args, **kwargs):
    storage = get_global_storage()
    instance = cls.__new__(cls)
    try:
        state = get_latest_output(actor_id, storage)
    except Exception as e:
        raise VirtualActorNotInitializedError(
            f"Virtual actor '{actor_id}' has not been initialized. "
            "We cannot get the latest state for the "
            "readonly virtual actor.") from e
    __setstate(instance, state)
    method = getattr(instance, method_name)
    return method(*args, **kwargs)
def test_checkpoint_dag_full(workflow_start_regular):
    global_storage = storage.get_global_storage()

    outputs = utils.run_workflow_dag_with_options(
        checkpoint_dag, (True,),
        workflow_id="checkpoint_whole",
        name="checkpoint_dag")
    assert np.isclose(outputs, 8388607.5)

    recovered = ray.get(workflow.resume("checkpoint_whole"))
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_whole",
                                                  global_storage)
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "identity", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
def resume(workflow_id: str) -> ray.ObjectRef:
    """Resume a workflow asynchronously. See "api.resume()" for details."""
    storage = get_global_storage()
    logger.info(f'Resuming workflow [id="{workflow_id}", storage_url='
                f'"{storage.storage_url}"].')
    workflow_manager = get_or_create_management_actor()
    # NOTE: It is important to 'ray.get' the returned output. This
    # ensures caller of 'run()' holds the reference to the workflow
    # result. Otherwise if the actor removes the reference of the
    # workflow output, the caller may fail to resolve the result.
    result: "WorkflowExecutionResult" = ray.get(
        workflow_manager.run_or_resume.remote(
            workflow_id, ignore_existing=False))
    logger.info(f"Workflow job {workflow_id} resumed.")
    return flatten_workflow_output(workflow_id, result.persisted_output)
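# Hedged sketch of the recovery pattern this function supports, mirroring
# the crash tests in this section: capture the storage, restart the cluster,
# re-init with the same storage, then resume by workflow id.
import subprocess

import ray
from ray import workflow


def example_recover(workflow_id: str):
    _storage = get_global_storage()
    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    ray.init(num_cpus=4)
    workflow.init(storage=_storage)
    return ray.get(workflow.resume(workflow_id))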
def test_dedupe_download_raw_ref(workflow_start_regular):
    with tempfile.TemporaryDirectory() as temp_dir:
        debug_store = DebugStorage(get_global_storage(), temp_dir)
        utils._alter_storage(debug_store)

        ref = ray.put("hello")
        workflows = [identity.bind(ref) for _ in range(100)]
        workflow.create(gather.bind(*workflows)).run()

        ops = debug_store._logged_storage.get_op_counter()
        get_objects_count = 0
        for key in ops["get"]:
            if "objects" in key:
                get_objects_count += 1
        assert get_objects_count == 1
def test_dedupe_downloads_list(workflow_start_regular):
    with tempfile.TemporaryDirectory() as temp_dir:
        debug_store = DebugStorage(get_global_storage(), temp_dir)
        utils._alter_storage(debug_store)

        numbers = [ray.put(i) for i in range(5)]
        workflows = [identity.step(numbers) for _ in range(100)]
        gather.step(*workflows).run()

        ops = debug_store._logged_storage.get_op_counter()
        get_objects_count = 0
        for key in ops["get"]:
            if "objects" in key:
                get_objects_count += 1
        assert get_objects_count == 5
async def test_kv_storage(workflow_start_regular):
    kv_store = storage.get_global_storage()
    json_data = {"hello": "world"}
    bin_data = (31416).to_bytes(8, "big")
    key_1 = kv_store.make_key("aaa", "bbb", "ccc")
    key_2 = kv_store.make_key("aaa", "ddd")
    key_3 = kv_store.make_key("aaa", "eee")

    await kv_store.put(key_1, json_data, is_json=True)
    await kv_store.put(key_2, bin_data, is_json=False)

    assert json_data == await kv_store.get(key_1, is_json=True)
    assert bin_data == await kv_store.get(key_2, is_json=False)

    with pytest.raises(storage.KeyNotFoundError):
        await kv_store.get(key_3)

    prefix = kv_store.make_key("aaa")
    assert set(await kv_store.scan_prefix(prefix)) == {"bbb", "ddd"}
    assert set(await kv_store.scan_prefix(kv_store.make_key(""))) == {"aaa"}
def test_checkpoint_dag_skip_partial(workflow_start_regular):
    global_storage = storage.get_global_storage()

    outputs = (
        checkpoint_dag.options(name="checkpoint_dag")
        .step(False)
        .run(workflow_id="checkpoint_partial"))
    assert np.isclose(outputs, 8388607.5)

    recovered = ray.get(workflow.resume("checkpoint_partial"))
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_partial",
                                                  global_storage)
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "identity", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
def init_management_actor() -> None:
    """Initialize WorkflowManagementActor."""
    store = storage.get_global_storage()
    try:
        workflow_manager = get_management_actor()
        storage_url = ray.get(workflow_manager.get_storage_url.remote())
        if storage_url != store.storage_url:
            raise RuntimeError(
                "The workflow is using a storage "
                f"({store.storage_url}) different from the "
                f"workflow manager ({storage_url}).")
    except ValueError:
        logger.info("Initializing workflow manager...")
        # the actor does not exist
        WorkflowManagementActor.options(
            name=common.MANAGEMENT_ACTOR_NAME,
            namespace=common.MANAGEMENT_ACTOR_NAMESPACE,
            lifetime="detached").remote(store)
def test_crash_during_event_checkpointing(workflow_start_regular_shared):
    """Ensure that if the cluster dies while the event is being
    checkpointed, we properly re-poll for the event."""
    _storage = storage.get_global_storage()

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            utils.set_global_mark("first")

            utils.set_global_mark("time_to_die")
            while not utils.check_global_mark("resume"):
                time.sleep(0.1)

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")

    @workflow.step
    def wait_then_finish(arg):
        pass

    event_promise = workflow.wait_for_event(MyEventListener)
    wait_then_finish.step(event_promise).run_async("workflow")

    while not utils.check_global_mark("time_to_die"):
        time.sleep(0.1)

    assert utils.check_global_mark("first")
    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    # Give the workflow some time to kill the cluster.
    # time.sleep(3)

    ray.init(num_cpus=4)
    workflow.init(storage=_storage)
    workflow.resume("workflow")
    utils.set_global_mark("resume")

    ray.get(workflow.get_output("workflow"))
    assert utils.check_global_mark("second")
def run(entry_workflow: Workflow,
        workflow_id: Optional[str] = None) -> ray.ObjectRef:
    """Run a workflow asynchronously."""
    store = get_global_storage()
    assert ray.is_initialized()
    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    logger.info(f"Workflow job created. [id=\"{workflow_id}\", storage_url="
                f"\"{store.storage_url}\"].")

    with workflow_context.workflow_step_context(workflow_id,
                                                store.storage_url):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)

        wf_exists = True
        try:
            ws.get_entrypoint_step_id()
        except Exception:
            wf_exists = False

        # We only commit for
        #  - virtual actor tasks: it's dynamic tasks, so we always add
        #  - it's a new workflow
        # TODO (yic): follow up with force rerun
        if entry_workflow.data.step_type != StepType.FUNCTION or not wf_exists:
            commit_step(ws, "", entry_workflow, None)
        workflow_manager = get_or_create_management_actor()
        ignore_existing = (entry_workflow.data.step_type != StepType.FUNCTION)
        # NOTE: It is important to 'ray.get' the returned output. This
        # ensures caller of 'run()' holds the reference to the workflow
        # result. Otherwise if the actor removes the reference of the
        # workflow output, the caller may fail to resolve the result.
        result: "WorkflowExecutionResult" = ray.get(
            workflow_manager.run_or_resume.remote(workflow_id,
                                                  ignore_existing))
        if entry_workflow.data.step_type == StepType.FUNCTION:
            return flatten_workflow_output(workflow_id,
                                           result.persisted_output)
        else:
            return flatten_workflow_output(workflow_id,
                                           result.volatile_output)
def get_or_create_management_actor() -> "ActorHandle":
    """Get or create WorkflowManagementActor."""
    # TODO(suquark): We should not get the actor every time. We also need to
    # resume the actor if it failed. Using a global variable to cache the
    # actor seems not enough to resume the actor, because there is no
    # aliveness detection for an actor.
    try:
        workflow_manager = get_management_actor()
    except ValueError:
        store = storage.get_global_storage()
        # the actor does not exist
        logger.warning("Cannot access the workflow manager. It could be "
                       "because the workflow manager exited unexpectedly. "
                       "A new workflow manager is being created with "
                       f"storage '{store}'.")
        workflow_manager = WorkflowManagementActor.options(
            name=common.MANAGEMENT_ACTOR_NAME,
            namespace=common.MANAGEMENT_ACTOR_NAMESPACE,
            lifetime="detached").remote(store)
    return workflow_manager
def get_or_create_manager(warn_on_creation: bool = True) -> "ActorHandle":
    """Get or create the storage manager."""
    # TODO(suquark): We should not get the actor every time. We also need to
    # resume the actor if it failed. Using a global variable to cache the
    # actor seems not enough to resume the actor, because there is no
    # aliveness detection for an actor.
    try:
        return ray.get_actor(
            common.STORAGE_ACTOR_NAME,
            namespace=common.MANAGEMENT_ACTOR_NAMESPACE)
    except ValueError:
        store = storage.get_global_storage()
        if warn_on_creation:
            logger.warning("Cannot access the workflow serialization "
                           "manager. It could be because the workflow "
                           "manager exited unexpectedly. A new workflow "
                           "manager is being created with storage "
                           f"'{store}'.")
        handle = Manager.options(
            name=common.STORAGE_ACTOR_NAME,
            namespace=common.MANAGEMENT_ACTOR_NAMESPACE,
            lifetime="detached").remote(store)
        ray.get(handle.ping.remote())
        return handle
def init(storage: "Optional[Union[str, Storage]]" = None) -> None:
    """Initialize workflow.

    Args:
        storage: The external storage URL or a custom storage class. If not
            specified, ``/tmp/ray/workflow_data`` will be used.
    """
    if storage is None:
        storage = os.environ.get("RAY_WORKFLOW_STORAGE")

    if storage is None:
        # We should use get_temp_dir_path, but for ray client, we don't
        # have this one. We need a flag to tell whether it's a client
        # or a driver to use the right dir.
        # For now, just use /tmp/ray/workflow_data
        storage = "file:///tmp/ray/workflow_data"

    if isinstance(storage, str):
        logger.info(f"Using storage: {storage}")
        storage = storage_base.create_storage(storage)
    elif not isinstance(storage, Storage):
        raise TypeError("'storage' should be None, str, or Storage type.")

    try:
        _storage = storage_base.get_global_storage()
    except RuntimeError:
        pass
    else:
        # We use the 'else' branch because we raise a RuntimeError below,
        # and we do not want it to be captured by the 'except' clause above.
        if _storage.storage_url == storage.storage_url:
            logger.warning("Calling 'workflow.init()' again with the same "
                           "storage.")
        else:
            raise RuntimeError("Calling 'workflow.init()' again with a "
                               "different storage.")
    storage_base.set_global_storage(storage)
    workflow_access.init_management_actor()
    serialization.init_manager()
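# Hedged usage sketch: the three accepted forms of `storage`, matching the
# branches above. Only one form should be used per driver, since calling
# `init()` again with a different storage raises RuntimeError. The s3
# bucket name is hypothetical.
def _example_init_variants():
    init()  # default: $RAY_WORKFLOW_STORAGE, else file:///tmp/ray/workflow_data
    # init("s3://my-bucket/workflows")    # a storage URL string
    # init(storage_base.create_storage(   # a pre-built Storage object
    #     "file:///tmp/ray/workflow_data"))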
def get_or_create(self, actor_id: str, *args, **kwargs):
    return actor_cls._get_or_create(
        actor_id, args=args, kwargs=kwargs, storage=get_global_storage())
def get_or_create(self, actor_id: str, *args, **kwargs) -> "VirtualActor":
    """Create an actor. See `VirtualActorClassBase.create()`."""
    return self._get_or_create(
        actor_id, args=args, kwargs=kwargs, storage=get_global_storage())
def test_workflow_storage(workflow_start_regular):
    workflow_id = test_workflow_storage.__name__
    wf_storage = workflow_storage.WorkflowStorage(workflow_id,
                                                  storage.get_global_storage())
    step_id = "some_step"
    step_options = WorkflowStepRuntimeOptions(
        step_type=StepType.FUNCTION,
        catch_exceptions=False,
        max_retries=1,
        ray_options={})
    input_metadata = {
        "name": "test_basic_workflows.append1",
        "workflows": ["def"],
        "workflow_refs": ["some_ref"],
        "step_options": step_options.to_dict(),
    }
    output_metadata = {
        "output_step_id": "a12423",
        "dynamic_output_step_id": "b1234"
    }
    flattened_args = [
        signature.DUMMY_TYPE, 1, signature.DUMMY_TYPE, "2", "k", b"543"
    ]
    args = signature.recover_args(flattened_args)
    output = ["the_answer"]
    object_resolved = 42
    obj_ref = ray.put(object_resolved)

    # test basics
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_input_metadata(step_id), input_metadata,
            True))
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_function_body(step_id), some_func))
    asyncio_run(
        wf_storage._put(wf_storage._key_step_args(step_id), flattened_args))
    asyncio_run(
        wf_storage._put(
            wf_storage._key_obj_id(obj_ref.hex()), ray.get(obj_ref)))
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_output_metadata(step_id), output_metadata,
            True))
    asyncio_run(wf_storage._put(wf_storage._key_step_output(step_id), output))

    assert wf_storage.load_step_output(step_id) == output
    assert wf_storage.load_step_args(step_id, [], []) == args
    assert wf_storage.load_step_func_body(step_id)(33) == 34
    assert ray.get(wf_storage.load_object_ref(
        obj_ref.hex())) == object_resolved

    # test "inspect_step"
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        output_object_valid=True)
    assert inspect_result.is_recoverable()

    step_id = "some_step2"
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_input_metadata(step_id), input_metadata,
            True))
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_function_body(step_id), some_func))
    asyncio_run(wf_storage._put(wf_storage._key_step_args(step_id), args))
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_output_metadata(step_id), output_metadata,
            True))

    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        output_step_id=output_metadata["dynamic_output_step_id"])
    assert inspect_result.is_recoverable()

    step_id = "some_step3"
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_input_metadata(step_id), input_metadata,
            True))
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_function_body(step_id), some_func))
    asyncio_run(wf_storage._put(wf_storage._key_step_args(step_id), args))

    inspect_result = wf_storage.inspect_step(step_id)
    step_options = WorkflowStepRuntimeOptions(
        step_type=StepType.FUNCTION,
        catch_exceptions=False,
        max_retries=1,
        ray_options={})
    assert inspect_result == workflow_storage.StepInspectResult(
        args_valid=True,
        func_body_valid=True,
        workflows=input_metadata["workflows"],
        workflow_refs=input_metadata["workflow_refs"],
        step_options=step_options)
    assert inspect_result.is_recoverable()

    step_id = "some_step4"
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_input_metadata(step_id), input_metadata,
            True))
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_function_body(step_id), some_func))
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        func_body_valid=True,
        workflows=input_metadata["workflows"],
        workflow_refs=input_metadata["workflow_refs"],
        step_options=step_options)
    assert not inspect_result.is_recoverable()

    step_id = "some_step5"
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_input_metadata(step_id), input_metadata,
            True))
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        workflows=input_metadata["workflows"],
        workflow_refs=input_metadata["workflow_refs"],
        step_options=step_options)
    assert not inspect_result.is_recoverable()

    step_id = "some_step6"
    inspect_result = wf_storage.inspect_step(step_id)
    print(inspect_result)
    assert inspect_result == workflow_storage.StepInspectResult()
    assert not inspect_result.is_recoverable()
def get_metadata(paths, is_json=True):
    store = get_global_storage()
    key = store.make_key(*paths)
    return asyncio.get_event_loop().run_until_complete(
        store.get(key, is_json))
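# Hedged example (the path components below are hypothetical): `paths` is
# joined into a single storage key by `make_key`, then fetched as JSON by
# default.
def _example_get_metadata():
    return get_metadata(["some_workflow_id", "some_metadata.json"])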
def test_delete(workflow_start_regular):
    _storage = storage.get_global_storage()

    # Try deleting a random workflow that never existed.
    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="never_existed")

    # Delete a workflow that has not finished and is not running.
    @workflow.step
    def never_ends(x):
        utils.set_global_mark()
        time.sleep(1000000)
        return x

    never_ends.step("hello world").run_async("never_finishes")

    # Make sure the step is actually executing before killing the cluster.
    while not utils.check_global_mark():
        time.sleep(0.1)

    # Restart
    ray.shutdown()
    subprocess.check_output("ray stop --force", shell=True)
    workflow.init(storage=_storage)

    with pytest.raises(ray.exceptions.RaySystemError):
        result = workflow.get_output("never_finishes")
        ray.get(result)

    workflow.delete("never_finishes")

    with pytest.raises(ValueError):
        output = workflow.get_output("never_finishes")

    # TODO(Alex): Uncomment after
    # https://github.com/ray-project/ray/issues/19481.
    # with pytest.raises(WorkflowNotFoundError):
    #     workflow.resume("never_finishes")

    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="never_finishes")

    # Delete a workflow which has finished.
    @workflow.step
    def basic_step(arg):
        return arg

    result = basic_step.step("hello world").run(workflow_id="finishes")
    assert result == "hello world"
    output = workflow.get_output("finishes")
    assert ray.get(output) == "hello world"

    workflow.delete(workflow_id="finishes")

    with pytest.raises(ValueError):
        output = workflow.get_output("finishes")

    # TODO(Alex): Uncomment after
    # https://github.com/ray-project/ray/issues/19481.
    # with pytest.raises(ValueError):
    #     workflow.resume("finishes")

    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="finishes")

    assert workflow.list_all() == []

    # The workflow can be re-run as if it was never run before.
    assert basic_step.step("123").run(workflow_id="finishes") == "123"
def test_workflow_storage(workflow_start_regular):
    workflow_id = test_workflow_storage.__name__
    wf_storage = workflow_storage.WorkflowStorage(workflow_id,
                                                  storage.get_global_storage())
    step_id = "some_step"
    step_options = WorkflowStepRuntimeOptions.make(step_type=StepType.FUNCTION)
    input_metadata = {
        "name": "test_basic_workflows.append1",
        "workflows": ["def"],
        "workflow_refs": ["some_ref"],
        "step_options": step_options.to_dict(),
    }
    output_metadata = {
        "output_step_id": "a12423",
        "dynamic_output_step_id": "b1234"
    }
    root_output_metadata = {"output_step_id": "c123"}
    flattened_args = [
        signature.DUMMY_TYPE, 1, signature.DUMMY_TYPE, "2", "k", b"543"
    ]
    args = signature.recover_args(flattened_args)
    output = ["the_answer"]
    object_resolved = 42
    obj_ref = ray.put(object_resolved)

    # test basics
    asyncio_run(
        wf_storage._put(wf_storage._key_step_input_metadata(step_id),
                        input_metadata, True))
    asyncio_run(
        wf_storage._put(wf_storage._key_step_function_body(step_id),
                        some_func))
    asyncio_run(
        wf_storage._put(wf_storage._key_step_args(step_id), flattened_args))
    asyncio_run(
        wf_storage._put(wf_storage._key_obj_id(obj_ref.hex()),
                        ray.get(obj_ref)))
    asyncio_run(
        wf_storage._put(wf_storage._key_step_output_metadata(step_id),
                        output_metadata, True))
    asyncio_run(
        wf_storage._put(wf_storage._key_step_output_metadata(""),
                        root_output_metadata, True))
    asyncio_run(wf_storage._put(wf_storage._key_step_output(step_id), output))

    assert wf_storage.load_step_output(step_id) == output
    assert wf_storage.load_step_args(step_id, [], []) == args
    assert wf_storage.load_step_func_body(step_id)(33) == 34
    assert ray.get(wf_storage.load_object_ref(
        obj_ref.hex())) == object_resolved

    # test s3 path
    # here we hardcode the path to make sure the s3 path is parsed correctly
    if isinstance(wf_storage._storage, S3StorageImpl):
        assert (asyncio_run(
            wf_storage._storage.get(
                "workflow/test_workflow_storage/steps/outputs.json",
                True)) == root_output_metadata)

    # test "inspect_step"
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        output_object_valid=True)
    assert inspect_result.is_recoverable()

    step_id = "some_step2"
    asyncio_run(
        wf_storage._put(wf_storage._key_step_input_metadata(step_id),
                        input_metadata, True))
    asyncio_run(
        wf_storage._put(wf_storage._key_step_function_body(step_id),
                        some_func))
    asyncio_run(wf_storage._put(wf_storage._key_step_args(step_id), args))
    asyncio_run(
        wf_storage._put(wf_storage._key_step_output_metadata(step_id),
                        output_metadata, True))

    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        output_step_id=output_metadata["dynamic_output_step_id"])
    assert inspect_result.is_recoverable()

    step_id = "some_step3"
    asyncio_run(
        wf_storage._put(wf_storage._key_step_input_metadata(step_id),
                        input_metadata, True))
    asyncio_run(
        wf_storage._put(wf_storage._key_step_function_body(step_id),
                        some_func))
    asyncio_run(wf_storage._put(wf_storage._key_step_args(step_id), args))

    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        args_valid=True,
        func_body_valid=True,
        workflows=input_metadata["workflows"],
        workflow_refs=input_metadata["workflow_refs"],
        step_options=step_options,
    )
    assert inspect_result.is_recoverable()

    step_id = "some_step4"
    asyncio_run(
        wf_storage._put(wf_storage._key_step_input_metadata(step_id),
                        input_metadata, True))
    asyncio_run(
        wf_storage._put(wf_storage._key_step_function_body(step_id),
                        some_func))
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        func_body_valid=True,
        workflows=input_metadata["workflows"],
        workflow_refs=input_metadata["workflow_refs"],
        step_options=step_options,
    )
    assert not inspect_result.is_recoverable()

    step_id = "some_step5"
    asyncio_run(
        wf_storage._put(wf_storage._key_step_input_metadata(step_id),
                        input_metadata, True))
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        workflows=input_metadata["workflows"],
        workflow_refs=input_metadata["workflow_refs"],
        step_options=step_options,
    )
    assert not inspect_result.is_recoverable()

    step_id = "some_step6"
    inspect_result = wf_storage.inspect_step(step_id)
    print(inspect_result)
    assert inspect_result == workflow_storage.StepInspectResult()
    assert not inspect_result.is_recoverable()