def test_crash_after_commit(workflow_start_regular_shared):
    """Ensure that we don't re-call poll_for_event after `event_checkpointed`
    returns, even after a crash.

    Here we must call `event_checkpointed` twice, because there's no way to
    know if we called it after checkpointing.
    """
    # NOTE: the docstring above was previously placed after this statement,
    # making it a no-op string expression rather than the function docstring.
    _storage = storage.get_global_storage()

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            # Once the event has been committed, we must never be polled again.
            assert not utils.check_global_mark("committed")

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")
            if utils.check_global_mark("first"):
                # Second invocation (after the crash/resume) — the behavior
                # under test.
                utils.set_global_mark("second")
            else:
                utils.set_global_mark("first")
                # Hang "forever" so the test can kill the cluster while the
                # checkpoint acknowledgement is still pending.
                await asyncio.sleep(1000000)

    event_promise = workflow.wait_for_event(MyEventListener)
    event_promise.run_async("workflow")

    # Wait until the first `event_checkpointed` call is underway.
    while not utils.check_global_mark("first"):
        time.sleep(0.1)

    # Simulate a hard cluster crash, then restart and resume the workflow.
    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    ray.init(num_cpus=4)
    workflow.init(storage=_storage)
    workflow.resume("workflow")
    ray.get(workflow.get_output("workflow"))
    assert utils.check_global_mark("second")
def test_wait_for_multiple_events(workflow_start_regular_shared):
    """A workflow with several event arguments must wait on all of them
    concurrently, not one after another."""

    class FirstListener(workflow.EventListener):
        async def poll_for_event(self):
            utils.set_global_mark("listener1")
            # Spin until the test fires the shared trigger.
            while not utils.check_global_mark("trigger_event"):
                await asyncio.sleep(0.1)
            return "event1"

    class SecondListener(workflow.EventListener):
        async def poll_for_event(self):
            utils.set_global_mark("listener2")
            while not utils.check_global_mark("trigger_event"):
                await asyncio.sleep(0.1)
            return "event2"

    @ray.remote
    def combine(arg1, arg2):
        return f"{arg1} {arg2}"

    first_event = workflow.wait_for_event(FirstListener)
    second_event = workflow.wait_for_event(SecondListener)
    result_ref = workflow.create(
        combine.bind(first_event, second_event)).run_async()

    # Seeing both listeners polling at the same time proves the events are
    # awaited in parallel.
    both_polling = False
    while not both_polling:
        both_polling = utils.check_global_mark(
            "listener1") and utils.check_global_mark("listener2")
        if not both_polling:
            time.sleep(0.1)
    utils.set_global_mark("trigger_event")
    assert ray.get(result_ref) == "event1 event2"
async def poll_for_event(self):
    """Poll once, announce readiness to die, then hang until told to resume.

    NOTE(review): the wait loop below uses a blocking `time.sleep` inside a
    coroutine — presumably intentional here to stall the listener; confirm.
    """
    # Once the event has been checkpointed we must never be polled again.
    assert not utils.check_global_mark("committed")
    if utils.check_global_mark("first"):
        # A second poll (after a crash/restart) proves re-polling occurred.
        utils.set_global_mark("second")
    utils.set_global_mark("first")
    utils.set_global_mark("time_to_die")
    # Block until the driver restarts the cluster and sets "resume".
    while not utils.check_global_mark("resume"):
        time.sleep(0.1)
def test_crash_during_event_checkpointing(workflow_start_regular_shared):
    """Ensure that if the cluster dies while the event is being checkpointed,
    we properly re-poll for the event — and that we don't re-call
    poll_for_event after `event_checkpointed` returns, even after a crash.
    """
    # NOTE: a second stray docstring-like string literal was merged into the
    # docstring above; it was a no-op expression statement before.
    from ray._private import storage

    storage_uri = storage._storage_uri

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            # Once the event has been checkpointed we must never poll again.
            assert not utils.check_global_mark("committed")
            if utils.check_global_mark("first"):
                # A second poll (after the restart) proves re-polling occurred.
                utils.set_global_mark("second")
            utils.set_global_mark("first")
            utils.set_global_mark("time_to_die")
            # Block (with a blocking sleep, deliberately) until resumed.
            while not utils.check_global_mark("resume"):
                time.sleep(0.1)

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")

    @ray.remote
    def wait_then_finish(arg):
        pass

    event_promise = workflow.wait_for_event(MyEventListener)
    workflow.run_async(
        wait_then_finish.bind(event_promise), workflow_id="workflow")

    # Wait until the listener is inside its first poll.
    while not utils.check_global_mark("time_to_die"):
        time.sleep(0.1)
    assert utils.check_global_mark("first")

    # Simulate a cluster crash before the event checkpoint completes.
    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    ray.init(num_cpus=4, storage=storage_uri)
    workflow.init()
    workflow.resume_async("workflow")
    utils.set_global_mark("resume")
    workflow.get_output("workflow")
    assert utils.check_global_mark("second")
async def event_checkpointed(self, event):
    """Record the checkpoint; hang on the first call to simulate a crash."""
    utils.set_global_mark("committed")
    if utils.check_global_mark("first"):
        # Second call (after crash recovery) — the behavior under test.
        utils.set_global_mark("second")
    else:
        utils.set_global_mark("first")
        # Hang "forever" so the test can kill the cluster mid-checkpoint.
        await asyncio.sleep(1000000)
def identity2(x):
    """Return ``x``, but SIGKILL this worker while the global mark is unset.

    Used to force hard step failures until the test flips the mark.
    """
    mark_is_set = utils.check_global_mark()
    if not mark_is_set:
        import os

        # Signal 9 (SIGKILL): crash the worker process outright.
        os.kill(os.getpid(), 9)
    return x
def test_delete(workflow_start_regular):
    """Exercise workflow.delete on never-existing, unfinished, deleted, and
    finished workflows."""
    from ray._private.storage import _storage_uri

    # Try deleting a random workflow that never existed.
    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="never_existed")

    # Delete a workflow that has not finished and is not running.
    @ray.remote
    def never_ends(x):
        utils.set_global_mark()
        time.sleep(1000000)
        return x

    workflow.create(never_ends.bind("hello world")).run_async("never_finishes")

    # Make sure the step is actually executing before killing the cluster.
    while not utils.check_global_mark():
        time.sleep(0.1)

    # Restart the cluster.
    ray.shutdown()
    subprocess.check_output("ray stop --force", shell=True)

    ray.init(storage=_storage_uri)
    workflow.init()

    with pytest.raises(ray.exceptions.RaySystemError):
        result = workflow.get_output("never_finishes")
        ray.get(result)

    workflow.delete("never_finishes")

    with pytest.raises(ValueError):
        output = workflow.get_output("never_finishes")

    # TODO(Alex): Uncomment after
    # https://github.com/ray-project/ray/issues/19481.
    # with pytest.raises(WorkflowNotFoundError):
    #     workflow.resume("never_finishes")

    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="never_finishes")

    # Delete a workflow which has finished.
    @ray.remote
    def basic_step(arg):
        return arg

    result = workflow.create(
        basic_step.bind("hello world")).run(workflow_id="finishes")
    assert result == "hello world"
    output = workflow.get_output("finishes")
    assert ray.get(output) == "hello world"

    workflow.delete(workflow_id="finishes")

    with pytest.raises(ValueError):
        output = workflow.get_output("finishes")

    # TODO(Alex): Uncomment after
    # https://github.com/ray-project/ray/issues/19481.
    # with pytest.raises(ValueError):
    #     workflow.resume("finishes")

    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="finishes")

    assert workflow.list_all() == []

    # The workflow can be re-run as if it was never run before.
    assert workflow.create(
        basic_step.bind("123")).run(workflow_id="finishes") == "123"
async def poll_for_event(self):
    """Resolve once the global mark has been set, then linger briefly."""
    while not utils.check_global_mark():
        await asyncio.sleep(0.1)
    # Give the other step time to finish.
    await asyncio.sleep(1)
async def poll_for_event(self):
    """Mark that listener 2 is polling, then wait for the shared trigger."""
    utils.set_global_mark("listener2")
    while not utils.check_global_mark("trigger_event"):
        await asyncio.sleep(0.1)
    return "event2"
def identity(x: int):
    """Identity step; fails unless the gating global mark has been set."""
    # block the step by a global mark
    assert utils.check_global_mark()
    return x
async def poll_for_event(self):
    """Resolve once the global mark has been set (re-checks every second)."""
    while not utils.check_global_mark():
        await asyncio.sleep(1)
async def poll_for_event(self):
    """Must never be polled again once the event has been checkpointed."""
    assert not utils.check_global_mark("committed")
def filter_all_2(wait_results): assert wait_results[0] == [1, 3, 2] # failure point assert utils.check_global_mark() ready, unready = wait_results return get_all.step(ready, unready)
def triggers_event():
    """Set the trigger mark, then block until the event reports it returned."""
    utils.set_global_mark()
    while not utils.check_global_mark("event_returning"):
        time.sleep(0.1)
async def poll_for_event(self):
    """Wait for the trigger mark, then announce that the event is returning."""
    while not utils.check_global_mark():
        await asyncio.sleep(0.1)
    utils.set_global_mark("event_returning")
def the_failed_step(x):
    """Wrap ``x`` as ``foo(x)``, SIGKILLing this worker while the global mark
    is unset so the step fails hard until the test flips the mark."""
    mark_is_set = utils.check_global_mark()
    if not mark_is_set:
        import os

        # Signal 9 (SIGKILL): crash the worker process outright.
        os.kill(os.getpid(), 9)
    return "foo(" + x + ")"
def sleep_identity(x: int):
    """Wait for the gating mark, then sleep ``x`` seconds and return ``x``."""
    # block the step by a global mark
    while not utils.check_global_mark():
        time.sleep(0.1)
    time.sleep(x)
    return x