def test_crash_after_commit(workflow_start_regular_shared):
    """Ensure that we don't re-call poll_for_event after
    `event_checkpointed` returns, even after a crash. Here we must call
    `event_checkpointed` twice, because there's no way to know if we called
    it after checkpointing.
    """
    _storage = storage.get_global_storage()

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            else:
                utils.set_global_mark("first")
                await asyncio.sleep(1000000)

    event_promise = workflow.wait_for_event(MyEventListener)
    event_promise.run_async("workflow")

    while not utils.check_global_mark("first"):
        time.sleep(0.1)

    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    ray.init(num_cpus=4)
    workflow.init(storage=_storage)
    workflow.resume("workflow")
    ray.get(workflow.get_output("workflow"))
    assert utils.check_global_mark("second")

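# The event tests above coordinate across driver restarts through
# `utils.set_global_mark` / `utils.check_global_mark`, which are defined
# outside this snippet. A minimal file-based sketch of such helpers, assuming
# a mark is just a touch-file in a shared temp directory that survives
# cluster restarts (the real helpers live in ray.workflow.tests.utils and
# may differ):
import pathlib
import tempfile

_MARK_DIR = pathlib.Path(tempfile.gettempdir()) / "workflow_test_marks"


def set_global_mark(name: str = "default") -> None:
    # Create an empty file; its existence is the mark.
    _MARK_DIR.mkdir(parents=True, exist_ok=True)
    (_MARK_DIR / name).touch()


def check_global_mark(name: str = "default") -> bool:
    # A mark is "set" iff its file exists, regardless of which process set it.
    return (_MARK_DIR / name).exists()
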
def test_init_twice_2(call_ray_start, reset_workflow, tmp_path):
    with patch.dict(os.environ, {"RAY_ADDRESS": call_ray_start}):
        run_string_as_driver(driver_script)
        with pytest.raises(
            RuntimeError, match=".*different from the workflow manager.*"
        ):
            workflow.init(str(tmp_path))

def test_embedded_objectrefs(workflow_start_regular):
    workflow_id = test_embedded_objectrefs.__name__

    class ObjectRefsWrapper:
        def __init__(self, refs):
            self.refs = refs

    from ray.internal.storage import _storage_uri

    wrapped = ObjectRefsWrapper([ray.put(1), ray.put(2)])

    store = workflow_storage.get_workflow_storage(workflow_id)
    serialization.dump_to_storage("key", wrapped, workflow_id, store)

    # Be extremely explicit about shutting down. We want to make sure the
    # `_get` call deserializes the full object and puts it in the object
    # store. Shutting down the cluster should guarantee we don't accidentally
    # get the old object and pass the test.
    ray.shutdown()
    subprocess.check_output("ray stop --force", shell=True)

    ray.init(storage=_storage_uri)
    workflow.init()
    storage2 = workflow_storage.get_workflow_storage(workflow_id)

    result = storage2._get("key")
    assert ray.get(result.refs) == [1, 2]

def test_dedupe_cluster_failure(tmp_path):
    """
    ======== driver 1 ===========
    1. Checkpoint the input args
        * Uploads
    2. Begin to run step
        * Crash

    ====== driver 2 ============
    1. Recover inputs
        * Creates a new object ref
    2. Finish running step
    3. Checkpoint step output
        * Should not trigger upload
    """
    ray.shutdown()
    lock_file = tmp_path / "lock"
    workflow_dir = tmp_path / "workflow"

    driver_script = f"""
import time
import ray
from ray import workflow
from filelock import FileLock

@ray.remote
def foo(objrefs):
    with FileLock("{str(lock_file)}"):
        return objrefs

if __name__ == "__main__":
    ray.init(storage="{str(workflow_dir)}")
    workflow.init()
    arg = ray.put("hello world")
    workflow.create(foo.bind([arg, arg])).run()
    assert False
"""
    lock = FileLock(lock_file)
    lock.acquire()

    run_string_as_driver_nonblocking(driver_script)
    time.sleep(10)

    subprocess.check_call(["ray", "stop", "--force"])

    lock.release()

    ray.init(storage=str(workflow_dir))
    workflow.init()
    resumed = workflow.resume_all()
    assert len(resumed) == 1
    objref = resumed.pop()[1]
    ray.get(objref)

    # The object ref differs before and after recovery, so without dedupe the
    # data would be uploaded twice; dedupe should keep it to a single upload.
    assert get_num_uploads() == 1
    ray.shutdown()

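# `get_num_uploads` is defined outside this snippet; the test treats it as a
# counter of how many objects have been persisted to workflow storage. A
# hypothetical stand-in, assuming each upload materializes one file under an
# `objects/` directory in the storage path (the real helper inspects Ray
# internals instead, so treat this purely as an illustration):
import os


def get_num_uploads(objects_dir: str = "/tmp/workflow/objects") -> int:
    # Count persisted object files; a deduplicated upload creates only one.
    if not os.path.isdir(objects_dir):
        return 0
    return len(os.listdir(objects_dir))
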
def test_embedded_objectrefs(workflow_start_regular):
    workflow_id = test_embedded_objectrefs.__name__
    base_storage = storage.get_global_storage()

    class ObjectRefsWrapper:
        def __init__(self, refs):
            self.refs = refs

    url = base_storage.storage_url

    wrapped = ObjectRefsWrapper([ray.put(1), ray.put(2)])

    promise = serialization.dump_to_storage(
        ["key"], wrapped, workflow_id, base_storage
    )
    workflow_storage.asyncio_run(promise)

    # Be extremely explicit about shutting down. We want to make sure the
    # `_get` call deserializes the full object and puts it in the object
    # store. Shutting down the cluster should guarantee we don't accidentally
    # get the old object and pass the test.
    ray.shutdown()
    subprocess.check_output("ray stop --force", shell=True)

    workflow.init(url)
    storage2 = workflow_storage.get_workflow_storage(workflow_id)

    result = workflow_storage.asyncio_run(storage2._get(["key"]))
    assert ray.get(result.refs) == [1, 2]

def test_recovery_cluster_failure(tmp_path):
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    proc = run_string_as_driver_nonblocking(
        f"""
import time
import ray
from ray import workflow

@ray.remote
def foo(x):
    print("Executing", x)
    time.sleep(1)
    if x < 20:
        return workflow.continuation(foo.bind(x + 1))
    else:
        return 20

if __name__ == "__main__":
    ray.init(storage="{tmp_path}")
    workflow.init()
    assert workflow.create(foo.bind(0)).run(workflow_id="cluster_failure") == 20
"""
    )
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    ray.init(storage=str(tmp_path))
    workflow.init()
    assert ray.get(workflow.resume("cluster_failure")) == 20
    ray.shutdown()

def test_workflow_lifetime_1(call_ray_start, reset_workflow):
    # Case 1: driver exits normally
    with patch.dict(os.environ, {"RAY_ADDRESS": call_ray_start}):
        run_string_as_driver(driver_script.format(5))
        workflow.init()
        output = workflow.get_output("driver_terminated")
        assert ray.get(output) == 20

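# `driver_script` for the lifetime tests is defined outside this snippet. A
# plausible reconstruction, inferred from the tests themselves (workflow id
# "driver_terminated", expected output 20, and a `.format(...)` slot that
# controls how long the driver lingers). This is an assumption, not the
# verbatim original:
driver_script = """
import time
import ray
from ray import workflow

@ray.remote
def foo(x):
    time.sleep(1)
    if x < 20:
        return workflow.continuation(foo.bind(x + 1))
    return 20

if __name__ == "__main__":
    ray.init(address="auto")
    workflow.init()
    workflow.create(foo.bind(0)).run_async(workflow_id="driver_terminated")
    # The format slot decides whether the driver exits normally (short sleep)
    # or is killed mid-run (long sleep).
    time.sleep({})
"""
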
def test_workflow_concurrency_limit_reinit(shutdown_only):
    workflow.init(max_running_workflows=5, max_pending_workflows=6)
    workflow.init(max_running_workflows=5, max_pending_workflows=6)
    with pytest.raises(ValueError):
        workflow.init(max_running_workflows=7, max_pending_workflows=8)
    workflow.init()
    workflow.init(max_running_workflows=None, max_pending_workflows=None)

def test_workflow_queuing_3(shutdown_only, tmp_path):
    """This test ensures the queuing workflow is indeed pending."""
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=1, max_pending_workflows=1)

    import time
    import filelock
    from ray.exceptions import GetTimeoutError

    lock_path = str(tmp_path / ".lock")

    @ray.remote
    def long_running(x):
        (tmp_path / str(x)).write_text(str(x))
        with filelock.FileLock(lock_path):
            return x

    workflow_id = "test_workflow_queuing_3"
    with filelock.FileLock(lock_path):
        wf_1 = workflow.run_async(
            long_running.bind(1), workflow_id=f"{workflow_id}_1"
        )
        wf_2 = workflow.run_async(
            long_running.bind(2), workflow_id=f"{workflow_id}_2"
        )
        time.sleep(5)
        assert (tmp_path / str(1)).exists()
        assert not (tmp_path / str(2)).exists()
        assert (
            workflow.get_status(workflow_id=f"{workflow_id}_1")
            == workflow.RUNNING
        )
        assert (
            workflow.get_status(workflow_id=f"{workflow_id}_2")
            == workflow.PENDING
        )
        with pytest.raises(GetTimeoutError):
            ray.get(wf_2, timeout=5)

    assert ray.get([wf_1, wf_2]) == [1, 2]

def test_resume_different_storage(ray_start_regular, tmp_path, reset_workflow):
    @ray.remote
    def constant():
        return 31416

    workflow.init(storage=str(tmp_path))
    workflow.create(constant.bind()).run(workflow_id="const")
    assert ray.get(workflow.resume(workflow_id="const")) == 31416
    workflow.storage.set_global_storage(None)

def test_resume_different_storage(shutdown_only, tmp_path):
    @ray.remote
    def constant():
        return 31416

    ray.init(storage=str(tmp_path))
    workflow.init()
    workflow.create(constant.bind()).run(workflow_id="const")
    assert ray.get(workflow.resume(workflow_id="const")) == 31416

def test_workflow_lifetime_2(call_ray_start, reset_workflow):
    # Case 2: driver terminated
    proc = run_string_as_driver_nonblocking(driver_script.format(100))
    time.sleep(10)
    proc.kill()
    time.sleep(1)
    workflow.init()
    output = workflow.get_output("driver_terminated")
    assert ray.get(output) == 20

def test_workflow_lifetime_2(call_ray_start, reset_workflow):
    # Case 2: driver terminated
    with patch.dict(os.environ, {"RAY_ADDRESS": call_ray_start}):
        proc = run_string_as_driver_nonblocking(driver_script.format(100))
        time.sleep(10)
        proc.kill()
        time.sleep(1)
        workflow.init()
        output = workflow.get_output("driver_terminated")
        assert ray.get(output) == 20

def test_workflow_error_message():
    storage_url = r"c:\ray"
    expected_error_msg = "Invalid url: {}.".format(storage_url)
    if os.name == "nt":
        expected_error_msg += (
            " Try using file://{} or file:///{} for Windows file paths.".format(
                storage_url, storage_url
            )
        )
    with pytest.raises(ValueError) as e:
        workflow.init(storage_url)
    assert str(e.value) == expected_error_msg

def test_recovery_cluster_failure(reset_workflow, tmp_path):
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    proc = run_string_as_driver_nonblocking(
        driver_script.format(tmp_path=str(tmp_path))
    )
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    workflow.init(str(tmp_path))
    assert ray.get(workflow.resume("cluster_failure")) == 20
    workflow.storage.set_global_storage(None)
    ray.shutdown()

def test_crash_during_event_checkpointing(workflow_start_regular_shared):
    """Ensure that if the cluster dies while the event is being checkpointed,
    we properly re-poll for the event."""
    from ray._private import storage

    storage_uri = storage._storage_uri

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            utils.set_global_mark("first")

            utils.set_global_mark("time_to_die")
            while not utils.check_global_mark("resume"):
                time.sleep(0.1)

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")

    @ray.remote
    def wait_then_finish(arg):
        pass

    event_promise = workflow.wait_for_event(MyEventListener)
    workflow.run_async(
        wait_then_finish.bind(event_promise), workflow_id="workflow"
    )

    while not utils.check_global_mark("time_to_die"):
        time.sleep(0.1)
    assert utils.check_global_mark("first")
    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    # Give the workflow some time to kill the cluster.
    # time.sleep(3)

    ray.init(num_cpus=4, storage=storage_uri)
    workflow.init()
    workflow.resume_async("workflow")
    utils.set_global_mark("resume")

    workflow.get_output("workflow")
    assert utils.check_global_mark("second")

def test_workflow_queuing_1(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    import queue
    import filelock

    lock_path = str(tmp_path / ".lock")

    @ray.remote
    def long_running(x):
        with filelock.FileLock(lock_path):
            return x

    wfs = [long_running.bind(i) for i in range(5)]

    with filelock.FileLock(lock_path):
        refs = [
            workflow.run_async(wfs[i], workflow_id=f"workflow_{i}")
            for i in range(4)
        ]

        assert sorted(x[0] for x in workflow.list_all({workflow.RUNNING})) == [
            "workflow_0",
            "workflow_1",
        ]
        assert sorted(x[0] for x in workflow.list_all({workflow.PENDING})) == [
            "workflow_2",
            "workflow_3",
        ]

        with pytest.raises(queue.Full, match="Workflow queue has been full"):
            workflow.run(wfs[4], workflow_id="workflow_4")

    assert ray.get(refs) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]

    for i in range(5):
        assert workflow.get_output(f"workflow_{i}") == i

def test_recovery_cluster_failure_resume_all(tmp_path, shutdown_only):
    ray.shutdown()
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    workflow_dir = tmp_path / "workflow"
    lock_file = tmp_path / "lock_file"
    lock = FileLock(lock_file)
    lock.acquire()

    proc = run_string_as_driver_nonblocking(
        f"""
import time
import ray
from ray import workflow
from filelock import FileLock

@ray.remote
def foo(x):
    with FileLock("{str(lock_file)}"):
        return 20

if __name__ == "__main__":
    ray.init(storage="{str(workflow_dir)}")
    workflow.init()
    assert workflow.create(foo.bind(0)).run(workflow_id="cluster_failure") == 20
"""
    )
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    lock.release()
    ray.init(storage=str(workflow_dir))
    workflow.init()
    resumed = workflow.resume_all()
    assert len(resumed) == 1
    (wid, obj_ref) = resumed[0]
    assert wid == "cluster_failure"
    assert ray.get(obj_ref) == 20

def test_workflow_queuing_2(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    @ray.remote
    def short_running(x):
        return x

    wfs = [short_running.bind(i) for i in range(5)]
    refs = [
        workflow.run_async(wfs[i], workflow_id=f"workflow_{i}") for i in range(4)
    ]
    for i in range(4):
        assert workflow.get_output(f"workflow_{i}") == i
    assert ray.get(refs) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]

def test_recovery_cluster_failure_resume_all(reset_workflow, tmp_path):
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    workflow_dir = tmp_path / "workflow"
    lock_file = tmp_path / "lock_file"
    driver_script = f"""
import time
from ray import workflow
from filelock import FileLock

@workflow.step
def foo(x):
    with FileLock("{str(lock_file)}"):
        return 20

if __name__ == "__main__":
    workflow.init("{str(workflow_dir)}")
    assert foo.step(0).run(workflow_id="cluster_failure") == 20
"""
    lock = FileLock(lock_file)
    lock.acquire()
    proc = run_string_as_driver_nonblocking(driver_script)
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    lock.release()
    workflow.init(str(workflow_dir))
    resumed = workflow.resume_all()
    assert len(resumed) == 1
    (wid, obj_ref) = resumed[0]
    assert wid == "cluster_failure"
    assert ray.get(obj_ref) == 20
    workflow.storage.set_global_storage(None)
    ray.shutdown()

from ray import workflow


@workflow.step
def hello(msg: str) -> None:
    print(msg)


@workflow.step
def wait_all(*args) -> None:
    pass


if __name__ == "__main__":
    workflow.init()
    children = []
    for msg in ["hello world", "goodbye world"]:
        children.append(hello.step(msg))
    wait_all.step(*children).run()

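# The same fan-out/fan-in DAG expressed with the @ray.remote + bind API used
# by the other tests in this suite; a sketch showing the two styles side by
# side (renamed `*_bind` functions are illustrative, and this assumes both
# APIs are available in the installed Ray version):
import ray
from ray import workflow


@ray.remote
def hello_bind(msg: str) -> None:
    print(msg)


@ray.remote
def wait_all_bind(*args) -> None:
    pass


if __name__ == "__main__":
    workflow.init()
    # Fan out: one child per message; fan in: a single step awaits them all.
    children = [hello_bind.bind(msg) for msg in ["hello world", "goodbye world"]]
    workflow.create(wait_all_bind.bind(*children)).run()
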
def test_workflow_concurrency_limit_argument(shutdown_only):
    with pytest.raises(TypeError):
        workflow.init(1, 2)
    with pytest.raises(TypeError):
        workflow.init(max_running_workflows=1.7)
    with pytest.raises(TypeError):
        workflow.init(max_pending_workflows=1.7)
    with pytest.raises(ValueError):
        workflow.init(max_running_workflows=-2)
    with pytest.raises(ValueError):
        workflow.init(max_pending_workflows=-2)
    with pytest.raises(ValueError):
        workflow.init(max_running_workflows=0)

def test_delete(workflow_start_regular):
    from ray._private.storage import _storage_uri

    # Try deleting a random workflow that never existed.
    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="never_existed")

    # Delete a workflow that has not finished and is not running.
    @ray.remote
    def never_ends(x):
        utils.set_global_mark()
        time.sleep(1000000)
        return x

    workflow.create(never_ends.bind("hello world")).run_async("never_finishes")

    # Make sure the step is actually executing before killing the cluster.
    while not utils.check_global_mark():
        time.sleep(0.1)

    # Restart
    ray.shutdown()
    subprocess.check_output("ray stop --force", shell=True)
    ray.init(storage=_storage_uri)
    workflow.init()

    with pytest.raises(ray.exceptions.RaySystemError):
        result = workflow.get_output("never_finishes")
        ray.get(result)

    workflow.delete("never_finishes")

    with pytest.raises(ValueError):
        output = workflow.get_output("never_finishes")

    # TODO(Alex): Uncomment after
    # https://github.com/ray-project/ray/issues/19481.
    # with pytest.raises(WorkflowNotFoundError):
    #     workflow.resume("never_finishes")

    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="never_finishes")

    # Delete a workflow which has finished.
    @ray.remote
    def basic_step(arg):
        return arg

    result = workflow.create(basic_step.bind("hello world")).run(
        workflow_id="finishes"
    )
    assert result == "hello world"
    output = workflow.get_output("finishes")
    assert ray.get(output) == "hello world"

    workflow.delete(workflow_id="finishes")

    with pytest.raises(ValueError):
        output = workflow.get_output("finishes")

    # TODO(Alex): Uncomment after
    # https://github.com/ray-project/ray/issues/19481.
    # with pytest.raises(ValueError):
    #     workflow.resume("finishes")

    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="finishes")

    assert workflow.list_all() == []

    # The workflow can be re-run as if it was never run before.
    assert workflow.create(basic_step.bind("123")).run(workflow_id="finishes") == "123"

def test_init_twice(call_ray_start, reset_workflow, tmp_path):
    workflow.init()
    with pytest.raises(RuntimeError):
        workflow.init(str(tmp_path))

def test_resume_different_storage(ray_start_regular, tmp_path, reset_workflow):
    # The `constant` step was defined outside this fragment; it is restored
    # here (matching the other versions of this test) so the test is
    # self-contained.
    @workflow.step
    def constant():
        return 31416

    workflow.init(storage=str(tmp_path))
    constant.step().run(workflow_id="const")
    assert ray.get(workflow.resume(workflow_id="const")) == 31416
    workflow.storage.set_global_storage(None)

def test_init_twice_2(call_ray_start, reset_workflow, tmp_path):
    run_string_as_driver(driver_script)
    with pytest.raises(RuntimeError):
        workflow.init(str(tmp_path))

def _alter_storage(new_storage):
    set_global_storage(new_storage)  # alter the storage
    ray.shutdown()
    os.system("ray stop --force")
    workflow.init(new_storage)

def test_workflow_queuing_resume_all(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    import queue
    import filelock

    lock_path = str(tmp_path / ".lock")

    @ray.remote
    def long_running(x):
        with filelock.FileLock(lock_path):
            return x

    wfs = [long_running.bind(i) for i in range(5)]

    with filelock.FileLock(lock_path):
        _refs = [  # noqa: F841
            workflow.run_async(wfs[i], workflow_id=f"workflow_{i}")
            for i in range(4)
        ]

        assert sorted(x[0] for x in workflow.list_all({workflow.RUNNING})) == [
            "workflow_0",
            "workflow_1",
        ]
        assert sorted(x[0] for x in workflow.list_all({workflow.PENDING})) == [
            "workflow_2",
            "workflow_3",
        ]

        with pytest.raises(queue.Full, match="Workflow queue has been full"):
            workflow.run(wfs[4], workflow_id="workflow_4")

        # kill all workflows
        ray.shutdown()

    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    with filelock.FileLock(lock_path):
        workflow_ids, outputs = zip(*sorted(workflow.resume_all()))
        # We should see the same running and pending workflows, because
        # resume_all() resumes previously running workflows with higher
        # priority.
        assert sorted(x[0] for x in workflow.list_all({workflow.RUNNING})) == [
            "workflow_0",
            "workflow_1",
        ]
        assert sorted(x[0] for x in workflow.list_all({workflow.PENDING})) == [
            "workflow_2",
            "workflow_3",
        ]

    assert workflow_ids == (
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
    )

    assert ray.get(list(outputs)) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]

def test_workflow_lifetime_1(call_ray_start, reset_workflow):
    # Case 1: driver exits normally
    run_string_as_driver(driver_script.format(5))
    workflow.init()
    output = workflow.get_output("driver_terminated")
    assert ray.get(output) == 20