def test_run_or_resume_during_running(workflow_start_regular_shared):
    @ray.remote
    def source1():
        return "[source1]"

    @ray.remote
    def append1(x):
        return x + "[append1]"

    @ray.remote
    def append2(x):
        return x + "[append2]"

    @ray.remote
    def simple_sequential():
        x = source1.bind()
        y = append1.bind(x)
        return workflow.continuation(append2.bind(y))

    output = workflow.create(simple_sequential.bind()).run_async(
        workflow_id="running_workflow")
    with pytest.raises(RuntimeError):
        workflow.create(simple_sequential.bind()).run_async(
            workflow_id="running_workflow")
    with pytest.raises(RuntimeError):
        workflow.resume(workflow_id="running_workflow")
    assert ray.get(output) == "[source1][append1][append2]"
def test_crash_after_commit(workflow_start_regular_shared):
    """Ensure that we don't re-call poll_for_event after `event_checkpointed`
    returns, even after a crash. Here we must call `event_checkpointed`
    twice, because there's no way to know if we called it after
    checkpointing.
    """
    _storage = storage.get_global_storage()

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            else:
                utils.set_global_mark("first")
                await asyncio.sleep(1000000)

    event_promise = workflow.wait_for_event(MyEventListener)
    event_promise.run_async("workflow")

    while not utils.check_global_mark("first"):
        time.sleep(0.1)

    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    ray.init(num_cpus=4)
    workflow.init(storage=_storage)
    workflow.resume("workflow")
    ray.get(workflow.get_output("workflow"))
    assert utils.check_global_mark("second")
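# The recovery tests below reference a module-level `the_failed_step` task that
# is not defined in this excerpt. The following is a hedged sketch of what it is
# assumed to look like: it kills its own worker process until the global mark is
# set, which is what makes the first run crash ("internally we get
# WorkerCrashedError") and the resumed run succeed with an output of the form
# "foo(...)".
@ray.remote
def the_failed_step(x):
    if not utils.check_global_mark():
        import os

        # Simulate a worker crash; surfaces as WorkerCrashedError internally.
        os.kill(os.getpid(), 9)
    return "foo(" + x + ")"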
def test_recovery_simple(workflow_start_regular):
    @ray.remote
    def append1(x):
        return x + "[append1]"

    @ray.remote
    def append2(x):
        return x + "[append2]"

    @ray.remote
    def simple(x):
        x = append1.bind(x)
        y = the_failed_step.bind(x)
        z = append2.bind(y)
        return workflow.continuation(z)

    utils.unset_global_mark()
    workflow_id = "test_recovery_simple"
    with pytest.raises(RaySystemError):
        # internally we get WorkerCrashedError
        workflow.create(simple.bind("x")).run(workflow_id=workflow_id)
    assert workflow.get_status(workflow_id) == workflow.WorkflowStatus.RESUMABLE

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x[append1])[append2]"
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x[append1])[append2]"
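# The legacy-API variant of `test_run_or_resume_during_running` below calls a
# module-level `simple_sequential` step that is not defined in this excerpt.
# A hedged sketch of the assumed definition, using the old @workflow.step API
# and mirroring the DAG built inline in the first test above:
@workflow.step
def source1():
    return "[source1]"


@workflow.step
def append1(x):
    return x + "[append1]"


@workflow.step
def append2(x):
    return x + "[append2]"


@workflow.step
def simple_sequential():
    x = source1.step()
    y = append1.step(x)
    return append2.step(y)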
def test_run_or_resume_during_running(workflow_start_regular_shared):
    output = simple_sequential.step().run_async(workflow_id="running_workflow")
    with pytest.raises(RuntimeError):
        simple_sequential.step().run_async(workflow_id="running_workflow")
    with pytest.raises(RuntimeError):
        workflow.resume(workflow_id="running_workflow")
    assert ray.get(output) == "[source1][append1][append2]"
def test_recovery_simple_1(workflow_start_regular):
    utils.unset_global_mark()
    workflow_id = "test_recovery_simple_1"
    with pytest.raises(workflow.WorkflowExecutionError):
        # internally we get WorkerCrashedError
        workflow.run(the_failed_step.bind("x"), workflow_id=workflow_id)
    assert workflow.get_status(workflow_id) == workflow.WorkflowStatus.FAILED

    utils.set_global_mark()
    assert workflow.resume(workflow_id) == "foo(x)"
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    assert workflow.resume(workflow_id) == "foo(x)"
def test_recovery_complex(workflow_start_regular):
    utils.unset_global_mark()
    workflow_id = "test_recovery_complex"
    with pytest.raises(RaySystemError):
        # internally we get WorkerCrashedError
        complex.step("x").run(workflow_id=workflow_id)

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    r = "join(join(foo(x[append1]), [source1][append2]), join(x, [source1]))"
    assert ray.get(output) == r
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    r = "join(join(foo(x[append1]), [source1][append2]), join(x, [source1]))"
    assert ray.get(output) == r
def test_wait_recovery_step_id(workflow_start_regular_shared):
    # This test ensures that the workflow reuses the original directory and
    # step id for "workflow.wait" during recovery.

    @workflow.step
    def identity(x: int):
        # block the step by a global mark
        assert utils.check_global_mark()
        return x

    w = workflow.wait([identity.step(42)], num_returns=1, timeout=None)
    utils.unset_global_mark()
    with pytest.raises(RaySystemError):
        _ = w.run(workflow_id="test_wait_recovery_step_id")
    utils.set_global_mark()
    ready, unready = ray.get(workflow.resume("test_wait_recovery_step_id"))
    assert ready == [42]

    from ray.workflow import storage, workflow_storage

    global_storage = storage.get_global_storage()
    wf_storage = workflow_storage.WorkflowStorage(
        "test_wait_recovery_step_id", global_storage)
    index = wf_storage.gen_step_id("workflow.wait")
    # no new step id
    assert index <= 1
def test_failed_and_resumed_workflow(workflow_start_regular, tmp_path): workflow_id = "simple" error_flag = tmp_path / "error" error_flag.touch() @ray.remote def simple(): if error_flag.exists(): raise ValueError() return 0 with pytest.raises(workflow.WorkflowExecutionError): workflow.create(simple.bind()).run(workflow_id) workflow_metadata_failed = workflow.get_metadata(workflow_id) assert workflow_metadata_failed["status"] == "FAILED" error_flag.unlink() ref = workflow.resume(workflow_id) assert ray.get(ref) == 0 workflow_metadata_resumed = workflow.get_metadata(workflow_id) assert workflow_metadata_resumed["status"] == "SUCCESSFUL" # make sure resume updated running metrics assert ( workflow_metadata_resumed["stats"]["start_time"] > workflow_metadata_failed["stats"]["start_time"] ) assert ( workflow_metadata_resumed["stats"]["end_time"] > workflow_metadata_failed["stats"]["end_time"] )
def test_wait_failure_recovery_1(workflow_start_regular_shared):
    # This tests that a step using the output of "workflow.wait" as its
    # input can be recovered after failure.

    @workflow.step
    def get_all(ready, unready):
        return ready, unready

    @workflow.step
    def filter_all_2(wait_results):
        assert wait_results[0] == [1, 3, 2]
        # failure point
        assert utils.check_global_mark()
        ready, unready = wait_results
        return get_all.step(ready, unready)

    @workflow.step
    def composite_2():
        w = wait_multiple_steps.step()
        return filter_all_2.step(w)

    utils.unset_global_mark()
    with pytest.raises(RaySystemError):
        composite_2.step().run(workflow_id="wait_failure_recovery")
    utils.set_global_mark()

    ready, unready = ray.get(workflow.resume("wait_failure_recovery"))
    assert ready == [1, 3, 2]
    assert unready == [10, 12]
def test_wait_failure_recovery_2(workflow_start_regular_shared):
    # Test failing "workflow.wait" and its input steps.

    @workflow.step
    def sleep_identity(x: int):
        # block the step by a global mark
        while not utils.check_global_mark():
            time.sleep(0.1)
        time.sleep(x)
        return x

    @workflow.step
    def identity(x):
        return x

    ws = [
        sleep_identity.step(2),
        sleep_identity.step(5),
        sleep_identity.step(1),
    ]
    w = workflow.wait(ws, num_returns=2, timeout=None)
    utils.unset_global_mark()
    _ = identity.step(w).run_async(workflow_id="wait_failure_recovery_2")
    # wait until "workflow.wait" has been running
    time.sleep(10)
    workflow.cancel("wait_failure_recovery_2")
    time.sleep(2)

    utils.set_global_mark()
    ready, unready = ray.get(workflow.resume("wait_failure_recovery_2"))
    assert ready == [2, 1]
def test_failed_and_resumed_workflow(workflow_start_regular, tmp_path): workflow_id = "simple" error_flag = tmp_path / "error" error_flag.touch() @workflow.step def simple(): if error_flag.exists(): raise ValueError() return 0 with pytest.raises(ray.exceptions.RaySystemError): simple.step().run(workflow_id) workflow_metadata_failed = workflow.get_metadata(workflow_id) assert workflow_metadata_failed["status"] == "FAILED" error_flag.unlink() ref = workflow.resume(workflow_id) assert ray.get(ref) == 0 workflow_metadata_resumed = workflow.get_metadata(workflow_id) assert workflow_metadata_resumed["status"] == "SUCCESSFUL" # make sure resume updated running metrics assert workflow_metadata_resumed["stats"]["start_time"] \ > workflow_metadata_failed["stats"]["start_time"] assert workflow_metadata_resumed["stats"]["end_time"] \ > workflow_metadata_failed["stats"]["end_time"]
def test_recovery_cluster_failure(tmp_path):
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    proc = run_string_as_driver_nonblocking(
        f"""
import time
import ray
from ray import workflow


@ray.remote
def foo(x):
    print("Executing", x)
    time.sleep(1)
    if x < 20:
        return workflow.continuation(foo.bind(x + 1))
    else:
        return 20


if __name__ == "__main__":
    ray.init(storage="{tmp_path}")
    workflow.init()
    assert workflow.create(foo.bind(0)).run(workflow_id="cluster_failure") == 20
"""
    )
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)

    ray.init(storage=str(tmp_path))
    workflow.init()
    assert ray.get(workflow.resume("cluster_failure")) == 20
    ray.shutdown()
def test_get_output_3(workflow_start_regular, tmp_path): cnt_file = tmp_path / "counter" cnt_file.write_text("0") error_flag = tmp_path / "error" error_flag.touch() @ray.remote def incr(): v = int(cnt_file.read_text()) cnt_file.write_text(str(v + 1)) if error_flag.exists(): raise ValueError() return 10 with pytest.raises(workflow.WorkflowExecutionError): workflow.run(incr.options(max_retries=0).bind(), workflow_id="incr") assert cnt_file.read_text() == "1" from ray.exceptions import RaySystemError # TODO(suquark): We should prevent Ray from raising "RaySystemError", # in workflow, because "RaySystemError" does not inherit the underlying # error, so users and developers cannot catch the expected error. # I feel this issue is a very annoying. with pytest.raises((RaySystemError, ValueError)): workflow.get_output("incr") assert cnt_file.read_text() == "1" error_flag.unlink() with pytest.raises((RaySystemError, ValueError)): workflow.get_output("incr") assert workflow.resume("incr") == 10
def test_get_output_3(workflow_start_regular, tmp_path): cnt_file = tmp_path / "counter" cnt_file.write_text("0") error_flag = tmp_path / "error" error_flag.touch() @workflow.step def incr(): v = int(cnt_file.read_text()) cnt_file.write_text(str(v + 1)) if error_flag.exists(): raise ValueError() return 10 with pytest.raises(ray.exceptions.RaySystemError): incr.options(max_retries=1).step().run("incr") assert cnt_file.read_text() == "1" with pytest.raises(ray.exceptions.RaySystemError): ray.get(workflow.get_output("incr")) assert cnt_file.read_text() == "1" error_flag.unlink() with pytest.raises(ray.exceptions.RaySystemError): ray.get(workflow.get_output("incr")) assert ray.get(workflow.resume("incr")) == 10
def test_crash_during_event_checkpointing(workflow_start_regular_shared):
    """Ensure that if the cluster dies while the event is being checkpointed,
    we properly re-poll for the event."""
    from ray.internal import storage

    storage_uri = storage._storage_uri

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            utils.set_global_mark("first")

            utils.set_global_mark("time_to_die")
            while not utils.check_global_mark("resume"):
                time.sleep(0.1)

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")

    @ray.remote
    def wait_then_finish(arg):
        pass

    event_promise = workflow.wait_for_event(MyEventListener)
    workflow.create(wait_then_finish.bind(event_promise)).run_async("workflow")

    while not utils.check_global_mark("time_to_die"):
        time.sleep(0.1)

    assert utils.check_global_mark("first")
    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    # Give the workflow some time to kill the cluster.
    # time.sleep(3)

    ray.init(num_cpus=4, storage=storage_uri)
    workflow.init()
    workflow.resume("workflow")
    utils.set_global_mark("resume")
    ray.get(workflow.get_output("workflow"))
    assert utils.check_global_mark("second")
def test_recovery_simple(workflow_start_regular):
    utils.unset_global_mark()
    workflow_id = "test_recovery_simple"
    with pytest.raises(RaySystemError):
        # internally we get WorkerCrashedError
        simple.step("x").run(workflow_id=workflow_id)
    assert workflow.get_status(workflow_id) == workflow.WorkflowStatus.RESUMABLE

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x[append1])[append2]"
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x[append1])[append2]"
def test_resume_different_storage(ray_start_regular, tmp_path, reset_workflow):
    @ray.remote
    def constant():
        return 31416

    workflow.init(storage=str(tmp_path))
    workflow.create(constant.bind()).run(workflow_id="const")
    assert ray.get(workflow.resume(workflow_id="const")) == 31416
    workflow.storage.set_global_storage(None)
def test_resume_different_storage(shutdown_only, tmp_path):
    @ray.remote
    def constant():
        return 31416

    ray.init(storage=str(tmp_path))
    workflow.init()
    workflow.create(constant.bind()).run(workflow_id="const")
    assert ray.get(workflow.resume(workflow_id="const")) == 31416
def test_recovery_complex(workflow_start_regular):
    @ray.remote
    def source1():
        return "[source1]"

    @ray.remote
    def append1(x):
        return x + "[append1]"

    @ray.remote
    def append2(x):
        return x + "[append2]"

    @ray.remote
    def join(x, y):
        return f"join({x}, {y})"

    @ray.remote
    def complex(x1):
        x2 = source1.bind()
        v = join.bind(x1, x2)
        y = append1.bind(x1)
        y = the_failed_step.bind(y)
        z = append2.bind(x2)
        u = join.bind(y, z)
        return workflow.continuation(join.bind(u, v))

    utils.unset_global_mark()
    workflow_id = "test_recovery_complex"
    with pytest.raises(workflow.WorkflowExecutionError):
        # internally we get WorkerCrashedError
        workflow.create(complex.bind("x")).run(workflow_id=workflow_id)
    assert workflow.get_status(workflow_id) == workflow.WorkflowStatus.FAILED

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    r = "join(join(foo(x[append1]), [source1][append2]), join(x, [source1]))"
    assert ray.get(output) == r
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    r = "join(join(foo(x[append1]), [source1][append2]), join(x, [source1]))"
    assert ray.get(output) == r
def resume(num_records_replayed):
    key = debug_store.wrapped_storage.make_key("complex_workflow")
    asyncio_run(debug_store.wrapped_storage.delete_prefix(key))

    async def replay():
        # We need to replay the records one by one to avoid conflicts
        for i in range(num_records_replayed):
            await debug_store.replay(i)

    asyncio_run(replay())
    return ray.get(workflow.resume(workflow_id="complex_workflow"))
def test_recovery_simple_2(workflow_start_regular):
    @ray.remote
    def simple(x):
        return workflow.continuation(the_failed_step.bind(x))

    utils.unset_global_mark()
    workflow_id = "test_recovery_simple_2"
    with pytest.raises(workflow.WorkflowExecutionError):
        # internally we get WorkerCrashedError
        workflow.create(simple.bind("x")).run(workflow_id=workflow_id)
    assert workflow.get_status(workflow_id) == workflow.WorkflowStatus.FAILED

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x)"
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x)"
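# The checkpoint tests below use a module-level `checkpoint_dag` DAG, a `SIZE`
# constant, and an `_assert_step_checkpoints` helper that are not part of this
# excerpt. The following is a hedged sketch of the assumed DAG only (numpy is
# assumed to be imported as `np`, as elsewhere in this file). SIZE = 2 ** 24 is
# inferred from the expected mean of 8388607.5, and the boolean argument toggles
# checkpointing of the intermediate steps via `workflow.options(checkpoint=...)`.
SIZE = 2 ** 24


@ray.remote
def large_input():
    return np.arange(SIZE)


@ray.remote
def identity(x):
    return x


@ray.remote
def average(x):
    return np.mean(x)


@ray.remote
def checkpoint_dag(checkpoint):
    # Chain large_input -> identity -> average; only the intermediate steps
    # have their checkpointing toggled by the flag.
    x = large_input.options(**workflow.options(checkpoint=checkpoint)).bind()
    y = identity.options(**workflow.options(checkpoint=checkpoint)).bind(x)
    return workflow.continuation(average.bind(y))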
def test_checkpoint_dag_full(workflow_start_regular_shared):
    outputs = workflow.create(
        checkpoint_dag.options(**workflow.options(name="checkpoint_dag")).bind(True)
    ).run(workflow_id="checkpoint_whole")
    assert np.isclose(outputs, 8388607.5)
    recovered = ray.get(workflow.resume("checkpoint_whole"))
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_whole")
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "identity", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
def test_recovery_cluster_failure(reset_workflow, tmp_path):
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    proc = run_string_as_driver_nonblocking(
        driver_script.format(tmp_path=str(tmp_path)))
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)

    workflow.init(str(tmp_path))
    assert ray.get(workflow.resume("cluster_failure")) == 20
    workflow.storage.set_global_storage(None)
    ray.shutdown()
def test_checkpoint_dag_skip_partial(workflow_start_regular_shared):
    outputs = workflow.run(
        checkpoint_dag.options(**workflow.options(name="checkpoint_dag")).bind(False),
        workflow_id="checkpoint_partial",
    )
    assert np.isclose(outputs, 8388607.5)
    recovered = workflow.resume("checkpoint_partial")
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_partial")
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "identity", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
def test_checkpoint_dag_full(workflow_start_regular):
    global_storage = storage.get_global_storage()

    outputs = utils.run_workflow_dag_with_options(
        checkpoint_dag, (True,), workflow_id="checkpoint_whole", name="checkpoint_dag"
    )
    assert np.isclose(outputs, 8388607.5)
    recovered = ray.get(workflow.resume("checkpoint_whole"))
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_whole", global_storage)
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "identity", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
def test_checkpoint_dag_skip_partial(workflow_start_regular):
    global_storage = storage.get_global_storage()

    outputs = (
        checkpoint_dag.options(name="checkpoint_dag")
        .step(False)
        .run(workflow_id="checkpoint_partial")
    )
    assert np.isclose(outputs, 8388607.5)
    recovered = ray.get(workflow.resume("checkpoint_partial"))
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_partial", global_storage)
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "identity", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
def test_checkpoint_dag_recovery_partial(workflow_start_regular_shared):
    utils.unset_global_mark()

    start = time.time()
    with pytest.raises(RaySystemError):
        workflow.create(checkpoint_dag.bind(False)).run(
            workflow_id="checkpoint_partial_recovery"
        )
    run_duration_partial = time.time() - start

    utils.set_global_mark()
    start = time.time()
    recovered = ray.get(workflow.resume("checkpoint_partial_recovery"))
    recover_duration_partial = time.time() - start
    assert np.isclose(recovered, np.arange(SIZE).mean())
    print(f"[partial] run_duration = {run_duration_partial}, "
          f"recover_duration = {recover_duration_partial}")
def test_checkpoint_dag_recovery_whole(workflow_start_regular_shared):
    utils.unset_global_mark()

    start = time.time()
    with pytest.raises(workflow.WorkflowExecutionError):
        workflow.run(checkpoint_dag.bind(True),
                     workflow_id="checkpoint_whole_recovery")
    run_duration_whole = time.time() - start

    utils.set_global_mark()
    start = time.time()
    recovered = workflow.resume("checkpoint_whole_recovery")
    recover_duration_whole = time.time() - start
    assert np.isclose(recovered, np.arange(SIZE).mean())
    print(f"[whole] run_duration = {run_duration_whole}, "
          f"recover_duration = {recover_duration_whole}")
def test_checkpoint_dag_recovery_skip(workflow_start_regular_shared):
    utils.unset_global_mark()

    start = time.time()
    with pytest.raises(workflow.WorkflowExecutionError):
        workflow.create(
            checkpoint_dag.options(**workflow.options(checkpoint=False)).bind(False)
        ).run(workflow_id="checkpoint_skip_recovery")
    run_duration_skipped = time.time() - start

    utils.set_global_mark()
    start = time.time()
    recovered = ray.get(workflow.resume("checkpoint_skip_recovery"))
    recover_duration_skipped = time.time() - start
    assert np.isclose(recovered, np.arange(SIZE).mean())
    print(f"[skipped] run_duration = {run_duration_skipped}, "
          f"recover_duration = {recover_duration_skipped}")
def test_workflow_manager(workflow_start_regular, tmp_path):
    # For sync between jobs
    tmp_file = str(tmp_path / "lock")
    lock = FileLock(tmp_file)
    lock.acquire()

    # For sync between jobs
    flag_file = tmp_path / "flag"
    flag_file.touch()

    @ray.remote
    def long_running(i):
        lock = FileLock(tmp_file)
        with lock.acquire():
            pass

        if i % 2 == 0:
            if flag_file.exists():
                raise ValueError()
        return 100

    outputs = [
        workflow.create(long_running.bind(i)).run_async(workflow_id=str(i))
        for i in range(100)
    ]

    # Test list_all; it should list all running jobs.
    all_tasks = workflow.list_all()
    assert len(all_tasks) == 100
    all_tasks_running = workflow.list_all(workflow.RUNNING)
    assert dict(all_tasks) == dict(all_tasks_running)
    assert workflow.get_status("0") == "RUNNING"

    # Release the lock and make sure all tasks finish.
    lock.release()
    for o in outputs:
        try:
            r = ray.get(o)
        except Exception:
            continue
        assert 100 == r

    all_tasks_running = workflow.list_all(workflow.WorkflowStatus.RUNNING)
    assert len(all_tasks_running) == 0

    # Half of them failed and half succeeded.
    failed_jobs = workflow.list_all("FAILED")
    assert len(failed_jobs) == 50
    finished_jobs = workflow.list_all("SUCCESSFUL")
    assert len(finished_jobs) == 50

    all_tasks_status = workflow.list_all(
        {
            workflow.WorkflowStatus.SUCCESSFUL,
            workflow.WorkflowStatus.FAILED,
            workflow.WorkflowStatus.RUNNING,
        }
    )
    assert len(all_tasks_status) == 100
    assert failed_jobs == [
        (k, v) for (k, v) in all_tasks_status if v == workflow.WorkflowStatus.FAILED
    ]
    assert finished_jobs == [
        (k, v)
        for (k, v) in all_tasks_status
        if v == workflow.WorkflowStatus.SUCCESSFUL
    ]

    # Test get_status
    assert workflow.get_status("0") == "FAILED"
    assert workflow.get_status("1") == "SUCCESSFUL"
    lock.acquire()
    r = workflow.resume("0")
    assert workflow.get_status("0") == workflow.RUNNING
    flag_file.unlink()
    lock.release()
    assert 100 == ray.get(r)
    assert workflow.get_status("0") == workflow.SUCCESSFUL

    # Test cancel
    lock.acquire()
    workflow.resume("2")
    assert workflow.get_status("2") == workflow.RUNNING
    workflow.cancel("2")
    assert workflow.get_status("2") == workflow.CANCELED

    # Now resume_all
    resumed = workflow.resume_all(include_failed=True)
    assert len(resumed) == 48
    lock.release()
    assert [ray.get(o) for (_, o) in resumed] == [100] * 48