def test_dedupe_cluster_failure(tmp_path):
    ray.shutdown()
    """
    ======== driver 1 ===========
    1. Checkpoint the input args
        * Uploads
    2. Begin to run step
        * Crash

    ====== driver 2 ============
    1. Recover inputs
        * Creates a new object ref
    2. Finish running step
    3. Checkpoint step output
        * Should not trigger upload
    """
    lock_file = tmp_path / "lock"
    workflow_dir = tmp_path / "workflow"

    driver_script = f"""
import time
import ray
from ray import workflow
from filelock import FileLock

@ray.remote
def foo(objrefs):
    with FileLock("{str(lock_file)}"):
        return objrefs

if __name__ == "__main__":
    ray.init(storage="{str(workflow_dir)}")
    workflow.init()
    arg = ray.put("hello world")
    workflow.create(foo.bind([arg, arg])).run()
    assert False
"""
    lock = FileLock(lock_file)
    lock.acquire()
    run_string_as_driver_nonblocking(driver_script)
    time.sleep(10)
    subprocess.check_call(["ray", "stop", "--force"])
    lock.release()

    ray.init(storage=str(workflow_dir))
    workflow.init()
    resumed = workflow.resume_all()
    assert len(resumed) == 1
    objref = resumed.pop()[1]
    ray.get(objref)

    # The object ref will be different before and after recovery, so it will
    # get uploaded twice.
    assert get_num_uploads() == 1
    ray.shutdown()
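
# `get_num_uploads()` above is a helper defined elsewhere in this test suite. For
# context, a rough, hypothetical sketch of how such an upload counter could be
# exposed is shown below; the actor name, method, and stats key are illustrative
# assumptions, not the helper's actual implementation.
def _example_get_num_uploads():
    # Hypothetical: query a named stats actor for the number of checkpoint uploads
    # performed by the storage layer since the cluster started.
    stats_actor = ray.get_actor("storage_stats")  # assumed actor name
    stats = ray.get(stats_actor.export_stats.remote())  # assumed stats API
    return stats.get("num_uploads", 0)
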
def test_no_init(shutdown_only):
    @ray.remote
    def f():
        pass

    fail_wf_init_error_msg = re.escape(
        "`workflow.init()` must be called prior to using the workflows API."
    )

    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.create(f.bind()).run()
    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.list_all()
    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.resume_all()
    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.cancel("wf")
    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.get_actor("wf")
def test_recovery_cluster_failure_resume_all(tmp_path, shutdown_only):
    ray.shutdown()
    subprocess.check_call(["ray", "start", "--head"])
    time.sleep(1)
    workflow_dir = tmp_path / "workflow"
    lock_file = tmp_path / "lock_file"
    lock = FileLock(lock_file)
    lock.acquire()

    proc = run_string_as_driver_nonblocking(
        f"""
import time
import ray
from ray import workflow
from filelock import FileLock

@ray.remote
def foo(x):
    with FileLock("{str(lock_file)}"):
        return 20

if __name__ == "__main__":
    ray.init(storage="{str(workflow_dir)}")
    workflow.init()
    assert workflow.create(foo.bind(0)).run(workflow_id="cluster_failure") == 20
"""
    )
    time.sleep(10)
    subprocess.check_call(["ray", "stop"])
    proc.kill()
    time.sleep(1)
    lock.release()

    ray.init(storage=str(workflow_dir))
    workflow.init()
    resumed = workflow.resume_all()
    assert len(resumed) == 1
    (wid, obj_ref) = resumed[0]
    assert wid == "cluster_failure"
    assert ray.get(obj_ref) == 20
def test_workflow_manager(workflow_start_regular, tmp_path):
    # Lock used to sync between jobs.
    tmp_file = str(tmp_path / "lock")
    lock = FileLock(tmp_file)
    lock.acquire()

    # Flag that makes even-numbered steps fail on their first run.
    flag_file = tmp_path / "flag"
    flag_file.touch()

    @ray.remote
    def long_running(i):
        lock = FileLock(tmp_file)
        with lock.acquire():
            pass

        if i % 2 == 0:
            if flag_file.exists():
                raise ValueError()
        return 100

    outputs = [
        workflow.create(long_running.bind(i)).run_async(workflow_id=str(i))
        for i in range(100)
    ]

    # Test list_all; it should list all running jobs.
    all_tasks = workflow.list_all()
    assert len(all_tasks) == 100
    all_tasks_running = workflow.list_all(workflow.RUNNING)
    assert dict(all_tasks) == dict(all_tasks_running)
    assert workflow.get_status("0") == "RUNNING"

    # Release the lock and make sure all tasks finish.
    lock.release()
    for o in outputs:
        try:
            r = ray.get(o)
        except Exception:
            continue
        assert 100 == r

    all_tasks_running = workflow.list_all(workflow.WorkflowStatus.RUNNING)
    assert len(all_tasks_running) == 0

    # Half of them failed and half succeeded.
    failed_jobs = workflow.list_all("FAILED")
    assert len(failed_jobs) == 50
    finished_jobs = workflow.list_all("SUCCESSFUL")
    assert len(finished_jobs) == 50

    all_tasks_status = workflow.list_all(
        {
            workflow.WorkflowStatus.SUCCESSFUL,
            workflow.WorkflowStatus.FAILED,
            workflow.WorkflowStatus.RUNNING,
        }
    )
    assert len(all_tasks_status) == 100
    assert failed_jobs == [
        (k, v) for (k, v) in all_tasks_status if v == workflow.WorkflowStatus.FAILED
    ]
    assert finished_jobs == [
        (k, v) for (k, v) in all_tasks_status if v == workflow.WorkflowStatus.SUCCESSFUL
    ]

    # Test get_status.
    assert workflow.get_status("0") == "FAILED"
    assert workflow.get_status("1") == "SUCCESSFUL"
    lock.acquire()
    r = workflow.resume("0")
    assert workflow.get_status("0") == workflow.RUNNING
    flag_file.unlink()
    lock.release()
    assert 100 == ray.get(r)
    assert workflow.get_status("0") == workflow.SUCCESSFUL

    # Test cancel.
    lock.acquire()
    workflow.resume("2")
    assert workflow.get_status("2") == workflow.RUNNING
    workflow.cancel("2")
    assert workflow.get_status("2") == workflow.CANCELED

    # Now resume_all.
    resumed = workflow.resume_all(include_failed=True)
    assert len(resumed) == 48
    lock.release()
    assert [ray.get(o) for (_, o) in resumed] == [100] * 48
def test_workflow_queuing_resume_all(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    import queue
    import filelock

    lock_path = str(tmp_path / ".lock")

    @ray.remote
    def long_running(x):
        with filelock.FileLock(lock_path):
            return x

    wfs = [long_running.bind(i) for i in range(5)]

    with filelock.FileLock(lock_path):
        _refs = [  # noqa: F841
            workflow.run_async(wfs[i], workflow_id=f"workflow_{i}") for i in range(4)
        ]

        assert sorted(x[0] for x in workflow.list_all({workflow.RUNNING})) == [
            "workflow_0",
            "workflow_1",
        ]
        assert sorted(x[0] for x in workflow.list_all({workflow.PENDING})) == [
            "workflow_2",
            "workflow_3",
        ]

        with pytest.raises(queue.Full, match="Workflow queue has been full"):
            workflow.run(wfs[4], workflow_id="workflow_4")

        # Kill all workflows by shutting down the cluster while the lock is held.
        ray.shutdown()

    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    with filelock.FileLock(lock_path):
        workflow_ids, outputs = zip(*sorted(workflow.resume_all()))
        # We should see the same running and pending workflows as before, because
        # resume_all() gives previously running workflows higher priority.
        assert sorted(x[0] for x in workflow.list_all({workflow.RUNNING})) == [
            "workflow_0",
            "workflow_1",
        ]
        assert sorted(x[0] for x in workflow.list_all({workflow.PENDING})) == [
            "workflow_2",
            "workflow_3",
        ]

    assert workflow_ids == (
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
    )
    assert ray.get(list(outputs)) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]