# Imports assumed by the tests in this section (the originals likely live at
# the top of the test module).
import asyncio

import filelock
from filelock import FileLock
import pytest

import ray
from ray import workflow
from ray.exceptions import GetTimeoutError, RaySystemError
from ray.workflow.common import WorkflowStatus
from ray.workflow.tests import utils


def test_cancellation(tmp_path, workflow_start_regular):
    lock_a = tmp_path / "lock_a"
    lock_b = tmp_path / "lock_b"

    # The task grabs lock_a first, then blocks on lock_b, which the driver
    # holds for the duration of the test.
    @ray.remote
    def simple():
        with filelock.FileLock(lock_a):
            with filelock.FileLock(lock_b):
                pass

    workflow_id = "test_cancellation"
    with filelock.FileLock(lock_b):
        r = workflow.run_async(simple.bind(), workflow_id=workflow_id)
        try:
            ray.get(r, timeout=5)
        except GetTimeoutError:
            pass
        else:
            assert False

        assert workflow.get_status(workflow_id) == WorkflowStatus.RUNNING
        workflow.cancel(workflow_id)
        with pytest.raises(workflow.WorkflowCancellationError):
            ray.get(r)
        # lock_a is only released if the task was actually killed, so
        # acquiring it proves the cancellation took effect.
        lock = filelock.FileLock(lock_a)
        lock.acquire(timeout=5)
    assert workflow.get_status(workflow_id) == WorkflowStatus.CANCELED
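# The `workflow_start_regular` / `workflow_start_regular_shared` fixtures are
# not defined in this section; they presumably come from the suite's
# conftest.py. A minimal sketch of what such a fixture might look like
# (hypothetical, for orientation only):
@pytest.fixture
def workflow_start_regular(tmp_path):
    ray.init(storage=str(tmp_path / "workflow"))
    workflow.init()
    yield
    ray.shutdown()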
def test_workflow_queuing_3(shutdown_only, tmp_path):
    """This test ensures the queued workflow is indeed pending."""
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=1, max_pending_workflows=1)

    import time

    lock_path = str(tmp_path / ".lock")

    @ray.remote
    def long_running(x):
        (tmp_path / str(x)).write_text(str(x))
        with filelock.FileLock(lock_path):
            return x

    workflow_id = "test_workflow_queuing_3"
    with filelock.FileLock(lock_path):
        wf_1 = workflow.run_async(long_running.bind(1), workflow_id=f"{workflow_id}_1")
        wf_2 = workflow.run_async(long_running.bind(2), workflow_id=f"{workflow_id}_2")
        time.sleep(5)
        # Only the first workflow has started executing; the second is queued.
        assert (tmp_path / str(1)).exists()
        assert not (tmp_path / str(2)).exists()
        assert workflow.get_status(workflow_id=f"{workflow_id}_1") == workflow.RUNNING
        assert workflow.get_status(workflow_id=f"{workflow_id}_2") == workflow.PENDING
        with pytest.raises(GetTimeoutError):
            ray.get(wf_2, timeout=5)
    assert ray.get([wf_1, wf_2]) == [1, 2]
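# The recovery tests below reference a module-level `the_failed_step` task and
# the global-mark helpers in `ray.workflow.tests.utils`, neither of which is
# defined in this section. A minimal sketch of what such a task could look
# like, assuming `utils.check_global_mark()` flips to True once the driver
# calls `utils.set_global_mark()` (hypothetical; the real definition lives
# elsewhere in the suite):
@ray.remote
def the_failed_step(x):
    if not utils.check_global_mark():
        import os

        # Kill the worker process so the first run fails with
        # WorkerCrashedError; a resume after set_global_mark() succeeds.
        os.kill(os.getpid(), 9)
    return "foo(" + x + ")"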
def test_recovery_simple(workflow_start_regular):
    @ray.remote
    def append1(x):
        return x + "[append1]"

    @ray.remote
    def append2(x):
        return x + "[append2]"

    @ray.remote
    def simple(x):
        x = append1.bind(x)
        y = the_failed_step.bind(x)
        z = append2.bind(y)
        return workflow.continuation(z)

    utils.unset_global_mark()
    workflow_id = "test_recovery_simple"
    with pytest.raises(RaySystemError):
        # internally we get WorkerCrashedError
        workflow.create(simple.bind("x")).run(workflow_id=workflow_id)
    assert workflow.get_status(workflow_id) == workflow.WorkflowStatus.RESUMABLE

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x[append1])[append2]"
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x[append1])[append2]"
def test_event_as_workflow(workflow_start_regular_shared):
    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            while not utils.check_global_mark():
                await asyncio.sleep(1)

    utils.unset_global_mark()
    promise = workflow.wait_for_event(MyEventListener).run_async("wf")
    assert workflow.get_status("wf") == workflow.WorkflowStatus.RUNNING

    utils.set_global_mark()
    assert ray.get(promise) is None
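# The workflow's output is whatever `poll_for_event()` returns; the listener
# above falls off the end of the coroutine, hence the `is None` assertion.
# A variant that resolves with a payload instead (sketch, same global-mark
# assumptions as above):
class PayloadEventListener(workflow.EventListener):
    async def poll_for_event(self):
        while not utils.check_global_mark():
            await asyncio.sleep(1)
        return "payload"  # workflow.wait_for_event(...) would yield "payload"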
def test_recovery_simple_1(workflow_start_regular):
    utils.unset_global_mark()
    workflow_id = "test_recovery_simple_1"
    with pytest.raises(workflow.WorkflowExecutionError):
        # internally we get WorkerCrashedError
        workflow.run(the_failed_step.bind("x"), workflow_id=workflow_id)
    assert workflow.get_status(workflow_id) == workflow.WorkflowStatus.FAILED

    utils.set_global_mark()
    assert workflow.resume(workflow_id) == "foo(x)"
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    assert workflow.resume(workflow_id) == "foo(x)"
def test_recovery_complex(workflow_start_regular):
    @ray.remote
    def source1():
        return "[source1]"

    @ray.remote
    def append1(x):
        return x + "[append1]"

    @ray.remote
    def append2(x):
        return x + "[append2]"

    @ray.remote
    def join(x, y):
        return f"join({x}, {y})"

    @ray.remote
    def complex(x1):
        x2 = source1.bind()
        v = join.bind(x1, x2)
        y = append1.bind(x1)
        y = the_failed_step.bind(y)
        z = append2.bind(x2)
        u = join.bind(y, z)
        return workflow.continuation(join.bind(u, v))

    utils.unset_global_mark()
    workflow_id = "test_recovery_complex"
    with pytest.raises(workflow.WorkflowExecutionError):
        # internally we get WorkerCrashedError
        workflow.create(complex.bind("x")).run(workflow_id=workflow_id)
    assert workflow.get_status(workflow_id) == workflow.WorkflowStatus.FAILED

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    # Tracing the DAG with x1 = "x":
    #   y = foo(x[append1]),  z = [source1][append2]
    #   u = join(y, z),       v = join(x, [source1])
    #   result = join(u, v)
    r = "join(join(foo(x[append1]), [source1][append2]), join(x, [source1]))"
    assert ray.get(output) == r
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    assert ray.get(output) == r
def test_run_off_main_thread(workflow_start_regular_shared):
    @ray.remote
    def fake_data(num: int):
        return list(range(num))

    succ = False

    # Start new thread here ⚠️ An assertion failure inside the thread would
    # not fail the test, so success is recorded in `succ` and checked from
    # the main thread.
    def run():
        nonlocal succ
        # Setup the workflow.
        assert workflow.run(fake_data.bind(10), workflow_id="run") == list(range(10))
        succ = True

    import threading

    t = threading.Thread(target=run)
    t.start()
    t.join()
    assert succ
    assert workflow.get_status("run") == workflow.SUCCESSFUL
def test_recovery_simple_2(workflow_start_regular):
    @ray.remote
    def simple(x):
        return workflow.continuation(the_failed_step.bind(x))

    utils.unset_global_mark()
    workflow_id = "test_recovery_simple_2"
    with pytest.raises(workflow.WorkflowExecutionError):
        # internally we get WorkerCrashedError
        workflow.create(simple.bind("x")).run(workflow_id=workflow_id)
    assert workflow.get_status(workflow_id) == workflow.WorkflowStatus.FAILED

    utils.set_global_mark()
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x)"
    utils.unset_global_mark()
    # resume from workflow output checkpoint
    output = workflow.resume(workflow_id)
    assert ray.get(output) == "foo(x)"
def test_workflow_manager(workflow_start_regular, tmp_path):
    # For sync between jobs.
    tmp_file = str(tmp_path / "lock")
    lock = FileLock(tmp_file)
    lock.acquire()

    # For sync between jobs.
    flag_file = tmp_path / "flag"
    flag_file.touch()

    @ray.remote
    def long_running(i):
        lock = FileLock(tmp_file)
        with lock.acquire():
            pass
        if i % 2 == 0:
            if flag_file.exists():
                raise ValueError()
        return 100

    outputs = [
        workflow.create(long_running.bind(i)).run_async(workflow_id=str(i))
        for i in range(100)
    ]
    # Test list_all; it should list all running jobs.
    all_tasks = workflow.list_all()
    assert len(all_tasks) == 100
    all_tasks_running = workflow.list_all(workflow.RUNNING)
    assert dict(all_tasks) == dict(all_tasks_running)
    assert workflow.get_status("0") == "RUNNING"

    # Release the lock and make sure all tasks finish.
    lock.release()
    for o in outputs:
        try:
            r = ray.get(o)
        except Exception:
            continue
        assert 100 == r
    all_tasks_running = workflow.list_all(workflow.WorkflowStatus.RUNNING)
    assert len(all_tasks_running) == 0

    # Half of them failed and half succeeded.
    failed_jobs = workflow.list_all("FAILED")
    assert len(failed_jobs) == 50
    finished_jobs = workflow.list_all("SUCCESSFUL")
    assert len(finished_jobs) == 50

    all_tasks_status = workflow.list_all(
        {
            workflow.WorkflowStatus.SUCCESSFUL,
            workflow.WorkflowStatus.FAILED,
            workflow.WorkflowStatus.RUNNING,
        }
    )
    assert len(all_tasks_status) == 100
    assert failed_jobs == [
        (k, v) for (k, v) in all_tasks_status if v == workflow.WorkflowStatus.FAILED
    ]
    assert finished_jobs == [
        (k, v) for (k, v) in all_tasks_status if v == workflow.WorkflowStatus.SUCCESSFUL
    ]

    # Test get_status.
    assert workflow.get_status("0") == "FAILED"
    assert workflow.get_status("1") == "SUCCESSFUL"
    lock.acquire()
    r = workflow.resume("0")
    assert workflow.get_status("0") == workflow.RUNNING
    flag_file.unlink()
    lock.release()
    assert 100 == ray.get(r)
    assert workflow.get_status("0") == workflow.SUCCESSFUL

    # Test cancel.
    lock.acquire()
    workflow.resume("2")
    assert workflow.get_status("2") == workflow.RUNNING
    workflow.cancel("2")
    assert workflow.get_status("2") == workflow.CANCELED

    # Now resume_all: "0" already succeeded on resume and "2" was canceled,
    # leaving 48 of the original 50 failed workflows to resume.
    resumed = workflow.resume_all(include_failed=True)
    assert len(resumed) == 48
    lock.release()
    assert [ray.get(o) for (_, o) in resumed] == [100] * 48
def test_workflow_manager_simple(workflow_start_regular):
    from ray.workflow.exceptions import WorkflowNotFoundError

    assert [] == workflow.list_all()
    with pytest.raises(WorkflowNotFoundError):
        workflow.get_status("X")