# Imports assumed by the tests in this file; the exception and test-helper
# import paths have moved between Ray versions, so treat them as best guesses.
import re
import subprocess
import time

import pytest
from filelock import FileLock

import ray
from ray import workflow

# `WorkflowNotFoundError` has lived under both `ray.workflow.common` and
# `ray.workflow.exceptions` depending on the Ray version; adjust as needed.
from ray.workflow.exceptions import WorkflowNotFoundError

# Test helpers providing set_global_mark()/check_global_mark(); in the Ray
# repo these live under `ray.workflow.tests`.
from ray.workflow.tests import utils


def test_actor_manager(workflow_start_regular, tmp_path):
    lock_file = tmp_path / "lock"

    @workflow.virtual_actor
    class LockCounter:
        def __init__(self, lck):
            self.counter = 0
            self.lck = lck

        @workflow.virtual_actor.readonly
        def val(self):
            with FileLock(self.lck):
                return self.counter

        def incr(self):
            with FileLock(self.lck):
                self.counter += 1
                return self.counter

        def __getstate__(self):
            return (self.lck, self.counter)

        def __setstate__(self, state):
            self.lck, self.counter = state

    actor = LockCounter.get_or_create("counter", str(lock_file))
    ray.get(actor.ready())

    lock = FileLock(lock_file)
    lock.acquire()

    assert [("counter", workflow.SUCCESSFUL)] == workflow.list_all()

    v = actor.val.run_async()
    # A readonly method must not move the workflow into the RUNNING state.
    assert [("counter", workflow.SUCCESSFUL)] == workflow.list_all()
    lock.release()
    assert ray.get(v) == 0

    # A writer method does move the workflow into the RUNNING state.
    lock.acquire()
    v = actor.incr.run_async()
    time.sleep(2)
    assert [("counter", workflow.RUNNING)] == workflow.list_all()
    lock.release()
    assert ray.get(v) == 1

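# Several tests in this file use the same gating trick seen above: the driver
# acquires a `filelock.FileLock` before kicking off remote work, every task
# blocks on the same lock file, and the test asserts on in-flight state
# (e.g. RUNNING vs. SUCCESSFUL) before releasing the lock. A minimal sketch of
# the pattern, independent of Ray (only the `filelock` package and the
# placeholder path below are assumed):
#
#     from filelock import FileLock
#
#     gate = FileLock("/tmp/gate.lock")
#     gate.acquire()      # workers now block at `with FileLock(...)`
#     ...                 # assert on intermediate state here
#     gate.release()      # workers proceed to completion
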
def test_workflow_queuing_1(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    import queue

    import filelock

    lock_path = str(tmp_path / ".lock")

    @ray.remote
    def long_running(x):
        with filelock.FileLock(lock_path):
            return x

    wfs = [long_running.bind(i) for i in range(5)]

    with filelock.FileLock(lock_path):
        refs = [
            workflow.run_async(wfs[i], workflow_id=f"workflow_{i}") for i in range(4)
        ]

        assert sorted(x[0] for x in workflow.list_all({workflow.RUNNING})) == [
            "workflow_0",
            "workflow_1",
        ]
        assert sorted(x[0] for x in workflow.list_all({workflow.PENDING})) == [
            "workflow_2",
            "workflow_3",
        ]

        with pytest.raises(queue.Full, match="Workflow queue has been full"):
            workflow.run(wfs[4], workflow_id="workflow_4")

    assert ray.get(refs) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]

    for i in range(5):
        assert workflow.get_output(f"workflow_{i}") == i

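# Queue accounting in test_workflow_queuing_1: with max_running_workflows=2
# and max_pending_workflows=2, four submissions saturate the scheduler
# (2 RUNNING + 2 PENDING), so the fifth synchronous `workflow.run()` raises
# `queue.Full` while the lock is still held. Once the lock is released and
# the queue drains, the same submission succeeds.
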
def test_no_init(shutdown_only):
    @ray.remote
    def f():
        pass

    fail_wf_init_error_msg = re.escape(
        "`workflow.init()` must be called prior to using the workflows API."
    )

    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.create(f.bind()).run()
    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.list_all()
    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.resume_all()
    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.cancel("wf")
    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.get_actor("wf")

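# For contrast with the failure cases above, the initialization order the
# other tests in this file rely on (the storage path is a placeholder):
#
#     import ray
#     from ray import workflow
#
#     ray.init(storage="/tmp/workflow_storage")
#     workflow.init()
#     # ... workflow APIs are now safe to call ...
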
def test_workflow_queuing_2(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    @ray.remote
    def short_running(x):
        return x

    wfs = [short_running.bind(i) for i in range(5)]
    refs = [
        workflow.run_async(wfs[i], workflow_id=f"workflow_{i}") for i in range(4)
    ]
    for i in range(4):
        assert workflow.get_output(f"workflow_{i}") == i
    assert ray.get(refs) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]

def test_workflow_manager(workflow_start_regular, tmp_path):
    # A file lock for synchronization between jobs.
    tmp_file = str(tmp_path / "lock")
    lock = FileLock(tmp_file)
    lock.acquire()

    # A flag file for synchronization between jobs.
    flag_file = tmp_path / "flag"
    flag_file.touch()

    @ray.remote
    def long_running(i):
        lock = FileLock(tmp_file)
        with lock.acquire():
            pass

        if i % 2 == 0:
            if flag_file.exists():
                raise ValueError()
        return 100

    outputs = [
        workflow.create(long_running.bind(i)).run_async(workflow_id=str(i))
        for i in range(100)
    ]

    # list_all should report all 100 workflows, all of them running.
    all_tasks = workflow.list_all()
    assert len(all_tasks) == 100
    all_tasks_running = workflow.list_all(workflow.RUNNING)
    assert dict(all_tasks) == dict(all_tasks_running)
    assert workflow.get_status("0") == "RUNNING"

    # Release the lock and wait for all tasks to finish.
    lock.release()
    for o in outputs:
        try:
            r = ray.get(o)
        except Exception:
            continue
        assert 100 == r

    all_tasks_running = workflow.list_all(workflow.WorkflowStatus.RUNNING)
    assert len(all_tasks_running) == 0

    # Half of the workflows failed and half succeeded.
    failed_jobs = workflow.list_all("FAILED")
    assert len(failed_jobs) == 50
    finished_jobs = workflow.list_all("SUCCESSFUL")
    assert len(finished_jobs) == 50

    all_tasks_status = workflow.list_all(
        {
            workflow.WorkflowStatus.SUCCESSFUL,
            workflow.WorkflowStatus.FAILED,
            workflow.WorkflowStatus.RUNNING,
        }
    )
    assert len(all_tasks_status) == 100
    assert failed_jobs == [
        (k, v) for (k, v) in all_tasks_status if v == workflow.WorkflowStatus.FAILED
    ]
    assert finished_jobs == [
        (k, v)
        for (k, v) in all_tasks_status
        if v == workflow.WorkflowStatus.SUCCESSFUL
    ]

    # Test get_status.
    assert workflow.get_status("0") == "FAILED"
    assert workflow.get_status("1") == "SUCCESSFUL"
    lock.acquire()
    r = workflow.resume("0")
    assert workflow.get_status("0") == workflow.RUNNING
    flag_file.unlink()
    lock.release()
    assert 100 == ray.get(r)
    assert workflow.get_status("0") == workflow.SUCCESSFUL

    # Test cancel.
    lock.acquire()
    workflow.resume("2")
    assert workflow.get_status("2") == workflow.RUNNING
    workflow.cancel("2")
    assert workflow.get_status("2") == workflow.CANCELED

    # Now resume_all.
    resumed = workflow.resume_all(include_failed=True)
    assert len(resumed) == 48
    lock.release()
    assert [ray.get(o) for (_, o) in resumed] == [100] * 48

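# Bookkeeping behind test_workflow_manager: tasks with even `i` raise while
# the flag file exists, so exactly 50 of the 100 workflows fail on the first
# run. By the time resume_all(include_failed=True) is called, workflow "0"
# has been resumed to SUCCESSFUL and workflow "2" has been resumed and then
# canceled, leaving the remaining 48 FAILED workflows to be resumed.
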
def test_delete(workflow_start_regular):
    from ray._private.storage import _storage_uri

    # Try deleting a random workflow that never existed.
    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="never_existed")

    # Delete a workflow that has not finished and is not running.
    @ray.remote
    def never_ends(x):
        utils.set_global_mark()
        time.sleep(1000000)
        return x

    workflow.create(never_ends.bind("hello world")).run_async("never_finishes")

    # Make sure the step is actually executing before killing the cluster.
    while not utils.check_global_mark():
        time.sleep(0.1)

    # Restart the cluster so the workflow is left unfinished but not running.
    ray.shutdown()
    subprocess.check_output("ray stop --force", shell=True)

    ray.init(storage=_storage_uri)
    workflow.init()

    with pytest.raises(ray.exceptions.RaySystemError):
        result = workflow.get_output("never_finishes")
        ray.get(result)

    workflow.delete("never_finishes")

    with pytest.raises(ValueError):
        output = workflow.get_output("never_finishes")

    # TODO(Alex): Uncomment after
    # https://github.com/ray-project/ray/issues/19481.
    # with pytest.raises(WorkflowNotFoundError):
    #     workflow.resume("never_finishes")

    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="never_finishes")

    # Delete a workflow which has finished.
    @ray.remote
    def basic_step(arg):
        return arg

    result = workflow.create(basic_step.bind("hello world")).run(
        workflow_id="finishes"
    )
    assert result == "hello world"
    output = workflow.get_output("finishes")
    assert ray.get(output) == "hello world"

    workflow.delete(workflow_id="finishes")

    with pytest.raises(ValueError):
        output = workflow.get_output("finishes")

    # TODO(Alex): Uncomment after
    # https://github.com/ray-project/ray/issues/19481.
    # with pytest.raises(ValueError):
    #     workflow.resume("finishes")

    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="finishes")

    assert workflow.list_all() == []

    # The workflow can be re-run as if it was never run before.
    assert workflow.create(basic_step.bind("123")).run(workflow_id="finishes") == "123"

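# Delete semantics exercised above: deleting an id that does not exist (or
# was already deleted) raises WorkflowNotFoundError; fetching the output of a
# deleted workflow raises ValueError; and a deleted workflow id can be reused
# as if it had never run.
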
def test_no_init_api(shutdown_only):
    # Smoke check: calling a workflow API without an explicit workflow.init()
    # should not crash (relies on implicit initialization).
    workflow.list_all()

def test_workflow_manager_simple(workflow_start_regular):
    from ray.workflow.exceptions import WorkflowNotFoundError

    assert [] == workflow.list_all()
    with pytest.raises(WorkflowNotFoundError):
        workflow.get_status("X")

def test_workflow_queuing_resume_all(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    import queue

    import filelock

    lock_path = str(tmp_path / ".lock")

    @ray.remote
    def long_running(x):
        with filelock.FileLock(lock_path):
            return x

    wfs = [long_running.bind(i) for i in range(5)]

    with filelock.FileLock(lock_path):
        _refs = [  # noqa: F841
            workflow.run_async(wfs[i], workflow_id=f"workflow_{i}") for i in range(4)
        ]

        assert sorted(x[0] for x in workflow.list_all({workflow.RUNNING})) == [
            "workflow_0",
            "workflow_1",
        ]
        assert sorted(x[0] for x in workflow.list_all({workflow.PENDING})) == [
            "workflow_2",
            "workflow_3",
        ]

        with pytest.raises(queue.Full, match="Workflow queue has been full"):
            workflow.run(wfs[4], workflow_id="workflow_4")

        # Kill all workflows.
        ray.shutdown()

    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    with filelock.FileLock(lock_path):
        workflow_ids, outputs = zip(*sorted(workflow.resume_all()))
        # We should see the same workflows running and pending as before the
        # restart, because resume_all() gives previously running workflows
        # higher priority.
        assert sorted(x[0] for x in workflow.list_all({workflow.RUNNING})) == [
            "workflow_0",
            "workflow_1",
        ]
        assert sorted(x[0] for x in workflow.list_all({workflow.PENDING})) == [
            "workflow_2",
            "workflow_3",
        ]

    assert workflow_ids == (
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
    )
    assert ray.get(list(outputs)) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]

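# Note on the resume_all() return shape relied on above: it yields
# (workflow_id, ObjectRef) pairs, which is why the test can split them with
# `zip(*sorted(workflow.resume_all()))` and then `ray.get` the outputs in
# workflow-id order.
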