def test_workflow_queuing_3(shutdown_only, tmp_path):
    """Ensure that a queued workflow stays PENDING while the running slot is occupied."""
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=1, max_pending_workflows=1)

    import time

    import filelock

    from ray.exceptions import GetTimeoutError

    lock_path = str(tmp_path / ".lock")

    @ray.remote
    def long_running(x):
        (tmp_path / str(x)).write_text(str(x))
        with filelock.FileLock(lock_path):
            return x

    workflow_id = "test_workflow_queuing_3"

    with filelock.FileLock(lock_path):
        wf_1 = workflow.run_async(long_running.bind(1), workflow_id=f"{workflow_id}_1")
        wf_2 = workflow.run_async(long_running.bind(2), workflow_id=f"{workflow_id}_2")
        time.sleep(5)
        # Only the running workflow should have executed its task body.
        assert (tmp_path / str(1)).exists()
        assert not (tmp_path / str(2)).exists()
        assert workflow.get_status(workflow_id=f"{workflow_id}_1") == workflow.RUNNING
        assert workflow.get_status(workflow_id=f"{workflow_id}_2") == workflow.PENDING
        with pytest.raises(GetTimeoutError):
            ray.get(wf_2, timeout=5)

    assert ray.get([wf_1, wf_2]) == [1, 2]


def test_running_and_canceled_workflow(workflow_start_regular, tmp_path):
    workflow_id = "simple"
    flag = tmp_path / "flag"

    @ray.remote
    def simple():
        flag.touch()
        time.sleep(1000)
        return 0

    workflow.run_async(simple.bind(), workflow_id=workflow_id)

    # Wait until the task runs, to make sure the pre-run metadata is written.
    while not flag.exists():
        time.sleep(1)

    workflow_metadata = workflow.get_metadata(workflow_id)
    assert workflow_metadata["status"] == "RUNNING"
    assert "start_time" in workflow_metadata["stats"]
    assert "end_time" not in workflow_metadata["stats"]

    workflow.cancel(workflow_id)
    workflow_metadata = workflow.get_metadata(workflow_id)

    assert workflow_metadata["status"] == "CANCELED"
    assert "start_time" in workflow_metadata["stats"]
    assert "end_time" not in workflow_metadata["stats"]


def test_run_or_resume_during_running(workflow_start_regular_shared):
    @ray.remote
    def source1():
        return "[source1]"

    @ray.remote
    def append1(x):
        return x + "[append1]"

    @ray.remote
    def append2(x):
        return x + "[append2]"

    @ray.remote
    def simple_sequential():
        x = source1.bind()
        y = append1.bind(x)
        return workflow.continuation(append2.bind(y))

    output = workflow.run_async(
        simple_sequential.bind(), workflow_id="running_workflow"
    )
    # Running or resuming a workflow with an ID that is already running
    # must fail.
    with pytest.raises(RuntimeError):
        workflow.run_async(simple_sequential.bind(), workflow_id="running_workflow")
    with pytest.raises(RuntimeError):
        workflow.resume_async(workflow_id="running_workflow")
    assert ray.get(output) == "[source1][append1][append2]"


def test_output_with_name(workflow_start_regular):
    @ray.remote
    def double(v):
        return 2 * v

    inner_task = double.options(**workflow.options(name="inner")).bind(1)
    outer_task = double.options(**workflow.options(name="outer")).bind(inner_task)
    result = workflow.run_async(outer_task, workflow_id="double")
    inner = workflow.get_output_async("double", name="inner")
    outer = workflow.get_output_async("double", name="outer")
    assert ray.get(inner) == 2
    assert ray.get(outer) == 4
    assert ray.get(result) == 4

    @workflow.options(name="double")
    @ray.remote
    def double_2(s):
        return s * 2

    inner_task = double_2.bind(1)
    outer_task = double_2.bind(inner_task)
    workflow_id = "double_2"
    result = workflow.run_async(outer_task, workflow_id=workflow_id)
    # When the same task name is reused, later tasks get an auto-incremented
    # suffix: "double", then "double_1", etc.
    inner = workflow.get_output_async(workflow_id, name="double")
    outer = workflow.get_output_async(workflow_id, name="double_1")
    assert ray.get(inner) == 2
    assert ray.get(outer) == 4
    assert ray.get(result) == 4


def test_user_metadata_not_dict(workflow_start_regular):
    @ray.remote
    def simple():
        return 0

    with pytest.raises(ValueError):
        workflow.run_async(simple.options(**workflow.options(metadata="x")).bind())

    with pytest.raises(ValueError):
        workflow.run(simple.bind(), metadata="x")


def test_user_metadata_not_json_serializable(workflow_start_regular):
    @ray.remote
    def simple():
        return 0

    class X:
        pass

    with pytest.raises(ValueError):
        workflow.run_async(
            simple.options(**workflow.options(metadata={"x": X()})).bind()
        )

    with pytest.raises(ValueError):
        workflow.run(simple.bind(), metadata={"x": X()})


def test_crash_during_event_checkpointing(workflow_start_regular_shared):
    """Ensure that if the cluster dies while the event is being checkpointed,
    we properly re-poll for the event."""

    from ray._private import storage

    storage_uri = storage._storage_uri

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            utils.set_global_mark("first")

            utils.set_global_mark("time_to_die")
            while not utils.check_global_mark("resume"):
                time.sleep(0.1)

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")

    @ray.remote
    def wait_then_finish(arg):
        pass

    event_promise = workflow.wait_for_event(MyEventListener)
    workflow.run_async(wait_then_finish.bind(event_promise), workflow_id="workflow")

    while not utils.check_global_mark("time_to_die"):
        time.sleep(0.1)

    assert utils.check_global_mark("first")
    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    # Give the workflow some time to kill the cluster.
    # time.sleep(3)

    ray.init(num_cpus=4, storage=storage_uri)
    workflow.init()
    workflow.resume_async("workflow")
    utils.set_global_mark("resume")
    workflow.get_output("workflow")
    assert utils.check_global_mark("second")


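# A note on synchronization: the event tests in this file coordinate across
# processes through "global marks" provided by a test-utils module (imported
# as `utils` at the top of the file, not shown in this section). The helpers
# below are a hypothetical, minimal file-based sketch of that interface,
# assuming a shared local scratch directory; they are illustrative only, not
# the actual Ray implementation.
import tempfile
from pathlib import Path

_MARK_DIR = Path(tempfile.gettempdir()) / "global_marks"


def set_global_mark(name=""):
    """Create a marker file visible to every process on this machine."""
    _MARK_DIR.mkdir(exist_ok=True)
    (_MARK_DIR / f"mark_{name}").touch()


def check_global_mark(name=""):
    """Return True once the marker file exists."""
    return (_MARK_DIR / f"mark_{name}").exists()


def unset_global_mark(name=""):
    """Remove the marker file, if present."""
    (_MARK_DIR / f"mark_{name}").unlink(missing_ok=True)

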
def test_get_output_5(workflow_start_regular, tmp_path):
    """Test getting the output of a workflow task immediately after
    executing it asynchronously."""

    @ray.remote
    def simple():
        return 314

    workflow_id = "test_get_output_5_{}"

    outputs = []
    for i in range(20):
        workflow.run_async(simple.bind(), workflow_id=workflow_id.format(i))
        outputs.append(workflow.get_output_async(workflow_id.format(i)))

    assert ray.get(outputs) == [314] * len(outputs)


def test_step_resources(workflow_start_regular, tmp_path):
    lock_path = str(tmp_path / "lock")
    # We use a signal actor here because we can't guarantee the order of
    # tasks sent from the worker to the raylet.
    signal_actor = SignalActor.remote()

    @ray.remote
    def step_run():
        ray.wait([signal_actor.send.remote()])
        with FileLock(lock_path):
            return None

    @ray.remote(num_cpus=1)
    def remote_run():
        return None

    lock = FileLock(lock_path)
    lock.acquire()
    ret = workflow.run_async(step_run.options(num_cpus=2).bind())
    ray.wait([signal_actor.wait.remote()])
    # While the workflow task holds its num_cpus=2 reservation, this task
    # should remain unscheduled, so getting its result times out.
    obj = remote_run.remote()
    with pytest.raises(ray.exceptions.GetTimeoutError):
        ray.get(obj, timeout=2)
    lock.release()
    assert ray.get(ret) is None
    assert ray.get(obj) is None


def test_get_output_4(workflow_start_regular, tmp_path):
    """Test getting outputs of workflow tasks that are dynamically generated."""
    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)

    @ray.remote
    def recursive(n):
        if n <= 0:
            with FileLock(lock_path):
                return 42
        return workflow.continuation(
            recursive.options(**workflow.options(name=str(n - 1))).bind(n - 1)
        )

    workflow_id = "test_get_output_4"
    lock.acquire()
    obj = workflow.run_async(
        recursive.options(**workflow.options(name="10")).bind(10),
        workflow_id=workflow_id,
    )

    outputs = [workflow.get_output_async(workflow_id, name=str(i)) for i in range(11)]
    outputs.append(obj)

    import time

    # Wait so that 'get_output' is scheduled before executing the workflow.
    time.sleep(3)
    lock.release()
    assert ray.get(outputs) == [42] * len(outputs)


def test_cancellation(tmp_path, workflow_start_regular):
    lock_a = tmp_path / "lock_a"
    lock_b = tmp_path / "lock_b"

    @ray.remote
    def simple():
        with filelock.FileLock(lock_a):
            with filelock.FileLock(lock_b):
                pass

    workflow_id = "test_cancellation"

    with filelock.FileLock(lock_b):
        r = workflow.run_async(simple.bind(), workflow_id=workflow_id)
        # The task is blocked on lock_b, so getting the result must time out.
        with pytest.raises(GetTimeoutError):
            ray.get(r, timeout=5)

        assert workflow.get_status(workflow_id) == WorkflowStatus.RUNNING
        workflow.cancel(workflow_id)
        with pytest.raises(workflow.WorkflowCancellationError):
            ray.get(r)
        # After cancellation the task should have released lock_a.
        lock = filelock.FileLock(lock_a)
        lock.acquire(timeout=5)
        assert workflow.get_status(workflow_id) == WorkflowStatus.CANCELED


def test_wait_for_multiple_events(workflow_start_regular_shared):
    """If a workflow has multiple event arguments, it should wait for them
    at the same time.
    """

    class EventListener1(workflow.EventListener):
        async def poll_for_event(self):
            utils.set_global_mark("listener1")
            while not utils.check_global_mark("trigger_event"):
                await asyncio.sleep(0.1)
            return "event1"

    class EventListener2(workflow.EventListener):
        async def poll_for_event(self):
            utils.set_global_mark("listener2")
            while not utils.check_global_mark("trigger_event"):
                await asyncio.sleep(0.1)
            return "event2"

    @ray.remote
    def trivial_step(arg1, arg2):
        return f"{arg1} {arg2}"

    event1_promise = workflow.wait_for_event(EventListener1)
    event2_promise = workflow.wait_for_event(EventListener2)

    promise = workflow.run_async(trivial_step.bind(event1_promise, event2_promise))

    # Both listeners polling at once proves the events are awaited together.
    while not (
        utils.check_global_mark("listener1") and utils.check_global_mark("listener2")
    ):
        time.sleep(0.1)

    utils.set_global_mark("trigger_event")
    assert ray.get(promise) == "event1 event2"


def test_async_execution(workflow_start_regular_shared):
    @ray.remote
    def blocking():
        time.sleep(10)
        return 314

    start = time.time()
    output = workflow.run_async(blocking.bind())
    duration = time.time() - start
    assert duration < 5  # workflow.run_async does not block
    assert ray.get(output) == 314


def test_crash_after_commit(workflow_start_regular_shared):
    """Ensure that we don't re-call poll_for_event after `event_checkpointed`
    returns, even after a crash. Here we must call `event_checkpointed`
    twice, because there's no way to know if we called it after
    checkpointing.
    """
    from ray._private import storage

    storage_uri = storage._storage_uri

    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            assert not utils.check_global_mark("committed")

        async def event_checkpointed(self, event):
            utils.set_global_mark("committed")
            if utils.check_global_mark("first"):
                utils.set_global_mark("second")
            else:
                utils.set_global_mark("first")
                await asyncio.sleep(1000000)

    event_promise = workflow.wait_for_event(MyEventListener)
    workflow.run_async(event_promise, workflow_id="workflow")

    while not utils.check_global_mark("first"):
        time.sleep(0.1)

    ray.shutdown()
    subprocess.check_output(["ray", "stop", "--force"])

    ray.init(num_cpus=4, storage=storage_uri)
    workflow.init()
    workflow.resume_async("workflow")
    workflow.get_output("workflow")
    assert utils.check_global_mark("second")


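# Taken together, the two event-crash tests above
# (`test_crash_during_event_checkpointing` and `test_crash_after_commit`)
# pin down both sides of the listener commit point: a crash *before*
# `event_checkpointed` returns means the event is re-polled on resume, while
# a crash *after* the checkpoint has committed means `poll_for_event` is
# never called again.

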
def test_get_output_2(workflow_start_regular, tmp_path):
    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)

    @ray.remote
    def simple(v):
        with FileLock(lock_path):
            return v

    lock.acquire()
    obj = workflow.run_async(simple.bind(0), workflow_id="simple")
    obj2 = workflow.get_output_async("simple")
    lock.release()
    assert ray.get([obj, obj2]) == [0, 0]


def test_event_as_workflow(workflow_start_regular_shared):
    class MyEventListener(workflow.EventListener):
        async def poll_for_event(self):
            while not utils.check_global_mark():
                await asyncio.sleep(1)

    utils.unset_global_mark()

    promise = workflow.run_async(
        workflow.wait_for_event(MyEventListener), workflow_id="wf"
    )
    assert workflow.get_status("wf") == workflow.WorkflowStatus.RUNNING

    utils.set_global_mark()
    assert ray.get(promise) is None


def test_workflow_with_pressure(workflow_start_regular_shared):
    pressure_level = 10

    dags = [
        generate_chain(),
        generate_continuation(),
        generate_random_dag(gather_and_hash),
        generate_layered_dag(gather_and_hash),
    ]

    ans = ray.get([d.execute() for d in dags])
    outputs = []
    for _ in range(pressure_level):
        for w in dags:
            outputs.append(workflow.run_async(w))

    assert ray.get(outputs) == ans * pressure_level


def test_get_named_step_output_running(workflow_start_regular, tmp_path):
    @ray.remote
    def double(v, lock=None):
        if lock is not None:
            # Use the lock path passed in as an argument; the original
            # referenced the enclosing `lock_path` and ignored the parameter.
            with FileLock(lock):
                return 2 * v
        else:
            return 2 * v

    # Get results from named tasks while the workflow is still running.
    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)
    lock.acquire()
    output = workflow.run_async(
        double.options(**workflow.options(name="outer")).bind(
            double.options(**workflow.options(name="inner")).bind(1, lock_path),
            lock_path,
        ),
        workflow_id="double-2",
    )
    inner = workflow.get_output_async("double-2", name="inner")
    outer = workflow.get_output_async("double-2", name="outer")

    @ray.remote
    def wait(obj_ref):
        return ray.get(obj_ref[0])

    # Make sure nothing has finished yet.
    ready, waiting = ray.wait(
        [wait.remote([output]), wait.remote([inner]), wait.remote([outer])], timeout=1
    )
    assert 0 == len(ready)
    assert 3 == len(waiting)

    # Once the workflow finishes, we'll be able to get the results.
    lock.release()
    assert [4, 2, 4] == ray.get([output, inner, outer])

    inner = workflow.get_output_async("double-2", name="inner")
    outer = workflow.get_output_async("double-2", name="outer")
    assert [2, 4] == ray.get([inner, outer])


def test_workflow_queuing_1(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    import queue

    import filelock

    lock_path = str(tmp_path / ".lock")

    @ray.remote
    def long_running(x):
        with filelock.FileLock(lock_path):
            return x

    wfs = [long_running.bind(i) for i in range(5)]

    with filelock.FileLock(lock_path):
        refs = [
            workflow.run_async(wfs[i], workflow_id=f"workflow_{i}") for i in range(4)
        ]

        assert sorted(x[0] for x in workflow.list_all({workflow.RUNNING})) == [
            "workflow_0",
            "workflow_1",
        ]
        assert sorted(x[0] for x in workflow.list_all({workflow.PENDING})) == [
            "workflow_2",
            "workflow_3",
        ]

        with pytest.raises(queue.Full, match="Workflow queue has been full"):
            workflow.run(wfs[4], workflow_id="workflow_4")

    assert ray.get(refs) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]

    for i in range(5):
        assert workflow.get_output(f"workflow_{i}") == i


def test_task_id_generation(workflow_start_regular_shared, request):
    @ray.remote
    def simple(x):
        return x + 1

    x = simple.options(**workflow.options(name="simple")).bind(-1)
    n = 20
    for i in range(1, n):
        x = simple.options(**workflow.options(name="simple")).bind(x)

    workflow_id = "test_task_id_generation"
    ret = workflow.run_async(x, workflow_id=workflow_id)
    outputs = [workflow.get_output_async(workflow_id, name="simple")]
    for i in range(1, n):
        outputs.append(workflow.get_output_async(workflow_id, name=f"simple_{i}"))
    assert ray.get(ret) == n - 1
    assert ray.get(outputs) == list(range(n))


def test_get_non_exist_output(workflow_start_regular, tmp_path):
    lock_path = str(tmp_path / "lock")

    @ray.remote
    def simple():
        with FileLock(lock_path):
            return "hello"

    workflow_id = "test_get_non_exist_output"

    with FileLock(lock_path):
        dag = simple.options(**workflow.options(name="simple")).bind()
        ret = workflow.run_async(dag, workflow_id=workflow_id)
        exist = workflow.get_output_async(workflow_id, name="simple")
        non_exist = workflow.get_output_async(workflow_id, name="non_exist")

    assert ray.get(ret) == "hello"
    assert ray.get(exist) == "hello"
    with pytest.raises(ValueError, match="non_exist"):
        ray.get(non_exist)


def _trigger_lineage_reconstruction(with_workflow):
    # Helper nested in an enclosing test: `cluster`, `tmp_path`, `lock_path`,
    # `get_node_id`, `f1`, and `f2` come from the outer scope.
    (tmp_path / "f2").unlink(missing_ok=True)
    (tmp_path / "num_executed").write_text("0")

    worker_node_1 = cluster.add_node(
        num_cpus=2, resources={"worker_1": 1}, storage=str(tmp_path)
    )
    worker_node_2 = cluster.add_node(
        num_cpus=2, resources={"worker_2": 1}, storage=str(tmp_path)
    )
    worker_node_id_1 = ray.get(
        get_node_id.options(num_cpus=0, resources={"worker_1": 1}).remote()
    )
    worker_node_id_2 = ray.get(
        get_node_id.options(num_cpus=0, resources={"worker_2": 1}).remote()
    )
    dag = f2.options(
        scheduling_strategy=NodeAffinitySchedulingStrategy(worker_node_id_2, soft=True)
    ).bind(
        f1.options(
            scheduling_strategy=NodeAffinitySchedulingStrategy(
                worker_node_id_1, soft=True
            )
        ).bind()
    )

    with FileLock(lock_path):
        if with_workflow:
            ref = workflow.run_async(dag)
        else:
            ref = dag.execute()
        while not (tmp_path / "f2").exists():
            time.sleep(0.1)
        # Kill both worker nodes while their results are still in flight,
        # forcing the objects to be recovered.
        cluster.remove_node(worker_node_1, allow_graceful=False)
        cluster.remove_node(worker_node_2, allow_graceful=False)
    return ray.get(ref).sum()


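# For context: `_trigger_lineage_reconstruction` above is a closure inside a
# larger test, so `get_node_id`, `f1`, and `f2` are not defined in this
# section. The factory below is a hypothetical sketch of what those remote
# functions could look like, written to close over `tmp_path` the way the
# real test closes over its pytest fixture. The names and bodies are
# assumptions for illustration: `f2` must create the "f2" marker file the
# helper polls for, the final result must support `.sum()` (hence numpy),
# and the real tasks presumably also increment the "num_executed" counter
# the helper seeds with "0".


def _make_lineage_tasks(tmp_path):
    import numpy as np

    @ray.remote(max_retries=-1)
    def get_node_id():
        # Report which node this task landed on, for NodeAffinity scheduling.
        return ray.get_runtime_context().get_node_id()

    @ray.remote(max_retries=-1)
    def f1():
        # Return an object large enough that losing its node forces lineage
        # reconstruction rather than a cheap re-fetch.
        return np.ones(10**6, dtype=np.uint8)

    @ray.remote(max_retries=-1)
    def f2(x):
        # Record that the downstream task ran, then pass the data through.
        (tmp_path / "f2").touch()
        return x

    return get_node_id, f1, f2

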
def test_workflow_queuing_2(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    @ray.remote
    def short_running(x):
        return x

    wfs = [short_running.bind(i) for i in range(5)]
    refs = [workflow.run_async(wfs[i], workflow_id=f"workflow_{i}") for i in range(4)]
    for i in range(4):
        assert workflow.get_output(f"workflow_{i}") == i
    assert ray.get(refs) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]


def test_delete(workflow_start_regular):
    from ray._private.storage import _storage_uri

    # Try deleting a random workflow that never existed.
    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="never_existed")

    # Delete a workflow that has not finished and is not running.
    @ray.remote
    def never_ends(x):
        utils.set_global_mark()
        time.sleep(1000000)
        return x

    workflow.run_async(never_ends.bind("hello world"), workflow_id="never_finishes")

    # Make sure the step is actually executing before killing the cluster.
    while not utils.check_global_mark():
        time.sleep(0.1)

    # Restart the cluster.
    ray.shutdown()
    subprocess.check_output("ray stop --force", shell=True)
    ray.init(storage=_storage_uri)
    workflow.init()

    with pytest.raises(ray.exceptions.RaySystemError):
        workflow.get_output("never_finishes")

    workflow.delete("never_finishes")

    with pytest.raises(ray.exceptions.RaySystemError):
        # TODO(suquark): we should raise "ValueError" without
        # blocking on the result.
        workflow.get_output("never_finishes")

    # TODO(Alex): Uncomment after
    # https://github.com/ray-project/ray/issues/19481.
    # with pytest.raises(WorkflowNotFoundError):
    #     workflow.resume("never_finishes")

    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="never_finishes")

    # Delete a workflow which has finished.
    @ray.remote
    def basic_step(arg):
        return arg

    result = workflow.run(basic_step.bind("hello world"), workflow_id="finishes")
    assert result == "hello world"
    assert workflow.get_output("finishes") == "hello world"

    workflow.delete(workflow_id="finishes")

    with pytest.raises(ray.exceptions.RaySystemError):
        # TODO(suquark): we should raise "ValueError" without
        # blocking on the result.
        workflow.get_output("finishes")

    # TODO(Alex): Uncomment after
    # https://github.com/ray-project/ray/issues/19481.
    # with pytest.raises(ValueError):
    #     workflow.resume("finishes")

    with pytest.raises(WorkflowNotFoundError):
        workflow.delete(workflow_id="finishes")

    assert workflow.list_all() == []

    # The workflow can be re-run as if it was never run before.
    assert workflow.run(basic_step.bind("123"), workflow_id="finishes") == "123"


def test_workflow_queuing_resume_all(shutdown_only, tmp_path):
    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    import queue

    import filelock

    lock_path = str(tmp_path / ".lock")

    @ray.remote
    def long_running(x):
        with filelock.FileLock(lock_path):
            return x

    wfs = [long_running.bind(i) for i in range(5)]

    with filelock.FileLock(lock_path):
        _refs = [  # noqa: F841
            workflow.run_async(wfs[i], workflow_id=f"workflow_{i}") for i in range(4)
        ]

        assert sorted(x[0] for x in workflow.list_all({workflow.RUNNING})) == [
            "workflow_0",
            "workflow_1",
        ]
        assert sorted(x[0] for x in workflow.list_all({workflow.PENDING})) == [
            "workflow_2",
            "workflow_3",
        ]

        with pytest.raises(queue.Full, match="Workflow queue has been full"):
            workflow.run(wfs[4], workflow_id="workflow_4")

        # Kill all workflows while still holding the lock, so none can finish.
        ray.shutdown()

    ray.init(storage=str(tmp_path))
    workflow.init(max_running_workflows=2, max_pending_workflows=2)

    with filelock.FileLock(lock_path):
        workflow_ids, outputs = zip(*sorted(workflow.resume_all()))
        # We should see the same running and pending workflows as before,
        # because resume_all() gives previously running workflows higher
        # priority.
        assert sorted(x[0] for x in workflow.list_all({workflow.RUNNING})) == [
            "workflow_0",
            "workflow_1",
        ]
        assert sorted(x[0] for x in workflow.list_all({workflow.PENDING})) == [
            "workflow_2",
            "workflow_3",
        ]

    assert workflow_ids == (
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
    )
    assert ray.get(list(outputs)) == [0, 1, 2, 3]
    assert workflow.run(wfs[4], workflow_id="workflow_4") == 4
    assert sorted(x[0] for x in workflow.list_all({workflow.SUCCESSFUL})) == [
        "workflow_0",
        "workflow_1",
        "workflow_2",
        "workflow_3",
        "workflow_4",
    ]


def test_workflow_manager(workflow_start_regular, tmp_path):
    # For synchronization between jobs.
    tmp_file = str(tmp_path / "lock")
    lock = FileLock(tmp_file)
    lock.acquire()

    # For synchronization between jobs.
    flag_file = tmp_path / "flag"
    flag_file.touch()

    @ray.remote
    def long_running(i):
        # Block until the driver releases the lock.
        lock = FileLock(tmp_file)
        with lock.acquire():
            pass

        # Even-numbered workflows fail while the flag file exists.
        if i % 2 == 0:
            if flag_file.exists():
                raise ValueError()
        return 100

    outputs = [
        workflow.run_async(long_running.bind(i), workflow_id=str(i))
        for i in range(100)
    ]

    # Test list_all: it should list all running jobs.
    all_tasks = workflow.list_all()
    assert len(all_tasks) == 100
    all_tasks_running = workflow.list_all(workflow.RUNNING)
    assert dict(all_tasks) == dict(all_tasks_running)
    assert workflow.get_status("0") == "RUNNING"

    # Release the lock and make sure all tasks finish.
    lock.release()
    for o in outputs:
        try:
            r = ray.get(o)
        except Exception:
            continue
        assert 100 == r

    all_tasks_running = workflow.list_all(workflow.WorkflowStatus.RUNNING)
    assert len(all_tasks_running) == 0

    # Half of them failed and half succeeded.
    failed_jobs = workflow.list_all("FAILED")
    assert len(failed_jobs) == 50
    finished_jobs = workflow.list_all("SUCCESSFUL")
    assert len(finished_jobs) == 50

    all_tasks_status = workflow.list_all(
        {
            workflow.WorkflowStatus.SUCCESSFUL,
            workflow.WorkflowStatus.FAILED,
            workflow.WorkflowStatus.RUNNING,
        }
    )
    assert len(all_tasks_status) == 100
    assert failed_jobs == [
        (k, v) for (k, v) in all_tasks_status if v == workflow.WorkflowStatus.FAILED
    ]
    assert finished_jobs == [
        (k, v)
        for (k, v) in all_tasks_status
        if v == workflow.WorkflowStatus.SUCCESSFUL
    ]

    # Test get_status.
    assert workflow.get_status("0") == "FAILED"
    assert workflow.get_status("1") == "SUCCESSFUL"

    # Resume a failed workflow; with the flag file removed, it succeeds.
    lock.acquire()
    r = workflow.resume_async("0")
    assert workflow.get_status("0") == workflow.RUNNING
    flag_file.unlink()
    lock.release()
    assert 100 == ray.get(r)
    assert workflow.get_status("0") == workflow.SUCCESSFUL

    # Test cancel.
    lock.acquire()
    workflow.resume_async("2")
    assert workflow.get_status("2") == workflow.RUNNING
    workflow.cancel("2")
    assert workflow.get_status("2") == workflow.CANCELED

    # Now resume_all: of the original 50 failures, "0" has since succeeded
    # and "2" is canceled, leaving 48 failed workflows to resume.
    resumed = workflow.resume_all(include_failed=True)
    assert len(resumed) == 48
    lock.release()
    assert [ray.get(o) for (_, o) in resumed] == [100] * 48