import re
import time

import filelock
from filelock import FileLock
import pytest

import ray
from ray import workflow
from ray.exceptions import GetTimeoutError
from ray.workflow.common import WorkflowStatus
from ray.workflow.tests import utils  # cross-process "global mark" helpers


def test_running_and_canceled_workflow(workflow_start_regular, tmp_path):
    workflow_id = "simple"
    flag = tmp_path / "flag"

    @ray.remote
    def simple():
        flag.touch()
        time.sleep(1000)
        return 0

    workflow.create(simple.bind()).run_async(workflow_id)

    # Wait until the step runs to make sure pre-run metadata is written.
    while not flag.exists():
        time.sleep(1)

    workflow_metadata = workflow.get_metadata(workflow_id)
    assert workflow_metadata["status"] == "RUNNING"
    assert "start_time" in workflow_metadata["stats"]
    assert "end_time" not in workflow_metadata["stats"]

    workflow.cancel(workflow_id)

    workflow_metadata = workflow.get_metadata(workflow_id)
    assert workflow_metadata["status"] == "CANCELED"
    assert "start_time" in workflow_metadata["stats"]
    assert "end_time" not in workflow_metadata["stats"]
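# NOTE: the `workflow_start_regular`, `workflow_start_regular_shared`, and
# `shutdown_only` fixtures normally come from Ray's own test conftest and are
# not defined in this file. The sketch below is a minimal, hypothetical
# stand-in, assuming a Ray version where `ray.init(storage=...)` and a
# no-argument `workflow.init()` are available; the real fixtures also
# parametrize cluster options.
@pytest.fixture
def workflow_start_regular(tmp_path):
    # Start a fresh local Ray instance with workflow storage under tmp_path.
    ray.init(storage=str(tmp_path / "workflow_data"))
    workflow.init()
    yield
    ray.shutdown()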
def test_wait_failure_recovery_2(workflow_start_regular_shared):
    # Test failing "workflow.wait" and its input steps.
    @workflow.step
    def sleep_identity(x: int):
        # Block the step on a global mark.
        while not utils.check_global_mark():
            time.sleep(0.1)
        time.sleep(x)
        return x

    @workflow.step
    def identity(x):
        return x

    ws = [
        sleep_identity.step(2),
        sleep_identity.step(5),
        sleep_identity.step(1),
    ]
    w = workflow.wait(ws, num_returns=2, timeout=None)
    utils.unset_global_mark()
    _ = identity.step(w).run_async(workflow_id="wait_failure_recovery_2")
    # Wait until "workflow.wait" has been running.
    time.sleep(10)
    workflow.cancel("wait_failure_recovery_2")
    time.sleep(2)

    utils.set_global_mark()
    ready, unready = ray.get(workflow.resume("wait_failure_recovery_2"))
    assert ready == [2, 1]
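# The `utils` global-mark helpers used above live in the Ray repo
# (ray/workflow/tests/utils.py). A file-based stand-in with the same
# interface -- the mark path here is illustrative, not Ray's -- could be:
import pathlib
import tempfile

_MARK = pathlib.Path(tempfile.gettempdir()) / "ray_workflow_test_mark"


def set_global_mark():
    # Create the mark file; any process polling check_global_mark() unblocks.
    _MARK.touch()


def unset_global_mark():
    _MARK.unlink(missing_ok=True)


def check_global_mark() -> bool:
    return _MARK.exists()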
def test_cancellation(tmp_path, workflow_start_regular):
    lock_a = tmp_path / "lock_a"
    lock_b = tmp_path / "lock_b"

    @ray.remote
    def simple():
        # Hold lock_a while running, then block on lock_b,
        # which the driver holds for the duration of the test.
        with filelock.FileLock(lock_a):
            with filelock.FileLock(lock_b):
                pass

    workflow_id = "test_cancellation"
    with filelock.FileLock(lock_b):
        r = workflow.run_async(simple.bind(), workflow_id=workflow_id)
        try:
            ray.get(r, timeout=5)
        except GetTimeoutError:
            pass
        else:
            assert False

        assert workflow.get_status(workflow_id) == WorkflowStatus.RUNNING
        workflow.cancel(workflow_id)
        with pytest.raises(workflow.WorkflowCancellationError):
            ray.get(r)
        # Cancellation kills the task, so lock_a becomes acquirable again.
        lock = filelock.FileLock(lock_a)
        lock.acquire(timeout=5)
    assert workflow.get_status(workflow_id) == WorkflowStatus.CANCELED
def test_no_init(shutdown_only):
    @ray.remote
    def f():
        pass

    fail_wf_init_error_msg = re.escape(
        "`workflow.init()` must be called prior to using the workflows API."
    )

    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.create(f.bind()).run()
    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.list_all()
    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.resume_all()
    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.cancel("wf")
    with pytest.raises(RuntimeError, match=fail_wf_init_error_msg):
        workflow.get_actor("wf")
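# For contrast with test_no_init: the same call succeeds once workflow.init()
# has run. A quick sketch, assuming a default local ray.init() is acceptable
# (the helper name `example_with_init` is ours, not Ray's):
def example_with_init():
    ray.init()
    workflow.init()

    @ray.remote
    def g():
        return 1

    assert workflow.create(g.bind()).run() == 1
    ray.shutdown()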
def test_workflow_manager(workflow_start_regular, tmp_path):
    # For sync between jobs.
    tmp_file = str(tmp_path / "lock")
    lock = FileLock(tmp_file)
    lock.acquire()

    # For sync between jobs.
    flag_file = tmp_path / "flag"
    flag_file.touch()

    @ray.remote
    def long_running(i):
        lock = FileLock(tmp_file)
        with lock.acquire():
            pass
        # Even-numbered workflows fail while the flag file exists.
        if i % 2 == 0:
            if flag_file.exists():
                raise ValueError()
        return 100

    outputs = [
        workflow.create(long_running.bind(i)).run_async(workflow_id=str(i))
        for i in range(100)
    ]
    # Test list_all; it should list all running jobs.
    all_tasks = workflow.list_all()
    assert len(all_tasks) == 100
    all_tasks_running = workflow.list_all(workflow.RUNNING)
    assert dict(all_tasks) == dict(all_tasks_running)
    assert workflow.get_status("0") == "RUNNING"

    # Release the lock and make sure all tasks finish.
    lock.release()
    for o in outputs:
        try:
            r = ray.get(o)
        except Exception:
            continue
        assert 100 == r
    all_tasks_running = workflow.list_all(workflow.WorkflowStatus.RUNNING)
    assert len(all_tasks_running) == 0

    # Half of them failed and half succeeded.
    failed_jobs = workflow.list_all("FAILED")
    assert len(failed_jobs) == 50
    finished_jobs = workflow.list_all("SUCCESSFUL")
    assert len(finished_jobs) == 50

    all_tasks_status = workflow.list_all(
        {
            workflow.WorkflowStatus.SUCCESSFUL,
            workflow.WorkflowStatus.FAILED,
            workflow.WorkflowStatus.RUNNING,
        }
    )
    assert len(all_tasks_status) == 100
    assert failed_jobs == [
        (k, v) for (k, v) in all_tasks_status if v == workflow.WorkflowStatus.FAILED
    ]
    assert finished_jobs == [
        (k, v)
        for (k, v) in all_tasks_status
        if v == workflow.WorkflowStatus.SUCCESSFUL
    ]

    # Test get_status.
    assert workflow.get_status("0") == "FAILED"
    assert workflow.get_status("1") == "SUCCESSFUL"
    lock.acquire()
    r = workflow.resume("0")
    assert workflow.get_status("0") == workflow.RUNNING
    flag_file.unlink()
    lock.release()
    assert 100 == ray.get(r)
    assert workflow.get_status("0") == workflow.SUCCESSFUL

    # Test cancel.
    lock.acquire()
    workflow.resume("2")
    assert workflow.get_status("2") == workflow.RUNNING
    workflow.cancel("2")
    assert workflow.get_status("2") == workflow.CANCELED

    # Now resume_all: of the 50 workflows that failed, "0" has since
    # succeeded and "2" was canceled, so 48 remain to resume.
    resumed = workflow.resume_all(include_failed=True)
    assert len(resumed) == 48
    lock.release()
    assert [ray.get(o) for (_, o) in resumed] == [100] * 48