def test_output_with_name(workflow_start_regular):
    """Check that per-task outputs can be fetched by name.

    Two ways of naming are covered: names passed through ``.options()`` and
    a name attached with the ``@workflow.options`` decorator.
    """

    @ray.remote
    def double(v):
        return 2 * v

    # Names supplied explicitly for each bound task.
    inner_opts = workflow.options(name="inner")
    outer_opts = workflow.options(name="outer")
    inner_task = double.options(**inner_opts).bind(1)
    outer_task = double.options(**outer_opts).bind(inner_task)
    result = workflow.run_async(outer_task, workflow_id="double")
    inner_ref = workflow.get_output_async("double", name="inner")
    outer_ref = workflow.get_output_async("double", name="outer")

    assert ray.get(inner_ref) == 2
    assert ray.get(outer_ref) == 4
    assert ray.get(result) == 4

    # Name supplied once by the decorator and shared by both bound tasks.
    @workflow.options(name="double")
    @ray.remote
    def double_2(s):
        return s * 2

    workflow_id = "double_2"
    inner_task = double_2.bind(1)
    outer_task = double_2.bind(inner_task)
    result = workflow.run_async(outer_task, workflow_id=workflow_id)
    # The two tasks share the decorator name; they are looked up here as
    # "double" and "double_1".
    inner_ref = workflow.get_output_async(workflow_id, name="double")
    outer_ref = workflow.get_output_async(workflow_id, name="double_1")

    assert ray.get(inner_ref) == 2
    assert ray.get(outer_ref) == 4
    assert ray.get(result) == 4
def test_nested_catch_exception_3(workflow_start_regular_shared, tmp_path):
    """Test the case where the exception is not raised by the output task of
    a nested DAG."""

    @ray.remote
    def f3():
        return 10

    @ray.remote
    def f3_exc():
        raise ValueError()

    @ray.remote
    def f2(x):
        return x

    @ray.remote
    def f1(exc):
        # Pick the failing or the succeeding leaf; f2 is always the output task.
        leaf = f3_exc if exc else f3
        return workflow.continuation(f2.bind(leaf.bind()))

    failing_dag = f1.options(**workflow.options(catch_exceptions=True)).bind(True)
    ret, err = workflow.run(failing_dag)
    assert ret is None
    assert isinstance(err, ValueError)

    passing_dag = f1.options(**workflow.options(catch_exceptions=True)).bind(False)
    assert (10, None) == workflow.run(passing_dag)
def test_step_failure(workflow_start_regular_shared, tmp_path):
    """Exercise ``max_retries`` and ``catch_exceptions`` on a flaky step.

    The step reads a counter from a file under ``tmp_path``, increments it,
    and raises ``ValueError`` while the value it read is below 10.
    """
    (tmp_path / "test").write_text("0")

    @ray.remote
    def unstable_step():
        v = int((tmp_path / "test").read_text())
        (tmp_path / "test").write_text(f"{v + 1}")
        if v < 10:
            raise ValueError("Invalid")
        return v

    # max_retries=-2 is expected to be rejected.
    # Fix: ``.bind()`` used to be called on the dict returned by
    # ``workflow.options(...)`` (misplaced parenthesis), so this block only
    # ever caught an incidental AttributeError.
    with pytest.raises(Exception):
        workflow.create(
            unstable_step.options(**workflow.options(max_retries=-2)).bind()
        )

    # Too few retries for the counter to reach 10: the run fails.
    with pytest.raises(Exception):
        workflow.create(
            unstable_step.options(**workflow.options(max_retries=2)).bind()
        ).run()

    # The counter keeps its value across runs, so this run gets far enough.
    assert 10 == workflow.create(
        unstable_step.options(**workflow.options(max_retries=7)).bind()
    ).run()

    # Reset the counter and repeat with exceptions captured instead of raised.
    (tmp_path / "test").write_text("0")

    (ret, err) = workflow.create(
        unstable_step.options(
            **workflow.options(max_retries=2, catch_exceptions=True)
        ).bind()
    ).run()
    assert ret is None
    assert isinstance(err, ValueError)

    (ret, err) = workflow.create(
        unstable_step.options(
            **workflow.options(max_retries=7, catch_exceptions=True)
        ).bind()
    ).run()
    assert ret == 10
    assert err is None
def inplace_test():
    """Chain three ``check_and_update`` steps, the first and last in-place.

    The current worker id is passed to every step. Returns a continuation
    to the last step of the chain.
    """
    from ray.worker import global_worker

    worker_id = global_worker.worker_id

    first = check_and_update.options(
        **workflow.options(allow_inplace=True)
    ).bind("@", worker_id)
    # The middle step is bound without the in-place option.
    second = check_and_update.bind(first, worker_id)
    third = check_and_update.options(
        **workflow.options(allow_inplace=True)
    ).bind(second, worker_id)
    return workflow.continuation(third)
def test_get_named_step_output_running(workflow_start_regular, tmp_path):
    """Fetch named step outputs while the workflow is still blocked.

    Both steps wait on a file lock held by the test, so nothing can finish
    until the lock is released.
    """

    @ray.remote
    def double(v, lock=None):
        # Use the lock path that was passed in rather than a closure variable.
        if lock is not None:
            with FileLock(lock):
                return 2 * v
        else:
            return 2 * v

    # Get the result from named step after workflow before it's finished
    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)

    @ray.remote
    def wait(obj_ref):
        return ray.get(obj_ref[0])

    # ``with`` guarantees the lock is released even if an assertion below
    # fails; otherwise the blocked steps would never be able to proceed.
    with lock:
        output = workflow.create(
            double.options(**workflow.options(name="outer")).bind(
                double.options(**workflow.options(name="inner")).bind(
                    1, lock_path
                ),
                lock_path,
            )
        ).run_async("double-2")

        inner = workflow.get_output("double-2", name="inner")
        outer = workflow.get_output("double-2", name="outer")

        # Make sure nothing is finished.
        ready, waiting = ray.wait(
            [wait.remote([output]), wait.remote([inner]), wait.remote([outer])],
            timeout=1,
        )
        assert 0 == len(ready)
        assert 3 == len(waiting)

    # Once job finished, we'll be able to get the result.
    assert 4 == ray.get(output)

    # Here sometimes inner will not be generated when we call
    # run_async. So there is a race condition here.
    try:
        v = ray.get(inner)
    except Exception:
        v = None
    if v is not None:
        assert 2 == v
    assert 4 == ray.get(outer)

    # After completion both named outputs must be available.
    inner = workflow.get_output("double-2", name="inner")
    outer = workflow.get_output("double-2", name="outer")
    assert 2 == ray.get(inner)
    assert 4 == ray.get(outer)
def test_get_named_step_output_finished(workflow_start_regular, tmp_path):
    """Fetch named step outputs after the workflow has finished."""

    @ray.remote
    def double(v):
        return 2 * v

    # Get the result from named step after workflow finished
    inner_step = double.options(**workflow.options(name="inner")).bind(1)
    outer_step = double.options(**workflow.options(name="outer")).bind(inner_step)
    assert 4 == workflow.create(outer_step).run("double")

    inner_ref = workflow.get_output("double", name="inner")
    assert ray.get(inner_ref) == 2
    outer_ref = workflow.get_output("double", name="outer")
    assert ray.get(outer_ref) == 4
def test_get_output_3(workflow_start_regular, tmp_path):
    """Getting the output of a failed workflow must not re-execute it.

    A counter file records how many times the step body ran; an "error"
    flag file makes the step raise while it exists.
    """
    from ray.exceptions import RaySystemError

    cnt_file = tmp_path / "counter"
    cnt_file.write_text("0")
    error_flag = tmp_path / "error"
    error_flag.touch()

    @ray.remote
    def incr():
        v = int(cnt_file.read_text())
        cnt_file.write_text(str(v + 1))
        if error_flag.exists():
            raise ValueError()
        return 10

    with pytest.raises(workflow.WorkflowExecutionError):
        no_retry = workflow.options(max_retries=0)
        workflow.create(incr.options(**no_retry).bind()).run("incr")

    assert cnt_file.read_text() == "1"

    # TODO(suquark): We should prevent Ray from raising "RaySystemError"
    # in workflow, because "RaySystemError" does not inherit from the
    # underlying error, so users and developers cannot catch the expected
    # error. This issue is very annoying.
    expected_errors = (RaySystemError, ValueError)
    with pytest.raises(expected_errors):
        ray.get(workflow.get_output("incr"))

    # The step body did not run again.
    assert cnt_file.read_text() == "1"
    error_flag.unlink()
    with pytest.raises(expected_errors):
        ray.get(workflow.get_output("incr"))

    assert ray.get(workflow.resume("incr")) == 10
def recursive(n):
    """Count down from ``n`` through continuations.

    Each continuation is named after the remaining count. The base case
    returns 42 while holding the file lock at ``lock_path``.
    """
    if n > 0:
        remaining = n - 1
        next_step = recursive.options(
            **workflow.options(name=str(remaining))
        ).bind(remaining)
        return workflow.continuation(next_step)

    with FileLock(lock_path):
        return 42
def test_get_output_3(workflow_start_regular, tmp_path):
    """Getting the output of a failed workflow must not re-execute it.

    A counter file records how many times the step body ran; an "error"
    flag file makes the step raise while it exists.
    """
    cnt_file = tmp_path / "counter"
    cnt_file.write_text("0")
    error_flag = tmp_path / "error"
    error_flag.touch()

    @ray.remote
    def incr():
        v = int(cnt_file.read_text())
        cnt_file.write_text(str(v + 1))
        if error_flag.exists():
            raise ValueError()
        return 10

    with pytest.raises(ray.exceptions.RaySystemError):
        no_retry = workflow.options(max_retries=0)
        workflow.create(incr.options(**no_retry).bind()).run("incr")

    assert cnt_file.read_text() == "1"

    with pytest.raises(ray.exceptions.RaySystemError):
        ray.get(workflow.get_output("incr"))

    # The step body did not run again.
    assert cnt_file.read_text() == "1"
    error_flag.unlink()
    with pytest.raises(ray.exceptions.RaySystemError):
        ray.get(workflow.get_output("incr"))

    assert ray.get(workflow.resume("incr")) == 10
def test_get_output_4(workflow_start_regular, tmp_path):
    """Test getting output of a workflow tasks that are dynamically generated."""
    import time

    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)

    @ray.remote
    def recursive(n):
        if n <= 0:
            # The base case blocks until the test releases the lock.
            with FileLock(lock_path):
                return 42
        return workflow.continuation(
            recursive.options(**workflow.options(name=str(n - 1))).bind(n - 1)
        )

    workflow_id = "test_get_output_4"

    # ``with`` guarantees the lock is released even if scheduling below
    # raises; otherwise the base-case task would stay blocked on it.
    with lock:
        obj = workflow.run_async(
            recursive.options(**workflow.options(name="10")).bind(10),
            workflow_id=workflow_id,
        )
        outputs = [
            workflow.get_output_async(workflow_id, name=str(i))
            for i in range(11)
        ]
        outputs.append(obj)

        # wait so that 'get_output' is scheduled before executing the workflow
        time.sleep(3)

    assert ray.get(outputs) == [42] * len(outputs)
def test_options_update():
    """Check how decorator options and ``.options()`` overrides combine."""
    from ray.workflow.common import WORKFLOW_OPTIONS

    # Options are given in decorator first, then in the first .options()
    # and finally in the second .options()
    @workflow.options(name="old_name", metadata={"k": "v"})
    @ray.remote(num_cpus=2, max_retries=1)
    def f():
        return

    # name is updated from the old name in the decorator to the new name in the first
    # .options(), then preserved in the second options.
    # metadata and ray_options are "updated"
    # max_retries only defined in the decorator and it got preserved all the way
    overrides = workflow.options(
        name="new_name", metadata={"extra_k2": "extra_v2"}
    )
    new_f = f.options(num_returns=2, **overrides)

    expected_workflow_options = {
        "name": "new_name",
        "metadata": {"extra_k2": "extra_v2"},
    }
    expected = {
        "num_cpus": 2,
        "num_returns": 2,
        "max_retries": 1,
        "_metadata": {WORKFLOW_OPTIONS: expected_workflow_options},
    }
    options = new_f.bind().get_options()
    assert options == expected
def test_dynamic_output(workflow_start_regular_shared):
    """Check where the dynamic output of a failed workflow points."""
    from ray.workflow.workflow_storage import get_workflow_storage

    @ray.remote
    def exponential_fail(k, n):
        if n <= 0:
            return k
        if n < 3:
            raise Exception("Failed intentionally")
        next_step = exponential_fail.options(
            **workflow.options(name=f"step_{n}")
        ).bind(k * 2, n - 1)
        return workflow.continuation(next_step)

    # When the workflow fails, the dynamic output should point to the
    # latest successful step.
    root = exponential_fail.options(**workflow.options(name="step_0")).bind(3, 10)
    try:
        workflow.run(root, workflow_id="dynamic_output")
    except Exception:
        pass

    wf_storage = get_workflow_storage(workflow_id="dynamic_output")
    result = wf_storage.inspect_step("step_0")
    assert result.output_step_id == "step_3"
def exponential_fail(k, n):
    """Double ``k`` once per continuation, failing when ``0 < n < 3``.

    Returns ``k`` unchanged when ``n`` is not positive. Otherwise it continues
    with a step named ``step_{n}`` bound to ``(k * 2, n - 1)``.
    """
    if n <= 0:
        return k
    if n < 3:
        raise Exception("Failed intentionally")
    next_step = exponential_fail.options(
        **workflow.options(name=f"step_{n}")
    ).bind(k * 2, n - 1)
    return workflow.continuation(next_step)
def test_task_id_generation(workflow_start_regular_shared, request):
    """Check output lookup for a chain of tasks that all share one name."""

    @ray.remote
    def simple(x):
        return x + 1

    n = 20
    # Build a chain of n tasks, every one of them named "simple".
    x = simple.options(**workflow.options(name="simple")).bind(-1)
    for _ in range(n - 1):
        x = simple.options(**workflow.options(name="simple")).bind(x)

    workflow_id = "test_task_id_generation"
    ret = workflow.create(x).run_async(workflow_id=workflow_id)

    # The outputs are looked up as "simple", "simple_1", ..., "simple_{n-1}".
    names = ["simple"] + [f"simple_{i}" for i in range(1, n)]
    outputs = [workflow.get_output(workflow_id, name=nm) for nm in names]

    assert ray.get(ret) == n - 1
    assert ray.get(outputs) == list(range(n))
def test_get_named_step_output_running(workflow_start_regular, tmp_path):
    """Fetch named step outputs while the workflow is still blocked.

    Both steps wait on a file lock held by the test, so nothing can finish
    until the lock is released.
    """

    @ray.remote
    def double(v, lock=None):
        # Use the lock path that was passed in rather than a closure variable.
        if lock is not None:
            with FileLock(lock):
                return 2 * v
        else:
            return 2 * v

    # Get the result from named step after workflow before it's finished
    lock_path = str(tmp_path / "lock")
    lock = FileLock(lock_path)

    @ray.remote
    def wait(obj_ref):
        return ray.get(obj_ref[0])

    # ``with`` guarantees the lock is released even if an assertion below
    # fails; otherwise the blocked steps would never be able to proceed.
    with lock:
        output = workflow.run_async(
            double.options(**workflow.options(name="outer")).bind(
                double.options(**workflow.options(name="inner")).bind(
                    1, lock_path
                ),
                lock_path,
            ),
            workflow_id="double-2",
        )

        inner = workflow.get_output_async("double-2", name="inner")
        outer = workflow.get_output_async("double-2", name="outer")

        # Make sure nothing is finished.
        ready, waiting = ray.wait(
            [wait.remote([output]), wait.remote([inner]), wait.remote([outer])],
            timeout=1,
        )
        assert 0 == len(ready)
        assert 3 == len(waiting)

    # Once job finished, we'll be able to get the result.
    assert [4, 2, 4] == ray.get([output, inner, outer])

    # After completion both named outputs must still be available.
    inner = workflow.get_output_async("double-2", name="inner")
    outer = workflow.get_output_async("double-2", name="outer")
    assert [2, 4] == ray.get([inner, outer])
def checkpoint_dag(checkpoint):
    """Build a three-step DAG: large_input -> identity -> average.

    ``checkpoint`` is forwarded to the first two steps; the ``average`` step
    only gets a name. Returns a continuation to the ``average`` step.
    """

    @ray.remote
    def large_input():
        return np.arange(2**24)

    @ray.remote
    def identity(x):
        return x

    @ray.remote
    def average(x):
        return np.mean(x)

    input_opts = workflow.options(name="large_input", checkpoint=checkpoint)
    x = large_input.options(**input_opts).bind()

    identity_opts = workflow.options(name="identity", checkpoint=checkpoint)
    y = identity.options(**identity_opts).bind(x)

    average_opts = workflow.options(name="average")
    return workflow.continuation(average.options(**average_opts).bind(y))
def test_user_metadata_not_dict(workflow_start_regular):
    """Non-dict metadata must raise ``ValueError`` at step and workflow level."""

    @ray.remote
    def simple():
        return 0

    bad_metadata = "x"

    # Metadata attached to the step.
    with pytest.raises(ValueError):
        workflow.create(
            simple.options(**workflow.options(metadata=bad_metadata)).bind()
        )

    # Metadata attached to the workflow run.
    with pytest.raises(ValueError):
        workflow.create(simple.bind()).run(metadata=bad_metadata)
def tail_recursion(n):
    """Recurse ``n`` times through in-place continuations.

    Asserts on every call that the Python stack has fewer than 20 frames,
    and returns "ok" once ``n`` is no longer positive.
    """
    import inspect

    # check if the stack is growing
    assert len(inspect.stack(0)) < 20

    if n > 0:
        inplace = workflow.options(allow_inplace=True)
        return workflow.continuation(
            tail_recursion.options(**inplace).bind(n - 1)
        )
    return "ok"
def test_get_named_step_output_error(workflow_start_regular, tmp_path):
    """Named outputs of a workflow whose outer step fails."""

    @ray.remote
    def double(v, error):
        if error:
            raise Exception()
        return v + v

    # Force it to fail for the outer step
    with pytest.raises(Exception):
        inner_step = double.options(**workflow.options(name="inner")).bind(
            1, False
        )
        outer_step = double.options(**workflow.options(name="outer")).bind(
            inner_step, True
        )
        workflow.create(outer_step).run("double")

    # For the inner step, it should have already been executed.
    inner_ref = workflow.get_output("double", name="inner")
    assert 2 == ray.get(inner_ref)

    outer_ref = workflow.get_output("double", name="outer")
    with pytest.raises(Exception):
        ray.get(outer_ref)
def checkpoint_dag(checkpoint):
    """Build a three-step DAG: large_input -> identity -> average.

    ``checkpoint`` is forwarded to the first two steps. The ``identity``
    step kills its own process with signal 9 unless the global mark is set.
    Returns a continuation to the ``average`` step.
    """

    @ray.remote
    def large_input():
        return np.arange(SIZE)

    @ray.remote
    def identity(x):
        if utils.check_global_mark():
            return x
        import os

        os.kill(os.getpid(), 9)
        return x

    @ray.remote
    def average(x):
        return np.mean(x)

    first = large_input.options(**workflow.options(checkpoint=checkpoint)).bind()
    second = identity.options(**workflow.options(checkpoint=checkpoint)).bind(
        first
    )
    return workflow.continuation(average.bind(second))
def test_nested_catch_exception(workflow_start_regular_shared, tmp_path):
    """``catch_exceptions`` on a step that continues into another step."""

    @ray.remote
    def f2():
        return 10

    @ray.remote
    def f1():
        return workflow.continuation(f2.bind())

    catching = workflow.options(catch_exceptions=True)
    dag = f1.options(**catching).bind()
    assert (10, None) == workflow.create(dag).run()
def test_nested_catch_exception_2(workflow_start_regular_shared, tmp_path):
    """``catch_exceptions`` when a nested continuation eventually raises."""

    @ray.remote
    def f1(n):
        # Recurse down to zero through continuations, then fail.
        if n != 0:
            return workflow.continuation(f1.bind(n - 1))
        raise ValueError()

    catching = workflow.options(catch_exceptions=True)
    dag = f1.options(**catching).bind(5)
    ret, err = workflow.create(dag).run()
    assert ret is None
    assert isinstance(err, ValueError)
def test_user_metadata_not_json_serializable(workflow_start_regular):
    """Metadata holding an arbitrary object must raise ``ValueError``."""

    @ray.remote
    def simple():
        return 0

    class X:
        pass

    # Metadata attached to the step.
    with pytest.raises(ValueError):
        step_opts = workflow.options(metadata={"x": X()})
        workflow.create(simple.options(**step_opts).bind())

    # Metadata attached to the workflow run.
    with pytest.raises(ValueError):
        workflow.create(simple.bind()).run(metadata={"x": X()})
def test_checkpoint_dag_full(workflow_start_regular_shared):
    """Run the DAG with checkpointing enabled and inspect every step."""
    workflow_id = "checkpoint_whole"
    expected_mean = 8388607.5

    dag = checkpoint_dag.options(
        **workflow.options(name="checkpoint_dag")
    ).bind(True)
    outputs = workflow.create(dag).run(workflow_id=workflow_id)
    assert np.isclose(outputs, expected_mean)

    recovered = ray.get(workflow.resume(workflow_id))
    assert np.isclose(recovered, expected_mean)

    wf_storage = workflow_storage.WorkflowStorage(workflow_id)
    # Every step is expected to be checkpointed.
    for step_name in ("checkpoint_dag", "large_input", "identity", "average"):
        _assert_step_checkpoints(wf_storage, step_name, mode="checkpointed")
def exp_inplace(k, n, worker_id=None):
    """Double ``k`` ``n`` times through in-place continuations.

    When ``worker_id`` is given, asserts that the current worker matches it.
    The current worker id is then passed on to the next continuation.
    """
    from ray.worker import global_worker

    current_worker = global_worker.worker_id
    if worker_id is not None:
        # sub-workflows running inplace
        assert current_worker == worker_id

    if n == 0:
        return k

    inplace = workflow.options(allow_inplace=True)
    next_step = exp_inplace.options(**inplace).bind(2 * k, n - 1, current_worker)
    return workflow.continuation(next_step)
def test_checkpoint_dag_skip_partial(workflow_start_regular_shared):
    """Run the DAG with checkpointing disabled for the first two steps."""
    workflow_id = "checkpoint_partial"
    expected_mean = 8388607.5

    dag = checkpoint_dag.options(
        **workflow.options(name="checkpoint_dag")
    ).bind(False)
    outputs = workflow.run(dag, workflow_id=workflow_id)
    assert np.isclose(outputs, expected_mean)

    recovered = workflow.resume(workflow_id)
    assert np.isclose(recovered, expected_mean)

    wf_storage = workflow_storage.WorkflowStorage(workflow_id)
    # Only the steps that received checkpoint=False skip their output.
    expected_modes = (
        ("checkpoint_dag", "checkpointed"),
        ("large_input", "output_skipped"),
        ("identity", "output_skipped"),
        ("average", "checkpointed"),
    )
    for step_name, mode in expected_modes:
        _assert_step_checkpoints(wf_storage, step_name, mode=mode)
def custom_retry_strategy(func: Any, num_retries: int, delay_s: int) -> str:
    """Run ``func`` as a workflow task, retrying after a delay on failure.

    Args:
        func: Task to run; it is bound with no arguments and with
            ``catch_exceptions=True``.
        num_retries: Number of retries still allowed.
        delay_s: Seconds to sleep before each retry.

    Returns:
        A continuation to the handler that either returns the task result
        or schedules the next attempt.
    """
    import time

    @ray.remote
    def handle_result(res: Tuple[Optional[str], Optional[Exception]]) -> str:
        result, error = res
        # Decide success by the absence of an error, not by the truthiness of
        # the result: a falsy successful result (e.g. "") used to fall through
        # to ``raise error`` with ``error`` being None.
        if error is None:
            # Return the result itself, not the (result, error) pair.
            return result
        if num_retries <= 0:
            raise error
        print("Retrying exception after delay", error)
        time.sleep(delay_s)
        return workflow.continuation(
            custom_retry_strategy.bind(func, num_retries - 1, delay_s)
        )

    res = func.options(**workflow.options(catch_exceptions=True)).bind()
    return workflow.continuation(handle_result.bind(res))
def test_checkpoint_dag_recovery_skip(workflow_start_regular_shared):
    """Time a failing run and its recovery with checkpointing disabled."""
    workflow_id = "checkpoint_skip_recovery"

    # First run: the global mark is unset and the run is expected to fail.
    utils.unset_global_mark()
    start = time.time()
    with pytest.raises(RaySystemError):
        dag = checkpoint_dag.options(
            **workflow.options(checkpoint=False)
        ).bind(False)
        workflow.create(dag).run(workflow_id=workflow_id)
    run_duration_skipped = time.time() - start

    # Recovery: with the mark set the resumed workflow completes.
    utils.set_global_mark()
    start = time.time()
    recovered = ray.get(workflow.resume(workflow_id))
    recover_duration_skipped = time.time() - start
    assert np.isclose(recovered, np.arange(SIZE).mean())

    print(
        f"[skipped] run_duration = {run_duration_skipped}, "
        f"recover_duration = {recover_duration_skipped}"
    )
def test_get_non_exist_output(workflow_start_regular, tmp_path):
    """Asking for an unknown task name must raise ``ValueError``."""
    workflow_id = "test_get_non_exist_output"
    lock_path = str(tmp_path / "lock")

    @ray.remote
    def simple():
        with FileLock(lock_path):
            return "hello"

    # Hold the lock while submitting so the task cannot finish before both
    # output requests have been issued.
    with FileLock(lock_path):
        named = workflow.options(name="simple")
        dag = simple.options(**named).bind()
        ret = workflow.run_async(dag, workflow_id=workflow_id)
        exist = workflow.get_output_async(workflow_id, name="simple")
        non_exist = workflow.get_output_async(workflow_id, name="non_exist")

    assert ray.get(ret) == "hello"
    assert ray.get(exist) == "hello"
    with pytest.raises(ValueError, match="non_exist"):
        ray.get(non_exist)
@ray.remote
def celebrate(result: str) -> None:
    """Print a success message with the result."""
    print("Success!", result)


@ray.remote
def send_email(result: str) -> None:
    """Print a message standing in for sending an email."""
    print("Sending email", result)


@ray.remote
def exit_handler(res: Tuple[Optional[str], Optional[Exception]]) -> None:
    """Dispatch on a ``(result, error)`` pair, then wait for all follow-ups.

    An email task is always created; ``cry`` handles an error and
    ``celebrate`` handles a result.
    """
    result, error = res
    email = send_email.bind(f"Raw result: {result}, {error}")
    handler = cry.bind(error) if error else celebrate.bind(result)
    return workflow.continuation(wait_all.bind(handler, email))


@ray.remote
def wait_all(*deps):
    """Return "done"; the arguments are accepted only as dependencies."""
    return "done"


if __name__ == "__main__":
    # Capture the failure as a value so the exit handler can inspect it.
    catching = workflow.options(catch_exceptions=True)
    res = intentional_fail.options(**catching).bind()
    print(workflow.run(exit_handler.bind(res)))