def test_connect_with_disconnected_node(shutdown_only):
    config = {
        "num_heartbeats_timeout": 50,
        "raylet_heartbeat_period_milliseconds": 10,
    }
    cluster = Cluster()
    cluster.add_node(num_cpus=0, _system_config=config)
    ray.init(address=cluster.address)
    p = init_error_pubsub()
    errors = get_error_message(p, 1, timeout=5)
    assert len(errors) == 0

    # This node will be killed by SIGKILL; ray_monitor will mark it as dead.
    dead_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(dead_node, allow_graceful=False)
    errors = get_error_message(p, 1, ray_constants.REMOVED_NODE_ERROR)
    assert len(errors) == 1

    # This node is also killed by SIGKILL; ray_monitor will mark it as dead.
    dead_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(dead_node, allow_graceful=False)
    errors = get_error_message(p, 1, ray_constants.REMOVED_NODE_ERROR)
    assert len(errors) == 1

    # This node is killed by SIGTERM; ray_monitor will not report an error
    # for a graceful removal.
    removing_node = cluster.add_node(num_cpus=0)
    cluster.remove_node(removing_node, allow_graceful=True)
    errors = get_error_message(p, 1, timeout=2)
    assert len(errors) == 0

    # There is no connection error to a dead node.
    errors = get_error_message(p, 1, timeout=2)
    assert len(errors) == 0
    p.close()


@pytest.mark.parametrize("sync", [True, False])
def test_warning_many_actor_tasks_queued(shutdown_only, sync: bool):
    ray.init(num_cpus=1)
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class SyncFoo:
        def f(self):
            import time

            time.sleep(1)

    @ray.remote(num_cpus=1)
    class AsyncFoo:
        async def f(self):
            import asyncio

            await asyncio.sleep(1)

    Foo = SyncFoo if sync else AsyncFoo
    a = Foo.remote()
    [a.f.remote() for _ in range(50000)]
    errors = get_error_message(p, 4, ray_constants.EXCESS_QUEUEING_WARNING)
    msgs = [e.error_message for e in errors]
    assert "Warning: More than 5000 tasks are pending submission to actor" in msgs[0]
    assert "Warning: More than 10000 tasks are pending submission to actor" in msgs[1]
    assert "Warning: More than 20000 tasks are pending submission to actor" in msgs[2]
    assert "Warning: More than 40000 tasks are pending submission to actor" in msgs[3]


def test_warning_for_too_many_actors(shutdown_only):
    # Check that if we run a workload which requires too many workers to be
    # started, we will receive a warning.
    num_cpus = 2
    ray.init(num_cpus=num_cpus)
    p = init_error_pubsub()

    @ray.remote
    class Foo:
        def __init__(self):
            time.sleep(1000)

    # NOTE: We must keep references to the actors, or they will go out of
    # scope and be destroyed.
    actor_group1 = [Foo.remote() for _ in range(num_cpus * 10)]
    assert len(actor_group1) == num_cpus * 10
    errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR

    actor_group2 = [Foo.remote() for _ in range(num_cpus * 3)]
    assert len(actor_group2) == num_cpus * 3
    errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR
    p.close()


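# NOTE: The agent fate-sharing tests below rely on `search_agent` and
# `check_agent_register`, which are not defined in this section. A minimal
# sketch of plausible implementations follows; the "dashboard/agent.py"
# command-line match is an assumption about how the agent is launched.
def search_agent(processes):
    """Return the dashboard agent process among `processes`, if present."""
    for proc in processes:
        try:
            for arg in proc.cmdline():
                # Assumption: the agent process runs dashboard/agent.py.
                if os.path.join("dashboard", "agent.py") in arg:
                    return proc
        except Exception:
            # The process may have exited or denied access; skip it.
            pass


def check_agent_register(raylet_proc, agent_pid):
    # Assumption: registration is "stable" if the same agent PID stays a
    # child of the raylet for a few seconds.
    for _ in range(5):
        agent_proc = search_agent(raylet_proc.children())
        assert agent_proc.pid == agent_pid
        time.sleep(1)

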
def test_agent_report_unexpected_raylet_death_large_file(shutdown_only):
    """Test that the agent reports Raylet death if it is not via SIGTERM."""
    ray.init(include_dashboard=True)
    p = init_error_pubsub()

    node = ray._private.worker._global_node
    all_processes = node.all_processes
    raylet_proc_info = all_processes[ray_constants.PROCESS_TYPE_RAYLET][0]
    raylet_proc = psutil.Process(raylet_proc_info.process.pid)

    wait_for_condition(lambda: search_agent(raylet_proc.children()))
    agent_proc = search_agent(raylet_proc.children())
    agent_pid = agent_proc.pid
    check_agent_register(raylet_proc, agent_pid)

    # Append data well over 1 MB to the Raylet log file.
    with open(
        os.path.join(node.get_session_dir_path(), "logs", "raylet.out"), "a"
    ) as f:
        f.write("test data\n" * 1024**2)

    # The agent should die when the raylet exits.
    raylet_proc.kill()
    raylet_proc.wait()
    agent_proc.wait(5)

    # Reading and publishing logs should still work.
    errors = get_error_message(p, 1, ray_constants.RAYLET_DIED_ERROR)
    assert len(errors) == 1, errors
    err = errors[0]
    assert err.type == ray_constants.RAYLET_DIED_ERROR
    assert "Termination is unexpected." in err.error_message, err.error_message
    assert "Raylet logs:" in err.error_message, err.error_message


def test_detached_warning(shutdown_only):
    ray.init()

    @ray.remote
    class DetachedActor:
        def ping(self):
            return "pong"

    error_pubsub = init_error_pubsub()
    actor = DetachedActor.options(  # noqa: F841
        name="Pinger", lifetime="detached"
    ).remote()
    errors = get_error_message(error_pubsub, 1, None)
    error = errors.pop()
    assert error.type == ray_constants.DETACHED_ACTOR_ANONYMOUS_NAMESPACE_ERROR


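# The next test coordinates its tasks with a `Semaphore` actor. That utility
# is normally provided by ray._private.test_utils; a minimal sketch of it is
# shown here for reference, assuming an asyncio-based async actor.
@ray.remote(num_cpus=0)
class Semaphore:
    def __init__(self, value=1):
        import asyncio  # local import so the sketch is self-contained

        self._sema = asyncio.Semaphore(value=value)

    async def acquire(self):
        await self._sema.acquire()

    async def release(self):
        self._sema.release()

    async def locked(self):
        return self._sema.locked()

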
def test_warning_for_too_many_nested_tasks(shutdown_only):
    # Check that if we run a workload which requires too many workers to be
    # started, we will receive a warning.
    num_cpus = 2
    ray.init(num_cpus=num_cpus)
    p = init_error_pubsub()

    remote_wait = Semaphore.remote(value=0)
    nested_wait = Semaphore.remote(value=0)
    ray.get(
        [
            remote_wait.locked.remote(),
            nested_wait.locked.remote(),
        ]
    )

    @ray.remote(num_cpus=0.25)
    def f():
        time.sleep(1000)
        return 1

    @ray.remote(num_cpus=0.25)
    def h(nested_waits):
        nested_wait.release.remote()
        ray.get(nested_waits)
        ray.get(f.remote())

    @ray.remote(num_cpus=0.25)
    def g(remote_waits, nested_waits):
        # Block on the semaphore so that the f tasks all get submitted to
        # the scheduler after the g tasks.
        remote_wait.release.remote()
        # Wait until every lock is released.
        ray.get(remote_waits)
        ray.get(h.remote(nested_waits))

    num_root_tasks = num_cpus * 4
    # Lock the remote tasks until everything is scheduled.
    remote_waits = []
    nested_waits = []
    for _ in range(num_root_tasks):
        remote_waits.append(remote_wait.acquire.remote())
        nested_waits.append(nested_wait.acquire.remote())

    [g.remote(remote_waits, nested_waits) for _ in range(num_root_tasks)]

    errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR
    p.close()


def test_warning_actor_waiting_on_actor(shutdown_only):
    ray.init(num_cpus=1, _system_config={"debug_dump_period_milliseconds": 500})
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class Actor:
        pass

    a = Actor.remote()  # noqa
    b = Actor.remote()  # noqa

    errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR


def test_warning_for_infeasible_zero_cpu_actor(shutdown_only):
    # Check that we cannot place an actor on a 0 CPU machine and that we get
    # an infeasibility warning (even though the actor creation task itself
    # requires no CPUs).
    ray.init(num_cpus=0)
    p = init_error_pubsub()

    @ray.remote
    class Foo:
        pass

    # The actor creation should be infeasible.
    Foo.remote()
    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR
    p.close()


def test_raylet_and_agent_share_fate(shutdown_only):
    """Test raylet and agent share fate."""
    ray.init(include_dashboard=True)
    p = init_error_pubsub()

    node = ray._private.worker._global_node
    all_processes = node.all_processes
    raylet_proc_info = all_processes[ray_constants.PROCESS_TYPE_RAYLET][0]
    raylet_proc = psutil.Process(raylet_proc_info.process.pid)

    wait_for_condition(lambda: search_agent(raylet_proc.children()))
    agent_proc = search_agent(raylet_proc.children())
    agent_pid = agent_proc.pid
    check_agent_register(raylet_proc, agent_pid)

    # The agent should be dead if raylet exits.
    raylet_proc.terminate()
    raylet_proc.wait()
    agent_proc.wait(5)

    # No error should be reported for graceful termination.
    errors = get_error_message(p, 1, ray_constants.RAYLET_DIED_ERROR)
    assert len(errors) == 0, errors

    ray.shutdown()

    ray.init(include_dashboard=True)
    all_processes = ray._private.worker._global_node.all_processes
    raylet_proc_info = all_processes[ray_constants.PROCESS_TYPE_RAYLET][0]
    raylet_proc = psutil.Process(raylet_proc_info.process.pid)
    wait_for_condition(lambda: search_agent(raylet_proc.children()))
    agent_proc = search_agent(raylet_proc.children())
    agent_pid = agent_proc.pid
    check_agent_register(raylet_proc, agent_pid)

    # The raylet should be dead if agent exits.
    agent_proc.kill()
    agent_proc.wait()
    raylet_proc.wait(5)


@pytest.mark.parametrize("sync", [True, False])
def test_no_warning_many_actor_tasks_queued_when_sequential(shutdown_only, sync: bool):
    ray.init(num_cpus=1)
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class SyncFoo:
        def f(self):
            return 1

    @ray.remote(num_cpus=1)
    class AsyncFoo:
        async def f(self):
            return 1

    Foo = SyncFoo if sync else AsyncFoo
    a = Foo.remote()
    for _ in range(10000):
        assert ray.get(a.f.remote()) == 1
    errors = get_error_message(p, 1, ray_constants.EXCESS_QUEUEING_WARNING, timeout=1)
    assert len(errors) == 0


def test_warning_all_tasks_blocked(shutdown_only):
    ray.init(num_cpus=1, _system_config={"debug_dump_period_milliseconds": 500})
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class Foo:
        def f(self):
            return 0

    @ray.remote
    def f():
        # Creating all of the actors is not possible with only 1 CPU.
        actors = [Foo.remote() for _ in range(3)]
        for a in actors:
            ray.get(a.f.remote())

    # Run in a task to check that we handle the blocked-task case correctly.
    f.remote()
    errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR


def test_warning_task_waiting_on_actor(shutdown_only):
    ray.init(num_cpus=1, _system_config={"debug_dump_period_milliseconds": 500})
    p = init_error_pubsub()

    @ray.remote(num_cpus=1)
    class Actor:
        def hello(self):
            pass

    a = Actor.remote()  # noqa
    ray.get(a.hello.remote())

    @ray.remote(num_cpus=1)
    def f():
        print("f running")
        time.sleep(999)

    ids = [f.remote()]  # noqa

    errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR


def test_error_isolation(call_ray_start):
    address = call_ray_start
    # Connect a driver to the Ray cluster.
    ray.init(address=address)

    # If a gRPC call exceeds its timeout, the call is cancelled on the client
    # side, but the server may still reply to it, leading to a missed
    # message. Using a sequence number to ensure that no message is dropped
    # could be the long-term solution, but its complexity, and the fact that
    # Ray subscribers do not use deadlines in production, make it less
    # preferred. Therefore, a simpler workaround is used instead: a different
    # subscriber is used for each get_error_message() call.
    subscribers = [init_error_pubsub() for _ in range(3)]

    # There shouldn't be any errors yet.
    errors = get_error_message(subscribers[0], 1, timeout=2)
    assert len(errors) == 0

    error_string1 = "error_string1"
    error_string2 = "error_string2"

    @ray.remote
    def f():
        raise Exception(error_string1)

    # Run a remote function that throws an error.
    with pytest.raises(Exception):
        ray.get(f.remote())

    # Wait for the error to be published.
    errors = get_error_message(subscribers[1], 1)

    # Make sure we got the error.
    assert len(errors) == 1
    assert error_string1 in errors[0].error_message

    # Start another driver and make sure that it does not receive this
    # error. Make the other driver throw an error, and make sure it
    # receives that error.
    driver_script = """
import ray
import time
from ray._private.test_utils import init_error_pubsub, get_error_message

ray.init(address="{}")
subscribers = [init_error_pubsub() for _ in range(2)]
time.sleep(1)
errors = get_error_message(subscribers[0], 1, timeout=2)
assert len(errors) == 0

@ray.remote
def f():
    raise Exception("{}")

try:
    ray.get(f.remote())
except Exception:
    pass

errors = get_error_message(subscribers[1], 1)
assert len(errors) == 1
assert "{}" in errors[0].error_message

print("success")
""".format(
        address, error_string2, error_string2
    )

    out = run_string_as_driver(driver_script)
    # Make sure the other driver succeeded.
    assert "success" in out

    # Make sure that the other error message doesn't show up for this driver.
    errors = get_error_message(subscribers[2], 1)
    assert len(errors) == 1


@pytest.fixture
def error_pubsub():
    p = init_error_pubsub()
    yield p
    p.close()


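# Hypothetical example (not part of the original suite) showing how the
# fixture above is consumed: pytest injects `error_pubsub` by parameter name
# and closes the subscriber after the test body finishes.
def test_error_pubsub_fixture_example(shutdown_only, error_pubsub):
    ray.init(num_cpus=1)
    # A fresh cluster should not have published any errors yet.
    errors = get_error_message(error_pubsub, 1, timeout=1)
    assert len(errors) == 0

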
def test_drain_api(shutdown_only):
    """E2E test of the autoscaler's use of the DrainNode API.

    Adapted from test_autoscaler_fake_multinode.py.

    The strategy is to mock out Ray node process termination in
    FakeMultiNodeProvider, leaving node termination to the DrainNode API.

    Scale-down is verified by `ray.cluster_resources`. It is verified that
    no removed_node errors are issued after scale-down.

    Validity of this test depends on the current implementation of
    DrainNode. DrainNode currently works by asking the GCS to de-register
    and shut down Ray nodes.
    """
    # Autoscaling cluster with Ray process termination mocked out in the node
    # provider.
    cluster = MockAutoscalingCluster(
        head_resources={"CPU": 1},
        worker_node_types={
            "gpu_node": {
                "resources": {
                    "CPU": 1,
                    "GPU": 1,
                    "object_store_memory": 1024 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 0,
                "max_workers": 2,
            },
        },
    )

    try:
        cluster.start()
        ray.init("auto")

        # Triggers the addition of a GPU node.
        @ray.remote(num_gpus=1)
        def f():
            print("gpu ok")

        ray.get(f.remote())

        # Verify scale-up.
        assert ray.cluster_resources().get("GPU", 0) == 1
        # Sleep for double the idle timeout of 6 seconds.
        time.sleep(12)

        # Verify scale-down.
        assert ray.cluster_resources().get("GPU", 0) == 0

        # Check that no errors were raised while draining nodes.
        # (Logic copied from test_failure4::test_gcs_drain.)
        p = init_error_pubsub()
        try:
            errors = get_error_message(
                p, 1, ray_constants.REMOVED_NODE_ERROR, timeout=5
            )
            assert len(errors) == 0
        finally:
            p.close()
    finally:
        cluster.shutdown()