def test_connect_with_disconnected_node(shutdown_only): config = { "num_heartbeats_timeout": 50, "raylet_heartbeat_timeout_milliseconds": 10, } cluster = Cluster() cluster.add_node(num_cpus=0, _system_config=config) ray.init(address=cluster.address) p = init_error_pubsub() errors = get_error_message(p, 1, timeout=5) assert len(errors) == 0 # This node is killed by SIGKILL, ray_monitor will mark it to dead. dead_node = cluster.add_node(num_cpus=0) cluster.remove_node(dead_node, allow_graceful=False) errors = get_error_message(p, 1, ray_constants.REMOVED_NODE_ERROR) assert len(errors) == 1 # This node is killed by SIGKILL, ray_monitor will mark it to dead. dead_node = cluster.add_node(num_cpus=0) cluster.remove_node(dead_node, allow_graceful=False) errors = get_error_message(p, 1, ray_constants.REMOVED_NODE_ERROR) assert len(errors) == 1 # This node is killed by SIGTERM, ray_monitor will not mark it again. removing_node = cluster.add_node(num_cpus=0) cluster.remove_node(removing_node, allow_graceful=True) errors = get_error_message(p, 1, timeout=2) assert len(errors) == 0 # There is no connection error to a dead node. errors = get_error_message(p, 1, timeout=2) assert len(errors) == 0 p.close()
def test_warning_for_too_many_actors(shutdown_only): # Check that if we run a workload which requires too many workers to be # started that we will receive a warning. num_cpus = 2 ray.init(num_cpus=num_cpus) p = init_error_pubsub() @ray.remote class Foo: def __init__(self): time.sleep(1000) # NOTE: We should save actor, otherwise it will be out of scope. actor_group1 = [Foo.remote() for _ in range(num_cpus * 3)] assert len(actor_group1) == num_cpus * 3 errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR actor_group2 = [Foo.remote() for _ in range(num_cpus)] assert len(actor_group2) == num_cpus errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR p.close()
def test_warning_for_too_many_nested_tasks(shutdown_only): # Check that if we run a workload which requires too many workers to be # started that we will receive a warning. num_cpus = 2 ray.init(num_cpus=num_cpus) p = init_error_pubsub() @ray.remote def f(): time.sleep(1000) return 1 @ray.remote def h(): time.sleep(1) ray.get(f.remote()) @ray.remote def g(): # Sleep so that the f tasks all get submitted to the scheduler after # the g tasks. time.sleep(1) ray.get(h.remote()) [g.remote() for _ in range(num_cpus * 4)] errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR p.close()
def test_detached_warning(shutdown_only): ray.init() @ray.remote class DetachedActor: def ping(self): return "pong" error_pubsub = init_error_pubsub() actor = DetachedActor.options( # noqa: F841 name="Pinger", lifetime="detached").remote() errors = get_error_message(error_pubsub, 1, None) error = errors.pop() assert error.type == ray_constants.DETACHED_ACTOR_ANONYMOUS_NAMESPACE_ERROR
def test_warning_for_too_many_nested_tasks(shutdown_only): # Check that if we run a workload which requires too many workers to be # started that we will receive a warning. num_cpus = 2 ray.init(num_cpus=num_cpus) p = init_error_pubsub() remote_wait = Semaphore.remote(value=0) nested_wait = Semaphore.remote(value=0) ray.get([ remote_wait.locked.remote(), nested_wait.locked.remote(), ]) @ray.remote def f(): time.sleep(1000) return 1 @ray.remote def h(nested_waits): nested_wait.release.remote() ray.get(nested_waits) ray.get(f.remote()) @ray.remote def g(remote_waits, nested_waits): # Sleep so that the f tasks all get submitted to the scheduler after # the g tasks. remote_wait.release.remote() # wait until every lock is released. ray.get(remote_waits) ray.get(h.remote(nested_waits)) num_root_tasks = num_cpus * 4 # Lock remote task until everything is scheduled. remote_waits = [] nested_waits = [] for _ in range(num_root_tasks): remote_waits.append(remote_wait.acquire.remote()) nested_waits.append(nested_wait.acquire.remote()) [g.remote(remote_waits, nested_waits) for _ in range(num_root_tasks)] errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR p.close()
def test_warning_actor_waiting_on_actor(shutdown_only): ray.init(num_cpus=1, _system_config={"debug_dump_period_milliseconds": 500}) p = init_error_pubsub() @ray.remote(num_cpus=1) class Actor: pass a = Actor.remote() # noqa b = Actor.remote() # noqa errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR
def test_warning_for_infeasible_zero_cpu_actor(shutdown_only): # Check that we cannot place an actor on a 0 CPU machine and that we get an # infeasibility warning (even though the actor creation task itself # requires no CPUs). ray.init(num_cpus=0) p = init_error_pubsub() @ray.remote class Foo: pass # The actor creation should be infeasible. Foo.remote() errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR p.close()
def test_warning_all_tasks_blocked(shutdown_only): ray.init(num_cpus=1, _system_config={"debug_dump_period_milliseconds": 500}) p = init_error_pubsub() @ray.remote(num_cpus=1) class Foo: def f(self): return 0 @ray.remote def f(): # Creating both actors is not possible. actors = [Foo.remote() for _ in range(3)] for a in actors: ray.get(a.f.remote()) # Run in a task to check we handle the blocked task case correctly f.remote() errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR
def test_warning_many_actor_tasks_queued(shutdown_only): ray.init(num_cpus=1) p = init_error_pubsub() @ray.remote(num_cpus=1) class Foo: def f(self): import time time.sleep(1) a = Foo.remote() [a.f.remote() for _ in range(50000)] errors = get_error_message(p, 4, ray_constants.EXCESS_QUEUEING_WARNING) msgs = [e.error_message for e in errors] assert ("Warning: More than 5000 tasks are pending submission to actor" in msgs[0]) assert ("Warning: More than 10000 tasks are pending submission to actor" in msgs[1]) assert ("Warning: More than 20000 tasks are pending submission to actor" in msgs[2]) assert ("Warning: More than 40000 tasks are pending submission to actor" in msgs[3])
def error_pubsub(): p = init_error_pubsub() yield p p.close()
def test_error_isolation(call_ray_start): address = call_ray_start # Connect a driver to the Ray cluster. ray.init(address=address) p = init_error_pubsub() # There shouldn't be any errors yet. errors = get_error_message(p, 1, 2) assert len(errors) == 0 error_string1 = "error_string1" error_string2 = "error_string2" @ray.remote def f(): raise Exception(error_string1) # Run a remote function that throws an error. with pytest.raises(Exception): ray.get(f.remote()) # Wait for the error to appear in Redis. errors = get_error_message(p, 1) # Make sure we got the error. assert len(errors) == 1 assert error_string1 in errors[0].error_message # Start another driver and make sure that it does not receive this # error. Make the other driver throw an error, and make sure it # receives that error. driver_script = """ import ray import time from ray.test_utils import (init_error_pubsub, get_error_message) ray.init(address="{}") p = init_error_pubsub() time.sleep(1) errors = get_error_message(p, 1, 2) assert len(errors) == 0 @ray.remote def f(): raise Exception("{}") try: ray.get(f.remote()) except Exception as e: pass errors = get_error_message(p, 1) assert len(errors) == 1 assert "{}" in errors[0].error_message print("success") """.format(address, error_string2, error_string2) out = run_string_as_driver(driver_script) # Make sure the other driver succeeded. assert "success" in out # Make sure that the other error message doesn't show up for this # driver. errors = get_error_message(p, 1) assert len(errors) == 1 p.close()