def test_warning_for_too_many_actors(shutdown_only): # Check that if we run a workload which requires too many workers to be # started that we will receive a warning. num_cpus = 2 ray.init(num_cpus=num_cpus) p = init_error_pubsub() @ray.remote class Foo: def __init__(self): time.sleep(1000) [Foo.remote() for _ in range(num_cpus * 3)] errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR [Foo.remote() for _ in range(num_cpus)] errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR p.close()
def test_failed_function_to_run(ray_start_2_cpus, error_pubsub): p = error_pubsub def f(worker): if ray.worker.global_worker.mode == ray.WORKER_MODE: raise Exception("Function to run failed.") ray.worker.global_worker.run_function_on_all_workers(f) # Check that the error message is in the task info. errors = get_error_message(p, 2, ray_constants.FUNCTION_TO_RUN_PUSH_ERROR) assert len(errors) == 2 assert errors[0].type == ray_constants.FUNCTION_TO_RUN_PUSH_ERROR assert "Function to run failed." in errors[0].error_message assert "Function to run failed." in errors[1].error_message
def test_warning_for_infeasible_tasks(ray_start_regular, error_pubsub): p = error_pubsub # Check that we get warning messages for infeasible tasks. @ray.remote(num_gpus=1) def f(): pass @ray.remote(resources={"Custom": 1}) class Foo: pass # This task is infeasible. f.remote() errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR # This actor placement task is infeasible. Foo.remote() errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR
def wait_for_errors(p, error_check): # Wait for errors from all the nondeterministic tasks. errors = [] time_left = 100 while time_left > 0: errors.extend(get_error_message(p, 1)) if error_check(errors): break time_left -= 1 time.sleep(1) # Make sure that enough errors came through. assert error_check(errors) return errors
def test_version_mismatch(error_pubsub, shutdown_only): ray_version = ray.__version__ ray.__version__ = "fake ray version" ray.init(num_cpus=1) p = error_pubsub errors = get_error_message(p, 1, ray_constants.VERSION_MISMATCH_PUSH_ERROR) assert False, errors assert len(errors) == 1 assert errors[0].type == ray_constants.VERSION_MISMATCH_PUSH_ERROR # Reset the version. ray.__version__ = ray_version
def test_detached_warning(shutdown_only): ray.init() @ray.remote class DetachedActor: def ping(self): return "pong" error_pubsub = init_error_pubsub() actor = DetachedActor.options( # noqa: F841 name="Pinger", lifetime="detached").remote() errors = get_error_message(error_pubsub, 1, None) error = errors.pop() assert error.type == ray_constants.DETACHED_ACTOR_ANONYMOUS_NAMESPACE_ERROR
def test_warning_for_too_many_nested_tasks(shutdown_only): # Check that if we run a workload which requires too many workers to be # started that we will receive a warning. num_cpus = 2 ray.init(num_cpus=num_cpus) p = init_error_pubsub() remote_wait = Semaphore.remote(value=0) nested_wait = Semaphore.remote(value=0) ray.get([ remote_wait.locked.remote(), nested_wait.locked.remote(), ]) @ray.remote def f(): time.sleep(1000) return 1 @ray.remote def h(nested_waits): nested_wait.release.remote() ray.get(nested_waits) ray.get(f.remote()) @ray.remote def g(remote_waits, nested_waits): # Sleep so that the f tasks all get submitted to the scheduler after # the g tasks. remote_wait.release.remote() # wait until every lock is released. ray.get(remote_waits) ray.get(h.remote(nested_waits)) num_root_tasks = num_cpus * 4 # Lock remote task until everything is scheduled. remote_waits = [] nested_waits = [] for _ in range(num_root_tasks): remote_waits.append(remote_wait.acquire.remote()) nested_waits.append(nested_wait.acquire.remote()) [g.remote(remote_waits, nested_waits) for _ in range(num_root_tasks)] errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR p.close()
def test_warning_actor_waiting_on_actor(shutdown_only): ray.init(num_cpus=1, _system_config={"debug_dump_period_milliseconds": 500}) p = init_error_pubsub() @ray.remote(num_cpus=1) class Actor: pass a = Actor.remote() # noqa b = Actor.remote() # noqa errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR
def test_worker_dying(ray_start_regular, error_pubsub): p = error_pubsub # Define a remote function that will kill the worker that runs it. @ray.remote(max_retries=0) def f(): eval("exit()") with pytest.raises(ray.exceptions.WorkerCrashedError): ray.get(f.remote()) errors = get_error_message(p, 1, ray_constants.WORKER_DIED_PUSH_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.WORKER_DIED_PUSH_ERROR assert "died or was killed while executing" in errors[0].error_message
def test_actor_scope_or_intentionally_killed_message(ray_start_regular, error_pubsub): p = error_pubsub @ray.remote class Actor: pass a = Actor.remote() a = Actor.remote() a.__ray_terminate__.remote() time.sleep(1) errors = get_error_message(p, 1) assert len(errors) == 0, "Should not have propogated an error - {}".format( errors)
def test_warning_for_dead_autoscaler(ray_start_regular, error_pubsub): # Terminate the autoscaler process. from ray.worker import _global_node autoscaler_process = _global_node.all_processes[ ray_constants.PROCESS_TYPE_MONITOR][0].process autoscaler_process.terminate() # Confirm that we receive an autoscaler failure error. errors = get_error_message( error_pubsub, 1, ray_constants.MONITOR_DIED_ERROR, timeout=5) assert len(errors) == 1 # Confirm that the autoscaler failure error is stored. error = _internal_kv_get(DEBUG_AUTOSCALING_ERROR) assert error is not None
def test_worker_raising_exception(ray_start_regular, error_pubsub): p = error_pubsub @ray.remote(max_calls=2) def f(): # This is the only reasonable variable we can set here that makes the # execute_task function fail after the task got executed. worker = ray.worker.global_worker worker.function_actor_manager.increase_task_counter = None # Running this task should cause the worker to raise an exception after # the task has successfully completed. f.remote() errors = get_error_message(p, 1, ray_constants.WORKER_CRASH_PUSH_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.WORKER_CRASH_PUSH_ERROR
def test_fail_importing_remote_function(ray_start_2_cpus, error_pubsub): p = error_pubsub # Create the contents of a temporary Python file. temporary_python_file = """ def temporary_helper_function(): return 1 """ f = tempfile.NamedTemporaryFile(suffix=".py") f.write(temporary_python_file.encode("ascii")) f.flush() directory = os.path.dirname(f.name) # Get the module name and strip ".py" from the end. module_name = os.path.basename(f.name)[:-3] sys.path.append(directory) module = __import__(module_name) # Define a function that closes over this temporary module. This should # fail when it is unpickled. @ray.remote def g(x, y=3): try: module.temporary_python_file() except Exception: # This test is not concerned with the error from running this # function. Only from unpickling the remote function. pass # Invoke the function so that the definition is exported. g.remote(1, y=2) errors = get_error_message( p, 2, ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR) assert errors[0].type == ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR assert "No module named" in errors[0].error_message assert "No module named" in errors[1].error_message # Check that if we try to call the function it throws an exception and # does not hang. for _ in range(10): with pytest.raises(Exception, match="This function was not imported properly."): ray.get(g.remote(1, y=2)) f.close() # Clean up the junk we added to sys.path. sys.path.pop(-1)
def test_warning_for_infeasible_zero_cpu_actor(shutdown_only): # Check that we cannot place an actor on a 0 CPU machine and that we get an # infeasibility warning (even though the actor creation task itself # requires no CPUs). ray.init(num_cpus=0) p = init_error_pubsub() @ray.remote class Foo: pass # The actor creation should be infeasible. Foo.remote() errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR p.close()
def test_put_error1(ray_start_object_store_memory, error_pubsub): p = error_pubsub num_objects = 3 object_size = 4 * 10**5 # Define a task with a single dependency, a numpy array, that returns # another array. @ray.remote def single_dependency(i, arg): arg = np.copy(arg) arg[0] = i return arg @ray.remote def put_arg_task(): # Launch num_objects instances of the remote task, each dependent # on the one before it. The result of the first task should get # evicted. args = [] arg = single_dependency.remote(0, np.zeros(object_size, dtype=np.uint8)) for i in range(num_objects): arg = single_dependency.remote(i, arg) args.append(arg) # Get the last value to force all tasks to finish. value = ray.get(args[-1]) assert value[0] == i # Get the first value (which should have been evicted) to force # reconstruction. Currently, since we're not able to reconstruct # `ray.put` objects that were evicted and whose originating tasks # are still running, this for-loop should hang and push an error to # the driver. ray.get(args[0]) put_arg_task.remote() # Make sure we receive the correct error message. errors = get_error_message(p, 1, ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.PUT_RECONSTRUCTION_PUSH_ERROR
def test_checkpointing_load_exception(ray_start_regular, error_pubsub, ray_checkpointable_actor_cls): """Test actor can still be recovered if checkpoints fail to load.""" p = error_pubsub @ray.remote(max_restarts=2) class RemoteCheckpointableActor(ray_checkpointable_actor_cls): def load_checkpoint(self, actor_id, checkpoints): raise Exception("Intentional error loading checkpoint.") actor = RemoteCheckpointableActor.remote() # Call increase 3 times, triggering a checkpoint that will succeed. expected = 0 for _ in range(3): ray.get(actor.increase.remote()) expected += 1 # Assert that the actor wasn't resumed from a checkpoint because loading # it failed. assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False # Kill actor process. kill_actor(actor) # Assert that the actor still wasn't resumed from a checkpoint and its # value is still correct. assert ray.get(actor.get.remote()) == expected assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False # Submit some more tasks. These should get replayed since they happen after # the checkpoint. for _ in range(3): ray.get(actor.increase.remote()) expected += 1 # Kill actor again, and check that restart still works and the actor # wasn't resumed from a checkpoint. kill_actor(actor) assert ray.get(actor.get.remote()) == expected assert ray.get(actor.was_resumed_from_checkpoint.remote()) is False # Check that the checkpoint error was pushed to the driver. errors = get_error_message(p, 1, ray_constants.CHECKPOINT_PUSH_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.CHECKPOINT_PUSH_ERROR
def test_failed_actor_method(ray_start_regular, error_pubsub): p = error_pubsub error_message2 = "actor method failed" @ray.remote class FailedActor: def __init__(self): pass def fail_method(self): raise Exception(error_message2) a = FailedActor.remote() # Make sure that we get errors from a failed method. a.fail_method.remote() errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.TASK_PUSH_ERROR assert error_message2 in errors[0].error_message
def test_actor_worker_dying(ray_start_regular, error_pubsub): p = error_pubsub @ray.remote class Actor: def kill(self): eval("exit()") @ray.remote def consume(x): pass a = Actor.remote() [obj], _ = ray.wait([a.kill.remote()], timeout=5) with pytest.raises(ray.exceptions.RayActorError): ray.get(obj) with pytest.raises(ray.exceptions.RayTaskError): ray.get(consume.remote(obj)) errors = get_error_message(p, 1, ray_constants.WORKER_DIED_PUSH_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.WORKER_DIED_PUSH_ERROR
def test_warning_many_actor_tasks_queued(shutdown_only): ray.init(num_cpus=1) p = init_error_pubsub() @ray.remote(num_cpus=1) class Foo: def f(self): import time time.sleep(1) a = Foo.remote() [a.f.remote() for _ in range(50000)] errors = get_error_message(p, 4, ray_constants.EXCESS_QUEUEING_WARNING) msgs = [e.error_message for e in errors] assert ("Warning: More than 5000 tasks are pending submission to actor" in msgs[0]) assert ("Warning: More than 10000 tasks are pending submission to actor" in msgs[1]) assert ("Warning: More than 20000 tasks are pending submission to actor" in msgs[2]) assert ("Warning: More than 40000 tasks are pending submission to actor" in msgs[3])
def test_warning_for_resource_deadlock(error_pubsub, shutdown_only): p = error_pubsub # Check that we get warning messages for infeasible tasks. ray.init(num_cpus=1) @ray.remote(num_cpus=1) class Foo: def f(self): return 0 @ray.remote def f(): # Creating both actors is not possible. actors = [Foo.remote() for _ in range(2)] for a in actors: ray.get(a.f.remote()) # Run in a task to check we handle the blocked task case correctly f.remote() errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR
def test_warning_all_tasks_blocked(shutdown_only): ray.init(num_cpus=1, _system_config={"debug_dump_period_milliseconds": 500}) p = init_error_pubsub() @ray.remote(num_cpus=1) class Foo: def f(self): return 0 @ray.remote def f(): # Creating both actors is not possible. actors = [Foo.remote() for _ in range(3)] for a in actors: ray.get(a.f.remote()) # Run in a task to check we handle the blocked task case correctly f.remote() errors = get_error_message(p, 1, ray_constants.RESOURCE_DEADLOCK_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.RESOURCE_DEADLOCK_ERROR
def test_warning_for_dead_node(ray_start_cluster_2_nodes, error_pubsub): cluster = ray_start_cluster_2_nodes cluster.wait_for_nodes() p = error_pubsub node_ids = {item["NodeID"] for item in ray.nodes()} # Try to make sure that the monitor has received at least one heartbeat # from the node. time.sleep(0.5) # Kill both raylets. cluster.list_all_nodes()[1].kill_raylet() cluster.list_all_nodes()[0].kill_raylet() # Check that we get warning messages for both raylets. errors = get_error_message(p, 2, ray_constants.REMOVED_NODE_ERROR, 40) # Extract the client IDs from the error messages. This will need to be # changed if the error message changes. warning_node_ids = {error.error_message.split(" ")[5] for error in errors} assert node_ids == warning_node_ids
def test_actor_scope_or_intentionally_killed_message(ray_start_regular, error_pubsub): p = error_pubsub @ray.remote class Actor: def __init__(self): # This log is added to debug a flaky test issue. print(os.getpid()) def ping(self): pass a = Actor.remote() # Without this waiting, there seems to be race condition happening # in the CI. This is not a fundamental fix for that, but it at least # makes the test less flaky. ray.get(a.ping.remote()) a = Actor.remote() a.__ray_terminate__.remote() time.sleep(1) errors = get_error_message(p, 1) assert len(errors) == 0, "Should not have propogated an error - {}".format( errors)
def test_actor_worker_dying_future_tasks(ray_start_regular, error_pubsub): p = error_pubsub @ray.remote(max_restarts=0) class Actor: def getpid(self): return os.getpid() def sleep(self): time.sleep(1) a = Actor.remote() pid = ray.get(a.getpid.remote()) tasks1 = [a.sleep.remote() for _ in range(10)] os.kill(pid, 9) time.sleep(0.1) tasks2 = [a.sleep.remote() for _ in range(10)] for obj in tasks1 + tasks2: with pytest.raises(Exception): ray.get(obj) errors = get_error_message(p, 1, ray_constants.WORKER_DIED_PUSH_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.WORKER_DIED_PUSH_ERROR
def test_failed_task(ray_start_shared_local_modes, error_pubsub): @ray.remote def throw_exception_fct1(): raise Exception("Test function 1 intentionally failed.") @ray.remote def throw_exception_fct2(): raise Exception("Test function 2 intentionally failed.") @ray.remote(num_returns=3) def throw_exception_fct3(x): raise Exception("Test function 3 intentionally failed.") p = error_pubsub throw_exception_fct1.remote() throw_exception_fct1.remote() if ray.worker.global_worker.mode != ray.worker.LOCAL_MODE: msgs = get_error_message(p, 2, ray.ray_constants.TASK_PUSH_ERROR) assert len(msgs) == 2 for msg in msgs: assert "Test function 1 intentionally failed." in msg.error_message x = throw_exception_fct2.remote() try: ray.get(x) except Exception as e: assert "Test function 2 intentionally failed." in str(e) else: # ray.get should throw an exception. assert False x, y, z = throw_exception_fct3.remote(1.0) for ref in [x, y, z]: try: ray.get(ref) except Exception as e: assert "Test function 3 intentionally failed." in str(e) else: # ray.get should throw an exception. assert False class CustomException(ValueError): def __init__(self, msg): super().__init__(msg) self.field = 1 def f(self): return 2 @ray.remote def f(): raise CustomException("This function failed.") try: ray.get(f.remote()) except Exception as e: assert "This function failed." in str(e) assert isinstance(e, ValueError) assert isinstance(e, CustomException) assert isinstance(e, ray.exceptions.RayTaskError) assert "RayTaskError(CustomException)" in repr(e) assert e.field == 1 assert e.f() == 2 else: # ray.get should throw an exception. assert False
def test_fail_importing_actor(ray_start_regular, error_pubsub): p = error_pubsub # Create the contents of a temporary Python file. temporary_python_file = """ def temporary_helper_function(): return 1 """ f = tempfile.NamedTemporaryFile(suffix=".py") f.write(temporary_python_file.encode("ascii")) f.flush() directory = os.path.dirname(f.name) # Get the module name and strip ".py" from the end. module_name = os.path.basename(f.name)[:-3] sys.path.append(directory) module = __import__(module_name) # Define an actor that closes over this temporary module. This should # fail when it is unpickled. @ray.remote class Foo: def __init__(self, arg1, arg2=3): self.x = module.temporary_python_file() def get_val(self, arg1, arg2=3): return 1 # There should be no errors yet. errors = get_error_message(p, 2) assert len(errors) == 0 # Create an actor. foo = Foo.remote(3, arg2=0) errors = get_error_message(p, 2) assert len(errors) == 2 for error in errors: # Wait for the error to arrive. if error.type == ray_constants.REGISTER_ACTOR_PUSH_ERROR: assert "No module named" in error.error_message else: # Wait for the error from when the __init__ tries to run. assert ("failed to be imported, and so cannot execute this method" in error.error_message) # Check that if we try to get the function it throws an exception and # does not hang. with pytest.raises(Exception, match="failed to be imported"): ray.get(foo.get_val.remote(1, arg2=2)) # Wait for the error from when the call to get_val. errors = get_error_message(p, 1, ray_constants.TASK_PUSH_ERROR) assert len(errors) == 1 assert errors[0].type == ray_constants.TASK_PUSH_ERROR assert ("failed to be imported, and so cannot execute this method" in errors[0].error_message) f.close() # Clean up the junk we added to sys.path. sys.path.pop(-1)
def test_error_isolation(call_ray_start): address = call_ray_start # Connect a driver to the Ray cluster. ray.init(address=address) p = init_error_pubsub() # There shouldn't be any errors yet. errors = get_error_message(p, 1, 2) assert len(errors) == 0 error_string1 = "error_string1" error_string2 = "error_string2" @ray.remote def f(): raise Exception(error_string1) # Run a remote function that throws an error. with pytest.raises(Exception): ray.get(f.remote()) # Wait for the error to appear in Redis. errors = get_error_message(p, 1) # Make sure we got the error. assert len(errors) == 1 assert error_string1 in errors[0].error_message # Start another driver and make sure that it does not receive this # error. Make the other driver throw an error, and make sure it # receives that error. driver_script = """ import ray import time from ray.test_utils import (init_error_pubsub, get_error_message) ray.init(address="{}") p = init_error_pubsub() time.sleep(1) errors = get_error_message(p, 1, 2) assert len(errors) == 0 @ray.remote def f(): raise Exception("{}") try: ray.get(f.remote()) except Exception as e: pass errors = get_error_message(p, 1) assert len(errors) == 1 assert "{}" in errors[0].error_message print("success") """.format(address, error_string2, error_string2) out = run_string_as_driver(driver_script) # Make sure the other driver succeeded. assert "success" in out # Make sure that the other error message doesn't show up for this # driver. errors = get_error_message(p, 1) assert len(errors) == 1 p.close()