def test_task_crash_without_get(ray_start):
    """A failed task yields an ErrorSignal even though ray.get() is never called."""

    @ray.remote
    def crashing_function():
        raise Exception("exception message")

    # The object ID returned by the failed task is used as the signal source.
    failed_task_id = crashing_function.remote()
    received = signal.receive([failed_task_id], timeout=5)

    # Exactly one entry is expected; its second element is the signal object.
    assert len(received) == 1
    first_entry = received[0]
    assert type(first_entry[1]) == signal.ErrorSignal
def receive_all_signals(sources, timeout):
    """Drain signals from ``sources`` until a receive call comes back empty.

    ``signal.receive`` is called repeatedly with the given ``timeout``; every
    non-empty batch is appended to the accumulated list. The first empty
    batch ends the loop and the accumulated list is returned.
    """
    collected = []
    while True:
        batch = signal.receive(sources, timeout=timeout)
        if not batch:
            # Nothing arrived within the timeout window: stop polling.
            break
        collected.extend(batch)
    return collected
def test_task_crash_without_get(ray_start_regular):
    """Receive an ErrorSignal from a crashed task without calling ray.get()."""

    @ray.remote
    def crashing_function():
        raise Exception("exception message")

    crashed_id = crashing_function.remote()

    # Listen on the crashed task's object ID for up to five seconds.
    signals = signal.receive([crashed_id], timeout=5)
    assert len(signals) == 1

    # Each entry is indexed as (source, signal); only the signal is checked.
    reported = signals[0][1]
    assert type(reported) == signal.ErrorSignal
def test_task_to_driver(ray_start):
    """Send a user signal from a remote task and receive it in the driver."""

    @ray.remote
    def task_send_signal(value):
        signal.send(UserSignal(value))

    signal_value = "simple signal"
    object_id = task_send_signal.remote(signal_value)
    result_list = signal.receive([object_id], timeout=10)

    # Verify the count before indexing: if nothing was received, the test
    # should fail with a clear assertion rather than an IndexError.
    assert len(result_list) == 1
    print(result_list[0][1])
def test_task_crash(ray_start):
    """ray.get() on a failed task raises RayTaskError and an ErrorSignal is sent."""

    @ray.remote
    def crashing_function():
        raise Exception("exception message")

    object_id = crashing_function.remote()

    try:
        ray.get(object_id)
    except Exception as e:
        assert isinstance(e, ray.exceptions.RayTaskError)
    else:
        # Without this branch the test would silently pass when ray.get()
        # returns normally for a task that was supposed to crash.
        raise AssertionError("ray.get() did not raise for a crashed task")

    # Checked outside of a ``finally`` clause so that a failure above is
    # reported as-is instead of being masked by a second assertion error.
    result_list = signal.receive([object_id], timeout=5)
    assert len(result_list) == 1
    assert isinstance(result_list[0][1], signal.ErrorSignal)
def test_actor_crash_init(ray_start):
    """An actor whose __init__ raises produces exactly one ErrorSignal."""

    @ray.remote
    class ActorCrashInit(object):
        def __init__(self):
            raise Exception("exception message")

        def m(self):
            return 1

    # The constructor failure is deliberately not caught here; the actor
    # handle itself is used as the signal source.
    crashing_actor = ActorCrashInit.remote()
    received = signal.receive([crashing_actor], timeout=5)

    assert len(received) == 1
    init_failure = received[0][1]
    assert type(init_failure) == signal.ErrorSignal
def test_actor_crash_init3(ray_start):
    """Receive an error after __init__ fails and another method is invoked.

    NOTE(review): the scenario describes two errors (failed ``__init__`` and
    the subsequent method call), but only a single signal is asserted by this
    test. Confirm whether that is intended.
    """

    @ray.remote
    class ActorCrashInit(object):
        def __init__(self):
            raise Exception("exception message")

        def method(self):
            return 1

    broken_actor = ActorCrashInit.remote()
    # Invoke a method on the actor whose constructor raised; the returned
    # object ID is intentionally discarded.
    broken_actor.method.remote()

    received = signal.receive([broken_actor], timeout=10)
    assert len(received) == 1
    first_signal = received[0][1]
    assert type(first_signal) == signal.ErrorSignal
def test_actor_crash_init3(ray_start_regular):
    """Expect two signals: one for the failed __init__, one for a later call.

    Only the type of the first received signal is verified.
    """

    @ray.remote
    class ActorCrashInit:
        def __init__(self):
            raise Exception("exception message")

        def method(self):
            return 1

    broken_actor = ActorCrashInit.remote()
    broken_actor.method.remote()

    # Give the method invocation time to finish and generate its own error
    # before polling for signals.
    time.sleep(10)

    received = signal.receive([broken_actor], timeout=5)
    assert len(received) == 2
    first_signal = received[0][1]
    assert type(first_signal) == signal.ErrorSignal
def test_actor_crash_init3(ray_start):
    """Collect both errors: the failed __init__ and the follow-up method call.

    The test asserts that two signals arrive and checks the type of the
    first one only.
    """

    @ray.remote
    class ActorCrashInit(object):
        def __init__(self):
            raise Exception("exception message")

        def method(self):
            return 1

    failing_actor = ActorCrashInit.remote()
    # The result of the method call is not needed; only its error signal is.
    failing_actor.method.remote()

    # Wait for the method call to finish and generate an error.
    time.sleep(10)

    signals_seen = signal.receive([failing_actor], timeout=5)
    assert len(signals_seen) == 2
    leading_signal = signals_seen[0][1]
    assert type(leading_signal) == signal.ErrorSignal
def test_actor_crash(ray_start):
    """ray.get() on a failed actor method raises and an ErrorSignal is sent."""

    @ray.remote
    class Actor(object):
        def __init__(self):
            pass

        def crash(self):
            raise Exception("exception message")

    a = Actor.remote()

    try:
        ray.get(a.crash.remote())
    except Exception as e:
        assert isinstance(e, ray.exceptions.RayTaskError)
    else:
        # Without this branch the test would silently pass when the crashing
        # method's result could be fetched without an exception.
        raise AssertionError("ray.get() did not raise for a crashed method")

    # Checked outside of a ``finally`` clause so that a failure above is
    # reported as-is instead of being masked by a second assertion error.
    result_list = signal.receive([a], timeout=5)
    assert len(result_list) == 1
    assert isinstance(result_list[0][1], signal.ErrorSignal)
def test_send_signals_from_actor_to_driver(ray_start):
    """Send ``count`` signals from an actor and receive all of them in the driver."""

    @ray.remote
    class ActorSendSignals(object):
        def __init__(self):
            pass

        def send_signals(self, value, count):
            for i in range(count):
                signal.send(UserSignal(value + str(i)))

    a = ActorSendSignals.remote()
    signal_value = "simple signal"
    count = 20
    a.send_signals.remote(signal_value, count)

    received_count = 0
    while received_count < count:
        result_list = signal.receive([a], timeout=5)
        if not result_list:
            # The receive call timed out with nothing new. Stop polling so
            # that missing signals fail the test instead of hanging it.
            break
        received_count += len(result_list)

    # Fails both when signals are missing and when too many were delivered.
    assert received_count == count
def test_signal_on_node_failure(two_node_cluster):
    """Removing the node that hosts an actor yields an ActorDiedSignal."""

    class ActorSignal(object):
        def __init__(self):
            pass

        def node_id(self):
            return ray.worker.global_worker.node.unique_id

    cluster, remote_node = two_node_cluster
    actor_cls = ray.remote(max_reconstructions=0)(ActorSignal)

    # Keep creating actors until one reports the remote node's ID as its host.
    actor = actor_cls.remote()
    while True:
        hosting_node = ray.get(actor.node_id.remote())
        if hosting_node != remote_node.unique_id:
            actor = actor_cls.remote()
            continue
        break

    # Take down the node the actor was placed on.
    cluster.remove_node(remote_node)

    # The driver should receive a single signal from the actor on the removed
    # node, and that signal must be an ActorDiedSignal.
    received = signal.receive([actor], timeout=10)
    assert len(received) == 1
    death_notice = received[0][1]
    assert type(death_notice) == signal.ActorDiedSignal