def test_checkpoint_distributed_handle(ray_start_cluster_2_nodes):
    """Check a checkpointing counter actor after a plasma store is killed.

    Several remote tasks drive the same actor handle concurrently. The plasma
    store of the first non-head node is then killed, and the test asserts
    that the actor reports a restore and that the next increment continues
    from the expected total.
    """
    cluster = ray_start_cluster_2_nodes
    counter, ids = setup_counter_actor(test_checkpoint=True)

    @ray.remote
    def fork_many_incs(counter, num_incs):
        last_ref = None
        for _ in range(num_incs):
            last_ref = counter.inc.remote()
        # Block only on the final task that was submitted.
        return ray.get(last_ref)

    num_incs = 100
    num_iters = 10

    # Starting count, taken from the last id returned by the setup helper.
    count = ray.get(ids[-1])

    # Fork num_iters tasks, each submitting num_incs increments.
    forks = [
        fork_many_incs.remote(counter, num_incs) for _ in range(num_iters)
    ]
    ray.wait(forks, num_returns=len(forks))
    expected_count = count + num_incs * num_iters

    # Kill the second plasma store to get rid of the cached objects and
    # trigger the corresponding raylet to exit.
    non_head_node = get_non_head_nodes(cluster)[0]
    non_head_node.kill_plasma_store(wait=True)

    # The actor must report that it was restored from a checkpoint.
    restored = ray.get(counter.test_restore.remote())
    assert restored

    # A further call on the actor must still yield the correct counter value.
    assert ray.get(counter.inc.remote()) == expected_count + 1
def test_remote_checkpoint_distributed_handle(ray_start_cluster_2_nodes):
    """Check restore after an explicitly requested remote checkpoint.

    Several remote tasks drive the same actor handle, then a checkpoint is
    requested through ``__ray_checkpoint__``. After the plasma store of the
    first non-head node is killed, the test asserts that the actor reports a
    restore, that it has seen no increments since initialization, and that
    the next increment continues from the expected total.
    """
    cluster = ray_start_cluster_2_nodes
    counter, ids = setup_counter_actor(test_checkpoint=True)

    @ray.remote
    def fork_many_incs(counter, num_incs):
        last_ref = None
        for _ in range(num_incs):
            last_ref = counter.inc.remote()
        # Block only on the final task that was submitted.
        return ray.get(last_ref)

    num_incs = 100
    num_iters = 10

    # Starting count, taken from the last id returned by the setup helper.
    count = ray.get(ids[-1])

    # Fork num_iters tasks, each submitting num_incs increments.
    forks = [
        fork_many_incs.remote(counter, num_incs) for _ in range(num_iters)
    ]
    ray.wait(forks, num_returns=len(forks))

    # Request a checkpoint on the actor and wait for that call to finish.
    checkpoint_ref = counter.__ray_checkpoint__.remote()
    ray.wait([checkpoint_ref])
    expected_count = count + num_incs * num_iters

    # Kill the second plasma store to get rid of the cached objects and
    # trigger the corresponding raylet to exit.
    # TODO: kill raylet instead once this test is not skipped.
    non_head_node = get_non_head_nodes(cluster)[0]
    non_head_node.kill_plasma_store(wait=True)

    # The actor must report that it was restored from a checkpoint.
    restored = ray.get(counter.test_restore.remote())
    assert restored

    # No inc call can have happened since the remote checkpoint, so the
    # number of inc calls since actor initialization must be exactly zero.
    num_inc_calls = ray.get(counter.get_num_inc_calls.remote())
    assert num_inc_calls == 0

    # A further call on the actor must still yield the correct counter value.
    assert ray.get(counter.inc.remote()) == expected_count + 1
def test_actor_restart_on_node_failure(ray_start_cluster):
    """Check that an actor is restarted when its node is removed.

    A restartable actor is placed on the only node that has a CPU, 100
    ``increase`` calls are submitted, and that node is removed and replaced
    while the calls are outstanding. The test then asserts that every call
    produced a result, that results are in order before and after any
    restart, and that the actor is still callable.
    """
    internal_settings = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_timeout_milliseconds": 100,
        "initial_reconstruction_timeout_milliseconds": 1000,
        "task_retry_delay_ms": 100,
    }
    config = json.dumps(internal_settings)

    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _internal_config=config)
    # Node to place the actor.
    cluster.add_node(num_cpus=1, _internal_config=config)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    @ray.remote(num_cpus=1, max_restarts=1, max_task_retries=-1)
    class RestartableActor:
        """An actor that will be reconstructed at most once."""

        def __init__(self):
            self.value = 0

        def increase(self):
            self.value += 1
            return self.value

        def ready(self):
            return

    actor = RestartableActor.remote()
    ray.get(actor.ready.remote())
    pending = [actor.increase.remote() for _ in range(100)]

    # Remove the actor's node while the calls above are still outstanding,
    # then bring up a replacement node with the same resources.
    actor_node = get_non_head_nodes(cluster)[-1]
    cluster.remove_node(actor_node)
    cluster.add_node(num_cpus=1, _internal_config=config)
    cluster.wait_for_nodes()

    # Fetching every result checks that none of the tasks failed.
    results = ray.get(pending)

    # Make sure that all tasks were executed in order before and after the
    # actor's death. Without a restart the i-th result is i + 1. A fresh
    # actor starts again from value 0, so from the first mismatch onwards
    # each result is expected to trail by that index.
    failed_task_index = None
    for index, res in enumerate(results):
        expected = index + 1
        if res == expected:
            continue
        if failed_task_index is None:
            failed_task_index = index
        assert res + failed_task_index == expected

    # The actor must still be callable: either it has just restarted (1) or
    # it continues after the last collected result.
    result = ray.get(actor.increase.remote())
    assert result in (1, results[-1] + 1)
# Endless workload: chain remote tasks over 100 values, then remove the first
# non-head node and add a fresh one while a second round of chained tasks is
# still outstanding. Timing information is printed after every iteration.
iteration = 0
previous_ids = [1] * 100
start_time = time.time()
previous_time = start_time

while True:
    # First round: 100 chained submissions per slot, then wait for them all.
    for _ in range(100):
        previous_ids = [f.remote(previous_id) for previous_id in previous_ids]
    ray.get(previous_ids)

    # Second round: submitted but deliberately not awaited before the node
    # swap below.
    for _ in range(100):
        previous_ids = [f.remote(previous_id) for previous_id in previous_ids]

    # Remove the first non-head node.
    node_to_kill = get_non_head_nodes(cluster)[0]
    cluster.remove_node(node_to_kill)
    cluster.add_node()

    new_time = time.time()
    report = ("Iteration {}:\n"
              " - Iteration time: {}.\n"
              " - Absolute time: {}.\n"
              " - Total elapsed time: {}.")
    print(
        report.format(iteration, new_time - previous_time, new_time,
                      new_time - start_time))
    previous_time = new_time
    iteration += 1