def check_components_alive(cluster, component_type, check_component_alive):
    """Check that a given component type is alive on all worker nodes."""
    worker_nodes = get_other_nodes(cluster)
    assert len(worker_nodes) > 0
    for node in worker_nodes:
        process = node.all_processes[component_type][0].process
        if check_component_alive:
            assert process.poll() is None
        else:
            print(
                "waiting for "
                + component_type
                + " with PID "
                + str(process.pid)
                + " to terminate"
            )
            process.wait()
            print(
                "done waiting for "
                + component_type
                + " with PID "
                + str(process.pid)
                + " to terminate"
            )
            assert process.poll() is not None
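
# `get_other_nodes` is not defined in this section; in the Ray test suite it
# comes from the shared test utilities. A minimal sketch of the assumed
# behavior (every cluster node except the one the driver is connected to,
# optionally also excluding the head node):
def get_other_nodes(cluster, exclude_head=False):
    return [
        node
        for node in cluster.list_all_nodes()
        if node._raylet_socket_name != ray.worker._global_node._raylet_socket_name
        and (not exclude_head or node != cluster.head_node)
    ]
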

def test_bundle_recreated_when_raylet_fo_after_gcs_server_restart(
    ray_start_cluster_head_with_external_redis,
):
    cluster = ray_start_cluster_head_with_external_redis
    cluster.add_node(num_cpus=2)
    cluster.wait_for_nodes()

    # Create one placement group and make sure it is created successfully.
    placement_group = ray.util.placement_group([{"CPU": 2}])
    ray.get(placement_group.ready(), timeout=10)
    table = ray.util.placement_group_table(placement_group)
    assert table["state"] == "CREATED"

    # Restart the GCS server.
    cluster.head_node.kill_gcs_server()
    cluster.head_node.start_gcs_server()

    # Restart the raylet.
    cluster.remove_node(get_other_nodes(cluster, exclude_head=True)[-1])
    cluster.add_node(num_cpus=2)
    cluster.wait_for_nodes()

    # Schedule an actor and make sure it is created successfully.
    actor = Increase.options(
        placement_group=placement_group, placement_group_bundle_index=0
    ).remote()

    assert ray.get(actor.method.remote(1), timeout=5) == 3
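
# The test above references an `Increase` actor defined elsewhere in the
# module. Given the assertion that `method.remote(1)` returns 3, a minimal
# sketch consistent with that behavior:
@ray.remote(num_cpus=1)
class Increase:
    def method(self, x):
        # Hypothetical body; any implementation mapping 1 -> 3 would satisfy
        # the assertion above.
        return x + 2
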

def test_ray_wait_dead_actor(ray_start_cluster):
    """Tests that methods completed by dead actors are returned as ready."""
    cluster = ray_start_cluster

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            pass

        def node_id(self):
            return ray.worker.global_worker.node.unique_id

        def ping(self):
            time.sleep(1)

    # Create some actors and wait for them to initialize.
    num_nodes = len(cluster.list_all_nodes())
    actors = [Actor.remote() for _ in range(num_nodes)]
    ray.get([actor.ping.remote() for actor in actors])

    def actor_dead():
        # Ping the actors and make sure the tasks complete.
        ping_ids = [actor.ping.remote() for actor in actors]
        unready = ping_ids[:]
        while unready:
            _, unready = ray.wait(unready, timeout=0)
            time.sleep(1)

        try:
            ray.get(ping_ids)
            return False
        except ray.exceptions.RayActorError:
            return True

    # Kill a node that is neither the driver node nor the head node.
    cluster.remove_node(get_other_nodes(cluster, exclude_head=True)[-1])

    # Repeatedly submit tasks and call ray.wait until the exception for the
    # dead actor is received.
    wait_for_condition(actor_dead)

    # Create an actor on the local node that will call ray.wait in a loop.
    head_node_resource = "HEAD_NODE"
    ray.experimental.set_resource(head_node_resource, 1)

    @ray.remote(num_cpus=0, resources={head_node_resource: 1})
    class ParentActor:
        def __init__(self):
            pass

        def wait(self):
            return actor_dead()

        def ping(self):
            return

    # Repeatedly call ray.wait through the local actor until the exception for
    # the dead actor is received.
    parent_actor = ParentActor.remote()
    wait_for_condition(lambda: ray.get(parent_actor.wait.remote()))
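
# `wait_for_condition` is assumed to come from Ray's shared test utilities.
# A minimal sketch of the polling loop it is expected to implement (raise if
# the predicate never becomes true within the timeout):
def wait_for_condition(condition_predictor, timeout=10, retry_interval_ms=100):
    start = time.time()
    while time.time() - start <= timeout:
        if condition_predictor():
            return
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError("The condition wasn't met before the timeout expired.")
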

def test_actor_creation_node_failure(ray_start_cluster):
    # TODO(swang): Refactor test_raylet_failed, etc to reuse the below code.
    cluster = ray_start_cluster

    @ray.remote
    class Child:
        def __init__(self, death_probability):
            self.death_probability = death_probability

        def get_probability(self):
            return self.death_probability

        def ping(self):
            # Exit the process with some probability.
            exit_chance = np.random.rand()
            if exit_chance < self.death_probability:
                sys.exit(-1)

    num_children = 25
    # Children actors will die about half the time.
    death_probability = 0.5

    children = [Child.remote(death_probability) for _ in range(num_children)]
    while len(cluster.list_all_nodes()) > 1:
        for _ in range(2):
            # Submit some tasks on the actors. About half of the actors will
            # fail.
            children_out = [child.ping.remote() for child in children]
            # Wait a while for all the tasks to complete. This should trigger
            # reconstruction for any actor creation tasks that were forwarded
            # to nodes that then failed.
            ready, _ = ray.wait(
                children_out, num_returns=len(children_out), timeout=5 * 60.0
            )
            assert len(ready) == len(children_out)

            # Replace any actors that died.
            for i, out in enumerate(children_out):
                try:
                    ray.get(out)
                except ray.exceptions.RayActorError:
                    children[i] = Child.remote(death_probability)

            children_out = [child.get_probability.remote() for child in children]
            # Wait for the newly created actors to finish creation before
            # removing a node. This is needed because right now we don't
            # support reconstructing actors that died in the process of
            # being created.
            ready, _ = ray.wait(
                children_out, num_returns=len(children_out), timeout=5 * 60.0
            )
            assert len(ready) == len(children_out)

        # Remove a node. Any actor creation tasks that were forwarded to this
        # node must be restarted.
        cluster.remove_node(get_other_nodes(cluster, exclude_head=True)[-1])

def _test_component_failed(cluster, component_type):
    """Kill a component on all worker nodes and check workload succeeds."""
    # Submit many tasks with many dependencies.
    @ray.remote
    def f(x):
        # Sleep to make sure that tasks actually fail mid-execution.
        time.sleep(0.01)
        return x

    @ray.remote
    def g(*xs):
        # Sleep to make sure that tasks actually fail mid-execution. We
        # only use it for direct calls because the test already takes a
        # long time to run with the raylet codepath.
        time.sleep(0.01)
        return 1

    # Kill the component on all nodes except the head node as the tasks
    # execute. Do this in a loop while submitting tasks between each
    # component failure.
    time.sleep(0.1)
    worker_nodes = get_other_nodes(cluster)
    assert len(worker_nodes) > 0
    for node in worker_nodes:
        process = node.all_processes[component_type][0].process
        # Submit a round of tasks with many dependencies.
        x = 1
        for _ in range(1000):
            x = f.remote(x)

        xs = [g.remote(1)]
        for _ in range(100):
            xs.append(g.remote(*xs))
            xs.append(g.remote(1))

        # Kill a component on one of the nodes.
        process.terminate()
        time.sleep(1)
        process.kill()
        process.wait()
        assert process.poll() is not None

        # Make sure that we can still get the objects after the
        # executing tasks died.
        ray.get(x)
        ray.get(xs)
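
# A hypothetical driver sketch showing how `_test_component_failed` and
# `check_components_alive` are expected to be combined: kill one component
# type on every worker node while the workload runs, then verify that another
# component type survived. The fixture name and the choice of component
# constants here are assumptions, not part of this section.
def test_log_monitor_failed(ray_start_cluster):
    cluster = ray_start_cluster
    _test_component_failed(cluster, ray_constants.PROCESS_TYPE_LOG_MONITOR)
    check_components_alive(cluster, ray_constants.PROCESS_TYPE_RAYLET, True)
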

def test_object_reconstruction(ray_start_cluster):
    cluster = ray_start_cluster

    # Submit tasks with dependencies in plasma.
    @ray.remote
    def large_value():
        # Sleep for a bit to force tasks onto different nodes.
        time.sleep(0.1)
        return np.zeros(10 * 1024 * 1024)

    @ray.remote
    def g(x):
        return

    # Kill the component on all nodes except the head node as the tasks
    # execute. Do this in a loop while submitting tasks between each
    # component failure.
    time.sleep(0.1)
    worker_nodes = get_other_nodes(cluster)
    assert len(worker_nodes) > 0
    component_type = ray_constants.PROCESS_TYPE_RAYLET
    for node in worker_nodes:
        process = node.all_processes[component_type][0].process
        # Submit a round of tasks with many dependencies.
        num_tasks = len(worker_nodes)
        xs = [large_value.remote() for _ in range(num_tasks)]
        # Wait for the tasks to complete, then evict the objects from the
        # local node.
        for x in xs:
            ray.get(x)
            ray.internal.free([x], local_only=True)

        # Kill a component on one of the nodes.
        process.terminate()
        time.sleep(1)
        process.kill()
        process.wait()
        assert process.poll() is not None

        # Make sure that we can still get the objects after the
        # executing tasks died.
        print("F", xs)
        xs = [g.remote(x) for x in xs]
        print("G", xs)
        ray.get(xs)

def test_placement_group_reschedule_when_node_dead(
    ray_start_cluster, connect_to_client
):
    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, namespace="default_test_namespace")

    # Make sure the head node and both worker nodes are alive.
    nodes = ray.nodes()
    assert len(nodes) == 3
    assert nodes[0]["alive"] and nodes[1]["alive"] and nodes[2]["alive"]

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(
            name="name",
            strategy="SPREAD",
            bundles=[{"CPU": 2}, {"CPU": 2}, {"CPU": 2}],
        )
        actor_1 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=0,
            lifetime="detached",
        ).remote()
        actor_2 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=1,
            lifetime="detached",
        ).remote()
        actor_3 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=2,
            lifetime="detached",
        ).remote()
        ray.get(actor_1.value.remote())
        ray.get(actor_2.value.remote())
        ray.get(actor_3.value.remote())

        cluster.remove_node(get_other_nodes(cluster, exclude_head=True)[-1])
        cluster.wait_for_nodes()

        actor_4 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=0,
            lifetime="detached",
        ).remote()
        actor_5 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=1,
            lifetime="detached",
        ).remote()
        actor_6 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=2,
            lifetime="detached",
        ).remote()

        ray.get(actor_4.value.remote())
        ray.get(actor_5.value.remote())
        ray.get(actor_6.value.remote())
        placement_group_assert_no_leak([placement_group])
        ray.shutdown()
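
# `placement_group_assert_no_leak` is assumed to come from the shared
# placement-group test utilities. A minimal sketch of the expected check:
# remove every group that was created and wait until the placement group
# table reports them all as removed.
def placement_group_assert_no_leak(pgs_created):
    for pg in pgs_created:
        ray.util.remove_placement_group(pg)

    def wait_for_pg_removed():
        return all(
            entry["state"] == "REMOVED"
            for entry in ray.util.placement_group_table().values()
        )

    wait_for_condition(wait_for_pg_removed)
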
    return 1

iteration = 0
previous_ids = [1 for _ in range(100)]
start_time = time.time()
previous_time = start_time
while True:
    for _ in range(100):
        previous_ids = [f.remote(previous_id) for previous_id in previous_ids]

    ray.get(previous_ids)

    for _ in range(100):
        previous_ids = [f.remote(previous_id) for previous_id in previous_ids]

    # Remove the first non-head node.
    node_to_kill = get_other_nodes(cluster, exclude_head=True)[0]
    cluster.remove_node(node_to_kill)
    cluster.add_node()

    new_time = time.time()
    print(
        "Iteration {}:\n"
        " - Iteration time: {}.\n"
        " - Absolute time: {}.\n"
        " - Total elapsed time: {}.".format(
            iteration, new_time - previous_time, new_time, new_time - start_time
        )
    )
    update_progress({
        "iteration": iteration,