def test_autoscaler_shutdown_node_http_everynode(
    shutdown_ray, call_ray_stop_only  # noqa: F811
):
    cluster = AutoscalingCluster(
        head_resources={"CPU": 2},
        worker_node_types={
            "cpu_node": {
                "resources": {
                    "CPU": 4,
                    "IS_WORKER": 100,
                },
                "node_config": {},
                "max_workers": 1,
            },
        },
        idle_timeout_minutes=0.05,
    )
    cluster.start()

    ray.init(address="auto")
    serve.start(http_options={"location": "EveryNode"})

    @ray.remote
    class Placeholder:
        def ready(self):
            return 1

    a = Placeholder.options(resources={"IS_WORKER": 1}).remote()
    assert ray.get(a.ready.remote()) == 1

    # 2 proxies, 1 controller, and 1 placeholder actor.
    wait_for_condition(lambda: len(ray._private.state.actors()) == 4)
    assert len(ray.nodes()) == 2

    # Now make sure the placeholder actor exits.
    ray.kill(a)

    # The HTTP proxy on the worker node should exit as well, leaving only the
    # head-node proxy and the controller alive.
    wait_for_condition(
        lambda: len(
            list(
                filter(
                    lambda a: a["State"] == "ALIVE",
                    ray._private.state.actors().values(),
                )
            )
        )
        == 2
    )

    # Only the head node should exist now.
    wait_for_condition(
        lambda: len(list(filter(lambda n: n["Alive"], ray.nodes()))) == 1
    )


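# Illustrative stand-in (not the real implementation) for the
# `wait_for_condition` helper these tests import from ray._private.test_utils.
# It is included only to make the polling contract the tests rely on explicit:
# retry the predicate until it returns True or the timeout expires, then fail.
# Assumes `time` is imported at module scope.
def _wait_for_condition_sketch(
    condition_predictor, timeout=10, retry_interval_ms=100, **kwargs
):
    start = time.time()
    last_ex = None
    while time.time() - start <= timeout:
        try:
            if condition_predictor(**kwargs):
                return
        except Exception as ex:
            # Predicates are allowed to raise while the cluster converges.
            last_ex = ex
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError(
        f"The condition was not met before the timeout expired: {last_ex}"
    )

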
def test_scaledown_shared_objects(shutdown_only):
    cluster = AutoscalingCluster(
        head_resources={"CPU": 1},
        worker_node_types={
            "cpu_node": {
                "resources": {
                    "CPU": 1,
                    "object_store_memory": 100 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 0,
                "max_workers": 4,
            },
        },
        idle_timeout_minutes=0.05,
    )
    try:
        cluster.start(_system_config={"scheduler_report_pinned_bytes_only": True})
        ray.init("auto")

        # Each actor requires 1 CPU, so creating 5 of them triggers the
        # addition of CPU worker nodes beyond the single-CPU head node.
        @ray.remote(num_cpus=1)
        class Actor:
            def f(self):
                pass

            def recv(self, obj):
                pass

        actors = [Actor.remote() for _ in range(5)]
        ray.get([a.f.remote() for a in actors])
        print("All five nodes launched")

        # Verify scale-up.
        wait_for_condition(lambda: ray.cluster_resources().get("CPU", 0) == 5)

        data = ray.put(np.zeros(1024 * 1024 * 5))
        ray.get([a.recv.remote(data) for a in actors])
        print("Data broadcast successfully, deleting actors.")
        del actors

        # Verify scale-down.
        wait_for_condition(
            lambda: ray.cluster_resources().get("CPU", 0) == 1, timeout=30
        )
    finally:
        cluster.shutdown()


def test_fake_autoscaler_basic_e2e(shutdown_only):
    # __example_begin__
    cluster = AutoscalingCluster(
        head_resources={"CPU": 2},
        worker_node_types={
            "cpu_node": {
                "resources": {
                    "CPU": 4,
                    "object_store_memory": 1024 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 0,
                "max_workers": 2,
            },
            "gpu_node": {
                "resources": {
                    "CPU": 2,
                    "GPU": 1,
                    "object_store_memory": 1024 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 0,
                "max_workers": 2,
            },
        },
    )

    try:
        cluster.start()
        ray.init("auto")

        # Triggers the addition of a GPU node.
        @ray.remote(num_gpus=1)
        def f():
            print("gpu ok")

        # Triggers the addition of a CPU node.
        @ray.remote(num_cpus=3)
        def g():
            print("cpu ok")

        ray.get(f.remote())
        ray.get(g.remote())

        ray.shutdown()
    finally:
        cluster.shutdown()
    # __example_end__


def _ray_start_chaos_cluster(request):
    param = getattr(request, "param", {})
    kill_interval = param.pop("kill_interval", None)
    config = param.pop("_system_config", {})
    config.update(
        {
            "num_heartbeats_timeout": 10,
            "raylet_heartbeat_period_milliseconds": 100,
            "task_retry_delay_ms": 100,
        }
    )
    # Config of workers that are re-started.
    head_resources = param.pop("head_resources")
    worker_node_types = param.pop("worker_node_types")
    cluster = AutoscalingCluster(
        head_resources,
        worker_node_types,
        idle_timeout_minutes=10,  # Don't take down nodes.
        **param,
    )
    cluster.start(_system_config=config)
    ray.init("auto")
    nodes = ray.nodes()
    assert len(nodes) == 1

    if kill_interval is not None:
        node_killer = get_and_run_node_killer(kill_interval)

    yield cluster

    if kill_interval is not None:
        ray.get(node_killer.stop_run.remote())
        killed = ray.get(node_killer.get_total_killed_nodes.remote())
        assert len(killed) > 0

        died = {node["NodeID"] for node in ray.nodes() if not node["Alive"]}
        assert died.issubset(killed), (
            f"Raylets {died - killed} that we did not kill crashed"
        )

    ray.shutdown()
    cluster.shutdown()


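# Hypothetical sketch of the interface assumed for the node killer used by the
# chaos fixtures in this file. The real helper is `get_and_run_node_killer`
# from ray._private.test_utils; this stand-in is reconstructed only from how
# the fixtures call it (stop_run / get_total_killed_nodes) and is not the
# actual implementation.
import asyncio


@ray.remote(num_cpus=0)
class _NodeKillerSketch:
    def __init__(self, kill_interval_s):
        self.kill_interval_s = kill_interval_s
        self.killed_nodes = set()
        self.is_running = False

    async def run(self):
        # Periodically pick an alive worker node (never the head node),
        # terminate its raylet (e.g. via the ShutdownRaylet RPC shown in
        # ray_start_chaos_cluster below), and record its node ID.
        self.is_running = True
        while self.is_running:
            ...  # Node selection and kill logic elided in this sketch.
            await asyncio.sleep(self.kill_interval_s)

    def stop_run(self):
        self.is_running = False

    def get_total_killed_nodes(self):
        # The fixtures treat this as a collection of killed node IDs.
        return self.killed_nodes

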
def ray_start_chaos_cluster(request):
    """Yields the node killer actor used for chaos testing."""
    os.environ["RAY_num_heartbeats_timeout"] = "5"
    os.environ["RAY_raylet_heartbeat_period_milliseconds"] = "100"
    param = getattr(request, "param", {})
    kill_interval = param.get("kill_interval", 2)
    # Config of workers that are re-started.
    head_resources = param["head_resources"]
    worker_node_types = param["worker_node_types"]

    cluster = AutoscalingCluster(head_resources, worker_node_types)
    cluster.start()
    ray.init("auto")
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_killer = get_and_run_node_killer(kill_interval)

    yield node_killer

    assert len(ray.get(node_killer.get_total_killed_nodes.remote())) > 0
    ray.shutdown()
    cluster.shutdown()
    del os.environ["RAY_num_heartbeats_timeout"]
    del os.environ["RAY_raylet_heartbeat_period_milliseconds"]


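# `test_no_scaledown_with_spilled_objects` below relies on an `Actor` helper
# and a `check_memory` assertion that are defined elsewhere in this module.
# The following is a minimal sketch of the actor, reconstructed from its call
# sites (f/create/recv); `check_memory`, which inspects Ray's memory summary
# for spilled and plasma objects, is assumed to be provided by the surrounding
# module and is not sketched here. Note that `np.zeros(size)` allocates
# float64s, so a `size` of 10 * 1024 * 1024 yields an ~80MiB object, matching
# the comment in the test.
@ray.remote(num_cpus=1)
class Actor:
    def f(self):
        pass

    def create(self, size):
        # Returns an object of `size` float64 elements (8 * size bytes).
        return np.zeros(size)

    def recv(self, obj):
        pass

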
def test_no_scaledown_with_spilled_objects(shutdown_only):
    cluster = AutoscalingCluster(
        head_resources={"CPU": 0},
        worker_node_types={
            "cpu_node": {
                "resources": {
                    "CPU": 1,
                    "object_store_memory": 75 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 0,
                "max_workers": 2,
            },
        },
        idle_timeout_minutes=0.05,
    )
    try:
        cluster.start(
            _system_config={
                "scheduler_report_pinned_bytes_only": True,
                "min_spilling_size": 0,
            }
        )
        ray.init("auto")

        actors = [Actor.remote() for _ in range(2)]
        ray.get([a.f.remote() for a in actors])

        # Verify scale-up.
        wait_for_condition(lambda: ray.cluster_resources().get("CPU", 0) == 2)
        print("All nodes launched")

        # Put 10 x 80MiB objects into the object store with a 75MiB memory limit.
        obj_size = 10 * 1024 * 1024
        objs = []
        for i in range(10):
            obj = actors[0].create.remote(obj_size)
            ray.get(actors[1].recv.remote(obj))
            objs.append(obj)
            print(f"obj {i}={obj.hex()}")
            del obj

        # At least 9 out of the 10 objects should have spilled.
        check_memory([obj.hex() for obj in objs], num_spilled_objects=9)
        print("Objects spilled, deleting actors and object references.")

        # Assume the 1st object always gets spilled.
        spilled_obj = objs[0]
        del objs
        del actors

        # Verify scale-down to 1 node.
        def scaledown_to_one():
            cpu = ray.cluster_resources().get("CPU", 0)
            assert cpu > 0, "Scale-down should keep at least 1 node"
            return cpu == 1

        wait_for_condition(scaledown_to_one, timeout=30)

        # Verify the spilled object still exists and there is no object in the
        # plasma store.
        check_memory([spilled_obj.hex()], num_plasma_objects=0)

        # Delete the spilled object; the remaining worker node should then be
        # scaled down.
        del spilled_obj
        wait_for_condition(lambda: ray.cluster_resources().get("CPU", 0) == 0)
        check_memory([], num_plasma_objects=0)
    finally:
        cluster.shutdown()


def test_demand_report_when_scale_up(shutdown_only):
    # https://github.com/ray-project/ray/issues/22122
    from ray.cluster_utils import AutoscalingCluster

    cluster = AutoscalingCluster(
        head_resources={"CPU": 0},
        worker_node_types={
            "cpu_node": {
                "resources": {
                    "CPU": 1,
                    "object_store_memory": 1024 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 10,
                "max_workers": 10,
            },
        },
    )

    cluster.start()
    info = ray.init("auto")

    @ray.remote
    def f():
        time.sleep(10000)

    @ray.remote
    def g():
        ray.get(h.remote())

    @ray.remote
    def h():
        time.sleep(10000)

    tasks = [f.remote() for _ in range(5000)] + [  # noqa: F841
        g.remote() for _ in range(5000)
    ]

    global_state_accessor = make_global_state_accessor(info)

    def check_backlog_info():
        message = global_state_accessor.get_all_resource_usage()
        if message is None:
            return False

        resource_usage = gcs_utils.ResourceUsageBatchData.FromString(message)
        aggregate_resource_load = resource_usage.resource_load_by_shape.resource_demands
        if len(aggregate_resource_load) != 1:
            return False

        (backlog_size, num_ready_requests_queued, shape) = (
            aggregate_resource_load[0].backlog_size,
            aggregate_resource_load[0].num_ready_requests_queued,
            aggregate_resource_load[0].shape,
        )
        if backlog_size + num_ready_requests_queued != 9990:
            return False
        if shape != {"CPU": 1.0}:
            return False
        return True

    # This can be slow (e.g. under ASAN), so wait up to 20s for the cluster to
    # come up and the backlog to be reported.
    wait_for_condition(check_backlog_info, 20)
    cluster.shutdown()


def test_demand_report_for_node_affinity_scheduling_strategy(shutdown_only):
    from ray.cluster_utils import AutoscalingCluster

    cluster = AutoscalingCluster(
        head_resources={"CPU": 0},
        worker_node_types={
            "cpu_node": {
                "resources": {
                    "CPU": 1,
                    "object_store_memory": 1024 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 1,
                "max_workers": 1,
            },
        },
    )

    cluster.start()
    info = ray.init(address="auto")

    @ray.remote(num_cpus=1)
    def f(sleep_s):
        time.sleep(sleep_s)
        return ray.get_runtime_context().node_id

    worker_node_id = ray.get(f.remote(0))

    tasks = []
    tasks.append(f.remote(10000))
    # This is not reported since there is a feasible node.
    tasks.append(
        f.options(
            scheduling_strategy=NodeAffinitySchedulingStrategy(
                worker_node_id, soft=False
            )
        ).remote(0)
    )
    # This is reported since there is no feasible node and soft is True.
    tasks.append(
        f.options(
            num_gpus=1,
            scheduling_strategy=NodeAffinitySchedulingStrategy(
                ray.NodeID.from_random().hex(), soft=True
            ),
        ).remote(0)
    )

    global_state_accessor = make_global_state_accessor(info)

    def check_resource_demand():
        message = global_state_accessor.get_all_resource_usage()
        if message is None:
            return False

        resource_usage = gcs_utils.ResourceUsageBatchData.FromString(message)
        aggregate_resource_load = resource_usage.resource_load_by_shape.resource_demands
        if len(aggregate_resource_load) != 1:
            return False

        if aggregate_resource_load[0].num_infeasible_requests_queued != 1:
            return False

        if aggregate_resource_load[0].shape != {"CPU": 1.0, "GPU": 1.0}:
            return False

        return True

    wait_for_condition(check_resource_demand, 20)
    cluster.shutdown()


def ray_start_chaos_cluster(request):
    """Yields the chaos thread.

    Run chaos_thread.start() to start the chaos testing.

    NOTE: `cluster` is not thread-safe. `cluster` shouldn't be modified by
    another thread once chaos_thread.start() is called.
    """
    os.environ["RAY_num_heartbeats_timeout"] = "5"
    os.environ["RAY_raylet_heartbeat_period_milliseconds"] = "100"
    param = getattr(request, "param", {})
    kill_interval = param.get("kill_interval", 2)
    # Config of workers that are re-started.
    head_resources = param["head_resources"]
    worker_node_types = param["worker_node_types"]
    timeout = param["timeout"]

    # Use the shutdown RPC instead of signals because we can't
    # raise a signal in a non-main thread.
    def kill_raylet(ip, port, graceful=False):
        raylet_address = f"{ip}:{port}"
        channel = grpc.insecure_channel(raylet_address)
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        print(f"Sending a shutdown request to {ip}:{port}")
        stub.ShutdownRaylet(
            node_manager_pb2.ShutdownRayletRequest(graceful=graceful)
        )

    cluster = AutoscalingCluster(head_resources, worker_node_types)
    cluster.start()
    ray.init("auto")
    nodes = ray.nodes()
    assert len(nodes) == 1
    head_node_port = nodes[0]["NodeManagerPort"]
    killed_port = set()

    def run_chaos_cluster():
        start = time.time()
        while True:
            node_to_kill_ip = None
            node_to_kill_port = None
            for node in ray.nodes():
                addr = node["NodeManagerAddress"]
                port = node["NodeManagerPort"]
                if (
                    node["Alive"]
                    and port != head_node_port
                    and port not in killed_port
                ):
                    node_to_kill_ip = addr
                    node_to_kill_port = port
                    break

            if node_to_kill_port is not None:
                kill_raylet(node_to_kill_ip, node_to_kill_port, graceful=False)
                killed_port.add(node_to_kill_port)
            time.sleep(kill_interval)
            print(len(ray.nodes()))
            if time.time() - start > timeout:
                break

        assert len(killed_port) > 0, (
            "No nodes were killed by the conftest. This is a bug."
        )

    chaos_thread = threading.Thread(target=run_chaos_cluster)
    yield chaos_thread
    chaos_thread.join()
    ray.shutdown()
    cluster.shutdown()
    del os.environ["RAY_num_heartbeats_timeout"]
    del os.environ["RAY_raylet_heartbeat_period_milliseconds"]


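# Hypothetical usage sketch (not part of the original suite) showing how a test
# could consume the chaos-thread variant of ray_start_chaos_cluster defined
# immediately above: the fixture reads its parameters via indirect
# parametrization, and the test starts the chaos thread once its workload is
# submitted. Assumes `pytest`, `ray`, and `time` are imported at module scope;
# the test name and parameter values are illustrative only.
@pytest.mark.parametrize(
    "ray_start_chaos_cluster",
    [
        {
            "kill_interval": 5,
            "timeout": 30,
            "head_resources": {"CPU": 0},
            "worker_node_types": {
                "cpu_node": {
                    "resources": {"CPU": 2},
                    "node_config": {},
                    "min_workers": 0,
                    "max_workers": 4,
                },
            },
        }
    ],
    indirect=True,
)
def test_chaos_example_sketch(ray_start_chaos_cluster):
    chaos_thread = ray_start_chaos_cluster

    # Retried tasks keep CPU demand alive so killed worker nodes are replaced.
    @ray.remote(max_retries=-1)
    def work():
        time.sleep(0.1)
        return 1

    refs = [work.remote() for _ in range(200)]
    chaos_thread.start()  # Begin killing worker raylets while tasks run.
    assert sum(ray.get(refs)) == 200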