def _get_target_nodes(self) -> List[Tuple[str, str]]:
    """Return the list of (id, resource_key) to deploy HTTP servers on.

    The selection is driven by ``self._config.location``:

    * ``DeploymentMode.NoServer``: an empty list.
    * ``DeploymentMode.HeadOnly``: at most one node, the first whose
      resource key equals ``get_current_node_resource_key()``.
    * ``DeploymentMode.FixedNumber``: a sample of
      ``self._config.fixed_number_replicas`` nodes (capped at the number of
      known nodes), drawn deterministically from
      ``self._config.fixed_number_selection_seed``.
    * Any other value: every node returned by ``get_all_node_ids()``.
    """
    location = self._config.location
    target_nodes = get_all_node_ids()

    if location == DeploymentMode.NoServer:
        return []

    if location == DeploymentMode.HeadOnly:
        head_node_resource_key = get_current_node_resource_key()
        return [(node_id, node_resource)
                for node_id, node_resource in target_nodes
                if node_resource == head_node_resource_key][:1]

    if location == DeploymentMode.FixedNumber:
        num_replicas = self._config.fixed_number_replicas
        if num_replicas > len(target_nodes):
            logger.warning(
                "You specified fixed_number_replicas="
                f"{num_replicas} but there are only "
                f"{len(target_nodes)} total nodes. Serve will start one "
                "HTTP proxy per node.")
            num_replicas = len(target_nodes)

        # Sample from a private generator seeded with the configured seed so
        # the same set of nodes is always returned, without reseeding the
        # process-wide ``random`` state that other code may rely on.
        rng = random.Random(self._config.fixed_number_selection_seed)
        return rng.sample(sorted(target_nodes), k=num_replicas)

    return target_nodes
def get_proxy_names():
    """Build the proxy actor name for every node from ``get_all_node_ids()``.

    Each name is produced by ``format_actor_name`` from ``SERVE_PROXY_NAME``,
    the client's controller name and the node id.
    """
    return [
        format_actor_name(SERVE_PROXY_NAME, client._controller_name, node_id)
        for node_id, _ in get_all_node_ids()
    ]
def _start_routers_if_needed(self, http_host: str, http_port: str,
                             http_middlewares: List[Any]) -> None:
    """Ensure every known node has a router handle in ``self.routers_cache``.

    For each node that is not cached yet, an existing named actor is reused
    when ``ray.get_actor`` finds one; otherwise a new ``HTTPProxyActor`` is
    started and bound to that node through its resource key.
    """
    for node_id, node_resource in get_all_node_ids():
        if node_id not in self.routers_cache:
            actor_name = format_actor_name(SERVE_PROXY_NAME,
                                           self.controller_name, node_id)
            try:
                handle = ray.get_actor(actor_name)
            except ValueError:
                # No actor with this name exists yet, so create one.
                logger.info("Starting router with name '{}' on node '{}' "
                            "listening on '{}:{}'".format(
                                actor_name, node_id, http_host, http_port))
                actor_options = dict(
                    name=actor_name,
                    lifetime="detached" if self.detached else None,
                    max_concurrency=ASYNC_CONCURRENCY,
                    max_restarts=-1,
                    max_task_retries=-1,
                    # A small request on the node's own resource key places
                    # the actor on that node.
                    resources={node_resource: 0.01},
                )
                handle = HTTPProxyActor.options(**actor_options).remote(
                    http_host,
                    http_port,
                    controller_name=self.controller_name,
                    http_middlewares=http_middlewares)

            self.routers_cache[node_id] = handle
def _start_routers_if_needed(self):
    """Ensure every known node has a router handle in ``self.routers``.

    For each node that is not tracked yet, an existing named actor is reused
    when ``ray.get_actor`` finds one; otherwise a new ``HTTPProxyActor`` is
    started and bound to that node through its resource key.
    """
    for node_id, node_resource in get_all_node_ids():
        if node_id not in self.routers:
            actor_name = format_actor_name(SERVE_PROXY_NAME,
                                           self.instance_name, node_id)
            try:
                handle = ray.get_actor(actor_name)
            except ValueError:
                # No actor with this name exists yet, so create one.
                logger.info("Starting router with name '{}' on node '{}' "
                            "listening on '{}:{}'".format(
                                actor_name, node_id, self.http_host,
                                self.http_port))
                actor_options = dict(
                    name=actor_name,
                    max_concurrency=ASYNC_CONCURRENCY,
                    max_restarts=-1,
                    max_task_retries=-1,
                    # A small request on the node's own resource key places
                    # the actor on that node.
                    resources={node_resource: 0.01},
                )
                handle = HTTPProxyActor.options(**actor_options).remote(
                    node_id,
                    self.http_host,
                    self.http_port,
                    instance_name=self.instance_name,
                    _http_middlewares=self._http_middlewares)

            self.routers[node_id] = handle
def _start_proxies_if_needed(self) -> None:
    """Ensure every known node has a proxy handle in ``self._proxy_actors``.

    Does nothing when ``self._config.host`` is None. Otherwise, for each node
    that is not tracked yet, an existing named actor is reused when
    ``ray.get_actor`` finds one, or a new ``HTTPProxyActor`` is started and
    bound to that node through its resource key.
    """
    host = self._config.host
    if host is None:
        return

    for node_id, node_resource in get_all_node_ids():
        if node_id not in self._proxy_actors:
            actor_name = format_actor_name(SERVE_PROXY_NAME,
                                           self._controller_name, node_id)
            try:
                handle = ray.get_actor(actor_name)
            except ValueError:
                # No actor with this name exists yet, so create one.
                logger.info("Starting HTTP proxy with name '{}' on node '{}' "
                            "listening on '{}:{}'".format(
                                actor_name, node_id, self._config.host,
                                self._config.port))
                actor_options = dict(
                    name=actor_name,
                    lifetime="detached" if self._detached else None,
                    max_concurrency=ASYNC_CONCURRENCY,
                    max_restarts=-1,
                    max_task_retries=-1,
                    # A small request on the node's own resource key places
                    # the actor on that node.
                    resources={node_resource: 0.01},
                )
                handle = HTTPProxyActor.options(**actor_options).remote(
                    self._config.host,
                    self._config.port,
                    controller_name=self._controller_name,
                    http_middlewares=self._config.middlewares)

            self._proxy_actors[node_id] = handle
def test_shutdown(ray_shutdown):
    """Check that ``serve.shutdown()`` removes the controller and proxy actors."""
    ray.init(num_cpus=16)
    serve.start(http_options=dict(port=8003))

    @serve.deployment
    def f():
        pass

    f.deploy()

    serve_controller_name = serve.context._global_client._controller_name
    proxy_name = format_actor_name(
        SERVE_PROXY_NAME,
        serve_controller_name,
        get_all_node_ids()[0][0],
    )
    actor_names = [serve_controller_name, proxy_name]

    def actor_exists(actor_name):
        # The controller is looked up in the current runtime namespace; any
        # other actor is looked up without an explicit namespace.
        try:
            if actor_name == serve_controller_name:
                ray.get_actor(
                    actor_name,
                    namespace=ray.get_runtime_context().namespace)
            else:
                ray.get_actor(actor_name)
        except ValueError:
            return False
        return True

    def check_alive():
        # A list (not a generator) so that every actor is probed.
        return all([actor_exists(name) for name in actor_names])

    def check_dead():
        # Stops at the first actor that can still be found.
        return not any(actor_exists(name) for name in actor_names)

    wait_for_condition(check_alive)

    serve.shutdown()
    with pytest.raises(RayServeException):
        serve.list_deployments()

    wait_for_condition(check_dead)
def _stop_proxies_if_needed(self) -> bool:
    """Removes proxy actors from any nodes that no longer exist.

    A tracked node counts as removed when its id is absent from
    ``get_all_node_ids()``. For each such node the proxy handle and its
    recorded name are dropped and the actor is killed without restart.

    Returns:
        True if at least one proxy actor was removed, False otherwise.
    """
    all_node_ids = {node_id for node_id, _ in get_all_node_ids()}

    # Collect first: ``self._proxy_actors`` must not change size while it is
    # being iterated.
    to_stop = []
    for node_id in self._proxy_actors:
        if node_id not in all_node_ids:
            logger.info(
                "Removing HTTP proxy on removed node '{}'.".format(node_id))
            to_stop.append(node_id)

    for node_id in to_stop:
        proxy = self._proxy_actors.pop(node_id)
        del self._proxy_actor_names[node_id]
        ray.kill(proxy, no_restart=True)

    # Honour the declared ``-> bool`` contract instead of implicitly
    # returning None.
    return bool(to_stop)
def _get_target_nodes(self) -> List[Tuple[str, str]]:
    """Return the (id, resource_key) pairs that should host an HTTP server.

    ``DeploymentMode.NoServer`` yields no nodes. ``DeploymentMode.HeadOnly``
    yields at most one node: the first whose resource key equals
    ``get_current_node_resource_key()``. Any other mode yields every node
    from ``get_all_node_ids()``.
    """
    mode = self._config.location
    all_nodes = get_all_node_ids()

    if mode == DeploymentMode.NoServer:
        return []

    if mode == DeploymentMode.HeadOnly:
        head_key = get_current_node_resource_key()
        matches = []
        for node_id, node_resource in all_nodes:
            if node_resource == head_key:
                matches.append((node_id, node_resource))
        return matches[:1]

    return all_nodes
def test_shutdown(ray_shutdown):
    """Check that ``serve.shutdown()`` removes the controller and proxy actors."""
    ray.init(num_cpus=16)
    serve.start(http_port=8003)

    @serve.deployment
    def f():
        pass

    f.deploy()

    controller_name = serve.api._global_client._controller_name
    proxy_name = format_actor_name(SERVE_PROXY_NAME, controller_name,
                                   get_all_node_ids()[0][0])
    actor_names = [controller_name, proxy_name]

    def actor_exists(actor_name):
        # ``ray.get_actor`` raising ValueError is treated as "not found".
        try:
            ray.get_actor(actor_name)
        except ValueError:
            return False
        return True

    def check_alive():
        # A list (not a generator) so that every actor is probed.
        return all([actor_exists(name) for name in actor_names])

    def check_dead():
        # Stops at the first actor that can still be found.
        return not any(actor_exists(name) for name in actor_names)

    wait_for_condition(check_alive)

    serve.shutdown()
    with pytest.raises(RayServeException):
        serve.list_backends()

    wait_for_condition(check_dead)
def _stop_routers_if_needed(self) -> bool:
    """Kill cached routers whose node is no longer in ``get_all_node_ids()``.

    Returns:
        True when at least one router was removed (a checkpoint should be
        taken), False otherwise.
    """
    live_node_ids = {node_id for node_id, _ in get_all_node_ids()}

    # Snapshot the stale ids first: the cache is mutated while killing.
    stale_node_ids = [
        node_id for node_id in self.routers_cache
        if node_id not in live_node_ids
    ]

    for node_id in stale_node_ids:
        logger.info("Removing router on removed node '{}'.".format(node_id))

    for node_id in stale_node_ids:
        ray.kill(self.routers_cache.pop(node_id), no_restart=True)

    return bool(stale_node_ids)
def _get_target_nodes(self) -> List[Tuple[str, str]]:
    """Return the list of (node_id, ip_address) to deploy HTTP servers on.

    The selection is driven by ``self._config.location``:

    * ``DeploymentMode.NoServer``: an empty list.
    * ``DeploymentMode.HeadOnly``: exactly the node whose id equals
      ``self._head_node_id``.
    * ``DeploymentMode.FixedNumber``: a sample of
      ``self._config.fixed_number_replicas`` nodes (capped at the number of
      known nodes), drawn deterministically from
      ``self._config.fixed_number_selection_seed``.
    * Any other value: every node returned by ``get_all_node_ids()``.

    Raises:
        AssertionError: in ``HeadOnly`` mode, when the number of nodes
            matching ``self._head_node_id`` is not exactly one.
    """
    location = self._config.location
    target_nodes = get_all_node_ids()

    if location == DeploymentMode.NoServer:
        return []

    if location == DeploymentMode.HeadOnly:
        nodes = [
            (node_id, ip_address)
            for node_id, ip_address in target_nodes
            if node_id == self._head_node_id
        ]
        assert len(nodes) == 1, (
            f"Head node not found! Head node id: {self._head_node_id}, "
            f"all nodes: {target_nodes}."
        )
        return nodes

    if location == DeploymentMode.FixedNumber:
        num_replicas = self._config.fixed_number_replicas
        if num_replicas > len(target_nodes):
            logger.warning(
                "You specified fixed_number_replicas="
                f"{num_replicas} but there are only "
                f"{len(target_nodes)} total nodes. Serve will start one "
                "HTTP proxy per node."
            )
            num_replicas = len(target_nodes)

        # Sample from a private generator seeded with the configured seed so
        # the same set of nodes is always returned, without reseeding the
        # process-wide ``random`` state that other code may rely on.
        rng = random.Random(self._config.fixed_number_selection_seed)
        return rng.sample(sorted(target_nodes), k=num_replicas)

    return target_nodes