async def _gcs_check_alive(self):
    """Send one CheckAlive RPC to the GCS and track RPC failures.

    The sequence number is incremented before each attempt. A reply whose
    status code is 0 resets the RPC error counter. An ``AioRpcError``
    increments the counter; once it exceeds
    ``dashboard_consts.GCS_CHECK_ALIVE_MAX_COUNT_OF_RPC_ERROR`` the process
    is terminated with ``os._exit(-1)``. Any other exception, including the
    one raised here for a non-zero reply status, is logged and swallowed.
    """
    try:
        self._gcs_check_alive_seq += 1
        alive_request = gcs_service_pb2.CheckAliveRequest(
            seq=self._gcs_check_alive_seq)
        reply = await self._gcs_heartbeat_info_stub.CheckAlive(
            alive_request, timeout=2)
        if reply.status.code != 0:
            # Handled by the generic ``except Exception`` branch below, so
            # the error counter is neither reset nor incremented.
            raise Exception(
                f"Failed to CheckAlive: {reply.status.message}")
        self._gcs_rpc_error_counter = 0
    except aiogrpc.AioRpcError:
        logger.exception(
            "Got AioRpcError when checking GCS is alive, seq=%s.",
            self._gcs_check_alive_seq)
        self._gcs_rpc_error_counter += 1
        if (self._gcs_rpc_error_counter
                <= dashboard_consts.GCS_CHECK_ALIVE_MAX_COUNT_OF_RPC_ERROR):
            # Still within the tolerated number of RPC errors.
            return
        logger.error(
            "Dashboard suicide, the GCS RPC error count %s > %s",
            self._gcs_rpc_error_counter,
            dashboard_consts.GCS_CHECK_ALIVE_MAX_COUNT_OF_RPC_ERROR)
        # TODO(fyrestone): Do not use ray.state in
        # PrometheusServiceDiscoveryWriter.
        # Currently, we use os._exit() here to avoid hanging at the ray
        # shutdown(). Please refer to:
        # https://github.com/ray-project/ray/issues/16328
        os._exit(-1)
    except Exception:
        logger.exception("Error checking GCS is alive, seq=%s.",
                         self._gcs_check_alive_seq)
def check_health(address: str, timeout=2) -> bool:
    """Checks Ray cluster health, before / without actually connecting to the
    cluster via ray.init().

    Args:
        address: Ray cluster / GCS address string, e.g. ip:port.
        timeout: request timeout, passed to the CheckAlive RPC.

    Returns:
        Returns True if the cluster is running and has matching Ray version.
        Returns False if no service is running (the RPC raised
        ``grpc.RpcError``).

    Raises:
        RuntimeError: if the GCS replies with a non-OK status, or if the
            version reported by the cluster differs from ``ray.__version__``.
    """
    req = gcs_service_pb2.CheckAliveRequest()
    try:
        # NOTE(review): the channel is never explicitly closed here; confirm
        # whether the object returned by create_gcs_channel needs a close().
        channel = create_gcs_channel(address)
        stub = gcs_service_pb2_grpc.HeartbeatInfoGcsServiceStub(channel)
        resp = stub.CheckAlive(req, timeout=timeout)
    except grpc.RpcError:
        # Any RPC failure is reported as "no service running".
        return False
    if resp.status.code != GcsCode.OK:
        raise RuntimeError(
            f"GCS running at {address} is unhealthy: {resp.status}")
    # An unset protobuf string field reads back as "" rather than None, so
    # test for emptiness to detect servers that do not report a version.
    # Use a local instead of mutating the reply message.
    cluster_version = resp.ray_version or "<= 1.12"
    if cluster_version != ray.__version__:
        raise RuntimeError(f"Ray cluster at {address} has version "
                           f"{cluster_version}, but this process is running "
                           f"Ray version {ray.__version__}.")
    return True
async def check_alive(self,
                      node_ips: List[bytes],
                      timeout: Optional[float] = None) -> List[bool]:
    """Ask the GCS which of the given raylet addresses are alive.

    Args:
        node_ips: values sent as the request's ``raylet_address`` field.
        timeout: forwarded unchanged to the CheckAlive RPC.

    Returns:
        The reply's ``raylet_alive`` field converted to a list.

    Raises:
        RuntimeError: if the reply status code is not ``GcsCode.OK``.
    """
    alive_request = gcs_service_pb2.CheckAliveRequest(raylet_address=node_ips)
    reply = await self._heartbeat_info_stub.CheckAlive(
        alive_request, timeout=timeout)
    if reply.status.code == GcsCode.OK:
        return list(reply.raylet_alive)
    raise RuntimeError(
        f"GCS running at {self._channel.address} is unhealthy: {reply.status}"
    )
async def _check_once(self) -> bool:
    """Issue a single CheckAlive RPC and report whether the GCS answered OK.

    Returns:
        True if the RPC completed and the reply status code is 0. False if
        the reply carried a non-zero status code or the call raised
        ``aiogrpc.AioRpcError``; both cases are logged.
    """
    request = gcs_service_pb2.CheckAliveRequest()
    try:
        reply = await self.gcs_heartbeat_info_stub.CheckAlive(
            request, timeout=dashboard_consts.GCS_CHECK_ALIVE_RPC_TIMEOUT)
    except aiogrpc.AioRpcError:  # e.g. Deadline Exceeded
        logger.exception("Got AioRpcError when checking GCS is alive")
        return False
    if reply.status.code != 0:
        # No exception is active here, so use error() rather than
        # exception(), which would append a meaningless "NoneType: None"
        # traceback to the log record.
        logger.error("Failed to CheckAlive: %s", reply.status.message)
        return False
    return True