def __init__(self,
             conn_str: str = "",
             secure: bool = False,
             metadata: List[Tuple[str, str]] = None):
    """Initializes the worker side grpc client.

    Opens a gRPC channel to ``conn_str`` and builds the driver stub, the
    data client and the log stream client on top of that channel.

    Args:
        conn_str: target handed to the grpc channel constructor.
        secure: whether to use SSL secure channel or not.
        metadata: additional metadata passed in the grpc request headers.
    """
    self.metadata = metadata
    self.channel = None
    self._client_id = make_client_id()

    # Pick the channel flavour; SSL uses grpc's default credentials.
    if not secure:
        self.channel = grpc.insecure_channel(conn_str)
    else:
        self.channel = grpc.secure_channel(conn_str,
                                           grpc.ssl_channel_credentials())

    self.server = ray_client_pb2_grpc.RayletDriverStub(self.channel)
    self.data_client = DataClient(self.channel, self._client_id)
    self.reference_count: Dict[bytes, int] = defaultdict(int)

    log_stream = LogstreamClient(self.channel)
    self.log_client = log_stream
    log_stream.set_logstream_level(logging.INFO)

    self.closed = False
def _call_inner_function(
        self, request, context,
        method: str) -> Optional[ray_client_pb2_grpc.RayletDriverStub]:
    """Forward ``request`` to the stub method called ``method``.

    The client id is read from ``context`` and used to look up the channel
    in ``self.proxy_manager``. If no channel is found, NOT_FOUND is set on
    ``context`` and None is returned; otherwise the result of the stub call
    is returned.
    """
    client_id = _get_client_id_from_context(context)
    channel = self.proxy_manager.get_channel(client_id)
    if channel:
        rpc = getattr(ray_client_pb2_grpc.RayletDriverStub(channel), method)
        return rpc(request, metadata=[("client_id", client_id)])

    # No channel registered for this client: report it and bail out.
    logger.error(f"Channel for Client: {client_id} not found!")
    context.set_code(grpc.StatusCode.NOT_FOUND)
    return None
def __init__(self,
             conn_str: str = "",
             secure: bool = False,
             metadata: List[Tuple[str, str]] = None,
             connection_retries: int = 3):
    """Initializes the worker side grpc client.

    Opens a gRPC channel to ``conn_str`` and blocks until the channel
    reports ready, retrying with a growing timeout, before building the
    driver stub, data client and log stream client.

    Args:
        conn_str: The host:port connection string for the ray server.
        secure: whether to use SSL secure channel or not.
        metadata: additional metadata passed in the grpc request headers.
        connection_retries: Number of times to attempt to connect to the
          ray server if it doesn't respond immediately. Any value below 1
          (including 0) still tries once. For infinite retries, catch the
          ConnectionError exception.

    Raises:
        ConnectionError: if the channel did not become ready within the
          allowed number of attempts.
    """
    self.metadata = metadata if metadata else []
    self.channel = None
    self._client_id = make_client_id()
    if secure:
        credentials = grpc.ssl_channel_credentials()
        self.channel = grpc.secure_channel(conn_str, credentials)
    else:
        self.channel = grpc.insecure_channel(conn_str)

    # Always probe the channel at least once. Without the clamp a negative
    # retry count would skip the readiness check and continue silently.
    max_attempts = max(connection_retries, 1)
    timeout = INITIAL_TIMEOUT_SEC
    for attempt in range(1, max_attempts + 1):
        try:
            grpc.channel_ready_future(self.channel).result(timeout=timeout)
            break
        except grpc.FutureTimeoutError as err:
            if attempt >= max_attempts:
                # Out of attempts: surface a ConnectionError and keep the
                # underlying timeout as its cause.
                raise ConnectionError(
                    "ray client connection timeout") from err
            logger.info(f"Couldn't connect in {timeout} seconds, retrying")
            # Linear backoff, capped at MAX_TIMEOUT_SEC.
            timeout = min(timeout + 5, MAX_TIMEOUT_SEC)

    self.server = ray_client_pb2_grpc.RayletDriverStub(self.channel)
    self.data_client = DataClient(self.channel, self._client_id,
                                  self.metadata)
    self.reference_count: Dict[bytes, int] = defaultdict(int)
    self.log_client = LogstreamClient(self.channel, self.metadata)
    self.log_client.set_logstream_level(logging.INFO)
    self.closed = False
def _call_inner_function(
        self, request, context,
        method: str) -> Optional[ray_client_pb2_grpc.RayletDriverStub]:
    """Forward ``request`` to the stub method called ``method``.

    The client id is read from ``context`` and used to look up the channel
    in ``self.proxy_manager``. If no channel is found, NOT_FOUND is set on
    ``context`` and None is returned. If the forwarded call raises, the
    failure is logged and handed to ``_propagate_error_in_context``; the
    function then falls through and returns None.
    """
    client_id = _get_client_id_from_context(context)
    channel = self.proxy_manager.get_channel(client_id)
    if not channel:
        logger.error(f"Channel for Client: {client_id} not found!")
        context.set_code(grpc.StatusCode.NOT_FOUND)
        return None

    driver_stub = ray_client_pb2_grpc.RayletDriverStub(channel)
    try:
        # Forward the caller's own invocation metadata when a context is
        # available; otherwise send just the client id.
        if context:
            call_metadata = context.invocation_metadata()
        else:
            call_metadata = [("client_id", client_id)]
        rpc = getattr(driver_stub, method)
        return rpc(request, metadata=call_metadata)
    except Exception as e:
        # Error while proxying -- propagate the error's context to user
        logger.exception(f"Proxying call to {method} failed!")
        _propagate_error_in_context(e, context)
def __init__(self,
             conn_str: str = "",
             secure: bool = False,
             metadata: List[Tuple[str, str]] = None,
             stub=None):
    """Initializes the worker side grpc client.

    Args:
        conn_str: target handed to the grpc channel constructor. Only used
          when no ``stub`` is supplied.
        secure: whether to use SSL secure channel or not.
        metadata: additional metadata passed in the grpc request headers.
        stub: custom grpc stub. When given, no channel is opened and
          ``self.channel`` stays None.
    """
    self.metadata = metadata
    # Define the attribute on every path so that code reading
    # ``self.channel`` does not hit an AttributeError for injected stubs.
    self.channel = None
    if stub is not None:
        self.server = stub
        return

    if secure:
        credentials = grpc.ssl_channel_credentials()
        self.channel = grpc.secure_channel(conn_str, credentials)
    else:
        self.channel = grpc.insecure_channel(conn_str)
    self.server = ray_client_pb2_grpc.RayletDriverStub(self.channel)
def __init__(self,
             conn_str: str = "",
             secure: bool = False,
             metadata: List[Tuple[str, str]] = None,
             connection_retries: int = 3):
    """Initializes the worker side grpc client.

    Opens the channel, then repeatedly waits for it to become ready and
    pings the server until the ping succeeds or the attempts run out.

    Args:
        conn_str: The host:port connection string for the ray server.
        secure: whether to use SSL secure channel or not.
        metadata: additional metadata passed in the grpc request headers.
        connection_retries: Number of times to attempt to reconnect to the
          ray server if it doesn't respond immediately. Setting to 0 tries
          at least once. For infinite retries, catch the ConnectionError
          exception.

    Raises:
        ConnectionError: if the server never answered the ping within the
          allowed attempts.
    """
    self._client_id = make_client_id()
    # The client id always leads the metadata sent with requests.
    self.metadata = [("client_id", self._client_id)] + (metadata or [])
    self.channel = None
    self.server = None
    self._conn_state = grpc.ChannelConnectivity.IDLE
    self._converted: Dict[str, ClientStub] = {}

    if not secure:
        self.channel = grpc.insecure_channel(conn_str, options=GRPC_OPTIONS)
    else:
        self.channel = grpc.secure_channel(
            conn_str, grpc.ssl_channel_credentials(), options=GRPC_OPTIONS)
    self.channel.subscribe(self._on_channel_state_change)

    # Keep trying until the channel answers with something that looks
    # like a gRPC connection (it may still be a proxy).
    delay = INITIAL_TIMEOUT_SEC
    service_ready = False
    for _ in range(max(connection_retries, 1)):
        try:
            # Blocks until the channel is ready; raises on timeout.
            grpc.channel_ready_future(self.channel).result(timeout=delay)
            # HTTP2 channel is up: wrap it in the RayletDriverStub so
            # unary requests can be issued.
            self.server = ray_client_pb2_grpc.RayletDriverStub(self.channel)
            service_ready = bool(self.ping_server())
            if service_ready:
                break
            # Ray is not ready yet, wait a timeout
            time.sleep(delay)
        except grpc.FutureTimeoutError:
            # channel_ready_future already waited for the timeout, so
            # there is no extra sleep here.
            logger.info(
                f"Couldn't connect channel in {delay} seconds, retrying")
        except grpc.RpcError as rpc_err:
            logger.info("Ray client server unavailable, "
                        f"retrying in {delay}s...")
            logger.debug(
                f"Received when checking init: {rpc_err.details()}")
            # Ray is not ready yet, wait a timeout.
            time.sleep(delay)
        # Fallthrough: back off and go around again.
        logger.info("Waiting for Ray to become ready on the server, "
                    f"retry in {delay}s...")
        delay = backoff(delay)

    # Leaving the loop without a successful ping means the retries are
    # exhausted; report that to the caller.
    if not service_ready:
        raise ConnectionError("ray client connection timeout")

    # Initialize the streams to finish protocol negotiation.
    self.data_client = DataClient(self.channel, self._client_id,
                                  self.metadata)
    self.reference_count: Dict[bytes, int] = defaultdict(int)

    self.log_client = LogstreamClient(self.channel, self.metadata)
    self.log_client.set_logstream_level(logging.INFO)

    self.closed = False

    # Track these values to raise a warning if many tasks are being
    # scheduled
    self.total_num_tasks_scheduled = 0
    self.total_outbound_message_size_bytes = 0
def _connect_channel(self, reconnecting=False) -> None:
    """
    Attempts to connect to the server specified by conn_str. If
    reconnecting after an RPC error, cleans up the old channel and
    continues to attempt to connect until the grace period is over.

    Raises:
        ConnectionError: if the reconnect grace period elapses, or if the
          loop ends without the server answering the ping.
    """
    # Drop any previous channel before opening a fresh one.
    if self.channel is not None:
        self.channel.unsubscribe(self._on_channel_state_change)
        self.channel.close()

    if not self._secure:
        self.channel = grpc.insecure_channel(
            self._conn_str, options=GRPC_OPTIONS)
    else:
        # Credential precedence: explicit credentials, then certs loaded
        # from the environment when RAY_USE_TLS is on, then grpc defaults.
        if self._credentials is not None:
            creds = self._credentials
        elif os.environ.get("RAY_USE_TLS", "0").lower() in ("1", "true"):
            cert_chain, key, root_ca = \
                ray._private.utils.load_certs_from_env()
            creds = grpc.ssl_channel_credentials(
                certificate_chain=cert_chain,
                private_key=key,
                root_certificates=root_ca,
            )
        else:
            creds = grpc.ssl_channel_credentials()
        self.channel = grpc.secure_channel(
            self._conn_str, creds, options=GRPC_OPTIONS)

    self.channel.subscribe(self._on_channel_state_change)

    # Keep trying until the channel answers with something that looks
    # like a gRPC connection (it may still be a proxy).
    began = time.time()
    attempts = 0
    delay = INITIAL_TIMEOUT_SEC
    service_ready = False
    while attempts < max(self._connection_retries, 1) or reconnecting:
        attempts += 1
        if self._in_shutdown:
            # User manually closed the worker before connection finished
            break
        elapsed = time.time() - began
        if reconnecting and elapsed > self._reconnect_grace_period:
            self._in_shutdown = True
            raise ConnectionError(
                "Failed to reconnect within the reconnection grace period "
                f"({self._reconnect_grace_period}s)")
        try:
            # Blocks until the channel is ready; raises on timeout.
            grpc.channel_ready_future(self.channel).result(timeout=delay)
            # HTTP2 channel is up: wrap it in the RayletDriverStub so
            # unary requests can be issued.
            self.server = ray_client_pb2_grpc.RayletDriverStub(self.channel)
            service_ready = bool(self.ping_server())
            if service_ready:
                break
            # Ray is not ready yet, wait a timeout
            time.sleep(delay)
        except grpc.FutureTimeoutError:
            # channel_ready_future already waited for the timeout, so
            # there is no extra sleep here.
            logger.debug(
                f"Couldn't connect channel in {delay} seconds, retrying")
        except grpc.RpcError as rpc_err:
            logger.debug("Ray client server unavailable, "
                         f"retrying in {delay}s...")
            logger.debug(
                f"Received when checking init: {rpc_err.details()}")
            # Ray is not ready yet, wait a timeout.
            time.sleep(delay)
        # Fallthrough: back off and go around again.
        logger.debug("Waiting for Ray to become ready on the server, "
                     f"retry in {delay}s...")
        if not reconnecting:
            # Don't increase backoff when trying to reconnect --
            # we already know the server exists, attempt to reconnect
            # as soon as we can
            delay = backoff(delay)

    if service_ready:
        return

    # Leaving the loop without a successful ping means the retries are
    # exhausted (or shutdown was requested); report that to the caller.
    self._in_shutdown = True
    if log_once("ray_client_security_groups"):
        warnings.warn(
            "Ray Client connection timed out. Ensure that "
            "the Ray Client port on the head node is reachable "
            "from your local machine. See https://docs.ray.io/en"
            "/latest/cluster/ray-client.html#step-2-check-ports for "
            "more information.")
    raise ConnectionError("ray client connection timeout")
def set_channel(self, channel: grpc.Channel) -> None:
    """Point ``self.stub`` at a new RayletDriverStub built on ``channel``."""
    driver_stub = ray_client_pb2_grpc.RayletDriverStub(channel)
    self.stub = driver_stub
def __init__(self,
             conn_str: str = "",
             secure: bool = False,
             metadata: List[Tuple[str, str]] = None,
             connection_retries: int = 3):
    """Initializes the worker side grpc client.

    Opens the channel, then repeatedly waits for it to become ready and
    asks the server whether Ray is initialized, until it is or the
    attempts run out.

    Args:
        conn_str: The host:port connection string for the ray server.
        secure: whether to use SSL secure channel or not.
        metadata: additional metadata passed in the grpc request headers.
        connection_retries: Number of times to attempt to reconnect to the
          ray server if it doesn't respond immediately. Setting to 0 tries
          at least once. For infinite retries, catch the ConnectionError
          exception.

    Raises:
        ConnectionError: if Ray never reported ready within the allowed
          attempts.
        grpc.RpcError: any RPC failure other than UNAVAILABLE is re-raised.
    """
    self.metadata = metadata or []
    self.channel = None
    self._client_id = make_client_id()

    if not secure:
        self.channel = grpc.insecure_channel(conn_str)
    else:
        self.channel = grpc.secure_channel(conn_str,
                                           grpc.ssl_channel_credentials())

    # Keep trying until the channel answers with something that looks
    # like a gRPC connection (it may still be a proxy).
    delay = INITIAL_TIMEOUT_SEC
    ray_ready = False
    for _ in range(max(connection_retries, 1)):
        try:
            # Blocks until the channel is ready; raises on timeout.
            grpc.channel_ready_future(self.channel).result(timeout=delay)
            # HTTP2 channel is up: wrap it in the RayletDriverStub so
            # unary requests can be issued.
            self.server = ray_client_pb2_grpc.RayletDriverStub(self.channel)
            # The channel (or a proxy) is ready, but the servicer may not
            # be. is_initialized() throws if the servicer is not ready;
            # on success its result is checked.
            ray_ready = self.is_initialized()
            if ray_ready:
                # Ray is ready! Break out of the retry loop
                break
            # Ray is not ready yet, wait a timeout
            time.sleep(delay)
        except grpc.FutureTimeoutError:
            # channel_ready_future already waited for the timeout, so
            # there is no extra sleep here.
            logger.info(
                f"Couldn't connect channel in {delay} seconds, retrying")
        except grpc.RpcError as e:
            if e.code() != grpc.StatusCode.UNAVAILABLE:
                # Any other gRPC error gets a reraise
                raise e
            # UNAVAILABLE is gRPC's retryable error, so we retry.
            logger.info("Ray client server unavailable, "
                        f"retrying in {delay}s...")
            logger.debug(f"Received when checking init: {e.details()}")
            # Ray is not ready yet, wait a timeout
            time.sleep(delay)
        # Fallthrough: back off and go around again.
        logger.info("Waiting for Ray to become ready on the server, "
                    f"retry in {delay}s...")
        delay = backoff(delay)

    # Leaving the loop without ray_ready means the retries are exhausted;
    # report that to the caller.
    if not ray_ready:
        raise ConnectionError("ray client connection timeout")

    # Initialize the streams to finish protocol negotiation.
    self.data_client = DataClient(self.channel, self._client_id,
                                  self.metadata)
    self.reference_count: Dict[bytes, int] = defaultdict(int)

    self.log_client = LogstreamClient(self.channel, self.metadata)
    self.log_client.set_logstream_level(logging.INFO)

    self.closed = False
def __init__(self, conn_str="", stub=None):
    """Initializes the worker side grpc client.

    Args:
        conn_str: target handed to ``grpc.insecure_channel``. Only used
          when no ``stub`` is supplied.
        stub: custom grpc stub. When given, no channel is opened and
          ``self.channel`` stays None.
    """
    # Define the attribute on every path so that code reading
    # ``self.channel`` does not hit an AttributeError for injected stubs.
    self.channel = None
    if stub is not None:
        self.server = stub
        return

    self.channel = grpc.insecure_channel(conn_str)
    self.server = ray_client_pb2_grpc.RayletDriverStub(self.channel)