def _put_object(self,
                request: ray_client_pb2.PutRequest,
                client_id: str,
                context=None):
    """Put an object in the cluster with ray.put() via gRPC.

    Args:
        request: PutRequest with pickled data.
        client_id: The client who owns this data, for tracking when to
          delete this reference.
        context: gRPC context.

    Returns:
        A PutResponse. On success it carries the binary id of the new
        object reference and ``valid=True``. If deserializing or storing
        the object raises, the exception is logged and a response with an
        empty id, ``valid=False`` and the cloudpickled exception in
        ``error`` is returned instead of propagating the exception.
    """
    try:
        # NOTE(review): this deserializes client-supplied data; presumably
        # only trusted clients can reach this path -- confirm.
        obj = loads_from_client(request.data, self)
        with disable_client_hook():
            objectref = ray.put(obj)
    except Exception as e:
        logger.exception("Put failed:")
        return ray_client_pb2.PutResponse(
            id=b"", valid=False, error=cloudpickle.dumps(e))

    ref_id = objectref.binary()
    # Keep the reference registered under the owning client.
    self.object_refs[client_id][ref_id] = objectref
    if request.client_ref_id:
        # Remember which server-side id the client-side id maps to.
        self.client_side_ref_map[client_id][request.client_ref_id] = ref_id
    # Pass the argument separately so it is only formatted when debug
    # logging is enabled.
    logger.debug("put: %s", objectref)
    return ray_client_pb2.PutResponse(id=ref_id, valid=True)
def _put_object(
        self,
        data: Union[bytes, bytearray],
        client_ref_id: bytes,
        client_id: str,
        context=None,
):
    """Put an object in the cluster with ray.put() via gRPC.

    Args:
        data: Pickled data. Can either be bytearray if this is called
          from the dataservicer, or bytes if called from PutObject.
        client_ref_id: The id associated with this object on the client.
          When empty, no client-side id mapping is recorded.
        client_id: The client who owns this data, for tracking when to
          delete this reference.
        context: gRPC context.

    Returns:
        A PutResponse. On success it carries the binary id of the new
        object reference and ``valid=True``. If deserializing or storing
        the object raises, the exception is logged and a response with an
        empty id, ``valid=False`` and the cloudpickled exception in
        ``error`` is returned instead of propagating the exception.
    """
    try:
        # NOTE(review): this deserializes client-supplied data; presumably
        # only trusted clients can reach this path -- confirm.
        obj = loads_from_client(data, self)
        with disable_client_hook():
            objectref = ray.put(obj)
    except Exception as e:
        logger.exception("Put failed:")
        return ray_client_pb2.PutResponse(
            id=b"", valid=False, error=cloudpickle.dumps(e))

    ref_id = objectref.binary()
    # Keep the reference registered under the owning client.
    self.object_refs[client_id][ref_id] = objectref
    if client_ref_id:
        # Remember which server-side id the client-side id maps to.
        self.client_side_ref_map[client_id][client_ref_id] = ref_id
    # Pass the argument separately so it is only formatted when debug
    # logging is enabled.
    logger.debug("put: %s", objectref)
    return ray_client_pb2.PutResponse(id=ref_id, valid=True)
def _put_object(self,
                request: ray_client_pb2.PutRequest,
                client_id: str,
                context=None):
    """Put an object in the cluster with ray.put() via gRPC.

    Args:
        request: PutRequest with pickled data.
        client_id: The client who owns this data, for tracking when to
          delete this reference.
        context: gRPC context.

    Returns:
        A PutResponse carrying the binary id of the new object reference.

    Note:
        Exceptions raised while deserializing or storing the object are
        not handled here and propagate to the caller.
    """
    # NOTE(review): this deserializes client-supplied data; presumably
    # only trusted clients can reach this path -- confirm.
    obj = loads_from_client(request.data, self)
    with disable_client_hook():
        objectref = ray.put(obj)

    ref_id = objectref.binary()
    # Keep the reference registered under the owning client.
    self.object_refs[client_id][ref_id] = objectref
    # Pass the argument separately so it is only formatted when debug
    # logging is enabled.
    logger.debug("put: %s", objectref)
    return ray_client_pb2.PutResponse(id=ref_id)
def Datapath(self, request_iterator, context):
    """Serve one client's data stream, yielding a response per request.

    This is a generator. Incoming requests are moved onto a local queue
    by a daemon thread running ``fill_queue``; the main loop pulls items
    from that queue, dispatches on the request type and yields the
    resulting ``DataResponse``. When the stream ends (normally or through
    an error) the client's session is cleaned up, optionally after
    waiting for the client's reconnect grace period.

    Args:
        request_iterator: Iterator of incoming requests; handed to the
          queue-filler thread.
        context: gRPC context. Its invocation metadata must contain a
          ``client_id`` entry; its status code is set to
          FAILED_PRECONDITION when an error is not recoverable.

    Yields:
        ``ray_client_pb2.DataResponse`` messages. Responses to
        asynchronous gets arrive through the request queue and are
        yielded as-is once available.
    """
    start_time = time.time()
    # set to True if client shuts down gracefully
    cleanup_requested = False
    metadata = {k: v for k, v in context.invocation_metadata()}
    client_id = metadata.get("client_id")
    if client_id is None:
        logger.error("Client connecting with no client_id")
        return
    logger.debug(f"New data connection from client {client_id}: ")
    accepted_connection = self._init(client_id, context, start_time)
    response_cache = self.response_caches[client_id]
    # Set to False if client requests a reconnect grace period of 0
    reconnect_enabled = True
    if not accepted_connection:
        return
    # Tracks whether the queue filler thread was actually started, so the
    # ``finally`` block never tries to join a thread that does not exist
    # or was never started (which would raise and skip session cleanup).
    filler_started = False
    try:
        request_queue = Queue()
        queue_filler_thread = Thread(
            target=fill_queue,
            daemon=True,
            args=(request_iterator, request_queue))
        queue_filler_thread.start()
        filler_started = True
        # For non `async get` requests, this loop yields immediately.
        # For `async get` requests, this loop:
        #   1) does not yield, it just continues
        #   2) When the result is ready, it yields
        for req in iter(request_queue.get, None):
            if isinstance(req, ray_client_pb2.DataResponse):
                # Early shortcut if this is the result of an async get.
                yield req
                continue

            assert isinstance(req, ray_client_pb2.DataRequest)
            if _should_cache(req) and reconnect_enabled:
                cached_resp = response_cache.check_cache(req.req_id)
                if isinstance(cached_resp, Exception):
                    # Cache state is invalid, raise exception
                    raise cached_resp
                if cached_resp is not None:
                    yield cached_resp
                    continue

            resp = None
            req_type = req.WhichOneof("type")
            if req_type == "init":
                resp_init = self.basic_service.Init(req.init)
                resp = ray_client_pb2.DataResponse(init=resp_init, )
                with self.clients_lock:
                    self.reconnect_grace_periods[
                        client_id] = req.init.reconnect_grace_period
                    if req.init.reconnect_grace_period == 0:
                        reconnect_enabled = False
            elif req_type == "get":
                if req.get.asynchronous:
                    get_resp = self.basic_service._async_get_object(
                        req.get, client_id, req.req_id, request_queue)
                    if get_resp is None:
                        # Skip sending a response for this request and
                        # continue to the next request. The response for
                        # this request will be sent when the object is
                        # ready.
                        continue
                else:
                    get_resp = self.basic_service._get_object(
                        req.get, client_id)
                resp = ray_client_pb2.DataResponse(get=get_resp)
            elif req_type == "put":
                if not self.put_request_chunk_collector.add_chunk(
                        req, req.put):
                    # Put request still in progress
                    continue
                put_resp = self.basic_service._put_object(
                    self.put_request_chunk_collector.data,
                    req.put.client_ref_id,
                    client_id,
                )
                self.put_request_chunk_collector.reset()
                resp = ray_client_pb2.DataResponse(put=put_resp)
            elif req_type == "release":
                released = [
                    self.basic_service.release(client_id, rel_id)
                    for rel_id in req.release.ids
                ]
                resp = ray_client_pb2.DataResponse(
                    release=ray_client_pb2.ReleaseResponse(ok=released))
            elif req_type == "connection_info":
                resp = ray_client_pb2.DataResponse(
                    connection_info=self._build_connection_response())
            elif req_type == "prep_runtime_env":
                with self.clients_lock:
                    resp_prep = self.basic_service.PrepRuntimeEnv(
                        req.prep_runtime_env)
                    resp = ray_client_pb2.DataResponse(
                        prep_runtime_env=resp_prep)
            elif req_type == "connection_cleanup":
                cleanup_requested = True
                cleanup_resp = ray_client_pb2.ConnectionCleanupResponse()
                resp = ray_client_pb2.DataResponse(
                    connection_cleanup=cleanup_resp)
            elif req_type == "acknowledge":
                # Clean up acknowledged cache entries
                response_cache.cleanup(req.acknowledge.req_id)
                continue
            elif req_type == "task":
                with self.clients_lock:
                    task = req.task
                    if not self.client_task_chunk_collector.add_chunk(
                            req, task):
                        # Not all serialized arguments have arrived
                        continue
                    arglist, kwargs = loads_from_client(
                        self.client_task_chunk_collector.data,
                        self.basic_service)
                    self.client_task_chunk_collector.reset()
                    resp_ticket = self.basic_service.Schedule(
                        req.task, arglist, kwargs, context)
                    resp = ray_client_pb2.DataResponse(
                        task_ticket=resp_ticket)
            elif req_type == "terminate":
                with self.clients_lock:
                    response = self.basic_service.Terminate(
                        req.terminate, context)
                    resp = ray_client_pb2.DataResponse(terminate=response)
            elif req_type == "list_named_actors":
                with self.clients_lock:
                    response = self.basic_service.ListNamedActors(
                        req.list_named_actors)
                    resp = ray_client_pb2.DataResponse(
                        list_named_actors=response)
            else:
                raise Exception(f"Unreachable code: Request type "
                                f"{req_type} not handled in Datapath")

            resp.req_id = req.req_id
            if _should_cache(req) and reconnect_enabled:
                response_cache.update_cache(req.req_id, resp)
            yield resp
    except Exception as e:
        logger.exception("Error in data channel:")
        recoverable = _propagate_error_in_context(e, context)
        invalid_cache = response_cache.invalidate(e)
        if not recoverable or invalid_cache:
            context.set_code(grpc.StatusCode.FAILED_PRECONDITION)
            # Connection isn't recoverable: skip the reconnect grace
            # period below and clean up immediately.
            cleanup_requested = True
    finally:
        logger.debug(f"Stream is broken with client {client_id}")
        if filler_started:
            queue_filler_thread.join(QUEUE_JOIN_SECONDS)
            if queue_filler_thread.is_alive():
                logger.error(
                    "Queue filler thread failed to join before timeout: {}".
                    format(QUEUE_JOIN_SECONDS))
        cleanup_delay = self.reconnect_grace_periods.get(client_id)
        if not cleanup_requested and cleanup_delay is not None:
            logger.debug("Cleanup wasn't requested, delaying cleanup by "
                         f"{cleanup_delay} seconds.")
            # Delay cleanup, since client may attempt a reconnect
            # Wait on the "stopped" event in case the grpc server is
            # stopped and we can clean up earlier.
            self.stopped.wait(timeout=cleanup_delay)
        else:
            logger.debug("Cleanup was requested, cleaning up immediately.")
        with self.clients_lock:
            if client_id not in self.client_last_seen:
                logger.debug("Connection already cleaned up.")
                # Some other connection has already cleaned up this
                # client's session. This can happen if the client
                # reconnects and then gracefully shuts down immediately.
                return
            last_seen = self.client_last_seen[client_id]
            if last_seen > start_time:
                # The client successfully reconnected and updated
                # last seen some time during the grace period
                logger.debug("Client reconnected, skipping cleanup")
                return
            # Either the client shut down gracefully, or the client
            # failed to reconnect within the grace period. Clean up
            # the connection.
            self.basic_service.release_all(client_id)
            del self.client_last_seen[client_id]
            if client_id in self.reconnect_grace_periods:
                del self.reconnect_grace_periods[client_id]
            if client_id in self.response_caches:
                del self.response_caches[client_id]
            self.num_clients -= 1
            logger.debug(f"Removed client {client_id}, "
                         f"remaining={self.num_clients}")

            # It's important to keep the Ray shutdown
            # within this locked context or else Ray could hang.
            # NOTE: it is strange to start ray in server.py but shut it
            # down here. Consider consolidating ray lifetime management.
            with disable_client_hook():
                if self.num_clients == 0:
                    logger.debug("Shutting down ray.")
                    ray.shutdown()