async def get_actor_info(self):
    """Return a summary of every actor reported by GCS, keyed by actor id.

    Sends a ``GetAllActorInfo`` request with ``show_dead_jobs`` enabled
    (5 second timeout) and flattens each returned actor table entry into a
    plain dict.

    Returns:
        A dict mapping the hex actor id to a dict holding that actor's job
        id, state name, name, namespace, decoded runtime env, start and end
        time, detached flag, required resources (read from the task spec),
        class name, and the worker id, raylet id, IP address and port taken
        from its address.
    """
    # TODO (Alex): GCS still needs to return actors from dead jobs.
    request = gcs_service_pb2.GetAllActorInfoRequest()
    request.show_dead_jobs = True
    reply = await self._gcs_actor_info_stub.GetAllActorInfo(
        request, timeout=5)

    def summarize(record):
        # The runtime env is carried as a JSON string on the table entry.
        runtime_env = json.loads(record.serialized_runtime_env)
        address = record.address
        state_name = gcs_pb2.ActorTableData.ActorState.Name(record.state)
        return {
            "job_id": record.job_id.hex(),
            "state": state_name,
            "name": record.name,
            "namespace": record.ray_namespace,
            "runtime_env": runtime_env,
            "start_time": record.start_time,
            "end_time": record.end_time,
            "is_detached": record.is_detached,
            "resources": dict(record.task_spec.required_resources),
            "actor_class": record.class_name,
            "current_worker_id": address.worker_id.hex(),
            "current_raylet_id": address.raylet_id.hex(),
            "ip_address": address.ip_address,
            "port": address.port,
        }

    return {
        record.actor_id.hex(): summarize(record)
        for record in reply.actor_table_data
    }
async def get_actor_info(self):
    """Return a summary of every actor reported by GCS, with Serve metadata.

    Sends a ``GetAllActorInfo`` request with ``show_dead_jobs`` enabled
    (5 second timeout) and flattens each returned actor table entry into a
    plain dict. Afterwards the result of ``self.get_serve_info()`` is
    walked, and every replica actor id that is also present in the actor
    summary gets a ``"serve"`` entry added to its ``"metadata"`` dict.

    Returns:
        A dict mapping the hex actor id to a dict holding that actor's job
        id, state name, name, namespace, decoded runtime env, start and end
        time, detached flag, required resources, class name, the worker id,
        raylet id, IP address and port taken from its address, and a
        ``"metadata"`` dict (empty unless Serve information was attached).
    """
    # TODO (Alex): GCS still needs to return actors from dead jobs.
    request = gcs_service_pb2.GetAllActorInfoRequest()
    request.show_dead_jobs = True
    reply = await self._gcs_actor_info_stub.GetAllActorInfo(
        request, timeout=5)

    actors = {}
    for record in reply.actor_table_data:
        hex_actor_id = record.actor_id.hex()
        # The runtime env is carried as a JSON string on the table entry.
        runtime_env = json.loads(record.serialized_runtime_env)
        address = record.address
        actors[hex_actor_id] = {
            "job_id": record.job_id.hex(),
            "state": gcs_pb2.ActorTableData.ActorState.Name(record.state),
            "name": record.name,
            "namespace": record.ray_namespace,
            "runtime_env": runtime_env,
            "start_time": record.start_time,
            "end_time": record.end_time,
            "is_detached": record.is_detached,
            "resources": dict(record.required_resources),
            "actor_class": record.class_name,
            "current_worker_id": address.worker_id.hex(),
            "current_raylet_id": address.raylet_id.hex(),
            "ip_address": address.ip_address,
            "port": address.port,
            "metadata": {},
        }

    deployments = await self.get_serve_info()
    for deployment in deployments.values():
        for replica_actor_id, replica in deployment["actors"].items():
            # Replicas that GCS did not report are skipped.
            if replica_actor_id not in actors:
                continue
            serve_metadata = {
                "replica_tag": replica["replica_tag"],
                "deployment_name": deployment["name"],
                "version": replica["version"],
            }
            actors[replica_actor_id]["metadata"]["serve"] = serve_metadata
    return actors
async def _update_actors(self):
    """Keep ``DataSource.actors`` in sync with GCS.

    Runs in three phases:

    1. Pattern-subscribes to the actor channel through the dashboard
       head's aioredis client.
    2. Requests all actor info from GCS (2 second timeout) and passes the
       converted table to ``DataSource.actors.reset``. Any exception,
       including a non-zero reply status, is logged and the request is
       retried after ``RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS``.
    3. Consumes messages from the subscription and stores each decoded
       actor under its hex actor id. A failure while handling one message
       is logged and the next message is processed.
    """
    # Subscribe actor channel.
    redis_client = self._dashboard_head.aioredis_client
    receiver = Receiver()
    key = f"{stats_collector_consts.ACTOR_CHANNEL}:*"
    await redis_client.psubscribe(receiver.pattern(key))
    logger.info("Subscribed to %s", key)

    # Get all actor info.
    while True:
        try:
            logger.info("Getting all actor info from GCS.")
            request = gcs_service_pb2.GetAllActorInfoRequest()
            reply = await self._gcs_actor_info_stub.GetAllActorInfo(
                request, timeout=2)
            if reply.status.code != 0:
                # Raised inside the try block so that it is logged and
                # retried like any other failure.
                raise Exception(
                    f"Failed to GetAllActorInfo: {reply.status.message}")

            snapshot = {}
            for entry in reply.actor_table_data:
                converted = actor_table_data_to_dict(entry)
                snapshot[binary_to_hex(entry.actor_id)] = converted
            DataSource.actors.reset(snapshot)
            logger.info("Received %d actor info from GCS.", len(snapshot))
            break
        except Exception:
            logger.exception("Error Getting all actor info from GCS.")
            await asyncio.sleep(
                stats_collector_consts.
                RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS)

    # Receive actors from channel.
    async for _sender, msg in receiver.iter():
        try:
            _, payload = msg
            envelope = ray.gcs_utils.PubSubMessage.FromString(payload)
            entry = ray.gcs_utils.ActorTableData.FromString(envelope.data)
            converted = actor_table_data_to_dict(entry)
            DataSource.actors[binary_to_hex(entry.actor_id)] = converted
        except Exception:
            logger.exception("Error receiving actor info.")
async def _update_actors(self):
    """Keep the actor tables in ``DataSource`` in sync with GCS.

    Runs in three phases:

    1. Pattern-subscribes to the actor channel through the dashboard
       head's aioredis client.
    2. Requests all actor info from GCS (5 second timeout), then resets
       ``DataSource.actors``, ``DataSource.job_actors`` and
       ``DataSource.node_actors`` from the reply. Any exception, including
       a non-zero reply status, is logged and the request is retried after
       ``RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS``.
    3. Consumes messages from the subscription and applies each one to the
       same three tables. A failure while handling one message is logged
       and the next message is processed.

    Actors whose raylet id equals ``actor_consts.NIL_NODE_ID`` are never
    added to ``DataSource.node_actors``.
    """
    # TODO(fyrestone): Refactor code for updating actor / node / job.
    # Subscribe actor channel.
    redis_client = self._dashboard_head.aioredis_client
    receiver = Receiver()
    key = f"{actor_consts.ACTOR_CHANNEL}:*"
    await redis_client.psubscribe(receiver.pattern(key))
    logger.info("Subscribed to %s", key)

    def _attach_actor_class(data):
        # Derive the class name from the task spec (an empty spec is used
        # when the key is absent) and store it on the dict in place.
        task_spec = data.get("taskSpec", {})
        data["actorClass"] = actor_classname_from_task_spec(task_spec)

    # Get all actor info.
    while True:
        try:
            logger.info("Getting all actor info from GCS.")
            request = gcs_service_pb2.GetAllActorInfoRequest()
            reply = await self._gcs_actor_info_stub.GetAllActorInfo(
                request, timeout=5)
            if reply.status.code != 0:
                # Raised inside the try block so that it is logged and
                # retried like any other failure.
                raise Exception(
                    f"Failed to GetAllActorInfo: {reply.status.message}")

            all_actors = {}
            for message in reply.actor_table_data:
                info = actor_table_data_to_dict(message)
                _attach_actor_class(info)
                all_actors[info["actorId"]] = info
            # Update actors.
            DataSource.actors.reset(all_actors)

            # Update node actors and job actors.
            by_job = {}
            by_node = {}
            for actor_id, info in all_actors.items():
                job_id = info["jobId"]
                node_id = info["address"]["rayletId"]
                by_job.setdefault(job_id, {})[actor_id] = info
                # Update only when node_id is not Nil.
                if node_id != actor_consts.NIL_NODE_ID:
                    by_node.setdefault(node_id, {})[actor_id] = info
            DataSource.job_actors.reset(by_job)
            DataSource.node_actors.reset(by_node)
            logger.info("Received %d actor info from GCS.", len(all_actors))
            break
        except Exception:
            logger.exception("Error Getting all actor info from GCS.")
            await asyncio.sleep(
                actor_consts.RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS)

    # Receive actors from channel.
    state_keys = ("state", "address", "numRestarts", "timestamp", "pid")
    async for _sender, msg in receiver.iter():
        try:
            channel_key, payload = msg
            envelope = ray.gcs_utils.PubSubMessage.FromString(payload)
            message = ray.gcs_utils.ActorTableData.FromString(envelope.data)
            info = actor_table_data_to_dict(message)
            _attach_actor_class(info)

            # If actor is not new registered but updated, we only update
            # states related fields.
            if info["state"] != "DEPENDENCIES_UNREADY":
                # The actor id is whatever follows the "<prefix>:" part of
                # the decoded channel key.
                prefix_length = len(
                    ray.gcs_utils.TablePrefix_ACTOR_string + ":")
                known_id = channel_key.decode("UTF-8")[prefix_length:]
                merged = dict(DataSource.actors[known_id])
                merged.update({k: info[k] for k in state_keys})
                info = merged

            actor_id = info["actorId"]
            job_id = info["jobId"]
            node_id = info["address"]["rayletId"]

            # Update actors.
            DataSource.actors[actor_id] = info

            # Update node actors (only when node_id is not Nil).
            if node_id != actor_consts.NIL_NODE_ID:
                # Work on a copy and assign it back rather than mutating
                # the stored mapping in place.
                on_node = dict(DataSource.node_actors.get(node_id, {}))
                on_node[actor_id] = info
                DataSource.node_actors[node_id] = on_node

            # Update job actors.
            in_job = dict(DataSource.job_actors.get(job_id, {}))
            in_job[actor_id] = info
            DataSource.job_actors[job_id] = in_job
        except Exception:
            logger.exception("Error receiving actor info.")
async def _update_actors(self):
    """Keep the actor tables in ``DataSource`` in sync with GCS.

    Runs in two phases:

    1. Requests all actor info from GCS (5 second timeout), then resets
       ``DataSource.actors``, ``DataSource.job_actors`` and
       ``DataSource.node_actors`` from the reply. Any exception, including
       a non-zero reply status, is logged and the request is retried after
       ``RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS``.
    2. Subscribes with a ``GcsAioActorSubscriber`` built from the dashboard
       head's GCS address and polls it in an endless loop, applying every
       received actor update to the same three tables. A failure while
       handling one poll result is logged and polling continues.

    Actors whose raylet id equals ``actor_consts.NIL_NODE_ID`` are never
    added to ``DataSource.node_actors``.
    """
    # Get all actor info.
    while True:
        try:
            logger.info("Getting all actor info from GCS.")
            request = gcs_service_pb2.GetAllActorInfoRequest()
            reply = await self._gcs_actor_info_stub.GetAllActorInfo(
                request, timeout=5
            )
            if reply.status.code != 0:
                # Raised inside the try block so that it is logged and
                # retried like any other failure.
                raise Exception(
                    f"Failed to GetAllActorInfo: {reply.status.message}"
                )

            all_actors = {}
            for message in reply.actor_table_data:
                info = actor_table_data_to_dict(message)
                all_actors[info["actorId"]] = info
            # Update actors.
            DataSource.actors.reset(all_actors)

            # Update node actors and job actors.
            by_job = {}
            by_node = {}
            for actor_id, info in all_actors.items():
                job_id = info["jobId"]
                node_id = info["address"]["rayletId"]
                by_job.setdefault(job_id, {})[actor_id] = info
                # Update only when node_id is not Nil.
                if node_id != actor_consts.NIL_NODE_ID:
                    by_node.setdefault(node_id, {})[actor_id] = info
            DataSource.job_actors.reset(by_job)
            DataSource.node_actors.reset(by_node)
            logger.info("Received %d actor info from GCS.", len(all_actors))
            break
        except Exception:
            logger.exception("Error Getting all actor info from GCS.")
            await asyncio.sleep(
                actor_consts.RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS
            )

    state_keys = ("state", "address", "numRestarts", "timestamp", "pid")

    def process_actor_data_from_pubsub(known_id, raw_table_data):
        info = actor_table_data_to_dict(raw_table_data)
        # If actor is not new registered but updated, we only update
        # states related fields.
        if info["state"] != "DEPENDENCIES_UNREADY":
            merged = dict(DataSource.actors[known_id])
            merged.update({k: info[k] for k in state_keys})
            info = merged

        actor_id = info["actorId"]
        job_id = info["jobId"]
        node_id = info["address"]["rayletId"]

        # Update actors.
        DataSource.actors[actor_id] = info

        # Update node actors (only when node_id is not Nil).
        if node_id != actor_consts.NIL_NODE_ID:
            # Work on a copy and assign it back rather than mutating the
            # stored mapping in place.
            on_node = dict(DataSource.node_actors.get(node_id, {}))
            on_node[actor_id] = info
            DataSource.node_actors[node_id] = on_node

        # Update job actors.
        in_job = dict(DataSource.job_actors.get(job_id, {}))
        in_job[actor_id] = info
        DataSource.job_actors[job_id] = in_job

    # Receive actors from channel.
    subscriber = GcsAioActorSubscriber(address=self._dashboard_head.gcs_address)
    await subscriber.subscribe()

    while True:
        try:
            polled_id, polled_data = await subscriber.poll()
            # A poll that yields no actor id carries nothing to apply.
            if polled_id is None:
                continue
            # Convert to lower case hex ID.
            process_actor_data_from_pubsub(polled_id.hex(), polled_data)
        except Exception:
            logger.exception("Error processing actor info from GCS.")
async def _update_actors(self):
    """Keep the actor tables in ``DataSource`` in sync with GCS.

    Runs in three phases:

    1. Pattern-subscribes to the actor channel through the dashboard
       head's aioredis client.
    2. Requests all actor info from GCS (5 second timeout), then resets
       ``DataSource.actors``, ``DataSource.job_actors`` and
       ``DataSource.node_actors`` from the reply. Any exception, including
       a non-zero reply status, is logged and the request is retried after
       ``RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS``.
    3. Consumes messages from the subscription and applies each decoded
       actor to the same three tables. A failure while handling one
       message is logged and the next message is processed.

    Every actor is indexed under its raylet id in
    ``DataSource.node_actors``; no raylet id value is filtered out here.
    """
    # Subscribe actor channel.
    redis_client = self._dashboard_head.aioredis_client
    receiver = Receiver()
    key = f"{stats_collector_consts.ACTOR_CHANNEL}:*"
    await redis_client.psubscribe(receiver.pattern(key))
    logger.info("Subscribed to %s", key)

    def _attach_actor_class(data):
        # Derive the class name from the task spec (an empty spec is used
        # when the key is absent) and store it on the dict in place.
        task_spec = data.get("taskSpec", {})
        data["actorClass"] = actor_classname_from_task_spec(task_spec)

    # Get all actor info.
    while True:
        try:
            logger.info("Getting all actor info from GCS.")
            request = gcs_service_pb2.GetAllActorInfoRequest()
            reply = await self._gcs_actor_info_stub.GetAllActorInfo(
                request, timeout=5)
            if reply.status.code != 0:
                # Raised inside the try block so that it is logged and
                # retried like any other failure.
                raise Exception(
                    f"Failed to GetAllActorInfo: {reply.status.message}")

            all_actors = {}
            for message in reply.actor_table_data:
                info = actor_table_data_to_dict(message)
                _attach_actor_class(info)
                all_actors[info["actorId"]] = info
            # Update actors.
            DataSource.actors.reset(all_actors)

            # Update node actors and job actors.
            by_job = {}
            by_node = {}
            for actor_id, info in all_actors.items():
                job_id = info["jobId"]
                node_id = info["address"]["rayletId"]
                by_job.setdefault(job_id, {})[actor_id] = info
                by_node.setdefault(node_id, {})[actor_id] = info
            DataSource.job_actors.reset(by_job)
            DataSource.node_actors.reset(by_node)
            logger.info("Received %d actor info from GCS.", len(all_actors))
            break
        except Exception:
            logger.exception("Error Getting all actor info from GCS.")
            await asyncio.sleep(
                stats_collector_consts.
                RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS)

    # Receive actors from channel.
    async for _sender, msg in receiver.iter():
        try:
            _, payload = msg
            envelope = ray.gcs_utils.PubSubMessage.FromString(payload)
            message = ray.gcs_utils.ActorTableData.FromString(envelope.data)
            info = actor_table_data_to_dict(message)
            _attach_actor_class(info)

            actor_id = info["actorId"]
            job_id = info["jobId"]
            node_id = info["address"]["rayletId"]

            # Update actors.
            DataSource.actors[actor_id] = info

            # Update node actors. Work on a copy and assign it back rather
            # than mutating the stored mapping in place.
            on_node = dict(DataSource.node_actors.get(node_id, {}))
            on_node[actor_id] = info
            DataSource.node_actors[node_id] = on_node

            # Update job actors.
            in_job = dict(DataSource.job_actors.get(job_id, {}))
            in_job[actor_id] = info
            DataSource.job_actors[job_id] = in_job
        except Exception:
            logger.exception("Error receiving actor info.")