Ejemplo n.º 1
0
 async def get_actor_info(self):
     """Return a summary of every actor reported by the GCS.

     Sends a ``GetAllActorInfoRequest`` with ``show_dead_jobs`` enabled
     and flattens each entry of ``reply.actor_table_data`` into a plain
     dict.

     Returns:
         dict: Maps each hex-encoded actor id to a dict with the keys
         ``job_id``, ``state``, ``name``, ``namespace``, ``runtime_env``,
         ``start_time``, ``end_time``, ``is_detached``, ``resources``,
         ``actor_class``, ``current_worker_id``, ``current_raylet_id``,
         ``ip_address`` and ``port``.
     """
     # TODO (Alex): GCS still needs to return actors from dead jobs.
     all_actors_request = gcs_service_pb2.GetAllActorInfoRequest()
     all_actors_request.show_dead_jobs = True
     response = await self._gcs_actor_info_stub.GetAllActorInfo(
         all_actors_request, timeout=5)

     def summarize(row):
         # The runtime env is carried as a JSON string; decode it before
         # anything else is read from the row.
         decoded_env = json.loads(row.serialized_runtime_env)
         location = row.address
         return {
             "job_id": row.job_id.hex(),
             # Translate the numeric enum value into its symbolic name.
             "state": gcs_pb2.ActorTableData.ActorState.Name(row.state),
             "name": row.name,
             "namespace": row.ray_namespace,
             "runtime_env": decoded_env,
             "start_time": row.start_time,
             "end_time": row.end_time,
             "is_detached": row.is_detached,
             "resources": dict(row.task_spec.required_resources),
             "actor_class": row.class_name,
             "current_worker_id": location.worker_id.hex(),
             "current_raylet_id": location.raylet_id.hex(),
             "ip_address": location.ip_address,
             "port": location.port,
         }

     return {
         row.actor_id.hex(): summarize(row)
         for row in response.actor_table_data
     }
Ejemplo n.º 2
0
    async def get_actor_info(self):
        """Return a summary of every actor, annotated with Serve metadata.

        Sends a ``GetAllActorInfoRequest`` with ``show_dead_jobs`` enabled,
        flattens each entry of ``reply.actor_table_data`` into a plain dict,
        and then tags every actor that appears as a replica in the result of
        ``self.get_serve_info()`` with a ``metadata["serve"]`` dict.

        The Serve lookup is awaited once, after all actors have been
        collected, instead of once per actor; it is skipped entirely when
        the GCS reports no actors.

        Returns:
            dict: Maps each hex-encoded actor id to a dict with the keys
            ``job_id``, ``state``, ``name``, ``namespace``, ``runtime_env``,
            ``start_time``, ``end_time``, ``is_detached``, ``resources``,
            ``actor_class``, ``current_worker_id``, ``current_raylet_id``,
            ``ip_address``, ``port`` and ``metadata``. For Serve replicas,
            ``metadata["serve"]`` holds ``replica_tag``,
            ``deployment_name`` and ``version``.
        """
        # TODO (Alex): GCS still needs to return actors from dead jobs.
        request = gcs_service_pb2.GetAllActorInfoRequest()
        request.show_dead_jobs = True
        reply = await self._gcs_actor_info_stub.GetAllActorInfo(request,
                                                                timeout=5)
        actors = {}
        for actor_table_entry in reply.actor_table_data:
            actor_id = actor_table_entry.actor_id.hex()
            # The runtime env is carried as a JSON string.
            runtime_env = json.loads(actor_table_entry.serialized_runtime_env)
            address = actor_table_entry.address
            actors[actor_id] = {
                "job_id": actor_table_entry.job_id.hex(),
                # Translate the numeric enum value into its symbolic name.
                "state": gcs_pb2.ActorTableData.ActorState.Name(
                    actor_table_entry.state),
                "name": actor_table_entry.name,
                "namespace": actor_table_entry.ray_namespace,
                "runtime_env": runtime_env,
                "start_time": actor_table_entry.start_time,
                "end_time": actor_table_entry.end_time,
                "is_detached": actor_table_entry.is_detached,
                "resources": dict(actor_table_entry.required_resources),
                "actor_class": actor_table_entry.class_name,
                "current_worker_id": address.worker_id.hex(),
                "current_raylet_id": address.raylet_id.hex(),
                "ip_address": address.ip_address,
                "port": address.port,
                "metadata": {},
            }

        if actors:
            # Fetch the Serve state a single time now that the actor map is
            # complete; every replica that maps onto a known actor gets its
            # Serve details attached.
            deployments = await self.get_serve_info()
            for deployment_info in deployments.values():
                replicas = deployment_info["actors"]
                for replica_actor_id, actor_info in replicas.items():
                    replica_entry = actors.get(replica_actor_id)
                    if replica_entry is None:
                        continue
                    replica_entry["metadata"]["serve"] = {
                        "replica_tag": actor_info["replica_tag"],
                        "deployment_name": deployment_info["name"],
                        "version": actor_info["version"],
                    }
        return actors
Ejemplo n.º 3
0
    async def _update_actors(self):
        """Keep ``DataSource.actors`` in sync with the GCS.

        Subscribes to the actor channel pattern first, then loads the full
        actor table (retrying until one request succeeds), and finally
        applies every message received on the subscription. This coroutine
        only returns if the receiver's iterator ends.
        """
        # Subscribe actor channel.
        redis_client = self._dashboard_head.aioredis_client
        channel_receiver = Receiver()

        channel_key = "{}:*".format(stats_collector_consts.ACTOR_CHANNEL)
        channel_pattern = channel_receiver.pattern(channel_key)
        await redis_client.psubscribe(channel_pattern)
        logger.info("Subscribed to %s", channel_key)

        # Get all actor info.
        while True:
            try:
                logger.info("Getting all actor info from GCS.")
                all_actors_request = gcs_service_pb2.GetAllActorInfoRequest()
                response = await self._gcs_actor_info_stub.GetAllActorInfo(
                    all_actors_request, timeout=2)
                if response.status.code != 0:
                    # Raised inside the try so a bad status is logged and
                    # retried like any other failure.
                    raise Exception(
                        f"Failed to GetAllActorInfo: {response.status.message}"
                    )
                snapshot = {}
                for entry in response.actor_table_data:
                    entry_dict = actor_table_data_to_dict(entry)
                    snapshot[binary_to_hex(entry.actor_id)] = entry_dict
                DataSource.actors.reset(snapshot)
                logger.info("Received %d actor info from GCS.", len(snapshot))
                break
            except Exception:
                logger.exception("Error Getting all actor info from GCS.")
                await asyncio.sleep(stats_collector_consts.
                                    RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS)

        # Receive actors from channel.
        async for _sender, payload in channel_receiver.iter():
            try:
                _channel, raw_bytes = payload
                envelope = ray.gcs_utils.PubSubMessage.FromString(raw_bytes)
                actor_proto = ray.gcs_utils.ActorTableData.FromString(
                    envelope.data)
                actor_dict = actor_table_data_to_dict(actor_proto)
                DataSource.actors[binary_to_hex(
                    actor_proto.actor_id)] = actor_dict
            except Exception:
                # One bad message must not stop the subscription loop.
                logger.exception("Error receiving actor info.")
Ejemplo n.º 4
0
    async def _update_actors(self):
        """Keep the actor views in ``DataSource`` in sync with the GCS.

        The coroutine runs in three phases:

        1. Subscribe to the ``"<ACTOR_CHANNEL>:*"`` pattern through the
           dashboard head's aioredis client.
        2. Fetch the full actor table with ``GetAllActorInfo``, retrying
           after ``RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS`` on any
           failure, and reset ``DataSource.actors``,
           ``DataSource.job_actors`` and ``DataSource.node_actors``.
        3. Consume messages from the subscription and apply each one to the
           same three ``DataSource`` tables.

        Errors in phases 2 and 3 are logged and do not propagate.
        """
        # TODO(fyrestone): Refactor code for updating actor / node / job.
        # Subscribe actor channel.
        aioredis_client = self._dashboard_head.aioredis_client
        receiver = Receiver()

        key = "{}:*".format(actor_consts.ACTOR_CHANNEL)
        pattern = receiver.pattern(key)
        await aioredis_client.psubscribe(pattern)
        logger.info("Subscribed to %s", key)

        def _process_actor_table_data(data):
            # Mutates `data` in place: stores the class name derived from
            # the (possibly missing) "taskSpec" entry under "actorClass".
            actor_class = actor_classname_from_task_spec(
                data.get("taskSpec", {}))
            data["actorClass"] = actor_class

        # Get all actor info.
        while True:
            try:
                logger.info("Getting all actor info from GCS.")
                request = gcs_service_pb2.GetAllActorInfoRequest()
                reply = await self._gcs_actor_info_stub.GetAllActorInfo(
                    request, timeout=5)
                if reply.status.code == 0:
                    actors = {}
                    for message in reply.actor_table_data:
                        actor_table_data = actor_table_data_to_dict(message)
                        _process_actor_table_data(actor_table_data)
                        actors[actor_table_data["actorId"]] = actor_table_data
                    # Update actors.
                    DataSource.actors.reset(actors)
                    # Update node actors and job actors.
                    job_actors = {}
                    node_actors = {}
                    for actor_id, actor_table_data in actors.items():
                        job_id = actor_table_data["jobId"]
                        node_id = actor_table_data["address"]["rayletId"]
                        job_actors.setdefault(job_id,
                                              {})[actor_id] = actor_table_data
                        # Update only when node_id is not Nil.
                        if node_id != actor_consts.NIL_NODE_ID:
                            node_actors.setdefault(
                                node_id, {})[actor_id] = actor_table_data
                    DataSource.job_actors.reset(job_actors)
                    DataSource.node_actors.reset(node_actors)
                    logger.info("Received %d actor info from GCS.",
                                len(actors))
                    # Initial snapshot loaded; leave the retry loop.
                    break
                else:
                    # Raised inside the try so that a bad status is logged
                    # and retried like any other failure.
                    raise Exception(
                        f"Failed to GetAllActorInfo: {reply.status.message}")
            except Exception:
                logger.exception("Error Getting all actor info from GCS.")
                await asyncio.sleep(
                    actor_consts.RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS)

        # Receive actors from channel.
        # Fields copied from an update message onto the stored entry when
        # the actor is already known.
        state_keys = ("state", "address", "numRestarts", "timestamp", "pid")
        async for sender, msg in receiver.iter():
            try:
                # At this point `actor_id` is the raw channel key and
                # `actor_table_data` the raw payload; both names are rebound
                # below.
                actor_id, actor_table_data = msg
                pubsub_message = ray.gcs_utils.PubSubMessage.FromString(
                    actor_table_data)
                message = ray.gcs_utils.ActorTableData.FromString(
                    pubsub_message.data)
                actor_table_data = actor_table_data_to_dict(message)
                _process_actor_table_data(actor_table_data)
                # If actor is not new registered but updated, we only update
                # states related fields.
                if actor_table_data["state"] != "DEPENDENCIES_UNREADY":
                    # Drop the "<TablePrefix_ACTOR_string>:" prefix from the
                    # decoded channel key to get the lookup key.
                    actor_id = actor_id.decode(
                        "UTF-8")[len(ray.gcs_utils.TablePrefix_ACTOR_string +
                                     ":"):]
                    # Work on a copy of the stored entry rather than mutating
                    # it. NOTE(review): presumably this lookup raises for an
                    # unknown actor, which the except below would log and
                    # skip -- confirm against DataSource.
                    actor_table_data_copy = dict(DataSource.actors[actor_id])
                    for k in state_keys:
                        actor_table_data_copy[k] = actor_table_data[k]
                    actor_table_data = actor_table_data_copy
                actor_id = actor_table_data["actorId"]
                job_id = actor_table_data["jobId"]
                node_id = actor_table_data["address"]["rayletId"]
                # Update actors.
                DataSource.actors[actor_id] = actor_table_data
                # Update node actors (only when node_id is not Nil).
                if node_id != actor_consts.NIL_NODE_ID:
                    node_actors = dict(DataSource.node_actors.get(node_id, {}))
                    node_actors[actor_id] = actor_table_data
                    DataSource.node_actors[node_id] = node_actors
                # Update job actors.
                job_actors = dict(DataSource.job_actors.get(job_id, {}))
                job_actors[actor_id] = actor_table_data
                DataSource.job_actors[job_id] = job_actors
            except Exception:
                # One bad message must not stop the subscription loop.
                logger.exception("Error receiving actor info.")
Ejemplo n.º 5
0
    async def _update_actors(self):
        """Keep the actor views in ``DataSource`` in sync with the GCS.

        First loads the full actor table with ``GetAllActorInfo`` (retrying
        until one request succeeds) and resets ``DataSource.actors``,
        ``DataSource.job_actors`` and ``DataSource.node_actors``. Then
        subscribes through ``GcsAioActorSubscriber`` and applies every polled
        update to the same three tables, forever. Errors are logged and do
        not propagate.
        """
        # Get all actor info.
        while True:
            try:
                logger.info("Getting all actor info from GCS.")
                all_actors_request = gcs_service_pb2.GetAllActorInfoRequest()
                response = await self._gcs_actor_info_stub.GetAllActorInfo(
                    all_actors_request, timeout=5
                )
                if response.status.code != 0:
                    # Raised inside the try so a bad status is logged and
                    # retried like any other failure.
                    raise Exception(
                        f"Failed to GetAllActorInfo: {response.status.message}"
                    )
                snapshot = {}
                for proto in response.actor_table_data:
                    info = actor_table_data_to_dict(proto)
                    snapshot[info["actorId"]] = info
                # Update actors.
                DataSource.actors.reset(snapshot)
                # Update node actors and job actors.
                by_job = {}
                by_node = {}
                for known_id, info in snapshot.items():
                    owning_job = info["jobId"]
                    hosting_node = info["address"]["rayletId"]
                    by_job.setdefault(owning_job, {})[known_id] = info
                    # Update only when node_id is not Nil.
                    if hosting_node != actor_consts.NIL_NODE_ID:
                        by_node.setdefault(hosting_node, {})[known_id] = info
                DataSource.job_actors.reset(by_job)
                DataSource.node_actors.reset(by_node)
                logger.info("Received %d actor info from GCS.", len(snapshot))
                break
            except Exception:
                logger.exception("Error Getting all actor info from GCS.")
                await asyncio.sleep(
                    actor_consts.RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS
                )

        # Fields taken from an update message when the actor is already known.
        volatile_fields = ("state", "address", "numRestarts", "timestamp", "pid")

        def apply_pubsub_update(actor_id, raw_table_data):
            update = actor_table_data_to_dict(raw_table_data)
            # If actor is not new registered but updated, we only update
            # states related fields.
            if update["state"] != "DEPENDENCIES_UNREADY":
                merged = dict(DataSource.actors[actor_id])
                for field in volatile_fields:
                    merged[field] = update[field]
                update = merged
            actor_id = update["actorId"]
            owning_job = update["jobId"]
            hosting_node = update["address"]["rayletId"]
            # Update actors.
            DataSource.actors[actor_id] = update
            # Update node actors (only when node_id is not Nil).
            if hosting_node != actor_consts.NIL_NODE_ID:
                on_node = dict(DataSource.node_actors.get(hosting_node, {}))
                on_node[actor_id] = update
                DataSource.node_actors[hosting_node] = on_node
            # Update job actors.
            in_job = dict(DataSource.job_actors.get(owning_job, {}))
            in_job[actor_id] = update
            DataSource.job_actors[owning_job] = in_job

        # Receive actors from channel.
        actor_subscriber = GcsAioActorSubscriber(
            address=self._dashboard_head.gcs_address
        )
        await actor_subscriber.subscribe()

        while True:
            try:
                polled_id, polled_data = await actor_subscriber.poll()
                if polled_id is None:
                    continue
                # Convert to lower case hex ID.
                apply_pubsub_update(polled_id.hex(), polled_data)
            except Exception:
                logger.exception("Error processing actor info from GCS.")
Ejemplo n.º 6
0
    async def _update_actors(self):
        """Keep the actor views in ``DataSource`` in sync with the GCS.

        Subscribes to the actor channel pattern, loads the full actor table
        with ``GetAllActorInfo`` (retrying until one request succeeds) to
        reset ``DataSource.actors``, ``DataSource.job_actors`` and
        ``DataSource.node_actors``, and then applies every message received
        on the subscription to those tables. Errors are logged and do not
        propagate.
        """
        # Subscribe actor channel.
        redis_client = self._dashboard_head.aioredis_client
        channel_receiver = Receiver()

        channel_key = "{}:*".format(stats_collector_consts.ACTOR_CHANNEL)
        channel_pattern = channel_receiver.pattern(channel_key)
        await redis_client.psubscribe(channel_pattern)
        logger.info("Subscribed to %s", channel_key)

        def _to_actor_dict(proto_message):
            # Convert the message to a dict and attach, under "actorClass",
            # the class name derived from its (possibly missing) task spec.
            info = actor_table_data_to_dict(proto_message)
            info["actorClass"] = actor_classname_from_task_spec(
                info.get("taskSpec", {}))
            return info

        # Get all actor info.
        while True:
            try:
                logger.info("Getting all actor info from GCS.")
                all_actors_request = gcs_service_pb2.GetAllActorInfoRequest()
                response = await self._gcs_actor_info_stub.GetAllActorInfo(
                    all_actors_request, timeout=5)
                if response.status.code != 0:
                    # Raised inside the try so a bad status is logged and
                    # retried like any other failure.
                    raise Exception(
                        f"Failed to GetAllActorInfo: {response.status.message}"
                    )
                snapshot = {}
                for proto in response.actor_table_data:
                    info = _to_actor_dict(proto)
                    snapshot[info["actorId"]] = info
                # Update actors.
                DataSource.actors.reset(snapshot)
                # Update node actors and job actors.
                by_job = {}
                by_node = {}
                for known_id, info in snapshot.items():
                    owning_job = info["jobId"]
                    hosting_node = info["address"]["rayletId"]
                    by_job.setdefault(owning_job, {})[known_id] = info
                    by_node.setdefault(hosting_node, {})[known_id] = info
                DataSource.job_actors.reset(by_job)
                DataSource.node_actors.reset(by_node)
                logger.info("Received %d actor info from GCS.", len(snapshot))
                break
            except Exception:
                logger.exception("Error Getting all actor info from GCS.")
                await asyncio.sleep(stats_collector_consts.
                                    RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS)

        # Receive actors from channel.
        async for _sender, payload in channel_receiver.iter():
            try:
                _channel, raw_bytes = payload
                envelope = ray.gcs_utils.PubSubMessage.FromString(raw_bytes)
                actor_proto = ray.gcs_utils.ActorTableData.FromString(
                    envelope.data)
                info = _to_actor_dict(actor_proto)
                actor_id = info["actorId"]
                owning_job = info["jobId"]
                hosting_node = info["address"]["rayletId"]
                # Update actors.
                DataSource.actors[actor_id] = info
                # Update node actors.
                on_node = dict(DataSource.node_actors.get(hosting_node, {}))
                on_node[actor_id] = info
                DataSource.node_actors[hosting_node] = on_node
                # Update job actors.
                in_job = dict(DataSource.job_actors.get(owning_job, {}))
                in_job[actor_id] = info
                DataSource.job_actors[owning_job] = in_job
            except Exception:
                # One bad message must not stop the subscription loop.
                logger.exception("Error receiving actor info.")