async def get_job_info(self):
    """Fetch every job known to GCS and return them keyed by hex job id.

    Returns:
        A dict mapping ``job_id.hex()`` to a dict with the keys
        "is_dead", "start_time", "end_time" and "config". "config" is
        itself a dict holding "env_vars", "namespace", "metadata" and
        "runtime_env", where "runtime_env" is the JSON-decoded
        ``serialized_runtime_env`` string of the job config.
    """
    reply = await self._gcs_job_info_stub.GetAllJobInfo(
        gcs_service_pb2.GetAllJobInfoRequest(), timeout=5
    )

    def _describe(table_entry):
        # Build the nested config first, then the outer record, mirroring
        # the field order exposed to callers.
        job_config = table_entry.config
        config_view = {
            "env_vars": dict(job_config.worker_env),
            "namespace": job_config.ray_namespace,
            "metadata": dict(job_config.metadata),
            "runtime_env": json.loads(job_config.serialized_runtime_env),
        }
        return {
            "is_dead": table_entry.is_dead,
            "start_time": table_entry.start_time,
            "end_time": table_entry.end_time,
            "config": config_view,
        }

    return {
        table_entry.job_id.hex(): _describe(table_entry)
        for table_entry in reply.job_info_list
    }
async def get_job_info(self):
    """Return info for each job. Here a job is a Ray driver.

    Returns:
        A dict mapping ``job_id.hex()`` to a dict with the keys "status",
        "status_message", "is_dead", "start_time", "end_time" and "config".
        "status" and "status_message" are None when
        ``self._get_job_info(metadata)`` returns None.
    """
    reply = await self._gcs_job_info_stub.GetAllJobInfo(
        gcs_service_pb2.GetAllJobInfoRequest(), timeout=5
    )

    jobs_by_id = {}
    for table_entry in reply.job_info_list:
        hex_id = table_entry.job_id.hex()
        job_config = table_entry.config
        metadata = dict(job_config.metadata)

        runtime_env = RuntimeEnv.deserialize(
            job_config.runtime_env_info.serialized_runtime_env
        )
        config_view = {
            "namespace": job_config.ray_namespace,
            "metadata": metadata,
            "runtime_env": runtime_env,
        }

        # The status lookup is driven by the job's metadata; it may find
        # nothing, in which case both status fields are reported as None.
        info = self._get_job_info(metadata)
        if info is None:
            status, status_message = None, None
        else:
            status, status_message = info.status, info.message

        jobs_by_id[hex_id] = {
            "status": status,
            "status_message": status_message,
            "is_dead": table_entry.is_dead,
            "start_time": table_entry.start_time,
            "end_time": table_entry.end_time,
            "config": config_view,
        }
    return jobs_by_id
def get_all_job_info(
    self, timeout: Optional[float] = None
) -> gcs_service_pb2.GetAllJobInfoReply:
    """Issue a GetAllJobInfo call on the job info stub and return the reply.

    Args:
        timeout: Forwarded unchanged as the ``timeout`` keyword of the stub
            call; None is passed through as-is.

    Returns:
        The reply object produced by ``GetAllJobInfo``.
    """
    return self._job_info_stub.GetAllJobInfo(
        gcs_service_pb2.GetAllJobInfoRequest(), timeout=timeout
    )
async def get_job_info(self):
    """Fetch every job from GCS and return a summary keyed by hex job id.

    Returns:
        A dict mapping ``job_id.hex()`` to a dict with the keys "status",
        "status_message", "is_dead", "start_time", "end_time" and "config".
        "config" holds "namespace", "metadata" and "runtime_env" (the
        JSON-decoded serialized runtime env). "status" and
        "status_message" are None when ``self._get_job_status(metadata)``
        returns None.
    """
    reply = await self._gcs_job_info_stub.GetAllJobInfo(
        gcs_service_pb2.GetAllJobInfoRequest(), timeout=5
    )

    summaries = {}
    for table_entry in reply.job_info_list:
        hex_id = table_entry.job_id.hex()
        job_config = table_entry.config
        metadata = dict(job_config.metadata)

        config_view = {"namespace": job_config.ray_namespace}
        config_view["metadata"] = metadata
        config_view["runtime_env"] = json.loads(
            job_config.runtime_env.serialized_runtime_env
        )

        # Status is looked up from the job metadata and may be absent.
        job_status = self._get_job_status(metadata)
        has_status = job_status is not None

        summary = {
            "status": job_status.status if has_status else None,
            "status_message": job_status.message if has_status else None,
        }
        summary["is_dead"] = table_entry.is_dead
        summary["start_time"] = table_entry.start_time
        summary["end_time"] = table_entry.end_time
        summary["config"] = config_view

        summaries[hex_id] = summary
    return summaries
async def _update_jobs(self):
    """Populate ``DataSource.jobs`` from GCS and keep it updated.

    Three phases, in order:
      1. Pattern-subscribe to ``{job_consts.JOB_CHANNEL}:*`` on the
         dashboard head's aioredis client.
      2. Load the full job table via ``GetAllJobInfo`` and reset
         ``DataSource.jobs`` with it, retrying after a sleep until one
         attempt succeeds.
      3. Consume messages from the subscription and store each job that
         was submitted from the dashboard; other jobs are logged and
         ignored. Errors while handling one message are logged and do not
         stop the loop.
    """
    # Subscribe job channel.
    redis_client = self._dashboard_head.aioredis_client
    receiver = Receiver()
    channel_key = f"{job_consts.JOB_CHANNEL}:*"
    subscription = receiver.pattern(channel_key)
    await redis_client.psubscribe(subscription)
    logger.info("Subscribed to %s", channel_key)

    # Get all job info.
    while True:
        try:
            logger.info("Getting all job info from GCS.")
            reply = await self._gcs_job_info_stub.GetAllJobInfo(
                gcs_service_pb2.GetAllJobInfoRequest(), timeout=5
            )
            if reply.status.code != 0:
                # Raised only to reach the shared log-and-retry path below.
                raise Exception(
                    f"Failed to GetAllJobInfo: {reply.status.message}")
            snapshot = {
                row["jobId"]: row
                for row in map(job_table_data_to_dict, reply.job_info_list)
            }
            # Update jobs.
            DataSource.jobs.reset(snapshot)
            logger.info("Received %d job info from GCS.", len(snapshot))
        except Exception:
            logger.exception("Error Getting all job info from GCS.")
            await asyncio.sleep(
                job_consts.RETRY_GET_ALL_JOB_INFO_INTERVAL_SECONDS)
        else:
            break

    # Receive jobs from channel.
    async for _sender, msg in receiver.iter():
        try:
            _, payload = msg
            envelope = ray.gcs_utils.PubSubMessage.FromString(payload)
            job_message = ray.gcs_utils.JobTableData.FromString(
                envelope.data)
            raw_job_id = ray._raylet.JobID(job_message.job_id)
            if not raw_job_id.is_submitted_from_dashboard():
                logger.info(
                    "Ignore job %s which is not submitted from dashboard.",
                    raw_job_id.hex())
                continue
            job_row = job_table_data_to_dict(job_message)
            # Update jobs.
            DataSource.jobs[job_row["jobId"]] = job_row
        except Exception:
            logger.exception("Error receiving job info.")
async def _get_job_activity_info(self, timeout: int) -> RayActivityResponse:
    """Report whether there is Ray activity from drivers (jobs).

    A driver counts as active when its job table entry is not dead and its
    namespace does not start with
    ``JobInfoStorageClient.JOB_DATA_KEY_PREFIX``.

    Args:
        timeout: Forwarded as the ``timeout`` of the ``GetAllJobInfo`` call.

    Returns:
        A RayActivityResponse whose ``is_active`` is ACTIVE when at least
        one driver is active and INACTIVE otherwise; ``reason`` carries the
        active-driver count, or None when the count is zero. If anything
        raises, the exception is logged and a response with ``is_active``
        set to ERROR and ``reason`` set to the exception's repr is returned.
    """

    def _is_active_driver(table_entry) -> bool:
        # Both checks are always evaluated for every entry.
        dead = bool(table_entry.is_dead)
        internal = table_entry.config.ray_namespace.startswith(
            JobInfoStorageClient.JOB_DATA_KEY_PREFIX
        )
        return not (dead or internal)

    try:
        reply = await self._gcs_job_info_stub.GetAllJobInfo(
            gcs_service_pb2.GetAllJobInfoRequest(), timeout=timeout
        )
        num_active_drivers = sum(
            1 for table_entry in reply.job_info_list
            if _is_active_driver(table_entry)
        )

        if num_active_drivers > 0:
            activity = RayActivityStatus.ACTIVE
            reason = f"Number of active drivers: {num_active_drivers}"
        else:
            activity = RayActivityStatus.INACTIVE
            reason = None

        return RayActivityResponse(
            is_active=activity,
            reason=reason,
            timestamp=datetime.now().timestamp(),
        )
    except Exception as e:
        logger.exception("Failed to get activity status of Ray drivers.")
        return RayActivityResponse(
            is_active=RayActivityStatus.ERROR,
            reason=repr(e),
            timestamp=datetime.now().timestamp(),
        )
async def _get_job_activity_info(self, timeout: int) -> RayActivityResponse:
    """Report whether there is Ray activity from drivers (jobs).

    A driver counts as active when its job table entry is not dead and its
    namespace does not start with
    ``JobInfoStorageClient.JOB_DATA_KEY_PREFIX``.

    Args:
        timeout: Forwarded as the ``timeout`` of the ``GetAllJobInfo`` call.

    Returns:
        A RayActivityResponse whose ``is_active`` is True when at least one
        driver is active; ``reason`` carries the active-driver count, or
        None when the count is zero. Exceptions from the GCS call are not
        caught here.
    """

    def _is_active_driver(table_entry) -> bool:
        # Both checks are always evaluated for every entry.
        dead = bool(table_entry.is_dead)
        internal = table_entry.config.ray_namespace.startswith(
            JobInfoStorageClient.JOB_DATA_KEY_PREFIX)
        return not (dead or internal)

    reply = await self._gcs_job_info_stub.GetAllJobInfo(
        gcs_service_pb2.GetAllJobInfoRequest(), timeout=timeout)

    num_active_drivers = sum(
        1 for table_entry in reply.job_info_list
        if _is_active_driver(table_entry)
    )

    if num_active_drivers:
        reason = f"Number of active drivers: {num_active_drivers}"
    else:
        reason = None

    return RayActivityResponse(
        is_active=num_active_drivers > 0,
        reason=reason,
        timestamp=datetime.now().timestamp(),
    )