Example #1
0
    async def get_job_info(self):
        """Fetch every job known to GCS, keyed by hex-encoded job id.

        Returns:
            A dict mapping each job's hex id to a dict with the keys
            "is_dead", "start_time", "end_time" and "config". The "config"
            value holds "env_vars", "namespace", "metadata" and the
            JSON-decoded "runtime_env" of that job.
        """
        reply = await self._gcs_job_info_stub.GetAllJobInfo(
            gcs_service_pb2.GetAllJobInfoRequest(), timeout=5)

        def _summarize(job):
            # Flatten one job table entry into plain Python containers.
            job_config = job.config
            config = {
                "env_vars": dict(job_config.worker_env),
                "namespace": job_config.ray_namespace,
                "metadata": dict(job_config.metadata),
                "runtime_env": json.loads(job_config.serialized_runtime_env),
            }
            return {
                "is_dead": job.is_dead,
                "start_time": job.start_time,
                "end_time": job.end_time,
                "config": config,
            }

        return {
            job.job_id.hex(): _summarize(job)
            for job in reply.job_info_list
        }
Example #2
0
    async def get_job_info(self):
        """Return info for each job; here a job is a Ray driver.

        Returns:
            A dict mapping each job's hex-encoded id to a dict with the keys
            "status", "status_message", "is_dead", "start_time", "end_time"
            and "config". "status" and "status_message" are None when
            self._get_job_info(metadata) returns None for that job.
        """
        reply = await self._gcs_job_info_stub.GetAllJobInfo(
            gcs_service_pb2.GetAllJobInfoRequest(), timeout=5)

        result = {}
        for job in reply.job_info_list:
            hex_id = job.job_id.hex()
            job_config = job.config
            job_metadata = dict(job_config.metadata)
            runtime_env = RuntimeEnv.deserialize(
                job_config.runtime_env_info.serialized_runtime_env)
            config = {
                "namespace": job_config.ray_namespace,
                "metadata": job_metadata,
                "runtime_env": runtime_env,
            }

            # The status fields come from a lookup driven by the metadata.
            info = self._get_job_info(job_metadata)
            if info is None:
                status, status_message = None, None
            else:
                status, status_message = info.status, info.message

            result[hex_id] = {
                "status": status,
                "status_message": status_message,
                "is_dead": job.is_dead,
                "start_time": job.start_time,
                "end_time": job.end_time,
                "config": config,
            }

        return result
Example #3
0
 def get_all_job_info(
         self,
         timeout: Optional[float] = None
 ) -> gcs_service_pb2.GetAllJobInfoReply:
     """Issue a GetAllJobInfo request and return the reply unchanged.

     Args:
         timeout: Forwarded as the ``timeout`` keyword of the stub call.

     Returns:
         The reply object produced by ``self._job_info_stub.GetAllJobInfo``.
     """
     return self._job_info_stub.GetAllJobInfo(
         gcs_service_pb2.GetAllJobInfoRequest(), timeout=timeout)
Example #4
0
    async def get_job_info(self):
        """Collect a summary of every job reported by GCS.

        Returns:
            A dict mapping each job's hex-encoded id to a dict with the keys
            "status", "status_message", "is_dead", "start_time", "end_time"
            and "config". "status" and "status_message" are None when
            self._get_job_status(metadata) returns None for that job.
        """
        reply = await self._gcs_job_info_stub.GetAllJobInfo(
            gcs_service_pb2.GetAllJobInfoRequest(), timeout=5)

        summaries = {}
        for job in reply.job_info_list:
            hex_id = job.job_id.hex()
            job_metadata = dict(job.config.metadata)
            # The runtime env is stored as a JSON string; decode it here.
            runtime_env = json.loads(
                job.config.runtime_env.serialized_runtime_env)
            config = {
                "namespace": job.config.ray_namespace,
                "metadata": job_metadata,
                "runtime_env": runtime_env,
            }

            job_status = self._get_job_status(job_metadata)
            if job_status is None:
                state, state_message = None, None
            else:
                state, state_message = job_status.status, job_status.message

            summaries[hex_id] = {
                "status": state,
                "status_message": state_message,
                "is_dead": job.is_dead,
                "start_time": job.start_time,
                "end_time": job.end_time,
                "config": config,
            }

        return summaries
Example #5
0
    async def _update_jobs(self):
        """Keep DataSource.jobs in step with the job table held by GCS.

        The steps run in this order:
          1. Pattern-subscribe to the job channel.
          2. Load all job info from GCS into DataSource.jobs, retrying after
             a sleep whenever the request fails or reports a non-zero status.
          3. Apply each job message received on the channel; messages for
             jobs not submitted from the dashboard are logged and ignored.
        """
        # Subscribe before the initial load is requested.
        redis_client = self._dashboard_head.aioredis_client
        receiver = Receiver()

        channel_key = f"{job_consts.JOB_CHANNEL}:*"
        await redis_client.psubscribe(receiver.pattern(channel_key))
        logger.info("Subscribed to %s", channel_key)

        # Initial full load; loop until one attempt succeeds.
        while True:
            try:
                logger.info("Getting all job info from GCS.")
                reply = await self._gcs_job_info_stub.GetAllJobInfo(
                    gcs_service_pb2.GetAllJobInfoRequest(), timeout=5)
                if reply.status.code != 0:
                    # Routed through the handler below so it is retried.
                    raise Exception(
                        f"Failed to GetAllJobInfo: {reply.status.message}")

                snapshot = {}
                for job in map(job_table_data_to_dict, reply.job_info_list):
                    snapshot[job["jobId"]] = job
                DataSource.jobs.reset(snapshot)
                logger.info("Received %d job info from GCS.", len(snapshot))
                break
            except Exception:
                logger.exception("Error Getting all job info from GCS.")
                await asyncio.sleep(
                    job_consts.RETRY_GET_ALL_JOB_INFO_INTERVAL_SECONDS)

        # Incremental updates from the subscribed channel.
        async for _sender, msg in receiver.iter():
            try:
                _channel, payload = msg
                envelope = ray.gcs_utils.PubSubMessage.FromString(payload)
                job_data = ray.gcs_utils.JobTableData.FromString(
                    envelope.data)
                raylet_job_id = ray._raylet.JobID(job_data.job_id)
                if not raylet_job_id.is_submitted_from_dashboard():
                    logger.info(
                        "Ignore job %s which is not submitted from dashboard.",
                        raylet_job_id.hex())
                    continue

                job_dict = job_table_data_to_dict(job_data)
                DataSource.jobs[job_dict["jobId"]] = job_dict
            except Exception:
                # One bad message must not stop the receive loop.
                logger.exception("Error receiving job info.")
Example #6
0
    async def _get_job_activity_info(self, timeout: int) -> RayActivityResponse:
        """Report whether any driver (job) counts as Ray activity.

        A job is counted when it is not dead and its namespace does not start
        with JobInfoStorageClient.JOB_DATA_KEY_PREFIX.

        Args:
            timeout: Timeout forwarded to the GetAllJobInfo request.

        Returns:
            A RayActivityResponse that is ACTIVE when at least one job is
            counted and INACTIVE otherwise. If anything raises, the exception
            is logged and an ERROR response carrying repr(exception) as its
            reason is returned instead.
        """
        try:
            reply = await self._gcs_job_info_stub.GetAllJobInfo(
                gcs_service_pb2.GetAllJobInfoRequest(), timeout=timeout
            )

            live_drivers = 0
            for job in reply.job_info_list:
                dead = bool(job.is_dead)
                internal = job.config.ray_namespace.startswith(
                    JobInfoStorageClient.JOB_DATA_KEY_PREFIX
                )
                if dead or internal:
                    continue
                live_drivers += 1

            if live_drivers > 0:
                activity = RayActivityStatus.ACTIVE
                reason = f"Number of active drivers: {live_drivers}"
            else:
                activity = RayActivityStatus.INACTIVE
                reason = None

            return RayActivityResponse(
                is_active=activity,
                reason=reason,
                timestamp=datetime.now().timestamp(),
            )
        except Exception as err:
            logger.exception("Failed to get activity status of Ray drivers.")
            return RayActivityResponse(
                is_active=RayActivityStatus.ERROR,
                reason=repr(err),
                timestamp=datetime.now().timestamp(),
            )
Example #7
0
    async def _get_job_activity_info(self,
                                     timeout: int) -> RayActivityResponse:
        """Report whether any driver (job) counts as Ray activity.

        A job is counted when it is not dead and its namespace does not start
        with JobInfoStorageClient.JOB_DATA_KEY_PREFIX.

        Args:
            timeout: Timeout forwarded to the GetAllJobInfo request.

        Returns:
            A RayActivityResponse whose is_active is True when at least one
            job is counted. Its reason states the count, or is None when the
            count is zero.
        """
        reply = await self._gcs_job_info_stub.GetAllJobInfo(
            gcs_service_pb2.GetAllJobInfoRequest(), timeout=timeout)

        live_drivers = 0
        for job in reply.job_info_list:
            dead = bool(job.is_dead)
            internal = job.config.ray_namespace.startswith(
                JobInfoStorageClient.JOB_DATA_KEY_PREFIX)
            if dead or internal:
                continue
            live_drivers += 1

        if live_drivers:
            reason = f"Number of active drivers: {live_drivers}"
        else:
            reason = None

        return RayActivityResponse(
            is_active=live_drivers > 0,
            reason=reason,
            timestamp=datetime.now().timestamp(),
        )