async def set_fetch_memory_info(self, req) -> aiohttp.web.Response:
    should_fetch = req.query["shouldFetch"]
    if should_fetch == "true":
        self._collect_memory_info = True
    elif should_fetch == "false":
        self._collect_memory_info = False
    else:
        return dashboard_utils.rest_response(
            success=False,
            message=f"Unknown argument to set_fetch {should_fetch}")
    return dashboard_utils.rest_response(
        success=True,
        message=f"Successfully set fetching to {should_fetch}")
async def get_job(self, req) -> aiohttp.web.Response:
    job_id = req.match_info.get("job_id")
    view = req.query.get("view")
    if view is None:
        job_detail = {
            "jobInfo": DataSource.jobs.get(job_id, {}),
            "jobActors": DataSource.job_actors.get(job_id, {}),
            "jobWorkers": DataSource.job_workers.get(job_id, []),
        }
        await GlobalSignals.job_info_fetched.send(job_detail)
        return dashboard_utils.rest_response(
            success=True, message="Job detail fetched.", detail=job_detail)
    else:
        return dashboard_utils.rest_response(
            success=False, message="Unknown view {}".format(view))
async def get_event(self, req) -> aiohttp.web.Response:
    job_id = req.query.get("job_id")
    if job_id is None:
        all_events = {
            job_id: list(job_events.values())
            for job_id, job_events in DataSource.events.items()
        }
        return dashboard_utils.rest_response(
            success=True, message="All events fetched.", events=all_events)
    job_events = DataSource.events.get(job_id, {})
    return dashboard_utils.rest_response(
        success=True,
        message="Job events fetched.",
        job_id=job_id,
        events=list(job_events.values()))
async def get_errors(self, req) -> aiohttp.web.Response:
    ip = req.query["ip"]
    pid = req.query.get("pid")
    node_errors = DataSource.ip_and_pid_to_errors[ip]
    filtered_errs = node_errors.get(pid, []) if pid else node_errors
    return dashboard_utils.rest_response(
        success=True, message="Fetched errors.", errors=filtered_errs)
async def get_logs(self, req) -> aiohttp.web.Response:
    ip = req.query["ip"]
    pid = req.query.get("pid")
    node_logs = DataSource.ip_and_pid_to_logs[ip]
    payload = node_logs.get(pid, []) if pid else node_logs
    return dashboard_utils.rest_response(
        success=True, message="Fetched logs.", logs=payload)
async def get_cluster_status(self, req):
    """Returns status information about the cluster.

    Currently contains three fields:
        autoscaling_status (str): a status message from the autoscaler.
        autoscaling_error (str): an error message from the autoscaler if
            anything has gone wrong during autoscaling.
        cluster_status (dict): the parsed autoscaler status, if available.

    These fields are all read from the GCS; the autoscaler is expected to
    write them there.
    """
    aioredis_client = self._dashboard_head.aioredis_client
    legacy_status = await aioredis_client.hget(
        DEBUG_AUTOSCALING_STATUS_LEGACY, "value")
    formatted_status_string = await aioredis_client.hget(
        DEBUG_AUTOSCALING_STATUS, "value")
    formatted_status = (json.loads(formatted_status_string.decode())
                        if formatted_status_string else {})
    error = await aioredis_client.hget(DEBUG_AUTOSCALING_ERROR, "value")
    return dashboard_utils.rest_response(
        success=True,
        message="Got cluster status.",
        autoscaling_status=legacy_status.decode() if legacy_status else None,
        autoscaling_error=error.decode() if error else None,
        cluster_status=formatted_status if formatted_status else None,
    )
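# For context on the docstring above ("the autoscaler is expected to write
# them there"): a minimal sketch of the producer side, assuming the
# autoscaler/monitor process publishes these hashes with a plain redis
# client. The key constants below are placeholder stand-ins for the same
# DEBUG_AUTOSCALING_* values the handler imports, and the payload shape is
# illustrative only.
import json
import time

import redis

DEBUG_AUTOSCALING_STATUS = "__autoscaling_status"  # placeholder value
DEBUG_AUTOSCALING_ERROR = "__autoscaling_error"  # placeholder value

# Placeholder address; the real autoscaler talks to the cluster's Redis/GCS.
redis_client = redis.Redis(host="localhost", port=6379)

status_payload = {"load_metrics_report": {}, "time": time.time()}
redis_client.hset(
    DEBUG_AUTOSCALING_STATUS, "value", json.dumps(status_payload))
redis_client.hset(DEBUG_AUTOSCALING_ERROR, "value", "")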
async def get_errors(self, req) -> aiohttp.web.Response:
    ip = req.query["ip"]
    pid = str(req.query.get("pid", ""))
    node_errors = DataSource.ip_and_pid_to_errors.get(ip, {})
    if pid:
        node_errors = {str(pid): node_errors.get(pid, [])}
    return dashboard_utils.rest_response(
        success=True, message="Fetched errors.", errors=node_errors)
async def get_availability(self, req) -> aiohttp.web.Response:
    availability = {
        "available": Analysis is not None,
        "trials_available": self._trials_available,
    }
    return rest_response(
        success=True,
        message="Fetched tune availability",
        result=availability)
async def snapshot(self, req):
    job_data = await self.get_job_info()
    actor_data = await self.get_actor_info()
    snapshot = {
        "jobs": job_data,
        "actors": actor_data,
    }
    return dashboard_utils.rest_response(
        success=True, message="Fetched snapshot.", snapshot=snapshot)
async def dump(self, req) -> aiohttp.web.Response:
    key = req.query.get("key")
    if key is None:
        all_data = {
            k: dict(v)
            for k, v in DataSource.__dict__.items()
            if not k.startswith("_")
        }
        return dashboard_utils.rest_response(
            success=True,
            message="Fetched all data from the datacenter.",
            **all_data)
    else:
        data = dict(DataSource.__dict__.get(key))
        return dashboard_utils.rest_response(
            success=True,
            message=f"Fetched {key} from the datacenter.",
            **{key: data})
async def get_actor_groups(self, req) -> aiohttp.web.Response:
    actors = await DataOrganizer.get_all_actors()
    actor_creation_tasks = await DataOrganizer.get_actor_creation_tasks()
    # Actor creation tasks share a common interface with actors and are
    # processed and shown in tandem in the logical view, so merge them
    # together before constructing the actor groups.
    actors.update(actor_creation_tasks)
    actor_groups = actor_utils.construct_actor_groups(actors)
    return rest_response(
        success=True,
        message="Fetched actor groups.",
        actor_groups=actor_groups)
async def get_ray_config(self, req) -> aiohttp.web.Response:
    if self._ray_config is None:
        config_path = os.path.expanduser("~/ray_bootstrap_config.yaml")
        try:
            with open(config_path) as f:
                cfg = yaml.safe_load(f)
        except FileNotFoundError:
            return dashboard_utils.rest_response(
                success=False,
                message=f"No config found at {config_path}.",
            )
        except yaml.YAMLError:
            return dashboard_utils.rest_response(
                success=False,
                message="Invalid config, could not load YAML.")

        payload = {
            "min_workers": cfg["min_workers"],
            "max_workers": cfg["max_workers"],
            "initial_workers": cfg["initial_workers"],
            "autoscaling_mode": cfg["autoscaling_mode"],
            "idle_timeout_minutes": cfg["idle_timeout_minutes"],
        }

        try:
            payload["head_type"] = cfg["head_node"]["InstanceType"]
        except KeyError:
            payload["head_type"] = "unknown"

        try:
            payload["worker_type"] = cfg["worker_nodes"]["InstanceType"]
        except KeyError:
            payload["worker_type"] = "unknown"

        self._ray_config = payload

    return dashboard_utils.rest_response(
        success=True,
        message="Fetched ray config.",
        **self._ray_config,
    )
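# For reference, a sketch of the minimal ~/ray_bootstrap_config.yaml shape
# that get_ray_config() above expects. The keys mirror exactly what the
# handler reads; the concrete values are placeholders, and a real autoscaler
# config carries many more fields.
import os

import yaml

minimal_cfg = {
    "min_workers": 0,
    "max_workers": 2,
    "initial_workers": 0,
    "autoscaling_mode": "default",
    "idle_timeout_minutes": 5,
    "head_node": {"InstanceType": "m5.large"},  # placeholder instance type
    "worker_nodes": {"InstanceType": "m5.large"},  # placeholder instance type
}

with open(os.path.expanduser("~/ray_bootstrap_config.yaml"), "w") as f:
    yaml.safe_dump(minimal_cfg, f)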
async def launch_profiling(self, req) -> aiohttp.web.Response:
    ip = req.query["ip"]
    pid = int(req.query["pid"])
    duration = int(req.query["duration"])
    reporter_stub = self._stubs[ip]
    reply = await reporter_stub.GetProfilingStats(
        reporter_pb2.GetProfilingStatsRequest(pid=pid, duration=duration))
    profiling_info = (json.loads(reply.profiling_stats)
                      if reply.profiling_stats else reply.std_out)
    return dashboard_utils.rest_response(
        success=True,
        message="Profiling success.",
        profiling_info=profiling_info)
async def kill_actor(self, req) -> aiohttp.web.Response:
    try:
        actor_id = req.query["actorId"]
        ip_address = req.query["ipAddress"]
        port = req.query["port"]
    except KeyError:
        return rest_response(success=False, message="Bad Request")
    try:
        channel = aiogrpc.insecure_channel(f"{ip_address}:{port}")
        stub = core_worker_pb2_grpc.CoreWorkerServiceStub(channel)
        await stub.KillActor(
            core_worker_pb2.KillActorRequest(
                intended_actor_id=ray.utils.hex_to_binary(actor_id)))
    except aiogrpc.AioRpcError:
        # This call always raises, because the worker is killed and the
        # channel is closed on the worker side before the reply arrives.
        # The actor is still deleted correctly, so the error is ignored.
        pass
    return rest_response(
        success=True, message=f"Killed actor with id {actor_id}")
async def get_memory_table(self, req) -> aiohttp.web.Response:
    group_by = req.query.get("group_by")
    sort_by = req.query.get("sort_by")
    kwargs = {}
    if group_by:
        kwargs["group_by"] = GroupByType(group_by)
    if sort_by:
        kwargs["sort_by"] = SortingType(sort_by)
    memory_table = await DataOrganizer.get_memory_table(**kwargs)
    return dashboard_utils.rest_response(
        success=True,
        message="Fetched memory table",
        memory_table=memory_table.as_dict())
async def snapshot(self, req):
    job_data = await self.get_job_info()
    actor_data = await self.get_actor_info()
    serve_data = await self.get_serve_info()
    session_name = await self.get_session_name()
    snapshot = {
        "jobs": job_data,
        "actors": actor_data,
        "deployments": serve_data,
        "session_name": session_name,
        "ray_version": ray.__version__,
        "ray_commit": ray.__commit__,
    }
    return dashboard_utils.rest_response(
        success=True, message="Fetched snapshot.", snapshot=snapshot)
async def test_aiohttp_cache_lru(self, req) -> aiohttp.web.Response:
    value = req.query.get("value")
    return dashboard_utils.rest_response(
        success=True, message="OK", value=value, timestamp=time.time())
async def get_notified_agents(self, req) -> aiohttp.web.Response:
    return dashboard_utils.rest_response(
        success=True,
        message="Fetched notified agents.",
        **self._notified_agents)
async def enable_tensorboard(self, req) -> aiohttp.web.Response:
    self._enable_tensorboard()
    if not self._tensor_board_dir:
        return rest_response(
            success=False, message="Error enabling tensorboard")
    return rest_response(success=True, message="Enabled tensorboard")
async def tune_info(self, req) -> aiohttp.web.Response:
    stats = self.get_stats()
    return rest_response(
        success=True, message="Fetched tune info", result=stats)
def test_immutable_types():
    d = {str(i): i for i in range(1000)}
    d["list"] = list(range(1000))
    d["list"][0] = {str(i): i for i in range(1000)}
    d["dict"] = {str(i): i for i in range(1000)}
    immutable_dict = dashboard_utils.make_immutable(d)
    assert type(immutable_dict) == dashboard_utils.ImmutableDict
    assert immutable_dict == dashboard_utils.ImmutableDict(d)
    assert immutable_dict == d
    assert dashboard_utils.ImmutableDict(immutable_dict) == immutable_dict
    assert dashboard_utils.ImmutableList(
        immutable_dict["list"]) == immutable_dict["list"]
    assert "512" in d
    assert "512" in d["list"][0]
    assert "512" in d["dict"]

    # Test type conversion
    assert type(dict(immutable_dict)["list"]) == dashboard_utils.ImmutableList
    assert type(list(
        immutable_dict["list"])[0]) == dashboard_utils.ImmutableDict

    # Test json dumps / loads
    json_str = json.dumps(immutable_dict, cls=dashboard_utils.CustomEncoder)
    deserialized_immutable_dict = json.loads(json_str)
    assert type(deserialized_immutable_dict) == dict
    assert type(deserialized_immutable_dict["list"]) == list
    assert immutable_dict.mutable() == deserialized_immutable_dict
    dashboard_utils.rest_response(True, "OK", data=immutable_dict)
    dashboard_utils.rest_response(True, "OK", **immutable_dict)

    # Test copy
    copy_of_immutable = copy.copy(immutable_dict)
    assert copy_of_immutable == immutable_dict
    deepcopy_of_immutable = copy.deepcopy(immutable_dict)
    assert deepcopy_of_immutable == immutable_dict

    # Test get default immutable
    immutable_default_value = immutable_dict.get("not exist list", [1, 2])
    assert type(immutable_default_value) == dashboard_utils.ImmutableList

    # Test recursive immutable
    assert type(immutable_dict["list"]) == dashboard_utils.ImmutableList
    assert type(immutable_dict["dict"]) == dashboard_utils.ImmutableDict
    assert type(immutable_dict["list"][0]) == dashboard_utils.ImmutableDict

    # Test exception
    with pytest.raises(TypeError):
        dashboard_utils.ImmutableList((1, 2))

    with pytest.raises(TypeError):
        dashboard_utils.ImmutableDict([1, 2])

    with pytest.raises(TypeError):
        immutable_dict["list"] = []

    with pytest.raises(AttributeError):
        immutable_dict.update({1: 3})

    with pytest.raises(TypeError):
        immutable_dict["list"][0] = 0

    with pytest.raises(AttributeError):
        immutable_dict["list"].extend([1, 2])

    with pytest.raises(AttributeError):
        immutable_dict["list"].insert(1, 2)

    d2 = dashboard_utils.ImmutableDict({1: np.zeros([3, 5])})
    with pytest.raises(TypeError):
        print(d2[1])

    d3 = dashboard_utils.ImmutableList([1, np.zeros([3, 5])])
    with pytest.raises(TypeError):
        print(d3[1])
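# For orientation, a minimal sketch of the wrapper semantics the test above
# exercises: construction rejects the wrong container type, lookups wrap
# nested containers recursively, and no mutating API is exposed. The names
# below are hypothetical stand-ins; the real dashboard_utils.ImmutableDict /
# ImmutableList also provide JSON encoding, .mutable(), copy support, etc.
class SketchImmutableList:
    def __init__(self, items):
        if not isinstance(items, list):
            raise TypeError(f"{type(items)} object is not a list")
        self._items = items

    def __getitem__(self, index):
        # Reads are allowed; results are wrapped so nested data stays
        # read-only as well.
        return _sketch_wrap(self._items[index])

    def __len__(self):
        return len(self._items)


class SketchImmutableDict:
    def __init__(self, mapping):
        if not isinstance(mapping, dict):
            raise TypeError(f"{type(mapping)} object is not a dict")
        self._mapping = mapping

    def __getitem__(self, key):
        return _sketch_wrap(self._mapping[key])

    def get(self, key, default=None):
        return _sketch_wrap(self._mapping.get(key, default))


def _sketch_wrap(value):
    # Recursively wrap nested containers; scalars pass through untouched.
    if isinstance(value, dict):
        return SketchImmutableDict(value)
    if isinstance(value, list):
        return SketchImmutableList(value)
    return value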
async def get_all_actors(self, req) -> aiohttp.web.Response:
    return dashboard_utils.rest_response(
        success=True,
        message="All actors fetched.",
        actors=DataSource.actors)
async def get_node(self, req) -> aiohttp.web.Response:
    node_id = req.match_info.get("node_id")
    node_info = await DataOrganizer.get_node_info(node_id)
    return dashboard_utils.rest_response(
        success=True, message="Node details fetched.", detail=node_info)