async def get_all_nodes(self, req) -> aiohttp.web.Response:
    """Return node information in the shape chosen by the ``view`` query parameter.

    ``summary`` and ``details`` are fetched from ``DataOrganizer``; the host
    name list view (matched case-insensitively) is built from
    ``DataSource.nodes``. Any other value yields a failure response.
    """
    view = req.query.get("view")

    if view == "summary":
        node_summary = await DataOrganizer.get_all_node_summary()
        return dashboard_optional_utils.rest_response(
            success=True,
            message="Node summary fetched.",
            summary=node_summary,
        )

    if view == "details":
        node_details = await DataOrganizer.get_all_node_details()
        return dashboard_optional_utils.rest_response(
            success=True,
            message="All node details fetched",
            clients=node_details,
        )

    if view is not None and view.lower() == "hostNameList".lower():
        # Only nodes whose state is ALIVE contribute a hostname; the set
        # removes duplicates.
        hostnames = {
            info["nodeManagerHostname"]
            for info in DataSource.nodes.values()
            if info["state"] == "ALIVE"
        }
        return dashboard_optional_utils.rest_response(
            success=True,
            message="Node hostname list fetched.",
            host_name_list=list(hostnames),
        )

    return dashboard_optional_utils.rest_response(
        success=False, message=f"Unknown view {view}"
    )
async def kill_actor(self, req) -> aiohttp.web.Response:
    """Kill an actor by sending ``KillActor`` to the worker at the given address.

    Reads ``actorId``, ``ipAddress`` and ``port`` from the query string and
    answers "Bad Request" when any of them is missing.
    """
    query = req.query
    try:
        actor_id = query["actorId"]
        ip_address = query["ipAddress"]
        port = query["port"]
    except KeyError:
        return rest_response(success=False, message="Bad Request")

    try:
        grpc_options = ray_constants.GLOBAL_GRPC_OPTIONS
        worker_channel = ray._private.utils.init_grpc_channel(
            f"{ip_address}:{port}", options=grpc_options, asynchronous=True
        )
        worker_stub = core_worker_pb2_grpc.CoreWorkerServiceStub(worker_channel)
        kill_request = core_worker_pb2.KillActorRequest(
            intended_actor_id=ray._private.utils.hex_to_binary(actor_id)
        )
        await worker_stub.KillActor(kill_request)
    except aiogrpc.AioRpcError:
        # This always throws an exception because the worker
        # is killed and the channel is closed on the worker side
        # before this handler, however it deletes the actor correctly.
        pass

    return rest_response(success=True, message=f"Killed actor with id {actor_id}")
async def set_tune_experiment(self, req) -> aiohttp.web.Response:
    """Select the Tune experiment named by the ``experiment`` query parameter.

    Delegates to ``self.set_experiment``, which returns an
    ``(err, experiment)`` pair. A truthy ``err`` produces a failure
    response; otherwise the returned experiment mapping is expanded into
    the success response.
    """
    experiment = req.query["experiment"]
    err, experiment = self.set_experiment(experiment)
    if err:
        # rest_response is given a ``message`` at every other call site in
        # this file; supply one here as well and keep the original
        # ``error`` field so existing clients still find it.
        return rest_response(
            success=False, message="Failed to set experiment", error=err
        )
    return rest_response(
        success=True, message="Successfully set experiment", **experiment
    )
async def set_fetch_memory_info(self, req) -> aiohttp.web.Response:
    """Toggle ``self._collect_memory_info`` from the ``shouldFetch`` query value.

    Only the exact strings ``"true"`` and ``"false"`` are accepted; any other
    value leaves the flag untouched and produces a failure response.
    """
    should_fetch = req.query["shouldFetch"]
    flag_by_text = {"true": True, "false": False}

    if should_fetch not in flag_by_text:
        return dashboard_optional_utils.rest_response(
            success=False,
            message=f"Unknown argument to set_fetch {should_fetch}",
        )

    self._collect_memory_info = flag_by_text[should_fetch]
    return dashboard_optional_utils.rest_response(
        success=True,
        message=f"Successfully set fetching to {should_fetch}",
    )
async def snapshot(self, req):
    """Gather job, job-submission, actor, serve and session data into one response.

    The five lookups run concurrently through ``asyncio.gather``; the Ray
    version and commit are appended to the resulting snapshot.
    """
    gathered = await asyncio.gather(
        self.get_job_info(),
        self.get_job_submission_info(),
        self.get_actor_info(),
        self.get_serve_info(),
        self.get_session_name(),
    )
    # Keys are listed in the same order as the coroutines above.
    field_names = (
        "jobs",
        "job_submission",
        "actors",
        "deployments",
        "session_name",
    )
    snapshot = dict(zip(field_names, gathered))
    snapshot["ray_version"] = ray.__version__
    snapshot["ray_commit"] = ray.__commit__
    return dashboard_optional_utils.rest_response(
        success=True, message="hello", snapshot=snapshot
    )
async def get_cluster_status(self, req):
    """Return the autoscaler status entries stored in the internal KV store.

    The response carries:
        autoscaling_status (str): decoded legacy status message, or None.
        autoscaling_error (str): decoded autoscaler error message, or None.
        cluster_status: JSON-decoded formatted status, or None when empty.

    It is expected that the autoscaler writes these entries.
    """
    assert ray.experimental.internal_kv._internal_kv_initialized()

    def _decode_or_none(raw):
        # Missing/empty entries are reported as None rather than "".
        return raw.decode() if raw else None

    legacy_raw = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_STATUS_LEGACY)
    formatted_raw = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_STATUS)
    if formatted_raw:
        cluster_status = json.loads(formatted_raw.decode())
    else:
        cluster_status = {}
    error_raw = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_ERROR)

    return dashboard_optional_utils.rest_response(
        success=True,
        message="Got cluster status.",
        autoscaling_status=_decode_or_none(legacy_raw),
        autoscaling_error=_decode_or_none(error_raw),
        cluster_status=cluster_status or None,
    )
async def get_event(self, req) -> aiohttp.web.Response:
    """Return events for one job, or for every job when ``job_id`` is absent.

    Events are read from ``DataSource.events``; a job with no recorded
    events yields an empty list.
    """
    job_id = req.query.get("job_id")

    if job_id is not None:
        events_for_job = DataSource.events.get(job_id, {})
        return dashboard_optional_utils.rest_response(
            success=True,
            message="Job events fetched.",
            job_id=job_id,
            events=list(events_for_job.values()),
        )

    events_by_job = {
        jid: list(per_job.values()) for jid, per_job in DataSource.events.items()
    }
    return dashboard_optional_utils.rest_response(
        success=True, message="All events fetched.", events=events_by_job
    )
async def get_errors(self, req) -> aiohttp.web.Response:
    """Return the errors recorded for an IP, optionally narrowed to one pid.

    ``ip`` is a required query parameter. When ``pid`` is given, the result
    contains only that pid (with an empty list if nothing is recorded).
    """
    ip = req.query["ip"]
    pid = str(req.query.get("pid", ""))
    errors_for_ip = DataSource.ip_and_pid_to_errors.get(ip, {})

    selected = {str(pid): errors_for_ip.get(pid, [])} if pid else errors_for_ip
    return dashboard_optional_utils.rest_response(
        success=True, message="Fetched errors.", errors=selected
    )
async def get_availability(self, req) -> aiohttp.web.Response:
    """Report whether ``ExperimentAnalysis`` is present and whether trials are available."""
    has_analysis = ExperimentAnalysis is not None
    availability = dict(
        available=has_analysis,
        trials_available=self._trials_available,
    )
    return rest_response(
        success=True,
        message="Fetched tune availability",
        result=availability,
    )
def _reply(self, success: bool, error_message: str, result: dict, **kwargs):
    """Build the REST response sent back to the client.

    ``error_message`` is forwarded as the response ``message``, Google-style
    key conversion is switched off, and any extra keyword arguments are
    passed through unchanged.
    """
    response = rest_response(
        success=success,
        message=error_message,
        result=result,
        convert_google_style=False,
        **kwargs,
    )
    return response
async def dump(self, req) -> aiohttp.web.Response:
    """Dump ``DataSource`` contents, either entirely or for a single ``key``.

    Without a ``key`` query parameter every public (non-underscore)
    attribute of ``DataSource`` is returned as a plain dict. With a ``key``,
    only that attribute is returned; an unknown key yields a failure
    response.
    """
    key = req.query.get("key")
    if key is None:
        all_data = {
            name: dict(value)
            for name, value in DataSource.__dict__.items()
            if not name.startswith("_")
        }
        return dashboard_optional_utils.rest_response(
            success=True,
            message="Fetch all data from datacenter success.",
            **all_data,
        )

    value = DataSource.__dict__.get(key)
    if value is None:
        # dict(None) would raise TypeError and surface as an unhandled
        # error; report the unknown key to the caller instead.
        return dashboard_optional_utils.rest_response(
            success=False,
            message=f"Unknown key {key} in datacenter.",
        )
    return dashboard_optional_utils.rest_response(
        success=True,
        message=f"Fetch {key} from datacenter success.",
        **{key: dict(value)},
    )
async def get_actor_groups(self, req) -> aiohttp.web.Response:
    """Return actor groups built from all actors plus actor creation tasks."""
    all_actors = await DataOrganizer.get_all_actors()
    creation_tasks = await DataOrganizer.get_actor_creation_tasks()

    # actor_creation_tasks have some common interface with actors,
    # and they get processed and shown in tandem in the logical view
    # hence we merge them together before constructing actor groups.
    all_actors.update(creation_tasks)

    groups = actor_utils.construct_actor_groups(all_actors)
    return rest_response(
        success=True,
        message="Fetched actor groups.",
        actor_groups=groups,
    )
async def kill_actor_gcs(self, req) -> aiohttp.web.Response:
    """Ask the GCS to kill the actor named by the ``actor_id`` query parameter.

    ``force_kill`` and ``no_restart`` are treated as true only for the
    literal values ``"true"`` or ``"True"``. A missing ``actor_id`` yields
    a failure response without contacting the GCS.
    """

    def _flag(name):
        return req.query.get(name, False) in ("true", "True")

    actor_id = req.query.get("actor_id")
    force_kill = _flag("force_kill")
    no_restart = _flag("no_restart")

    if not actor_id:
        return dashboard_optional_utils.rest_response(
            success=False, message="actor_id is required."
        )

    kill_request = gcs_service_pb2.KillActorViaGcsRequest()
    kill_request.actor_id = bytes.fromhex(actor_id)
    kill_request.force_kill = force_kill
    kill_request.no_restart = no_restart
    await self._gcs_actor_info_stub.KillActorViaGcs(kill_request, timeout=5)

    if force_kill:
        message = f"Force killed actor with id {actor_id}"
    else:
        message = (
            f"Requested actor with id {actor_id} to terminate. "
            "It will exit once running tasks complete"
        )
    return dashboard_optional_utils.rest_response(success=True, message=message)
async def launch_profiling(self, req) -> aiohttp.web.Response:
    """Request profiling stats for a process from the reporter stub of its IP.

    ``ip``, ``pid`` and ``duration`` are required query parameters; ``pid``
    and ``duration`` are converted to ``int``. If the reply carries
    ``profiling_stats`` it is JSON-decoded, otherwise ``std_out`` is returned.
    """
    query = req.query
    ip = query["ip"]
    pid = int(query["pid"])
    duration = int(query["duration"])

    stub = self._stubs[ip]
    stats_request = reporter_pb2.GetProfilingStatsRequest(pid=pid, duration=duration)
    reply = await stub.GetProfilingStats(stats_request)

    if reply.profiling_stats:
        profiling_info = json.loads(reply.profiling_stats)
    else:
        profiling_info = reply.std_out

    return dashboard_optional_utils.rest_response(
        success=True,
        message="Profiling success.",
        profiling_info=profiling_info,
    )
async def get_ray_config(self, req) -> aiohttp.web.Response:
    """Return selected fields of ``~/ray_bootstrap_config.yaml``.

    The parsed payload is cached on ``self._ray_config`` so the file is read
    only until the first successful load. A missing file or a YAML parse
    failure yields a failure response and leaves the cache empty.
    """
    if self._ray_config is None:
        config_path = os.path.expanduser("~/ray_bootstrap_config.yaml")
        try:
            with open(config_path) as f:
                cfg = yaml.safe_load(f)
        except FileNotFoundError:
            # Previously this case returned the "invalid YAML" message and
            # the YAML error returned this one; each handler now reports
            # the failure it actually caught.
            return dashboard_optional_utils.rest_response(
                success=False,
                message=f"No config found at {config_path}.",
            )
        except yaml.YAMLError:
            return dashboard_optional_utils.rest_response(
                success=False,
                message="Invalid config, could not load YAML.",
            )

        payload = {
            "min_workers": cfg.get("min_workers", "unspecified"),
            "max_workers": cfg.get("max_workers", "unspecified"),
        }
        # Instance types are optional in the config; fall back to "unknown".
        for field, section in (
            ("head_type", "head_node"),
            ("worker_type", "worker_nodes"),
        ):
            try:
                payload[field] = cfg[section]["InstanceType"]
            except KeyError:
                payload[field] = "unknown"

        self._ray_config = payload

    return dashboard_optional_utils.rest_response(
        success=True,
        message="Fetched ray config.",
        **self._ray_config,
    )
async def get_memory_table(self, req) -> aiohttp.web.Response:
    """Return the memory table, optionally grouped and sorted per the query.

    ``group_by`` and ``sort_by`` are converted with ``GroupByType`` and
    ``SortingType`` and forwarded only when they are non-empty.
    """
    group_by = req.query.get("group_by")
    sort_by = req.query.get("sort_by")

    table_options = {}
    if group_by:
        table_options["group_by"] = GroupByType(group_by)
    if sort_by:
        table_options["sort_by"] = SortingType(sort_by)

    table = await DataOrganizer.get_memory_table(**table_options)
    return dashboard_optional_utils.rest_response(
        success=True,
        message="Fetched memory table",
        memory_table=table.as_dict(),
    )
async def test_aiohttp_cache(self, req) -> aiohttp.web.Response:
    """Echo the ``value`` query parameter together with the current timestamp."""
    echoed = req.query["value"]
    return dashboard_optional_utils.rest_response(
        success=True,
        message="OK",
        value=echoed,
        timestamp=time.time(),
    )
async def get_notified_agents(self, req) -> aiohttp.web.Response:
    """Return the contents of ``self._notified_agents`` as response fields."""
    notified = self._notified_agents
    return dashboard_optional_utils.rest_response(
        success=True, message="Fetch notified agents success.", **notified
    )
async def enable_tensorboard(self, req) -> aiohttp.web.Response:
    """Enable tensorboard and report whether a tensorboard directory is now set."""
    self._enable_tensorboard()
    if self._tensor_board_dir:
        return rest_response(success=True, message="Enabled tensorboard")
    return rest_response(success=False, message="Error enabling tensorboard")
def test_immutable_types():
    """Exercise ImmutableDict / ImmutableList: equality, conversion, JSON, copy, and rejected mutations."""
    ImmutableDict = dashboard_utils.ImmutableDict
    ImmutableList = dashboard_utils.ImmutableList

    # Source data: 1000 scalar entries, a list whose first item is a dict,
    # and a nested dict.
    source = {str(i): i for i in range(1000)}
    source["list"] = list(range(1000))
    source["list"][0] = {str(i): i for i in range(1000)}
    source["dict"] = {str(i): i for i in range(1000)}

    frozen = dashboard_utils.make_immutable(source)
    assert type(frozen) == ImmutableDict
    assert frozen == ImmutableDict(source)
    assert frozen == source
    assert ImmutableDict(frozen) == frozen
    assert ImmutableList(frozen["list"]) == frozen["list"]
    assert "512" in source
    assert "512" in source["list"][0]
    assert "512" in source["dict"]

    # Test type conversion
    as_plain_dict = dict(frozen)
    assert type(as_plain_dict["list"]) == ImmutableList
    as_plain_list = list(frozen["list"])
    assert type(as_plain_list[0]) == ImmutableDict

    # Test json dumps / loads
    encoded = json.dumps(frozen, cls=dashboard_optional_utils.CustomEncoder)
    decoded = json.loads(encoded)
    assert type(decoded) == dict
    assert type(decoded["list"]) == list
    assert frozen.mutable() == decoded
    dashboard_optional_utils.rest_response(True, "OK", data=frozen)
    dashboard_optional_utils.rest_response(True, "OK", **frozen)

    # Test copy
    shallow = copy.copy(frozen)
    assert shallow == frozen
    deep = copy.deepcopy(frozen)
    assert deep == frozen

    # Test get default immutable
    default_value = frozen.get("not exist list", [1, 2])
    assert type(default_value) == ImmutableList

    # Test recursive immutable
    assert type(frozen["list"]) == ImmutableList
    assert type(frozen["dict"]) == ImmutableDict
    assert type(frozen["list"][0]) == ImmutableDict

    # Test exception
    with pytest.raises(TypeError):
        ImmutableList((1, 2))

    with pytest.raises(TypeError):
        ImmutableDict([1, 2])

    with pytest.raises(TypeError):
        frozen["list"] = []

    with pytest.raises(AttributeError):
        frozen.update({1: 3})

    with pytest.raises(TypeError):
        frozen["list"][0] = 0

    with pytest.raises(AttributeError):
        frozen["list"].extend([1, 2])

    with pytest.raises(AttributeError):
        frozen["list"].insert(1, 2)

    array_in_dict = ImmutableDict({1: np.zeros([3, 5])})
    with pytest.raises(TypeError):
        print(array_in_dict[1])

    array_in_list = ImmutableList([1, np.zeros([3, 5])])
    with pytest.raises(TypeError):
        print(array_in_list[1])
async def get_node(self, req) -> aiohttp.web.Response:
    """Return details for the node identified by the ``node_id`` route parameter."""
    requested_id = req.match_info.get("node_id")
    details = await DataOrganizer.get_node_info(requested_id)
    return dashboard_optional_utils.rest_response(
        success=True,
        message="Node details fetched.",
        detail=details,
    )
async def tune_info(self, req) -> aiohttp.web.Response:
    """Return the value of ``self.get_stats()`` as the ``result`` of the response."""
    return rest_response(
        success=True,
        message="Fetched tune info",
        result=self.get_stats(),
    )
async def get_all_actors(self, req) -> aiohttp.web.Response:
    """Return everything currently held in ``DataSource.actors``."""
    current_actors = DataSource.actors
    return rest_response(
        success=True,
        message="All actors fetched.",
        actors=current_actors,
    )