def test_basic(ray_start_with_dashboard):
    """Start NodeStats against the fixture's Redis address and check its data.

    Polls ``get_node_stats()`` until it reports a non-empty "clients" list,
    then asserts that exactly one client with exactly one worker is reported.
    The test fails if no client data shows up within MAX_START_TIME_S.
    """
    redis_address = ray_start_with_dashboard["redis_address"]
    redis_password = REDIS_DEFAULT_PASSWORD
    node_stats = NodeStats(redis_address, redis_password)
    node_stats.start()

    # Wait for node stats to fire up.
    MAX_START_TIME_S = 30
    POLL_INTERVAL_S = 3
    t_start = datetime.now()
    while True:
        try:
            stats = node_stats.get_node_stats()
            client_stats = stats and stats.get("clients")
        except Exception:
            # A failing poll is treated like "no data yet" so that it goes
            # through the same deadline check and back-off as an empty
            # result instead of spinning without bound.
            client_stats = None
        if client_stats:
            break
        # total_seconds() is the full elapsed time; .seconds is only the
        # truncated seconds component of the timedelta.
        if (datetime.now() - t_start).total_seconds() > MAX_START_TIME_S:
            pytest.fail("Node stats took too long to start up")
        sleep(POLL_INTERVAL_S)

    assert len(client_stats) == 1
    client = client_stats[0]
    assert len(client["workers"]) == 1
def start_node_stats(redis_address):
    """Create and start a NodeStats instance and wait until it reports clients.

    Args:
        redis_address: Redis address passed to ``NodeStats`` together with
            ``REDIS_DEFAULT_PASSWORD``.

    Returns:
        The started ``NodeStats`` instance, once ``get_node_stats()`` has
        returned a non-empty "clients" entry.

    Fails the running test via ``pytest.fail`` when no client data shows up
    within MAX_START_TIME_S.
    """
    redis_password = REDIS_DEFAULT_PASSWORD
    node_stats = NodeStats(redis_address, redis_password)
    node_stats.start()

    # Wait for node stats to fire up.
    MAX_START_TIME_S = 30
    POLL_INTERVAL_S = 3
    t_start = datetime.now()
    while True:
        try:
            stats = node_stats.get_node_stats()
            client_stats = stats and stats.get("clients")
        except Exception:
            # Treat a failing poll like "no data yet": it then shares the
            # deadline check and sleep below rather than retrying in a
            # tight, unbounded loop.
            client_stats = None
        if client_stats:
            break
        # total_seconds() is the full elapsed time; .seconds is only the
        # truncated seconds component of the timedelta.
        if (datetime.now() - t_start).total_seconds() > MAX_START_TIME_S:
            pytest.fail("Node stats took too long to start up")
        sleep(POLL_INTERVAL_S)

    return node_stats
class DashboardController(BaseDashboardController):
    """Dashboard controller built on NodeStats and RayletStats.

    A TuneCollector is additionally created and used whenever the
    module-level ``Analysis`` name is not None.
    """

    def __init__(self, redis_address, redis_password):
        """Create the stats collectors; nothing is started here.

        Collection begins when ``start_collecting_metrics`` is called.
        """
        self.node_stats = NodeStats(redis_address, redis_password)
        self.raylet_stats = RayletStats(
            redis_address, redis_password=redis_password)
        if Analysis is not None:
            self.tune_stats = TuneCollector(2.0)
        self.memory_table = MemoryTable([])

    def _construct_raylet_info(self):
        """Build the raylet payload: per-node stats plus actor info.

        Every node entry gets an ``extraInfo`` string listing
        "name: used / total" for each resource. When the
        RAY_DASHBOARD_DEBUG environment variable is set, object store
        counters and a padded JSON dump of the actors are appended to it.

        Returns:
            dict with the keys "nodes" (the raylet stats, mutated in place
            with ``extraInfo``) and "actors".
        """
        D = self.raylet_stats.get_raylet_stats()
        workers_info_by_node = {
            data["nodeId"]: data.get("workersStats")
            for data in D.values()
        }
        # Flatten with a comprehension: sum(lists, []) copies the growing
        # accumulator for every node.
        infeasible_tasks = [
            task for data in D.values()
            for task in data.get("infeasibleTasks", [])
        ]
        # ready_tasks are used to render tasks that are not schedulable
        # due to resource limitations.
        # (e.g., Actor requires 2 GPUs but there is only 1 gpu available).
        ready_tasks = [
            task for data in D.values()
            for task in data.get("readyTasks", [])
        ]
        actors = self.node_stats.get_actors(
            workers_info_by_node, infeasible_tasks, ready_tasks)

        wanted_views = ("local_available_resource", "local_total_resource",
                        "object_manager_stats")
        # The actor dump is identical for every node, so it is rendered at
        # most once (and only when debug output is requested).
        actors_dump = None

        for data in D.values():
            # process view data; nodes without "viewData" or without one of
            # the wanted views simply contribute no entries.
            measures_dicts = {}
            for view_data in data.get("viewData", []):
                view_name = view_data["viewName"]
                if view_name in wanted_views:
                    measures_dicts[view_name] = measures_to_dict(
                        view_data["measures"])

            # process resources info
            prefix = "ResourceName:"
            total_resources = measures_dicts.get("local_total_resource", {})
            available_resources = measures_dicts.get(
                "local_available_resource", {})
            extra_info_strings = []
            for resource_key, total_resource in total_resources.items():
                available_resource = available_resources.get(
                    resource_key, .0)
                # Only strip the prefix when it is really there; slicing
                # unconditionally would chop the start of other names.
                if resource_key.startswith(prefix):
                    resource_name = resource_key[len(prefix):]
                else:
                    resource_name = resource_key
                extra_info_strings.append("{}: {} / {}".format(
                    resource_name,
                    format_resource(resource_name,
                                    total_resource - available_resource),
                    format_resource(resource_name, total_resource)))
            data["extraInfo"] = ", ".join(extra_info_strings) + "\n"

            if os.environ.get("RAY_DASHBOARD_DEBUG"):
                # process object store info
                prefix = "ValueType:"
                object_manager_stats = measures_dicts.get(
                    "object_manager_stats", {})
                extra_info_strings = [
                    "{}: {}".format(
                        stats_name,
                        object_manager_stats.get(prefix + stats_name, .0))
                    for stats_name in ("used_object_store_memory",
                                       "num_local_objects")
                ]
                data["extraInfo"] += ", ".join(extra_info_strings)

                # process actor info
                if actors_dump is None:
                    lines = json.dumps(
                        actors, indent=2, sort_keys=True).split("\n")
                    max_line_length = max(map(len, lines))
                    # Pad every line to the same width.
                    actors_dump = "\n".join(
                        line.ljust(max_line_length) for line in lines)
                data["extraInfo"] += "\n" + actors_dump

        return {"nodes": D, "actors": actors}

    def get_ray_config(self):
        """Read selected settings from ``~/ray_bootstrap_config.yaml``.

        Returns:
            ``(error, config)``: ``("No config", None)`` when the file
            cannot be loaded or does not contain a mapping, otherwise
            ``(None, dict)`` with the autoscaler settings and the head and
            worker instance types ("unknown" when not available).

        Raises:
            KeyError: if one of the required top-level settings is missing.
        """
        try:
            config_path = os.path.expanduser("~/ray_bootstrap_config.yaml")
            with open(config_path) as f:
                cfg = yaml.safe_load(f)
        except Exception:
            error = "No config"
            return error, None

        # yaml.safe_load yields None for an empty file; indexing it below
        # would raise TypeError instead of reporting the missing config.
        if not isinstance(cfg, dict):
            return "No config", None

        D = {
            "min_workers": cfg["min_workers"],
            "max_workers": cfg["max_workers"],
            "initial_workers": cfg["initial_workers"],
            "autoscaling_mode": cfg["autoscaling_mode"],
            "idle_timeout_minutes": cfg["idle_timeout_minutes"],
        }

        # TypeError covers a section that is present but not a mapping
        # (for example an empty "head_node:" entry, which loads as None).
        try:
            D["head_type"] = cfg["head_node"]["InstanceType"]
        except (KeyError, TypeError):
            D["head_type"] = "unknown"

        try:
            D["worker_type"] = cfg["worker_nodes"]["InstanceType"]
        except (KeyError, TypeError):
            D["worker_type"] = "unknown"

        return None, D

    def get_node_info(self):
        """Return the current result of ``node_stats.get_node_stats()``."""
        return self.node_stats.get_node_stats()

    def get_raylet_info(self):
        """Return the payload built by ``_construct_raylet_info``."""
        return self._construct_raylet_info()

    def get_memory_table_info(self,
                              group_by=GroupByType.NODE_ADDRESS,
                              sort_by=SortingType.OBJECT_SIZE) -> MemoryTable:
        """Enable memory info collection and build a fresh memory table.

        The table is stored on ``self.memory_table`` and returned.
        """
        # Collecting memory info adds big overhead to the cluster.
        # This must be collected only when it is necessary.
        self.raylet_stats.include_memory_info = True
        D = self.raylet_stats.get_raylet_stats()
        workers_info_by_node = {
            data["nodeId"]: data.get("workersStats")
            for data in D.values()
        }
        self.memory_table = construct_memory_table(
            workers_info_by_node, group_by=group_by, sort_by=sort_by)
        return self.memory_table

    def stop_collecting_memory_table_info(self):
        """Switch off ``include_memory_info`` on the raylet stats."""
        self.raylet_stats.include_memory_info = False

    def tune_info(self):
        """Return the Tune stats, or an empty dict when ``Analysis`` is None."""
        if Analysis is not None:
            D = self.tune_stats.get_stats()
        else:
            D = {}
        return D

    def tune_availability(self):
        """Return Tune availability; all flags False when ``Analysis`` is None."""
        if Analysis is not None:
            D = self.tune_stats.get_availability()
        else:
            D = {"available": False, "trials_available": False}
        return D

    def set_tune_experiment(self, experiment):
        """Forward ``experiment`` to the Tune collector.

        Returns ``("Tune Not Enabled", None)`` when ``Analysis`` is None.
        """
        if Analysis is not None:
            return self.tune_stats.set_experiment(experiment)
        return "Tune Not Enabled", None

    def enable_tune_tensorboard(self):
        """Enable tensorboard on the Tune collector, if there is one."""
        if Analysis is not None:
            self.tune_stats.enable_tensorboard()

    def launch_profiling(self, node_id, pid, duration):
        """Forward to ``raylet_stats.launch_profiling`` and return its id."""
        profiling_id = self.raylet_stats.launch_profiling(
            node_id=node_id, pid=pid, duration=duration)
        return profiling_id

    def check_profiling_status(self, profiling_id):
        """Forward to ``raylet_stats.check_profiling_status``."""
        return self.raylet_stats.check_profiling_status(profiling_id)

    def get_profiling_info(self, profiling_id):
        """Forward to ``raylet_stats.get_profiling_info``."""
        return self.raylet_stats.get_profiling_info(profiling_id)

    def kill_actor(self, actor_id, ip_address, port):
        """Forward to ``raylet_stats.kill_actor``."""
        return self.raylet_stats.kill_actor(actor_id, ip_address, port)

    def get_logs(self, hostname, pid):
        """Forward to ``node_stats.get_logs``."""
        return self.node_stats.get_logs(hostname, pid)

    def get_errors(self, hostname, pid):
        """Forward to ``node_stats.get_errors``."""
        return self.node_stats.get_errors(hostname, pid)

    def start_collecting_metrics(self):
        """Start the node, raylet and (when present) Tune collectors."""
        self.node_stats.start()
        self.raylet_stats.start()
        if Analysis is not None:
            self.tune_stats.start()
class DashboardController(BaseDashboardController):
    """Dashboard controller built on NodeStats and RayletStats.

    A TuneCollector is additionally created and used whenever the
    module-level ``Analysis`` name is not None.
    """

    def __init__(self, redis_address, redis_password):
        """Create the stats collectors; nothing is started here.

        Collection begins when ``start_collecting_metrics`` is called.
        """
        self.node_stats = NodeStats(redis_address, redis_password)
        self.raylet_stats = RayletStats(
            redis_address, redis_password=redis_password)
        if Analysis is not None:
            self.tune_stats = TuneCollector(2.0)
        self.memory_table = MemoryTable([])

    def _construct_raylet_info(self):
        """Build the raylet payload: nodes, actor groups and plasma stats.

        Returns:
            dict with the keys "nodes" (the raw raylet stats), "actorGroups"
            (result of ``node_stats.get_actors``) and "plasmaStats" (for
            each node address, the first ``doubleValue`` measure of every
            object store view listed in ``used_views``).
        """
        D = self.raylet_stats.get_raylet_stats()
        workers_info_by_node = {
            data["nodeId"]: data.get("workersStats")
            for data in D.values()
        }
        # Flatten with a comprehension: sum(lists, []) copies the growing
        # accumulator for every node.
        infeasible_tasks = [
            task for data in D.values()
            for task in data.get("infeasibleTasks", [])
        ]
        # ready_tasks are used to render tasks that are not schedulable
        # due to resource limitations.
        # (e.g., Actor requires 2 GPUs but there is only 1 gpu available).
        ready_tasks = [
            task for data in D.values()
            for task in data.get("readyTasks", [])
        ]
        actor_groups = self.node_stats.get_actors(
            workers_info_by_node, infeasible_tasks, ready_tasks)

        # Only these views are copied into the per-node plasma stats.
        used_views = ("object_store_num_local_objects",
                      "object_store_available_memory",
                      "object_store_used_memory")
        plasma_stats = {}
        for address, data in D.items():
            node_plasma_stats = {}
            for view in data.get("viewData", []):
                view_name = view.get("viewName")
                if view_name not in used_views:
                    continue
                # A view without (or with empty) measures reports 0.0, in
                # line with the other defensive lookups in this loop.
                view_measures = view.get("measures")
                if view_measures:
                    view_data = view_measures[0].get("doubleValue", .0)
                else:
                    view_data = .0
                node_plasma_stats[view_name] = view_data
            plasma_stats[address] = node_plasma_stats

        return {
            "nodes": D,
            "actorGroups": actor_groups,
            "plasmaStats": plasma_stats
        }

    def get_ray_config(self):
        """Read selected settings from ``~/ray_bootstrap_config.yaml``.

        Returns:
            ``(error, config)``: ``("No config", None)`` when the file
            cannot be loaded or does not contain a mapping, otherwise
            ``(None, dict)`` with the autoscaler settings and the head and
            worker instance types ("unknown" when not available).

        Raises:
            KeyError: if one of the required top-level settings is missing.
        """
        try:
            config_path = os.path.expanduser("~/ray_bootstrap_config.yaml")
            with open(config_path) as f:
                cfg = yaml.safe_load(f)
        except Exception:
            error = "No config"
            return error, None

        # yaml.safe_load yields None for an empty file; indexing it below
        # would raise TypeError instead of reporting the missing config.
        if not isinstance(cfg, dict):
            return "No config", None

        D = {
            "min_workers": cfg["min_workers"],
            "max_workers": cfg["max_workers"],
            "initial_workers": cfg["initial_workers"],
            "autoscaling_mode": cfg["autoscaling_mode"],
            "idle_timeout_minutes": cfg["idle_timeout_minutes"],
        }

        # TypeError covers a section that is present but not a mapping
        # (for example an empty "head_node:" entry, which loads as None).
        try:
            D["head_type"] = cfg["head_node"]["InstanceType"]
        except (KeyError, TypeError):
            D["head_type"] = "unknown"

        try:
            D["worker_type"] = cfg["worker_nodes"]["InstanceType"]
        except (KeyError, TypeError):
            D["worker_type"] = "unknown"

        return None, D

    def get_node_info(self):
        """Return the current result of ``node_stats.get_node_stats()``."""
        return self.node_stats.get_node_stats()

    def get_raylet_info(self):
        """Return the payload built by ``_construct_raylet_info``."""
        return self._construct_raylet_info()

    def get_memory_table_info(self,
                              group_by=GroupByType.NODE_ADDRESS,
                              sort_by=SortingType.OBJECT_SIZE) -> MemoryTable:
        """Enable memory info collection and build a fresh memory table.

        The table is stored on ``self.memory_table`` and returned.
        """
        # Collecting memory info adds big overhead to the cluster.
        # This must be collected only when it is necessary.
        self.raylet_stats.include_memory_info = True
        D = self.raylet_stats.get_raylet_stats()
        workers_info_by_node = {
            data["nodeId"]: data.get("workersStats")
            for data in D.values()
        }
        self.memory_table = construct_memory_table(
            workers_info_by_node, group_by=group_by, sort_by=sort_by)
        return self.memory_table

    def stop_collecting_memory_table_info(self):
        """Switch off ``include_memory_info`` on the raylet stats."""
        self.raylet_stats.include_memory_info = False

    def tune_info(self):
        """Return the Tune stats, or an empty dict when ``Analysis`` is None."""
        if Analysis is not None:
            D = self.tune_stats.get_stats()
        else:
            D = {}
        return D

    def tune_availability(self):
        """Return Tune availability; all flags False when ``Analysis`` is None."""
        if Analysis is not None:
            D = self.tune_stats.get_availability()
        else:
            D = {"available": False, "trials_available": False}
        return D

    def set_tune_experiment(self, experiment):
        """Forward ``experiment`` to the Tune collector.

        Returns ``("Tune Not Enabled", None)`` when ``Analysis`` is None.
        """
        if Analysis is not None:
            return self.tune_stats.set_experiment(experiment)
        return "Tune Not Enabled", None

    def enable_tune_tensorboard(self):
        """Enable tensorboard on the Tune collector, if there is one."""
        if Analysis is not None:
            self.tune_stats.enable_tensorboard()

    def launch_profiling(self, node_id, pid, duration):
        """Forward to ``raylet_stats.launch_profiling`` and return its id."""
        profiling_id = self.raylet_stats.launch_profiling(
            node_id=node_id, pid=pid, duration=duration)
        return profiling_id

    def check_profiling_status(self, profiling_id):
        """Forward to ``raylet_stats.check_profiling_status``."""
        return self.raylet_stats.check_profiling_status(profiling_id)

    def get_profiling_info(self, profiling_id):
        """Forward to ``raylet_stats.get_profiling_info``."""
        return self.raylet_stats.get_profiling_info(profiling_id)

    def kill_actor(self, actor_id, ip_address, port):
        """Forward to ``raylet_stats.kill_actor``."""
        return self.raylet_stats.kill_actor(actor_id, ip_address, port)

    def get_logs(self, hostname, pid):
        """Forward to ``node_stats.get_logs``."""
        return self.node_stats.get_logs(hostname, pid)

    def get_errors(self, hostname, pid):
        """Forward to ``node_stats.get_errors``."""
        return self.node_stats.get_errors(hostname, pid)

    def start_collecting_metrics(self):
        """Start the node, raylet and (when present) Tune collectors."""
        self.node_stats.start()
        self.raylet_stats.start()
        if Analysis is not None:
            self.tune_stats.start()