Example #1
def test_basic(ray_start_with_dashboard):
    """Dashboard test that starts a Ray cluster with a dashboard server running,
    then hits the dashboard API and asserts that it receives sensible data."""
    redis_address = ray_start_with_dashboard["redis_address"]
    redis_password = REDIS_DEFAULT_PASSWORD
    node_stats = NodeStats(redis_address, redis_password)
    node_stats.start()
    # Wait for node stats to fire up.
    MAX_START_TIME_S = 30
    t_start = datetime.now()
    while True:
        try:
            stats = node_stats.get_node_stats()
            client_stats = stats and stats.get("clients")
        except Exception:
            # Stats collection can raise while the collector is still starting.
            client_stats = None
        if client_stats:
            break
        if (datetime.now() - t_start).seconds > MAX_START_TIME_S:
            pytest.fail("Node stats took too long to start up")
        sleep(3)
    assert len(client_stats) == 1
    client = client_stats[0]
    assert len(client["workers"]) == 1
Example #2
def start_node_stats(redis_address):
    redis_password = REDIS_DEFAULT_PASSWORD
    node_stats = NodeStats(redis_address, redis_password)
    node_stats.start()
    # Wait for node stats to fire up.
    MAX_START_TIME_S = 30
    t_start = datetime.now()
    while True:
        try:
            stats = node_stats.get_node_stats()
            client_stats = stats and stats.get("clients")
        except Exception:
            # Stats collection can raise while the collector is still starting.
            client_stats = None
        if client_stats:
            break
        if (datetime.now() - t_start).seconds > MAX_START_TIME_S:
            pytest.fail("Node stats took too long to start up")
        sleep(3)
    return node_stats
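A minimal usage sketch for the helper in Example #2, assuming a Ray head node is already running at redis_address; print_worker_counts is a hypothetical wrapper added for illustration, and NodeStats / REDIS_DEFAULT_PASSWORD come from Ray's dashboard modules (their exact import paths vary across Ray versions):

def print_worker_counts(redis_address):
    # Block until the collector reports at least one client, then print
    # the worker count per node (Example #1 asserts len(client["workers"])).
    node_stats = start_node_stats(redis_address)
    stats = node_stats.get_node_stats()
    for client in stats.get("clients", []):
        print(len(client.get("workers", [])))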
Example #3
class DashboardController(BaseDashboardController):
    def __init__(self, redis_address, redis_password):
        self.node_stats = NodeStats(redis_address, redis_password)
        self.raylet_stats = RayletStats(redis_address,
                                        redis_password=redis_password)
        if Analysis is not None:
            self.tune_stats = TuneCollector(2.0)
        self.memory_table = MemoryTable([])

    def _construct_raylet_info(self):
        D = self.raylet_stats.get_raylet_stats()
        workers_info_by_node = {
            data["nodeId"]: data.get("workersStats")
            for data in D.values()
        }

        infeasible_tasks = sum(
            (data.get("infeasibleTasks", []) for data in D.values()), [])
        # ready_tasks are used to render tasks that are not schedulable
        # due to resource limitations
        # (e.g., an actor requires 2 GPUs but only 1 GPU is available).
        ready_tasks = sum((data.get("readyTasks", []) for data in D.values()),
                          [])
        actors = self.node_stats.get_actors(workers_info_by_node,
                                            infeasible_tasks, ready_tasks)

        for address, data in D.items():
            # process view data
            measures_dicts = {}
            for view_data in data["viewData"]:
                view_name = view_data["viewName"]
                if view_name in ("local_available_resource",
                                 "local_total_resource",
                                 "object_manager_stats"):
                    measures_dicts[view_name] = measures_to_dict(
                        view_data["measures"])
            # process resources info
            extra_info_strings = []
            prefix = "ResourceName:"
            for resource_name, total_resource in measures_dicts[
                    "local_total_resource"].items():
                available_resource = measures_dicts[
                    "local_available_resource"].get(resource_name, .0)
                resource_name = resource_name[len(prefix):]
                extra_info_strings.append("{}: {} / {}".format(
                    resource_name,
                    format_resource(resource_name,
                                    total_resource - available_resource),
                    format_resource(resource_name, total_resource)))
            data["extraInfo"] = ", ".join(extra_info_strings) + "\n"
            if os.environ.get("RAY_DASHBOARD_DEBUG"):
                # process object store info
                extra_info_strings = []
                prefix = "ValueType:"
                for stats_name in [
                        "used_object_store_memory", "num_local_objects"
                ]:
                    stats_value = measures_dicts["object_manager_stats"].get(
                        prefix + stats_name, .0)
                    extra_info_strings.append("{}: {}".format(
                        stats_name, stats_value))
                data["extraInfo"] += ", ".join(extra_info_strings)
                # process actor info
                actors_str = json.dumps(actors, indent=2, sort_keys=True)
                lines = actors_str.split("\n")
                max_line_length = max(map(len, lines))
                to_print = []
                for line in lines:
                    to_print.append(line + (max_line_length - len(line)) * " ")
                data["extraInfo"] += "\n" + "\n".join(to_print)
        return {"nodes": D, "actors": actors}

    def get_ray_config(self):
        try:
            config_path = os.path.expanduser("~/ray_bootstrap_config.yaml")
            with open(config_path) as f:
                cfg = yaml.safe_load(f)
        except Exception:
            error = "No config"
            return error, None

        D = {
            "min_workers": cfg["min_workers"],
            "max_workers": cfg["max_workers"],
            "initial_workers": cfg["initial_workers"],
            "autoscaling_mode": cfg["autoscaling_mode"],
            "idle_timeout_minutes": cfg["idle_timeout_minutes"],
        }

        try:
            D["head_type"] = cfg["head_node"]["InstanceType"]
        except KeyError:
            D["head_type"] = "unknown"

        try:
            D["worker_type"] = cfg["worker_nodes"]["InstanceType"]
        except KeyError:
            D["worker_type"] = "unknown"

        return None, D

    def get_node_info(self):
        return self.node_stats.get_node_stats()

    def get_raylet_info(self):
        return self._construct_raylet_info()

    def get_memory_table_info(self,
                              group_by=GroupByType.NODE_ADDRESS,
                              sort_by=SortingType.OBJECT_SIZE) -> MemoryTable:
        # Collecting memory info adds significant overhead to the cluster,
        # so it should only be collected when necessary.
        self.raylet_stats.include_memory_info = True
        D = self.raylet_stats.get_raylet_stats()
        workers_info_by_node = {
            data["nodeId"]: data.get("workersStats")
            for data in D.values()
        }
        self.memory_table = construct_memory_table(workers_info_by_node,
                                                   group_by=group_by,
                                                   sort_by=sort_by)
        return self.memory_table

    def stop_collecting_memory_table_info(self):
        self.raylet_stats.include_memory_info = False

    def tune_info(self):
        if Analysis is not None:
            D = self.tune_stats.get_stats()
        else:
            D = {}
        return D

    def tune_availability(self):
        if Analysis is not None:
            D = self.tune_stats.get_availability()
        else:
            D = {"available": False, "trials_available": False}
        return D

    def set_tune_experiment(self, experiment):
        if Analysis is not None:
            return self.tune_stats.set_experiment(experiment)
        return "Tune Not Enabled", None

    def enable_tune_tensorboard(self):
        if Analysis is not None:
            self.tune_stats.enable_tensorboard()

    def launch_profiling(self, node_id, pid, duration):
        profiling_id = self.raylet_stats.launch_profiling(node_id=node_id,
                                                          pid=pid,
                                                          duration=duration)
        return profiling_id

    def check_profiling_status(self, profiling_id):
        return self.raylet_stats.check_profiling_status(profiling_id)

    def get_profiling_info(self, profiling_id):
        return self.raylet_stats.get_profiling_info(profiling_id)

    def kill_actor(self, actor_id, ip_address, port):
        return self.raylet_stats.kill_actor(actor_id, ip_address, port)

    def get_logs(self, hostname, pid):
        return self.node_stats.get_logs(hostname, pid)

    def get_errors(self, hostname, pid):
        return self.node_stats.get_errors(hostname, pid)

    def start_collecting_metrics(self):
        self.node_stats.start()
        self.raylet_stats.start()
        if Analysis is not None:
            self.tune_stats.start()
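A minimal driver sketch for the DashboardController in Example #3; the constructor signature and method names come from the class above, while the address, REDIS_DEFAULT_PASSWORD, and the surrounding glue are illustrative assumptions:

# Hypothetical driver code; redis_address/redis_password must point at a live Ray head node.
controller = DashboardController("127.0.0.1:6379", REDIS_DEFAULT_PASSWORD)
controller.start_collecting_metrics()        # start the NodeStats/RayletStats collectors
error, config = controller.get_ray_config()  # returns (error, dict); error is None on success
if error is None:
    print(config["min_workers"], config["max_workers"])
raylet_info = controller.get_raylet_info()   # {"nodes": ..., "actors": ...}
# get_memory_table_info() turns on the expensive memory collection,
# so it is switched off again once the table has been fetched.
memory_table = controller.get_memory_table_info()
controller.stop_collecting_memory_table_info()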
Example #4
class DashboardController(BaseDashboardController):
    def __init__(self, redis_address, redis_password):
        self.node_stats = NodeStats(redis_address, redis_password)
        self.raylet_stats = RayletStats(redis_address,
                                        redis_password=redis_password)
        if Analysis is not None:
            self.tune_stats = TuneCollector(2.0)
        self.memory_table = MemoryTable([])

    def _construct_raylet_info(self):
        D = self.raylet_stats.get_raylet_stats()
        workers_info_by_node = {
            data["nodeId"]: data.get("workersStats")
            for data in D.values()
        }

        infeasible_tasks = sum(
            (data.get("infeasibleTasks", []) for data in D.values()), [])
        # ready_tasks are used to render tasks that are not schedulable
        # due to resource limitations
        # (e.g., an actor requires 2 GPUs but only 1 GPU is available).
        ready_tasks = sum((data.get("readyTasks", []) for data in D.values()),
                          [])
        actor_groups = self.node_stats.get_actors(workers_info_by_node,
                                                  infeasible_tasks,
                                                  ready_tasks)
        plasma_stats = {}
        # HTTP call to metrics port for each node in nodes/
        used_views = ("object_store_num_local_objects",
                      "object_store_available_memory",
                      "object_store_used_memory")
        for address, data in D.items():
            # process view data
            views = [
                view for view in data.get("viewData", [])
                if view.get("viewName") in used_views
            ]
            node_plasma_stats = {}
            for view in views:
                view_name = view["viewName"]
                view_measures = view["measures"]
                if view_measures:
                    view_data = view_measures[0].get("doubleValue", .0)
                else:
                    view_data = .0
                node_plasma_stats[view_name] = view_data
            plasma_stats[address] = node_plasma_stats

        return {
            "nodes": D,
            "actorGroups": actor_groups,
            "plasmaStats": plasma_stats
        }

    def get_ray_config(self):
        try:
            config_path = os.path.expanduser("~/ray_bootstrap_config.yaml")
            with open(config_path) as f:
                cfg = yaml.safe_load(f)
        except Exception:
            error = "No config"
            return error, None

        D = {
            "min_workers": cfg["min_workers"],
            "max_workers": cfg["max_workers"],
            "initial_workers": cfg["initial_workers"],
            "autoscaling_mode": cfg["autoscaling_mode"],
            "idle_timeout_minutes": cfg["idle_timeout_minutes"],
        }

        try:
            D["head_type"] = cfg["head_node"]["InstanceType"]
        except KeyError:
            D["head_type"] = "unknown"

        try:
            D["worker_type"] = cfg["worker_nodes"]["InstanceType"]
        except KeyError:
            D["worker_type"] = "unknown"

        return None, D

    def get_node_info(self):
        return self.node_stats.get_node_stats()

    def get_raylet_info(self):
        return self._construct_raylet_info()

    def get_memory_table_info(self,
                              group_by=GroupByType.NODE_ADDRESS,
                              sort_by=SortingType.OBJECT_SIZE) -> MemoryTable:
        # Collecting memory info adds significant overhead to the cluster,
        # so it should only be collected when necessary.
        self.raylet_stats.include_memory_info = True
        D = self.raylet_stats.get_raylet_stats()
        workers_info_by_node = {
            data["nodeId"]: data.get("workersStats")
            for data in D.values()
        }
        self.memory_table = construct_memory_table(workers_info_by_node,
                                                   group_by=group_by,
                                                   sort_by=sort_by)
        return self.memory_table

    def stop_collecting_memory_table_info(self):
        self.raylet_stats.include_memory_info = False

    def tune_info(self):
        if Analysis is not None:
            D = self.tune_stats.get_stats()
        else:
            D = {}
        return D

    def tune_availability(self):
        if Analysis is not None:
            D = self.tune_stats.get_availability()
        else:
            D = {"available": False, "trials_available": False}
        return D

    def set_tune_experiment(self, experiment):
        if Analysis is not None:
            return self.tune_stats.set_experiment(experiment)
        return "Tune Not Enabled", None

    def enable_tune_tensorboard(self):
        if Analysis is not None:
            self.tune_stats.enable_tensorboard()

    def launch_profiling(self, node_id, pid, duration):
        profiling_id = self.raylet_stats.launch_profiling(node_id=node_id,
                                                          pid=pid,
                                                          duration=duration)
        return profiling_id

    def check_profiling_status(self, profiling_id):
        return self.raylet_stats.check_profiling_status(profiling_id)

    def get_profiling_info(self, profiling_id):
        return self.raylet_stats.get_profiling_info(profiling_id)

    def kill_actor(self, actor_id, ip_address, port):
        return self.raylet_stats.kill_actor(actor_id, ip_address, port)

    def get_logs(self, hostname, pid):
        return self.node_stats.get_logs(hostname, pid)

    def get_errors(self, hostname, pid):
        return self.node_stats.get_errors(hostname, pid)

    def start_collecting_metrics(self):
        self.node_stats.start()
        self.raylet_stats.start()
        if Analysis is not None:
            self.tune_stats.start()
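Example #4 differs from Example #3 mainly in _construct_raylet_info: instead of building a human-readable "extraInfo" string per node, it groups actors into "actorGroups" and collects the per-node object-store view measures into "plasmaStats". A purely illustrative sketch of reading that structure, assuming a controller constructed as in the sketch after Example #3 (the key names come from the example; the consuming loop is an assumption):

# Illustrative only: walk the dictionary returned by get_raylet_info() in Example #4.
info = controller.get_raylet_info()
for address, plasma in info["plasmaStats"].items():
    # Each value is the first "doubleValue" measure of the corresponding
    # object-store view, defaulting to 0.0 when the view has no measures.
    used = plasma.get("object_store_used_memory", 0.0)
    available = plasma.get("object_store_available_memory", 0.0)
    num_objects = plasma.get("object_store_num_local_objects", 0.0)
    print(address, used, available, num_objects)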