Ejemplo n.º 1
0
def test_prometheus_file_based_service_discovery(ray_start_cluster):
    # Make sure Prometheus service discovery file is correctly written
    # when number of nodes are dynamically changed.
    NUM_NODES = 5
    cluster = ray_start_cluster
    nodes = [cluster.add_node() for _ in range(NUM_NODES)]
    cluster.wait_for_nodes()
    addr = ray.init(address=cluster.address)
    redis_address = addr["redis_address"]
    writer = PrometheusServiceDiscoveryWriter(
        redis_address, ray.ray_constants.REDIS_DEFAULT_PASSWORD, "/tmp/ray")

    def get_metrics_export_address_from_node(nodes):
        return [
            "{}:{}".format(node.node_ip_address, node.metrics_export_port)
            for node in nodes
        ]

    loaded_json_data = json.loads(writer.get_file_discovery_content())[0]
    assert (set(get_metrics_export_address_from_node(nodes)) == set(
        loaded_json_data["targets"]))

    # Let's update nodes.
    for _ in range(3):
        nodes.append(cluster.add_node())

    # Make sure service discovery file content is correctly updated.
    loaded_json_data = json.loads(writer.get_file_discovery_content())[0]
    assert (set(get_metrics_export_address_from_node(nodes)) == set(
        loaded_json_data["targets"]))
Ejemplo n.º 2
0
    def __init__(self,
                 host,
                 port,
                 redis_address,
                 temp_dir,
                 redis_password=None,
                 metrics_export_address=None):
        self.host = host
        self.port = port
        self.redis_client = ray.services.create_redis_client(
            redis_address, password=redis_password)
        self.temp_dir = temp_dir
        self.dashboard_id = str(uuid.uuid4())
        self.dashboard_controller = DashboardController(
            redis_address, redis_password)
        self.service_discovery = PrometheusServiceDiscoveryWriter(
            redis_address, redis_password, temp_dir)

        # Setting the environment variable RAY_DASHBOARD_DEV=1 disables some
        # security checks in the dashboard server to ease development while
        # using the React dev server. Specifically, when this option is set, we
        # allow cross-origin requests to be made.
        self.is_dev = os.environ.get("RAY_DASHBOARD_DEV") == "1"

        self.app = aiohttp.web.Application()
        route_handler = DashboardRouteHandler(self.dashboard_controller,
                                              is_dev=self.is_dev)

        # Setup Metrics exporting service if necessary.
        self.metrics_export_address = metrics_export_address
        if self.metrics_export_address:
            self._setup_metrics_export()

        # Setup Dashboard Routes
        build_dir = setup_static_dir(self.app)
        setup_speedscope_dir(self.app, build_dir)
        setup_dashboard_route(
            self.app,
            route_handler,
            index="/",
            favicon="/favicon.ico",
            ray_config="/api/ray_config",
            node_info="/api/node_info",
            raylet_info="/api/raylet_info",
            tune_info="/api/tune_info",
            tune_availability="/api/tune_availability",
            launch_profiling="/api/launch_profiling",
            check_profiling_status="/api/check_profiling_status",
            get_profiling_info="/api/get_profiling_info",
            kill_actor="/api/kill_actor",
            logs="/api/logs",
            errors="/api/errors",
            memory_table="/api/memory_table",
            stop_memory_table="/api/stop_memory_table")
        self.app.router.add_get("/{_}", route_handler.get_forbidden)
        self.app.router.add_post("/api/set_tune_experiment",
                                 route_handler.set_tune_experiment)
        self.app.router.add_post("/api/enable_tune_tensorboard",
                                 route_handler.enable_tune_tensorboard)
Ejemplo n.º 3
0
class Dashboard:
    """A dashboard process for monitoring Ray nodes.

    This dashboard is made up of a REST API which collates data published by
        Reporter processes on nodes into a json structure, and a webserver
        which polls said API for display purposes.

    Args:
        host(str): Host address of dashboard aiohttp server.
        port(str): Port number of dashboard aiohttp server.
        redis_address(str): GCS address of a Ray cluster
        temp_dir (str): The temporary directory used for log files and
            information for this Ray session.
        redis_passord(str): Redis password to access GCS
        metrics_export_address(str): The address users host their dashboard.
    """
    def __init__(self,
                 host,
                 port,
                 redis_address,
                 temp_dir,
                 redis_password=None,
                 metrics_export_address=None):
        self.host = host
        self.port = port
        self.redis_client = ray.services.create_redis_client(
            redis_address, password=redis_password)
        self.temp_dir = temp_dir
        self.dashboard_id = str(uuid.uuid4())
        self.dashboard_controller = DashboardController(
            redis_address, redis_password)
        self.service_discovery = PrometheusServiceDiscoveryWriter(
            redis_address, redis_password, temp_dir)

        # Setting the environment variable RAY_DASHBOARD_DEV=1 disables some
        # security checks in the dashboard server to ease development while
        # using the React dev server. Specifically, when this option is set, we
        # allow cross-origin requests to be made.
        self.is_dev = os.environ.get("RAY_DASHBOARD_DEV") == "1"

        self.app = aiohttp.web.Application()
        route_handler = DashboardRouteHandler(self.dashboard_controller,
                                              is_dev=self.is_dev)

        # Setup Metrics exporting service if necessary.
        self.metrics_export_address = metrics_export_address
        if self.metrics_export_address:
            self._setup_metrics_export()

        # Setup Dashboard Routes
        build_dir = setup_static_dir(self.app)
        setup_speedscope_dir(self.app, build_dir)
        setup_dashboard_route(
            self.app,
            route_handler,
            index="/",
            favicon="/favicon.ico",
            ray_config="/api/ray_config",
            node_info="/api/node_info",
            raylet_info="/api/raylet_info",
            tune_info="/api/tune_info",
            tune_availability="/api/tune_availability",
            launch_profiling="/api/launch_profiling",
            check_profiling_status="/api/check_profiling_status",
            get_profiling_info="/api/get_profiling_info",
            kill_actor="/api/kill_actor",
            logs="/api/logs",
            errors="/api/errors",
            memory_table="/api/memory_table",
            stop_memory_table="/api/stop_memory_table")
        self.app.router.add_get("/{_}", route_handler.get_forbidden)
        self.app.router.add_post("/api/set_tune_experiment",
                                 route_handler.set_tune_experiment)
        self.app.router.add_post("/api/enable_tune_tensorboard",
                                 route_handler.enable_tune_tensorboard)

    def _setup_metrics_export(self):
        exporter = Exporter(self.dashboard_id, self.metrics_export_address,
                            self.dashboard_controller)
        self.metrics_export_client = MetricsExportClient(
            self.metrics_export_address, self.dashboard_controller,
            self.dashboard_id, exporter)

        # Setup endpoints
        metrics_export_handler = MetricsExportHandler(
            self.dashboard_controller,
            self.metrics_export_client,
            self.dashboard_id,
            is_dev=self.is_dev)
        setup_metrics_export_routes(self.app, metrics_export_handler)

    def _start_exporting_metrics(self):
        result, error = self.metrics_export_client.start_exporting_metrics()
        if not result and error:
            url = ray.services.get_webui_url_from_redis(self.redis_client)
            error += (" Please reenable the metrics export by going to "
                      "the url: {}/api/metrics/enable".format(url))
            ray.utils.push_error_to_driver_through_redis(
                self.redis_client, "metrics export failed", error)

    def log_dashboard_url(self):
        url = ray.services.get_webui_url_from_redis(self.redis_client)
        if url is None:
            raise ValueError("WebUI URL is not present in GCS.")
        with open(os.path.join(self.temp_dir, "dashboard_url"), "w") as f:
            f.write(url)
        logger.info("Dashboard running on {}".format(url))

    def run(self):
        self.log_dashboard_url()
        self.dashboard_controller.start_collecting_metrics()
        self.service_discovery.start()
        if self.metrics_export_address:
            self._start_exporting_metrics()
        aiohttp.web.run_app(self.app, host=self.host, port=self.port)
Ejemplo n.º 4
0
    try:
        setup_component_logger(
            logging_level=args.logging_level,
            logging_format=args.logging_format,
            log_dir=args.log_dir,
            filename=args.logging_filename,
            max_bytes=args.logging_rotate_bytes,
            backup_count=args.logging_rotate_backup_count)

        dashboard = Dashboard(
            args.host,
            args.port,
            args.redis_address,
            redis_password=args.redis_password,
            log_dir=args.log_dir)
        service_discovery = PrometheusServiceDiscoveryWriter(
            args.redis_address, args.redis_password, args.temp_dir)
        service_discovery.start()
        loop = asyncio.get_event_loop()
        loop.run_until_complete(dashboard.run())
    except Exception as e:
        # Something went wrong, so push an error to all drivers.
        redis_client = ray._private.services.create_redis_client(
            args.redis_address, password=args.redis_password)
        traceback_str = ray.utils.format_error_message(traceback.format_exc())
        message = ("The dashboard on node {} failed with the following "
                   "error:\n{}".format(platform.uname()[1], traceback_str))
        ray.utils.push_error_to_driver_through_redis(
            redis_client, ray_constants.DASHBOARD_DIED_ERROR, message)
        if isinstance(e, OSError) and e.errno == errno.ENOENT:
            logger.warning(message)
        else: