class Dashboard:
    """Process that serves the monitoring dashboard for Ray nodes.

    The dashboard consists of a REST API, which collates the data published
    by the Reporter processes running on the nodes into a json structure,
    and a webserver which polls that API for display purposes.

    Args:
        host(str): Host address of dashboard aiohttp server.
        port(str): Port number of dashboard aiohttp server.
        redis_address(str): GCS address of a Ray cluster
        temp_dir (str): The temporary directory used for log files and
            information for this Ray session.
        redis_password(str): Redis password to access GCS
        metrics_export_address(str): Address handed to the metrics exporter
            and export client. When it is falsy, the metrics export service
            is neither set up nor started.
    """

    def __init__(self,
                 host,
                 port,
                 redis_address,
                 temp_dir,
                 redis_password=None,
                 metrics_export_address=None):
        # Plain configuration values first.
        self.host = host
        self.port = port
        self.temp_dir = temp_dir
        self.metrics_export_address = metrics_export_address

        # Cluster-facing collaborators.
        self.redis_client = ray.services.create_redis_client(
            redis_address, password=redis_password)
        self.dashboard_id = str(uuid.uuid4())
        self.dashboard_controller = DashboardController(
            redis_address, redis_password)
        self.service_discovery = PrometheusServiceDiscoveryWriter(
            redis_address, redis_password, temp_dir)

        # RAY_DASHBOARD_DEV=1 relaxes some of the dashboard server's security
        # checks so that development against the React dev server is easier.
        # Specifically, cross-origin requests are allowed when it is set.
        dev_flag = os.environ.get("RAY_DASHBOARD_DEV")
        self.is_dev = dev_flag == "1"

        self.app = aiohttp.web.Application()
        handler = DashboardRouteHandler(
            self.dashboard_controller, is_dev=self.is_dev)

        # The metrics export endpoints (if any) are registered before the
        # regular dashboard routes.
        if self.metrics_export_address:
            self._setup_metrics_export()

        self._register_dashboard_routes(handler)

    def _register_dashboard_routes(self, handler):
        """Attach the static directories and dashboard endpoints to the app.

        Args:
            handler: The DashboardRouteHandler serving the endpoints.
        """
        build_dir = setup_static_dir(self.app)
        setup_speedscope_dir(self.app, build_dir)

        # Keyword name -> URL path, passed through to setup_dashboard_route.
        endpoints = {
            "index": "/",
            "favicon": "/favicon.ico",
            "ray_config": "/api/ray_config",
            "node_info": "/api/node_info",
            "raylet_info": "/api/raylet_info",
            "tune_info": "/api/tune_info",
            "tune_availability": "/api/tune_availability",
            "launch_profiling": "/api/launch_profiling",
            "check_profiling_status": "/api/check_profiling_status",
            "get_profiling_info": "/api/get_profiling_info",
            "kill_actor": "/api/kill_actor",
            "logs": "/api/logs",
            "errors": "/api/errors",
            "memory_table": "/api/memory_table",
            "stop_memory_table": "/api/stop_memory_table",
        }
        setup_dashboard_route(self.app, handler, **endpoints)

        router = self.app.router
        # Registered after the named routes above.
        router.add_get("/{_}", handler.get_forbidden)
        router.add_post("/api/set_tune_experiment",
                        handler.set_tune_experiment)
        router.add_post("/api/enable_tune_tensorboard",
                        handler.enable_tune_tensorboard)

    def _setup_metrics_export(self):
        """Create the metrics export client and register its endpoints."""
        metrics_exporter = Exporter(self.dashboard_id,
                                    self.metrics_export_address,
                                    self.dashboard_controller)
        self.metrics_export_client = MetricsExportClient(
            self.metrics_export_address, self.dashboard_controller,
            self.dashboard_id, metrics_exporter)

        # Setup endpoints
        export_handler = MetricsExportHandler(
            self.dashboard_controller,
            self.metrics_export_client,
            self.dashboard_id,
            is_dev=self.is_dev)
        setup_metrics_export_routes(self.app, export_handler)

    def _start_exporting_metrics(self):
        """Start the metrics export and push an error to the driver if the
        client reports a failure together with an error message."""
        result, error = self.metrics_export_client.start_exporting_metrics()
        if result or not error:
            # Either the export started, or there is no error to report.
            return

        url = ray.services.get_webui_url_from_redis(self.redis_client)
        hint = (" Please reenable the metrics export by going to "
                "the url: {}/api/metrics/enable".format(url))
        error += hint
        ray.utils.push_error_to_driver_through_redis(
            self.redis_client, "metrics export failed", error)

    def log_dashboard_url(self):
        """Write the dashboard URL to `<temp_dir>/dashboard_url` and log it.

        Raises:
            ValueError: If no WebUI URL can be read through the redis client.
        """
        url = ray.services.get_webui_url_from_redis(self.redis_client)
        if url is None:
            raise ValueError("WebUI URL is not present in GCS.")

        url_file = os.path.join(self.temp_dir, "dashboard_url")
        with open(url_file, "w") as f:
            f.write(url)
        logger.info("Dashboard running on {}".format(url))

    def run(self):
        """Record the URL, start the background services and serve the app."""
        self.log_dashboard_url()
        self.dashboard_controller.start_collecting_metrics()
        self.service_discovery.start()
        if self.metrics_export_address:
            self._start_exporting_metrics()
        aiohttp.web.run_app(self.app, host=self.host, port=self.port)
logging_level=args.logging_level, logging_format=args.logging_format, log_dir=args.log_dir, filename=args.logging_filename, max_bytes=args.logging_rotate_bytes, backup_count=args.logging_rotate_backup_count) dashboard = Dashboard( args.host, args.port, args.redis_address, redis_password=args.redis_password, log_dir=args.log_dir) service_discovery = PrometheusServiceDiscoveryWriter( args.redis_address, args.redis_password, args.temp_dir) service_discovery.start() loop = asyncio.get_event_loop() loop.run_until_complete(dashboard.run()) except Exception as e: # Something went wrong, so push an error to all drivers. redis_client = ray._private.services.create_redis_client( args.redis_address, password=args.redis_password) traceback_str = ray.utils.format_error_message(traceback.format_exc()) message = ("The dashboard on node {} failed with the following " "error:\n{}".format(platform.uname()[1], traceback_str)) ray.utils.push_error_to_driver_through_redis( redis_client, ray_constants.DASHBOARD_DIED_ERROR, message) if isinstance(e, OSError) and e.errno == errno.ENOENT: logger.warning(message) else: logger.exception(message)