def __init__(self, group_name: str):
    self._group_name = group_name
    self._job_id = ray.get_runtime_context().job_id
    gcs_address = ray._private.worker._global_node.gcs_address
    self._gcs_client = GcsClient(address=gcs_address, nums_reconnect_retry=10)
    internal_kv._initialize_internal_kv(self._gcs_client)

def __init__(self, dashboard_agent):
    super().__init__(dashboard_agent)
    self._runtime_env_dir = dashboard_agent.runtime_env_dir
    self._logging_params = dashboard_agent.logging_params
    self._per_job_logger_cache = dict()
    # Cache the results of creating envs to avoid repeatedly calling into
    # conda and other slow calls.
    self._env_cache: Dict[str, CreatedEnvResult] = dict()
    # Maps a serialized runtime env to a lock that is used
    # to prevent multiple concurrent installs of the same env.
    self._env_locks: Dict[str, asyncio.Lock] = dict()
    # Keeps track of the URIs contained within each env so we can
    # invalidate the env cache when a URI is deleted.
    # This is a temporary mechanism until we have per-URI caching.
    self._uris_to_envs: Dict[str, Set[str]] = defaultdict(set)
    # Initialize internal KV to be used by the working_dir setup code.
    _initialize_internal_kv(self._dashboard_agent.gcs_client)
    assert _internal_kv_initialized()
    self._pip_manager = PipManager(self._runtime_env_dir)
    self._conda_manager = CondaManager(self._runtime_env_dir)
    self._py_modules_manager = PyModulesManager(self._runtime_env_dir)
    self._working_dir_manager = WorkingDirManager(self._runtime_env_dir)
    self._container_manager = ContainerManager(dashboard_agent.temp_dir)
    self._working_dir_uri_cache = URICache(
        self._working_dir_manager.delete_uri, WORKING_DIR_CACHE_SIZE_BYTES)
    self._py_modules_uri_cache = URICache(
        self._py_modules_manager.delete_uri, PY_MODULES_CACHE_SIZE_BYTES)
    self._conda_uri_cache = URICache(self._conda_manager.delete_uri,
                                     CONDA_CACHE_SIZE_BYTES)
    self._pip_uri_cache = URICache(self._pip_manager.delete_uri,
                                   PIP_CACHE_SIZE_BYTES)
    self._logger = default_logger

def __init__(self, dashboard_head):
    super().__init__(dashboard_head)
    self._gcs_job_info_stub = None
    self._gcs_actor_info_stub = None
    self._dashboard_head = dashboard_head
    _initialize_internal_kv(dashboard_head.gcs_client)
    assert _internal_kv_initialized()
    self._job_status_client = JobStatusStorageClient()

def __init__(self, dashboard_head):
    super().__init__(dashboard_head)
    self._gcs_job_info_stub = None
    self._gcs_actor_info_stub = None
    self._dashboard_head = dashboard_head
    # Initialize internal KV to be used by the working_dir setup code.
    _initialize_internal_kv(dashboard_head.gcs_client)
    assert _internal_kv_initialized()

def test_upload_fails(self):
    """Check that function throws useful error when upload fails."""
    uri = "gcs://test.zip"
    bytes = b"test"
    assert len(bytes) < GCS_STORAGE_MAX_SIZE

    _internal_kv_reset()
    _initialize_internal_kv(self.DisconnectedClient())
    with pytest.raises(
            RuntimeError, match="Failed to store package in the GCS"):
        _store_package_in_gcs(uri, bytes)

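# The DisconnectedClient referenced above is elided from this snippet. A
# hedged sketch of what such a test double could look like, assuming the
# internal KV layer forwards writes to its client as
# internal_kv_put(key, value, overwrite, namespace) (the call shape the
# monitor snippets below use); the class body is an assumption, not the
# source's definition:
class DisconnectedClient:
    """Mock GCS client whose KV writes always fail, so that
    _store_package_in_gcs surfaces its "Failed to store package" error."""

    def internal_kv_put(self, key, value, overwrite, namespace):
        raise RuntimeError("GCS connection lost")
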
def get_ray_status_output(address):
    redis_client = ray._private.services.create_redis_client(address, "")
    gcs_client = gcs_utils.GcsClient.create_from_redis(redis_client)
    internal_kv._initialize_internal_kv(gcs_client)
    status = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_STATUS)
    error = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_ERROR)
    return {
        "demand": debug_status(
            status, error).split("Demands:")[1].strip("\n").strip(" "),
        "usage": debug_status(status, error).split("Demands:")[0].split(
            "Usage:")[1].strip("\n").strip(" ")
    }

def get_ray_status_output(address):
    gcs_client = gcs_utils.GcsClient(address=address)
    internal_kv._initialize_internal_kv(gcs_client)
    status = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_STATUS)
    error = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_ERROR)
    return {
        "demand": debug_status(status, error)
        .split("Demands:")[1]
        .strip("\n")
        .strip(" "),
        "usage": debug_status(status, error)
        .split("Demands:")[0]
        .split("Usage:")[1]
        .strip("\n")
        .strip(" "),
    }

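# Illustrative call of the helper above. The address is a placeholder for a
# real GCS address; the two fields are the "Usage:" and "Demands:" sections
# that debug_status renders:
#
#   out = get_ray_status_output("127.0.0.1:6379")
#   print(out["usage"])
#   print(out["demand"])
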
def __init__(self, dashboard_agent):
    super().__init__(dashboard_agent)
    self._runtime_env_dir = dashboard_agent.runtime_env_dir
    self._logging_params = dashboard_agent.logging_params
    self._per_job_logger_cache = dict()
    # Cache the results of creating envs to avoid repeatedly calling into
    # conda and other slow calls.
    self._env_cache: Dict[str, CreatedEnvResult] = dict()
    # Maps a serialized runtime env to a lock that is used
    # to prevent multiple concurrent installs of the same env.
    self._env_locks: Dict[str, asyncio.Lock] = dict()
    # Initialize internal KV to be used by the working_dir setup code.
    _initialize_internal_kv(self._dashboard_agent.gcs_client)
    assert _internal_kv_initialized()

def __init__(self, dashboard_agent):
    super().__init__(dashboard_agent)
    self._runtime_env_dir = dashboard_agent.runtime_env_dir
    self._logging_params = dashboard_agent.logging_params
    self._per_job_logger_cache = dict()
    # Cache the results of creating envs to avoid repeatedly calling into
    # conda and other slow calls.
    self._env_cache: Dict[str, CreatedEnvResult] = dict()
    # Maps a serialized runtime env to a lock that is used
    # to prevent multiple concurrent installs of the same env.
    self._env_locks: Dict[str, asyncio.Lock] = dict()
    _initialize_internal_kv(self._dashboard_agent.gcs_client)
    assert _internal_kv_initialized()
    self._pip_manager = PipManager(self._runtime_env_dir)
    self._conda_manager = CondaManager(self._runtime_env_dir)
    self._py_modules_manager = PyModulesManager(self._runtime_env_dir)
    self._java_jars_manager = JavaJarsManager(self._runtime_env_dir)
    self._working_dir_manager = WorkingDirManager(self._runtime_env_dir)
    self._container_manager = ContainerManager(dashboard_agent.temp_dir)
    self._reference_table = ReferenceTable(
        self.uris_parser,
        self.unused_uris_processor,
        self.unused_runtime_env_processor,
    )
    self._working_dir_uri_cache = URICache(
        self._working_dir_manager.delete_uri, WORKING_DIR_CACHE_SIZE_BYTES)
    self._py_modules_uri_cache = URICache(
        self._py_modules_manager.delete_uri, PY_MODULES_CACHE_SIZE_BYTES)
    self._java_jars_uri_cache = URICache(
        self._java_jars_manager.delete_uri, JAVA_JARS_CACHE_SIZE_BYTES)
    self._conda_uri_cache = URICache(self._conda_manager.delete_uri,
                                     CONDA_CACHE_SIZE_BYTES)
    self._pip_uri_cache = URICache(self._pip_manager.delete_uri,
                                   PIP_CACHE_SIZE_BYTES)
    self._logger = default_logger

def __init__(self,
             redis_address,
             autoscaling_config,
             redis_password=None,
             prefix_cluster_info=False,
             monitor_ip=None,
             stop_event: Optional[Event] = None):
    # Initialize the Redis clients.
    ray.state.state._initialize_global_state(
        redis_address, redis_password=redis_password)
    self.redis = ray._private.services.create_redis_client(
        redis_address, password=redis_password)
    if monitor_ip:
        self.redis.set("AutoscalerMetricsAddress",
                       f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}")
    (ip, port) = redis_address.split(":")
    # Initialize the gcs stub for getting all node resource usage.
    gcs_address = self.redis.get("GcsServerAddress").decode("utf-8")
    options = (("grpc.enable_http_proxy", 0), )
    gcs_channel = ray._private.utils.init_grpc_channel(gcs_address, options)
    # TODO: Use gcs client for this
    self.gcs_node_resources_stub = \
        gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel)
    self.gcs_node_info_stub = \
        gcs_service_pb2_grpc.NodeInfoGcsServiceStub(gcs_channel)
    # Set the redis client and mode so _internal_kv works for autoscaler.
    worker = ray.worker.global_worker
    worker.redis_client = self.redis
    gcs_client = GcsClient.create_from_redis(self.redis)
    _initialize_internal_kv(gcs_client)
    worker.mode = 0
    head_node_ip = redis_address.split(":")[0]
    self.redis_address = redis_address
    self.redis_password = redis_password
    if os.environ.get("RAY_FAKE_CLUSTER"):
        self.load_metrics = LoadMetrics(local_ip=FAKE_HEAD_NODE_ID)
    else:
        self.load_metrics = LoadMetrics(local_ip=head_node_ip)
    self.last_avail_resources = None
    self.event_summarizer = EventSummarizer()
    self.prefix_cluster_info = prefix_cluster_info
    # Can be used to signal graceful exit from monitor loop.
    self.stop_event = stop_event  # type: Optional[Event]
    self.autoscaling_config = autoscaling_config
    self.autoscaler = None
    # If set, we are in a manually created cluster (non-autoscaling) and
    # simply mirroring what the GCS tells us the cluster node types are.
    self.readonly_config = None
    self.prom_metrics = AutoscalerPrometheusMetrics()
    if monitor_ip and prometheus_client:
        # If monitor_ip wasn't passed in, then don't attempt to start the
        # metric server to keep behavior identical to before metrics were
        # introduced
        try:
            logger.info(
                "Starting autoscaler metrics server on port {}".format(
                    AUTOSCALER_METRIC_PORT))
            prometheus_client.start_http_server(
                port=AUTOSCALER_METRIC_PORT,
                addr="127.0.0.1" if head_node_ip == "127.0.0.1" else "",
                registry=self.prom_metrics.registry)
        except Exception:
            logger.exception(
                "An exception occurred while starting the metrics server.")
    elif not prometheus_client:
        logger.warning("`prometheus_client` not found, so metrics will "
                       "not be exported.")

    logger.info("Monitor: Started")

async def run(self):
    gcs_address = self.gcs_address

    # Dashboard will handle connection failure automatically
    self.gcs_client = GcsClient(address=gcs_address, nums_reconnect_retry=0)
    internal_kv._initialize_internal_kv(self.gcs_client)
    self.aiogrpc_gcs_channel = ray._private.utils.init_grpc_channel(
        gcs_address, GRPC_CHANNEL_OPTIONS, asynchronous=True)

    self.gcs_error_subscriber = GcsAioErrorSubscriber(address=gcs_address)
    self.gcs_log_subscriber = GcsAioLogSubscriber(address=gcs_address)
    await self.gcs_error_subscriber.subscribe()
    await self.gcs_log_subscriber.subscribe()

    self.health_check_thread = GCSHealthCheckThread(gcs_address)
    self.health_check_thread.start()

    # Start a grpc asyncio server.
    await self.server.start()

    async def _async_notify():
        """Notify signals from queue."""
        while True:
            co = await dashboard_utils.NotifyQueue.get()
            try:
                await co
            except Exception:
                logger.exception(f"Error notifying coroutine {co}")

    modules = self._load_modules()

    http_host, http_port = self.http_host, self.http_port
    if not self.minimal:
        self.http_server = await self._configure_http_server(modules)
        http_host, http_port = self.http_server.get_address()

    internal_kv._internal_kv_put(
        ray_constants.DASHBOARD_ADDRESS,
        f"{http_host}:{http_port}",
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
    )

    # TODO: Use async version if performance is an issue
    # Write the dashboard head port to gcs kv.
    internal_kv._internal_kv_put(
        dashboard_consts.DASHBOARD_RPC_ADDRESS,
        f"{self.ip}:{self.grpc_port}",
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
    )

    # Freeze signal after all modules loaded.
    dashboard_utils.SignalManager.freeze()
    concurrent_tasks = [
        self._gcs_check_alive(),
        _async_notify(),
        DataOrganizer.purge(),
        DataOrganizer.organize(),
    ]
    await asyncio.gather(*concurrent_tasks,
                         *(m.run(self.server) for m in modules))
    await self.server.wait_for_termination()

    if self.http_server:
        await self.http_server.cleanup()

def __init__(self):
    gcs_address = ray.worker._global_node.gcs_address
    self._gcs_client = GcsClient(address=gcs_address, nums_reconnect_retry=0)
    internal_kv._initialize_internal_kv(self._gcs_client)

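# Once _initialize_internal_kv has been handed a connected GcsClient, the
# module-level helpers route through it from anywhere in the process. A
# minimal sketch (the key and value are illustrative, not from the source):
from ray.experimental import internal_kv

internal_kv._internal_kv_put(b"example_key", b"example_value")
assert internal_kv._internal_kv_get(b"example_key") == b"example_value"
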
def __init__(
    self,
    address: str,
    autoscaling_config: Union[str, Callable[[], Dict[str, Any]]],
    redis_password: Optional[str] = None,
    prefix_cluster_info: bool = False,
    monitor_ip: Optional[str] = None,
    stop_event: Optional[Event] = None,
    retry_on_failure: bool = True,
):
    gcs_address = address
    options = (("grpc.enable_http_proxy", 0), )
    gcs_channel = ray._private.utils.init_grpc_channel(gcs_address, options)
    # TODO: Use gcs client for this
    self.gcs_node_resources_stub = (
        gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel))
    self.gcs_node_info_stub = gcs_service_pb2_grpc.NodeInfoGcsServiceStub(
        gcs_channel)
    if redis_password is not None:
        logger.warning("redis_password has been deprecated.")
    # Set the redis client and mode so _internal_kv works for autoscaler.
    worker = ray.worker.global_worker
    gcs_client = GcsClient(address=gcs_address)

    if monitor_ip:
        monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
        gcs_client.internal_kv_put(b"AutoscalerMetricsAddress",
                                   monitor_addr.encode(), True, None)
    _initialize_internal_kv(gcs_client)
    worker.mode = 0
    head_node_ip = gcs_address.split(":")[0]

    self.load_metrics = LoadMetrics()
    self.last_avail_resources = None
    self.event_summarizer = EventSummarizer()
    self.prefix_cluster_info = prefix_cluster_info
    # Can be used to signal graceful exit from monitor loop.
    self.stop_event = stop_event  # type: Optional[Event]
    self.retry_on_failure = retry_on_failure
    self.autoscaling_config = autoscaling_config
    self.autoscaler = None
    # If set, we are in a manually created cluster (non-autoscaling) and
    # simply mirroring what the GCS tells us the cluster node types are.
    self.readonly_config = None

    self.prom_metrics = AutoscalerPrometheusMetrics()
    if monitor_ip and prometheus_client:
        # If monitor_ip wasn't passed in, then don't attempt to start the
        # metric server to keep behavior identical to before metrics were
        # introduced
        try:
            logger.info(
                "Starting autoscaler metrics server on port {}".format(
                    AUTOSCALER_METRIC_PORT))
            prometheus_client.start_http_server(
                port=AUTOSCALER_METRIC_PORT,
                addr="127.0.0.1" if head_node_ip == "127.0.0.1" else "",
                registry=self.prom_metrics.registry,
            )
        except Exception:
            logger.exception(
                "An exception occurred while starting the metrics server.")
    elif not prometheus_client:
        logger.warning(
            "`prometheus_client` not found, so metrics will not be exported."
        )

    logger.info("Monitor: Started")

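# A consumer can read the metrics address back through the same KV store.
# Sketch only, assuming GcsClient.internal_kv_get takes (key, namespace) to
# mirror the internal_kv_put(key, value, overwrite, namespace) call above:
#
#   client = GcsClient(address="127.0.0.1:6379")  # placeholder address
#   addr = client.internal_kv_get(b"AutoscalerMetricsAddress", None)
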
def __init__(
    self,
    node_ip_address,
    dashboard_agent_port,
    gcs_address,
    minimal,
    temp_dir=None,
    session_dir=None,
    runtime_env_dir=None,
    log_dir=None,
    metrics_export_port=None,
    node_manager_port=None,
    listen_port=0,
    object_store_name=None,
    raylet_name=None,
    logging_params=None,
    disable_metrics_collection: bool = False,
):
    """Initialize the DashboardAgent object."""
    # Public attributes are accessible for all agent modules.
    self.ip = node_ip_address
    self.minimal = minimal
    assert gcs_address is not None
    self.gcs_address = gcs_address
    self.temp_dir = temp_dir
    self.session_dir = session_dir
    self.runtime_env_dir = runtime_env_dir
    self.log_dir = log_dir
    self.dashboard_agent_port = dashboard_agent_port
    self.metrics_export_port = metrics_export_port
    self.node_manager_port = node_manager_port
    self.listen_port = listen_port
    self.object_store_name = object_store_name
    self.raylet_name = raylet_name
    self.logging_params = logging_params
    self.node_id = os.environ["RAY_NODE_ID"]
    self.metrics_collection_disabled = disable_metrics_collection
    # TODO(edoakes): RAY_RAYLET_PID isn't properly set on Windows. This is
    # only used for fate-sharing with the raylet and we need a different
    # fate-sharing mechanism for Windows anyways.
    if sys.platform not in ["win32", "cygwin"]:
        self.ppid = int(os.environ["RAY_RAYLET_PID"])
        assert self.ppid > 0
        logger.info("Parent pid is %s", self.ppid)

    # Setup raylet channel
    options = ray_constants.GLOBAL_GRPC_OPTIONS
    self.aiogrpc_raylet_channel = ray._private.utils.init_grpc_channel(
        f"{self.ip}:{self.node_manager_port}", options, asynchronous=True)

    # Setup grpc server
    self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0), ))
    grpc_ip = "127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0"
    try:
        self.grpc_port = ray._private.tls_utils.add_port_to_grpc_server(
            self.server, f"{grpc_ip}:{self.dashboard_agent_port}")
    except Exception:
        # TODO(SongGuyang): Catch the exception here because there is
        # port conflict issue which brought from static port. We should
        # remove this after we find better port resolution.
        logger.exception(
            "Failed to add port to grpc server. Agent will stay alive but "
            "disable the grpc service.")
        self.server = None
        self.grpc_port = None
    else:
        logger.info("Dashboard agent grpc address: %s:%s", grpc_ip,
                    self.grpc_port)

    # If the agent is started as non-minimal version, http server should
    # be configured to communicate with the dashboard in a head node.
    self.http_server = None

    # Used by the agent and sub-modules.
    # TODO(architkulkarni): Remove gcs_client once the agent exclusively uses
    # gcs_aio_client and not gcs_client.
    self.gcs_client = GcsClient(address=self.gcs_address)
    _initialize_internal_kv(self.gcs_client)
    assert _internal_kv_initialized()
    self.gcs_aio_client = GcsAioClient(address=self.gcs_address)
    self.publisher = GcsAioPublisher(address=self.gcs_address)

async def run(self):
    # Create a http session for all modules.
    # aiohttp<4.0.0 uses a 'loop' variable, aiohttp>=4.0.0 doesn't anymore
    if LooseVersion(aiohttp.__version__) < LooseVersion("4.0.0"):
        self.http_session = aiohttp.ClientSession(
            loop=asyncio.get_event_loop())
    else:
        self.http_session = aiohttp.ClientSession()

    gcs_address = await self.get_gcs_address()

    # Dashboard will handle connection failure automatically
    self.gcs_client = GcsClient(address=gcs_address, nums_reconnect_retry=0)
    internal_kv._initialize_internal_kv(self.gcs_client)
    self.aiogrpc_gcs_channel = ray._private.utils.init_grpc_channel(
        gcs_address, GRPC_CHANNEL_OPTIONS, asynchronous=True)

    if gcs_pubsub_enabled():
        self.gcs_error_subscriber = GcsAioErrorSubscriber(
            address=gcs_address)
        self.gcs_log_subscriber = GcsAioLogSubscriber(address=gcs_address)
        await self.gcs_error_subscriber.subscribe()
        await self.gcs_log_subscriber.subscribe()

    self.health_check_thread = GCSHealthCheckThread(gcs_address)
    self.health_check_thread.start()

    # Start a grpc asyncio server.
    await self.server.start()

    async def _async_notify():
        """Notify signals from queue."""
        while True:
            co = await dashboard_utils.NotifyQueue.get()
            try:
                await co
            except Exception:
                logger.exception(f"Error notifying coroutine {co}")

    modules = self._load_modules()

    # Http server should be initialized after all modules loaded.
    # working_dir uploads for job submission can be up to 100MiB.
    app = aiohttp.web.Application(client_max_size=100 * 1024**2)
    app.add_routes(routes=routes.bound_routes())

    runner = aiohttp.web.AppRunner(app)
    await runner.setup()
    last_ex = None
    for i in range(1 + self.http_port_retries):
        try:
            site = aiohttp.web.TCPSite(runner, self.http_host,
                                       self.http_port)
            await site.start()
            break
        except OSError as e:
            last_ex = e
            self.http_port += 1
            logger.warning("Try to use port %s: %s", self.http_port, e)
    else:
        raise Exception(f"Failed to find a valid port for dashboard after "
                        f"{self.http_port_retries} retries: {last_ex}")
    http_host, http_port, *_ = site._server.sockets[0].getsockname()
    http_host = self.ip if ipaddress.ip_address(
        http_host).is_unspecified else http_host
    logger.info("Dashboard head http address: %s:%s", http_host, http_port)

    # TODO: Use async version if performance is an issue
    # Write the dashboard head port to gcs kv.
    internal_kv._internal_kv_put(
        ray_constants.DASHBOARD_ADDRESS,
        f"{http_host}:{http_port}",
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD)
    internal_kv._internal_kv_put(
        dashboard_consts.DASHBOARD_RPC_ADDRESS,
        f"{self.ip}:{self.grpc_port}",
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD)

    # Dump registered http routes.
    dump_routes = [
        r for r in app.router.routes() if r.method != hdrs.METH_HEAD
    ]
    for r in dump_routes:
        logger.info(r)
    logger.info("Registered %s routes.", len(dump_routes))

    # Freeze signal after all modules loaded.
    dashboard_utils.SignalManager.freeze()
    concurrent_tasks = [
        self._gcs_check_alive(),
        _async_notify(),
        DataOrganizer.purge(),
        DataOrganizer.organize(),
    ]
    await asyncio.gather(*concurrent_tasks,
                         *(m.run(self.server) for m in modules))
    await self.server.wait_for_termination()