def _handle_failure(self, error):
    """Log a monitor-loop failure and publish it to the drivers.

    Optionally kills the autoscaler's workers (only when an autoscaler
    exists and RAY_AUTOSCALER_FATESHARE_WORKERS is "1"), records the error
    in the internal KV store when it is initialized, and finally publishes
    a MONITOR_DIED_ERROR through Redis and/or the GCS publisher.

    Args:
        error: The error that made the monitor loop fail; it is
            interpolated into the published message.
    """
    logger.exception("Error in monitor loop")

    fateshare_workers = os.environ.get("RAY_AUTOSCALER_FATESHARE_WORKERS",
                                       "") == "1"
    if self.autoscaler is not None and fateshare_workers:
        self.autoscaler.kill_workers()
        # Take down autoscaler workers if necessary.
        self.destroy_autoscaler_workers()

    # Something went wrong, so push an error to all current and future
    # drivers.
    message = f"The autoscaler failed with the following error:\n{error}"
    if _internal_kv_initialized():
        _internal_kv_put(DEBUG_AUTOSCALING_ERROR, message, overwrite=True)

    # A Redis client is only created when Redis is the bootstrap store.
    redis_client = None
    if not use_gcs_for_bootstrap():
        redis_client = ray._private.services.create_redis_client(
            self.redis_address, password=self.redis_password)

    gcs_publisher = None
    if gcs_pubsub_enabled():
        if use_gcs_for_bootstrap():
            # NOTE(review): `args` is not bound inside this method, so this
            # presumably relies on a module-level global -- confirm.
            publisher_address = args.gcs_address
        else:
            publisher_address = get_gcs_address_from_redis(redis_client)
        gcs_publisher = GcsPublisher(address=publisher_address)

    from ray._private.utils import publish_error_to_driver
    publish_error_to_driver(
        ray_constants.MONITOR_DIED_ERROR,
        message,
        redis_client=redis_client,
        gcs_publisher=gcs_publisher)
def __init__(self, http_host, http_port, http_port_retries, gcs_address,
             redis_address, redis_password, log_dir):
    """Set up the dashboard head's attributes and its gRPC server.

    Depending on the bootstrap mode, either `gcs_address` (must not be
    None) or `redis_address`/`redis_password` is stored on the instance.
    """
    self.health_check_thread: GCSHealthCheckThread = None
    self._gcs_rpc_error_counter = 0
    # Public attributes are accessible for all head modules.
    # Walkaround for issue: https://github.com/ray-project/ray/issues/7084
    if http_host == "localhost":
        self.http_host = "127.0.0.1"
    else:
        self.http_host = http_host
    self.http_port = http_port
    self.http_port_retries = http_port_retries

    if not use_gcs_for_bootstrap():
        self.redis_address = dashboard_utils.address_tuple(redis_address)
        self.redis_password = redis_password
    else:
        assert gcs_address is not None
        self.gcs_address = gcs_address

    self.log_dir = log_dir
    self.aioredis_client = None
    self.aiogrpc_gcs_channel = None
    self.gcs_error_subscriber = None
    self.gcs_log_subscriber = None
    self.http_session = None
    self.ip = ray.util.get_node_ip_address()

    # The parts are not used afterwards; the unpacking still fails fast
    # when the bootstrap address is not of the form "<host>:<port>".
    if use_gcs_for_bootstrap():
        bootstrap_address = gcs_address
    else:
        bootstrap_address = redis_address
    _host, _port = bootstrap_address.split(":")

    self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0), ))
    # Bind to loopback only when the node itself is on loopback.
    if self.ip == "127.0.0.1":
        grpc_ip = "127.0.0.1"
    else:
        grpc_ip = "0.0.0.0"
    self.grpc_port = ray._private.tls_utils.add_port_to_grpc_server(
        self.server, f"{grpc_ip}:0")
    logger.info("Dashboard head grpc address: %s:%s", grpc_ip,
                self.grpc_port)
async def check(self, *args, **kwargs):
    """Ensure Ray is initialized, then await the wrapped handler `f`.

    If Ray is not yet initialized, connect using the dashboard head's GCS
    address or its Redis address/password, depending on the bootstrap mode.
    Any exception (from connecting or from the handler) is logged and
    converted into an HTTP 500 response whose body is the traceback.
    """
    try:
        if not ray.is_initialized():
            try:
                if use_gcs_for_bootstrap():
                    address = self._dashboard_head.gcs_address
                    redis_pw = None
                else:
                    ip, port = self._dashboard_head.redis_address
                    redis_pw = self._dashboard_head.redis_password
                    address = f"{ip}:{port}"
                # Only the address is logged: the Redis password is a
                # credential and must not end up in log files.
                logger.info("Connecting to ray with address=%s", address)
                ray.init(
                    address=address,
                    namespace=RAY_INTERNAL_JOBS_NAMESPACE,
                    _redis_password=redis_pw)
            except Exception as e:
                # Leave no half-initialized connection behind.
                ray.shutdown()
                raise e from None
        return await f(self, *args, **kwargs)
    except Exception as e:
        logger.exception(f"Unexpected error in handler: {e}")
        return Response(
            text=traceback.format_exc(),
            status=aiohttp.web.HTTPInternalServerError.status_code)
def __init__(self, dashboard_agent):
    """Initialize the reporter object."""
    super().__init__(dashboard_agent)

    if not IN_KUBERNETES_POD:
        logical_cpus = psutil.cpu_count()
        physical_cpus = psutil.cpu_count(logical=False)
        self._cpu_counts = (logical_cpus, physical_cpus)
    else:
        # psutil does not compute this correctly when in a K8s pod.
        # Use ray._private.utils instead.
        pod_cpus = ray._private.utils.get_num_cpus()
        self._cpu_counts = (pod_cpus, pod_cpus)

    self._ip = dashboard_agent.ip
    # This node is the head node when its IP equals the host part of the
    # bootstrap (GCS or Redis) address.
    if use_gcs_for_bootstrap():
        gcs_host = dashboard_agent.gcs_address.split(":")[0]
        self._is_head_node = (self._ip == gcs_host)
    else:
        redis_host, _ = dashboard_agent.redis_address
        self._redis_address = redis_host
        self._is_head_node = (self._ip == self._redis_address)

    self._hostname = socket.gethostname()
    self._workers = set()
    self._network_stats_hist = [(0, (0.0, 0.0))]  # time, (sent, recv)

    if self._ip == "127.0.0.1":
        metrics_bind_ip = "127.0.0.1"
    else:
        metrics_bind_ip = ""
    self._metrics_agent = MetricsAgent(metrics_bind_ip,
                                       dashboard_agent.metrics_export_port)
    self._key = (f"{reporter_consts.REPORTER_PREFIX}"
                 f"{self._dashboard_agent.node_id}")
def run_string_as_driver(driver_script: str, env: Dict = None):
    """Run a driver as a separate process.

    The script is fed to a fresh Python interpreter on stdin; stderr is
    merged into stdout.

    Args:
        driver_script (str): A string to run as a Python script.
        env (dict): The environment variables for the driver. The mapping
            passed in is never modified; when GCS bootstrap is enabled a
            copy extended with the GCS-related variables is used instead.

    Returns:
        The script's output.

    Raises:
        subprocess.CalledProcessError: If the driver exits with a non-zero
            return code. The decoded output is printed first.
    """
    if env is not None and gcs_utils.use_gcs_for_bootstrap():
        # Build a new dict so the caller's environment is left untouched.
        env = {
            **env,
            "RAY_bootstrap_with_gcs": "1",
            "RAY_gcs_grpc_based_pubsub": "1",
            "RAY_gcs_storage": "memory",
        }
    proc = subprocess.Popen(
        [sys.executable, "-"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        env=env,
    )
    with proc:
        output = proc.communicate(driver_script.encode("ascii"))[0]
        if proc.returncode:
            print(ray._private.utils.decode(output))
            raise subprocess.CalledProcessError(proc.returncode, proc.args,
                                                output, proc.stderr)
        out = ray._private.utils.decode(output)
    return out
def __init__(self,
             node_ip_address,
             redis_address,
             dashboard_agent_port,
             gcs_address,
             redis_password=None,
             temp_dir=None,
             session_dir=None,
             runtime_env_dir=None,
             log_dir=None,
             metrics_export_port=None,
             node_manager_port=None,
             listen_port=0,
             object_store_name=None,
             raylet_name=None,
             logging_params=None):
    """Initialize the DashboardAgent object."""
    # Public attributes are accessible for all agent modules.
    self.ip = node_ip_address

    if not use_gcs_for_bootstrap():
        self.redis_address = dashboard_utils.address_tuple(redis_address)
        self.redis_password = redis_password
        self.aioredis_client = None
        self.gcs_address = None
    else:
        assert gcs_address is not None
        self.gcs_address = gcs_address

    self.temp_dir = temp_dir
    self.session_dir = session_dir
    self.runtime_env_dir = runtime_env_dir
    self.log_dir = log_dir
    self.dashboard_agent_port = dashboard_agent_port
    self.metrics_export_port = metrics_export_port
    self.node_manager_port = node_manager_port
    self.listen_port = listen_port
    self.object_store_name = object_store_name
    self.raylet_name = raylet_name
    self.logging_params = logging_params
    self.node_id = os.environ["RAY_NODE_ID"]

    # TODO(edoakes): RAY_RAYLET_PID isn't properly set on Windows. This is
    # only used for fate-sharing with the raylet and we need a different
    # fate-sharing mechanism for Windows anyways.
    on_windows = sys.platform in ["win32", "cygwin"]
    if not on_windows:
        self.ppid = int(os.environ["RAY_RAYLET_PID"])
        assert self.ppid > 0
        logger.info("Parent pid is %s", self.ppid)

    self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0), ))
    # Bind to loopback only when the node itself is on loopback.
    if self.ip == "127.0.0.1":
        grpc_ip = "127.0.0.1"
    else:
        grpc_ip = "0.0.0.0"
    grpc_bind_address = f"{grpc_ip}:{self.dashboard_agent_port}"
    self.grpc_port = ray._private.tls_utils.add_port_to_grpc_server(
        self.server, grpc_bind_address)
    logger.info("Dashboard agent grpc address: %s:%s", grpc_ip,
                self.grpc_port)

    options = (("grpc.enable_http_proxy", 0), )
    raylet_address = f"{self.ip}:{self.node_manager_port}"
    self.aiogrpc_raylet_channel = ray._private.utils.init_grpc_channel(
        raylet_address, options, asynchronous=True)
    self.http_session = None
def test_raylet_tempfiles(shutdown_only):
    """Check the session directory layout, log files and socket files.

    Runs twice: once with zero CPUs, and once with two CPUs where the
    number of worker log files is also checked.
    """
    if sys.platform != "win32":
        expected_socket_files = {"plasma_store", "raylet"}
    else:
        expected_socket_files = set()

    ray.init(num_cpus=0)
    node = ray.worker._global_node
    top_levels = set(os.listdir(node.get_session_dir_path()))
    assert top_levels.issuperset({"sockets", "logs"})

    log_files_expected = {
        "log_monitor.log",
        "monitor.log",
        "raylet.out",
        "raylet.err",
        "gcs_server.out",
        "gcs_server.err",
        "dashboard.log",
        "dashboard_agent.log",
    }
    # Redis logs only exist when Redis is used for bootstrapping.
    if not use_gcs_for_bootstrap():
        log_files_expected.update({
            "redis-shard_0.out",
            "redis-shard_0.err",
            "redis.out",
            "redis.err",
        })

    def check_all_log_file_exists():
        for wanted in log_files_expected:
            present = set(os.listdir(node.get_logs_dir_path()))
            if wanted not in present:
                raise RuntimeError(f"File {wanted} not found!")
        return True

    wait_for_condition(check_all_log_file_exists)
    # Get the list of log files again since the previous one
    # might have the stale information.
    log_files = set(os.listdir(node.get_logs_dir_path()))
    assert log_files_expected.issubset(log_files)
    assert log_files.issuperset(log_files_expected)

    socket_files = set(os.listdir(node.get_sockets_dir_path()))
    assert socket_files == expected_socket_files
    ray.shutdown()

    ray.init(num_cpus=2)
    node = ray.worker._global_node
    top_levels = set(os.listdir(node.get_session_dir_path()))
    assert top_levels.issuperset({"sockets", "logs"})
    time.sleep(3)  # wait workers to start
    log_files = set(os.listdir(node.get_logs_dir_path()))
    assert log_files.issuperset(log_files_expected)

    # Check numbers of worker log file.
    worker_logs = [
        filename for filename in log_files if filename.startswith("worker")
    ]
    assert len(worker_logs) == 4

    socket_files = set(os.listdir(node.get_sockets_dir_path()))
    assert socket_files == expected_socket_files
def make_global_state_accessor(address_info):
    """Create and connect a GlobalStateAccessor for the given cluster.

    Args:
        address_info: Mapping that holds "gcs_address" (GCS bootstrap) or
            "redis_address" (Redis bootstrap).

    Returns:
        The connected GlobalStateAccessor.
    """
    if gcs_utils.use_gcs_for_bootstrap():
        gcs_options = GcsClientOptions.from_gcs_address(
            address_info["gcs_address"])
    else:
        gcs_options = GcsClientOptions.from_redis_address(
            address_info["redis_address"],
            ray.ray_constants.REDIS_DEFAULT_PASSWORD,
        )
    accessor = GlobalStateAccessor(gcs_options)
    accessor.connect()
    return accessor
def test_heartbeats_single(ray_start_cluster_head):
    """Unit test for `Cluster.wait_for_nodes`.

    Test proper metrics.
    """
    cluster = ray_start_cluster_head
    if not use_gcs_for_bootstrap():
        monitor = setup_monitor(cluster.address)
    else:
        monitor = setup_monitor(cluster.gcs_address)
    total_cpus = ray.state.cluster_resources()["CPU"]
    verify_load_metrics(monitor, ({"CPU": 0.0}, {"CPU": total_cpus}))

    @ray.remote
    def work(signal):
        # Poll once per second until the signal has been sent.
        wait_signal = signal.wait.remote()
        while len(ray.wait([wait_signal], timeout=0)[0]) != 1:
            time.sleep(1)

    signal = SignalActor.remote()

    work_handle = work.remote(signal)
    verify_load_metrics(monitor, ({"CPU": 1.0}, {"CPU": total_cpus}))

    ray.get(signal.send.remote())
    ray.get(work_handle)

    @ray.remote(num_cpus=1)
    class Actor:
        def work(self, signal):
            # Poll once per second until the signal has been sent.
            wait_signal = signal.wait.remote()
            while len(ray.wait([wait_signal], timeout=0)[0]) != 1:
                time.sleep(1)

    signal = SignalActor.remote()

    test_actor = Actor.remote()
    work_handle = test_actor.work.remote(signal)

    time.sleep(1)  # Time for actor to get placed and the method to start.

    verify_load_metrics(monitor, ({"CPU": 1.0}, {"CPU": total_cpus}))

    ray.get(signal.send.remote())
    ray.get(work_handle)
    del monitor
def try_create_gcs_client(
        address: Optional[str],
        redis_password: Optional[str]) -> Optional[GcsClient]:
    """
    Try to create a gcs client based on the command line args or by
    autodetecting a running Ray cluster.

    The address is canonicalized first. With GCS bootstrap the client
    connects to it directly; otherwise it is treated as a Redis address and
    the default Redis password is used when none is given.
    """
    bootstrap_address = canonicalize_bootstrap_address(address)
    if use_gcs_for_bootstrap():
        return GcsClient(address=bootstrap_address)

    if redis_password is None:
        password = ray.ray_constants.REDIS_DEFAULT_PASSWORD
    else:
        password = redis_password
    return GcsClient.connect_to_gcs_by_redis_address(bootstrap_address,
                                                     password)
def make_gcs_client(address_info):
    """Build a GcsClient for the cluster described by `address_info`.

    Args:
        address_info: Mapping that holds "gcs_address" (GCS bootstrap) or a
            "<host>:<port>" string under "redis_address" (Redis bootstrap).

    Returns:
        The created GcsClient.
    """
    if use_gcs_for_bootstrap():
        return ray._private.gcs_utils.GcsClient(
            address=address_info["gcs_address"])

    host_and_port = address_info["redis_address"].split(":")
    assert len(host_and_port) == 2
    redis_client = redis.StrictRedis(
        host=host_and_port[0],
        port=int(host_and_port[1]),
        password=ray_constants.REDIS_DEFAULT_PASSWORD)
    return ray._private.gcs_utils.GcsClient.create_from_redis(redis_client)
def serve_proxier(
        connection_str: str,
        address: Optional[str],
        *,
        redis_password: Optional[str] = None,
        session_dir: Optional[str] = None,
        runtime_env_agent_port: int = 0,
):
    """Start the proxying gRPC server and return a handle to it.

    Returns:
        A ClientServerHandle bundling the three proxy servicers and the
        started gRPC server.
    """
    # Initialize internal KV to be used to upload and download working_dir
    # before calling ray.init within the RayletServicers.
    # NOTE(edoakes): address and redis_password should only be None in
    # tests.
    has_address = address is not None
    if use_gcs_for_bootstrap():
        gcs_cli = GcsClient(address=address) if has_address else None
    elif has_address and redis_password is not None:
        gcs_cli = GcsClient.connect_to_gcs_by_redis_address(
            address, redis_password)
    else:
        gcs_cli = None
    if gcs_cli is not None:
        ray.experimental.internal_kv._initialize_internal_kv(gcs_cli)

    executor = futures.ThreadPoolExecutor(
        max_workers=CLIENT_SERVER_MAX_THREADS)
    server = grpc.server(executor, options=GRPC_OPTIONS)
    proxy_manager = ProxyManager(
        address,
        session_dir=session_dir,
        redis_password=redis_password,
        runtime_env_agent_port=runtime_env_agent_port,
    )

    task_servicer = RayletServicerProxy(None, proxy_manager)
    data_servicer = DataServicerProxy(proxy_manager)
    logs_servicer = LogstreamServicerProxy(proxy_manager)

    ray_client_pb2_grpc.add_RayletDriverServicer_to_server(
        task_servicer, server)
    ray_client_pb2_grpc.add_RayletDataStreamerServicer_to_server(
        data_servicer, server)
    ray_client_pb2_grpc.add_RayletLogStreamerServicer_to_server(
        logs_servicer, server)

    add_port_to_grpc_server(server, connection_str)
    server.start()
    return ClientServerHandle(
        task_servicer=task_servicer,
        data_servicer=data_servicer,
        logs_servicer=logs_servicer,
        grpc_server=server,
    )
def __init__(self, redis_address, redis_password, gcs_address, temp_dir):
    """Store the bootstrap address and initialize Ray's global state.

    With GCS bootstrap only `gcs_address` is kept on the instance;
    otherwise `redis_address` and `redis_password` are kept.
    """
    if not use_gcs_for_bootstrap():
        gcs_client_options = ray._raylet.GcsClientOptions.from_redis_address(
            redis_address, redis_password)
        self.redis_address = redis_address
        self.redis_password = redis_password
    else:
        gcs_client_options = ray._raylet.GcsClientOptions.from_gcs_address(
            gcs_address)
        self.gcs_address = gcs_address

    ray.state.state._initialize_global_state(gcs_client_options)
    self.temp_dir = temp_dir
    self.default_service_discovery_flush_period = 5
    super().__init__()
def get_ray_status_output(address):
    """Return the "demand" and "usage" sections of the autoscaler status.

    Connects to the GCS (directly, or through Redis when Redis is the
    bootstrap store), reads the autoscaling status and error entries from
    the internal KV store and renders them with `debug_status`.

    Args:
        address: The bootstrap address of the cluster.

    Returns:
        A dict with the keys "demand" (text after "Demands:") and "usage"
        (text between "Usage:" and "Demands:"), each stripped of
        surrounding newlines and spaces.
    """
    if gcs_utils.use_gcs_for_bootstrap():
        gcs_client = gcs_utils.GcsClient(address=address)
    else:
        redis_client = ray._private.services.create_redis_client(address, "")
        gcs_client = gcs_utils.GcsClient.create_from_redis(redis_client)
    internal_kv._initialize_internal_kv(gcs_client)
    status = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_STATUS)
    error = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_ERROR)

    # Render and split the report once so both fields are derived from the
    # same text.
    sections = debug_status(status, error).split("Demands:")
    return {
        "demand": sections[1].strip("\n").strip(" "),
        "usage": sections[0].split("Usage:")[1].strip("\n").strip(" "),
    }
async def get_gcs_address(self):
    """Return the GCS address for the current bootstrap mode.

    With GCS bootstrap the stored address is returned directly. Otherwise
    an aioredis client is created (and kept on the instance) and the GCS
    address is fetched through it; the process exits with -1 when the
    Redis connection fails.
    """
    if use_gcs_for_bootstrap():
        return self.gcs_address

    # Create an aioredis client for all modules.
    try:
        client = await dashboard_utils.get_aioredis_client(
            self.redis_address, self.redis_password,
            dashboard_consts.CONNECT_REDIS_INTERNAL_SECONDS,
            dashboard_consts.RETRY_REDIS_CONNECTION_TIMES)
        self.aioredis_client = client
    except (socket.gaierror, ConnectionError):
        logger.error(
            "Dashboard head exiting: "
            "Failed to connect to redis at %s", self.redis_address)
        sys.exit(-1)
    return await get_gcs_address_with_retry(self.aioredis_client)
def __init__(self, logs_dir, redis_address, gcs_address,
             redis_password=None):
    """Initialize the log monitor object."""
    self.ip = services.get_node_ip_address()
    self.logs_dir = logs_dir

    if not gcs_utils.use_gcs_for_bootstrap():
        self.redis_client = ray._private.services.create_redis_client(
            redis_address, password=redis_password)
        # With Redis bootstrap the GCS address is looked up in Redis and
        # replaces the `gcs_address` argument.
        gcs_address = gcs_utils.get_gcs_address_from_redis(
            self.redis_client)
    else:
        self.redis_client = None

    if gcs_pubsub.gcs_pubsub_enabled():
        self.publisher = gcs_pubsub.GcsPublisher(address=gcs_address)
    else:
        self.publisher = None

    self.log_filenames = set()
    self.open_file_infos = []
    self.closed_file_infos = []
    self.can_open_more_files = True
def assert_no_thrashing(address):
    """Assert that consumed bytes are at least the restored bytes.

    Both numbers are parsed from the stats-only memory summary of the
    cluster at `address`; a number that is not reported counts as 0.
    """
    state = ray.state.GlobalState()
    if not use_gcs_for_bootstrap():
        options = GcsClientOptions.from_redis_address(
            address, ray.ray_constants.REDIS_DEFAULT_PASSWORD)
    else:
        options = GcsClientOptions.from_gcs_address(address)
    state._initialize_global_state(options)

    report = memory_summary(address=address, stats_only=True)
    restored_bytes = 0
    consumed_bytes = 0
    for row in report.split("\n"):
        fields = row.split(" ")
        if "Restored" in row:
            restored_bytes = int(fields[1])
        if "consumed" in row:
            consumed_bytes = int(fields[-2])
    assert (consumed_bytes >= restored_bytes
            ), f"consumed: {consumed_bytes}, restored: {restored_bytes}"
def list_state_cli_group(ctx):
    """Look up the API server URL in the GCS and store it on `ctx.obj`.

    Raises:
        ValueError: If the API server address cannot be obtained.
    """
    bootstrap_address = services.canonicalize_bootstrap_address(None)
    gcs_client = GcsClient(address=bootstrap_address, nums_reconnect_retry=0)
    ray.experimental.internal_kv._initialize_internal_kv(gcs_client)

    api_server_url = ray._private.utils.internal_kv_get_with_retry(
        gcs_client,
        ray_constants.DASHBOARD_ADDRESS,
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
        num_retries=20,
    )
    if api_server_url is None:
        error_text = (
            "Couldn't obtain the API server address from GCS. It is likely that "
            "the GCS server is down. Check gcs_server.[out | err] to see if it is "
            "still alive.")
        raise ValueError(error_text)

    assert use_gcs_for_bootstrap()
    ctx.ensure_object(dict)
    decoded_url = api_server_url.decode()
    ctx.obj["api_server_url"] = f"http://{decoded_url}"
def test_driver_lives_parallel(ray_start_regular):
    """Terminate, then kill, the raylet, log monitor and monitor processes.

    The GCS server is included as well when Redis is the bootstrap store.
    """
    all_processes = ray.worker._global_node.all_processes
    process_infos = []
    for process_type in (ray_constants.PROCESS_TYPE_RAYLET,
                         ray_constants.PROCESS_TYPE_LOG_MONITOR,
                         ray_constants.PROCESS_TYPE_MONITOR):
        process_infos.extend(all_processes[process_type])
    if not use_gcs_for_bootstrap():
        process_infos.extend(
            all_processes[ray_constants.PROCESS_TYPE_GCS_SERVER])

    # Kill all the components in parallel.
    for info in process_infos:
        info.process.terminate()

    time.sleep(0.1)
    for info in process_infos:
        info.process.kill()

    for info in process_infos:
        info.process.wait()
def memory_summary(address=None,
                   redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
                   group_by="NODE_ADDRESS",
                   sort_by="OBJECT_SIZE",
                   units="B",
                   line_wrap=True,
                   stats_only=False,
                   num_entries=None):
    """Return the memory summary of a cluster followed by its store stats.

    Args:
        address: Bootstrap address; it is canonicalized before use.
        redis_password: Only used when Redis is the bootstrap store.
        group_by, sort_by, units, line_wrap, num_entries: Forwarded to the
            dashboard's memory summary.
        stats_only: When true, return only the store stats.
    """
    # Imported locally under an alias because the dashboard helper has the
    # same name as this function.
    from ray.dashboard.memory_utils import memory_summary as _render_summary

    bootstrap_address = services.canonicalize_bootstrap_address(address)

    state = GlobalState()
    if not use_gcs_for_bootstrap():
        options = GcsClientOptions.from_redis_address(bootstrap_address,
                                                      redis_password)
    else:
        options = GcsClientOptions.from_gcs_address(bootstrap_address)
    state._initialize_global_state(options)

    if stats_only:
        return get_store_stats(state)
    summary_text = _render_summary(state, group_by, sort_by, line_wrap,
                                   units, num_entries)
    stats_text = get_store_stats(state)
    return summary_text + stats_text
def node(self) -> ray.node.Node:
    """Gets a 'ray.Node' object for this node (the head node).

    If it does not already exist, one is created using the bootstrap
    address and cached on the instance.
    """
    if not self._node:
        if not use_gcs_for_bootstrap():
            ray_params = RayParams(redis_address=self.address)
            if self._redis_password:
                ray_params.redis_password = self._redis_password
        else:
            ray_params = RayParams(gcs_address=self.address)
        # Connect to the existing node rather than starting a new one.
        self._node = ray.node.Node(
            ray_params,
            head=False,
            shutdown_at_exit=False,
            spawn_reaper=False,
            connect_only=True)
    return self._node
async def decorator(self, *args, **kwargs):
    """Ensure Ray (and optionally Serve) is up, then await the handler `f`.

    If Ray is not yet initialized, connect using the dashboard head's GCS
    address or its Redis address/password, depending on the bootstrap mode.
    When `connect_to_serve` is set, Serve is started in detached mode before
    the handler runs. Any exception is logged and converted into an HTTP 500
    response whose body is the traceback.
    """
    try:
        if not ray.is_initialized():
            try:
                if use_gcs_for_bootstrap():
                    address = self._dashboard_head.gcs_address
                    redis_pw = None
                else:
                    ip, port = self._dashboard_head.redis_address
                    redis_pw = self._dashboard_head.redis_password
                    address = f"{ip}:{port}"
                # Only the address is logged: the Redis password is a
                # credential and must not end up in log files.
                logger.info("Connecting to ray with address=%s", address)
                ray.init(
                    address=address,
                    namespace=RAY_INTERNAL_DASHBOARD_NAMESPACE,
                    _redis_password=redis_pw,
                )
            except Exception as e:
                # Leave no half-initialized connection behind.
                ray.shutdown()
                raise e from None

        if connect_to_serve:
            # TODO(edoakes): this should probably run in the `serve`
            # namespace.
            serve.start(detached=True)
        return await f(self, *args, **kwargs)
    except Exception as e:
        logger.exception(f"Unexpected error in handler: {e}")
        return Response(
            text=traceback.format_exc(),
            status=aiohttp.web.HTTPInternalServerError.status_code,
        )
def get_file_discovery_content(self):
    """Return the content for Prometheus service discovery.

    The targets are the metrics export addresses of all alive nodes plus,
    when one is registered, the autoscaler's metrics address.
    """
    targets = []
    for node_info in ray.nodes():
        if node_info["alive"] is True:
            targets.append("{}:{}".format(node_info["NodeManagerAddress"],
                                          node_info["MetricsExportPort"]))

    if use_gcs_for_bootstrap():
        gcs_client = GcsClient(address=self.gcs_address)
        autoscaler_addr = gcs_client.internal_kv_get(
            b"AutoscalerMetricsAddress", None)
    else:
        redis_client = services.create_redis_client(self.redis_address,
                                                    self.redis_password)
        autoscaler_addr = redis_client.get("AutoscalerMetricsAddress")

    if autoscaler_addr:
        targets.append(autoscaler_addr.decode("utf-8"))

    discovery_entry = {"labels": {"job": "ray"}, "targets": targets}
    return json.dumps([discovery_entry])
def add_node(self, wait=True, **node_args):
    """Adds a node to the local Ray Cluster.

    All nodes are by default started with the following settings:
        cleanup=True,
        num_cpus=1,
        object_store_memory=150 * 1024 * 1024  # 150 MiB

    The first node added becomes the head node; every later node is
    started as a worker node that bootstraps from the head.

    Args:
        wait (bool): Whether to wait until the node is alive.
        node_args: Keyword arguments used in `start_ray_head` and
            `start_ray_node`. Overrides defaults.

    Returns:
        Node object of the added Ray node.
    """
    ray_params = ray._private.parameter.RayParams(**node_args)
    ray_params.update_if_absent(
        num_cpus=1,
        num_gpus=0,
        object_store_memory=150 * 1024 * 1024,  # 150 MiB
        min_worker_port=0,
        max_worker_port=0,
        dashboard_port=None,
    )

    with disable_client_hook():
        if self.head_node is not None:
            ray_params.update_if_absent(redis_address=self.redis_address)
            ray_params.update_if_absent(gcs_address=self.gcs_address)
            # We only need one log monitor per physical node.
            ray_params.update_if_absent(include_log_monitor=False)
            # Let grpc pick a port.
            ray_params.update_if_absent(node_manager_port=0)
            node = ray.node.Node(
                ray_params,
                head=False,
                shutdown_at_exit=self._shutdown_at_exit,
                spawn_reaper=self._shutdown_at_exit,
            )
            self.worker_nodes.add(node)
        else:
            node = ray.node.Node(
                ray_params,
                head=True,
                shutdown_at_exit=self._shutdown_at_exit,
                spawn_reaper=self._shutdown_at_exit,
            )
            self.head_node = node
            self.redis_address = self.head_node.redis_address
            self.redis_password = node_args.get(
                "redis_password", ray_constants.REDIS_DEFAULT_PASSWORD)
            self.webui_url = self.head_node.webui_url
            # Init global state accessor when creating head node.
            if not use_gcs_for_bootstrap():
                gcs_options = GcsClientOptions.from_redis_address(
                    self.redis_address, self.redis_password)
            else:
                gcs_options = GcsClientOptions.from_gcs_address(
                    node.gcs_address)
            self.global_state._initialize_global_state(gcs_options)

        if wait:
            # Wait for the node to appear in the client table. We do this
            # so that the nodes appears in the client table in the order
            # that the corresponding calls to add_node were made. We do
            # this because in the tests we assume that the driver is
            # connected to the first node that is added.
            self._wait_for_node(node)

    return node
try: dashboard_url = ray.experimental.internal_kv._internal_kv_get( ray_constants.DASHBOARD_ADDRESS, namespace=ray_constants.KV_NAMESPACE_DASHBOARD) if dashboard_url: new_port = int(dashboard_url.split(b":")[-1]) assert new_port > int(port) break except AssertionError as e: logger.info("Retry because of %s", e) finally: if time.time() > start_time + timeout_seconds: raise Exception("Timed out while testing.") @pytest.mark.skipif(use_gcs_for_bootstrap(), reason="Not working right now.") def test_gcs_check_alive(fast_gcs_failure_detection, ray_start_with_dashboard): assert (wait_until_server_available(ray_start_with_dashboard["webui_url"]) is True) all_processes = ray.worker._global_node.all_processes dashboard_info = all_processes[ray_constants.PROCESS_TYPE_DASHBOARD][0] dashboard_proc = psutil.Process(dashboard_info.process.pid) gcs_server_info = all_processes[ray_constants.PROCESS_TYPE_GCS_SERVER][0] gcs_server_proc = psutil.Process(gcs_server_info.process.pid) assert dashboard_proc.status() in [ psutil.STATUS_RUNNING, psutil.STATUS_SLEEPING, psutil.STATUS_DISK_SLEEP ] gcs_server_proc.kill()
async def run(self):
    """Run the dashboard agent until its gRPC server terminates.

    In order: start the parent-liveness check (non-Windows only), connect
    to Redis when Redis is the bootstrap store, create the HTTP session,
    start the gRPC server, create the GCS client, load the modules, start
    the HTTP server with CORS enabled, store the agent's ports in the
    internal KV store, register with the raylet, and finally run all
    modules.
    """

    async def _check_parent():
        """Check if raylet is dead and fate-share if it is."""
        try:
            curr_proc = psutil.Process()
            while True:
                parent = curr_proc.parent()
                # The raylet counts as dead when this process has no
                # parent, its parent has pid 1, or its parent is no
                # longer the recorded `self.ppid`.
                if (parent is None or parent.pid == 1
                        or self.ppid != parent.pid):
                    logger.error("Raylet is dead, exiting.")
                    sys.exit(0)
                await asyncio.sleep(
                    dashboard_consts.
                    DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS)
        except Exception:
            logger.error("Failed to check parent PID, exiting.")
            sys.exit(1)

    # `check_parent_task` only exists on non-Windows platforms; the same
    # platform test guards its use further down.
    if sys.platform not in ["win32", "cygwin"]:
        check_parent_task = create_task(_check_parent())

    if not use_gcs_for_bootstrap():
        # Create an aioredis client for all modules.
        try:
            self.aioredis_client = \
                await dashboard_utils.get_aioredis_client(
                    self.redis_address, self.redis_password,
                    dashboard_consts.CONNECT_REDIS_INTERNAL_SECONDS,
                    dashboard_consts.RETRY_REDIS_CONNECTION_TIMES)
        except (socket.gaierror, ConnectionRefusedError):
            logger.error(
                "Dashboard agent exiting: "
                "Failed to connect to redis at %s", self.redis_address)
            sys.exit(-1)

    # Create a http session for all modules.
    # aiohttp<4.0.0 uses a 'loop' variable, aiohttp>=4.0.0 doesn't anymore
    if LooseVersion(aiohttp.__version__) < LooseVersion("4.0.0"):
        self.http_session = aiohttp.ClientSession(
            loop=asyncio.get_event_loop())
    else:
        self.http_session = aiohttp.ClientSession()

    # Start a grpc asyncio server.
    await self.server.start()

    # With Redis bootstrap the GCS address is read from Redis; otherwise
    # the address stored on the instance is used.
    if not use_gcs_for_bootstrap():
        gcs_address = await self.aioredis_client.get(
            dashboard_consts.GCS_SERVER_ADDRESS)
        self.gcs_client = GcsClient(address=gcs_address.decode())
    else:
        self.gcs_client = GcsClient(address=self.gcs_address)
    modules = self._load_modules()

    # Http server should be initialized after all modules loaded.
    app = aiohttp.web.Application()
    app.add_routes(routes=routes.bound_routes())

    # Enable CORS on all routes.
    cors = aiohttp_cors.setup(
        app,
        defaults={
            "*": aiohttp_cors.ResourceOptions(
                allow_credentials=True,
                expose_headers="*",
                allow_methods="*",
                allow_headers=("Content-Type", "X-Header"),
            )
        })
    for route in list(app.router.routes()):
        cors.add(route)

    runner = aiohttp.web.AppRunner(app)
    await runner.setup()
    # Bind to loopback only when the node itself is on loopback.
    site = aiohttp.web.TCPSite(
        runner, "127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0",
        self.listen_port)
    await site.start()
    # Read the bound host/port back from the listening socket.
    http_host, http_port, *_ = site._server.sockets[0].getsockname()
    logger.info("Dashboard agent http address: %s:%s", http_host,
                http_port)

    # Dump registered http routes.
    dump_routes = [
        r for r in app.router.routes() if r.method != hdrs.METH_HEAD
    ]
    for r in dump_routes:
        logger.info(r)
    logger.info("Registered %s routes.", len(dump_routes))

    # Write the dashboard agent's HTTP and gRPC ports to the internal KV
    # store, keyed by this node's id.
    # TODO: Use async version if performance is an issue
    internal_kv._internal_kv_put(
        f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{self.node_id}",
        json.dumps([http_port, self.grpc_port]),
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD)

    # Register agent to agent manager.
    raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
        self.aiogrpc_raylet_channel)

    await raylet_stub.RegisterAgent(
        agent_manager_pb2.RegisterAgentRequest(agent_pid=os.getpid(),
                                               agent_port=self.grpc_port,
                                               agent_ip_address=self.ip))

    # Run every module, plus the parent check where it was started.
    tasks = [m.run(self.server) for m in modules]
    if sys.platform not in ["win32", "cygwin"]:
        tasks.append(check_parent_task)
    await asyncio.gather(*tasks)

    await self.server.wait_for_termination()
    # Wait for finish signal.
    await runner.cleanup()
loop.run_until_complete(agent.run()) except Exception as e: # All these env vars should be available because # they are provided by the parent raylet. restart_count = os.environ["RESTART_COUNT"] max_restart_count = os.environ["MAX_RESTART_COUNT"] raylet_pid = os.environ["RAY_RAYLET_PID"] node_ip = args.node_ip_address if restart_count >= max_restart_count: # Agent is failed to be started many times. # Push an error to all drivers, so that users can know the # impact of the issue. redis_client = None gcs_publisher = None if gcs_pubsub_enabled(): if use_gcs_for_bootstrap(): gcs_publisher = GcsPublisher(args.gcs_address) else: redis_client = ray._private.services.create_redis_client( args.redis_address, password=args.redis_password) gcs_publisher = GcsPublisher( address=get_gcs_address_from_redis(redis_client)) else: redis_client = ray._private.services.create_redis_client( args.redis_address, password=args.redis_password) traceback_str = ray._private.utils.format_error_message( traceback.format_exc()) message = ( f"(ip={node_ip}) " f"The agent on node {platform.uname()[1]} failed to "
class TestRedisPassword:
    """Tests that a Ray cluster can be started with a Redis password."""

    @pytest.mark.skipif(
        use_gcs_for_bootstrap(), reason="Not valid for gcs bootstrap")
    def test_redis_password(self, password, shutdown_only):
        """Redis must reject missing passwords and accept the given one."""

        @ray.remote
        def f():
            return 1

        info = ray.init(_redis_password=password)
        redis_ip, redis_port = info["redis_address"].split(":")

        # Check that we can run a task
        ray.get(f.remote())

        # Check that Redis connections require a password
        unauthenticated_client = redis.StrictRedis(
            host=redis_ip, port=redis_port, password=None)
        with pytest.raises(redis.exceptions.AuthenticationError):
            unauthenticated_client.ping()

        # We want to simulate how this is called by ray.scripts.start().
        try:
            ray._private.services.wait_for_redis_to_start(
                redis_ip, redis_port, password="******")

        # We catch a generic Exception here in case someone later changes
        # the type of the exception.
        except Exception as ex:
            # By contrast, we may be fairly confident the exact string
            # 'invalid password' won't go away, because redis-py simply
            # wraps the exact error from the Redis library.
            # https://github.com/andymccurdy/redis-py/blob/master/
            # redis/connection.py#L132
            cause = ex.__cause__
            wrapped_auth_error = (
                isinstance(cause, redis.AuthenticationError)
                and "invalid password" in str(cause))
            # Except, apparently sometimes redis-py raises a completely
            # different *type* of error for a bad password,
            # redis.ResponseError, which is not even derived from
            # redis.ConnectionError as redis.AuthenticationError is.
            wrong_pass_response = (
                isinstance(ex, redis.ResponseError)
                and "WRONGPASS invalid username-password pair" in str(ex))
            # Anything other than these two known failures is re-raised.
            if not wrapped_auth_error and not wrong_pass_response:
                raise

        # Check that we can connect to Redis using the provided password
        authenticated_client = redis.StrictRedis(
            host=redis_ip, port=redis_port, password=password)
        assert authenticated_client.ping()

    def test_redis_password_cluster(self, password, shutdown_only):
        """A multi-node cluster with a Redis password can run a task."""

        @ray.remote
        def f():
            return 1

        node_args = {"redis_password": password}
        cluster = Cluster(
            initialize_head=True, connect=True, head_node_args=node_args)
        cluster.add_node(**node_args)

        ray.get(f.remote())
log_monitor = LogMonitor( args.logs_dir, args.redis_address, args.gcs_address, redis_password=args.redis_password, ) try: log_monitor.run() except Exception as e: # Something went wrong, so push an error to all drivers. redis_client = ray._private.services.create_redis_client( args.redis_address, password=args.redis_password) gcs_publisher = None if gcs_pubsub_enabled(): if gcs_utils.use_gcs_for_bootstrap(): gcs_publisher = GcsPublisher(address=args.gcs_address) else: gcs_publisher = GcsPublisher( address=gcs_utils.get_gcs_address_from_redis(redis_client)) traceback_str = ray._private.utils.format_error_message( traceback.format_exc()) message = (f"The log monitor on node {platform.node()} " f"failed with the following error:\n{traceback_str}") ray._private.utils.publish_error_to_driver( ray_constants.LOG_MONITOR_DIED_ERROR, message, redis_client=redis_client, gcs_publisher=gcs_publisher, ) logger.error(message)
# Create a second remote function to guarantee that when we call # get_path2.remote(), the second function to run will have been run on # the worker. @ray.remote def get_path2(): return sys.path assert "fake_directory" not in ray.get(get_path2.remote()) @pytest.mark.skipif( "RAY_PROFILING" not in os.environ, reason="Only tested in client/profiling build.") @pytest.mark.skipif( client_test_enabled() and use_gcs_for_bootstrap(), reason=("wait_for_function will miss in this mode. To be fixed after using" " gcs to bootstrap all component.")) def test_profiling_api(ray_start_2_cpus): @ray.remote def f(delay): with profiling.profile( "custom_event", extra_data={"name": "custom name"}): time.sleep(delay) pass @ray.remote def g(input_list): # The argument input_list should be a list containing one object ref. ray.wait([input_list[0]])