Ejemplo n.º 1
0
    def __init__(
        self,
        namespace: str = None,
    ):
        if namespace is not None and not isinstance(namespace, str):
            raise TypeError("namespace must a string, got: {}.".format(
                type(namespace)))

        self.gcs_client = GcsClient(
            address=ray.get_runtime_context().gcs_address)
        self.timeout = RAY_SERVE_KV_TIMEOUT_S
        self.namespace = namespace or ""
Ejemplo n.º 2
0
def try_create_gcs_client(
    address: Optional[str], redis_password: Optional[str]
) -> Optional[GcsClient]:
    """
    Try to create a gcs client based on the the command line args or by
    autodetecting a running Ray cluster.
    """
    address = canonicalize_bootstrap_address(address)
    if use_gcs_for_bootstrap():
        return GcsClient(address=address)
    else:
        if redis_password is None:
            redis_password = ray.ray_constants.REDIS_DEFAULT_PASSWORD
        return GcsClient.connect_to_gcs_by_redis_address(address, redis_password)
Ejemplo n.º 3
0
def serve_proxier(
    connection_str: str,
    address: Optional[str],
    *,
    redis_password: Optional[str] = None,
    session_dir: Optional[str] = None,
    runtime_env_agent_port: int = 0,
):
    # Initialize internal KV to be used to upload and download working_dir
    # before calling ray.init within the RayletServicers.
    # NOTE(edoakes): redis_address and redis_password should only be None in
    # tests.
    if use_gcs_for_bootstrap():
        if address is not None:
            gcs_cli = GcsClient(address=address)
            ray.experimental.internal_kv._initialize_internal_kv(gcs_cli)
    else:
        if address is not None and redis_password is not None:
            gcs_cli = GcsClient.connect_to_gcs_by_redis_address(
                address, redis_password)
            ray.experimental.internal_kv._initialize_internal_kv(gcs_cli)

    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=CLIENT_SERVER_MAX_THREADS),
        options=GRPC_OPTIONS,
    )
    proxy_manager = ProxyManager(
        address,
        session_dir=session_dir,
        redis_password=redis_password,
        runtime_env_agent_port=runtime_env_agent_port,
    )
    task_servicer = RayletServicerProxy(None, proxy_manager)
    data_servicer = DataServicerProxy(proxy_manager)
    logs_servicer = LogstreamServicerProxy(proxy_manager)
    ray_client_pb2_grpc.add_RayletDriverServicer_to_server(
        task_servicer, server)
    ray_client_pb2_grpc.add_RayletDataStreamerServicer_to_server(
        data_servicer, server)
    ray_client_pb2_grpc.add_RayletLogStreamerServicer_to_server(
        logs_servicer, server)
    add_port_to_grpc_server(server, connection_str)
    server.start()
    return ClientServerHandle(
        task_servicer=task_servicer,
        data_servicer=data_servicer,
        logs_servicer=logs_servicer,
        grpc_server=server,
    )
Ejemplo n.º 4
0
 def get_file_discovery_content(self):
     """Return the content for Prometheus service discovery."""
     nodes = ray.nodes()
     metrics_export_addresses = [
         "{}:{}".format(node["NodeManagerAddress"], node["MetricsExportPort"])
         for node in nodes
         if node["alive"] is True
     ]
     gcs_client = GcsClient(address=self.gcs_address)
     autoscaler_addr = gcs_client.internal_kv_get(b"AutoscalerMetricsAddress", None)
     if autoscaler_addr:
         metrics_export_addresses.append(autoscaler_addr.decode("utf-8"))
     return json.dumps(
         [{"labels": {"job": "ray"}, "targets": metrics_export_addresses}]
     )
Ejemplo n.º 5
0
 def __init__(self, group_name: str):
     self._group_name = group_name
     self._job_id = ray.get_runtime_context().job_id
     gcs_address = ray._private.worker._global_node.gcs_address
     self._gcs_client = GcsClient(address=gcs_address,
                                  nums_reconnect_retry=10)
     internal_kv._initialize_internal_kv(self._gcs_client)
Ejemplo n.º 6
0
def try_create_gcs_client(
        address: Optional[str],
        redis_password: Optional[str]) -> Optional[GcsClient]:
    """
    Try to create a gcs client based on the the command line args or by
    autodetecting a running Ray cluster.
    """
    address = canonicalize_bootstrap_address(address)
    return GcsClient(address=address)
Ejemplo n.º 7
0
def list_state_cli_group(ctx):
    address = services.canonicalize_bootstrap_address(None)
    gcs_client = GcsClient(address=address, nums_reconnect_retry=0)
    ray.experimental.internal_kv._initialize_internal_kv(gcs_client)
    api_server_url = ray._private.utils.internal_kv_get_with_retry(
        gcs_client,
        ray_constants.DASHBOARD_ADDRESS,
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
        num_retries=20,
    )
    if api_server_url is None:
        raise ValueError((
            "Couldn't obtain the API server address from GCS. It is likely that "
            "the GCS server is down. Check gcs_server.[out | err] to see if it is "
            "still alive."))

    assert use_gcs_for_bootstrap()
    ctx.ensure_object(dict)
    ctx.obj["api_server_url"] = f"http://{api_server_url.decode()}"
Ejemplo n.º 8
0
 def __init__(self):
     gcs_address = ray.worker._global_node.gcs_address
     self._gcs_client = GcsClient(address=gcs_address,
                                  nums_reconnect_retry=0)
     internal_kv._initialize_internal_kv(self._gcs_client)
Ejemplo n.º 9
0
    def __init__(
        self,
        address: str,
        autoscaling_config: Union[str, Callable[[], Dict[str, Any]]],
        redis_password: Optional[str] = None,
        prefix_cluster_info: bool = False,
        monitor_ip: Optional[str] = None,
        stop_event: Optional[Event] = None,
        retry_on_failure: bool = True,
    ):
        gcs_address = address
        options = (("grpc.enable_http_proxy", 0), )
        gcs_channel = ray._private.utils.init_grpc_channel(
            gcs_address, options)
        # TODO: Use gcs client for this
        self.gcs_node_resources_stub = (
            gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel))
        self.gcs_node_info_stub = gcs_service_pb2_grpc.NodeInfoGcsServiceStub(
            gcs_channel)
        if redis_password is not None:
            logger.warning("redis_password has been deprecated.")
        # Set the redis client and mode so _internal_kv works for autoscaler.
        worker = ray.worker.global_worker
        gcs_client = GcsClient(address=gcs_address)

        if monitor_ip:
            monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
            gcs_client.internal_kv_put(b"AutoscalerMetricsAddress",
                                       monitor_addr.encode(), True, None)
        _initialize_internal_kv(gcs_client)
        if monitor_ip:
            monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
            gcs_client.internal_kv_put(b"AutoscalerMetricsAddress",
                                       monitor_addr.encode(), True, None)
        worker.mode = 0
        head_node_ip = gcs_address.split(":")[0]

        self.load_metrics = LoadMetrics()
        self.last_avail_resources = None
        self.event_summarizer = EventSummarizer()
        self.prefix_cluster_info = prefix_cluster_info
        # Can be used to signal graceful exit from monitor loop.
        self.stop_event = stop_event  # type: Optional[Event]
        self.retry_on_failure = retry_on_failure
        self.autoscaling_config = autoscaling_config
        self.autoscaler = None
        # If set, we are in a manually created cluster (non-autoscaling) and
        # simply mirroring what the GCS tells us the cluster node types are.
        self.readonly_config = None

        self.prom_metrics = AutoscalerPrometheusMetrics()
        if monitor_ip and prometheus_client:
            # If monitor_ip wasn't passed in, then don't attempt to start the
            # metric server to keep behavior identical to before metrics were
            # introduced
            try:
                logger.info(
                    "Starting autoscaler metrics server on port {}".format(
                        AUTOSCALER_METRIC_PORT))
                prometheus_client.start_http_server(
                    port=AUTOSCALER_METRIC_PORT,
                    addr="127.0.0.1" if head_node_ip == "127.0.0.1" else "",
                    registry=self.prom_metrics.registry,
                )
            except Exception:
                logger.exception(
                    "An exception occurred while starting the metrics server.")
        elif not prometheus_client:
            logger.warning(
                "`prometheus_client` not found, so metrics will not be exported."
            )

        logger.info("Monitor: Started")
Ejemplo n.º 10
0
class RayInternalKVStore(KVStoreBase):
    """Wraps ray's internal_kv with a namespace to avoid collisions.

    Supports string keys and bytes values, caller must handle serialization.
    """
    def __init__(
        self,
        namespace: str = None,
    ):
        if namespace is not None and not isinstance(namespace, str):
            raise TypeError("namespace must a string, got: {}.".format(
                type(namespace)))

        self.gcs_client = GcsClient(
            address=ray.get_runtime_context().gcs_address)
        self.timeout = RAY_SERVE_KV_TIMEOUT_S
        self.namespace = namespace or ""

    def get_storage_key(self, key: str) -> str:
        return "{ns}-{key}".format(ns=self.namespace, key=key)

    def put(self, key: str, val: bytes) -> bool:
        """Put the key-value pair into the store.

        Args:
            key (str)
            val (bytes)
        """
        if not isinstance(key, str):
            raise TypeError("key must be a string, got: {}.".format(type(key)))
        if not isinstance(val, bytes):
            raise TypeError("val must be bytes, got: {}.".format(type(val)))

        try:
            return self.gcs_client.internal_kv_put(
                self.get_storage_key(key).encode(),
                val,
                overwrite=True,
                namespace=ray_constants.KV_NAMESPACE_SERVE,
                timeout=self.timeout,
            )
        except Exception as e:
            raise KVStoreError(e.code())

    def get(self, key: str) -> Optional[bytes]:
        """Get the value associated with the given key from the store.

        Args:
            key (str)

        Returns:
            The bytes value. If the key wasn't found, returns None.
        """
        if not isinstance(key, str):
            raise TypeError("key must be a string, got: {}.".format(type(key)))

        try:
            return self.gcs_client.internal_kv_get(
                self.get_storage_key(key).encode(),
                namespace=ray_constants.KV_NAMESPACE_SERVE,
                timeout=self.timeout,
            )
        except Exception as e:
            raise KVStoreError(e.code())

    def delete(self, key: str):
        """Delete the value associated with the given key from the store.

        Args:
            key (str)
        """

        if not isinstance(key, str):
            raise TypeError("key must be a string, got: {}.".format(type(key)))

        try:
            return self.gcs_client.internal_kv_del(
                self.get_storage_key(key).encode(),
                False,
                namespace=ray_constants.KV_NAMESPACE_SERVE,
                timeout=self.timeout,
            )
        except Exception as e:
            raise KVStoreError(e.code())
Ejemplo n.º 11
0
    async def run(self):
        # Create an aioredis client for all modules.
        try:
            self.aioredis_client = await dashboard_utils.get_aioredis_client(
                self.redis_address, self.redis_password,
                dashboard_consts.CONNECT_REDIS_INTERNAL_SECONDS,
                dashboard_consts.RETRY_REDIS_CONNECTION_TIMES)
        except (socket.gaierror, ConnectionError):
            logger.error(
                "Dashboard head exiting: "
                "Failed to connect to redis at %s", self.redis_address)
            sys.exit(-1)

        # Create a http session for all modules.
        # aiohttp<4.0.0 uses a 'loop' variable, aiohttp>=4.0.0 doesn't anymore
        if LooseVersion(aiohttp.__version__) < LooseVersion("4.0.0"):
            self.http_session = aiohttp.ClientSession(
                loop=asyncio.get_event_loop())
        else:
            self.http_session = aiohttp.ClientSession()

        # Waiting for GCS is ready.
        gcs_address = await get_gcs_address_with_retry(self.aioredis_client)
        self.gcs_client = GcsClient(gcs_address)
        self.aiogrpc_gcs_channel = ray._private.utils.init_grpc_channel(
            gcs_address, GRPC_CHANNEL_OPTIONS, asynchronous=True)

        self.health_check_thread = GCSHealthCheckThread(gcs_address)
        self.health_check_thread.start()

        # Start a grpc asyncio server.
        await self.server.start()

        async def _async_notify():
            """Notify signals from queue."""
            while True:
                co = await dashboard_utils.NotifyQueue.get()
                try:
                    await co
                except Exception:
                    logger.exception(f"Error notifying coroutine {co}")

        modules = self._load_modules()

        # Http server should be initialized after all modules loaded.
        # working_dir uploads for job submission can be up to 100MiB.
        app = aiohttp.web.Application(client_max_size=100 * 1024**2)
        app.add_routes(routes=routes.bound_routes())

        runner = aiohttp.web.AppRunner(app)
        await runner.setup()
        last_ex = None
        for i in range(1 + self.http_port_retries):
            try:
                site = aiohttp.web.TCPSite(runner, self.http_host,
                                           self.http_port)
                await site.start()
                break
            except OSError as e:
                last_ex = e
                self.http_port += 1
                logger.warning("Try to use port %s: %s", self.http_port, e)
        else:
            raise Exception(f"Failed to find a valid port for dashboard after "
                            f"{self.http_port_retries} retries: {last_ex}")
        http_host, http_port, *_ = site._server.sockets[0].getsockname()
        http_host = self.ip if ipaddress.ip_address(
            http_host).is_unspecified else http_host
        logger.info("Dashboard head http address: %s:%s", http_host, http_port)

        # Write the dashboard head port to redis.
        await self.aioredis_client.set(ray_constants.REDIS_KEY_DASHBOARD,
                                       f"{http_host}:{http_port}")
        await self.aioredis_client.set(
            dashboard_consts.REDIS_KEY_DASHBOARD_RPC,
            f"{self.ip}:{self.grpc_port}")

        # Dump registered http routes.
        dump_routes = [
            r for r in app.router.routes() if r.method != hdrs.METH_HEAD
        ]
        for r in dump_routes:
            logger.info(r)
        logger.info("Registered %s routes.", len(dump_routes))

        # Freeze signal after all modules loaded.
        dashboard_utils.SignalManager.freeze()
        concurrent_tasks = [
            self._gcs_check_alive(),
            _async_notify(),
            DataOrganizer.purge(),
            DataOrganizer.organize(),
        ]
        await asyncio.gather(*concurrent_tasks,
                             *(m.run(self.server) for m in modules))
        await self.server.wait_for_termination()
Ejemplo n.º 12
0
    async def run(self):
        async def _check_parent():
            """Check if raylet is dead and fate-share if it is."""
            try:
                curr_proc = psutil.Process()
                while True:
                    parent = curr_proc.parent()
                    if parent is None or parent.pid == 1 or self.ppid != parent.pid:
                        logger.error("Raylet is dead, exiting.")
                        sys.exit(0)
                    await asyncio.sleep(
                        dashboard_consts.DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS
                    )
            except Exception:
                logger.error("Failed to check parent PID, exiting.")
                sys.exit(1)

        if sys.platform not in ["win32", "cygwin"]:
            check_parent_task = create_task(_check_parent())

        # Start a grpc asyncio server.
        if self.server:
            await self.server.start()

        self.gcs_client = GcsClient(address=self.gcs_address)
        modules = self._load_modules()

        # Setup http server if necessary.
        if not self.minimal:
            # If the agent is not minimal it should start the http server
            # to communicate with the dashboard in a head node.
            # Http server is not started in the minimal version because
            # it requires additional dependencies that are not
            # included in the minimal ray package.
            try:
                self.http_server = await self._configure_http_server(modules)
            except Exception:
                # TODO(SongGuyang): Catch the exception here because there is
                # port conflict issue which brought from static port. We should
                # remove this after we find better port resolution.
                logger.exception(
                    "Failed to start http server. Agent will stay alive but "
                    "disable the http service."
                )

        # Write the dashboard agent port to kv.
        # TODO: Use async version if performance is an issue
        # -1 should indicate that http server is not started.
        http_port = -1 if not self.http_server else self.http_server.http_port
        internal_kv._internal_kv_put(
            f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{self.node_id}",
            json.dumps([http_port, self.grpc_port]),
            namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
        )

        # Register agent to agent manager.
        raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
            self.aiogrpc_raylet_channel
        )

        await raylet_stub.RegisterAgent(
            agent_manager_pb2.RegisterAgentRequest(
                agent_id=self.agent_id,
                agent_port=self.grpc_port,
                agent_ip_address=self.ip,
            )
        )

        tasks = [m.run(self.server) for m in modules]
        if sys.platform not in ["win32", "cygwin"]:
            tasks.append(check_parent_task)
        await asyncio.gather(*tasks)

        await self.server.wait_for_termination()

        if self.http_server:
            await self.http_server.cleanup()
Ejemplo n.º 13
0
    async def run(self):

        # Create a http session for all modules.
        # aiohttp<4.0.0 uses a 'loop' variable, aiohttp>=4.0.0 doesn't anymore
        if LooseVersion(aiohttp.__version__) < LooseVersion("4.0.0"):
            self.http_session = aiohttp.ClientSession(
                loop=asyncio.get_event_loop())
        else:
            self.http_session = aiohttp.ClientSession()

        gcs_address = await self.get_gcs_address()

        # Dashboard will handle connection failure automatically
        self.gcs_client = GcsClient(address=gcs_address,
                                    nums_reconnect_retry=0)
        internal_kv._initialize_internal_kv(self.gcs_client)
        self.aiogrpc_gcs_channel = ray._private.utils.init_grpc_channel(
            gcs_address, GRPC_CHANNEL_OPTIONS, asynchronous=True)
        if gcs_pubsub_enabled():
            self.gcs_error_subscriber = GcsAioErrorSubscriber(
                address=gcs_address)
            self.gcs_log_subscriber = GcsAioLogSubscriber(address=gcs_address)
            await self.gcs_error_subscriber.subscribe()
            await self.gcs_log_subscriber.subscribe()

        self.health_check_thread = GCSHealthCheckThread(gcs_address)
        self.health_check_thread.start()

        # Start a grpc asyncio server.
        await self.server.start()

        async def _async_notify():
            """Notify signals from queue."""
            while True:
                co = await dashboard_utils.NotifyQueue.get()
                try:
                    await co
                except Exception:
                    logger.exception(f"Error notifying coroutine {co}")

        modules = self._load_modules()

        # Http server should be initialized after all modules loaded.
        # working_dir uploads for job submission can be up to 100MiB.
        app = aiohttp.web.Application(client_max_size=100 * 1024**2)
        app.add_routes(routes=routes.bound_routes())

        runner = aiohttp.web.AppRunner(app)
        await runner.setup()
        last_ex = None
        for i in range(1 + self.http_port_retries):
            try:
                site = aiohttp.web.TCPSite(runner, self.http_host,
                                           self.http_port)
                await site.start()
                break
            except OSError as e:
                last_ex = e
                self.http_port += 1
                logger.warning("Try to use port %s: %s", self.http_port, e)
        else:
            raise Exception(f"Failed to find a valid port for dashboard after "
                            f"{self.http_port_retries} retries: {last_ex}")
        http_host, http_port, *_ = site._server.sockets[0].getsockname()
        http_host = self.ip if ipaddress.ip_address(
            http_host).is_unspecified else http_host
        logger.info("Dashboard head http address: %s:%s", http_host, http_port)

        # TODO: Use async version if performance is an issue
        # Write the dashboard head port to gcs kv.
        internal_kv._internal_kv_put(
            ray_constants.DASHBOARD_ADDRESS,
            f"{http_host}:{http_port}",
            namespace=ray_constants.KV_NAMESPACE_DASHBOARD)
        internal_kv._internal_kv_put(
            dashboard_consts.DASHBOARD_RPC_ADDRESS,
            f"{self.ip}:{self.grpc_port}",
            namespace=ray_constants.KV_NAMESPACE_DASHBOARD)

        # Dump registered http routes.
        dump_routes = [
            r for r in app.router.routes() if r.method != hdrs.METH_HEAD
        ]
        for r in dump_routes:
            logger.info(r)
        logger.info("Registered %s routes.", len(dump_routes))

        # Freeze signal after all modules loaded.
        dashboard_utils.SignalManager.freeze()
        concurrent_tasks = [
            self._gcs_check_alive(),
            _async_notify(),
            DataOrganizer.purge(),
            DataOrganizer.organize(),
        ]
        await asyncio.gather(*concurrent_tasks,
                             *(m.run(self.server) for m in modules))
        await self.server.wait_for_termination()
Ejemplo n.º 14
0
    async def run(self):
        async def _check_parent():
            """Check if raylet is dead and fate-share if it is."""
            try:
                curr_proc = psutil.Process()
                while True:
                    parent = curr_proc.parent()
                    if parent is None or parent.pid == 1 or self.ppid != parent.pid:
                        logger.error("Raylet is dead, exiting.")
                        sys.exit(0)
                    await asyncio.sleep(
                        dashboard_consts.
                        DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS)
            except Exception:
                logger.error("Failed to check parent PID, exiting.")
                sys.exit(1)

        if sys.platform not in ["win32", "cygwin"]:
            check_parent_task = create_task(_check_parent())

        if not use_gcs_for_bootstrap():
            # Create an aioredis client for all modules.
            try:
                self.aioredis_client = await dashboard_utils.get_aioredis_client(
                    self.redis_address,
                    self.redis_password,
                    dashboard_consts.CONNECT_REDIS_INTERNAL_SECONDS,
                    dashboard_consts.RETRY_REDIS_CONNECTION_TIMES,
                )
            except (socket.gaierror, ConnectionRefusedError):
                logger.error(
                    "Dashboard agent exiting: "
                    "Failed to connect to redis at %s",
                    self.redis_address,
                )
                sys.exit(-1)

        # Start a grpc asyncio server.
        await self.server.start()

        if not use_gcs_for_bootstrap():
            gcs_address = await self.aioredis_client.get(
                dashboard_consts.GCS_SERVER_ADDRESS)
            self.gcs_client = GcsClient(address=gcs_address.decode())
        else:
            self.gcs_client = GcsClient(address=self.gcs_address)
        modules = self._load_modules()

        # Setup http server if necessary.
        if not self.minimal:
            # If the agent is not minimal it should start the http server
            # to communicate with the dashboard in a head node.
            # Http server is not started in the minimal version because
            # it requires additional dependencies that are not
            # included in the minimal ray package.
            self.http_server = await self._configure_http_server(modules)

        # Write the dashboard agent port to redis.
        # TODO: Use async version if performance is an issue
        # -1 should indicate that http server is not started.
        http_port = -1 if not self.http_server else self.http_server.http_port
        internal_kv._internal_kv_put(
            f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{self.node_id}",
            json.dumps([http_port, self.grpc_port]),
            namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
        )

        # Register agent to agent manager.
        raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
            self.aiogrpc_raylet_channel)

        await raylet_stub.RegisterAgent(
            agent_manager_pb2.RegisterAgentRequest(
                agent_pid=os.getpid(),
                agent_port=self.grpc_port,
                agent_ip_address=self.ip,
            ))

        tasks = [m.run(self.server) for m in modules]
        if sys.platform not in ["win32", "cygwin"]:
            tasks.append(check_parent_task)
        await asyncio.gather(*tasks)

        await self.server.wait_for_termination()

        if self.http_server:
            await self.http_server.cleanup()
Ejemplo n.º 15
0
    async def run(self):
        async def _check_parent():
            """Check if raylet is dead and fate-share if it is."""
            try:
                curr_proc = psutil.Process()
                while True:
                    parent = curr_proc.parent()
                    if (parent is None or parent.pid == 1
                            or self.ppid != parent.pid):
                        logger.error("Raylet is dead, exiting.")
                        sys.exit(0)
                    await asyncio.sleep(
                        dashboard_consts.
                        DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS)
            except Exception:
                logger.error("Failed to check parent PID, exiting.")
                sys.exit(1)

        if sys.platform not in ["win32", "cygwin"]:
            check_parent_task = create_task(_check_parent())

        if not use_gcs_for_bootstrap():
            # Create an aioredis client for all modules.
            try:
                self.aioredis_client = \
                    await dashboard_utils.get_aioredis_client(
                        self.redis_address, self.redis_password,
                        dashboard_consts.CONNECT_REDIS_INTERNAL_SECONDS,
                        dashboard_consts.RETRY_REDIS_CONNECTION_TIMES)
            except (socket.gaierror, ConnectionRefusedError):
                logger.error(
                    "Dashboard agent exiting: "
                    "Failed to connect to redis at %s", self.redis_address)
                sys.exit(-1)

        # Create a http session for all modules.
        # aiohttp<4.0.0 uses a 'loop' variable, aiohttp>=4.0.0 doesn't anymore
        if LooseVersion(aiohttp.__version__) < LooseVersion("4.0.0"):
            self.http_session = aiohttp.ClientSession(
                loop=asyncio.get_event_loop())
        else:
            self.http_session = aiohttp.ClientSession()

        # Start a grpc asyncio server.
        await self.server.start()

        if not use_gcs_for_bootstrap():
            gcs_address = await self.aioredis_client.get(
                dashboard_consts.GCS_SERVER_ADDRESS)
            self.gcs_client = GcsClient(address=gcs_address.decode())
        else:
            self.gcs_client = GcsClient(address=self.gcs_address)
        modules = self._load_modules()

        # Http server should be initialized after all modules loaded.
        app = aiohttp.web.Application()
        app.add_routes(routes=routes.bound_routes())

        # Enable CORS on all routes.
        cors = aiohttp_cors.setup(app,
                                  defaults={
                                      "*":
                                      aiohttp_cors.ResourceOptions(
                                          allow_credentials=True,
                                          expose_headers="*",
                                          allow_methods="*",
                                          allow_headers=("Content-Type",
                                                         "X-Header"),
                                      )
                                  })
        for route in list(app.router.routes()):
            cors.add(route)

        runner = aiohttp.web.AppRunner(app)
        await runner.setup()
        site = aiohttp.web.TCPSite(
            runner, "127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0",
            self.listen_port)
        await site.start()
        http_host, http_port, *_ = site._server.sockets[0].getsockname()
        logger.info("Dashboard agent http address: %s:%s", http_host,
                    http_port)

        # Dump registered http routes.
        dump_routes = [
            r for r in app.router.routes() if r.method != hdrs.METH_HEAD
        ]
        for r in dump_routes:
            logger.info(r)
        logger.info("Registered %s routes.", len(dump_routes))

        # Write the dashboard agent port to redis.
        # TODO: Use async version if performance is an issue
        internal_kv._internal_kv_put(
            f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{self.node_id}",
            json.dumps([http_port, self.grpc_port]),
            namespace=ray_constants.KV_NAMESPACE_DASHBOARD)

        # Register agent to agent manager.
        raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
            self.aiogrpc_raylet_channel)

        await raylet_stub.RegisterAgent(
            agent_manager_pb2.RegisterAgentRequest(agent_pid=os.getpid(),
                                                   agent_port=self.grpc_port,
                                                   agent_ip_address=self.ip))

        tasks = [m.run(self.server) for m in modules]
        if sys.platform not in ["win32", "cygwin"]:
            tasks.append(check_parent_task)
        await asyncio.gather(*tasks)

        await self.server.wait_for_termination()
        # Wait for finish signal.
        await runner.cleanup()
Ejemplo n.º 16
0
    async def run(self):
        gcs_address = self.gcs_address

        # Dashboard will handle connection failure automatically
        self.gcs_client = GcsClient(address=gcs_address,
                                    nums_reconnect_retry=0)
        internal_kv._initialize_internal_kv(self.gcs_client)
        self.aiogrpc_gcs_channel = ray._private.utils.init_grpc_channel(
            gcs_address, GRPC_CHANNEL_OPTIONS, asynchronous=True)

        self.gcs_error_subscriber = GcsAioErrorSubscriber(address=gcs_address)
        self.gcs_log_subscriber = GcsAioLogSubscriber(address=gcs_address)
        await self.gcs_error_subscriber.subscribe()
        await self.gcs_log_subscriber.subscribe()

        self.health_check_thread = GCSHealthCheckThread(gcs_address)
        self.health_check_thread.start()

        # Start a grpc asyncio server.
        await self.server.start()

        async def _async_notify():
            """Notify signals from queue."""
            while True:
                co = await dashboard_utils.NotifyQueue.get()
                try:
                    await co
                except Exception:
                    logger.exception(f"Error notifying coroutine {co}")

        modules = self._load_modules()

        http_host, http_port = self.http_host, self.http_port
        if not self.minimal:
            self.http_server = await self._configure_http_server(modules)
            http_host, http_port = self.http_server.get_address()
        internal_kv._internal_kv_put(
            ray_constants.DASHBOARD_ADDRESS,
            f"{http_host}:{http_port}",
            namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
        )

        # TODO: Use async version if performance is an issue
        # Write the dashboard head port to gcs kv.
        internal_kv._internal_kv_put(
            dashboard_consts.DASHBOARD_RPC_ADDRESS,
            f"{self.ip}:{self.grpc_port}",
            namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
        )

        # Freeze signal after all modules loaded.
        dashboard_utils.SignalManager.freeze()
        concurrent_tasks = [
            self._gcs_check_alive(),
            _async_notify(),
            DataOrganizer.purge(),
            DataOrganizer.organize(),
        ]
        await asyncio.gather(*concurrent_tasks,
                             *(m.run(self.server) for m in modules))
        await self.server.wait_for_termination()

        if self.http_server:
            await self.http_server.cleanup()
Ejemplo n.º 17
0
    def __init__(
        self,
        node_ip_address,
        dashboard_agent_port,
        gcs_address,
        minimal,
        temp_dir=None,
        session_dir=None,
        runtime_env_dir=None,
        log_dir=None,
        metrics_export_port=None,
        node_manager_port=None,
        listen_port=0,
        object_store_name=None,
        raylet_name=None,
        logging_params=None,
        disable_metrics_collection: bool = False,
    ):
        """Initialize the DashboardAgent object."""
        # Public attributes are accessible for all agent modules.
        self.ip = node_ip_address
        self.minimal = minimal

        assert gcs_address is not None
        self.gcs_address = gcs_address

        self.temp_dir = temp_dir
        self.session_dir = session_dir
        self.runtime_env_dir = runtime_env_dir
        self.log_dir = log_dir
        self.dashboard_agent_port = dashboard_agent_port
        self.metrics_export_port = metrics_export_port
        self.node_manager_port = node_manager_port
        self.listen_port = listen_port
        self.object_store_name = object_store_name
        self.raylet_name = raylet_name
        self.logging_params = logging_params
        self.node_id = os.environ["RAY_NODE_ID"]
        self.metrics_collection_disabled = disable_metrics_collection
        # TODO(edoakes): RAY_RAYLET_PID isn't properly set on Windows. This is
        # only used for fate-sharing with the raylet and we need a different
        # fate-sharing mechanism for Windows anyways.
        if sys.platform not in ["win32", "cygwin"]:
            self.ppid = int(os.environ["RAY_RAYLET_PID"])
            assert self.ppid > 0
            logger.info("Parent pid is %s", self.ppid)

        # Setup raylet channel
        options = ray_constants.GLOBAL_GRPC_OPTIONS
        self.aiogrpc_raylet_channel = ray._private.utils.init_grpc_channel(
            f"{self.ip}:{self.node_manager_port}", options, asynchronous=True)

        # Setup grpc server
        self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0), ))
        grpc_ip = "127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0"
        try:
            self.grpc_port = ray._private.tls_utils.add_port_to_grpc_server(
                self.server, f"{grpc_ip}:{self.dashboard_agent_port}")
        except Exception:
            # TODO(SongGuyang): Catch the exception here because there is
            # port conflict issue which brought from static port. We should
            # remove this after we find better port resolution.
            logger.exception(
                "Failed to add port to grpc server. Agent will stay alive but "
                "disable the grpc service.")
            self.server = None
            self.grpc_port = None
        else:
            logger.info("Dashboard agent grpc address: %s:%s", grpc_ip,
                        self.grpc_port)

        # If the agent is started as non-minimal version, http server should
        # be configured to communicate with the dashboard in a head node.
        self.http_server = None

        # Used by the agent and sub-modules.
        # TODO(architkulkarni): Remove gcs_client once the agent exclusively uses
        # gcs_aio_client and not gcs_client.
        self.gcs_client = GcsClient(address=self.gcs_address)
        _initialize_internal_kv(self.gcs_client)
        assert _internal_kv_initialized()
        self.gcs_aio_client = GcsAioClient(address=self.gcs_address)
        self.publisher = GcsAioPublisher(address=self.gcs_address)
Ejemplo n.º 18
0
    def __init__(self,
                 redis_address,
                 autoscaling_config,
                 redis_password=None,
                 prefix_cluster_info=False,
                 monitor_ip=None,
                 stop_event: Optional[Event] = None):
        # Initialize the Redis clients.
        ray.state.state._initialize_global_state(
            redis_address, redis_password=redis_password)
        self.redis = ray._private.services.create_redis_client(
            redis_address, password=redis_password)
        if monitor_ip:
            self.redis.set("AutoscalerMetricsAddress",
                           f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}")
        (ip, port) = redis_address.split(":")
        # Initialize the gcs stub for getting all node resource usage.
        gcs_address = self.redis.get("GcsServerAddress").decode("utf-8")
        options = (("grpc.enable_http_proxy", 0), )
        gcs_channel = ray._private.utils.init_grpc_channel(
            gcs_address, options)
        # TODO: Use gcs client for this
        self.gcs_node_resources_stub = \
            gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel)
        self.gcs_node_info_stub = \
            gcs_service_pb2_grpc.NodeInfoGcsServiceStub(gcs_channel)

        # Set the redis client and mode so _internal_kv works for autoscaler.
        worker = ray.worker.global_worker
        worker.redis_client = self.redis
        gcs_client = GcsClient.create_from_redis(self.redis)
        _initialize_internal_kv(gcs_client)
        worker.mode = 0
        head_node_ip = redis_address.split(":")[0]
        self.redis_address = redis_address
        self.redis_password = redis_password
        if os.environ.get("RAY_FAKE_CLUSTER"):
            self.load_metrics = LoadMetrics(local_ip=FAKE_HEAD_NODE_ID)
        else:
            self.load_metrics = LoadMetrics(local_ip=head_node_ip)
        self.last_avail_resources = None
        self.event_summarizer = EventSummarizer()
        self.prefix_cluster_info = prefix_cluster_info
        # Can be used to signal graceful exit from monitor loop.
        self.stop_event = stop_event  # type: Optional[Event]
        self.autoscaling_config = autoscaling_config
        self.autoscaler = None
        # If set, we are in a manually created cluster (non-autoscaling) and
        # simply mirroring what the GCS tells us the cluster node types are.
        self.readonly_config = None

        self.prom_metrics = AutoscalerPrometheusMetrics()
        if monitor_ip and prometheus_client:
            # If monitor_ip wasn't passed in, then don't attempt to start the
            # metric server to keep behavior identical to before metrics were
            # introduced
            try:
                logger.info(
                    "Starting autoscaler metrics server on port {}".format(
                        AUTOSCALER_METRIC_PORT))
                prometheus_client.start_http_server(
                    port=AUTOSCALER_METRIC_PORT,
                    addr="127.0.0.1" if head_node_ip == "127.0.0.1" else "",
                    registry=self.prom_metrics.registry)
            except Exception:
                logger.exception(
                    "An exception occurred while starting the metrics server.")
        elif not prometheus_client:
            logger.warning("`prometheus_client` not found, so metrics will "
                           "not be exported.")

        logger.info("Monitor: Started")
Ejemplo n.º 19
0
def test_external_storage_namespace_isolation(shutdown_only):
    addr = ray.init(namespace="a",
                    _system_config={
                        "external_storage_namespace": "c1"
                    }).address_info["address"]
    gcs_client = GcsClient(address=addr)

    assert gcs_client.internal_kv_put(b"ABC", b"DEF", True, None) == 1

    assert gcs_client.internal_kv_get(b"ABC", None) == b"DEF"

    ray.shutdown()

    addr = ray.init(namespace="a",
                    _system_config={
                        "external_storage_namespace": "c2"
                    }).address_info["address"]
    gcs_client = GcsClient(address=addr)
    assert gcs_client.internal_kv_get(b"ABC", None) is None
    assert gcs_client.internal_kv_put(b"ABC", b"XYZ", True, None) == 1

    assert gcs_client.internal_kv_get(b"ABC", None) == b"XYZ"
    ray.shutdown()

    addr = ray.init(namespace="a",
                    _system_config={
                        "external_storage_namespace": "c1"
                    }).address_info["address"]
    gcs_client = GcsClient(address=addr)
    assert gcs_client.internal_kv_get(b"ABC", None) == b"DEF"