Example #1
0
    async def run(self):
        """Bring up every agent service and block until the gRPC server stops.

        Steps, in order: start watching the parent process (skipped on
        win32/cygwin), connect to redis unless bootstrapping through GCS,
        open the shared aiohttp client session, start the gRPC server,
        create the GCS client, load the modules, serve the bound HTTP
        routes with CORS enabled, store ``[http_port, grpc_port]`` in the
        internal KV, register with the raylet, then run all modules.

        The process exits via ``sys.exit`` when the parent check trips
        (code 0), when that check itself fails (code 1), or when redis is
        unreachable (code -1).
        """
        # The parent watchdog is only used off win32/cygwin.
        fate_sharing = sys.platform not in ["win32", "cygwin"]

        async def _watch_parent():
            """Poll the parent process and exit once the raylet is gone."""
            try:
                this_process = psutil.Process()
                while True:
                    raylet = this_process.parent()
                    # No parent, a parent with pid 1, or a parent other than
                    # the recorded self.ppid all count as "raylet is dead".
                    raylet_alive = (raylet is not None and raylet.pid != 1
                                    and self.ppid == raylet.pid)
                    if not raylet_alive:
                        logger.error("Raylet is dead, exiting.")
                        sys.exit(0)
                    interval = (dashboard_consts.
                                DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS)
                    await asyncio.sleep(interval)
            except Exception:
                logger.error("Failed to check parent PID, exiting.")
                sys.exit(1)

        if fate_sharing:
            parent_watch = create_task(_watch_parent())

        if not use_gcs_for_bootstrap():
            # Redis bootstrap: one aioredis client shared by all modules.
            try:
                redis_client = await dashboard_utils.get_aioredis_client(
                    self.redis_address, self.redis_password,
                    dashboard_consts.CONNECT_REDIS_INTERNAL_SECONDS,
                    dashboard_consts.RETRY_REDIS_CONNECTION_TIMES)
            except (socket.gaierror, ConnectionRefusedError):
                logger.error(
                    "Dashboard agent exiting: "
                    "Failed to connect to redis at %s", self.redis_address)
                sys.exit(-1)
            else:
                self.aioredis_client = redis_client

        # One http session shared by all modules. Only aiohttp<4.0.0 takes
        # the explicit 'loop' keyword.
        session_options = {}
        if LooseVersion(aiohttp.__version__) < LooseVersion("4.0.0"):
            session_options["loop"] = asyncio.get_event_loop()
        self.http_session = aiohttp.ClientSession(**session_options)

        # Bring up the grpc asyncio server.
        await self.server.start()

        # The GCS address comes from the agent's own config when
        # bootstrapping via GCS, otherwise it is looked up in redis.
        if use_gcs_for_bootstrap():
            gcs_target = self.gcs_address
        else:
            raw_address = await self.aioredis_client.get(
                dashboard_consts.GCS_SERVER_ADDRESS)
            gcs_target = raw_address.decode()
        self.gcs_client = GcsClient(address=gcs_target)
        modules = self._load_modules()

        # The http app is built only after the modules are loaded.
        app = aiohttp.web.Application()
        app.add_routes(routes=routes.bound_routes())

        # Apply the same CORS options to every registered route.
        cors_defaults = aiohttp_cors.ResourceOptions(
            allow_credentials=True,
            expose_headers="*",
            allow_methods="*",
            allow_headers=("Content-Type", "X-Header"),
        )
        cors = aiohttp_cors.setup(app, defaults={"*": cors_defaults})
        # Iterate over a snapshot of the route table while adding CORS.
        for registered in list(app.router.routes()):
            cors.add(registered)

        runner = aiohttp.web.AppRunner(app)
        await runner.setup()
        # Loopback stays on loopback; any other ip binds all interfaces.
        if self.ip == "127.0.0.1":
            bind_host = "127.0.0.1"
        else:
            bind_host = "0.0.0.0"
        site = aiohttp.web.TCPSite(runner, bind_host, self.listen_port)
        await site.start()
        listening_socket = site._server.sockets[0]
        # Only the first two fields of the socket name are needed.
        http_host, http_port, *_ = listening_socket.getsockname()
        logger.info("Dashboard agent http address: %s:%s", http_host,
                    http_port)

        # Log every registered http route except the HEAD ones.
        visible_routes = []
        for candidate in app.router.routes():
            if candidate.method == hdrs.METH_HEAD:
                continue
            visible_routes.append(candidate)
        for entry in visible_routes:
            logger.info(entry)
        logger.info("Registered %s routes.", len(visible_routes))

        # Store this node's [http_port, grpc_port] in the dashboard KV
        # namespace (synchronous call).
        port_key = (f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}"
                    f"{self.node_id}")
        port_value = json.dumps([http_port, self.grpc_port])
        internal_kv._internal_kv_put(
            port_key,
            port_value,
            namespace=ray_constants.KV_NAMESPACE_DASHBOARD)

        # Announce this agent to the raylet's agent manager.
        raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
            self.aiogrpc_raylet_channel)
        registration = agent_manager_pb2.RegisterAgentRequest(
            agent_pid=os.getpid(),
            agent_port=self.grpc_port,
            agent_ip_address=self.ip)
        await raylet_stub.RegisterAgent(registration)

        # Run all modules, plus the parent watchdog when it was started.
        pending = []
        for module in modules:
            pending.append(module.run(self.server))
        if fate_sharing:
            pending.append(parent_watch)
        await asyncio.gather(*pending)

        await self.server.wait_for_termination()
        # The grpc server has terminated; shut the http runner down too.
        await runner.cleanup()
Example #2
0
    async def run(self):
        """Start the dashboard agent and serve until the gRPC server stops.

        Steps, in order: start the parent-process watchdog, connect to
        redis (``sys.exit(-1)`` if the connection fails), open the shared
        aiohttp client session, start the gRPC server, load the modules,
        serve the bound HTTP routes with CORS enabled on ``self.ip``,
        write ``[http_port, grpc_port]`` to redis, register with the
        raylet, then run all modules alongside the watchdog.
        """

        async def _check_parent():
            """Check if raylet is dead."""
            curr_proc = psutil.Process()
            while True:
                parent = curr_proc.parent()
                # A missing parent or a parent with pid 1 is treated as the
                # raylet having died.
                if parent is None or parent.pid == 1:
                    logger.error("raylet is dead, agent will die because "
                                 "it fate-shares with raylet.")
                    sys.exit(0)
                await asyncio.sleep(
                    dashboard_consts.
                    DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS)

        # Started first so the check keeps running during the setup below.
        check_parent_task = create_task(_check_parent())

        # Create an aioredis client for all modules.
        try:
            self.aioredis_client = await dashboard_utils.get_aioredis_client(
                self.redis_address, self.redis_password,
                dashboard_consts.CONNECT_REDIS_INTERNAL_SECONDS,
                dashboard_consts.RETRY_REDIS_CONNECTION_TIMES)
        except (socket.gaierror, ConnectionRefusedError):
            logger.error(
                "Dashboard agent exiting: "
                "Failed to connect to redis at %s", self.redis_address)
            sys.exit(-1)

        # Create a http session for all modules.
        self.http_session = aiohttp.ClientSession(
            loop=asyncio.get_event_loop())

        # Start a grpc asyncio server.
        await self.server.start()

        modules = self._load_modules()

        # Http server should be initialized after all modules loaded.
        app = aiohttp.web.Application()
        app.add_routes(routes=routes.bound_routes())

        # Enable CORS on all routes.
        cors = aiohttp_cors.setup(
            app,
            defaults={
                "*": aiohttp_cors.ResourceOptions(
                    allow_credentials=True,
                    expose_headers="*",
                    allow_methods="*",
                    allow_headers=("Content-Type", "X-Header"),
                )
            })
        # Iterate over a snapshot of the route table while adding CORS.
        for route in list(app.router.routes()):
            cors.add(route)

        runner = aiohttp.web.AppRunner(app)
        await runner.setup()
        # Port 0 is requested here; the actual bound port is read back from
        # the listening socket below.
        site = aiohttp.web.TCPSite(runner, self.ip, 0)
        await site.start()
        # getsockname() yields (host, port) for IPv4 sockets but
        # (host, port, flowinfo, scope_id) for IPv6 ones. Take the first two
        # fields and ignore the rest so an IPv6 self.ip does not raise
        # ValueError on unpacking.
        http_host, http_port, *_ = site._server.sockets[0].getsockname()
        logger.info("Dashboard agent http address: %s:%s", http_host,
                    http_port)

        # Dump registered http routes.
        dump_routes = [
            r for r in app.router.routes() if r.method != hdrs.METH_HEAD
        ]
        for r in dump_routes:
            logger.info(r)
        logger.info("Registered %s routes.", len(dump_routes))

        # Write the dashboard agent port to redis.
        await self.aioredis_client.set(
            f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{self.node_id}",
            json.dumps([http_port, self.grpc_port]))

        # Register agent to agent manager.
        raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
            self.aiogrpc_raylet_channel)

        await raylet_stub.RegisterAgent(
            agent_manager_pb2.RegisterAgentRequest(
                agent_pid=os.getpid(),
                agent_port=self.grpc_port,
                agent_ip_address=self.ip))

        # Run every module together with the parent watchdog.
        await asyncio.gather(check_parent_task,
                             *(m.run(self.server) for m in modules))
        await self.server.wait_for_termination()
        # The grpc server has terminated; shut the http runner down too.
        await runner.cleanup()
Example #3
0
    async def run(self):
        """Start the agent's services and run its modules.

        Steps, in order: start the parent-process watchdog (skipped on
        win32/cygwin), start the gRPC server when one exists, create the
        GCS client, load the modules, configure the HTTP server unless the
        agent is minimal, publish ``[http_port, grpc_port]`` to the
        internal KV (``http_port`` is -1 when no HTTP server is running),
        register with the raylet, then run all modules.

        ``self.server`` is treated as optional: both ``start()`` and
        ``wait_for_termination()`` are only called when it is truthy.
        """

        async def _check_parent():
            """Check if raylet is dead and fate-share if it is."""
            try:
                curr_proc = psutil.Process()
                while True:
                    parent = curr_proc.parent()
                    # No parent, a parent with pid 1, or a parent other than
                    # the recorded self.ppid all count as "raylet is dead".
                    if parent is None or parent.pid == 1 or self.ppid != parent.pid:
                        logger.error("Raylet is dead, exiting.")
                        sys.exit(0)
                    await asyncio.sleep(
                        dashboard_consts.DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS
                    )
            except Exception:
                logger.error("Failed to check parent PID, exiting.")
                sys.exit(1)

        if sys.platform not in ["win32", "cygwin"]:
            check_parent_task = create_task(_check_parent())

        # Start a grpc asyncio server.
        if self.server:
            await self.server.start()

        self.gcs_client = GcsClient(address=self.gcs_address)
        modules = self._load_modules()

        # Setup http server if necessary.
        if not self.minimal:
            # If the agent is not minimal it should start the http server
            # to communicate with the dashboard in a head node.
            # Http server is not started in the minimal version because
            # it requires additional dependencies that are not
            # included in the minimal ray package.
            try:
                self.http_server = await self._configure_http_server(modules)
            except Exception:
                # TODO(SongGuyang): Catch the exception here because there is
                # port conflict issue which brought from static port. We should
                # remove this after we find better port resolution.
                logger.exception(
                    "Failed to start http server. Agent will stay alive but "
                    "disable the http service."
                )

        # Write the dashboard agent port to kv.
        # TODO: Use async version if performance is an issue
        # -1 should indicate that http server is not started.
        # NOTE(review): self.http_server is only assigned above on success;
        # presumably it is initialised elsewhere (e.g. __init__) - confirm.
        http_port = -1 if not self.http_server else self.http_server.http_port
        internal_kv._internal_kv_put(
            f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{self.node_id}",
            json.dumps([http_port, self.grpc_port]),
            namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
        )

        # Register agent to agent manager.
        raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
            self.aiogrpc_raylet_channel
        )

        await raylet_stub.RegisterAgent(
            agent_manager_pb2.RegisterAgentRequest(
                agent_id=self.agent_id,
                agent_port=self.grpc_port,
                agent_ip_address=self.ip,
            )
        )

        # Run all modules, plus the parent watchdog when it was started.
        tasks = [m.run(self.server) for m in modules]
        if sys.platform not in ["win32", "cygwin"]:
            tasks.append(check_parent_task)
        await asyncio.gather(*tasks)

        # Mirror the guard used before start(): without a grpc server there
        # is nothing to wait on, and calling the method on a falsy
        # self.server would raise AttributeError.
        if self.server:
            await self.server.wait_for_termination()

        if self.http_server:
            await self.http_server.cleanup()
Example #4
0
    async def run(self):
        """Start the agent's services and run its modules.

        Steps, in order: start the parent-process watchdog (skipped on
        win32/cygwin), start the gRPC server when one exists, load the
        modules, configure the HTTP server unless the agent is minimal,
        publish ``[http_port, grpc_port]`` to the internal KV
        (``http_port`` is -1 when no HTTP server is running), register
        with the raylet, then run all modules.

        When the watchdog decides the raylet is gone it inspects the tail
        of ``raylet.out`` under ``self.log_dir``. If the tail lacks the
        SIGTERM marker, or the log cannot be read, it publishes a
        ``RAYLET_DIED_ERROR`` to the driver before exiting.
        """

        async def _check_parent():
            """Check if raylet is dead and fate-share if it is."""
            try:
                curr_proc = psutil.Process()
                while True:
                    parent = curr_proc.parent()
                    # No parent, a parent with pid 1, or a parent other than
                    # the recorded self.ppid all count as "raylet is dead".
                    if parent is None or parent.pid == 1 or self.ppid != parent.pid:
                        log_path = os.path.join(self.log_dir, "raylet.out")
                        error = False
                        msg = f"Raylet is terminated: ip={self.ip}, id={self.node_id}. "
                        try:
                            # NOTE(review): seeking to a computed offset in a
                            # text-mode file may land inside a multi-byte
                            # character; a resulting decode error is handled
                            # by the except branch below - confirm this is
                            # acceptable.
                            with open(log_path, "r", encoding="utf-8") as f:
                                # Seek to _RAYLET_LOG_MAX_TAIL_SIZE from the end if the
                                # file is larger than that.
                                f.seek(0, io.SEEK_END)
                                pos = max(0,
                                          f.tell() - _RAYLET_LOG_MAX_TAIL_SIZE)
                                f.seek(pos, io.SEEK_SET)
                                # Read remaining logs by lines.
                                raylet_logs = f.readlines()
                                # Assume the SIGTERM message must exist within the last
                                # _RAYLET_LOG_MAX_TAIL_SIZE of the log file.
                                if any("Raylet received SIGTERM" in line
                                       for line in raylet_logs):
                                    msg += "Termination is graceful."
                                    logger.info(msg)
                                else:
                                    msg += (
                                        "Termination is unexpected. Possible reasons "
                                        "include: (1) SIGKILL by the user or system "
                                        "OOM killer, (2) Invalid memory access from "
                                        "Raylet causing SIGSEGV or SIGBUS, "
                                        "(3) Other termination signals. "
                                        f"Last {_RAYLET_LOG_MAX_PUBLISH_LINES} lines "
                                        "of the Raylet logs:\n")
                                    msg += "    " + "    ".join(raylet_logs[
                                        -_RAYLET_LOG_MAX_PUBLISH_LINES:])
                                    error = True
                        except Exception as e:
                            msg += f"Failed to read Raylet logs at {log_path}: {e}!"
                            # Logger.exception() requires a message. Calling
                            # it with no arguments raised TypeError here,
                            # which the outer handler turned into exit(1)
                            # before the error could be published.
                            logger.exception(msg)
                            error = True
                        if error:
                            logger.error(msg)
                            # TODO: switch to async if necessary.
                            ray._private.utils.publish_error_to_driver(
                                ray_constants.RAYLET_DIED_ERROR,
                                msg,
                                gcs_publisher=GcsPublisher(
                                    address=self.gcs_address),
                            )
                        else:
                            logger.info(msg)
                        sys.exit(0)
                    await asyncio.sleep(
                        dashboard_consts.
                        DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS)
            except Exception:
                logger.error("Failed to check parent PID, exiting.")
                sys.exit(1)

        if sys.platform not in ["win32", "cygwin"]:
            check_parent_task = create_task(_check_parent())

        # Start a grpc asyncio server.
        if self.server:
            await self.server.start()

        modules = self._load_modules()

        # Setup http server if necessary.
        if not self.minimal:
            # If the agent is not minimal it should start the http server
            # to communicate with the dashboard in a head node.
            # Http server is not started in the minimal version because
            # it requires additional dependencies that are not
            # included in the minimal ray package.
            try:
                self.http_server = await self._configure_http_server(modules)
            except Exception:
                # TODO(SongGuyang): Catch the exception here because there is
                # port conflict issue which brought from static port. We should
                # remove this after we find better port resolution.
                logger.exception(
                    "Failed to start http server. Agent will stay alive but "
                    "disable the http service.")

        # Write the dashboard agent port to kv.
        # TODO: Use async version if performance is an issue
        # -1 should indicate that http server is not started.
        http_port = -1 if not self.http_server else self.http_server.http_port
        internal_kv._internal_kv_put(
            f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{self.node_id}",
            json.dumps([http_port, self.grpc_port]),
            namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
        )

        # Register agent to agent manager.
        raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
            self.aiogrpc_raylet_channel)

        await raylet_stub.RegisterAgent(
            agent_manager_pb2.RegisterAgentRequest(
                agent_pid=os.getpid(),
                agent_port=self.grpc_port,
                agent_ip_address=self.ip,
            ))

        # Run all modules, plus the parent watchdog when it was started.
        tasks = [m.run(self.server) for m in modules]
        if sys.platform not in ["win32", "cygwin"]:
            tasks.append(check_parent_task)
        await asyncio.gather(*tasks)

        # Mirror the guard used before start(): without a grpc server there
        # is nothing to wait on, and calling the method on a falsy
        # self.server would raise AttributeError.
        if self.server:
            await self.server.wait_for_termination()

        if self.http_server:
            await self.http_server.cleanup()
Example #5
0
    async def run(self):
        """Launch the agent's services, run its modules, then clean up.

        Steps, in order: start watching the parent process (skipped on
        win32/cygwin), connect to redis unless bootstrapping through GCS,
        start the gRPC server, create the GCS client, load the modules,
        configure the HTTP server unless the agent is minimal, store
        ``[http_port, grpc_port]`` in the internal KV (``http_port`` is -1
        without an HTTP server), register with the raylet, and run all
        modules.

        The process exits via ``sys.exit`` when the parent check trips
        (code 0), when that check itself fails (code 1), or when redis is
        unreachable (code -1).
        """
        # The parent watchdog is only used off win32/cygwin.
        watch_parent = sys.platform not in ["win32", "cygwin"]

        async def _monitor_raylet():
            """Exit this process once the parent raylet is gone."""
            try:
                own_process = psutil.Process()
                while True:
                    owner = own_process.parent()
                    # No parent, a parent with pid 1, or a parent other than
                    # the recorded self.ppid all count as "raylet is dead".
                    still_owned = (
                        owner is not None
                        and owner.pid != 1
                        and self.ppid == owner.pid
                    )
                    if not still_owned:
                        logger.error("Raylet is dead, exiting.")
                        sys.exit(0)
                    pause = (
                        dashboard_consts.
                        DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS)
                    await asyncio.sleep(pause)
            except Exception:
                logger.error("Failed to check parent PID, exiting.")
                sys.exit(1)

        if watch_parent:
            watchdog = create_task(_monitor_raylet())

        if not use_gcs_for_bootstrap():
            # Redis bootstrap: one aioredis client shared by all modules.
            try:
                redis_client = await dashboard_utils.get_aioredis_client(
                    self.redis_address,
                    self.redis_password,
                    dashboard_consts.CONNECT_REDIS_INTERNAL_SECONDS,
                    dashboard_consts.RETRY_REDIS_CONNECTION_TIMES,
                )
                self.aioredis_client = redis_client
            except (socket.gaierror, ConnectionRefusedError):
                logger.error(
                    "Dashboard agent exiting: "
                    "Failed to connect to redis at %s",
                    self.redis_address,
                )
                sys.exit(-1)

        # Bring up the grpc asyncio server.
        await self.server.start()

        # The GCS address comes from the agent's own config when
        # bootstrapping via GCS, otherwise it is looked up in redis.
        if use_gcs_for_bootstrap():
            gcs_target = self.gcs_address
        else:
            stored_address = await self.aioredis_client.get(
                dashboard_consts.GCS_SERVER_ADDRESS)
            gcs_target = stored_address.decode()
        self.gcs_client = GcsClient(address=gcs_target)
        modules = self._load_modules()

        # Only a non-minimal agent configures the http server; the minimal
        # ray package lacks the extra dependencies it needs.
        if not self.minimal:
            configured = await self._configure_http_server(modules)
            self.http_server = configured

        # Store this node's [http_port, grpc_port] in the dashboard KV
        # namespace (synchronous call). An http port of -1 signals that no
        # http server is running.
        if self.http_server:
            http_port = self.http_server.http_port
        else:
            http_port = -1
        port_key = (
            f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}"
            f"{self.node_id}"
        )
        port_value = json.dumps([http_port, self.grpc_port])
        internal_kv._internal_kv_put(
            port_key,
            port_value,
            namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
        )

        # Announce this agent to the raylet's agent manager.
        raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
            self.aiogrpc_raylet_channel)
        registration = agent_manager_pb2.RegisterAgentRequest(
            agent_pid=os.getpid(),
            agent_port=self.grpc_port,
            agent_ip_address=self.ip,
        )
        await raylet_stub.RegisterAgent(registration)

        # Run all modules, plus the parent watchdog when it was started.
        jobs = []
        for module in modules:
            jobs.append(module.run(self.server))
        if watch_parent:
            jobs.append(watchdog)
        await asyncio.gather(*jobs)

        await self.server.wait_for_termination()

        # Release the http server, if one was configured.
        if self.http_server:
            await self.http_server.cleanup()