Example #1
    def __init__(self, group_name: str):
        self._group_name = group_name
        self._job_id = ray.get_runtime_context().job_id
        gcs_address = ray._private.worker._global_node.gcs_address
        self._gcs_client = GcsClient(address=gcs_address,
                                     nums_reconnect_retry=10)
        internal_kv._initialize_internal_kv(self._gcs_client)
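Every example in this listing follows the same basic pattern: construct a GcsClient, pass it to _initialize_internal_kv, and then use the module-level _internal_kv_* helpers. A minimal sketch of that pattern, assuming the module paths used in the examples (they have moved between Ray versions):

import ray
from ray._private.gcs_utils import GcsClient
from ray.experimental import internal_kv

ray.init()

# Bind the process-wide internal KV to a GCS client, as in Example #1.
gcs_address = ray._private.worker._global_node.gcs_address
gcs_client = GcsClient(address=gcs_address)
internal_kv._initialize_internal_kv(gcs_client)
assert internal_kv._internal_kv_initialized()

# Keys and values are bytes; an optional namespace partitions the key space.
internal_kv._internal_kv_put(b"my_key", b"my_value")
assert internal_kv._internal_kv_get(b"my_key") == b"my_value"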
Example #2
    def __init__(self, dashboard_agent):
        super().__init__(dashboard_agent)
        self._runtime_env_dir = dashboard_agent.runtime_env_dir
        self._logging_params = dashboard_agent.logging_params
        self._per_job_logger_cache = dict()
        # Cache the results of creating envs to avoid repeated calls into
        # conda and other slow operations.
        self._env_cache: Dict[str, CreatedEnvResult] = dict()
        # Maps a serialized runtime env to a lock that is used
        # to prevent multiple concurrent installs of the same env.
        self._env_locks: Dict[str, asyncio.Lock] = dict()
        # Keeps track of the URIs contained within each env so we can
        # invalidate the env cache when a URI is deleted.
        # This is a temporary mechanism until we have per-URI caching.
        self._uris_to_envs: Dict[str, Set[str]] = defaultdict(set)
        # Initialize internal KV to be used by the working_dir setup code.
        _initialize_internal_kv(self._dashboard_agent.gcs_client)
        assert _internal_kv_initialized()

        self._pip_manager = PipManager(self._runtime_env_dir)
        self._conda_manager = CondaManager(self._runtime_env_dir)
        self._py_modules_manager = PyModulesManager(self._runtime_env_dir)
        self._working_dir_manager = WorkingDirManager(self._runtime_env_dir)
        self._container_manager = ContainerManager(dashboard_agent.temp_dir)

        self._working_dir_uri_cache = URICache(
            self._working_dir_manager.delete_uri, WORKING_DIR_CACHE_SIZE_BYTES)
        self._py_modules_uri_cache = URICache(
            self._py_modules_manager.delete_uri, PY_MODULES_CACHE_SIZE_BYTES)
        self._conda_uri_cache = URICache(self._conda_manager.delete_uri,
                                         CONDA_CACHE_SIZE_BYTES)
        self._pip_uri_cache = URICache(self._pip_manager.delete_uri,
                                       PIP_CACHE_SIZE_BYTES)
        self._logger = default_logger
Example #3
    def __init__(self, dashboard_head):
        super().__init__(dashboard_head)
        self._gcs_job_info_stub = None
        self._gcs_actor_info_stub = None
        self._dashboard_head = dashboard_head

        _initialize_internal_kv(dashboard_head.gcs_client)
        assert _internal_kv_initialized()
        self._job_status_client = JobStatusStorageClient()
Example #4
    def __init__(self, dashboard_head):
        super().__init__(dashboard_head)
        self._gcs_job_info_stub = None
        self._gcs_actor_info_stub = None
        self._dashboard_head = dashboard_head

        # Initialize internal KV to be used by the working_dir setup code.
        _initialize_internal_kv(dashboard_head.gcs_client)
        assert _internal_kv_initialized()
Example #5
    def test_upload_fails(self):
        """Check that function throws useful error when upload fails."""

        uri = "gcs://test.zip"
        data = b"test"

        assert len(data) < GCS_STORAGE_MAX_SIZE

        _internal_kv_reset()
        _initialize_internal_kv(self.DisconnectedClient())
        with pytest.raises(RuntimeError, match="Failed to store package in the GCS"):
            _store_package_in_gcs(uri, data)
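The DisconnectedClient used above is a test double that is not shown in this listing. A hypothetical sketch of such a stub, assuming the upload path ultimately calls internal_kv_put on the client (the same method the GcsClient examples below invoke):

class DisconnectedClient:
    """Hypothetical stand-in for a GCS client with no live connection."""

    def internal_kv_put(self, key, value, overwrite, namespace):
        # Simulate a lost GCS connection so _store_package_in_gcs
        # surfaces its "Failed to store package in the GCS" error.
        raise RuntimeError("GCS connection lost")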
Example #6
def get_ray_status_output(address):
    redis_client = ray._private.services.create_redis_client(address, "")
    gcs_client = gcs_utils.GcsClient.create_from_redis(redis_client)
    internal_kv._initialize_internal_kv(gcs_client)
    status = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_STATUS)
    error = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_ERROR)
    return {
        "demand": debug_status(
            status, error).split("Demands:")[1].strip("\n").strip(" "),
        "usage": debug_status(status, error).split("Demands:")[0].split(
            "Usage:")[1].strip("\n").strip(" ")
    }
Example #7
def get_ray_status_output(address):
    gcs_client = gcs_utils.GcsClient(address=address)
    internal_kv._initialize_internal_kv(gcs_client)
    status = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_STATUS)
    error = internal_kv._internal_kv_get(DEBUG_AUTOSCALING_ERROR)
    output = debug_status(status, error)
    return {
        "demand": output.split("Demands:")[1].strip("\n").strip(" "),
        "usage": output.split("Demands:")[0].split("Usage:")[1].strip("\n").strip(" "),
    }
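A brief usage sketch for either version of get_ray_status_output (the address here is hypothetical and must point at a running cluster):

result = get_ray_status_output("127.0.0.1:6379")
print(result["usage"])   # the "Usage:" section of the autoscaler status
print(result["demand"])  # the "Demands:" section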
Example #8
    def __init__(self, dashboard_agent):
        super().__init__(dashboard_agent)
        self._runtime_env_dir = dashboard_agent.runtime_env_dir
        self._logging_params = dashboard_agent.logging_params
        self._per_job_logger_cache = dict()
        # Cache the results of creating envs to avoid repeated calls into
        # conda and other slow operations.
        self._env_cache: Dict[str, CreatedEnvResult] = dict()
        # Maps a serialized runtime env to a lock that is used
        # to prevent multiple concurrent installs of the same env.
        self._env_locks: Dict[str, asyncio.Lock] = dict()

        # Initialize internal KV to be used by the working_dir setup code.
        _initialize_internal_kv(self._dashboard_agent.gcs_client)
        assert _internal_kv_initialized()
Example #9
    def __init__(self, dashboard_agent):
        super().__init__(dashboard_agent)
        self._runtime_env_dir = dashboard_agent.runtime_env_dir
        self._logging_params = dashboard_agent.logging_params
        self._per_job_logger_cache = dict()
        # Cache the results of creating envs to avoid repeated calls into
        # conda and other slow operations.
        self._env_cache: Dict[str, CreatedEnvResult] = dict()
        # Maps a serialized runtime env to a lock that is used
        # to prevent multiple concurrent installs of the same env.
        self._env_locks: Dict[str, asyncio.Lock] = dict()
        _initialize_internal_kv(self._dashboard_agent.gcs_client)
        assert _internal_kv_initialized()

        self._pip_manager = PipManager(self._runtime_env_dir)
        self._conda_manager = CondaManager(self._runtime_env_dir)
        self._py_modules_manager = PyModulesManager(self._runtime_env_dir)
        self._java_jars_manager = JavaJarsManager(self._runtime_env_dir)
        self._working_dir_manager = WorkingDirManager(self._runtime_env_dir)
        self._container_manager = ContainerManager(dashboard_agent.temp_dir)

        self._reference_table = ReferenceTable(
            self.uris_parser,
            self.unused_uris_processor,
            self.unused_runtime_env_processor,
        )

        self._working_dir_uri_cache = URICache(
            self._working_dir_manager.delete_uri, WORKING_DIR_CACHE_SIZE_BYTES)
        self._py_modules_uri_cache = URICache(
            self._py_modules_manager.delete_uri, PY_MODULES_CACHE_SIZE_BYTES)
        self._java_jars_uri_cache = URICache(
            self._java_jars_manager.delete_uri, JAVA_JARS_CACHE_SIZE_BYTES)
        self._conda_uri_cache = URICache(self._conda_manager.delete_uri,
                                         CONDA_CACHE_SIZE_BYTES)
        self._pip_uri_cache = URICache(self._pip_manager.delete_uri,
                                       PIP_CACHE_SIZE_BYTES)
        self._logger = default_logger
Example #10
    def __init__(self,
                 redis_address,
                 autoscaling_config,
                 redis_password=None,
                 prefix_cluster_info=False,
                 monitor_ip=None,
                 stop_event: Optional[Event] = None):
        # Initialize the Redis clients.
        ray.state.state._initialize_global_state(
            redis_address, redis_password=redis_password)
        self.redis = ray._private.services.create_redis_client(
            redis_address, password=redis_password)
        if monitor_ip:
            self.redis.set("AutoscalerMetricsAddress",
                           f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}")
        (ip, port) = redis_address.split(":")
        # Initialize the gcs stub for getting all node resource usage.
        gcs_address = self.redis.get("GcsServerAddress").decode("utf-8")
        options = (("grpc.enable_http_proxy", 0), )
        gcs_channel = ray._private.utils.init_grpc_channel(
            gcs_address, options)
        # TODO: Use gcs client for this
        self.gcs_node_resources_stub = (
            gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel))
        self.gcs_node_info_stub = (
            gcs_service_pb2_grpc.NodeInfoGcsServiceStub(gcs_channel))

        # Set the redis client and mode so _internal_kv works for autoscaler.
        worker = ray.worker.global_worker
        worker.redis_client = self.redis
        gcs_client = GcsClient.create_from_redis(self.redis)
        _initialize_internal_kv(gcs_client)
        worker.mode = 0
        head_node_ip = redis_address.split(":")[0]
        self.redis_address = redis_address
        self.redis_password = redis_password
        if os.environ.get("RAY_FAKE_CLUSTER"):
            self.load_metrics = LoadMetrics(local_ip=FAKE_HEAD_NODE_ID)
        else:
            self.load_metrics = LoadMetrics(local_ip=head_node_ip)
        self.last_avail_resources = None
        self.event_summarizer = EventSummarizer()
        self.prefix_cluster_info = prefix_cluster_info
        # Can be used to signal graceful exit from monitor loop.
        self.stop_event = stop_event  # type: Optional[Event]
        self.autoscaling_config = autoscaling_config
        self.autoscaler = None
        # If set, we are in a manually created cluster (non-autoscaling) and
        # simply mirroring what the GCS tells us the cluster node types are.
        self.readonly_config = None

        self.prom_metrics = AutoscalerPrometheusMetrics()
        if monitor_ip and prometheus_client:
            # If monitor_ip wasn't passed in, don't attempt to start the
            # metrics server, to keep behavior identical to before metrics
            # were introduced.
            try:
                logger.info(
                    "Starting autoscaler metrics server on port {}".format(
                        AUTOSCALER_METRIC_PORT))
                prometheus_client.start_http_server(
                    port=AUTOSCALER_METRIC_PORT,
                    addr="127.0.0.1" if head_node_ip == "127.0.0.1" else "",
                    registry=self.prom_metrics.registry)
            except Exception:
                logger.exception(
                    "An exception occurred while starting the metrics server.")
        elif not prometheus_client:
            logger.warning("`prometheus_client` not found, so metrics will "
                           "not be exported.")

        logger.info("Monitor: Started")
Example #11
    async def run(self):
        gcs_address = self.gcs_address

        # The dashboard handles connection failures automatically.
        self.gcs_client = GcsClient(address=gcs_address,
                                    nums_reconnect_retry=0)
        internal_kv._initialize_internal_kv(self.gcs_client)
        self.aiogrpc_gcs_channel = ray._private.utils.init_grpc_channel(
            gcs_address, GRPC_CHANNEL_OPTIONS, asynchronous=True)

        self.gcs_error_subscriber = GcsAioErrorSubscriber(address=gcs_address)
        self.gcs_log_subscriber = GcsAioLogSubscriber(address=gcs_address)
        await self.gcs_error_subscriber.subscribe()
        await self.gcs_log_subscriber.subscribe()

        self.health_check_thread = GCSHealthCheckThread(gcs_address)
        self.health_check_thread.start()

        # Start a grpc asyncio server.
        await self.server.start()

        async def _async_notify():
            """Notify signals from queue."""
            while True:
                co = await dashboard_utils.NotifyQueue.get()
                try:
                    await co
                except Exception:
                    logger.exception(f"Error notifying coroutine {co}")

        modules = self._load_modules()

        http_host, http_port = self.http_host, self.http_port
        if not self.minimal:
            self.http_server = await self._configure_http_server(modules)
            http_host, http_port = self.http_server.get_address()
        internal_kv._internal_kv_put(
            ray_constants.DASHBOARD_ADDRESS,
            f"{http_host}:{http_port}",
            namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
        )

        # TODO: Use async version if performance is an issue
        # Write the dashboard head port to gcs kv.
        internal_kv._internal_kv_put(
            dashboard_consts.DASHBOARD_RPC_ADDRESS,
            f"{self.ip}:{self.grpc_port}",
            namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
        )

        # Freeze signal after all modules loaded.
        dashboard_utils.SignalManager.freeze()
        concurrent_tasks = [
            self._gcs_check_alive(),
            _async_notify(),
            DataOrganizer.purge(),
            DataOrganizer.organize(),
        ]
        await asyncio.gather(*concurrent_tasks,
                             *(m.run(self.server) for m in modules))
        await self.server.wait_for_termination()

        if self.http_server:
            await self.http_server.cleanup()
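Once run() has stored these addresses, any process whose internal KV has been initialized can read them back from the same namespace. A short sketch, assuming the same constants as above:

# Returns bytes such as b"127.0.0.1:8265", or None if the dashboard
# head has not yet written its address.
dashboard_address = internal_kv._internal_kv_get(
    ray_constants.DASHBOARD_ADDRESS,
    namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
)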
Example #12
    def __init__(self):
        gcs_address = ray.worker._global_node.gcs_address
        self._gcs_client = GcsClient(address=gcs_address,
                                     nums_reconnect_retry=0)
        internal_kv._initialize_internal_kv(self._gcs_client)
Example #13
    def __init__(
        self,
        address: str,
        autoscaling_config: Union[str, Callable[[], Dict[str, Any]]],
        redis_password: Optional[str] = None,
        prefix_cluster_info: bool = False,
        monitor_ip: Optional[str] = None,
        stop_event: Optional[Event] = None,
        retry_on_failure: bool = True,
    ):
        gcs_address = address
        options = (("grpc.enable_http_proxy", 0), )
        gcs_channel = ray._private.utils.init_grpc_channel(
            gcs_address, options)
        # TODO: Use gcs client for this
        self.gcs_node_resources_stub = (
            gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel))
        self.gcs_node_info_stub = gcs_service_pb2_grpc.NodeInfoGcsServiceStub(
            gcs_channel)
        if redis_password is not None:
            logger.warning("redis_password has been deprecated.")
        # Set the redis client and mode so _internal_kv works for autoscaler.
        worker = ray.worker.global_worker
        gcs_client = GcsClient(address=gcs_address)

        if monitor_ip:
            monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
            gcs_client.internal_kv_put(b"AutoscalerMetricsAddress",
                                       monitor_addr.encode(), True, None)
        _initialize_internal_kv(gcs_client)
        worker.mode = 0
        head_node_ip = gcs_address.split(":")[0]

        self.load_metrics = LoadMetrics()
        self.last_avail_resources = None
        self.event_summarizer = EventSummarizer()
        self.prefix_cluster_info = prefix_cluster_info
        # Can be used to signal graceful exit from monitor loop.
        self.stop_event = stop_event  # type: Optional[Event]
        self.retry_on_failure = retry_on_failure
        self.autoscaling_config = autoscaling_config
        self.autoscaler = None
        # If set, we are in a manually created cluster (non-autoscaling) and
        # simply mirroring what the GCS tells us the cluster node types are.
        self.readonly_config = None

        self.prom_metrics = AutoscalerPrometheusMetrics()
        if monitor_ip and prometheus_client:
            # If monitor_ip wasn't passed in, don't attempt to start the
            # metrics server, to keep behavior identical to before metrics
            # were introduced.
            try:
                logger.info(
                    "Starting autoscaler metrics server on port {}".format(
                        AUTOSCALER_METRIC_PORT))
                prometheus_client.start_http_server(
                    port=AUTOSCALER_METRIC_PORT,
                    addr="127.0.0.1" if head_node_ip == "127.0.0.1" else "",
                    registry=self.prom_metrics.registry,
                )
            except Exception:
                logger.exception(
                    "An exception occurred while starting the metrics server.")
        elif not prometheus_client:
            logger.warning(
                "`prometheus_client` not found, so metrics will not be exported."
            )

        logger.info("Monitor: Started")
Example #14
    def __init__(
        self,
        node_ip_address,
        dashboard_agent_port,
        gcs_address,
        minimal,
        temp_dir=None,
        session_dir=None,
        runtime_env_dir=None,
        log_dir=None,
        metrics_export_port=None,
        node_manager_port=None,
        listen_port=0,
        object_store_name=None,
        raylet_name=None,
        logging_params=None,
        disable_metrics_collection: bool = False,
    ):
        """Initialize the DashboardAgent object."""
        # Public attributes are accessible for all agent modules.
        self.ip = node_ip_address
        self.minimal = minimal

        assert gcs_address is not None
        self.gcs_address = gcs_address

        self.temp_dir = temp_dir
        self.session_dir = session_dir
        self.runtime_env_dir = runtime_env_dir
        self.log_dir = log_dir
        self.dashboard_agent_port = dashboard_agent_port
        self.metrics_export_port = metrics_export_port
        self.node_manager_port = node_manager_port
        self.listen_port = listen_port
        self.object_store_name = object_store_name
        self.raylet_name = raylet_name
        self.logging_params = logging_params
        self.node_id = os.environ["RAY_NODE_ID"]
        self.metrics_collection_disabled = disable_metrics_collection
        # TODO(edoakes): RAY_RAYLET_PID isn't properly set on Windows. This is
        # only used for fate-sharing with the raylet, and we need a different
        # fate-sharing mechanism for Windows anyway.
        if sys.platform not in ["win32", "cygwin"]:
            self.ppid = int(os.environ["RAY_RAYLET_PID"])
            assert self.ppid > 0
            logger.info("Parent pid is %s", self.ppid)

        # Setup raylet channel
        options = ray_constants.GLOBAL_GRPC_OPTIONS
        self.aiogrpc_raylet_channel = ray._private.utils.init_grpc_channel(
            f"{self.ip}:{self.node_manager_port}", options, asynchronous=True)

        # Setup grpc server
        self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0), ))
        grpc_ip = "127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0"
        try:
            self.grpc_port = ray._private.tls_utils.add_port_to_grpc_server(
                self.server, f"{grpc_ip}:{self.dashboard_agent_port}")
        except Exception:
            # TODO(SongGuyang): Catch the exception here because there is a
            # port conflict issue caused by the static port. We should
            # remove this once we find a better port resolution.
            logger.exception(
                "Failed to add port to grpc server. Agent will stay alive but "
                "disable the grpc service.")
            self.server = None
            self.grpc_port = None
        else:
            logger.info("Dashboard agent grpc address: %s:%s", grpc_ip,
                        self.grpc_port)

        # If the agent is started in non-minimal mode, the HTTP server should
        # be configured to communicate with the dashboard on the head node.
        self.http_server = None

        # Used by the agent and sub-modules.
        # TODO(architkulkarni): Remove gcs_client once the agent exclusively uses
        # gcs_aio_client and not gcs_client.
        self.gcs_client = GcsClient(address=self.gcs_address)
        _initialize_internal_kv(self.gcs_client)
        assert _internal_kv_initialized()
        self.gcs_aio_client = GcsAioClient(address=self.gcs_address)
        self.publisher = GcsAioPublisher(address=self.gcs_address)
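As the TODO notes, the agent is migrating from the synchronous gcs_client to gcs_aio_client. A hedged sketch of the async counterpart, assuming GcsAioClient exposes awaitable internal KV methods (exact signatures vary across Ray versions):

async def read_key(agent):
    # Same bytes-in/bytes-out semantics as the synchronous helpers,
    # but awaitable, so it does not block the agent's event loop.
    return await agent.gcs_aio_client.internal_kv_get(b"my_key", namespace=None)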
Example #15
    async def run(self):

        # Create an HTTP session for all modules.
        # aiohttp < 4.0.0 takes a `loop` argument; aiohttp >= 4.0.0 no longer does.
        if LooseVersion(aiohttp.__version__) < LooseVersion("4.0.0"):
            self.http_session = aiohttp.ClientSession(
                loop=asyncio.get_event_loop())
        else:
            self.http_session = aiohttp.ClientSession()

        gcs_address = await self.get_gcs_address()

        # The dashboard handles connection failures automatically.
        self.gcs_client = GcsClient(address=gcs_address,
                                    nums_reconnect_retry=0)
        internal_kv._initialize_internal_kv(self.gcs_client)
        self.aiogrpc_gcs_channel = ray._private.utils.init_grpc_channel(
            gcs_address, GRPC_CHANNEL_OPTIONS, asynchronous=True)
        if gcs_pubsub_enabled():
            self.gcs_error_subscriber = GcsAioErrorSubscriber(
                address=gcs_address)
            self.gcs_log_subscriber = GcsAioLogSubscriber(address=gcs_address)
            await self.gcs_error_subscriber.subscribe()
            await self.gcs_log_subscriber.subscribe()

        self.health_check_thread = GCSHealthCheckThread(gcs_address)
        self.health_check_thread.start()

        # Start a grpc asyncio server.
        await self.server.start()

        async def _async_notify():
            """Notify signals from queue."""
            while True:
                co = await dashboard_utils.NotifyQueue.get()
                try:
                    await co
                except Exception:
                    logger.exception(f"Error notifying coroutine {co}")

        modules = self._load_modules()

        # The HTTP server should be initialized after all modules are loaded.
        # working_dir uploads for job submission can be up to 100MiB.
        app = aiohttp.web.Application(client_max_size=100 * 1024**2)
        app.add_routes(routes=routes.bound_routes())

        runner = aiohttp.web.AppRunner(app)
        await runner.setup()
        last_ex = None
        for i in range(1 + self.http_port_retries):
            try:
                site = aiohttp.web.TCPSite(runner, self.http_host,
                                           self.http_port)
                await site.start()
                break
            except OSError as e:
                last_ex = e
                self.http_port += 1
                logger.warning("Try to use port %s: %s", self.http_port, e)
        else:
            raise Exception(f"Failed to find a valid port for dashboard after "
                            f"{self.http_port_retries} retries: {last_ex}")
        http_host, http_port, *_ = site._server.sockets[0].getsockname()
        http_host = self.ip if ipaddress.ip_address(
            http_host).is_unspecified else http_host
        logger.info("Dashboard head http address: %s:%s", http_host, http_port)

        # TODO: Use async version if performance is an issue
        # Write the dashboard head port to gcs kv.
        internal_kv._internal_kv_put(
            ray_constants.DASHBOARD_ADDRESS,
            f"{http_host}:{http_port}",
            namespace=ray_constants.KV_NAMESPACE_DASHBOARD)
        internal_kv._internal_kv_put(
            dashboard_consts.DASHBOARD_RPC_ADDRESS,
            f"{self.ip}:{self.grpc_port}",
            namespace=ray_constants.KV_NAMESPACE_DASHBOARD)

        # Dump registered http routes.
        dump_routes = [
            r for r in app.router.routes() if r.method != hdrs.METH_HEAD
        ]
        for r in dump_routes:
            logger.info(r)
        logger.info("Registered %s routes.", len(dump_routes))

        # Freeze signal after all modules loaded.
        dashboard_utils.SignalManager.freeze()
        concurrent_tasks = [
            self._gcs_check_alive(),
            _async_notify(),
            DataOrganizer.purge(),
            DataOrganizer.organize(),
        ]
        await asyncio.gather(*concurrent_tasks,
                             *(m.run(self.server) for m in modules))
        await self.server.wait_for_termination()
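The run() coroutine above is the dashboard head's main entry point. A sketch of how such a coroutine is typically driven (the real entry point also parses CLI arguments and configures logging first; dashboard_head here is a hypothetical instance of the class above):

loop = asyncio.get_event_loop()
loop.run_until_complete(dashboard_head.run())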