Exemple #1
0
    def __init__(self,
                 redis_address,
                 autoscaling_config,
                 redis_password=None,
                 prefix_cluster_info=False):
        # Initialize the Redis clients.
        ray.state.state._initialize_global_state(redis_address,
                                                 redis_password=redis_password)
        self.redis = ray._private.services.create_redis_client(
            redis_address, password=redis_password)

        (ip, port) = redis_address.split(":")
        self.gcs_client = connect_to_gcs(ip, int(port), redis_password)
        # Initialize the gcs stub for getting all node resource usage.
        gcs_address = self.redis.get("GcsServerAddress").decode("utf-8")

        options = (("grpc.enable_http_proxy", 0), )
        gcs_channel = grpc.insecure_channel(gcs_address, options=options)
        self.gcs_node_resources_stub = \
            gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel)

        # Set the redis client and mode so _internal_kv works for autoscaler.
        worker = ray.worker.global_worker
        worker.redis_client = self.redis
        worker.gcs_client = self.gcs_client
        worker.mode = 0
        head_node_ip = redis_address.split(":")[0]
        self.load_metrics = LoadMetrics(local_ip=head_node_ip)
        self.last_avail_resources = None
        self.event_summarizer = EventSummarizer()
        self.prefix_cluster_info = prefix_cluster_info
        self.autoscaling_config = autoscaling_config
        self.autoscaler = None

        logger.info("Monitor: Started")
Exemple #2
0
    def __init__(self,
                 redis_address,
                 autoscaling_config,
                 redis_password=None,
                 prefix_cluster_info=False,
                 monitor_ip=None,
                 stop_event: Optional[Event] = None):
        # Initialize the Redis clients.
        ray.state.state._initialize_global_state(redis_address,
                                                 redis_password=redis_password)
        self.redis = ray._private.services.create_redis_client(
            redis_address, password=redis_password)
        if monitor_ip:
            self.redis.set("AutoscalerMetricsAddress",
                           f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}")
        (ip, port) = redis_address.split(":")
        self.gcs_client = connect_to_gcs(ip, int(port), redis_password)
        # Initialize the gcs stub for getting all node resource usage.
        gcs_address = self.redis.get("GcsServerAddress").decode("utf-8")

        options = (("grpc.enable_http_proxy", 0), )
        gcs_channel = grpc.insecure_channel(gcs_address, options=options)
        self.gcs_node_resources_stub = \
            gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel)

        # Set the redis client and mode so _internal_kv works for autoscaler.
        worker = ray.worker.global_worker
        worker.redis_client = self.redis
        worker.gcs_client = self.gcs_client
        worker.mode = 0
        head_node_ip = redis_address.split(":")[0]
        self.redis_address = redis_address
        self.redis_password = redis_password
        self.load_metrics = LoadMetrics(local_ip=head_node_ip)
        self.last_avail_resources = None
        self.event_summarizer = EventSummarizer()
        self.prefix_cluster_info = prefix_cluster_info
        # Can be used to signal graceful exit from monitor loop.
        self.stop_event = stop_event  # type: Optional[Event]
        self.autoscaling_config = autoscaling_config
        self.autoscaler = None

        self.prom_metrics = AutoscalerPrometheusMetrics()
        if monitor_ip:
            # If monitor_ip wasn't passed in, then don't attempt to start the
            # metric server to keep behavior identical to before metrics were
            # introduced
            try:
                logger.info(
                    "Starting autoscaler metrics server on port {}".format(
                        AUTOSCALER_METRIC_PORT))
                prometheus_client.start_http_server(
                    AUTOSCALER_METRIC_PORT,
                    registry=self.prom_metrics.registry)
            except Exception:
                logger.exception(
                    "An exception occurred while starting the metrics server.")

        logger.info("Monitor: Started")
Exemple #3
0
    def __init__(self,
                 redis_address,
                 autoscaling_config,
                 redis_password=None,
                 prefix_cluster_info=False,
                 monitor_ip=None,
                 stop_event: Optional[Event] = None):
        # Initialize the Redis clients.
        ray.state.state._initialize_global_state(
            redis_address, redis_password=redis_password)
        self.redis = ray._private.services.create_redis_client(
            redis_address, password=redis_password)
        if monitor_ip:
            self.redis.set("AutoscalerMetricsAddress",
                           f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}")
        (ip, port) = redis_address.split(":")
        # Initialize the gcs stub for getting all node resource usage.
        gcs_address = self.redis.get("GcsServerAddress").decode("utf-8")
        options = (("grpc.enable_http_proxy", 0), )
        gcs_channel = ray._private.utils.init_grpc_channel(
            gcs_address, options)
        # TODO: Use gcs client for this
        self.gcs_node_resources_stub = \
            gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel)
        self.gcs_node_info_stub = \
            gcs_service_pb2_grpc.NodeInfoGcsServiceStub(gcs_channel)

        # Set the redis client and mode so _internal_kv works for autoscaler.
        worker = ray.worker.global_worker
        worker.redis_client = self.redis
        gcs_client = GcsClient.create_from_redis(self.redis)
        _initialize_internal_kv(gcs_client)
        worker.mode = 0
        head_node_ip = redis_address.split(":")[0]
        self.redis_address = redis_address
        self.redis_password = redis_password
        if os.environ.get("RAY_FAKE_CLUSTER"):
            self.load_metrics = LoadMetrics(local_ip=FAKE_HEAD_NODE_ID)
        else:
            self.load_metrics = LoadMetrics(local_ip=head_node_ip)
        self.last_avail_resources = None
        self.event_summarizer = EventSummarizer()
        self.prefix_cluster_info = prefix_cluster_info
        # Can be used to signal graceful exit from monitor loop.
        self.stop_event = stop_event  # type: Optional[Event]
        self.autoscaling_config = autoscaling_config
        self.autoscaler = None
        # If set, we are in a manually created cluster (non-autoscaling) and
        # simply mirroring what the GCS tells us the cluster node types are.
        self.readonly_config = None

        self.prom_metrics = AutoscalerPrometheusMetrics()
        if monitor_ip and prometheus_client:
            # If monitor_ip wasn't passed in, then don't attempt to start the
            # metric server to keep behavior identical to before metrics were
            # introduced
            try:
                logger.info(
                    "Starting autoscaler metrics server on port {}".format(
                        AUTOSCALER_METRIC_PORT))
                prometheus_client.start_http_server(
                    port=AUTOSCALER_METRIC_PORT,
                    addr="127.0.0.1" if head_node_ip == "127.0.0.1" else "",
                    registry=self.prom_metrics.registry)
            except Exception:
                logger.exception(
                    "An exception occurred while starting the metrics server.")
        elif not prometheus_client:
            logger.warning("`prometheus_client` not found, so metrics will "
                           "not be exported.")

        logger.info("Monitor: Started")
Exemple #4
0
    def __init__(
        self,
        address: str,
        autoscaling_config: Union[str, Callable[[], Dict[str, Any]]],
        redis_password: Optional[str] = None,
        prefix_cluster_info: bool = False,
        monitor_ip: Optional[str] = None,
        stop_event: Optional[Event] = None,
        retry_on_failure: bool = True,
    ):
        gcs_address = address
        options = (("grpc.enable_http_proxy", 0), )
        gcs_channel = ray._private.utils.init_grpc_channel(
            gcs_address, options)
        # TODO: Use gcs client for this
        self.gcs_node_resources_stub = (
            gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel))
        self.gcs_node_info_stub = gcs_service_pb2_grpc.NodeInfoGcsServiceStub(
            gcs_channel)
        if redis_password is not None:
            logger.warning("redis_password has been deprecated.")
        # Set the redis client and mode so _internal_kv works for autoscaler.
        worker = ray.worker.global_worker
        gcs_client = GcsClient(address=gcs_address)

        if monitor_ip:
            monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
            gcs_client.internal_kv_put(b"AutoscalerMetricsAddress",
                                       monitor_addr.encode(), True, None)
        _initialize_internal_kv(gcs_client)
        if monitor_ip:
            monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
            gcs_client.internal_kv_put(b"AutoscalerMetricsAddress",
                                       monitor_addr.encode(), True, None)
        worker.mode = 0
        head_node_ip = gcs_address.split(":")[0]

        self.load_metrics = LoadMetrics()
        self.last_avail_resources = None
        self.event_summarizer = EventSummarizer()
        self.prefix_cluster_info = prefix_cluster_info
        # Can be used to signal graceful exit from monitor loop.
        self.stop_event = stop_event  # type: Optional[Event]
        self.retry_on_failure = retry_on_failure
        self.autoscaling_config = autoscaling_config
        self.autoscaler = None
        # If set, we are in a manually created cluster (non-autoscaling) and
        # simply mirroring what the GCS tells us the cluster node types are.
        self.readonly_config = None

        self.prom_metrics = AutoscalerPrometheusMetrics()
        if monitor_ip and prometheus_client:
            # If monitor_ip wasn't passed in, then don't attempt to start the
            # metric server to keep behavior identical to before metrics were
            # introduced
            try:
                logger.info(
                    "Starting autoscaler metrics server on port {}".format(
                        AUTOSCALER_METRIC_PORT))
                prometheus_client.start_http_server(
                    port=AUTOSCALER_METRIC_PORT,
                    addr="127.0.0.1" if head_node_ip == "127.0.0.1" else "",
                    registry=self.prom_metrics.registry,
                )
            except Exception:
                logger.exception(
                    "An exception occurred while starting the metrics server.")
        elif not prometheus_client:
            logger.warning(
                "`prometheus_client` not found, so metrics will not be exported."
            )

        logger.info("Monitor: Started")