def testRequestBundlesAccountsForHeadNode(self):
    """Check that requested bundles are counted against the head node first.

    A "p2.8xlarge" head node is registered with the mock provider before the
    autoscaler runs. Requests for ``{"CPU": 1}`` and ``{"GPU": 8}`` must keep
    the cluster at one node; requesting two ``{"GPU": 8}`` bundles must add
    exactly one more node, of type "p2.8xlarge".
    """
    cluster_config = MULTI_WORKER_CLUSTER.copy()
    cluster_config["head_node_type"] = "p2.8xlarge"
    cluster_config["min_workers"] = 0
    cluster_config["max_workers"] = 50
    config_path = self.write_config(cluster_config)

    self.provider = MockProvider()
    head_tags = {
        TAG_RAY_USER_NODE_TYPE: "p2.8xlarge",
        TAG_RAY_NODE_KIND: "head",
    }
    self.provider.create_node({}, head_tags, 1)

    process_runner = MockProcessRunner()
    scaler = StandardAutoscaler(
        config_path,
        LoadMetrics(),
        max_failures=0,
        process_runner=process_runner,
        update_interval_s=0)
    assert len(self.provider.non_terminated_nodes({})) == 1

    # These requests fit on the head node.
    scaler.update()
    self.waitForNodes(1)

    scaler.request_resources([{"CPU": 1}])
    scaler.update()
    self.waitForNodes(1)
    assert len(self.provider.mock_nodes) == 1

    scaler.request_resources([{"GPU": 8}])
    scaler.update()
    self.waitForNodes(1)

    # This request requires an additional worker node.
    scaler.request_resources([{"GPU": 8}] * 2)
    scaler.update()
    self.waitForNodes(2)
    second_node = self.provider.mock_nodes[1]
    assert second_node.node_type == "p2.8xlarge"
def __init__(self, redis_address, autoscaling_config, redis_password=None):
    """Connect to Redis and global state, then optionally build an autoscaler.

    Args:
        redis_address: Address string; the text before the first ":" is
            handed to LoadMetrics as the local (head node) IP.
        autoscaling_config: Forwarded to StandardAutoscaler. When falsy, no
            autoscaler is created and both ``self.autoscaler`` and
            ``self.autoscaling_config`` are set to None.
        redis_password: Optional password forwarded to the Redis clients.
    """
    # Initialize the Redis clients.
    ray.state.state._initialize_global_state(
        redis_address, redis_password=redis_password)
    redis_client = ray._private.services.create_redis_client(
        redis_address, password=redis_password)
    self.redis = redis_client

    state_accessor = GlobalStateAccessor(redis_address, redis_password, False)
    self.global_state_accessor = state_accessor
    state_accessor.connect()

    # Set the redis client and mode so _internal_kv works for autoscaler.
    global_worker = ray.worker.global_worker
    global_worker.redis_client = redis_client
    global_worker.mode = 0

    # Setup subscriptions to the primary Redis server and the Redis shards.
    self.primary_subscribe_client = redis_client.pubsub(
        ignore_subscribe_messages=True)

    # Keep a mapping from raylet client ID to IP address to use
    # for updating the load metrics.
    self.raylet_id_to_ip_map = {}
    self.load_metrics = LoadMetrics(local_ip=redis_address.split(":")[0])

    # Without a config there is nothing to autoscale.
    if not autoscaling_config:
        self.autoscaler = None
        self.autoscaling_config = None
        return

    self.autoscaler = StandardAutoscaler(autoscaling_config,
                                         self.load_metrics)
    self.autoscaling_config = autoscaling_config
def testScaleUpLoadMetrics(self):
    """Check that reported waiting/infeasible bundles trigger a scale-up.

    Starting from zero nodes, a load report with one waiting ``{"GPU": 1}``
    bundle and one infeasible ``{"CPU": 16}`` bundle must yield two nodes
    whose types are "p2.xlarge" and "m4.4xlarge".
    """
    cluster_config = MULTI_WORKER_CLUSTER.copy()
    cluster_config["min_workers"] = 0
    cluster_config["max_workers"] = 50
    config_path = self.write_config(cluster_config)

    self.provider = MockProvider()
    process_runner = MockProcessRunner()
    load_metrics = LoadMetrics()
    scaler = StandardAutoscaler(
        config_path,
        load_metrics,
        max_failures=0,
        process_runner=process_runner,
        update_interval_s=0)
    assert len(self.provider.non_terminated_nodes({})) == 0

    scaler.update()
    self.waitForNodes(0)
    scaler.update()

    load_metrics.update(
        "1.2.3.4", {},
        True, {},
        True, {},
        waiting_bundles=[{"GPU": 1}],
        infeasible_bundles=[{"CPU": 16}])
    scaler.update()
    self.waitForNodes(2)

    launched_types = {
        self.provider.mock_nodes[index].node_type
        for index in (0, 1)
    }
    assert launched_types == {"p2.xlarge", "m4.4xlarge"}
def __init__(self,
             redis_address,
             autoscaling_config,
             redis_password=None,
             prefix_cluster_info=False):
    """Connect to Redis and global state, then optionally build an autoscaler.

    Args:
        redis_address: Address string; the text before the first ":" is
            handed to LoadMetrics as the local (head node) IP.
        autoscaling_config: Forwarded to StandardAutoscaler. When falsy, no
            autoscaler is created and both ``self.autoscaler`` and
            ``self.autoscaling_config`` are set to None.
        redis_password: Optional password forwarded to the Redis clients.
        prefix_cluster_info: Passed through to StandardAutoscaler.
    """
    # Initialize the Redis clients.
    ray.state.state._initialize_global_state(
        redis_address, redis_password=redis_password)
    redis_client = ray._private.services.create_redis_client(
        redis_address, password=redis_password)
    self.redis = redis_client

    state_accessor = GlobalStateAccessor(redis_address, redis_password, False)
    self.global_state_accessor = state_accessor
    state_accessor.connect()

    # Set the redis client and mode so _internal_kv works for autoscaler.
    global_worker = ray.worker.global_worker
    global_worker.redis_client = redis_client
    global_worker.mode = 0

    # Keep a mapping from raylet client ID to IP address to use
    # for updating the load metrics.
    self.raylet_id_to_ip_map = {}
    self.load_metrics = LoadMetrics(local_ip=redis_address.split(":")[0])

    # Without a config there is nothing to autoscale.
    if not autoscaling_config:
        self.autoscaler = None
        self.autoscaling_config = None
        return

    self.autoscaler = StandardAutoscaler(
        autoscaling_config,
        self.load_metrics,
        prefix_cluster_info=prefix_cluster_info)
    self.autoscaling_config = autoscaling_config
def __init__(self,
             redis_address,
             autoscaling_config,
             redis_password=None,
             prefix_cluster_info=False):
    """Connect to Redis and GCS and set up monitor state.

    The autoscaler itself is not created here; ``self.autoscaler`` starts
    out as None and the config is only stored.

    Args:
        redis_address: "ip:port" string; it must split on ":" into exactly
            two parts, which are used to connect to GCS.
        autoscaling_config: Stored as ``self.autoscaling_config``.
        redis_password: Optional password forwarded to Redis and GCS.
        prefix_cluster_info: Stored as ``self.prefix_cluster_info``.
    """
    # Initialize the Redis clients.
    ray.state.state._initialize_global_state(
        redis_address, redis_password=redis_password)
    redis_client = ray._private.services.create_redis_client(
        redis_address, password=redis_password)
    self.redis = redis_client

    host, port_text = redis_address.split(":")
    self.gcs_client = connect_to_gcs(host, int(port_text), redis_password)

    # Initialize the gcs stub for getting all node resource usage.
    gcs_address = redis_client.get("GcsServerAddress").decode("utf-8")
    channel_options = (("grpc.enable_http_proxy", 0), )
    gcs_channel = grpc.insecure_channel(gcs_address, options=channel_options)
    self.gcs_node_resources_stub = (
        gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel))

    # Set the redis client and mode so _internal_kv works for autoscaler.
    global_worker = ray.worker.global_worker
    global_worker.redis_client = redis_client
    global_worker.gcs_client = self.gcs_client
    global_worker.mode = 0

    # The host part of the address doubles as the head node IP.
    self.load_metrics = LoadMetrics(local_ip=host)
    self.last_avail_resources = None
    self.event_summarizer = EventSummarizer()
    self.prefix_cluster_info = prefix_cluster_info
    self.autoscaling_config = autoscaling_config
    self.autoscaler = None

    logger.info("Monitor: Started")
def testScaleUpIgnoreUsed(self):
    """Check that a fully used head node's GPU is not counted as available.

    The head node ("p2.xlarge") reports ``{"GPU": 0}`` available alongside
    one waiting ``{"GPU": 1}`` bundle; the autoscaler must then launch a
    second node of type "p2.xlarge".
    """
    cluster_config = MULTI_WORKER_CLUSTER.copy()
    # Commenting out this line causes the test case to fail?!?!
    cluster_config["min_workers"] = 0
    cluster_config["target_utilization_fraction"] = 1.0
    config_path = self.write_config(cluster_config)

    self.provider = MockProvider()
    head_tags = {
        TAG_RAY_NODE_KIND: "head",
        TAG_RAY_USER_NODE_TYPE: "p2.xlarge",
    }
    self.provider.create_node({}, head_tags, 1)
    head_ip = self.provider.non_terminated_node_ips({})[0]
    self.provider.finish_starting_nodes()

    process_runner = MockProcessRunner()
    load_metrics = LoadMetrics(local_ip=head_ip)
    scaler = StandardAutoscaler(
        config_path,
        load_metrics,
        max_failures=0,
        process_runner=process_runner,
        update_interval_s=0)
    scaler.update()
    self.waitForNodes(1)

    load_metrics.update(head_ip, {"CPU": 4, "GPU": 1}, {}, {})
    self.waitForNodes(1)

    load_metrics.update(
        head_ip,
        {"CPU": 4, "GPU": 1},
        {"GPU": 0},
        {},
        waiting_bundles=[{"GPU": 1}])
    scaler.update()
    self.waitForNodes(2)
    new_node = self.provider.mock_nodes[1]
    assert new_node.node_type == "p2.xlarge"
def _setup_autoscaler(self):
    """Create the head node, load metrics, autoscaler and a manual launcher.

    Reads the YAML config at ``self.config_path`` into ``self.config``,
    registers a head node of the configured ``head_node_type`` with
    ``self.provider``, and builds a StandardAutoscaler that is not allowed to
    start its own node launchers. A single NodeLauncher sharing the
    autoscaler's launch queue is stored on ``self.node_launcher``.
    """
    self.runner = MockProcessRunner()

    # Use a context manager so the config file handle is closed promptly
    # instead of being left to the garbage collector.
    with open(self.config_path) as config_file:
        self.config = yaml.safe_load(config_file.read())

    self.provider.create_node(
        {},
        {
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
            TAG_RAY_USER_NODE_TYPE: self.config["head_node_type"],
        },
        1,
    )
    self.head_ip = self.provider.non_terminated_node_ips({})[0]
    self.load_metrics = LoadMetrics(local_ip=self.head_ip)
    self.autoscaler = StandardAutoscaler(
        self.config_path,
        self.load_metrics,
        # Don't let the autoscaler start any node launchers. Instead, we
        # will launch nodes ourself after every update call.
        max_concurrent_launches=0,
        max_failures=0,
        process_runner=self.runner,
        update_interval_s=0,
    )

    # Manually create a node launcher. Note that we won't start it as a
    # separate thread.
    self.node_launcher = NodeLauncher(
        provider=self.autoscaler.provider,
        queue=self.autoscaler.launch_queue,
        index=0,
        pending=self.autoscaler.pending_launches,
        node_types=self.autoscaler.available_node_types,
    )
def testRequestBundles(self):
    """Check which node types are launched for successive resource requests.

    Starting from an empty cluster: ``{"CPU": 1}`` must produce an
    "m4.large", ``{"GPU": 8}`` a "p2.8xlarge", and four ``{"CPU": 32}``
    bundles must bring the cluster to four nodes with two "m4.16xlarge".
    """
    cluster_config = MULTI_WORKER_CLUSTER.copy()
    cluster_config["min_workers"] = 0
    cluster_config["max_workers"] = 50
    config_path = self.write_config(cluster_config)

    self.provider = MockProvider()
    process_runner = MockProcessRunner()
    scaler = StandardAutoscaler(
        config_path,
        LoadMetrics(),
        max_failures=0,
        process_runner=process_runner,
        update_interval_s=0)
    assert len(self.provider.non_terminated_nodes({})) == 0

    scaler.update()
    self.waitForNodes(0)

    scaler.request_resources([{"CPU": 1}])
    scaler.update()
    self.waitForNodes(1)
    assert self.provider.mock_nodes[0].node_type == "m4.large"

    scaler.request_resources([{"GPU": 8}])
    scaler.update()
    self.waitForNodes(2)
    assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"

    scaler.request_resources([{"CPU": 32}] * 4)
    scaler.update()
    self.waitForNodes(4)
    for index in (2, 3):
        assert self.provider.mock_nodes[index].node_type == "m4.16xlarge"
def __init__(self,
             redis_address,
             autoscaling_config,
             redis_password=None,
             prefix_cluster_info=False,
             monitor_ip=None,
             stop_event: Optional[Event] = None):
    """Connect to Redis and GCS, set up monitor state and the metrics server.

    The autoscaler itself is not created here; ``self.autoscaler`` starts
    out as None and the config is only stored.

    Args:
        redis_address: "ip:port" string; it must split on ":" into exactly
            two parts, which are used to connect to GCS.
        autoscaling_config: Stored as ``self.autoscaling_config``.
        redis_password: Optional password forwarded to Redis and GCS.
        prefix_cluster_info: Stored as ``self.prefix_cluster_info``.
        monitor_ip: When truthy, the metrics address is published to Redis
            under "AutoscalerMetricsAddress" and the Prometheus HTTP server
            is started.
        stop_event: Can be used to signal graceful exit from monitor loop.
    """
    # Initialize the Redis clients.
    ray.state.state._initialize_global_state(
        redis_address, redis_password=redis_password)
    redis_client = ray._private.services.create_redis_client(
        redis_address, password=redis_password)
    self.redis = redis_client
    if monitor_ip:
        metrics_address = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
        redis_client.set("AutoscalerMetricsAddress", metrics_address)

    host, port_text = redis_address.split(":")
    self.gcs_client = connect_to_gcs(host, int(port_text), redis_password)

    # Initialize the gcs stub for getting all node resource usage.
    gcs_address = redis_client.get("GcsServerAddress").decode("utf-8")
    channel_options = (("grpc.enable_http_proxy", 0), )
    gcs_channel = grpc.insecure_channel(gcs_address, options=channel_options)
    self.gcs_node_resources_stub = (
        gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel))

    # Set the redis client and mode so _internal_kv works for autoscaler.
    global_worker = ray.worker.global_worker
    global_worker.redis_client = redis_client
    global_worker.gcs_client = self.gcs_client
    global_worker.mode = 0

    self.redis_address = redis_address
    self.redis_password = redis_password
    # The host part of the address doubles as the head node IP.
    self.load_metrics = LoadMetrics(local_ip=host)
    self.last_avail_resources = None
    self.event_summarizer = EventSummarizer()
    self.prefix_cluster_info = prefix_cluster_info
    # Can be used to signal graceful exit from monitor loop.
    self.stop_event = stop_event  # type: Optional[Event]
    self.autoscaling_config = autoscaling_config
    self.autoscaler = None

    self.prom_metrics = AutoscalerPrometheusMetrics()
    if monitor_ip:
        # If monitor_ip wasn't passed in, then don't attempt to start the
        # metric server to keep behavior identical to before metrics were
        # introduced
        try:
            start_message = (
                "Starting autoscaler metrics server on port {}".format(
                    AUTOSCALER_METRIC_PORT))
            logger.info(start_message)
            prometheus_client.start_http_server(
                AUTOSCALER_METRIC_PORT, registry=self.prom_metrics.registry)
        except Exception:
            logger.exception(
                "An exception occurred while starting the metrics server.")

    logger.info("Monitor: Started")
def testScaleUpMinWorkers(self):
    """Check that min_workers are launched and survive the idle timeout.

    With a global ``min_workers`` of 2 and a per-type ``min_workers`` of 1
    for "p2.8xlarge", the first update must launch one "p2.8xlarge" and one
    "m4.large". Four extra workers are then added by hand, every node is
    marked as long idle, and after the next update only two nodes may remain
    running or pending, both of the min-worker types.
    """
    config = copy.deepcopy(MULTI_WORKER_CLUSTER)
    config["min_workers"] = 2
    config["max_workers"] = 50
    config["idle_timeout_minutes"] = 1
    # Since config["min_workers"] > 1, the remaining worker is started
    # with the default worker node type.
    config["available_node_types"]["p2.8xlarge"]["min_workers"] = 1
    config_path = self.write_config(config)
    self.provider = MockProvider()
    runner = MockProcessRunner()
    lm = LoadMetrics()
    autoscaler = StandardAutoscaler(
        config_path,
        lm,
        max_failures=0,
        process_runner=runner,
        update_interval_s=0)
    assert len(self.provider.non_terminated_nodes({})) == 0
    autoscaler.update()
    self.waitForNodes(2)
    assert len(self.provider.mock_nodes) == 2
    assert {
        self.provider.mock_nodes[0].node_type,
        self.provider.mock_nodes[1].node_type
    } == {"p2.8xlarge", "m4.large"}

    # Add two extra workers of each of two types, beyond the minimum.
    self.provider.create_node({}, {
        TAG_RAY_USER_NODE_TYPE: "p2.8xlarge",
        TAG_RAY_NODE_KIND: NODE_KIND_WORKER
    }, 2)
    self.provider.create_node({}, {
        TAG_RAY_USER_NODE_TYPE: "m4.16xlarge",
        TAG_RAY_NODE_KIND: NODE_KIND_WORKER
    }, 2)
    assert len(self.provider.non_terminated_nodes({})) == 6

    # Make sure that after idle_timeout_minutes we don't kill idle
    # min workers.
    for node_id in self.provider.non_terminated_nodes({}):
        lm.last_used_time_by_ip[self.provider.internal_ip(node_id)] = -60
    autoscaler.update()
    self.waitForNodes(2)

    # Count the surviving nodes; the loop variable deliberately avoids the
    # name `id`, which would shadow the builtin.
    live_count = 0
    for node_key in self.provider.mock_nodes:
        node = self.provider.mock_nodes[node_key]
        if node.state in ("running", "pending"):
            assert node.node_type in {"p2.8xlarge", "m4.large"}
            live_count += 1
    assert live_count == 2
def testScaleUpMinSanity(self):
    """Check that the unmodified cluster config settles at two nodes.

    The first update must bring the cluster from zero to two nodes, and a
    second update must leave it at two.
    """
    config_path = self.write_config(MULTI_WORKER_CLUSTER)
    self.provider = MockProvider()
    process_runner = MockProcessRunner()
    scaler = StandardAutoscaler(
        config_path,
        LoadMetrics(),
        max_failures=0,
        process_runner=process_runner,
        update_interval_s=0)
    assert len(self.provider.non_terminated_nodes({})) == 0

    # Two consecutive updates must both end with exactly two nodes.
    for _ in range(2):
        scaler.update()
        self.waitForNodes(2)
def testResourceDemandVector(self):
    """Check that the demand vector combines waiting and infeasible bundles."""
    load_metrics = LoadMetrics()
    waiting = [{"GPU": 1}]
    infeasible = [{"CPU": 16}]
    load_metrics.update(
        "1.1.1.1", {"CPU": 2}, {"CPU": 1}, {},
        waiting_bundles=waiting,
        infeasible_bundles=infeasible)

    # Order is not checked; only the elements must match.
    expected_demand = [{"CPU": 16}, {"GPU": 1}]
    demand_vector = load_metrics.get_resource_demand_vector()
    assert same_elements(demand_vector, expected_demand)
def testPlacementGroupLoad(self):
    """Check that pending placement groups passed to update() are returned."""
    load_metrics = LoadMetrics()
    # One RESCHEDULING group per strategy, each with two {"GPU": 2} bundles.
    pending_placement_groups = [
        PlacementGroupTableData(
            state=PlacementGroupTableData.RESCHEDULING,
            strategy=strategy,
            bundles=([Bundle(unit_resources={"GPU": 2})] * 2))
        for strategy in (PlacementStrategy.PACK, PlacementStrategy.SPREAD)
    ]
    load_metrics.update(
        "1.1.1.1", {}, {}, {},
        pending_placement_groups=pending_placement_groups)
    reported_groups = load_metrics.get_pending_placement_groups()
    assert reported_groups == pending_placement_groups
def testCommandPassing(self):
    """Check that per-node-type command overrides reach the right nodes.

    "p2.8xlarge" overrides ``worker_setup_commands`` and "p2.xlarge"
    overrides ``initialization_commands``. After launching one node of each
    type, the process runner must have seen the overriding commands on the
    matching node and must not have seen "setup_cmd", "worker_setup_cmd" or
    "init_cmd" there.
    """
    # Local import keeps this block self-contained.
    import copy

    t = "custom"
    # Deep copy: this test mutates nested dicts under
    # "available_node_types", and a shallow .copy() would leak those changes
    # into the shared MULTI_WORKER_CLUSTER used by other tests.
    config = copy.deepcopy(MULTI_WORKER_CLUSTER)
    config["available_node_types"]["p2.8xlarge"][
        "worker_setup_commands"] = ["new_worker_setup_command"]
    config["available_node_types"]["p2.xlarge"][
        "initialization_commands"] = ["new_worker_initialization_cmd"]
    config["available_node_types"]["p2.xlarge"]["resources"][t] = 1
    # Commenting out this line causes the test case to fail?!?!
    config["min_workers"] = 0
    config["max_workers"] = 10
    config_path = self.write_config(config)
    self.provider = MockProvider()
    runner = MockProcessRunner()
    autoscaler = StandardAutoscaler(
        config_path,
        LoadMetrics(),
        max_failures=0,
        process_runner=runner,
        update_interval_s=0)
    assert len(self.provider.non_terminated_nodes({})) == 0
    autoscaler.update()
    self.waitForNodes(0)

    autoscaler.request_resources([{"CPU": 1}])
    autoscaler.update()
    self.waitForNodes(1)
    assert self.provider.mock_nodes[0].node_type == "m4.large"

    autoscaler.request_resources([{"GPU": 8}])
    autoscaler.update()
    self.waitForNodes(2)
    assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"

    autoscaler.request_resources([{"GPU": 1}] * 9)
    autoscaler.update()
    self.waitForNodes(3)
    assert self.provider.mock_nodes[2].node_type == "p2.xlarge"

    # One more update, then a short pause before inspecting the commands
    # recorded by the runner.
    autoscaler.update()
    sleep(0.1)

    runner.assert_has_call(self.provider.mock_nodes[1].internal_ip,
                           "new_worker_setup_command")
    runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                               "setup_cmd")
    runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                               "worker_setup_cmd")
    runner.assert_has_call(self.provider.mock_nodes[2].internal_ip,
                           "new_worker_initialization_cmd")
    runner.assert_not_has_call(self.provider.mock_nodes[2].internal_ip,
                               "init_cmd")
def testUpdateConfig(self):
    """Check that a rewritten config is picked up on the next update.

    The cluster first settles at two nodes. The config is then rewritten
    with ``min_workers`` set to 0 and a changed "m4.large" ``node_config``,
    after which the next update must bring the cluster down to zero nodes.
    """
    # Local import keeps this block self-contained.
    import copy

    # Deep copy: this test mutates the nested "node_config" dict, and a
    # shallow .copy() would leak that change into the shared
    # MULTI_WORKER_CLUSTER used by other tests.
    config = copy.deepcopy(MULTI_WORKER_CLUSTER)
    config_path = self.write_config(config)
    self.provider = MockProvider()
    runner = MockProcessRunner()
    autoscaler = StandardAutoscaler(
        config_path,
        LoadMetrics(),
        max_failures=0,
        process_runner=runner,
        update_interval_s=0)
    assert len(self.provider.non_terminated_nodes({})) == 0
    autoscaler.update()
    self.waitForNodes(2)

    config["min_workers"] = 0
    config["available_node_types"]["m4.large"]["node_config"][
        "field_changed"] = 1
    # Rewrite the config; the autoscaler was built from the path returned by
    # the first write_config call.
    config_path = self.write_config(config)
    autoscaler.update()
    self.waitForNodes(0)
def testResourcePassing(self):
    """Check that node resources are passed to nodes via the environment.

    After launching an "m4.large" and a "p2.8xlarge", the commands recorded
    for each node's IP must contain "RAY_OVERRIDE_RESOURCES=" together with
    the expected per-resource entries.
    """
    cluster_config = MULTI_WORKER_CLUSTER.copy()
    cluster_config["min_workers"] = 0
    cluster_config["max_workers"] = 50
    config_path = self.write_config(cluster_config)

    self.provider = MockProvider()
    process_runner = MockProcessRunner()
    scaler = StandardAutoscaler(
        config_path,
        LoadMetrics(),
        max_failures=0,
        process_runner=process_runner,
        update_interval_s=0)
    assert len(self.provider.non_terminated_nodes({})) == 0

    scaler.update()
    self.waitForNodes(0)

    scaler.request_resources([{"CPU": 1}])
    scaler.update()
    self.waitForNodes(1)
    assert self.provider.mock_nodes[0].node_type == "m4.large"

    scaler.request_resources([{"GPU": 8}])
    scaler.update()
    self.waitForNodes(2)
    assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"

    # TODO (Alex): Autoscaler creates the node during one update then
    # starts the updater in the next update. The sleep is largely
    # unavoidable because the updater runs in its own thread and we have no
    # good way of ensuring that the commands are sent in time.
    scaler.update()
    sleep(0.1)

    # These checks are done separately because we have no guarantees on the
    # order the dict is serialized in.
    expected_calls = (
        ("172.0.0.0", "RAY_OVERRIDE_RESOURCES="),
        ("172.0.0.0", "\"CPU\":2"),
        ("172.0.0.1", "RAY_OVERRIDE_RESOURCES="),
        ("172.0.0.1", "\"CPU\":32"),
        ("172.0.0.1", "\"GPU\":8"),
    )
    for node_ip, fragment in expected_calls:
        process_runner.assert_has_call(node_ip, fragment)
def __init__(self,
             redis_address,
             autoscaling_config,
             redis_password=None,
             prefix_cluster_info=False,
             monitor_ip=None,
             stop_event: Optional[Event] = None):
    """Connect to Redis and GCS, set up monitor state and the metrics server.

    The autoscaler itself is not created here; ``self.autoscaler`` starts
    out as None and the config is only stored.

    Args:
        redis_address: "ip:port" string; it must split on ":" into exactly
            two parts. The first part is treated as the head node IP.
        autoscaling_config: Stored as ``self.autoscaling_config``.
        redis_password: Optional password forwarded to the Redis clients.
        prefix_cluster_info: Stored as ``self.prefix_cluster_info``.
        monitor_ip: When truthy, the metrics address is published to Redis
            under "AutoscalerMetricsAddress" and, if ``prometheus_client``
            is available, the metrics HTTP server is started.
        stop_event: Can be used to signal graceful exit from monitor loop.
    """
    # Initialize the Redis clients.
    ray.state.state._initialize_global_state(
        redis_address, redis_password=redis_password)
    redis_client = ray._private.services.create_redis_client(
        redis_address, password=redis_password)
    self.redis = redis_client
    if monitor_ip:
        metrics_address = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
        redis_client.set("AutoscalerMetricsAddress", metrics_address)

    # Unpacking doubles as a check that the address has exactly two parts.
    head_node_ip, _port = redis_address.split(":")

    # Initialize the gcs stub for getting all node resource usage.
    gcs_address = redis_client.get("GcsServerAddress").decode("utf-8")
    channel_options = (("grpc.enable_http_proxy", 0), )
    gcs_channel = ray._private.utils.init_grpc_channel(
        gcs_address, channel_options)

    # TODO: Use gcs client for this
    self.gcs_node_resources_stub = (
        gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel))
    self.gcs_node_info_stub = (
        gcs_service_pb2_grpc.NodeInfoGcsServiceStub(gcs_channel))

    # Set the redis client and mode so _internal_kv works for autoscaler.
    global_worker = ray.worker.global_worker
    global_worker.redis_client = redis_client
    gcs_client = GcsClient.create_from_redis(redis_client)
    _initialize_internal_kv(gcs_client)
    global_worker.mode = 0

    self.redis_address = redis_address
    self.redis_password = redis_password
    # With RAY_FAKE_CLUSTER set, the fake head node id stands in for the IP.
    local_ip = (FAKE_HEAD_NODE_ID
                if os.environ.get("RAY_FAKE_CLUSTER") else head_node_ip)
    self.load_metrics = LoadMetrics(local_ip=local_ip)
    self.last_avail_resources = None
    self.event_summarizer = EventSummarizer()
    self.prefix_cluster_info = prefix_cluster_info
    # Can be used to signal graceful exit from monitor loop.
    self.stop_event = stop_event  # type: Optional[Event]
    self.autoscaling_config = autoscaling_config
    self.autoscaler = None
    # If set, we are in a manually created cluster (non-autoscaling) and
    # simply mirroring what the GCS tells us the cluster node types are.
    self.readonly_config = None

    self.prom_metrics = AutoscalerPrometheusMetrics()
    if monitor_ip and prometheus_client:
        # If monitor_ip wasn't passed in, then don't attempt to start the
        # metric server to keep behavior identical to before metrics were
        # introduced
        try:
            start_message = (
                "Starting autoscaler metrics server on port {}".format(
                    AUTOSCALER_METRIC_PORT))
            logger.info(start_message)
            bind_addr = "127.0.0.1" if head_node_ip == "127.0.0.1" else ""
            prometheus_client.start_http_server(
                port=AUTOSCALER_METRIC_PORT,
                addr=bind_addr,
                registry=self.prom_metrics.registry)
        except Exception:
            logger.exception(
                "An exception occurred while starting the metrics server.")
    elif not prometheus_client:
        logger.warning("`prometheus_client` not found, so metrics will "
                       "not be exported.")

    logger.info("Monitor: Started")
def testDockerWorkers(self):
    """Check that per-node-type docker settings override the defaults.

    "p2.8xlarge" overrides both the worker image and run options,
    "p2.xlarge" overrides only the worker image, and "m4.16xlarge" overrides
    nothing. After launching one node of each, the commands recorded for
    each node's IP must contain the expected image and run options and must
    not contain the ones belonging to other node types.
    """
    # Local import keeps this block self-contained.
    import copy

    # Deep copy: this test mutates nested dicts under "docker" and
    # "available_node_types", and a shallow .copy() would leak those changes
    # into the shared MULTI_WORKER_CLUSTER used by other tests.
    config = copy.deepcopy(MULTI_WORKER_CLUSTER)
    config["available_node_types"]["p2.8xlarge"]["docker"] = {
        "worker_image": "p2.8x_image:latest",
        "worker_run_options": ["p2.8x-run-options"]
    }
    config["available_node_types"]["p2.xlarge"]["docker"] = {
        "worker_image": "p2x_image:nightly"
    }
    config["docker"]["worker_run_options"] = ["standard-run-options"]
    config["docker"]["image"] = "default-image:nightly"
    config["docker"]["worker_image"] = "default-image:nightly"
    # Commenting out this line causes the test case to fail?!?!
    config["min_workers"] = 0
    config["max_workers"] = 10
    config_path = self.write_config(config)
    self.provider = MockProvider()
    runner = MockProcessRunner()
    autoscaler = StandardAutoscaler(
        config_path,
        LoadMetrics(),
        max_failures=0,
        process_runner=runner,
        update_interval_s=0)
    assert len(self.provider.non_terminated_nodes({})) == 0
    autoscaler.update()
    self.waitForNodes(0)

    autoscaler.request_resources([{"CPU": 1}])
    autoscaler.update()
    self.waitForNodes(1)
    assert self.provider.mock_nodes[0].node_type == "m4.large"

    autoscaler.request_resources([{"GPU": 8}])
    autoscaler.update()
    self.waitForNodes(2)
    assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"

    autoscaler.request_resources([{"GPU": 1}] * 9)
    autoscaler.update()
    self.waitForNodes(3)
    assert self.provider.mock_nodes[2].node_type == "p2.xlarge"
    autoscaler.update()

    # Fill up m4, p2.8, p2 and request 2 more CPUs
    autoscaler.request_resources([{
        "CPU": 2
    }, {
        "CPU": 16
    }, {
        "CPU": 32
    }, {
        "CPU": 2
    }])
    autoscaler.update()
    self.waitForNodes(4)
    assert self.provider.mock_nodes[3].node_type == "m4.16xlarge"

    # One more update, then a short pause before inspecting the commands
    # recorded by the runner.
    autoscaler.update()
    sleep(0.1)

    # p2.8xlarge: both image and run options are overridden.
    runner.assert_has_call(self.provider.mock_nodes[1].internal_ip,
                           "p2.8x-run-options")
    runner.assert_has_call(self.provider.mock_nodes[1].internal_ip,
                           "p2.8x_image:latest")
    runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                               "default-image:nightly")
    runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                               "standard-run-options")

    # p2.xlarge: only the image is overridden.
    runner.assert_has_call(self.provider.mock_nodes[2].internal_ip,
                           "p2x_image:nightly")
    runner.assert_has_call(self.provider.mock_nodes[2].internal_ip,
                           "standard-run-options")
    runner.assert_not_has_call(self.provider.mock_nodes[2].internal_ip,
                               "p2.8x-run-options")

    # m4.16xlarge: nothing is overridden, so the defaults apply.
    runner.assert_has_call(self.provider.mock_nodes[3].internal_ip,
                           "default-image:nightly")
    runner.assert_has_call(self.provider.mock_nodes[3].internal_ip,
                           "standard-run-options")
    runner.assert_not_has_call(self.provider.mock_nodes[3].internal_ip,
                               "p2.8x-run-options")
    runner.assert_not_has_call(self.provider.mock_nodes[3].internal_ip,
                               "p2x_image:nightly")
def __init__(
        self,
        address: str,
        autoscaling_config: Union[str, Callable[[], Dict[str, Any]]],
        redis_password: Optional[str] = None,
        prefix_cluster_info: bool = False,
        monitor_ip: Optional[str] = None,
        stop_event: Optional[Event] = None,
        retry_on_failure: bool = True,
):
    """Connect to GCS, set up monitor state and the metrics server.

    The autoscaler itself is not created here; ``self.autoscaler`` starts
    out as None and the config is only stored.

    Args:
        address: GCS address; the text before the first ":" is treated as
            the head node IP when choosing the metrics bind address.
        autoscaling_config: Stored as ``self.autoscaling_config``.
        redis_password: Deprecated; a warning is logged when it is not None.
        prefix_cluster_info: Stored as ``self.prefix_cluster_info``.
        monitor_ip: When truthy, the metrics address is published to the
            GCS internal KV under ``b"AutoscalerMetricsAddress"`` and, if
            ``prometheus_client`` is available, the metrics HTTP server is
            started.
        stop_event: Can be used to signal graceful exit from monitor loop.
        retry_on_failure: Stored as ``self.retry_on_failure``.
    """
    gcs_address = address
    options = (("grpc.enable_http_proxy", 0), )
    gcs_channel = ray._private.utils.init_grpc_channel(gcs_address, options)
    # TODO: Use gcs client for this
    self.gcs_node_resources_stub = (
        gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel))
    self.gcs_node_info_stub = gcs_service_pb2_grpc.NodeInfoGcsServiceStub(
        gcs_channel)
    if redis_password is not None:
        logger.warning("redis_password has been deprecated.")

    # Set the GCS client and mode so _internal_kv works for autoscaler.
    worker = ray.worker.global_worker
    gcs_client = GcsClient(address=gcs_address)
    if monitor_ip:
        # Publish the metrics address once. The original code repeated this
        # identical write after _initialize_internal_kv, which was redundant.
        monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
        gcs_client.internal_kv_put(b"AutoscalerMetricsAddress",
                                   monitor_addr.encode(), True, None)
    _initialize_internal_kv(gcs_client)
    worker.mode = 0

    head_node_ip = gcs_address.split(":")[0]
    self.load_metrics = LoadMetrics()
    self.last_avail_resources = None
    self.event_summarizer = EventSummarizer()
    self.prefix_cluster_info = prefix_cluster_info
    # Can be used to signal graceful exit from monitor loop.
    self.stop_event = stop_event  # type: Optional[Event]
    self.retry_on_failure = retry_on_failure
    self.autoscaling_config = autoscaling_config
    self.autoscaler = None
    # If set, we are in a manually created cluster (non-autoscaling) and
    # simply mirroring what the GCS tells us the cluster node types are.
    self.readonly_config = None

    self.prom_metrics = AutoscalerPrometheusMetrics()
    if monitor_ip and prometheus_client:
        # If monitor_ip wasn't passed in, then don't attempt to start the
        # metric server to keep behavior identical to before metrics were
        # introduced
        try:
            logger.info(
                "Starting autoscaler metrics server on port {}".format(
                    AUTOSCALER_METRIC_PORT))
            prometheus_client.start_http_server(
                port=AUTOSCALER_METRIC_PORT,
                addr="127.0.0.1" if head_node_ip == "127.0.0.1" else "",
                registry=self.prom_metrics.registry,
            )
        except Exception:
            logger.exception(
                "An exception occurred while starting the metrics server.")
    elif not prometheus_client:
        logger.warning(
            "`prometheus_client` not found, so metrics will not be exported."
        )

    logger.info("Monitor: Started")
def testPlacementGroup(self): # Note this is mostly an integration test. See # testPlacementGroupScaling for more comprehensive tests. config = copy.deepcopy(MULTI_WORKER_CLUSTER) config["min_workers"] = 0 config["max_workers"] = 999 config_path = self.write_config(config) self.provider = MockProvider() runner = MockProcessRunner() lm = LoadMetrics() autoscaler = StandardAutoscaler(config_path, lm, max_failures=0, process_runner=runner, update_interval_s=0) self.provider.create_node({}, { TAG_RAY_NODE_KIND: "head", TAG_RAY_USER_NODE_TYPE: "m4.4xlarge" }, 1) head_ip = self.provider.non_terminated_node_ips({})[0] assert len(self.provider.non_terminated_nodes({})) == 1 autoscaler.update() self.waitForNodes(1) pending_placement_groups = [ PlacementGroupTableData( state=PlacementGroupTableData.RESCHEDULING, strategy=PlacementStrategy.STRICT_SPREAD, bundles=[Bundle(unit_resources={"GPU": 2})] * 3), PlacementGroupTableData( state=PlacementGroupTableData.RESCHEDULING, strategy=PlacementStrategy.PACK, bundles=([Bundle(unit_resources={"GPU": 2})] * 5)), ] # Since placement groups are implemented with custom resources, this is # an example of the accompanying resource demands. Note the resource # demand autoscaler will be unable to fulfill these demands, but we # should still handle the other infeasible/waiting bundles. 
placement_group_resource_demands = [{ "GPU_group_0_6c2506ac733bc37496295b02c4fad446": 0.0101, "GPU_group_6c2506ac733bc37496295b02c4fad446": 0.0101 }] lm.update(head_ip, {"CPU": 16}, True, {"CPU": 16}, False, {}, infeasible_bundles=placement_group_resource_demands, waiting_bundles=[{ "GPU": 8 }], pending_placement_groups=pending_placement_groups) autoscaler.update() self.waitForNodes(5) for i in range(1, 5): assert self.provider.mock_nodes[i].node_type == "p2.8xlarge" pending_placement_groups = [ PlacementGroupTableData( state=PlacementGroupTableData.RESCHEDULING, strategy=PlacementStrategy.STRICT_PACK, bundles=([Bundle(unit_resources={"GPU": 2})] * 4)), PlacementGroupTableData( state=PlacementGroupTableData.RESCHEDULING, strategy=PlacementStrategy.SPREAD, bundles=([Bundle(unit_resources={"GPU": 2})] * 2)), ]