def update_load_metrics(self):
    """Fetch resource usage data from GCS and update load metrics.

    Sends a ``GetAllResourceUsage`` request (4 second timeout) through
    ``self.gcs_node_resources_stub`` and calls ``self.load_metrics.update``
    once for every entry of the returned batch.

    Each entry is keyed by its ``node_manager_address`` unless the
    autoscaler's provider config sets ``use_node_id_as_ip``; in that case
    the key is the integer value of the entry's ``NODE_ID_AS_RESOURCE``
    total resource, rendered as a string.
    """
    request = gcs_service_pb2.GetAllResourceUsageRequest()
    response = self.gcs_node_resources_stub.GetAllResourceUsage(
        request, timeout=4)
    resources_batch_data = response.resource_usage_data

    # The demand shapes, the placement group load and the provider flag are
    # read from the batch / the autoscaler config, not from an individual
    # node entry, so they are computed once instead of once per node.
    # NOTE(review): the same list objects are now passed to every
    # load_metrics.update call; this assumes parse_resource_demands is pure
    # and update does not mutate its arguments in place -- confirm.
    waiting_bundles, infeasible_bundles = parse_resource_demands(
        resources_batch_data.resource_load_by_shape)
    pending_placement_groups = list(
        resources_batch_data.placement_group_load.placement_group_data)
    use_node_id_as_ip = (self.autoscaler is not None
                         and self.autoscaler.config["provider"].get(
                             "use_node_id_as_ip", False))

    for resource_message in resources_batch_data.batch:
        # Copy the per-node maps into plain dicts before handing them over.
        resource_load = dict(resource_message.resource_load)
        total_resources = dict(resource_message.resources_total)
        available_resources = dict(resource_message.resources_available)

        if use_node_id_as_ip:
            # NOTE(review): an entry without NODE_ID_AS_RESOURCE falls back
            # to 0, so every such node is reported under the key "0".
            ip = str(int(total_resources.get("NODE_ID_AS_RESOURCE", 0)))
        else:
            ip = resource_message.node_manager_address

        self.load_metrics.update(
            ip, total_resources, available_resources, resource_load,
            waiting_bundles, infeasible_bundles, pending_placement_groups)
def update_load_metrics(self):
    """Fetch resource usage data from GCS and update load metrics.

    Sends a ``GetAllResourceUsage`` request (4 second timeout) through
    ``self.gcs_node_resources_stub`` and calls ``self.load_metrics.update``
    once for every entry of the returned batch, keyed by the entry's
    ``node_manager_address``.
    """
    request = gcs_service_pb2.GetAllResourceUsageRequest()
    response = self.gcs_node_resources_stub.GetAllResourceUsage(
        request, timeout=4)
    resources_batch_data = response.resource_usage_data

    # The demand shapes and the placement group load are read from the
    # batch, not from an individual node entry, so they are computed once
    # instead of once per node.
    # NOTE(review): the same list objects are now passed to every
    # load_metrics.update call; this assumes parse_resource_demands is pure
    # and update does not mutate its arguments in place -- confirm.
    waiting_bundles, infeasible_bundles = parse_resource_demands(
        resources_batch_data.resource_load_by_shape)
    pending_placement_groups = list(
        resources_batch_data.placement_group_load.placement_group_data)

    for resource_message in resources_batch_data.batch:
        # Copy the per-node maps into plain dicts before handing them over.
        resource_load = dict(resource_message.resource_load)
        total_resources = dict(resource_message.resources_total)
        available_resources = dict(resource_message.resources_available)

        ip = resource_message.node_manager_address
        self.load_metrics.update(
            ip, total_resources, available_resources, resource_load,
            waiting_bundles, infeasible_bundles, pending_placement_groups)
def update_load_metrics(self):
    """Fetch resource usage data from GCS and update load metrics.

    Sends a ``GetAllResourceUsage`` request (60 second timeout) through
    ``self.gcs_node_resources_stub`` and calls ``self.load_metrics.update``
    once for every entry of the returned batch.

    When ``self.readonly_config`` is truthy, the node list from the batch is
    first pushed to ``self.autoscaler.provider`` and, after the loop, one
    node type per reported node is merged into
    ``self.readonly_config["available_node_types"]``.

    Each entry is keyed by its ``node_manager_address`` unless the
    autoscaler's provider config sets ``use_node_id_as_ip``; in that case
    the key is the ``NODE_ID_AS_RESOURCE`` total resource (as an integer
    string) when present, otherwise the hex form of the node id.
    """
    request = gcs_service_pb2.GetAllResourceUsageRequest()
    response = self.gcs_node_resources_stub.GetAllResourceUsage(
        request, timeout=60)
    resources_batch_data = response.resource_usage_data

    # Tell the readonly node provider what nodes to report.
    if self.readonly_config:
        new_nodes = [(msg.node_id.hex(), msg.node_manager_address)
                     for msg in resources_batch_data.batch]
        self.autoscaler.provider._set_nodes(new_nodes)

    # The demand shapes, the placement group load and the provider flag are
    # read from the batch / the autoscaler config, not from an individual
    # node entry, so they are computed once instead of once per node.
    # NOTE(review): the same list objects are now passed to every
    # load_metrics.update call; this assumes parse_resource_demands is pure
    # and update does not mutate its arguments in place -- confirm.
    waiting_bundles, infeasible_bundles = parse_resource_demands(
        resources_batch_data.resource_load_by_shape)
    pending_placement_groups = list(
        resources_batch_data.placement_group_load.placement_group_data)

    # "use_node_id_as_ip" is a hack meant to address situations in
    # which there's more than one Ray node residing at a given ip.
    # TODO (Dmitri): Stop using ips as node identifiers.
    # https://github.com/ray-project/ray/issues/19086
    use_node_id_as_ip = (self.autoscaler is not None
                         and self.autoscaler.config["provider"].get(
                             "use_node_id_as_ip", False))

    mirror_node_types = {}
    cluster_full = False
    for resource_message in resources_batch_data.batch:
        node_id = resource_message.node_id

        # Generate node type config based on GCS reported node list.
        if self.readonly_config:
            # Keep prefix in sync with ReadonlyNodeProvider.
            node_type = format_readonly_node_type(node_id.hex())
            mirror_node_types[node_type] = {
                # Separate copy from the dict passed to load_metrics below.
                "resources": dict(resource_message.resources_total),
                "node_config": {},
                "max_workers": 1,
            }

        # Aggregate this flag across all batches. Once set it stays set, so
        # entries processed later in the loop are reported with it as well.
        # The field is treated as absent-safe via the getattr default.
        if getattr(resource_message, "cluster_full_of_actors_detected",
                   False):
            cluster_full = True

        # Copy the per-node maps into plain dicts before handing them over.
        resource_load = dict(resource_message.resource_load)
        total_resources = dict(resource_message.resources_total)
        available_resources = dict(resource_message.resources_available)

        if use_node_id_as_ip:
            peloton_id = total_resources.get("NODE_ID_AS_RESOURCE")
            # Legacy support https://github.com/ray-project/ray/pull/17312
            if peloton_id is not None:
                ip = str(int(peloton_id))
            else:
                ip = node_id.hex()
        else:
            ip = resource_message.node_manager_address

        self.load_metrics.update(ip, node_id, total_resources,
                                 available_resources, resource_load,
                                 waiting_bundles, infeasible_bundles,
                                 pending_placement_groups, cluster_full)

    if self.readonly_config:
        self.readonly_config["available_node_types"].update(
            mirror_node_types)