def get_node_state(self, node, node_pods, pods_to_schedule):
    """
    Return the ClusterNodeState for the given node.

    params:
    node - KubeNode object
    node_pods - list of KubePods assigned to this node
    pods_to_schedule - pending pods; any truthy value marks a schedulable
                       node as POD_PENDING

    The checks are applied in priority order: busy, pod pending,
    grace period, under-utilized, idle.
    """
    # we consider a node to be busy if it's running any non-DaemonSet pods
    # TODO: we can be a bit more aggressive in killing pods that are
    # replicated
    busy_list = [
        p for p in node_pods
        if not p.is_mirrored() and 'kube-proxy' not in p.name
    ]

    # Use total_seconds() rather than .seconds: the latter only holds the
    # sub-day remainder, so a node older than 24h would wrap around and
    # look freshly created once a day.
    elapsed = (datetime.datetime.now(node.creation_time.tzinfo)
               - node.creation_time)
    age = int(elapsed.total_seconds())

    # TODO: Fix this kube-proxy issue, see
    # https://github.com/openai/kubernetes-ec2-autoscaler/issues/23
    undrainable_list = [
        p for p in node_pods
        if not (p.is_drainable() or 'kube-proxy' in p.name)
    ]

    utilization = sum((p.resources for p in busy_list), KubeResource())
    under_utilized = (self.UTIL_THRESHOLD * node.capacity
                      - utilization).possible
    drainable = not undrainable_list

    if busy_list and not under_utilized:
        if node.unschedulable:
            return ClusterNodeState.BUSY_UNSCHEDULABLE
        return ClusterNodeState.BUSY

    if pods_to_schedule and not node.unschedulable:
        return ClusterNodeState.POD_PENDING

    if age <= self.idle_threshold and not node.unschedulable:
        return ClusterNodeState.GRACE_PERIOD

    # nodes that are under utilized get their own states so the caller can
    # tell whether draining them is an option
    if under_utilized and (busy_list or not node.unschedulable):
        if drainable:
            return ClusterNodeState.UNDER_UTILIZED_DRAINABLE
        return ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE

    if node.unschedulable:
        return ClusterNodeState.IDLE_UNSCHEDULABLE
    return ClusterNodeState.IDLE_SCHEDULABLE
def get_node_state(self, node, asg, node_pods, pods_to_schedule,
                   running_insts_map, idle_selector_hash):
    """
    Return the ClusterNodeState for the given node.

    params:
    node - KubeNode object
    asg - AutoScalingGroup object that this node belongs in. can be None.
    node_pods - list of KubePods assigned to this node
    pods_to_schedule - mapping whose values are collections of pending pods
    running_insts_map - map of all (instance_id -> ec2.Instance object)
    idle_selector_hash - current map of idle nodes by type. may be modified:
                         the entry for this node's type is incremented when
                         TYPE_GRACE_PERIOD is returned.
    """
    pending_list = []
    for pods in pods_to_schedule.values():
        for pod in pods:
            # a pod is considered schedulable onto this node if all the
            # node selectors match
            # AND it doesn't use pod affinity (which we don't support yet)
            if (node.is_match(pod) and
                    'scheduler.alpha.kubernetes.io/affinity'
                    not in pod.annotations):
                pending_list.append(pod)

    # we consider a node to be busy if it's running any non-DaemonSet pods
    # TODO: we can be a bit more aggressive in killing pods that are
    # replicated
    busy_list = [p for p in node_pods if not p.is_mirrored()]
    undrainable_list = [p for p in node_pods if not p.is_drainable()]
    utilization = sum((p.resources for p in busy_list), KubeResource())
    under_utilized = (self.UTIL_THRESHOLD * node.capacity
                      - utilization).possible
    drainable = not undrainable_list

    maybe_inst = running_insts_map.get(node.instance_id)
    age = None
    launch_hour_offset = None
    # Same `is None` test as the terminated check below, so the age values
    # are always bound whenever the later branches are reached.
    if maybe_inst is not None:
        # Use total_seconds() rather than .seconds: the latter only holds
        # the sub-day remainder, so an instance older than 24h would wrap
        # around and look freshly launched once a day.
        elapsed = (datetime.datetime.now(maybe_inst.launch_time.tzinfo)
                   - maybe_inst.launch_time)
        age = int(elapsed.total_seconds())
        logger.warning('AGE: %s', age)
        launch_hour_offset = age % 3600

    instance_type = (utils.selectors_to_hash(asg.selectors)
                     if asg else node.instance_type)

    # truthy when type-level idling is enabled and fewer than
    # TYPE_IDLE_COUNT nodes of this type have been counted so far
    type_spare_capacity = (
        instance_type and self.type_idle_threshold and
        idle_selector_hash[instance_type] < self.TYPE_IDLE_COUNT)

    if maybe_inst is None:
        return ClusterNodeState.INSTANCE_TERMINATED

    if node.is_detached():
        return ClusterNodeState.DETACHED

    if node.is_dead():
        return ClusterNodeState.DEAD

    if asg and len(asg.nodes) <= asg.min_size:
        return ClusterNodeState.ASG_MIN_SIZE

    if busy_list and not under_utilized:
        if node.unschedulable:
            return ClusterNodeState.BUSY_UNSCHEDULABLE
        return ClusterNodeState.BUSY

    if pending_list and not node.unschedulable:
        return ClusterNodeState.POD_PENDING

    if (launch_hour_offset < self.LAUNCH_HOUR_THRESHOLD[node.provider]
            and not node.unschedulable):
        return ClusterNodeState.LAUNCH_HR_GRACE_PERIOD

    if ((not type_spare_capacity and age <= self.idle_threshold)
            and not node.unschedulable):
        # there is already an instance of this type sitting idle
        # so we use the regular idle threshold for the grace period
        return ClusterNodeState.GRACE_PERIOD

    if ((type_spare_capacity and age <= self.type_idle_threshold)
            and not node.unschedulable):
        # we don't have an instance of this type yet!
        # use the type idle threshold for the grace period
        # and mark the type as seen
        idle_selector_hash[instance_type] += 1
        return ClusterNodeState.TYPE_GRACE_PERIOD

    if under_utilized and (busy_list or not node.unschedulable):
        # nodes that are under utilized (but not completely idle)
        # have their own states to tell if we should drain them
        # for better binpacking or not
        if drainable:
            return ClusterNodeState.UNDER_UTILIZED_DRAINABLE
        return ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE

    if node.unschedulable:
        return ClusterNodeState.IDLE_UNSCHEDULABLE
    return ClusterNodeState.IDLE_SCHEDULABLE
import json

from autoscaler.config import Config
from autoscaler.kube import KubeResource

# RESOURCE_SPEC should denote the amount of resources that are available
# to workload pods on a new, clean node, i.e. resources used by system pods
# have to be accounted for.
#
# It is built at import time from the JSON file at Config.CAPACITY_DATA as a
# two-level dict: top-level key -> instance type -> KubeResource, with
# Config.CAPACITY_CPU_RESERVE subtracted from each entry's 'cpu' value.
with open(Config.CAPACITY_DATA, 'r') as f:
    data = json.loads(f.read())
    RESOURCE_SPEC = {}
    for key, instance_map in data.items():
        RESOURCE_SPEC[key] = {}
        for instance_type, resource_spec in instance_map.items():
            # reserve part of the CPU before building the resource object
            resource_spec['cpu'] -= Config.CAPACITY_CPU_RESERVE
            resource = KubeResource(**resource_spec)
            RESOURCE_SPEC[key][instance_type] = resource

# pod selector keys read by is_possible()
DEFAULT_TYPE_SELECTOR_KEY = 'aws/type'
DEFAULT_CLASS_SELECTOR_KEY = 'aws/class'
COMPUTING_SELECTOR_KEY = 'openai/computing'


def is_possible(pod):
    """
    returns whether the pod is possible under the maximum allowable capacity

    params:
    pod - object exposing a `selectors` mapping
    """
    # 'false' is used when the pod carries no computing selector
    computing = pod.selectors.get(COMPUTING_SELECTOR_KEY, 'false')
    selector = pod.selectors.get(DEFAULT_TYPE_SELECTOR_KEY)
    class_ = pod.selectors.get(DEFAULT_CLASS_SELECTOR_KEY)
    # NOTE(review): the rest of this function is not visible in this chunk;
    # the capacity comparison and return statement presumably follow - confirm
def get_node_state(self, node, asg, node_pods, pods_to_schedule,
                   running_insts_map, idle_selector_hash):
    """
    Return the ClusterNodeState for the given node.

    params:
    node - KubeNode object
    asg - AutoScalingGroup object that this node belongs in. can be None.
    node_pods - list of KubePods assigned to this node
    pods_to_schedule - mapping whose values are collections of pending pods
    running_insts_map - map of all (instance_id -> ec2.Instance object)
    idle_selector_hash - current map of idle nodes by type. may be modified:
                         the entry for this node's type is incremented when
                         TYPE_GRACE_PERIOD is returned.
    """
    # pending pods whose selectors match this node
    pending_list = [
        pod
        for pods in pods_to_schedule.values()
        for pod in pods
        if node.is_match(pod)
    ]

    # we consider a node to be busy if it's running any non-DaemonSet pods
    # TODO: we can be a bit more aggressive in killing pods that are
    # replicated
    busy_list = [p for p in node_pods if not p.is_mirrored()]
    undrainable_list = [p for p in node_pods if not p.is_replicated()]
    utilization = sum((p.resources for p in busy_list), KubeResource())
    under_utilized = (self.UTIL_THRESHOLD * node.capacity
                      - utilization).possible
    drainable = not undrainable_list

    maybe_inst = running_insts_map.get(node.instance_id)
    if maybe_inst is None:
        return ClusterNodeState.INSTANCE_TERMINATED

    # Use total_seconds() rather than .seconds: the latter only holds the
    # sub-day remainder, so an instance older than 24h would wrap around
    # and look freshly launched once a day.
    elapsed = (datetime.datetime.now(maybe_inst.launch_time.tzinfo)
               - maybe_inst.launch_time)
    age = int(elapsed.total_seconds())

    instance_type = (utils.selectors_to_hash(asg.selectors)
                     if asg else node.instance_type)

    if asg and len(asg.nodes) <= asg.min_size:
        return ClusterNodeState.ASG_MIN_SIZE

    if busy_list and not under_utilized:
        if node.unschedulable:
            return ClusterNodeState.BUSY_UNSCHEDULABLE
        return ClusterNodeState.BUSY

    if pending_list and not node.unschedulable:
        return ClusterNodeState.POD_PENDING

    if (((not self.type_idle_threshold or
          idle_selector_hash[instance_type] >= self.TYPE_IDLE_COUNT)
         and age <= self.idle_threshold)
            and not node.unschedulable):
        # there is already an instance of this type sitting idle
        # so we use the regular idle threshold for the grace period
        return ClusterNodeState.GRACE_PERIOD

    if ((instance_type and
         idle_selector_hash[instance_type] < self.TYPE_IDLE_COUNT and
         age <= self.type_idle_threshold)
            and not node.unschedulable):
        # we don't have an instance of this type yet!
        # use the type idle threshold for the grace period
        # and mark the type as seen
        idle_selector_hash[instance_type] += 1
        return ClusterNodeState.TYPE_GRACE_PERIOD

    if under_utilized and not node.unschedulable:
        if drainable:
            return ClusterNodeState.UNDER_UTILIZED_DRAINABLE
        return ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE

    if node.unschedulable:
        return ClusterNodeState.IDLE_UNSCHEDULABLE
    return ClusterNodeState.IDLE_SCHEDULABLE