def get_pods_to_schedule(self, pods):
    """
    given a list of KubePod objects, return a map of
    (selectors hash -> pods) to be scheduled
    """
    pending_unassigned_pods = [
        p for p in pods
        if p.is_pending_unassigned_and_scaleworthy(self.scale_label)
    ]

    # we only consider a pod to be schedulable if it's pending and
    # unassigned and feasible
    pods_to_schedule = {}
    for pod in pending_unassigned_pods:
        if capacity.is_possible(pod):
            pods_to_schedule.setdefault(
                utils.selectors_to_hash(pod.selectors), []).append(pod)
        else:
            recommended_capacity = capacity.max_capacity_for_selectors(
                pod.selectors)
            logger.warn(
                "Pending pod %s cannot fit %s. "
                "Please check that requested resource amount is "
                "consistent with node selectors (recommended max: %s). "
                "Scheduling skipped."
                % (pod.name, pod.selectors, recommended_capacity))
            self.notifier.notify_invalid_pod_capacity(
                pod, recommended_capacity)
    return pods_to_schedule
def get_pods_to_schedule(self, pods):
    """
    given a list of KubePod objects, return a map of
    (selectors hash -> pods) to be scheduled
    """
    pending_unassigned_pods = [
        p for p in pods
        if p.status == KubePodStatus.PENDING and (not p.node_name)
    ]

    # we only consider a pod to be schedulable if it's pending and
    # unassigned and feasible
    pods_to_schedule = {}
    now = datetime.datetime.now(pytz.utc)
    for pod in pending_unassigned_pods:
        age = (now - pod.creation_time).total_seconds()
        self.stats.histogram('autoscaler.scaling_loop.pending_pod_age', age)

        if capacity.is_possible(pod):
            pods_to_schedule.setdefault(
                utils.selectors_to_hash(pod.selectors), []).append(pod)
        else:
            recommended_capacity = capacity.max_capacity_for_selectors(
                pod.selectors, pod.resources)
            logger.warn(
                "Pending pod %s cannot fit %s. "
                "Please check that requested resource amount is "
                "consistent with node selectors (recommended max: %s). "
                "Scheduling skipped."
                % (pod.name, pod.selectors, recommended_capacity))
            self.notifier.notify_invalid_pod_capacity(
                pod, recommended_capacity)
    return pods_to_schedule
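# A minimal sketch of how a selectors-hash helper could be written, to make the
# grouping above concrete. This is an assumption for illustration, not the real
# utils.selectors_to_hash: any deterministic, hashable encoding of the selector
# dict works as a dictionary key for pods_to_schedule, so that pods with equal
# node selectors land in the same bucket.
import json


def selectors_to_hash_sketch(selectors):
    # sort keys so logically equal selector dicts produce the same key
    return json.dumps(selectors, sort_keys=True)


# example usage under that assumption:
#   selectors_to_hash_sketch({'aws/type': 'm4.large'})
#   -> '{"aws/type": "m4.large"}'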
def test_scale_up(self):
    pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
    selectors_hash = utils.selectors_to_hash(pod.selectors)
    asgs = self.cluster.autoscaling_groups.get_all_groups([])
    self.cluster.fulfill_pending(asgs, selectors_hash, [pod])

    response = self.asg_client.describe_auto_scaling_groups()
    self.assertEqual(len(response['AutoScalingGroups']), 1)
    self.assertGreater(response['AutoScalingGroups'][0]['DesiredCapacity'], 0)
def test_scale_up_notification(self):
    big_pod_spec = copy.deepcopy(self.dummy_pod)
    for container in big_pod_spec['spec']['containers']:
        container['resources']['requests']['cpu'] = '100'
    pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
    big_pod = KubePod(pykube.Pod(self.api, big_pod_spec))
    selectors_hash = utils.selectors_to_hash(pod.selectors)
    asgs = self.cluster.autoscaling_groups.get_all_groups([])
    self.cluster.fulfill_pending(asgs, selectors_hash, [pod, big_pod])

    self.cluster.notifier.notify_scale.assert_called_with(
        mock.ANY, mock.ANY, [pod])
def test_scale_up_selector(self):
    self.dummy_pod['spec']['nodeSelector'] = {'aws/type': 'm4.large'}
    pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
    selectors_hash = utils.selectors_to_hash(pod.selectors)
    asgs = self.cluster.autoscaling_groups.get_all_groups([])
    self.cluster.fulfill_pending(asgs, selectors_hash, [pod])

    response = self.asg_client.describe_auto_scaling_groups()
    self.assertEqual(len(response['AutoScalingGroups']), 1)
    self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 0)
def scale(self, pods_to_schedule, all_nodes, asgs, running_insts_map):
    """
    scale up logic
    """
    # TODO: generalize to azure
    self.autoscaling_timeouts.refresh_timeouts(
        [asg for asg in asgs if asg.provider == 'aws'],
        dry_run=self.dry_run)

    cached_live_nodes = []
    for node in all_nodes:
        # either we know the physical node behind it and know it's alive
        # or we don't know it and assume it's alive
        if (node.instance_id and node.instance_id in running_insts_map) \
                or (not node.is_managed()):
            cached_live_nodes.append(node)

    # selectors -> pending KubePods
    pending_pods = {}

    # for each pending & unassigned pod, try to fit it on current machines
    # or count its requested resources towards future machines
    for selectors_hash, pods in pods_to_schedule.items():
        for pod in pods:
            fitting = None
            for node in cached_live_nodes:
                if node.unschedulable:
                    continue
                if node.is_match(pod) and node.can_fit(pod.resources):
                    fitting = node
                    break
            if fitting is None:
                # because a pod may be able to fit in multiple groups
                # pick a group now
                selectors = dict(pod.selectors)
                pending_pods.setdefault(
                    utils.selectors_to_hash(selectors), []).append(pod)
                logger.info(
                    "{pod} is pending ({selectors_hash})".format(
                        pod=pod, selectors_hash=selectors_hash))
            else:
                fitting.count_pod(pod)
                logger.info("{pod} fits on {node}".format(
                    pod=pod, node=fitting))

    # scale each node type to reach the new capacity
    for selectors_hash in set(pending_pods.keys()):
        self.fulfill_pending(asgs, selectors_hash,
                             pending_pods.get(selectors_hash, []))
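# Hedged sketch of how the pieces above might be driven in one scaling-loop
# iteration. The helpers get_pods(), get_nodes() and get_running_instances_map()
# are hypothetical stand-ins for whatever the cluster object actually exposes;
# the point is only the order of operations: group pending pods by selectors
# hash with get_pods_to_schedule(), then hand the result to scale(), which
# tries to fit pods on live nodes before calling fulfill_pending() for the rest.
def loop_once_sketch(cluster):
    pods = cluster.get_pods()                                        # hypothetical
    all_nodes = cluster.get_nodes()                                  # hypothetical
    asgs = cluster.autoscaling_groups.get_all_groups(all_nodes)
    running_insts_map = cluster.get_running_instances_map(all_nodes)  # hypothetical

    pods_to_schedule = cluster.get_pods_to_schedule(pods)
    cluster.scale(pods_to_schedule, all_nodes, asgs, running_insts_map)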
def test_timed_out_group(self):
    with mock.patch('autoscaler.autoscaling_groups.AutoScalingGroup.is_timed_out') as is_timed_out:
        with mock.patch('autoscaler.autoscaling_groups.AutoScalingGroup.scale') as scale:
            is_timed_out.return_value = True
            scale.return_value = utils.CompletedFuture(None)

            pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
            selectors_hash = utils.selectors_to_hash(pod.selectors)
            asgs = self.cluster.autoscaling_groups.get_all_groups([])
            self.cluster.fulfill_pending(asgs, selectors_hash, [pod])

            scale.assert_not_called()

            response = self.asg_client.describe_auto_scaling_groups()
            self.assertEqual(len(response['AutoScalingGroups']), 1)
            self.assertEqual(
                response['AutoScalingGroups'][0]['DesiredCapacity'], 0)
def test_scale_up(self):
    pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
    selectors_hash = utils.selectors_to_hash(pod.selectors)
    asgs = self.cluster.autoscaling_groups.get_all_groups([])
    self.cluster.fulfill_pending(asgs, selectors_hash, [pod])

    response = self.asg_client.describe_auto_scaling_groups()
    self.assertEqual(len(response['AutoScalingGroups']), 2)
    big_gpu_asg, small_gpu_asg = {}, {}
    if (response['AutoScalingGroups'][0]['AutoScalingGroupName'] ==
            'dummy-asg-small-gpu'):
        small_gpu_asg = response['AutoScalingGroups'][0]
        big_gpu_asg = response['AutoScalingGroups'][1]
    else:
        small_gpu_asg = response['AutoScalingGroups'][1]
        big_gpu_asg = response['AutoScalingGroups'][0]

    self.assertGreater(big_gpu_asg['DesiredCapacity'], 0)
    self.assertEqual(small_gpu_asg['DesiredCapacity'], 0)
def get_pods_to_schedule(self, pods):
    """
    given a list of KubePod objects, return a map of
    (selectors hash -> pods) to be scheduled
    """
    pending_unassigned_pods = [
        p for p in pods
        if p.status == KubePodStatus.PENDING and (not p.node_name)
    ]

    # we only consider a pod to be schedulable if it's pending and unassigned and feasible
    pods_to_schedule = {}
    for pod in pending_unassigned_pods:
        if capacity.is_possible(pod):
            pods_to_schedule.setdefault(
                utils.selectors_to_hash(pod.selectors), []).append(pod)
        else:
            logger.warn(
                "Pending pod %s cannot fit %s. Ignored"
                % (pod.name, pod.selectors))
    return pods_to_schedule
def get_pods_to_schedule(self, pods):
    """
    given a list of KubePod objects, return a map of
    (selectors hash -> pods) to be scheduled
    """
    pending_unassigned_pods = [
        p for p in pods
        if p.status == KubePodStatus.PENDING and (not p.node_name)
    ]

    # we only consider a pod to be schedulable if it's pending and unassigned and feasible
    pods_to_schedule = {}
    for pod in pending_unassigned_pods:
        if capacity.is_possible(pod):
            pods_to_schedule.setdefault(
                utils.selectors_to_hash(pod.selectors), []).append(pod)
        else:
            logger.warn(
                "Pending pod %s cannot fit %s. "
                "Please check that requested resource amount is "
                "consistent with node selectors. Scheduling skipped."
                % (pod.name, pod.selectors))
    return pods_to_schedule
def get_pods_to_schedule(self, pods):
    """
    given a list of KubePod objects, return a map of
    (selectors hash -> pods) to be scheduled
    """
    pending_unassigned_pods = [
        p for p in pods
        if p.status == KubePodStatus.PENDING and (not p.node_name)
    ]

    # we only consider a pod to be schedulable if it's pending and
    # unassigned and feasible
    pods_to_schedule = {}
    for pod in pending_unassigned_pods:
        if capacity.is_possible(pod, self.cs_instance_type):
            pods_to_schedule.setdefault(
                utils.selectors_to_hash(pod.selectors), []).append(pod)
        else:
            logger.warn("Pending pod %s cannot fit %s. "
                        "Please check that requested resource amount is "
                        "consistent with node size. "
                        "Scheduling skipped."
                        % (pod.name, pod.selectors))
    return pods_to_schedule
def get_node_state(self, node, asg, node_pods, pods_to_schedule,
                   running_insts_map, idle_selector_hash):
    """
    returns the ClusterNodeState for the given node

    params:
    node - KubeNode object
    asg - AutoScalingGroup object that this node belongs in. can be None.
    node_pods - list of KubePods assigned to this node
    pods_to_schedule - list of all pending pods
    running_inst_map - map of all (instance_id -> ec2.Instance object)
    idle_selector_hash - current map of idle nodes by type. may be modified
    """
    pending_list = []
    for pods in pods_to_schedule.values():
        for pod in pods:
            # a pod is considered schedulable onto this node if all the
            # node selectors match
            # AND it doesn't use pod affinity (which we don't support yet)
            if (node.is_match(pod) and
                    'scheduler.alpha.kubernetes.io/affinity' not in pod.annotations):
                pending_list.append(pod)

    # we consider a node to be busy if it's running any non-DaemonSet pods
    # TODO: we can be a bit more aggressive in killing pods that are
    # replicated
    busy_list = [p for p in node_pods if not p.is_mirrored()]
    undrainable_list = [p for p in node_pods if not p.is_drainable()]
    utilization = sum((p.resources for p in busy_list), KubeResource())
    under_utilized = (self.UTIL_THRESHOLD * node.capacity - utilization).possible
    drainable = not undrainable_list

    maybe_inst = running_insts_map.get(node.instance_id)
    if maybe_inst:
        age = (datetime.datetime.now(maybe_inst.launch_time.tzinfo)
               - maybe_inst.launch_time).seconds
        logger.warn('AGE: %s', age)
        launch_hour_offset = age % 3600
    else:
        age = None

    instance_type = utils.selectors_to_hash(
        asg.selectors) if asg else node.instance_type

    type_spare_capacity = (instance_type and self.type_idle_threshold and
                           idle_selector_hash[instance_type] < self.TYPE_IDLE_COUNT)

    if maybe_inst is None:
        return ClusterNodeState.INSTANCE_TERMINATED

    if node.is_detached():
        return ClusterNodeState.DETACHED

    if node.is_dead():
        return ClusterNodeState.DEAD

    if asg and len(asg.nodes) <= asg.min_size:
        return ClusterNodeState.ASG_MIN_SIZE

    if busy_list and not under_utilized:
        if node.unschedulable:
            return ClusterNodeState.BUSY_UNSCHEDULABLE
        return ClusterNodeState.BUSY

    if pending_list and not node.unschedulable:
        # logger.warn('PENDING: %s', pending_list)
        return ClusterNodeState.POD_PENDING

    if launch_hour_offset < self.LAUNCH_HOUR_THRESHOLD[node.provider] and not node.unschedulable:
        return ClusterNodeState.LAUNCH_HR_GRACE_PERIOD

    # elif node.provider == 'azure':
    #     # disabling scale down in azure for now while we ramp up
    #     # TODO: remove once azure is bootstrapped
    #     state = ClusterNodeState.GRACE_PERIOD

    if (not type_spare_capacity and age <= self.idle_threshold) and not node.unschedulable:
        # there is already an instance of this type sitting idle
        # so we use the regular idle threshold for the grace period
        return ClusterNodeState.GRACE_PERIOD

    if (type_spare_capacity and age <= self.type_idle_threshold) and not node.unschedulable:
        # we don't have an instance of this type yet!
        # use the type idle threshold for the grace period
        # and mark the type as seen
        idle_selector_hash[instance_type] += 1
        return ClusterNodeState.TYPE_GRACE_PERIOD

    if under_utilized and (busy_list or not node.unschedulable):
        # nodes that are under utilized (but not completely idle)
        # have their own states to tell if we should drain them
        # for better binpacking or not
        if drainable:
            return ClusterNodeState.UNDER_UTILIZED_DRAINABLE
        return ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE

    if node.unschedulable:
        return ClusterNodeState.IDLE_UNSCHEDULABLE
    return ClusterNodeState.IDLE_SCHEDULABLE
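# The state names returned above come from a ClusterNodeState class that is not
# shown in this excerpt. A hedged sketch of what a container of such string
# constants might look like, using only the names referenced by get_node_state
# (the actual values are assumptions, chosen for readability in logs):
class ClusterNodeStateSketch(object):
    INSTANCE_TERMINATED = 'instance-terminated'
    DETACHED = 'detached'
    DEAD = 'dead'
    ASG_MIN_SIZE = 'asg-min-size'
    BUSY_UNSCHEDULABLE = 'busy-unschedulable'
    BUSY = 'busy'
    POD_PENDING = 'pod-pending'
    LAUNCH_HR_GRACE_PERIOD = 'launch-hr-grace-period'
    GRACE_PERIOD = 'grace-period'
    TYPE_GRACE_PERIOD = 'type-grace-period'
    UNDER_UTILIZED_DRAINABLE = 'under-utilized-drainable'
    UNDER_UTILIZED_UNDRAINABLE = 'under-utilized-undrainable'
    IDLE_UNSCHEDULABLE = 'idle-unschedulable'
    IDLE_SCHEDULABLE = 'idle-schedulable'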
def __str__(self): return "{}: {} ({})".format(self.name, self.instance_id, utils.selectors_to_hash(self.selectors))
def __str__(self):
    return 'AzureVirtualScaleSet({name}, {selectors_hash})'.format(
        name=self.name,
        selectors_hash=utils.selectors_to_hash(self.selectors))
def get_node_state(self, node, asg, node_pods, pods_to_schedule,
                   running_insts_map, idle_selector_hash):
    """
    returns the ClusterNodeState for the given node

    params:
    node - KubeNode object
    asg - AutoScalingGroup object that this node belongs in. can be None.
    node_pods - list of KubePods assigned to this node
    pods_to_schedule - list of all pending pods
    running_inst_map - map of all (instance_id -> ec2.Instance object)
    idle_selector_hash - current map of idle nodes by type. may be modified.
    """
    pending_list = []
    for pods in pods_to_schedule.values():
        for pod in pods:
            if node.is_match(pod):
                pending_list.append(pod)

    # we consider a node to be busy if it's running any non-DaemonSet pods
    # TODO: we can be a bit more aggressive in killing pods that are
    # replicated
    busy_list = [p for p in node_pods if not p.is_mirrored()]
    undrainable_list = [p for p in node_pods if not p.is_drainable()]
    utilization = sum((p.resources for p in busy_list), KubeResource())
    under_utilized = (self.UTIL_THRESHOLD * node.capacity - utilization).possible
    drainable = not undrainable_list

    maybe_inst = running_insts_map.get(node.instance_id)
    instance_type = utils.selectors_to_hash(asg.selectors) if asg else None
    if maybe_inst:
        age = (datetime.datetime.now(maybe_inst.launch_time.tzinfo)
               - maybe_inst.launch_time).seconds
    else:
        age = None

    instance_type = utils.selectors_to_hash(
        asg.selectors) if asg else node.instance_type

    if maybe_inst is None:
        state = ClusterNodeState.INSTANCE_TERMINATED
    elif asg and len(asg.nodes) <= asg.min_size:
        state = ClusterNodeState.ASG_MIN_SIZE
    elif busy_list and not under_utilized:
        if node.unschedulable:
            state = ClusterNodeState.BUSY_UNSCHEDULABLE
        else:
            state = ClusterNodeState.BUSY
    elif pending_list and not node.unschedulable:
        state = ClusterNodeState.POD_PENDING
    elif ((not self.type_idle_threshold or
           idle_selector_hash[instance_type] >= self.TYPE_IDLE_COUNT) and
          age <= self.idle_threshold) and not node.unschedulable:
        # there is already an instance of this type sitting idle
        # so we use the regular idle threshold for the grace period
        state = ClusterNodeState.GRACE_PERIOD
    elif (instance_type and
          idle_selector_hash[instance_type] < self.TYPE_IDLE_COUNT and
          age <= self.type_idle_threshold) and not node.unschedulable:
        # we don't have an instance of this type yet!
        # use the type idle threshold for the grace period
        # and mark the type as seen
        idle_selector_hash[instance_type] += 1
        state = ClusterNodeState.TYPE_GRACE_PERIOD
    elif under_utilized and not node.unschedulable:
        if drainable:
            state = ClusterNodeState.UNDER_UTILIZED_DRAINABLE
        else:
            state = ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE
    else:
        if node.unschedulable:
            state = ClusterNodeState.IDLE_UNSCHEDULABLE
        else:
            state = ClusterNodeState.IDLE_SCHEDULABLE

    return state
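# idle_selector_hash is indexed with idle_selector_hash[instance_type] before
# any explicit initialization, so a counter that defaults to zero fits the
# usage. A minimal sketch, assuming collections.defaultdict is acceptable here
# (the real type used by the caller is not shown in this excerpt):
import collections

idle_selector_hash = collections.defaultdict(int)

# each time a node of a given selectors hash is granted a "type grace period",
# its counter is bumped, so at most TYPE_IDLE_COUNT nodes per type are kept on
# the longer type_idle_threshold before falling back to the regular threshold
idle_selector_hash['<selectors hash for e.g. aws/type=m4.large>'] += 1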
def __str__(self):
    return 'AutoScalingGroup({name}, {selectors_hash})'.format(
        name=self.name,
        selectors_hash=utils.selectors_to_hash(self.selectors))
def get_node_state(self, node, asg, node_pods, pods_to_schedule,
                   running_insts_map, idle_selector_hash):
    """
    returns the ClusterNodeState for the given node

    params:
    node - KubeNode object
    asg - AutoScalingGroup object that this node belongs in. can be None.
    node_pods - list of KubePods assigned to this node
    pods_to_schedule - list of all pending pods
    running_inst_map - map of all (instance_id -> ec2.Instance object)
    idle_selector_hash - current map of idle nodes by type. may be modified.
    """
    pending_list = []
    for pods in pods_to_schedule.itervalues():
        for pod in pods:
            if node.is_match(pod):
                pending_list.append(pod)

    # we consider a node to be busy if it's running any non-DaemonSet pods
    # TODO: we can be a bit more aggressive in killing pods that are
    # replicated
    busy_list = [p for p in node_pods if not p.is_mirrored()]

    maybe_inst = running_insts_map.get(node.instance_id)
    instance_type = utils.selectors_to_hash(asg.selectors) if asg else None
    if maybe_inst:
        age = (datetime.datetime.now(maybe_inst.launch_time.tzinfo)
               - maybe_inst.launch_time).seconds
    else:
        age = None

    instance_type = utils.selectors_to_hash(
        asg.selectors) if asg else node.instance_type

    if maybe_inst is None:
        state = ClusterNodeState.INSTANCE_TERMINATED
    elif asg and len(asg.nodes) <= asg.min_size:
        state = ClusterNodeState.ASG_MIN_SIZE
    elif busy_list:
        if node.unschedulable:
            state = ClusterNodeState.BUSY_UNSCHEDULABLE
        else:
            state = ClusterNodeState.BUSY
    elif pending_list:
        state = ClusterNodeState.POD_PENDING
    elif (idle_selector_hash[instance_type] >= self.TYPE_IDLE_COUNT and
          age <= self.idle_threshold):
        # there is already an instance of this type sitting idle
        # so we use the regular idle threshold for the grace period
        state = ClusterNodeState.GRACE_PERIOD
    elif (instance_type and
          idle_selector_hash[instance_type] < self.TYPE_IDLE_COUNT and
          age <= self.type_idle_threshold):
        # we don't have an instance of this type yet!
        # use the type idle threshold for the grace period
        # and mark the type as seen
        idle_selector_hash[instance_type] += 1
        state = ClusterNodeState.TYPE_GRACE_PERIOD
    else:
        if node.unschedulable:
            state = ClusterNodeState.IDLE_UNSCHEDULABLE
        else:
            state = ClusterNodeState.IDLE_SCHEDULABLE

    return state