def get_node_state(self, node, node_pods, pods_to_schedule):
        """
        returns the ClusterNodeState for the given node

        params:
        node - KubeNode object
        node_pods - list of KubePods assigned to this node
        pods_to_schedule - list of all pending pods; only checked for
            being non-empty

        The checks run in priority order: busy, pod pending, grace period,
        under-utilized (drainable or undrainable) and finally idle.
        """

        # we consider a node to be busy if it's running any non-DaemonSet pods
        # TODO: we can be a bit more aggressive in killing pods that are
        # replicated
        busy_list = [
            p for p in node_pods
            if (not p.is_mirrored() and 'kube-proxy' not in p.name)
        ]

        # total_seconds() is required here: timedelta.seconds only holds the
        # sub-day remainder, so a node older than 24h (or one whose creation
        # time is slightly in the future) would report a bogus age and could
        # sit in the grace period forever.
        age = (datetime.datetime.now(node.creation_time.tzinfo) -
               node.creation_time).total_seconds()

        # TODO: Fix this kube-proxy issue, see
        # https://github.com/openai/kubernetes-ec2-autoscaler/issues/23
        undrainable_list = [
            p for p in node_pods
            if not (p.is_drainable() or 'kube-proxy' in p.name)
        ]

        # resources requested by the busy pods, compared against a fraction
        # (UTIL_THRESHOLD) of the node's capacity
        utilization = sum((p.resources for p in busy_list), KubeResource())
        under_utilized = (self.UTIL_THRESHOLD * node.capacity -
                          utilization).possible
        drainable = not undrainable_list

        if busy_list and not under_utilized:
            if node.unschedulable:
                state = ClusterNodeState.BUSY_UNSCHEDULABLE
            else:
                state = ClusterNodeState.BUSY
        elif pods_to_schedule and not node.unschedulable:
            state = ClusterNodeState.POD_PENDING
        # elif is_spare_agent:
        #     state = ClusterNodeState.SPARE_AGENT
        elif age <= self.idle_threshold and not node.unschedulable:
            state = ClusterNodeState.GRACE_PERIOD
        elif under_utilized and (busy_list or not node.unschedulable):
            if drainable:
                state = ClusterNodeState.UNDER_UTILIZED_DRAINABLE
            else:
                state = ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE
                # logger.info('Undrainable pods: {}'.format(
                #         undrainable_list))
        else:
            if node.unschedulable:
                state = ClusterNodeState.IDLE_UNSCHEDULABLE
            else:
                state = ClusterNodeState.IDLE_SCHEDULABLE

        return state
Example #2
0
    def get_node_state(self, node, asg, node_pods, pods_to_schedule,
                       running_insts_map, idle_selector_hash):
        """
        returns the ClusterNodeState for the given node

        params:
        node - KubeNode object
        asg - AutoScalingGroup object that this node belongs in. can be None.
        node_pods - list of KubePods assigned to this node
        pods_to_schedule - mapping whose values are lists of pending pods
        running_insts_map - map of all (instance_id -> instance object);
            the instance's launch_time is used to compute the node age
        idle_selector_hash - current map of idle nodes by type. it is
            incremented for the node's type when TYPE_GRACE_PERIOD is returned.

        The checks run in priority order: instance terminated, detached,
        dead, ASG at min size, busy, pod pending, launch-hour grace period,
        grace period, type grace period, under-utilized and finally idle.
        """
        pending_list = []
        for pods in pods_to_schedule.values():
            for pod in pods:
                # a pod is considered schedulable onto this node if all the
                # node selectors match
                # AND it doesn't use pod affinity (which we don't support yet)
                if (node.is_match(pod) and
                        'scheduler.alpha.kubernetes.io/affinity' not in pod.annotations):
                    pending_list.append(pod)
        # we consider a node to be busy if it's running any non-DaemonSet pods
        # TODO: we can be a bit more aggressive in killing pods that are
        # replicated
        busy_list = [p for p in node_pods if not p.is_mirrored()]
        undrainable_list = [p for p in node_pods if not p.is_drainable()]
        utilization = sum((p.resources for p in busy_list), KubeResource())
        under_utilized = (self.UTIL_THRESHOLD *
                          node.capacity - utilization).possible
        drainable = not undrainable_list

        maybe_inst = running_insts_map.get(node.instance_id)
        if maybe_inst:
            # total_seconds() rather than .seconds: the latter only holds
            # the sub-day remainder, which made instances older than 24h look
            # freshly launched to the idle-threshold checks below.
            age = int((datetime.datetime.now(maybe_inst.launch_time.tzinfo)
                       - maybe_inst.launch_time).total_seconds())
            logger.warning('AGE: %s', age)
            # seconds elapsed since the start of the current launch hour
            launch_hour_offset = age % 3600
        else:
            # never consulted: a missing instance returns INSTANCE_TERMINATED
            # before either value is used
            age = None
            launch_hour_offset = None

        instance_type = utils.selectors_to_hash(
            asg.selectors) if asg else node.instance_type

        # truthy while fewer than TYPE_IDLE_COUNT nodes of this type have
        # been recorded in idle_selector_hash
        type_spare_capacity = (instance_type and self.type_idle_threshold and
                               idle_selector_hash[instance_type] < self.TYPE_IDLE_COUNT)

        if maybe_inst is None:
            return ClusterNodeState.INSTANCE_TERMINATED

        if node.is_detached():
            return ClusterNodeState.DETACHED

        if node.is_dead():
            return ClusterNodeState.DEAD

        if asg and len(asg.nodes) <= asg.min_size:
            return ClusterNodeState.ASG_MIN_SIZE

        if busy_list and not under_utilized:
            if node.unschedulable:
                return ClusterNodeState.BUSY_UNSCHEDULABLE
            return ClusterNodeState.BUSY

        if pending_list and not node.unschedulable:
            # logger.warn('PENDING: %s', pending_list)
            return ClusterNodeState.POD_PENDING

        if launch_hour_offset < self.LAUNCH_HOUR_THRESHOLD[node.provider] and not node.unschedulable:
            return ClusterNodeState.LAUNCH_HR_GRACE_PERIOD

        # elif node.provider == 'azure':
            # disabling scale down in azure for now while we ramp up
            # TODO: remove once azure is bootstrapped
            # state = ClusterNodeState.GRACE_PERIOD

        if (not type_spare_capacity and age <= self.idle_threshold) and not node.unschedulable:
            # there is already an instance of this type sitting idle
            # so we use the regular idle threshold for the grace period
            return ClusterNodeState.GRACE_PERIOD

        if (type_spare_capacity and age <= self.type_idle_threshold) and not node.unschedulable:
            # we don't have an instance of this type yet!
            # use the type idle threshold for the grace period
            # and mark the type as seen
            idle_selector_hash[instance_type] += 1
            return ClusterNodeState.TYPE_GRACE_PERIOD

        if under_utilized and (busy_list or not node.unschedulable):
            # nodes that are under utilized (but not completely idle)
            # have their own states to tell if we should drain them
            # for better binpacking or not
            if drainable:
                return ClusterNodeState.UNDER_UTILIZED_DRAINABLE
            return ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE

        if node.unschedulable:
            return ClusterNodeState.IDLE_UNSCHEDULABLE
        return ClusterNodeState.IDLE_SCHEDULABLE
import json

from autoscaler.config import Config
from autoscaler.kube import KubeResource

# RESOURCE_SPEC maps each top-level key of the capacity data file to a
# {instance_type: KubeResource} dict. Every entry should denote the amount
# of resources available to workload pods on a new, clean node, i.e. the
# resources used by system pods have to be accounted for; that is why
# Config.CAPACITY_CPU_RESERVE is subtracted from the raw cpu figure.
with open(Config.CAPACITY_DATA, 'r') as f:
    data = json.load(f)

RESOURCE_SPEC = {}
for key, instance_map in data.items():
    RESOURCE_SPEC[key] = {}
    for instance_type, resource_spec in instance_map.items():
        # the loaded dict is adjusted in place, so `data` carries the
        # reduced cpu value as well
        resource_spec['cpu'] -= Config.CAPACITY_CPU_RESERVE
        RESOURCE_SPEC[key][instance_type] = KubeResource(**resource_spec)

# selector keys looked up in pod.selectors by is_possible()
DEFAULT_TYPE_SELECTOR_KEY = 'aws/type'
DEFAULT_CLASS_SELECTOR_KEY = 'aws/class'
COMPUTING_SELECTOR_KEY = 'openai/computing'


def is_possible(pod):
    """
    returns whether the pod is possible under the maximum allowable capacity
    """

    computing = pod.selectors.get(COMPUTING_SELECTOR_KEY, 'false')
    selector = pod.selectors.get(DEFAULT_TYPE_SELECTOR_KEY)
    class_ = pod.selectors.get(DEFAULT_CLASS_SELECTOR_KEY)
    def get_node_state(self, node, asg, node_pods, pods_to_schedule,
                       running_insts_map, idle_selector_hash):
        """
        returns the ClusterNodeState for the given node

        params:
        node - KubeNode object
        asg - AutoScalingGroup object that this node belongs in. can be None.
        node_pods - list of KubePods assigned to this node
        pods_to_schedule - mapping whose values are lists of pending pods
        running_insts_map - map of all (instance_id -> instance object);
            the instance's launch_time is used to compute the node age
        idle_selector_hash - current map of idle nodes by type. it is
            incremented for the node's type when TYPE_GRACE_PERIOD is chosen.

        The checks run in priority order: instance terminated, ASG at min
        size, busy, pod pending, grace period, type grace period,
        under-utilized and finally idle.
        """
        # pending pods whose selectors match this node
        pending_list = []
        for pods in pods_to_schedule.values():
            for pod in pods:
                if node.is_match(pod):
                    pending_list.append(pod)
        # we consider a node to be busy if it's running any non-DaemonSet pods
        # TODO: we can be a bit more aggressive in killing pods that are
        # replicated
        busy_list = [p for p in node_pods if not p.is_mirrored()]
        undrainable_list = [p for p in node_pods if not p.is_replicated()]
        utilization = sum((p.resources for p in busy_list), KubeResource())
        under_utilized = (self.UTIL_THRESHOLD * node.capacity - utilization).possible
        drainable = not undrainable_list

        maybe_inst = running_insts_map.get(node.instance_id)

        if maybe_inst:
            # total_seconds() rather than .seconds: the latter only holds
            # the sub-day remainder, which made instances older than 24h look
            # freshly launched to the idle-threshold checks below.
            age = (datetime.datetime.now(maybe_inst.launch_time.tzinfo)
                   - maybe_inst.launch_time).total_seconds()
        else:
            # never consulted: a missing instance is reported as
            # INSTANCE_TERMINATED before any age comparison
            age = None

        instance_type = utils.selectors_to_hash(asg.selectors) if asg else node.instance_type

        if maybe_inst is None:
            state = ClusterNodeState.INSTANCE_TERMINATED
        elif asg and len(asg.nodes) <= asg.min_size:
            state = ClusterNodeState.ASG_MIN_SIZE
        elif busy_list and not under_utilized:
            if node.unschedulable:
                state = ClusterNodeState.BUSY_UNSCHEDULABLE
            else:
                state = ClusterNodeState.BUSY
        elif pending_list and not node.unschedulable:
            state = ClusterNodeState.POD_PENDING
        elif ((not self.type_idle_threshold or idle_selector_hash[instance_type] >= self.TYPE_IDLE_COUNT)
              and age <= self.idle_threshold) and not node.unschedulable:
            # there is already an instance of this type sitting idle
            # so we use the regular idle threshold for the grace period
            state = ClusterNodeState.GRACE_PERIOD
        elif (instance_type and idle_selector_hash[instance_type] < self.TYPE_IDLE_COUNT
              and age <= self.type_idle_threshold) and not node.unschedulable:
            # we don't have an instance of this type yet!
            # use the type idle threshold for the grace period
            # and mark the type as seen
            idle_selector_hash[instance_type] += 1
            state = ClusterNodeState.TYPE_GRACE_PERIOD
        elif under_utilized and not node.unschedulable:
            if drainable:
                state = ClusterNodeState.UNDER_UTILIZED_DRAINABLE
            else:
                state = ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE
        else:
            if node.unschedulable:
                state = ClusterNodeState.IDLE_UNSCHEDULABLE
            else:
                state = ClusterNodeState.IDLE_SCHEDULABLE

        return state