Ejemplo n.º 1
0
async def MonitorTriadSets(spec, meta, **kwargs):
    """Recreate any pod of a TriadSet that is missing from the cluster.

    For every ordinal in ``range(spec['replicas'])`` the pod named
    ``<serviceName>-<ordinal>`` is looked up in the TriadSet's namespace.
    When the API server answers the lookup with HTTP 404, a new pod is built
    from ``spec['template']``, passed through ``kopf.adopt`` and created.

    Args:
        spec: TriadSet spec; ``replicas``, ``serviceName`` and ``template``
            are read.
        meta: TriadSet metadata; ``namespace`` and ``name`` are read.
        **kwargs: Remaining handler arguments; unused.

    Raises:
        ApiException: If a pod lookup fails with anything other than a 404,
            or if creating the replacement pod fails.
    """
    logger = NHDCommon.GetLogger(__name__)
    logger.debug(f'Kicking off controller timer for {meta["namespace"]}/{meta["name"]}')
    try:
        config.load_incluster_config()
    except Exception:
        # In-cluster configuration could not be loaded; use the local kubeconfig
        config.load_kube_config()

    v1 = client.CoreV1Api()

    for ordinal in range(spec['replicas']):
        podname = f'{spec["serviceName"]}-{ordinal}'
        try:
            v1.read_namespaced_pod(name=podname, namespace=meta["namespace"])
        except ApiException as e:
            if e.status != 404:
                # Any other failure (auth, timeout, server error) says nothing
                # about whether the pod exists, so do not try to recreate it.
                logger.error(f'Failed to look up Triad pod {podname} in namespace {meta["namespace"]}: {e}')
                raise
        else:
            # Pod is present; nothing to do for this ordinal
            continue

        logger.info(f'Triad pod {podname} not found in namespace {meta["namespace"]}, but TriadSet is still active. Restarting pod')
        podspec = yaml.dump(spec["template"])

        # Prepend the apiVersion/kind header so the template forms a complete Pod manifest
        podspec = f"apiVersion: v1\nkind: Pod\n{podspec}"

        # Reload the yaml to patch some fields
        podyaml = yaml.safe_load(podspec)
        podyaml['metadata']['name'] = podname  # Give it the canonical statefulset-type name

        # Patch in the hostname and subdomain to create a DNS record like a statefulset
        podyaml['spec']['hostname'] = podname
        podyaml['spec']['subdomain'] = meta["name"]
        kopf.adopt(podyaml)
        v1.create_namespaced_pod(namespace=meta['namespace'], body=podyaml)
Ejemplo n.º 2
0
    def __init__(self, dat, isFile):
        """
        Set up the logger and an empty topology, then load the configuration.

        dat is handed to LoadCfgFile when isFile is truthy and to LoadCfgStr
        otherwise.
        """
        self.logger = NHDCommon.GetLogger(__name__)
        self.cfg = None
        self.top: CfgTopology = CfgTopology()

        # Pick the loader that matches the kind of input, then run it
        load = self.LoadCfgFile if isFile else self.LoadCfgStr
        load(dat)
Ejemplo n.º 3
0
    def __init__(self, q: Queue):
        """
        Initialize the thread base class and the scheduler's starting state.

        q is stored as self.mainq. The version of the installed "nhd"
        distribution is looked up and logged at warning level.
        """
        threading.Thread.__init__(self)
        self.logger = NHDCommon.GetLogger(__name__)

        # Scheduling state, all empty at start-up
        self.nodes = {}
        self.k8s = K8SMgr.GetInstance()
        self.sched_name = NHD_SCHED_NAME
        self.whitelist = []
        self.matcher = Matcher()
        self.pod_state = {}
        self.mainq = q

        # Look up and report the installed package version
        distribution = pkg_resources.get_distribution("nhd")
        self.ver = distribution.version
        self.logger.warning(f'NHD version {self.ver}')
Ejemplo n.º 4
0
    def __init__(self):
        """
        Populate every topology field with its starting value and set up the logger.
        """
        # CPU-related settings
        self.arch: CpuType = CpuType.CPU_TYPE_ALL
        self.misc_cores: List[Core] = []
        self.proc_groups: List[ProcGroup] = []
        self.nic_core_pairing: List[NICGroup] = []
        self.misc_cores_smt: SMTSetting = SMTSetting.SMT_DISABLED
        self.map_type: TopologyMapType = TopologyMapType.TOPOLOGY_MAP_INVALID

        # Network and memory settings
        self.ctrl_vlan: VLANInfo = None
        self.data_default_gw: str = ''
        self.hugepages_gb = 0

        log = NHDCommon.GetLogger(__name__)
        self.logger = log
        log.info('Initializing matcher')
Ejemplo n.º 5
0
    def __init__(self, name):
        """
        Create an empty node record called name.

        All resource lists start empty and all counters start at zero.
        """
        self.logger = NHDCommon.GetLogger(__name__)
        self.name = name

        # Resources attached to the node
        self.cores: List[NodeCore] = []
        self.gpus = []
        self.nics = []
        self.mem: NodeMemory = NodeMemory()
        self.reserved_cores = []  # Reserved CPU cores

        # CPU layout
        self.sockets = 0
        self.numa_nodes = 0
        self.smt_enabled = False
        self.cores_per_proc = 0

        # Scheduling and networking state
        self.pods_scheduled = set()
        self.sriov_en = False
        self.data_vlan = 0
        self.gwip: str = '0.0.0.0/32'
Ejemplo n.º 6
0
    def __init__(self):
        """
        Initializes the logger and loads Kubernetes configuration

        In-cluster configuration is tried first; if loading it fails, the
        local kubeconfig is loaded instead. The new object is then recorded
        as the single K8SMgr instance.

        Raises:
            Exception: If a K8SMgr instance has already been recorded.
        """
        self.logger = NHDCommon.GetLogger(__name__)

        if K8SMgr.__instance is not None:
            raise Exception("Cannot create more than one K8SMgr!")

        try:
            config.load_incluster_config()
        except Exception:
            # Narrower than a bare except, so KeyboardInterrupt/SystemExit
            # are no longer swallowed while falling back to the kubeconfig.
            config.load_kube_config()

        self.v1 = client.CoreV1Api()
        self.last_seen_ver = None

        K8SMgr.__instance = self
Ejemplo n.º 7
0
    def __init__(self, name, active = True):
        """
        Create an empty node record called name.

        active is stored as given; the node starts in the 'default' group
        with empty resource lists and zeroed counters.
        """
        self.logger = NHDCommon.GetLogger(__name__)

        # Identity
        self.name = name
        self.active = active
        self.groups: List[str] = ['default']

        # Resources attached to the node
        self.cores: List[NodeCore] = []
        self.gpus = []
        self.nics = []
        self.mem: NodeMemory = NodeMemory()
        self.reserved_cores = []  # Reserved CPU cores

        # CPU layout
        self.sockets = 0
        self.numa_nodes = 0
        self.smt_enabled = False
        self.cores_per_proc = 0

        # Scheduling and networking state
        self.pod_info = {}
        self.data_vlan = 0
        self.gwip: str = '0.0.0.0/32'
Ejemplo n.º 8
0
def TriadNodeUpdate(spec, old, new, meta, **_):
    """Turn a node update into cordon/uncordon and group-update queue messages.

    The old and new node objects are compared for two kinds of change:

    * The NHD scheduler taint or the ``unschedulable`` flag being added or
      removed, which puts a node cordon or uncordon message on the queue.
    * The ``NHD_GROUP`` label being added, changed or removed, which puts a
      group-update message on the queue (with ``"default"`` on removal).

    Args:
        spec: Handler argument; unused.
        old: Previous node object; ``spec`` and ``metadata`` are read.
        new: Current node object; ``spec`` and ``metadata`` are read.
        meta: Node metadata; ``name`` is read.
        **_: Remaining handler arguments; unused.
    """
    logger = NHDCommon.GetLogger(__name__)

    def nhd_tainted(obj):
        # A node without taints may have no 'taints' key in its spec at all
        taints = obj['spec'].get('taints') or []
        return any(t['key'] == 'sigproc.viasat.io/nhd_scheduler' for t in taints)

    k8sq = qinst
    node = meta["name"]

    old_tainted = nhd_tainted(old)
    new_tainted = nhd_tainted(new)
    was_unschedulable = 'unschedulable' in old['spec']
    is_unschedulable = 'unschedulable' in new['spec']

    # If the NHD taint has been added/removed or the node has been cordoned/uncordoned, detect it here
    if (not old_tainted and new_tainted) or (was_unschedulable and not is_unschedulable and new_tainted):  # Uncordon
        logger.info(f'Uncordoning node {node}')
        k8sq.put({"type": NHDWatchTypes.NHD_WATCH_TYPE_NODE_UNCORDON, "node": node})
    elif (old_tainted and not new_tainted) or (not was_unschedulable and is_unschedulable):  # Cordon
        logger.info(f'Cordoning node {node}')
        k8sq.put({"type": NHDWatchTypes.NHD_WATCH_TYPE_NODE_CORDON, "node": node})

    old_labels = old['metadata'].get('labels') or {}
    new_labels = new['metadata'].get('labels') or {}

    # Detect NHD group changes. Only the NHD_GROUP value is compared, so a
    # change to an unrelated label does not produce a group update.
    if 'NHD_GROUP' in new_labels:
        new_group = new_labels['NHD_GROUP']
        if 'NHD_GROUP' not in old_labels or old_labels['NHD_GROUP'] != new_group:
            logger.info(f'Updating NHD group for node {node} to {new_group}')
            k8sq.put({"type": NHDWatchTypes.NHD_WATCH_TYPE_GROUP_UPDATE, "node": node, "groups": new_group})
    elif 'NHD_GROUP' in old_labels:  # Label removed
        logger.info(f'Updating NHD group for node {node} to default')
        k8sq.put({"type": NHDWatchTypes.NHD_WATCH_TYPE_GROUP_UPDATE, "node": node, "groups": "default"})
Ejemplo n.º 9
0
def delete_fn(meta, **_):
    """Log that a delete request for a TriadSet was received."""
    NHDCommon.GetLogger(__name__).info('Received delete request for TriadSet')
Ejemplo n.º 10
0
def TriadSetCreate(spec, meta, **_):
    """Log the service name, replica count and namespace of a new TriadSet."""
    log = NHDCommon.GetLogger(__name__)
    log.info(f'Found new TriadSet for component {spec["serviceName"]} with {spec["replicas"]} replicas in namespace {meta["namespace"]}')
Ejemplo n.º 11
0
def HandleExceptions(loop, context):
    """Log the failure described by context, then terminate the process.

    The "exception" entry of context is reported when present, otherwise its
    "message" entry. The process is ended with os._exit(-1).
    """
    log = NHDCommon.GetLogger(__name__)

    # "message" is read unconditionally because it serves as the fallback value
    fallback = context["message"]
    msg = context.get("exception", fallback)

    log.error(f"Caught exception: {msg}")
    log.info("Shutting down...")

    # Kill entire application and let k8s restart it. No state needs to be preserved
    os._exit(-1)
Ejemplo n.º 12
0
def TriadPodDelete(spec, meta, **_):
    """Log a deleted Triad pod and put a pod-delete message on the watch queue."""
    log = NHDCommon.GetLogger(__name__)
    log.info(f'Saw deleted Triad pod {meta["namespace"]}.{meta["name"]}')

    # Notify NHD of the event through the watch queue
    pod_ref = {"ns": meta["namespace"], "name": meta["name"]}
    event = {"type": NHDWatchTypes.NHD_WATCH_TYPE_TRIAD_POD_DELETE, "pod": pod_ref}
    qinst.put(event)
Ejemplo n.º 13
0
 def __init__(self, q: Queue):
     """
     Set up the logger and keep q as self.mainq.
     """
     self.logger = NHDCommon.GetLogger(__name__)
     self.mainq = q
Ejemplo n.º 14
0
 def __init__(self, q: Queue):
     """
     Keep q as self.mainq, set up the logger, then initialize the thread base class.
     """
     self.mainq = q
     self.logger = NHDCommon.GetLogger(__name__)

     # The base-class initializer runs last, as in the original ordering
     threading.Thread.__init__(self)
Ejemplo n.º 15
0
 def __init__(self):
     """
     Set up the logger and log that the matcher is being initialized.
     """
     log = NHDCommon.GetLogger(__name__)
     self.logger = log
     log.info('Initializing matcher')