async def MonitorTriadSets(spec, meta, **kwargs):
    """
    Recreates any replica pod of a TriadSet that is missing from its namespace.

    For every ordinal in range(spec['replicas']) the pod named
    '<serviceName>-<ordinal>' is looked up. When the API server answers with a
    404 the pod is rebuilt from spec['template'] and created again. Any other
    API error is logged and that replica is skipped, since the pod's real state
    is unknown.

    Args:
        spec: TriadSet spec; 'replicas', 'serviceName' and 'template' are read.
        meta: TriadSet metadata; 'namespace' and 'name' are read.
        **kwargs: Remaining handler arguments (unused).

    Raises:
        ApiException: if creating a replacement pod fails.
    """
    logger = NHDCommon.GetLogger(__name__)
    logger.debug(f'Kicking off controller timer for {meta["namespace"]}/{meta["name"]}')

    # Prefer in-cluster credentials and fall back to the local kubeconfig.
    # Catch Exception rather than everything so KeyboardInterrupt/SystemExit
    # and task cancellation are not swallowed.
    try:
        config.load_incluster_config()
    except Exception:
        config.load_kube_config()

    v1 = client.CoreV1Api()

    for ordinal in range(spec['replicas']):
        podname = f'{spec["serviceName"]}-{ordinal}'

        try:
            v1.read_namespaced_pod(name=podname, namespace=meta["namespace"])
        except ApiException as e:
            if e.status != 404:
                # Not a "missing pod" answer; do not blindly create a duplicate
                logger.error(f'Failed to look up Triad pod {podname} in namespace {meta["namespace"]}: {e}')
                continue
        else:
            # Pod is present, nothing to do for this replica
            continue

        logger.info(f'Triad pod {podname} not found in namespace {meta["namespace"]}, but TriadSet is still active. Restarting pod')

        podspec = yaml.dump(spec["template"])
        # Prepend the Pod header so the template becomes a complete manifest
        podspec = f"apiVersion: v1\nkind: Pod\n{podspec}"

        # Reload the yaml to patch some fields
        podyaml = yaml.safe_load(podspec)
        # Give it the canonical statefulset-type name
        podyaml.setdefault('metadata', {})['name'] = podname

        # Patch in the hostname and subdomain to create a DNS record like a statefulset
        podyaml['spec']['hostname'] = podname
        podyaml['spec']['subdomain'] = meta["name"]

        # Mark the new pod as belonging to the object currently being handled
        kopf.adopt(podyaml)
        v1.create_namespaced_pod(namespace=meta['namespace'], body=podyaml)
def __init__(self, dat, isFile):
    """
    Creates the logger, an empty topology, and loads the configuration.

    Args:
        dat: Passed to LoadCfgFile when isFile is truthy, otherwise to LoadCfgStr.
        isFile: Selects which of the two loaders receives dat.
    """
    self.logger = NHDCommon.GetLogger(__name__)
    self.cfg = None
    self.top: CfgTopology = CfgTopology()

    # Pick the loader once, then hand it the input
    loader = self.LoadCfgFile if isFile else self.LoadCfgStr
    loader(dat)
def __init__(self, q: Queue):
    """
    Initializes the thread base class and the scheduler's bookkeeping state.

    Args:
        q: Queue stored as self.mainq.
    """
    threading.Thread.__init__(self)
    self.logger = NHDCommon.GetLogger(__name__)

    # Empty bookkeeping containers
    self.nodes = {}
    self.pod_state = {}
    self.whitelist = []

    # Collaborators and fixed settings
    self.k8s = K8SMgr.GetInstance()
    self.sched_name = NHD_SCHED_NAME
    self.matcher = Matcher()
    self.mainq = q

    # Report the installed "nhd" distribution version at startup
    nhd_dist = pkg_resources.get_distribution("nhd")
    self.ver = nhd_dist.version
    self.logger.warning(f'NHD version {self.ver}')
def __init__(self):
    """
    Sets every topology field to its initial value and creates the logger.
    """
    # Enumerated settings start at their "all" / "disabled" / "invalid" members
    self.arch: CpuType = CpuType.CPU_TYPE_ALL
    self.misc_cores_smt: SMTSetting = SMTSetting.SMT_DISABLED
    self.map_type: TopologyMapType = TopologyMapType.TOPOLOGY_MAP_INVALID

    # Collections are filled in later
    self.misc_cores: List[Core] = []
    self.proc_groups: List[ProcGroup] = []
    self.nic_core_pairing: List[NICGroup] = []

    # Scalar settings
    self.ctrl_vlan: VLANInfo = None
    self.data_default_gw: str = ''
    self.hugepages_gb = 0

    log = NHDCommon.GetLogger(__name__)
    self.logger = log
    log.info('Initializing matcher')
def __init__(self, name):
    """
    Creates an empty node record.

    Args:
        name: Stored as self.name.
    """
    self.logger = NHDCommon.GetLogger(__name__)
    self.name = name

    # Hardware inventory, empty until populated
    self.cores: List[NodeCore] = []
    self.gpus = []
    self.nics = []
    self.reserved_cores = []  # Reserved CPU cores
    self.mem: NodeMemory = NodeMemory()

    # Counters and flags
    self.sockets = 0
    self.numa_nodes = 0
    self.cores_per_proc = 0
    self.smt_enabled = False
    self.sriov_en = False

    # Networking defaults
    self.data_vlan = 0
    self.gwip: str = '0.0.0.0/32'

    # Pods placed on this node
    self.pods_scheduled = set()
def __init__(self):
    """
    Initializes the logger and loads Kubernetes configuration

    In-cluster configuration is tried first; if loading it fails, the local
    kube config is loaded instead. The new object is then registered as the
    single K8SMgr instance.

    Raises:
        Exception: if a K8SMgr instance has already been registered.
    """
    self.logger = NHDCommon.GetLogger(__name__)

    if K8SMgr.__instance is not None:
        raise Exception("Cannot create more than one K8SMgr!")

    # Catch Exception rather than everything so that KeyboardInterrupt and
    # SystemExit are not silently turned into a kubeconfig fallback.
    try:
        config.load_incluster_config()
    except Exception:
        config.load_kube_config()

    self.v1 = client.CoreV1Api()
    self.last_seen_ver = None
    K8SMgr.__instance = self
def __init__(self, name, active = True):
    """
    Creates an empty node record.

    Args:
        name: Stored as self.name.
        active: Stored as self.active; defaults to True.
    """
    self.logger = NHDCommon.GetLogger(__name__)

    # Identity
    self.name = name
    self.active = active
    self.groups: List[str] = ['default']

    # Hardware inventory, empty until populated
    self.cores: List[NodeCore] = []
    self.gpus = []
    self.nics = []
    self.reserved_cores = []  # Reserved CPU cores
    self.mem: NodeMemory = NodeMemory()

    # Counters and flags
    self.sockets = 0
    self.numa_nodes = 0
    self.cores_per_proc = 0
    self.smt_enabled = False

    # Networking defaults
    self.data_vlan = 0
    self.gwip: str = '0.0.0.0/32'

    # Per-pod bookkeeping
    self.pod_info = {}
def TriadNodeUpdate(spec, old, new, meta, **_):
    """
    Translates a node update into cordon/uncordon and group-update messages.

    The old and new node objects are compared and the resulting events are put
    on the module-level watch queue (qinst):

    * NHD_WATCH_TYPE_NODE_UNCORDON when the NHD taint was added, or when
      'unschedulable' disappeared from the spec while the node is NHD-tainted.
    * NHD_WATCH_TYPE_NODE_CORDON when the NHD taint was removed, or when
      'unschedulable' appeared in the spec.
    * NHD_WATCH_TYPE_GROUP_UPDATE when the NHD_GROUP label was added, changed
      value, or was removed (in which case the group is reported as "default").

    Args:
        spec: Unused.
        old: Node object before the update; 'spec' and 'metadata' are read.
        new: Node object after the update; 'spec' and 'metadata' are read.
        meta: Node metadata; 'name' is read.
    """
    logger = NHDCommon.GetLogger(__name__)
    k8sq = qinst

    def NHDTainted(obj):
        # A node without taints may have no 'taints' key at all; treat as empty
        taints = obj['spec'].get('taints') or []
        return any(x['key'] == 'sigproc.viasat.io/nhd_scheduler' for x in taints)

    old_tainted = NHDTainted(old)
    new_tainted = NHDTainted(new)
    was_unschedulable = 'unschedulable' in old['spec']
    now_unschedulable = 'unschedulable' in new['spec']

    # If the NHD taint has been added/removed or the node has been cordoned/uncordoned, detect it here
    if (not old_tainted and new_tainted) or (was_unschedulable and not now_unschedulable and new_tainted):
        # Uncordon
        logger.info(f'Uncordoning node {meta["name"]}')
        k8sq.put({"type": NHDWatchTypes.NHD_WATCH_TYPE_NODE_UNCORDON, "node": meta["name"]})
    elif (old_tainted and not new_tainted) or (not was_unschedulable and now_unschedulable):
        # Cordon
        logger.info(f'Cordoning node {meta["name"]}')
        k8sq.put({"type": NHDWatchTypes.NHD_WATCH_TYPE_NODE_CORDON, "node": meta["name"]})

    # Detect NHD group changes. If the label didn't exist, or it's now different than the old one, send the new one
    old_labels = old['metadata'].get('labels') or {}
    new_labels = new['metadata'].get('labels') or {}
    had_group = 'NHD_GROUP' in old_labels
    has_group = 'NHD_GROUP' in new_labels

    # Compare only the NHD_GROUP value so unrelated label edits do not trigger a group update
    if has_group and (not had_group or old_labels['NHD_GROUP'] != new_labels['NHD_GROUP']):
        logger.info(f'Updating NHD group for node {meta["name"]} to {new_labels["NHD_GROUP"]}')
        k8sq.put({"type": NHDWatchTypes.NHD_WATCH_TYPE_GROUP_UPDATE, "node": meta["name"], "groups": new_labels['NHD_GROUP']})
    elif had_group and not has_group:
        # Label removed
        logger.info(f'Updating NHD group for node {meta["name"]} to default')
        k8sq.put({"type": NHDWatchTypes.NHD_WATCH_TYPE_GROUP_UPDATE, "node": meta["name"], "groups": "default"})
def delete_fn(meta, **_):
    """
    Logs that a TriadSet delete request was received.

    Args:
        meta: Unused.
    """
    NHDCommon.GetLogger(__name__).info('Received delete request for TriadSet')
def TriadSetCreate(spec, meta, **_):
    """
    Logs the service name, replica count and namespace of a new TriadSet.

    Args:
        spec: TriadSet spec; 'serviceName' and 'replicas' are read.
        meta: TriadSet metadata; 'namespace' is read.
    """
    NHDCommon.GetLogger(__name__).info(
        f'Found new TriadSet for component {spec["serviceName"]} with {spec["replicas"]} replicas in namespace {meta["namespace"]}'
    )
def HandleExceptions(loop, context):
    """
    Logs the failure described by context and terminates the process.

    Args:
        loop: Unused.
        context: Mapping with a "message" entry and optionally an "exception"
            entry; the exception is preferred when present.
    """
    log = NHDCommon.GetLogger(__name__)

    # "message" is read unconditionally; "exception" wins when it exists
    fallback = context["message"]
    msg = context.get("exception", fallback)

    log.error(f"Caught exception: {msg}")
    log.info("Shutting down...")

    # Kill entire application and let k8s restart it. No state needs to be preserved
    os._exit(-1)
def TriadPodDelete(spec, meta, **_):
    """
    Logs a deleted Triad pod and puts a pod-delete message on the watch queue.

    Args:
        spec: Unused.
        meta: Pod metadata; 'namespace' and 'name' are read.
    """
    NHDCommon.GetLogger(__name__).info(f'Saw deleted Triad pod {meta["namespace"]}.{meta["name"]}')

    # Notify NHD of the event through the module-level watch queue
    event = {
        "type": NHDWatchTypes.NHD_WATCH_TYPE_TRIAD_POD_DELETE,
        "pod": {"ns": meta["namespace"], "name": meta["name"]},
    }
    qinst.put(event)
def __init__(self, q: Queue):
    """
    Creates the logger and keeps a reference to the queue.

    Args:
        q: Queue stored as self.mainq.
    """
    self.logger = NHDCommon.GetLogger(__name__)
    self.mainq = q
def __init__(self, q: Queue):
    """
    Initializes the thread base class, the logger, and the queue reference.

    Args:
        q: Queue stored as self.mainq.
    """
    threading.Thread.__init__(self)
    self.mainq = q
    self.logger = NHDCommon.GetLogger(__name__)
def __init__(self):
    """
    Creates the logger and announces matcher initialization.
    """
    log = NHDCommon.GetLogger(__name__)
    self.logger = log
    log.info('Initializing matcher')