def activate_profile(self, profile_name=None, job=None):
    """Activate a power profile for a job through the PMI backend.

    :param profile_name: EOE profile to activate; when None no per-vnode
        ``current_eoe`` update is attempted.
    :param job: job object; defaults to the current hook event's job.
    :returns: whatever the PMI backend's _activate_profile() returns.
    :raises BackendError: re-raised after best-effort refresh of this
        vnode's available profile names (resources_available["eoe"]).
    :raises InternalError: re-raised after marking this vnode offline.
    """
    self._check_pmi()
    if job is None:
        job = pbs.event().job
    try:
        ret = self.__pmi._activate_profile(profile_name, job)
        if profile_name is not None:
            hosts = _get_vnode_names(job)
            for h in hosts:
                # Best effort: a vnode missing from the event's
                # vnode_list must not fail the activation.
                try:
                    pbs.event().vnode_list[h].current_eoe = profile_name
                except Exception:
                    # Narrowed from bare "except:" so SystemExit and
                    # KeyboardInterrupt are no longer swallowed.
                    pass
        return ret
    except BackendError as e:
        # get fresh set of profile names, ignore errors
        mynode = pbs.event().vnode_list[pbs.get_local_nodename()]
        if mynode.power_provisioning:
            try:
                profiles = self.__pmi._query(pbs.Power.QUERY_PROFILE)
                names = self._map_profile_names(profiles)
                mynode.resources_available["eoe"] = names
                pbs.logmsg(pbs.LOG_WARNING,
                           "PMI:activate: set eoe: %s" % names)
            except Exception:
                # Narrowed from bare "except:"; refresh stays best-effort.
                pass
        raise BackendError(e)
    except InternalError as e:
        # couldn't do activation so set vnode offline
        me = pbs.get_local_nodename()
        pbs.event().vnode_list[me].state += pbs.ND_OFFLINE
        pbs.logmsg(pbs.LOG_WARNING, "PMI:activate: set vnode offline")
        raise InternalError(e)
def activate_profile(self, profile_name=None, job=None):
    """Ask the PMI backend to activate *profile_name* for *job*.

    On success, stamp ``current_eoe`` on each of the job's vnodes.
    On a backend failure, refresh the locally available profile names
    before re-raising; on an internal failure, offline the local vnode
    before re-raising.
    """
    self._check_pmi()
    if job is None:
        job = pbs.event().job
    try:
        result = self.__pmi._activate_profile(profile_name, job)
        if profile_name is not None:
            for vnode_name in _get_vnode_names(job):
                try:
                    pbs.event().vnode_list[vnode_name].current_eoe = \
                        profile_name
                except:
                    pass
        return result
    except BackendError as backend_err:
        # get fresh set of profile names, ignore errors
        local_vnode = pbs.event().vnode_list[pbs.get_local_nodename()]
        if local_vnode.power_provisioning:
            try:
                query_result = self.__pmi._query(pbs.Power.QUERY_PROFILE)
                profile_names = self._map_profile_names(query_result)
                local_vnode.resources_available["eoe"] = profile_names
                pbs.logmsg(pbs.LOG_WARNING,
                           "PMI:activate: set eoe: %s" % profile_names)
            except:
                pass
        raise BackendError(backend_err)
    except InternalError as internal_err:
        # couldn't do activation so set vnode offline
        local_name = pbs.get_local_nodename()
        pbs.event().vnode_list[local_name].state += pbs.ND_OFFLINE
        pbs.logmsg(pbs.LOG_WARNING, "PMI:activate: set vnode offline")
        raise InternalError(internal_err)
def ContinueChk(self,status,comment=''):
    """Act on a health-check result and decide whether to continue.

    status may be a [status, comment] list, a bool, or a string among
    'warn', 'offline', 'reboot', 'online'.  Returns True to continue,
    False to stop; the 'offline'/'reboot' path ends the hook via
    pbs.event().reject() instead of returning.
    """
    # Normalize the input: split a [status, comment] pair, lowercase a
    # plain string; booleans are left untouched.
    if isinstance(status,list):
        comment = str(status[1])
        status = status[0].lower()
    elif isinstance(status,bool) != True:
        status = status.lower()
    # Check to see how to handle the status
    pbs.logmsg(pbs.EVENT_DEBUG3,'Status: %s\tComment: %s'%(status,comment))
    # NOTE(review): "status == False" only matches a genuine bool False;
    # a lowercased string never equals False — confirm that is intended.
    if status == False:
        return False
    elif status == 'warn':
        # Non-fatal problem: log it and keep going.
        pbs.logmsg(pbs.EVENT_DEBUG,'WARNING: %s'%comment)
        return True
    elif status == 'offline' or status == 'reboot':
        pbs.logmsg(pbs.EVENT_DEBUG,"Status: %s\tComment: %s"%(status,comment))
        # Get the node, offline it,
        pbs.logmsg(pbs.EVENT_DEBUG,"Offline node: %s"%(self.host))
        myvnode = pbs.event().vnode_list[self.host]
        myvnode.state = pbs.ND_OFFLINE
        pbs.logmsg(pbs.EVENT_DEBUG,"Offline node type: %s, comment: %s"%(type(str(comment)),comment))
        myvnode.comment = "-attn_nhc: "+comment
        #pbs.logmsg(pbs.EVENT_DEBUG,"restart scheduler: %s %s"%(self.host,repr(myvnode.state)))
        #pbs.server().scheduler_restart_cycle()
        # Check to see if the node should be rebooted
        if status == 'reboot':
            pbs.logmsg(pbs.EVENT_DEBUG,"Comment: %s\nOfflined node: %s and rebooted"%(comment,self.host))
            # Requeue the job before asking MoM to reboot the node.
            pbs.event().job.rerun()
            pbs.reboot('reboot')
            # Run this command if the node is rebooted
            # The event().reject function ends the script
            pbs.logmsg(pbs.EVENT_DEBUG,"Comment: %s\nOfflined node: %s and restarted scheduling cycle"%(comment,self.host))
            pbs.event().reject("Offlined node, sent the reboot signal, and restarted scheduling cycle")
        # Reject the job
        pbs.event().reject("Offlined node and restarted scheduling cycle")
    elif status == 'online':
        pbs.logmsg(pbs.EVENT_DEBUG,"Onlined node: %s"%(self.host))
        mynodename = pbs.get_local_nodename()
        myvnode = pbs.event().vnode_list[mynodename]
        # NOTE(review): the next line repeats the lookup above — it is a
        # no-op duplicate; confirm it can be removed.
        mynodename = pbs.get_local_nodename()
        pbs.logmsg(pbs.EVENT_DEBUG3,"got node: %s"%(mynodename))
        myvnode.state = pbs.ND_FREE
        pbs.logmsg(pbs.EVENT_DEBUG,"Changed node state to ND_FREE: %s"%(mynodename))
        myvnode.comment = None
        # NOTE(review): this branch falls through without an explicit
        # return (returns None, which is falsy) — confirm callers treat
        # the 'online' path as success.
        pbs.logmsg(pbs.EVENT_DEBUG,"Onlined node: %s"%(mynodename))
    else:
        # Any unrecognized status is treated as "healthy, continue".
        return True
def __init__(self, **kwords):
    """Initialize node-health-check state: local host name, the JSON
    hook configuration, and the user/job this event applies to.

    :raises KeyError: when no hook config file is set and
        PBS_HOOK_CONFIG_FILE is missing from the environment.
    """
    self.host = ''
    self.user = ''
    self.job_id = ''
    self.nhc_cfg = None
    # Set up the values for host and user
    pbs.logmsg(pbs.EVENT_DEBUG3, "get node name")
    self.host = pbs.get_local_nodename()
    # Read in the configurations file
    pbs_hook_cfg = pbs.hook_config_filename
    if pbs_hook_cfg is None:
        pbs.logmsg(pbs.EVENT_DEBUG3, "%s" % os.environ)
        pbs_hook_cfg = os.environ["PBS_HOOK_CONFIG_FILE"]
    # BUG FIX: open the resolved path (pbs_hook_cfg), not
    # pbs.hook_config_filename, which is None whenever the environment
    # fallback above was taken.  The file is also closed deterministically.
    pbs.logmsg(pbs.EVENT_DEBUG3, "read config file: %s" % pbs_hook_cfg)
    with open(pbs_hook_cfg) as config_fp:
        self.nhc_cfg = json.load(config_fp)
    pbs.logmsg(pbs.EVENT_DEBUG3, "config file: %s" % self.nhc_cfg)
    # Check to make sure the event has a user associated with it
    pbs.logmsg(pbs.EVENT_DEBUG3, 'Event: %s' % pbs.event().type)
    if pbs.event().type != pbs.EXECHOST_PERIODIC:
        # Job_Owner looks like user@host; keep only the user part.
        self.user = repr(pbs.event().job.Job_Owner).split("@")[0].replace("'", "")
        self.job_id = pbs.event().job.id
    else:
        # Periodic events carry no job; synthesize a unique id.
        self.user = '******'
        self.job_id = str(time.time())
    pbs.logmsg(pbs.EVENT_DEBUG3, 'Done initializing NodeHealthCheck')
def __init__(self, **kwords):
    """Set up NodeHealthCheck: local host, hook config (JSON) and the
    user/job associated with the triggering event.

    :raises KeyError: when no hook config file is configured and
        PBS_HOOK_CONFIG_FILE is absent from the environment.
    """
    self.host = ''
    self.user = ''
    self.job_id = ''
    self.nhc_cfg = None
    # Set up the values for host and user
    pbs.logmsg(pbs.EVENT_DEBUG3, "get node name")
    self.host = pbs.get_local_nodename()
    # Read in the configurations file
    pbs_hook_cfg = pbs.hook_config_filename
    if pbs_hook_cfg is None:
        pbs.logmsg(pbs.EVENT_DEBUG3, "%s" % os.environ)
        pbs_hook_cfg = os.environ["PBS_HOOK_CONFIG_FILE"]
    pbs.logmsg(pbs.EVENT_DEBUG3,
               "read config file: %s" % pbs_hook_cfg)
    # BUG FIX: previously this opened pbs.hook_config_filename, which is
    # None when the environment fallback was used; open the resolved
    # pbs_hook_cfg instead, and close the handle via "with".
    with open(pbs_hook_cfg) as cfg_fp:
        self.nhc_cfg = json.load(cfg_fp)
    pbs.logmsg(pbs.EVENT_DEBUG3, "config file: %s" % self.nhc_cfg)
    # Check to make sure the event has a user associated with it
    pbs.logmsg(pbs.EVENT_DEBUG3, 'Event: %s' % pbs.event().type)
    if pbs.event().type != pbs.EXECHOST_PERIODIC:
        # Strip the @host suffix and the repr() quotes from Job_Owner.
        self.user = repr(pbs.event().job.Job_Owner).split("@")[
            0].replace("'", "")
        self.job_id = pbs.event().job.id
    else:
        # No job on periodic events; use a timestamp as a pseudo id.
        self.user = '******'
        self.job_id = str(time.time())
    pbs.logmsg(pbs.EVENT_DEBUG3, 'Done initializing NodeHealthCheck')
def is_it_exclusive(job):
    """ check to see if the job requested exclusive, or if the nodes
    are marked exclusive. This needs to be passed to ATOM. """
    place = str(job.Resource_List["place"])
    log_with_caller(pbs.EVENT_DEBUG4, "place is %s" % place)
    # See if the node sharing value has exclusive
    vn = pbs.server().vnode(pbs.get_local_nodename())
    sharing = vn.sharing
    log_with_caller(pbs.EVENT_DEBUG4, "The sharing value is %s type %s" %
                    (str(sharing), str(type(sharing))))
    # Uses the same logic as the scheduler (is_excl()):
    # forced node settings win, then the job's place= tokens,
    # then the node's default sharing attribute.
    if sharing in (pbs.ND_FORCE_EXCL, pbs.ND_FORCE_EXCLHOST):
        return True
    if sharing == pbs.ND_IGNORE_EXCL:
        return False
    place_tokens = place.split(':')
    for token in place_tokens:
        if token.startswith('excl'):
            return True
    for token in place_tokens:
        if token.startswith('shared'):
            return False
    if sharing in (pbs.ND_DEFAULT_EXCL, pbs.ND_DEFAULT_EXCLHOST):
        return True
    # pbs.ND_DEFAULT_SHARED and anything unrecognized: not exclusive.
    return False
def __init__(self, pbs_event):
    """Map hook event types to their handlers and capture the event's
    vnode list and the local node name.

    Accepts the event immediately (no-op) when either the vnode list or
    the local node name cannot be obtained.

    :param pbs_event: the pbs.event() object this hook runs for.
    """
    self.hook_events = {
        pbs.EXECHOST_STARTUP: self.__setallresources_handler,
        pbs.EXECHOST_PERIODIC: self.__setallresources_handler,
    }
    self.e = pbs_event
    self.vnl = pbs.event().vnode_list
    self.local_node = pbs.get_local_nodename()
    # BUG FIX: compare against None with "is", not "==".
    if self.vnl is None or self.local_node is None:
        # NOTE(review): self.hook_name is not assigned in this block —
        # presumably set elsewhere on the class; confirm.
        pbs.logmsg(pbs.EVENT_DEBUG,
                   "%s, failed to get local_node or vnl" % self.hook_name)
        self.e.accept()
def __init__(self, e):
    """Cache the hook event, parse the config, and look up the local
    vnode from the server; reject the event if that lookup fails.

    :param e: the pbs.event() object this hook runs for.
    """
    self.rc = -1
    self.e = e
    self.parse_cfg()
    self.nodename = pbs.get_local_nodename()
    try:
        self.node = pbs.server().vnode(self.nodename)
    except Exception:
        # Narrowed from a bare "except:" so SystemExit and
        # KeyboardInterrupt are no longer swallowed; the original
        # log-then-reject handling is preserved.
        pbs.logmsg(pbs.EVENT_DEBUG,
                   "Health-check hook; failed to get node info from server")
        self.e.reject()
    self.vnl = self.e.vnode_list
def main():
    """ Main function for execution """
    log_function_name()
    local_host = pbs.get_local_nodename()
    # Log the hook event type
    event = pbs.event()
    # Dispatch table: event type -> (handler, exception raised on alarm).
    dispatch = {
        pbs.EXECJOB_BEGIN: (handle_execjob_begin, OfflineError),
        pbs.EXECJOB_END: (handle_execjob_end, RejectError),
    }
    handler, timeout_exc = dispatch.get(event.type, (None, None))
    if not handler:
        # Unknown event: note it and accept (accept ends the hook).
        log_with_caller(pbs.EVENT_ERROR,
                        '%s event is not handled by this hook' % event.type,
                        jobid=False)
        event.accept()
    try:
        handler()
    except KeyboardInterrupt:
        # The alarm mechanism delivers KeyboardInterrupt on timeout;
        # convert it into the event-specific timeout exception.
        raise timeout_exc('Handler alarmed')
# to ignore any offline nids so we won't generate a # spurious HUP of the MoM if there are offline vnodes # whose nids are present in the apstat output. We use # the set.discard() method because it doesn't throw an # error if the nid isn't present in the apstat_nids_set. offline_nids_list.append(int(pbs_craynid)) else: pbs_nids_set.add(int(pbs_craynid)) if len(cray_login_list) == 0: msg += ["ALPS Inventory Check: No eligible " + "login nodes to perform inventory check"] __exit_hook(0, msg) cray_login_local_name = pbs.get_local_nodename() try: inventory_node = cray_login_list[0] inventory_addr = socket.gethostbyname(inventory_node) if ((inventory_addr not in my_addresses()) and ( inventory_node != cray_login_local_name)): msg += ["ALPS Inventory Check: Login node '%s' is in charge of " "verification, skipping check on '%s'." % (inventory_node, socket.gethostname())] __exit_hook(0, msg) start = time.time() apstat_nids_set = get_apstat_nids(msg) apstat_query_duration = time.time() - start
import pbs
import os
import sys
import time


def print_attribs(pbs_obj):
    """Log every attribute of a PBS object that has a non-empty value."""
    for a in pbs_obj.attributes:
        v = getattr(pbs_obj, a)
        # BUG FIX: identity comparison with None ("is not None"), not "!=".
        if v is not None and str(v) != "":
            pbs.logmsg(pbs.LOG_DEBUG, "%s = %s" % (a, v))


# print_attribs(pbs.server())
e = pbs.event()
print_attribs(e)
vn = e.vnode_list
print_attribs(vn[pbs.get_local_nodename()])
# Setting the resource value explicitly
x = 10
pbs.logmsg(pbs.LOG_DEBUG, "foo value is %s" % (x))
e.accept(0)
pbs.logmsg(pbs.EVENT_DEBUG, "stdout: %s" % out) pbs.logmsg(pbs.EVENT_DEBUG, "stderr: %s" % err) process = subprocess.Popen(['/sbin/ifconfig', 'ib0', ib0_ip], stdout=subprocess.PIPE) out, err = process.communicate() pbs.logmsg(pbs.EVENT_DEBUG, "stdout: %s" % out) pbs.logmsg(pbs.EVENT_DEBUG, "stderr: %s" % err) # Check to see if ib0 is back up try: pbs.logmsg(pbs.EVENT_DEBUG, get_ip_address('ib0')) except IOError: pbs.logmsg(pbs.EVENT_DEBUG, "Offline the node") # Offline the node vnlist = pbs.event().vnode_list hostname = pbs.get_local_nodename() pbs.logmsg(pbs.EVENT_DEBUG, "Offline hostname") for v in vnlist.keys(): pbs.logmsg(pbs.EVENT_DEBUG, "Node: %s" % v) vnlist[v].state = pbs.ND_OFFLINE vnlist[v].comment = "No ib0 on node" pbs.event().reject("No IB on node") else: try: pbs.logmsg(pbs.EVENT_DEBUG, get_ip_address('eth1')) except IOError: pbs.logmsg(pbs.EVENT_DEBUG, "eth1: not found") # Search the waagent logs and see if RDMA is enabled wa_fin = open("/var/log/waagent.log") wa_data = wa_fin.readlines() wa_fin.close()
elif event == 'epilogue': return_action = DELETE if proc.returncode == 2: return_action = RERUN rejectjob( '%s exited with a status of %s.' % (p + event, proc.returncode), return_action) else: if DEBUG: pbs.logmsg( pbs.LOG_DEBUG, '%s;%s;[DEBUG] %s exited with a status of 0.' % (hook_name, job.id, p + event)) if pbs_event.type == pbs.EXECJOB_PROLOGUE and VERBOSE_USER_OUTPUT: print '%s: attached as primary execution host.' % \ pbs.get_local_nodename() pbs_event.accept() else: rejectjob("The %s does not have the correct " % (p+event) + \ 'permissions. See the section entitled, ' + \ '"Prologue and Epilogue Requirements" in the PBS Pro ' + \ "Administrator's Guide.", RERUN) except SystemExit: pass except: trace_hook()
m = re.search('.*scratch_shared.*', i) if m: scratch_shared = True if scratch_shared: if "place" in j.Resource_List.keys(): m = re.search('.*group=.*', str(j.Resource_List["place"])) if not m: j.Resource_List["place"] = pbs.place( str(j.Resource_List["place"]) + ":group=cluster") else: j.Resource_List["place"] = pbs.place("group=cluster") if e.type == pbs.EXECJOB_BEGIN: j = e.job scratch_type = None node = pbs.get_local_nodename() config = parse_cfg() pbs.logmsg(pbs.EVENT_DEBUG, "scratch hook, node: %s" % node) pbs.logmsg( pbs.EVENT_DEBUG, "scratch hook, %s has exec_vnode: %s" % (j.id, str(j.exec_vnode))) resources = parse_exec_vnode(j.exec_vnode) pbs.logmsg( pbs.EVENT_DEBUG, "scratch hook, %s scratch resources: %s" % (j.id, str(resources))) # pokud byl pro node zadan typ scratche, nastavime ho do scratch_type
this_event.reject(str(e)) # Set eoe values for my node if this_event.type == pbs.EXECHOST_STARTUP: from pbs.v1._pmi_utils import _is_node_provisionable # Don't connect if the server or sched is running. if not _is_node_provisionable(): pbs.logmsg(pbs.LOG_DEBUG, "Provisioning cannot be enabled on this host") this_event.accept() power = init_power(this_event) profiles = power.query(pbs.Power.QUERY_PROFILE) if profiles is not None: me = pbs.get_local_nodename() this_event.vnode_list[me].resources_available[ "eoe"] = power._map_profile_names(profiles) power.disconnect() this_event.accept() # Gather energy usage for all jobs if this_event.type == pbs.EXECHOST_PERIODIC: # Check if any jobs are running if len(this_event.job_list) == 0: this_event.accept() power = init_power(this_event) for jobid in this_event.job_list: # set energy usage
def get_avail(scratch_type, running_jobs):
    """Return the usable free space (KiB) of a scratch partition.

    Usable space is the partition's total size minus the "dead" data
    left behind by jobs that are no longer running, corrected so that
    space reserved by jobs currently running on this node is not
    counted as available.

    :param scratch_type: key into the global scratch_paths mapping.
    :param running_jobs: ids of jobs still running; their directories
        are not counted as dead.
    :returns: available size in KiB, never negative; 0 when the
        partition cannot be examined.
    """
    global scratch_paths
    mount = os.path.join("/", scratch_paths[scratch_type])
    try:
        # One statvfs call yields both totals (the original queried the
        # same path twice); narrowed from a bare "except:".
        s = os.statvfs("/%s/" % scratch_paths[scratch_type])
    except OSError:
        return 0
    # the total size of the scratch partition (KiB)
    total_size = (s.f_bsize * s.f_blocks) / 1024
    free_size = (s.f_bsize * s.f_bavail) / 1024
    # for each user directory and each job directory in the scratch dir
    # check the deadsize but only if the job is not runnning
    total_dead_size = 0
    for user in os.listdir(mount):
        user_path = os.path.join(mount, user)
        if not os.path.isdir(user_path):
            continue
        for job in os.listdir(user_path):
            job_path = os.path.join(user_path, job)
            if not os.path.isdir(job_path):
                continue
            if job.replace("job_", "") in running_jobs:
                # do not count running jobs
                continue
            total_dead_size += get_deadsize(job_path)
    # check files outside job directories
    total_dead_size += get_nonjob_trash(mount)
    workspace = total_size - total_dead_size
    # reserved = SUM(reserved[j], j in running_jobs)
    local_node = pbs.get_local_nodename()  # hoisted: loop-invariant
    reserved = 0
    for job in pbs.event().job_list.keys():
        resources = parse_exec_vnode(
            pbs.event().job_list[job].exec_vnode)
        node_res = resources.get(local_node)
        if (node_res is not None
                and "scratch_type" in node_res
                and scratch_type == node_res["scratch_type"]):
            # BUG FIX: accumulate (+=) instead of overwriting, matching
            # the SUM() described in the comment above.
            reserved += node_res[scratch_type]
    free_correction = max(0, workspace - reserved - free_size)
    pbs.logmsg(
        pbs.EVENT_DEBUG,
        "scratch hook %s total_dead_size: %d workspace: %d reserved: %d "
        "free_correction %d free_size %s" %
        (scratch_type, total_dead_size, workspace, reserved,
         free_correction, free_size))
    return max(0, workspace - free_correction)
cray_login_total = len(cray_login) if cray_login_total > 0: pbs.logmsg( pbs.LOG_DEBUG, "PBS/ALP Inventory Check: Total Eligible Cray Login Nodes = %s" % (cray_login_total)) else: pbs.logmsg( pbs.LOG_ERROR, "PBS/ALP Inventory Check: NO ELIGIBLE Cray Login Nodes to perform inventory check!!" ) pbs.logmsg(pbs.LOG_DEBUG, "PBS/ALP Inventory Check: FINISH") sys.exit(0) # Determine the local cray_login index number cray_login_local_name = pbs.get_local_nodename() if ADDITIONAL_DEBUG: pbs.logmsg( pbs.LOG_DEBUG, "PBS/ALP Inventory Check: ADDITIONAL DEBUG, Cray Login Local Name = %s" % (cray_login_local_name)) # Evaluate whether cray_login_local_name is in the cray_login list try: cray_login_index = cray_login.index(str(cray_login_local_name)) pbs.logmsg( pbs.LOG_DEBUG, "PBS/ALP Inventory Check: Evaluating Cray Login node (%s, %s) for executing hook" % (cray_login_local_name, cray_login_index))
pbs.logmsg(pbs.EVENT_DEBUG4, "cmd: %s" % cmd) # Collect the job substate information process = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = process.communicate() pbs.logmsg(pbs.EVENT_DEBUG, "%s: Output: %s" % (caller_name(), out)) pbs.logmsg(pbs.EVENT_DEBUG, "%s: Error: %s" % (caller_name(), err)) pbs.logmsg(pbs.EVENT_DEBUG, "%s: Return Code: %s" % (caller_name(), process.returncode)) return(True) except Exception as exc: pbs.logmsg(pbs.EVENT_DEBUG, "%s: Unexpected error: %s" % (caller_name(), exc)) return(False) mom = pbs.get_local_nodename().lower() mom = mom.split(".")[0] pbs.logmsg(pbs.EVENT_DEBUG, "Mom: %s" % mom) try: if e.job.in_ms_mom(): # See if the job requests beeond # [Rework] Look for ENV that requests beeond or do it at the chunk level if "place" in j.Resource_List: results = str(j.Resource_List["place"]).find("excl") pbs.logmsg(pbs.EVENT_DEBUG, "Results: %s" % (results)) if str(j.Resource_List["place"]).find("excl") == -1: pbs.logmsg(pbs.EVENT_DEBUG, "Not an exclusive job: %s" % (j.Resource_List["place"])) e.accept() else: # Check to see if PBS_JOB_FS=BEEOND is in the job environment env = j.Variable_List
if ADDITIONAL_DEBUG: for cl in cray_login: pbs.logmsg(pbs.LOG_DEBUG, "PBS/ALP Inventory Check: ADDITIONAL DEBUG, Eligible Cray Login Nodes = %s" % (cl)) # Determine the total number of cray_login nodes cray_login_total = len(cray_login) if cray_login_total > 0: pbs.logmsg(pbs.LOG_DEBUG, "PBS/ALP Inventory Check: Total Eligible Cray Login Nodes = %s" % (cray_login_total)) else: pbs.logmsg(pbs.LOG_ERROR, "PBS/ALP Inventory Check: NO ELIGIBLE Cray Login Nodes to perform inventory check!!") pbs.logmsg(pbs.LOG_DEBUG, "PBS/ALP Inventory Check: FINISH") sys.exit(0) # Determine the local cray_login index number cray_login_local_name = pbs.get_local_nodename() if ADDITIONAL_DEBUG: pbs.logmsg(pbs.LOG_DEBUG, "PBS/ALP Inventory Check: ADDITIONAL DEBUG, Cray Login Local Name = %s" % (cray_login_local_name)) # Evaluate whether cray_login_local_name is in the cray_login list try: cray_login_index = cray_login.index(str(cray_login_local_name)) pbs.logmsg(pbs.LOG_DEBUG, "PBS/ALP Inventory Check: Evaluating Cray Login node (%s, %s) for executing hook" % (cray_login_local_name, cray_login_index)) # Start the clock at the beginning of the hour: 0mins time = 0 time = (cray_login_index - 1) * 5