def classify_job(selection, walltime, express=False, queue=None):
    names = ""
    ret = []
    for clssname in sorted(classifications.keys()):
        clssset = classifications[clssname]
        for clss in clssset:
            if (queue is None) or clssname == queue:
                if match_class(selection, walltime, clssname, clss, express):
                    clss["target_queue"] = clssname
                    ret.append(clss)
                    names = names + clssname + " "

    # Always go with the first if more than one class has matched (lexical order of class name).
    # if len(ret) > 1:
    #     pbs.event().reject("Job matches multiple classes: [ " + names.strip() + "]")
    if len(ret):
        return ret[0]
    else:
        pbs.logmsg(
            pbs.LOG_ERROR,
            "FAILED TO MATCH A JOB:" +
            repr(pbs.event().job.Resource_List["select"]) +
            " walltime " +
            repr(pbs.event().job.Resource_List["walltime"]))
        pbs.event().reject(
            "Job resource selection does not match any permitted configuration.\n"
            " Please review the CX1 Job Sizing guidance on:\n"
            " https://bit.ly/2AInEIj\n")
def __init__(self, **kwords):
    self.host = ''
    self.user = ''
    self.job_id = ''
    self.nhc_cfg = None

    # Set up the values for host and user
    pbs.logmsg(pbs.EVENT_DEBUG3, "get node name")
    self.host = pbs.get_local_nodename()

    # Read in the configuration file, falling back to the environment if the
    # hook config filename is not set
    pbs_hook_cfg = pbs.hook_config_filename
    if pbs_hook_cfg is None:
        pbs.logmsg(pbs.EVENT_DEBUG3, "%s" % os.environ)
        pbs_hook_cfg = os.environ["PBS_HOOK_CONFIG_FILE"]
    pbs.logmsg(pbs.EVENT_DEBUG3, "read config file: %s" % pbs_hook_cfg)
    config_file = open(pbs_hook_cfg).read()
    self.nhc_cfg = json.loads(config_file)
    pbs.logmsg(pbs.EVENT_DEBUG3, "config file: %s" % self.nhc_cfg)

    # Check to make sure the event has a user associated with it
    pbs.logmsg(pbs.EVENT_DEBUG3, 'Event: %s' % pbs.event().type)
    if pbs.event().type != pbs.EXECHOST_PERIODIC:
        self.user = repr(pbs.event().job.Job_Owner).split("@")[0].replace("'", "")
        self.job_id = pbs.event().job.id
    else:
        self.user = '******'
        self.job_id = str(time.time())
    pbs.logmsg(pbs.EVENT_DEBUG3, 'Done initializing NodeHealthCheck')
def queue_name():
    if pbs.event().job.queue is None:
        return ""
    if isinstance(pbs.event().job.queue, str):
        return pbs.event().job.queue
    else:
        return pbs.event().job.queue.name
def activate_profile(self, profile_name=None, job=None):
    self._check_pmi()
    if job is None:
        job = pbs.event().job
    try:
        ret = self.__pmi._activate_profile(profile_name, job)
        if profile_name is not None:
            hosts = _get_vnode_names(job)
            for h in hosts:
                try:
                    pbs.event().vnode_list[h].current_eoe = profile_name
                except:
                    pass
        return ret
    except BackendError as e:
        # get fresh set of profile names, ignore errors
        mynode = pbs.event().vnode_list[pbs.get_local_nodename()]
        if mynode.power_provisioning:
            try:
                profiles = self.__pmi._query(pbs.Power.QUERY_PROFILE)
                names = self._map_profile_names(profiles)
                mynode.resources_available["eoe"] = names
                pbs.logmsg(pbs.LOG_WARNING,
                           "PMI:activate: set eoe: %s" % names)
            except:
                pass
        raise BackendError(e)
    except InternalError as e:
        # couldn't do activation so set vnode offline
        me = pbs.get_local_nodename()
        pbs.event().vnode_list[me].state += pbs.ND_OFFLINE
        pbs.logmsg(pbs.LOG_WARNING, "PMI:activate: set vnode offline")
        raise InternalError(e)
def extract_walltime():
    if pbs.event().job.Resource_List["walltime"] is None:
        pbs.event().reject(
            "You must specify a walltime using the format\n -lwalltime=HH:MM:00")
    # Return the requested walltime in hours
    wt = pbs.event().job.Resource_List["walltime"]
    wt = float(wt) / 3600.
    return wt
def execjob_end():
    pbs.logmsg(pbs.LOG_DEBUG, "Removing any GPUs assigned to the job")
    job = pbs.event().job.id
    vn = pbs.event().vnode_list
    # The value does not matter; just remove the job from the file after completion
    value = ""
    UpdateGpuJobs(job, value, GpuJobsPath, add=False)
def check_pq_restriction(selection, walltime, queue):
    if queue not in private_queue_restrictions:
        # There wasn't a restriction for this private queue, so accept it
        return
    clss = private_queue_restrictions[queue]
    for c in clss:
        if match_class(selection, walltime, queue, c, False):
            return True
    pbs.event().reject(
        "The private queue has insufficient resources for a job this size.\n"
        "Please consult https://selfservice.rcs.imperial.ac.uk/pqs/nodes/%s to see what's available.\n"
        "For any queries please contact us via [email protected]" % (queue))
def validate_config(cls):
    """
    Validate the config file

    This will check if the unix_socket_file resolves to a file.
    """
    if not os.path.exists(cls.get_config()['unix_socket_file']):
        log_with_caller(pbs.EVENT_DEBUG4,
                        'Unix socket file does not exist, skipping hook',
                        jobid=False)
        pbs.event().accept()
def __init__(self):
    PBS_SPOOL = os.path.join(pbs_conf()['PBS_MOM_HOME'], 'spool')
    self.stdout_log = os.path.join(PBS_SPOOL, '%s.OU' % str(pbs.event().job.id))
    self.stderr_log = os.path.join(PBS_SPOOL, '%s.ER' % str(pbs.event().job.id))
    if str(pbs.event().job.Join_Path) == 'oe':
        self.stderr_log = self.stdout_log
    elif str(pbs.event().job.Join_Path) == 'eo':
        self.stdout_log = self.stderr_log
def deactivate_profile(self, job=None):
    self._check_pmi()
    if job is None:
        job = pbs.event().job
    if _running_excl(job):
        pbs.logjobmsg(job.id, "PMI: reset current_eoe")
        for h in _get_vnode_names(job):
            try:
                pbs.event().vnode_list[h].current_eoe = None
            except:
                pass
    return self.__pmi._deactivate_profile(job)
def log_with_caller(sev, mes, caller=0, jobid=True):
    """
    Wrapper to pbs.logmsg with the caller's name prepended

    Increment caller to get the caller of the calling function.
    If jobid is true, add the jobid from the event to the log message.
    """
    if jobid:
        pbs.logmsg(sev, '%s:%s:%s: %s' % (pbs.event().hook_name,
                                          pbs.event().job.id,
                                          caller_name(2 + caller), mes))
    else:
        pbs.logmsg(sev, '%s:%s: %s' % (pbs.event().hook_name,
                                       caller_name(2 + caller), mes))
def test_group_membership(permitted_groups):
    import pbs
    import sys
    # PBS_EXEC = '/opt/pbs/default'
    GETENT_CMD = '/bin/getent'

    # Main
    # sys.path.append(PBS_EXEC + '/python/lib/python2.7')
    # sys.path.append(PBS_EXEC + '/python/lib/python2.7/lib-dynload')
    from subprocess import Popen, PIPE

    e = pbs.event()
    j = e.job

    # Get the username
    who = str(e.requestor)

    # Check the user against the members of each permitted group
    for g in permitted_groups:
        output = Popen([GETENT_CMD, "group", g],
                       stdout=PIPE).communicate()[0].strip()
        output = output.split(':')[-1].split(',')
        if who in output:
            return True
    return False
def handle_execjob_begin():
    """
    Handler for execjob_begin events.
    """
    log_function_name()
    event = pbs.event()
    jid = event.job.id
    uid = pwd.getpwnam(event.job.euser).pw_uid
    log_with_caller(pbs.EVENT_DEBUG4, 'UID is %d' % uid)
    data = {'jobid': jid, 'uid': uid}
    url = HookHelper.build_path(resource='job')
    timeout = HookHelper.get_config()['post_timeout']
    try:
        r = post(url, json=data, timeout=timeout)
        r.raise_for_status()
    except requests.Timeout:
        log_with_caller(pbs.EVENT_ERROR, 'POST timed out')
        raise OfflineError('Job POST timed out')
    except requests.HTTPError:
        if r.status_code == 400:
            retry_post(data)
        else:
            log_with_caller(pbs.EVENT_ERROR,
                            'Invalid status code %d' % r.status_code)
            raise OfflineError('Job POST encountered invalid status code')
    log_with_caller(pbs.EVENT_DEBUG, 'Job %s registered' % jid)
def connect(self, endpoint=None, port=None, job=None):
    self._check_pmi()
    if job is None:
        try:
            job = pbs.event().job
        except EventIncompatibleError:
            pass
    return self.__pmi._connect(endpoint, port, job)
def disconnect(self, job=None):
    self._check_pmi()
    if job is None:
        try:
            job = pbs.event().job
        except EventIncompatibleError:
            pass
    return self.__pmi._disconnect(job)
def ContinueChk(self, status, comment=''):
    if isinstance(status, list):
        comment = str(status[1])
        status = status[0].lower()
    elif not isinstance(status, bool):
        status = status.lower()

    # Check to see how to handle the status
    pbs.logmsg(pbs.EVENT_DEBUG3, 'Status: %s\tComment: %s' % (status, comment))
    if status == False:
        return False
    elif status == 'warn':
        pbs.logmsg(pbs.EVENT_DEBUG, 'WARNING: %s' % comment)
        return True
    elif status == 'offline' or status == 'reboot':
        pbs.logmsg(pbs.EVENT_DEBUG, "Status: %s\tComment: %s" % (status, comment))
        # Get the node and offline it
        pbs.logmsg(pbs.EVENT_DEBUG, "Offline node: %s" % (self.host))
        myvnode = pbs.event().vnode_list[self.host]
        myvnode.state = pbs.ND_OFFLINE
        pbs.logmsg(pbs.EVENT_DEBUG, "Offline node type: %s, comment: %s" %
                   (type(str(comment)), comment))
        myvnode.comment = "-attn_nhc: " + comment
        # pbs.logmsg(pbs.EVENT_DEBUG, "restart scheduler: %s %s" % (self.host, repr(myvnode.state)))
        # pbs.server().scheduler_restart_cycle()

        # Check to see if the node should be rebooted
        if status == 'reboot':
            pbs.logmsg(pbs.EVENT_DEBUG,
                       "Comment: %s\nOfflined node: %s and rebooted" % (comment, self.host))
            pbs.event().job.rerun()
            # Run this command if the node is rebooted
            pbs.reboot('reboot')
            # The event().reject function ends the script
            pbs.logmsg(pbs.EVENT_DEBUG,
                       "Comment: %s\nOfflined node: %s and restarted scheduling cycle" %
                       (comment, self.host))
            pbs.event().reject(
                "Offlined node, sent the reboot signal, and restarted scheduling cycle")

        # Reject the job
        pbs.event().reject("Offlined node and restarted scheduling cycle")
    elif status == 'online':
        pbs.logmsg(pbs.EVENT_DEBUG, "Onlined node: %s" % (self.host))
        mynodename = pbs.get_local_nodename()
        myvnode = pbs.event().vnode_list[mynodename]
        pbs.logmsg(pbs.EVENT_DEBUG3, "got node: %s" % (mynodename))
        myvnode.state = pbs.ND_FREE
        pbs.logmsg(pbs.EVENT_DEBUG, "Changed node state to ND_FREE: %s" % (mynodename))
        myvnode.comment = None
        pbs.logmsg(pbs.EVENT_DEBUG, "Onlined node: %s" % (mynodename))
    else:
        return True
def GetReqGpus():
    sel = repr(pbs.event().job.schedselect)
    req_gpus = 0
    for chunk in sel.split('+'):
        for c in chunk.split(":"):
            kv = c.split("=")
            if kv[0] == "ngpus":
                req_gpus = kv[1]
                pbs.logmsg(pbs.LOG_DEBUG,
                           "Selected ngpus = value=%s" % (str(kv[1])))
    return req_gpus
def pbs_expandrange(seq):
    # Deal with the ranges first
    bounds = list(seq.partition('-'))
    lower = int(bounds[0])
    upper = int(bounds[2])
    if upper <= lower:
        pbs.logmsg(pbs.LOG_ERROR,
                   "translate mpp: ERROR: bad range '%s', "
                   "the first number (%d) must be less than the "
                   "second number (%d)" % (seq, lower, upper))
        errstr = "Bad range '%s', the first number (%d) must be less than " \
                 "second number (%d)" % (seq, lower, upper)
        pbs.event().reject("The following error was encountered: \n" + errstr)
    expandedlist = list()
    while lower <= upper:
        expandedlist.append(str(lower))
        lower = lower + 1
    return expandedlist
def stderr(self, msg):
    """Write msg to the appropriate file handle for stderr"""
    import sys
    try:
        if not pbs.event().job.interactive and pbs.event().job.in_ms_mom():
            logfile = open(self.stderr_log, 'ab+')
        else:
            logfile = sys.stderr
        if DEBUG:
            pbs.logmsg(
                pbs.EVENT_DEBUG3, '%s;%s;[DEBUG3]: writing %s to %s' %
                (pbs.event().hook_name, pbs.event().job.id,
                 repr(msg), logfile.name))
        logfile.write(msg)
        logfile.flush()
        logfile.close()
    except IOError:
        trace_hook()
def fixup_mpiprocs_ompthreads(sel):
    selstr = repr(pbs.event().job.Resource_List["select"])
    if "mpiprocs" not in sel and "ompthreads" not in sel:
        mpiprocs = int(sel["ncpus"])
        ompthreads = 1
        pbs.event().job.Resource_List["select"] = pbs.select(
            selstr + ":mpiprocs=" + str(mpiprocs) + ":ompthreads=" + str(ompthreads))
    elif "mpiprocs" not in sel and "ompthreads" in sel:
        # Add mpiprocs = ncpus / ompthreads
        ompthreads = int(sel["ompthreads"])
        mpiprocs = (sel["ncpus"] / ompthreads)
        if mpiprocs < 1:
            mpiprocs = 1
        pbs.event().job.Resource_List["select"] = pbs.select(
            selstr + ":mpiprocs=" + str(mpiprocs))
    elif "mpiprocs" in sel and "ompthreads" not in sel:
        mpiprocs = int(sel["mpiprocs"])
        ompthreads = int(sel["ncpus"]) / mpiprocs
        if ompthreads < 1:
            ompthreads = 1
        pbs.event().job.Resource_List["select"] = pbs.select(
            selstr + ":ompthreads=" + str(ompthreads))
    else:
        mpiprocs = int(sel["mpiprocs"])
        ompthreads = int(sel["ompthreads"])
        if (mpiprocs * ompthreads) != int(sel["ncpus"]):
            pbs.event().reject("mpiprocs * ompthreads must equal ncpus")
def __init__(self, pbs_event):
    self.hook_events = {
        pbs.EXECHOST_STARTUP: self.__setallresources_handler,
        pbs.EXECHOST_PERIODIC: self.__setallresources_handler,
    }
    self.e = pbs_event
    self.vnl = pbs.event().vnode_list
    self.local_node = pbs.get_local_nodename()
    if self.vnl is None or self.local_node is None:
        pbs.logmsg(pbs.EVENT_DEBUG,
                   "%s, failed to get local_node or vnl" % self.hook_name)
        self.e.accept()
def execjob_launch():
    try:
        if pbs.event().env["PBS_TASKNUM"] != "1":
            pbs.logmsg(pbs.LOG_DEBUG,
                       "Not the first task, so not setting CUDA_VISIBLE_DEVICES")
            return
    except:
        pbs.logmsg(pbs.LOG_DEBUG, "Exception in getting PBS_TASKNUM from env")

    job = pbs.event().job.id
    vn = pbs.event().vnode_list
    req_gpus = GetReqGpus()
    if req_gpus:
        cuda_visible_devices = ""
        available_gpus = [i for i in range(0, numGpusOnHost())]
        # pbs.server().vnode(local_node).resources_available['ngpus']) ]

        # Check the GPUs already assigned on the node
        used_gpus = GetUsedGpus(GpuJobsPath)
        pbs.logmsg(pbs.LOG_DEBUG, "Used GPUs = %s" % (used_gpus))
        if used_gpus:
            available_gpus = [item for item in available_gpus
                              if item not in used_gpus]
        pbs.logmsg(pbs.LOG_DEBUG,
                   "GPUs available for assignment %s" % (available_gpus))
        for i in range(int(req_gpus)):
            if cuda_visible_devices != "":
                cuda_visible_devices += "\\,"
            cuda_visible_devices += str(available_gpus.pop(0))
        value = cuda_visible_devices
        pbs.logmsg(pbs.LOG_DEBUG,
                   "The Cuda visible devices is ==> %s" % (value))
        pbs.event().env['CUDA_VISIBLE_DEVICES'] = str(value)
        UpdateGpuJobs(job, value, GpuJobsPath, add=True)
def exechost_periodic():
    vn = pbs.event().vnode_list
    JobsOnNode = getJobs()
    JobsOnFile = GetJobsInFile(GpuJobsPath)
    pbs.logmsg(pbs.LOG_DEBUG, "Jobs in File %s" % (JobsOnFile))
    for job in JobsOnFile:
        if not (str(job) in JobsOnNode.keys()):
            # Remove this job
            value = ""
            UpdateGpuJobs(job, value, GpuJobsPath, add=False)
            pbs.logmsg(
                pbs.LOG_DEBUG,
                "Removing Job %s from file %s --> The job is not running on the node"
                % (job, GpuJobsPath))
def check_express_project_code():
    project = pbs.event().job.project
    if not project:
        pbs.event().reject(
            "You must specify an express code with -P when submitting express jobs")
    project = repr(project)
    if not re.match("^exp-[a-z0-9]+$", project):
        pbs.event().reject(
            "Invalid express code: these have the format 'exp-XXXX'")
    if not test_group_membership([project]):
        pbs.event().reject("You are not authorised to use this express code")
    try:
        import requests
        r = requests.get(
            "https://api.rcs.imperial.ac.uk/v1.0/express/%s/enabled" % (project,))
        if (r.status_code == 200) and (r.text != "1"):
            pbs.event().reject(
                "This express code is not enabled. Please contact [email protected]")
    except:
        # pbs.event().reject("Exception checking express enabled")
        pass
    return project
def extract_selection():
    if ("select" not in pbs.event().job.Resource_List) or (
            pbs.event().job.Resource_List["select"] is None):
        pbs.event().reject(
            "You must specify a resource selection using the format\n"
            " -lselect=N:ncpus=X:mem=Ygb")
    select = repr(pbs.event().job.Resource_List["select"])
    select = select.split("+")
    if len(select) > 1:
        pbs.event().reject("Only one -lselect is permitted.")
    chunk = select[0]
    ret = dict()
    try:
        nodect = 0
        chunk = chunk.split(":")
        nodect = int(chunk[0])
        ret["nodect"] = nodect
        for rs in chunk[1:]:
            key = rs.split("=")[0]
            val = rs.split("=")[1]
            if key not in list_of_resources:
                pbs.event().reject("Resource [" + key + "] not permitted in -lselect.")
            # Try converting the value to an integer if it happens to be one
            try:
                val = int(val)
            except:
                pass
            ret[key] = val
    except:
        pass
    return ret
def retry_post(data):
    """
    In the case where a POST fails due to a 400 error, it could be because
    there is already a job on the Cray side. In that case, we should try to
    delete the existing job and resubmit a new one. If a previous POST timed
    out and we rejected it, but the service just took too long to respond,
    the job would still exist on the service.
    """
    event = pbs.event()
    jid = event.job.id
    joburl = HookHelper.build_path(resource='job', jobid=jid)
    del_timeout = HookHelper.get_config()['delete_timeout']
    try:
        r_del = delete(joburl, timeout=del_timeout)
        r_del.raise_for_status()
    except requests.Timeout:
        log_with_caller(pbs.EVENT_ERROR, 'DELETE timed out')
        raise OfflineError('Job delete timed out')
    except requests.HTTPError:
        # If 404, then maybe the job that was there is now gone,
        # so try posting again. Otherwise, raise an OfflineError.
        if r_del.status_code != 404:
            log_with_caller(pbs.EVENT_ERROR, 'DELETE job failed')
            raise OfflineError('Job delete failed')
    url = HookHelper.build_path(resource='job')
    post_timeout = HookHelper.get_config()['post_timeout']
    try:
        r_post = post(url, json=data, timeout=post_timeout)
        r_post.raise_for_status()
    except requests.Timeout:
        log_with_caller(pbs.EVENT_ERROR, 'POST timed out')
        raise OfflineError('Job POST timed out')
    except requests.HTTPError:
        log_with_caller(pbs.EVENT_ERROR,
                        'Invalid status code %d' % r_post.status_code)
        raise OfflineError('Job POST encountered invalid status code')
    # If we got here, we've successfully deleted and re-posted the job
    log_with_caller(pbs.EVENT_DEBUG, 'Job %s registered' % jid)
    return
def _get_usage(self, job):
    pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: %s get_usage" % (job.id))
    try:
        f = open(energy_file(job), "r")
        start = int(f.read())
        f.close()
    except Exception:
        return None
    e = pbs.event()
    if e.type == pbs.EXECHOST_PERIODIC:
        # This function will be called for each job in turn when
        # running from a periodic hook. Here we fill in some
        # global variables just once and use the information
        # for each job in turn. Save the result of calling capmc
        # for all running jobs in the variable ninfo. Keep a
        # dictionary with the job id's as keys holding a set
        # of nid numbers.
        if Pmi.ninfo is None:
            allnids = set()
            for jobid in e.job_list.keys():
                j = e.job_list[jobid]
                nidset = jobnids(j)
                allnids.update(nidset)
                Pmi.nidarray[jobid] = nidset
            nids, cnt = nidlist(None, allnids)
            Pmi.ninfo = node_energy("all", nids, cnt)
        nidset = Pmi.nidarray[job.id]
        energy = None
        if Pmi.ninfo is not None and "nodes" in Pmi.ninfo:
            energy = 0
            for node in Pmi.ninfo["nodes"]:
                if node["nid"] in nidset:
                    # owned by the job of interest
                    energy += node["energy_ctr"]
            pbs.logjobmsg(job.id, "Cray: get_usage: energy %dJ" % energy)
    else:
        nids, cnt = nidlist(job)
        energy = job_energy(job, nids, cnt)
    if energy is not None:
        return float(energy - start) / 3600000.0
    else:
        return None
def handle_execjob_end():
    """
    Handler for execjob_end events.
    """
    log_function_name()
    jid = pbs.event().job.id
    url = HookHelper.build_path(resource='job', jobid=jid)
    timeout = HookHelper.get_config()['delete_timeout']
    try:
        r = delete(url, timeout=timeout)
        r.raise_for_status()
    except requests.Timeout:
        log_with_caller(pbs.EVENT_ERROR, 'DELETE timed out')
        raise RejectError('Job delete timed out')
    except requests.HTTPError:
        log_with_caller(pbs.EVENT_ERROR, 'DELETE job failed')
        raise RejectError('Job delete failed')
    log_with_caller(pbs.EVENT_DEBUG, 'Job %s deleted' % jid)
def jobobit_hook():
    import pbs
    import sys
    try:
        e = pbs.event()
        job = e.job
        pbs.logjobmsg(job.id, 'jobobit hook started for test %s' % (e.hook_name,))
        pbs.logjobmsg(job.id, 'jobobit hook, job starttime:%s' % (job.stime,))
        pbs.logjobmsg(job.id, 'jobobit hook, job obittime:%s' % (job.obittime,))
        pbs.logjobmsg(job.id, 'jobobit hook, job_state=%s' % (job.job_state,))
        pbs.logjobmsg(job.id, 'jobobit hook, job_substate=%s' % (job.substate,))
        state_desc = pbs.REVERSE_JOB_STATE.get(job.job_state, '(None)')
        substate_desc = pbs.REVERSE_JOB_SUBSTATE.get(job.substate, '(None)')
        pbs.logjobmsg(job.id, 'jobobit hook, job_state_desc=%s' % (state_desc,))
        pbs.logjobmsg(job.id, 'jobobit hook, job_substate_desc=%s' % (substate_desc,))
        if hasattr(job, "resv") and job.resv:
            pbs.logjobmsg(job.id, 'jobobit hook, resv:%s' % (job.resv.resvid,))
            pbs.logjobmsg(job.id,
                          'jobobit hook, resv_nodes:%s' % (job.resv.resv_nodes,))
            pbs.logjobmsg(job.id,
                          'jobobit hook, resv_state:%s' % (job.resv.reserve_state,))
        else:
            pbs.logjobmsg(job.id, 'jobobit hook, resv:(None)')
        pbs.logjobmsg(job.id, 'jobobit hook finished for test %s' % (e.hook_name,))
    except Exception as err:
        ty, _, tb = sys.exc_info()
        pbs.logmsg(pbs.LOG_DEBUG,
                   str(ty) + str(tb.tb_frame.f_code.co_filename) + str(tb.tb_lineno))
        e.reject()
    else:
        e.accept()
def extract_queue_type():
    if pbs.event().job.queue == "" or pbs.event().job.queue is None:
        return "common"
    if pbs.event().job.queue != "":
        queue_name = pbs.event().job.queue.name
        if queue_name == "express":  # len(queue_name) > 0 and queue_name[0] == "e":
            return "express"
        elif len(queue_name) > 0 and (queue_name[0] == "p" or
                                      queue_name == "med-bio" or
                                      queue_name == "viz"):
            return "private"
        elif queue_name.startswith(queue_config_version):
            queue_name = re.sub("^" + queue_config_version, "", queue_name)
            return "common:" + queue_name
        elif queue_name == "gpgpu":
            pbs.event().reject(
                "-q gpgpu no longer required. Please submit without a queue qualification")
        else:
            pbs.event().reject("Unknown queue name.")
def parse_config_file():
    # Turn everything off by default. These settings may be modified
    # when the configuration file is read.
    global pbs_home
    global pbs_exec
    global power_ramp_rate_enable
    global power_on_off_enable
    global node_idle_limit
    global min_node_down_delay
    global max_jobs_analyze_limit
    global max_concurrent_nodes

    try:
        # This block will work for PBS Pro versions 13 and later
        pbs_conf = pbs.get_pbs_conf()
        pbs_home = pbs_conf['PBS_HOME']
        pbs_exec = pbs_conf['PBS_EXEC']
    except:
        pbs.logmsg(pbs.EVENT_DEBUG,
                   "PBS_HOME needs to be defined in the config file")
        pbs.logmsg(pbs.EVENT_DEBUG, "Exiting the power hook")
        pbs.event().accept()

    # Identify the config file and read in the data
    config_file = ''
    if 'PBS_HOOK_CONFIG_FILE' in os.environ:
        config_file = os.environ["PBS_HOOK_CONFIG_FILE"]
    tmpcfg = ''
    if not config_file:
        tmpcfg = os.path.join(pbs_home, 'server_priv', 'hooks', 'PBS_power.CF')
        if os.path.isfile(tmpcfg):
            config_file = tmpcfg
    if not config_file:
        tmpcfg = os.path.join(pbs_home, 'mom_priv', 'hooks', 'PBS_power.CF')
        if os.path.isfile(tmpcfg):
            config_file = tmpcfg
    if not config_file:
        raise Exception("Config file not found")
    pbs.logmsg(pbs.EVENT_DEBUG3, "Config file is %s" % config_file)
    try:
        fd = open(config_file, 'r')
        config = json.load(fd)
        fd.close()
    except IOError:
        raise Exception("I/O error reading config file")
    except:
        raise Exception("Error reading config file")

    # Assign default values to attributes
    power_ramp_rate_enable = False
    power_on_off_enable = False
    node_idle_limit = 1800
    min_node_down_delay = 1800
    max_jobs_analyze_limit = 100
    max_concurrent_nodes = 10

    # Now assign the values read from the config file
    if 'power_on_off_enable' in config:
        power_on_off_enable = config['power_on_off_enable']
        pbs.logmsg(pbs.EVENT_DEBUG3,
                   "power_on_off_enable is set to %s" % str(power_on_off_enable))
    if 'power_ramp_rate_enable' in config:
        power_ramp_rate_enable = config['power_ramp_rate_enable']
        pbs.logmsg(pbs.EVENT_DEBUG3,
                   "power_ramp_rate_enable is set to %s" % str(power_ramp_rate_enable))
    if 'node_idle_limit' in config:
        node_idle_limit = int(config['node_idle_limit'])
        if not node_idle_limit or node_idle_limit < 0:
            node_idle_limit = 1800
        pbs.logmsg(pbs.EVENT_DEBUG3,
                   "node_idle_limit is set to %d" % node_idle_limit)
    if 'min_node_down_delay' in config:
        min_node_down_delay = int(config['min_node_down_delay'])
        if not min_node_down_delay or min_node_down_delay < 0:
            min_node_down_delay = 1800
        pbs.logmsg(pbs.EVENT_DEBUG3,
                   "min_node_down_delay is set to %d" % min_node_down_delay)
    if 'max_jobs_analyze_limit' in config:
        max_jobs_analyze_limit = int(config['max_jobs_analyze_limit'])
        if not max_jobs_analyze_limit or max_jobs_analyze_limit < 0:
            max_jobs_analyze_limit = 100
        pbs.logmsg(pbs.EVENT_DEBUG3,
                   "max_jobs_analyze_limit is set to %d" % max_jobs_analyze_limit)
    if 'max_concurrent_nodes' in config:
        max_concurrent_nodes = int(config['max_concurrent_nodes'])
        if not max_concurrent_nodes or max_concurrent_nodes < 0:
            max_concurrent_nodes = 10
        pbs.logmsg(pbs.EVENT_DEBUG3,
                   "max_concurrent_nodes is set to %d" % max_concurrent_nodes)
    if 'max_jobs_analyze_limit' in config:
        max_jobs_analyze_limit = int(config['max_jobs_analyze_limit'])
        if not max_jobs_analyze_limit or max_jobs_analyze_limit < 0:
            max_jobs_analyze_limit = 100
        pbs.logmsg(pbs.EVENT_DEBUG3,
                   "max_jobs_analyze_limit is set to %d" % max_jobs_analyze_limit)
    if 'max_concurrent_nodes' in config:
        max_concurrent_nodes = int(config['max_concurrent_nodes'])
        if not max_concurrent_nodes or max_concurrent_nodes < 0:
            max_concurrent_nodes = 10
        pbs.logmsg(pbs.EVENT_DEBUG3,
                   "max_concurrent_nodes is set to %d" % max_concurrent_nodes)


# Accept if the event is not serviceable.
this_event = pbs.event()
if this_event.type not in [pbs.EXECJOB_PROLOGUE, pbs.EXECJOB_EPILOGUE,
                           pbs.EXECJOB_BEGIN, pbs.EXECJOB_END,
                           pbs.EXECHOST_STARTUP, pbs.EXECHOST_PERIODIC,
                           pbs.PERIODIC]:
    pbs.logmsg(pbs.LOG_WARNING,
               "Event not serviceable for power provisioning.")
    this_event.accept()

if this_event.type == pbs.PERIODIC:
    vnlist = this_event.vnode_list
    resvlist = this_event.resv_list
    time_now = time.time()
    # Parse the config file for power attributes
def trace_hook(**kwargs):
    """Simple exception trace logger for PBS hooks

    loglevel=<int> (pbs.LOG_DEBUG): log level to pass to pbs.logmsg()
    reject=True: reject the job upon completion of logging trace
    trace_in_reject=<bool> (False): pass trace to pbs.event().reject()
    trace_in_reject=<str>: message to pass to pbs.event().reject() with trace
    """
    import sys
    if 'loglevel' in kwargs:
        loglevel = kwargs['loglevel']
    else:
        loglevel = pbs.LOG_ERROR
    if 'reject' in kwargs:
        reject = kwargs['reject']
    else:
        reject = True
    if 'trace_in_reject' in kwargs:
        trace_in_reject = kwargs['trace_in_reject']
    else:
        trace_in_reject = False

    # Associate hook events with the appropriate PBS constant. This is a list
    # of all hook events as of PBS Pro 13.0. If the event does not exist, it is
    # removed from the list.
    hook_events = ['queuejob', 'modifyjob', 'movejob', 'runjob',
                   'execjob_begin', 'execjob_prologue', 'execjob_launch',
                   'execjob_attach', 'execjob_preterm', 'execjob_epilogue',
                   'execjob_end', 'resvsub', 'provision', 'exechost_periodic',
                   'exechost_startup']
    hook_event = {}
    for he in hook_events:
        # Only set available hooks for the current version of PBS.
        if hasattr(pbs, he.upper()):
            event_code = eval('pbs.' + he.upper())
            hook_event[event_code] = he
            hook_event[he] = event_code
            hook_event[he.upper()] = event_code
            del event_code
        else:
            del hook_events[hook_events.index(he)]

    trace = {
        'line': sys.exc_info()[2].tb_lineno,
        'module': sys.exc_info()[2].tb_frame.f_code.co_name,
        'exception': sys.exc_info()[0].__name__,
        'message': sys.exc_info()[1].message,
    }
    tracemsg = '%s hook %s encountered an exception: Line %s in %s %s: %s' % (
        hook_event[pbs.event().type], pbs.event().hook_name, trace['line'],
        trace['module'], trace['exception'], trace['message'])
    rejectmsg = "Hook Error: request rejected as filter hook '%s' encountered " \
                "an exception. Please inform Admin" % pbs.event().hook_name

    if not isinstance(loglevel, int):
        loglevel = pbs.LOG_ERROR
        tracemsg = 'trace_hook() called with invalid argument (loglevel=%s), ' \
                   'setting to pbs.LOG_ERROR. ' + tracemsg

    pbs.logmsg(pbs.LOG_ERROR, tracemsg)

    if reject:
        tracemsg += ', request rejected'
        if isinstance(trace_in_reject, bool):
            if trace_in_reject:
                pbs.event().reject(tracemsg)
            else:
                pbs.event().reject(rejectmsg)
        else:
            pbs.event().reject(str(trace_in_reject) + 'Line %s in %s %s:\n%s' % (
                trace['line'], trace['module'], trace['exception'],
                trace['message']))
# resides.
#
def get_filesystem_avail_unprivileged(dirname):
    o = os.statvfs(dirname)
    return pbs.size("%skb" % ((o.f_bsize * o.f_bavail) / 1024))


# get_filesystem_avail_privileged: returns available size in kbytes
# (in pbs.size type) to privileged users, of the filesystem where 'dirname'
# resides.
#
def get_filesystem_avail_privileged(dirname):
    o = os.statvfs(dirname)
    return pbs.size("%skb" % ((o.f_bsize * o.f_bfree) / 1024))


# Define here the custom resources as key, and the function and its argument
# for obtaining the value of the custom resource:
# Format: dyn_res[<resource_name>] = [<function_name>, <function_argument>]
# So "<function_name>(<function_argument>)" is called to return the value
# for custom <resource_name>.
dyn_res = {}
dyn_res["scratch"] = [get_filesystem_avail_unprivileged, "/tmp"]
dyn_res["home"] = [get_filesystem_avail_unprivileged, "/home"]

vnl = pbs.event().vnode_list
local_node = pbs.get_local_nodename()
for k in dyn_res.keys():
    vnl[local_node].resources_available[k] = dyn_res[k][0](dyn_res[k][1])
def get_usage(self, job=None):
    self._check_pmi()
    if job is None:
        job = pbs.event().job
    return self.__pmi._get_usage(job)
# (C) Alberto Coduti 2019
import pbs
import sys

try:
    if pbs.event().requestor == "a.coduti":
        print pbs.event().requestor
        pbs.event().accept()
    else:
        pbs.event().reject("You are not Albè!")
except SystemExit:
    pass
except:
    pbs.event().reject("Exception")
# For a copy of the commercial license terms and conditions,
# go to: (http://www.pbspro.com/UserArea/agreement.html)
# or contact the Altair Legal Department.
#
# Altair’s dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of PBS Pro and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair’s trademarks, including but not limited to "PBS™",
# "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
# trademark licensing policies.

import pbs

e = pbs.event()
if e.type == pbs.RESVSUB:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is resvsub")
elif e.type == pbs.RESV_END:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is resv_end")
elif e.type == pbs.QUEUEJOB:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is queuejob")
elif e.type == pbs.MODIFYJOB:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is modifyjob")
elif e.type == pbs.MOVEJOB:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is movejob")
elif e.type == pbs.RUNJOB:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is runjob")
elif e.type == pbs.PERIODIC:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is periodic")
elif e.type == pbs.EXECJOB_BEGIN:
'''

# Define the version
__version__ = '0.0.2'

import sys
import os
import json

try:
    import pbs

    # Remember, periodic events do not have a job associated to them.
    if pbs.event().type != pbs.EXECHOST_PERIODIC:
        who = pbs.event().job.euser
        # For limiting testing to one user's jobs, uncomment this and change the username
        # pbs.logmsg(pbs.EVENT_DEBUG3, 'User: %s' % who)
        # if who != 'jshelley':
        #     pbs.logmsg(pbs.EVENT_DEBUG, 'jshelley != %s' % who)
        #     pbs.event().accept()

    pbs.logmsg(pbs.EVENT_DEBUG3, 'Event: %s' % pbs.event().type)

    # Add the site-packages paths to the sys path
    pbs_conf = pbs.pbs_conf
    # py_path = '/opt/pbs/default/python/lib'
    py_path = pbs_conf['PBS_EXEC'] + os.sep + 'python/lib'
def ChkTouchFileAsUser(self):
    if self.nhc_cfg["as_user_operations"]["check"] == False:
        pbs.logmsg(pbs.EVENT_DEBUG3, "Skipping touch file as user check")
        return True

    for file_dir in self.nhc_cfg["as_user_operations"]["touch_files"]:
        file_dir_orig = file_dir

        # Check to see if this is a periodic hook. If so, skip pbsuser file touches
        if pbs.event().type == pbs.EXECHOST_PERIODIC and \
                self.nhc_cfg["as_user_operations"]["touch_files"][file_dir_orig][0] == 'pbsuser':
            pbs.logmsg(pbs.EVENT_DEBUG3,
                       "Skipping this check dir: %s, since this is a periodic hook" % file_dir)
            continue

        # pbs.logmsg(pbs.EVENT_DEBUG3, "Dir: %s\tUser: %s" % (file_dir,
        #            str(self.nhc_cfg["as_user_operations"]["touch_files"][file_dir_orig][0])))
        # pbs.logmsg(pbs.EVENT_DEBUG3, "Job User: %s" % (self.user))
        try:
            new_file_dir = ''
            if file_dir.startswith('$'):
                # I need to flesh out how to best handle this.
                # It will require looking through the job environment variables
                V = pbs.event().job.Variable_List
                pbs.logmsg(pbs.EVENT_DEBUG3, "Type(V): %s" % (type(V)))
                pbs.logmsg(pbs.EVENT_DEBUG3, "Job variable list: %s" % (V))
                for var in V:
                    pbs.logmsg(pbs.EVENT_DEBUG3, "var: %s, file_dir: %s" % (var, file_dir))
                    pbs.logmsg(pbs.EVENT_DEBUG3, "V[var]: %s" % (V[var]))
                    if var.startswith(file_dir[1:]):
                        new_file_dir = V[var]
                        pbs.logmsg(pbs.EVENT_DEBUG3, "New dir: %s" % (file_dir))
                        break

            # Check to see which user this test should be run as.
            # Options: pbsuser or pbsadmin
            status = ''
            if self.nhc_cfg["as_user_operations"]["touch_files"][file_dir_orig][0] == 'pbsadmin':
                pbs.logmsg(pbs.EVENT_DEBUG3, "TouchFileAsAdmin: %s" % (file_dir))
                if new_file_dir != '':
                    status = self.TouchFileAsUser('root', new_file_dir, file_dir_orig)
                else:
                    status = self.TouchFileAsUser('root', file_dir, file_dir_orig)
            elif self.nhc_cfg["as_user_operations"]["touch_files"][file_dir_orig][0] == 'pbsuser':
                # Check to see if the check is to be written to a specific user dir
                pbs.logmsg(pbs.EVENT_DEBUG3,
                           "TouchFileAsUser: User: %s, Dir: %s" % (self.user, file_dir))
                if file_dir.find('<userid>') != -1:
                    file_dir = file_dir.replace('<userid>', self.user)
                # Try to touch the file
                if new_file_dir != '':
                    status = self.TouchFileAsUser(self.user, new_file_dir, file_dir_orig)
                else:
                    status = self.TouchFileAsUser(self.user, file_dir, file_dir_orig)
            else:
                pbs.logmsg(pbs.EVENT_DEBUG,
                           "Unknown User: %s. Please specify either pbsadmin or pbsuser" %
                           (str(self.nhc_cfg["as_user_operations"]["touch_files"][file_dir_orig][0])))
                return [self.nhc_cfg["as_user_operations"]["touch_files"][file_dir_orig][1],
                        "Unknown User: %s. Please specify either pbsadmin or pbsuser" %
                        (str(self.nhc_cfg["as_user_operations"]["touch_files"][file_dir_orig][0]))]
            if status != True:
                return status
        except OSError:
            return [self.nhc_cfg["as_user_operations"]["touch_files"][file_dir_orig][1],
                    'Can not find file/dir: %s' % file_dir]
        except Exception as e:
            return [self.nhc_cfg["as_user_operations"]["touch_files"][file_dir_orig][1],
                    'Encountered an error %s for file/dir: %s' % (e, file_dir)]
    conn.close()
    if result is not None:
        pbs.logmsg(pbs.LOG_DEBUG,
                   "---> getBudget: project %s has %s hours" % (project, result[0]))
        return float(result[0])
    else:
        pbs.logmsg(pbs.LOG_DEBUG, "---> getBudget: project not found")
        return None


# Setting some messages
contact_msg = ", please contact Admins at [email protected] and report this message."

# Let's start!
try:
    # If it's a system user, accept the job
    if pbs.event().requestor in ["PBS_Server", "Scheduler", "pbs_mom"]:
        pbs.event().accept()

    # Check if a project has been set
    if pbs.event().job.project is not None:
        project = str(pbs.event().job.project)
    else:
        project = "_pbs_project_default"

    # Accept if it's the default project
    # TODO change it in production
    if project == "_pbs_project_default":
        pbs.event().accept()

    pbs.logmsg(pbs.LOG_DEBUG,
               "---> Queuejob Hook Start! Requestor is %s and project is %s" %
               (pbs.event().requestor, project))
    pbs.logmsg(pbs.LOG_DEBUG,
               "---> select line is %s" % pbs.event().job.Resource_List.select)
def setBudget(project, budget):
    pbs.logmsg(pbs.LOG_DEBUG,
               "---> setBudget: " + str(project) + " set to " + str(budget))
    conn = psycopg2.connect(database="pbs_accounting", user="******",
                            password="******", host="mullis01.sns.it",
                            port="5432")
    cur = conn.cursor()
    cur.execute("UPDATE projects SET project_hours = %s WHERE project_name = %s;",
                (budget, project))
    conn.commit()
    cur.close()
    conn.close()


# Setting some messages
contact_msg = ", please contact Admins at [email protected] and report this message."

# Let's start!
try:
    project = str(pbs.event().job.project)
    myQueue = str(pbs.event().job.queue)

    # Accept if it's the default project
    if project == "_pbs_project_default":
        pbs.event().accept()

    pbs.logmsg(pbs.LOG_DEBUG,
               "---> Runjob Hook Start! User is %s and project is %s" %
               (pbs.event().job.euser, project))
    pbs.logmsg(pbs.LOG_DEBUG,
               "---> select line is %s" % pbs.event().job.Resource_List.select)

    #
    ncpus = pbs.event().job.Resource_List.ncpus
    if ncpus is None:
        pbs.logmsg(pbs.LOG_DEBUG, "---> ncpus is None, parsing select line")
        select = str(pbs.event().job.Resource_List.select)
        if "ncpus=" in select:
# The following constants can be modified in run_pelog_shell.ini to match
# site preferences.
ENABLE_PARALLEL = False
VERBOSE_USER_OUTPUT = False
DEFAULT_ACTION = RERUN
TORQUE_COMPAT = False

import pbs
import os
import sys
import time

# Set up a few variables
start_time = time.time()
pbs_event = pbs.event()
hook_name = pbs_event.hook_name
hook_alarm = 30  # default, we'll read it from the .HK later
DEBUG = False    # default, we'll read it from the .HK later
job = pbs_event.job


# The trace_hook function has been written to be portable between hooks.
def trace_hook(**kwargs):
    """Simple exception trace logger for PBS hooks

    loglevel=<int> (pbs.LOG_DEBUG): log level to pass to pbs.logmsg()
    reject=True: reject the job upon completion of logging trace
    trace_in_reject=<bool> (False): pass trace to pbs.event().reject()
    trace_in_reject=<str>: message to pass to pbs.event().reject() with trace
    """
    import sys
        event.reject(str(e))
    return power


def vnodes_enabled(job):
    # See if power operations are allowed on all job vnodes
    for vn in _get_vnode_names(job):
        if not _svr_vnode(vn).power_provisioning:
            pbs.logjobmsg(job.id,
                          "power functionality is disabled on vnode %s" % vn)
            return False
    return True


# Accept if the event is not serviceable.
this_event = pbs.event()
if this_event.type not in [pbs.EXECJOB_PROLOGUE, pbs.EXECJOB_EPILOGUE,
                           pbs.EXECJOB_BEGIN, pbs.EXECJOB_END,
                           pbs.EXECHOST_STARTUP, pbs.EXECHOST_PERIODIC]:
    pbs.logmsg(pbs.LOG_WARNING,
               "Event not serviceable for power provisioning.")
    this_event.accept()

# Set eoe values for my node
if this_event.type == pbs.EXECHOST_STARTUP:
    from pbs.v1._pmi_utils import _is_node_provisionable
    # Don't connect if the server or sched is running.
    if not _is_node_provisionable():
        pbs.logmsg(pbs.LOG_DEBUG,
eventsDict = {
    1: "pbs.QUEUEJOB",
    2: "pbs.MODIFYJOB",
    4: "pbs.RESVSUB",
    8: "pbs.MOVEJOB",
    16: "pbs.RUNJOB",
    32: "pbs.PROVISION",
    64: "pbs.EXECJOB_BEGIN",
    128: "pbs.EXECJOB_PROLOGUE",
    256: "pbs.EXECJOB_EPILOGUE",
    512: "pbs.EXECJOB_END",
    1024: "pbs.EXECJOB_PRETERM",
    4096: "pbs.EXECHOST_PERIODIC"
}

try:
    event_type = pbs.event().type
    user = str(pbs.event().requestor)
    jobID = pbs.event().job.id
    if event_type == 1:
        pbs.logmsg(pbs.LOG_DEBUG,
                   "---> Hook called! Event pbs.QUEUEJOB by %s" % user)
    else:
        pbs.logmsg(pbs.LOG_DEBUG,
                   "---> Hook called! Event %s by %s for job %s" %
                   (eventsDict[event_type], user, jobID))
    pbs.event().accept()
except SystemExit:
    pass
except Exception as e:
    pbs.event().reject('Something went wrong, just got an Exception: %s' % str(e))
        mynode = pbs.event().vnode_list[pbs.get_local_nodename()]
        if mynode.power_provisioning:
            try:
                profiles = self.__pmi._query(pbs.Power.QUERY_PROFILE)
                names = self._map_profile_names(profiles)
                mynode.resources_available["eoe"] = names
                pbs.logmsg(pbs.LOG_WARNING,
                           "PMI:activate: set eoe: %s" % names)
            except:
                pass
        raise BackendError(e)
    except InternalError as e:
        # couldn't do activation so set vnode offline
        me = pbs.get_local_nodename()
        pbs.event().vnode_list[me].state += pbs.ND_OFFLINE
        pbs.logmsg(pbs.LOG_WARNING, "PMI:activate: set vnode offline")
        raise InternalError(e)


def deactivate_profile(self, job=None):
    self._check_pmi()
    if job is None:
        job = pbs.event().job
    if _running_excl(job):
        pbs.logjobmsg(job.id, "PMI: reset current_eoe")
        for h in _get_vnode_names(job):
            try:
                pbs.event().vnode_list[h].current_eoe = None
            except:
                pass
# accounting_file = '/cm/shared/apps/pbspro/var/spool/avogadro_accounting.json'
accounting_file = 'projects.acct'

# Loading the accounts dictionary from file
if os.path.isfile(accounting_file):
    accounts_dictionary = {}
    accountFile = open(accounting_file, 'r')
    for line in accountFile:
        temp = split(line, " = ")
        key = str(temp[0])
        value = float(temp[1])
        accounts_dictionary.update({key: value})
    accountFile.close()
else:
    pbs.event().reject("No accounts database found")

# Let's start!
try:
    # If it's a system user, accept the job
    if pbs.event().requestor in ["PBS_Server", "Scheduler", "pbs_mom"]:
        pbs.event().accept()

    # I'm setting these variables for readability
    if pbs.event().job.project is not None:
        project = str(pbs.event().job.project)
        project_budget = accounts_dictionary[project]
    else:
        pbs.event().reject(
            "No project set (project=None), please contact Admins at "
            "[email protected] and report this message.")
# (C) Alberto Coduti 2019
import pbs
import sys
import os
from string import split

# accounting_file = '/cm/shared/apps/pbspro/var/spool/avogadro_accounting.json'
accounting_file = 'test_avogadro_accounts'
hook_debug_file = str(pbs.event().job.Variable_List["PBS_O_WORKDIR"]) + '/hook_debug.log'


def durationToHours(duration):
    hours, minutes, seconds = str(duration).split(":")
    temp = float(hours) + (float(minutes) / 60) + (float(seconds) / 3600)
    return temp


def hoursToDuration(hours):
    # NOTE: the original body was truncated; this is a best-guess inverse of
    # durationToHours, converting fractional hours back to an HH:MM:SS string.
    whole = int(hours)
    minutes = int((hours - whole) * 60)
    seconds = int(round(((hours - whole) * 60 - minutes) * 60))
    return "%02d:%02d:%02d" % (whole, minutes, seconds)


# Loading the accounts dictionary from file
if os.path.isfile(accounting_file):
    accounts_dictionary = {}
    accountFile = open(accounting_file, 'r')
    for line in accountFile:
        temp = line.split(" = ")
        key = str(temp[0])
        value = float(temp[1])
        accounts_dictionary.update({key: value})
    accountFile.close()
else:
    pbs.event().reject("No accounts database found")
# test_end_postgres.py
# by Alberto Coduti - Feb 2019
import pbs
import sys
sys.path.append('/cm/shared/apps/pbspro/default/python/lib/python2.5/site-packages')
import psycopg2

try:
    pbs.logmsg(pbs.LOG_DEBUG, "---> EXECJOB_END Hook called")
    pbs.logmsg(pbs.LOG_DEBUG,
               "---> pbs.event().job.euser = " + str(pbs.event().job.euser))
    pbs.event().accept()
except SystemExit:
    pass
except Exception as e:
    pbs.event().reject('Something went wrong, just got an Exception: ' +
                       str(e) + contact_msg)
    conn.close()


def walltimeToHours(walltime):
    hours, minutes, seconds = str(walltime).split(":")
    converted = float(hours) + (float(minutes) / 60) + (float(seconds) / 3600)
    pbs.logmsg(pbs.LOG_DEBUG,
               "---> Walltime %s converted to %s" % (walltime, converted))
    return converted


# Setting some messages
contact_msg = ", please contact Admins at [email protected] and report this message."

# Let's start!
try:
    # Check if a project has been set
    if pbs.event().job.project is not None:
        project = str(pbs.event().job.project)
    else:
        project = "_pbs_project_default"

    # Accept if it's the default project
    # TODO change it in production
    if project == "_pbs_project_default":
        pbs.event().accept()

    #
    myQueue = str(pbs.event().job.queue)
    if myQueue == "":
        pbs.event().reject("No queue selected, please select a queue")
    if myQueue == "workq":
        pbs.event().reject("Queue workq is not enabled")
# test_preterm.py
# by Alberto Coduti - Feb 2019
import pbs

try:
    pbs.logmsg(pbs.LOG_DEBUG, "---> EXECJOB_PRETERM Hook called")
    pbs.event().accept()
except SystemExit:
    pass
except Exception as e:
    pbs.event().reject('Something went wrong, just got an Exception: ' +
                       str(e) + contact_msg)
# (C) Alberto Coduti 2019
import pbs
import sys
import os
from string import split

# accounting_file = '/cm/shared/apps/pbspro/var/spool/avogadro_accounting.json'
accounting_file = 'test_avogadro_accounts'
hook_debug_file = str(pbs.event().job.Variable_List["PBS_O_WORKDIR"]) + '/hook_debug.log'

# Loading the accounts dictionary from file
if os.path.isfile(accounting_file):
    accounts_dictionary = {}
    accountFile = open(accounting_file, 'r')
    for line in accountFile:
        temp = split(line, " = ")
        key = str(temp[0])
        value = float(temp[1])
        accounts_dictionary.update({key: value})
    accountFile.close()
else:
    pbs.event().reject("No accounts database found")

# Let's start!
try:
    # I'm setting these variables for readability
    user = str(pbs.event().requestor)
    user_budget = int(accounts_dictionary[user])

    # If it's a system user, accept the job