def flush_log_messages(msg=None):
    """Write each message in *msg* (an iterable of strings) to the PBS
    log at DEBUG level.  A None argument is a no-op."""
    if msg is None:
        return
    for entry in msg:
        pbs.logmsg(pbs.LOG_DEBUG, entry)
def _get_usage(self, job):
    """Return the job's accumulated energy (kWh) from the SGI power
    monitor, or None when no total_energy report is available."""
    pbs.logmsg(pbs.EVENT_DEBUG3, "SGI: %s get_usage" % (job.id))
    usage_report = api.MonitorReport(job.id)
    if usage_report is None or usage_report[0] != 'total_energy':
        return None
    pbs.logjobmsg(job.id, "SGI: energy %fkWh" % usage_report[1])
    return usage_report[1]
def _deactivate_profile(self, job):
    """Stop SGI power monitoring for *job*.

    The job's nodeset is deleted even if MonitorStop fails.  Always
    returns False (no final energy value is produced here).
    """
    pbs.logmsg(pbs.LOG_DEBUG, "SGI: deactivate")
    try:
        api.MonitorStop(job.id)
    finally:
        # be sure to remove the nodeset
        api.NodesetDelete(job.id)
    return False
def activate_profile(self, profile_name=None, job=None):
    """Activate a power profile for *job* through the PMI backend.

    Falls back to pbs.event().job when no job is given.  On success the
    profile name is recorded as current_eoe on each of the job's vnodes
    (best effort).  On BackendError the local node's eoe resource list
    is refreshed before re-raising; on InternalError the local vnode is
    taken offline before re-raising.  Returns the backend's result.
    """
    self._check_pmi()
    if job is None:
        job = pbs.event().job
    try:
        ret = self.__pmi._activate_profile(profile_name, job)
        if profile_name is not None:
            hosts = _get_vnode_names(job)
            for h in hosts:
                try:
                    # Best effort: a vnode missing from the event's
                    # vnode_list is silently skipped.
                    pbs.event().vnode_list[h].current_eoe = profile_name
                except:
                    pass
        return ret
    except BackendError as e:
        # get fresh set of profile names, ignore errors
        mynode = pbs.event().vnode_list[pbs.get_local_nodename()]
        if mynode.power_provisioning:
            try:
                profiles = self.__pmi._query(
                    pbs.Power.QUERY_PROFILE)
                names = self._map_profile_names(profiles)
                mynode.resources_available["eoe"] = names
                pbs.logmsg(pbs.LOG_WARNING,
                           "PMI:activate: set eoe: %s" % names)
            except:
                pass
        raise BackendError(e)
    except InternalError as e:
        # couldn't do activation so set vnode offline
        me = pbs.get_local_nodename()
        pbs.event().vnode_list[me].state += pbs.ND_OFFLINE
        pbs.logmsg(pbs.LOG_WARNING, "PMI:activate: set vnode offline")
        raise InternalError(e)
def _connect(self, endpoint=None, port=None, job=None):
    """Verify connectivity to the SGI power API.

    endpoint and port are unused by this backend; they now default to
    None for consistency with the Cray backend's _connect signature
    (backward compatible: positional callers are unaffected).
    """
    if job is None:
        pbs.logmsg(pbs.EVENT_DEBUG3, "SGI: connect")
    else:
        pbs.logmsg(pbs.EVENT_DEBUG3, "SGI: %s connect" % (job.id))
    api.VerifyConnection()
    return
def setBudget(project, budget):
    """Set the hour budget of *project* in the accounting database."""
    pbs.logmsg(pbs.LOG_DEBUG,
               "---> setBudget " + str(project) + " to " + str(budget))
    db = psycopg2.connect(database="pbs_accounting", user="******",
                          password="******", host="mullis01.sns.it",
                          port="5432")
    cursor = db.cursor()
    cursor.execute(
        "UPDATE projects SET project_hours = %s WHERE project_name = %s;",
        (budget, project))
    db.commit()
    cursor.close()
    db.close()
def _pmi_power_on(self, hosts):
    """Power on the given hosts through the capmc node_on command."""
    pbs.logmsg(pbs.LOG_DEBUG, "Cray: powering-on the node")
    nids, _ = nidlist(None, nodenids(hosts))
    launch("pmi_power_on", "node_on --nids " + nids)
    return True
def TouchFile(self, fname, times=None):
    """Create *fname* if needed and set its access/modify times.

    times: passed straight to os.utime() (None means "now").
    Returns True on success, False on failure (which is also logged).
    """
    try:
        # 'a' creates the file without truncating an existing one.
        open(fname, 'a').close()
        os.utime(fname, times)
        return True
    except (IOError, OSError):
        # Bug fix: os.utime() raises OSError, which the original
        # 'except IOError' missed on Python 2 (on Python 3 the two are
        # the same class, so this is backward compatible).
        pbs.logmsg(pbs.EVENT_DEBUG3, "Failed to touch file: %s" % (fname))
        return False
def isEntitled(user, project):
    """Return True if *user* is registered for *project* in the
    accounting database, False otherwise (including unknown users)."""
    conn = psycopg2.connect(database="pbs_accounting", user="******",
                            password="******", host="mullis01.sns.it",
                            port="5432")
    try:
        cur = conn.cursor()
        # Bug fix: parameterized query; the original concatenated the
        # user name into the SQL string, allowing SQL injection.
        cur.execute("SELECT user_projects FROM users WHERE user_name = %s;",
                    (user,))
        result = cur.fetchone()
        cur.close()
    finally:
        # Bug fix: the original leaked the connection and cursor.
        conn.close()
    if result is not None:
        pbs.logmsg(pbs.LOG_DEBUG, "---> isEntitled: user %s is part of project(s) %s, we are checking project %s" % (user, result[0], project))
        return (project in result[0])
    else:
        pbs.logmsg(pbs.LOG_DEBUG, "---> isEntitled: user %s is not on database" % user)
        return False
def getBudget(project):
    """Return the hour budget of *project* as a float, or None when the
    project is not in the accounting database."""
    conn = psycopg2.connect(database="pbs_accounting", user="******",
                            password="******", host="mullis01.sns.it",
                            port="5432")
    cur = conn.cursor()
    # Bug fix: parameterized query; the original used %-formatting to
    # build the SQL string, allowing SQL injection via *project*.
    cur.execute("SELECT project_hours FROM projects WHERE project_name = %s;",
                (project,))
    result = cur.fetchone()
    cur.close()
    conn.close()
    if result is not None:
        pbs.logmsg(pbs.LOG_DEBUG, "---> getBudget: project and hours are " + str(project) + " : " + str(result[0]))
        return float(result[0])
    else:
        pbs.logmsg(pbs.LOG_DEBUG, "---> getBudget: project not found")
        return None
def ConvertToBytes(self, value):
    """Convert a size string like '10gb' to bytes (int), or a string
    like '5%' to a percentage (float).

    A float result signals "percentage", an int result signals bytes.
    Returns False when the numeric part cannot be parsed.
    """
    # Determine what units the user would like to use.
    if self.nhc_cfg["disk_space"]["units"].lower() == 'binary':
        units = {'kb': 1024, 'mb': 1048576,
                 'gb': 1073741824, 'tb': 1099511627776}
    elif self.nhc_cfg["disk_space"]["units"].lower() == 'decimal':
        units = {'kb': 1000, 'mb': 1000000,
                 'gb': 1000000000, 'tb': 1000000000000}
    else:
        pbs.logmsg(pbs.EVENT_DEBUG3,"I'm not sure how to handle units: %s\nSo I will default to binary"%\
            (self.nhc_cfg["disk_space"]["units"]))
        units = {'kb': 1024, 'mb': 1048576,
                 'gb': 1073741824, 'tb': 1099511627776}
    value = value.lower()
    if value.find('%') != -1:
        pbs.logmsg(pbs.EVENT_DEBUG3, "found a % symbol")
        # Returned as a float so that I can distinguish between percentage vs free space
        value = float(value.strip('%'))
        pbs.logmsg(pbs.EVENT_DEBUG3, "value: %s" % value)
    else:
        for key in units.keys():
            if value.find(key) != -1:
                try:
                    value = int(value[:-2].strip()) * units[key]
                except Exception as e:
                    # Bug fix: Python 3 'except ... as' syntax (was the
                    # Python 2 'except Exception, e' form).
                    pbs.logmsg(pbs.EVENT_DEBUG,"Error convertion value to int: %s\tkey: %s"%(value,key))
                    return False
                break
    # Bug fix: the converted value was never returned — the function
    # fell off the end and always produced None.
    return value
def ChkMountPoints(self):
    """Verify every configured mount point is actually mounted.

    Returns True when all mounts check out (or the check is disabled),
    [action, message] for the first missing mount, or False on an
    unexpected error.
    """
    if self.nhc_cfg['mounts']['check'] == False:
        pbs.logmsg(pbs.EVENT_DEBUG3, "Skipping mounts check")
        return True
    for mnt_pnt in self.nhc_cfg["mounts"]["mount_points"]:
        pbs.logmsg(pbs.EVENT_DEBUG3,"mount point: %s, %s"%(mnt_pnt,self.nhc_cfg["mounts"]["mount_points"][mnt_pnt]))
        try:
            # Added the line below to check to see if the real path is a mount or not
            if not os.path.ismount(os.path.realpath(mnt_pnt)):
                pbs.logmsg(pbs.EVENT_DEBUG3,"Mount: %s\tAction: %s"%(mnt_pnt,self.nhc_cfg["mounts"]["mount_points"][mnt_pnt]))
                return [self.nhc_cfg["mounts"]["mount_points"][mnt_pnt],'%s does not appear to be mounted'%mnt_pnt]
        except Exception as e:
            # Bug fix: Python 3 'except ... as' syntax (was the Python 2
            # 'except Exception, e' form).
            pbs.logmsg(pbs.EVENT_DEBUG,"Mount check error: %s"%e)
            return False
        pbs.logmsg(pbs.EVENT_DEBUG3,"mount point %s checked out"%(mnt_pnt))
    # Bug fix: explicit success value — the function previously fell
    # off the end and returned None even when every mount was fine.
    return True
def _pmi_ramp_up(self, hosts):
    """Raise the sleep-state limit of each host's node via capmc so the
    nodes can ramp back up to full performance.  Always returns True."""
    pbs.logmsg(pbs.LOG_DEBUG, "Cray: ramping up the node")
    nidset = nodenids(hosts)
    nids, _ = nidlist(None, nidset)
    cmd = "get_sleep_state_limit_capabilities --nids " + nids
    func = "pmi_ramp_up"
    out = launch(func, cmd)
    for n in out["nids"]:
        if "data" in n:
            nid = n["nid"]
            states = n["data"]["PWR_Attrs"][0]["PWR_AttrValueCapabilities"]
            # Walk the capability list from the highest state down and
            # issue a set_sleep_state_limit for each non-zero entry.
            # NOTE(review): there is no break, so every non-zero state
            # is applied in turn — confirm whether only the first was
            # intended.
            for s in reversed(states):
                if int(s) != 0:
                    cmd = "set_sleep_state_limit --nids " + str(nid) + " --limit " + str(s)
                    launch(func, cmd)
                    # Random delay, presumably to stagger capmc calls
                    # across nodes — TODO confirm intent.
                    sleep_time = random.randint(1, 10)
                    time.sleep(sleep_time)
    return True
def _get_usage(self, job):
    """Return the job's energy usage in kWh (Cray), or None.

    Usage is the current capmc energy counter total minus the starting
    value recorded in the job's energy file at activation time.  In a
    periodic hook, capmc is called once for all jobs and the per-job
    nid sets are cached on the Pmi class.
    """
    pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: %s get_usage" % (job.id))
    try:
        f = open(energy_file(job), "r")
        start = int(f.read())
        f.close()
    except Exception:
        # No starting value was recorded: cannot compute a delta.
        return None
    e = pbs.event()
    if e.type == pbs.EXECHOST_PERIODIC:
        # This function will be called for each job in turn when
        # running from a periodic hook. Here we fill in some
        # global variables just once and use the information
        # for each job in turn. Save the result of calling capmc
        # for all running jobs in the variable ninfo. Keep a
        # dictionary with the job id's as keys holding a set
        # of nid numbers.
        if Pmi.ninfo is None:
            allnids = set()
            for jobid in e.job_list.keys():
                j = e.job_list[jobid]
                nidset = jobnids(j)
                allnids.update(nidset)
                Pmi.nidarray[jobid] = nidset
            nids, cnt = nidlist(None, allnids)
            Pmi.ninfo = node_energy("all", nids, cnt)
        nidset = Pmi.nidarray[job.id]
        energy = None
        if Pmi.ninfo is not None and "nodes" in Pmi.ninfo:
            energy = 0
            for node in Pmi.ninfo["nodes"]:
                if node["nid"] in nidset:
                    # owned by job of interest
                    energy += node["energy_ctr"]
            pbs.logjobmsg(job.id, "Cray: get_usage: energy %dJ" % energy)
    else:
        nids, cnt = nidlist(job)
        energy = job_energy(job, nids, cnt)
    if energy is not None:
        # Joules -> kWh (3.6e6 J per kWh).
        return float(energy - start) / 3600000.0
    else:
        return None
def rejectjob(reason, action=DEFAULT_ACTION):
    """Log job rejection and then call pbs.event().reject()

    action: RERUN requeues the job, DELETE deletes it; anything else
    just rejects.  The reason is prefixed accordingly and recorded in
    the job comment, the server log and the job log.

    NOTE(review): relies on module globals 'job', 'hook_name',
    'pbs_event', 'VERBOSE_USER_OUTPUT' being set by the hook body.
    """
    # Arguments to pbs.event().reject() do nothing in execjob events. Log a
    # warning instead, update the job comment, then reject the job.
    if action == RERUN:
        job.rerun()
        reason = 'Requeued - %s' % reason
    elif action == DELETE:
        job.delete()
        reason = 'Deleted - %s' % reason
    else:
        reason = 'Rejected - %s' % reason
    job.comment = '%s: %s' % (hook_name, reason)
    pbs.logmsg(pbs.LOG_WARNING, ';'.join([hook_name, job.id, reason]))
    # Add a message that can be tracejob'd
    pbs.logjobmsg(job.id, reason)
    if VERBOSE_USER_OUTPUT:
        # Bug fix: print as a function — the Python 2 'print reason'
        # statement is a syntax error on Python 3 (the parenthesized
        # form behaves identically on Python 2).
        print(reason)
    pbs_event.reject()
def ChkDirFilePermissions(self):
    """ Returns True if the permissions match. The permissions from python are returned
    as string with the '0100600'. The last three digits are the file permissions for
    user,group, world
    Return action if the permissions don't match
    and NoFileOrDir if it can't find the file/dir
    """
    if self.nhc_cfg["permissions"]["check"] == False:
        pbs.logmsg(pbs.EVENT_DEBUG3, "Skipping permissions check")
        return True
    # Hoist the deeply nested config lookups once per entry; entries
    # are {path: [required_permission_suffix, action]}.
    targets = self.nhc_cfg["permissions"]["check_dirs_and_files"]
    for file_dir in targets:
        required = str(targets[file_dir][0])
        action = targets[file_dir][1]
        pbs.logmsg(pbs.EVENT_DEBUG3, "File/Dir: %s\t%s" % (file_dir, required))
        try:
            st = os.stat(file_dir)
            permissions = oct(st.st_mode)
            # Compare only the trailing digits the config specifies.
            actual = permissions[-len(required):]
            if actual != required:
                pbs.logmsg(pbs.EVENT_DEBUG3,
                           "Required permissions: %s\tpermissions: %s" % (required, actual))
                return [action,
                        "File/Dir: %s\tRequired permissions: %s\tpermissions: %s" %
                        (file_dir, required, actual)]
        except OSError:
            return [action, "Can not find file/dir: %s" % file_dir]
        except Exception:
            # Bug fix: narrowed from a bare 'except:', which also
            # swallowed SystemExit/KeyboardInterrupt.
            return False
    return True
def ChkProcesses(self):
    """Verify required processes are running and banned ones are not.

    Returns True when everything checks out (or the check is disabled),
    otherwise [action, message] describing the first violation class.
    """
    if self.nhc_cfg["processes"]["check"] == False:
        pbs.logmsg(pbs.EVENT_DEBUG3, "Skipping processes check")
        return True
    # List all of the processes (command name -> PID).
    procs = {}
    if platform.uname()[0] == 'Linux':
        # 'top -bn1' is used instead of 'ps -Af'; with top the PID is
        # column 1 and the command name is the last column.
        out, err = subprocess.Popen(['top', '-bn1'], stdout=subprocess.PIPE).communicate()
        lines = out.split('\n')
        for line in lines[1:]:
            if line != "":
                line = line.split()
                procs[os.path.split(line[-1].split()[0])[-1]] = line[1]
    pbs.logmsg(pbs.EVENT_DEBUG3, "Processes: %s" % procs)
    # store procs that violate the checks; the first violation's
    # configured action wins.
    chk_procs = {'running': [], 'stopped': []}
    chk_action = ""
    for proc in self.nhc_cfg["processes"]["running"]:
        if proc not in procs.keys():
            pbs.logmsg(pbs.EVENT_DEBUG,"Process: %s is not in the running process list but should be"%proc)
            chk_procs['running'].append(proc)
            if chk_action == "":
                chk_action = self.nhc_cfg['processes']['running'][proc][1]
    for proc in self.nhc_cfg['processes']['stopped']:
        if proc in procs.keys():
            pbs.logmsg(pbs.EVENT_DEBUG,"Process: %s is in the stopped process list but was found to be running"%proc)
            chk_procs['stopped'].append(proc)
            if chk_action == "":
                chk_action = self.nhc_cfg['processes']['stopped'][proc][1]
    if len(chk_procs['running']) > 0 or len(chk_procs['stopped']) > 0:
        # Bug fix: use str.join — the original called 'join(list, sep)'
        # (Python 2 string-module style), which raises NameError here
        # unless a 'join' helper was imported elsewhere in the file.
        line = "running: %s\nstopped: %s" % (','.join(chk_procs['running']),
                                             ','.join(chk_procs['stopped']))
        return [chk_action, "CheckProcesses: One or more processes were found which violates the check\n%s"%line]
    return True
def CheckNodePeriodic(self):
    """Run the periodic node health checks and return how many failed.

    Uses the module-global checker instance 'c', as the original did.
    """
    pbs.logmsg(pbs.EVENT_DEBUG3, "Ready perform check node periodic")
    fail_count = 0
    checks = (
        ("Ready to check the mounts", c.ChkMountPoints),
        ("Ready to check the disk usage", c.ChkDiskUsage),
        ("Ready to check the file permissions", c.ChkDirFilePermissions),
    )
    for banner, check in checks:
        pbs.logmsg(pbs.EVENT_DEBUG3, banner)
        if not c.ContinueChk(check()):
            fail_count += 1
    pbs.logmsg(pbs.EVENT_DEBUG3, "Exiting CheckNode function")
    return fail_count
def _pmi_power_status(self, hosts):
    """Return the subset of *hosts* whose nodes capmc reports ready."""
    # Do a capmc node_status and return a list of ready nodes.
    pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: status of the nodes")
    nids, _ = nidlist(nidset=nodenids(hosts))
    out = launch("pmi_power_status", "node_status --nids " + nids)
    nodeset = set()
    if 'ready' not in out:
        return nodeset
    ready = out['ready']
    craynid = "PBScraynid"
    for vname in hosts:
        vnode = _svr_vnode(vname)
        if craynid in vnode.resources_available:
            if int(vnode.resources_available[craynid]) in ready:
                nodeset.add(vname)
    return nodeset
def stderr(self, msg):
    """Write msg to the appropriate file handle for stderr.

    On the mother-superior mom of a non-interactive job the message
    goes to self.stderr_log; otherwise it goes to sys.stderr.
    """
    import sys
    try:
        opened = False
        if not pbs.event().job.interactive and pbs.event().job.in_ms_mom():
            logfile = open(self.stderr_log, 'ab+')
            opened = True
        else:
            logfile = sys.stderr
        if DEBUG:
            pbs.logmsg(pbs.EVENT_DEBUG3, '%s;%s;[DEBUG3]: writing %s to %s' % (
                pbs.event().hook_name, pbs.event().job.id,
                repr(msg), logfile.name))
        logfile.write(msg)
        logfile.flush()
        if opened:
            # Bug fix: only close handles we opened here; the original
            # unconditionally closed the handle, which shut down
            # sys.stderr for the rest of the process.
            logfile.close()
    except IOError:
        trace_hook()
def _activate_profile(self, profile_name, job):
    """Record the job's starting energy and apply any requested power
    caps via capmc (Cray).

    Returns False when the job has no compute nodes, True otherwise.
    """
    pbs.logmsg(pbs.LOG_DEBUG, "Cray: %s activate '%s'" %
               (job.id, str(profile_name)))
    nids, cnt = nidlist(job)
    if cnt == 0:
        pbs.logjobmsg(job.id, "Cray: no compute nodes for power setting")
        return False
    energy = job_energy(job, nids, cnt)
    if energy is not None:
        # Save the starting counter so _get_usage can compute a delta.
        f = open(energy_file(job), "w")
        f.write(str(energy))
        f.close()
    # If this is the only job, set nodes to capped power.
    if _running_excl(job):
        cmd = "set_power_cap --nids " + nids
        doit = False
        pcap = job.Resource_List['pcap_node']
        if pcap is not None:
            pbs.logjobmsg(job.id, "Cray: pcap node %d" % pcap)
            cmd += " --node " + str(pcap)
            doit = True
        pcap = job.Resource_List['pcap_accelerator']
        if pcap is not None:
            pbs.logjobmsg(job.id, "Cray: pcap accel %d" % pcap)
            cmd += " --accel " + str(pcap)
            doit = True
        if doit:
            launch(job.id, cmd)
        else:
            pbs.logjobmsg(job.id, "Cray: no power cap to set")
    return True
def _pbs_conf(confvar): # Return the value of a setting in the pbs.conf file if it exists. # Save the values in a global dictionary for future use. if confvar in os.environ: return os.environ[confvar] global pmi_pbsconf if "pmi_pbsconf" not in globals(): pmi_pbsconf = dict() cfile = "PBS_CONF_FILE" if cfile in os.environ: pbsconf = os.environ[cfile] else: pbsconf = "/etc/pbs.conf" try: fp = open(pbsconf) except: pbs.logmsg(pbs.DEBUG, "%s: Unable to open conf file." % pbsconf) return None else: for line in fp: line = line.strip() # ignore empty lines or those beginning with '#' if line == "" or line[0] == "#": continue var, eq, val = line.partition('=') if val == "": continue pmi_pbsconf[var] = val fp.close() if confvar in pmi_pbsconf: return pmi_pbsconf[confvar] else: return None
def check_express_project_code():
    """Validate the express project code on the submitting job.

    Rejects the event when the code is missing, malformed, not one of
    the user's groups, or reported disabled by the RCS API.  Returns
    the validated project code string.
    """
    project = pbs.event().job.project
    if not project:
        pbs.event().reject(
            "You must specify an express code with -P when submitting express jobs"
        )
    # Bug fix: use str(), not repr() — repr() wraps the value in quotes
    # so the "^exp-..." pattern below could never match.
    project = str(project)
    if not re.match("^exp-[a-z0-9]+$", project):
        pbs.event().reject(
            "Invalid express code: these have the format 'exp-XXXX'")
    if not test_group_membership([project]):
        pbs.event().reject("You are not authorised to use this express code")
    r = None
    try:
        import requests
        r = requests.get(
            "https://api.rcs.imperial.ac.uk/v1.0/express/%s/enabled" %
            (project, ))
    except Exception:
        # Availability check is best effort; fall through with r=None.
        pass
    if r:
        pbs.logmsg(pbs.LOG_ERROR, str(r.status_code))
        pbs.logmsg(pbs.LOG_ERROR, str(r.text))
        if (r.status_code == 200) and (r.text != "1"):
            pbs.event().reject(
                "This express code is not enabled. Please contact [email protected]"
            )
    else:
        pbs.event().reject(
            "This express code cannot be used at this time. Please try later or contact [email protected]"
        )
    return project
def send_notification(subject, email_message, job_owner_email_address):
    """Send *email_message* (HTML) to the job owner through AWS SES.

    Failures are logged, never raised.  Uses the module-level
    ses_region / ses_sender_email settings.
    """
    try:
        client = boto3.client('ses', region_name=ses_region)
        destination = {'ToAddresses': [job_owner_email_address, ]}
        message = {
            'Subject': {'Data': subject, },
            'Body': {'Html': {'Data': email_message, }},
        }
        client.send_email(Source=ses_sender_email,
                          Destination=destination,
                          Message=message)
        pbs.logmsg(pbs.LOG_DEBUG,
                   'notify_job_status: SES output' + str(client))
    except Exception as err:
        pbs.logmsg(pbs.LOG_DEBUG,
                   'notify_job_status: Error sending email' + str(err))
def find_users_in_ldap_group(group_dn):
    """Return the member user names of the LDAP/AD group *group_dn*.

    Detects Active Directory vs OpenLDAP by the presence of the SOCA
    ad_automation directory (the '%SOCA_CONFIGURATION' token is
    presumably substituted at install time — TODO confirm), builds an
    ldapsearch shell pipeline and parses its output into a list.

    SECURITY(review): the command is assembled by string concatenation
    from group_dn and cached AD credentials and executed via
    os.popen — shell injection is possible if any of those values are
    attacker-controlled.  Should be hardened (subprocess with a list).
    """
    if os.path.isdir(
            "/apps/soca/%SOCA_CONFIGURATION/cluster_node_bootstrap/ad_automation"
    ):
        pbs.logmsg(
            pbs.LOG_DEBUG,
            'queue_acl: find_users_in_ldap_group: Detected Active Directory')
        # Active Directory: bind credentials and domain are cached on
        # disk by the join-domain automation.
        with open(
                '/apps/soca/%SOCA_CONFIGURATION/cluster_node_bootstrap/ad_automation/join_domain_user.cache',
                'r') as f:
            ad_user = f.read()
        with open(
                '/apps/soca/%SOCA_CONFIGURATION/cluster_node_bootstrap/ad_automation/join_domain.cache',
                'r') as f:
            ad_password = f.read()
        with open(
                '/apps/soca/%SOCA_CONFIGURATION/cluster_node_bootstrap/ad_automation/domain_name.cache',
                'r') as f:
            domain_name = f.read()
        ldapsearch = 'ldapsearch -x -h ' + domain_name + ' -D "' + ad_user + '@' + domain_name + '" -w "' + ad_password + '" -b "' + group_dn + '" | grep member | awk \'{print $2}\' | cut -d, -f1 | tr -d "CN="'
        # Log the command with the bind password redacted.
        pbs.logmsg(
            pbs.LOG_DEBUG, 'queue_acl: generated ldapsearch command: ' +
            ldapsearch.replace(ad_password, "<REDACTED_PASSWORD>"))
    else:
        # OpenLdap
        pbs.logmsg(pbs.LOG_DEBUG,
                   'queue_acl: find_users_in_ldap_group: Detected OpenLDAP')
        ldapsearch = "ldapsearch -x -b " + group_dn + " -LLL | grep memberUid | awk '{print $2}'"
        pbs.logmsg(pbs.LOG_DEBUG,
                   'queue_acl: generated ldapsearch command: ' + ldapsearch)
    users_in_group = os.popen(ldapsearch).read()  # nosec
    pbs.logmsg(pbs.LOG_DEBUG,
               'queue_acl: find_users_in_ldap_group' + str(users_in_group))
    # Drop empty entries produced by the trailing newline.
    return list(filter(None, users_in_group.split('\n')))
def __execjob_end_handler(self):
    """Stop and remove the job's Docker container when the job ends."""
    pbs.logmsg(pbs.LOG_DEBUG, "Docker execjob_end handler start")
    for call in ("docker stop " + str(self.jid),
                 "docker rm " + str(self.jid)):
        pbs.logmsg(pbs.LOG_DEBUG, "Call is: %s" % call)
        os.system(call)
def ConvertToBytes(self, value):
    """Convert a size string like '10gb' to bytes (int), or a string
    like '5%' to a percentage (float).

    A float result signals "percentage", an int result signals bytes.
    Returns False when the numeric part cannot be parsed.
    """
    # Determine what units the user would like to use.
    if self.nhc_cfg["disk_space"]["units"].lower() == 'binary':
        units = {
            'kb': 1024,
            'mb': 1048576,
            'gb': 1073741824,
            'tb': 1099511627776
        }
    elif self.nhc_cfg["disk_space"]["units"].lower() == 'decimal':
        units = {
            'kb': 1000,
            'mb': 1000000,
            'gb': 1000000000,
            'tb': 1000000000000
        }
    else:
        pbs.logmsg(pbs.EVENT_DEBUG3,"I'm not sure how to handle units: %s\nSo I will default to binary"%\
            (self.nhc_cfg["disk_space"]["units"]))
        units = {
            'kb': 1024,
            'mb': 1048576,
            'gb': 1073741824,
            'tb': 1099511627776
        }
    value = value.lower()
    if value.find('%') != -1:
        pbs.logmsg(pbs.EVENT_DEBUG3, "found a % symbol")
        # Returned as a float so that I can distinguish between percentage vs free space
        value = float(value.strip('%'))
        pbs.logmsg(pbs.EVENT_DEBUG3, "value: %s" % value)
    else:
        for key in units.keys():
            if value.find(key) != -1:
                try:
                    value = int(value[:-2].strip()) * units[key]
                except Exception as e:
                    # Bug fix: Python 3 'except ... as' syntax (was the
                    # Python 2 'except Exception, e' form).
                    pbs.logmsg(
                        pbs.EVENT_DEBUG,
                        "Error convertion value to int: %s\tkey: %s" %
                        (value, key))
                    return False
                break
    # Bug fix: the converted value was never returned — the function
    # fell off the end and always produced None.
    return value
def ChkDirFilePermissions(self):
    """ Returns True if the permissions match. The permissions from python are returned
    as string with the '0100600'. The last three digits are the file permissions for
    user,group, world
    Return action if the permissions don't match
    and NoFileOrDir if it can't find the file/dir
    """
    if not self.nhc_cfg["permissions"]["check"]:
        pbs.logmsg(pbs.EVENT_DEBUG3, "Skipping permissions check")
        return True
    # Hoist the deeply nested config lookups once per entry; entries
    # are {path: [required_permission_suffix, action]}.
    targets = self.nhc_cfg["permissions"]["check_dirs_and_files"]
    for file_dir in targets:
        required = str(targets[file_dir][0])
        action = targets[file_dir][1]
        pbs.logmsg(pbs.EVENT_DEBUG3,
                   "File/Dir: %s\t%s" % (file_dir, required))
        try:
            st = os.stat(file_dir)
            permissions = oct(st.st_mode)
            # Compare only the trailing digits the config specifies.
            actual = permissions[-len(required):]
            if actual != required:
                pbs.logmsg(pbs.EVENT_DEBUG3,
                           "Required permissions: %s\tpermissions: %s" %
                           (required, actual))
                return [
                    action,
                    "File/Dir: %s\tRequired permissions: %s\tpermissions: %s"
                    % (file_dir, required, actual)
                ]
        except OSError:
            return [action, "Can not find file/dir: %s" % file_dir]
        except Exception:
            # Bug fix: narrowed from 'except BaseException', which also
            # swallowed SystemExit/KeyboardInterrupt.
            return False
    return True
def call_hc(self, script):
    """Run the health-check *script* in a shell.

    Records the exit code in self.rc and the last stdout line in
    self.comment; rejects the PBS event when the script cannot be
    started or wrote anything to stderr.
    """
    stdout = None
    stderr = None
    my_env = os.environ.copy()
    # Force a known-safe PATH for the child shell.
    my_env[
        "PATH"] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/bin:/usr/sbin"
    try:
        proc = subprocess.Popen(script,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                env=my_env,
                                shell=True)
        stdout, stderr = proc.communicate()
        self.rc = proc.returncode
    except Exception as err:
        pbs.logmsg(
            pbs.EVENT_DEBUG,
            "Health-check hook; run script %s error: '%s'" % (script, err))
        # NOTE(review): if Popen failed, self.rc was never set; the
        # logmsg below would then raise — this assumes e.reject()
        # aborts hook execution.  TODO confirm.
        self.e.reject()
    pbs.logmsg(pbs.EVENT_DEBUG,
               "Health-check hook; finished with exit code: %d" % self.rc)
    if stdout or stderr:
        pbs.logmsg(
            pbs.EVENT_DEBUG,
            "Health-check hook; stdout: '%s' stderr: '%s'" %
            (str(stdout).replace("\n", " "), str(stderr).replace(
                "\n", " ")))
    if stdout:
        # The last stdout line becomes the node comment.
        lines = stdout.strip().split("\n")
        self.comment = lines[len(lines) - 1]
    if stderr:
        pbs.logmsg(pbs.EVENT_DEBUG,
                   "Health-check hook; stderr not empty, skipping")
        self.e.reject()
def __setallresources_handler(self):
    """Collect and publish every node resource, logging each failure."""
    checks = (
        (self.getandset_cgroups,
         "%s, failed to get and set cgroups resource"),
        (self.getandset_cpu_flag,
         "%s, failed to get and set cpu_flag resource"),
        (self.getandset_os,
         "%s, failed to get and set os resource"),
        (self.getandset_cuda_version,
         "%s, failed to get and set cuda_version resource"),
    )
    for getter, failure_msg in checks:
        if getter() == False:
            pbs.logmsg(pbs.EVENT_DEBUG, failure_msg % self.hook_name)
def parse_cfg():
    """Load and validate the scratch hook's JSON config file.

    Reads the file named by PBS_HOOK_CONFIG_FILE (when set) and checks
    its keys against the module-global 'scratch_types'.  Any
    read/parse/validation failure is logged and yields an empty dict.
    """
    config = {}
    if 'PBS_HOOK_CONFIG_FILE' in os.environ:
        config_file = os.environ["PBS_HOOK_CONFIG_FILE"]
        try:
            config = json.loads(open(config_file, 'r').read())
        except Exception as err:
            pbs.logmsg(
                pbs.EVENT_DEBUG,
                "scratch hook; failed to open config file %s: %s" %
                (config_file, str(err)))
            config = {}
            return config
    for i in config.keys():
        # Top-level keys must be known scratch types.
        if not i in scratch_types.keys():
            pbs.logmsg(
                pbs.EVENT_DEBUG,
                "scratch hook; failed to parse config file, incorrect scratch type %s"
                % str(i))
            config = {}
            return config
        for j in config[i].keys():
            # NOTE(review): second-level keys are also compared against
            # scratch_types, but a mismatch here is only logged, not
            # rejected — looks inconsistent with the outer check;
            # confirm the intended schema before changing.
            if not j in scratch_types.keys():
                pbs.logmsg(
                    pbs.EVENT_DEBUG,
                    "scratch hook; failed to parse config file, incorrect scratch type %s"
                    % str(j))
            # Each second-level entry must map to a list (of nodes).
            if not list == type(config[i][j]):
                pbs.logmsg(
                    pbs.EVENT_DEBUG,
                    "scratch hook; failed to parse config file, incorrect nodes type"
                )
                config = {}
                return config
    return config
def run_file(fpath):
    """Execute *fpath* as an external hook helper for the current job.

    The child inherits the job's Variable_List augmented with JOBID,
    USER, GROUP and HOSTNAME; uses the module-global job object 'j'.
    All failures are logged, never raised.
    """
    try:
        pbs.logmsg(pbs.EVENT_DEBUG, "external hook started: %s" % fpath)
        env = j.Variable_List
        env['JOBID'] = j.id
        env['USER'] = j.euser  # Job_Owner.split("@")[0]
        env['GROUP'] = j.egroup
        env['HOSTNAME'] = socket.gethostname()
        proc = subprocess.Popen(fpath,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                env=env)
        out, err = proc.communicate()
        pbs.logmsg(
            pbs.EVENT_DEBUG, "external hook %s stdout: '%s' stderr: '%s'" %
            (fpath, out.replace("\n", ","), err.replace("\n", ",")))
        pbs.logmsg(pbs.EVENT_DEBUG,
                   "external hook ended with exitcode: %d" % proc.returncode)
    except Exception as error:
        pbs.logmsg(pbs.EVENT_DEBUG,
                   "external hook %s failed: %s" % (fpath, str(error)))
def file_check(self):
    """Validate the health-check script's existence, mode and ownership.

    Returns True only when self.script is a regular file whose
    permission bits, owner uid and group gid match the allowed values;
    every failure is logged and yields False.
    """
    if not os.path.isfile(self.script):
        pbs.logmsg(pbs.EVENT_DEBUG,
                   "Health-check hook; %s is not a file or not found" % self.script)
        return False
    mode = oct(stat.S_IMODE(os.lstat(self.script).st_mode))
    if mode != self.allowed_permission:
        pbs.logmsg(pbs.EVENT_DEBUG,
                   "Health-check hook; incorrect file permission: %s" % mode)
        return False
    info = os.stat(self.script)
    if info.st_uid != self.allowed_uid or info.st_gid != self.allowed_gid:
        pbs.logmsg(pbs.EVENT_DEBUG,
                   "Health-check hook; incorrect file owner: %d:%d" %
                   (info.st_uid, info.st_gid))
        return False
    return True
def __queuejob_handler(self):
    """Ensure every chunk of the job's select spec requests docker=true."""
    pbs.logmsg(pbs.LOG_DEBUG, "Docker queuejob handler start")
    chunks = []
    if "select" in self.j.Resource_List.keys():
        for chunk in str(self.j.Resource_List["select"]).split("+"):
            if re.search("docker=[Tt]{1}rue", chunk):
                chunks.append(chunk)
            else:
                chunks.append(chunk + ":docker=true")
    else:
        chunks.append("docker=true")
    pbs.logmsg(pbs.LOG_DEBUG, "Old select: %s" % str(self.j.Resource_List))
    self.j.Resource_List["select"] = pbs.select("+".join(chunks))
    pbs.logmsg(pbs.LOG_DEBUG, "New select: %s" % str(self.j.Resource_List))
def getandset_os(self):
    """Detect the host OS from /etc/os-release and publish it as the
    local vnode's 'os' resource (e.g. 'centos7').

    Returns True on success, False when the OS cannot be determined.
    """
    files_to_check = ["/etc/os-release"]
    lines = []
    version_aliases = {"rhel7.6": "centos7"}
    # Bug fix: the accumulator previously shadowed the 'os' module (and
    # the loop variable shadowed Python 2's 'file' builtin); renamed.
    os_name = ""
    version = ""
    try:
        for path in files_to_check:
            with open(path) as f:
                lines += f.readlines()
    except Exception as err:
        # Best effort: a missing file is logged, parsing continues with
        # whatever lines were gathered.
        pbs.logmsg(
            pbs.EVENT_DEBUG,
            "%s, getandset_os error: %s" % (self.hook_name, str(err)))
    try:
        for line in lines:
            parts = line.split("=")
            if parts[0] == "ID":
                os_name = parts[1].replace('"', '').strip()
            if parts[0] == "VERSION_ID":
                version = parts[1].replace('"', '').strip()
    except Exception as err:
        pbs.logmsg(
            pbs.EVENT_DEBUG,
            "%s, getandset_os error: %s" % (self.hook_name, str(err)))
        return False
    if os_name == "":
        return False
    res_value = os_name + version
    # Map known release IDs onto their canonical alias.
    if res_value in version_aliases.keys():
        res_value = version_aliases[res_value]
    self.vnl[self.local_node].resources_available["os"] = res_value
    pbs.logmsg(
        pbs.EVENT_DEBUG,
        "%s, resource os set to: %s" % (self.hook_name, res_value))
    return True
def main():
    """Hook entry point: load config, then dispatch on the event type.

    SystemExit (e.g. from event accept/reject) is logged and re-raised;
    any other exception is logged with a traceback and re-raised.
    """
    try:
        hook_config = {}
        if pbs.hook_config_filename:
            with open(pbs.hook_config_filename) as fr:
                hook_config.update(json.load(fr))
        event = pbs.event()
        if event.type == pbs.QUEUEJOB:
            hold_on_submit(hook_config, event.job)
        elif event.type == pbs.PERIODIC:
            periodic_release_hook(hook_config, event)
        else:
            pbs.logmsg(pbs.EVENT_ERROR, "Unknown event type %s" % event.type)
    except SystemExit:
        pbs.logmsg(pbs.LOG_DEBUG, "cycle - Exited with SystemExit")
        raise
    except:
        pbs.logmsg(pbs.EVENT_ERROR, "cycle - %s" % traceback.format_exc())
        raise
def launch_job(self):
    """Rewrite the job launch into a 'docker exec' on the container.

    The original argv is preserved (deep copy) so the 'executable'
    container type can re-issue it inside the container.
    """
    args = copy.deepcopy(self.e.argv)
    pbs.logmsg(pbs.LOG_DEBUG, "args are %s" % self.e.progname)
    self.e.progname = "/usr/bin/docker"
    self.e.argv = []
    self.e.argv.append("docker")
    self.e.argv.append("exec")
    kind = self.container_type
    if kind == "interactive":
        self.e.argv.append("-it")
    self.e.argv.append(self.jid)
    if kind == "interactive":
        self.e.argv.append("/bin/bash")
    elif kind == "script":
        self.e.argv.append("/bin/bash")
        if not self.job_file:
            pbs.logmsg(pbs.LOG_DEBUG, "Job file is missing")
            return
        self.e.argv.append("-c")
        self.e.argv.append(self.job_file)
    elif kind == "service":
        self.e.argv.append("/bin/sh")
    elif kind == "executable":
        self.e.argv.append("/bin/bash")
        self.e.argv.append("-c")
        command = ""
        for arg in args:
            command += arg + " "
            pbs.logmsg(pbs.LOG_DEBUG, "arg: %s" % arg)
        self.e.argv.append(command)
    return
def _connect(self, endpoint=None, port=None, job=None):
    """Cray PMI needs no persistent connection; just log the call.

    endpoint and port are accepted for interface compatibility only.
    """
    message = "Cray: connect" if job is None else "Cray: %s connect" % (job.id)
    pbs.logmsg(pbs.EVENT_DEBUG3, message)
    return
def __init__(self, pyhome=None):
    """Cray PMI backend initializer; no state to set up.

    pyhome: unused here — presumably accepted for interface
    compatibility with other PMI backends (TODO confirm).
    """
    pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: init")
# or contact the Altair Legal Department. # # Altair’s dual-license business model allows companies, individuals, and # organizations to create proprietary derivative works of PBS Pro and # distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair’s trademarks, including but not limited to "PBS™", # "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's # trademark licensing policies. import pbs e = pbs.event() if e.type == pbs.RESVSUB: pbs.logmsg(pbs.LOG_DEBUG, "hook event type is resvsub") elif e.type == pbs.RESV_END: pbs.logmsg(pbs.LOG_DEBUG, "hook event type is resv_end") elif e.type == pbs.QUEUEJOB: pbs.logmsg(pbs.LOG_DEBUG, "hook event type is queuejob") elif e.type == pbs.MODIFYJOB: pbs.logmsg(pbs.LOG_DEBUG, "hook event type is modifyjob") elif e.type == pbs.MOVEJOB: pbs.logmsg(pbs.LOG_DEBUG, "hook event type is movejob") elif e.type == pbs.RUNJOB: pbs.logmsg(pbs.LOG_DEBUG, "hook event type is runjob") elif e.type == pbs.PERIODIC: pbs.logmsg(pbs.LOG_DEBUG, "hook event type is periodic") elif e.type == pbs.EXECJOB_BEGIN: pbs.logmsg(pbs.LOG_DEBUG, "hook event type is execjob_begin") elif e.type == pbs.EXECJOB_PROLOGUE:
# The failed nodes are offlined. The 's' accounting record is generated. # To register the hook, as root via qmgr: # qmgr << RJS # create hook rjs_hook # set hook rjs_hook event = 'queuejob,execjob_launch' # set hook rjs_hook enabled = true # import hook rjs_hook application/x-python default ReliableJobStartup.py # RJS import pbs e = pbs.event() if e.type == pbs.QUEUEJOB: # add a log entry in server logs pbs.logmsg(pbs.LOG_DEBUG, "queuejob hook executed") e.job.tolerate_node_failures = "job_start" # Save current select spec in resource 'site' selspec = e.job.Resource_List["select"] if selspec is None: e.reject("Event job does not have select spec!") e.job.Resource_List["site"] = str(selspec) # increment_chunks() can use a percentage argument or an integer. For # example add 1 chunk to each chunk (except the first) in the job's # select spec new_select = selspec.increment_chunks(1) e.job.Resource_List["select"] = new_select pbs.logmsg(pbs.LOG_DEBUG, "job's select spec changed to %s" % new_select)
def trace_hook(**kwargs):
    """Simple exception trace logger for PBS hooks

    loglevel=<int> (pbs.LOG_DEBUG): log level to pass to pbs.logmsg()
    reject=True: reject the job upon completion of logging trace
    trace_in_reject=<bool> (False): pass trace to pbs.event().reject()
    trace_in_reject=<str>: message to pass to pbs.event().reject() with trace
    """
    import sys
    loglevel = kwargs.get('loglevel', pbs.LOG_ERROR)
    reject = kwargs.get('reject', True)
    trace_in_reject = kwargs.get('trace_in_reject', False)
    # Associate hook events with the appropriate PBS constant. This is a list
    # of all hook events as of PBS Pro 13.0. If the event does not exist, it is
    # removed from the list.
    hook_events = [
        'queuejob', 'modifyjob', 'movejob', 'runjob', 'execjob_begin',
        'execjob_prologue', 'execjob_launch', 'execjob_attach',
        'execjob_preterm', 'execjob_epilogue', 'execjob_end', 'resvsub',
        'provision', 'exechost_periodic', 'exechost_startup',
        'execjob_resize', 'execjob_abort'
    ]
    hook_event = {}
    # Bug fix: iterate over a copy — deleting from the list being
    # iterated made the loop skip the entry after each removal.
    for he in list(hook_events):
        # Only set available hooks for the current version of PBS.
        if hasattr(pbs, he.upper()):
            # Idiom fix: getattr() instead of eval() on a built string.
            event_code = getattr(pbs, he.upper())
            hook_event[event_code] = he
            hook_event[he] = event_code
            hook_event[he.upper()] = event_code
            del event_code
        else:
            hook_events.remove(he)
    trace = {
        'line': sys.exc_info()[2].tb_lineno,
        'module': sys.exc_info()[2].tb_frame.f_code.co_name,
        'exception': sys.exc_info()[0].__name__,
        # Bug fix: BaseException.message was removed in Python 3; fall
        # back to str() of the exception instance.
        'message': getattr(sys.exc_info()[1], 'message',
                           str(sys.exc_info()[1])),
    }
    tracemsg = '%s hook %s encountered an exception: Line %s in %s %s: %s' % (
        hook_event[pbs.event().type], pbs.event().hook_name, trace['line'],
        trace['module'], trace['exception'], trace['message'])
    rejectmsg = "Hook Error: request rejected as filter hook '%s' encountered " \
        "an exception. Please inform Admin" % pbs.event().hook_name
    if not isinstance(loglevel, int):
        # Bug fix: interpolate the bad value (the original left a
        # literal '%s' in the logged text) and warn before overriding.
        tracemsg = ('trace_hook() called with invalid argument (loglevel=%s), '
                    'setting to pbs.LOG_ERROR. ' % (loglevel,)) + tracemsg
        loglevel = pbs.LOG_ERROR
    # Bug fix: honor the requested loglevel (the original always logged
    # at pbs.LOG_ERROR, making the kwarg a no-op).
    pbs.logmsg(loglevel, tracemsg)
    if reject:
        tracemsg += ', request rejected'
        if isinstance(trace_in_reject, bool):
            if trace_in_reject:
                pbs.event().reject(tracemsg)
            else:
                pbs.event().reject(rejectmsg)
        else:
            pbs.event().reject(
                str(trace_in_reject) + 'Line %s in %s %s:\n%s' %
                (trace['line'], trace['module'], trace['exception'],
                 trace['message']))
def _deactivate_profile(self, job):
    """Undo per-job Cray power settings and record final energy usage.

    Removes the job's initial-energy file, clears any node/accelerator
    power caps (only when the job ran exclusively on its nodes), then
    parses the RUR report file to update job.resources_used["energy"]
    (in kWh).

    Returns True when a final energy value was recorded, False otherwise.
    """
    pbs.logmsg(pbs.LOG_DEBUG, "Cray: deactivate %s" % job.id)
    nids, cnt = nidlist(job)
    if cnt == 0:
        pbs.logjobmsg(job.id, "Cray: no compute nodes for power setting")
        return False

    # Remove the initial energy file; best-effort, it may not exist.
    try:
        os.unlink(energy_file(job))
    except Exception:
        pass

    # If this is the only job on the nodes, undo any power cap we set.
    if _running_excl(job):
        cmd = "set_power_cap --nids " + nids
        doit = False
        pcap = job.Resource_List['pcap_node']
        if pcap is not None:
            pbs.logjobmsg(job.id, "Cray: remove pcap node %d" % pcap)
            cmd += " --node 0"
            doit = True
        pcap = job.Resource_List['pcap_accelerator']
        if pcap is not None:
            pbs.logjobmsg(job.id, "Cray: remove pcap accel %d" % pcap)
            cmd += " --accel 0"
            doit = True
        if doit:
            try:
                launch(job.id, cmd)
            except Exception:
                # Cap removal is best-effort; failure must not block job end.
                pass
        else:
            pbs.logjobmsg(job.id, "Cray: no power cap to remove")

    # Get final energy value from RUR data.
    name = rur_file(job)
    try:
        rurfp = open(name, "r")
    except Exception:
        pbs.logjobmsg(job.id, "Cray: no RUR data")
        return False

    energy = 0
    seen = False  # becomes True once an "energy" plugin record is found
    try:
        # Refuse files not owned by root or writable by others: the metric
        # list below is passed to eval(), so the file must be trustworthy.
        sbuf = os.fstat(rurfp.fileno())
        if (sbuf.st_uid != 0) or (sbuf.st_mode & stat.S_IWOTH):
            pbs.logjobmsg(job.id, "Cray: RUR file permission: %s" % name)
            return False
        pbs.logjobmsg(job.id, "Cray: reading RUR file: %s" % name)
        for line in rurfp:
            plugin, _, rest = line.partition(" : ")
            if plugin != "energy":  # only the energy plugin is of interest
                continue
            apid, _, metstr = rest.partition(" : ")
            seen = True
            try:
                # Parse the metric list, alternating key/value entries.
                # SECURITY NOTE: eval() of file content; guarded by the
                # root-owned / not-world-writable check above.
                metlist = eval(metstr, {})
                metrics = dict(metlist[i:i + 2]
                               for i in range(0, len(metlist), 2))
                joules = metrics["energy_used"]
                energy += joules
                pbs.logjobmsg(
                    job.id,
                    'Cray:RUR: {"apid":%s,"apid_energy":%dJ,"job_energy":%dJ}'
                    % (apid, joules, energy))
            except Exception as e:
                pbs.logjobmsg(job.id,
                              "Cray:RUR: energy_used not found: %s" % str(e))
    finally:
        # Always release the handle and consume the report file; the
        # previous version leaked both when the parse loop raised.
        rurfp.close()
        os.unlink(name)

    if not seen:
        pbs.logjobmsg(job.id, "Cray:RUR: no energy plugin")
        return False

    # Convert joules to kWh and keep the larger of the final and any
    # previously recorded periodic value.
    old_energy = job.resources_used["energy"]
    new_energy = float(energy) / 3600000.0
    if old_energy is None:
        pbs.logjobmsg(job.id, "Cray:RUR: energy %fkWh" % new_energy)
        job.resources_used["energy"] = new_energy
    elif new_energy > old_energy:
        pbs.logjobmsg(
            job.id,
            "Cray:RUR: energy %fkWh replaces periodic energy %fkWh"
            % (new_energy, old_energy))
        job.resources_used["energy"] = new_energy
    else:
        pbs.logjobmsg(
            job.id,
            "Cray:RUR: energy %fkWh last periodic usage %fkWh"
            % (new_energy, old_energy))
    return True
def power_status(self, hosts=None):
    """Return the PMI power status report.

    hosts: optional host list; None queries every node known to the PMI.
    """
    self._check_pmi()
    pbs.logmsg(pbs.EVENT_DEBUG3, "PMI:powerstatus: status of nodes")
    status = self.__pmi._pmi_power_status(hosts)
    return status
def debug(msg):
    """Write *msg* to the PBS log at EVENT_DEBUG3, tagged 'LA debug'."""
    text = 'LA debug: %s' % msg
    pbs.logmsg(pbs.EVENT_DEBUG3, text)
def log_function_name():
    """Emit a DEBUG4 log line naming the current hook and the calling method."""
    hook = pbs.event().hook_name
    caller = caller_name(2)
    pbs.logmsg(pbs.EVENT_DEBUG4, '%s:%s: Method called' % (hook, caller))
# If it's a system user accept the job if pbs.event().requestor in ["PBS_Server", "Scheduler", "pbs_mom"]: pbs.event().accept() # Check if project has been set if pbs.event().job.project is not None: project = str(pbs.event().job.project) else: project = "_pbs_project_default" # Accept if it's default project # TODO change it in production if project == "_pbs_project_default": pbs.event().accept() pbs.logmsg(pbs.LOG_DEBUG, "---> Queuejob Hook Start! Requestor is %s and project is %s" % (pbs.event().requestor, project)) pbs.logmsg(pbs.LOG_DEBUG, "---> select line is %s" % pbs.event().job.Resource_List.select) # Check if user is part of the project if not isEntitled(pbs.event().requestor, project): pbs.logmsg(pbs.LOG_DEBUG, "---> user " + str(pbs.event().requestor) + " is not part of project " + str(project)) pbs.event().reject("You are not allowed to use the budget of project " + str(project)) # myQueue = str(pbs.event().job.queue) if myQueue == "": pbs.event().reject("No queue selected, please select a queue") if myQueue == "workq": pbs.event().reject("Queue workq is not enabled") #
def _disconnect(self, job=None):
    """Log the Cray disconnect; there is no real connection to tear down."""
    if job is not None:
        pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: %s disconnect" % (job.id))
    else:
        pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: disconnect")
    return
def _read_limit(config, key, default):
    # Parse config[key] as a positive int, replacing 0/negative with default,
    # and log the value that will be used.
    value = int(config[key])
    if not value or value < 0:
        value = default
    pbs.logmsg(pbs.EVENT_DEBUG3, "%s is set to %d" % (key, value))
    return value


def parse_config_file():
    """Load the PBS_power hook configuration into module globals.

    Locates PBS_power.CF via $PBS_HOOK_CONFIG_FILE or under
    $PBS_HOME/{server_priv,mom_priv}/hooks, parses it as JSON and sets the
    power-management globals, falling back to safe defaults for missing or
    invalid entries. Raises Exception when no config file can be read.
    """
    # Turn everything off by default. These settings are modified
    # when the configuration file is read.
    global pbs_home
    global pbs_exec
    global power_ramp_rate_enable
    global power_on_off_enable
    global node_idle_limit
    global min_node_down_delay
    global max_jobs_analyze_limit
    global max_concurrent_nodes

    try:
        # This block will work for PBS Pro versions 13 and later
        pbs_conf = pbs.get_pbs_conf()
        pbs_home = pbs_conf['PBS_HOME']
        pbs_exec = pbs_conf['PBS_EXEC']
    except Exception:
        pbs.logmsg(pbs.EVENT_DEBUG,
                   "PBS_HOME needs to be defined in the config file")
        pbs.logmsg(pbs.EVENT_DEBUG, "Exiting the power hook")
        pbs.event().accept()

    # Identify the config file and read in the data
    config_file = ''
    if 'PBS_HOOK_CONFIG_FILE' in os.environ:
        config_file = os.environ["PBS_HOOK_CONFIG_FILE"]
    if not config_file:
        tmpcfg = os.path.join(pbs_home, 'server_priv', 'hooks',
                              'PBS_power.CF')
        if os.path.isfile(tmpcfg):
            config_file = tmpcfg
    if not config_file:
        tmpcfg = os.path.join(pbs_home, 'mom_priv', 'hooks',
                              'PBS_power.CF')
        if os.path.isfile(tmpcfg):
            config_file = tmpcfg
    if not config_file:
        raise Exception("Config file not found")
    pbs.logmsg(pbs.EVENT_DEBUG3, "Config file is %s" % config_file)

    try:
        # 'with' guarantees the handle is closed even when json.load fails;
        # the previous version leaked it on parse errors.
        with open(config_file, 'r') as fd:
            config = json.load(fd)
    except IOError:
        raise Exception("I/O error reading config file")
    except Exception:
        raise Exception("Error reading config file")

    # Assign default values to attributes
    power_ramp_rate_enable = False
    power_on_off_enable = False
    node_idle_limit = 1800
    min_node_down_delay = 1800
    max_jobs_analyze_limit = 100
    max_concurrent_nodes = 10

    # Now assign values read from config file
    if 'power_on_off_enable' in config:
        power_on_off_enable = config['power_on_off_enable']
        pbs.logmsg(pbs.EVENT_DEBUG3, "power_on_off_enable is set to %s"
                   % str(power_on_off_enable))
    if 'power_ramp_rate_enable' in config:
        power_ramp_rate_enable = config['power_ramp_rate_enable']
        pbs.logmsg(pbs.EVENT_DEBUG3, "power_ramp_rate_enable is set to %s"
                   % str(power_ramp_rate_enable))
    if 'node_idle_limit' in config:
        node_idle_limit = _read_limit(config, 'node_idle_limit', 1800)
    if 'min_node_down_delay' in config:
        min_node_down_delay = _read_limit(config, 'min_node_down_delay', 1800)
    if 'max_jobs_analyze_limit' in config:
        max_jobs_analyze_limit = _read_limit(config,
                                             'max_jobs_analyze_limit', 100)
    if 'max_concurrent_nodes' in config:
        max_concurrent_nodes = _read_limit(config, 'max_concurrent_nodes', 10)
def _query(self, query_type):
    """Cray backend query stub: logs the call and reports nothing."""
    pbs.logmsg(pbs.LOG_DEBUG, "Cray: query")
    # No query types are supported by this backend.
    return None
# pbs_conf() will return PBS_HOME if it is not. mom_priv = os.path.abspath( os.path.join(pbs_conf()['PBS_MOM_HOME'], 'mom_priv')) # Get the hook alarm time from the .HK file if it exists. hk_file = os.path.join(mom_priv, 'hooks', '%s.HK' % hook_name) if os.path.exists(hk_file): hook_settings = dict( [l.strip().split('=') for l in open(hk_file, 'r').readlines()]) if 'alarm' in hook_settings.keys(): hook_alarm = int(hook_settings['alarm']) if 'debug' in hook_settings.keys(): DEBUG = True if hook_settings['debug'] == 'true' else False if DEBUG: pbs.logmsg(pbs.LOG_DEBUG, '%s;%s;[DEBUG] starting.' % (hook_name, job.id)) if 'PBS_HOOK_CONFIG_FILE' in os.environ: config_file = os.environ["PBS_HOOK_CONFIG_FILE"] config = dict([ l.split('#')[0].strip().split('=') for l in open(config_file, 'r').readlines() if '=' in l ]) # Set the true/false configurations if 'ENABLE_PARALLEL' in config.keys(): ENABLE_PARALLEL = config['ENABLE_PARALLEL'].lower()[0] in [ 't', '1' ] if 'VERBOSE_USER_OUTPUT' in config.keys(): VEROSE_USER_OUTPUT = config['VERBOSE_USER_OUTPUT'].lower()[0] in [
# distribute them - whether embedded or bundled with other software - # under a commercial license agreement. # # Use of Altair’s trademarks, including but not limited to "PBS™", # "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's # trademark licensing policies. # import pbs import os e = pbs.event() vnode = e.vnode aoe = e.aoe pbs.logmsg(pbs.LOG_DEBUG, "PROVISIONING: Env = %s" % repr(os.environ)) pbs.logmsg(pbs.LOG_DEBUG, "PROVISIONING: PBS Node = %s" % vnode) pbs.logmsg(pbs.LOG_DEBUG, "PROVISIONING: AOE = %s" % aoe) # Provision hook will run on PBS Server but provisioning is started from Admin node, both may not run on same node. # Check for admin node? Read from json config file. if 'PBS_HOOK_CONFIG_FILE' in os.environ: import json config_file = os.environ["PBS_HOOK_CONFIG_FILE"] #pbs.logmsg(pbs.EVENT_DEBUG, "%s: Config file is %s" % (caller_name(), config_file)) config = json.load(open(config_file, 'r'), object_hook=decode_dict) server = pbs.server().name admin = config['admin-node'] pbs.logmsg(pbs.LOG_DEBUG, "PROVISIONING: server name = %s" % server)
import pbs

# Modifyjob filter: only PBS daemons and root may alter a job after
# submission; ordinary users are refused.
e = pbs.event()
who = e.requestor
pbs.logmsg(pbs.LOG_DEBUG, "requestor=%s" % (who,))
admin_ulist = ["PBS_Server", "Scheduler", "pbs_mom", "root"]
if who not in admin_ulist:
    e.reject("Normal users are not allowed to modify their jobs")
def error(msg):
    """Write *msg* to the PBS log at EVENT_ERROR, tagged 'LA error'."""
    text = 'LA error: %s' % msg
    pbs.logmsg(pbs.EVENT_ERROR, text)
max_jobs_analyze_limit) if 'max_concurrent_nodes' in config: max_concurrent_nodes = int(config['max_concurrent_nodes']) if not max_concurrent_nodes or max_concurrent_nodes < 0: max_concurrent_nodes = 10 pbs.logmsg(pbs.EVENT_DEBUG3, "max_concurrent_nodes is set to %d" % max_concurrent_nodes) # Accept if event not serviceable. this_event = pbs.event() if this_event.type not in [pbs.EXECJOB_PROLOGUE, pbs.EXECJOB_EPILOGUE, pbs.EXECJOB_BEGIN, pbs.EXECJOB_END, pbs.EXECHOST_STARTUP, pbs.EXECHOST_PERIODIC, pbs.PERIODIC]: pbs.logmsg(pbs.LOG_WARNING, "Event not serviceable for power provisioning.") this_event.accept() if this_event.type == pbs.PERIODIC: vnlist = this_event.vnode_list resvlist = this_event.resv_list time_now = time.time() # Parse the config file for power attributes try: parse_config_file() except Exception as e: this_event.reject(str(e)) if power_ramp_rate_enable == 0 and power_on_off_enable == 0:
''' This hook output resource_user.instance_type_used to the current EC2 instance type to the accounting logs create hook soca_aws_infos event=execjob_begin import hook soca_aws_infos application/x-python default /apps/soca/<cLUSTER_ID>/cluster_hooks/execjob_begin/soca_aws_infos.py ''' import re import socket import pbs import urllib2 pbs.logmsg(pbs.LOG_DEBUG, 'soca_aws_infos: start') instance_type = urllib2.urlopen( "http://169.254.169.254/latest/meta-data/instance-type").read() instance_type = instance_type.replace('.', '_') pbs.logmsg(pbs.LOG_DEBUG, 'soca_aws_infos: detected instance: ' + str(instance_type)) e = pbs.event() j = e.job host = (socket.gethostname()).split('.')[0] regex_vnode = r'\(.*?\)' exec_vnode = str(j.exec_vnode) vnode_list = re.findall('\(.*?\)', exec_vnode) if host in vnode_list[0]: pbs.logmsg( pbs.LOG_DEBUG, 'soca_aws_infos: detected host, about to specify new resource used') try: j.resources_used["instance_type_used"] = str(instance_type)
def vnodes_enabled(job):
    """Return True when power operations are allowed on every vnode of *job*."""
    # see if power operations are allowed on all job vnodes
    for vn in _get_vnode_names(job):
        if not _svr_vnode(vn).power_provisioning:
            pbs.logjobmsg(job.id,
                          "power functionality is disabled on vnode %s" % vn)
            return False
    return True


# Accept if event not serviceable.
this_event = pbs.event()
if this_event.type not in [pbs.EXECJOB_PROLOGUE, pbs.EXECJOB_EPILOGUE,
                           pbs.EXECJOB_BEGIN, pbs.EXECJOB_END,
                           pbs.EXECHOST_STARTUP, pbs.EXECHOST_PERIODIC]:
    pbs.logmsg(pbs.LOG_WARNING,
               "Event not serviceable for power provisioning.")
    this_event.accept()

# Set eoe values for my node
if this_event.type == pbs.EXECHOST_STARTUP:
    from pbs.v1._pmi_utils import _is_node_provisionable
    # Don't connect if the server or sched is running.
    if not _is_node_provisionable():
        pbs.logmsg(pbs.LOG_DEBUG,
                   "Provisioning cannot be enabled on this host")
        this_event.accept()
    power = init_power(this_event)
    # Publish the available power profiles as eoe values (continues below).
    profiles = power.query(pbs.Power.QUERY_PROFILE)
    if profiles is not None: