def _get_usage(self, job):
    """Return the job's accumulated energy (kWh) from the SGI monitor, or None."""
    pbs.logjobmsg(job.id, "SGI: get_usage")
    report = api.MonitorReport(job.id)
    # Only a 'total_energy' report carries a usable counter.
    if report is None or report[0] != 'total_energy':
        return None
    pbs.logjobmsg(job.id, "SGI: energy %fkWh" % report[1])
    return report[1]
def _get_usage(self, job):
    """Return the job's total energy usage in kWh, or None if not reported."""
    pbs.logmsg(pbs.EVENT_DEBUG3, "SGI: %s get_usage" % (job.id))
    report = api.MonitorReport(job.id)
    usable = report is not None and report[0] == 'total_energy'
    if not usable:
        return None
    pbs.logjobmsg(job.id, "SGI: energy %fkWh" % report[1])
    return report[1]
def vnodes_enabled(job):
    """Check that power operations are permitted on every vnode of *job*.

    Logs the first vnode found with power_provisioning disabled and
    returns False; returns True when all vnodes allow power operations.
    """
    for name in _get_vnode_names(job):
        if _svr_vnode(name).power_provisioning:
            continue
        pbs.logjobmsg(job.id,
                      "power functionality is disabled on vnode %s" % name)
        return False
    return True
def deactivate_profile(self, job=None):
    """Reset current_eoe on the job's vnodes (when the job runs exclusively)
    and delegate profile deactivation to the PMI backend.

    :param job: pbs job; defaults to the current event's job.
    :returns: result of the backend's _deactivate_profile().
    """
    self._check_pmi()
    if job is None:
        job = pbs.event().job
    if _running_excl(job):
        pbs.logjobmsg(job.id, "PMI: reset current_eoe")
        for h in _get_vnode_names(job):
            try:
                pbs.event().vnode_list[h].current_eoe = None
            except Exception:
                # Best effort: a vnode may be missing from this event's list.
                # (Was a bare "except:", which also swallowed SystemExit
                # and KeyboardInterrupt.)
                pass
    return self.__pmi._deactivate_profile(job)
def deactivate_profile(self, job=None):
    """Clear current_eoe on each vnode of an exclusively-running job,
    then hand deactivation off to the configured PMI backend.

    :param job: pbs job; defaults to the current event's job.
    :returns: result of the backend's _deactivate_profile().
    """
    self._check_pmi()
    if job is None:
        job = pbs.event().job
    if _running_excl(job):
        pbs.logjobmsg(job.id, "PMI: reset current_eoe")
        for h in _get_vnode_names(job):
            try:
                pbs.event().vnode_list[h].current_eoe = None
            except Exception:
                # Narrowed from a bare "except:"; keep best-effort behavior
                # without trapping SystemExit/KeyboardInterrupt.
                pass
    return self.__pmi._deactivate_profile(job)
def _get_usage(self, job):
    """Return energy used by *job* in kWh since profile activation, or None."""
    pbs.logjobmsg(job.id, "Cray: get_usage")
    # The starting counter was written by _activate_profile; without it
    # there is nothing to subtract from, so report no usage.
    try:
        f = open(energy_file(job), "r")
        start = int(f.read())
        f.close()
    except Exception:
        return None
    e = pbs.event()
    if e.type == pbs.EXECHOST_PERIODIC:
        # This function will be called for each job in turn when
        # running from a periodic hook. Here we fill in some
        # global variables just once and use the information
        # for each job in turn. Save the result of calling capmc
        # for all running jobs in the variable ninfo. Keep a
        # dictionary with the job id's as keys holding a set
        # of nid numbers.
        if Pmi.ninfo is None:
            allnids = set()
            for jobid in e.job_list.keys():
                j = e.job_list[jobid]
                nidset = jobnids(j)
                allnids.update(nidset)
                Pmi.nidarray[jobid] = nidset
            nids, cnt = nidlist(None, allnids)
            Pmi.ninfo = node_energy("all", nids, cnt)
        nidset = Pmi.nidarray[job.id]
        energy = None
        if Pmi.ninfo is not None and "nodes" in Pmi.ninfo:
            energy = 0
            for node in Pmi.ninfo["nodes"]:
                if node["nid"] in nidset:
                    # owned by job of interest
                    energy += node["energy_ctr"]
            pbs.logjobmsg(job.id,
                          "Cray: get_usage: energy %dJ" % energy)
    else:
        # Non-periodic event: query capmc for this job's nodes only.
        nids, cnt = nidlist(job)
        energy = job_energy(job, nids, cnt)
    if energy is not None:
        # Joules -> kWh (3,600,000 J per kWh).
        return float(energy - start) / 3600000.0
    else:
        return None
def _get_usage(self, job):
    """Return energy used by *job* in kWh since profile activation, or None."""
    pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: %s get_usage" % (job.id))
    # The starting counter was written by _activate_profile; without it
    # there is nothing to subtract from, so report no usage.
    try:
        f = open(energy_file(job), "r")
        start = int(f.read())
        f.close()
    except Exception:
        return None
    e = pbs.event()
    if e.type == pbs.EXECHOST_PERIODIC:
        # This function will be called for each job in turn when
        # running from a periodic hook. Here we fill in some
        # global variables just once and use the information
        # for each job in turn. Save the result of calling capmc
        # for all running jobs in the variable ninfo. Keep a
        # dictionary with the job id's as keys holding a set
        # of nid numbers.
        if Pmi.ninfo is None:
            allnids = set()
            for jobid in e.job_list.keys():
                j = e.job_list[jobid]
                nidset = jobnids(j)
                allnids.update(nidset)
                Pmi.nidarray[jobid] = nidset
            nids, cnt = nidlist(None, allnids)
            Pmi.ninfo = node_energy("all", nids, cnt)
        nidset = Pmi.nidarray[job.id]
        energy = None
        if Pmi.ninfo is not None and "nodes" in Pmi.ninfo:
            energy = 0
            for node in Pmi.ninfo["nodes"]:
                if node["nid"] in nidset:
                    # owned by job of interest
                    energy += node["energy_ctr"]
            pbs.logjobmsg(job.id,
                          "Cray: get_usage: energy %dJ" % energy)
    else:
        # Non-periodic event: query capmc for this job's nodes only.
        nids, cnt = nidlist(job)
        energy = job_energy(job, nids, cnt)
    if energy is not None:
        # Joules -> kWh (3,600,000 J per kWh).
        return float(energy - start) / 3600000.0
    else:
        return None
def rejectjob(reason, action=DEFAULT_ACTION):
    """Log job rejection and then call pbs.event().reject()"""
    # Arguments to pbs.event().reject() do nothing in execjob events. Log a
    # warning instead, update the job comment, then reject the job.
    if action == RERUN:
        job.rerun()
        reason = 'Requeued - %s' % reason
    elif action == DELETE:
        job.delete()
        reason = 'Deleted - %s' % reason
    else:
        reason = 'Rejected - %s' % reason
    job.comment = '%s: %s' % (hook_name, reason)
    pbs.logmsg(pbs.LOG_WARNING, ';'.join([hook_name, job.id, reason]))
    # Add a message that can be tracejob'd
    pbs.logjobmsg(job.id, reason)
    if VERBOSE_USER_OUTPUT:
        # "print reason" was Python-2-only syntax (a SyntaxError under
        # Python 3); print() works on both.
        print(reason)
    pbs_event.reject()
def rejectjob(reason, action=DEFAULT_ACTION):
    """Log job rejection and then call pbs.event().reject()"""
    # Arguments to pbs.event().reject() do nothing in execjob events. Log a
    # warning instead, update the job comment, then reject the job.
    if action == RERUN:
        job.rerun()
        reason = 'Requeued - %s' % reason
    elif action == DELETE:
        job.delete()
        reason = 'Deleted - %s' % reason
    else:
        reason = 'Rejected - %s' % reason
    job.comment = '%s: %s' % (hook_name, reason)
    pbs.logmsg(pbs.LOG_WARNING, ';'.join([hook_name, job.id, reason]))
    # Add a message that can be tracejob'd
    pbs.logjobmsg(job.id, reason)
    if VERBOSE_USER_OUTPUT:
        # Fixed Python-2-only "print reason" statement; print() is valid
        # on both Python 2 and 3.
        print(reason)
    pbs_event.reject()
def launch(jid, args):
    """
    Run capmc and return the structured output.
    :param jid: job id
    :type jid: str
    :param args: arguments for capmc command
    :type args: str
    :returns: capmc output in json format.
    """
    import json

    # full path to capmc given by Cray
    cmd = os.path.join(os.path.sep, 'opt', 'cray', 'capmc', 'default',
                       'bin', 'capmc')
    if not os.path.exists(cmd):
        cmd = "capmc"  # should be in PATH then
    cmd = cmd + " " + args
    fail = ""
    pbs.logjobmsg(jid, "launch: " + cmd)
    # NOTE(review): shell=True with a string command — args is built by the
    # hook itself, but confirm it can never carry user-controlled text.
    cmd_run = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
    (cmd_out, cmd_err) = cmd_run.communicate()
    exitval = cmd_run.returncode
    if exitval != 0:
        fail = "%s: exit %d" % (cmd, exitval)
    else:
        pbs.logjobmsg(jid, "launch: finished")
    # capmc output should be a JSON document; treat anything else as None.
    try:
        out = json.loads(cmd_out)
    except Exception:
        out = None
    try:
        err = cmd_err.splitlines()[0]  # first line only
    except Exception:
        err = ""
    if out is not None:
        # capmc reports its own status in the "e"/"err_msg" fields.
        errno = out["e"]
        msg = out["err_msg"]
        if errno != 0 or (len(msg) > 0 and msg != "Success"):
            fail = "output: e=%d err_msg='%s'" % (errno, msg)
    if len(err) > 0:
        pbs.logjobmsg(jid, "stderr: %s" % err.strip())
    if len(fail) > 0:
        # Any failure (exit status or capmc-reported error) is fatal here.
        pbs.logjobmsg(jid, fail)
        raise BackendError(fail)
    return out
def job_energy(job, nids, cnt):
    """
    Return energy counter from capmc.

    Return None if no energy value is available.

    :param job: pbs job.
    :type job: str
    :param nids: nid list
    :type nids: str
    :param cnt: node count
    :type cnt: int
    :returns: summed energy (J) on a successful capmc query, None on failure.
    """
    report = node_energy(job.id, nids, cnt)
    if report is None or "nodes" not in report:
        return None
    total = sum(entry["energy_ctr"] for entry in report["nodes"])
    pbs.logjobmsg(job.id, "energy usage %dJ" % total)
    return total
def launch(jid, args):
    """
    Run capmc and return the structured output.
    :param jid: job id
    :type jid: str
    :param args: arguments for capmc command
    :type args: str
    :returns: capmc output in json format.
    """
    import json

    # full path to capmc given by Cray
    cmd = os.path.join(os.path.sep, 'opt', 'cray', 'capmc', 'default',
                       'bin', 'capmc')
    if not os.path.exists(cmd):
        cmd = "capmc"  # should be in PATH then
    cmd = cmd + " " + args
    fail = ""
    pbs.logjobmsg(jid, "launch: " + cmd)
    # NOTE(review): shell=True with a string command — args is built by the
    # hook itself, but confirm it can never carry user-controlled text.
    cmd_run = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
    (cmd_out, cmd_err) = cmd_run.communicate()
    exitval = cmd_run.returncode
    if exitval != 0:
        fail = "%s: exit %d" % (cmd, exitval)
    else:
        pbs.logjobmsg(jid, "launch: finished")
    # capmc output should be a JSON document; treat anything else as None.
    try:
        out = json.loads(cmd_out)
    except Exception:
        out = None
    try:
        err = cmd_err.splitlines()[0]  # first line only
    except Exception:
        err = ""
    if out is not None:
        # capmc reports its own status in the "e"/"err_msg" fields.
        errno = out["e"]
        msg = out["err_msg"]
        if errno != 0 or (len(msg) > 0 and msg != "Success"):
            fail = "output: e=%d err_msg='%s'" % (errno, msg)
    if len(err) > 0:
        pbs.logjobmsg(jid, "stderr: %s" % err.strip())
    if len(fail) > 0:
        # Any failure (exit status or capmc-reported error) is fatal here.
        pbs.logjobmsg(jid, fail)
        raise BackendError(fail)
    return out
def job_energy(job, nids, cnt):
    """
    Return energy counter from capmc.

    Return None if no energy value is available.

    :param job: pbs job.
    :type job: str
    :param nids: nid list
    :type nids: str
    :param cnt: node count
    :type cnt: int
    :returns: summed energy (J) on a successful capmc query, None on failure.
    """
    result = node_energy(job.id, nids, cnt)
    usage = None
    if result is not None and "nodes" in result:
        usage = 0
        for record in result["nodes"]:
            usage += record["energy_ctr"]
        pbs.logjobmsg(job.id, "energy usage %dJ" % usage)
    return usage
def node_energy(jid, nids, cnt):
    """
    Return the result of running capmc get_node_energy_counter.

    The magic number of 15 seconds in the past is used because that is
    the most current value that can be expected from capmc.

    :param jid: job id.
    :type jid: str
    :param nids: nid list
    :type nids: str
    :param cnt: node count
    :type cnt: int
    :returns: capmc reply on a successful energy query, None on failure.
    """
    if cnt == 0:
        return None
    cmd = "get_node_energy_counter --nids %s" % nids
    cntkey = "nid_count"
    # capmc sometimes answers for fewer nodes than asked; retry once.
    seen = "<notset>"
    for attempt in range(2):
        reply = launch(jid, cmd)
        seen = "<notset>"
        if (reply is not None) and (cntkey in reply):
            seen = reply[cntkey]
        if seen == cnt:
            return reply
        if attempt == 0:
            pbs.logjobmsg(jid, "node count %s, should be %d"
                          % (str(seen), cnt))
    pbs.logjobmsg(
        jid,
        "second query failed, node count %s, should be %d"
        % (str(seen), cnt))
    return None
def node_energy(jid, nids, cnt):
    """
    Return the result of running capmc get_node_energy_counter.

    The magic number of 15 seconds in the past is used because that is
    the most current value that can be expected from capmc.

    :param jid: job id.
    :type jid: str
    :param nids: nid list
    :type nids: str
    :param cnt: node count
    :type cnt: int
    :returns: capmc reply on a successful energy query, None on failure.
    """
    if cnt == 0:
        return None
    query = "get_node_energy_counter --nids %s" % nids

    def reported_count(ans):
        # Node count from a capmc reply, or a placeholder when absent.
        if (ans is not None) and ("nid_count" in ans):
            return ans["nid_count"]
        return "<notset>"

    ans = launch(jid, query)
    got = reported_count(ans)
    if got == cnt:
        return ans
    pbs.logjobmsg(jid, "node count %s, should be %d" % (str(got), cnt))
    # One retry: capmc occasionally returns a short answer.
    ans = launch(jid, query)
    got = reported_count(ans)
    if got == cnt:
        return ans
    pbs.logjobmsg(jid, "second query failed, node count %s, should be %d"
                  % (str(got), cnt))
    return None
def _activate_profile(self, profile_name, job):
    """Record the job's starting energy counter and apply any requested
    power caps (pcap_node / pcap_accelerator) when the job runs alone.

    :param profile_name: requested power profile (logged only).
    :param job: pbs job being activated.
    :returns: False when the job has no compute nodes, True otherwise.
    """
    pbs.logmsg(pbs.LOG_DEBUG,
               "Cray: %s activate '%s'" % (job.id, str(profile_name)))
    nids, cnt = nidlist(job)
    if cnt == 0:
        pbs.logjobmsg(job.id, "Cray: no compute nodes for power setting")
        return False
    energy = job_energy(job, nids, cnt)
    if energy is not None:
        # Save the starting counter; _get_usage subtracts it later.
        # "with" guarantees the file is closed even if write() raises
        # (the open/write/close sequence leaked the handle on error).
        with open(energy_file(job), "w") as f:
            f.write(str(energy))
    # If this is the only job, set nodes to capped power.
    if _running_excl(job):
        cmd = "set_power_cap --nids " + nids
        doit = False
        pcap = job.Resource_List['pcap_node']
        if pcap is not None:
            pbs.logjobmsg(job.id, "Cray: pcap node %d" % pcap)
            cmd += " --node " + str(pcap)
            doit = True
        pcap = job.Resource_List['pcap_accelerator']
        if pcap is not None:
            pbs.logjobmsg(job.id, "Cray: pcap accel %d" % pcap)
            cmd += " --accel " + str(pcap)
            doit = True
        if doit:
            launch(job.id, cmd)
        else:
            pbs.logjobmsg(job.id, "Cray: no power cap to set")
    return True
def _activate_profile(self, profile_name, job):
    """Save the job's starting energy counter to a file and, for an
    exclusively-running job, apply the requested capmc power caps.

    :param profile_name: requested power profile (logged only).
    :param job: pbs job being activated.
    :returns: False when the job has no compute nodes, True otherwise.
    """
    pbs.logmsg(pbs.LOG_DEBUG,
               "Cray: %s activate '%s'" % (job.id, str(profile_name)))
    nids, cnt = nidlist(job)
    if cnt == 0:
        pbs.logjobmsg(job.id, "Cray: no compute nodes for power setting")
        return False
    energy = job_energy(job, nids, cnt)
    if energy is not None:
        # Context manager closes the file even when write() raises;
        # the original open/write/close leaked the handle on error.
        with open(energy_file(job), "w") as f:
            f.write(str(energy))
    # If this is the only job, set nodes to capped power.
    if _running_excl(job):
        cmd = "set_power_cap --nids " + nids
        doit = False
        pcap = job.Resource_List['pcap_node']
        if pcap is not None:
            pbs.logjobmsg(job.id, "Cray: pcap node %d" % pcap)
            cmd += " --node " + str(pcap)
            doit = True
        pcap = job.Resource_List['pcap_accelerator']
        if pcap is not None:
            pbs.logjobmsg(job.id, "Cray: pcap accel %d" % pcap)
            cmd += " --accel " + str(pcap)
            doit = True
        if doit:
            launch(job.id, cmd)
        else:
            pbs.logjobmsg(job.id, "Cray: no power cap to set")
    return True
class Pmi:
    """Cray power-management backend: drives capmc and reads RUR data."""

    # Shared across all jobs during one periodic hook run:
    # ninfo caches a single capmc energy query covering every running job,
    # nidarray maps job id -> set of nid numbers (see _get_usage).
    ninfo = None
    nidarray = dict()

    def __init__(self, pyhome=None):
        pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: init")

    def _connect(self, endpoint=None, port=None, job=None):
        # capmc needs no session: connect is a logged no-op.
        if job is None:
            pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: connect")
        else:
            pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: %s connect" % (job.id))
        return

    def _disconnect(self, job=None):
        # Nothing to tear down: disconnect is a logged no-op.
        if job is None:
            pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: disconnect")
        else:
            pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: %s disconnect" % (job.id))
        return

    def _get_usage(self, job):
        """Return energy used by *job* in kWh since activation, or None."""
        pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: %s get_usage" % (job.id))
        # Starting counter was saved by _activate_profile.
        try:
            f = open(energy_file(job), "r")
            start = int(f.read())
            f.close()
        except Exception:
            return None
        e = pbs.event()
        if e.type == pbs.EXECHOST_PERIODIC:
            # This function will be called for each job in turn when
            # running from a periodic hook. Here we fill in some
            # global variables just once and use the information
            # for each job in turn. Save the result of calling capmc
            # for all running jobs in the variable ninfo. Keep a
            # dictionary with the job id's as keys holding a set
            # of nid numbers.
            if Pmi.ninfo is None:
                allnids = set()
                for jobid in e.job_list.keys():
                    j = e.job_list[jobid]
                    nidset = jobnids(j)
                    allnids.update(nidset)
                    Pmi.nidarray[jobid] = nidset
                nids, cnt = nidlist(None, allnids)
                Pmi.ninfo = node_energy("all", nids, cnt)
            nidset = Pmi.nidarray[job.id]
            energy = None
            if Pmi.ninfo is not None and "nodes" in Pmi.ninfo:
                energy = 0
                for node in Pmi.ninfo["nodes"]:
                    if node["nid"] in nidset:
                        # owned by job of interest
                        energy += node["energy_ctr"]
                pbs.logjobmsg(job.id,
                              "Cray: get_usage: energy %dJ" % energy)
        else:
            nids, cnt = nidlist(job)
            energy = job_energy(job, nids, cnt)
        if energy is not None:
            # Joules -> kWh (3,600,000 J per kWh).
            return float(energy - start) / 3600000.0
        else:
            return None

    def _query(self, query_type):
        # No queryable state on this backend.
        pbs.logmsg(pbs.LOG_DEBUG, "Cray: query")
        return None

    def _activate_profile(self, profile_name, job):
        """Save the starting energy counter and apply requested power caps."""
        pbs.logmsg(pbs.LOG_DEBUG,
                   "Cray: %s activate '%s'" % (job.id, str(profile_name)))
        nids, cnt = nidlist(job)
        if cnt == 0:
            pbs.logjobmsg(job.id, "Cray: no compute nodes for power setting")
            return False
        energy = job_energy(job, nids, cnt)
        if energy is not None:
            f = open(energy_file(job), "w")
            f.write(str(energy))
            f.close()
        # If this is the only job, set nodes to capped power.
        if _running_excl(job):
            cmd = "set_power_cap --nids " + nids
            doit = False
            pcap = job.Resource_List['pcap_node']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: pcap node %d" % pcap)
                cmd += " --node " + str(pcap)
                doit = True
            pcap = job.Resource_List['pcap_accelerator']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: pcap accel %d" % pcap)
                cmd += " --accel " + str(pcap)
                doit = True
            if doit:
                launch(job.id, cmd)
            else:
                pbs.logjobmsg(job.id, "Cray: no power cap to set")
        return True

    def _deactivate_profile(self, job):
        """Undo power caps, read RUR data and record final energy usage."""
        pbs.logmsg(pbs.LOG_DEBUG, "Cray: deactivate %s" % job.id)
        nids, cnt = nidlist(job)
        if cnt == 0:
            pbs.logjobmsg(job.id, "Cray: no compute nodes for power setting")
            return False
        # remove initial energy file
        try:
            os.unlink(energy_file(job))
        except Exception:
            pass
        # If this is the only job, undo any power cap we set.
        if _running_excl(job):
            cmd = "set_power_cap --nids " + nids
            doit = False
            pcap = job.Resource_List['pcap_node']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: remove pcap node %d" % pcap)
                cmd += " --node 0"
                doit = True
            pcap = job.Resource_List['pcap_accelerator']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: remove pcap accel %d" % pcap)
                cmd += " --accel 0"
                doit = True
            if doit:
                try:
                    launch(job.id, cmd)
                except Exception:
                    pass
            else:
                pbs.logjobmsg(job.id, "Cray: no power cap to remove")
        # Get final energy value from RUR data
        name = rur_file(job)
        try:
            rurfp = open(name, "r")
        except Exception:
            pbs.logjobmsg(job.id, "Cray: no RUR data")
            return False
        # Refuse RUR files not owned by root or writable by others.
        sbuf = os.fstat(rurfp.fileno())
        if (sbuf.st_uid != 0) or (sbuf.st_mode & stat.S_IWOTH):
            pbs.logjobmsg(job.id, "Cray: RUR file permission: %s" % name)
            rurfp.close()
            os.unlink(name)
            return False
        pbs.logjobmsg(job.id, "Cray: reading RUR file: %s" % name)
        energy = 0
        seen = False  # track if energy plugin is seen
        for line in rurfp:
            plugin, _, rest = line.partition(" : ")
            if plugin != "energy":
                # check that the plugin is energy
                continue
            apid, _, metstr = rest.partition(" : ")
            seen = True
            try:
                # parse the metric list
                metlist = eval(metstr, {})
                metrics = dict(metlist[i:i + 2]
                               for i in range(0, len(metlist), 2))
                joules = metrics["energy_used"]
                energy += joules
                pbs.logjobmsg(
                    job.id,
                    'Cray:RUR: {"apid":%s,"apid_energy":%dJ,"job_energy":%dJ}'
                    % (apid, joules, energy))
            except Exception as e:
                # Fixed Python-2-only "except Exception, e" syntax.
                pbs.logjobmsg(job.id,
                              "Cray:RUR: energy_used not found: %s" % str(e))
        rurfp.close()
        os.unlink(name)
        if not seen:
            pbs.logjobmsg(job.id, "Cray:RUR: no energy plugin")
            return False
        old_energy = job.resources_used["energy"]
        new_energy = float(energy) / 3600000.0
        # Keep the larger of the RUR value and the last periodic reading.
        if old_energy is None:
            pbs.logjobmsg(job.id, "Cray:RUR: energy %fkWh" % new_energy)
            job.resources_used["energy"] = new_energy
        elif new_energy > old_energy:
            pbs.logjobmsg(
                job.id,
                "Cray:RUR: energy %fkWh replaces periodic energy %fkWh"
                % (new_energy, old_energy))
            job.resources_used["energy"] = new_energy
        else:
            pbs.logjobmsg(
                job.id,
                "Cray:RUR: energy %fkWh last periodic usage %fkWh"
                % (new_energy, old_energy))
        return True
def _deactivate_profile(self, job):
    """Undo any power caps set at activation, read the job's RUR data and
    record final energy usage in resources_used["energy"]."""
    pbs.logmsg(pbs.LOG_DEBUG, "Cray: deactivate %s" % job.id)
    nids, cnt = nidlist(job)
    if cnt == 0:
        pbs.logjobmsg(job.id, "Cray: no compute nodes for power setting")
        return False
    # remove initial energy file
    try:
        os.unlink(energy_file(job))
    except Exception:
        pass
    # If this is the only job, undo any power cap we set.
    if _running_excl(job):
        cmd = "set_power_cap --nids " + nids
        doit = False
        pcap = job.Resource_List['pcap_node']
        if pcap is not None:
            pbs.logjobmsg(job.id, "Cray: remove pcap node %d" % pcap)
            cmd += " --node 0"
            doit = True
        pcap = job.Resource_List['pcap_accelerator']
        if pcap is not None:
            pbs.logjobmsg(job.id, "Cray: remove pcap accel %d" % pcap)
            cmd += " --accel 0"
            doit = True
        if doit:
            # Best effort: deactivation continues even if capmc fails.
            try:
                launch(job.id, cmd)
            except Exception:
                pass
        else:
            pbs.logjobmsg(job.id, "Cray: no power cap to remove")
    # Get final energy value from RUR data
    name = rur_file(job)
    try:
        rurfp = open(name, "r")
    except Exception:
        pbs.logjobmsg(job.id, "Cray: no RUR data")
        return False
    # Refuse RUR files not owned by root or writable by others.
    sbuf = os.fstat(rurfp.fileno())
    if (sbuf.st_uid != 0) or (sbuf.st_mode & stat.S_IWOTH):
        pbs.logjobmsg(job.id, "Cray: RUR file permission: %s" % name)
        rurfp.close()
        os.unlink(name)
        return False
    pbs.logjobmsg(job.id, "Cray: reading RUR file: %s" % name)
    energy = 0
    seen = False  # track if energy plugin is seen
    for line in rurfp:
        plugin, _, rest = line.partition(" : ")
        if plugin != "energy":
            # check that the plugin is energy
            continue
        apid, _, metstr = rest.partition(" : ")
        seen = True
        try:
            # parse the metric list
            # NOTE(review): eval() on RUR file text — the root-ownership
            # check above is the only guard; confirm RUR files are trusted.
            metlist = eval(metstr, {})
            metrics = dict(metlist[i:i + 2]
                           for i in range(0, len(metlist), 2))
            joules = metrics["energy_used"]
            energy += joules
            pbs.logjobmsg(
                job.id,
                'Cray:RUR: {"apid":%s,"apid_energy":%dJ,"job_energy":%dJ}'
                % (apid, joules, energy))
        except Exception as e:
            pbs.logjobmsg(job.id,
                          "Cray:RUR: energy_used not found: %s" % str(e))
    rurfp.close()
    os.unlink(name)
    if not seen:
        pbs.logjobmsg(job.id, "Cray:RUR: no energy plugin")
        return False
    old_energy = job.resources_used["energy"]
    new_energy = float(energy) / 3600000.0
    # Keep the larger of the RUR value and the last periodic reading.
    if old_energy is None:
        pbs.logjobmsg(job.id, "Cray:RUR: energy %fkWh" % new_energy)
        job.resources_used["energy"] = new_energy
    elif new_energy > old_energy:
        pbs.logjobmsg(
            job.id,
            "Cray:RUR: energy %fkWh replaces periodic energy %fkWh"
            % (new_energy, old_energy))
        job.resources_used["energy"] = new_energy
    else:
        pbs.logjobmsg(
            job.id,
            "Cray:RUR: energy %fkWh last periodic usage %fkWh"
            % (new_energy, old_energy))
    return True
# example add 1 chunk to each chunk (except the first) in the job's # select spec new_select = selspec.increment_chunks(1) e.job.Resource_List["select"] = new_select pbs.logmsg(pbs.LOG_DEBUG, "job's select spec changed to %s" % new_select) elif e.type == pbs.EXECJOB_LAUNCH: if 'PBS_NODEFILE' not in e.env: e.accept() # add a log entry in primary mom logs pbs.logmsg(pbs.LOG_DEBUG, "Executing launch") # print out the vnode_list[] values for vn in e.vnode_list: v = e.vnode_list[vn] pbs.logjobmsg(e.job.id, "launch: found vnode_list[" + v.name + "]") # print out the vnodes in vnode_list_fail[] and offline them for vn in e.vnode_list_fail: v = e.vnode_list_fail[vn] pbs.logjobmsg( e.job.id, "launch: found vnode_list_fail[" + v.name + "]") v.state = pbs.ND_OFFLINE # prune the job's vnodes to satisfy the select spec in resource 'site' # and vnodes in vnode_list_fail[] are not used. if e.job.in_ms_mom(): pj = e.job.release_nodes(keep_select=e.job.Resource_List["site"]) if pj is None: e.job.Hold_Types = pbs.hold_types("s") e.job.rerun()
def _deactivate_profile(self, job):
    """Undo power caps set at activation and total the job's energy
    from RUR data (this variant stops after parsing; it does not update
    resources_used).

    :param job: pbs job being deactivated.
    :returns: False when the job has no compute nodes or no RUR data.
    """
    pbs.logmsg(pbs.LOG_DEBUG, "Cray: deactivate %s" % job.id)
    nids, cnt = nidlist(job)
    if cnt == 0:
        pbs.logjobmsg(job.id, "Cray: no compute nodes for power setting")
        return False
    # remove initial energy file
    try:
        os.unlink(energy_file(job))
    except Exception:
        pass
    # If this is the only job, undo any power cap we set.
    if _running_excl(job):
        cmd = "set_power_cap --nids " + nids
        doit = False
        pcap = job.Resource_List['pcap_node']
        if pcap is not None:
            pbs.logjobmsg(job.id, "Cray: remove pcap node %d" % pcap)
            cmd += " --node 0"
            doit = True
        pcap = job.Resource_List['pcap_accelerator']
        if pcap is not None:
            pbs.logjobmsg(job.id, "Cray: remove pcap accel %d" % pcap)
            cmd += " --accel 0"
            doit = True
        if doit:
            # Best effort: deactivation continues even if capmc fails.
            try:
                launch(job.id, cmd)
            except Exception:
                pass
        else:
            pbs.logjobmsg(job.id, "Cray: no power cap to remove")
    # Get final energy value from RUR data
    name = rur_file(job)
    try:
        rurfp = open(name, "r")
    except Exception:
        pbs.logjobmsg(job.id, "Cray: no RUR data")
        return False
    # Refuse RUR files not owned by root or writable by others.
    sbuf = os.fstat(rurfp.fileno())
    if (sbuf.st_uid != 0) or (sbuf.st_mode & stat.S_IWOTH):
        pbs.logjobmsg(job.id, "Cray: RUR file permission: %s" % name)
        rurfp.close()
        os.unlink(name)
        return False
    pbs.logjobmsg(job.id, "Cray: reading RUR file: %s" % name)
    energy = 0
    seen = False  # track if energy plugin is seen
    for line in rurfp:
        plugin, _, rest = line.partition(" : ")
        if plugin != "energy":
            # check that the plugin is energy
            continue
        apid, _, metstr = rest.partition(" : ")
        seen = True
        try:
            # parse the metric list
            metlist = eval(metstr, {})
            metrics = dict(metlist[i:i + 2]
                           for i in range(0, len(metlist), 2))
            joules = metrics["energy_used"]
            energy += joules
            pbs.logjobmsg(
                job.id,
                'Cray:RUR: {"apid":%s,"apid_energy":%dJ,"job_energy":%dJ}'
                % (apid, joules, energy))
        except Exception as e:
            # Fixed Python-2-only "except Exception, e" syntax.
            pbs.logjobmsg(job.id,
                          "Cray:RUR: energy_used not found: %s" % str(e))
    # Fixed resource leak: this variant never closed the RUR file.
    rurfp.close()
# example add 1 chunk to each chunk (except the first) in the job's # select spec new_select = selspec.increment_chunks(1) e.job.Resource_List["select"] = new_select pbs.logmsg(pbs.LOG_DEBUG, "job's select spec changed to %s" % new_select) elif e.type == pbs.EXECJOB_LAUNCH: if 'PBS_NODEFILE' not in e.env: e.accept() # add a log entry in primary mom logs pbs.logmsg(pbs.LOG_DEBUG, "Executing launch") # print out the vnode_list[] values for vn in e.vnode_list: v = e.vnode_list[vn] pbs.logjobmsg(e.job.id, "launch: found vnode_list[" + v.name + "]") # print out the vnodes in vnode_list_fail[] and offline them for vn in e.vnode_list_fail: v = e.vnode_list_fail[vn] pbs.logjobmsg(e.job.id, "launch: found vnode_list_fail[" + v.name + "]") v.state = pbs.ND_OFFLINE # prune the job's vnodes to satisfy the select spec in resource 'site' # and vnodes in vnode_list_fail[] are not used. if e.job.in_ms_mom(): pj = e.job.release_nodes(keep_select=e.job.Resource_List["site"]) if pj is None: e.job.Hold_Types = pbs.hold_types("s") e.job.rerun()
this_event.vnode_list[me].current_eoe = requested_profile except KeyError, ValueError: pass this_event.accept() if this_event.type == pbs.EXECJOB_END: me = pbs.get_local_nodename() try: this_event.vnode_list[me].current_eoe = None except KeyError, ValueError: pass power = init_power(this_event) try: power.deactivate_profile(this_job) except Exception as e: pbs.logjobmsg(this_job.id, str(e)) power.disconnect() this_event.accept() # No further processing is needed if we are not mother superior. if not this_job.in_ms_mom(): this_event.accept() # Don't do anything if power_provisioning=0 if not vnodes_enabled(this_job): this_event.accept() # Was an EOE requested? requested_profile = str(this_job.schedselect).partition( 'eoe=')[2].partition('+')[0].partition(':')[0] if requested_profile == "":
def jobobit_hook():
    """Log every job/reservation attribute visible to a jobobit event,
    then accept the event (reject only on an unexpected error)."""
    import pbs
    import sys
    try:
        e = pbs.event()
        job = e.job
        pbs.logjobmsg(job.id,
                      'jobobit hook started for test %s' % (e.hook_name, ))
        pbs.logjobmsg(job.id,
                      'jobobit hook, job starttime:%s' % (job.stime, ))
        pbs.logjobmsg(job.id,
                      'jobobit hook, job obittime:%s' % (job.obittime, ))
        pbs.logjobmsg(job.id,
                      'jobobit hook, job_state=%s' % (job.job_state, ))
        pbs.logjobmsg(job.id,
                      'jobobit hook, job_substate=%s' % (job.substate, ))
        # Human-readable names for the numeric state/substate codes.
        state_desc = pbs.REVERSE_JOB_STATE.get(job.job_state, '(None)')
        substate_desc = pbs.REVERSE_JOB_SUBSTATE.get(job.substate, '(None)')
        pbs.logjobmsg(job.id,
                      'jobobit hook, job_state_desc=%s' % (state_desc, ))
        pbs.logjobmsg(job.id,
                      'jobobit hook, job_substate_desc=%s' % (substate_desc, ))
        if hasattr(job, "resv") and job.resv:
            pbs.logjobmsg(job.id,
                          'jobobit hook, resv:%s' % (job.resv.resvid, ))
            pbs.logjobmsg(
                job.id,
                'jobobit hook, resv_nodes:%s' % (job.resv.resv_nodes, ))
            pbs.logjobmsg(
                job.id,
                'jobobit hook, resv_state:%s' % (job.resv.reserve_state, ))
        else:
            pbs.logjobmsg(job.id, 'jobobit hook, resv:(None)')
        pbs.logjobmsg(job.id,
                      'jobobit hook finished for test %s' % (e.hook_name, ))
    except Exception as err:
        # Log the failure location (file and line) at debug level,
        # then reject the event.  (err itself is unused; the traceback
        # from sys.exc_info() carries the details.)
        ty, _, tb = sys.exc_info()
        pbs.logmsg(
            pbs.LOG_DEBUG, str(ty) + str(tb.tb_frame.f_code.co_filename)
            + str(tb.tb_lineno))
        e.reject()
    else:
        e.accept()