Beispiel #1
0
 def _get_usage(self, job):
     """
     Return the energy figure reported by the SGI monitor for a job.

     :param job: job object; only its ``id`` is used here
     :returns: the second element of the monitor report when its first
               element is 'total_energy', otherwise None
     """
     pbs.logjobmsg(job.id, "SGI: get_usage")
     report = api.MonitorReport(job.id)
     # Anything other than a 'total_energy' report is treated as no data.
     if report is None or report[0] != 'total_energy':
         return None
     usage = report[1]
     pbs.logjobmsg(job.id, "SGI: energy %fkWh" % usage)
     return usage
Beispiel #2
0
 def _get_usage(self, job):
     """
     Return the energy figure reported by the SGI monitor for a job.

     :param job: job object; only its ``id`` is used here
     :returns: the second element of the monitor report when its first
               element is 'total_energy', otherwise None
     """
     pbs.logmsg(pbs.EVENT_DEBUG3, "SGI: %s get_usage" % (job.id))
     report = api.MonitorReport(job.id)
     # Anything other than a 'total_energy' report is treated as no data.
     if report is None or report[0] != 'total_energy':
         return None
     usage = report[1]
     pbs.logjobmsg(job.id, "SGI: energy %fkWh" % usage)
     return usage
Beispiel #3
0
def vnodes_enabled(job):
    """
    Tell whether power operations are allowed on every vnode of a job.

    Stops at the first vnode whose power_provisioning is false and logs
    a job message naming it.

    :param job: job whose vnodes are checked
    :returns: False if any vnode has power provisioning disabled,
              True otherwise
    """
    for vnode_name in _get_vnode_names(job):
        if _svr_vnode(vnode_name).power_provisioning:
            continue
        message = "power functionality is disabled on vnode %s" % vnode_name
        pbs.logjobmsg(job.id, message)
        return False
    return True
Beispiel #4
0
    def deactivate_profile(self, job=None):
        """
        Deactivate the power profile of a job through the backend PMI.

        When _running_excl(job) is true, current_eoe is reset to None on
        each of the job's vnodes in the current event's vnode_list
        before the backend is called.

        :param job: job to act on; when None the job of the current
                    hook event is used
        :returns: the result of the backend's _deactivate_profile()
        """
        self._check_pmi()

        if job is None:
            job = pbs.event().job
        if _running_excl(job):
            pbs.logjobmsg(job.id, "PMI: reset current_eoe")
            for h in _get_vnode_names(job):
                try:
                    pbs.event().vnode_list[h].current_eoe = None
                except Exception:
                    # Best effort: a vnode that cannot be updated is
                    # skipped.  Only Exception is caught so SystemExit
                    # and KeyboardInterrupt still propagate.
                    pass
        return self.__pmi._deactivate_profile(job)
Beispiel #5
0
    def deactivate_profile(self, job=None):
        """
        Deactivate the power profile of a job through the backend PMI.

        When _running_excl(job) is true, current_eoe is reset to None on
        each of the job's vnodes in the current event's vnode_list
        before the backend is called.

        :param job: job to act on; when None the job of the current
                    hook event is used
        :returns: the result of the backend's _deactivate_profile()
        """
        self._check_pmi()

        if job is None:
            job = pbs.event().job
        if _running_excl(job):
            pbs.logjobmsg(job.id, "PMI: reset current_eoe")
            for h in _get_vnode_names(job):
                try:
                    pbs.event().vnode_list[h].current_eoe = None
                except Exception:
                    # Best effort: a vnode that cannot be updated is
                    # skipped.  Only Exception is caught so SystemExit
                    # and KeyboardInterrupt still propagate.
                    pass
        return self.__pmi._deactivate_profile(job)
Beispiel #6
0
    def _get_usage(self, job):
        """
        Return the energy a job has used since its start value was saved.

        The start value is read from the job's energy file.  The current
        counter total minus that value is divided by 3600000.0 (the
        counters are logged in J, so the result is in kWh).

        :param job: job to report on
        :returns: energy as a float, or None when the energy file cannot
                  be read or no counter data is available
        """
        pbs.logjobmsg(job.id, "Cray: get_usage")
        try:
            # 'with' closes the file even if int() rejects the content.
            with open(energy_file(job), "r") as f:
                start = int(f.read())
        except Exception:
            return None

        e = pbs.event()
        if e.type == pbs.EXECHOST_PERIODIC:
            # This function will be called for each job in turn when
            # running from a periodic hook.  Here we fill in some
            # class-level variables just once and use the information
            # for each job in turn.  Save the result of calling capmc
            # for all running jobs in the variable ninfo.  Keep a
            # dictionary with the job id's as keys holding a set
            # of nid numbers.
            if Pmi.ninfo is None:
                allnids = set()
                for jobid in e.job_list.keys():
                    j = e.job_list[jobid]
                    nidset = jobnids(j)
                    allnids.update(nidset)
                    Pmi.nidarray[jobid] = nidset
                nids, cnt = nidlist(None, allnids)
                Pmi.ninfo = node_energy("all", nids, cnt)
            nidset = Pmi.nidarray[job.id]
            energy = None
            if Pmi.ninfo is not None and "nodes" in Pmi.ninfo:
                energy = 0
                for node in Pmi.ninfo["nodes"]:
                    if node["nid"] in nidset:  # owned by job of interest
                        energy += node["energy_ctr"]
                pbs.logjobmsg(job.id, "Cray: get_usage: energy %dJ" %
                              energy)
        else:
            nids, cnt = nidlist(job)
            energy = job_energy(job, nids, cnt)
        if energy is None:
            return None
        return float(energy - start) / 3600000.0
Beispiel #7
0
    def _get_usage(self, job):
        """
        Return the energy a job has used since its start value was saved.

        The start value is read from the job's energy file.  The current
        counter total minus that value is divided by 3600000.0 (the
        counters are logged in J, so the result is in kWh).

        :param job: job to report on
        :returns: energy as a float, or None when the energy file cannot
                  be read or no counter data is available
        """
        pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: %s get_usage" % (job.id))
        try:
            # 'with' closes the file even if int() rejects the content.
            with open(energy_file(job), "r") as f:
                start = int(f.read())
        except Exception:
            return None

        e = pbs.event()
        if e.type == pbs.EXECHOST_PERIODIC:
            # This function will be called for each job in turn when
            # running from a periodic hook.  Here we fill in some
            # class-level variables just once and use the information
            # for each job in turn.  Save the result of calling capmc
            # for all running jobs in the variable ninfo.  Keep a
            # dictionary with the job id's as keys holding a set
            # of nid numbers.
            if Pmi.ninfo is None:
                allnids = set()
                for jobid in e.job_list.keys():
                    j = e.job_list[jobid]
                    nidset = jobnids(j)
                    allnids.update(nidset)
                    Pmi.nidarray[jobid] = nidset
                nids, cnt = nidlist(None, allnids)
                Pmi.ninfo = node_energy("all", nids, cnt)
            nidset = Pmi.nidarray[job.id]
            energy = None
            if Pmi.ninfo is not None and "nodes" in Pmi.ninfo:
                energy = 0
                for node in Pmi.ninfo["nodes"]:
                    if node["nid"] in nidset:  # owned by job of interest
                        energy += node["energy_ctr"]
                pbs.logjobmsg(job.id, "Cray: get_usage: energy %dJ" %
                              energy)
        else:
            nids, cnt = nidlist(job)
            energy = job_energy(job, nids, cnt)
        if energy is None:
            return None
        return float(energy - start) / 3600000.0
Beispiel #8
0
    def rejectjob(reason, action=DEFAULT_ACTION):
        """
        Log a job rejection, apply the requested action and reject the event.

        :param reason: text explaining why the job is rejected
        :param action: RERUN requeues the job, DELETE deletes it; any
                       other value only rejects
        """

        # Arguments to pbs.event().reject() do nothing in execjob events. Log a
        # warning instead, update the job comment, then reject the job.
        if action == RERUN:
            job.rerun()
            reason = 'Requeued - %s' % reason
        elif action == DELETE:
            job.delete()
            reason = 'Deleted - %s' % reason
        else:
            reason = 'Rejected - %s' % reason

        job.comment = '%s: %s' % (hook_name, reason)
        pbs.logmsg(pbs.LOG_WARNING, ';'.join([hook_name, job.id, reason]))
        pbs.logjobmsg(job.id, reason)  # Add a message that can be tracejob'd
        if VERBOSE_USER_OUTPUT:
            # Call form prints the same text on Python 2 and is valid on
            # Python 3, unlike the print statement.
            print(reason)
        pbs_event.reject()
Beispiel #9
0
    def rejectjob(reason, action=DEFAULT_ACTION):
        """
        Log a job rejection, apply the requested action and reject the event.

        :param reason: text explaining why the job is rejected
        :param action: RERUN requeues the job, DELETE deletes it; any
                       other value only rejects
        """

        # Arguments to pbs.event().reject() do nothing in execjob events. Log a
        # warning instead, update the job comment, then reject the job.
        if action == RERUN:
            job.rerun()
            reason = 'Requeued - %s' % reason
        elif action == DELETE:
            job.delete()
            reason = 'Deleted - %s' % reason
        else:
            reason = 'Rejected - %s' % reason

        job.comment = '%s: %s' % (hook_name, reason)
        pbs.logmsg(pbs.LOG_WARNING, ';'.join([hook_name, job.id, reason]))
        pbs.logjobmsg(job.id, reason)  # Add a message that can be tracejob'd
        if VERBOSE_USER_OUTPUT:
            # Call form prints the same text on Python 2 and is valid on
            # Python 3, unlike the print statement.
            print(reason)
        pbs_event.reject()
Beispiel #10
0
def launch(jid, args):
    """
    Run the capmc command and return its decoded JSON output.

    :param jid: job id used for job log messages
    :type jid: str
    :param args: arguments appended to the capmc command line
    :type args: str
    :returns: the object decoded from capmc's stdout, or None when the
              output is not valid JSON
    :raises BackendError: on a non-zero exit status, or when the decoded
                          output reports an error
    """
    import json

    # full path to capmc given by Cray; fall back to a PATH lookup
    capmc = os.path.join(os.path.sep, 'opt', 'cray',
                         'capmc', 'default', 'bin', 'capmc')
    if not os.path.exists(capmc):
        capmc = "capmc"
    cmd = " ".join((capmc, args))

    pbs.logjobmsg(jid, "launch: " + cmd)
    proc = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
    stdout_data, stderr_data = proc.communicate()
    status = proc.returncode

    failure = ""
    if status == 0:
        pbs.logjobmsg(jid, "launch: finished")
    else:
        failure = "%s: exit %d" % (cmd, status)

    try:
        result = json.loads(stdout_data)
    except Exception:
        result = None

    # Only the first line of stderr is reported.
    try:
        first_err = stderr_data.splitlines()[0]
    except Exception:
        first_err = ""

    if result is not None:
        code = result["e"]
        text = result["err_msg"]
        succeeded = code == 0 and (len(text) == 0 or text == "Success")
        if not succeeded:
            # The error reported in the output replaces any exit-status
            # failure text.
            failure = "output: e=%d err_msg='%s'" % (code, text)

    if first_err:
        pbs.logjobmsg(jid, "stderr: %s" % first_err.strip())

    if failure:
        pbs.logjobmsg(jid, failure)
        raise BackendError(failure)
    return result
Beispiel #11
0
def job_energy(job, nids, cnt):
    """
    Sum the energy counters of a job's nodes as reported by capmc.

    :param job: pbs job; its ``id`` is used for the query and logging
    :param nids: nid list
    :type nids: str
    :param cnt: node count
    :type cnt: int
    :returns: total of the "energy_ctr" values of all returned nodes,
              or None when no node data is available
    """
    reply = node_energy(job.id, nids, cnt)
    if reply is None or "nodes" not in reply:
        return None

    total = 0
    for entry in reply["nodes"]:
        total = total + entry["energy_ctr"]
    pbs.logjobmsg(job.id, "energy usage %dJ" % total)
    return total
Beispiel #12
0
def launch(jid, args):
    """
    Run the capmc command and return its decoded JSON output.

    :param jid: job id used for job log messages
    :type jid: str
    :param args: arguments appended to the capmc command line
    :type args: str
    :returns: the object decoded from capmc's stdout, or None when the
              output is not valid JSON
    :raises BackendError: on a non-zero exit status, or when the decoded
                          output reports an error
    """
    import json

    # full path to capmc given by Cray; fall back to a PATH lookup
    capmc = os.path.join(os.path.sep, 'opt', 'cray',
                         'capmc', 'default', 'bin', 'capmc')
    if not os.path.exists(capmc):
        capmc = "capmc"
    cmd = " ".join((capmc, args))

    pbs.logjobmsg(jid, "launch: " + cmd)
    proc = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
    stdout_data, stderr_data = proc.communicate()
    status = proc.returncode

    failure = ""
    if status == 0:
        pbs.logjobmsg(jid, "launch: finished")
    else:
        failure = "%s: exit %d" % (cmd, status)

    try:
        result = json.loads(stdout_data)
    except Exception:
        result = None

    # Only the first line of stderr is reported.
    try:
        first_err = stderr_data.splitlines()[0]
    except Exception:
        first_err = ""

    if result is not None:
        code = result["e"]
        text = result["err_msg"]
        succeeded = code == 0 and (len(text) == 0 or text == "Success")
        if not succeeded:
            # The error reported in the output replaces any exit-status
            # failure text.
            failure = "output: e=%d err_msg='%s'" % (code, text)

    if first_err:
        pbs.logjobmsg(jid, "stderr: %s" % first_err.strip())

    if failure:
        pbs.logjobmsg(jid, failure)
        raise BackendError(failure)
    return result
Beispiel #13
0
def job_energy(job, nids, cnt):
    """
    Sum the energy counters of a job's nodes as reported by capmc.

    :param job: pbs job; its ``id`` is used for the query and logging
    :param nids: nid list
    :type nids: str
    :param cnt: node count
    :type cnt: int
    :returns: total of the "energy_ctr" values of all returned nodes,
              or None when no node data is available
    """
    reply = node_energy(job.id, nids, cnt)
    if reply is None or "nodes" not in reply:
        return None

    total = 0
    for entry in reply["nodes"]:
        total = total + entry["energy_ctr"]
    pbs.logjobmsg(job.id, "energy usage %dJ" % total)
    return total
Beispiel #14
0
def node_energy(jid, nids, cnt):
    """
    Query capmc get_node_energy_counter for a list of nids.

    The query is accepted only when the "nid_count" in the reply equals
    cnt.  A reply that does not match is logged and the query is issued
    one more time before giving up.

    :param jid: job id.
    :type jid: str
    :param nids: nid list
    :type nids: str
    :param cnt: node count
    :type cnt: int
    :returns: the capmc reply when its node count matches cnt,
              None when cnt is 0 or both queries fail the check.
    """
    if cnt == 0:
        return None

    cmd = "get_node_energy_counter --nids %s" % nids
    count_key = "nid_count"
    # One log format per attempt, in the order the attempts are made.
    mismatch_formats = (
        "node count %s, should be %d",
        "second query failed, node count %s, should be %d",
    )
    for mismatch_format in mismatch_formats:
        reply = launch(jid, cmd)
        received = "<notset>"
        if reply is not None and count_key in reply:
            received = reply[count_key]
            if received == cnt:
                return reply
        pbs.logjobmsg(jid, mismatch_format % (str(received), cnt))
    return None
Beispiel #15
0
def node_energy(jid, nids, cnt):
    """
    Query capmc get_node_energy_counter for a list of nids.

    The query is accepted only when the "nid_count" in the reply equals
    cnt.  A reply that does not match is logged and the query is issued
    one more time before giving up.

    :param jid: job id.
    :type jid: str
    :param nids: nid list
    :type nids: str
    :param cnt: node count
    :type cnt: int
    :returns: the capmc reply when its node count matches cnt,
              None when cnt is 0 or both queries fail the check.
    """
    if cnt == 0:
        return None

    cmd = "get_node_energy_counter --nids %s" % nids
    count_key = "nid_count"
    # One log format per attempt, in the order the attempts are made.
    mismatch_formats = (
        "node count %s, should be %d",
        "second query failed, node count %s, should be %d",
    )
    for mismatch_format in mismatch_formats:
        reply = launch(jid, cmd)
        received = "<notset>"
        if reply is not None and count_key in reply:
            received = reply[count_key]
            if received == cnt:
                return reply
        pbs.logjobmsg(jid, mismatch_format % (str(received), cnt))
    return None
Beispiel #16
0
    def _activate_profile(self, profile_name, job):
        """
        Record a job's starting energy counter and apply its power caps.

        The current energy counter total is written to the job's energy
        file.  When _running_excl(job) is true, a capmc set_power_cap
        command is built from the job's pcap_node and pcap_accelerator
        resources and launched if either of them is set.

        :param profile_name: profile name; only used in the log message
        :param job: job being started
        :returns: False when the job has no nids, True otherwise
        """
        pbs.logmsg(pbs.LOG_DEBUG,
                   "Cray: %s activate '%s'" % (job.id, str(profile_name)))

        nids, cnt = nidlist(job)
        if cnt == 0:
            pbs.logjobmsg(job.id, "Cray: no compute nodes for power setting")
            return False

        energy = job_energy(job, nids, cnt)
        if energy is not None:
            # 'with' closes the file even if the write fails.
            with open(energy_file(job), "w") as f:
                f.write(str(energy))

        # If this is the only job, set nodes to capped power.
        if _running_excl(job):
            cmd = "set_power_cap --nids " + nids
            doit = False

            pcap = job.Resource_List['pcap_node']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: pcap node %d" % pcap)
                cmd += " --node " + str(pcap)
                doit = True
            pcap = job.Resource_List['pcap_accelerator']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: pcap accel %d" % pcap)
                cmd += " --accel " + str(pcap)
                doit = True

            if doit:
                launch(job.id, cmd)
            else:
                pbs.logjobmsg(job.id, "Cray: no power cap to set")

        return True
Beispiel #17
0
    def _activate_profile(self, profile_name, job):
        """
        Record a job's starting energy counter and apply its power caps.

        The current energy counter total is written to the job's energy
        file.  When _running_excl(job) is true, a capmc set_power_cap
        command is built from the job's pcap_node and pcap_accelerator
        resources and launched if either of them is set.

        :param profile_name: profile name; only used in the log message
        :param job: job being started
        :returns: False when the job has no nids, True otherwise
        """
        pbs.logmsg(pbs.LOG_DEBUG, "Cray: %s activate '%s'" %
                   (job.id, str(profile_name)))

        nids, cnt = nidlist(job)
        if cnt == 0:
            pbs.logjobmsg(job.id, "Cray: no compute nodes for power setting")
            return False

        energy = job_energy(job, nids, cnt)
        if energy is not None:
            # 'with' closes the file even if the write fails.
            with open(energy_file(job), "w") as f:
                f.write(str(energy))

        # If this is the only job, set nodes to capped power.
        if _running_excl(job):
            cmd = "set_power_cap --nids " + nids
            doit = False

            pcap = job.Resource_List['pcap_node']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: pcap node %d" % pcap)
                cmd += " --node " + str(pcap)
                doit = True
            pcap = job.Resource_List['pcap_accelerator']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: pcap accel %d" % pcap)
                cmd += " --accel " + str(pcap)
                doit = True

            if doit:
                launch(job.id, cmd)
            else:
                pbs.logjobmsg(job.id, "Cray: no power cap to set")

        return True
Beispiel #18
0
class Pmi:
    """
    Cray power management backend driven through the capmc command.

    Energy use is measured as the difference between the node energy
    counters at job start (saved in the job's energy file) and a later
    reading; a final value is taken from the job's RUR file when the
    profile is deactivated.
    """

    # Result of the capmc energy query covering the nids of all jobs;
    # filled on the first _get_usage() call made from a periodic event.
    ninfo = None
    # Maps job id -> set of nid numbers; filled together with ninfo.
    nidarray = dict()

    def __init__(self, pyhome=None):
        """Log that the backend is initialized; pyhome is not used."""
        pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: init")

    def _connect(self, endpoint=None, port=None, job=None):
        """Log the connect request; no connection is made here."""
        if job is None:
            pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: connect")
        else:
            pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: %s connect" % (job.id))
        return

    def _disconnect(self, job=None):
        """Log the disconnect request; nothing else is done here."""
        if job is None:
            pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: disconnect")
        else:
            pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: %s disconnect" % (job.id))
        return

    def _get_usage(self, job):
        """
        Return the energy a job has used since its start value was saved.

        The start value is read from the job's energy file.  The current
        counter total minus that value is divided by 3600000.0 (the
        counters are logged in J, so the result is in kWh).

        :param job: job to report on
        :returns: energy as a float, or None when the energy file cannot
                  be read or no counter data is available
        """
        pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: %s get_usage" % (job.id))
        try:
            # 'with' closes the file even if int() rejects the content.
            with open(energy_file(job), "r") as f:
                start = int(f.read())
        except Exception:
            return None

        e = pbs.event()
        if e.type == pbs.EXECHOST_PERIODIC:
            # This function will be called for each job in turn when
            # running from a periodic hook.  Here we fill in some
            # class-level variables just once and use the information
            # for each job in turn.  Save the result of calling capmc
            # for all running jobs in the variable ninfo.  Keep a
            # dictionary with the job id's as keys holding a set
            # of nid numbers.
            if Pmi.ninfo is None:
                allnids = set()
                for jobid in e.job_list.keys():
                    j = e.job_list[jobid]
                    nidset = jobnids(j)
                    allnids.update(nidset)
                    Pmi.nidarray[jobid] = nidset
                nids, cnt = nidlist(None, allnids)
                Pmi.ninfo = node_energy("all", nids, cnt)
            nidset = Pmi.nidarray[job.id]
            energy = None
            if Pmi.ninfo is not None and "nodes" in Pmi.ninfo:
                energy = 0
                for node in Pmi.ninfo["nodes"]:
                    if node["nid"] in nidset:  # owned by job of interest
                        energy += node["energy_ctr"]
                pbs.logjobmsg(job.id, "Cray: get_usage: energy %dJ" % energy)
        else:
            nids, cnt = nidlist(job)
            energy = job_energy(job, nids, cnt)
        if energy is None:
            return None
        return float(energy - start) / 3600000.0

    def _query(self, query_type):
        """Log the query request and return None; query_type is not used."""
        pbs.logmsg(pbs.LOG_DEBUG, "Cray: query")
        return None

    def _activate_profile(self, profile_name, job):
        """
        Record a job's starting energy counter and apply its power caps.

        The current energy counter total is written to the job's energy
        file.  When _running_excl(job) is true, a capmc set_power_cap
        command is built from the job's pcap_node and pcap_accelerator
        resources and launched if either of them is set.

        :param profile_name: profile name; only used in the log message
        :param job: job being started
        :returns: False when the job has no nids, True otherwise
        """
        pbs.logmsg(pbs.LOG_DEBUG,
                   "Cray: %s activate '%s'" % (job.id, str(profile_name)))

        nids, cnt = nidlist(job)
        if cnt == 0:
            pbs.logjobmsg(job.id, "Cray: no compute nodes for power setting")
            return False

        energy = job_energy(job, nids, cnt)
        if energy is not None:
            # 'with' closes the file even if the write fails.
            with open(energy_file(job), "w") as f:
                f.write(str(energy))

        # If this is the only job, set nodes to capped power.
        if _running_excl(job):
            cmd = "set_power_cap --nids " + nids
            doit = False

            pcap = job.Resource_List['pcap_node']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: pcap node %d" % pcap)
                cmd += " --node " + str(pcap)
                doit = True
            pcap = job.Resource_List['pcap_accelerator']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: pcap accel %d" % pcap)
                cmd += " --accel " + str(pcap)
                doit = True

            if doit:
                launch(job.id, cmd)
            else:
                pbs.logjobmsg(job.id, "Cray: no power cap to set")

        return True

    def _deactivate_profile(self, job):
        """
        Undo a job's power caps and record its final energy from RUR data.

        Removes the job's energy file, resets any power cap when
        _running_excl(job) is true, then sums the "energy_used" values
        of the energy plugin lines in the job's RUR file.  The RUR file
        is deleted after it has been read or rejected.  The job's
        resources_used["energy"] is set when it was unset or smaller
        than the RUR value.

        :param job: job that is ending
        :returns: True when RUR energy data was processed; False when
                  the job has no nids, the RUR file is missing or has
                  unacceptable ownership/permissions, or it holds no
                  energy plugin line
        """
        pbs.logmsg(pbs.LOG_DEBUG, "Cray: deactivate %s" % job.id)
        nids, cnt = nidlist(job)
        if cnt == 0:
            pbs.logjobmsg(job.id, "Cray: no compute nodes for power setting")
            return False

        # remove initial energy file
        try:
            os.unlink(energy_file(job))
        except Exception:
            pass

        # If this is the only job, undo any power cap we set.
        if _running_excl(job):
            cmd = "set_power_cap --nids " + nids
            doit = False

            pcap = job.Resource_List['pcap_node']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: remove pcap node %d" % pcap)
                cmd += " --node 0"
                doit = True
            pcap = job.Resource_List['pcap_accelerator']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: remove pcap accel %d" % pcap)
                cmd += " --accel 0"
                doit = True

            if doit:
                # Failure to remove the cap must not stop energy accounting.
                try:
                    launch(job.id, cmd)
                except Exception:
                    pass
            else:
                pbs.logjobmsg(job.id, "Cray: no power cap to remove")

        # Get final energy value from RUR data
        name = rur_file(job)
        try:
            rurfp = open(name, "r")
        except Exception:
            pbs.logjobmsg(job.id, "Cray: no RUR data")
            return False

        # Only trust a file owned by uid 0 that is not world-writable.
        sbuf = os.fstat(rurfp.fileno())
        if (sbuf.st_uid != 0) or (sbuf.st_mode & stat.S_IWOTH):
            pbs.logjobmsg(job.id, "Cray: RUR file permission: %s" % name)
            rurfp.close()
            os.unlink(name)
            return False

        pbs.logjobmsg(job.id, "Cray: reading RUR file: %s" % name)
        energy = 0
        seen = False  # track if energy plugin is seen
        try:
            for line in rurfp:
                plugin, _, rest = line.partition(" : ")
                if plugin != "energy":  # check that the plugin is energy
                    continue

                apid, _, metstr = rest.partition(" : ")
                seen = True
                try:  # parse the metric list
                    # NOTE(review): eval() executes text taken from the
                    # RUR file.  The ownership/permission check above is
                    # the only guard; consider ast.literal_eval if the
                    # metric list is always a plain literal.
                    metlist = eval(metstr, {})
                    # Flat [key, value, key, value, ...] list -> dict.
                    metrics = dict(metlist[i:i + 2]
                                   for i in range(0, len(metlist), 2))
                    joules = metrics["energy_used"]
                    energy += joules
                    pbs.logjobmsg(
                        job.id,
                        'Cray:RUR: {"apid":%s,"apid_energy":%dJ,"job_energy":%dJ}'
                        % (apid, joules, energy))
                except Exception as exc:
                    pbs.logjobmsg(
                        job.id,
                        "Cray:RUR: energy_used not found: %s" % str(exc))
        finally:
            # Close the file even if reading it raises.
            rurfp.close()
        os.unlink(name)

        if not seen:
            pbs.logjobmsg(job.id, "Cray:RUR: no energy plugin")
            return False

        old_energy = job.resources_used["energy"]
        new_energy = float(energy) / 3600000.0
        if old_energy is None:
            pbs.logjobmsg(job.id, "Cray:RUR: energy %fkWh" % new_energy)
            job.resources_used["energy"] = new_energy
        elif new_energy > old_energy:
            pbs.logjobmsg(
                job.id,
                "Cray:RUR: energy %fkWh replaces periodic energy %fkWh" %
                (new_energy, old_energy))
            job.resources_used["energy"] = new_energy
        else:
            pbs.logjobmsg(
                job.id, "Cray:RUR: energy %fkWh last periodic usage %fkWh" %
                (new_energy, old_energy))
        return True
Beispiel #19
0
    def _deactivate_profile(self, job):
        """
        Undo a job's power caps and record its final energy from RUR data.

        Removes the job's energy file, resets any power cap when
        _running_excl(job) is true, then sums the "energy_used" values
        of the energy plugin lines in the job's RUR file.  The RUR file
        is deleted after it has been read or rejected.  The job's
        resources_used["energy"] is set when it was unset or smaller
        than the RUR value.

        :param job: job that is ending
        :returns: True when RUR energy data was processed; False when
                  the job has no nids, the RUR file is missing or has
                  unacceptable ownership/permissions, or it holds no
                  energy plugin line
        """
        pbs.logmsg(pbs.LOG_DEBUG, "Cray: deactivate %s" % job.id)
        nids, cnt = nidlist(job)
        if cnt == 0:
            pbs.logjobmsg(job.id, "Cray: no compute nodes for power setting")
            return False

        # remove initial energy file
        try:
            os.unlink(energy_file(job))
        except Exception:
            pass

        # If this is the only job, undo any power cap we set.
        if _running_excl(job):
            cmd = "set_power_cap --nids " + nids
            doit = False

            pcap = job.Resource_List['pcap_node']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: remove pcap node %d" % pcap)
                cmd += " --node 0"
                doit = True
            pcap = job.Resource_List['pcap_accelerator']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: remove pcap accel %d" % pcap)
                cmd += " --accel 0"
                doit = True

            if doit:
                # Failure to remove the cap must not stop energy accounting.
                try:
                    launch(job.id, cmd)
                except Exception:
                    pass
            else:
                pbs.logjobmsg(job.id, "Cray: no power cap to remove")

        # Get final energy value from RUR data
        name = rur_file(job)
        try:
            rurfp = open(name, "r")
        except Exception:
            pbs.logjobmsg(job.id, "Cray: no RUR data")
            return False

        # Only trust a file owned by uid 0 that is not world-writable.
        sbuf = os.fstat(rurfp.fileno())
        if (sbuf.st_uid != 0) or (sbuf.st_mode & stat.S_IWOTH):
            pbs.logjobmsg(job.id, "Cray: RUR file permission: %s" % name)
            rurfp.close()
            os.unlink(name)
            return False

        pbs.logjobmsg(job.id, "Cray: reading RUR file: %s" % name)
        energy = 0
        seen = False  # track if energy plugin is seen
        try:
            for line in rurfp:
                plugin, _, rest = line.partition(" : ")
                if plugin != "energy":  # check that the plugin is energy
                    continue

                apid, _, metstr = rest.partition(" : ")
                seen = True
                try:  # parse the metric list
                    # NOTE(review): eval() executes text taken from the
                    # RUR file.  The ownership/permission check above is
                    # the only guard; consider ast.literal_eval if the
                    # metric list is always a plain literal.
                    metlist = eval(metstr, {})
                    # Flat [key, value, key, value, ...] list -> dict.
                    metrics = dict(metlist[i:i + 2]
                                   for i in range(0, len(metlist), 2))
                    joules = metrics["energy_used"]
                    energy += joules
                    pbs.logjobmsg(
                        job.id,
                        'Cray:RUR: {"apid":%s,"apid_energy":%dJ,"job_energy":%dJ}'
                        % (apid, joules, energy))
                except Exception as e:
                    pbs.logjobmsg(
                        job.id,
                        "Cray:RUR: energy_used not found: %s" % str(e))
        finally:
            # Close the file even if reading it raises.
            rurfp.close()
        os.unlink(name)

        if not seen:
            pbs.logjobmsg(job.id, "Cray:RUR: no energy plugin")
            return False

        old_energy = job.resources_used["energy"]
        new_energy = float(energy) / 3600000.0
        if old_energy is None:
            pbs.logjobmsg(job.id, "Cray:RUR: energy %fkWh" % new_energy)
            job.resources_used["energy"] = new_energy
        elif new_energy > old_energy:
            pbs.logjobmsg(
                job.id,
                "Cray:RUR: energy %fkWh replaces periodic energy %fkWh" %
                (new_energy, old_energy))
            job.resources_used["energy"] = new_energy
        else:
            pbs.logjobmsg(
                job.id, "Cray:RUR: energy %fkWh last periodic usage %fkWh" %
                (new_energy, old_energy))
        return True
    # example add 1 chunk to each chunk (except the first) in the job's
    # select spec
    new_select = selspec.increment_chunks(1)
    e.job.Resource_List["select"] = new_select
    pbs.logmsg(pbs.LOG_DEBUG, "job's select spec changed to %s" % new_select)

elif e.type == pbs.EXECJOB_LAUNCH:
    if 'PBS_NODEFILE' not in e.env:
        e.accept()
    # add a log entry in primary mom logs
    pbs.logmsg(pbs.LOG_DEBUG, "Executing launch")

    # print out the vnode_list[] values
    for vn in e.vnode_list:
        v = e.vnode_list[vn]
        pbs.logjobmsg(e.job.id, "launch: found vnode_list[" + v.name + "]")

    # print out the vnodes in vnode_list_fail[] and offline them
    for vn in e.vnode_list_fail:
        v = e.vnode_list_fail[vn]
        pbs.logjobmsg(
            e.job.id, "launch: found vnode_list_fail[" + v.name + "]")
        v.state = pbs.ND_OFFLINE

    # prune the job's vnodes to satisfy the select spec in resource 'site'
    # and vnodes in vnode_list_fail[] are not used.
    if e.job.in_ms_mom():
        pj = e.job.release_nodes(keep_select=e.job.Resource_List["site"])
        if pj is None:
            e.job.Hold_Types = pbs.hold_types("s")
            e.job.rerun()
Beispiel #21
0
    def _deactivate_profile(self, job):
        """
        Undo a job's power caps and read its energy from RUR data.

        Removes the job's energy file, resets any power cap when
        _running_excl(job) is true, then sums the "energy_used" values
        of the energy plugin lines in the job's RUR file.

        NOTE(review): this copy ends after the parsing loop; it neither
        closes the RUR file nor returns a value on the normal path.
        Confirm whether the remainder of the method was cut off.

        :param job: job that is ending
        :returns: False when the job has no nids or the RUR file is
                  missing or has unacceptable ownership/permissions
        """
        pbs.logmsg(pbs.LOG_DEBUG, "Cray: deactivate %s" % job.id)
        nids, cnt = nidlist(job)
        if cnt == 0:
            pbs.logjobmsg(job.id, "Cray: no compute nodes for power setting")
            return False

        # remove initial energy file
        try:
            os.unlink(energy_file(job))
        except Exception:
            pass

        # If this is the only job, undo any power cap we set.
        if _running_excl(job):
            cmd = "set_power_cap --nids " + nids
            doit = False

            pcap = job.Resource_List['pcap_node']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: remove pcap node %d" % pcap)
                cmd += " --node 0"
                doit = True
            pcap = job.Resource_List['pcap_accelerator']
            if pcap is not None:
                pbs.logjobmsg(job.id, "Cray: remove pcap accel %d" % pcap)
                cmd += " --accel 0"
                doit = True

            if doit:
                # Failure to remove the cap must not stop energy accounting.
                try:
                    launch(job.id, cmd)
                except Exception:
                    pass
            else:
                pbs.logjobmsg(job.id, "Cray: no power cap to remove")

        # Get final energy value from RUR data
        name = rur_file(job)
        try:
            rurfp = open(name, "r")
        except Exception:
            pbs.logjobmsg(job.id, "Cray: no RUR data")
            return False

        # Only trust a file owned by uid 0 that is not world-writable.
        sbuf = os.fstat(rurfp.fileno())
        if (sbuf.st_uid != 0) or (sbuf.st_mode & stat.S_IWOTH):
            pbs.logjobmsg(job.id, "Cray: RUR file permission: %s" % name)
            rurfp.close()
            os.unlink(name)
            return False

        pbs.logjobmsg(job.id, "Cray: reading RUR file: %s" % name)
        energy = 0
        seen = False        # track if energy plugin is seen
        for line in rurfp:
            plugin, _, rest = line.partition(" : ")
            if plugin != "energy":  # check that the plugin is energy
                continue

            apid, _, metstr = rest.partition(" : ")
            seen = True
            try:  # parse the metric list
                # NOTE(review): eval() executes text taken from the RUR
                # file; the ownership/permission check above is the only
                # guard.
                metlist = eval(metstr, {})
                # Flat [key, value, key, value, ...] list -> dict.
                metrics = dict(metlist[i:i + 2] for i in range(0,
                                                               len(metlist), 2))
                joules = metrics["energy_used"]
                energy += joules
                pbs.logjobmsg(job.id,
                              'Cray:RUR: {"apid":%s,"apid_energy":%dJ,"job_energy":%dJ}' %
                              (apid, joules, energy))
            except Exception as e:
                pbs.logjobmsg(job.id,
                              "Cray:RUR: energy_used not found: %s" % str(e))
Beispiel #22
0
    # example add 1 chunk to each chunk (except the first) in the job's
    # select spec
    new_select = selspec.increment_chunks(1)
    e.job.Resource_List["select"] = new_select
    pbs.logmsg(pbs.LOG_DEBUG, "job's select spec changed to %s" % new_select)

elif e.type == pbs.EXECJOB_LAUNCH:
    if 'PBS_NODEFILE' not in e.env:
        e.accept()
    # add a log entry in primary mom logs
    pbs.logmsg(pbs.LOG_DEBUG, "Executing launch")

    # print out the vnode_list[] values
    for vn in e.vnode_list:
        v = e.vnode_list[vn]
        pbs.logjobmsg(e.job.id, "launch: found vnode_list[" + v.name + "]")

    # print out the vnodes in vnode_list_fail[] and offline them
    for vn in e.vnode_list_fail:
        v = e.vnode_list_fail[vn]
        pbs.logjobmsg(e.job.id,
                      "launch: found vnode_list_fail[" + v.name + "]")
        v.state = pbs.ND_OFFLINE

    # prune the job's vnodes to satisfy the select spec in resource 'site'
    # and vnodes in vnode_list_fail[] are not used.
    if e.job.in_ms_mom():
        pj = e.job.release_nodes(keep_select=e.job.Resource_List["site"])
        if pj is None:
            e.job.Hold_Types = pbs.hold_types("s")
            e.job.rerun()
Beispiel #23
0
            this_event.vnode_list[me].current_eoe = requested_profile
        except KeyError, ValueError:
            pass
    this_event.accept()
# At job end: clear current_eoe on the local vnode, deactivate the job's
# power profile through the power backend, then accept the event.
if this_event.type == pbs.EXECJOB_END:
    me = pbs.get_local_nodename()
    try:
        this_event.vnode_list[me].current_eoe = None
    except (KeyError, ValueError):
        # A tuple is required to catch both types; the Python 2 form
        # "except KeyError, ValueError" catches only KeyError and binds
        # it to the name ValueError.
        pass

    power = init_power(this_event)
    try:
        power.deactivate_profile(this_job)
    except Exception as e:
        # A failed deactivation is logged against the job and does not
        # prevent the disconnect and accept below.
        pbs.logjobmsg(this_job.id, str(e))
    power.disconnect()
    this_event.accept()

# No further processing is needed if we are not mother superior.
# NOTE(review): presumably accept() ends hook execution here — confirm.
if not this_job.in_ms_mom():
    this_event.accept()

# Don't do anything if power_provisioning=0
if not vnodes_enabled(this_job):
    this_event.accept()

# Was an EOE requested?
# Take the text that follows the first 'eoe=' in schedselect and cut it
# at the first '+' and then at the first ':'.  The result is "" when
# 'eoe=' does not occur.
requested_profile = str(this_job.schedselect).partition(
    'eoe=')[2].partition('+')[0].partition(':')[0]
if requested_profile == "":
Beispiel #24
0
def jobobit_hook():
    """
    Log details of the job in the current hook event, then accept it.

    Writes the job's start time, obit time, state, substate, their
    descriptions and any reservation details to the job log.  If an
    exception is raised while doing so, the exception type, file name
    and line number are logged and the event is rejected instead.
    """
    import pbs
    import sys

    try:
        e = pbs.event()
        job = e.job
        log = pbs.logjobmsg
        log(job.id, 'jobobit hook started for test %s' % (e.hook_name, ))
        log(job.id, 'jobobit hook, job starttime:%s' % (job.stime, ))
        log(job.id, 'jobobit hook, job obittime:%s' % (job.obittime, ))
        log(job.id, 'jobobit hook, job_state=%s' % (job.job_state, ))
        log(job.id, 'jobobit hook, job_substate=%s' % (job.substate, ))

        # Map the numeric states to their descriptions.
        state_desc = pbs.REVERSE_JOB_STATE.get(job.job_state, '(None)')
        substate_desc = pbs.REVERSE_JOB_SUBSTATE.get(job.substate, '(None)')
        log(job.id, 'jobobit hook, job_state_desc=%s' % (state_desc, ))
        log(job.id, 'jobobit hook, job_substate_desc=%s' % (substate_desc, ))

        if not (hasattr(job, "resv") and job.resv):
            log(job.id, 'jobobit hook, resv:(None)')
        else:
            log(job.id, 'jobobit hook, resv:%s' % (job.resv.resvid, ))
            log(job.id,
                'jobobit hook, resv_nodes:%s' % (job.resv.resv_nodes, ))
            log(job.id,
                'jobobit hook, resv_state:%s' % (job.resv.reserve_state, ))

        log(job.id, 'jobobit hook finished for test %s' % (e.hook_name, ))
    except Exception:
        exc_details = sys.exc_info()
        exc_type = exc_details[0]
        trace = exc_details[2]
        where = "".join([
            str(exc_type),
            str(trace.tb_frame.f_code.co_filename),
            str(trace.tb_lineno),
        ])
        pbs.logmsg(pbs.LOG_DEBUG, where)
        e.reject()
    else:
        e.accept()