コード例 #1
0
    def getResourceUsage(self):
        """Return the wall clock consumption of the current slot.

        The job ClassAd file referenced by $_CONDOR_JOB_AD is queried through
        ``condor_status -ads`` for two expressions:

        - MaxRuntime: wallclock time allocated to the job - not officially supported
          by HTCondor, only present on some Sites
        - CurrentTime-JobCurrentStartDate: time elapsed since the start of the job execution

        :return: S_OK with a dictionary containing the entries WallClock, WallClockLimit
                 and Unit, or S_ERROR if the command failed or its output could not be used
        """
        # $_CONDOR_JOB_AD corresponds to the path to the .job.ad file
        jobDescription = os.environ.get("_CONDOR_JOB_AD")
        if not jobDescription:
            # Without the variable the command would be run against the literal "None"
            self.log.warn("$_CONDOR_JOB_AD is not defined")
            return S_ERROR("Current batch system is not supported")

        cmd = "condor_status -ads %s -af MaxRuntime CurrentTime-JobCurrentStartDate" % jobDescription
        result = runCommand(cmd)
        if not result["OK"]:
            return S_ERROR("Current batch system is not supported")

        # split() without separator copes with repeated blanks and the trailing newline
        output = str(result["Value"]).split()
        if len(output) != 2:
            self.log.warn("Cannot open $_CONDOR_JOB_AD: output probably empty")
            return S_ERROR("Current batch system is not supported")

        wallClockLimit, wallClock = output
        if wallClockLimit == "undefined":
            self.log.warn("MaxRuntime attribute is not supported")
            return S_ERROR("Current batch system is not supported")

        try:
            wallClockLimit = float(wallClockLimit)
            wallClock = float(wallClock)
        except ValueError:
            # e.g. the elapsed time expression evaluated to "undefined"
            self.log.warn("Cannot interpret the condor_status output:", str(result["Value"]))
            return S_ERROR("Current batch system is not supported")

        consumed = {"WallClock": wallClock, "WallClockLimit": wallClockLimit, "Unit": "WallClock"}

        self.log.debug("TimeLeft counters complete:", str(consumed))
        return S_OK(consumed)
コード例 #2
0
ファイル: SLURMResourceUsage.py プロジェクト: pmusset/DIRAC
    def getResourceUsage(self):
        """Query sacct for the accounting figures of the current job.

        :return: S_OK with a dictionary containing the entries CPU, CPULimit,
                 WallClock, WallClockLimit and Unit; the runCommand result if the
                 call failed; S_ERROR if one of the entries could not be determined
        """
        # sacct options: -j selects the job, -o the fields of interest, -X gets rid of
        # the intermediate steps, -n removes the header, -P together with --delimiter
        # makes the output parseable
        cmd = 'sacct -j %s -o JobID,CPUTimeRAW,AllocCPUS,ElapsedRaw,Timelimit -X -n -P --delimiter=,' % self.jobID
        result = runCommand(cmd)
        if not result['OK']:
            return result

        # Slurm allocations are based on wallclock time, not cpu time: this is what
        # the 'Unit' entry states
        consumed = {
            'CPU': None,
            'CPULimit': None,
            'WallClock': None,
            'WallClockLimit': None,
            'Unit': 'WallClock'
        }

        fields = str(result['Value']).split(',')
        if len(fields) == 5:
            rawCPU, nbCPUs, rawElapsed, formattedLimit = fields[1:]
            # Timelimit comes in a specific format and has to be converted in seconds
            # (TimelimitRaw is in seconds but only available from Slurm 18.08)
            limit = self._getFormattedTimeInSeconds(formattedLimit)
            consumed['WallClockLimit'] = limit
            consumed['WallClock'] = float(rawElapsed)
            if limit:
                consumed['CPULimit'] = limit * int(nbCPUs)
            consumed['CPU'] = float(rawCPU)

        undetermined = [name for name, value in consumed.items() if value is None]
        if undetermined:
            msg = 'Could not determine parameter'
            self.log.warn(msg, ','.join(undetermined))
            self.log.debug('This is the stdout from the batch system call\n%s' % (result['Value']))
            return S_ERROR(msg)

        self.log.debug("TimeLeft counters complete:", str(consumed))
        return S_OK(consumed)
コード例 #3
0
def _getCPUScalingFactor():
    """Return the ``cpu`` value of the ``usage_scaling`` entry of the local host.

    The entry is looked up in the output of ``qconf -se <fully qualified host name>``,
    which looks like (shortened)::

        hostname              ccwsge0640.in2p3.fr
        load_scaling          NONE
        processors            40
        user_lists            NONE
        usage_scaling         cpu=11.350000,acct_cpu=11.350000
        report_variables      NONE

    :return: the value as a float, or None if the command failed or no value
             could be extracted from its output
    """
    host = socket.getfqdn()
    cmd = 'qconf -se %s' % host
    result = runCommand(cmd)
    if not result['OK']:
        return None

    for line in str(result['Value']).split('\n'):
        if 'usage_scaling' not in line:
            continue
        # Capture a plain decimal number only, so that float() cannot fail, and do
        # not insist on a trailing comma, so that a value closing the line is found too
        match = re.search(r'cpu=(\d+(?:\.\d*)?|\.\d+)', line)
        if match:
            return float(match.group(1))
    return None
コード例 #4
0
def _getCPUScalingFactor():
    """Return the ``cpu`` value of the ``usage_scaling`` entry of the local host.

    The entry is looked up in the output of ``qconf -se <fully qualified host name>``.

    :return: the value as a float, or None if the command failed or no value
             could be extracted from its output
    """
    host = socket.getfqdn()
    cmd = "qconf -se %s" % host
    result = runCommand(cmd)
    if not result["OK"]:
        return None

    for line in str(result["Value"]).split("\n"):
        if "usage_scaling" not in line:
            continue
        # Capture a plain decimal number only, so that float() cannot fail, and do
        # not insist on a trailing comma, so that a value closing the line is found too
        match = re.search(r"cpu=(\d+(?:\.\d*)?|\.\d+)", line)
        if match:
            return float(match.group(1))
    return None
コード例 #5
0
ファイル: SGEResourceUsage.py プロジェクト: pmusset/DIRAC
def _getCPUScalingFactor():
    """Return the ``cpu`` value of the ``usage_scaling`` entry of the local host.

    The entry is looked up in the output of ``qconf -se <fully qualified host name>``.

    :return: the value as a float, or None if the command failed or no value
             could be extracted from its output
    """
    host = socket.getfqdn()
    cmd = 'qconf -se %s' % host
    result = runCommand(cmd)
    if not result['OK']:
        return None

    for line in str(result['Value']).split('\n'):
        if 'usage_scaling' not in line:
            continue
        # Capture a plain decimal number only, so that float() cannot fail, and do
        # not insist on a trailing comma, so that a value closing the line is found too
        match = re.search(r'cpu=(\d+(?:\.\d*)?|\.\d+)', line)
        if match:
            return float(match.group(1))
    return None
コード例 #6
0
ファイル: PBSResourceUsage.py プロジェクト: DIRACGrid/DIRAC
    def getResourceUsage(self):
        """Collect the CPU and wall clock counters of the current job from ``qstat -f``.

        The consumed CPU is the largest of the resources_used cput / pcput values, the
        CPU limit the smallest of the Resource_List cput / pcput values. When at least
        one limit was found, missing entries are estimated: the other limit with a
        0.8 ratio between CPU and wall clock, the consumed values from the time
        elapsed since self.startTime.

        :return: S_OK with a dictionary containing the entries CPU, CPULimit, WallClock
                 and WallClockLimit, in seconds; the runCommand result if the call
                 failed; S_ERROR, carrying the partial dictionary as Value, if no
                 limit could be determined
        """
        cmd = "qstat -f %s" % (self.jobID)
        result = runCommand(cmd)
        if not result["OK"]:
            return result

        def toSeconds(line, what):
            """Convert the HH:MM:SS third word of line to seconds, None (with a warning) if not possible"""
            try:
                hours, minutes, seconds = [float(item) for item in line.split()[2].split(":")[:3]]
            except (IndexError, ValueError):
                # Too few words or a malformed value: report it like any other parsing problem
                self.log.warn('Problem parsing "%s" for %s' % (line, what))
                return None
            return (hours * 60 + minutes) * 60 + seconds

        cpu = None
        cpuLimit = None
        wallClock = None
        wallClockLimit = None

        for line in str(result["Value"]).split("\n"):
            if re.search("resources_used.p?cput", line):
                value = toSeconds(line, "CPU consumed")
                if value is not None and (not cpu or value > cpu):
                    cpu = value
            if re.search("resources_used.walltime", line):
                value = toSeconds(line, "elapsed wall clock time")
                if value is not None:
                    wallClock = value
            if re.search("Resource_List.p?cput", line):
                value = toSeconds(line, "CPU limit")
                if value is not None and (not cpuLimit or value < cpuLimit):
                    cpuLimit = value
            if re.search("Resource_List.walltime", line):
                value = toSeconds(line, "wall clock limit")
                if value is not None:
                    wallClockLimit = value

        consumed = {
            "CPU": cpu,
            "CPULimit": cpuLimit,
            "WallClock": wallClock,
            "WallClockLimit": wallClockLimit
        }
        self.log.debug(consumed)

        if None not in consumed.values():
            self.log.debug("TimeLeft counters complete:", str(consumed))
            return S_OK(consumed)
        missed = [key for key, val in consumed.items() if val is None]
        self.log.info("Could not determine parameter", ",".join(missed))
        self.log.info("This is the stdout from the batch system call\n%s" %
                      (result["Value"]))

        if cpuLimit or wallClockLimit:
            # We have got a partial result from PBS, assume that we ran for too short time
            if not cpuLimit:
                consumed["CPULimit"] = wallClockLimit * 0.8
            if not wallClockLimit:
                consumed["WallClockLimit"] = cpuLimit / 0.8
            if not cpu:
                consumed["CPU"] = int(time.time() - self.startTime)
            if not wallClock:
                consumed["WallClock"] = int(time.time() - self.startTime)
            self.log.verbose("TimeLeft counters restored:", str(consumed))
            return S_OK(consumed)
        msg = "Could not determine some parameters"
        self.log.info(
            msg, ":\nThis is the stdout from the batch system call\n%s" %
            (result["Value"]))
        retVal = S_ERROR(msg)
        retVal["Value"] = consumed
        return retVal
コード例 #7
0
    def __init__(self):
        """Standard constructor

        Reads LSB_QUEUE, LSF_BINDIR and LSB_HOSTS from the environment, then:

        - parses ``bqueues -l <queue>`` for the CPULIMIT and RUNLIMIT values, stored
          multiplied by 60 in self.cpuLimit and self.wallClockLimit (-1 when the
          value line cannot be parsed); nothing more is done if this command fails;
        - looks for the normalization of the reference CPU (self.normRef) in the
          ``cpuf`` column of ``lshosts -w``, then in ``lsinfo -m``, then in the
          HostModel section of ego.shared / lsf.shared, falling back to 1.0;
        - reads the ``cpuf`` of the current host from ``lshosts -w``, divides it by
          self.normRef (self.hostNorm) and divides both limits by the result.
        """
        super(LSFResourceUsage, self).__init__("LSF", "LSB_JOBID")

        self.queue = os.environ.get("LSB_QUEUE")
        self.bin = os.environ.get("LSF_BINDIR")
        self.host = os.environ.get("LSB_HOSTS")
        self.year = time.strftime("%Y", time.gmtime())
        self.log.verbose(
            "LSB_JOBID=%s, LSB_QUEUE=%s, LSF_BINDIR=%s, LSB_HOSTS=%s" % (self.jobID, self.queue, self.bin, self.host)
        )

        self.cpuLimit = None
        self.cpuRef = None
        self.normRef = None
        self.wallClockLimit = None
        self.hostNorm = None

        cmd = "%s/bqueues -l %s" % (self.bin, self.queue)
        result = runCommand(cmd)
        if not result["OK"]:
            return

        lines = str(result["Value"]).split("\n")
        self.log.debug(
            "From %s" % cmd, "\n".join([line if len(line) <= 128 else line[:128] + " [...]" for line in lines])
        )
        for i, line in enumerate(lines):
            # The value is read from the line following the title; a title on the
            # very last line is handled like an unparsable value
            nextLine = lines[i + 1] if i + 1 < len(lines) else ""
            if re.search(".*CPULIMIT.*", line):
                info = nextLine.split()
                if len(info) >= 4:
                    self.cpuLimit = float(info[0]) * 60
                    self.cpuRef = info[3]
                elif len(info) == 2 and info[1] == "min":
                    self.cpuLimit = float(info[0]) * 60
                    self.cpuRef = None
                else:
                    self.log.warn('Problem parsing "%s" for CPU limit' % nextLine)
                    self.cpuLimit = -1
            elif re.search(".*RUNLIMIT.*", line):
                info = nextLine.split()
                if len(info) >= 1:
                    self.wallClockLimit = float(info[0]) * 60
                else:
                    self.log.warn('Problem parsing "%s" for wall clock limit' % nextLine)
                    self.wallClockLimit = -1

        modelMaxNorm = 0
        if self.cpuRef:
            # Now try to get the CPU_FACTOR for this reference CPU,
            # it must be either a Model, a Host or the largest Model

            cmd = "%s/lshosts -w %s" % (self.bin, self.cpuRef)
            result = runCommand(cmd)
            if result["OK"]:
                # At CERN this command will return an error since there is no host defined
                # with the name of the reference Host.
                # Pad with an empty line so that a single-line output is reported, not a crash
                header, values = (str(result["Value"]).split("\n") + [""])[:2]
                l1 = header.split()
                l2 = values.split()
                if len(l1) > len(l2):
                    self.log.error("Failed lshost command", "%s:\n %s\n %s" % (cmd, header, values))
                else:
                    for name, value in zip(l1, l2):
                        if name == "cpuf":
                            try:
                                self.normRef = float(value)
                                self.log.info(
                                    "Reference Normalization taken from Host", "%s: %s" % (self.cpuRef, self.normRef)
                                )
                            except ValueError as e:
                                self.log.exception("Exception parsing lshosts output", "", e)

            if not self.normRef:
                # Try if there is a model define with the name of cpuRef
                cmd = "%s/lsinfo -m" % (self.bin)
                result = runCommand(cmd)
                if result["OK"]:
                    lines = str(result["Value"]).split("\n")
                    for line in lines[1:]:
                        words = line.split()
                        if len(words) > 1:
                            try:
                                norm = float(words[1])
                                if norm > modelMaxNorm:
                                    modelMaxNorm = norm
                                if self.cpuRef in words[0]:
                                    self.normRef = norm
                                    self.log.info(
                                        "Reference Normalization taken from Host Model",
                                        "%s: %s" % (self.cpuRef, self.normRef),
                                    )
                            except ValueError as e:
                                self.log.exception("Exception parsing lsfinfo output", "", e)

            if not self.normRef:
                # Now parse LSF configuration files
                # NOTE(review): os.environ["LSF_ENVDIR"] raises KeyError if the variable is not set
                if not os.path.isfile("./lsf.sh"):
                    os.symlink(os.path.join(os.environ["LSF_ENVDIR"], "lsf.conf"), "./lsf.sh")
                # As the variables are not exported, we must force it
                ret = sourceEnv(10, ["./lsf", "&& export LSF_CONFDIR"])
                if ret["OK"]:
                    lsfEnv = ret["outputEnv"]
                    shared = None
                    try:
                        egoShared = os.path.join(lsfEnv["LSF_CONFDIR"], "ego.shared")
                        lsfShared = os.path.join(lsfEnv["LSF_CONFDIR"], "lsf.shared")
                        if os.path.exists(egoShared):
                            shared = egoShared
                        elif os.path.exists(lsfShared):
                            shared = lsfShared
                    except KeyError as e:
                        self.log.exception("Exception getting LSF configuration", "", e)
                    if shared:
                        with open(shared) as f:
                            hostModelSection = False
                            for line in f:
                                if line.startswith("Begin HostModel"):
                                    hostModelSection = True
                                    continue
                                if not hostModelSection:
                                    continue
                                if line.startswith("End HostModel"):
                                    break
                                line = line.strip()
                                if line and line.split()[0] == self.cpuRef:
                                    try:
                                        self.normRef = float(line.split()[1])
                                        self.log.info(
                                            "Reference Normalization taken from Configuration File",
                                            "(%s) %s: %s" % (shared, self.cpuRef, self.normRef),
                                        )
                                    except ValueError as e:
                                        self.log.exception("Exception reading LSF configuration", "", e)
                    else:
                        self.log.warn("Could not find LSF configuration")
                else:
                    self.log.error("Cannot source the LSF environment", ret["Message"])
        if not self.normRef:
            # If nothing works take this as the unit
            self.normRef = 1.0
            # If nothing worked, take the maximum defined for a Model
            # if modelMaxNorm:
            #  self.normRef = modelMaxNorm
            #  self.log.info('Reference Normalization taken from Max Model:', self.normRef)

        # Now get the Normalization for the current Host
        if self.host:
            cmd = "%s/lshosts -w %s" % (self.bin, self.host)
            result = runCommand(cmd)
            if result["OK"]:
                # Same padding as above for a single-line output
                header, values = (str(result["Value"]).split("\n") + [""])[:2]
                l1 = header.split()
                l2 = values.split()
                if len(l1) > len(l2):
                    self.log.error("Failed lshost command", "%s:\n %s\n %s" % (cmd, header, values))
                else:
                    for name, value in zip(l1, l2):
                        if name == "cpuf":
                            try:
                                self.hostNorm = float(value)
                                self.log.info("Host Normalization", "%s: %s" % (self.host, self.hostNorm))
                            except ValueError as e:
                                self.log.exception("Exception parsing lshosts output", l1, e)
                            # Only the first cpuf column is considered; leaving the loop
                            # here rather than in a finally clause lets other exceptions surface
                            break

            if self.hostNorm and self.normRef:
                self.hostNorm /= self.normRef
                self.log.info("CPU power w.r.t. batch unit", self.hostNorm)

            if self.hostNorm:
                # Set the limits in real seconds; a limit that was not found stays None
                if self.cpuLimit is not None:
                    self.cpuLimit /= self.hostNorm
                if self.wallClockLimit is not None:
                    self.wallClockLimit /= self.hostNorm
コード例 #8
0
    def getResourceUsage(self):
        """Collect the CPU and wall clock counters of the current job from ``bjobs -W``.

        CPU is read from the CPU_USED column, WallClock is the time elapsed since the
        START_TIME column; both limits are the values stored on the instance.

        :return: S_OK with a dictionary containing the entries CPU, CPULimit, WallClock
                 and WallClockLimit; the runCommand result if the call failed; S_ERROR
                 if the bin directory or the host normalization is not known, if the
                 output cannot be parsed or if a limit is missing
        """
        if not self.bin:
            return S_ERROR("Could not determine bin directory for LSF")
        if not self.hostNorm:
            return S_ERROR("Could not determine host Norm factor")

        cmd = "%s/bjobs -W %s" % (self.bin, self.jobID)
        result = runCommand(cmd)
        if not result["OK"]:
            return result
        # Pad with an empty line so that a single-line output is reported, not a crash
        header, values = (str(result["Value"]).split("\n") + [""])[:2]
        l1 = header.split()
        l2 = values.split()
        if len(l1) > len(l2):
            self.log.error("Failed bjobs command", "%s:\n %s\n %s" % (cmd, header, values))
            return S_ERROR("Can not parse LSF output")

        cpu = None
        wallClock = None
        for name, value in zip(l1, l2):
            if name == "CPU_USED":
                try:
                    hours, minutes, seconds = [float(item) for item in value.split(":")[:3]]
                    cpu = hours * 3600 + minutes * 60 + seconds
                except ValueError:
                    # cpu stays None, which is reported right after the loop
                    pass
            elif name == "START_TIME":
                # NOTE(review): the column carries no year, self.year is appended instead;
                # looks wrong for a job started before a change of year - confirm
                sStart = "%s %s" % (value, self.year)
                try:
                    timeTup = time.strptime(sStart, "%m/%d-%H:%M:%S %Y")
                    wallClock = time.mktime(time.localtime()) - time.mktime(timeTup)
                except ValueError:
                    # wallClock stays None, which is reported right after the loop
                    pass

        if cpu is None or wallClock is None:
            return S_ERROR("Failed to parse LSF output")

        consumed = {
            "CPU": cpu,
            "CPULimit": self.cpuLimit,
            "WallClock": wallClock,
            "WallClockLimit": self.wallClockLimit,
        }
        self.log.debug(consumed)

        if None not in consumed.values():
            return S_OK(consumed)

        missed = [key for key, val in consumed.items() if val is None]
        msg = "Could not determine some parameters"
        self.log.info(
            msg, ": %s\nThis is the stdout from the batch system call\n%s" % (",".join(missed), result["Value"])
        )
        return S_ERROR(msg)
コード例 #9
0
    def getResourceUsage(self):
        """Collect the CPU counters and the limits of the current job from ``qstat -f -j``.

        The consumed CPU is the largest ``cpu=`` value of the usage lines, divided by
        the CPU scaling factor when there is one. The limits are the ``_cpu=`` and
        ``_rt=`` values of the hard resource_list line. The elapsed wall clock is not
        read from the output: WallClock is always estimated from self.startTime.
        When one limit is missing it is derived from the other one with a 0.8 ratio.

        :return: S_OK with a dictionary containing the entries CPU, CPULimit, WallClock
                 and WallClockLimit; the runCommand result if the call failed; S_ERROR,
                 carrying the partial dictionary as Value, if no limit could be determined
        """
        cmd = "qstat -f -j %s" % (self.jobID)
        result = runCommand(cmd)
        if not result["OK"]:
            return result

        cpu = None
        cpuLimit = None
        wallClock = None
        wallClockLimit = None
        hardLimitsFound = False

        lines = str(result["Value"]).split("\n")
        for line in lines:
            if re.search("usage.*cpu.*", line):
                match = re.search(r"cpu=([\d:]+)", line)
                if match:
                    # Only a value found on this very line is converted
                    cpuList = match.group(1).split(":")
                    try:
                        newcpu = 0.0
                        if len(cpuList) == 3:
                            newcpu = float(cpuList[0]) * 3600 + float(
                                cpuList[1]) * 60 + float(cpuList[2])
                        elif len(cpuList) == 4:
                            newcpu = (float(cpuList[0]) * 24 * 3600 +
                                      float(cpuList[1]) * 3600 +
                                      float(cpuList[2]) * 60 + float(cpuList[3]))
                        if not cpu or newcpu > cpu:
                            cpu = newcpu
                    except ValueError:
                        self.log.warn('Problem parsing "%s" for CPU consumed' %
                                      line)
                else:
                    self.log.warn('Problem parsing "%s" for CPU consumed' %
                                  line)
            if re.search("hard resource_list.*cpu.*", line):
                hardLimitsFound = True
                # At least one digit is required so that float() cannot fail
                match = re.search(r"_cpu=(\d+)", line)
                if match:
                    cpuLimit = float(match.group(1))
                match = re.search(r"_rt=(\d+)", line)
                if match:
                    wallClockLimit = float(match.group(1))

        # Warn once for the whole output rather than for each line without limits
        if not hardLimitsFound:
            self.log.warn("No hard limits found")

        # Some SGE batch systems apply CPU scaling factor to the CPU consumption figures
        if cpu:
            factor = _getCPUScalingFactor()
            if factor:
                cpu = cpu / factor

        consumed = {
            "CPU": cpu,
            "CPULimit": cpuLimit,
            "WallClock": wallClock,
            "WallClockLimit": wallClockLimit
        }

        if None in consumed.values():
            missed = [key for key, val in consumed.items() if val is None]
            self.log.warn("Could not determine parameter", ",".join(missed))
            self.log.debug(
                "This is the stdout from the batch system call\n%s" %
                (result["Value"]))
        else:
            self.log.debug("TimeLeft counters complete:", str(consumed))

        if cpuLimit or wallClockLimit:
            # We have got a partial result from SGE
            if not cpuLimit:
                # Take some margin
                consumed["CPULimit"] = wallClockLimit * 0.8
            if not wallClockLimit:
                consumed["WallClockLimit"] = cpuLimit / 0.8
            if not cpu:
                consumed["CPU"] = time.time() - self.startTime
            if not wallClock:
                consumed["WallClock"] = time.time() - self.startTime
            self.log.debug("TimeLeft counters restored:", str(consumed))
            return S_OK(consumed)

        msg = "Could not determine necessary parameters"
        self.log.info(
            msg, ":\nThis is the stdout from the batch system call\n%s" %
            (result["Value"]))
        retVal = S_ERROR(msg)
        retVal["Value"] = consumed
        return retVal
コード例 #10
0
    def getResourceUsage(self):
        """Collect the CPU and wall clock counters of the current job from ``qstat -f``.

        The consumed CPU is the largest of the resources_used cput / pcput values, the
        CPU limit the smallest of the Resource_List cput / pcput values. When at least
        one limit was found, missing entries are estimated: the other limit with a
        0.8 ratio between CPU and wall clock, the consumed values from the time
        elapsed since self.startTime.

        :return: S_OK with a dictionary containing the entries CPU, CPULimit, WallClock
                 and WallClockLimit, in seconds; the runCommand result if the call
                 failed; S_ERROR, carrying the partial dictionary as Value, if no
                 limit could be determined
        """
        cmd = 'qstat -f %s' % (self.jobID)
        result = runCommand(cmd)
        if not result['OK']:
            return result

        def toSeconds(line, what):
            """Convert the HH:MM:SS third word of line to seconds, None (with a warning) if not possible"""
            try:
                hours, minutes, seconds = [float(item) for item in line.split()[2].split(':')[:3]]
            except (IndexError, ValueError):
                # Too few words or a malformed value: report it like any other parsing problem
                self.log.warn('Problem parsing "%s" for %s' % (line, what))
                return None
            return (hours * 60 + minutes) * 60 + seconds

        cpu = None
        cpuLimit = None
        wallClock = None
        wallClockLimit = None

        for line in str(result['Value']).split('\n'):
            if re.search('resources_used.p?cput', line):
                value = toSeconds(line, 'CPU consumed')
                if value is not None and (not cpu or value > cpu):
                    cpu = value
            if re.search('resources_used.walltime', line):
                value = toSeconds(line, 'elapsed wall clock time')
                if value is not None:
                    wallClock = value
            if re.search('Resource_List.p?cput', line):
                value = toSeconds(line, 'CPU limit')
                if value is not None and (not cpuLimit or value < cpuLimit):
                    cpuLimit = value
            if re.search('Resource_List.walltime', line):
                value = toSeconds(line, 'wall clock limit')
                if value is not None:
                    wallClockLimit = value

        consumed = {
            'CPU': cpu,
            'CPULimit': cpuLimit,
            'WallClock': wallClock,
            'WallClockLimit': wallClockLimit
        }
        self.log.debug(consumed)

        if None not in consumed.values():
            self.log.debug("TimeLeft counters complete:", str(consumed))
            return S_OK(consumed)

        missed = [key for key, val in consumed.items() if val is None]
        self.log.info('Could not determine parameter', ','.join(missed))
        self.log.debug(
            'This is the stdout from the batch system call\n%s' %
            (result['Value']))

        if cpuLimit or wallClockLimit:
            # We have got a partial result from PBS, assume that we ran for too short time
            if not cpuLimit:
                consumed['CPULimit'] = wallClockLimit * 0.8
            if not wallClockLimit:
                consumed['WallClockLimit'] = cpuLimit / 0.8
            if not cpu:
                consumed['CPU'] = int(time.time() - self.startTime)
            if not wallClock:
                consumed['WallClock'] = int(time.time() - self.startTime)
            self.log.debug("TimeLeft counters restored:", str(consumed))
            return S_OK(consumed)

        msg = 'Could not determine some parameters'
        self.log.info(
            msg, ':\nThis is the stdout from the batch system call\n%s' %
            (result['Value']))
        retVal = S_ERROR(msg)
        retVal['Value'] = consumed
        return retVal
コード例 #11
0
ファイル: LSFResourceUsage.py プロジェクト: sparsh35/DIRAC
    def getResourceUsage(self):
        """Return the CPU and wall-clock usage of the current LSF job.

        Runs ``bjobs -W`` for ``self.jobID`` and reads the ``CPU_USED`` and
        ``START_TIME`` columns, pairing the header line with the first job line
        by column position.

        :return: S_OK with a dictionary holding the keys ``CPU``, ``CPULimit``,
                 ``WallClock`` and ``WallClockLimit``. ``CPU`` and ``WallClock``
                 are computed here in seconds; the two limits are taken as-is
                 from ``self.cpuLimit`` and ``self.wallClockLimit``.
                 S_ERROR when the bin directory or host normalization factor is
                 missing, when the output cannot be parsed, or when any of the
                 four values is None. A failed ``runCommand`` result is returned
                 unchanged.
        """
        if not self.bin:
            return S_ERROR('Could not determine bin directory for LSF')
        if not self.hostNorm:
            return S_ERROR('Could not determine host Norm factor')

        cmd = '%s/bjobs -W %s' % (self.bin, self.jobID)
        result = runCommand(cmd)
        if not result['OK']:
            return result

        lines = str(result['Value']).split('\n')
        # We need both the header line and the job line; bail out cleanly
        # instead of raising IndexError on empty or single-line output.
        if len(lines) < 2:
            self.log.error("Failed bjobs command",
                           "%s:\n %s" % (cmd, result['Value']))
            return S_ERROR('Can not parse LSF output')

        header = lines[0].split()
        values = lines[1].split()
        # Every header column must have a value, otherwise the positional
        # pairing below would be meaningless.
        if len(header) > len(values):
            self.log.error("Failed bjobs command",
                           "%s:\n %s\n %s" % (cmd, lines[0], lines[1]))
            return S_ERROR('Can not parse LSF output')

        cpu = None
        wallClock = None
        for column, value in zip(header, values):
            if column == 'CPU_USED':
                # Expected as hours:minutes:seconds
                fields = value.split(':')
                try:
                    cpu = (float(fields[0]) * 3600 + float(fields[1]) * 60 +
                           float(fields[2]))
                except (ValueError, IndexError):
                    self.log.warn('Problem parsing "%s" for CPU consumed' %
                                  value)
            elif column == 'START_TIME':
                # The reported start time carries no year, so the one stored
                # on the instance is appended before parsing.
                started = '%s %s' % (value, self.year)
                try:
                    timeTup = time.strptime(started, '%m/%d-%H:%M:%S %Y')
                    wallClock = time.mktime(
                        time.localtime()) - time.mktime(timeTup)
                except ValueError:
                    self.log.warn('Problem parsing "%s" for start time' %
                                  started)

        if cpu is None or wallClock is None:
            return S_ERROR('Failed to parse LSF output')

        consumed = {
            'CPU': cpu,
            'CPULimit': self.cpuLimit,
            'WallClock': wallClock,
            'WallClockLimit': self.wallClockLimit
        }
        self.log.debug(consumed)

        if None not in consumed.values():
            return S_OK(consumed)

        missed = [key for key, val in consumed.items() if val is None]
        msg = 'Could not determine some parameters'
        self.log.info(
            msg,
            ': %s\nThis is the stdout from the batch system call\n%s' %
            (','.join(missed), result['Value']))
        return S_ERROR(msg)
コード例 #12
0
    def getResourceUsage(self):
        """Return the CPU and wall-clock usage of the current SGE job.

        Runs ``qstat -f -j`` for ``self.jobID`` and scans the output for the
        consumed CPU time (``usage`` lines) and for the CPU and run-time limits
        (``hard resource_list`` line). The wall-clock consumption is not
        available from this output, so it is always estimated from
        ``self.startTime``.

        :return: S_OK with a dictionary holding the keys ``CPU``, ``CPULimit``,
                 ``WallClock`` and ``WallClockLimit`` when at least one of the
                 two limits could be read; missing entries are then estimated.
                 Otherwise S_ERROR, with the partial dictionary stored under
                 ``'Value'``. A failed ``runCommand`` result is returned
                 unchanged.
        """
        cmd = 'qstat -f -j %s' % (self.jobID)
        result = runCommand(cmd)
        if not result['OK']:
            return result

        cpu = None
        cpuLimit = None
        wallClock = None
        wallClockLimit = None

        lines = str(result['Value']).split('\n')
        for line in lines:
            if re.search('usage.*cpu.*', line):
                match = re.search(r'cpu=([\d,:]*),', line)
                if not match:
                    # No parseable cpu field on this line: skip it rather than
                    # using an unbound or stale value from a previous line.
                    continue
                cpuList = match.group(1).split(':')
                try:
                    newcpu = 0.
                    if len(cpuList) == 3:
                        # hours:minutes:seconds
                        newcpu = (float(cpuList[0]) * 60 +
                                  float(cpuList[1])) * 60 + float(cpuList[2])
                    elif len(cpuList) == 4:
                        # days:hours:minutes:seconds
                        newcpu = (
                            (float(cpuList[0]) * 24 + float(cpuList[1])) * 60 +
                            float(cpuList[2])) * 60 + float(cpuList[3])
                    # Several usage lines may be present: keep the largest
                    if not cpu or newcpu > cpu:
                        cpu = newcpu
                except ValueError:
                    self.log.warn('Problem parsing "%s" for CPU consumed' %
                                  line)
            elif re.search('hard resource_list.*cpu.*', line):
                # Require at least one digit so that a non-numeric limit is
                # treated as missing instead of crashing in float('').
                match = re.search(r'_cpu=(\d+)', line)
                if match:
                    cpuLimit = float(match.group(1))
                match = re.search(r'_rt=(\d+)', line)
                if match:
                    wallClockLimit = float(match.group(1))

        # Some SGE batch systems apply CPU scaling factor to the CPU consumption figures
        if cpu:
            factor = _getCPUScalingFactor()
            if factor:
                cpu = cpu / factor

        consumed = {
            'CPU': cpu,
            'CPULimit': cpuLimit,
            'WallClock': wallClock,
            'WallClockLimit': wallClockLimit
        }

        if None not in consumed.values():
            # This cannot happen as we can't get wallClock from anywhere
            self.log.debug("TimeLeft counters complete:", str(consumed))
            return S_OK(consumed)

        missed = [key for key, val in consumed.items() if val is None]
        self.log.info('Could not determine parameter', ','.join(missed))
        self.log.debug(
            'This is the stdout from the batch system call\n%s' %
            (result['Value']))

        if cpuLimit or wallClockLimit:
            # We have got a partial result from SGE: derive the missing limit
            # from the known one and fall back to elapsed time for consumption
            if not cpuLimit:
                # Take some margin
                consumed['CPULimit'] = wallClockLimit * 0.8
            if not wallClockLimit:
                consumed['WallClockLimit'] = cpuLimit / 0.8
            if not cpu:
                consumed['CPU'] = time.time() - self.startTime
            if not wallClock:
                consumed['WallClock'] = time.time() - self.startTime
            self.log.debug("TimeLeft counters restored:", str(consumed))
            return S_OK(consumed)

        msg = 'Could not determine some parameters'
        self.log.info(
            msg, ':\nThis is the stdout from the batch system call\n%s' %
            (result['Value']))
        retVal = S_ERROR(msg)
        retVal['Value'] = consumed
        return retVal