def getResourceUsage(self):
    """Return the wall clock usage and limit of the current HTCondor slot.

    ``$_CONDOR_JOB_AD`` corresponds to the path to the .job.ad file, which
    contains info about the job. It is queried with ``condor_status`` for:

    - MaxRuntime: wallclock time allocated to the job - not officially
      supported by HTCondor, only present on some Sites
    - CurrentTime - JobCurrentStartDate: current time minus the start of
      the job execution

    :return: S_OK with a dictionary containing the entries WallClock,
        WallClockLimit and Unit, or S_ERROR if the values cannot be obtained.
    """
    jobDescription = os.environ.get("_CONDOR_JOB_AD")
    if not jobDescription:
        # Without this guard the literal string "None" would be passed to condor_status
        self.log.warn("Cannot open $_CONDOR_JOB_AD: environment variable is not set")
        return S_ERROR("Current batch system is not supported")

    cmd = "condor_status -ads %s -af MaxRuntime CurrentTime-JobCurrentStartDate" % jobDescription
    result = runCommand(cmd)
    if not result["OK"]:
        return S_ERROR("Current batch system is not supported")

    # split() without argument tolerates a trailing newline and repeated blanks
    output = str(result["Value"]).split()
    if len(output) != 2:
        self.log.warn("Cannot open $_CONDOR_JOB_AD: output probably empty")
        return S_ERROR("Current batch system is not supported")

    wallClockLimit, wallClock = output
    if wallClockLimit == "undefined":
        self.log.warn("MaxRuntime attribute is not supported")
        return S_ERROR("Current batch system is not supported")

    try:
        wallClockLimit = float(wallClockLimit)
        wallClock = float(wallClock)
    except ValueError:
        # Any other non-numeric value (e.g. "undefined" for the elapsed time)
        self.log.warn("Cannot parse condor_status output", " ".join(output))
        return S_ERROR("Current batch system is not supported")

    consumed = {"WallClock": wallClock, "WallClockLimit": wallClockLimit, "Unit": "WallClock"}
    self.log.debug("TimeLeft counters complete:", str(consumed))
    return S_OK(consumed)
def getResourceUsage(self):
    """Return the CPU and wall clock usage and limits of the current SLURM job.

    The figures are read from a single ``sacct`` record of ``self.jobID``.

    :return: S_OK with a dictionary containing the entries CPU, CPULimit,
        WallClock, WallClockLimit, and Unit for current slot. The result of
        the failed command is returned as is; S_ERROR is returned when any
        of the entries cannot be determined.
    """
    # sacct displays accounting data for all jobs and job steps
    # -j is the given job, -o the information of interest, -X to get rid of intermediate steps
    # -n to remove the header, -P to make the output parseable (remove tabs, spaces, columns)
    # --delimiter to specify character that splits the fields
    cmd = 'sacct -j %s -o JobID,CPUTimeRAW,AllocCPUS,ElapsedRaw,Timelimit -X -n -P --delimiter=,' % (
        self.jobID)
    result = runCommand(cmd)
    if not result['OK']:
        return result

    cpu = None
    cpuLimit = None
    wallClock = None
    wallClockLimit = None

    output = str(result['Value']).split(',')
    if len(output) == 5:
        _, rawCPU, allocCPUs, rawWallClock, wallClockLimitFormatted = output
        try:
            # Timelimit is in a specific format and have to be converted in seconds
            # TimelimitRaw is in seconds but only available from Slurm 18.08...
            wallClockLimit = self._getFormattedTimeInSeconds(
                wallClockLimitFormatted)
            wallClock = float(rawWallClock)
            if wallClockLimit:
                cpuLimit = wallClockLimit * int(allocCPUs)
            cpu = float(rawCPU)
        except ValueError:
            # A non-numeric field leaves the remaining counters at None, which
            # is reported through the "Could not determine parameter" path below
            self.log.warn('Problem parsing sacct output', ','.join(output))

    # Slurm allocations are based on wallclock time, not cpu time.
    # We precise it in the 'Unit' field
    consumed = {
        'CPU': cpu,
        'CPULimit': cpuLimit,
        'WallClock': wallClock,
        'WallClockLimit': wallClockLimit,
        'Unit': 'WallClock'
    }

    if None in consumed.values():
        missed = [key for key, val in consumed.items() if val is None]
        msg = 'Could not determine parameter'
        self.log.warn('Could not determine parameter', ','.join(missed))
        self.log.debug(
            'This is the stdout from the batch system call\n%s' % (result['Value']))
        return S_ERROR(msg)

    self.log.debug("TimeLeft counters complete:", str(consumed))
    return S_OK(consumed)
def _getCPUScalingFactor():
    """Return the ``cpu`` usage scaling factor reported by qconf for this host.

    ``qconf -se <fully qualified host name>`` is executed and the ``cpu``
    entry of its ``usage_scaling`` line is extracted.

    Example of output for ``qconf -se ccwsge0640`` (the long, multi-line
    ``complex_values`` and ``load_values`` entries are abbreviated)::

        hostname              ccwsge0640.in2p3.fr
        load_scaling          NONE
        complex_values        m_mem_free=131022.000000M,...,os=sl6
        load_values           arch=lx-amd64,cpu=89.400000,...,num_proc=40,...
        processors            40
        user_lists            NONE
        xuser_lists           NONE
        projects              NONE
        xprojects             NONE
        usage_scaling         cpu=11.350000,acct_cpu=11.350000
        report_variables      NONE

    :return: the scaling factor as a float (11.35 in the example above), or
        None if the command fails or no numeric ``cpu`` entry is found.
    """
    host = socket.getfqdn()
    cmd = 'qconf -se %s' % host
    result = runCommand(cmd)
    if not result['OK']:
        return None

    for line in str(result['Value']).split('\n'):
        if 'usage_scaling' not in line:
            continue
        # The look-behind keeps "acct_cpu=" from being taken for "cpu=", and
        # no trailing comma is required so "cpu=" may be the last entry
        match = re.search(r'(?<!\w)cpu=(\d+(?:\.\d+)?)', line)
        if match:
            return float(match.group(1))
    return None
def _getCPUScalingFactor():
    """Return the ``cpu`` usage scaling factor reported by qconf for this host.

    ``qconf -se <fully qualified host name>`` is executed and the ``cpu``
    entry of its ``usage_scaling`` line is extracted, e.g. ``11.35`` for::

        usage_scaling         cpu=11.350000,acct_cpu=11.350000

    :return: the scaling factor as a float, or None if the command fails or
        no numeric ``cpu`` entry is found.
    """
    host = socket.getfqdn()
    cmd = "qconf -se %s" % host
    result = runCommand(cmd)
    if not result["OK"]:
        return None

    for line in str(result["Value"]).split("\n"):
        if "usage_scaling" not in line:
            continue
        # The look-behind keeps "acct_cpu=" from being taken for "cpu=", and
        # no trailing comma is required so "cpu=" may be the last entry
        match = re.search(r"(?<!\w)cpu=(\d+(?:\.\d+)?)", line)
        if match:
            return float(match.group(1))
    return None
def _getCPUScalingFactor():
    """Return the ``cpu`` usage scaling factor reported by qconf for this host.

    ``qconf -se <fully qualified host name>`` is executed and the ``cpu``
    entry of its ``usage_scaling`` line is extracted, e.g. ``11.35`` for::

        usage_scaling         cpu=11.350000,acct_cpu=11.350000

    :return: the scaling factor as a float, or None if the command fails or
        no numeric ``cpu`` entry is found.
    """
    host = socket.getfqdn()
    cmd = 'qconf -se %s' % host
    result = runCommand(cmd)
    if not result['OK']:
        return None

    for line in str(result['Value']).split('\n'):
        if 'usage_scaling' not in line:
            continue
        # The look-behind keeps "acct_cpu=" from being taken for "cpu=", and
        # no trailing comma is required so "cpu=" may be the last entry
        match = re.search(r'(?<!\w)cpu=(\d+(?:\.\d+)?)', line)
        if match:
            return float(match.group(1))
    return None
def getResourceUsage(self):
    """Return the CPU and wall clock usage and limits of the current PBS job.

    The output of ``qstat -f <jobID>`` is scanned for the ``cput``, ``pcput``
    and ``walltime`` entries of ``resources_used`` and ``Resource_List``;
    their colon-separated values are converted to seconds.

    :return: S_OK with a dictionary containing the entries CPU, CPULimit,
        WallClock and WallClockLimit. If at least one limit is found, the
        missing entries are estimated (limits from each other with a 0.8
        factor, consumption from ``self.startTime``). If no limit is found,
        S_ERROR is returned with the partial dictionary stored under "Value".
        The result of the failed qstat call is returned as is.
    """
    cmd = "qstat -f %s" % (self.jobID)
    result = runCommand(cmd)
    if not result["OK"]:
        return result

    def _toSeconds(line, what):
        """Convert the hh:mm:ss third field of a qstat line to seconds, None (with a warning) if malformed."""
        try:
            hours, minutes, seconds = [float(part) for part in line.split()[2].split(":")[:3]]
        except (IndexError, ValueError):
            self.log.warn('Problem parsing "%s" for %s' % (line, what))
            return None
        return (hours * 60 + minutes) * 60 + seconds

    cpu = None
    cpuLimit = None
    wallClock = None
    wallClockLimit = None

    for line in str(result["Value"]).split("\n"):
        # Both cput and pcput report consumed CPU: keep the largest figure
        if re.search(r"resources_used\.p?cput", line):
            newcpu = _toSeconds(line, "CPU consumed")
            if newcpu is not None and (not cpu or newcpu > cpu):
                cpu = newcpu
        if re.search(r"resources_used\.walltime", line):
            newWallClock = _toSeconds(line, "elapsed wall clock time")
            if newWallClock is not None:
                wallClock = newWallClock
        # Both cput and pcput may limit the CPU: keep the most restrictive one
        if re.search(r"Resource_List\.p?cput", line):
            newcpuLimit = _toSeconds(line, "CPU limit")
            if newcpuLimit is not None and (not cpuLimit or newcpuLimit < cpuLimit):
                cpuLimit = newcpuLimit
        if re.search(r"Resource_List\.walltime", line):
            newWallClockLimit = _toSeconds(line, "wall clock limit")
            if newWallClockLimit is not None:
                wallClockLimit = newWallClockLimit

    consumed = {
        "CPU": cpu,
        "CPULimit": cpuLimit,
        "WallClock": wallClock,
        "WallClockLimit": wallClockLimit
    }
    self.log.debug(consumed)

    if None not in consumed.values():
        self.log.debug("TimeLeft counters complete:", str(consumed))
        return S_OK(consumed)

    missed = [key for key, val in consumed.items() if val is None]
    self.log.info("Could not determine parameter", ",".join(missed))
    self.log.info("This is the stdout from the batch system call\n%s" % (result["Value"]))

    if cpuLimit or wallClockLimit:
        # We have got a partial result from PBS, assume that we ran for too short time
        if not cpuLimit:
            consumed["CPULimit"] = wallClockLimit * 0.8
        if not wallClockLimit:
            consumed["WallClockLimit"] = cpuLimit / 0.8
        if not cpu:
            consumed["CPU"] = int(time.time() - self.startTime)
        if not wallClock:
            consumed["WallClock"] = int(time.time() - self.startTime)
        self.log.verbose("TimeLeft counters restored:", str(consumed))
        return S_OK(consumed)

    msg = "Could not determine some parameters"
    self.log.info(
        msg, ":\nThis is the stdout from the batch system call\n%s" % (result["Value"]))
    retVal = S_ERROR(msg)
    retVal["Value"] = consumed
    return retVal
def __init__(self):
    """Standard constructor.

    Reads the LSF environment (LSB_QUEUE, LSF_BINDIR, LSB_HOSTS) and then:

    1. parses ``bqueues -l <queue>`` for the CPU limit, its reference CPU
       (``self.cpuRef``) and the wall clock limit; values are multiplied by
       60 before being stored;
    2. looks for the normalization factor of the reference CPU
       (``self.normRef``) in ``lshosts -w``, then ``lsinfo -m``, then the
       HostModel section of ego.shared / lsf.shared, defaulting to 1.0;
    3. reads the ``cpuf`` of the current host from ``lshosts -w`` into
       ``self.hostNorm``, divides it by ``self.normRef`` and divides the
       limits by the result.

    Attributes that cannot be determined are left at None (or -1 for a limit
    line that could not be parsed).
    """
    super(LSFResourceUsage, self).__init__("LSF", "LSB_JOBID")

    self.queue = os.environ.get("LSB_QUEUE")
    self.bin = os.environ.get("LSF_BINDIR")
    self.host = os.environ.get("LSB_HOSTS")
    self.year = time.strftime("%Y", time.gmtime())
    self.log.verbose(
        "LSB_JOBID=%s, LSB_QUEUE=%s, LSF_BINDIR=%s, LSB_HOSTS=%s" % (self.jobID, self.queue, self.bin, self.host)
    )

    self.cpuLimit = None
    self.cpuRef = None
    self.normRef = None
    self.wallClockLimit = None
    self.hostNorm = None

    cmd = "%s/bqueues -l %s" % (self.bin, self.queue)
    result = runCommand(cmd)
    if not result["OK"]:
        return

    lines = str(result["Value"]).split("\n")
    self.log.debug(
        "From %s" % cmd, "\n".join([line if len(line) <= 128 else line[:128] + " [...]" for line in lines])
    )
    # The value of a limit is on the line following its title; pairing with a
    # padded copy avoids an IndexError when the title is the last line
    for line, nextLine in zip(lines, lines[1:] + [""]):
        if re.search(".*CPULIMIT.*", line):
            info = nextLine.split()
            if len(info) >= 4:
                self.cpuLimit = float(info[0]) * 60
                self.cpuRef = info[3]
            elif len(info) == 2 and info[1] == "min":
                self.cpuLimit = float(info[0]) * 60
                self.cpuRef = None
            else:
                self.log.warn('Problem parsing "%s" for CPU limit' % nextLine)
                self.cpuLimit = -1
        elif re.search(".*RUNLIMIT.*", line):
            info = nextLine.split()
            if len(info) >= 1:
                self.wallClockLimit = float(info[0]) * 60
            else:
                self.log.warn('Problem parsing "%s" for wall clock limit' % nextLine)
                self.wallClockLimit = -1

    modelMaxNorm = 0
    if self.cpuRef:
        # Now try to get the CPU_FACTOR for this reference CPU,
        # it must be either a Model, a Host or the largest Model

        cmd = "%s/lshosts -w %s" % (self.bin, self.cpuRef)
        result = runCommand(cmd)
        if result["OK"]:
            # At CERN this command will return an error since there is no host defined
            # with the name of the reference Host.
            lines = str(result["Value"]).split("\n")
            # Pad so that a single-line output is reported instead of raising IndexError
            header, values = (lines + [""])[:2]
            l1 = header.split()
            l2 = values.split()
            if len(l1) > len(l2):
                self.log.error("Failed lshost command", "%s:\n %s\n %s" % (cmd, header, values))
            else:
                for name, value in zip(l1, l2):
                    if name == "cpuf":
                        try:
                            self.normRef = float(value)
                            self.log.info(
                                "Reference Normalization taken from Host", "%s: %s" % (self.cpuRef, self.normRef)
                            )
                        except ValueError as e:
                            self.log.exception("Exception parsing lshosts output", "", e)

        if not self.normRef:
            # Try if there is a model define with the name of cpuRef
            cmd = "%s/lsinfo -m" % (self.bin)
            result = runCommand(cmd)
            if result["OK"]:
                lines = str(result["Value"]).split("\n")
                for line in lines[1:]:
                    words = line.split()
                    if len(words) > 1:
                        try:
                            norm = float(words[1])
                            if norm > modelMaxNorm:
                                modelMaxNorm = norm
                            if words[0].find(self.cpuRef) > -1:
                                self.normRef = norm
                                self.log.info(
                                    "Reference Normalization taken from Host Model",
                                    "%s: %s" % (self.cpuRef, self.normRef),
                                )
                        except ValueError as e:
                            self.log.exception("Exception parsing lsfinfo output", "", e)

        if not self.normRef:
            # Now parse LSF configuration files
            if not os.path.isfile("./lsf.sh"):
                os.symlink(os.path.join(os.environ["LSF_ENVDIR"], "lsf.conf"), "./lsf.sh")
            # As the variables are not exported, we must force it
            ret = sourceEnv(10, ["./lsf", "&& export LSF_CONFDIR"])
            if ret["OK"]:
                lsfEnv = ret["outputEnv"]
                shared = None
                try:
                    egoShared = os.path.join(lsfEnv["LSF_CONFDIR"], "ego.shared")
                    lsfShared = os.path.join(lsfEnv["LSF_CONFDIR"], "lsf.shared")
                    if os.path.exists(egoShared):
                        shared = egoShared
                    elif os.path.exists(lsfShared):
                        shared = lsfShared
                except KeyError as e:
                    self.log.exception("Exception getting LSF configuration", "", e)
                if shared:
                    with open(shared) as f:
                        hostModelSection = False
                        for line in f:
                            if line.find("Begin HostModel") == 0:
                                hostModelSection = True
                                continue
                            if not hostModelSection:
                                continue
                            if line.find("End HostModel") == 0:
                                break
                            line = line.strip()
                            if line and line.split()[0] == self.cpuRef:
                                try:
                                    self.normRef = float(line.split()[1])
                                    self.log.info(
                                        "Reference Normalization taken from Configuration File",
                                        "(%s) %s: %s" % (shared, self.cpuRef, self.normRef),
                                    )
                                except ValueError as e:
                                    self.log.exception("Exception reading LSF configuration", "", e)
                else:
                    self.log.warn("Could not find LSF configuration")
            else:
                self.log.error("Cannot source the LSF environment", ret["Message"])
    if not self.normRef:
        # If nothing works take this as the unit
        self.normRef = 1.0
        # If nothing worked, take the maximum defined for a Model
        # if modelMaxNorm:
        #  self.normRef = modelMaxNorm
        #  self.log.info('Reference Normalization taken from Max Model:', self.normRef)

    # Now get the Normalization for the current Host
    if self.host:
        cmd = "%s/lshosts -w %s" % (self.bin, self.host)
        result = runCommand(cmd)
        if result["OK"]:
            lines = str(result["Value"]).split("\n")
            # Pad so that a single-line output is reported instead of raising IndexError
            header, values = (lines + [""])[:2]
            l1 = header.split()
            l2 = values.split()
            if len(l1) > len(l2):
                self.log.error("Failed lshost command", "%s:\n %s\n %s" % (cmd, header, values))
            else:
                for name, value in zip(l1, l2):
                    if name != "cpuf":
                        continue
                    try:
                        self.hostNorm = float(value)
                        self.log.info("Host Normalization", "%s: %s" % (self.host, self.hostNorm))
                    except ValueError as e:
                        self.log.exception("Exception parsing lshosts output", l1, e)
                    # Only the first cpuf column is considered
                    break

        if self.hostNorm and self.normRef:
            self.hostNorm /= self.normRef
            self.log.info("CPU power w.r.t. batch unit", self.hostNorm)

        if self.hostNorm:
            # Set the limits in real seconds; a limit that bqueues did not
            # report is still None and must not be divided
            if self.cpuLimit is not None:
                self.cpuLimit /= self.hostNorm
            if self.wallClockLimit is not None:
                self.wallClockLimit /= self.hostNorm
def getResourceUsage(self):
    """Return the CPU and wall clock usage and limits of the current LSF job.

    ``bjobs -W <jobID>`` is expected to print a header line followed by a
    line of values; the CPU_USED (hh:mm:ss) and START_TIME (month/day-time,
    completed with ``self.year``) columns are converted to seconds. The
    limits are the ones stored in ``self.cpuLimit`` and ``self.wallClockLimit``.

    :return: S_OK with a dictionary containing the entries CPU, CPULimit,
        WallClock and WallClockLimit for current slot; S_ERROR if the LSF
        bin directory or host normalization is unknown, if the output cannot
        be parsed or if a limit is missing. The result of the failed bjobs
        call is returned as is.
    """
    if not self.bin:
        return S_ERROR("Could not determine bin directory for LSF")
    if not self.hostNorm:
        return S_ERROR("Could not determine host Norm factor")

    cpu = None
    wallClock = None

    cmd = "%s/bjobs -W %s" % (self.bin, self.jobID)
    result = runCommand(cmd)
    if not result["OK"]:
        return result

    lines = str(result["Value"]).split("\n")
    # Pad so that a single-line output is reported instead of raising IndexError
    header, values = (lines + [""])[:2]
    l1 = header.split()
    l2 = values.split()
    if len(l1) > len(l2):
        self.log.error("Failed bjobs command", "%s:\n %s\n %s" % (cmd, header, values))
        return S_ERROR("Can not parse LSF output")

    for name, value in zip(l1, l2):
        if name == "CPU_USED":
            lCPU = value.split(":")
            try:
                cpu = float(lCPU[0]) * 3600 + float(lCPU[1]) * 60 + float(lCPU[2])
            except (ValueError, IndexError):
                self.log.warn('Problem parsing "%s" for CPU consumed' % value)
        elif name == "START_TIME":
            # The start time carries no year: complete it with self.year
            sStart = "%s %s" % (value, self.year)
            try:
                timeTup = time.strptime(sStart, "%m/%d-%H:%M:%S %Y")
                wallClock = time.mktime(time.localtime()) - time.mktime(timeTup)
            except ValueError:
                self.log.warn('Problem parsing "%s" for elapsed wall clock time' % value)

    if cpu is None or wallClock is None:
        return S_ERROR("Failed to parse LSF output")

    consumed = {
        "CPU": cpu,
        "CPULimit": self.cpuLimit,
        "WallClock": wallClock,
        "WallClockLimit": self.wallClockLimit,
    }
    self.log.debug(consumed)

    if None not in consumed.values():
        return S_OK(consumed)

    missed = [key for key, val in consumed.items() if val is None]
    msg = "Could not determine some parameters"
    self.log.info(
        msg, ": %s\nThis is the stdout from the batch system call\n%s" % (",".join(missed), result["Value"])
    )
    return S_ERROR(msg)
def getResourceUsage(self):
    """Return the CPU and wall clock usage and limits of the current SGE job.

    The output of ``qstat -f -j <jobID>`` is scanned for the ``cpu=`` entry
    of the usage line ([dd:]hh:mm:ss) and for the ``_cpu=`` / ``_rt=``
    entries of the hard resource list. The consumed CPU is divided by the
    factor returned by ``_getCPUScalingFactor()`` when there is one.
    No elapsed wall clock is parsed here, so WallClock is always derived
    from ``self.startTime``.

    :return: S_OK with a dictionary containing the entries CPU, CPULimit,
        WallClock and WallClockLimit when at least one limit is found
        (missing entries are estimated); otherwise S_ERROR with the partial
        dictionary stored under "Value". The result of the failed qstat call
        is returned as is.
    """
    cmd = "qstat -f -j %s" % (self.jobID)
    result = runCommand(cmd)
    if not result["OK"]:
        return result

    cpu = None
    cpuLimit = None
    wallClock = None
    wallClockLimit = None

    lines = str(result["Value"]).split("\n")
    for line in lines:
        if re.search("usage.*cpu.*", line):
            match = re.search(r"cpu=([\d,:]*),", line)
            if match:
                cpuList = match.groups()[0].split(":")
                try:
                    newcpu = 0.0
                    if len(cpuList) == 3:
                        newcpu = float(cpuList[0]) * 3600 + float(cpuList[1]) * 60 + float(cpuList[2])
                    elif len(cpuList) == 4:
                        newcpu = (
                            float(cpuList[0]) * 24 * 3600
                            + float(cpuList[1]) * 3600
                            + float(cpuList[2]) * 60
                            + float(cpuList[3])
                        )
                    if not cpu or newcpu > cpu:
                        cpu = newcpu
                except ValueError:
                    self.log.warn('Problem parsing "%s" for CPU consumed' % line)
        if re.search("hard resource_list.*cpu.*", line):
            # \d+ (not \d*) so that a non-numeric limit is skipped instead of
            # making float("") raise
            match = re.search(r"_cpu=(\d+)", line)
            if match:
                cpuLimit = float(match.groups()[0])
            match = re.search(r"_rt=(\d+)", line)
            if match:
                wallClockLimit = float(match.groups()[0])

    # Warn once for the whole output rather than for each line without limits
    if cpuLimit is None and wallClockLimit is None:
        self.log.warn("No hard limits found")

    # Some SGE batch systems apply CPU scaling factor to the CPU consumption figures
    if cpu:
        factor = _getCPUScalingFactor()
        if factor:
            cpu = cpu / factor

    consumed = {
        "CPU": cpu,
        "CPULimit": cpuLimit,
        "WallClock": wallClock,
        "WallClockLimit": wallClockLimit
    }

    if None in consumed.values():
        missed = [key for key, val in consumed.items() if val is None]
        self.log.warn("Could not determine parameter", ",".join(missed))
        self.log.debug(
            "This is the stdout from the batch system call\n%s" % (result["Value"]))
    else:
        self.log.debug("TimeLeft counters complete:", str(consumed))

    if cpuLimit or wallClockLimit:
        # We have got a partial result from SGE
        if not cpuLimit:
            # Take some margin
            consumed["CPULimit"] = wallClockLimit * 0.8
        if not wallClockLimit:
            consumed["WallClockLimit"] = cpuLimit / 0.8
        if not cpu:
            consumed["CPU"] = time.time() - self.startTime
        if not wallClock:
            consumed["WallClock"] = time.time() - self.startTime
        self.log.debug("TimeLeft counters restored:", str(consumed))
        return S_OK(consumed)

    msg = "Could not determine necessary parameters"
    self.log.info(
        msg, ":\nThis is the stdout from the batch system call\n%s" % (result["Value"]))
    retVal = S_ERROR(msg)
    retVal["Value"] = consumed
    return retVal
def getResourceUsage(self):
    """Return the CPU and wall clock usage and limits of the current PBS job.

    The output of ``qstat -f <jobID>`` is scanned for the ``cput``, ``pcput``
    and ``walltime`` entries of ``resources_used`` and ``Resource_List``;
    their colon-separated values are converted to seconds.

    :return: S_OK with a dictionary containing the entries CPU, CPULimit,
        WallClock and WallClockLimit, all in seconds. If at least one limit
        is found, the missing entries are estimated (limits from each other
        with a 0.8 factor, consumption from ``self.startTime``). If no limit
        is found, S_ERROR is returned with the partial dictionary stored
        under 'Value'. The result of the failed qstat call is returned as is.
    """
    cmd = 'qstat -f %s' % (self.jobID)
    result = runCommand(cmd)
    if not result['OK']:
        return result

    def _toSeconds(line, what):
        """Convert the hh:mm:ss third field of a qstat line to seconds, None (with a warning) if malformed."""
        try:
            hours, minutes, seconds = [float(part) for part in line.split()[2].split(':')[:3]]
        except (IndexError, ValueError):
            self.log.warn('Problem parsing "%s" for %s' % (line, what))
            return None
        return (hours * 60 + minutes) * 60 + seconds

    cpu = None
    cpuLimit = None
    wallClock = None
    wallClockLimit = None

    for line in str(result['Value']).split('\n'):
        # Both cput and pcput report consumed CPU: keep the largest figure
        if re.search(r'resources_used\.p?cput', line):
            newcpu = _toSeconds(line, 'CPU consumed')
            if newcpu is not None and (not cpu or newcpu > cpu):
                cpu = newcpu
        if re.search(r'resources_used\.walltime', line):
            newWallClock = _toSeconds(line, 'elapsed wall clock time')
            if newWallClock is not None:
                wallClock = newWallClock
        # Both cput and pcput may limit the CPU: keep the most restrictive one
        if re.search(r'Resource_List\.p?cput', line):
            newcpuLimit = _toSeconds(line, 'CPU limit')
            if newcpuLimit is not None and (not cpuLimit or newcpuLimit < cpuLimit):
                cpuLimit = newcpuLimit
        if re.search(r'Resource_List\.walltime', line):
            newWallClockLimit = _toSeconds(line, 'wall clock limit')
            if newWallClockLimit is not None:
                wallClockLimit = newWallClockLimit

    consumed = {
        'CPU': cpu,
        'CPULimit': cpuLimit,
        'WallClock': wallClock,
        'WallClockLimit': wallClockLimit
    }
    self.log.debug(consumed)

    if None not in consumed.values():
        self.log.debug("TimeLeft counters complete:", str(consumed))
        return S_OK(consumed)

    missed = [key for key, val in consumed.items() if val is None]
    self.log.info('Could not determine parameter', ','.join(missed))
    self.log.debug(
        'This is the stdout from the batch system call\n%s' % (result['Value']))

    if cpuLimit or wallClockLimit:
        # We have got a partial result from PBS, assume that we ran for too short time
        if not cpuLimit:
            consumed['CPULimit'] = wallClockLimit * 0.8
        if not wallClockLimit:
            consumed['WallClockLimit'] = cpuLimit / 0.8
        if not cpu:
            consumed['CPU'] = int(time.time() - self.startTime)
        if not wallClock:
            consumed['WallClock'] = int(time.time() - self.startTime)
        self.log.debug("TimeLeft counters restored:", str(consumed))
        return S_OK(consumed)

    msg = 'Could not determine some parameters'
    self.log.info(
        msg, ':\nThis is the stdout from the batch system call\n%s' % (result['Value']))
    retVal = S_ERROR(msg)
    retVal['Value'] = consumed
    return retVal
def getResourceUsage(self):
    """Return the CPU and wall clock usage and limits of the current LSF job.

    ``bjobs -W <jobID>`` is expected to print a header line followed by a
    line of values; the CPU_USED (hh:mm:ss) and START_TIME (month/day-time,
    completed with ``self.year``) columns are converted to seconds. The
    limits are the ones stored in ``self.cpuLimit`` and ``self.wallClockLimit``.

    :return: S_OK with a dictionary containing the entries CPU, CPULimit,
        WallClock and WallClockLimit for current slot; S_ERROR if the LSF
        bin directory or host normalization is unknown, if the output cannot
        be parsed or if a limit is missing. The result of the failed bjobs
        call is returned as is.
    """
    if not self.bin:
        return S_ERROR('Could not determine bin directory for LSF')
    if not self.hostNorm:
        return S_ERROR('Could not determine host Norm factor')

    cpu = None
    wallClock = None

    cmd = '%s/bjobs -W %s' % (self.bin, self.jobID)
    result = runCommand(cmd)
    if not result['OK']:
        return result

    lines = str(result['Value']).split('\n')
    # Pad so that a single-line output is reported instead of raising IndexError
    header, values = (lines + [''])[:2]
    l1 = header.split()
    l2 = values.split()
    if len(l1) > len(l2):
        self.log.error("Failed bjobs command", "%s:\n %s\n %s" % (cmd, header, values))
        return S_ERROR('Can not parse LSF output')

    # zip() replaces the index loop over xrange(), which only exists in Python 2
    for name, value in zip(l1, l2):
        if name == 'CPU_USED':
            lCPU = value.split(':')
            try:
                cpu = float(lCPU[0]) * 3600 + float(lCPU[1]) * 60 + float(lCPU[2])
            except (ValueError, IndexError):
                self.log.warn('Problem parsing "%s" for CPU consumed' % value)
        elif name == 'START_TIME':
            # The start time carries no year: complete it with self.year
            sStart = '%s %s' % (value, self.year)
            try:
                timeTup = time.strptime(sStart, '%m/%d-%H:%M:%S %Y')
                wallClock = time.mktime(time.localtime()) - time.mktime(timeTup)
            except ValueError:
                self.log.warn('Problem parsing "%s" for elapsed wall clock time' % value)

    if cpu is None or wallClock is None:
        return S_ERROR('Failed to parse LSF output')

    consumed = {
        'CPU': cpu,
        'CPULimit': self.cpuLimit,
        'WallClock': wallClock,
        'WallClockLimit': self.wallClockLimit
    }
    self.log.debug(consumed)

    if None not in consumed.values():
        return S_OK(consumed)

    missed = [key for key, val in consumed.items() if val is None]
    msg = 'Could not determine some parameters'
    self.log.info(
        msg, ': %s\nThis is the stdout from the batch system call\n%s' % (','.join(missed), result['Value']))
    return S_ERROR(msg)
def getResourceUsage(self):
    """Return the CPU and wall clock usage and limits of the current SGE job.

    The output of ``qstat -f -j <jobID>`` is scanned for the ``cpu=`` entry
    of the usage line ([dd:]hh:mm:ss) and for the ``_cpu=`` / ``_rt=``
    entries of the hard resource list. The consumed CPU is divided by the
    factor returned by ``_getCPUScalingFactor()`` when there is one.
    No elapsed wall clock is parsed here, so WallClock is always derived
    from ``self.startTime``.

    :return: S_OK with a dictionary containing the entries CPU, CPULimit,
        WallClock and WallClockLimit, all in seconds, when at least one
        limit is found (missing entries are estimated); otherwise S_ERROR
        with the partial dictionary stored under 'Value'. The result of the
        failed qstat call is returned as is.
    """
    cmd = 'qstat -f -j %s' % (self.jobID)
    result = runCommand(cmd)
    if not result['OK']:
        return result

    cpu = None
    cpuLimit = None
    wallClock = None
    wallClockLimit = None

    lines = str(result['Value']).split('\n')
    for line in lines:
        if re.search('usage.*cpu.*', line):
            match = re.search(r'cpu=([\d,:]*),', line)
            if match:
                cpuList = match.groups()[0].split(':')
                try:
                    newcpu = 0.
                    if len(cpuList) == 3:
                        newcpu = (float(cpuList[0]) * 60 + float(cpuList[1])) * 60 + float(cpuList[2])
                    elif len(cpuList) == 4:
                        newcpu = ((float(cpuList[0]) * 24 + float(cpuList[1])) * 60
                                  + float(cpuList[2])) * 60 + float(cpuList[3])
                    if not cpu or newcpu > cpu:
                        cpu = newcpu
                except ValueError:
                    self.log.warn('Problem parsing "%s" for CPU consumed' % line)
        elif re.search('hard resource_list.*cpu.*', line):
            # \d+ (not \d*) so that a non-numeric limit is skipped instead of
            # making float("") raise
            match = re.search(r'_cpu=(\d+)', line)
            if match:
                cpuLimit = float(match.groups()[0])
            match = re.search(r'_rt=(\d+)', line)
            if match:
                wallClockLimit = float(match.groups()[0])

    # Some SGE batch systems apply CPU scaling factor to the CPU consumption figures
    if cpu:
        factor = _getCPUScalingFactor()
        if factor:
            cpu = cpu / factor

    consumed = {
        'CPU': cpu,
        'CPULimit': cpuLimit,
        'WallClock': wallClock,
        'WallClockLimit': wallClockLimit
    }

    if None not in consumed.values():
        # This cannot happen as we can't get wallClock from anywhere
        self.log.debug("TimeLeft counters complete:", str(consumed))
        return S_OK(consumed)

    missed = [key for key, val in consumed.items() if val is None]
    self.log.info('Could not determine parameter', ','.join(missed))
    self.log.debug(
        'This is the stdout from the batch system call\n%s' % (result['Value']))

    if cpuLimit or wallClockLimit:
        # We have got a partial result from SGE
        if not cpuLimit:
            # Take some margin
            consumed['CPULimit'] = wallClockLimit * 0.8
        if not wallClockLimit:
            consumed['WallClockLimit'] = cpuLimit / 0.8
        if not cpu:
            consumed['CPU'] = time.time() - self.startTime
        if not wallClock:
            consumed['WallClock'] = time.time() - self.startTime
        self.log.debug("TimeLeft counters restored:", str(consumed))
        return S_OK(consumed)

    msg = 'Could not determine some parameters'
    self.log.info(
        msg, ':\nThis is the stdout from the batch system call\n%s' % (result['Value']))
    retVal = S_ERROR(msg)
    retVal['Value'] = consumed
    return retVal