def getResourceUsage(self):
    """Return the CPU and wall clock usage and limits of the current slot.

    Runs ``qjob -a -nh -wide <jobID>`` and reads the consumed CPU and the
    CPU limit from the sixth whitespace-separated field of its output. That
    field is either ``used/limit`` or ``used/`` with the limit in the
    following field. The CPU figures are also reported as the wall clock
    figures. Every figure is divided by 5 and by ``self.scaleFactor``.

    :return: S_OK with a dictionary holding the keys CPU, CPULimit,
             WallClock and WallClockLimit; S_ERROR when the job ID or the
             scale factor is missing or when the output cannot be parsed;
             the failed ``runCommand`` result when the command fails.
    """
    if not self.jobID:
        return S_ERROR('Could not determine batch jobID from QSUB_REQNAME env var.')
    if not self.scaleFactor:
        return S_ERROR('CPU scala factor is not defined')

    cmd = 'qjob -a -nh -wide %s' % (self.jobID)
    result = runCommand(cmd)
    if not result['OK']:
        return result

    self.log.verbose(result['Value'])

    cpu = None
    cpuLimit = None
    try:
        cpuItems = result['Value'].split()
        if cpuItems[5][-1] == '/':
            cpu = float(cpuItems[5][:-1])
            cpuLimit = float(cpuItems[6])
        else:
            cpuList = cpuItems[5].split('/')
            cpu = float(cpuList[0])
            cpuLimit = float(cpuList[1])
    except Exception:
        # Best effort: whatever could not be parsed stays None and is
        # reported as a missing parameter below.
        self.log.warn('Problem parsing "%s" for CPU usage' % (result['Value']))

    # BQS has no wallclock limit so will simply return the same as for CPU
    # to the TimeLeft utility
    wallClock = cpu
    wallClockLimit = cpuLimit

    def normalise(value):
        """Scale a raw figure; keep None so that parse failures are detected."""
        if value is None:
            return None
        # Divide the numbers by 5 to bring it to HS06 units from the CC UI
        # units and remove HS06 normalization factor
        return value / 5. / self.scaleFactor

    consumed = {
        'CPU': normalise(cpu),
        'CPULimit': normalise(cpuLimit),
        'WallClock': normalise(wallClock),
        'WallClockLimit': normalise(wallClockLimit),
    }
    self.log.debug(consumed)

    failed = False
    for key, val in consumed.items():
        if val is None:
            failed = True
            self.log.warn('Could not determine %s' % key)

    if not failed:
        return S_OK(consumed)

    msg = ('Could not determine some parameters,'
           ' this is the stdout from the batch system call\n%s' % (result['Value']))
    self.log.info(msg)
    return S_ERROR('Could not determine some parameters')
def getResourceUsage(self):
    """Return the CPU and wall clock usage and limits of the current slot.

    Runs ``qstat -f <jobID>`` and scans its output for the lines mentioning
    ``resources_used.cput``, ``resources_used.walltime``,
    ``Resource_List.cput`` and ``Resource_List.walltime``. The third
    whitespace-separated field of such a line is read as ``hh:mm:ss`` and
    converted to seconds.

    :return: S_OK with a dictionary holding the keys CPU, CPULimit,
             WallClock and WallClockLimit when all four were found;
             otherwise S_ERROR whose 'Value' carries the partial dictionary
             (missing entries are None); the failed ``runCommand`` result
             when the command fails.
    """
    cmd = 'qstat -f %s' % (self.jobID)
    result = runCommand(cmd)
    if not result['OK']:
        return result

    def toSeconds(line, what):
        """Convert the hh:mm:ss value of a qstat line to seconds, or None."""
        info = line.split()
        if len(info) >= 3:
            try:
                # Too few parts or a non-numeric part raises ValueError.
                hours, minutes, seconds = [float(item) for item in info[2].split(':')[:3]]
                return (hours * 60 + minutes) * 60 + seconds
            except ValueError:
                pass
        self.log.warn('Problem parsing "%s" for %s' % (line, what))
        return None

    cpu = None
    cpuLimit = None
    wallClock = None
    wallClockLimit = None

    for line in result['Value'].split('\n'):
        # A value that fails to parse never overwrites an earlier good one.
        if re.search('.*resources_used.cput.*', line):
            value = toSeconds(line, 'CPU consumed')
            if value is not None:
                cpu = value
        if re.search('.*resources_used.walltime.*', line):
            value = toSeconds(line, 'elapsed wall clock time')
            if value is not None:
                wallClock = value
        if re.search('.*Resource_List.cput.*', line):
            value = toSeconds(line, 'CPU limit')
            if value is not None:
                cpuLimit = value
        if re.search('.*Resource_List.walltime.*', line):
            value = toSeconds(line, 'wall clock limit')
            if value is not None:
                wallClockLimit = value

    consumed = {'CPU': cpu,
                'CPULimit': cpuLimit,
                'WallClock': wallClock,
                'WallClockLimit': wallClockLimit}
    self.log.debug(consumed)

    failed = False
    for key, val in consumed.items():
        if val is None:
            failed = True
            self.log.warn('Could not determine %s' % key)

    if not failed:
        return S_OK(consumed)

    self.log.info('Could not determine some parameters, this is the stdout from the batch system call\n%s'
                  % (result['Value']))
    # The partial figures are still handed back to the caller with the error.
    retVal = S_ERROR('Could not determine some parameters')
    retVal['Value'] = consumed
    return retVal
def getResourceUsage(self):
    """Return the CPU and wall clock usage and limits of the current slot.

    Runs ``<bin>/bjobs -W <jobID>`` and treats the first output line as
    column headers and the second as the matching values. ``CPU_USED``
    (``hh:mm:ss``) gives the consumed CPU in seconds. ``START_TIME``
    (``mm/dd-HH:MM:SS``, completed with ``self.year``) gives the wall clock
    consumed as the difference to the current local time. The limits come
    from ``self.cpuLimit`` and ``self.wallClockLimit``.

    :return: S_OK with a dictionary holding the keys CPU, CPULimit,
             WallClock and WallClockLimit; S_ERROR when the LSF bin
             directory or host normalization is unknown, the output cannot
             be parsed or a limit is missing; the failed ``runCommand``
             result when the command fails.
    """
    if not self.bin:
        return S_ERROR('Could not determine bin directory for LSF')
    if not self.hostNorm:
        return S_ERROR('Could not determine host Norm factor')

    cpu = None
    wallClock = None

    cmd = '%s/bjobs -W %s' % (self.bin, self.jobID)
    result = runCommand(cmd)
    if not result['OK']:
        return result

    lines = str(result['Value']).split('\n')
    header = lines[0]
    # A single-line output has no value row; treat it as an empty row so it
    # is reported as unparsable instead of raising IndexError.
    values = lines[1] if len(lines) > 1 else ''
    l1 = header.split()
    l2 = values.split()
    if len(l1) > len(l2):
        self.log.error("Failed bjobs command", "%s:\n %s\n %s" % (cmd, header, values))
        return S_ERROR('Can not parse LSF output')

    # len(l1) <= len(l2) here, so zip pairs every header with its value.
    for name, value in zip(l1, l2):
        if name == 'CPU_USED':
            lCPU = value.split(':')
            try:
                cpu = float(lCPU[0]) * 3600 + float(lCPU[1]) * 60 + float(lCPU[2])
            except (ValueError, IndexError):
                # cpu stays None and is reported just below
                pass
        elif name == 'START_TIME':
            sStart = '%s %s' % (value, self.year)
            try:
                timeTup = time.strptime(sStart, '%m/%d-%H:%M:%S %Y')
                wallClock = time.mktime(time.localtime()) - time.mktime(timeTup)
            except ValueError:
                # wallClock stays None and is reported just below
                pass

    if cpu is None or wallClock is None:
        return S_ERROR('Failed to parse LSF output')

    consumed = {'CPU': cpu,
                'CPULimit': self.cpuLimit,
                'WallClock': wallClock,
                'WallClockLimit': self.wallClockLimit}
    self.log.debug(consumed)

    if None not in consumed.values():
        return S_OK(consumed)

    missed = [key for key, val in consumed.items() if val is None]
    msg = 'Could not determine some parameters'
    self.log.info(msg, ': %s\nThis is the stdout from the batch system call\n%s'
                  % (','.join(missed), result['Value']))
    return S_ERROR(msg)
def getResourceUsage(self):
    """Return the CPU and wall clock usage and limits of the current slot.

    Runs ``qjob -a -nh -wide <jobID>`` and reads the consumed CPU and the
    CPU limit from the sixth whitespace-separated field of its output. That
    field is either ``used/limit`` or ``used/`` with the limit in the
    following field. The CPU figures are also reported as the wall clock
    figures. Every figure is divided by 5 and by ``self.scaleFactor``.

    :return: S_OK with a dictionary holding the keys CPU, CPULimit,
             WallClock and WallClockLimit; S_ERROR when the job ID or the
             scale factor is missing or when the output cannot be parsed;
             the failed ``runCommand`` result when the command fails.
    """
    if not self.jobID:
        return S_ERROR('Could not determine batch jobID from QSUB_REQNAME env var.')
    if not self.scaleFactor:
        return S_ERROR('CPU scala factor is not defined')

    cmd = 'qjob -a -nh -wide %s' % (self.jobID)
    result = runCommand(cmd)
    if not result['OK']:
        return result

    self.log.verbose(result['Value'])

    cpu = None
    cpuLimit = None
    try:
        cpuItems = result['Value'].split()
        usage = cpuItems[5]
        if usage[-1] == '/':
            cpu = float(usage[:-1])
            cpuLimit = float(cpuItems[6])
        else:
            cpuList = usage.split('/')
            cpu = float(cpuList[0])
            cpuLimit = float(cpuList[1])
    except Exception:
        # Best effort: anything left unparsed stays None and is reported
        # as a missing parameter further down.
        self.log.warn('Problem parsing "%s" for CPU usage' % (result['Value']))

    # Divide the numbers by 5 to bring it to HS06 units from the CC UI units
    # and remove HS06 normalization factor. None is passed through untouched
    # so that a parse failure is reported instead of raising TypeError.
    scaledCPU = None if cpu is None else cpu / 5. / self.scaleFactor
    scaledLimit = None if cpuLimit is None else cpuLimit / 5. / self.scaleFactor

    # BQS has no wallclock limit so will simply return the same as for CPU
    # to the TimeLeft utility
    consumed = {'CPU': scaledCPU,
                'CPULimit': scaledLimit,
                'WallClock': scaledCPU,
                'WallClockLimit': scaledLimit}
    self.log.debug(consumed)

    missing = [key for key, val in consumed.items() if val is None]
    for key in missing:
        self.log.warn('Could not determine %s' % key)

    if not missing:
        return S_OK(consumed)

    msg = ('Could not determine some parameters,'
           ' this is the stdout from the batch system call\n%s' % (result['Value']))
    self.log.info(msg)
    return S_ERROR('Could not determine some parameters')
def __getCPUScalingFactor(self):
    """Return the CPU usage scaling factor reported for this host, or None.

    Runs ``qconf -se <fully qualified host name>`` and extracts the ``cpu=``
    value from the first line containing ``usage_scaling`` that yields a
    usable number.

    :return: the factor as a float, or None when the command fails or no
             usable value is found.
    """
    host = socket.getfqdn()
    cmd = 'qconf -se %s' % host
    result = runCommand(cmd)
    if not result['OK']:
        return None

    for line in result['Value'].split('\n'):
        # Only the usage_scaling line is inspected, whatever other lines
        # may contain.
        if not re.search('usage_scaling', line):
            continue
        match = re.search(r'cpu=([\d,\.]*),', line)
        if not match:
            continue
        try:
            return float(match.group(1))
        except ValueError:
            # Empty or malformed value (e.g. "cpu=,"): keep looking
            continue
    return None
def __getCPUScalingFactor(self):
    """Return the CPU usage scaling factor reported for this host, or None.

    Runs ``qconf -se <fully qualified host name>`` and extracts the ``cpu=``
    value from the first line containing ``usage_scaling`` that yields a
    usable number.

    :return: the factor as a float, or None when the command fails or no
             usable value is found.
    """
    host = socket.getfqdn()
    cmd = 'qconf -se %s' % host
    result = runCommand(cmd)
    if not result['OK']:
        return None

    lines = result['Value'].split('\n')
    for line in lines:
        if re.search('usage_scaling', line):
            match = re.search(r'cpu=([\d,\.]*),', line)
            if match:
                try:
                    return float(match.group(1))
                except ValueError:
                    # Empty or malformed value (e.g. "cpu=,"): ignore this
                    # line and keep scanning instead of raising.
                    pass
    return None
def _getCPUScalingFactor():
    """Return the CPU usage scaling factor reported for this host, or None.

    Runs ``qconf -se <fully qualified host name>`` and extracts the ``cpu=``
    value from the first ``usage_scaling`` line that yields a usable number.

    :return: the factor as a float, or None when the command fails or no
             usable value is found.
    """
    host = socket.getfqdn()
    cmd = 'qconf -se %s' % host
    result = runCommand(cmd)
    if not result['OK']:
        return None

    # Example of output for qconf -se ccwsge0640 (the long complex_values
    # and load_values entries are shortened here):
    #
    #   hostname              ccwsge0640.in2p3.fr
    #   load_scaling          NONE
    #   complex_values        m_mem_free=131022.000000M,...,os=sl6
    #   load_values           arch=lx-amd64,cpu=89.400000,...,num_proc=40,...
    #   processors            40
    #   user_lists            NONE
    #   xuser_lists           NONE
    #   projects              NONE
    #   xprojects             NONE
    #   usage_scaling         cpu=11.350000,acct_cpu=11.350000
    #   report_variables      NONE
    #
    # load_values also carries a "cpu=" entry, hence the explicit selection
    # of the usage_scaling line before extracting the value.
    for line in str(result['Value']).split('\n'):
        if not re.search('usage_scaling', line):
            continue
        match = re.search(r'cpu=([\d,\.]*),', line)
        if not match:
            continue
        try:
            return float(match.group(1))
        except ValueError:
            # Empty or malformed value (e.g. "cpu=,"): keep looking
            continue
    return None
def _getCPUScalingFactor():
    """Return the CPU usage scaling factor reported for this host, or None.

    Runs ``qconf -se <fully qualified host name>`` and extracts the ``cpu=``
    value from the first ``usage_scaling`` line that yields a usable number.

    :return: the factor as a float, or None when the command fails or no
             usable value is found.
    """
    host = socket.getfqdn()
    cmd = 'qconf -se %s' % host
    result = runCommand(cmd)
    if not result['OK']:
        return None

    # Example of output for qconf -se ccwsge0640 (the long complex_values
    # and load_values entries are shortened here):
    #
    #   hostname              ccwsge0640.in2p3.fr
    #   load_scaling          NONE
    #   complex_values        m_mem_free=131022.000000M,...,os=sl6
    #   load_values           arch=lx-amd64,cpu=89.400000,...,num_proc=40,...
    #   processors            40
    #   user_lists            NONE
    #   xuser_lists           NONE
    #   projects              NONE
    #   xprojects             NONE
    #   usage_scaling         cpu=11.350000,acct_cpu=11.350000
    #   report_variables      NONE
    #
    # load_values also carries a "cpu=" entry, hence the explicit selection
    # of the usage_scaling line before extracting the value.
    lines = str(result['Value']).split('\n')
    for line in lines:
        if re.search('usage_scaling', line):
            match = re.search(r'cpu=([\d,\.]*),', line)
            if match:
                try:
                    return float(match.group(1))
                except ValueError:
                    # Empty or malformed value (e.g. "cpu=,"): ignore this
                    # line and keep scanning instead of raising.
                    pass
    return None
def getResourceUsage(self):
    """Parse the CPU and wall clock consumption of the current slot.

    Runs ``<bin>/bjobs -W <jobID>`` and treats the first output line as
    column headers and the second as the matching values. ``CPU_USED``
    (``hh:mm:ss``) is converted to seconds. ``START_TIME``
    (``mm/dd-HH:MM:SS``, completed with ``self.year``) is converted to the
    time elapsed until the current local time.

    :return: S_ERROR when the LSF bin directory or host normalization is
             unknown or the output cannot be parsed; the failed
             ``runCommand`` result when the command fails.
    """
    if not self.bin:
        return S_ERROR('Could not determine bin directory for LSF')
    if not self.hostNorm:
        return S_ERROR('Could not determine host Norm factor')

    cpu = None
    wallClock = None

    cmd = '%s/bjobs -W %s' % (self.bin, self.jobID)
    result = runCommand(cmd)
    if not result['OK']:
        return result

    lines = str(result['Value']).split('\n')
    header = lines[0]
    # A single-line output has no value row; treat it as an empty row so it
    # is reported as unparsable instead of raising IndexError.
    values = lines[1] if len(lines) > 1 else ''
    l1 = header.split()
    l2 = values.split()
    if len(l1) > len(l2):
        self.log.error("Failed bjobs command", "%s:\n %s\n %s" % (cmd, header, values))
        return S_ERROR('Can not parse LSF output')

    # len(l1) <= len(l2) here, so zip pairs every header with its value.
    for name, value in zip(l1, l2):
        if name == 'CPU_USED':
            lCPU = value.split(':')
            try:
                cpu = float(lCPU[0]) * 3600 + float(lCPU[1]) * 60 + float(lCPU[2])
            except (ValueError, IndexError):
                # Both exception types must be caught: a short or
                # non-numeric CPU_USED value leaves cpu as None.
                pass
        elif name == 'START_TIME':
            sStart = '%s %s' % (value, self.year)
            try:
                timeTup = time.strptime(sStart, '%m/%d-%H:%M:%S %Y')
                wallClock = time.mktime(time.localtime()) - time.mktime(timeTup)
            except ValueError:
                pass

    # NOTE(review): the visible body ends here, so cpu and wallClock are
    # computed but never returned and callers receive None on this path.
    # Confirm whether the tail of this method (building and returning the
    # result dictionary) was lost.
def getResourceUsage(self):
    """Parse the CPU and wall clock consumption of the current slot.

    Runs ``<bin>/bjobs -W <jobID>`` and treats the first output line as
    column headers and the second as the matching values. ``CPU_USED``
    (``hh:mm:ss``) is converted to seconds. ``START_TIME``
    (``mm/dd-HH:MM:SS``, completed with ``self.year``) is converted to the
    time elapsed until the current local time.

    :return: S_ERROR when the LSF bin directory or host normalization is
             unknown or the output cannot be parsed; the failed
             ``runCommand`` result when the command fails.
    """
    if not self.bin:
        return S_ERROR('Could not determine bin directory for LSF')
    if not self.hostNorm:
        return S_ERROR('Could not determine host Norm factor')

    cpu = None
    wallClock = None

    cmd = '%s/bjobs -W %s' % (self.bin, self.jobID)
    result = runCommand(cmd)
    if not result['OK']:
        return result

    lines = str(result['Value']).split('\n')
    headerLine = lines[0]
    # Without a second line there are no values to read; an empty row makes
    # the check below report the problem instead of raising IndexError.
    valueLine = lines[1] if len(lines) > 1 else ''
    l1 = headerLine.split()
    l2 = valueLine.split()
    if len(l1) > len(l2):
        self.log.error("Failed bjobs command", "%s:\n %s\n %s" % (cmd, headerLine, valueLine))
        return S_ERROR('Can not parse LSF output')

    for i, column in enumerate(l1):
        if column == 'CPU_USED':
            lCPU = l2[i].split(':')
            try:
                cpu = float(lCPU[0]) * 3600 + float(lCPU[1]) * 60 + float(lCPU[2])
            except (ValueError, IndexError):
                # Both exception types must be caught: a short or
                # non-numeric CPU_USED value leaves cpu as None.
                pass
        elif column == 'START_TIME':
            sStart = '%s %s' % (l2[i], self.year)
            try:
                timeTup = time.strptime(sStart, '%m/%d-%H:%M:%S %Y')
                wallClock = time.mktime(time.localtime()) - time.mktime(timeTup)
            except ValueError:
                pass

    # NOTE(review): the visible body ends here, so cpu and wallClock are
    # computed but never returned and callers receive None on this path.
    # Confirm whether the tail of this method (building and returning the
    # result dictionary) was lost.
def __init__(self):
    """Read the LSF environment and query LSF for limits and CPU normalization.

    Steps, in order:

    * read LSB_JOBID, LSB_QUEUE, LSF_BINDIR and LSB_HOSTS from the
      environment;
    * parse ``bqueues -l <queue>`` for the line following CPULIMIT and the
      line following RUNLIMIT; the values are multiplied by 60 and stored in
      ``self.cpuLimit`` / ``self.wallClockLimit`` (-1 when the line cannot be
      parsed), and the CPU limit line may name a reference CPU
      (``self.cpuRef``);
    * look up the normalization of the reference CPU (``self.normRef``)
      from ``lshosts -w``, then ``lsinfo -m``, then the HostModel section of
      ``ego.shared`` / ``lsf.shared``, defaulting to 1.;
    * read the normalization of the current host (``self.hostNorm``) from
      ``lshosts -w <host>`` and divide it by ``self.normRef``;
    * when the host normalization is known, divide the limits by it.

    If the ``bqueues`` command fails the constructor returns early, leaving
    the limits and normalizations at None.
    """
    self.log = gLogger.getSubLogger('LSFTimeLeft')
    self.jobID = os.environ.get('LSB_JOBID')
    self.queue = os.environ.get('LSB_QUEUE')
    self.bin = os.environ.get('LSF_BINDIR')
    self.host = os.environ.get('LSB_HOSTS')
    self.year = time.strftime('%Y', time.gmtime())
    self.log.verbose('LSB_JOBID=%s, LSB_QUEUE=%s, LSF_BINDIR=%s, LSB_HOSTS=%s'
                     % (self.jobID, self.queue, self.bin, self.host))

    self.cpuLimit = None
    self.cpuRef = None
    self.normRef = None
    self.wallClockLimit = None
    self.hostNorm = None

    cmd = '%s/bqueues -l %s' % (self.bin, self.queue)
    result = runCommand(cmd)
    if not result['OK']:
        return

    lines = str(result['Value']).split('\n')
    self.log.debug('From %s' % cmd,
                   '\n'.join([line if len(line) <= 128 else line[:128] + ' [...]'
                              for line in lines]))
    for i, line in enumerate(lines):
        # The value sits on the line after the keyword. If the keyword is on
        # the last line, use an empty string so the "problem parsing" branch
        # is taken instead of raising IndexError.
        nextLine = lines[i + 1] if i + 1 < len(lines) else ''
        if re.search('.*CPULIMIT.*', line):
            info = nextLine.split()
            if len(info) >= 4:
                self.cpuLimit = float(info[0]) * 60
                self.cpuRef = info[3]
            elif len(info) == 2 and info[1] == "min":
                self.cpuLimit = float(info[0]) * 60
                self.cpuRef = None
            else:
                self.log.warn('Problem parsing "%s" for CPU limit' % nextLine)
                self.cpuLimit = -1
        elif re.search('.*RUNLIMIT.*', line):
            info = nextLine.split()
            if len(info) >= 1:
                self.wallClockLimit = float(info[0]) * 60
            else:
                self.log.warn('Problem parsing "%s" for wall clock limit' % nextLine)
                self.wallClockLimit = -1

    modelMaxNorm = 0
    if self.cpuRef:
        # Now try to get the CPU_FACTOR for this reference CPU,
        # it must be either a Model, a Host or the largest Model
        cmd = '%s/lshosts -w %s' % (self.bin, self.cpuRef)
        result = runCommand(cmd)
        if result['OK']:
            # At CERN this command will return an error since there is no
            # host defined with the name of the reference Host.
            lines = str(result['Value']).split('\n')
            headerLine = lines[0]
            valueLine = lines[1] if len(lines) > 1 else ''
            l1 = headerLine.split()
            l2 = valueLine.split()
            if len(l1) > len(l2):
                self.log.error("Failed lshost command",
                               "%s:\n %s\n %s" % (cmd, headerLine, valueLine))
            else:
                for name, value in zip(l1, l2):
                    if name == 'cpuf':
                        try:
                            self.normRef = float(value)
                            self.log.info('Reference Normalization taken from Host',
                                          '%s: %s' % (self.cpuRef, self.normRef))
                        except ValueError as e:
                            self.log.exception('Exception parsing lshosts output', '', e)

        if not self.normRef:
            # Try if there is a model define with the name of cpuRef
            cmd = '%s/lsinfo -m' % (self.bin)
            result = runCommand(cmd)
            if result['OK']:
                lines = str(result['Value']).split('\n')
                for line in lines[1:]:
                    words = line.split()
                    if len(words) > 1:
                        try:
                            norm = float(words[1])
                            if norm > modelMaxNorm:
                                modelMaxNorm = norm
                            if words[0].find(self.cpuRef) > -1:
                                self.normRef = norm
                                self.log.info('Reference Normalization taken from Host Model',
                                              '%s: %s' % (self.cpuRef, self.normRef))
                        except ValueError as e:
                            self.log.exception('Exception parsing lsfinfo output', '', e)

        if not self.normRef:
            # Now parse LSF configuration files
            if not os.path.isfile('./lsf.sh'):
                os.symlink(os.path.join(os.environ['LSF_ENVDIR'], 'lsf.conf'), './lsf.sh')
            # As the variables are not exported, we must force it
            ret = sourceEnv(10, ['./lsf', '&& export LSF_CONFDIR'])
            if ret['OK']:
                lsfEnv = ret['outputEnv']
                shared = None
                try:
                    egoShared = os.path.join(lsfEnv['LSF_CONFDIR'], 'ego.shared')
                    lsfShared = os.path.join(lsfEnv['LSF_CONFDIR'], 'lsf.shared')
                    if os.path.exists(egoShared):
                        shared = egoShared
                    elif os.path.exists(lsfShared):
                        shared = lsfShared
                except KeyError as e:
                    self.log.exception('Exception getting LSF configuration', '', e)
                if shared:
                    with open(shared) as f:
                        hostModelSection = False
                        for line in f.readlines():
                            if line.find('Begin HostModel') == 0:
                                hostModelSection = True
                                continue
                            if not hostModelSection:
                                continue
                            if line.find('End HostModel') == 0:
                                break
                            line = line.strip()
                            if line and line.split()[0] == self.cpuRef:
                                try:
                                    self.normRef = float(line.split()[1])
                                    self.log.info('Reference Normalization taken from Configuration File',
                                                  '(%s) %s: %s' % (shared, self.cpuRef, self.normRef))
                                except ValueError as e:
                                    self.log.exception('Exception reading LSF configuration', '', e)
                else:
                    self.log.warn('Could not find LSF configuration')
            else:
                self.log.error('Cannot source the LSF environment', ret['Message'])

    if not self.normRef:
        # If nothing works take this as the unit
        self.normRef = 1.
        # If nothing worked, take the maximum defined for a Model
        # if modelMaxNorm:
        #     self.normRef = modelMaxNorm
        #     self.log.info( 'Reference Normalization taken from Max Model:', self.normRef )

    # Now get the Normalization for the current Host
    if self.host:
        cmd = '%s/lshosts -w %s' % (self.bin, self.host)
        result = runCommand(cmd)
        if result['OK']:
            lines = str(result['Value']).split('\n')
            headerLine = lines[0]
            valueLine = lines[1] if len(lines) > 1 else ''
            l1 = headerLine.split()
            l2 = valueLine.split()
            if len(l1) > len(l2):
                self.log.error("Failed lshost command",
                               "%s:\n %s\n %s" % (cmd, headerLine, valueLine))
            else:
                for name, value in zip(l1, l2):
                    if name == 'cpuf':
                        try:
                            self.hostNorm = float(value)
                            self.log.info('Host Normalization',
                                          '%s: %s' % (self.host, self.hostNorm))
                        except ValueError as e:
                            self.log.exception('Exception parsing lshosts output', l1, e)
                        finally:
                            # Only the first cpuf column is considered
                            break

    if self.hostNorm and self.normRef:
        self.hostNorm /= self.normRef
        self.log.info('CPU power w.r.t. batch unit', self.hostNorm)

    if self.hostNorm:
        # Set the limits in real seconds. A limit that was not found in the
        # bqueues output is still None and must be left alone.
        if self.cpuLimit is not None:
            self.cpuLimit /= self.hostNorm
        if self.wallClockLimit is not None:
            self.wallClockLimit /= self.hostNorm
def getResourceUsage(self):
    """Return the CPU and wall clock usage and limits of the current slot.

    Runs ``qstat -f -j <jobID>`` and reads:

    * the consumed CPU from the ``cpu=`` entry of the ``usage`` lines, given
      as ``hh:mm:ss`` or ``dd:hh:mm:ss``; the largest value seen is kept and
      divided by the factor returned by ``_getCPUScalingFactor()`` when one
      is available;
    * the CPU and wall clock limits from the ``_cpu=`` and ``_rt=`` entries
      of the ``hard resource_list`` line.

    The wall clock consumed is never read from the output. When at least
    one limit is known the missing figures are estimated: a missing CPU
    limit is 0.8 times the wall clock limit, a missing wall clock limit is
    the CPU limit divided by 0.8, and missing consumption figures are the
    time elapsed since ``self.startTime``.

    :return: S_OK with a dictionary holding the keys CPU, CPULimit,
             WallClock and WallClockLimit; S_ERROR whose 'Value' carries the
             partial dictionary when no limit could be determined; the
             failed ``runCommand`` result when the command fails.
    """
    cmd = 'qstat -f -j %s' % (self.jobID)
    result = runCommand(cmd)
    if not result['OK']:
        return result

    cpu = None
    cpuLimit = None
    wallClock = None
    wallClockLimit = None

    lines = str(result['Value']).split('\n')
    for line in lines:
        if re.search('usage.*cpu.*', line):
            match = re.search(r'cpu=([\d,:]*),', line)
            if match:
                cpuList = match.groups()[0].split(':')
                try:
                    newcpu = 0.
                    if len(cpuList) == 3:
                        newcpu = (float(cpuList[0]) * 60 + float(cpuList[1])) * 60 + float(cpuList[2])
                    elif len(cpuList) == 4:
                        newcpu = ((float(cpuList[0]) * 24 + float(cpuList[1])) * 60
                                  + float(cpuList[2])) * 60 + float(cpuList[3])
                    if not cpu or newcpu > cpu:
                        cpu = newcpu
                except ValueError:
                    self.log.warn('Problem parsing "%s" for CPU consumed' % line)
        elif re.search('hard resource_list.*cpu.*', line):
            # \d+ rather than \d*: an entry without digits must not match,
            # otherwise float('') would raise ValueError.
            match = re.search(r'_cpu=(\d+)', line)
            if match:
                cpuLimit = float(match.groups()[0])
            match = re.search(r'_rt=(\d+)', line)
            if match:
                wallClockLimit = float(match.groups()[0])

    # Some SGE batch systems apply CPU scaling factor to the CPU consumption figures
    if cpu:
        factor = _getCPUScalingFactor()
        if factor:
            cpu = cpu / factor

    consumed = {'CPU': cpu,
                'CPULimit': cpuLimit,
                'WallClock': wallClock,
                'WallClockLimit': wallClockLimit}

    if None not in consumed.values():
        # This cannot happen as we can't get wallClock from anywhere
        self.log.debug("TimeLeft counters complete:", str(consumed))
        return S_OK(consumed)

    missed = [key for key, val in consumed.items() if val is None]
    self.log.info('Could not determine parameter', ','.join(missed))
    self.log.debug('This is the stdout from the batch system call\n%s' % (result['Value']))

    if cpuLimit or wallClockLimit:
        # We have got a partial result from SGE
        if not cpuLimit:
            # Take some margin
            consumed['CPULimit'] = wallClockLimit * 0.8
        if not wallClockLimit:
            consumed['WallClockLimit'] = cpuLimit / 0.8
        if not cpu:
            consumed['CPU'] = time.time() - self.startTime
        if not wallClock:
            consumed['WallClock'] = time.time() - self.startTime
        self.log.debug("TimeLeft counters restored:", str(consumed))
        return S_OK(consumed)

    msg = 'Could not determine some parameters'
    self.log.info(msg, ':\nThis is the stdout from the batch system call\n%s' % (result['Value']))
    # The partial figures are still handed back to the caller with the error.
    retVal = S_ERROR(msg)
    retVal['Value'] = consumed
    return retVal
def getResourceUsage(self):
    """Return the CPU and wall clock usage and limits of the current slot.

    Runs ``<bin>/bjobs -W <jobID>`` and treats the first output line as
    column headers and the second as the matching values. ``CPU_USED``
    (``hh:mm:ss``) gives the consumed CPU. ``START_TIME``
    (``mm/dd-HH:MM:SS``, completed with ``self.year``) gives the wall clock
    consumed as the difference to the current local time. Both are
    multiplied by ``self.hostNorm``. The limits come from ``self.cpuLimit``
    and ``self.wallClockLimit``.

    :return: S_OK with a dictionary holding the keys CPU, CPULimit,
             WallClock and WallClockLimit; S_ERROR when the LSF bin
             directory or host normalization is unknown, the output cannot
             be parsed or a limit is missing; the failed ``runCommand``
             result when the command fails.
    """
    if not self.bin:
        return S_ERROR('Could not determine bin directory for LSF')
    if not self.hostNorm:
        return S_ERROR('Could not determine host Norm factor')

    cpu = None
    wallClock = None

    cmd = '%s/bjobs -W %s' % (self.bin, self.jobID)
    result = runCommand(cmd)
    if not result['OK']:
        return result

    lines = result['Value'].split('\n')
    header = lines[0]
    # A single-line output has no value row; treat it as an empty row so it
    # is reported as unparsable instead of raising IndexError.
    values = lines[1] if len(lines) > 1 else ''
    l1 = header.split()
    l2 = values.split()
    if len(l1) > len(l2):
        self.log.error(cmd)
        self.log.error(header)
        self.log.error(values)
        return S_ERROR('Can not parse LSF output')

    # len(l1) <= len(l2) here, so zip pairs every header with its value.
    for name, value in zip(l1, l2):
        if name == 'CPU_USED':
            lCPU = value.split(':')
            try:
                cpu = float(lCPU[0]) * 3600 + float(lCPU[1]) * 60 + float(lCPU[2])
            except (ValueError, IndexError):
                # malformed value: cpu stays None and is reported below
                pass
        elif name == 'START_TIME':
            sStart = '%s %s' % (value, self.year)
            try:
                timeTup = time.strptime(sStart, '%m/%d-%H:%M:%S %Y')
                wallClock = time.mktime(time.localtime()) - time.mktime(timeTup)
            except (ValueError, OverflowError):
                # malformed date: wallClock stays None and is reported below
                pass

    if cpu is None or wallClock is None:
        return S_ERROR('Failed to parse LSF output')

    cpu = cpu * self.hostNorm
    wallClock = wallClock * self.hostNorm

    consumed = {'CPU': cpu,
                'CPULimit': self.cpuLimit,
                'WallClock': wallClock,
                'WallClockLimit': self.wallClockLimit}
    self.log.debug(consumed)

    failed = False
    for key, val in consumed.items():
        if val is None:
            failed = True
            self.log.warn('Could not determine %s' % key)

    if not failed:
        return S_OK(consumed)

    msg = ('Could not determine some parameters,'
           ' this is the stdout from the batch system call\n%s' % (result['Value']))
    self.log.info(msg)
    return S_ERROR('Could not determine some parameters')
def getResourceUsage(self):
    """Return the CPU and wall clock usage and limits of the current slot.

    Runs ``qstat -f <jobID>`` and scans its output. The third
    whitespace-separated field of each matching line is read as
    ``hh:mm:ss`` and converted to seconds:

    * ``resources_used.cput`` / ``resources_used.pcput``: consumed CPU, the
      largest value seen is kept;
    * ``resources_used.walltime``: consumed wall clock;
    * ``Resource_List.cput`` / ``Resource_List.pcput``: CPU limit, the
      smallest value seen is kept;
    * ``Resource_List.walltime``: wall clock limit.

    When at least one limit is known the missing figures are filled in: a
    missing limit takes the value of the other limit and a missing
    consumption figure becomes the whole seconds elapsed since
    ``self.startTime``.

    :return: S_OK with a dictionary holding the keys CPU, CPULimit,
             WallClock and WallClockLimit; S_ERROR whose 'Value' carries the
             partial dictionary when no limit could be determined; the
             failed ``runCommand`` result when the command fails.
    """
    cmd = 'qstat -f %s' % (self.jobID)
    result = runCommand(cmd)
    if not result['OK']:
        return result

    def toSeconds(line, what):
        """Convert the hh:mm:ss value of a qstat line to seconds, or None."""
        info = line.split()
        if len(info) >= 3:
            try:
                # Too few parts or a non-numeric part raises ValueError.
                hours, minutes, seconds = [float(item) for item in info[2].split(':')[:3]]
                return (hours * 60 + minutes) * 60 + seconds
            except ValueError:
                pass
        self.log.warn('Problem parsing "%s" for %s' % (line, what))
        return None

    cpu = None
    cpuLimit = None
    wallClock = None
    wallClockLimit = None

    for line in str(result['Value']).split('\n'):
        # cput and pcput both feed the consumed CPU: keep the largest
        for pattern in ('.*resources_used.cput.*', '.*resources_used.pcput.*'):
            if re.search(pattern, line):
                newcpu = toSeconds(line, 'CPU consumed')
                if newcpu is not None and (not cpu or newcpu > cpu):
                    cpu = newcpu
        if re.search('.*resources_used.walltime.*', line):
            newWallClock = toSeconds(line, 'elapsed wall clock time')
            if newWallClock is not None:
                wallClock = newWallClock
        # cput and pcput both feed the CPU limit: keep the smallest
        for pattern in ('.*Resource_List.cput.*', '.*Resource_List.pcput.*'):
            if re.search(pattern, line):
                newcpuLimit = toSeconds(line, 'CPU limit')
                if newcpuLimit is not None and (not cpuLimit or newcpuLimit < cpuLimit):
                    cpuLimit = newcpuLimit
        if re.search('.*Resource_List.walltime.*', line):
            newWallClockLimit = toSeconds(line, 'wall clock limit')
            if newWallClockLimit is not None:
                wallClockLimit = newWallClockLimit

    consumed = {'CPU': cpu,
                'CPULimit': cpuLimit,
                'WallClock': wallClock,
                'WallClockLimit': wallClockLimit}
    self.log.debug(consumed)

    if None not in consumed.values():
        self.log.debug("TimeLeft counters complete:", str(consumed))
        return S_OK(consumed)

    if cpuLimit or wallClockLimit:
        # We have got a partial result from PBS, assume that we ran for too short time
        if not cpuLimit:
            consumed['CPULimit'] = wallClockLimit
        if not wallClockLimit:
            consumed['WallClockLimit'] = cpuLimit
        if not cpu:
            consumed['CPU'] = int(time.time() - self.startTime)
        if not wallClock:
            consumed['WallClock'] = int(time.time() - self.startTime)
        self.log.debug("TimeLeft counters restored:", str(consumed))
        return S_OK(consumed)

    msg = 'Could not determine some parameters'
    self.log.info(msg, ':\nThis is the stdout from the batch system call\n%s' % (result['Value']))
    # The partial figures are still handed back to the caller with the error.
    retVal = S_ERROR(msg)
    retVal['Value'] = consumed
    return retVal
def getResourceUsage(self):
    """Return the CPU and wall clock usage and limits of the current slot.

    Runs ``qstat -f -j <jobID>`` and reads:

    * the consumed CPU from the ``cpu=`` entry of the ``usage`` lines, given
      as ``hh:mm:ss`` or ``dd:hh:mm:ss``; the largest value seen is kept and
      divided by the factor returned by ``self.__getCPUScalingFactor()``
      when one is available;
    * the CPU and wall clock limits from the ``_cpu=`` and ``_rt=`` entries
      of the ``hard resource_list`` line.

    The wall clock consumed is never read from the output. When at least
    one limit is known the missing figures are filled in: a missing limit
    takes the value of the other limit and missing consumption figures
    become the time elapsed since ``self.startTime``.

    :return: S_OK with a dictionary holding the keys CPU, CPULimit,
             WallClock and WallClockLimit; S_ERROR whose 'Value' carries the
             partial dictionary when no limit could be determined; the
             failed ``runCommand`` result when the command fails.
    """
    cmd = 'qstat -f -j %s' % (self.jobID)
    result = runCommand(cmd)
    if not result['OK']:
        return result

    # Relevant lines of the output of qstat -f -j $JOB_ID (abridged):
    #
    #   job_number:          620685
    #   hard resource_list:  os=sl5,s_cpu=165600,s_vmem=5120M,s_fsize=51200M,cvmfs=1,dcache=1
    #   usage    1:          cpu=00:00:07, mem=0.03044 GBs, io=0.19846, vmem=288.609M, maxvmem=288.609M

    cpu = None
    cpuLimit = None
    wallClock = None
    wallClockLimit = None

    lines = result['Value'].split('\n')
    for line in lines:
        if re.search('usage.*cpu.*', line):
            match = re.search(r'cpu=([\d,:]*),', line)
            if match:
                # The captured group carries no trailing comma, so every
                # field is converted as is.
                cpuList = match.groups()[0].split(':')
                try:
                    if len(cpuList) == 3:
                        newcpu = (float(cpuList[0]) * 60 + float(cpuList[1])) * 60 + float(cpuList[2])
                    elif len(cpuList) == 4:
                        newcpu = ((float(cpuList[0]) * 24 + float(cpuList[1])) * 60
                                  + float(cpuList[2])) * 60 + float(cpuList[3])
                    else:
                        raise ValueError('unexpected CPU time format')
                    if not cpu or newcpu > cpu:
                        cpu = newcpu
                except ValueError:
                    self.log.warn('Problem parsing "%s" for CPU consumed' % line)
        if re.search('hard resource_list.*cpu.*', line):
            # \d+ rather than \d*: an entry without digits must not match,
            # otherwise float('') would raise ValueError.
            match = re.search(r'_cpu=(\d+)', line)
            if match:
                cpuLimit = float(match.groups()[0])
            match = re.search(r'_rt=(\d+)', line)
            if match:
                wallClockLimit = float(match.groups()[0])

    # Some SGE batch systems apply CPU scaling factor to the CPU consumption figures
    if cpu:
        factor = self.__getCPUScalingFactor()
        if factor:
            cpu = cpu / factor

    consumed = {
        'CPU': cpu,
        'CPULimit': cpuLimit,
        'WallClock': wallClock,
        'WallClockLimit': wallClockLimit,
    }
    self.log.debug(consumed)

    failed = False
    for key, val in consumed.items():
        if val is None:
            failed = True
            self.log.warn('Could not determine %s' % key)

    if not failed:
        return S_OK(consumed)

    if cpuLimit or wallClockLimit:
        # We have got a partial result from SGE
        if not cpuLimit:
            consumed['CPULimit'] = wallClockLimit
        if not wallClockLimit:
            consumed['WallClockLimit'] = cpuLimit
        if not cpu:
            consumed['CPU'] = time.time() - self.startTime
        if not wallClock:
            consumed['WallClock'] = time.time() - self.startTime
        self.log.debug("TimeLeft counters restored: " + str(consumed))
        return S_OK(consumed)

    self.log.info('Could not determine some parameters, this is the stdout from the batch system call\n%s'
                  % (result['Value']))
    # The partial figures are still handed back to the caller with the error.
    retVal = S_ERROR('Could not determine some parameters')
    retVal['Value'] = consumed
    return retVal
def getResourceUsage(self):
    """Return the CPU and wall clock counters of the current PBS job.

    Runs ``qstat -f <jobID>`` and parses the ``resources_used.*`` and
    ``Resource_List.*`` lines of its output.

    :return: S_OK with a dictionary holding the keys ``CPU``, ``CPULimit``,
             ``WallClock`` and ``WallClockLimit`` (values in seconds).
             When only some counters could be read but at least one limit is
             known, the missing ones are estimated and S_OK is still returned.
             Otherwise S_ERROR is returned with the partial dictionary stored
             under ``Value``.  A failing ``qstat`` result is returned as is.
    """
    cmd = 'qstat -f %s' % (self.jobID)
    result = runCommand(cmd)
    if not result['OK']:
        return result

    def parseSeconds(line, what):
        """Convert the HH:MM:SS value (third token) of a line into seconds.

        Returns None, after logging a warning, when the token is missing or
        is not made of three numeric fields.
        """
        info = line.split()
        try:
            fields = info[2].split(':')
            return (float(fields[0]) * 60 + float(fields[1])) * 60 + float(fields[2])
        except (IndexError, ValueError):
            self.log.warn('Problem parsing "%s" for %s' % (line, what))
            return None

    cpu = None
    cpuLimit = None
    wallClock = None
    wallClockLimit = None

    for line in str(result['Value']).split('\n'):
        # CPU consumed: keep the largest of cput and pcput
        for pattern in ('.*resources_used.cput.*', '.*resources_used.pcput.*'):
            if re.search(pattern, line):
                newcpu = parseSeconds(line, 'CPU consumed')
                if newcpu is not None and (not cpu or newcpu > cpu):
                    cpu = newcpu

        if re.search('.*resources_used.walltime.*', line):
            newWallClock = parseSeconds(line, 'elapsed wall clock time')
            if newWallClock is not None:
                wallClock = newWallClock

        # CPU limit: keep the smallest of cput and pcput
        for pattern in ('.*Resource_List.cput.*', '.*Resource_List.pcput.*'):
            if re.search(pattern, line):
                newcpuLimit = parseSeconds(line, 'CPU limit')
                if newcpuLimit is not None and (not cpuLimit or newcpuLimit < cpuLimit):
                    cpuLimit = newcpuLimit

        if re.search('.*Resource_List.walltime.*', line):
            newWallClockLimit = parseSeconds(line, 'wall clock limit')
            if newWallClockLimit is not None:
                wallClockLimit = newWallClockLimit

    consumed = {
        'CPU': cpu,
        'CPULimit': cpuLimit,
        'WallClock': wallClock,
        'WallClockLimit': wallClockLimit
    }
    self.log.debug(consumed)

    if None not in consumed.values():
        self.log.debug("TimeLeft counters complete:", str(consumed))
        return S_OK(consumed)

    missed = [key for key, val in consumed.items() if val is None]
    self.log.info('Could not determine parameter', ','.join(missed))
    self.log.debug('This is the stdout from the batch system call\n%s' % (result['Value']))

    if cpuLimit or wallClockLimit:
        # We have got a partial result from PBS, assume that we ran for too short time
        if not cpuLimit:
            consumed['CPULimit'] = wallClockLimit * 0.8
        if not wallClockLimit:
            consumed['WallClockLimit'] = cpuLimit / 0.8
        # Consumption not reported: use the time elapsed since self.startTime
        if not cpu:
            consumed['CPU'] = int(time.time() - self.startTime)
        if not wallClock:
            consumed['WallClock'] = int(time.time() - self.startTime)
        self.log.debug("TimeLeft counters restored:", str(consumed))
        return S_OK(consumed)

    msg = 'Could not determine some parameters'
    self.log.info(msg, ':\nThis is the stdout from the batch system call\n%s' % (result['Value']))
    retVal = S_ERROR(msg)
    retVal['Value'] = consumed
    return retVal
def getResourceUsage(self):
    """Return the CPU and wall clock counters of the current PBS job.

    Runs ``qstat -f <jobID>`` and parses the ``resources_used.cput``,
    ``resources_used.walltime``, ``Resource_List.cput`` and
    ``Resource_List.walltime`` lines of its output.

    :return: S_OK with a dictionary holding the keys ``CPU``, ``CPULimit``,
             ``WallClock`` and ``WallClockLimit`` (values in seconds) when all
             four could be read.  Otherwise S_ERROR, with the partial
             dictionary stored under ``Value``.  A failing ``qstat`` result is
             returned as is.
    """
    cmd = 'qstat -f %s' % (self.jobID)
    result = runCommand(cmd)
    if not result['OK']:
        return result

    def parseSeconds(line, what):
        """Convert the HH:MM:SS value (third token) of a line into seconds.

        Returns None, after logging a warning, when the token is missing or
        is not made of three numeric fields.
        """
        info = line.split()
        try:
            fields = info[2].split(':')
            return (float(fields[0]) * 60 + float(fields[1])) * 60 + float(fields[2])
        except (IndexError, ValueError):
            self.log.warn('Problem parsing "%s" for %s' % (line, what))
            return None

    # (regular expression, key of the result dictionary, wording used in the warning)
    counters = (
        ('.*resources_used.cput.*', 'CPU', 'CPU consumed'),
        ('.*resources_used.walltime.*', 'WallClock', 'elapsed wall clock time'),
        ('.*Resource_List.cput.*', 'CPULimit', 'CPU limit'),
        ('.*Resource_List.walltime.*', 'WallClockLimit', 'wall clock limit'),
    )

    consumed = {'CPU': None, 'CPULimit': None, 'WallClock': None, 'WallClockLimit': None}
    for line in result['Value'].split('\n'):
        for pattern, key, what in counters:
            if re.search(pattern, line):
                seconds = parseSeconds(line, what)
                # An unparsable line must not wipe a value read earlier
                if seconds is not None:
                    consumed[key] = seconds

    self.log.debug(consumed)

    missing = [key for key, val in consumed.items() if val is None]
    for key in missing:
        self.log.warn('Could not determine %s' % key)
    if not missing:
        return S_OK(consumed)

    self.log.info(
        'Could not determine some parameters, this is the stdout from the batch system call\n%s' % (result['Value']))
    retVal = S_ERROR('Could not determine some parameters')
    retVal['Value'] = consumed
    return retVal
def getResourceUsage(self):
    """Return the CPU and wall clock counters of the current SGE job.

    Runs ``qstat -f -j <jobID>`` and parses its ``usage`` and
    ``hard resource_list`` lines.

    :return: S_OK with a dictionary holding the keys ``CPU``, ``CPULimit``,
             ``WallClock`` and ``WallClockLimit`` (values in seconds).
             The wall clock consumption is never read from the output, so
             whenever at least one limit is known the missing counters are
             estimated.  Otherwise S_ERROR is returned with the partial
             dictionary stored under ``Value``.  A failing ``qstat`` result is
             returned as is.
    """
    cmd = 'qstat -f -j %s' % (self.jobID)
    result = runCommand(cmd)
    if not result['OK']:
        return result

    # The lines of interest in the output of qstat -f -j $JOB_ID look like:
    #   hard resource_list:  os=sl5,s_cpu=165600,s_vmem=5120M,s_fsize=51200M,cvmfs=1,dcache=1
    #   usage    1:          cpu=00:00:07, mem=0.03044 GBs, io=0.19846, vmem=288.609M, maxvmem=288.609M

    cpu = None
    cpuLimit = None
    wallClock = None
    wallClockLimit = None

    for line in result['Value'].split('\n'):
        if re.search('usage.*cpu.*', line):
            match = re.search(r'cpu=([\d,:]*),', line)
            if match:
                cpuList = match.group(1).split(':')
                try:
                    newcpu = 0.
                    if len(cpuList) == 3:
                        # hh:mm:ss
                        newcpu = (float(cpuList[0]) * 60 + float(cpuList[1])) * 60 + float(cpuList[2])
                    elif len(cpuList) == 4:
                        # d:hh:mm:ss
                        newcpu = ((float(cpuList[0]) * 24 + float(cpuList[1])) * 60 +
                                  float(cpuList[2])) * 60 + float(cpuList[3])
                    # Several usage lines may be present: keep the largest value
                    if not cpu or newcpu > cpu:
                        cpu = newcpu
                except ValueError:
                    self.log.warn('Problem parsing "%s" for CPU consumed' % line)
        if re.search('hard resource_list.*cpu.*', line):
            # At least one digit is required: a non numeric limit (e.g. INFINITY)
            # must be skipped rather than fed to float() as an empty string
            match = re.search(r'_cpu=(\d+)', line)
            if match:
                cpuLimit = float(match.group(1))
            match = re.search(r'_rt=(\d+)', line)
            if match:
                wallClockLimit = float(match.group(1))

    # Some SGE batch systems apply CPU scaling factor to the CPU consumption figures
    if cpu:
        factor = self.__getCPUScalingFactor()
        if factor:
            cpu = cpu / factor

    consumed = {'CPU': cpu, 'CPULimit': cpuLimit, 'WallClock': wallClock, 'WallClockLimit': wallClockLimit}
    self.log.debug(consumed)

    missing = [key for key, val in consumed.items() if val is None]
    for key in missing:
        self.log.warn('Could not determine %s' % key)
    if not missing:
        return S_OK(consumed)

    if cpuLimit or wallClockLimit:
        # We have got a partial result from SGE
        if not cpuLimit:
            consumed['CPULimit'] = wallClockLimit
        if not wallClockLimit:
            consumed['WallClockLimit'] = cpuLimit
        # Consumption not reported: use the time elapsed since self.startTime
        if not cpu:
            consumed['CPU'] = time.time() - self.startTime
        if not wallClock:
            consumed['WallClock'] = time.time() - self.startTime
        self.log.debug("TimeLeft counters restored: " + str(consumed))
        return S_OK(consumed)

    self.log.info(
        'Could not determine some parameters, this is the stdout from the batch system call\n%s' % (result['Value']))
    retVal = S_ERROR('Could not determine some parameters')
    retVal['Value'] = consumed
    return retVal
def __init__(self):
    """Standard constructor.

    Reads the LSF job environment (LSB_JOBID, LSB_QUEUE, LSF_BINDIR,
    LSB_HOSTS) and then fills, on a best-effort basis:

    * ``cpuLimit``, ``cpuRef`` and ``wallClockLimit`` from ``bqueues -l``
      (the reported numbers are multiplied by 60; ``cpuLimit`` is set to -1
      when the CPULIMIT value line cannot be parsed);
    * ``normRef``, the CPU factor of the reference ``cpuRef``, looked up in
      turn with ``lshosts -w``, with ``lsinfo -m``, in the HostModel section
      of the LSF shared configuration file, and finally as the largest
      factor listed by ``lsinfo -m``;
    * ``hostNorm``, the ``cpuf`` of the current host divided by ``normRef``
      when both are known.

    Attributes that cannot be determined are left to None.
    """
    self.log = gLogger.getSubLogger('LSFTimeLeft')
    self.jobID = os.environ.get('LSB_JOBID')
    self.queue = os.environ.get('LSB_QUEUE')
    self.bin = os.environ.get('LSF_BINDIR')
    self.host = os.environ.get('LSB_HOSTS')
    self.year = time.strftime('%Y', time.gmtime())
    self.log.verbose('LSB_JOBID=%s, LSB_QUEUE=%s, LSF_BINDIR=%s, LSB_HOSTS=%s' % (self.jobID,
                                                                                 self.queue,
                                                                                 self.bin,
                                                                                 self.host))

    self.cpuLimit = None
    self.cpuRef = None
    self.normRef = None
    self.wallClockLimit = None
    self.hostNorm = None

    def lshostsCpuFactor(name):
        """Return the ``cpuf`` column of ``lshosts -w <name>`` as a float, None if unavailable."""
        cmd = '%s/lshosts -w %s' % (self.bin, name)
        result = runCommand(cmd)
        if not result['OK']:
            return None
        lines = str(result['Value']).split('\n')
        headerLine = lines[0]
        # The output may consist of a single line: do not index past the end
        valueLine = lines[1] if len(lines) > 1 else ''
        header = headerLine.split()
        values = valueLine.split()
        if len(header) > len(values):
            self.log.error("Failed lshost command", "%s:\n %s\n %s" % (cmd, headerLine, valueLine))
            return None
        factor = None
        for column, value in zip(header, values):
            if column == 'cpuf':
                try:
                    factor = float(value)
                except ValueError as e:
                    self.log.exception('Exception parsing lshosts output', '', e)
        return factor

    cmd = '%s/bqueues -l %s' % (self.bin, self.queue)
    result = runCommand(cmd)
    if not result['OK']:
        return

    self.log.debug('From %s' % cmd, result['Value'])
    lines = str(result['Value']).split('\n')
    for i, line in enumerate(lines):
        # The values are printed on the line following the header; a header found on
        # the last line has no values and goes through the "problem parsing" path
        nextLine = lines[i + 1] if i + 1 < len(lines) else ''
        if re.search('.*CPULIMIT.*', line):
            info = nextLine.split()
            if len(info) >= 4:
                self.cpuLimit = float(info[0]) * 60
                self.cpuRef = info[3]
            else:
                self.log.warn('Problem parsing "%s" for CPU limit' % nextLine)
                self.cpuLimit = -1
        if re.search('.*RUNLIMIT.*', line):
            info = nextLine.split()
            if len(info) >= 1:
                self.wallClockLimit = float(info[0]) * 60
            else:
                self.log.warn('Problem parsing "%s" for wall clock limit' % nextLine)

    modelMaxNorm = 0
    if self.cpuRef:
        # Now try to get the CPU_FACTOR for this reference CPU,
        # it must be either a Model, a Host or the largest Model

        # At CERN this command will return an error since there is no host defined
        # with the name of the reference Host.
        factor = lshostsCpuFactor(self.cpuRef)
        if factor is not None:
            self.normRef = factor
            self.log.info('Reference Normalization taken from Host', '%s: %s' % (self.cpuRef, self.normRef))

        if not self.normRef:
            # Try if there is a model define with the name of cpuRef
            cmd = '%s/lsinfo -m' % (self.bin)
            result = runCommand(cmd)
            if result['OK']:
                lines = str(result['Value']).split('\n')
                for line in lines[1:]:
                    words = line.split()
                    if len(words) > 1:
                        try:
                            norm = float(words[1])
                            if norm > modelMaxNorm:
                                modelMaxNorm = norm
                            if words[0].find(self.cpuRef) > -1:
                                self.normRef = norm
                                self.log.info('Reference Normalization taken from Host Model',
                                              '%s: %s' % (self.cpuRef, self.normRef))
                        except ValueError as e:
                            self.log.exception('Exception parsing lsfinfo output', '', e)

        if not self.normRef:
            # Now parse LSF configuration files
            if not os.path.isfile('./lsf.sh'):
                os.symlink(os.path.join(os.environ['LSF_ENVDIR'], 'lsf.conf'), './lsf.sh')
            # As the variables are not exported, we must force it
            ret = sourceEnv(10, ['./lsf', '&& export LSF_CONFDIR'])
            if ret['OK']:
                lsfEnv = ret['outputEnv']
                shared = None
                try:
                    egoShared = os.path.join(lsfEnv['LSF_CONFDIR'], 'ego.shared')
                    lsfShared = os.path.join(lsfEnv['LSF_CONFDIR'], 'lsf.shared')
                    if os.path.exists(egoShared):
                        shared = egoShared
                    elif os.path.exists(lsfShared):
                        shared = lsfShared
                except KeyError as e:
                    self.log.exception('Exception getting LSF configuration', '', e)
                if shared:
                    # The context manager closes the file even when leaving the loop early
                    with open(shared) as sharedFile:
                        hostModelSection = False
                        for line in sharedFile:
                            if line.find('Begin HostModel') == 0:
                                hostModelSection = True
                                continue
                            if not hostModelSection:
                                continue
                            if line.find('End HostModel') == 0:
                                break
                            line = line.strip()
                            if line and line.split()[0] == self.cpuRef:
                                try:
                                    self.normRef = float(line.split()[1])
                                    self.log.info('Reference Normalization taken from Configuration File',
                                                  '(%s) %s: %s' % (shared, self.cpuRef, self.normRef))
                                except ValueError as e:
                                    self.log.exception('Exception reading LSF configuration', '', e)
                else:
                    self.log.warn('Could not find LSF configuration')
            else:
                self.log.error('Cannot source the LSF environment', ret['Message'])

    if not self.normRef:
        # If nothing worked, take the maximum defined for a Model
        if modelMaxNorm:
            self.normRef = modelMaxNorm
            self.log.info('Reference Normalization taken from Max Model:', self.normRef)

    # Now get the Normalization for the current Host
    if self.host:
        factor = lshostsCpuFactor(self.host)
        if factor is not None:
            self.hostNorm = factor
            self.log.info('Host Normalization', '%s: %s' % (self.host, self.hostNorm))

        if self.hostNorm and self.normRef:
            self.hostNorm /= self.normRef
            self.log.info('CPU Normalization', self.hostNorm)
def getResourceUsage(self):
    """Return the CPU and wall clock counters of the current PBS job.

    Runs ``qstat -f <jobID>`` and parses the ``resources_used.*`` and
    ``Resource_List.*`` lines of its output.

    :return: S_OK with a dictionary holding the keys ``CPU``, ``CPULimit``,
             ``WallClock`` and ``WallClockLimit`` (values in seconds).
             When only some counters could be read but at least one limit is
             known, the missing ones are estimated and S_OK is still returned.
             Otherwise S_ERROR is returned with the partial dictionary stored
             under ``Value``.  A failing ``qstat`` result is returned as is.
    """
    cmd = "qstat -f %s" % (self.jobID)
    result = runCommand(cmd)
    if not result["OK"]:
        return result

    def toSeconds(line, what):
        """Convert the HH:MM:SS value (third token) of a line into seconds.

        Returns None, after logging a warning, when the token is missing or
        is not made of three numeric fields.
        """
        info = line.split()
        try:
            fields = info[2].split(":")
            return (float(fields[0]) * 60 + float(fields[1])) * 60 + float(fields[2])
        except (IndexError, ValueError):
            self.log.warn('Problem parsing "%s" for %s' % (line, what))
            return None

    usedCpuPatterns = (".*resources_used.cput.*", ".*resources_used.pcput.*")
    limitCpuPatterns = (".*Resource_List.cput.*", ".*Resource_List.pcput.*")

    cpu = None
    cpuLimit = None
    wallClock = None
    wallClockLimit = None

    for line in str(result["Value"]).split("\n"):
        # CPU consumed: keep the largest of cput and pcput
        for pattern in usedCpuPatterns:
            if re.search(pattern, line):
                newcpu = toSeconds(line, "CPU consumed")
                if newcpu is not None and (not cpu or newcpu > cpu):
                    cpu = newcpu

        if re.search(".*resources_used.walltime.*", line):
            newWallClock = toSeconds(line, "elapsed wall clock time")
            if newWallClock is not None:
                wallClock = newWallClock

        # CPU limit: keep the smallest of cput and pcput
        for pattern in limitCpuPatterns:
            if re.search(pattern, line):
                newcpuLimit = toSeconds(line, "CPU limit")
                if newcpuLimit is not None and (not cpuLimit or newcpuLimit < cpuLimit):
                    cpuLimit = newcpuLimit

        if re.search(".*Resource_List.walltime.*", line):
            newWallClockLimit = toSeconds(line, "wall clock limit")
            if newWallClockLimit is not None:
                wallClockLimit = newWallClockLimit

    consumed = {"CPU": cpu, "CPULimit": cpuLimit, "WallClock": wallClock, "WallClockLimit": wallClockLimit}
    self.log.debug(consumed)

    if None not in consumed.values():
        self.log.debug("TimeLeft counters complete:", str(consumed))
        return S_OK(consumed)

    missed = [key for key, val in consumed.items() if val is None]
    self.log.info("Could not determine parameter", ",".join(missed))
    self.log.debug("This is the stdout from the batch system call\n%s" % (result["Value"]))

    if cpuLimit or wallClockLimit:
        # We have got a partial result from PBS, assume that we ran for too short time
        if not cpuLimit:
            consumed["CPULimit"] = wallClockLimit * 0.8
        if not wallClockLimit:
            consumed["WallClockLimit"] = cpuLimit / 0.8
        # Consumption not reported: use the time elapsed since self.startTime
        if not cpu:
            consumed["CPU"] = int(time.time() - self.startTime)
        if not wallClock:
            consumed["WallClock"] = int(time.time() - self.startTime)
        self.log.debug("TimeLeft counters restored:", str(consumed))
        return S_OK(consumed)

    msg = "Could not determine some parameters"
    self.log.info(msg, ":\nThis is the stdout from the batch system call\n%s" % (result["Value"]))
    retVal = S_ERROR(msg)
    retVal["Value"] = consumed
    return retVal
def getResourceUsage(self):
    """Return the CPU and wall clock counters of the current SGE job.

    Runs ``qstat -f -j <jobID>`` and parses its ``usage`` and
    ``hard resource_list`` lines.

    :return: S_OK with a dictionary holding the keys ``CPU``, ``CPULimit``,
             ``WallClock`` and ``WallClockLimit`` (values in seconds).
             The wall clock consumption is never read from the output, so
             whenever at least one limit is known the missing counters are
             estimated.  Otherwise S_ERROR is returned with the partial
             dictionary stored under ``Value``.  A failing ``qstat`` result is
             returned as is.
    """
    cmd = 'qstat -f -j %s' % (self.jobID)
    result = runCommand(cmd)
    if not result['OK']:
        return result

    cpu = None
    cpuLimit = None
    wallClock = None
    wallClockLimit = None

    for line in str(result['Value']).split('\n'):
        if re.search('usage.*cpu.*', line):
            match = re.search(r'cpu=([\d,:]*),', line)
            if match:
                cpuList = match.group(1).split(':')
                try:
                    newcpu = 0.
                    if len(cpuList) == 3:
                        # hh:mm:ss
                        newcpu = (float(cpuList[0]) * 60 + float(cpuList[1])) * 60 + float(cpuList[2])
                    elif len(cpuList) == 4:
                        # d:hh:mm:ss
                        newcpu = ((float(cpuList[0]) * 24 + float(cpuList[1])) * 60 +
                                  float(cpuList[2])) * 60 + float(cpuList[3])
                    # Several usage lines may be present: keep the largest value
                    if not cpu or newcpu > cpu:
                        cpu = newcpu
                except ValueError:
                    self.log.warn('Problem parsing "%s" for CPU consumed' % line)
        elif re.search('hard resource_list.*cpu.*', line):
            # At least one digit is required: a non numeric limit (e.g. INFINITY)
            # must be skipped rather than fed to float() as an empty string
            match = re.search(r'_cpu=(\d+)', line)
            if match:
                cpuLimit = float(match.group(1))
            match = re.search(r'_rt=(\d+)', line)
            if match:
                wallClockLimit = float(match.group(1))

    # Some SGE batch systems apply CPU scaling factor to the CPU consumption figures
    if cpu:
        factor = _getCPUScalingFactor()
        if factor:
            cpu = cpu / factor

    consumed = {
        'CPU': cpu,
        'CPULimit': cpuLimit,
        'WallClock': wallClock,
        'WallClockLimit': wallClockLimit
    }

    if None not in consumed.values():
        # This cannot happen as we can't get wallClock from anywhere
        self.log.debug("TimeLeft counters complete:", str(consumed))
        return S_OK(consumed)

    missed = [key for key, val in consumed.items() if val is None]
    self.log.info('Could not determine parameter', ','.join(missed))
    self.log.debug('This is the stdout from the batch system call\n%s' % (result['Value']))

    if cpuLimit or wallClockLimit:
        # We have got a partial result from SGE
        if not cpuLimit:
            # Take some margin
            consumed['CPULimit'] = wallClockLimit * 0.8
        if not wallClockLimit:
            consumed['WallClockLimit'] = cpuLimit / 0.8
        # Consumption not reported: use the time elapsed since self.startTime
        if not cpu:
            consumed['CPU'] = time.time() - self.startTime
        if not wallClock:
            consumed['WallClock'] = time.time() - self.startTime
        self.log.debug("TimeLeft counters restored:", str(consumed))
        return S_OK(consumed)

    msg = 'Could not determine some parameters'
    self.log.info(msg, ':\nThis is the stdout from the batch system call\n%s' % (result['Value']))
    retVal = S_ERROR(msg)
    retVal['Value'] = consumed
    return retVal