예제 #1
0
    def getResourceUsage(self):
        """Return the CPU and wall clock figures of the current BQS job.

        The figures are parsed from the output of ``qjob -a -nh -wide <jobID>``:
        the sixth whitespace-separated field holds ``consumed/limit`` (or
        ``consumed/`` with the limit in the following field).  Both numbers are
        divided by 5 and by ``self.scaleFactor`` before being returned.

        BQS has no wall clock limit, so the wall clock entries simply repeat
        the CPU ones.

        :return: S_OK with a dictionary holding the keys CPU, CPULimit,
                 WallClock and WallClockLimit; S_ERROR if the job ID or the
                 scale factor is missing or the output cannot be parsed; the
                 failed runCommand result if the command itself fails.
        """
        if not self.jobID:
            return S_ERROR(
                'Could not determine batch jobID from QSUB_REQNAME env var.')

        if not self.scaleFactor:
            return S_ERROR('CPU scala factor is not defined')

        cmd = 'qjob -a -nh -wide %s' % (self.jobID)
        result = runCommand(cmd)
        if not result['OK']:
            return result

        self.log.verbose(result['Value'])

        try:
            cpuItems = result['Value'].split()
            if cpuItems[5][-1] == '/':
                cpu = float(cpuItems[5][:-1])
                cpuLimit = float(cpuItems[6])
            else:
                cpuList = cpuItems[5].split('/')
                cpu = float(cpuList[0])
                cpuLimit = float(cpuList[1])
        except (AttributeError, IndexError, TypeError, ValueError):
            # Stop here: the normalization below cannot be applied to missing
            # values (dividing None would raise a TypeError).
            self.log.warn('Problem parsing "%s" for CPU usage' %
                          (result['Value']))
            msg = 'Could not determine some parameters,' \
                  ' this is the stdout from the batch system call\n%s' % ( result['Value'] )
            self.log.info(msg)
            return S_ERROR('Could not determine some parameters')

        # Divide the numbers by 5 to bring it to HS06 units from the CC UI units
        # and remove HS06 normalization factor
        cpu = cpu / 5. / self.scaleFactor
        cpuLimit = cpuLimit / 5. / self.scaleFactor

        # BQS has no wallclock limit so will simply return the same as for CPU
        # to the TimeLeft utility
        consumed = {
            'CPU': cpu,
            'CPULimit': cpuLimit,
            'WallClock': cpu,
            'WallClockLimit': cpuLimit
        }
        self.log.debug(consumed)
        return S_OK(consumed)
예제 #2
0
  def getResourceUsage( self ):
    """Return the CPU and wall clock figures of the current job from ``qstat -f <jobID>``.

       Four entries of the output are looked for; each is expected to carry an
       ``hh:mm:ss`` value as its third whitespace-separated field, which is
       converted to seconds:

         resources_used.cput      -> CPU
         resources_used.walltime  -> WallClock
         Resource_List.cput       -> CPULimit
         Resource_List.walltime   -> WallClockLimit

       :return: S_OK( dict ) when all four values were found.  Otherwise an
                S_ERROR structure whose 'Value' entry still carries the
                partially filled dictionary (missing entries are None).  If the
                command itself fails its result is returned unchanged.
    """
    cmd = 'qstat -f %s' % ( self.jobID )
    result = runCommand( cmd )
    if not result['OK']:
      return result

    # ( pattern identifying the line, key in the result, wording used in the warning )
    fields = ( ( '.*resources_used.cput.*', 'CPU', 'CPU consumed' ),
               ( '.*resources_used.walltime.*', 'WallClock', 'elapsed wall clock time' ),
               ( '.*Resource_List.cput.*', 'CPULimit', 'CPU limit' ),
               ( '.*Resource_List.walltime.*', 'WallClockLimit', 'wall clock limit' ) )
    consumed = {'CPU':None, 'CPULimit':None, 'WallClock':None, 'WallClockLimit':None}

    for line in result['Value'].split( '\n' ):
      info = line.split()
      for pattern, key, what in fields:
        if not re.search( pattern, line ):
          continue
        try:
          parts = info[2].split( ':' )
          consumed[key] = ( float( parts[0] ) * 60 + float( parts[1] ) ) * 60 + float( parts[2] )
        except ( IndexError, ValueError ):
          # Too few fields on the line, or a value that is not hh:mm:ss:
          # keep whatever was known before and report the offending line
          self.log.warn( 'Problem parsing "%s" for %s' % ( line, what ) )

    self.log.debug( consumed )
    missing = [key for key, val in consumed.items() if val is None]
    for key in missing:
      self.log.warn( 'Could not determine %s' % key )

    if not missing:
      return S_OK( consumed )

    self.log.info( 'Could not determine some parameters, this is the stdout from the batch system call\n%s' % ( result['Value'] ) )
    retVal = S_ERROR( 'Could not determine some parameters' )
    # Callers still get the values that could be determined
    retVal['Value'] = consumed
    return retVal
예제 #3
0
  def getResourceUsage( self ):
    """Return the CPU and wall clock figures of the current LSF job.

       ``<bin>/bjobs -W <jobID>`` is run and its first two output lines are read
       as a header line and a value line.  The CPU_USED column ( ``h:m:s`` ) gives
       the consumed CPU in seconds; the START_TIME column ( ``month/day-h:m:s``,
       completed with self.year ) gives the elapsed wall clock time.  The limits
       are taken from self.cpuLimit and self.wallClockLimit.

       NOTE(review): START_TIME carries no year, so a job started before a
       year change would presumably be mis-dated - confirm how self.year is
       meant to be used.

       :return: S_OK( dict ) with the keys CPU, CPULimit, WallClock and
                WallClockLimit, or S_ERROR when a prerequisite is missing, the
                output cannot be parsed or a limit is not known.  If the
                command itself fails its result is returned unchanged.
    """
    if not self.bin:
      return S_ERROR( 'Could not determine bin directory for LSF' )
    if not self.hostNorm:
      return S_ERROR( 'Could not determine host Norm factor' )

    cmd = '%s/bjobs -W %s' % ( self.bin, self.jobID )
    result = runCommand( cmd )
    if not result['OK']:
      return result

    lines = str( result['Value'] ).split( '\n' )
    if len( lines ) < 2:
      # A header line plus a value line are needed, nothing to parse otherwise
      self.log.error( "Failed bjobs command", "%s:\n %s" % ( cmd, result['Value'] ) )
      return S_ERROR( 'Can not parse LSF output' )
    header = lines[0].split()
    values = lines[1].split()
    if len( header ) > len( values ):
      self.log.error( "Failed bjobs command", "%s:\n %s\n %s" % ( cmd, lines[0], lines[1] ) )
      return S_ERROR( 'Can not parse LSF output' )

    cpu = None
    wallClock = None
    for name, value in zip( header, values ):
      if name == 'CPU_USED':
        fields = value.split( ':' )
        try:
          cpu = float( fields[0] ) * 3600 + float( fields[1] ) * 60 + float( fields[2] )
        except ( ValueError, IndexError ):
          # cpu stays None and is reported just after the loop
          pass
      elif name == 'START_TIME':
        try:
          timeTup = time.strptime( '%s %s' % ( value, self.year ), '%m/%d-%H:%M:%S %Y' )
          wallClock = time.mktime( time.localtime() ) - time.mktime( timeTup )
        except ValueError:
          # wallClock stays None and is reported just after the loop
          pass

    if cpu is None or wallClock is None:
      return S_ERROR( 'Failed to parse LSF output' )

    consumed = {'CPU':cpu, 'CPULimit':self.cpuLimit, 'WallClock':wallClock, 'WallClockLimit':self.wallClockLimit}
    self.log.debug( consumed )

    missed = [key for key, val in consumed.items() if val is None]
    if not missed:
      return S_OK( consumed )

    msg = 'Could not determine some parameters'
    self.log.info( msg, ': %s\nThis is the stdout from the batch system call\n%s' % ( ','.join( missed ), result['Value'] ) )
    return S_ERROR( msg )
예제 #4
0
  def getResourceUsage( self ):
    """Return the CPU and wall clock figures of the current LSF job.

       ``<bin>/bjobs -W <jobID>`` is run and its first two output lines are read
       as a header line and a value line.  The CPU_USED column ( ``h:m:s`` ) gives
       the consumed CPU in seconds; the START_TIME column ( ``month/day-h:m:s``,
       completed with self.year ) gives the elapsed wall clock time.  The limits
       are taken from self.cpuLimit and self.wallClockLimit.

       NOTE(review): START_TIME carries no year, so a job started before a
       year change would presumably be mis-dated - confirm how self.year is
       meant to be used.

       :return: S_OK( dict ) with the keys CPU, CPULimit, WallClock and
                WallClockLimit, or S_ERROR when a prerequisite is missing, the
                output cannot be parsed or a limit is not known.  If the
                command itself fails its result is returned unchanged.
    """
    if not self.bin:
      return S_ERROR( 'Could not determine bin directory for LSF' )
    if not self.hostNorm:
      return S_ERROR( 'Could not determine host Norm factor' )

    cmd = '%s/bjobs -W %s' % ( self.bin, self.jobID )
    result = runCommand( cmd )
    if not result['OK']:
      return result

    lines = str( result['Value'] ).split( '\n' )
    if len( lines ) < 2:
      # A header line plus a value line are needed, nothing to parse otherwise
      self.log.error( "Failed bjobs command", "%s:\n %s" % ( cmd, result['Value'] ) )
      return S_ERROR( 'Can not parse LSF output' )
    header = lines[0].split()
    values = lines[1].split()
    if len( header ) > len( values ):
      self.log.error( "Failed bjobs command", "%s:\n %s\n %s" % ( cmd, lines[0], lines[1] ) )
      return S_ERROR( 'Can not parse LSF output' )

    cpu = None
    wallClock = None
    for name, value in zip( header, values ):
      if name == 'CPU_USED':
        fields = value.split( ':' )
        try:
          cpu = float( fields[0] ) * 3600 + float( fields[1] ) * 60 + float( fields[2] )
        except ( ValueError, IndexError ):
          # cpu stays None and is reported just after the loop
          pass
      elif name == 'START_TIME':
        try:
          timeTup = time.strptime( '%s %s' % ( value, self.year ), '%m/%d-%H:%M:%S %Y' )
          wallClock = time.mktime( time.localtime() ) - time.mktime( timeTup )
        except ValueError:
          # wallClock stays None and is reported just after the loop
          pass

    if cpu is None or wallClock is None:
      return S_ERROR( 'Failed to parse LSF output' )

    consumed = {'CPU':cpu, 'CPULimit':self.cpuLimit, 'WallClock':wallClock, 'WallClockLimit':self.wallClockLimit}
    self.log.debug( consumed )

    missed = [key for key, val in consumed.items() if val is None]
    if not missed:
      return S_OK( consumed )

    msg = 'Could not determine some parameters'
    self.log.info( msg, ': %s\nThis is the stdout from the batch system call\n%s' % ( ','.join( missed ), result['Value'] ) )
    return S_ERROR( msg )
예제 #5
0
  def getResourceUsage( self ):
    """Return the CPU and wall clock figures of the current BQS job.

       The figures are parsed from the output of ``qjob -a -nh -wide <jobID>``:
       the sixth whitespace-separated field holds ``consumed/limit`` ( or
       ``consumed/`` with the limit in the following field ).  Both numbers are
       divided by 5 and by self.scaleFactor before being returned.

       BQS has no wall clock limit, so the wall clock entries simply repeat the
       CPU ones.

       :return: S_OK( dict ) with the keys CPU, CPULimit, WallClock and
                WallClockLimit; S_ERROR if the job ID or the scale factor is
                missing or the output cannot be parsed; the failed runCommand
                result if the command itself fails.
    """
    if not self.jobID:
      return S_ERROR( 'Could not determine batch jobID from QSUB_REQNAME env var.' )

    if not self.scaleFactor:
      return S_ERROR( 'CPU scala factor is not defined' )

    cmd = 'qjob -a -nh -wide %s' % ( self.jobID )
    result = runCommand( cmd )
    if not result['OK']:
      return result

    self.log.verbose( result['Value'] )

    try:
      cpuItems = result['Value'].split()
      if cpuItems[5][-1] == '/':
        cpu = float( cpuItems[5][:-1] )
        cpuLimit = float( cpuItems[6] )
      else:
        cpuList = cpuItems[5].split( '/' )
        cpu = float( cpuList[0] )
        cpuLimit = float( cpuList[1] )
    except ( AttributeError, IndexError, TypeError, ValueError ):
      # Stop here: the normalization below cannot be applied to missing
      # values ( dividing None would raise a TypeError ).
      self.log.warn( 'Problem parsing "%s" for CPU usage' % ( result['Value'] ) )
      msg = 'Could not determine some parameters,' \
            ' this is the stdout from the batch system call\n%s' % ( result['Value'] )
      self.log.info( msg )
      return S_ERROR( 'Could not determine some parameters' )

    # Divide the numbers by 5 to bring it to HS06 units from the CC UI units
    # and remove HS06 normalization factor
    cpu = cpu / 5. / self.scaleFactor
    cpuLimit = cpuLimit / 5. / self.scaleFactor

    # BQS has no wallclock limit so will simply return the same as for CPU to the TimeLeft utility
    consumed = {'CPU':cpu,
                'CPULimit':cpuLimit,
                'WallClock':cpu,
                'WallClockLimit':cpuLimit}
    self.log.debug( consumed )
    return S_OK( consumed )
예제 #6
0
  def __getCPUScalingFactor(self):
    """Return the ``cpu`` usage scaling factor reported by ``qconf -se`` for this host.

       Only lines containing ``usage_scaling`` are inspected; the number is
       read from a ``cpu=<number>,`` entry on such a line.

       :return: the factor as a float, or None when the command fails or no
                usable entry is found.
    """
    host = socket.getfqdn()
    cmd = 'qconf -se %s' % host
    result = runCommand( cmd )
    if not result['OK']:
      return None

    for line in result['Value'].split( '\n' ):
      if 'usage_scaling' not in line:
        continue
      # Raw string: "\d" is a regular expression escape, not a string escape
      match = re.search( r'cpu=([\d,\.]*),', line )
      if not match:
        continue
      try:
        return float( match.group( 1 ) )
      except ValueError:
        # The pattern also accepts empty or comma-containing text, which is
        # not a number: treat it as "not found" rather than failing
        continue
    return None
예제 #7
0
    def __getCPUScalingFactor(self):
        """Return the ``cpu`` usage scaling factor reported by ``qconf -se`` for this host.

        Only lines containing ``usage_scaling`` are inspected; the number is
        read from a ``cpu=<number>,`` entry on such a line.

        :return: the factor as a float, or None when the command fails or no
                 usable entry is found.
        """
        host = socket.getfqdn()
        cmd = 'qconf -se %s' % host
        result = runCommand(cmd)
        if not result['OK']:
            return None

        for line in result['Value'].split('\n'):
            if 'usage_scaling' not in line:
                continue
            # Raw string: "\d" is a regular expression escape, not a string escape
            match = re.search(r'cpu=([\d,\.]*),', line)
            if not match:
                continue
            try:
                return float(match.group(1))
            except ValueError:
                # The pattern also accepts empty or comma-containing text,
                # which is not a number: treat it as "not found"
                continue
        return None
예제 #8
0
def _getCPUScalingFactor():
  """Return the ``cpu`` usage scaling factor reported by ``qconf -se`` for this host.

     :return: the factor as a float, or None when the command fails or no
              usable ``cpu=<number>,`` entry is found on a usage_scaling line.
  """
  host = socket.getfqdn()
  cmd = 'qconf -se %s' % host
  result = runCommand( cmd )
  if not result['OK']:
    return None

  # Abridged example of the output of "qconf -se ccwsge0640".  "cpu=" also shows
  # up in the load_values entry, hence only the usage_scaling line is inspected.
  #
  #   hostname              ccwsge0640.in2p3.fr
  #   load_scaling          NONE
  #   complex_values        m_mem_free=131022.000000M,m_mem_free_n0=65486.613281M, [...]
  #   load_values           arch=lx-amd64,cpu=89.400000,fsize_used_rate=0.089, [...]
  #   processors            40
  #   user_lists            NONE
  #   xuser_lists           NONE
  #   projects              NONE
  #   xprojects             NONE
  #   usage_scaling         cpu=11.350000,acct_cpu=11.350000
  #   report_variables      NONE
  for line in str( result['Value'] ).split( '\n' ):
    if 'usage_scaling' not in line:
      continue
    # Raw string: "\d" is a regular expression escape, not a string escape
    match = re.search( r'cpu=([\d,\.]*),', line )
    if not match:
      continue
    try:
      return float( match.group( 1 ) )
    except ValueError:
      # The pattern also accepts empty or comma-containing text, which is
      # not a number: treat it as "not found" rather than failing
      continue
  return None
예제 #9
0
def _getCPUScalingFactor():
    """Return the ``cpu`` usage scaling factor reported by ``qconf -se`` for this host.

    :return: the factor as a float, or None when the command fails or no
             usable ``cpu=<number>,`` entry is found on a usage_scaling line.
    """
    host = socket.getfqdn()
    cmd = 'qconf -se %s' % host
    result = runCommand(cmd)
    if not result['OK']:
        return None

    # Abridged example of the output of "qconf -se ccwsge0640".  "cpu=" also
    # shows up in the load_values entry, hence only the usage_scaling line is
    # inspected.
    #
    #   hostname              ccwsge0640.in2p3.fr
    #   load_scaling          NONE
    #   complex_values        m_mem_free=131022.000000M,m_mem_free_n0=65486.613281M, [...]
    #   load_values           arch=lx-amd64,cpu=89.400000,fsize_used_rate=0.089, [...]
    #   processors            40
    #   user_lists            NONE
    #   xuser_lists           NONE
    #   projects              NONE
    #   xprojects             NONE
    #   usage_scaling         cpu=11.350000,acct_cpu=11.350000
    #   report_variables      NONE
    for line in str(result['Value']).split('\n'):
        if 'usage_scaling' not in line:
            continue
        # Raw string: "\d" is a regular expression escape, not a string escape
        match = re.search(r'cpu=([\d,\.]*),', line)
        if not match:
            continue
        try:
            return float(match.group(1))
        except ValueError:
            # The pattern also accepts empty or comma-containing text, which
            # is not a number: treat it as "not found" rather than failing
            continue
    return None
예제 #10
0
    def getResourceUsage(self):
        """Parse the CPU and wall clock consumption of the current LSF job.

        ``<bin>/bjobs -W <jobID>`` is run and its first two output lines are
        read as a header line and a value line.  The CPU_USED column
        (``h:m:s``) gives the consumed CPU in seconds; the START_TIME column
        (``month/day-h:m:s``, completed with self.year) gives the elapsed wall
        clock time.

        :return: S_ERROR when a prerequisite is missing or the output has an
                 unexpected layout; the failed runCommand result if the
                 command itself fails.
        """
        if not self.bin:
            return S_ERROR('Could not determine bin directory for LSF')
        if not self.hostNorm:
            return S_ERROR('Could not determine host Norm factor')

        cmd = '%s/bjobs -W %s' % (self.bin, self.jobID)
        result = runCommand(cmd)
        if not result['OK']:
            return result

        lines = str(result['Value']).split('\n')
        if len(lines) < 2:
            # A header line plus a value line are needed
            self.log.error("Failed bjobs command",
                           "%s:\n %s" % (cmd, result['Value']))
            return S_ERROR('Can not parse LSF output')
        header = lines[0].split()
        values = lines[1].split()
        if len(header) > len(values):
            self.log.error("Failed bjobs command",
                           "%s:\n %s\n %s" % (cmd, lines[0], lines[1]))
            return S_ERROR('Can not parse LSF output')

        cpu = None
        wallClock = None
        for name, value in zip(header, values):
            if name == 'CPU_USED':
                fields = value.split(':')
                try:
                    cpu = float(fields[0]) * 3600 + float(
                        fields[1]) * 60 + float(fields[2])
                except (ValueError, IndexError):
                    # Both exception types must be listed in a tuple:
                    # "except ValueError, IndexError" only catches ValueError
                    pass
            elif name == 'START_TIME':
                try:
                    timeTup = time.strptime('%s %s' % (value, self.year),
                                            '%m/%d-%H:%M:%S %Y')
                    wallClock = time.mktime(
                        time.localtime()) - time.mktime(timeTup)
                except ValueError:
                    pass
        # NOTE(review): nothing is returned after the parsing above, so callers
        # get None on this path - confirm whether the rest of the method is
        # missing from this copy.
예제 #11
0
  def getResourceUsage( self ):
    """Parse the CPU and wall clock consumption of the current LSF job.

       ``<bin>/bjobs -W <jobID>`` is run and its first two output lines are read
       as a header line and a value line.  The CPU_USED column ( ``h:m:s`` ) gives
       the consumed CPU in seconds; the START_TIME column ( ``month/day-h:m:s``,
       completed with self.year ) gives the elapsed wall clock time.

       :return: S_ERROR when a prerequisite is missing or the output has an
                unexpected layout; the failed runCommand result if the command
                itself fails.
    """
    if not self.bin:
      return S_ERROR( 'Could not determine bin directory for LSF' )
    if not self.hostNorm:
      return S_ERROR( 'Could not determine host Norm factor' )

    cmd = '%s/bjobs -W %s' % ( self.bin, self.jobID )
    result = runCommand( cmd )
    if not result['OK']:
      return result

    lines = str( result['Value'] ).split( '\n' )
    if len( lines ) < 2:
      # A header line plus a value line are needed
      self.log.error( "Failed bjobs command", "%s:\n %s" % ( cmd, result['Value'] ) )
      return S_ERROR( 'Can not parse LSF output' )
    header = lines[0].split()
    values = lines[1].split()
    if len( header ) > len( values ):
      self.log.error( "Failed bjobs command", "%s:\n %s\n %s" % ( cmd, lines[0], lines[1] ) )
      return S_ERROR( 'Can not parse LSF output' )

    cpu = None
    wallClock = None
    for name, value in zip( header, values ):
      if name == 'CPU_USED':
        fields = value.split( ':' )
        try:
          cpu = float( fields[0] ) * 3600 + float( fields[1] ) * 60 + float( fields[2] )
        except ( ValueError, IndexError ):
          # Both exception types must be listed in a tuple:
          # "except ValueError, IndexError" only catches ValueError
          pass
      elif name == 'START_TIME':
        try:
          timeTup = time.strptime( '%s %s' % ( value, self.year ), '%m/%d-%H:%M:%S %Y' )
          wallClock = time.mktime( time.localtime() ) - time.mktime( timeTup )
        except ValueError:
          pass
    # NOTE(review): nothing is returned after the parsing above, so callers get
    # None on this path - confirm whether the rest of the method is missing
    # from this copy.
예제 #12
0
파일: LSFTimeLeft.py 프로젝트: vingar/DIRAC
    def __init__(self):
        """Collect the LSF limits and CPU normalization for the current job slot.

        LSB_JOBID, LSB_QUEUE, LSF_BINDIR and LSB_HOSTS are read from the
        environment, then:

        * ``bqueues -l <queue>`` is parsed for the values following the
          CPULIMIT and RUNLIMIT headers (multiplied by 60) and for the
          reference host/model of the CPU limit;
        * the CPU factor of that reference is looked up, trying in turn
          ``lshosts -w``, ``lsinfo -m`` and the HostModel section of
          ego.shared / lsf.shared, with 1.0 as the fallback;
        * the CPU factor of the current host is read with ``lshosts -w`` and
          divided by the reference factor;
        * the limits that were found are divided by the resulting host factor.

        Attributes that cannot be determined stay None; a limit whose value
        line cannot be parsed is set to -1.  If the bqueues command fails the
        constructor returns early with all of them None.
        """
        self.log = gLogger.getSubLogger('LSFTimeLeft')
        self.jobID = os.environ.get('LSB_JOBID')
        self.queue = os.environ.get('LSB_QUEUE')
        self.bin = os.environ.get('LSF_BINDIR')
        self.host = os.environ.get('LSB_HOSTS')
        self.year = time.strftime('%Y', time.gmtime())
        self.log.verbose(
            'LSB_JOBID=%s, LSB_QUEUE=%s, LSF_BINDIR=%s, LSB_HOSTS=%s' %
            (self.jobID, self.queue, self.bin, self.host))

        self.cpuLimit = None
        self.cpuRef = None
        self.normRef = None
        self.wallClockLimit = None
        self.hostNorm = None

        cmd = '%s/bqueues -l %s' % (self.bin, self.queue)
        result = runCommand(cmd)
        if not result['OK']:
            return

        lines = str(result['Value']).split('\n')
        self.log.debug(
            'From %s' % cmd, '\n'.join([
                line if len(line) <= 128 else line[:128] + ' [...]'
                for line in lines
            ]))
        # The value of a limit is on the line following its header; pairing
        # each line with its successor never reads past the end of the output
        for headerLine, valueLine in zip(lines, lines[1:]):
            if re.search('.*CPULIMIT.*', headerLine):
                info = valueLine.split()
                if len(info) >= 4:
                    self.cpuLimit = float(info[0]) * 60
                    self.cpuRef = info[3]
                elif len(info) == 2 and info[1] == "min":
                    self.cpuLimit = float(info[0]) * 60
                    self.cpuRef = None
                else:
                    self.log.warn('Problem parsing "%s" for CPU limit' %
                                  valueLine)
                    self.cpuLimit = -1
            elif re.search('.*RUNLIMIT.*', headerLine):
                info = valueLine.split()
                if len(info) >= 1:
                    self.wallClockLimit = float(info[0]) * 60
                else:
                    self.log.warn('Problem parsing "%s" for wall clock limit' %
                                  valueLine)
                    self.wallClockLimit = -1

        modelMaxNorm = 0
        if self.cpuRef:
            # Now try to get the CPU_FACTOR for this reference CPU,
            # it must be either a Model, a Host or the largest Model

            cmd = '%s/lshosts -w %s' % (self.bin, self.cpuRef)
            result = runCommand(cmd)
            if result['OK']:
                # At CERN this command will return an error since there is no host defined
                # with the name of the reference Host.
                lines = str(result['Value']).split('\n')
                secondLine = lines[1] if len(lines) > 1 else ''
                l1 = lines[0].split()
                l2 = secondLine.split()
                if len(l1) > len(l2):
                    self.log.error("Failed lshost command",
                                   "%s:\n %s\n %s" % (cmd, lines[0], secondLine))
                else:
                    for name, value in zip(l1, l2):
                        if name == 'cpuf':
                            try:
                                self.normRef = float(value)
                                self.log.info(
                                    'Reference Normalization taken from Host',
                                    '%s: %s' % (self.cpuRef, self.normRef))
                            except ValueError as e:
                                self.log.exception(
                                    'Exception parsing lshosts output', '', e)

            if not self.normRef:
                # Try if there is a model define with the name of cpuRef
                cmd = '%s/lsinfo -m' % (self.bin)
                result = runCommand(cmd)
                if result['OK']:
                    lines = str(result['Value']).split('\n')
                    for line in lines[1:]:
                        words = line.split()
                        if len(words) > 1:
                            try:
                                norm = float(words[1])
                                if norm > modelMaxNorm:
                                    modelMaxNorm = norm
                                if self.cpuRef in words[0]:
                                    self.normRef = norm
                                    self.log.info(
                                        'Reference Normalization taken from Host Model',
                                        '%s: %s' % (self.cpuRef, self.normRef))
                            except ValueError as e:
                                self.log.exception(
                                    'Exception parsing lsfinfo output', '', e)

            if not self.normRef:
                # Now parse LSF configuration files
                if not os.path.isfile('./lsf.sh'):
                    os.symlink(
                        os.path.join(os.environ['LSF_ENVDIR'], 'lsf.conf'),
                        './lsf.sh')
                # As the variables are not exported, we must force it
                ret = sourceEnv(10, ['./lsf', '&& export LSF_CONFDIR'])
                if ret['OK']:
                    lsfEnv = ret['outputEnv']
                    shared = None
                    try:
                        egoShared = os.path.join(lsfEnv['LSF_CONFDIR'],
                                                 'ego.shared')
                        lsfShared = os.path.join(lsfEnv['LSF_CONFDIR'],
                                                 'lsf.shared')
                        if os.path.exists(egoShared):
                            shared = egoShared
                        elif os.path.exists(lsfShared):
                            shared = lsfShared
                    except KeyError as e:
                        self.log.exception(
                            'Exception getting LSF configuration', '', e)
                    if shared:
                        with open(shared) as f:
                            hostModelSection = False
                            for line in f:
                                if line.startswith('Begin HostModel'):
                                    hostModelSection = True
                                    continue
                                if not hostModelSection:
                                    continue
                                if line.startswith('End HostModel'):
                                    break
                                line = line.strip()
                                if line and line.split()[0] == self.cpuRef:
                                    try:
                                        self.normRef = float(line.split()[1])
                                        self.log.info(
                                            'Reference Normalization taken from Configuration File',
                                            '(%s) %s: %s' %
                                            (shared, self.cpuRef,
                                             self.normRef))
                                    except (ValueError, IndexError) as e:
                                        # IndexError: the model name is alone on its line
                                        self.log.exception(
                                            'Exception reading LSF configuration',
                                            '', e)
                    else:
                        self.log.warn('Could not find LSF configuration')
                else:
                    self.log.error('Cannot source the LSF environment',
                                   ret['Message'])
        if not self.normRef:
            # If nothing works take this as the unit
            self.normRef = 1.
            # If nothing worked, take the maximum defined for a Model
            # if modelMaxNorm:
            #  self.normRef = modelMaxNorm
            #  self.log.info( 'Reference Normalization taken from Max Model:', self.normRef )

        # Now get the Normalization for the current Host
        if self.host:
            cmd = '%s/lshosts -w %s' % (self.bin, self.host)
            result = runCommand(cmd)
            if result['OK']:
                lines = str(result['Value']).split('\n')
                secondLine = lines[1] if len(lines) > 1 else ''
                l1 = lines[0].split()
                l2 = secondLine.split()
                if len(l1) > len(l2):
                    self.log.error("Failed lshost command",
                                   "%s:\n %s\n %s" % (cmd, lines[0], secondLine))
                else:
                    for name, value in zip(l1, l2):
                        if name == 'cpuf':
                            try:
                                self.hostNorm = float(value)
                                self.log.info(
                                    'Host Normalization',
                                    '%s: %s' % (self.host, self.hostNorm))
                            except ValueError as e:
                                self.log.exception(
                                    'Exception parsing lshosts output', l1, e)
                            # Only the first cpuf column is considered
                            break

            if self.hostNorm and self.normRef:
                self.hostNorm /= self.normRef
                self.log.info('CPU power w.r.t. batch unit', self.hostNorm)

            if self.hostNorm:
                # Set the limits in real seconds; a limit that was not found
                # in the bqueues output stays None
                if self.cpuLimit is not None:
                    self.cpuLimit /= self.hostNorm
                if self.wallClockLimit is not None:
                    self.wallClockLimit /= self.hostNorm
예제 #13
0
  def getResourceUsage( self ):
    """Return the CPU and wall clock figures of the current job from ``qstat -f -j <jobID>``.

       The consumed CPU is the largest ``cpu=`` value ( ``h:m:s`` or ``d:h:m:s`` )
       found on the usage lines, divided by the factor returned by
       _getCPUScalingFactor() when there is one.  The limits are read from the
       ``_cpu=`` and ``_rt=`` entries of the "hard resource_list" line.  The
       wall clock consumption is not available from this output.

       When at least one limit is known, the missing figures are estimated: the
       other limit with a factor 0.8 between CPU and wall clock, and the
       consumption as the time elapsed since self.startTime.

       :return: S_OK( dict ) with the keys CPU, CPULimit, WallClock and
                WallClockLimit; if no limit could be determined, an S_ERROR
                structure whose 'Value' entry carries the partial dictionary.
                If the command itself fails its result is returned unchanged.
    """
    cmd = 'qstat -f -j %s' % ( self.jobID )
    result = runCommand( cmd )
    if not result['OK']:
      return result

    cpu = None
    cpuLimit = None
    wallClock = None
    wallClockLimit = None

    lines = str( result['Value'] ).split( '\n' )
    for line in lines:
      if re.search( 'usage.*cpu.*', line ):
        match = re.search( r'cpu=([\d,:]*),', line )
        if not match:
          # No cpu figure on this usage line: there is nothing to convert
          continue
        cpuList = match.groups()[0].split( ':' )
        try:
          newcpu = 0.
          if len( cpuList ) == 3:
            newcpu = ( float( cpuList[0] ) * 60 + float( cpuList[1] ) ) * 60 + float( cpuList[2] )
          elif len( cpuList ) == 4:
            newcpu = ( ( float( cpuList[0] ) * 24 + float( cpuList[1] ) ) * 60 + float( cpuList[2] ) ) * 60 + float( cpuList[3] )
          # Keep the largest value seen over all the usage lines
          if not cpu or newcpu > cpu:
            cpu = newcpu
        except ValueError:
          self.log.warn( 'Problem parsing "%s" for CPU consumed' % line )
      elif re.search( 'hard resource_list.*cpu.*', line ):
        match = re.search( r'_cpu=(\d*)', line )
        if match:
          cpuLimit = float( match.groups()[0] )
        match = re.search( r'_rt=(\d*)', line )
        if match:
          wallClockLimit = float( match.groups()[0] )

    # Some SGE batch systems apply CPU scaling factor to the CPU consumption figures
    if cpu:
      factor = _getCPUScalingFactor()
      if factor:
        cpu = cpu / factor

    consumed = {'CPU':cpu, 'CPULimit':cpuLimit, 'WallClock':wallClock, 'WallClockLimit':wallClockLimit}

    if None not in consumed.values():
      # This cannot happen as we can't get wallClock from anywhere
      self.log.debug( "TimeLeft counters complete:", str( consumed ) )
      return S_OK( consumed )
    else:
      missed = [key for key, val in consumed.items() if val is None]
      self.log.info( 'Could not determine parameter', ','.join( missed ) )
      self.log.debug( 'This is the stdout from the batch system call\n%s' % ( result['Value'] ) )

    if cpuLimit or wallClockLimit:
      # We have got a partial result from SGE
      if not cpuLimit:
        # Take some margin
        consumed['CPULimit'] = wallClockLimit * 0.8
      if not wallClockLimit:
        consumed['WallClockLimit'] = cpuLimit / 0.8
      if not cpu:
        consumed['CPU'] = time.time() - self.startTime
      if not wallClock:
        consumed['WallClock'] = time.time() - self.startTime
      self.log.debug( "TimeLeft counters restored:", str( consumed ) )
      return S_OK( consumed )
    else:
      msg = 'Could not determine some parameters'
      self.log.info( msg, ':\nThis is the stdout from the batch system call\n%s' % ( result['Value'] ) )
      retVal = S_ERROR( msg )
      # Callers still get the values that could be determined
      retVal['Value'] = consumed
      return retVal
예제 #14
0
  def getResourceUsage( self ):
    """Returns a dictionary containing CPUConsumed, CPULimit, WallClockConsumed
       and WallClockLimit for current slot.  All values returned in seconds.

       The consumed values are parsed from the CPU_USED and START_TIME columns
       printed by "bjobs -W <jobID>" and multiplied by self.hostNorm; the limits
       are taken from self.cpuLimit and self.wallClockLimit.

       :return: S_OK( dict ) with the keys CPU, CPULimit, WallClock and
                WallClockLimit; S_ERROR if self.bin or self.hostNorm is not
                set, the command fails, its output cannot be parsed or one of
                the four values is unknown
    """
    if not self.bin:
      return S_ERROR( 'Could not determine bin directory for LSF' )
    if not self.hostNorm:
      return S_ERROR( 'Could not determine host Norm factor' )

    cmd = '%s/bjobs -W %s' % ( self.bin, self.jobID )
    result = runCommand( cmd )
    if not result['OK']:
      return result

    # The first line of the output holds the column names, the second one the
    # values. A missing second line is handled as a line without any value.
    lines = result['Value'].split( '\n' )
    header = lines[0].split()
    values = lines[1].split() if len( lines ) > 1 else []
    if len( header ) > len( values ):
      self.log.error( cmd )
      for line in lines[:2]:
        self.log.error( line )
      return S_ERROR( 'Can not parse LSF output' )

    cpu = None
    wallClock = None
    for name, value in zip( header, values ):
      if name == 'CPU_USED':
        # The value is read as hours:minutes:seconds
        fields = value.split( ':' )
        try:
          cpu = float( fields[0] ) * 3600 + float( fields[1] ) * 60 + float( fields[2] )
        except ( ValueError, IndexError ):
          self.log.warn( 'Problem parsing "%s" for CPU consumed' % value )
      elif name == 'START_TIME':
        # The value carries no year, the one stored in self.year is appended.
        # NOTE(review): a job started before a year change would get a start
        # time in the future if self.year is the new year - confirm whether
        # this can happen for the way self.year is set.
        started = '%s %s' % ( value, self.year )
        try:
          startEpoch = time.mktime( time.strptime( started, '%m/%d-%H:%M:%S %Y' ) )
          wallClock = time.mktime( time.localtime() ) - startEpoch
        except ( ValueError, OverflowError ):
          self.log.warn( 'Problem parsing "%s" for job start time' % started )

    if cpu is None or wallClock is None:
      return S_ERROR( 'Failed to parse LSF output' )

    cpu = cpu * self.hostNorm
    wallClock = wallClock * self.hostNorm

    consumed = {'CPU':cpu, 'CPULimit':self.cpuLimit, 'WallClock':wallClock, 'WallClockLimit':self.wallClockLimit}
    self.log.debug( consumed )
    missing = [key for key, val in consumed.items() if val is None]
    if not missing:
      return S_OK( consumed )

    for key in missing:
      self.log.warn( 'Could not determine %s' % key )
    msg = 'Could not determine some parameters,' \
          ' this is the stdout from the batch system call\n%s' % ( result['Value'] )
    self.log.info( msg )
    return S_ERROR( 'Could not determine some parameters' )
예제 #15
0
  def getResourceUsage( self ):
    """Returns a dictionary containing CPUConsumed, CPULimit, WallClockConsumed
       and WallClockLimit for current slot.  All values returned in seconds.

       The values are parsed from the output of "qstat -f <jobID>".  For the
       CPU consumption the largest of the cput and pcput figures is kept, for
       the CPU limit the smallest one.  When some values are missing but at
       least one limit is known, the missing limit is copied from the known
       one and a missing consumption is replaced by the time elapsed since
       self.startTime.

       :return: S_OK( dict ) with the keys CPU, CPULimit, WallClock and
                WallClockLimit; the result of the failed command; or S_ERROR
                carrying the incomplete dictionary in 'Value' when no limit
                could be determined
    """
    cmd = 'qstat -f %s' % ( self.jobID )
    result = runCommand( cmd )
    if not result['OK']:
      return result

    def parseTime( line, what ):
      """ Convert the hh:mm:ss value found in the third token of line to
          seconds. A warning is logged and None returned when the token is
          missing or is not made of three numeric fields.
      """
      info = line.split()
      try:
        fields = info[2].split( ':' )
        return ( float( fields[0] ) * 60 + float( fields[1] ) ) * 60 + float( fields[2] )
      except ( ValueError, IndexError ):
        self.log.warn( 'Problem parsing "%s" for %s' % ( line, what ) )
        return None

    cpu = None
    cpuLimit = None
    wallClock = None
    wallClockLimit = None

    for line in str( result['Value'] ).split( '\n' ):
      # CPU consumed: keep the largest of the reported figures
      for pattern in ( '.*resources_used.cput.*', '.*resources_used.pcput.*' ):
        if re.search( pattern, line ):
          newcpu = parseTime( line, 'CPU consumed' )
          if newcpu is not None and ( not cpu or newcpu > cpu ):
            cpu = newcpu
      if re.search( '.*resources_used.walltime.*', line ):
        newWallClock = parseTime( line, 'elapsed wall clock time' )
        if newWallClock is not None:
          wallClock = newWallClock
      # CPU limit: keep the smallest of the reported figures
      for pattern in ( '.*Resource_List.cput.*', '.*Resource_List.pcput.*' ):
        if re.search( pattern, line ):
          newcpuLimit = parseTime( line, 'CPU limit' )
          if newcpuLimit is not None and ( not cpuLimit or newcpuLimit < cpuLimit ):
            cpuLimit = newcpuLimit
      if re.search( '.*Resource_List.walltime.*', line ):
        newWallClockLimit = parseTime( line, 'wall clock limit' )
        if newWallClockLimit is not None:
          wallClockLimit = newWallClockLimit

    consumed = {'CPU':cpu, 'CPULimit':cpuLimit, 'WallClock':wallClock, 'WallClockLimit':wallClockLimit}
    self.log.debug( consumed )

    if None not in consumed.values():
      self.log.debug( "TimeLeft counters complete:", str( consumed ) )
      return S_OK( consumed )

    if cpuLimit or wallClockLimit:
      # We have got a partial result from PBS, assume that we ran for too short time
      if not cpuLimit:
        consumed['CPULimit'] = wallClockLimit
      if not wallClockLimit:
        consumed['WallClockLimit'] = cpuLimit
      if not cpu:
        consumed['CPU'] = int( time.time() - self.startTime )
      if not wallClock:
        consumed['WallClock'] = int( time.time() - self.startTime )
      self.log.debug( "TimeLeft counters restored:", str( consumed ) )
      return S_OK( consumed )

    # No limit at all is known: report the failure together with what was found
    msg = 'Could not determine some parameters'
    self.log.info( msg, ':\nThis is the stdout from the batch system call\n%s' % ( result['Value'] ) )
    retVal = S_ERROR( msg )
    retVal['Value'] = consumed
    return retVal
예제 #16
0
    def getResourceUsage(self):
        """Returns a dictionary containing CPUConsumed, CPULimit, WallClockConsumed
       and WallClockLimit for current slot.  All values returned in seconds.

       The values are parsed from the output of "qstat -f -j <jobID>".  The
       CPU consumption is read from the "usage" line, as hh:mm:ss or
       d:hh:mm:ss, and divided by the CPU scaling factor when one is defined.
       The limits are read from the "_cpu=" and "_rt=" items of the
       "hard resource_list" line.  The wall clock consumption is not read from
       this output: when at least one limit is known it is replaced, like a
       missing CPU consumption, by the time elapsed since self.startTime.

       :return: S_OK( dict ) with the keys CPU, CPULimit, WallClock and
                WallClockLimit; the result of the failed command; or S_ERROR
                carrying the incomplete dictionary in 'Value' when no limit
                could be determined
    """
        cmd = 'qstat -f -j %s' % (self.jobID)
        result = runCommand(cmd)
        if not result['OK']:
            return result

        # Lines of interest in the output of "qstat -f -j $JOB_ID" look like:
        #   hard resource_list:         os=sl5,s_cpu=165600,s_vmem=5120M,s_fsize=51200M,cvmfs=1,dcache=1
        #   usage    1:                 cpu=00:00:07, mem=0.03044 GBs, io=0.19846, vmem=288.609M, maxvmem=288.609M

        cpu = None
        cpuLimit = None
        # Nothing below assigns the wall clock consumption
        wallClock = None
        wallClockLimit = None

        for line in result['Value'].split('\n'):
            if re.search('usage.*cpu.*', line):
                newcpu = None
                match = re.search(r'cpu=([\d,:]*),', line)
                if match:
                    try:
                        fields = [float(item) for item in match.group(1).split(':')]
                    except ValueError:
                        fields = []
                    if len(fields) == 3:
                        # hh:mm:ss
                        newcpu = (fields[0] * 60 + fields[1]) * 60 + fields[2]
                    elif len(fields) == 4:
                        # d:hh:mm:ss
                        newcpu = ((fields[0] * 24 + fields[1]) * 60 +
                                  fields[2]) * 60 + fields[3]
                if newcpu is None:
                    self.log.warn('Problem parsing "%s" for CPU consumed' %
                                  line)
                elif not cpu or newcpu > cpu:
                    cpu = newcpu
            if re.search('hard resource_list.*cpu.*', line):
                # At least one digit is required so that float() cannot fail
                match = re.search(r'_cpu=(\d+)', line)
                if match:
                    cpuLimit = float(match.group(1))
                match = re.search(r'_rt=(\d+)', line)
                if match:
                    wallClockLimit = float(match.group(1))

        # Some SGE batch systems apply CPU scaling factor to the CPU consumption figures
        if cpu:
            factor = self.__getCPUScalingFactor()
            if factor:
                cpu = cpu / factor

        consumed = {
            'CPU': cpu,
            'CPULimit': cpuLimit,
            'WallClock': wallClock,
            'WallClockLimit': wallClockLimit
        }
        self.log.debug(consumed)
        missing = [key for key, val in consumed.items() if val is None]
        for key in missing:
            self.log.warn('Could not determine %s' % key)

        if not missing:
            return S_OK(consumed)

        if cpuLimit or wallClockLimit:
            # We have got a partial result from SGE
            if not cpuLimit:
                consumed['CPULimit'] = wallClockLimit
            if not wallClockLimit:
                consumed['WallClockLimit'] = cpuLimit
            if not cpu:
                consumed['CPU'] = time.time() - self.startTime
            if not wallClock:
                consumed['WallClock'] = time.time() - self.startTime
            self.log.debug("TimeLeft counters restored: " + str(consumed))
            return S_OK(consumed)

        # No limit at all is known: report the failure together with what was found
        self.log.info(
            'Could not determine some parameters, this is the stdout from the batch system call\n%s'
            % (result['Value']))
        retVal = S_ERROR('Could not determine some parameters')
        retVal['Value'] = consumed
        return retVal
예제 #17
0
    def getResourceUsage(self):
        """Returns a dictionary containing CPUConsumed, CPULimit, WallClockConsumed
       and WallClockLimit for current slot.  All values returned in seconds.

       The values are parsed from the output of "qstat -f <jobID>".  For the
       CPU consumption the largest of the cput and pcput figures is kept, for
       the CPU limit the smallest one.  When some values are missing but at
       least one limit is known, the missing limit is derived from the known
       one with a 0.8 ratio between CPU and wall clock, and a missing
       consumption is replaced by the time elapsed since self.startTime.

       :return: S_OK( dict ) with the keys CPU, CPULimit, WallClock and
                WallClockLimit; the result of the failed command; or S_ERROR
                carrying the incomplete dictionary in 'Value' when no limit
                could be determined
    """
        cmd = 'qstat -f %s' % (self.jobID)
        result = runCommand(cmd)
        if not result['OK']:
            return result

        def parseTime(line, what):
            """ Convert the hh:mm:ss value found in the third token of line to
                seconds. A warning is logged and None returned when the token
                is missing or is not made of three numeric fields.
            """
            info = line.split()
            try:
                fields = info[2].split(':')
                return (float(fields[0]) * 60 +
                        float(fields[1])) * 60 + float(fields[2])
            except (ValueError, IndexError):
                self.log.warn('Problem parsing "%s" for %s' % (line, what))
                return None

        cpu = None
        cpuLimit = None
        wallClock = None
        wallClockLimit = None

        for line in str(result['Value']).split('\n'):
            # CPU consumed: keep the largest of the reported figures
            for pattern in ('.*resources_used.cput.*',
                            '.*resources_used.pcput.*'):
                if re.search(pattern, line):
                    newcpu = parseTime(line, 'CPU consumed')
                    if newcpu is not None and (not cpu or newcpu > cpu):
                        cpu = newcpu
            if re.search('.*resources_used.walltime.*', line):
                newWallClock = parseTime(line, 'elapsed wall clock time')
                if newWallClock is not None:
                    wallClock = newWallClock
            # CPU limit: keep the smallest of the reported figures
            for pattern in ('.*Resource_List.cput.*',
                            '.*Resource_List.pcput.*'):
                if re.search(pattern, line):
                    newcpuLimit = parseTime(line, 'CPU limit')
                    if newcpuLimit is not None and (
                            not cpuLimit or newcpuLimit < cpuLimit):
                        cpuLimit = newcpuLimit
            if re.search('.*Resource_List.walltime.*', line):
                newWallClockLimit = parseTime(line, 'wall clock limit')
                if newWallClockLimit is not None:
                    wallClockLimit = newWallClockLimit

        consumed = {
            'CPU': cpu,
            'CPULimit': cpuLimit,
            'WallClock': wallClock,
            'WallClockLimit': wallClockLimit
        }
        self.log.debug(consumed)

        if None not in consumed.values():
            self.log.debug("TimeLeft counters complete:", str(consumed))
            return S_OK(consumed)

        missed = [key for key, val in consumed.items() if val is None]
        self.log.info('Could not determine parameter', ','.join(missed))
        self.log.debug('This is the stdout from the batch system call\n%s' %
                       (result['Value']))

        if cpuLimit or wallClockLimit:
            # We have got a partial result from PBS, assume that we ran for too short time
            if not cpuLimit:
                consumed['CPULimit'] = wallClockLimit * 0.8
            if not wallClockLimit:
                consumed['WallClockLimit'] = cpuLimit / 0.8
            if not cpu:
                consumed['CPU'] = int(time.time() - self.startTime)
            if not wallClock:
                consumed['WallClock'] = int(time.time() - self.startTime)
            self.log.debug("TimeLeft counters restored:", str(consumed))
            return S_OK(consumed)

        # No limit at all is known: report the failure together with what was found
        msg = 'Could not determine some parameters'
        self.log.info(
            msg, ':\nThis is the stdout from the batch system call\n%s' %
            (result['Value']))
        retVal = S_ERROR(msg)
        retVal['Value'] = consumed
        return retVal
예제 #18
0
    def getResourceUsage(self):
        """Returns a dictionary containing CPUConsumed, CPULimit, WallClockConsumed
       and WallClockLimit for current slot.  All values returned in seconds.

       The values are parsed from the resources_used and Resource_List lines
       (cput and walltime) of the output of "qstat -f <jobID>"; each of them
       is read as hh:mm:ss from the third token of its line.

       :return: S_OK( dict ) with the keys CPU, CPULimit, WallClock and
                WallClockLimit; the result of the failed command; or S_ERROR
                carrying the incomplete dictionary in 'Value' when a value
                could not be determined
    """
        cmd = 'qstat -f %s' % (self.jobID)
        result = runCommand(cmd)
        if not result['OK']:
            return result

        consumed = {
            'CPU': None,
            'CPULimit': None,
            'WallClock': None,
            'WallClockLimit': None
        }
        # ( pattern selecting the line, key to fill, wording for the warning )
        fields = (
            ('.*resources_used.cput.*', 'CPU', 'CPU consumed'),
            ('.*resources_used.walltime.*', 'WallClock',
             'elapsed wall clock time'),
            ('.*Resource_List.cput.*', 'CPULimit', 'CPU limit'),
            ('.*Resource_List.walltime.*', 'WallClockLimit',
             'wall clock limit'),
        )

        for line in result['Value'].split('\n'):
            for pattern, key, what in fields:
                if not re.search(pattern, line):
                    continue
                info = line.split()
                try:
                    hms = info[2].split(':')
                    consumed[key] = (float(hms[0]) * 60 +
                                     float(hms[1])) * 60 + float(hms[2])
                except (ValueError, IndexError):
                    # Token missing or not made of three numeric fields
                    self.log.warn('Problem parsing "%s" for %s' % (line, what))

        self.log.debug(consumed)
        missing = [key for key, val in consumed.items() if val is None]
        if not missing:
            return S_OK(consumed)

        for key in missing:
            self.log.warn('Could not determine %s' % key)
        self.log.info(
            'Could not determine some parameters, this is the stdout from the batch system call\n%s'
            % (result['Value']))
        retVal = S_ERROR('Could not determine some parameters')
        retVal['Value'] = consumed
        return retVal
예제 #19
0
  def getResourceUsage( self ):
    """Returns a dictionary containing CPUConsumed, CPULimit, WallClockConsumed
       and WallClockLimit for current slot.  All values returned in seconds.

       The values are parsed from the output of "qstat -f -j <jobID>".  The
       CPU consumption is read from the "usage" line, as hh:mm:ss or
       d:hh:mm:ss, and divided by the CPU scaling factor when one is defined.
       The limits are read from the "_cpu=" and "_rt=" items of the
       "hard resource_list" line.  The wall clock consumption is not read from
       this output: when at least one limit is known it is replaced, like a
       missing CPU consumption, by the time elapsed since self.startTime.

       :return: S_OK( dict ) with the keys CPU, CPULimit, WallClock and
                WallClockLimit; the result of the failed command; or S_ERROR
                carrying the incomplete dictionary in 'Value' when no limit
                could be determined
    """
    cmd = 'qstat -f -j %s' % ( self.jobID )
    result = runCommand( cmd )
    if not result['OK']:
      return result

    # Lines of interest in the output of "qstat -f -j $JOB_ID" look like:
    #   hard resource_list:         os=sl5,s_cpu=165600,s_vmem=5120M,s_fsize=51200M,cvmfs=1,dcache=1
    #   usage    1:                 cpu=00:00:07, mem=0.03044 GBs, io=0.19846, vmem=288.609M, maxvmem=288.609M

    cpu = None
    cpuLimit = None
    # Nothing below assigns the wall clock consumption
    wallClock = None
    wallClockLimit = None

    for line in result['Value'].split( '\n' ):
      if re.search( 'usage.*cpu.*', line ):
        newcpu = None
        match = re.search( r'cpu=([\d,:]*),', line )
        if match:
          try:
            fields = [float( item ) for item in match.group( 1 ).split( ':' )]
          except ValueError:
            fields = []
          if len( fields ) == 3:
            # hh:mm:ss
            newcpu = ( fields[0] * 60 + fields[1] ) * 60 + fields[2]
          elif len( fields ) == 4:
            # d:hh:mm:ss
            newcpu = ( ( fields[0] * 24 + fields[1] ) * 60 + fields[2] ) * 60 + fields[3]
        if newcpu is None:
          # A line that cannot be understood no longer resets the CPU to 0
          self.log.warn( 'Problem parsing "%s" for CPU consumed' % line )
        elif not cpu or newcpu > cpu:
          cpu = newcpu
      if re.search( 'hard resource_list.*cpu.*', line ):
        # At least one digit is required so that float() cannot fail
        match = re.search( r'_cpu=(\d+)', line )
        if match:
          cpuLimit = float( match.group( 1 ) )
        match = re.search( r'_rt=(\d+)', line )
        if match:
          wallClockLimit = float( match.group( 1 ) )

    # Some SGE batch systems apply CPU scaling factor to the CPU consumption figures
    if cpu:
      factor = self.__getCPUScalingFactor()
      if factor:
        cpu = cpu / factor

    consumed = {'CPU':cpu, 'CPULimit':cpuLimit, 'WallClock':wallClock, 'WallClockLimit':wallClockLimit}
    self.log.debug( consumed )
    missing = [key for key, val in consumed.items() if val is None]
    for key in missing:
      self.log.warn( 'Could not determine %s' % key )

    if not missing:
      return S_OK( consumed )

    if cpuLimit or wallClockLimit:
      # We have got a partial result from SGE
      if not cpuLimit:
        consumed['CPULimit'] = wallClockLimit
      if not wallClockLimit:
        consumed['WallClockLimit'] = cpuLimit
      if not cpu:
        consumed['CPU'] = time.time() - self.startTime
      if not wallClock:
        consumed['WallClock'] = time.time() - self.startTime
      self.log.debug( "TimeLeft counters restored: " + str( consumed ) )
      return S_OK( consumed )

    # No limit at all is known: report the failure together with what was found
    self.log.info( 'Could not determine some parameters, this is the stdout from the batch system call\n%s' % ( result['Value'] ) )
    retVal = S_ERROR( 'Could not determine some parameters' )
    retVal['Value'] = consumed
    return retVal
예제 #20
0
  def __init__( self ):
    """ Standard constructor

        Reads LSB_JOBID, LSB_QUEUE, LSF_BINDIR and LSB_HOSTS from the
        environment and then tries to fill:
          - self.cpuLimit and self.wallClockLimit: the values printed by
            "bqueues -l <queue>" after CPULIMIT and RUNLIMIT, multiplied by 60
          - self.cpuRef: the fourth token of the CPU limit line
          - self.normRef: the CPU factor of self.cpuRef, looked for in turn in
            "lshosts -w", "lsinfo -m", the HostModel section of the LSF shared
            configuration file and, as last resort, the largest factor
            listed by "lsinfo -m"
          - self.hostNorm: the CPU factor of the current host, divided by
            self.normRef when both are known
        Values that cannot be determined are left to None, except
        self.cpuLimit which is set to -1 when its line cannot be parsed.
        Nothing but the environment is read when the bqueues command fails.
    """
    self.log = gLogger.getSubLogger( 'LSFTimeLeft' )
    self.jobID = os.environ.get( 'LSB_JOBID' )
    self.queue = os.environ.get( 'LSB_QUEUE' )
    self.bin = os.environ.get( 'LSF_BINDIR' )
    self.host = os.environ.get( 'LSB_HOSTS' )
    self.year = time.strftime( '%Y', time.gmtime() )
    self.log.verbose( 'LSB_JOBID=%s, LSB_QUEUE=%s, LSF_BINDIR=%s, LSB_HOSTS=%s' % ( self.jobID,
                                                                                    self.queue,
                                                                                    self.bin,
                                                                                    self.host ) )

    self.cpuLimit = None
    self.cpuRef = None
    self.normRef = None
    self.wallClockLimit = None
    self.hostNorm = None

    cmd = '%s/bqueues -l %s' % ( self.bin, self.queue )
    result = runCommand( cmd )
    if not result['OK']:
      return

    self.log.debug( 'From %s' % cmd, result['Value'] )
    lines = str( result['Value'] ).split( '\n' )
    for i, line in enumerate( lines ):
      # The values are printed on the line following the keyword; a keyword
      # found on the last line is handled as followed by an empty line
      nextLine = lines[i + 1] if i + 1 < len( lines ) else ''
      if re.search( '.*CPULIMIT.*', line ):
        info = nextLine.split()
        try:
          cpuLimit = float( info[0] ) * 60
          cpuRef = info[3]
        except ( ValueError, IndexError ):
          self.log.warn( 'Problem parsing "%s" for CPU limit' % nextLine )
          self.cpuLimit = -1
        else:
          self.cpuLimit = cpuLimit
          self.cpuRef = cpuRef
      if re.search( '.*RUNLIMIT.*', line ):
        info = nextLine.split()
        try:
          self.wallClockLimit = float( info[0] ) * 60
        except ( ValueError, IndexError ):
          self.log.warn( 'Problem parsing "%s" for wall clock limit' % nextLine )

    modelMaxNorm = 0
    if self.cpuRef:
      # Now try to get the CPU_FACTOR for this reference CPU,
      # it must be either a Model, a Host or the largest Model

      cmd = '%s/lshosts -w %s' % ( self.bin, self.cpuRef )
      result = runCommand( cmd )
      if result['OK']:
        # At CERN this command will return an error since there is no host defined
        # with the name of the reference Host.
        lines = str( result['Value'] ).split( '\n' )
        # First line: column names, second line: values for the host
        secondLine = lines[1] if len( lines ) > 1 else ''
        l1 = lines[0].split()
        l2 = secondLine.split()
        if len( l1 ) > len( l2 ):
          self.log.error( "Failed lshost command", "%s:\n %s\n %s" % ( cmd, lines[0], secondLine ) )
        else:
          for name, value in zip( l1, l2 ):
            if name == 'cpuf':
              try:
                self.normRef = float( value )
                self.log.info( 'Reference Normalization taken from Host', '%s: %s' % ( self.cpuRef, self.normRef ) )
              except ValueError as e:
                self.log.exception( 'Exception parsing lshosts output', '', e )

      if not self.normRef:
        # Try if there is a model define with the name of cpuRef
        cmd = '%s/lsinfo -m' % ( self.bin )
        result = runCommand( cmd )
        if result['OK']:
          lines = str( result['Value'] ).split( '\n' )
          for line in lines[1:]:
            words = line.split()
            if len( words ) > 1:
              try:
                norm = float( words[1] )
                # Remember the largest factor, it is the last resort below
                if norm > modelMaxNorm:
                  modelMaxNorm = norm
                if self.cpuRef in words[0]:
                  self.normRef = norm
                  self.log.info( 'Reference Normalization taken from Host Model',
                                 '%s: %s' % ( self.cpuRef, self.normRef ) )
              except ValueError as e:
                self.log.exception( 'Exception parsing lsfinfo output', '', e )

      if not self.normRef:
        # Now parse LSF configuration files
        # NOTE(review): os.environ['LSF_ENVDIR'] raises KeyError when the
        # variable is not defined - confirm it is always set in LSF jobs
        if not os.path.isfile( './lsf.sh' ):
          os.symlink( os.path.join( os.environ['LSF_ENVDIR'], 'lsf.conf' ) , './lsf.sh' )
        # As the variables are not exported, we must force it
        ret = sourceEnv( 10, ['./lsf', '&& export LSF_CONFDIR' ] )
        if ret['OK']:
          lsfEnv = ret['outputEnv']
          shared = None
          try:
            egoShared = os.path.join( lsfEnv['LSF_CONFDIR'], 'ego.shared' )
            lsfShared = os.path.join( lsfEnv['LSF_CONFDIR'], 'lsf.shared' )
            if os.path.exists( egoShared ):
              shared = egoShared
            elif os.path.exists( lsfShared ):
              shared = lsfShared
          except KeyError as e:
            self.log.exception( 'Exception getting LSF configuration', '', e )
          if shared:
            # The with statement closes the file, also when leaving on "break"
            with open( shared ) as sharedFile:
              hostModelSection = False
              for line in sharedFile:
                if line.startswith( 'Begin HostModel' ):
                  hostModelSection = True
                  continue
                if not hostModelSection:
                  continue
                if line.startswith( 'End HostModel' ):
                  break
                words = line.split()
                if words and words[0] == self.cpuRef:
                  try:
                    self.normRef = float( words[1] )
                    self.log.info( 'Reference Normalization taken from Configuration File',
                                   '(%s) %s: %s' % ( shared, self.cpuRef, self.normRef ) )
                  except ( ValueError, IndexError ) as e:
                    self.log.exception( 'Exception reading LSF configuration', '', e )
          else:
            self.log.warn( 'Could not find LSF configuration' )
        else:
          self.log.error( 'Cannot source the LSF environment', ret['Message'] )
    if not self.normRef:
      # If nothing worked, take the maximum defined for a Model
      if modelMaxNorm:
        self.normRef = modelMaxNorm
        self.log.info( 'Reference Normalization taken from Max Model:', self.normRef )

    # Now get the Normalization for the current Host
    if self.host:
      cmd = '%s/lshosts -w %s' % ( self.bin, self.host )
      result = runCommand( cmd )
      if result['OK']:
        lines = str( result['Value'] ).split( '\n' )
        # First line: column names, second line: values for the host
        secondLine = lines[1] if len( lines ) > 1 else ''
        l1 = lines[0].split()
        l2 = secondLine.split()
        if len( l1 ) > len( l2 ):
          self.log.error( "Failed lshost command", "%s:\n %s\n %s" % ( cmd, lines[0], secondLine ) )
        else:
          for name, value in zip( l1, l2 ):
            if name == 'cpuf':
              try:
                self.hostNorm = float( value )
                self.log.info( 'Host Normalization', '%s: %s' % ( self.host, self.hostNorm ) )
              except ValueError as e:
                self.log.exception( 'Exception parsing lshosts output', '', e )

      if self.hostNorm and self.normRef:
        self.hostNorm /= self.normRef
        self.log.info( 'CPU Normalization', self.hostNorm )
예제 #21
0
    def getResourceUsage(self):
        """Collect CPU and wall clock usage and limits of the current job from ``qstat -f``.

        The command output is scanned line by line for the ``cput``, ``pcput``
        and ``walltime`` entries of ``resources_used`` and ``Resource_List``.
        On a matching line the third whitespace-separated token is read as
        ``hours:minutes:seconds`` and converted to seconds.  For the consumed
        CPU the largest of the ``cput``/``pcput`` values is kept, for the CPU
        limit the smallest one; wall clock values are taken from the last
        matching line.

        A matching line whose value is missing or malformed is reported with a
        warning and ignored instead of aborting the whole call.

        :return: the failed ``runCommand`` result if the command failed.
                 ``S_OK(consumed)`` where ``consumed`` has the keys ``CPU``,
                 ``CPULimit``, ``WallClock`` and ``WallClockLimit`` when all
                 four were found, or when at least one limit was found and the
                 missing entries could be estimated.
                 ``S_ERROR`` carrying the partial dictionary under ``Value``
                 when neither limit could be determined.
        """
        cmd = "qstat -f %s" % (self.jobID)
        result = runCommand(cmd)
        if not result["OK"]:
            return result

        def _toSeconds(field):
            """Convert 'H:M:S' to seconds; raises IndexError/ValueError if malformed."""
            parts = field.split(":")
            return (float(parts[0]) * 60 + float(parts[1])) * 60 + float(parts[2])

        # (regex searched in the line, counter it feeds, wording used in the warning)
        parsers = (
            (".*resources_used.cput.*", "CPU", "CPU consumed"),
            (".*resources_used.pcput.*", "CPU", "CPU consumed"),
            (".*resources_used.walltime.*", "WallClock", "elapsed wall clock time"),
            (".*Resource_List.cput.*", "CPULimit", "CPU limit"),
            (".*Resource_List.pcput.*", "CPULimit", "CPU limit"),
            (".*Resource_List.walltime.*", "WallClockLimit", "wall clock limit"),
        )

        consumed = {"CPU": None, "CPULimit": None, "WallClock": None, "WallClockLimit": None}

        for line in str(result["Value"]).split("\n"):
            info = line.split()
            for pattern, key, what in parsers:
                if not re.search(pattern, line):
                    continue
                try:
                    # info[2] is the value in a "name = value" line; too few
                    # tokens raises IndexError, a bad value raises ValueError
                    value = _toSeconds(info[2])
                except (IndexError, ValueError):
                    self.log.warn('Problem parsing "%s" for %s' % (line, what))
                    continue

                current = consumed[key]
                if key == "CPU":
                    # keep the largest consumption reported by cput / pcput
                    if not current or value > current:
                        consumed[key] = value
                elif key == "CPULimit":
                    # keep the most restrictive of the cput / pcput limits
                    if not current or value < current:
                        consumed[key] = value
                else:
                    consumed[key] = value

        cpu = consumed["CPU"]
        cpuLimit = consumed["CPULimit"]
        wallClock = consumed["WallClock"]
        wallClockLimit = consumed["WallClockLimit"]
        self.log.debug(consumed)

        if None not in consumed.values():
            self.log.debug("TimeLeft counters complete:", str(consumed))
            return S_OK(consumed)
        else:
            missed = [key for key, val in consumed.items() if val is None]
            self.log.info("Could not determine parameter", ",".join(missed))
            self.log.debug("This is the stdout from the batch system call\n%s" % (result["Value"]))

        if cpuLimit or wallClockLimit:
            # We have got a partial result from PBS, assume that we ran for too short time
            if not cpuLimit:
                # Derive the missing limit from the known one, with a 0.8 margin
                consumed["CPULimit"] = wallClockLimit * 0.8
            if not wallClockLimit:
                consumed["WallClockLimit"] = cpuLimit / 0.8
            if not cpu:
                # Fall back to the time elapsed since self.startTime
                consumed["CPU"] = int(time.time() - self.startTime)
            if not wallClock:
                consumed["WallClock"] = int(time.time() - self.startTime)
            self.log.debug("TimeLeft counters restored:", str(consumed))
            return S_OK(consumed)
        else:
            msg = "Could not determine some parameters"
            self.log.info(msg, ":\nThis is the stdout from the batch system call\n%s" % (result["Value"]))
            retVal = S_ERROR(msg)
            # Callers still get whatever could be parsed
            retVal["Value"] = consumed
            return retVal
예제 #22
0
    def getResourceUsage(self):
        """Collect CPU usage and CPU / wall clock limits of the current job from ``qstat -f -j``.

        The command output is scanned line by line:

        * on a ``usage ... cpu`` line the ``cpu=<value>,`` entry is split on
          ``:`` and read as ``H:M:S`` (three fields) or ``D:H:M:S`` (four
          fields); the largest value seen is kept;
        * on a ``hard resource_list ... cpu`` line the integers following
          ``_cpu=`` and ``_rt=`` are taken as CPU limit and wall clock limit.

        A usage line without a parsable ``cpu=`` entry is reported with a
        warning and ignored.  The consumed CPU is divided by the value returned
        by ``_getCPUScalingFactor()`` when that value is non-zero.

        :return: the failed ``runCommand`` result if the command failed.
                 ``S_OK(consumed)`` where ``consumed`` has the keys ``CPU``,
                 ``CPULimit``, ``WallClock`` and ``WallClockLimit`` when at
                 least one limit was found (missing entries are estimated).
                 ``S_ERROR`` carrying the partial dictionary under ``Value``
                 when neither limit could be determined.
        """
        cmd = 'qstat -f -j %s' % (self.jobID)
        result = runCommand(cmd)
        if not result['OK']:
            return result

        cpu = None
        cpuLimit = None
        # Nothing in the parsed output sets the elapsed wall clock time
        wallClock = None
        wallClockLimit = None

        for line in str(result['Value']).split('\n'):
            if re.search('usage.*cpu.*', line):
                match = re.search(r'cpu=([\d,:]*),', line)
                if not match:
                    # Without this guard the code below would use an undefined
                    # (NameError) or stale value from a previous line
                    self.log.warn('Problem parsing "%s" for CPU consumed' %
                                  line)
                    continue
                cpuList = match.group(1).split(':')
                try:
                    newcpu = 0.
                    if len(cpuList) == 3:
                        # hours:minutes:seconds
                        newcpu = (float(cpuList[0]) * 60 +
                                  float(cpuList[1])) * 60 + float(cpuList[2])
                    elif len(cpuList) == 4:
                        # days:hours:minutes:seconds
                        newcpu = (
                            (float(cpuList[0]) * 24 + float(cpuList[1])) * 60 +
                            float(cpuList[2])) * 60 + float(cpuList[3])
                    if not cpu or newcpu > cpu:
                        cpu = newcpu
                except ValueError:
                    self.log.warn('Problem parsing "%s" for CPU consumed' %
                                  line)
            elif re.search('hard resource_list.*cpu.*', line):
                # Require at least one digit so that float() never sees ''
                match = re.search(r'_cpu=(\d+)', line)
                if match:
                    cpuLimit = float(match.group(1))
                match = re.search(r'_rt=(\d+)', line)
                if match:
                    wallClockLimit = float(match.group(1))

        # Some SGE batch systems apply CPU scaling factor to the CPU consumption figures
        if cpu:
            factor = _getCPUScalingFactor()
            if factor:
                cpu = cpu / factor

        consumed = {
            'CPU': cpu,
            'CPULimit': cpuLimit,
            'WallClock': wallClock,
            'WallClockLimit': wallClockLimit
        }

        if None not in consumed.values():
            # This cannot happen as we can't get wallClock from anywhere
            self.log.debug("TimeLeft counters complete:", str(consumed))
            return S_OK(consumed)
        else:
            missed = [key for key, val in consumed.items() if val is None]
            self.log.info('Could not determine parameter', ','.join(missed))
            self.log.debug(
                'This is the stdout from the batch system call\n%s' %
                (result['Value']))

        if cpuLimit or wallClockLimit:
            # We have got a partial result from SGE
            if not cpuLimit:
                # Take some margin
                consumed['CPULimit'] = wallClockLimit * 0.8
            if not wallClockLimit:
                consumed['WallClockLimit'] = cpuLimit / 0.8
            if not cpu:
                # Fall back to the time elapsed since self.startTime
                consumed['CPU'] = time.time() - self.startTime
            if not wallClock:
                consumed['WallClock'] = time.time() - self.startTime
            self.log.debug("TimeLeft counters restored:", str(consumed))
            return S_OK(consumed)
        else:
            msg = 'Could not determine some parameters'
            self.log.info(
                msg, ':\nThis is the stdout from the batch system call\n%s' %
                (result['Value']))
            retVal = S_ERROR(msg)
            # Callers still get whatever could be parsed
            retVal['Value'] = consumed
            return retVal