Ejemplo n.º 1
0
  def __init__(self, pid, exeThread, spObject, jobCPUTime,
               memoryLimit=0, processors=1, systemFlag='linux', jobArgs=None):
    """ Constructor, takes system flag as argument.

        :param pid: PID of the job wrapper process to watch
        :param exeThread: execution thread object providing getCurrentPID()
        :param spObject: subprocess object wrapping the payload
        :param jobCPUTime: CPU time limit for the job
        :param memoryLimit: memory limit for the job (0 means no limit)
        :param processors: number of processors allocated to the job
        :param systemFlag: platform flag (e.g. 'linux')
        :param jobArgs: optional dict of job parameters (StopSig* overrides)
    """
    # Guard against the shared-mutable-default-argument pitfall
    if jobArgs is None:
      jobArgs = {}
    self.stopSigStartSeconds = int(jobArgs.get('StopSigStartSeconds', 1800))  # 30 minutes
    self.stopSigFinishSeconds = int(jobArgs.get('StopSigFinishSeconds', 1800))  # 30 minutes
    self.stopSigNumber = int(jobArgs.get('StopSigNumber', 2))  # SIGINT
    self.stopSigRegex = jobArgs.get('StopSigRegex', None)
    self.stopSigSent = False

    self.log = gLogger.getSubLogger("Watchdog")
    self.systemFlag = systemFlag
    self.exeThread = exeThread
    self.wrapperPID = pid
    self.appPID = self.exeThread.getCurrentPID()
    self.spObject = spObject
    self.jobCPUTime = jobCPUTime
    self.memoryLimit = memoryLimit
    self.calibration = 0
    self.initialValues = {}
    self.parameters = {}
    self.peekFailCount = 0
    self.peekRetry = 5
    self.profiler = Profiler(pid)
    self.checkError = ''
    self.currentStats = {}
    self.initialized = False
    self.count = 0

    # defaults
    self.testWallClock = 1
    self.testDiskSpace = 1
    self.testLoadAvg = 1
    self.testCPUConsumed = 1
    self.testCPULimit = 0
    self.testMemoryLimit = 0
    self.testTimeLeft = 1
    self.pollingTime = 10  # 10 seconds
    self.checkingTime = 30 * 60  # 30 minute period
    self.minCheckingTime = 20 * 60  # 20 mins
    self.wallClockCheckSeconds = 5 * 60  # 5 minutes
    self.maxWallClockTime = 3 * 24 * 60 * 60  # 3 days
    self.jobPeekFlag = 1  # on / off
    self.minDiskSpace = 10  # MB
    self.loadAvgLimit = 1000  # > 1000 and jobs killed
    self.sampleCPUTime = 30 * 60  # 30 mins sample
    self.jobCPUMargin = 20  # %age buffer before killing job
    self.minCPUWallClockRatio = 5  # ratio %age
    self.nullCPULimit = 5  # After 5 sample times return null CPU consumption kill job
    self.checkCount = 0
    self.wallClockCheckCount = 0
    self.nullCPUCount = 0

    self.grossTimeLeftLimit = 10 * self.checkingTime
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = 0
    self.littleTimeLeft = False
    self.scaleFactor = 1.0
    self.processors = processors
Ejemplo n.º 2
0
  def test_getTimeLeft( self ):
    """ Exercise TimeLeft.getTimeLeft() against the LSF and SGE batch plugins.

        For LSF, runCommand and time inside the plugin module are patched;
        for SGE the mocked runCommand output alone drives the result.
    """
    for batch, retValue in [( 'LSF', LSF_ReturnValue )]:
      self.tl = importlib.import_module( "DIRAC.Core.Utilities.TimeLeft.TimeLeft" )
      rcMock = MagicMock()
      rcMock.return_value = S_OK( retValue )
      self.tl.runCommand = rcMock

      timeMock = MagicMock()

      tl = TimeLeft()

      batchSystemName = '%sTimeLeft' % batch
      batchPlugin = __import__( 'DIRAC.Core.Utilities.TimeLeft.%s' %
                                batchSystemName, globals(), locals(), [batchSystemName] )
      # Instantiate the plugin class by attribute lookup instead of eval()
      tl.batchPlugin = getattr( batchPlugin, batchSystemName )()

      tl.scaleFactor = 10.0
      tl.normFactor = 10.0
      tl.batchPlugin.bin = '/usr/bin'
      tl.batchPlugin.hostNorm = 10.0
      tl.batchPlugin.cpuLimit = 1000
      tl.batchPlugin.wallClockLimit = 1000

      with patch( "DIRAC.Core.Utilities.TimeLeft.LSFTimeLeft.runCommand", new=rcMock ):
        with patch( "DIRAC.Core.Utilities.TimeLeft.LSFTimeLeft.time", new=timeMock ):
          res = tl.getTimeLeft()
          self.assertEqual( res['OK'], True, res.get('Message', '') )

    for batch, retValue in [( 'SGE', SGE_ReturnValue )]:
      self.tl = importlib.import_module( "DIRAC.Core.Utilities.TimeLeft.TimeLeft" )
      rcMock = MagicMock()
      rcMock.return_value = S_OK( retValue )
      self.tl.runCommand = rcMock

      tl = TimeLeft()

      batchSystemName = '%sTimeLeft' % batch
      batchPlugin = __import__( 'DIRAC.Core.Utilities.TimeLeft.%s' %
                                batchSystemName, globals(), locals(), [batchSystemName] )
      # Instantiate the plugin class by attribute lookup instead of eval()
      tl.batchPlugin = getattr( batchPlugin, batchSystemName )()

      tl.scaleFactor = 10.0
      tl.normFactor = 10.0
      tl.batchPlugin.bin = '/usr/bin'
      tl.batchPlugin.hostNorm = 10.0
      tl.batchPlugin.cpuLimit = 1000
      tl.batchPlugin.wallClockLimit = 1000

      res = tl.getTimeLeft()
      # assertTrue replaces the deprecated assert_ alias
      self.assertTrue( res['OK'] )
      self.assertEqual( res['Value'], 9400.0 )
Ejemplo n.º 3
0
    def initialize(self, loops=0):
        """Configure the agent and create the backend computing element.

        :param loops: maximum number of agent cycles (0 means unlimited)
        :return: S_OK on success, an error structure if the CE cannot be built
        """
        # This agent does not need activity monitoring
        self.am_setOption('MonitoringEnabled', False)
        # self.log.setLevel('debug') #temporary for debugging
        self.am_setOption('MaxCycles', loops)

        # A CE set in the local configuration overrides the agent option
        ceName = self.am_getOption('CEUniqueID', 'InProcess')
        configuredCE = gConfig.getValue('/LocalSite/LocalCE', '')
        if configuredCE:
            self.log.info('Defining CE from local configuration = %s' %
                          configuredCE)
            ceName = configuredCE

        self.ceName = ceName
        ceResult = ComputingElementFactory().getCE(ceName)
        if not ceResult['OK']:
            self.log.warn(ceResult['Message'])
            return ceResult
        self.computingElement = ceResult['Value']

        # DIRAC root is four directory levels above this module
        self.diracRoot = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.dirname(__file__))))

        # Local site options
        self.siteRoot = gConfig.getValue('/LocalSite/Root', os.getcwd())
        self.siteName = gConfig.getValue('/LocalSite/Site', 'Unknown')
        self.pilotReference = gConfig.getValue('/LocalSite/PilotReference',
                                               'Unknown')
        self.defaultProxyLength = gConfig.getValue(
            '/Registry/DefaultProxyLifeTime', 86400 * 5)

        # Agent options
        # Factor converting raw CPU to normalized units (based on the CPU model)
        self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor',
                                          0.0)
        wrapperDefault = 'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py'
        self.jobWrapperTemplate = os.path.join(
            self.diracRoot,
            self.am_getOption('JobWrapperTemplate', wrapperDefault))
        self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', 10)
        self.defaultLogLevel = self.am_getOption('DefaultLogLevel', 'info')
        self.fillingMode = self.am_getOption('FillingModeFlag', False)
        self.stopOnApplicationFailure = self.am_getOption(
            'StopOnApplicationFailure', True)
        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', 10)
        self.jobCount = 0
        self.matchFailedCount = 0

        # Time-left bookkeeping
        self.timeLeftUtil = TimeLeft()
        self.timeLeft = gConfig.getValue(
            '/Resources/Computing/CEDefaults/MaxCPUTime', 0.0)
        self.gridCEQueue = gConfig.getValue(
            '/Resources/Computing/CEDefaults/GridCEQueue', '')
        self.timeLeftError = ''
        self.scaledCPUTime = 0.0
        self.pilotInfoReportedFlag = False
        return S_OK()
Ejemplo n.º 4
0
    def initialize(self, loops=0):
        """Configure the agent and create the backend computing element.

        :param loops: maximum number of agent cycles (0 means unlimited)
        :return: S_OK on success, an error structure otherwise
        """
        # Monitoring is not required for this agent
        self.am_setOption('MonitoringEnabled', False)
        # self.log.setLevel('debug') #temporary for debugging
        self.am_setOption('MaxCycles', loops)

        # A CE set in the local configuration overrides the agent option
        ceName = self.am_getOption('CEType', 'InProcess')
        configuredCE = gConfig.getValue('/LocalSite/LocalCE', '')
        if configuredCE:
            self.log.info('Defining CE from local configuration = %s' %
                          configuredCE)
            ceName = configuredCE

        # Create backend Computing Element
        self.ceName = ceName
        ceResult = ComputingElementFactory().getCE(ceName)
        if not ceResult['OK']:
            self.log.warn(ceResult['Message'])
            return ceResult
        self.computingElement = ceResult['Value']

        descResult = self.computingElement.getDescription()
        if not descResult['OK']:
            self.log.warn("Can not get the CE description")
            return descResult
        ceDict = descResult['Value']
        # CE description first, then an optional site-wide default on top
        self.timeLeft = ceDict.get('CPUTime', self.timeLeft)
        self.timeLeft = gConfig.getValue(
            '/Resources/Computing/CEDefaults/MaxCPUTime', self.timeLeft)

        self.initTimes = os.times()

        # Local site options (existing attribute values act as defaults)
        self.siteName = gConfig.getValue('/LocalSite/Site', self.siteName)
        self.pilotReference = gConfig.getValue('/LocalSite/PilotReference',
                                               self.pilotReference)
        self.defaultProxyLength = gConfig.getValue(
            '/Registry/DefaultProxyLifeTime', self.defaultProxyLength)

        # Agent options
        # Factor converting raw CPU to normalized units (based on the CPU model)
        self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor',
                                          self.cpuFactor)
        self.jobSubmissionDelay = self.am_getOption('SubmissionDelay',
                                                    self.jobSubmissionDelay)
        self.fillingMode = self.am_getOption('FillingModeFlag',
                                             self.fillingMode)
        self.minimumTimeLeft = self.am_getOption('MinimumTimeLeft',
                                                 self.minimumTimeLeft)
        self.stopOnApplicationFailure = self.am_getOption(
            'StopOnApplicationFailure', self.stopOnApplicationFailure)
        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)
        self.extraOptions = gConfig.getValue(
            '/AgentJobRequirements/ExtraOptions', self.extraOptions)

        # Time-left utility
        self.timeLeftUtil = TimeLeft()
        return S_OK()
Ejemplo n.º 5
0
    def initialize(self, loops=0):
        """Configure the agent and create the backend computing element.

        :param loops: maximum number of agent cycles (0 means unlimited)
        :return: S_OK on success, an error structure otherwise
        """
        # Monitoring is not required for this agent
        self.am_setOption('MonitoringEnabled', False)
        # self.log.setLevel('debug') #temporary for debugging
        self.am_setOption('MaxCycles', loops)

        # A CE set in the local configuration overrides the agent option
        ceName = self.am_getOption('CEType', 'InProcess')
        configuredCE = gConfig.getValue('/LocalSite/LocalCE', '')
        if configuredCE:
            self.log.info('Defining CE from local configuration = %s' %
                          configuredCE)
            ceName = configuredCE

        self.ceName = ceName
        ceResult = ComputingElementFactory().getCE(ceName)
        if not ceResult['OK']:
            self.log.warn(ceResult['Message'])
            return ceResult

        # Remember the process times at start-up
        self.initTimes = os.times()

        self.computingElement = ceResult['Value']

        # Local site options
        self.siteName = gConfig.getValue('/LocalSite/Site', 'Unknown')
        self.pilotReference = gConfig.getValue('/LocalSite/PilotReference',
                                               'Unknown')
        self.defaultProxyLength = gConfig.getValue(
            '/Registry/DefaultProxyLifeTime', 86400 * 5)

        # Agent options
        # Factor converting raw CPU to normalized units (based on the CPU model)
        self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor',
                                          0.0)
        self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', 10)
        self.fillingMode = self.am_getOption('FillingModeFlag', False)
        self.stopOnApplicationFailure = self.am_getOption(
            'StopOnApplicationFailure', True)
        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', 10)
        self.jobCount = 0
        self.matchFailedCount = 0
        self.extraOptions = gConfig.getValue(
            '/AgentJobRequirements/ExtraOptions', '')

        # Time-left bookkeeping
        self.timeLeftUtil = TimeLeft()
        self.timeLeft = gConfig.getValue(
            '/Resources/Computing/CEDefaults/MaxCPUTime', 0.0)
        self.timeLeftError = ''
        self.scaledCPUTime = 0.0
        self.pilotInfoReportedFlag = False
        return S_OK()
Ejemplo n.º 6
0
  def test_getScaledCPU( self ):
    """ Exercise TimeLeft.getScaledCPU(): 0 with no plugin configured,
        then the per-batch values with mocked runCommand output.
    """
    tl = TimeLeft()
    res = tl.getScaledCPU()
    self.assertEqual( res, 0 )

    tl.scaleFactor = 5.0
    tl.normFactor = 5.0

    for batch, retValue in [( 'LSF', LSF_ReturnValue )]:
      self.tl = importlib.import_module( "DIRAC.Core.Utilities.TimeLeft.TimeLeft" )
      rcMock = MagicMock()
      rcMock.return_value = S_OK( retValue )
      self.tl.runCommand = rcMock

      batchSystemName = '%sTimeLeft' % batch
      batchPlugin = __import__( 'DIRAC.Core.Utilities.TimeLeft.%s' %
                                batchSystemName, globals(), locals(), [batchSystemName] )
      # Instantiate the plugin class by attribute lookup instead of eval()
      tl.batchPlugin = getattr( batchPlugin, batchSystemName )()
      res = tl.getScaledCPU()
      self.assertEqual( res, 0.0 )

    for batch, retValue in [( 'SGE', SGE_ReturnValue )]:
      self.tl = importlib.import_module( "DIRAC.Core.Utilities.TimeLeft.TimeLeft" )
      rcMock = MagicMock()
      rcMock.return_value = S_OK( retValue )
      self.tl.runCommand = rcMock

      batchSystemName = '%sTimeLeft' % batch
      batchPlugin = __import__( 'DIRAC.Core.Utilities.TimeLeft.%s' %
                                batchSystemName, globals(), locals(), [batchSystemName] )
      # Instantiate the plugin class by attribute lookup instead of eval()
      tl.batchPlugin = getattr( batchPlugin, batchSystemName )()
      res = tl.getScaledCPU()
      self.assertEqual( res, 300.0 )
Ejemplo n.º 7
0
  def __init__(self, pid, exeThread, spObject, jobCPUTime, memoryLimit=0, processors=1, systemFlag='linux', jobArgs=None):
    """ Constructor, takes system flag as argument.

        :param pid: PID of the job wrapper process to watch
        :param exeThread: execution thread object providing getCurrentPID()
        :param spObject: subprocess object wrapping the payload
        :param jobCPUTime: CPU time limit for the job
        :param memoryLimit: memory limit for the job (0 means no limit)
        :param processors: number of processors allocated to the job
        :param systemFlag: platform flag (e.g. 'linux')
        :param jobArgs: optional dict of job parameters (StopSig* overrides)
    """
    # Guard against the shared-mutable-default-argument pitfall
    if jobArgs is None:
      jobArgs = {}
    self.stopSigStartSeconds = int(jobArgs.get('StopSigStartSeconds', 1800))  # 30 minutes
    self.stopSigFinishSeconds = int(jobArgs.get('StopSigFinishSeconds', 1800))  # 30 minutes
    self.stopSigNumber = int(jobArgs.get('StopSigNumber', 2))  # SIGINT
    self.stopSigRegex = jobArgs.get('StopSigRegex', None)
    self.stopSigSent = False

    self.log = gLogger.getSubLogger("Watchdog")
    self.systemFlag = systemFlag
    self.exeThread = exeThread
    self.wrapperPID = pid
    self.appPID = self.exeThread.getCurrentPID()
    self.spObject = spObject
    self.jobCPUTime = jobCPUTime
    self.memoryLimit = memoryLimit
    self.calibration = 0
    self.initialValues = {}
    self.parameters = {}
    self.peekFailCount = 0
    self.peekRetry = 5
    self.processMonitor = ProcessMonitor()
    self.checkError = ''
    self.currentStats = {}
    self.initialized = False
    self.count = 0

    # defaults
    self.testWallClock = 1
    self.testDiskSpace = 1
    self.testLoadAvg = 1
    self.testCPUConsumed = 1
    self.testCPULimit = 0
    self.testMemoryLimit = 0
    self.testTimeLeft = 1
    self.pollingTime = 10  # 10 seconds
    self.checkingTime = 30 * 60  # 30 minute period
    self.minCheckingTime = 20 * 60  # 20 mins
    self.wallClockCheckSeconds = 5 * 60  # 5 minutes
    self.maxWallClockTime = 3 * 24 * 60 * 60  # 3 days
    self.jobPeekFlag = 1  # on / off
    self.minDiskSpace = 10  # MB
    self.loadAvgLimit = 1000  # > 1000 and jobs killed
    self.sampleCPUTime = 30 * 60  # 30 mins sample
    self.jobCPUMargin = 20  # %age buffer before killing job
    self.minCPUWallClockRatio = 5  # ratio %age
    self.nullCPULimit = 5  # After 5 sample times return null CPU consumption kill job
    self.checkCount = 0
    self.wallClockCheckCount = 0
    self.nullCPUCount = 0

    self.grossTimeLeftLimit = 10 * self.checkingTime
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = 0
    self.littleTimeLeft = False
    self.scaleFactor = 1.0
    self.processors = processors
Ejemplo n.º 8
0
  def initialize( self, loops = 0 ):
    """Configure the agent and create the backend computing element.

    :param loops: maximum number of agent cycles (0 means unlimited)
    :return: S_OK on success, an error structure otherwise
    """
    # Monitoring is not required for this agent
    self.am_setOption( 'MonitoringEnabled', False )
    # self.log.setLevel('debug') #temporary for debugging
    self.am_setOption( 'MaxCycles', loops )

    # A CE set in the local configuration overrides the agent option
    ceName = self.am_getOption( 'CEType', 'InProcess' )
    configuredCE = gConfig.getValue( '/LocalSite/LocalCE', '' )
    if configuredCE:
      self.log.info( 'Defining CE from local configuration = %s' % configuredCE )
      ceName = configuredCE

    # Create backend Computing Element
    self.ceName = ceName
    ceResult = ComputingElementFactory().getCE( ceName )
    if not ceResult['OK']:
      self.log.warn( ceResult['Message'] )
      return ceResult
    self.computingElement = ceResult['Value']

    descResult = self.computingElement.getDescription()
    if not descResult['OK']:
      self.log.warn( "Can not get the CE description" )
      return descResult
    ceDict = descResult['Value']
    # CE description first, then an optional site-wide default on top
    self.timeLeft = ceDict.get( 'CPUTime', 0.0 )
    self.timeLeft = gConfig.getValue( '/Resources/Computing/CEDefaults/MaxCPUTime', self.timeLeft )

    # Remember the process times at start-up
    self.initTimes = os.times()

    # Local site options
    self.siteName = gConfig.getValue( '/LocalSite/Site', 'Unknown' )
    self.pilotReference = gConfig.getValue( '/LocalSite/PilotReference', 'Unknown' )
    self.defaultProxyLength = gConfig.getValue( '/Registry/DefaultProxyLifeTime', 86400 * 5 )

    # Agent options
    # Factor converting raw CPU to normalized units (based on the CPU model)
    self.cpuFactor = gConfig.getValue( '/LocalSite/CPUNormalizationFactor', 0.0 )
    self.jobSubmissionDelay = self.am_getOption( 'SubmissionDelay', 10 )
    self.fillingMode = self.am_getOption( 'FillingModeFlag', False )
    self.minimumTimeLeft = self.am_getOption( 'MinimumTimeLeft', 1000 )
    self.stopOnApplicationFailure = self.am_getOption( 'StopOnApplicationFailure', True )
    self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', 10 )
    self.jobCount = 0
    self.matchFailedCount = 0
    self.extraOptions = gConfig.getValue( '/AgentJobRequirements/ExtraOptions', '' )

    # Time-left bookkeeping
    self.timeLeftUtil = TimeLeft()
    self.timeLeftError = ''
    self.scaledCPUTime = 0.0
    self.pilotInfoReportedFlag = False
    return S_OK()
Ejemplo n.º 9
0
  def initialize( self, loops = 0 ):
    """Configure the agent and create the backend computing element.

    :param loops: maximum number of agent cycles (0 means unlimited)
    :return: S_OK on success, an error structure otherwise
    """
    # Monitoring is not required for this agent
    self.am_setOption( 'MonitoringEnabled', False )
    # self.log.setLevel('debug') #temporary for debugging
    self.am_setOption( 'MaxCycles', loops )

    # A CE set in the local configuration overrides the agent option
    ceName = self.am_getOption( 'CEType', 'InProcess' )
    configuredCE = gConfig.getValue( '/LocalSite/LocalCE', '' )
    if configuredCE:
      self.log.info( 'Defining CE from local configuration = %s' % configuredCE )
      ceName = configuredCE

    self.ceName = ceName
    ceResult = ComputingElementFactory().getCE( ceName )
    if not ceResult['OK']:
      self.log.warn( ceResult['Message'] )
      return ceResult

    # Remember the process times at start-up
    self.initTimes = os.times()

    self.computingElement = ceResult['Value']
    # DIRAC root is four directory levels above this module
    self.diracRoot = os.path.dirname( os.path.dirname( os.path.dirname( os.path.dirname( __file__ ) ) ) )

    # Local site options
    self.siteRoot = gConfig.getValue( '/LocalSite/Root', os.getcwd() )
    self.siteName = gConfig.getValue( '/LocalSite/Site', 'Unknown' )
    self.pilotReference = gConfig.getValue( '/LocalSite/PilotReference', 'Unknown' )
    self.defaultProxyLength = gConfig.getValue( '/Registry/DefaultProxyLifeTime', 86400 * 5 )

    # Agent options
    # Factor converting raw CPU to normalized units (based on the CPU model)
    self.cpuFactor = gConfig.getValue( '/LocalSite/CPUNormalizationFactor', 0.0 )
    wrapperDefault = 'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py'
    self.jobWrapperTemplate = os.path.join( self.diracRoot,
                                            self.am_getOption( 'JobWrapperTemplate',
                                                               wrapperDefault ) )
    self.jobSubmissionDelay = self.am_getOption( 'SubmissionDelay', 10 )
    self.defaultLogLevel = self.am_getOption( 'DefaultLogLevel', 'info' )
    self.fillingMode = self.am_getOption( 'FillingModeFlag', False )
    self.stopOnApplicationFailure = self.am_getOption( 'StopOnApplicationFailure', True )
    self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', 10 )
    self.jobCount = 0
    self.matchFailedCount = 0

    # Time-left bookkeeping
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = gConfig.getValue( '/Resources/Computing/CEDefaults/MaxCPUTime', 0.0 )
    self.gridCEQueue = gConfig.getValue( '/Resources/Computing/CEDefaults/GridCEQueue', '' )
    self.timeLeftError = ''
    self.scaledCPUTime = 0.0
    self.pilotInfoReportedFlag = False
    return S_OK()
Ejemplo n.º 10
0
    def test_getScaledCPU(self):
        """Exercise TimeLeft.getScaledCPU(): 0 with no plugin configured,
        then the per-batch values with mocked runCommand output.
        """
        tl = TimeLeft()
        res = tl.getScaledCPU()
        self.assertEqual(res, 0)

        tl.scaleFactor = 5.0
        tl.normFactor = 5.0

        for batch, retValue in [('LSF', LSF_ReturnValue)]:
            self.tl = importlib.import_module(
                "DIRAC.Core.Utilities.TimeLeft.TimeLeft")
            rcMock = MagicMock()
            rcMock.return_value = S_OK(retValue)
            self.tl.runCommand = rcMock

            batchSystemName = '%sTimeLeft' % batch
            batchPlugin = __import__(
                'DIRAC.Core.Utilities.TimeLeft.%s' % batchSystemName,
                globals(), locals(), [batchSystemName])
            # Instantiate the plugin class by attribute lookup instead of eval()
            tl.batchPlugin = getattr(batchPlugin, batchSystemName)()
            res = tl.getScaledCPU()
            self.assertEqual(res, 0.0)

        for batch, retValue in [('SGE', SGE_ReturnValue)]:
            self.tl = importlib.import_module(
                "DIRAC.Core.Utilities.TimeLeft.TimeLeft")
            rcMock = MagicMock()
            rcMock.return_value = S_OK(retValue)
            self.tl.runCommand = rcMock

            batchSystemName = '%sTimeLeft' % batch
            batchPlugin = __import__(
                'DIRAC.Core.Utilities.TimeLeft.%s' % batchSystemName,
                globals(), locals(), [batchSystemName])
            # Instantiate the plugin class by attribute lookup instead of eval()
            tl.batchPlugin = getattr(batchPlugin, batchSystemName)()
            res = tl.getScaledCPU()
            self.assertEqual(res, 300.0)
Ejemplo n.º 11
0
class JobAgent(AgentModule):
    """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

    #############################################################################
    def initialize(self, loops=0):
        """Configure the agent and create the backend computing element.

        :param loops: maximum number of agent cycles (0 means unlimited)
        :return: S_OK on success, an error structure if the CE cannot be built
        """
        # Monitoring is not required for this agent
        self.am_setOption('MonitoringEnabled', False)
        # self.log.setLevel('debug') #temporary for debugging
        self.am_setOption('MaxCycles', loops)

        # A CE set in the local configuration overrides the agent option
        ceName = self.am_getOption('CEType', 'InProcess')
        configuredCE = gConfig.getValue('/LocalSite/LocalCE', '')
        if configuredCE:
            self.log.info('Defining CE from local configuration = %s' %
                          configuredCE)
            ceName = configuredCE

        self.ceName = ceName
        ceResult = ComputingElementFactory().getCE(ceName)
        if not ceResult['OK']:
            self.log.warn(ceResult['Message'])
            return ceResult

        # Remember the process times at start-up
        self.initTimes = os.times()

        self.computingElement = ceResult['Value']
        # DIRAC root is four directory levels above this module
        self.diracRoot = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.dirname(__file__))))

        # Local site options
        self.siteRoot = gConfig.getValue('/LocalSite/Root', os.getcwd())
        self.siteName = gConfig.getValue('/LocalSite/Site', 'Unknown')
        self.pilotReference = gConfig.getValue('/LocalSite/PilotReference',
                                               'Unknown')
        self.defaultProxyLength = gConfig.getValue(
            '/Registry/DefaultProxyLifeTime', 86400 * 5)

        # Agent options
        # Factor converting raw CPU to normalized units (based on the CPU model)
        self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor',
                                          0.0)
        wrapperDefault = 'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py'
        self.jobWrapperTemplate = os.path.join(
            self.diracRoot,
            self.am_getOption('JobWrapperTemplate', wrapperDefault))
        self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', 10)
        self.defaultLogLevel = self.am_getOption('DefaultLogLevel', 'info')
        self.fillingMode = self.am_getOption('FillingModeFlag', False)
        self.stopOnApplicationFailure = self.am_getOption(
            'StopOnApplicationFailure', True)
        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', 10)
        self.jobCount = 0
        self.matchFailedCount = 0

        # Time-left bookkeeping
        self.timeLeftUtil = TimeLeft()
        self.timeLeft = gConfig.getValue(
            '/Resources/Computing/CEDefaults/MaxCPUTime', 0.0)
        self.gridCEQueue = gConfig.getValue(
            '/Resources/Computing/CEDefaults/GridCEQueue', '')
        self.timeLeftError = ''
        self.scaledCPUTime = 0.0
        self.pilotInfoReportedFlag = False
        return S_OK()

    #############################################################################
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            #Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('%s normalized CPU units remaining in slot' %
                              (self.timeLeft))
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])
            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        available = self.computingElement.available()
        if not available['OK'] or not available['Value']:
            self.log.info('Resource is not available')
            self.log.info(available['Message'])
            return self.__finish('CE Not Available')

        self.log.info(available['Message'])

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result
        ceDict = result['Value']

        # Add pilot information
        gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
        if gridCE != 'Unknown':
            ceDict['GridCE'] = gridCE
        if not 'PilotReference' in ceDict:
            ceDict['PilotReference'] = str(self.pilotReference)
        ceDict['PilotBenchmark'] = self.cpuFactor
        ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict('/AgentJobRequirements')
        if result['OK']:
            requirementsDict = result['Value']
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info('MatcherTime = %.2f (s)' % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:
            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK: %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error(jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                self.log.error(jobRequest['Message'])
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs: %s' %
                                (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        jobID = matcherInfo['JobID']
        self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag',
                                                     False)
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if not matcherInfo.has_key(param):
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned %s = %s ' %
                                 (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        optimizerParams = {}
        for key in matcherInfo.keys():
            if not key in matcherParams:
                value = matcherInfo[key]
                optimizerParams[key] = value

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn(parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if not params.has_key('JobID'):
            msg = 'Job has not JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if not params.has_key('JobType'):
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if not params.has_key('SystemConfig'):
            self.log.warn(
                'Job has no system configuration defined in JDL parameters')
            systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
            self.log.info(
                'Setting system config to /LocalSite/Architecture = %s since it was not specified'
                % systemConfig)
            if not systemConfig:
                self.log.warn('/LocalSite/Architecture is not defined')
            params['SystemConfig'] = systemConfig
        else:
            systemConfig = params['SystemConfig']
            if systemConfig.lower() == 'any':
                systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
                self.log.info(
                    'Setting SystemConfig = /LocalSite/Architecture =',
                    '"%s" since it was set to "ANY" in the job description' %
                    systemConfig)
                if not systemConfig:
                    self.log.warn('/LocalSite/Architecture is not defined')
                params['SystemConfig'] = systemConfig

        if not params.has_key('CPUTime'):
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        self.log.verbose('Job request successful: \n %s' %
                         (jobRequest['Value']))
        self.log.info('Received JobID=%s, JobType=%s, SystemConfig=%s' %
                      (jobID, jobType, systemConfig))
        self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)
            if self.gridCEQueue:
                jobReport.setJobParameter('GridCEQueue',
                                          self.gridCEQueue,
                                          sendFlag=False)

            if os.environ.has_key('BOINC_JOB_ID'):
                # Report BOINC environment
                for p in [
                        'BoincUserID', 'BoincHostID', 'BoincHostPlatform',
                        'BoincHostName'
                ]:
                    jobReport.setJobParameter(p,
                                              gConfig.getValue(
                                                  '/LocalSite/%s' % p,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            # self.__setJobSite( jobID, self.siteName )
            if not self.pilotInfoReportedFlag:
                self.__reportPilotInfo(jobID)
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self.__rescheduleFailedJob(
                    jobID, result['Message'], self.stopOnApplicationFailure)
            if 'Value' in result and result['Value']:
                proxyChain = result['Value']

            # Is this necessary at all?
            saveJDL = self.__saveJobJDLRequest(jobID, jobJDL)
            #self.__report(jobID,'Matched','Job Prepared to Submit')

            #resourceParameters = self.__getJDLParameters( resourceJDL )
            #if not resourceParameters['OK']:
            #  return resourceParameters
            #resourceParams = resourceParameters['Value']

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job %s' %
                               (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self.__rescheduleFailedJob(
                    jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.verbose('Before %sCE submitJob()' % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict,
                                          optimizerParams, jobJDL, proxyChain)
            if not submission['OK']:
                self.__report(jobID, 'Failed', submission['Message'])
                return self.__finish(submission['Message'])
            elif 'PayloadFailed' in submission:
                # Do not keep running and do not overwrite the Payload error
                return self.__finish(
                    'Payload execution failed with error code %s' %
                    submission['PayloadFailed'], self.stopOnApplicationFailure)

            self.log.verbose('After %sCE submitJob()' % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        currentTimes = list(os.times())
        for i in range(len(currentTimes)):
            currentTimes[i] -= self.initTimes[i]

        utime, stime, cutime, cstime, elapsed = currentTimes
        cpuTime = utime + stime + cutime + cstime

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            if result['Message'] != 'Current batch system is not supported':
                self.timeLeftError = result['Message']
            else:
                if self.cpuFactor:
                    # if the batch system is not defined used the CPUNormalizationFactor
                    # defined locally
                    self.timeLeft = self.__getCPUTimeLeft()
        scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

        self.__setJobParam(jobID, 'ScaledCPUTime',
                           str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK('Job Agent cycle complete')

    #############################################################################
    def __getCPUTimeLeft(self):
        """Return the TimeLeft as estimated by DIRAC using the Normalization
           Factor in the Local Config.

           :return: remaining CPU budget in the same normalized units as
                    self.timeLeft (queue limit minus consumed, scaled by
                    self.cpuFactor)
        """
        utime, stime, cutime, cstime, _elapsed = os.times()
        # Include children system time (cstime) for consistency with the
        # CPU accounting done in the main agent cycle.
        cpuTime = utime + stime + cutime + cstime
        self.log.info('Current raw CPU time consumed is %s' % cpuTime)
        # Scale raw CPU seconds by the local normalization factor before
        # subtracting from the remaining budget.
        timeleft = self.timeLeft - cpuTime * self.cpuFactor
        return timeleft

    #############################################################################
    def __changeProxy(self, oldProxy, newProxy):
        """Placeholder for switching between proxies (e.g. via a glexec-style
           utility); currently it only records that a change-over happened.

           :param oldProxy: proxy being replaced (unused for now)
           :param newProxy: proxy to switch to (unused for now)
           :return: S_OK()
        """
        self.log.verbose('Log proxy change (to be instrumented)')
        return S_OK()

    #############################################################################
    def __setupProxy(self, ownerDN, ownerGroup):
        """Retrieve a proxy for the execution of the job.

           When running with a server certificate the payload proxy is always
           downloaded from the ProxyManager.  Otherwise the locally available
           proxy is used, unless its group carries a pilot property, in which
           case the payload proxy is downloaded as well.

           :param ownerDN: DN of the job owner
           :param ownerGroup: DIRAC group of the job owner
           :return: S_OK(proxyChain) or S_ERROR
        """
        if gConfig.getValue('/DIRAC/Security/UseServerCertificate', False):
            proxyResult = self.__requestProxyFromProxyManager(
                ownerDN, ownerGroup)
            if not proxyResult['OK']:
                self.log.error('Invalid Proxy', proxyResult['Message'])
                return S_ERROR('Failed to setup proxy: %s' %
                               proxyResult['Message'])
            return S_OK(proxyResult['Value'])

        ret = getProxyInfo(disableVOMS=True)
        if not ret['OK']:
            self.log.error('Invalid Proxy', ret['Message'])
            return S_ERROR('Invalid Proxy')

        proxyChain = ret['Value']['chain']
        if 'groupProperties' not in ret['Value']:
            # Route the diagnostic dump through the logger instead of the
            # raw Python-2 print statements that were here before.
            self.log.debug(str(ret['Value']))
            self.log.debug(proxyChain.dumpAllToString())
            self.log.error('Invalid Proxy',
                           'Group has no properties defined')
            return S_ERROR('Proxy has no group properties defined')

        groupProps = ret['Value']['groupProperties']
        if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
            # Pilot proxies may not be used for the payload: download the
            # owner's payload proxy instead.
            proxyResult = self.__requestProxyFromProxyManager(
                ownerDN, ownerGroup)
            if not proxyResult['OK']:
                self.log.error('Invalid Proxy', proxyResult['Message'])
                return S_ERROR('Failed to setup proxy: %s' %
                               proxyResult['Message'])
            proxyChain = proxyResult['Value']

        return S_OK(proxyChain)

    #############################################################################
    def __requestProxyFromProxyManager(self, ownerDN, ownerGroup):
        """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.

           :param ownerDN: DN of the job owner
           :param ownerGroup: DIRAC group of the job owner
           :return: S_OK(chain) on success, S_ERROR otherwise
        """
        self.log.info("Requesting proxy for %s@%s" % (ownerDN, ownerGroup))
        token = gConfig.getValue("/Security/ProxyToken", "")
        if not token:
            self.log.info(
                "No token defined. Trying to download proxy without token")
            token = False
        retVal = gProxyManager.getPayloadProxyFromDIRACGroup(
            ownerDN, ownerGroup, self.defaultProxyLength, token)
        if retVal['OK']:
            return S_OK(retVal['Value'])

        # Download failed: dump the local proxy information to aid debugging
        self.log.error('Could not retrieve proxy')
        self.log.warn(retVal)
        os.system('dirac-proxy-info')
        sys.stdout.flush()
        return S_ERROR('Error retrieving proxy')

    #############################################################################
    def __checkInstallSoftware(self, jobID, jobParams, resourceParams):
        """Checks software requirement of job and whether this is already present
           before installing software locally.

           :param jobID: ID of the job being prepared
           :param jobParams: dictionary of JDL parameters of the job
           :param resourceParams: dictionary describing the local CE resource
           :return: S_OK/S_ERROR from the VO installation module, or S_OK when
                    no installation is requested
        """
        # dict.has_key() is deprecated (removed in Python 3); use 'in' instead
        if 'SoftwareDistModule' not in jobParams:
            msg = 'Job has no software installation requirement'
            self.log.verbose(msg)
            return S_OK(msg)

        self.__report(jobID, 'Matched', 'Installing Software')
        softwareDist = jobParams['SoftwareDistModule']
        self.log.verbose('Found VO Software Distribution module: %s' %
                         (softwareDist))
        argumentsDict = {'Job': jobParams, 'CE': resourceParams}
        moduleFactory = ModuleFactory()
        moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict)
        if not moduleInstance['OK']:
            return moduleInstance

        # Delegate the actual installation to the VO-specific module
        return moduleInstance['Value'].execute()

    #############################################################################
    def __submitJob(self, jobID, jobParams, resourceParams, optimizerParams,
                    jobJDL, proxyChain):
        """Submit job to the Computing Element instance after creating a custom
           Job Wrapper with the available job parameters.

           :param jobID: ID of the job
           :param jobParams: JDL parameters of the job
           :param resourceParams: local resource (CE) description
           :param optimizerParams: extra parameters returned by the Matcher
           :param jobJDL: full JDL of the job
           :param proxyChain: proxy chain to pass to the payload
           :return: S_OK (with optional 'PayloadFailed' entry) or S_ERROR
        """
        result = self.__createJobWrapper(jobID, jobParams, resourceParams,
                                         optimizerParams)

        if not result['OK']:
            return result

        wrapperData = result['Value']
        wrapperFile = wrapperData['execFile']
        self.__report(jobID, 'Matched', 'Submitted To CE')

        wrapperName = os.path.basename(wrapperFile)
        self.log.info('Submitting %s to %sCE' % (wrapperName, self.ceName))

        # Pass proxy to the CE
        proxy = proxyChain.dumpAllToString()
        if not proxy['OK']:
            self.log.error(proxy)
            return S_ERROR('Payload Proxy Not Found')

        payloadProxy = proxy['Value']
        # The local batch ID is only known after a successful submission;
        # the previous dead 'dc<jobID>' pre-assignment has been removed.
        submission = self.computingElement.submitJob(wrapperFile, payloadProxy,
                                                     wrapperData)

        ret = S_OK('Job submitted')

        if submission['OK']:
            batchID = submission['Value']
            self.log.info('Job %s submitted as %s' % (jobID, batchID))
            self.log.verbose('Set JobParameter: Local batch ID %s' % (batchID))
            self.__setJobParam(jobID, 'LocalBatchID', str(batchID))
            if 'PayloadFailed' in submission:
                # Propagate the payload failure code to the caller
                ret['PayloadFailed'] = submission['PayloadFailed']
                return ret
            # Throttle successive submissions
            time.sleep(self.jobSubmissionDelay)
        else:
            self.log.error('Job submission failed', jobID)
            self.__setJobParam(jobID, 'ErrorMessage',
                               '%s CE Submission Error' % (self.ceName))
            if 'ReschedulePayload' in submission:
                rescheduleFailedJob(jobID, submission['Message'],
                                    self.__report)
            else:
                if 'Value' in submission:
                    self.log.error(
                        'Error in DIRAC JobWrapper:',
                        'exit code = %s' % (str(submission['Value'])))
                # make sure the Job is declared Failed
                self.__report(jobID, 'Failed', submission['Message'])
            return S_ERROR('%s CE Submission Error: %s' %
                           (self.ceName, submission['Message']))

        return ret

    #############################################################################
    def __createJobWrapper(self, jobID, jobParams, resourceParams,
                           optimizerParams):
        """This method creates a job wrapper filled with the CE and Job parameters
       to executed the job.
    """
        arguments = {
            'Job': jobParams,
            'CE': resourceParams,
            'Optimizer': optimizerParams
        }
        self.log.verbose('Job arguments are: \n %s' % (arguments))

        workingDir = gConfig.getValue('/LocalSite/WorkingDirectory',
                                      self.siteRoot)
        if not os.path.exists('%s/job/Wrapper' % (workingDir)):
            try:
                os.makedirs('%s/job/Wrapper' % (workingDir))
            except Exception:
                self.log.exception()
                return S_ERROR('Could not create directory for wrapper script')

        jobWrapperFile = '%s/job/Wrapper/Wrapper_%s' % (workingDir, jobID)
        if os.path.exists(jobWrapperFile):
            self.log.verbose('Removing existing Job Wrapper for %s' % (jobID))
            os.remove(jobWrapperFile)
        fd = open(self.jobWrapperTemplate, 'r')
        wrapperTemplate = fd.read()
        fd.close()

        dateStr = time.strftime("%Y-%m-%d", time.localtime(time.time()))
        timeStr = time.strftime("%H:%M", time.localtime(time.time()))
        date_time = '%s %s' % (dateStr, timeStr)
        signature = __RCSID__
        dPython = sys.executable

        systemConfig = ''
        if jobParams.has_key('SystemConfig'):
            systemConfig = jobParams['SystemConfig']
            self.log.verbose('Job system configuration requirement is %s' %
                             (systemConfig))
            if resourceParams.has_key('Root'):
                jobPython = '%s/%s/bin/python' % (resourceParams['Root'],
                                                  systemConfig)
                if os.path.exists(jobPython):
                    self.log.verbose('Found local python for job:\n%s' %
                                     (jobPython))
                    dPython = jobPython
                else:
                    if systemConfig == 'ANY':
                        self.log.verbose(
                            'Using standard available python %s for job' %
                            (dPython))
                    else:
                        self.log.warn(
                            'Job requested python \n%s\n but this is not available locally'
                            % (jobPython))
            else:
                self.log.warn('No LocalSite/Root defined')
        else:
            self.log.warn('Job has no system configuration requirement')

        if not systemConfig or systemConfig.lower() == 'any':
            systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
            if not systemConfig:
                self.log.warn('Could not establish SystemConfig')

        logLevel = self.defaultLogLevel
        if jobParams.has_key('LogLevel'):
            logLevel = jobParams['LogLevel']
            self.log.info('Found Job LogLevel JDL parameter with value: %s' %
                          (logLevel))
        else:
            self.log.info(
                'Applying default LogLevel JDL parameter with value: %s' %
                (logLevel))

        realPythonPath = os.path.realpath(dPython)
        self.log.debug('Real python path after resolving links is:')
        self.log.debug(realPythonPath)
        dPython = realPythonPath

        siteRootPython = self.siteRoot
        self.log.debug('DIRACPython is:\n%s' % dPython)
        self.log.debug('SiteRootPythonDir is:\n%s' % siteRootPython)
        wrapperTemplate = wrapperTemplate.replace("@SIGNATURE@",
                                                  str(signature))
        wrapperTemplate = wrapperTemplate.replace("@JOBID@", str(jobID))
        wrapperTemplate = wrapperTemplate.replace("@DATESTRING@",
                                                  str(date_time))
        wrapperTemplate = wrapperTemplate.replace("@JOBARGS@", str(arguments))
        wrapperTemplate = wrapperTemplate.replace("@SITEPYTHON@",
                                                  str(siteRootPython))
        wrapper = open(jobWrapperFile, "w")
        wrapper.write(wrapperTemplate)
        wrapper.close()
        jobExeFile = '%s/job/Wrapper/Job%s' % (workingDir, jobID)
        jobFileContents = \
    """#!/bin/sh
%s %s -o LogLevel=%s -o /DIRAC/Security/UseServerCertificate=no
""" % ( dPython, jobWrapperFile, logLevel )
        jobFile = open(jobExeFile, 'w')
        jobFile.write(jobFileContents)
        jobFile.close()

        data = {
            'execFile': jobExeFile,
            'wrapperTemplate': self.jobWrapperTemplate,
            'signature': signature,
            'wrapperFile': jobWrapperFile,
            'logLevel': logLevel,
            'jid': jobID,
            'datetime': date_time,
            'jobArgs': arguments,
            'sitePython': siteRootPython
        }
        return S_OK(data)

    #############################################################################
    def __saveJobJDLRequest(self, jobID, jobJDL):
        """Save job JDL local to JobAgent.

           :param jobID: ID of the job (may be an int or a string)
           :param jobJDL: JDL text of the job
           :return: S_OK(jdlFileName)
        """
        classAdJob = ClassAd(jobJDL)
        classAdJob.insertAttributeString('LocalCE', self.ceName)
        # '%s' formatting also works when jobID is an int; the previous
        # "jobID + '.jdl'" concatenation raised TypeError for non-strings.
        jdlFileName = '%s.jdl' % jobID
        with open(jdlFileName, 'w') as jdlFile:
            jdlFile.write(classAdJob.asJDL())
        return S_OK(jdlFileName)

    #############################################################################
    def __requestJob(self, ceDict):
        """Request a single job from the matcher service.

           :param ceDict: dictionary describing the CE capabilities
           :return: the matcher result dictionary, or S_ERROR on exception
        """
        try:
            matcher = RPCClient('WorkloadManagement/Matcher', timeout=600)
            return matcher.requestJob(ceDict)
        except Exception as x:
            # 'except Exception, x' is Python-2-only syntax; 'as' works in
            # both Python 2.6+ and Python 3.
            self.log.exception(lException=x)
            return S_ERROR(
                'Job request to matcher service failed with exception')
Ejemplo n.º 12
0
class Watchdog(object):

  #############################################################################
  def __init__(self, pid, exeThread, spObject, jobCPUTime,
               memoryLimit=0, processors=1, systemFlag='linux', jobArgs=None):
    """ Constructor, takes system flag as argument.

        :param pid: PID of the job wrapper process to monitor
        :param exeThread: thread object executing the payload
        :param spObject: subprocess wrapper object for the payload
        :param jobCPUTime: CPU time limit requested for the job
        :param memoryLimit: memory limit (0 disables the memory check)
        :param processors: number of processors allocated to the job
        :param systemFlag: platform flag (informational)
        :param jobArgs: dictionary of job (JDL) parameters
    """
    # None default instead of a mutable {} avoids the shared
    # mutable-default-argument pitfall; behaviour for callers is unchanged.
    if jobArgs is None:
      jobArgs = {}
    self.stopSigStartSeconds = int(jobArgs.get('StopSigStartSeconds', 1800))  # 30 minutes
    self.stopSigFinishSeconds = int(jobArgs.get('StopSigFinishSeconds', 1800))  # 30 minutes
    self.stopSigNumber = int(jobArgs.get('StopSigNumber', 2))  # SIGINT
    self.stopSigRegex = jobArgs.get('StopSigRegex', None)
    self.stopSigSent = False

    self.log = gLogger.getSubLogger("Watchdog")
    self.systemFlag = systemFlag
    self.exeThread = exeThread
    self.wrapperPID = pid
    self.appPID = self.exeThread.getCurrentPID()
    self.spObject = spObject
    self.jobCPUTime = jobCPUTime
    self.memoryLimit = memoryLimit
    self.calibration = 0
    self.initialValues = {}
    self.parameters = {}
    self.peekFailCount = 0
    self.peekRetry = 5
    self.profiler = Profiler(pid)
    self.checkError = ''
    self.currentStats = {}
    self.initialized = False
    self.count = 0

    # Defaults, overridden from the CS in initialize()
    self.testWallClock = 1
    self.testDiskSpace = 1
    self.testLoadAvg = 1
    self.testCPUConsumed = 1
    self.testCPULimit = 0
    self.testMemoryLimit = 0
    self.testTimeLeft = 1
    self.pollingTime = 10  # 10 seconds
    self.checkingTime = 30 * 60  # 30 minute period
    self.minCheckingTime = 20 * 60  # 20 mins
    self.wallClockCheckSeconds = 5 * 60  # 5 minutes
    # Previously assigned twice with the same value; kept once here
    self.maxWallClockTime = 3 * 24 * 60 * 60  # 3 days
    self.jobPeekFlag = 1  # on / off
    self.minDiskSpace = 10  # MB
    self.loadAvgLimit = 1000  # > 1000 and jobs killed
    self.sampleCPUTime = 30 * 60  # 30 min sample window
    self.jobCPUMargin = 20  # %age buffer before killing job
    self.minCPUWallClockRatio = 5  # ratio %age
    self.nullCPULimit = 5  # After 5 sample times return null CPU consumption kill job
    self.checkCount = 0
    self.wallClockCheckCount = 0
    self.nullCPUCount = 0

    self.grossTimeLeftLimit = 10 * self.checkingTime
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = 0
    self.littleTimeLeft = False
    self.scaleFactor = 1.0
    self.processors = processors

  #############################################################################
  def initialize(self, loops=0):
    """ Watchdog initialization.

        Reads the control flags and timing parameters from the CS section
        /Systems/WorkloadManagement/<instance>/JobWrapper, falling back to
        the defaults set in the constructor.

        :param loops: maximum number of watchdog cycles (stored in self.maxcount)
        :return: S_OK(), or S_ERROR if the Setup or WMS instance is undefined
    """
    if self.initialized:
      self.log.info('Watchdog already initialized')
      return S_OK()
    else:
      self.initialized = True

    setup = gConfig.getValue('/DIRAC/Setup', '')
    if not setup:
      return S_ERROR('Can not get the DIRAC Setup value')
    wms_instance = getSystemInstance("WorkloadManagement")
    if not wms_instance:
      return S_ERROR('Can not get the WorkloadManagement system instance')
    self.section = '/Systems/WorkloadManagement/%s/JobWrapper' % wms_instance

    self.maxcount = loops
    self.log.verbose('Watchdog initialization')
    self.log.info('Attempting to Initialize Watchdog for: %s' % (self.systemFlag))
    # Test control flags
    self.testWallClock = gConfig.getValue(self.section + '/CheckWallClockFlag', 1)
    self.testDiskSpace = gConfig.getValue(self.section + '/CheckDiskSpaceFlag', 1)
    self.testLoadAvg = gConfig.getValue(self.section + '/CheckLoadAvgFlag', 1)
    self.testCPUConsumed = gConfig.getValue(self.section + '/CheckCPUConsumedFlag', 1)
    self.testCPULimit = gConfig.getValue(self.section + '/CheckCPULimitFlag', 0)
    self.testMemoryLimit = gConfig.getValue(self.section + '/CheckMemoryLimitFlag', 0)
    self.testTimeLeft = gConfig.getValue(self.section + '/CheckTimeLeftFlag', 1)
    # Other parameters
    self.pollingTime = gConfig.getValue(self.section + '/PollingTime', 10)  # 10 seconds
    self.checkingTime = gConfig.getValue(self.section + '/CheckingTime', 30 * 60)  # 30 minute period
    self.minCheckingTime = gConfig.getValue(self.section + '/MinCheckingTime', 20 * 60)  # 20 mins
    self.maxWallClockTime = gConfig.getValue(self.section + '/MaxWallClockTime', 3 * 24 * 60 * 60)  # default 3 days
    self.jobPeekFlag = gConfig.getValue(self.section + '/JobPeekFlag', 1)  # on / off
    self.minDiskSpace = gConfig.getValue(self.section + '/MinDiskSpace', 10)  # MB
    self.loadAvgLimit = gConfig.getValue(self.section + '/LoadAverageLimit', 1000)  # > 1000 and jobs killed
    self.sampleCPUTime = gConfig.getValue(self.section + '/CPUSampleTime', 30 * 60)  # default 30 min sample
    self.jobCPUMargin = gConfig.getValue(self.section + '/JobCPULimitMargin', 20)  # %age buffer before killing job
    self.minCPUWallClockRatio = gConfig.getValue(self.section + '/MinCPUWallClockRatio', 5)  # ratio %age
    # After 5 sample times return null CPU consumption kill job
    self.nullCPULimit = gConfig.getValue(self.section + '/NullCPUCountLimit', 5)
    # Enforce the lower bound on the checking period
    if self.checkingTime < self.minCheckingTime:
      self.log.info(
          'Requested CheckingTime of %s setting to %s seconds (minimum)' %
          (self.checkingTime, self.minCheckingTime))
      self.checkingTime = self.minCheckingTime

    # The time left is returned in seconds @ 250 SI00 = 1 HS06,
    # the self.checkingTime and self.pollingTime are in seconds,
    # thus they need to be multiplied by a large enough factor
    self.fineTimeLeftLimit = gConfig.getValue(self.section + '/TimeLeftLimit', 150 * self.pollingTime)
    self.scaleFactor = gConfig.getValue('/LocalSite/CPUScalingFactor', 1.0)

    return S_OK()

  def run(self):
    """ The main watchdog execution method

        Initializes the watchdog, then polls execute() once per pollingTime
        until the payload ends or an error occurs.
    """
    result = self.initialize()
    if not result['OK']:
      self.log.always('Can not start watchdog for the following reason')
      self.log.always(result['Message'])
      return result

    try:
      while True:
        self.log.debug('Starting watchdog loop # %d' % self.count)
        cycleStart = time.time()
        cycleResult = self.execute()
        cycleDuration = time.time() - cycleStart
        if not cycleResult['OK']:
          self.log.error("Watchdog error during execution", cycleResult['Message'])
          break
        if cycleResult['Value'] == "Ended":
          break
        self.count += 1
        # Sleep out the remainder of the polling interval, if any
        remaining = self.pollingTime - cycleDuration
        if remaining > 0:
          time.sleep(remaining)
      return S_OK()
    except Exception:
      self.log.exception()
      return S_ERROR('Exception')

  #############################################################################
  def execute(self):
    """ The main agent execution method of the Watchdog.

        Called once per polling cycle; returns S_OK("Ended") once the
        monitored application thread has finished, plain S_OK() otherwise.
    """
    # NOTE: isAlive() is the Python-2-era Thread API name (is_alive in Python 3)
    if not self.exeThread.isAlive():
      self.__getUsageSummary()
      self.log.info('Process to monitor has completed, Watchdog will exit.')
      return S_OK("Ended")

    # WallClock checks every self.wallClockCheckSeconds, but only if StopSigRegex is defined in JDL
    if not self.stopSigSent and self.stopSigRegex is not None and (
            time.time() - self.initialValues['StartTime']) > self.wallClockCheckSeconds * self.wallClockCheckCount:
      self.wallClockCheckCount += 1
      self._performWallClockChecks()

    if self.littleTimeLeft:
      # if we have gone over enough iterations query again
      # NOTE(review): self.littleTimeLeftCount is not set in __init__;
      # presumably initialised wherever littleTimeLeft is set True - confirm
      if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1:
        self.checkError = 'Job has reached the CPU limit of the queue'
        self.log.error(self.checkError, self.timeLeft)
        self.__killRunningThread()
        return S_OK()
      else:
        self.littleTimeLeftCount -= 1

    # Note: need to poll regularly to see if the thread is alive
    #      but only perform checks with a certain frequency
    if (time.time() - self.initialValues['StartTime']) > self.checkingTime * self.checkCount:
      self.checkCount += 1
      result = self._performChecks()
      if not result['OK']:
        self.log.warn('Problem during recent checks')
        self.log.warn(result['Message'])
      return S_OK()
    else:
      # self.log.debug('Application thread is alive: checking count is %s' %(self.checkCount))
      return S_OK()

  #############################################################################
  def _performWallClockChecks(self):
    """Watchdog performs the wall clock checks based on MJF. Signals are sent
       to processes if we need to stop, but function always returns S_OK()
    """
    mjf = MJF.MJF()

    try:
      wallClockSecondsLeft = mjf.getWallClockSecondsLeft()
    except Exception:
      # Just stop if we can't get the wall clock seconds left
      return S_OK()

    jobstartSeconds = mjf.getIntJobFeature('jobstart_secs')
    if jobstartSeconds is None:
      # Just stop if we don't know when the job started
      return S_OK()

    if (int(time.time()) > jobstartSeconds + self.stopSigStartSeconds) and \
       (wallClockSecondsLeft < self.stopSigFinishSeconds + self.wallClockCheckSeconds):
      # Need to send the signal! Assume it works to avoid sending the signal more than once
      self.log.info('Sending signal %d to JobWrapper children' % self.stopSigNumber)
      self.stopSigSent = True

      try:
        for childPid in getChildrenPIDs(self.wrapperPID):
          try:
            # Close the /proc file promptly instead of leaking the handle
            with open('/proc/%d/cmdline' % childPid, 'r') as cmdFile:
              cmdline = cmdFile.read().replace('\0', ' ').strip()
          except IOError:
            # Process gone away? Not running on Linux? Skip anyway
            continue

          if re.search(self.stopSigRegex, cmdline) is not None:
            self.log.info(
                'Sending signal %d to process ID %d, cmdline = "%s"' %
                (self.stopSigNumber, childPid, cmdline))
            os.kill(childPid, self.stopSigNumber)

      except Exception as e:
        self.log.error('Failed to send signals to JobWrapper children! (%s)' % str(e))

    return S_OK()

  #############################################################################
  def _performChecks(self):
    """The Watchdog checks are performed at a different period to the checking of the
       application thread and correspond to the checkingTime.
    """
    self.log.verbose('------------------------------------')
    self.log.verbose('Checking loop starts for Watchdog')
    heartBeatDict = {}

    msg = ''

    loadAvg = float(os.getloadavg()[0])
    msg += 'LoadAvg: %d ' % loadAvg
    heartBeatDict['LoadAverage'] = loadAvg
    if 'LoadAverage' not in self.parameters:
      self.parameters['LoadAverage'] = []
    self.parameters['LoadAverage'].append(loadAvg)

    memoryUsed = self.getMemoryUsed()
    msg += 'MemUsed: %.1f kb ' % (memoryUsed)
    heartBeatDict['MemoryUsed'] = memoryUsed
    if 'MemoryUsed' not in self.parameters:
      self.parameters['MemoryUsed'] = []
    self.parameters['MemoryUsed'].append(memoryUsed)

    result = self.profiler.getAllProcessData(withChildren=True)
    if result['OK']:
      vsize = result['Value']['stats']['vSizeUsage'] * 1024.
      rss = result['Value']['stats']['memoryUsage'] * 1024.
      heartBeatDict['Vsize'] = vsize
      heartBeatDict['RSS'] = rss
      self.parameters.setdefault('Vsize', [])
      self.parameters['Vsize'].append(vsize)
      self.parameters.setdefault('RSS', [])
      self.parameters['RSS'].append(rss)
      msg += "Job Vsize: %.1f kb " % vsize
      msg += "Job RSS: %.1f kb " % rss
    result = self.getDiskSpace()
    if not result['OK']:
      self.log.warn("Could not establish DiskSpace", result['Message'])
    else:
      msg += 'DiskSpace: %.1f MB ' % (result['Value'])
    if 'DiskSpace' not in self.parameters:
      self.parameters['DiskSpace'] = []
    if result['OK']:
      self.parameters['DiskSpace'].append(result['Value'])
      heartBeatDict['AvailableDiskSpace'] = result['Value']

    cpu = self.__getCPU()
    if not cpu['OK']:
      msg += 'CPU: ERROR '
      hmsCPU = 0
    else:
      cpu = cpu['Value']
      msg += 'CPU: %s (h:m:s) ' % (cpu)
      if 'CPUConsumed' not in self.parameters:
        self.parameters['CPUConsumed'] = []
      self.parameters['CPUConsumed'].append(cpu)
      hmsCPU = cpu
      rawCPU = self.__convertCPUTime(hmsCPU)
      if rawCPU['OK']:
        heartBeatDict['CPUConsumed'] = rawCPU['Value']

    result = self.__getWallClockTime()
    if not result['OK']:
      self.log.warn("Failed determining wall clock time", result['Message'])
    else:
      msg += 'WallClock: %.2f s ' % (result['Value'])
      self.parameters.setdefault('WallClockTime', list()).append(result['Value'])
      heartBeatDict['WallClockTime'] = result['Value']
    self.log.info(msg)

    result = self._checkProgress()
    if not result['OK']:
      self.checkError = result['Message']
      self.log.warn(self.checkError)

      if self.jobPeekFlag:
        result = self.__peek()
        if result['OK']:
          outputList = result['Value']
          size = len(outputList)
          self.log.info('Last %s lines of available application output:' % (size))
          self.log.info('================START================')
          for line in outputList:
            self.log.info(line)

          self.log.info('=================END=================')

      self.__killRunningThread()
      return S_OK()

    recentStdOut = 'None'
    if self.jobPeekFlag:
      result = self.__peek()
      if result['OK']:
        outputList = result['Value']
        size = len(outputList)
        recentStdOut = 'Last %s lines of application output from Watchdog on %s [UTC]:' % (size, Time.dateTime())
        border = '=' * len(recentStdOut)
        cpuTotal = 'Last reported CPU consumed for job is %s (h:m:s)' % (hmsCPU)
        if self.timeLeft:
          cpuTotal += ', Batch Queue Time Left %s (s @ HS06)' % self.timeLeft
        recentStdOut = '\n%s\n%s\n%s\n%s\n' % (border, recentStdOut, cpuTotal, border)
        self.log.info(recentStdOut)
        for line in outputList:
          self.log.info(line)
          recentStdOut += line + '\n'
      else:
        recentStdOut = 'Watchdog is initializing and will attempt to obtain standard output from application thread'
        self.log.info(recentStdOut)
        self.peekFailCount += 1
        if self.peekFailCount > self.peekRetry:
          self.jobPeekFlag = 0
          self.log.warn('Turning off job peeking for remainder of execution')

    if 'JOBID' not in os.environ:
      self.log.info('Running without JOBID so parameters will not be reported')
      return S_OK()

    jobID = os.environ['JOBID']
    staticParamDict = {'StandardOutput': recentStdOut}
    self.__sendSignOfLife(int(jobID), heartBeatDict, staticParamDict)
    return S_OK('Watchdog checking cycle complete')

  #############################################################################
  def __getCPU(self):
    """Return the total CPU time consumed by the application process tree.

        Queries the Profiler (including child processes) and returns the
        user+system CPU total formatted as HH:MM:SS via __getCPUHMS.
        Returns S_ERROR when the profiler fails or reports no CPU usage.
    """
    try:
      result = self.profiler.getAllProcessData(withChildren=True)
      if not result['OK']:
        self.log.warn('Problem while checking consumed CPU')
        return result
      cpuTime = result['Value']
      if cpuTime:
        # Total CPU is the sum of system and user time over all processes
        cpuTimeTotal = cpuTime['stats']['cpuUsageSystem'] + cpuTime['stats']['cpuUsageUser']
        self.log.verbose("Raw CPU time consumed (s) = %s" % (cpuTimeTotal))
        return self.__getCPUHMS(cpuTimeTotal)
      else:
        self.log.error("CPU time consumed found to be 0")
        return S_ERROR()

    except Exception as e:
      self.log.warn('Could not determine CPU time consumed with exception')
      self.log.exception(e)
      return S_ERROR("Could not determine CPU time consumed with exception")

  #############################################################################
  def __getCPUHMS(self, cpuTime):
    """ Format a CPU time given in seconds as an 'HH:MM:SS' string,
        returned wrapped in S_OK.
    """
    seconds = cpuTime % 60
    minutes = (cpuTime // 60) % 60
    hours = cpuTime // 3600
    humanTime = '%02d:%02d:%02d' % (hours, minutes, seconds)
    self.log.verbose('Human readable CPU time is: %s' % humanTime)
    return S_OK(humanTime)

  #############################################################################
  def __interpretControlSignal(self, signalDict):
    """This method is called whenever a signal is sent via the result of
       sending a sign of life.

       A 'Kill' key in the dictionary stops the running application thread;
       anything else is logged and ignored.
    """
    self.log.info('Received control signal')
    if not isinstance(signalDict, dict):
      self.log.info('Expected dictionary for control signal, received:\n%s' % (signalDict))
      return S_OK()

    if 'Kill' in signalDict:
      self.log.info('Received Kill signal, stopping job via control signal')
      self.checkError = 'Received Kill signal'
      self.__killRunningThread()
    else:
      self.log.info('The following control signal was sent but not understood by the watchdog:')
      self.log.info(signalDict)

    return S_OK()

  #############################################################################
  def _checkProgress(self):
    """This method calls specific tests to determine whether the job execution
       is proceeding normally.  CS flags can easily be added to add or remove
       tests via central configuration.

       :return: S_OK with a summary when all enabled checks pass,
                the failing check's S_ERROR otherwise.
    """
    report = ''

    if self.testWallClock:
      result = self.__checkWallClockTime()
      if not result['OK']:
        self.log.warn(result['Message'])
        return result
      report += 'WallClock: OK, '
    else:
      report += 'WallClock: NA,'

    if self.testDiskSpace:
      result = self.__checkDiskSpace()
      if not result['OK']:
        self.log.warn(result['Message'])
        return result
      report += 'DiskSpace: OK, '
    else:
      report += 'DiskSpace: NA,'

    if self.testLoadAvg:
      result = self.__checkLoadAverage()
      if not result['OK']:
        # A failed load-average check is advisory only: record it and carry
        # on with the remaining checks instead of aborting the whole cycle
        # (previously an early return here skipped every subsequent check).
        self.log.warn("Check of load average failed, but won't fail because of that: %s" % result['Message'])
        report += 'LoadAverage: ERROR, '
      else:
        report += 'LoadAverage: OK, '
    else:
      report += 'LoadAverage: NA,'

    if self.testCPUConsumed:
      result = self.__checkCPUConsumed()
      if not result['OK']:
        return result
      report += 'CPUConsumed: OK, '
    else:
      report += 'CPUConsumed: NA, '

    if self.testCPULimit:
      result = self.__checkCPULimit()
      if not result['OK']:
        self.log.warn(result['Message'])
        return result
      report += 'CPULimit OK, '
    else:
      report += 'CPULimit: NA, '

    if self.testTimeLeft:
      # Updates self.timeLeft / self.littleTimeLeft as a side effect
      self.__timeLeft()
      if self.timeLeft:
        report += 'TimeLeft: OK'
    else:
      report += 'TimeLeft: NA'

    if self.testMemoryLimit:
      result = self.__checkMemoryLimit()
      if not result['OK']:
        self.log.warn(result['Message'])
        return result
      report += 'MemoryLimit OK, '
    else:
      report += 'MemoryLimit: NA, '

    self.log.info(report)
    return S_OK('All enabled checks passed')

  #############################################################################
  def __checkCPUConsumed(self):
    """ Checks whether the CPU consumed by application process is reasonable. This
        method will report stalled jobs to be killed.

        Compares CPU consumed over the last sampling window against elapsed
        wall clock time; a CPU/wallclock ratio below
        self.minCPUWallClockRatio (in %) marks the job as stalled.
    """
    self.log.info("Checking CPU Consumed")

    # Both histories are appended to by the periodic monitoring cycle
    if 'WallClockTime' not in self.parameters:
      return S_ERROR('Missing WallClockTime info')
    if 'CPUConsumed' not in self.parameters:
      return S_ERROR('Missing CPUConsumed info')

    wallClockTime = self.parameters['WallClockTime'][-1]
    if wallClockTime < self.sampleCPUTime:
      # Too early in the job for a meaningful sample
      self.log.info("Stopping check, wallclock time (%s) is still smaller than sample time (%s)" % (
          wallClockTime,
          self.sampleCPUTime))
      return S_OK()

    # Number of monitoring snapshots that span one CPU sampling window
    intervals = max(1, int(self.sampleCPUTime / self.checkingTime))
    if len(self.parameters['CPUConsumed']) < intervals + 1:
      self.log.info("Not enough snapshots to calculate, there are %s and we need %s" %
                    (len(self.parameters['CPUConsumed']), intervals + 1))
      return S_OK()

    # Wall clock elapsed across the last 'intervals' snapshots
    wallClockTime = self.parameters['WallClockTime'][-1] - self.parameters['WallClockTime'][-1 - intervals]
    try:
      # CPU delta over the same window, converted from 'HH:MM:SS' to seconds
      cpuTime = self.__convertCPUTime(self.parameters['CPUConsumed'][-1])['Value']
      # For some reason, some times the CPU consumed estimation returns 0
      # if cpuTime == 0:
      #   return S_OK()
      cpuTime -= self.__convertCPUTime(self.parameters['CPUConsumed'][-1 - intervals])['Value']
      if cpuTime < 0:
        self.log.warn('Consumed CPU time negative, something wrong may have happened, ignore')
        return S_OK()
      if wallClockTime <= 0:
        self.log.warn('Wallclock time should not be negative or zero, Ignore')
        return S_OK()

      ratio = (cpuTime / wallClockTime) * 100.

      self.log.info("CPU/Wallclock ratio is %.2f%%" % ratio)
      # in case of error cpuTime might be 0, exclude this
      if ratio < self.minCPUWallClockRatio:
        # The payload may drop this marker file to opt out of the stall check
        if os.path.exists('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK'):
          self.log.info('N.B. job would be declared as stalled but CPU / WallClock check is disabled by payload')
          return S_OK()
        self.log.info("Job is stalled!")
        return S_ERROR('Watchdog identified this job as stalled')
    except Exception as e:
      self.log.error("Cannot convert CPU consumed from string to int", str(e))

    return S_OK()

  #############################################################################

  def __convertCPUTime(self, cputime):
    """ Method to convert the CPU time as returned from the Watchdog
        instances ('HH:MM:SS' string) to the equivalent DIRAC normalized CPU
        time (seconds) to be compared to the Job CPU requirement.

        :return: S_OK with 'Value' in seconds, or S_ERROR on unparseable input.
    """
    try:
      # Parse inside the try block: a non-string input (e.g. the integer 0
      # used before any CPU sample exists) must yield S_ERROR rather than
      # escape as an uncaught AttributeError.
      cpuHMS = cputime.split(':')
      hours = float(cpuHMS[0]) * 60 * 60
      mins = float(cpuHMS[1]) * 60
      secs = float(cpuHMS[2])
      cpuValue = float(hours + mins + secs)
    except Exception as x:
      self.log.warn(str(x))
      return S_ERROR('Could not calculate CPU time')

    # Normalization to be implemented
    normalizedCPUValue = cpuValue

    result = S_OK()
    result['Value'] = normalizedCPUValue
    self.log.debug('CPU value %s converted to %s' % (cputime, normalizedCPUValue))
    return result

  #############################################################################

  def __checkCPULimit(self):
    """ Checks whether the job has consumed more than the job CPU requirement
        (plus a configurable margin) and flags it to be killed if so.

        :return: S_ERROR when the CPU limit is exceeded, S_OK otherwise.
    """
    consumedCPU = 0
    if 'CPUConsumed' in self.parameters:
      consumedCPU = self.parameters['CPUConsumed'][-1]

    consumedCPUDict = self.__convertCPUTime(consumedCPU)
    if consumedCPUDict['OK']:
      currentCPU = consumedCPUDict['Value']
    else:
      return S_OK('Not possible to determine current CPU consumed')

    if consumedCPU:
      # Float division is essential: under Python 2 integer division a
      # margin below 100% would silently truncate to a zero margin.
      limit = self.jobCPUTime + self.jobCPUTime * (self.jobCPUMargin / 100.)
      cpuConsumed = float(currentCPU)
      if cpuConsumed > limit:
        self.log.info(
            'Job has consumed more than the specified CPU limit with an additional %s%% margin' %
            (self.jobCPUMargin))
        return S_ERROR('Job has exceeded maximum CPU time limit')
      else:
        return S_OK('Job within CPU limit')
    elif not currentCPU:
      self.log.verbose('Both initial and current CPU consumed are null')
      return S_OK('CPU consumed is not measurable yet')
    else:
      return S_OK('Not possible to determine CPU consumed')

  def __checkMemoryLimit(self):
    """ Checks that the job memory consumption is within a limit.

        Only warns for the moment; always returns S_OK.
    """
    # Default to 0 so the method does not raise NameError when no Vsize
    # sample has been recorded yet.
    vsize = 0
    if 'Vsize' in self.parameters:
      vsize = self.parameters['Vsize'][-1]

    if vsize and self.memoryLimit:
      if vsize > self.memoryLimit:
        # Just a warning for the moment (format spec fixed from '%f.2')
        self.log.warn("Job has consumed %.2f KB of memory with the limit of %.2f KB" % (vsize, self.memoryLimit))

    return S_OK()

  #############################################################################
  def __checkDiskSpace(self):
    """Checks whether the CS defined minimum disk space is available.
    """
    if 'DiskSpace' not in self.parameters:
      return S_ERROR('Available disk space could not be established')

    availSpace = self.parameters['DiskSpace'][-1]
    # Negative values mean "could not be measured" and are not treated as low
    if 0 <= availSpace < self.minDiskSpace:
      self.log.info('Not enough local disk space for job to continue, defined in CS as %s MB' % (self.minDiskSpace))
      return S_ERROR('Job has insufficient disk space to continue')
    return S_OK('Job has enough disk space available')

  #############################################################################
  def __checkWallClockTime(self):
    """Checks whether the job has been running for the CS defined maximum
       wall clock time.
    """
    if 'StartTime' not in self.initialValues:
      return S_ERROR('Job start time could not be established')

    elapsed = time.time() - self.initialValues['StartTime']
    if elapsed > self.maxWallClockTime:
      self.log.info('Job has exceeded maximum wall clock time of %s seconds' % (self.maxWallClockTime))
      return S_ERROR('Job has exceeded maximum wall clock time')
    return S_OK('Job within maximum wall clock time')

  #############################################################################
  def __checkLoadAverage(self):
    """Checks whether the CS defined maximum load average is exceeded.
    """
    if 'LoadAverage' not in self.parameters:
      return S_ERROR('Job load average not established')

    latestLoad = self.parameters['LoadAverage'][-1]
    if latestLoad > float(self.loadAvgLimit):
      self.log.info('Maximum load average exceeded, defined in CS as %s ' % (self.loadAvgLimit))
      return S_ERROR('Job exceeded maximum load average')
    return S_OK('Job running with normal load average')

  #############################################################################
  def __peek(self):
    """ Uses ExecutionThread.getOutput() method to obtain standard output
        from running thread via subprocess callback function.
    """
    peekResult = self.exeThread.getOutput()
    if not peekResult['OK']:
      self.log.warn('Could not obtain output from running application thread')
      self.log.warn(peekResult['Message'])
    return peekResult

  #############################################################################
  def calibrate(self):
    """ The calibrate method obtains the initial values for system memory and load
        and calculates the margin for error for the rest of the Watchdog cycle.

        Failed probes are logged and replaced with neutral defaults instead of
        dereferencing 'Value' on an error result (which would raise KeyError).
    """
    # First call records the StartTime baseline
    self.__getWallClockTime()
    self.parameters['WallClockTime'] = []

    cpuConsumed = self.__getCPU()
    if not cpuConsumed['OK']:
      self.log.warn("Could not establish CPU consumed, setting to 0.0")
      cpuConsumed = 0.0
    else:
      cpuConsumed = cpuConsumed['Value']

    self.initialValues['CPUConsumed'] = cpuConsumed
    self.parameters['CPUConsumed'] = []

    # 1-minute load average at the start of the job
    self.initialValues['LoadAverage'] = float(os.getloadavg()[0])
    self.parameters['LoadAverage'] = []

    memUsed = self.getMemoryUsed()

    self.initialValues['MemoryUsed'] = memUsed
    self.parameters['MemoryUsed'] = []

    result = self.profiler.getAllProcessData(withChildren=True)
    if not result['OK']:
      # 'Value' is absent on failure; fall back to zero baselines
      self.log.warn('Could not get job memory usage')
      self.initialValues['Vsize'] = 0.0
      self.initialValues['RSS'] = 0.0
    else:
      self.log.verbose('Job Memory: %s' % (result['Value']))
      self.initialValues['Vsize'] = result['Value']['stats']['vSizeUsage'] * 1024.
      self.initialValues['RSS'] = result['Value']['stats']['memoryUsage'] * 1024.
    self.parameters['Vsize'] = []
    self.parameters['RSS'] = []

    result = self.getDiskSpace()
    self.log.verbose('DiskSpace: %s' % (result))
    if not result['OK']:
      self.log.warn("Could not establish DiskSpace")
      self.initialValues['DiskSpace'] = 0
    else:
      self.initialValues['DiskSpace'] = result['Value']
    self.parameters['DiskSpace'] = []

    result = self.getNodeInformation()
    self.log.verbose('NodeInfo: %s' % (result))
    if not result['OK']:
      self.log.warn("Could not establish static system information")

    # Record the batch-system local job ID when one is exported
    if 'LSB_JOBID' in os.environ:
      result['LocalJobID'] = os.environ['LSB_JOBID']
    if 'PBS_JOBID' in os.environ:
      result['LocalJobID'] = os.environ['PBS_JOBID']
    if 'QSUB_REQNAME' in os.environ:
      result['LocalJobID'] = os.environ['QSUB_REQNAME']
    if 'JOB_ID' in os.environ:
      result['LocalJobID'] = os.environ['JOB_ID']

    self.__reportParameters(result, 'NodeInformation', True)
    self.__reportParameters(self.initialValues, 'InitialValues')
    return S_OK()

  def __timeLeft(self):
    """
      Return the normalized CPU time left in the batch system
      (0 if not available) and update self.timeLeft / self.littleTimeLeft.

      Returns -1 when already in 'little time left' mode and the remaining
      time drops under the fine-grained limit.
    """
    # Get CPU time left in the batch system
    result = self.timeLeftUtil.getTimeLeft(0.0)
    if not result['OK']:
      # Could not get CPU time left, we might need to wait for the first loop
      # or the Utility is not working properly for this batch system
      # or we are in a batch system
      timeLeft = 0
    else:
      timeLeft = result['Value']

    self.timeLeft = timeLeft
    if not self.littleTimeLeft:
      if timeLeft and timeLeft < self.grossTimeLeftLimit:
        # Log the threshold we just crossed (previous message misspelled
        # "below" and printed the current value instead of the limit).
        self.log.info('TimeLeft below %s, now checking with higher frequency' % self.grossTimeLeftLimit)
        self.littleTimeLeft = True
        # TODO: better configurable way of doing this to be coded
        self.littleTimeLeftCount = 15
    else:
      if self.timeLeft and self.timeLeft < self.fineTimeLeftLimit:
        timeLeft = -1

    return timeLeft

  #############################################################################
  def __getUsageSummary(self):
    """ Returns average load, memory etc. over execution of job thread.

        Fills self.currentStats and reports the summary as job parameters.
    """
    summary = {}
    # CPUConsumed: last snapshot converted from 'HH:MM:SS' to seconds
    if 'CPUConsumed' in self.parameters:
      cpuList = self.parameters['CPUConsumed']
      if cpuList:
        hmsCPU = cpuList[-1]
        rawCPU = self.__convertCPUTime(hmsCPU)
        if rawCPU['OK']:
          summary['LastUpdateCPU(s)'] = rawCPU['Value']
      else:
        summary['LastUpdateCPU(s)'] = 'Could not be estimated'
    # DiskSpace: absolute change with respect to the initial value.
    # abs() already guarantees a non-negative result, so the former
    # "if value < 0" clamp was unreachable and has been removed.
    if 'DiskSpace' in self.parameters:
      space = self.parameters['DiskSpace']
      if space:
        summary['DiskSpace(MB)'] = abs(float(space[-1]) - float(self.initialValues['DiskSpace']))
      else:
        summary['DiskSpace(MB)'] = 'Could not be estimated'
    # MemoryUsed: absolute change with respect to the initial value
    if 'MemoryUsed' in self.parameters:
      memory = self.parameters['MemoryUsed']
      if memory:
        summary['MemoryUsed(kb)'] = abs(float(memory[-1]) - float(self.initialValues['MemoryUsed']))
      else:
        summary['MemoryUsed(kb)'] = 'Could not be estimated'
    # LoadAverage: mean over all snapshots
    if 'LoadAverage' in self.parameters:
      laList = self.parameters['LoadAverage']
      if laList:
        summary['LoadAverage'] = float(sum(laList)) / float(len(laList))
      else:
        summary['LoadAverage'] = 'Could not be estimated'

    result = self.__getWallClockTime()
    if not result['OK']:
      self.log.warn("Failed determining wall clock time", result['Message'])
      summary['WallClockTime(s)'] = 0
      summary['ScaledCPUTime(s)'] = 0
    else:
      wallClock = result['Value']
      summary['WallClockTime(s)'] = wallClock
      summary['ScaledCPUTime(s)'] = wallClock * self.scaleFactor * self.processors

    self.__reportParameters(summary, 'UsageSummary', True)
    self.currentStats = summary

  #############################################################################
  def __reportParameters(self, params, title=None, report=False):
    """Will report parameters for job.

       Logs every non-empty entry and, when report is True, also uploads
       them via __setJobParamList.  Exceptions are caught and logged.
    """
    try:
      separator = '=========================================================='
      collected = []
      self.log.info(separator)
      if title:
        self.log.info('Watchdog will report %s' % (title))
      else:
        self.log.info('Watchdog will report parameters')
      self.log.info(separator)
      # When given a DIRAC result structure, unwrap its 'Value' payload
      vals = params
      if 'Value' in params and params['Value']:
        vals = params['Value']
      for key, val in vals.items():
        if val:
          self.log.info(str(key) + ' = ' + str(val))
          collected.append((key, val))
      if report:
        self.__setJobParamList(collected)

      self.log.info(separator)
    except Exception as x:
      self.log.warn('Problem while reporting parameters')
      self.log.warn(str(x))

  #############################################################################
  def __getWallClockTime(self):
    """ Establishes the Wall Clock time spent since the Watchdog initialization"""
    result = S_OK()
    startTime = self.initialValues.get('StartTime')
    if startTime is None:
      # First call: record the baseline and report zero elapsed time
      self.initialValues['StartTime'] = time.time()
      result['Value'] = 0.0
    else:
      result['Value'] = time.time() - startTime
    return result

  #############################################################################
  def __killRunningThread(self):
    """ Will kill the running thread process and any child processes."""
    childPID = self.spObject.getChildPID()
    self.log.info('Sending kill signal to application PID %s' % (childPID))
    killResult = self.spObject.killChild()
    # Remember that we killed the payload so the wrapper can report it
    self.applicationKilled = True
    self.log.info('Subprocess.killChild() returned:%s ' % (killResult))
    return S_OK('Thread killed')

  #############################################################################
  def __sendSignOfLife(self, jobID, heartBeatDict, staticParamDict):
    """ Sends sign of life 'heartbeat' signal and triggers control signal
        interpretation.
    """
    heartbeatResult = JobStateUpdateClient().sendHeartBeat(jobID, heartBeatDict, staticParamDict)
    if not heartbeatResult['OK']:
      self.log.warn('Problem sending sign of life')
      self.log.warn(heartbeatResult)
    elif heartbeatResult['Value']:
      # A non-empty value carries a control signal (e.g. Kill) from the server
      self.__interpretControlSignal(heartbeatResult['Value'])
    return heartbeatResult

  #############################################################################
  def __setJobParamList(self, value):
    """Wraps around setJobParameters of state update client
    """
    # job wrapper template sets the jobID variable
    jobID = os.environ.get('JOBID')
    if jobID is None:
      self.log.info('Running without JOBID so parameters will not be reported')
      return S_OK()
    jobParam = JobStateUpdateClient().setJobParameters(int(jobID), value)
    self.log.verbose('setJobParameters(%s,%s)' % (jobID, value))
    if not jobParam['OK']:
      self.log.warn(jobParam['Message'])
    return jobParam

  #############################################################################
  def getNodeInformation(self):
    """ Attempts to retrieve all static system information, should be overridden in a subclass"""
    message = 'Watchdog: getNodeInformation method should be implemented in a subclass'
    self.log.warn(message)
    return S_ERROR(message)

  #############################################################################
  def getMemoryUsed(self):
    """Obtains the memory used.

       Sum of the peak resident set sizes (ru_maxrss) reported by the
       resource module for this process and its terminated children.
    """
    ownPeak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    childPeak = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
    return float(ownPeak + childPeak)

  #############################################################################
  def getDiskSpace(self):
    """ Attempts to get the available disk space, should be overridden in a subclass"""
    message = 'Watchdog: getDiskSpace method should be implemented in a subclass'
    self.log.warn(message)
    return S_ERROR(message)
Ejemplo n.º 13
0
def getCPUTime(cpuNormalizationFactor):
    """ Trying to get CPUTime (in seconds) from the CS or from TimeLeft.
        The default is a (low) 3600s.

        This is a generic method, independent from the middleware of the
        resource, used if TimeLeft doesn't return a value.

        :param float cpuNormalizationFactor: HS06-per-second normalization
            factor; when 0, it is read from /LocalSite/CPUNormalizationFactor
        :return: CPU time left as an integer number of (real) seconds
    """
    cpuTimeLeft = 0.
    cpuWorkLeft = gConfig.getValue('/LocalSite/CPUTimeLeft', 0)

    if not cpuWorkLeft:
        # Try and get the information from the CPU left utility
        result = TimeLeft().getTimeLeft()
        if result['OK']:
            cpuWorkLeft = result['Value']

    if cpuWorkLeft:
        # This is in HS06sseconds
        # We need to convert in real seconds
        if not cpuNormalizationFactor:  # if cpuNormalizationFactor passed in is 0, try get it from the local cfg
            cpuNormalizationFactor = gConfig.getValue(
                '/LocalSite/CPUNormalizationFactor', 0.0)
        if cpuNormalizationFactor:
            cpuTimeLeft = cpuWorkLeft / cpuNormalizationFactor  # this is a float
    if not cpuTimeLeft:
        # now we know that we have to find the CPUTimeLeft by looking in the CS
        # this is not granted to be correct as the CS units may not be real seconds
        gridCE = gConfig.getValue('/LocalSite/GridCE')
        ceQueue = gConfig.getValue('/LocalSite/CEQueue')
        if not ceQueue:
            # we have to look for a CEQueue in the CS
            # A bit hacky. We should better profit from something generic
            gLogger.warn(
                "No CEQueue in local configuration, looking to find one in CS")
            siteName = gConfig.getValue('/LocalSite/Site')
            queueSection = '/Resources/Sites/%s/%s/CEs/%s/Queues' % (
                siteName.split('.')[0], siteName, gridCE)
            res = gConfig.getSections(queueSection)
            if not res['OK']:
                raise RuntimeError(res['Message'])
            queues = res['Value']
            cpuTimes = [
                gConfig.getValue(queueSection + '/' + queue + '/maxCPUTime',
                                 10000.) for queue in queues
            ]
            # These are (real, wall clock) minutes - damn BDII!
            cpuTimeLeft = min(cpuTimes) * 60
        else:
            queueInfo = getQueueInfo('%s/%s' % (gridCE, ceQueue))
            cpuTimeLeft = 3600.
            if not queueInfo['OK'] or not queueInfo['Value']:
                gLogger.warn(
                    "Can't find a CE/queue, defaulting CPUTime to %d" %
                    cpuTimeLeft)
            else:
                queueCSSection = queueInfo['Value']['QueueCSSection']
                # These are (real, wall clock) minutes - damn BDII!
                cpuTimeInMinutes = gConfig.getValue(
                    '%s/maxCPUTime' % queueCSSection, 0.)
                if cpuTimeInMinutes:
                    cpuTimeLeft = cpuTimeInMinutes * 60.
                    gLogger.info("CPUTime for %s: %f" %
                                 (queueCSSection, cpuTimeLeft))
                else:
                    gLogger.warn(
                        "Can't find maxCPUTime for %s, defaulting CPUTime to %f"
                        % (queueCSSection, cpuTimeLeft))

    return int(cpuTimeLeft)
Ejemplo n.º 14
0
    def initialize(self, loops=0):
        """ Watchdog initialization.

            Reads test flags and thresholds from the JobWrapper CS section,
            falling back to the defaults set in the constructor.

            :param int loops: maximum number of monitoring cycles (0 = unlimited)
        """
        if self.initialized:
            self.log.info('Watchdog already initialized')
            return S_OK()
        else:
            self.initialized = True

        setup = gConfig.getValue('/DIRAC/Setup', '')
        if not setup:
            return S_ERROR('Can not get the DIRAC Setup value')
        wms_instance = getSystemInstance("WorkloadManagement")
        if not wms_instance:
            return S_ERROR(
                'Can not get the WorkloadManagement system instance')
        self.section = '/Systems/WorkloadManagement/%s/JobWrapper' % wms_instance

        self.maxcount = loops
        self.log.verbose('Watchdog initialization')
        self.log.info('Attempting to Initialize Watchdog for: %s' %
                      (self.systemFlag))
        # Test control flags
        self.testWallClock = gConfig.getValue(
            self.section + '/CheckWallClockFlag', 1)
        self.testDiskSpace = gConfig.getValue(
            self.section + '/CheckDiskSpaceFlag', 1)
        self.testLoadAvg = gConfig.getValue(self.section + '/CheckLoadAvgFlag',
                                            1)
        self.testCPUConsumed = gConfig.getValue(
            self.section + '/CheckCPUConsumedFlag', 1)
        self.testCPULimit = gConfig.getValue(
            self.section + '/CheckCPULimitFlag', 0)
        self.testMemoryLimit = gConfig.getValue(
            self.section + '/CheckMemoryLimitFlag', 0)
        self.testTimeLeft = gConfig.getValue(
            self.section + '/CheckTimeLeftFlag', 1)
        # Other parameters
        self.pollingTime = gConfig.getValue(self.section + '/PollingTime',
                                            10)  # 10 seconds
        self.checkingTime = gConfig.getValue(self.section + '/CheckingTime',
                                             30 * 60)  # 30 minute period
        self.minCheckingTime = gConfig.getValue(
            self.section + '/MinCheckingTime', 20 * 60)  # 20 mins
        self.maxWallClockTime = gConfig.getValue(
            self.section + '/MaxWallClockTime',
            3 * 24 * 60 * 60)  # 3 days
        self.jobPeekFlag = gConfig.getValue(self.section + '/JobPeekFlag',
                                            1)  # on / off
        self.minDiskSpace = gConfig.getValue(self.section + '/MinDiskSpace',
                                             10)  # MB
        self.loadAvgLimit = gConfig.getValue(
            self.section + '/LoadAverageLimit', 1000)  # > 1000 and jobs killed
        self.sampleCPUTime = gConfig.getValue(self.section + '/CPUSampleTime',
                                              30 *
                                              60)  # 30 min sample window
        self.jobCPUMargin = gConfig.getValue(
            self.section + '/JobCPULimitMargin',
            20)  # %age buffer before killing job
        self.minCPUWallClockRatio = gConfig.getValue(
            self.section + '/MinCPUWallClockRatio', 5)  # ratio %age
        self.nullCPULimit = gConfig.getValue(
            self.section + '/NullCPUCountLimit',
            5)  # After 5 sample times return null CPU consumption kill job
        self.checkCount = 0
        self.nullCPUCount = 0
        # Never check more often than the configured minimum period
        if self.checkingTime < self.minCheckingTime:
            self.log.info(
                'Requested CheckingTime of %s setting to %s seconds (minimum)'
                % (self.checkingTime, self.minCheckingTime))
            self.checkingTime = self.minCheckingTime

        # The time left is returned in seconds @ 250 SI00 = 1 HS06,
        # the self.checkingTime and self.pollingTime are in seconds,
        # thus they need to be multiplied by a large enough factor
        self.grossTimeLeftLimit = 10 * self.checkingTime
        self.fineTimeLeftLimit = gConfig.getValue(
            self.section + '/TimeLeftLimit', 150 * self.pollingTime)

        self.timeLeftUtil = TimeLeft()
        self.timeLeft = 0
        self.littleTimeLeft = False
        return S_OK()
Ejemplo n.º 15
0
class Watchdog( object ):
  """ The Watchdog monitors a running job: it polls the application thread,
      periodically samples CPU, memory, load average, disk space and wall
      clock time, applies CS-configurable limit checks, sends heartbeats to
      the WMS (which may return control signals, e.g. Kill) and terminates
      the application thread when a limit is breached.

      Platform-specific probes (getNodeInformation, getLoadAverage,
      getMemoryUsed, getDiskSpace) must be implemented in a subclass.
  """

  #############################################################################
  def __init__( self, pid, exeThread, spObject, jobCPUtime, memoryLimit = 0, systemFlag = 'linux2.4' ):
    """ Constructor, takes system flag as argument.

        :param pid: PID of the job wrapper process to monitor
        :param exeThread: ExecutionThread object running the application
        :param spObject: Subprocess object used to kill the child process tree
        :param jobCPUtime: job CPU time requirement (s), used by the CPU limit check
        :param memoryLimit: memory limit in KB (0 disables the memory warning)
        :param systemFlag: platform identifier string
    """
    self.log = gLogger.getSubLogger( "Watchdog" )
    self.systemFlag = systemFlag
    self.exeThread = exeThread
    self.wrapperPID = pid
    self.appPID = self.exeThread.getCurrentPID()
    self.spObject = spObject
    self.jobCPUtime = jobCPUtime
    self.memoryLimit = memoryLimit
    self.calibration = 0
    self.initialValues = {}  # baseline samples taken by calibrate()
    self.parameters = {}  # time series of samples, one list per metric
    self.peekFailCount = 0
    self.peekRetry = 5
    self.processMonitor = ProcessMonitor()
    self.checkError = ''
    self.currentStats = {}
    self.initialized = False
    self.count = 0  # number of completed watchdog loops
    # Fix: these two attributes were previously set only lazily
    # (__killRunningThread / __timeLeft), risking AttributeError elsewhere.
    self.applicationKilled = False
    self.littleTimeLeftCount = 0

    # defaults, overridden from the CS in initialize()
    self.testWallClock = 1
    self.testDiskSpace = 1
    self.testLoadAvg = 1
    self.maxWallClockTime = 3 * 24 * 60 * 60
    self.testCPUConsumed = 1
    self.testCPULimit = 0
    self.testMemoryLimit = 0
    self.testTimeLeft = 1
    self.pollingTime = 10  # 10 seconds
    self.checkingTime = 30 * 60  # 30 minute period
    self.minCheckingTime = 20 * 60  # 20 mins
    self.maxWallClockTime = 3 * 24 * 60 * 60  # e.g. 4 days
    self.jobPeekFlag = 1  # on / off
    self.minDiskSpace = 10  # MB
    self.loadAvgLimit = 1000  # > 1000 and jobs killed
    self.sampleCPUTime = 30 * 60  # e.g. up to 20mins sample
    self.jobCPUMargin = 20  # %age buffer before killing job
    self.minCPUWallClockRatio = 5  # ratio %age
    self.nullCPULimit = 5  # After 5 sample times return null CPU consumption kill job
    self.checkCount = 0
    self.nullCPUCount = 0

    self.grossTimeLeftLimit = 10 * self.checkingTime
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = 0
    self.littleTimeLeft = False


  #############################################################################
  def initialize( self, loops = 0 ):
    """ Watchdog initialization: reads all check flags and thresholds from the
        JobWrapper section of the CS.

        :param loops: maximum number of watchdog loops (0 means unlimited)
        :return: S_OK() or S_ERROR() if the Setup / WMS instance cannot be found
    """
    if self.initialized:
      self.log.info( 'Watchdog already initialized' )
      return S_OK()
    else:
      self.initialized = True

    setup = gConfig.getValue( '/DIRAC/Setup', '' )
    if not setup:
      return S_ERROR( 'Can not get the DIRAC Setup value' )
    wms_instance = getSystemInstance( "WorkloadManagement" )
    if not wms_instance:
      return S_ERROR( 'Can not get the WorkloadManagement system instance' )
    self.section = '/Systems/WorkloadManagement/%s/JobWrapper' % wms_instance

    self.maxcount = loops
    self.log.verbose( 'Watchdog initialization' )
    self.log.info( 'Attempting to Initialize Watchdog for: %s' % ( self.systemFlag ) )
    # Test control flags
    self.testWallClock = gConfig.getValue( self.section + '/CheckWallClockFlag', 1 )
    self.testDiskSpace = gConfig.getValue( self.section + '/CheckDiskSpaceFlag', 1 )
    self.testLoadAvg = gConfig.getValue( self.section + '/CheckLoadAvgFlag', 1 )
    self.testCPUConsumed = gConfig.getValue( self.section + '/CheckCPUConsumedFlag', 1 )
    self.testCPULimit = gConfig.getValue( self.section + '/CheckCPULimitFlag', 0 )
    self.testMemoryLimit = gConfig.getValue( self.section + '/CheckMemoryLimitFlag', 0 )
    self.testTimeLeft = gConfig.getValue( self.section + '/CheckTimeLeftFlag', 1 )
    # Other parameters
    self.pollingTime = gConfig.getValue( self.section + '/PollingTime', 10 )  # 10 seconds
    self.checkingTime = gConfig.getValue( self.section + '/CheckingTime', 30 * 60 )  # 30 minute period
    self.minCheckingTime = gConfig.getValue( self.section + '/MinCheckingTime', 20 * 60 )  # 20 mins
    self.maxWallClockTime = gConfig.getValue( self.section + '/MaxWallClockTime', 3 * 24 * 60 * 60 )  # e.g. 4 days
    self.jobPeekFlag = gConfig.getValue( self.section + '/JobPeekFlag', 1 )  # on / off
    self.minDiskSpace = gConfig.getValue( self.section + '/MinDiskSpace', 10 )  # MB
    self.loadAvgLimit = gConfig.getValue( self.section + '/LoadAverageLimit', 1000 )  # > 1000 and jobs killed
    self.sampleCPUTime = gConfig.getValue( self.section + '/CPUSampleTime', 30 * 60 )  # e.g. up to 20mins sample
    self.jobCPUMargin = gConfig.getValue( self.section + '/JobCPULimitMargin', 20 )  # %age buffer before killing job
    self.minCPUWallClockRatio = gConfig.getValue( self.section + '/MinCPUWallClockRatio', 5 )  # ratio %age
    self.nullCPULimit = gConfig.getValue( self.section + '/NullCPUCountLimit', 5 )  # After 5 sample times return null CPU consumption kill job
    if self.checkingTime < self.minCheckingTime:
      self.log.info( 'Requested CheckingTime of %s setting to %s seconds (minimum)' % ( self.checkingTime, self.minCheckingTime ) )
      self.checkingTime = self.minCheckingTime

    # The time left is returned in seconds @ 250 SI00 = 1 HS06,
    # the self.checkingTime and self.pollingTime are in seconds,
    # thus they need to be multiplied by a large enough factor
    self.fineTimeLeftLimit = gConfig.getValue( self.section + '/TimeLeftLimit', 150 * self.pollingTime )

    return S_OK()

  def run( self ):
    """ The main watchdog execution method: loops calling execute() every
        pollingTime seconds until the application ends or an error occurs.
    """

    result = self.initialize()
    if not result['OK']:
      self.log.always( 'Can not start watchdog for the following reason' )
      self.log.always( result['Message'] )
      return result

    try:
      while True:
        self.log.debug( 'Starting watchdog loop # %d' % self.count )
        start_cycle_time = time.time()
        result = self.execute()
        exec_cycle_time = time.time() - start_cycle_time
        if not result[ 'OK' ]:
          self.log.error( "Watchdog error during execution", result[ 'Message' ] )
          break
        elif result['Value'] == "Ended":
          break
        self.count += 1
        # sleep only for the remainder of the polling period
        if exec_cycle_time < self.pollingTime:
          time.sleep( self.pollingTime - exec_cycle_time )
      return S_OK()
    except Exception:
      self.log.exception()
      return S_ERROR( 'Exception' )

  #############################################################################
  def execute( self ):
    """ The main agent execution method of the Watchdog: polls the application
        thread and triggers the full set of checks once per checkingTime.

        :return: S_OK( "Ended" ) once the monitored process has completed
    """
    if not self.exeThread.isAlive():
      self.__getUsageSummary()
      self.log.info( 'Process to monitor has completed, Watchdog will exit.' )
      return S_OK( "Ended" )

    if self.littleTimeLeft:
      # if we have gone over enough iterations query again
      if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1:
        self.checkError = 'Job has reached the CPU limit of the queue'
        self.log.error( self.checkError, self.timeLeft )
        self.__killRunningThread()
        return S_OK()
      else:
        self.littleTimeLeftCount -= 1


    # Note: need to poll regularly to see if the thread is alive
    #      but only perform checks with a certain frequency
    if ( time.time() - self.initialValues['StartTime'] ) > self.checkingTime * self.checkCount:
      self.checkCount += 1
      result = self._performChecks()
      if not result['OK']:
        self.log.warn( 'Problem during recent checks' )
        self.log.warn( result['Message'] )
      return S_OK()
    else:
      # self.log.debug('Application thread is alive: checking count is %s' %(self.checkCount))
      return S_OK()


  #############################################################################
  def _performChecks( self ):
    """The Watchdog checks are performed at a different period to the checking of the
       application thread and correspond to the checkingTime.  Samples all
       metrics, runs the enabled limit checks, peeks at application output
       and sends a heartbeat with the collected values.
    """
    self.log.verbose( '------------------------------------' )
    self.log.verbose( 'Checking loop starts for Watchdog' )
    heartBeatDict = {}

    msg = ''

    loadAvg = self.getLoadAverage()
    if not loadAvg['OK']:
      msg += 'LoadAvg: ERROR'
    else:
      loadAvg = loadAvg['Value']
      msg += 'LoadAvg: %d ' % loadAvg
      heartBeatDict['LoadAverage'] = loadAvg
      if not self.parameters.has_key( 'LoadAverage' ):
        self.parameters['LoadAverage'] = []
      self.parameters['LoadAverage'].append( loadAvg )

    memoryUsed = self.getMemoryUsed()
    if not memoryUsed['OK']:
      msg += 'MemUsed: ERROR '
    else:
      memoryUsed = memoryUsed['Value']
      msg += 'MemUsed: %.1f kb ' % ( memoryUsed )
      heartBeatDict['MemoryUsed'] = memoryUsed
      if not self.parameters.has_key( 'MemoryUsed' ):
        self.parameters['MemoryUsed'] = []
      self.parameters['MemoryUsed'].append( memoryUsed )

    result = self.processMonitor.getMemoryConsumed( self.wrapperPID )
    if result['OK']:
      # convert bytes reported by the process monitor to KB
      vsize = result['Value']['Vsize']/1024.
      rss = result['Value']['RSS']/1024.
      heartBeatDict['Vsize'] = vsize
      heartBeatDict['RSS'] = rss
      self.parameters.setdefault( 'Vsize', [] )
      self.parameters['Vsize'].append( vsize )
      self.parameters.setdefault( 'RSS', [] )
      self.parameters['RSS'].append( rss )
      msg += "Job Vsize: %.1f kb " % vsize
      msg += "Job RSS: %.1f kb " % rss
    result = self.getDiskSpace()
    if not result['OK']:
      self.log.warn( "Could not establish DiskSpace", result['Message'] )
    else:
      msg += 'DiskSpace: %.1f MB ' % ( result['Value'] )
    if not self.parameters.has_key( 'DiskSpace' ):
      self.parameters['DiskSpace'] = []
    if result['OK']:
      self.parameters['DiskSpace'].append( result['Value'] )
      heartBeatDict['AvailableDiskSpace'] = result['Value']

    cpu = self.__getCPU()
    if not cpu['OK']:
      msg += 'CPU: ERROR '
      hmsCPU = 0
    else:
      cpu = cpu['Value']
      msg += 'CPU: %s (h:m:s) ' % ( cpu )
      if not self.parameters.has_key( 'CPUConsumed' ):
        self.parameters['CPUConsumed'] = []
      self.parameters['CPUConsumed'].append( cpu )
      hmsCPU = cpu
      rawCPU = self.__convertCPUTime( hmsCPU )
      if rawCPU['OK']:
        heartBeatDict['CPUConsumed'] = rawCPU['Value']

    result = self.__getWallClockTime()
    if not result['OK']:
      self.log.warn( "Failed determining wall clock time", result['Message'] )
    else:
      msg += 'WallClock: %.2f s ' % ( result['Value'] )
      self.parameters.setdefault( 'WallClockTime', list() ).append( result['Value'] )
      heartBeatDict['WallClockTime'] = result['Value']
    self.log.info( msg )

    result = self._checkProgress()
    if not result['OK']:
      self.checkError = result['Message']
      self.log.warn( self.checkError )

      # show the tail of the application output before killing it
      if self.jobPeekFlag:
        result = self.__peek()
        if result['OK']:
          outputList = result['Value']
          size = len( outputList )
          self.log.info( 'Last %s lines of available application output:' % ( size ) )
          self.log.info( '================START================' )
          for line in outputList:
            self.log.info( line )

          self.log.info( '=================END=================' )

      self.__killRunningThread()
      return S_OK()

    recentStdOut = 'None'
    if self.jobPeekFlag:
      result = self.__peek()
      if result['OK']:
        outputList = result['Value']
        size = len( outputList )
        recentStdOut = 'Last %s lines of application output from Watchdog on %s [UTC]:' % ( size, Time.dateTime() )
        border = '=' * len( recentStdOut )
        cpuTotal = 'Last reported CPU consumed for job is %s (h:m:s)' % ( hmsCPU )
        if self.timeLeft:
          cpuTotal += ', Batch Queue Time Left %s (s @ HS06)' % self.timeLeft
        recentStdOut = '\n%s\n%s\n%s\n%s\n' % ( border, recentStdOut, cpuTotal, border )
        self.log.info( recentStdOut )
        for line in outputList:
          self.log.info( line )
          recentStdOut += line + '\n'
      else:
        recentStdOut = 'Watchdog is initializing and will attempt to obtain standard output from application thread'
        self.log.info( recentStdOut )
        self.peekFailCount += 1
        if self.peekFailCount > self.peekRetry:
          self.jobPeekFlag = 0
          self.log.warn( 'Turning off job peeking for remainder of execution' )

    if not os.environ.has_key( 'JOBID' ):
      self.log.info( 'Running without JOBID so parameters will not be reported' )
      return S_OK()

    jobID = os.environ['JOBID']
    staticParamDict = {'StandardOutput':recentStdOut}
    self.__sendSignOfLife( int( jobID ), heartBeatDict, staticParamDict )
    return S_OK( 'Watchdog checking cycle complete' )

  #############################################################################
  def __getCPU( self ):
    """Queries the process monitor for the CPU time consumed by the wrapper
       process tree and returns it formatted as HH:MM:SS.
    """
    try:
      cpuTime = self.processMonitor.getCPUConsumed( self.wrapperPID )
      if not cpuTime['OK']:
        self.log.warn( 'Problem while checking consumed CPU' )
        return cpuTime
      cpuTime = cpuTime['Value']
      if cpuTime:
        self.log.verbose( "Raw CPU time consumed (s) = %s" % ( cpuTime ) )
        return self.__getCPUHMS( cpuTime )
      else:
        self.log.error( "CPU time consumed found to be 0" )
        return S_ERROR()

    except Exception as e:
      self.log.warn( 'Could not determine CPU time consumed with exception' )
      self.log.exception( e )
      return S_ERROR( "Could not determine CPU time consumed with exception" )



  #############################################################################
  def __getCPUHMS( self, cpuTime ):
    """Converts a CPU time in seconds to an 'HH:MM:SS' string wrapped in S_OK."""
    mins, secs = divmod( cpuTime, 60 )
    hours, mins = divmod( mins, 60 )
    humanTime = '%02d:%02d:%02d' % ( hours, mins, secs )
    self.log.verbose( 'Human readable CPU time is: %s' % humanTime )
    return S_OK( humanTime )

  #############################################################################
  def __interpretControlSignal( self, signalDict ):
    """This method is called whenever a signal is sent via the result of
       sending a sign of life.  Currently only the 'Kill' signal is understood.
    """
    self.log.info( 'Received control signal' )
    if isinstance( signalDict, dict ):
      if signalDict.has_key( 'Kill' ):
        self.log.info( 'Received Kill signal, stopping job via control signal' )
        self.checkError = 'Received Kill signal'
        self.__killRunningThread()
      else:
        self.log.info( 'The following control signal was sent but not understood by the watchdog:' )
        self.log.info( signalDict )
    else:
      self.log.info( 'Expected dictionary for control signal, received:\n%s' % ( signalDict ) )

    return S_OK()

  #############################################################################
  def _checkProgress( self ):
    """This method calls specific tests to determine whether the job execution
       is proceeding normally.  CS flags can easily be added to add or remove
       tests via central configuration.

       :return: S_ERROR when a check decides the job must be killed
    """
    report = ''

    if self.testWallClock:
      result = self.__checkWallClockTime()
      if not result['OK']:
        self.log.warn( result['Message'] )
        return result
      report += 'WallClock: OK, '
    else:
      report += 'WallClock: NA,'

    if self.testDiskSpace:
      result = self.__checkDiskSpace()
      if not result['OK']:
        self.log.warn( result['Message'] )
        return result
      report += 'DiskSpace: OK, '
    else:
      report += 'DiskSpace: NA,'

    if self.testLoadAvg:
      result = self.__checkLoadAverage()
      if not result['OK']:
        # NOTE: a load-average probe failure deliberately ends the checking
        # cycle early (S_OK), skipping the remaining checks for this round
        self.log.warn( "Check of load average failed, but won't fail because of that: %s" % result['Message'] )
        report += 'LoadAverage: ERROR, '
        return S_OK()
      report += 'LoadAverage: OK, '
    else:
      report += 'LoadAverage: NA,'

    if self.testCPUConsumed:
      result = self.__checkCPUConsumed()
      if not result['OK']:
        return result
      report += 'CPUConsumed: OK, '
    else:
      report += 'CPUConsumed: NA, '

    if self.testCPULimit:
      result = self.__checkCPULimit()
      if not result['OK']:
        self.log.warn( result['Message'] )
        return result
      report += 'CPULimit OK, '
    else:
      report += 'CPULimit: NA, '

    if self.testTimeLeft:
      # updates self.timeLeft / self.littleTimeLeft as a side effect
      self.__timeLeft()
      if self.timeLeft:
        report += 'TimeLeft: OK'
    else:
      report += 'TimeLeft: NA'

    if self.testMemoryLimit:
      result = self.__checkMemoryLimit()
      if not result['OK']:
        self.log.warn( result['Message'] )
        return result
      report += 'MemoryLimit OK, '
    else:
      report += 'MemoryLimit: NA, '

    self.log.info( report )
    return S_OK( 'All enabled checks passed' )

  #############################################################################
  def __checkCPUConsumed( self ):
    """ Checks whether the CPU consumed by application process is reasonable. This
        method will report stalled jobs to be killed.
    """
    self.log.info( "Checking CPU Consumed" )

    if 'WallClockTime' not in self.parameters:
      return S_ERROR( 'Missing WallClockTime info' )
    if 'CPUConsumed' not in self.parameters:
      return S_ERROR( 'Missing CPUConsumed info' )

    wallClockTime = self.parameters['WallClockTime'][-1]
    if wallClockTime < self.sampleCPUTime:
      self.log.info( "Stopping check, wallclock time (%s) is still smaller than sample time (%s)" % ( wallClockTime,
                                                                                            self.sampleCPUTime ) )
      return S_OK()

    # number of checking periods spanning one CPU sample window
    intervals = max( 1, int( self.sampleCPUTime / self.checkingTime ) )
    if len( self.parameters['CPUConsumed'] ) < intervals + 1:
      self.log.info( "Not enough snapshots to calculate, there are %s and we need %s" % ( len( self.parameters['CPUConsumed'] ),
                                                                                          intervals + 1 ) )
      return S_OK()

    wallClockTime = self.parameters['WallClockTime'][-1] - self.parameters['WallClockTime'][-1 - intervals ]
    try:
      cpuTime = self.__convertCPUTime( self.parameters['CPUConsumed'][-1] )['Value']
      cpuTime -= self.__convertCPUTime( self.parameters['CPUConsumed'][-1 - intervals ] )['Value']
      if cpuTime < 0:
        self.log.warn( 'Consumed CPU time negative, something wrong may have happened, ignore' )
        return S_OK()
      if wallClockTime <= 0:
        self.log.warn( 'Wallclock time should not be negative or zero, Ignore' )
        return S_OK()

      ratio = ( cpuTime / wallClockTime ) * 100.

      self.log.info( "CPU/Wallclock ratio is %.2f%%" % ratio )
      # in case of error cpuTime might be 0, exclude this
      if ratio < self.minCPUWallClockRatio:
        # payloads may drop this marker file to opt out of the stalled check
        if os.path.exists( 'DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK' ):
          self.log.info( 'N.B. job would be declared as stalled but CPU / WallClock check is disabled by payload' )
          return S_OK()
        self.log.info( "Job is stalled!" )
        return S_ERROR( 'Watchdog identified this job as stalled' )
    except Exception as e:
      self.log.error( "Cannot convert CPU consumed from string to int", str( e ) )

    return S_OK()

  #############################################################################

  def __convertCPUTime( self, cputime ):
    """ Method to convert the CPU time as returned from the Watchdog
        instances to the equivalent DIRAC normalized CPU time to be compared
        to the Job CPU requirement.

        :param cputime: 'HH:MM:SS' string as produced by __getCPUHMS
        :return: S_OK with the value in seconds, or S_ERROR if unparseable
    """
    cpuValue = 0
    cpuHMS = cputime.split( ':' )

    try:
      hours = float( cpuHMS[0] ) * 60 * 60
      mins = float( cpuHMS[1] ) * 60
      secs = float( cpuHMS[2] )
      cpuValue = float( hours + mins + secs )
    except Exception as x:
      self.log.warn( str( x ) )
      return S_ERROR( 'Could not calculate CPU time' )

    # Normalization to be implemented
    normalizedCPUValue = cpuValue

    result = S_OK()
    result['Value'] = normalizedCPUValue
    self.log.debug( 'CPU value %s converted to %s' % ( cputime, normalizedCPUValue ) )
    return result

  #############################################################################

  def __checkCPULimit( self ):
    """ Checks that the job has consumed more than the job CPU requirement
        (plus a configurable margin) and kills them as necessary.
    """
    consumedCPU = 0
    if self.parameters.has_key( 'CPUConsumed' ):
      consumedCPU = self.parameters['CPUConsumed'][-1]

    consumedCPUDict = self.__convertCPUTime( consumedCPU )
    if consumedCPUDict['OK']:
      currentCPU = consumedCPUDict['Value']
    else:
      return S_OK( 'Not possible to determine current CPU consumed' )

    if consumedCPU:
      # Fix: float() guards against Python 2 integer division, which would
      # silently reduce the margin to zero (e.g. 20 / 100 == 0)
      limit = self.jobCPUtime + self.jobCPUtime * ( float( self.jobCPUMargin ) / 100.0 )
      cpuConsumed = float( currentCPU )
      if cpuConsumed > limit:
        self.log.info( 'Job has consumed more than the specified CPU limit with an additional %s%% margin' % ( self.jobCPUMargin ) )
        return S_ERROR( 'Job has exceeded maximum CPU time limit' )
      else:
        return S_OK( 'Job within CPU limit' )
    elif not currentCPU:
      self.log.verbose( 'Both initial and current CPU consumed are null' )
      return S_OK( 'CPU consumed is not measurable yet' )
    else:
      return S_OK( 'Not possible to determine CPU consumed' )

  def __checkMemoryLimit( self ):
    """ Checks that the job memory consumption is within a limit.
        Only warns for the moment, never fails the job.
    """
    # Fix: default to 0 so the method does not raise NameError when no
    # Vsize sample has been collected yet
    vsize = 0
    if self.parameters.has_key( 'Vsize' ):
      vsize = self.parameters['Vsize'][-1]

    if vsize and self.memoryLimit:
      if vsize > self.memoryLimit:
        # Just a warning for the moment (fix: '%f.2' was a broken format spec)
        self.log.warn( "Job has consumed %.2f KB of memory with the limit of %.2f KB" % ( vsize, self.memoryLimit ) )

    return S_OK()

  #############################################################################
  def __checkDiskSpace( self ):
    """Checks whether the CS defined minimum disk space is available.
    """
    if self.parameters.has_key( 'DiskSpace' ):
      availSpace = self.parameters['DiskSpace'][-1]
      if availSpace >= 0 and availSpace < self.minDiskSpace:
        self.log.info( 'Not enough local disk space for job to continue, defined in CS as %s MB' % ( self.minDiskSpace ) )
        return S_ERROR( 'Job has insufficient disk space to continue' )
      else:
        return S_OK( 'Job has enough disk space available' )
    else:
      return S_ERROR( 'Available disk space could not be established' )

  #############################################################################
  def __checkWallClockTime( self ):
    """Checks whether the job has been running for the CS defined maximum
       wall clock time.
    """
    if self.initialValues.has_key( 'StartTime' ):
      startTime = self.initialValues['StartTime']
      if time.time() - startTime > self.maxWallClockTime:
        self.log.info( 'Job has exceeded maximum wall clock time of %s seconds' % ( self.maxWallClockTime ) )
        return S_ERROR( 'Job has exceeded maximum wall clock time' )
      else:
        return S_OK( 'Job within maximum wall clock time' )
    else:
      return S_ERROR( 'Job start time could not be established' )

  #############################################################################
  def __checkLoadAverage( self ):
    """Checks whether the CS defined maximum load average is exceeded.
    """
    if self.parameters.has_key( 'LoadAverage' ):
      loadAvg = self.parameters['LoadAverage'][-1]
      if loadAvg > float( self.loadAvgLimit ):
        self.log.info( 'Maximum load average exceeded, defined in CS as %s ' % ( self.loadAvgLimit ) )
        return S_ERROR( 'Job exceeded maximum load average' )
      else:
        return S_OK( 'Job running with normal load average' )
    else:
      return S_ERROR( 'Job load average not established' )

  #############################################################################
  def __peek( self ):
    """ Uses ExecutionThread.getOutput() method to obtain standard output
        from running thread via subprocess callback function.
    """
    result = self.exeThread.getOutput()
    if not result['OK']:
      self.log.warn( 'Could not obtain output from running application thread' )
      self.log.warn( result['Message'] )

    return result

  #############################################################################
  def calibrate( self ):
    """ The calibrate method obtains the initial values for system memory and load
        and calculates the margin for error for the rest of the Watchdog cycle.
    """
    # first call stores StartTime in self.initialValues
    self.__getWallClockTime()
    self.parameters['WallClockTime'] = []

    cpuConsumed = self.__getCPU()
    if not cpuConsumed['OK']:
      self.log.warn( "Could not establish CPU consumed, setting to 0.0" )
      cpuConsumed = 0.0
    else:
      cpuConsumed = cpuConsumed['Value']

    self.initialValues['CPUConsumed'] = cpuConsumed
    self.parameters['CPUConsumed'] = []

    loadAvg = self.getLoadAverage()
    if not loadAvg['OK']:
      self.log.warn( "Could not establish LoadAverage, setting to 0" )
      loadAvg = 0
    else:
      loadAvg = loadAvg['Value']

    self.initialValues['LoadAverage'] = loadAvg
    self.parameters['LoadAverage'] = []

    memUsed = self.getMemoryUsed()
    if not memUsed['OK']:
      self.log.warn( "Could not establish MemoryUsed, setting to 0" )
      memUsed = 0
    else:
      memUsed = memUsed['Value']

    self.initialValues['MemoryUsed'] = memUsed
    self.parameters['MemoryUsed'] = []

    result = self.processMonitor.getMemoryConsumed( self.wrapperPID )
    if not result['OK']:
      # Fix: previously result['Value'] was read unconditionally, raising
      # KeyError on an S_ERROR result; fall back to zero baselines instead
      self.log.warn( 'Could not get job memory usage' )
      self.initialValues['Vsize'] = 0.0
      self.initialValues['RSS'] = 0.0
    else:
      self.log.verbose( 'Job Memory: %s' % ( result['Value'] ) )
      self.initialValues['Vsize'] = result['Value']['Vsize']/1024.
      self.initialValues['RSS'] = result['Value']['RSS']/1024.
    self.parameters['Vsize'] = []
    self.parameters['RSS'] = []

    result = self.getDiskSpace()
    self.log.verbose( 'DiskSpace: %s' % ( result ) )
    if not result['OK']:
      # Fix: same KeyError guard as above for the disk space probe
      self.log.warn( "Could not establish DiskSpace" )
      self.initialValues['DiskSpace'] = 0
    else:
      self.initialValues['DiskSpace'] = result['Value']
    self.parameters['DiskSpace'] = []

    result = self.getNodeInformation()
    self.log.verbose( 'NodeInfo: %s' % ( result ) )
    if not result['OK']:
      self.log.warn( "Could not establish static system information" )

    # pick up the local batch system job identifier if one is exported
    if os.environ.has_key( 'LSB_JOBID' ):
      result['LocalJobID'] = os.environ['LSB_JOBID']
    if os.environ.has_key( 'PBS_JOBID' ):
      result['LocalJobID'] = os.environ['PBS_JOBID']
    if os.environ.has_key( 'QSUB_REQNAME' ):
      result['LocalJobID'] = os.environ['QSUB_REQNAME']
    if os.environ.has_key( 'JOB_ID' ):
      result['LocalJobID'] = os.environ['JOB_ID']

    self.__reportParameters( result, 'NodeInformation', True )
    self.__reportParameters( self.initialValues, 'InitialValues' )
    return S_OK()

  def __timeLeft( self ):
    """
      return Normalized CPU time left in the batch system
      0 if not available
      update self.timeLeft and self.littleTimeLeft
    """
    # Get CPU time left in the batch system
    result = self.timeLeftUtil.getTimeLeft( 0.0 )
    if not result['OK']:
      # Could not get CPU time left, we might need to wait for the first loop
      # or the Utility is not working properly for this batch system
      # or we are in a batch system
      timeLeft = 0
    else:
      timeLeft = result['Value']

    self.timeLeft = timeLeft
    if not self.littleTimeLeft:
      if timeLeft and timeLeft < self.grossTimeLeftLimit:
        self.log.info( 'TimeLeft bellow %s, now checking with higher frequency' % timeLeft )
        self.littleTimeLeft = True
        # TODO: better configurable way of doing this to be coded
        self.littleTimeLeftCount = 15
    else:
      # -1 signals the caller (execute) that the queue limit is reached
      if self.timeLeft and self.timeLeft < self.fineTimeLeftLimit:
        timeLeft = -1

    return timeLeft

  #############################################################################
  def __getUsageSummary( self ):
    """ Returns average load, memory etc. over execution of job thread
    """
    summary = {}
    # CPUConsumed
    if self.parameters.has_key( 'CPUConsumed' ):
      cpuList = self.parameters['CPUConsumed']
      if cpuList:
        hmsCPU = cpuList[-1]
        rawCPU = self.__convertCPUTime( hmsCPU )
        if rawCPU['OK']:
          summary['LastUpdateCPU(s)'] = rawCPU['Value']
      else:
        summary['LastUpdateCPU(s)'] = 'Could not be estimated'
    # DiskSpace
    if self.parameters.has_key( 'DiskSpace' ):
      space = self.parameters['DiskSpace']
      if space:
        # abs() already guarantees a non-negative value
        value = abs( float( space[-1] ) - float( self.initialValues['DiskSpace'] ) )
        summary['DiskSpace(MB)'] = value
      else:
        summary['DiskSpace(MB)'] = 'Could not be estimated'
    # MemoryUsed
    if self.parameters.has_key( 'MemoryUsed' ):
      memory = self.parameters['MemoryUsed']
      if memory:
        summary['MemoryUsed(kb)'] = abs( float( memory[-1] ) - float( self.initialValues['MemoryUsed'] ) )
      else:
        summary['MemoryUsed(kb)'] = 'Could not be estimated'
    # LoadAverage
    if self.parameters.has_key( 'LoadAverage' ):
      laList = self.parameters['LoadAverage']
      if laList:
        summary['LoadAverage'] = float( sum( laList ) ) / float( len( laList ) )
      else:
        summary['LoadAverage'] = 'Could not be estimated'

    result = self.__getWallClockTime()
    if not result['OK']:
      self.log.warn( "Failed determining wall clock time", result['Message'] )
      summary['WallClockTime(s)'] = 0
    else:
      wallClock = result['Value']
      summary['WallClockTime(s)'] = wallClock

    self.__reportParameters( summary, 'UsageSummary', True )
    self.currentStats = summary


  #############################################################################
  def __reportParameters( self, params, title = None, report = False ):
    """Will report parameters for job.

       :param params: dictionary of parameters (or an S_OK-style dict)
       :param title: optional heading for the log output
       :param report: when True, also sends the parameters to the WMS
    """
    try:
      parameters = []
      self.log.info( '==========================================================' )
      if title:
        self.log.info( 'Watchdog will report %s' % ( title ) )
      else:
        self.log.info( 'Watchdog will report parameters' )
      self.log.info( '==========================================================' )
      vals = params
      # unwrap an S_OK-style result dict if one was passed in
      if params.has_key( 'Value' ):
        if vals['Value']:
          vals = params['Value']
      for k, v in vals.items():
        if v:
          self.log.info( str( k ) + ' = ' + str( v ) )
          parameters.append( ( k, v ) )
      if report:
        self.__setJobParamList( parameters )

      self.log.info( '==========================================================' )
    except Exception as x:
      self.log.warn( 'Problem while reporting parameters' )
      self.log.warn( str( x ) )

  #############################################################################
  def __getWallClockTime( self ):
    """ Establishes the Wall Clock time spent since the Watchdog initialization.
        The first call records the start time and returns 0.0."""
    result = S_OK()
    if self.initialValues.has_key( 'StartTime' ):
      currentTime = time.time()
      wallClock = currentTime - self.initialValues['StartTime']
      result['Value'] = wallClock
    else:
      self.initialValues['StartTime'] = time.time()
      result['Value'] = 0.0

    return result

  #############################################################################
  def __killRunningThread( self ):
    """ Will kill the running thread process and any child processes."""
    self.log.info( 'Sending kill signal to application PID %s' % ( self.spObject.getChildPID() ) )
    result = self.spObject.killChild()
    self.applicationKilled = True
    self.log.info( 'Subprocess.killChild() returned:%s ' % ( result ) )
    return S_OK( 'Thread killed' )

  #############################################################################
  def __sendSignOfLife( self, jobID, heartBeatDict, staticParamDict ):
    """ Sends sign of life 'heartbeat' signal and triggers control signal
        interpretation.
    """
    jobReport = RPCClient( 'WorkloadManagement/JobStateUpdate', timeout = 120 )
    result = jobReport.sendHeartBeat( jobID, heartBeatDict, staticParamDict )
    if not result['OK']:
      self.log.warn( 'Problem sending sign of life' )
      self.log.warn( result )

    # a non-empty Value carries a control signal (e.g. Kill) from the WMS
    if result['OK'] and result['Value']:
      self.__interpretControlSignal( result['Value'] )

    return result

  #############################################################################
  def __setJobParamList( self, value ):
    """Wraps around setJobParameters of state update client
    """
    # job wrapper template sets the jobID variable
    if not os.environ.has_key( 'JOBID' ):
      self.log.info( 'Running without JOBID so parameters will not be reported' )
      return S_OK()
    jobID = os.environ['JOBID']
    jobReport = RPCClient( 'WorkloadManagement/JobStateUpdate', timeout = 120 )
    jobParam = jobReport.setJobParameters( int( jobID ), value )
    self.log.verbose( 'setJobParameters(%s,%s)' % ( jobID, value ) )
    if not jobParam['OK']:
      self.log.warn( jobParam['Message'] )

    return jobParam

  #############################################################################
  def getNodeInformation( self ):
    """ Attempts to retrieve all static system information, should be overridden in a subclass"""
    methodName = 'getNodeInformation'
    self.log.warn( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' )
    return S_ERROR( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' )

  #############################################################################
  def getLoadAverage( self ):
    """ Attempts to get the load average, should be overridden in a subclass"""
    methodName = 'getLoadAverage'
    self.log.warn( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' )
    return S_ERROR( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' )

  #############################################################################
  def getMemoryUsed( self ):
    """ Attempts to get the memory used, should be overridden in a subclass"""
    methodName = 'getMemoryUsed'
    self.log.warn( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' )
    return S_ERROR( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' )

  #############################################################################
  def getDiskSpace( self ):
    """ Attempts to get the available disk space, should be overridden in a subclass"""
    methodName = 'getDiskSpace'
    self.log.warn( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' )
    return S_ERROR( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' )
Ejemplo n.º 16
0
class JobAgent( AgentModule ):
  """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration.

      It asks the Matcher for a job, sets up the owner's proxy, installs any
      requested software and submits the job wrapper to the local Computing Element.
  """

  #############################################################################
  def initialize( self, loops = 0 ):
    """Sets default parameters and creates CE instance

    :param int loops: maximum number of agent cycles (propagated to the MaxCycles option)
    """
    # Disable monitoring
    self.am_setOption( 'MonitoringEnabled', False )
    # self.log.setLevel('debug') #temporary for debugging
    self.am_setOption( 'MaxCycles', loops )

    # A CE defined in the local configuration overrides the agent's CEType option
    ceType = self.am_getOption( 'CEType', 'InProcess' )
    localCE = gConfig.getValue( '/LocalSite/LocalCE', '' )
    if localCE:
      self.log.info( 'Defining CE from local configuration = %s' % localCE )
      ceType = localCE

    ceFactory = ComputingElementFactory()
    self.ceName = ceType
    ceInstance = ceFactory.getCE( ceType )
    if not ceInstance['OK']:
      self.log.warn( ceInstance['Message'] )
      return ceInstance

    # Reference point for the per-cycle CPU accounting done in execute()
    self.initTimes = os.times()

    self.computingElement = ceInstance['Value']
    #Localsite options
    self.siteName = gConfig.getValue( '/LocalSite/Site', 'Unknown' )
    self.pilotReference = gConfig.getValue( '/LocalSite/PilotReference', 'Unknown' )
    self.defaultProxyLength = gConfig.getValue( '/Registry/DefaultProxyLifeTime', 86400 * 5 )
    #Agent options
    # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
    self.cpuFactor = gConfig.getValue( '/LocalSite/CPUNormalizationFactor', 0.0 )
    self.jobSubmissionDelay = self.am_getOption( 'SubmissionDelay', 10 )
    self.fillingMode = self.am_getOption( 'FillingModeFlag', False )
    self.stopOnApplicationFailure = self.am_getOption( 'StopOnApplicationFailure', True )
    self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', 10 )
    self.jobCount = 0
    self.matchFailedCount = 0
    self.extraOptions = gConfig.getValue( '/AgentJobRequirements/ExtraOptions', '' )
    #Timeleft
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = gConfig.getValue( '/Resources/Computing/CEDefaults/MaxCPUTime', 0.0 )
    self.timeLeftError = ''
    self.scaledCPUTime = 0.0
    self.pilotInfoReportedFlag = False
    return S_OK()

  #############################################################################
  def execute( self ):
    """The JobAgent execution method.

       One agent cycle: refresh the remaining CPU time (filling mode only),
       check CE availability, request a job from the Matcher, set up the
       owner's proxy, install software if required and submit the job
       wrapper to the local Computing Element.
    """
    if self.jobCount:
      #Only call timeLeft utility after a job has been picked up
      self.log.info( 'Attempting to check CPU time left for filling mode' )
      if self.fillingMode:
        if self.timeLeftError:
          self.log.warn( self.timeLeftError )
          return self.__finish( self.timeLeftError )
        self.log.info( '%s normalized CPU units remaining in slot' % ( self.timeLeft ) )
        # Need to update the Configuration so that the new value is published in the next matching request
        result = self.computingElement.setCPUTimeLeft( cpuTimeLeft = self.timeLeft )
        if not result['OK']:
          return self.__finish( result['Message'] )

        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
          localConfigFile = os.path.join( '.', self.extraOptions )
        else:
          localConfigFile = os.path.join( rootPath, "etc", "dirac.cfg" )
        localCfg.loadFromFile( localConfigFile )
        if not localCfg.isSection('/LocalSite'):
          localCfg.createNewSection('/LocalSite')
        localCfg.setOption( '/LocalSite/CPUTimeLeft', self.timeLeft )
        localCfg.writeToFile( localConfigFile )

      else:
        return self.__finish( 'Filling Mode is Disabled' )

    self.log.verbose( 'Job Agent execution loop' )
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
      self.log.info( 'Resource is not available' )
      self.log.info( available['Message'] )
      return self.__finish( 'CE Not Available' )

    self.log.info( available['Message'] )

    result = self.computingElement.getDescription()
    if not result['OK']:
      return result
    ceDict = result['Value']

    # Add pilot information
    gridCE = gConfig.getValue( 'LocalSite/GridCE', 'Unknown' )
    if gridCE != 'Unknown':
      ceDict['GridCE'] = gridCE
    if not 'PilotReference' in ceDict:
      ceDict['PilotReference'] = str( self.pilotReference )
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict( '/AgentJobRequirements' )
    if result['OK']:
      requirementsDict = result['Value']
      ceDict.update( requirementsDict )

    self.log.verbose( ceDict )
    start = time.time()
    jobRequest = self.__requestJob( ceDict )
    matchTime = time.time() - start
    self.log.info( 'MatcherTime = %.2f (s)' % ( matchTime ) )

    # Re-read the option each cycle so a configuration change takes effect without restart
    self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches )

    if not jobRequest['OK']:
      # 'No match found' and timeouts are expected conditions: count them and
      # stop only after too many consecutive failed matches
      if re.search( 'No match found', jobRequest['Message'] ):
        self.log.notice( 'Job request OK: %s' % ( jobRequest['Message'] ) )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "seconds timeout" ) != -1:
        self.log.error( jobRequest['Message'] )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "Pilot version does not match" ) != -1 :
        # Fatal mismatch: no point in further matching attempts
        self.log.error( jobRequest['Message'] )
        return S_ERROR( jobRequest['Message'] )
      else:
        self.log.notice( 'Failed to get jobs: %s' % ( jobRequest['Message'] ) )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    jobID = matcherInfo['JobID']
    if not self.pilotInfoReportedFlag:
      # Check the flag after the first access to the Matcher
      self.pilotInfoReportedFlag = matcherInfo.get( 'PilotInfoReportedFlag', False )
    # These keys are mandatory in the Matcher answer
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
      if not matcherInfo.has_key( param ):
        self.__report( jobID, 'Failed', 'Matcher did not return %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      elif not matcherInfo[param]:
        self.__report( jobID, 'Failed', 'Matcher returned null %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      else:
        self.log.verbose( 'Matcher returned %s = %s ' % ( param, matcherInfo[param] ) )

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    # Everything else returned by the Matcher is passed on to the optimizers
    optimizerParams = {}
    for key in matcherInfo.keys():
      if not key in matcherParams:
        value = matcherInfo[key]
        optimizerParams[key] = value

    parameters = self.__getJDLParameters( jobJDL )
    if not parameters['OK']:
      self.__report( jobID, 'Failed', 'Could Not Extract JDL Parameters' )
      self.log.warn( parameters['Message'] )
      return self.__finish( 'JDL Problem' )

    params = parameters['Value']
    if not params.has_key( 'JobID' ):
      msg = 'Job has not JobID defined in JDL parameters'
      self.__report( jobID, 'Failed', msg )
      self.log.warn( msg )
      return self.__finish( 'JDL Problem' )
    else:
      # From here on use the JobID taken from the JDL itself
      jobID = params['JobID']

    if not params.has_key( 'JobType' ):
      self.log.warn( 'Job has no JobType defined in JDL parameters' )
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if not params.has_key( 'CPUTime' ):
      self.log.warn( 'Job has no CPU requirement defined in JDL parameters' )

    if self.extraOptions:
      params['Arguments'] = params['Arguments'] + ' ' + self.extraOptions
      params['ExtraOptions'] = self.extraOptions

    self.log.verbose( 'Job request successful: \n %s' % ( jobRequest['Value'] ) )
    self.log.info( 'Received JobID=%s, JobType=%s' % ( jobID, jobType ) )
    self.log.info( 'OwnerDN: %s JobGroup: %s' % ( ownerDN, jobGroup ) )
    self.jobCount += 1
    try:
      jobReport = JobReport( jobID, 'JobAgent@%s' % self.siteName )
      jobReport.setJobParameter( 'MatcherServiceTime', str( matchTime ), sendFlag = False )

      if os.environ.has_key( 'BOINC_JOB_ID' ):
        # Report BOINC environment 
        for p in ['BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName']:
          jobReport.setJobParameter( p, gConfig.getValue( '/LocalSite/%s' % p, 'Unknown' ), sendFlag = False )

      jobReport.setJobStatus( 'Matched', 'Job Received by Agent' )
      result = self.__setupProxy( ownerDN, jobGroup )
      if not result[ 'OK' ]:
        return self.__rescheduleFailedJob( jobID, result[ 'Message' ], self.stopOnApplicationFailure )
      if 'Value' in result and result[ 'Value' ]:
        proxyChain = result[ 'Value' ]

      # Save the job jdl for external monitoring
      self.__saveJobJDLRequest( jobID, jobJDL )

      software = self.__checkInstallSoftware( jobID, params, ceDict )
      if not software['OK']:
        self.log.error( 'Failed to install software for job %s' % ( jobID ) )
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self.__rescheduleFailedJob( jobID, errorMsg, self.stopOnApplicationFailure )

      self.log.debug( 'Before %sCE submitJob()' % ( self.ceName ) )
      submission = self.__submitJob( jobID, params, ceDict, optimizerParams, proxyChain )
      if not submission['OK']:
        self.__report( jobID, 'Failed', submission['Message'] )
        return self.__finish( submission['Message'] )
      elif 'PayloadFailed' in submission:
        # Do not keep running and do not overwrite the Payload error
        return self.__finish( 'Payload execution failed with error code %s' % submission['PayloadFailed'],
                              self.stopOnApplicationFailure )

      self.log.debug( 'After %sCE submitJob()' % ( self.ceName ) )
    except Exception:
      self.log.exception()
      return self.__rescheduleFailedJob( jobID , 'Job processing failed with exception', self.stopOnApplicationFailure )

    # CPU accounting: subtract the process times recorded at initialisation
    currentTimes = list( os.times() )
    for i in range( len( currentTimes ) ):
      currentTimes[i] -= self.initTimes[i]

    utime, stime, cutime, cstime, _elapsed = currentTimes
    cpuTime = utime + stime + cutime + cstime

    result = self.timeLeftUtil.getTimeLeft( cpuTime )
    if result['OK']:
      self.timeLeft = result['Value']
    else:
      if result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
      else:
        if self.cpuFactor:
          # if the batch system is not defined used the CPUNormalizationFactor 
          # defined locally
          self.timeLeft = self.__getCPUTimeLeft()
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

    # Report only the CPU consumed during this cycle (difference with previous total)
    self.__setJobParam( jobID, 'ScaledCPUTime', str( scaledCPUTime - self.scaledCPUTime ) )
    self.scaledCPUTime = scaledCPUTime

    return S_OK( 'Job Agent cycle complete' )

  #############################################################################
  def __saveJobJDLRequest( self, jobID, jobJDL ):
    """Save job JDL local to JobAgent.

    :param jobID: job identifier, used as the file name stem (int or str)
    :param str jobJDL: JDL description of the job
    """
    classAdJob = ClassAd( jobJDL )
    classAdJob.insertAttributeString( 'LocalCE', self.ceName )
    # '%s' tolerates an integer jobID; the previous '+' concatenation raised
    # TypeError when the Matcher returned the JobID as an int
    jdlFileName = '%s.jdl' % jobID
    # with-statement guarantees the file is closed even if write() fails
    with open( jdlFileName, 'w' ) as jdlFile:
      jdlFile.write( classAdJob.asJDL() )

  #############################################################################
  def __getCPUTimeLeft( self ):
    """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config.
    """
    processTimes = os.times()
    # user + system + children-user time; children-system and elapsed are ignored
    cpuTime = sum( processTimes[:3] )
    self.log.info( 'Current raw CPU time consumed is %s' % cpuTime )
    return self.timeLeft - cpuTime * self.cpuFactor

  #############################################################################
  def __changeProxy( self, oldProxy, newProxy ):
    """Can call glexec utility here to set uid or simply log the changeover
       of a proxy.
    """
    # Placeholder: currently only records that a proxy change would happen here
    message = 'Log proxy change (to be instrumented)'
    self.log.verbose( message )
    return S_OK()

  #############################################################################
  def __setupProxy( self, ownerDN, ownerGroup ):
    """
    Retrieve a proxy for the execution of the job

    :param str ownerDN: DN of the job owner
    :param str ownerGroup: DIRAC group of the job owner
    :return: S_OK( proxyChain ) or S_ERROR with a message
    """
    if gConfig.getValue( '/DIRAC/Security/UseServerCertificate' , False ):
      proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup )
      if not proxyResult['OK']:
        self.log.error( 'Invalid Proxy', proxyResult['Message'] )
        return S_ERROR( 'Failed to setup proxy: %s' % proxyResult[ 'Message' ] )
      return S_OK( proxyResult['Value'] )
    else:
      ret = getProxyInfo( disableVOMS = True )
      if not ret['OK']:
        self.log.error( 'Invalid Proxy', ret['Message'] )
        return S_ERROR( 'Invalid Proxy' )

      proxyChain = ret['Value']['chain']
      if not 'groupProperties' in ret['Value']:
        # Send the diagnostics through the logger instead of bare prints so
        # they end up in the agent log rather than on raw stdout
        self.log.error( 'Invalid Proxy', 'Group has no properties defined' )
        self.log.debug( str( ret['Value'] ) )
        self.log.debug( str( proxyChain.dumpAllToString() ) )
        return S_ERROR( 'Proxy has no group properties defined' )

      groupProps = ret['Value']['groupProperties']
      # Pilot-type groups cannot run the payload directly: download the owner's proxy
      if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
        proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup )
        if not proxyResult['OK']:
          self.log.error( 'Invalid Proxy', proxyResult['Message'] )
          return S_ERROR( 'Failed to setup proxy: %s' % proxyResult[ 'Message' ] )
        proxyChain = proxyResult['Value']

    return S_OK( proxyChain )

  #############################################################################
  def __requestProxyFromProxyManager( self, ownerDN, ownerGroup ):
    """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.
    """

    self.log.info( "Requesting proxy for %s@%s" % ( ownerDN, ownerGroup ) )
    token = gConfig.getValue( "/Security/ProxyToken", "" )
    if not token:
      self.log.info( "No token defined. Trying to download proxy without token" )
      token = False
    retVal = gProxyManager.getPayloadProxyFromDIRACGroup( ownerDN, ownerGroup,
                                                          self.defaultProxyLength, token )
    if retVal[ 'OK' ]:
      return S_OK( retVal[ 'Value' ] )
    # Download failed: dump the locally available proxy info to aid debugging
    self.log.error( 'Could not retrieve proxy' )
    self.log.warn( retVal )
    os.system( 'dirac-proxy-info' )
    sys.stdout.flush()
    return S_ERROR( 'Error retrieving proxy' )

  #############################################################################
  def __checkInstallSoftware( self, jobID, jobParams, resourceParams ):
    """Checks software requirement of job and whether this is already present
       before installing software locally.
    """
    if 'SoftwareDistModule' not in jobParams:
      msg = 'Job has no software installation requirement'
      self.log.verbose( msg )
      return S_OK( msg )

    self.__report( jobID, 'Matched', 'Installing Software' )
    softwareDist = jobParams['SoftwareDistModule']
    self.log.verbose( 'Found VO Software Distribution module: %s' % ( softwareDist ) )
    # The VO-specific module performs the actual installation
    argumentsDict = {'Job': jobParams, 'CE': resourceParams}
    moduleInstance = ModuleFactory().getModule( softwareDist, argumentsDict )
    if not moduleInstance['OK']:
      return moduleInstance

    return moduleInstance['Value'].execute()

  #############################################################################
  def __submitJob( self, jobID, jobParams, resourceParams, optimizerParams, proxyChain ):
    """Submit job to the Computing Element instance after creating a custom
       Job Wrapper with the available job parameters.

    :param jobID: DIRAC job identifier
    :param dict jobParams: JDL parameters of the job
    :param dict resourceParams: description of the local resource / CE
    :param dict optimizerParams: extra parameters returned by the Matcher
    :param proxyChain: proxy chain object used to run the payload
    :return: S_OK (optionally carrying a 'PayloadFailed' entry) or S_ERROR
    """
    logLevel = self.am_getOption( 'DefaultLogLevel', 'INFO' )
    defaultWrapperLocation = self.am_getOption( 'JobWrapperTemplate',
                                                'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py' )
    result = createJobWrapper( jobID, jobParams, resourceParams, optimizerParams,
                               extraOptions = self.extraOptions, defaultWrapperLocation = defaultWrapperLocation,
                               log = self.log, logLevel = logLevel )
    if not result['OK']:
      return result

    wrapperFile = result['Value']
    self.__report( jobID, 'Matched', 'Submitted To CE' )

    self.log.info( 'Submitting %s to %sCE' % ( os.path.basename( wrapperFile ), self.ceName ) )

    #Pass proxy to the CE
    proxy = proxyChain.dumpAllToString()
    if not proxy['OK']:
      self.log.error( proxy )
      return S_ERROR( 'Payload Proxy Not Found' )

    payloadProxy = proxy['Value']
    # FIXME: how can we set the batchID before we submit, this makes no sense
    batchID = 'dc%s' % ( jobID )
    submission = self.computingElement.submitJob( wrapperFile, payloadProxy )

    ret = S_OK( 'Job submitted' )

    if submission['OK']:
      batchID = submission['Value']
      self.log.info( 'Job %s submitted as %s' % ( jobID, batchID ) )
      self.log.verbose( 'Set JobParameter: Local batch ID %s' % ( batchID ) )
      self.__setJobParam( jobID, 'LocalBatchID', str( batchID ) )
      if 'PayloadFailed' in submission:
        # Propagate the payload failure so the caller can decide to stop the agent
        ret['PayloadFailed'] = submission['PayloadFailed']
        return ret
      # Throttle consecutive submissions
      time.sleep( self.jobSubmissionDelay )
    else:
      self.log.error( 'Job submission failed', jobID )
      self.__setJobParam( jobID, 'ErrorMessage', '%s CE Submission Error' % ( self.ceName ) )
      if 'ReschedulePayload' in submission:
        # The CE asked for the payload to be rescheduled rather than failed
        rescheduleFailedJob( jobID, submission['Message'], self.__report )
      else:
        if 'Value' in submission:
          self.log.error( 'Error in DIRAC JobWrapper:', 'exit code = %s' % ( str( submission['Value'] ) ) )
        # make sure the Job is declared Failed
        self.__report( jobID, 'Failed', submission['Message'] )
      return S_ERROR( '%s CE Submission Error: %s' % ( self.ceName, submission['Message'] ) )

    return ret

  #############################################################################
  def __requestJob( self, ceDict ):
    """Request a single job from the matcher service.
    """
    try:
      matcher = RPCClient( 'WorkloadManagement/Matcher', timeout = 600 )
      result = matcher.requestJob( ceDict )
      return result
    except Exception, x:
      self.log.exception( lException = x )
      return S_ERROR( 'Job request to matcher service failed with exception' )
# --- Ejemplo n.º 17 (snippet separator left by the source aggregator; original score: 0) ---
class JobAgent( AgentModule ):
  """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  #############################################################################
  def initialize( self, loops = 0 ):
    """Sets default parameters and creates CE instance

    :param int loops: maximum number of agent cycles (propagated to the MaxCycles option)
    """
    #Disable monitoring
    self.am_setOption( 'MonitoringEnabled', False )
   # self.log.setLevel('debug') #temporary for debugging
    self.am_setOption( 'MaxCycles', loops )

    # A CE defined in the local configuration overrides the agent's CEUniqueID option
    ceUniqueID = self.am_getOption( 'CEUniqueID', 'InProcess' )
    localCE = gConfig.getValue( '/LocalSite/LocalCE', '' )
    if localCE:
      self.log.info( 'Defining CE from local configuration = %s' % localCE )
      ceUniqueID = localCE

    ceFactory = ComputingElementFactory()
    self.ceName = ceUniqueID
    ceInstance = ceFactory.getCE( ceUniqueID )
    if not ceInstance['OK']:
      self.log.warn( ceInstance['Message'] )
      return ceInstance

    self.computingElement = ceInstance['Value']
    # DIRAC installation root, derived from this module's location (four levels up)
    self.diracRoot = os.path.dirname( os.path.dirname( os.path.dirname( os.path.dirname( __file__ ) ) ) )
    #Localsite options
    self.siteRoot = gConfig.getValue( '/LocalSite/Root', os.getcwd() )
    self.siteName = gConfig.getValue( '/LocalSite/Site', 'Unknown' )
    self.pilotReference = gConfig.getValue( '/LocalSite/PilotReference', 'Unknown' )
    self.defaultProxyLength = gConfig.getValue( '/Registry/DefaultProxyLifeTime', 86400 * 5 )
    #Agent options
    # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
    self.cpuFactor = gConfig.getValue( '/LocalSite/CPUNormalizationFactor', 0.0 )
    defaultWrapperLocation = 'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py'
    self.jobWrapperTemplate = os.path.join( self.diracRoot,
                                            self.am_getOption( 'JobWrapperTemplate',
                                                                defaultWrapperLocation ) )
    self.jobSubmissionDelay = self.am_getOption( 'SubmissionDelay', 10 )
    self.defaultLogLevel = self.am_getOption( 'DefaultLogLevel', 'info' )
    self.fillingMode = self.am_getOption( 'FillingModeFlag', False )
    self.jobCount = 0
    #Timeleft
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = gConfig.getValue( '/Resources/Computing/CEDefaults/MaxCPUTime', 0.0 )
    self.gridCEQueue = gConfig.getValue( '/Resources/Computing/CEDefaults/GridCEQueue', '' )
    self.timeLeftError = ''
    self.scaledCPUTime = 0.0
    return S_OK()

  #############################################################################
  def execute( self ):
    """The JobAgent execution method.

       One agent cycle: refresh the remaining CPU time (filling mode only),
       check CE availability, request a job from the Matcher, set up the
       owner's proxy, install software if required and submit the job
       wrapper to the local Computing Element.
    """
    if self.jobCount:
      #Only call timeLeft utility after a job has been picked up
      self.log.info( 'Attempting to check CPU time left for filling mode' )
      if self.fillingMode:
        if self.timeLeftError:
          self.log.warn( self.timeLeftError )
          return self.__finish( self.timeLeftError )
        self.log.info( '%s normalized CPU units remaining in slot' % ( self.timeLeft ) )
        # Need to update the Configuration so that the new value is published in the next matching request
        result = self.computingElement.setCPUTimeLeft( cpuTimeLeft = self.timeLeft )
        if not result['OK']:
          return self.__finish( result['Message'] )
        # NOTE(review): this JDL is fetched again unconditionally below, so these
        # two lines appear redundant — confirm before removing
        ceJDL = self.computingElement.getJDL()
        resourceJDL = ceJDL['Value']
      else:
        return self.__finish( 'Filling Mode is Disabled' )

    self.log.verbose( 'Job Agent execution loop' )
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
      self.log.info( 'Resource is not available' )
      self.log.info( available['Message'] )
      return self.__finish( 'CE Not Available' )

    self.log.info( available['Message'] )

    ceJDL = self.computingElement.getJDL()
    resourceJDL = ceJDL['Value']
    self.log.verbose( resourceJDL )
    start = time.time()
    jobRequest = self.__requestJob( resourceJDL )
    matchTime = time.time() - start
    self.log.info( 'MatcherTime = %.2f (s)' % ( matchTime ) )

    if not jobRequest['OK']:
      # 'No work available' and timeouts are normal conditions, not failures
      if re.search( 'No work available', jobRequest['Message'] ):
        self.log.info( 'Job request OK: %s' % ( jobRequest['Message'] ) )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "seconds timeout" ) != -1:
        self.log.error( jobRequest['Message'] )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "Pilot version does not match" ) != -1 :
        # Fatal mismatch: stop the agent with an error
        self.log.error( jobRequest['Message'] )
        return S_ERROR( jobRequest['Message'] )
      else:
        self.log.info( 'Failed to get jobs: %s' % ( jobRequest['Message'] ) )
        return S_OK( jobRequest['Message'] )

    matcherInfo = jobRequest['Value']
    jobID = matcherInfo['JobID']
    # These keys are mandatory in the Matcher answer
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
      if not matcherInfo.has_key( param ):
        self.__report( jobID, 'Failed', 'Matcher did not return %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      elif not matcherInfo[param]:
        self.__report( jobID, 'Failed', 'Matcher returned null %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      else:
        self.log.verbose( 'Matcher returned %s = %s ' % ( param, matcherInfo[param] ) )

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    # Everything else returned by the Matcher is passed on to the optimizers
    optimizerParams = {}
    for key in matcherInfo.keys():
      if not key in matcherParams:
        value = matcherInfo[key]
        optimizerParams[key] = value

    parameters = self.__getJDLParameters( jobJDL )
    if not parameters['OK']:
      self.__report( jobID, 'Failed', 'Could Not Extract JDL Parameters' )
      self.log.warn( parameters['Message'] )
      return self.__finish( 'JDL Problem' )

    params = parameters['Value']
    if not params.has_key( 'JobID' ):
      msg = 'Job has not JobID defined in JDL parameters'
      self.__report( jobID, 'Failed', msg )
      self.log.warn( msg )
      return self.__finish( 'JDL Problem' )
    else:
      # From here on use the JobID taken from the JDL itself
      jobID = params['JobID']

    if not params.has_key( 'JobType' ):
      self.log.warn( 'Job has no JobType defined in JDL parameters' )
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if not params.has_key( 'SystemConfig' ):
      # Fall back to the locally configured architecture
      self.log.warn( 'Job has no system configuration defined in JDL parameters' )
      systemConfig = gConfig.getValue( '/LocalSite/Architecture', '' )
      self.log.info( 'Setting system config to /LocalSite/Architecture = %s since it was not specified' % systemConfig )
      if not systemConfig:
        self.log.warn( '/LocalSite/Architecture is not defined' )
      params['SystemConfig'] = systemConfig
    else:
      systemConfig = params['SystemConfig']
      if systemConfig.lower() == 'any':
        systemConfig = gConfig.getValue( '/LocalSite/Architecture', '' )
        self.log.info( 'Setting SystemConfig = /LocalSite/Architecture =',
                       '"%s" since it was set to "ANY" in the job description' % systemConfig )
        if not systemConfig:
          self.log.warn( '/LocalSite/Architecture is not defined' )
        params['SystemConfig'] = systemConfig

    if not params.has_key( 'MaxCPUTime' ):
      self.log.warn( 'Job has no CPU requirement defined in JDL parameters' )

    self.log.verbose( 'Job request successful: \n %s' % ( jobRequest['Value'] ) )
    self.log.info( 'Received JobID=%s, JobType=%s, SystemConfig=%s' % ( jobID, jobType, systemConfig ) )
    self.log.info( 'OwnerDN: %s JobGroup: %s' % ( ownerDN, jobGroup ) )
    self.jobCount += 1
    try:
      self.__setJobParam( jobID, 'MatcherServiceTime', str( matchTime ) )
      if self.gridCEQueue:
        self.__setJobParam( jobID, 'GridCEQueue', self.gridCEQueue )
      self.__report( jobID, 'Matched', 'Job Received by Agent' )
      self.__setJobSite( jobID, self.siteName )
      self.__reportPilotInfo( jobID )
      result = self.__setupProxy( ownerDN, jobGroup )
      if not result[ 'OK' ]:
        return self.__rescheduleFailedJob( jobID, result[ 'Message' ] )
      if 'Value' in result and result[ 'Value' ]:
        proxyChain = result[ 'Value' ]

      # Is this necessary at all?
      # NOTE(review): the return value is never checked
      saveJDL = self.__saveJobJDLRequest( jobID, jobJDL )
      #self.__report(jobID,'Matched','Job Prepared to Submit')

      resourceParameters = self.__getJDLParameters( resourceJDL )
      if not resourceParameters['OK']:
        return resourceParameters
      resourceParams = resourceParameters['Value']

      software = self.__checkInstallSoftware( jobID, params, resourceParams )
      if not software['OK']:
        self.log.error( 'Failed to install software for job %s' % ( jobID ) )
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self.__rescheduleFailedJob( jobID, errorMsg )

      self.log.verbose( 'Before %sCE submitJob()' % ( self.ceName ) )
      submission = self.__submitJob( jobID, params, resourceParams, optimizerParams, jobJDL, proxyChain )
      if not submission['OK']:
        self.__report( jobID, 'Failed', submission['Message'] )
        return self.__finish( submission['Message'] )
      elif 'PayloadFailed' in submission:
        # Do not keep running and do not overwrite the Payload error
        return self.__finish( 'Payload execution failed with error code %s' % submission['PayloadFailed'] )

      self.log.verbose( 'After %sCE submitJob()' % ( self.ceName ) )
    except Exception:
      self.log.exception()
      return self.__rescheduleFailedJob( jobID , 'Job processing failed with exception' )

    result = self.timeLeftUtil.getTimeLeft( 0.0 )
    if result['OK']:
      self.timeLeft = result['Value']
    else:
      if result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
      else:
        if self.cpuFactor:
          # if the batch system is not defined used the CPUNormalizationFactor 
          # defined locally
          self.timeLeft = self.__getCPUTimeLeft()
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

    # Report only the CPU consumed since the previous cycle
    self.__setJobParam( jobID, 'ScaledCPUTime', str( scaledCPUTime - self.scaledCPUTime ) )
    self.scaledCPUTime = scaledCPUTime

    return S_OK( 'Job Agent cycle complete' )

  #############################################################################
  def __getCPUTimeLeft( self ):
    """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config.
    """
    # cstime and elapsed are deliberately excluded from the consumed-CPU
    # estimate; underscore them to silence lint warnings (matches the other
    # JobAgent implementation in this file)
    utime, stime, cutime, _cstime, _elapsed = os.times()
    cpuTime = utime + stime + cutime
    self.log.info( 'Current raw CPU time consumed is %s' % cpuTime )
    timeleft = self.timeLeft - cpuTime * self.cpuFactor
    return timeleft

  #############################################################################
  def __changeProxy( self, oldProxy, newProxy ):
    """Can call glexec utility here to set uid or simply log the changeover
       of a proxy.
    """
    # Placeholder: currently only records that a proxy change would happen here
    message = 'Log proxy change (to be instrumented)'
    self.log.verbose( message )
    return S_OK()

  #############################################################################
  def __setupProxy( self, ownerDN, ownerGroup ):
    """
    Retrieve a proxy for the execution of the job

    :param str ownerDN: DN of the job owner
    :param str ownerGroup: DIRAC group of the job owner
    :return: S_OK( proxyChain ) or S_ERROR with a message
    """
    if gConfig.getValue( '/DIRAC/Security/UseServerCertificate' , False ):
      proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup )
      if not proxyResult['OK']:
        self.log.error( 'Invalid Proxy', proxyResult['Message'] )
        return S_ERROR( 'Failed to setup proxy: %s' % proxyResult[ 'Message' ] )
      return S_OK( proxyResult['Value'] )
    else:
      ret = getProxyInfo( disableVOMS = True )
      if not ret['OK']:
        self.log.error( 'Invalid Proxy', ret['Message'] )
        return S_ERROR( 'Invalid Proxy' )

      proxyChain = ret['Value']['chain']
      if not 'groupProperties' in ret['Value']:
        # Send the diagnostics through the logger instead of bare prints so
        # they end up in the agent log rather than on raw stdout
        self.log.error( 'Invalid Proxy', 'Group has no properties defined' )
        self.log.debug( str( ret['Value'] ) )
        self.log.debug( str( proxyChain.dumpAllToString() ) )
        return S_ERROR( 'Proxy has no group properties defined' )

      # A generic pilot group cannot run the payload directly: download the owner's proxy
      if Properties.GENERIC_PILOT in ret['Value']['groupProperties']:
        proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup )
        if not proxyResult['OK']:
          self.log.error( 'Invalid Proxy', proxyResult['Message'] )
          return S_ERROR( 'Failed to setup proxy: %s' % proxyResult[ 'Message' ] )
        proxyChain = proxyResult['Value']

    return S_OK( proxyChain )

  #############################################################################
  def __requestProxyFromProxyManager( self, ownerDN, ownerGroup ):
    """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.

       :param ownerDN: DN of the job owner
       :param ownerGroup: DIRAC group of the job owner
       :return: S_OK( proxy chain ) or S_ERROR
    """

    self.log.info( "Requesting proxy for %s@%s" % ( ownerDN, ownerGroup ) )
    token = gConfig.getValue( "/Security/ProxyToken", "" )
    if not token:
      self.log.info( "No token defined. Trying to download proxy without token" )
      token = False
    retVal = gProxyManager.getPayloadProxyFromDIRACGroup( ownerDN, ownerGroup,
                                                          self.defaultProxyLength, token )
    if not retVal[ 'OK' ]:
      # Include the failure reason in the error log (it was previously
      # dropped), matching the sibling implementation in this file.
      self.log.error( 'Could not retrieve proxy', retVal[ 'Message' ] )
      self.log.warn( retVal )
      # Dump local proxy information to aid debugging of the failure
      os.system( 'dirac-proxy-info' )
      sys.stdout.flush()
      return S_ERROR( 'Error retrieving proxy' )

    chain = retVal[ 'Value' ]
    return S_OK( chain )

  #############################################################################
  def __checkInstallSoftware( self, jobID, jobParams, resourceParams ):
    """Checks software requirement of job and whether this is already present
       before installing software locally.

       :param jobID: job identifier (used for status reporting)
       :param jobParams: dictionary of JDL job parameters
       :param resourceParams: description of the local resource
       :return: result of the VO software distribution module, or S_OK if
                the job declares no software requirement
    """
    # 'in' test instead of the deprecated dict.has_key() (removed in Python 3)
    if 'SoftwareDistModule' not in jobParams:
      msg = 'Job has no software installation requirement'
      self.log.verbose( msg )
      return S_OK( msg )

    self.__report( jobID, 'Matched', 'Installing Software' )
    softwareDist = jobParams['SoftwareDistModule']
    #HACK: Delete when svn repo is in production!
    softwareDist = softwareDist.replace( "DIRAC.LHCbSystem.", "LHCbDIRAC.Core." )
    #END OF HACK
    self.log.verbose( 'Found VO Software Distribution module: %s' % ( softwareDist ) )
    argumentsDict = {'Job':jobParams, 'CE':resourceParams}
    moduleFactory = ModuleFactory()
    moduleInstance = moduleFactory.getModule( softwareDist, argumentsDict )
    if not moduleInstance['OK']:
      return moduleInstance

    module = moduleInstance['Value']
    result = module.execute()
    return result

  #############################################################################
  def __submitJob( self, jobID, jobParams, resourceParams, optimizerParams, jobJDL, proxyChain ):
    """Create a custom Job Wrapper from the available job parameters and hand
       it to the Computing Element instance for execution.

       :return: S_OK (possibly carrying 'PayloadFailed') or S_ERROR
    """
    result = self.__createJobWrapper( jobID, jobParams, resourceParams, optimizerParams )
    if not result['OK']:
      return result
    wrapperFile = result['Value']

    self.__report( jobID, 'Matched', 'Submitted To CE' )
    self.log.info( 'Submitting %s to %sCE' % ( os.path.basename( wrapperFile ), self.ceName ) )

    # The payload proxy is handed to the CE as a string
    proxy = proxyChain.dumpAllToString()
    if not proxy['OK']:
      self.log.error( proxy )
      return S_ERROR( 'Payload Proxy Not Found' )
    payloadProxy = proxy['Value']

    # FIXME: how can we set the batchID before we submit, this makes no sense
    batchID = 'dc%s' % ( jobID )
    submission = self.computingElement.submitJob( wrapperFile, payloadProxy )

    if not submission['OK']:
      self.log.error( 'Job submission failed', jobID )
      self.__setJobParam( jobID, 'ErrorMessage', '%s CE Submission Error' % ( self.ceName ) )
      if 'ReschedulePayload' in submission:
        rescheduleFailedJob( jobID, submission['Message'], self.__report )
      return S_ERROR( '%s CE Submission Error: %s' % ( self.ceName, submission['Message'] ) )

    batchID = submission['Value']
    self.log.info( 'Job %s submitted as %s' % ( jobID, batchID ) )
    self.log.verbose( 'Set JobParameter: Local batch ID %s' % ( batchID ) )
    self.__setJobParam( jobID, 'LocalBatchID', str( batchID ) )
    ret = S_OK( 'Job submitted' )
    if 'PayloadFailed' in submission:
      # Propagate the payload exit code to the caller
      ret['PayloadFailed'] = submission['PayloadFailed']
      return ret
    # Throttle successive submissions
    time.sleep( self.jobSubmissionDelay )
    return ret

  #############################################################################
  def __createJobWrapper( self, jobID, jobParams, resourceParams, optimizerParams ):
    """This method creates a job wrapper filled with the CE and Job parameters
       to executed the job.
    """
    arguments = {'Job':jobParams,
                 'CE':resourceParams,
                 'Optimizer':optimizerParams}
    self.log.verbose( 'Job arguments are: \n %s' % ( arguments ) )

    workingDir = gConfig.getValue( '/LocalSite/WorkingDirectory', self.siteRoot )
    if not os.path.exists( '%s/job/Wrapper' % ( workingDir ) ):
      try:
        os.makedirs( '%s/job/Wrapper' % ( workingDir ) )
      except Exception:
        self.log.exception()
        return S_ERROR( 'Could not create directory for wrapper script' )

    jobWrapperFile = '%s/job/Wrapper/Wrapper_%s' % ( workingDir, jobID )
    if os.path.exists( jobWrapperFile ):
      self.log.verbose( 'Removing existing Job Wrapper for %s' % ( jobID ) )
      os.remove( jobWrapperFile )
    fd = open( self.jobWrapperTemplate, 'r' )
    wrapperTemplate = fd.read()
    fd.close()

    dateStr = time.strftime( "%Y-%m-%d", time.localtime( time.time() ) )
    timeStr = time.strftime( "%H:%M", time.localtime( time.time() ) )
    date_time = '%s %s' % ( dateStr, timeStr )
    signature = __RCSID__
    dPython = sys.executable

    systemConfig = ''
    if jobParams.has_key( 'SystemConfig' ):
      systemConfig = jobParams['SystemConfig']
      self.log.verbose( 'Job system configuration requirement is %s' % ( systemConfig ) )
      if resourceParams.has_key( 'Root' ):
        jobPython = '%s/%s/bin/python' % ( resourceParams['Root'], systemConfig )
        if os.path.exists( jobPython ):
          self.log.verbose( 'Found local python for job:\n%s' % ( jobPython ) )
          dPython = jobPython
        else:
          if systemConfig == 'ANY':
            self.log.verbose( 'Using standard available python %s for job' % ( dPython ) )
          else:
            self.log.warn( 'Job requested python \n%s\n but this is not available locally' % ( jobPython ) )
      else:
        self.log.warn( 'No LocalSite/Root defined' )
    else:
      self.log.warn( 'Job has no system configuration requirement' )

    if not systemConfig or systemConfig.lower() == 'any':
      systemConfig = gConfig.getValue( '/LocalSite/Architecture', '' )
      if not systemConfig:
        self.log.warn( 'Could not establish SystemConfig' )

    logLevel = self.defaultLogLevel
    if jobParams.has_key( 'LogLevel' ):
      logLevel = jobParams['LogLevel']
      self.log.info( 'Found Job LogLevel JDL parameter with value: %s' % ( logLevel ) )
    else:
      self.log.info( 'Applying default LogLevel JDL parameter with value: %s' % ( logLevel ) )

    realPythonPath = os.path.realpath( dPython )
    self.log.debug( 'Real python path after resolving links is:' )
    self.log.debug( realPythonPath )
    dPython = realPythonPath

    siteRootPython = self.siteRoot
    self.log.debug( 'DIRACPython is:\n%s' % dPython )
    self.log.debug( 'SiteRootPythonDir is:\n%s' % siteRootPython )
    wrapperTemplate = wrapperTemplate.replace( "@SIGNATURE@", str( signature ) )
    wrapperTemplate = wrapperTemplate.replace( "@JOBID@", str( jobID ) )
    wrapperTemplate = wrapperTemplate.replace( "@DATESTRING@", str( date_time ) )
    wrapperTemplate = wrapperTemplate.replace( "@JOBARGS@", str( arguments ) )
    wrapperTemplate = wrapperTemplate.replace( "@SITEPYTHON@", str( siteRootPython ) )
    wrapper = open ( jobWrapperFile, "w" )
    wrapper.write( wrapperTemplate )
    wrapper.close ()
    jobExeFile = '%s/job/Wrapper/Job%s' % ( workingDir, jobID )
    jobFileContents = \
"""#!/bin/sh
%s %s -o LogLevel=%s
""" % ( dPython, jobWrapperFile, logLevel )
    jobFile = open( jobExeFile, 'w' )
    jobFile.write( jobFileContents )
    jobFile.close()
    return S_OK( jobExeFile )

  #############################################################################
  def __saveJobJDLRequest( self, jobID, jobJDL ):
    """Save job JDL local to JobAgent.

       The JDL is annotated with the local CE name and written to
       <jobID>.jdl so it is available for external monitoring.

       :return: S_OK( jdl file name )
    """
    classAdJob = ClassAd( jobJDL )
    classAdJob.insertAttributeString( 'LocalCE', self.ceName )
    # str() guards against a non-string jobID; 'with' closes the file
    # even if the write raises
    jdlFileName = str( jobID ) + '.jdl'
    with open( jdlFileName, 'w' ) as jdlFile:
      jdlFile.write( classAdJob.asJDL() )
    return S_OK( jdlFileName )

  #############################################################################
  def __requestJob( self, resourceJDL ):
    """Request a single job from the matcher service.

       :param resourceJDL: JDL describing the local resource capabilities
       :return: the matcher result structure, or S_ERROR on exception
    """
    try:
      matcher = RPCClient( 'WorkloadManagement/Matcher', timeout = 600 )
      return matcher.requestJob( resourceJDL )
    except Exception as x:
      # 'except ... as' form (PEP 3110) works on python 2.6+ and python 3,
      # matching the style used elsewhere in this file
      self.log.exception( lException = x )
      return S_ERROR( 'Job request to matcher service failed with exception' )
Ejemplo n.º 18
0
class JobAgent(AgentModule):
    """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration.
  """

    #############################################################################
    def initialize(self, loops=0):
        """Sets default parameters and creates CE instance

        :param loops: maximum number of agent cycles (0 = unlimited)
        """
        # Disable monitoring
        self.am_setOption('MonitoringEnabled', False)
        # self.log.setLevel('debug') #temporary for debugging
        self.am_setOption('MaxCycles', loops)

        # A CE defined locally overrides the agent-configured CE type
        ceType = self.am_getOption('CEType', 'InProcess')
        localCE = gConfig.getValue('/LocalSite/LocalCE', '')
        if localCE:
            self.log.info('Defining CE from local configuration = %s' %
                          localCE)
            ceType = localCE

        # Create backend Computing Element
        ceFactory = ComputingElementFactory()
        self.ceName = ceType
        ceInstance = ceFactory.getCE(ceType)
        if not ceInstance['OK']:
            self.log.warn(ceInstance['Message'])
            return ceInstance
        self.computingElement = ceInstance['Value']

        result = self.computingElement.getDescription()
        if not result['OK']:
            self.log.warn("Can not get the CE description")
            return result
        ceDict = result['Value']
        # Slot CPU time: taken from the CE description, possibly overridden
        # by the CEDefaults configuration
        self.timeLeft = ceDict.get('CPUTime', 0.0)
        self.timeLeft = gConfig.getValue(
            '/Resources/Computing/CEDefaults/MaxCPUTime', self.timeLeft)

        # Reference point for measuring CPU consumed by this process tree
        self.initTimes = os.times()
        # Localsite options
        self.siteName = gConfig.getValue('/LocalSite/Site', 'Unknown')
        self.pilotReference = gConfig.getValue('/LocalSite/PilotReference',
                                               'Unknown')
        self.defaultProxyLength = gConfig.getValue(
            '/Registry/DefaultProxyLifeTime', 86400 * 5)
        # Agent options
        # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
        self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor',
                                          0.0)
        self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', 10)
        self.fillingMode = self.am_getOption('FillingModeFlag', False)
        self.minimumTimeLeft = self.am_getOption('MinimumTimeLeft', 1000)
        self.stopOnApplicationFailure = self.am_getOption(
            'StopOnApplicationFailure', True)
        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', 10)
        self.jobCount = 0
        self.matchFailedCount = 0
        self.extraOptions = gConfig.getValue(
            '/AgentJobRequirements/ExtraOptions', '')
        # Timeleft
        self.timeLeftUtil = TimeLeft()
        self.timeLeftError = ''
        self.scaledCPUTime = 0.0
        self.pilotInfoReportedFlag = False
        return S_OK()

    #############################################################################
    def execute(self):
        """The JobAgent execution method.

        One cycle: check the remaining CPU time in the slot (filling mode),
        ask the Matcher for a job, set up the owner proxy, install any
        required software, and submit the job wrapper to the Computing
        Element. Finally the time left in the slot is recomputed for the
        next cycle.
        """
        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('%s normalized CPU units remaining in slot' %
                              (self.timeLeft))
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish('No more time left')
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join('.', self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc",
                                                   "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection('/LocalSite'):
                    localCfg.createNewSection('/LocalSite')
                localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        available = self.computingElement.available()
        if not available['OK'] or not available['Value']:
            self.log.info('Resource is not available')
            self.log.info(available['Message'])
            return self.__finish('CE Not Available')

        self.log.info(available['Message'])

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result
        ceDict = result['Value']

        # Add pilot information
        gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
        if gridCE != 'Unknown':
            ceDict['GridCE'] = gridCE
        if not 'PilotReference' in ceDict:
            ceDict['PilotReference'] = str(self.pilotReference)
        ceDict['PilotBenchmark'] = self.cpuFactor
        ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict('/AgentJobRequirements')
        if result['OK']:
            requirementsDict = result['Value']
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info('MatcherTime = %.2f (s)' % (matchTime))

        # Re-read the option so it can be changed while the agent runs
        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:
            # 'No match found' and timeouts are benign: count them and retry
            # until stopAfterFailedMatches consecutive failures
            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK: %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error('Timeout while requesting job',
                               jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                errorMsg = 'Pilot version does not match the production version'
                self.log.error(errorMsg,
                               jobRequest['Message'].replace(errorMsg, ''))
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs: %s' %
                                (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get(
                'PilotInfoReportedFlag', False)
        jobID = matcherInfo['JobID']
        # These keys must be present and non-empty in the Matcher answer
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if param not in matcherInfo:
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned %s = %s ' %
                                 (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        # Everything else returned by the Matcher is handed to the wrapper
        # as optimizer parameters
        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn(parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if 'JobID' not in params:
            msg = 'Job has not JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if 'JobType' not in params:
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if 'CPUTime' not in params:
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        if self.extraOptions:
            params['Arguments'] += ' ' + self.extraOptions
            params['ExtraOptions'] = self.extraOptions

        self.log.verbose('Job request successful: \n', jobRequest['Value'])
        self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
        self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)

            if 'BOINC_JOB_ID' in os.environ:
                # Report BOINC environment
                for p in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform',
                          'BoincHostName'):
                    jobReport.setJobParameter(p,
                                              gConfig.getValue(
                                                  '/LocalSite/%s' % p,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self.__rescheduleFailedJob(
                    jobID, result['Message'], self.stopOnApplicationFailure)
            proxyChain = result.get('Value')

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job',
                               '%s' % (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self.__rescheduleFailedJob(
                    jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.debug('Before %sCE submitJob()' % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict,
                                          optimizerParams, proxyChain)
            if not submission['OK']:
                self.__report(jobID, 'Failed', submission['Message'])
                return self.__finish(submission['Message'])
            elif 'PayloadFailed' in submission:
                # Do not keep running and do not overwrite the Payload error
                message = 'Payload execution failed with error code %s' % submission[
                    'PayloadFailed']
                if self.stopOnApplicationFailure:
                    return self.__finish(message,
                                         self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug('After %sCE submitJob()' % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
        cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            if result['Message'] != 'Current batch system is not supported':
                self.timeLeftError = result['Message']
            else:
                # if the batch system is not defined, use the process time and the CPU normalization defined locally
                self.timeLeft = self.__getCPUTimeLeft()

        scaledCPUTime = self.timeLeftUtil.getScaledCPU()
        self.__setJobParam(jobID, 'ScaledCPUTime',
                           str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK('Job Agent cycle complete')

    #############################################################################
    def __saveJobJDLRequest(self, jobID, jobJDL):
        """Save job JDL local to JobAgent.

        The JDL is annotated with the local CE name and written to
        <jobID>.jdl so it is available for external monitoring.

        :return: S_OK( jdl file name )
        """
        classAdJob = ClassAd(jobJDL)
        classAdJob.insertAttributeString('LocalCE', self.ceName)
        # str() guards against a non-string jobID; 'with' closes the file
        # even if the write raises
        jdlFileName = str(jobID) + '.jdl'
        with open(jdlFileName, 'w') as jdlFile:
            jdlFile.write(classAdJob.asJDL())
        # Return the file name, consistent with the sibling implementation
        # in this file (callers currently ignore the return value)
        return S_OK(jdlFileName)

    #############################################################################
    def __getCPUTimeLeft(self):
        """Estimate the normalized CPU time still available in the slot.

        The raw CPU consumed so far (all fields of os.times() except the
        elapsed wall-clock time) is scaled by the locally configured CPU
        normalization factor and subtracted from the slot allocation.
        """
        # All entries but the last (elapsed wall clock) are CPU times
        consumed = sum(os.times()[:-1])
        self.log.info('Current raw CPU time consumed is %s' % consumed)
        if not self.cpuFactor:
            # No normalization factor known: report the full allocation
            return self.timeLeft
        return self.timeLeft - consumed * self.cpuFactor

    #############################################################################
    def __changeProxy(self, oldProxy, newProxy):
        """Placeholder for switching the execution proxy (e.g. via the
        glexec utility to change uid).

        Currently it only records that a proxy change was requested.
        """
        self.log.verbose('Log proxy change (to be instrumented)')
        return S_OK()

    #############################################################################
    def __setupProxy(self, ownerDN, ownerGroup):
        """
    Retrieve a proxy for the execution of the job

    :param ownerDN: DN of the job owner
    :param ownerGroup: DIRAC group of the job owner
    :return: S_OK( proxy chain ) or S_ERROR
    """
        # With a server certificate the payload proxy is always downloaded
        # from the ProxyManager
        if gConfig.getValue('/DIRAC/Security/UseServerCertificate', False):
            proxyResult = self.__requestProxyFromProxyManager(
                ownerDN, ownerGroup)
            if not proxyResult['OK']:
                self.log.error('Failed to setup proxy', proxyResult['Message'])
                return S_ERROR('Failed to setup proxy: %s' %
                               proxyResult['Message'])
            return S_OK(proxyResult['Value'])
        else:
            # Otherwise inspect the proxy already present in the environment
            ret = getProxyInfo(disableVOMS=True)
            if not ret['OK']:
                self.log.error('Invalid Proxy', ret['Message'])
                return S_ERROR('Invalid Proxy')

            proxyChain = ret['Value']['chain']
            if not 'groupProperties' in ret['Value']:
                # Debug dump of the unexpected proxy info before bailing out
                print ret['Value']
                print proxyChain.dumpAllToString()
                self.log.error('Invalid Proxy',
                               'Group has no properties defined')
                return S_ERROR('Proxy has no group properties defined')

            # A pilot proxy cannot run the payload itself: fetch the job
            # owner's proxy from the ProxyManager instead
            groupProps = ret['Value']['groupProperties']
            if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
                proxyResult = self.__requestProxyFromProxyManager(
                    ownerDN, ownerGroup)
                if not proxyResult['OK']:
                    self.log.error('Invalid Proxy', proxyResult['Message'])
                    return S_ERROR('Failed to setup proxy: %s' %
                                   proxyResult['Message'])
                proxyChain = proxyResult['Value']

        return S_OK(proxyChain)

    #############################################################################
    def __requestProxyFromProxyManager(self, ownerDN, ownerGroup):
        """Download a payload proxy with the correct role for the job and
        set up the environment to run the job locally.

        :param ownerDN: DN of the job owner
        :param ownerGroup: DIRAC group of the job owner
        :return: S_OK( proxy chain ) or S_ERROR
        """
        self.log.info("Requesting proxy for %s@%s" % (ownerDN, ownerGroup))
        token = gConfig.getValue("/Security/ProxyToken", "")
        if not token:
            self.log.info(
                "No token defined. Trying to download proxy without token")
            token = False
        retVal = gProxyManager.getPayloadProxyFromDIRACGroup(
            ownerDN, ownerGroup, self.defaultProxyLength, token)
        if retVal['OK']:
            return S_OK(retVal['Value'])
        # Dump local proxy information to aid debugging of the failure
        self.log.error('Could not retrieve payload proxy',
                       retVal['Message'])
        self.log.warn(retVal)
        os.system('dirac-proxy-info')
        sys.stdout.flush()
        return S_ERROR('Error retrieving proxy')

    #############################################################################
    def __checkInstallSoftware(self, jobID, jobParams, resourceParams):
        """Run the VO software-distribution module declared by the job, if
        any, reporting the installation status for the job.

        :return: result of the distribution module, or S_OK when the job
                 declares no software requirement
        """
        if 'SoftwareDistModule' not in jobParams:
            msg = 'Job has no software installation requirement'
            self.log.verbose(msg)
            return S_OK(msg)

        self.__report(jobID, 'Matched', 'Installing Software')
        softwareDist = jobParams['SoftwareDistModule']
        self.log.verbose('Found VO Software Distribution module: %s' %
                         (softwareDist))
        moduleInstance = ModuleFactory().getModule(
            softwareDist, {'Job': jobParams, 'CE': resourceParams})
        if not moduleInstance['OK']:
            return moduleInstance
        return moduleInstance['Value'].execute()

    #############################################################################
    def __submitJob(self, jobID, jobParams, resourceParams, optimizerParams,
                    proxyChain):
        """Submit job to the Computing Element instance after creating a custom
        Job Wrapper with the available job parameters.

        :param jobID: job identifier
        :param jobParams: dictionary of JDL job parameters
        :param resourceParams: description of the local resource
        :param optimizerParams: extra parameters returned by the Matcher
        :param proxyChain: proxy chain object used to run the payload
        :return: S_OK (possibly carrying 'PayloadFailed') or S_ERROR
        """
        logLevel = self.am_getOption('DefaultLogLevel', 'INFO')
        defaultWrapperLocation = self.am_getOption(
            'JobWrapperTemplate',
            'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py')
        result = createJobWrapper(
            jobID,
            jobParams,
            resourceParams,
            optimizerParams,
            extraOptions=self.extraOptions,
            defaultWrapperLocation=defaultWrapperLocation,
            log=self.log,
            logLevel=logLevel)
        if not result['OK']:
            return result

        wrapperFile = result['Value']
        self.__report(jobID, 'Matched', 'Submitted To CE')

        self.log.info('Submitting %s to %sCE' %
                      (os.path.basename(wrapperFile), self.ceName))

        # Pass proxy to the CE
        proxy = proxyChain.dumpAllToString()
        if not proxy['OK']:
            self.log.error('Invalid proxy', proxy)
            return S_ERROR('Payload Proxy Not Found')

        payloadProxy = proxy['Value']
        # FIXME: how can we set the batchID before we submit, this makes no sense
        batchID = 'dc%s' % (jobID)
        submission = self.computingElement.submitJob(wrapperFile, payloadProxy)

        ret = S_OK('Job submitted')

        if submission['OK']:
            batchID = submission['Value']
            self.log.info('Job %s submitted as %s' % (jobID, batchID))
            self.log.verbose('Set JobParameter: Local batch ID %s' % (batchID))
            self.__setJobParam(jobID, 'LocalBatchID', str(batchID))
            if 'PayloadFailed' in submission:
                # Propagate the payload exit code to the caller
                ret['PayloadFailed'] = submission['PayloadFailed']
                return ret
            # Throttle successive submissions
            time.sleep(self.jobSubmissionDelay)
        else:
            self.log.error('Job submission failed', jobID)
            self.__setJobParam(jobID, 'ErrorMessage',
                               '%s CE Submission Error' % (self.ceName))
            if 'ReschedulePayload' in submission:
                rescheduleFailedJob(jobID, submission['Message'])
                return S_OK(
                )  # Without this job is marked as failed at line 265 above
            else:
                if 'Value' in submission:
                    self.log.error(
                        'Error in DIRAC JobWrapper:',
                        'exit code = %s' % (str(submission['Value'])))
                # make sure the Job is declared Failed
                self.__report(jobID, 'Failed', submission['Message'])
            return S_ERROR('%s CE Submission Error: %s' %
                           (self.ceName, submission['Message']))

        return ret

    #############################################################################
    def __requestJob(self, ceDict):
        """Ask the matcher service for a single job matching ceDict.

        Any exception raised by the RPC call is converted into S_ERROR.
        """
        try:
            matcher = RPCClient('WorkloadManagement/Matcher', timeout=600)
            return matcher.requestJob(ceDict)
        except Exception as x:
            self.log.exception(lException=x)
            return S_ERROR(
                "Job request to matcher service failed with exception")

    #############################################################################
    def __getJDLParameters(self, jdl):
        """Returns a dictionary of JDL parameters.

        List-valued parameters ('{...}') are returned as python lists;
        all other values are returned as strings with quotes stripped.

        :param jdl: JDL string, with or without enclosing brackets
        :return: S_OK( parameter dict ) or S_ERROR on parsing failure
        """
        try:
            parameters = {}
            # ClassAd requires the JDL to be enclosed in square brackets
            if not re.search(r'\[', jdl):
                jdl = '[' + jdl + ']'
            classAdJob = ClassAd(jdl)
            paramsDict = classAdJob.contents
            for param, value in paramsDict.items():
                if value.strip().startswith('{'):
                    self.log.debug('Found list type parameter %s' % (param))
                    rawValues = value.replace('{',
                                              '').replace('}', '').replace(
                                                  '"', '').split()
                    valueList = []
                    for val in rawValues:
                        # Drop a single trailing comma left by the JDL syntax
                        if val.endswith(','):
                            valueList.append(val[:-1])
                        else:
                            valueList.append(val)
                    parameters[param] = valueList
                else:
                    parameters[param] = value.replace('"', '').replace(
                        '{', '"{').replace('}', '}"')
                    self.log.debug('Found standard parameter %s: %s' %
                                   (param, parameters[param]))
            return S_OK(parameters)
        except Exception as x:
            # 'except ... as' (PEP 3110) matches the style used at
            # __requestJob above and works on python 2.6+ and python 3
            self.log.exception(lException=x)
            return S_ERROR('Exception while extracting JDL parameters for job')
Ejemplo n.º 19
0
class JobAgent(AgentModule):
    """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

    #############################################################################
    def initialize(self, loops=0):
        """Set default parameters and create the CE instance.

        :param loops: maximum number of agent cycles to run (0 = unlimited)
        :return: S_OK, or the S_ERROR from the CE factory on failure
        """
        # Disable monitoring
        self.am_setOption("MonitoringEnabled", False)
        # self.log.setLevel('debug') #temporary for debugging
        self.am_setOption("MaxCycles", loops)

        # A LocalCE defined in the local configuration overrides the agent option
        ceType = self.am_getOption("CEType", "InProcess")
        localCE = gConfig.getValue("/LocalSite/LocalCE", "")
        if localCE:
            self.log.info("Defining CE from local configuration = %s" % localCE)
            ceType = localCE

        ceFactory = ComputingElementFactory()
        self.ceName = ceType
        ceInstance = ceFactory.getCE(ceType)
        if not ceInstance["OK"]:
            self.log.warn(ceInstance["Message"])
            return ceInstance

        # Reference point for the CPU-time accounting done in execute()
        self.initTimes = os.times()

        self.computingElement = ceInstance["Value"]
        self.diracRoot = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
        # Localsite options
        self.siteRoot = gConfig.getValue("/LocalSite/Root", os.getcwd())
        self.siteName = gConfig.getValue("/LocalSite/Site", "Unknown")
        self.pilotReference = gConfig.getValue("/LocalSite/PilotReference", "Unknown")
        self.defaultProxyLength = gConfig.getValue("/Registry/DefaultProxyLifeTime", 86400 * 5)
        # Agent options
        # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
        self.cpuFactor = gConfig.getValue("/LocalSite/CPUNormalizationFactor", 0.0)
        defaultWrapperLocation = "DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py"
        self.jobWrapperTemplate = os.path.join(
            self.diracRoot, self.am_getOption("JobWrapperTemplate", defaultWrapperLocation)
        )
        self.jobSubmissionDelay = self.am_getOption("SubmissionDelay", 10)
        self.defaultLogLevel = self.am_getOption("DefaultLogLevel", "info")
        self.fillingMode = self.am_getOption("FillingModeFlag", False)
        self.stopOnApplicationFailure = self.am_getOption("StopOnApplicationFailure", True)
        self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", 10)
        self.jobCount = 0
        self.matchFailedCount = 0
        # Timeleft
        self.timeLeftUtil = TimeLeft()
        self.timeLeft = gConfig.getValue("/Resources/Computing/CEDefaults/MaxCPUTime", 0.0)
        self.gridCEQueue = gConfig.getValue("/Resources/Computing/CEDefaults/GridCEQueue", "")
        self.timeLeftError = ""
        self.scaledCPUTime = 0.0
        self.pilotInfoReportedFlag = False
        return S_OK()

    #############################################################################
    def execute(self):
        """Run one JobAgent cycle.

        Checks the remaining CPU time (filling mode), requests a job from the
        matcher, validates the returned JDL, sets up the owner's proxy,
        installs required software and submits the job wrapper to the CE,
        then updates the time-left bookkeeping.
        """
        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info("Attempting to check CPU time left for filling mode")
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info("%s normalized CPU units remaining in slot" % (self.timeLeft))
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
                if not result["OK"]:
                    return self.__finish(result["Message"])
            else:
                # Without filling mode the agent stops after its first job
                return self.__finish("Filling Mode is Disabled")

        self.log.verbose("Job Agent execution loop")
        # Make sure the CE can still accept a payload before matching
        available = self.computingElement.available()
        if not available["OK"] or not available["Value"]:
            self.log.info("Resource is not available")
            self.log.info(available["Message"])
            return self.__finish("CE Not Available")

        self.log.info(available["Message"])

        result = self.computingElement.getDescription()
        if not result["OK"]:
            return result
        ceDict = result["Value"]

        # Add pilot information
        # NOTE(review): this path lacks the leading "/" used elsewhere -- confirm intended
        gridCE = gConfig.getValue("LocalSite/GridCE", "Unknown")
        if gridCE != "Unknown":
            ceDict["GridCE"] = gridCE
        if not "PilotReference" in ceDict:
            ceDict["PilotReference"] = str(self.pilotReference)
        ceDict["PilotBenchmark"] = self.cpuFactor
        ceDict["PilotInfoReportedFlag"] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict("/AgentJobRequirements")
        if result["OK"]:
            requirementsDict = result["Value"]
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info("MatcherTime = %.2f (s)" % (matchTime))

        # Re-read the option so it can be changed while the agent is running
        self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", self.stopAfterFailedMatches)

        if not jobRequest["OK"]:
            # "No match" and timeouts are not fatal: count them and retry next cycle
            if re.search("No match found", jobRequest["Message"]):
                self.log.notice("Job request OK: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("seconds timeout") != -1:
                self.log.error(jobRequest["Message"])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("Pilot version does not match") != -1:
                # Version mismatch is fatal: the pilot cannot run any job
                self.log.error(jobRequest["Message"])
                return S_ERROR(jobRequest["Message"])
            else:
                self.log.notice("Failed to get jobs: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest["Value"]
        jobID = matcherInfo["JobID"]
        self.pilotInfoReportedFlag = matcherInfo.get("PilotInfoReportedFlag", False)
        # These three keys are mandatory in a matcher answer
        matcherParams = ["JDL", "DN", "Group"]
        for param in matcherParams:
            if not matcherInfo.has_key(param):
                self.__report(jobID, "Failed", "Matcher did not return %s" % (param))
                return self.__finish("Matcher Failed")
            elif not matcherInfo[param]:
                self.__report(jobID, "Failed", "Matcher returned null %s" % (param))
                return self.__finish("Matcher Failed")
            else:
                self.log.verbose("Matcher returned %s = %s " % (param, matcherInfo[param]))

        jobJDL = matcherInfo["JDL"]
        jobGroup = matcherInfo["Group"]
        ownerDN = matcherInfo["DN"]

        # Everything else returned by the matcher is passed to the wrapper as-is
        optimizerParams = {}
        for key in matcherInfo.keys():
            if not key in matcherParams:
                value = matcherInfo[key]
                optimizerParams[key] = value

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters["OK"]:
            self.__report(jobID, "Failed", "Could Not Extract JDL Parameters")
            self.log.warn(parameters["Message"])
            return self.__finish("JDL Problem")

        params = parameters["Value"]
        if not params.has_key("JobID"):
            msg = "Job has not JobID defined in JDL parameters"
            self.__report(jobID, "Failed", msg)
            self.log.warn(msg)
            return self.__finish("JDL Problem")
        else:
            jobID = params["JobID"]

        if not params.has_key("JobType"):
            self.log.warn("Job has no JobType defined in JDL parameters")
            jobType = "Unknown"
        else:
            jobType = params["JobType"]

        # SystemConfig may be absent or "ANY": fall back to the local architecture
        if not params.has_key("SystemConfig"):
            self.log.warn("Job has no system configuration defined in JDL parameters")
            systemConfig = gConfig.getValue("/LocalSite/Architecture", "")
            self.log.info(
                "Setting system config to /LocalSite/Architecture = %s since it was not specified" % systemConfig
            )
            if not systemConfig:
                self.log.warn("/LocalSite/Architecture is not defined")
            params["SystemConfig"] = systemConfig
        else:
            systemConfig = params["SystemConfig"]
            if systemConfig.lower() == "any":
                systemConfig = gConfig.getValue("/LocalSite/Architecture", "")
                self.log.info(
                    "Setting SystemConfig = /LocalSite/Architecture =",
                    '"%s" since it was set to "ANY" in the job description' % systemConfig,
                )
                if not systemConfig:
                    self.log.warn("/LocalSite/Architecture is not defined")
                params["SystemConfig"] = systemConfig

        if not params.has_key("CPUTime"):
            self.log.warn("Job has no CPU requirement defined in JDL parameters")

        self.log.verbose("Job request successful: \n %s" % (jobRequest["Value"]))
        self.log.info("Received JobID=%s, JobType=%s, SystemConfig=%s" % (jobID, jobType, systemConfig))
        self.log.info("OwnerDN: %s JobGroup: %s" % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
            jobReport.setJobParameter("MatcherServiceTime", str(matchTime), sendFlag=False)
            if self.gridCEQueue:
                jobReport.setJobParameter("GridCEQueue", self.gridCEQueue, sendFlag=False)

            if os.environ.has_key("BOINC_JOB_ID"):
                # Report BOINC environment
                for p in ["BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"]:
                    jobReport.setJobParameter(p, gConfig.getValue("/LocalSite/%s" % p, "Unknown"), sendFlag=False)

            jobReport.setJobStatus("Matched", "Job Received by Agent")
            # self.__setJobSite( jobID, self.siteName )
            if not self.pilotInfoReportedFlag:
                self.__reportPilotInfo(jobID)
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result["OK"]:
                return self.__rescheduleFailedJob(jobID, result["Message"], params, self.stopOnApplicationFailure)
            if "Value" in result and result["Value"]:
                proxyChain = result["Value"]

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software["OK"]:
                self.log.error("Failed to install software for job %s" % (jobID))
                errorMsg = software["Message"]
                if not errorMsg:
                    errorMsg = "Failed software installation"
                return self.__rescheduleFailedJob(jobID, errorMsg, params, self.stopOnApplicationFailure)

            self.log.verbose("Before %sCE submitJob()" % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict, optimizerParams, jobJDL, proxyChain)
            if not submission["OK"]:
                self.__report(jobID, "Failed", submission["Message"])
                return self.__finish(submission["Message"])
            elif "PayloadFailed" in submission:
                # Do not keep running and do not overwrite the Payload error
                return self.__finish(
                    "Payload execution failed with error code %s" % submission["PayloadFailed"],
                    self.stopOnApplicationFailure,
                )

            self.log.verbose("After %sCE submitJob()" % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, "Job processing failed with exception", params, self.stopOnApplicationFailure
            )

        # CPU consumed since initialize(), for time-left accounting
        currentTimes = list(os.times())
        for i in range(len(currentTimes)):
            currentTimes[i] -= self.initTimes[i]

        utime, stime, cutime, cstime, elapsed = currentTimes
        cpuTime = utime + stime + cutime + cstime

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result["OK"]:
            self.timeLeft = result["Value"]
        else:
            if result["Message"] != "Current batch system is not supported":
                self.timeLeftError = result["Message"]
            else:
                if self.cpuFactor:
                    # if the batch system is not defined used the CPUNormalizationFactor
                    # defined locally
                    self.timeLeft = self.__getCPUTimeLeft()
        scaledCPUTime = self.timeLeftUtil.getScaledCPU()["Value"]

        # Report only the CPU consumed by this job (delta from previous cycle)
        self.__setJobParam(jobID, "ScaledCPUTime", str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK("Job Agent cycle complete")

    #############################################################################
    def __getCPUTimeLeft(self):
        """Estimate the remaining CPU time using the local normalization factor.

        NOTE(review): child system time (cstime) is excluded here, unlike the
        accounting in execute() -- confirm this is intentional.
        """
        processTimes = os.times()
        # user + system + child-user CPU time of this process tree
        cpuTime = processTimes[0] + processTimes[1] + processTimes[2]
        self.log.info("Current raw CPU time consumed is %s" % cpuTime)
        return self.timeLeft - cpuTime * self.cpuFactor

    #############################################################################
    def __changeProxy(self, oldProxy, newProxy):
        """Can call glexec utility here to set uid or simply log the changeover
       of a proxy.

        Currently a placeholder: it only logs that a change would occur.

        :param oldProxy: proxy being replaced (currently unused)
        :param newProxy: replacement proxy (currently unused)
        :return: S_OK always
    """
        self.log.verbose("Log proxy change (to be instrumented)")
        return S_OK()

    #############################################################################
    def __setupProxy(self, ownerDN, ownerGroup):
        """Retrieve a proxy for the execution of the job.

        When the pilot runs with a server certificate, or the local proxy
        group carries pilot properties, a payload proxy is downloaded from
        the ProxyManager; otherwise the locally available chain is used.

        :param ownerDN: distinguished name of the job owner
        :param ownerGroup: DIRAC group of the job owner
        :return: S_OK(proxyChain) or S_ERROR
        """
        if gConfig.getValue("/DIRAC/Security/UseServerCertificate", False):
            proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup)
            if not proxyResult["OK"]:
                self.log.error("Invalid Proxy", proxyResult["Message"])
                return S_ERROR("Failed to setup proxy: %s" % proxyResult["Message"])
            return S_OK(proxyResult["Value"])
        else:
            ret = getProxyInfo(disableVOMS=True)
            if not ret["OK"]:
                self.log.error("Invalid Proxy", ret["Message"])
                return S_ERROR("Invalid Proxy")

            proxyChain = ret["Value"]["chain"]
            if "groupProperties" not in ret["Value"]:
                # Dump diagnostic information before failing
                # (print() function replaces the Py2-only print statements)
                print(ret["Value"])
                print(proxyChain.dumpAllToString())
                self.log.error("Invalid Proxy", "Group has no properties defined")
                return S_ERROR("Proxy has no group properties defined")

            groupProps = ret["Value"]["groupProperties"]
            # A pilot-type proxy cannot run the payload: fetch the owner's proxy
            if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
                proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup)
                if not proxyResult["OK"]:
                    self.log.error("Invalid Proxy", proxyResult["Message"])
                    return S_ERROR("Failed to setup proxy: %s" % proxyResult["Message"])
                proxyChain = proxyResult["Value"]

        return S_OK(proxyChain)

    #############################################################################
    def __requestProxyFromProxyManager(self, ownerDN, ownerGroup):
        """Download a payload proxy for the given owner and group from the ProxyManager.

        :param ownerDN: distinguished name of the job owner
        :param ownerGroup: DIRAC group of the job owner
        :return: S_OK(chain) or S_ERROR
        """
        self.log.info("Requesting proxy for %s@%s" % (ownerDN, ownerGroup))
        token = gConfig.getValue("/Security/ProxyToken", "")
        if not token:
            self.log.info("No token defined. Trying to download proxy without token")
            token = False
        retVal = gProxyManager.getPayloadProxyFromDIRACGroup(ownerDN, ownerGroup, self.defaultProxyLength, token)
        if retVal["OK"]:
            return S_OK(retVal["Value"])
        # Diagnose the failure locally before giving up
        self.log.error("Could not retrieve proxy")
        self.log.warn(retVal)
        os.system("dirac-proxy-info")
        sys.stdout.flush()
        return S_ERROR("Error retrieving proxy")

    #############################################################################
    def __checkInstallSoftware(self, jobID, jobParams, resourceParams):
        """Check the job's software requirement and run the VO installation module if needed.

        :param jobID: job identifier, used for status reporting
        :param jobParams: dictionary of JDL job parameters
        :param resourceParams: dictionary describing the local CE
        :return: result of the installation module, or S_OK if nothing to install
        """
        # "has_key" is Python2-only; the "in" operator is the portable equivalent
        if "SoftwareDistModule" not in jobParams:
            msg = "Job has no software installation requirement"
            self.log.verbose(msg)
            return S_OK(msg)

        self.__report(jobID, "Matched", "Installing Software")
        softwareDist = jobParams["SoftwareDistModule"]
        self.log.verbose("Found VO Software Distribution module: %s" % (softwareDist))
        argumentsDict = {"Job": jobParams, "CE": resourceParams}
        moduleFactory = ModuleFactory()
        moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict)
        if not moduleInstance["OK"]:
            return moduleInstance

        # Execute the VO-specific installation and propagate its result
        module = moduleInstance["Value"]
        result = module.execute()
        return result

    #############################################################################
    def __submitJob(self, jobID, jobParams, resourceParams, optimizerParams, jobJDL, proxyChain):
        """Submit job to the Computing Element instance after creating a custom
       Job Wrapper with the available job parameters.

        :param jobID: job identifier
        :param jobParams: JDL parameters of the job
        :param resourceParams: description of the local CE
        :param optimizerParams: extra matcher parameters passed to the wrapper
        :param jobJDL: full JDL of the job (appears unused here -- kept for interface)
        :param proxyChain: proxy chain object whose dump is passed to the CE
        :return: S_OK/S_ERROR; on payload failure the S_OK carries a
                 "PayloadFailed" key with the payload exit code
    """
        result = self.__createJobWrapper(jobID, jobParams, resourceParams, optimizerParams)

        if not result["OK"]:
            return result

        wrapperData = result["Value"]
        wrapperFile = wrapperData["execFile"]
        self.__report(jobID, "Matched", "Submitted To CE")

        wrapperName = os.path.basename(wrapperFile)
        self.log.info("Submitting %s to %sCE" % (wrapperName, self.ceName))

        # Pass proxy to the CE
        proxy = proxyChain.dumpAllToString()
        if not proxy["OK"]:
            self.log.error(proxy)
            return S_ERROR("Payload Proxy Not Found")

        payloadProxy = proxy["Value"]
        # FIXME: how can we set the batchID before we submit, this makes no sense
        batchID = "dc%s" % (jobID)
        submission = self.computingElement.submitJob(wrapperFile, payloadProxy, wrapperData)

        ret = S_OK("Job submitted")

        if submission["OK"]:
            batchID = submission["Value"]
            self.log.info("Job %s submitted as %s" % (jobID, batchID))
            self.log.verbose("Set JobParameter: Local batch ID %s" % (batchID))
            self.__setJobParam(jobID, "LocalBatchID", str(batchID))
            if "PayloadFailed" in submission:
                # Propagate the payload exit code without overwriting the status
                ret["PayloadFailed"] = submission["PayloadFailed"]
                return ret
            # Throttle successive submissions
            time.sleep(self.jobSubmissionDelay)
        else:
            self.log.error("Job submission failed", jobID)
            self.__setJobParam(jobID, "ErrorMessage", "%s CE Submission Error" % (self.ceName))
            if "ReschedulePayload" in submission:
                # Give the job another chance on a different resource
                rescheduleFailedJob(jobID, submission["Message"], self.__report)
            else:
                if "Value" in submission:
                    self.log.error("Error in DIRAC JobWrapper:", "exit code = %s" % (str(submission["Value"])))
                # make sure the Job is declared Failed
                self.__report(jobID, "Failed", submission["Message"])
            return S_ERROR("%s CE Submission Error: %s" % (self.ceName, submission["Message"]))

        return ret

    #############################################################################
    def __createJobWrapper(self, jobID, jobParams, resourceParams, optimizerParams):
        """This method creates a job wrapper filled with the CE and Job parameters
       to executed the job.
    """
        arguments = {"Job": jobParams, "CE": resourceParams, "Optimizer": optimizerParams}
        self.log.verbose("Job arguments are: \n %s" % (arguments))

        workingDir = gConfig.getValue("/LocalSite/WorkingDirectory", self.siteRoot)
        if not os.path.exists("%s/job/Wrapper" % (workingDir)):
            try:
                os.makedirs("%s/job/Wrapper" % (workingDir))
            except Exception:
                self.log.exception()
                return S_ERROR("Could not create directory for wrapper script")

        jobWrapperFile = "%s/job/Wrapper/Wrapper_%s" % (workingDir, jobID)
        if os.path.exists(jobWrapperFile):
            self.log.verbose("Removing existing Job Wrapper for %s" % (jobID))
            os.remove(jobWrapperFile)
        fd = open(self.jobWrapperTemplate, "r")
        wrapperTemplate = fd.read()
        fd.close()

        dateStr = time.strftime("%Y-%m-%d", time.localtime(time.time()))
        timeStr = time.strftime("%H:%M", time.localtime(time.time()))
        date_time = "%s %s" % (dateStr, timeStr)
        signature = __RCSID__
        dPython = sys.executable

        systemConfig = ""
        if jobParams.has_key("SystemConfig"):
            systemConfig = jobParams["SystemConfig"]
            self.log.verbose("Job system configuration requirement is %s" % (systemConfig))
            if resourceParams.has_key("Root"):
                jobPython = "%s/%s/bin/python" % (resourceParams["Root"], systemConfig)
                if os.path.exists(jobPython):
                    self.log.verbose("Found local python for job:\n%s" % (jobPython))
                    dPython = jobPython
                else:
                    if systemConfig == "ANY":
                        self.log.verbose("Using standard available python %s for job" % (dPython))
                    else:
                        self.log.warn("Job requested python \n%s\n but this is not available locally" % (jobPython))
            else:
                self.log.warn("No LocalSite/Root defined")
        else:
            self.log.warn("Job has no system configuration requirement")

        if not systemConfig or systemConfig.lower() == "any":
            systemConfig = gConfig.getValue("/LocalSite/Architecture", "")
            if not systemConfig:
                self.log.warn("Could not establish SystemConfig")

        logLevel = self.defaultLogLevel
        if jobParams.has_key("LogLevel"):
            logLevel = jobParams["LogLevel"]
            self.log.info("Found Job LogLevel JDL parameter with value: %s" % (logLevel))
        else:
            self.log.info("Applying default LogLevel JDL parameter with value: %s" % (logLevel))

        realPythonPath = os.path.realpath(dPython)
        self.log.debug("Real python path after resolving links is:")
        self.log.debug(realPythonPath)
        dPython = realPythonPath

        siteRootPython = self.siteRoot
        self.log.debug("DIRACPython is:\n%s" % dPython)
        self.log.debug("SiteRootPythonDir is:\n%s" % siteRootPython)
        wrapperTemplate = wrapperTemplate.replace("@SIGNATURE@", str(signature))
        wrapperTemplate = wrapperTemplate.replace("@JOBID@", str(jobID))
        wrapperTemplate = wrapperTemplate.replace("@DATESTRING@", str(date_time))
        wrapperTemplate = wrapperTemplate.replace("@JOBARGS@", str(arguments))
        wrapperTemplate = wrapperTemplate.replace("@SITEPYTHON@", str(siteRootPython))
        wrapper = open(jobWrapperFile, "w")
        wrapper.write(wrapperTemplate)
        wrapper.close()
        jobExeFile = "%s/job/Wrapper/Job%s" % (workingDir, jobID)
        jobFileContents = """#!/bin/sh
%s %s -o LogLevel=%s -o /DIRAC/Security/UseServerCertificate=no
""" % (
            dPython,
            jobWrapperFile,
            logLevel,
        )
        jobFile = open(jobExeFile, "w")
        jobFile.write(jobFileContents)
        jobFile.close()

        data = {
            "execFile": jobExeFile,
            "wrapperTemplate": self.jobWrapperTemplate,
            "signature": signature,
            "wrapperFile": jobWrapperFile,
            "logLevel": logLevel,
            "jid": jobID,
            "datetime": date_time,
            "jobArgs": arguments,
            "sitePython": siteRootPython,
        }
        return S_OK(data)

    #############################################################################
    def __requestJob(self, ceDict):
        """Request a single job from the matcher service.

        :param ceDict: description of the local computing resource
        :return: S_OK/S_ERROR structure as returned by the matcher
        """
        try:
            matcher = RPCClient("WorkloadManagement/Matcher", timeout=600)
            result = matcher.requestJob(ceDict)
            return result
        except Exception as x:
            # Py3-compatible exception syntax (was "except Exception, x")
            self.log.exception(lException=x)
            return S_ERROR("Job request to matcher service failed with exception")
Ejemplo n.º 20
0
  def test_getTimeLeft( self ):
    """Exercise TimeLeft.getTimeLeft() with mocked batch commands.

    The first loop checks the LSF plugin (runCommand patched in the plugin
    module); the second checks the SGE plugin and the computed value.
    """
    for batch, retValue in [( 'LSF', LSF_ReturnValue )]:
      self.tl = importlib.import_module( "DIRAC.Core.Utilities.TimeLeft.TimeLeft" )
      rcMock = MagicMock()
      rcMock.return_value = S_OK( retValue )
      self.tl.runCommand = rcMock

      tl = TimeLeft()

      batchSystemName = '%sTimeLeft' % batch
      batchPlugin = __import__( 'DIRAC.Core.Utilities.TimeLeft.%s' %
                                batchSystemName, globals(), locals(), [batchSystemName] )
      # Instantiate the plugin class by name; getattr avoids eval()
      tl.batchPlugin = getattr( batchPlugin, batchSystemName )()

      tl.scaleFactor = 10.0
      tl.normFactor = 10.0
      tl.batchPlugin.bin = '/usr/bin'
      tl.batchPlugin.hostNorm = 10.0
      tl.batchPlugin.cpuLimit = 1000
      tl.batchPlugin.wallClockLimit = 1000

      with patch( "DIRAC.Core.Utilities.TimeLeft.LSFTimeLeft.runCommand", new=rcMock ):
        res = tl.getTimeLeft()
        self.assertEqual( res['OK'], True, res.get('Message', '') )

    for batch, retValue in [( 'SGE', SGE_ReturnValue )]:
      self.tl = importlib.import_module( "DIRAC.Core.Utilities.TimeLeft.TimeLeft" )
      rcMock = MagicMock()
      rcMock.return_value = S_OK( retValue )
      self.tl.runCommand = rcMock

      tl = TimeLeft()

      batchSystemName = '%sTimeLeft' % batch
      batchPlugin = __import__( 'DIRAC.Core.Utilities.TimeLeft.%s' %
                                batchSystemName, globals(), locals(), [batchSystemName] )
      tl.batchPlugin = getattr( batchPlugin, batchSystemName )()

      tl.scaleFactor = 10.0
      tl.normFactor = 10.0
      tl.batchPlugin.bin = '/usr/bin'
      tl.batchPlugin.hostNorm = 10.0
      tl.batchPlugin.cpuLimit = 1000
      tl.batchPlugin.wallClockLimit = 1000

      res = tl.getTimeLeft()
      # assertTrue replaces the deprecated assert_ alias
      self.assertTrue( res['OK'] )
      self.assertEqual( res['Value'], 9400.0 )
Ejemplo n.º 21
0
def getCPUTime( cpuNormalizationFactor ):
  """ Trying to get CPUTime left for execution (in seconds).

      It will first look to get the work left looking for batch system information using the TimeLeft utility.
      If it succeeds, it will convert it in real seconds, and return it.

      If it fails, it tries to get it from the static info found in CS.
      If it fails, it returns the default, which is a large 9999999, that we may consider as "Infinite".

      This is a generic method, independent from the middleware of the resource if TimeLeft doesn't return a value

      args:
        cpuNormalizationFactor (float): the CPU power of the current Worker Node. If not passed in, it is fetched from the local configuration

      returns:
        cpuTimeLeft (int): the CPU time left, in seconds
  """
  cpuTimeLeft = 0.
  cpuWorkLeft = gConfig.getValue( '/LocalSite/CPUTimeLeft', 0 )

  if not cpuWorkLeft:
    # Try and get the information from the CPU left utility
    result = TimeLeft().getTimeLeft()
    if result['OK']:
      cpuWorkLeft = result['Value']

  if cpuWorkLeft > 0:
    # This is in HS06sseconds
    # We need to convert in real seconds
    if not cpuNormalizationFactor:  # if cpuNormalizationFactor passed in is 0, try get it from the local cfg
      cpuNormalizationFactor = gConfig.getValue( '/LocalSite/CPUNormalizationFactor', 0.0 )
    if cpuNormalizationFactor:
      cpuTimeLeft = cpuWorkLeft / cpuNormalizationFactor  # this is a float

  if not cpuTimeLeft:
    # now we know that we have to find the CPUTimeLeft by looking in the CS
    # this is not granted to be correct as the CS units may not be real seconds
    gridCE = gConfig.getValue( '/LocalSite/GridCE' )
    ceQueue = gConfig.getValue( '/LocalSite/CEQueue' )
    if not ceQueue:
      # we have to look for a ceQueue in the CS
      # A bit hacky. We should better profit from something generic
      gLogger.warn( "No CEQueue in local configuration, looking to find one in CS" )
      siteName = gConfig.getValue( '/LocalSite/Site' )
      queueSection = '/Resources/Sites/%s/%s/CEs/%s/Queues' % ( siteName.split( '.' )[0], siteName, gridCE )
      res = gConfig.getSections( queueSection )
      if not res['OK']:
        raise RuntimeError( res['Message'] )
      queues = res['Value']
      # Be pessimistic: take the shortest queue limit among those defined
      cpuTimes = [gConfig.getValue( queueSection + '/' + queue + '/maxCPUTime', 9999999. ) for queue in queues]
      # These are (real, wall clock) minutes - damn BDII!
      cpuTimeLeft = min( cpuTimes ) * 60
    else:
      queueInfo = getQueueInfo( '%s/%s' % ( gridCE, ceQueue ) )
      cpuTimeLeft = 9999999.
      if not queueInfo['OK'] or not queueInfo['Value']:
        gLogger.warn( "Can't find a CE/queue, defaulting CPUTime to %d" % cpuTimeLeft )
      else:
        queueCSSection = queueInfo['Value']['QueueCSSection']
        # These are (real, wall clock) minutes - damn BDII!
        cpuTimeInMinutes = gConfig.getValue( '%s/maxCPUTime' % queueCSSection, 0. )
        if cpuTimeInMinutes:
          cpuTimeLeft = cpuTimeInMinutes * 60.
          gLogger.info( "CPUTime for %s: %f" % ( queueCSSection, cpuTimeLeft ) )
        else:
          gLogger.warn( "Can't find maxCPUTime for %s, defaulting CPUTime to %f" % ( queueCSSection, cpuTimeLeft ) )

  return int( cpuTimeLeft )
Ejemplo n.º 22
0
class JobAgent(AgentModule):
    """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration.

    Each cycle it checks that the local Computing Element has a free slot,
    requests a job from the Matcher, sets up the payload proxy, optionally
    installs software, and submits the job wrapper to the CE.
    """
    def __init__(self,
                 agentName,
                 loadName,
                 baseAgentName=False,
                 properties=None):
        """ Just defines some default parameters

        :param agentName: name of the agent
        :param loadName: name under which the agent instance is loaded
        :param baseAgentName: optional base agent name passed to AgentModule
        :param properties: optional dict of agent properties
        """
        if not properties:
            properties = {}
        super(JobAgent, self).__init__(agentName, loadName, baseAgentName,
                                       properties)

        self.ceName = 'InProcess'
        self.computingElement = None
        self.timeLeft = 0.0

        # Snapshot of process times at startup, used later to compute consumed CPU
        self.initTimes = os.times()
        # Localsite options
        self.siteName = 'Unknown'
        self.pilotReference = 'Unknown'
        self.defaultProxyLength = 86400 * 5
        # Agent options
        # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
        self.cpuFactor = 0.0
        self.jobSubmissionDelay = 10
        self.fillingMode = False
        self.minimumTimeLeft = 1000
        self.stopOnApplicationFailure = True
        self.stopAfterFailedMatches = 10
        self.jobCount = 0
        self.matchFailedCount = 0
        self.extraOptions = ''
        # Timeleft
        self.timeLeftUtil = None
        self.timeLeftError = ''
        self.pilotInfoReportedFlag = False

    #############################################################################
    def initialize(self, loops=0):
        """Sets default parameters and creates CE instance

        :param int loops: maximum number of agent cycles (0 = unlimited)
        :return: S_OK() on success, S_ERROR structure otherwise
        """
        # Disable monitoring, logLevel INFO, limited cycles
        self.am_setOption('MonitoringEnabled', False)
        self.am_setOption('MaxCycles', loops)

        ceType = self.am_getOption('CEType', self.ceName)
        # A locally configured CE overrides the agent option
        localCE = gConfig.getValue('/LocalSite/LocalCE', '')
        if localCE:
            self.log.info('Defining CE from local configuration',
                          '= %s' % localCE)
            ceType = localCE

        # Create backend Computing Element
        ceFactory = ComputingElementFactory()
        self.ceName = ceType
        ceInstance = ceFactory.getCE(ceType)
        if not ceInstance['OK']:
            self.log.warn("Can't instantiate a CE", ceInstance['Message'])
            return ceInstance
        self.computingElement = ceInstance['Value']

        result = self.computingElement.getDescription()
        if not result['OK']:
            self.log.warn("Can not get the CE description")
            return result
        # The description may come as a single dict or a list of dicts;
        # use the first entry for the CPUTime default
        if isinstance(result['Value'], list):
            ceDict = result['Value'][0]
        else:
            ceDict = result['Value']
        self.timeLeft = ceDict.get('CPUTime', self.timeLeft)
        self.timeLeft = gConfig.getValue(
            '/Resources/Computing/CEDefaults/MaxCPUTime', self.timeLeft)

        self.initTimes = os.times()
        # Localsite options
        self.siteName = gConfig.getValue('/LocalSite/Site', self.siteName)
        self.pilotReference = gConfig.getValue('/LocalSite/PilotReference',
                                               self.pilotReference)
        self.defaultProxyLength = gConfig.getValue(
            '/Registry/DefaultProxyLifeTime', self.defaultProxyLength)
        # Agent options
        # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
        self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor',
                                          self.cpuFactor)
        self.jobSubmissionDelay = self.am_getOption('SubmissionDelay',
                                                    self.jobSubmissionDelay)
        self.fillingMode = self.am_getOption('FillingModeFlag',
                                             self.fillingMode)
        self.minimumTimeLeft = self.am_getOption('MinimumTimeLeft',
                                                 self.minimumTimeLeft)
        self.stopOnApplicationFailure = self.am_getOption(
            'StopOnApplicationFailure', self.stopOnApplicationFailure)
        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)
        self.extraOptions = gConfig.getValue(
            '/AgentJobRequirements/ExtraOptions', self.extraOptions)
        # Timeleft
        self.timeLeftUtil = TimeLeft()
        return S_OK()

    #############################################################################
    def execute(self):
        """The JobAgent execution method.

        One cycle: optionally re-evaluate time left (filling mode), check CE
        availability, match a job, set up the proxy, install software and
        submit the job wrapper.
        """
        if self.jobCount:
            # Temporary mechanism to pass a shutdown message to the agent
            if os.path.exists('/var/lib/dirac_drain'):
                return self.__finish('Node is being drained by an operator')
            # Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(
                        "Disabling filling mode as errors calculating time left",
                        self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('normalized CPU units remaining in slot',
                              self.timeLeft)
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish('No more time left')
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join('.', self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc",
                                                   "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection('/LocalSite'):
                    localCfg.createNewSection('/LocalSite')
                localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        result = self.computingElement.available()
        if not result['OK']:
            self.log.info('Resource is not available', result['Message'])
            return self.__finish('CE Not Available')

        ceInfoDict = result['CEInfoDict']
        runningJobs = ceInfoDict.get("RunningJobs")
        availableSlots = result['Value']

        if not availableSlots:
            if runningJobs:
                self.log.info('No available slots',
                              '%d running jobs' % runningJobs)
                return S_OK('Job Agent cycle complete with %d running jobs' %
                            runningJobs)
            else:
                self.log.info('CE is not available')
                return self.__finish('CE Not Available')

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result

        # We can have several prioritized job retrieval strategies
        if isinstance(result['Value'], dict):
            ceDictList = [result['Value']]
        elif isinstance(result['Value'], list):
            ceDictList = result['Value']
        else:
            # Guard against an unexpected description type: previously this
            # left ceDictList unbound and raised NameError below
            return S_ERROR('Unexpected CE description type')

        # Initialized in case ceDictList is empty, so the check below fails cleanly
        jobRequest = S_ERROR('No CE description available for matching')
        for ceDict in ceDictList:

            # Add pilot information
            gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
            if gridCE != 'Unknown':
                ceDict['GridCE'] = gridCE
            if 'PilotReference' not in ceDict:
                ceDict['PilotReference'] = str(self.pilotReference)
            ceDict['PilotBenchmark'] = self.cpuFactor
            ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

            # Add possible job requirements
            result = gConfig.getOptionsDict('/AgentJobRequirements')
            if result['OK']:
                requirementsDict = result['Value']
                ceDict.update(requirementsDict)
                self.log.info('Requirements:', requirementsDict)

            self.log.verbose('CE dict', ceDict)
            start = time.time()
            jobRequest = MatcherClient().requestJob(ceDict)
            matchTime = time.time() - start
            self.log.info('MatcherTime', '= %.2f (s)' % (matchTime))
            if jobRequest['OK']:
                break

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:
            # Distinguish "no match" / timeout / version mismatch from other failures
            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK, but no match found',
                                ': %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error('Timeout while requesting job',
                               jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                errorMsg = 'Pilot version does not match the production version'
                self.log.error(errorMsg,
                               jobRequest['Message'].replace(errorMsg, ''))
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs',
                                ': %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get(
                'PilotInfoReportedFlag', False)
        jobID = matcherInfo['JobID']
        # The Matcher must return all of these, non-empty
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if param not in matcherInfo:
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned',
                                 '%s = %s ' % (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        # Everything else the Matcher returned is passed on to the optimizer
        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self._getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn('Could Not Extract JDL Parameters',
                          parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if 'JobID' not in params:
            msg = 'Job has not JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if 'JobType' not in params:
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if 'CPUTime' not in params:
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        # Job requirement for a number of processors
        processors = int(params.get('NumberOfProcessors', 1))
        wholeNode = 'WholeNode' in params

        if self.extraOptions:
            params['Arguments'] += ' ' + self.extraOptions
            params['ExtraOptions'] = self.extraOptions

        self.log.verbose('Job request successful: \n', jobRequest['Value'])
        self.log.info(
            'Received', 'JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s' %
            (jobID, jobType, ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)

            if 'BOINC_JOB_ID' in os.environ:
                # Report BOINC environment
                for thisp in ('BoincUserID', 'BoincHostID',
                              'BoincHostPlatform', 'BoincHostName'):
                    jobReport.setJobParameter(thisp,
                                              gConfig.getValue(
                                                  '/LocalSite/%s' % thisp,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            result = self._setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self._rescheduleFailedJob(jobID, result['Message'],
                                                 self.stopOnApplicationFailure)
            proxyChain = result.get('Value')

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self._checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job',
                               '%s' % (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self._rescheduleFailedJob(jobID, errorMsg,
                                                 self.stopOnApplicationFailure)

            self.log.debug('Before %sCE submitJob()' % (self.ceName))
            result = self._submitJob(jobID, params, ceDict, optimizerParams,
                                     proxyChain, processors, wholeNode)
            if not result['OK']:
                self.__report(jobID, 'Failed', result['Message'])
                return self.__finish(result['Message'])
            elif 'PayloadFailed' in result:
                # Do not keep running and do not overwrite the Payload error
                message = 'Payload execution failed with error code %s' % result[
                    'PayloadFailed']
                if self.stopOnApplicationFailure:
                    return self.__finish(message,
                                         self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug('After %sCE submitJob()' % (self.ceName))
        except Exception as subExcept:  # pylint: disable=broad-except
            self.log.exception("Exception in submission",
                               "",
                               lException=subExcept,
                               lExcInfo=True)
            return self._rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
        cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

        result = self.timeLeftUtil.getTimeLeft(cpuTime, processors)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            if result['Message'] != 'Current batch system is not supported':
                self.timeLeftError = result['Message']
            else:
                # if the batch system is not defined, use the process time and the CPU normalization defined locally
                self.timeLeft = self._getCPUTimeLeft()

        return S_OK('Job Agent cycle complete')

    #############################################################################
    def __saveJobJDLRequest(self, jobID, jobJDL):
        """Save job JDL local to JobAgent.

        :param jobID: job identifier (may be int or str), used as the file stem
        :param str jobJDL: JDL text of the job
        """
        classAdJob = ClassAd(jobJDL)
        classAdJob.insertAttributeString('LocalCE', self.ceName)
        # %s handles both int and str jobID; plain concatenation raised
        # TypeError for integer job IDs
        jdlFileName = '%s.jdl' % jobID
        # Context manager guarantees the file handle is closed on error
        with open(jdlFileName, 'w') as jdlFile:
            jdl = classAdJob.asJDL()
            jdlFile.write(jdl)

    #############################################################################
    def _getCPUTimeLeft(self):
        """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config.
        """
        cpuTime = sum(os.times()[:-1])
        self.log.info('Current raw CPU time consumed is %s' % cpuTime)
        timeleft = self.timeLeft
        if self.cpuFactor:
            # Convert raw CPU seconds to normalized units before subtracting
            timeleft -= cpuTime * self.cpuFactor
        return timeleft

    #############################################################################
    def _setupProxy(self, ownerDN, ownerGroup):
        """
    Retrieve a proxy for the execution of the job

    :param str ownerDN: DN of the job owner
    :param str ownerGroup: group of the job owner
    :return: S_OK(proxyChain) or S_ERROR
    """
        if gConfig.getValue('/DIRAC/Security/UseServerCertificate', False):
            proxyResult = self._requestProxyFromProxyManager(
                ownerDN, ownerGroup)
            if not proxyResult['OK']:
                self.log.error('Failed to setup proxy', proxyResult['Message'])
                return S_ERROR('Failed to setup proxy: %s' %
                               proxyResult['Message'])
            return S_OK(proxyResult['Value'])
        else:
            ret = getProxyInfo(disableVOMS=True)
            if not ret['OK']:
                self.log.error('Invalid Proxy', ret['Message'])
                return S_ERROR('Invalid Proxy')

            proxyChain = ret['Value']['chain']
            if 'groupProperties' not in ret['Value']:
                print(ret['Value'])
                print(proxyChain.dumpAllToString())
                self.log.error('Invalid Proxy',
                               'Group has no properties defined')
                return S_ERROR('Proxy has no group properties defined')

            groupProps = ret['Value']['groupProperties']
            # Pilot proxies cannot run the payload directly; fetch a payload proxy
            if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
                proxyResult = self._requestProxyFromProxyManager(
                    ownerDN, ownerGroup)
                if not proxyResult['OK']:
                    self.log.error('Invalid Proxy', proxyResult['Message'])
                    return S_ERROR('Failed to setup proxy: %s' %
                                   proxyResult['Message'])
                proxyChain = proxyResult['Value']

        return S_OK(proxyChain)

    #############################################################################
    def _requestProxyFromProxyManager(self, ownerDN, ownerGroup):
        """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.

        :param str ownerDN: DN of the job owner
        :param str ownerGroup: group of the job owner
        :return: S_OK(chain) or S_ERROR
    """
        # Fixed a corrupted log string that had quotes embedded mid-message
        self.log.info("Requesting proxy", "for %s@%s" % (ownerDN, ownerGroup))
        token = gConfig.getValue("/Security/ProxyToken", "")
        if not token:
            self.log.info(
                "No token defined. Trying to download proxy without token")
            token = False
        retVal = gProxyManager.getPayloadProxyFromDIRACGroup(
            ownerDN, ownerGroup, self.defaultProxyLength, token)
        if not retVal['OK']:
            self.log.error('Could not retrieve payload proxy',
                           retVal['Message'])
            # Dump local proxy info to the pilot log for debugging
            os.system('dirac-proxy-info')
            sys.stdout.flush()
            return S_ERROR('Error retrieving proxy')

        chain = retVal['Value']
        return S_OK(chain)

    #############################################################################
    def _checkInstallSoftware(self, jobID, jobParams, resourceParams):
        """Checks software requirement of job and whether this is already present
       before installing software locally.

        :param jobID: job identifier, used for status reporting
        :param dict jobParams: JDL parameters of the job
        :param dict resourceParams: CE description dict
        :return: result of the VO software-distribution module, or S_OK if none required
    """
        if 'SoftwareDistModule' not in jobParams:
            msg = 'Job has no software installation requirement'
            self.log.verbose(msg)
            return S_OK(msg)

        self.__report(jobID, 'Matched', 'Installing Software')
        softwareDist = jobParams['SoftwareDistModule']
        self.log.verbose('Found VO Software Distribution module',
                         ': %s' % (softwareDist))
        argumentsDict = {'Job': jobParams, 'CE': resourceParams}
        moduleFactory = ModuleFactory()
        moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict)
        if not moduleInstance['OK']:
            return moduleInstance

        module = moduleInstance['Value']
        return module.execute()

    #############################################################################
    def _submitJob(self,
                   jobID,
                   jobParams,
                   resourceParams,
                   optimizerParams,
                   proxyChain,
                   processors,
                   wholeNode=False):
        """ Submit job to the Computing Element instance after creating a custom
        Job Wrapper with the available job parameters.

        :param jobID: job identifier
        :param dict jobParams: JDL parameters of the job
        :param dict resourceParams: CE description dict
        :param dict optimizerParams: extra parameters returned by the Matcher
        :param proxyChain: proxy chain to pass to the CE
        :param int processors: number of processors requested by the job
        :param bool wholeNode: whether the job requested the whole node
        :return: S_OK('Job submitted') (possibly with 'PayloadFailed' key) or S_ERROR
    """
        logLevel = self.am_getOption('DefaultLogLevel', 'INFO')
        defaultWrapperLocation = self.am_getOption(
            'JobWrapperTemplate',
            'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py')
        jobDesc = {
            "jobID": jobID,
            "jobParams": jobParams,
            "resourceParams": resourceParams,
            "optimizerParams": optimizerParams,
            "extraOptions": self.extraOptions,
            "defaultWrapperLocation": defaultWrapperLocation
        }
        result = createJobWrapper(log=self.log, logLevel=logLevel, **jobDesc)
        if not result['OK']:
            return result

        wrapperFile = result['Value']
        self.__report(jobID, 'Matched', 'Submitted To CE')

        self.log.info(
            'Submitting JobWrapper',
            '%s to %sCE' % (os.path.basename(wrapperFile), self.ceName))

        # Pass proxy to the CE
        proxy = proxyChain.dumpAllToString()
        if not proxy['OK']:
            self.log.error('Invalid proxy', proxy)
            return S_ERROR('Payload Proxy Not Found')

        payloadProxy = proxy['Value']
        submission = self.computingElement.submitJob(
            wrapperFile,
            payloadProxy,
            numberOfProcessors=processors,
            wholeNode=wholeNode,
            jobDesc=jobDesc,
            log=self.log,
            logLevel=logLevel)
        ret = S_OK('Job submitted')

        if submission['OK']:
            batchID = submission['Value']
            self.log.info('Job submitted', '%s as %s' % (jobID, batchID))
            if 'PayloadFailed' in submission:
                # Propagate the payload error code without overwriting the status
                ret['PayloadFailed'] = submission['PayloadFailed']
                return ret
            # Throttle successive submissions
            time.sleep(self.jobSubmissionDelay)
        else:
            self.log.error('Job submission failed', jobID)
            self.__setJobParam(jobID, 'ErrorMessage',
                               '%s CE Submission Error' % (self.ceName))
            if 'ReschedulePayload' in submission:
                rescheduleFailedJob(jobID, submission['Message'])
                return S_OK()  # Without this, the job is marked as failed
            else:
                if 'Value' in submission:
                    self.log.error(
                        'Error in DIRAC JobWrapper:',
                        'exit code = %s' % (str(submission['Value'])))
            return S_ERROR('%s CE Error: %s' %
                           (self.ceName, submission['Message']))

        return ret

    #############################################################################
    def _getJDLParameters(self, jdl):
        """Returns a dictionary of JDL parameters.

        :param str jdl: JDL text, with or without enclosing brackets
        :return: S_OK(dict) of parameter name -> value (lists parsed from {...}),
                 S_ERROR on parsing exception
    """
        try:
            parameters = {}
            # ClassAd requires the enclosing brackets
            if not re.search(r'\[', jdl):
                jdl = '[' + jdl + ']'
            classAdJob = ClassAd(jdl)
            paramsDict = classAdJob.contents
            for param, value in paramsDict.items():
                if value.strip().startswith('{'):
                    self.log.debug('Found list type parameter %s' % (param))
                    rawValues = value.replace('{',
                                              '').replace('}', '').replace(
                                                  '"', '').split()
                    valueList = []
                    for val in rawValues:
                        # Strip trailing commas left over from the JDL list syntax
                        if re.search(',$', val):
                            valueList.append(val[:-1])
                        else:
                            valueList.append(val)
                    parameters[param] = valueList
                else:
                    parameters[param] = value.replace('"', '').replace(
                        '{', '"{').replace('}', '}"')
                    self.log.debug('Found standard parameter %s: %s' %
                                   (param, parameters[param]))
            return S_OK(parameters)
        except Exception as x:
            self.log.exception(lException=x)
            return S_ERROR('Exception while extracting JDL parameters for job')

    #############################################################################
    def __report(self, jobID, status, minorStatus):
        """Wraps around setJobStatus of state update client

        :param jobID: job identifier (converted to int for the client call)
        :param str status: major job status
        :param str minorStatus: minor job status
        :return: result structure of the client call
    """
        jobStatus = JobStateUpdateClient().setJobStatus(
            int(jobID), status, minorStatus, 'JobAgent@%s' % self.siteName)
        self.log.verbose(
            'Setting job status', 'setJobStatus(%s,%s,%s,%s)' %
            (jobID, status, minorStatus, 'JobAgent@%s' % self.siteName))
        if not jobStatus['OK']:
            self.log.warn('Issue setting the job status', jobStatus['Message'])

        return jobStatus

    #############################################################################
    def __setJobParam(self, jobID, name, value):
        """Wraps around setJobParameter of state update client

        :param jobID: job identifier (converted to int for the client call)
        :param name: parameter name (stringified)
        :param value: parameter value (stringified)
        :return: result structure of the client call
    """
        jobParam = JobStateUpdateClient().setJobParameter(
            int(jobID), str(name), str(value))
        self.log.verbose('Setting job parameter',
                         'setJobParameter(%s,%s,%s)' % (jobID, name, value))
        if not jobParam['OK']:
            self.log.warn('Issue setting the job parameter',
                          jobParam['Message'])

        return jobParam

    #############################################################################
    def __finish(self, message, stop=True):
        """Force the JobAgent to complete gracefully.

        :param str message: reason for finishing
        :param bool stop: if True stop the agent execution and return S_ERROR,
                          otherwise just return S_OK(message)
    """
        if stop:
            self.log.info('JobAgent will stop',
                          'with message "%s", execution complete.' % message)
            self.am_stopExecution()
            return S_ERROR(message)
        else:
            return S_OK(message)

    #############################################################################
    def _rescheduleFailedJob(self, jobID, message, stop=True):
        """
    Set Job Status to "Rescheduled" and issue a reschedule command to the Job Manager

    :param jobID: job identifier
    :param str message: failure cause reported with the status
    :param bool stop: passed through to __finish
    """

        self.log.warn('Failure ==> rescheduling', '(during %s)' % (message))

        jobReport = JobReport(int(jobID), 'JobAgent@%s' % self.siteName)

        # Setting a job parameter does not help since the job will be rescheduled,
        # instead set the status with the cause and then another status showing the
        # reschedule operation.

        jobReport.setJobStatus(status='Rescheduled',
                               application=message,
                               sendFlag=True)

        self.log.info('Job will be rescheduled')
        result = JobManagerClient().rescheduleJob(jobID)
        if not result['OK']:
            self.log.error('Failed to reschedule job', result['Message'])
            return self.__finish('Problem Rescheduling Job', stop)

        self.log.info('Job Rescheduled', jobID)
        return self.__finish('Job Rescheduled', stop)

    #############################################################################
    def finalize(self):
        """ Job Agent finalization method

        Reports the final pilot status to the Pilot Manager.
    """

        gridCE = gConfig.getValue('/LocalSite/GridCE', '')
        queue = gConfig.getValue('/LocalSite/CEQueue', '')
        result = PilotManagerClient().setPilotStatus(str(self.pilotReference),
                                                     'Done', gridCE,
                                                     'Report from JobAgent',
                                                     self.siteName, queue)
        if not result['OK']:
            self.log.warn('Issue setting the pilot status', result['Message'])

        return S_OK()
Ejemplo n.º 23
0
class JobAgent(AgentModule):
  """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration.
  """

  def __init__(self, agentName, loadName, baseAgentName=False, properties=None):
    """ Just defines some default parameters
    """
    if not properties:
      properties = {}
    super(JobAgent, self).__init__(agentName, loadName, baseAgentName, properties)

    self.ceName = 'InProcess'
    self.computingElement = None
    self.timeLeft = 0.0

    self.initTimes = os.times()
    # Localsite options
    self.siteName = 'Unknown'
    self.pilotReference = 'Unknown'
    self.defaultProxyLength = 86400 * 5
    # Agent options
    # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
    self.cpuFactor = 0.0
    self.jobSubmissionDelay = 10
    self.fillingMode = False
    self.minimumTimeLeft = 1000
    self.stopOnApplicationFailure = True
    self.stopAfterFailedMatches = 10
    self.jobCount = 0
    self.matchFailedCount = 0
    self.extraOptions = ''
    # Timeleft
    self.timeLeftUtil = None
    self.timeLeftError = ''
    self.pilotInfoReportedFlag = False

  #############################################################################
  def initialize(self, loops=0):
    """Sets default parameters and creates CE instance

    :param loops: number of agent cycles to run (0 means unlimited)
    """
    # Disable monitoring, logLevel INFO, limited cycles
    self.am_setOption('MonitoringEnabled', False)
    self.am_setOption('MaxCycles', loops)

    ceType = self.am_getOption('CEType', 'InProcess')
    localCE = gConfig.getValue('/LocalSite/LocalCE', '')
    if localCE:
      self.log.info('Defining CE from local configuration = %s' % localCE)
      ceType = localCE

    # Create backend Computing Element
    ceFactory = ComputingElementFactory()
    self.ceName = ceType
    ceInstance = ceFactory.getCE(ceType)
    if not ceInstance['OK']:
      self.log.warn(ceInstance['Message'])
      return ceInstance
    self.computingElement = ceInstance['Value']

    result = self.computingElement.getDescription()
    if not result['OK']:
      self.log.warn("Can not get the CE description")
      return result
    ceDict = result['Value']
    self.timeLeft = ceDict.get('CPUTime', self.timeLeft)
    self.timeLeft = gConfig.getValue('/Resources/Computing/CEDefaults/MaxCPUTime', self.timeLeft)

    self.initTimes = os.times()
    # Localsite options
    self.siteName = gConfig.getValue('/LocalSite/Site', self.siteName)
    self.pilotReference = gConfig.getValue('/LocalSite/PilotReference', self.pilotReference)
    self.defaultProxyLength = gConfig.getValue('/Registry/DefaultProxyLifeTime', self.defaultProxyLength)
    # Agent options
    # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
    self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor', self.cpuFactor)
    self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', self.jobSubmissionDelay)
    self.fillingMode = self.am_getOption('FillingModeFlag', self.fillingMode)
    self.minimumTimeLeft = self.am_getOption('MinimumTimeLeft', self.minimumTimeLeft)
    self.stopOnApplicationFailure = self.am_getOption('StopOnApplicationFailure', self.stopOnApplicationFailure)
    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', self.stopAfterFailedMatches)
    self.extraOptions = gConfig.getValue('/AgentJobRequirements/ExtraOptions', self.extraOptions)
    # Timeleft
    self.timeLeftUtil = TimeLeft()
    return S_OK()

  #############################################################################
  def execute(self):
    """The JobAgent execution method: match a job, set up its proxy and software,
       and submit it to the local Computing Element.
    """
    if self.jobCount:
      # Temporary mechanism to pass a shutdown message to the agent
      if os.path.exists('/var/lib/dirac_drain'):
        return self.__finish('Node is being drained by an operator')
      # Only call timeLeft utility after a job has been picked up
      self.log.info('Attempting to check CPU time left for filling mode')
      if self.fillingMode:
        if self.timeLeftError:
          self.log.warn(self.timeLeftError)
          return self.__finish(self.timeLeftError)
        self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft))
        if self.timeLeft <= self.minimumTimeLeft:
          return self.__finish('No more time left')
        # Need to update the Configuration so that the new value is published in the next matching request
        result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
        if not result['OK']:
          return self.__finish(result['Message'])

        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
          localConfigFile = os.path.join('.', self.extraOptions)
        else:
          localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
        localCfg.loadFromFile(localConfigFile)
        if not localCfg.isSection('/LocalSite'):
          localCfg.createNewSection('/LocalSite')
        localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
        localCfg.writeToFile(localConfigFile)

      else:
        return self.__finish('Filling Mode is Disabled')

    self.log.verbose('Job Agent execution loop')
    result = self.computingElement.available()
    if not result['OK']:
      self.log.info('Resource is not available')
      self.log.info(result['Message'])
      return self.__finish('CE Not Available')

    self.log.info(result['Message'])

    ceInfoDict = result['CEInfoDict']
    runningJobs = ceInfoDict.get("RunningJobs")
    availableSlots = result['Value']

    if not availableSlots:
      if runningJobs:
        self.log.info('No available slots with %d running jobs' % runningJobs)
        return S_OK('Job Agent cycle complete with %d running jobs' % runningJobs)
      else:
        self.log.info('CE is not available')
        return self.__finish('CE Not Available')

    result = self.computingElement.getDescription()
    if not result['OK']:
      return result
    ceDict = result['Value']
    # Add pilot information
    gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
    if gridCE != 'Unknown':
      ceDict['GridCE'] = gridCE
    if 'PilotReference' not in ceDict:
      ceDict['PilotReference'] = str(self.pilotReference)
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict('/AgentJobRequirements')
    if result['OK']:
      requirementsDict = result['Value']
      ceDict.update(requirementsDict)
      self.log.info('Requirements:', requirementsDict)

    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = MatcherClient().requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info('MatcherTime = %.2f (s)' % (matchTime))

    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', self.stopAfterFailedMatches)

    if not jobRequest['OK']:
      if re.search('No match found', jobRequest['Message']):
        self.log.notice('Job request OK: %s' % (jobRequest['Message']))
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(jobRequest['Message'])
      elif jobRequest['Message'].find("seconds timeout") != -1:
        self.log.error('Timeout while requesting job', jobRequest['Message'])
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(jobRequest['Message'])
      elif jobRequest['Message'].find("Pilot version does not match") != -1:
        errorMsg = 'Pilot version does not match the production version'
        self.log.error(errorMsg, jobRequest['Message'].replace(errorMsg, ''))
        return S_ERROR(jobRequest['Message'])
      else:
        self.log.notice('Failed to get jobs: %s' % (jobRequest['Message']))
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(jobRequest['Message'])

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    if not self.pilotInfoReportedFlag:
      # Check the flag after the first access to the Matcher
      self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    jobID = matcherInfo['JobID']
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
      if param not in matcherInfo:
        self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
        return self.__finish('Matcher Failed')
      elif not matcherInfo[param]:
        self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
        return self.__finish('Matcher Failed')
      else:
        self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    optimizerParams = {}
    for key in matcherInfo:
      if key not in matcherParams:
        optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters(jobJDL)
    if not parameters['OK']:
      self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
      self.log.warn(parameters['Message'])
      return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
      msg = 'Job has not JobID defined in JDL parameters'
      self.__report(jobID, 'Failed', msg)
      self.log.warn(msg)
      return self.__finish('JDL Problem')
    else:
      jobID = params['JobID']

    if 'JobType' not in params:
      self.log.warn('Job has no JobType defined in JDL parameters')
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if 'CPUTime' not in params:
      self.log.warn('Job has no CPU requirement defined in JDL parameters')

    # Job requirement for a number of processors
    processors = int(params.get('NumberOfProcessors', 1))
    wholeNode = 'WholeNode' in params

    if self.extraOptions:
      # 'Arguments' may be absent from the JDL: a blind "+=" would raise KeyError
      if 'Arguments' in params:
        params['Arguments'] += ' ' + self.extraOptions
      else:
        params['Arguments'] = self.extraOptions
      params['ExtraOptions'] = self.extraOptions

    self.log.verbose('Job request successful: \n', jobRequest['Value'])
    self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
    self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
      jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
      jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)

      if 'BOINC_JOB_ID' in os.environ:
        # Report BOINC environment
        for thisp in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'):
          jobReport.setJobParameter(thisp, gConfig.getValue('/LocalSite/%s' % thisp, 'Unknown'), sendFlag=False)

      jobReport.setJobStatus('Matched', 'Job Received by Agent')
      result = self.__setupProxy(ownerDN, jobGroup)
      if not result['OK']:
        return self.__rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure)
      proxyChain = result.get('Value')

      # Save the job jdl for external monitoring
      self.__saveJobJDLRequest(jobID, jobJDL)

      software = self.__checkInstallSoftware(jobID, params, ceDict)
      if not software['OK']:
        self.log.error('Failed to install software for job', '%s' % (jobID))
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

      self.log.debug('Before %sCE submitJob()' % (self.ceName))
      result = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain, processors, wholeNode)
      if not result['OK']:
        self.__report(jobID, 'Failed', result['Message'])
        return self.__finish(result['Message'])
      elif 'PayloadFailed' in result:
        # Do not keep running and do not overwrite the Payload error
        message = 'Payload execution failed with error code %s' % result['PayloadFailed']
        if self.stopOnApplicationFailure:
          return self.__finish(message, self.stopOnApplicationFailure)
        else:
          self.log.info(message)

      self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception as subExcept:  # pylint: disable=broad-except
      self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True)
      return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception', self.stopOnApplicationFailure)

    # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
    cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

    result = self.timeLeftUtil.getTimeLeft(cpuTime, processors)
    if result['OK']:
      self.timeLeft = result['Value']
    else:
      if result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
      else:
        # if the batch system is not defined, use the process time and the CPU normalization defined locally
        self.timeLeft = self.__getCPUTimeLeft()

    return S_OK('Job Agent cycle complete')

  #############################################################################
  def __saveJobJDLRequest(self, jobID, jobJDL):
    """Save job JDL local to JobAgent.

    :param jobID: job identifier, used as the file name stem
    :param jobJDL: JDL string describing the job
    """
    classAdJob = ClassAd(jobJDL)
    classAdJob.insertAttributeString('LocalCE', self.ceName)
    # str() guards against an integer jobID; the context manager guarantees
    # the file handle is closed even if the write fails
    jdlFileName = str(jobID) + '.jdl'
    with open(jdlFileName, 'w') as jdlFile:
      jdlFile.write(classAdJob.asJDL())

  #############################################################################
  def __getCPUTimeLeft(self):
    """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config.
    """
    cpuTime = sum(os.times()[:-1])
    self.log.info('Current raw CPU time consumed is %s' % cpuTime)
    timeleft = self.timeLeft
    if self.cpuFactor:
      timeleft -= cpuTime * self.cpuFactor
    return timeleft

  #############################################################################
  def __setupProxy(self, ownerDN, ownerGroup):
    """
    Retrieve a proxy for the execution of the job
    """
    if gConfig.getValue('/DIRAC/Security/UseServerCertificate', False):
      proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup)
      if not proxyResult['OK']:
        self.log.error('Failed to setup proxy', proxyResult['Message'])
        return S_ERROR('Failed to setup proxy: %s' % proxyResult['Message'])
      return S_OK(proxyResult['Value'])
    else:
      ret = getProxyInfo(disableVOMS=True)
      if not ret['OK']:
        self.log.error('Invalid Proxy', ret['Message'])
        return S_ERROR('Invalid Proxy')

      proxyChain = ret['Value']['chain']
      if 'groupProperties' not in ret['Value']:
        # Diagnostic output goes through the logger
        # (was a pair of py2-only "print" statements, a py3 syntax error)
        self.log.debug(str(ret['Value']))
        self.log.debug(str(proxyChain.dumpAllToString()))
        self.log.error('Invalid Proxy', 'Group has no properties defined')
        return S_ERROR('Proxy has no group properties defined')

      groupProps = ret['Value']['groupProperties']
      if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
        proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup)
        if not proxyResult['OK']:
          self.log.error('Invalid Proxy', proxyResult['Message'])
          return S_ERROR('Failed to setup proxy: %s' % proxyResult['Message'])
        proxyChain = proxyResult['Value']

    return S_OK(proxyChain)

  #############################################################################
  def __requestProxyFromProxyManager(self, ownerDN, ownerGroup):
    """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.
    """

    self.log.info("Requesting proxy for %s@%s" % (ownerDN, ownerGroup))
    token = gConfig.getValue("/Security/ProxyToken", "")
    if not token:
      self.log.info("No token defined. Trying to download proxy without token")
      token = False
    retVal = gProxyManager.getPayloadProxyFromDIRACGroup(ownerDN, ownerGroup,
                                                         self.defaultProxyLength, token)
    if not retVal['OK']:
      self.log.error('Could not retrieve payload proxy', retVal['Message'])
      self.log.warn(retVal)
      os.system('dirac-proxy-info')
      sys.stdout.flush()
      return S_ERROR('Error retrieving proxy')

    chain = retVal['Value']
    return S_OK(chain)

  #############################################################################
  def __checkInstallSoftware(self, jobID, jobParams, resourceParams):
    """Checks software requirement of job and whether this is already present
       before installing software locally.
    """
    if 'SoftwareDistModule' not in jobParams:
      msg = 'Job has no software installation requirement'
      self.log.verbose(msg)
      return S_OK(msg)

    self.__report(jobID, 'Matched', 'Installing Software')
    softwareDist = jobParams['SoftwareDistModule']
    self.log.verbose('Found VO Software Distribution module: %s' % (softwareDist))
    argumentsDict = {'Job': jobParams, 'CE': resourceParams}
    moduleFactory = ModuleFactory()
    moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict)
    if not moduleInstance['OK']:
      return moduleInstance

    module = moduleInstance['Value']
    return module.execute()

  #############################################################################
  def __submitJob(self, jobID, jobParams, resourceParams, optimizerParams,
                  proxyChain, processors, wholeNode=False):
    """ Submit job to the Computing Element instance after creating a custom
        Job Wrapper with the available job parameters.
    """
    logLevel = self.am_getOption('DefaultLogLevel', 'INFO')
    defaultWrapperLocation = self.am_getOption('JobWrapperTemplate',
                                               'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py')
    jobDesc = {"jobID": jobID,
               "jobParams": jobParams,
               "resourceParams": resourceParams,
               "optimizerParams": optimizerParams,
               "extraOptions": self.extraOptions,
               "defaultWrapperLocation": defaultWrapperLocation}
    result = createJobWrapper(log=self.log, logLevel=logLevel, **jobDesc)
    if not result['OK']:
      return result

    wrapperFile = result['Value']
    self.__report(jobID, 'Matched', 'Submitted To CE')

    self.log.info('Submitting JobWrapper %s to %sCE' % (os.path.basename(wrapperFile), self.ceName))

    # Pass proxy to the CE
    proxy = proxyChain.dumpAllToString()
    if not proxy['OK']:
      self.log.error('Invalid proxy', proxy)
      return S_ERROR('Payload Proxy Not Found')

    payloadProxy = proxy['Value']
    submission = self.computingElement.submitJob(wrapperFile, payloadProxy,
                                                 numberOfProcessors=processors,
                                                 wholeNode=wholeNode,
                                                 jobDesc=jobDesc,
                                                 log=self.log,
                                                 logLevel=logLevel)
    ret = S_OK('Job submitted')

    if submission['OK']:
      batchID = submission['Value']
      self.log.info('Job %s submitted as %s' % (jobID, batchID))
      if 'PayloadFailed' in submission:
        ret['PayloadFailed'] = submission['PayloadFailed']
        return ret
      time.sleep(self.jobSubmissionDelay)
    else:
      self.log.error('Job submission failed', jobID)
      self.__setJobParam(jobID, 'ErrorMessage', '%s CE Submission Error' % (self.ceName))
      if 'ReschedulePayload' in submission:
        rescheduleFailedJob(jobID, submission['Message'])
        return S_OK()  # Without this, the job is marked as failed
      else:
        if 'Value' in submission:
          self.log.error('Error in DIRAC JobWrapper:', 'exit code = %s' % (str(submission['Value'])))
      return S_ERROR('%s CE Error: %s' % (self.ceName, submission['Message']))

    return ret

  #############################################################################
  def __getJDLParameters(self, jdl):
    """Returns a dictionary of JDL parameters.

    List-valued attributes ({a, b, c}) become Python lists; everything else
    is kept as a (quote-stripped) string.
    """
    try:
      parameters = {}
      if not re.search(r'\[', jdl):
        jdl = '[' + jdl + ']'
      classAdJob = ClassAd(jdl)
      paramsDict = classAdJob.contents
      for param, value in paramsDict.items():
        if value.strip().startswith('{'):
          self.log.debug('Found list type parameter %s' % (param))
          rawValues = value.replace('{', '').replace('}', '').replace('"', '').split()
          valueList = []
          for val in rawValues:
            if re.search(',$', val):
              valueList.append(val[:-1])
            else:
              valueList.append(val)
          parameters[param] = valueList
        else:
          parameters[param] = value.replace('"', '').replace('{', '"{').replace('}', '}"')
          self.log.debug('Found standard parameter %s: %s' % (param, parameters[param]))
      return S_OK(parameters)
    except Exception as x:
      self.log.exception(lException=x)
      return S_ERROR('Exception while extracting JDL parameters for job')

  #############################################################################
  def __report(self, jobID, status, minorStatus):
    """Wraps around setJobStatus of state update client
    """
    jobStatus = JobStateUpdateClient().setJobStatus(int(jobID), status, minorStatus, 'JobAgent@%s' % self.siteName)
    self.log.verbose('setJobStatus(%s,%s,%s,%s)' % (jobID, status, minorStatus, 'JobAgent@%s' % self.siteName))
    if not jobStatus['OK']:
      self.log.warn(jobStatus['Message'])

    return jobStatus

  #############################################################################
  def __setJobParam(self, jobID, name, value):
    """Wraps around setJobParameter of state update client
    """
    jobParam = JobStateUpdateClient().setJobParameter(int(jobID), str(name), str(value))
    self.log.verbose('setJobParameter(%s,%s,%s)' % (jobID, name, value))
    if not jobParam['OK']:
      self.log.warn(jobParam['Message'])

    return jobParam

  #############################################################################
  def __finish(self, message, stop=True):
    """Force the JobAgent to complete gracefully.
    """
    if stop:
      self.log.info('JobAgent will stop with message "%s", execution complete.' % message)
      self.am_stopExecution()
      return S_ERROR(message)
    else:
      return S_OK(message)

  #############################################################################
  def __rescheduleFailedJob(self, jobID, message, stop=True):
    """
    Set Job Status to "Rescheduled" and issue a reschedule command to the Job Manager
    """

    self.log.warn('Failure during %s' % (message))

    jobReport = JobReport(int(jobID), 'JobAgent@%s' % self.siteName)

    # Setting a job parameter does not help since the job will be rescheduled,
    # instead set the status with the cause and then another status showing the
    # reschedule operation.

    jobReport.setJobStatus(status='Rescheduled',
                           application=message,
                           sendFlag=True)

    self.log.info('Job will be rescheduled')
    result = JobManagerClient().rescheduleJob(jobID)
    if not result['OK']:
      self.log.error('Failed to reschedule job', result['Message'])
      return self.__finish('Problem Rescheduling Job', stop)

    self.log.info('Job Rescheduled %s' % (jobID))
    return self.__finish('Job Rescheduled', stop)

  #############################################################################
  def finalize(self):
    """ Job Agent finalization method
    """

    gridCE = gConfig.getValue('/LocalSite/GridCE', '')
    queue = gConfig.getValue('/LocalSite/CEQueue', '')
    wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator')
    result = wmsAdmin.setPilotStatus(str(self.pilotReference), 'Done', gridCE,
                                     'Report from JobAgent', self.siteName, queue)
    if not result['OK']:
      self.log.warn(result['Message'])

    return S_OK()
Ejemplo n.º 24
0
class JobAgent(AgentModule):
    """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration.
  """

    #############################################################################
    def initialize(self, loops=0):
        """Set default parameters and create the backend Computing Element instance.

        :param int loops: maximum number of agent cycles (0 means unlimited)
        """
        # Self-monitoring of the agent is not wanted on the worker node
        self.am_setOption("MonitoringEnabled", False)
        self.am_setOption("MaxCycles", loops)

        # The CE type may be overridden by the local pilot configuration
        ceType = self.am_getOption("CEType", "InProcess")
        localCE = gConfig.getValue("/LocalSite/LocalCE", "")
        if localCE:
            self.log.info("Defining CE from local configuration = %s" % localCE)
            ceType = localCE

        # Build the backend Computing Element that will run the payloads
        self.ceName = ceType
        ceResult = ComputingElementFactory().getCE(ceType)
        if not ceResult["OK"]:
            self.log.warn(ceResult["Message"])
            return ceResult
        self.computingElement = ceResult["Value"]

        descResult = self.computingElement.getDescription()
        if not descResult["OK"]:
            self.log.warn("Can not get the CE description")
            return descResult
        ceDescription = descResult["Value"]
        # Start from the CE-declared CPU time, allow a global override
        self.timeLeft = ceDescription.get("CPUTime", 0.0)
        self.timeLeft = gConfig.getValue("/Resources/Computing/CEDefaults/MaxCPUTime", self.timeLeft)

        self.initTimes = os.times()
        # Local site options
        self.siteName = gConfig.getValue("/LocalSite/Site", "Unknown")
        self.pilotReference = gConfig.getValue("/LocalSite/PilotReference", "Unknown")
        self.defaultProxyLength = gConfig.getValue("/Registry/DefaultProxyLifeTime", 86400 * 5)
        # Agent options
        # Factor converting raw CPU time to normalized units (based on the CPU model)
        self.cpuFactor = gConfig.getValue("/LocalSite/CPUNormalizationFactor", 0.0)
        self.jobSubmissionDelay = self.am_getOption("SubmissionDelay", 10)
        self.fillingMode = self.am_getOption("FillingModeFlag", False)
        self.minimumTimeLeft = self.am_getOption("MinimumTimeLeft", 1000)
        self.stopOnApplicationFailure = self.am_getOption("StopOnApplicationFailure", True)
        self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", 10)
        self.jobCount = 0
        self.matchFailedCount = 0
        self.extraOptions = gConfig.getValue("/AgentJobRequirements/ExtraOptions", "")
        # Time-left bookkeeping used by execute()
        self.timeLeftUtil = TimeLeft()
        self.timeLeftError = ""
        self.scaledCPUTime = 0.0
        self.pilotInfoReportedFlag = False
        return S_OK()

    #############################################################################
    def execute(self):
        """The JobAgent execution method: match, set up, and submit one job per cycle.

        A cycle does the following:

        1. In filling mode (after the first job), checks/publishes the CPU time
           left in the slot and stops the agent when it drops below the minimum.
        2. Checks the Computing Element can accept a payload.
        3. Requests a single job from the Matcher using the CE description.
        4. Validates the Matcher reply, extracts the JDL parameters, sets up the
           owner proxy, optionally installs software, and submits the job
           wrapper to the CE.
        5. Updates the time-left bookkeeping for the next cycle.

        :return: S_OK / S_ERROR controlling the agent loop (via __finish)
        """
        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info("Attempting to check CPU time left for filling mode")
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info("%s normalized CPU units remaining in slot" % (self.timeLeft))
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish("No more time left")
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
                if not result["OK"]:
                    return self.__finish(result["Message"])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join(".", self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection("/LocalSite"):
                    localCfg.createNewSection("/LocalSite")
                localCfg.setOption("/LocalSite/CPUTimeLeft", self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                # Single-job pilots stop after the first payload
                return self.__finish("Filling Mode is Disabled")

        self.log.verbose("Job Agent execution loop")
        # Make sure the CE has a free slot before asking the Matcher for work
        available = self.computingElement.available()
        if not available["OK"] or not available["Value"]:
            self.log.info("Resource is not available")
            self.log.info(available["Message"])
            return self.__finish("CE Not Available")

        self.log.info(available["Message"])

        result = self.computingElement.getDescription()
        if not result["OK"]:
            return result
        ceDict = result["Value"]

        # Add pilot information to the matching request
        gridCE = gConfig.getValue("LocalSite/GridCE", "Unknown")
        if gridCE != "Unknown":
            ceDict["GridCE"] = gridCE
        if not "PilotReference" in ceDict:
            ceDict["PilotReference"] = str(self.pilotReference)
        ceDict["PilotBenchmark"] = self.cpuFactor
        ceDict["PilotInfoReportedFlag"] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict("/AgentJobRequirements")
        if result["OK"]:
            requirementsDict = result["Value"]
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info("MatcherTime = %.2f (s)" % (matchTime))

        # Re-read the option so it can be changed while the agent is running
        self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", self.stopAfterFailedMatches)

        if not jobRequest["OK"]:
            # "No match" and timeouts are expected: count them and keep cycling
            # until stopAfterFailedMatches consecutive failures is exceeded
            if re.search("No match found", jobRequest["Message"]):
                self.log.notice("Job request OK: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("seconds timeout") != -1:
                self.log.error("Timeout while requesting job", jobRequest["Message"])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("Pilot version does not match") != -1:
                # A version mismatch cannot be recovered by retrying
                errorMsg = "Pilot version does not match the production version"
                self.log.error(errorMsg, jobRequest["Message"].replace(errorMsg, ""))
                return S_ERROR(jobRequest["Message"])
            else:
                self.log.notice("Failed to get jobs: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])

        # Reset the failed-match counter after a successful match
        self.matchFailedCount = 0

        matcherInfo = jobRequest["Value"]
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get("PilotInfoReportedFlag", False)
        jobID = matcherInfo["JobID"]
        # These keys must be present and non-empty in the Matcher reply
        matcherParams = ["JDL", "DN", "Group"]
        for param in matcherParams:
            if param not in matcherInfo:
                self.__report(jobID, "Failed", "Matcher did not return %s" % (param))
                return self.__finish("Matcher Failed")
            elif not matcherInfo[param]:
                self.__report(jobID, "Failed", "Matcher returned null %s" % (param))
                return self.__finish("Matcher Failed")
            else:
                self.log.verbose("Matcher returned %s = %s " % (param, matcherInfo[param]))

        jobJDL = matcherInfo["JDL"]
        jobGroup = matcherInfo["Group"]
        ownerDN = matcherInfo["DN"]

        # Everything else in the Matcher reply is passed on to the job wrapper
        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters["OK"]:
            self.__report(jobID, "Failed", "Could Not Extract JDL Parameters")
            self.log.warn(parameters["Message"])
            return self.__finish("JDL Problem")

        params = parameters["Value"]
        if "JobID" not in params:
            msg = "Job has not JobID defined in JDL parameters"
            self.__report(jobID, "Failed", msg)
            self.log.warn(msg)
            return self.__finish("JDL Problem")
        else:
            # Prefer the JobID carried by the JDL over the one in the Matcher reply
            jobID = params["JobID"]

        if "JobType" not in params:
            self.log.warn("Job has no JobType defined in JDL parameters")
            jobType = "Unknown"
        else:
            jobType = params["JobType"]

        if "CPUTime" not in params:
            self.log.warn("Job has no CPU requirement defined in JDL parameters")

        if self.extraOptions:
            # NOTE(review): assumes "Arguments" is always present in the JDL
            # parameters when extraOptions is set; a missing key would raise
            # KeyError here — confirm against the JDL produced by the WMS.
            params["Arguments"] += " " + self.extraOptions
            params["ExtraOptions"] = self.extraOptions

        self.log.verbose("Job request successful: \n", jobRequest["Value"])
        self.log.info("Received JobID=%s, JobType=%s" % (jobID, jobType))
        self.log.info("OwnerDN: %s JobGroup: %s" % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
            jobReport.setJobParameter("MatcherServiceTime", str(matchTime), sendFlag=False)

            if "BOINC_JOB_ID" in os.environ:
                # Report BOINC environment
                for p in ("BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"):
                    jobReport.setJobParameter(p, gConfig.getValue("/LocalSite/%s" % p, "Unknown"), sendFlag=False)

            jobReport.setJobStatus("Matched", "Job Received by Agent")
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result["OK"]:
                return self.__rescheduleFailedJob(jobID, result["Message"], self.stopOnApplicationFailure)
            proxyChain = result.get("Value")

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software["OK"]:
                self.log.error("Failed to install software for job", "%s" % (jobID))
                errorMsg = software["Message"]
                if not errorMsg:
                    errorMsg = "Failed software installation"
                return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.debug("Before %sCE submitJob()" % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain)
            if not submission["OK"]:
                self.__report(jobID, "Failed", submission["Message"])
                return self.__finish(submission["Message"])
            elif "PayloadFailed" in submission:
                # Do not keep running and do not overwrite the Payload error
                message = "Payload execution failed with error code %s" % submission["PayloadFailed"]
                if self.stopOnApplicationFailure:
                    return self.__finish(message, self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug("After %sCE submitJob()" % (self.ceName))
        except Exception:
            # Any unexpected failure while handling the job: reschedule it
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, "Job processing failed with exception", self.stopOnApplicationFailure
            )

        # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
        cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result["OK"]:
            self.timeLeft = result["Value"]
        else:
            if result["Message"] != "Current batch system is not supported":
                self.timeLeftError = result["Message"]
            else:
                # if the batch system is not defined, use the process time and the CPU normalization defined locally
                self.timeLeft = self.__getCPUTimeLeft()

        # Report the scaled CPU consumed since the previous cycle
        scaledCPUTime = self.timeLeftUtil.getScaledCPU()
        self.__setJobParam(jobID, "ScaledCPUTime", str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK("Job Agent cycle complete")

    #############################################################################
    def __saveJobJDLRequest(self, jobID, jobJDL):
        """Save the job JDL to a local file for external monitoring.

        The JDL is annotated with the local CE name and written to
        "<jobID>.jdl" in the current working directory.

        :param jobID: job identifier (int or str); used as the file basename
        :param str jobJDL: the job JDL as returned by the Matcher
        """
        classAdJob = ClassAd(jobJDL)
        classAdJob.insertAttributeString("LocalCE", self.ceName)
        # str() so an integer jobID does not break the concatenation
        jdlFileName = str(jobID) + ".jdl"
        # Context manager guarantees the file is closed even if write() raises
        with open(jdlFileName, "w") as jdlFile:
            jdlFile.write(classAdJob.asJDL())

    #############################################################################
    def __getCPUTimeLeft(self):
        """Estimate the time left using the CPU normalization factor from the local config.

        Fallback used when the batch system is not supported by the TimeLeft
        utility.
        """
        # All os.times() fields except the last (elapsed wall clock) are CPU time
        consumedCPU = sum(os.times()[:-1])
        self.log.info("Current raw CPU time consumed is %s" % consumedCPU)
        if not self.cpuFactor:
            # No normalization factor configured: report the slot value unchanged
            return self.timeLeft
        return self.timeLeft - consumedCPU * self.cpuFactor

    #############################################################################
    def __changeProxy(self, oldProxy, newProxy):
        """Log a proxy changeover (placeholder: a glexec utility call could set the uid here).

        :param oldProxy: proxy being replaced (currently unused)
        :param newProxy: replacement proxy (currently unused)
        :return: S_OK() always
        """
        self.log.verbose("Log proxy change (to be instrumented)")
        return S_OK()

    #############################################################################
    def __setupProxy(self, ownerDN, ownerGroup):
        """Retrieve a proxy for the execution of the job.

        When the agent runs with a server certificate, the payload proxy is
        always downloaded from the ProxyManager. Otherwise the local proxy is
        used, unless it is a (generic) pilot proxy, in which case the owner's
        payload proxy is downloaded as well.

        :param str ownerDN: DN of the job owner
        :param str ownerGroup: DIRAC group of the job owner
        :return: S_OK(proxyChain) or S_ERROR
        """
        if gConfig.getValue("/DIRAC/Security/UseServerCertificate", False):
            proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup)
            if not proxyResult["OK"]:
                self.log.error("Failed to setup proxy", proxyResult["Message"])
                return S_ERROR("Failed to setup proxy: %s" % proxyResult["Message"])
            return S_OK(proxyResult["Value"])

        ret = getProxyInfo(disableVOMS=True)
        if not ret["OK"]:
            self.log.error("Invalid Proxy", ret["Message"])
            return S_ERROR("Invalid Proxy")

        proxyChain = ret["Value"]["chain"]
        if "groupProperties" not in ret["Value"]:
            # Dump diagnostic details through the logger (was: bare debug prints)
            self.log.debug(str(ret["Value"]))
            self.log.debug(str(proxyChain.dumpAllToString()))
            self.log.error("Invalid Proxy", "Group has no properties defined")
            return S_ERROR("Proxy has no group properties defined")

        groupProps = ret["Value"]["groupProperties"]
        # A (generic) pilot proxy cannot run the payload directly: fetch the
        # payload proxy of the actual owner from the ProxyManager instead
        if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
            proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup)
            if not proxyResult["OK"]:
                self.log.error("Invalid Proxy", proxyResult["Message"])
                return S_ERROR("Failed to setup proxy: %s" % proxyResult["Message"])
            proxyChain = proxyResult["Value"]

        return S_OK(proxyChain)

    #############################################################################
    def __requestProxyFromProxyManager(self, ownerDN, ownerGroup):
        """Download the payload proxy with the correct role from the ProxyManager.

        :param str ownerDN: DN of the job owner
        :param str ownerGroup: DIRAC group the proxy must belong to
        :return: S_OK(chain) or S_ERROR
        """
        self.log.info("Requesting proxy for %s@%s" % (ownerDN, ownerGroup))
        token = gConfig.getValue("/Security/ProxyToken", "")
        if not token:
            self.log.info("No token defined. Trying to download proxy without token")
            token = False
        retVal = gProxyManager.getPayloadProxyFromDIRACGroup(ownerDN, ownerGroup, self.defaultProxyLength, token)
        if retVal["OK"]:
            return S_OK(retVal["Value"])
        # Download failed: log details and dump the local proxy for diagnostics
        self.log.error("Could not retrieve payload proxy", retVal["Message"])
        self.log.warn(retVal)
        os.system("dirac-proxy-info")
        sys.stdout.flush()
        return S_ERROR("Error retrieving proxy")

    #############################################################################
    def __checkInstallSoftware(self, jobID, jobParams, resourceParams):
        """Run the VO software-distribution module if the job requests one.

        :param jobID: job identifier (used only for status reporting)
        :param dict jobParams: JDL parameters of the job
        :param dict resourceParams: description of the local CE
        :return: result of the installation module, or S_OK when no
                 installation is required
        """
        if "SoftwareDistModule" not in jobParams:
            msg = "Job has no software installation requirement"
            self.log.verbose(msg)
            return S_OK(msg)

        self.__report(jobID, "Matched", "Installing Software")
        softwareDist = jobParams["SoftwareDistModule"]
        self.log.verbose("Found VO Software Distribution module: %s" % (softwareDist))
        moduleInstance = ModuleFactory().getModule(softwareDist, {"Job": jobParams, "CE": resourceParams})
        if not moduleInstance["OK"]:
            return moduleInstance

        return moduleInstance["Value"].execute()

    #############################################################################
    def __submitJob(self, jobID, jobParams, resourceParams, optimizerParams, proxyChain):
        """Create a custom Job Wrapper for the matched job and submit it to the CE.

        :param jobID: identifier of the job being submitted
        :param dict jobParams: JDL parameters of the job
        :param dict resourceParams: description of the local CE
        :param dict optimizerParams: extra parameters returned by the Matcher
        :param proxyChain: proxy chain of the job owner, dumped and passed to the CE
        :return: S_OK("Job submitted") (optionally carrying "PayloadFailed"),
                 bare S_OK() when the payload is rescheduled, or S_ERROR
        """
        logLevel = self.am_getOption("DefaultLogLevel", "INFO")
        defaultWrapperLocation = self.am_getOption(
            "JobWrapperTemplate", "DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py"
        )
        result = createJobWrapper(
            jobID,
            jobParams,
            resourceParams,
            optimizerParams,
            extraOptions=self.extraOptions,
            defaultWrapperLocation=defaultWrapperLocation,
            log=self.log,
            logLevel=logLevel,
        )
        if not result["OK"]:
            return result

        wrapperFile = result["Value"]
        self.__report(jobID, "Matched", "Submitted To CE")

        self.log.info("Submitting %s to %sCE" % (os.path.basename(wrapperFile), self.ceName))

        # Pass proxy to the CE
        proxy = proxyChain.dumpAllToString()
        if not proxy["OK"]:
            self.log.error("Invalid proxy", proxy)
            return S_ERROR("Payload Proxy Not Found")

        payloadProxy = proxy["Value"]
        # FIXME: how can we set the batchID before we submit, this makes no sense
        batchID = "dc%s" % (jobID)
        submission = self.computingElement.submitJob(wrapperFile, payloadProxy)

        ret = S_OK("Job submitted")

        if submission["OK"]:
            batchID = submission["Value"]
            self.log.info("Job %s submitted as %s" % (jobID, batchID))
            self.log.verbose("Set JobParameter: Local batch ID %s" % (batchID))
            self.__setJobParam(jobID, "LocalBatchID", str(batchID))
            if "PayloadFailed" in submission:
                # Propagate the payload failure so execute() can decide whether to stop
                ret["PayloadFailed"] = submission["PayloadFailed"]
                return ret
            # Delay before the next submission cycle
            time.sleep(self.jobSubmissionDelay)
        else:
            self.log.error("Job submission failed", jobID)
            self.__setJobParam(jobID, "ErrorMessage", "%s CE Submission Error" % (self.ceName))
            if "ReschedulePayload" in submission:
                # NOTE(review): bare rescheduleFailedJob — presumably a
                # module-level helper imported elsewhere in this file; confirm
                # it is not meant to be self.__rescheduleFailedJob.
                rescheduleFailedJob(jobID, submission["Message"])
                return S_OK()  # S_OK so the caller does not also mark the job Failed
            else:
                if "Value" in submission:
                    self.log.error("Error in DIRAC JobWrapper:", "exit code = %s" % (str(submission["Value"])))
                # make sure the Job is declared Failed
                self.__report(jobID, "Failed", submission["Message"])
            return S_ERROR("%s CE Submission Error: %s" % (self.ceName, submission["Message"]))

        return ret

    #############################################################################
    def __requestJob(self, ceDict):
        """Ask the Matcher service for a single job matching the CE description.

        :param dict ceDict: capability/requirement description of this CE
        :return: the Matcher reply, or S_ERROR on any exception
        """
        try:
            return RPCClient("WorkloadManagement/Matcher", timeout=600).requestJob(ceDict)
        except Exception as x:
            self.log.exception(lException=x)
            return S_ERROR("Job request to matcher service failed with exception")

    #############################################################################
    def __getJDLParameters(self, jdl):
        """Parse a JDL string into a dictionary of parameters.

        List-valued attributes ("{ ... }") become Python lists with the
        trailing comma stripped from each item; scalar values have the
        surrounding quotes removed (braces inside scalars are re-quoted so
        nested expressions survive).

        :param str jdl: the job JDL, with or without the enclosing brackets
        :return: S_OK(dict) or S_ERROR on any parsing exception
        """
        try:
            parameters = {}
            # ClassAd requires the outer brackets; add them if missing
            # (raw string avoids the invalid "\[" escape sequence)
            if not re.search(r"\[", jdl):
                jdl = "[" + jdl + "]"
            classAdJob = ClassAd(jdl)
            paramsDict = classAdJob.contents
            for param, value in paramsDict.items():
                if value.strip().startswith("{"):
                    self.log.debug("Found list type parameter %s" % (param))
                    rawValues = value.replace("{", "").replace("}", "").replace('"', "").split()
                    valueList = []
                    for val in rawValues:
                        # Drop the comma separating the list items
                        if val.endswith(","):
                            valueList.append(val[:-1])
                        else:
                            valueList.append(val)
                    parameters[param] = valueList
                else:
                    parameters[param] = value.replace('"', "").replace("{", '"{').replace("}", '}"')
                    self.log.debug("Found standard parameter %s: %s" % (param, parameters[param]))
            return S_OK(parameters)
        # "as" syntax instead of the legacy comma form, consistent with
        # __requestJob above and forward-compatible
        except Exception as x:
            self.log.exception(lException=x)
            return S_ERROR("Exception while extracting JDL parameters for job")