Beispiel #1
0
  def test_getScaledCPU( self ):
    tl = TimeLeft()
    res = tl.getScaledCPU()
    self.assertEqual( res, 0 )

    tl.scaleFactor = 5.0
    tl.normFactor = 5.0

    for batch, retValue in [( 'LSF', LSF_ReturnValue )]:
      self.tl = importlib.import_module( "DIRAC.Core.Utilities.TimeLeft.TimeLeft" )
      rcMock = MagicMock()
      rcMock.return_value = S_OK( retValue )
      self.tl.runCommand = rcMock

      batchSystemName = '%sTimeLeft' % batch
      batchPlugin = __import__( 'DIRAC.Core.Utilities.TimeLeft.%s' %
                                batchSystemName, globals(), locals(), [batchSystemName] )
      batchStr = 'batchPlugin.%s()' % ( batchSystemName )
      tl.batchPlugin = eval( batchStr )
      res = tl.getScaledCPU()
      self.assertEqual( res, 0.0 )

    for batch, retValue in [( 'SGE', SGE_ReturnValue )]:
      self.tl = importlib.import_module( "DIRAC.Core.Utilities.TimeLeft.TimeLeft" )
      rcMock = MagicMock()
      rcMock.return_value = S_OK( retValue )
      self.tl.runCommand = rcMock

      batchSystemName = '%sTimeLeft' % batch
      batchPlugin = __import__( 'DIRAC.Core.Utilities.TimeLeft.%s' %
                                batchSystemName, globals(), locals(), [batchSystemName] )
      batchStr = 'batchPlugin.%s()' % ( batchSystemName )
      tl.batchPlugin = eval( batchStr )
      res = tl.getScaledCPU()
      self.assertEqual( res, 300.0 )
Beispiel #2
0
  def test_getScaledCPU( self ):
    tl = TimeLeft()
    res = tl.getScaledCPU()
    self.assertEqual( res, 0 )

    tl.scaleFactor = 5.0
    tl.normFactor = 5.0

    for batch, retValue in [( 'LSF', LSF_ReturnValue )]:
      self.tl = importlib.import_module( "DIRAC.Core.Utilities.TimeLeft.TimeLeft" )
      rcMock = MagicMock()
      rcMock.return_value = S_OK( retValue )
      self.tl.runCommand = rcMock

      batchSystemName = '%sTimeLeft' % batch
      batchPlugin = __import__( 'DIRAC.Core.Utilities.TimeLeft.%s' % #pylint: disable=unused-variable
                                batchSystemName, globals(), locals(), [batchSystemName] )
      batchStr = 'batchPlugin.%s()' % ( batchSystemName )
      tl.batchPlugin = eval( batchStr )
      res = tl.getScaledCPU()
      self.assertEqual( res, 0.0 )

    for batch, retValue in [( 'SGE', SGE_ReturnValue )]:
      self.tl = importlib.import_module( "DIRAC.Core.Utilities.TimeLeft.TimeLeft" )
      rcMock = MagicMock()
      rcMock.return_value = S_OK( retValue )
      self.tl.runCommand = rcMock

      batchSystemName = '%sTimeLeft' % batch
      batchPlugin = __import__( 'DIRAC.Core.Utilities.TimeLeft.%s' % #pylint: disable=unused-variable
                                batchSystemName, globals(), locals(), [batchSystemName] )
      batchStr = 'batchPlugin.%s()' % ( batchSystemName )
      tl.batchPlugin = eval( batchStr )
      res = tl.getScaledCPU()
      self.assertEqual( res, 300.0 )

      for batch, retValue in [( 'MJF', MJF_ReturnValue )]:
        self.tl = importlib.import_module( "DIRAC.Core.Utilities.TimeLeft.TimeLeft" )
        rcMock = MagicMock()
        rcMock.return_value = S_OK( retValue )
        self.tl.runCommand = rcMock

        batchSystemName = '%sTimeLeft' % batch
        batchPlugin = __import__( 'DIRAC.Core.Utilities.TimeLeft.%s' % #pylint: disable=unused-variable
                                  batchSystemName, globals(), locals(), [batchSystemName] )
        batchStr = 'batchPlugin.%s()' % ( batchSystemName )
        tl.batchPlugin = eval( batchStr )
        res = tl.getScaledCPU()
        self.assertEqual( res, 0.0 )
Beispiel #3
0
class JobAgent( AgentModule ):
  """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

  #############################################################################
  def initialize( self, loops = 0 ):
    """Sets default parameters and creates CE instance
    """
    #Disable monitoring
    self.am_setOption( 'MonitoringEnabled', False )
   # self.log.setLevel('debug') #temporary for debugging
    self.am_setOption( 'MaxCycles', loops )

    ceUniqueID = self.am_getOption( 'CEUniqueID', 'InProcess' )
    localCE = gConfig.getValue( '/LocalSite/LocalCE', '' )
    if localCE:
      self.log.info( 'Defining CE from local configuration = %s' % localCE )
      ceUniqueID = localCE

    ceFactory = ComputingElementFactory()
    self.ceName = ceUniqueID
    ceInstance = ceFactory.getCE( ceUniqueID )
    if not ceInstance['OK']:
      self.log.warn( ceInstance['Message'] )
      return ceInstance

    self.computingElement = ceInstance['Value']
    self.diracRoot = os.path.dirname( os.path.dirname( os.path.dirname( os.path.dirname( __file__ ) ) ) )
    #Localsite options
    self.siteRoot = gConfig.getValue( '/LocalSite/Root', os.getcwd() )
    self.siteName = gConfig.getValue( '/LocalSite/Site', 'Unknown' )
    self.pilotReference = gConfig.getValue( '/LocalSite/PilotReference', 'Unknown' )
    self.defaultProxyLength = gConfig.getValue( '/Registry/DefaultProxyLifeTime', 86400 * 5 )
    #Agent options
    # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
    self.cpuFactor = gConfig.getValue( '/LocalSite/CPUNormalizationFactor', 0.0 )
    defaultWrapperLocation = 'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py'
    self.jobWrapperTemplate = os.path.join( self.diracRoot,
                                            self.am_getOption( 'JobWrapperTemplate',
                                                                defaultWrapperLocation ) )
    self.jobSubmissionDelay = self.am_getOption( 'SubmissionDelay', 10 )
    self.defaultLogLevel = self.am_getOption( 'DefaultLogLevel', 'info' )
    self.fillingMode = self.am_getOption( 'FillingModeFlag', False )
    self.jobCount = 0
    #Timeleft
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = gConfig.getValue( '/Resources/Computing/CEDefaults/MaxCPUTime', 0.0 )
    self.gridCEQueue = gConfig.getValue( '/Resources/Computing/CEDefaults/GridCEQueue', '' )
    self.timeLeftError = ''
    self.scaledCPUTime = 0.0
    return S_OK()

  #############################################################################
  def execute( self ):
    """The JobAgent execution method.
    """
    if self.jobCount:
      #Only call timeLeft utility after a job has been picked up
      self.log.info( 'Attempting to check CPU time left for filling mode' )
      if self.fillingMode:
        if self.timeLeftError:
          self.log.warn( self.timeLeftError )
          return self.__finish( self.timeLeftError )
        self.log.info( '%s normalized CPU units remaining in slot' % ( self.timeLeft ) )
        # Need to update the Configuration so that the new value is published in the next matching request
        result = self.computingElement.setCPUTimeLeft( cpuTimeLeft = self.timeLeft )
        if not result['OK']:
          return self.__finish( result['Message'] )
        ceJDL = self.computingElement.getJDL()
        resourceJDL = ceJDL['Value']
      else:
        return self.__finish( 'Filling Mode is Disabled' )

    self.log.verbose( 'Job Agent execution loop' )
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
      self.log.info( 'Resource is not available' )
      self.log.info( available['Message'] )
      return self.__finish( 'CE Not Available' )

    self.log.info( available['Message'] )

    ceJDL = self.computingElement.getJDL()
    resourceJDL = ceJDL['Value']
    self.log.verbose( resourceJDL )
    start = time.time()
    jobRequest = self.__requestJob( resourceJDL )
    matchTime = time.time() - start
    self.log.info( 'MatcherTime = %.2f (s)' % ( matchTime ) )

    if not jobRequest['OK']:
      if re.search( 'No work available', jobRequest['Message'] ):
        self.log.info( 'Job request OK: %s' % ( jobRequest['Message'] ) )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "seconds timeout" ) != -1:
        self.log.error( jobRequest['Message'] )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "Pilot version does not match" ) != -1 :
        self.log.error( jobRequest['Message'] )
        return S_ERROR( jobRequest['Message'] )
      else:
        self.log.info( 'Failed to get jobs: %s' % ( jobRequest['Message'] ) )
        return S_OK( jobRequest['Message'] )

    matcherInfo = jobRequest['Value']
    jobID = matcherInfo['JobID']
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
      if not matcherInfo.has_key( param ):
        self.__report( jobID, 'Failed', 'Matcher did not return %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      elif not matcherInfo[param]:
        self.__report( jobID, 'Failed', 'Matcher returned null %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      else:
        self.log.verbose( 'Matcher returned %s = %s ' % ( param, matcherInfo[param] ) )

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    optimizerParams = {}
    for key in matcherInfo.keys():
      if not key in matcherParams:
        value = matcherInfo[key]
        optimizerParams[key] = value

    parameters = self.__getJDLParameters( jobJDL )
    if not parameters['OK']:
      self.__report( jobID, 'Failed', 'Could Not Extract JDL Parameters' )
      self.log.warn( parameters['Message'] )
      return self.__finish( 'JDL Problem' )

    params = parameters['Value']
    if not params.has_key( 'JobID' ):
      msg = 'Job has not JobID defined in JDL parameters'
      self.__report( jobID, 'Failed', msg )
      self.log.warn( msg )
      return self.__finish( 'JDL Problem' )
    else:
      jobID = params['JobID']

    if not params.has_key( 'JobType' ):
      self.log.warn( 'Job has no JobType defined in JDL parameters' )
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if not params.has_key( 'SystemConfig' ):
      self.log.warn( 'Job has no system configuration defined in JDL parameters' )
      systemConfig = gConfig.getValue( '/LocalSite/Architecture', '' )
      self.log.info( 'Setting system config to /LocalSite/Architecture = %s since it was not specified' % systemConfig )
      if not systemConfig:
        self.log.warn( '/LocalSite/Architecture is not defined' )
      params['SystemConfig'] = systemConfig
    else:
      systemConfig = params['SystemConfig']
      if systemConfig.lower() == 'any':
        systemConfig = gConfig.getValue( '/LocalSite/Architecture', '' )
        self.log.info( 'Setting SystemConfig = /LocalSite/Architecture =',
                       '"%s" since it was set to "ANY" in the job description' % systemConfig )
        if not systemConfig:
          self.log.warn( '/LocalSite/Architecture is not defined' )
        params['SystemConfig'] = systemConfig

    if not params.has_key( 'MaxCPUTime' ):
      self.log.warn( 'Job has no CPU requirement defined in JDL parameters' )

    self.log.verbose( 'Job request successful: \n %s' % ( jobRequest['Value'] ) )
    self.log.info( 'Received JobID=%s, JobType=%s, SystemConfig=%s' % ( jobID, jobType, systemConfig ) )
    self.log.info( 'OwnerDN: %s JobGroup: %s' % ( ownerDN, jobGroup ) )
    self.jobCount += 1
    try:
      self.__setJobParam( jobID, 'MatcherServiceTime', str( matchTime ) )
      if self.gridCEQueue:
        self.__setJobParam( jobID, 'GridCEQueue', self.gridCEQueue )
      self.__report( jobID, 'Matched', 'Job Received by Agent' )
      self.__setJobSite( jobID, self.siteName )
      self.__reportPilotInfo( jobID )
      result = self.__setupProxy( ownerDN, jobGroup )
      if not result[ 'OK' ]:
        return self.__rescheduleFailedJob( jobID, result[ 'Message' ] )
      if 'Value' in result and result[ 'Value' ]:
        proxyChain = result[ 'Value' ]

      # Is this necessary at all?
      saveJDL = self.__saveJobJDLRequest( jobID, jobJDL )
      #self.__report(jobID,'Matched','Job Prepared to Submit')

      resourceParameters = self.__getJDLParameters( resourceJDL )
      if not resourceParameters['OK']:
        return resourceParameters
      resourceParams = resourceParameters['Value']

      software = self.__checkInstallSoftware( jobID, params, resourceParams )
      if not software['OK']:
        self.log.error( 'Failed to install software for job %s' % ( jobID ) )
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self.__rescheduleFailedJob( jobID, errorMsg )

      self.log.verbose( 'Before %sCE submitJob()' % ( self.ceName ) )
      submission = self.__submitJob( jobID, params, resourceParams, optimizerParams, jobJDL, proxyChain )
      if not submission['OK']:
        self.__report( jobID, 'Failed', submission['Message'] )
        return self.__finish( submission['Message'] )
      elif 'PayloadFailed' in submission:
        # Do not keep running and do not overwrite the Payload error
        return self.__finish( 'Payload execution failed with error code %s' % submission['PayloadFailed'] )

      self.log.verbose( 'After %sCE submitJob()' % ( self.ceName ) )
    except Exception:
      self.log.exception()
      return self.__rescheduleFailedJob( jobID , 'Job processing failed with exception' )

    result = self.timeLeftUtil.getTimeLeft( 0.0 )
    if result['OK']:
      self.timeLeft = result['Value']
    else:
      if result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
      else:
        if self.cpuFactor:
          # if the batch system is not defined used the CPUNormalizationFactor 
          # defined locally
          self.timeLeft = self.__getCPUTimeLeft()
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

    self.__setJobParam( jobID, 'ScaledCPUTime', str( scaledCPUTime - self.scaledCPUTime ) )
    self.scaledCPUTime = scaledCPUTime

    return S_OK( 'Job Agent cycle complete' )

  #############################################################################
  def __getCPUTimeLeft( self ):
    """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config.
    """
    utime, stime, cutime, cstime, elapsed = os.times()
    cpuTime = utime + stime + cutime
    self.log.info( 'Current raw CPU time consumed is %s' % cpuTime )
    timeleft = self.timeLeft - cpuTime * self.cpuFactor
    return timeleft

  #############################################################################
  def __changeProxy( self, oldProxy, newProxy ):
    """Can call glexec utility here to set uid or simply log the changeover
       of a proxy.
    """
    self.log.verbose( 'Log proxy change (to be instrumented)' )
    return S_OK()

  #############################################################################
  def __setupProxy( self, ownerDN, ownerGroup ):
    """
    Retrieve a proxy for the execution of the job
    """
    if gConfig.getValue( '/DIRAC/Security/UseServerCertificate' , False ):
      proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup )
      if not proxyResult['OK']:
        self.log.error( 'Invalid Proxy', proxyResult['Message'] )
        return S_ERROR( 'Failed to setup proxy: %s' % proxyResult[ 'Message' ] )
      return S_OK( proxyResult['Value'] )
    else:
      ret = getProxyInfo( disableVOMS = True )
      if not ret['OK']:
        self.log.error( 'Invalid Proxy', ret['Message'] )
        return S_ERROR( 'Invalid Proxy' )

      proxyChain = ret['Value']['chain']
      if not 'groupProperties' in ret['Value']:
        print ret['Value']
        print proxyChain.dumpAllToString()
        self.log.error( 'Invalid Proxy', 'Group has no properties defined' )
        return S_ERROR( 'Proxy has no group properties defined' )

      if Properties.GENERIC_PILOT in ret['Value']['groupProperties']:
        proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup )
        if not proxyResult['OK']:
          self.log.error( 'Invalid Proxy', proxyResult['Message'] )
          return S_ERROR( 'Failed to setup proxy: %s' % proxyResult[ 'Message' ] )
        proxyChain = proxyResult['Value']

    return S_OK( proxyChain )

  #############################################################################
  def __requestProxyFromProxyManager( self, ownerDN, ownerGroup ):
    """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.
    """

    self.log.info( "Requesting proxy for %s@%s" % ( ownerDN, ownerGroup ) )
    token = gConfig.getValue( "/Security/ProxyToken", "" )
    if not token:
      self.log.info( "No token defined. Trying to download proxy without token" )
      token = False
    retVal = gProxyManager.getPayloadProxyFromDIRACGroup( ownerDN, ownerGroup,
                                                          self.defaultProxyLength, token )
    if not retVal[ 'OK' ]:
      self.log.error( 'Could not retrieve proxy' )
      self.log.warn( retVal )
      os.system( 'dirac-proxy-info' )
      sys.stdout.flush()
      return S_ERROR( 'Error retrieving proxy' )

    chain = retVal[ 'Value' ]
    return S_OK( chain )

  #############################################################################
  def __checkInstallSoftware( self, jobID, jobParams, resourceParams ):
    """Checks software requirement of job and whether this is already present
       before installing software locally.
    """
    if not jobParams.has_key( 'SoftwareDistModule' ):
      msg = 'Job has no software installation requirement'
      self.log.verbose( msg )
      return S_OK( msg )

    self.__report( jobID, 'Matched', 'Installing Software' )
    softwareDist = jobParams['SoftwareDistModule']
    #HACK: Delete when svn repo is in production!
    softwareDist = softwareDist.replace( "DIRAC.LHCbSystem.", "LHCbDIRAC.Core." )
    #END OF HACK
    self.log.verbose( 'Found VO Software Distribution module: %s' % ( softwareDist ) )
    argumentsDict = {'Job':jobParams, 'CE':resourceParams}
    moduleFactory = ModuleFactory()
    moduleInstance = moduleFactory.getModule( softwareDist, argumentsDict )
    if not moduleInstance['OK']:
      return moduleInstance

    module = moduleInstance['Value']
    result = module.execute()
    return result

  #############################################################################
  def __submitJob( self, jobID, jobParams, resourceParams, optimizerParams, jobJDL, proxyChain ):
    """Submit job to the Computing Element instance after creating a custom
       Job Wrapper with the available job parameters.
    """
    result = self.__createJobWrapper( jobID, jobParams, resourceParams, optimizerParams )

    if not result['OK']:
      return result

    wrapperFile = result['Value']
    self.__report( jobID, 'Matched', 'Submitted To CE' )

    wrapperName = os.path.basename( wrapperFile )
    self.log.info( 'Submitting %s to %sCE' % ( wrapperName, self.ceName ) )

    #Pass proxy to the CE
    proxy = proxyChain.dumpAllToString()
    if not proxy['OK']:
      self.log.error( proxy )
      return S_ERROR( 'Payload Proxy Not Found' )

    payloadProxy = proxy['Value']
    # FIXME: how can we set the batchID before we submit, this makes no sense
    batchID = 'dc%s' % ( jobID )
    submission = self.computingElement.submitJob( wrapperFile, payloadProxy )

    ret = S_OK( 'Job submitted' )

    if submission['OK']:
      batchID = submission['Value']
      self.log.info( 'Job %s submitted as %s' % ( jobID, batchID ) )
      self.log.verbose( 'Set JobParameter: Local batch ID %s' % ( batchID ) )
      self.__setJobParam( jobID, 'LocalBatchID', str( batchID ) )
      if 'PayloadFailed' in submission:
        ret['PayloadFailed'] = submission['PayloadFailed']
        return ret
      time.sleep( self.jobSubmissionDelay )
    else:
      self.log.error( 'Job submission failed', jobID )
      self.__setJobParam( jobID, 'ErrorMessage', '%s CE Submission Error' % ( self.ceName ) )
      if 'ReschedulePayload' in submission:
        rescheduleFailedJob( jobID, submission['Message'], self.__report )
      return S_ERROR( '%s CE Submission Error: %s' % ( self.ceName, submission['Message'] ) )

    return ret

  #############################################################################
  def __createJobWrapper( self, jobID, jobParams, resourceParams, optimizerParams ):
    """This method creates a job wrapper filled with the CE and Job parameters
       to executed the job.
    """
    arguments = {'Job':jobParams,
                 'CE':resourceParams,
                 'Optimizer':optimizerParams}
    self.log.verbose( 'Job arguments are: \n %s' % ( arguments ) )

    workingDir = gConfig.getValue( '/LocalSite/WorkingDirectory', self.siteRoot )
    if not os.path.exists( '%s/job/Wrapper' % ( workingDir ) ):
      try:
        os.makedirs( '%s/job/Wrapper' % ( workingDir ) )
      except Exception:
        self.log.exception()
        return S_ERROR( 'Could not create directory for wrapper script' )

    jobWrapperFile = '%s/job/Wrapper/Wrapper_%s' % ( workingDir, jobID )
    if os.path.exists( jobWrapperFile ):
      self.log.verbose( 'Removing existing Job Wrapper for %s' % ( jobID ) )
      os.remove( jobWrapperFile )
    fd = open( self.jobWrapperTemplate, 'r' )
    wrapperTemplate = fd.read()
    fd.close()

    dateStr = time.strftime( "%Y-%m-%d", time.localtime( time.time() ) )
    timeStr = time.strftime( "%H:%M", time.localtime( time.time() ) )
    date_time = '%s %s' % ( dateStr, timeStr )
    signature = __RCSID__
    dPython = sys.executable

    systemConfig = ''
    if jobParams.has_key( 'SystemConfig' ):
      systemConfig = jobParams['SystemConfig']
      self.log.verbose( 'Job system configuration requirement is %s' % ( systemConfig ) )
      if resourceParams.has_key( 'Root' ):
        jobPython = '%s/%s/bin/python' % ( resourceParams['Root'], systemConfig )
        if os.path.exists( jobPython ):
          self.log.verbose( 'Found local python for job:\n%s' % ( jobPython ) )
          dPython = jobPython
        else:
          if systemConfig == 'ANY':
            self.log.verbose( 'Using standard available python %s for job' % ( dPython ) )
          else:
            self.log.warn( 'Job requested python \n%s\n but this is not available locally' % ( jobPython ) )
      else:
        self.log.warn( 'No LocalSite/Root defined' )
    else:
      self.log.warn( 'Job has no system configuration requirement' )

    if not systemConfig or systemConfig.lower() == 'any':
      systemConfig = gConfig.getValue( '/LocalSite/Architecture', '' )
      if not systemConfig:
        self.log.warn( 'Could not establish SystemConfig' )

    logLevel = self.defaultLogLevel
    if jobParams.has_key( 'LogLevel' ):
      logLevel = jobParams['LogLevel']
      self.log.info( 'Found Job LogLevel JDL parameter with value: %s' % ( logLevel ) )
    else:
      self.log.info( 'Applying default LogLevel JDL parameter with value: %s' % ( logLevel ) )

    realPythonPath = os.path.realpath( dPython )
    self.log.debug( 'Real python path after resolving links is:' )
    self.log.debug( realPythonPath )
    dPython = realPythonPath

    siteRootPython = self.siteRoot
    self.log.debug( 'DIRACPython is:\n%s' % dPython )
    self.log.debug( 'SiteRootPythonDir is:\n%s' % siteRootPython )
    wrapperTemplate = wrapperTemplate.replace( "@SIGNATURE@", str( signature ) )
    wrapperTemplate = wrapperTemplate.replace( "@JOBID@", str( jobID ) )
    wrapperTemplate = wrapperTemplate.replace( "@DATESTRING@", str( date_time ) )
    wrapperTemplate = wrapperTemplate.replace( "@JOBARGS@", str( arguments ) )
    wrapperTemplate = wrapperTemplate.replace( "@SITEPYTHON@", str( siteRootPython ) )
    wrapper = open ( jobWrapperFile, "w" )
    wrapper.write( wrapperTemplate )
    wrapper.close ()
    jobExeFile = '%s/job/Wrapper/Job%s' % ( workingDir, jobID )
    jobFileContents = \
"""#!/bin/sh
%s %s -o LogLevel=%s
""" % ( dPython, jobWrapperFile, logLevel )
    jobFile = open( jobExeFile, 'w' )
    jobFile.write( jobFileContents )
    jobFile.close()
    return S_OK( jobExeFile )

  #############################################################################
  def __saveJobJDLRequest( self, jobID, jobJDL ):
    """Save job JDL local to JobAgent.
    """
    classAdJob = ClassAd( jobJDL )
    classAdJob.insertAttributeString( 'LocalCE', self.ceName )
    jdlFileName = jobID + '.jdl'
    jdlFile = open( jdlFileName, 'w' )
    jdl = classAdJob.asJDL()
    jdlFile.write( jdl )
    jdlFile.close()
    return S_OK( jdlFileName )

  #############################################################################
  def __requestJob( self, resourceJDL ):
    """Request a single job from the matcher service.
    """
    try:
      matcher = RPCClient( 'WorkloadManagement/Matcher', timeout = 600 )
      result = matcher.requestJob( resourceJDL )
      return result
    except Exception, x:
      self.log.exception( lException = x )
      return S_ERROR( 'Job request to matcher service failed with exception' )
Beispiel #4
0
class JobAgent( AgentModule ):
  """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration.
  """

  #############################################################################
  def initialize( self, loops = 0 ):
    """Sets default parameters and creates CE instance
    """
    # Disable monitoring
    self.am_setOption( 'MonitoringEnabled', False )
    # self.log.setLevel('debug') #temporary for debugging
    self.am_setOption( 'MaxCycles', loops )

    ceType = self.am_getOption( 'CEType', 'InProcess' )
    localCE = gConfig.getValue( '/LocalSite/LocalCE', '' )
    if localCE:
      self.log.info( 'Defining CE from local configuration = %s' % localCE )
      ceType = localCE

    ceFactory = ComputingElementFactory()
    self.ceName = ceType
    ceInstance = ceFactory.getCE( ceType )
    if not ceInstance['OK']:
      self.log.warn( ceInstance['Message'] )
      return ceInstance

    self.initTimes = os.times()

    self.computingElement = ceInstance['Value']
    #Localsite options
    self.siteName = gConfig.getValue( '/LocalSite/Site', 'Unknown' )
    self.pilotReference = gConfig.getValue( '/LocalSite/PilotReference', 'Unknown' )
    self.defaultProxyLength = gConfig.getValue( '/Registry/DefaultProxyLifeTime', 86400 * 5 )
    #Agent options
    # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
    self.cpuFactor = gConfig.getValue( '/LocalSite/CPUNormalizationFactor', 0.0 )
    self.jobSubmissionDelay = self.am_getOption( 'SubmissionDelay', 10 )
    self.fillingMode = self.am_getOption( 'FillingModeFlag', False )
    self.stopOnApplicationFailure = self.am_getOption( 'StopOnApplicationFailure', True )
    self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', 10 )
    self.jobCount = 0
    self.matchFailedCount = 0
    self.extraOptions = gConfig.getValue( '/AgentJobRequirements/ExtraOptions', '' )
    #Timeleft
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = gConfig.getValue( '/Resources/Computing/CEDefaults/MaxCPUTime', 0.0 )
    self.timeLeftError = ''
    self.scaledCPUTime = 0.0
    self.pilotInfoReportedFlag = False
    return S_OK()

  #############################################################################
  def execute( self ):
    """The JobAgent execution method.
    """
    if self.jobCount:
      #Only call timeLeft utility after a job has been picked up
      self.log.info( 'Attempting to check CPU time left for filling mode' )
      if self.fillingMode:
        if self.timeLeftError:
          self.log.warn( self.timeLeftError )
          return self.__finish( self.timeLeftError )
        self.log.info( '%s normalized CPU units remaining in slot' % ( self.timeLeft ) )
        # Need to update the Configuration so that the new value is published in the next matching request
        result = self.computingElement.setCPUTimeLeft( cpuTimeLeft = self.timeLeft )
        if not result['OK']:
          return self.__finish( result['Message'] )
        
        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
          localConfigFile = os.path.join( '.', self.extraOptions )
        else:
          localConfigFile = os.path.join( rootPath, "etc", "dirac.cfg" )
        localCfg.loadFromFile( localConfigFile )
        if not localCfg.isSection('/LocalSite'):
          localCfg.createNewSection('/LocalSite')
        localCfg.setOption( '/LocalSite/CPUTimeLeft', self.timeLeft )
        localCfg.writeToFile( localConfigFile )
        
      else:
        return self.__finish( 'Filling Mode is Disabled' )

    self.log.verbose( 'Job Agent execution loop' )
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
      self.log.info( 'Resource is not available' )
      self.log.info( available['Message'] )
      return self.__finish( 'CE Not Available' )

    self.log.info( available['Message'] )

    result = self.computingElement.getDescription()
    if not result['OK']:
      return result
    ceDict = result['Value']

    # Add pilot information
    gridCE = gConfig.getValue( 'LocalSite/GridCE', 'Unknown' )
    if gridCE != 'Unknown':
      ceDict['GridCE'] = gridCE
    if not 'PilotReference' in ceDict:
      ceDict['PilotReference'] = str( self.pilotReference )
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict( '/AgentJobRequirements' )
    if result['OK']:
      requirementsDict = result['Value']
      ceDict.update( requirementsDict )

    self.log.verbose( ceDict )
    start = time.time()
    jobRequest = self.__requestJob( ceDict )
    matchTime = time.time() - start
    self.log.info( 'MatcherTime = %.2f (s)' % ( matchTime ) )

    self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches )

    if not jobRequest['OK']:
      if re.search( 'No match found', jobRequest['Message'] ):
        self.log.notice( 'Job request OK: %s' % ( jobRequest['Message'] ) )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "seconds timeout" ) != -1:
        self.log.error( jobRequest['Message'] )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "Pilot version does not match" ) != -1 :
        self.log.error( jobRequest['Message'] )
        return S_ERROR( jobRequest['Message'] )
      else:
        self.log.notice( 'Failed to get jobs: %s' % ( jobRequest['Message'] ) )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    jobID = matcherInfo['JobID']
    if not self.pilotInfoReportedFlag:
      # Check the flag after the first access to the Matcher
      self.pilotInfoReportedFlag = matcherInfo.get( 'PilotInfoReportedFlag', False )
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
      if not matcherInfo.has_key( param ):
        self.__report( jobID, 'Failed', 'Matcher did not return %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      elif not matcherInfo[param]:
        self.__report( jobID, 'Failed', 'Matcher returned null %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      else:
        self.log.verbose( 'Matcher returned %s = %s ' % ( param, matcherInfo[param] ) )

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    optimizerParams = {}
    for key in matcherInfo.keys():
      if not key in matcherParams:
        value = matcherInfo[key]
        optimizerParams[key] = value

    parameters = self.__getJDLParameters( jobJDL )
    if not parameters['OK']:
      self.__report( jobID, 'Failed', 'Could Not Extract JDL Parameters' )
      self.log.warn( parameters['Message'] )
      return self.__finish( 'JDL Problem' )

    params = parameters['Value']
    if not params.has_key( 'JobID' ):
      msg = 'Job has not JobID defined in JDL parameters'
      self.__report( jobID, 'Failed', msg )
      self.log.warn( msg )
      return self.__finish( 'JDL Problem' )
    else:
      jobID = params['JobID']

    if not params.has_key( 'JobType' ):
      self.log.warn( 'Job has no JobType defined in JDL parameters' )
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if not params.has_key( 'CPUTime' ):
      self.log.warn( 'Job has no CPU requirement defined in JDL parameters' )

    if self.extraOptions:
      params['Arguments'] = params['Arguments'] + ' ' + self.extraOptions
      params['ExtraOptions'] = self.extraOptions

    self.log.verbose( 'Job request successful: \n %s' % ( jobRequest['Value'] ) )
    self.log.info( 'Received JobID=%s, JobType=%s' % ( jobID, jobType ) )
    self.log.info( 'OwnerDN: %s JobGroup: %s' % ( ownerDN, jobGroup ) )
    self.jobCount += 1
    try:
      jobReport = JobReport( jobID, 'JobAgent@%s' % self.siteName )
      jobReport.setJobParameter( 'MatcherServiceTime', str( matchTime ), sendFlag = False )

      if os.environ.has_key( 'BOINC_JOB_ID' ):
        # Report BOINC environment 
        for p in ['BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName']:
          jobReport.setJobParameter( p, gConfig.getValue( '/LocalSite/%s' % p, 'Unknown' ), sendFlag = False )

      jobReport.setJobStatus( 'Matched', 'Job Received by Agent' )
      result = self.__setupProxy( ownerDN, jobGroup )
      if not result[ 'OK' ]:
        return self.__rescheduleFailedJob( jobID, result[ 'Message' ], self.stopOnApplicationFailure )
      if 'Value' in result and result[ 'Value' ]:
        proxyChain = result[ 'Value' ]

      # Save the job jdl for external monitoring
      self.__saveJobJDLRequest( jobID, jobJDL )

      software = self.__checkInstallSoftware( jobID, params, ceDict )
      if not software['OK']:
        self.log.error( 'Failed to install software for job %s' % ( jobID ) )
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self.__rescheduleFailedJob( jobID, errorMsg, self.stopOnApplicationFailure )

      self.log.debug( 'Before %sCE submitJob()' % ( self.ceName ) )
      submission = self.__submitJob( jobID, params, ceDict, optimizerParams, proxyChain )
      if not submission['OK']:
        self.__report( jobID, 'Failed', submission['Message'] )
        return self.__finish( submission['Message'] )
      elif 'PayloadFailed' in submission:
        # Do not keep running and do not overwrite the Payload error
        return self.__finish( 'Payload execution failed with error code %s' % submission['PayloadFailed'],
                              self.stopOnApplicationFailure )

      self.log.debug( 'After %sCE submitJob()' % ( self.ceName ) )
    except Exception:
      self.log.exception()
      return self.__rescheduleFailedJob( jobID , 'Job processing failed with exception', self.stopOnApplicationFailure )

    currentTimes = list( os.times() )
    for i in range( len( currentTimes ) ):
      currentTimes[i] -= self.initTimes[i]

    utime, stime, cutime, cstime, _elapsed = currentTimes
    cpuTime = utime + stime + cutime + cstime

    result = self.timeLeftUtil.getTimeLeft( cpuTime )
    if result['OK']:
      self.timeLeft = result['Value']
    else:
      if result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
      else:
        if self.cpuFactor:
          # if the batch system is not defined used the CPUNormalizationFactor 
          # defined locally
          self.timeLeft = self.__getCPUTimeLeft()
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

    self.__setJobParam( jobID, 'ScaledCPUTime', str( scaledCPUTime - self.scaledCPUTime ) )
    self.scaledCPUTime = scaledCPUTime

    return S_OK( 'Job Agent cycle complete' )

  #############################################################################
  def __saveJobJDLRequest( self, jobID, jobJDL ):
    """Save job JDL local to JobAgent.
    """
    classAdJob = ClassAd( jobJDL )
    classAdJob.insertAttributeString( 'LocalCE', self.ceName )
    jdlFileName = jobID + '.jdl'
    jdlFile = open( jdlFileName, 'w' )
    jdl = classAdJob.asJDL()
    jdlFile.write( jdl )
    jdlFile.close()

  #############################################################################
  def __getCPUTimeLeft( self ):
    """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config.
    """
    utime, stime, cutime, _cstime, _elapsed = os.times()
    cpuTime = utime + stime + cutime
    self.log.info( 'Current raw CPU time consumed is %s' % cpuTime )
    timeleft = self.timeLeft - cpuTime * self.cpuFactor
    return timeleft

  #############################################################################
  def __changeProxy( self, oldProxy, newProxy ):
    """Can call glexec utility here to set uid or simply log the changeover
       of a proxy.
    """
    self.log.verbose( 'Log proxy change (to be instrumented)' )
    return S_OK()

  #############################################################################
  def __setupProxy( self, ownerDN, ownerGroup ):
    """
    Retrieve a proxy for the execution of the job
    """
    if gConfig.getValue( '/DIRAC/Security/UseServerCertificate' , False ):
      proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup )
      if not proxyResult['OK']:
        self.log.error( 'Invalid Proxy', proxyResult['Message'] )
        return S_ERROR( 'Failed to setup proxy: %s' % proxyResult[ 'Message' ] )
      return S_OK( proxyResult['Value'] )
    else:
      ret = getProxyInfo( disableVOMS = True )
      if not ret['OK']:
        self.log.error( 'Invalid Proxy', ret['Message'] )
        return S_ERROR( 'Invalid Proxy' )

      proxyChain = ret['Value']['chain']
      if not 'groupProperties' in ret['Value']:
        print ret['Value']
        print proxyChain.dumpAllToString()
        self.log.error( 'Invalid Proxy', 'Group has no properties defined' )
        return S_ERROR( 'Proxy has no group properties defined' )

      groupProps = ret['Value']['groupProperties']
      if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
        proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup )
        if not proxyResult['OK']:
          self.log.error( 'Invalid Proxy', proxyResult['Message'] )
          return S_ERROR( 'Failed to setup proxy: %s' % proxyResult[ 'Message' ] )
        proxyChain = proxyResult['Value']

    return S_OK( proxyChain )

  #############################################################################
  def __requestProxyFromProxyManager( self, ownerDN, ownerGroup ):
    """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.
    """

    self.log.info( "Requesting proxy for %s@%s" % ( ownerDN, ownerGroup ) )
    token = gConfig.getValue( "/Security/ProxyToken", "" )
    if not token:
      self.log.info( "No token defined. Trying to download proxy without token" )
      token = False
    retVal = gProxyManager.getPayloadProxyFromDIRACGroup( ownerDN, ownerGroup,
                                                          self.defaultProxyLength, token )
    if not retVal[ 'OK' ]:
      self.log.error( 'Could not retrieve proxy' )
      self.log.warn( retVal )
      os.system( 'dirac-proxy-info' )
      sys.stdout.flush()
      return S_ERROR( 'Error retrieving proxy' )

    chain = retVal[ 'Value' ]
    return S_OK( chain )

  #############################################################################
  def __checkInstallSoftware( self, jobID, jobParams, resourceParams ):
    """Checks software requirement of job and whether this is already present
       before installing software locally.
    """
    if not jobParams.has_key( 'SoftwareDistModule' ):
      msg = 'Job has no software installation requirement'
      self.log.verbose( msg )
      return S_OK( msg )

    self.__report( jobID, 'Matched', 'Installing Software' )
    softwareDist = jobParams['SoftwareDistModule']
    self.log.verbose( 'Found VO Software Distribution module: %s' % ( softwareDist ) )
    argumentsDict = {'Job':jobParams, 'CE':resourceParams}
    moduleFactory = ModuleFactory()
    moduleInstance = moduleFactory.getModule( softwareDist, argumentsDict )
    if not moduleInstance['OK']:
      return moduleInstance

    module = moduleInstance['Value']
    return module.execute()

  #############################################################################
  def __submitJob( self, jobID, jobParams, resourceParams, optimizerParams, proxyChain ):
    """Submit job to the Computing Element instance after creating a custom
       Job Wrapper with the available job parameters.
    """
    logLevel = self.am_getOption( 'DefaultLogLevel', 'INFO' )
    defaultWrapperLocation = self.am_getOption( 'JobWrapperTemplate',
                                                'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py' )
    result = createJobWrapper( jobID, jobParams, resourceParams, optimizerParams,
                               extraOptions = self.extraOptions, defaultWrapperLocation = defaultWrapperLocation,
                               log = self.log, logLevel = logLevel )
    if not result['OK']:
      return result

    wrapperFile = result['Value']
    self.__report( jobID, 'Matched', 'Submitted To CE' )

    self.log.info( 'Submitting %s to %sCE' % ( os.path.basename( wrapperFile ), self.ceName ) )

    #Pass proxy to the CE
    proxy = proxyChain.dumpAllToString()
    if not proxy['OK']:
      self.log.error( proxy )
      return S_ERROR( 'Payload Proxy Not Found' )

    payloadProxy = proxy['Value']
    # FIXME: how can we set the batchID before we submit, this makes no sense
    batchID = 'dc%s' % ( jobID )
    submission = self.computingElement.submitJob( wrapperFile, payloadProxy )

    ret = S_OK( 'Job submitted' )

    if submission['OK']:
      batchID = submission['Value']
      self.log.info( 'Job %s submitted as %s' % ( jobID, batchID ) )
      self.log.verbose( 'Set JobParameter: Local batch ID %s' % ( batchID ) )
      self.__setJobParam( jobID, 'LocalBatchID', str( batchID ) )
      if 'PayloadFailed' in submission:
        ret['PayloadFailed'] = submission['PayloadFailed']
        return ret
      time.sleep( self.jobSubmissionDelay )
    else:
      self.log.error( 'Job submission failed', jobID )
      self.__setJobParam( jobID, 'ErrorMessage', '%s CE Submission Error' % ( self.ceName ) )
      if 'ReschedulePayload' in submission:
        rescheduleFailedJob( jobID, submission['Message'], self.__report )
      else:
        if 'Value' in submission:
          self.log.error( 'Error in DIRAC JobWrapper:', 'exit code = %s' % ( str( submission['Value'] ) ) )
        # make sure the Job is declared Failed
        self.__report( jobID, 'Failed', submission['Message'] )
      return S_ERROR( '%s CE Submission Error: %s' % ( self.ceName, submission['Message'] ) )

    return ret

  #############################################################################
  def __requestJob( self, ceDict ):
    """Request a single job from the matcher service.
    """
    try:
      matcher = RPCClient( 'WorkloadManagement/Matcher', timeout = 600 )
      result = matcher.requestJob( ceDict )
      return result
    except Exception, x:
      self.log.exception( lException = x )
      return S_ERROR( 'Job request to matcher service failed with exception' )
Beispiel #5
0
class JobAgent(AgentModule):
    """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration.
  """

    #############################################################################
    def initialize(self, loops=0):
        """Sets default parameters and creates CE instance
    """
        # Disable monitoring
        self.am_setOption('MonitoringEnabled', False)
        # self.log.setLevel('debug') #temporary for debugging
        self.am_setOption('MaxCycles', loops)

        ceType = self.am_getOption('CEType', 'InProcess')
        localCE = gConfig.getValue('/LocalSite/LocalCE', '')
        if localCE:
            self.log.info('Defining CE from local configuration = %s' %
                          localCE)
            ceType = localCE

        # Create backend Computing Element
        ceFactory = ComputingElementFactory()
        self.ceName = ceType
        ceInstance = ceFactory.getCE(ceType)
        if not ceInstance['OK']:
            self.log.warn(ceInstance['Message'])
            return ceInstance
        self.computingElement = ceInstance['Value']

        result = self.computingElement.getDescription()
        if not result['OK']:
            self.log.warn("Can not get the CE description")
            return result
        ceDict = result['Value']
        self.timeLeft = ceDict.get('CPUTime', 0.0)
        self.timeLeft = gConfig.getValue(
            '/Resources/Computing/CEDefaults/MaxCPUTime', self.timeLeft)

        self.initTimes = os.times()
        # Localsite options
        self.siteName = gConfig.getValue('/LocalSite/Site', 'Unknown')
        self.pilotReference = gConfig.getValue('/LocalSite/PilotReference',
                                               'Unknown')
        self.defaultProxyLength = gConfig.getValue(
            '/Registry/DefaultProxyLifeTime', 86400 * 5)
        # Agent options
        # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
        self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor',
                                          0.0)
        self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', 10)
        self.fillingMode = self.am_getOption('FillingModeFlag', False)
        self.minimumTimeLeft = self.am_getOption('MinimumTimeLeft', 1000)
        self.stopOnApplicationFailure = self.am_getOption(
            'StopOnApplicationFailure', True)
        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', 10)
        self.jobCount = 0
        self.matchFailedCount = 0
        self.extraOptions = gConfig.getValue(
            '/AgentJobRequirements/ExtraOptions', '')
        # Timeleft
        self.timeLeftUtil = TimeLeft()
        self.timeLeftError = ''
        self.scaledCPUTime = 0.0
        self.pilotInfoReportedFlag = False
        return S_OK()

    #############################################################################
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('%s normalized CPU units remaining in slot' %
                              (self.timeLeft))
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish('No more time left')
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join('.', self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc",
                                                   "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection('/LocalSite'):
                    localCfg.createNewSection('/LocalSite')
                localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        available = self.computingElement.available()
        if not available['OK'] or not available['Value']:
            self.log.info('Resource is not available')
            self.log.info(available['Message'])
            return self.__finish('CE Not Available')

        self.log.info(available['Message'])

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result
        ceDict = result['Value']

        # Add pilot information
        gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
        if gridCE != 'Unknown':
            ceDict['GridCE'] = gridCE
        if not 'PilotReference' in ceDict:
            ceDict['PilotReference'] = str(self.pilotReference)
        ceDict['PilotBenchmark'] = self.cpuFactor
        ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict('/AgentJobRequirements')
        if result['OK']:
            requirementsDict = result['Value']
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info('MatcherTime = %.2f (s)' % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:
            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK: %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error('Timeout while requesting job',
                               jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                errorMsg = 'Pilot version does not match the production version'
                self.log.error(errorMsg,
                               jobRequest['Message'].replace(errorMsg, ''))
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs: %s' %
                                (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get(
                'PilotInfoReportedFlag', False)
        jobID = matcherInfo['JobID']
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if param not in matcherInfo:
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned %s = %s ' %
                                 (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn(parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if 'JobID' not in params:
            msg = 'Job has not JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if 'JobType' not in params:
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if 'CPUTime' not in params:
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        if self.extraOptions:
            params['Arguments'] += ' ' + self.extraOptions
            params['ExtraOptions'] = self.extraOptions

        self.log.verbose('Job request successful: \n', jobRequest['Value'])
        self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
        self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)

            if 'BOINC_JOB_ID' in os.environ:
                # Report BOINC environment
                for p in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform',
                          'BoincHostName'):
                    jobReport.setJobParameter(p,
                                              gConfig.getValue(
                                                  '/LocalSite/%s' % p,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self.__rescheduleFailedJob(
                    jobID, result['Message'], self.stopOnApplicationFailure)
            proxyChain = result.get('Value')

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job',
                               '%s' % (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self.__rescheduleFailedJob(
                    jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.debug('Before %sCE submitJob()' % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict,
                                          optimizerParams, proxyChain)
            if not submission['OK']:
                self.__report(jobID, 'Failed', submission['Message'])
                return self.__finish(submission['Message'])
            elif 'PayloadFailed' in submission:
                # Do not keep running and do not overwrite the Payload error
                message = 'Payload execution failed with error code %s' % submission[
                    'PayloadFailed']
                if self.stopOnApplicationFailure:
                    return self.__finish(message,
                                         self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug('After %sCE submitJob()' % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
        cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            if result['Message'] != 'Current batch system is not supported':
                self.timeLeftError = result['Message']
            else:
                # if the batch system is not defined, use the process time and the CPU normalization defined locally
                self.timeLeft = self.__getCPUTimeLeft()

        scaledCPUTime = self.timeLeftUtil.getScaledCPU()
        self.__setJobParam(jobID, 'ScaledCPUTime',
                           str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK('Job Agent cycle complete')

    #############################################################################
    def __saveJobJDLRequest(self, jobID, jobJDL):
        """Save job JDL local to JobAgent.
    """
        classAdJob = ClassAd(jobJDL)
        classAdJob.insertAttributeString('LocalCE', self.ceName)
        jdlFileName = jobID + '.jdl'
        jdlFile = open(jdlFileName, 'w')
        jdl = classAdJob.asJDL()
        jdlFile.write(jdl)
        jdlFile.close()

    #############################################################################
    def __getCPUTimeLeft(self):
        """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config.
    """
        cpuTime = sum(os.times()[:-1])
        self.log.info('Current raw CPU time consumed is %s' % cpuTime)
        timeleft = self.timeLeft
        if self.cpuFactor:
            timeleft -= cpuTime * self.cpuFactor
        return timeleft

    #############################################################################
    def __changeProxy(self, oldProxy, newProxy):
        """Can call glexec utility here to set uid or simply log the changeover
       of a proxy.
    """
        self.log.verbose('Log proxy change (to be instrumented)')
        return S_OK()

    #############################################################################
    def __setupProxy(self, ownerDN, ownerGroup):
        """
    Retrieve a proxy for the execution of the job
    """
        if gConfig.getValue('/DIRAC/Security/UseServerCertificate', False):
            proxyResult = self.__requestProxyFromProxyManager(
                ownerDN, ownerGroup)
            if not proxyResult['OK']:
                self.log.error('Failed to setup proxy', proxyResult['Message'])
                return S_ERROR('Failed to setup proxy: %s' %
                               proxyResult['Message'])
            return S_OK(proxyResult['Value'])
        else:
            ret = getProxyInfo(disableVOMS=True)
            if not ret['OK']:
                self.log.error('Invalid Proxy', ret['Message'])
                return S_ERROR('Invalid Proxy')

            proxyChain = ret['Value']['chain']
            if not 'groupProperties' in ret['Value']:
                print ret['Value']
                print proxyChain.dumpAllToString()
                self.log.error('Invalid Proxy',
                               'Group has no properties defined')
                return S_ERROR('Proxy has no group properties defined')

            groupProps = ret['Value']['groupProperties']
            if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
                proxyResult = self.__requestProxyFromProxyManager(
                    ownerDN, ownerGroup)
                if not proxyResult['OK']:
                    self.log.error('Invalid Proxy', proxyResult['Message'])
                    return S_ERROR('Failed to setup proxy: %s' %
                                   proxyResult['Message'])
                proxyChain = proxyResult['Value']

        return S_OK(proxyChain)

    #############################################################################
    def __requestProxyFromProxyManager(self, ownerDN, ownerGroup):
        """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.
    """

        self.log.info("Requesting proxy for %s@%s" % (ownerDN, ownerGroup))
        token = gConfig.getValue("/Security/ProxyToken", "")
        if not token:
            self.log.info(
                "No token defined. Trying to download proxy without token")
            token = False
        retVal = gProxyManager.getPayloadProxyFromDIRACGroup(
            ownerDN, ownerGroup, self.defaultProxyLength, token)
        if not retVal['OK']:
            self.log.error('Could not retrieve payload proxy',
                           retVal['Message'])
            self.log.warn(retVal)
            os.system('dirac-proxy-info')
            sys.stdout.flush()
            return S_ERROR('Error retrieving proxy')

        chain = retVal['Value']
        return S_OK(chain)

    #############################################################################
    def __checkInstallSoftware(self, jobID, jobParams, resourceParams):
        """Checks software requirement of job and whether this is already present
       before installing software locally.
    """
        if 'SoftwareDistModule' not in jobParams:
            msg = 'Job has no software installation requirement'
            self.log.verbose(msg)
            return S_OK(msg)

        self.__report(jobID, 'Matched', 'Installing Software')
        softwareDist = jobParams['SoftwareDistModule']
        self.log.verbose('Found VO Software Distribution module: %s' %
                         (softwareDist))
        argumentsDict = {'Job': jobParams, 'CE': resourceParams}
        moduleFactory = ModuleFactory()
        moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict)
        if not moduleInstance['OK']:
            return moduleInstance

        module = moduleInstance['Value']
        return module.execute()

    #############################################################################
    def __submitJob(self, jobID, jobParams, resourceParams, optimizerParams,
                    proxyChain):
        """Submit job to the Computing Element instance after creating a custom
       Job Wrapper with the available job parameters.
    """
        logLevel = self.am_getOption('DefaultLogLevel', 'INFO')
        defaultWrapperLocation = self.am_getOption(
            'JobWrapperTemplate',
            'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py')
        result = createJobWrapper(
            jobID,
            jobParams,
            resourceParams,
            optimizerParams,
            extraOptions=self.extraOptions,
            defaultWrapperLocation=defaultWrapperLocation,
            log=self.log,
            logLevel=logLevel)
        if not result['OK']:
            return result

        wrapperFile = result['Value']
        self.__report(jobID, 'Matched', 'Submitted To CE')

        self.log.info('Submitting %s to %sCE' %
                      (os.path.basename(wrapperFile), self.ceName))

        # Pass proxy to the CE
        proxy = proxyChain.dumpAllToString()
        if not proxy['OK']:
            self.log.error('Invalid proxy', proxy)
            return S_ERROR('Payload Proxy Not Found')

        payloadProxy = proxy['Value']
        # FIXME: how can we set the batchID before we submit, this makes no sense
        batchID = 'dc%s' % (jobID)
        submission = self.computingElement.submitJob(wrapperFile, payloadProxy)

        ret = S_OK('Job submitted')

        if submission['OK']:
            batchID = submission['Value']
            self.log.info('Job %s submitted as %s' % (jobID, batchID))
            self.log.verbose('Set JobParameter: Local batch ID %s' % (batchID))
            self.__setJobParam(jobID, 'LocalBatchID', str(batchID))
            if 'PayloadFailed' in submission:
                ret['PayloadFailed'] = submission['PayloadFailed']
                return ret
            time.sleep(self.jobSubmissionDelay)
        else:
            self.log.error('Job submission failed', jobID)
            self.__setJobParam(jobID, 'ErrorMessage',
                               '%s CE Submission Error' % (self.ceName))
            if 'ReschedulePayload' in submission:
                rescheduleFailedJob(jobID, submission['Message'])
                return S_OK(
                )  # Without this job is marked as failed at line 265 above
            else:
                if 'Value' in submission:
                    self.log.error(
                        'Error in DIRAC JobWrapper:',
                        'exit code = %s' % (str(submission['Value'])))
                # make sure the Job is declared Failed
                self.__report(jobID, 'Failed', submission['Message'])
            return S_ERROR('%s CE Submission Error: %s' %
                           (self.ceName, submission['Message']))

        return ret

    #############################################################################
    def __requestJob(self, ceDict):
        """Request a single job from the matcher service.
    """
        try:
            matcher = RPCClient('WorkloadManagement/Matcher', timeout=600)
            result = matcher.requestJob(ceDict)
            return result
        except Exception as x:
            self.log.exception(lException=x)
            return S_ERROR(
                "Job request to matcher service failed with exception")

    #############################################################################
    def __getJDLParameters(self, jdl):
        """Returns a dictionary of JDL parameters.
    """
        try:
            parameters = {}
            #      print jdl
            if not re.search('\[', jdl):
                jdl = '[' + jdl + ']'
            classAdJob = ClassAd(jdl)
            paramsDict = classAdJob.contents
            for param, value in paramsDict.items():
                if value.strip().startswith('{'):
                    self.log.debug('Found list type parameter %s' % (param))
                    rawValues = value.replace('{',
                                              '').replace('}', '').replace(
                                                  '"', '').split()
                    valueList = []
                    for val in rawValues:
                        if re.search(',$', val):
                            valueList.append(val[:-1])
                        else:
                            valueList.append(val)
                    parameters[param] = valueList
                else:
                    parameters[param] = value.replace('"', '').replace(
                        '{', '"{').replace('}', '}"')
                    self.log.debug('Found standard parameter %s: %s' %
                                   (param, parameters[param]))
            return S_OK(parameters)
        except Exception, x:
            self.log.exception(lException=x)
            return S_ERROR('Exception while extracting JDL parameters for job')
Beispiel #6
0
class JobAgent(AgentModule):
    """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

    #############################################################################
    def initialize(self, loops=0):
        """Sets default parameters and creates CE instance
    """
        #Disable monitoring
        self.am_setOption('MonitoringEnabled', False)
        # self.log.setLevel('debug') #temporary for debugging
        self.am_setOption('MaxCycles', loops)

        ceType = self.am_getOption('CEType', 'InProcess')
        localCE = gConfig.getValue('/LocalSite/LocalCE', '')
        if localCE:
            self.log.info('Defining CE from local configuration = %s' %
                          localCE)
            ceType = localCE

        ceFactory = ComputingElementFactory()
        self.ceName = ceType
        ceInstance = ceFactory.getCE(ceType)
        if not ceInstance['OK']:
            self.log.warn(ceInstance['Message'])
            return ceInstance

        self.initTimes = os.times()

        self.computingElement = ceInstance['Value']
        self.diracRoot = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
        #Localsite options
        self.siteRoot = gConfig.getValue('/LocalSite/Root', os.getcwd())
        self.siteName = gConfig.getValue('/LocalSite/Site', 'Unknown')
        self.pilotReference = gConfig.getValue('/LocalSite/PilotReference',
                                               'Unknown')
        self.defaultProxyLength = gConfig.getValue(
            '/Registry/DefaultProxyLifeTime', 86400 * 5)
        #Agent options
        # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
        self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor',
                                          0.0)
        defaultWrapperLocation = 'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py'
        self.jobWrapperTemplate = os.path.join(
            self.diracRoot,
            self.am_getOption('JobWrapperTemplate', defaultWrapperLocation))
        self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', 10)
        self.defaultLogLevel = self.am_getOption('DefaultLogLevel', 'info')
        self.fillingMode = self.am_getOption('FillingModeFlag', False)
        self.stopOnApplicationFailure = self.am_getOption(
            'StopOnApplicationFailure', True)
        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', 10)
        self.jobCount = 0
        self.matchFailedCount = 0
        #Timeleft
        self.timeLeftUtil = TimeLeft()
        self.timeLeft = gConfig.getValue(
            '/Resources/Computing/CEDefaults/MaxCPUTime', 0.0)
        self.gridCEQueue = gConfig.getValue(
            '/Resources/Computing/CEDefaults/GridCEQueue', '')
        self.timeLeftError = ''
        self.scaledCPUTime = 0.0
        self.pilotInfoReportedFlag = False
        return S_OK()

    #############################################################################
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            #Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('%s normalized CPU units remaining in slot' %
                              (self.timeLeft))
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])
            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        available = self.computingElement.available()
        if not available['OK'] or not available['Value']:
            self.log.info('Resource is not available')
            self.log.info(available['Message'])
            return self.__finish('CE Not Available')

        self.log.info(available['Message'])

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result
        ceDict = result['Value']

        # Add pilot information
        gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
        if gridCE != 'Unknown':
            ceDict['GridCE'] = gridCE
        if not 'PilotReference' in ceDict:
            ceDict['PilotReference'] = str(self.pilotReference)
        ceDict['PilotBenchmark'] = self.cpuFactor
        ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict('/AgentJobRequirements')
        if result['OK']:
            requirementsDict = result['Value']
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info('MatcherTime = %.2f (s)' % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:
            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK: %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error(jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                self.log.error(jobRequest['Message'])
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs: %s' %
                                (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        jobID = matcherInfo['JobID']
        self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag',
                                                     False)
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if not matcherInfo.has_key(param):
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned %s = %s ' %
                                 (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        optimizerParams = {}
        for key in matcherInfo.keys():
            if not key in matcherParams:
                value = matcherInfo[key]
                optimizerParams[key] = value

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn(parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if not params.has_key('JobID'):
            msg = 'Job has not JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if not params.has_key('JobType'):
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if not params.has_key('SystemConfig'):
            self.log.warn(
                'Job has no system configuration defined in JDL parameters')
            systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
            self.log.info(
                'Setting system config to /LocalSite/Architecture = %s since it was not specified'
                % systemConfig)
            if not systemConfig:
                self.log.warn('/LocalSite/Architecture is not defined')
            params['SystemConfig'] = systemConfig
        else:
            systemConfig = params['SystemConfig']
            if systemConfig.lower() == 'any':
                systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
                self.log.info(
                    'Setting SystemConfig = /LocalSite/Architecture =',
                    '"%s" since it was set to "ANY" in the job description' %
                    systemConfig)
                if not systemConfig:
                    self.log.warn('/LocalSite/Architecture is not defined')
                params['SystemConfig'] = systemConfig

        if not params.has_key('CPUTime'):
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        self.log.verbose('Job request successful: \n %s' %
                         (jobRequest['Value']))
        self.log.info('Received JobID=%s, JobType=%s, SystemConfig=%s' %
                      (jobID, jobType, systemConfig))
        self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)
            if self.gridCEQueue:
                jobReport.setJobParameter('GridCEQueue',
                                          self.gridCEQueue,
                                          sendFlag=False)

            if os.environ.has_key('BOINC_JOB_ID'):
                # Report BOINC environment
                for p in [
                        'BoincUserID', 'BoincHostID', 'BoincHostPlatform',
                        'BoincHostName'
                ]:
                    jobReport.setJobParameter(p,
                                              gConfig.getValue(
                                                  '/LocalSite/%s' % p,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            # self.__setJobSite( jobID, self.siteName )
            if not self.pilotInfoReportedFlag:
                self.__reportPilotInfo(jobID)
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self.__rescheduleFailedJob(
                    jobID, result['Message'], self.stopOnApplicationFailure)
            if 'Value' in result and result['Value']:
                proxyChain = result['Value']

            # Is this necessary at all?
            saveJDL = self.__saveJobJDLRequest(jobID, jobJDL)
            #self.__report(jobID,'Matched','Job Prepared to Submit')

            #resourceParameters = self.__getJDLParameters( resourceJDL )
            #if not resourceParameters['OK']:
            #  return resourceParameters
            #resourceParams = resourceParameters['Value']

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job %s' %
                               (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self.__rescheduleFailedJob(
                    jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.verbose('Before %sCE submitJob()' % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict,
                                          optimizerParams, jobJDL, proxyChain)
            if not submission['OK']:
                self.__report(jobID, 'Failed', submission['Message'])
                return self.__finish(submission['Message'])
            elif 'PayloadFailed' in submission:
                # Do not keep running and do not overwrite the Payload error
                return self.__finish(
                    'Payload execution failed with error code %s' %
                    submission['PayloadFailed'], self.stopOnApplicationFailure)

            self.log.verbose('After %sCE submitJob()' % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        currentTimes = list(os.times())
        for i in range(len(currentTimes)):
            currentTimes[i] -= self.initTimes[i]

        utime, stime, cutime, cstime, elapsed = currentTimes
        cpuTime = utime + stime + cutime + cstime

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            if result['Message'] != 'Current batch system is not supported':
                self.timeLeftError = result['Message']
            else:
                if self.cpuFactor:
                    # if the batch system is not defined used the CPUNormalizationFactor
                    # defined locally
                    self.timeLeft = self.__getCPUTimeLeft()
        scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

        self.__setJobParam(jobID, 'ScaledCPUTime',
                           str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK('Job Agent cycle complete')

    #############################################################################
    def __getCPUTimeLeft(self):
        """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config.
    """
        utime, stime, cutime, cstime, elapsed = os.times()
        cpuTime = utime + stime + cutime
        self.log.info('Current raw CPU time consumed is %s' % cpuTime)
        timeleft = self.timeLeft - cpuTime * self.cpuFactor
        return timeleft

    #############################################################################
    def __changeProxy(self, oldProxy, newProxy):
        """Can call glexec utility here to set uid or simply log the changeover
       of a proxy.
    """
        self.log.verbose('Log proxy change (to be instrumented)')
        return S_OK()

    #############################################################################
    def __setupProxy(self, ownerDN, ownerGroup):
        """
    Retrieve a proxy for the execution of the job
    """
        if gConfig.getValue('/DIRAC/Security/UseServerCertificate', False):
            proxyResult = self.__requestProxyFromProxyManager(
                ownerDN, ownerGroup)
            if not proxyResult['OK']:
                self.log.error('Invalid Proxy', proxyResult['Message'])
                return S_ERROR('Failed to setup proxy: %s' %
                               proxyResult['Message'])
            return S_OK(proxyResult['Value'])
        else:
            ret = getProxyInfo(disableVOMS=True)
            if not ret['OK']:
                self.log.error('Invalid Proxy', ret['Message'])
                return S_ERROR('Invalid Proxy')

            proxyChain = ret['Value']['chain']
            if not 'groupProperties' in ret['Value']:
                print ret['Value']
                print proxyChain.dumpAllToString()
                self.log.error('Invalid Proxy',
                               'Group has no properties defined')
                return S_ERROR('Proxy has no group properties defined')

            groupProps = ret['Value']['groupProperties']
            if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
                proxyResult = self.__requestProxyFromProxyManager(
                    ownerDN, ownerGroup)
                if not proxyResult['OK']:
                    self.log.error('Invalid Proxy', proxyResult['Message'])
                    return S_ERROR('Failed to setup proxy: %s' %
                                   proxyResult['Message'])
                proxyChain = proxyResult['Value']

        return S_OK(proxyChain)

    #############################################################################
    def __requestProxyFromProxyManager(self, ownerDN, ownerGroup):
        """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.
    """

        self.log.info("Requesting proxy for %s@%s" % (ownerDN, ownerGroup))
        token = gConfig.getValue("/Security/ProxyToken", "")
        if not token:
            self.log.info(
                "No token defined. Trying to download proxy without token")
            token = False
        retVal = gProxyManager.getPayloadProxyFromDIRACGroup(
            ownerDN, ownerGroup, self.defaultProxyLength, token)
        if not retVal['OK']:
            self.log.error('Could not retrieve proxy')
            self.log.warn(retVal)
            os.system('dirac-proxy-info')
            sys.stdout.flush()
            return S_ERROR('Error retrieving proxy')

        chain = retVal['Value']
        return S_OK(chain)

    #############################################################################
    def __checkInstallSoftware(self, jobID, jobParams, resourceParams):
        """Checks software requirement of job and whether this is already present
       before installing software locally.
    """
        if not jobParams.has_key('SoftwareDistModule'):
            msg = 'Job has no software installation requirement'
            self.log.verbose(msg)
            return S_OK(msg)

        self.__report(jobID, 'Matched', 'Installing Software')
        softwareDist = jobParams['SoftwareDistModule']
        self.log.verbose('Found VO Software Distribution module: %s' %
                         (softwareDist))
        argumentsDict = {'Job': jobParams, 'CE': resourceParams}
        moduleFactory = ModuleFactory()
        moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict)
        if not moduleInstance['OK']:
            return moduleInstance

        module = moduleInstance['Value']
        result = module.execute()
        return result

    #############################################################################
    def __submitJob(self, jobID, jobParams, resourceParams, optimizerParams,
                    jobJDL, proxyChain):
        """Submit job to the Computing Element instance after creating a custom
       Job Wrapper with the available job parameters.
    """
        result = self.__createJobWrapper(jobID, jobParams, resourceParams,
                                         optimizerParams)

        if not result['OK']:
            return result

        wrapperData = result['Value']
        wrapperFile = wrapperData['execFile']
        self.__report(jobID, 'Matched', 'Submitted To CE')

        wrapperName = os.path.basename(wrapperFile)
        self.log.info('Submitting %s to %sCE' % (wrapperName, self.ceName))

        #Pass proxy to the CE
        proxy = proxyChain.dumpAllToString()
        if not proxy['OK']:
            self.log.error(proxy)
            return S_ERROR('Payload Proxy Not Found')

        payloadProxy = proxy['Value']
        # FIXME: how can we set the batchID before we submit, this makes no sense
        batchID = 'dc%s' % (jobID)
        submission = self.computingElement.submitJob(wrapperFile, payloadProxy,
                                                     wrapperData)

        ret = S_OK('Job submitted')

        if submission['OK']:
            batchID = submission['Value']
            self.log.info('Job %s submitted as %s' % (jobID, batchID))
            self.log.verbose('Set JobParameter: Local batch ID %s' % (batchID))
            self.__setJobParam(jobID, 'LocalBatchID', str(batchID))
            if 'PayloadFailed' in submission:
                ret['PayloadFailed'] = submission['PayloadFailed']
                return ret
            time.sleep(self.jobSubmissionDelay)
        else:
            self.log.error('Job submission failed', jobID)
            self.__setJobParam(jobID, 'ErrorMessage',
                               '%s CE Submission Error' % (self.ceName))
            if 'ReschedulePayload' in submission:
                rescheduleFailedJob(jobID, submission['Message'],
                                    self.__report)
            else:
                if 'Value' in submission:
                    self.log.error(
                        'Error in DIRAC JobWrapper:',
                        'exit code = %s' % (str(submission['Value'])))
                # make sure the Job is declared Failed
                self.__report(jobID, 'Failed', submission['Message'])
            return S_ERROR('%s CE Submission Error: %s' %
                           (self.ceName, submission['Message']))

        return ret

    #############################################################################
    def __createJobWrapper(self, jobID, jobParams, resourceParams,
                           optimizerParams):
        """This method creates a job wrapper filled with the CE and Job parameters
       to executed the job.
    """
        arguments = {
            'Job': jobParams,
            'CE': resourceParams,
            'Optimizer': optimizerParams
        }
        self.log.verbose('Job arguments are: \n %s' % (arguments))

        workingDir = gConfig.getValue('/LocalSite/WorkingDirectory',
                                      self.siteRoot)
        if not os.path.exists('%s/job/Wrapper' % (workingDir)):
            try:
                os.makedirs('%s/job/Wrapper' % (workingDir))
            except Exception:
                self.log.exception()
                return S_ERROR('Could not create directory for wrapper script')

        jobWrapperFile = '%s/job/Wrapper/Wrapper_%s' % (workingDir, jobID)
        if os.path.exists(jobWrapperFile):
            self.log.verbose('Removing existing Job Wrapper for %s' % (jobID))
            os.remove(jobWrapperFile)
        fd = open(self.jobWrapperTemplate, 'r')
        wrapperTemplate = fd.read()
        fd.close()

        dateStr = time.strftime("%Y-%m-%d", time.localtime(time.time()))
        timeStr = time.strftime("%H:%M", time.localtime(time.time()))
        date_time = '%s %s' % (dateStr, timeStr)
        signature = __RCSID__
        dPython = sys.executable

        systemConfig = ''
        if jobParams.has_key('SystemConfig'):
            systemConfig = jobParams['SystemConfig']
            self.log.verbose('Job system configuration requirement is %s' %
                             (systemConfig))
            if resourceParams.has_key('Root'):
                jobPython = '%s/%s/bin/python' % (resourceParams['Root'],
                                                  systemConfig)
                if os.path.exists(jobPython):
                    self.log.verbose('Found local python for job:\n%s' %
                                     (jobPython))
                    dPython = jobPython
                else:
                    if systemConfig == 'ANY':
                        self.log.verbose(
                            'Using standard available python %s for job' %
                            (dPython))
                    else:
                        self.log.warn(
                            'Job requested python \n%s\n but this is not available locally'
                            % (jobPython))
            else:
                self.log.warn('No LocalSite/Root defined')
        else:
            self.log.warn('Job has no system configuration requirement')

        if not systemConfig or systemConfig.lower() == 'any':
            systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
            if not systemConfig:
                self.log.warn('Could not establish SystemConfig')

        logLevel = self.defaultLogLevel
        if jobParams.has_key('LogLevel'):
            logLevel = jobParams['LogLevel']
            self.log.info('Found Job LogLevel JDL parameter with value: %s' %
                          (logLevel))
        else:
            self.log.info(
                'Applying default LogLevel JDL parameter with value: %s' %
                (logLevel))

        realPythonPath = os.path.realpath(dPython)
        self.log.debug('Real python path after resolving links is:')
        self.log.debug(realPythonPath)
        dPython = realPythonPath

        siteRootPython = self.siteRoot
        self.log.debug('DIRACPython is:\n%s' % dPython)
        self.log.debug('SiteRootPythonDir is:\n%s' % siteRootPython)
        wrapperTemplate = wrapperTemplate.replace("@SIGNATURE@",
                                                  str(signature))
        wrapperTemplate = wrapperTemplate.replace("@JOBID@", str(jobID))
        wrapperTemplate = wrapperTemplate.replace("@DATESTRING@",
                                                  str(date_time))
        wrapperTemplate = wrapperTemplate.replace("@JOBARGS@", str(arguments))
        wrapperTemplate = wrapperTemplate.replace("@SITEPYTHON@",
                                                  str(siteRootPython))
        wrapper = open(jobWrapperFile, "w")
        wrapper.write(wrapperTemplate)
        wrapper.close()
        jobExeFile = '%s/job/Wrapper/Job%s' % (workingDir, jobID)
        jobFileContents = \
    """#!/bin/sh
%s %s -o LogLevel=%s -o /DIRAC/Security/UseServerCertificate=no
""" % ( dPython, jobWrapperFile, logLevel )
        jobFile = open(jobExeFile, 'w')
        jobFile.write(jobFileContents)
        jobFile.close()

        data = {
            'execFile': jobExeFile,
            'wrapperTemplate': self.jobWrapperTemplate,
            'signature': signature,
            'wrapperFile': jobWrapperFile,
            'logLevel': logLevel,
            'jid': jobID,
            'datetime': date_time,
            'jobArgs': arguments,
            'sitePython': siteRootPython
        }
        return S_OK(data)

    #############################################################################
    def __saveJobJDLRequest(self, jobID, jobJDL):
        """Save job JDL local to JobAgent.
    """
        classAdJob = ClassAd(jobJDL)
        classAdJob.insertAttributeString('LocalCE', self.ceName)
        jdlFileName = jobID + '.jdl'
        jdlFile = open(jdlFileName, 'w')
        jdl = classAdJob.asJDL()
        jdlFile.write(jdl)
        jdlFile.close()
        return S_OK(jdlFileName)

    #############################################################################
    def __requestJob(self, ceDict):
        """Request a single job from the matcher service.
    """
        try:
            matcher = RPCClient('WorkloadManagement/Matcher', timeout=600)
            result = matcher.requestJob(ceDict)
            return result
        except Exception, x:
            self.log.exception(lException=x)
            return S_ERROR(
                'Job request to matcher service failed with exception')
Beispiel #7
0
class JobAgent( AgentModule ):
  """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration.
  """

  def __init__(self, agentName, loadName, baseAgentName = False, properties = None):
    """ Just defines some default parameters
    """
    if not properties:
      properties = {}
    super( JobAgent, self ).__init__( agentName, loadName, baseAgentName, properties )

    self.ceName = 'InProcess'
    self.computingElement = None
    self.timeLeft = 0.0

    self.initTimes = os.times()
    # Localsite options
    self.siteName = 'Unknown'
    self.pilotReference = 'Unknown'
    self.defaultProxyLength = 86400 * 5
    # Agent options
    # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
    self.cpuFactor = 0.0
    self.jobSubmissionDelay = 10
    self.fillingMode = False
    self.minimumTimeLeft = 1000
    self.stopOnApplicationFailure = True
    self.stopAfterFailedMatches = 10
    self.jobCount = 0
    self.matchFailedCount = 0
    self.extraOptions = ''
    # Timeleft
    self.timeLeftUtil = None
    self.timeLeftError = ''
    self.scaledCPUTime = 0.0
    self.pilotInfoReportedFlag = False


  #############################################################################
  def initialize( self, loops = 0 ):
    """Sets default parameters and creates CE instance
    """
    # Disable monitoring
    self.am_setOption( 'MonitoringEnabled', False )
    # self.log.setLevel('debug') #temporary for debugging
    self.am_setOption( 'MaxCycles', loops )

    ceType = self.am_getOption( 'CEType', 'InProcess' )
    localCE = gConfig.getValue( '/LocalSite/LocalCE', '' )
    if localCE:
      self.log.info( 'Defining CE from local configuration = %s' % localCE )
      ceType = localCE

    # Create backend Computing Element
    ceFactory = ComputingElementFactory()
    self.ceName = ceType
    ceInstance = ceFactory.getCE( ceType )
    if not ceInstance['OK']:
      self.log.warn( ceInstance['Message'] )
      return ceInstance
    self.computingElement = ceInstance['Value']

    result = self.computingElement.getDescription()
    if not result['OK']:
      self.log.warn( "Can not get the CE description" )
      return result
    ceDict = result['Value']
    self.timeLeft = ceDict.get( 'CPUTime', self.timeLeft )
    self.timeLeft = gConfig.getValue( '/Resources/Computing/CEDefaults/MaxCPUTime', self.timeLeft )

    self.initTimes = os.times()
    # Localsite options
    self.siteName = gConfig.getValue( '/LocalSite/Site', self.siteName )
    self.pilotReference = gConfig.getValue( '/LocalSite/PilotReference', self.pilotReference )
    self.defaultProxyLength = gConfig.getValue( '/Registry/DefaultProxyLifeTime', self.defaultProxyLength )
    # Agent options
    # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
    self.cpuFactor = gConfig.getValue( '/LocalSite/CPUNormalizationFactor', self.cpuFactor )
    self.jobSubmissionDelay = self.am_getOption( 'SubmissionDelay', self.jobSubmissionDelay )
    self.fillingMode = self.am_getOption( 'FillingModeFlag', self.fillingMode )
    self.minimumTimeLeft = self.am_getOption( 'MinimumTimeLeft', self.minimumTimeLeft )
    self.stopOnApplicationFailure = self.am_getOption( 'StopOnApplicationFailure', self.stopOnApplicationFailure )
    self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches )
    self.extraOptions = gConfig.getValue( '/AgentJobRequirements/ExtraOptions', self.extraOptions )
    # Timeleft
    self.timeLeftUtil = TimeLeft()
    return S_OK()

  #############################################################################
  def execute( self ):
    """The JobAgent execution method.
    """
    if self.jobCount:
      # Temporary mechanism to pass a shutdown message to the agent
      if os.path.exists('/var/lib/dirac_drain'):
        return self.__finish( 'Node is being drained by an operator' )
      # Only call timeLeft utility after a job has been picked up
      self.log.info( 'Attempting to check CPU time left for filling mode' )
      if self.fillingMode:
        if self.timeLeftError:
          self.log.warn( self.timeLeftError )
          return self.__finish( self.timeLeftError )
        self.log.info( '%s normalized CPU units remaining in slot' % ( self.timeLeft ) )
        if self.timeLeft <= self.minimumTimeLeft:
          return self.__finish( 'No more time left' )
        # Need to update the Configuration so that the new value is published in the next matching request
        result = self.computingElement.setCPUTimeLeft( cpuTimeLeft = self.timeLeft )
        if not result['OK']:
          return self.__finish( result['Message'] )

        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
          localConfigFile = os.path.join( '.', self.extraOptions )
        else:
          localConfigFile = os.path.join( rootPath, "etc", "dirac.cfg" )
        localCfg.loadFromFile( localConfigFile )
        if not localCfg.isSection( '/LocalSite' ):
          localCfg.createNewSection( '/LocalSite' )
        localCfg.setOption( '/LocalSite/CPUTimeLeft', self.timeLeft )
        localCfg.writeToFile( localConfigFile )

      else:
        return self.__finish( 'Filling Mode is Disabled' )

    self.log.verbose( 'Job Agent execution loop' )
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
      self.log.info( 'Resource is not available' )
      self.log.info( available['Message'] )
      return self.__finish( 'CE Not Available' )

    self.log.info( available['Message'] )

    result = self.computingElement.getDescription()
    if not result['OK']:
      return result
    ceDict = result['Value']

    # Add pilot information
    gridCE = gConfig.getValue( 'LocalSite/GridCE', 'Unknown' )
    if gridCE != 'Unknown':
      ceDict['GridCE'] = gridCE
    if not 'PilotReference' in ceDict:
      ceDict['PilotReference'] = str( self.pilotReference )
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict( '/AgentJobRequirements' )
    if result['OK']:
      requirementsDict = result['Value']
      ceDict.update( requirementsDict )
      self.log.info( 'Requirements:', requirementsDict )

    processors, wholeNode = self.__getProcessors()
    ceDict['Processors'] = processors
    ceDict['WholeNode'] = wholeNode
    self.log.info( 'Configured number of processors: %d, WholeNode: %s' % ( processors, wholeNode ) )

    self.log.verbose( ceDict )
    start = time.time()
    jobRequest = self.__requestJob( ceDict )
    matchTime = time.time() - start
    self.log.info( 'MatcherTime = %.2f (s)' % ( matchTime ) )

    self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches )

    if not jobRequest['OK']:
      if re.search( 'No match found', jobRequest['Message'] ):
        self.log.notice( 'Job request OK: %s' % ( jobRequest['Message'] ) )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "seconds timeout" ) != -1:
        self.log.error( 'Timeout while requesting job', jobRequest['Message'] )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "Pilot version does not match" ) != -1 :
        errorMsg = 'Pilot version does not match the production version'
        self.log.error( errorMsg, jobRequest['Message'].replace( errorMsg, '' ) )
        return S_ERROR( jobRequest['Message'] )
      else:
        self.log.notice( 'Failed to get jobs: %s' % ( jobRequest['Message'] ) )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    if not self.pilotInfoReportedFlag:
      # Check the flag after the first access to the Matcher
      self.pilotInfoReportedFlag = matcherInfo.get( 'PilotInfoReportedFlag', False )
    jobID = matcherInfo['JobID']
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
      if param not in matcherInfo:
        self.__report( jobID, 'Failed', 'Matcher did not return %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      elif not matcherInfo[param]:
        self.__report( jobID, 'Failed', 'Matcher returned null %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      else:
        self.log.verbose( 'Matcher returned %s = %s ' % ( param, matcherInfo[param] ) )

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    optimizerParams = {}
    for key in matcherInfo:
      if key not in matcherParams:
        optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters( jobJDL )
    if not parameters['OK']:
      self.__report( jobID, 'Failed', 'Could Not Extract JDL Parameters' )
      self.log.warn( parameters['Message'] )
      return self.__finish( 'JDL Problem' )

    params = parameters['Value']
    if 'JobID' not in params:
      msg = 'Job has not JobID defined in JDL parameters'
      self.__report( jobID, 'Failed', msg )
      self.log.warn( msg )
      return self.__finish( 'JDL Problem' )
    else:
      jobID = params['JobID']

    if 'JobType' not in params:
      self.log.warn( 'Job has no JobType defined in JDL parameters' )
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if 'CPUTime' not in params:
      self.log.warn( 'Job has no CPU requirement defined in JDL parameters' )

    if self.extraOptions:
      params['Arguments'] += ' ' + self.extraOptions
      params['ExtraOptions'] = self.extraOptions

    self.log.verbose( 'Job request successful: \n', jobRequest['Value'] )
    self.log.info( 'Received JobID=%s, JobType=%s' % ( jobID, jobType ) )
    self.log.info( 'OwnerDN: %s JobGroup: %s' % ( ownerDN, jobGroup ) )
    self.jobCount += 1
    try:
      jobReport = JobReport( jobID, 'JobAgent@%s' % self.siteName )
      jobReport.setJobParameter( 'MatcherServiceTime', str( matchTime ), sendFlag = False )

      if 'BOINC_JOB_ID' in os.environ:
        # Report BOINC environment
        for p in ( 'BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName' ):
          jobReport.setJobParameter( p, gConfig.getValue( '/LocalSite/%s' % p, 'Unknown' ), sendFlag = False )

      jobReport.setJobStatus( 'Matched', 'Job Received by Agent' )
      result = self.__setupProxy( ownerDN, jobGroup )
      if not result[ 'OK' ]:
        return self.__rescheduleFailedJob( jobID, result[ 'Message' ], self.stopOnApplicationFailure )
      proxyChain = result.get( 'Value' )

      # Save the job jdl for external monitoring
      self.__saveJobJDLRequest( jobID, jobJDL )

      software = self.__checkInstallSoftware( jobID, params, ceDict )
      if not software['OK']:
        self.log.error( 'Failed to install software for job', '%s' % ( jobID ) )
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self.__rescheduleFailedJob( jobID, errorMsg, self.stopOnApplicationFailure )

      self.log.debug( 'Before %sCE submitJob()' % ( self.ceName ) )
      submission = self.__submitJob( jobID, params, ceDict, optimizerParams, proxyChain )
      if not submission['OK']:
        self.__report( jobID, 'Failed', submission['Message'] )
        return self.__finish( submission['Message'] )
      elif 'PayloadFailed' in submission:
        # Do not keep running and do not overwrite the Payload error
        message = 'Payload execution failed with error code %s' % submission['PayloadFailed']
        if self.stopOnApplicationFailure:
          return self.__finish( message, self.stopOnApplicationFailure )
        else:
          self.log.info( message )

      self.log.debug( 'After %sCE submitJob()' % ( self.ceName ) )
    except Exception:
      self.log.exception()
      return self.__rescheduleFailedJob( jobID , 'Job processing failed with exception', self.stopOnApplicationFailure )

    # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
    cpuTime = sum( os.times()[:-1] ) - sum( self.initTimes[:-1] )

    result = self.timeLeftUtil.getTimeLeft( cpuTime, processors )
    if result['OK']:
      self.timeLeft = result['Value']
    else:
      if result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
      else:
        # if the batch system is not defined, use the process time and the CPU normalization defined locally
        self.timeLeft = self.__getCPUTimeLeft()

    scaledCPUTime = self.timeLeftUtil.getScaledCPU( processors )
    self.__setJobParam( jobID, 'ScaledCPUTime', str( scaledCPUTime - self.scaledCPUTime ) )
    self.scaledCPUTime = scaledCPUTime

    return S_OK( 'Job Agent cycle complete' )

  #############################################################################
  def __saveJobJDLRequest( self, jobID, jobJDL ):
    """Save job JDL local to JobAgent.
    """
    classAdJob = ClassAd( jobJDL )
    classAdJob.insertAttributeString( 'LocalCE', self.ceName )
    jdlFileName = jobID + '.jdl'
    jdlFile = open( jdlFileName, 'w' )
    jdl = classAdJob.asJDL()
    jdlFile.write( jdl )
    jdlFile.close()

  #############################################################################
  def __getCPUTimeLeft( self ):
    """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config.
    """
    cpuTime = sum( os.times()[:-1] )
    self.log.info( 'Current raw CPU time consumed is %s' % cpuTime )
    timeleft = self.timeLeft
    if self.cpuFactor:
      timeleft -= cpuTime * self.cpuFactor
    return timeleft

  #############################################################################
  def __setupProxy( self, ownerDN, ownerGroup ):
    """
    Retrieve a proxy for the execution of the job
    """
    if gConfig.getValue( '/DIRAC/Security/UseServerCertificate' , False ):
      proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup )
      if not proxyResult['OK']:
        self.log.error( 'Failed to setup proxy', proxyResult['Message'] )
        return S_ERROR( 'Failed to setup proxy: %s' % proxyResult[ 'Message' ] )
      return S_OK( proxyResult['Value'] )
    else:
      ret = getProxyInfo( disableVOMS = True )
      if not ret['OK']:
        self.log.error( 'Invalid Proxy', ret['Message'] )
        return S_ERROR( 'Invalid Proxy' )

      proxyChain = ret['Value']['chain']
      if not 'groupProperties' in ret['Value']:
        print ret['Value']
        print proxyChain.dumpAllToString()
        self.log.error( 'Invalid Proxy', 'Group has no properties defined' )
        return S_ERROR( 'Proxy has no group properties defined' )

      groupProps = ret['Value']['groupProperties']
      if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
        proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup )
        if not proxyResult['OK']:
          self.log.error( 'Invalid Proxy', proxyResult['Message'] )
          return S_ERROR( 'Failed to setup proxy: %s' % proxyResult[ 'Message' ] )
        proxyChain = proxyResult['Value']

    return S_OK( proxyChain )

  #############################################################################
  def __requestProxyFromProxyManager( self, ownerDN, ownerGroup ):
    """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.
    """

    self.log.info( "Requesting proxy for %s@%s" % ( ownerDN, ownerGroup ) )
    token = gConfig.getValue( "/Security/ProxyToken", "" )
    if not token:
      self.log.info( "No token defined. Trying to download proxy without token" )
      token = False
    retVal = gProxyManager.getPayloadProxyFromDIRACGroup( ownerDN, ownerGroup,
                                                          self.defaultProxyLength, token )
    if not retVal[ 'OK' ]:
      self.log.error( 'Could not retrieve payload proxy', retVal['Message'] )
      self.log.warn( retVal )
      os.system( 'dirac-proxy-info' )
      sys.stdout.flush()
      return S_ERROR( 'Error retrieving proxy' )

    chain = retVal[ 'Value' ]
    return S_OK( chain )

  #############################################################################
  def __checkInstallSoftware( self, jobID, jobParams, resourceParams ):
    """Checks software requirement of job and whether this is already present
       before installing software locally.
    """
    if 'SoftwareDistModule' not in jobParams:
      msg = 'Job has no software installation requirement'
      self.log.verbose( msg )
      return S_OK( msg )

    self.__report( jobID, 'Matched', 'Installing Software' )
    softwareDist = jobParams['SoftwareDistModule']
    self.log.verbose( 'Found VO Software Distribution module: %s' % ( softwareDist ) )
    argumentsDict = {'Job':jobParams, 'CE':resourceParams}
    moduleFactory = ModuleFactory()
    moduleInstance = moduleFactory.getModule( softwareDist, argumentsDict )
    if not moduleInstance['OK']:
      return moduleInstance

    module = moduleInstance['Value']
    return module.execute()

  #############################################################################
  def __submitJob( self, jobID, jobParams, resourceParams, optimizerParams, proxyChain ):
    """ Submit job to the Computing Element instance after creating a custom
        Job Wrapper with the available job parameters.
    """
    logLevel = self.am_getOption( 'DefaultLogLevel', 'INFO' )
    defaultWrapperLocation = self.am_getOption( 'JobWrapperTemplate',
                                                'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py' )
    result = createJobWrapper( jobID, jobParams, resourceParams, optimizerParams,
                               extraOptions = self.extraOptions, defaultWrapperLocation = defaultWrapperLocation,
                               log = self.log, logLevel = logLevel )
    if not result['OK']:
      return result

    wrapperFile = result['Value']
    self.__report( jobID, 'Matched', 'Submitted To CE' )

    self.log.info( 'Submitting JobWrapper %s to %sCE' % ( os.path.basename( wrapperFile ), self.ceName ) )

    # Pass proxy to the CE
    proxy = proxyChain.dumpAllToString()
    if not proxy['OK']:
      self.log.error( 'Invalid proxy', proxy )
      return S_ERROR( 'Payload Proxy Not Found' )

    payloadProxy = proxy['Value']
    # FIXME: how can we set the batchID before we submit, this makes no sense
    batchID = 'dc%s' % ( jobID )
    submission = self.computingElement.submitJob( wrapperFile, payloadProxy )

    ret = S_OK( 'Job submitted' )

    if submission['OK']:
      batchID = submission['Value']
      self.log.info( 'Job %s submitted as %s' % ( jobID, batchID ) )
      self.log.verbose( 'Set JobParameter: Local batch ID %s' % ( batchID ) )
      self.__setJobParam( jobID, 'LocalBatchID', str( batchID ) )
      if 'PayloadFailed' in submission:
        ret['PayloadFailed'] = submission['PayloadFailed']
        return ret
      time.sleep( self.jobSubmissionDelay )
    else:
      self.log.error( 'Job submission failed', jobID )
      self.__setJobParam( jobID, 'ErrorMessage', '%s CE Submission Error' % ( self.ceName ) )
      if 'ReschedulePayload' in submission:
        rescheduleFailedJob( jobID, submission['Message'] )
        return S_OK()  # Without this job is marked as failed at line 265 above
      else:
        if 'Value' in submission:
          self.log.error( 'Error in DIRAC JobWrapper:', 'exit code = %s' % ( str( submission['Value'] ) ) )
      return S_ERROR( '%s CE Error: %s' % ( self.ceName, submission['Message'] ) )

    return ret

  #############################################################################
  def __requestJob( self, ceDict ):
    """Request a single job from the matcher service.
    """
    matcher = RPCClient( 'WorkloadManagement/Matcher', timeout = 600 )
    return matcher.requestJob( ceDict )

  #############################################################################
  def __getJDLParameters( self, jdl ):
    """Returns a dictionary of JDL parameters.
    """
    try:
      parameters = {}
#      print jdl
      if not re.search( r'\[', jdl ):
        jdl = '[' + jdl + ']'
      classAdJob = ClassAd( jdl )
      paramsDict = classAdJob.contents
      for param, value in paramsDict.items():
        if value.strip().startswith( '{' ):
          self.log.debug( 'Found list type parameter %s' % ( param ) )
          rawValues = value.replace( '{', '' ).replace( '}', '' ).replace( '"', '' ).split()
          valueList = []
          for val in rawValues:
            if re.search( ',$', val ):
              valueList.append( val[:-1] )
            else:
              valueList.append( val )
          parameters[param] = valueList
        else:
          parameters[param] = value.replace( '"', '' ).replace( '{', '"{' ).replace( '}', '}"' )
          self.log.debug( 'Found standard parameter %s: %s' % ( param, parameters[param] ) )
      return S_OK( parameters )
    except Exception as x:
      self.log.exception( lException = x )
      return S_ERROR( 'Exception while extracting JDL parameters for job' )

  #############################################################################
  def __report( self, jobID, status, minorStatus ):
    """Wraps around setJobStatus of state update client
    """
    jobReport = RPCClient( 'WorkloadManagement/JobStateUpdate' )
    jobStatus = jobReport.setJobStatus( int( jobID ), status, minorStatus, 'JobAgent@%s' % self.siteName )
    self.log.verbose( 'setJobStatus(%s,%s,%s,%s)' % ( jobID, status, minorStatus, 'JobAgent@%s' % self.siteName ) )
    if not jobStatus['OK']:
      self.log.warn( jobStatus['Message'] )

    return jobStatus

  #############################################################################
  # FIXME: this is not called anywhere...?
  def __setJobSite( self, jobID, site ):
    """Wraps around setJobSite of state update client
    """
    jobReport = RPCClient( 'WorkloadManagement/JobStateUpdate' )
    jobSite = jobReport.setJobSite( int( jobID ), site )
    self.log.verbose( 'setJobSite(%s,%s)' % ( jobID, site ) )
    if not jobSite['OK']:
      self.log.warn( jobSite['Message'] )

    return jobSite

  #############################################################################
  def __setJobParam( self, jobID, name, value ):
    """Wraps around setJobParameter of state update client
    """
    jobReport = RPCClient( 'WorkloadManagement/JobStateUpdate' )
    jobParam = jobReport.setJobParameter( int( jobID ), str( name ), str( value ) )
    self.log.verbose( 'setJobParameter(%s,%s,%s)' % ( jobID, name, value ) )
    if not jobParam['OK']:
      self.log.warn( jobParam['Message'] )

    return jobParam

  #############################################################################
  def __finish( self, message, stop = True ):
    """Force the JobAgent to complete gracefully.
    """
    if stop:
      self.log.info( 'JobAgent will stop with message "%s", execution complete.' % message )
      self.am_stopExecution()
      return S_ERROR( message )
    else:
      return S_OK( message )

  #############################################################################
  def __rescheduleFailedJob( self, jobID, message, stop = True ):
    """
    Set Job Status to "Rescheduled" and issue a reschedule command to the Job Manager
    """

    self.log.warn( 'Failure during %s' % ( message ) )

    jobManager = RPCClient( 'WorkloadManagement/JobManager' )
    jobReport = JobReport( int( jobID ), 'JobAgent@%s' % self.siteName )

    # Setting a job parameter does not help since the job will be rescheduled,
    # instead set the status with the cause and then another status showing the
    # reschedule operation.

    jobReport.setJobStatus( status = 'Rescheduled',
                            application = message,
                            sendFlag = True )

    self.log.info( 'Job will be rescheduled' )
    result = jobManager.rescheduleJob( jobID )
    if not result['OK']:
      self.log.error( 'Failed to reschedule job', result['Message'] )
      return self.__finish( 'Problem Rescheduling Job', stop )

    self.log.info( 'Job Rescheduled %s' % ( jobID ) )
    return self.__finish( 'Job Rescheduled', stop )

  #############################################################################
  def __getProcessors( self ):
    """
    Return number of processors from gConfig and a boolean corresponding to WholeNode option
    """
    tag = gConfig.getValue( '/Resources/Computing/CEDefaults/Tag', None )

    if tag is None: return 1, False

    self.log.verbose( "__getProcessors: /Resources/Computing/CEDefaults/Tag", repr( tag ) )

    # look for a pattern like "12345Processors" in tag list
    m = re.match( r'^(.*\D)?(?P<processors>\d+)Processors([ \t,].*)?$', tag )
    if m:
      return int( m.group( 'processors' ) ), False

    # In WholeNode case, detect number of cores from the host
    if re.match( r'^(.*,\s*)?WholeNode([ \t,].*)?$', tag ):
      return getNumberOfCores(), True

    return 1, False

  #############################################################################
  def finalize( self ):
    """ Job Agent finalization method
    """

    gridCE = gConfig.getValue( '/LocalSite/GridCE', '' )
    queue = gConfig.getValue( '/LocalSite/CEQueue', '' )
    wmsAdmin = RPCClient( 'WorkloadManagement/WMSAdministrator' )
    result = wmsAdmin.setPilotStatus( str( self.pilotReference ), 'Done', gridCE,
                                      'Report from JobAgent', self.siteName, queue )
    if not result['OK']:
      self.log.warn( result['Message'] )

    return S_OK()
Beispiel #8
0
class JobAgent(AgentModule):
    """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
  """

    #############################################################################
    def initialize(self, loops=0):
        """Sets default parameters and creates CE instance
    """
        # Disable monitoring
        self.am_setOption("MonitoringEnabled", False)
        # self.log.setLevel('debug') #temporary for debugging
        self.am_setOption("MaxCycles", loops)

        ceType = self.am_getOption("CEType", "InProcess")
        localCE = gConfig.getValue("/LocalSite/LocalCE", "")
        if localCE:
            self.log.info("Defining CE from local configuration = %s" % localCE)
            ceType = localCE

        ceFactory = ComputingElementFactory()
        self.ceName = ceType
        ceInstance = ceFactory.getCE(ceType)
        if not ceInstance["OK"]:
            self.log.warn(ceInstance["Message"])
            return ceInstance

        self.initTimes = os.times()

        self.computingElement = ceInstance["Value"]
        self.diracRoot = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
        # Localsite options
        self.siteRoot = gConfig.getValue("/LocalSite/Root", os.getcwd())
        self.siteName = gConfig.getValue("/LocalSite/Site", "Unknown")
        self.pilotReference = gConfig.getValue("/LocalSite/PilotReference", "Unknown")
        self.defaultProxyLength = gConfig.getValue("/Registry/DefaultProxyLifeTime", 86400 * 5)
        # Agent options
        # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
        self.cpuFactor = gConfig.getValue("/LocalSite/CPUNormalizationFactor", 0.0)
        defaultWrapperLocation = "DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py"
        self.jobWrapperTemplate = os.path.join(
            self.diracRoot, self.am_getOption("JobWrapperTemplate", defaultWrapperLocation)
        )
        self.jobSubmissionDelay = self.am_getOption("SubmissionDelay", 10)
        self.defaultLogLevel = self.am_getOption("DefaultLogLevel", "info")
        self.fillingMode = self.am_getOption("FillingModeFlag", False)
        self.stopOnApplicationFailure = self.am_getOption("StopOnApplicationFailure", True)
        self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", 10)
        self.jobCount = 0
        self.matchFailedCount = 0
        # Timeleft
        self.timeLeftUtil = TimeLeft()
        self.timeLeft = gConfig.getValue("/Resources/Computing/CEDefaults/MaxCPUTime", 0.0)
        self.gridCEQueue = gConfig.getValue("/Resources/Computing/CEDefaults/GridCEQueue", "")
        self.timeLeftError = ""
        self.scaledCPUTime = 0.0
        self.pilotInfoReportedFlag = False
        return S_OK()

    #############################################################################
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info("Attempting to check CPU time left for filling mode")
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info("%s normalized CPU units remaining in slot" % (self.timeLeft))
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
                if not result["OK"]:
                    return self.__finish(result["Message"])
            else:
                return self.__finish("Filling Mode is Disabled")

        self.log.verbose("Job Agent execution loop")
        available = self.computingElement.available()
        if not available["OK"] or not available["Value"]:
            self.log.info("Resource is not available")
            self.log.info(available["Message"])
            return self.__finish("CE Not Available")

        self.log.info(available["Message"])

        result = self.computingElement.getDescription()
        if not result["OK"]:
            return result
        ceDict = result["Value"]

        # Add pilot information
        gridCE = gConfig.getValue("LocalSite/GridCE", "Unknown")
        if gridCE != "Unknown":
            ceDict["GridCE"] = gridCE
        if not "PilotReference" in ceDict:
            ceDict["PilotReference"] = str(self.pilotReference)
        ceDict["PilotBenchmark"] = self.cpuFactor
        ceDict["PilotInfoReportedFlag"] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict("/AgentJobRequirements")
        if result["OK"]:
            requirementsDict = result["Value"]
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info("MatcherTime = %.2f (s)" % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", self.stopAfterFailedMatches)

        if not jobRequest["OK"]:
            if re.search("No match found", jobRequest["Message"]):
                self.log.notice("Job request OK: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("seconds timeout") != -1:
                self.log.error(jobRequest["Message"])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("Pilot version does not match") != -1:
                self.log.error(jobRequest["Message"])
                return S_ERROR(jobRequest["Message"])
            else:
                self.log.notice("Failed to get jobs: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest["Value"]
        jobID = matcherInfo["JobID"]
        self.pilotInfoReportedFlag = matcherInfo.get("PilotInfoReportedFlag", False)
        matcherParams = ["JDL", "DN", "Group"]
        for param in matcherParams:
            if not matcherInfo.has_key(param):
                self.__report(jobID, "Failed", "Matcher did not return %s" % (param))
                return self.__finish("Matcher Failed")
            elif not matcherInfo[param]:
                self.__report(jobID, "Failed", "Matcher returned null %s" % (param))
                return self.__finish("Matcher Failed")
            else:
                self.log.verbose("Matcher returned %s = %s " % (param, matcherInfo[param]))

        jobJDL = matcherInfo["JDL"]
        jobGroup = matcherInfo["Group"]
        ownerDN = matcherInfo["DN"]

        optimizerParams = {}
        for key in matcherInfo.keys():
            if not key in matcherParams:
                value = matcherInfo[key]
                optimizerParams[key] = value

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters["OK"]:
            self.__report(jobID, "Failed", "Could Not Extract JDL Parameters")
            self.log.warn(parameters["Message"])
            return self.__finish("JDL Problem")

        params = parameters["Value"]
        if not params.has_key("JobID"):
            msg = "Job has not JobID defined in JDL parameters"
            self.__report(jobID, "Failed", msg)
            self.log.warn(msg)
            return self.__finish("JDL Problem")
        else:
            jobID = params["JobID"]

        if not params.has_key("JobType"):
            self.log.warn("Job has no JobType defined in JDL parameters")
            jobType = "Unknown"
        else:
            jobType = params["JobType"]

        if not params.has_key("SystemConfig"):
            self.log.warn("Job has no system configuration defined in JDL parameters")
            systemConfig = gConfig.getValue("/LocalSite/Architecture", "")
            self.log.info(
                "Setting system config to /LocalSite/Architecture = %s since it was not specified" % systemConfig
            )
            if not systemConfig:
                self.log.warn("/LocalSite/Architecture is not defined")
            params["SystemConfig"] = systemConfig
        else:
            systemConfig = params["SystemConfig"]
            if systemConfig.lower() == "any":
                systemConfig = gConfig.getValue("/LocalSite/Architecture", "")
                self.log.info(
                    "Setting SystemConfig = /LocalSite/Architecture =",
                    '"%s" since it was set to "ANY" in the job description' % systemConfig,
                )
                if not systemConfig:
                    self.log.warn("/LocalSite/Architecture is not defined")
                params["SystemConfig"] = systemConfig

        if not params.has_key("CPUTime"):
            self.log.warn("Job has no CPU requirement defined in JDL parameters")

        self.log.verbose("Job request successful: \n %s" % (jobRequest["Value"]))
        self.log.info("Received JobID=%s, JobType=%s, SystemConfig=%s" % (jobID, jobType, systemConfig))
        self.log.info("OwnerDN: %s JobGroup: %s" % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
            jobReport.setJobParameter("MatcherServiceTime", str(matchTime), sendFlag=False)
            if self.gridCEQueue:
                jobReport.setJobParameter("GridCEQueue", self.gridCEQueue, sendFlag=False)

            if os.environ.has_key("BOINC_JOB_ID"):
                # Report BOINC environment
                for p in ["BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"]:
                    jobReport.setJobParameter(p, gConfig.getValue("/LocalSite/%s" % p, "Unknown"), sendFlag=False)

            jobReport.setJobStatus("Matched", "Job Received by Agent")
            # self.__setJobSite( jobID, self.siteName )
            if not self.pilotInfoReportedFlag:
                self.__reportPilotInfo(jobID)
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result["OK"]:
                return self.__rescheduleFailedJob(jobID, result["Message"], params, self.stopOnApplicationFailure)
            if "Value" in result and result["Value"]:
                proxyChain = result["Value"]

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software["OK"]:
                self.log.error("Failed to install software for job %s" % (jobID))
                errorMsg = software["Message"]
                if not errorMsg:
                    errorMsg = "Failed software installation"
                return self.__rescheduleFailedJob(jobID, errorMsg, params, self.stopOnApplicationFailure)

            self.log.verbose("Before %sCE submitJob()" % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict, optimizerParams, jobJDL, proxyChain)
            if not submission["OK"]:
                self.__report(jobID, "Failed", submission["Message"])
                return self.__finish(submission["Message"])
            elif "PayloadFailed" in submission:
                # Do not keep running and do not overwrite the Payload error
                return self.__finish(
                    "Payload execution failed with error code %s" % submission["PayloadFailed"],
                    self.stopOnApplicationFailure,
                )

            self.log.verbose("After %sCE submitJob()" % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, "Job processing failed with exception", params, self.stopOnApplicationFailure
            )

        currentTimes = list(os.times())
        for i in range(len(currentTimes)):
            currentTimes[i] -= self.initTimes[i]

        utime, stime, cutime, cstime, elapsed = currentTimes
        cpuTime = utime + stime + cutime + cstime

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result["OK"]:
            self.timeLeft = result["Value"]
        else:
            if result["Message"] != "Current batch system is not supported":
                self.timeLeftError = result["Message"]
            else:
                if self.cpuFactor:
                    # if the batch system is not defined used the CPUNormalizationFactor
                    # defined locally
                    self.timeLeft = self.__getCPUTimeLeft()
        scaledCPUTime = self.timeLeftUtil.getScaledCPU()["Value"]

        self.__setJobParam(jobID, "ScaledCPUTime", str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK("Job Agent cycle complete")

    #############################################################################
    def __getCPUTimeLeft(self):
        """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config.
    """
        utime, stime, cutime, cstime, elapsed = os.times()
        cpuTime = utime + stime + cutime
        self.log.info("Current raw CPU time consumed is %s" % cpuTime)
        timeleft = self.timeLeft - cpuTime * self.cpuFactor
        return timeleft

    #############################################################################
    def __changeProxy(self, oldProxy, newProxy):
        """Can call glexec utility here to set uid or simply log the changeover
       of a proxy.
    """
        self.log.verbose("Log proxy change (to be instrumented)")
        return S_OK()

    #############################################################################
    def __setupProxy(self, ownerDN, ownerGroup):
        """
    Retrieve a proxy for the execution of the job
    """
        if gConfig.getValue("/DIRAC/Security/UseServerCertificate", False):
            proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup)
            if not proxyResult["OK"]:
                self.log.error("Invalid Proxy", proxyResult["Message"])
                return S_ERROR("Failed to setup proxy: %s" % proxyResult["Message"])
            return S_OK(proxyResult["Value"])
        else:
            ret = getProxyInfo(disableVOMS=True)
            if not ret["OK"]:
                self.log.error("Invalid Proxy", ret["Message"])
                return S_ERROR("Invalid Proxy")

            proxyChain = ret["Value"]["chain"]
            if not "groupProperties" in ret["Value"]:
                print ret["Value"]
                print proxyChain.dumpAllToString()
                self.log.error("Invalid Proxy", "Group has no properties defined")
                return S_ERROR("Proxy has no group properties defined")

            groupProps = ret["Value"]["groupProperties"]
            if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
                proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup)
                if not proxyResult["OK"]:
                    self.log.error("Invalid Proxy", proxyResult["Message"])
                    return S_ERROR("Failed to setup proxy: %s" % proxyResult["Message"])
                proxyChain = proxyResult["Value"]

        return S_OK(proxyChain)

    #############################################################################
    def __requestProxyFromProxyManager(self, ownerDN, ownerGroup):
        """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.
    """

        self.log.info("Requesting proxy for %s@%s" % (ownerDN, ownerGroup))
        token = gConfig.getValue("/Security/ProxyToken", "")
        if not token:
            self.log.info("No token defined. Trying to download proxy without token")
            token = False
        retVal = gProxyManager.getPayloadProxyFromDIRACGroup(ownerDN, ownerGroup, self.defaultProxyLength, token)
        if not retVal["OK"]:
            self.log.error("Could not retrieve proxy")
            self.log.warn(retVal)
            os.system("dirac-proxy-info")
            sys.stdout.flush()
            return S_ERROR("Error retrieving proxy")

        chain = retVal["Value"]
        return S_OK(chain)

    #############################################################################
    def __checkInstallSoftware(self, jobID, jobParams, resourceParams):
        """Checks software requirement of job and whether this is already present
       before installing software locally.
    """
        if not jobParams.has_key("SoftwareDistModule"):
            msg = "Job has no software installation requirement"
            self.log.verbose(msg)
            return S_OK(msg)

        self.__report(jobID, "Matched", "Installing Software")
        softwareDist = jobParams["SoftwareDistModule"]
        self.log.verbose("Found VO Software Distribution module: %s" % (softwareDist))
        argumentsDict = {"Job": jobParams, "CE": resourceParams}
        moduleFactory = ModuleFactory()
        moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict)
        if not moduleInstance["OK"]:
            return moduleInstance

        module = moduleInstance["Value"]
        result = module.execute()
        return result

    #############################################################################
    def __submitJob(self, jobID, jobParams, resourceParams, optimizerParams, jobJDL, proxyChain):
        """Submit job to the Computing Element instance after creating a custom
       Job Wrapper with the available job parameters.
    """
        result = self.__createJobWrapper(jobID, jobParams, resourceParams, optimizerParams)

        if not result["OK"]:
            return result

        wrapperData = result["Value"]
        wrapperFile = wrapperData["execFile"]
        self.__report(jobID, "Matched", "Submitted To CE")

        wrapperName = os.path.basename(wrapperFile)
        self.log.info("Submitting %s to %sCE" % (wrapperName, self.ceName))

        # Pass proxy to the CE
        proxy = proxyChain.dumpAllToString()
        if not proxy["OK"]:
            self.log.error(proxy)
            return S_ERROR("Payload Proxy Not Found")

        payloadProxy = proxy["Value"]
        # FIXME: how can we set the batchID before we submit, this makes no sense
        batchID = "dc%s" % (jobID)
        submission = self.computingElement.submitJob(wrapperFile, payloadProxy, wrapperData)

        ret = S_OK("Job submitted")

        if submission["OK"]:
            batchID = submission["Value"]
            self.log.info("Job %s submitted as %s" % (jobID, batchID))
            self.log.verbose("Set JobParameter: Local batch ID %s" % (batchID))
            self.__setJobParam(jobID, "LocalBatchID", str(batchID))
            if "PayloadFailed" in submission:
                ret["PayloadFailed"] = submission["PayloadFailed"]
                return ret
            time.sleep(self.jobSubmissionDelay)
        else:
            self.log.error("Job submission failed", jobID)
            self.__setJobParam(jobID, "ErrorMessage", "%s CE Submission Error" % (self.ceName))
            if "ReschedulePayload" in submission:
                rescheduleFailedJob(jobID, submission["Message"], self.__report)
            else:
                if "Value" in submission:
                    self.log.error("Error in DIRAC JobWrapper:", "exit code = %s" % (str(submission["Value"])))
                # make sure the Job is declared Failed
                self.__report(jobID, "Failed", submission["Message"])
            return S_ERROR("%s CE Submission Error: %s" % (self.ceName, submission["Message"]))

        return ret

    #############################################################################
    def __createJobWrapper(self, jobID, jobParams, resourceParams, optimizerParams):
        """This method creates a job wrapper filled with the CE and Job parameters
       to executed the job.
    """
        arguments = {"Job": jobParams, "CE": resourceParams, "Optimizer": optimizerParams}
        self.log.verbose("Job arguments are: \n %s" % (arguments))

        workingDir = gConfig.getValue("/LocalSite/WorkingDirectory", self.siteRoot)
        if not os.path.exists("%s/job/Wrapper" % (workingDir)):
            try:
                os.makedirs("%s/job/Wrapper" % (workingDir))
            except Exception:
                self.log.exception()
                return S_ERROR("Could not create directory for wrapper script")

        jobWrapperFile = "%s/job/Wrapper/Wrapper_%s" % (workingDir, jobID)
        if os.path.exists(jobWrapperFile):
            self.log.verbose("Removing existing Job Wrapper for %s" % (jobID))
            os.remove(jobWrapperFile)
        fd = open(self.jobWrapperTemplate, "r")
        wrapperTemplate = fd.read()
        fd.close()

        dateStr = time.strftime("%Y-%m-%d", time.localtime(time.time()))
        timeStr = time.strftime("%H:%M", time.localtime(time.time()))
        date_time = "%s %s" % (dateStr, timeStr)
        signature = __RCSID__
        dPython = sys.executable

        systemConfig = ""
        if jobParams.has_key("SystemConfig"):
            systemConfig = jobParams["SystemConfig"]
            self.log.verbose("Job system configuration requirement is %s" % (systemConfig))
            if resourceParams.has_key("Root"):
                jobPython = "%s/%s/bin/python" % (resourceParams["Root"], systemConfig)
                if os.path.exists(jobPython):
                    self.log.verbose("Found local python for job:\n%s" % (jobPython))
                    dPython = jobPython
                else:
                    if systemConfig == "ANY":
                        self.log.verbose("Using standard available python %s for job" % (dPython))
                    else:
                        self.log.warn("Job requested python \n%s\n but this is not available locally" % (jobPython))
            else:
                self.log.warn("No LocalSite/Root defined")
        else:
            self.log.warn("Job has no system configuration requirement")

        if not systemConfig or systemConfig.lower() == "any":
            systemConfig = gConfig.getValue("/LocalSite/Architecture", "")
            if not systemConfig:
                self.log.warn("Could not establish SystemConfig")

        logLevel = self.defaultLogLevel
        if jobParams.has_key("LogLevel"):
            logLevel = jobParams["LogLevel"]
            self.log.info("Found Job LogLevel JDL parameter with value: %s" % (logLevel))
        else:
            self.log.info("Applying default LogLevel JDL parameter with value: %s" % (logLevel))

        realPythonPath = os.path.realpath(dPython)
        self.log.debug("Real python path after resolving links is:")
        self.log.debug(realPythonPath)
        dPython = realPythonPath

        siteRootPython = self.siteRoot
        self.log.debug("DIRACPython is:\n%s" % dPython)
        self.log.debug("SiteRootPythonDir is:\n%s" % siteRootPython)
        wrapperTemplate = wrapperTemplate.replace("@SIGNATURE@", str(signature))
        wrapperTemplate = wrapperTemplate.replace("@JOBID@", str(jobID))
        wrapperTemplate = wrapperTemplate.replace("@DATESTRING@", str(date_time))
        wrapperTemplate = wrapperTemplate.replace("@JOBARGS@", str(arguments))
        wrapperTemplate = wrapperTemplate.replace("@SITEPYTHON@", str(siteRootPython))
        wrapper = open(jobWrapperFile, "w")
        wrapper.write(wrapperTemplate)
        wrapper.close()
        jobExeFile = "%s/job/Wrapper/Job%s" % (workingDir, jobID)
        jobFileContents = """#!/bin/sh
%s %s -o LogLevel=%s -o /DIRAC/Security/UseServerCertificate=no
""" % (
            dPython,
            jobWrapperFile,
            logLevel,
        )
        jobFile = open(jobExeFile, "w")
        jobFile.write(jobFileContents)
        jobFile.close()

        data = {
            "execFile": jobExeFile,
            "wrapperTemplate": self.jobWrapperTemplate,
            "signature": signature,
            "wrapperFile": jobWrapperFile,
            "logLevel": logLevel,
            "jid": jobID,
            "datetime": date_time,
            "jobArgs": arguments,
            "sitePython": siteRootPython,
        }
        return S_OK(data)

    #############################################################################
    def __requestJob(self, ceDict):
        """Request a single job from the matcher service.
    """
        try:
            matcher = RPCClient("WorkloadManagement/Matcher", timeout=600)
            result = matcher.requestJob(ceDict)
            return result
        except Exception, x:
            self.log.exception(lException=x)
            return S_ERROR("Job request to matcher service failed with exception")
Beispiel #9
0
class JobAgent(AgentModule):
    """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration.
  """

    #############################################################################
    def initialize(self, loops=0):
        """Sets default parameters and creates CE instance
    """
        # Disable monitoring
        self.am_setOption("MonitoringEnabled", False)
        # self.log.setLevel('debug') #temporary for debugging
        self.am_setOption("MaxCycles", loops)

        ceType = self.am_getOption("CEType", "InProcess")
        localCE = gConfig.getValue("/LocalSite/LocalCE", "")
        if localCE:
            self.log.info("Defining CE from local configuration = %s" % localCE)
            ceType = localCE

        # Create backend Computing Element
        ceFactory = ComputingElementFactory()
        self.ceName = ceType
        ceInstance = ceFactory.getCE(ceType)
        if not ceInstance["OK"]:
            self.log.warn(ceInstance["Message"])
            return ceInstance
        self.computingElement = ceInstance["Value"]

        result = self.computingElement.getDescription()
        if not result["OK"]:
            self.log.warn("Can not get the CE description")
            return result
        ceDict = result["Value"]
        self.timeLeft = ceDict.get("CPUTime", 0.0)
        self.timeLeft = gConfig.getValue("/Resources/Computing/CEDefaults/MaxCPUTime", self.timeLeft)

        self.initTimes = os.times()
        # Localsite options
        self.siteName = gConfig.getValue("/LocalSite/Site", "Unknown")
        self.pilotReference = gConfig.getValue("/LocalSite/PilotReference", "Unknown")
        self.defaultProxyLength = gConfig.getValue("/Registry/DefaultProxyLifeTime", 86400 * 5)
        # Agent options
        # This is the factor to convert raw CPU to Normalized units (based on the CPU Model)
        self.cpuFactor = gConfig.getValue("/LocalSite/CPUNormalizationFactor", 0.0)
        self.jobSubmissionDelay = self.am_getOption("SubmissionDelay", 10)
        self.fillingMode = self.am_getOption("FillingModeFlag", False)
        self.minimumTimeLeft = self.am_getOption("MinimumTimeLeft", 1000)
        self.stopOnApplicationFailure = self.am_getOption("StopOnApplicationFailure", True)
        self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", 10)
        self.jobCount = 0
        self.matchFailedCount = 0
        self.extraOptions = gConfig.getValue("/AgentJobRequirements/ExtraOptions", "")
        # Timeleft
        self.timeLeftUtil = TimeLeft()
        self.timeLeftError = ""
        self.scaledCPUTime = 0.0
        self.pilotInfoReportedFlag = False
        return S_OK()

    #############################################################################
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info("Attempting to check CPU time left for filling mode")
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info("%s normalized CPU units remaining in slot" % (self.timeLeft))
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish("No more time left")
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
                if not result["OK"]:
                    return self.__finish(result["Message"])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join(".", self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection("/LocalSite"):
                    localCfg.createNewSection("/LocalSite")
                localCfg.setOption("/LocalSite/CPUTimeLeft", self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish("Filling Mode is Disabled")

        self.log.verbose("Job Agent execution loop")
        available = self.computingElement.available()
        if not available["OK"] or not available["Value"]:
            self.log.info("Resource is not available")
            self.log.info(available["Message"])
            return self.__finish("CE Not Available")

        self.log.info(available["Message"])

        result = self.computingElement.getDescription()
        if not result["OK"]:
            return result
        ceDict = result["Value"]

        # Add pilot information
        gridCE = gConfig.getValue("LocalSite/GridCE", "Unknown")
        if gridCE != "Unknown":
            ceDict["GridCE"] = gridCE
        if not "PilotReference" in ceDict:
            ceDict["PilotReference"] = str(self.pilotReference)
        ceDict["PilotBenchmark"] = self.cpuFactor
        ceDict["PilotInfoReportedFlag"] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict("/AgentJobRequirements")
        if result["OK"]:
            requirementsDict = result["Value"]
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info("MatcherTime = %.2f (s)" % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", self.stopAfterFailedMatches)

        if not jobRequest["OK"]:
            if re.search("No match found", jobRequest["Message"]):
                self.log.notice("Job request OK: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("seconds timeout") != -1:
                self.log.error("Timeout while requesting job", jobRequest["Message"])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("Pilot version does not match") != -1:
                errorMsg = "Pilot version does not match the production version"
                self.log.error(errorMsg, jobRequest["Message"].replace(errorMsg, ""))
                return S_ERROR(jobRequest["Message"])
            else:
                self.log.notice("Failed to get jobs: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest["Value"]
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get("PilotInfoReportedFlag", False)
        jobID = matcherInfo["JobID"]
        matcherParams = ["JDL", "DN", "Group"]
        for param in matcherParams:
            if param not in matcherInfo:
                self.__report(jobID, "Failed", "Matcher did not return %s" % (param))
                return self.__finish("Matcher Failed")
            elif not matcherInfo[param]:
                self.__report(jobID, "Failed", "Matcher returned null %s" % (param))
                return self.__finish("Matcher Failed")
            else:
                self.log.verbose("Matcher returned %s = %s " % (param, matcherInfo[param]))

        jobJDL = matcherInfo["JDL"]
        jobGroup = matcherInfo["Group"]
        ownerDN = matcherInfo["DN"]

        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters["OK"]:
            self.__report(jobID, "Failed", "Could Not Extract JDL Parameters")
            self.log.warn(parameters["Message"])
            return self.__finish("JDL Problem")

        params = parameters["Value"]
        if "JobID" not in params:
            msg = "Job has not JobID defined in JDL parameters"
            self.__report(jobID, "Failed", msg)
            self.log.warn(msg)
            return self.__finish("JDL Problem")
        else:
            jobID = params["JobID"]

        if "JobType" not in params:
            self.log.warn("Job has no JobType defined in JDL parameters")
            jobType = "Unknown"
        else:
            jobType = params["JobType"]

        if "CPUTime" not in params:
            self.log.warn("Job has no CPU requirement defined in JDL parameters")

        if self.extraOptions:
            params["Arguments"] += " " + self.extraOptions
            params["ExtraOptions"] = self.extraOptions

        self.log.verbose("Job request successful: \n", jobRequest["Value"])
        self.log.info("Received JobID=%s, JobType=%s" % (jobID, jobType))
        self.log.info("OwnerDN: %s JobGroup: %s" % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
            jobReport.setJobParameter("MatcherServiceTime", str(matchTime), sendFlag=False)

            if "BOINC_JOB_ID" in os.environ:
                # Report BOINC environment
                for p in ("BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"):
                    jobReport.setJobParameter(p, gConfig.getValue("/LocalSite/%s" % p, "Unknown"), sendFlag=False)

            jobReport.setJobStatus("Matched", "Job Received by Agent")
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result["OK"]:
                return self.__rescheduleFailedJob(jobID, result["Message"], self.stopOnApplicationFailure)
            proxyChain = result.get("Value")

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software["OK"]:
                self.log.error("Failed to install software for job", "%s" % (jobID))
                errorMsg = software["Message"]
                if not errorMsg:
                    errorMsg = "Failed software installation"
                return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.debug("Before %sCE submitJob()" % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain)
            if not submission["OK"]:
                self.__report(jobID, "Failed", submission["Message"])
                return self.__finish(submission["Message"])
            elif "PayloadFailed" in submission:
                # Do not keep running and do not overwrite the Payload error
                message = "Payload execution failed with error code %s" % submission["PayloadFailed"]
                if self.stopOnApplicationFailure:
                    return self.__finish(message, self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug("After %sCE submitJob()" % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, "Job processing failed with exception", self.stopOnApplicationFailure
            )

        # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
        cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result["OK"]:
            self.timeLeft = result["Value"]
        else:
            if result["Message"] != "Current batch system is not supported":
                self.timeLeftError = result["Message"]
            else:
                # if the batch system is not defined, use the process time and the CPU normalization defined locally
                self.timeLeft = self.__getCPUTimeLeft()

        scaledCPUTime = self.timeLeftUtil.getScaledCPU()
        self.__setJobParam(jobID, "ScaledCPUTime", str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK("Job Agent cycle complete")

    #############################################################################
    def __saveJobJDLRequest(self, jobID, jobJDL):
        """Save job JDL local to JobAgent.
    """
        classAdJob = ClassAd(jobJDL)
        classAdJob.insertAttributeString("LocalCE", self.ceName)
        jdlFileName = jobID + ".jdl"
        jdlFile = open(jdlFileName, "w")
        jdl = classAdJob.asJDL()
        jdlFile.write(jdl)
        jdlFile.close()

    #############################################################################
    def __getCPUTimeLeft(self):
        """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config.
    """
        cpuTime = sum(os.times()[:-1])
        self.log.info("Current raw CPU time consumed is %s" % cpuTime)
        timeleft = self.timeLeft
        if self.cpuFactor:
            timeleft -= cpuTime * self.cpuFactor
        return timeleft

    #############################################################################
    def __changeProxy(self, oldProxy, newProxy):
        """Can call glexec utility here to set uid or simply log the changeover
       of a proxy.
    """
        self.log.verbose("Log proxy change (to be instrumented)")
        return S_OK()

    #############################################################################
    def __setupProxy(self, ownerDN, ownerGroup):
        """
    Retrieve a proxy for the execution of the job
    """
        if gConfig.getValue("/DIRAC/Security/UseServerCertificate", False):
            proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup)
            if not proxyResult["OK"]:
                self.log.error("Failed to setup proxy", proxyResult["Message"])
                return S_ERROR("Failed to setup proxy: %s" % proxyResult["Message"])
            return S_OK(proxyResult["Value"])
        else:
            ret = getProxyInfo(disableVOMS=True)
            if not ret["OK"]:
                self.log.error("Invalid Proxy", ret["Message"])
                return S_ERROR("Invalid Proxy")

            proxyChain = ret["Value"]["chain"]
            if not "groupProperties" in ret["Value"]:
                print ret["Value"]
                print proxyChain.dumpAllToString()
                self.log.error("Invalid Proxy", "Group has no properties defined")
                return S_ERROR("Proxy has no group properties defined")

            groupProps = ret["Value"]["groupProperties"]
            if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps:
                proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup)
                if not proxyResult["OK"]:
                    self.log.error("Invalid Proxy", proxyResult["Message"])
                    return S_ERROR("Failed to setup proxy: %s" % proxyResult["Message"])
                proxyChain = proxyResult["Value"]

        return S_OK(proxyChain)

    #############################################################################
    def __requestProxyFromProxyManager(self, ownerDN, ownerGroup):
        """Retrieves user proxy with correct role for job and sets up environment to
       run job locally.
    """

        self.log.info("Requesting proxy for %s@%s" % (ownerDN, ownerGroup))
        token = gConfig.getValue("/Security/ProxyToken", "")
        if not token:
            self.log.info("No token defined. Trying to download proxy without token")
            token = False
        retVal = gProxyManager.getPayloadProxyFromDIRACGroup(ownerDN, ownerGroup, self.defaultProxyLength, token)
        if not retVal["OK"]:
            self.log.error("Could not retrieve payload proxy", retVal["Message"])
            self.log.warn(retVal)
            os.system("dirac-proxy-info")
            sys.stdout.flush()
            return S_ERROR("Error retrieving proxy")

        chain = retVal["Value"]
        return S_OK(chain)

    #############################################################################
    def __checkInstallSoftware(self, jobID, jobParams, resourceParams):
        """Checks software requirement of job and whether this is already present
       before installing software locally.
    """
        if "SoftwareDistModule" not in jobParams:
            msg = "Job has no software installation requirement"
            self.log.verbose(msg)
            return S_OK(msg)

        self.__report(jobID, "Matched", "Installing Software")
        softwareDist = jobParams["SoftwareDistModule"]
        self.log.verbose("Found VO Software Distribution module: %s" % (softwareDist))
        argumentsDict = {"Job": jobParams, "CE": resourceParams}
        moduleFactory = ModuleFactory()
        moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict)
        if not moduleInstance["OK"]:
            return moduleInstance

        module = moduleInstance["Value"]
        return module.execute()

    #############################################################################
    def __submitJob(self, jobID, jobParams, resourceParams, optimizerParams, proxyChain):
        """Submit job to the Computing Element instance after creating a custom
       Job Wrapper with the available job parameters.
    """
        logLevel = self.am_getOption("DefaultLogLevel", "INFO")
        defaultWrapperLocation = self.am_getOption(
            "JobWrapperTemplate", "DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py"
        )
        result = createJobWrapper(
            jobID,
            jobParams,
            resourceParams,
            optimizerParams,
            extraOptions=self.extraOptions,
            defaultWrapperLocation=defaultWrapperLocation,
            log=self.log,
            logLevel=logLevel,
        )
        if not result["OK"]:
            return result

        wrapperFile = result["Value"]
        self.__report(jobID, "Matched", "Submitted To CE")

        self.log.info("Submitting %s to %sCE" % (os.path.basename(wrapperFile), self.ceName))

        # Pass proxy to the CE
        proxy = proxyChain.dumpAllToString()
        if not proxy["OK"]:
            self.log.error("Invalid proxy", proxy)
            return S_ERROR("Payload Proxy Not Found")

        payloadProxy = proxy["Value"]
        # FIXME: how can we set the batchID before we submit, this makes no sense
        batchID = "dc%s" % (jobID)
        submission = self.computingElement.submitJob(wrapperFile, payloadProxy)

        ret = S_OK("Job submitted")

        if submission["OK"]:
            batchID = submission["Value"]
            self.log.info("Job %s submitted as %s" % (jobID, batchID))
            self.log.verbose("Set JobParameter: Local batch ID %s" % (batchID))
            self.__setJobParam(jobID, "LocalBatchID", str(batchID))
            if "PayloadFailed" in submission:
                ret["PayloadFailed"] = submission["PayloadFailed"]
                return ret
            time.sleep(self.jobSubmissionDelay)
        else:
            self.log.error("Job submission failed", jobID)
            self.__setJobParam(jobID, "ErrorMessage", "%s CE Submission Error" % (self.ceName))
            if "ReschedulePayload" in submission:
                rescheduleFailedJob(jobID, submission["Message"])
                return S_OK()  # Without this job is marked as failed at line 265 above
            else:
                if "Value" in submission:
                    self.log.error("Error in DIRAC JobWrapper:", "exit code = %s" % (str(submission["Value"])))
                # make sure the Job is declared Failed
                self.__report(jobID, "Failed", submission["Message"])
            return S_ERROR("%s CE Submission Error: %s" % (self.ceName, submission["Message"]))

        return ret

    #############################################################################
    def __requestJob(self, ceDict):
        """Request a single job from the matcher service.
    """
        try:
            matcher = RPCClient("WorkloadManagement/Matcher", timeout=600)
            result = matcher.requestJob(ceDict)
            return result
        except Exception as x:
            self.log.exception(lException=x)
            return S_ERROR("Job request to matcher service failed with exception")

    #############################################################################
    def __getJDLParameters(self, jdl):
        """Returns a dictionary of JDL parameters.
    """
        try:
            parameters = {}
            #      print jdl
            if not re.search("\[", jdl):
                jdl = "[" + jdl + "]"
            classAdJob = ClassAd(jdl)
            paramsDict = classAdJob.contents
            for param, value in paramsDict.items():
                if value.strip().startswith("{"):
                    self.log.debug("Found list type parameter %s" % (param))
                    rawValues = value.replace("{", "").replace("}", "").replace('"', "").split()
                    valueList = []
                    for val in rawValues:
                        if re.search(",$", val):
                            valueList.append(val[:-1])
                        else:
                            valueList.append(val)
                    parameters[param] = valueList
                else:
                    parameters[param] = value.replace('"', "").replace("{", '"{').replace("}", '}"')
                    self.log.debug("Found standard parameter %s: %s" % (param, parameters[param]))
            return S_OK(parameters)
        except Exception, x:
            self.log.exception(lException=x)
            return S_ERROR("Exception while extracting JDL parameters for job")