Example #1
# Requires the pytest-mock plugin, which provides the `mocker` fixture.
from unittest.mock import MagicMock

from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport


def test_jobReport(mocker):
    mocker.patch("DIRAC.WorkloadManagementSystem.Client.JobStateUpdateClient", side_effect=MagicMock())

    jr = JobReport(123)
    res = jr.setJobStatus("Matched", "minor_matched", "app_matched", sendFlag=False)
    assert res["OK"]
    res = jr.setJobStatus("Running", "minor_running", "app_running", sendFlag=False)
    assert res["OK"]
    res = jr.setJobParameter("par_1", "value_1", sendFlag=False)
    assert res["OK"]
    res = jr.setJobParameter("par_2", "value_2", sendFlag=False)
    assert res["OK"]
    res = jr.setJobParameters([("par_3", "value_3"), ("par_4", "value_4")], sendFlag=False)
    assert res["OK"]
    print(jr.jobParameters)
    jr.dump()
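
    # Hypothetical continuation (not in the original test): every call above
    # used sendFlag=False, so the statuses and parameters are only cached in
    # the JobReport object.  In production code they are flushed to the job
    # state service with commit(), as Example #3 does via gJobReport.commit();
    # whether the call succeeds here depends on how JobStateUpdateClient is
    # mocked.
    res = jr.commit()
    print(res)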
Example #2
def test_jobReport(mocker):
  mocker.patch('DIRAC.WorkloadManagementSystem.Client.JobStateUpdateClient', side_effect=MagicMock())

  jr = JobReport(123)
  res = jr.setJobStatus('Matched', 'minor_matched', 'app_matched', sendFlag=False)
  assert res['OK']
  res = jr.setJobStatus('Running', 'minor_running', 'app_running', sendFlag=False)
  assert res['OK']
  res = jr.setJobParameter('par_1', 'value_1', sendFlag=False)
  assert res['OK']
  res = jr.setJobParameter('par_2', 'value_2', sendFlag=False)
  assert res['OK']
  res = jr.setJobParameters([
      ('par_3', 'value_3'),
      ('par_4', 'value_4')],
      sendFlag=False)
  assert res['OK']
  print(jr.jobParameters)
  jr.dump()
Example #3
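# Context note: the names gLogger, DErrno, JobReport, JobWrapper,
# JobWrapperError and rescheduleFailedJob used below are assumed to come from
# the module-level imports of JobWrapperTemplate, which are not part of this
# excerpt.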
def execute(arguments):
  """ The only real function executed here
  """

  global gJobReport

  jobID = arguments['Job']['JobID']
  os.environ['JOBID'] = str(jobID)  # environment values must be strings
  jobID = int(jobID)

  if 'WorkingDirectory' in arguments:
    wdir = os.path.expandvars(arguments['WorkingDirectory'])
    if os.path.isdir(wdir):
      os.chdir(wdir)
    else:
      try:
        os.makedirs(wdir)  # this will raise an exception if wdir already exists (which is ~OK)
        if os.path.isdir(wdir):
          os.chdir(wdir)
      except OSError as osError:
        if osError.errno == errno.EEXIST and os.path.isdir(wdir):
          gLogger.exception('JobWrapperTemplate found that the working directory already exists')
          rescheduleResult = rescheduleFailedJob(jobID, 'Working Directory already exists')
        else:
          gLogger.exception('JobWrapperTemplate could not create working directory')
          rescheduleResult = rescheduleFailedJob(jobID, 'Could Not Create Working Directory')
        return 1

  gJobReport = JobReport(jobID, 'JobWrapper')

  try:
    job = JobWrapper(jobID, gJobReport)
    job.initialize(arguments)  # initialize doesn't return S_OK/S_ERROR
  except Exception as exc:  # pylint: disable=broad-except
    gLogger.exception('JobWrapper failed the initialization phase', lException=exc)
    rescheduleResult = rescheduleFailedJob(jobID, 'Job Wrapper Initialization', gJobReport)
    try:
      job.sendJobAccounting(rescheduleResult, 'Job Wrapper Initialization')
    except Exception as exc:  # pylint: disable=broad-except
      gLogger.exception('JobWrapper failed sending job accounting', lException=exc)
    return 1

  if 'InputSandbox' in arguments['Job']:
    gJobReport.commit()
    try:
      result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
      if not result['OK']:
        gLogger.warn(result['Message'])
        raise JobWrapperError(result['Message'])
    except JobWrapperError:
      gLogger.exception('JobWrapper failed to download input sandbox')
      rescheduleResult = rescheduleFailedJob(jobID, 'Input Sandbox Download', gJobReport)
      job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
      return 1
    except Exception as exc:  # pylint: disable=broad-except
      gLogger.exception('JobWrapper raised exception while downloading input sandbox', lException=exc)
      rescheduleResult = rescheduleFailedJob(jobID, 'Input Sandbox Download', gJobReport)
      job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
      return 1
  else:
    gLogger.verbose('Job has no InputSandbox requirement')

  gJobReport.commit()

  if 'InputData' in arguments['Job']:
    if arguments['Job']['InputData']:
      try:
        result = job.resolveInputData()
        if not result['OK']:
          gLogger.warn(result['Message'])
          raise JobWrapperError(result['Message'])
      except JobWrapperError:
        gLogger.exception('JobWrapper failed to resolve input data')
        rescheduleResult = rescheduleFailedJob(jobID, 'Input Data Resolution', gJobReport)
        job.sendJobAccounting(rescheduleResult, 'Input Data Resolution')
        return 1
      except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception('JobWrapper raised exception while resolving input data', lException=exc)
        rescheduleResult = rescheduleFailedJob(jobID, 'Input Data Resolution', gJobReport)
        job.sendJobAccounting(rescheduleResult, 'Input Data Resolution')
        return 1
    else:
      gLogger.verbose('Job has a null InputData requirement:')
      gLogger.verbose(arguments)
  else:
    gLogger.verbose('Job has no InputData requirement')

  gJobReport.commit()

  try:
    result = job.execute(arguments)
    if not result['OK']:
      gLogger.error('Failed to execute job', result['Message'])
      raise JobWrapperError((result['Message'], result['Errno']))
  except JobWrapperError as exc:
    if exc.value[1] == 0 or str(exc.value[0]) == '0':
      gLogger.verbose('JobWrapper exited with status=0 after execution')
    if exc.value[1] == DErrno.EWMSRESC:
      gLogger.warn("Asked to reschedule job")
      rescheduleResult = rescheduleFailedJob(jobID, 'JobWrapper execution', gJobReport)
      job.sendJobAccounting(rescheduleResult, 'JobWrapper execution')
      return 1
    gLogger.exception('Job failed in execution phase')
    gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
    gJobReport.setJobStatus(
        'Failed', 'Exception During Execution', sendFlag=False)
    job.sendFailoverRequest('Failed', 'Exception During Execution')
    return 1
  except Exception as exc:  # pylint: disable=broad-except
    gLogger.exception('Job raised exception during execution phase', lException=exc)
    gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
    gJobReport.setJobStatus('Failed', 'Exception During Execution', sendFlag=False)
    job.sendFailoverRequest('Failed', 'Exception During Execution')
    return 1

  if 'OutputSandbox' in arguments['Job'] or 'OutputData' in arguments['Job']:
    try:
      result = job.processJobOutputs()
      if not result['OK']:
        gLogger.warn(result['Message'])
        raise JobWrapperError(result['Message'])
    except JobWrapperError as exc:
      gLogger.exception('JobWrapper failed to process output files')
      gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
      gJobReport.setJobStatus('Failed', 'Uploading Job Outputs', sendFlag=False)
      job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
      return 2
    except Exception as exc:  # pylint: disable=broad-except
      gLogger.exception('JobWrapper raised exception while processing output files', lException=exc)
      gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
      gJobReport.setJobStatus('Failed', 'Uploading Job Outputs', sendFlag=False)
      job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
      return 2
  else:
    gLogger.verbose('Job has no OutputData or OutputSandbox requirement')

  try:
    # Failed jobs will return 1 / successful jobs will return 0
    return job.finalize()
  except Exception as exc:  # pylint: disable=broad-except
    gLogger.exception('JobWrapper raised exception during the finalization phase', lException=exc)
    return 2
Example #4
  def execute( self ):
    """The JobAgent execution method.
    """
    if self.jobCount:
      #Only call timeLeft utility after a job has been picked up
      self.log.info( 'Attempting to check CPU time left for filling mode' )
      if self.fillingMode:
        if self.timeLeftError:
          self.log.warn( self.timeLeftError )
          return self.__finish( self.timeLeftError )
        self.log.info( '%s normalized CPU units remaining in slot' % ( self.timeLeft ) )
        # Need to update the Configuration so that the new value is published in the next matching request
        result = self.computingElement.setCPUTimeLeft( cpuTimeLeft = self.timeLeft )
        if not result['OK']:
          return self.__finish( result['Message'] )
      else:
        return self.__finish( 'Filling Mode is Disabled' )

    self.log.verbose( 'Job Agent execution loop' )
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
      self.log.info( 'Resource is not available' )
      self.log.info( available['Message'] )
      return self.__finish( 'CE Not Available' )

    self.log.info( available['Message'] )

    result = self.computingElement.getDescription()
    if not result['OK']:
      return result
    ceDict = result['Value']
    
    # Add pilot information
    gridCE = gConfig.getValue( 'LocalSite/GridCE', 'Unknown' )
    if gridCE != 'Unknown':
      ceDict['GridCE'] = gridCE
    if 'PilotReference' not in ceDict:
      ceDict['PilotReference'] = str( self.pilotReference ) 
    ceDict['PilotBenchmark'] = self.cpuFactor 
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag
    
    self.log.verbose( ceDict )
    start = time.time()
    jobRequest = self.__requestJob( ceDict )
    matchTime = time.time() - start
    self.log.info( 'MatcherTime = %.2f (s)' % ( matchTime ) )

    self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches )

    if not jobRequest['OK']:
      if re.search( 'No work available', jobRequest['Message'] ):
        self.log.info( 'Job request OK: %s' % ( jobRequest['Message'] ) )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return S_ERROR( 'Nothing to do' )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "seconds timeout" ) != -1:
        self.log.error( jobRequest['Message'] )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return S_ERROR( 'Nothing to do' )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "Pilot version does not match" ) != -1 :
        self.log.error( jobRequest['Message'] )
        return S_ERROR( jobRequest['Message'] )
      else:
        self.log.info( 'Failed to get jobs: %s' % ( jobRequest['Message'] ) )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return S_ERROR( 'Nothing to do' )
        return S_OK( jobRequest['Message'] )

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    jobID = matcherInfo['JobID']
    self.pilotInfoReportedFlag = matcherInfo.get( 'PilotInfoReportedFlag', False )
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
      if not matcherInfo.has_key( param ):
        self.__report( jobID, 'Failed', 'Matcher did not return %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      elif not matcherInfo[param]:
        self.__report( jobID, 'Failed', 'Matcher returned null %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      else:
        self.log.verbose( 'Matcher returned %s = %s ' % ( param, matcherInfo[param] ) )

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    optimizerParams = {}
    for key in matcherInfo.keys():
      if not key in matcherParams:
        value = matcherInfo[key]
        optimizerParams[key] = value

    parameters = self.__getJDLParameters( jobJDL )
    if not parameters['OK']:
      self.__report( jobID, 'Failed', 'Could Not Extract JDL Parameters' )
      self.log.warn( parameters['Message'] )
      return self.__finish( 'JDL Problem' )

    params = parameters['Value']
    if not params.has_key( 'JobID' ):
      msg = 'Job has no JobID defined in JDL parameters'
      self.__report( jobID, 'Failed', msg )
      self.log.warn( msg )
      return self.__finish( 'JDL Problem' )
    else:
      jobID = params['JobID']

    if not params.has_key( 'JobType' ):
      self.log.warn( 'Job has no JobType defined in JDL parameters' )
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if not params.has_key( 'SystemConfig' ):
      self.log.warn( 'Job has no system configuration defined in JDL parameters' )
      systemConfig = gConfig.getValue( '/LocalSite/Architecture', '' )
      self.log.info( 'Setting system config to /LocalSite/Architecture = %s since it was not specified' % systemConfig )
      if not systemConfig:
        self.log.warn( '/LocalSite/Architecture is not defined' )
      params['SystemConfig'] = systemConfig
    else:
      systemConfig = params['SystemConfig']
      if systemConfig.lower() == 'any':
        systemConfig = gConfig.getValue( '/LocalSite/Architecture', '' )
        self.log.info( 'Setting SystemConfig = /LocalSite/Architecture =',
                       '"%s" since it was set to "ANY" in the job description' % systemConfig )
        if not systemConfig:
          self.log.warn( '/LocalSite/Architecture is not defined' )
        params['SystemConfig'] = systemConfig

    if not params.has_key( 'MaxCPUTime' ):
      self.log.warn( 'Job has no CPU requirement defined in JDL parameters' )

    self.log.verbose( 'Job request successful: \n %s' % ( jobRequest['Value'] ) )
    self.log.info( 'Received JobID=%s, JobType=%s, SystemConfig=%s' % ( jobID, jobType, systemConfig ) )
    self.log.info( 'OwnerDN: %s JobGroup: %s' % ( ownerDN, jobGroup ) )
    self.jobCount += 1
    try:
      jobReport = JobReport( jobID, 'JobAgent@%s' % self.siteName )
      jobReport.setJobParameter( 'MatcherServiceTime', str( matchTime ), sendFlag = False )
      if self.gridCEQueue:
        jobReport.setJobParameter( 'GridCEQueue', self.gridCEQueue, sendFlag = False )
      jobReport.setJobStatus( 'Matched', 'Job Received by Agent' )
      # self.__setJobSite( jobID, self.siteName )
      if not self.pilotInfoReportedFlag:
        self.__reportPilotInfo( jobID )
      result = self.__setupProxy( ownerDN, jobGroup )
      if not result[ 'OK' ]:
        return self.__rescheduleFailedJob( jobID, result[ 'Message' ], self.stopOnApplicationFailure )
      if 'Value' in result and result[ 'Value' ]:
        proxyChain = result[ 'Value' ]

      # Is this necessary at all?
      saveJDL = self.__saveJobJDLRequest( jobID, jobJDL )
      #self.__report(jobID,'Matched','Job Prepared to Submit')

      #resourceParameters = self.__getJDLParameters( resourceJDL )
      #if not resourceParameters['OK']:
      #  return resourceParameters
      #resourceParams = resourceParameters['Value']

      software = self.__checkInstallSoftware( jobID, params, ceDict )
      if not software['OK']:
        self.log.error( 'Failed to install software for job %s' % ( jobID ) )
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self.__rescheduleFailedJob( jobID, errorMsg, self.stopOnApplicationFailure )

      self.log.verbose( 'Before %sCE submitJob()' % ( self.ceName ) )
      submission = self.__submitJob( jobID, params, ceDict, optimizerParams, jobJDL, proxyChain )
      if not submission['OK']:
        self.__report( jobID, 'Failed', submission['Message'] )
        return self.__finish( submission['Message'] )
      elif 'PayloadFailed' in submission:
        # Do not keep running and do not overwrite the Payload error
        return self.__finish( 'Payload execution failed with error code %s' % submission['PayloadFailed'],
                              self.stopOnApplicationFailure )

      self.log.verbose( 'After %sCE submitJob()' % ( self.ceName ) )
    except Exception:
      self.log.exception()
      return self.__rescheduleFailedJob( jobID , 'Job processing failed with exception', self.stopOnApplicationFailure )

    result = self.timeLeftUtil.getTimeLeft( 0.0 )
    if result['OK']:
      self.timeLeft = result['Value']
    else:
      if result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
      else:
        if self.cpuFactor:
          # if the batch system is not defined, use the CPUNormalizationFactor
          # defined locally
          self.timeLeft = self.__getCPUTimeLeft()
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

    self.__setJobParam( jobID, 'ScaledCPUTime', str( scaledCPUTime - self.scaledCPUTime ) )
    self.scaledCPUTime = scaledCPUTime

    return S_OK( 'Job Agent cycle complete' )
Example #5
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('%s normalized CPU units remaining in slot' %
                              (self.timeLeft))
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish('No more time left')
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join('.', self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc",
                                                   "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection('/LocalSite'):
                    localCfg.createNewSection('/LocalSite')
                localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        available = self.computingElement.available()
        if not available['OK'] or not available['Value']:
            self.log.info('Resource is not available')
            self.log.info(available['Message'])
            return self.__finish('CE Not Available')

        self.log.info(available['Message'])

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result
        ceDict = result['Value']

        # Add pilot information
        gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
        if gridCE != 'Unknown':
            ceDict['GridCE'] = gridCE
        if 'PilotReference' not in ceDict:
            ceDict['PilotReference'] = str(self.pilotReference)
        ceDict['PilotBenchmark'] = self.cpuFactor
        ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict('/AgentJobRequirements')
        if result['OK']:
            requirementsDict = result['Value']
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info('MatcherTime = %.2f (s)' % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:
            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK: %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error('Timeout while requesting job',
                               jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                errorMsg = 'Pilot version does not match the production version'
                self.log.error(errorMsg,
                               jobRequest['Message'].replace(errorMsg, ''))
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs: %s' %
                                (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get(
                'PilotInfoReportedFlag', False)
        jobID = matcherInfo['JobID']
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if param not in matcherInfo:
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned %s = %s ' %
                                 (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn(parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if 'JobID' not in params:
            msg = 'Job has no JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if 'JobType' not in params:
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if 'CPUTime' not in params:
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        if self.extraOptions:
            params['Arguments'] += ' ' + self.extraOptions
            params['ExtraOptions'] = self.extraOptions

        self.log.verbose('Job request successful: \n', jobRequest['Value'])
        self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
        self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)

            if 'BOINC_JOB_ID' in os.environ:
                # Report BOINC environment
                for p in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform',
                          'BoincHostName'):
                    jobReport.setJobParameter(p,
                                              gConfig.getValue(
                                                  '/LocalSite/%s' % p,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self.__rescheduleFailedJob(
                    jobID, result['Message'], self.stopOnApplicationFailure)
            proxyChain = result.get('Value')

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job',
                               '%s' % (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self.__rescheduleFailedJob(
                    jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.debug('Before %sCE submitJob()' % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict,
                                          optimizerParams, proxyChain)
            if not submission['OK']:
                self.__report(jobID, 'Failed', submission['Message'])
                return self.__finish(submission['Message'])
            elif 'PayloadFailed' in submission:
                # Do not keep running and do not overwrite the Payload error
                message = 'Payload execution failed with error code %s' % submission[
                    'PayloadFailed']
                if self.stopOnApplicationFailure:
                    return self.__finish(message,
                                         self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug('After %sCE submitJob()' % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
        cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            if result['Message'] != 'Current batch system is not supported':
                self.timeLeftError = result['Message']
            else:
                # if the batch system is not defined, use the process time and the CPU normalization defined locally
                self.timeLeft = self.__getCPUTimeLeft()

        scaledCPUTime = self.timeLeftUtil.getScaledCPU()
        self.__setJobParam(jobID, 'ScaledCPUTime',
                           str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK('Job Agent cycle complete')
Example #6
  def execute(self):
    """The JobAgent execution method.
    """
    if self.jobCount:
      # Temporary mechanism to pass a shutdown message to the agent
      if os.path.exists('/var/lib/dirac_drain'):
        return self.__finish('Node is being drained by an operator')
      # Only call timeLeft utility after a job has been picked up
      self.log.info('Attempting to check CPU time left for filling mode')
      if self.fillingMode:
        if self.timeLeftError:
          self.log.warn(self.timeLeftError)
          return self.__finish(self.timeLeftError)
        self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft))
        if self.timeLeft <= self.minimumTimeLeft:
          return self.__finish('No more time left')
        # Need to update the Configuration so that the new value is published in the next matching request
        result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
        if not result['OK']:
          return self.__finish(result['Message'])

        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
          localConfigFile = os.path.join('.', self.extraOptions)
        else:
          localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
        localCfg.loadFromFile(localConfigFile)
        if not localCfg.isSection('/LocalSite'):
          localCfg.createNewSection('/LocalSite')
        localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
        localCfg.writeToFile(localConfigFile)

      else:
        return self.__finish('Filling Mode is Disabled')

    self.log.verbose('Job Agent execution loop')
    result = self.computingElement.available()
    if not result['OK']:
      self.log.info('Resource is not available')
      self.log.info(result['Message'])
      return self.__finish('CE Not Available')

    self.log.info(result['Message'])

    ceInfoDict = result['CEInfoDict']
    runningJobs = ceInfoDict.get("RunningJobs")
    availableSlots = result['Value']

    if not availableSlots:
      if runningJobs:
        self.log.info('No available slots with %d running jobs' % runningJobs)
        return S_OK('Job Agent cycle complete with %d running jobs' % runningJobs)
      else:
        self.log.info('CE is not available')
        return self.__finish('CE Not Available')

    result = self.computingElement.getDescription()
    if not result['OK']:
      return result
    ceDict = result['Value']
    # Add pilot information
    gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
    if gridCE != 'Unknown':
      ceDict['GridCE'] = gridCE
    if 'PilotReference' not in ceDict:
      ceDict['PilotReference'] = str(self.pilotReference)
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict('/AgentJobRequirements')
    if result['OK']:
      requirementsDict = result['Value']
      ceDict.update(requirementsDict)
      self.log.info('Requirements:', requirementsDict)

    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = MatcherClient().requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info('MatcherTime = %.2f (s)' % (matchTime))

    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', self.stopAfterFailedMatches)

    if not jobRequest['OK']:
      if re.search('No match found', jobRequest['Message']):
        self.log.notice('Job request OK: %s' % (jobRequest['Message']))
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(jobRequest['Message'])
      elif jobRequest['Message'].find("seconds timeout") != -1:
        self.log.error('Timeout while requesting job', jobRequest['Message'])
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(jobRequest['Message'])
      elif jobRequest['Message'].find("Pilot version does not match") != -1:
        errorMsg = 'Pilot version does not match the production version'
        self.log.error(errorMsg, jobRequest['Message'].replace(errorMsg, ''))
        return S_ERROR(jobRequest['Message'])
      else:
        self.log.notice('Failed to get jobs: %s' % (jobRequest['Message']))
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(jobRequest['Message'])

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    if not self.pilotInfoReportedFlag:
      # Check the flag after the first access to the Matcher
      self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    jobID = matcherInfo['JobID']
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
      if param not in matcherInfo:
        self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
        return self.__finish('Matcher Failed')
      elif not matcherInfo[param]:
        self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
        return self.__finish('Matcher Failed')
      else:
        self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    optimizerParams = {}
    for key in matcherInfo:
      if key not in matcherParams:
        optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters(jobJDL)
    if not parameters['OK']:
      self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
      self.log.warn(parameters['Message'])
      return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
      msg = 'Job has no JobID defined in JDL parameters'
      self.__report(jobID, 'Failed', msg)
      self.log.warn(msg)
      return self.__finish('JDL Problem')
    else:
      jobID = params['JobID']

    if 'JobType' not in params:
      self.log.warn('Job has no JobType defined in JDL parameters')
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if 'CPUTime' not in params:
      self.log.warn('Job has no CPU requirement defined in JDL parameters')

    # Job requirement for a number of processors
    processors = int(params.get('NumberOfProcessors', 1))
    wholeNode = 'WholeNode' in params

    if self.extraOptions:
      params['Arguments'] += ' ' + self.extraOptions
      params['ExtraOptions'] = self.extraOptions

    self.log.verbose('Job request successful: \n', jobRequest['Value'])
    self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
    self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
      jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
      jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)

      if 'BOINC_JOB_ID' in os.environ:
        # Report BOINC environment
        for thisp in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'):
          jobReport.setJobParameter(thisp, gConfig.getValue('/LocalSite/%s' % thisp, 'Unknown'), sendFlag=False)

      jobReport.setJobStatus('Matched', 'Job Received by Agent')
      result = self.__setupProxy(ownerDN, jobGroup)
      if not result['OK']:
        return self.__rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure)
      proxyChain = result.get('Value')

      # Save the job jdl for external monitoring
      self.__saveJobJDLRequest(jobID, jobJDL)

      software = self.__checkInstallSoftware(jobID, params, ceDict)
      if not software['OK']:
        self.log.error('Failed to install software for job', '%s' % (jobID))
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

      self.log.debug('Before %sCE submitJob()' % (self.ceName))
      result = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain, processors, wholeNode)
      if not result['OK']:
        self.__report(jobID, 'Failed', result['Message'])
        return self.__finish(result['Message'])
      elif 'PayloadFailed' in result:
        # Do not keep running and do not overwrite the Payload error
        message = 'Payload execution failed with error code %s' % result['PayloadFailed']
        if self.stopOnApplicationFailure:
          return self.__finish(message, self.stopOnApplicationFailure)
        else:
          self.log.info(message)

      self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception as subExcept:  # pylint: disable=broad-except
      self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True)
      return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception', self.stopOnApplicationFailure)

    # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
    cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

    result = self.timeLeftUtil.getTimeLeft(cpuTime, processors)
    if result['OK']:
      self.timeLeft = result['Value']
    else:
      if result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
      else:
        # if the batch system is not defined, use the process time and the CPU normalization defined locally
        self.timeLeft = self.__getCPUTimeLeft()

    return S_OK('Job Agent cycle complete')
Example #7
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            #Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('%s normalized CPU units remaining in slot' %
                              (self.timeLeft))
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])
            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        available = self.computingElement.available()
        if not available['OK'] or not available['Value']:
            self.log.info('Resource is not available')
            self.log.info(available['Message'])
            return self.__finish('CE Not Available')

        self.log.info(available['Message'])

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result
        ceDict = result['Value']

        # Add pilot information
        gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
        if gridCE != 'Unknown':
            ceDict['GridCE'] = gridCE
        if 'PilotReference' not in ceDict:
            ceDict['PilotReference'] = str(self.pilotReference)
        ceDict['PilotBenchmark'] = self.cpuFactor
        ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict('/AgentJobRequirements')
        if result['OK']:
            requirementsDict = result['Value']
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info('MatcherTime = %.2f (s)' % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:
            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK: %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error(jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                self.log.error(jobRequest['Message'])
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs: %s' %
                                (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        jobID = matcherInfo['JobID']
        self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag',
                                                     False)
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if not matcherInfo.has_key(param):
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned %s = %s ' %
                                 (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        optimizerParams = {}
        for key in matcherInfo.keys():
            if not key in matcherParams:
                value = matcherInfo[key]
                optimizerParams[key] = value

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn(parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if not params.has_key('JobID'):
            msg = 'Job has no JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if not params.has_key('JobType'):
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if not params.has_key('SystemConfig'):
            self.log.warn(
                'Job has no system configuration defined in JDL parameters')
            systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
            self.log.info(
                'Setting system config to /LocalSite/Architecture = %s since it was not specified'
                % systemConfig)
            if not systemConfig:
                self.log.warn('/LocalSite/Architecture is not defined')
            params['SystemConfig'] = systemConfig
        else:
            systemConfig = params['SystemConfig']
            if systemConfig.lower() == 'any':
                systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
                self.log.info(
                    'Setting SystemConfig = /LocalSite/Architecture =',
                    '"%s" since it was set to "ANY" in the job description' %
                    systemConfig)
                if not systemConfig:
                    self.log.warn('/LocalSite/Architecture is not defined')
                params['SystemConfig'] = systemConfig

        if not params.has_key('CPUTime'):
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        self.log.verbose('Job request successful: \n %s' %
                         (jobRequest['Value']))
        self.log.info('Received JobID=%s, JobType=%s, SystemConfig=%s' %
                      (jobID, jobType, systemConfig))
        self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)
            if self.gridCEQueue:
                jobReport.setJobParameter('GridCEQueue',
                                          self.gridCEQueue,
                                          sendFlag=False)

            if os.environ.has_key('BOINC_JOB_ID'):
                # Report BOINC environment
                for p in [
                        'BoincUserID', 'BoincHostID', 'BoincHostPlatform',
                        'BoincHostName'
                ]:
                    jobReport.setJobParameter(p,
                                              gConfig.getValue(
                                                  '/LocalSite/%s' % p,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            # self.__setJobSite( jobID, self.siteName )
            if not self.pilotInfoReportedFlag:
                self.__reportPilotInfo(jobID)
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self.__rescheduleFailedJob(
                    jobID, result['Message'], self.stopOnApplicationFailure)
            if 'Value' in result and result['Value']:
                proxyChain = result['Value']

            # Is this necessary at all?
            saveJDL = self.__saveJobJDLRequest(jobID, jobJDL)
            #self.__report(jobID,'Matched','Job Prepared to Submit')

            #resourceParameters = self.__getJDLParameters( resourceJDL )
            #if not resourceParameters['OK']:
            #  return resourceParameters
            #resourceParams = resourceParameters['Value']

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job %s' %
                               (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self.__rescheduleFailedJob(
                    jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.verbose('Before %sCE submitJob()' % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict,
                                          optimizerParams, jobJDL, proxyChain)
            if not submission['OK']:
                self.__report(jobID, 'Failed', submission['Message'])
                return self.__finish(submission['Message'])
            elif 'PayloadFailed' in submission:
                # Do not keep running and do not overwrite the Payload error
                return self.__finish(
                    'Payload execution failed with error code %s' %
                    submission['PayloadFailed'], self.stopOnApplicationFailure)

            self.log.verbose('After %sCE submitJob()' % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        currentTimes = list(os.times())
        for i in range(len(currentTimes)):
            currentTimes[i] -= self.initTimes[i]

        utime, stime, cutime, cstime, elapsed = currentTimes
        cpuTime = utime + stime + cutime + cstime

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            if result['Message'] != 'Current batch system is not supported':
                self.timeLeftError = result['Message']
            else:
                if self.cpuFactor:
                    # if the batch system is not defined, use the CPUNormalizationFactor
                    # defined locally
                    self.timeLeft = self.__getCPUTimeLeft()
        scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

        self.__setJobParam(jobID, 'ScaledCPUTime',
                           str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK('Job Agent cycle complete')
Example #8
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info("Attempting to check CPU time left for filling mode")
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info("%s normalized CPU units remaining in slot" % (self.timeLeft))
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish("No more time left")
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
                if not result["OK"]:
                    return self.__finish(result["Message"])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join(".", self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection("/LocalSite"):
                    localCfg.createNewSection("/LocalSite")
                localCfg.setOption("/LocalSite/CPUTimeLeft", self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish("Filling Mode is Disabled")

        self.log.verbose("Job Agent execution loop")
        available = self.computingElement.available()
        if not available["OK"] or not available["Value"]:
            self.log.info("Resource is not available")
            self.log.info(available["Message"])
            return self.__finish("CE Not Available")

        self.log.info(available["Message"])

        result = self.computingElement.getDescription()
        if not result["OK"]:
            return result
        ceDict = result["Value"]

        # Add pilot information
        gridCE = gConfig.getValue("LocalSite/GridCE", "Unknown")
        if gridCE != "Unknown":
            ceDict["GridCE"] = gridCE
        if not "PilotReference" in ceDict:
            ceDict["PilotReference"] = str(self.pilotReference)
        ceDict["PilotBenchmark"] = self.cpuFactor
        ceDict["PilotInfoReportedFlag"] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict("/AgentJobRequirements")
        if result["OK"]:
            requirementsDict = result["Value"]
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info("MatcherTime = %.2f (s)" % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", self.stopAfterFailedMatches)

        if not jobRequest["OK"]:
            if re.search("No match found", jobRequest["Message"]):
                self.log.notice("Job request OK: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("seconds timeout") != -1:
                self.log.error("Timeout while requesting job", jobRequest["Message"])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("Pilot version does not match") != -1:
                errorMsg = "Pilot version does not match the production version"
                self.log.error(errorMsg, jobRequest["Message"].replace(errorMsg, ""))
                return S_ERROR(jobRequest["Message"])
            else:
                self.log.notice("Failed to get jobs: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest["Value"]
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get("PilotInfoReportedFlag", False)
        jobID = matcherInfo["JobID"]
        matcherParams = ["JDL", "DN", "Group"]
        for param in matcherParams:
            if param not in matcherInfo:
                self.__report(jobID, "Failed", "Matcher did not return %s" % (param))
                return self.__finish("Matcher Failed")
            elif not matcherInfo[param]:
                self.__report(jobID, "Failed", "Matcher returned null %s" % (param))
                return self.__finish("Matcher Failed")
            else:
                self.log.verbose("Matcher returned %s = %s " % (param, matcherInfo[param]))

        jobJDL = matcherInfo["JDL"]
        jobGroup = matcherInfo["Group"]
        ownerDN = matcherInfo["DN"]

        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters["OK"]:
            self.__report(jobID, "Failed", "Could Not Extract JDL Parameters")
            self.log.warn(parameters["Message"])
            return self.__finish("JDL Problem")

        params = parameters["Value"]
        if "JobID" not in params:
            msg = "Job has not JobID defined in JDL parameters"
            self.__report(jobID, "Failed", msg)
            self.log.warn(msg)
            return self.__finish("JDL Problem")
        else:
            jobID = params["JobID"]

        if "JobType" not in params:
            self.log.warn("Job has no JobType defined in JDL parameters")
            jobType = "Unknown"
        else:
            jobType = params["JobType"]

        if "CPUTime" not in params:
            self.log.warn("Job has no CPU requirement defined in JDL parameters")

        if self.extraOptions:
            params["Arguments"] += " " + self.extraOptions
            params["ExtraOptions"] = self.extraOptions

        self.log.verbose("Job request successful: \n", jobRequest["Value"])
        self.log.info("Received JobID=%s, JobType=%s" % (jobID, jobType))
        self.log.info("OwnerDN: %s JobGroup: %s" % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
            jobReport.setJobParameter("MatcherServiceTime", str(matchTime), sendFlag=False)

            if "BOINC_JOB_ID" in os.environ:
                # Report BOINC environment
                for p in ("BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"):
                    jobReport.setJobParameter(p, gConfig.getValue("/LocalSite/%s" % p, "Unknown"), sendFlag=False)

            jobReport.setJobStatus("Matched", "Job Received by Agent")
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result["OK"]:
                return self.__rescheduleFailedJob(jobID, result["Message"], self.stopOnApplicationFailure)
            proxyChain = result.get("Value")

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software["OK"]:
                self.log.error("Failed to install software for job", "%s" % (jobID))
                errorMsg = software["Message"]
                if not errorMsg:
                    errorMsg = "Failed software installation"
                return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.debug("Before %sCE submitJob()" % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain)
            if not submission["OK"]:
                self.__report(jobID, "Failed", submission["Message"])
                return self.__finish(submission["Message"])
            elif "PayloadFailed" in submission:
                # Do not keep running and do not overwrite the Payload error
                message = "Payload execution failed with error code %s" % submission["PayloadFailed"]
                if self.stopOnApplicationFailure:
                    return self.__finish(message, self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug("After %sCE submitJob()" % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, "Job processing failed with exception", self.stopOnApplicationFailure
            )

        # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
        cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result["OK"]:
            self.timeLeft = result["Value"]
        else:
            if result["Message"] != "Current batch system is not supported":
                self.timeLeftError = result["Message"]
            else:
                # if the batch system is not defined, use the process time and the CPU normalization defined locally
                self.timeLeft = self.__getCPUTimeLeft()

        scaledCPUTime = self.timeLeftUtil.getScaledCPU()
        self.__setJobParam(jobID, "ScaledCPUTime", str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK("Job Agent cycle complete")
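
The CPU accounting just above sums every field of os.times() except the trailing wall-clock entry and subtracts the snapshot taken at agent start-up. A minimal standalone sketch of that pattern (plain Python, not DIRAC code):

import os

def consumed_cpu_time(init_times):
    """CPU seconds (user + system, including children) used since init_times,
    where init_times is an earlier os.times() snapshot. The last field of
    os.times() is wall-clock elapsed time and is deliberately excluded."""
    current = os.times()
    return sum(current[:-1]) - sum(init_times[:-1])

init_times = os.times()   # taken once at initialization
# ... payloads run here ...
print("CPU used: %.2f s" % consumed_cpu_time(init_times))
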
Example #9
0
  def execute( self ):
    """The JobAgent execution method.
    """
    if self.jobCount:
      #Only call timeLeft utility after a job has been picked up
      self.log.info( 'Attempting to check CPU time left for filling mode' )
      if self.fillingMode:
        if self.timeLeftError:
          self.log.warn( self.timeLeftError )
          return self.__finish( self.timeLeftError )
        self.log.info( '%s normalized CPU units remaining in slot' % ( self.timeLeft ) )
        # Need to update the Configuration so that the new value is published in the next matching request
        result = self.computingElement.setCPUTimeLeft( cpuTimeLeft = self.timeLeft )
        if not result['OK']:
          return self.__finish( result['Message'] )
        
        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
          localConfigFile = os.path.join( '.', self.extraOptions )
        else:
          localConfigFile = os.path.join( rootPath, "etc", "dirac.cfg" )
        localCfg.loadFromFile( localConfigFile )
        if not localCfg.isSection('/LocalSite'):
          localCfg.createNewSection('/LocalSite')
        localCfg.setOption( '/LocalSite/CPUTimeLeft', self.timeLeft )
        localCfg.writeToFile( localConfigFile )
        
      else:
        return self.__finish( 'Filling Mode is Disabled' )

    self.log.verbose( 'Job Agent execution loop' )
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
      self.log.info( 'Resource is not available' )
      self.log.info( available['Message'] )
      return self.__finish( 'CE Not Available' )

    self.log.info( available['Message'] )

    result = self.computingElement.getDescription()
    if not result['OK']:
      return result
    ceDict = result['Value']

    # Add pilot information
    gridCE = gConfig.getValue( 'LocalSite/GridCE', 'Unknown' )
    if gridCE != 'Unknown':
      ceDict['GridCE'] = gridCE
    if not 'PilotReference' in ceDict:
      ceDict['PilotReference'] = str( self.pilotReference )
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict( '/AgentJobRequirements' )
    if result['OK']:
      requirementsDict = result['Value']
      ceDict.update( requirementsDict )

    self.log.verbose( ceDict )
    start = time.time()
    jobRequest = self.__requestJob( ceDict )
    matchTime = time.time() - start
    self.log.info( 'MatcherTime = %.2f (s)' % ( matchTime ) )

    self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches )

    if not jobRequest['OK']:
      if re.search( 'No match found', jobRequest['Message'] ):
        self.log.notice( 'Job request OK: %s' % ( jobRequest['Message'] ) )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "seconds timeout" ) != -1:
        self.log.error( jobRequest['Message'] )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "Pilot version does not match" ) != -1 :
        self.log.error( jobRequest['Message'] )
        return S_ERROR( jobRequest['Message'] )
      else:
        self.log.notice( 'Failed to get jobs: %s' % ( jobRequest['Message'] ) )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    jobID = matcherInfo['JobID']
    if not self.pilotInfoReportedFlag:
      # Check the flag after the first access to the Matcher
      self.pilotInfoReportedFlag = matcherInfo.get( 'PilotInfoReportedFlag', False )
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
      if not matcherInfo.has_key( param ):
        self.__report( jobID, 'Failed', 'Matcher did not return %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      elif not matcherInfo[param]:
        self.__report( jobID, 'Failed', 'Matcher returned null %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      else:
        self.log.verbose( 'Matcher returned %s = %s ' % ( param, matcherInfo[param] ) )

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    optimizerParams = {}
    for key in matcherInfo.keys():
      if not key in matcherParams:
        value = matcherInfo[key]
        optimizerParams[key] = value

    parameters = self.__getJDLParameters( jobJDL )
    if not parameters['OK']:
      self.__report( jobID, 'Failed', 'Could Not Extract JDL Parameters' )
      self.log.warn( parameters['Message'] )
      return self.__finish( 'JDL Problem' )

    params = parameters['Value']
    if not params.has_key( 'JobID' ):
      msg = 'Job has not JobID defined in JDL parameters'
      self.__report( jobID, 'Failed', msg )
      self.log.warn( msg )
      return self.__finish( 'JDL Problem' )
    else:
      jobID = params['JobID']

    if not params.has_key( 'JobType' ):
      self.log.warn( 'Job has no JobType defined in JDL parameters' )
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if not params.has_key( 'CPUTime' ):
      self.log.warn( 'Job has no CPU requirement defined in JDL parameters' )

    if self.extraOptions:
      params['Arguments'] = params['Arguments'] + ' ' + self.extraOptions
      params['ExtraOptions'] = self.extraOptions

    self.log.verbose( 'Job request successful: \n %s' % ( jobRequest['Value'] ) )
    self.log.info( 'Received JobID=%s, JobType=%s' % ( jobID, jobType ) )
    self.log.info( 'OwnerDN: %s JobGroup: %s' % ( ownerDN, jobGroup ) )
    self.jobCount += 1
    try:
      jobReport = JobReport( jobID, 'JobAgent@%s' % self.siteName )
      jobReport.setJobParameter( 'MatcherServiceTime', str( matchTime ), sendFlag = False )

      if os.environ.has_key( 'BOINC_JOB_ID' ):
        # Report BOINC environment 
        for p in ['BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName']:
          jobReport.setJobParameter( p, gConfig.getValue( '/LocalSite/%s' % p, 'Unknown' ), sendFlag = False )

      jobReport.setJobStatus( 'Matched', 'Job Received by Agent' )
      result = self.__setupProxy( ownerDN, jobGroup )
      if not result[ 'OK' ]:
        return self.__rescheduleFailedJob( jobID, result[ 'Message' ], self.stopOnApplicationFailure )
      if 'Value' in result and result[ 'Value' ]:
        proxyChain = result[ 'Value' ]

      # Save the job jdl for external monitoring
      self.__saveJobJDLRequest( jobID, jobJDL )

      software = self.__checkInstallSoftware( jobID, params, ceDict )
      if not software['OK']:
        self.log.error( 'Failed to install software for job %s' % ( jobID ) )
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self.__rescheduleFailedJob( jobID, errorMsg, self.stopOnApplicationFailure )

      self.log.debug( 'Before %sCE submitJob()' % ( self.ceName ) )
      submission = self.__submitJob( jobID, params, ceDict, optimizerParams, proxyChain )
      if not submission['OK']:
        self.__report( jobID, 'Failed', submission['Message'] )
        return self.__finish( submission['Message'] )
      elif 'PayloadFailed' in submission:
        # Do not keep running and do not overwrite the Payload error
        return self.__finish( 'Payload execution failed with error code %s' % submission['PayloadFailed'],
                              self.stopOnApplicationFailure )

      self.log.debug( 'After %sCE submitJob()' % ( self.ceName ) )
    except Exception:
      self.log.exception()
      return self.__rescheduleFailedJob( jobID , 'Job processing failed with exception', self.stopOnApplicationFailure )

    currentTimes = list( os.times() )
    for i in range( len( currentTimes ) ):
      currentTimes[i] -= self.initTimes[i]

    utime, stime, cutime, cstime, _elapsed = currentTimes
    cpuTime = utime + stime + cutime + cstime

    result = self.timeLeftUtil.getTimeLeft( cpuTime )
    if result['OK']:
      self.timeLeft = result['Value']
    else:
      if result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
      else:
        if self.cpuFactor:
          # if the batch system is not defined, use the CPUNormalizationFactor
          # defined locally
          self.timeLeft = self.__getCPUTimeLeft()
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

    self.__setJobParam( jobID, 'ScaledCPUTime', str( scaledCPUTime - self.scaledCPUTime ) )
    self.scaledCPUTime = scaledCPUTime

    return S_OK( 'Job Agent cycle complete' )
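
The filling-mode branch in the example above publishes the remaining CPU time both to the Matcher (via setCPUTimeLeft) and to the local dirac.cfg read by submitted job wrappers. A hedged sketch of just the configuration part, using only the CFG calls shown above; the import paths are my assumption, since the snippet itself does not show them:

import os
from DIRAC import rootPath                   # assumed import path
from DIRAC.Core.Utilities.CFG import CFG     # assumed import path

def publish_cpu_time_left(time_left, extra_options=None):
    """Write /LocalSite/CPUTimeLeft into the configuration file that the
    submitted job wrappers will read, as in the filling-mode branch above."""
    if extra_options:
        local_config_file = os.path.join(".", extra_options)
    else:
        local_config_file = os.path.join(rootPath, "etc", "dirac.cfg")
    local_cfg = CFG()
    local_cfg.loadFromFile(local_config_file)
    if not local_cfg.isSection("/LocalSite"):
        local_cfg.createNewSection("/LocalSite")
    local_cfg.setOption("/LocalSite/CPUTimeLeft", time_left)
    local_cfg.writeToFile(local_config_file)
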
Example #10
0
def execute(arguments):

    global gJobReport

    jobID = arguments['Job']['JobID']
    os.environ['JOBID'] = jobID
    jobID = int(jobID)

    if arguments.has_key('WorkingDirectory'):
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(wdir)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except Exception:
                gLogger.exception(
                    'JobWrapperTemplate could not create working directory')
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Could Not Create Working Directory')
                return 1

    gJobReport = JobReport(jobID, 'JobWrapper')

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)
    except Exception as e:
        gLogger.exception('JobWrapper failed the initialization phase',
                          lException=e)
        rescheduleResult = rescheduleFailedJob(jobID,
                                               'Job Wrapper Initialization',
                                               gJobReport)
        try:
            job.sendJobAccounting(rescheduleResult,
                                  'Job Wrapper Initialization')
        except Exception as e:
            gLogger.exception('JobWrapper failed sending job accounting',
                              lException=e)
        return 1

    if arguments['Job'].has_key('InputSandbox'):
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except Exception:
            gLogger.exception('JobWrapper failed to download input sandbox')
            rescheduleResult = rescheduleFailedJob(jobID,
                                                   'Input Sandbox Download',
                                                   gJobReport)
            job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
    else:
        gLogger.verbose('Job has no InputSandbox requirement')

    gJobReport.commit()

    if arguments['Job'].has_key('InputData'):
        if arguments['Job']['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn(result['Message'])
                    raise JobWrapperError(result['Message'])
            except Exception as x:
                gLogger.exception('JobWrapper failed to resolve input data')
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Input Data Resolution', gJobReport)
                job.sendJobAccounting(rescheduleResult,
                                      'Input Data Resolution')
                return 1
        else:
            gLogger.verbose('Job has a null InputData requirement:')
            gLogger.verbose(arguments)
    else:
        gLogger.verbose('Job has no InputData requirement')

    gJobReport.commit()

    try:
        result = job.execute(arguments)
        if not result['OK']:
            gLogger.error('Failed to execute job', result['Message'])
            raise JobWrapperError(result['Message'])
    except Exception as x:
        if str(x) == '0':
            gLogger.verbose('JobWrapper exited with status=0 after execution')
        else:
            gLogger.exception('Job failed in execution phase')
            gJobReport.setJobParameter('Error Message', str(x), sendFlag=False)
            gJobReport.setJobStatus('Failed',
                                    'Exception During Execution',
                                    sendFlag=False)
            job.sendFailoverRequest('Failed', 'Exception During Execution')
            return 1

    if arguments['Job'].has_key('OutputSandbox') or arguments['Job'].has_key(
            'OutputData'):
        try:
            result = job.processJobOutputs(arguments)
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except Exception as x:
            gLogger.exception('JobWrapper failed to process output files')
            gJobReport.setJobParameter('Error Message', str(x), sendFlag=False)
            gJobReport.setJobStatus('Failed',
                                    'Uploading Job Outputs',
                                    sendFlag=False)
            job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
            return 2
    else:
        gLogger.verbose('Job has no OutputData or OutputSandbox requirement')

    try:
        # Failed jobs will return 1 / successful jobs will return 0
        return job.finalize(arguments)
    except Exception:
        gLogger.exception('JobWrapper failed the finalization phase')
        return 2
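
A recurring pattern in the wrapper code above is converting an S_ERROR-style result dictionary into a raised JobWrapperError, so that one except block can log, reschedule and set the return code. A minimal self-contained sketch of that pattern, with JobWrapperError replaced by a plain Exception subclass and the sandbox transfer stubbed out:

class JobWrapperError(Exception):
    """Stand-in for DIRAC's JobWrapperError; for illustration only."""

def transfer_input_sandbox(ok):
    # Stub mimicking a DIRAC S_OK/S_ERROR result dictionary
    return {"OK": True} if ok else {"OK": False, "Message": "download failed"}

def download_step(ok):
    try:
        result = transfer_input_sandbox(ok)
        if not result["OK"]:
            # Promote the error result to an exception ...
            raise JobWrapperError(result["Message"])
    except JobWrapperError as exc:
        # ... so a single handler can log, reschedule and bail out
        print("rescheduling job:", exc)
        return 1
    return 0

print(download_step(ok=False))   # -> rescheduling job: download failed, then 1
print(download_step(ok=True))    # -> 0
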
Example #11
0
    gLogger.verbose('Job has no InputData requirement')

  jobReport.commit()

  try:
    result = job.execute(arguments)
    if not result['OK']:
      gLogger.error(result['Message'])
      raise JobWrapperError(result['Message'])
  except Exception, x:
    if str(x) == '0':
      gLogger.verbose('JobWrapper exited with status=0 after execution')
      pass
    else:
      gLogger.exception('Job failed in execution phase')
      jobReport.setJobParameter('Error Message',str(x),sendFlag=False)
      jobReport.setJobStatus('Failed','Exception During Execution',sendFlag=False)
      job.sendFailoverRequest('Failed','Exception During Execution')
      return 1

  if arguments['Job'].has_key('OutputSandbox') or arguments['Job'].has_key('OutputData'):
    try:
      result = job.processJobOutputs(arguments)
      if not result['OK']:
        gLogger.warn(result['Message'])
        raise JobWrapperError(result['Message'])
    except Exception, x:
      gLogger.exception('JobWrapper failed to process output files')
      jobReport.setJobParameter('Error Message',str(x),sendFlag=False)
      jobReport.setJobStatus('Failed','Uploading Job Outputs',sendFlag=False)
      job.sendFailoverRequest('Failed','Uploading Job Outputs')
Example #12
0
def execute( arguments ):

  global gJobReport

  jobID = arguments['Job']['JobID']
  os.environ['JOBID'] = jobID
  jobID = int( jobID )

  if arguments.has_key( 'WorkingDirectory' ):
    wdir = os.path.expandvars( arguments['WorkingDirectory'] )
    if os.path.isdir( wdir ):
      os.chdir( wdir )
    else:
      try:
        os.makedirs( wdir )
        if os.path.isdir( wdir ):
          os.chdir( wdir )
      except Exception:
        gLogger.exception( 'JobWrapperTemplate could not create working directory' )
        rescheduleResult = rescheduleFailedJob( jobID, 'Could Not Create Working Directory' )
        return 1

  gJobReport = JobReport( jobID, 'JobWrapper' )

  try:
    job = JobWrapper( jobID, gJobReport )
    job.initialize( arguments )
  except Exception as e:
    gLogger.exception( 'JobWrapper failed the initialization phase', lException = e )
    rescheduleResult = rescheduleFailedJob( jobID, 'Job Wrapper Initialization', gJobReport )
    try:
      job.sendJobAccounting( rescheduleResult, 'Job Wrapper Initialization' )
    except Exception as e:
      gLogger.exception( 'JobWrapper failed sending job accounting', lException = e )
    return 1

  if arguments['Job'].has_key( 'InputSandbox' ):
    gJobReport.commit()
    try:
      result = job.transferInputSandbox( arguments['Job']['InputSandbox'] )
      if not result['OK']:
        gLogger.warn( result['Message'] )
        raise JobWrapperError( result['Message'] )
    except Exception:
      gLogger.exception( 'JobWrapper failed to download input sandbox' )
      rescheduleResult = rescheduleFailedJob( jobID, 'Input Sandbox Download', gJobReport )
      job.sendJobAccounting( rescheduleResult, 'Input Sandbox Download' )
      return 1
  else:
    gLogger.verbose( 'Job has no InputSandbox requirement' )

  gJobReport.commit()

  if arguments['Job'].has_key( 'InputData' ):
    if arguments['Job']['InputData']:
      try:
        result = job.resolveInputData()
        if not result['OK']:
          gLogger.warn( result['Message'] )
          raise JobWrapperError( result['Message'] )
      except Exception as x:
        gLogger.exception( 'JobWrapper failed to resolve input data' )
        rescheduleResult = rescheduleFailedJob( jobID, 'Input Data Resolution', gJobReport )
        job.sendJobAccounting( rescheduleResult, 'Input Data Resolution' )
        return 1
    else:
      gLogger.verbose( 'Job has a null InputData requirement:' )
      gLogger.verbose( arguments )
  else:
    gLogger.verbose( 'Job has no InputData requirement' )

  gJobReport.commit()

  try:
    result = job.execute( arguments )
    if not result['OK']:
      gLogger.error( 'Failed to execute job', result['Message'] )
      raise JobWrapperError( result['Message'] )
  except Exception as x:
    if str( x ) == '0':
      gLogger.verbose( 'JobWrapper exited with status=0 after execution' )
    else:
      gLogger.exception( 'Job failed in execution phase' )
      gJobReport.setJobParameter( 'Error Message', str( x ), sendFlag = False )
      gJobReport.setJobStatus( 'Failed', 'Exception During Execution', sendFlag = False )
      job.sendFailoverRequest( 'Failed', 'Exception During Execution' )
      return 1

  if arguments['Job'].has_key( 'OutputSandbox' ) or arguments['Job'].has_key( 'OutputData' ):
    try:
      result = job.processJobOutputs( arguments )
      if not result['OK']:
        gLogger.warn( result['Message'] )
        raise JobWrapperError( result['Message'] )
    except Exception as x:
      gLogger.exception( 'JobWrapper failed to process output files' )
      gJobReport.setJobParameter( 'Error Message', str( x ), sendFlag = False )
      gJobReport.setJobStatus( 'Failed', 'Uploading Job Outputs', sendFlag = False )
      job.sendFailoverRequest( 'Failed', 'Uploading Job Outputs' )
      return 2
  else:
    gLogger.verbose( 'Job has no OutputData or OutputSandbox requirement' )

  try:
    # Failed jobs will return 1 / successful jobs will return 0
    return job.finalize( arguments )
  except Exception:
    gLogger.exception( 'JobWrapper failed the finalization phase' )
    return 2
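
The wrapper templates above follow a small exit-code convention: 0 when job.finalize() reports a successful job, 1 for failures that fail or reschedule the job (initialization, sandboxes, input data, execution, or a failed job reported by finalize), and 2 for output-processing or finalization problems. A purely illustrative helper (not part of DIRAC) spelling that convention out:

def describe_wrapper_exit(code):
    """Human-readable meaning of a JobWrapperTemplate exit code (illustrative only)."""
    meanings = {
        0: "payload finished successfully",
        1: "job failed (initialization, sandbox, input data, execution or finalize) and was reported/rescheduled",
        2: "payload ran, but output processing or finalization failed",
    }
    return meanings.get(code, "unexpected exit code %s" % code)

for rc in (0, 1, 2):
    print(rc, "->", describe_wrapper_exit(rc))
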
Example #13
0
        gLogger.verbose('Job has no InputData requirement')

    jobReport.commit()

    try:
        result = job.execute(arguments)
        if not result['OK']:
            gLogger.error(result['Message'])
            raise JobWrapperError(result['Message'])
    except Exception, x:
        if str(x) == '0':
            gLogger.verbose('JobWrapper exited with status=0 after execution')
            pass
        else:
            gLogger.exception('Job failed in execution phase')
            jobReport.setJobParameter('Error Message', str(x), sendFlag=False)
            jobReport.setJobStatus('Failed',
                                   'Exception During Execution',
                                   sendFlag=False)
            job.sendFailoverRequest('Failed', 'Exception During Execution')
            return 1

    if arguments['Job'].has_key('OutputSandbox') or arguments['Job'].has_key(
            'OutputData'):
        try:
            result = job.processJobOutputs(arguments)
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except Exception, x:
            gLogger.exception('JobWrapper failed to process output files')
Example #14
0
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info("Attempting to check CPU time left for filling mode")
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info("%s normalized CPU units remaining in slot" % (self.timeLeft))
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
                if not result["OK"]:
                    return self.__finish(result["Message"])
            else:
                return self.__finish("Filling Mode is Disabled")

        self.log.verbose("Job Agent execution loop")
        available = self.computingElement.available()
        if not available["OK"] or not available["Value"]:
            self.log.info("Resource is not available")
            self.log.info(available["Message"])
            return self.__finish("CE Not Available")

        self.log.info(available["Message"])

        result = self.computingElement.getDescription()
        if not result["OK"]:
            return result
        ceDict = result["Value"]

        # Add pilot information
        gridCE = gConfig.getValue("LocalSite/GridCE", "Unknown")
        if gridCE != "Unknown":
            ceDict["GridCE"] = gridCE
        if not "PilotReference" in ceDict:
            ceDict["PilotReference"] = str(self.pilotReference)
        ceDict["PilotBenchmark"] = self.cpuFactor
        ceDict["PilotInfoReportedFlag"] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict("/AgentJobRequirements")
        if result["OK"]:
            requirementsDict = result["Value"]
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info("MatcherTime = %.2f (s)" % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", self.stopAfterFailedMatches)

        if not jobRequest["OK"]:
            if re.search("No match found", jobRequest["Message"]):
                self.log.notice("Job request OK: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("seconds timeout") != -1:
                self.log.error(jobRequest["Message"])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("Pilot version does not match") != -1:
                self.log.error(jobRequest["Message"])
                return S_ERROR(jobRequest["Message"])
            else:
                self.log.notice("Failed to get jobs: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest["Value"]
        jobID = matcherInfo["JobID"]
        self.pilotInfoReportedFlag = matcherInfo.get("PilotInfoReportedFlag", False)
        matcherParams = ["JDL", "DN", "Group"]
        for param in matcherParams:
            if not matcherInfo.has_key(param):
                self.__report(jobID, "Failed", "Matcher did not return %s" % (param))
                return self.__finish("Matcher Failed")
            elif not matcherInfo[param]:
                self.__report(jobID, "Failed", "Matcher returned null %s" % (param))
                return self.__finish("Matcher Failed")
            else:
                self.log.verbose("Matcher returned %s = %s " % (param, matcherInfo[param]))

        jobJDL = matcherInfo["JDL"]
        jobGroup = matcherInfo["Group"]
        ownerDN = matcherInfo["DN"]

        optimizerParams = {}
        for key in matcherInfo.keys():
            if not key in matcherParams:
                value = matcherInfo[key]
                optimizerParams[key] = value

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters["OK"]:
            self.__report(jobID, "Failed", "Could Not Extract JDL Parameters")
            self.log.warn(parameters["Message"])
            return self.__finish("JDL Problem")

        params = parameters["Value"]
        if not params.has_key("JobID"):
            msg = "Job has not JobID defined in JDL parameters"
            self.__report(jobID, "Failed", msg)
            self.log.warn(msg)
            return self.__finish("JDL Problem")
        else:
            jobID = params["JobID"]

        if not params.has_key("JobType"):
            self.log.warn("Job has no JobType defined in JDL parameters")
            jobType = "Unknown"
        else:
            jobType = params["JobType"]

        if not params.has_key("SystemConfig"):
            self.log.warn("Job has no system configuration defined in JDL parameters")
            systemConfig = gConfig.getValue("/LocalSite/Architecture", "")
            self.log.info(
                "Setting system config to /LocalSite/Architecture = %s since it was not specified" % systemConfig
            )
            if not systemConfig:
                self.log.warn("/LocalSite/Architecture is not defined")
            params["SystemConfig"] = systemConfig
        else:
            systemConfig = params["SystemConfig"]
            if systemConfig.lower() == "any":
                systemConfig = gConfig.getValue("/LocalSite/Architecture", "")
                self.log.info(
                    "Setting SystemConfig = /LocalSite/Architecture =",
                    '"%s" since it was set to "ANY" in the job description' % systemConfig,
                )
                if not systemConfig:
                    self.log.warn("/LocalSite/Architecture is not defined")
                params["SystemConfig"] = systemConfig

        if not params.has_key("CPUTime"):
            self.log.warn("Job has no CPU requirement defined in JDL parameters")

        self.log.verbose("Job request successful: \n %s" % (jobRequest["Value"]))
        self.log.info("Received JobID=%s, JobType=%s, SystemConfig=%s" % (jobID, jobType, systemConfig))
        self.log.info("OwnerDN: %s JobGroup: %s" % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
            jobReport.setJobParameter("MatcherServiceTime", str(matchTime), sendFlag=False)
            if self.gridCEQueue:
                jobReport.setJobParameter("GridCEQueue", self.gridCEQueue, sendFlag=False)

            if os.environ.has_key("BOINC_JOB_ID"):
                # Report BOINC environment
                for p in ["BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"]:
                    jobReport.setJobParameter(p, gConfig.getValue("/LocalSite/%s" % p, "Unknown"), sendFlag=False)

            jobReport.setJobStatus("Matched", "Job Received by Agent")
            # self.__setJobSite( jobID, self.siteName )
            if not self.pilotInfoReportedFlag:
                self.__reportPilotInfo(jobID)
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result["OK"]:
                return self.__rescheduleFailedJob(jobID, result["Message"], params, self.stopOnApplicationFailure)
            if "Value" in result and result["Value"]:
                proxyChain = result["Value"]

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software["OK"]:
                self.log.error("Failed to install software for job %s" % (jobID))
                errorMsg = software["Message"]
                if not errorMsg:
                    errorMsg = "Failed software installation"
                return self.__rescheduleFailedJob(jobID, errorMsg, params, self.stopOnApplicationFailure)

            self.log.verbose("Before %sCE submitJob()" % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict, optimizerParams, jobJDL, proxyChain)
            if not submission["OK"]:
                self.__report(jobID, "Failed", submission["Message"])
                return self.__finish(submission["Message"])
            elif "PayloadFailed" in submission:
                # Do not keep running and do not overwrite the Payload error
                return self.__finish(
                    "Payload execution failed with error code %s" % submission["PayloadFailed"],
                    self.stopOnApplicationFailure,
                )

            self.log.verbose("After %sCE submitJob()" % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, "Job processing failed with exception", params, self.stopOnApplicationFailure
            )

        currentTimes = list(os.times())
        for i in range(len(currentTimes)):
            currentTimes[i] -= self.initTimes[i]

        utime, stime, cutime, cstime, elapsed = currentTimes
        cpuTime = utime + stime + cutime + cstime

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result["OK"]:
            self.timeLeft = result["Value"]
        else:
            if result["Message"] != "Current batch system is not supported":
                self.timeLeftError = result["Message"]
            else:
                if self.cpuFactor:
                    # if the batch system is not defined, use the CPUNormalizationFactor
                    # defined locally
                    self.timeLeft = self.__getCPUTimeLeft()
        scaledCPUTime = self.timeLeftUtil.getScaledCPU()["Value"]

        self.__setJobParam(jobID, "ScaledCPUTime", str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK("Job Agent cycle complete")
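
Example #14 is the only version here that resolves a SystemConfig value: when the JDL omits it or sets it to "ANY", the agent falls back to /LocalSite/Architecture. A standalone sketch of that fallback, with the configuration lookup passed in as a plain argument instead of gConfig:

def resolve_system_config(params, local_architecture):
    """Return the platform to use for the job, following the Example #14 logic.

    local_architecture stands in for gConfig.getValue('/LocalSite/Architecture', '')."""
    system_config = params.get("SystemConfig", "")
    if not system_config or system_config.lower() == "any":
        system_config = local_architecture
    params["SystemConfig"] = system_config
    return system_config

print(resolve_system_config({"SystemConfig": "ANY"}, "x86_64-centos7"))  # -> x86_64-centos7
print(resolve_system_config({}, "x86_64-centos7"))                       # -> x86_64-centos7
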
Example #15
0
def execute(arguments):
    """ The only real function executed here
  """

    global gJobReport

    jobID = arguments['Job']['JobID']
    os.environ['JOBID'] = jobID
    jobID = int(jobID)

    if 'WorkingDirectory' in arguments:
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(
                    wdir
                )  # this will raise an exception if wdir already exists (which is ~OK)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except OSError as osError:
                if osError.errno == errno.EEXIST and os.path.isdir(wdir):
                    gLogger.exception(
                        'JobWrapperTemplate found that the working directory already exists'
                    )
                    rescheduleResult = rescheduleFailedJob(
                        jobID, 'Working Directory already exists')
                else:
                    gLogger.exception(
                        'JobWrapperTemplate could not create working directory'
                    )
                    rescheduleResult = rescheduleFailedJob(
                        jobID, 'Could Not Create Working Directory')
                return 1

    gJobReport = JobReport(jobID, 'JobWrapper')

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)  # initialize doesn't return S_OK/S_ERROR
    except Exception as exc:  #pylint: disable=broad-except
        gLogger.exception('JobWrapper failed the initialization phase',
                          lException=exc)
        rescheduleResult = rescheduleFailedJob(jobID,
                                               'Job Wrapper Initialization',
                                               gJobReport)
        try:
            job.sendJobAccounting(rescheduleResult,
                                  'Job Wrapper Initialization')
        except Exception as exc:  #pylint: disable=broad-except
            gLogger.exception('JobWrapper failed sending job accounting',
                              lException=exc)
        return 1

    if 'InputSandbox' in arguments['Job']:
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except JobWrapperError:
            gLogger.exception('JobWrapper failed to download input sandbox')
            rescheduleResult = rescheduleFailedJob(jobID,
                                                   'Input Sandbox Download',
                                                   gJobReport)
            job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
        except Exception as exc:  #pylint: disable=broad-except
            gLogger.exception(
                'JobWrapper raised exception while downloading input sandbox',
                lException=exc)
            rescheduleResult = rescheduleFailedJob(jobID,
                                                   'Input Sandbox Download',
                                                   gJobReport)
            job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
    else:
        gLogger.verbose('Job has no InputSandbox requirement')

    gJobReport.commit()

    if 'InputData' in arguments['Job']:
        if arguments['Job']['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn(result['Message'])
                    raise JobWrapperError(result['Message'])
            except JobWrapperError:
                gLogger.exception('JobWrapper failed to resolve input data')
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Input Data Resolution', gJobReport)
                job.sendJobAccounting(rescheduleResult,
                                      'Input Data Resolution')
                return 1
            except Exception as exc:  #pylint: disable=broad-except
                gLogger.exception(
                    'JobWrapper raised exception while resolving input data',
                    lException=exc)
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Input Data Resolution', gJobReport)
                job.sendJobAccounting(rescheduleResult,
                                      'Input Data Resolution')
                return 1
        else:
            gLogger.verbose('Job has a null InputData requirement:')
            gLogger.verbose(arguments)
    else:
        gLogger.verbose('Job has no InputData requirement')

    gJobReport.commit()

    try:
        result = job.execute(arguments)
        if not result['OK']:
            gLogger.error('Failed to execute job', result['Message'])
            raise JobWrapperError((result['Message'], result['Errno']))
    except JobWrapperError as exc:
        if exc.value[1] == 0 or str(exc.value[0]) == '0':
            gLogger.verbose('JobWrapper exited with status=0 after execution')
        if exc.value[1] == DErrno.EWMSRESC:
            gLogger.warn("Asked to reschedule job")
            rescheduleResult = rescheduleFailedJob(jobID,
                                                   'JobWrapper execution',
                                                   gJobReport)
            job.sendJobAccounting(rescheduleResult, 'JobWrapper execution')
            return 1
        gLogger.exception('Job failed in execution phase')
        gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
        gJobReport.setJobStatus('Failed',
                                'Exception During Execution',
                                sendFlag=False)
        job.sendFailoverRequest('Failed', 'Exception During Execution')
        return 1
    except Exception as exc:  #pylint: disable=broad-except
        gLogger.exception('Job raised exception during execution phase',
                          lException=exc)
        gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
        gJobReport.setJobStatus('Failed',
                                'Exception During Execution',
                                sendFlag=False)
        job.sendFailoverRequest('Failed', 'Exception During Execution')
        return 1

    if 'OutputSandbox' in arguments['Job'] or 'OutputData' in arguments['Job']:
        try:
            result = job.processJobOutputs(arguments)
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except JobWrapperError as exc:
            gLogger.exception('JobWrapper failed to process output files')
            gJobReport.setJobParameter('Error Message',
                                       str(exc),
                                       sendFlag=False)
            gJobReport.setJobStatus('Failed',
                                    'Uploading Job Outputs',
                                    sendFlag=False)
            job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
            return 2
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                'JobWrapper raised exception while processing output files',
                lException=exc)
            gJobReport.setJobParameter('Error Message',
                                       str(exc),
                                       sendFlag=False)
            gJobReport.setJobStatus('Failed',
                                    'Uploading Job Outputs',
                                    sendFlag=False)
            job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
            return 2
    else:
        gLogger.verbose('Job has no OutputData or OutputSandbox requirement')

    try:
        # Failed jobs will return 1 / successful jobs will return 0
        return job.finalize()
    except Exception as exc:  #pylint: disable=broad-except
        gLogger.exception(
            'JobWrapper raised exception during the finalization phase',
            lException=exc)
        return 2
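
Examples #15 and #17 use errno to distinguish a working directory that already exists from one that genuinely cannot be created, and reschedule with a different message in each case. A minimal standalone sketch of that check:

import errno
import os

def prepare_working_directory(wdir):
    """Return None on success, or a failure reason mirroring the messages above."""
    if os.path.isdir(wdir):
        os.chdir(wdir)
        return None
    try:
        os.makedirs(wdir)   # raises OSError if wdir appears in the meantime
        os.chdir(wdir)
        return None
    except OSError as os_error:
        if os_error.errno == errno.EEXIST and os.path.isdir(wdir):
            return "Working Directory already exists"
        return "Could Not Create Working Directory"
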
Example #16
0
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            # Temporary mechanism to pass a shutdown message to the agent
            if os.path.exists('/var/lib/dirac_drain'):
                return self.__finish('Node is being drained by an operator')
            # Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(
                        "Disabling filling mode as errors calculating time left",
                        self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('normalized CPU units remaining in slot',
                              self.timeLeft)
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish('No more time left')
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join('.', self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc",
                                                   "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection('/LocalSite'):
                    localCfg.createNewSection('/LocalSite')
                localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        result = self.computingElement.available()
        if not result['OK']:
            self.log.info('Resource is not available', result['Message'])
            return self.__finish('CE Not Available')

        ceInfoDict = result['CEInfoDict']
        runningJobs = ceInfoDict.get("RunningJobs")
        availableSlots = result['Value']

        if not availableSlots:
            if runningJobs:
                self.log.info('No available slots',
                              '%d running jobs' % runningJobs)
                return S_OK('Job Agent cycle complete with %d running jobs' %
                            runningJobs)
            else:
                self.log.info('CE is not available')
                return self.__finish('CE Not Available')

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result

        # We can have several prioritized job retrieval strategies
        if isinstance(result['Value'], dict):
            ceDictList = [result['Value']]
        elif isinstance(result['Value'], list):
            # This is the case for Pool ComputingElement, and parameter 'MultiProcessorStrategy'
            ceDictList = result['Value']

        for ceDict in ceDictList:

            # Add pilot information
            gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
            if gridCE != 'Unknown':
                ceDict['GridCE'] = gridCE
            if 'PilotReference' not in ceDict:
                ceDict['PilotReference'] = str(self.pilotReference)
            ceDict['PilotBenchmark'] = self.cpuFactor
            ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

            # Add possible job requirements
            result = gConfig.getOptionsDict('/AgentJobRequirements')
            if result['OK']:
                requirementsDict = result['Value']
                ceDict.update(requirementsDict)
                self.log.info('Requirements:', requirementsDict)

            self.log.verbose('CE dict', ceDict)

            # here finally calling the matcher
            start = time.time()
            jobRequest = MatcherClient().requestJob(ceDict)
            matchTime = time.time() - start
            self.log.info('MatcherTime', '= %.2f (s)' % (matchTime))
            if jobRequest['OK']:
                break

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:
            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK, but no match found',
                                ': %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error('Timeout while requesting job',
                               jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                errorMsg = 'Pilot version does not match the production version'
                self.log.error(errorMsg,
                               jobRequest['Message'].replace(errorMsg, ''))
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs',
                                ': %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get(
                'PilotInfoReportedFlag', False)
        jobID = matcherInfo['JobID']
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if param not in matcherInfo:
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned',
                                 '%s = %s ' % (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self._getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn('Could Not Extract JDL Parameters',
                          parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if 'JobID' not in params:
            msg = 'Job has not JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if 'JobType' not in params:
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if 'CPUTime' not in params:
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        # Job requirements for determining the number of processors
        # the minimum number of processors requested
        processors = int(
            params.get('NumberOfProcessors',
                       int(params.get('MinNumberOfProcessors', 1))))
        # the maximum number of processors allowed to the payload
        maxNumberOfProcessors = int(params.get('MaxNumberOfProcessors', 0))
        # need or not the whole node for the job
        wholeNode = 'WholeNode' in params
        mpTag = 'MultiProcessor' in params.get('Tags', [])

        if self.extraOptions:
            params['Arguments'] += ' ' + self.extraOptions
            params['ExtraOptions'] = self.extraOptions

        self.log.verbose('Job request successful: \n', jobRequest['Value'])
        self.log.info(
            'Received', 'JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s' %
            (jobID, jobType, ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)

            if 'BOINC_JOB_ID' in os.environ:
                # Report BOINC environment
                for thisp in ('BoincUserID', 'BoincHostID',
                              'BoincHostPlatform', 'BoincHostName'):
                    jobReport.setJobParameter(thisp,
                                              gConfig.getValue(
                                                  '/LocalSite/%s' % thisp,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            result = self._setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self._rescheduleFailedJob(jobID, result['Message'],
                                                 self.stopOnApplicationFailure)
            proxyChain = result.get('Value')

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self._checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job',
                               '%s' % (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self._rescheduleFailedJob(jobID, errorMsg,
                                                 self.stopOnApplicationFailure)

            self.log.debug('Before self._submitJob() (%sCE)' % (self.ceName))
            result = self._submitJob(jobID, params, ceDict, optimizerParams,
                                     proxyChain, processors, wholeNode,
                                     maxNumberOfProcessors, mpTag)
            if not result['OK']:
                self.__report(jobID, 'Failed', result['Message'])
                return self.__finish(result['Message'])
            elif 'PayloadFailed' in result:
                # Do not keep running and do not overwrite the Payload error
                message = 'Payload execution failed with error code %s' % result[
                    'PayloadFailed']
                if self.stopOnApplicationFailure:
                    return self.__finish(message,
                                         self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug('After %sCE submitJob()' % (self.ceName))
        except Exception as subExcept:  # pylint: disable=broad-except
            self.log.exception("Exception in submission",
                               "",
                               lException=subExcept,
                               lExcInfo=True)
            return self._rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
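        # os.times() returns (user, system, children_user, children_system, elapsed),
        # so dropping the last entry keeps only the CPU components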
        cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

        result = self.timeLeftUtil.getTimeLeft(cpuTime, processors)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            if result['Message'] != 'Current batch system is not supported':
                self.timeLeftError = result['Message']
            else:
                # if the batch system is not defined, use the process time and the CPU normalization defined locally
                self.timeLeft = self._getCPUTimeLeft()

        return S_OK('Job Agent cycle complete')
Example #17
0
def execute(arguments):
    """The only real function executed here"""

    global gJobReport

    jobID = arguments["Job"].get("JobID", 0)
    os.environ["JOBID"] = str(jobID)
    jobID = int(jobID)

    if "WorkingDirectory" in arguments:
        wdir = os.path.expandvars(arguments["WorkingDirectory"])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(
                    wdir
                )  # this will raise an exception if wdir already exists (which is ~OK)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except OSError as osError:
                if osError.errno == errno.EEXIST and os.path.isdir(wdir):
                    gLogger.exception(
                        "JobWrapperTemplate found that the working directory already exists"
                    )
                    rescheduleResult = rescheduleFailedJob(
                        jobID, "Working Directory already exists")
                else:
                    gLogger.exception(
                        "JobWrapperTemplate could not create working directory"
                    )
                    rescheduleResult = rescheduleFailedJob(
                        jobID, "Could Not Create Working Directory")
                return 1

    gJobReport = JobReport(jobID, "JobWrapper")
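    # Status updates are accumulated in this report and only sent when commit()
    # is called; the same instance is handed to rescheduleFailedJob() below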

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)  # initialize doesn't return S_OK/S_ERROR
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception("JobWrapper failed the initialization phase",
                          lException=exc)
        rescheduleResult = rescheduleFailedJob(
            jobID=jobID,
            minorStatus=JobMinorStatus.JOB_WRAPPER_INITIALIZATION,
            jobReport=gJobReport)
        job.sendJobAccounting(
            status=rescheduleResult,
            minorStatus=JobMinorStatus.JOB_WRAPPER_INITIALIZATION)
        return 1

    if "InputSandbox" in arguments["Job"]:
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments["Job"]["InputSandbox"])
            if not result["OK"]:
                gLogger.warn(result["Message"])
                raise JobWrapperError(result["Message"])
        except JobWrapperError:
            gLogger.exception("JobWrapper failed to download input sandbox")
            rescheduleResult = rescheduleFailedJob(
                jobID=jobID,
                minorStatus=JobMinorStatus.DOWNLOADING_INPUT_SANDBOX,
                jobReport=gJobReport)
            job.sendJobAccounting(
                status=rescheduleResult,
                minorStatus=JobMinorStatus.DOWNLOADING_INPUT_SANDBOX)
            return 1
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                "JobWrapper raised exception while downloading input sandbox",
                lException=exc)
            rescheduleResult = rescheduleFailedJob(
                jobID=jobID,
                minorStatus=JobMinorStatus.DOWNLOADING_INPUT_SANDBOX,
                jobReport=gJobReport)
            job.sendJobAccounting(
                status=rescheduleResult,
                minorStatus=JobMinorStatus.DOWNLOADING_INPUT_SANDBOX)
            return 1
    else:
        gLogger.verbose("Job has no InputSandbox requirement")

    gJobReport.commit()

    if "InputData" in arguments["Job"]:
        if arguments["Job"]["InputData"]:
            try:
                result = job.resolveInputData()
                if not result["OK"]:
                    gLogger.warn(result["Message"])
                    raise JobWrapperError(result["Message"])
            except JobWrapperError:
                gLogger.exception("JobWrapper failed to resolve input data")
                rescheduleResult = rescheduleFailedJob(
                    jobID=jobID,
                    minorStatus=JobMinorStatus.INPUT_DATA_RESOLUTION,
                    jobReport=gJobReport)
                job.sendJobAccounting(
                    status=rescheduleResult,
                    minorStatus=JobMinorStatus.INPUT_DATA_RESOLUTION)
                return 1
            except Exception as exc:  # pylint: disable=broad-except
                gLogger.exception(
                    "JobWrapper raised exception while resolving input data",
                    lException=exc)
                rescheduleResult = rescheduleFailedJob(
                    jobID=jobID,
                    minorStatus=JobMinorStatus.INPUT_DATA_RESOLUTION,
                    jobReport=gJobReport)
                job.sendJobAccounting(
                    status=rescheduleResult,
                    minorStatus=JobMinorStatus.INPUT_DATA_RESOLUTION)
                return 1
        else:
            gLogger.verbose("Job has a null InputData requirement:")
            gLogger.verbose(arguments)
    else:
        gLogger.verbose("Job has no InputData requirement")

    gJobReport.commit()

    try:
        result = job.execute()
        if not result["OK"]:
            gLogger.error("Failed to execute job", result["Message"])
            raise JobWrapperError((result["Message"], result["Errno"]))
    except JobWrapperError as exc:
        if exc.value[1] == 0 or str(exc.value[0]) == "0":
            gLogger.verbose("JobWrapper exited with status=0 after execution")
        if exc.value[1] == DErrno.EWMSRESC:
            gLogger.warn("Asked to reschedule job")
            rescheduleResult = rescheduleFailedJob(
                jobID=jobID,
                minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION,
                jobReport=gJobReport)
            job.sendJobAccounting(
                status=rescheduleResult,
                minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION)
            return 1
        gLogger.exception("Job failed in execution phase")
        gJobReport.setJobParameter("Error Message", repr(exc), sendFlag=False)
        gJobReport.setJobStatus(
            status=JobStatus.FAILED,
            minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC,
            sendFlag=False)
        job.sendFailoverRequest()
        job.sendJobAccounting(status=JobStatus.FAILED,
                              minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC)
        return 1
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception("Job raised exception during execution phase",
                          lException=exc)
        gJobReport.setJobParameter("Error Message", repr(exc), sendFlag=False)
        gJobReport.setJobStatus(
            status=JobStatus.FAILED,
            minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC,
            sendFlag=False)
        job.sendFailoverRequest()
        job.sendJobAccounting(status=JobStatus.FAILED,
                              minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC)
        return 1

    if "OutputSandbox" in arguments["Job"] or "OutputData" in arguments["Job"]:
        try:
            result = job.processJobOutputs()
            if not result["OK"]:
                gLogger.warn(result["Message"])
                raise JobWrapperError(result["Message"])
        except JobWrapperError as exc:
            gLogger.exception("JobWrapper failed to process output files")
            gJobReport.setJobParameter("Error Message",
                                       repr(exc),
                                       sendFlag=False)
            gJobReport.setJobStatus(
                status=JobStatus.FAILED,
                minorStatus=JobMinorStatus.UPLOADING_JOB_OUTPUTS,
                sendFlag=False)
            job.sendFailoverRequest()
            job.sendJobAccounting(
                status=JobStatus.FAILED,
                minorStatus=JobMinorStatus.UPLOADING_JOB_OUTPUTS)

            return 2
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                "JobWrapper raised exception while processing output files",
                lException=exc)
            gJobReport.setJobParameter("Error Message",
                                       repr(exc),
                                       sendFlag=False)
            gJobReport.setJobStatus(
                status=JobStatus.FAILED,
                minorStatus=JobMinorStatus.UPLOADING_JOB_OUTPUTS,
                sendFlag=False)
            job.sendFailoverRequest()
            job.sendJobAccounting(
                status=JobStatus.FAILED,
                minorStatus=JobMinorStatus.UPLOADING_JOB_OUTPUTS)
            return 2
    else:
        gLogger.verbose("Job has no OutputData or OutputSandbox requirement")

    try:
        # Failed jobs return a non-zero code; successful jobs return 0
        return job.finalize()
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception(
            "JobWrapper raised exception during the finalization phase",
            lException=exc)
        return 2
Example #18
0
    def execute(self):
        """The JobAgent execution method."""

        # Temporary mechanism to pass a shutdown message to the agent
        if os.path.exists("/var/lib/dirac_drain"):
            return self._finish("Node is being drained by an operator")

        self.log.verbose("Job Agent execution loop")

        # Check that there are enough slots to match a job
        result = self._checkCEAvailability(self.computingElement)
        if not result["OK"]:
            return self._finish(result["Message"])
        if result["OK"] and result["Value"]:
            return result

        # Check that we are allowed to continue and that time left is sufficient
        if self.jobCount:
            cpuWorkLeft = self._computeCPUWorkLeft()
            result = self._checkCPUWorkLeft(cpuWorkLeft)
            if not result["OK"]:
                return result
            result = self._setCPUWorkLeft(cpuWorkLeft)
            if not result["OK"]:
                return result

        # Get environment details and enhance them
        result = self._getCEDict(self.computingElement)
        if not result["OK"]:
            return result
        ceDictList = result["Value"]

        for ceDict in ceDictList:
            self._setCEDict(ceDict)

        # Try to match a job
        jobRequest = self._matchAJob(ceDictList)

        self.stopAfterFailedMatches = self.am_getOption(
            "StopAfterFailedMatches", self.stopAfterFailedMatches)
        if not jobRequest["OK"]:
            res = self._checkMatchingIssues(jobRequest)
            if not res["OK"]:
                self._finish(res["Message"])
                return res

            # If we don't match a job, regardless of the reason,
            # we wait a bit longer before trying again
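            # The wait grows linearly with the number of consecutive failed matches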
            time.sleep(
                int(self.am_getOption("PollingTime")) *
                (self.matchFailedCount + 1) * 2)
            return res

        # If we get here, we matched a job
        # Reset the counter of consecutive failed matches
        self.matchFailedCount = 0

        # Check matcher information returned
        matcherParams = ["JDL", "DN", "Group"]
        matcherInfo = jobRequest["Value"]
        jobID = matcherInfo["JobID"]
        jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
        result = self._checkMatcherInfo(matcherInfo, matcherParams, jobReport)
        if not result["OK"]:
            return self._finish(result["Message"])

        # Get matcher information
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get(
                "PilotInfoReportedFlag", False)

        jobJDL = matcherInfo["JDL"]
        jobGroup = matcherInfo["Group"]
        ownerDN = matcherInfo["DN"]
        ceDict = matcherInfo["CEDict"]
        matchTime = matcherInfo["matchTime"]

        optimizerParams = {}
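        # Everything the Matcher returned beyond the mandatory keys checked above
        # is forwarded to the submission step as optimizer parameters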
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        # Get JDL parameters
        parameters = self._getJDLParameters(jobJDL)
        if not parameters["OK"]:
            jobReport.setJobStatus(
                status=JobStatus.FAILED,
                minorStatus="Could Not Extract JDL Parameters")
            self.log.warn("Could Not Extract JDL Parameters",
                          parameters["Message"])
            return self._finish("JDL Problem")

        params = parameters["Value"]
        result = self._extractValuesFromJobParams(params, jobReport)
        if not result["OK"]:
            return self._finish(result["Value"])
        submissionParams = result["Value"]
        jobID = submissionParams["jobID"]
        jobType = submissionParams["jobType"]

        self.log.verbose("Job request successful: \n", jobRequest["Value"])
        self.log.info(
            "Received", "JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s" %
            (jobID, jobType, ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport.setJobParameter(par_name="MatcherServiceTime",
                                      par_value=str(matchTime),
                                      sendFlag=False)

            if "BOINC_JOB_ID" in os.environ:
                # Report BOINC environment
                for thisp in ("BoincUserID", "BoincHostID",
                              "BoincHostPlatform", "BoincHostName"):
                    jobReport.setJobParameter(par_name=thisp,
                                              par_value=gConfig.getValue(
                                                  "/LocalSite/%s" % thisp,
                                                  "Unknown"),
                                              sendFlag=False)

            jobReport.setJobStatus(minorStatus="Job Received by Agent",
                                   sendFlag=False)
            result_setupProxy = self._setupProxy(ownerDN, jobGroup)
            if not result_setupProxy["OK"]:
                result = self._rescheduleFailedJob(
                    jobID, result_setupProxy["Message"])
                return self._finish(result["Message"],
                                    self.stopOnApplicationFailure)
            proxyChain = result_setupProxy.get("Value")

            # Save the job jdl for external monitoring
            self._saveJobJDLRequest(jobID, jobJDL)

            # Check the required software and install it if necessary
            software = self._checkInstallSoftware(jobID, params, ceDict,
                                                  jobReport)
            if not software["OK"]:
                self.log.error("Failed to install software for job",
                               "%s" % (jobID))
                errorMsg = software["Message"]
                if not errorMsg:
                    errorMsg = "Failed software installation"
                result = self._rescheduleFailedJob(jobID, errorMsg)
                return self._finish(result["Message"],
                                    self.stopOnApplicationFailure)

            gridCE = gConfig.getValue("/LocalSite/GridCE", "")
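            # Record the CE name and queue, when defined locally, as job parameters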
            if gridCE:
                jobReport.setJobParameter(par_name="GridCE",
                                          par_value=gridCE,
                                          sendFlag=False)

            queue = gConfig.getValue("/LocalSite/CEQueue", "")
            if queue:
                jobReport.setJobParameter(par_name="CEQueue",
                                          par_value=queue,
                                          sendFlag=False)

            self.log.debug("Before self._submitJob() (%sCE)" % (self.ceName))
            result_submitJob = self._submitJob(
                jobID=jobID,
                jobParams=params,
                resourceParams=ceDict,
                optimizerParams=optimizerParams,
                proxyChain=proxyChain,
                jobReport=jobReport,
                processors=submissionParams["processors"],
                wholeNode=submissionParams["wholeNode"],
                maxNumberOfProcessors=submissionParams[
                    "maxNumberOfProcessors"],
                mpTag=submissionParams["mpTag"],
            )

            # Committing the JobReport before evaluating the result of job submission
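            # If the direct commit fails, the accumulated records are turned into
            # a ForwardDISET operation and shipped inside a failover Request instead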
            res = jobReport.commit()
            if not res["OK"]:
                resFD = jobReport.generateForwardDISET()
                if not resFD["OK"]:
                    self.log.error("Error generating ForwardDISET operation",
                                   resFD["Message"])
                elif resFD["Value"]:
                    # Here we create the Request.
                    op = resFD["Value"]
                    request = Request()
                    requestName = "jobAgent_%s" % jobID
                    request.RequestName = requestName.replace('"', "")
                    request.JobID = jobID
                    request.SourceComponent = "JobAgent_%s" % jobID
                    request.addOperation(op)
                    # This might fail; in that case only an error message is printed.
                    self._sendFailoverRequest(request)

            if not result_submitJob["OK"]:
                return self._finish(result_submitJob["Message"])
            elif "PayloadFailed" in result_submitJob:
                # Do not keep running and do not overwrite the Payload error
                message = "Payload execution failed with error code %s" % result_submitJob[
                    "PayloadFailed"]
                if self.stopOnApplicationFailure:
                    return self._finish(message, self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug("After %sCE submitJob()" % (self.ceName))
        except Exception as subExcept:  # pylint: disable=broad-except
            self.log.exception("Exception in submission",
                               "",
                               lException=subExcept,
                               lExcInfo=True)
            result = self._rescheduleFailedJob(
                jobID, "Job processing failed with exception", direct=True)
            return self._finish(result["Message"],
                                self.stopOnApplicationFailure)

        return S_OK("Job Agent cycle complete")
Example #19
0
    def execute(self):
        """The JobAgent execution method."""
        self.log.verbose("Job Agent execution loop")

        queueDictItems = list(self.queueDict.items())
        random.shuffle(queueDictItems)
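        # Shuffling avoids always walking the queues in the same order on every cycle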

        # Check that there are enough slots locally
        result = self._checkCEAvailability(self.computingElement)
        if not result["OK"] or result["Value"]:
            return result

        for queueName, queueDictionary in queueDictItems:

            # Make sure there is no problem with the queue before trying to submit
            if not self._allowedToSubmit(queueName):
                continue

            # Get a working proxy
            ce = queueDictionary["CE"]
            cpuTime = 86400 * 3
            self.log.verbose(
                "Getting pilot proxy",
                "for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.pilotDN, self.pilotGroup, cpuTime)
            if not result["OK"]:
                return result
            proxy = result["Value"]
            result = proxy.getRemainingSecs()  # pylint: disable=no-member
            if not result["OK"]:
                return result
            lifetime_secs = result["Value"]
            ce.setProxy(proxy, lifetime_secs)

            # Check that there are enough slots in the remote CE to match a job
            result = self._checkCEAvailability(ce)
            if not result["OK"] or result["Value"]:
                self.failedQueues[queueName] += 1
                continue

            # Get environment details and enhance them
            result = self._getCEDict(ce)
            if not result["OK"]:
                self.failedQueues[queueName] += 1
                continue
            ceDictList = result["Value"]

            for ceDict in ceDictList:
                # Information about number of processors might not be returned in CE.getCEStatus()
                ceDict["NumberOfProcessors"] = ce.ceParameters.get(
                    "NumberOfProcessors")
                self._setCEDict(ceDict)

            # Update the configuration with the names of the Site, CE and queue to target
            # This is used in the next stages
            self._updateConfiguration("Site", queueDictionary["Site"])
            self._updateConfiguration("GridCE", queueDictionary["CEName"])
            self._updateConfiguration("CEQueue", queueDictionary["QueueName"])
            self._updateConfiguration("RemoteExecution", True)

            # Try to match a job
            jobRequest = self._matchAJob(ceDictList)
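            # Keep matching and submitting jobs to this queue for as long as the
            # Matcher returns work and both the local and remote CEs have free slots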
            while jobRequest["OK"]:

                # Check matcher information returned
                matcherParams = ["JDL", "DN", "Group"]
                matcherInfo = jobRequest["Value"]
                jobID = matcherInfo["JobID"]
                jobReport = JobReport(jobID, "PushJobAgent@%s" % self.siteName)
                result = self._checkMatcherInfo(matcherInfo, matcherParams,
                                                jobReport)
                if not result["OK"]:
                    self.failedQueues[queueName] += 1
                    break

                jobJDL = matcherInfo["JDL"]
                jobGroup = matcherInfo["Group"]
                ownerDN = matcherInfo["DN"]
                ceDict = matcherInfo["CEDict"]
                matchTime = matcherInfo["matchTime"]

                optimizerParams = {}
                for key in matcherInfo:
                    if key not in matcherParams:
                        optimizerParams[key] = matcherInfo[key]

                # Get JDL parameters
                parameters = self._getJDLParameters(jobJDL)
                if not parameters["OK"]:
                    jobReport.setJobStatus(
                        status=JobStatus.FAILED,
                        minorStatus="Could Not Extract JDL Parameters")
                    self.log.warn("Could Not Extract JDL Parameters",
                                  parameters["Message"])
                    self.failedQueues[queueName] += 1
                    break

                params = parameters["Value"]
                result = self._extractValuesFromJobParams(params, jobReport)
                if not result["OK"]:
                    self.failedQueues[queueName] += 1
                    break
                submissionParams = result["Value"]
                jobID = submissionParams["jobID"]
                jobType = submissionParams["jobType"]

                self.log.verbose("Job request successful: \n",
                                 jobRequest["Value"])
                self.log.info(
                    "Received",
                    "JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s" %
                    (jobID, jobType, ownerDN, jobGroup))
                try:
                    jobReport.setJobParameter(par_name="MatcherServiceTime",
                                              par_value=str(matchTime),
                                              sendFlag=False)
                    jobReport.setJobStatus(status=JobStatus.MATCHED,
                                           minorStatus="Job Received by Agent",
                                           sendFlag=False)

                    # Setup proxy
                    result_setupProxy = self._setupProxy(ownerDN, jobGroup)
                    if not result_setupProxy["OK"]:
                        result = self._rescheduleFailedJob(
                            jobID, result_setupProxy["Message"])
                        self.failedQueues[queueName] += 1
                        break
                    proxyChain = result_setupProxy.get("Value")

                    # Check the required software and install it if necessary
                    software = self._checkInstallSoftware(
                        jobID, params, ceDict, jobReport)
                    if not software["OK"]:
                        self.log.error("Failed to install software for job",
                                       "%s" % (jobID))
                        errorMsg = software["Message"]
                        if not errorMsg:
                            errorMsg = "Failed software installation"
                        result = self._rescheduleFailedJob(jobID, errorMsg)
                        self.failedQueues[queueName] += 1
                        break

                    # Submit the job to the CE
                    self.log.debug("Before self._submitJob() (%sCE)" %
                                   (self.ceName))
                    result_submitJob = self._submitJob(
                        jobID=jobID,
                        jobParams=params,
                        resourceParams=ceDict,
                        optimizerParams=optimizerParams,
                        proxyChain=proxyChain,
                        jobReport=jobReport,
                        processors=submissionParams["processors"],
                        wholeNode=submissionParams["wholeNode"],
                        maxNumberOfProcessors=submissionParams[
                            "maxNumberOfProcessors"],
                        mpTag=submissionParams["mpTag"],
                    )

                    # Committing the JobReport before evaluating the result of job submission
                    res = jobReport.commit()
                    if not res["OK"]:
                        resFD = jobReport.generateForwardDISET()
                        if not resFD["OK"]:
                            self.log.error(
                                "Error generating ForwardDISET operation",
                                resFD["Message"])
                        elif resFD["Value"]:
                            # Here we create the Request.
                            op = resFD["Value"]
                            request = Request()
                            requestName = "jobAgent_%s" % jobID
                            request.RequestName = requestName.replace('"', "")
                            request.JobID = jobID
                            request.SourceComponent = "JobAgent_%s" % jobID
                            request.addOperation(op)
                            # This might fail; in that case only an error message is printed.
                            self._sendFailoverRequest(request)

                    if not result_submitJob["OK"]:
                        self.log.error("Error during submission",
                                       result_submitJob["Message"])
                        self.failedQueues[queueName] += 1
                        break
                    elif "PayloadFailed" in result_submitJob:
                        # Do not keep running and do not overwrite the Payload error
                        message = "Payload execution failed with error code %s" % result_submitJob[
                            "PayloadFailed"]
                        self.log.info(message)

                    self.log.debug("After %sCE submitJob()" % (self.ceName))

                    # Check that there are enough slots locally
                    result = self._checkCEAvailability(self.computingElement)
                    if not result["OK"] or result["Value"]:
                        return result

                    # Check that there are enough slots in the remote CE to match a new job
                    result = self._checkCEAvailability(ce)
                    if not result["OK"] or result["Value"]:
                        self.failedQueues[queueName] += 1
                        break

                    # Try to match a new job
                    jobRequest = self._matchAJob(ceDictList)
                except Exception as subExcept:  # pylint: disable=broad-except
                    self.log.exception("Exception in submission",
                                       "",
                                       lException=subExcept,
                                       lExcInfo=True)
                    result = self._rescheduleFailedJob(
                        jobID, "Job processing failed with exception")
                    self.failedQueues[queueName] += 1
                    break

            if not jobRequest["OK"]:
                self._checkMatchingIssues(jobRequest)
                self.failedQueues[queueName] += 1
                continue

        return S_OK("Push Job Agent cycle complete")