Example #1
0
  def _getCurrentConfig(self):
    """Build a CFG with the Agents, Services and Executors sections of the current setup.

    The configuration is refreshed, then every system listed in the options of
    ``/DIRAC/Setups/<Setup>`` (limited to ``self.systems`` when that is not empty)
    has its ``/Systems/<system>/<value>`` section copied from the remote CFG and
    stripped of all sub-sections other than the three component ones.

    :returns: S_OK(CFG) with one top-level section per system, S_ERROR otherwise
    """
    from DIRAC.ConfigurationSystem.Client.ConfigurationData import gConfigurationData
    gConfig.forceRefresh()

    fullCfg = CFG()
    setup = gConfig.getValue('/DIRAC/Setup', '')

    sectionsRes = gConfig.getSections('/DIRAC/Setups', [])
    if not sectionsRes['OK']:
      return S_ERROR('Could not get /DIRAC/Setups sections')
    knownSetups = sectionsRes['Value']
    if setup not in knownSetups:
      return S_ERROR('Setup %s is not in allowed list: %s' % (setup, ', '.join(knownSetups)))

    optionsRes = gConfig.getOptionsDict('/DIRAC/Setups/%s' % setup)
    if not optionsRes['OK']:
      return S_ERROR('Could not get /DIRAC/Setups/%s options' % setup)

    componentSections = ('Agents', 'Services', 'Executors')
    for system, instance in optionsRes['Value'].items():
      if self.systems and system not in self.systems:
        continue
      systemCfg = gConfigurationData.remoteCFG.getAsCFG("/Systems/%s/%s" % (system, instance))
      # keep only the component sections of this system
      unwanted = [section for section in systemCfg.listSections() if section not in componentSections]
      for section in unwanted:
        systemCfg.deleteKey(section)
      fullCfg.createNewSection("/%s" % system, contents=systemCfg)

    return S_OK(fullCfg)
Example #2
0
  def _parseConfigTemplate(self, templatePath, cfg=None):
    """Load one ConfigTemplate.cfg and merge it into cfg below its system name.

    :param str templatePath: path to the folder containing a ConfigTemplate.cfg file
    :param CFG cfg: cfg to merge with the systems config, a new CFG is used when None
    :returns: S_OK(CFG) with the merged configuration, S_ERROR when the template file is missing
    """
    if cfg is None:
      cfg = CFG()

    # the system name is the last folder of the path, without a trailing "System"
    folderName = os.path.split(templatePath.rstrip("/"))[1]
    system = folderName[:-len('System')] if folderName.lower().endswith('system') else folderName

    if self.systems and system not in self.systems:
      # this system was not asked for: hand the cfg back untouched
      return S_OK(cfg)

    templateFile = os.path.join(templatePath, 'ConfigTemplate.cfg')
    if not os.path.exists(templateFile):
      return S_ERROR("File not found: %s" % templateFile)

    templateCfg = CFG()
    templateCfg.loadFromFile(templateFile)

    wrapperCfg = CFG()
    wrapperCfg.createNewSection("/%s" % system, contents=templateCfg)

    return S_OK(cfg.mergeWith(wrapperCfg))
Example #3
0
class ProcessList(object):
  """ The ProcessList uses internally the CFG utility to store the processes and their properties.

  :param str location: path of the process list file. When the file exists it is
    loaded and a 'Processes' section is created if missing; otherwise
    ``goodProcessList`` is set to False.
  """
  def __init__(self, location):
    self.cfg = CFG()
    self.location = location
    self.goodProcessList = True
    if os.path.exists(self.location):
      self.cfg.loadFromFile(self.location)
      if not self.cfg.existsKey('Processes'):
        self.cfg.createNewSection('Processes')
    else:
      self.goodProcessList = False

  def _writeProcessList(self, path):
    """ Write the process list to a temporary file and move it to path.

    :param str path: destination of the process list
    :returns: True on success, the falsy result of ``writeToFile`` when the
      temporary file could not be written, False when the move failed (the
      temporary file is then left in place as a backup)
    """
    handle, tmpName = tempfile.mkstemp()
    # Only the name is needed: close the descriptor at once so it cannot leak
    # should writeToFile raise
    os.close(handle)
    written = self.cfg.writeToFile(tmpName)
    if not written:
      if os.path.exists(tmpName):
        os.remove(tmpName)
      return written
    if os.path.exists(path):
      LOG.debug("Replacing %s" % path)
    try:
      shutil.move(tmpName, path)
      return True
    except (IOError, OSError) as err:
      # 'as' works on Python 2.6+ and 3; IOError is listed because on Python 2
      # a failing copy inside shutil.move raises it rather than OSError
      LOG.error("Failed to overwrite process list.", err)
      LOG.info("If your process list is corrupted a backup can be found %s" % tmpName)
      return False
Example #4
0
class ProcessList(object):
  """ The ProcessList uses internally the CFG utility to store the processes and their properties.

  :param str location: path of the process list file. When the file exists it is
    loaded and a 'Processes' section is created if missing; otherwise
    ``goodProcessList`` is set to False.
  """
  def __init__(self, location):
    self.cfg = CFG()
    self.location = location
    self.goodProcessList = True
    if os.path.exists(self.location):
      self.cfg.loadFromFile(self.location)
      if not self.cfg.existsKey('Processes'):
        self.cfg.createNewSection('Processes')
    else:
      self.goodProcessList = False

  def _writeProcessList(self, path):
    """ Write the process list to a temporary file and move it to path.

    :param str path: destination of the process list
    :returns: True on success, the falsy result of ``writeToFile`` when the
      temporary file could not be written, False when the move failed (the
      temporary file is then left in place as a backup)
    """
    handle, tmpName = tempfile.mkstemp()
    # Only the name is needed: close the descriptor at once so it cannot leak
    # should writeToFile raise
    os.close(handle)
    written = self.cfg.writeToFile(tmpName)
    if not written:
      if os.path.exists(tmpName):
        os.remove(tmpName)
      return written
    if os.path.exists(path):
      gLogger.debug("Replacing %s" % path)
    try:
      shutil.move(tmpName, path)
      return True
    except (IOError, OSError) as err:
      # 'as' works on Python 2.6+ and 3; IOError is listed because on Python 2
      # a failing copy inside shutil.move raises it rather than OSError
      gLogger.error("Failed to overwrite process list.", err)
      gLogger.info("If your process list is corrupted a backup can be found %s" % tmpName)
      return False
Example #5
0
 def getSystemsCFG(self):
   """Gather every ConfigTemplate reported by findConfigTemplates into one CFG.

   Templates for which parseConfigTemplate does not return OK are skipped.

   :returns: CFG object, created with a '/Systems' section
   """
   collated = CFG()
   collated.createNewSection('/Systems')
   for location in self.findConfigTemplates():
     parsed = self.parseConfigTemplate(location, collated)
     if not parsed['OK']:
       continue
     collated = parsed['Value']
   return collated
Example #6
0
 def toCFG(self):
   """ Serialise the status, size, GUID and checksum of the file as CFG text.

   The section is named after the LFN with every '/' replaced by '&&'.

   :returns: S_OK(str) with the string form of the CFG
   """
   fileCFG = CFG()
   sectionName = self.lfn.replace('/','&&')
   fileCFG.createNewSection(sectionName)
   optionTemplates = (('%s/Status', self.status),
                      ('%s/Size', self.size),
                      ('%s/GUID', self.guid),
                      ('%s/Checksum', self.checksum))
   for template, value in optionTemplates:
     fileCFG.setOption(template % (sectionName), value)
   # TODO: still have to include the CFG from the replica objects
   if self.catalogReplicas:
     fileCFG.createNewSection('%s/CatalogReplicas' % sectionName)
     for _replica in self.catalogReplicas:
       # placeholder: the replica CFG is not merged in yet
       pass
   return S_OK(str(fileCFG))
Example #7
0
 def toCFG(self):
     """ Describe the file (status, size, GUID, checksum) in CFG format.

     The LFN, with '/' replaced by '&&', names the section holding the options.

     :returns: S_OK(str) with the string form of the CFG
     """
     description = CFG()
     lfnKey = self.lfn.replace('/', '&&')
     description.createNewSection(lfnKey)
     templates = ('%s/Status', '%s/Size', '%s/GUID', '%s/Checksum')
     values = (self.status, self.size, self.guid, self.checksum)
     for template, value in zip(templates, values):
         description.setOption(template % (lfnKey), value)
     # TODO: still have to include the CFG from the replica objects
     if self.catalogReplicas:
         description.createNewSection('%s/CatalogReplicas' % lfnKey)
         for _replica in self.catalogReplicas:
             # nothing is merged from the replicas so far
             pass
     return S_OK(str(description))
def checkFunction():
  """ Get the CPU normalisation from MJF or calculate it, report it and optionally store it.

  The power estimated by getCPUNormalization is always reported. The value from
  getPowerFromMJF, when there is one, is reported too and takes precedence for
  the CPUScalingFactor option. ``update`` and ``configFile`` are not defined in
  this function (presumably module-level script switches): with ``update`` and
  no ``configFile`` the values go to the local DIRAC configuration, with
  ``configFile`` they are written to that file.

  The function ends by calling ``DIRAC.exit``: with code 1 when the
  normalisation could not be obtained, without argument otherwise.
  """
  from DIRAC.WorkloadManagementSystem.Client.CPUNormalization import getPowerFromMJF
  from ILCDIRAC.Core.Utilities.CPUNormalization import getCPUNormalization
  from DIRAC import gLogger, gConfig

  result = getCPUNormalization()

  if not result['OK']:
    gLogger.error( result['Message'] )
    # Without a normalisation there is nothing to report or store; stop here
    # rather than fail on the missing 'Value' key below
    DIRAC.exit( 1 )

  norm = round( result['Value']['NORM'], 1 )

  gLogger.notice( 'Estimated CPU power is %.1f %s' % ( norm, result['Value']['UNIT'] ) )

  mjfPower = getPowerFromMJF()
  if mjfPower:
    gLogger.notice( 'CPU power from MJF is %.1f HS06' % mjfPower )
  else:
    gLogger.notice( 'MJF not available on this node' )

  # The MJF value is preferred over the local estimate when it is available
  scalingFactor = mjfPower if mjfPower else norm

  if update and not configFile:
    gConfig.setOptionValue( '/LocalSite/CPUScalingFactor', scalingFactor )
    gConfig.setOptionValue( '/LocalSite/CPUNormalizationFactor', norm )

    gConfig.dumpLocalCFGToFile( gConfig.diracConfigFilePath )
  if configFile:
    from DIRAC.Core.Utilities.CFG import CFG
    cfg = CFG()
    try:
      # Attempt to open the given file
      cfg.loadFromFile( configFile )
    except Exception as err:  # pylint: disable=broad-except
      # Best effort: an unreadable file just means starting from an empty cfg,
      # but interrupts and exit requests are no longer swallowed
      gLogger.verbose( 'Could not load %s, starting from an empty configuration' % configFile, repr( err ) )
    # Create the section if it does not exist
    if not cfg.existsKey( 'LocalSite' ):
      cfg.createNewSection( 'LocalSite' )
    cfg.setOption( '/LocalSite/CPUScalingFactor', scalingFactor )
    cfg.setOption( '/LocalSite/CPUNormalizationFactor', norm )

    cfg.writeToFile( configFile )


  DIRAC.exit()
Example #9
0
def __gConfigDefaults(defaultPath):
  """
  Copy the sections found below defaultPath, with their options, into a new CFG

  :param str defaultPath: configuration path whose sub-sections are collected
  :returns: CFG object, left empty when the sections of defaultPath cannot be listed
  """
  from DIRAC import gConfig
  defaults = CFG()
  sectionsRes = gConfig.getSections(defaultPath)
  if sectionsRes['OK']:
    for sectionName in sectionsRes['Value']:
      sectionPath = cfgPath(defaultPath, sectionName)
      defaults.createNewSection(sectionName)
      optionsRes = gConfig.getOptionsDict(sectionPath)
      if not optionsRes['OK']:
        # the section stays in the result, just without options
        continue
      for option, value in optionsRes['Value'].items():
        defaults[sectionName].setOption(option, value)

  return defaults
Example #10
0
def __gConfigDefaults(defaultPath):
    """
    Build a cfg mirroring the sub-sections of a Default Section and their options

    :param str defaultPath: configuration path whose sub-sections are collected
    :returns: CFG object, left empty when the sections of defaultPath cannot be listed
    """
    from DIRAC import gConfig
    collected = CFG()
    listing = gConfig.getSections(defaultPath)
    if not listing['OK']:
        return collected

    for typeName in listing['Value']:
        optionsPath = cfgPath(defaultPath, typeName)
        collected.createNewSection(typeName)
        lookup = gConfig.getOptionsDict(optionsPath)
        # sections whose options cannot be read are kept, but stay empty
        typeOptions = lookup['Value'] if lookup['OK'] else {}
        for optionName, optionValue in typeOptions.items():
            collected[typeName].setOption(optionName, optionValue)

    return collected
Example #11
0
def getComputingElementDefaults(ceName='',
                                ceType='',
                                cfg=None,
                                currentSectionPath=''):
    """
    Return cfgDefaults with defaults for the given CEs defined either in arguments or in the provided cfg

    :param str ceName: name of a CE section to create (if missing) and fill from the arguments
    :param str ceType: value stored as the 'CEType' option of ceName
    :param str cfg: path of a cfg file to load the CE definitions from
    :param str currentSectionPath: path handed to __getExtraOptions to get extra options for ceName
    :returns: CFG object with one section per CE; an empty CFG if loading cfg fails
    """
    cesCfg = CFG()
    if cfg:
        try:
            cesCfg.loadFromFile(cfg)
            cesPath = cfgInstallPath('ComputingElements')
            if cesCfg.isSection(cesPath):
                # descend to the ComputingElements section
                for section in cfgPathToList(cesPath):
                    cesCfg = cesCfg[section]
        except Exception:  # pylint: disable=broad-except
            # Best effort on any loading problem, without trapping
            # KeyboardInterrupt / SystemExit as a bare except would
            return CFG()

    # Overwrite the cfg with Command line arguments
    if ceName:
        if not cesCfg.isSection(ceName):
            cesCfg.createNewSection(ceName)
        if currentSectionPath:
            # Add Options from Command Line
            optionsDict = __getExtraOptions(currentSectionPath)
            for name, value in optionsDict.items():
                cesCfg[ceName].setOption(name, value)  #pylint: disable=no-member
        if ceType:
            cesCfg[ceName].setOption('CEType', ceType)  #pylint: disable=no-member

    ceDefaultSection = cfgPath(defaultSection('ComputingElements'))
    # Load Default for the given type from Central configuration is defined
    ceDefaults = __gConfigDefaults(ceDefaultSection)
    # Dedicated loop names, so the ceName / ceType arguments are not rebound
    for sectionName in cesCfg.listSections():
        ceSection = cesCfg[sectionName]
        if 'CEType' not in ceSection:
            continue
        sectionType = ceSection['CEType']
        if sectionType not in ceDefaults:
            continue
        typeDefaults = ceDefaults[sectionType]
        for option in typeDefaults.listOptions():
            # options already set for the CE win over the defaults
            if option not in ceSection:
                ceSection.setOption(option, typeDefaults[option])

    return cesCfg
Example #12
0
def getComputingElementDefaults(ceName="", ceType="", cfg=None, currentSectionPath=""):
    """
    Return cfgDefaults with defaults for the given CEs defined either in arguments or in the provided cfg

    :param str ceName: name of a CE section to create (if missing) and fill from the arguments
    :param str ceType: value stored as the "CEType" option of ceName
    :param str cfg: path of a cfg file to load the CE definitions from
    :param str currentSectionPath: path handed to __getExtraOptions to get extra options for ceName
    :returns: CFG object with one section per CE; an empty CFG if loading cfg fails
    """
    cesCfg = CFG()
    if cfg:
        try:
            cesCfg.loadFromFile(cfg)
            cesPath = cfgInstallPath("ComputingElements")
            if cesCfg.isSection(cesPath):
                # descend to the ComputingElements section
                for section in cfgPathToList(cesPath):
                    cesCfg = cesCfg[section]
        except Exception:  # pylint: disable=broad-except
            # Best effort on any loading problem, without trapping
            # KeyboardInterrupt / SystemExit as a bare except would
            return CFG()

    # Overwrite the cfg with Command line arguments
    if ceName:
        if not cesCfg.isSection(ceName):
            cesCfg.createNewSection(ceName)
        if currentSectionPath:
            # Add Options from Command Line
            optionsDict = __getExtraOptions(currentSectionPath)
            for name, value in optionsDict.items():
                cesCfg[ceName].setOption(name, value)
        if ceType:
            cesCfg[ceName].setOption("CEType", ceType)

    ceDefaultSection = cfgPath(defaultSection("ComputingElements"))
    # Load Default for the given type from Central configuration is defined
    ceDefaults = __gConfigDefaults(ceDefaultSection)
    # Dedicated loop names, so the ceName / ceType arguments are not rebound
    for sectionName in cesCfg.listSections():
        ceSection = cesCfg[sectionName]
        if "CEType" not in ceSection:
            continue
        sectionType = ceSection["CEType"]
        if sectionType not in ceDefaults:
            continue
        typeDefaults = ceDefaults[sectionType]
        for option in typeDefaults.listOptions():
            # options already set for the CE win over the defaults
            if option not in ceSection:
                ceSection.setOption(option, typeDefaults[option])

    return cesCfg
Example #13
0
def getComputingElementDefaults(ceName='', ceType='', cfg=None, currentSectionPath=''):
  """
  Return cfgDefaults with defaults for the given CEs defined either in arguments or in the provided cfg

  :param str ceName: name of a CE section to create (if missing) and fill from the arguments
  :param str ceType: value stored as the 'CEType' option of ceName
  :param str cfg: path of a cfg file to load the CE definitions from
  :param str currentSectionPath: path handed to __getExtraOptions to get extra options for ceName
  :returns: CFG object with one section per CE; an empty CFG if loading cfg fails
  """
  cesCfg = CFG()
  if cfg:
    try:
      cesCfg.loadFromFile(cfg)
      cesPath = cfgInstallPath('ComputingElements')
      if cesCfg.isSection(cesPath):
        # descend to the ComputingElements section
        for section in cfgPathToList(cesPath):
          cesCfg = cesCfg[section]
    except Exception:  # pylint: disable=broad-except
      # Best effort on any loading problem; unlike BaseException this lets
      # KeyboardInterrupt / SystemExit propagate
      return CFG()

  # Overwrite the cfg with Command line arguments
  if ceName:
    if not cesCfg.isSection(ceName):
      cesCfg.createNewSection(ceName)
    if currentSectionPath:
      # Add Options from Command Line
      optionsDict = __getExtraOptions(currentSectionPath)
      for name, value in optionsDict.items():
        cesCfg[ceName].setOption(name, value)  # pylint: disable=no-member
    if ceType:
      cesCfg[ceName].setOption('CEType', ceType)  # pylint: disable=no-member

  ceDefaultSection = cfgPath(defaultSection('ComputingElements'))
  # Load Default for the given type from Central configuration is defined
  ceDefaults = __gConfigDefaults(ceDefaultSection)
  # Dedicated loop names, so the ceName / ceType arguments are not rebound
  for sectionName in cesCfg.listSections():
    ceSection = cesCfg[sectionName]
    if 'CEType' not in ceSection:
      continue
    sectionType = ceSection['CEType']
    if sectionType not in ceDefaults:
      continue
    typeDefaults = ceDefaults[sectionType]
    for option in typeDefaults.listOptions():  # pylint: disable=no-member
      # options already set for the CE win over the defaults
      if option not in ceSection:
        ceSection.setOption(option, typeDefaults[option])  # pylint: disable=unsubscriptable-object

  return cesCfg
Example #14
0
    def execute(self):
        """The JobAgent execution method: run one cycle of the agent.

        Once a job has already been picked up (``self.jobCount``), the drain file
        and, in filling mode, the remaining CPU time are checked first; the time
        left is passed to the CE and written to the local configuration file.
        Then the availability of the CE is checked, a job is requested from the
        Matcher with each CE description until one request succeeds, the matched
        job is validated and submitted, and the time left is recomputed.

        :returns: S_OK / S_ERROR structure; many exit paths return the result of
                  ``self.__finish`` or ``self._rescheduleFailedJob``
        """
        if self.jobCount:
            # Temporary mechanism to pass a shutdown message to the agent
            if os.path.exists('/var/lib/dirac_drain'):
                return self.__finish('Node is being drained by an operator')
            # Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(
                        "Disabling filling mode as errors calculating time left",
                        self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('normalized CPU units remaining in slot',
                              self.timeLeft)
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish('No more time left')
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                # The file is self.extraOptions (relative to '.') when set, else <rootPath>/etc/dirac.cfg
                if self.extraOptions:
                    localConfigFile = os.path.join('.', self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc",
                                                   "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection('/LocalSite'):
                    localCfg.createNewSection('/LocalSite')
                localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        result = self.computingElement.available()
        if not result['OK']:
            self.log.info('Resource is not available', result['Message'])
            return self.__finish('CE Not Available')

        # Besides 'Value' (the available slots) the result carries a 'CEInfoDict' entry
        ceInfoDict = result['CEInfoDict']
        runningJobs = ceInfoDict.get("RunningJobs")
        availableSlots = result['Value']

        if not availableSlots:
            if runningJobs:
                # No free slot but jobs are running: end this cycle without finishing the agent
                self.log.info('No available slots',
                              '%d running jobs' % runningJobs)
                return S_OK('Job Agent cycle complete with %d running jobs' %
                            runningJobs)
            else:
                self.log.info('CE is not available')
                return self.__finish('CE Not Available')

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result

        # We can have several prioritized job retrieval strategies
        # NOTE(review): ceDictList stays unbound if 'Value' is neither a dict nor a list
        if isinstance(result['Value'], dict):
            ceDictList = [result['Value']]
        elif isinstance(result['Value'], list):
            # This is the case for Pool ComputingElement, and parameter 'MultiProcessorStrategy'
            ceDictList = result['Value']

        # Try the CE descriptions in order and stop at the first successful match
        for ceDict in ceDictList:

            # Add pilot information
            gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
            if gridCE != 'Unknown':
                ceDict['GridCE'] = gridCE
            if 'PilotReference' not in ceDict:
                ceDict['PilotReference'] = str(self.pilotReference)
            ceDict['PilotBenchmark'] = self.cpuFactor
            ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

            # Add possible job requirements
            result = gConfig.getOptionsDict('/AgentJobRequirements')
            if result['OK']:
                requirementsDict = result['Value']
                ceDict.update(requirementsDict)
                self.log.info('Requirements:', requirementsDict)

            self.log.verbose('CE dict', ceDict)

            # here finally calling the matcher
            start = time.time()
            jobRequest = MatcherClient().requestJob(ceDict)
            matchTime = time.time() - start
            self.log.info('MatcherTime', '= %.2f (s)' % (matchTime))
            if jobRequest['OK']:
                break

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        # NOTE(review): jobRequest, matchTime and ceDict are only bound inside the loop above,
        # so an empty ceDictList would fail here - confirm it cannot be empty
        if not jobRequest['OK']:
            # Every branch but the pilot version mismatch counts as a failed match; the agent
            # finishes once the count exceeds self.stopAfterFailedMatches
            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK, but no match found',
                                ': %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error('Timeout while requesting job',
                               jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                errorMsg = 'Pilot version does not match the production version'
                self.log.error(errorMsg,
                               jobRequest['Message'].replace(errorMsg, ''))
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs',
                                ': %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get(
                'PilotInfoReportedFlag', False)
        jobID = matcherInfo['JobID']
        # These three entries must be present and non-empty in the Matcher answer
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if param not in matcherInfo:
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned',
                                 '%s = %s ' % (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        # Everything else returned by the Matcher is handed to the submission as optimizerParams
        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self._getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn('Could Not Extract JDL Parameters',
                          parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if 'JobID' not in params:
            msg = 'Job has not JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            # From here on the JobID taken from the JDL replaces the one returned by the Matcher
            jobID = params['JobID']

        if 'JobType' not in params:
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if 'CPUTime' not in params:
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        # Job requirements for determining the number of processors
        # the minimum number of processors requested
        processors = int(
            params.get('NumberOfProcessors',
                       int(params.get('MinNumberOfProcessors', 1))))
        # the maximum number of processors allowed to the payload
        maxNumberOfProcessors = int(params.get('MaxNumberOfProcessors', 0))
        # need or not the whole node for the job
        wholeNode = 'WholeNode' in params
        mpTag = 'MultiProcessor' in params.get('Tags', [])

        if self.extraOptions:
            # NOTE(review): assumes params always has an 'Arguments' entry when extraOptions is set
            params['Arguments'] += ' ' + self.extraOptions
            params['ExtraOptions'] = self.extraOptions

        self.log.verbose('Job request successful: \n', jobRequest['Value'])
        self.log.info(
            'Received', 'JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s' %
            (jobID, jobType, ownerDN, jobGroup))
        self.jobCount += 1
        # Any exception raised while preparing or submitting the job leads to a reschedule request
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)

            if 'BOINC_JOB_ID' in os.environ:
                # Report BOINC environment
                for thisp in ('BoincUserID', 'BoincHostID',
                              'BoincHostPlatform', 'BoincHostName'):
                    jobReport.setJobParameter(thisp,
                                              gConfig.getValue(
                                                  '/LocalSite/%s' % thisp,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            result = self._setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self._rescheduleFailedJob(jobID, result['Message'],
                                                 self.stopOnApplicationFailure)
            proxyChain = result.get('Value')

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            # ceDict is the CE description used in the last Matcher request of the loop above
            software = self._checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job',
                               '%s' % (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self._rescheduleFailedJob(jobID, errorMsg,
                                                 self.stopOnApplicationFailure)

            self.log.debug('Before self._submitJob() (%sCE)' % (self.ceName))
            result = self._submitJob(jobID, params, ceDict, optimizerParams,
                                     proxyChain, processors, wholeNode,
                                     maxNumberOfProcessors, mpTag)
            if not result['OK']:
                self.__report(jobID, 'Failed', result['Message'])
                return self.__finish(result['Message'])
            elif 'PayloadFailed' in result:
                # Do not keep running and do not overwrite the Payload error
                message = 'Payload execution failed with error code %s' % result[
                    'PayloadFailed']
                if self.stopOnApplicationFailure:
                    return self.__finish(message,
                                         self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug('After %sCE submitJob()' % (self.ceName))
        except Exception as subExcept:  # pylint: disable=broad-except
            self.log.exception("Exception in submission",
                               "",
                               lException=subExcept,
                               lExcInfo=True)
            return self._rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
        cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

        result = self.timeLeftUtil.getTimeLeft(cpuTime, processors)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            # Only an unsupported batch system falls back to the local estimate; any other
            # message is kept in self.timeLeftError, which is checked at the top of this method
            if result['Message'] != 'Current batch system is not supported':
                self.timeLeftError = result['Message']
            else:
                # if the batch system is not defined, use the process time and the CPU normalization defined locally
                self.timeLeft = self._getCPUTimeLeft()

        return S_OK('Job Agent cycle complete')
Example #15
0
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info("Attempting to check CPU time left for filling mode")
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info("%s normalized CPU units remaining in slot" % (self.timeLeft))
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish("No more time left")
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
                if not result["OK"]:
                    return self.__finish(result["Message"])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join(".", self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection("/LocalSite"):
                    localCfg.createNewSection("/LocalSite")
                localCfg.setOption("/LocalSite/CPUTimeLeft", self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish("Filling Mode is Disabled")

        self.log.verbose("Job Agent execution loop")
        available = self.computingElement.available()
        if not available["OK"] or not available["Value"]:
            self.log.info("Resource is not available")
            self.log.info(available["Message"])
            return self.__finish("CE Not Available")

        self.log.info(available["Message"])

        result = self.computingElement.getDescription()
        if not result["OK"]:
            return result
        ceDict = result["Value"]

        # Add pilot information
        gridCE = gConfig.getValue("LocalSite/GridCE", "Unknown")
        if gridCE != "Unknown":
            ceDict["GridCE"] = gridCE
        if not "PilotReference" in ceDict:
            ceDict["PilotReference"] = str(self.pilotReference)
        ceDict["PilotBenchmark"] = self.cpuFactor
        ceDict["PilotInfoReportedFlag"] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict("/AgentJobRequirements")
        if result["OK"]:
            requirementsDict = result["Value"]
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info("MatcherTime = %.2f (s)" % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", self.stopAfterFailedMatches)

        if not jobRequest["OK"]:
            if re.search("No match found", jobRequest["Message"]):
                self.log.notice("Job request OK: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("seconds timeout") != -1:
                self.log.error("Timeout while requesting job", jobRequest["Message"])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("Pilot version does not match") != -1:
                errorMsg = "Pilot version does not match the production version"
                self.log.error(errorMsg, jobRequest["Message"].replace(errorMsg, ""))
                return S_ERROR(jobRequest["Message"])
            else:
                self.log.notice("Failed to get jobs: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest["Value"]
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get("PilotInfoReportedFlag", False)
        jobID = matcherInfo["JobID"]
        matcherParams = ["JDL", "DN", "Group"]
        for param in matcherParams:
            if param not in matcherInfo:
                self.__report(jobID, "Failed", "Matcher did not return %s" % (param))
                return self.__finish("Matcher Failed")
            elif not matcherInfo[param]:
                self.__report(jobID, "Failed", "Matcher returned null %s" % (param))
                return self.__finish("Matcher Failed")
            else:
                self.log.verbose("Matcher returned %s = %s " % (param, matcherInfo[param]))

        jobJDL = matcherInfo["JDL"]
        jobGroup = matcherInfo["Group"]
        ownerDN = matcherInfo["DN"]

        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters["OK"]:
            self.__report(jobID, "Failed", "Could Not Extract JDL Parameters")
            self.log.warn(parameters["Message"])
            return self.__finish("JDL Problem")

        params = parameters["Value"]
        if "JobID" not in params:
            msg = "Job has not JobID defined in JDL parameters"
            self.__report(jobID, "Failed", msg)
            self.log.warn(msg)
            return self.__finish("JDL Problem")
        else:
            jobID = params["JobID"]

        if "JobType" not in params:
            self.log.warn("Job has no JobType defined in JDL parameters")
            jobType = "Unknown"
        else:
            jobType = params["JobType"]

        if "CPUTime" not in params:
            self.log.warn("Job has no CPU requirement defined in JDL parameters")

        if self.extraOptions:
            params["Arguments"] += " " + self.extraOptions
            params["ExtraOptions"] = self.extraOptions

        self.log.verbose("Job request successful: \n", jobRequest["Value"])
        self.log.info("Received JobID=%s, JobType=%s" % (jobID, jobType))
        self.log.info("OwnerDN: %s JobGroup: %s" % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
            jobReport.setJobParameter("MatcherServiceTime", str(matchTime), sendFlag=False)

            if "BOINC_JOB_ID" in os.environ:
                # Report BOINC environment
                for p in ("BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"):
                    jobReport.setJobParameter(p, gConfig.getValue("/LocalSite/%s" % p, "Unknown"), sendFlag=False)

            jobReport.setJobStatus("Matched", "Job Received by Agent")
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result["OK"]:
                return self.__rescheduleFailedJob(jobID, result["Message"], self.stopOnApplicationFailure)
            proxyChain = result.get("Value")

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software["OK"]:
                self.log.error("Failed to install software for job", "%s" % (jobID))
                errorMsg = software["Message"]
                if not errorMsg:
                    errorMsg = "Failed software installation"
                return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.debug("Before %sCE submitJob()" % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain)
            if not submission["OK"]:
                self.__report(jobID, "Failed", submission["Message"])
                return self.__finish(submission["Message"])
            elif "PayloadFailed" in submission:
                # Do not keep running and do not overwrite the Payload error
                message = "Payload execution failed with error code %s" % submission["PayloadFailed"]
                if self.stopOnApplicationFailure:
                    return self.__finish(message, self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug("After %sCE submitJob()" % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, "Job processing failed with exception", self.stopOnApplicationFailure
            )

        # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
        cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result["OK"]:
            self.timeLeft = result["Value"]
        else:
            if result["Message"] != "Current batch system is not supported":
                self.timeLeftError = result["Message"]
            else:
                # if the batch system is not defined, use the process time and the CPU normalization defined locally
                self.timeLeft = self.__getCPUTimeLeft()

        scaledCPUTime = self.timeLeftUtil.getScaledCPU()
        self.__setJobParam(jobID, "ScaledCPUTime", str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK("Job Agent cycle complete")
Example #16
0
class JobRepository(object):
  """Registry of job records stored under the ``Jobs`` section of a CFG file.

  Each job is a sub-section ``Jobs/<jobID>`` holding options such as ``State``,
  ``Time``, ``Retrieved`` and ``OutputData``. The public methods that change the
  registry write the file back to ``self.location`` right away.
  """

  def __init__(self, repository=None):
    """Load the repository file (or start an empty one) and write it back.

    :param str repository: path of the repository file. When empty,
                           ``.dirac.repo.rep`` under ``$HOME`` is used, or under
                           the current working directory if ``HOME`` is not set.
    """
    self.location = repository
    if not self.location:
      if "HOME" in os.environ:
        self.location = '%s/.dirac.repo.rep' % os.environ['HOME']
      else:
        self.location = '%s/.dirac.repo.rep' % os.getcwd()
    self.repo = CFG()
    if os.path.exists(self.location):
      self.repo.loadFromFile(self.location)
      if not self.repo.existsKey('Jobs'):
        self.repo.createNewSection('Jobs')
    else:
      self.repo.createNewSection('Jobs')
    # The object is usable only if the initial write succeeded
    self.OK = bool(self._writeRepository(self.location))

  def isOK(self):
    """Tell whether the repository file could be written at construction time."""
    return self.OK

  def readRepository(self):
    """Return S_OK with the content of the ``Jobs`` section as a dictionary."""
    return S_OK(self.repo.getAsDict('Jobs'))

  def writeRepository(self, alternativePath=None):
    """Write the repository to its own location or to ``alternativePath``.

    :param str alternativePath: destination overriding ``self.location`` when given
    :return: S_OK(destination) or S_ERROR if the file could not be written
    """
    destination = alternativePath if alternativePath else self.location
    if not self._writeRepository(destination):
      return S_ERROR("Failed to write repository")
    return S_OK(destination)

  def resetRepository(self, jobIDs=None):
    """Set the given jobs back to the 'Submitted', not retrieved state.

    :param jobIDs: iterable of job IDs to reset; every job of the repository
                   when empty or omitted
    :return: S_OK()
    """
    if not jobIDs:
      # Snapshot the keys: the loop below modifies the repository
      jobIDs = list(self.readRepository()['Value'])
    paramDict = {'State': 'Submitted',
                 'Retrieved': 0,
                 'OutputData': 0}
    for jobID in jobIDs:
      self._writeJob(jobID, paramDict, True)
    self._writeRepository(self.location)
    return S_OK()

  def _writeRepository(self, path):
    """Dump the repository to a temporary file and move it onto ``path``.

    :param str path: final location of the repository file
    :return: True on success, a false value otherwise
    """
    handle, tmpName = tempfile.mkstemp()
    # Only the temporary name is used below, so release the descriptor right
    # away: it can then not leak if writing the file raises.
    os.close(handle)
    written = self.repo.writeToFile(tmpName)
    if not written:
      if os.path.exists(tmpName):
        os.remove(tmpName)
      return written
    if os.path.exists(path):
      gLogger.debug("Replacing %s" % path)
    try:
      shutil.move(tmpName, path)
      return True
    except Exception as x:
      gLogger.error("Failed to overwrite repository.", x)
      gLogger.info("If your repository is corrupted a backup can be found %s" % tmpName)
      return False

  def appendToRepository(self, repoLocation):
    """Merge this repository into the one stored at ``repoLocation`` and save the result.

    :param str repoLocation: path of the secondary repository file
    :return: S_OK() or S_ERROR if the secondary file does not exist
    """
    if not os.path.exists(repoLocation):
      gLogger.error("Secondary repository does not exist", repoLocation)
      return S_ERROR("Secondary repository does not exist")
    self.repo = CFG().loadFromFile(repoLocation).mergeWith(self.repo)
    self._writeRepository(self.location)
    return S_OK()

  def addJob(self, jobID, state='Submitted', retrieved=0, outputData=0, update=False):
    """Record a job and save the repository.

    :param jobID: job identifier
    :param str state: value stored as ``State``
    :param retrieved: value stored as ``Retrieved`` after conversion to int
    :param outputData: value stored as ``OutputData``
    :param bool update: allow overwriting an already recorded job
    :return: S_OK(jobID)
    """
    paramDict = {'State': state,
                 'Time': self._getTime(),
                 'Retrieved': int(retrieved),
                 'OutputData': outputData}
    self._writeJob(jobID, paramDict, update)
    self._writeRepository(self.location)
    return S_OK(jobID)

  def updateJob(self, jobID, paramDict):
    """Update the options of an already recorded job; unknown jobs are ignored.

    :param jobID: job identifier
    :param dict paramDict: options to set; a ``Time`` stamp is added to the stored
                           record, the dictionary passed in is left untouched
    :return: S_OK()
    """
    if self._existsJob(jobID):
      stamped = dict(paramDict)
      stamped['Time'] = self._getTime()
      self._writeJob(jobID, stamped, True)
      self._writeRepository(self.location)
    return S_OK()

  def updateJobs(self, jobDict):
    """Update several recorded jobs and save the repository once at the end.

    :param dict jobDict: jobID -> dictionary of options to set; jobs that are not
                         recorded are skipped and the dictionaries are not modified
    :return: S_OK()
    """
    for jobID, paramDict in jobDict.items():
      if self._existsJob(jobID):
        stamped = dict(paramDict)
        stamped['Time'] = self._getTime()
        self._writeJob(jobID, stamped, True)
    self._writeRepository(self.location)
    return S_OK()

  def _getTime(self):
    """Return ``time.ctime()`` with the spaces replaced by underscores."""
    runtime = time.ctime()
    return runtime.replace(" ", "_")

  def _writeJob(self, jobID, paramDict, update):
    """Set the options of a job in memory, creating its section if needed.

    :param jobID: job identifier, converted to str
    :param dict paramDict: option name -> value
    :param bool update: allow modifying a job that is already recorded
    :return: S_OK() or S_ERROR when the job exists and ``update`` is false
    """
    jobID = str(jobID)
    jobExists = self._existsJob(jobID)
    if jobExists and (not update):
      gLogger.warn("Job exists and not overwriting")
      return S_ERROR("Job exists and not overwriting")
    if not jobExists:
      self.repo.createNewSection('Jobs/%s' % jobID)
    for key, value in paramDict.items():
      self.repo.setOption('Jobs/%s/%s' % (jobID, key), value)
    return S_OK()

  def removeJob(self, jobID):
    """Delete a job record; the file is rewritten only if something was deleted.

    :param jobID: job identifier
    :return: S_OK()
    """
    res = self.repo['Jobs'].deleteKey(str(jobID))  # pylint: disable=no-member
    if res:
      self._writeRepository(self.location)
    return S_OK()

  def existsJob(self, jobID):
    """Return S_OK with the result of the ``Jobs/<jobID>`` section check."""
    return S_OK(self._existsJob(jobID))

  def _existsJob(self, jobID):
    """Tell whether ``Jobs/<jobID>`` is a section of the repository."""
    return self.repo.isSection('Jobs/%s' % jobID)

  def getLocation(self):
    """Return S_OK with the path of the repository file."""
    return S_OK(self.location)

  def getSize(self):
    """Return S_OK with the number of entries in the ``Jobs`` section."""
    return S_OK(len(self.repo.getAsDict('Jobs')))
Example #17
0
  def execute(self):
    """Run one JobAgent cycle: ask the Matcher for a job and submit it to the computing element.

    Once a job has already been processed (``self.jobCount``), the cycle first checks the
    drain flag file and, in filling mode, the CPU time left, which is published both to the
    computing element and to the local configuration file before a new job is requested.

    :return: ``S_OK`` when the cycle completes, when no slot is free while jobs are running, or
             when no job could be matched; ``S_ERROR`` when the Matcher reports a pilot version
             mismatch; the failed ``getDescription`` result; otherwise whatever
             ``self.__finish`` / ``self.__rescheduleFailedJob`` return on the stopping paths.
    """
    if self.jobCount:
      # Temporary mechanism to pass a shutdown message to the agent
      if os.path.exists('/var/lib/dirac_drain'):
        return self.__finish('Node is being drained by an operator')
      # Only call timeLeft utility after a job has been picked up
      self.log.info('Attempting to check CPU time left for filling mode')
      if not self.fillingMode:
        return self.__finish('Filling Mode is Disabled')
      if self.timeLeftError:
        self.log.warn(self.timeLeftError)
        return self.__finish(self.timeLeftError)
      self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft))
      if self.timeLeft <= self.minimumTimeLeft:
        return self.__finish('No more time left')
      # Need to update the Configuration so that the new value is published in the next matching request
      result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
      if not result['OK']:
        return self.__finish(result['Message'])

      # Update local configuration to be used by submitted job wrappers
      localCfg = CFG()
      if self.extraOptions:
        localConfigFile = os.path.join('.', self.extraOptions)
      else:
        localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
      localCfg.loadFromFile(localConfigFile)
      if not localCfg.isSection('/LocalSite'):
        localCfg.createNewSection('/LocalSite')
      localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
      localCfg.writeToFile(localConfigFile)

    self.log.verbose('Job Agent execution loop')
    result = self.computingElement.available()
    if not result['OK']:
      self.log.info('Resource is not available')
      self.log.info(result['Message'])
      return self.__finish('CE Not Available')

    self.log.info(result['Message'])

    ceInfoDict = result['CEInfoDict']
    runningJobs = ceInfoDict.get("RunningJobs")
    availableSlots = result['Value']

    if not availableSlots:
      if runningJobs:
        # Jobs are still running: keep the agent alive and try again next cycle
        self.log.info('No available slots with %d running jobs' % runningJobs)
        return S_OK('Job Agent cycle complete with %d running jobs' % runningJobs)
      self.log.info('CE is not available')
      return self.__finish('CE Not Available')

    result = self.computingElement.getDescription()
    if not result['OK']:
      return result
    ceDict = result['Value']
    # Add pilot information
    gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
    if gridCE != 'Unknown':
      ceDict['GridCE'] = gridCE
    if 'PilotReference' not in ceDict:
      ceDict['PilotReference'] = str(self.pilotReference)
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict('/AgentJobRequirements')
    if result['OK']:
      requirementsDict = result['Value']
      ceDict.update(requirementsDict)
      self.log.info('Requirements:', requirementsDict)

    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = MatcherClient().requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info('MatcherTime = %.2f (s)' % (matchTime))

    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', self.stopAfterFailedMatches)

    if not jobRequest['OK']:
      failure = jobRequest['Message']
      if re.search('No match found', failure):
        self.log.notice('Job request OK: %s' % (failure))
      elif failure.find("seconds timeout") != -1:
        self.log.error('Timeout while requesting job', failure)
      elif failure.find("Pilot version does not match") != -1:
        # A version mismatch is a hard error and does not count as a failed match
        errorMsg = 'Pilot version does not match the production version'
        self.log.error(errorMsg, failure.replace(errorMsg, ''))
        return S_ERROR(failure)
      else:
        self.log.notice('Failed to get jobs: %s' % (failure))
      # Every other failure counts towards the limit of consecutive failed matches
      self.matchFailedCount += 1
      if self.matchFailedCount > self.stopAfterFailedMatches:
        return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
      return S_OK(failure)

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    if not self.pilotInfoReportedFlag:
      # Check the flag after the first access to the Matcher
      self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    jobID = matcherInfo['JobID']
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
      if param not in matcherInfo:
        self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
        return self.__finish('Matcher Failed')
      elif not matcherInfo[param]:
        self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
        return self.__finish('Matcher Failed')
      else:
        self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    # Everything the Matcher returned besides JDL, DN and Group is passed on at submission
    optimizerParams = {}
    for key in matcherInfo:
      if key not in matcherParams:
        optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters(jobJDL)
    if not parameters['OK']:
      self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
      self.log.warn(parameters['Message'])
      return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
      msg = 'Job has not JobID defined in JDL parameters'
      self.__report(jobID, 'Failed', msg)
      self.log.warn(msg)
      return self.__finish('JDL Problem')
    # From here on the job ID found in the JDL is used
    jobID = params['JobID']

    if 'JobType' not in params:
      self.log.warn('Job has no JobType defined in JDL parameters')
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if 'CPUTime' not in params:
      self.log.warn('Job has no CPU requirement defined in JDL parameters')

    # Job requirement for a number of processors
    processors = int(params.get('NumberOfProcessors', 1))
    wholeNode = 'WholeNode' in params

    if self.extraOptions:
      # The JDL may define no Arguments at all: start from an empty string instead of
      # failing with a KeyError before the job could be reported or rescheduled
      params['Arguments'] = params.get('Arguments', '') + ' ' + self.extraOptions
      params['ExtraOptions'] = self.extraOptions

    self.log.verbose('Job request successful: \n', jobRequest['Value'])
    self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
    self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
      jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
      jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)

      if 'BOINC_JOB_ID' in os.environ:
        # Report BOINC environment
        for thisp in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'):
          jobReport.setJobParameter(thisp, gConfig.getValue('/LocalSite/%s' % thisp, 'Unknown'), sendFlag=False)

      jobReport.setJobStatus('Matched', 'Job Received by Agent')
      result = self.__setupProxy(ownerDN, jobGroup)
      if not result['OK']:
        return self.__rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure)
      proxyChain = result.get('Value')

      # Save the job jdl for external monitoring
      self.__saveJobJDLRequest(jobID, jobJDL)

      software = self.__checkInstallSoftware(jobID, params, ceDict)
      if not software['OK']:
        self.log.error('Failed to install software for job', '%s' % (jobID))
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

      self.log.debug('Before %sCE submitJob()' % (self.ceName))
      result = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain, processors, wholeNode)
      if not result['OK']:
        self.__report(jobID, 'Failed', result['Message'])
        return self.__finish(result['Message'])
      elif 'PayloadFailed' in result:
        # Do not keep running and do not overwrite the Payload error
        message = 'Payload execution failed with error code %s' % result['PayloadFailed']
        if self.stopOnApplicationFailure:
          return self.__finish(message, self.stopOnApplicationFailure)
        else:
          self.log.info(message)

      self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception as subExcept:  # pylint: disable=broad-except
      self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True)
      return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception', self.stopOnApplicationFailure)

    # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
    cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

    result = self.timeLeftUtil.getTimeLeft(cpuTime, processors)
    if result['OK']:
      self.timeLeft = result['Value']
    elif result['Message'] != 'Current batch system is not supported':
      # Kept for the next cycle, which stops the agent when this is set
      self.timeLeftError = result['Message']
    else:
      # if the batch system is not defined, use the process time and the CPU normalization defined locally
      self.timeLeft = self.__getCPUTimeLeft()

    return S_OK('Job Agent cycle complete')
Example #18
0
  def execute(self):
    """Run one JobAgent cycle: ask the Matcher for a job and submit it to the computing element.

    Once a job has already been processed (``self.jobCount``), the cycle first checks, in
    filling mode, the recorded time-left error and publishes the CPU time left both to the
    computing element and to the local configuration file before a new job is requested.

    :return: ``S_OK`` when the cycle completes or when no job could be matched; ``S_ERROR`` when
             the Matcher reports a pilot version mismatch; the failed ``getDescription`` result;
             otherwise whatever ``self.__finish`` / ``self.__rescheduleFailedJob`` return on the
             stopping paths.
    """
    if self.jobCount:
      # Only call timeLeft utility after a job has been picked up
      self.log.info('Attempting to check CPU time left for filling mode')
      if not self.fillingMode:
        return self.__finish('Filling Mode is Disabled')
      if self.timeLeftError:
        self.log.warn(self.timeLeftError)
        return self.__finish(self.timeLeftError)
      self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft))
      # Need to update the Configuration so that the new value is published in the next matching request
      result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
      if not result['OK']:
        return self.__finish(result['Message'])

      # Update local configuration to be used by submitted job wrappers
      localCfg = CFG()
      if self.extraOptions:
        localConfigFile = os.path.join('.', self.extraOptions)
      else:
        localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
      localCfg.loadFromFile(localConfigFile)
      if not localCfg.isSection('/LocalSite'):
        localCfg.createNewSection('/LocalSite')
      localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
      localCfg.writeToFile(localConfigFile)

    self.log.verbose('Job Agent execution loop')
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
      self.log.info('Resource is not available')
      self.log.info(available['Message'])
      return self.__finish('CE Not Available')

    self.log.info(available['Message'])

    result = self.computingElement.getDescription()
    if not result['OK']:
      return result
    ceDict = result['Value']

    # Add pilot information
    gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
    if gridCE != 'Unknown':
      ceDict['GridCE'] = gridCE
    if 'PilotReference' not in ceDict:
      ceDict['PilotReference'] = str(self.pilotReference)
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict('/AgentJobRequirements')
    if result['OK']:
      requirementsDict = result['Value']
      ceDict.update(requirementsDict)

    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = self.__requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info('MatcherTime = %.2f (s)' % (matchTime))

    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', self.stopAfterFailedMatches)

    if not jobRequest['OK']:
      failure = jobRequest['Message']
      if re.search('No match found', failure):
        self.log.notice('Job request OK: %s' % (failure))
      elif failure.find("seconds timeout") != -1:
        self.log.error(failure)
      elif failure.find("Pilot version does not match") != -1:
        # A version mismatch is a hard error and does not count as a failed match
        self.log.error(failure)
        return S_ERROR(failure)
      else:
        self.log.notice('Failed to get jobs: %s' % (failure))
      # Every other failure counts towards the limit of consecutive failed matches
      self.matchFailedCount += 1
      if self.matchFailedCount > self.stopAfterFailedMatches:
        return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
      return S_OK(failure)

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    jobID = matcherInfo['JobID']
    if not self.pilotInfoReportedFlag:
      # Check the flag after the first access to the Matcher
      self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
      # 'in' replaces dict.has_key(), which no longer exists in Python 3
      if param not in matcherInfo:
        self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
        return self.__finish('Matcher Failed')
      elif not matcherInfo[param]:
        self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
        return self.__finish('Matcher Failed')
      else:
        self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    # Everything the Matcher returned besides JDL, DN and Group is passed on at submission
    optimizerParams = {}
    for key in matcherInfo:
      if key not in matcherParams:
        optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters(jobJDL)
    if not parameters['OK']:
      self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
      self.log.warn(parameters['Message'])
      return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
      msg = 'Job has not JobID defined in JDL parameters'
      self.__report(jobID, 'Failed', msg)
      self.log.warn(msg)
      return self.__finish('JDL Problem')
    # From here on the job ID found in the JDL is used
    jobID = params['JobID']

    if 'JobType' not in params:
      self.log.warn('Job has no JobType defined in JDL parameters')
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if 'CPUTime' not in params:
      self.log.warn('Job has no CPU requirement defined in JDL parameters')

    if self.extraOptions:
      # The JDL may define no Arguments at all: start from an empty string instead of
      # failing with a KeyError before the job could be reported or rescheduled
      params['Arguments'] = params.get('Arguments', '') + ' ' + self.extraOptions
      params['ExtraOptions'] = self.extraOptions

    self.log.verbose('Job request successful: \n %s' % (jobRequest['Value']))
    self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
    self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
      jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
      jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)

      if 'BOINC_JOB_ID' in os.environ:
        # Report BOINC environment
        for p in ['BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName']:
          jobReport.setJobParameter(p, gConfig.getValue('/LocalSite/%s' % p, 'Unknown'), sendFlag=False)

      jobReport.setJobStatus('Matched', 'Job Received by Agent')
      result = self.__setupProxy(ownerDN, jobGroup)
      if not result['OK']:
        return self.__rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure)
      # Always bind the name: an empty 'Value' used to leave it undefined and made the
      # submission below fail with an UnboundLocalError
      proxyChain = result.get('Value')

      # Save the job jdl for external monitoring
      self.__saveJobJDLRequest(jobID, jobJDL)

      software = self.__checkInstallSoftware(jobID, params, ceDict)
      if not software['OK']:
        self.log.error('Failed to install software for job %s' % (jobID))
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

      self.log.debug('Before %sCE submitJob()' % (self.ceName))
      submission = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain)
      if not submission['OK']:
        self.__report(jobID, 'Failed', submission['Message'])
        return self.__finish(submission['Message'])
      elif 'PayloadFailed' in submission:
        # Do not keep running and do not overwrite the Payload error
        return self.__finish('Payload execution failed with error code %s' % submission['PayloadFailed'],
                             self.stopOnApplicationFailure)

      self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception:
      self.log.exception()
      return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception', self.stopOnApplicationFailure)

    # Times consumed since self.initTimes was taken; the last entry (elapsed time) is not CPU time
    utime, stime, cutime, cstime, _elapsed = [now - init for now, init in zip(os.times(), self.initTimes)]
    cpuTime = utime + stime + cutime + cstime

    result = self.timeLeftUtil.getTimeLeft(cpuTime)
    if result['OK']:
      self.timeLeft = result['Value']
    elif result['Message'] != 'Current batch system is not supported':
      # Kept for the next cycle, which stops the agent when this is set
      self.timeLeftError = result['Message']
    elif self.cpuFactor:
      # if the batch system is not defined used the CPUNormalizationFactor
      # defined locally
      self.timeLeft = self.__getCPUTimeLeft()
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

    # Report only the scaled CPU consumed since the previous cycle
    self.__setJobParam(jobID, 'ScaledCPUTime', str(scaledCPUTime - self.scaledCPUTime))
    self.scaledCPUTime = scaledCPUTime

    return S_OK('Job Agent cycle complete')
Example #19
0
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            #Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('%s normalized CPU units remaining in slot' %
                              (self.timeLeft))
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection('/LocalSite'):
                    localCfg.createNewSection('/LocalSite')
                localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        available = self.computingElement.available()
        if not available['OK'] or not available['Value']:
            self.log.info('Resource is not available')
            self.log.info(available['Message'])
            return self.__finish('CE Not Available')

        self.log.info(available['Message'])

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result
        ceDict = result['Value']

        # Add pilot information
        gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
        if gridCE != 'Unknown':
            ceDict['GridCE'] = gridCE
        if not 'PilotReference' in ceDict:
            ceDict['PilotReference'] = str(self.pilotReference)
        ceDict['PilotBenchmark'] = self.cpuFactor
        ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict('/AgentJobRequirements')
        if result['OK']:
            requirementsDict = result['Value']
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info('MatcherTime = %.2f (s)' % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:
            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK: %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error(jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                self.log.error(jobRequest['Message'])
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs: %s' %
                                (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        jobID = matcherInfo['JobID']
        self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag',
                                                     False)
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if not matcherInfo.has_key(param):
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned %s = %s ' %
                                 (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        optimizerParams = {}
        for key in matcherInfo.keys():
            if not key in matcherParams:
                value = matcherInfo[key]
                optimizerParams[key] = value

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn(parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if not params.has_key('JobID'):
            msg = 'Job has not JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if not params.has_key('JobType'):
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if not params.has_key('SystemConfig'):
            self.log.warn(
                'Job has no system configuration defined in JDL parameters')
            systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
            self.log.info(
                'Setting system config to /LocalSite/Architecture = %s since it was not specified'
                % systemConfig)
            if not systemConfig:
                self.log.warn('/LocalSite/Architecture is not defined')
            params['SystemConfig'] = systemConfig
        else:
            systemConfig = params['SystemConfig']
            if systemConfig.lower() == 'any':
                systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
                self.log.info(
                    'Setting SystemConfig = /LocalSite/Architecture =',
                    '"%s" since it was set to "ANY" in the job description' %
                    systemConfig)
                if not systemConfig:
                    self.log.warn('/LocalSite/Architecture is not defined')
                params['SystemConfig'] = systemConfig

        if not params.has_key('CPUTime'):
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        self.log.verbose('Job request successful: \n %s' %
                         (jobRequest['Value']))
        self.log.info('Received JobID=%s, JobType=%s, SystemConfig=%s' %
                      (jobID, jobType, systemConfig))
        self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)
            if self.gridCEQueue:
                jobReport.setJobParameter('GridCEQueue',
                                          self.gridCEQueue,
                                          sendFlag=False)

            if os.environ.has_key('BOINC_JOB_ID'):
                # Report BOINC environment
                for p in [
                        'BoincUserID', 'BoincHostID', 'BoincHostPlatform',
                        'BoincHostName'
                ]:
                    jobReport.setJobParameter(p,
                                              gConfig.getValue(
                                                  '/LocalSite/%s' % p,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            # self.__setJobSite( jobID, self.siteName )
            if not self.pilotInfoReportedFlag:
                self.__reportPilotInfo(jobID)
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self.__rescheduleFailedJob(
                    jobID, result['Message'], self.stopOnApplicationFailure)
            if 'Value' in result and result['Value']:
                proxyChain = result['Value']

            # Is this necessary at all?
            saveJDL = self.__saveJobJDLRequest(jobID, jobJDL)
            #self.__report(jobID,'Matched','Job Prepared to Submit')

            #resourceParameters = self.__getJDLParameters( resourceJDL )
            #if not resourceParameters['OK']:
            #  return resourceParameters
            #resourceParams = resourceParameters['Value']

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job %s' %
                               (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self.__rescheduleFailedJob(
                    jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.verbose('Before %sCE submitJob()' % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict,
                                          optimizerParams, jobJDL, proxyChain)
            if not submission['OK']:
                self.__report(jobID, 'Failed', submission['Message'])
                return self.__finish(submission['Message'])
            elif 'PayloadFailed' in submission:
                # Do not keep running and do not overwrite the Payload error
                return self.__finish(
                    'Payload execution failed with error code %s' %
                    submission['PayloadFailed'], self.stopOnApplicationFailure)

            self.log.verbose('After %sCE submitJob()' % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        currentTimes = list(os.times())
        for i in range(len(currentTimes)):
            currentTimes[i] -= self.initTimes[i]

        utime, stime, cutime, cstime, elapsed = currentTimes
        cpuTime = utime + stime + cutime + cstime

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            if result['Message'] != 'Current batch system is not supported':
                self.timeLeftError = result['Message']
            else:
                if self.cpuFactor:
                    # if the batch system is not defined used the CPUNormalizationFactor
                    # defined locally
                    self.timeLeft = self.__getCPUTimeLeft()
        scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

        self.__setJobParam(jobID, 'ScaledCPUTime',
                           str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK('Job Agent cycle complete')
Example #20
0
    def execute(self):
        """Run one JobAgent cycle: match a job, prepare it and submit it.

        The cycle performs, in order:

        1. If a job was already picked up (``self.jobCount`` is non-zero),
           check that filling mode is enabled and that enough CPU time is
           left, then publish the time left to the computing element and to
           the local configuration file.
        2. Check that the computing element is available and build the
           resource description (``ceDict``) used for the job request.
        3. Request a job. Failed requests are counted in
           ``self.matchFailedCount`` and the agent finishes once the
           ``StopAfterFailedMatches`` option is exceeded.
        4. Validate the matcher answer ('JDL', 'DN', 'Group') and the
           parameters extracted from the JDL.
        5. Report the job as matched, set up the owner proxy, save the JDL,
           check/install software and submit the job.
        6. Update the time left and report the scaled CPU consumed.

        :return: the value of ``self.__finish(...)`` or
                 ``self.__rescheduleFailedJob(...)`` when the cycle is
                 aborted, ``S_ERROR`` on a pilot version mismatch, the failed
                 result of ``computingElement.getDescription()``, or ``S_OK``
                 with a message otherwise.
        """
        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('%s normalized CPU units remaining in slot' %
                              (self.timeLeft))
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish('No more time left')
                # Need to update the Configuration so that the new value is
                # published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join('.', self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc",
                                                   "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection('/LocalSite'):
                    localCfg.createNewSection('/LocalSite')
                localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        available = self.computingElement.available()
        if not available['OK'] or not available['Value']:
            self.log.info('Resource is not available')
            self.log.info(available['Message'])
            return self.__finish('CE Not Available')

        self.log.info(available['Message'])

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result
        ceDict = result['Value']

        # Add pilot information
        gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
        if gridCE != 'Unknown':
            ceDict['GridCE'] = gridCE
        if 'PilotReference' not in ceDict:
            ceDict['PilotReference'] = str(self.pilotReference)
        ceDict['PilotBenchmark'] = self.cpuFactor
        ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict('/AgentJobRequirements')
        if result['OK']:
            requirementsDict = result['Value']
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info('MatcherTime = %.2f (s)' % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:
            requestMessage = jobRequest['Message']
            if re.search('No match found', requestMessage):
                self.log.notice('Job request OK: %s' % (requestMessage))
            elif requestMessage.find("seconds timeout") != -1:
                self.log.error('Timeout while requesting job',
                               requestMessage)
            elif requestMessage.find("Pilot version does not match") != -1:
                # A version mismatch is the only failure reported as an error
                # result; it is not counted as a failed match.
                errorMsg = 'Pilot version does not match the production version'
                self.log.error(errorMsg,
                               requestMessage.replace(errorMsg, ''))
                return S_ERROR(requestMessage)
            else:
                self.log.notice('Failed to get jobs: %s' % (requestMessage))

            # Every other failure counts as a failed match; stop the agent
            # once the configured number of failed cycles is exceeded.
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish(
                    'Nothing to do for more than %d cycles' %
                    self.stopAfterFailedMatches)
            return S_OK(requestMessage)

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get(
                'PilotInfoReportedFlag', False)
        jobID = matcherInfo['JobID']
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if param not in matcherInfo:
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned %s = %s ' %
                                 (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        # Everything the matcher returned besides the mandatory parameters is
        # forwarded to the submission step.
        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn(parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if 'JobID' not in params:
            msg = 'Job has not JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            # From here on the JobID found in the JDL is the one used
            jobID = params['JobID']

        if 'JobType' not in params:
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if 'CPUTime' not in params:
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        if self.extraOptions:
            # Guard against a JDL without an 'Arguments' entry: create the
            # key instead of raising KeyError outside the try block below.
            if 'Arguments' in params:
                params['Arguments'] += ' ' + self.extraOptions
            else:
                params['Arguments'] = self.extraOptions
            params['ExtraOptions'] = self.extraOptions

        self.log.verbose('Job request successful: \n', jobRequest['Value'])
        self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
        self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)

            if 'BOINC_JOB_ID' in os.environ:
                # Report BOINC environment
                for p in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform',
                          'BoincHostName'):
                    jobReport.setJobParameter(p,
                                              gConfig.getValue(
                                                  '/LocalSite/%s' % p,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self.__rescheduleFailedJob(
                    jobID, result['Message'], self.stopOnApplicationFailure)
            proxyChain = result.get('Value')

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job',
                               '%s' % (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self.__rescheduleFailedJob(
                    jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.debug('Before %sCE submitJob()' % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict,
                                          optimizerParams, proxyChain)
            if not submission['OK']:
                self.__report(jobID, 'Failed', submission['Message'])
                return self.__finish(submission['Message'])
            elif 'PayloadFailed' in submission:
                # Do not keep running and do not overwrite the Payload error
                message = 'Payload execution failed with error code %s' % submission[
                    'PayloadFailed']
                if self.stopOnApplicationFailure:
                    return self.__finish(message,
                                         self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug('After %sCE submitJob()' % (self.ceName))
        except Exception:
            # Any unexpected failure while preparing/submitting the job is
            # logged and the job is rescheduled.
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
        cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            if result['Message'] != 'Current batch system is not supported':
                # Remembered so that the next cycle stops in filling mode
                self.timeLeftError = result['Message']
            else:
                # if the batch system is not defined, use the process time and the CPU normalization defined locally
                self.timeLeft = self.__getCPUTimeLeft()

        # Report only the scaled CPU consumed since the previous cycle
        scaledCPUTime = self.timeLeftUtil.getScaledCPU()
        self.__setJobParam(jobID, 'ScaledCPUTime',
                           str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK('Job Agent cycle complete')
Example #21
0
 def toCFG(self):
     """Serialize this object as a CFG string.

     A section named after ``self.se`` is created holding the ``Status`` and
     ``PFN`` options.

     :return: S_OK wrapping the string form of the CFG
     """
     section = self.se
     statusOption = '%s/Status' % section
     pfnOption = '%s/PFN' % section

     cfg = CFG()
     cfg.createNewSection(section)
     cfg.setOption(statusOption, self.status)
     cfg.setOption(pfnOption, self.pfn)

     serialized = str(cfg)
     return S_OK(serialized)
Example #22
0
 def toCFG(self):
   """Serialize this object as a CFG string.

   A section named after ``self.se`` is created holding the ``Status`` and
   ``PFN`` options.

   :return: S_OK wrapping the string form of the CFG
   """
   section = self.se
   statusOption = '%s/Status' % section
   pfnOption = '%s/PFN' % section

   cfg = CFG()
   cfg.createNewSection(section)
   cfg.setOption(statusOption, self.status)
   cfg.setOption(pfnOption, self.pfn)

   serialized = str(cfg)
   return S_OK(serialized)
Example #23
0
class JobRepository(object):
  """File-backed repository of job records.

  Jobs are kept in a CFG object under the 'Jobs' section, one sub-section per
  job ID, and the whole CFG is rewritten to ``self.location`` after every
  modification.
  """

  def __init__(self, repository=None):
    """Load (or create) the repository file and write it back once.

    :param str repository: path of the repository file; when empty,
                           ``.dirac.repo.rep`` in ``$HOME`` is used, or in the
                           current working directory if ``HOME`` is not set
    """
    self.location = repository
    if not self.location:
      if "HOME" in os.environ:
        self.location = '%s/.dirac.repo.rep' % os.environ['HOME']
      else:
        self.location = '%s/.dirac.repo.rep' % os.getcwd()
    self.repo = CFG()
    if os.path.exists(self.location):
      self.repo.loadFromFile(self.location)
      if not self.repo.existsKey('Jobs'):
        self.repo.createNewSection('Jobs')
    else:
      self.repo.createNewSection('Jobs')
    # The repository is usable only if it could be written to its location
    self.OK = bool(self._writeRepository(self.location))

  def isOK(self):
    """Return True if the repository could be written at construction time."""
    return self.OK

  def readRepository(self):
    """Return S_OK with the 'Jobs' section as a dictionary."""
    return S_OK(self.repo.getAsDict('Jobs'))

  def writeRepository(self, alternativePath=None):
    """Write the repository to disk.

    :param str alternativePath: destination to use instead of ``self.location``
    :return: S_OK(destination) or S_ERROR if the file could not be written
    """
    destination = alternativePath if alternativePath else self.location
    if not self._writeRepository(destination):
      return S_ERROR("Failed to write repository")
    return S_OK(destination)

  def resetRepository(self, jobIDs=None):
    """Reset jobs to the 'Submitted' state with nothing retrieved.

    :param jobIDs: iterable of job IDs to reset; when empty or None, all jobs
                   in the repository are reset
    :return: S_OK()
    """
    if not jobIDs:
      # Snapshot the IDs: the repository is modified while looping
      jobIDs = list(self.readRepository()['Value'].keys())
    paramDict = {'State': 'Submitted',
                 'Retrieved': 0,
                 'OutputData': 0}
    for jobID in jobIDs:
      self._writeJob(jobID, paramDict, True)
    self._writeRepository(self.location)
    return S_OK()

  def _writeRepository(self, path):
    """Write the CFG to a temporary file and move it onto ``path``.

    :param str path: final location of the repository file
    :return: True on success, a false value otherwise
    """
    handle, tmpName = tempfile.mkstemp()
    try:
      written = self.repo.writeToFile(tmpName)
    finally:
      # Release the descriptor even if writeToFile raises
      os.close(handle)
    if not written:
      if os.path.exists(tmpName):
        os.remove(tmpName)
      return written
    if os.path.exists(path):
      gLogger.debug("Replacing %s" % path)
    try:
      shutil.move(tmpName, path)
      return True
    except Exception as x:
      # The temporary file is deliberately left in place as a backup
      gLogger.error("Failed to overwrite repository.", x)
      gLogger.info("If your repository is corrupted a backup can be found %s" % tmpName)
      return False

  def appendToRepository(self, repoLocation):
    """Merge the repository file at ``repoLocation`` with this one and save.

    :param str repoLocation: path of the secondary repository file
    :return: S_OK() or S_ERROR if the secondary repository does not exist
    """
    if not os.path.exists(repoLocation):
      gLogger.error("Secondary repository does not exist", repoLocation)
      return S_ERROR("Secondary repository does not exist")
    # NOTE(review): precedence on conflicting entries depends on
    # CFG.mergeWith semantics - confirm against the CFG implementation.
    self.repo = CFG().loadFromFile(repoLocation).mergeWith(self.repo)
    self._writeRepository(self.location)
    return S_OK()

  def addJob(self, jobID, state='Submitted', retrieved=0, outputData=0, update=False):
    """Add a job record and save the repository.

    :param jobID: job identifier
    :param str state: value stored as 'State'
    :param retrieved: value stored (as int) as 'Retrieved'
    :param outputData: value stored as 'OutputData'
    :param bool update: overwrite the record if the job already exists
    :return: S_OK(jobID)
    """
    paramDict = {'State': state,
                 'Time': self._getTime(),
                 'Retrieved': int(retrieved),
                 'OutputData': outputData}
    self._writeJob(jobID, paramDict, update)
    self._writeRepository(self.location)
    return S_OK(jobID)

  def updateJob(self, jobID, paramDict):
    """Update the options of an existing job and refresh its 'Time'.

    Unknown job IDs are silently ignored.

    :param jobID: job identifier
    :param dict paramDict: options to set; the caller's dict is not modified
    :return: S_OK()
    """
    if self._existsJob(jobID):
      # Work on a copy so the caller's dictionary does not gain a 'Time' key
      newParams = dict(paramDict)
      newParams['Time'] = self._getTime()
      self._writeJob(jobID, newParams, True)
      self._writeRepository(self.location)
    return S_OK()

  def updateJobs(self, jobDict):
    """Update several existing jobs and save the repository once.

    :param dict jobDict: {jobID: paramDict}; unknown job IDs are ignored and
                         the caller's dictionaries are not modified
    :return: S_OK()
    """
    for jobID, paramDict in jobDict.items():
      if self._existsJob(jobID):
        newParams = dict(paramDict)
        newParams['Time'] = self._getTime()
        self._writeJob(jobID, newParams, True)
    self._writeRepository(self.location)
    return S_OK()

  def _getTime(self):
    """Return ``time.ctime()`` with spaces replaced by underscores."""
    return time.ctime().replace(" ", "_")

  def _writeJob(self, jobID, paramDict, update):
    """Set the options of a job in the in-memory CFG (nothing is saved).

    :param jobID: job identifier
    :param dict paramDict: options to set under 'Jobs/<jobID>'
    :param bool update: allow modifying a job that already exists
    :return: S_OK() or S_ERROR if the job exists and ``update`` is false
    """
    jobID = str(jobID)
    jobExists = self._existsJob(jobID)
    if jobExists and (not update):
      gLogger.warn("Job exists and not overwriting")
      return S_ERROR("Job exists and not overwriting")
    if not jobExists:
      self.repo.createNewSection('Jobs/%s' % jobID)
    for key, value in paramDict.items():
      self.repo.setOption('Jobs/%s/%s' % (jobID, key), value)
    return S_OK()

  def removeJob(self, jobID):
    """Delete a job record; the repository is saved only if something was deleted.

    :param jobID: job identifier
    :return: S_OK()
    """
    res = self.repo['Jobs'].deleteKey(str(jobID))  # pylint: disable=no-member
    if res:
      self._writeRepository(self.location)
    return S_OK()

  def existsJob(self, jobID):
    """Return S_OK(True/False) depending on whether the job is in the repository."""
    return S_OK(self._existsJob(jobID))

  def _existsJob(self, jobID):
    """Return whether 'Jobs/<jobID>' is a section of the repository CFG."""
    return self.repo.isSection('Jobs/%s' % jobID)

  def getLocation(self):
    """Return S_OK with the path of the repository file."""
    return S_OK(self.location)

  def getSize(self):
    """Return S_OK with the number of entries in the 'Jobs' section."""
    return S_OK(len(self.repo.getAsDict('Jobs')))
  # Ask getPowerFromMJF() for the CPU power of this node; a false value is
  # reported as MJF not being available.
  mjfPower = getPowerFromMJF()
  if mjfPower:
    gLogger.notice( 'CPU power from MJF is %.1f HS06' % mjfPower )
  else:
    gLogger.notice( 'MJF not available on this node' )

  # With `update` set and no explicit config file, store the factors in
  # gConfig and dump it to gConfig.diracConfigFilePath. The scaling factor
  # falls back to `norm` when no MJF power was obtained.
  if update and not configFile:
    gConfig.setOptionValue( '/LocalSite/CPUScalingFactor', mjfPower if mjfPower else norm )
    gConfig.setOptionValue( '/LocalSite/CPUNormalizationFactor', norm )

    gConfig.dumpLocalCFGToFile( gConfig.diracConfigFilePath )
  # With an explicit config file, write the same two options into that file.
  if configFile:
    from DIRAC.Core.Utilities.CFG import CFG
    cfg = CFG()
    try:
      # Attempt to open the given file
      cfg.loadFromFile( configFile )
    # NOTE(review): bare except ignores every failure (including
    # KeyboardInterrupt/SystemExit); presumably meant to tolerate a missing
    # file - consider narrowing to `except Exception`.
    except:
      pass
    # Create the section if it does not exist
    if not cfg.existsKey( 'LocalSite' ):
      cfg.createNewSection( 'LocalSite' )
    cfg.setOption( '/LocalSite/CPUScalingFactor', mjfPower if mjfPower else norm )
    cfg.setOption( '/LocalSite/CPUNormalizationFactor', norm )

    cfg.writeToFile( configFile )


  DIRAC.exit()
Example #25
0
  localConfigFile = cFile
else:
  print "WORKSPACE: %s" % os.path.expandvars('$WORKSPACE')
  if os.path.isfile( os.path.expandvars('$WORKSPACE')+'/PilotInstallDIR/etc/dirac.cfg' ):
    localConfigFile = os.path.expandvars('$WORKSPACE')+'/PilotInstallDIR/etc/dirac.cfg'
  elif os.path.isfile( os.path.expandvars('$WORKSPACE')+'/ServerInstallDIR/etc/dirac.cfg' ):
    localConfigFile = os.path.expandvars('$WORKSPACE')+'/ServerInstallDIR/etc/dirac.cfg'
  elif os.path.isfile( './etc/dirac.cfg' ):
    localConfigFile = './etc/dirac.cfg'
  else:
    print "Local CFG file not found"
    exit( 2 )

# Load the selected local configuration file and make sure the /LocalSite
# section exists before setting options in it.
localCfg.loadFromFile( localConfigFile )
if not localCfg.isSection( '/LocalSite' ):
  localCfg.createNewSection( '/LocalSite' )
localCfg.setOption( '/LocalSite/CPUTimeLeft', 5000 )
localCfg.setOption( '/DIRAC/Security/UseServerCertificate', False )

# When sMod is not set: take setup and vo from gConfig if they were not
# given, and fall back to the hard-coded defaults if gConfig has no value.
if not sMod:
  if not setup:
    setup = gConfig.getValue('/DIRAC/Setup')
    if not setup:
      setup = 'JenkinsSetup'
  if not vo:
    vo = gConfig.getValue('/DIRAC/VirtualOrganization')
    if not vo:
      vo = 'dirac'

  # Make sure the /DIRAC/VOPolicy section exists
  if not localCfg.isSection( '/DIRAC/VOPolicy' ):
    localCfg.createNewSection( '/DIRAC/VOPolicy' )
    # Compute the CPU normalization of this node
    result = getCPUNormalization()

    # NOTE(review): on failure the error is only logged; execution continues
    # and result["Value"] is read below - confirm whether the script should
    # exit here instead.
    if not result["OK"]:
        DIRAC.gLogger.error(result["Message"])

    # Keep one decimal: add 0.05, scale by 10, truncate with int(), scale back
    norm = int((result["Value"]["NORM"] + 0.05) * 10) / 10.0

    DIRAC.gLogger.notice("Normalization for current CPU is %.1f %s" % (norm, result["Value"]["UNIT"]))

    # With `update` set, store the factor in gConfig and dump it to
    # gConfig.diracConfigFilePath.
    if update:
        DIRAC.gConfig.setOptionValue("/LocalSite/CPUNormalizationFactor", norm)
        DIRAC.gConfig.dumpLocalCFGToFile(DIRAC.gConfig.diracConfigFilePath)
    # With an explicit config file, write the factor into that file as well.
    if configFile:
        from DIRAC.Core.Utilities.CFG import CFG

        cfg = CFG()
        try:
            # Attempt to open the given file
            cfg.loadFromFile(configFile)
        # NOTE(review): bare except ignores every failure (including
        # KeyboardInterrupt/SystemExit); presumably meant to tolerate a
        # missing file - consider narrowing to `except Exception`.
        except:
            pass
        # Create the section if it does not exist
        if not cfg.existsKey("LocalSite"):
            cfg.createNewSection("LocalSite")
        cfg.setOption("/LocalSite/CPUNormalizationFactor", norm)

        cfg.writeToFile(configFile)

    DIRAC.exit()
Example #27
0
def loadJDLAsCFG(jdl):
  """
  Load a JDL as CFG

  The JDL text is scanned character by character: ``key = value;`` pairs
  become options and ``key = [ ... ]`` blocks become sub-sections (parsed by
  a recursive call). A leading "[" is skipped, and parsing stops at the first
  unquoted "]" or at the end of the text. An empty JDL yields an empty CFG.

  :param str jdl: JDL text
  :return: S_OK((cfg, position)) where position is the index at which
           parsing stopped (the closing "]" or len(jdl)), or S_ERROR with a
           description of the problem
  """
  def cleanValue(value):
    """Strip the quoting of a single JDL value.

    A value starting with a double quote is read as one or more quoted
    strings separated by commas and returned joined by ", "; any other
    value is returned with its double quotes removed.
    """
    value = value.strip()
    # startswith() also copes with an empty value (value[0] would raise)
    if not value.startswith('"'):
      return S_OK(value.replace('"', ''))
    entries = []
    current = ""
    # "in": inside a quoted string, "out": between two quoted strings
    state = "in"
    for char in value[1:]:
      if char == '"':
        if state == "in":
          entries.append(current)
          current = ""
          state = "out"
        elif state == "out":
          # Only a comma is allowed between two quoted strings
          current = current.strip()
          if current not in (",", ):
            return S_ERROR("value seems a list but is not separated in commas")
          current = ""
          state = "in"
      else:
        current += char
    if state == "in":
      return S_ERROR('value is opened with " but is not closed')
    return S_OK(", ".join(entries))

  def assignValue(key, value, cfg):
    """Clean ``value`` (scalar or "{...}" list) and set it as option ``key`` in ``cfg``."""
    key = key.strip()
    if not key:
      return S_ERROR("Invalid key name")
    value = value.strip()
    if not value:
      return S_ERROR("No value for key %s" % key)
    if value[0] == "{":
      if value[-1] != "}":
        return S_ERROR("Value '%s' seems a list but does not end in '}'" % (value))
      valList = List.fromChar(value[1:-1])
      for i, item in enumerate(valList):
        result = cleanValue(item)
        if not result['OK']:
          return S_ERROR("Var %s : %s" % (key, result['Message']))
        valList[i] = result['Value']
        if valList[i] is None:
          return S_ERROR("List value '%s' seems invalid for item %s" % (value, i))
      value = ", ".join(valList)
    else:
      result = cleanValue(value)
      if not result['OK']:
        return S_ERROR("Var %s : %s" % (key, result['Message']))
      nV = result['Value']
      if nV is None:
        return S_ERROR("Value '%s seems invalid" % (value))
      value = nV
    cfg.setOption(key, value)
    return S_OK()

  # Skip the opening bracket if present; slicing keeps an empty JDL from
  # raising IndexError.
  if jdl[:1] == "[":
    iPos = 1
  else:
    iPos = 0
  key = ""
  value = ""
  # "key": characters are appended to the key, "value": to the value
  action = "key"
  # True while inside a double-quoted literal of a value, where the
  # structural characters ; [ ] = lose their meaning
  insideLiteral = False
  cfg = CFG()
  while iPos < len(jdl):
    char = jdl[iPos]
    if char == ";" and not insideLiteral:
      # End of a "key = value" statement
      if key.strip():
        result = assignValue(key, value, cfg)
        if not result['OK']:
          return result
      key = ""
      value = ""
      action = "key"
    elif char == "[" and not insideLiteral:
      # Start of a sub JDL: parse it recursively into a new section
      key = key.strip()
      if not key:
        return S_ERROR("Invalid key in JDL")
      if value.strip():
        return S_ERROR("Key %s seems to have a value and open a sub JDL at the same time" % key)
      result = loadJDLAsCFG(jdl[iPos:])
      if not result['OK']:
        return result
      subCfg, subPos = result['Value']
      cfg.createNewSection(key, contents=subCfg)
      key = ""
      value = ""
      action = "key"
      insideLiteral = False
      # Jump to where the sub parser stopped; the increment at the end of
      # the loop then moves past that character
      iPos += subPos
    elif char == "=" and not insideLiteral:
      if action == "key":
        action = "value"
        insideLiteral = False
      else:
        # A further "=" belongs to the value itself
        value += char
    elif char == "]" and not insideLiteral:
      # End of this (sub) JDL: flush a pending assignment and stop
      key = key.strip()
      if key:
        result = assignValue(key, value, cfg)
        if not result['OK']:
          return result
      return S_OK((cfg, iPos))
    else:
      if action == "key":
        key += char
      else:
        value += char
        if char == '"':
          insideLiteral = not insideLiteral
    iPos += 1

  return S_OK((cfg, iPos))
Example #28
0
File: JDL.py Project: roiser/DIRAC
def loadJDLAsCFG(jdl):
    """
    Parse a JDL string into a CFG object.

    The text is scanned character by character. ``key = value;`` pairs become
    options, a ``[`` following a key opens a nested JDL that becomes a
    sub-section, and ``{ a, b }`` values are flattened into a comma separated
    string. The separators ``;``, ``[``, ``=`` and ``]`` are treated as plain
    text while inside a double-quoted literal of a value.

    NOTE: a trailing ``key = value`` that is terminated neither by ``;`` nor by
    ``]`` is not assigned.

    :param str jdl: JDL text, optionally starting with ``[``
    :return: S_OK( ( cfg, position ) ) where position is the index at which
             parsing of this (sub) JDL stopped, or S_ERROR with a message
    """

    def cleanValue(value):
        """
        Remove the quoting of a single value.

        A value starting with a double quote is read as a sequence of quoted
        entries separated by commas and returned joined by ", ". Any other
        value is returned with all double quotes removed.
        """
        value = value.strip()
        # startswith instead of value[ 0 ]: an empty value must not raise IndexError
        if not value.startswith('"'):
            return S_OK(value.replace('"', ""))
        entries = []
        current = ""
        insideQuotes = True  # the opening quote is skipped by the slice below
        for char in value[1:]:
            if char != '"':
                current += char
                continue
            if insideQuotes:
                entries.append(current)
            elif current.strip() != ",":
                # Only a comma is accepted between two quoted entries
                return S_ERROR("value seems a list but is not separated in commas")
            current = ""
            insideQuotes = not insideQuotes
        if insideQuotes:
            return S_ERROR('value is opened with " but is not closed')
        return S_OK(", ".join(entries))

    def assignValue(key, value, cfg):
        """
        Clean the value and store it in cfg under key.

        :return: S_OK() or S_ERROR if the key or the value are not usable
        """
        key = key.strip()
        if not key:
            return S_ERROR("Invalid key name")
        value = value.strip()
        if not value:
            return S_ERROR("No value for key %s" % key)
        if value[0] == "{":
            if value[-1] != "}":
                return S_ERROR("Value '%s' seems a list but does not end in '}'" % (value))
            cleanedItems = []
            for item in List.fromChar(value[1:-1]):
                result = cleanValue(item)
                if not result["OK"]:
                    return S_ERROR("Var %s : %s" % (key, result["Message"]))
                # cleanValue always returns a string on success, no None check needed
                cleanedItems.append(result["Value"])
            value = ", ".join(cleanedItems)
        else:
            result = cleanValue(value)
            if not result["OK"]:
                return S_ERROR("Var %s : %s" % (key, result["Message"]))
            value = result["Value"]
        cfg.setOption(key, value)
        return S_OK()

    # Skip the opening bracket if there is one (startswith is safe on empty input)
    iPos = 1 if jdl.startswith("[") else 0
    key = ""
    value = ""
    action = "key"  # what the plain characters are currently appended to
    insideLiteral = False
    cfg = CFG()
    while iPos < len(jdl):
        char = jdl[iPos]
        if char == ";" and not insideLiteral:
            # End of a statement: store it unless it is empty
            if key.strip():
                result = assignValue(key, value, cfg)
                if not result["OK"]:
                    return result
            key = ""
            value = ""
            action = "key"
        elif char == "[" and not insideLiteral:
            # A nested JDL: parse it recursively and attach it as a section
            key = key.strip()
            if not key:
                return S_ERROR("Invalid key in JDL")
            if value.strip():
                return S_ERROR("Key %s seems to have a value and open a sub JDL at the same time" % key)
            result = loadJDLAsCFG(jdl[iPos:])
            if not result["OK"]:
                return result
            subCfg, subPos = result["Value"]
            cfg.createNewSection(key, contents=subCfg)
            key = ""
            value = ""
            action = "key"
            insideLiteral = False
            # subPos is relative to the slice handed to the recursive call
            iPos += subPos
        elif char == "=" and not insideLiteral:
            if action == "key":
                # First "=" of the statement separates key from value
                action = "value"
                insideLiteral = False
            else:
                value += char
        elif char == "]" and not insideLiteral:
            # End of this (sub) JDL: store the pending statement, if any
            key = key.strip()
            if key:
                result = assignValue(key, value, cfg)
                if not result["OK"]:
                    return result
            return S_OK((cfg, iPos))
        else:
            if action == "key":
                key += char
            else:
                value += char
                if char == '"':
                    insideLiteral = not insideLiteral
        iPos += 1

    return S_OK((cfg, iPos))
#     ProductionSandboxSE
#     {
#       BackendType = DISET
#       AccessProtocol.1
#       {
#         Host = localhost
#         Port = 9196
#         ProtocolName = DIP
#         Protocol = dips
#         Path = /scratch/workspace/%s/sandboxes % setupName
#         Access = remote
#         SpaceToken =
#         WSUrl =
#       }
#     }
# Declare the ProductionSandboxSE storage element and its AccessProtocol.1 section
sandboxSEPath = 'Resources/StorageElements/ProductionSandboxSE'
accessProtocolPath = sandboxSEPath + '/AccessProtocol.1'

localCfg.createNewSection( 'Resources/StorageElements/' )
localCfg.createNewSection( sandboxSEPath )
localCfg.setOption( sandboxSEPath + '/BackendType', 'DISET' )

localCfg.createNewSection( accessProtocolPath )
localCfg.setOption( accessProtocolPath + '/Host', 'localhost' )
localCfg.setOption( accessProtocolPath + '/Port', '9196' )
localCfg.setOption( accessProtocolPath + '/ProtocolName', 'DIP' )
# The sandbox path is derived from the setup name
localCfg.setOption( accessProtocolPath + '/Path', '%s/sandboxes' % setupName )
localCfg.setOption( accessProtocolPath + '/Access', 'remote' )



# Setup the DFC
#
# DataManagement
# {
Example #30
0
    mjfPower = getPowerFromMJF()
    if mjfPower:
        gLogger.notice('CPU power from MJF is %.1f HS06' % mjfPower)
    else:
        gLogger.notice('MJF not available on this node')

    # Update the configuration held by gConfig and dump it to the DIRAC
    # configuration file when no explicit file was requested
    if update and not configFile:
        # The MJF value, when available, takes precedence as scaling factor
        gConfig.setOptionValue('/LocalSite/CPUScalingFactor',
                               mjfPower if mjfPower else norm)
        gConfig.setOptionValue('/LocalSite/CPUNormalizationFactor', norm)

        gConfig.dumpLocalCFGToFile(gConfig.diracConfigFilePath)
    # Write the same two options into the explicitly given configuration file
    if configFile:
        from DIRAC.Core.Utilities.CFG import CFG
        cfg = CFG()
        try:
            # Attempt to open the given file
            cfg.loadFromFile(configFile)
        except Exception:
            # Best effort: if the file cannot be loaded, continue with an empty
            # configuration. Catching Exception instead of using a bare except
            # lets KeyboardInterrupt and SystemExit propagate.
            pass
        # Create the section if it does not exist
        if not cfg.existsKey('LocalSite'):
            cfg.createNewSection('LocalSite')
        cfg.setOption('/LocalSite/CPUScalingFactor',
                      mjfPower if mjfPower else norm)
        cfg.setOption('/LocalSite/CPUNormalizationFactor', norm)

        cfg.writeToFile(configFile)

    DIRAC.exit()
Example #31
0
        localConfigFile = os.path.expandvars(
            '$WORKSPACE') + '/PilotInstallDIR/etc/dirac.cfg'
    elif os.path.isfile(
            os.path.expandvars('$WORKSPACE') +
            '/ServerInstallDIR/etc/dirac.cfg'):
        localConfigFile = os.path.expandvars(
            '$WORKSPACE') + '/ServerInstallDIR/etc/dirac.cfg'
    elif os.path.isfile('./etc/dirac.cfg'):
        localConfigFile = './etc/dirac.cfg'
    else:
        print "Local CFG file not found"
        exit(2)

# Load the local configuration file and make sure the /LocalSite section is there
localCfg.loadFromFile(localConfigFile)
if not localCfg.isSection('/LocalSite'):
    localCfg.createNewSection('/LocalSite')
localCfg.setOption('/LocalSite/CPUTimeLeft', 5000)
localCfg.setOption('/DIRAC/Security/UseServerCertificate', False)

if not sMod:
    # Setup: given value, else the configured one, else the hard-coded default
    if not setup:
        setup = gConfig.getValue('/DIRAC/Setup') or 'dirac-JenkinsSetup'
    # Same fallback chain for the virtual organization
    if not vo:
        vo = gConfig.getValue('/DIRAC/VirtualOrganization') or 'dirac'

    if not localCfg.isSection('/DIRAC/VOPolicy'):
        localCfg.createNewSection('/DIRAC/VOPolicy')
#     ProductionSandboxSE
#     {
#       BackendType = DISET
#       AccessProtocol.1
#       {
#         Host = localhost
#         Port = 9196
#         ProtocolName = DIP
#         Protocol = dips
#         Path = /scratch/workspace/%s/sandboxes % setupName
#         Access = remote
#         SpaceToken =
#         WSUrl =
#       }
#     }
# Declare the ProductionSandboxSE storage element and its AccessProtocol.1 section
sandboxSEPath = 'Resources/StorageElements/ProductionSandboxSE'
accessProtocolPath = sandboxSEPath + '/AccessProtocol.1'

localCfg.createNewSection('Resources/StorageElements/')
localCfg.createNewSection(sandboxSEPath)
localCfg.setOption(sandboxSEPath + '/BackendType', 'DISET')

localCfg.createNewSection(accessProtocolPath)
localCfg.setOption(accessProtocolPath + '/Host', 'localhost')
localCfg.setOption(accessProtocolPath + '/Port', '9196')
localCfg.setOption(accessProtocolPath + '/ProtocolName', 'DIP')
localCfg.setOption(
Example #33
0
class JobManifest(object):
    """
    Job description kept as a CFG tree, with loading, validation and editing helpers.

    The manifest can be loaded from JDL or CFG text. createSection, getSection,
    setSectionContents, setOption and remove raise a "dirty" flag that can be
    queried with isDirty and reset with clearDirty.
    """

    def __init__(self, manifest=""):
        """
        :param str manifest: optional JDL or CFG text to load
        :raises Exception: if the given text cannot be loaded
        """
        self.__manifest = CFG()
        self.__dirty = False
        self.__ops = False  # Operations helper, created on first use in __getCSValue
        if manifest:
            result = self.load(manifest)
            if not result["OK"]:
                raise Exception(result["Message"])

    def isDirty(self):
        """Return the current value of the dirty flag."""
        return self.__dirty

    def setDirty(self):
        """Raise the dirty flag."""
        self.__dirty = True

    def clearDirty(self):
        """Reset the dirty flag."""
        self.__dirty = False

    def load(self, dataString):
        """
        Auto discover format type based on [ .. ] of JDL

        :param str dataString: JDL or CFG text
        :return: S_OK() / S_ERROR
        """
        dataString = dataString.strip()
        # startswith/endswith instead of indexing, so that empty or
        # whitespace-only text goes to the CFG loader instead of raising IndexError
        if dataString.startswith("[") and dataString.endswith("]"):
            return self.loadJDL(dataString)
        return self.loadCFG(dataString)

    def loadJDL(self, jdlString):
        """
        Load job manifest from JDL format

        If the JDL cannot be parsed the manifest is replaced by an empty CFG.

        :return: S_OK() or the S_ERROR returned by loadJDLAsCFG
        """
        result = loadJDLAsCFG(jdlString.strip())
        if not result["OK"]:
            self.__manifest = CFG()
            return result
        # The value is a ( cfg, position ) tuple, only the cfg is kept
        self.__manifest = result["Value"][0]
        return S_OK()

    def loadCFG(self, cfgString):
        """
        Load job manifest from CFG format

        :return: S_OK() or S_ERROR if loadFromBuffer raised an exception
        """
        try:
            self.__manifest.loadFromBuffer(cfgString)
        except Exception as e:
            return S_ERROR("Can't load manifest from cfg: %s" % str(e))
        return S_OK()

    def dumpAsCFG(self):
        """Return the manifest converted with str()."""
        return str(self.__manifest)

    def getAsCFG(self):
        """Return a clone of the manifest CFG."""
        return self.__manifest.clone()

    def dumpAsJDL(self):
        """Return the manifest dumped with dumpCFGAsJDL."""
        return dumpCFGAsJDL(self.__manifest)

    def __getCSValue(self, varName, defaultVal=None):
        """
        Get a value through the Operations helper.

        The helper is created on first use from the OwnerGroup and DIRACSetup
        found in the manifest. Names not starting with "/" are looked up under
        "JobDescription/".
        """
        if not self.__ops:
            self.__ops = Operations(group=self.__manifest["OwnerGroup"], setup=self.__manifest["DIRACSetup"])
        if varName[0] != "/":
            varName = "JobDescription/%s" % varName
        return self.__ops.getValue(varName, defaultVal)

    def __checkNumericalVar(self, varName, defaultVal, minVal, maxVal):
        """
        Check a numerical var

        The value (from the manifest, or Default<varName> from the Operations
        helper if missing) is converted to an integer, clamped between
        Min<varName> and Max<varName>, and written back when it differs from
        what the manifest had.

        :return: S_OK( value ) or S_ERROR if the value is not a number
        """
        initialVal = False
        if varName not in self.__manifest:
            varValue = self.__getCSValue("Default%s" % varName, defaultVal)
        else:
            varValue = self.__manifest[varName]
            initialVal = varValue
        try:
            # int() instead of long(): it yields arbitrary precision integers and
            # exists in both Python 2 and Python 3
            varValue = int(varValue)
        except (TypeError, ValueError, OverflowError):
            return S_ERROR("%s must be a number" % varName)
        minVal = self.__getCSValue("Min%s" % varName, minVal)
        maxVal = self.__getCSValue("Max%s" % varName, maxVal)
        varValue = max(minVal, min(varValue, maxVal))
        if initialVal != varValue:
            self.__manifest.setOption(varName, varValue)
        return S_OK(varValue)

    def __checkChoiceVar(self, varName, defaultVal, choices):
        """
        Check a choice var

        A missing variable takes Default<varName> from the Operations helper
        and is stored in the manifest once validated against Choices<varName>.

        :return: S_OK( value ) or S_ERROR if the value is not an allowed choice
        """
        initialVal = False
        if varName not in self.__manifest:
            varValue = self.__getCSValue("Default%s" % varName, defaultVal)
        else:
            varValue = self.__manifest[varName]
            initialVal = varValue
        if varValue not in self.__getCSValue("Choices%s" % varName, choices):
            return S_ERROR("%s is not a valid value for %s" % (varValue, varName))
        if initialVal != varValue:
            self.__manifest.setOption(varName, varValue)
        return S_OK(varValue)

    def __checkMultiChoice(self, varName, choices):
        """
        Check a multi choice var

        Every item of the comma separated value has to be in Choices<varName>.
        A variable missing from the manifest is accepted as is.

        :return: S_OK() if missing, S_OK( value ) if valid, S_ERROR otherwise
        """
        if varName not in self.__manifest:
            return S_OK()
        varValue = self.__manifest[varName]
        choices = self.__getCSValue("Choices%s" % varName, choices)
        for v in List.fromChar(varValue):
            if v not in choices:
                return S_ERROR("%s is not a valid value for %s" % (v, varName))
        # The value is never modified here, so there is nothing to write back
        return S_OK(varValue)

    def __checkMaxInputData(self, maxNumber):
        """
        Check Maximum Number of Input Data files allowed

        :return: S_OK() or S_ERROR if InputData has more than maxNumber items
        """
        varName = "InputData"
        if varName not in self.__manifest:
            return S_OK()
        numFiles = len(List.fromChar(self.__manifest[varName]))
        if numFiles > maxNumber:
            return S_ERROR(
                "Number of Input Data Files (%s) greater than current limit: %s"
                % (numFiles, maxNumber)
            )
        return S_OK()

    def __contains__(self, key):
        """ Check if the manifest has the required key
    """
        return key in self.__manifest

    def setOptionsFromDict(self, varDict):
        """Set every key of varDict as an option, in sorted key order."""
        for k in sorted(varDict):
            self.setOption(k, varDict[k])

    def check(self):
        """
        Check that the manifest is OK

        Requires the owner and setup variables and validates CPUTime, Priority,
        SubmitPools, PilotTypes, the number of input data files and JobType.

        :return: S_OK() or the S_ERROR of the first failed check
        """
        for k in ["OwnerName", "OwnerDN", "OwnerGroup", "DIRACSetup"]:
            if k not in self.__manifest:
                return S_ERROR("Missing var %s in manifest" % k)

        # Check CPUTime
        result = self.__checkNumericalVar("CPUTime", 86400, 100, 500000)
        if not result["OK"]:
            return result

        result = self.__checkNumericalVar("Priority", 1, 0, 10)
        if not result["OK"]:
            return result

        allowedSubmitPools = getSubmitPools(self.__manifest["OwnerGroup"])
        result = self.__checkMultiChoice("SubmitPools", list(set(allowedSubmitPools)))
        if not result["OK"]:
            return result

        result = self.__checkMultiChoice("PilotTypes", ["private"])
        if not result["OK"]:
            return result

        maxInputData = Operations().getValue("JobDescription/MaxInputData", 500)
        result = self.__checkMaxInputData(maxInputData)
        if not result["OK"]:
            return result

        transformationTypes = Operations().getValue("Transformations/DataProcessing", [])
        result = self.__checkMultiChoice("JobType", ["User", "Test", "Hospital"] + transformationTypes)
        if not result["OK"]:
            return result
        return S_OK()

    def createSection(self, secName, contents=False):
        """
        Create a new section in the manifest.

        :param str secName: name of the section
        :param contents: optional CFG object to use as contents
        :return: S_OK( result of createNewSection ) or S_ERROR if the section
                 exists or contents is not a CFG
        """
        if secName not in self.__manifest:
            if contents and not isinstance(contents, CFG):
                return S_ERROR("Contents for section %s is not a cfg object" % secName)
            self.__dirty = True
            return S_OK(self.__manifest.createNewSection(secName, contents=contents))
        return S_ERROR("Section %s already exists" % secName)

    def getSection(self, secName):
        """
        Get a section of the manifest.

        The dirty flag is raised in every case, before the section is looked up.

        :return: S_OK( section ) or S_ERROR if it is missing or empty
        """
        self.__dirty = True
        # Check membership first instead of indexing a key that may not be there
        if secName not in self.__manifest:
            return S_ERROR("%s does not exist" % secName)
        sec = self.__manifest[secName]
        if not sec:
            return S_ERROR("%s section empty" % secName)
        return S_OK(sec)

    def setSectionContents(self, secName, contents):
        """
        Replace the contents of a section, creating the section if needed.

        NOTE: nothing is returned on success; S_ERROR is returned only when
        contents is not a CFG object.
        """
        if contents and not isinstance(contents, CFG):
            return S_ERROR("Contents for section %s is not a cfg object" % secName)
        self.__dirty = True
        if secName in self.__manifest:
            self.__manifest[secName].reset()
            self.__manifest[secName].mergeWith(contents)
        else:
            self.__manifest.createNewSection(secName, contents=contents)

    def setOption(self, varName, varValue):
        """
        Set a var in job manifest

        varName may be a "/" separated path; missing intermediate sections are
        created on the way.
        """
        self.__dirty = True
        levels = List.fromChar(varName, "/")
        cfg = self.__manifest
        for l in levels[:-1]:
            if l not in cfg:
                cfg.createNewSection(l)
            cfg = cfg[l]
        cfg.setOption(levels[-1], varValue)

    def remove(self, opName):
        """
        Delete the key found at the "/" separated path opName.

        :return: S_OK() or S_ERROR if the path does not exist
        """
        levels = List.fromChar(opName, "/")
        cfg = self.__manifest
        for l in levels[:-1]:
            if l not in cfg:
                return S_ERROR("%s does not exist" % opName)
            cfg = cfg[l]
        if cfg.deleteKey(levels[-1]):
            self.__dirty = True
            return S_OK()
        return S_ERROR("%s does not exist" % opName)

    def getOption(self, varName, defaultValue=None):
        """
     Get a variable from the job manifest
    """
        cfg = self.__manifest
        return cfg.getOption(varName, defaultValue)

    def getOptionList(self, section=""):
        """
    Get a list of variables in a section of the job manifest
    """
        cfg = self.__manifest.getRecursive(section)
        if not cfg or "value" not in cfg:
            return []
        cfg = cfg["value"]
        return cfg.listOptions()

    def isOption(self, opName):
        """
    Check if it is a valid option
    """
        return self.__manifest.isOption(opName)

    def getSectionList(self, section=""):
        """
    Get a list of sections in the job manifest
    """
        cfg = self.__manifest.getRecursive(section)
        if not cfg or "value" not in cfg:
            return []
        cfg = cfg["value"]
        return cfg.listSections()
Example #34
0
class JobManifest(object):
    """
    Job description kept as a CFG tree, with loading, validation and editing helpers.

    The manifest can be loaded from JDL or CFG text. createSection, getSection,
    setSectionContents, setOption and remove raise a "dirty" flag that can be
    queried with isDirty and reset with clearDirty.
    """

    def __init__(self, manifest=""):
        """
        :param str manifest: optional JDL or CFG text to load
        :raises Exception: if the given text cannot be loaded
        """
        self.__manifest = CFG()
        self.__dirty = False
        self.__ops = False  # Operations helper, created on first use in __getCSValue
        if manifest:
            result = self.load(manifest)
            if not result['OK']:
                raise Exception(result['Message'])

    def isDirty(self):
        """Return the current value of the dirty flag."""
        return self.__dirty

    def setDirty(self):
        """Raise the dirty flag."""
        self.__dirty = True

    def clearDirty(self):
        """Reset the dirty flag."""
        self.__dirty = False

    def load(self, dataString):
        """
        Auto discover format type based on [ .. ] of JDL

        :param str dataString: JDL or CFG text
        :return: S_OK() / S_ERROR
        """
        dataString = dataString.strip()
        # startswith/endswith instead of indexing, so that empty or
        # whitespace-only text goes to the CFG loader instead of raising IndexError
        if dataString.startswith("[") and dataString.endswith("]"):
            return self.loadJDL(dataString)
        return self.loadCFG(dataString)

    def loadJDL(self, jdlString):
        """
        Load job manifest from JDL format

        If the JDL cannot be parsed the manifest is replaced by an empty CFG.

        :return: S_OK() or the S_ERROR returned by loadJDLAsCFG
        """
        result = loadJDLAsCFG(jdlString.strip())
        if not result['OK']:
            self.__manifest = CFG()
            return result
        # The value is a ( cfg, position ) tuple, only the cfg is kept
        self.__manifest = result['Value'][0]
        return S_OK()

    def loadCFG(self, cfgString):
        """
        Load job manifest from CFG format

        :return: S_OK() or S_ERROR if loadFromBuffer raised an exception
        """
        try:
            self.__manifest.loadFromBuffer(cfgString)
        except Exception as e:
            return S_ERROR("Can't load manifest from cfg: %s" % str(e))
        return S_OK()

    def dumpAsCFG(self):
        """Return the manifest converted with str()."""
        return str(self.__manifest)

    def getAsCFG(self):
        """Return a clone of the manifest CFG."""
        return self.__manifest.clone()

    def dumpAsJDL(self):
        """Return the manifest dumped with dumpCFGAsJDL."""
        return dumpCFGAsJDL(self.__manifest)

    def __getCSValue(self, varName, defaultVal=None):
        """
        Get a value through the Operations helper.

        The helper is created on first use from the OwnerGroup and DIRACSetup
        found in the manifest. Names not starting with "/" are looked up under
        "JobDescription/".
        """
        if not self.__ops:
            self.__ops = Operations(group=self.__manifest['OwnerGroup'],
                                    setup=self.__manifest['DIRACSetup'])
        if varName[0] != "/":
            varName = "JobDescription/%s" % varName
        return self.__ops.getValue(varName, defaultVal)

    def __checkNumericalVar(self, varName, defaultVal, minVal, maxVal):
        """
        Check a numerical var

        The value (from the manifest, or Default<varName> from the Operations
        helper if missing) is converted to an integer, clamped between
        Min<varName> and Max<varName>, and written back when it differs from
        what the manifest had.

        :return: S_OK( value ) or S_ERROR if the value is not a number
        """
        initialVal = False
        if varName not in self.__manifest:
            varValue = self.__getCSValue("Default%s" % varName, defaultVal)
        else:
            varValue = self.__manifest[varName]
            initialVal = varValue
        try:
            # int() instead of long(): it yields arbitrary precision integers and
            # exists in both Python 2 and Python 3
            varValue = int(varValue)
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures; BaseException would also have swallowed
            # KeyboardInterrupt and SystemExit
            return S_ERROR("%s must be a number" % varName)
        minVal = self.__getCSValue("Min%s" % varName, minVal)
        maxVal = self.__getCSValue("Max%s" % varName, maxVal)
        varValue = max(minVal, min(varValue, maxVal))
        if initialVal != varValue:
            self.__manifest.setOption(varName, varValue)
        return S_OK(varValue)

    def __checkChoiceVar(self, varName, defaultVal, choices):
        """
        Check a choice var

        A missing variable takes Default<varName> from the Operations helper
        and is stored in the manifest once validated against Choices<varName>.

        :return: S_OK( value ) or S_ERROR if the value is not an allowed choice
        """
        initialVal = False
        if varName not in self.__manifest:
            varValue = self.__getCSValue("Default%s" % varName, defaultVal)
        else:
            varValue = self.__manifest[varName]
            initialVal = varValue
        if varValue not in self.__getCSValue("Choices%s" % varName, choices):
            return S_ERROR("%s is not a valid value for %s" %
                           (varValue, varName))
        if initialVal != varValue:
            self.__manifest.setOption(varName, varValue)
        return S_OK(varValue)

    def __checkMultiChoice(self, varName, choices):
        """
        Check a multi choice var

        Every item of the comma separated value has to be in Choices<varName>.
        A variable missing from the manifest is accepted as is.

        :return: S_OK() if missing, S_OK( value ) if valid, S_ERROR otherwise
        """
        if varName not in self.__manifest:
            return S_OK()
        varValue = self.__manifest[varName]
        choices = self.__getCSValue("Choices%s" % varName, choices)
        for v in List.fromChar(varValue):
            if v not in choices:
                return S_ERROR("%s is not a valid value for %s" % (v, varName))
        # The value is never modified here, so there is nothing to write back
        return S_OK(varValue)

    def __checkMaxInputData(self, maxNumber):
        """
        Check Maximum Number of Input Data files allowed

        :return: S_OK() or S_ERROR if InputData has more than maxNumber items
        """
        varName = "InputData"
        if varName not in self.__manifest:
            return S_OK()
        numFiles = len(List.fromChar(self.__manifest[varName]))
        if numFiles > maxNumber:
            return S_ERROR(
                'Number of Input Data Files (%s) greater than current limit: %s'
                % (numFiles, maxNumber))
        return S_OK()

    def __contains__(self, key):
        """ Check if the manifest has the required key
    """
        return key in self.__manifest

    def setOptionsFromDict(self, varDict):
        """Set every key of varDict as an option, in sorted key order."""
        for k in sorted(varDict):
            self.setOption(k, varDict[k])

    def check(self):
        """
        Check that the manifest is OK

        Requires the owner and setup variables and validates CPUTime, Priority,
        SubmitPools, PilotTypes, the number of input data files and JobType.

        :return: S_OK() or the S_ERROR of the first failed check
        """
        for k in ['OwnerName', 'OwnerDN', 'OwnerGroup', 'DIRACSetup']:
            if k not in self.__manifest:
                return S_ERROR("Missing var %s in manifest" % k)

        # Check CPUTime
        result = self.__checkNumericalVar("CPUTime", 86400, 100, 500000)
        if not result['OK']:
            return result

        result = self.__checkNumericalVar("Priority", 1, 0, 10)
        if not result['OK']:
            return result

        allowedSubmitPools = getSubmitPools(self.__manifest['OwnerGroup'])
        result = self.__checkMultiChoice("SubmitPools",
                                         list(set(allowedSubmitPools)))
        if not result['OK']:
            return result

        result = self.__checkMultiChoice("PilotTypes", ['private'])
        if not result['OK']:
            return result

        maxInputData = Operations().getValue("JobDescription/MaxInputData",
                                             500)
        result = self.__checkMaxInputData(maxInputData)
        if not result['OK']:
            return result

        transformationTypes = Operations().getValue(
            "Transformations/DataProcessing", [])
        result = self.__checkMultiChoice(
            "JobType", ['User', 'Test', 'Hospital'] + transformationTypes)
        if not result['OK']:
            return result
        return S_OK()

    def createSection(self, secName, contents=False):
        """
        Create a new section in the manifest.

        :param str secName: name of the section
        :param contents: optional CFG object to use as contents
        :return: S_OK( result of createNewSection ) or S_ERROR if the section
                 exists or contents is not a CFG
        """
        if secName not in self.__manifest:
            if contents and not isinstance(contents, CFG):
                return S_ERROR("Contents for section %s is not a cfg object" %
                               secName)
            self.__dirty = True
            return S_OK(
                self.__manifest.createNewSection(secName, contents=contents))
        return S_ERROR("Section %s already exists" % secName)

    def getSection(self, secName):
        """
        Get a section of the manifest.

        The dirty flag is raised in every case, before the section is looked up.

        :return: S_OK( section ) or S_ERROR if it is missing or empty
        """
        self.__dirty = True
        if secName not in self.__manifest:
            return S_ERROR("%s does not exist" % secName)
        sec = self.__manifest[secName]
        if not sec:
            return S_ERROR("%s section empty" % secName)
        return S_OK(sec)

    def setSectionContents(self, secName, contents):
        """
        Replace the contents of a section, creating the section if needed.

        NOTE: nothing is returned on success; S_ERROR is returned only when
        contents is not a CFG object.
        """
        if contents and not isinstance(contents, CFG):
            return S_ERROR("Contents for section %s is not a cfg object" %
                           secName)
        self.__dirty = True
        if secName in self.__manifest:
            self.__manifest[secName].reset()
            self.__manifest[secName].mergeWith(contents)
        else:
            self.__manifest.createNewSection(secName, contents=contents)

    def setOption(self, varName, varValue):
        """
        Set a var in job manifest

        varName may be a "/" separated path; missing intermediate sections are
        created on the way.
        """
        self.__dirty = True
        levels = List.fromChar(varName, "/")
        cfg = self.__manifest
        for l in levels[:-1]:
            if l not in cfg:
                cfg.createNewSection(l)
            cfg = cfg[l]
        cfg.setOption(levels[-1], varValue)

    def remove(self, opName):
        """
        Delete the key found at the "/" separated path opName.

        :return: S_OK() or S_ERROR if the path does not exist
        """
        levels = List.fromChar(opName, "/")
        cfg = self.__manifest
        for l in levels[:-1]:
            if l not in cfg:
                return S_ERROR("%s does not exist" % opName)
            cfg = cfg[l]
        if cfg.deleteKey(levels[-1]):
            self.__dirty = True
            return S_OK()
        return S_ERROR("%s does not exist" % opName)

    def getOption(self, varName, defaultValue=None):
        """
     Get a variable from the job manifest
    """
        cfg = self.__manifest
        return cfg.getOption(varName, defaultValue)

    def getOptionList(self, section=""):
        """
    Get a list of variables in a section of the job manifest
    """
        cfg = self.__manifest.getRecursive(section)
        if not cfg or 'value' not in cfg:
            return []
        cfg = cfg['value']
        return cfg.listOptions()

    def isOption(self, opName):
        """
    Check if it is a valid option
    """
        return self.__manifest.isOption(opName)

    def getSectionList(self, section=""):
        """
    Get a list of sections in the job manifest
    """
        cfg = self.__manifest.getRecursive(section)
        if not cfg or 'value' not in cfg:
            return []
        cfg = cfg['value']
        return cfg.listSections()