Example #1
0
class ProcessList(object):
  """ The ProcessList uses internally the CFG utility to store the processes and their properties.
  """
  def __init__(self, location):
    self.cfg = CFG()
    self.location = location
    self.goodProcessList = True
    if os.path.exists(self.location):
      self.cfg.loadFromFile(self.location)
      if not self.cfg.existsKey('Processes'):
        self.cfg.createNewSection('Processes')
    else:
      self.goodProcessList = False  
      
  def _writeProcessList(self, path):
    """ Write to text
    """
    handle, tmpName = tempfile.mkstemp()
    written = self.cfg.writeToFile(tmpName)
    os.close(handle)
    if not written:
      if os.path.exists(tmpName):
        os.remove(tmpName)
      return written
    if os.path.exists(path):
      LOG.debug("Replacing %s" % path)
    try:
      shutil.move(tmpName, path)
      return True
    except OSError, err:
      LOG.error("Failed to overwrite process list.", err)
      LOG.info("If your process list is corrupted a backup can be found %s" % tmpName)
      return False
Example #2
0
  def _parseConfigTemplate(self, templatePath, cfg=None):
    """Parse the ConfigTemplate.cfg files.

    :param str templatePath: path to the folder containing a ConfigTemplate.cfg file
    :param CFG cfg: cfg to merge with the systems config
    :returns: CFG object
    """
    cfg = CFG() if cfg is None else cfg

    system = os.path.split(templatePath.rstrip("/"))[1]
    if system.lower().endswith('system'):
      system = system[:-len('System')]

    if self.systems and system not in self.systems:
      return S_OK(cfg)

    templatePath = os.path.join(templatePath, 'ConfigTemplate.cfg')
    if not os.path.exists(templatePath):
      return S_ERROR("File not found: %s" % templatePath)

    loadCfg = CFG()
    loadCfg.loadFromFile(templatePath)

    newCfg = CFG()
    newCfg.createNewSection("/%s" % system, contents=loadCfg)

    cfg = cfg.mergeWith(newCfg)

    return S_OK(cfg)
Example #3
0
class ProcessList(object):
  """ The ProcessList uses internally the CFG utility to store the processes and their properties.
  """
  def __init__(self, location):
    self.cfg = CFG()
    self.location = location
    self.goodProcessList = True
    if os.path.exists(self.location):
      self.cfg.loadFromFile(self.location)
      if not self.cfg.existsKey('Processes'):
        self.cfg.createNewSection('Processes')
    else:
      self.goodProcessList = False  
      
  def _writeProcessList(self, path):
    """ Write to text
    """
    handle, tmpName = tempfile.mkstemp()
    written = self.cfg.writeToFile(tmpName)
    os.close(handle)
    if not written:
      if os.path.exists(tmpName):
        os.remove(tmpName)
      return written
    if os.path.exists(path):
      gLogger.debug("Replacing %s" % path)
    try:
      shutil.move(tmpName, path)
      return True
    except OSError, err:
      gLogger.error("Failed to overwrite process list.", err)
      gLogger.info("If your process list is corrupted a backup can be found %s" % tmpName)
      return False
Example #4
0
  def parseConfigTemplate(self, templatePath, cfg):
    """Parse the ConfigTemplate.cfg files.

    :param str templatePath: path to the folder containing a ConfigTemplate.cfg file
    :param CFG cfg: cfg to merge with the systems config
    :returns: CFG object
    """
    system = os.path.split(templatePath.rstrip('/'))[1]
    if system.lower().endswith('system'):
      system = system[:-len('System')]

    templatePath = os.path.join(templatePath, 'ConfigTemplate.cfg')
    if not os.path.exists(templatePath):
      return S_ERROR('File not found: %s' % templatePath)

    loadCfg = CFG()
    try:
      loadCfg.loadFromFile(templatePath)
    except ValueError as err:
      LOG.error('Failed loading file %r: %r', templatePath, err)
      self.retVal = 1
      return S_ERROR()
    cfg.createNewSection('/Systems/%s' % system, contents=loadCfg)

    return S_OK(cfg)
Example #5
0
def checkAgentOptions(getOptionMock, systemName, agentName,
                      ignoreOptions=None, extension='DIRAC'):
  """Ensure that all the agent options are properly documented.

  :param getOptionMock: Mock object for agentmodule.get_amOption function
  :param str systemName: name of the **System**
  :param str agentName: name of the **Agent**
  :param list ignoreOptions: list of options to ignore
  :param str extension: name of the DIRAC **Extension** where the Agent comes from
  """
  if ignoreOptions is None:
    ignoreOptions = []

  # add some options that can be set, see the AgentModule for all of them
  ignoreOptions.extend(['PollingTime', 'Status', 'Enabled', 'MaxCycles', 'LogOutputs', 'ControlDirectory'])
  ignoreOptions = list(set(ignoreOptions))
  config = CFG()

  LOG.info("Testing %s/%s, ignoring options %s", systemName, agentName, ignoreOptions)

  # get the location where DIRAC is in from basefolder/DIRAC/__ini__.py
  configFilePath = os.path.join(os.path.dirname(os.path.dirname(DIRAC.__file__)),
                                extension, systemName, 'ConfigTemplate.cfg')
  config.loadFromFile(configFilePath)
  optionsDict = config.getAsDict('Agents/%s' % agentName)
  outDict = {}
  _parseOption(outDict, optionsDict)
  optionsDict = outDict
  LOG.info("Calls: %s", pformat(getOptionMock.call_args_list))
  LOG.info("Options found in ConfigTemplate: %s ", list(optionsDict.keys()))

  # check that values in ConfigTemplate are used
  for option, value in optionsDict.iteritems():
    if any(ignoreOp in option for ignoreOp in ignoreOptions):
      LOG.info("From Agent: ignoring option %r with value %r, (%s)", option, value, type(value))
      continue
    LOG.info("Looking for call to option %r with value %r, (%s)", option, value, type(value))
    if not isinstance(value, bool) and not value:  # empty string, list, dict ...
      assert any(call(option, null) in getOptionMock.call_args_list for null in ({}, set(), [], '', 0))
    else:
      assert call(option, value) in getOptionMock.call_args_list or \
          call(option, [value]) in getOptionMock.call_args_list

  # check that options used in the agent are in the ConfigTemplates
  for opCall in getOptionMock.call_args_list:
    optionArguments = opCall[0]
    if len(optionArguments) != 2:
      continue
    optionName = optionArguments[0]
    optionValue = optionArguments[1]
    if optionName in ignoreOptions:
      LOG.info("From Template: ignoring option %r with %r", optionName, optionValue)
      continue
    LOG.info("Checking Template option %r with %r", optionName, optionValue)
    assert optionName in optionsDict
    if not optionsDict[optionName]:
      assert not optionValue
      continue
    assert optionsDict[optionName] == optionValue or [optionsDict[optionName]] == optionValue
 def loadFile( self, fileName ):
   try:
     fileCFG = CFG()
     fileCFG.loadFromFile( fileName )
   except IOError:
     self.localCFG = self.localCFG.mergeWith( fileCFG )
     return S_ERROR( "Can't load a cfg file '%s'" % fileName )
   return self.mergeWithLocal( fileCFG )
Example #7
0
 def loadFile(self, fileName):
     try:
         fileCFG = CFG()
         fileCFG.loadFromFile(fileName)
     except IOError:
         self.localCFG = self.localCFG.mergeWith(fileCFG)
         return S_ERROR("Can't load a cfg file '%s'" % fileName)
     return self.mergeWithLocal(fileCFG)
  def execute( self ):
    """ execute
    
    """

    stopAgents = self.findStopAgents()[ 'Value' ]
    if stopAgents:
      self.log.info( 'Aborting, there are stop_agents to be picked' )
      return S_OK()

    pilotVersion = self.opHelper.getValue( 'Pilot/Version', '' )
    if not pilotVersion:
      self.log.error( 'There is no pilot version on the CS' )
      return S_OK()

    pilotVersion = self.getNewestPilotVersion()
    if not pilotVersion[ 'OK' ]:
      self.log.error( pilotVersion[ 'Message' ] )
      return S_ERROR( pilotVersion[ 'Message' ] )
    pilotVersion = pilotVersion[ 'Value' ]
      
    localCFG = CFG()
    
    #load local CFG
    localCFG.loadFromFile( self.cfgToUpdate )
    releaseVersion = localCFG.getRecursive( 'LocalSite/ReleaseVersion' )[ 'value' ]
    
    self.log.info( 'PilotVersion : %s' % pilotVersion )
    self.log.info( 'ReleaseVersion : %s' % releaseVersion )
            
    if LooseVersion( pilotVersion ) > LooseVersion( releaseVersion ):
    
      self.log.info( 'UPDATING %s > %s' % ( pilotVersion, releaseVersion ) )
    
      localCFG.setOption( 'LocalSite/ReleaseVersion', pilotVersion )
      localCFG.writeToFile( self.cfgToUpdate )  
      
      self.touchStopAgents()
    
    else:
      
      self.log.info( 'Nothing to do' )
    
    return S_OK()     
def checkFunction():
  """ gets CPU normalisation from MFJ or calculate itself """
  from DIRAC.WorkloadManagementSystem.Client.CPUNormalization import getPowerFromMJF
  from ILCDIRAC.Core.Utilities.CPUNormalization import getCPUNormalization
  from DIRAC import gLogger, gConfig

  result = getCPUNormalization()

  if not result['OK']:
    gLogger.error( result['Message'] )

  norm = round( result['Value']['NORM'], 1 )

  gLogger.notice( 'Estimated CPU power is %.1f %s' % ( norm, result['Value']['UNIT'] ) )

  mjfPower = getPowerFromMJF()
  if mjfPower:
    gLogger.notice( 'CPU power from MJF is %.1f HS06' % mjfPower )
  else:
    gLogger.notice( 'MJF not available on this node' )

  if update and not configFile:
    gConfig.setOptionValue( '/LocalSite/CPUScalingFactor', mjfPower if mjfPower else norm )
    gConfig.setOptionValue( '/LocalSite/CPUNormalizationFactor', norm )

    gConfig.dumpLocalCFGToFile( gConfig.diracConfigFilePath )
  if configFile:
    from DIRAC.Core.Utilities.CFG import CFG
    cfg = CFG()
    try:
      # Attempt to open the given file
      cfg.loadFromFile( configFile )
    except:
      pass
    # Create the section if it does not exist
    if not cfg.existsKey( 'LocalSite' ):
      cfg.createNewSection( 'LocalSite' )
    cfg.setOption( '/LocalSite/CPUScalingFactor', mjfPower if mjfPower else norm )
    cfg.setOption( '/LocalSite/CPUNormalizationFactor', norm )

    cfg.writeToFile( configFile )


  DIRAC.exit()
Example #10
0
def getComputingElementDefaults(ceName='',
                                ceType='',
                                cfg=None,
                                currentSectionPath=''):
    """
  Return cfgDefaults with defaults for the given CEs defined either in arguments or in the provided cfg
  """
    cesCfg = CFG()
    if cfg:
        try:
            cesCfg.loadFromFile(cfg)
            cesPath = cfgInstallPath('ComputingElements')
            if cesCfg.isSection(cesPath):
                for section in cfgPathToList(cesPath):
                    cesCfg = cesCfg[section]
        except:
            return CFG()

    # Overwrite the cfg with Command line arguments
    if ceName:
        if not cesCfg.isSection(ceName):
            cesCfg.createNewSection(ceName)
        if currentSectionPath:
            # Add Options from Command Line
            optionsDict = __getExtraOptions(currentSectionPath)
            for name, value in optionsDict.items():
                cesCfg[ceName].setOption(name, value)  #pylint: disable=no-member
        if ceType:
            cesCfg[ceName].setOption('CEType', ceType)  #pylint: disable=no-member

    ceDefaultSection = cfgPath(defaultSection('ComputingElements'))
    # Load Default for the given type from Central configuration is defined
    ceDefaults = __gConfigDefaults(ceDefaultSection)
    for ceName in cesCfg.listSections():
        if 'CEType' in cesCfg[ceName]:
            ceType = cesCfg[ceName]['CEType']
            if ceType in ceDefaults:
                for option in ceDefaults[ceType].listOptions():
                    if option not in cesCfg[ceName]:
                        cesCfg[ceName].setOption(option,
                                                 ceDefaults[ceType][option])

    return cesCfg
Example #11
0
    def updateCompleteDiracCFG(self):
        """Read the dirac.cfg and update the Systems sections from the ConfigTemplate.cfg files."""
        compCfg = CFG()
        mainDiracCfgPath = self.config.cfg_baseFile

        if not os.path.exists(mainDiracCfgPath):
            LOG.error('Failed to find Main Dirac cfg at %r', mainDiracCfgPath)
            return 1

        LOG.info('Extracting default configuration from %r', mainDiracCfgPath)
        loadCFG = CFG()
        loadCFG.loadFromFile(mainDiracCfgPath)
        compCfg = loadCFG.mergeWith(compCfg)

        cfg = self.getSystemsCFG()
        compCfg = compCfg.mergeWith(cfg)
        diracCfgOutput = self.config.cfg_targetFile

        LOG.info('Writing output to %r', diracCfgOutput)

        with open(diracCfgOutput, 'w') as rst:
            rst.write(
                textwrap.dedent("""
                                ==========================
                                Full Configuration Example
                                ==========================

                                .. This file is created by docs/Tools/UpdateDiracCFG.py

                                Below is a complete example configuration with anotations for some sections::

                                """))
            # indent the cfg text
            cfgString = ''.join('  ' + line
                                for line in str(compCfg).splitlines(True))
            # fix the links, add back the # for targets
            # match .html with following character using positive look ahead
            htmlMatch = re.compile(r'\.html(?=[a-zA-Z0-9])')
            cfgString = re.sub(htmlMatch, '.html#', cfgString)
            rst.write(cfgString)
        return self.retVal
Example #12
0
def getComputingElementDefaults(ceName="", ceType="", cfg=None, currentSectionPath=""):
    """
  Return cfgDefaults with defaults for the given CEs defined either in arguments or in the provided cfg
  """
    cesCfg = CFG()
    if cfg:
        try:
            cesCfg.loadFromFile(cfg)
            cesPath = cfgInstallPath("ComputingElements")
            if cesCfg.isSection(cesPath):
                for section in cfgPathToList(cesPath):
                    cesCfg = cesCfg[section]
        except:
            return CFG()

    # Overwrite the cfg with Command line arguments
    if ceName:
        if not cesCfg.isSection(ceName):
            cesCfg.createNewSection(ceName)
        if currentSectionPath:
            # Add Options from Command Line
            optionsDict = __getExtraOptions(currentSectionPath)
            for name, value in optionsDict.items():
                cesCfg[ceName].setOption(name, value)
        if ceType:
            cesCfg[ceName].setOption("CEType", ceType)

    ceDefaultSection = cfgPath(defaultSection("ComputingElements"))
    # Load Default for the given type from Central configuration is defined
    ceDefaults = __gConfigDefaults(ceDefaultSection)
    for ceName in cesCfg.listSections():
        if "CEType" in cesCfg[ceName]:
            ceType = cesCfg[ceName]["CEType"]
            if ceType in ceDefaults:
                for option in ceDefaults[ceType].listOptions():
                    if option not in cesCfg[ceName]:
                        cesCfg[ceName].setOption(option, ceDefaults[ceType][option])

    return cesCfg
Example #13
0
def getComputingElementDefaults(ceName='', ceType='', cfg=None, currentSectionPath=''):
  """
  Return cfgDefaults with defaults for the given CEs defined either in arguments or in the provided cfg
  """
  cesCfg = CFG()
  if cfg:
    try:
      cesCfg.loadFromFile(cfg)
      cesPath = cfgInstallPath('ComputingElements')
      if cesCfg.isSection(cesPath):
        for section in cfgPathToList(cesPath):
          cesCfg = cesCfg[section]
    except BaseException:
      return CFG()

  # Overwrite the cfg with Command line arguments
  if ceName:
    if not cesCfg.isSection(ceName):
      cesCfg.createNewSection(ceName)
    if currentSectionPath:
      # Add Options from Command Line
      optionsDict = __getExtraOptions(currentSectionPath)
      for name, value in optionsDict.items():
        cesCfg[ceName].setOption(name, value)  # pylint: disable=no-member
    if ceType:
      cesCfg[ceName].setOption('CEType', ceType)  # pylint: disable=no-member

  ceDefaultSection = cfgPath(defaultSection('ComputingElements'))
  # Load Default for the given type from Central configuration is defined
  ceDefaults = __gConfigDefaults(ceDefaultSection)
  for ceName in cesCfg.listSections():
    if 'CEType' in cesCfg[ceName]:
      ceType = cesCfg[ceName]['CEType']
      if ceType in ceDefaults:
        for option in ceDefaults[ceType].listOptions():  # pylint: disable=no-member
          if option not in cesCfg[ceName]:
            cesCfg[ceName].setOption(option, ceDefaults[ceType][option])  # pylint: disable=unsubscriptable-object

  return cesCfg
Example #14
0
localCfg = CFG()
if cFile:
  localConfigFile = cFile
else:
  print "WORKSPACE: %s" % os.path.expandvars('$WORKSPACE')
  if os.path.isfile( os.path.expandvars('$WORKSPACE')+'/PilotInstallDIR/etc/dirac.cfg' ):
    localConfigFile = os.path.expandvars('$WORKSPACE')+'/PilotInstallDIR/etc/dirac.cfg'
  elif os.path.isfile( os.path.expandvars('$WORKSPACE')+'/ServerInstallDIR/etc/dirac.cfg' ):
    localConfigFile = os.path.expandvars('$WORKSPACE')+'/ServerInstallDIR/etc/dirac.cfg'
  elif os.path.isfile( './etc/dirac.cfg' ):
    localConfigFile = './etc/dirac.cfg'
  else:
    print "Local CFG file not found"
    exit( 2 )

localCfg.loadFromFile( localConfigFile )
if not localCfg.isSection( '/LocalSite' ):
  localCfg.createNewSection( '/LocalSite' )
localCfg.setOption( '/LocalSite/CPUTimeLeft', 5000 )
localCfg.setOption( '/DIRAC/Security/UseServerCertificate', False )

if not sMod:
  if not setup:
    setup = gConfig.getValue('/DIRAC/Setup')
    if not setup:
      setup = 'JenkinsSetup'
  if not vo:
    vo = gConfig.getValue('/DIRAC/VirtualOrganization')
    if not vo:
      vo = 'dirac'
Example #15
0
class JobRepository(object):

  def __init__(self, repository=None):
    self.location = repository
    if not self.location:
      if "HOME" in os.environ:
        self.location = '%s/.dirac.repo.rep' % os.environ['HOME']
      else:
        self.location = '%s/.dirac.repo.rep' % os.getcwd()
    self.repo = CFG()
    if os.path.exists(self.location):
      self.repo.loadFromFile(self.location)
      if not self.repo.existsKey('Jobs'):
        self.repo.createNewSection('Jobs')
    else:
      self.repo.createNewSection('Jobs')
    self.OK = True
    written = self._writeRepository(self.location)
    if not written:
      self.OK = False

  def isOK(self):
    return self.OK

  def readRepository(self):
    return S_OK(self.repo.getAsDict('Jobs'))

  def writeRepository(self, alternativePath=None):
    destination = self.location
    if alternativePath:
      destination = alternativePath
    written = self._writeRepository(destination)
    if not written:
      return S_ERROR("Failed to write repository")
    return S_OK(destination)

  def resetRepository(self, jobIDs=[]):
    if not jobIDs:
      jobs = self.readRepository()['Value']
      jobIDs = jobs.keys()
    paramDict = {'State': 'Submitted',
                 'Retrieved': 0,
                 'OutputData': 0}
    for jobID in jobIDs:
      self._writeJob(jobID, paramDict, True)
    self._writeRepository(self.location)
    return S_OK()

  def _writeRepository(self, path):
    handle, tmpName = tempfile.mkstemp()
    written = self.repo.writeToFile(tmpName)
    os.close(handle)
    if not written:
      if os.path.exists(tmpName):
        os.remove(tmpName)
      return written
    if os.path.exists(path):
      gLogger.debug("Replacing %s" % path)
    try:
      shutil.move(tmpName, path)
      return True
    except Exception as x:
      gLogger.error("Failed to overwrite repository.", x)
      gLogger.info("If your repository is corrupted a backup can be found %s" % tmpName)
      return False

  def appendToRepository(self, repoLocation):
    if not os.path.exists(repoLocation):
      gLogger.error("Secondary repository does not exist", repoLocation)
      return S_ERROR("Secondary repository does not exist")
    self.repo = CFG().loadFromFile(repoLocation).mergeWith(self.repo)
    self._writeRepository(self.location)
    return S_OK()

  def addJob(self, jobID, state='Submitted', retrieved=0, outputData=0, update=False):
    paramDict = {'State': state,
                 'Time': self._getTime(),
                 'Retrieved': int(retrieved),
                 'OutputData': outputData}
    self._writeJob(jobID, paramDict, update)
    self._writeRepository(self.location)
    return S_OK(jobID)

  def updateJob(self, jobID, paramDict):
    if self._existsJob(jobID):
      paramDict['Time'] = self._getTime()
      self._writeJob(jobID, paramDict, True)
      self._writeRepository(self.location)
    return S_OK()

  def updateJobs(self, jobDict):
    for jobID, paramDict in jobDict.items():
      if self._existsJob(jobID):
        paramDict['Time'] = self._getTime()
        self._writeJob(jobID, paramDict, True)
    self._writeRepository(self.location)
    return S_OK()

  def _getTime(self):
    runtime = time.ctime()
    return runtime.replace(" ", "_")

  def _writeJob(self, jobID, paramDict, update):
    jobID = str(jobID)
    jobExists = self._existsJob(jobID)
    if jobExists and (not update):
      gLogger.warn("Job exists and not overwriting")
      return S_ERROR("Job exists and not overwriting")
    if not jobExists:
      self.repo.createNewSection('Jobs/%s' % jobID)
    for key, value in paramDict.items():
      self.repo.setOption('Jobs/%s/%s' % (jobID, key), value)
    return S_OK()

  def removeJob(self, jobID):
    res = self.repo['Jobs'].deleteKey(str(jobID))  # pylint: disable=no-member
    if res:
      self._writeRepository(self.location)
    return S_OK()

  def existsJob(self, jobID):
    return S_OK(self._existsJob(jobID))

  def _existsJob(self, jobID):
    return self.repo.isSection('Jobs/%s' % jobID)

  def getLocation(self):
    return S_OK(self.location)

  def getSize(self):
    return S_OK(len(self.repo.getAsDict('Jobs')))
Example #16
0
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            # Temporary mechanism to pass a shutdown message to the agent
            if os.path.exists('/var/lib/dirac_drain'):
                return self.__finish('Node is being drained by an operator')
            # Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(
                        "Disabling filling mode as errors calculating time left",
                        self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('normalized CPU units remaining in slot',
                              self.timeLeft)
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish('No more time left')
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join('.', self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc",
                                                   "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection('/LocalSite'):
                    localCfg.createNewSection('/LocalSite')
                localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        result = self.computingElement.available()
        if not result['OK']:
            self.log.info('Resource is not available', result['Message'])
            return self.__finish('CE Not Available')

        ceInfoDict = result['CEInfoDict']
        runningJobs = ceInfoDict.get("RunningJobs")
        availableSlots = result['Value']

        if not availableSlots:
            if runningJobs:
                self.log.info('No available slots',
                              '%d running jobs' % runningJobs)
                return S_OK('Job Agent cycle complete with %d running jobs' %
                            runningJobs)
            else:
                self.log.info('CE is not available')
                return self.__finish('CE Not Available')

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result

        # We can have several prioritized job retrieval strategies
        if isinstance(result['Value'], dict):
            ceDictList = [result['Value']]
        elif isinstance(result['Value'], list):
            # This is the case for Pool ComputingElement, and parameter 'MultiProcessorStrategy'
            ceDictList = result['Value']

        for ceDict in ceDictList:

            # Add pilot information
            gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
            if gridCE != 'Unknown':
                ceDict['GridCE'] = gridCE
            if 'PilotReference' not in ceDict:
                ceDict['PilotReference'] = str(self.pilotReference)
            ceDict['PilotBenchmark'] = self.cpuFactor
            ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

            # Add possible job requirements
            result = gConfig.getOptionsDict('/AgentJobRequirements')
            if result['OK']:
                requirementsDict = result['Value']
                ceDict.update(requirementsDict)
                self.log.info('Requirements:', requirementsDict)

            self.log.verbose('CE dict', ceDict)

            # here finally calling the matcher
            start = time.time()
            jobRequest = MatcherClient().requestJob(ceDict)
            matchTime = time.time() - start
            self.log.info('MatcherTime', '= %.2f (s)' % (matchTime))
            if jobRequest['OK']:
                break

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:
            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK, but no match found',
                                ': %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error('Timeout while requesting job',
                               jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                errorMsg = 'Pilot version does not match the production version'
                self.log.error(errorMsg,
                               jobRequest['Message'].replace(errorMsg, ''))
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs',
                                ': %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get(
                'PilotInfoReportedFlag', False)
        jobID = matcherInfo['JobID']
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if param not in matcherInfo:
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned',
                                 '%s = %s ' % (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self._getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn('Could Not Extract JDL Parameters',
                          parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if 'JobID' not in params:
            msg = 'Job has not JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if 'JobType' not in params:
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if 'CPUTime' not in params:
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        # Job requirements for determining the number of processors
        # the minimum number of processors requested
        processors = int(
            params.get('NumberOfProcessors',
                       int(params.get('MinNumberOfProcessors', 1))))
        # the maximum number of processors allowed to the payload
        maxNumberOfProcessors = int(params.get('MaxNumberOfProcessors', 0))
        # need or not the whole node for the job
        wholeNode = 'WholeNode' in params
        mpTag = 'MultiProcessor' in params.get('Tags', [])

        if self.extraOptions:
            params['Arguments'] += ' ' + self.extraOptions
            params['ExtraOptions'] = self.extraOptions

        self.log.verbose('Job request successful: \n', jobRequest['Value'])
        self.log.info(
            'Received', 'JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s' %
            (jobID, jobType, ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)

            if 'BOINC_JOB_ID' in os.environ:
                # Report BOINC environment
                for thisp in ('BoincUserID', 'BoincHostID',
                              'BoincHostPlatform', 'BoincHostName'):
                    jobReport.setJobParameter(thisp,
                                              gConfig.getValue(
                                                  '/LocalSite/%s' % thisp,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            result = self._setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self._rescheduleFailedJob(jobID, result['Message'],
                                                 self.stopOnApplicationFailure)
            proxyChain = result.get('Value')

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self._checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job',
                               '%s' % (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self._rescheduleFailedJob(jobID, errorMsg,
                                                 self.stopOnApplicationFailure)

            self.log.debug('Before self._submitJob() (%sCE)' % (self.ceName))
            result = self._submitJob(jobID, params, ceDict, optimizerParams,
                                     proxyChain, processors, wholeNode,
                                     maxNumberOfProcessors, mpTag)
            if not result['OK']:
                self.__report(jobID, 'Failed', result['Message'])
                return self.__finish(result['Message'])
            elif 'PayloadFailed' in result:
                # Do not keep running and do not overwrite the Payload error
                message = 'Payload execution failed with error code %s' % result[
                    'PayloadFailed']
                if self.stopOnApplicationFailure:
                    return self.__finish(message,
                                         self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug('After %sCE submitJob()' % (self.ceName))
        except Exception as subExcept:  # pylint: disable=broad-except
            self.log.exception("Exception in submission",
                               "",
                               lException=subExcept,
                               lExcInfo=True)
            return self._rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
        cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

        result = self.timeLeftUtil.getTimeLeft(cpuTime, processors)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            if result['Message'] != 'Current batch system is not supported':
                self.timeLeftError = result['Message']
            else:
                # if the batch system is not defined, use the process time and the CPU normalization defined locally
                self.timeLeft = self._getCPUTimeLeft()

        return S_OK('Job Agent cycle complete')
  mjfPower = getPowerFromMJF()
  if mjfPower:
    gLogger.notice( 'CPU power from MJF is %.1f HS06' % mjfPower )
  else:
    gLogger.notice( 'MJF not available on this node' )

  if update and not configFile:
    gConfig.setOptionValue( '/LocalSite/CPUScalingFactor', mjfPower if mjfPower else norm )
    gConfig.setOptionValue( '/LocalSite/CPUNormalizationFactor', norm )

    gConfig.dumpLocalCFGToFile( gConfig.diracConfigFilePath )
  if configFile:
    from DIRAC.Core.Utilities.CFG import CFG
    cfg = CFG()
    try:
      # Attempt to open the given file
      cfg.loadFromFile( configFile )
    except:
      pass
    # Create the section if it does not exist
    if not cfg.existsKey( 'LocalSite' ):
      cfg.createNewSection( 'LocalSite' )
    cfg.setOption( '/LocalSite/CPUScalingFactor', mjfPower if mjfPower else norm )
    cfg.setOption( '/LocalSite/CPUNormalizationFactor', norm )

    cfg.writeToFile( configFile )


  DIRAC.exit()
Example #18
0
            os.path.expandvars('$WORKSPACE') +
            '/PilotInstallDIR/etc/dirac.cfg'):
        localConfigFile = os.path.expandvars(
            '$WORKSPACE') + '/PilotInstallDIR/etc/dirac.cfg'
    elif os.path.isfile(
            os.path.expandvars('$WORKSPACE') +
            '/ServerInstallDIR/etc/dirac.cfg'):
        localConfigFile = os.path.expandvars(
            '$WORKSPACE') + '/ServerInstallDIR/etc/dirac.cfg'
    elif os.path.isfile('./etc/dirac.cfg'):
        localConfigFile = './etc/dirac.cfg'
    else:
        print "Local CFG file not found"
        exit(2)

localCfg.loadFromFile(localConfigFile)
if not localCfg.isSection('/LocalSite'):
    localCfg.createNewSection('/LocalSite')
localCfg.setOption('/LocalSite/CPUTimeLeft', 5000)
localCfg.setOption('/DIRAC/Security/UseServerCertificate', False)

if not sMod:
    if not setup:
        setup = gConfig.getValue('/DIRAC/Setup')
        if not setup:
            setup = 'dirac-JenkinsSetup'
    if not vo:
        vo = gConfig.getValue('/DIRAC/VirtualOrganization')
        if not vo:
            vo = 'dirac'
Example #19
0
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('%s normalized CPU units remaining in slot' %
                              (self.timeLeft))
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish('No more time left')
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join('.', self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc",
                                                   "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection('/LocalSite'):
                    localCfg.createNewSection('/LocalSite')
                localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        available = self.computingElement.available()
        if not available['OK'] or not available['Value']:
            self.log.info('Resource is not available')
            self.log.info(available['Message'])
            return self.__finish('CE Not Available')

        self.log.info(available['Message'])

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result
        ceDict = result['Value']

        # Add pilot information
        gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
        if gridCE != 'Unknown':
            ceDict['GridCE'] = gridCE
        if not 'PilotReference' in ceDict:
            ceDict['PilotReference'] = str(self.pilotReference)
        ceDict['PilotBenchmark'] = self.cpuFactor
        ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict('/AgentJobRequirements')
        if result['OK']:
            requirementsDict = result['Value']
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info('MatcherTime = %.2f (s)' % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:
            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK: %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error('Timeout while requesting job',
                               jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                errorMsg = 'Pilot version does not match the production version'
                self.log.error(errorMsg,
                               jobRequest['Message'].replace(errorMsg, ''))
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs: %s' %
                                (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get(
                'PilotInfoReportedFlag', False)
        jobID = matcherInfo['JobID']
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if param not in matcherInfo:
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned %s = %s ' %
                                 (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn(parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if 'JobID' not in params:
            msg = 'Job has not JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if 'JobType' not in params:
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if 'CPUTime' not in params:
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        if self.extraOptions:
            params['Arguments'] += ' ' + self.extraOptions
            params['ExtraOptions'] = self.extraOptions

        self.log.verbose('Job request successful: \n', jobRequest['Value'])
        self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
        self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)

            if 'BOINC_JOB_ID' in os.environ:
                # Report BOINC environment
                for p in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform',
                          'BoincHostName'):
                    jobReport.setJobParameter(p,
                                              gConfig.getValue(
                                                  '/LocalSite/%s' % p,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self.__rescheduleFailedJob(
                    jobID, result['Message'], self.stopOnApplicationFailure)
            proxyChain = result.get('Value')

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job',
                               '%s' % (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self.__rescheduleFailedJob(
                    jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.debug('Before %sCE submitJob()' % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict,
                                          optimizerParams, proxyChain)
            if not submission['OK']:
                self.__report(jobID, 'Failed', submission['Message'])
                return self.__finish(submission['Message'])
            elif 'PayloadFailed' in submission:
                # Do not keep running and do not overwrite the Payload error
                message = 'Payload execution failed with error code %s' % submission[
                    'PayloadFailed']
                if self.stopOnApplicationFailure:
                    return self.__finish(message,
                                         self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug('After %sCE submitJob()' % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
        cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            if result['Message'] != 'Current batch system is not supported':
                self.timeLeftError = result['Message']
            else:
                # if the batch system is not defined, use the process time and the CPU normalization defined locally
                self.timeLeft = self.__getCPUTimeLeft()

        scaledCPUTime = self.timeLeftUtil.getScaledCPU()
        self.__setJobParam(jobID, 'ScaledCPUTime',
                           str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK('Job Agent cycle complete')
Example #20
0
  def execute(self):
    """The JobAgent execution method.
    """
    if self.jobCount:
      # Temporary mechanism to pass a shutdown message to the agent
      if os.path.exists('/var/lib/dirac_drain'):
        return self.__finish('Node is being drained by an operator')
      # Only call timeLeft utility after a job has been picked up
      self.log.info('Attempting to check CPU time left for filling mode')
      if self.fillingMode:
        if self.timeLeftError:
          self.log.warn(self.timeLeftError)
          return self.__finish(self.timeLeftError)
        self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft))
        if self.timeLeft <= self.minimumTimeLeft:
          return self.__finish('No more time left')
        # Need to update the Configuration so that the new value is published in the next matching request
        result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
        if not result['OK']:
          return self.__finish(result['Message'])

        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
          localConfigFile = os.path.join('.', self.extraOptions)
        else:
          localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
        localCfg.loadFromFile(localConfigFile)
        if not localCfg.isSection('/LocalSite'):
          localCfg.createNewSection('/LocalSite')
        localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
        localCfg.writeToFile(localConfigFile)

      else:
        return self.__finish('Filling Mode is Disabled')

    self.log.verbose('Job Agent execution loop')
    result = self.computingElement.available()
    if not result['OK']:
      self.log.info('Resource is not available')
      self.log.info(result['Message'])
      return self.__finish('CE Not Available')

    self.log.info(result['Message'])

    ceInfoDict = result['CEInfoDict']
    runningJobs = ceInfoDict.get("RunningJobs")
    availableSlots = result['Value']

    if not availableSlots:
      if runningJobs:
        self.log.info('No available slots with %d running jobs' % runningJobs)
        return S_OK('Job Agent cycle complete with %d running jobs' % runningJobs)
      else:
        self.log.info('CE is not available')
        return self.__finish('CE Not Available')

    result = self.computingElement.getDescription()
    if not result['OK']:
      return result
    ceDict = result['Value']
    # Add pilot information
    gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
    if gridCE != 'Unknown':
      ceDict['GridCE'] = gridCE
    if 'PilotReference' not in ceDict:
      ceDict['PilotReference'] = str(self.pilotReference)
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict('/AgentJobRequirements')
    if result['OK']:
      requirementsDict = result['Value']
      ceDict.update(requirementsDict)
      self.log.info('Requirements:', requirementsDict)

    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = MatcherClient().requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info('MatcherTime = %.2f (s)' % (matchTime))

    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', self.stopAfterFailedMatches)

    if not jobRequest['OK']:
      if re.search('No match found', jobRequest['Message']):
        self.log.notice('Job request OK: %s' % (jobRequest['Message']))
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(jobRequest['Message'])
      elif jobRequest['Message'].find("seconds timeout") != -1:
        self.log.error('Timeout while requesting job', jobRequest['Message'])
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(jobRequest['Message'])
      elif jobRequest['Message'].find("Pilot version does not match") != -1:
        errorMsg = 'Pilot version does not match the production version'
        self.log.error(errorMsg, jobRequest['Message'].replace(errorMsg, ''))
        return S_ERROR(jobRequest['Message'])
      else:
        self.log.notice('Failed to get jobs: %s' % (jobRequest['Message']))
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(jobRequest['Message'])

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    if not self.pilotInfoReportedFlag:
      # Check the flag after the first access to the Matcher
      self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    jobID = matcherInfo['JobID']
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
      if param not in matcherInfo:
        self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
        return self.__finish('Matcher Failed')
      elif not matcherInfo[param]:
        self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
        return self.__finish('Matcher Failed')
      else:
        self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    optimizerParams = {}
    for key in matcherInfo:
      if key not in matcherParams:
        optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters(jobJDL)
    if not parameters['OK']:
      self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
      self.log.warn(parameters['Message'])
      return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
      msg = 'Job has not JobID defined in JDL parameters'
      self.__report(jobID, 'Failed', msg)
      self.log.warn(msg)
      return self.__finish('JDL Problem')
    else:
      jobID = params['JobID']

    if 'JobType' not in params:
      self.log.warn('Job has no JobType defined in JDL parameters')
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if 'CPUTime' not in params:
      self.log.warn('Job has no CPU requirement defined in JDL parameters')

    # Job requirement for a number of processors
    processors = int(params.get('NumberOfProcessors', 1))
    wholeNode = 'WholeNode' in params

    if self.extraOptions:
      params['Arguments'] += ' ' + self.extraOptions
      params['ExtraOptions'] = self.extraOptions

    self.log.verbose('Job request successful: \n', jobRequest['Value'])
    self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
    self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
      jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
      jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)

      if 'BOINC_JOB_ID' in os.environ:
        # Report BOINC environment
        for thisp in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'):
          jobReport.setJobParameter(thisp, gConfig.getValue('/LocalSite/%s' % thisp, 'Unknown'), sendFlag=False)

      jobReport.setJobStatus('Matched', 'Job Received by Agent')
      result = self.__setupProxy(ownerDN, jobGroup)
      if not result['OK']:
        return self.__rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure)
      proxyChain = result.get('Value')

      # Save the job jdl for external monitoring
      self.__saveJobJDLRequest(jobID, jobJDL)

      software = self.__checkInstallSoftware(jobID, params, ceDict)
      if not software['OK']:
        self.log.error('Failed to install software for job', '%s' % (jobID))
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

      self.log.debug('Before %sCE submitJob()' % (self.ceName))
      result = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain, processors, wholeNode)
      if not result['OK']:
        self.__report(jobID, 'Failed', result['Message'])
        return self.__finish(result['Message'])
      elif 'PayloadFailed' in result:
        # Do not keep running and do not overwrite the Payload error
        message = 'Payload execution failed with error code %s' % result['PayloadFailed']
        if self.stopOnApplicationFailure:
          return self.__finish(message, self.stopOnApplicationFailure)
        else:
          self.log.info(message)

      self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception as subExcept:  # pylint: disable=broad-except
      self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True)
      return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception', self.stopOnApplicationFailure)

    # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
    cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

    result = self.timeLeftUtil.getTimeLeft(cpuTime, processors)
    if result['OK']:
      self.timeLeft = result['Value']
    else:
      if result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
      else:
        # if the batch system is not defined, use the process time and the CPU normalization defined locally
        self.timeLeft = self.__getCPUTimeLeft()

    return S_OK('Job Agent cycle complete')
Example #21
0
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            #Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('%s normalized CPU units remaining in slot' %
                              (self.timeLeft))
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection('/LocalSite'):
                    localCfg.createNewSection('/LocalSite')
                localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        available = self.computingElement.available()
        if not available['OK'] or not available['Value']:
            self.log.info('Resource is not available')
            self.log.info(available['Message'])
            return self.__finish('CE Not Available')

        self.log.info(available['Message'])

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result
        ceDict = result['Value']

        # Add pilot information
        gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
        if gridCE != 'Unknown':
            ceDict['GridCE'] = gridCE
        if not 'PilotReference' in ceDict:
            ceDict['PilotReference'] = str(self.pilotReference)
        ceDict['PilotBenchmark'] = self.cpuFactor
        ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict('/AgentJobRequirements')
        if result['OK']:
            requirementsDict = result['Value']
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info('MatcherTime = %.2f (s)' % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:
            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK: %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error(jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                self.log.error(jobRequest['Message'])
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs: %s' %
                                (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        jobID = matcherInfo['JobID']
        self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag',
                                                     False)
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if not matcherInfo.has_key(param):
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned %s = %s ' %
                                 (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        optimizerParams = {}
        for key in matcherInfo.keys():
            if not key in matcherParams:
                value = matcherInfo[key]
                optimizerParams[key] = value

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn(parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if not params.has_key('JobID'):
            msg = 'Job has not JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if not params.has_key('JobType'):
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if not params.has_key('SystemConfig'):
            self.log.warn(
                'Job has no system configuration defined in JDL parameters')
            systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
            self.log.info(
                'Setting system config to /LocalSite/Architecture = %s since it was not specified'
                % systemConfig)
            if not systemConfig:
                self.log.warn('/LocalSite/Architecture is not defined')
            params['SystemConfig'] = systemConfig
        else:
            systemConfig = params['SystemConfig']
            if systemConfig.lower() == 'any':
                systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
                self.log.info(
                    'Setting SystemConfig = /LocalSite/Architecture =',
                    '"%s" since it was set to "ANY" in the job description' %
                    systemConfig)
                if not systemConfig:
                    self.log.warn('/LocalSite/Architecture is not defined')
                params['SystemConfig'] = systemConfig

        if not params.has_key('CPUTime'):
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        self.log.verbose('Job request successful: \n %s' %
                         (jobRequest['Value']))
        self.log.info('Received JobID=%s, JobType=%s, SystemConfig=%s' %
                      (jobID, jobType, systemConfig))
        self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)
            if self.gridCEQueue:
                jobReport.setJobParameter('GridCEQueue',
                                          self.gridCEQueue,
                                          sendFlag=False)

            if os.environ.has_key('BOINC_JOB_ID'):
                # Report BOINC environment
                for p in [
                        'BoincUserID', 'BoincHostID', 'BoincHostPlatform',
                        'BoincHostName'
                ]:
                    jobReport.setJobParameter(p,
                                              gConfig.getValue(
                                                  '/LocalSite/%s' % p,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            # self.__setJobSite( jobID, self.siteName )
            if not self.pilotInfoReportedFlag:
                self.__reportPilotInfo(jobID)
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self.__rescheduleFailedJob(
                    jobID, result['Message'], self.stopOnApplicationFailure)
            if 'Value' in result and result['Value']:
                proxyChain = result['Value']

            # Is this necessary at all?
            saveJDL = self.__saveJobJDLRequest(jobID, jobJDL)
            #self.__report(jobID,'Matched','Job Prepared to Submit')

            #resourceParameters = self.__getJDLParameters( resourceJDL )
            #if not resourceParameters['OK']:
            #  return resourceParameters
            #resourceParams = resourceParameters['Value']

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job %s' %
                               (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self.__rescheduleFailedJob(
                    jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.verbose('Before %sCE submitJob()' % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict,
                                          optimizerParams, jobJDL, proxyChain)
            if not submission['OK']:
                self.__report(jobID, 'Failed', submission['Message'])
                return self.__finish(submission['Message'])
            elif 'PayloadFailed' in submission:
                # Do not keep running and do not overwrite the Payload error
                return self.__finish(
                    'Payload execution failed with error code %s' %
                    submission['PayloadFailed'], self.stopOnApplicationFailure)

            self.log.verbose('After %sCE submitJob()' % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        currentTimes = list(os.times())
        for i in range(len(currentTimes)):
            currentTimes[i] -= self.initTimes[i]

        utime, stime, cutime, cstime, elapsed = currentTimes
        cpuTime = utime + stime + cutime + cstime

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            if result['Message'] != 'Current batch system is not supported':
                self.timeLeftError = result['Message']
            else:
                if self.cpuFactor:
                    # if the batch system is not defined used the CPUNormalizationFactor
                    # defined locally
                    self.timeLeft = self.__getCPUTimeLeft()
        scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

        self.__setJobParam(jobID, 'ScaledCPUTime',
                           str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK('Job Agent cycle complete')
Example #22
0
# $HeadURL$
__RCSID__ = "$Id$"

from dirac import DIRAC
from DIRAC.Core.Utilities.CFG import CFG

DIRAC.gLogger.initialize('test_gConfig', '/testSectionDebug')

testconfig = '%s/DIRAC/ConfigurationSystem/test/test.cfg' % DIRAC.rootPath
dumpconfig = '%s/DIRAC/ConfigurationSystem/test/dump.cfg' % DIRAC.rootPath

cfg1 = CFG()
cfg1.loadFromFile(testconfig)

fd = file(testconfig)
cfg1String = fd.read()
fd.close()

cfg2 = CFG()
cfg2.loadFromBuffer(cfg1.serialize())

cfg3 = cfg1.mergeWith(cfg2)

testList = [{
    'method': DIRAC.gConfig.loadFile,
    'arguments': (testconfig, ),
    'output': {
        'OK': True,
        'Value': ''
    }
}, {
Example #23
0
  def execute( self ):
    """The JobAgent execution method.
    """
    if self.jobCount:
      #Only call timeLeft utility after a job has been picked up
      self.log.info( 'Attempting to check CPU time left for filling mode' )
      if self.fillingMode:
        if self.timeLeftError:
          self.log.warn( self.timeLeftError )
          return self.__finish( self.timeLeftError )
        self.log.info( '%s normalized CPU units remaining in slot' % ( self.timeLeft ) )
        # Need to update the Configuration so that the new value is published in the next matching request
        result = self.computingElement.setCPUTimeLeft( cpuTimeLeft = self.timeLeft )
        if not result['OK']:
          return self.__finish( result['Message'] )
        
        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
          localConfigFile = os.path.join( '.', self.extraOptions )
        else:
          localConfigFile = os.path.join( rootPath, "etc", "dirac.cfg" )
        localCfg.loadFromFile( localConfigFile )
        if not localCfg.isSection('/LocalSite'):
          localCfg.createNewSection('/LocalSite')
        localCfg.setOption( '/LocalSite/CPUTimeLeft', self.timeLeft )
        localCfg.writeToFile( localConfigFile )
        
      else:
        return self.__finish( 'Filling Mode is Disabled' )

    self.log.verbose( 'Job Agent execution loop' )
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
      self.log.info( 'Resource is not available' )
      self.log.info( available['Message'] )
      return self.__finish( 'CE Not Available' )

    self.log.info( available['Message'] )

    result = self.computingElement.getDescription()
    if not result['OK']:
      return result
    ceDict = result['Value']

    # Add pilot information
    gridCE = gConfig.getValue( 'LocalSite/GridCE', 'Unknown' )
    if gridCE != 'Unknown':
      ceDict['GridCE'] = gridCE
    if not 'PilotReference' in ceDict:
      ceDict['PilotReference'] = str( self.pilotReference )
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict( '/AgentJobRequirements' )
    if result['OK']:
      requirementsDict = result['Value']
      ceDict.update( requirementsDict )

    self.log.verbose( ceDict )
    start = time.time()
    jobRequest = self.__requestJob( ceDict )
    matchTime = time.time() - start
    self.log.info( 'MatcherTime = %.2f (s)' % ( matchTime ) )

    self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches )

    if not jobRequest['OK']:
      if re.search( 'No match found', jobRequest['Message'] ):
        self.log.notice( 'Job request OK: %s' % ( jobRequest['Message'] ) )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "seconds timeout" ) != -1:
        self.log.error( jobRequest['Message'] )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "Pilot version does not match" ) != -1 :
        self.log.error( jobRequest['Message'] )
        return S_ERROR( jobRequest['Message'] )
      else:
        self.log.notice( 'Failed to get jobs: %s' % ( jobRequest['Message'] ) )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    jobID = matcherInfo['JobID']
    if not self.pilotInfoReportedFlag:
      # Check the flag after the first access to the Matcher
      self.pilotInfoReportedFlag = matcherInfo.get( 'PilotInfoReportedFlag', False )
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
      if not matcherInfo.has_key( param ):
        self.__report( jobID, 'Failed', 'Matcher did not return %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      elif not matcherInfo[param]:
        self.__report( jobID, 'Failed', 'Matcher returned null %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      else:
        self.log.verbose( 'Matcher returned %s = %s ' % ( param, matcherInfo[param] ) )

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    optimizerParams = {}
    for key in matcherInfo.keys():
      if not key in matcherParams:
        value = matcherInfo[key]
        optimizerParams[key] = value

    parameters = self.__getJDLParameters( jobJDL )
    if not parameters['OK']:
      self.__report( jobID, 'Failed', 'Could Not Extract JDL Parameters' )
      self.log.warn( parameters['Message'] )
      return self.__finish( 'JDL Problem' )

    params = parameters['Value']
    if not params.has_key( 'JobID' ):
      msg = 'Job has not JobID defined in JDL parameters'
      self.__report( jobID, 'Failed', msg )
      self.log.warn( msg )
      return self.__finish( 'JDL Problem' )
    else:
      jobID = params['JobID']

    if not params.has_key( 'JobType' ):
      self.log.warn( 'Job has no JobType defined in JDL parameters' )
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if not params.has_key( 'CPUTime' ):
      self.log.warn( 'Job has no CPU requirement defined in JDL parameters' )

    if self.extraOptions:
      params['Arguments'] = params['Arguments'] + ' ' + self.extraOptions
      params['ExtraOptions'] = self.extraOptions

    self.log.verbose( 'Job request successful: \n %s' % ( jobRequest['Value'] ) )
    self.log.info( 'Received JobID=%s, JobType=%s' % ( jobID, jobType ) )
    self.log.info( 'OwnerDN: %s JobGroup: %s' % ( ownerDN, jobGroup ) )
    self.jobCount += 1
    try:
      jobReport = JobReport( jobID, 'JobAgent@%s' % self.siteName )
      jobReport.setJobParameter( 'MatcherServiceTime', str( matchTime ), sendFlag = False )

      if os.environ.has_key( 'BOINC_JOB_ID' ):
        # Report BOINC environment 
        for p in ['BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName']:
          jobReport.setJobParameter( p, gConfig.getValue( '/LocalSite/%s' % p, 'Unknown' ), sendFlag = False )

      jobReport.setJobStatus( 'Matched', 'Job Received by Agent' )
      result = self.__setupProxy( ownerDN, jobGroup )
      if not result[ 'OK' ]:
        return self.__rescheduleFailedJob( jobID, result[ 'Message' ], self.stopOnApplicationFailure )
      if 'Value' in result and result[ 'Value' ]:
        proxyChain = result[ 'Value' ]

      # Save the job jdl for external monitoring
      self.__saveJobJDLRequest( jobID, jobJDL )

      software = self.__checkInstallSoftware( jobID, params, ceDict )
      if not software['OK']:
        self.log.error( 'Failed to install software for job %s' % ( jobID ) )
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self.__rescheduleFailedJob( jobID, errorMsg, self.stopOnApplicationFailure )

      self.log.debug( 'Before %sCE submitJob()' % ( self.ceName ) )
      submission = self.__submitJob( jobID, params, ceDict, optimizerParams, proxyChain )
      if not submission['OK']:
        self.__report( jobID, 'Failed', submission['Message'] )
        return self.__finish( submission['Message'] )
      elif 'PayloadFailed' in submission:
        # Do not keep running and do not overwrite the Payload error
        return self.__finish( 'Payload execution failed with error code %s' % submission['PayloadFailed'],
                              self.stopOnApplicationFailure )

      self.log.debug( 'After %sCE submitJob()' % ( self.ceName ) )
    except Exception:
      self.log.exception()
      return self.__rescheduleFailedJob( jobID , 'Job processing failed with exception', self.stopOnApplicationFailure )

    currentTimes = list( os.times() )
    for i in range( len( currentTimes ) ):
      currentTimes[i] -= self.initTimes[i]

    utime, stime, cutime, cstime, _elapsed = currentTimes
    cpuTime = utime + stime + cutime + cstime

    result = self.timeLeftUtil.getTimeLeft( cpuTime )
    if result['OK']:
      self.timeLeft = result['Value']
    else:
      if result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
      else:
        if self.cpuFactor:
          # if the batch system is not defined used the CPUNormalizationFactor 
          # defined locally
          self.timeLeft = self.__getCPUTimeLeft()
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

    self.__setJobParam( jobID, 'ScaledCPUTime', str( scaledCPUTime - self.scaledCPUTime ) )
    self.scaledCPUTime = scaledCPUTime

    return S_OK( 'Job Agent cycle complete' )
Example #24
0
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info("Attempting to check CPU time left for filling mode")
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info("%s normalized CPU units remaining in slot" % (self.timeLeft))
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish("No more time left")
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
                if not result["OK"]:
                    return self.__finish(result["Message"])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join(".", self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection("/LocalSite"):
                    localCfg.createNewSection("/LocalSite")
                localCfg.setOption("/LocalSite/CPUTimeLeft", self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish("Filling Mode is Disabled")

        self.log.verbose("Job Agent execution loop")
        available = self.computingElement.available()
        if not available["OK"] or not available["Value"]:
            self.log.info("Resource is not available")
            self.log.info(available["Message"])
            return self.__finish("CE Not Available")

        self.log.info(available["Message"])

        result = self.computingElement.getDescription()
        if not result["OK"]:
            return result
        ceDict = result["Value"]

        # Add pilot information
        gridCE = gConfig.getValue("LocalSite/GridCE", "Unknown")
        if gridCE != "Unknown":
            ceDict["GridCE"] = gridCE
        if not "PilotReference" in ceDict:
            ceDict["PilotReference"] = str(self.pilotReference)
        ceDict["PilotBenchmark"] = self.cpuFactor
        ceDict["PilotInfoReportedFlag"] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict("/AgentJobRequirements")
        if result["OK"]:
            requirementsDict = result["Value"]
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info("MatcherTime = %.2f (s)" % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", self.stopAfterFailedMatches)

        if not jobRequest["OK"]:
            if re.search("No match found", jobRequest["Message"]):
                self.log.notice("Job request OK: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("seconds timeout") != -1:
                self.log.error("Timeout while requesting job", jobRequest["Message"])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("Pilot version does not match") != -1:
                errorMsg = "Pilot version does not match the production version"
                self.log.error(errorMsg, jobRequest["Message"].replace(errorMsg, ""))
                return S_ERROR(jobRequest["Message"])
            else:
                self.log.notice("Failed to get jobs: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest["Value"]
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get("PilotInfoReportedFlag", False)
        jobID = matcherInfo["JobID"]
        matcherParams = ["JDL", "DN", "Group"]
        for param in matcherParams:
            if param not in matcherInfo:
                self.__report(jobID, "Failed", "Matcher did not return %s" % (param))
                return self.__finish("Matcher Failed")
            elif not matcherInfo[param]:
                self.__report(jobID, "Failed", "Matcher returned null %s" % (param))
                return self.__finish("Matcher Failed")
            else:
                self.log.verbose("Matcher returned %s = %s " % (param, matcherInfo[param]))

        jobJDL = matcherInfo["JDL"]
        jobGroup = matcherInfo["Group"]
        ownerDN = matcherInfo["DN"]

        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters["OK"]:
            self.__report(jobID, "Failed", "Could Not Extract JDL Parameters")
            self.log.warn(parameters["Message"])
            return self.__finish("JDL Problem")

        params = parameters["Value"]
        if "JobID" not in params:
            msg = "Job has not JobID defined in JDL parameters"
            self.__report(jobID, "Failed", msg)
            self.log.warn(msg)
            return self.__finish("JDL Problem")
        else:
            jobID = params["JobID"]

        if "JobType" not in params:
            self.log.warn("Job has no JobType defined in JDL parameters")
            jobType = "Unknown"
        else:
            jobType = params["JobType"]

        if "CPUTime" not in params:
            self.log.warn("Job has no CPU requirement defined in JDL parameters")

        if self.extraOptions:
            params["Arguments"] += " " + self.extraOptions
            params["ExtraOptions"] = self.extraOptions

        self.log.verbose("Job request successful: \n", jobRequest["Value"])
        self.log.info("Received JobID=%s, JobType=%s" % (jobID, jobType))
        self.log.info("OwnerDN: %s JobGroup: %s" % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
            jobReport.setJobParameter("MatcherServiceTime", str(matchTime), sendFlag=False)

            if "BOINC_JOB_ID" in os.environ:
                # Report BOINC environment
                for p in ("BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"):
                    jobReport.setJobParameter(p, gConfig.getValue("/LocalSite/%s" % p, "Unknown"), sendFlag=False)

            jobReport.setJobStatus("Matched", "Job Received by Agent")
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result["OK"]:
                return self.__rescheduleFailedJob(jobID, result["Message"], self.stopOnApplicationFailure)
            proxyChain = result.get("Value")

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software["OK"]:
                self.log.error("Failed to install software for job", "%s" % (jobID))
                errorMsg = software["Message"]
                if not errorMsg:
                    errorMsg = "Failed software installation"
                return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.debug("Before %sCE submitJob()" % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain)
            if not submission["OK"]:
                self.__report(jobID, "Failed", submission["Message"])
                return self.__finish(submission["Message"])
            elif "PayloadFailed" in submission:
                # Do not keep running and do not overwrite the Payload error
                message = "Payload execution failed with error code %s" % submission["PayloadFailed"]
                if self.stopOnApplicationFailure:
                    return self.__finish(message, self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug("After %sCE submitJob()" % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, "Job processing failed with exception", self.stopOnApplicationFailure
            )

        # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
        cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result["OK"]:
            self.timeLeft = result["Value"]
        else:
            if result["Message"] != "Current batch system is not supported":
                self.timeLeftError = result["Message"]
            else:
                # if the batch system is not defined, use the process time and the CPU normalization defined locally
                self.timeLeft = self.__getCPUTimeLeft()

        scaledCPUTime = self.timeLeftUtil.getScaledCPU()
        self.__setJobParam(jobID, "ScaledCPUTime", str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK("Job Agent cycle complete")
Example #25
0
 def mergeFromFile( self, filename ):
   cfg = CFG()
   cfg.loadFromFile( filename )
   self.cfgData = self.cfgData.mergeWith( cfg )
Example #26
0
class ConfigurationData(object):
    def __init__(self, loadDefaultCFG=True):
        lr = LockRing()
        self.threadingEvent = lr.getEvent()
        self.threadingEvent.set()
        self.threadingLock = lr.getLock()
        self.runningThreadsNumber = 0
        self.__compressedConfigurationData = None
        self.configurationPath = "/DIRAC/Configuration"
        self.backupsDir = os.path.join(DIRAC.rootPath, "etc", "csbackup")
        self._isService = False
        self.localCFG = CFG()
        self.remoteCFG = CFG()
        self.mergedCFG = CFG()
        self.remoteServerList = []
        if loadDefaultCFG:
            defaultCFGFile = os.path.join(DIRAC.rootPath, "etc", "dirac.cfg")
            gLogger.debug("dirac.cfg should be at", "%s" % defaultCFGFile)
            retVal = self.loadFile(defaultCFGFile)
            if not retVal['OK']:
                gLogger.warn("Can't load %s file" % defaultCFGFile)
        self.sync()

    def getBackupDir(self):
        return self.backupsDir

    def sync(self):
        gLogger.debug("Updating configuration internals")
        self.mergedCFG = self.remoteCFG.mergeWith(self.localCFG)
        self.remoteServerList = []
        localServers = self.extractOptionFromCFG("%s/Servers" %
                                                 self.configurationPath,
                                                 self.localCFG,
                                                 disableDangerZones=True)
        if localServers:
            self.remoteServerList.extend(List.fromChar(localServers, ","))
        remoteServers = self.extractOptionFromCFG("%s/Servers" %
                                                  self.configurationPath,
                                                  self.remoteCFG,
                                                  disableDangerZones=True)
        if remoteServers:
            self.remoteServerList.extend(List.fromChar(remoteServers, ","))
        self.remoteServerList = List.uniqueElements(self.remoteServerList)
        self.__compressedConfigurationData = None

    def loadFile(self, fileName):
        try:
            fileCFG = CFG()
            fileCFG.loadFromFile(fileName)
        except IOError:
            self.localCFG = self.localCFG.mergeWith(fileCFG)
            return S_ERROR("Can't load a cfg file '%s'" % fileName)
        return self.mergeWithLocal(fileCFG)

    def mergeWithLocal(self, extraCFG):
        self.lock()
        try:
            self.localCFG = self.localCFG.mergeWith(extraCFG)
            self.unlock()
            gLogger.debug("CFG merged")
        except Exception as e:
            self.unlock()
            return S_ERROR("Cannot merge with new cfg: %s" % str(e))
        self.sync()
        return S_OK()

    def loadRemoteCFGFromCompressedMem(self, data):
        sUncompressedData = zlib.decompress(data)
        self.loadRemoteCFGFromMem(sUncompressedData)

    def loadRemoteCFGFromMem(self, data):
        self.lock()
        self.remoteCFG.loadFromBuffer(data)
        self.unlock()
        self.sync()

    def loadConfigurationData(self, fileName=False):
        name = self.getName()
        self.lock()
        try:
            if not fileName:
                fileName = "%s.cfg" % name
            if fileName[0] != "/":
                fileName = os.path.join(DIRAC.rootPath, "etc", fileName)
            self.remoteCFG.loadFromFile(fileName)
        except Exception as e:
            print e
        self.unlock()
        self.sync()

    def getCommentFromCFG(self, path, cfg=False):
        if not cfg:
            cfg = self.mergedCFG
        self.dangerZoneStart()
        try:
            levelList = [
                level.strip() for level in path.split("/")
                if level.strip() != ""
            ]
            for section in levelList[:-1]:
                cfg = cfg[section]
            return self.dangerZoneEnd(cfg.getComment(levelList[-1]))
        except Exception:
            pass
        return self.dangerZoneEnd(None)

    def getSectionsFromCFG(self, path, cfg=False, ordered=False):
        if not cfg:
            cfg = self.mergedCFG
        self.dangerZoneStart()
        try:
            levelList = [
                level.strip() for level in path.split("/")
                if level.strip() != ""
            ]
            for section in levelList:
                cfg = cfg[section]
            return self.dangerZoneEnd(cfg.listSections(ordered))
        except Exception:
            pass
        return self.dangerZoneEnd(None)

    def getOptionsFromCFG(self, path, cfg=False, ordered=False):
        if not cfg:
            cfg = self.mergedCFG
        self.dangerZoneStart()
        try:
            levelList = [
                level.strip() for level in path.split("/")
                if level.strip() != ""
            ]
            for section in levelList:
                cfg = cfg[section]
            return self.dangerZoneEnd(cfg.listOptions(ordered))
        except Exception:
            pass
        return self.dangerZoneEnd(None)

    def extractOptionFromCFG(self, path, cfg=False, disableDangerZones=False):
        if not cfg:
            cfg = self.mergedCFG
        if not disableDangerZones:
            self.dangerZoneStart()
        try:
            levelList = [
                level.strip() for level in path.split("/")
                if level.strip() != ""
            ]
            for section in levelList[:-1]:
                cfg = cfg[section]
            if levelList[-1] in cfg.listOptions():
                return self.dangerZoneEnd(cfg[levelList[-1]])
        except Exception:
            pass
        if not disableDangerZones:
            self.dangerZoneEnd()

    def setOptionInCFG(self, path, value, cfg=False, disableDangerZones=False):
        if not cfg:
            cfg = self.localCFG
        if not disableDangerZones:
            self.dangerZoneStart()
        try:
            levelList = [
                level.strip() for level in path.split("/")
                if level.strip() != ""
            ]
            for section in levelList[:-1]:
                if section not in cfg.listSections():
                    cfg.createNewSection(section)
                cfg = cfg[section]
            cfg.setOption(levelList[-1], value)
        finally:
            if not disableDangerZones:
                self.dangerZoneEnd()
        self.sync()

    def deleteOptionInCFG(self, path, cfg=False):
        if not cfg:
            cfg = self.localCFG
        self.dangerZoneStart()
        try:
            levelList = [
                level.strip() for level in path.split("/")
                if level.strip() != ""
            ]
            for section in levelList[:-1]:
                if section not in cfg.listSections():
                    return
                cfg = cfg[section]
            cfg.deleteKey(levelList[-1])
        finally:
            self.dangerZoneEnd()
        self.sync()

    def generateNewVersion(self):
        self.setVersion(Time.toString())
        self.sync()
        gLogger.info("Generated new version %s" % self.getVersion())

    def setVersion(self, version, cfg=False):
        if not cfg:
            cfg = self.remoteCFG
        self.setOptionInCFG("%s/Version" % self.configurationPath, version,
                            cfg)

    def getVersion(self, cfg=False):
        if not cfg:
            cfg = self.remoteCFG
        value = self.extractOptionFromCFG(
            "%s/Version" % self.configurationPath, cfg)
        if value:
            return value
        return "0"

    def getName(self):
        return self.extractOptionFromCFG("%s/Name" % self.configurationPath,
                                         self.mergedCFG)

    def exportName(self):
        return self.setOptionInCFG("%s/Name" % self.configurationPath,
                                   self.getName(), self.remoteCFG)

    def getRefreshTime(self):
        try:
            return int(
                self.extractOptionFromCFG(
                    "%s/RefreshTime" % self.configurationPath, self.mergedCFG))
        except:
            return 300

    def getPropagationTime(self):
        try:
            return int(
                self.extractOptionFromCFG(
                    "%s/PropagationTime" % self.configurationPath,
                    self.mergedCFG))
        except:
            return 300

    def getSlavesGraceTime(self):
        try:
            return int(
                self.extractOptionFromCFG(
                    "%s/SlavesGraceTime" % self.configurationPath,
                    self.mergedCFG))
        except:
            return 600

    def mergingEnabled(self):
        try:
            val = self.extractOptionFromCFG(
                "%s/EnableAutoMerge" % self.configurationPath, self.mergedCFG)
            return val.lower() in ("yes", "true", "y")
        except:
            return False

    def getAutoPublish(self):
        value = self.extractOptionFromCFG(
            "%s/AutoPublish" % self.configurationPath, self.localCFG)
        if value and value.lower() in ("no", "false", "n"):
            return False
        else:
            return True

    def getServers(self):
        return list(self.remoteServerList)

    def getConfigurationGateway(self):
        return self.extractOptionFromCFG("/DIRAC/Gateway", self.localCFG)

    def setServers(self, sServers):
        self.setOptionInCFG("%s/Servers" % self.configurationPath, sServers,
                            self.remoteCFG)
        self.sync()

    def deleteLocalOption(self, optionPath):
        self.deleteOptionInCFG(optionPath, self.localCFG)

    def getMasterServer(self):
        return self.extractOptionFromCFG(
            "%s/MasterServer" % self.configurationPath, self.remoteCFG)

    def setMasterServer(self, sURL):
        self.setOptionInCFG("%s/MasterServer" % self.configurationPath, sURL,
                            self.remoteCFG)
        self.sync()

    def getCompressedData(self):
        if self.__compressedConfigurationData is None:
            self.__compressedConfigurationData = zlib.compress(
                str(self.remoteCFG), 9)
        return self.__compressedConfigurationData

    def isMaster(self):
        value = self.extractOptionFromCFG("%s/Master" % self.configurationPath,
                                          self.localCFG)
        if value and value.lower() in ("yes", "true", "y"):
            return True
        else:
            return False

    def getServicesPath(self):
        return "/Services"

    def setAsService(self):
        self._isService = True

    def isService(self):
        return self._isService

    def useServerCertificate(self):
        value = self.extractOptionFromCFG(
            "/DIRAC/Security/UseServerCertificate")
        if value and value.lower() in ("y", "yes", "true"):
            return True
        return False

    def skipCACheck(self):
        value = self.extractOptionFromCFG("/DIRAC/Security/SkipCAChecks")
        if value and value.lower() in ("y", "yes", "true"):
            return True
        return False

    def dumpLocalCFGToFile(self, fileName):
        try:
            with open(fileName, "w") as fd:
                fd.write(str(self.localCFG))
            gLogger.verbose("Configuration file dumped", "'%s'" % fileName)
        except IOError:
            gLogger.error("Can't dump cfg file", "'%s'" % fileName)
            return S_ERROR("Can't dump cfg file '%s'" % fileName)
        return S_OK()

    def getRemoteCFG(self):
        return self.remoteCFG

    def getMergedCFGAsString(self):
        return str(self.mergedCFG)

    def dumpRemoteCFGToFile(self, fileName):
        with open(fileName, "w") as fd:
            fd.write(str(self.remoteCFG))

    def __backupCurrentConfiguration(self, backupName):
        configurationFilename = "%s.cfg" % self.getName()
        configurationFile = os.path.join(DIRAC.rootPath, "etc",
                                         configurationFilename)
        today = Time.date()
        backupPath = os.path.join(self.getBackupDir(), str(today.year),
                                  "%02d" % today.month)
        mkDir(backupPath)
        backupFile = os.path.join(
            backupPath,
            configurationFilename.replace(".cfg", ".%s.zip" % backupName))
        if os.path.isfile(configurationFile):
            gLogger.info("Making a backup of configuration in %s" % backupFile)
            try:
                with zipfile.ZipFile(backupFile, "w",
                                     zipfile.ZIP_DEFLATED) as zf:
                    zf.write(
                        configurationFile, "%s.backup.%s" %
                        (os.path.split(configurationFile)[1], backupName))
            except Exception:
                gLogger.exception()
                gLogger.error("Cannot backup configuration data file",
                              "file %s" % backupFile)
        else:
            gLogger.warn("CS data file does not exist", configurationFile)

    def writeRemoteConfigurationToDisk(self, backupName=False):
        configurationFile = os.path.join(DIRAC.rootPath, "etc",
                                         "%s.cfg" % self.getName())
        try:
            with open(configurationFile, "w") as fd:
                fd.write(str(self.remoteCFG))
        except Exception as e:
            gLogger.fatal("Cannot write new configuration to disk!",
                          "file %s" % configurationFile)
            return S_ERROR("Can't write cs file %s!: %s" %
                           (configurationFile, repr(e).replace(',)', ')')))
        if backupName:
            self.__backupCurrentConfiguration(backupName)
        return S_OK()

    def setRemoteCFG(self, cfg, disableSync=False):
        self.remoteCFG = cfg.clone()
        if not disableSync:
            self.sync()

    def lock(self):
        """
    Locks Event to prevent further threads from reading.
    Stops current thread until no other thread is accessing.
    PRIVATE USE
    """
        self.threadingEvent.clear()
        while self.runningThreadsNumber > 0:
            time.sleep(0.1)

    def unlock(self):
        """
    Unlocks Event.
    PRIVATE USE
    """
        self.threadingEvent.set()

    def dangerZoneStart(self):
        """
    Start of danger zone. This danger zone may be or may not be a mutual exclusion zone.
    Counter is maintained to know how many threads are inside and be able to enable and disable mutual exclusion.
    PRIVATE USE
    """
        self.threadingEvent.wait()
        self.threadingLock.acquire()
        self.runningThreadsNumber += 1
        try:
            self.threadingLock.release()
        except thread.error:
            pass

    def dangerZoneEnd(self, returnValue=None):
        """
    End of danger zone.
    PRIVATE USE
    """
        self.threadingLock.acquire()
        self.runningThreadsNumber -= 1
        try:
            self.threadingLock.release()
        except thread.error:
            pass
        return returnValue
Example #27
0
 def mergeFromFile( self, filename ):
   cfg = CFG()
   cfg.loadFromFile( filename )
   self.cfgData = self.cfgData.mergeWith( cfg )
Example #28
0
class ConfigurationData( object ):

  def __init__( self, loadDefaultCFG = True ):
    lr = LockRing()
    self.threadingEvent = lr.getEvent()
    self.threadingEvent.set()
    self.threadingLock = lr.getLock()
    self.runningThreadsNumber = 0
    self.__compressedConfigurationData = None
    self.configurationPath = "/DIRAC/Configuration"
    self.backupsDir = os.path.join( DIRAC.rootPath, "etc", "csbackup" )
    self._isService = False
    self.localCFG = CFG()
    self.remoteCFG = CFG()
    self.mergedCFG = CFG()
    self.remoteServerList = []
    if loadDefaultCFG:
      defaultCFGFile = os.path.join( DIRAC.rootPath, "etc", "dirac.cfg" )
      gLogger.debug( "dirac.cfg should be at", "%s" % defaultCFGFile )
      retVal = self.loadFile( defaultCFGFile )
      if not retVal[ 'OK' ]:
        gLogger.warn( "Can't load %s file" % defaultCFGFile )
    self.sync()

  def getBackupDir( self ):
    return self.backupsDir

  def sync( self ):
    gLogger.debug( "Updating configuration internals" )
    self.mergedCFG = self.remoteCFG.mergeWith( self.localCFG )
    self.remoteServerList = []
    localServers = self.extractOptionFromCFG( "%s/Servers" % self.configurationPath,
                                              self.localCFG,
                                              disableDangerZones = True )
    if localServers:
      self.remoteServerList.extend( List.fromChar( localServers, "," ) )
    remoteServers = self.extractOptionFromCFG( "%s/Servers" % self.configurationPath,
                                               self.remoteCFG,
                                               disableDangerZones = True )
    if remoteServers:
      self.remoteServerList.extend( List.fromChar( remoteServers, "," ) )
    self.remoteServerList = List.uniqueElements( self.remoteServerList )
    self.__compressedConfigurationData = None

  def loadFile( self, fileName ):
    try:
      fileCFG = CFG()
      fileCFG.loadFromFile( fileName )
    except IOError:
      self.localCFG = self.localCFG.mergeWith( fileCFG )
      return S_ERROR( "Can't load a cfg file '%s'" % fileName )
    return self.mergeWithLocal( fileCFG )

  def mergeWithLocal( self, extraCFG ):
    self.lock()
    try:
      self.localCFG = self.localCFG.mergeWith( extraCFG )
      self.unlock()
      gLogger.debug( "CFG merged" )
    except Exception as e:
      self.unlock()
      return S_ERROR( "Cannot merge with new cfg: %s" % str( e ) )
    self.sync()
    return S_OK()

  def loadRemoteCFGFromCompressedMem( self, data ):
    sUncompressedData = zlib.decompress( data )
    self.loadRemoteCFGFromMem( sUncompressedData )

  def loadRemoteCFGFromMem( self, data ):
    self.lock()
    self.remoteCFG.loadFromBuffer( data )
    self.unlock()
    self.sync()

  def loadConfigurationData( self, fileName = False ):
    name = self.getName()
    self.lock()
    try:
      if not fileName:
        fileName = "%s.cfg" % name
      if fileName[0] != "/":
        fileName = os.path.join( DIRAC.rootPath, "etc", fileName )
      self.remoteCFG.loadFromFile( fileName )
    except Exception as e:
      print e
      pass
    self.unlock()
    self.sync()

  def getCommentFromCFG( self, path, cfg = False ):
    if not cfg:
      cfg = self.mergedCFG
    self.dangerZoneStart()
    try:
      levelList = [ level.strip() for level in path.split( "/" ) if level.strip() != "" ]
      for section in levelList[:-1]:
        cfg = cfg[ section ]
      return self.dangerZoneEnd( cfg.getComment( levelList[-1] ) )
    except Exception:
      pass
    return self.dangerZoneEnd( None )

  def getSectionsFromCFG( self, path, cfg = False, ordered = False ):
    if not cfg:
      cfg = self.mergedCFG
    self.dangerZoneStart()
    try:
      levelList = [ level.strip() for level in path.split( "/" ) if level.strip() != "" ]
      for section in levelList:
        cfg = cfg[ section ]
      return self.dangerZoneEnd( cfg.listSections( ordered ) )
    except Exception:
      pass
    return self.dangerZoneEnd( None )

  def getOptionsFromCFG( self, path, cfg = False, ordered = False ):
    if not cfg:
      cfg = self.mergedCFG
    self.dangerZoneStart()
    try:
      levelList = [ level.strip() for level in path.split( "/" ) if level.strip() != "" ]
      for section in levelList:
        cfg = cfg[ section ]
      return self.dangerZoneEnd( cfg.listOptions( ordered ) )
    except Exception:
      pass
    return self.dangerZoneEnd( None )

  def extractOptionFromCFG( self, path, cfg = False, disableDangerZones = False ):
    if not cfg:
      cfg = self.mergedCFG
    if not disableDangerZones:
      self.dangerZoneStart()
    try:
      levelList = [ level.strip() for level in path.split( "/" ) if level.strip() != "" ]
      for section in levelList[:-1]:
        cfg = cfg[ section ]
      if levelList[-1] in cfg.listOptions():
        return self.dangerZoneEnd( cfg[ levelList[ -1 ] ] )
    except Exception:
      pass
    if not disableDangerZones:
      self.dangerZoneEnd()

  def setOptionInCFG( self, path, value, cfg = False, disableDangerZones = False ):
    if not cfg:
      cfg = self.localCFG
    if not disableDangerZones:
      self.dangerZoneStart()
    try:
      levelList = [ level.strip() for level in path.split( "/" ) if level.strip() != "" ]
      for section in levelList[:-1]:
        if section not in cfg.listSections():
          cfg.createNewSection( section )
        cfg = cfg[ section ]
      cfg.setOption( levelList[ -1 ], value )
    finally:
      if not disableDangerZones:
        self.dangerZoneEnd()
    self.sync()

  def deleteOptionInCFG( self, path, cfg = False ):
    if not cfg:
      cfg = self.localCFG
    self.dangerZoneStart()
    try:
      levelList = [ level.strip() for level in path.split( "/" ) if level.strip() != "" ]
      for section in levelList[:-1]:
        if section not in cfg.listSections():
          return
        cfg = cfg[ section ]
      cfg.deleteKey( levelList[ -1 ] )
    finally:
      self.dangerZoneEnd()
    self.sync()

  def generateNewVersion( self ):
    self.setVersion( Time.toString() )
    self.sync()
    gLogger.info( "Generated new version %s" % self.getVersion() )

  def setVersion( self, version, cfg = False ):
    if not cfg:
      cfg = self.remoteCFG
    self.setOptionInCFG( "%s/Version" % self.configurationPath, version, cfg )

  def getVersion( self, cfg = False ):
    if not cfg:
      cfg = self.remoteCFG
    value = self.extractOptionFromCFG( "%s/Version" % self.configurationPath, cfg )
    if value:
      return value
    return "0"

  def getName( self ):
    return self.extractOptionFromCFG( "%s/Name" % self.configurationPath, self.mergedCFG )

  def exportName( self ):
    return self.setOptionInCFG( "%s/Name" % self.configurationPath, self.getName(), self.remoteCFG )

  def getRefreshTime( self ):
    try:
      return int( self.extractOptionFromCFG( "%s/RefreshTime" % self.configurationPath, self.mergedCFG ) )
    except:
      return 300

  def getPropagationTime( self ):
    try:
      return int( self.extractOptionFromCFG( "%s/PropagationTime" % self.configurationPath, self.mergedCFG ) )
    except:
      return 300

  def getSlavesGraceTime( self ):
    try:
      return int( self.extractOptionFromCFG( "%s/SlavesGraceTime" % self.configurationPath, self.mergedCFG ) )
    except:
      return 600

  def mergingEnabled( self ):
    try:
      val = self.extractOptionFromCFG( "%s/EnableAutoMerge" % self.configurationPath, self.mergedCFG )
      return val.lower() in ( "yes", "true", "y" )
    except:
      return False

  def getAutoPublish( self ):
    value = self.extractOptionFromCFG( "%s/AutoPublish" % self.configurationPath, self.localCFG )
    if value and value.lower() in ( "no", "false", "n" ):
      return False
    else:
      return True

  def getServers( self ):
    return list( self.remoteServerList )

  def getConfigurationGateway( self ):
    return self.extractOptionFromCFG( "/DIRAC/Gateway", self.localCFG )

  def setServers( self, sServers ):
    self.setOptionInCFG( "%s/Servers" % self.configurationPath, sServers, self.remoteCFG )
    self.sync()

  def deleteLocalOption( self, optionPath ):
    self.deleteOptionInCFG( optionPath, self.localCFG )

  def getMasterServer( self ):
    return self.extractOptionFromCFG( "%s/MasterServer" % self.configurationPath, self.remoteCFG )

  def setMasterServer( self, sURL ):
    self.setOptionInCFG( "%s/MasterServer" % self.configurationPath, sURL, self.remoteCFG )
    self.sync()

  def getCompressedData( self ):
    if self.__compressedConfigurationData is None:
      self.__compressedConfigurationData = zlib.compress( str( self.remoteCFG ), 9 )
    return self.__compressedConfigurationData

  def isMaster( self ):
    value = self.extractOptionFromCFG( "%s/Master" % self.configurationPath, self.localCFG )
    if value and value.lower() in ( "yes", "true", "y" ):
      return True
    else:
      return False

  def getServicesPath( self ):
    return "/Services"

  def setAsService( self ):
    self._isService = True

  def isService( self ):
    return self._isService

  def useServerCertificate( self ):
    value = self.extractOptionFromCFG( "/DIRAC/Security/UseServerCertificate" )
    if value and value.lower() in ( "y", "yes", "true" ):
      return True
    return False

  def skipCACheck( self ):
    value = self.extractOptionFromCFG( "/DIRAC/Security/SkipCAChecks" )
    if value and value.lower() in ( "y", "yes", "true" ):
      return True
    return False

  def dumpLocalCFGToFile( self, fileName ):
    try:
      with open( fileName, "w" ) as fd:
        fd.write( str( self.localCFG ) )
      gLogger.verbose( "Configuration file dumped", "'%s'" % fileName )
    except IOError:
      gLogger.error( "Can't dump cfg file", "'%s'" % fileName )
      return S_ERROR( "Can't dump cfg file '%s'" % fileName )
    return S_OK()

  def getRemoteCFG( self ):
    return self.remoteCFG

  def getMergedCFGAsString( self ):
    return str( self.mergedCFG )

  def dumpRemoteCFGToFile( self, fileName ):
    with open( fileName, "w" ) as fd:
      fd.write( str( self.remoteCFG ) )

  def __backupCurrentConfiguration( self, backupName ):
    configurationFilename = "%s.cfg" % self.getName()
    configurationFile = os.path.join( DIRAC.rootPath, "etc", configurationFilename )
    today = Time.date()
    backupPath = os.path.join( self.getBackupDir(), str( today.year ), "%02d" % today.month )
    mkDir(backupPath)
    backupFile = os.path.join( backupPath, configurationFilename.replace( ".cfg", ".%s.zip" % backupName ) )
    if os.path.isfile( configurationFile ):
      gLogger.info( "Making a backup of configuration in %s" % backupFile )
      try:
        zf = zipfile.ZipFile( backupFile, "w", zipfile.ZIP_DEFLATED )
        zf.write( configurationFile, "%s.backup.%s" % ( os.path.split( configurationFile )[1], backupName ) )
        zf.close()
      except Exception:
        gLogger.exception()
        gLogger.error( "Cannot backup configuration data file", "file %s" % backupFile )
    else:
      gLogger.warn( "CS data file does not exist", configurationFile )

  def writeRemoteConfigurationToDisk( self, backupName = False ):
    configurationFile = os.path.join( DIRAC.rootPath, "etc", "%s.cfg" % self.getName() )
    try:
      with open( configurationFile, "w" ) as fd:
        fd.write( str( self.remoteCFG ) )
    except Exception as e:
      gLogger.fatal( "Cannot write new configuration to disk!",
                     "file %s" % configurationFile )
      return S_ERROR( "Can't write cs file %s!: %s" % ( configurationFile, repr( e ).replace( ',)', ')' ) ) )
    if backupName:
      self.__backupCurrentConfiguration( backupName )
    return S_OK()

  def setRemoteCFG( self, cfg, disableSync = False ):
    self.remoteCFG = cfg.clone()
    if not disableSync:
      self.sync()

  def lock( self ):
    """
    Locks Event to prevent further threads from reading.
    Stops current thread until no other thread is accessing.
    PRIVATE USE
    """
    self.threadingEvent.clear()
    while self.runningThreadsNumber > 0:
      time.sleep( 0.1 )

  def unlock( self ):
    """
    Unlocks Event.
    PRIVATE USE
    """
    self.threadingEvent.set()

  def dangerZoneStart( self ):
    """
    Start of danger zone. This danger zone may be or may not be a mutual exclusion zone.
    Counter is maintained to know how many threads are inside and be able to enable and disable mutual exclusion.
    PRIVATE USE
    """
    self.threadingEvent.wait()
    self.threadingLock.acquire()
    self.runningThreadsNumber += 1
    try:
      self.threadingLock.release()
    except thread.error:
      pass

  def dangerZoneEnd( self, returnValue = None ):
    """
    End of danger zone.
    PRIVATE USE
    """
    self.threadingLock.acquire()
    self.runningThreadsNumber -= 1
    try:
      self.threadingLock.release()
    except thread.error:
      pass
    return returnValue
Example #29
0
# $HeadURL$
__RCSID__ = "7b8878b (2009-11-05 19:40:01 +0000) Adria Casajus <*****@*****.**>"

from dirac import DIRAC
from DIRAC.Core.Utilities.CFG import CFG

DIRAC.gLogger.initialize('test_gConfig','/testSectionDebug')

testconfig = '%s/DIRAC/ConfigurationSystem/test/test.cfg' % DIRAC.rootPath
dumpconfig = '%s/DIRAC/ConfigurationSystem/test/dump.cfg' % DIRAC.rootPath

cfg1 = CFG()
cfg1.loadFromFile( testconfig )

fd = file( testconfig )
cfg1String = fd.read()
fd.close()

cfg2 = CFG()
cfg2.loadFromBuffer( cfg1.serialize() )

cfg3 = cfg1.mergeWith( cfg2 )

testList = [{ 'method'    : DIRAC.gConfig.loadFile,
              'arguments' : ( testconfig, ),
              'output'    : {'OK': True, 'Value': ''}
            },
            { 'method'    : DIRAC.gConfig.dumpLocalCFGToFile,
              'arguments' : ( dumpconfig, ),
              'output'    : {'OK': True, 'Value': ''}
            },
Example #30
0
    mjfPower = getPowerFromMJF()
    if mjfPower:
        gLogger.notice('CPU power from MJF is %.1f HS06' % mjfPower)
    else:
        gLogger.notice('MJF not available on this node')

    if update and not configFile:
        gConfig.setOptionValue('/LocalSite/CPUScalingFactor',
                               mjfPower if mjfPower else norm)
        gConfig.setOptionValue('/LocalSite/CPUNormalizationFactor', norm)

        gConfig.dumpLocalCFGToFile(gConfig.diracConfigFilePath)
    if configFile:
        from DIRAC.Core.Utilities.CFG import CFG
        cfg = CFG()
        try:
            # Attempt to open the given file
            cfg.loadFromFile(configFile)
        except:
            pass
        # Create the section if it does not exist
        if not cfg.existsKey('LocalSite'):
            cfg.createNewSection('LocalSite')
        cfg.setOption('/LocalSite/CPUScalingFactor',
                      mjfPower if mjfPower else norm)
        cfg.setOption('/LocalSite/CPUNormalizationFactor', norm)

        cfg.writeToFile(configFile)

    DIRAC.exit()
Example #31
0
class JobRepository( object ):

  def __init__( self, repository = None ):
    self.location = repository
    if not self.location:
      if "HOME" in os.environ:
        self.location = '%s/.dirac.repo.rep' % os.environ['HOME']
      else:
        self.location = '%s/.dirac.repo.rep' % os.getcwd()
    self.repo = CFG()
    if os.path.exists( self.location ):
      self.repo.loadFromFile( self.location )
      if not self.repo.existsKey( 'Jobs' ):
        self.repo.createNewSection( 'Jobs' )
    else:
      self.repo.createNewSection( 'Jobs' )
    self.OK = True
    written = self._writeRepository( self.location )
    if not written:
      self.OK = False

  def isOK( self ):
    return self.OK

  def readRepository( self ):
    return S_OK( self.repo.getAsDict( 'Jobs' ) )

  def writeRepository( self, alternativePath = None ):
    destination = self.location
    if alternativePath:
      destination = alternativePath
    written = self._writeRepository( destination )
    if not written:
      return S_ERROR( "Failed to write repository" )
    return S_OK( destination )

  def resetRepository( self, jobIDs = [] ):
    if not jobIDs:
      jobs = self.readRepository()['Value']
      jobIDs = jobs.keys()
    paramDict = {'State'       : 'Submitted',
                 'Retrieved'   : 0,
                 'OutputData'  : 0}
    for jobID in jobIDs:
      self._writeJob( jobID, paramDict, True )
    self._writeRepository( self.location )
    return S_OK()

  def _writeRepository( self, path ):
    handle, tmpName = tempfile.mkstemp()
    written = self.repo.writeToFile( tmpName )
    os.close( handle )
    if not written:
      if os.path.exists( tmpName ):
        os.remove( tmpName )
      return written
    if os.path.exists( path ):
      gLogger.debug( "Replacing %s" % path )
    try:
      shutil.move( tmpName, path )
      return True
    except Exception as x:
      gLogger.error( "Failed to overwrite repository.", x )
      gLogger.info( "If your repository is corrupted a backup can be found %s" % tmpName )
      return False

  def appendToRepository( self, repoLocation ):
    if not os.path.exists( repoLocation ):
      gLogger.error( "Secondary repository does not exist", repoLocation )
      return S_ERROR( "Secondary repository does not exist" )
    self.repo = CFG().loadFromFile( repoLocation ).mergeWith( self.repo )
    self._writeRepository( self.location )
    return S_OK()

  def addJob( self, jobID, state = 'Submitted', retrieved = 0, outputData = 0, update = False ):
    paramDict = { 'State'       : state,
                  'Time'        : self._getTime(),
                  'Retrieved'   : int( retrieved ),
                  'OutputData'  : outputData}
    self._writeJob( jobID, paramDict, update )
    self._writeRepository( self.location )
    return S_OK( jobID )

  def updateJob( self, jobID, paramDict ):
    if self._existsJob( jobID ):
      paramDict['Time'] = self._getTime()
      self._writeJob( jobID, paramDict, True )
      self._writeRepository( self.location )
    return S_OK()

  def updateJobs( self, jobDict ):
    for jobID, paramDict in jobDict.items():
      if self._existsJob( jobID ):
        paramDict['Time'] = self._getTime()
        self._writeJob( jobID, paramDict, True )
    self._writeRepository( self.location )
    return S_OK()

  def _getTime( self ):
    runtime = time.ctime()
    return runtime.replace( " ", "_" )

  def _writeJob( self, jobID, paramDict, update ):
    jobID = str( jobID )
    jobExists = self._existsJob( jobID )
    if jobExists and ( not update ):
      gLogger.warn( "Job exists and not overwriting" )
      return S_ERROR( "Job exists and not overwriting" )
    if not jobExists:
      self.repo.createNewSection( 'Jobs/%s' % jobID )
    for key, value in paramDict.items():
      self.repo.setOption( 'Jobs/%s/%s' % ( jobID, key ), value )
    return S_OK()

  def removeJob( self, jobID ):
    res = self.repo['Jobs'].deleteKey( str( jobID ) ) #pylint: disable=no-member
    if res:
      self._writeRepository( self.location )
    return S_OK()

  def existsJob( self, jobID ):
    return S_OK( self._existsJob( jobID ) )

  def _existsJob( self, jobID ):
    return self.repo.isSection( 'Jobs/%s' % jobID )

  def getLocation( self ):
    return S_OK( self.location )

  def getSize( self ):
    return S_OK( len( self.repo.getAsDict( 'Jobs' ) ) )
Example #32
0
def checkAgentOptions(getOptionMock, systemName, agentName,
                      ignoreOptions=None, extension='DIRAC'):
  """Ensure that all the agent options are properly documented.

  :param getOptionMock: Mock object for agentmodule.get_amOption function
  :param str systemName: name of the **System**
  :param str agentName: name of the **Agent**
  :param list ignoreOptions: list of options to ignore
  :param str extension: name of the DIRAC **Extension** where the Agent comes from
  """
  if ignoreOptions is None:
    ignoreOptions = []

  # add some options that can be set, see the AgentModule for all of them
  ignoreOptions.extend(['PollingTime', 'Status', 'Enabled',
                        'MaxCycles', 'LogOutputs', 'ControlDirectory',
                        'shifterProxy'])
  ignoreOptions = list(set(ignoreOptions))
  config = CFG()

  LOG.info("Testing %s/%s, ignoring options %s", systemName, agentName, ignoreOptions)

  # get the location where DIRAC is in from basefolder/DIRAC/__ini__.py
  configFilePath = os.path.join(os.path.dirname(os.path.dirname(DIRAC.__file__)),
                                extension, systemName, 'ConfigTemplate.cfg')
  config.loadFromFile(configFilePath)
  optionsDict = config.getAsDict('Agents/%s' % agentName)
  outDict = {}
  _parseOption(outDict, optionsDict)
  optionsDict = outDict
  LOG.info("Calls: %s", pformat(getOptionMock.call_args_list))
  LOG.info("Options found in ConfigTemplate: %s ", list(optionsDict.keys()))

  # check that values in ConfigTemplate are used
  for option, value in optionsDict.iteritems():
    if any(ignoreOp in option for ignoreOp in ignoreOptions):
      LOG.info("From Agent: ignoring option %r with value %r, (%s)", option, value, type(value))
      continue
    LOG.info("Looking for call to option %r with value %r, (%s)", option, value, type(value))
    if not isinstance(value, bool) and not value:  # empty string, list, dict ...
      assert any(call(option, null) in getOptionMock.call_args_list for null in ({}, set(), [], '', 0))
    else:
      assert call(option, value) in getOptionMock.call_args_list or \
          call(option, [value]) in getOptionMock.call_args_list

  # check that options used in the agent are in the ConfigTemplates
  for opCall in getOptionMock.call_args_list:
    optionArguments = opCall[0]
    if len(optionArguments) != 2:
      continue
    optionName = optionArguments[0]
    optionValue = optionArguments[1]
    if optionName in ignoreOptions:
      LOG.info("From Template: ignoring option %r with %r", optionName, optionValue)
      continue
    LOG.info("Checking Template option %r with %r", optionName, optionValue)
    assert optionName in optionsDict
    if not optionsDict[optionName]:
      assert not optionValue
      continue
    assert optionsDict[optionName] == optionValue or [optionsDict[optionName]] == optionValue