class ProcessList(object):
  """The ProcessList uses internally the CFG utility to store the processes
  and their properties.
  """

  def __init__(self, location):
    """Load the process list from *location*.

    :param str location: path to the process list cfg file; if it does not
        exist, the list is flagged as not good
    """
    self.cfg = CFG()
    self.location = location
    self.goodProcessList = True
    if os.path.exists(self.location):
      self.cfg.loadFromFile(self.location)
      if not self.cfg.existsKey('Processes'):
        self.cfg.createNewSection('Processes')
    else:
      self.goodProcessList = False

  def _writeProcessList(self, path):
    """Write the current CFG to *path* via a temporary file.

    :param str path: destination file
    :returns: True on success, False otherwise
    """
    handle, tmpName = tempfile.mkstemp()
    written = self.cfg.writeToFile(tmpName)
    os.close(handle)
    if not written:
      # writing the temporary file failed: clean it up and report
      if os.path.exists(tmpName):
        os.remove(tmpName)
      return written
    if os.path.exists(path):
      LOG.debug("Replacing %s" % path)
    try:
      shutil.move(tmpName, path)
      return True
    except OSError as err:  # fixed: "except OSError, err" is Python-2-only syntax
      LOG.error("Failed to overwrite process list.", err)
      LOG.info("If your process list is corrupted a backup can be found %s" % tmpName)
    return False
def _parseConfigTemplate(self, templatePath, cfg=None):
  """Parse the ConfigTemplate.cfg files.

  :param str templatePath: path to the folder containing a ConfigTemplate.cfg file
  :param CFG cfg: cfg to merge with the systems config
  :returns: S_OK with the merged CFG object, S_ERROR if the template file is missing
  """
  if cfg is None:
    cfg = CFG()
  # derive the system name from the folder name, dropping a 'System' suffix
  systemName = os.path.split(templatePath.rstrip("/"))[1]
  if systemName.lower().endswith('system'):
    systemName = systemName[:-len('System')]
  # skip systems that were not requested
  if self.systems and systemName not in self.systems:
    return S_OK(cfg)
  templatePath = os.path.join(templatePath, 'ConfigTemplate.cfg')
  if not os.path.exists(templatePath):
    return S_ERROR("File not found: %s" % templatePath)
  templateCfg = CFG()
  templateCfg.loadFromFile(templatePath)
  # wrap the template under a "/<System>" section and merge it in
  systemCfg = CFG()
  systemCfg.createNewSection("/%s" % systemName, contents=templateCfg)
  return S_OK(cfg.mergeWith(systemCfg))
class ProcessList(object):
  """The ProcessList uses internally the CFG utility to store the processes
  and their properties.
  """

  def __init__(self, location):
    """Load the process list from *location*.

    :param str location: path to the process list cfg file; if it does not
        exist, the list is flagged as not good
    """
    self.cfg = CFG()
    self.location = location
    self.goodProcessList = True
    if os.path.exists(self.location):
      self.cfg.loadFromFile(self.location)
      if not self.cfg.existsKey('Processes'):
        self.cfg.createNewSection('Processes')
    else:
      self.goodProcessList = False

  def _writeProcessList(self, path):
    """Write the current CFG to *path* via a temporary file.

    :param str path: destination file
    :returns: True on success, False otherwise
    """
    handle, tmpName = tempfile.mkstemp()
    written = self.cfg.writeToFile(tmpName)
    os.close(handle)
    if not written:
      # writing the temporary file failed: clean it up and report
      if os.path.exists(tmpName):
        os.remove(tmpName)
      return written
    if os.path.exists(path):
      gLogger.debug("Replacing %s" % path)
    try:
      shutil.move(tmpName, path)
      return True
    except OSError as err:  # fixed: "except OSError, err" is Python-2-only syntax
      gLogger.error("Failed to overwrite process list.", err)
      gLogger.info("If your process list is corrupted a backup can be found %s" % tmpName)
    return False
def parseConfigTemplate(self, templatePath, cfg):
  """Parse the ConfigTemplate.cfg files.

  :param str templatePath: path to the folder containing a ConfigTemplate.cfg file
  :param CFG cfg: cfg to merge with the systems config
  :returns: S_OK with the updated CFG object, S_ERROR otherwise
  """
  # system name comes from the folder name, without a trailing 'System'
  systemName = os.path.split(templatePath.rstrip('/'))[1]
  if systemName.lower().endswith('system'):
    systemName = systemName[:-len('System')]
  templatePath = os.path.join(templatePath, 'ConfigTemplate.cfg')
  if not os.path.exists(templatePath):
    return S_ERROR('File not found: %s' % templatePath)
  templateCfg = CFG()
  try:
    templateCfg.loadFromFile(templatePath)
  except ValueError as err:
    # remember the failure so the caller can report a non-zero exit status
    LOG.error('Failed loading file %r: %r', templatePath, err)
    self.retVal = 1
    return S_ERROR()
  cfg.createNewSection('/Systems/%s' % systemName, contents=templateCfg)
  return S_OK(cfg)
def checkAgentOptions(getOptionMock, systemName, agentName, ignoreOptions=None, extension='DIRAC'):
  """Ensure that all the agent options are properly documented.

  :param getOptionMock: Mock object for agentmodule.get_amOption function
  :param str systemName: name of the **System**
  :param str agentName: name of the **Agent**
  :param list ignoreOptions: list of options to ignore
  :param str extension: name of the DIRAC **Extension** where the Agent comes from
  """
  if ignoreOptions is None:
    ignoreOptions = []
  # Bug fix: work on a copy -- extending the caller's list in place would leak
  # the default AgentModule options back into the caller's list object
  ignoreOptions = list(ignoreOptions)
  # add some options that can be set, see the AgentModule for all of them
  ignoreOptions.extend(['PollingTime', 'Status', 'Enabled', 'MaxCycles', 'LogOutputs', 'ControlDirectory'])
  ignoreOptions = list(set(ignoreOptions))
  config = CFG()
  LOG.info("Testing %s/%s, ignoring options %s", systemName, agentName, ignoreOptions)

  # get the location where DIRAC is in from basefolder/DIRAC/__ini__.py
  configFilePath = os.path.join(os.path.dirname(os.path.dirname(DIRAC.__file__)),
                                extension, systemName, 'ConfigTemplate.cfg')
  config.loadFromFile(configFilePath)
  optionsDict = config.getAsDict('Agents/%s' % agentName)
  outDict = {}
  _parseOption(outDict, optionsDict)
  optionsDict = outDict
  LOG.info("Calls: %s", pformat(getOptionMock.call_args_list))
  LOG.info("Options found in ConfigTemplate: %s ", list(optionsDict.keys()))

  # check that values in ConfigTemplate are used
  # items() instead of the Python-2-only iteritems() for py2/py3 compatibility
  for option, value in optionsDict.items():
    if any(ignoreOp in option for ignoreOp in ignoreOptions):
      LOG.info("From Agent: ignoring option %r with value %r, (%s)", option, value, type(value))
      continue
    LOG.info("Looking for call to option %r with value %r, (%s)", option, value, type(value))
    if not isinstance(value, bool) and not value:  # empty string, list, dict ...
      assert any(call(option, null) in getOptionMock.call_args_list for null in ({}, set(), [], '', 0))
    else:
      assert call(option, value) in getOptionMock.call_args_list or \
          call(option, [value]) in getOptionMock.call_args_list

  # check that options used in the agent are in the ConfigTemplates
  for opCall in getOptionMock.call_args_list:
    optionArguments = opCall[0]
    if len(optionArguments) != 2:
      continue
    optionName = optionArguments[0]
    optionValue = optionArguments[1]
    if optionName in ignoreOptions:
      LOG.info("From Template: ignoring option %r with %r", optionName, optionValue)
      continue
    LOG.info("Checking Template option %r with %r", optionName, optionValue)
    assert optionName in optionsDict
    if not optionsDict[optionName]:
      assert not optionValue
      continue
    assert optionsDict[optionName] == optionValue or [optionsDict[optionName]] == optionValue
def loadFile( self, fileName ):
  """Load a cfg file and merge it with the local configuration.

  :param str fileName: path to the cfg file
  :returns: S_OK from the merge on success, S_ERROR if the file cannot be read
  """
  try:
    fileCFG = CFG()
    fileCFG.loadFromFile( fileName )
  except IOError:
    # Bug fix: the original merged the (empty) fileCFG into localCFG here
    # before returning the error -- pointless on a failed load, so just report.
    return S_ERROR( "Can't load a cfg file '%s'" % fileName )
  return self.mergeWithLocal( fileCFG )
def loadFile(self, fileName):
  """Load a cfg file and merge it with the local configuration.

  :param str fileName: path to the cfg file
  :returns: S_OK from the merge on success, S_ERROR if the file cannot be read
  """
  try:
    fileCFG = CFG()
    fileCFG.loadFromFile(fileName)
  except IOError:
    # Bug fix: the original merged the (empty) fileCFG into localCFG here
    # before returning the error -- pointless on a failed load, so just report.
    return S_ERROR("Can't load a cfg file '%s'" % fileName)
  return self.mergeWithLocal(fileCFG)
def execute( self ):
  """One agent cycle: compare the newest pilot version with the locally
  configured release version and update the local cfg when it is older.
  """
  # do nothing while stop_agents are waiting to be picked up
  if self.findStopAgents()[ 'Value' ]:
    self.log.info( 'Aborting, there are stop_agents to be picked' )
    return S_OK()

  # a pilot version must be defined in the CS at all
  if not self.opHelper.getValue( 'Pilot/Version', '' ):
    self.log.error( 'There is no pilot version on the CS' )
    return S_OK()

  newestRes = self.getNewestPilotVersion()
  if not newestRes[ 'OK' ]:
    self.log.error( newestRes[ 'Message' ] )
    return S_ERROR( newestRes[ 'Message' ] )
  newestVersion = newestRes[ 'Value' ]

  # load local CFG
  localCFG = CFG()
  localCFG.loadFromFile( self.cfgToUpdate )
  currentVersion = localCFG.getRecursive( 'LocalSite/ReleaseVersion' )[ 'value' ]
  self.log.info( 'PilotVersion : %s' % newestVersion )
  self.log.info( 'ReleaseVersion : %s' % currentVersion )

  if LooseVersion( newestVersion ) > LooseVersion( currentVersion ):
    self.log.info( 'UPDATING %s > %s' % ( newestVersion, currentVersion ) )
    localCFG.setOption( 'LocalSite/ReleaseVersion', newestVersion )
    localCFG.writeToFile( self.cfgToUpdate )
    self.touchStopAgents()
  else:
    self.log.info( 'Nothing to do' )

  return S_OK()
def checkFunction():
  """ gets CPU normalisation from MFJ or calculate itself """
  from DIRAC.WorkloadManagementSystem.Client.CPUNormalization import getPowerFromMJF
  from ILCDIRAC.Core.Utilities.CPUNormalization import getCPUNormalization
  from DIRAC import gLogger, gConfig

  result = getCPUNormalization()
  if not result['OK']:
    # Bug fix: bail out on failure -- the original carried on and
    # result['Value'] below would raise KeyError on an S_ERROR dict
    gLogger.error( result['Message'] )
    DIRAC.exit( 1 )

  norm = round( result['Value']['NORM'], 1 )
  gLogger.notice( 'Estimated CPU power is %.1f %s' % ( norm, result['Value']['UNIT'] ) )

  mjfPower = getPowerFromMJF()
  if mjfPower:
    gLogger.notice( 'CPU power from MJF is %.1f HS06' % mjfPower )
  else:
    gLogger.notice( 'MJF not available on this node' )

  # update the running local configuration only when no explicit file is given
  if update and not configFile:
    gConfig.setOptionValue( '/LocalSite/CPUScalingFactor', mjfPower if mjfPower else norm )
    gConfig.setOptionValue( '/LocalSite/CPUNormalizationFactor', norm )
    gConfig.dumpLocalCFGToFile( gConfig.diracConfigFilePath )

  if configFile:
    from DIRAC.Core.Utilities.CFG import CFG
    cfg = CFG()
    try:
      # Attempt to open the given file
      cfg.loadFromFile( configFile )
    except Exception:  # narrowed from a bare except: never swallow SystemExit/KeyboardInterrupt
      pass
    # Create the section if it does not exist
    if not cfg.existsKey( 'LocalSite' ):
      cfg.createNewSection( 'LocalSite' )
    cfg.setOption( '/LocalSite/CPUScalingFactor', mjfPower if mjfPower else norm )
    cfg.setOption( '/LocalSite/CPUNormalizationFactor', norm )
    cfg.writeToFile( configFile )

  DIRAC.exit()
def getComputingElementDefaults(ceName='', ceType='', cfg=None, currentSectionPath=''):
  """
  Return cfgDefaults with defaults for the given CEs defined either in arguments or in the provided cfg
  """
  cesCfg = CFG()
  if cfg:
    try:
      cesCfg.loadFromFile(cfg)
      cesPath = cfgInstallPath('ComputingElements')
      if cesCfg.isSection(cesPath):
        for section in cfgPathToList(cesPath):
          cesCfg = cesCfg[section]
    except Exception:  # narrowed from a bare except: never swallow SystemExit/KeyboardInterrupt
      return CFG()

  # Overwrite the cfg with Command line arguments
  if ceName:
    if not cesCfg.isSection(ceName):
      cesCfg.createNewSection(ceName)
    if currentSectionPath:
      # Add Options from Command Line
      optionsDict = __getExtraOptions(currentSectionPath)
      for name, value in optionsDict.items():
        cesCfg[ceName].setOption(name, value)  # pylint: disable=no-member
    if ceType:
      cesCfg[ceName].setOption('CEType', ceType)  # pylint: disable=no-member

  ceDefaultSection = cfgPath(defaultSection('ComputingElements'))
  # Load Default for the given type from Central configuration is defined
  ceDefaults = __gConfigDefaults(ceDefaultSection)
  # renamed loop variable: the original reused and shadowed the ceName parameter
  for sectionName in cesCfg.listSections():
    if 'CEType' in cesCfg[sectionName]:
      sectionType = cesCfg[sectionName]['CEType']
      if sectionType in ceDefaults:
        for option in ceDefaults[sectionType].listOptions():
          if option not in cesCfg[sectionName]:
            cesCfg[sectionName].setOption(option, ceDefaults[sectionType][option])
  return cesCfg
def updateCompleteDiracCFG(self):
  """Read the dirac.cfg and update the Systems sections from the ConfigTemplate.cfg files."""
  baseCfgPath = self.config.cfg_baseFile
  if not os.path.exists(baseCfgPath):
    LOG.error('Failed to find Main Dirac cfg at %r', baseCfgPath)
    return 1

  LOG.info('Extracting default configuration from %r', baseCfgPath)
  baseCfg = CFG()
  baseCfg.loadFromFile(baseCfgPath)
  # start from the base dirac.cfg, then merge in the per-system templates
  compCfg = baseCfg.mergeWith(CFG())
  compCfg = compCfg.mergeWith(self.getSystemsCFG())

  outputPath = self.config.cfg_targetFile
  LOG.info('Writing output to %r', outputPath)
  with open(outputPath, 'w') as rst:
    rst.write(
        textwrap.dedent("""
                        ==========================
                        Full Configuration Example
                        ==========================

                        .. This file is created by docs/Tools/UpdateDiracCFG.py

                        Below is a complete example configuration with anotations for some sections::

                        """))
    # indent the cfg text
    cfgString = ''.join(' ' + line for line in str(compCfg).splitlines(True))
    # fix the links, add back the # for targets
    # match .html with following character using positive look ahead
    htmlPattern = re.compile(r'\.html(?=[a-zA-Z0-9])')
    rst.write(htmlPattern.sub('.html#', cfgString))
  return self.retVal
def getComputingElementDefaults(ceName="", ceType="", cfg=None, currentSectionPath=""): """ Return cfgDefaults with defaults for the given CEs defined either in arguments or in the provided cfg """ cesCfg = CFG() if cfg: try: cesCfg.loadFromFile(cfg) cesPath = cfgInstallPath("ComputingElements") if cesCfg.isSection(cesPath): for section in cfgPathToList(cesPath): cesCfg = cesCfg[section] except: return CFG() # Overwrite the cfg with Command line arguments if ceName: if not cesCfg.isSection(ceName): cesCfg.createNewSection(ceName) if currentSectionPath: # Add Options from Command Line optionsDict = __getExtraOptions(currentSectionPath) for name, value in optionsDict.items(): cesCfg[ceName].setOption(name, value) if ceType: cesCfg[ceName].setOption("CEType", ceType) ceDefaultSection = cfgPath(defaultSection("ComputingElements")) # Load Default for the given type from Central configuration is defined ceDefaults = __gConfigDefaults(ceDefaultSection) for ceName in cesCfg.listSections(): if "CEType" in cesCfg[ceName]: ceType = cesCfg[ceName]["CEType"] if ceType in ceDefaults: for option in ceDefaults[ceType].listOptions(): if option not in cesCfg[ceName]: cesCfg[ceName].setOption(option, ceDefaults[ceType][option]) return cesCfg
def getComputingElementDefaults(ceName='', ceType='', cfg=None, currentSectionPath=''):
  """
  Return cfgDefaults with defaults for the given CEs defined either in arguments or in the provided cfg
  """
  cesCfg = CFG()
  if cfg:
    try:
      cesCfg.loadFromFile(cfg)
      cesPath = cfgInstallPath('ComputingElements')
      if cesCfg.isSection(cesPath):
        for section in cfgPathToList(cesPath):
          cesCfg = cesCfg[section]
    except Exception:  # narrowed from BaseException: never swallow SystemExit/KeyboardInterrupt
      return CFG()

  # Overwrite the cfg with Command line arguments
  if ceName:
    if not cesCfg.isSection(ceName):
      cesCfg.createNewSection(ceName)
    if currentSectionPath:
      # Add Options from Command Line
      optionsDict = __getExtraOptions(currentSectionPath)
      for name, value in optionsDict.items():
        cesCfg[ceName].setOption(name, value)  # pylint: disable=no-member
    if ceType:
      cesCfg[ceName].setOption('CEType', ceType)  # pylint: disable=no-member

  ceDefaultSection = cfgPath(defaultSection('ComputingElements'))
  # Load Default for the given type from Central configuration is defined
  ceDefaults = __gConfigDefaults(ceDefaultSection)
  # renamed loop variable: the original reused and shadowed the ceName parameter
  for sectionName in cesCfg.listSections():
    if 'CEType' in cesCfg[sectionName]:
      sectionType = cesCfg[sectionName]['CEType']
      if sectionType in ceDefaults:
        for option in ceDefaults[sectionType].listOptions():  # pylint: disable=no-member
          if option not in cesCfg[sectionName]:
            cesCfg[sectionName].setOption(option, ceDefaults[sectionType][option])  # pylint: disable=unsubscriptable-object
  return cesCfg
# Build the local test configuration: use an explicitly supplied cfg file
# if given, otherwise probe the usual Jenkins installation locations.
localCfg = CFG()
if cFile:
  localConfigFile = cFile
else:
  print "WORKSPACE: %s" % os.path.expandvars('$WORKSPACE')
  if os.path.isfile( os.path.expandvars('$WORKSPACE')+'/PilotInstallDIR/etc/dirac.cfg' ):
    localConfigFile = os.path.expandvars('$WORKSPACE')+'/PilotInstallDIR/etc/dirac.cfg'
  elif os.path.isfile( os.path.expandvars('$WORKSPACE')+'/ServerInstallDIR/etc/dirac.cfg' ):
    localConfigFile = os.path.expandvars('$WORKSPACE')+'/ServerInstallDIR/etc/dirac.cfg'
  elif os.path.isfile( './etc/dirac.cfg' ):
    localConfigFile = './etc/dirac.cfg'
  else:
    # no cfg anywhere: cannot continue
    print "Local CFG file not found"
    exit( 2 )

localCfg.loadFromFile( localConfigFile )
# Pretend plenty of CPU time is left and use the user proxy rather than a
# server certificate for this test setup.
if not localCfg.isSection( '/LocalSite' ):
  localCfg.createNewSection( '/LocalSite' )
localCfg.setOption( '/LocalSite/CPUTimeLeft', 5000 )
localCfg.setOption( '/DIRAC/Security/UseServerCertificate', False )

# Fall back to CS values, then to hard-coded defaults, when setup/VO were not
# given on the command line.
# NOTE(review): nesting of these fallbacks reconstructed from flattened
# source -- confirm against the original script.
if not sMod:
  if not setup:
    setup = gConfig.getValue('/DIRAC/Setup')
  if not setup:
    setup = 'JenkinsSetup'
  if not vo:
    vo = gConfig.getValue('/DIRAC/VirtualOrganization')
  if not vo:
    vo = 'dirac'
class JobRepository(object):
  """CFG-backed local repository of job metadata (state, retrieval flag, output data)."""

  def __init__(self, repository=None):
    """Open (or create) the repository file.

    :param str repository: path to the repository file; defaults to
        $HOME/.dirac.repo.rep, or the same file in the CWD if HOME is unset
    """
    self.location = repository
    if not self.location:
      if "HOME" in os.environ:
        self.location = '%s/.dirac.repo.rep' % os.environ['HOME']
      else:
        self.location = '%s/.dirac.repo.rep' % os.getcwd()
    self.repo = CFG()
    if os.path.exists(self.location):
      self.repo.loadFromFile(self.location)
      if not self.repo.existsKey('Jobs'):
        self.repo.createNewSection('Jobs')
    else:
      self.repo.createNewSection('Jobs')
    self.OK = True
    written = self._writeRepository(self.location)
    if not written:
      self.OK = False

  def isOK(self):
    """Return True when the repository could be written at construction time."""
    return self.OK

  def readRepository(self):
    """Return S_OK with the Jobs section as a plain dict."""
    return S_OK(self.repo.getAsDict('Jobs'))

  def writeRepository(self, alternativePath=None):
    """Write the repository, optionally to *alternativePath* instead of the default location."""
    destination = self.location
    if alternativePath:
      destination = alternativePath
    written = self._writeRepository(destination)
    if not written:
      return S_ERROR("Failed to write repository")
    return S_OK(destination)

  def resetRepository(self, jobIDs=None):
    """Reset the given jobs (all jobs when none given) to the initial Submitted state.

    :param list jobIDs: job IDs to reset; defaults to every job in the repository
    """
    # Bug fix: the original used a mutable default argument (jobIDs=[]),
    # which is shared across calls; use None and build a fresh list.
    if not jobIDs:
      jobs = self.readRepository()['Value']
      jobIDs = list(jobs.keys())  # list() keeps this working on Python 3 as well
    paramDict = {'State': 'Submitted',
                 'Retrieved': 0,
                 'OutputData': 0}
    for jobID in jobIDs:
      self._writeJob(jobID, paramDict, True)
    self._writeRepository(self.location)
    return S_OK()

  def _writeRepository(self, path):
    """Write the CFG to *path* via a temporary file; return True on success."""
    handle, tmpName = tempfile.mkstemp()
    written = self.repo.writeToFile(tmpName)
    os.close(handle)
    if not written:
      if os.path.exists(tmpName):
        os.remove(tmpName)
      return written
    if os.path.exists(path):
      gLogger.debug("Replacing %s" % path)
    try:
      shutil.move(tmpName, path)
      return True
    except Exception as x:  # pylint: disable=broad-except
      gLogger.error("Failed to overwrite repository.", x)
      gLogger.info("If your repository is corrupted a backup can be found %s" % tmpName)
    return False

  def appendToRepository(self, repoLocation):
    """Merge the jobs of a secondary repository file into this one."""
    if not os.path.exists(repoLocation):
      gLogger.error("Secondary repository does not exist", repoLocation)
      return S_ERROR("Secondary repository does not exist")
    self.repo = CFG().loadFromFile(repoLocation).mergeWith(self.repo)
    self._writeRepository(self.location)
    return S_OK()

  def addJob(self, jobID, state='Submitted', retrieved=0, outputData=0, update=False):
    """Add one job entry; overwrite an existing one only when *update* is True."""
    paramDict = {'State': state,
                 'Time': self._getTime(),
                 'Retrieved': int(retrieved),
                 'OutputData': outputData}
    self._writeJob(jobID, paramDict, update)
    self._writeRepository(self.location)
    return S_OK(jobID)

  def updateJob(self, jobID, paramDict):
    """Update the parameters of an existing job (no-op for unknown jobs)."""
    if self._existsJob(jobID):
      paramDict['Time'] = self._getTime()
      self._writeJob(jobID, paramDict, True)
      self._writeRepository(self.location)
    return S_OK()

  def updateJobs(self, jobDict):
    """Update several jobs at once; the repository is written a single time."""
    for jobID, paramDict in jobDict.items():
      if self._existsJob(jobID):
        paramDict['Time'] = self._getTime()
        self._writeJob(jobID, paramDict, True)
    self._writeRepository(self.location)
    return S_OK()

  def _getTime(self):
    """Return the current time as a ctime string with underscores instead of spaces."""
    runtime = time.ctime()
    return runtime.replace(" ", "_")

  def _writeJob(self, jobID, paramDict, update):
    """Write one job section; refuse to overwrite unless *update* is True."""
    jobID = str(jobID)
    jobExists = self._existsJob(jobID)
    if jobExists and (not update):
      gLogger.warn("Job exists and not overwriting")
      return S_ERROR("Job exists and not overwriting")
    if not jobExists:
      self.repo.createNewSection('Jobs/%s' % jobID)
    for key, value in paramDict.items():
      self.repo.setOption('Jobs/%s/%s' % (jobID, key), value)
    return S_OK()

  def removeJob(self, jobID):
    """Delete a job entry and persist the repository when something was removed."""
    res = self.repo['Jobs'].deleteKey(str(jobID))  # pylint: disable=no-member
    if res:
      self._writeRepository(self.location)
    return S_OK()

  def existsJob(self, jobID):
    """Return S_OK(bool) stating whether the job is known."""
    return S_OK(self._existsJob(jobID))

  def _existsJob(self, jobID):
    """True when a section for *jobID* exists under Jobs."""
    return self.repo.isSection('Jobs/%s' % jobID)

  def getLocation(self):
    """Return S_OK with the repository file path."""
    return S_OK(self.location)

  def getSize(self):
    """Return S_OK with the number of jobs stored."""
    return S_OK(len(self.repo.getAsDict('Jobs')))
def execute(self):
    """The JobAgent execution method.

    One cycle: check time left (filling mode), query the CE for slots,
    request a job from the Matcher, validate the matcher reply and the JDL,
    then set up the proxy, install software and submit the payload.
    """
    if self.jobCount:
      # Temporary mechanism to pass a shutdown message to the agent
      if os.path.exists('/var/lib/dirac_drain'):
        return self.__finish('Node is being drained by an operator')
      # Only call timeLeft utility after a job has been picked up
      self.log.info('Attempting to check CPU time left for filling mode')
      if self.fillingMode:
        if self.timeLeftError:
          self.log.warn("Disabling filling mode as errors calculating time left",
                        self.timeLeftError)
          return self.__finish(self.timeLeftError)
        self.log.info('normalized CPU units remaining in slot', self.timeLeft)
        if self.timeLeft <= self.minimumTimeLeft:
          return self.__finish('No more time left')
        # Need to update the Configuration so that the new value is published in the next matching request
        result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
        if not result['OK']:
          return self.__finish(result['Message'])
        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
          localConfigFile = os.path.join('.', self.extraOptions)
        else:
          localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
        localCfg.loadFromFile(localConfigFile)
        if not localCfg.isSection('/LocalSite'):
          localCfg.createNewSection('/LocalSite')
        localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
        localCfg.writeToFile(localConfigFile)
      else:
        return self.__finish('Filling Mode is Disabled')

    self.log.verbose('Job Agent execution loop')

    # ask the CE how many slots are free before bothering the Matcher
    result = self.computingElement.available()
    if not result['OK']:
      self.log.info('Resource is not available', result['Message'])
      return self.__finish('CE Not Available')
    ceInfoDict = result['CEInfoDict']
    runningJobs = ceInfoDict.get("RunningJobs")
    availableSlots = result['Value']
    if not availableSlots:
      if runningJobs:
        self.log.info('No available slots', '%d running jobs' % runningJobs)
        return S_OK('Job Agent cycle complete with %d running jobs' % runningJobs)
      else:
        self.log.info('CE is not available')
        return self.__finish('CE Not Available')

    result = self.computingElement.getDescription()
    if not result['OK']:
      return result

    # We can have several prioritized job retrieval strategies
    if isinstance(result['Value'], dict):
      ceDictList = [result['Value']]
    elif isinstance(result['Value'], list):
      # This is the case for Pool ComputingElement, and parameter 'MultiProcessorStrategy'
      ceDictList = result['Value']

    # try each CE description in turn; stop at the first successful match
    for ceDict in ceDictList:
      # Add pilot information
      gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
      if gridCE != 'Unknown':
        ceDict['GridCE'] = gridCE
      if 'PilotReference' not in ceDict:
        ceDict['PilotReference'] = str(self.pilotReference)
      ceDict['PilotBenchmark'] = self.cpuFactor
      ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

      # Add possible job requirements
      result = gConfig.getOptionsDict('/AgentJobRequirements')
      if result['OK']:
        requirementsDict = result['Value']
        ceDict.update(requirementsDict)
        self.log.info('Requirements:', requirementsDict)

      self.log.verbose('CE dict', ceDict)

      # here finally calling the matcher
      start = time.time()
      jobRequest = MatcherClient().requestJob(ceDict)
      matchTime = time.time() - start
      self.log.info('MatcherTime', '= %.2f (s)' % (matchTime))
      if jobRequest['OK']:
        break

    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches',
                                                    self.stopAfterFailedMatches)

    if not jobRequest['OK']:
      # classify the matcher failure: no match / timeout / version mismatch / other
      if re.search('No match found', jobRequest['Message']):
        self.log.notice('Job request OK, but no match found',
                        ': %s' % (jobRequest['Message']))
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(jobRequest['Message'])
      elif jobRequest['Message'].find("seconds timeout") != -1:
        self.log.error('Timeout while requesting job', jobRequest['Message'])
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(jobRequest['Message'])
      elif jobRequest['Message'].find("Pilot version does not match") != -1:
        errorMsg = 'Pilot version does not match the production version'
        self.log.error(errorMsg, jobRequest['Message'].replace(errorMsg, ''))
        return S_ERROR(jobRequest['Message'])
      else:
        self.log.notice('Failed to get jobs',
                        ': %s' % (jobRequest['Message']))
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(jobRequest['Message'])

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    if not self.pilotInfoReportedFlag:
      # Check the flag after the first access to the Matcher
      self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    jobID = matcherInfo['JobID']
    matcherParams = ['JDL', 'DN', 'Group']
    # all three matcher parameters must be present and non-empty
    for param in matcherParams:
      if param not in matcherInfo:
        self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
        return self.__finish('Matcher Failed')
      elif not matcherInfo[param]:
        self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
        return self.__finish('Matcher Failed')
      else:
        self.log.verbose('Matcher returned',
                         '%s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    # everything the matcher returned beyond the three core params is
    # forwarded to the optimizers
    optimizerParams = {}
    for key in matcherInfo:
      if key not in matcherParams:
        optimizerParams[key] = matcherInfo[key]

    parameters = self._getJDLParameters(jobJDL)
    if not parameters['OK']:
      self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
      self.log.warn('Could Not Extract JDL Parameters', parameters['Message'])
      return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
      msg = 'Job has not JobID defined in JDL parameters'
      self.__report(jobID, 'Failed', msg)
      self.log.warn(msg)
      return self.__finish('JDL Problem')
    else:
      jobID = params['JobID']

    if 'JobType' not in params:
      self.log.warn('Job has no JobType defined in JDL parameters')
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if 'CPUTime' not in params:
      self.log.warn('Job has no CPU requirement defined in JDL parameters')

    # Job requirements for determining the number of processors
    # the minimum number of processors requested
    processors = int(params.get('NumberOfProcessors',
                                int(params.get('MinNumberOfProcessors', 1))))
    # the maximum number of processors allowed to the payload
    maxNumberOfProcessors = int(params.get('MaxNumberOfProcessors', 0))
    # need or not the whole node for the job
    wholeNode = 'WholeNode' in params
    mpTag = 'MultiProcessor' in params.get('Tags', [])

    if self.extraOptions:
      params['Arguments'] += ' ' + self.extraOptions
      params['ExtraOptions'] = self.extraOptions

    self.log.verbose('Job request successful: \n', jobRequest['Value'])
    self.log.info('Received',
                  'JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s' % (jobID, jobType, ownerDN, jobGroup))
    self.jobCount += 1
    try:
      jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
      jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)

      if 'BOINC_JOB_ID' in os.environ:
        # Report BOINC environment
        for thisp in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'):
          jobReport.setJobParameter(thisp,
                                    gConfig.getValue('/LocalSite/%s' % thisp, 'Unknown'),
                                    sendFlag=False)

      jobReport.setJobStatus('Matched', 'Job Received by Agent')
      result = self._setupProxy(ownerDN, jobGroup)
      if not result['OK']:
        return self._rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure)
      proxyChain = result.get('Value')

      # Save the job jdl for external monitoring
      self.__saveJobJDLRequest(jobID, jobJDL)

      software = self._checkInstallSoftware(jobID, params, ceDict)
      if not software['OK']:
        self.log.error('Failed to install software for job', '%s' % (jobID))
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self._rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

      self.log.debug('Before self._submitJob() (%sCE)' % (self.ceName))
      result = self._submitJob(jobID, params, ceDict, optimizerParams, proxyChain,
                               processors, wholeNode, maxNumberOfProcessors, mpTag)
      if not result['OK']:
        self.__report(jobID, 'Failed', result['Message'])
        return self.__finish(result['Message'])
      elif 'PayloadFailed' in result:
        # Do not keep running and do not overwrite the Payload error
        message = 'Payload execution failed with error code %s' % result['PayloadFailed']
        if self.stopOnApplicationFailure:
          return self.__finish(message, self.stopOnApplicationFailure)
        else:
          self.log.info(message)

      self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception as subExcept:  # pylint: disable=broad-except
      self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True)
      return self._rescheduleFailedJob(jobID, 'Job processing failed with exception',
                                       self.stopOnApplicationFailure)

    # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
    cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])
    result = self.timeLeftUtil.getTimeLeft(cpuTime, processors)
    if result['OK']:
      self.timeLeft = result['Value']
    else:
      if result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
      else:
        # if the batch system is not defined, use the process time and the CPU normalization defined locally
        self.timeLeft = self._getCPUTimeLeft()

    return S_OK('Job Agent cycle complete')
mjfPower = getPowerFromMJF() if mjfPower: gLogger.notice( 'CPU power from MJF is %.1f HS06' % mjfPower ) else: gLogger.notice( 'MJF not available on this node' ) if update and not configFile: gConfig.setOptionValue( '/LocalSite/CPUScalingFactor', mjfPower if mjfPower else norm ) gConfig.setOptionValue( '/LocalSite/CPUNormalizationFactor', norm ) gConfig.dumpLocalCFGToFile( gConfig.diracConfigFilePath ) if configFile: from DIRAC.Core.Utilities.CFG import CFG cfg = CFG() try: # Attempt to open the given file cfg.loadFromFile( configFile ) except: pass # Create the section if it does not exist if not cfg.existsKey( 'LocalSite' ): cfg.createNewSection( 'LocalSite' ) cfg.setOption( '/LocalSite/CPUScalingFactor', mjfPower if mjfPower else norm ) cfg.setOption( '/LocalSite/CPUNormalizationFactor', norm ) cfg.writeToFile( configFile ) DIRAC.exit()
os.path.expandvars('$WORKSPACE') + '/PilotInstallDIR/etc/dirac.cfg'): localConfigFile = os.path.expandvars( '$WORKSPACE') + '/PilotInstallDIR/etc/dirac.cfg' elif os.path.isfile( os.path.expandvars('$WORKSPACE') + '/ServerInstallDIR/etc/dirac.cfg'): localConfigFile = os.path.expandvars( '$WORKSPACE') + '/ServerInstallDIR/etc/dirac.cfg' elif os.path.isfile('./etc/dirac.cfg'): localConfigFile = './etc/dirac.cfg' else: print "Local CFG file not found" exit(2) localCfg.loadFromFile(localConfigFile) if not localCfg.isSection('/LocalSite'): localCfg.createNewSection('/LocalSite') localCfg.setOption('/LocalSite/CPUTimeLeft', 5000) localCfg.setOption('/DIRAC/Security/UseServerCertificate', False) if not sMod: if not setup: setup = gConfig.getValue('/DIRAC/Setup') if not setup: setup = 'dirac-JenkinsSetup' if not vo: vo = gConfig.getValue('/DIRAC/VirtualOrganization') if not vo: vo = 'dirac'
def execute(self):
    """The JobAgent execution method.

    One agent cycle: in filling mode check the remaining time budget,
    verify the local computing element is available, request a job from
    the Matcher, set up proxy and software, submit the payload and
    finally update the time-left bookkeeping.

    :returns: S_OK / S_ERROR result dictionary
    """
    if self.jobCount:
        # Only call timeLeft utility after a job has been picked up
        self.log.info('Attempting to check CPU time left for filling mode')
        if self.fillingMode:
            if self.timeLeftError:
                self.log.warn(self.timeLeftError)
                return self.__finish(self.timeLeftError)
            self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft))
            if self.timeLeft <= self.minimumTimeLeft:
                return self.__finish('No more time left')
            # Need to update the Configuration so that the new value is published in the next matching request
            result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
            if not result['OK']:
                return self.__finish(result['Message'])
            # Update local configuration to be used by submitted job wrappers
            localCfg = CFG()
            if self.extraOptions:
                localConfigFile = os.path.join('.', self.extraOptions)
            else:
                localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
            localCfg.loadFromFile(localConfigFile)
            if not localCfg.isSection('/LocalSite'):
                localCfg.createNewSection('/LocalSite')
            localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
            localCfg.writeToFile(localConfigFile)
        else:
            return self.__finish('Filling Mode is Disabled')
    self.log.verbose('Job Agent execution loop')
    # Check that the computing element can accept a payload at all
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
        self.log.info('Resource is not available')
        self.log.info(available['Message'])
        return self.__finish('CE Not Available')
    self.log.info(available['Message'])
    result = self.computingElement.getDescription()
    if not result['OK']:
        return result
    ceDict = result['Value']
    # Add pilot information
    gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
    if gridCE != 'Unknown':
        ceDict['GridCE'] = gridCE
    if not 'PilotReference' in ceDict:
        ceDict['PilotReference'] = str(self.pilotReference)
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag
    # Add possible job requirements
    result = gConfig.getOptionsDict('/AgentJobRequirements')
    if result['OK']:
        requirementsDict = result['Value']
        ceDict.update(requirementsDict)
    self.log.verbose(ceDict)
    # Ask the Matcher for a job matching this resource description
    start = time.time()
    jobRequest = self.__requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info('MatcherTime = %.2f (s)' % (matchTime))
    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', self.stopAfterFailedMatches)
    if not jobRequest['OK']:
        # "No match" and timeouts are not fatal: count them and retry next cycle
        if re.search('No match found', jobRequest['Message']):
            self.log.notice('Job request OK: %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("seconds timeout") != -1:
            self.log.error('Timeout while requesting job', jobRequest['Message'])
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("Pilot version does not match") != -1:
            errorMsg = 'Pilot version does not match the production version'
            self.log.error(errorMsg, jobRequest['Message'].replace(errorMsg, ''))
            return S_ERROR(jobRequest['Message'])
        else:
            self.log.notice('Failed to get jobs: %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
    # Reset the Counter
    self.matchFailedCount = 0
    matcherInfo = jobRequest['Value']
    if not self.pilotInfoReportedFlag:
        # Check the flag after the first access to the Matcher
        self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    jobID = matcherInfo['JobID']
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
        if param not in matcherInfo:
            self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
            return self.__finish('Matcher Failed')
        elif not matcherInfo[param]:
            self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
            return self.__finish('Matcher Failed')
        else:
            self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param]))
    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']
    # Everything else returned by the Matcher is passed on to the optimizers
    optimizerParams = {}
    for key in matcherInfo:
        if key not in matcherParams:
            optimizerParams[key] = matcherInfo[key]
    parameters = self.__getJDLParameters(jobJDL)
    if not parameters['OK']:
        self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
        self.log.warn(parameters['Message'])
        return self.__finish('JDL Problem')
    params = parameters['Value']
    if 'JobID' not in params:
        msg = 'Job has not JobID defined in JDL parameters'
        self.__report(jobID, 'Failed', msg)
        self.log.warn(msg)
        return self.__finish('JDL Problem')
    else:
        jobID = params['JobID']
    if 'JobType' not in params:
        self.log.warn('Job has no JobType defined in JDL parameters')
        jobType = 'Unknown'
    else:
        jobType = params['JobType']
    if 'CPUTime' not in params:
        self.log.warn('Job has no CPU requirement defined in JDL parameters')
    if self.extraOptions:
        params['Arguments'] += ' ' + self.extraOptions
        params['ExtraOptions'] = self.extraOptions
    self.log.verbose('Job request successful: \n', jobRequest['Value'])
    self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
    self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
        jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)
        if 'BOINC_JOB_ID' in os.environ:
            # Report BOINC environment
            for p in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'):
                jobReport.setJobParameter(p, gConfig.getValue('/LocalSite/%s' % p, 'Unknown'), sendFlag=False)
        jobReport.setJobStatus('Matched', 'Job Received by Agent')
        result = self.__setupProxy(ownerDN, jobGroup)
        if not result['OK']:
            return self.__rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure)
        proxyChain = result.get('Value')
        # Save the job jdl for external monitoring
        self.__saveJobJDLRequest(jobID, jobJDL)
        software = self.__checkInstallSoftware(jobID, params, ceDict)
        if not software['OK']:
            self.log.error('Failed to install software for job', '%s' % (jobID))
            errorMsg = software['Message']
            if not errorMsg:
                errorMsg = 'Failed software installation'
            return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)
        self.log.debug('Before %sCE submitJob()' % (self.ceName))
        submission = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain)
        if not submission['OK']:
            self.__report(jobID, 'Failed', submission['Message'])
            return self.__finish(submission['Message'])
        elif 'PayloadFailed' in submission:
            # Do not keep running and do not overwrite the Payload error
            message = 'Payload execution failed with error code %s' % submission['PayloadFailed']
            if self.stopOnApplicationFailure:
                return self.__finish(message, self.stopOnApplicationFailure)
            else:
                self.log.info(message)
        self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception:
        self.log.exception()
        return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception', self.stopOnApplicationFailure)
    # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
    cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])
    result = self.timeLeftUtil.getTimeLeft(cpuTime)
    if result['OK']:
        self.timeLeft = result['Value']
    else:
        if result['Message'] != 'Current batch system is not supported':
            self.timeLeftError = result['Message']
        else:
            # if the batch system is not defined, use the process time and the CPU normalization defined locally
            self.timeLeft = self.__getCPUTimeLeft()
    # Publish the scaled CPU consumed since the previous cycle as a job parameter
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()
    self.__setJobParam(jobID, 'ScaledCPUTime', str(scaledCPUTime - self.scaledCPUTime))
    self.scaledCPUTime = scaledCPUTime
    return S_OK('Job Agent cycle complete')
def execute(self):
    """The JobAgent execution method.

    One agent cycle: honour the operator drain file, check the remaining
    time budget in filling mode, verify slot availability on the local
    computing element, request a job from the Matcher service, set up
    proxy and software, submit the payload (with its processor/whole-node
    requirements) and update the time-left bookkeeping.

    :returns: S_OK / S_ERROR result dictionary
    """
    if self.jobCount:
        # Temporary mechanism to pass a shutdown message to the agent
        if os.path.exists('/var/lib/dirac_drain'):
            return self.__finish('Node is being drained by an operator')
        # Only call timeLeft utility after a job has been picked up
        self.log.info('Attempting to check CPU time left for filling mode')
        if self.fillingMode:
            if self.timeLeftError:
                self.log.warn(self.timeLeftError)
                return self.__finish(self.timeLeftError)
            self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft))
            if self.timeLeft <= self.minimumTimeLeft:
                return self.__finish('No more time left')
            # Need to update the Configuration so that the new value is published in the next matching request
            result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
            if not result['OK']:
                return self.__finish(result['Message'])
            # Update local configuration to be used by submitted job wrappers
            localCfg = CFG()
            if self.extraOptions:
                localConfigFile = os.path.join('.', self.extraOptions)
            else:
                localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
            localCfg.loadFromFile(localConfigFile)
            if not localCfg.isSection('/LocalSite'):
                localCfg.createNewSection('/LocalSite')
            localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
            localCfg.writeToFile(localConfigFile)
        else:
            return self.__finish('Filling Mode is Disabled')
    self.log.verbose('Job Agent execution loop')
    # Check slot availability on the computing element
    result = self.computingElement.available()
    if not result['OK']:
        self.log.info('Resource is not available')
        self.log.info(result['Message'])
        return self.__finish('CE Not Available')
    self.log.info(result['Message'])
    ceInfoDict = result['CEInfoDict']
    runningJobs = ceInfoDict.get("RunningJobs")
    availableSlots = result['Value']
    if not availableSlots:
        if runningJobs:
            # Slots are full but payloads are still running: not an error
            self.log.info('No available slots with %d running jobs' % runningJobs)
            return S_OK('Job Agent cycle complete with %d running jobs' % runningJobs)
        else:
            self.log.info('CE is not available')
            return self.__finish('CE Not Available')
    result = self.computingElement.getDescription()
    if not result['OK']:
        return result
    ceDict = result['Value']
    # Add pilot information
    gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
    if gridCE != 'Unknown':
        ceDict['GridCE'] = gridCE
    if 'PilotReference' not in ceDict:
        ceDict['PilotReference'] = str(self.pilotReference)
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag
    # Add possible job requirements
    result = gConfig.getOptionsDict('/AgentJobRequirements')
    if result['OK']:
        requirementsDict = result['Value']
        ceDict.update(requirementsDict)
        self.log.info('Requirements:', requirementsDict)
    self.log.verbose(ceDict)
    # Ask the Matcher for a job matching this resource description
    start = time.time()
    jobRequest = MatcherClient().requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info('MatcherTime = %.2f (s)' % (matchTime))
    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', self.stopAfterFailedMatches)
    if not jobRequest['OK']:
        # "No match" and timeouts are not fatal: count them and retry next cycle
        if re.search('No match found', jobRequest['Message']):
            self.log.notice('Job request OK: %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("seconds timeout") != -1:
            self.log.error('Timeout while requesting job', jobRequest['Message'])
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("Pilot version does not match") != -1:
            errorMsg = 'Pilot version does not match the production version'
            self.log.error(errorMsg, jobRequest['Message'].replace(errorMsg, ''))
            return S_ERROR(jobRequest['Message'])
        else:
            self.log.notice('Failed to get jobs: %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
    # Reset the Counter
    self.matchFailedCount = 0
    matcherInfo = jobRequest['Value']
    if not self.pilotInfoReportedFlag:
        # Check the flag after the first access to the Matcher
        self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    jobID = matcherInfo['JobID']
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
        if param not in matcherInfo:
            self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
            return self.__finish('Matcher Failed')
        elif not matcherInfo[param]:
            self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
            return self.__finish('Matcher Failed')
        else:
            self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param]))
    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']
    # Everything else returned by the Matcher is passed on to the optimizers
    optimizerParams = {}
    for key in matcherInfo:
        if key not in matcherParams:
            optimizerParams[key] = matcherInfo[key]
    parameters = self.__getJDLParameters(jobJDL)
    if not parameters['OK']:
        self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
        self.log.warn(parameters['Message'])
        return self.__finish('JDL Problem')
    params = parameters['Value']
    if 'JobID' not in params:
        msg = 'Job has not JobID defined in JDL parameters'
        self.__report(jobID, 'Failed', msg)
        self.log.warn(msg)
        return self.__finish('JDL Problem')
    else:
        jobID = params['JobID']
    if 'JobType' not in params:
        self.log.warn('Job has no JobType defined in JDL parameters')
        jobType = 'Unknown'
    else:
        jobType = params['JobType']
    if 'CPUTime' not in params:
        self.log.warn('Job has no CPU requirement defined in JDL parameters')
    # Job requirement for a number of processors
    processors = int(params.get('NumberOfProcessors', 1))
    wholeNode = 'WholeNode' in params
    if self.extraOptions:
        params['Arguments'] += ' ' + self.extraOptions
        params['ExtraOptions'] = self.extraOptions
    self.log.verbose('Job request successful: \n', jobRequest['Value'])
    self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
    self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
        jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)
        if 'BOINC_JOB_ID' in os.environ:
            # Report BOINC environment
            for thisp in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'):
                jobReport.setJobParameter(thisp, gConfig.getValue('/LocalSite/%s' % thisp, 'Unknown'), sendFlag=False)
        jobReport.setJobStatus('Matched', 'Job Received by Agent')
        result = self.__setupProxy(ownerDN, jobGroup)
        if not result['OK']:
            return self.__rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure)
        proxyChain = result.get('Value')
        # Save the job jdl for external monitoring
        self.__saveJobJDLRequest(jobID, jobJDL)
        software = self.__checkInstallSoftware(jobID, params, ceDict)
        if not software['OK']:
            self.log.error('Failed to install software for job', '%s' % (jobID))
            errorMsg = software['Message']
            if not errorMsg:
                errorMsg = 'Failed software installation'
            return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)
        self.log.debug('Before %sCE submitJob()' % (self.ceName))
        result = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain, processors, wholeNode)
        if not result['OK']:
            self.__report(jobID, 'Failed', result['Message'])
            return self.__finish(result['Message'])
        elif 'PayloadFailed' in result:
            # Do not keep running and do not overwrite the Payload error
            message = 'Payload execution failed with error code %s' % result['PayloadFailed']
            if self.stopOnApplicationFailure:
                return self.__finish(message, self.stopOnApplicationFailure)
            else:
                self.log.info(message)
        self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception as subExcept:  # pylint: disable=broad-except
        self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True)
        return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception', self.stopOnApplicationFailure)
    # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
    cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])
    result = self.timeLeftUtil.getTimeLeft(cpuTime, processors)
    if result['OK']:
        self.timeLeft = result['Value']
    else:
        if result['Message'] != 'Current batch system is not supported':
            self.timeLeftError = result['Message']
        else:
            # if the batch system is not defined, use the process time and the CPU normalization defined locally
            self.timeLeft = self.__getCPUTimeLeft()
    return S_OK('Job Agent cycle complete')
def execute(self):
    """The JobAgent execution method.

    One agent cycle: check the remaining time budget in filling mode,
    verify the local computing element is available, request a job from
    the Matcher, resolve the job SystemConfig, set up proxy and software,
    submit the payload and update the time-left bookkeeping.

    Note: this version uses Python 2 idioms (has_key) throughout.

    :returns: S_OK / S_ERROR result dictionary
    """
    if self.jobCount:
        #Only call timeLeft utility after a job has been picked up
        self.log.info('Attempting to check CPU time left for filling mode')
        if self.fillingMode:
            if self.timeLeftError:
                self.log.warn(self.timeLeftError)
                return self.__finish(self.timeLeftError)
            self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft))
            # Need to update the Configuration so that the new value is published in the next matching request
            result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
            if not result['OK']:
                return self.__finish(result['Message'])
            # Update local configuration to be used by submitted job wrappers
            localCfg = CFG()
            localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
            localCfg.loadFromFile(localConfigFile)
            if not localCfg.isSection('/LocalSite'):
                localCfg.createNewSection('/LocalSite')
            localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
            localCfg.writeToFile(localConfigFile)
        else:
            return self.__finish('Filling Mode is Disabled')
    self.log.verbose('Job Agent execution loop')
    # Check that the computing element can accept a payload
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
        self.log.info('Resource is not available')
        self.log.info(available['Message'])
        return self.__finish('CE Not Available')
    self.log.info(available['Message'])
    result = self.computingElement.getDescription()
    if not result['OK']:
        return result
    ceDict = result['Value']
    # Add pilot information
    gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
    if gridCE != 'Unknown':
        ceDict['GridCE'] = gridCE
    if not 'PilotReference' in ceDict:
        ceDict['PilotReference'] = str(self.pilotReference)
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag
    # Add possible job requirements
    result = gConfig.getOptionsDict('/AgentJobRequirements')
    if result['OK']:
        requirementsDict = result['Value']
        ceDict.update(requirementsDict)
    self.log.verbose(ceDict)
    # Ask the Matcher for a job matching this resource description
    start = time.time()
    jobRequest = self.__requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info('MatcherTime = %.2f (s)' % (matchTime))
    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', self.stopAfterFailedMatches)
    if not jobRequest['OK']:
        # "No match" and timeouts are not fatal: count them and retry next cycle
        if re.search('No match found', jobRequest['Message']):
            self.log.notice('Job request OK: %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("seconds timeout") != -1:
            self.log.error(jobRequest['Message'])
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("Pilot version does not match") != -1:
            self.log.error(jobRequest['Message'])
            return S_ERROR(jobRequest['Message'])
        else:
            self.log.notice('Failed to get jobs: %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
    # Reset the Counter
    self.matchFailedCount = 0
    matcherInfo = jobRequest['Value']
    jobID = matcherInfo['JobID']
    self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
        if not matcherInfo.has_key(param):
            self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
            return self.__finish('Matcher Failed')
        elif not matcherInfo[param]:
            self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
            return self.__finish('Matcher Failed')
        else:
            self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param]))
    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']
    # Everything else returned by the Matcher is passed on to the optimizers
    optimizerParams = {}
    for key in matcherInfo.keys():
        if not key in matcherParams:
            value = matcherInfo[key]
            optimizerParams[key] = value
    parameters = self.__getJDLParameters(jobJDL)
    if not parameters['OK']:
        self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
        self.log.warn(parameters['Message'])
        return self.__finish('JDL Problem')
    params = parameters['Value']
    if not params.has_key('JobID'):
        msg = 'Job has not JobID defined in JDL parameters'
        self.__report(jobID, 'Failed', msg)
        self.log.warn(msg)
        return self.__finish('JDL Problem')
    else:
        jobID = params['JobID']
    if not params.has_key('JobType'):
        self.log.warn('Job has no JobType defined in JDL parameters')
        jobType = 'Unknown'
    else:
        jobType = params['JobType']
    # Resolve the platform (SystemConfig): fall back to the local
    # architecture when missing or set to "ANY"
    if not params.has_key('SystemConfig'):
        self.log.warn('Job has no system configuration defined in JDL parameters')
        systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
        self.log.info('Setting system config to /LocalSite/Architecture = %s since it was not specified' % systemConfig)
        if not systemConfig:
            self.log.warn('/LocalSite/Architecture is not defined')
        params['SystemConfig'] = systemConfig
    else:
        systemConfig = params['SystemConfig']
        if systemConfig.lower() == 'any':
            systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
            self.log.info('Setting SystemConfig = /LocalSite/Architecture =',
                          '"%s" since it was set to "ANY" in the job description' % systemConfig)
            if not systemConfig:
                self.log.warn('/LocalSite/Architecture is not defined')
            params['SystemConfig'] = systemConfig
    if not params.has_key('CPUTime'):
        self.log.warn('Job has no CPU requirement defined in JDL parameters')
    self.log.verbose('Job request successful: \n %s' % (jobRequest['Value']))
    self.log.info('Received JobID=%s, JobType=%s, SystemConfig=%s' % (jobID, jobType, systemConfig))
    self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
        jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)
        if self.gridCEQueue:
            jobReport.setJobParameter('GridCEQueue', self.gridCEQueue, sendFlag=False)
        if os.environ.has_key('BOINC_JOB_ID'):
            # Report BOINC environment
            for p in [
                    'BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'
            ]:
                jobReport.setJobParameter(p, gConfig.getValue('/LocalSite/%s' % p, 'Unknown'), sendFlag=False)
        jobReport.setJobStatus('Matched', 'Job Received by Agent')
        # self.__setJobSite( jobID, self.siteName )
        if not self.pilotInfoReportedFlag:
            self.__reportPilotInfo(jobID)
        result = self.__setupProxy(ownerDN, jobGroup)
        if not result['OK']:
            return self.__rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure)
        if 'Value' in result and result['Value']:
            proxyChain = result['Value']
        # Is this necessary at all?
        saveJDL = self.__saveJobJDLRequest(jobID, jobJDL)
        #self.__report(jobID,'Matched','Job Prepared to Submit')
        #resourceParameters = self.__getJDLParameters( resourceJDL )
        #if not resourceParameters['OK']:
        #  return resourceParameters
        #resourceParams = resourceParameters['Value']
        software = self.__checkInstallSoftware(jobID, params, ceDict)
        if not software['OK']:
            self.log.error('Failed to install software for job %s' % (jobID))
            errorMsg = software['Message']
            if not errorMsg:
                errorMsg = 'Failed software installation'
            return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)
        self.log.verbose('Before %sCE submitJob()' % (self.ceName))
        submission = self.__submitJob(jobID, params, ceDict, optimizerParams, jobJDL, proxyChain)
        if not submission['OK']:
            self.__report(jobID, 'Failed', submission['Message'])
            return self.__finish(submission['Message'])
        elif 'PayloadFailed' in submission:
            # Do not keep running and do not overwrite the Payload error
            return self.__finish('Payload execution failed with error code %s' % submission['PayloadFailed'],
                                 self.stopOnApplicationFailure)
        self.log.verbose('After %sCE submitJob()' % (self.ceName))
    except Exception:
        self.log.exception()
        return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception', self.stopOnApplicationFailure)
    # CPU consumed since agent start: user+system+children, excluding elapsed wall clock
    currentTimes = list(os.times())
    for i in range(len(currentTimes)):
        currentTimes[i] -= self.initTimes[i]
    utime, stime, cutime, cstime, elapsed = currentTimes
    cpuTime = utime + stime + cutime + cstime
    result = self.timeLeftUtil.getTimeLeft(cpuTime)
    if result['OK']:
        self.timeLeft = result['Value']
    else:
        if result['Message'] != 'Current batch system is not supported':
            self.timeLeftError = result['Message']
        else:
            if self.cpuFactor:
                # if the batch system is not defined used the CPUNormalizationFactor
                # defined locally
                self.timeLeft = self.__getCPUTimeLeft()
    # Publish the scaled CPU consumed since the previous cycle as a job parameter
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']
    self.__setJobParam(jobID, 'ScaledCPUTime', str(scaledCPUTime - self.scaledCPUTime))
    self.scaledCPUTime = scaledCPUTime
    return S_OK('Job Agent cycle complete')
# $HeadURL$ __RCSID__ = "$Id$" from dirac import DIRAC from DIRAC.Core.Utilities.CFG import CFG DIRAC.gLogger.initialize('test_gConfig', '/testSectionDebug') testconfig = '%s/DIRAC/ConfigurationSystem/test/test.cfg' % DIRAC.rootPath dumpconfig = '%s/DIRAC/ConfigurationSystem/test/dump.cfg' % DIRAC.rootPath cfg1 = CFG() cfg1.loadFromFile(testconfig) fd = file(testconfig) cfg1String = fd.read() fd.close() cfg2 = CFG() cfg2.loadFromBuffer(cfg1.serialize()) cfg3 = cfg1.mergeWith(cfg2) testList = [{ 'method': DIRAC.gConfig.loadFile, 'arguments': (testconfig, ), 'output': { 'OK': True, 'Value': '' } }, {
def execute( self ):
  """The JobAgent execution method.

  One agent cycle: check the remaining time budget in filling mode,
  verify the local computing element is available, request a job from
  the Matcher, set up proxy and software, submit the payload and update
  the time-left bookkeeping.

  Note: this version uses Python 2 idioms (has_key) throughout.

  :returns: S_OK / S_ERROR result dictionary
  """
  if self.jobCount:
    # Only call timeLeft utility after a job has been picked up
    self.log.info( 'Attempting to check CPU time left for filling mode' )
    if self.fillingMode:
      if self.timeLeftError:
        self.log.warn( self.timeLeftError )
        return self.__finish( self.timeLeftError )
      self.log.info( '%s normalized CPU units remaining in slot' % ( self.timeLeft ) )
      # Need to update the Configuration so that the new value is published in the next matching request
      result = self.computingElement.setCPUTimeLeft( cpuTimeLeft = self.timeLeft )
      if not result['OK']:
        return self.__finish( result['Message'] )
      # Update local configuration to be used by submitted job wrappers
      localCfg = CFG()
      if self.extraOptions:
        localConfigFile = os.path.join( '.', self.extraOptions )
      else:
        localConfigFile = os.path.join( rootPath, "etc", "dirac.cfg" )
      localCfg.loadFromFile( localConfigFile )
      if not localCfg.isSection('/LocalSite'):
        localCfg.createNewSection('/LocalSite')
      localCfg.setOption( '/LocalSite/CPUTimeLeft', self.timeLeft )
      localCfg.writeToFile( localConfigFile )
    else:
      return self.__finish( 'Filling Mode is Disabled' )
  self.log.verbose( 'Job Agent execution loop' )
  # Check that the computing element can accept a payload
  available = self.computingElement.available()
  if not available['OK'] or not available['Value']:
    self.log.info( 'Resource is not available' )
    self.log.info( available['Message'] )
    return self.__finish( 'CE Not Available' )
  self.log.info( available['Message'] )
  result = self.computingElement.getDescription()
  if not result['OK']:
    return result
  ceDict = result['Value']
  # Add pilot information
  gridCE = gConfig.getValue( 'LocalSite/GridCE', 'Unknown' )
  if gridCE != 'Unknown':
    ceDict['GridCE'] = gridCE
  if not 'PilotReference' in ceDict:
    ceDict['PilotReference'] = str( self.pilotReference )
  ceDict['PilotBenchmark'] = self.cpuFactor
  ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag
  # Add possible job requirements
  result = gConfig.getOptionsDict( '/AgentJobRequirements' )
  if result['OK']:
    requirementsDict = result['Value']
    ceDict.update( requirementsDict )
  self.log.verbose( ceDict )
  # Ask the Matcher for a job matching this resource description
  start = time.time()
  jobRequest = self.__requestJob( ceDict )
  matchTime = time.time() - start
  self.log.info( 'MatcherTime = %.2f (s)' % ( matchTime ) )
  self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches )
  if not jobRequest['OK']:
    # "No match" and timeouts are not fatal: count them and retry next cycle
    if re.search( 'No match found', jobRequest['Message'] ):
      self.log.notice( 'Job request OK: %s' % ( jobRequest['Message'] ) )
      self.matchFailedCount += 1
      if self.matchFailedCount > self.stopAfterFailedMatches:
        return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
      return S_OK( jobRequest['Message'] )
    elif jobRequest['Message'].find( "seconds timeout" ) != -1:
      self.log.error( jobRequest['Message'] )
      self.matchFailedCount += 1
      if self.matchFailedCount > self.stopAfterFailedMatches:
        return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
      return S_OK( jobRequest['Message'] )
    elif jobRequest['Message'].find( "Pilot version does not match" ) != -1 :
      self.log.error( jobRequest['Message'] )
      return S_ERROR( jobRequest['Message'] )
    else:
      self.log.notice( 'Failed to get jobs: %s' % ( jobRequest['Message'] ) )
      self.matchFailedCount += 1
      if self.matchFailedCount > self.stopAfterFailedMatches:
        return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
      return S_OK( jobRequest['Message'] )
  # Reset the Counter
  self.matchFailedCount = 0
  matcherInfo = jobRequest['Value']
  jobID = matcherInfo['JobID']
  if not self.pilotInfoReportedFlag:
    # Check the flag after the first access to the Matcher
    self.pilotInfoReportedFlag = matcherInfo.get( 'PilotInfoReportedFlag', False )
  matcherParams = ['JDL', 'DN', 'Group']
  for param in matcherParams:
    if not matcherInfo.has_key( param ):
      self.__report( jobID, 'Failed', 'Matcher did not return %s' % ( param ) )
      return self.__finish( 'Matcher Failed' )
    elif not matcherInfo[param]:
      self.__report( jobID, 'Failed', 'Matcher returned null %s' % ( param ) )
      return self.__finish( 'Matcher Failed' )
    else:
      self.log.verbose( 'Matcher returned %s = %s ' % ( param, matcherInfo[param] ) )
  jobJDL = matcherInfo['JDL']
  jobGroup = matcherInfo['Group']
  ownerDN = matcherInfo['DN']
  # Everything else returned by the Matcher is passed on to the optimizers
  optimizerParams = {}
  for key in matcherInfo.keys():
    if not key in matcherParams:
      value = matcherInfo[key]
      optimizerParams[key] = value
  parameters = self.__getJDLParameters( jobJDL )
  if not parameters['OK']:
    self.__report( jobID, 'Failed', 'Could Not Extract JDL Parameters' )
    self.log.warn( parameters['Message'] )
    return self.__finish( 'JDL Problem' )
  params = parameters['Value']
  if not params.has_key( 'JobID' ):
    msg = 'Job has not JobID defined in JDL parameters'
    self.__report( jobID, 'Failed', msg )
    self.log.warn( msg )
    return self.__finish( 'JDL Problem' )
  else:
    jobID = params['JobID']
  if not params.has_key( 'JobType' ):
    self.log.warn( 'Job has no JobType defined in JDL parameters' )
    jobType = 'Unknown'
  else:
    jobType = params['JobType']
  if not params.has_key( 'CPUTime' ):
    self.log.warn( 'Job has no CPU requirement defined in JDL parameters' )
  if self.extraOptions:
    params['Arguments'] = params['Arguments'] + ' ' + self.extraOptions
    params['ExtraOptions'] = self.extraOptions
  self.log.verbose( 'Job request successful: \n %s' % ( jobRequest['Value'] ) )
  self.log.info( 'Received JobID=%s, JobType=%s' % ( jobID, jobType ) )
  self.log.info( 'OwnerDN: %s JobGroup: %s' % ( ownerDN, jobGroup ) )
  self.jobCount += 1
  try:
    jobReport = JobReport( jobID, 'JobAgent@%s' % self.siteName )
    jobReport.setJobParameter( 'MatcherServiceTime', str( matchTime ), sendFlag = False )
    if os.environ.has_key( 'BOINC_JOB_ID' ):
      # Report BOINC environment
      for p in ['BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName']:
        jobReport.setJobParameter( p, gConfig.getValue( '/LocalSite/%s' % p, 'Unknown' ), sendFlag = False )
    jobReport.setJobStatus( 'Matched', 'Job Received by Agent' )
    result = self.__setupProxy( ownerDN, jobGroup )
    if not result[ 'OK' ]:
      return self.__rescheduleFailedJob( jobID, result[ 'Message' ], self.stopOnApplicationFailure )
    if 'Value' in result and result[ 'Value' ]:
      proxyChain = result[ 'Value' ]
    # Save the job jdl for external monitoring
    self.__saveJobJDLRequest( jobID, jobJDL )
    software = self.__checkInstallSoftware( jobID, params, ceDict )
    if not software['OK']:
      self.log.error( 'Failed to install software for job %s' % ( jobID ) )
      errorMsg = software['Message']
      if not errorMsg:
        errorMsg = 'Failed software installation'
      return self.__rescheduleFailedJob( jobID, errorMsg, self.stopOnApplicationFailure )
    self.log.debug( 'Before %sCE submitJob()' % ( self.ceName ) )
    submission = self.__submitJob( jobID, params, ceDict, optimizerParams, proxyChain )
    if not submission['OK']:
      self.__report( jobID, 'Failed', submission['Message'] )
      return self.__finish( submission['Message'] )
    elif 'PayloadFailed' in submission:
      # Do not keep running and do not overwrite the Payload error
      return self.__finish( 'Payload execution failed with error code %s' % submission['PayloadFailed'],
                            self.stopOnApplicationFailure )
    self.log.debug( 'After %sCE submitJob()' % ( self.ceName ) )
  except Exception:
    self.log.exception()
    return self.__rescheduleFailedJob( jobID , 'Job processing failed with exception', self.stopOnApplicationFailure )
  # CPU consumed since agent start: user+system+children, excluding elapsed wall clock
  currentTimes = list( os.times() )
  for i in range( len( currentTimes ) ):
    currentTimes[i] -= self.initTimes[i]
  utime, stime, cutime, cstime, _elapsed = currentTimes
  cpuTime = utime + stime + cutime + cstime
  result = self.timeLeftUtil.getTimeLeft( cpuTime )
  if result['OK']:
    self.timeLeft = result['Value']
  else:
    if result['Message'] != 'Current batch system is not supported':
      self.timeLeftError = result['Message']
    else:
      if self.cpuFactor:
        # if the batch system is not defined used the CPUNormalizationFactor
        # defined locally
        self.timeLeft = self.__getCPUTimeLeft()
  # Publish the scaled CPU consumed since the previous cycle as a job parameter
  scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']
  self.__setJobParam( jobID, 'ScaledCPUTime', str( scaledCPUTime - self.scaledCPUTime ) )
  self.scaledCPUTime = scaledCPUTime
  return S_OK( 'Job Agent cycle complete' )
def execute(self):
    """The JobAgent execution method.

    One agent cycle:
      1. In filling mode, check the CPU time left in the slot and publish it.
      2. Check that the computing element is available.
      3. Request a job from the Matcher and validate the returned JDL.
      4. Set up the proxy, install software and submit the payload.
      5. Update the time-left bookkeeping for the next cycle.

    :returns: S_OK / S_ERROR structure; many failure paths go through
              self.__finish() which also returns such a structure.
    """
    if self.jobCount:
        # Only call timeLeft utility after a job has been picked up
        self.log.info("Attempting to check CPU time left for filling mode")
        if self.fillingMode:
            if self.timeLeftError:
                # A previous cycle already failed to compute time left: stop.
                self.log.warn(self.timeLeftError)
                return self.__finish(self.timeLeftError)
            self.log.info("%s normalized CPU units remaining in slot" % (self.timeLeft))
            if self.timeLeft <= self.minimumTimeLeft:
                return self.__finish("No more time left")
            # Need to update the Configuration so that the new value is published in the next matching request
            result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
            if not result["OK"]:
                return self.__finish(result["Message"])
            # Update local configuration to be used by submitted job wrappers
            localCfg = CFG()
            if self.extraOptions:
                localConfigFile = os.path.join(".", self.extraOptions)
            else:
                localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
            localCfg.loadFromFile(localConfigFile)
            if not localCfg.isSection("/LocalSite"):
                localCfg.createNewSection("/LocalSite")
            localCfg.setOption("/LocalSite/CPUTimeLeft", self.timeLeft)
            localCfg.writeToFile(localConfigFile)
        else:
            return self.__finish("Filling Mode is Disabled")
    self.log.verbose("Job Agent execution loop")
    # Make sure the CE can actually accept a job before asking the Matcher.
    available = self.computingElement.available()
    if not available["OK"] or not available["Value"]:
        self.log.info("Resource is not available")
        self.log.info(available["Message"])
        return self.__finish("CE Not Available")
    self.log.info(available["Message"])
    result = self.computingElement.getDescription()
    if not result["OK"]:
        return result
    ceDict = result["Value"]
    # Add pilot information
    gridCE = gConfig.getValue("LocalSite/GridCE", "Unknown")
    if gridCE != "Unknown":
        ceDict["GridCE"] = gridCE
    if not "PilotReference" in ceDict:
        ceDict["PilotReference"] = str(self.pilotReference)
    ceDict["PilotBenchmark"] = self.cpuFactor
    ceDict["PilotInfoReportedFlag"] = self.pilotInfoReportedFlag
    # Add possible job requirements
    result = gConfig.getOptionsDict("/AgentJobRequirements")
    if result["OK"]:
        requirementsDict = result["Value"]
        ceDict.update(requirementsDict)
    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = self.__requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info("MatcherTime = %.2f (s)" % (matchTime))
    self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", self.stopAfterFailedMatches)
    if not jobRequest["OK"]:
        # Classify the matcher failure by inspecting the message text.
        if re.search("No match found", jobRequest["Message"]):
            # Nothing to run: count it towards the "give up" threshold.
            self.log.notice("Job request OK: %s" % (jobRequest["Message"]))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
            return S_OK(jobRequest["Message"])
        elif jobRequest["Message"].find("seconds timeout") != -1:
            self.log.error("Timeout while requesting job", jobRequest["Message"])
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
            return S_OK(jobRequest["Message"])
        elif jobRequest["Message"].find("Pilot version does not match") != -1:
            # Fatal mismatch: do not retry, propagate the error.
            errorMsg = "Pilot version does not match the production version"
            self.log.error(errorMsg, jobRequest["Message"].replace(errorMsg, ""))
            return S_ERROR(jobRequest["Message"])
        else:
            self.log.notice("Failed to get jobs: %s" % (jobRequest["Message"]))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
            return S_OK(jobRequest["Message"])
    # Reset the Counter
    self.matchFailedCount = 0
    matcherInfo = jobRequest["Value"]
    if not self.pilotInfoReportedFlag:
        # Check the flag after the first access to the Matcher
        self.pilotInfoReportedFlag = matcherInfo.get("PilotInfoReportedFlag", False)
    jobID = matcherInfo["JobID"]
    # These keys must be present and non-empty in the matcher answer.
    matcherParams = ["JDL", "DN", "Group"]
    for param in matcherParams:
        if param not in matcherInfo:
            self.__report(jobID, "Failed", "Matcher did not return %s" % (param))
            return self.__finish("Matcher Failed")
        elif not matcherInfo[param]:
            self.__report(jobID, "Failed", "Matcher returned null %s" % (param))
            return self.__finish("Matcher Failed")
        else:
            self.log.verbose("Matcher returned %s = %s " % (param, matcherInfo[param]))
    jobJDL = matcherInfo["JDL"]
    jobGroup = matcherInfo["Group"]
    ownerDN = matcherInfo["DN"]
    # Everything else in the matcher answer is passed through to the optimizers.
    optimizerParams = {}
    for key in matcherInfo:
        if key not in matcherParams:
            optimizerParams[key] = matcherInfo[key]
    parameters = self.__getJDLParameters(jobJDL)
    if not parameters["OK"]:
        self.__report(jobID, "Failed", "Could Not Extract JDL Parameters")
        self.log.warn(parameters["Message"])
        return self.__finish("JDL Problem")
    params = parameters["Value"]
    if "JobID" not in params:
        msg = "Job has not JobID defined in JDL parameters"
        self.__report(jobID, "Failed", msg)
        self.log.warn(msg)
        return self.__finish("JDL Problem")
    else:
        # Prefer the JobID carried inside the JDL over the matcher one.
        jobID = params["JobID"]
    if "JobType" not in params:
        self.log.warn("Job has no JobType defined in JDL parameters")
        jobType = "Unknown"
    else:
        jobType = params["JobType"]
    if "CPUTime" not in params:
        self.log.warn("Job has no CPU requirement defined in JDL parameters")
    if self.extraOptions:
        params["Arguments"] += " " + self.extraOptions
        params["ExtraOptions"] = self.extraOptions
    self.log.verbose("Job request successful: \n", jobRequest["Value"])
    self.log.info("Received JobID=%s, JobType=%s" % (jobID, jobType))
    self.log.info("OwnerDN: %s JobGroup: %s" % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
        jobReport.setJobParameter("MatcherServiceTime", str(matchTime), sendFlag=False)
        if "BOINC_JOB_ID" in os.environ:
            # Report BOINC environment
            for p in ("BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"):
                jobReport.setJobParameter(p, gConfig.getValue("/LocalSite/%s" % p, "Unknown"), sendFlag=False)
        jobReport.setJobStatus("Matched", "Job Received by Agent")
        result = self.__setupProxy(ownerDN, jobGroup)
        if not result["OK"]:
            return self.__rescheduleFailedJob(jobID, result["Message"], self.stopOnApplicationFailure)
        proxyChain = result.get("Value")
        # Save the job jdl for external monitoring
        self.__saveJobJDLRequest(jobID, jobJDL)
        software = self.__checkInstallSoftware(jobID, params, ceDict)
        if not software["OK"]:
            self.log.error("Failed to install software for job", "%s" % (jobID))
            errorMsg = software["Message"]
            if not errorMsg:
                errorMsg = "Failed software installation"
            return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)
        self.log.debug("Before %sCE submitJob()" % (self.ceName))
        submission = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain)
        if not submission["OK"]:
            self.__report(jobID, "Failed", submission["Message"])
            return self.__finish(submission["Message"])
        elif "PayloadFailed" in submission:
            # Do not keep running and do not overwrite the Payload error
            message = "Payload execution failed with error code %s" % submission["PayloadFailed"]
            if self.stopOnApplicationFailure:
                return self.__finish(message, self.stopOnApplicationFailure)
            else:
                self.log.info(message)
        self.log.debug("After %sCE submitJob()" % (self.ceName))
    except Exception:
        # Catch-all around job processing: any unexpected failure reschedules the job.
        self.log.exception()
        return self.__rescheduleFailedJob(
            jobID, "Job processing failed with exception", self.stopOnApplicationFailure
        )
    # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
    cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])
    result = self.timeLeftUtil.getTimeLeft(cpuTime)
    if result["OK"]:
        self.timeLeft = result["Value"]
    else:
        if result["Message"] != "Current batch system is not supported":
            self.timeLeftError = result["Message"]
        else:
            # if the batch system is not defined, use the process time and the CPU normalization defined locally
            self.timeLeft = self.__getCPUTimeLeft()
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()
    self.__setJobParam(jobID, "ScaledCPUTime", str(scaledCPUTime - self.scaledCPUTime))
    self.scaledCPUTime = scaledCPUTime
    return S_OK("Job Agent cycle complete")
def mergeFromFile(self, filename):
    """Load a CFG file and merge its contents into the held configuration.

    :param str filename: path of the cfg file to merge in
    """
    fileCfg = CFG()
    fileCfg.loadFromFile(filename)
    self.cfgData = self.cfgData.mergeWith(fileCfg)
class ConfigurationData(object):
    """Holds the local, remote and merged DIRAC configuration trees.

    Thread coordination uses an event plus a lock-protected counter
    (see lock/unlock/dangerZoneStart/dangerZoneEnd at the bottom):
    writers call lock() and wait until no reader is inside a danger zone.
    """

    def __init__(self, loadDefaultCFG=True):
        """Create empty local/remote/merged CFGs and optionally load etc/dirac.cfg.

        :param bool loadDefaultCFG: when True, merge etc/dirac.cfg into the local CFG
        """
        lr = LockRing()
        self.threadingEvent = lr.getEvent()
        self.threadingEvent.set()
        self.threadingLock = lr.getLock()
        self.runningThreadsNumber = 0
        # Cache for the zlib-compressed remote CFG; invalidated by sync().
        self.__compressedConfigurationData = None
        self.configurationPath = "/DIRAC/Configuration"
        self.backupsDir = os.path.join(DIRAC.rootPath, "etc", "csbackup")
        self._isService = False
        self.localCFG = CFG()
        self.remoteCFG = CFG()
        self.mergedCFG = CFG()
        self.remoteServerList = []
        if loadDefaultCFG:
            defaultCFGFile = os.path.join(DIRAC.rootPath, "etc", "dirac.cfg")
            gLogger.debug("dirac.cfg should be at", "%s" % defaultCFGFile)
            retVal = self.loadFile(defaultCFGFile)
            if not retVal['OK']:
                gLogger.warn("Can't load %s file" % defaultCFGFile)
        self.sync()

    def getBackupDir(self):
        """Return the directory where CS backups are written."""
        return self.backupsDir

    def sync(self):
        """Rebuild the merged CFG and the remote server list after any change."""
        gLogger.debug("Updating configuration internals")
        # Local options override remote ones in the merge.
        self.mergedCFG = self.remoteCFG.mergeWith(self.localCFG)
        self.remoteServerList = []
        localServers = self.extractOptionFromCFG("%s/Servers" % self.configurationPath,
                                                 self.localCFG,
                                                 disableDangerZones=True)
        if localServers:
            self.remoteServerList.extend(List.fromChar(localServers, ","))
        remoteServers = self.extractOptionFromCFG("%s/Servers" % self.configurationPath,
                                                  self.remoteCFG,
                                                  disableDangerZones=True)
        if remoteServers:
            self.remoteServerList.extend(List.fromChar(remoteServers, ","))
        self.remoteServerList = List.uniqueElements(self.remoteServerList)
        # Invalidate the compressed-data cache; getCompressedData() rebuilds it.
        self.__compressedConfigurationData = None

    def loadFile(self, fileName):
        """Load a cfg file and merge it into the local CFG.

        :returns: S_OK / S_ERROR
        """
        try:
            fileCFG = CFG()
            fileCFG.loadFromFile(fileName)
        except IOError:
            # NOTE(review): on IOError fileCFG is empty here, so this merge is
            # effectively a no-op before returning the error — confirm intent.
            self.localCFG = self.localCFG.mergeWith(fileCFG)
            return S_ERROR("Can't load a cfg file '%s'" % fileName)
        return self.mergeWithLocal(fileCFG)

    def mergeWithLocal(self, extraCFG):
        """Merge an in-memory CFG into the local CFG under the write lock.

        :returns: S_OK / S_ERROR
        """
        self.lock()
        try:
            self.localCFG = self.localCFG.mergeWith(extraCFG)
            self.unlock()
            gLogger.debug("CFG merged")
        except Exception as e:
            self.unlock()
            return S_ERROR("Cannot merge with new cfg: %s" % str(e))
        self.sync()
        return S_OK()

    def loadRemoteCFGFromCompressedMem(self, data):
        """Load the remote CFG from zlib-compressed serialized data."""
        sUncompressedData = zlib.decompress(data)
        self.loadRemoteCFGFromMem(sUncompressedData)

    def loadRemoteCFGFromMem(self, data):
        """Load the remote CFG from a serialized buffer under the write lock."""
        self.lock()
        self.remoteCFG.loadFromBuffer(data)
        self.unlock()
        self.sync()

    def loadConfigurationData(self, fileName=False):
        """Load the remote CFG from a file (default: etc/<name>.cfg)."""
        name = self.getName()
        self.lock()
        try:
            if not fileName:
                fileName = "%s.cfg" % name
            if fileName[0] != "/":
                fileName = os.path.join(DIRAC.rootPath, "etc", fileName)
            self.remoteCFG.loadFromFile(fileName)
        except Exception as e:
            # Best-effort load: failures are only printed, not propagated.
            print e
        self.unlock()
        self.sync()

    def getCommentFromCFG(self, path, cfg=False):
        """Return the comment attached to the option/section at path, or None."""
        if not cfg:
            cfg = self.mergedCFG
        self.dangerZoneStart()
        try:
            levelList = [level.strip() for level in path.split("/") if level.strip() != ""]
            for section in levelList[:-1]:
                cfg = cfg[section]
            return self.dangerZoneEnd(cfg.getComment(levelList[-1]))
        except Exception:
            pass
        return self.dangerZoneEnd(None)

    def getSectionsFromCFG(self, path, cfg=False, ordered=False):
        """Return the list of subsection names at path, or None on failure."""
        if not cfg:
            cfg = self.mergedCFG
        self.dangerZoneStart()
        try:
            levelList = [level.strip() for level in path.split("/") if level.strip() != ""]
            for section in levelList:
                cfg = cfg[section]
            return self.dangerZoneEnd(cfg.listSections(ordered))
        except Exception:
            pass
        return self.dangerZoneEnd(None)

    def getOptionsFromCFG(self, path, cfg=False, ordered=False):
        """Return the list of option names at path, or None on failure."""
        if not cfg:
            cfg = self.mergedCFG
        self.dangerZoneStart()
        try:
            levelList = [level.strip() for level in path.split("/") if level.strip() != ""]
            for section in levelList:
                cfg = cfg[section]
            return self.dangerZoneEnd(cfg.listOptions(ordered))
        except Exception:
            pass
        return self.dangerZoneEnd(None)

    def extractOptionFromCFG(self, path, cfg=False, disableDangerZones=False):
        """Return the value of the option at path, or None if missing/invalid."""
        if not cfg:
            cfg = self.mergedCFG
        if not disableDangerZones:
            self.dangerZoneStart()
        try:
            levelList = [level.strip() for level in path.split("/") if level.strip() != ""]
            for section in levelList[:-1]:
                cfg = cfg[section]
            if levelList[-1] in cfg.listOptions():
                return self.dangerZoneEnd(cfg[levelList[-1]])
        except Exception:
            pass
        if not disableDangerZones:
            self.dangerZoneEnd()
        # Implicitly returns None when the option does not exist.

    def setOptionInCFG(self, path, value, cfg=False, disableDangerZones=False):
        """Set option at path (creating intermediate sections) and re-sync."""
        if not cfg:
            cfg = self.localCFG
        if not disableDangerZones:
            self.dangerZoneStart()
        try:
            levelList = [level.strip() for level in path.split("/") if level.strip() != ""]
            for section in levelList[:-1]:
                if section not in cfg.listSections():
                    cfg.createNewSection(section)
                cfg = cfg[section]
            cfg.setOption(levelList[-1], value)
        finally:
            if not disableDangerZones:
                self.dangerZoneEnd()
        self.sync()

    def deleteOptionInCFG(self, path, cfg=False):
        """Delete option at path (no-op if an intermediate section is missing)."""
        if not cfg:
            cfg = self.localCFG
        self.dangerZoneStart()
        try:
            levelList = [level.strip() for level in path.split("/") if level.strip() != ""]
            for section in levelList[:-1]:
                if section not in cfg.listSections():
                    return
                cfg = cfg[section]
            cfg.deleteKey(levelList[-1])
        finally:
            self.dangerZoneEnd()
        self.sync()

    def generateNewVersion(self):
        """Stamp the remote CFG with the current time as its version."""
        self.setVersion(Time.toString())
        self.sync()
        gLogger.info("Generated new version %s" % self.getVersion())

    def setVersion(self, version, cfg=False):
        """Write the Version option (defaults to the remote CFG)."""
        if not cfg:
            cfg = self.remoteCFG
        self.setOptionInCFG("%s/Version" % self.configurationPath,
                            version, cfg)

    def getVersion(self, cfg=False):
        """Return the configuration version, "0" if unset."""
        if not cfg:
            cfg = self.remoteCFG
        value = self.extractOptionFromCFG("%s/Version" % self.configurationPath,
                                          cfg)
        if value:
            return value
        return "0"

    def getName(self):
        """Return the configuration name from the merged CFG."""
        return self.extractOptionFromCFG("%s/Name" % self.configurationPath,
                                         self.mergedCFG)

    def exportName(self):
        """Copy the configuration name into the remote CFG."""
        return self.setOptionInCFG("%s/Name" % self.configurationPath,
                                   self.getName(), self.remoteCFG)

    def getRefreshTime(self):
        """Return RefreshTime in seconds (default 300 on any failure)."""
        try:
            return int(self.extractOptionFromCFG("%s/RefreshTime" % self.configurationPath,
                                                 self.mergedCFG))
        except:
            return 300

    def getPropagationTime(self):
        """Return PropagationTime in seconds (default 300 on any failure)."""
        try:
            return int(self.extractOptionFromCFG("%s/PropagationTime" % self.configurationPath,
                                                 self.mergedCFG))
        except:
            return 300

    def getSlavesGraceTime(self):
        """Return SlavesGraceTime in seconds (default 600 on any failure)."""
        try:
            return int(self.extractOptionFromCFG("%s/SlavesGraceTime" % self.configurationPath,
                                                 self.mergedCFG))
        except:
            return 600

    def mergingEnabled(self):
        """Return True when EnableAutoMerge is set to a truthy string."""
        try:
            val = self.extractOptionFromCFG("%s/EnableAutoMerge" % self.configurationPath,
                                            self.mergedCFG)
            return val.lower() in ("yes", "true", "y")
        except:
            return False

    def getAutoPublish(self):
        """Return False only when AutoPublish is explicitly negative (default True)."""
        value = self.extractOptionFromCFG("%s/AutoPublish" % self.configurationPath,
                                          self.localCFG)
        if value and value.lower() in ("no", "false", "n"):
            return False
        else:
            return True

    def getServers(self):
        """Return a copy of the known configuration server list."""
        return list(self.remoteServerList)

    def getConfigurationGateway(self):
        """Return the /DIRAC/Gateway option from the local CFG."""
        return self.extractOptionFromCFG("/DIRAC/Gateway", self.localCFG)

    def setServers(self, sServers):
        """Set the Servers option in the remote CFG and re-sync."""
        self.setOptionInCFG("%s/Servers" % self.configurationPath,
                            sServers, self.remoteCFG)
        self.sync()

    def deleteLocalOption(self, optionPath):
        """Delete an option from the local CFG."""
        self.deleteOptionInCFG(optionPath, self.localCFG)

    def getMasterServer(self):
        """Return the MasterServer option from the remote CFG."""
        return self.extractOptionFromCFG("%s/MasterServer" % self.configurationPath,
                                         self.remoteCFG)

    def setMasterServer(self, sURL):
        """Set the MasterServer option in the remote CFG and re-sync."""
        self.setOptionInCFG("%s/MasterServer" % self.configurationPath,
                            sURL, self.remoteCFG)
        self.sync()

    def getCompressedData(self):
        """Return the remote CFG serialized and zlib-compressed (cached)."""
        if self.__compressedConfigurationData is None:
            self.__compressedConfigurationData = zlib.compress(str(self.remoteCFG), 9)
        return self.__compressedConfigurationData

    def isMaster(self):
        """Return True when the local CFG marks this instance as master."""
        value = self.extractOptionFromCFG("%s/Master" % self.configurationPath,
                                          self.localCFG)
        if value and value.lower() in ("yes", "true", "y"):
            return True
        else:
            return False

    def getServicesPath(self):
        """Return the CS path under which services are declared."""
        return "/Services"

    def setAsService(self):
        """Mark this instance as running inside a service."""
        self._isService = True

    def isService(self):
        """Return True when running inside a service."""
        return self._isService

    def useServerCertificate(self):
        """Return True when /DIRAC/Security/UseServerCertificate is truthy."""
        value = self.extractOptionFromCFG("/DIRAC/Security/UseServerCertificate")
        if value and value.lower() in ("y", "yes", "true"):
            return True
        return False

    def skipCACheck(self):
        """Return True when /DIRAC/Security/SkipCAChecks is truthy."""
        value = self.extractOptionFromCFG("/DIRAC/Security/SkipCAChecks")
        if value and value.lower() in ("y", "yes", "true"):
            return True
        return False

    def dumpLocalCFGToFile(self, fileName):
        """Write the local CFG to fileName.

        :returns: S_OK / S_ERROR
        """
        try:
            with open(fileName, "w") as fd:
                fd.write(str(self.localCFG))
            gLogger.verbose("Configuration file dumped", "'%s'" % fileName)
        except IOError:
            gLogger.error("Can't dump cfg file", "'%s'" % fileName)
            return S_ERROR("Can't dump cfg file '%s'" % fileName)
        return S_OK()

    def getRemoteCFG(self):
        """Return the remote CFG object (not a copy)."""
        return self.remoteCFG

    def getMergedCFGAsString(self):
        """Return the merged CFG serialized to a string."""
        return str(self.mergedCFG)

    def dumpRemoteCFGToFile(self, fileName):
        """Write the remote CFG to fileName (IOError propagates)."""
        with open(fileName, "w") as fd:
            fd.write(str(self.remoteCFG))

    def __backupCurrentConfiguration(self, backupName):
        """Zip the on-disk CS file into backupsDir/<year>/<month>/ (best effort)."""
        configurationFilename = "%s.cfg" % self.getName()
        configurationFile = os.path.join(DIRAC.rootPath, "etc", configurationFilename)
        today = Time.date()
        backupPath = os.path.join(self.getBackupDir(), str(today.year), "%02d" % today.month)
        mkDir(backupPath)
        backupFile = os.path.join(backupPath,
                                  configurationFilename.replace(".cfg", ".%s.zip" % backupName))
        if os.path.isfile(configurationFile):
            gLogger.info("Making a backup of configuration in %s" % backupFile)
            try:
                with zipfile.ZipFile(backupFile, "w", zipfile.ZIP_DEFLATED) as zf:
                    zf.write(configurationFile,
                             "%s.backup.%s" % (os.path.split(configurationFile)[1], backupName))
            except Exception:
                gLogger.exception()
                gLogger.error("Cannot backup configuration data file",
                              "file %s" % backupFile)
        else:
            gLogger.warn("CS data file does not exist", configurationFile)

    def writeRemoteConfigurationToDisk(self, backupName=False):
        """Write the remote CFG to etc/<name>.cfg, optionally backing up first.

        :returns: S_OK / S_ERROR
        """
        configurationFile = os.path.join(DIRAC.rootPath, "etc", "%s.cfg" % self.getName())
        try:
            with open(configurationFile, "w") as fd:
                fd.write(str(self.remoteCFG))
        except Exception as e:
            gLogger.fatal("Cannot write new configuration to disk!",
                          "file %s" % configurationFile)
            return S_ERROR("Can't write cs file %s!: %s" % (configurationFile,
                                                            repr(e).replace(',)', ')')))
        if backupName:
            self.__backupCurrentConfiguration(backupName)
        return S_OK()

    def setRemoteCFG(self, cfg, disableSync=False):
        """Replace the remote CFG with a clone of cfg."""
        self.remoteCFG = cfg.clone()
        if not disableSync:
            self.sync()

    def lock(self):
        """
        Locks Event to prevent further threads from reading.
        Stops current thread until no other thread is accessing.
        PRIVATE USE
        """
        self.threadingEvent.clear()
        while self.runningThreadsNumber > 0:
            time.sleep(0.1)

    def unlock(self):
        """
        Unlocks Event.
        PRIVATE USE
        """
        self.threadingEvent.set()

    def dangerZoneStart(self):
        """
        Start of danger zone. This danger zone may be or may not be a mutual exclusion zone.
        Counter is maintained to know how many threads are inside and be able to enable and disable mutual exclusion.
        PRIVATE USE
        """
        self.threadingEvent.wait()
        self.threadingLock.acquire()
        self.runningThreadsNumber += 1
        try:
            self.threadingLock.release()
        except thread.error:
            pass

    def dangerZoneEnd(self, returnValue=None):
        """
        End of danger zone.
        PRIVATE USE
        """
        self.threadingLock.acquire()
        self.runningThreadsNumber -= 1
        try:
            self.threadingLock.release()
        except thread.error:
            pass
        return returnValue
class ConfigurationData( object ):
    """Holds the local, remote and merged DIRAC configuration trees.

    NOTE(review): this file also contains another, identically-named
    ConfigurationData definition with the same behavior; if both live at
    module level the later one wins — confirm which is intended.
    """

    def __init__( self, loadDefaultCFG = True ):
        """Create empty local/remote/merged CFGs and optionally load etc/dirac.cfg."""
        lr = LockRing()
        self.threadingEvent = lr.getEvent()
        self.threadingEvent.set()
        self.threadingLock = lr.getLock()
        self.runningThreadsNumber = 0
        # Cache for the zlib-compressed remote CFG; invalidated by sync().
        self.__compressedConfigurationData = None
        self.configurationPath = "/DIRAC/Configuration"
        self.backupsDir = os.path.join( DIRAC.rootPath, "etc", "csbackup" )
        self._isService = False
        self.localCFG = CFG()
        self.remoteCFG = CFG()
        self.mergedCFG = CFG()
        self.remoteServerList = []
        if loadDefaultCFG:
            defaultCFGFile = os.path.join( DIRAC.rootPath, "etc", "dirac.cfg" )
            gLogger.debug( "dirac.cfg should be at", "%s" % defaultCFGFile )
            retVal = self.loadFile( defaultCFGFile )
            if not retVal[ 'OK' ]:
                gLogger.warn( "Can't load %s file" % defaultCFGFile )
        self.sync()

    def getBackupDir( self ):
        """Return the directory where CS backups are written."""
        return self.backupsDir

    def sync( self ):
        """Rebuild the merged CFG (local overrides remote) and the server list."""
        gLogger.debug( "Updating configuration internals" )
        self.mergedCFG = self.remoteCFG.mergeWith( self.localCFG )
        self.remoteServerList = []
        localServers = self.extractOptionFromCFG( "%s/Servers" % self.configurationPath,
                                                  self.localCFG,
                                                  disableDangerZones = True )
        if localServers:
            self.remoteServerList.extend( List.fromChar( localServers, "," ) )
        remoteServers = self.extractOptionFromCFG( "%s/Servers" % self.configurationPath,
                                                   self.remoteCFG,
                                                   disableDangerZones = True )
        if remoteServers:
            self.remoteServerList.extend( List.fromChar( remoteServers, "," ) )
        self.remoteServerList = List.uniqueElements( self.remoteServerList )
        # Invalidate the compressed-data cache; getCompressedData() rebuilds it.
        self.__compressedConfigurationData = None

    def loadFile( self, fileName ):
        """Load a cfg file and merge it into the local CFG. Returns S_OK/S_ERROR."""
        try:
            fileCFG = CFG()
            fileCFG.loadFromFile( fileName )
        except IOError:
            # NOTE(review): fileCFG is empty on IOError, so this merge is a no-op.
            self.localCFG = self.localCFG.mergeWith( fileCFG )
            return S_ERROR( "Can't load a cfg file '%s'" % fileName )
        return self.mergeWithLocal( fileCFG )

    def mergeWithLocal( self, extraCFG ):
        """Merge an in-memory CFG into the local CFG under the write lock."""
        self.lock()
        try:
            self.localCFG = self.localCFG.mergeWith( extraCFG )
            self.unlock()
            gLogger.debug( "CFG merged" )
        except Exception as e:
            self.unlock()
            return S_ERROR( "Cannot merge with new cfg: %s" % str( e ) )
        self.sync()
        return S_OK()

    def loadRemoteCFGFromCompressedMem( self, data ):
        """Load the remote CFG from zlib-compressed serialized data."""
        sUncompressedData = zlib.decompress( data )
        self.loadRemoteCFGFromMem( sUncompressedData )

    def loadRemoteCFGFromMem( self, data ):
        """Load the remote CFG from a serialized buffer under the write lock."""
        self.lock()
        self.remoteCFG.loadFromBuffer( data )
        self.unlock()
        self.sync()

    def loadConfigurationData( self, fileName = False ):
        """Load the remote CFG from a file (default: etc/<name>.cfg)."""
        name = self.getName()
        self.lock()
        try:
            if not fileName:
                fileName = "%s.cfg" % name
            if fileName[0] != "/":
                fileName = os.path.join( DIRAC.rootPath, "etc", fileName )
            self.remoteCFG.loadFromFile( fileName )
        except Exception as e:
            # Best-effort load: failures are only printed, not propagated.
            print e
            pass
        self.unlock()
        self.sync()

    def getCommentFromCFG( self, path, cfg = False ):
        """Return the comment attached to the option/section at path, or None."""
        if not cfg:
            cfg = self.mergedCFG
        self.dangerZoneStart()
        try:
            levelList = [ level.strip() for level in path.split( "/" ) if level.strip() != "" ]
            for section in levelList[:-1]:
                cfg = cfg[ section ]
            return self.dangerZoneEnd( cfg.getComment( levelList[-1] ) )
        except Exception:
            pass
        return self.dangerZoneEnd( None )

    def getSectionsFromCFG( self, path, cfg = False, ordered = False ):
        """Return the list of subsection names at path, or None on failure."""
        if not cfg:
            cfg = self.mergedCFG
        self.dangerZoneStart()
        try:
            levelList = [ level.strip() for level in path.split( "/" ) if level.strip() != "" ]
            for section in levelList:
                cfg = cfg[ section ]
            return self.dangerZoneEnd( cfg.listSections( ordered ) )
        except Exception:
            pass
        return self.dangerZoneEnd( None )

    def getOptionsFromCFG( self, path, cfg = False, ordered = False ):
        """Return the list of option names at path, or None on failure."""
        if not cfg:
            cfg = self.mergedCFG
        self.dangerZoneStart()
        try:
            levelList = [ level.strip() for level in path.split( "/" ) if level.strip() != "" ]
            for section in levelList:
                cfg = cfg[ section ]
            return self.dangerZoneEnd( cfg.listOptions( ordered ) )
        except Exception:
            pass
        return self.dangerZoneEnd( None )

    def extractOptionFromCFG( self, path, cfg = False, disableDangerZones = False ):
        """Return the value of the option at path, or None if missing/invalid."""
        if not cfg:
            cfg = self.mergedCFG
        if not disableDangerZones:
            self.dangerZoneStart()
        try:
            levelList = [ level.strip() for level in path.split( "/" ) if level.strip() != "" ]
            for section in levelList[:-1]:
                cfg = cfg[ section ]
            if levelList[-1] in cfg.listOptions():
                return self.dangerZoneEnd( cfg[ levelList[ -1 ] ] )
        except Exception:
            pass
        if not disableDangerZones:
            self.dangerZoneEnd()
        # Implicitly returns None when the option does not exist.

    def setOptionInCFG( self, path, value, cfg = False, disableDangerZones = False ):
        """Set option at path (creating intermediate sections) and re-sync."""
        if not cfg:
            cfg = self.localCFG
        if not disableDangerZones:
            self.dangerZoneStart()
        try:
            levelList = [ level.strip() for level in path.split( "/" ) if level.strip() != "" ]
            for section in levelList[:-1]:
                if section not in cfg.listSections():
                    cfg.createNewSection( section )
                cfg = cfg[ section ]
            cfg.setOption( levelList[ -1 ], value )
        finally:
            if not disableDangerZones:
                self.dangerZoneEnd()
        self.sync()

    def deleteOptionInCFG( self, path, cfg = False ):
        """Delete option at path (no-op if an intermediate section is missing)."""
        if not cfg:
            cfg = self.localCFG
        self.dangerZoneStart()
        try:
            levelList = [ level.strip() for level in path.split( "/" ) if level.strip() != "" ]
            for section in levelList[:-1]:
                if section not in cfg.listSections():
                    return
                cfg = cfg[ section ]
            cfg.deleteKey( levelList[ -1 ] )
        finally:
            self.dangerZoneEnd()
        self.sync()

    def generateNewVersion( self ):
        """Stamp the remote CFG with the current time as its version."""
        self.setVersion( Time.toString() )
        self.sync()
        gLogger.info( "Generated new version %s" % self.getVersion() )

    def setVersion( self, version, cfg = False ):
        """Write the Version option (defaults to the remote CFG)."""
        if not cfg:
            cfg = self.remoteCFG
        self.setOptionInCFG( "%s/Version" % self.configurationPath,
                             version, cfg )

    def getVersion( self, cfg = False ):
        """Return the configuration version, "0" if unset."""
        if not cfg:
            cfg = self.remoteCFG
        value = self.extractOptionFromCFG( "%s/Version" % self.configurationPath,
                                           cfg )
        if value:
            return value
        return "0"

    def getName( self ):
        """Return the configuration name from the merged CFG."""
        return self.extractOptionFromCFG( "%s/Name" % self.configurationPath,
                                          self.mergedCFG )

    def exportName( self ):
        """Copy the configuration name into the remote CFG."""
        return self.setOptionInCFG( "%s/Name" % self.configurationPath,
                                    self.getName(), self.remoteCFG )

    def getRefreshTime( self ):
        """Return RefreshTime in seconds (default 300 on any failure)."""
        try:
            return int( self.extractOptionFromCFG( "%s/RefreshTime" % self.configurationPath,
                                                   self.mergedCFG ) )
        except:
            return 300

    def getPropagationTime( self ):
        """Return PropagationTime in seconds (default 300 on any failure)."""
        try:
            return int( self.extractOptionFromCFG( "%s/PropagationTime" % self.configurationPath,
                                                   self.mergedCFG ) )
        except:
            return 300

    def getSlavesGraceTime( self ):
        """Return SlavesGraceTime in seconds (default 600 on any failure)."""
        try:
            return int( self.extractOptionFromCFG( "%s/SlavesGraceTime" % self.configurationPath,
                                                   self.mergedCFG ) )
        except:
            return 600

    def mergingEnabled( self ):
        """Return True when EnableAutoMerge is set to a truthy string."""
        try:
            val = self.extractOptionFromCFG( "%s/EnableAutoMerge" % self.configurationPath,
                                             self.mergedCFG )
            return val.lower() in ( "yes", "true", "y" )
        except:
            return False

    def getAutoPublish( self ):
        """Return False only when AutoPublish is explicitly negative (default True)."""
        value = self.extractOptionFromCFG( "%s/AutoPublish" % self.configurationPath,
                                           self.localCFG )
        if value and value.lower() in ( "no", "false", "n" ):
            return False
        else:
            return True

    def getServers( self ):
        """Return a copy of the known configuration server list."""
        return list( self.remoteServerList )

    def getConfigurationGateway( self ):
        """Return the /DIRAC/Gateway option from the local CFG."""
        return self.extractOptionFromCFG( "/DIRAC/Gateway", self.localCFG )

    def setServers( self, sServers ):
        """Set the Servers option in the remote CFG and re-sync."""
        self.setOptionInCFG( "%s/Servers" % self.configurationPath,
                             sServers, self.remoteCFG )
        self.sync()

    def deleteLocalOption( self, optionPath ):
        """Delete an option from the local CFG."""
        self.deleteOptionInCFG( optionPath, self.localCFG )

    def getMasterServer( self ):
        """Return the MasterServer option from the remote CFG."""
        return self.extractOptionFromCFG( "%s/MasterServer" % self.configurationPath,
                                          self.remoteCFG )

    def setMasterServer( self, sURL ):
        """Set the MasterServer option in the remote CFG and re-sync."""
        self.setOptionInCFG( "%s/MasterServer" % self.configurationPath,
                             sURL, self.remoteCFG )
        self.sync()

    def getCompressedData( self ):
        """Return the remote CFG serialized and zlib-compressed (cached)."""
        if self.__compressedConfigurationData is None:
            self.__compressedConfigurationData = zlib.compress( str( self.remoteCFG ), 9 )
        return self.__compressedConfigurationData

    def isMaster( self ):
        """Return True when the local CFG marks this instance as master."""
        value = self.extractOptionFromCFG( "%s/Master" % self.configurationPath,
                                           self.localCFG )
        if value and value.lower() in ( "yes", "true", "y" ):
            return True
        else:
            return False

    def getServicesPath( self ):
        """Return the CS path under which services are declared."""
        return "/Services"

    def setAsService( self ):
        """Mark this instance as running inside a service."""
        self._isService = True

    def isService( self ):
        """Return True when running inside a service."""
        return self._isService

    def useServerCertificate( self ):
        """Return True when /DIRAC/Security/UseServerCertificate is truthy."""
        value = self.extractOptionFromCFG( "/DIRAC/Security/UseServerCertificate" )
        if value and value.lower() in ( "y", "yes", "true" ):
            return True
        return False

    def skipCACheck( self ):
        """Return True when /DIRAC/Security/SkipCAChecks is truthy."""
        value = self.extractOptionFromCFG( "/DIRAC/Security/SkipCAChecks" )
        if value and value.lower() in ( "y", "yes", "true" ):
            return True
        return False

    def dumpLocalCFGToFile( self, fileName ):
        """Write the local CFG to fileName. Returns S_OK/S_ERROR."""
        try:
            with open( fileName, "w" ) as fd:
                fd.write( str( self.localCFG ) )
            gLogger.verbose( "Configuration file dumped", "'%s'" % fileName )
        except IOError:
            gLogger.error( "Can't dump cfg file", "'%s'" % fileName )
            return S_ERROR( "Can't dump cfg file '%s'" % fileName )
        return S_OK()

    def getRemoteCFG( self ):
        """Return the remote CFG object (not a copy)."""
        return self.remoteCFG

    def getMergedCFGAsString( self ):
        """Return the merged CFG serialized to a string."""
        return str( self.mergedCFG )

    def dumpRemoteCFGToFile( self, fileName ):
        """Write the remote CFG to fileName (IOError propagates)."""
        with open( fileName, "w" ) as fd:
            fd.write( str( self.remoteCFG ) )

    def __backupCurrentConfiguration( self, backupName ):
        """Zip the on-disk CS file into backupsDir/<year>/<month>/ (best effort)."""
        configurationFilename = "%s.cfg" % self.getName()
        configurationFile = os.path.join( DIRAC.rootPath, "etc", configurationFilename )
        today = Time.date()
        backupPath = os.path.join( self.getBackupDir(), str( today.year ), "%02d" % today.month )
        mkDir(backupPath)
        backupFile = os.path.join( backupPath,
                                   configurationFilename.replace( ".cfg", ".%s.zip" % backupName ) )
        if os.path.isfile( configurationFile ):
            gLogger.info( "Making a backup of configuration in %s" % backupFile )
            try:
                zf = zipfile.ZipFile( backupFile, "w", zipfile.ZIP_DEFLATED )
                zf.write( configurationFile,
                          "%s.backup.%s" % ( os.path.split( configurationFile )[1], backupName ) )
                zf.close()
            except Exception:
                gLogger.exception()
                gLogger.error( "Cannot backup configuration data file",
                               "file %s" % backupFile )
        else:
            gLogger.warn( "CS data file does not exist", configurationFile )

    def writeRemoteConfigurationToDisk( self, backupName = False ):
        """Write the remote CFG to etc/<name>.cfg, optionally backing up first."""
        configurationFile = os.path.join( DIRAC.rootPath, "etc", "%s.cfg" % self.getName() )
        try:
            with open( configurationFile, "w" ) as fd:
                fd.write( str( self.remoteCFG ) )
        except Exception as e:
            gLogger.fatal( "Cannot write new configuration to disk!",
                           "file %s" % configurationFile )
            return S_ERROR( "Can't write cs file %s!: %s" % ( configurationFile,
                                                              repr( e ).replace( ',)', ')' ) ) )
        if backupName:
            self.__backupCurrentConfiguration( backupName )
        return S_OK()

    def setRemoteCFG( self, cfg, disableSync = False ):
        """Replace the remote CFG with a clone of cfg."""
        self.remoteCFG = cfg.clone()
        if not disableSync:
            self.sync()

    def lock( self ):
        """
        Locks Event to prevent further threads from reading.
        Stops current thread until no other thread is accessing.
        PRIVATE USE
        """
        self.threadingEvent.clear()
        while self.runningThreadsNumber > 0:
            time.sleep( 0.1 )

    def unlock( self ):
        """
        Unlocks Event.
        PRIVATE USE
        """
        self.threadingEvent.set()

    def dangerZoneStart( self ):
        """
        Start of danger zone. This danger zone may be or may not be a mutual exclusion zone.
        Counter is maintained to know how many threads are inside and be able to enable and disable mutual exclusion.
        PRIVATE USE
        """
        self.threadingEvent.wait()
        self.threadingLock.acquire()
        self.runningThreadsNumber += 1
        try:
            self.threadingLock.release()
        except thread.error:
            pass

    def dangerZoneEnd( self, returnValue = None ):
        """
        End of danger zone.
        PRIVATE USE
        """
        self.threadingLock.acquire()
        self.runningThreadsNumber -= 1
        try:
            self.threadingLock.release()
        except thread.error:
            pass
        return returnValue
# $HeadURL$ __RCSID__ = "7b8878b (2009-11-05 19:40:01 +0000) Adria Casajus <*****@*****.**>" from dirac import DIRAC from DIRAC.Core.Utilities.CFG import CFG DIRAC.gLogger.initialize('test_gConfig','/testSectionDebug') testconfig = '%s/DIRAC/ConfigurationSystem/test/test.cfg' % DIRAC.rootPath dumpconfig = '%s/DIRAC/ConfigurationSystem/test/dump.cfg' % DIRAC.rootPath cfg1 = CFG() cfg1.loadFromFile( testconfig ) fd = file( testconfig ) cfg1String = fd.read() fd.close() cfg2 = CFG() cfg2.loadFromBuffer( cfg1.serialize() ) cfg3 = cfg1.mergeWith( cfg2 ) testList = [{ 'method' : DIRAC.gConfig.loadFile, 'arguments' : ( testconfig, ), 'output' : {'OK': True, 'Value': ''} }, { 'method' : DIRAC.gConfig.dumpLocalCFGToFile, 'arguments' : ( dumpconfig, ), 'output' : {'OK': True, 'Value': ''} },
# Report the machine power obtained from the Machine/Job Features interface.
mjfPower = getPowerFromMJF()
if mjfPower:
    gLogger.notice('CPU power from MJF is %.1f HS06' % mjfPower)
else:
    gLogger.notice('MJF not available on this node')

# Publish the measured normalization in the running configuration
# (MJF value preferred over the locally computed norm when available).
if update and not configFile:
    gConfig.setOptionValue('/LocalSite/CPUScalingFactor', mjfPower if mjfPower else norm)
    gConfig.setOptionValue('/LocalSite/CPUNormalizationFactor', norm)
    gConfig.dumpLocalCFGToFile(gConfig.diracConfigFilePath)

# Alternatively, write the values into an explicitly given cfg file.
if configFile:
    from DIRAC.Core.Utilities.CFG import CFG
    cfg = CFG()
    try:
        # Attempt to open the given file
        cfg.loadFromFile(configFile)
    except Exception:
        # Best effort: a missing or unreadable file just means we start from an
        # empty CFG.  Was a bare "except:", which would also have swallowed
        # SystemExit and KeyboardInterrupt.
        pass
    # Create the section if it does not exist
    if not cfg.existsKey('LocalSite'):
        cfg.createNewSection('LocalSite')
    cfg.setOption('/LocalSite/CPUScalingFactor', mjfPower if mjfPower else norm)
    cfg.setOption('/LocalSite/CPUNormalizationFactor', norm)
    cfg.writeToFile(configFile)

DIRAC.exit()
class JobRepository( object ):
  """ Persistent record of submitted jobs, backed by a CFG file on disk.

  Jobs live under the 'Jobs' section, one subsection per jobID, each holding
  options such as State, Time, Retrieved and OutputData.
  """

  def __init__( self, repository = None ):
    """ Load (or create) the repository file.

    :param str repository: path to the repository file; defaults to
                           $HOME/.dirac.repo.rep, or CWD when HOME is unset
    """
    self.location = repository
    if not self.location:
      if "HOME" in os.environ:
        self.location = '%s/.dirac.repo.rep' % os.environ['HOME']
      else:
        self.location = '%s/.dirac.repo.rep' % os.getcwd()
    self.repo = CFG()
    if os.path.exists( self.location ):
      self.repo.loadFromFile( self.location )
      if not self.repo.existsKey( 'Jobs' ):
        self.repo.createNewSection( 'Jobs' )
    else:
      self.repo.createNewSection( 'Jobs' )
    # OK reflects whether the initial write-back to disk succeeded
    self.OK = True
    written = self._writeRepository( self.location )
    if not written:
      self.OK = False

  def isOK( self ):
    """ Return True when the repository could be written at construction. """
    return self.OK

  def readRepository( self ):
    """ Return S_OK with the whole 'Jobs' section as a dictionary. """
    return S_OK( self.repo.getAsDict( 'Jobs' ) )

  def writeRepository( self, alternativePath = None ):
    """ Write the repository to disk.

    :param str alternativePath: write there instead of the default location
    :returns: S_OK( destination ) or S_ERROR
    """
    destination = self.location
    if alternativePath:
      destination = alternativePath
    written = self._writeRepository( destination )
    if not written:
      return S_ERROR( "Failed to write repository" )
    return S_OK( destination )

  def resetRepository( self, jobIDs = None ):
    """ Reset jobs to the 'Submitted' state.

    :param list jobIDs: jobIDs to reset; None or empty means every job.
                        Fixed: the default was a mutable list ([]), which is
                        a shared-state hazard; None keeps behaviour identical.
    """
    if not jobIDs:
      jobs = self.readRepository()['Value']
      jobIDs = jobs.keys()
    paramDict = {'State'      : 'Submitted',
                 'Retrieved'  : 0,
                 'OutputData' : 0}
    for jobID in jobIDs:
      self._writeJob( jobID, paramDict, True )
    self._writeRepository( self.location )
    return S_OK()

  def _writeRepository( self, path ):
    """ Atomically replace `path` with the serialized repository.

    Writes to a mkstemp() temporary first, then moves it into place.
    :returns: True on success, False otherwise (the temporary file is kept
              as a backup when the final move fails)
    """
    handle, tmpName = tempfile.mkstemp()
    written = self.repo.writeToFile( tmpName )
    os.close( handle )
    if not written:
      if os.path.exists( tmpName ):
        os.remove( tmpName )
      return written
    if os.path.exists( path ):
      gLogger.debug( "Replacing %s" % path )
    try:
      shutil.move( tmpName, path )
      return True
    except Exception as x:
      gLogger.error( "Failed to overwrite repository.", x )
      gLogger.info( "If your repository is corrupted a backup can be found %s" % tmpName )
      return False

  def appendToRepository( self, repoLocation ):
    """ Merge another repository file into this one and persist the result.

    :param str repoLocation: path to the secondary repository file
    """
    if not os.path.exists( repoLocation ):
      gLogger.error( "Secondary repository does not exist", repoLocation )
      return S_ERROR( "Secondary repository does not exist" )
    # CFG.loadFromFile returns the CFG object itself, so the chain is valid
    self.repo = CFG().loadFromFile( repoLocation ).mergeWith( self.repo )
    self._writeRepository( self.location )
    return S_OK()

  def addJob( self, jobID, state = 'Submitted', retrieved = 0, outputData = 0, update = False ):
    """ Record a new job (or overwrite an existing one when update is True). """
    paramDict = { 'State'      : state,
                  'Time'       : self._getTime(),
                  'Retrieved'  : int( retrieved ),
                  'OutputData' : outputData}
    self._writeJob( jobID, paramDict, update )
    self._writeRepository( self.location )
    return S_OK( jobID )

  def updateJob( self, jobID, paramDict ):
    """ Update an existing job's parameters; silently skips unknown jobs. """
    if self._existsJob( jobID ):
      paramDict['Time'] = self._getTime()
      self._writeJob( jobID, paramDict, True )
      self._writeRepository( self.location )
    return S_OK()

  def updateJobs( self, jobDict ):
    """ Update several jobs at once; the file is written a single time. """
    for jobID, paramDict in jobDict.items():
      if self._existsJob( jobID ):
        paramDict['Time'] = self._getTime()
        self._writeJob( jobID, paramDict, True )
    self._writeRepository( self.location )
    return S_OK()

  def _getTime( self ):
    """ Current time as a ctime string with spaces replaced by underscores
    (underscores keep the value CFG-option friendly). """
    runtime = time.ctime()
    return runtime.replace( " ", "_" )

  def _writeJob( self, jobID, paramDict, update ):
    """ Write one job's options into the in-memory CFG.

    :param bool update: when False, refuse to overwrite an existing job
    """
    jobID = str( jobID )
    jobExists = self._existsJob( jobID )
    if jobExists and ( not update ):
      gLogger.warn( "Job exists and not overwriting" )
      return S_ERROR( "Job exists and not overwriting" )
    if not jobExists:
      self.repo.createNewSection( 'Jobs/%s' % jobID )
    for key, value in paramDict.items():
      self.repo.setOption( 'Jobs/%s/%s' % ( jobID, key ), value )
    return S_OK()

  def removeJob( self, jobID ):
    """ Delete a job's subsection and persist when something was removed. """
    res = self.repo['Jobs'].deleteKey( str( jobID ) )  #pylint: disable=no-member
    if res:
      self._writeRepository( self.location )
    return S_OK()

  def existsJob( self, jobID ):
    """ Return S_OK( bool ) telling whether the job is recorded. """
    return S_OK( self._existsJob( jobID ) )

  def _existsJob( self, jobID ):
    """ True when a 'Jobs/<jobID>' section exists. """
    return self.repo.isSection( 'Jobs/%s' % jobID )

  def getLocation( self ):
    """ Return S_OK with the repository file path. """
    return S_OK( self.location )

  def getSize( self ):
    """ Return S_OK with the number of recorded jobs. """
    return S_OK( len( self.repo.getAsDict( 'Jobs' ) ) )
def checkAgentOptions(getOptionMock, systemName, agentName, ignoreOptions=None, extension='DIRAC'):
  """Ensure that all the agent options are properly documented.

  :param getOptionMock: Mock object for agentmodule.get_amOption function
  :param str systemName: name of the **System**
  :param str agentName: name of the **Agent**
  :param list ignoreOptions: list of options to ignore
  :param str extension: name of the DIRAC **Extension** where the Agent comes from
  """
  if ignoreOptions is None:
    ignoreOptions = []
  # Fixed: copy before extending -- the previous code called extend() on the
  # caller's list, leaking the default entries back into it.
  ignoreOptions = list(ignoreOptions)
  # add some options that can be set, see the AgentModule for all of them
  ignoreOptions.extend(['PollingTime', 'Status', 'Enabled', 'MaxCycles', 'LogOutputs',
                        'ControlDirectory', 'shifterProxy'])
  ignoreOptions = list(set(ignoreOptions))
  config = CFG()
  LOG.info("Testing %s/%s, ignoring options %s", systemName, agentName, ignoreOptions)

  # get the location where DIRAC is in from basefolder/DIRAC/__ini__.py
  configFilePath = os.path.join(os.path.dirname(os.path.dirname(DIRAC.__file__)),
                                extension, systemName, 'ConfigTemplate.cfg')
  config.loadFromFile(configFilePath)
  optionsDict = config.getAsDict('Agents/%s' % agentName)
  # Flatten the nested section dict into "Section/Option" keys
  outDict = {}
  _parseOption(outDict, optionsDict)
  optionsDict = outDict
  LOG.info("Calls: %s", pformat(getOptionMock.call_args_list))
  LOG.info("Options found in ConfigTemplate: %s ", list(optionsDict.keys()))

  # check that values in ConfigTemplate are used
  # items() instead of iteritems(): same iteration, also valid on Python 3
  for option, value in optionsDict.items():
    if any(ignoreOp in option for ignoreOp in ignoreOptions):
      LOG.info("From Agent: ignoring option %r with value %r, (%s)", option, value, type(value))
      continue
    LOG.info("Looking for call to option %r with value %r, (%s)", option, value, type(value))
    if not isinstance(value, bool) and not value:  # empty string, list, dict ...
      # an "empty" template value may be requested with any empty default
      assert any(call(option, null) in getOptionMock.call_args_list
                 for null in ({}, set(), [], '', 0))
    else:
      # the agent may request the value either bare or wrapped in a list
      assert call(option, value) in getOptionMock.call_args_list or \
          call(option, [value]) in getOptionMock.call_args_list

  # check that options used in the agent are in the ConfigTemplates
  for opCall in getOptionMock.call_args_list:
    optionArguments = opCall[0]
    if len(optionArguments) != 2:
      continue
    optionName = optionArguments[0]
    optionValue = optionArguments[1]
    if optionName in ignoreOptions:
      LOG.info("From Template: ignoring option %r with %r", optionName, optionValue)
      continue
    LOG.info("Checking Template option %r with %r", optionName, optionValue)
    assert optionName in optionsDict
    if not optionsDict[optionName]:
      # empty in the template must mean an empty/falsy default in the agent
      assert not optionValue
      continue
    assert optionsDict[optionName] == optionValue or [optionsDict[optionName]] == optionValue