def _loadWebAppCFGFiles(self):
    """
    Load EiscatWeb/web.cfg definitions

    The module list is built from ``CSGlobals.getCSExtensions()``: "DIRAC" is
    skipped, names not ending in "DIRAC" get that suffix, "WebAppDIRAC" and
    "EiscatWebDIRAC" are left out, and finally "DIRAC" followed by
    "EiscatWebDIRAC" are appended.  The list is walked in reverse order; for
    every module that can be located and that ships a ``WebApp/web.cfg`` file,
    the file is loaded and merged into one accumulated CFG.

    A section carrying an ``AbsoluteDefinition`` option is deleted from the
    accumulated CFG before the merge, and the flag itself is removed from the
    module CFG.

    NOTE(review): the merged CFG is neither returned nor stored anywhere in
    the code visible here -- confirm whether a final load/return step is
    missing.
    """
    exts = []
    for ext in CSGlobals.getCSExtensions():
        if ext == "DIRAC":
            continue
        if not ext.endswith("DIRAC"):
            ext = "%sDIRAC" % ext
        if ext == "WebAppDIRAC":
            continue
        if ext != "EiscatWebDIRAC":
            exts.append(ext)
    exts.append("DIRAC")
    exts.append("EiscatWebDIRAC")
    # Diagnostics go through the logger instead of being printed to stdout
    gLogger.debug("exts in loadWebAppCFGFiles of App.py: %s" % exts)

    webCFG = CFG()
    for modName in reversed(exts):
        try:
            modPath = imp.find_module(modName)[1]
        except ImportError:
            continue
        gLogger.verbose("Found module %s at %s" % (modName, modPath))
        cfgPath = os.path.join(modPath, "WebApp", "web.cfg")
        gLogger.debug("cfgPath: %s" % cfgPath)
        if not os.path.isfile(cfgPath):
            gLogger.verbose("Inexistant %s" % cfgPath)
            continue
        try:
            modCFG = CFG().loadFromFile(cfgPath)
        except Exception as excp:
            gLogger.error("Could not load %s: %s" % (cfgPath, excp))
            continue
        gLogger.verbose("Loaded %s" % cfgPath)

        # Breadth-first walk over the sections below the base path
        expl = [Conf.BASECS]
        while expl:
            current = expl.pop(0)
            if not modCFG.isSection(current):
                continue
            if modCFG.getOption("%s/AbsoluteDefinition" % current, False):
                gLogger.verbose("%s:%s is an absolute definition" % (modName, current))
                try:
                    webCFG.deleteKey(current)
                except Exception as excp:
                    # Best effort: the section may simply not exist yet in the merged CFG
                    gLogger.debug("Could not delete %s: %s" % (current, excp))
                modCFG.deleteKey("%s/AbsoluteDefinition" % current)
            else:
                for sec in modCFG[current].listSections():
                    expl.append("%s/%s" % (current, sec))
        # Add the modCFG
        webCFG = webCFG.mergeWith(modCFG)
    gLogger.debug("al final webCFG: %s" % webCFG)
def getComputingElementDefaults(ceName='', ceType='', cfg=None, currentSectionPath=''):
    """
    Return cfgDefaults with defaults for the given CEs defined either in arguments or in the provided cfg

    :param ceName: name of a CE section to create (if missing) and fill from the arguments
    :param ceType: value stored as the 'CEType' option of the ``ceName`` section
    :param cfg: path of a cfg file to load the CE definitions from
    :param currentSectionPath: path handed to ``__getExtraOptions`` to collect extra options
                               for the ``ceName`` section
    :return: CFG with one section per CE; an empty CFG if the file could not be processed
    """
    cesCfg = CFG()
    if cfg:
        try:
            cesCfg.loadFromFile(cfg)
            cesPath = cfgInstallPath('ComputingElements')
            if cesCfg.isSection(cesPath):
                # Descend section by section down to the ComputingElements section
                for section in cfgPathToList(cesPath):
                    cesCfg = cesCfg[section]
        except Exception:
            # Best effort: an unreadable/invalid file yields an empty CFG, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            return CFG()

    # Overwrite the cfg with Command line arguments
    if ceName:
        if not cesCfg.isSection(ceName):
            cesCfg.createNewSection(ceName)
        if currentSectionPath:
            # Add Options from Command Line
            optionsDict = __getExtraOptions(currentSectionPath)
            for name, value in optionsDict.items():
                cesCfg[ceName].setOption(name, value)  # pylint: disable=no-member
        if ceType:
            cesCfg[ceName].setOption('CEType', ceType)  # pylint: disable=no-member

    ceDefaultSection = cfgPath(defaultSection('ComputingElements'))
    # Load Default for the given type from Central configuration is defined
    ceDefaults = __gConfigDefaults(ceDefaultSection)
    # Distinct loop names so the ceName/ceType arguments are not shadowed
    for sectionName in cesCfg.listSections():
        ceSection = cesCfg[sectionName]
        if 'CEType' in ceSection:
            sectionType = ceSection['CEType']
            if sectionType in ceDefaults:
                for option in ceDefaults[sectionType].listOptions():
                    if option not in ceSection:
                        ceSection.setOption(option, ceDefaults[sectionType][option])

    return cesCfg
def getComputingElementDefaults(ceName='', ceType='', cfg=None, currentSectionPath=''):
    """
    Return cfgDefaults with defaults for the given CEs defined either in arguments or in the provided cfg

    :param ceName: name of a CE section to create (if missing) and fill from the arguments
    :param ceType: value stored as the 'CEType' option of the ``ceName`` section
    :param cfg: path of a cfg file to load the CE definitions from
    :param currentSectionPath: path handed to ``__getExtraOptions`` to collect extra options
                               for the ``ceName`` section
    :return: CFG with one section per CE; an empty CFG if the file could not be processed
    """
    cesCfg = CFG()
    if cfg:
        try:
            cesCfg.loadFromFile(cfg)
            cesPath = cfgInstallPath('ComputingElements')
            if cesCfg.isSection(cesPath):
                # Descend section by section down to the ComputingElements section
                for section in cfgPathToList(cesPath):
                    cesCfg = cesCfg[section]
        except Exception:  # pylint: disable=broad-except
            # Exception instead of BaseException: a failed load still yields an
            # empty CFG, but KeyboardInterrupt/SystemExit propagate.
            return CFG()

    # Overwrite the cfg with Command line arguments
    if ceName:
        if not cesCfg.isSection(ceName):
            cesCfg.createNewSection(ceName)
        if currentSectionPath:
            # Add Options from Command Line
            optionsDict = __getExtraOptions(currentSectionPath)
            for name, value in optionsDict.items():
                cesCfg[ceName].setOption(name, value)  # pylint: disable=no-member
        if ceType:
            cesCfg[ceName].setOption('CEType', ceType)  # pylint: disable=no-member

    ceDefaultSection = cfgPath(defaultSection('ComputingElements'))
    # Load Default for the given type from Central configuration is defined
    ceDefaults = __gConfigDefaults(ceDefaultSection)
    # Distinct loop names so the ceName/ceType arguments are not shadowed
    for sectionName in cesCfg.listSections():
        if 'CEType' not in cesCfg[sectionName]:
            continue
        sectionType = cesCfg[sectionName]['CEType']
        if sectionType not in ceDefaults:
            continue
        for option in ceDefaults[sectionType].listOptions():  # pylint: disable=no-member
            if option not in cesCfg[sectionName]:
                cesCfg[sectionName].setOption(
                    option, ceDefaults[sectionType][option])  # pylint: disable=unsubscriptable-object

    return cesCfg
def getComputingElementDefaults(ceName="", ceType="", cfg=None, currentSectionPath=""):
    """
    Return cfgDefaults with defaults for the given CEs defined either in arguments or in the provided cfg

    :param ceName: name of a CE section to create (if missing) and fill from the arguments
    :param ceType: value stored as the "CEType" option of the ``ceName`` section
    :param cfg: path of a cfg file to load the CE definitions from
    :param currentSectionPath: path handed to ``__getExtraOptions`` to collect extra options
                               for the ``ceName`` section
    :return: CFG with one section per CE; an empty CFG if the file could not be processed
    """
    cesCfg = CFG()
    if cfg:
        try:
            cesCfg.loadFromFile(cfg)
            cesPath = cfgInstallPath("ComputingElements")
            if cesCfg.isSection(cesPath):
                # Descend section by section down to the ComputingElements section
                for section in cfgPathToList(cesPath):
                    cesCfg = cesCfg[section]
        except Exception:
            # Best effort: an unreadable/invalid file yields an empty CFG, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            return CFG()

    # Overwrite the cfg with Command line arguments
    if ceName:
        if not cesCfg.isSection(ceName):
            cesCfg.createNewSection(ceName)
        if currentSectionPath:
            # Add Options from Command Line
            optionsDict = __getExtraOptions(currentSectionPath)
            for name, value in optionsDict.items():
                cesCfg[ceName].setOption(name, value)
        if ceType:
            cesCfg[ceName].setOption("CEType", ceType)

    ceDefaultSection = cfgPath(defaultSection("ComputingElements"))
    # Load Default for the given type from Central configuration is defined
    ceDefaults = __gConfigDefaults(ceDefaultSection)
    # Distinct loop names so the ceName/ceType arguments are not shadowed
    for sectionName in cesCfg.listSections():
        ceSection = cesCfg[sectionName]
        if "CEType" not in ceSection:
            continue
        sectionType = ceSection["CEType"]
        if sectionType in ceDefaults:
            typeDefaults = ceDefaults[sectionType]
            for option in typeDefaults.listOptions():
                if option not in ceSection:
                    ceSection.setOption(option, typeDefaults[option])

    return cesCfg
def loadWebAppCFGFiles():
    """
    Load WebApp/web.cfg definitions

    The module list is built from ``CSGlobals.getCSExtensions()``: "DIRAC" is
    skipped, names not ending in "DIRAC" get that suffix, "WebAppDIRAC" is
    left out, and finally "DIRAC" followed by "WebAppDIRAC" are appended.
    The list is walked in reverse order; for every module that can be located
    and that ships a ``WebApp/web.cfg`` file, the file is loaded and merged
    into one accumulated CFG.

    A section carrying an ``AbsoluteDefinition`` option is deleted from the
    accumulated CFG before the merge, and the flag itself is removed from the
    module CFG.

    NOTE(review): the merged CFG is neither returned nor stored anywhere in
    the code visible here -- confirm whether a final load/return step is
    missing.
    """
    exts = []
    for ext in CSGlobals.getCSExtensions():
        if ext == "DIRAC":
            continue
        if not ext.endswith("DIRAC"):
            ext = "%sDIRAC" % ext
        if ext != "WebAppDIRAC":
            exts.append(ext)
    exts.append("DIRAC")
    exts.append("WebAppDIRAC")

    webCFG = CFG()
    for modName in reversed(exts):
        try:
            modPath = imp.find_module(modName)[1]
        except ImportError:
            continue
        gLogger.verbose("Found module %s at %s" % (modName, modPath))
        cfgPath = os.path.join(modPath, "WebApp", "web.cfg")
        if not os.path.isfile(cfgPath):
            gLogger.verbose("Inexistant %s" % cfgPath)
            continue
        try:
            modCFG = CFG().loadFromFile(cfgPath)
        except Exception as excp:
            gLogger.error("Could not load %s: %s" % (cfgPath, excp))
            continue
        gLogger.verbose("Loaded %s" % cfgPath)

        # Breadth-first walk over the sections below the base path
        expl = [BASECS]
        while expl:
            current = expl.pop(0)
            if not modCFG.isSection(current):
                continue
            if modCFG.getOption("%s/AbsoluteDefinition" % current, False):
                gLogger.verbose("%s:%s is an absolute definition" % (modName, current))
                try:
                    webCFG.deleteKey(current)
                except Exception as excp:
                    # Best effort: the section may simply not exist yet in the merged CFG
                    gLogger.verbose("Could not delete %s: %s" % (current, excp))
                modCFG.deleteKey("%s/AbsoluteDefinition" % current)
            else:
                for sec in modCFG[current].listSections():
                    expl.append("%s/%s" % (current, sec))
        # Add the modCFG
        webCFG = webCFG.mergeWith(modCFG)
def _loadWebAppCFGFiles(self, extension):
    """
    Load WebApp/web.cfg definitions

    "WebAppDIRAC" is processed first and ``extension`` second.  For each of
    them ``<destination>/<module>/WebApp/web.cfg`` is loaded (when the file
    exists) and merged into one accumulated CFG.  A section carrying an
    ``AbsoluteDefinition`` option is deleted from the accumulated CFG before
    the merge, and the flag itself is removed from the module CFG.

    NOTE(review): the merged CFG is neither returned nor stored anywhere in
    the code visible here -- confirm whether a final load/return step is
    missing.

    :param str extension: the module name of the extension of WebAppDirac for example: LHCbWebDIRAC
    """
    exts = [extension, "WebAppDIRAC"]
    webCFG = CFG()
    for modName in reversed(exts):
        cfgPath = os.path.join(self.__params.destination, "%s/WebApp" % modName, "web.cfg")
        if not os.path.isfile(cfgPath):
            gLogger.verbose("Web configuration file %s does not exists!" % cfgPath)
            continue
        try:
            modCFG = CFG().loadFromFile(cfgPath)
        except Exception as excp:
            gLogger.error("Could not load %s: %s" % (cfgPath, excp))
            continue
        gLogger.verbose("Loaded %s" % cfgPath)

        # Breadth-first walk over the sections below /WebApp
        expl = ["/WebApp"]
        while expl:
            current = expl.pop(0)
            if not modCFG.isSection(current):
                continue
            if modCFG.getOption("%s/AbsoluteDefinition" % current, False):
                gLogger.verbose("%s:%s is an absolute definition" % (modName, current))
                try:
                    webCFG.deleteKey(current)
                except Exception as excp:
                    # Best effort: the section may simply not exist yet in the merged CFG
                    gLogger.verbose("Could not delete %s: %s" % (current, excp))
                modCFG.deleteKey("%s/AbsoluteDefinition" % current)
            else:
                for sec in modCFG[current].listSections():
                    expl.append("%s/%s" % (current, sec))
        # Add the modCFG
        webCFG = webCFG.mergeWith(modCFG)
def execute(self):
    """The JobAgent execution method.

    One cycle: (after the first job) check the remaining CPU time and publish
    it, check that the CE is available, ask the matcher for a job, validate
    the answer and the JDL parameters, set up the proxy, install software,
    submit the job, and finally refresh the time-left bookkeeping.

    :return: S_OK / S_ERROR structure, or whatever ``self.__finish`` /
             ``self.__rescheduleFailedJob`` return on the early-exit paths
    """
    if self.jobCount:
        # Only call timeLeft utility after a job has been picked up
        self.log.info('Attempting to check CPU time left for filling mode')
        if self.fillingMode:
            if self.timeLeftError:
                self.log.warn(self.timeLeftError)
                return self.__finish(self.timeLeftError)
            self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft))
            # Need to update the Configuration so that the new value is published in the next matching request
            result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
            if not result['OK']:
                return self.__finish(result['Message'])

            # Update local configuration to be used by submitted job wrappers
            localCfg = CFG()
            if self.extraOptions:
                localConfigFile = os.path.join('.', self.extraOptions)
            else:
                localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
            localCfg.loadFromFile(localConfigFile)
            if not localCfg.isSection('/LocalSite'):
                localCfg.createNewSection('/LocalSite')
            localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
            localCfg.writeToFile(localConfigFile)
        else:
            return self.__finish('Filling Mode is Disabled')

    self.log.verbose('Job Agent execution loop')
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
        self.log.info('Resource is not available')
        self.log.info(available['Message'])
        return self.__finish('CE Not Available')

    self.log.info(available['Message'])

    result = self.computingElement.getDescription()
    if not result['OK']:
        return result
    ceDict = result['Value']

    # Add pilot information
    gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
    if gridCE != 'Unknown':
        ceDict['GridCE'] = gridCE
    if 'PilotReference' not in ceDict:
        ceDict['PilotReference'] = str(self.pilotReference)
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict('/AgentJobRequirements')
    if result['OK']:
        requirementsDict = result['Value']
        ceDict.update(requirementsDict)

    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = self.__requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info('MatcherTime = %.2f (s)' % (matchTime))

    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', self.stopAfterFailedMatches)

    if not jobRequest['OK']:
        if re.search('No match found', jobRequest['Message']):
            self.log.notice('Job request OK: %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("seconds timeout") != -1:
            self.log.error(jobRequest['Message'])
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("Pilot version does not match") != -1:
            self.log.error(jobRequest['Message'])
            return S_ERROR(jobRequest['Message'])
        else:
            self.log.notice('Failed to get jobs: %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    jobID = matcherInfo['JobID']
    if not self.pilotInfoReportedFlag:
        # Check the flag after the first access to the Matcher
        self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
        if param not in matcherInfo:
            self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
            return self.__finish('Matcher Failed')
        elif not matcherInfo[param]:
            self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
            return self.__finish('Matcher Failed')
        else:
            self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    # Everything the matcher returned beyond the mandatory parameters
    optimizerParams = {}
    for key in matcherInfo.keys():
        if key not in matcherParams:
            optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters(jobJDL)
    if not parameters['OK']:
        self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
        self.log.warn(parameters['Message'])
        return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
        msg = 'Job has not JobID defined in JDL parameters'
        self.__report(jobID, 'Failed', msg)
        self.log.warn(msg)
        return self.__finish('JDL Problem')
    else:
        jobID = params['JobID']

    if 'JobType' not in params:
        self.log.warn('Job has no JobType defined in JDL parameters')
        jobType = 'Unknown'
    else:
        jobType = params['JobType']

    if 'CPUTime' not in params:
        self.log.warn('Job has no CPU requirement defined in JDL parameters')

    if self.extraOptions:
        # A JDL without Arguments must not abort the cycle with a KeyError
        params['Arguments'] = params.get('Arguments', '') + ' ' + self.extraOptions
        params['ExtraOptions'] = self.extraOptions

    self.log.verbose('Job request successful: \n %s' % (jobRequest['Value']))
    self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
    self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
        jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)

        if 'BOINC_JOB_ID' in os.environ:
            # Report BOINC environment
            for p in ['BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName']:
                jobReport.setJobParameter(p, gConfig.getValue('/LocalSite/%s' % p, 'Unknown'), sendFlag=False)

        jobReport.setJobStatus('Matched', 'Job Received by Agent')
        result = self.__setupProxy(ownerDN, jobGroup)
        if not result['OK']:
            return self.__rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure)
        # Always bind proxyChain: without a returned value the name used to be
        # unbound, and the resulting error was turned into a reschedule below.
        proxyChain = None
        if 'Value' in result and result['Value']:
            proxyChain = result['Value']

        # Save the job jdl for external monitoring
        self.__saveJobJDLRequest(jobID, jobJDL)

        software = self.__checkInstallSoftware(jobID, params, ceDict)
        if not software['OK']:
            self.log.error('Failed to install software for job %s' % (jobID))
            errorMsg = software['Message']
            if not errorMsg:
                errorMsg = 'Failed software installation'
            return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

        self.log.debug('Before %sCE submitJob()' % (self.ceName))
        submission = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain)
        if not submission['OK']:
            self.__report(jobID, 'Failed', submission['Message'])
            return self.__finish(submission['Message'])
        elif 'PayloadFailed' in submission:
            # Do not keep running and do not overwrite the Payload error
            return self.__finish('Payload execution failed with error code %s' % submission['PayloadFailed'],
                                 self.stopOnApplicationFailure)

        self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception:
        self.log.exception()
        return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception',
                                          self.stopOnApplicationFailure)

    # Process times consumed since self.initTimes was recorded
    currentTimes = [now - init for now, init in zip(os.times(), self.initTimes)]
    utime, stime, cutime, cstime, _elapsed = currentTimes
    cpuTime = utime + stime + cutime + cstime

    result = self.timeLeftUtil.getTimeLeft(cpuTime)
    if result['OK']:
        self.timeLeft = result['Value']
    else:
        if result['Message'] != 'Current batch system is not supported':
            self.timeLeftError = result['Message']
        else:
            if self.cpuFactor:
                # if the batch system is not defined used the CPUNormalizationFactor
                # defined locally
                self.timeLeft = self.__getCPUTimeLeft()
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

    self.__setJobParam(jobID, 'ScaledCPUTime', str(scaledCPUTime - self.scaledCPUTime))
    self.scaledCPUTime = scaledCPUTime

    return S_OK('Job Agent cycle complete')
if cFile: localConfigFile = cFile else: print "WORKSPACE: %s" % os.path.expandvars('$WORKSPACE') if os.path.isfile( os.path.expandvars('$WORKSPACE')+'/PilotInstallDIR/etc/dirac.cfg' ): localConfigFile = os.path.expandvars('$WORKSPACE')+'/PilotInstallDIR/etc/dirac.cfg' elif os.path.isfile( os.path.expandvars('$WORKSPACE')+'/ServerInstallDIR/etc/dirac.cfg' ): localConfigFile = os.path.expandvars('$WORKSPACE')+'/ServerInstallDIR/etc/dirac.cfg' elif os.path.isfile( './etc/dirac.cfg' ): localConfigFile = './etc/dirac.cfg' else: print "Local CFG file not found" exit( 2 ) localCfg.loadFromFile( localConfigFile ) if not localCfg.isSection( '/LocalSite' ): localCfg.createNewSection( '/LocalSite' ) localCfg.setOption( '/LocalSite/CPUTimeLeft', 5000 ) localCfg.setOption( '/DIRAC/Security/UseServerCertificate', False ) if not sMod: if not setup: setup = gConfig.getValue('/DIRAC/Setup') if not setup: setup = 'JenkinsSetup' if not vo: vo = gConfig.getValue('/DIRAC/VirtualOrganization') if not vo: vo = 'dirac' if not localCfg.isSection( '/DIRAC/VOPolicy' ):
class JobRepository(object):
    """Job repository persisted as a CFG file with one section per job below 'Jobs'."""

    def __init__(self, repository=None):
        """Load (or create) the repository and write it back once.

        :param repository: path of the repository file; when empty,
                           ``.dirac.repo.rep`` in $HOME is used, or in the
                           current directory if HOME is not set
        """
        self.location = repository
        if not self.location:
            if "HOME" in os.environ:
                self.location = '%s/.dirac.repo.rep' % os.environ['HOME']
            else:
                self.location = '%s/.dirac.repo.rep' % os.getcwd()
        self.repo = CFG()
        if os.path.exists(self.location):
            self.repo.loadFromFile(self.location)
            if not self.repo.existsKey('Jobs'):
                self.repo.createNewSection('Jobs')
        else:
            self.repo.createNewSection('Jobs')
        # OK reflects whether the initial write succeeded
        self.OK = True
        written = self._writeRepository(self.location)
        if not written:
            self.OK = False

    def isOK(self):
        """Return the flag set by the constructor's initial write."""
        return self.OK

    def readRepository(self):
        """Return S_OK with the 'Jobs' section as a dictionary."""
        return S_OK(self.repo.getAsDict('Jobs'))

    def writeRepository(self, alternativePath=None):
        """Write the repository to ``alternativePath`` or, if not given, to its own location.

        :return: S_OK(destination) or S_ERROR if the write failed
        """
        destination = self.location
        if alternativePath:
            destination = alternativePath
        written = self._writeRepository(destination)
        if not written:
            return S_ERROR("Failed to write repository")
        return S_OK(destination)

    def resetRepository(self, jobIDs=None):
        """Reset State/Retrieved/OutputData of the given jobs (all jobs when none are given).

        :param jobIDs: iterable of job IDs; ``None`` or empty means every job in the repository
        """
        if not jobIDs:
            jobs = self.readRepository()['Value']
            jobIDs = list(jobs)
        paramDict = {'State': 'Submitted',
                     'Retrieved': 0,
                     'OutputData': 0}
        for jobID in jobIDs:
            self._writeJob(jobID, paramDict, True)
        self._writeRepository(self.location)
        return S_OK()

    def _writeRepository(self, path):
        """Write the CFG to a temporary file and move it onto ``path``.

        :return: True on success; the falsy writeToFile result or False on failure
        """
        handle, tmpName = tempfile.mkstemp()
        try:
            written = self.repo.writeToFile(tmpName)
        finally:
            # Close the descriptor even if writeToFile raises, so it cannot leak
            os.close(handle)
        if not written:
            if os.path.exists(tmpName):
                os.remove(tmpName)
            return written
        if os.path.exists(path):
            gLogger.debug("Replacing %s" % path)
        try:
            shutil.move(tmpName, path)
            return True
        except Exception as x:
            gLogger.error("Failed to overwrite repository.", x)
            gLogger.info("If your repository is corrupted a backup can be found %s" % tmpName)
            return False

    def appendToRepository(self, repoLocation):
        """Merge the current content into the repository loaded from ``repoLocation`` and save it."""
        if not os.path.exists(repoLocation):
            gLogger.error("Secondary repository does not exist", repoLocation)
            return S_ERROR("Secondary repository does not exist")
        self.repo = CFG().loadFromFile(repoLocation).mergeWith(self.repo)
        self._writeRepository(self.location)
        return S_OK()

    def addJob(self, jobID, state='Submitted', retrieved=0, outputData=0, update=False):
        """Add a job entry and save the repository.

        NOTE(review): the result of ``_writeJob`` is ignored, so S_OK(jobID) is
        returned even when the job exists and ``update`` is False -- confirm
        this is intended.
        """
        paramDict = {'State': state,
                     'Time': self._getTime(),
                     'Retrieved': int(retrieved),
                     'OutputData': outputData}
        self._writeJob(jobID, paramDict, update)
        self._writeRepository(self.location)
        return S_OK(jobID)

    def updateJob(self, jobID, paramDict):
        """Update an existing job with ``paramDict`` (a 'Time' entry is added to it) and save."""
        if self._existsJob(jobID):
            paramDict['Time'] = self._getTime()
            self._writeJob(jobID, paramDict, True)
            self._writeRepository(self.location)
        return S_OK()

    def updateJobs(self, jobDict):
        """Update every existing job of ``jobDict`` ({jobID: paramDict}) and save once at the end."""
        for jobID, paramDict in jobDict.items():
            if self._existsJob(jobID):
                paramDict['Time'] = self._getTime()
                self._writeJob(jobID, paramDict, True)
        self._writeRepository(self.location)
        return S_OK()

    def _getTime(self):
        """Return ``time.ctime()`` with spaces replaced by underscores."""
        runtime = time.ctime()
        return runtime.replace(" ", "_")

    def _writeJob(self, jobID, paramDict, update):
        """Set the options of a job section, creating the section if needed.

        :return: S_ERROR if the job exists and ``update`` is false, S_OK otherwise
        """
        jobID = str(jobID)
        jobExists = self._existsJob(jobID)
        if jobExists and (not update):
            gLogger.warn("Job exists and not overwriting")
            return S_ERROR("Job exists and not overwriting")
        if not jobExists:
            self.repo.createNewSection('Jobs/%s' % jobID)
        for key, value in paramDict.items():
            self.repo.setOption('Jobs/%s/%s' % (jobID, key), value)
        return S_OK()

    def removeJob(self, jobID):
        """Delete the job section; the repository is saved only if the deletion reported success."""
        res = self.repo['Jobs'].deleteKey(str(jobID))  # pylint: disable=no-member
        if res:
            self._writeRepository(self.location)
        return S_OK()

    def existsJob(self, jobID):
        """Return S_OK(bool) telling whether the job has a section."""
        return S_OK(self._existsJob(jobID))

    def _existsJob(self, jobID):
        """Return whether 'Jobs/<jobID>' is a section of the repository."""
        return self.repo.isSection('Jobs/%s' % jobID)

    def getLocation(self):
        """Return S_OK with the repository file path."""
        return S_OK(self.location)

    def getSize(self):
        """Return S_OK with the number of entries in the 'Jobs' section."""
        return S_OK(len(self.repo.getAsDict('Jobs')))
def execute(self):
    """The JobAgent execution method.

    One cycle: (after the first job) honour the drain file, check the
    remaining CPU time and publish it, check the CE for free slots, ask the
    matcher for a job using each CE description in turn, validate the answer
    and the JDL parameters, set up the proxy, install software, submit the
    job, and finally refresh the time-left value.

    :return: S_OK / S_ERROR structure, or whatever ``self.__finish`` /
             ``self._rescheduleFailedJob`` return on the early-exit paths
    """
    if self.jobCount:
        # Temporary mechanism to pass a shutdown message to the agent
        if os.path.exists('/var/lib/dirac_drain'):
            return self.__finish('Node is being drained by an operator')
        # Only call timeLeft utility after a job has been picked up
        self.log.info('Attempting to check CPU time left for filling mode')
        if self.fillingMode:
            if self.timeLeftError:
                self.log.warn("Disabling filling mode as errors calculating time left", self.timeLeftError)
                return self.__finish(self.timeLeftError)
            self.log.info('normalized CPU units remaining in slot', self.timeLeft)
            if self.timeLeft <= self.minimumTimeLeft:
                return self.__finish('No more time left')
            # Need to update the Configuration so that the new value is published in the next matching request
            result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
            if not result['OK']:
                return self.__finish(result['Message'])

            # Update local configuration to be used by submitted job wrappers
            localCfg = CFG()
            if self.extraOptions:
                localConfigFile = os.path.join('.', self.extraOptions)
            else:
                localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
            localCfg.loadFromFile(localConfigFile)
            if not localCfg.isSection('/LocalSite'):
                localCfg.createNewSection('/LocalSite')
            localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
            localCfg.writeToFile(localConfigFile)
        else:
            return self.__finish('Filling Mode is Disabled')

    self.log.verbose('Job Agent execution loop')
    result = self.computingElement.available()
    if not result['OK']:
        self.log.info('Resource is not available', result['Message'])
        return self.__finish('CE Not Available')

    ceInfoDict = result['CEInfoDict']
    runningJobs = ceInfoDict.get("RunningJobs")
    availableSlots = result['Value']

    if not availableSlots:
        if runningJobs:
            self.log.info('No available slots', '%d running jobs' % runningJobs)
            return S_OK('Job Agent cycle complete with %d running jobs' % runningJobs)
        else:
            self.log.info('CE is not available')
            return self.__finish('CE Not Available')

    result = self.computingElement.getDescription()
    if not result['OK']:
        return result

    # We can have several prioritized job retrieval strategies
    ceDescription = result['Value']
    if isinstance(ceDescription, dict):
        ceDictList = [ceDescription]
    elif isinstance(ceDescription, list):
        # This is the case for Pool ComputingElement, and parameter 'MultiProcessorStrategy'
        ceDictList = ceDescription
    else:
        ceDictList = []
    if not ceDictList:
        # Without at least one description no matcher request can be made;
        # previously this ended in an unbound-variable error further down.
        self.log.error('Unexpected CE description', repr(ceDescription))
        return S_ERROR('Unexpected CE description')

    for ceDict in ceDictList:
        # Add pilot information
        gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
        if gridCE != 'Unknown':
            ceDict['GridCE'] = gridCE
        if 'PilotReference' not in ceDict:
            ceDict['PilotReference'] = str(self.pilotReference)
        ceDict['PilotBenchmark'] = self.cpuFactor
        ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict('/AgentJobRequirements')
        if result['OK']:
            requirementsDict = result['Value']
            ceDict.update(requirementsDict)
            self.log.info('Requirements:', requirementsDict)

        self.log.verbose('CE dict', ceDict)

        # here finally calling the matcher
        start = time.time()
        jobRequest = MatcherClient().requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info('MatcherTime', '= %.2f (s)' % (matchTime))
        if jobRequest['OK']:
            # First description that yields a job wins
            break

    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', self.stopAfterFailedMatches)

    if not jobRequest['OK']:
        if re.search('No match found', jobRequest['Message']):
            self.log.notice('Job request OK, but no match found', ': %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("seconds timeout") != -1:
            self.log.error('Timeout while requesting job', jobRequest['Message'])
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("Pilot version does not match") != -1:
            errorMsg = 'Pilot version does not match the production version'
            self.log.error(errorMsg, jobRequest['Message'].replace(errorMsg, ''))
            return S_ERROR(jobRequest['Message'])
        else:
            self.log.notice('Failed to get jobs', ': %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    if not self.pilotInfoReportedFlag:
        # Check the flag after the first access to the Matcher
        self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    jobID = matcherInfo['JobID']
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
        if param not in matcherInfo:
            self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
            return self.__finish('Matcher Failed')
        elif not matcherInfo[param]:
            self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
            return self.__finish('Matcher Failed')
        else:
            self.log.verbose('Matcher returned', '%s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    # Everything the matcher returned beyond the mandatory parameters
    optimizerParams = {}
    for key in matcherInfo:
        if key not in matcherParams:
            optimizerParams[key] = matcherInfo[key]

    parameters = self._getJDLParameters(jobJDL)
    if not parameters['OK']:
        self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
        self.log.warn('Could Not Extract JDL Parameters', parameters['Message'])
        return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
        msg = 'Job has not JobID defined in JDL parameters'
        self.__report(jobID, 'Failed', msg)
        self.log.warn(msg)
        return self.__finish('JDL Problem')
    else:
        jobID = params['JobID']

    if 'JobType' not in params:
        self.log.warn('Job has no JobType defined in JDL parameters')
        jobType = 'Unknown'
    else:
        jobType = params['JobType']

    if 'CPUTime' not in params:
        self.log.warn('Job has no CPU requirement defined in JDL parameters')

    # Job requirements for determining the number of processors
    # the minimum number of processors requested
    processors = int(params.get('NumberOfProcessors', int(params.get('MinNumberOfProcessors', 1))))
    # the maximum number of processors allowed to the payload
    maxNumberOfProcessors = int(params.get('MaxNumberOfProcessors', 0))
    # need or not the whole node for the job
    wholeNode = 'WholeNode' in params
    mpTag = 'MultiProcessor' in params.get('Tags', [])

    if self.extraOptions:
        # A JDL without Arguments must not abort the cycle with a KeyError
        params['Arguments'] = params.get('Arguments', '') + ' ' + self.extraOptions
        params['ExtraOptions'] = self.extraOptions

    self.log.verbose('Job request successful: \n', jobRequest['Value'])
    self.log.info('Received', 'JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s' % (jobID, jobType, ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
        jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)

        if 'BOINC_JOB_ID' in os.environ:
            # Report BOINC environment
            for thisp in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'):
                jobReport.setJobParameter(thisp, gConfig.getValue('/LocalSite/%s' % thisp, 'Unknown'), sendFlag=False)

        jobReport.setJobStatus('Matched', 'Job Received by Agent')
        result = self._setupProxy(ownerDN, jobGroup)
        if not result['OK']:
            return self._rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure)
        proxyChain = result.get('Value')

        # Save the job jdl for external monitoring
        self.__saveJobJDLRequest(jobID, jobJDL)

        # ceDict is the last description tried in the matching loop above
        software = self._checkInstallSoftware(jobID, params, ceDict)
        if not software['OK']:
            self.log.error('Failed to install software for job', '%s' % (jobID))
            errorMsg = software['Message']
            if not errorMsg:
                errorMsg = 'Failed software installation'
            return self._rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

        self.log.debug('Before self._submitJob() (%sCE)' % (self.ceName))
        result = self._submitJob(jobID, params, ceDict, optimizerParams, proxyChain,
                                 processors, wholeNode, maxNumberOfProcessors, mpTag)
        if not result['OK']:
            self.__report(jobID, 'Failed', result['Message'])
            return self.__finish(result['Message'])
        elif 'PayloadFailed' in result:
            # Do not keep running and do not overwrite the Payload error
            message = 'Payload execution failed with error code %s' % result['PayloadFailed']
            if self.stopOnApplicationFailure:
                return self.__finish(message, self.stopOnApplicationFailure)
            else:
                self.log.info(message)

        self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception as subExcept:  # pylint: disable=broad-except
        self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True)
        return self._rescheduleFailedJob(jobID, 'Job processing failed with exception',
                                         self.stopOnApplicationFailure)

    # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
    cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

    result = self.timeLeftUtil.getTimeLeft(cpuTime, processors)
    if result['OK']:
        self.timeLeft = result['Value']
    else:
        if result['Message'] != 'Current batch system is not supported':
            self.timeLeftError = result['Message']
        else:
            # if the batch system is not defined, use the process time and the CPU normalization defined locally
            self.timeLeft = self._getCPUTimeLeft()

    return S_OK('Job Agent cycle complete')
class JobRepository(object):
    """Local, file-backed record of jobs kept as a CFG with a 'Jobs' section.

    Every job is a sub-section ``Jobs/<jobID>`` whose options are the job
    parameters written by this class ('State', 'Time', 'Retrieved',
    'OutputData', ...).  Mutating methods rewrite the repository file.
    """

    def __init__(self, repository=None):
        """Load (or create) the repository file.

        :param repository: path of the repository file; when empty,
                           ``.dirac.repo.rep`` in $HOME is used, or in the
                           current directory if HOME is not set
        """
        self.location = repository
        if not self.location:
            if "HOME" in os.environ:
                self.location = '%s/.dirac.repo.rep' % os.environ['HOME']
            else:
                self.location = '%s/.dirac.repo.rep' % os.getcwd()
        self.repo = CFG()
        if os.path.exists(self.location):
            self.repo.loadFromFile(self.location)
            if not self.repo.existsKey('Jobs'):
                self.repo.createNewSection('Jobs')
        else:
            self.repo.createNewSection('Jobs')
        # The repository is usable only if it can be written back to disk
        self.OK = bool(self._writeRepository(self.location))

    def isOK(self):
        """Return True when the repository could be written at construction."""
        return self.OK

    def readRepository(self):
        """Return S_OK with the content of the 'Jobs' section as a dictionary."""
        return S_OK(self.repo.getAsDict('Jobs'))

    def writeRepository(self, alternativePath=None):
        """Write the repository to its location, or to alternativePath if given.

        :return: S_OK(destination path) or S_ERROR if the write failed
        """
        destination = alternativePath if alternativePath else self.location
        if not self._writeRepository(destination):
            return S_ERROR("Failed to write repository")
        return S_OK(destination)

    def resetRepository(self, jobIDs=None):
        """Put the given jobs (all jobs when none are given) back to 'Submitted'.

        :param jobIDs: iterable of job IDs; None or empty means every job
        """
        # None instead of a mutable [] default; an empty value still means "all"
        if not jobIDs:
            jobIDs = list(self.readRepository()['Value'])
        paramDict = {'State': 'Submitted',
                     'Retrieved': 0,
                     'OutputData': 0}
        for jobID in jobIDs:
            self._writeJob(jobID, paramDict, True)
        self._writeRepository(self.location)
        return S_OK()

    def _writeRepository(self, path):
        """Dump the CFG to a temporary file and move it over path.

        :return: True on success; the falsy result of writeToFile if the dump
                 failed; False if the temporary file could not be moved (the
                 temporary file is then kept as a backup)
        """
        handle, tmpName = tempfile.mkstemp()
        try:
            written = self.repo.writeToFile(tmpName)
        finally:
            # Release the descriptor even when writeToFile raises
            os.close(handle)
        if not written:
            if os.path.exists(tmpName):
                os.remove(tmpName)
            return written
        if os.path.exists(path):
            gLogger.debug("Replacing %s" % path)
        try:
            shutil.move(tmpName, path)
            return True
        except Exception as x:
            gLogger.error("Failed to overwrite repository.", x)
            gLogger.info("If your repository is corrupted a backup can be found %s" % tmpName)
            return False

    def appendToRepository(self, repoLocation):
        """Merge this repository on top of the one stored at repoLocation."""
        if not os.path.exists(repoLocation):
            gLogger.error("Secondary repository does not exist", repoLocation)
            return S_ERROR("Secondary repository does not exist")
        self.repo = CFG().loadFromFile(repoLocation).mergeWith(self.repo)
        self._writeRepository(self.location)
        return S_OK()

    def addJob(self, jobID, state='Submitted', retrieved=0, outputData=0, update=False):
        """Record a job; an existing job is only overwritten when update is set."""
        paramDict = {'State': state,
                     'Time': self._getTime(),
                     'Retrieved': int(retrieved),
                     'OutputData': outputData}
        self._writeJob(jobID, paramDict, update)
        self._writeRepository(self.location)
        return S_OK(jobID)

    def updateJob(self, jobID, paramDict):
        """Update the parameters of a known job; unknown jobs are ignored.

        Note: a 'Time' entry is added to the caller's paramDict.
        """
        if self._existsJob(jobID):
            paramDict['Time'] = self._getTime()
            self._writeJob(jobID, paramDict, True)
            self._writeRepository(self.location)
        return S_OK()

    def updateJobs(self, jobDict):
        """Update several known jobs at once and write the repository once.

        :param jobDict: mapping jobID -> parameter dictionary (each one gets a
                        'Time' entry added)
        """
        for jobID, paramDict in jobDict.items():
            if self._existsJob(jobID):
                paramDict['Time'] = self._getTime()
                self._writeJob(jobID, paramDict, True)
        self._writeRepository(self.location)
        return S_OK()

    def _getTime(self):
        """Return time.ctime() with blanks replaced by underscores."""
        return time.ctime().replace(" ", "_")

    def _writeJob(self, jobID, paramDict, update):
        """Set the options of Jobs/<jobID>, creating the section when needed.

        :return: S_ERROR when the job exists and update is not set, else S_OK
        """
        jobID = str(jobID)
        jobExists = self._existsJob(jobID)
        if jobExists and not update:
            gLogger.warn("Job exists and not overwriting")
            return S_ERROR("Job exists and not overwriting")
        if not jobExists:
            self.repo.createNewSection('Jobs/%s' % jobID)
        for key, value in paramDict.items():
            self.repo.setOption('Jobs/%s/%s' % (jobID, key), value)
        return S_OK()

    def removeJob(self, jobID):
        """Delete a job; the file is rewritten only if something was deleted."""
        res = self.repo['Jobs'].deleteKey(str(jobID))  # pylint: disable=no-member
        if res:
            self._writeRepository(self.location)
        return S_OK()

    def existsJob(self, jobID):
        """Return S_OK(bool) telling whether the job is in the repository."""
        return S_OK(self._existsJob(jobID))

    def _existsJob(self, jobID):
        """Return whether Jobs/<jobID> is a section of the repository."""
        return self.repo.isSection('Jobs/%s' % jobID)

    def getLocation(self):
        """Return S_OK with the path of the repository file."""
        return S_OK(self.location)

    def getSize(self):
        """Return S_OK with the number of entries in the 'Jobs' section."""
        return S_OK(len(self.repo.getAsDict('Jobs')))
def execute(self):
    """Run one JobAgent cycle: request a matching job and submit it to the CE.

    Once a job has already been picked up (``self.jobCount`` is non-zero) the
    cycle first applies the filling-mode checks and stops through
    ``self.__finish`` when filling mode is disabled, when a time-left error
    was recorded, or when ``self.timeLeft`` is not above
    ``self.minimumTimeLeft``.  Otherwise the remaining CPU time is pushed to
    the computing element and written to the local configuration file.

    The cycle then requests a job for the CE description, validates the
    matcher answer and the JDL parameters, sets up the owner proxy, runs the
    software installation check, submits the job, and finally refreshes
    ``self.timeLeft`` and the 'ScaledCPUTime' job parameter.

    :return: S_OK / S_ERROR structure, or the value returned by
             ``self.__finish`` / ``self.__rescheduleFailedJob`` on early exit
    """
    if self.jobCount:
        # Only call timeLeft utility after a job has been picked up
        self.log.info('Attempting to check CPU time left for filling mode')
        if not self.fillingMode:
            return self.__finish('Filling Mode is Disabled')
        if self.timeLeftError:
            self.log.warn(self.timeLeftError)
            return self.__finish(self.timeLeftError)
        self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft))
        if self.timeLeft <= self.minimumTimeLeft:
            return self.__finish('No more time left')
        # Need to update the Configuration so that the new value is published
        # in the next matching request
        result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
        if not result['OK']:
            return self.__finish(result['Message'])

        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
            localConfigFile = os.path.join('.', self.extraOptions)
        else:
            localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
        localCfg.loadFromFile(localConfigFile)
        if not localCfg.isSection('/LocalSite'):
            localCfg.createNewSection('/LocalSite')
        localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
        localCfg.writeToFile(localConfigFile)

    self.log.verbose('Job Agent execution loop')
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
        self.log.info('Resource is not available')
        self.log.info(available['Message'])
        return self.__finish('CE Not Available')
    self.log.info(available['Message'])

    result = self.computingElement.getDescription()
    if not result['OK']:
        return result
    ceDict = result['Value']

    # Add pilot information
    gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
    if gridCE != 'Unknown':
        ceDict['GridCE'] = gridCE
    if 'PilotReference' not in ceDict:
        ceDict['PilotReference'] = str(self.pilotReference)
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict('/AgentJobRequirements')
    if result['OK']:
        requirementsDict = result['Value']
        ceDict.update(requirementsDict)

    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = self.__requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info('MatcherTime = %.2f (s)' % (matchTime))

    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches',
                                                    self.stopAfterFailedMatches)

    if not jobRequest['OK']:
        failure = jobRequest['Message']
        if re.search('No match found', failure):
            self.log.notice('Job request OK: %s' % (failure))
        elif failure.find("seconds timeout") != -1:
            self.log.error('Timeout while requesting job', failure)
        elif failure.find("Pilot version does not match") != -1:
            # A version mismatch is a hard error, not a failed match
            errorMsg = 'Pilot version does not match the production version'
            self.log.error(errorMsg, failure.replace(errorMsg, ''))
            return S_ERROR(failure)
        else:
            self.log.notice('Failed to get jobs: %s' % (failure))
        # Every other failure counts as a cycle without work
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
            return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(failure)

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    if not self.pilotInfoReportedFlag:
        # Check the flag after the first access to the Matcher
        self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    jobID = matcherInfo['JobID']
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
        if param not in matcherInfo:
            self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
            return self.__finish('Matcher Failed')
        elif not matcherInfo[param]:
            self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
            return self.__finish('Matcher Failed')
        else:
            self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    # Everything the matcher returned besides the mandatory parameters
    optimizerParams = {}
    for key in matcherInfo:
        if key not in matcherParams:
            optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters(jobJDL)
    if not parameters['OK']:
        self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
        self.log.warn(parameters['Message'])
        return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
        msg = 'Job has not JobID defined in JDL parameters'
        self.__report(jobID, 'Failed', msg)
        self.log.warn(msg)
        return self.__finish('JDL Problem')
    jobID = params['JobID']

    if 'JobType' not in params:
        self.log.warn('Job has no JobType defined in JDL parameters')
        jobType = 'Unknown'
    else:
        jobType = params['JobType']

    if 'CPUTime' not in params:
        self.log.warn('Job has no CPU requirement defined in JDL parameters')

    if self.extraOptions:
        # The JDL may carry no 'Arguments' at all: do not assume the key exists
        if 'Arguments' in params:
            params['Arguments'] += ' ' + self.extraOptions
        else:
            params['Arguments'] = self.extraOptions
        params['ExtraOptions'] = self.extraOptions

    self.log.verbose('Job request successful: \n', jobRequest['Value'])
    self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
    self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
        jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)

        if 'BOINC_JOB_ID' in os.environ:
            # Report BOINC environment
            for boincOption in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'):
                jobReport.setJobParameter(boincOption,
                                          gConfig.getValue('/LocalSite/%s' % boincOption, 'Unknown'),
                                          sendFlag=False)

        jobReport.setJobStatus('Matched', 'Job Received by Agent')
        result = self.__setupProxy(ownerDN, jobGroup)
        if not result['OK']:
            return self.__rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure)
        proxyChain = result.get('Value')

        # Save the job jdl for external monitoring
        self.__saveJobJDLRequest(jobID, jobJDL)

        software = self.__checkInstallSoftware(jobID, params, ceDict)
        if not software['OK']:
            self.log.error('Failed to install software for job', '%s' % (jobID))
            errorMsg = software['Message']
            if not errorMsg:
                errorMsg = 'Failed software installation'
            return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

        self.log.debug('Before %sCE submitJob()' % (self.ceName))
        submission = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain)
        if not submission['OK']:
            self.__report(jobID, 'Failed', submission['Message'])
            return self.__finish(submission['Message'])
        elif 'PayloadFailed' in submission:
            # Do not keep running and do not overwrite the Payload error
            message = 'Payload execution failed with error code %s' % submission['PayloadFailed']
            if self.stopOnApplicationFailure:
                return self.__finish(message, self.stopOnApplicationFailure)
            self.log.info(message)

        self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception:
        self.log.exception()
        return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception',
                                          self.stopOnApplicationFailure)

    # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
    cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

    result = self.timeLeftUtil.getTimeLeft(cpuTime)
    if result['OK']:
        self.timeLeft = result['Value']
    elif result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
    else:
        # if the batch system is not defined, use the process time and the
        # CPU normalization defined locally
        self.timeLeft = self.__getCPUTimeLeft()

    scaledCPUTime = self.timeLeftUtil.getScaledCPU()
    self.__setJobParam(jobID, 'ScaledCPUTime', str(scaledCPUTime - self.scaledCPUTime))
    self.scaledCPUTime = scaledCPUTime

    return S_OK('Job Agent cycle complete')
def execute(self):
    """Run one JobAgent cycle: request a matching job and submit it to the CE.

    Once a job has already been picked up (``self.jobCount`` is non-zero) the
    cycle stops through ``self.__finish`` when the drain marker file
    ``/var/lib/dirac_drain`` exists, when filling mode is disabled, when a
    time-left error was recorded, or when ``self.timeLeft`` is not above
    ``self.minimumTimeLeft``.  Otherwise the remaining CPU time is pushed to
    the computing element and written to the local configuration file.

    The cycle then checks the CE for available slots, requests a job through
    ``MatcherClient``, validates the matcher answer and the JDL parameters,
    sets up the owner proxy, runs the software installation check, submits
    the job and finally refreshes ``self.timeLeft``.

    :return: S_OK / S_ERROR structure, or the value returned by
             ``self.__finish`` / ``self.__rescheduleFailedJob`` on early exit
    """
    if self.jobCount:
        # Temporary mechanism to pass a shutdown message to the agent
        if os.path.exists('/var/lib/dirac_drain'):
            return self.__finish('Node is being drained by an operator')
        # Only call timeLeft utility after a job has been picked up
        self.log.info('Attempting to check CPU time left for filling mode')
        if not self.fillingMode:
            return self.__finish('Filling Mode is Disabled')
        if self.timeLeftError:
            self.log.warn(self.timeLeftError)
            return self.__finish(self.timeLeftError)
        self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft))
        if self.timeLeft <= self.minimumTimeLeft:
            return self.__finish('No more time left')
        # Need to update the Configuration so that the new value is published
        # in the next matching request
        result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
        if not result['OK']:
            return self.__finish(result['Message'])

        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
            localConfigFile = os.path.join('.', self.extraOptions)
        else:
            localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
        localCfg.loadFromFile(localConfigFile)
        if not localCfg.isSection('/LocalSite'):
            localCfg.createNewSection('/LocalSite')
        localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
        localCfg.writeToFile(localConfigFile)

    self.log.verbose('Job Agent execution loop')
    result = self.computingElement.available()
    if not result['OK']:
        self.log.info('Resource is not available')
        self.log.info(result['Message'])
        return self.__finish('CE Not Available')
    self.log.info(result['Message'])

    ceInfoDict = result['CEInfoDict']
    runningJobs = ceInfoDict.get("RunningJobs")
    availableSlots = result['Value']

    if not availableSlots:
        if runningJobs:
            # Busy but alive: keep cycling until a slot frees up
            self.log.info('No available slots with %d running jobs' % runningJobs)
            return S_OK('Job Agent cycle complete with %d running jobs' % runningJobs)
        self.log.info('CE is not available')
        return self.__finish('CE Not Available')

    result = self.computingElement.getDescription()
    if not result['OK']:
        return result
    ceDict = result['Value']

    # Add pilot information
    gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
    if gridCE != 'Unknown':
        ceDict['GridCE'] = gridCE
    if 'PilotReference' not in ceDict:
        ceDict['PilotReference'] = str(self.pilotReference)
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict('/AgentJobRequirements')
    if result['OK']:
        requirementsDict = result['Value']
        ceDict.update(requirementsDict)
        self.log.info('Requirements:', requirementsDict)

    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = MatcherClient().requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info('MatcherTime = %.2f (s)' % (matchTime))

    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches',
                                                    self.stopAfterFailedMatches)

    if not jobRequest['OK']:
        failure = jobRequest['Message']
        if re.search('No match found', failure):
            self.log.notice('Job request OK: %s' % (failure))
        elif failure.find("seconds timeout") != -1:
            self.log.error('Timeout while requesting job', failure)
        elif failure.find("Pilot version does not match") != -1:
            # A version mismatch is a hard error, not a failed match
            errorMsg = 'Pilot version does not match the production version'
            self.log.error(errorMsg, failure.replace(errorMsg, ''))
            return S_ERROR(failure)
        else:
            self.log.notice('Failed to get jobs: %s' % (failure))
        # Every other failure counts as a cycle without work
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
            return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(failure)

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    if not self.pilotInfoReportedFlag:
        # Check the flag after the first access to the Matcher
        self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    jobID = matcherInfo['JobID']
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
        if param not in matcherInfo:
            self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
            return self.__finish('Matcher Failed')
        elif not matcherInfo[param]:
            self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
            return self.__finish('Matcher Failed')
        else:
            self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    # Everything the matcher returned besides the mandatory parameters
    optimizerParams = {}
    for key in matcherInfo:
        if key not in matcherParams:
            optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters(jobJDL)
    if not parameters['OK']:
        self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
        self.log.warn(parameters['Message'])
        return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
        msg = 'Job has not JobID defined in JDL parameters'
        self.__report(jobID, 'Failed', msg)
        self.log.warn(msg)
        return self.__finish('JDL Problem')
    jobID = params['JobID']

    if 'JobType' not in params:
        self.log.warn('Job has no JobType defined in JDL parameters')
        jobType = 'Unknown'
    else:
        jobType = params['JobType']

    if 'CPUTime' not in params:
        self.log.warn('Job has no CPU requirement defined in JDL parameters')

    # Job requirement for a number of processors
    processors = int(params.get('NumberOfProcessors', 1))
    wholeNode = 'WholeNode' in params

    if self.extraOptions:
        # The JDL may carry no 'Arguments' at all: do not assume the key exists
        if 'Arguments' in params:
            params['Arguments'] += ' ' + self.extraOptions
        else:
            params['Arguments'] = self.extraOptions
        params['ExtraOptions'] = self.extraOptions

    self.log.verbose('Job request successful: \n', jobRequest['Value'])
    self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
    self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
        jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)

        if 'BOINC_JOB_ID' in os.environ:
            # Report BOINC environment
            for thisp in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'):
                jobReport.setJobParameter(thisp,
                                          gConfig.getValue('/LocalSite/%s' % thisp, 'Unknown'),
                                          sendFlag=False)

        jobReport.setJobStatus('Matched', 'Job Received by Agent')
        result = self.__setupProxy(ownerDN, jobGroup)
        if not result['OK']:
            return self.__rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure)
        proxyChain = result.get('Value')

        # Save the job jdl for external monitoring
        self.__saveJobJDLRequest(jobID, jobJDL)

        software = self.__checkInstallSoftware(jobID, params, ceDict)
        if not software['OK']:
            self.log.error('Failed to install software for job', '%s' % (jobID))
            errorMsg = software['Message']
            if not errorMsg:
                errorMsg = 'Failed software installation'
            return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

        self.log.debug('Before %sCE submitJob()' % (self.ceName))
        result = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain,
                                  processors, wholeNode)
        if not result['OK']:
            self.__report(jobID, 'Failed', result['Message'])
            return self.__finish(result['Message'])
        elif 'PayloadFailed' in result:
            # Do not keep running and do not overwrite the Payload error
            message = 'Payload execution failed with error code %s' % result['PayloadFailed']
            if self.stopOnApplicationFailure:
                return self.__finish(message, self.stopOnApplicationFailure)
            self.log.info(message)

        self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception as subExcept:  # pylint: disable=broad-except
        self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True)
        return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception',
                                          self.stopOnApplicationFailure)

    # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
    cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

    result = self.timeLeftUtil.getTimeLeft(cpuTime, processors)
    if result['OK']:
        self.timeLeft = result['Value']
    elif result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
    else:
        # if the batch system is not defined, use the process time and the
        # CPU normalization defined locally
        self.timeLeft = self.__getCPUTimeLeft()

    return S_OK('Job Agent cycle complete')
def execute(self):
    """Run one JobAgent cycle: request a matching job and submit it to the CE.

    Once a job has already been picked up (``self.jobCount`` is non-zero) the
    cycle stops through ``self.__finish`` when filling mode is disabled or a
    time-left error was recorded; otherwise the remaining CPU time is pushed
    to the computing element and written to ``etc/dirac.cfg`` under rootPath.

    The cycle then requests a job for the CE description, validates the
    matcher answer and the JDL parameters (resolving 'SystemConfig' from
    /LocalSite/Architecture when it is missing or set to "ANY"), sets up the
    owner proxy, runs the software installation check, submits the job and
    finally refreshes ``self.timeLeft`` and the 'ScaledCPUTime' job parameter.

    :return: S_OK / S_ERROR structure, or the value returned by
             ``self.__finish`` / ``self.__rescheduleFailedJob`` on early exit
    """
    if self.jobCount:
        # Only call timeLeft utility after a job has been picked up
        self.log.info('Attempting to check CPU time left for filling mode')
        if not self.fillingMode:
            return self.__finish('Filling Mode is Disabled')
        if self.timeLeftError:
            self.log.warn(self.timeLeftError)
            return self.__finish(self.timeLeftError)
        self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft))
        # Need to update the Configuration so that the new value is published
        # in the next matching request
        result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
        if not result['OK']:
            return self.__finish(result['Message'])
        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
        localCfg.loadFromFile(localConfigFile)
        if not localCfg.isSection('/LocalSite'):
            localCfg.createNewSection('/LocalSite')
        localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
        localCfg.writeToFile(localConfigFile)

    self.log.verbose('Job Agent execution loop')
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
        self.log.info('Resource is not available')
        self.log.info(available['Message'])
        return self.__finish('CE Not Available')
    self.log.info(available['Message'])

    result = self.computingElement.getDescription()
    if not result['OK']:
        return result
    ceDict = result['Value']

    # Add pilot information
    gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
    if gridCE != 'Unknown':
        ceDict['GridCE'] = gridCE
    if 'PilotReference' not in ceDict:
        ceDict['PilotReference'] = str(self.pilotReference)
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict('/AgentJobRequirements')
    if result['OK']:
        requirementsDict = result['Value']
        ceDict.update(requirementsDict)

    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = self.__requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info('MatcherTime = %.2f (s)' % (matchTime))

    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches',
                                                    self.stopAfterFailedMatches)

    if not jobRequest['OK']:
        failure = jobRequest['Message']
        if re.search('No match found', failure):
            self.log.notice('Job request OK: %s' % (failure))
        elif failure.find("seconds timeout") != -1:
            self.log.error(failure)
        elif failure.find("Pilot version does not match") != -1:
            # A version mismatch is a hard error, not a failed match
            self.log.error(failure)
            return S_ERROR(failure)
        else:
            self.log.notice('Failed to get jobs: %s' % (failure))
        # Every other failure counts as a cycle without work
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
            return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(failure)

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    jobID = matcherInfo['JobID']
    self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
        if param not in matcherInfo:
            self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
            return self.__finish('Matcher Failed')
        elif not matcherInfo[param]:
            self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
            return self.__finish('Matcher Failed')
        else:
            self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    # Everything the matcher returned besides the mandatory parameters
    optimizerParams = {}
    for key in matcherInfo:
        if key not in matcherParams:
            optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters(jobJDL)
    if not parameters['OK']:
        self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
        self.log.warn(parameters['Message'])
        return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
        msg = 'Job has not JobID defined in JDL parameters'
        self.__report(jobID, 'Failed', msg)
        self.log.warn(msg)
        return self.__finish('JDL Problem')
    jobID = params['JobID']

    if 'JobType' not in params:
        self.log.warn('Job has no JobType defined in JDL parameters')
        jobType = 'Unknown'
    else:
        jobType = params['JobType']

    if 'SystemConfig' not in params:
        self.log.warn('Job has no system configuration defined in JDL parameters')
        systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
        self.log.info('Setting system config to /LocalSite/Architecture = %s since it was not specified' % systemConfig)
        if not systemConfig:
            self.log.warn('/LocalSite/Architecture is not defined')
        params['SystemConfig'] = systemConfig
    else:
        systemConfig = params['SystemConfig']
        if systemConfig.lower() == 'any':
            systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
            self.log.info('Setting SystemConfig = /LocalSite/Architecture =',
                          '"%s" since it was set to "ANY" in the job description' % systemConfig)
            if not systemConfig:
                self.log.warn('/LocalSite/Architecture is not defined')
            params['SystemConfig'] = systemConfig

    if 'CPUTime' not in params:
        self.log.warn('Job has no CPU requirement defined in JDL parameters')

    self.log.verbose('Job request successful: \n %s' % (jobRequest['Value']))
    self.log.info('Received JobID=%s, JobType=%s, SystemConfig=%s' % (jobID, jobType, systemConfig))
    self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
        jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)
        if self.gridCEQueue:
            jobReport.setJobParameter('GridCEQueue', self.gridCEQueue, sendFlag=False)

        if 'BOINC_JOB_ID' in os.environ:
            # Report BOINC environment
            for boincOption in ['BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName']:
                jobReport.setJobParameter(boincOption,
                                          gConfig.getValue('/LocalSite/%s' % boincOption, 'Unknown'),
                                          sendFlag=False)

        jobReport.setJobStatus('Matched', 'Job Received by Agent')
        if not self.pilotInfoReportedFlag:
            self.__reportPilotInfo(jobID)
        result = self.__setupProxy(ownerDN, jobGroup)
        if not result['OK']:
            return self.__rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure)
        # Always bind proxyChain: a successful result without a value used to
        # leave the name undefined and fail later with a NameError
        proxyChain = result.get('Value')

        # Is this necessary at all?
        self.__saveJobJDLRequest(jobID, jobJDL)

        software = self.__checkInstallSoftware(jobID, params, ceDict)
        if not software['OK']:
            self.log.error('Failed to install software for job %s' % (jobID))
            errorMsg = software['Message']
            if not errorMsg:
                errorMsg = 'Failed software installation'
            return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

        self.log.verbose('Before %sCE submitJob()' % (self.ceName))
        submission = self.__submitJob(jobID, params, ceDict, optimizerParams, jobJDL, proxyChain)
        if not submission['OK']:
            self.__report(jobID, 'Failed', submission['Message'])
            return self.__finish(submission['Message'])
        elif 'PayloadFailed' in submission:
            # Do not keep running and do not overwrite the Payload error
            return self.__finish('Payload execution failed with error code %s' % submission['PayloadFailed'],
                                 self.stopOnApplicationFailure)

        self.log.verbose('After %sCE submitJob()' % (self.ceName))
    except Exception:
        self.log.exception()
        return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception',
                                          self.stopOnApplicationFailure)

    # CPU consumed since the initial times were taken: the first four
    # os.times() entries, leaving out the trailing elapsed time
    cpuTime = sum(now - begin for now, begin in zip(os.times()[:4], self.initTimes))

    result = self.timeLeftUtil.getTimeLeft(cpuTime)
    if result['OK']:
        self.timeLeft = result['Value']
    elif result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
    elif self.cpuFactor:
        # if the batch system is not defined used the CPUNormalizationFactor
        # defined locally
        self.timeLeft = self.__getCPUTimeLeft()
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

    self.__setJobParam(jobID, 'ScaledCPUTime', str(scaledCPUTime - self.scaledCPUTime))
    self.scaledCPUTime = scaledCPUTime

    return S_OK('Job Agent cycle complete')
'/PilotInstallDIR/etc/dirac.cfg'): localConfigFile = os.path.expandvars( '$WORKSPACE') + '/PilotInstallDIR/etc/dirac.cfg' elif os.path.isfile( os.path.expandvars('$WORKSPACE') + '/ServerInstallDIR/etc/dirac.cfg'): localConfigFile = os.path.expandvars( '$WORKSPACE') + '/ServerInstallDIR/etc/dirac.cfg' elif os.path.isfile('./etc/dirac.cfg'): localConfigFile = './etc/dirac.cfg' else: print "Local CFG file not found" exit(2) localCfg.loadFromFile(localConfigFile) if not localCfg.isSection('/LocalSite'): localCfg.createNewSection('/LocalSite') localCfg.setOption('/LocalSite/CPUTimeLeft', 5000) localCfg.setOption('/DIRAC/Security/UseServerCertificate', False) if not sMod: if not setup: setup = gConfig.getValue('/DIRAC/Setup') if not setup: setup = 'dirac-JenkinsSetup' if not vo: vo = gConfig.getValue('/DIRAC/VirtualOrganization') if not vo: vo = 'dirac' if not localCfg.isSection('/DIRAC/VOPolicy'):
def execute(self):
    """Run one JobAgent cycle: check the slot, match a job and submit it.

    The cycle goes through the following steps, returning early whenever
    one of them fails:

    1. If a job was already picked up (``self.jobCount``), verify that
       filling mode is enabled and that enough CPU time is left, publish the
       remaining time to the computing element and persist it in the local
       configuration file under ``/LocalSite/CPUTimeLeft``.
    2. Check that the computing element is available and build its
       description (``ceDict``), completed with pilot information and the
       options found under ``/AgentJobRequirements``.
    3. Request a job with that description. Failed requests increment
       ``self.matchFailedCount``; the cycle is finished once the counter
       exceeds ``StopAfterFailedMatches``.
    4. Validate what the matcher returned, extract the JDL parameters, set up
       the proxy, check/install the software and submit the job.
    5. Refresh ``self.timeLeft`` (or ``self.timeLeftError``) and report the
       scaled CPU time consumed since the previous cycle.

    :return: ``S_OK``/``S_ERROR`` results, the failed result of
             ``computingElement.getDescription()``, or whatever
             ``self.__finish`` / ``self.__rescheduleFailedJob`` return on the
             early-exit paths.
    """
    if self.jobCount:
        # Only call timeLeft utility after a job has been picked up
        self.log.info("Attempting to check CPU time left for filling mode")
        if not self.fillingMode:
            return self.__finish("Filling Mode is Disabled")
        if self.timeLeftError:
            self.log.warn(self.timeLeftError)
            return self.__finish(self.timeLeftError)
        self.log.info("%s normalized CPU units remaining in slot" % (self.timeLeft))
        if self.timeLeft <= self.minimumTimeLeft:
            return self.__finish("No more time left")
        # Need to update the Configuration so that the new value is published in the next matching request
        result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
        if not result["OK"]:
            return self.__finish(result["Message"])

        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
            localConfigFile = os.path.join(".", self.extraOptions)
        else:
            localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
        localCfg.loadFromFile(localConfigFile)
        if not localCfg.isSection("/LocalSite"):
            localCfg.createNewSection("/LocalSite")
        localCfg.setOption("/LocalSite/CPUTimeLeft", self.timeLeft)
        localCfg.writeToFile(localConfigFile)

    self.log.verbose("Job Agent execution loop")
    available = self.computingElement.available()
    if not available["OK"] or not available["Value"]:
        self.log.info("Resource is not available")
        self.log.info(available["Message"])
        return self.__finish("CE Not Available")
    self.log.info(available["Message"])

    result = self.computingElement.getDescription()
    if not result["OK"]:
        return result
    ceDict = result["Value"]

    # Add pilot information
    gridCE = gConfig.getValue("LocalSite/GridCE", "Unknown")
    if gridCE != "Unknown":
        ceDict["GridCE"] = gridCE
    if "PilotReference" not in ceDict:
        ceDict["PilotReference"] = str(self.pilotReference)
    ceDict["PilotBenchmark"] = self.cpuFactor
    ceDict["PilotInfoReportedFlag"] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict("/AgentJobRequirements")
    if result["OK"]:
        requirementsDict = result["Value"]
        ceDict.update(requirementsDict)

    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = self.__requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info("MatcherTime = %.2f (s)" % (matchTime))

    self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", self.stopAfterFailedMatches)

    if not jobRequest["OK"]:
        failure = jobRequest["Message"]
        # The branch order matters: a message is classified by the first
        # pattern it matches.
        if re.search("No match found", failure):
            self.log.notice("Job request OK: %s" % (failure))
        elif "seconds timeout" in failure:
            self.log.error("Timeout while requesting job", failure)
        elif "Pilot version does not match" in failure:
            # A version mismatch is returned as an error and is the only
            # failure that does not count towards the failed-match limit.
            errorMsg = "Pilot version does not match the production version"
            self.log.error(errorMsg, failure.replace(errorMsg, ""))
            return S_ERROR(failure)
        else:
            self.log.notice("Failed to get jobs: %s" % (failure))

        # Shared bookkeeping for every failure that is not a version mismatch
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
            return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
        return S_OK(failure)

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest["Value"]
    if not self.pilotInfoReportedFlag:
        # Check the flag after the first access to the Matcher
        self.pilotInfoReportedFlag = matcherInfo.get("PilotInfoReportedFlag", False)
    jobID = matcherInfo["JobID"]

    # These three entries must be present and non-empty in the matcher reply
    matcherParams = ["JDL", "DN", "Group"]
    for param in matcherParams:
        if param not in matcherInfo:
            self.__report(jobID, "Failed", "Matcher did not return %s" % (param))
            return self.__finish("Matcher Failed")
        elif not matcherInfo[param]:
            self.__report(jobID, "Failed", "Matcher returned null %s" % (param))
            return self.__finish("Matcher Failed")
        else:
            self.log.verbose("Matcher returned %s = %s " % (param, matcherInfo[param]))

    jobJDL = matcherInfo["JDL"]
    jobGroup = matcherInfo["Group"]
    ownerDN = matcherInfo["DN"]

    # Everything else returned by the matcher is forwarded to the submission
    optimizerParams = {key: matcherInfo[key] for key in matcherInfo if key not in matcherParams}

    parameters = self.__getJDLParameters(jobJDL)
    if not parameters["OK"]:
        self.__report(jobID, "Failed", "Could Not Extract JDL Parameters")
        self.log.warn(parameters["Message"])
        return self.__finish("JDL Problem")

    params = parameters["Value"]
    if "JobID" not in params:
        msg = "Job has not JobID defined in JDL parameters"
        self.__report(jobID, "Failed", msg)
        self.log.warn(msg)
        return self.__finish("JDL Problem")
    # From here on the JobID found in the JDL is the one that is used
    jobID = params["JobID"]

    if "JobType" in params:
        jobType = params["JobType"]
    else:
        self.log.warn("Job has no JobType defined in JDL parameters")
        jobType = "Unknown"

    if "CPUTime" not in params:
        self.log.warn("Job has no CPU requirement defined in JDL parameters")

    if self.extraOptions:
        # A JDL without "Arguments" must not raise KeyError here (this code
        # runs outside the try block below): in that case the extra options
        # become the only arguments.
        if "Arguments" in params:
            params["Arguments"] += " " + self.extraOptions
        else:
            params["Arguments"] = self.extraOptions
        params["ExtraOptions"] = self.extraOptions

    self.log.verbose("Job request successful: \n", jobRequest["Value"])
    self.log.info("Received JobID=%s, JobType=%s" % (jobID, jobType))
    self.log.info("OwnerDN: %s JobGroup: %s" % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
        jobReport.setJobParameter("MatcherServiceTime", str(matchTime), sendFlag=False)

        if "BOINC_JOB_ID" in os.environ:
            # Report BOINC environment
            for p in ("BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"):
                jobReport.setJobParameter(p, gConfig.getValue("/LocalSite/%s" % p, "Unknown"), sendFlag=False)

        jobReport.setJobStatus("Matched", "Job Received by Agent")

        result = self.__setupProxy(ownerDN, jobGroup)
        if not result["OK"]:
            return self.__rescheduleFailedJob(jobID, result["Message"], self.stopOnApplicationFailure)
        proxyChain = result.get("Value")

        # Save the job jdl for external monitoring
        self.__saveJobJDLRequest(jobID, jobJDL)

        software = self.__checkInstallSoftware(jobID, params, ceDict)
        if not software["OK"]:
            self.log.error("Failed to install software for job", "%s" % (jobID))
            errorMsg = software["Message"]
            if not errorMsg:
                errorMsg = "Failed software installation"
            return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

        self.log.debug("Before %sCE submitJob()" % (self.ceName))
        submission = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain)
        if not submission["OK"]:
            self.__report(jobID, "Failed", submission["Message"])
            return self.__finish(submission["Message"])
        elif "PayloadFailed" in submission:
            # Do not keep running and do not overwrite the Payload error
            message = "Payload execution failed with error code %s" % submission["PayloadFailed"]
            if self.stopOnApplicationFailure:
                return self.__finish(message, self.stopOnApplicationFailure)
            else:
                self.log.info(message)

        self.log.debug("After %sCE submitJob()" % (self.ceName))
    except Exception:
        # Any unexpected error while processing the matched job is logged and
        # the job is handed to the rescheduling routine.
        self.log.exception()
        return self.__rescheduleFailedJob(
            jobID, "Job processing failed with exception", self.stopOnApplicationFailure
        )

    # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
    cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

    result = self.timeLeftUtil.getTimeLeft(cpuTime)
    if result["OK"]:
        self.timeLeft = result["Value"]
    elif result["Message"] != "Current batch system is not supported":
        # Kept for the next cycle, which finishes the agent when it is set
        self.timeLeftError = result["Message"]
    else:
        # if the batch system is not defined, use the process time and the CPU normalization defined locally
        self.timeLeft = self.__getCPUTimeLeft()

    # Report only the scaled CPU consumed since the previous cycle
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()
    self.__setJobParam(jobID, "ScaledCPUTime", str(scaledCPUTime - self.scaledCPUTime))
    self.scaledCPUTime = scaledCPUTime

    return S_OK("Job Agent cycle complete")