def _updateConfiguration(self, key, value, path="/LocalSite"): """Update local configuration to be used by submitted job wrappers""" localCfg = CFG() if self.extraOptions: localConfigFile = os.path.join(".", self.extraOptions) else: localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg") localCfg.loadFromFile(localConfigFile) section = "/" for p in path.split("/")[1:]: section = os.path.join(section, p) if not localCfg.isSection(section): localCfg.createNewSection(section) localCfg.setOption("%s/%s" % (section, key), value) localCfg.writeToFile(localConfigFile)
class JobRepository(object): def __init__(self, repository=None): self.location = repository if not self.location: if "HOME" in os.environ: self.location = '%s/.dirac.repo.rep' % os.environ['HOME'] else: self.location = '%s/.dirac.repo.rep' % os.getcwd() self.repo = CFG() if os.path.exists(self.location): self.repo.loadFromFile(self.location) if not self.repo.existsKey('Jobs'): self.repo.createNewSection('Jobs') else: self.repo.createNewSection('Jobs') self.OK = True written = self._writeRepository(self.location) if not written: self.OK = False def isOK(self): return self.OK def readRepository(self): return S_OK(self.repo.getAsDict('Jobs')) def writeRepository(self, alternativePath=None): destination = self.location if alternativePath: destination = alternativePath written = self._writeRepository(destination) if not written: return S_ERROR("Failed to write repository") return S_OK(destination) def resetRepository(self, jobIDs=[]): if not jobIDs: jobs = self.readRepository()['Value'] jobIDs = list(jobs) paramDict = {'State': 'Submitted', 'Retrieved': 0, 'OutputData': 0} for jobID in jobIDs: self._writeJob(jobID, paramDict, True) self._writeRepository(self.location) return S_OK() def _writeRepository(self, path): handle, tmpName = tempfile.mkstemp() written = self.repo.writeToFile(tmpName) os.close(handle) if not written: if os.path.exists(tmpName): os.remove(tmpName) return written if os.path.exists(path): gLogger.debug("Replacing %s" % path) try: shutil.move(tmpName, path) return True except Exception as x: gLogger.error("Failed to overwrite repository.", x) gLogger.info( "If your repository is corrupted a backup can be found %s" % tmpName) return False def appendToRepository(self, repoLocation): if not os.path.exists(repoLocation): gLogger.error("Secondary repository does not exist", repoLocation) return S_ERROR("Secondary repository does not exist") self.repo = CFG().loadFromFile(repoLocation).mergeWith(self.repo) self._writeRepository(self.location) return S_OK() def addJob(self, jobID, state='Submitted', retrieved=0, outputData=0, update=False): paramDict = { 'State': state, 'Time': self._getTime(), 'Retrieved': int(retrieved), 'OutputData': outputData } self._writeJob(jobID, paramDict, update) self._writeRepository(self.location) return S_OK(jobID) def updateJob(self, jobID, paramDict): if self._existsJob(jobID): paramDict['Time'] = self._getTime() self._writeJob(jobID, paramDict, True) self._writeRepository(self.location) return S_OK() def updateJobs(self, jobDict): for jobID, paramDict in jobDict.items(): if self._existsJob(jobID): paramDict['Time'] = self._getTime() self._writeJob(jobID, paramDict, True) self._writeRepository(self.location) return S_OK() def _getTime(self): runtime = time.ctime() return runtime.replace(" ", "_") def _writeJob(self, jobID, paramDict, update): jobID = str(jobID) jobExists = self._existsJob(jobID) if jobExists and (not update): gLogger.warn("Job exists and not overwriting") return S_ERROR("Job exists and not overwriting") if not jobExists: self.repo.createNewSection('Jobs/%s' % jobID) for key, value in paramDict.items(): self.repo.setOption('Jobs/%s/%s' % (jobID, key), value) return S_OK() def removeJob(self, jobID): res = self.repo['Jobs'].deleteKey(str(jobID)) # pylint: disable=no-member if res: self._writeRepository(self.location) return S_OK() def existsJob(self, jobID): return S_OK(self._existsJob(jobID)) def _existsJob(self, jobID): return self.repo.isSection('Jobs/%s' % jobID) def getLocation(self): return S_OK(self.location) def getSize(self): return S_OK(len(self.repo.getAsDict('Jobs')))
elif os.path.isfile( os.path.expandvars("$WORKSPACE") + "/ServerInstallDIR/etc/dirac.cfg"): localConfigFile = os.path.expandvars( "$WORKSPACE") + "/ServerInstallDIR/etc/dirac.cfg" elif os.path.isfile("./etc/dirac.cfg"): localConfigFile = "./etc/dirac.cfg" else: print("Local CFG file not found") exit(2) localCfg.loadFromFile(localConfigFile) if not localCfg.isSection("/LocalSite"): localCfg.createNewSection("/LocalSite") localCfg.setOption("/LocalSite/CPUTimeLeft", 5000) localCfg.setOption("/DIRAC/Security/UseServerCertificate", False) if not sMod: if not setup: setup = gConfig.getValue("/DIRAC/Setup") if not setup: setup = "dirac-JenkinsSetup" if not localCfg.isSection("/Operations"): localCfg.createNewSection("/Operations") if not localCfg.isSection("/Operations/%s" % setup): localCfg.createNewSection("/Operations/%s" % setup) localCfg.setOption("/Operations/%s/SoftwareDistModule" % setup, "") localCfg.writeToFile(localConfigFile)
def execute(self): """The JobAgent execution method. """ # Temporary mechanism to pass a shutdown message to the agent if os.path.exists('/var/lib/dirac_drain'): return self.__finish('Node is being drained by an operator') # Check if we can match jobs at all self.log.verbose('Job Agent execution loop') result = self.computingElement.available() if not result['OK']: self.log.info('Resource is not available', result['Message']) return self.__finish('CE Not Available') ceInfoDict = result['CEInfoDict'] runningJobs = ceInfoDict.get("RunningJobs") availableSlots = result['Value'] if not availableSlots: if runningJobs: self.log.info('No available slots', ': %d running jobs' % runningJobs) return S_OK('Job Agent cycle complete with %d running jobs' % runningJobs) self.log.info( 'CE is not available (and there are no running jobs)') return self.__finish('CE Not Available') if self.jobCount: # Only call timeLeft utility after a job has been picked up self.log.info('Attempting to check CPU time left for filling mode') if self.fillingMode: self.timeLeft = self.computeCPUWorkLeft() self.log.info('normalized CPU units remaining in slot', self.timeLeft) if self.timeLeft <= self.minimumTimeLeft: return self.__finish('No more time left') # Need to update the Configuration so that the new value is published in the next matching request result = self.computingElement.setCPUTimeLeft( cpuTimeLeft=self.timeLeft) if not result['OK']: return self.__finish(result['Message']) # Update local configuration to be used by submitted job wrappers localCfg = CFG() if self.extraOptions: localConfigFile = os.path.join('.', self.extraOptions) else: localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg") localCfg.loadFromFile(localConfigFile) if not localCfg.isSection('/LocalSite'): localCfg.createNewSection('/LocalSite') localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft) localCfg.writeToFile(localConfigFile) else: return self.__finish('Filling Mode is Disabled') # if we are here we assume that a job can be matched result = self.computingElement.getDescription() if not result['OK']: return result # We can have several prioritized job retrieval strategies if isinstance(result['Value'], dict): ceDictList = [result['Value']] elif isinstance(result['Value'], list): # This is the case for Pool ComputingElement, and parameter 'MultiProcessorStrategy' ceDictList = result['Value'] for ceDict in ceDictList: # Add pilot information gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown') if gridCE != 'Unknown': ceDict['GridCE'] = gridCE if 'PilotReference' not in ceDict: ceDict['PilotReference'] = str(self.pilotReference) ceDict['PilotBenchmark'] = self.cpuFactor ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag # Add possible job requirements result = gConfig.getOptionsDict('/AgentJobRequirements') if result['OK']: requirementsDict = result['Value'] ceDict.update(requirementsDict) self.log.info('Requirements:', requirementsDict) self.log.verbose('CE dict', ceDict) # here finally calling the matcher start = time.time() jobRequest = MatcherClient().requestJob(ceDict) matchTime = time.time() - start self.log.info('MatcherTime', '= %.2f (s)' % (matchTime)) if jobRequest['OK']: break self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches) if not jobRequest['OK']: # if we don't match a job, independently from the reason, # we wait a bit longer before trying again self.am_setOption("PollingTime", int(self.am_getOption("PollingTime") * 1.5)) if re.search('No match found', jobRequest['Message']): self.log.notice('Job request OK, but no match found', ': %s' % (jobRequest['Message'])) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) elif jobRequest['Message'].find("seconds timeout") != -1: self.log.error('Timeout while requesting job', jobRequest['Message']) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) elif jobRequest['Message'].find( "Pilot version does not match") != -1: errorMsg = 'Pilot version does not match the production version' self.log.error(errorMsg, jobRequest['Message'].replace(errorMsg, '')) return S_ERROR(jobRequest['Message']) else: self.log.notice('Failed to get jobs', ': %s' % (jobRequest['Message'])) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) # Reset the Counter self.matchFailedCount = 0 # If we are here it is because we matched a job matcherInfo = jobRequest['Value'] if not self.pilotInfoReportedFlag: # Check the flag after the first access to the Matcher self.pilotInfoReportedFlag = matcherInfo.get( 'PilotInfoReportedFlag', False) jobID = matcherInfo['JobID'] jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName) matcherParams = ['JDL', 'DN', 'Group'] for param in matcherParams: if param not in matcherInfo: jobReport.setJobStatus(status='Failed', minor='Matcher did not return %s' % (param)) return self.__finish('Matcher Failed') elif not matcherInfo[param]: jobReport.setJobStatus(status='Failed', minor='Matcher returned null %s' % (param)) return self.__finish('Matcher Failed') else: self.log.verbose('Matcher returned', '%s = %s ' % (param, matcherInfo[param])) jobJDL = matcherInfo['JDL'] jobGroup = matcherInfo['Group'] ownerDN = matcherInfo['DN'] optimizerParams = {} for key in matcherInfo: if key not in matcherParams: optimizerParams[key] = matcherInfo[key] parameters = self._getJDLParameters(jobJDL) if not parameters['OK']: jobReport.setJobStatus(status='Failed', minor='Could Not Extract JDL Parameters') self.log.warn('Could Not Extract JDL Parameters', parameters['Message']) return self.__finish('JDL Problem') params = parameters['Value'] if 'JobID' not in params: msg = 'Job has not JobID defined in JDL parameters' jobReport.setJobStatus(status='Failed', minor=msg) self.log.warn(msg) return self.__finish('JDL Problem') else: jobID = params['JobID'] if 'JobType' not in params: self.log.warn('Job has no JobType defined in JDL parameters') jobType = 'Unknown' else: jobType = params['JobType'] if 'CPUTime' not in params: self.log.warn( 'Job has no CPU requirement defined in JDL parameters') # Job requirements for determining the number of processors # the minimum number of processors requested processors = int( params.get('NumberOfProcessors', int(params.get('MinNumberOfProcessors', 1)))) # the maximum number of processors allowed to the payload maxNumberOfProcessors = int(params.get('MaxNumberOfProcessors', 0)) # need or not the whole node for the job wholeNode = 'WholeNode' in params mpTag = 'MultiProcessor' in params.get('Tags', []) if self.extraOptions and 'dirac-jobexec' in params.get( 'Executable', '').strip(): params['Arguments'] = (params.get('Arguments', '') + ' ' + self.extraOptions).strip() params['ExtraOptions'] = self.extraOptions self.log.verbose('Job request successful: \n', jobRequest['Value']) self.log.info( 'Received', 'JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s' % (jobID, jobType, ownerDN, jobGroup)) self.jobCount += 1 try: jobReport.setJobParameter(par_name='MatcherServiceTime', par_value=str(matchTime), sendFlag=False) if 'BOINC_JOB_ID' in os.environ: # Report BOINC environment for thisp in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'): jobReport.setJobParameter(par_name=thisp, par_value=gConfig.getValue( '/LocalSite/%s' % thisp, 'Unknown'), sendFlag=False) jobReport.setJobStatus(status='Matched', minor='Job Received by Agent', sendFlag=False) result_setupProxy = self._setupProxy(ownerDN, jobGroup) if not result_setupProxy['OK']: return self._rescheduleFailedJob(jobID, result_setupProxy['Message'], self.stopOnApplicationFailure) proxyChain = result_setupProxy.get('Value') # Save the job jdl for external monitoring self.__saveJobJDLRequest(jobID, jobJDL) software = self._checkInstallSoftware(jobID, params, ceDict, jobReport) if not software['OK']: self.log.error('Failed to install software for job', '%s' % (jobID)) errorMsg = software['Message'] if not errorMsg: errorMsg = 'Failed software installation' return self._rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure) self.log.debug('Before self._submitJob() (%sCE)' % (self.ceName)) result_submitJob = self._submitJob( jobID=jobID, jobParams=params, resourceParams=ceDict, optimizerParams=optimizerParams, proxyChain=proxyChain, jobReport=jobReport, processors=processors, wholeNode=wholeNode, maxNumberOfProcessors=maxNumberOfProcessors, mpTag=mpTag) # Committing the JobReport before evaluating the result of job submission res = jobReport.commit() if not res['OK']: resFD = jobReport.generateForwardDISET() if not resFD['OK']: self.log.error("Error generating ForwardDISET operation", resFD['Message']) else: # Here we create the Request. op = resFD['Value'] request = Request() requestName = 'jobAgent_%s' % jobID request.RequestName = requestName.replace('"', '') request.JobID = jobID request.SourceComponent = "JobAgent_%s" % jobID request.addOperation(op) # This might fail, but only a message would be printed. self._sendFailoverRequest(request) if not result_submitJob['OK']: return self.__finish(result_submitJob['Message']) elif 'PayloadFailed' in result_submitJob: # Do not keep running and do not overwrite the Payload error message = 'Payload execution failed with error code %s' % result_submitJob[ 'PayloadFailed'] if self.stopOnApplicationFailure: return self.__finish(message, self.stopOnApplicationFailure) else: self.log.info(message) self.log.debug('After %sCE submitJob()' % (self.ceName)) except Exception as subExcept: # pylint: disable=broad-except self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True) return self._rescheduleFailedJob( jobID, 'Job processing failed with exception', self.stopOnApplicationFailure) return S_OK('Job Agent cycle complete')