def _matchAJob(self, ceDictList): """Call the Matcher with each ceDict until we get a job""" jobRequest = S_ERROR("No CE Dictionary available") for ceDict in ceDictList: self.log.verbose("CE dict", ceDict) start = time.time() jobRequest = MatcherClient().requestJob(ceDict) matchTime = time.time() - start self.log.info("MatcherTime", "= %.2f (s)" % (matchTime)) if jobRequest["OK"]: jobRequest["Value"]["matchTime"] = matchTime jobRequest["Value"]["CEDict"] = ceDict break return jobRequest
def test_matcher(self): # insert a proper DN to run the test resourceDescription = { "OwnerGroup": "prod", "OwnerDN": "/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser", "DIRACVersion": "pippo", "GridCE": "some.grid.ce.org", "ReleaseVersion": "blabla", "VirtualOrganization": "LHCb", "PilotInfoReportedFlag": "True", "PilotBenchmark": "anotherPilot", "Site": "DIRAC.Jenkins.ch", "CPUTime": 86400, } wmsClient = WMSClient() job = helloWorldJob() job.setDestination("DIRAC.Jenkins.ch") job.setInputData("/a/bbb") job.setType("User") jobDescription = createFile(job) res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription)) self.assertTrue(res["OK"], res.get("Message")) jobID = res["Value"] # forcing the update res = JobStateUpdateClient().setJobStatus(jobID, JobStatus.WAITING, "matching", "source", None, True) self.assertTrue(res["OK"], res.get("Message")) tqDB = TaskQueueDB() tqDefDict = { "OwnerDN": "/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser", "OwnerGroup": "prod", "Setup": "dirac-JenkinsSetup", "CPUTime": 86400, } res = tqDB.insertJob(jobID, tqDefDict, 10) self.assertTrue(res["OK"], res.get("Message")) res = MatcherClient().requestJob(resourceDescription) print(res) self.assertTrue(res["OK"], res.get("Message")) wmsClient.deleteJob(jobID)
def test_matcher(self): # insert a proper DN to run the test resourceDescription = { 'OwnerGroup': 'prod', 'OwnerDN': '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]', 'DIRACVersion': 'pippo', 'ReleaseVersion': 'blabla', 'VirtualOrganization': 'LHCb', 'PilotInfoReportedFlag': 'True', 'PilotBenchmark': 'anotherPilot', 'Site': 'DIRAC.Jenkins.ch', 'CPUTime': 86400 } wmsClient = WMSClient() job = helloWorldJob() job.setDestination('DIRAC.Jenkins.ch') job.setInputData('/a/bbb') job.setType('User') jobDescription = createFile(job) res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription)) self.assertTrue(res['OK']) jobID = res['Value'] res = JobStateUpdateClient().setJobStatus(jobID, 'Waiting', 'matching', 'source') self.assertTrue(res['OK']) tqDB = TaskQueueDB() tqDefDict = { 'OwnerDN': '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]', 'OwnerGroup': 'prod', 'Setup': 'dirac-JenkinsSetup', 'CPUTime': 86400 } res = tqDB.insertJob(jobID, tqDefDict, 10) self.assertTrue(res['OK']) res = MatcherClient().requestJob(resourceDescription) print(res) self.assertTrue(res['OK']) wmsClient.deleteJob(jobID)
return S_OK() Script.registerSwitch("v", "verbose", "give max details about task queues", setVerbose) Script.registerSwitch("t:", "taskQueue=", "show this task queue only", setTaskQueueID) Script.setUsageMessage('\n'.join([ __doc__.split('\n')[1], 'Usage:', ' %s [option|cfgfile] ' % Script.scriptName ])) Script.parseCommandLine(initializeMonitor=False) result = MatcherClient().getActiveTaskQueues() if not result['OK']: gLogger.error(result['Message']) sys.exit(1) tqDict = result['Value'] if not verbose: fields = [ 'TaskQueue', 'Jobs', 'CPUTime', 'Owner', 'OwnerGroup', 'Sites', 'Platforms', 'SubmitPools', 'Setup', 'Priority' ] records = [] for tqId in sorted(tqDict): if taskQueueID and tqId != taskQueueID:
def execute(self): """The JobAgent execution method. """ if self.jobCount: # Temporary mechanism to pass a shutdown message to the agent if os.path.exists('/var/lib/dirac_drain'): return self.__finish('Node is being drained by an operator') # Only call timeLeft utility after a job has been picked up self.log.info('Attempting to check CPU time left for filling mode') if self.fillingMode: if self.timeLeftError: self.log.warn( "Disabling filling mode as errors calculating time left", self.timeLeftError) return self.__finish(self.timeLeftError) self.log.info('normalized CPU units remaining in slot', self.timeLeft) if self.timeLeft <= self.minimumTimeLeft: return self.__finish('No more time left') # Need to update the Configuration so that the new value is published in the next matching request result = self.computingElement.setCPUTimeLeft( cpuTimeLeft=self.timeLeft) if not result['OK']: return self.__finish(result['Message']) # Update local configuration to be used by submitted job wrappers localCfg = CFG() if self.extraOptions: localConfigFile = os.path.join('.', self.extraOptions) else: localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg") localCfg.loadFromFile(localConfigFile) if not localCfg.isSection('/LocalSite'): localCfg.createNewSection('/LocalSite') localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft) localCfg.writeToFile(localConfigFile) else: return self.__finish('Filling Mode is Disabled') self.log.verbose('Job Agent execution loop') result = self.computingElement.available() if not result['OK']: self.log.info('Resource is not available', result['Message']) return self.__finish('CE Not Available') ceInfoDict = result['CEInfoDict'] runningJobs = ceInfoDict.get("RunningJobs") availableSlots = result['Value'] if not availableSlots: if runningJobs: self.log.info('No available slots', '%d running jobs' % runningJobs) return S_OK('Job Agent cycle complete with %d running jobs' % runningJobs) else: self.log.info('CE is not available') return self.__finish('CE Not Available') result = self.computingElement.getDescription() if not result['OK']: return result # We can have several prioritized job retrieval strategies if isinstance(result['Value'], dict): ceDictList = [result['Value']] elif isinstance(result['Value'], list): # This is the case for Pool ComputingElement, and parameter 'MultiProcessorStrategy' ceDictList = result['Value'] for ceDict in ceDictList: # Add pilot information gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown') if gridCE != 'Unknown': ceDict['GridCE'] = gridCE if 'PilotReference' not in ceDict: ceDict['PilotReference'] = str(self.pilotReference) ceDict['PilotBenchmark'] = self.cpuFactor ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag # Add possible job requirements result = gConfig.getOptionsDict('/AgentJobRequirements') if result['OK']: requirementsDict = result['Value'] ceDict.update(requirementsDict) self.log.info('Requirements:', requirementsDict) self.log.verbose('CE dict', ceDict) # here finally calling the matcher start = time.time() jobRequest = MatcherClient().requestJob(ceDict) matchTime = time.time() - start self.log.info('MatcherTime', '= %.2f (s)' % (matchTime)) if jobRequest['OK']: break self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches) if not jobRequest['OK']: if re.search('No match found', jobRequest['Message']): self.log.notice('Job request OK, but no match found', ': %s' % (jobRequest['Message'])) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) elif jobRequest['Message'].find("seconds timeout") != -1: self.log.error('Timeout while requesting job', jobRequest['Message']) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) elif jobRequest['Message'].find( "Pilot version does not match") != -1: errorMsg = 'Pilot version does not match the production version' self.log.error(errorMsg, jobRequest['Message'].replace(errorMsg, '')) return S_ERROR(jobRequest['Message']) else: self.log.notice('Failed to get jobs', ': %s' % (jobRequest['Message'])) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) # Reset the Counter self.matchFailedCount = 0 matcherInfo = jobRequest['Value'] if not self.pilotInfoReportedFlag: # Check the flag after the first access to the Matcher self.pilotInfoReportedFlag = matcherInfo.get( 'PilotInfoReportedFlag', False) jobID = matcherInfo['JobID'] matcherParams = ['JDL', 'DN', 'Group'] for param in matcherParams: if param not in matcherInfo: self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param)) return self.__finish('Matcher Failed') elif not matcherInfo[param]: self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param)) return self.__finish('Matcher Failed') else: self.log.verbose('Matcher returned', '%s = %s ' % (param, matcherInfo[param])) jobJDL = matcherInfo['JDL'] jobGroup = matcherInfo['Group'] ownerDN = matcherInfo['DN'] optimizerParams = {} for key in matcherInfo: if key not in matcherParams: optimizerParams[key] = matcherInfo[key] parameters = self._getJDLParameters(jobJDL) if not parameters['OK']: self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters') self.log.warn('Could Not Extract JDL Parameters', parameters['Message']) return self.__finish('JDL Problem') params = parameters['Value'] if 'JobID' not in params: msg = 'Job has not JobID defined in JDL parameters' self.__report(jobID, 'Failed', msg) self.log.warn(msg) return self.__finish('JDL Problem') else: jobID = params['JobID'] if 'JobType' not in params: self.log.warn('Job has no JobType defined in JDL parameters') jobType = 'Unknown' else: jobType = params['JobType'] if 'CPUTime' not in params: self.log.warn( 'Job has no CPU requirement defined in JDL parameters') # Job requirements for determining the number of processors # the minimum number of processors requested processors = int( params.get('NumberOfProcessors', int(params.get('MinNumberOfProcessors', 1)))) # the maximum number of processors allowed to the payload maxNumberOfProcessors = int(params.get('MaxNumberOfProcessors', 0)) # need or not the whole node for the job wholeNode = 'WholeNode' in params mpTag = 'MultiProcessor' in params.get('Tags', []) if self.extraOptions: params['Arguments'] += ' ' + self.extraOptions params['ExtraOptions'] = self.extraOptions self.log.verbose('Job request successful: \n', jobRequest['Value']) self.log.info( 'Received', 'JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s' % (jobID, jobType, ownerDN, jobGroup)) self.jobCount += 1 try: jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName) jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False) if 'BOINC_JOB_ID' in os.environ: # Report BOINC environment for thisp in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'): jobReport.setJobParameter(thisp, gConfig.getValue( '/LocalSite/%s' % thisp, 'Unknown'), sendFlag=False) jobReport.setJobStatus('Matched', 'Job Received by Agent') result = self._setupProxy(ownerDN, jobGroup) if not result['OK']: return self._rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure) proxyChain = result.get('Value') # Save the job jdl for external monitoring self.__saveJobJDLRequest(jobID, jobJDL) software = self._checkInstallSoftware(jobID, params, ceDict) if not software['OK']: self.log.error('Failed to install software for job', '%s' % (jobID)) errorMsg = software['Message'] if not errorMsg: errorMsg = 'Failed software installation' return self._rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure) self.log.debug('Before self._submitJob() (%sCE)' % (self.ceName)) result = self._submitJob(jobID, params, ceDict, optimizerParams, proxyChain, processors, wholeNode, maxNumberOfProcessors, mpTag) if not result['OK']: self.__report(jobID, 'Failed', result['Message']) return self.__finish(result['Message']) elif 'PayloadFailed' in result: # Do not keep running and do not overwrite the Payload error message = 'Payload execution failed with error code %s' % result[ 'PayloadFailed'] if self.stopOnApplicationFailure: return self.__finish(message, self.stopOnApplicationFailure) else: self.log.info(message) self.log.debug('After %sCE submitJob()' % (self.ceName)) except Exception as subExcept: # pylint: disable=broad-except self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True) return self._rescheduleFailedJob( jobID, 'Job processing failed with exception', self.stopOnApplicationFailure) # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?) cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1]) result = self.timeLeftUtil.getTimeLeft(cpuTime, processors) if result['OK']: self.timeLeft = result['Value'] else: if result['Message'] != 'Current batch system is not supported': self.timeLeftError = result['Message'] else: # if the batch system is not defined, use the process time and the CPU normalization defined locally self.timeLeft = self._getCPUTimeLeft() return S_OK('Job Agent cycle complete')
def submitJobs(self): """ Go through defined computing elements and submit jobs if necessary """ queues = self.queueDict.keys() # Check that there is some work at all setup = CSGlobals.getSetup() tqDict = { 'Setup': setup, 'CPUTime': 9999999, 'SubmitPool': self.defaultSubmitPools } if self.vo: tqDict['Community'] = self.vo if self.voGroups: tqDict['OwnerGroup'] = self.voGroups if self.checkPlatform: result = self.resourcesModule.getCompatiblePlatforms( self.platforms) if not result['OK']: return result tqDict['Platform'] = result['Value'] tqDict['Site'] = self.sites tags = [] for queue in queues: tags += self.queueDict[queue]['ParametersDict']['Tag'] tqDict['Tag'] = list(set(tags)) self.log.verbose('Checking overall TQ availability with requirements') self.log.verbose(tqDict) matcherClient = MatcherClient() result = matcherClient.getMatchingTaskQueues(tqDict) if not result['OK']: return result if not result['Value']: self.log.verbose('No Waiting jobs suitable for the director') return S_OK() jobSites = set() anySite = False testSites = set() totalWaitingJobs = 0 for tqID in result['Value']: if "Sites" in result['Value'][tqID]: for site in result['Value'][tqID]['Sites']: if site.lower() != 'any': jobSites.add(site) else: anySite = True else: anySite = True if "JobTypes" in result['Value'][tqID]: if "Sites" in result['Value'][tqID]: for site in result['Value'][tqID]['Sites']: if site.lower() != 'any': testSites.add(site) totalWaitingJobs += result['Value'][tqID]['Jobs'] tqIDList = result['Value'].keys() self.log.info(tqIDList) result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList, 'Status': WAITING_PILOT_STATUS }, None) tagWaitingPilots = 0 if result['OK']: tagWaitingPilots = result['Value'] self.log.info( 'Total %d jobs in %d task queues with %d waiting pilots' % (totalWaitingJobs, len(tqIDList), tagWaitingPilots)) self.log.info('Queues: ', self.queueDict.keys()) # if tagWaitingPilots >= totalWaitingJobs: # self.log.info( 'No more pilots to be submitted in this cycle' ) # return S_OK() result = self.siteClient.getUsableSites() if not result['OK']: return result siteMaskList = result['Value'] queues = self.queueDict.keys() random.shuffle(queues) totalSubmittedPilots = 0 matchedQueues = 0 for queue in queues: # Check if the queue failed previously failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor if failedCount != 0: self.log.warn("%s queue failed recently, skipping %d cycles" % (queue, 10 - failedCount)) self.failedQueues[queue] += 1 continue ce = self.queueDict[queue]['CE'] ceName = self.queueDict[queue]['CEName'] ceType = self.queueDict[queue]['CEType'] queueName = self.queueDict[queue]['QueueName'] siteName = self.queueDict[queue]['Site'] platform = self.queueDict[queue]['Platform'] queueTags = self.queueDict[queue]['ParametersDict']['Tag'] siteMask = siteName in siteMaskList processorTags = [] # Check the status of the Site result = self.siteClient.getUsableSites(siteName) if not result['OK']: self.log.error("Can not get the status of site %s: %s" % (siteName, result['Message'])) continue if siteName not in result.get('Value', []): self.log.info("site %s is not active" % siteName) continue if self.rssFlag: # Check the status of the ComputingElement result = self.rssClient.getElementStatus( ceName, "ComputingElement") if not result['OK']: self.log.error( "Can not get the status of computing element", " %s: %s" % (siteName, result['Message'])) continue if result['Value']: # get the value of the status result = result['Value'][ceName]['all'] if result not in ('Active', 'Degraded'): self.log.verbose( "Skipping computing element %s at %s: resource not usable" % (ceName, siteName)) continue for tag in queueTags: if re.match(r'^[0-9]+Processors$', tag): processorTags.append(tag) if 'WholeNode' in queueTags: processorTags.append('WholeNode') if not anySite and siteName not in jobSites: self.log.verbose( "Skipping queue %s at %s: no workload expected" % (queueName, siteName)) continue if not siteMask and siteName not in testSites: self.log.verbose( "Skipping queue %s at site %s not in the mask" % (queueName, siteName)) continue if 'CPUTime' in self.queueDict[queue]['ParametersDict']: queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime']) else: self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue) continue if queueCPUTime > self.maxQueueLength: queueCPUTime = self.maxQueueLength # Prepare the queue description to look for eligible jobs ceDict = ce.getParameterDict() ceDict['GridCE'] = ceName # if not siteMask and 'Site' in ceDict: # self.log.info( 'Site not in the mask %s' % siteName ) # self.log.info( 'Removing "Site" from matching Dict' ) # del ceDict[ 'Site' ] if not siteMask: ceDict['JobType'] = "Test" if self.vo: ceDict['Community'] = self.vo if self.voGroups: ceDict['OwnerGroup'] = self.voGroups # This is a hack to get rid of ! ceDict['SubmitPool'] = self.defaultSubmitPools if self.checkPlatform: result = self.resourcesModule.getCompatiblePlatforms(platform) if not result['OK']: continue ceDict['Platform'] = result['Value'] ceDict['Tag'] = queueTags # Get the number of eligible jobs for the target site/queue result = matcherClient.getMatchingTaskQueues(ceDict) if not result['OK']: self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message']) return result taskQueueDict = result['Value'] if not taskQueueDict: self.log.verbose('No matching TQs found for %s' % queue) continue matchedQueues += 1 totalTQJobs = 0 totalTQJobsByProcessors = {} tqIDList = taskQueueDict.keys() tqIDListByProcessors = {} for tq in taskQueueDict: if 'Tags' not in taskQueueDict[tq]: # skip non multiprocessor tqs continue for tag in taskQueueDict[tq]['Tags']: if tag in processorTags: tqIDListByProcessors.setdefault(tag, []) tqIDListByProcessors[tag].append(tq) totalTQJobsByProcessors.setdefault(tag, 0) totalTQJobsByProcessors[tag] += taskQueueDict[tq][ 'Jobs'] totalTQJobs += taskQueueDict[tq]['Jobs'] self.log.verbose( '%d job(s) from %d task queue(s) are eligible for %s queue' % (totalTQJobs, len(tqIDList), queue)) queueSubmittedPilots = 0 for tag in tqIDListByProcessors: self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" % (tag, tqIDListByProcessors[tag])) processors = 1 m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag) if m: processors = int(m.group('processors')) if tag == 'WholeNode': processors = -1 tagTQJobs = totalTQJobsByProcessors[tag] tagTqIDList = tqIDListByProcessors[tag] # Get the number of already waiting pilots for these task queues tagWaitingPilots = 0 if self.pilotWaitingFlag: result = pilotAgentsDB.countPilots( { 'TaskQueueID': tagTqIDList, 'Status': WAITING_PILOT_STATUS }, None) if not result['OK']: self.log.error( 'Failed to get Number of Waiting pilots', result['Message']) tagWaitingPilots = 0 else: tagWaitingPilots = result['Value'] self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tagTqIDList, tagWaitingPilots) if tagWaitingPilots >= tagTQJobs: self.log.verbose( "%d waiting pilots already for all the available jobs" % tagWaitingPilots) continue self.log.verbose( "%d waiting pilots for the total of %d eligible jobs for %s" % (tagWaitingPilots, tagTQJobs, queue)) # Get the working proxy cpuTime = queueCPUTime + 86400 self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime)) result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime) if not result['OK']: return result self.proxy = result['Value'] ce.setProxy(self.proxy, cpuTime - 60) # Get the number of available slots on the target site/queue totalSlots = self.getQueueSlots(queue, False) if totalSlots == 0: self.log.debug('%s: No slots available' % queue) continue # Note: comparing slots to job numbers is not accurate in multiprocessor case. # This could lead to over submission. pilotsToSubmit = max( 0, min(totalSlots, tagTQJobs - tagWaitingPilots)) self.log.info( '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' % (queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit)) # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT pilotsToSubmit = min( self.maxPilotsToSubmit - queueSubmittedPilots, pilotsToSubmit) while pilotsToSubmit > 0: self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue)) bundleProxy = self.queueDict[queue].get( 'BundleProxy', False) jobExecDir = '' jobExecDir = self.queueDict[queue]['ParametersDict'].get( 'JobExecDir', jobExecDir) executable, pilotSubmissionChunk = self.getExecutable( queue, pilotsToSubmit, bundleProxy=bundleProxy, jobExecDir=jobExecDir, processors=processors) result = ce.submitJob(executable, '', pilotSubmissionChunk, processors=processors) # ## FIXME: The condor thing only transfers the file with some # ## delay, so when we unlink here the script is gone # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts... if ceType != 'HTCondorCE': os.unlink(executable) if not result['OK']: self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message']) pilotsToSubmit = 0 self.failedQueues[queue] += 1 continue pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk queueSubmittedPilots += pilotSubmissionChunk # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the # task queue priorities pilotList = result['Value'] self.queueSlots[queue]['AvailableSlots'] -= len(pilotList) totalSubmittedPilots += len(pilotList) self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName)) stampDict = {} if 'PilotStampDict' in result: stampDict = result['PilotStampDict'] tqPriorityList = [] sumPriority = 0. for tq in tagTqIDList: sumPriority += taskQueueDict[tq]['Priority'] tqPriorityList.append((tq, sumPriority)) rndm = random.random() * sumPriority tqDict = {} for pilotID in pilotList: rndm = random.random() * sumPriority for tq, prio in tqPriorityList: if rndm < prio: tqID = tq break if tqID not in tqDict: tqDict[tqID] = [] tqDict[tqID].append(pilotID) for tqID, pilotList in tqDict.items(): result = pilotAgentsDB.addPilotTQReference( pilotList, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, stampDict) if not result['OK']: self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message']) continue for pilot in pilotList: result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName, 'Successfully submitted by the SiteDirector', siteName, queueName) if not result['OK']: self.log.error('Failed to set pilot status: ', result['Message']) continue self.log.info( "%d pilots submitted in total in this cycle, %d matched queues" % (totalSubmittedPilots, matchedQueues)) return S_OK()
def createVMs(self): """Go through defined computing elements and submit jobs if necessary""" vmTypeList = list(self.vmTypeDict.keys()) # Check that there is some work at all setup = CSGlobals.getSetup() tqDict = {"Setup": setup, "CPUTime": 9999999} if self.vo: tqDict["VO"] = self.vo if self.voGroups: tqDict["OwnerGroup"] = self.voGroups result = Resources.getCompatiblePlatforms(self.platforms) if not result["OK"]: return result tqDict["Platform"] = result["Value"] tqDict["Site"] = self.sites tags = [] for vmType in vmTypeList: if "Tag" in self.vmTypeDict[vmType]["ParametersDict"]: tags += self.vmTypeDict[vmType]["ParametersDict"]["Tag"] tqDict["Tag"] = list(set(tags)) self.log.verbose("Checking overall TQ availability with requirements") self.log.verbose(tqDict) matcherClient = MatcherClient() result = matcherClient.getMatchingTaskQueues(tqDict) if not result["OK"]: return result if not result["Value"]: self.log.verbose("No Waiting jobs suitable for the director") return S_OK() jobSites = set() anySite = False testSites = set() totalWaitingJobs = 0 for tqID in result["Value"]: if "Sites" in result["Value"][tqID]: for site in result["Value"][tqID]["Sites"]: if site.lower() != "any": jobSites.add(site) else: anySite = True else: anySite = True if "JobTypes" in result["Value"][tqID]: if "Sites" in result["Value"][tqID]: for site in result["Value"][tqID]["Sites"]: if site.lower() != "any": testSites.add(site) totalWaitingJobs += result["Value"][tqID]["Jobs"] tqIDList = list(result["Value"].keys()) result = virtualMachineDB.getInstanceCounters("Status", {}) totalVMs = 0 if result["OK"]: for status in result["Value"]: if status in ["New", "Submitted", "Running"]: totalVMs += result["Value"][status] self.log.info("Total %d jobs in %d task queues with %d VMs" % (totalWaitingJobs, len(tqIDList), totalVMs)) # Check if the site is allowed in the mask result = self.siteClient.getUsableSites() if not result["OK"]: return S_ERROR("Can not get the site mask") siteMaskList = result.get("Value", []) vmTypeList = list(self.vmTypeDict.keys()) random.shuffle(vmTypeList) totalSubmittedPilots = 0 matchedQueues = 0 for vmType in vmTypeList: ce = self.vmTypeDict[vmType]["CE"] ceName = self.vmTypeDict[vmType]["CEName"] vmTypeName = self.vmTypeDict[vmType]["VMType"] siteName = self.vmTypeDict[vmType]["Site"] platform = self.vmTypeDict[vmType]["Platform"] vmTypeTags = self.vmTypeDict[vmType]["ParametersDict"].get("Tag", []) siteMask = siteName in siteMaskList endpoint = "%s::%s" % (siteName, ceName) maxInstances = int(self.vmTypeDict[vmType]["MaxInstances"]) processorTags = [] # vms support WholeNode naturally processorTags.append("WholeNode") if not anySite and siteName not in jobSites: self.log.verbose("Skipping queue %s at %s: no workload expected" % (vmTypeName, siteName)) continue if not siteMask and siteName not in testSites: self.log.verbose("Skipping queue %s: site %s not in the mask" % (vmTypeName, siteName)) continue if "CPUTime" in self.vmTypeDict[vmType]["ParametersDict"]: vmTypeCPUTime = int(self.vmTypeDict[vmType]["ParametersDict"]["CPUTime"]) else: self.log.warn("CPU time limit is not specified for queue %s, skipping..." % vmType) continue # Prepare the queue description to look for eligible jobs ceDict = ce.getParameterDict() if not siteMask: ceDict["JobType"] = "Test" if self.vo: ceDict["VO"] = self.vo if self.voGroups: ceDict["OwnerGroup"] = self.voGroups result = Resources.getCompatiblePlatforms(platform) if not result["OK"]: continue ceDict["Platform"] = result["Value"] ceDict["Tag"] = list(set(processorTags + vmTypeTags)) # Get the number of eligible jobs for the target site/queue result = matcherClient.getMatchingTaskQueues(ceDict) if not result["OK"]: self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"]) return result taskQueueDict = result["Value"] if not taskQueueDict: self.log.verbose("No matching TQs found for %s" % vmType) continue matchedQueues += 1 totalTQJobs = 0 tqIDList = list(taskQueueDict.keys()) for tq in taskQueueDict: totalTQJobs += taskQueueDict[tq]["Jobs"] self.log.verbose( "%d job(s) from %d task queue(s) are eligible for %s queue" % (totalTQJobs, len(tqIDList), vmType) ) # Get the number of already instantiated VMs for these task queues totalWaitingVMs = 0 result = virtualMachineDB.getInstanceCounters("Status", {"Endpoint": endpoint}) if result["OK"]: for status in result["Value"]: if status in ["New", "Submitted"]: totalWaitingVMs += result["Value"][status] if totalWaitingVMs >= totalTQJobs: self.log.verbose("%d VMs already for all the available jobs" % totalWaitingVMs) self.log.verbose("%d VMs for the total of %d eligible jobs for %s" % (totalWaitingVMs, totalTQJobs, vmType)) # Get proxy to be used to connect to the cloud endpoint authType = ce.parameters.get("Auth") if authType and authType.lower() in ["x509", "voms"]: self.log.verbose("Getting cloud proxy for %s/%s" % (siteName, ceName)) result = getProxyFileForCloud(ce) if not result["OK"]: continue ce.setProxy(result["Value"]) # Get the number of available slots on the target site/endpoint totalSlots = self.getVMInstances(endpoint, maxInstances) if totalSlots == 0: self.log.debug("%s: No slots available" % vmType) continue vmsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingVMs)) self.log.info( "%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d" % (vmType, totalSlots, totalTQJobs, totalWaitingVMs, vmsToSubmit) ) # Limit the number of VM instances to create to vmsToSubmit vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit) if vmsToSubmit == 0: continue self.log.info("Going to submit %d VMs to %s queue" % (vmsToSubmit, vmType)) result = ce.createInstances(vmsToSubmit) # result = S_OK() if not result["OK"]: self.log.error("Failed submission to queue %s:\n" % vmType, result["Message"]) self.failedVMTypes.setdefault(vmType, 0) self.failedVMTypes[vmType] += 1 continue # Add VMs to the VirtualMachineDB vmDict = result["Value"] totalSubmittedPilots += len(vmDict) self.log.info("Submitted %d VMs to %s@%s" % (len(vmDict), vmTypeName, ceName)) pilotList = [] for uuID in vmDict: diracUUID = vmDict[uuID]["InstanceID"] endpoint = "%s::%s" % (self.vmTypeDict[vmType]["Site"], ceName) result = virtualMachineDB.insertInstance(uuID, vmTypeName, diracUUID, endpoint, self.vo) if not result["OK"]: continue pRef = "vm://" + ceName + "/" + diracUUID + ":00" pilotList.append(pRef) stampDict = {} tqPriorityList = [] sumPriority = 0.0 for tq in taskQueueDict: sumPriority += taskQueueDict[tq]["Priority"] tqPriorityList.append((tq, sumPriority)) tqDict = {} for pilotID in pilotList: rndm = random.random() * sumPriority for tq, prio in tqPriorityList: if rndm < prio: tqID = tq break if tqID not in tqDict: tqDict[tqID] = [] tqDict[tqID].append(pilotID) for tqID, pilotList in tqDict.items(): result = pilotAgentsDB.addPilotTQReference(pilotList, tqID, "", "", self.localhost, "Cloud", stampDict) if not result["OK"]: self.log.error("Failed to insert pilots into the PilotAgentsDB: %s" % result["Message"]) self.log.info( "%d VMs submitted in total in this cycle, %d matched queues" % (totalSubmittedPilots, matchedQueues) ) return S_OK()
def main(): global verbose global taskQueueID Script.registerSwitch("v", "verbose", "give max details about task queues", setVerbose) Script.registerSwitch("t:", "taskQueue=", "show this task queue only", setTaskQueueID) Script.parseCommandLine(initializeMonitor=False) result = MatcherClient().getActiveTaskQueues() if not result["OK"]: gLogger.error(result["Message"]) sys.exit(1) tqDict = result["Value"] if not verbose: fields = [ "TaskQueue", "Jobs", "CPUTime", "Owner", "OwnerGroup", "Sites", "Platforms", "Priority", ] records = [] for tqId in sorted(tqDict): if taskQueueID and tqId != taskQueueID: continue record = [str(tqId)] tqData = tqDict[tqId] for key in fields[1:]: if key == "Owner": value = tqData.get("OwnerDN", "-") if value != "-": result = getUsernameForDN(value) if not result["OK"]: value = "Unknown" else: value = result["Value"] else: value = tqData.get(key, "-") if isinstance(value, list): if len(value) > 1: record.append(str(value[0]) + "...") else: record.append(str(value[0])) else: record.append(str(value)) records.append(record) printTable(fields, records) else: fields = ["Key", "Value"] for tqId in sorted(tqDict): if taskQueueID and tqId != taskQueueID: continue gLogger.notice("\n==> TQ %s" % tqId) records = [] tqData = tqDict[tqId] for key in sorted(tqData): value = tqData[key] if isinstance(value, list): records.append([key, {"Value": value, "Just": "L"}]) else: value = str(value) records.append([key, {"Value": value, "Just": "L"}]) printTable(fields, records, numbering=False)
def main(): global verbose global taskQueueID Script.registerSwitch("v", "verbose", "give max details about task queues", setVerbose) Script.registerSwitch("t:", "taskQueue=", "show this task queue only", setTaskQueueID) Script.parseCommandLine(initializeMonitor=False) result = MatcherClient().getActiveTaskQueues() if not result['OK']: gLogger.error(result['Message']) sys.exit(1) tqDict = result['Value'] if not verbose: fields = [ 'TaskQueue', 'Jobs', 'CPUTime', 'Owner', 'OwnerGroup', 'Sites', 'Platforms', 'SubmitPools', 'Setup', 'Priority' ] records = [] for tqId in sorted(tqDict): if taskQueueID and tqId != taskQueueID: continue record = [str(tqId)] tqData = tqDict[tqId] for key in fields[1:]: if key == 'Owner': value = tqData.get('OwnerDN', '-') if value != '-': result = getUsernameForDN(value) if not result['OK']: value = 'Unknown' else: value = result['Value'] else: value = tqData.get(key, '-') if isinstance(value, list): if len(value) > 1: record.append(str(value[0]) + '...') else: record.append(str(value[0])) else: record.append(str(value)) records.append(record) printTable(fields, records) else: fields = ['Key', 'Value'] for tqId in sorted(tqDict): if taskQueueID and tqId != taskQueueID: continue gLogger.notice("\n==> TQ %s" % tqId) records = [] tqData = tqDict[tqId] for key in sorted(tqData): value = tqData[key] if isinstance(value, list): records.append([key, {"Value": value, 'Just': 'L'}]) else: value = str(value) records.append([key, {"Value": value, 'Just': 'L'}]) printTable(fields, records, numbering=False)
def submitJobs(self): """ Go through defined computing elements and submit jobs if necessary """ queues = self.queueDict.keys() # Check that there is some work at all setup = CSGlobals.getSetup() tqDict = {'Setup': setup, 'CPUTime': 9999999, 'SubmitPool': self.defaultSubmitPools} if self.vo: tqDict['Community'] = self.vo if self.voGroups: tqDict['OwnerGroup'] = self.voGroups if self.checkPlatform: result = self.resourcesModule.getCompatiblePlatforms(self.platforms) if not result['OK']: return result tqDict['Platform'] = result['Value'] tqDict['Site'] = self.sites tags = [] for queue in queues: tags += self.queueDict[queue]['ParametersDict']['Tag'] tqDict['Tag'] = list(set(tags)) self.log.verbose('Checking overall TQ availability with requirements') self.log.verbose(tqDict) matcherClient = MatcherClient() result = matcherClient.getMatchingTaskQueues(tqDict) if not result['OK']: return result if not result['Value']: self.log.verbose('No Waiting jobs suitable for the director') return S_OK() jobSites = set() anySite = False testSites = set() totalWaitingJobs = 0 for tqID in result['Value']: if "Sites" in result['Value'][tqID]: for site in result['Value'][tqID]['Sites']: if site.lower() != 'any': jobSites.add(site) else: anySite = True else: anySite = True if "JobTypes" in result['Value'][tqID]: if "Sites" in result['Value'][tqID]: for site in result['Value'][tqID]['Sites']: if site.lower() != 'any': testSites.add(site) totalWaitingJobs += result['Value'][tqID]['Jobs'] tqIDList = result['Value'].keys() self.log.info(tqIDList) result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList, 'Status': WAITING_PILOT_STATUS}, None) tagWaitingPilots = 0 if result['OK']: tagWaitingPilots = result['Value'] self.log.info('Total %d jobs in %d task queues with %d waiting pilots' % (totalWaitingJobs, len(tqIDList), tagWaitingPilots)) self.log.info('Queues: ', self.queueDict.keys()) # if tagWaitingPilots >= totalWaitingJobs: # self.log.info( 'No more pilots to be submitted in this cycle' ) # return S_OK() result = self.siteClient.getUsableSites() if not result['OK']: return result siteMaskList = result['Value'] queues = self.queueDict.keys() random.shuffle(queues) totalSubmittedPilots = 0 matchedQueues = 0 for queue in queues: # Check if the queue failed previously failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor if failedCount != 0: self.log.warn("%s queue failed recently, skipping %d cycles" % (queue, 10 - failedCount)) self.failedQueues[queue] += 1 continue ce = self.queueDict[queue]['CE'] ceName = self.queueDict[queue]['CEName'] ceType = self.queueDict[queue]['CEType'] queueName = self.queueDict[queue]['QueueName'] siteName = self.queueDict[queue]['Site'] queueTags = self.queueDict[queue]['ParametersDict']['Tag'] siteMask = siteName in siteMaskList processorTags = [] # Check the status of the Site result = self.siteClient.getUsableSites(siteName) if not result['OK']: self.log.error("Can not get the status of site %s: %s" % (siteName, result['Message'])) continue if siteName not in result.get('Value', []): self.log.info("site %s is not active" % siteName) continue if self.rssFlag: # Check the status of the ComputingElement result = self.rssClient.getElementStatus(ceName, "ComputingElement") if not result['OK']: self.log.error("Can not get the status of computing element", " %s: %s" % (siteName, result['Message'])) continue if result['Value']: # get the value of the status result = result['Value'][ceName]['all'] if result not in ('Active', 'Degraded'): self.log.verbose( "Skipping computing element %s at %s: resource not usable" % (ceName, siteName)) continue for tag in queueTags: if re.match(r'^[0-9]+Processors$', tag): processorTags.append(tag) if 'WholeNode' in queueTags: processorTags.append('WholeNode') if not anySite and siteName not in jobSites: self.log.verbose("Skipping queue %s at %s: no workload expected" % (queueName, siteName)) continue if not siteMask and siteName not in testSites: self.log.verbose("Skipping queue %s at site %s not in the mask" % (queueName, siteName)) continue if 'CPUTime' in self.queueDict[queue]['ParametersDict']: queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime']) else: self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue) continue if queueCPUTime > self.maxQueueLength: queueCPUTime = self.maxQueueLength # Prepare the queue description to look for eligible jobs ceDict = ce.getParameterDict() ceDict['GridCE'] = ceName # if not siteMask and 'Site' in ceDict: # self.log.info( 'Site not in the mask %s' % siteName ) # self.log.info( 'Removing "Site" from matching Dict' ) # del ceDict[ 'Site' ] if not siteMask: ceDict['JobType'] = "Test" if self.vo: ceDict['Community'] = self.vo if self.voGroups: ceDict['OwnerGroup'] = self.voGroups # This is a hack to get rid of ! ceDict['SubmitPool'] = self.defaultSubmitPools if self.checkPlatform: platform = self.queueDict[queue]['Platform'] result = self.resourcesModule.getCompatiblePlatforms(platform) if not result['OK']: continue ceDict['Platform'] = result['Value'] ceDict['Tag'] = queueTags # Get the number of eligible jobs for the target site/queue result = matcherClient.getMatchingTaskQueues(ceDict) if not result['OK']: self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message']) return result taskQueueDict = result['Value'] if not taskQueueDict: self.log.verbose('No matching TQs found for %s' % queue) continue matchedQueues += 1 totalTQJobs = 0 totalTQJobsByProcessors = {} tqIDList = taskQueueDict.keys() tqIDListByProcessors = {} for tq in taskQueueDict: if 'Tags' not in taskQueueDict[tq]: # skip non multiprocessor tqs continue for tag in taskQueueDict[tq]['Tags']: if tag in processorTags: tqIDListByProcessors.setdefault(tag, []) tqIDListByProcessors[tag].append(tq) totalTQJobsByProcessors.setdefault(tag, 0) totalTQJobsByProcessors[tag] += taskQueueDict[tq]['Jobs'] totalTQJobs += taskQueueDict[tq]['Jobs'] self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' % (totalTQJobs, len(tqIDList), queue)) queueSubmittedPilots = 0 for tag in tqIDListByProcessors: self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" % (tag, tqIDListByProcessors[tag])) processors = 1 m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag) if m: processors = int(m.group('processors')) if tag == 'WholeNode': processors = -1 tagTQJobs = totalTQJobsByProcessors[tag] tagTqIDList = tqIDListByProcessors[tag] # Get the number of already waiting pilots for these task queues tagWaitingPilots = 0 if self.pilotWaitingFlag: result = pilotAgentsDB.countPilots({'TaskQueueID': tagTqIDList, 'Status': WAITING_PILOT_STATUS}, None) if not result['OK']: self.log.error('Failed to get Number of Waiting pilots', result['Message']) tagWaitingPilots = 0 else: tagWaitingPilots = result['Value'] self.log.verbose('Waiting Pilots for TaskQueue %s:' % tagTqIDList, tagWaitingPilots) if tagWaitingPilots >= tagTQJobs: self.log.verbose("%d waiting pilots already for all the available jobs" % tagWaitingPilots) continue self.log.verbose("%d waiting pilots for the total of %d eligible jobs for %s" % (tagWaitingPilots, tagTQJobs, queue)) # Get the working proxy cpuTime = queueCPUTime + 86400 self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime)) result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime) if not result['OK']: return result self.proxy = result['Value'] ce.setProxy(self.proxy, cpuTime - 60) # Get the number of available slots on the target site/queue totalSlots = self.getQueueSlots(queue, False) if totalSlots == 0: self.log.debug('%s: No slots available' % queue) continue # Note: comparing slots to job numbers is not accurate in multiprocessor case. # This could lead to over submission. pilotsToSubmit = max(0, min(totalSlots, tagTQJobs - tagWaitingPilots)) self.log.info('%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' % (queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit)) # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT pilotsToSubmit = min(self.maxPilotsToSubmit - queueSubmittedPilots, pilotsToSubmit) while pilotsToSubmit > 0: self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue)) bundleProxy = self.queueDict[queue].get('BundleProxy', False) jobExecDir = '' jobExecDir = self.queueDict[queue]['ParametersDict'].get('JobExecDir', jobExecDir) executable, pilotSubmissionChunk = self.getExecutable(queue, pilotsToSubmit, bundleProxy=bundleProxy, jobExecDir=jobExecDir, processors=processors) result = ce.submitJob(executable, '', pilotSubmissionChunk, processors=processors) # ## FIXME: The condor thing only transfers the file with some # ## delay, so when we unlink here the script is gone # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts... if ceType != 'HTCondorCE': os.unlink(executable) if not result['OK']: self.log.error('Failed submission to queue %s:\n' % queue, result['Message']) pilotsToSubmit = 0 self.failedQueues[queue] += 1 continue pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk queueSubmittedPilots += pilotSubmissionChunk # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the # task queue priorities pilotList = result['Value'] self.queueSlots[queue]['AvailableSlots'] -= len(pilotList) totalSubmittedPilots += len(pilotList) self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName)) stampDict = {} if 'PilotStampDict' in result: stampDict = result['PilotStampDict'] tqPriorityList = [] sumPriority = 0. for tq in tagTqIDList: sumPriority += taskQueueDict[tq]['Priority'] tqPriorityList.append((tq, sumPriority)) rndm = random.random() * sumPriority tqDict = {} for pilotID in pilotList: rndm = random.random() * sumPriority for tq, prio in tqPriorityList: if rndm < prio: tqID = tq break if tqID not in tqDict: tqDict[tqID] = [] tqDict[tqID].append(pilotID) for tqID, pilotList in tqDict.items(): result = pilotAgentsDB.addPilotTQReference(pilotList, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, stampDict) if not result['OK']: self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message']) continue for pilot in pilotList: result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName, 'Successfully submitted by the SiteDirector', siteName, queueName) if not result['OK']: self.log.error('Failed to set pilot status: ', result['Message']) continue self.log.info( "%d pilots submitted in total in this cycle, %d matched queues" % (totalSubmittedPilots, matchedQueues)) return S_OK()