def execute( self ): """Main Agent code: 1.- Query TaskQueueDB for existing TQs 2.- Add their Priorities 3.- Submit pilots """ self.__checkSubmitPools() self.directorDict = getResourceDict() #Add all submit pools self.directorDict[ 'SubmitPool' ] = self.am_getOption( "SubmitPools" ) #Add all DIRAC platforms if not specified otherwise if not 'Platform' in self.directorDict: result = gConfig.getOptionsDict( '/Resources/Computing/OSCompatibility' ) if result['OK']: self.directorDict['Platform'] = result['Value'].keys() rpcMatcher = RPCClient( "WorkloadManagement/Matcher" ) result = rpcMatcher.getMatchingTaskQueues( self.directorDict ) if not result['OK']: self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] ) return result taskQueueDict = result['Value'] self.log.info( 'Found %s TaskQueues' % len( taskQueueDict ) ) if not taskQueueDict: self.log.info( 'No TaskQueue to Process' ) return S_OK() prioritySum = 0 waitingJobs = 0 for taskQueueID in taskQueueDict: taskQueueDict[taskQueueID]['TaskQueueID'] = taskQueueID prioritySum += taskQueueDict[taskQueueID]['Priority'] waitingJobs += taskQueueDict[taskQueueID]['Jobs'] self.log.info( 'Sum of Priorities %s' % prioritySum ) if waitingJobs == 0: self.log.info( 'No waiting Jobs' ) return S_OK( 'No waiting Jobs' ) if prioritySum <= 0: return S_ERROR( 'Wrong TaskQueue Priorities' ) self.pilotsPerPriority = self.am_getOption( 'pilotsPerIteration' ) / prioritySum self.pilotsPerJob = self.am_getOption( 'pilotsPerIteration' ) / waitingJobs self.callBackLock.acquire() self.submittedPilots = 0 self.callBackLock.release() self.toSubmitPilots = 0 waitingStatusList = ['Submitted', 'Ready', 'Scheduled', 'Waiting'] timeLimitToConsider = Time.toString( Time.dateTime() - Time.hour * self.am_getOption( "maxPilotWaitingHours" ) ) for taskQueueID in taskQueueDict: self.log.verbose( 'Processing TaskQueue', taskQueueID ) result = pilotAgentsDB.countPilots( { 'TaskQueueID': taskQueueID, 'Status': waitingStatusList}, None, timeLimitToConsider ) if not result['OK']: self.log.error( 'Fail to get Number of Waiting pilots', result['Message'] ) waitingPilots = 0 else: waitingPilots = result['Value'] self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % taskQueueID, waitingPilots ) result = self.submitPilotsForTaskQueue( taskQueueDict[taskQueueID], waitingPilots ) if result['OK']: self.toSubmitPilots += result['Value'] self.log.info( 'Number of pilots to be Submitted %s' % self.toSubmitPilots ) # Now wait until all Jobs in the Default ThreadPool are proccessed if 'Default' in self.pools: # only for those in "Default' thread Pool # for pool in self.pools: self.pools['Default'].processAllResults() self.log.info( 'Number of pilots Submitted %s' % self.submittedPilots ) return S_OK()
def submitJobs( self ): """ Go through defined computing elements and submit jobs if necessary """ # Check that there is some work at all setup = CSGlobals.getSetup() tqDict = { 'Setup':setup, 'CPUTime': 9999999, 'SubmitPool' : self.defaultSubmitPools } if self.vo: tqDict['Community'] = self.vo if self.voGroups: tqDict['OwnerGroup'] = self.voGroups result = Resources.getCompatiblePlatforms( self.platforms ) if not result['OK']: return result tqDict['Platform'] = result['Value'] tqDict['Site'] = self.sites tqDict['Tag'] = [] self.log.verbose( 'Checking overall TQ availability with requirements' ) self.log.verbose( tqDict ) rpcMatcher = RPCClient( "WorkloadManagement/Matcher" ) result = rpcMatcher.getMatchingTaskQueues( tqDict ) if not result[ 'OK' ]: return result if not result['Value']: self.log.verbose( 'No Waiting jobs suitable for the director' ) return S_OK() jobSites = set() anySite = False testSites = set() totalWaitingJobs = 0 for tqID in result['Value']: if "Sites" in result['Value'][tqID]: for site in result['Value'][tqID]['Sites']: if site.lower() != 'any': jobSites.add( site ) else: anySite = True else: anySite = True if "JobTypes" in result['Value'][tqID]: if "Sites" in result['Value'][tqID]: for site in result['Value'][tqID]['Sites']: if site.lower() != 'any': testSites.add( site ) totalWaitingJobs += result['Value'][tqID]['Jobs'] tqIDList = result['Value'].keys() result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList, 'Status': WAITING_PILOT_STATUS }, None ) totalWaitingPilots = 0 if result['OK']: totalWaitingPilots = result['Value'] self.log.info( 'Total %d jobs in %d task queues with %d waiting pilots' % (totalWaitingJobs, len( tqIDList ), totalWaitingPilots ) ) #if totalWaitingPilots >= totalWaitingJobs: # self.log.info( 'No more pilots to be submitted in this cycle' ) # return S_OK() # Check if the site is allowed in the mask result = jobDB.getSiteMask() if not result['OK']: return S_ERROR( 'Can not get the site mask' ) siteMaskList = result['Value'] queues = self.queueDict.keys() random.shuffle( queues ) totalSubmittedPilots = 0 matchedQueues = 0 for queue in queues: # Check if the queue failed previously failedCount = self.failedQueues.setdefault( queue, 0 ) % self.failedQueueCycleFactor if failedCount != 0: self.log.warn( "%s queue failed recently, skipping %d cycles" % ( queue, 10-failedCount ) ) self.failedQueues[queue] += 1 continue ce = self.queueDict[queue]['CE'] ceName = self.queueDict[queue]['CEName'] ceType = self.queueDict[queue]['CEType'] queueName = self.queueDict[queue]['QueueName'] siteName = self.queueDict[queue]['Site'] platform = self.queueDict[queue]['Platform'] siteMask = siteName in siteMaskList if not anySite and siteName not in jobSites: self.log.verbose( "Skipping queue %s at %s: no workload expected" % (queueName, siteName) ) continue if not siteMask and siteName not in testSites: self.log.verbose( "Skipping queue %s at site %s not in the mask" % (queueName, siteName) ) continue if 'CPUTime' in self.queueDict[queue]['ParametersDict'] : queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] ) else: self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue ) continue if queueCPUTime > self.maxQueueLength: queueCPUTime = self.maxQueueLength # Prepare the queue description to look for eligible jobs ceDict = ce.getParameterDict() ceDict[ 'GridCE' ] = ceName #if not siteMask and 'Site' in ceDict: # self.log.info( 'Site not in the mask %s' % siteName ) # self.log.info( 'Removing "Site" from matching Dict' ) # del ceDict[ 'Site' ] if not siteMask: ceDict['JobType'] = "Test" if self.vo: ceDict['Community'] = self.vo if self.voGroups: ceDict['OwnerGroup'] = self.voGroups # This is a hack to get rid of ! ceDict['SubmitPool'] = self.defaultSubmitPools result = Resources.getCompatiblePlatforms( platform ) if not result['OK']: continue ceDict['Platform'] = result['Value'] # Get the number of eligible jobs for the target site/queue result = rpcMatcher.getMatchingTaskQueues( ceDict ) if not result['OK']: self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] ) return result taskQueueDict = result['Value'] if not taskQueueDict: self.log.verbose( 'No matching TQs found for %s' % queue ) continue matchedQueues += 1 totalTQJobs = 0 tqIDList = taskQueueDict.keys() for tq in taskQueueDict: totalTQJobs += taskQueueDict[tq]['Jobs'] self.log.verbose( '%d job(s) from %d task queue(s) are eligible for %s queue' % (totalTQJobs, len( tqIDList ), queue) ) # Get the number of already waiting pilots for these task queues totalWaitingPilots = 0 if self.pilotWaitingFlag: lastUpdateTime = dateTime() - self.pilotWaitingTime * second result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList, 'Status': WAITING_PILOT_STATUS }, None, lastUpdateTime ) if not result['OK']: self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] ) totalWaitingPilots = 0 else: totalWaitingPilots = result['Value'] self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots ) if totalWaitingPilots >= totalTQJobs: self.log.verbose( "%d waiting pilots already for all the available jobs" % totalWaitingPilots ) continue self.log.verbose( "%d waiting pilots for the total of %d eligible jobs for %s" % (totalWaitingPilots, totalTQJobs, queue) ) # Get the working proxy cpuTime = queueCPUTime + 86400 self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) ) result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime ) if not result['OK']: return result self.proxy = result['Value'] ce.setProxy( self.proxy, cpuTime - 60 ) # Get the number of available slots on the target site/queue totalSlots = self.__getQueueSlots( queue ) if totalSlots == 0: self.log.debug( '%s: No slots available' % queue ) continue pilotsToSubmit = max( 0, min( totalSlots, totalTQJobs - totalWaitingPilots ) ) self.log.info( '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' % \ ( queue, totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) ) # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT pilotsToSubmit = min( self.maxPilotsToSubmit, pilotsToSubmit ) while pilotsToSubmit > 0: self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) ) bundleProxy = self.queueDict[queue].get( 'BundleProxy', False ) jobExecDir = '' if ceType == 'CREAM': jobExecDir = '.' jobExecDir = self.queueDict[queue].get( 'JobExecDir', jobExecDir ) httpProxy = self.queueDict[queue].get( 'HttpProxy', '' ) result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir ) if not result['OK']: return result executable, pilotSubmissionChunk = result['Value'] result = ce.submitJob( executable, '', pilotSubmissionChunk ) os.unlink( executable ) if not result['OK']: self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] ) pilotsToSubmit = 0 self.failedQueues[queue] += 1 continue pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the # task queue priorities pilotList = result['Value'] self.queueSlots[queue]['AvailableSlots'] -= len( pilotList ) totalSubmittedPilots += len( pilotList ) self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) ) stampDict = {} if result.has_key( 'PilotStampDict' ): stampDict = result['PilotStampDict'] tqPriorityList = [] sumPriority = 0. for tq in taskQueueDict: sumPriority += taskQueueDict[tq]['Priority'] tqPriorityList.append( ( tq, sumPriority ) ) rndm = random.random()*sumPriority tqDict = {} for pilotID in pilotList: rndm = random.random() * sumPriority for tq, prio in tqPriorityList: if rndm < prio: tqID = tq break if not tqDict.has_key( tqID ): tqDict[tqID] = [] tqDict[tqID].append( pilotID ) for tqID, pilotList in tqDict.items(): result = pilotAgentsDB.addPilotTQReference( pilotList, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, '', stampDict ) if not result['OK']: self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message'] ) continue for pilot in pilotList: result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName, 'Successfully submitted by the SiteDirector', siteName, queueName ) if not result['OK']: self.log.error( 'Failed to set pilot status: ', result['Message'] ) continue self.log.info( "%d pilots submitted in total in this cycle, %d matched queues" % ( totalSubmittedPilots, matchedQueues ) ) return S_OK()
def execute(self): """Main Agent code: 1.- Query TaskQueueDB for existing TQs 2.- Add their Priorities 3.- Submit pilots """ self.__checkSubmitPools() self.directorDict = getResourceDict() # Add all submit pools self.directorDict["SubmitPool"] = self.am_getOption("SubmitPools") # Add all DIRAC platforms if not specified otherwise if not "Platform" in self.directorDict: result = getDIRACPlatforms() if result["OK"]: self.directorDict["Platform"] = result["Value"] rpcMatcher = RPCClient("WorkloadManagement/Matcher") result = rpcMatcher.getMatchingTaskQueues(self.directorDict) if not result["OK"]: self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"]) return result taskQueueDict = result["Value"] self.log.info("Found %s TaskQueues" % len(taskQueueDict)) if not taskQueueDict: self.log.info("No TaskQueue to Process") return S_OK() prioritySum = 0 waitingJobs = 0 for taskQueueID in taskQueueDict: taskQueueDict[taskQueueID]["TaskQueueID"] = taskQueueID prioritySum += taskQueueDict[taskQueueID]["Priority"] waitingJobs += taskQueueDict[taskQueueID]["Jobs"] self.log.info("Sum of Priorities %s" % prioritySum) if waitingJobs == 0: self.log.info("No waiting Jobs") return S_OK("No waiting Jobs") if prioritySum <= 0: return S_ERROR("Wrong TaskQueue Priorities") self.pilotsPerPriority = self.am_getOption("pilotsPerIteration") / prioritySum self.pilotsPerJob = self.am_getOption("pilotsPerIteration") / waitingJobs self.callBackLock.acquire() self.submittedPilots = 0 self.callBackLock.release() self.toSubmitPilots = 0 waitingStatusList = ["Submitted", "Ready", "Scheduled", "Waiting"] timeLimitToConsider = Time.toString(Time.dateTime() - Time.hour * self.am_getOption("maxPilotWaitingHours")) for taskQueueID in taskQueueDict: self.log.verbose("Processing TaskQueue", taskQueueID) result = pilotAgentsDB.countPilots( {"TaskQueueID": taskQueueID, "Status": waitingStatusList}, None, timeLimitToConsider ) if not result["OK"]: self.log.error("Fail to get Number of Waiting pilots", result["Message"]) waitingPilots = 0 else: waitingPilots = result["Value"] self.log.verbose("Waiting Pilots for TaskQueue %s:" % taskQueueID, waitingPilots) result = self.submitPilotsForTaskQueue(taskQueueDict[taskQueueID], waitingPilots) if result["OK"]: self.toSubmitPilots += result["Value"] self.log.info("Number of pilots to be Submitted %s" % self.toSubmitPilots) # Now wait until all Jobs in the Default ThreadPool are proccessed if "Default" in self.pools: # only for those in "Default' thread Pool # for pool in self.pools: self.pools["Default"].processAllResults() self.log.info("Number of pilots Submitted %s" % self.submittedPilots) return S_OK()
def submitJobs(self): """ Go through defined computing elements and submit jobs if necessary """ queues = self.queueDict.keys() # Check that there is some work at all setup = CSGlobals.getSetup() tqDict = { 'Setup': setup, 'CPUTime': 9999999, 'SubmitPool': self.defaultSubmitPools } if self.vo: tqDict['Community'] = self.vo if self.voGroups: tqDict['OwnerGroup'] = self.voGroups result = Resources.getCompatiblePlatforms(self.platforms) if not result['OK']: return result tqDict['Platform'] = result['Value'] tqDict['Site'] = self.sites tags = [] for queue in queues: tags += self.queueDict[queue]['ParametersDict']['Tags'] tqDict['Tag'] = list(set(tags)) self.log.verbose('Checking overall TQ availability with requirements') self.log.verbose(tqDict) rpcMatcher = RPCClient("WorkloadManagement/Matcher") result = rpcMatcher.getMatchingTaskQueues(tqDict) if not result['OK']: return result if not result['Value']: self.log.verbose('No Waiting jobs suitable for the director') return S_OK() jobSites = set() anySite = False testSites = set() totalWaitingJobs = 0 for tqID in result['Value']: if "Sites" in result['Value'][tqID]: for site in result['Value'][tqID]['Sites']: if site.lower() != 'any': jobSites.add(site) else: anySite = True else: anySite = True if "JobTypes" in result['Value'][tqID]: if "Sites" in result['Value'][tqID]: for site in result['Value'][tqID]['Sites']: if site.lower() != 'any': testSites.add(site) totalWaitingJobs += result['Value'][tqID]['Jobs'] tqIDList = result['Value'].keys() self.log.info(tqIDList) result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList, 'Status': WAITING_PILOT_STATUS }, None) tagWaitingPilots = 0 if result['OK']: tagWaitingPilots = result['Value'] self.log.info( 'Total %d jobs in %d task queues with %d waiting pilots' % (totalWaitingJobs, len(tqIDList), tagWaitingPilots)) self.log.info('Queues: ', self.queueDict.keys()) # if tagWaitingPilots >= totalWaitingJobs: # self.log.info( 'No more pilots to be submitted in this cycle' ) # return S_OK() # Check if the site is allowed in the mask result = jobDB.getSiteMask() if not result['OK']: return S_ERROR('Can not get the site mask') siteMaskList = result['Value'] random.shuffle(queues) totalSubmittedPilots = 0 matchedQueues = 0 for queue in queues: # Check if the queue failed previously failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor if failedCount != 0: self.log.warn("%s queue failed recently, skipping %d cycles" % (queue, 10 - failedCount)) self.failedQueues[queue] += 1 continue ce = self.queueDict[queue]['CE'] ceName = self.queueDict[queue]['CEName'] ceType = self.queueDict[queue]['CEType'] queueName = self.queueDict[queue]['QueueName'] siteName = self.queueDict[queue]['Site'] platform = self.queueDict[queue]['Platform'] queueTags = self.queueDict[queue]['ParametersDict']['Tags'] siteMask = siteName in siteMaskList processorTags = [] for tag in queueTags: if re.match(r'^[0-9]+Processors$', tag): processorTags.append(tag) if 'WholeNode' in queueTags: processorTags.append('WholeNode') if not anySite and siteName not in jobSites: self.log.verbose( "Skipping queue %s at %s: no workload expected" % (queueName, siteName)) continue if not siteMask and siteName not in testSites: self.log.verbose( "Skipping queue %s at site %s not in the mask" % (queueName, siteName)) continue if 'CPUTime' in self.queueDict[queue]['ParametersDict']: queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime']) else: self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue) continue if queueCPUTime > self.maxQueueLength: queueCPUTime = self.maxQueueLength # Prepare the queue description to look for eligible jobs ceDict = ce.getParameterDict() ceDict['GridCE'] = ceName # if not siteMask and 'Site' in ceDict: # self.log.info( 'Site not in the mask %s' % siteName ) # self.log.info( 'Removing "Site" from matching Dict' ) # del ceDict[ 'Site' ] if not siteMask: ceDict['JobType'] = "Test" if self.vo: ceDict['Community'] = self.vo if self.voGroups: ceDict['OwnerGroup'] = self.voGroups # This is a hack to get rid of ! ceDict['SubmitPool'] = self.defaultSubmitPools result = Resources.getCompatiblePlatforms(platform) if not result['OK']: continue ceDict['Platform'] = result['Value'] ceDict['Tag'] = processorTags # Get the number of eligible jobs for the target site/queue result = rpcMatcher.getMatchingTaskQueues(ceDict) if not result['OK']: self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message']) return result taskQueueDict = result['Value'] if not taskQueueDict: self.log.verbose('No matching TQs found for %s' % queue) continue matchedQueues += 1 totalTQJobs = 0 totalTQJobsByProcessors = {} tqIDList = taskQueueDict.keys() tqIDListByProcessors = {} for tq in taskQueueDict: if 'Tags' not in taskQueueDict[tq]: # skip non multiprocessor tqs continue for tag in taskQueueDict[tq]['Tags']: if tag in processorTags: tqIDListByProcessors.setdefault(tag, []) tqIDListByProcessors[tag].append(tq) totalTQJobsByProcessors.setdefault(tag, 0) totalTQJobsByProcessors[tag] += taskQueueDict[tq][ 'Jobs'] totalTQJobs += taskQueueDict[tq]['Jobs'] self.log.verbose( '%d job(s) from %d task queue(s) are eligible for %s queue' % (totalTQJobs, len(tqIDList), queue)) queueSubmittedPilots = 0 for tag in tqIDListByProcessors.keys(): self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" % (tag, tqIDListByProcessors[tag])) processors = 1 m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag) if m: processors = int(m.group('processors')) if tag == 'WholeNode': processors = -1 tagTQJobs = totalTQJobsByProcessors[tag] tagTqIDList = tqIDListByProcessors[tag] # Get the number of already waiting pilots for these task queues tagWaitingPilots = 0 if self.pilotWaitingFlag: lastUpdateTime = dateTime( ) - self.pilotWaitingTime * second result = pilotAgentsDB.countPilots( { 'TaskQueueID': tagTqIDList, 'Status': WAITING_PILOT_STATUS }, None, lastUpdateTime) if not result['OK']: self.log.error( 'Failed to get Number of Waiting pilots', result['Message']) tagWaitingPilots = 0 else: tagWaitingPilots = result['Value'] self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tagTqIDList, tagWaitingPilots) if tagWaitingPilots >= tagTQJobs: self.log.verbose( "%d waiting pilots already for all the available jobs" % tagWaitingPilots) continue self.log.verbose( "%d waiting pilots for the total of %d eligible jobs for %s" % (tagWaitingPilots, tagTQJobs, queue)) # Get the working proxy cpuTime = queueCPUTime + 86400 self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime)) result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime) if not result['OK']: return result self.proxy = result['Value'] ce.setProxy(self.proxy, cpuTime - 60) # Get the number of available slots on the target site/queue totalSlots = self.getQueueSlots(queue, False) if totalSlots == 0: self.log.debug('%s: No slots available' % queue) continue # Note: comparing slots to job numbers is not accurate in multiprocessor case. # This could lead to over submission. pilotsToSubmit = max( 0, min(totalSlots, tagTQJobs - tagWaitingPilots)) self.log.info( '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' % \ ( queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit ) ) # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT pilotsToSubmit = min( self.maxPilotsToSubmit - queueSubmittedPilots, pilotsToSubmit) while pilotsToSubmit > 0: self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue)) bundleProxy = self.queueDict[queue].get( 'BundleProxy', False) jobExecDir = '' jobExecDir = self.queueDict[queue]['ParametersDict'].get( 'JobExecDir', jobExecDir) httpProxy = self.queueDict[queue]['ParametersDict'].get( 'HttpProxy', '') result = self.getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir, processors) if not result['OK']: return result executable, pilotSubmissionChunk = result['Value'] result = ce.submitJob(executable, '', pilotSubmissionChunk, processors=processors) # ## FIXME: The condor thing only transfers the file with some # ## delay, so when we unlink here the script is gone # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts... if ceType != 'HTCondorCE': os.unlink(executable) if not result['OK']: self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message']) pilotsToSubmit = 0 self.failedQueues[queue] += 1 continue pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk queueSubmittedPilots += pilotSubmissionChunk # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the # task queue priorities pilotList = result['Value'] self.queueSlots[queue]['AvailableSlots'] -= len(pilotList) totalSubmittedPilots += len(pilotList) self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName)) stampDict = {} if result.has_key('PilotStampDict'): stampDict = result['PilotStampDict'] tqPriorityList = [] sumPriority = 0. for tq in tagTqIDList: sumPriority += taskQueueDict[tq]['Priority'] tqPriorityList.append((tq, sumPriority)) rndm = random.random() * sumPriority tqDict = {} for pilotID in pilotList: rndm = random.random() * sumPriority for tq, prio in tqPriorityList: if rndm < prio: tqID = tq break if not tqDict.has_key(tqID): tqDict[tqID] = [] tqDict[tqID].append(pilotID) for tqID, pilotList in tqDict.items(): result = pilotAgentsDB.addPilotTQReference( pilotList, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, '', stampDict) if not result['OK']: self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message']) continue for pilot in pilotList: result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName, 'Successfully submitted by the SiteDirector', siteName, queueName) if not result['OK']: self.log.error('Failed to set pilot status: ', result['Message']) continue self.log.info( "%d pilots submitted in total in this cycle, %d matched queues" % (totalSubmittedPilots, matchedQueues)) return S_OK()
def submitJobs(self): """ Go through defined computing elements and submit jobs if necessary """ # Check that there is some work at all setup = CSGlobals.getSetup() tqDict = { 'Setup': setup, 'CPUTime': 9999999, 'SubmitPool': self.defaultSubmitPools } if self.vo: tqDict['Community'] = self.vo if self.voGroups: tqDict['OwnerGroup'] = self.voGroups result = Resources.getCompatiblePlatforms(self.platforms) if not result['OK']: return result tqDict['Platform'] = result['Value'] tqDict['Site'] = self.sites self.log.verbose('Checking overall TQ availability with requirements') self.log.verbose(tqDict) rpcMatcher = RPCClient("WorkloadManagement/Matcher") result = rpcMatcher.getMatchingTaskQueues(tqDict) if not result['OK']: return result if not result['Value']: self.log.verbose('No Waiting jobs suitable for the director') return S_OK() # Check if the site is allowed in the mask result = jobDB.getSiteMask() if not result['OK']: return S_ERROR('Can not get the site mask') siteMaskList = result['Value'] queues = self.queueDict.keys() random.shuffle(queues) for queue in queues: ce = self.queueDict[queue]['CE'] ceName = self.queueDict[queue]['CEName'] ceType = self.queueDict[queue]['CEType'] queueName = self.queueDict[queue]['QueueName'] siteName = self.queueDict[queue]['Site'] siteMask = siteName in siteMaskList if 'CPUTime' in self.queueDict[queue]['ParametersDict']: queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime']) else: self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue) continue if queueCPUTime > self.maxQueueLength: queueCPUTime = self.maxQueueLength # Get the working proxy cpuTime = queueCPUTime + 86400 self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime)) result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime) if not result['OK']: return result self.proxy = result['Value'] ce.setProxy(self.proxy, cpuTime - 60) # Get the number of available slots on the target site/queue result = ce.available() if not result['OK']: self.log.warn( 'Failed to check the availability of queue %s: \n%s' % (queue, result['Message'])) continue ceInfoDict = result['CEInfoDict'] self.log.info( "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" % \ ( ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'], ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs'] ) ) totalSlots = result['Value'] ceDict = ce.getParameterDict() ceDict['GridCE'] = ceName if not siteMask and 'Site' in ceDict: self.log.info('Site not in the mask %s' % siteName) self.log.info('Removing "Site" from matching Dict') del ceDict['Site'] if self.vo: ceDict['Community'] = self.vo if self.voGroups: ceDict['OwnerGroup'] = self.voGroups # This is a hack to get rid of ! ceDict['SubmitPool'] = self.defaultSubmitPools result = Resources.getCompatiblePlatforms(self.platforms) if not result['OK']: continue ceDict['Platform'] = result['Value'] # Get the number of eligible jobs for the target site/queue result = rpcMatcher.getMatchingTaskQueues(ceDict) if not result['OK']: self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message']) return result taskQueueDict = result['Value'] if not taskQueueDict: self.log.info('No matching TQs found') continue totalTQJobs = 0 tqIDList = taskQueueDict.keys() for tq in taskQueueDict: totalTQJobs += taskQueueDict[tq]['Jobs'] pilotsToSubmit = min(totalSlots, totalTQJobs) # Get the number of already waiting pilots for this queue totalWaitingPilots = 0 if self.pilotWaitingFlag: lastUpdateTime = dateTime() - self.pilotWaitingTime * second result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList, 'Status': WAITING_PILOT_STATUS }, None, lastUpdateTime) if not result['OK']: self.log.error('Failed to get Number of Waiting pilots', result['Message']) totalWaitingPilots = 0 else: totalWaitingPilots = result['Value'] self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots) pilotsToSubmit = max( 0, min(totalSlots, totalTQJobs - totalWaitingPilots)) self.log.info( 'Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' % \ ( totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) ) # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit) while pilotsToSubmit > 0: self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue)) bundleProxy = self.queueDict[queue].get('BundleProxy', False) jobExecDir = '' if ceType == 'CREAM': jobExecDir = '.' jobExecDir = self.queueDict[queue].get('JobExecDir', jobExecDir) httpProxy = self.queueDict[queue].get('HttpProxy', '') result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir) if not result['OK']: return result executable, pilotSubmissionChunk = result['Value'] result = ce.submitJob(executable, '', pilotSubmissionChunk) if not result['OK']: self.log.error('Failed submission to queue %s:\n' % queue, result['Message']) pilotsToSubmit = 0 continue pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the # task queue priorities pilotList = result['Value'] self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName)) stampDict = {} if result.has_key('PilotStampDict'): stampDict = result['PilotStampDict'] tqPriorityList = [] sumPriority = 0. for tq in taskQueueDict: sumPriority += taskQueueDict[tq]['Priority'] tqPriorityList.append((tq, sumPriority)) rndm = random.random() * sumPriority tqDict = {} for pilotID in pilotList: rndm = random.random() * sumPriority for tq, prio in tqPriorityList: if rndm < prio: tqID = tq break if not tqDict.has_key(tqID): tqDict[tqID] = [] tqDict[tqID].append(pilotID) for tqID, pilotList in tqDict.items(): result = pilotAgentsDB.addPilotTQReference( pilotList, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, '', stampDict) if not result['OK']: self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message']) continue for pilot in pilotList: result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName, 'Successfully submitted by the SiteDirector', siteName, queueName) if not result['OK']: self.log.error('Failed to set pilot status: ', result['Message']) continue return S_OK()
def execute(self): """Main Agent code: 1.- Query TaskQueueDB for existing TQs 2.- Add their Priorities 3.- Submit pilots """ self.__checkSubmitPools() self.directorDict = getResourceDict() #Add all submit pools self.directorDict['SubmitPool'] = self.am_getOption("SubmitPools") rpcMatcher = RPCClient("WorkloadManagement/Matcher") result = rpcMatcher.getMatchingTaskQueues(self.directorDict) if not result['OK']: self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message']) return result taskQueueDict = result['Value'] self.log.info('Found %s TaskQueues' % len(taskQueueDict)) if not taskQueueDict: self.log.info('No TaskQueue to Process') return S_OK() prioritySum = 0 waitingJobs = 0 for taskQueueID in taskQueueDict: taskQueueDict[taskQueueID]['TaskQueueID'] = taskQueueID prioritySum += taskQueueDict[taskQueueID]['Priority'] waitingJobs += taskQueueDict[taskQueueID]['Jobs'] self.log.info('Sum of Priorities %s' % prioritySum) if waitingJobs == 0: self.log.info('No waiting Jobs') return S_OK('No waiting Jobs') if prioritySum <= 0: return S_ERROR('Wrong TaskQueue Priorities') self.pilotsPerPriority = self.am_getOption( 'pilotsPerIteration') / prioritySum self.pilotsPerJob = self.am_getOption( 'pilotsPerIteration') / waitingJobs self.callBackLock.acquire() self.submittedPilots = 0 self.callBackLock.release() self.toSubmitPilots = 0 waitingStatusList = ['Submitted', 'Ready', 'Scheduled', 'Waiting'] timeLimitToConsider = Time.toString( Time.dateTime() - Time.hour * self.am_getOption("maxPilotWaitingHours")) for taskQueueID in taskQueueDict: self.log.verbose('Processing TaskQueue', taskQueueID) result = pilotAgentsDB.countPilots( { 'TaskQueueID': taskQueueID, 'Status': waitingStatusList }, None, timeLimitToConsider) if not result['OK']: self.log.error('Fail to get Number of Waiting pilots', result['Message']) waitingPilots = 0 else: waitingPilots = result['Value'] self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % taskQueueID, waitingPilots) result = self.submitPilotsForTaskQueue(taskQueueDict[taskQueueID], waitingPilots) if result['OK']: self.toSubmitPilots += result['Value'] self.log.info('Number of pilots to be Submitted %s' % self.toSubmitPilots) # Now wait until all Jobs in the Default ThreadPool are proccessed if 'Default' in self.pools: # only for those in "Default' thread Pool # for pool in self.pools: self.pools['Default'].processAllResults() self.log.info('Number of pilots Submitted %s' % self.submittedPilots) return S_OK()
def submitJobs( self ): """ Go through defined computing elements and submit jobs if necessary """ # Check that there is some work at all setup = CSGlobals.getSetup() tqDict = { 'Setup':setup, 'CPUTime': 9999999, 'SubmitPool' : self.defaultSubmitPools } if self.vo: tqDict['Community'] = self.vo if self.voGroups: tqDict['OwnerGroup'] = self.voGroups result = Resources.getCompatiblePlatforms( self.platforms ) if not result['OK']: return result tqDict['Platform'] = result['Value'] tqDict['Site'] = self.sites self.log.verbose( 'Checking overall TQ availability with requirements' ) self.log.verbose( tqDict ) rpcMatcher = RPCClient( "WorkloadManagement/Matcher" ) result = rpcMatcher.getMatchingTaskQueues( tqDict ) if not result[ 'OK' ]: return result if not result['Value']: self.log.verbose( 'No Waiting jobs suitable for the director' ) return S_OK() queues = self.queueDict.keys() random.shuffle( queues ) for queue in queues: ce = self.queueDict[queue]['CE'] ceName = self.queueDict[queue]['CEName'] ceType = self.queueDict[queue]['CEType'] queueName = self.queueDict[queue]['QueueName'] siteName = self.queueDict[queue]['Site'] siteMask = self.siteStatus.isUsableSite( siteName, 'ComputingAccess' ) platform = self.queueDict[queue]['Platform'] if 'CPUTime' in self.queueDict[queue]['ParametersDict'] : queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] ) else: self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue ) continue if queueCPUTime > self.maxQueueLength: queueCPUTime = self.maxQueueLength # Get the working proxy cpuTime = queueCPUTime + 86400 self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) ) result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime ) if not result['OK']: return result self.proxy = result['Value'] ce.setProxy( self.proxy, cpuTime - 60 ) # Get the number of available slots on the target site/queue result = ce.available() if not result['OK']: self.log.warn( 'Failed to check the availability of queue %s: \n%s' % ( queue, result['Message'] ) ) continue ceInfoDict = result['CEInfoDict'] self.log.info( "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" % \ ( ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'], ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs'] ) ) totalSlots = result['Value'] ceDict = ce.getParameterDict() ceDict[ 'GridCE' ] = ceName if not siteMask and 'Site' in ceDict: self.log.info( 'Site not in the mask %s' % siteName ) self.log.info( 'Removing "Site" from matching Dict' ) del ceDict[ 'Site' ] if self.vo: ceDict['Community'] = self.vo if self.voGroups: ceDict['OwnerGroup'] = self.voGroups # This is a hack to get rid of ! ceDict['SubmitPool'] = self.defaultSubmitPools result = Resources.getCompatiblePlatforms( platform ) if not result['OK']: continue ceDict['Platform'] = result['Value'] # Get the number of eligible jobs for the target site/queue result = rpcMatcher.getMatchingTaskQueues( ceDict ) if not result['OK']: self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] ) return result taskQueueDict = result['Value'] if not taskQueueDict: self.log.info( 'No matching TQs found' ) continue totalTQJobs = 0 tqIDList = taskQueueDict.keys() for tq in taskQueueDict: totalTQJobs += taskQueueDict[tq]['Jobs'] pilotsToSubmit = min( totalSlots, totalTQJobs ) # Get the number of already waiting pilots for this queue totalWaitingPilots = 0 if self.pilotWaitingFlag: lastUpdateTime = dateTime() - self.pilotWaitingTime * second result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList, 'Status': WAITING_PILOT_STATUS }, None, lastUpdateTime ) if not result['OK']: self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] ) totalWaitingPilots = 0 else: totalWaitingPilots = result['Value'] self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots ) pilotsToSubmit = max( 0, min( totalSlots, totalTQJobs - totalWaitingPilots ) ) self.log.info( 'Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' % \ ( totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) ) # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT pilotsToSubmit = min( self.maxPilotsToSubmit, pilotsToSubmit ) while pilotsToSubmit > 0: self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) ) bundleProxy = self.queueDict[queue].get( 'BundleProxy', False ) jobExecDir = '' if ceType == 'CREAM': jobExecDir = '.' jobExecDir = self.queueDict[queue].get( 'JobExecDir', jobExecDir ) httpProxy = self.queueDict[queue].get( 'HttpProxy', '' ) result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir ) if not result['OK']: return result executable, pilotSubmissionChunk = result['Value'] result = ce.submitJob( executable, '', pilotSubmissionChunk ) os.unlink( executable ) if not result['OK']: self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] ) pilotsToSubmit = 0 continue pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the # task queue priorities pilotList = result['Value'] self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) ) stampDict = {} if result.has_key( 'PilotStampDict' ): stampDict = result['PilotStampDict'] tqPriorityList = [] sumPriority = 0. for tq in taskQueueDict: sumPriority += taskQueueDict[tq]['Priority'] tqPriorityList.append( ( tq, sumPriority ) ) rndm = random.random()*sumPriority tqDict = {} for pilotID in pilotList: rndm = random.random()*sumPriority for tq, prio in tqPriorityList: if rndm < prio: tqID = tq break if not tqDict.has_key( tqID ): tqDict[tqID] = [] tqDict[tqID].append( pilotID ) for tqID, pilotList in tqDict.items(): result = pilotAgentsDB.addPilotTQReference( pilotList, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, '', stampDict ) if not result['OK']: self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message'] ) continue for pilot in pilotList: result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName, 'Successfully submitted by the SiteDirector', siteName, queueName ) if not result['OK']: self.log.error( 'Failed to set pilot status: ', result['Message'] ) continue return S_OK()
def submitJobs(self): """ Go through defined computing elements and submit jobs if necessary """ # Check that there is some work at all setup = CSGlobals.getSetup() tqDict = {"Setup": setup, "CPUTime": 9999999, "SubmitPool": self.defaultSubmitPools} if self.vo: tqDict["Community"] = self.vo if self.voGroups: tqDict["OwnerGroup"] = self.voGroups result = Resources.getCompatiblePlatforms(self.platforms) if not result["OK"]: return result tqDict["Platform"] = result["Value"] tqDict["Site"] = self.sites self.log.verbose("Checking overall TQ availability with requirements") self.log.verbose(tqDict) rpcMatcher = RPCClient("WorkloadManagement/Matcher") result = rpcMatcher.getMatchingTaskQueues(tqDict) if not result["OK"]: return result if not result["Value"]: self.log.verbose("No Waiting jobs suitable for the director") return S_OK() # Check if the site is allowed in the mask result = jobDB.getSiteMask() if not result["OK"]: return S_ERROR("Can not get the site mask") siteMaskList = result["Value"] queues = self.queueDict.keys() random.shuffle(queues) for queue in queues: ce = self.queueDict[queue]["CE"] ceName = self.queueDict[queue]["CEName"] ceType = self.queueDict[queue]["CEType"] queueName = self.queueDict[queue]["QueueName"] siteName = self.queueDict[queue]["Site"] siteMask = siteName in siteMaskList if "CPUTime" in self.queueDict[queue]["ParametersDict"]: queueCPUTime = int(self.queueDict[queue]["ParametersDict"]["CPUTime"]) else: self.log.warn("CPU time limit is not specified for queue %s, skipping..." % queue) continue if queueCPUTime > self.maxQueueLength: queueCPUTime = self.maxQueueLength # Get the working proxy cpuTime = queueCPUTime + 86400 self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime)) result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime) if not result["OK"]: return result self.proxy = result["Value"] ce.setProxy(self.proxy, cpuTime - 60) # Get the number of available slots on the target site/queue result = ce.available() if not result["OK"]: self.log.warn("Failed to check the availability of queue %s: \n%s" % (queue, result["Message"])) continue ceInfoDict = result["CEInfoDict"] self.log.info( "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" % ( ceName, queueName, ceInfoDict["WaitingJobs"], ceInfoDict["RunningJobs"], ceInfoDict["SubmittedJobs"], ceInfoDict["MaxTotalJobs"], ) ) totalSlots = result["Value"] ceDict = ce.getParameterDict() ceDict["GridCE"] = ceName if not siteMask and "Site" in ceDict: self.log.info("Site not in the mask %s" % siteName) self.log.info('Removing "Site" from matching Dict') del ceDict["Site"] if self.vo: ceDict["Community"] = self.vo if self.voGroups: ceDict["OwnerGroup"] = self.voGroups # This is a hack to get rid of ! ceDict["SubmitPool"] = self.defaultSubmitPools result = Resources.getCompatiblePlatforms(self.platforms) if not result["OK"]: continue ceDict["Platform"] = result["Value"] # Get the number of eligible jobs for the target site/queue result = rpcMatcher.getMatchingTaskQueues(ceDict) if not result["OK"]: self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"]) return result taskQueueDict = result["Value"] if not taskQueueDict: self.log.info("No matching TQs found") continue totalTQJobs = 0 tqIDList = taskQueueDict.keys() for tq in taskQueueDict: totalTQJobs += taskQueueDict[tq]["Jobs"] pilotsToSubmit = min(totalSlots, totalTQJobs) # Get the number of already waiting pilots for this queue totalWaitingPilots = 0 if self.pilotWaitingFlag: lastUpdateTime = dateTime() - self.pilotWaitingTime * second result = pilotAgentsDB.countPilots( {"TaskQueueID": tqIDList, "Status": WAITING_PILOT_STATUS}, None, lastUpdateTime ) if not result["OK"]: self.log.error("Failed to get Number of Waiting pilots", result["Message"]) totalWaitingPilots = 0 else: totalWaitingPilots = result["Value"] self.log.verbose("Waiting Pilots for TaskQueue %s:" % tqIDList, totalWaitingPilots) pilotsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingPilots)) self.log.info( "Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d" % (totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit) ) # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit) while pilotsToSubmit > 0: self.log.info("Going to submit %d pilots to %s queue" % (pilotsToSubmit, queue)) bundleProxy = self.queueDict[queue].get("BundleProxy", False) jobExecDir = "" if ceType == "CREAM": jobExecDir = "." jobExecDir = self.queueDict[queue].get("JobExecDir", jobExecDir) httpProxy = self.queueDict[queue].get("HttpProxy", "") result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir) if not result["OK"]: return result executable, pilotSubmissionChunk = result["Value"] result = ce.submitJob(executable, "", pilotSubmissionChunk) os.unlink(executable) if not result["OK"]: self.log.error("Failed submission to queue %s:\n" % queue, result["Message"]) pilotsToSubmit = 0 continue pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the # task queue priorities pilotList = result["Value"] self.log.info("Submitted %d pilots to %s@%s" % (len(pilotList), queueName, ceName)) stampDict = {} if result.has_key("PilotStampDict"): stampDict = result["PilotStampDict"] tqPriorityList = [] sumPriority = 0.0 for tq in taskQueueDict: sumPriority += taskQueueDict[tq]["Priority"] tqPriorityList.append((tq, sumPriority)) rndm = random.random() * sumPriority tqDict = {} for pilotID in pilotList: rndm = random.random() * sumPriority for tq, prio in tqPriorityList: if rndm < prio: tqID = tq break if not tqDict.has_key(tqID): tqDict[tqID] = [] tqDict[tqID].append(pilotID) for tqID, pilotList in tqDict.items(): result = pilotAgentsDB.addPilotTQReference( pilotList, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, "", stampDict ) if not result["OK"]: self.log.error("Failed add pilots to the PilotAgentsDB: ", result["Message"]) continue for pilot in pilotList: result = pilotAgentsDB.setPilotStatus( pilot, "Submitted", ceName, "Successfully submitted by the SiteDirector", siteName, queueName, ) if not result["OK"]: self.log.error("Failed to set pilot status: ", result["Message"]) continue return S_OK()
def submitJobs(self): """ Go through defined computing elements and submit jobs if necessary """ queues = self.queueDict.keys() # Check that there is some work at all setup = CSGlobals.getSetup() tqDict = {'Setup': setup, 'CPUTime': 9999999, 'SubmitPool': self.defaultSubmitPools} if self.vo: tqDict['Community'] = self.vo if self.voGroups: tqDict['OwnerGroup'] = self.voGroups if self.checkPlatform: result = self.resourcesModule.getCompatiblePlatforms(self.platforms) if not result['OK']: return result tqDict['Platform'] = result['Value'] tqDict['Site'] = self.sites tags = [] for queue in queues: tags += self.queueDict[queue]['ParametersDict']['Tag'] tqDict['Tag'] = list(set(tags)) self.log.verbose('Checking overall TQ availability with requirements') self.log.verbose(tqDict) matcherClient = MatcherClient() result = matcherClient.getMatchingTaskQueues(tqDict) if not result['OK']: return result if not result['Value']: self.log.verbose('No Waiting jobs suitable for the director') return S_OK() jobSites = set() anySite = False testSites = set() totalWaitingJobs = 0 for tqID in result['Value']: if "Sites" in result['Value'][tqID]: for site in result['Value'][tqID]['Sites']: if site.lower() != 'any': jobSites.add(site) else: anySite = True else: anySite = True if "JobTypes" in result['Value'][tqID]: if "Sites" in result['Value'][tqID]: for site in result['Value'][tqID]['Sites']: if site.lower() != 'any': testSites.add(site) totalWaitingJobs += result['Value'][tqID]['Jobs'] tqIDList = result['Value'].keys() self.log.info(tqIDList) result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList, 'Status': WAITING_PILOT_STATUS}, None) tagWaitingPilots = 0 if result['OK']: tagWaitingPilots = result['Value'] self.log.info('Total %d jobs in %d task queues with %d waiting pilots' % (totalWaitingJobs, len(tqIDList), tagWaitingPilots)) self.log.info('Queues: ', self.queueDict.keys()) # if tagWaitingPilots >= totalWaitingJobs: # self.log.info( 'No more pilots to be submitted in this cycle' ) # return S_OK() result = self.siteClient.getUsableSites() if not result['OK']: return result siteMaskList = result['Value'] queues = self.queueDict.keys() random.shuffle(queues) totalSubmittedPilots = 0 matchedQueues = 0 for queue in queues: # Check if the queue failed previously failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor if failedCount != 0: self.log.warn("%s queue failed recently, skipping %d cycles" % (queue, 10 - failedCount)) self.failedQueues[queue] += 1 continue ce = self.queueDict[queue]['CE'] ceName = self.queueDict[queue]['CEName'] ceType = self.queueDict[queue]['CEType'] queueName = self.queueDict[queue]['QueueName'] siteName = self.queueDict[queue]['Site'] queueTags = self.queueDict[queue]['ParametersDict']['Tag'] siteMask = siteName in siteMaskList processorTags = [] # Check the status of the Site result = self.siteClient.getUsableSites(siteName) if not result['OK']: self.log.error("Can not get the status of site %s: %s" % (siteName, result['Message'])) continue if siteName not in result.get('Value', []): self.log.info("site %s is not active" % siteName) continue if self.rssFlag: # Check the status of the ComputingElement result = self.rssClient.getElementStatus(ceName, "ComputingElement") if not result['OK']: self.log.error("Can not get the status of computing element", " %s: %s" % (siteName, result['Message'])) continue if result['Value']: # get the value of the status result = result['Value'][ceName]['all'] if result not in ('Active', 'Degraded'): self.log.verbose( "Skipping computing element %s at %s: resource not usable" % (ceName, siteName)) continue for tag in queueTags: if re.match(r'^[0-9]+Processors$', tag): processorTags.append(tag) if 'WholeNode' in queueTags: processorTags.append('WholeNode') if not anySite and siteName not in jobSites: self.log.verbose("Skipping queue %s at %s: no workload expected" % (queueName, siteName)) continue if not siteMask and siteName not in testSites: self.log.verbose("Skipping queue %s at site %s not in the mask" % (queueName, siteName)) continue if 'CPUTime' in self.queueDict[queue]['ParametersDict']: queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime']) else: self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue) continue if queueCPUTime > self.maxQueueLength: queueCPUTime = self.maxQueueLength # Prepare the queue description to look for eligible jobs ceDict = ce.getParameterDict() ceDict['GridCE'] = ceName # if not siteMask and 'Site' in ceDict: # self.log.info( 'Site not in the mask %s' % siteName ) # self.log.info( 'Removing "Site" from matching Dict' ) # del ceDict[ 'Site' ] if not siteMask: ceDict['JobType'] = "Test" if self.vo: ceDict['Community'] = self.vo if self.voGroups: ceDict['OwnerGroup'] = self.voGroups # This is a hack to get rid of ! ceDict['SubmitPool'] = self.defaultSubmitPools if self.checkPlatform: platform = self.queueDict[queue]['Platform'] result = self.resourcesModule.getCompatiblePlatforms(platform) if not result['OK']: continue ceDict['Platform'] = result['Value'] ceDict['Tag'] = queueTags # Get the number of eligible jobs for the target site/queue result = matcherClient.getMatchingTaskQueues(ceDict) if not result['OK']: self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message']) return result taskQueueDict = result['Value'] if not taskQueueDict: self.log.verbose('No matching TQs found for %s' % queue) continue matchedQueues += 1 totalTQJobs = 0 totalTQJobsByProcessors = {} tqIDList = taskQueueDict.keys() tqIDListByProcessors = {} for tq in taskQueueDict: if 'Tags' not in taskQueueDict[tq]: # skip non multiprocessor tqs continue for tag in taskQueueDict[tq]['Tags']: if tag in processorTags: tqIDListByProcessors.setdefault(tag, []) tqIDListByProcessors[tag].append(tq) totalTQJobsByProcessors.setdefault(tag, 0) totalTQJobsByProcessors[tag] += taskQueueDict[tq]['Jobs'] totalTQJobs += taskQueueDict[tq]['Jobs'] self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' % (totalTQJobs, len(tqIDList), queue)) queueSubmittedPilots = 0 for tag in tqIDListByProcessors: self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" % (tag, tqIDListByProcessors[tag])) processors = 1 m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag) if m: processors = int(m.group('processors')) if tag == 'WholeNode': processors = -1 tagTQJobs = totalTQJobsByProcessors[tag] tagTqIDList = tqIDListByProcessors[tag] # Get the number of already waiting pilots for these task queues tagWaitingPilots = 0 if self.pilotWaitingFlag: result = pilotAgentsDB.countPilots({'TaskQueueID': tagTqIDList, 'Status': WAITING_PILOT_STATUS}, None) if not result['OK']: self.log.error('Failed to get Number of Waiting pilots', result['Message']) tagWaitingPilots = 0 else: tagWaitingPilots = result['Value'] self.log.verbose('Waiting Pilots for TaskQueue %s:' % tagTqIDList, tagWaitingPilots) if tagWaitingPilots >= tagTQJobs: self.log.verbose("%d waiting pilots already for all the available jobs" % tagWaitingPilots) continue self.log.verbose("%d waiting pilots for the total of %d eligible jobs for %s" % (tagWaitingPilots, tagTQJobs, queue)) # Get the working proxy cpuTime = queueCPUTime + 86400 self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime)) result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime) if not result['OK']: return result self.proxy = result['Value'] ce.setProxy(self.proxy, cpuTime - 60) # Get the number of available slots on the target site/queue totalSlots = self.getQueueSlots(queue, False) if totalSlots == 0: self.log.debug('%s: No slots available' % queue) continue # Note: comparing slots to job numbers is not accurate in multiprocessor case. # This could lead to over submission. pilotsToSubmit = max(0, min(totalSlots, tagTQJobs - tagWaitingPilots)) self.log.info('%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' % (queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit)) # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT pilotsToSubmit = min(self.maxPilotsToSubmit - queueSubmittedPilots, pilotsToSubmit) while pilotsToSubmit > 0: self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue)) bundleProxy = self.queueDict[queue].get('BundleProxy', False) jobExecDir = '' jobExecDir = self.queueDict[queue]['ParametersDict'].get('JobExecDir', jobExecDir) executable, pilotSubmissionChunk = self.getExecutable(queue, pilotsToSubmit, bundleProxy=bundleProxy, jobExecDir=jobExecDir, processors=processors) result = ce.submitJob(executable, '', pilotSubmissionChunk, processors=processors) # ## FIXME: The condor thing only transfers the file with some # ## delay, so when we unlink here the script is gone # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts... if ceType != 'HTCondorCE': os.unlink(executable) if not result['OK']: self.log.error('Failed submission to queue %s:\n' % queue, result['Message']) pilotsToSubmit = 0 self.failedQueues[queue] += 1 continue pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk queueSubmittedPilots += pilotSubmissionChunk # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the # task queue priorities pilotList = result['Value'] self.queueSlots[queue]['AvailableSlots'] -= len(pilotList) totalSubmittedPilots += len(pilotList) self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName)) stampDict = {} if 'PilotStampDict' in result: stampDict = result['PilotStampDict'] tqPriorityList = [] sumPriority = 0. for tq in tagTqIDList: sumPriority += taskQueueDict[tq]['Priority'] tqPriorityList.append((tq, sumPriority)) rndm = random.random() * sumPriority tqDict = {} for pilotID in pilotList: rndm = random.random() * sumPriority for tq, prio in tqPriorityList: if rndm < prio: tqID = tq break if tqID not in tqDict: tqDict[tqID] = [] tqDict[tqID].append(pilotID) for tqID, pilotList in tqDict.items(): result = pilotAgentsDB.addPilotTQReference(pilotList, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, stampDict) if not result['OK']: self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message']) continue for pilot in pilotList: result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName, 'Successfully submitted by the SiteDirector', siteName, queueName) if not result['OK']: self.log.error('Failed to set pilot status: ', result['Message']) continue self.log.info( "%d pilots submitted in total in this cycle, %d matched queues" % (totalSubmittedPilots, matchedQueues)) return S_OK()