def _getPilotProxyFromDIRACGroup(self, ownerDN, ownerGroup, requiredTimeLeft):
    """Fetch a pilot proxy for the given owner DN and DIRAC group.

    This is a hook: it is meant to be overwritten when a given pilot does
    not require a full proxy. The default simply delegates to the proxy manager.

    :param ownerDN: DN of the proxy owner
    :param ownerGroup: DIRAC group of the proxy owner
    :param requiredTimeLeft: lifetime requested for the proxy, forwarded as is
    :return: the result of gProxyManager.getPilotProxyFromDIRACGroup, unchanged
    """
    proxyResult = gProxyManager.getPilotProxyFromDIRACGroup(
        ownerDN,
        ownerGroup,
        requiredTimeLeft,
    )
    return proxyResult
def getProxyFileForCloud(ce):
    """Dump to a file the proxy used to connect to the given cloud endpoint.

    The owner DN and group are resolved in two steps: first from the generic
    cloud credentials of the endpoint "VO" parameter (if set), then overridden
    by the "GenericCloudUser" / "GenericCloudGroup" endpoint parameters.

    :param ce: cloud endpoint object exposing a ``parameters`` mapping
    :return: S_OK/S_ERROR, value is the path to the proxy file
    """
    dn = None
    group = None

    # Defaults derived from the VO, when the endpoint declares one
    voName = ce.parameters.get("VO")
    if voName:
        credentials = findGenericCloudCredentials(vo=voName)
        if not credentials["OK"]:
            return credentials
        dn, group = credentials["Value"]

    # Explicit endpoint settings take precedence over the VO defaults
    userName = ce.parameters.get("GenericCloudUser")
    if userName:
        dnLookup = Registry.getDNForUsername(userName)
        if not dnLookup["OK"]:
            return dnLookup
        dn = dnLookup["Value"][0]
    group = ce.parameters.get("GenericCloudGroup", group)

    if not (dn and group):
        return S_ERROR("Could not find generic cloud credentials")

    proxyResult = gProxyManager.getPilotProxyFromDIRACGroup(dn, group, 3600)
    if not proxyResult["OK"]:
        return proxyResult
    return gProxyManager.dumpProxyToFile(proxyResult["Value"])
def _getPilotProxyFromDIRACGroup(self, ownerDN, ownerGroup, requiredTimeLeft):
    """Log the download and fetch a pilot proxy for the given DN and group.

    This is a hook: it is meant to be overwritten when a given pilot does
    not require a full proxy.

    :param ownerDN: DN of the proxy owner
    :param ownerGroup: DIRAC group of the proxy owner
    :param requiredTimeLeft: lifetime requested for the proxy, forwarded as is
    :return: the result of gProxyManager.getPilotProxyFromDIRACGroup, unchanged
    """
    message = "Downloading %s@%s proxy" % (ownerDN, ownerGroup)
    self.log.info(message)
    proxyResult = gProxyManager.getPilotProxyFromDIRACGroup(
        ownerDN,
        ownerGroup,
        requiredTimeLeft,
    )
    return proxyResult
def getPilotProxy(self, userDN, userGroup, validity=43200):
    """Retrieve a pilot proxy for the given user DN and group.

    The request is delegated to the proxy manager; nothing else is done
    with the outcome here.

    :param userDN: DN of the user the proxy is requested for
    :param userGroup: DIRAC group the proxy is requested for
    :param validity: requested time left, 43200 (12 hours) by default
    :return: the result of gProxyManager.getPilotProxyFromDIRACGroup, unchanged
    """
    proxyResult = gProxyManager.getPilotProxyFromDIRACGroup(
        userDN,
        userGroup,
        requiredTimeLeft=validity,
    )
    return proxyResult
def getPilotProxy(self, userDN, userGroup, validity=43200):
    """Ask the proxy manager for a pilot proxy of the given DN and group.

    :param userDN: DN of the user the proxy is requested for
    :param userGroup: DIRAC group the proxy is requested for
    :param validity: requested time left, 43200 (12 hours) by default
    :return: whatever gProxyManager.getPilotProxyFromDIRACGroup returns
    """
    # The validity is forwarded under the keyword the proxy manager expects
    request = {"requiredTimeLeft": validity}
    return gProxyManager.getPilotProxyFromDIRACGroup(userDN, userGroup, **request)
def __getExecutable(self, queue, pilotsToSubmit, bundleProxy=True):
    """Prepare the full pilot executable for a queue.

    :param queue: queue the pilot options are compiled for
    :param pilotsToSubmit: number of pilots, forwarded to the options builder
    :param bundleProxy: when true the proxy is passed on to the script writer,
                        otherwise an empty string is passed instead
    :return: the failed proxy request result, S_ERROR if the pilot options
             could not be compiled, or S_OK() extended with the 'Executable'
             and 'Proxy' keys
    """
    proxyResult = gProxyManager.getPilotProxyFromDIRACGroup(self.genericPilotDN,
                                                            self.genericPilotGroup,
                                                            1000)
    if not proxyResult['OK']:
        return proxyResult
    pilotProxy = proxyResult['Value']

    options = self.__getPilotOptions(queue, pilotsToSubmit)
    if options is None:
        return S_ERROR('Errors in compiling pilot options')

    # The proxy is always reported back, but only bundled on request
    bundled = pilotProxy if bundleProxy else ''
    script = self.__writePilotScript(self.workingDirectory, options, bundled)

    outcome = S_OK()
    outcome['Executable'] = script
    outcome['Proxy'] = pilotProxy
    return outcome
def __getExecutable(self, queue, pilotsToSubmit, bundleProxy=True):
    """Build the pilot script to be submitted to the given queue.

    :param queue: queue the pilot options are compiled for
    :param pilotsToSubmit: number of pilots, forwarded to the options builder
    :param bundleProxy: whether the proxy is handed to the script writer
                        (an empty string is handed over otherwise)
    :return: S_OK() carrying the extra 'Executable' and 'Proxy' keys; the
             failed proxy result or an S_ERROR when a step fails
    """
    fetched = gProxyManager.getPilotProxyFromDIRACGroup(
        self.genericPilotDN, self.genericPilotGroup, 1000)
    if not fetched['OK']:
        return fetched

    fullProxy = fetched['Value']
    scriptProxy = fullProxy
    if not bundleProxy:
        scriptProxy = ''

    compiledOptions = self.__getPilotOptions(queue, pilotsToSubmit)
    if compiledOptions is None:
        return S_ERROR('Errors in compiling pilot options')

    answer = S_OK()
    answer['Executable'] = self.__writePilotScript(
        self.workingDirectory, compiledOptions, scriptProxy)
    answer['Proxy'] = fullProxy
    return answer
def execute(self):
    """The JobAgent execution method.

    One cycle visits the configured queues in random order. For each queue
    allowed to submit, a pilot proxy is installed on the CE, then jobs are
    matched and submitted one after the other until matching fails, the CE
    reports no availability, or a job processing step fails.

    Every per-queue failure increments ``self.failedQueues[queueName]``.

    :return: S_OK("Push Job Agent cycle complete") at the end of the cycle;
             the result of the local CE availability check when that check
             stops the cycle; the failed result when the pilot proxy cannot
             be obtained or its remaining lifetime cannot be read
    """
    self.log.verbose("Job Agent execution loop")

    # Shuffle a copy of the queues so they are not always visited in the same order
    queueDictItems = list(self.queueDict.items())
    random.shuffle(queueDictItems)

    # Check that there is enough slots locally
    result = self._checkCEAvailability(self.computingElement)
    if not result["OK"] or result["Value"]:
        return result

    for queueName, queueDictionary in queueDictItems:

        # Make sure there is no problem with the queue before trying to submit
        if not self._allowedToSubmit(queueName):
            continue

        # Get a working proxy
        ce = queueDictionary["CE"]
        cpuTime = 86400 * 3
        self.log.verbose(
            "Getting pilot proxy", "for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(
            self.pilotDN, self.pilotGroup, cpuTime)
        if not result["OK"]:
            return result
        proxy = result["Value"]
        # The CE is told the actual remaining lifetime, not the requested one
        result = proxy.getRemainingSecs()  # pylint: disable=no-member
        if not result["OK"]:
            return result
        lifetime_secs = result["Value"]
        ce.setProxy(proxy, lifetime_secs)

        # Check that there is enough slots in the remote CE to match a job
        result = self._checkCEAvailability(ce)
        if not result["OK"] or result["Value"]:
            self.failedQueues[queueName] += 1
            continue

        # Get environment details and enhance them
        result = self._getCEDict(ce)
        if not result["OK"]:
            self.failedQueues[queueName] += 1
            continue
        ceDictList = result["Value"]

        for ceDict in ceDictList:
            # Information about number of processors might not be returned in CE.getCEStatus()
            ceDict["NumberOfProcessors"] = ce.ceParameters.get(
                "NumberOfProcessors")
            self._setCEDict(ceDict)

        # Update the configuration with the names of the Site, CE and queue to target
        # This is used in the next stages
        self._updateConfiguration("Site", queueDictionary["Site"])
        self._updateConfiguration("GridCE", queueDictionary["CEName"])
        self._updateConfiguration("CEQueue", queueDictionary["QueueName"])
        self._updateConfiguration("RemoteExecution", True)

        # Try to match a job
        jobRequest = self._matchAJob(ceDictList)
        while jobRequest["OK"]:

            # Check matcher information returned
            matcherParams = ["JDL", "DN", "Group"]
            matcherInfo = jobRequest["Value"]
            jobID = matcherInfo["JobID"]
            jobReport = JobReport(jobID, "PushJobAgent@%s" % self.siteName)
            result = self._checkMatcherInfo(matcherInfo, matcherParams, jobReport)
            if not result["OK"]:
                self.failedQueues[queueName] += 1
                break

            jobJDL = matcherInfo["JDL"]
            jobGroup = matcherInfo["Group"]
            ownerDN = matcherInfo["DN"]
            ceDict = matcherInfo["CEDict"]
            matchTime = matcherInfo["matchTime"]

            # Everything the matcher returned besides JDL/DN/Group is passed on
            # as optimizer parameters
            optimizerParams = {}
            for key in matcherInfo:
                if key not in matcherParams:
                    optimizerParams[key] = matcherInfo[key]

            # Get JDL parameters
            parameters = self._getJDLParameters(jobJDL)
            if not parameters["OK"]:
                jobReport.setJobStatus(
                    status=JobStatus.FAILED, minorStatus="Could Not Extract JDL Parameters")
                self.log.warn("Could Not Extract JDL Parameters", parameters["Message"])
                self.failedQueues[queueName] += 1
                break

            params = parameters["Value"]
            result = self._extractValuesFromJobParams(params, jobReport)
            if not result["OK"]:
                self.failedQueues[queueName] += 1
                break
            submissionParams = result["Value"]
            # From here on the job ID extracted from the job parameters is used
            jobID = submissionParams["jobID"]
            jobType = submissionParams["jobType"]

            self.log.verbose("Job request successful: \n", jobRequest["Value"])
            self.log.info(
                "Received", "JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s" % (jobID, jobType, ownerDN, jobGroup))
            try:
                # sendFlag=False: both updates are only sent with the commit below
                jobReport.setJobParameter(par_name="MatcherServiceTime", par_value=str(matchTime), sendFlag=False)
                jobReport.setJobStatus(status=JobStatus.MATCHED, minorStatus="Job Received by Agent", sendFlag=False)

                # Setup proxy
                result_setupProxy = self._setupProxy(ownerDN, jobGroup)
                if not result_setupProxy["OK"]:
                    result = self._rescheduleFailedJob(
                        jobID, result_setupProxy["Message"])
                    self.failedQueues[queueName] += 1
                    break
                proxyChain = result_setupProxy.get("Value")

                # Check software and install them if required
                software = self._checkInstallSoftware(
                    jobID, params, ceDict, jobReport)
                if not software["OK"]:
                    self.log.error("Failed to install software for job", "%s" % (jobID))
                    errorMsg = software["Message"]
                    if not errorMsg:
                        errorMsg = "Failed software installation"
                    result = self._rescheduleFailedJob(jobID, errorMsg)
                    self.failedQueues[queueName] += 1
                    break

                # Submit the job to the CE
                self.log.debug("Before self._submitJob() (%sCE)" % (self.ceName))
                result_submitJob = self._submitJob(
                    jobID=jobID,
                    jobParams=params,
                    resourceParams=ceDict,
                    optimizerParams=optimizerParams,
                    proxyChain=proxyChain,
                    jobReport=jobReport,
                    processors=submissionParams["processors"],
                    wholeNode=submissionParams["wholeNode"],
                    maxNumberOfProcessors=submissionParams[
                        "maxNumberOfProcessors"],
                    mpTag=submissionParams["mpTag"],
                )

                # Committing the JobReport before evaluating the result of job submission
                res = jobReport.commit()
                if not res["OK"]:
                    # The commit failed: wrap the pending report in a failover request
                    resFD = jobReport.generateForwardDISET()
                    if not resFD["OK"]:
                        self.log.error(
                            "Error generating ForwardDISET operation", resFD["Message"])
                    elif resFD["Value"]:
                        # Here we create the Request.
                        op = resFD["Value"]
                        request = Request()
                        requestName = "jobAgent_%s" % jobID
                        request.RequestName = requestName.replace('"', "")
                        request.JobID = jobID
                        request.SourceComponent = "JobAgent_%s" % jobID
                        request.addOperation(op)
                        # This might fail, but only a message would be printed.
                        self._sendFailoverRequest(request)

                if not result_submitJob["OK"]:
                    self.log.error("Error during submission", result_submitJob["Message"])
                    self.failedQueues[queueName] += 1
                    break
                elif "PayloadFailed" in result_submitJob:
                    # Do not keep running and do not overwrite the Payload error
                    message = "Payload execution failed with error code %s" % result_submitJob[
                        "PayloadFailed"]
                    self.log.info(message)

                self.log.debug("After %sCE submitJob()" % (self.ceName))

                # Check that there is enough slots locally
                result = self._checkCEAvailability(self.computingElement)
                if not result["OK"] or result["Value"]:
                    return result

                # Check that there is enough slots in the remote CE to match a new job
                result = self._checkCEAvailability(ce)
                if not result["OK"] or result["Value"]:
                    self.failedQueues[queueName] += 1
                    break

                # Try to match a new job
                jobRequest = self._matchAJob(ceDictList)
            except Exception as subExcept:  # pylint: disable=broad-except
                # Any unexpected error reschedules the job and stops work on this queue
                self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True)
                result = self._rescheduleFailedJob(
                    jobID, "Job processing failed with exception")
                self.failedQueues[queueName] += 1
                break

        # Reached both when the first match failed and when a later re-match failed
        if not jobRequest["OK"]:
            self._checkMatchingIssues(jobRequest)
            self.failedQueues[queueName] += 1
            continue

    return S_OK("Push Job Agent cycle complete")
def updatePilotStatus(self):
    """Update status of pilots in transient states.

    Two passes are made over the configured queues:

    1. Pilots owned by ``self.pilotDN``/``self.pilotGroup`` that are in one of
       the TRANSIENT_PILOT_STATUS states are checked on the CE; their status
       is updated in the PilotAgentsDB and, once they reach a final state,
       their output is retrieved (when ``self.getOutput`` is set).
    2. Output is retrieved for pilots already in a final state whose output is
       not yet flagged as ready, and accounting is sent for final pilots when
       ``self.sendAccounting`` is set.

    Database and CE errors are logged and the queue is skipped.

    :return: S_OK(), or the failed result of the proxy request when the
             pilot proxy cannot be renewed
    """
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']

        result = pilotAgentsDB.selectPilots({
            'DestinationSite': ceName,
            'Queue': queueName,
            'GridType': ceType,
            'GridSite': siteName,
            'Status': TRANSIENT_PILOT_STATUS,
            'OwnerDN': self.pilotDN,
            'OwnerGroup': self.pilotGroup
        })
        if not result['OK']:
            self.log.error('Failed to select pilots: %s' % result['Message'])
            continue
        pilotRefs = result['Value']
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']

        # Stamped references ("<ref>:::<stamp>") are only used when every pilot
        # has a stamp; otherwise fall back to the plain references for all.
        stampedPilotRefs = []
        for pRef in pilotDict:
            if pilotDict[pRef]['PilotStamp']:
                stampedPilotRefs.append(pRef + ":::" + pilotDict[pRef]['PilotStamp'])
            else:
                stampedPilotRefs = list(pilotRefs)
                break

        # Renew the CE proxy if the current one is not valid any more
        result = ce.isProxyValid()
        if not result['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.pilotDN, self.pilotGroup, 600)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 500)

        result = ce.getJobStatus(stampedPilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots status from CE',
                           '%s: %s' % (ceName, result['Message']))
            continue
        pilotCEDict = result['Value']

        for pRef in pilotRefs:
            newStatus = ''
            oldStatus = pilotDict[pRef]['Status']
            ceStatus = pilotCEDict[pRef]
            if oldStatus == ceStatus:
                # Status did not change, continue
                continue
            elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
                # Pilot finished without reporting, consider it Aborted
                newStatus = 'Aborted'
            elif ceStatus != 'Unknown':
                # Update the pilot status to the new value
                newStatus = ceStatus

            if newStatus:
                self.log.info('Updating status to %s for pilot %s' % (newStatus, pRef))
                result = pilotAgentsDB.setPilotStatus(
                    pRef, newStatus, '', 'Updated by SiteDirector')
            # Retrieve the pilot output now
            if newStatus in FINAL_PILOT_STATUS:
                if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
                    self.log.info('Retrieving output for pilot %s' % pRef)
                    pilotStamp = pilotDict[pRef]['PilotStamp']
                    pRefStamp = pRef
                    if pilotStamp:
                        pRefStamp = pRef + ':::' + pilotStamp
                    result = ce.getJobOutput(pRefStamp)
                    if not result['OK']:
                        self.log.error('Failed to get pilot output',
                                       '%s: %s' % (ceName, result['Message']))
                    else:
                        output, error = result['Value']
                        if output:
                            result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                            if not result['OK']:
                                self.log.error('Failed to store pilot output',
                                               result['Message'])
                        else:
                            self.log.warn('Empty pilot output not stored to PilotDB')

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']

        # isProxyValid() returns a result structure (see its use above), which is
        # always truthy: the 'OK' flag has to be inspected. The renewed proxy
        # must also be stored before it is handed to the CE.
        result = ce.isProxyValid(120)
        if not result['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.pilotDN, self.pilotGroup, 1000)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 940)

        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']
        result = pilotAgentsDB.selectPilots({
            'DestinationSite': ceName,
            'Queue': queueName,
            'GridType': ceType,
            'GridSite': siteName,
            'OutputReady': 'False',
            'Status': FINAL_PILOT_STATUS
        })
        if not result['OK']:
            self.log.error('Failed to select pilots', result['Message'])
            continue
        pilotRefs = result['Value']
        # NOTE(review): this `continue` also skips the accounting step below for
        # this queue when no output is pending - confirm this is intended.
        if not pilotRefs:
            continue
        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']
        if self.getOutput:
            for pRef in pilotRefs:
                self.log.info('Retrieving output for pilot %s' % pRef)
                pilotStamp = pilotDict[pRef]['PilotStamp']
                pRefStamp = pRef
                if pilotStamp:
                    pRefStamp = pRef + ':::' + pilotStamp
                result = ce.getJobOutput(pRefStamp)
                if not result['OK']:
                    self.log.error('Failed to get pilot output',
                                   '%s: %s' % (ceName, result['Message']))
                else:
                    output, error = result['Value']
                    result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                    if not result['OK']:
                        self.log.error('Failed to store pilot output', result['Message'])

        # Check if the accounting is to be sent
        if self.sendAccounting:
            result = pilotAgentsDB.selectPilots({
                'DestinationSite': ceName,
                'Queue': queueName,
                'GridType': ceType,
                'GridSite': siteName,
                'AccountingSent': 'False',
                'Status': FINAL_PILOT_STATUS
            })
            if not result['OK']:
                self.log.error('Failed to select pilots', result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue
            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB', result['Message'])
                continue
            pilotDict = result['Value']
            result = self.sendPilotAccounting(pilotDict)
            if not result['OK']:
                self.log.error('Failed to send pilot agent accounting')

    return S_OK()
def submitJobs(self):
    """Go through defined computing elements and submit jobs if necessary.

    The method first asks the Matcher whether any task queue matches the
    overall director requirements. If so, the queues are visited in random
    order; for each one the pilot proxy is installed on the CE, the number of
    pilots to submit is derived from the available slots, the matching task
    queue jobs and (optionally) the already waiting pilots, and the pilots are
    submitted in chunks. Submitted pilots are registered in the PilotAgentsDB,
    distributed over the matching task queues with a probability proportional
    to the task queue priorities.

    :return: S_OK() at the end of the cycle (also when there is nothing to do);
             the failed result or an S_ERROR when the platforms, the task
             queues, the site mask, the pilot proxy or the pilot executable
             cannot be obtained
    """
    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {
        'Setup': setup,
        'CPUTime': 9999999,
        'SubmitPool': self.defaultSubmitPools
    }
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
        return S_ERROR('Can not get the site mask')
    siteMaskList = result['Value']

    # random.shuffle() needs a real, mutable list (a dict view cannot be shuffled)
    queues = list(self.queueDict)
    random.shuffle(queues)
    for queue in queues:
        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        ceType = self.queueDict[queue]['CEType']
        queueName = self.queueDict[queue]['QueueName']
        siteName = self.queueDict[queue]['Site']
        siteMask = siteName in siteMaskList

        if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
            queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
        else:
            self.log.warn(
                'CPU time limit is not specified for queue %s, skipping...' % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Get the working proxy
        cpuTime = queueCPUTime + 86400
        self.log.verbose("Getting pilot proxy for %s/%s %d long" %
                         (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(
            self.pilotDN, self.pilotGroup, cpuTime)
        if not result['OK']:
            return result
        self.proxy = result['Value']
        ce.setProxy(self.proxy, cpuTime - 60)

        # Get the number of available slots on the target site/queue
        result = ce.available()
        if not result['OK']:
            self.log.warn(
                'Failed to check the availability of queue %s: \n%s' %
                (queue, result['Message']))
            continue
        ceInfoDict = result['CEInfoDict']
        self.log.info(
            "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" %
            (ceName, queueName, ceInfoDict['WaitingJobs'],
             ceInfoDict['RunningJobs'], ceInfoDict['SubmittedJobs'],
             ceInfoDict['MaxTotalJobs']))

        totalSlots = result['Value']

        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        if not siteMask and 'Site' in ceDict:
            self.log.info('Site not in the mask %s' % siteName)
            self.log.info('Removing "Site" from matching Dict')
            del ceDict['Site']
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error(
                'Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.info('No matching TQs found')
            continue

        totalTQJobs = 0
        # A plain list, so that it can be safely passed on and printed
        tqIDList = list(taskQueueDict)
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]['Jobs']

        pilotsToSubmit = min(totalSlots, totalTQJobs)

        # Get the number of already waiting pilots for this queue
        totalWaitingPilots = 0
        if self.pilotWaitingFlag:
            lastUpdateTime = dateTime() - self.pilotWaitingTime * second
            result = pilotAgentsDB.countPilots(
                {
                    'TaskQueueID': tqIDList,
                    'Status': WAITING_PILOT_STATUS
                }, None, lastUpdateTime)
            if not result['OK']:
                self.log.error('Failed to get Number of Waiting pilots',
                               result['Message'])
                totalWaitingPilots = 0
            else:
                totalWaitingPilots = result['Value']
                self.log.verbose(
                    'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots)

        pilotsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingPilots))
        self.log.info(
            'Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' %
            (totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit))

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

        while pilotsToSubmit > 0:
            self.log.info('Going to submit %d pilots to %s queue' %
                          (pilotsToSubmit, queue))

            bundleProxy = self.queueDict[queue].get('BundleProxy', False)
            jobExecDir = ''
            if ceType == 'CREAM':
                jobExecDir = '.'
            jobExecDir = self.queueDict[queue].get('JobExecDir', jobExecDir)
            httpProxy = self.queueDict[queue].get('HttpProxy', '')

            result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy,
                                          httpProxy, jobExecDir)
            if not result['OK']:
                return result

            executable, pilotSubmissionChunk = result['Value']
            result = ce.submitJob(executable, '', pilotSubmissionChunk)
            if not result['OK']:
                self.log.error('Failed submission to queue %s:\n' % queue,
                               result['Message'])
                # Give up on this queue for the current cycle
                break

            pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
            # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
            # task queue priorities
            pilotList = result['Value']
            self.log.info('Submitted %d pilots to %s@%s' %
                          (len(pilotList), queueName, ceName))
            stampDict = {}
            if 'PilotStampDict' in result:
                stampDict = result['PilotStampDict']

            # Cumulative priorities: a uniform draw in [0, sumPriority) falls in
            # the interval of a task queue with a probability proportional to
            # its priority.
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append((tq, sumPriority))

            pilotsPerTQ = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                # Fall back to the last task queue when no interval matches
                # (e.g. all priorities are 0), instead of using an unbound
                # or stale task queue ID.
                tqID = tqPriorityList[-1][0]
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                pilotsPerTQ.setdefault(tqID, []).append(pilotID)

            for tqID, tqPilots in pilotsPerTQ.items():
                result = pilotAgentsDB.addPilotTQReference(
                    tqPilots, tqID, self.pilotDN, self.pilotGroup,
                    self.localhost, ceType, '', stampDict)
                if not result['OK']:
                    self.log.error('Failed add pilots to the PilotAgentsDB: ',
                                   result['Message'])
                    continue
                for pilot in tqPilots:
                    result = pilotAgentsDB.setPilotStatus(
                        pilot, 'Submitted', ceName,
                        'Successfully submitted by the SiteDirector',
                        siteName, queueName)
                    if not result['OK']:
                        self.log.error('Failed to set pilot status: ',
                                       result['Message'])
                        continue

    return S_OK()
def updatePilotStatus(self):
    """Update status of pilots in transient states.

    Two passes are made over the configured queues:

    1. Pilots in one of the TRANSIENT_PILOT_STATUS states are checked on the
       CE; their status is updated in the PilotAgentsDB and, once they reach
       a final state, their output is retrieved (when ``self.getOutput`` is set).
    2. Output is retrieved for pilots already in a final state whose output is
       not yet flagged as ready, and accounting is sent for final pilots when
       ``self.sendAccounting`` is set.

    Database and CE errors are logged and the queue is skipped.

    :return: S_OK(), or the failed result of the proxy request when the
             pilot proxy cannot be renewed
    """
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']

        result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                             'Queue': queueName,
                                             'GridType': ceType,
                                             'GridSite': siteName,
                                             'Status': TRANSIENT_PILOT_STATUS})
        if not result['OK']:
            self.log.error('Failed to select pilots: %s' % result['Message'])
            continue
        pilotRefs = result['Value']
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info: %s' % result['Message'])
            continue
        pilotDict = result['Value']

        # Stamped references ("<ref>:::<stamp>") are only used when every pilot
        # has a stamp; otherwise fall back to the plain references for all.
        stampedPilotRefs = []
        for pRef in pilotDict:
            if pilotDict[pRef]['PilotStamp']:
                stampedPilotRefs.append(pRef + ":::" + pilotDict[pRef]['PilotStamp'])
            else:
                stampedPilotRefs = list(pilotRefs)
                break

        result = ce.getJobStatus(stampedPilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots status from CE: %s' % result['Message'])
            continue
        pilotCEDict = result['Value']

        for pRef in pilotRefs:
            newStatus = ''
            oldStatus = pilotDict[pRef]['Status']
            ceStatus = pilotCEDict[pRef]
            if oldStatus == ceStatus:
                # Status did not change, continue
                continue
            elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
                # Pilot finished without reporting, consider it Aborted
                newStatus = 'Aborted'
            elif ceStatus != 'Unknown':
                # Update the pilot status to the new value
                newStatus = ceStatus

            if newStatus:
                self.log.info('Updating status to %s for pilot %s' % (newStatus, pRef))
                result = pilotAgentsDB.setPilotStatus(pRef, newStatus, '',
                                                      'Updated by SiteDirector')
            # Retrieve the pilot output now
            if newStatus in FINAL_PILOT_STATUS:
                if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
                    self.log.info('Retrieving output for pilot %s' % pRef)
                    pilotStamp = pilotDict[pRef]['PilotStamp']
                    pRefStamp = pRef
                    if pilotStamp:
                        pRefStamp = pRef + ':::' + pilotStamp
                    result = ce.getJobOutput(pRefStamp)
                    if not result['OK']:
                        self.log.error('Failed to get pilot output: %s' % result['Message'])
                    else:
                        output, error = result['Value']
                        result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                        if not result['OK']:
                            self.log.error('Failed to store pilot output: %s' %
                                           result['Message'])

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']

        # NOTE(review): isProxyValid() is taken to return an S_OK/S_ERROR
        # structure like the other CE calls in this method; such a structure is
        # always truthy, so its 'OK' flag is inspected - confirm against the
        # ComputingElement in use.
        result = ce.isProxyValid(120)
        if not result['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.genericPilotDN,
                                                               self.genericPilotGroup,
                                                               1000)
            if not result['OK']:
                return result
            # Store the renewed proxy: handing over the previous self.proxy
            # would leave the CE with the proxy that was just found invalid.
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 940)

        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']
        result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                             'Queue': queueName,
                                             'GridType': ceType,
                                             'GridSite': siteName,
                                             'OutputReady': 'False',
                                             'Status': FINAL_PILOT_STATUS})
        if not result['OK']:
            self.log.error('Failed to select pilots: %s' % result['Message'])
            continue
        pilotRefs = result['Value']
        # NOTE(review): this `continue` also skips the accounting step below for
        # this queue when no output is pending - confirm this is intended.
        if not pilotRefs:
            continue
        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info: %s' % result['Message'])
            continue
        pilotDict = result['Value']
        if self.getOutput:
            for pRef in pilotRefs:
                self.log.info('Retrieving output for pilot %s' % pRef)
                pilotStamp = pilotDict[pRef]['PilotStamp']
                pRefStamp = pRef
                if pilotStamp:
                    pRefStamp = pRef + ':::' + pilotStamp
                result = ce.getJobOutput(pRefStamp)
                if not result['OK']:
                    self.log.error('Failed to get pilot output: %s' % result['Message'])
                else:
                    output, error = result['Value']
                    result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                    if not result['OK']:
                        self.log.error('Failed to store pilot output: %s' %
                                       result['Message'])

        # Check if the accounting is to be sent
        if self.sendAccounting:
            result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                                 'Queue': queueName,
                                                 'GridType': ceType,
                                                 'GridSite': siteName,
                                                 'AccountingSent': 'False',
                                                 'Status': FINAL_PILOT_STATUS})
            if not result['OK']:
                self.log.error('Failed to select pilots: %s' % result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue
            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info: %s' % result['Message'])
                continue
            pilotDict = result['Value']
            result = self.sendPilotAccounting(pilotDict)
            if not result['OK']:
                self.log.error('Failed to send pilot agent accounting')

    return S_OK()
def updatePilotStatus(self):
    """Update status of pilots in transient states.

    Two passes are made over the configured queues:

    1. Pilots owned by ``self.pilotDN``/``self.pilotGroup`` that are in one of
       the TRANSIENT_PILOT_STATUS states are checked on the CE; their status
       is updated in the PilotAgentsDB and, once they reach a final state,
       their output is retrieved (when ``self.getOutput`` is set).
    2. Output is retrieved for pilots already in a final state whose output is
       not yet flagged as ready, and accounting is sent for final pilots when
       ``self.sendAccounting`` is set.

    Database and CE errors are logged and the queue is skipped.

    :return: S_OK(), or the failed result of the proxy request when the
             pilot proxy cannot be renewed
    """
    for queue in self.queueDict:
        ce = self.queueDict[queue]["CE"]
        ceName = self.queueDict[queue]["CEName"]
        queueName = self.queueDict[queue]["QueueName"]
        ceType = self.queueDict[queue]["CEType"]
        siteName = self.queueDict[queue]["Site"]

        result = pilotAgentsDB.selectPilots(
            {
                "DestinationSite": ceName,
                "Queue": queueName,
                "GridType": ceType,
                "GridSite": siteName,
                "Status": TRANSIENT_PILOT_STATUS,
                "OwnerDN": self.pilotDN,
                "OwnerGroup": self.pilotGroup,
            }
        )
        if not result["OK"]:
            self.log.error("Failed to select pilots: %s" % result["Message"])
            continue
        pilotRefs = result["Value"]
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result["OK"]:
            self.log.error("Failed to get pilots info from DB", result["Message"])
            continue
        pilotDict = result["Value"]

        # Stamped references ("<ref>:::<stamp>") are only used when every pilot
        # has a stamp; otherwise fall back to the plain references for all.
        stampedPilotRefs = []
        for pRef in pilotDict:
            if pilotDict[pRef]["PilotStamp"]:
                stampedPilotRefs.append(pRef + ":::" + pilotDict[pRef]["PilotStamp"])
            else:
                stampedPilotRefs = list(pilotRefs)
                break

        # Renew the CE proxy if the current one is not valid any more
        result = ce.isProxyValid()
        if not result["OK"]:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 600)
            if not result["OK"]:
                return result
            self.proxy = result["Value"]
            ce.setProxy(self.proxy, 500)

        result = ce.getJobStatus(stampedPilotRefs)
        if not result["OK"]:
            self.log.error("Failed to get pilots status from CE", "%s: %s" % (ceName, result["Message"]))
            continue
        pilotCEDict = result["Value"]

        for pRef in pilotRefs:
            newStatus = ""
            oldStatus = pilotDict[pRef]["Status"]
            ceStatus = pilotCEDict[pRef]
            if oldStatus == ceStatus:
                # Status did not change, continue
                continue
            elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
                # Pilot finished without reporting, consider it Aborted
                newStatus = "Aborted"
            elif ceStatus != "Unknown":
                # Update the pilot status to the new value
                newStatus = ceStatus

            if newStatus:
                self.log.info("Updating status to %s for pilot %s" % (newStatus, pRef))
                result = pilotAgentsDB.setPilotStatus(pRef, newStatus, "", "Updated by SiteDirector")
            # Retrieve the pilot output now
            if newStatus in FINAL_PILOT_STATUS:
                if pilotDict[pRef]["OutputReady"].lower() == "false" and self.getOutput:
                    self.log.info("Retrieving output for pilot %s" % pRef)
                    pilotStamp = pilotDict[pRef]["PilotStamp"]
                    pRefStamp = pRef
                    if pilotStamp:
                        pRefStamp = pRef + ":::" + pilotStamp
                    result = ce.getJobOutput(pRefStamp)
                    if not result["OK"]:
                        self.log.error("Failed to get pilot output", "%s: %s" % (ceName, result["Message"]))
                    else:
                        output, error = result["Value"]
                        if output:
                            result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                            if not result["OK"]:
                                self.log.error("Failed to store pilot output", result["Message"])
                        else:
                            self.log.warn("Empty pilot output not stored to PilotDB")

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
        ce = self.queueDict[queue]["CE"]

        # isProxyValid() returns a result structure (see its use above), which is
        # always truthy: the "OK" flag has to be inspected. The renewed proxy
        # must also be stored before it is handed to the CE.
        result = ce.isProxyValid(120)
        if not result["OK"]:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 1000)
            if not result["OK"]:
                return result
            self.proxy = result["Value"]
            ce.setProxy(self.proxy, 940)

        ceName = self.queueDict[queue]["CEName"]
        queueName = self.queueDict[queue]["QueueName"]
        ceType = self.queueDict[queue]["CEType"]
        siteName = self.queueDict[queue]["Site"]
        result = pilotAgentsDB.selectPilots(
            {
                "DestinationSite": ceName,
                "Queue": queueName,
                "GridType": ceType,
                "GridSite": siteName,
                "OutputReady": "False",
                "Status": FINAL_PILOT_STATUS,
            }
        )
        if not result["OK"]:
            self.log.error("Failed to select pilots", result["Message"])
            continue
        pilotRefs = result["Value"]
        # NOTE(review): this `continue` also skips the accounting step below for
        # this queue when no output is pending - confirm this is intended.
        if not pilotRefs:
            continue
        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result["OK"]:
            self.log.error("Failed to get pilots info from DB", result["Message"])
            continue
        pilotDict = result["Value"]
        if self.getOutput:
            for pRef in pilotRefs:
                self.log.info("Retrieving output for pilot %s" % pRef)
                pilotStamp = pilotDict[pRef]["PilotStamp"]
                pRefStamp = pRef
                if pilotStamp:
                    pRefStamp = pRef + ":::" + pilotStamp
                result = ce.getJobOutput(pRefStamp)
                if not result["OK"]:
                    self.log.error("Failed to get pilot output", "%s: %s" % (ceName, result["Message"]))
                else:
                    output, error = result["Value"]
                    result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                    if not result["OK"]:
                        self.log.error("Failed to store pilot output", result["Message"])

        # Check if the accounting is to be sent
        if self.sendAccounting:
            result = pilotAgentsDB.selectPilots(
                {
                    "DestinationSite": ceName,
                    "Queue": queueName,
                    "GridType": ceType,
                    "GridSite": siteName,
                    "AccountingSent": "False",
                    "Status": FINAL_PILOT_STATUS,
                }
            )
            if not result["OK"]:
                self.log.error("Failed to select pilots", result["Message"])
                continue
            pilotRefs = result["Value"]
            if not pilotRefs:
                continue
            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result["OK"]:
                self.log.error("Failed to get pilots info from DB", result["Message"])
                continue
            pilotDict = result["Value"]
            result = self.sendPilotAccounting(pilotDict)
            if not result["OK"]:
                self.log.error("Failed to send pilot agent accounting")

    return S_OK()
def submitJobs(self):
    """Go through defined computing elements and submit jobs if necessary.

    For every queue in ``self.queueDict`` a pilot proxy is installed on the
    CE, the number of slots reported by the CE is compared with the number of
    jobs in the matching task queues, and the smaller of the two is submitted
    as pilots. Submitted pilots are registered in the PilotAgentsDB, each one
    attached to a task queue drawn at random with a probability proportional
    to the task queue priority.

    :return: S_OK() once all the queues were processed; an error result as
             soon as the site mask, a queue CPU time limit, the pilot proxy,
             the task queue matching or the pilot executable can not be
             obtained
    """
    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
        return S_ERROR('Can not get the site mask')
    siteMaskList = result['Value']

    for queue in self.queueDict:
        queueParams = self.queueDict[queue]
        ce = queueParams['CE']
        ceName = queueParams['CEName']
        ceType = queueParams['CEType']
        queueName = queueParams['QueueName']
        siteName = queueParams['Site']
        siteMask = siteName in siteMaskList

        if 'CPUTime' not in queueParams['ParametersDict']:
            return S_ERROR('CPU time limit is not specified for queue %s' % queue)
        queueCPUTime = min(int(queueParams['ParametersDict']['CPUTime']), self.maxQueueLength)

        # Get the working proxy: valid for the queue length plus one day
        cpuTime = queueCPUTime + 86400
        result = gProxyManager.getPilotProxyFromDIRACGroup(
            self.genericPilotDN, self.genericPilotGroup, cpuTime)
        if not result['OK']:
            return result
        self.proxy = result['Value']
        ce.setProxy(self.proxy, cpuTime - 60)

        result = ce.available()
        if not result['OK']:
            self.log.warn(
                'Failed to check the availability of queue %s: %s' % (queue, result['Message']))
            continue
        totalSlots = result['Value']
        self.log.verbose(result['Message'])

        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        if not siteMask and 'Site' in ceDict:
            self.log.info('Site not in the mask %s' % siteName)
            self.log.info('Removing "Site" from matching Dict')
            del ceDict['Site']

        result = taskQueueDB.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error(
                'Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found')
            continue

        totalTQJobs = sum(taskQueueDict[tq]['Jobs'] for tq in taskQueueDict)
        pilotsToSubmit = min(totalSlots, totalTQJobs)
        self.log.verbose(
            'Available slots=%d, TQ jobs=%d, Pilots to submit=%d'
            % (totalSlots, totalTQJobs, pilotsToSubmit))
        if pilotsToSubmit <= 0:
            continue

        self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))
        bundleProxy = queueParams.get('BundleProxy', False)
        result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy)
        if not result['OK']:
            return result
        executable = result['Value']
        result = ce.submitJob(executable, '', pilotsToSubmit)
        if not result['OK']:
            self.log.error('Failed submission to queue %s:' % queue, result['Message'])
            continue

        # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
        # task queue priorities
        pilotList = result['Value']
        stampDict = result['PilotStampDict'] if 'PilotStampDict' in result else {}

        # Cumulated priorities: a uniform draw in [0, sumPriority) selects a task queue
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
            sumPriority += taskQueueDict[tq]['Priority']
            tqPriorityList.append((tq, sumPriority))

        pilotsByTQ = {}
        for pilotID in pilotList:
            rndm = random.random() * sumPriority
            # Default to the last task queue so that tqID is always bound, even
            # when the draw is not below any cumulated priority (e.g. all
            # priorities are 0)
            tqID = tqPriorityList[-1][0]
            for tq, prio in tqPriorityList:
                if rndm < prio:
                    tqID = tq
                    break
            pilotsByTQ.setdefault(tqID, []).append(pilotID)

        for tqID, tqPilots in pilotsByTQ.items():
            result = pilotAgentsDB.addPilotTQReference(
                tqPilots, tqID, self.genericPilotDN, self.genericPilotGroup,
                self.localhost, ceType, '', stampDict)
            if not result['OK']:
                self.log.error(
                    'Failed add pilots to the PilotAgentsDB: %s' % result['Message'])
                continue
            for pilot in tqPilots:
                result = pilotAgentsDB.setPilotStatus(
                    pilot, 'Submitted', ceName,
                    'Successfuly submitted by the SiteDirector', siteName, queueName)
                if not result['OK']:
                    self.log.error('Failed to set pilot status: %s' % result['Message'])

    return S_OK()
def submitJobs( self ):
    """Go through defined computing elements and submit jobs if necessary.

    The Matcher is first asked whether any task queue matches the overall
    requirements of this director; if none does, nothing is submitted. Then
    the queues of ``self.queueDict`` are visited in random order: a pilot
    proxy is installed on the CE, and the number of pilots to submit is
    derived from the CE slots, the jobs in the matching task queues and the
    pilots already waiting, capped by ``self.maxPilotsToSubmit``. Pilots are
    submitted in chunks and registered in the PilotAgentsDB, each one attached
    to a task queue drawn at random with a probability proportional to the
    task queue priority.

    :return: S_OK() once all the queues were processed (or when there is no
             work at all); the failing result as soon as the platforms, the
             task queue matching, the pilot proxy or the pilot executable
             can not be obtained
    """
    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = { 'Setup': setup, 'CPUTime': 9999999, 'SubmitPool': self.defaultSubmitPools }
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms( self.platforms )
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    self.log.verbose( 'Checking overall TQ availability with requirements' )
    self.log.verbose( tqDict )

    rpcMatcher = RPCClient( "WorkloadManagement/Matcher" )
    result = rpcMatcher.getMatchingTaskQueues( tqDict )
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose( 'No Waiting jobs suitable for the director' )
        return S_OK()

    # A real list is needed: random.shuffle() can not work on a dict key view
    queues = list( self.queueDict )
    random.shuffle( queues )
    for queue in queues:
        queueParams = self.queueDict[queue]
        ce = queueParams['CE']
        ceName = queueParams['CEName']
        ceType = queueParams['CEType']
        queueName = queueParams['QueueName']
        siteName = queueParams['Site']
        siteMask = self.siteStatus.isUsableSite( siteName, 'ComputingAccess' )
        platform = queueParams['Platform']

        if 'CPUTime' not in queueParams['ParametersDict']:
            self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue )
            continue
        queueCPUTime = min( int( queueParams['ParametersDict']['CPUTime'] ), self.maxQueueLength )

        # Get the working proxy: valid for the queue length plus one day
        cpuTime = queueCPUTime + 86400
        self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) )
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime )
        if not result['OK']:
            return result
        self.proxy = result['Value']
        ce.setProxy( self.proxy, cpuTime - 60 )

        # Get the number of available slots on the target site/queue
        result = ce.available()
        if not result['OK']:
            self.log.warn( 'Failed to check the availability of queue %s: \n%s' % ( queue, result['Message'] ) )
            continue
        ceInfoDict = result['CEInfoDict']
        self.log.info( "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" %
                       ( ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                         ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs'] ) )
        totalSlots = result['Value']

        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        if not siteMask and 'Site' in ceDict:
            self.log.info( 'Site not in the mask %s' % siteName )
            self.log.info( 'Removing "Site" from matching Dict' )
            del ceDict['Site']
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools

        result = Resources.getCompatiblePlatforms( platform )
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues( ceDict )
        if not result['OK']:
            self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.info( 'No matching TQs found' )
            continue

        tqIDList = list( taskQueueDict )
        totalTQJobs = sum( taskQueueDict[tq]['Jobs'] for tq in taskQueueDict )

        # Get the number of already waiting pilots for this queue
        totalWaitingPilots = 0
        if self.pilotWaitingFlag:
            lastUpdateTime = dateTime() - self.pilotWaitingTime * second
            result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                                  'Status': WAITING_PILOT_STATUS },
                                                None, lastUpdateTime )
            if not result['OK']:
                self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] )
            else:
                totalWaitingPilots = result['Value']
                self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots )

        pilotsToSubmit = max( 0, min( totalSlots, totalTQJobs - totalWaitingPilots ) )
        self.log.info( 'Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' %
                       ( totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) )

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min( self.maxPilotsToSubmit, pilotsToSubmit )

        while pilotsToSubmit > 0:
            self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) )

            bundleProxy = queueParams.get( 'BundleProxy', False )
            jobExecDir = queueParams.get( 'JobExecDir', '.' if ceType == 'CREAM' else '' )
            httpProxy = queueParams.get( 'HttpProxy', '' )

            result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir )
            if not result['OK']:
                return result

            # The executable may cover only a part of the requested pilots
            executable, pilotSubmissionChunk = result['Value']
            result = ce.submitJob( executable, '', pilotSubmissionChunk )
            os.unlink( executable )
            if not result['OK']:
                self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] )
                # Give up on this queue for the current cycle
                pilotsToSubmit = 0
                continue

            pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
            # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
            # task queue priorities
            pilotList = result['Value']
            self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) )
            stampDict = result['PilotStampDict'] if 'PilotStampDict' in result else {}

            # Cumulated priorities: a uniform draw in [0, sumPriority) selects a task queue
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append( ( tq, sumPriority ) )

            pilotsByTQ = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                # Default to the last task queue so that tqID is always bound, even
                # when the draw is not below any cumulated priority (e.g. all
                # priorities are 0)
                tqID = tqPriorityList[-1][0]
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                pilotsByTQ.setdefault( tqID, [] ).append( pilotID )

            for tqID, tqPilots in pilotsByTQ.items():
                result = pilotAgentsDB.addPilotTQReference( tqPilots, tqID, self.pilotDN, self.pilotGroup,
                                                            self.localhost, ceType, '', stampDict )
                if not result['OK']:
                    self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message'] )
                    continue
                for pilot in tqPilots:
                    result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName,
                                                           'Successfully submitted by the SiteDirector',
                                                           siteName, queueName )
                    if not result['OK']:
                        self.log.error( 'Failed to set pilot status: ', result['Message'] )

    return S_OK()
def submitJobs(self):
    """Go through defined computing elements and submit jobs if necessary.

    The Matcher is first asked whether any task queue matches the overall
    requirements of this director; if none does, nothing is submitted. Then
    the site mask is read and the queues of ``self.queueDict`` are visited in
    random order: a pilot proxy is installed on the CE, and the number of
    pilots to submit is derived from the CE slots, the jobs in the matching
    task queues and the pilots already waiting, capped by
    ``self.maxPilotsToSubmit``. Pilots are submitted in chunks and registered
    in the PilotAgentsDB, each one attached to a task queue drawn at random
    with a probability proportional to the task queue priority.

    :return: S_OK() once all the queues were processed (or when there is no
             work at all); an error result as soon as the platforms, the
             site mask, the task queue matching, the pilot proxy or the pilot
             executable can not be obtained
    """
    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {"Setup": setup, "CPUTime": 9999999, "SubmitPool": self.defaultSubmitPools}
    if self.vo:
        tqDict["Community"] = self.vo
    if self.voGroups:
        tqDict["OwnerGroup"] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result["OK"]:
        return result
    tqDict["Platform"] = result["Value"]
    tqDict["Site"] = self.sites
    self.log.verbose("Checking overall TQ availability with requirements")
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result["OK"]:
        return result
    if not result["Value"]:
        self.log.verbose("No Waiting jobs suitable for the director")
        return S_OK()

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result["OK"]:
        return S_ERROR("Can not get the site mask")
    siteMaskList = result["Value"]

    # A real list is needed: random.shuffle() can not work on a dict key view
    queues = list(self.queueDict)
    random.shuffle(queues)
    for queue in queues:
        queueParams = self.queueDict[queue]
        ce = queueParams["CE"]
        ceName = queueParams["CEName"]
        ceType = queueParams["CEType"]
        queueName = queueParams["QueueName"]
        siteName = queueParams["Site"]
        siteMask = siteName in siteMaskList

        if "CPUTime" not in queueParams["ParametersDict"]:
            self.log.warn("CPU time limit is not specified for queue %s, skipping..." % queue)
            continue
        queueCPUTime = min(int(queueParams["ParametersDict"]["CPUTime"]), self.maxQueueLength)

        # Get the working proxy: valid for the queue length plus one day
        cpuTime = queueCPUTime + 86400
        self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
        if not result["OK"]:
            return result
        self.proxy = result["Value"]
        ce.setProxy(self.proxy, cpuTime - 60)

        # Get the number of available slots on the target site/queue
        result = ce.available()
        if not result["OK"]:
            self.log.warn("Failed to check the availability of queue %s: \n%s" % (queue, result["Message"]))
            continue
        ceInfoDict = result["CEInfoDict"]
        self.log.info(
            "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d"
            % (
                ceName,
                queueName,
                ceInfoDict["WaitingJobs"],
                ceInfoDict["RunningJobs"],
                ceInfoDict["SubmittedJobs"],
                ceInfoDict["MaxTotalJobs"],
            )
        )
        totalSlots = result["Value"]

        ceDict = ce.getParameterDict()
        ceDict["GridCE"] = ceName
        if not siteMask and "Site" in ceDict:
            self.log.info("Site not in the mask %s" % siteName)
            self.log.info('Removing "Site" from matching Dict')
            del ceDict["Site"]
        if self.vo:
            ceDict["Community"] = self.vo
        if self.voGroups:
            ceDict["OwnerGroup"] = self.voGroups

        # This is a hack to get rid of !
        ceDict["SubmitPool"] = self.defaultSubmitPools

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result["OK"]:
            continue
        ceDict["Platform"] = result["Value"]

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result["OK"]:
            self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"])
            return result
        taskQueueDict = result["Value"]
        if not taskQueueDict:
            self.log.info("No matching TQs found")
            continue

        tqIDList = list(taskQueueDict)
        totalTQJobs = sum(taskQueueDict[tq]["Jobs"] for tq in taskQueueDict)

        # Get the number of already waiting pilots for this queue
        totalWaitingPilots = 0
        if self.pilotWaitingFlag:
            lastUpdateTime = dateTime() - self.pilotWaitingTime * second
            result = pilotAgentsDB.countPilots(
                {"TaskQueueID": tqIDList, "Status": WAITING_PILOT_STATUS}, None, lastUpdateTime
            )
            if not result["OK"]:
                self.log.error("Failed to get Number of Waiting pilots", result["Message"])
            else:
                totalWaitingPilots = result["Value"]
                self.log.verbose("Waiting Pilots for TaskQueue %s:" % tqIDList, totalWaitingPilots)

        pilotsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingPilots))
        self.log.info(
            "Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d"
            % (totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit)
        )

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

        while pilotsToSubmit > 0:
            self.log.info("Going to submit %d pilots to %s queue" % (pilotsToSubmit, queue))

            bundleProxy = queueParams.get("BundleProxy", False)
            jobExecDir = queueParams.get("JobExecDir", "." if ceType == "CREAM" else "")
            httpProxy = queueParams.get("HttpProxy", "")

            result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir)
            if not result["OK"]:
                return result

            # The executable may cover only a part of the requested pilots
            executable, pilotSubmissionChunk = result["Value"]
            result = ce.submitJob(executable, "", pilotSubmissionChunk)
            os.unlink(executable)
            if not result["OK"]:
                self.log.error("Failed submission to queue %s:\n" % queue, result["Message"])
                # Give up on this queue for the current cycle
                pilotsToSubmit = 0
                continue

            pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
            # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
            # task queue priorities
            pilotList = result["Value"]
            self.log.info("Submitted %d pilots to %s@%s" % (len(pilotList), queueName, ceName))
            stampDict = result["PilotStampDict"] if "PilotStampDict" in result else {}

            # Cumulated priorities: a uniform draw in [0, sumPriority) selects a task queue
            tqPriorityList = []
            sumPriority = 0.0
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]["Priority"]
                tqPriorityList.append((tq, sumPriority))

            pilotsByTQ = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                # Default to the last task queue so that tqID is always bound, even
                # when the draw is not below any cumulated priority (e.g. all
                # priorities are 0)
                tqID = tqPriorityList[-1][0]
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                pilotsByTQ.setdefault(tqID, []).append(pilotID)

            for tqID, tqPilots in pilotsByTQ.items():
                result = pilotAgentsDB.addPilotTQReference(
                    tqPilots, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, "", stampDict
                )
                if not result["OK"]:
                    self.log.error("Failed add pilots to the PilotAgentsDB: ", result["Message"])
                    continue
                for pilot in tqPilots:
                    result = pilotAgentsDB.setPilotStatus(
                        pilot,
                        "Submitted",
                        ceName,
                        "Successfully submitted by the SiteDirector",
                        siteName,
                        queueName,
                    )
                    if not result["OK"]:
                        self.log.error("Failed to set pilot status: ", result["Message"])

    return S_OK()
def submitJobs( self ):
    """Go through defined computing elements and submit jobs if necessary.

    The Matcher is first asked which task queues match the overall
    requirements of this director; if none does, nothing is submitted. The
    sites named by those task queues are collected so that queues at sites
    without expected workload, or at sites outside the mask with no test
    jobs, can be skipped. The queues of ``self.queueDict`` are then visited in
    random order; a queue whose submission failed is skipped until its
    counter in ``self.failedQueues`` wraps around
    ``self.failedQueueCycleFactor``. For each remaining queue the number of
    pilots to submit is derived from the queue slots, the jobs in the matching
    task queues and the pilots already waiting, capped by
    ``self.maxPilotsToSubmit``. Pilots are submitted in chunks and registered
    in the PilotAgentsDB, each one attached to a task queue drawn at random
    with a probability proportional to the task queue priority.

    :return: S_OK() once all the queues were processed (or when there is no
             work at all); an error result as soon as the platforms, the
             site mask, the task queue matching, the pilot proxy or the pilot
             executable can not be obtained
    """
    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = { 'Setup': setup, 'CPUTime': 9999999, 'SubmitPool': self.defaultSubmitPools }
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms( self.platforms )
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tqDict['Tag'] = []
    self.log.verbose( 'Checking overall TQ availability with requirements' )
    self.log.verbose( tqDict )

    rpcMatcher = RPCClient( "WorkloadManagement/Matcher" )
    result = rpcMatcher.getMatchingTaskQueues( tqDict )
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose( 'No Waiting jobs suitable for the director' )
        return S_OK()
    matchedTQs = result['Value']

    # jobSites: sites explicitly requested by some task queue
    # testSites: same, restricted to task queues carrying a "JobTypes" entry
    # anySite: at least one task queue can run anywhere
    jobSites = set()
    testSites = set()
    anySite = False
    totalWaitingJobs = 0
    for tqID in matchedTQs:
        tqInfo = matchedTQs[tqID]
        if "Sites" in tqInfo:
            tqSites = list( tqInfo['Sites'] )
            namedSites = [ site for site in tqSites if site.lower() != 'any' ]
            jobSites.update( namedSites )
            if len( namedSites ) != len( tqSites ):
                anySite = True
            if "JobTypes" in tqInfo:
                testSites.update( namedSites )
        else:
            anySite = True
        totalWaitingJobs += tqInfo['Jobs']

    tqIDList = list( matchedTQs )
    result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                          'Status': WAITING_PILOT_STATUS },
                                        None )
    totalWaitingPilots = 0
    if result['OK']:
        totalWaitingPilots = result['Value']
    self.log.info( 'Total %d jobs in %d task queues with %d waiting pilots' %
                   ( totalWaitingJobs, len( tqIDList ), totalWaitingPilots ) )
    # NOTE: the cycle goes on even when the waiting pilots already cover all
    # the waiting jobs; the decision is taken per queue below

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
        return S_ERROR( 'Can not get the site mask' )
    siteMaskList = result['Value']

    # A real list is needed: random.shuffle() can not work on a dict key view
    queues = list( self.queueDict )
    random.shuffle( queues )
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

        # Check if the queue failed previously
        failedCount = self.failedQueues.setdefault( queue, 0 ) % self.failedQueueCycleFactor
        if failedCount != 0:
            # The queue is tried again once the counter wraps around the cycle factor
            self.log.warn( "%s queue failed recently, skipping %d cycles" %
                           ( queue, self.failedQueueCycleFactor - failedCount ) )
            self.failedQueues[queue] += 1
            continue

        queueParams = self.queueDict[queue]
        ce = queueParams['CE']
        ceName = queueParams['CEName']
        ceType = queueParams['CEType']
        queueName = queueParams['QueueName']
        siteName = queueParams['Site']
        platform = queueParams['Platform']
        siteMask = siteName in siteMaskList

        if not anySite and siteName not in jobSites:
            self.log.verbose( "Skipping queue %s at %s: no workload expected" % ( queueName, siteName ) )
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose( "Skipping queue %s at site %s not in the mask" % ( queueName, siteName ) )
            continue

        if 'CPUTime' not in queueParams['ParametersDict']:
            self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue )
            continue
        queueCPUTime = min( int( queueParams['ParametersDict']['CPUTime'] ), self.maxQueueLength )

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        if not siteMask:
            # Sites outside the mask are only matched against test jobs
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools

        result = Resources.getCompatiblePlatforms( platform )
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues( ceDict )
        if not result['OK']:
            self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose( 'No matching TQs found for %s' % queue )
            continue

        matchedQueues += 1
        tqIDList = list( taskQueueDict )
        totalTQJobs = sum( taskQueueDict[tq]['Jobs'] for tq in taskQueueDict )

        self.log.verbose( '%d job(s) from %d task queue(s) are eligible for %s queue' %
                          ( totalTQJobs, len( tqIDList ), queue ) )

        # Get the number of already waiting pilots for these task queues
        totalWaitingPilots = 0
        if self.pilotWaitingFlag:
            lastUpdateTime = dateTime() - self.pilotWaitingTime * second
            result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                                  'Status': WAITING_PILOT_STATUS },
                                                None, lastUpdateTime )
            if not result['OK']:
                self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] )
            else:
                totalWaitingPilots = result['Value']
                self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots )
        if totalWaitingPilots >= totalTQJobs:
            self.log.verbose( "%d waiting pilots already for all the available jobs" % totalWaitingPilots )
            continue

        self.log.verbose( "%d waiting pilots for the total of %d eligible jobs for %s" %
                          ( totalWaitingPilots, totalTQJobs, queue ) )

        # Get the working proxy: valid for the queue length plus one day
        cpuTime = queueCPUTime + 86400
        self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) )
        result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime )
        if not result['OK']:
            return result
        self.proxy = result['Value']
        ce.setProxy( self.proxy, cpuTime - 60 )

        # Get the number of available slots on the target site/queue
        totalSlots = self.__getQueueSlots( queue )
        if totalSlots == 0:
            self.log.debug( '%s: No slots available' % queue )
            continue

        pilotsToSubmit = max( 0, min( totalSlots, totalTQJobs - totalWaitingPilots ) )
        self.log.info( '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' %
                       ( queue, totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) )

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min( self.maxPilotsToSubmit, pilotsToSubmit )

        while pilotsToSubmit > 0:
            self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) )

            bundleProxy = queueParams.get( 'BundleProxy', False )
            jobExecDir = queueParams.get( 'JobExecDir', '.' if ceType == 'CREAM' else '' )
            httpProxy = queueParams.get( 'HttpProxy', '' )

            result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir )
            if not result['OK']:
                return result

            # The executable may cover only a part of the requested pilots
            executable, pilotSubmissionChunk = result['Value']
            result = ce.submitJob( executable, '', pilotSubmissionChunk )
            os.unlink( executable )
            if not result['OK']:
                self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] )
                # Give up on this queue and start skipping it in the next cycles
                pilotsToSubmit = 0
                self.failedQueues[queue] += 1
                continue

            pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
            # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
            # task queue priorities
            pilotList = result['Value']
            self.queueSlots[queue]['AvailableSlots'] -= len( pilotList )
            totalSubmittedPilots += len( pilotList )
            self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) )
            stampDict = result['PilotStampDict'] if 'PilotStampDict' in result else {}

            # Cumulated priorities: a uniform draw in [0, sumPriority) selects a task queue
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append( ( tq, sumPriority ) )

            pilotsByTQ = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                # Default to the last task queue so that tqID is always bound, even
                # when the draw is not below any cumulated priority (e.g. all
                # priorities are 0)
                tqID = tqPriorityList[-1][0]
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                pilotsByTQ.setdefault( tqID, [] ).append( pilotID )

            for tqID, tqPilots in pilotsByTQ.items():
                result = pilotAgentsDB.addPilotTQReference( tqPilots, tqID, self.pilotDN, self.pilotGroup,
                                                            self.localhost, ceType, '', stampDict )
                if not result['OK']:
                    self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message'] )
                    continue
                for pilot in tqPilots:
                    result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName,
                                                           'Successfully submitted by the SiteDirector',
                                                           siteName, queueName )
                    if not result['OK']:
                        self.log.error( 'Failed to set pilot status: ', result['Message'] )

    self.log.info( "%d pilots submitted in total in this cycle, %d matched queues" %
                   ( totalSubmittedPilots, matchedQueues ) )
    return S_OK()
def _getPilotProxyFromDIRACGroup( self, ownerDN, ownerGroup, requiredTimeLeft ):
    """Ask the proxy manager for the pilot proxy of the given owner.

    Hook meant to be overridden when a given Pilot does not require a full
    proxy.

    :param ownerDN: DN handed over to the proxy manager
    :param ownerGroup: group handed over to the proxy manager
    :param requiredTimeLeft: requested remaining validity, passed through as is
    :return: whatever gProxyManager.getPilotProxyFromDIRACGroup returns
    """
    proxyResult = gProxyManager.getPilotProxyFromDIRACGroup( ownerDN,
                                                             ownerGroup,
                                                             requiredTimeLeft )
    return proxyResult
def _getPilotProxyFromDIRACGroup( self, ownerDN, ownerGroup, requiredTimeLeft ):
    """Log the download and ask the proxy manager for the owner's pilot proxy.

    Hook meant to be overridden when a given Pilot does not require a full
    proxy.

    :param ownerDN: DN handed over to the proxy manager
    :param ownerGroup: group handed over to the proxy manager
    :param requiredTimeLeft: requested remaining validity, passed through as is
    :return: whatever gProxyManager.getPilotProxyFromDIRACGroup returns
    """
    owner = ( ownerDN, ownerGroup )
    self.log.info( "Downloading %s@%s proxy" % owner )
    proxyResult = gProxyManager.getPilotProxyFromDIRACGroup( ownerDN,
                                                             ownerGroup,
                                                             requiredTimeLeft )
    return proxyResult
def execute( self ):
    """The PilotAgent execution method.

    Refreshes the ``PilotStalledDays`` and ``GridEnv`` options, then, for
    every (ownerDN, ownerGroup, gridType, broker) group of pilots in one of
    the states of ``self.queryStateList`` whose grid type is eligible:
    clears the waiting pilots, selects the pilots to check, queries their
    status in chunks of MAX_JOBS_QUERY with the owner's pilot proxy and
    either updates the status in the pilot DB or, for final states, queues
    the pilot for accounting. Finally old pilots are handled.

    The DB connection obtained at the start is closed on every exit path.

    :return: S_OK() on completion or when there is no pilot group to check;
             the failing result when the connection, the pilot groups or the
             pilot selection can not be obtained
    """
    self.pilotStalledDays = self.am_getOption( 'PilotStalledDays', 3 )
    self.gridEnv = self.am_getOption( 'GridEnv' )
    if not self.gridEnv:
        # No specific option found, try a general one
        setup = gConfig.getValue( '/DIRAC/Setup', '' )
        if setup:
            instance = gConfig.getValue( '/DIRAC/Setups/%s/WorkloadManagement' % setup, '' )
            if instance:
                self.gridEnv = gConfig.getValue( '/Systems/WorkloadManagement/%s/GridEnv' % instance, '' )

    result = self.pilotDB._getConnection()
    if not result['OK']:
        return result
    connection = result['Value']

    # try/finally: the connection must not leak on the early returns below
    try:
        result = self.pilotDB.getPilotGroups( self.identityFieldsList,
                                              {'Status': self.queryStateList } )
        if not result['OK']:
            self.log.error( 'Fail to get identities Groups', result['Message'] )
            return result
        if not result['Value']:
            return S_OK()
        pilotGroups = result['Value']

        pilotsToAccount = {}

        for ownerDN, ownerGroup, gridType, broker in pilotGroups:

            if gridType not in self.eligibleGridTypes:
                continue

            self.log.verbose( 'Getting pilots for %s:%s @ %s %s' % ( ownerDN, ownerGroup, gridType, broker ) )

            # Pilots reported as Done by the JobAgent
            condDict1 = {'Status':'Done',
                         'StatusReason':'Report from JobAgent',
                         'OwnerDN':ownerDN,
                         'OwnerGroup':ownerGroup,
                         'GridType':gridType,
                         'Broker':broker}
            # Pilots in one of the states to be queried
            condDict2 = {'Status':self.queryStateList,
                         'OwnerDN':ownerDN,
                         'OwnerGroup':ownerGroup,
                         'GridType':gridType,
                         'Broker':broker}

            for condDict in [ condDict1, condDict2 ]:
                result = self.clearWaitingPilots( condDict )
                if not result['OK']:
                    self.log.warn( 'Failed to clear Waiting Pilot Jobs' )

                result = self.pilotDB.selectPilots( condDict )
                if not result['OK']:
                    self.log.warn( 'Failed to get the Pilot Agents' )
                    return result
                if not result['Value']:
                    continue
                refList = result['Value']

                ret = gProxyManager.getPilotProxyFromDIRACGroup( ownerDN, ownerGroup )
                if not ret['OK']:
                    self.log.error( ret['Message'] )
                    self.log.error( 'Could not get proxy:', 'User "%s", Group "%s"' % ( ownerDN, ownerGroup ) )
                    continue
                proxy = ret['Value']

                self.log.verbose( "Getting status for %s pilots for owner %s and group %s" %
                                  ( len( refList ), ownerDN, ownerGroup ) )

                for start_index in range( 0, len( refList ), MAX_JOBS_QUERY ):
                    refsToQuery = refList[ start_index : start_index + MAX_JOBS_QUERY ]
                    self.log.verbose( 'Querying %d pilots of %s starting at %d' %
                                      ( len( refsToQuery ), len( refList ), start_index ) )
                    result = self.getPilotStatus( proxy, gridType, refsToQuery )
                    if not result['OK']:
                        if result['Message'] == 'Broker not Available':
                            # No point in querying the remaining chunks
                            self.log.error( 'Broker %s not Available' % broker )
                            break
                        self.log.warn( 'Failed to get pilot status:' )
                        self.log.warn( '%s:%s @ %s' % ( ownerDN, ownerGroup, gridType ) )
                        continue

                    statusDict = result['Value']
                    for pRef in statusDict:
                        pDict = statusDict[ pRef ]
                        if not pDict:
                            continue
                        if pDict['isParent']:
                            self.log.verbose( 'Clear parametric parent %s' % pRef )
                            result = self.clearParentJob( pRef, pDict, connection )
                            if not result['OK']:
                                self.log.warn( result['Message'] )
                            else:
                                self.log.info( 'Parametric parent removed: %s' % pRef )
                        if pDict['FinalStatus']:
                            # Final states are recorded through the accounting step
                            self.log.verbose( 'Marking Status for %s to %s' % ( pRef, pDict['Status'] ) )
                            pilotsToAccount[ pRef ] = pDict
                        else:
                            self.log.verbose( 'Setting Status for %s to %s' % ( pRef, pDict['Status'] ) )
                            result = self.pilotDB.setPilotStatus( pRef,
                                                                  pDict['Status'],
                                                                  pDict['DestinationSite'],
                                                                  updateTime = pDict['StatusDate'],
                                                                  conn = connection )

                    # Flush the accounting in batches to keep the pending set small
                    if len( pilotsToAccount ) > 100:
                        self.accountPilots( pilotsToAccount, connection )
                        pilotsToAccount = {}

        self.accountPilots( pilotsToAccount, connection )
        # Now handle pilots not updated in the last N days (most likely the Broker is no
        # longer available) and declare them Deleted.
        self.handleOldPilots( connection )

        return S_OK()
    finally:
        connection.close()
def updatePilotStatus( self ):
    """ Update status of pilots in transient states

    First pass, per queue: select the pilots of this director in a transient status,
    ask the CE for their current status, store status changes in the PilotAgentsDB
    and, for pilots that reached a final status, retrieve and store the pilot output
    (if ``self.getOutput`` is set). Queues with aborted pilots get their
    ``failedQueues`` counter incremented.

    Second pass, per queue: retrieve the output of pilots already in a final status
    whose output is not yet flagged as ready, and send the accounting for final
    pilots (if ``self.sendAccounting`` is set).

    :return: S_OK() or the failed result of the pilot proxy download
    """
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']
        abortedPilots = 0

        result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                              'Queue':queueName,
                                              'GridType':ceType,
                                              'GridSite':siteName,
                                              'Status':TRANSIENT_PILOT_STATUS,
                                              'OwnerDN': self.pilotDN,
                                              'OwnerGroup': self.pilotGroup } )
        if not result['OK']:
            self.log.error( 'Failed to select pilots: %s' % result['Message'] )
            continue
        pilotRefs = result['Value']
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo( pilotRefs )
        if not result['OK']:
            self.log.error( 'Failed to get pilots info from DB', result['Message'] )
            continue
        pilotDict = result['Value']

        # Use stamped references only if every pilot has a stamp,
        # otherwise fall back to the plain references
        stampedPilotRefs = []
        for pRef in pilotDict:
            if pilotDict[pRef]['PilotStamp']:
                stampedPilotRefs.append( pRef + ":::" + pilotDict[pRef]['PilotStamp'] )
            else:
                stampedPilotRefs = list( pilotRefs )
                break

        # This proxy is used for checking the pilot status and renewals
        # We really need at least a few hours otherwise the renewed
        # proxy may expire before we check again...
        result = ce.isProxyValid( 3*3600 )
        if not result['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 23400 )
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy( self.proxy, 23300 )

        result = ce.getJobStatus( stampedPilotRefs )
        if not result['OK']:
            self.log.error( 'Failed to get pilots status from CE', '%s: %s' % ( ceName, result['Message'] ) )
            continue
        pilotCEDict = result['Value']

        for pRef in pilotRefs:
            newStatus = ''
            oldStatus = pilotDict[pRef]['Status']
            ceStatus = pilotCEDict[pRef]
            lastUpdateTime = pilotDict[pRef]['LastUpdateTime']
            sinceLastUpdate = dateTime() - lastUpdateTime

            if oldStatus == ceStatus and ceStatus != "Unknown":
                # Normal status did not change, continue
                continue
            elif ceStatus == "Unknown" and oldStatus == "Unknown":
                if sinceLastUpdate < 3600*second:
                    # Allow 1 hour of Unknown status assuming temporary problems on the CE
                    continue
                else:
                    newStatus = 'Aborted'
            elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
                # Possible problems on the CE, let's keep the Unknown status for a while
                newStatus = 'Unknown'
            elif ceStatus != 'Unknown':
                # Update the pilot status to the new value
                newStatus = ceStatus

            if newStatus:
                self.log.info( 'Updating status to %s for pilot %s' % ( newStatus, pRef ) )
                result = pilotAgentsDB.setPilotStatus( pRef, newStatus, '', 'Updated by SiteDirector' )
                if newStatus == "Aborted":
                    abortedPilots += 1

            # Retrieve the pilot output now
            if newStatus in FINAL_PILOT_STATUS:
                if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
                    self.log.info( 'Retrieving output for pilot %s' % pRef )
                    pilotStamp = pilotDict[pRef]['PilotStamp']
                    pRefStamp = pRef
                    if pilotStamp:
                        pRefStamp = pRef + ':::' + pilotStamp
                    result = ce.getJobOutput( pRefStamp )
                    if not result['OK']:
                        self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
                    else:
                        output, error = result['Value']
                        if output:
                            result = pilotAgentsDB.storePilotOutput( pRef, output, error )
                            if not result['OK']:
                                self.log.error( 'Failed to store pilot output', result['Message'] )
                        else:
                            self.log.warn( 'Empty pilot output not stored to PilotDB' )

        # If something wrong in the queue, make a pause for the job submission
        if abortedPilots:
            self.failedQueues[queue] += 1

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']

        # isProxyValid returns a result structure: its 'OK' flag must be tested,
        # the structure itself is always true. The downloaded proxy must also be
        # stored before it is handed to the CE.
        result = ce.isProxyValid( 120 )
        if not result['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 1000 )
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy( self.proxy, 940 )

        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']
        result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                              'Queue':queueName,
                                              'GridType':ceType,
                                              'GridSite':siteName,
                                              'OutputReady':'False',
                                              'Status':FINAL_PILOT_STATUS} )
        if not result['OK']:
            self.log.error( 'Failed to select pilots', result['Message'] )
            continue
        pilotRefs = result['Value']
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo( pilotRefs )
        if not result['OK']:
            self.log.error( 'Failed to get pilots info from DB', result['Message'] )
            continue
        pilotDict = result['Value']

        if self.getOutput:
            for pRef in pilotRefs:
                self.log.info( 'Retrieving output for pilot %s' % pRef )
                pilotStamp = pilotDict[pRef]['PilotStamp']
                pRefStamp = pRef
                if pilotStamp:
                    pRefStamp = pRef + ':::' + pilotStamp
                result = ce.getJobOutput( pRefStamp )
                if not result['OK']:
                    self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
                else:
                    output, error = result['Value']
                    result = pilotAgentsDB.storePilotOutput( pRef, output, error )
                    if not result['OK']:
                        self.log.error( 'Failed to store pilot output', result['Message'] )

        # Check if the accounting is to be sent
        if self.sendAccounting:
            result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                                  'Queue':queueName,
                                                  'GridType':ceType,
                                                  'GridSite':siteName,
                                                  'AccountingSent':'False',
                                                  'Status':FINAL_PILOT_STATUS} )
            if not result['OK']:
                self.log.error( 'Failed to select pilots', result['Message'] )
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue

            result = pilotAgentsDB.getPilotInfo( pilotRefs )
            if not result['OK']:
                self.log.error( 'Failed to get pilots info from DB', result['Message'] )
                continue
            pilotDict = result['Value']

            result = self.sendPilotAccounting( pilotDict )
            if not result['OK']:
                self.log.error( 'Failed to send pilot agent accounting' )

    return S_OK()
def createVMs(self):
    """ Go through the defined images and create VM instances if necessary

    The Matcher is first asked whether there are any task queues compatible with the
    overall requirements of this director. Then, for every image (in random order),
    the number of eligible jobs is compared with the number of VMs already waiting
    on the endpoint and with the available slots, and the missing VMs are created.
    Created instances are registered in the VirtualMachineDB and one pilot reference
    per VM CPU is assigned to the matching task queues proportionally to their
    priorities.

    :return: S_OK() or a failed result structure
    """
    images = list(self.imageDict)

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup, 'CPUTime': 9999999}
    if self.vo:
        tqDict['VO'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tags = []
    for image in images:
        if 'Tags' in self.imageDict[image]['ParametersDict']:
            tags += self.imageDict[image]['ParametersDict']['Tags']
    tqDict['Tag'] = list(set(tags))
    tqDict['SubmitPool'] = "mpdPool"

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # Collect the sites requested by the waiting jobs
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
        if "Sites" in result['Value'][tqID]:
            for site in result['Value'][tqID]['Sites']:
                if site.lower() != 'any':
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in result['Value'][tqID]:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        testSites.add(site)
        totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = list(result['Value'])

    result = virtualMachineDB.getInstanceCounters('Status', {})
    totalVMs = 0
    if result['OK']:
        for status in result['Value']:
            if status in ['New', 'Submitted', 'Running']:
                totalVMs += result['Value'][status]
    self.log.info('Total %d jobs in %d task queues with %d VMs' %
                  (totalWaitingJobs, len(tqIDList), totalVMs))

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
        return S_ERROR('Can not get the site mask')
    siteMaskList = result['Value']

    # list() so that the shuffle also works where keys() is not a list
    images = list(self.imageDict)
    random.shuffle(images)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for image in images:

        ce = self.imageDict[image]['CE']
        ceName = self.imageDict[image]['CEName']
        imageName = self.imageDict[image]['ImageName']
        siteName = self.imageDict[image]['Site']
        platform = self.imageDict[image]['Platform']
        imageTags = self.imageDict[image]['ParametersDict'].get('Tags', [])
        siteMask = siteName in siteMaskList
        endpoint = "%s::%s" % (siteName, ceName)
        maxInstances = int(self.imageDict[image]['MaxInstances'])
        processorTags = []

        for tag in imageTags:
            if re.match(r'^[0-9]+Processors$', tag):
                processorTags.append(tag)
        # vms support WholeNode naturally
        processorTags.append('WholeNode')

        if not anySite and siteName not in jobSites:
            self.log.verbose(
                "Skipping queue %s at %s: no workload expected" %
                (imageName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s: site %s not in the mask" %
                             (imageName, siteName))
            continue

        if 'CPUTime' in self.imageDict[image]['ParametersDict']:
            imageCPUTime = int(
                self.imageDict[image]['ParametersDict']['CPUTime'])
        else:
            self.log.warn(
                'CPU time limit is not specified for queue %s, skipping...'
                % image)
            continue

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()

        if not siteMask:
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['VO'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(platform)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']
        ceDict['Tag'] = processorTags

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error(
                'Could not retrieve TaskQueues from TaskQueueDB',
                result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found for %s' % image)
            continue

        matchedQueues += 1
        totalTQJobs = 0
        tqIDList = list(taskQueueDict)
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]['Jobs']

        self.log.verbose(
            '%d job(s) from %d task queue(s) are eligible for %s queue' %
            (totalTQJobs, len(tqIDList), image))

        # Get the number of already instantiated VMs for these task queues
        totalWaitingVMs = 0
        result = virtualMachineDB.getInstanceCounters(
            'Status', {'Endpoint': endpoint})
        if result['OK']:
            for status in result['Value']:
                if status in ['New', 'Submitted']:
                    totalWaitingVMs += result['Value'][status]
        if totalWaitingVMs >= totalTQJobs:
            self.log.verbose("%d VMs already for all the available jobs" %
                             totalWaitingVMs)
            # Nothing to create for this image: do not fetch a proxy nor ask
            # the endpoint to create zero instances
            continue

        self.log.verbose(
            "%d VMs for the total of %d eligible jobs for %s" %
            (totalWaitingVMs, totalTQJobs, image))

        # Get the working proxy
        self.log.verbose("Getting cloud proxy for %s/%s" %
                         (self.cloudDN, self.cloudGroup))
        result = gProxyManager.getPilotProxyFromDIRACGroup(
            self.cloudDN, self.cloudGroup, 3600)
        if not result['OK']:
            return result
        self.proxy = result['Value']

        # Get the number of available slots on the target site/endpoint
        totalSlots = self.getVMInstances(endpoint, maxInstances)
        if totalSlots == 0:
            self.log.debug('%s: No slots available' % image)
            continue

        vmsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingVMs))
        self.log.info(
            '%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d' %
            (image, totalSlots, totalTQJobs, totalWaitingVMs, vmsToSubmit))

        # Limit the number of VM instances to create to vmsToSubmit
        vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)

        self.log.info('Going to submit %d VMs to %s queue' %
                      (vmsToSubmit, image))
        result = ce.createInstances(vmsToSubmit)

        if not result['OK']:
            self.log.error('Failed submission to queue %s:\n' % image,
                           result['Message'])
            self.failedImages.setdefault(image, 0)
            self.failedImages[image] += 1
            continue

        # Add VMs to the VirtualMachineDB
        vmDict = result['Value']
        totalSubmittedPilots += len(vmDict)
        self.log.info('Submitted %d VMs to %s@%s' %
                      (len(vmDict), imageName, ceName))

        pilotList = []
        for uuID in vmDict:
            diracUUID = vmDict[uuID]['InstanceID']
            endpoint = '%s::%s' % (self.imageDict[image]['Site'], ceName)
            result = virtualMachineDB.insertInstance(
                uuID, imageName, diracUUID, endpoint, self.vo)
            if not result['OK']:
                continue
            # One pilot reference per CPU of the VM
            for ncpu in range(vmDict[uuID]['NumberOfCPUs']):
                pRef = 'vm://' + ceName + '/' + diracUUID + ':' + str(
                    ncpu).zfill(2)
                pilotList.append(pRef)

        # Assign the pilot references to the task queues proportionally to the
        # task queue priorities (cumulative priority sums + random draw)
        stampDict = {}
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
            sumPriority += taskQueueDict[tq]['Priority']
            tqPriorityList.append((tq, sumPriority))
        tqDict = {}
        for pilotID in pilotList:
            rndm = random.random() * sumPriority
            # Default to the last task queue so that tqID is always bound,
            # e.g. when all the priorities are zero
            tqID = tqPriorityList[-1][0]
            for tq, prio in tqPriorityList:
                if rndm < prio:
                    tqID = tq
                    break
            tqDict.setdefault(tqID, []).append(pilotID)

        for tqID, pilotList in tqDict.items():
            result = pilotAgentsDB.addPilotTQReference(
                pilotList, tqID, '', '', self.localhost, 'Cloud', stampDict)
            if not result['OK']:
                self.log.error(
                    'Failed to insert pilots into the PilotAgentsDB')

    self.log.info(
        "%d VMs submitted in total in this cycle, %d matched queues" %
        (totalSubmittedPilots, matchedQueues))
    return S_OK()
def submitJobs(self):
    """ Go through defined computing elements and submit pilots if necessary

    The Matcher is first asked whether there are any task queues compatible with the
    overall requirements of this director. Then, for every queue (in random order)
    that did not fail recently, the eligible task queues are grouped by processor
    tag and, for every tag, the number of eligible jobs is compared with the number
    of waiting pilots and the available slots. The missing pilots are submitted in
    chunks, registered in the PilotAgentsDB and assigned to the task queues
    proportionally to the task queue priorities.

    :return: S_OK() or a failed result structure
    """
    queues = list(self.queueDict)

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {
        'Setup': setup,
        'CPUTime': 9999999,
        'SubmitPool': self.defaultSubmitPools
    }
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tags = []
    for queue in queues:
        tags += self.queueDict[queue]['ParametersDict']['Tags']
    tqDict['Tag'] = list(set(tags))

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # Collect the sites requested by the waiting jobs
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
        if "Sites" in result['Value'][tqID]:
            for site in result['Value'][tqID]['Sites']:
                if site.lower() != 'any':
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in result['Value'][tqID]:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        testSites.add(site)
        totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = list(result['Value'])
    self.log.info(tqIDList)
    result = pilotAgentsDB.countPilots(
        {
            'TaskQueueID': tqIDList,
            'Status': WAITING_PILOT_STATUS
        }, None)
    tagWaitingPilots = 0
    if result['OK']:
        tagWaitingPilots = result['Value']
    self.log.info(
        'Total %d jobs in %d task queues with %d waiting pilots' %
        (totalWaitingJobs, len(tqIDList), tagWaitingPilots))
    self.log.info('Queues: ', self.queueDict.keys())

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
        return S_ERROR('Can not get the site mask')
    siteMaskList = result['Value']

    random.shuffle(queues)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

        # Check if the queue failed previously
        failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
        if failedCount != 0:
            # The queue is skipped until the counter is again a multiple of
            # the cycle factor, so report the cycles left against that factor
            self.log.warn("%s queue failed recently, skipping %d cycles" %
                          (queue, self.failedQueueCycleFactor - failedCount))
            self.failedQueues[queue] += 1
            continue

        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        ceType = self.queueDict[queue]['CEType']
        queueName = self.queueDict[queue]['QueueName']
        siteName = self.queueDict[queue]['Site']
        platform = self.queueDict[queue]['Platform']
        queueTags = self.queueDict[queue]['ParametersDict']['Tags']
        siteMask = siteName in siteMaskList
        processorTags = []

        for tag in queueTags:
            if re.match(r'^[0-9]+Processors$', tag):
                processorTags.append(tag)
        if 'WholeNode' in queueTags:
            processorTags.append('WholeNode')

        if not anySite and siteName not in jobSites:
            self.log.verbose(
                "Skipping queue %s at %s: no workload expected" %
                (queueName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose(
                "Skipping queue %s at site %s not in the mask" %
                (queueName, siteName))
            continue

        if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
            queueCPUTime = int(
                self.queueDict[queue]['ParametersDict']['CPUTime'])
        else:
            self.log.warn(
                'CPU time limit is not specified for queue %s, skipping...'
                % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        if not siteMask:
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools

        result = Resources.getCompatiblePlatforms(platform)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']
        ceDict['Tag'] = processorTags

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error(
                'Could not retrieve TaskQueues from TaskQueueDB',
                result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found for %s' % queue)
            continue

        matchedQueues += 1
        totalTQJobs = 0
        totalTQJobsByProcessors = {}
        tqIDList = list(taskQueueDict)
        tqIDListByProcessors = {}
        for tq in taskQueueDict:
            if 'Tags' not in taskQueueDict[tq]:
                # skip non multiprocessor tqs
                continue
            for tag in taskQueueDict[tq]['Tags']:
                if tag in processorTags:
                    tqIDListByProcessors.setdefault(tag, [])
                    tqIDListByProcessors[tag].append(tq)

                    totalTQJobsByProcessors.setdefault(tag, 0)
                    totalTQJobsByProcessors[tag] += taskQueueDict[tq][
                        'Jobs']

            totalTQJobs += taskQueueDict[tq]['Jobs']

        self.log.verbose(
            '%d job(s) from %d task queue(s) are eligible for %s queue' %
            (totalTQJobs, len(tqIDList), queue))

        queueSubmittedPilots = 0
        for tag in list(tqIDListByProcessors):

            self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" %
                             (tag, tqIDListByProcessors[tag]))

            # Number of processors requested by the tag, -1 stands for WholeNode
            processors = 1

            m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
            if m:
                processors = int(m.group('processors'))
            if tag == 'WholeNode':
                processors = -1

            tagTQJobs = totalTQJobsByProcessors[tag]
            tagTqIDList = tqIDListByProcessors[tag]

            # Get the number of already waiting pilots for these task queues
            tagWaitingPilots = 0
            if self.pilotWaitingFlag:
                lastUpdateTime = dateTime(
                ) - self.pilotWaitingTime * second
                result = pilotAgentsDB.countPilots(
                    {
                        'TaskQueueID': tagTqIDList,
                        'Status': WAITING_PILOT_STATUS
                    }, None, lastUpdateTime)
                if not result['OK']:
                    self.log.error(
                        'Failed to get Number of Waiting pilots',
                        result['Message'])
                    tagWaitingPilots = 0
                else:
                    tagWaitingPilots = result['Value']
                    self.log.verbose(
                        'Waiting Pilots for TaskQueue %s:' % tagTqIDList,
                        tagWaitingPilots)
            if tagWaitingPilots >= tagTQJobs:
                self.log.verbose(
                    "%d waiting pilots already for all the available jobs"
                    % tagWaitingPilots)
                continue

            self.log.verbose(
                "%d waiting pilots for the total of %d eligible jobs for %s"
                % (tagWaitingPilots, tagTQJobs, queue))

            # Get the working proxy
            cpuTime = queueCPUTime + 86400
            self.log.verbose("Getting pilot proxy for %s/%s %d long" %
                             (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.pilotDN, self.pilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            totalSlots = self.getQueueSlots(queue, False)
            if totalSlots == 0:
                self.log.debug('%s: No slots available' % queue)
                continue

            # Note: comparing slots to job numbers is not accurate in multiprocessor case.
            # This could lead to over submission.
            pilotsToSubmit = max(
                0, min(totalSlots, tagTQJobs - tagWaitingPilots))
            self.log.info(
                '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' %
                (queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit))

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(
                self.maxPilotsToSubmit - queueSubmittedPilots,
                pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' %
                              (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get(
                    'BundleProxy', False)
                jobExecDir = self.queueDict[queue]['ParametersDict'].get(
                    'JobExecDir', '')
                httpProxy = self.queueDict[queue]['ParametersDict'].get(
                    'HttpProxy', '')

                result = self.getExecutable(queue, pilotsToSubmit,
                                            bundleProxy, httpProxy,
                                            jobExecDir, processors)
                if not result['OK']:
                    return result

                executable, pilotSubmissionChunk = result['Value']
                result = ce.submitJob(executable,
                                      '',
                                      pilotSubmissionChunk,
                                      processors=processors)
                # ## FIXME: The condor thing only transfers the file with some
                # ## delay, so when we unlink here the script is gone
                # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
                if ceType != 'HTCondorCE':
                    os.unlink(executable)
                if not result['OK']:
                    self.log.error(
                        'Failed submission to queue %s:\n' % queue,
                        result['Message'])
                    # Give up on this tag, the while condition ends the loop
                    pilotsToSubmit = 0
                    self.failedQueues[queue] += 1
                    continue

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                queueSubmittedPilots += pilotSubmissionChunk
                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result['Value']
                self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
                totalSubmittedPilots += len(pilotList)
                self.log.info('Submitted %d pilots to %s@%s' %
                              (len(pilotList), queueName, ceName))
                stampDict = result.get('PilotStampDict', {})

                # Cumulative priority sums used for the weighted random draw
                tqPriorityList = []
                sumPriority = 0.
                for tq in tagTqIDList:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))
                tqDict = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    # Default to the last task queue so that tqID is always
                    # bound, e.g. when all the priorities are zero
                    tqID = tqPriorityList[-1][0]
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    tqDict.setdefault(tqID, []).append(pilotID)

                for tqID, pilotList in tqDict.items():
                    result = pilotAgentsDB.addPilotTQReference(
                        pilotList, tqID, self.pilotDN, self.pilotGroup,
                        self.localhost, ceType, '', stampDict)
                    if not result['OK']:
                        self.log.error(
                            'Failed add pilots to the PilotAgentsDB: ',
                            result['Message'])
                        continue
                    for pilot in pilotList:
                        result = pilotAgentsDB.setPilotStatus(
                            pilot, 'Submitted', ceName,
                            'Successfully submitted by the SiteDirector',
                            siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: ',
                                           result['Message'])
                            continue

    self.log.info(
        "%d pilots submitted in total in this cycle, %d matched queues" %
        (totalSubmittedPilots, matchedQueues))
    return S_OK()
def execute(self):
    """The PilotAgent execution method.

    For every (ownerDN, ownerGroup, gridType, broker) combination returned by
    ``pilotDB.getPilotGroups`` the matching pilots are selected and their status is
    queried in chunks of MAX_JOBS_QUERY references. Non-final statuses are written
    to the pilot DB, pilots reporting a final status are collected and passed to
    ``accountPilots``. Finally ``handleOldPilots`` is called.

    The DB connection obtained at the start is closed on every exit path.

    :return: S_OK() on success or when there are no pilot groups; otherwise the
             failed result of ``_getConnection``, ``getPilotGroups`` or ``selectPilots``
    """
    self.pilotStalledDays = self.am_getOption('PilotStalledDays', 3)
    self.gridEnv = self.am_getOption('GridEnv')
    if not self.gridEnv:
        # No specific option found, try a general one
        setup = gConfig.getValue('/DIRAC/Setup', '')
        if setup:
            instance = gConfig.getValue(
                '/DIRAC/Setups/%s/WorkloadManagement' % setup, '')
            if instance:
                self.gridEnv = gConfig.getValue(
                    '/Systems/WorkloadManagement/%s/GridEnv' % instance, '')

    result = self.pilotDB._getConnection()
    if not result['OK']:
        return result
    connection = result['Value']

    # The early returns below must not leak the connection, hence try/finally
    try:
        result = self.pilotDB.getPilotGroups(self.identityFieldsList,
                                             {'Status': self.queryStateList})
        if not result['OK']:
            self.log.error('Fail to get identities Groups', result['Message'])
            return result
        if not result['Value']:
            return S_OK()

        pilotsToAccount = {}

        for ownerDN, ownerGroup, gridType, broker in result['Value']:

            if gridType not in self.eligibleGridTypes:
                continue

            self.log.verbose('Getting pilots for %s:%s @ %s %s' %
                             (ownerDN, ownerGroup, gridType, broker))

            condDict1 = {
                'Status': 'Done',
                'StatusReason': 'Report from JobAgent',
                'OwnerDN': ownerDN,
                'OwnerGroup': ownerGroup,
                'GridType': gridType,
                'Broker': broker
            }

            condDict2 = {
                'Status': self.queryStateList,
                'OwnerDN': ownerDN,
                'OwnerGroup': ownerGroup,
                'GridType': gridType,
                'Broker': broker
            }

            for condDict in [condDict1, condDict2]:
                result = self.clearWaitingPilots(condDict)
                if not result['OK']:
                    self.log.warn('Failed to clear Waiting Pilot Jobs')

                result = self.pilotDB.selectPilots(condDict)
                if not result['OK']:
                    self.log.warn('Failed to get the Pilot Agents')
                    return result
                if not result['Value']:
                    continue
                refList = result['Value']

                ret = gProxyManager.getPilotProxyFromDIRACGroup(
                    ownerDN, ownerGroup)
                if not ret['OK']:
                    self.log.error(ret['Message'])
                    self.log.error(
                        'Could not get proxy:',
                        'User "%s", Group "%s"' % (ownerDN, ownerGroup))
                    continue
                proxy = ret['Value']

                self.log.verbose(
                    "Getting status for %s pilots for owner %s and group %s" %
                    (len(refList), ownerDN, ownerGroup))

                for start_index in range(0, len(refList), MAX_JOBS_QUERY):
                    refsToQuery = refList[start_index:start_index +
                                          MAX_JOBS_QUERY]
                    self.log.verbose(
                        'Querying %d pilots of %s starting at %d' %
                        (len(refsToQuery), len(refList), start_index))
                    result = self.getPilotStatus(proxy, gridType, refsToQuery)
                    if not result['OK']:
                        if result['Message'] == 'Broker not Available':
                            # No point in querying the remaining chunks for this broker
                            self.log.error('Broker %s not Available' % broker)
                            break
                        self.log.warn('Failed to get pilot status:')
                        self.log.warn('%s:%s @ %s' %
                                      (ownerDN, ownerGroup, gridType))
                        continue

                    statusDict = result['Value']
                    for pRef in statusDict:
                        pDict = statusDict[pRef]
                        if not pDict:
                            continue
                        if pDict['isParent']:
                            self.log.verbose('Clear parametric parent %s' % pRef)
                            result = self.clearParentJob(
                                pRef, pDict, connection)
                            if not result['OK']:
                                self.log.warn(result['Message'])
                            else:
                                self.log.info(
                                    'Parametric parent removed: %s' % pRef)
                        if pDict['FinalStatus']:
                            # Final statuses are not stored here, they are handed to accountPilots
                            self.log.verbose(
                                'Marking Status for %s to %s' %
                                (pRef, pDict['Status']))
                            pilotsToAccount[pRef] = pDict
                        else:
                            self.log.verbose(
                                'Setting Status for %s to %s' %
                                (pRef, pDict['Status']))
                            result = self.pilotDB.setPilotStatus(
                                pRef,
                                pDict['Status'],
                                pDict['DestinationSite'],
                                updateTime=pDict['StatusDate'],
                                conn=connection)

                    # Flush the collected pilots in batches
                    if len(pilotsToAccount) > 100:
                        self.accountPilots(pilotsToAccount, connection)
                        pilotsToAccount = {}

        self.accountPilots(pilotsToAccount, connection)
        # Now handle pilots not updated in the last N days (most likely the Broker is no
        # longer available) and declare them Deleted.
        result = self.handleOldPilots(connection)

        return S_OK()
    finally:
        connection.close()
def execute(self):
    """The PilotAgent execution method.

    For every (ownerDN, ownerGroup, gridType, broker) combination returned by
    ``pilotDB.getPilotGroups`` the matching pilots are selected and their status is
    queried in chunks of MAX_JOBS_QUERY references. Non-final statuses are written
    to the pilot DB, pilots reporting a final status are collected and passed to
    ``accountPilots``. Finally ``handleOldPilots`` is called.

    The DB connection obtained at the start is closed on every exit path.

    :return: S_OK() on success or when there are no pilot groups; otherwise the
             failed result of ``_getConnection``, ``getPilotGroups`` or ``selectPilots``
    """
    self.pilotStalledDays = self.am_getOption("PilotStalledDays", 3)
    self.gridEnv = self.am_getOption("GridEnv")
    if not self.gridEnv:
        # No specific option found, try a general one
        setup = gConfig.getValue("/DIRAC/Setup", "")
        if setup:
            instance = gConfig.getValue("/DIRAC/Setups/%s/WorkloadManagement" % setup, "")
            if instance:
                self.gridEnv = gConfig.getValue("/Systems/WorkloadManagement/%s/GridEnv" % instance, "")

    result = self.pilotDB._getConnection()
    if not result["OK"]:
        return result
    connection = result["Value"]

    # The early returns below must not leak the connection, hence try/finally
    try:
        result = self.pilotDB.getPilotGroups(self.identityFieldsList, {"Status": self.queryStateList})
        if not result["OK"]:
            self.log.error("Fail to get identities Groups", result["Message"])
            return result
        if not result["Value"]:
            return S_OK()

        pilotsToAccount = {}

        for ownerDN, ownerGroup, gridType, broker in result["Value"]:

            if gridType not in self.eligibleGridTypes:
                continue

            self.log.verbose("Getting pilots for %s:%s @ %s %s" % (ownerDN, ownerGroup, gridType, broker))

            condDict1 = {
                "Status": "Done",
                "StatusReason": "Report from JobAgent",
                "OwnerDN": ownerDN,
                "OwnerGroup": ownerGroup,
                "GridType": gridType,
                "Broker": broker,
            }

            condDict2 = {
                "Status": self.queryStateList,
                "OwnerDN": ownerDN,
                "OwnerGroup": ownerGroup,
                "GridType": gridType,
                "Broker": broker,
            }

            for condDict in [condDict1, condDict2]:
                result = self.clearWaitingPilots(condDict)
                if not result["OK"]:
                    self.log.warn("Failed to clear Waiting Pilot Jobs")

                result = self.pilotDB.selectPilots(condDict)
                if not result["OK"]:
                    self.log.warn("Failed to get the Pilot Agents")
                    return result
                if not result["Value"]:
                    continue
                refList = result["Value"]

                ret = gProxyManager.getPilotProxyFromDIRACGroup(ownerDN, ownerGroup)
                if not ret["OK"]:
                    self.log.error(ret["Message"])
                    self.log.error("Could not get proxy:", 'User "%s", Group "%s"' % (ownerDN, ownerGroup))
                    continue
                proxy = ret["Value"]

                self.log.verbose(
                    "Getting status for %s pilots for owner %s and group %s" % (len(refList), ownerDN, ownerGroup)
                )

                for start_index in range(0, len(refList), MAX_JOBS_QUERY):
                    refsToQuery = refList[start_index : start_index + MAX_JOBS_QUERY]
                    self.log.verbose(
                        "Querying %d pilots of %s starting at %d" % (len(refsToQuery), len(refList), start_index)
                    )
                    result = self.getPilotStatus(proxy, gridType, refsToQuery)
                    if not result["OK"]:
                        if result["Message"] == "Broker not Available":
                            # No point in querying the remaining chunks for this broker
                            self.log.error("Broker %s not Available" % broker)
                            break
                        self.log.warn("Failed to get pilot status:")
                        self.log.warn("%s:%s @ %s" % (ownerDN, ownerGroup, gridType))
                        continue

                    statusDict = result["Value"]
                    for pRef in statusDict:
                        pDict = statusDict[pRef]
                        if not pDict:
                            continue
                        if pDict["isParent"]:
                            self.log.verbose("Clear parametric parent %s" % pRef)
                            result = self.clearParentJob(pRef, pDict, connection)
                            if not result["OK"]:
                                self.log.warn(result["Message"])
                            else:
                                self.log.info("Parametric parent removed: %s" % pRef)
                        if pDict["FinalStatus"]:
                            # Final statuses are not stored here, they are handed to accountPilots
                            self.log.verbose("Marking Status for %s to %s" % (pRef, pDict["Status"]))
                            pilotsToAccount[pRef] = pDict
                        else:
                            self.log.verbose("Setting Status for %s to %s" % (pRef, pDict["Status"]))
                            result = self.pilotDB.setPilotStatus(
                                pRef,
                                pDict["Status"],
                                pDict["DestinationSite"],
                                updateTime=pDict["StatusDate"],
                                conn=connection,
                            )

                    # Flush the collected pilots in batches
                    if len(pilotsToAccount) > 100:
                        self.accountPilots(pilotsToAccount, connection)
                        pilotsToAccount = {}

        self.accountPilots(pilotsToAccount, connection)
        # Now handle pilots not updated in the last N days (most likely the Broker is no
        # longer available) and declare them Deleted.
        result = self.handleOldPilots(connection)

        return S_OK()
    finally:
        connection.close()
def submitJobs(self):
    """Go through defined computing elements and submit jobs if necessary.

    For each queue of ``self.queueDict`` a generic pilot proxy is installed on
    the CE, the number of available slots is compared with the number of jobs
    in the matching task queues, and the resulting number of pilots is
    submitted. Submitted pilots are registered in the PilotAgentsDB, spread
    over the matching task queues proportionally to the task queue priorities.

    :return: S_OK() when all the queues were processed; S_ERROR / the failing
             result structure when the site mask, a queue CPU time limit, the
             pilot proxy, the matching task queues or the pilot executable
             cannot be obtained
    """
    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
        return S_ERROR('Can not get the site mask')
    siteMaskList = result['Value']

    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        ceType = self.queueDict[queue]['CEType']
        queueName = self.queueDict[queue]['QueueName']
        siteName = self.queueDict[queue]['Site']
        siteMask = siteName in siteMaskList

        if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
            queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
        else:
            return S_ERROR('CPU time limit is not specified for queue %s' % queue)

        # Get the working proxy
        cpuTime = queueCPUTime + 86400
        result = gProxyManager.getPilotProxyFromDIRACGroup(self.genericPilotDN,
                                                           self.genericPilotGroup,
                                                           cpuTime)
        if not result['OK']:
            return result
        self.proxy = result['Value']
        ce.setProxy(self.proxy, cpuTime - 60)

        result = ce.available()
        if not result['OK']:
            self.log.warn('Failed to check the availability of queue %s: %s' % (queue, result['Message']))
            continue

        totalSlots = result['Value']
        self.log.verbose(result['Message'])

        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        if not siteMask and 'Site' in ceDict:
            self.log.info('Site not in the mask %s' % siteName)
            self.log.info('Removing "Site" from matching Dict')
            del ceDict['Site']

        result = taskQueueDB.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found')
            continue

        totalTQJobs = 0
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]['Jobs']

        pilotsToSubmit = min(totalSlots, totalTQJobs)
        self.log.verbose('Available slots=%d, TQ jobs=%d, Pilots to submit=%d' % (totalSlots,
                                                                                  totalTQJobs,
                                                                                  pilotsToSubmit))

        if pilotsToSubmit > 0:
            self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))

            bundleProxy = self.queueDict[queue].get('BundleProxy', False)
            result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy)
            if not result['OK']:
                return result

            # If proxy is not bundled in, submit with the user proxy
            executable = result['Executable']
            proxy = result['Proxy']
            result = ce.submitJob(executable, proxy, pilotsToSubmit)
            if not result['OK']:
                self.log.error('Failed submission to queue %s:' % queue, result['Message'])
                # A failed submission carries no 'Value': nothing to register, go to the next queue
                continue

            # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
            # task queue priorities
            pilotList = result['Value']
            stampDict = result.get('PilotStampDict', {})

            # Cumulative priorities: a uniform draw in [0, sumPriority) selects a task queue
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append((tq, sumPriority))

            tqDict = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                # Fall back to the last task queue if no cumulative priority exceeds
                # the draw (e.g. all the priorities are 0)
                tqID = tqPriorityList[-1][0]
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                tqDict.setdefault(tqID, []).append(pilotID)

            for tqID, tqPilots in tqDict.items():
                result = pilotAgentsDB.addPilotTQReference(tqPilots,
                                                           tqID,
                                                           self.genericPilotDN,
                                                           self.genericPilotGroup,
                                                           self.localhost,
                                                           ceType,
                                                           '',
                                                           stampDict)
                if not result['OK']:
                    self.log.error('Failed add pilots to the PilotAgentsDB: %s' % result['Message'])
                    continue
                for pilot in tqPilots:
                    result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                          'Successfuly submitted by the SiteDirector',
                                                          siteName, queueName)
                    if not result['OK']:
                        self.log.error('Failed to set pilot status: %s' % result['Message'])
                        continue

    return S_OK()
def updatePilotStatus(self):
    """Update status of pilots in transient states.

    First pass: for each queue of ``self.queueDict`` the pilots found in the
    PilotAgentsDB in one of the ``TRANSIENT_PILOT_STATUS`` states are checked
    against the CE and their status is updated. Pilots staying "Unknown" for
    more than one hour are declared "Aborted"; a queue with aborted pilots
    gets its ``self.failedQueues`` counter incremented. The output of pilots
    reaching a final state is retrieved when ``self.getOutput`` is set.

    Second pass: for each queue, retrieve the output of pilots already in a
    final state with no output stored yet (when ``self.getOutput`` is set) and
    send the accounting of final pilots (when ``self.sendAccounting`` is set).

    :return: S_OK() when all the queues were processed; the failing result
             structure when a pilot proxy cannot be obtained
    """
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']
        abortedPilots = 0

        result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                             'Queue': queueName,
                                             'GridType': ceType,
                                             'GridSite': siteName,
                                             'Status': TRANSIENT_PILOT_STATUS,
                                             'OwnerDN': self.pilotDN,
                                             'OwnerGroup': self.pilotGroup})
        if not result['OK']:
            self.log.error('Failed to select pilots: %s' % result['Message'])
            continue
        pilotRefs = result['Value']
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']

        # Query with "<ref>:::<stamp>" references only if every pilot has a stamp,
        # otherwise fall back to the plain references for all of them
        stampedPilotRefs = []
        for pRef in pilotDict:
            if pilotDict[pRef]['PilotStamp']:
                stampedPilotRefs.append(pRef + ":::" + pilotDict[pRef]['PilotStamp'])
            else:
                stampedPilotRefs = list(pilotRefs)
                break

        result = ce.isProxyValid()
        if not result['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 23400)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 23300)

        result = ce.getJobStatus(stampedPilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots status from CE', '%s: %s' % (ceName, result['Message']))
            continue
        pilotCEDict = result['Value']

        for pRef in pilotRefs:
            newStatus = ''
            oldStatus = pilotDict[pRef]['Status']
            ceStatus = pilotCEDict[pRef]
            lastUpdateTime = pilotDict[pRef]['LastUpdateTime']
            sinceLastUpdate = dateTime() - lastUpdateTime

            if oldStatus == ceStatus and ceStatus != "Unknown":
                # Normal status did not change, continue
                continue
            elif ceStatus == "Unknown" and oldStatus == "Unknown":
                if sinceLastUpdate < 3600 * second:
                    # Allow 1 hour of Unknown status assuming temporary problems on the CE
                    continue
                else:
                    newStatus = 'Aborted'
            elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
                # Possible problems on the CE, let's keep the Unknown status for a while
                newStatus = 'Unknown'
            elif ceStatus != 'Unknown':
                # Update the pilot status to the new value
                newStatus = ceStatus

            if newStatus:
                self.log.info('Updating status to %s for pilot %s' % (newStatus, pRef))
                result = pilotAgentsDB.setPilotStatus(pRef, newStatus, '', 'Updated by SiteDirector')
                if newStatus == "Aborted":
                    abortedPilots += 1

            # Retrieve the pilot output now
            if newStatus in FINAL_PILOT_STATUS:
                if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
                    self.log.info('Retrieving output for pilot %s' % pRef)
                    pilotStamp = pilotDict[pRef]['PilotStamp']
                    pRefStamp = pRef
                    if pilotStamp:
                        pRefStamp = pRef + ':::' + pilotStamp
                    result = ce.getJobOutput(pRefStamp)
                    if not result['OK']:
                        self.log.error('Failed to get pilot output', '%s: %s' % (ceName, result['Message']))
                    else:
                        output, error = result['Value']
                        if output:
                            result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                            if not result['OK']:
                                self.log.error('Failed to store pilot output', result['Message'])
                        else:
                            self.log.warn('Empty pilot output not stored to PilotDB')

        # If something wrong in the queue, make a pause for the job submission
        if abortedPilots:
            self.failedQueues[queue] += 1

    # The pilot can be in Done state set by the job agent check if the output is retrieved
    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']

        # isProxyValid() returns a result structure (see the first pass), which is always
        # truthy as a whole: the 'OK' flag must be tested explicitly
        if not ce.isProxyValid(120)['OK']:
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 1000)
            if not result['OK']:
                return result
            # Install the proxy that was just downloaded, not the previously cached one
            self.proxy = result['Value']
            ce.setProxy(self.proxy, 940)

        ceName = self.queueDict[queue]['CEName']
        queueName = self.queueDict[queue]['QueueName']
        ceType = self.queueDict[queue]['CEType']
        siteName = self.queueDict[queue]['Site']
        result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                             'Queue': queueName,
                                             'GridType': ceType,
                                             'GridSite': siteName,
                                             'OutputReady': 'False',
                                             'Status': FINAL_PILOT_STATUS})
        if not result['OK']:
            self.log.error('Failed to select pilots', result['Message'])
            continue
        pilotRefs = result['Value']
        if not pilotRefs:
            continue

        result = pilotAgentsDB.getPilotInfo(pilotRefs)
        if not result['OK']:
            self.log.error('Failed to get pilots info from DB', result['Message'])
            continue
        pilotDict = result['Value']

        if self.getOutput:
            for pRef in pilotRefs:
                self.log.info('Retrieving output for pilot %s' % pRef)
                pilotStamp = pilotDict[pRef]['PilotStamp']
                pRefStamp = pRef
                if pilotStamp:
                    pRefStamp = pRef + ':::' + pilotStamp
                result = ce.getJobOutput(pRefStamp)
                if not result['OK']:
                    self.log.error('Failed to get pilot output', '%s: %s' % (ceName, result['Message']))
                else:
                    output, error = result['Value']
                    result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                    if not result['OK']:
                        self.log.error('Failed to store pilot output', result['Message'])

        # Check if the accounting is to be sent
        if self.sendAccounting:
            result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                                 'Queue': queueName,
                                                 'GridType': ceType,
                                                 'GridSite': siteName,
                                                 'AccountingSent': 'False',
                                                 'Status': FINAL_PILOT_STATUS})
            if not result['OK']:
                self.log.error('Failed to select pilots', result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue

            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB', result['Message'])
                continue
            pilotDict = result['Value']

            result = self.sendPilotAccounting(pilotDict)
            if not result['OK']:
                self.log.error('Failed to send pilot agent accounting')

    return S_OK()
def submitJobs(self):
    """Go through defined computing elements and submit jobs if necessary.

    The method first checks with the Matcher that there are waiting jobs
    matching the overall capabilities of the director. Then, for each queue of
    ``self.queueDict`` taken in random order, it skips the queues that failed
    recently or whose site / computing element is not usable, looks for the
    task queues matching the queue description and, per processor tag
    ("<N>Processors" or "WholeNode"), submits pilots for the eligible jobs not
    yet covered by waiting pilots. Submitted pilots are registered in the
    PilotAgentsDB, spread over the task queues of the tag proportionally to
    the task queue priorities.

    :return: S_OK() when the cycle completes (also when there are no suitable
             waiting jobs); the failing result structure when the compatible
             platforms, the matching task queues, the usable sites or the
             pilot proxy cannot be obtained
    """
    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    if self.checkPlatform:
        result = self.resourcesModule.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tags = []
    for queue in self.queueDict:
        tags += self.queueDict[queue]['ParametersDict']['Tag']
    tqDict['Tag'] = list(set(tags))

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    matcherClient = MatcherClient()
    result = matcherClient.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # jobSites: sites explicitly requested by the waiting task queues
    # testSites: sites requested by the task queues which define "JobTypes"
    # anySite: at least one task queue can run anywhere
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
        if "Sites" in result['Value'][tqID]:
            for site in result['Value'][tqID]['Sites']:
                if site.lower() != 'any':
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in result['Value'][tqID]:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        testSites.add(site)
        totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = result['Value'].keys()
    self.log.info(tqIDList)
    result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                        'Status': WAITING_PILOT_STATUS},
                                       None)
    tagWaitingPilots = 0
    if result['OK']:
        tagWaitingPilots = result['Value']
    self.log.info('Total %d jobs in %d task queues with %d waiting pilots' %
                  (totalWaitingJobs, len(tqIDList), tagWaitingPilots))
    self.log.info('Queues: ', self.queueDict.keys())
    # if tagWaitingPilots >= totalWaitingJobs:
    #   self.log.info( 'No more pilots to be submitted in this cycle' )
    #   return S_OK()

    result = self.siteClient.getUsableSites()
    if not result['OK']:
        return result
    siteMaskList = result['Value']

    # random.shuffle() works in place and needs a real list, not a keys view
    queues = list(self.queueDict)
    random.shuffle(queues)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

        # Check if the queue failed previously
        failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
        if failedCount != 0:
            # The number of cycles left follows the configured factor used just above
            self.log.warn("%s queue failed recently, skipping %d cycles" %
                          (queue, self.failedQueueCycleFactor - failedCount))
            self.failedQueues[queue] += 1
            continue

        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        ceType = self.queueDict[queue]['CEType']
        queueName = self.queueDict[queue]['QueueName']
        siteName = self.queueDict[queue]['Site']
        queueTags = self.queueDict[queue]['ParametersDict']['Tag']
        siteMask = siteName in siteMaskList
        processorTags = []

        # Check the status of the Site
        result = self.siteClient.getUsableSites(siteName)
        if not result['OK']:
            self.log.error("Can not get the status of site %s: %s" %
                           (siteName, result['Message']))
            continue
        if siteName not in result.get('Value', []):
            self.log.info("site %s is not active" % siteName)
            continue

        if self.rssFlag:
            # Check the status of the ComputingElement
            result = self.rssClient.getElementStatus(ceName, "ComputingElement")
            if not result['OK']:
                self.log.error("Can not get the status of computing element",
                               " %s: %s" % (siteName, result['Message']))
                continue
            # An empty status report leaves the status undefined, hence not usable
            ceStatus = None
            if result['Value']:
                # get the value of the status
                ceStatus = result['Value'][ceName]['all']
            if ceStatus not in ('Active', 'Degraded'):
                self.log.verbose(
                    "Skipping computing element %s at %s: resource not usable" % (ceName, siteName))
                continue

        for tag in queueTags:
            if re.match(r'^[0-9]+Processors$', tag):
                processorTags.append(tag)
        if 'WholeNode' in queueTags:
            processorTags.append('WholeNode')

        if not anySite and siteName not in jobSites:
            self.log.verbose("Skipping queue %s at %s: no workload expected" % (queueName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s at site %s not in the mask" % (queueName, siteName))
            continue

        if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
            queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
        else:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        # if not siteMask and 'Site' in ceDict:
        #   self.log.info( 'Site not in the mask %s' % siteName )
        #   self.log.info( 'Removing "Site" from matching Dict' )
        #   del ceDict[ 'Site' ]
        if not siteMask:
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools

        if self.checkPlatform:
            platform = self.queueDict[queue]['Platform']
            result = self.resourcesModule.getCompatiblePlatforms(platform)
            if not result['OK']:
                continue
            ceDict['Platform'] = result['Value']

        ceDict['Tag'] = queueTags
        # Get the number of eligible jobs for the target site/queue
        result = matcherClient.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found for %s' % queue)
            continue

        matchedQueues += 1
        totalTQJobs = 0
        totalTQJobsByProcessors = {}
        tqIDList = taskQueueDict.keys()
        tqIDListByProcessors = {}
        for tq in taskQueueDict:
            if 'Tags' not in taskQueueDict[tq]:
                # skip non multiprocessor tqs
                continue
            for tag in taskQueueDict[tq]['Tags']:
                if tag in processorTags:
                    tqIDListByProcessors.setdefault(tag, [])
                    tqIDListByProcessors[tag].append(tq)

                    totalTQJobsByProcessors.setdefault(tag, 0)
                    totalTQJobsByProcessors[tag] += taskQueueDict[tq]['Jobs']

            totalTQJobs += taskQueueDict[tq]['Jobs']

        self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' %
                         (totalTQJobs, len(tqIDList), queue))

        queueSubmittedPilots = 0
        for tag in tqIDListByProcessors:

            self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" % (tag, tqIDListByProcessors[tag]))

            # Number of processors requested by the tag, -1 standing for a whole node
            processors = 1
            m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
            if m:
                processors = int(m.group('processors'))
            if tag == 'WholeNode':
                processors = -1

            tagTQJobs = totalTQJobsByProcessors[tag]
            tagTqIDList = tqIDListByProcessors[tag]

            # Get the number of already waiting pilots for these task queues
            tagWaitingPilots = 0
            if self.pilotWaitingFlag:
                result = pilotAgentsDB.countPilots({'TaskQueueID': tagTqIDList,
                                                    'Status': WAITING_PILOT_STATUS},
                                                   None)
                if not result['OK']:
                    self.log.error('Failed to get Number of Waiting pilots', result['Message'])
                    tagWaitingPilots = 0
                else:
                    tagWaitingPilots = result['Value']
                    self.log.verbose('Waiting Pilots for TaskQueue %s:' % tagTqIDList, tagWaitingPilots)
            if tagWaitingPilots >= tagTQJobs:
                self.log.verbose("%d waiting pilots already for all the available jobs" % tagWaitingPilots)
                continue

            self.log.verbose("%d waiting pilots for the total of %d eligible jobs for %s" %
                             (tagWaitingPilots, tagTQJobs, queue))

            # Get the working proxy
            cpuTime = queueCPUTime + 86400
            self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            totalSlots = self.getQueueSlots(queue, False)
            if totalSlots == 0:
                self.log.debug('%s: No slots available' % queue)
                continue

            # Note: comparing slots to job numbers is not accurate in multiprocessor case.
            #       This could lead to over submission.
            pilotsToSubmit = max(0, min(totalSlots, tagTQJobs - tagWaitingPilots))
            self.log.info('%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' %
                          (queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit))

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit - queueSubmittedPilots, pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get('BundleProxy', False)
                jobExecDir = self.queueDict[queue]['ParametersDict'].get('JobExecDir', '')

                executable, pilotSubmissionChunk = self.getExecutable(queue, pilotsToSubmit,
                                                                      bundleProxy=bundleProxy,
                                                                      jobExecDir=jobExecDir,
                                                                      processors=processors)
                result = ce.submitJob(executable, '', pilotSubmissionChunk, processors=processors)
                # ## FIXME: The condor thing only transfers the file with some
                # ## delay, so when we unlink here the script is gone
                # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
                if ceType != 'HTCondorCE':
                    os.unlink(executable)
                if not result['OK']:
                    self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
                    # Resetting the counter ends the submission loop for this tag
                    pilotsToSubmit = 0
                    self.failedQueues[queue] += 1
                    continue

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                queueSubmittedPilots += pilotSubmissionChunk
                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result['Value']
                self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
                totalSubmittedPilots += len(pilotList)
                self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
                stampDict = {}
                if 'PilotStampDict' in result:
                    stampDict = result['PilotStampDict']

                # Cumulative priorities: a uniform draw in [0, sumPriority) selects a task queue
                tqPriorityList = []
                sumPriority = 0.
                for tq in tagTqIDList:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))

                tqDict = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    # Fall back to the last task queue if no cumulative priority exceeds
                    # the draw (e.g. all the priorities are 0)
                    tqID = tqPriorityList[-1][0]
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    if tqID not in tqDict:
                        tqDict[tqID] = []
                    tqDict[tqID].append(pilotID)

                for tqID, tqPilots in tqDict.items():
                    result = pilotAgentsDB.addPilotTQReference(tqPilots,
                                                               tqID,
                                                               self.pilotDN,
                                                               self.pilotGroup,
                                                               self.localhost,
                                                               ceType,
                                                               stampDict)
                    if not result['OK']:
                        self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
                        continue
                    for pilot in tqPilots:
                        result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                              'Successfully submitted by the SiteDirector',
                                                              siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: ', result['Message'])
                            continue

    self.log.info(
        "%d pilots submitted in total in this cycle, %d matched queues" %
        (totalSubmittedPilots, matchedQueues))
    return S_OK()