def __checkLoggingInfo( self, jobID, jobDict ):
  """ Determine a job's start and end times from its JobLogging records.

      :param jobID: job identifier
      :param dict jobDict: job attributes, must contain 'StartExecTime' and 'SubmissionTime'
      :return: tuple ( startTime, endTime ) of datetime objects
  """
  logList = []
  result = self.logDB.getJobLoggingInfo( jobID )
  if result['OK']:
    logList = result['Value']

  startTime = jobDict['StartExecTime']
  if not startTime or startTime == 'None':
    # logging record layout: status, minor, app, stime, source
    for items in logList:
      if items[0] == 'Running':
        startTime = items[3]
        break
    if not startTime or startTime == 'None':
      startTime = jobDict['SubmissionTime']

  if isinstance( startTime, types.StringTypes ):
    stringTime = startTime
    startTime = fromString( stringTime )
    if startTime is None:
      # Fix: log the string that actually failed to parse; the original logged
      # items[3], which can be unbound (empty logList) or a different record
      self.log.error( 'Wrong timestamp in DB', stringTime )
      startTime = dateTime()

  endTime = dateTime()
  # logging record layout: status, minor, app, stime, source
  for items in logList:
    if items[0] == 'Stalled':
      endTime = fromString( items[3] )
  if endTime is None:
    # here 'items' is necessarily the last 'Stalled' record whose stime failed to parse
    self.log.error( 'Wrong timestamp in DB', items[3] )
    endTime = dateTime()

  return startTime, endTime
def _checkLoggingInfo(self, jobID, jobDict):
    """Get the job start and end times from the JobLogging records.

    :param jobID: job identifier
    :param dict jobDict: job attributes with 'StartExecTime' and 'SubmissionTime'
    :return: tuple (startTime, endTime) of datetime objects
    """
    logList = []
    result = self.logDB.getJobLoggingInfo(jobID)
    if result["OK"]:
        logList = result["Value"]

    startTime = jobDict["StartExecTime"]
    if not startTime or startTime == "None":
        # logging record layout: status, minor, app, stime, source
        for items in logList:
            if items[0] == "Running":
                startTime = items[3]
                break
        if not startTime or startTime == "None":
            startTime = jobDict["SubmissionTime"]

    if isinstance(startTime, six.string_types):
        stringTime = startTime
        startTime = fromString(stringTime)
        if startTime is None:
            # Fix: log the string that failed to parse; 'items' may be unbound
            # here when logList is empty and SubmissionTime does not parse
            self.log.error("Wrong timestamp in DB", stringTime)
            startTime = dateTime()

    endTime = dateTime()
    # logging record layout: status, minor, app, stime, source
    for items in logList:
        if items[0] == "Stalled":
            endTime = fromString(items[3])
    if endTime is None:
        # 'items' is bound here: this branch only triggers after a Stalled record
        self.log.error("Wrong timestamp in DB", items[3])
        endTime = dateTime()

    return startTime, endTime
def __checkLoggingInfo(self, jobID, jobDict):
  """ Get the start and end times of a job from JobLogging.

      :param jobID: job identifier
      :param dict jobDict: job attributes ('StartExecTime', 'SubmissionTime')
      :return: tuple (startTime, endTime) of datetime objects
  """
  logList = []
  result = self.logDB.getJobLoggingInfo(jobID)
  if result['OK']:
    logList = result['Value']

  startTime = jobDict['StartExecTime']
  if not startTime or startTime == 'None':
    # status, minor, app, stime, source
    for items in logList:
      if items[0] == 'Running':
        startTime = items[3]
        break
    if not startTime or startTime == 'None':
      startTime = jobDict['SubmissionTime']

  if isinstance(startTime, types.StringTypes):
    rawStartTime = startTime
    startTime = fromString(rawStartTime)
    if startTime is None:
      # Fix: report the unparsable string itself instead of 'items[3]',
      # which may be unbound or point at an unrelated logging record
      self.log.error('Wrong timestamp in DB', rawStartTime)
      startTime = dateTime()

  endTime = dateTime()
  # status, minor, app, stime, source
  for items in logList:
    if items[0] == 'Stalled':
      endTime = fromString(items[3])
  if endTime is None:
    self.log.error('Wrong timestamp in DB', items[3])
    endTime = dateTime()

  return startTime, endTime
def execute(self):
  """ The main agent execution method: erase MessageRepository records older than self.period. """
  limitDate = toString(dateTime() - self.period)
  # Drop sub-second precision; split() is safe even if there is no '.'
  # (the original find()-based slice silently dropped the last character then)
  limitDate = limitDate.split('.')[0]
  commonString = 'FROM MessageRepository WHERE messageTime <'
  # Count first so we can skip the DELETE (and report) when nothing matches
  cmd = "SELECT count(*) %s '%s'" % (commonString, limitDate)
  result = self.SystemLoggingDB._query(cmd)
  if not result['OK']:
    return result
  recordsToErase = result['Value'][0][0]
  if recordsToErase == 0:
    self.log.info('No records to erase')
    return S_OK('No records to erase')
  # LOW_PRIORITY: let the DB defer the delete until no readers hold the table
  cmd = "DELETE LOW_PRIORITY %s '%s'" % (commonString, limitDate)
  result = self.SystemLoggingDB._update(cmd)
  if not result['OK']:
    self.log.error('Could not erase the requested records',
                   'those older than %s' % limitDate)
    return result
  self.log.info('%s records have been erased' % recordsToErase)
  return result
def execute(self):
  """ Main execution method: delete 'Done' requests older than the grace period. """
  toDate = dateTime() - day * self.graceRemovalPeriod
  result = self.requestClient.selectRequests({'Status': 'Done', 'ToDate': str(toDate)})
  if not result['OK']:
    return result
  for rID, rName in result['Value'].items():
    gLogger.verbose("Removing request %s" % rName)
    deleteResult = self.requestClient.deleteRequest(rName)
    if deleteResult['OK']:
      gLogger.info('Successfully removed request %d/%s' % (rID, rName))
    else:
      gLogger.error('Failed to delete request %s' % rName, deleteResult['Message'])
  # Placeholders for future functionality
  if self.checkAssigned:
    pass
  if self.ftsCleaning:
    pass
  return S_OK()
def __failCompletedJobs(self):
    """ Failed Jobs stuck in Completed Status for a long time.
        They are due to pilots being killed during the finalization of the job execution.
    """
    # Anything still Completed after completedTime seconds is assumed dead
    cutoff = str(dateTime() - self.completedTime * second)
    result = self.jobDB.selectJobs({'Status': 'Completed'}, older=cutoff)
    if not result['OK']:
        self.log.error(result['Message'])
        return result
    if not result['Value']:
        return S_OK()
    for jobID in result['Value']:
        attrResult = self.jobDB.getJobAttribute(jobID, 'MinorStatus')
        if not attrResult['OK']:
            self.log.error(attrResult['Message'])
            continue
        # Jobs waiting on pending requests are left untouched
        if attrResult['Value'] == "Pending Requests":
            continue
        self.__updateJobStatus(jobID, 'Failed', "Job died during finalization")
        accounting = self.__sendAccounting(jobID)
        if not accounting['OK']:
            self.log.error(accounting['Message'])
    return S_OK()
def __failCompletedJobs( self ):
  """ Failed Jobs stuck in Completed Status for a long time.
      They are due to pilots being killed during the finalization of the job execution.
  """
  checkTime = str( dateTime() - self.completedTime * second )
  selRes = self.jobDB.selectJobs( {'Status': 'Completed'}, older = checkTime )
  if not selRes['OK']:
    self.log.error( selRes['Message'] )
    return selRes
  oldJobs = selRes['Value']
  if not oldJobs:
    return S_OK()
  for jobID in oldJobs:
    minorRes = self.jobDB.getJobAttribute( jobID, 'MinorStatus' )
    if not minorRes['OK']:
      self.log.error( minorRes['Message'] )
      continue
    # Keep jobs that are still waiting on pending requests
    if minorRes['Value'] == "Pending Requests":
      continue
    self.__updateJobStatus( jobID, 'Failed', "Job died during finalization" )
    accRes = self.__sendAccounting( jobID )
    if not accRes['OK']:
      self.log.error( accRes['Message'] )
  return S_OK()
def _failSubmittingJobs(self):
    """ Failed Jobs stuck in Submitting Status for a long time.
        They are due to a failed bulk submission transaction.
    """
    thresholdTime = str(dateTime() - self.submittingTime * second)
    selRes = self.jobDB.selectJobs({'Status': 'Submitting'}, older=thresholdTime)
    if not selRes['OK']:
        self.log.error('Failed to select jobs', selRes['Message'])
        return selRes
    # Fail every job that has been Submitting for too long
    for jobID in selRes['Value']:
        updRes = self.__updateJobStatus(jobID, 'Failed')
        if not updRes['OK']:
            self.log.error('Failed to update job status', updRes['Message'])
    return S_OK()
def FillMessageRepository(self): """This function fills the MessageRepository with random values. It could be useful to test performance of the database. """ self.__CreateAuxiliaryLists() LogLevels = [ 'ALWAYS', 'INFO', 'VERB', 'DEBUG', 'WARN', 'ERROR', 'EXCEPT', 'FATAL' ] initialDate = dateTime() for i in range(1, 800): limitDate = toString(initialDate - randrange(0, 1680) * hour - randrange(0, 60) * minute - randrange(0, 60) * second) message = tupleToMessage([ self.systemNames[randrange(0, 5)], LogLevels[randrange(0, 8)], limitDate, self.fixedMessages[randrange(0, 6)], 'variable text %s' % randrange(0, 6), '', self.subSystemNames[randrange(0, 5)], self.sites[randrange(0, 5)] ]) userId = randrange(0, 12) result = self.insertMessageIntoDB(message, self.users[userId][0], self.users[userId][1], self.clientIPs[randrange(0, 20)]) if not result['OK']: print result['Value']
def FillMessageRepository(self): """This function fills the MessageRepository with random values. It could be useful to test performance of the database. """ self.__CreateAuxiliaryLists() LogLevels = [ 'ALWAYS' , 'INFO', 'VERB', 'DEBUG', 'WARN', 'ERROR', 'EXCEPT', 'FATAL' ] initialDate=dateTime() for _i in range( 1, 800 ): limitDate = toString( initialDate - randrange(0,1680) * hour - randrange( 0, 60) * minute - randrange( 0, 60) * second ) message = tupleToMessage ( [ self.systemNames[ randrange( 0, 5 ) ], LogLevels[ randrange( 0, 8 ) ], limitDate, self.fixedMessages[ randrange( 0, 6 ) ], 'variable text %s' % randrange( 0, 6 ), '', self.subSystemNames[ randrange( 0, 5 ) ], self.sites[ randrange( 0, 5 ) ] ] ) userId = randrange( 0, 12 ) result = self.insertMessageIntoDB( message, self.users[ userId ][ 0 ], self.users[ userId ][ 1 ], self.clientIPs[ randrange( 0, 20 ) ] ) if not result['OK']: print result['Value']
def __failCompletedJobs(self):
    """Failed Jobs stuck in Completed Status for a long time.
    They are due to pilots being killed during the finalization of the job execution.
    """
    # Jobs sitting in Completed longer than completedTime seconds are candidates
    checkTime = str(dateTime() - self.completedTime * second)
    selResult = self.jobDB.selectJobs({"Status": "Completed"}, older=checkTime)
    if not selResult["OK"]:
        self.log.error("Failed to select jobs", selResult["Message"])
        return selResult
    for jobID in selResult["Value"]:
        attrResult = self.jobDB.getJobAttributes(jobID, ["Status", "MinorStatus"])
        if not attrResult["OK"]:
            self.log.error("Failed to get job attributes", attrResult["Message"])
            continue
        attributes = attrResult["Value"]
        # Skip jobs that already moved on, or are waiting for pending requests
        if attributes["Status"] != "Completed" or attributes["MinorStatus"] == "Pending Requests":
            continue
        self.__updateJobStatus(jobID, "Failed", "Job died during finalization")
        accResult = self.__sendAccounting(jobID)
        if not accResult["OK"]:
            self.log.error("Failed to send accounting", accResult["Message"])
    return S_OK()
def export_checkComponentLog(self, component):
  """ Check component log for errors

      :param component: '*' for all set-up components, a 'System/Component'
                        string, or a list of such strings
      :return: S_OK(dict) mapping component name to
               {'ErrorsHour', 'ErrorsDay', 'LastError'}
  """
  componentList = []
  if '*' in component:
    if component == '*':
      result = InstallTools.getSetupComponents()
      if result['OK']:
        for ctype in ['Services', 'Agents']:
          if ctype in result['Value']:
            for sname in result['Value'][ctype]:
              for cname in result['Value'][ctype][sname]:
                componentList.append('/'.join([sname, cname]))
  elif isinstance(component, StringTypes):
    componentList = [component]
  else:
    componentList = component

  resultDict = {}
  for c in componentList:
    if '/' not in c:
      continue
    system, cname = c.split('/')
    startDir = InstallTools.startDir
    currentLog = startDir + '/' + system + '_' + cname + '/log/current'
    # Fix: context manager instead of the deprecated file() builtin so the
    # handle is closed even if reading fails
    with open(currentLog, 'r') as logFile:
      logLines = logFile.readlines()
    errors_1 = 0   # errors in the last hour
    errors_24 = 0  # errors in the last day
    now = dateTime()
    lastError = ''
    for line in logLines:
      if "ERROR:" in line:
        fields = line.split()
        recent = False
        # log lines start with '<date> <time>'
        timeStamp = fromString(fields[0] + ' ' + fields[1])
        if (now - timeStamp) < hour:
          errors_1 += 1
          recent = True
        if (now - timeStamp) < day:
          errors_24 += 1
          recent = True
        if recent:
          lastError = line.split('ERROR:')[-1].strip()
    resultDict[c] = {'ErrorsHour': errors_1,
                     'ErrorsDay': errors_24,
                     'LastError': lastError}
  return S_OK(resultDict)
def __processDir(self, dirPath, dirMetadata):
    ''' Calculate the number of files and size of :dirPath:, remove it if it's empty.

        :param dirPath: logical path of the directory being processed
        :param dirMetadata: dict with 'SubDirs', 'ClosedDirs', 'Files',
                            'TotalSize' and 'SiteUsage' entries
    '''
    subDirs = dirMetadata['SubDirs']
    closedDirs = dirMetadata['ClosedDirs']
    ##############################
    # FIXME: Until we understand why closed dirs are not working...
    ##############################
    closedDirs = []
    prStr = "%s: found %s sub-directories" % (dirPath, len(subDirs) if subDirs else 'no')
    if closedDirs:
      prStr += ", %s are closed (ignored)" % len(closedDirs)
    # Closed and explicitly ignored directories are not explored further
    for rmDir in closedDirs + self.__ignoreDirsList:
      subDirs.pop(rmDir, None)
    numberOfFiles = long(dirMetadata['Files'])
    totalSize = long(dirMetadata['TotalSize'])
    if numberOfFiles:
      prStr += " and %s files (%s bytes)" % (numberOfFiles, totalSize)
    else:
      prStr += " and no files"
    self.log.notice(prStr)
    if closedDirs:
      self.log.verbose("Closed dirs:\n %s" % '\n'.join(closedDirs))
    siteUsage = dirMetadata['SiteUsage']
    if numberOfFiles > 0:
      dirData = {
          'Files': numberOfFiles,
          'TotalSize': totalSize,
          'SEUsage': siteUsage
      }
      self.__addDirToPublishQueue(dirPath, dirData)
      # Print statistics
      self.log.verbose(
          "%-40s %20s %20s" % ('Storage Element', 'Number of files', 'Total size'))
      for storageElement in sorted(siteUsage):
        usageDict = siteUsage[storageElement]
        self.log.verbose(
            "%-40s %20s %20s" % (storageElement, str(
                usageDict['Files']), str(usageDict['Size'])))
    # If it's empty delete it
    elif len(subDirs) == 0 and len(closedDirs) == 0:
      # never delete the base directory itself
      if dirPath != self.__baseDir:
        self.removeEmptyDir(dirPath)
        return
    # We don't need the cached information about owner
    self.__directoryOwners.pop(dirPath, None)
    rightNow = dateTime()
    # Only explore sub-directories modified within the configured active period
    chosenDirs = [
        subDir for subDir in subDirs
        if not self.activePeriod or timeInterval(
            subDirs[subDir], self.activePeriod * week).includes(rightNow)
    ]
    self.__dirExplorer.addDirList(chosenDirs)
    self.__processedDirs += 1
def __getToken2(self):
    """Get the Keystone token for the version v2 of the keystone service

    :return: S_OK(token) or S_ERROR
    """
    user = self.parameters.get('User')
    password = self.parameters.get('Password')
    authArgs = {}
    # Username/password credentials take precedence over VOMS proxy auth
    if user and password:
      authDict = {'auth': {"passwordCredentials": {"username": user,
                                                   "password": password}
                           }
                  }
      if self.project:
        authDict['auth']['tenantName'] = self.project
    elif self.parameters.get('Auth') == "voms":
      authDict = {'auth': {'voms': True}}
      if self.project:
        authDict['auth']['tenantName'] = self.project
      if self.parameters.get('Proxy'):
        # the proxy file doubles as the TLS client certificate
        authArgs['cert'] = self.parameters.get('Proxy')
    try:
      result = requests.post("%s/tokens" % self.url,
                             headers={"Content-Type": "application/json"},
                             json=authDict,
                             verify=self.caPath,
                             **authArgs)
    except Exception as exc:
      return S_ERROR('Exception getting keystone token: %s' % str(exc))
    output = result.json()
    if result.status_code in [400, 401]:
      message = "None"
      if 'error' in output:
        message = output['error'].get('message')
      return S_ERROR('Authorization error: %s' % message)
    self.token = str(output['access']['token']['id'])
    # Keystone returns ISO timestamps; strip 'T'/'Z' so fromString can parse them
    expires = fromString(str(output['access']['token']['expires']).replace('T', ' ').replace('Z', ''))
    issued = fromString(str(output['access']['token']['issued_at']).replace('T', ' ').replace('Z', ''))
    # Store the expiry relative to the local clock, using the token's validity
    # duration (expires - issued) to avoid depending on remote clock settings
    self.expires = dateTime() + (expires - issued)
    self.projectID = output['access']['token']['tenant']['id']
    # Cache the service endpoints advertised in the catalog
    for endpoint in output['access']['serviceCatalog']:
      if endpoint['type'] == 'compute':
        self.computeURL = str(endpoint['endpoints'][0]['publicURL'])
      elif endpoint['type'] == 'image':
        self.imageURL = str(endpoint['endpoints'][0]['publicURL'])
      elif endpoint['type'] == 'network':
        self.networkURL = str(endpoint['endpoints'][0]['publicURL'])
    return S_OK(self.token)
def __kickStuckJobs(self):
    """ Reschedule jobs stuck in initialization status Rescheduled, Matched """
    errors = []

    # First pass: jobs stuck in Matched
    matchedCheckTime = str(dateTime() - self.matchedTime * second)
    selResult = self.jobDB.selectJobs({'Status': 'Matched'}, older=matchedCheckTime)
    if not selResult['OK']:
        self.log.error(selResult['Message'])
        return selResult
    stuckJobs = selResult['Value']
    if stuckJobs:
        self.log.info('Rescheduling %d jobs stuck in Matched status' % len(stuckJobs))
        reschedResult = self.jobDB.rescheduleJobs(stuckJobs)
        if 'FailedJobs' in reschedResult:
            errors.append('Failed to reschedule %d jobs stuck in Matched status' % len(
                reschedResult['FailedJobs']))

    # Second pass: jobs stuck in Rescheduled
    rescheduledCheckTime = str(dateTime() - self.rescheduledTime * second)
    selResult = self.jobDB.selectJobs({'Status': 'Rescheduled'}, older=rescheduledCheckTime)
    if not selResult['OK']:
        self.log.error(selResult['Message'])
        return selResult
    stuckJobs = selResult['Value']
    if stuckJobs:
        self.log.info('Rescheduling %d jobs stuck in Rescheduled status' % len(stuckJobs))
        reschedResult = self.jobDB.rescheduleJobs(stuckJobs)
        if 'FailedJobs' in reschedResult:
            errors.append('Failed to reschedule %d jobs stuck in Rescheduled status' % len(
                reschedResult['FailedJobs']))

    if errors:
        return S_ERROR('\n'.join(errors))
    return S_OK()
def _kickStuckJobs(self):
    """Reschedule jobs stuck in initialization status Rescheduled, Matched"""
    problems = []
    # Process both stuck statuses with their own timeout and messages
    for status, delay, infoFmt, failFmt in (
        (JobStatus.MATCHED, self.matchedTime,
         "Rescheduling %d jobs stuck in Matched status",
         "Failed to reschedule %d jobs stuck in Matched status"),
        (JobStatus.RESCHEDULED, self.rescheduledTime,
         "Rescheduling %d jobs stuck in Rescheduled status",
         "Failed to reschedule %d jobs stuck in Rescheduled status"),
    ):
        result = self.jobDB.selectJobs({"Status": status}, older=dateTime() - delay * second)
        if not result["OK"]:
            self.log.error("Failed to select jobs", result["Message"])
            return result
        jobIDs = result["Value"]
        if jobIDs:
            self.log.info(infoFmt % len(jobIDs))
            result = self.jobDB.rescheduleJobs(jobIDs)
            if "FailedJobs" in result:
                problems.append(failFmt % len(result["FailedJobs"]))
    if problems:
        return S_ERROR("\n".join(problems))
    return S_OK()
def finalize( self ):
  """ Register successful transfers and send accounting if SEs are known. """
  transEndTime = dateTime()
  registrationStart = time.time()
  regSuc, regTotal = self.__registerSuccessful()['Value']
  registrationTime = time.time() - registrationStart
  if self.sourceSE and self.targetSE:
    self.__sendAccounting( regSuc, regTotal, registrationTime, transEndTime )
  self.__removeFailedTargets()
  self.__determineMissingSource()
  return S_OK()
def finalize(self):
    """Finalize: register successful files, send accounting, handle failed/missing files."""
    endOfTransfer = dateTime()
    clockStart = time.time()
    res = self.__registerSuccessful()
    successCount, totalCount = res['Value']
    elapsed = time.time() - clockStart
    # Accounting only makes sense when both endpoints are known
    if self.sourceSE and self.targetSE:
        self.__sendAccounting(successCount, totalCount, elapsed, endOfTransfer)
    self.__removeFailedTargets()
    self.__determineMissingSource()
    return S_OK()
def __kickStuckJobs( self ):
  """ Reschedule jobs stuck in initialization status Rescheduled, Matched """
  message = ''
  # ( status, timeout, info format, failure format ) for each stuck state
  checks = ( ( 'Matched', self.matchedTime,
               'Rescheduling %d jobs stuck in Matched status',
               'Failed to reschedule %d jobs stuck in Matched status' ),
             ( 'Rescheduled', self.rescheduledTime,
               'Rescheduling %d jobs stuck in Rescheduled status',
               'Failed to reschedule %d jobs stuck in Rescheduled status' ) )
  for status, delay, infoFmt, failFmt in checks:
    checkTime = str( dateTime() - delay * second )
    result = self.jobDB.selectJobs( { 'Status' : status }, older = checkTime )
    if not result['OK']:
      self.log.error( result['Message'] )
      return result
    jobIDs = result['Value']
    if jobIDs:
      self.log.info( infoFmt % len( jobIDs ) )
      result = self.jobDB.rescheduleJobs( jobIDs )
      if 'FailedJobs' in result:
        if message:
          message += '\n'
        message += failFmt % len( result['FailedJobs'] )
  if message:
    return S_ERROR( message )
  return S_OK()
def export_checkComponentLog( self, component ):
  """ Check component log for errors

      :param component: '*' for all set-up components, a 'System/Component'
                        string, or a list of such strings
      :return: S_OK( dict ) mapping component -> {'ErrorsHour', 'ErrorsDay', 'LastError'}
  """
  componentList = []
  if '*' in component:
    if component == '*':
      result = InstallTools.getSetupComponents()
      if result['OK']:
        for ctype in ['Services', 'Agents']:
          if ctype in result['Value']:
            for sname in result['Value'][ctype]:
              for cname in result['Value'][ctype][sname]:
                componentList.append( '/'.join( [sname, cname] ) )
  elif isinstance( component, StringTypes ):
    componentList = [component]
  else:
    componentList = component

  resultDict = {}
  for c in componentList:
    if '/' not in c:
      continue
    system, cname = c.split( '/' )
    startDir = InstallTools.startDir
    currentLog = startDir + '/' + system + '_' + cname + '/log/current'
    # Fix: use a context manager rather than the deprecated file() builtin,
    # guaranteeing the handle is closed even on read errors
    with open( currentLog, 'r' ) as logFile:
      logLines = logFile.readlines()
    errors_1 = 0   # errors within the last hour
    errors_24 = 0  # errors within the last day
    now = dateTime()
    lastError = ''
    for line in logLines:
      if "ERROR:" in line:
        fields = line.split()
        recent = False
        # log lines begin with '<date> <time>'
        timeStamp = fromString( fields[0] + ' ' + fields[1] )
        if ( now - timeStamp ) < hour:
          errors_1 += 1
          recent = True
        if ( now - timeStamp ) < day:
          errors_24 += 1
          recent = True
        if recent:
          lastError = line.split( 'ERROR:' )[-1].strip()
    resultDict[c] = {'ErrorsHour':errors_1, 'ErrorsDay':errors_24, 'LastError':lastError}
  return S_OK( resultDict )
def export_checkComponentLog(self, component):
    """Check component log for errors

    :param component: '*' for all set-up components, a 'System/Component'
                      string, or a list of such strings
    :return: S_OK(dict) mapping component -> {'ErrorsHour', 'ErrorsDay', 'LastError'}
    """
    componentList = []
    if "*" in component:
        if component == "*":
            result = InstallTools.getSetupComponents()
            if result["OK"]:
                for ctype in ["Services", "Agents"]:
                    if ctype in result["Value"]:
                        for sname in result["Value"][ctype]:
                            for cname in result["Value"][ctype][sname]:
                                componentList.append("/".join([sname, cname]))
    elif isinstance(component, StringTypes):
        componentList = [component]
    else:
        componentList = component

    resultDict = {}
    for c in componentList:
        if "/" not in c:
            continue
        system, cname = c.split("/")
        startDir = InstallTools.startDir
        currentLog = startDir + "/" + system + "_" + cname + "/log/current"
        # Fix: context manager instead of the deprecated file() builtin so the
        # handle is always closed
        with open(currentLog, "r") as logFile:
            logLines = logFile.readlines()
        errors_1 = 0   # errors within the last hour
        errors_24 = 0  # errors within the last day
        now = dateTime()
        lastError = ""
        for line in logLines:
            if "ERROR:" in line:
                fields = line.split()
                recent = False
                # log lines begin with '<date> <time>'
                timeStamp = fromString(fields[0] + " " + fields[1])
                if (now - timeStamp) < hour:
                    errors_1 += 1
                    recent = True
                if (now - timeStamp) < day:
                    errors_24 += 1
                    recent = True
                if recent:
                    lastError = line.split("ERROR:")[-1].strip()
        resultDict[c] = {"ErrorsHour": errors_1, "ErrorsDay": errors_24, "LastError": lastError}
    return S_OK(resultDict)
def __kickStuckJobs(self):
    """ Reschedule jobs stuck in initialization status Rescheduled, Matched """
    failures = []

    # Jobs stuck in Matched longer than matchedTime
    cutoff = str(dateTime() - self.matchedTime * second)
    selection = self.jobDB.selectJobs({"Status": "Matched"}, older=cutoff)
    if not selection["OK"]:
        self.log.error("Failed to select jobs", selection["Message"])
        return selection
    matchedJobs = selection["Value"]
    if matchedJobs:
        self.log.info("Rescheduling %d jobs stuck in Matched status" % len(matchedJobs))
        rescheduled = self.jobDB.rescheduleJobs(matchedJobs)
        if "FailedJobs" in rescheduled:
            failures.append("Failed to reschedule %d jobs stuck in Matched status" % len(rescheduled["FailedJobs"]))

    # Jobs stuck in Rescheduled longer than rescheduledTime
    cutoff = str(dateTime() - self.rescheduledTime * second)
    selection = self.jobDB.selectJobs({"Status": "Rescheduled"}, older=cutoff)
    if not selection["OK"]:
        self.log.error("Failed to select jobs", selection["Message"])
        return selection
    rescheduledJobs = selection["Value"]
    if rescheduledJobs:
        self.log.info("Rescheduling %d jobs stuck in Rescheduled status" % len(rescheduledJobs))
        rescheduled = self.jobDB.rescheduleJobs(rescheduledJobs)
        if "FailedJobs" in rescheduled:
            failures.append("Failed to reschedule %d jobs stuck in Rescheduled status" % len(rescheduled["FailedJobs"]))

    if failures:
        return S_ERROR("\n".join(failures))
    return S_OK()
def isProxyValid(self, valid=1000):
    """Check if the stored proxy is valid"""
    # No expiry recorded means no usable proxy at all
    if not self.valid:
        result = S_ERROR("Proxy is not valid for the requested length")
        result["Value"] = 0
        return result
    remaining = self.valid - dateTime()
    remainingSeconds = remaining.days * 86400 + remaining.seconds
    margin = remainingSeconds - valid
    if margin > 0:
        return S_OK(margin)
    result = S_ERROR("Proxy is not valid for the requested length")
    result["Value"] = margin
    return result
def _renewCloudProxy(self):
    """Takes short lived proxy from the site director and promotes it to
    a long lived proxy keeping the DIRAC group.

    :returns: True on success, false otherwise.
    :rtype: bool
    """
    # Both DN and group are required to identify which proxy to download
    if not self._cloudDN or not self._cloudGroup:
        self.log.error(
            "Could not renew cloud proxy, DN and/or Group not set.")
        return False

    proxyLifetime = int(
        self.ceParameters.get("Context_ProxyLifetime", DEF_PROXYLIFETIME))
    # only renew proxy if lifetime is less than configured lifetime
    # self.valid is a datetime
    if self.valid - dateTime() > proxyLifetime * second:
        return True
    # Request extra validity so the fresh proxy stays above the threshold for a while
    proxyLifetime += DEF_PROXYGRACE
    proxyManager = ProxyManagerClient()
    self.log.info("Downloading proxy with cloudDN and cloudGroup: %s, %s" %
                  (self._cloudDN, self._cloudGroup))
    res = proxyManager.downloadProxy(self._cloudDN, self._cloudGroup,
                                     limited=True,
                                     requiredTimeLeft=proxyLifetime)
    if not res["OK"]:
        self.log.error("Could not download proxy", res["Message"])
        return False
    resdump = res["Value"].dumpAllToString()
    if not resdump["OK"]:
        self.log.error("Failed to dump proxy to string", resdump["Message"])
        return False
    # Cache the PEM string and the new expiry time
    self.proxy = resdump["Value"]
    self.valid = dateTime() + proxyLifetime * second
    return True
def isProxyValid(self, valid=1000):
    """ Check if the stored proxy is valid """
    errorMsg = "Proxy is not valid for the requested length"
    if not self.valid:
        # No recorded expiry: report zero remaining validity
        result = S_ERROR(errorMsg)
        result["Value"] = 0
        return result
    delta = self.valid - dateTime()
    secondsLeft = delta.days * 86400 + delta.seconds
    if secondsLeft > valid:
        return S_OK(secondsLeft - valid)
    result = S_ERROR(errorMsg)
    result["Value"] = secondsLeft - valid
    return result
def getToken(self, force=False):
    """Get the Keystone token

    :param force: flag to force getting the token if even there is one in the cache
    :return: S_OK(token) or S_ERROR
    """
    if self.token is not None and not force:
        # Fix: use total_seconds() — timedelta.seconds ignores the days
        # component, so an already-expired token (negative delta) looked
        # valid and was returned from the cache
        if self.expires and (self.expires - dateTime()).total_seconds() > 300:
            return S_OK(self.token)

    if self.apiVersion == 2:
        result = self.__getToken2()
    else:
        result = self.__getToken3()
    return result
def finalize( self ):
  """ finalize FTS job

  :param self: self reference
  """
  self.__updateMetadataCache()
  transEndTime = dateTime()
  regClockStart = time.time()
  statsRes = self.getTransferStatistics()
  transDict = statsRes['Value']
  regRes = self.__registerSuccessful( transDict['transLFNs'] )
  regSuc, regTotal = regRes['Value']
  regTime = time.time() - regClockStart
  # Accounting is sent only when both endpoints are defined
  if self.sourceSE and self.targetSE:
    self.__sendAccounting( regSuc, regTotal, regTime, transEndTime, transDict )
  return S_OK()
def _markStalledJobs(self, stalledTime):
    """ Identifies stalled jobs running or completing without update longer than stalledTime.

    :param stalledTime: seconds without heartbeat after which a job may be declared Stalled
    :return: S_OK() or the error result from the job selection
    """
    stalledCounter = 0
    aliveCounter = 0
    # This is the minimum time we wait for declaring a job Stalled, therefore it is safe
    checkTime = dateTime() - stalledTime * second
    checkedStatuses = [JobStatus.RUNNING, JobStatus.COMPLETING]
    # Only get jobs whose HeartBeat is older than the stalledTime
    result = self.jobDB.selectJobs({'Status': checkedStatuses},
                                   older=checkTime,
                                   timeStamp='HeartBeatTime')
    if not result['OK']:
        return result
    if not result['Value']:
        return S_OK()
    jobs = sorted(result['Value'])
    self.log.info(
        '%d %s jobs will be checked for being stalled, heartbeat before %s' %
        (len(jobs), ' & '.join(checkedStatuses), str(checkTime)))
    for job in jobs:
        delayTime = stalledTime
        # Add a tolerance time for some sites if required
        # NOTE(review): the attribute name 'site' is lower-case here while job
        # attributes are usually capitalized ('Site'), and the result's 'OK'
        # flag is not checked -- confirm this returns the expected value
        site = self.jobDB.getJobAttribute(job, 'site')['Value']
        if site in self.stalledJobsTolerantSites:
            delayTime += self.stalledJobsToleranceTime
        # Check if the job is really stalled
        result = self.__checkJobStalled(job, delayTime)
        if result['OK']:
            self.log.verbose('Updating status to Stalled for job %s' % (job))
            self.__updateJobStatus(job, 'Stalled')
            stalledCounter += 1
        else:
            self.log.verbose(result['Message'])
            aliveCounter += 1
    self.log.info('Total jobs: %d, Stalled jobs: %d, %s jobs: %d' %
                  (len(jobs), stalledCounter, '+'.join(checkedStatuses),
                   aliveCounter))
    return S_OK()
def _failSubmittingJobs(self):
    """Failed Jobs stuck in Submitting Status for a long time.
    They are due to a failed bulk submission transaction.
    """
    cutoff = dateTime() - self.submittingTime * second
    selection = self.jobDB.selectJobs({"Status": JobStatus.SUBMITTING}, older=cutoff)
    if not selection["OK"]:
        self.log.error("Failed to select jobs", selection["Message"])
        return selection
    # Force each stuck job to Failed
    for jobID in selection["Value"]:
        update = self._updateJobStatus(jobID, JobStatus.FAILED, force=True)
        if not update["OK"]:
            self.log.error("Failed to update job status", update["Message"])
    return S_OK()
def __failSubmittingJobs(self):
  """ Failed Jobs stuck in Submitting Status for a long time.
      They are due to a failed bulk submission transaction.
  """
  cutoffTime = str(dateTime() - self.submittingTime * second)
  selection = self.jobDB.selectJobs({'Status': 'Submitting'}, older=cutoffTime)
  if not selection['OK']:
    self.log.error('Failed to select jobs', selection['Message'])
    return selection
  stuckJobs = selection['Value']
  if not stuckJobs:
    return S_OK()
  for jobID in stuckJobs:
    update = self.__updateJobStatus(jobID, 'Failed')
    if not update['OK']:
      self.log.error('Failed to update job status', update['Message'])
  return S_OK()
def submitJobs(self):
    """ Go through defined computing elements and submit jobs if necessary.

        Flow: (1) check globally that some matching work exists, (2) for each
        queue in random order, match eligible task queues per processor tag and
        submit pilots up to the available slots, (3) register the submitted
        pilots in the PilotAgentsDB, sharing them among task queues
        proportionally to the TQ priorities.

        :return: S_OK() / S_ERROR
    """
    queues = self.queueDict.keys()

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups
    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    # Collect the union of all tags declared by the configured queues
    tags = []
    for queue in queues:
        tags += self.queueDict[queue]['ParametersDict']['Tags']
    tqDict['Tag'] = list(set(tags))

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # Classify the matched task queues: which sites they target ('any' means
    # no site restriction) and which sites may receive Test jobs
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
        if "Sites" in result['Value'][tqID]:
            for site in result['Value'][tqID]['Sites']:
                if site.lower() != 'any':
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in result['Value'][tqID]:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        testSites.add(site)
        totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = result['Value'].keys()
    self.log.info(tqIDList)
    result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                        'Status': WAITING_PILOT_STATUS},
                                       None)
    tagWaitingPilots = 0
    if result['OK']:
        tagWaitingPilots = result['Value']
    self.log.info('Total %d jobs in %d task queues with %d waiting pilots' %
                  (totalWaitingJobs, len(tqIDList), tagWaitingPilots))
    self.log.info('Queues: ', self.queueDict.keys())
    # if tagWaitingPilots >= totalWaitingJobs:
    #   self.log.info( 'No more pilots to be submitted in this cycle' )
    #   return S_OK()

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
        return S_ERROR('Can not get the site mask')
    siteMaskList = result['Value']

    # Randomise the queue order so no queue is systematically favoured
    random.shuffle(queues)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

        # Check if the queue failed previously; back off for a number of cycles
        failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
        if failedCount != 0:
            self.log.warn("%s queue failed recently, skipping %d cycles" % (queue, 10 - failedCount))
            self.failedQueues[queue] += 1
            continue

        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        ceType = self.queueDict[queue]['CEType']
        queueName = self.queueDict[queue]['QueueName']
        siteName = self.queueDict[queue]['Site']
        platform = self.queueDict[queue]['Platform']
        queueTags = self.queueDict[queue]['ParametersDict']['Tags']
        siteMask = siteName in siteMaskList
        processorTags = []

        # Keep only the multiprocessor-related tags ("NProcessors" / "WholeNode")
        for tag in queueTags:
            if re.match(r'^[0-9]+Processors$', tag):
                processorTags.append(tag)
        if 'WholeNode' in queueTags:
            processorTags.append('WholeNode')

        if not anySite and siteName not in jobSites:
            self.log.verbose("Skipping queue %s at %s: no workload expected" % (queueName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s at site %s not in the mask" % (queueName, siteName))
            continue

        if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
            queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
        else:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        # if not siteMask and 'Site' in ceDict:
        #   self.log.info( 'Site not in the mask %s' % siteName )
        #   self.log.info( 'Removing "Site" from matching Dict' )
        #   del ceDict[ 'Site' ]
        if not siteMask:
            # Sites out of the mask may only receive Test jobs
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools

        result = Resources.getCompatiblePlatforms(platform)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']
        ceDict['Tag'] = processorTags
        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found for %s' % queue)
            continue

        matchedQueues += 1
        # Group the eligible TQs (and their job counts) by processor tag
        totalTQJobs = 0
        totalTQJobsByProcessors = {}
        tqIDList = taskQueueDict.keys()
        tqIDListByProcessors = {}
        for tq in taskQueueDict:
            if 'Tags' not in taskQueueDict[tq]:
                # skip non multiprocessor tqs
                continue
            for tag in taskQueueDict[tq]['Tags']:
                if tag in processorTags:
                    tqIDListByProcessors.setdefault(tag, [])
                    tqIDListByProcessors[tag].append(tq)

                    totalTQJobsByProcessors.setdefault(tag, 0)
                    totalTQJobsByProcessors[tag] += taskQueueDict[tq]['Jobs']

            totalTQJobs += taskQueueDict[tq]['Jobs']

        self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' %
                         (totalTQJobs, len(tqIDList), queue))

        queueSubmittedPilots = 0
        for tag in tqIDListByProcessors.keys():

            self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" % (tag, tqIDListByProcessors[tag]))

            # Decode the number of processors from the tag; -1 means whole node
            processors = 1

            m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
            if m:
                processors = int(m.group('processors'))
            if tag == 'WholeNode':
                processors = -1

            tagTQJobs = totalTQJobsByProcessors[tag]
            tagTqIDList = tqIDListByProcessors[tag]

            # Get the number of already waiting pilots for these task queues
            tagWaitingPilots = 0
            if self.pilotWaitingFlag:
                lastUpdateTime = dateTime() - self.pilotWaitingTime * second
                result = pilotAgentsDB.countPilots({'TaskQueueID': tagTqIDList,
                                                    'Status': WAITING_PILOT_STATUS},
                                                   None, lastUpdateTime)
                if not result['OK']:
                    self.log.error('Failed to get Number of Waiting pilots', result['Message'])
                    tagWaitingPilots = 0
                else:
                    tagWaitingPilots = result['Value']
                    self.log.verbose('Waiting Pilots for TaskQueue %s:' % tagTqIDList, tagWaitingPilots)
            if tagWaitingPilots >= tagTQJobs:
                self.log.verbose("%d waiting pilots already for all the available jobs" % tagWaitingPilots)
                continue

            self.log.verbose("%d waiting pilots for the total of %d eligible jobs for %s" %
                             (tagWaitingPilots, tagTQJobs, queue))

            # Get the working proxy
            cpuTime = queueCPUTime + 86400
            self.log.verbose("Getting pilot proxy for %s/%s %d long" %
                             (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            totalSlots = self.getQueueSlots(queue, False)
            if totalSlots == 0:
                self.log.debug('%s: No slots available' % queue)
                continue

            # Note: comparing slots to job numbers is not accurate in multiprocessor case.
            #       This could lead to over submission.
            pilotsToSubmit = max(0, min(totalSlots, tagTQJobs - tagWaitingPilots))
            self.log.info('%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' %
                          (queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit))

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit - queueSubmittedPilots, pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get('BundleProxy', False)
                jobExecDir = ''
                jobExecDir = self.queueDict[queue]['ParametersDict'].get('JobExecDir', jobExecDir)
                httpProxy = self.queueDict[queue]['ParametersDict'].get('HttpProxy', '')

                result = self.getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir, processors)
                if not result['OK']:
                    return result

                executable, pilotSubmissionChunk = result['Value']
                result = ce.submitJob(executable, '', pilotSubmissionChunk, processors=processors)
                # ## FIXME: The condor thing only transfers the file with some
                # ## delay, so when we unlink here the script is gone
                # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
                if ceType != 'HTCondorCE':
                    os.unlink(executable)
                if not result['OK']:
                    self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
                    pilotsToSubmit = 0
                    self.failedQueues[queue] += 1
                    continue

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                queueSubmittedPilots += pilotSubmissionChunk
                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result['Value']
                self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
                totalSubmittedPilots += len(pilotList)
                self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
                stampDict = {}
                if result.has_key('PilotStampDict'):
                    stampDict = result['PilotStampDict']

                # Build a cumulative priority list, then draw each pilot's TQ
                # by weighted random sampling
                tqPriorityList = []
                sumPriority = 0.
                for tq in tagTqIDList:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))
                rndm = random.random() * sumPriority
                tqDict = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    if not tqDict.has_key(tqID):
                        tqDict[tqID] = []
                    tqDict[tqID].append(pilotID)

                # Register the pilots, grouped per chosen TaskQueue
                for tqID, pilotList in tqDict.items():
                    result = pilotAgentsDB.addPilotTQReference(pilotList, tqID, self.pilotDN, self.pilotGroup,
                                                               self.localhost, ceType, '', stampDict)
                    if not result['OK']:
                        self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
                        continue
                    for pilot in pilotList:
                        result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                              'Successfully submitted by the SiteDirector',
                                                              siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: ', result['Message'])
                            continue

    self.log.info("%d pilots submitted in total in this cycle, %d matched queues" %
                  (totalSubmittedPilots, matchedQueues))
    return S_OK()
def submitJobs(self):
    """ Go through defined computing elements and submit jobs if necessary.

        For each configured queue (in random order): obtain a pilot proxy,
        query the CE for free slots, match eligible task queues, and submit
        pilots; submitted pilots are then registered in the PilotAgentsDB and
        shared among task queues proportionally to the TQ priorities.

        :return: S_OK() / S_ERROR
    """

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {"Setup": setup, "CPUTime": 9999999, "SubmitPool": self.defaultSubmitPools}
    if self.vo:
        tqDict["Community"] = self.vo
    if self.voGroups:
        tqDict["OwnerGroup"] = self.voGroups
    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result["OK"]:
        return result
    tqDict["Platform"] = result["Value"]
    tqDict["Site"] = self.sites

    self.log.verbose("Checking overall TQ availability with requirements")
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result["OK"]:
        return result
    if not result["Value"]:
        self.log.verbose("No Waiting jobs suitable for the director")
        return S_OK()

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result["OK"]:
        return S_ERROR("Can not get the site mask")
    siteMaskList = result["Value"]

    # Randomise the queue order so no queue is systematically favoured
    queues = self.queueDict.keys()
    random.shuffle(queues)
    for queue in queues:
        ce = self.queueDict[queue]["CE"]
        ceName = self.queueDict[queue]["CEName"]
        ceType = self.queueDict[queue]["CEType"]
        queueName = self.queueDict[queue]["QueueName"]
        siteName = self.queueDict[queue]["Site"]
        siteMask = siteName in siteMaskList

        if "CPUTime" in self.queueDict[queue]["ParametersDict"]:
            queueCPUTime = int(self.queueDict[queue]["ParametersDict"]["CPUTime"])
        else:
            self.log.warn("CPU time limit is not specified for queue %s, skipping..." % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Get the working proxy
        cpuTime = queueCPUTime + 86400
        self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
        if not result["OK"]:
            return result
        self.proxy = result["Value"]
        ce.setProxy(self.proxy, cpuTime - 60)

        # Get the number of available slots on the target site/queue
        result = ce.available()
        if not result["OK"]:
            self.log.warn("Failed to check the availability of queue %s: \n%s" % (queue, result["Message"]))
            continue
        ceInfoDict = result["CEInfoDict"]
        self.log.info(
            "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d"
            % (
                ceName,
                queueName,
                ceInfoDict["WaitingJobs"],
                ceInfoDict["RunningJobs"],
                ceInfoDict["SubmittedJobs"],
                ceInfoDict["MaxTotalJobs"],
            )
        )

        totalSlots = result["Value"]

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()
        ceDict["GridCE"] = ceName
        if not siteMask and "Site" in ceDict:
            self.log.info("Site not in the mask %s" % siteName)
            self.log.info('Removing "Site" from matching Dict')
            del ceDict["Site"]
        if self.vo:
            ceDict["Community"] = self.vo
        if self.voGroups:
            ceDict["OwnerGroup"] = self.voGroups

        # This is a hack to get rid of !
        ceDict["SubmitPool"] = self.defaultSubmitPools
        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result["OK"]:
            continue
        ceDict["Platform"] = result["Value"]

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result["OK"]:
            self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"])
            return result
        taskQueueDict = result["Value"]
        if not taskQueueDict:
            self.log.info("No matching TQs found")
            continue

        totalTQJobs = 0
        tqIDList = taskQueueDict.keys()
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]["Jobs"]

        pilotsToSubmit = min(totalSlots, totalTQJobs)

        # Get the number of already waiting pilots for this queue
        totalWaitingPilots = 0
        if self.pilotWaitingFlag:
            lastUpdateTime = dateTime() - self.pilotWaitingTime * second
            result = pilotAgentsDB.countPilots(
                {"TaskQueueID": tqIDList, "Status": WAITING_PILOT_STATUS}, None, lastUpdateTime
            )
            if not result["OK"]:
                self.log.error("Failed to get Number of Waiting pilots", result["Message"])
                totalWaitingPilots = 0
            else:
                totalWaitingPilots = result["Value"]
                self.log.verbose("Waiting Pilots for TaskQueue %s:" % tqIDList, totalWaitingPilots)

        pilotsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingPilots))
        self.log.info(
            "Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d"
            % (totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit)
        )

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

        while pilotsToSubmit > 0:
            self.log.info("Going to submit %d pilots to %s queue" % (pilotsToSubmit, queue))

            bundleProxy = self.queueDict[queue].get("BundleProxy", False)
            jobExecDir = ""
            if ceType == "CREAM":
                jobExecDir = "."
            jobExecDir = self.queueDict[queue].get("JobExecDir", jobExecDir)
            httpProxy = self.queueDict[queue].get("HttpProxy", "")

            result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir)
            if not result["OK"]:
                return result

            executable, pilotSubmissionChunk = result["Value"]
            result = ce.submitJob(executable, "", pilotSubmissionChunk)
            os.unlink(executable)
            if not result["OK"]:
                self.log.error("Failed submission to queue %s:\n" % queue, result["Message"])
                pilotsToSubmit = 0
                continue

            pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
            # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
            # task queue priorities
            pilotList = result["Value"]
            self.log.info("Submitted %d pilots to %s@%s" % (len(pilotList), queueName, ceName))
            stampDict = {}
            if result.has_key("PilotStampDict"):
                stampDict = result["PilotStampDict"]

            # Build a cumulative priority list, then draw each pilot's TQ
            # by weighted random sampling
            tqPriorityList = []
            sumPriority = 0.0
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]["Priority"]
                tqPriorityList.append((tq, sumPriority))
            rndm = random.random() * sumPriority
            tqDict = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                if not tqDict.has_key(tqID):
                    tqDict[tqID] = []
                tqDict[tqID].append(pilotID)

            # Register the pilots, grouped per chosen TaskQueue
            for tqID, pilotList in tqDict.items():
                result = pilotAgentsDB.addPilotTQReference(
                    pilotList, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, "", stampDict
                )
                if not result["OK"]:
                    self.log.error("Failed add pilots to the PilotAgentsDB: ", result["Message"])
                    continue
                for pilot in pilotList:
                    result = pilotAgentsDB.setPilotStatus(
                        pilot,
                        "Submitted",
                        ceName,
                        "Successfully submitted by the SiteDirector",
                        siteName,
                        queueName,
                    )
                    if not result["OK"]:
                        self.log.error("Failed to set pilot status: ", result["Message"])
                        continue
    return S_OK()
def sendAccounting(self, jobID):
    """Send WMS accounting data for the given job.

    Reconstructs start/end times from the job attributes and the job logging
    history, collects the last CPU/wall-clock heartbeat values, and commits an
    accounting record with final status Failed/Stalled.

    :param jobID: job identifier (int or numeric string)
    :return: result of the accounting commit (S_OK / S_ERROR)
    """
    accountingReport = Job()

    result = self.jobDB.getJobAttributes(jobID)
    if not result['OK']:
        return result
    jobDict = result['Value']

    result = self.logDB.getJobLoggingInfo(jobID)
    if not result['OK']:
        logList = []
    else:
        logList = result['Value']

    startTime = jobDict['StartExecTime']
    endTime = ''

    if not startTime or startTime == 'None':
        # Fall back on the logging history: first 'Running' entry gives the
        # start, last 'Stalled' entry gives the end
        for status, minor, app, stime, source in logList:
            if status == 'Running':
                startTime = stime
                break
        for status, minor, app, stime, source in logList:
            if status == 'Stalled':
                endTime = stime
        if not startTime or startTime == 'None':
            startTime = jobDict['SubmissionTime']

    if type(startTime) in types.StringTypes:
        startTime = fromString(startTime)

    result = self.logDB.getJobLoggingInfo(jobID)
    if not result['OK']:
        endTime = dateTime()
    else:
        for status, minor, app, stime, source in result['Value']:
            if status == 'Stalled':
                endTime = stime
                break
    if not endTime:
        endTime = dateTime()

    if type(endTime) in types.StringTypes:
        endTime = fromString(endTime)

    # Take the highest CPU/wall-clock values ever reported via heartbeats
    result = self.jobDB.getHeartBeatData(jobID)
    lastCPUTime = 0
    lastWallTime = 0
    lastHeartBeatTime = jobDict['StartExecTime']
    if result['OK']:
        for name, value, heartBeatTime in result['Value']:
            if 'CPUConsumed' == name:
                try:
                    value = int(float(value))
                    if value > lastCPUTime:
                        lastCPUTime = value
                except (ValueError, TypeError):
                    # Fix: previously a bare except; only conversion errors
                    # of a malformed heartbeat value are expected here
                    pass
            if 'WallClockTime' == name:
                try:
                    value = int(float(value))
                    if value > lastWallTime:
                        lastWallTime = value
                except (ValueError, TypeError):
                    pass
            if heartBeatTime > lastHeartBeatTime:
                lastHeartBeatTime = heartBeatTime

    accountingReport.setStartTime(startTime)
    # NOTE(review): endTime is computed above but not passed to setEndTime();
    # presumably setEndTime() defaults to "now" — confirm before changing.
    accountingReport.setEndTime()
    # execTime = toEpoch( endTime ) - toEpoch( startTime )
    # Fill the accounting data
    acData = {'Site': jobDict['Site'],
              'User': jobDict['Owner'],
              'UserGroup': jobDict['OwnerGroup'],
              'JobGroup': jobDict['JobGroup'],
              'JobType': jobDict['JobType'],
              'JobClass': jobDict['JobSplitType'],
              'ProcessingType': 'unknown',
              'FinalMajorStatus': 'Failed',
              'FinalMinorStatus': 'Stalled',
              'CPUTime': lastCPUTime,
              'NormCPUTime': 0.0,
              'ExecTime': lastWallTime,
              'InputDataSize': 0.0,
              'OutputDataSize': 0.0,
              'InputDataFiles': 0,
              'OutputDataFiles': 0,
              'DiskSpace': 0.0,
              'InputSandBoxSize': 0.0,
              'OutputSandBoxSize': 0.0,
              'ProcessedEvents': 0}
    self.log.verbose('Accounting Report is:')
    self.log.verbose(acData)
    accountingReport.setValuesFromDict(acData)
    result = accountingReport.commit()
    if result['OK']:
        self.jobDB.setJobAttribute(jobID, 'AccountedFlag', 'True')
    else:
        self.log.warn('Failed to send accounting report for job %d' % int(jobID))
        self.log.error(result['Message'])
    return result
def __getToken3(self):
    """Get the Keystone token for the version v3 of the keystone service.

    Supports three credential sources, in priority order: user/password,
    VOMS proxy mapping, and an application credential file.

    :return: S_OK(token) or S_ERROR
    """
    domain = self.parameters.get("Domain", "Default")
    user = self.parameters.get("User")
    password = self.parameters.get("Password")
    appcred_file = self.parameters.get("Appcred")
    authDict = {}
    authArgs = {}
    if user and password:
        authDict = {
            "auth": {
                "identity": {
                    "methods": ["password"],
                    "password": {
                        "user": {"name": user, "domain": {"name": domain}, "password": password}
                    },
                }
            }
        }
    elif self.parameters.get("Auth") == "voms":
        authDict = {
            "auth": {
                "identity": {
                    "methods": ["mapped"],
                    "mapped": {"voms": True, "identity_provider": "egi.eu", "protocol": "mapped"},
                }
            }
        }
        if self.parameters.get("Proxy"):
            authArgs["cert"] = self.parameters.get("Proxy")
    elif appcred_file:
        # The application credentials are stored in a file of the format:
        # id secret
        # Fix: use a context manager so the file is closed even if the
        # content is malformed and split() raises.
        with open(appcred_file, "r") as ac_fd:
            auth_info = ac_fd.read()
        auth_info = auth_info.strip()
        ac_id, ac_secret = auth_info.split(" ", 1)
        authDict = {
            "auth": {
                "identity": {
                    "methods": ["application_credential"],
                    "application_credential": {"id": ac_id, "secret": ac_secret},
                }
            }
        }
    else:
        return S_ERROR("No valid credentials provided")

    # appcred includes the project scope binding in the credential itself
    if self.project and not appcred_file:
        authDict["auth"]["scope"] = {"project": {"domain": {"name": domain}, "name": self.project}}

    gLogger.debug("Request token with auth arguments: %s and body %s" % (str(authArgs), str(authDict)))

    url = "%s/auth/tokens" % self.url
    try:
        result = requests.post(url,
                               headers={
                                   "Content-Type": "application/json",
                                   "Accept": "application/json",
                               },
                               json=authDict,
                               verify=self.caPath,
                               **authArgs)
    except Exception as exc:
        return S_ERROR("Exception getting keystone token: %s" % str(exc))

    if result.status_code not in [200, 201, 202, 203, 204]:
        return S_ERROR("Failed to get keystone token: %s" % result.text)

    try:
        # v3 returns the token in a response header, not in the body
        self.token = result.headers["X-Subject-Token"]
    except KeyError as exc:
        # Fix: narrowed from "except Exception" — only a missing header is
        # expected here
        return S_ERROR("Failed to get keystone token: %s" % str(exc))

    output = result.json()

    # Keep the local expiry as "now + token lifetime" to avoid clock-skew
    # issues with the server timestamps
    expires = fromString(str(output["token"]["expires_at"]).replace("T", " ").replace("Z", ""))
    issued = fromString(str(output["token"]["issued_at"]).replace("T", " ").replace("Z", ""))
    self.expires = dateTime() + (expires - issued)

    if "project" in output["token"]:
        if output["token"]["project"]["name"] == self.project:
            self.projectID = output["token"]["project"]["id"]

    # Record the public endpoints of the services we use
    if "catalog" in output["token"]:
        for service in output["token"]["catalog"]:
            if service["type"] == "compute":
                for endpoint in service["endpoints"]:
                    if endpoint["interface"] == "public":
                        self.computeURL = str(endpoint["url"])
            elif service["type"] == "image":
                for endpoint in service["endpoints"]:
                    if endpoint["interface"] == "public":
                        self.imageURL = str(endpoint["url"])
            elif service["type"] == "network":
                for endpoint in service["endpoints"]:
                    if endpoint["interface"] == "public":
                        self.networkURL = str(endpoint["url"])
    return S_OK(self.token)
def export_checkComponentLog(self, component):
    """Check component log for errors.

    :param component: '*' for all set-up components, a 'System/Component'
                      string, or a list of such strings
    :return: S_OK({component: {'ErrorsHour': int, 'ErrorsDay': int,
                               'LastError': str}})
    """
    componentList = []
    if '*' in component:
        if component == '*':
            result = gComponentInstaller.getSetupComponents()
            if result['OK']:
                for ctype in ['Services', 'Agents', 'Executors']:
                    if ctype in result['Value']:
                        for sname in result['Value'][ctype]:
                            for cname in result['Value'][ctype][sname]:
                                componentList.append('/'.join([sname, cname]))
    elif isinstance(component, basestring):
        componentList = [component]
    else:
        componentList = component

    resultDict = {}
    for comp in componentList:
        if '/' not in comp:
            continue
        system, cname = comp.split('/')

        startDir = gComponentInstaller.startDir
        currentLog = startDir + '/' + system + '_' + cname + '/log/current'
        try:
            # Fix: use open() in a context manager instead of the py2-only
            # file() builtin, so the handle is closed on every path
            with open(currentLog, 'r') as logFile:
                logLines = logFile.readlines()
        except IOError as err:
            gLogger.error("File does not exists:", currentLog)
            resultDict[comp] = {'ErrorsHour': -1, 'ErrorsDay': -1,
                                'LastError': currentLog + '::' + repr(err)}
            continue

        errors_1 = 0
        errors_24 = 0
        now = dateTime()
        lastError = ''
        for line in logLines:
            if "ERROR:" in line:
                fields = line.split()
                recent = False
                if len(fields) < 2:  # if the line contains only one word
                    lastError = line.split('ERROR:')[-1].strip()
                    continue
                timeStamp = fromString(fields[0] + ' ' + fields[1])
                if not timeStamp:  # if the timestamp is missing in the log
                    lastError = line.split('ERROR:')[-1].strip()
                    continue
                if (now - timeStamp) < hour:
                    errors_1 += 1
                    recent = True
                if (now - timeStamp) < day:
                    errors_24 += 1
                    recent = True
                if recent:
                    lastError = line.split('ERROR:')[-1].strip()

        resultDict[comp] = {'ErrorsHour': errors_1, 'ErrorsDay': errors_24,
                            'LastError': lastError}

    return S_OK(resultDict)
def setProxy(self, proxy, valid=0):
    """Attach a proxy to this instance and record its validity horizon.

    :param proxy: proxy object to be used by this instance
    :param int valid: validity period in seconds from now
    """
    self.valid = dateTime() + second * valid
    self.proxy = proxy
def _failStalledJobs(self, failedTime):
    """ Changes the Stalled status to Failed for jobs long in the Stalled status.

        A Stalled job is Failed either because its pilot is no longer Running,
        or because it gave no sign of life for longer than ``failedTime``.
        Jobs previously Failed for these reasons but not yet accounted are then
        accounted for.

        :param int failedTime: stalled-time threshold in seconds
        :return: S_OK(failedCounter) / S_ERROR
    """
    # Only get jobs that have been Stalled for long enough
    checkTime = dateTime() - failedTime * second
    result = self.jobDB.selectJobs({'Status': JobStatus.STALLED}, older=checkTime)
    if not result['OK']:
        return result
    jobs = result['Value']

    failedCounter = 0
    # Index 0: pilot lost; index 1: no sign of life for too long
    minorStalledStatuses = ("Job stalled: pilot not running",
                            'Stalling for more than %d sec' % failedTime)

    if jobs:
        self.log.info('%d jobs Stalled before %s will be checked for failure' %
                      (len(jobs), str(checkTime)))

        for job in jobs:
            setFailed = False
            # Check if the job pilot is lost
            result = self.__getJobPilotStatus(job)
            if not result['OK']:
                self.log.error('Failed to get pilot status', result['Message'])
                continue
            pilotStatus = result['Value']
            if pilotStatus != "Running":
                setFailed = minorStalledStatuses[0]
            else:
                # Verify that there was no sign of life for long enough
                result = self.__getLatestUpdateTime(job)
                if not result['OK']:
                    self.log.error('Failed to get job update time', result['Message'])
                    continue
                elapsedTime = toEpoch() - result['Value']
                if elapsedTime > failedTime:
                    setFailed = minorStalledStatuses[1]

            # Set the jobs Failed, send them a kill signal in case they are not
            # really dead and send accounting info
            if setFailed:
                self.__sendKillCommand(job)
                # setFailed holds the minor status string explaining the failure
                self.__updateJobStatus(job, JobStatus.FAILED, setFailed)
                failedCounter += 1
                result = self.__sendAccounting(job)
                if not result['OK']:
                    self.log.error('Failed to send accounting', result['Message'])

    # Second pass: account any Failed-for-stalling jobs not yet accounted
    recoverCounter = 0

    for minor in minorStalledStatuses:
        result = self.jobDB.selectJobs({'Status': JobStatus.FAILED,
                                        'MinorStatus': minor,
                                        'AccountedFlag': 'False'})
        if not result['OK']:
            return result
        if result['Value']:
            jobs = result['Value']
            self.log.info('%s Stalled jobs will be Accounted' % (len(jobs)))
            for job in jobs:
                result = self.__sendAccounting(job)
                if not result['OK']:
                    self.log.error('Failed to send accounting', result['Message'])
                    continue
                recoverCounter += 1
        # Stop on the first accounting error left in `result`
        if not result['OK']:
            break

    if failedCounter:
        self.log.info('%d jobs set to Failed' % failedCounter)
    if recoverCounter:
        self.log.info('%d jobs properly Accounted' % recoverCounter)
    return S_OK(failedCounter)
def export_checkComponentLog(self, component):
    """Check component log for errors.

    :param component: '*' for all set-up components, a 'System/Component'
                      string, or a list of such strings
    :return: S_OK({component: {'ErrorsHour': int, 'ErrorsDay': int,
                               'LastError': str}})
    """
    componentList = []
    if '*' in component:
        if component == '*':
            result = gComponentInstaller.getSetupComponents()
            if result['OK']:
                for ctype in ['Services', 'Agents', 'Executors']:
                    if ctype in result['Value']:
                        for sname in result['Value'][ctype]:
                            for cname in result['Value'][ctype][sname]:
                                componentList.append('/'.join([sname, cname]))
    elif isinstance(component, basestring):
        componentList = [component]
    else:
        componentList = component

    resultDict = {}
    for comp in componentList:
        if '/' not in comp:
            continue
        system, cname = comp.split('/')

        startDir = gComponentInstaller.startDir
        currentLog = startDir + '/' + system + '_' + cname + '/log/current'
        try:
            # Fix: use open() in a context manager instead of the py2-only
            # file() builtin, so the handle is closed on every path
            with open(currentLog, 'r') as logFile:
                logLines = logFile.readlines()
        except IOError as err:
            gLogger.error("File does not exists:", currentLog)
            resultDict[comp] = {
                'ErrorsHour': -1,
                'ErrorsDay': -1,
                'LastError': currentLog + '::' + repr(err)
            }
            continue

        errors_1 = 0
        errors_24 = 0
        now = dateTime()
        lastError = ''
        for line in logLines:
            if "ERROR:" in line:
                fields = line.split()
                recent = False
                if len(fields) < 2:  # if the line contains only one word
                    lastError = line.split('ERROR:')[-1].strip()
                    continue
                timeStamp = fromString(fields[0] + ' ' + fields[1])
                if not timeStamp:  # if the timestamp is missing in the log
                    lastError = line.split('ERROR:')[-1].strip()
                    continue
                if (now - timeStamp) < hour:
                    errors_1 += 1
                    recent = True
                if (now - timeStamp) < day:
                    errors_24 += 1
                    recent = True
                if recent:
                    lastError = line.split('ERROR:')[-1].strip()

        resultDict[comp] = {
            'ErrorsHour': errors_1,
            'ErrorsDay': errors_24,
            'LastError': lastError
        }

    return S_OK(resultDict)
def __getToken2(self):
    """Get the Keystone token for the version v2 of the keystone service.

    Credentials are taken from ``self.parameters``: either User/Password or a
    VOMS proxy. On success the token, its expiry, the tenant ID and the
    public service endpoints (compute/image/network) are cached on ``self``.

    :return: S_OK(token) or S_ERROR
    """
    user = self.parameters.get("User")
    password = self.parameters.get("Password")
    authArgs = {}
    if user and password:
        authDict = {"auth": {"passwordCredentials": {"username": user,
                                                     "password": password}}}
        if self.project:
            authDict["auth"]["tenantName"] = self.project
    elif self.parameters.get("Auth") == "voms":
        authDict = {"auth": {"voms": True}}
        if self.project:
            authDict["auth"]["tenantName"] = self.project
        if self.parameters.get("Proxy"):
            # Client certificate (proxy) passed to requests for TLS auth
            authArgs["cert"] = self.parameters.get("Proxy")
    try:
        result = requests.post("%s/tokens" % self.url,
                               headers={"Content-Type": "application/json"},
                               json=authDict,
                               verify=self.caPath,
                               **authArgs)
    except Exception as exc:
        return S_ERROR("Exception getting keystone token: %s" % str(exc))

    output = result.json()

    if result.status_code in [400, 401]:
        message = "None"
        if "error" in output:
            message = output["error"].get("message")
        return S_ERROR("Authorization error: %s" % message)

    # NOTE(review): status codes other than 200/400/401 are not handled here;
    # a 5xx body would raise KeyError below — confirm whether that is intended.
    self.token = str(output["access"]["token"]["id"])

    # Keep the local expiry as "now + token lifetime" to avoid clock-skew
    # issues with the server timestamps
    expires = fromString(
        str(output["access"]["token"]["expires"]).replace("T", " ").replace("Z", ""))
    issued = fromString(
        str(output["access"]["token"]["issued_at"]).replace("T", " ").replace("Z", ""))
    self.expires = dateTime() + (expires - issued)

    self.projectID = output["access"]["token"]["tenant"]["id"]

    # Record the first public endpoint of each service we use
    for endpoint in output["access"]["serviceCatalog"]:
        if endpoint["type"] == "compute":
            self.computeURL = str(endpoint["endpoints"][0]["publicURL"])
        elif endpoint["type"] == "image":
            self.imageURL = str(endpoint["endpoints"][0]["publicURL"])
        elif endpoint["type"] == "network":
            self.networkURL = str(endpoint["endpoints"][0]["publicURL"])

    return S_OK(self.token)
def __obtainWMSJobIDs(self, transformation, fileDict, selectDelay, wmsStatusList):
    """ Group files by the corresponding WMS jobIDs, check the corresponding
        jobs have not been updated for the delay time.  Can't get into any
        mess because we start from files only in MaxReset / Assigned and check
        corresponding jobs.  Mixtures of files for jobs in MaxReset and
        Assigned statuses only possibly include some files in Unused status
        (not Processed for example) that will not be touched.

        :param transformation: transformation ID
        :param dict fileDict: LFN -> (taskID, status) mapping of selected files
        :param int selectDelay: only consider tasks not updated for this many hours
        :param list wmsStatusList: WMS statuses a job must be in to be considered
        :return: S_OK({wmsID: [lfn, ...]}) / S_ERROR
    """
    taskIDList = sorted(set(taskID for taskID, _status in fileDict.values()))
    self.transLogger.verbose("The following %d task IDs correspond to the selected files:\n%s" %
                             (len(taskIDList), ', '.join(str(taskID) for taskID in taskIDList)))

    jobFileDict = {}
    # Only tasks whose LastUpdateTime is older than the delay window
    olderThan = dateTime() - datetime.timedelta(hours=selectDelay)

    res = self.transClient.getTransformationTasks(condDict={'TransformationID': transformation,
                                                            'TaskID': taskIDList},
                                                  older=olderThan,
                                                  timeStamp='LastUpdateTime')
    if not res['OK']:
        self.transLogger.error("getTransformationTasks returned an error",
                               '%s' % res['Message'])
        return res

    mandatoryKeys = {'TaskID', 'ExternalID', 'LastUpdateTime', 'ExternalStatus'}
    for taskDict in res['Value']:
        # Skip malformed task dictionaries, reporting each missing key
        missingKey = mandatoryKeys - set(taskDict)
        if missingKey:
            for key in missingKey:
                self.transLogger.warn('Missing key %s for job dictionary:\n\t%s' %
                                      (key, str(taskDict)))
            continue

        taskID = taskDict['TaskID']
        wmsID = taskDict['ExternalID']
        wmsStatus = taskDict['ExternalStatus']

        # A zero/empty ExternalID means the task was never submitted to the WMS
        if not int(wmsID):
            self.transLogger.verbose('TaskID %s: status is %s (jobID = %s) so will not recheck with WMS' %
                                     (taskID, wmsStatus, wmsID))
            continue

        # Exclude jobs not having appropriate WMS status - have to trust that
        # production management status is correct
        if wmsStatus not in wmsStatusList:
            self.transLogger.verbose('Job %s is in status %s, not in %s so will be ignored' %
                                     (wmsID, wmsStatus, ', '.join(wmsStatusList)))
            continue

        # Must map unique files -> jobs in expected state
        jobFileDict[wmsID] = [lfn for lfn, (tID, _st) in fileDict.iteritems()
                              if int(tID) == int(taskID)]

        self.transLogger.info('Found %d files for taskID %s, jobID %s (%s), last update %s' %
                              (len(jobFileDict[wmsID]), taskID, wmsID, wmsStatus,
                               taskDict['LastUpdateTime']))

    return S_OK(jobFileDict)
def submitJobs(self):
    """Go through defined computing elements and submit pilot jobs if necessary.

    First checks globally that there is eligible work in the Matcher, then per
    queue: obtains a pilot proxy, queries the CE for free slots, matches task
    queues against the queue description, and submits pilots in chunks,
    registering them in the PilotAgentsDB proportionally to task queue
    priorities.

    :return: S_OK() on success (individual queue failures are logged and
             skipped), or an error structure on fatal errors
    """
    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups
    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)
    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
        return S_ERROR('Can not get the site mask')
    siteMaskList = result['Value']

    # list() is required: random.shuffle() needs a mutable sequence and
    # dict.keys() returns a view on Python 3
    queues = list(self.queueDict)
    random.shuffle(queues)
    for queue in queues:
        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        ceType = self.queueDict[queue]['CEType']
        queueName = self.queueDict[queue]['QueueName']
        siteName = self.queueDict[queue]['Site']
        siteMask = siteName in siteMaskList

        if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
            queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
        else:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Get the working proxy; ask for one day more than the queue limit
        cpuTime = queueCPUTime + 86400
        self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
        if not result['OK']:
            return result
        self.proxy = result['Value']
        ce.setProxy(self.proxy, cpuTime - 60)

        # Get the number of available slots on the target site/queue
        result = ce.available()
        if not result['OK']:
            self.log.warn('Failed to check the availability of queue %s: \n%s' % (queue, result['Message']))
            continue
        ceInfoDict = result['CEInfoDict']
        self.log.info("CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" %
                      (ceName, queueName,
                       ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                       ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs']))
        totalSlots = result['Value']

        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        if not siteMask and 'Site' in ceDict:
            self.log.info('Site not in the mask %s' % siteName)
            self.log.info('Removing "Site" from matching Dict')
            del ceDict['Site']
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups
        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools
        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.info('No matching TQs found')
            continue
        totalTQJobs = 0
        tqIDList = list(taskQueueDict)
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]['Jobs']
        pilotsToSubmit = min(totalSlots, totalTQJobs)

        # Get the number of already waiting pilots for this queue
        totalWaitingPilots = 0
        if self.pilotWaitingFlag:
            lastUpdateTime = dateTime() - self.pilotWaitingTime * second
            result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                                'Status': WAITING_PILOT_STATUS},
                                               None, lastUpdateTime)
            if not result['OK']:
                self.log.error('Failed to get Number of Waiting pilots', result['Message'])
                totalWaitingPilots = 0
            else:
                totalWaitingPilots = result['Value']
                self.log.verbose('Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots)
            pilotsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingPilots))
        self.log.info('Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' %
                      (totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit))

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)
        while pilotsToSubmit > 0:
            self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))
            bundleProxy = self.queueDict[queue].get('BundleProxy', False)
            jobExecDir = ''
            if ceType == 'CREAM':
                jobExecDir = '.'
            jobExecDir = self.queueDict[queue].get('JobExecDir', jobExecDir)
            httpProxy = self.queueDict[queue].get('HttpProxy', '')
            result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir)
            if not result['OK']:
                return result
            executable, pilotSubmissionChunk = result['Value']
            result = ce.submitJob(executable, '', pilotSubmissionChunk)
            if not result['OK']:
                self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
                pilotsToSubmit = 0
                continue
            pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk

            # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
            # task queue priorities
            pilotList = result['Value']
            self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
            stampDict = {}
            # dict.has_key() was removed in Python 3; use the "in" operator
            if 'PilotStampDict' in result:
                stampDict = result['PilotStampDict']
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append((tq, sumPriority))
            rndm = random.random() * sumPriority
            tqDict = {}
            for pilotID in pilotList:
                # Pick a task queue at random, weighted by priority
                rndm = random.random() * sumPriority
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                tqDict.setdefault(tqID, []).append(pilotID)

            for tqID, pilotList in tqDict.items():
                result = pilotAgentsDB.addPilotTQReference(pilotList, tqID,
                                                           self.pilotDN, self.pilotGroup,
                                                           self.localhost, ceType, '', stampDict)
                if not result['OK']:
                    self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
                    continue
                for pilot in pilotList:
                    result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                          'Successfully submitted by the SiteDirector',
                                                          siteName, queueName)
                    if not result['OK']:
                        self.log.error('Failed to set pilot status: ', result['Message'])
                        continue
    return S_OK()
def obtainWMSJobIDs(self, transformation, fileDict, selectDelay, wmsStatusList):
    """Group the selected files by the WMS jobID of the production job they belong to.

    Checks that the corresponding jobs have not been updated for the delay
    time.  Can't get into any mess because we start from files only in
    MaxReset / Assigned and check corresponding jobs.  Mixtures of files for
    jobs in MaxReset and Assigned statuses only possibly include some files in
    Unused status (not Processed for example) that will not be touched.

    :param transformation: transformation (production) ID
    :param fileDict: mapping lfn -> production jobID for the selected files
    :param selectDelay: minimum age (hours) of the task's last update
    :param wmsStatusList: acceptable production-management WMS statuses
    :return: S_OK({wmsID: [lfn, ...]}) or an error structure
    """
    prodJobIDs = uniqueElements(fileDict.values())
    self.log.info('The following %s production jobIDs apply to the selected files:\n%s'
                  % (len(prodJobIDs), prodJobIDs))
    jobFileDict = {}
    condDict = {'TransformationID': transformation, self.taskIDName: prodJobIDs}
    delta = datetime.timedelta(hours=selectDelay)
    now = dateTime()
    olderThan = now - delta
    res = self.prodDB.getTransformationTasks(condDict=condDict, older=olderThan,
                                             timeStamp='LastUpdateTime', inputVector=True)
    self.log.debug(res)
    if not res['OK']:
        # Interpolate the actual error (the original left the %s placeholder unfilled)
        self.log.error('getTransformationTasks returned an error:\n%s' % res['Message'])
        return res
    for jobDict in res['Value']:
        missingKey = False
        for key in [self.taskIDName, self.externalID, 'LastUpdateTime', self.externalStatus, 'InputVector']:
            # dict.has_key() was removed in Python 3; use the "in" operator
            if key not in jobDict:
                self.log.info('Missing key %s for job dictionary, the following is available:\n%s'
                              % (key, jobDict))
                missingKey = True
                continue
        if missingKey:
            continue
        job = jobDict[self.taskIDName]
        wmsID = jobDict[self.externalID]
        lastUpdate = jobDict['LastUpdateTime']
        wmsStatus = jobDict[self.externalStatus]
        if not int(wmsID):
            self.log.info('Prod job %s status is %s (ID = %s) so will not recheck with WMS'
                          % (job, wmsStatus, wmsID))
            continue
        self.log.info('Job %s, prod job %s last update %s, production management system status %s'
                      % (wmsID, job, lastUpdate, wmsStatus))
        # Exclude jobs not having appropriate WMS status - have to trust that
        # production management status is correct
        if wmsStatus not in wmsStatusList:
            # str.join() replaces string.join(), which was removed in Python 3
            self.log.info('Job %s is in status %s, not %s so will be ignored'
                          % (wmsID, wmsStatus, ', '.join(wmsStatusList)))
            continue
        finalJobData = []
        # Must map unique files -> jobs in expected state
        for lfn, prodID in fileDict.items():
            if int(prodID) == int(job):
                finalJobData.append(lfn)
        self.log.info('Found %s files for job %s' % (len(finalJobData), job))
        jobFileDict[wmsID] = finalJobData
    return S_OK(jobFileDict)
def export_checkComponentLog(self, component):
    """Check component log(s) for errors.

    :param component: "*" for all components found in the setup,
                      a single "system/component" string, or a list of such strings
    :return: S_OK({component: {"ErrorsHour": int, "ErrorsDay": int, "LastError": str}});
             ErrorsHour/ErrorsDay are -1 when the log file cannot be read
    """
    componentList = []
    if "*" in component:
        if component == "*":
            # Expand to every set-up service/agent/executor
            result = gComponentInstaller.getSetupComponents()
            if result["OK"]:
                for ctype in ["Services", "Agents", "Executors"]:
                    if ctype in result["Value"]:
                        for sname in result["Value"][ctype]:
                            for cname in result["Value"][ctype][sname]:
                                componentList.append("/".join([sname, cname]))
    elif isinstance(component, six.string_types):
        componentList = [component]
    else:
        componentList = component

    resultDict = {}
    for comp in componentList:
        if "/" not in comp:
            continue
        system, cname = comp.split("/")
        startDir = gComponentInstaller.startDir
        currentLog = startDir + "/" + system + "_" + cname + "/log/current"
        try:
            with open(currentLog, "r") as logFile:
                logLines = logFile.readlines()
        except IOError as err:
            # Fixed message grammar ("does not exists" -> "does not exist")
            gLogger.error("File does not exist:", currentLog)
            resultDict[comp] = {
                "ErrorsHour": -1,
                "ErrorsDay": -1,
                "LastError": currentLog + "::" + repr(err)
            }
            continue

        errors_1 = 0   # errors within the last hour
        errors_24 = 0  # errors within the last day
        now = dateTime()
        lastError = ""
        for line in logLines:
            if "ERROR:" in line:
                fields = line.split()
                recent = False
                if len(fields) < 2:
                    # the line contains only one word: no timestamp to parse
                    lastError = line.split("ERROR:")[-1].strip()
                    continue
                timeStamp = fromString(fields[0] + " " + fields[1])
                if not timeStamp:
                    # the timestamp is missing in the log line
                    lastError = line.split("ERROR:")[-1].strip()
                    continue
                if (now - timeStamp) < hour:
                    errors_1 += 1
                    recent = True
                if (now - timeStamp) < day:
                    errors_24 += 1
                    recent = True
                if recent:
                    lastError = line.split("ERROR:")[-1].strip()
        resultDict[comp] = {
            "ErrorsHour": errors_1,
            "ErrorsDay": errors_24,
            "LastError": lastError
        }
    return S_OK(resultDict)
def submitJobs(self):
    """Go through defined computing elements and submit pilot jobs if necessary.

    Workflow: global Matcher check for eligible work; per queue: failure
    back-off, site/CE status checks (mask, RSS), task queue matching by
    platform/tags, then per processor-tag chunked pilot submission with
    registration in the PilotAgentsDB proportionally to task queue priorities.

    :return: S_OK() on success (individual queue failures are logged, counted
             in self.failedQueues and skipped), or an error structure on
             fatal errors
    """
    # list() is required: random.shuffle() needs a mutable sequence and
    # dict.keys() returns a view on Python 3
    queues = list(self.queueDict)

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups
    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tags = []
    for queue in queues:
        tags += self.queueDict[queue]['ParametersDict']['Tag']
    tqDict['Tag'] = list(set(tags))
    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # Collect the sites that have matching workload and the test sites
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
        if "Sites" in result['Value'][tqID]:
            for site in result['Value'][tqID]['Sites']:
                if site.lower() != 'any':
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in result['Value'][tqID]:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        testSites.add(site)
        totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = list(result['Value'])
    self.log.info(tqIDList)
    result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                        'Status': WAITING_PILOT_STATUS}, None)
    tagWaitingPilots = 0
    if result['OK']:
        tagWaitingPilots = result['Value']
    self.log.info('Total %d jobs in %d task queues with %d waiting pilots'
                  % (totalWaitingJobs, len(tqIDList), tagWaitingPilots))
    self.log.info('Queues: ', list(self.queueDict))
    # if tagWaitingPilots >= totalWaitingJobs:
    #   self.log.info( 'No more pilots to be submitted in this cycle' )
    #   return S_OK()

    result = self.siteClient.getUsableSites()
    if not result['OK']:
        return result
    siteMaskList = result['Value']

    queues = list(self.queueDict)
    random.shuffle(queues)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

        # Check if the queue failed previously
        failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
        if failedCount != 0:
            self.log.warn("%s queue failed recently, skipping %d cycles" % (queue, 10 - failedCount))
            self.failedQueues[queue] += 1
            continue

        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        ceType = self.queueDict[queue]['CEType']
        queueName = self.queueDict[queue]['QueueName']
        siteName = self.queueDict[queue]['Site']
        platform = self.queueDict[queue]['Platform']
        queueTags = self.queueDict[queue]['ParametersDict']['Tag']
        siteMask = siteName in siteMaskList
        processorTags = []

        # Check the status of the Site
        result = self.siteClient.getUsableSites(siteName)
        if not result['OK']:
            self.log.error("Can not get the status of site %s: %s" % (siteName, result['Message']))
            continue
        if siteName not in result.get('Value', []):
            self.log.info("site %s is not active" % siteName)
            continue

        if self.rssFlag:
            # Check the status of the ComputingElement
            result = self.rssClient.getElementStatus(ceName, "ComputingElement")
            if not result['OK']:
                self.log.error("Can not get the status of computing element",
                               " %s: %s" % (siteName, result['Message']))
                continue
            if result['Value']:
                # get the value of the status
                result = result['Value'][ceName]['all']
                if result not in ('Active', 'Degraded'):
                    self.log.verbose("Skipping computing element %s at %s: resource not usable"
                                     % (ceName, siteName))
                    continue

        for tag in queueTags:
            if re.match(r'^[0-9]+Processors$', tag):
                processorTags.append(tag)
        if 'WholeNode' in queueTags:
            processorTags.append('WholeNode')

        if not anySite and siteName not in jobSites:
            self.log.verbose("Skipping queue %s at %s: no workload expected" % (queueName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s at site %s not in the mask" % (queueName, siteName))
            continue

        if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
            queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
        else:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        # if not siteMask and 'Site' in ceDict:
        #   self.log.info( 'Site not in the mask %s' % siteName )
        #   self.log.info( 'Removing "Site" from matching Dict' )
        #   del ceDict[ 'Site' ]
        if not siteMask:
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups
        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools
        result = Resources.getCompatiblePlatforms(platform)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']
        ceDict['Tag'] = queueTags

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found for %s' % queue)
            continue
        matchedQueues += 1

        totalTQJobs = 0
        totalTQJobsByProcessors = {}
        tqIDList = list(taskQueueDict)
        tqIDListByProcessors = {}
        for tq in taskQueueDict:
            if 'Tags' not in taskQueueDict[tq]:
                # skip non multiprocessor tqs
                continue
            for tag in taskQueueDict[tq]['Tags']:
                if tag in processorTags:
                    tqIDListByProcessors.setdefault(tag, [])
                    tqIDListByProcessors[tag].append(tq)
                    totalTQJobsByProcessors.setdefault(tag, 0)
                    totalTQJobsByProcessors[tag] += taskQueueDict[tq]['Jobs']
            totalTQJobs += taskQueueDict[tq]['Jobs']
        self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue'
                         % (totalTQJobs, len(tqIDList), queue))

        queueSubmittedPilots = 0
        for tag in tqIDListByProcessors:
            self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" % (tag, tqIDListByProcessors[tag]))
            processors = 1
            m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
            if m:
                processors = int(m.group('processors'))
            if tag == 'WholeNode':
                processors = -1
            tagTQJobs = totalTQJobsByProcessors[tag]
            tagTqIDList = tqIDListByProcessors[tag]

            # Get the number of already waiting pilots for these task queues
            tagWaitingPilots = 0
            if self.pilotWaitingFlag:
                lastUpdateTime = dateTime() - self.pilotWaitingTime * second
                result = pilotAgentsDB.countPilots({'TaskQueueID': tagTqIDList,
                                                    'Status': WAITING_PILOT_STATUS},
                                                   None, lastUpdateTime)
                if not result['OK']:
                    self.log.error('Failed to get Number of Waiting pilots', result['Message'])
                    tagWaitingPilots = 0
                else:
                    tagWaitingPilots = result['Value']
                    self.log.verbose('Waiting Pilots for TaskQueue %s:' % tagTqIDList, tagWaitingPilots)
            if tagWaitingPilots >= tagTQJobs:
                self.log.verbose("%d waiting pilots already for all the available jobs" % tagWaitingPilots)
                continue
            self.log.verbose("%d waiting pilots for the total of %d eligible jobs for %s"
                             % (tagWaitingPilots, tagTQJobs, queue))

            # Get the working proxy
            cpuTime = queueCPUTime + 86400
            self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            totalSlots = self.getQueueSlots(queue, False)
            if totalSlots == 0:
                self.log.debug('%s: No slots available' % queue)
                continue

            # Note: comparing slots to job numbers is not accurate in multiprocessor case.
            #       This could lead to over submission.
            pilotsToSubmit = max(0, min(totalSlots, tagTQJobs - tagWaitingPilots))
            self.log.info('%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d'
                          % (queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit))

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit - queueSubmittedPilots, pilotsToSubmit)
            while pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))
                bundleProxy = self.queueDict[queue].get('BundleProxy', False)
                jobExecDir = ''
                jobExecDir = self.queueDict[queue]['ParametersDict'].get('JobExecDir', jobExecDir)
                executable, pilotSubmissionChunk = self.getExecutable(queue, pilotsToSubmit,
                                                                      bundleProxy=bundleProxy,
                                                                      jobExecDir=jobExecDir,
                                                                      processors=processors)
                result = ce.submitJob(executable, '', pilotSubmissionChunk, processors=processors)
                # ## FIXME: The condor thing only transfers the file with some
                # ## delay, so when we unlink here the script is gone
                # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
                if ceType != 'HTCondorCE':
                    os.unlink(executable)
                if not result['OK']:
                    self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
                    pilotsToSubmit = 0
                    self.failedQueues[queue] += 1
                    continue
                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                queueSubmittedPilots += pilotSubmissionChunk

                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result['Value']
                self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
                totalSubmittedPilots += len(pilotList)
                self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
                stampDict = {}
                if 'PilotStampDict' in result:
                    stampDict = result['PilotStampDict']
                tqPriorityList = []
                sumPriority = 0.
                for tq in tagTqIDList:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))
                rndm = random.random() * sumPriority
                tqDict = {}
                for pilotID in pilotList:
                    # Pick a task queue at random, weighted by priority
                    rndm = random.random() * sumPriority
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    if tqID not in tqDict:
                        tqDict[tqID] = []
                    tqDict[tqID].append(pilotID)

                for tqID, pilotList in tqDict.items():
                    result = pilotAgentsDB.addPilotTQReference(pilotList, tqID,
                                                               self.pilotDN, self.pilotGroup,
                                                               self.localhost, ceType, stampDict)
                    if not result['OK']:
                        self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
                        continue
                    for pilot in pilotList:
                        result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                              'Successfully submitted by the SiteDirector',
                                                              siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: ', result['Message'])
                            continue
    self.log.info("%d pilots submitted in total in this cycle, %d matched queues"
                  % (totalSubmittedPilots, matchedQueues))
    return S_OK()
def main():
    """Sort the site names under '/Resources/Sites' in the CS and commit the result.

    Requires a valid proxy belonging to a group with the CSAdministrator
    property.  Exits with code 2 on any error, 0 otherwise.
    """
    Script.registerSwitch(
        "C", "country",
        "Sort site names by country postfix (i.e. LCG.IHEP.cn, LCG.IN2P3.fr, LCG.IHEP.su)",
        sortBy)
    Script.registerSwitch("R", "reverse", "Reverse the sort order", isReverse)
    # Registering arguments will automatically add their description to the help menu
    Script.registerArgument(
        ["Section: Name of the subsection in '/Resources/Sites/' for sort (i.e. LCG DIRAC)"],
        mandatory=False)
    Script.parseCommandLine(ignoreErrors=True)
    args = Script.getPositionalArgs()

    # A valid, unexpired proxy is needed to talk to the CS
    result = getProxyInfo()
    if not result["OK"]:
        gLogger.error("Failed to get proxy information", result["Message"])
        DIRACExit(2)
    proxy = result["Value"]
    if proxy["secondsLeft"] < 1:
        gLogger.error("Your proxy has expired, please create new one")
        DIRACExit(2)
    group = proxy["group"]
    if "CSAdministrator" not in getPropertiesForGroup(group):
        gLogger.error("You must be CSAdministrator user to execute this script")
        gLogger.notice("Please issue 'dirac-proxy-init -g [group with CSAdministrator Property]'")
        DIRACExit(2)

    cs = CSAPI()
    result = cs.getCurrentCFG()
    if not result["OK"]:
        gLogger.error("Failed to get copy of CS", result["Message"])
        DIRACExit(2)
    cfg = result["Value"]
    if not cfg.isSection("Resources"):
        gLogger.error("Section '/Resources' is absent in CS")
        DIRACExit(2)
    if not cfg.isSection("Resources/Sites"):
        gLogger.error("Subsection '/Resources/Sites' is absent in CS")
        DIRACExit(2)

    # Positional arguments select the subsections to sort; default is all of them
    if args:
        resultList = args[:]
    else:
        resultList = cfg["Resources"]["Sites"].listSections()

    hasRun = False
    isDirty = False
    for i in resultList:
        if not cfg.isSection("Resources/Sites/%s" % i):
            # Fixed message grammar ("does not exists" -> "does not exist")
            gLogger.error("Subsection /Resources/Sites/%s does not exist" % i)
            continue
        hasRun = True
        if SORTBYNAME:
            dirty = cfg["Resources"]["Sites"][i].sortAlphabetically(ascending=not REVERSE)
        else:
            dirty = cfg["Resources"]["Sites"][i].sortByKey(key=country, reverse=REVERSE)
        if dirty:
            isDirty = True

    if not hasRun:
        gLogger.notice("Failed to find suitable subsections with site names to sort")
        DIRACExit(0)
    if not isDirty:
        gLogger.notice("Nothing to do, site names are already sorted")
        DIRACExit(0)

    timestamp = toString(dateTime())
    stamp = "Site names are sorted by %s script at %s" % (Script.scriptName, timestamp)
    cs.setOptionComment("/Resources/Sites", stamp)
    result = cs.commit()
    if not result["OK"]:
        gLogger.error("Failed to commit changes to CS", result["Message"])
        DIRACExit(2)
    gLogger.notice("Site names are sorted and committed to CS")
    DIRACExit(0)
def submitJobs(self):
    """Go through defined computing elements and submit pilot jobs if necessary.

    Workflow: global Matcher check for eligible work; per queue: failure
    back-off, site mask / workload checks, task queue matching by platform,
    then chunked pilot submission with registration in the PilotAgentsDB
    proportionally to task queue priorities.

    :return: S_OK() on success (individual queue failures are logged, counted
             in self.failedQueues and skipped), or an error structure on
             fatal errors
    """
    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups
    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tqDict['Tag'] = []
    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # Collect the sites that have matching workload and the test sites
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
        if "Sites" in result['Value'][tqID]:
            for site in result['Value'][tqID]['Sites']:
                if site.lower() != 'any':
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in result['Value'][tqID]:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        testSites.add(site)
        totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = list(result['Value'])
    result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                        'Status': WAITING_PILOT_STATUS}, None)
    totalWaitingPilots = 0
    if result['OK']:
        totalWaitingPilots = result['Value']
    self.log.info('Total %d jobs in %d task queues with %d waiting pilots'
                  % (totalWaitingJobs, len(tqIDList), totalWaitingPilots))
    # if totalWaitingPilots >= totalWaitingJobs:
    #   self.log.info( 'No more pilots to be submitted in this cycle' )
    #   return S_OK()

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
        return S_ERROR('Can not get the site mask')
    siteMaskList = result['Value']

    # list() is required: random.shuffle() needs a mutable sequence and
    # dict.keys() returns a view on Python 3
    queues = list(self.queueDict)
    random.shuffle(queues)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

        # Check if the queue failed previously
        failedCount = self.failedQueues.setdefault(queue, 0) % self.failedQueueCycleFactor
        if failedCount != 0:
            self.log.warn("%s queue failed recently, skipping %d cycles" % (queue, 10 - failedCount))
            self.failedQueues[queue] += 1
            continue

        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        ceType = self.queueDict[queue]['CEType']
        queueName = self.queueDict[queue]['QueueName']
        siteName = self.queueDict[queue]['Site']
        platform = self.queueDict[queue]['Platform']
        siteMask = siteName in siteMaskList

        if not anySite and siteName not in jobSites:
            self.log.verbose("Skipping queue %s at %s: no workload expected" % (queueName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s at site %s not in the mask" % (queueName, siteName))
            continue

        if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
            queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
        else:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        # if not siteMask and 'Site' in ceDict:
        #   self.log.info( 'Site not in the mask %s' % siteName )
        #   self.log.info( 'Removing "Site" from matching Dict' )
        #   del ceDict[ 'Site' ]
        if not siteMask:
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups
        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools
        result = Resources.getCompatiblePlatforms(platform)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found for %s' % queue)
            continue
        matchedQueues += 1

        totalTQJobs = 0
        tqIDList = list(taskQueueDict)
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]['Jobs']
        self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue'
                         % (totalTQJobs, len(tqIDList), queue))

        # Get the number of already waiting pilots for these task queues
        totalWaitingPilots = 0
        if self.pilotWaitingFlag:
            lastUpdateTime = dateTime() - self.pilotWaitingTime * second
            result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                                'Status': WAITING_PILOT_STATUS},
                                               None, lastUpdateTime)
            if not result['OK']:
                self.log.error('Failed to get Number of Waiting pilots', result['Message'])
                totalWaitingPilots = 0
            else:
                totalWaitingPilots = result['Value']
                self.log.verbose('Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots)
        if totalWaitingPilots >= totalTQJobs:
            self.log.verbose("%d waiting pilots already for all the available jobs" % totalWaitingPilots)
            continue
        self.log.verbose("%d waiting pilots for the total of %d eligible jobs for %s"
                         % (totalWaitingPilots, totalTQJobs, queue))

        # Get the working proxy
        cpuTime = queueCPUTime + 86400
        self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
        if not result['OK']:
            return result
        self.proxy = result['Value']
        ce.setProxy(self.proxy, cpuTime - 60)

        # Get the number of available slots on the target site/queue
        totalSlots = self.__getQueueSlots(queue)
        if totalSlots == 0:
            self.log.debug('%s: No slots available' % queue)
            continue

        pilotsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingPilots))
        self.log.info('%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d'
                      % (queue, totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit))

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)
        while pilotsToSubmit > 0:
            self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))
            bundleProxy = self.queueDict[queue].get('BundleProxy', False)
            jobExecDir = ''
            if ceType == 'CREAM':
                jobExecDir = '.'
            jobExecDir = self.queueDict[queue].get('JobExecDir', jobExecDir)
            httpProxy = self.queueDict[queue].get('HttpProxy', '')
            result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir)
            if not result['OK']:
                return result
            executable, pilotSubmissionChunk = result['Value']
            result = ce.submitJob(executable, '', pilotSubmissionChunk)
            os.unlink(executable)
            if not result['OK']:
                self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
                pilotsToSubmit = 0
                self.failedQueues[queue] += 1
                continue
            pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk

            # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
            # task queue priorities
            pilotList = result['Value']
            self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
            totalSubmittedPilots += len(pilotList)
            self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
            stampDict = {}
            # dict.has_key() was removed in Python 3; use the "in" operator
            if 'PilotStampDict' in result:
                stampDict = result['PilotStampDict']
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append((tq, sumPriority))
            rndm = random.random() * sumPriority
            tqDict = {}
            for pilotID in pilotList:
                # Pick a task queue at random, weighted by priority
                rndm = random.random() * sumPriority
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                tqDict.setdefault(tqID, []).append(pilotID)

            for tqID, pilotList in tqDict.items():
                result = pilotAgentsDB.addPilotTQReference(pilotList, tqID,
                                                           self.pilotDN, self.pilotGroup,
                                                           self.localhost, ceType, '', stampDict)
                if not result['OK']:
                    self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
                    continue
                for pilot in pilotList:
                    result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                          'Successfully submitted by the SiteDirector',
                                                          siteName, queueName)
                    if not result['OK']:
                        self.log.error('Failed to set pilot status: ', result['Message'])
                        continue
    self.log.info("%d pilots submitted in total in this cycle, %d matched queues"
                  % (totalSubmittedPilots, matchedQueues))
    return S_OK()
def execute(self):
    """The main agent execution method.

    Fans out per-job checks to the agent's thread pool in three phases
    (mark Stalled, fail Stalled, send accounting), waits for them all,
    then runs two serial clean-up steps. Always returns S_OK(); every
    per-phase failure is only logged.
    """
    futures = []

    def _enqueue(jobIDs, methodName):
        # All three phases submit per-job tasks identically: the payload
        # string "<jobID>:<method>" tells self._execute what to run.
        for jobID in jobIDs:
            futures.append(self.threadPoolExecutor.submit(self._execute, "%s:%s" % (jobID, methodName)))

    # 1) Queueing the jobs that might be marked Stalled:
    # jobs Running/Completing whose HeartBeat is older than the stalledTime.
    # This is the minimum time we wait for declaring a job Stalled, therefore it is safe
    checkTime = dateTime() - self.stalledTime * second
    checkedStatuses = [JobStatus.RUNNING, JobStatus.COMPLETING]
    result = self.jobDB.selectJobs({"Status": checkedStatuses}, older=checkTime, timeStamp="HeartBeatTime")
    if not result["OK"]:
        # An S_ERROR dict has no "Value" key, so do NOT fall through to the
        # result["Value"] check below (that raised KeyError before).
        self.log.error("Issue selecting %s jobs" % " & ".join(checkedStatuses), result["Message"])
    elif result["Value"]:
        jobs = sorted(result["Value"])
        self.log.info(
            "%s jobs will be checked for being stalled" % " & ".join(checkedStatuses),
            "(n=%d, heartbeat before %s)" % (len(jobs), str(checkTime)),
        )
        _enqueue(jobs, "_markStalledJobs")

    # 2) fail Stalled Jobs
    result = self.jobDB.selectJobs({"Status": JobStatus.STALLED})
    if not result["OK"]:
        self.log.error("Issue selecting Stalled jobs", result["Message"])
    elif result["Value"]:
        jobs = sorted(result["Value"])
        self.log.info("Jobs Stalled will be checked for failure", "(n=%d)" % len(jobs))
        _enqueue(jobs, "_failStalledJobs")

    # 3) Send accounting for stalled-failed jobs not yet accounted
    for minor in self.minorStalledStatuses:
        result = self.jobDB.selectJobs({"Status": JobStatus.FAILED, "MinorStatus": minor, "AccountedFlag": "False"})
        if not result["OK"]:
            self.log.error("Issue selecting jobs for accounting", result["Message"])
        elif result["Value"]:
            jobs = result["Value"]
            self.log.info("Stalled jobs will be Accounted", "(n=%d)" % (len(jobs)))
            _enqueue(jobs, "_sendAccounting")

    # Wait for every threaded check; worker exceptions are logged, not fatal
    for future in concurrent.futures.as_completed(futures):
        try:
            future.result()
        except Exception as exc:
            self.log.error("_execute generated an exception: %s" % exc)

    # From here on we don't use the threads

    # 4) Fail submitting jobs
    result = self._failSubmittingJobs()
    if not result["OK"]:
        self.log.error("Failed to process jobs being submitted", result["Message"])

    # 5) Kick stuck jobs
    result = self._kickStuckJobs()
    if not result["OK"]:
        self.log.error("Failed to kick stuck jobs", result["Message"])

    return S_OK()
def updatePilotStatus( self ):
  """ Update status of pilots in transient states.

      First pass: for every configured queue, query the CE for the status of
      pilots currently in a transient state and record any change in the
      PilotAgentsDB, retrieving output for pilots that reached a final state.
      Second pass: fetch output and send accounting for already-final pilots.

      :return: S_OK() on completion; S_ERROR only if a pilot proxy cannot be obtained
  """
  for queue in self.queueDict:
    ce = self.queueDict[queue]['CE']
    ceName = self.queueDict[queue]['CEName']
    queueName = self.queueDict[queue]['QueueName']
    ceType = self.queueDict[queue]['CEType']
    siteName = self.queueDict[queue]['Site']
    abortedPilots = 0
    # Select only this director's pilots (DN/group) in transient states for this queue
    result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                          'Queue':queueName,
                                          'GridType':ceType,
                                          'GridSite':siteName,
                                          'Status':TRANSIENT_PILOT_STATUS,
                                          'OwnerDN': self.pilotDN,
                                          'OwnerGroup': self.pilotGroup } )
    if not result['OK']:
      self.log.error( 'Failed to select pilots: %s' % result['Message'] )
      continue
    pilotRefs = result['Value']
    if not pilotRefs:
      continue
    result = pilotAgentsDB.getPilotInfo( pilotRefs )
    if not result['OK']:
      self.log.error( 'Failed to get pilots info from DB', result['Message'] )
      continue
    pilotDict = result['Value']
    # Build "<ref>:::<stamp>" identifiers for the CE query.
    # NOTE(review): as soon as ONE pilot has no stamp, the whole list is
    # replaced by the plain refs and the loop breaks — presumably the CE
    # cannot mix stamped and unstamped refs; confirm before changing.
    stampedPilotRefs = []
    for pRef in pilotDict:
      if pilotDict[pRef]['PilotStamp']:
        stampedPilotRefs.append( pRef + ":::" + pilotDict[pRef]['PilotStamp'] )
      else:
        stampedPilotRefs = list( pilotRefs )
        break
    # Renew the pilot proxy on the CE if it is no longer valid
    result = ce.isProxyValid()
    if not result['OK']:
      result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 23400 )
      if not result['OK']:
        return result
      self.proxy = result['Value']
      ce.setProxy( self.proxy, 23300 )
    result = ce.getJobStatus( stampedPilotRefs )
    if not result['OK']:
      self.log.error( 'Failed to get pilots status from CE', '%s: %s' % ( ceName, result['Message'] ) )
      continue
    pilotCEDict = result['Value']
    # Reconcile DB status with the status reported by the CE
    for pRef in pilotRefs:
      newStatus = ''
      oldStatus = pilotDict[pRef]['Status']
      ceStatus = pilotCEDict[pRef]
      lastUpdateTime = pilotDict[pRef]['LastUpdateTime']
      sinceLastUpdate = dateTime() - lastUpdateTime
      if oldStatus == ceStatus and ceStatus != "Unknown":
        # Normal status did not change, continue
        continue
      elif ceStatus == "Unknown" and oldStatus == "Unknown":
        if sinceLastUpdate < 3600*second:
          # Allow 1 hour of Unknown status assuming temporary problems on the CE
          continue
        else:
          # Unknown for more than an hour: give the pilot up as Aborted
          newStatus = 'Aborted'
      elif ceStatus == "Unknown" and not oldStatus in FINAL_PILOT_STATUS:
        # Possible problems on the CE, let's keep the Unknown status for a while
        newStatus = 'Unknown'
      elif ceStatus != 'Unknown' :
        # Update the pilot status to the new value
        newStatus = ceStatus
      if newStatus:
        self.log.info( 'Updating status to %s for pilot %s' % ( newStatus, pRef ) )
        result = pilotAgentsDB.setPilotStatus( pRef, newStatus, '', 'Updated by SiteDirector' )
        if newStatus == "Aborted":
          abortedPilots += 1
      # Retrieve the pilot output now, if it just reached a final state
      if newStatus in FINAL_PILOT_STATUS:
        if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
          self.log.info( 'Retrieving output for pilot %s' % pRef )
          pilotStamp = pilotDict[pRef]['PilotStamp']
          pRefStamp = pRef
          if pilotStamp:
            pRefStamp = pRef + ':::' + pilotStamp
          result = ce.getJobOutput( pRefStamp )
          if not result['OK']:
            self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
          else:
            output, error = result['Value']
            if output:
              result = pilotAgentsDB.storePilotOutput( pRef, output, error )
              if not result['OK']:
                self.log.error( 'Failed to store pilot output', result['Message'] )
            else:
              self.log.warn( 'Empty pilot output not stored to PilotDB' )
    # If something wrong in the queue, make a pause for the job submission
    if abortedPilots:
      self.failedQueues[queue] += 1

  # Second pass: the pilot can be in Done state set by the job agent;
  # check if the output is retrieved and whether accounting must be sent
  for queue in self.queueDict:
    ce = self.queueDict[queue]['CE']
    if not ce.isProxyValid( 120 ):
      result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 1000 )
      if not result['OK']:
        return result
      ce.setProxy( self.proxy, 940 )
    ceName = self.queueDict[queue]['CEName']
    queueName = self.queueDict[queue]['QueueName']
    ceType = self.queueDict[queue]['CEType']
    siteName = self.queueDict[queue]['Site']
    # Final-state pilots whose output has not been fetched yet
    result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                          'Queue':queueName,
                                          'GridType':ceType,
                                          'GridSite':siteName,
                                          'OutputReady':'False',
                                          'Status':FINAL_PILOT_STATUS} )
    if not result['OK']:
      self.log.error( 'Failed to select pilots', result['Message'] )
      continue
    pilotRefs = result['Value']
    if not pilotRefs:
      continue
    result = pilotAgentsDB.getPilotInfo( pilotRefs )
    if not result['OK']:
      self.log.error( 'Failed to get pilots info from DB', result['Message'] )
      continue
    pilotDict = result['Value']
    if self.getOutput:
      for pRef in pilotRefs:
        self.log.info( 'Retrieving output for pilot %s' % pRef )
        pilotStamp = pilotDict[pRef]['PilotStamp']
        pRefStamp = pRef
        if pilotStamp:
          pRefStamp = pRef + ':::' + pilotStamp
        result = ce.getJobOutput( pRefStamp )
        if not result['OK']:
          self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) )
        else:
          output, error = result['Value']
          result = pilotAgentsDB.storePilotOutput( pRef, output, error )
          if not result['OK']:
            self.log.error( 'Failed to store pilot output', result['Message'] )
    # Check if the accounting is to be sent
    if self.sendAccounting:
      result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName,
                                            'Queue':queueName,
                                            'GridType':ceType,
                                            'GridSite':siteName,
                                            'AccountingSent':'False',
                                            'Status':FINAL_PILOT_STATUS} )
      if not result['OK']:
        self.log.error( 'Failed to select pilots', result['Message'] )
        continue
      pilotRefs = result['Value']
      if not pilotRefs:
        continue
      result = pilotAgentsDB.getPilotInfo( pilotRefs )
      if not result['OK']:
        self.log.error( 'Failed to get pilots info from DB', result['Message'] )
        continue
      pilotDict = result['Value']
      result = self.sendPilotAccounting( pilotDict )
      if not result['OK']:
        self.log.error( 'Failed to send pilot agent accounting' )
  return S_OK()
# Sort every requested site subsection in place. `hasRun` / `isDirty`
# are expected to be initialised by the preceding setup code.
for site in resultList:
    if not cfg.isSection("Resources/Sites/%s" % site):
        gLogger.error("Subsection /Resources/Sites/%s does not exists" % site)
        continue
    hasRun = True
    section = cfg["Resources"]["Sites"][site]
    if SORTBYNAME:
        changed = section.sortAlphabetically(ascending=not REVERSE)
    else:
        changed = section.sortByKey(key=country, reverse=REVERSE)
    if changed:
        isDirty = True

# No matching subsection at all: nothing was even attempted
if not hasRun:
    gLogger.notice("Failed to find suitable subsections with site names to sort")
    DIRAC.exit(0)

# Everything was already in order: nothing to commit
if not isDirty:
    gLogger.notice("Nothing to do, site names are already sorted")
    DIRAC.exit(0)

# Record who sorted and when, then commit the modified configuration
stamp = "Site names are sorted by %s script at %s" % (Script.scriptName, toString(dateTime()))
cs.setOptionComment("/Resources/Sites", stamp)
result = cs.commit()
if not result["OK"]:
    gLogger.error("Failed to commit changes to CS", result["Message"])
    DIRAC.exit(2)
gLogger.notice("Site names are sorted and committed to CS")
DIRAC.exit(0)
def submitJobs( self ):
  """ Go through defined computing elements and submit jobs if necessary.

      For each eligible queue: obtain a pilot proxy, check queue availability,
      match the queue parameters against the TaskQueueDB, submit up to
      maxPilotsToSubmit pilots, and register them in the PilotAgentsDB with a
      TaskQueue assignment proportional to the task queue priorities.

      :return: S_OK() on normal completion; S_ERROR on fatal failures
               (platform resolution, matcher errors, proxy retrieval,
               executable generation)
  """
  # Check that there is some work at all
  setup = CSGlobals.getSetup()
  tqDict = { 'Setup':setup,
             'CPUTime': 9999999,
             'SubmitPool' : self.defaultSubmitPools }
  if self.vo:
    tqDict['Community'] = self.vo
  if self.voGroups:
    tqDict['OwnerGroup'] = self.voGroups
  result = Resources.getCompatiblePlatforms( self.platforms )
  if not result['OK']:
    return result
  tqDict['Platform'] = result['Value']
  tqDict['Site'] = self.sites
  self.log.verbose( 'Checking overall TQ availability with requirements' )
  self.log.verbose( tqDict )

  rpcMatcher = RPCClient( "WorkloadManagement/Matcher" )
  result = rpcMatcher.getMatchingTaskQueues( tqDict )
  if not result[ 'OK' ]:
    return result
  if not result['Value']:
    self.log.verbose( 'No Waiting jobs suitable for the director' )
    return S_OK()

  # list() so random.shuffle works on a dict-keys view as well
  queues = list( self.queueDict.keys() )
  random.shuffle( queues )
  for queue in queues:
    ce = self.queueDict[queue]['CE']
    ceName = self.queueDict[queue]['CEName']
    ceType = self.queueDict[queue]['CEType']
    queueName = self.queueDict[queue]['QueueName']
    siteName = self.queueDict[queue]['Site']
    siteMask = self.siteStatus.isUsableSite( siteName, 'ComputingAccess' )
    platform = self.queueDict[queue]['Platform']

    if 'CPUTime' in self.queueDict[queue]['ParametersDict'] :
      queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] )
    else:
      self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue )
      continue
    if queueCPUTime > self.maxQueueLength:
      queueCPUTime = self.maxQueueLength

    # Get the working proxy: queue length plus one extra day
    cpuTime = queueCPUTime + 86400
    self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) )
    result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime )
    if not result['OK']:
      return result
    self.proxy = result['Value']
    ce.setProxy( self.proxy, cpuTime - 60 )

    # Get the number of available slots on the target site/queue
    result = ce.available()
    if not result['OK']:
      self.log.warn( 'Failed to check the availability of queue %s: \n%s' % ( queue, result['Message'] ) )
      continue
    ceInfoDict = result['CEInfoDict']
    self.log.info( "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" % \
                   ( ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                     ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs'] ) )
    totalSlots = result['Value']

    ceDict = ce.getParameterDict()
    ceDict[ 'GridCE' ] = ceName
    if not siteMask and 'Site' in ceDict:
      self.log.info( 'Site not in the mask %s' % siteName )
      self.log.info( 'Removing "Site" from matching Dict' )
      del ceDict[ 'Site' ]
    if self.vo:
      ceDict['Community'] = self.vo
    if self.voGroups:
      ceDict['OwnerGroup'] = self.voGroups
    # This is a hack to get rid of !
    ceDict['SubmitPool'] = self.defaultSubmitPools
    result = Resources.getCompatiblePlatforms( platform )
    if not result['OK']:
      continue
    ceDict['Platform'] = result['Value']

    # Get the number of eligible jobs for the target site/queue
    result = rpcMatcher.getMatchingTaskQueues( ceDict )
    if not result['OK']:
      self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
      return result
    taskQueueDict = result['Value']
    if not taskQueueDict:
      self.log.info( 'No matching TQs found' )
      continue
    totalTQJobs = 0
    tqIDList = list( taskQueueDict.keys() )
    for tq in taskQueueDict:
      totalTQJobs += taskQueueDict[tq]['Jobs']

    # Get the number of already waiting pilots for these task queues
    totalWaitingPilots = 0
    if self.pilotWaitingFlag:
      lastUpdateTime = dateTime() - self.pilotWaitingTime * second
      result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList,
                                            'Status': WAITING_PILOT_STATUS },
                                          None, lastUpdateTime )
      if not result['OK']:
        self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] )
        totalWaitingPilots = 0
      else:
        totalWaitingPilots = result['Value']
        self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots )

    pilotsToSubmit = max( 0, min( totalSlots, totalTQJobs - totalWaitingPilots ) )
    self.log.info( 'Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' % \
                   ( totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) )
    # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
    pilotsToSubmit = min( self.maxPilotsToSubmit, pilotsToSubmit )

    while pilotsToSubmit > 0:
      self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) )

      bundleProxy = self.queueDict[queue].get( 'BundleProxy', False )
      jobExecDir = ''
      if ceType == 'CREAM':
        jobExecDir = '.'
      jobExecDir = self.queueDict[queue].get( 'JobExecDir', jobExecDir )
      httpProxy = self.queueDict[queue].get( 'HttpProxy', '' )

      result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir )
      if not result['OK']:
        return result

      executable, pilotSubmissionChunk = result['Value']
      result = ce.submitJob( executable, '', pilotSubmissionChunk )
      os.unlink( executable )
      if not result['OK']:
        self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] )
        pilotsToSubmit = 0
        continue

      pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk

      # Add pilots to the PilotAgentsDB, assigning pilots to TaskQueues
      # proportionally to the task queue priorities
      pilotList = result['Value']
      self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) )
      # dict.get instead of the deprecated/removed dict.has_key
      stampDict = result.get( 'PilotStampDict', {} )

      # Build the cumulative priority list for the proportional random draw
      tqPriorityList = []
      sumPriority = 0.
      for tq in taskQueueDict:
        sumPriority += taskQueueDict[tq]['Priority']
        tqPriorityList.append( ( tq, sumPriority ) )

      tqDict = {}
      for pilotID in pilotList:
        # A fresh random point in [0, sumPriority) for every pilot
        rndm = random.random() * sumPriority
        # Fallback to the last TQ: if all priorities are 0, rndm == 0. and
        # no cumulative bound satisfies rndm < prio (tqID was unbound before)
        tqID = tqPriorityList[-1][0]
        for tq, prio in tqPriorityList:
          if rndm < prio:
            tqID = tq
            break
        tqDict.setdefault( tqID, [] ).append( pilotID )

      # tqPilotList: do not shadow pilotList from the submission result above
      for tqID, tqPilotList in tqDict.items():
        result = pilotAgentsDB.addPilotTQReference( tqPilotList, tqID, self.pilotDN, self.pilotGroup,
                                                    self.localhost, ceType, '', stampDict )
        if not result['OK']:
          self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message'] )
          continue
        for pilot in tqPilotList:
          result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName,
                                                 'Successfully submitted by the SiteDirector',
                                                 siteName, queueName )
          if not result['OK']:
            self.log.error( 'Failed to set pilot status: ', result['Message'] )
            continue
  return S_OK()