def _checkLoggingInfo(self, jobID, jobDict):
    """Work out the execution start and end times of a job.

    The start time is ``jobDict["StartExecTime"]``. When that is empty or the
    string "None", the time stamp of the first "Running" logging record is
    used, and failing that ``jobDict["SubmissionTime"]``. The end time is the
    time stamp of the last "Stalled" logging record, or the current time when
    there is no such record. Time stamps that cannot be parsed are reported
    and replaced by the current time.

    :param jobID: job identifier passed to ``self.logDB.getJobLoggingInfo``
    :param dict jobDict: job attributes; "StartExecTime" and "SubmissionTime" are read
    :return: tuple (startTime, endTime)
    """
    logList = []
    result = self.logDB.getJobLoggingInfo(jobID)
    if result["OK"]:
        logList = result["Value"]

    startTime = jobDict["StartExecTime"]
    if not startTime or startTime == "None":
        # Logging records are (status, minor, app, stime, source)
        for items in logList:
            if items[0] == "Running":
                startTime = items[3]
                break
        if not startTime or startTime == "None":
            startTime = jobDict["SubmissionTime"]

    if isinstance(startTime, six.string_types):
        rawStartTime = startTime
        startTime = fromString(rawStartTime)
        if startTime is None:
            # Report the value that actually failed to parse. The loop
            # variable may be unbound (empty log list) or point at an
            # unrelated record.
            self.log.error("Wrong timestamp in DB", rawStartTime)
            startTime = dateTime()

    endTime = dateTime()
    rawEndTime = None
    # No break on purpose: the last "Stalled" record wins
    for items in logList:
        if items[0] == "Stalled":
            rawEndTime = items[3]
            endTime = fromString(rawEndTime)
    if endTime is None:
        self.log.error("Wrong timestamp in DB", rawEndTime)
        endTime = dateTime()

    return startTime, endTime
def __sendAccounting(ftsJob, ownerDN):
    """Prepare a DataOperation accounting record for an FTS job and commit it.

    :param ftsJob: FTS job to account for; its SubmitTime, LastUpdate, Size,
        Status, SourceSE and TargetSE attributes are read and its files are
        iterated to count the ones in "Finished" status
    :param ownerDN: DN of the job owner; used verbatim as the user name when
        getUsernameForDN cannot resolve it
    """
    record = DataOperation()
    record.setStartTime(fromString(ftsJob.SubmitTime))
    record.setEndTime(fromString(ftsJob.LastUpdate))

    lookup = getUsernameForDN(ownerDN)
    owner = lookup["Value"] if lookup["OK"] else ownerDN

    values = {
        "OperationType": "ReplicateAndRegister",
        "User": owner,
        "Protocol": "FTS",
        "TransferOK": sum(1 for ftsFile in ftsJob if ftsFile.Status == "Finished"),
        "TransferTotal": len(ftsJob),
        "TransferSize": ftsJob.Size,
        "FinalStatus": ftsJob.Status,
        "Source": ftsJob.SourceSE,
        "Destination": ftsJob.TargetSE,
    }

    # Whole seconds elapsed between submission and the last update
    elapsed = ftsJob.LastUpdate - ftsJob.SubmitTime
    values["TransferTime"] = elapsed.days * 86400 + elapsed.seconds

    record.setValuesFromDict(values)
    record.commit()
def __checkLoggingInfo(self, jobID, jobDict):
    """ Get the execution start and end times of a job from JobLogging

    The start time is ``jobDict['StartExecTime']``. When that is empty or the
    string 'None', the time stamp of the first 'Running' logging record is
    used, and failing that ``jobDict['SubmissionTime']``. The end time is the
    time stamp of the last 'Stalled' logging record, or the current time when
    there is none. Unparsable time stamps are reported and replaced by the
    current time.

    :param jobID: job identifier passed to ``self.logDB.getJobLoggingInfo``
    :param dict jobDict: job attributes; 'StartExecTime' and 'SubmissionTime' are read
    :return: tuple (startTime, endTime)
    """
    logList = []
    result = self.logDB.getJobLoggingInfo(jobID)
    if result['OK']:
        logList = result['Value']

    startTime = jobDict['StartExecTime']
    if not startTime or startTime == 'None':
        # Logging records are (status, minor, app, stime, source)
        for items in logList:
            if items[0] == 'Running':
                startTime = items[3]
                break
        if not startTime or startTime == 'None':
            startTime = jobDict['SubmissionTime']

    if isinstance(startTime, types.StringTypes):
        rawStartTime = startTime
        startTime = fromString(rawStartTime)
        if startTime is None:
            # Report the value that actually failed to parse. The loop
            # variable may be unbound (empty log list) or point at an
            # unrelated record.
            self.log.error('Wrong timestamp in DB', rawStartTime)
            startTime = dateTime()

    endTime = dateTime()
    rawEndTime = None
    # No break on purpose: the last 'Stalled' record wins
    for items in logList:
        if items[0] == 'Stalled':
            rawEndTime = items[3]
            endTime = fromString(rawEndTime)
    if endTime is None:
        self.log.error('Wrong timestamp in DB', rawEndTime)
        endTime = dateTime()

    return startTime, endTime
def __getLatestUpdateTime(self, job):
    """ Return the later of the job's HeartBeatTime and LastUpdateTime

    :param job: job identifier passed to ``self.jobDB.getJobAttributes``
    :return: S_OK(epoch value) or S_ERROR when the attributes cannot be
             retrieved or both time stamps are null
    """
    result = self.jobDB.getJobAttributes(job, ['HeartBeatTime', 'LastUpdateTime'])
    if not result['OK']:
        self.log.error('Failed to get job attributes', result['Message'])
    if not (result['OK'] and result['Value']):
        self.log.error('Could not get attributes for job', '%s' % job)
        return S_ERROR('Could not get attributes for job')

    self.log.verbose(result)
    attributes = result['Value']

    # A missing value may come back either empty or as the string 'None'
    heartBeat = attributes['HeartBeatTime']
    if heartBeat and heartBeat != 'None':
        latestUpdate = toEpoch(fromString(heartBeat))
    else:
        self.log.verbose('HeartBeatTime is null for job %s' % job)
        latestUpdate = 0

    lastUpdateTime = attributes['LastUpdateTime']
    if lastUpdateTime and lastUpdateTime != 'None':
        latestUpdate = max(latestUpdate, toEpoch(fromString(lastUpdateTime)))
    else:
        self.log.verbose('LastUpdateTime is null for job %s' % job)

    if latestUpdate:
        self.log.verbose('Latest update time from epoch for job %s is %s' % (job, latestUpdate))
        return S_OK(latestUpdate)
    return S_ERROR('LastUpdate and HeartBeat times are null for job %s' % job)
def __checkLoggingInfo(self, jobID, jobDict):
    """ Get the execution start and end times of a job from JobLogging

    The start time is ``jobDict['StartExecTime']``. When that is empty or the
    string 'None', the time stamp of the first 'Running' logging record is
    used, and failing that ``jobDict['SubmissionTime']``. The end time is the
    time stamp of the last 'Stalled' logging record, or the current time when
    there is none. Unparsable time stamps are reported and replaced by the
    current time.

    :param jobID: job identifier passed to ``self.logDB.getJobLoggingInfo``
    :param dict jobDict: job attributes; 'StartExecTime' and 'SubmissionTime' are read
    :return: tuple (startTime, endTime)
    """
    logList = []
    result = self.logDB.getJobLoggingInfo(jobID)
    if result['OK']:
        logList = result['Value']

    startTime = jobDict['StartExecTime']
    if not startTime or startTime == 'None':
        # Logging records are (status, minor, app, stime, source)
        for items in logList:
            if items[0] == 'Running':
                startTime = items[3]
                break
        if not startTime or startTime == 'None':
            startTime = jobDict['SubmissionTime']

    if isinstance(startTime, types.StringTypes):
        rawStartTime = startTime
        startTime = fromString(rawStartTime)
        if startTime is None:
            # Log the string that failed to parse. `items` is unbound when
            # the log list is empty and unrelated to this value otherwise.
            self.log.error('Wrong timestamp in DB', rawStartTime)
            startTime = dateTime()

    endTime = dateTime()
    rawEndTime = None
    # The loop deliberately runs to the end: the last 'Stalled' record wins
    for items in logList:
        if items[0] == 'Stalled':
            rawEndTime = items[3]
            endTime = fromString(rawEndTime)
    if endTime is None:
        self.log.error('Wrong timestamp in DB', rawEndTime)
        endTime = dateTime()

    return startTime, endTime
def __getLatestUpdateTime(self, job):
    """ Return the later of the job's HeartBeatTime and LastUpdateTime

    :param job: job identifier passed to ``self.jobDB.getJobAttributes``
    :return: S_OK(epoch value) or S_ERROR when the attributes cannot be
             retrieved or both time stamps are null
    """
    result = self.jobDB.getJobAttributes(job, ['HeartBeatTime', 'LastUpdateTime'])
    if not result['OK']:
        self.log.error(result['Message'])
    if not (result['OK'] and result['Value']):
        return S_ERROR('Could not get attributes for job %s' % job)

    self.log.verbose(result)
    attributes = result['Value']

    # A missing value may come back either empty or as the string 'None'
    heartBeat = attributes['HeartBeatTime']
    if heartBeat and heartBeat != 'None':
        latestUpdate = toEpoch(fromString(heartBeat))
    else:
        self.log.verbose('HeartBeatTime is null for job %s' % job)
        latestUpdate = 0

    lastUpdateTime = attributes['LastUpdateTime']
    if lastUpdateTime and lastUpdateTime != 'None':
        latestUpdate = max(latestUpdate, toEpoch(fromString(lastUpdateTime)))
    else:
        self.log.verbose('LastUpdateTime is null for job %s' % job)

    if latestUpdate:
        self.log.verbose('Latest update time from epoch for job %s is %s' % (job, latestUpdate))
        return S_OK(latestUpdate)
    return S_ERROR('LastUpdate and HeartBeat times are null for job %s' % job)
def __getLatestUpdateTime(self, job):
    """Return the later of the job's HeartBeatTime and LastUpdateTime

    :param job: job identifier passed to ``self.jobDB.getJobAttributes``
    :return: S_OK(epoch value) or S_ERROR when the attributes cannot be
             retrieved or both time stamps are null
    """
    result = self.jobDB.getJobAttributes(job, ["HeartBeatTime", "LastUpdateTime"])
    if not result["OK"]:
        self.log.error("Failed to get job attributes", result["Message"])
    if not (result["OK"] and result["Value"]):
        self.log.error("Could not get attributes for job", "%s" % job)
        return S_ERROR("Could not get attributes for job")

    self.log.verbose(result)
    attributes = result["Value"]

    # A missing value may come back either empty or as the string "None"
    heartBeat = attributes["HeartBeatTime"]
    if heartBeat and heartBeat != "None":
        latestUpdate = toEpoch(fromString(heartBeat))
    else:
        self.log.verbose("HeartBeatTime is null for job %s" % job)
        latestUpdate = 0

    lastUpdateTime = attributes["LastUpdateTime"]
    if lastUpdateTime and lastUpdateTime != "None":
        latestUpdate = max(latestUpdate, toEpoch(fromString(lastUpdateTime)))
    else:
        self.log.verbose("LastUpdateTime is null for job %s" % job)

    if latestUpdate:
        self.log.verbose("Latest update time from epoch for job %s is %s" % (job, latestUpdate))
        return S_OK(latestUpdate)
    return S_ERROR("LastUpdate and HeartBeat times are null for job %s" % job)
def __getToken2(self):
    """Get the Keystone token for the version v2 of the keystone service

    Credentials come from self.parameters: User/Password when both are set,
    otherwise a VOMS proxy when Auth is "voms". On success self.token,
    self.expires and self.projectID are set, together with the compute,
    image and network URLs found in the service catalog of the reply.

    :return: S_OK(token) or S_ERROR
    """
    user = self.parameters.get('User')
    password = self.parameters.get('Password')
    authArgs = {}
    if user and password:
        authDict = {'auth': {"passwordCredentials": {"username": user,
                                                     "password": password}}}
        if self.project:
            authDict['auth']['tenantName'] = self.project
    elif self.parameters.get('Auth') == "voms":
        authDict = {'auth': {'voms': True}}
        if self.project:
            authDict['auth']['tenantName'] = self.project
        if self.parameters.get('Proxy'):
            authArgs['cert'] = self.parameters.get('Proxy')
    else:
        # Without this branch authDict would be unbound below
        return S_ERROR("No valid credentials provided")

    try:
        result = requests.post("%s/tokens" % self.url,
                               headers={"Content-Type": "application/json"},
                               json=authDict,
                               verify=self.caPath,
                               **authArgs)
    except Exception as exc:
        return S_ERROR('Exception getting keystone token: %s' % str(exc))

    try:
        output = result.json()
    except ValueError:
        # The body is not JSON (e.g. an HTML error page from a proxy)
        return S_ERROR('Failed to get keystone token: %s' % result.text)

    if result.status_code in [400, 401]:
        message = "None"
        if 'error' in output:
            message = output['error'].get('message')
        return S_ERROR('Authorization error: %s' % message)

    access = output.get('access') if isinstance(output, dict) else None
    if not access:
        # Any other failure status: the reply carries no token to extract
        return S_ERROR('Failed to get keystone token: %s' % result.text)

    self.token = str(access['token']['id'])
    # Turn the ISO-like time stamps into the "date time" form given to fromString
    expires = fromString(str(access['token']['expires']).replace('T', ' ').replace('Z', ''))
    issued = fromString(str(access['token']['issued_at']).replace('T', ' ').replace('Z', ''))
    # Apply the token lifetime to the local clock rather than trusting the server's
    self.expires = dateTime() + (expires - issued)

    self.projectID = access['token']['tenant']['id']

    for endpoint in access['serviceCatalog']:
        if endpoint['type'] == 'compute':
            self.computeURL = str(endpoint['endpoints'][0]['publicURL'])
        elif endpoint['type'] == 'image':
            self.imageURL = str(endpoint['endpoints'][0]['publicURL'])
        elif endpoint['type'] == 'network':
            self.networkURL = str(endpoint['endpoints'][0]['publicURL'])
    return S_OK(self.token)
def __sendAccounting(ftsJob):
    """Build a DataOperation record from an FTS3 job and queue it for commit.

    The record's start and end times are parsed from the job's submitTime and
    lastUpdate, its values are taken from the job's accountingDict, and it is
    handed over through delayedCommit().

    :param ftsJob: the FTS3Job from which we send the accounting info
    """
    record = DataOperation()

    startTime = fromString(ftsJob.submitTime)
    record.setStartTime(startTime)

    endTime = fromString(ftsJob.lastUpdate)
    record.setEndTime(endTime)

    record.setValuesFromDict(ftsJob.accountingDict)
    record.delayedCommit()
def __sendAccounting(self, regSuc, regTotal, regTime, transEndTime):
    """ Send the accounting record of this FTS request

    Files of self.fileDict in 'Finished' status are counted as successful
    transfers and their catalog sizes are summed. Sizes missing from
    self.catalogMetadata are requested through __updateMetadataCache first.

    :param regSuc: value stored as 'RegistrationOK'
    :param regTotal: value stored as 'RegistrationTotal'
    :param regTime: value stored as 'RegistrationTime'
    :param transEndTime: end time of the transfer; the parsed self.submitTime
                         is subtracted from it to get the transfer time
    :return: S_OK()
    """
    finishedLFNs = [lfn for lfn in self.fileDict
                    if self.fileDict[lfn].get('Status') == 'Finished']
    transSuc = len(finishedLFNs)

    missingSize = [lfn for lfn in finishedLFNs if lfn not in self.catalogMetadata]
    if missingSize:
        self.__updateMetadataCache(missingSize)

    transSize = sum(self.catalogMetadata[lfn]['Size'] for lfn in finishedLFNs)
    transTotal = sum(self.statusSummary.values())

    submitTime = fromString(self.submitTime)
    oAccounting = DataOperation()
    oAccounting.setEndTime(transEndTime)
    oAccounting.setStartTime(submitTime)

    # Whole seconds elapsed between submission and the end of the transfer
    elapsed = transEndTime - submitTime
    transferTime = elapsed.days * 86400 + elapsed.seconds

    accountingDict = {
        'OperationType': 'replicateAndRegister',
        'User': '******',
        'Protocol': 'FTS',
        'RegistrationTime': regTime,
        'RegistrationOK': regSuc,
        'RegistrationTotal': regTotal,
        'TransferOK': transSuc,
        'TransferTotal': transTotal,
        'TransferSize': transSize,
        'FinalStatus': self.requestStatus,
        'Source': self.sourceSE,
        'Destination': self.targetSE,
        'TransferTime': transferTime,
    }
    oAccounting.setValuesFromDict(accountingDict)
    gLogger.verbose("Attempting to commit accounting message...")
    oAccounting.commit()
    gLogger.verbose("...committed.")
    return S_OK()
def __sendAccounting(self, regSuc, regTotal, regTime, transEndTime):
    """ Send the accounting record of this FTS request

    Files of self.fileDict in 'Finished' status are counted as successful
    transfers and their catalog sizes are summed. Sizes missing from
    self.catalogMetadata are requested through __updateMetadataCache first.

    :param regSuc: value stored as 'RegistrationOK'
    :param regTotal: value stored as 'RegistrationTotal'
    :param regTime: value stored as 'RegistrationTime'
    :param transEndTime: end time of the transfer; the parsed self.submitTime
                         is subtracted from it to get the transfer time
    :return: S_OK()
    """
    finishedLFNs = [lfn for lfn in self.fileDict
                    if self.fileDict[lfn].get('Status') == 'Finished']
    transSuc = len(finishedLFNs)

    # `in` replaces dict.has_key, which does not exist in Python 3
    missingSize = [lfn for lfn in finishedLFNs if lfn not in self.catalogMetadata]
    if missingSize:
        self.__updateMetadataCache(missingSize)

    transSize = sum(self.catalogMetadata[lfn]['Size'] for lfn in finishedLFNs)
    transTotal = sum(self.statusSummary.values())

    submitTime = fromString(self.submitTime)
    oAccounting = DataOperation()
    oAccounting.setEndTime(transEndTime)
    oAccounting.setStartTime(submitTime)

    # Whole seconds elapsed between submission and the end of the transfer
    elapsed = transEndTime - submitTime
    transferTime = elapsed.days * 86400 + elapsed.seconds

    accountingDict = {
        'OperationType': 'replicateAndRegister',
        'User': '******',
        'Protocol': 'FTS',
        'RegistrationTime': regTime,
        'RegistrationOK': regSuc,
        'RegistrationTotal': regTotal,
        'TransferOK': transSuc,
        'TransferTotal': transTotal,
        'TransferSize': transSize,
        'FinalStatus': self.requestStatus,
        'Source': self.sourceSE,
        'Destination': self.targetSE,
        'TransferTime': transferTime,
    }
    oAccounting.setValuesFromDict(accountingDict)
    gLogger.verbose("Attempting to commit accounting message...")
    oAccounting.commit()
    gLogger.verbose("...committed.")
    return S_OK()
def export_checkComponentLog(self, component):
    """ Check component log for errors

    For every "System/Component" entry the file
    <startDir>/<System>_<Component>/log/current is scanned for lines
    containing "ERROR:". Lines whose first two fields parse as a time stamp
    are counted when they are less than one hour / one day old.

    :param component: a "System/Component" string, a list of such strings, or
                      '*' for all Services and Agents reported by
                      InstallTools.getSetupComponents()
    :return: S_OK(dict) mapping each component to a dict with 'ErrorsHour',
             'ErrorsDay' and 'LastError'; the counters are -1 when the log
             file cannot be read
    """
    componentList = []
    if '*' in component:
        if component == '*':
            result = InstallTools.getSetupComponents()
            if result['OK']:
                for ctype in ['Services', 'Agents']:
                    if ctype in result['Value']:
                        for sname in result['Value'][ctype]:
                            for cname in result['Value'][ctype][sname]:
                                componentList.append('/'.join([sname, cname]))
    elif isinstance(component, StringTypes):
        componentList = [component]
    else:
        componentList = component

    resultDict = {}
    for c in componentList:
        if '/' not in c:
            continue
        system, cname = c.split('/')

        startDir = InstallTools.startDir
        currentLog = startDir + '/' + system + '_' + cname + '/log/current'
        try:
            with open(currentLog, 'r') as logFile:
                logLines = logFile.readlines()
        except IOError as err:
            # One unreadable log must not abort the check of the others
            resultDict[c] = {'ErrorsHour': -1,
                             'ErrorsDay': -1,
                             'LastError': currentLog + '::' + repr(err)}
            continue

        errors_1 = 0
        errors_24 = 0
        now = dateTime()
        lastError = ''
        for line in logLines:
            if "ERROR:" not in line:
                continue
            message = line.split('ERROR:')[-1].strip()
            fields = line.split()
            timeStamp = None
            if len(fields) >= 2:
                timeStamp = fromString(fields[0] + ' ' + fields[1])
            if not timeStamp:
                # No usable time stamp: remember the error, but it cannot be
                # attributed to a time window
                lastError = message
                continue
            age = now - timeStamp
            recent = False
            if age < hour:
                errors_1 += 1
                recent = True
            if age < day:
                errors_24 += 1
                recent = True
            if recent:
                lastError = message

        resultDict[c] = {'ErrorsHour': errors_1,
                         'ErrorsDay': errors_24,
                         'LastError': lastError}

    return S_OK(resultDict)
def __sendAccounting(ftsJob, ownerDN):
    """Prepare a DataOperation accounting record for an FTS job and commit it.

    The protocol is reported as "FTS3" when the server name contains "fts3"
    (case-insensitive) and as 'FTS' otherwise. The transfer time is the
    number of whole seconds between SubmitTime and LastUpdate.

    :param ftsJob: FTS job to account for; iterated to count the files whose
        Status is in FTSFile.SUCCESS_STATES
    :param ownerDN: DN of the job owner; used verbatim as the user name when
        getUsernameForDN cannot resolve it
    """
    record = DataOperation()
    record.setStartTime(fromString(ftsJob.SubmitTime))
    record.setEndTime(fromString(ftsJob.LastUpdate))

    lookup = getUsernameForDN(ownerDN)
    owner = lookup["Value"] if lookup["OK"] else ownerDN

    values = {
        "OperationType": "ReplicateAndRegister",
        "User": owner,
        "Protocol": "FTS3" if 'fts3' in ftsJob.FTSServer.lower() else 'FTS',
        'ExecutionSite': ftsJob.FTSServer,
        'RegistrationTime': ftsJob._regTime,
        'RegistrationOK': ftsJob._regSuccess,
        'RegistrationTotal': ftsJob._regTotal,
        "TransferOK": sum(1 for ftsFile in ftsJob
                          if ftsFile.Status in FTSFile.SUCCESS_STATES),
        "TransferTotal": len(ftsJob),
        "TransferSize": ftsJob.Size - ftsJob.FailedSize,
        "FinalStatus": ftsJob.Status,
        "Source": ftsJob.SourceSE,
        "Destination": ftsJob.TargetSE,
    }

    elapsed = ftsJob.LastUpdate - ftsJob.SubmitTime
    values["TransferTime"] = elapsed.days * 86400 + elapsed.seconds

    record.setValuesFromDict(values)
    record.commit()
def export_checkComponentLog(self, component):
    """Check component log for errors

    For every "System/Component" entry the file
    <startDir>/<System>_<Component>/log/current is scanned for lines
    containing "ERROR:". Lines whose first two fields parse as a time stamp
    are counted when they are less than one hour / one day old.

    :param component: a "System/Component" string, a list of such strings, or
                      "*" for all Services and Agents reported by
                      InstallTools.getSetupComponents()
    :return: S_OK(dict) mapping each component to a dict with "ErrorsHour",
             "ErrorsDay" and "LastError"; the counters are -1 when the log
             file cannot be read
    """
    componentList = []
    if "*" in component:
        if component == "*":
            result = InstallTools.getSetupComponents()
            if result["OK"]:
                for ctype in ["Services", "Agents"]:
                    if ctype in result["Value"]:
                        for sname in result["Value"][ctype]:
                            for cname in result["Value"][ctype][sname]:
                                componentList.append("/".join([sname, cname]))
    elif isinstance(component, StringTypes):
        componentList = [component]
    else:
        componentList = component

    resultDict = {}
    for c in componentList:
        if "/" not in c:
            continue
        system, cname = c.split("/")

        startDir = InstallTools.startDir
        currentLog = startDir + "/" + system + "_" + cname + "/log/current"
        try:
            with open(currentLog, "r") as logFile:
                logLines = logFile.readlines()
        except IOError as err:
            # One unreadable log must not abort the check of the others
            resultDict[c] = {"ErrorsHour": -1, "ErrorsDay": -1, "LastError": currentLog + "::" + repr(err)}
            continue

        errors_1 = 0
        errors_24 = 0
        now = dateTime()
        lastError = ""
        for line in logLines:
            if "ERROR:" not in line:
                continue
            message = line.split("ERROR:")[-1].strip()
            fields = line.split()
            timeStamp = None
            if len(fields) >= 2:
                timeStamp = fromString(fields[0] + " " + fields[1])
            if not timeStamp:
                # No usable time stamp: remember the error, but it cannot be
                # attributed to a time window
                lastError = message
                continue
            age = now - timeStamp
            recent = False
            if age < hour:
                errors_1 += 1
                recent = True
            if age < day:
                errors_24 += 1
                recent = True
            if recent:
                lastError = message

        resultDict[c] = {"ErrorsHour": errors_1, "ErrorsDay": errors_24, "LastError": lastError}

    return S_OK(resultDict)
def export_checkComponentLog(self, component):
    """ Check component log for errors

    For every "System/Component" entry the file
    <startDir>/<System>_<Component>/log/current is scanned for lines
    containing "ERROR:". Lines whose first two fields parse as a time stamp
    are counted when they are less than one hour / one day old.

    :param component: a "System/Component" string, a list of such strings, or
                      '*' for all Services and Agents reported by
                      InstallTools.getSetupComponents()
    :return: S_OK(dict) mapping each component to a dict with 'ErrorsHour',
             'ErrorsDay' and 'LastError'; the counters are -1 when the log
             file cannot be read
    """
    componentList = []
    if '*' in component:
        if component == '*':
            result = InstallTools.getSetupComponents()
            if result['OK']:
                for ctype in ['Services', 'Agents']:
                    if ctype in result['Value']:
                        for sname in result['Value'][ctype]:
                            for cname in result['Value'][ctype][sname]:
                                componentList.append('/'.join([sname, cname]))
    elif isinstance(component, StringTypes):
        componentList = [component]
    else:
        componentList = component

    resultDict = {}
    for c in componentList:
        if '/' not in c:
            continue
        system, cname = c.split('/')

        startDir = InstallTools.startDir
        currentLog = startDir + '/' + system + '_' + cname + '/log/current'
        # open() in a with-block replaces the Python-2-only file() builtin
        # and closes the handle even when reading fails
        try:
            with open(currentLog, 'r') as logFile:
                logLines = logFile.readlines()
        except IOError as err:
            # One unreadable log must not abort the check of the others
            resultDict[c] = {'ErrorsHour': -1,
                             'ErrorsDay': -1,
                             'LastError': currentLog + '::' + repr(err)}
            continue

        errors_1 = 0
        errors_24 = 0
        now = dateTime()
        lastError = ''
        for line in logLines:
            if "ERROR:" not in line:
                continue
            message = line.split('ERROR:')[-1].strip()
            fields = line.split()
            timeStamp = None
            if len(fields) >= 2:
                timeStamp = fromString(fields[0] + ' ' + fields[1])
            if not timeStamp:
                # No usable time stamp: remember the error, but it cannot be
                # attributed to a time window
                lastError = message
                continue
            age = now - timeStamp
            recent = False
            if age < hour:
                errors_1 += 1
                recent = True
            if age < day:
                errors_24 += 1
                recent = True
            if recent:
                lastError = message

        resultDict[c] = {'ErrorsHour': errors_1,
                         'ErrorsDay': errors_24,
                         'LastError': lastError}

    return S_OK(resultDict)
def __sendAccounting(ftsJob, ownerDN):
    """Prepare a DataOperation accounting record for an FTS job and commit it.

    The protocol is reported as "FTS3" when the server name contains "fts3"
    (case-insensitive) and as 'FTS' otherwise. The transfer time is the sum
    of the integer durations of the files whose Status is in
    FTSFile.SUCCESS_STATES.

    :param ftsJob: FTS job to account for; iterated to inspect its files
    :param ownerDN: DN of the job owner; used verbatim as the user name when
        getUsernameForDN cannot resolve it
    """
    record = DataOperation()
    record.setStartTime(fromString(ftsJob.SubmitTime))
    record.setEndTime(fromString(ftsJob.LastUpdate))

    lookup = getUsernameForDN(ownerDN)
    owner = lookup["Value"] if lookup["OK"] else ownerDN

    values = {
        "OperationType": "ReplicateAndRegister",
        "User": owner,
        "Protocol": "FTS3" if 'fts3' in ftsJob.FTSServer.lower() else 'FTS',
        'ExecutionSite': ftsJob.FTSServer,
        'RegistrationTime': ftsJob._regTime,
        'RegistrationOK': ftsJob._regSuccess,
        'RegistrationTotal': ftsJob._regTotal,
        "TransferOK": sum(1 for ftsFile in ftsJob
                          if ftsFile.Status in FTSFile.SUCCESS_STATES),
        "TransferTotal": len(ftsJob),
        "TransferSize": ftsJob.Size - ftsJob.FailedSize,
        "FinalStatus": ftsJob.Status,
        "Source": ftsJob.SourceSE,
        "Destination": ftsJob.TargetSE,
    }

    # Only successfully transferred files contribute to the transfer time
    values['TransferTime'] = sum(int(ftsFile._duration) for ftsFile in ftsJob
                                 if ftsFile.Status in FTSFile.SUCCESS_STATES)

    record.setValuesFromDict(values)
    record.commit()
def __sendAccounting(self, regSuc, regTotal, regTime, transEndTime, transDict):
    """ Send the accounting record of this FTS request

    :param self: self reference
    :param regSuc: number of files successfully registered
    :param regTotal: number of files attempted to register
    :param regTime: time stamp at the end of registration
    :param transEndTime: time stamp at the end of FTS job
    :param dict transDict: dict holding the counters of the transfer; the keys
                           'transOK', 'transTotal' and 'transSize' are read
    :return: S_OK()
    """
    submitTime = fromString(self.submitTime)
    oAccounting = DataOperation()

    elapsed = transEndTime - submitTime
    transferTime = elapsed.days * 86400 + elapsed.seconds
    if 'fts3' in self.ftsServer and transferTime < 0:
        import datetime
        oneHour = datetime.timedelta(0, 3600)
        # Shift by one hour until transfer time is positive (ugly fix for FTS3 bug)
        while transferTime < 0:
            transferTime += 3600
            submitTime -= oneHour
        self.log.verbose('Fixed UTC submit time... Submit: %s, end: %s' % (submitTime, transEndTime))

    oAccounting.setEndTime(transEndTime)
    oAccounting.setStartTime(submitTime)

    proxyInfo = getProxyInfo()
    if proxyInfo['OK']:
        userName = proxyInfo['Value'].get('username', 'unknown')
    else:
        userName = '******'

    accountingDict = {
        'OperationType': 'replicateAndRegister',
        'User': userName,
        'Protocol': 'FTS3' if 'fts3' in self.ftsServer else 'FTS',
        'RegistrationTime': regTime,
        'RegistrationOK': regSuc,
        'RegistrationTotal': regTotal,
        'TransferOK': transDict['transOK'],
        'TransferTotal': transDict['transTotal'],
        'TransferSize': transDict['transSize'],
        'FinalStatus': self.requestStatus,
        'Source': self.sourceSE,
        'Destination': self.targetSE,
        'TransferTime': transferTime,
    }
    oAccounting.setValuesFromDict(accountingDict)

    self.log.verbose("Attempting to commit accounting message...")
    oAccounting.commit()
    self.log.verbose("...committed.")
    return S_OK()
def _getLatestUpdateTime(self, job):
    """Returns the most recent of HeartBeatTime and LastUpdateTime

    :param job: job identifier passed to ``self.jobDB.getJobAttributes``
    :return: S_OK(epoch value) or S_ERROR when the attributes cannot be
             retrieved or both time stamps are null
    """
    result = self.jobDB.getJobAttributes(job, ["HeartBeatTime", "LastUpdateTime"])
    if not result["OK"] or not result["Value"]:
        # %s rather than %d: formatting must not raise TypeError on this
        # error path when the job ID arrives as a string
        self.log.error(
            "Failed to get job attributes",
            "for job %s: %s" % (job, result.get("Message", "empty")),
        )
        return S_ERROR("Could not get attributes for job")

    attributes = result["Value"]
    latestUpdate = 0

    # A missing value may come back either empty or as the string "None"
    heartBeat = attributes["HeartBeatTime"]
    if not heartBeat or heartBeat == "None":
        self.log.verbose("HeartBeatTime is null", "for job %s" % job)
    else:
        latestUpdate = toEpoch(fromString(heartBeat))

    lastUpdateTime = attributes["LastUpdateTime"]
    if not lastUpdateTime or lastUpdateTime == "None":
        self.log.verbose("LastUpdateTime is null", "for job %s" % job)
    else:
        latestUpdate = max(latestUpdate, toEpoch(fromString(lastUpdateTime)))

    if not latestUpdate:
        return S_ERROR("LastUpdate and HeartBeat times are null for job %s" % job)

    self.log.verbose("", "Latest update time from epoch for job %s is %s" % (job, latestUpdate))
    return S_OK(latestUpdate)
def __sendAccounting(ftsJob, ownerDN):
    """Fill a DataOperation accounting record from an FTS job and commit it.

    :param ftsJob: FTS job to account for; its SubmitTime, LastUpdate, Size,
        Status, SourceSE and TargetSE attributes are read and its files are
        iterated to count the ones in "Finished" status
    :param ownerDN: DN of the job owner; it is reported as the user when
        getUsernameForDN does not return OK
    """
    operation = DataOperation()
    operation.setStartTime(fromString(ftsJob.SubmitTime))
    operation.setEndTime(fromString(ftsJob.LastUpdate))

    resolved = getUsernameForDN(ownerDN)
    if resolved["OK"]:
        user = resolved["Value"]
    else:
        user = ownerDN

    finishedFiles = [ftsFile for ftsFile in ftsJob if ftsFile.Status == "Finished"]

    fields = dict()
    fields["OperationType"] = "ReplicateAndRegister"
    fields["User"] = user
    fields["Protocol"] = "FTS"
    fields["TransferOK"] = len(finishedFiles)
    fields["TransferTotal"] = len(ftsJob)
    fields["TransferSize"] = ftsJob.Size
    fields["FinalStatus"] = ftsJob.Status
    fields["Source"] = ftsJob.SourceSE
    fields["Destination"] = ftsJob.TargetSE

    # Whole seconds elapsed between submission and the last update
    delta = ftsJob.LastUpdate - ftsJob.SubmitTime
    fields["TransferTime"] = delta.days * 86400 + delta.seconds

    operation.setValuesFromDict(fields)
    operation.commit()
def __sendAccounting(self, regSuc, regTotal, regTime, transEndTime, transDict):
    """ Send the accounting record of this FTS request

    :param self: self reference
    :param regSuc: number of files successfully registered
    :param regTotal: number of files attempted to register
    :param regTime: time stamp at the end of registration
    :param transEndTime: time stamp at the end of FTS job
    :param dict transDict: dict holding the counters of the transfer; the keys
                           'transOK', 'transTotal' and 'transSize' are read
    :return: S_OK()
    """
    submitTime = fromString(self.submitTime)

    oAccounting = DataOperation()
    oAccounting.setEndTime(transEndTime)
    oAccounting.setStartTime(submitTime)

    # Whole seconds elapsed between submission and the end of the FTS job
    elapsed = transEndTime - submitTime
    transferTime = elapsed.days * 86400 + elapsed.seconds

    oAccounting.setValuesFromDict({
        'OperationType': 'replicateAndRegister',
        'User': '******',
        'Protocol': 'FTS',
        'RegistrationTime': regTime,
        'RegistrationOK': regSuc,
        'RegistrationTotal': regTotal,
        'TransferOK': transDict['transOK'],
        'TransferTotal': transDict['transTotal'],
        'TransferSize': transDict['transSize'],
        'FinalStatus': self.requestStatus,
        'Source': self.sourceSE,
        'Destination': self.targetSE,
        'TransferTime': transferTime,
    })

    self.log.verbose("Attempting to commit accounting message...")
    oAccounting.commit()
    self.log.verbose("...committed.")
    return S_OK()
def __sendAccounting(self, regSuc, regTotal, regTime, transEndTime, transDict):
    """ Send the accounting record of this FTS request

    :param self: self reference
    :param regSuc: number of files successfully registered
    :param regTotal: number of files attempted to register
    :param regTime: time stamp at the end of registration
    :param transEndTime: time stamp at the end of FTS job
    :param dict transDict: dict holding the counters of the transfer; the keys
                           'transOK', 'transTotal' and 'transSize' are read
    :return: S_OK()
    """
    startTime = fromString(self.submitTime)

    record = DataOperation()
    record.setEndTime(transEndTime)
    record.setStartTime(startTime)

    values = {
        'OperationType': 'replicateAndRegister',
        'User': '******',
        'Protocol': 'FTS',
        'RegistrationTime': regTime,
        'RegistrationOK': regSuc,
        'RegistrationTotal': regTotal,
        'TransferOK': transDict['transOK'],
        'TransferTotal': transDict['transTotal'],
        'TransferSize': transDict['transSize'],
        'FinalStatus': self.requestStatus,
        'Source': self.sourceSE,
        'Destination': self.targetSE,
    }

    # Whole seconds elapsed between submission and the end of the FTS job
    duration = transEndTime - startTime
    values['TransferTime'] = duration.days * 86400 + duration.seconds

    record.setValuesFromDict(values)
    self.log.verbose("Attempting to commit accounting message...")
    record.commit()
    self.log.verbose("...committed.")
    return S_OK()
def __getToken3(self):
    """Get the Keystone token for the version v3 of the keystone service

    Credentials come from self.parameters, tried in this order: User/Password,
    a VOMS proxy (Auth == "voms"), an application credential file (Appcred).
    On success self.token and self.expires are set, together with
    self.projectID and the public compute/image/network URLs when the reply
    provides them.

    :return: S_OK(token) or S_ERROR
    """
    domain = self.parameters.get("Domain", "Default")
    user = self.parameters.get("User")
    password = self.parameters.get("Password")
    appcred_file = self.parameters.get("Appcred")
    authDict = {}
    authArgs = {}
    if user and password:
        authDict = {
            "auth": {
                "identity": {
                    "methods": ["password"],
                    "password": {
                        "user": {
                            "name": user,
                            "domain": {"name": domain},
                            "password": password,
                        }
                    },
                }
            }
        }
    elif self.parameters.get("Auth") == "voms":
        authDict = {
            "auth": {
                "identity": {
                    "methods": ["mapped"],
                    "mapped": {
                        "voms": True,
                        "identity_provider": "egi.eu",
                        "protocol": "mapped",
                    },
                }
            }
        }
        if self.parameters.get("Proxy"):
            authArgs["cert"] = self.parameters.get("Proxy")
    elif appcred_file:
        # The application credentials are stored in a file of the format:
        # id secret
        try:
            with open(appcred_file, "r") as ac_fd:
                ac_id, ac_secret = ac_fd.read().strip().split(" ", 1)
        except (IOError, ValueError) as exc:
            # Unreadable file, or no "id secret" pair in it
            return S_ERROR("Failed to read application credentials: %s" % str(exc))
        authDict = {
            "auth": {
                "identity": {
                    "methods": ["application_credential"],
                    "application_credential": {"id": ac_id, "secret": ac_secret},
                }
            }
        }
    else:
        return S_ERROR("No valid credentials provided")

    # appcred includes the project scope binding in the credential itself
    if self.project and not appcred_file:
        authDict["auth"]["scope"] = {
            "project": {"domain": {"name": domain}, "name": self.project}
        }

    # The request body carries the password or the credential secret, so
    # only its non-sensitive parts are written to the log
    gLogger.debug(
        "Request token with auth arguments: %s, methods %s and scope %s"
        % (
            str(authArgs),
            str(authDict["auth"]["identity"]["methods"]),
            str(authDict["auth"].get("scope")),
        )
    )

    url = "%s/auth/tokens" % self.url
    try:
        result = requests.post(
            url,
            headers={
                "Content-Type": "application/json",
                "Accept": "application/json",
            },
            json=authDict,
            verify=self.caPath,
            **authArgs
        )
    except Exception as exc:
        return S_ERROR("Exception getting keystone token: %s" % str(exc))

    if result.status_code not in [200, 201, 202, 203, 204]:
        return S_ERROR("Failed to get keystone token: %s" % result.text)

    try:
        self.token = result.headers["X-Subject-Token"]
        output = result.json()
    except Exception as exc:
        return S_ERROR("Failed to get keystone token: %s" % str(exc))

    # Turn the ISO-like time stamps into the "date time" form given to fromString
    expires = fromString(str(output["token"]["expires_at"]).replace("T", " ").replace("Z", ""))
    issued = fromString(str(output["token"]["issued_at"]).replace("T", " ").replace("Z", ""))
    # Apply the token lifetime to the local clock rather than trusting the server's
    self.expires = dateTime() + (expires - issued)

    if "project" in output["token"]:
        if output["token"]["project"]["name"] == self.project:
            self.projectID = output["token"]["project"]["id"]

    if "catalog" in output["token"]:
        urlAttributes = {"compute": "computeURL", "image": "imageURL", "network": "networkURL"}
        for service in output["token"]["catalog"]:
            attribute = urlAttributes.get(service["type"])
            if attribute is None:
                continue
            for endpoint in service["endpoints"]:
                if endpoint["interface"] == "public":
                    setattr(self, attribute, str(endpoint["url"]))

    return S_OK(self.token)
def export_checkComponentLog(self, component):
    """Scan the current log of one or more components for error lines.

    For every "System/Component" entry the file
    <startDir>/<System>_<Component>/log/current is read and its lines
    containing "ERROR:" are inspected. Lines whose first two fields parse as
    a time stamp are counted when they are less than one hour / one day old.
    Lines without a usable time stamp only update the last error message.

    :param component: a "System/Component" string, a list of such strings, or
                      "*" for all Services, Agents and Executors reported by
                      gComponentInstaller.getSetupComponents()
    :return: S_OK(dict) mapping each component to a dict with "ErrorsHour",
             "ErrorsDay" and "LastError"; the counters are -1 when the log
             file cannot be read
    """
    if "*" in component:
        componentList = []
        if component == "*":
            setup = gComponentInstaller.getSetupComponents()
            if setup["OK"]:
                for ctype in ["Services", "Agents", "Executors"]:
                    if ctype not in setup["Value"]:
                        continue
                    bySystem = setup["Value"][ctype]
                    componentList.extend(
                        "/".join([sname, cname]) for sname in bySystem for cname in bySystem[sname]
                    )
    elif isinstance(component, six.string_types):
        componentList = [component]
    else:
        componentList = component

    resultDict = {}
    for comp in componentList:
        if "/" not in comp:
            continue
        system, cname = comp.split("/")

        currentLog = gComponentInstaller.startDir + "/" + system + "_" + cname + "/log/current"
        try:
            with open(currentLog, "r") as logFile:
                logLines = logFile.readlines()
        except IOError as err:
            gLogger.error("File does not exists:", currentLog)
            resultDict[comp] = {
                "ErrorsHour": -1,
                "ErrorsDay": -1,
                "LastError": currentLog + "::" + repr(err),
            }
            continue

        errors_1 = 0
        errors_24 = 0
        now = dateTime()
        lastError = ""
        for line in logLines:
            if "ERROR:" not in line:
                continue
            message = line.split("ERROR:")[-1].strip()

            fields = line.split()
            # The time stamp is expected in the first two fields of the line
            timeStamp = fromString(fields[0] + " " + fields[1]) if len(fields) >= 2 else None
            if not timeStamp:
                lastError = message
                continue

            age = now - timeStamp
            withinHour = age < hour
            withinDay = age < day
            if withinHour:
                errors_1 += 1
            if withinDay:
                errors_24 += 1
            if withinHour or withinDay:
                lastError = message

        resultDict[comp] = {
            "ErrorsHour": errors_1,
            "ErrorsDay": errors_24,
            "LastError": lastError,
        }

    return S_OK(resultDict)
def __getToken3(self):
    """Get the Keystone token for the version v3 of the keystone service

    The credentials are taken from self.parameters, in this order of
    precedence: User/Password, Auth == "voms" (optionally with a Proxy used as
    client certificate), or an Appcred file holding "id secret".

    On success self.token and self.expires are set, and self.projectID,
    self.computeURL, self.imageURL and self.networkURL are filled from the
    reply when the corresponding entries are present.

    :return: S_OK(token) or S_ERROR
    """
    domain = self.parameters.get('Domain', "Default")
    user = self.parameters.get('User')
    password = self.parameters.get('Password')
    appcred_file = self.parameters.get('Appcred')
    authDict = {}
    authArgs = {}
    if user and password:
        authDict = {'auth': {"identity": {"methods": ["password"],
                                          "password": {"user": {"name": user,
                                                                "domain": {"name": domain},
                                                                "password": password
                                                                }
                                                       }
                                          }
                             }
                    }
    elif self.parameters.get('Auth') == "voms":
        authDict = {"auth": {"identity": {"methods": ["mapped"],
                                          "mapped": {'voms': True,
                                                     'identity_provider': 'egi.eu',
                                                     "protocol": 'mapped'}}}}
        if self.parameters.get('Proxy'):
            authArgs['cert'] = self.parameters.get('Proxy')
    elif appcred_file:
        # The application credentials are stored in a file of the format:
        # id secret
        # The context manager closes the file even when reading fails, and a
        # missing or malformed file is reported as S_ERROR like every other
        # failure of this method.
        try:
            with open(appcred_file, 'r') as ac_fd:
                auth_info = ac_fd.read().strip()
            ac_id, ac_secret = auth_info.split(" ", 1)
        except (IOError, OSError, ValueError) as exc:
            return S_ERROR('Failed to read application credentials from %s: %s' % (appcred_file, str(exc)))
        authDict = {'auth': {"identity": {"methods": ["application_credential"],
                                          "application_credential": {"id": ac_id,
                                                                     "secret": ac_secret}}}}
    else:
        return S_ERROR("No valid credentials provided")

    # appcred includes the project scope binding in the credential itself
    if self.project and not appcred_file:
        authDict['auth']['scope'] = {"project": {"domain": {"name": domain},
                                                 "name": self.project
                                                 }
                                     }

    # NOTE(review): authDict carries the password / application secret in clear
    # text, so this debug line writes credentials to the log - consider redacting.
    gLogger.debug('Request token with auth arguments: %s and body %s' %
                  (str(authArgs), str(authDict)))

    url = "%s/auth/tokens" % self.url
    try:
        result = requests.post(url,
                               headers={"Content-Type": "application/json",
                                        "Accept": "application/json",
                                        },
                               json=authDict,
                               verify=self.caPath,
                               **authArgs)
    except Exception as exc:
        return S_ERROR('Exception getting keystone token: %s' % str(exc))

    if result.status_code not in [200, 201, 202, 203, 204]:
        return S_ERROR('Failed to get keystone token: %s' % result.text)

    # The token itself is delivered in a response header, not in the body
    try:
        self.token = result.headers['X-Subject-Token']
    except Exception as exc:
        return S_ERROR('Failed to get keystone token: %s' % str(exc))

    output = result.json()

    # Only the token lifetime (expires - issued) is taken from the reply and it
    # is applied to the local clock.
    expires = fromString(str(output['token']['expires_at']).replace('T', ' ').replace('Z', ''))
    issued = fromString(str(output['token']['issued_at']).replace('T', ' ').replace('Z', ''))
    self.expires = dateTime() + (expires - issued)

    if 'project' in output['token']:
        if output['token']['project']['name'] == self.project:
            self.projectID = output['token']['project']['id']

    # Remember the public endpoint of each service found in the catalog
    if 'catalog' in output['token']:
        for service in output['token']['catalog']:
            if service['type'] == 'compute':
                for endpoint in service['endpoints']:
                    if endpoint['interface'] == 'public':
                        self.computeURL = str(endpoint['url'])
            elif service['type'] == 'image':
                for endpoint in service['endpoints']:
                    if endpoint['interface'] == 'public':
                        self.imageURL = str(endpoint['url'])
            elif service['type'] == 'network':
                for endpoint in service['endpoints']:
                    if endpoint['interface'] == 'public':
                        self.networkURL = str(endpoint['url'])
    return S_OK(self.token)
pollingtime = line.split(':')[4].split(' ')[1].split( '.')[0] except BaseException: try: pollingtime = line.split(':')[7].split(' ')[1].split( '.')[0] except BaseException: write_log(" wrong format for Polling Time : " + line) break else: lastLine = line lastLineList = lastLine.split(' ') try: lastupdate = fromString(lastLineList[0] + ' ' + lastLineList[1]) except BaseException: write_log(' EXCEPT : ' + dirname) write_log(' last line is ' + str(lastLineList)) if isinstance(pollingtime, int): if int(pollingtime) < 59: pollingtime = 120 interval = timeInterval(lastupdate, second * int(pollingtime)) if not interval.includes(now): write_log(" the PollingTime is : " + str(pollingtime) + " s") write_log(' last update for ' + dirname + ' was : ' + str(lastupdate)) write_log(' Polling Time is ' + str(pollingtime) + ' s') write_log(' last known status' + result + '\n')
def optimizeJob( self, jid, jobState ):
    """ Decide whether the job is held, needs staging, or goes to the task queue.

    :param jid: job identifier (not read by this method)
    :param jobState: job state object used to query the job
    :return: the failing S_ERROR structure of the first step that fails, or
             the result of the hold / task-queue / set-site helper that ends
             the processing
    """
    # Reschedule delay
    attrResult = jobState.getAttributes( [ 'RescheduleCounter', 'RescheduleTime', 'ApplicationStatus' ] )
    if not attrResult[ 'OK' ]:
        return attrResult
    attrs = attrResult[ 'Value' ]
    try:
        rescheduleCount = int( attrs[ 'RescheduleCounter' ] )
    except ValueError:
        return S_ERROR( "RescheduleCounter has to be an integer" )
    if rescheduleCount != 0:
        delayTable = self.ex_getOption( 'RescheduleDelays', [60, 180, 300, 600] )
        # Past the end of the table the last delay is used
        requiredDelay = delayTable[ min( rescheduleCount, len( delayTable ) - 1 ) ]
        elapsed = toEpoch() - toEpoch( fromString( attrs[ 'RescheduleTime' ] ) )
        if elapsed < requiredDelay:
            return self.__holdJob( jobState, 'On Hold: after rescheduling %s' % rescheduleCount, requiredDelay )

    # Get site requirements
    siteReq = self.__getSitesRequired( jobState )
    if not siteReq[ 'OK' ]:
        return siteReq
    userSites, userBannedSites = siteReq[ 'Value' ]

    # Get active and banned sites from DIRAC
    maskResult = self.__jobDB.getSiteMask( 'Active' )
    if not maskResult[ 'OK' ]:
        return S_ERROR( "Cannot retrieve active sites from JobDB" )
    wmsActiveSites = maskResult[ 'Value' ]
    maskResult = self.__jobDB.getSiteMask( 'Banned' )
    if not maskResult[ 'OK' ]:
        return S_ERROR( "Cannot retrieve banned sites from JobDB" )
    wmsBannedSites = maskResult[ 'Value' ]

    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
        typeResult = jobState.getAttribute( "JobType" )
        if not typeResult[ 'OK' ]:
            return S_ERROR( "Could not retrieve job type" )
        jobType = typeResult[ 'Value' ]
        if jobType not in self.ex_getOption( 'ExcludedOnHoldJobTypes', [] ):
            usableSites = self.__applySiteFilter( userSites, wmsActiveSites, wmsBannedSites )
            if not usableSites:
                if len( userSites ) > 1:
                    holdReason = "Requested sites %s are inactive" % ",".join( userSites )
                else:
                    holdReason = "Requested site %s is inactive" % userSites[0]
                return self.__holdJob( jobState, holdReason )

    # Get the input data
    dataResult = jobState.getInputData()
    if not dataResult['OK']:
        self.jobLog.error( "Cannot get input data %s" % ( dataResult['Message'] ) )
        return S_ERROR( 'Failed to get input data from JobDB' )
    inputData = dataResult[ 'Value' ]
    if not inputData:
        # No input data? Generate requirements and next
        return self.__sendToTQ( jobState, userSites, userBannedSites )

    self.jobLog.verbose( 'Has an input data requirement' )
    idAgent = self.ex_getOption( 'InputDataAgent', 'InputData' )
    paramResult = self.retrieveOptimizerParam( idAgent )
    if not paramResult['OK']:
        self.jobLog.error( "Could not retrieve input data info: %s" % paramResult[ 'Message' ] )
        return S_ERROR( "File Catalog Access Failure" )
    opData = paramResult[ 'Value' ]
    if 'SiteCandidates' not in opData:
        return S_ERROR( "No possible site candidates" )

    # Filter input data sites with user requirement
    siteCandidates = list( opData[ 'SiteCandidates' ] )
    self.jobLog.info( "Site candidates are %s" % siteCandidates )
    siteCandidates = self.__applySiteFilter( siteCandidates, userSites, userBannedSites )
    if not siteCandidates:
        return S_ERROR( "Impossible InputData * Site requirements" )

    idSites = dict( ( site, opData[ 'SiteCandidates' ][ site ] ) for site in siteCandidates )

    # Check if sites have correct count of disk+tape replicas
    numData = len( inputData )
    incompleteSites = set()
    for site in idSites:
        replicaCount = idSites[ site ]
        if numData != replicaCount[ 'disk' ] + replicaCount[ 'tape' ]:
            self.jobLog.error( "Site candidate %s does not have all the input data" % site )
            incompleteSites.add( site )
    for site in incompleteSites:
        idSites.pop( site )
    if not idSites:
        return S_ERROR( "Site candidates do not have all the input data" )

    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging( jobState, inputData, idSites )
    if not siteCandidates:
        return S_ERROR( "No destination sites available" )

    # Is any site active?
    stageSites = self.__applySiteFilter( siteCandidates, wmsActiveSites, wmsBannedSites )
    if not stageSites:
        return self.__holdJob( jobState, "Sites %s are inactive or banned" % ", ".join( siteCandidates ) )

    # If no staging is required send to TQ
    if not stageRequired:
        # Use siteCandidates and not stageSites because active and banned sites
        # will be taken into account on matching time
        return self.__sendToTQ( jobState, siteCandidates, userBannedSites )

    # Check if the user is allowed to stage
    if self.ex_getOption( "RestrictDataStage", False ):
        if not self.__checkStageAllowed( jobState ):
            return S_ERROR( "Stage not allowed" )

    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose( " Staging site will be %s" % ( stageSite ) )
    stageData = idSites[ stageSite ]
    # Set as if everything has already been staged
    stageData[ 'disk' ] += stageData[ 'tape' ]
    stageData[ 'tape' ] = 0
    # Set the site info back to the original dict to save afterwards
    opData[ 'SiteCandidates' ][ stageSite ] = stageData

    stagingResult = self.__requestStaging( jobState, stageSite, opData )
    if not stagingResult[ 'OK' ]:
        return stagingResult
    stageLFNs = stagingResult[ 'Value' ]
    self.__updateSharedSESites( jobState, stageSite, stageLFNs, opData )

    # Save the optimizer data again
    self.jobLog.verbose( 'Updating %s Optimizer Info:' % ( idAgent ), opData )
    storeResult = self.storeOptimizerParam( idAgent, opData )
    if not storeResult[ 'OK' ]:
        return storeResult

    return self.__setJobSite( jobState, stageSites )
def _sendAccounting(self, jobID):
    """
    Build and commit the WMS accounting record of the given job.

    Run inside thread.

    :param jobID: ID of the job to account
    :return: the failing getJobAttributes result, S_ERROR("Exception") when
             collecting the timing data raised, otherwise the result of the
             accounting commit
    """
    try:
        accountingReport = Job()
        # Placeholders so that the except clause can always format both values
        endTime = "Unknown"
        lastHeartBeatTime = "Unknown"

        attrResult = self.jobDB.getJobAttributes(jobID)
        if not attrResult["OK"]:
            return attrResult
        jobDict = attrResult["Value"]

        startTime, endTime = self._checkLoggingInfo(jobID, jobDict)
        lastCPUTime, lastWallTime, lastHeartBeatTime = self._checkHeartBeat(jobID, jobDict)
        lastHeartBeatTime = fromString(lastHeartBeatTime)
        # A heartbeat later than the logged end time extends the accounted period
        if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
            endTime = lastHeartBeatTime

        normResult = JobMonitoringClient().getJobParameter(jobID, "CPUNormalizationFactor")
        if normResult["OK"] and normResult["Value"]:
            cpuNormalization = float(normResult["Value"].get("CPUNormalizationFactor"))
        else:
            self.log.error(
                "Error getting Job Parameter CPUNormalizationFactor, setting 0",
                normResult.get("Message", "No such value"),
            )
            cpuNormalization = 0.0

    except Exception as e:
        self.log.exception(
            "Exception in _sendAccounting",
            "for job=%s: endTime=%s, lastHBTime=%s" % (str(jobID), str(endTime), str(lastHeartBeatTime)),
            lException=e,
        )
        return S_ERROR("Exception")

    processingType = self._getProcessingType(jobID)

    accountingReport.setStartTime(startTime)
    accountingReport.setEndTime(endTime)

    # Fill the accounting data
    acData = {
        "Site": jobDict["Site"],
        "User": jobDict["Owner"],
        "UserGroup": jobDict["OwnerGroup"],
        "JobGroup": jobDict["JobGroup"],
        "JobType": jobDict["JobType"],
        "JobClass": jobDict["JobSplitType"],
        "ProcessingType": processingType,
        "FinalMajorStatus": JobStatus.FAILED,
        "FinalMinorStatus": JobMinorStatus.STALLED_PILOT_NOT_RUNNING,
        "CPUTime": lastCPUTime,
        "NormCPUTime": lastCPUTime * cpuNormalization,
        "ExecTime": lastWallTime,
        "InputDataSize": 0.0,
        "OutputDataSize": 0.0,
        "InputDataFiles": 0,
        "OutputDataFiles": 0,
        "DiskSpace": 0.0,
        "InputSandBoxSize": 0.0,
        "OutputSandBoxSize": 0.0,
        "ProcessedEvents": 0,
    }

    # For accidentally stopped jobs ExecTime can be not set:
    # never report less wall-clock time than CPU time
    if not acData["ExecTime"] or acData["ExecTime"] < acData["CPUTime"]:
        acData["ExecTime"] = acData["CPUTime"]

    self.log.verbose("Accounting Report is:")
    self.log.verbose(acData)
    accountingReport.setValuesFromDict(acData)

    commitResult = accountingReport.commit()
    if not commitResult["OK"]:
        self.log.error(
            "Failed to send accounting report",
            "Job: %d, Error: %s" % (int(jobID), commitResult["Message"]))
    else:
        # Flag the job so that it is not accounted a second time
        self.jobDB.setJobAttribute(jobID, "AccountedFlag", "True")
    return commitResult
def export_checkComponentLog(self, component):
    """ Check component log for errors

    :param component: '*' to select every component reported by
        gComponentInstaller.getSetupComponents(), a single 'System/Name'
        string, or an iterable of such strings
    :return: S_OK( { 'System/Name': { 'ErrorsHour': int, 'ErrorsDay': int,
        'LastError': str } } ); both counters are -1 when the log file
        cannot be opened or read
    """
    componentList = []
    if '*' in component:
        # Only the bare wildcard is expanded; any other value containing '*'
        # leaves the selection empty.
        if component == '*':
            result = gComponentInstaller.getSetupComponents()
            if result['OK']:
                for ctype in ['Services', 'Agents', 'Executors']:
                    if ctype in result['Value']:
                        for sname in result['Value'][ctype]:
                            for cname in result['Value'][ctype][sname]:
                                componentList.append('/'.join([sname, cname]))
    elif isinstance(component, basestring):
        componentList = [component]
    else:
        componentList = component

    resultDict = {}
    for comp in componentList:
        # Entries that are not of the form System/Name are silently ignored
        if '/' not in comp:
            continue
        system, cname = comp.split('/')

        startDir = gComponentInstaller.startDir
        currentLog = startDir + '/' + system + '_' + cname + '/log/current'
        # open() replaces the Python-2-only file() builtin, and the context
        # manager guarantees the handle is closed even if reading fails.
        try:
            with open(currentLog, 'r') as logFile:
                logLines = logFile.readlines()
        except IOError as err:
            gLogger.error("File does not exists:", currentLog)
            resultDict[comp] = {'ErrorsHour': -1, 'ErrorsDay': -1, 'LastError': currentLog + '::' + repr(err)}
            continue

        errors_1 = 0
        errors_24 = 0
        now = dateTime()
        lastError = ''
        for line in logLines:
            if "ERROR:" in line:
                fields = line.split()
                recent = False
                if len(fields) < 2:  # if the line contains only one word
                    lastError = line.split('ERROR:')[-1].strip()
                    continue
                timeStamp = fromString(fields[0] + ' ' + fields[1])
                if not timeStamp:  # if the timestamp is missing in the log
                    lastError = line.split('ERROR:')[-1].strip()
                    continue
                if (now - timeStamp) < hour:
                    errors_1 += 1
                    recent = True
                if (now - timeStamp) < day:
                    errors_24 += 1
                    recent = True
                if recent:
                    lastError = line.split('ERROR:')[-1].strip()

        resultDict[comp] = {'ErrorsHour': errors_1, 'ErrorsDay': errors_24, 'LastError': lastError}

    return S_OK(resultDict)
def __sendAccounting( self, jobID ):
    """ Send WMS accounting data for the given job

    :param jobID: ID of the job to account
    :return: the failing getJobAttributes result, otherwise the result of the
             accounting commit
    """
    accountingReport = Job()

    result = self.jobDB.getJobAttributes( jobID )
    if not result['OK']:
        return result
    jobDict = result['Value']

    startTime, endTime = self.__checkLoggingInfo( jobID, jobDict )
    lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat( jobID, jobDict )

    # Parse the heartbeat stamp once. fromString can hand back None for a
    # stamp it cannot parse, and None must not be compared with a datetime.
    if lastHeartBeatTime:
        heartBeat = fromString( lastHeartBeatTime )
        if heartBeat is not None and heartBeat > endTime:
            endTime = heartBeat

    # A missing normalization factor is accounted as 0
    normResult = self.jobDB.getJobParameter( jobID, 'CPUNormalizationFactor' )
    if not normResult['OK'] or not normResult['Value']:
        cpuNormalization = 0.0
    else:
        cpuNormalization = float( normResult['Value'] )

    processingType = self.__getProcessingType( jobID )

    accountingReport.setStartTime( startTime )
    accountingReport.setEndTime( endTime )

    # Fill the accounting data
    acData = { 'Site' : jobDict['Site'],
               'User' : jobDict['Owner'],
               'UserGroup' : jobDict['OwnerGroup'],
               'JobGroup' : jobDict['JobGroup'],
               'JobType' : jobDict['JobType'],
               'JobClass' : jobDict['JobSplitType'],
               'ProcessingType' : processingType,
               'FinalMajorStatus' : 'Failed',
               'FinalMinorStatus' : 'Stalled',
               'CPUTime' : lastCPUTime,
               'NormCPUTime' : lastCPUTime * cpuNormalization,
               'ExecTime' : lastWallTime,
               'InputDataSize' : 0.0,
               'OutputDataSize' : 0.0,
               'InputDataFiles' : 0,
               'OutputDataFiles' : 0,
               'DiskSpace' : 0.0,
               'InputSandBoxSize' : 0.0,
               'OutputSandBoxSize' : 0.0,
               'ProcessedEvents' : 0
             }
    self.log.verbose( 'Accounting Report is:' )
    self.log.verbose( acData )
    accountingReport.setValuesFromDict( acData )

    result = accountingReport.commit()
    if result['OK']:
        # Flag the job so that it is not accounted a second time
        self.jobDB.setJobAttribute( jobID, 'AccountedFlag', 'True' )
    else:
        self.log.error( 'Failed to send accounting report', 'Job: %d, Error: %s' % ( int( jobID ), result['Message'] ) )
    return result
def execute(self):
    """ The main agent execution method

    Queries the logging DB for the most frequent ERROR/FATAL/EXCEPT messages
    newer than the configured period and mails the summary, unless the agent
    table says a report was sent more recently than that period.

    :return: S_OK / S_ERROR
    """
    limitDate = date() - self._period
    tables = ["MessageRepository", "FixedTextMessages", "Systems", "SubSystems"]
    columns = ["SystemName", "SubSystemName", "count(*) as entries", "FixedTextString"]
    cmd = "".join([
        "SELECT ", ', '.join(columns),
        " FROM ", " NATURAL JOIN ".join(tables),
        " WHERE MessageTime > '%s'" % limitDate,
        " AND LogLevel in ('ERROR','FATAL','EXCEPT')",
        " GROUP BY FixedTextID,SystemName,SubSystemName HAVING entries > %s" % self._threshold,
        " ORDER BY entries DESC LIMIT %i;" % self._limit,
    ])

    queryResult = self.systemLoggingDB._query(cmd)
    if not queryResult['OK']:
        return queryResult

    messageList = queryResult['Value']
    if messageList == 'None' or messageList == ():
        self.log.warn('The DB query returned an empty result')
        return S_OK()

    # One line per message: columns are (system, subsystem, count, text)
    entries = []
    for message in messageList:
        entries.append("Count: " + str(message[2]) + "\tError: '" + message[3]
                       + "'\tSystem: '" + message[0]
                       + "'\tSubsystem: '" + message[1] + "'\n")
    mailBody = '\n' + "".join(entries) \
        + "\n\n-------------------------------------------------------\n" \
        + "Please do not reply to this mail. It was automatically\n" \
        + "generated by a Dirac Agent.\n"

    stored = self.systemLoggingDB._getDataFromAgentTable(self.agentName)
    self.log.debug(stored)
    if not stored['OK']:
        errorString = "Could not get the date when the last mail was sent"
        self.log.error(errorString)
        return S_ERROR(errorString)

    if stored['Value']:
        # The stored value is stripped of its first and last character before parsing
        lastMailSentDate = fromString(stored['Value'][0][0][1:-1])
        self.log.debug("date value: %s" % lastMailSentDate)
    else:
        # No record yet: store a date old enough for the report to be sent now
        lastMailSentDate = limitDate - 1 * day
        insertResult = self.systemLoggingDB._insertDataIntoAgentTable(self.agentName, lastMailSentDate)
        if not insertResult['OK']:
            errorString = "Could not insert data into the DB"
            self.log.error(errorString, insertResult['Message'])
            return S_ERROR(errorString + ": " + insertResult['Message'])

    self.log.debug("limitDate: %s\t" % limitDate + "lastMailSentDate: %s\n" % lastMailSentDate)
    if lastMailSentDate > limitDate:
        self.log.info("The previous report was sent less " + " than %s days ago" % self.__days)
        return S_OK()

    dateSent = toString(date())
    self.log.info("The list with the top errors has been sent")

    insertResult = self.systemLoggingDB._insertDataIntoAgentTable(self.agentName, dateSent)
    if not insertResult['OK']:
        errorString = "Could not insert data into the DB"
        self.log.error(errorString, insertResult['Message'])
        return S_ERROR(errorString + ": " + insertResult['Message'])

    mailResult = self.notification.sendMail(self._mailAddress, self._subject, mailBody)
    if not mailResult['OK']:
        self.log.warn("The notification could not be sent")
        return S_OK()

    return S_OK("The list with the top errors has been sent")
def __sendAccounting(self, jobID):
    """ Send WMS accounting data for the given job

    :param jobID: ID of the job to account
    :return: the failing getJobAttributes result, S_ERROR("Exception") when
             collecting the timing data raised, otherwise the result of the
             accounting commit
    """
    try:
        accountingReport = Job()
        # Placeholders so that the except clause can always format both values
        endTime = 'Unknown'
        lastHeartBeatTime = 'Unknown'

        result = self.jobDB.getJobAttributes(jobID)
        if not result['OK']:
            return result
        jobDict = result['Value']

        startTime, endTime = self.__checkLoggingInfo(jobID, jobDict)
        lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat(jobID, jobDict)
        lastHeartBeatTime = fromString(lastHeartBeatTime)
        # A heartbeat later than the logged end time extends the accounted period
        if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
            endTime = lastHeartBeatTime

        # A missing normalization factor is accounted as 0
        cpuNormalization = self.jobDB.getJobParameter(jobID, 'CPUNormalizationFactor')
        if not cpuNormalization['OK'] or not cpuNormalization['Value']:
            cpuNormalization = 0.0
        else:
            cpuNormalization = float(cpuNormalization['Value'])
    except Exception:
        self.log.exception(
            "Exception in __sendAccounting for job %s: endTime=%s, lastHBTime %s" %
            (str(jobID), str(endTime), str(lastHeartBeatTime)), '', False)
        return S_ERROR("Exception")
    processingType = self.__getProcessingType(jobID)

    accountingReport.setStartTime(startTime)
    accountingReport.setEndTime(endTime)

    # Fill the accounting data
    acData = {
        'Site': jobDict['Site'],
        'User': jobDict['Owner'],
        'UserGroup': jobDict['OwnerGroup'],
        'JobGroup': jobDict['JobGroup'],
        'JobType': jobDict['JobType'],
        'JobClass': jobDict['JobSplitType'],
        'ProcessingType': processingType,
        'FinalMajorStatus': 'Failed',
        'FinalMinorStatus': 'Stalled',
        'CPUTime': lastCPUTime,
        'NormCPUTime': lastCPUTime * cpuNormalization,
        'ExecTime': lastWallTime,
        'InputDataSize': 0.0,
        'OutputDataSize': 0.0,
        'InputDataFiles': 0,
        'OutputDataFiles': 0,
        'DiskSpace': 0.0,
        'InputSandBoxSize': 0.0,
        'OutputSandBoxSize': 0.0,
        'ProcessedEvents': 0
    }

    # For accidentally stopped jobs ExecTime can be not set: never report
    # less wall-clock time than CPU time.
    if not acData['ExecTime'] or acData['ExecTime'] < acData['CPUTime']:
        acData['ExecTime'] = acData['CPUTime']

    self.log.verbose('Accounting Report is:')
    self.log.verbose(acData)
    accountingReport.setValuesFromDict(acData)

    result = accountingReport.commit()
    if result['OK']:
        # Flag the job so that it is not accounted a second time
        self.jobDB.setJobAttribute(jobID, 'AccountedFlag', 'True')
    else:
        self.log.error(
            'Failed to send accounting report',
            'Job: %d, Error: %s' % (int(jobID), result['Message']))
    return result
def __sendAccounting( self, jobID ):
    """ Build and commit the WMS accounting record of the given job

    :param jobID: ID of the job to account
    :return: the failing getJobAttributes result, S_ERROR( "Exception" ) when
             collecting the timing data raised, otherwise the result of the
             accounting commit
    """
    try:
        accountingReport = Job()
        # Placeholders so that the except clause can always format both values
        endTime = 'Unknown'
        lastHeartBeatTime = 'Unknown'

        attrResult = self.jobDB.getJobAttributes( jobID )
        if not attrResult['OK']:
            return attrResult
        jobDict = attrResult['Value']

        startTime, endTime = self.__checkLoggingInfo( jobID, jobDict )
        lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat( jobID, jobDict )
        lastHeartBeatTime = fromString( lastHeartBeatTime )
        # A heartbeat later than the logged end time extends the accounted period
        if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
            endTime = lastHeartBeatTime

        normResult = self.jobDB.getJobParameter( jobID, 'CPUNormalizationFactor' )
        if normResult['OK'] and normResult['Value']:
            cpuNormalization = float( normResult['Value'] )
        else:
            cpuNormalization = 0.0
    except Exception:
        self.log.exception( "Exception in __sendAccounting for job %s: endTime=%s, lastHBTime %s" %
                            ( str( jobID ), str( endTime ), str( lastHeartBeatTime ) ), '' , False )
        return S_ERROR( "Exception" )

    processingType = self.__getProcessingType( jobID )

    accountingReport.setStartTime( startTime )
    accountingReport.setEndTime( endTime )

    # Fill the accounting data
    acData = {
        'Site' : jobDict['Site'],
        'User' : jobDict['Owner'],
        'UserGroup' : jobDict['OwnerGroup'],
        'JobGroup' : jobDict['JobGroup'],
        'JobType' : jobDict['JobType'],
        'JobClass' : jobDict['JobSplitType'],
        'ProcessingType' : processingType,
        'FinalMajorStatus' : 'Failed',
        'FinalMinorStatus' : 'Stalled',
        'CPUTime' : lastCPUTime,
        'NormCPUTime' : lastCPUTime * cpuNormalization,
        'ExecTime' : lastWallTime,
        'InputDataSize' : 0.0,
        'OutputDataSize' : 0.0,
        'InputDataFiles' : 0,
        'OutputDataFiles' : 0,
        'DiskSpace' : 0.0,
        'InputSandBoxSize' : 0.0,
        'OutputSandBoxSize' : 0.0,
        'ProcessedEvents' : 0
    }

    # For accidentally stopped jobs ExecTime can be not set:
    # never report less wall-clock time than CPU time
    if not acData['ExecTime'] or acData['ExecTime'] < acData['CPUTime']:
        acData['ExecTime'] = acData['CPUTime']

    self.log.verbose( 'Accounting Report is:' )
    self.log.verbose( acData )
    accountingReport.setValuesFromDict( acData )

    commitResult = accountingReport.commit()
    if not commitResult['OK']:
        self.log.error( 'Failed to send accounting report',
                        'Job: %d, Error: %s' % ( int( jobID ), commitResult['Message'] ) )
    else:
        # Flag the job so that it is not accounted a second time
        self.jobDB.setJobAttribute( jobID, 'AccountedFlag', 'True' )
    return commitResult
def sendAccounting( self, jobID ):
    """Send WMS accounting data for the given job

    The start time is the job StartExecTime, falling back to the first
    'Running' logging record and then to the SubmissionTime. The end time is
    the 'Stalled' logging record, falling back to the current time.

    :param jobID: ID of the job to account
    :return: the failing getJobAttributes result, otherwise the result of the
             accounting commit
    """
    accountingReport = Job()

    result = self.jobDB.getJobAttributes( jobID )
    if not result['OK']:
        return result
    jobDict = result['Value']

    result = self.logDB.getJobLoggingInfo( jobID )
    logList = result['Value'] if result['OK'] else []

    startTime = jobDict['StartExecTime']
    endTime = ''

    if not startTime or startTime == 'None':
        # Logging records are ( status, minor, app, stime, source )
        for status, minor, app, stime, source in logList:
            if status == 'Running':
                startTime = stime
                break
        for status, minor, app, stime, source in logList:
            if status == 'Stalled':
                endTime = stime
        if not startTime or startTime == 'None':
            startTime = jobDict['SubmissionTime']

    if isinstance( startTime, types.StringTypes ):
        startTime = fromString( startTime )

    result = self.logDB.getJobLoggingInfo( jobID )
    if not result['OK']:
        endTime = dateTime()
    else:
        for status, minor, app, stime, source in result['Value']:
            if status == 'Stalled':
                endTime = stime
                break
    if not endTime:
        endTime = dateTime()
    if isinstance( endTime, types.StringTypes ):
        endTime = fromString( endTime )
    if endTime is None:
        # The stamp could not be parsed: fall back to the current time
        endTime = dateTime()

    # Keep the largest CPU and wall-clock values reported by the heartbeats
    result = self.jobDB.getHeartBeatData( jobID )
    lastCPUTime = 0
    lastWallTime = 0
    if result['OK']:
        for name, value, _heartBeatTime in result['Value']:
            if name not in ( 'CPUConsumed', 'WallClockTime' ):
                continue
            try:
                value = int( float( value ) )
            except ( TypeError, ValueError, OverflowError ):
                # Unparsable counters are skipped on purpose
                continue
            if name == 'CPUConsumed':
                lastCPUTime = max( lastCPUTime, value )
            else:
                lastWallTime = max( lastWallTime, value )

    accountingReport.setStartTime( startTime )
    # Pass the end time worked out above; it used to be computed and then dropped
    accountingReport.setEndTime( endTime )

    # Fill the accounting data
    acData = { 'Site' : jobDict['Site'],
               'User' : jobDict['Owner'],
               'UserGroup' : jobDict['OwnerGroup'],
               'JobGroup' : jobDict['JobGroup'],
               'JobType' : jobDict['JobType'],
               'JobClass' : jobDict['JobSplitType'],
               'ProcessingType' : 'unknown',
               'FinalMajorStatus' : 'Failed',
               'FinalMinorStatus' : 'Stalled',
               'CPUTime' : lastCPUTime,
               'NormCPUTime' : 0.0,
               'ExecTime' : lastWallTime,
               'InputDataSize' : 0.0,
               'OutputDataSize' : 0.0,
               'InputDataFiles' : 0,
               'OutputDataFiles' : 0,
               'DiskSpace' : 0.0,
               'InputSandBoxSize' : 0.0,
               'OutputSandBoxSize' : 0.0,
               'ProcessedEvents' : 0
             }
    self.log.verbose( 'Accounting Report is:' )
    self.log.verbose( acData )
    accountingReport.setValuesFromDict( acData )

    result = accountingReport.commit()
    if result['OK']:
        # Flag the job so that it is not accounted a second time
        self.jobDB.setJobAttribute( jobID, 'AccountedFlag', 'True' )
    else:
        self.log.warn( 'Failed to send accounting report for job %d' % int( jobID ) )
        self.log.error( result['Message'] )
    return result
def checkJob(self, job, classAdJob):
    """This method controls the checking of the job.

    The job is put on hold (S_OK returned without further processing) while
    its rescheduling delay has not elapsed, or when the eligible sites are
    banned or not active. Otherwise it is either sent to the task queue or,
    when staging of the input data is required, a staging request is set.

    :param job: job identifier
    :param classAdJob: job description; read here for the 'JobType' attribute
                       and handed over to the helper methods
    :return: S_OK / S_ERROR structure
    """
    self.log.verbose('Job %s will be processed' % (job))

    # Check if the job was recently rescheduled
    result = self.jobDB.getJobAttributes(
        job, ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not result['OK']:
        self.log.error(result['Message'])
        return S_ERROR('Can not get job attributes from JobDB')
    jobDict = result['Value']
    reCounter = int(jobDict['RescheduleCounter'])
    if reCounter != 0:
        reTime = fromString(jobDict['RescheduleTime'])
        delta = toEpoch() - toEpoch(reTime)
        # The n-th reschedule uses the n-th configured delay; beyond the end
        # of the list the maximum delay applies
        delay = self.maxRescheduleDelay
        if reCounter <= len(self.rescheduleDelaysList):
            delay = self.rescheduleDelaysList[reCounter - 1]
        if delta < delay:
            # Set the on-hold status only if it is not already there
            if jobDict['ApplicationStatus'].find(
                    'On Hold: after rescheduling') == -1:
                result = self.jobDB.setJobStatus(
                    job,
                    application='On Hold: after rescheduling #%d' % reCounter)
            return S_OK()

    # First, get Site and BannedSites from the Job
    result = self.__getJobSiteRequirement(job, classAdJob)
    userBannedSites = result['BannedSites']
    userSites = result['Sites']

    if userSites:
        # Remove the sites banned by the user from the sites the user asked for
        userSites = applySiteRequirements(userSites, [], userBannedSites)
        if not userSites:
            msg = 'Impossible Site Requirement'
            return S_ERROR(msg)

    # Second, get the Active and Banned sites from the WMS
    wmsSites = self.jobDB.getSiteMask('Active')
    wmsBannedSites = self.jobDB.getSiteMask('Banned')
    if not (wmsSites['OK'] and wmsBannedSites['OK']):
        if not wmsSites['OK']:
            self.log.error(wmsSites['Message'])
        if not wmsBannedSites['OK']:
            self.log.error(wmsBannedSites['Message'])
        return S_ERROR('Can not get Active and Banned Sites from JobDB')

    wmsSites = wmsSites['Value']
    wmsBannedSites = wmsBannedSites['Value']

    if userSites:
        sites = applySiteRequirements(userSites, wmsSites, wmsBannedSites)
        if not sites:
            # Put on Hold only non-excluded job types
            jobType = classAdJob.getAttributeString('JobType')
            if not jobType in self.excludedOnHoldJobTypes:
                msg = 'On Hold: Requested site is Banned or not Active'
                self.log.info(msg)
                result = self.jobDB.setJobStatus(job, application=msg)
                return S_OK()

    # Third, check if there is input data
    result = self.jobDB.getInputData(job)
    if not result['OK']:
        self.log.warn('Failed to get input data from JobDB for %s' % (job))
        self.log.error(result['Message'])
        return S_ERROR('Failed to get input data from JobDB')

    if not result['Value']:
        return self.__sendJobToTaskQueue(job, classAdJob, userSites,
                                         userBannedSites)

    # Keep only the non-empty LFNs
    hasInputData = False
    inputData = []
    for lfn in result['Value']:
        if lfn:
            inputData.append(lfn)
            hasInputData = True

    if not hasInputData:
        # With no input data requirement, job can proceed directly to task queue
        self.log.verbose('Job %s has no input data requirement' % (job))
        return self.__sendJobToTaskQueue(job, classAdJob, userSites,
                                         userBannedSites)

    self.log.verbose('Job %s has an input data requirement ' % (job))

    # Fourth, Check all optimizer information
    result = self.__checkOptimizerInfo(job)
    if not result['OK']:
        return result

    optInfo = result['Value']

    # Compare site candidates with current mask
    optSites = optInfo['SiteCandidates'].keys()
    self.log.info('Input Data Site Candidates: %s' % (', '.join(optSites)))
    # Check that it is compatible with user requirements
    optSites = applySiteRequirements(optSites, userSites, userBannedSites)
    if not optSites:
        msg = 'Impossible Site + InputData Requirement'
        return S_ERROR(msg)

    sites = applySiteRequirements(optSites, wmsSites, wmsBannedSites)
    if not sites:
        msg = 'On Hold: InputData Site is Banned or not Active'
        self.log.info(msg)
        result = self.jobDB.setJobStatus(job, application=msg)
        return S_OK()

    # Set stager request as necessary, optimize for smallest #files on tape if
    # more than one site candidate left at this point
    checkStaging = self.__resolveSitesForStaging(job, sites, inputData,
                                                 optInfo['SiteCandidates'])
    if not checkStaging['OK']:
        return checkStaging

    destinationSites = checkStaging['SiteCandidates']
    if not destinationSites:
        return S_ERROR('No destination sites available')

    stagingFlag = checkStaging['Value']
    if stagingFlag:
        # Single site candidate chosen and staging required
        self.log.verbose('Job %s requires staging of input data' % (job))
        # set all LFN to disk for the selected site
        stagingSite = destinationSites[0]
        siteDict = optInfo['SiteCandidates'][stagingSite]
        siteDict['disk'] = siteDict['disk'] + siteDict['tape']
        siteDict['tape'] = 0

        optInfo['SiteCandidates'][stagingSite] = siteDict
        self.log.verbose(
            'Updating %s Optimizer Info for Job %s:' %
            (self.dataAgentName, job), optInfo)
        result = self.setOptimizerJobInfo(job, self.dataAgentName, optInfo)
        if not result['OK']:
            return result

        # Site is selected for staging, report it
        self.log.verbose('Staging site candidate for job %s is %s' %
                         (job, stagingSite))

        result = self.__getStagingSites(stagingSite, destinationSites)
        if not result['OK']:
            stagingSites = [stagingSite]
        else:
            stagingSites = result['Value']

        # The job 'Site' attribute is the staging site itself, the name of
        # the site group, or 'Multiple' when no group name is available
        if len(stagingSites) == 1:
            self.jobDB.setJobAttribute(job, 'Site', stagingSite)
        else:
            # Get the name of the site group
            result = self.__getSiteGroup(stagingSites)
            if result['OK']:
                groupName = result['Value']
                if groupName:
                    self.jobDB.setJobAttribute(job, 'Site', groupName)
                else:
                    self.jobDB.setJobAttribute(job, 'Site', 'Multiple')
            else:
                self.jobDB.setJobAttribute(job, 'Site', 'Multiple')

        stagerDict = self.__setStagingRequest(job, stagingSite, optInfo)
        if not stagerDict['OK']:
            return stagerDict
        self.__updateOtherSites(job, stagingSite, stagerDict['Value'],
                                optInfo)
        return S_OK()
    else:
        # No staging required, can proceed to task queue agent and then waiting status
        self.log.verbose('Job %s does not require staging of input data' %
                         (job))
    # Finally send job to TaskQueueAgent
    return self.__sendJobToTaskQueue(job, classAdJob, destinationSites,
                                     userBannedSites)
def checkJob(self, job, classAdJob):
    """Check a job and send it to the task queue, to staging, or put it on hold.

    The checks run in this order: reschedule delay, user site requirements,
    WMS site mask, presence of input data and, for jobs with input data, the
    optimizer information that decides whether staging is needed.

    :param job: job identifier passed to the JobDB and helper calls
    :param classAdJob: job description; queried for ``JobType`` and handed on
                       to the site-requirement and task-queue helpers
    :return: S_OK() when the job was put on hold or a staging request was
             set, S_ERROR() on failure, otherwise the result of the helper
             that took over the job
    """
    self.log.verbose("Job %s will be processed" % (job))

    # Check if the job was recently rescheduled
    result = self.jobDB.getJobAttributes(job, ["RescheduleCounter", "RescheduleTime", "ApplicationStatus"])
    if not result["OK"]:
        self.log.error(result["Message"])
        return S_ERROR("Can not get job attributes from JobDB")
    jobDict = result["Value"]
    reCounter = int(jobDict["RescheduleCounter"])
    if reCounter != 0:
        reTime = fromString(jobDict["RescheduleTime"])
        delta = toEpoch() - toEpoch(reTime)
        # Counters beyond the configured list fall back to the maximum delay
        delay = self.maxRescheduleDelay
        if reCounter <= len(self.rescheduleDelaysList):
            delay = self.rescheduleDelaysList[reCounter - 1]
        if delta < delay:
            # Only update the status when it does not already say so
            if "On Hold: after rescheduling" not in jobDict["ApplicationStatus"]:
                self.jobDB.setJobStatus(job, application="On Hold: after rescheduling #%d" % reCounter)
            return S_OK()

    # First, get Site and BannedSites from the Job
    result = self.__getJobSiteRequirement(job, classAdJob)
    userBannedSites = result["BannedSites"]
    userSites = result["Sites"]

    if userSites:
        userSites = applySiteRequirements(userSites, [], userBannedSites)
        if not userSites:
            return S_ERROR("Impossible Site Requirement")

    # Second, get the Active and Banned sites from the WMS
    wmsSites = self.jobDB.getSiteMask("Active")
    wmsBannedSites = self.jobDB.getSiteMask("Banned")
    if not (wmsSites["OK"] and wmsBannedSites["OK"]):
        if not wmsSites["OK"]:
            self.log.error(wmsSites["Message"])
        if not wmsBannedSites["OK"]:
            self.log.error(wmsBannedSites["Message"])
        return S_ERROR("Can not get Active and Banned Sites from JobDB")

    wmsSites = wmsSites["Value"]
    wmsBannedSites = wmsBannedSites["Value"]

    if userSites:
        sites = applySiteRequirements(userSites, wmsSites, wmsBannedSites)
        if not sites:
            # Put on Hold only non-excluded job types
            jobType = classAdJob.getAttributeString("JobType")
            if jobType not in self.excludedOnHoldJobTypes:
                msg = "On Hold: Requested site is Banned or not Active"
                self.log.info(msg)
                self.jobDB.setJobStatus(job, application=msg)
                return S_OK()

    # Third, check if there is input data
    result = self.jobDB.getInputData(job)
    if not result["OK"]:
        self.log.warn("Failed to get input data from JobDB for %s" % (job))
        self.log.error(result["Message"])
        return S_ERROR("Failed to get input data from JobDB")

    if not result["Value"]:
        return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)

    # Empty entries do not count as an input data requirement
    inputData = [lfn for lfn in result["Value"] if lfn]
    if not inputData:
        # With no input data requirement, job can proceed directly to task queue
        self.log.verbose("Job %s has no input data requirement" % (job))
        return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)

    self.log.verbose("Job %s has an input data requirement " % (job))

    # Fourth, Check all optimizer information
    result = self.__checkOptimizerInfo(job)
    if not result["OK"]:
        return result

    optInfo = result["Value"]

    # Compare site candidates with current mask.
    # A real list (not a dict view) is handed to the filtering helper.
    optSites = list(optInfo["SiteCandidates"])
    self.log.info("Input Data Site Candidates: %s" % (", ".join(optSites)))
    # Check that it is compatible with user requirements
    optSites = applySiteRequirements(optSites, userSites, userBannedSites)
    if not optSites:
        return S_ERROR("Impossible Site + InputData Requirement")

    sites = applySiteRequirements(optSites, wmsSites, wmsBannedSites)
    if not sites:
        msg = "On Hold: InputData Site is Banned or not Active"
        self.log.info(msg)
        self.jobDB.setJobStatus(job, application=msg)
        return S_OK()

    # Set stager request as necessary, optimize for smallest #files on tape if
    # more than one site candidate left at this point
    checkStaging = self.__resolveSitesForStaging(job, sites, inputData, optInfo["SiteCandidates"])
    if not checkStaging["OK"]:
        return checkStaging

    destinationSites = checkStaging["SiteCandidates"]
    if not destinationSites:
        return S_ERROR("No destination sites available")

    if checkStaging["Value"]:
        # Single site candidate chosen and staging required
        self.log.verbose("Job %s requires staging of input data" % (job))
        # set all LFN to disk for the selected site
        stagingSite = destinationSites[0]
        siteDict = optInfo["SiteCandidates"][stagingSite]
        siteDict["disk"] = siteDict["disk"] + siteDict["tape"]
        siteDict["tape"] = 0
        optInfo["SiteCandidates"][stagingSite] = siteDict
        result = self.setOptimizerJobInfo(job, self.dataAgentName, optInfo)
        if not result["OK"]:
            return result

        # Site is selected for staging, report it
        self.log.verbose("Staging site candidate for job %s is %s" % (job, stagingSite))
        siteName = stagingSite if len(destinationSites) == 1 else "Multiple"
        self.jobDB.setJobAttribute(job, "Site", siteName)

        stagerDict = self.__setStagingRequest(job, stagingSite, optInfo)
        if not stagerDict["OK"]:
            return stagerDict
        self.__updateOtherSites(job, stagingSite, stagerDict["Value"], optInfo)
        return S_OK()

    # No staging required, can proceed to task queue agent and then waiting status
    self.log.verbose("Job %s does not require staging of input data" % (job))
    # Finally send job to TaskQueueAgent
    return self.__sendJobToTaskQueue(job, classAdJob, destinationSites, userBannedSites)
def optimizeJob(self, jid, jobState):
    """Decide whether a job goes to the task queue, to staging, or on hold.

    1. Banned sites are removed from the destination list.
    2. Get input files
    3. Production jobs are sent directly to TQ
    4. Check if staging is necessary

    :param jid: job identifier (not used by the body itself)
    :param jobState: job state object used to read attributes and input data
    :return: S_OK()/S_ERROR() structure, or the result of the helper that
             finally handled the job (hold, task queue, site assignment)
    """
    # Reschedule delay
    result = jobState.getAttributes(['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not result['OK']:
        return result
    attDict = result['Value']
    try:
        reschedules = int(attDict['RescheduleCounter'])
    except (ValueError, TypeError, KeyError):
        # TypeError covers a counter stored as None
        return S_ERROR("RescheduleCounter has to be an integer")
    if reschedules != 0:
        delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
        # NOTE(review): the list is indexed with the counter itself, not
        # counter - 1, so for positive counters the first entry is only used
        # when the list has a single element - confirm this is intended.
        delay = delays[min(reschedules, len(delays) - 1)]
        waited = toEpoch() - toEpoch(fromString(attDict['RescheduleTime']))
        if waited < delay:
            return self.__holdJob(jobState, 'On Hold: after rescheduling %s' % reschedules, delay)

    # Get site requirements
    result = self.__getSitesRequired(jobState)
    if not result['OK']:
        return result
    userSites, userBannedSites = result['Value']

    # Get job type
    result = jobState.getAttribute("JobType")
    if not result['OK']:
        return S_ERROR("Could not retrieve job type")
    jobType = result['Value']

    # Get banned sites from DIRAC
    result = self.__jobDB.getSiteMask('Banned')
    if not result['OK']:
        return S_ERROR("Cannot retrieve banned sites from JobDB")
    wmsBannedSites = result['Value']

    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites and jobType not in self.ex_getOption('ExcludedOnHoldJobTypes', []):
        result = self.__jobDB.getUserSitesTuple(userSites)
        if not result['OK']:
            return S_ERROR("Problem checking userSites for tuple of active/banned/invalid sites")
        userSites, bannedSites, invalidSites = result['Value']
        if invalidSites:
            self.jobLog.debug("Invalid site(s) requested: %s" % ','.join(invalidSites))
            if not self.ex_getOption('AllowInvalidSites', True):
                return self.__holdJob(jobState, "Requested site(s) %s are invalid" % ",".join(invalidSites))
        if bannedSites:
            self.jobLog.debug("Banned site(s) %s ignored" % ",".join(bannedSites))
            if not userSites:
                return self.__holdJob(jobState, "Requested site(s) %s are inactive" % ",".join(bannedSites))

        if not userSites:
            return self.__holdJob(jobState, "No requested site(s) are active/valid")
        userSites = list(userSites)

    # Check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
        self.jobLog.error("Cannot get input data %s" % (result['Message']))
        return S_ERROR("Failed to get input data from JobDB")

    if not result['Value']:
        # No input data? Just send to TQ
        return self.__sendToTQ(jobState, userSites, userBannedSites)

    self.jobLog.verbose("Has an input data requirement")
    inputData = result['Value']

    # Production jobs are sent to TQ, but first we have to verify if staging is necessary
    if jobType in Operations().getValue('Transformations/DataProcessing', []):
        self.jobLog.info("Production job: sending to TQ, but first checking if staging is requested")

        userName = jobState.getAttribute('Owner')
        if not userName['OK']:
            return userName
        userName = userName['Value']

        userGroup = jobState.getAttribute('OwnerGroup')
        if not userGroup['OK']:
            return userGroup
        userGroup = userGroup['Value']

        res = getFilesToStage(inputData,
                              proxyUserName=userName,
                              proxyUserGroup=userGroup)  # pylint: disable=unexpected-keyword-arg
        if not res['OK']:
            return self.__holdJob(jobState, res['Message'])

        stageLFNs = res['Value']['offlineLFNs']
        if not stageLFNs:
            return self.__sendToTQ(jobState, userSites, userBannedSites)

        res = self.__checkStageAllowed(jobState)
        if not res['OK']:
            return res
        if not res['Value']:
            return S_ERROR("Stage not allowed")
        # Propagate a failed stage request instead of reporting success,
        # as the user-job branch below does
        res = self.__requestStaging(jobState, stageLFNs)
        if not res['OK']:
            return res
        return S_OK()

    # From now on we know it's a user job with input data
    idAgent = self.ex_getOption('InputDataAgent', 'InputData')
    result = self.retrieveOptimizerParam(idAgent)
    if not result['OK']:
        self.jobLog.error("Could not retrieve input data info", result['Message'])
        return S_ERROR("Could not retrieve input data info")
    opData = result['Value']

    if 'SiteCandidates' not in opData:
        return S_ERROR("No possible site candidates")

    # Filter input data sites with user requirement
    siteCandidates = list(opData['SiteCandidates'])
    self.jobLog.info("Site candidates are %s" % siteCandidates)

    if userSites:
        siteCandidates = list(set(siteCandidates) & set(userSites))

    siteCandidates = self._applySiteFilter(siteCandidates, banned=userBannedSites)
    if not siteCandidates:
        return S_ERROR("Impossible InputData * Site requirements")

    idSites = {}
    for site in siteCandidates:
        idSites[site] = opData['SiteCandidates'][site]

    # Check if sites have correct count of disk+tape replicas
    numData = len(inputData)
    errorSites = set()
    for site in idSites:
        if numData != idSites[site]['disk'] + idSites[site]['tape']:
            self.jobLog.error("Site candidate %s does not have all the input data" % site)
            errorSites.add(site)
    # Removal is done in a second pass so the dict is not modified while iterating
    for site in errorSites:
        idSites.pop(site)
    if not idSites:
        return S_ERROR("Site candidates do not have all the input data")

    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging(jobState, inputData, idSites)
    if not siteCandidates:
        return S_ERROR("No destination sites available")

    # Is any site active?
    stageSites = self._applySiteFilter(siteCandidates, banned=wmsBannedSites)
    if not stageSites:
        return self.__holdJob(jobState, "Sites %s are inactive or banned" % ", ".join(siteCandidates))

    # If no staging is required send to TQ
    if not stageRequired:
        # Use siteCandidates and not stageSites because active and banned sites
        # will be taken into account on matching time
        return self.__sendToTQ(jobState, siteCandidates, userBannedSites)

    # Check if the user is allowed to stage
    if self.ex_getOption("RestrictDataStage", False):
        res = self.__checkStageAllowed(jobState)
        if not res['OK']:
            return res
        if not res['Value']:
            return S_ERROR("Stage not allowed")

    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose(" Staging site will be %s" % (stageSite))
    stageData = idSites[stageSite]
    # Set as if everything has already been staged
    stageData['disk'] += stageData['tape']
    stageData['tape'] = 0
    # Set the site info back to the original dict to save afterwards
    opData['SiteCandidates'][stageSite] = stageData

    stageRequest = self.__preRequestStaging(jobState, stageSite, opData)
    if not stageRequest['OK']:
        return stageRequest
    stageLFNs = stageRequest['Value']
    result = self.__requestStaging(jobState, stageLFNs)
    if not result['OK']:
        return result
    stageLFNs = result['Value']
    self.__updateSharedSESites(jobState, stageSite, stageLFNs, opData)
    # Save the optimizer data again
    self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
    result = self.storeOptimizerParam(idAgent, opData)
    if not result['OK']:
        return result

    return self.__setJobSite(jobState, stageSites)
def checkJob(self, job, classAdJob):
    """Check a job and send it to the task queue, to staging, or put it on hold.

    The checks run in this order: reschedule delay, user site requirements,
    WMS site mask, presence of input data and, for jobs with input data, the
    optimizer information that decides whether staging is needed.

    :param job: job identifier passed to the JobDB and helper calls
    :param classAdJob: job description; queried for ``JobType`` and handed on
                       to the site-requirement and task-queue helpers
    :return: S_OK() when the job was put on hold or a staging request was
             set, S_ERROR() on failure, otherwise the result of the helper
             that took over the job
    """
    self.log.verbose('Job %s will be processed' % (job))

    # Check if the job was recently rescheduled
    result = self.jobDB.getJobAttributes(job, ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not result['OK']:
        self.log.error(result['Message'])
        return S_ERROR('Can not get job attributes from JobDB')
    jobDict = result['Value']
    reCounter = int(jobDict['RescheduleCounter'])
    if reCounter != 0:
        reTime = fromString(jobDict['RescheduleTime'])
        delta = toEpoch() - toEpoch(reTime)
        # Counters beyond the configured list fall back to the maximum delay
        delay = self.maxRescheduleDelay
        if reCounter <= len(self.rescheduleDelaysList):
            delay = self.rescheduleDelaysList[reCounter - 1]
        if delta < delay:
            # Only update the status when it does not already say so
            if 'On Hold: after rescheduling' not in jobDict['ApplicationStatus']:
                self.jobDB.setJobStatus(job, application='On Hold: after rescheduling #%d' % reCounter)
            return S_OK()

    # First, get Site and BannedSites from the Job
    result = self.__getJobSiteRequirement(job, classAdJob)
    userBannedSites = result['BannedSites']
    userSites = result['Sites']

    if userSites:
        userSites = applySiteRequirements(userSites, [], userBannedSites)
        if not userSites:
            return S_ERROR('Impossible Site Requirement')

    # Second, get the Active and Banned sites from the WMS
    wmsSites = self.jobDB.getSiteMask('Active')
    wmsBannedSites = self.jobDB.getSiteMask('Banned')
    if not (wmsSites['OK'] and wmsBannedSites['OK']):
        if not wmsSites['OK']:
            self.log.error(wmsSites['Message'])
        if not wmsBannedSites['OK']:
            self.log.error(wmsBannedSites['Message'])
        return S_ERROR('Can not get Active and Banned Sites from JobDB')

    wmsSites = wmsSites['Value']
    wmsBannedSites = wmsBannedSites['Value']

    if userSites:
        sites = applySiteRequirements(userSites, wmsSites, wmsBannedSites)
        if not sites:
            # Put on Hold only non-excluded job types
            jobType = classAdJob.getAttributeString('JobType')
            if jobType not in self.excludedOnHoldJobTypes:
                msg = 'On Hold: Requested site is Banned or not Active'
                self.log.info(msg)
                self.jobDB.setJobStatus(job, application=msg)
                return S_OK()

    # Third, check if there is input data
    result = self.jobDB.getInputData(job)
    if not result['OK']:
        self.log.warn('Failed to get input data from JobDB for %s' % (job))
        self.log.error(result['Message'])
        return S_ERROR('Failed to get input data from JobDB')

    if not result['Value']:
        return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)

    # Empty entries do not count as an input data requirement
    inputData = [lfn for lfn in result['Value'] if lfn]
    if not inputData:
        # With no input data requirement, job can proceed directly to task queue
        self.log.verbose('Job %s has no input data requirement' % (job))
        return self.__sendJobToTaskQueue(job, classAdJob, userSites, userBannedSites)

    self.log.verbose('Job %s has an input data requirement ' % (job))

    # Fourth, Check all optimizer information
    result = self.__checkOptimizerInfo(job)
    if not result['OK']:
        return result

    optInfo = result['Value']

    # Compare site candidates with current mask.
    # A real list (not a dict view) is handed to the filtering helper.
    optSites = list(optInfo['SiteCandidates'])
    self.log.info('Input Data Site Candidates: %s' % (', '.join(optSites)))
    # Check that it is compatible with user requirements
    optSites = applySiteRequirements(optSites, userSites, userBannedSites)
    if not optSites:
        return S_ERROR('Impossible Site + InputData Requirement')

    sites = applySiteRequirements(optSites, wmsSites, wmsBannedSites)
    if not sites:
        msg = 'On Hold: InputData Site is Banned or not Active'
        self.log.info(msg)
        self.jobDB.setJobStatus(job, application=msg)
        return S_OK()

    # Set stager request as necessary, optimize for smallest #files on tape if
    # more than one site candidate left at this point
    checkStaging = self.__resolveSitesForStaging(job, sites, inputData, optInfo['SiteCandidates'])
    if not checkStaging['OK']:
        return checkStaging

    destinationSites = checkStaging['SiteCandidates']
    if not destinationSites:
        return S_ERROR('No destination sites available')

    if checkStaging['Value']:
        # Single site candidate chosen and staging required
        self.log.verbose('Job %s requires staging of input data' % (job))
        # set all LFN to disk for the selected site
        stagingSite = destinationSites[0]
        siteDict = optInfo['SiteCandidates'][stagingSite]
        siteDict['disk'] = siteDict['disk'] + siteDict['tape']
        siteDict['tape'] = 0
        optInfo['SiteCandidates'][stagingSite] = siteDict
        self.log.verbose('Updating %s Optimizer Info for Job %s:' % (self.dataAgentName, job), optInfo)
        result = self.setOptimizerJobInfo(job, self.dataAgentName, optInfo)
        if not result['OK']:
            return result

        # Site is selected for staging, report it
        self.log.verbose('Staging site candidate for job %s is %s' % (job, stagingSite))

        result = self.__getStagingSites(stagingSite, destinationSites)
        stagingSites = result['Value'] if result['OK'] else [stagingSite]

        if len(stagingSites) == 1:
            siteName = stagingSite
        else:
            # Get the name of the site group; fall back to 'Multiple' when
            # the lookup fails or yields no name
            siteName = 'Multiple'
            result = self.__getSiteGroup(stagingSites)
            if result['OK'] and result['Value']:
                siteName = result['Value']
        self.jobDB.setJobAttribute(job, 'Site', siteName)

        stagerDict = self.__setStagingRequest(job, stagingSite, optInfo)
        if not stagerDict['OK']:
            return stagerDict
        self.__updateOtherSites(job, stagingSite, stagerDict['Value'], optInfo)
        return S_OK()

    # No staging required, can proceed to task queue agent and then waiting status
    self.log.verbose('Job %s does not require staging of input data' % (job))
    # Finally send job to TaskQueueAgent
    return self.__sendJobToTaskQueue(job, classAdJob, destinationSites, userBannedSites)
def __sendAccounting(self, jobID):
    """Send WMS accounting data for the given job

    The report is filled with the final status ``Failed``/``Stalled`` and,
    once committed successfully, the job's ``AccountedFlag`` is set.

    :param jobID: identifier of the job to account for
    :return: the failed JobDB result if the job attributes cannot be read,
             S_ERROR("Exception") if gathering the data raises, otherwise
             the result of committing the accounting report
    """
    # Placeholders used by the exception message below. They are set before
    # anything can raise so the handler never hits an unbound local.
    endTime = "Unknown"
    lastHeartBeatTime = "Unknown"
    try:
        accountingReport = Job()

        result = self.jobDB.getJobAttributes(jobID)
        if not result["OK"]:
            return result
        jobDict = result["Value"]

        startTime, endTime = self.__checkLoggingInfo(jobID, jobDict)
        lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat(jobID, jobDict)
        lastHeartBeatTime = fromString(lastHeartBeatTime)
        # A heartbeat later than the logged end time extends the job's end
        if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
            endTime = lastHeartBeatTime

        cpuNormalization = self.jobDB.getJobParameter(jobID, "CPUNormalizationFactor")
        if cpuNormalization["OK"] and cpuNormalization["Value"]:
            cpuNormalization = float(cpuNormalization["Value"])
        else:
            cpuNormalization = 0.0

    except Exception:
        self.log.exception(
            "Exception in __sendAccounting for job %s: endTime=%s, lastHBTime %s"
            % (str(jobID), str(endTime), str(lastHeartBeatTime)),
            "",
            False,
        )
        return S_ERROR("Exception")
    processingType = self.__getProcessingType(jobID)

    accountingReport.setStartTime(startTime)
    accountingReport.setEndTime(endTime)
    # Fill the accounting data
    acData = {
        "Site": jobDict["Site"],
        "User": jobDict["Owner"],
        "UserGroup": jobDict["OwnerGroup"],
        "JobGroup": jobDict["JobGroup"],
        "JobType": jobDict["JobType"],
        "JobClass": jobDict["JobSplitType"],
        "ProcessingType": processingType,
        "FinalMajorStatus": "Failed",
        "FinalMinorStatus": "Stalled",
        "CPUTime": lastCPUTime,
        "NormCPUTime": lastCPUTime * cpuNormalization,
        "ExecTime": lastWallTime,
        "InputDataSize": 0.0,
        "OutputDataSize": 0.0,
        "InputDataFiles": 0,
        "OutputDataFiles": 0,
        "DiskSpace": 0.0,
        "InputSandBoxSize": 0.0,
        "OutputSandBoxSize": 0.0,
        "ProcessedEvents": 0,
    }

    # For accidentally stopped jobs ExecTime can be not set; in any case it
    # is never reported lower than the CPU time
    if not acData["ExecTime"] or acData["ExecTime"] < acData["CPUTime"]:
        acData["ExecTime"] = acData["CPUTime"]

    self.log.verbose("Accounting Report is:")
    self.log.verbose(acData)
    accountingReport.setValuesFromDict(acData)

    result = accountingReport.commit()
    if result["OK"]:
        self.jobDB.setJobAttribute(jobID, "AccountedFlag", "True")
    else:
        self.log.error("Failed to send accounting report", "Job: %d, Error: %s" % (int(jobID), result["Message"]))
    return result
def __getToken2(self):
    """Get the Keystone token for the version v2 of the keystone service

    Authentication uses the ``User``/``Password`` parameters when both are
    set, otherwise VOMS when ``Auth`` is ``"voms"`` (with the optional
    ``Proxy`` parameter passed as client certificate). On success the token,
    its expiration time, the project ID and the compute/image/network
    endpoint URLs are stored on the instance.

    :return: S_OK(token) or S_ERROR
    """
    user = self.parameters.get("User")
    password = self.parameters.get("Password")
    authArgs = {}
    if user and password:
        authDict = {"auth": {"passwordCredentials": {"username": user, "password": password}}}
    elif self.parameters.get("Auth") == "voms":
        authDict = {"auth": {"voms": True}}
        if self.parameters.get("Proxy"):
            authArgs["cert"] = self.parameters.get("Proxy")
    else:
        # Without this branch the request below would fail on an unbound
        # authDict and report a misleading "Exception getting keystone token"
        return S_ERROR("No credentials configured for keystone authentication")
    if self.project:
        authDict["auth"]["tenantName"] = self.project

    try:
        result = requests.post(
            "%s/tokens" % self.url,
            headers={"Content-Type": "application/json"},
            json=authDict,
            verify=self.caPath,
            **authArgs
        )
    except Exception as exc:
        return S_ERROR("Exception getting keystone token: %s" % str(exc))

    try:
        output = result.json()
    except ValueError as exc:
        # The body is not JSON (e.g. an HTML error page from a proxy)
        return S_ERROR("Invalid keystone response (status %s): %s" % (result.status_code, str(exc)))

    if result.status_code in [400, 401]:
        message = "None"
        if "error" in output:
            message = output["error"].get("message")
        return S_ERROR("Authorization error: %s" % message)

    try:
        tokenInfo = output["access"]["token"]
        serviceCatalog = output["access"]["serviceCatalog"]
        token = str(tokenInfo["id"])
        projectID = tokenInfo["tenant"]["id"]
        # Timestamps come as e.g. 2014-01-30T15:30:58Z; strip the ISO markers
        expires = fromString(str(tokenInfo["expires"]).replace("T", " ").replace("Z", ""))
        issued = fromString(str(tokenInfo["issued_at"]).replace("T", " ").replace("Z", ""))
    except (KeyError, TypeError) as exc:
        return S_ERROR("Unexpected keystone response (status %s): %s" % (result.status_code, repr(exc)))
    if expires is None or issued is None:
        return S_ERROR("Can not interpret the keystone token validity period")

    self.token = token
    # Token lifetime is applied to the local clock rather than trusting the
    # absolute server-side expiration time
    self.expires = dateTime() + (expires - issued)
    self.projectID = projectID

    urlAttributes = {"compute": "computeURL", "image": "imageURL", "network": "networkURL"}
    for endpoint in serviceCatalog:
        attribute = urlAttributes.get(endpoint["type"])
        if attribute:
            setattr(self, attribute, str(endpoint["endpoints"][0]["publicURL"]))
    return S_OK(self.token)
def optimizeJob(self, jid, jobState):
    """Decide whether a job goes to the task queue, to staging, or on hold.

    1. Banned sites are removed from the destination list.
    2. Get input files
    3. Production jobs are sent directly to TQ
    4. Check if staging is necessary

    :param jid: job identifier (not used by the body itself)
    :param jobState: job state object used to read attributes, the manifest
                     and the input data
    :return: S_OK()/S_ERROR() structure, or the result of the helper that
             finally handled the job (hold, task queue, site assignment)
    """
    # Reschedule delay
    result = jobState.getAttributes(['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not result['OK']:
        return result
    attDict = result['Value']
    try:
        reschedules = int(attDict['RescheduleCounter'])
    except (ValueError, TypeError, KeyError):
        # TypeError covers a counter stored as None
        return S_ERROR("RescheduleCounter has to be an integer")
    if reschedules != 0:
        delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
        # NOTE(review): the list is indexed with the counter itself, not
        # counter - 1, so for positive counters the first entry is only used
        # when the list has a single element - confirm this is intended.
        delay = delays[min(reschedules, len(delays) - 1)]
        waited = toEpoch() - toEpoch(fromString(attDict['RescheduleTime']))
        if waited < delay:
            return self.__holdJob(jobState, 'On Hold: after rescheduling %s' % reschedules, delay)

    # Get the job manifest for the later checks
    result = jobState.getManifest()
    if not result['OK']:
        return S_ERROR("Could not retrieve job manifest: %s" % result['Message'])
    jobManifest = result['Value']

    # Get site requirements
    result = self.__getSitesRequired(jobManifest)
    if not result['OK']:
        return result
    userSites, userBannedSites = result['Value']

    # Get job type
    result = jobState.getAttribute("JobType")
    if not result['OK']:
        return S_ERROR("Could not retrieve job type")
    jobType = result['Value']

    # Get banned sites from DIRAC
    result = self.siteClient.getSites('Banned')
    if not result['OK']:
        return S_ERROR("Cannot retrieve banned sites from JobDB")
    wmsBannedSites = result['Value']

    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites and jobType not in self.ex_getOption('ExcludedOnHoldJobTypes', []):
        result = self.siteClient.getUsableSites(userSites)
        if not result['OK']:
            return S_ERROR("Problem checking userSites for tuple of active/banned/invalid sites")
        usableSites = set(result['Value'])
        bannedSites = []
        invalidSites = []
        for site in userSites:
            if site in wmsBannedSites:
                bannedSites.append(site)
            elif site not in usableSites:
                invalidSites.append(site)

        if invalidSites:
            self.jobLog.debug("Invalid site(s) requested: %s" % ','.join(invalidSites))
            if not self.ex_getOption('AllowInvalidSites', True):
                return self.__holdJob(jobState, "Requested site(s) %s are invalid" % ",".join(invalidSites))
        if bannedSites:
            self.jobLog.debug("Banned site(s) %s ignored" % ",".join(bannedSites))
            if not usableSites:
                return self.__holdJob(jobState, "Requested site(s) %s are inactive" % ",".join(bannedSites))

        if not usableSites:
            return self.__holdJob(jobState, "No requested site(s) are active/valid")
        userSites = list(usableSites)

    checkPlatform = self.ex_getOption('CheckPlatform', False)
    jobPlatform = jobManifest.getOption("Platform", None)
    # First check that the platform is valid (in OSCompatibility list)
    if checkPlatform and jobPlatform:
        result = gConfig.getOptionsDict('/Resources/Computing/OSCompatibility')
        if not result['OK']:
            return S_ERROR("Unable to get OSCompatibility list")
        allPlatforms = result['Value']
        if jobPlatform not in allPlatforms:
            self.jobLog.error("Platform not supported", jobPlatform)
            return S_ERROR("Platform %s is not supported" % jobPlatform)

    # Filter the userSites by the platform selection (if there is one)
    if checkPlatform and userSites and jobPlatform:
        result = self.__filterByPlatform(jobPlatform, userSites)
        if not result['OK']:
            self.jobLog.error("Failed to filter job sites by platform", result['Message'])
            return S_ERROR("Failed to filter job sites by platform")
        userSites = result['Value']
        if not userSites:
            # No sites left after filtering -> Invalid platform/sites combination
            self.jobLog.error("No selected sites match platform", jobPlatform)
            return S_ERROR("No selected sites match platform '%s'" % jobPlatform)

    # Check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
        self.jobLog.error("Cannot get input data", result['Message'])
        return S_ERROR("Failed to get input data from JobDB")

    if not result['Value']:
        # No input data? Just send to TQ
        return self.__sendToTQ(jobState, jobManifest, userSites, userBannedSites)

    self.jobLog.verbose("Has an input data requirement")
    inputData = result['Value']

    # ===================================================================================
    # Production jobs are sent to TQ, but first we have to verify if staging is necessary
    # ===================================================================================
    if jobType in Operations().getValue('Transformations/DataProcessing', []):
        self.jobLog.info("Production job: sending to TQ, but first checking if staging is requested")

        res = getFilesToStage(inputData,
                              jobState=jobState,
                              checkOnlyTapeSEs=self.ex_getOption('CheckOnlyTapeSEs', True),
                              jobLog=self.jobLog)

        if not res['OK']:
            return self.__holdJob(jobState, res['Message'])
        if res['Value']['absentLFNs']:
            # Some files do not exist at all... set the job Failed
            # Reverse errors: group the LFNs by failure reason
            reasons = {}
            for lfn, reason in res['Value']['absentLFNs'].items():
                reasons.setdefault(reason, []).append(lfn)
            for reason, lfns in reasons.items():
                # Some files are missing in the FC or in SEs, fail the job
                self.jobLog.error(reason, ','.join(lfns))
            return S_ERROR(','.join(reasons))

        if res['Value']['failedLFNs']:
            return self.__holdJob(jobState, "Couldn't get storage metadata of some files")
        stageLFNs = res['Value']['offlineLFNs']
        if stageLFNs:
            res = self.__checkStageAllowed(jobState)
            if not res['OK']:
                return res
            if not res['Value']:
                return S_ERROR("Stage not allowed")
            # Propagate a failed stage request instead of reporting success,
            # as the user-job branch below does
            res = self.__requestStaging(jobState, stageLFNs)
            if not res['OK']:
                return res
            return S_OK()

        # No staging required
        onlineSites = res['Value']['onlineSites']
        if onlineSites:
            # Set the online site(s) first
            userSites = set(userSites)
            onlineSites &= userSites
            userSites = list(onlineSites) + list(userSites - onlineSites)
        return self.__sendToTQ(jobState, jobManifest, userSites, userBannedSites, onlineSites=onlineSites)

    # ===================================================
    # From now on we know it's a user job with input data
    # ===================================================

    idAgent = self.ex_getOption('InputDataAgent', 'InputData')
    result = self.retrieveOptimizerParam(idAgent)
    if not result['OK']:
        self.jobLog.error("Could not retrieve input data info", result['Message'])
        return S_ERROR("Could not retrieve input data info")
    opData = result['Value']

    if 'SiteCandidates' not in opData:
        return S_ERROR("No possible site candidates")

    # Filter input data sites with user requirement
    siteCandidates = list(opData['SiteCandidates'])
    self.jobLog.info("Site candidates are %s" % siteCandidates)

    if userSites:
        siteCandidates = list(set(siteCandidates) & set(userSites))

    siteCandidates = self._applySiteFilter(siteCandidates, banned=userBannedSites)
    if not siteCandidates:
        return S_ERROR("Impossible InputData * Site requirements")

    idSites = {}
    for site in siteCandidates:
        idSites[site] = opData['SiteCandidates'][site]

    # Check if sites have correct count of disk+tape replicas
    numData = len(inputData)
    errorSites = set()
    for site in idSites:
        if numData != idSites[site]['disk'] + idSites[site]['tape']:
            self.jobLog.error("Site candidate %s does not have all the input data" % site)
            errorSites.add(site)
    # Removal is done in a second pass so the dict is not modified while iterating
    for site in errorSites:
        idSites.pop(site)
    if not idSites:
        return S_ERROR("Site candidates do not have all the input data")

    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging(inputData, idSites)
    if not siteCandidates:
        return S_ERROR("No destination sites available")

    # Is any site active?
    stageSites = self._applySiteFilter(siteCandidates, banned=wmsBannedSites)
    if not stageSites:
        return self.__holdJob(jobState, "Sites %s are inactive or banned" % ", ".join(siteCandidates))

    # If no staging is required send to TQ
    if not stageRequired:
        # Use siteCandidates and not stageSites because active and banned sites
        # will be taken into account on matching time
        return self.__sendToTQ(jobState, jobManifest, siteCandidates, userBannedSites)

    # Check if the user is allowed to stage
    if self.ex_getOption("RestrictDataStage", False):
        res = self.__checkStageAllowed(jobState)
        if not res['OK']:
            return res
        if not res['Value']:
            return S_ERROR("Stage not allowed")

    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose(" Staging site will be %s" % (stageSite))
    stageData = idSites[stageSite]
    # Set as if everything has already been staged
    stageData['disk'] += stageData['tape']
    stageData['tape'] = 0
    # Set the site info back to the original dict to save afterwards
    opData['SiteCandidates'][stageSite] = stageData

    stageRequest = self.__preRequestStaging(jobManifest, stageSite, opData)
    if not stageRequest['OK']:
        return stageRequest
    stageLFNs = stageRequest['Value']
    result = self.__requestStaging(jobState, stageLFNs)
    if not result['OK']:
        return result
    stageLFNs = result['Value']
    self.__updateSharedSESites(jobManifest, stageSite, stageLFNs, opData)
    # Save the optimizer data again
    self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
    result = self.storeOptimizerParam(idAgent, opData)
    if not result['OK']:
        return result

    return self.__setJobSite(jobState, stageSites)
def optimizeJob(self, jid, jobState):
    """ Decide where a job can run, holding it or requesting staging when needed.

        Steps, in order:
          1. Honour the reschedule delay (hold the job if it was rescheduled too recently).
          2. Resolve the sites requested/banned by the user and the WMS site masks.
          3. Jobs without input data are sent straight to the task queue.
          4. Jobs with input data are matched against the site candidates stored by
             the input data optimizer; staging is requested if required.

        :param jid: job identifier (not used in this method)
        :param jobState: job state object, queried for attributes and input data
        :return: the result structure of the helper that finally handles the job
                 (hold, send to TQ, set job site) or an S_ERROR describing the failure
    """
    # Reschedule delay
    result = jobState.getAttributes(
        ['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not result['OK']:
        return result
    attDict = result['Value']
    try:
        reschedules = int(attDict['RescheduleCounter'])
    except (ValueError, KeyError, TypeError):
        # A missing attribute (KeyError) or a None value (TypeError) is just as
        # invalid as a non-numeric string: report it instead of raising.
        return S_ERROR("RescheduleCounter has to be an integer")
    if reschedules != 0:
        delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
        # Jobs rescheduled more often than there are entries use the last delay
        delay = delays[min(reschedules, len(delays) - 1)]
        waited = toEpoch() - toEpoch(fromString(attDict['RescheduleTime']))
        if waited < delay:
            return self.__holdJob(
                jobState, 'On Hold: after rescheduling %s' % reschedules, delay)

    # Get site requirements
    result = self._getSitesRequired(jobState)
    if not result['OK']:
        return result
    userSites, userBannedSites = result['Value']

    # Get active and banned sites from DIRAC
    result = self.__jobDB.getSiteMask('Active')
    if not result['OK']:
        return S_ERROR("Cannot retrieve active sites from JobDB")
    wmsActiveSites = result['Value']
    result = self.__jobDB.getSiteMask('Banned')
    if not result['OK']:
        return S_ERROR("Cannot retrieve banned sites from JobDB")
    wmsBannedSites = result['Value']

    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
        result = jobState.getAttribute("JobType")
        if not result['OK']:
            return S_ERROR("Could not retrieve job type")
        jobType = result['Value']
        if jobType not in self.ex_getOption('ExcludedOnHoldJobTypes', []):
            sites = self._applySiteFilter(userSites, wmsActiveSites, wmsBannedSites)
            if not sites:
                return self.__holdJob(
                    jobState,
                    "Sites %s are inactive or banned" % ", ".join(userSites))

    # Check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
        self.jobLog.error("Cannot get input data %s" % (result['Message']))
        return S_ERROR('Failed to get input data from JobDB')

    if not result['Value']:
        # No input data? Generate requirements and next
        return self.__sendToTQ(jobState, userSites, userBannedSites)

    inputData = result['Value']

    self.jobLog.verbose('Has an input data requirement')
    idAgent = self.ex_getOption('InputDataAgent', 'InputData')
    result = self.retrieveOptimizerParam(idAgent)
    if not result['OK']:
        self.jobLog.error("Could not retrieve input data info: %s" % result['Message'])
        return S_ERROR("File Catalog Access Failure")
    opData = result['Value']
    if 'SiteCandidates' not in opData:
        return S_ERROR("No possible site candidates")

    # Filter input data sites with user requirement
    siteCandidates = list(opData['SiteCandidates'])
    self.jobLog.info("Site candidates are %s" % siteCandidates)
    siteCandidates = self._applySiteFilter(siteCandidates, userSites, userBannedSites)
    if not siteCandidates:
        return S_ERROR("Impossible InputData * Site requirements")

    idSites = {}
    for site in siteCandidates:
        idSites[site] = opData['SiteCandidates'][site]

    # Check if sites have correct count of disk+tape replicas
    numData = len(inputData)
    errorSites = set()
    for site in idSites:
        if numData != idSites[site]['disk'] + idSites[site]['tape']:
            self.jobLog.error(
                "Site candidate %s does not have all the input data" % site)
            errorSites.add(site)
    # Removal is done in a second pass so the dict is not modified while iterated
    for site in errorSites:
        idSites.pop(site)
    if not idSites:
        return S_ERROR("Site candidates do not have all the input data")

    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging(
        jobState, inputData, idSites)
    if not siteCandidates:
        return S_ERROR("No destination sites available")

    # Is any site active?
    stageSites = self._applySiteFilter(siteCandidates, wmsActiveSites, wmsBannedSites)
    if not stageSites:
        return self.__holdJob(
            jobState,
            "Sites %s are inactive or banned" % ", ".join(siteCandidates))

    # If no staging is required send to TQ
    if not stageRequired:
        # Use siteCandidates and not stageSites because active and banned sites
        # will be taken into account on matching time
        return self.__sendToTQ(jobState, siteCandidates, userBannedSites)

    # Check if the user is allowed to stage
    if self.ex_getOption("RestrictDataStage", False):
        if not self.__checkStageAllowed(jobState):
            return S_ERROR("Stage not allowed")

    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose(" Staging site will be %s" % (stageSite))
    stageData = idSites[stageSite]
    # Set as if everything has already been staged
    stageData['disk'] += stageData['tape']
    stageData['tape'] = 0
    # Set the site info back to the original dict to save afterwards
    opData['SiteCandidates'][stageSite] = stageData

    result = self.__requestStaging(jobState, stageSite, opData)
    if not result['OK']:
        return result
    stageLFNs = result['Value']
    self._updateSharedSESites(stageSite, stageLFNs, opData)

    # Save the optimizer data again
    self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
    result = self.storeOptimizerParam(idAgent, opData)
    if not result['OK']:
        return result

    return self._setJobSite(jobState, stageSites)
def export_checkComponentLog(self, component):
    """ Check component log for errors

        For every selected component the ``log/current`` file below the
        component start directory is scanned for lines containing ``ERROR:``.

        :param component: either '*' (all Services, Agents and Executors returned
                          by gComponentInstaller.getSetupComponents()), a single
                          'System/Component' string, or an iterable of such strings.
                          A value that contains '*' without being exactly '*'
                          selects nothing.
        :return: S_OK( dict ) mapping each 'System/Component' to a dictionary with
                 the keys 'ErrorsHour', 'ErrorsDay' and 'LastError'. When the log
                 cannot be read both counters are -1 and 'LastError' carries the
                 path and the exception. Entries without a '/' are skipped.
    """
    componentList = []
    if '*' in component:
        if component == '*':
            result = gComponentInstaller.getSetupComponents()
            if result['OK']:
                for ctype in ['Services', 'Agents', 'Executors']:
                    if ctype in result['Value']:
                        for sname in result['Value'][ctype]:
                            for cname in result['Value'][ctype][sname]:
                                componentList.append('/'.join([sname, cname]))
    elif isinstance(component, basestring):
        componentList = [component]
    else:
        componentList = component

    resultDict = {}
    for comp in componentList:
        if '/' not in comp:
            continue
        system, cname = comp.split('/')

        startDir = gComponentInstaller.startDir
        currentLog = startDir + '/' + system + '_' + cname + '/log/current'
        try:
            # The context manager closes the handle even if reading fails
            with open(currentLog, 'r') as logFile:
                logLines = logFile.readlines()
        except IOError as err:
            gLogger.error("File does not exists:", currentLog)
            resultDict[comp] = {
                'ErrorsHour': -1,
                'ErrorsDay': -1,
                'LastError': currentLog + '::' + repr(err)
            }
            continue

        errors_1 = 0
        errors_24 = 0
        now = dateTime()
        lastError = ''
        for line in logLines:
            if "ERROR:" not in line:
                continue
            message = line.split('ERROR:')[-1].strip()
            fields = line.split()
            if len(fields) < 2:
                # The line contains only one word: no timestamp to evaluate
                lastError = message
                continue
            timeStamp = fromString(fields[0] + ' ' + fields[1])
            if not timeStamp:
                # The timestamp is missing in the log
                lastError = message
                continue
            age = now - timeStamp
            recent = False
            if age < hour:
                errors_1 += 1
                recent = True
            if age < day:
                errors_24 += 1
                recent = True
            # Errors outside both windows do not replace the last reported one
            if recent:
                lastError = message

        resultDict[comp] = {
            'ErrorsHour': errors_1,
            'ErrorsDay': errors_24,
            'LastError': lastError
        }

    return S_OK(resultDict)
def optimizeJob(self, jid, jobState):
    """ 1. Banned sites are removed from the destination list.
        2. Get input files
        3. Production jobs are sent directly to TQ
        4. Check if staging is necessary

        :param jid: job identifier (not used in this method)
        :param jobState: job state object, queried for attributes, manifest and input data
        :return: the result structure of the helper that finally handles the job
                 (hold, send to TQ, set job site), S_OK() once staging of a
                 production job has been requested, or an S_ERROR describing the failure
    """
    # Reschedule delay
    result = jobState.getAttributes(['RescheduleCounter', 'RescheduleTime', 'ApplicationStatus'])
    if not result['OK']:
        return result
    attDict = result['Value']
    try:
        reschedules = int(attDict['RescheduleCounter'])
    except (ValueError, KeyError, TypeError):
        # TypeError covers a None value, which int() rejects as well
        return S_ERROR("RescheduleCounter has to be an integer")
    if reschedules != 0:
        delays = self.ex_getOption('RescheduleDelays', [60, 180, 300, 600])
        # Jobs rescheduled more often than there are entries use the last delay
        delay = delays[min(reschedules, len(delays) - 1)]
        waited = toEpoch() - toEpoch(fromString(attDict['RescheduleTime']))
        if waited < delay:
            return self.__holdJob(jobState, 'On Hold: after rescheduling %s' % reschedules, delay)

    # Get the job manifest for the later checks
    result = jobState.getManifest()
    if not result['OK']:
        return S_ERROR("Could not retrieve job manifest: %s" % result['Message'])
    jobManifest = result['Value']

    # Get site requirements
    result = self.__getSitesRequired(jobManifest)
    if not result['OK']:
        return result
    userSites, userBannedSites = result['Value']

    # Get job type
    result = jobState.getAttribute("JobType")
    if not result['OK']:
        return S_ERROR("Could not retrieve job type")
    jobType = result['Value']

    # Get banned sites from DIRAC
    result = self.siteClient.getSites('Banned')
    if not result['OK']:
        return S_ERROR("Cannot retrieve banned sites from JobDB")
    wmsBannedSites = result['Value']

    # If the user has selected any site, filter them and hold the job if not able to run
    if userSites:
        if jobType not in self.ex_getOption('ExcludedOnHoldJobTypes', []):
            result = self.siteClient.getUsableSites(userSites)
            if not result['OK']:
                return S_ERROR("Problem checking userSites for tuple of active/banned/invalid sites")
            usableSites = set(result['Value'])
            bannedSites = []
            invalidSites = []
            for site in userSites:
                if site in wmsBannedSites:
                    bannedSites.append(site)
                elif site not in usableSites:
                    invalidSites.append(site)

            if invalidSites:
                self.jobLog.debug("Invalid site(s) requested: %s" % ','.join(invalidSites))
                if not self.ex_getOption('AllowInvalidSites', True):
                    return self.__holdJob(jobState, "Requested site(s) %s are invalid" % ",".join(invalidSites))
            if bannedSites:
                self.jobLog.debug("Banned site(s) %s ignored" % ",".join(bannedSites))
                if not usableSites:
                    return self.__holdJob(jobState, "Requested site(s) %s are inactive" % ",".join(bannedSites))

            if not usableSites:
                return self.__holdJob(jobState, "No requested site(s) are active/valid")
            userSites = list(usableSites)

    checkPlatform = self.ex_getOption('CheckPlatform', False)
    jobPlatform = jobManifest.getOption("Platform", None)
    # First check that the platform is valid (in OSCompatibility list)
    if checkPlatform and jobPlatform:
        result = gConfig.getOptionsDict('/Resources/Computing/OSCompatibility')
        if not result['OK']:
            return S_ERROR("Unable to get OSCompatibility list")
        allPlatforms = result['Value']
        if jobPlatform not in allPlatforms:
            self.jobLog.error("Platform %s is not supported" % jobPlatform)
            return S_ERROR("Platform %s is not supported" % jobPlatform)

    # Filter the userSites by the platform selection (if there is one)
    if checkPlatform and userSites:
        if jobPlatform:
            result = self.__filterByPlatform(jobPlatform, userSites)
            if not result['OK']:
                self.jobLog.error("Failed to filter job sites by platform: %s" % result['Message'])
                return S_ERROR("Failed to filter job sites by platform")
            userSites = result['Value']
            if not userSites:
                # No sites left after filtering -> Invalid platform/sites combination
                self.jobLog.error("No selected sites match platform '%s'" % jobPlatform)
                return S_ERROR("No selected sites match platform '%s'" % jobPlatform)

    # Check if there is input data
    result = jobState.getInputData()
    if not result['OK']:
        self.jobLog.error("Cannot get input data %s" % (result['Message']))
        return S_ERROR("Failed to get input data from JobDB")

    if not result['Value']:
        # No input data? Just send to TQ
        return self.__sendToTQ(jobState, jobManifest, userSites, userBannedSites)

    self.jobLog.verbose("Has an input data requirement")
    inputData = result['Value']

    # ===================================================================================
    # Production jobs are sent to TQ, but first we have to verify if staging is necessary
    # ===================================================================================
    if jobType in Operations().getValue('Transformations/DataProcessing', []):
        self.jobLog.info("Production job: sending to TQ, but first checking if staging is requested")

        res = getFilesToStage(inputData,
                              jobState=jobState,
                              checkOnlyTapeSEs=self.ex_getOption('CheckOnlyTapeSEs', True),
                              jobLog=self.jobLog)

        if not res['OK']:
            return self.__holdJob(jobState, res['Message'])
        if res['Value']['absentLFNs']:
            # Some files do not exist at all... set the job Failed
            # Reverse errors: group the LFNs by failure reason
            reasons = {}
            for lfn, reason in res['Value']['absentLFNs'].items():
                reasons.setdefault(reason, []).append(lfn)
            for reason, lfns in reasons.items():
                # Some files are missing in the FC or in SEs, fail the job
                self.jobLog.error(reason, ','.join(lfns))
            error = ','.join(reasons)
            return S_ERROR(error)

        if res['Value']['failedLFNs']:
            return self.__holdJob(jobState, "Couldn't get storage metadata of some files")
        stageLFNs = res['Value']['offlineLFNs']
        if stageLFNs:
            res = self.__checkStageAllowed(jobState)
            if not res['OK']:
                return res
            if not res['Value']:
                return S_ERROR("Stage not allowed")
            # Do not report success if the staging request itself failed
            res = self.__requestStaging(jobState, stageLFNs)
            if not res['OK']:
                return res
            return S_OK()
        else:
            # No staging required
            onlineSites = res['Value']['onlineSites']
            if onlineSites:
                # Set the online site(s) first
                userSites = set(userSites)
                onlineSites &= userSites
                userSites = list(onlineSites) + list(userSites - onlineSites)
            return self.__sendToTQ(jobState, jobManifest, userSites, userBannedSites, onlineSites=onlineSites)

    # ===================================================
    # From now on we know it's a user job with input data
    # ===================================================

    idAgent = self.ex_getOption('InputDataAgent', 'InputData')
    result = self.retrieveOptimizerParam(idAgent)
    if not result['OK']:
        self.jobLog.error("Could not retrieve input data info", result['Message'])
        return S_ERROR("Could not retrieve input data info")
    opData = result['Value']

    if 'SiteCandidates' not in opData:
        return S_ERROR("No possible site candidates")

    # Filter input data sites with user requirement
    siteCandidates = list(opData['SiteCandidates'])
    self.jobLog.info("Site candidates are %s" % siteCandidates)

    if userSites:
        siteCandidates = list(set(siteCandidates) & set(userSites))

    siteCandidates = self._applySiteFilter(siteCandidates, banned=userBannedSites)
    if not siteCandidates:
        return S_ERROR("Impossible InputData * Site requirements")

    idSites = {}
    for site in siteCandidates:
        idSites[site] = opData['SiteCandidates'][site]

    # Check if sites have correct count of disk+tape replicas
    numData = len(inputData)
    errorSites = set()
    for site in idSites:
        if numData != idSites[site]['disk'] + idSites[site]['tape']:
            self.jobLog.error("Site candidate %s does not have all the input data" % site)
            errorSites.add(site)
    # Removal is done in a second pass so the dict is not modified while iterated
    for site in errorSites:
        idSites.pop(site)
    if not idSites:
        return S_ERROR("Site candidates do not have all the input data")

    # Check if staging is required
    stageRequired, siteCandidates = self.__resolveStaging(inputData, idSites)
    if not siteCandidates:
        return S_ERROR("No destination sites available")

    # Is any site active?
    stageSites = self._applySiteFilter(siteCandidates, banned=wmsBannedSites)
    if not stageSites:
        return self.__holdJob(jobState, "Sites %s are inactive or banned" % ", ".join(siteCandidates))

    # If no staging is required send to TQ
    if not stageRequired:
        # Use siteCandidates and not stageSites because active and banned sites
        # will be taken into account on matching time
        return self.__sendToTQ(jobState, jobManifest, siteCandidates, userBannedSites)

    # Check if the user is allowed to stage
    if self.ex_getOption("RestrictDataStage", False):
        res = self.__checkStageAllowed(jobState)
        if not res['OK']:
            return res
        if not res['Value']:
            return S_ERROR("Stage not allowed")

    # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
    stageSite = stageSites[0]
    self.jobLog.verbose(" Staging site will be %s" % (stageSite))
    stageData = idSites[stageSite]
    # Set as if everything has already been staged
    stageData['disk'] += stageData['tape']
    stageData['tape'] = 0
    # Set the site info back to the original dict to save afterwards
    opData['SiteCandidates'][stageSite] = stageData

    stageRequest = self.__preRequestStaging(jobManifest, stageSite, opData)
    if not stageRequest['OK']:
        return stageRequest
    stageLFNs = stageRequest['Value']
    result = self.__requestStaging(jobState, stageLFNs)
    if not result['OK']:
        return result
    stageLFNs = result['Value']
    self.__updateSharedSESites(jobManifest, stageSite, stageLFNs, opData)

    # Save the optimizer data again
    self.jobLog.verbose('Updating %s Optimizer Info:' % (idAgent), opData)
    result = self.storeOptimizerParam(idAgent, opData)
    if not result['OK']:
        return result

    return self.__setJobSite(jobState, stageSites)