def sendNumTaskQueues():
    """Publish the current number of task queues as the 'numTQs' monitoring mark.

    On a DB error the failure is logged and nothing is published.
    """
    res = gTaskQueueDB.getNumTaskQueues()
    if not res['OK']:
        gLogger.error("Cannot get the number of task queues", res['Message'])
        return
    gMonitor.addMark('numTQs', res['Value'])
def __checkReplicas( self ):
    """Check replicas of files still in flight and update their states.

    Files whose catalog lookup reports "no such file" are flagged Failed;
    files already replicated to every target SE are flagged Done.
    """
    # files still to be replicated, keyed by LFN
    pending = dict( ( opFile.LFN, opFile )
                    for opFile in self.operation
                    if opFile.Status in ( "Waiting", "Scheduled" ) )
    targets = set( self.operation.targetSEList )

    res = self.fc.getReplicas( pending.keys() )
    if not res["OK"]:
        self.log.error( 'Failed to get replicas', res["Message"] )
        return res

    missingRE = re.compile( r".*such file.*" )
    for lfn, errStr in res["Value"]["Failed"].items():
        pending[lfn].Error = errStr
        if missingRE.search( errStr.lower() ):
            self.log.error( "File does not exists", lfn )
            gMonitor.addMark( "ReplicateFail", len( targets ) )
            pending[lfn].Status = "Failed"

    for lfn, replicaSEs in res["Value"]["Successful"].items():
        # done only once every requested target already holds a replica
        if targets.issubset( set( replicaSEs ) ):
            self.log.info( "file %s has been replicated to all targets" % lfn )
            pending[lfn].Status = "Done"

    return S_OK()
class MatcherHandler(RequestHandler):
    """Service interface handing out jobs to pilot agents via the Matcher."""

    def initialize(self):
        # Limiter enforces running-job limits when matching
        self.limiter = Limiter(jobDB=gJobDB)

    ##############################################################################
    types_requestJob = [[StringType, DictType]]

    def export_requestJob(self, resourceDescription):
        """ Serve a job to the request of an agent which is the highest priority
            one matching the agent's site capacity

        :param resourceDescription: capacity description of the requesting resource
        :return: S_OK(match result) or S_ERROR
        """
        resourceDescription['Setup'] = self.serviceInfoDict['clientSetup']
        credDict = self.getRemoteCredentials()

        try:
            opsHelper = Operations(group=credDict['group'])
            matcher = Matcher(pilotAgentsDB=pilotAgentsDB,
                              jobDB=gJobDB,
                              tqDB=gTaskQueueDB,
                              jlDB=jlDB,
                              opsHelper=opsHelper)
            result = matcher.selectJob(resourceDescription, credDict)
        # fixed: legacy Python 2-only 'except E, e' syntax replaced with
        # 'except E as e', consistent with the other handlers in this file
        except RuntimeError as rte:
            self.log.error("Error requesting job: ", rte)
            return S_ERROR("Error requesting job")

        gMonitor.addMark("matchesDone")
        gMonitor.addMark("matchesOK")
        return S_OK(result)
def export_requestJob(self, resourceDescription):
    """ Serve a job to the request of an agent which is the highest priority
        one matching the agent's site capacity
    """
    resourceDescription['Setup'] = self.serviceInfoDict['clientSetup']
    credentials = self.getRemoteCredentials()

    try:
        matcher = Matcher(pilotAgentsDB=pilotAgentsDB,
                          jobDB=gJobDB,
                          tqDB=gTaskQueueDB,
                          jlDB=jlDB,
                          opsHelper=Operations(group=credentials['group']))
        match = matcher.selectJob(resourceDescription, credentials)
    except RuntimeError as rte:
        self.log.error("Error requesting job: ", rte)
        return S_ERROR("Error requesting job")

    # result can be empty, meaning that no job matched
    if not match:
        # FIXME: This is correctly interpreted by the JobAgent, but DErrno should be used instead
        return S_ERROR("No match found")

    gMonitor.addMark("matchesDone")
    gMonitor.addMark("matchesOK")
    return S_OK(match)
def export_putRequest( self, requestJSON ):
    """ forward request from local RequestDB to central RequestManager

    :param self: self reference
    :param str requestJSON: JSON-serialised request

    :return: S_OK( { "set" : bool, "saved" : bool } ) or S_ERROR
    """
    gMonitor.addMark( 'reqReceived', 1 )

    requestDict = json.loads( requestJSON )
    requestName = requestDict.get( "RequestID", requestDict.get( 'RequestName', "***UNKNOWN***" ) )
    gLogger.info( "putRequest: got request '%s'" % requestName )

    # forwardability is only advisory: a warning is logged, processing continues
    forwardable = self.__forwardable( requestDict )
    if not forwardable["OK"]:
        gLogger.warn( "putRequest: %s" % forwardable["Message"] )

    setRequest = self.requestManager().putRequest( requestJSON )
    if not setRequest["OK"]:
        # typo fixed in the log prefix: "setReqeuest" -> "setRequest"
        gLogger.error( "setRequest: unable to set request '%s' @ RequestManager: %s" % ( requestName,
                                                                                         setRequest["Message"] ) )
        # # put request to the request file cache
        save = self.__saveRequest( requestName, requestJSON )
        if not save["OK"]:
            gLogger.error( "setRequest: unable to save request to the cache: %s" % save["Message"] )
            return save
        gLogger.info( "setRequest: %s is saved to %s file" % ( requestName, save["Value"] ) )
        return S_OK( { "set" : False, "saved" : True } )

    gLogger.info( "setRequest: request '%s' has been set to the ReqManager" % ( requestName ) )
    return S_OK( { "set" : True, "saved" : False } )
def export_requestJob(self, resourceDescription):
    """ Serve a job to the request of an agent which is the highest priority
        one matching the agent's site capacity
    """
    resourceDescription['Setup'] = self.serviceInfoDict['clientSetup']
    credDict = self.getRemoteCredentials()

    try:
        opsHelper = Operations(group=credDict['group'])
        jobMatcher = Matcher(
            pilotAgentsDB=pilotAgentsDB,
            jobDB=gJobDB,
            tqDB=gTaskQueueDB,
            jlDB=jlDB,
            opsHelper=opsHelper,
        )
        selected = jobMatcher.selectJob(resourceDescription, credDict)
    except RuntimeError as rte:
        self.log.error("Error requesting job: ", rte)
        return S_ERROR("Error requesting job")

    # result can be empty, meaning that no job matched
    if selected:
        gMonitor.addMark("matchesDone")
        gMonitor.addMark("matchesOK")
        return S_OK(selected)

    # FIXME: This is correctly interpreted by the JobAgent, but DErrno should be used instead
    return S_ERROR("No match found")
def export_requestJob(self, resourceDescription):
    """Serve a job to the request of an agent which is the highest priority
    one matching the agent's site capacity
    """
    resourceDescription["Setup"] = self.serviceInfoDict["clientSetup"]
    credDict = self.getRemoteCredentials()
    pilotRef = resourceDescription.get("PilotReference", "Unknown")

    try:
        matcher = Matcher(
            pilotAgentsDB=self.pilotAgentsDB,
            jobDB=self.jobDB,
            tqDB=self.taskQueueDB,
            jlDB=self.jobLoggingDB,
            opsHelper=Operations(group=credDict["group"]),
            pilotRef=pilotRef,
        )
        result = matcher.selectJob(resourceDescription, credDict)
    except RuntimeError as rte:
        self.log.error("Error requesting job for pilot", "[%s] %s" % (pilotRef, rte))
        return S_ERROR("Error requesting job")
    except PilotVersionError as pve:
        self.log.warn("Pilot version error for pilot", "[%s] %s" % (pilotRef, pve))
        return S_ERROR(DErrno.EWMSPLTVER, callStack=[])

    # result can be empty, meaning that no job matched
    if not result:
        return S_ERROR(DErrno.EWMSNOMATCH, callStack=[])

    gMonitor.addMark("matchesDone")
    gMonitor.addMark("matchesOK")
    return S_OK(result)
def __printSummary(self):
    '''Pretty-print the storage usage summary and publish per-SE monitoring marks.'''
    res = self.storageUsage.getStorageSummary()
    if not res['OK']:
        # fixed: the failure used to be silently swallowed (no else branch)
        self.log.error("Failed to get storage usage summary", res['Message'])
        return

    summary = res['Value']
    self.log.notice("Storage Usage Summary")
    self.log.notice("============================================================")
    self.log.notice("%-40s %20s %20s" % ('Storage Element', 'Number of files', 'Total size'))

    # first register all per-SE activities ...
    for se in sorted(summary):
        # site prefix, e.g. "CERN" from "CERN-USER" or "CERN_MC-DST"
        site = se.split('_')[0].split('-')[0]
        gMonitor.registerActivity("%s-used" % se, "%s usage" % se,
                                  "StorageUsage/%s usage" % site,
                                  "", gMonitor.OP_MEAN, bucketLength=600)
        gMonitor.registerActivity("%s-files" % se, "%s files" % se,
                                  "StorageUsage/%s files" % site,
                                  "Files", gMonitor.OP_MEAN, bucketLength=600)

    # ... then give the monitoring system time to pick up the registrations
    # before the first marks arrive
    time.sleep(2)

    for se in sorted(summary):
        usage = summary[se]['Size']
        files = summary[se]['Files']
        self.log.notice("%-40s %20s %20s" % (se, str(files), str(usage)))
        gMonitor.addMark("%s-used" % se, usage)
        gMonitor.addMark("%s-files" % se, files)
def __checkReplicas(self):
    """ check done replicas and update file states """
    # files of this operation still awaiting replication, keyed by LFN
    waitingFiles = dict([(opFile.LFN, opFile) for opFile in self.operation
                         if opFile.Status in ("Waiting", "Scheduled")])
    targetSESet = set(self.operation.targetSEList)

    replicas = self.fc.getReplicas(waitingFiles.keys())
    if not replicas["OK"]:
        self.log.error('Failed to get replicas', replicas["Message"])
        return replicas

    # the catalog reports missing files with a "... such file ..." message
    reMissing = re.compile(r".*such file.*")
    for failedLFN, errStr in replicas["Value"]["Failed"].iteritems():
        waitingFiles[failedLFN].Error = errStr
        if reMissing.search(errStr.lower()):
            self.log.error("File does not exists", failedLFN)
            # count one failed replication per requested target SE
            gMonitor.addMark("ReplicateFail", len(targetSESet))
            waitingFiles[failedLFN].Status = "Failed"

    for successfulLFN, reps in replicas["Value"]["Successful"].iteritems():
        # done only when every requested target SE already holds a replica
        if targetSESet.issubset(set(reps)):
            self.log.info("file %s has been replicated to all targets" % successfulLFN)
            waitingFiles[successfulLFN].Status = "Done"

    return S_OK()
def execute(self):
    """ Main execution method. Just fills a list, and a queue, with BKKQueries ID. """
    gMonitor.addMark('Iteration', 1)

    # Get all the transformations
    result = self.transClient.getTransformations(condDict={'Status': ['Active', 'Idle']})
    if not result['OK']:
        # failures are logged but the cycle itself still succeeds
        self._logError("Failed to get transformations.", result['Message'])
        return S_OK()

    # Python 2 'long': transformation IDs may exceed the plain int range
    transIDsList = [long(transDict['TransformationID']) for transDict in result['Value']]
    res = self.transClient.getTransformationsWithBkQueries(transIDsList)
    if not res['OK']:
        self._logError("Failed to get transformations with Bk Queries.", res['Message'])
        return S_OK()
    transIDsWithBkQueriesList = res['Value']

    _count = 0
    # Process each transformation
    for transID in transIDsWithBkQueriesList:
        # skip transformations already queued for checking
        if transID in self.bkQueriesInCheck:
            continue
        self.bkQueriesInCheck.append(transID)
        self.bkQueriesToBeChecked.put(transID)
        _count += 1

    self._logInfo("Out of %d transformations, %d put in thread queue" % (len(result['Value']), _count))

    self.__dumpLog()
    return S_OK()
def execute(self):
    """One cycle: kick long-Assigned FTSJobs back to Submitted, and delete
    finished FTSJobs past the grace period.

    :return: S_OK() or the first failing client result
    """
    now = datetime.datetime.now()
    kickTime = now - datetime.timedelta(hours=self.KICK_ASSIGNED_HOURS)
    rmTime = now - datetime.timedelta(days=self.DEL_GRACE_DAYS)

    kicked = 0
    deleted = 0

    # # select Assigned FTSJobs
    assignedFTSJobList = self.ftsClient().getFTSJobList(["Assigned"], self.KICK_LIMIT)
    if not assignedFTSJobList["OK"]:
        self.log.error("execute: %s" % assignedFTSJobList["Message"])
        return assignedFTSJobList
    assignedFTSJobList = assignedFTSJobList["Value"]

    for ftsJob in assignedFTSJobList:
        # NOTE(review): comparison direction looks inverted w.r.t. the log
        # message ("Assigned for too long" suggests LastUpdate < kickTime);
        # kept as-is to preserve behavior -- TODO confirm
        if ftsJob.LastUpdate > kickTime:
            self.log.debug("FTSJob %s is Assigned for too long and has to be kicked" % ftsJob.FTSGUID)
            kicked += 1
            ftsJob.Status = "Submitted"
        put = self.ftsClient().putFTSJob(ftsJob)
        if not put["OK"]:
            self.log.error("execute: unable to put back FTSJob %s: %s" % (ftsJob.FTSGUID, put["Message"]))
            return put

    finishedFTSJobList = self.ftsClient().getFTSJobList(list(FTSJob.FINALSTATES), self.DEL_LIMIT)
    if not finishedFTSJobList["OK"]:
        self.log.error("execute: %s" % finishedFTSJobList["Message"])
        return finishedFTSJobList
    finishedFTSJobList = finishedFTSJobList["Value"]

    for ftsJob in finishedFTSJobList:
        if ftsJob.LastUpdate > rmTime:
            self.log.debug("FTSJob %s is too old and has to be deleted" % ftsJob.FTSGUID)
            delJob = self.ftsClient().deleteFTSJob(ftsJob.FTSJobID)
            if not delJob["OK"]:
                self.log.error("execute: %s" % delJob["Message"])
                return delJob
            # bug fix: 'deleted' was never incremented, so the summary log
            # and the "DeletedFTSJobs" mark always reported 0
            deleted += 1
        else:
            putJob = self.ftsClient().putFTSJob(ftsJob)
            if not putJob["OK"]:
                self.log.error("execute: %s" % putJob["Message"])
                return putJob

    self.log.info("Assigned FTSJobs kicked %s Finished FTSJobs deleted %s" % (kicked, deleted))
    gMonitor.addMark("KickedFTSJobs", kicked)
    gMonitor.addMark("DeletedFTSJobs", deleted)
    return S_OK()
def _endReportToMonitoring(self, initialWallTime, initialCPUTime): wallTime = time.time() - initialWallTime stats = os.times() cpuTime = stats[0] + stats[2] - initialCPUTime percentage = 0 if wallTime: percentage = cpuTime / wallTime * 100. if percentage > 0: gMonitor.addMark('CPU', percentage)
def _endReportToMonitoring( self, initialWallTime, initialCPUTime ): wallTime = time.time() - initialWallTime stats = os.times() cpuTime = stats[0] + stats[2] - initialCPUTime percentage = 0 if wallTime: percentage = cpuTime / wallTime * 100. if percentage > 0: gMonitor.addMark( 'CPU', percentage )
def execute( self ): """ one cycle execution """ # Don't use the server certificate otherwise the DFC wont let us write gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' ) log = gLogger.getSubLogger( "execute" ) # # reset FTSPlacement if expired now = datetime.datetime.now() if now > self.__ftsPlacementValidStamp: log.info( "resetting expired FTS placement..." ) resetFTSPlacement = self.resetFTSPlacement() if not resetFTSPlacement["OK"]: log.error( "FTSPlacement recreation error:" , resetFTSPlacement["Message"] ) return resetFTSPlacement self.__ftsPlacementValidStamp = now + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH ) requestIDs = self.requestClient().getRequestIDsList( statusList = [ "Scheduled" ], limit = self.MAX_REQUESTS ) if not requestIDs["OK"]: log.error( "unable to read scheduled request ids" , requestIDs["Message"] ) return requestIDs if not requestIDs["Value"]: requestIDs = [] else: requestIDs = [ req[0] for req in requestIDs["Value"] if req[0] not in self.__reqCache ] requestIDs += self.__reqCache.keys() if not requestIDs: log.info( "no 'Scheduled' requests to process" ) return S_OK() log.info( "found %s requests to process:" % len( requestIDs ) ) log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) ) log.info( " => new read from RMS: %s" % ( len( requestIDs ) - len( self.__reqCache ) ) ) for requestID in requestIDs: request = self.getRequest( requestID ) if not request["OK"]: log.error( "Error getting request", "%s: %s" % ( requestID, request["Message"] ) ) continue request = request["Value"] sTJId = request.RequestID while True: queue = self.threadPool().generateJobAndQueueIt( self.processRequest, args = ( request, ), sTJId = sTJId ) if queue["OK"]: log.info( "Request enqueued for execution", sTJId ) gMonitor.addMark( "RequestsAtt", 1 ) break time.sleep( 1 ) # # process all results self.threadPool().processAllResults() return S_OK()
def __monitorJob( self, request, ftsJob ):
    """ execute FTSJob.monitorFTS for a given :ftsJob:
        if ftsJob is in a final state, finalize it

    :param Request request: ReqDB.Request instance
    :param FTSJob ftsJob: FTSDB.FTSJob instance
    """
    log = self.log.getSubLogger( "req_%s/%s/monitor/%s" % ( request.RequestID,
                                                            request.RequestName,
                                                            ftsJob.FTSGUID ) )
    log.info( "FTSJob '%s'@'%s'" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )

    # # this will be returned
    ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail",
                                                  "toReschedule", "toUpdate" ) )

    monitor = ftsJob.monitorFTS( self.__ftsVersion , command = self.MONITOR_COMMAND )
    if not monitor["OK"]:
        gMonitor.addMark( "FTSMonitorFail", 1 )
        log.error( monitor["Message"] )
        # these server messages mean the job no longer exists on the FTS server:
        # resubmit its files and drop the job record
        if "getTransferJobSummary2: Not authorised to query request" in monitor["Message"] or \
           'was not found' in monitor['Message'] or \
           "Not found" in monitor['Message'] or \
           'Unknown transfer state' in monitor['Message']:
            log.error( "FTSJob not known (expired on server?): delete it" )
            for ftsFile in ftsJob:
                ftsFile.Status = "Waiting"
                ftsFilesDict["toSubmit"].append( ftsFile )
            # # No way further for that job: delete it
            res = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
            if not res['OK']:
                log.error( "Unable to delete FTSJob", res['Message'] )
            return S_OK( ftsFilesDict )
        return monitor

    monitor = monitor["Value"]
    log.info( "FTSJob Status = %s Completeness = %s%%" % ( ftsJob.Status, ftsJob.Completeness ) )

    # # monitor status change
    gMonitor.addMark( "FTSJobs%s" % ftsJob.Status, 1 )

    if ftsJob.Status in FTSJob.FINALSTATES:
        finalizeFTSJob = self.__finalizeFTSJob( request, ftsJob )
        if not finalizeFTSJob["OK"]:
            if 'Unknown transfer state' in finalizeFTSJob['Message']:
                # job vanished between monitor and finalize: resubmit files, drop job
                for ftsFile in ftsJob:
                    ftsFile.Status = "Waiting"
                    ftsFilesDict["toSubmit"].append( ftsFile )
                # # No way further for that job: delete it
                res = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
                if not res['OK']:
                    log.error( "Unable to delete FTSJob", res['Message'] )
            else:
                log.error( finalizeFTSJob["Message"] )
                return finalizeFTSJob
        else:
            ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, finalizeFTSJob["Value"] )

    return S_OK( ftsFilesDict )
def execute( self ): """ one cycle execution """ # Don't use the server certificate otherwise the DFC wont let us write gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' ) log = gLogger.getSubLogger( "execute" ) # # reset FTSPlacement if expired now = datetime.datetime.now() if now > self.__ftsPlacementValidStamp: log.info( "resetting expired FTS placement..." ) resetFTSPlacement = self.resetFTSPlacement() if not resetFTSPlacement["OK"]: log.error( "FTSPlacement recreation error:" , resetFTSPlacement["Message"] ) return resetFTSPlacement self.__ftsPlacementValidStamp = now + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH ) requestIDs = self.requestClient().getRequestIDsList( statusList = [ "Scheduled" ], limit = self.MAX_REQUESTS ) if not requestIDs["OK"]: log.error( "unable to read scheduled request ids" , requestIDs["Message"] ) return requestIDs if not requestIDs["Value"]: requestIDs = [] else: requestIDs = [ req[0] for req in requestIDs["Value"] if req[0] not in self.__reqCache ] requestIDs += self.__reqCache.keys() if not requestIDs: log.info( "no 'Scheduled' requests to process" ) return S_OK() log.info( "found %s requests to process:" % len( requestIDs ) ) log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) ) log.info( " => new read from RMS: %s" % ( len( requestIDs ) - len( self.__reqCache ) ) ) for requestID in requestIDs: request = self.getRequest( requestID ) if not request["OK"]: log.error( "Error getting request", "%s: %s" % ( requestID, request["Message"] ) ) continue request = request["Value"] sTJId = request.RequestID while True: queue = self.threadPool().generateJobAndQueueIt( self.processRequest, args = ( request, ), sTJId = sTJId ) if queue["OK"]: log.info( "Request enqueued for execution", sTJId ) gMonitor.addMark( "RequestsAtt", 1 ) break time.sleep( 1 ) # # process all results self.threadPool().processAllResults() return S_OK()
def __monitorJob( self, request, ftsJob ):
    """ execute FTSJob.monitorFTS for a given :ftsJob:
        if ftsJob is in a final state, finalize it

    :param Request request: ReqDB.Request instance
    :param FTSJob ftsJob: FTSDB.FTSJob instance
    """
    log = self.log.getSubLogger( "req_%s/%s/monitor/%s" % ( request.RequestID,
                                                            request.RequestName,
                                                            ftsJob.FTSGUID ) )
    log.info( "FTSJob '%s'@'%s'" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )

    # # this will be returned
    ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail",
                                                  "toReschedule", "toUpdate" ) )

    monitor = ftsJob.monitorFTS( self.__ftsVersion , command = self.MONITOR_COMMAND )
    if not monitor["OK"]:
        gMonitor.addMark( "FTSMonitorFail", 1 )
        log.error( monitor["Message"] )
        # any of these server replies means the job has expired server-side:
        # put its files back for submission and delete the job record
        if "getTransferJobSummary2: Not authorised to query request" in monitor["Message"] or \
           'was not found' in monitor['Message'] or \
           "Not found" in monitor['Message'] or \
           'Unknown transfer state' in monitor['Message']:
            log.error( "FTSJob not known (expired on server?): delete it" )
            for ftsFile in ftsJob:
                ftsFile.Status = "Waiting"
                ftsFilesDict["toSubmit"].append( ftsFile )
            # # No way further for that job: delete it
            res = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
            if not res['OK']:
                log.error( "Unable to delete FTSJob", res['Message'] )
            return S_OK( ftsFilesDict )
        return monitor

    monitor = monitor["Value"]
    log.info( "FTSJob Status = %s Completeness = %s%%" % ( ftsJob.Status, ftsJob.Completeness ) )

    # # monitor status change
    gMonitor.addMark( "FTSJobs%s" % ftsJob.Status, 1 )

    if ftsJob.Status in FTSJob.FINALSTATES:
        finalizeFTSJob = self.__finalizeFTSJob( request, ftsJob )
        if not finalizeFTSJob["OK"]:
            if 'Unknown transfer state' in finalizeFTSJob['Message']:
                # job disappeared between monitoring and finalization:
                # resubmit files and delete the job record
                for ftsFile in ftsJob:
                    ftsFile.Status = "Waiting"
                    ftsFilesDict["toSubmit"].append( ftsFile )
                # # No way further for that job: delete it
                res = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
                if not res['OK']:
                    log.error( "Unable to delete FTSJob", res['Message'] )
            else:
                log.error( finalizeFTSJob["Message"] )
                return finalizeFTSJob
        else:
            ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, finalizeFTSJob["Value"] )

    return S_OK( ftsFilesDict )
def export_requestJob(self, resourceDescription):
    """ Serve a job to the request of an agent which is the highest priority
        one matching the agent's site capacity
    """
    matchResult = self.selectJob(resourceDescription)
    # every attempt is counted; successes are counted separately
    gMonitor.addMark("matchesDone")
    if matchResult['OK']:
        gMonitor.addMark("matchesOK")
    return matchResult
def export_requestJob( self, resourceDescription ):
    """ Serve a job to the request of an agent which is the highest priority
        one matching the agent's site capacity
    """
    matched = self.selectJob( resourceDescription )
    gMonitor.addMark( "matchesDone" )
    # record successful matches on top of the attempt counter
    if matched[ 'OK' ]:
        gMonitor.addMark( "matchesOK" )
    return matched
def execute( self ):
    """ one cycle execution

    Kicks FTSJobs stuck in 'Assigned' back to 'Submitted' and deletes
    finished FTSJobs older than the grace period.

    :return: S_OK() or the first failing client result
    """
    now = datetime.datetime.now()
    kickTime = now - datetime.timedelta( hours = self.KICK_ASSIGNED_HOURS )
    rmTime = now - datetime.timedelta( days = self.DEL_GRACE_DAYS )

    kicked = 0
    deleted = 0

    # # select Assigned FTSJobs
    assignedFTSJobList = self.ftsClient().getFTSJobList( ["Assigned"], self.KICK_LIMIT )
    if not assignedFTSJobList["OK"]:
        self.log.error( "execute: %s" % assignedFTSJobList["Message"] )
        return assignedFTSJobList
    assignedFTSJobList = assignedFTSJobList["Value"]

    for ftsJob in assignedFTSJobList:
        # NOTE(review): the comparison direction looks inverted w.r.t. the
        # debug message; kept as-is to preserve behavior -- TODO confirm
        if ftsJob.LastUpdate > kickTime:
            self.log.debug( "FTSJob %s is Assigned for too long and has to be kicked" % ftsJob.FTSGUID )
            kicked += 1
            ftsJob.Status = "Submitted"
        put = self.ftsClient().putFTSJob( ftsJob )
        if not put["OK"]:
            self.log.error( "execute: unable to put back FTSJob %s: %s" % ( ftsJob.FTSGUID, put["Message"] ) )
            return put

    finishedFTSJobList = self.ftsClient().getFTSJobList( list( FTSJob.FINALSTATES ), self.DEL_LIMIT )
    if not finishedFTSJobList["OK"]:
        self.log.error( "execute: %s" % finishedFTSJobList["Message"] )
        return finishedFTSJobList
    finishedFTSJobList = finishedFTSJobList["Value"]

    for ftsJob in finishedFTSJobList:
        if ftsJob.LastUpdate > rmTime:
            self.log.debug( "FTSJob %s is too old and has to be deleted" % ftsJob.FTSGUID )
            delJob = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID )
            if not delJob["OK"]:
                self.log.error( "execute: %s" % delJob["Message"] )
                return delJob
            # bug fix: 'deleted' was never incremented, so the summary log and
            # the "DeletedFTSJobs" mark always reported 0
            deleted += 1
        else:
            putJob = self.ftsClient().putFTSJob( ftsJob )
            if not putJob["OK"]:
                self.log.error( "execute: %s" % putJob["Message"] )
                return putJob

    self.log.info( "Assigned FTSJobs kicked %s Finished FTSJobs deleted %s" % ( kicked, deleted ) )
    gMonitor.addMark( "KickedFTSJobs", kicked )
    gMonitor.addMark( "DeletedFTSJobs", deleted )
    return S_OK()
def registerCopiedFiles(self, filesNewlyCopied, copiedFiles, allUnmigratedFilesMeta):
    """ Register successfuly copied files (newly, or in Copied status in the DB) in the DFC.

    :param filesNewlyCopied: [lfns] of files newly copied
    :param copiedFiles: {lfn:RIDb metadata} of files that were in Copied state.
    :param allUnmigratedFilesMeta: {lfn:RI Db metadata} for all lfns non migrated
                                   at the beginning of the loop.

    :return: {lfn:True} for successfuly registered lfns
    """
    if filesNewlyCopied or copiedFiles:
        self.log.info("Attempting to register %s newly copied and %s previously copied files"
                      % (len(filesNewlyCopied), len(copiedFiles)))
    else:
        self.log.info("No files to be registered")

    # Update copiedFiles to also contain the newly copied files
    copiedFiles.update(dict((lfn, allUnmigratedFilesMeta[lfn]) for lfn in filesNewlyCopied))

    successfulRegister = {}
    failedRegister = {}

    # Try to register them by batch
    for lfnChunk in breakListIntoChunks(copiedFiles, 100):
        # Add the metadata
        lfnDictChuck = dict((lfn, copiedFiles[lfn]) for lfn in lfnChunk)
        res = self.fileCatalog.addFile(lfnDictChuck)

        if not res['OK']:
            # whole-batch failure: mark every lfn of the chunk as failed
            self.log.error("Completely failed to register some successfully copied file.",
                           res['Message'])
            failedRegister.update(dict((lfn, res['Message']) for lfn in lfnDictChuck))
        else:
            successfulRegister.update(res['Value']['Successful'])
            failedRegister.update(res['Value']['Failed'])

    gMonitor.addMark("ErrorRegister", len(failedRegister))
    # failed registrations are reset to 'Copied' so they are retried next cycle
    for lfn, reason in failedRegister.iteritems():
        self.log.error("Failed to register lfn. Setting to Copied", "%s: %s" % (lfn, reason))
        res = self.rawIntegrityDB.setFileStatus(lfn, 'Copied')
        if not res['OK']:
            self.log.error("Error setting file status to Copied",
                           "%s: %s" % (lfn, res['Message']))

    for lfn in successfulRegister:
        self.log.info("Successfully registered %s in the File Catalog." % lfn)

    return successfulRegister
def export_putRequest(self, requestJSON):
    """Forward a request from the local RequestDB to the central RequestManager.

    :param self: self reference
    :param str requestJSON: JSON-serialised request

    :return: S_OK({"set": bool, "saved": bool}) or S_ERROR
    """
    gMonitor.addMark("reqReceived", 1)

    requestDict = json.loads(requestJSON)
    requestName = requestDict.get(
        "RequestID", requestDict.get("RequestName", "***UNKNOWN***"))
    gLogger.info("putRequest: got request '%s'" % requestName)

    # We only need the object to check the authorization
    request = Request(requestDict)
    # Check whether the credentials in the Requests are correct and allowed to be set
    isAuthorized = RequestValidator.setAndCheckRequestOwner(
        request, self.getRemoteCredentials())
    if not isAuthorized:
        return S_ERROR(DErrno.ENOAUTH, "Credentials in the requests are not allowed")

    # forwardability is advisory only: warn and continue
    forwardable = self.__forwardable(requestDict)
    if not forwardable["OK"]:
        gLogger.warn("putRequest: %s" % forwardable["Message"])

    setRequest = self.requestManager().putRequest(requestJSON)
    if not setRequest["OK"]:
        # typo fixed in the log prefix: "setReqeuest" -> "setRequest"
        gLogger.error(
            "setRequest: unable to set request '%s' @ RequestManager: %s" %
            (requestName, setRequest["Message"]))
        # # put request to the request file cache
        save = self.__saveRequest(requestName, requestJSON)
        if not save["OK"]:
            gLogger.error(
                "setRequest: unable to save request to the cache: %s" %
                save["Message"])
            return save
        gLogger.info("setRequest: %s is saved to %s file" %
                     (requestName, save["Value"]))
        return S_OK({"set": False, "saved": True})

    gLogger.info(
        "setRequest: request '%s' has been set to the ReqManager" % (requestName))
    return S_OK({"set": True, "saved": False})
def __getGraph( self, plotFunc, args ):
    """Serve a plot, reusing the cached graph file when one is still valid.

    :param plotFunc: plotting callable invoked on a cache miss
    :param args: ( fromSecs, toSecs, *nameParts )
    """
    fromSecs, toSecs = args[0], args[1]
    graphFile = "%s-%s-%s.png" % ( self.__generateName( *args[2:] ),
                                   self.rrdManager.bucketize( fromSecs, self.graceTime ),
                                   self.rrdManager.bucketize( toSecs, self.graceTime ) )
    if not self.__isCacheGraph( graphFile ):
        # cache miss: account for the draw and generate the plot
        gMonitor.addMark( "drawnplots" )
        self.__registerGraph( graphFile, fromSecs, toSecs )
        return plotFunc( graphFilename = graphFile, *args )
    # cache hit: refresh the entry and serve the existing file
    self.__refreshGraph( graphFile )
    gLogger.info( "Cached graph file %s" % graphFile )
    gMonitor.addMark( "cachedplots" )
    return S_OK( graphFile )
def __getGraph(self, plotFunc, args):
    """Serve a plot from the RRD graph cache when possible, drawing it otherwise."""
    fromSecs = args[0]
    toSecs = args[1]
    startBucket = self.rrdManager.bucketize(fromSecs, self.graceTime)
    endBucket = self.rrdManager.bucketize(toSecs, self.graceTime)
    graphFile = "%s-%s-%s.png" % (self.__generateName(*args[2:]), startBucket, endBucket)

    if self.__isCacheGraph(graphFile):
        # cache hit: bump freshness and serve the existing file
        self.__refreshGraph(graphFile)
        gLogger.info("Cached graph file %s" % graphFile)
        gMonitor.addMark("cachedplots")
        return S_OK(graphFile)

    # cache miss: register and draw a new graph
    gMonitor.addMark("drawnplots")
    self.__registerGraph(graphFile, fromSecs, toSecs)
    return plotFunc(graphFilename=graphFile, *args)
def _startReportToMonitoring( self ): try: now = time.time() stats = os.times() cpuTime = stats[0] + stats[2] if now - self.__monitorLastStatsUpdate < 10: return ( now, cpuTime ) # Send CPU consumption mark self.__monitorLastStatsUpdate = now # Send Memory consumption mark membytes = MemStat.VmB( 'VmRSS:' ) if membytes: mem = membytes / ( 1024. * 1024. ) gMonitor.addMark( 'MEM', mem ) return( now, cpuTime ) except Exception: return False
def _startReportToMonitoring(self): try: now = time.time() stats = os.times() cpuTime = stats[0] + stats[2] if now - self.__monitorLastStatsUpdate < 10: return (now, cpuTime) # Send CPU consumption mark self.__monitorLastStatsUpdate = now # Send Memory consumption mark membytes = MemStat.VmB('VmRSS:') if membytes: mem = membytes / (1024. * 1024.) gMonitor.addMark('MEM', mem) return (now, cpuTime) except Exception: return False
def export_getReport( self, reportRequest ):
    """ Plot a accounting

        Arguments:
          - viewName : Name of view (easy!)
          - startTime
          - endTime
          - argsDict : Arguments to the view.
          - grouping
          - extraArgs
    """
    checked = self.__checkPlotRequest( reportRequest )
    if not checked[ 'OK' ]:
        return checked
    gMonitor.addMark( "reportsRequested" )
    # data-only report: plot generation is explicitly disabled
    reportRequest[ 'generatePlot' ] = False
    reporter = MainReporter( self.__acDB, self.serviceInfoDict[ 'clientSetup' ] )
    return reporter.generate( reportRequest, self.getRemoteCredentials() )
def export_getReport( self, reportRequest ):
    """ Plot a accounting

        Arguments:
          - viewName : Name of view (easy!)
          - startTime
          - endTime
          - argsDict : Arguments to the view.
          - grouping
          - extraArgs
    """
    validation = self.__checkPlotRequest( reportRequest )
    if not validation[ 'OK' ]:
        return validation
    mainReporter = MainReporter( self.__acDB, self.serviceInfoDict[ 'clientSetup' ] )
    gMonitor.addMark( "reportsRequested" )
    # this endpoint returns the report data only, never the rendered plot
    reportRequest[ 'generatePlot' ] = False
    return mainReporter.generate( reportRequest, self.getRemoteCredentials() )
def dmRemoval(self, toRemoveDict, targetSEs):
    """Remove replicas of the given files from all target SEs.

    :param dict toRemoveDict: {lfn: opFile} replicas to remove
    :param list targetSEs: SE names to remove replicas from
    :return: S_OK({lfn: {targetSE: "" or error message}})
    """
    gMonitor.addMark("RemoveReplicaAtt", len(toRemoveDict) * len(targetSEs))
    # # keep status for each targetSE
    removalStatus = dict.fromkeys(toRemoveDict.keys(), None)
    for lfn in removalStatus:
        removalStatus[lfn] = dict.fromkeys(targetSEs, None)

    # # loop over targetSEs
    for targetSE in targetSEs:
        self.log.info("removing replicas at %s" % targetSE)

        # # 1st step - bulk removal
        bulkRemoval = self.bulkRemoval(toRemoveDict, targetSE)
        if not bulkRemoval["OK"]:
            self.log.error('Bulk replica removal failed', bulkRemoval["Message"])
            return bulkRemoval
        bulkRemoval = bulkRemoval["Value"]

        # # update removal status for successful files
        removalOK = [opFile for opFile in bulkRemoval.values() if not opFile.Error]
        for opFile in removalOK:
            removalStatus[opFile.LFN][targetSE] = ""
        gMonitor.addMark("RemoveReplicaOK", len(removalOK))

        # # 2nd step - retry the failed ones one by one
        toRetry = dict([(lfn, opFile) for lfn, opFile in bulkRemoval.items() if opFile.Error])
        for lfn, opFile in toRetry.items():
            self.singleRemoval(opFile, targetSE)
            if not opFile.Error:
                gMonitor.addMark("RemoveReplicaOK", 1)
                removalStatus[lfn][targetSE] = ""
            else:
                gMonitor.addMark("RemoveReplicaFail", 1)
                removalStatus[lfn][targetSE] = opFile.Error

    # # update file status for waiting files
    failed = 0
    for opFile in self.operation:
        if opFile.Status == "Waiting":
            # bug fix: this previously read removalStatus[lfn] with the stale
            # 'lfn' left over from the retry loop above, so every waiting file
            # was judged by the last retried file's errors
            errors = list(set([error for error in removalStatus[opFile.LFN].values() if error]))
            if errors:
                opFile.Error = ",".join(errors)
                # This seems to be the only offending error
                if "Write access not permitted for this credential" in opFile.Error:
                    failed += 1
                continue
            opFile.Status = "Done"

    if failed:
        self.operation.Error = "failed to remove %s replicas" % failed

    return S_OK(removalStatus)
def __checkReplicas(self):
    """ check done replicas and update file states """
    # files of this operation still awaiting replication, keyed by LFN
    waitingFiles = dict([(opFile.LFN, opFile) for opFile in self.operation
                         if opFile.Status in ("Waiting", "Scheduled")])
    targetSESet = set(self.operation.targetSEList)

    # Check replicas
    res = self.ci._getCatalogReplicas(list(waitingFiles))
    if not res["OK"]:
        self.log.error('Failed to get catalog replicas', res["Message"])
        return S_ERROR()
    allReplicas = res['Value'][0]

    replicas = self.ci.compareChecksum(list(waitingFiles))
    if not replicas["OK"]:
        self.log.error('Failed to check replicas', replicas["Message"])
        return S_ERROR()
    replicas = replicas["Value"]

    noReplicas = replicas['NoReplicas']
    if noReplicas:
        # report via the new RMS monitoring backend when enabled,
        # otherwise fall back to the legacy gMonitor marks below
        if self.rmsMonitoring:
            self.rmsMonitoringReporter.addRecord(
                self.createRMSRecord("Failed", len(noReplicas)))
            self.rmsMonitoringReporter.commit()
        for lfn in noReplicas.keys():
            self.log.error("File %s doesn't exist" % lfn)
            if not self.rmsMonitoring:
                gMonitor.addMark("ReplicateFail", len(targetSESet))
            waitingFiles[lfn].Status = "Failed"

    for lfn, reps in allReplicas.items():
        # done only when every requested target SE already holds a replica
        if targetSESet.issubset(set(reps)):
            self.log.info("file %s has been replicated to all targets" % lfn)
            waitingFiles[lfn].Status = "Done"

    return S_OK()
def _startReportToMonitoring(self): try: if not self.activityMonitoring: now = time.time() stats = os.times() cpuTime = stats[0] + stats[2] if now - self.__monitorLastStatsUpdate < 10: return (now, cpuTime) # Send CPU consumption mark self.__monitorLastStatsUpdate = now # Send Memory consumption mark membytes = MemStat.VmB("VmRSS:") if membytes: mem = membytes / (1024.0 * 1024.0) gMonitor.addMark("MEM", mem) return (now, cpuTime) else: return False except Exception: return False
def export_putRequest(self, requestJSON):
    """ forward request from local RequestDB to central RequestManager

    :param self: self reference
    :param str requestJSON: JSON-serialised request

    :return: S_OK({"set": bool, "saved": bool}) or S_ERROR
    """
    gMonitor.addMark('reqReceived', 1)

    requestDict = json.loads(requestJSON)
    requestName = requestDict.get("RequestID", requestDict.get('RequestName', "***UNKNOWN***"))
    gLogger.info("putRequest: got request '%s'" % requestName)

    # We only need the object to check the authorization
    request = Request(requestDict)
    # Check whether the credentials in the Requests are correct and allowed to be set
    isAuthorized = RequestValidator.setAndCheckRequestOwner(request, self.getRemoteCredentials())
    if not isAuthorized:
        return S_ERROR(DErrno.ENOAUTH, "Credentials in the requests are not allowed")

    # forwardability is advisory only: warn and continue
    forwardable = self.__forwardable(requestDict)
    if not forwardable["OK"]:
        gLogger.warn("putRequest: %s" % forwardable["Message"])

    setRequest = self.requestManager().putRequest(requestJSON)
    if not setRequest["OK"]:
        # typo fixed in the log prefix: "setReqeuest" -> "setRequest"
        gLogger.error(
            "setRequest: unable to set request '%s' @ RequestManager: %s" % (requestName,
                                                                             setRequest["Message"]))
        # # put request to the request file cache
        save = self.__saveRequest(requestName, requestJSON)
        if not save["OK"]:
            gLogger.error("setRequest: unable to save request to the cache: %s" % save["Message"])
            return save
        gLogger.info("setRequest: %s is saved to %s file" % (requestName, save["Value"]))
        return S_OK({"set": False, "saved": True})

    gLogger.info("setRequest: request '%s' has been set to the ReqManager" % (requestName))
    return S_OK({"set": True, "saved": False})
def export_putRequest(self, requestJSON):
    """Forward a request from the local RequestDB to the central RequestManager.

    :param self: self reference
    :param str requestJSON: JSON-serialised request

    :return: S_OK({"set": bool, "saved": bool}) or S_ERROR
    """
    gMonitor.addMark('reqReceived', 1)
    requestDict = json.loads(requestJSON)
    requestName = requestDict.get("RequestID", requestDict.get('RequestName', "***UNKNOWN***"))
    gLogger.info("putRequest: got request '%s'" % requestName)
    forwardable = self.__forwardable(requestDict)
    if not forwardable["OK"]:
        gLogger.warn("putRequest: %s" % forwardable["Message"])
    setRequest = self.requestManager().putRequest(requestJSON)
    if not setRequest["OK"]:
        # BUGFIX: message previously read "setReqeuest" (typo)
        gLogger.error("setRequest: unable to set request '%s' @ RequestManager: %s"
                      % (requestName, setRequest["Message"]))
        # Could not forward: keep the request in the local file cache for the sweeper
        save = self.__saveRequest(requestName, requestJSON)
        if not save["OK"]:
            gLogger.error("setRequest: unable to save request to the cache: %s" % save["Message"])
            return save
        gLogger.info("setRequest: %s is saved to %s file" % (requestName, save["Value"]))
        return S_OK({"set": False, "saved": True})
    gLogger.info("setRequest: request '%s' has been set to the ReqManager" % (requestName))
    return S_OK({"set": True, "saved": False})
def __call__(self):
    """Register every waiting file of the operation in the configured catalog.

    Each file is registered via the DataManager; success and failure are
    recorded in the data logging service and in the monitor.
    """
    failedFiles = 0  # count of files that could not be registered
    catalog = self.operation.Catalog
    dm = DataManager(catalogs=catalog)
    for opFile in self.getWaitingFilesList():
        gMonitor.addMark("RegisterAtt", 1)
        lfn = opFile.LFN
        fileTuple = (lfn, opFile.PFN, opFile.Size, self.operation.targetSEList[0],
                     opFile.GUID, opFile.Checksum)
        registerFile = dm.registerFile(fileTuple)
        # short-circuit keeps us from touching "Value" on a failed result
        if not registerFile["OK"] or lfn in registerFile["Value"]["Failed"]:
            gMonitor.addMark("RegisterFail", 1)
            self.dataLoggingClient().addFileRecord(lfn, "RegisterFail", catalog, "", "RegisterFile")
            reason = registerFile.get("Message",
                                      registerFile.get("Value", {}).get("Failed", {}).get(lfn, 'Unknown'))
            errorStr = "failed to register LFN %s: %s" % (lfn, reason)
            opFile.Error = errorStr
            self.log.warn(errorStr)
            failedFiles += 1
        else:
            gMonitor.addMark("RegisterOK", 1)
            self.dataLoggingClient().addFileRecord(lfn, "Register", catalog, "", "RegisterFile")
            self.log.info("file %s has been registered at %s" % (lfn, catalog))
            opFile.Status = "Done"
    if failedFiles:
        self.log.info("all files processed, %s files failed to register" % failedFiles)
        self.operation.Error = "some files failed to register"
        return S_ERROR(self.operation.Error)
    return S_OK()
def removeRegisteredFiles(self, filesNewlyRegistered, registeredFiles, allUnmigratedFilesMeta):
    """Remove successfully registered files (newly, or already in Registered status
    in the DB) from the OnlineStorage.

    :param filesNewlyRegistered: [lfns] of files newly registered
    :param registeredFiles: {lfn: RIDb metadata} of files that were in Registered state
    :param allUnmigratedFilesMeta: {lfn: RI DB metadata} for all lfns not migrated at
                                   the beginning of the loop
    :return: {lfn: True} for successfully removed lfns
    """
    if filesNewlyRegistered or registeredFiles:
        self.log.info("Attempting to remove %s newly registered and %s previously registered files"
                      % (len(filesNewlyRegistered), len(registeredFiles)))
    else:
        self.log.info("No files to be removed")
    # Fold the newly registered files into the previously registered set
    registeredFiles.update(dict((lfn, allUnmigratedFilesMeta[lfn]) for lfn in filesNewlyRegistered))
    onlineSE = StorageElement('OnlineRunDB')
    # Attempt a bulk removal of all of them
    res = onlineSE.removeFile(registeredFiles)
    filesNewlyRemoved = {}
    failedRemove = {}
    if not res['OK']:
        self.log.error("Completely failed to remove successfully registered files.", res['Message'])
        failedRemove = dict((lfn, res['Message']) for lfn in registeredFiles)
    else:
        filesNewlyRemoved = res['Value']['Successful']
        failedRemove = res['Value']['Failed']
    gMonitor.addMark("ErrorRemove", len(failedRemove))
    # Failed removals go back to Registered so a later cycle retries them
    for lfn, reason in failedRemove.iteritems():
        self.log.error("Failed to remove lfn. Setting to Registered", "%s: %s" % (lfn, reason))
        res = self.rawIntegrityDB.setFileStatus(lfn, 'Registered')
        if not res['OK']:
            self.log.error("Error setting file status to Registered", "%s: %s" % (lfn, res['Message']))
    now = datetime.datetime.utcnow()
    for lfn in filesNewlyRemoved:
        self.log.info("Successfully removed %s from the Online storage. Setting it to Done" % lfn)
        res = self.rawIntegrityDB.setFileStatus(lfn, 'Done')
        if not res['OK']:
            self.log.error("Error setting file status to Done", "%s: %s" % (lfn, res['Message']))
        else:
            # SubmitTime is ALREADY a datetime since it is declared as such in the DB
            submitTime = allUnmigratedFilesMeta[lfn]['SubmitTime']
            migrationTime = (now - submitTime).total_seconds()
            gMonitor.addMark("MigrationTime", migrationTime)
            fileSizeMB = allUnmigratedFilesMeta[lfn]['Size'] / (1024 * 1024.0)
            gMonitor.addMark("MigrationRate", fileSizeMB / migrationTime)
    return filesNewlyRemoved
def export_removeReplica(self, lfns):
    """Remove the supplied replicas from the catalog and report counts to the monitor."""
    gMonitor.addMark("RemoveReplica", 1)
    res = self.fileCatalogDB.removeReplica(lfns, self.getRemoteCredentials())
    if not res['OK']:
        return res
    outcome = res.get('Value', {})
    gMonitor.addMark("RemoveReplicaSuccessful", len(outcome.get('Successful', [])))
    gMonitor.addMark("RemoveReplicaFailed", len(outcome.get('Failed', [])))
    return res
def export_addFile( self, lfns ):
    """Register the supplied files in the catalog and report counts to the monitor."""
    gMonitor.addMark( "AddFile", 1 )
    res = gFileCatalogDB.addFile( lfns, self.getRemoteCredentials() )
    if not res['OK']:
        return res
    value = res.get( 'Value', {} )
    gMonitor.addMark( "AddFileSuccessful", len( value.get( 'Successful', [] ) ) )
    gMonitor.addMark( "AddFileFailed", len( value.get( 'Failed', [] ) ) )
    return res
def export_addFile(self, lfns):
    """Register supplied files in the file catalog; report outcome counts to the monitor."""
    gMonitor.addMark("AddFile", 1)
    res = self.fileCatalogDB.addFile(lfns, self.getRemoteCredentials())
    if res['OK']:
        report = res.get('Value', {})
        gMonitor.addMark("AddFileSuccessful", len(report.get('Successful', [])))
        gMonitor.addMark("AddFileFailed", len(report.get('Failed', [])))
    return res
def export_removeReplica( self, lfns ):
    """Remove the supplied replicas; report outcome counts to the monitor."""
    gMonitor.addMark( "RemoveReplica", 1 )
    res = gFileCatalogDB.removeReplica( lfns, self.getRemoteCredentials() )
    if res['OK']:
        report = res.get( 'Value', {} )
        gMonitor.addMark( "RemoveReplicaSuccessful", len( report.get( 'Successful', [] ) ) )
        gMonitor.addMark( "RemoveReplicaFailed", len( report.get( 'Failed', [] ) ) )
    return res
def __call__(self):
    """Register every waiting file of the operation in the requested catalog(s).

    ``self.operation.Catalog`` may hold a comma-separated list of catalogs;
    when empty, registration goes to all catalogs known to the DataManager.
    """
    failedFiles = 0
    catalogs = self.operation.Catalog
    if catalogs:
        catalogs = [cat.strip() for cat in catalogs.split(',')]
    dm = DataManager(catalogs=catalogs)
    # human-readable label used in log messages below
    catalogLabel = ','.join(catalogs) if catalogs else "all catalogs"
    for opFile in self.getWaitingFilesList():
        gMonitor.addMark("RegisterAtt", 1)
        lfn = opFile.LFN
        fileTuple = (lfn, opFile.PFN, opFile.Size, self.operation.targetSEList[0],
                     opFile.GUID, opFile.Checksum)
        registerFile = dm.registerFile(fileTuple)
        if registerFile["OK"] and lfn not in registerFile["Value"]["Failed"]:
            gMonitor.addMark("RegisterOK", 1)
            self.log.verbose("file %s has been registered at %s" % (lfn, catalogLabel))
            opFile.Status = "Done"
            continue
        # registration failed for this LFN
        gMonitor.addMark("RegisterFail", 1)
        reason = str(registerFile.get("Message",
                                      registerFile.get("Value", {}).get("Failed", {}).get(lfn, 'Unknown')))
        errorStr = "failed to register LFN"
        opFile.Error = "%s: %s" % (errorStr, reason)
        if 'GUID already registered' in reason:
            # conflicting GUID: retrying cannot succeed
            opFile.Status = 'Failed'
            self.log.error(errorStr, "%s: %s" % (lfn, reason))
        elif 'File already registered with no replicas' in reason:
            self.log.warn(errorStr, "%s: %s, will remove it and retry" % (lfn, reason))
            dm.removeFile(lfn)
        else:
            self.log.warn(errorStr, "%s: %s" % (lfn, reason))
        failedFiles += 1
    if failedFiles:
        self.log.warn("all files processed, %s files failed to register" % failedFiles)
        self.operation.Error = "some files failed to register"
        return S_ERROR(self.operation.Error)
    return S_OK()
def __checkReplicas(self):
    """Check done replicas and update file states accordingly.

    Files with no replica at all are marked Failed; files replicated to
    every target SE are marked Done.
    """
    waitingFiles = dict((opFile.LFN, opFile) for opFile in self.operation
                        if opFile.Status in ("Waiting", "Scheduled"))
    targetSESet = set(self.operation.targetSEList)
    # Fetch the catalog replicas of all candidate files
    res = self.ci._getCatalogReplicas(waitingFiles.keys())
    if not res["OK"]:
        self.log.error('Failed to get catalog replicas', res["Message"])
        return S_ERROR()
    allReplicas = res['Value'][0]
    # Compare checksums to spot missing replicas
    replicas = self.ci.compareChecksum(waitingFiles.keys())
    if not replicas["OK"]:
        self.log.error('Failed to check replicas', replicas["Message"])
        return S_ERROR()
    noReplicas = replicas["Value"]['NoReplicas']
    for lfn in noReplicas.keys():
        self.log.error("File %s doesn't exist" % lfn)
        gMonitor.addMark("ReplicateFail", len(targetSESet))
        waitingFiles[lfn].Status = "Failed"
    for lfn, reps in allReplicas.items():
        if targetSESet.issubset(set(reps)):
            self.log.info("file %s has been replicated to all targets" % lfn)
            waitingFiles[lfn].Status = "Done"
    return S_OK()
def __call__(self):
    """Process the ArchiveFiles operation.

    Marks an attempt, runs the operation, and converts any exception into
    an S_ERROR; cleanup always runs via the finally clause.
    """
    try:
        gMonitor.addMark('ArchiveFilesAtt', 1)
        self._run()
        gMonitor.addMark('ArchiveFilesOK', 1)
    except RuntimeError as rte:
        # expected failure mode: report without a traceback
        self.log.info('Failed to execute ArchiveFiles', repr(rte))
        gMonitor.addMark('ArchiveFilesFail', 1)
        return S_ERROR(str(rte))
    except Exception as exc:
        # unexpected failure: log with the traceback
        self.log.exception('Failed to execute ArchiveFiles', repr(exc), lException=exc)
        gMonitor.addMark('ArchiveFilesFail', 1)
        return S_ERROR(str(exc))
    finally:
        self._cleanup()
    return S_OK()
def __call__( self ):
    """Register every waiting file in the catalog(s) requested by the operation.

    An empty ``self.operation.Catalog`` means registration goes to all catalogs.
    """
    registrationErrors = 0
    catalogList = self.operation.Catalog
    if catalogList:
        catalogList = [ entry.strip() for entry in catalogList.split( ',' ) ]
    dm = DataManager( catalogs = catalogList )
    whereRegistered = ','.join( catalogList ) if catalogList else "all catalogs"
    for opFile in self.getWaitingFilesList():
        gMonitor.addMark( "RegisterAtt", 1 )
        lfn = opFile.LFN
        fileTuple = ( lfn, opFile.PFN, opFile.Size, self.operation.targetSEList[0],
                      opFile.GUID, opFile.Checksum )
        result = dm.registerFile( fileTuple )
        if not result["OK"] or lfn in result["Value"]["Failed"]:
            gMonitor.addMark( "RegisterFail", 1 )
            reason = str( result.get( "Message",
                                      result.get( "Value", {} ).get( "Failed", {} ).get( lfn, 'Unknown' ) ) )
            errorStr = "failed to register LFN"
            opFile.Error = "%s: %s" % ( errorStr, reason )
            if 'GUID already registered' in reason:
                # a GUID clash cannot be fixed by retrying
                opFile.Status = 'Failed'
                self.log.error( errorStr, "%s: %s" % ( lfn, reason ) )
            elif 'File already registered with no replicas' in reason:
                self.log.warn( errorStr, "%s: %s, will remove it and retry" % ( lfn, reason ) )
                dm.removeFile( lfn )
            else:
                self.log.warn( errorStr, "%s: %s" % ( lfn, reason ) )
            registrationErrors += 1
        else:
            gMonitor.addMark( "RegisterOK", 1 )
            self.log.verbose( "file %s has been registered at %s" % ( lfn, whereRegistered ) )
            opFile.Status = "Done"
    if registrationErrors:
        self.log.warn( "all files processed, %s files failed to register" % registrationErrors )
        self.operation.Error = "some files failed to register"
        return S_ERROR( self.operation.Error )
    return S_OK()
def export_removeFile(self, lfns):
    """Remove the supplied lfns from the catalog; report outcome counts to the monitor."""
    gMonitor.addMark("RemoveFile", 1)
    res = self.fileCatalogDB.removeFile(lfns, self.getRemoteCredentials())
    if not res["OK"]:
        return res
    outcome = res.get("Value", {})
    gMonitor.addMark("RemoveFileSuccessful", len(outcome.get("Successful", [])))
    gMonitor.addMark("RemoveFileFailed", len(outcome.get("Failed", [])))
    return res
def export_addReplica(self, lfns):
    """Register the supplied replicas; report outcome counts to the monitor."""
    gMonitor.addMark("AddReplica", 1)
    res = self.fileCatalogDB.addReplica(lfns, self.getRemoteCredentials())
    if not res["OK"]:
        return res
    outcome = res.get("Value", {})
    gMonitor.addMark("AddReplicaSuccessful", len(outcome.get("Successful", [])))
    gMonitor.addMark("AddReplicaFailed", len(outcome.get("Failed", [])))
    return res
def sweeper(cls):
    """Move cached requests to the central RequestManager.

    Reads up to ``cls.sweepSize`` cached request files (oldest first),
    forwards each to the RequestManager, and deletes the file on success.

    :param cls: class reference
    :return: S_OK
    """
    cacheDir = cls.cacheDir()
    # # cache dir empty?
    if not os.listdir(cacheDir):
        gLogger.always("sweeper: CacheDir %s is empty, nothing to do" % cacheDir)
        return S_OK()
    # # read <sweepSize> cache dir files, the oldest first
    cachedRequests = [
        os.path.abspath(requestFile)
        for requestFile in sorted(
            filter(os.path.isfile,
                   [os.path.join(cacheDir, requestName) for requestName in os.listdir(cacheDir)]),
            key=os.path.getctime,
        )
    ][:cls.sweepSize]
    # # set cached requests to the central RequestManager
    for cachedFile in cachedRequests:
        try:
            # BUGFIX: close the file handle deterministically (was a bare open())
            with open(cachedFile, "r") as reqFile:
                requestJSON = reqFile.read()
            cachedRequest = json.loads(requestJSON)
            cachedName = cachedRequest.get("RequestName", "***UNKNOWN***")
            putRequest = cls.requestManager().putRequest(requestJSON)
            if not putRequest["OK"]:
                gLogger.error(
                    "sweeper: unable to set request %s @ ReqManager: %s"
                    % (cachedName, putRequest["Message"]))
                gMonitor.addMark("reqFailed", 1)
                continue
            gLogger.info(
                "sweeper: successfully put request '%s' @ ReqManager" % cachedName)
            gMonitor.addMark("reqSwept", 1)
            os.unlink(cachedFile)
        except Exception as error:
            # best effort: one broken cache file must not stop the sweep
            gMonitor.addMark("reqFailed", 1)
            gLogger.exception("sweeper: hit by exception", lException=error)
    return S_OK()
def __call__( self ):
    """Register all waiting files of this operation in the single configured catalog."""
    catalog = self.operation.Catalog
    dm = DataManager( catalogs = catalog )
    failedCount = 0
    for opFile in self.getWaitingFilesList():
        gMonitor.addMark( "RegisterAtt", 1 )
        lfn = opFile.LFN
        registerFile = dm.registerFile( ( lfn, opFile.PFN, opFile.Size,
                                          self.operation.targetSEList[0],
                                          opFile.GUID, opFile.Checksum ) )
        if registerFile["OK"] and lfn not in registerFile["Value"]["Failed"]:
            gMonitor.addMark( "RegisterOK", 1 )
            self.dataLoggingClient().addFileRecord( lfn, "Register", catalog, "", "RegisterFile" )
            self.log.info( "file %s has been registered at %s" % ( lfn, catalog ) )
            opFile.Status = "Done"
            continue
        # registration failed for this LFN
        gMonitor.addMark( "RegisterFail", 1 )
        self.dataLoggingClient().addFileRecord( lfn, "RegisterFail", catalog, "", "RegisterFile" )
        reason = registerFile.get( "Message",
                                   registerFile.get( "Value", {} ).get( "Failed", {} ).get( lfn, 'Unknown' ) )
        errorStr = "failed to register LFN %s: %s" % ( lfn, reason )
        opFile.Error = errorStr
        self.log.warn( errorStr )
        failedCount += 1
    if failedCount:
        self.log.info( "all files processed, %s files failed to register" % failedCount )
        self.operation.Error = "some files failed to register"
        return S_ERROR( self.operation.Error )
    return S_OK()
def sweeper( cls ):
    """Move cached requests to the central RequestManager.

    Reads at most ``sweepBatch`` (10) cached request files, oldest first,
    forwards each to the RequestManager, and deletes the file on success.

    :param cls: class reference
    :return: S_OK
    """
    sweepBatch = 10  # maximum number of cached requests handled per sweep
    cacheDir = cls.cacheDir()
    # # cache dir empty?
    if not os.listdir( cacheDir ):
        gLogger.always( "sweeper: CacheDir %s is empty, nothing to do" % cacheDir )
        return S_OK()
    # # read the cache dir files, the oldest first
    cachedRequests = [ os.path.abspath( requestFile ) for requestFile in sorted(
        filter( os.path.isfile,
                [ os.path.join( cacheDir, requestName ) for requestName in os.listdir( cacheDir ) ] ),
        key = os.path.getctime ) ][:sweepBatch]
    # # set cached requests to the central RequestManager
    for cachedFile in cachedRequests:
        try:
            # BUGFIX: close the file handle deterministically (was a bare open())
            with open( cachedFile, "r" ) as reqFile:
                requestJSON = reqFile.read()
            cachedRequest = json.loads( requestJSON )
            cachedName = cachedRequest.get( "RequestName", "***UNKNOWN***" )
            putRequest = cls.requestManager().putRequest( requestJSON )
            if not putRequest["OK"]:
                gLogger.error( "sweeper: unable to set request %s @ ReqManager: %s"
                               % ( cachedName, putRequest["Message"] ) )
                gMonitor.addMark( "reqFailed", 1 )
                continue
            gLogger.info( "sweeper: successfully put request '%s' @ ReqManager" % cachedName )
            gMonitor.addMark( "reqSwept", 1 )
            os.unlink( cachedFile )
        except Exception as error:
            # best effort: one broken cache file must not stop the sweep
            gMonitor.addMark( "reqFailed", 1 )
            gLogger.exception( "sweeper: hit by exception", lException = error )
    return S_OK()
def ftsTransfer(self):
    """Replicate and register the operation's waiting files using FTS.

    Filters replicas per file, schedules valid candidates through the
    FTSClient, and finally falls back to DataManager-based transfers for
    anything that could not be scheduled.
    """
    self.log.info("scheduling files in FTS...")
    # Refuse early if target SEs are banned for writing (RSS check)
    bannedTargets = self.checkSEsRSS()
    if not bannedTargets['OK']:
        gMonitor.addMark("FTSScheduleAtt")
        gMonitor.addMark("FTSScheduleFail")
        return bannedTargets
    if bannedTargets['Value']:
        return S_OK("%s targets are banned for writing" % ",".join(bannedTargets['Value']))
    # Can continue now
    self.log.verbose("No targets banned for writing")
    toSchedule = {}
    delayExecution = 0
    errors = defaultdict(int)  # error message -> number of affected files
    for opFile in self.getWaitingFilesList():
        opFile.Error = ''
        gMonitor.addMark("FTSScheduleAtt")
        # # check replicas
        replicas = self._filterReplicas(opFile)
        if not replicas["OK"]:
            continue
        replicas = replicas["Value"]
        validReplicas = replicas.get("Valid")
        noMetaReplicas = replicas.get("NoMetadata")
        noReplicas = replicas.get('NoReplicas')
        badReplicas = replicas.get('Bad')
        noActiveReplicas = replicas.get('NoActiveReplicas')
        if validReplicas:
            # only schedule for the targets that do not already hold a valid replica
            validTargets = list(set(self.operation.targetSEList) - set(validReplicas))
            if not validTargets:
                self.log.info("file %s is already present at all targets" % opFile.LFN)
                opFile.Status = "Done"
            else:
                toSchedule[opFile.LFN] = [opFile, validReplicas, validTargets]
        else:
            # no usable source replica: classify the reason and mark accordingly
            gMonitor.addMark("FTSScheduleFail")
            if noMetaReplicas:
                err = "Couldn't get metadata"
                errors[err] += 1
                self.log.verbose(
                    "unable to schedule '%s', %s at %s" % (opFile.LFN, err, ','.join(noMetaReplicas)))
                opFile.Error = err
            elif noReplicas:
                err = "File doesn't exist"
                errors[err] += 1
                self.log.error("Unable to schedule transfer",
                               "%s %s at %s" % (opFile.LFN, err, ','.join(noReplicas)))
                opFile.Error = err
                opFile.Status = 'Failed'
            elif badReplicas:
                err = "All replicas have a bad checksum"
                errors[err] += 1
                self.log.error("Unable to schedule transfer",
                               "%s, %s at %s" % (opFile.LFN, err, ','.join(badReplicas)))
                opFile.Error = err
                opFile.Status = 'Failed'
            elif noActiveReplicas:
                err = "No active replica found"
                errors[err] += 1
                self.log.verbose("Unable to schedule transfer",
                                 "%s, %s at %s" % (opFile.LFN, err, ','.join(noActiveReplicas)))
                opFile.Error = err
                # All source SEs are banned, delay execution by 1 hour
                delayExecution = 60
    if delayExecution:
        self.log.info("Delay execution of the request by %d minutes" % delayExecution)
        self.request.delayNextExecution(delayExecution)
    # Log error counts
    for error, count in errors.iteritems():
        self.log.error(error, 'for %d files' % count)
    filesToScheduleList = []
    res = self._addMetadataToFiles(toSchedule)
    if not res['OK']:
        return res
    else:
        filesToSchedule = res['Value']
        for lfn in filesToSchedule:
            filesToScheduleList.append((filesToSchedule[lfn][0].toJSON()['Value'],
                                        toSchedule[lfn][1],
                                        toSchedule[lfn][2]))
    if filesToScheduleList:
        ftsSchedule = FTSClient().ftsSchedule(self.request.RequestID,
                                              self.operation.OperationID,
                                              filesToScheduleList)
        if not ftsSchedule["OK"]:
            self.log.error("Completely failed to schedule to FTS:", ftsSchedule["Message"])
            return ftsSchedule
        # might have nothing to schedule
        ftsSchedule = ftsSchedule["Value"]
        if not ftsSchedule:
            return S_OK()
        self.log.info("%d files have been scheduled to FTS" % len(ftsSchedule['Successful']))
        # propagate the per-file scheduling outcome back onto the operation files
        for opFile in self.operation:
            fileID = opFile.FileID
            if fileID in ftsSchedule["Successful"]:
                gMonitor.addMark("FTSScheduleOK", 1)
                opFile.Status = "Scheduled"
                self.log.debug("%s has been scheduled for FTS" % opFile.LFN)
            elif fileID in ftsSchedule["Failed"]:
                gMonitor.addMark("FTSScheduleFail", 1)
                opFile.Error = ftsSchedule["Failed"][fileID]
                if 'sourceSURL equals to targetSURL' in opFile.Error:
                    # In this case there is no need to continue
                    opFile.Status = 'Failed'
                self.log.warn("unable to schedule %s for FTS: %s" % (opFile.LFN, opFile.Error))
    else:
        self.log.info("No files to schedule after metadata checks")
    # Just in case some transfers could not be scheduled, try them with RM
    return self.dmTransfer(fromFTS=True)
def __call__( self ):
    """ perform physical removal operation

    Removes every waiting file of the operation from all target SEs,
    first in bulk and then retrying failures file-by-file; the per-LFN,
    per-SE error strings drive the final file status update.
    """
    # Refuse early if targets are banned for removal (RSS check)
    bannedTargets = self.checkSEsRSS( access = 'RemoveAccess' )
    if not bannedTargets['OK']:
        gMonitor.addMark( "PhysicalRemovalAtt" )
        gMonitor.addMark( "PhysicalRemovalFail" )
        return bannedTargets
    if bannedTargets['Value']:
        return S_OK( "%s targets are banned for removal" % ",".join( bannedTargets['Value'] ) )
    # # get waiting files
    waitingFiles = self.getWaitingFilesList()
    # # prepare lfn dict
    toRemoveDict = dict( ( opFile.LFN, opFile ) for opFile in waitingFiles )
    targetSEs = self.operation.targetSEList
    gMonitor.addMark( "PhysicalRemovalAtt", len( toRemoveDict ) * len( targetSEs ) )
    # # keep errors dict: lfn -> { targetSE -> error string ("" means success) }
    removalStatus = dict.fromkeys( toRemoveDict.keys(), None )
    for lfn in removalStatus:
        removalStatus[lfn] = dict.fromkeys( targetSEs, "" )
    for targetSE in targetSEs:
        self.log.info( "removing files from %s" % targetSE )
        # # 1st - bulk removal
        bulkRemoval = self.bulkRemoval( toRemoveDict, targetSE )
        if not bulkRemoval["OK"]:
            self.log.error( 'Failed bulk removal', bulkRemoval["Message"] )
            self.operation.Error = bulkRemoval["Message"]
            return bulkRemoval
        bulkRemoval = bulkRemoval["Value"]
        # record the bulk outcome per LFN ("" when not in the Failed dict)
        for lfn, opFile in toRemoveDict.items():
            removalStatus[lfn][targetSE] = bulkRemoval["Failed"].get( lfn, "" )
            opFile.Error = removalStatus[lfn][targetSE]
        # # 2nd - single file removal for the bulk failures
        toRetry = dict( ( lfn, opFile ) for lfn, opFile in toRemoveDict.items() if lfn in bulkRemoval["Failed"] )
        for lfn, opFile in toRetry.items():
            self.singleRemoval( opFile, targetSE )
            if not opFile.Error:
                removalStatus[lfn][targetSE] = ""
            else:
                gMonitor.addMark( "PhysicalRemovalFail", 1 )
                removalStatus[lfn][targetSE] = opFile.Error
    # # update file status for waiting files
    failed = 0
    for opFile in self.operation:
        if opFile.Status == "Waiting":
            # collect the non-empty error strings accumulated over all target SEs
            errors = [ error for error in removalStatus[opFile.LFN].values() if error.strip() ]
            if errors:
                failed += 1
                opFile.Error = ",".join( errors )
                if "Write access not permitted for this credential" in opFile.Error:
                    opFile.Status = "Failed"
                    gMonitor.addMark( "PhysicalRemovalFail", len( errors ) )
                continue
            gMonitor.addMark( "PhysicalRemovalOK", len( targetSEs ) )
            gMonitor.addMark( "PhysicalRemovalSize", opFile.Size * len( targetSEs ) )
            opFile.Status = "Done"
    if failed:
        self.operation.Error = "failed to remove %s files" % failed
    return S_OK()
def __call__( self ):
    """ action for 'removeFile' operation

    Removes waiting files from the catalog(s), skipping files that still
    have a replica at an SE currently banned for removal; bulk removal is
    attempted first, then failures are retried one file at a time.
    """
    # # get waiting files
    waitingFiles = self.getWaitingFilesList()
    fc = FileCatalog( self.operation.catalogList )
    res = fc.getReplicas( [wf.LFN for wf in waitingFiles] )
    if not res['OK']:
        gMonitor.addMark( "RemoveFileAtt" )
        gMonitor.addMark( "RemoveFileFail" )
        return res
    # We check the status of the SE from the LFN that are successful
    # No idea what to do with the others...
    replicas = res['Value']['Successful']
    targetSEs = set( [se for lfn in replicas for se in replicas[lfn] ] )
    bannedTargets = set()
    if targetSEs:
        bannedTargets = self.checkSEsRSS( targetSEs, access = 'RemoveAccess' )
        if not bannedTargets['OK']:
            gMonitor.addMark( "RemoveFileAtt" )
            gMonitor.addMark( "RemoveFileFail" )
            return bannedTargets
        bannedTargets = set( bannedTargets['Value'] )
        if bannedTargets and 'always banned' in self.operation.Error:
            return S_OK( "%s targets are always banned for removal" % ",".join( sorted( bannedTargets ) ) )
    # # prepare waiting file dict
    # # We take only files that have no replica at the banned SEs... If no replica, don't
    toRemoveDict = dict( ( opFile.LFN, opFile ) for opFile in waitingFiles
                         if not bannedTargets.intersection( replicas.get( opFile.LFN, [] ) ) )
    if toRemoveDict:
        gMonitor.addMark( "RemoveFileAtt", len( toRemoveDict ) )
        # # 1st step - bulk removal
        self.log.debug( "bulk removal of %s files" % len( toRemoveDict ) )
        bulkRemoval = self.bulkRemoval( toRemoveDict )
        if not bulkRemoval["OK"]:
            self.log.error( "Bulk file removal failed", bulkRemoval["Message"] )
        else:
            # the files still left in bulkRemoval["Value"] are the bulk failures
            gMonitor.addMark( "RemoveFileOK", len( toRemoveDict ) - len( bulkRemoval["Value"] ) )
            toRemoveDict = bulkRemoval["Value"]
        # # 2nd step - single file removal
        for lfn, opFile in toRemoveDict.items():
            self.log.info( "removing single file %s" % lfn )
            singleRemoval = self.singleRemoval( opFile )
            if not singleRemoval["OK"]:
                self.log.error( 'Error removing single file', singleRemoval["Message"] )
                gMonitor.addMark( "RemoveFileFail", 1 )
            else:
                self.log.info( "file %s has been removed" % lfn )
                gMonitor.addMark( "RemoveFileOK", 1 )
        # # set operation error if any file is still not removed
        failedFiles = [ ( lfn, opFile ) for ( lfn, opFile ) in toRemoveDict.items()
                        if opFile.Status in ( "Failed", "Waiting" ) ]
        if failedFiles:
            self.operation.Error = "failed to remove %d files" % len( failedFiles )
    if bannedTargets:
        return S_OK( "%s targets are banned for removal" % ",".join( sorted( bannedTargets ) ) )
    return S_OK()
def sendNumTaskQueues():
    """Report the current number of task queues to the monitoring system."""
    result = gTaskQueueDB.getNumTaskQueues()
    if not result['OK']:
        gLogger.error( "Cannot get the number of task queues", result['Message'] )
        return
    gMonitor.addMark( 'numTQs', result['Value'] )
def selectJob( self, resourceDescription ): """ Main job selection function to find the highest priority job matching the resource capacity """ startTime = time.time() resourceDict = self.__processResourceDescription( resourceDescription ) credDict = self.getRemoteCredentials() #Check credentials if not generic pilot if Properties.GENERIC_PILOT in credDict[ 'properties' ]: #You can only match groups in the same VO vo = Registry.getVOForGroup( credDict[ 'group' ] ) result = Registry.getGroupsForVO( vo ) if result[ 'OK' ]: resourceDict[ 'OwnerGroup' ] = result[ 'Value' ] else: #If it's a private pilot, the DN has to be the same if Properties.PILOT in credDict[ 'properties' ]: gLogger.notice( "Setting the resource DN to the credentials DN" ) resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ] #If it's a job sharing. The group has to be the same and just check that the DN (if any) # belongs to the same group elif Properties.JOB_SHARING in credDict[ 'properties' ]: resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ] gLogger.notice( "Setting the resource group to the credentials group" ) if 'OwnerDN' in resourceDict and resourceDict[ 'OwnerDN' ] != credDict[ 'DN' ]: ownerDN = resourceDict[ 'OwnerDN' ] result = Registry.getGroupsForDN( resourceDict[ 'OwnerDN' ] ) if not result[ 'OK' ] or credDict[ 'group' ] not in result[ 'Value' ]: #DN is not in the same group! bad boy. gLogger.notice( "You cannot request jobs from DN %s. It does not belong to your group!" 
% ownerDN ) resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ] #Nothing special, group and DN have to be the same else: resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ] resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ] # Check the pilot DIRAC version if self.__opsHelper.getValue( "Pilot/CheckVersion", True ): if 'ReleaseVersion' not in resourceDict: if not 'DIRACVersion' in resourceDict: return S_ERROR( 'Version check requested and not provided by Pilot' ) else: pilotVersion = resourceDict['DIRACVersion'] else: pilotVersion = resourceDict['ReleaseVersion'] validVersions = self.__opsHelper.getValue( "Pilot/Version", [] ) if validVersions and pilotVersion not in validVersions: return S_ERROR( 'Pilot version does not match the production version %s not in ( %s )' % \ ( pilotVersion, ",".join( validVersions ) ) ) #Check project if requested validProject = self.__opsHelper.getValue( "Pilot/Project", "" ) if validProject: if 'ReleaseProject' not in resourceDict: return S_ERROR( "Version check requested but expected project %s not received" % validProject ) if resourceDict[ 'ReleaseProject' ] != validProject: return S_ERROR( "Version check requested but expected project %s != received %s" % ( validProject, resourceDict[ 'ReleaseProject' ] ) ) # Update pilot information pilotInfoReported = False pilotReference = resourceDict.get( 'PilotReference', '' ) if pilotReference: if "PilotInfoReportedFlag" in resourceDict and not resourceDict['PilotInfoReportedFlag']: gridCE = resourceDict.get( 'GridCE', 'Unknown' ) site = resourceDict.get( 'Site', 'Unknown' ) benchmark = benchmark = resourceDict.get( 'PilotBenchmark', 0.0 ) gLogger.verbose('Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % (pilotReference,gridCE,site,benchmark) ) result = gPilotAgentsDB.setPilotStatus( pilotReference, status = 'Running', gridSite = site, destination = gridCE, benchmark = benchmark ) if result['OK']: pilotInfoReported = True #Check the site mask if not 'Site' in resourceDict: return S_ERROR( 
'Missing Site Name in Resource JDL' ) # Get common site mask and check the agent site result = gJobDB.getSiteMask( siteState = 'Active' ) if not result['OK']: return S_ERROR( 'Internal error: can not get site mask' ) maskList = result['Value'] siteName = resourceDict['Site'] if siteName not in maskList: # if 'GridCE' not in resourceDict: # return S_ERROR( 'Site not in mask and GridCE not specified' ) # Even if the site is banned, if it defines a CE, it must be able to check it # del resourceDict['Site'] # Banned site can only take Test jobs resourceDict['JobType'] = 'Test' resourceDict['Setup'] = self.serviceInfoDict['clientSetup'] gLogger.verbose( "Resource description:" ) for key in resourceDict: gLogger.verbose( "%s : %s" % ( key.rjust( 20 ), resourceDict[ key ] ) ) negativeCond = self.__limiter.getNegativeCondForSite( siteName ) result = gTaskQueueDB.matchAndGetJob( resourceDict, negativeCond = negativeCond ) if DEBUG: print result if not result['OK']: return result result = result['Value'] if not result['matchFound']: return S_ERROR( 'No match found' ) jobID = result['jobId'] resAtt = gJobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] ) if not resAtt['OK']: return S_ERROR( 'Could not retrieve job attributes' ) if not resAtt['Value']: return S_ERROR( 'No attributes returned for job' ) if not resAtt['Value']['Status'] == 'Waiting': gLogger.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) ) result = gTaskQueueDB.deleteJob( jobID ) if not result[ 'OK' ]: return result return S_ERROR( "Job %s is not in Waiting state" % str( jobID ) ) attNames = ['Status','MinorStatus','ApplicationStatus','Site'] attValues = ['Matched','Assigned','Unknown',siteName] result = gJobDB.setJobAttributes( jobID, attNames, attValues ) # result = gJobDB.setJobStatus( jobID, status = 'Matched', minor = 'Assigned' ) result = gJobLoggingDB.addLoggingRecord( jobID, status = 'Matched', minor = 'Assigned', source = 'Matcher' ) result = gJobDB.getJobJDL( jobID 
) if not result['OK']: return S_ERROR( 'Failed to get the job JDL' ) resultDict = {} resultDict['JDL'] = result['Value'] resultDict['JobID'] = jobID matchTime = time.time() - startTime gLogger.info( "Match time: [%s]" % str( matchTime ) ) gMonitor.addMark( "matchTime", matchTime ) # Get some extra stuff into the response returned resOpt = gJobDB.getJobOptParameters( jobID ) if resOpt['OK']: for key, value in resOpt['Value'].items(): resultDict[key] = value resAtt = gJobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] ) if not resAtt['OK']: return S_ERROR( 'Could not retrieve job attributes' ) if not resAtt['Value']: return S_ERROR( 'No attributes returned for job' ) if self.__opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ): self.__limiter.updateDelayCounters( siteName, jobID ) # Report pilot-job association if pilotReference: result = gPilotAgentsDB.setCurrentJobID( pilotReference, jobID ) result = gPilotAgentsDB.setJobForPilot( jobID, pilotReference, updateStatus=False ) resultDict['DN'] = resAtt['Value']['OwnerDN'] resultDict['Group'] = resAtt['Value']['OwnerGroup'] resultDict['PilotInfoReportedFlag'] = pilotInfoReported return S_OK( resultDict )
def fts3Transfer(self):
    """Schedule the waiting files of this operation for replication with FTS3.

    For every waiting file the replicas are filtered: files already present at
    all targets are marked "Done", unrecoverable ones "Failed", and the rest
    are grouped into a single FTS3TransferOperation persisted through the
    FTS3Client.  Finally dmTransfer(fromFTS=True) is invoked so that anything
    that could not be scheduled is retried with the plain DataManager.

    :return: S_OK() or an S_ERROR structure propagated from a helper
    """
    self.log.info("scheduling files in FTS3...")

    # Check first if we do not have ongoing transfers
    res = self._checkExistingFTS3Operations()
    if not res['OK']:
        return res

    # if res['Value'] is False
    # it means that there are ongoing transfers
    # and we should stop here
    if res['Value'] is False:
        # return S_OK such that the request is put back
        return S_OK()

    fts3Files = []
    toSchedule = {}

    # Dict which maps the FileID to the object
    rmsFilesIds = {}

    for opFile in self.getWaitingFilesList():
        rmsFilesIds[opFile.FileID] = opFile

        opFile.Error = ''
        gMonitor.addMark("FTSScheduleAtt")

        # # check replicas
        replicas = self._filterReplicas(opFile)
        if not replicas["OK"]:
            # replica filtering failed for this file only: skip it, keep going
            continue
        replicas = replicas["Value"]

        validReplicas = replicas["Valid"]
        noMetaReplicas = replicas["NoMetadata"]
        noReplicas = replicas['NoReplicas']
        badReplicas = replicas['Bad']
        noPFN = replicas['NoPFN']

        if validReplicas:
            # only schedule towards targets that do not already hold a valid replica
            validTargets = list(set(self.operation.targetSEList) - set(validReplicas))
            if not validTargets:
                self.log.info("file %s is already present at all targets" % opFile.LFN)
                opFile.Status = "Done"
            else:
                toSchedule[opFile.LFN] = [opFile, validTargets]
        else:
            gMonitor.addMark("FTSScheduleFail")
            if noMetaReplicas:
                # metadata lookup failure is potentially transient: file stays Waiting
                self.log.warn("unable to schedule '%s', couldn't get metadata at %s" %
                              (opFile.LFN, ','.join(noMetaReplicas)))
                opFile.Error = "Couldn't get metadata"
            elif noReplicas:
                # no replica anywhere: permanent failure
                self.log.error(
                    "Unable to schedule transfer",
                    "File %s doesn't exist at %s" % (opFile.LFN, ','.join(noReplicas)))
                opFile.Error = 'No replicas found'
                opFile.Status = 'Failed'
            elif badReplicas:
                # every replica has a bad checksum: permanent failure
                self.log.error(
                    "Unable to schedule transfer",
                    "File %s, all replicas have a bad checksum at %s" % (opFile.LFN, ','.join(badReplicas)))
                opFile.Error = 'All replicas have a bad checksum'
                opFile.Status = 'Failed'
            elif noPFN:
                # could not resolve a PFN: file stays Waiting for a later attempt
                self.log.warn(
                    "unable to schedule %s, could not get a PFN at %s" % (opFile.LFN, ','.join(noPFN)))

    res = self._addMetadataToFiles(toSchedule)
    if not res['OK']:
        return res
    else:
        filesToSchedule = res['Value']

        # one FTS3File per (file, missing target SE) pair
        for lfn in filesToSchedule:
            opFile = filesToSchedule[lfn]
            validTargets = toSchedule[lfn][1]
            for targetSE in validTargets:
                ftsFile = FTS3File.fromRMSFile(opFile, targetSE)
                fts3Files.append(ftsFile)

    if fts3Files:
        res = Registry.getUsernameForDN(self.request.OwnerDN)
        if not res['OK']:
            self.log.error(
                "Cannot get username for DN",
                "%s %s" % (self.request.OwnerDN, res['Message']))
            return res
        username = res['Value']

        fts3Operation = FTS3TransferOperation.fromRMSObjects(self.request, self.operation, username)
        fts3Operation.ftsFiles = fts3Files

        ftsSchedule = FTS3Client().persistOperation(fts3Operation)
        if not ftsSchedule["OK"]:
            self.log.error("Completely failed to schedule to FTS3:", ftsSchedule["Message"])
            return ftsSchedule

        # might have nothing to schedule
        ftsSchedule = ftsSchedule["Value"]
        self.log.info("Scheduled with FTS3Operation id %s" % ftsSchedule)

        self.log.info("%d files have been scheduled to FTS3" % len(fts3Files))

        # mark every scheduled file, mapping back from the FTS3File to the RMS file
        for ftsFile in fts3Files:
            opFile = rmsFilesIds[ftsFile.rmsFileID]
            gMonitor.addMark("FTSScheduleOK", 1)
            opFile.Status = "Scheduled"
            self.log.debug("%s has been scheduled for FTS" % opFile.LFN)
    else:
        self.log.info("No files to schedule after metadata checks")

    # Just in case some transfers could not be scheduled, try them with RM
    return self.dmTransfer(fromFTS=True)
def dmTransfer(self, fromFTS=False):
    """Replicate and register the waiting files using the DataManager.

    Checks RSS read access of the source SE (if set) and write access of the
    target SEs, then replicates each waiting file to every target it is not
    yet at, inserting follow-up RegisterReplica operations when the copy
    succeeded but the registration failed.

    :param bool fromFTS: True when called as a fallback after FTS scheduling
    :return: S_OK() (individual file failures are recorded on the files) or an
             S_ERROR structure from the RSS checks
    """
    # # get waiting files. If none just return
    # # source SE
    sourceSE = self.operation.SourceSE if self.operation.SourceSE else None
    if sourceSE:
        # # check source se for read
        bannedSource = self.checkSEsRSS(sourceSE, 'ReadAccess')
        if not bannedSource["OK"]:
            gMonitor.addMark("ReplicateAndRegisterAtt", len(self.operation))
            gMonitor.addMark("ReplicateFail", len(self.operation))
            return bannedSource

        if bannedSource["Value"]:
            self.operation.Error = "SourceSE %s is banned for reading" % sourceSE
            self.log.info(self.operation.Error)
            return S_OK(self.operation.Error)

    # # check targetSEs for write
    bannedTargets = self.checkSEsRSS()
    if not bannedTargets['OK']:
        gMonitor.addMark("ReplicateAndRegisterAtt", len(self.operation))
        gMonitor.addMark("ReplicateFail", len(self.operation))
        return bannedTargets

    if bannedTargets['Value']:
        self.operation.Error = "%s targets are banned for writing" % ",".join(bannedTargets['Value'])
        return S_OK(self.operation.Error)

    # Can continue now
    self.log.verbose("No targets banned for writing")

    waitingFiles = self.getWaitingFilesList()
    if not waitingFiles:
        return S_OK()

    # # loop over files
    if fromFTS:
        self.log.info("Trying transfer using replica manager as FTS failed")
    else:
        self.log.info("Transferring files using Data manager...")

    errors = defaultdict(int)
    delayExecution = 0
    for opFile in waitingFiles:
        # NOTE(review): a file already carrying one of these errors is counted
        # but NOT skipped -- it is retried below; confirm this is intended.
        if opFile.Error in ("Couldn't get metadata",
                            "File doesn't exist",
                            'No active replica found',
                            "All replicas have a bad checksum",):
            err = "File already in error status"
            errors[err] += 1

        gMonitor.addMark("ReplicateAndRegisterAtt", 1)
        opFile.Error = ''
        lfn = opFile.LFN

        # Check if replica is at the specified source
        replicas = self._filterReplicas(opFile)
        if not replicas["OK"]:
            self.log.error('Failed to check replicas', replicas["Message"])
            continue
        replicas = replicas["Value"]

        validReplicas = replicas.get("Valid")
        noMetaReplicas = replicas.get("NoMetadata")
        noReplicas = replicas.get('NoReplicas')
        badReplicas = replicas.get('Bad')
        noActiveReplicas = replicas.get('NoActiveReplicas')

        if not validReplicas:
            gMonitor.addMark("ReplicateFail")
            if noMetaReplicas:
                # transient: keep the file Waiting for a later attempt
                err = "Couldn't get metadata"
                errors[err] += 1
                self.log.verbose(
                    "unable to replicate '%s', couldn't get metadata at %s" % (opFile.LFN, ','.join(noMetaReplicas)))
                opFile.Error = err
            elif noReplicas:
                err = "File doesn't exist"
                errors[err] += 1
                self.log.verbose(
                    "Unable to replicate",
                    "File %s doesn't exist at %s" % (opFile.LFN, ','.join(noReplicas)))
                opFile.Error = err
                opFile.Status = 'Failed'
            elif badReplicas:
                err = "All replicas have a bad checksum"
                errors[err] += 1
                self.log.error(
                    "Unable to replicate",
                    "%s, all replicas have a bad checksum at %s" % (opFile.LFN, ','.join(badReplicas)))
                opFile.Error = err
                opFile.Status = 'Failed'
            elif noActiveReplicas:
                err = "No active replica found"
                errors[err] += 1
                self.log.verbose("Unable to schedule transfer",
                                 "%s, %s at %s" % (opFile.LFN, err, ','.join(noActiveReplicas)))
                opFile.Error = err
                # All source SEs are banned, delay execution by 1 hour
                delayExecution = 60
            continue

        # # get the first one in the list
        # NOTE(review): sourceSE is rebound here and the new value persists for
        # the remaining files of the loop -- confirm this is intended.
        if sourceSE not in validReplicas:
            if sourceSE:
                err = "File not at specified source"
                errors[err] += 1
                self.log.warn(
                    "%s is not at specified sourceSE %s, changed to %s" % (lfn, sourceSE, validReplicas[0]))
            sourceSE = validReplicas[0]

        # # loop over targetSE
        catalogs = self.operation.Catalog
        if catalogs:
            catalogs = [cat.strip() for cat in catalogs.split(',')]

        for targetSE in self.operation.targetSEList:
            # # call DataManager
            if targetSE in validReplicas:
                self.log.warn("Request to replicate %s to an existing location: %s" % (lfn, targetSE))
                opFile.Status = 'Done'
                continue

            res = self.dm.replicateAndRegister(lfn, targetSE, sourceSE=sourceSE, catalog=catalogs)
            if res["OK"]:
                if lfn in res["Value"]["Successful"]:
                    if "replicate" in res["Value"]["Successful"][lfn]:
                        repTime = res["Value"]["Successful"][lfn]["replicate"]
                        prString = "file %s replicated at %s in %s s." % (lfn, targetSE, repTime)

                        gMonitor.addMark("ReplicateOK", 1)

                        if "register" in res["Value"]["Successful"][lfn]:
                            gMonitor.addMark("RegisterOK", 1)
                            regTime = res["Value"]["Successful"][lfn]["register"]
                            prString += ' and registered in %s s.' % regTime
                            self.log.info(prString)
                        else:
                            # copy succeeded but registration failed: queue a
                            # RegisterReplica operation right after this one
                            gMonitor.addMark("RegisterFail", 1)
                            prString += " but failed to register"
                            self.log.warn(prString)
                            opFile.Error = "Failed to register"

                            # # add register replica operation
                            registerOperation = self.getRegisterOperation(
                                opFile, targetSE, type='RegisterReplica')
                            self.request.insertAfter(registerOperation, self.operation)
                    else:
                        self.log.error("Failed to replicate", "%s to %s" % (lfn, targetSE))
                        gMonitor.addMark("ReplicateFail", 1)
                        opFile.Error = "Failed to replicate"
                else:
                    gMonitor.addMark("ReplicateFail", 1)
                    reason = res["Value"]["Failed"][lfn]
                    self.log.error(
                        "Failed to replicate and register",
                        "File %s at %s:" % (lfn, targetSE), reason)
                    opFile.Error = reason
            else:
                gMonitor.addMark("ReplicateFail", 1)
                opFile.Error = "DataManager error: %s" % res["Message"]
                self.log.error("DataManager error", res["Message"])

        if not opFile.Error:
            if len(self.operation.targetSEList) > 1:
                self.log.info("file %s has been replicated to all targetSEs" % lfn)
            opFile.Status = "Done"

    # Log error counts
    if delayExecution:
        self.log.info("Delay execution of the request by %d minutes" % delayExecution)
        self.request.delayNextExecution(delayExecution)
    for error, count in errors.iteritems():
        self.log.error(error, 'for %d files' % count)

    return S_OK()
def __call__( self ):
  """ PutAndRegister operation processing

      Uploads each waiting file to the single target SE and registers it in
      the requested catalogs; a successful put with a failed registration
      spawns a follow-up RegisterFile operation.

      :return: S_OK() (per-file failures are recorded on the files) or an
               S_ERROR structure when the operation parameters are wrong or
               the RSS check fails
  """
  # # list of targetSEs
  targetSEs = self.operation.targetSEList
  # PutAndRegister accepts exactly one target SE: fail the whole operation otherwise
  if len( targetSEs ) != 1:
    self.log.error( "Wrong value for TargetSE list, should contain only one target!", "%s" % targetSEs )
    self.operation.Error = "Wrong parameters: TargetSE should contain only one targetSE"
    for opFile in self.operation:
      opFile.Status = "Failed"
      opFile.Error = "Wrong parameters: TargetSE should contain only one targetSE"

    gMonitor.addMark( "PutAtt", 1 )
    gMonitor.addMark( "PutFail", 1 )

    return S_ERROR( "TargetSE should contain only one target, got %s" % targetSEs )

  targetSE = targetSEs[0]
  bannedTargets = self.checkSEsRSS( targetSE )
  if not bannedTargets['OK']:
    gMonitor.addMark( "PutAtt" )
    gMonitor.addMark( "PutFail" )
    return bannedTargets

  if bannedTargets['Value']:
    return S_OK( "%s targets are banned for writing" % ",".join( bannedTargets['Value'] ) )

  # # get waiting files
  waitingFiles = self.getWaitingFilesList()

  # # loop over files
  for opFile in waitingFiles:
    # # get LFN
    lfn = opFile.LFN
    self.log.info( "processing file %s" % lfn )
    gMonitor.addMark( "PutAtt", 1 )

    pfn = opFile.PFN
    guid = opFile.GUID
    checksum = opFile.Checksum

    # # call DataManager passing a list of requested catalogs
    catalogs = self.operation.Catalog
    if catalogs:
      catalogs = [ cat.strip() for cat in catalogs.split( ',' ) ]
    putAndRegister = DataManager( catalogs = catalogs ).putAndRegister( lfn,
                                                                       pfn,
                                                                       targetSE,
                                                                       guid = guid,
                                                                       checksum = checksum )
    # whole call failed: record the error and move to the next file
    if not putAndRegister["OK"]:
      gMonitor.addMark( "PutFail", 1 )
      # self.dataLoggingClient().addFileRecord( lfn, "PutFail", targetSE, "", "PutAndRegister" )
      self.log.error( "Completely failed to put and register file", putAndRegister["Message"] )
      opFile.Error = str( putAndRegister["Message"] )
      self.operation.Error = str( putAndRegister["Message"] )
      continue

    putAndRegister = putAndRegister["Value"]

    if lfn in putAndRegister["Failed"]:
      gMonitor.addMark( "PutFail", 1 )
      # self.dataLoggingClient().addFileRecord( lfn, "PutFail", targetSE, "", "PutAndRegister" )
      reason = putAndRegister["Failed"][lfn]
      self.log.error( "Failed to put and register file", " %s at %s: %s" % ( lfn, targetSE, reason ) )
      opFile.Error = str( reason )
      self.operation.Error = str( reason )
      continue

    putAndRegister = putAndRegister["Successful"]
    if lfn in putAndRegister:
      # the "put" step itself failed
      if "put" not in putAndRegister[lfn]:
        gMonitor.addMark( "PutFail", 1 )
        # self.dataLoggingClient().addFileRecord( lfn, "PutFail", targetSE, "", "PutAndRegister" )
        self.log.info( "failed to put %s to %s" % ( lfn, targetSE ) )
        opFile.Error = "put failed"
        self.operation.Error = "put failed"
        continue

      # put succeeded but registration failed: add a follow-up register operation
      if "register" not in putAndRegister[lfn]:
        gMonitor.addMark( "PutOK", 1 )
        gMonitor.addMark( "RegisterFail", 1 )
        # self.dataLoggingClient().addFileRecord( lfn, "Put", targetSE, "", "PutAndRegister" )
        # self.dataLoggingClient().addFileRecord( lfn, "RegisterFail", targetSE, "", "PutAndRegister" )
        self.log.info( "put of %s to %s took %s seconds" % ( lfn, targetSE, putAndRegister[lfn]["put"] ) )
        self.log.error( "Register of lfn to SE failed", "%s to %s" % ( lfn, targetSE ) )
        opFile.Error = "failed to register %s at %s" % ( lfn, targetSE )
        opFile.Status = "Failed"
        self.log.info( opFile.Error )
        registerOperation = self.getRegisterOperation( opFile, targetSE )
        self.request.insertAfter( registerOperation, self.operation )
        continue

      # both put and register succeeded
      gMonitor.addMark( "PutOK", 1 )
      gMonitor.addMark( "RegisterOK", 1 )
      # self.dataLoggingClient().addFileRecord( lfn, "Put", targetSE, "", "PutAndRegister" )
      # self.dataLoggingClient().addFileRecord( lfn, "Register", targetSE, "", "PutAndRegister" )
      opFile.Status = "Done"
      for op in ( "put", "register" ):
        self.log.info( "%s of %s to %s took %s seconds" % ( op, lfn, targetSE, putAndRegister[lfn][op] ) )

  return S_OK()
def __call__( self ):
  """ call me maybe

      Move-replica processing: verify done replicas, check RSS access for the
      source (read), targets (write) and the replica-removal SEs, then for
      each waiting file run the transfer and, when it succeeds, the removal
      of the source replicas.

      :return: S_OK() or an S_ERROR structure from one of the RSS checks
  """
  # # check replicas first
  res = self.__checkReplicas()
  if not res["OK"]:
    # best effort: the failure is only logged, the operation proceeds
    self.log.error( 'Failed to check replicas', res["Message"] )

  sourceSE = self.operation.SourceSE if self.operation.SourceSE else None
  if sourceSE:
    # # check source se for read
    bannedSource = self.checkSEsRSS( sourceSE, 'ReadAccess' )
    if not bannedSource["OK"]:
      gMonitor.addMark( "ReplicateAndRegisterAtt", len( self.operation ) )
      gMonitor.addMark( "ReplicateFail", len( self.operation ) )
      return bannedSource

    if bannedSource["Value"]:
      self.operation.Error = "SourceSE %s is banned for reading" % sourceSE
      self.log.info( self.operation.Error )
      return S_OK( self.operation.Error )

  # # check targetSEs for write
  bannedTargets = self.checkSEsRSS()
  if not bannedTargets['OK']:
    gMonitor.addMark( "ReplicateAndRegisterAtt", len( self.operation ) )
    gMonitor.addMark( "ReplicateFail", len( self.operation ) )
    return bannedTargets

  if bannedTargets['Value']:
    self.operation.Error = "%s targets are banned for writing" % ",".join( bannedTargets['Value'] )
    return S_OK( self.operation.Error )

  # Can continue now
  self.log.verbose( "No targets banned for writing" )

  # # check sourceSEs for removal
  # # for removal the targetSEs are the sourceSEs of the replication
  targetSEs = self.operation.sourceSEList
  bannedTargets = self.checkSEsRSS( targetSEs, access = 'RemoveAccess' )
  if not bannedTargets['OK']:
    gMonitor.addMark( "RemoveReplicaAtt" )
    gMonitor.addMark( "RemoveReplicaFail" )
    return bannedTargets

  if bannedTargets['Value']:
    return S_OK( "%s targets are banned for removal" % ",".join( bannedTargets['Value'] ) )

  # Can continue now
  self.log.verbose( "No targets banned for removal" )

  ## Do the transfer
  # # get waiting files. If none just return
  waitingFiles = self.getWaitingFilesList()
  if not waitingFiles:
    return S_OK()

  # # loop over files
  self.log.info( "Transferring files using Data manager..." )
  for opFile in waitingFiles:
    res = self.dmTransfer(opFile)
    if not res["OK"]:
      # transfer of this file failed: do not remove its source replicas
      continue
    else:
      ## Do the replica removal
      self.log.info( "Removing files using Data manager..." )
      # NOTE(review): the removal dict is rebuilt over ALL waiting files on
      # every successful transfer, not just the current opFile -- confirm
      # this is intended.
      toRemoveDict = dict( [ ( opFile.LFN, opFile ) for opFile in waitingFiles ] )
      self.log.info( "todo: %s replicas to delete from %s sites" % ( len( toRemoveDict ), len( targetSEs ) ) )
      self.dmRemoval(toRemoveDict,targetSEs)

  return S_OK()
def selectJob( self, resourceDescription, credDict ):
  """ Main job selection function to find the highest priority job matching the resource capacity

      :param resourceDescription: resource capacity description (dict or JDL string)
      :param credDict: credentials of the requester, used to build the resource dict
      :return: dict with 'JDL', 'JobID', 'DN', 'Group', 'PilotInfoReportedFlag'
               and any job optional parameters
      :raises RuntimeError: on any matching or bookkeeping failure; callers
               (e.g. MatcherHandler.export_requestJob) catch this and convert
               it to S_ERROR
  """
  startTime = time.time()
  resourceDict = self._getResourceDict( resourceDescription, credDict )

  negativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site'] )
  result = self.tqDB.matchAndGetJob( resourceDict, negativeCond = negativeCond )

  if not result['OK']:
    # Raise instead of returning the S_ERROR dict: the caller wraps the return
    # value in S_OK, so a returned error structure would be reported as success.
    raise RuntimeError( result['Message'] )
  result = result['Value']
  if not result['matchFound']:
    self.log.info( "No match found" )
    raise RuntimeError( "No match found" )

  jobID = result['jobId']
  resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] )
  if not resAtt['OK']:
    raise RuntimeError( 'Could not retrieve job attributes' )
  if not resAtt['Value']:
    raise RuntimeError( "No attributes returned for job" )
  if not resAtt['Value']['Status'] == 'Waiting':
    # stale task queue entry: drop it and report the inconsistency
    self.log.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) )
    result = self.tqDB.deleteJob( jobID )
    if not result[ 'OK' ]:
      # same contract as above: errors are signalled by raising
      raise RuntimeError( result['Message'] )
    raise RuntimeError( "Job %s is not in Waiting state" % str( jobID ) )

  self._reportStatus( resourceDict, jobID )

  result = self.jobDB.getJobJDL( jobID )
  if not result['OK']:
    raise RuntimeError( "Failed to get the job JDL" )

  resultDict = {}
  resultDict['JDL'] = result['Value']
  resultDict['JobID'] = jobID

  matchTime = time.time() - startTime
  self.log.info( "Match time: [%s]" % str( matchTime ) )
  gMonitor.addMark( "matchTime", matchTime )

  # Get some extra stuff into the response returned
  resOpt = self.jobDB.getJobOptParameters( jobID )
  if resOpt['OK']:
    for key, value in resOpt['Value'].items():
      resultDict[key] = value

  resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
  if not resAtt['OK']:
    raise RuntimeError( 'Could not retrieve job attributes' )
  if not resAtt['Value']:
    raise RuntimeError( 'No attributes returned for job' )

  if self.opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ):
    self.limiter.updateDelayCounters( resourceDict['Site'], jobID )

  # report pilot information only the first time this pilot matches
  pilotInfoReportedFlag = resourceDict.get( 'PilotInfoReportedFlag', False )
  if not pilotInfoReportedFlag:
    self._updatePilotInfo( resourceDict )
  self._updatePilotJobMapping( resourceDict, jobID )

  resultDict['DN'] = resAtt['Value']['OwnerDN']
  resultDict['Group'] = resAtt['Value']['OwnerGroup']
  resultDict['PilotInfoReportedFlag'] = True

  return resultDict
def __call__( self ):
  """ call me maybe

      Register a replica of each waiting file at the (single) target SE in the
      requested catalog(s).  Per-catalog registration failures spawn follow-up
      RegisterReplica operations for just the missing catalogs; hard failures
      mark the file Failed.

      :return: S_OK() when every replica was registered, S_ERROR otherwise
  """
  # # counter for failed replicas
  failedReplicas = 0
  # # catalog to use
  catalog = self.operation.Catalog
  # # get waiting files
  waitingFiles = self.getWaitingFilesList()
  # # loop over files
  registerOperations = {}
  for opFile in waitingFiles:

    gMonitor.addMark( "RegisterReplicaAtt", 1 )

    # # get LFN
    lfn = opFile.LFN
    # # and others
    targetSE = self.operation.targetSEList[0]
    replicaTuple = ( lfn , opFile.PFN, targetSE )
    # # call ReplicaManager
    registerReplica = self.dm.registerReplica( replicaTuple, catalog )
    # # check results
    if not registerReplica["OK"] or lfn in registerReplica["Value"]["Failed"]:
      # There have been some errors
      gMonitor.addMark( "RegisterReplicaFail", 1 )
      self.dataLoggingClient().addFileRecord( lfn, "RegisterReplicaFail", catalog, "", "RegisterReplica" )

      reason = registerReplica.get( "Message", registerReplica.get( "Value", {} ).get( "Failed", {} ).get( lfn, 'Unknown' ) )
      errorStr = "failed to register LFN %s: %s" % ( lfn, reason )
      # Use .get( "Value", {} ): when registerReplica is an S_ERROR structure it
      # has no "Value" key, and direct indexing used to raise a KeyError here.
      if lfn in registerReplica.get( "Value", {} ).get( "Successful", {} ) and isinstance( reason, dict ):
        # As we managed, let's create a new operation for just the remaining registration
        errorStr += ' - adding registerReplica operations to request'
        for failedCatalog in reason.keys():
          key = '%s/%s' % ( targetSE, failedCatalog )
          newOperation = self.getRegisterOperation( opFile, targetSE, type = 'RegisterReplica', catalog = failedCatalog )
          if key not in registerOperations:
            registerOperations[key] = newOperation
          else:
            registerOperations[key].addFile( newOperation[0] )
        opFile.Status = 'Done'
      else:
        opFile.Error = errorStr
        # If one targets explicitly a catalog and it fails
        if catalog and ( 'file does not exist' in opFile.Error.lower() or 'no such file' in opFile.Error.lower() ) :
          opFile.Status = 'Failed'
        failedReplicas += 1
      self.log.warn( errorStr )

    else:
      # All is OK
      gMonitor.addMark( "RegisterReplicaOK", 1 )
      self.dataLoggingClient().addFileRecord( lfn, "RegisterReplicaOK", catalog, "", "RegisterReplica" )
      self.log.info( "Replica %s has been registered at %s" % ( lfn, catalog ) )
      opFile.Status = "Done"

  # # if we have new replications to take place, put them at the end
  if registerOperations:
    self.log.info( "adding %d operations to the request" % len( registerOperations ) )
    for operation in registerOperations.values():
      self.operation._parent.addOperation( operation )

  # # final check
  if failedReplicas:
    self.log.info( "all replicas processed, %s replicas failed to register" % failedReplicas )
    self.operation.Error = "some replicas failed to register"
    return S_ERROR( self.operation.Error )

  return S_OK()