def test(self):
    """ test case """
    graph = FTSGraph("ftsGraph", self.ftsHistoryViews)
    self.assertEqual(type(graph), FTSGraph, "c'tor failed")

    site = graph.findSiteForSE("CERN-FOO")
    self.assertEqual(site["OK"], False, "findSiteForSE call failed for unknown SE")

    targetSite = graph.findSiteForSE("CERN-USER")
    self.assertEqual(targetSite["OK"], True, "findSiteForSE call failed for target SE")

    sourceSite = graph.findSiteForSE("RAL-USER")
    self.assertEqual(sourceSite["OK"], True, "findSiteForSE call failed for source SE")

    route = graph.findRoute("RAL-USER", "CERN-USER")
    self.assertEqual(route["OK"], True, "findRoute failed for known source and target SEs")

    route = graph.findRoute("RAL-FOO", "CERN-BAR")
    self.assertEqual(route["OK"], False, "findRoute failed for unknown source and target SEs")

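# The assertions above lean on DIRAC's return-value convention: every call
# returns a dict with an "OK" flag, plus "Value" on success or "Message" on
# failure. A minimal, self-contained sketch of that convention (the real
# helpers live in DIRAC.Core.Utilities.ReturnValues; the stand-ins and the
# toy route table below are illustrative only):

def S_OK(value=None):
    """ illustrative stand-in: successful result carrying a payload """
    return {"OK": True, "Value": value}

def S_ERROR(message=""):
    """ illustrative stand-in: failed result carrying an error message """
    return {"OK": False, "Message": message}

def findRouteSketch(routes, sourceSE, targetSE):
    """ toy lookup mirroring how FTSGraph.findRoute reports hits and misses """
    if (sourceSE, targetSE) in routes:
        return S_OK(routes[(sourceSE, targetSE)])
    return S_ERROR("route between %s and %s not found" % (sourceSE, targetSE))

# usage mirroring the test: always check result["OK"] before touching "Value"
result = findRouteSketch({("RAL-USER", "CERN-USER"): "RAL-USER#CERN-USER"},
                         "RAL-USER", "CERN-USER")
assert result["OK"] and result["Value"] == "RAL-USER#CERN-USER"
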
class FTSAgent(AgentModule):
    """
    .. class:: FTSAgent

    Agent propagating Scheduled request to Done or Failed state in the FTS system.

    Requests and associated FTSJobs (and so FTSFiles) are kept in cache.
    """
    # # fts graph refresh in seconds
    FTSGRAPH_REFRESH = FTSHistoryView.INTERVAL / 2
    # # SE R/W access refresh in seconds
    RW_REFRESH = 600
    # # placeholder for max job per channel
    MAX_ACTIVE_JOBS = 50
    # # min threads
    MIN_THREADS = 1
    # # max threads
    MAX_THREADS = 10
    # # files per job
    MAX_FILES_PER_JOB = 100
    # # MAX FTS transfer per FTSFile
    MAX_ATTEMPT = 256
    # # stage flag
    STAGE_FILES = False
    # # replica manager
    __replicaManager = None
    # # placeholder for FTS client
    __ftsClient = None
    # # placeholder for request client
    __requestClient = None
    # # placeholder for resources helper
    __resources = None
    # # placeholder for RSS client
    __rssClient = None
    # # placeholder for FTSGraph
    __ftsGraph = None
    # # graph regeneration time delta
    __ftsGraphValidStamp = None
    # # r/w access valid stamp
    __rwAccessValidStamp = None
    # # placeholder for threadPool
    __threadPool = None
    # # update lock
    __updateLock = None
    # # se cache
    __seCache = dict()
    # # request cache
    __reqCache = dict()

    def updateLock(self):
        """ update lock """
        if not self.__updateLock:
            self.__updateLock = LockRing().getLock("FTSAgentLock")
        return self.__updateLock

    @classmethod
    def requestClient(cls):
        """ request client getter """
        if not cls.__requestClient:
            cls.__requestClient = ReqClient()
        return cls.__requestClient

    @classmethod
    def ftsClient(cls):
        """ FTS client """
        if not cls.__ftsClient:
            cls.__ftsClient = FTSClient()
        return cls.__ftsClient

    @classmethod
    def replicaManager(cls):
        """ replica manager getter """
        if not cls.__replicaManager:
            cls.__replicaManager = ReplicaManager()
        return cls.__replicaManager

    @classmethod
    def rssClient(cls):
        """ RSS client getter """
        if not cls.__rssClient:
            cls.__rssClient = ResourceStatus()
        return cls.__rssClient

    @classmethod
    def getSE(cls, seName):
        """ keep SEs in cache """
        if seName not in cls.__seCache:
            cls.__seCache[seName] = StorageElement(seName)
        return cls.__seCache[seName]

    @classmethod
    def getRequest(cls, reqName):
        """ keep Requests in cache """
        if reqName not in cls.__reqCache:
            getRequest = cls.requestClient().getRequest(reqName)
            if not getRequest["OK"]:
                return getRequest
            getRequest = getRequest["Value"]
            if not getRequest:
                return S_ERROR("request of name '%s' not found in ReqDB" % reqName)
            cls.__reqCache[reqName] = getRequest
        return S_OK(cls.__reqCache[reqName])

    @classmethod
    def putRequest(cls, request):
        """ put request back to ReqDB

        :param Request request: Request instance

        also finalize request if status == Done
        """
        # # put back request
        put = cls.requestClient().putRequest(request)
        if not put["OK"]:
            return put
        # # finalize first if possible
        if request.Status == "Done" and request.JobID:
            finalizeRequest = cls.requestClient().finalizeRequest(request.RequestName, request.JobID)
            if not finalizeRequest["OK"]:
                request.Status = "Scheduled"
        # # del request from cache
        if request.RequestName in cls.__reqCache:
            del cls.__reqCache[request.RequestName]
        return S_OK()

    @classmethod
    def putFTSJobs(cls, ftsJobsList):
        """ put back fts jobs to the FTSDB """
        for ftsJob in ftsJobsList:
            put = cls.ftsClient().putFTSJob(ftsJob)
            if not put["OK"]:
                return put
        return S_OK()

    @staticmethod
    def updateFTSFileDict(ftsFilesDict, toUpdateDict):
        """ update :ftsFilesDict: with FTSFiles in :toUpdateDict: """
        for category, ftsFileList in ftsFilesDict.items():
            for ftsFile in toUpdateDict.get(category, []):
                if ftsFile not in ftsFileList:
                    ftsFileList.append(ftsFile)
        return ftsFilesDict

    # def resources( self ):
    #     """ resource helper getter """
    #     if not self.__resources:
    #         self.__resources = Resources()
    #     return self.__resources

    def threadPool(self):
        """ thread pool getter """
        if not self.__threadPool:
            self.__threadPool = ThreadPool(self.MIN_THREADS, self.MAX_THREADS)
            self.__threadPool.daemonize()
        return self.__threadPool

    def resetFTSGraph(self):
        """ create fts graph """
        log = gLogger.getSubLogger("ftsGraph")

        ftsHistory = self.ftsClient().getFTSHistory()
        if not ftsHistory["OK"]:
            log.error("unable to get FTS history: %s" % ftsHistory["Message"])
            return ftsHistory
        ftsHistory = ftsHistory["Value"]

        try:
            self.updateLock().acquire()
            self.__ftsGraph = FTSGraph("FTSGraph", ftsHistory)
        finally:
            self.updateLock().release()

        log.debug("FTSSites: %s" % len(self.__ftsGraph.nodes()))
        for i, site in enumerate(self.__ftsGraph.nodes()):
            log.debug(" [%02d] FTSSite: %-25s FTSServer: %s" % (i, site.name, site.FTSServer))

        log.debug("FTSRoutes: %s" % len(self.__ftsGraph.edges()))
        for i, route in enumerate(self.__ftsGraph.edges()):
            log.debug(" [%02d] FTSRoute: %-25s Active FTSJobs (Max) = %s (%s)" %
                      (i, route.routeName, route.ActiveJobs, route.toNode.MaxActiveJobs))

        # # save graph stamp
        self.__ftsGraphValidStamp = datetime.datetime.now() + datetime.timedelta(seconds=self.FTSGRAPH_REFRESH)

        # # refresh SE R/W access
        try:
            self.updateLock().acquire()
            self.__ftsGraph.updateRWAccess()
        finally:
            self.updateLock().release()
        # # save rw access stamp
        self.__rwAccessValidStamp = datetime.datetime.now() + datetime.timedelta(seconds=self.RW_REFRESH)

        return S_OK()

    def initialize(self):
        """ agent's initialization """
        log = self.log.getSubLogger("initialize")

        self.FTSGRAPH_REFRESH = self.am_getOption("FTSGraphValidityPeriod", self.FTSGRAPH_REFRESH)
        log.info("FTSGraph validity period = %s s" % self.FTSGRAPH_REFRESH)
        self.RW_REFRESH = self.am_getOption("RWAccessValidityPeriod", self.RW_REFRESH)
        log.info("SEs R/W access validity period = %s s" % self.RW_REFRESH)
        self.STAGE_FILES = self.am_getOption("StageFiles", self.STAGE_FILES)
        log.info("Stage files before submission = %s" % {True: "yes", False: "no"}[bool(self.STAGE_FILES)])
        self.MAX_ACTIVE_JOBS = self.am_getOption("MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS)
        log.info("Max active FTSJobs/route = %s" % self.MAX_ACTIVE_JOBS)
        self.MAX_FILES_PER_JOB = self.am_getOption("MaxFilesPerJob", self.MAX_FILES_PER_JOB)
        log.info("Max FTSFiles/FTSJob = %d" % self.MAX_FILES_PER_JOB)
        self.MAX_ATTEMPT = self.am_getOption("MaxTransferAttempts", self.MAX_ATTEMPT)
        log.info("Max transfer attempts = %s" % self.MAX_ATTEMPT)

        # # thread pool
        self.MIN_THREADS = self.am_getOption("MinThreads", self.MIN_THREADS)
        self.MAX_THREADS = self.am_getOption("MaxThreads", self.MAX_THREADS)
        minmax = (abs(self.MIN_THREADS), abs(self.MAX_THREADS))
        self.MIN_THREADS, self.MAX_THREADS = min(minmax), max(minmax)
        log.info("ThreadPool min threads = %s" % self.MIN_THREADS)
        log.info("ThreadPool max threads = %s" % self.MAX_THREADS)

        log.info("initialize: creation of FTSGraph...")
        createGraph = self.resetFTSGraph()
        if not createGraph["OK"]:
            log.error("initialize: %s" % createGraph["Message"])
            return createGraph

        # This sets the Default Proxy to be used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')
        log.info("will use DataManager proxy")

        # # gMonitor stuff here
        gMonitor.registerActivity("RequestsAtt", "Attempted requests executions",
                                  "FTSAgent", "Requests/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("RequestsOK", "Successful requests executions",
                                  "FTSAgent", "Requests/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("RequestsFail", "Failed requests executions",
                                  "FTSAgent", "Requests/min", gMonitor.OP_SUM)

        gMonitor.registerActivity("FTSJobsSubAtt", "FTSJobs creation attempts",
                                  "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("FTSJobsSubOK", "FTSJobs submitted successfully",
                                  "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("FTSJobsSubFail", "FTSJobs submissions failed",
                                  "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM)

        gMonitor.registerActivity("FTSJobsMonAtt", "FTSJobs monitored",
                                  "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("FTSJobsMonOK", "FTSJobs monitored successfully",
                                  "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM)
        gMonitor.registerActivity("FTSJobsMonFail", "FTSJobs attempts failed",
                                  "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM)

        gMonitor.registerActivity("FTSMonitorFail", "Failed FTS monitor executions",
                                  "FTSAgent", "Execution/mins", gMonitor.OP_SUM)

        pollingTime = self.am_getOption("PollingTime", 60)
        for status in list(FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES):
            gMonitor.registerActivity("FTSJobs%s" % status, "FTSJobs %s" % status,
                                      "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime)

        gMonitor.registerActivity("FtSJobsPerRequest", "Average FTSJobs per request",
                                  "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN)
        gMonitor.registerActivity("FTSFilesPerJob", "FTSFiles per FTSJob",
                                  "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN)
        gMonitor.registerActivity("FTSSizePerJob", "Average FTSFiles size per FTSJob",
                                  "FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN)
        return S_OK()

    def finalize(self):
        """ finalize processing """
        log = self.log.getSubLogger("finalize")
        for request in self.__reqCache.values():
            put = self.requestClient().putRequest(request)
            if not put["OK"]:
                log.error("unable to put back request '%s': %s" % (request.RequestName, put["Message"]))
        return S_OK()

    def execute(self):
        """ one cycle execution """
        log = gLogger.getSubLogger("execute")
        # # reset FTSGraph if expired
        now = datetime.datetime.now()
        if now > self.__ftsGraphValidStamp:
            log.info("resetting expired FTS graph...")
            resetFTSGraph = self.resetFTSGraph()
            if not resetFTSGraph["OK"]:
                log.error("FTSGraph recreation error: %s" % resetFTSGraph["Message"])
                return resetFTSGraph
            self.__ftsGraphValidStamp = now + datetime.timedelta(seconds=self.FTSGRAPH_REFRESH)
        # # update R/W access in FTSGraph if expired
        if now > self.__rwAccessValidStamp:
            log.info("updating expired R/W access for SEs...")
            try:
                self.updateLock().acquire()
                self.__ftsGraph.updateRWAccess()
            finally:
                self.updateLock().release()
            self.__rwAccessValidStamp = now + datetime.timedelta(seconds=self.RW_REFRESH)

        requestNames = self.requestClient().getRequestNamesList(["Scheduled"])
        if not requestNames["OK"]:
            log.error("unable to read scheduled request names: %s" % requestNames["Message"])
            return requestNames
        if not requestNames["Value"]:
            requestNames = self.__reqCache.keys()
        else:
            requestNames = [req[0] for req in requestNames["Value"]]
            requestNames = list(set(requestNames + self.__reqCache.keys()))

        if not requestNames:
            log.info("no 'Scheduled' requests to process")
            return S_OK()

        log.info("found %s requests to process:" % len(requestNames))
        log.info(" => from internal cache: %s" % (len(self.__reqCache)))
        log.info(" => new read from RMS: %s" % (len(requestNames) - len(self.__reqCache)))

        for requestName in requestNames:
            request = self.getRequest(requestName)
            if not request["OK"]:
                log.error(request["Message"])
                continue
            request = request["Value"]
            sTJId = request.RequestName
            while True:
                queue = self.threadPool().generateJobAndQueueIt(self.processRequest,
                                                                args=(request, ),
                                                                sTJId=sTJId)
                if queue["OK"]:
                    log.info("request '%s' enqueued for execution" % sTJId)
                    gMonitor.addMark("RequestsAtt", 1)
                    break
                time.sleep(1)

        # # process all results
        self.threadPool().processAllResults()
        return S_OK()

    def processRequest(self, request):
        """ process one request

        :param Request request: ReqDB.Request
        """
        log = self.log.getSubLogger(request.RequestName)

        operation = request.getWaiting()
        if not operation["OK"]:
            log.error("unable to find 'Scheduled' ReplicateAndRegister operation in request")
            return self.putRequest(request)
        operation = operation["Value"]
        if operation.Type != "ReplicateAndRegister":
            log.error("operation to be executed is not a ReplicateAndRegister but %s" % operation.Type)
            return self.putRequest(request)
        if operation.Status != "Scheduled":
            log.error("operation in a wrong state, expecting 'Scheduled', got %s" % operation.Status)
            return self.putRequest(request)

        # # select FTSJobs, by default all in TRANS_STATES and INIT_STATES
        ftsJobs = self.ftsClient().getFTSJobsForRequest(request.RequestID)
        if not ftsJobs["OK"]:
            log.error(ftsJobs["Message"])
            return ftsJobs
        ftsJobs = ftsJobs["Value"] if ftsJobs["Value"] else []

        # # dict keeping info about files to reschedule, submit, fail and register
        ftsFilesDict = dict([(k, list()) for k in ("toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate")])

        if ftsJobs:
            log.info("==> found %s FTSJobs to monitor" % len(ftsJobs))
            # # PHASE 0 = monitor active FTSJobs
            for ftsJob in ftsJobs:
                monitor = self.__monitorJob(request, ftsJob)
                if not monitor["OK"]:
                    log.error("unable to monitor FTSJob %s: %s" % (ftsJob.FTSJobID, monitor["Message"]))
                    ftsJob.Status = "Submitted"
                    continue
                ftsFilesDict = self.updateFTSFileDict(ftsFilesDict, monitor["Value"])

            log.info("monitoring of FTSJobs completed")
            for key, ftsFiles in ftsFilesDict.items():
                if ftsFiles:
                    log.debug(" => %s FTSFiles to %s" % (len(ftsFiles), key[2:].lower()))

        # # PHASE ONE - check ready replicas
        missingReplicas = self.__checkReadyReplicas(request, operation)
        if not missingReplicas["OK"]:
            log.error(missingReplicas["Message"])
        else:
            missingReplicas = missingReplicas["Value"]
            for opFile in operation:
                # Actually the condition below should never happen... Change printout for checking
                if opFile.LFN not in missingReplicas and opFile.Status != 'Done':
                    log.warn("Should be set! %s is replicated at all targets" % opFile.LFN)
                    opFile.Status = "Done"

        toFail = ftsFilesDict.get("toFail", [])
        toReschedule = ftsFilesDict.get("toReschedule", [])
        toSubmit = ftsFilesDict.get("toSubmit", [])
        toRegister = ftsFilesDict.get("toRegister", [])
        toUpdate = ftsFilesDict.get("toUpdate", [])

        # # PHASE TWO = Failed files? -> make request Failed and return
        if toFail:
            log.error("==> found %s 'Failed' FTSFiles, request execution cannot proceed..." % len(toFail))
            for opFile in operation:
                for ftsFile in toFail:
                    if opFile.FileID == ftsFile.FileID:
                        opFile.Error = ftsFile.Error
                        opFile.Status = "Failed"
            operation.Error = "%s files are missing any replicas" % len(toFail)
            # # request.Status should be "Failed" at this stage
            if request.Status == "Failed":
                request.Error = "ReplicateAndRegister %s failed" % operation.Order
                log.error("request is set to 'Failed'")
                return self.putRequest(request)

        # # PHASE THREE - update Waiting#SourceSE FTSFiles
        if toUpdate:
            log.info("==> found %s possible FTSFiles to update..." % (len(toUpdate)))
            byTarget = {}
            for ftsFile in toUpdate:
                if ftsFile.TargetSE not in byTarget:
                    byTarget.setdefault(ftsFile.TargetSE, [])
                byTarget[ftsFile.TargetSE].append(ftsFile.FileID)
            for targetSE, fileIDList in byTarget.items():
                update = self.ftsClient().setFTSFilesWaiting(operation.OperationID, targetSE, fileIDList)
                if not update["OK"]:
                    log.error("update FTSFiles failed: %s" % update["Message"])
                    continue

        # # PHASE FOUR - add 'RegisterReplica' Operations
        if toRegister:
            log.info("==> found %s Files waiting for registration, adding 'RegisterReplica' operations" % len(toRegister))
            registerFiles = self.__register(request, operation, toRegister)
            if not registerFiles["OK"]:
                log.error("unable to create 'RegisterReplica' operations: %s" % registerFiles["Message"])
            if request.Status == "Waiting":
                log.info("request is in 'Waiting' state, will put it back to RMS")
                return self.putRequest(request)

        # # PHASE FIVE - reschedule operation files
        if toReschedule:
            log.info("==> found %s Files to reschedule" % len(toReschedule))
            rescheduleFiles = self.__reschedule(request, operation, toReschedule)
            if not rescheduleFiles["OK"]:
                log.error(rescheduleFiles["Message"])
            if request.Status == "Waiting":
                log.info("request is in 'Waiting' state, will put it back to ReqDB")
                return self.putRequest(request)

        # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs
        ftsFiles = self.ftsClient().getFTSFilesForRequest(request.RequestID, ["Waiting"])
        if not ftsFiles["OK"]:
            log.error(ftsFiles["Message"])
        else:
            retryIds = list(set([ftsFile.FTSFileID for ftsFile in toSubmit]))
            for ftsFile in ftsFiles["Value"]:
                if ftsFile.FTSFileID not in retryIds:
                    toSubmit.append(ftsFile)
                    retryIds.append(ftsFile.FTSFileID)

        # # submit new ftsJobs
        if operation.Status == "Scheduled" and toSubmit:
            log.info("==> found %s FTSFiles to submit" % len(toSubmit))
            submit = self.__submit(request, operation, toSubmit)
            if not submit["OK"]:
                log.error(submit["Message"])
            else:
                ftsJobs += submit["Value"]

        # # status change? - put back request
        if request.Status != "Scheduled":
            put = self.putRequest(request)
            if not put["OK"]:
                log.error("unable to put back request: %s" % put["Message"])
                return put

        # # put back jobs
        if ftsJobs:
            putJobs = self.putFTSJobs(ftsJobs)
            if not putJobs["OK"]:
                log.error("unable to put back FTSJobs: %s" % putJobs["Message"])
                return putJobs

        return S_OK()

    def __reschedule(self, request, operation, toReschedule):
        """ reschedule list of :toReschedule: files in request for operation :operation:

        :param Request request:
        :param Operation operation:
        :param list toReschedule: list of FTSFiles
        """
        log = self.log.getSubLogger("%s/reschedule" % request.RequestName)
        log.info("found %s files to reschedule" % len(toReschedule))

        for opFile in operation:
            for ftsFile in toReschedule:
                if opFile.FileID == ftsFile.FileID:
                    opFile.Status = "Waiting"

        toSchedule = []

        # # filter files
        for opFile in operation.getWaitingFilesList():
            replicas = self.__filterReplicas(opFile)
            if not replicas["OK"]:
                continue
            replicas = replicas["Value"]
            validReplicas = replicas["Valid"]
            bannedReplicas = replicas["Banned"]
            if not validReplicas and bannedReplicas:
                log.warn("unable to schedule '%s', replicas only at banned SEs" % opFile.LFN)
                continue
            if validReplicas:
                validTargets = list(set(operation.targetSEList) - set(validReplicas))
                if not validTargets:
                    log.info("file %s is already present at all targets" % opFile.LFN)
                    opFile.Status = "Done"
                    continue
                toSchedule.append((opFile.toJSON()["Value"], validReplicas, validTargets))

        # # do real schedule here
        if toSchedule:
            ftsSchedule = self.ftsClient().ftsSchedule(request.RequestID, operation.OperationID, toSchedule)
            if not ftsSchedule["OK"]:
                self.log.error(ftsSchedule["Message"])
                return ftsSchedule
            ftsSchedule = ftsSchedule["Value"]
            for fileID in ftsSchedule["Successful"]:
                for opFile in operation:
                    if fileID == opFile.FileID:
                        opFile.Status = "Scheduled"
            for fileID, reason in ftsSchedule["Failed"]:
                for opFile in operation:
                    if fileID == opFile.FileID:
                        opFile.Error = reason

        return S_OK()

    def __submit(self, request, operation, toSubmit):
        """ create and submit new FTSJobs using list of FTSFiles

        :param Request request: ReqDB.Request instance
        :param list toSubmit: list of FTSFile instances

        :return: [ FTSJob, FTSJob, ... ]
        """
        log = self.log.getSubLogger("%s/submit" % request.RequestName)

        bySourceAndTarget = {}
        for ftsFile in toSubmit:
            if ftsFile.SourceSE not in bySourceAndTarget:
                bySourceAndTarget.setdefault(ftsFile.SourceSE, {})
            if ftsFile.TargetSE not in bySourceAndTarget[ftsFile.SourceSE]:
                bySourceAndTarget[ftsFile.SourceSE].setdefault(ftsFile.TargetSE, [])
            bySourceAndTarget[ftsFile.SourceSE][ftsFile.TargetSE].append(ftsFile)

        ftsJobs = []

        for source, targetDict in bySourceAndTarget.items():
            for target, ftsFileList in targetDict.items():
                log.info("found %s files to submit from %s to %s" % (len(ftsFileList), source, target))

                route = self.__ftsGraph.findRoute(source, target)
                if not route["OK"]:
                    log.error(route["Message"])
                    continue
                route = route["Value"]

                sourceRead = route.fromNode.SEs[source]["read"]
                if not sourceRead:
                    log.error("SourceSE %s is banned for reading right now" % source)
                    continue
                targetWrite = route.toNode.SEs[target]["write"]
                if not targetWrite:
                    log.error("TargetSE %s is banned for writing right now" % target)
                    continue

                if route.ActiveJobs > route.toNode.MaxActiveJobs:
                    log.warn("unable to submit new FTS job, max active jobs reached")
                    continue

                # # create FTSJob
                ftsJob = FTSJob()
                ftsJob.RequestID = request.RequestID
                ftsJob.OperationID = operation.OperationID
                ftsJob.SourceSE = source
                ftsJob.TargetSE = target

                sourceSE = self.getSE(source)
                sourceToken = sourceSE.getStorageParameters("SRM2")
                if not sourceToken["OK"]:
                    log.error("unable to get sourceSE '%s' parameters: %s" % (source, sourceToken["Message"]))
                    continue
                ftsJob.SourceToken = sourceToken["Value"].get("SpaceToken", "")

                targetSE = self.getSE(target)
                targetToken = targetSE.getStorageParameters("SRM2")
                if not targetToken["OK"]:
                    log.error("unable to get targetSE '%s' parameters: %s" % (target, targetToken["Message"]))
                    continue
                ftsJob.TargetToken = targetToken["Value"].get("SpaceToken", "")

                ftsJob.FTSServer = route.toNode.FTSServer

                for ftsFile in ftsFileList:
                    ftsFile.Attempt += 1
                    ftsFile.Error = ""
                    ftsJob.addFile(ftsFile)

                submit = ftsJob.submitFTS2(self.STAGE_FILES)
                if not submit["OK"]:
                    log.error("unable to submit FTSJob: %s" % submit["Message"])
                    continue

                log.info("FTSJob '%s'@'%s' has been submitted" % (ftsJob.FTSGUID, ftsJob.FTSServer))

                # # update statuses for job files
                for ftsFile in ftsJob:
                    ftsFile.FTSGUID = ftsJob.FTSGUID
                    ftsFile.Status = "Submitted"
                    ftsFile.Attempt += 1

                # # update graph route
                try:
                    self.updateLock().acquire()
                    route.ActiveJobs += 1
                finally:
                    self.updateLock().release()

                ftsJobs.append(ftsJob)

        log.info("%s new FTSJobs have been submitted" % len(ftsJobs))
        return S_OK(ftsJobs)

    def __monitorJob(self, request, ftsJob):
        """ execute FTSJob.monitorFTS2 for a given :ftsJob:
        if ftsJob is in a final state, finalize it

        :param Request request: ReqDB.Request instance
        :param FTSJob ftsJob: FTSDB.FTSJob instance
        """
        log = self.log.getSubLogger("%s/monitor/%s" % (request.RequestName, ftsJob.FTSGUID))
        log.info("FTSJob '%s'@'%s'" % (ftsJob.FTSGUID, ftsJob.FTSServer))

        # # this will be returned
        ftsFilesDict = dict([(k, list()) for k in ("toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate")])

        monitor = ftsJob.monitorFTS2()
        if not monitor["OK"]:
            gMonitor.addMark("FTSMonitorFail", 1)
            log.error(monitor["Message"])
            if "getTransferJobSummary2: Not authorised to query request" in monitor["Message"]:
                log.error("FTSJob not known (expired on server?)")
                for ftsFile in ftsJob:
                    ftsFile.Status = "Waiting"
                    ftsFilesDict["toSubmit"].append(ftsFile)
                return S_OK(ftsFilesDict)
            return monitor

        monitor = monitor["Value"]
        log.info("FTSJob Status = %s Completeness = %s" % (ftsJob.Status, ftsJob.Completeness))

        # # monitor status change
        gMonitor.addMark("FTSJobs%s" % ftsJob.Status, 1)

        if ftsJob.Status in FTSJob.FINALSTATES:
            finalizeFTSJob = self.__finalizeFTSJob(request, ftsJob)
            if not finalizeFTSJob["OK"]:
                log.error(finalizeFTSJob["Message"])
                return finalizeFTSJob
            ftsFilesDict = self.updateFTSFileDict(ftsFilesDict, finalizeFTSJob["Value"])

        return S_OK(ftsFilesDict)

    def __finalizeFTSJob(self, request, ftsJob):
        """ finalize FTSJob

        :param Request request: ReqDB.Request instance
        :param FTSJob ftsJob: FTSDB.FTSJob instance
        """
        log = self.log.getSubLogger("%s/monitor/%s/finalize" % (request.RequestName, ftsJob.FTSJobID))
        log.info("finalizing FTSJob %s@%s" % (ftsJob.FTSGUID, ftsJob.FTSServer))

        # # this will be returned
        ftsFilesDict = dict([(k, list()) for k in ("toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate")])

        monitor = ftsJob.monitorFTS2(full=True)
        if not monitor["OK"]:
            log.error(monitor["Message"])
            return monitor

        # # split FTSFiles to different categories
        processFiles = self.__filterFiles(ftsJob)
        if not processFiles["OK"]:
            log.error(processFiles["Message"])
            return processFiles
        ftsFilesDict = self.updateFTSFileDict(ftsFilesDict, processFiles["Value"])

        # # send accounting record for this job
        self.__sendAccounting(ftsJob, request.OwnerDN)

        # # update graph - remove this job from graph
        route = self.__ftsGraph.findRoute(ftsJob.SourceSE, ftsJob.TargetSE)
        if route["OK"]:
            try:
                self.updateLock().acquire()
                route["Value"].ActiveJobs -= 1
            finally:
                self.updateLock().release()

        log.info("FTSJob is finalized")
        return S_OK(ftsFilesDict)

    def __filterFiles(self, ftsJob):
        """ process ftsFiles from finished ftsJob

        :param FTSJob ftsJob: monitored FTSJob instance
        """
        # # lists for different categories
        toUpdate = []
        toReschedule = []
        toRegister = []
        toSubmit = []
        toFail = []

        # # loop over files in fts job
        for ftsFile in ftsJob:
            # # successful files
            if ftsFile.Status == "Finished":
                if ftsFile.Error == "AddCatalogReplicaFailed":
                    toRegister.append(ftsFile)
                toUpdate.append(ftsFile)
                continue
            if ftsFile.Status == "Failed":
                if ftsFile.Error == "MissingSource":
                    toReschedule.append(ftsFile)
                else:
                    if ftsFile.Attempt < self.MAX_ATTEMPT:
                        toSubmit.append(ftsFile)
                    else:
                        toFail.append(ftsFile)
                        ftsFile.Error = "Max attempts reached"

        return S_OK({"toUpdate": toUpdate,
                     "toSubmit": toSubmit,
                     "toRegister": toRegister,
                     "toReschedule": toReschedule,
                     "toFail": toFail})

    def __register(self, request, operation, toRegister):
        """ add RegisterReplica operation

        :param Request request: request instance
        :param Operation operation: 'ReplicateAndRegister' operation for this FTSJob
        :param list toRegister: [ FTSDB.FTSFile, ... ] - files that failed to register
        """
        log = self.log.getSubLogger("%s/registerFiles" % request.RequestName)

        byTarget = {}
        for ftsFile in toRegister:
            if ftsFile.TargetSE not in byTarget:
                byTarget.setdefault(ftsFile.TargetSE, [])
            byTarget[ftsFile.TargetSE].append(ftsFile)
        log.info("will create %s 'RegisterReplica' operations" % len(byTarget))

        for target, ftsFileList in byTarget.items():
            log.info("creating 'RegisterReplica' operation for targetSE %s with %s files..." %
                     (target, len(ftsFileList)))
            registerOperation = Operation()
            registerOperation.Type = "RegisterReplica"
            registerOperation.Status = "Waiting"
            registerOperation.TargetSE = target
            targetSE = self.getSE(target)
            for ftsFile in ftsFileList:
                opFile = File()
                opFile.LFN = ftsFile.LFN
                pfn = targetSE.getPfnForProtocol(ftsFile.TargetSURL, "SRM2", withPort=False)
                if not pfn["OK"]:
                    continue
                opFile.PFN = pfn["Value"]
                registerOperation.addFile(opFile)
            request.insertBefore(registerOperation, operation)

        return S_OK()

    @staticmethod
    def __sendAccounting(ftsJob, ownerDN):
        """ prepare and send DataOperation to AccountingDB """
        dataOp = DataOperation()
        dataOp.setStartTime(fromString(ftsJob.SubmitTime))
        dataOp.setEndTime(fromString(ftsJob.LastUpdate))

        accountingDict = dict()
        accountingDict["OperationType"] = "ReplicateAndRegister"

        username = getUsernameForDN(ownerDN)
        if not username["OK"]:
            username = ownerDN
        else:
            username = username["Value"]

        accountingDict["User"] = username
        accountingDict["Protocol"] = "FTS3" if 'fts3' in ftsJob.FTSServer.lower() else 'FTS'
        accountingDict['ExecutionSite'] = ftsJob.FTSServer

        accountingDict['RegistrationTime'] = ftsJob._regTime
        accountingDict['RegistrationOK'] = ftsJob._regSuccess
        accountingDict['RegistrationTotal'] = ftsJob._regTotal

        accountingDict["TransferOK"] = len([f for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES])
        accountingDict["TransferTotal"] = len(ftsJob)
        accountingDict["TransferSize"] = ftsJob.Size - ftsJob.FailedSize
        accountingDict["FinalStatus"] = ftsJob.Status
        accountingDict["Source"] = ftsJob.SourceSE
        accountingDict["Destination"] = ftsJob.TargetSE

        dt = ftsJob.LastUpdate - ftsJob.SubmitTime
        transferTime = dt.days * 86400 + dt.seconds
        accountingDict["TransferTime"] = transferTime
        # accountingDict['TransferTime'] = sum([f._duration for f in ftsJob])
        dataOp.setValuesFromDict(accountingDict)
        dataOp.commit()

    def __checkReadyReplicas(self, request, operation):
        """ check ready replicas for transferOperation """
        log = self.log.getSubLogger("%s/checkReadyReplicas" % request.RequestName)

        targetSESet = set(operation.targetSEList)

        # # { LFN: [ targetSE, ... ] }
        missingReplicas = {}

        scheduledFiles = dict([(opFile.LFN, opFile) for opFile in operation
                               if opFile.Status in ("Scheduled", "Waiting")])
        # # get replicas
        replicas = self.replicaManager().getCatalogReplicas(scheduledFiles.keys())
        if not replicas["OK"]:
            self.log.error(replicas["Message"])
            return replicas
        replicas = replicas["Value"]

        fullyReplicated = 0
        missingSEs = {}
        for successfulLFN in replicas["Successful"]:
            reps = set(replicas['Successful'][successfulLFN])
            if targetSESet.issubset(reps):
                log.info("%s has been replicated to all targets" % successfulLFN)
                fullyReplicated += 1
                scheduledFiles[successfulLFN].Status = "Done"
            else:
                missingReplicas[successfulLFN] = sorted(targetSESet - reps)
                ses = ",".join(missingReplicas[successfulLFN])
                missingSEs[ses] = missingSEs.setdefault(ses, 0) + 1
                log.verbose("%s is still missing at %s" % (successfulLFN, ses))
        if fullyReplicated:
            log.info("%d new files have been replicated to all targets" % fullyReplicated)
        if missingSEs:
            for ses in missingSEs:
                log.info("%d replicas still missing at %s" % (missingSEs[ses], ses))

        reMissing = re.compile("no such file or directory")
        for failedLFN, errStr in replicas["Failed"].items():
            scheduledFiles[failedLFN].Error = errStr
            if reMissing.search(errStr.lower()):
                log.error("%s is missing, setting its status to 'Failed'" % failedLFN)
                scheduledFiles[failedLFN].Status = "Failed"
            else:
                log.warn("unable to read replicas for %s: %s" % (failedLFN, errStr))

        return S_OK(missingReplicas)

    def __filterReplicas(self, opFile):
        """ filter out banned/invalid source SEs """
        log = self.log.getSubLogger("filterReplicas")

        ret = {"Valid": [], "Banned": [], "Bad": []}

        replicas = self.replicaManager().getActiveReplicas(opFile.LFN)
        if not replicas["OK"]:
            log.error(replicas["Message"])
            # # bail out, otherwise the "Value" lookup below would fail
            return replicas
        reNotExists = re.compile("no such file or directory")
        replicas = replicas["Value"]
        failed = replicas["Failed"].get(opFile.LFN, "")
        if reNotExists.match(failed.lower()):
            opFile.Status = "Failed"
            opFile.Error = failed
            return S_ERROR(failed)

        replicas = replicas["Successful"][opFile.LFN] if opFile.LFN in replicas["Successful"] else {}

        for repSEName in replicas:
            repSE = self.getSE(repSEName)

            pfn = repSE.getPfnForLfn(opFile.LFN)
            if not pfn["OK"]:
                log.warn("unable to create pfn for %s lfn: %s" % (opFile.LFN, pfn["Message"]))
                ret["Banned"].append(repSEName)
                continue
            pfn = pfn["Value"]

            repSEMetadata = repSE.getFileMetadata(pfn, singleFile=True)
            if not repSEMetadata["OK"]:
                self.log.warn(repSEMetadata["Message"])
                ret["Banned"].append(repSEName)
                continue
            repSEMetadata = repSEMetadata["Value"]

            seChecksum = repSEMetadata["Checksum"].replace("x", "0").zfill(8) if "Checksum" in repSEMetadata else None
            if opFile.Checksum and opFile.Checksum != seChecksum:
                self.log.warn(" %s checksum mismatch: %s %s:%s" % (opFile.LFN, opFile.Checksum, repSE, seChecksum))
                ret["Bad"].append(repSEName)
                continue
            # # if we're here repSE is OK
            ret["Valid"].append(repSEName)

        return S_OK(ret)

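# updateFTSFileDict above merges per-category FTSFile lists while skipping
# entries already present. A standalone sketch of the same merge semantics,
# using plain strings instead of FTSFile instances (the file names are made up):

def mergeCategoryDicts(accumulated, update):
    """ merge 'update' into 'accumulated' category by category, without duplicates """
    for category, itemList in accumulated.items():
        for item in update.get(category, []):
            if item not in itemList:
                itemList.append(item)
    return accumulated

acc = {"toSubmit": ["a.dat"], "toFail": []}
acc = mergeCategoryDicts(acc, {"toSubmit": ["a.dat", "b.dat"], "toFail": ["c.dat"]})
assert acc == {"toSubmit": ["a.dat", "b.dat"], "toFail": ["c.dat"]}
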
class FTSStrategy( object ):
  """
  .. class:: FTSStrategy

  helper class to create replication forest for a given file and its replicas using
  several different strategies
  """
  # # make it singleton
  __metaclass__ = DIRACSingleton
  # # list of supported strategies
  __supportedStrategies = [ 'Simple', 'DynamicThroughput', 'Swarm', 'MinimiseTotalWait' ]
  # # FTS graph
  __ftsGraph = None
  # # lock
  __graphLock = None
  # # resources
  __resources = None
  # # rss client
  __rssClient = None
  # # acceptable failure rate
  acceptableFailureRate = 75
  # # acceptable failed files
  acceptableFailedFiles = 5
  # # scheduling type
  schedulingType = "Files"

  def __init__( self, csPath = None, ftsSites = None, ftsHistoryViews = None ):
    """c'tor

    :param self: self reference
    :param str csPath: CS path
    :param list ftsSites: list of FTSSites
    :param list ftsHistoryViews: list of FTSHistoryViews
    """
    # # config path
    self.csPath = csPath
    # # fts sites
    ftsSites = ftsSites if ftsSites else []
    # # history views
    ftsHistoryViews = ftsHistoryViews if ftsHistoryViews else []
    # # own sub logger
    self.log = gLogger.getSubLogger( "FTSStrategy", child = True )
    self.log.setLevel( gConfig.getValue( self.csPath + "/LogLevel", "DEBUG" ) )

    for ftsSite in ftsSites:
      self.log.info( "FTSSite: %-16s FTSServer=%s" % ( ftsSite.Name, ftsSite.FTSServer ) )

    # # CS options
    self.log.info( "Supported strategies = %s" % ", ".join( self.supportedStrategies ) )

    self.activeStrategies = gConfig.getValue( "%s/%s" % ( self.csPath, "ActiveStrategies" ), ["MinimiseTotalWait"] )
    self.log.info( "ActiveStrategies = %s" % ", ".join( self.activeStrategies ) )
    self.numberOfStrategies = len( self.activeStrategies )
    self.log.info( "Number of active strategies = %s" % self.numberOfStrategies )
    self.sigma = gConfig.getValue( "%s/%s" % ( self.csPath, "HopSigma" ), 5 )
    self.log.info( "HopSigma = %s" % self.sigma )
    self.schedulingType = gConfig.getValue( "%s/%s" % ( self.csPath, "SchedulingType" ), "Files" )
    self.log.info( "SchedulingType = %s" % self.schedulingType )
    self.acceptableFailureRate = gConfig.getValue( "%s/%s" % ( self.csPath, "AcceptableFailureRate" ), 75 )
    self.log.info( "AcceptableFailureRate = %s" % self.acceptableFailureRate )
    self.acceptableFailedFiles = gConfig.getValue( "%s/%s" % ( self.csPath, "AcceptableFailedFiles" ), 5 )
    self.log.info( "AcceptableFailedFiles = %s" % self.acceptableFailedFiles )
    # # chosen strategy
    self.chosenStrategy = 0
    # # dispatcher
    self.strategyDispatcher = { "MinimiseTotalWait" : self.minimiseTotalWait,
                                "DynamicThroughput" : self.dynamicThroughput,
                                "Simple" : self.simple,
                                "Swarm" : self.swarm }
    self.ftsGraph = FTSGraph( "FTSGraph", ftsHistoryViews, self.acceptableFailureRate,
                              self.acceptableFailedFiles, self.schedulingType )
    # for node in self.ftsGraph.nodes():
    #   self.log.debug( node )
    # for edge in self.ftsGraph.edges():
    #   self.log.debug( edge )
    # # if we land here everything is OK
    self.log.info( "%s has been constructed" % self.__class__.__name__ )

  @classmethod
  def graphLock( cls ):
    """ get graph lock """
    if not cls.__graphLock:
      cls.__graphLock = LockRing().getLock( "FTSGraphLock" )
    return cls.__graphLock

  @classmethod
  def resetGraph( cls, ftsHistoryViews ):
    """ reset graph

    :param list ftsHistoryViews: list of FTSHistoryViews
    """
    ftsGraph = None
    try:
      cls.graphLock().acquire()
      ftsGraph = FTSGraph( "FTSGraph", ftsHistoryViews, cls.acceptableFailureRate,
                           cls.acceptableFailedFiles, cls.schedulingType )
      if ftsGraph:
        cls.ftsGraph = ftsGraph
    finally:
      cls.graphLock().release()
    return S_OK()

  def updateRWAccess( self ):
    """ update RW access in FTS graph """
    updateRWAccess = S_OK()
    try:
      self.graphLock().acquire()
      updateRWAccess = self.ftsGraph.updateRWAccess()
      if not updateRWAccess["OK"]:
        self.log.error( updateRWAccess["Message"] )
    finally:
      self.graphLock().release()
    return updateRWAccess

  def addTreeToGraph( self, replicationTree = None, size = 0.0 ):
    """ update rw access for nodes (sites) and size and files for edges (channels) """
    replicationTree = replicationTree if replicationTree else {}
    size = size if size else 0.0
    if replicationTree:
      try:
        self.graphLock().acquire()
        for route in self.ftsGraph.edges():
          if route.routeName in replicationTree:
            route.WaitingSize += size
            route.WaitingFiles += 1
      finally:
        self.graphLock().release()
    return S_OK()

  def simple( self, sourceSEs, targetSEs ):
    """ simple strategy - one source, many targets

    :param list sourceSEs: list with only one sourceSE name
    :param list targetSEs: list with target SE names
    """
    # # make sure there is only one source
    if len( sourceSEs ) != 1:
      return S_ERROR( "simple: wrong argument supplied for sourceSEs, only one sourceSE allowed" )
    sourceSE = sourceSEs[0]
    tree = {}
    for targetSE in targetSEs:
      route = self.ftsGraph.findRoute( sourceSE, targetSE )
      if not route["OK"]:
        return S_ERROR( route["Message"] )
      route = route["Value"]
      if not route.fromNode.SEs[sourceSE]["read"]:
        return S_ERROR( "simple: sourceSE '%s' is banned for reading right now" % sourceSE )
      if not route.toNode.SEs[targetSE]["write"]:
        return S_ERROR( "simple: targetSE '%s' is banned for writing right now" % targetSE )
      if route.name in tree:
        return S_ERROR( "simple: unable to create replication tree, route '%s' cannot be used twice" % \
                        route.name )
      tree[route.name] = { "Ancestor" : False, "SourceSE" : sourceSE,
                           "TargetSE" : targetSE, "Strategy" : "Simple" }
    return S_OK( tree )

  def swarm( self, sourceSEs, targetSEs ):
    """ swarm strategy - one target, many sources, pick up the fastest

    :param list sourceSEs: list of source SE names
    :param list targetSEs: one element list with name of target SE
    """
    tree = {}
    routes = []
    if len( targetSEs ) > 1:
      return S_ERROR( "swarm: wrong argument supplied for targetSEs, only one targetSE allowed" )
    targetSE = targetSEs[0]
    # # find channels
    for sourceSE in sourceSEs:
      route = self.ftsGraph.findRoute( sourceSE, targetSE )
      if not route["OK"]:
        self.log.warn( "swarm: %s" % route["Message"] )
        continue
      routes.append( ( sourceSE, route["Value"] ) )
    # # exit - no channels
    if not routes:
      return S_ERROR( "swarm: unable to find FTS routes between '%s' and '%s'" % ( ",".join( sourceSEs ), targetSE ) )
    # # filter out non active channels
    routes = [ ( sourceSE, route ) for sourceSE, route in routes
               if route.fromNode.SEs[sourceSE]["read"] and route.toNode.SEs[targetSE]["write"]
               and route.timeToStart < float( "inf" ) ]
    # # exit - no active channels
    if not routes:
      return S_ERROR( "swarm: no active routes found between %s and %s" % ( sourceSEs, targetSE ) )

    # # find min timeToStart
    minTimeToStart = float( "inf" )
    selSourceSE = selRoute = None
    for sourceSE, route in routes:
      if route.timeToStart < minTimeToStart:
        minTimeToStart = route.timeToStart
        selSourceSE = sourceSE
        selRoute = route

    if not selSourceSE:
      return S_ERROR( "swarm: no active routes found between %s and %s" % ( sourceSEs, targetSE ) )

    tree[selRoute.name] = { "Ancestor" : False, "SourceSE" : selSourceSE,
                            "TargetSE" : targetSE, "Strategy" : "Swarm" }
    return S_OK( tree )

  def minimiseTotalWait( self, sourceSEs, targetSEs ):
    """ find dag minimizing start time

    :param list sourceSEs: list of available source SEs
    :param list targetSEs: list of target SEs
    """
    tree = {}
    primarySources = sourceSEs
    while targetSEs:
      minTimeToStart = float( "inf" )
      channels = []
      self.log.verbose( "minimiseTotalWait: searching routes between %s and %s" % ( ",".join( sourceSEs ),
                                                                                    ",".join( targetSEs ) ) )
      for targetSE in targetSEs:
        for sourceSE in sourceSEs:
          ftsChannel = self.ftsGraph.findRoute( sourceSE, targetSE )
          if not ftsChannel["OK"]:
            self.log.warn( "minimiseTotalWait: %s" % ftsChannel["Message"] )
            continue
          ftsChannel = ftsChannel["Value"]
          channels.append( ( ftsChannel, sourceSE, targetSE ) )
      if not channels:
        msg = "minimiseTotalWait: FTS routes between these SEs are not defined"
        self.log.error( msg )
        return S_ERROR( msg )
      self.log.verbose( "minimiseTotalWait: found %s candidate routes, checking RSS status" % len( channels ) )
      for ch, s, t in channels:
        self.log.verbose( "%s %s %s" % ( ch.routeName, ch.fromNode.SEs[s]["read"], ch.toNode.SEs[t]["write"] ) )
      channels = [ ( channel, sourceSE, targetSE ) for channel, sourceSE, targetSE in channels
                   if channel.fromNode.SEs[sourceSE]["read"] and channel.toNode.SEs[targetSE]["write"]
                   and channel.timeToStart < float( "inf" ) ]
      if not channels:
        self.log.error( "minimiseTotalWait: no active FTS routes found" )
        return S_ERROR( "minimiseTotalWait: no active FTS routes found" )

      candidates = []
      for channel, sourceSE, targetSE in channels:
        timeToStart = channel.timeToStart
        if sourceSE not in primarySources:
          timeToStart += self.sigma
        # # local found
        if channel.fromNode == channel.toNode:
          self.log.debug( "minimiseTotalWait: found local route '%s'" % channel.routeName )
          candidates = [ ( channel, sourceSE, targetSE ) ]
          break
        # # strictly better: restart candidate list; tie: accumulate for random pick
        if timeToStart < minTimeToStart:
          minTimeToStart = timeToStart
          candidates = [ ( channel, sourceSE, targetSE ) ]
        elif timeToStart == minTimeToStart:
          candidates.append( ( channel, sourceSE, targetSE ) )

      if not candidates:
        return S_ERROR( "minimiseTotalWait: unable to find candidate FTS routes minimizing total wait time" )

      random.shuffle( candidates )
      selChannel, selSourceSE, selTargetSE = candidates[0]
      ancestor = False
      for routeName, treeItem in tree.items():
        if selSourceSE in treeItem["TargetSE"]:
          ancestor = treeItem["TargetSE"]
      tree[selChannel.routeName] = { "Ancestor" : ancestor, "SourceSE" : selSourceSE,
                                     "TargetSE" : selTargetSE, "Strategy" : "MinimiseTotalWait" }
      sourceSEs.append( selTargetSE )
      targetSEs.remove( selTargetSE )
    return S_OK( tree )

  def dynamicThroughput( self, sourceSEs, targetSEs ):
    """ dynamic throughput - many sources, many targets - find dag minimizing overall throughput

    :param list sourceSEs: list of available source SE names
    :param list targetSEs: list of target SE names
    """
    tree = {}
    primarySources = sourceSEs
    timeToSite = {}
    while targetSEs:
      minTimeToStart = float( "inf" )
      channels = []
      for targetSE in targetSEs:
        for sourceSE in sourceSEs:
          ftsChannel = self.ftsGraph.findRoute( sourceSE, targetSE )
          if not ftsChannel["OK"]:
            self.log.warn( "dynamicThroughput: %s" % ftsChannel["Message"] )
            continue
          ftsChannel = ftsChannel["Value"]
          channels.append( ( ftsChannel, sourceSE, targetSE ) )
      # # no candidate channels found
      if not channels:
        msg = "dynamicThroughput: FTS routes between %s and %s are not defined" % ( ",".join( sourceSEs ),
                                                                                    ",".join( targetSEs ) )
        self.log.error( msg )
        return S_ERROR( msg )
      # # filter out already used channels
      channels = [ ( channel, sourceSE, targetSE ) for channel, sourceSE, targetSE in channels
                   if channel.routeName not in tree ]
      if not channels:
        msg = "dynamicThroughput: all FTS routes between %s and %s are already used in tree" % \
              ( ",".join( sourceSEs ), ",".join( targetSEs ) )
        self.log.error( msg )
        return S_ERROR( msg )
      # # filter out non-active channels
      self.log.debug( "dynamicThroughput: found %s candidate routes, checking activity" % len( channels ) )
      channels = [ ( channel, sourceSE, targetSE ) for channel, sourceSE, targetSE in channels
                   if channel.fromNode.SEs[sourceSE]["read"] and channel.toNode.SEs[targetSE]["write"]
                   and channel.timeToStart < float( "inf" ) ]
      if not channels:
        self.log.warn( "dynamicThroughput: active candidate routes not found" )
        return S_ERROR( "dynamicThroughput: no active candidate FTS routes found" )

      candidates = []
      selTimeToStart = None
      for channel, sourceSE, targetSE in channels:
        timeToStart = channel.timeToStart
        if sourceSE not in primarySources:
          timeToStart += self.sigma
        if sourceSE in timeToSite:
          timeToStart += timeToSite[sourceSE]
        # # local found
        if channel.fromNode == channel.toNode:
          self.log.debug( "dynamicThroughput: found local route '%s'" % channel.routeName )
          candidates = [ ( channel, sourceSE, targetSE ) ]
          selTimeToStart = timeToStart
          break
        # # strictly better: restart candidate list; tie: accumulate for random pick
        if timeToStart < minTimeToStart:
          selTimeToStart = timeToStart
          minTimeToStart = timeToStart
          candidates = [ ( channel, sourceSE, targetSE ) ]
        elif timeToStart == minTimeToStart:
          candidates.append( ( channel, sourceSE, targetSE ) )

      if not candidates:
        return S_ERROR( "dynamicThroughput: unable to find candidate FTS routes" )

      random.shuffle( candidates )
      selChannel, selSourceSE, selTargetSE = candidates[0]
      ancestor = False
      for routeName, treeItem in tree.items():
        if selSourceSE in treeItem["TargetSE"]:
          ancestor = treeItem["TargetSE"]
      tree[selChannel.routeName] = { "Ancestor": ancestor, "SourceSE": selSourceSE,
                                     "TargetSE": selTargetSE, "Strategy": "DynamicThroughput" }
      timeToSite[selTargetSE] = selTimeToStart
      sourceSEs.append( selTargetSE )
      targetSEs.remove( selTargetSE )
    return S_OK( tree )

  def reset( self ):
    """ reset :chosenStrategy:

    :param self: self reference
    """
    self.chosenStrategy = 0

  @property
  def supportedStrategies( self ):
    """ Get supported strategies.

    :param self: self reference
    """
    return self.__supportedStrategies

  def replicationTree( self, sourceSEs, targetSEs, size, strategy = None ):
    """ get replication tree

    :param list sourceSEs: list of source SE names to use
    :param list targetSEs: list of target SE names to use
    :param long size: file size
    :param str strategy: strategy name
    """
    # # get strategy
    strategy = strategy if strategy else self.__selectStrategy()
    if strategy not in self.activeStrategies:
      return S_ERROR( "replicationTree: inactive or unsupported strategy '%s'" % strategy )

    self.log.verbose( "replicationTree: strategy=%s sourceSEs=%s targetSEs=%s size=%s" % \
                      ( strategy, sourceSEs, targetSEs, size ) )
    # # fire action from dispatcher
    tree = self.strategyDispatcher[strategy]( sourceSEs, targetSEs )
    if not tree["OK"]:
      self.log.error( "replicationTree: %s" % tree["Message"] )
      return tree
    # # update graph edges
    self.log.verbose( "replicationTree: %s" % tree["Value"] )
    update = self.addTreeToGraph( replicationTree = tree["Value"], size = size )
    if not update["OK"]:
      self.log.error( "replicationTree: unable to update FTS graph: %s" % update["Message"] )
      return update
    return tree

  def __selectStrategy( self ):
    """ If more than one active strategy use one after the other.

    :param self: self reference
    """
    chosenStrategy = self.activeStrategies[self.chosenStrategy]
    self.chosenStrategy += 1
    if self.chosenStrategy == self.numberOfStrategies:
      self.chosenStrategy = 0
    return chosenStrategy

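# replicationTree returns a dict keyed by route name; each entry records its
# SourceSE/TargetSE pair and, via "Ancestor", the target SE of the hop it
# depends on (False for first hops). A sketch of consuming such a tree in
# dependency order, ancestors first; the ordering helper and all route/SE
# names below are illustrative, not part of FTSStrategy:

def orderedHops( tree ):
  """ yield tree items with ancestor-less hops first, dependent hops after """
  pending = dict( tree )
  resolved = []  # target SEs whose replica will already exist
  while pending:
    progress = False
    for routeName, item in list( pending.items() ):
      if item["Ancestor"] is False or item["Ancestor"] in resolved:
        resolved.append( item["TargetSE"] )
        del pending[routeName]
        yield routeName, item
        progress = True
    if not progress:
      # broken ancestry: bail out instead of looping forever
      raise ValueError( "cycle or missing ancestor in replication tree" )

exampleTree = {
  "CERN#RAL" : { "Ancestor" : False, "SourceSE" : "CERN-USER",
                 "TargetSE" : "RAL-USER", "Strategy" : "MinimiseTotalWait" },
  "RAL#IN2P3" : { "Ancestor" : "RAL-USER", "SourceSE" : "RAL-USER",
                  "TargetSE" : "IN2P3-USER", "Strategy" : "MinimiseTotalWait" },
}
assert [ name for name, _ in orderedHops( exampleTree ) ] == [ "CERN#RAL", "RAL#IN2P3" ]
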
class FTSAgent( AgentModule ):
  """
  .. class:: FTSAgent

  Agent propagating Scheduled request to Done or Failed state in the FTS system.

  Requests and associated FTSJobs (and so FTSFiles) are kept in cache.
  """
  # # fts graph refresh in seconds
  FTSGRAPH_REFRESH = FTSHistoryView.INTERVAL / 2
  # # SE R/W access refresh in seconds
  RW_REFRESH = 600
  # # placeholder for max job per channel
  MAX_ACTIVE_JOBS = 50
  # # min threads
  MIN_THREADS = 1
  # # max threads
  MAX_THREADS = 10
  # # files per job
  MAX_FILES_PER_JOB = 100
  # # MAX FTS transfer per FTSFile
  MAX_ATTEMPT = 256
  # # pin time (s) used when staging files
  PIN_TIME = 0
  # # FTS submission command
  SUBMIT_COMMAND = 'glite-transfer-submit'
  # # FTS monitoring command
  MONITOR_COMMAND = 'glite-transfer-status'
  # # placeholder for FTS client
  __ftsClient = None
  # # placeholder for request client
  __requestClient = None
  # # placeholder for resources helper
  __resources = None
  # # placeholder for RSS client
  __rssClient = None
  # # placeholder for FTSGraph
  __ftsGraph = None
  # # graph regeneration time delta
  __ftsGraphValidStamp = None
  # # r/w access valid stamp
  __rwAccessValidStamp = None
  # # placeholder for threadPool
  __threadPool = None
  # # update lock
  __updateLock = None
  # # se cache
  __seCache = dict()
  # # request cache
  __reqCache = dict()

  def updateLock( self ):
    """ update lock """
    if not self.__updateLock:
      self.__updateLock = LockRing().getLock( "FTSAgentLock" )
    return self.__updateLock

  @classmethod
  def requestClient( cls ):
    """ request client getter """
    if not cls.__requestClient:
      cls.__requestClient = ReqClient()
    return cls.__requestClient

  @classmethod
  def ftsClient( cls ):
    """ FTS client """
    if not cls.__ftsClient:
      cls.__ftsClient = FTSClient()
    return cls.__ftsClient

  @classmethod
  def rssClient( cls ):
    """ RSS client getter """
    if not cls.__rssClient:
      cls.__rssClient = ResourceStatus()
    return cls.__rssClient

  @classmethod
  def getSE( cls, seName ):
    """ keep SEs in cache """
    if seName not in cls.__seCache:
      cls.__seCache[seName] = StorageElement( seName )
    return cls.__seCache[seName]

  @classmethod
  def getSECache( cls ):
    return cls.__seCache

  @classmethod
  def getRequest( cls, reqName ):
    """ get Requests systematically and refresh cache """
    getRequest = cls.requestClient().getRequest( reqName )
    if not getRequest["OK"]:
      cls.__reqCache.pop( reqName, None )
      return getRequest
    getRequest = getRequest["Value"]
    if not getRequest:
      cls.__reqCache.pop( reqName, None )
      return S_ERROR( "request of name '%s' not found in ReqDB" % reqName )
    cls.__reqCache[reqName] = getRequest
    return S_OK( cls.__reqCache[reqName] )

  @classmethod
  def putRequest( cls, request, clearCache = True ):
    """ put request back to ReqDB

    :param Request request: Request instance
    :param bool clearCache: clear the cache?

    also finalize request if status == Done
    """
    # # put back request
    if request.RequestName not in cls.__reqCache:
      return S_OK()
    put = cls.requestClient().putRequest( request )
    if not put["OK"]:
      return put
    # # finalize first if possible
    if request.Status == "Done" and request.JobID:
      finalizeRequest = cls.requestClient().finalizeRequest( request.RequestName, request.JobID )
      if not finalizeRequest["OK"]:
        request.Status = "Scheduled"
    # # del request from cache if needed
    if clearCache:
      cls.__reqCache.pop( request.RequestName, None )
    return S_OK()

  @classmethod
  def putFTSJobs( cls, ftsJobsList ):
    """ put back fts jobs to the FTSDB """
    for ftsJob in ftsJobsList:
      put = cls.ftsClient().putFTSJob( ftsJob )
      if not put["OK"]:
        return put
    return S_OK()

  @staticmethod
  def updateFTSFileDict( ftsFilesDict, toUpdateDict ):
    """ update :ftsFilesDict: with FTSFiles in :toUpdateDict: """
    for category, ftsFileList in ftsFilesDict.items():
      for ftsFile in toUpdateDict.get( category, [] ):
        if ftsFile not in ftsFileList:
          ftsFileList.append( ftsFile )
    return ftsFilesDict

  # def resources( self ):
  #   """ resource helper getter """
  #   if not self.__resources:
  #     self.__resources = Resources()
  #   return self.__resources

  def threadPool( self ):
    """ thread pool getter """
    if not self.__threadPool:
      self.__threadPool = ThreadPool( self.MIN_THREADS, self.MAX_THREADS )
      self.__threadPool.daemonize()
    return self.__threadPool

  def resetFTSGraph( self ):
    """ create fts graph """
    log = gLogger.getSubLogger( "ftsGraph" )

    ftsHistory = self.ftsClient().getFTSHistory()
    if not ftsHistory["OK"]:
      log.error( "unable to get FTS history: %s" % ftsHistory["Message"] )
      return ftsHistory
    ftsHistory = ftsHistory["Value"]

    try:
      self.updateLock().acquire()
      self.__ftsGraph = FTSGraph( "FTSGraph", ftsHistory )
    finally:
      self.updateLock().release()

    log.debug( "FTSSites: %s" % len( self.__ftsGraph.nodes() ) )
    for i, site in enumerate( self.__ftsGraph.nodes() ):
      log.debug( " [%02d] FTSSite: %-25s FTSServer: %s" % ( i, site.name, site.FTSServer ) )

    log.debug( "FTSRoutes: %s" % len( self.__ftsGraph.edges() ) )
    for i, route in enumerate( self.__ftsGraph.edges() ):
      log.debug( " [%02d] FTSRoute: %-25s Active FTSJobs (Max) = %s (%s)" %
                 ( i, route.routeName, route.ActiveJobs, route.toNode.MaxActiveJobs ) )

    # # save graph stamp
    self.__ftsGraphValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH )

    # # refresh SE R/W access
    try:
      self.updateLock().acquire()
      self.__ftsGraph.updateRWAccess()
    finally:
      self.updateLock().release()
    # # save rw access stamp
    self.__rwAccessValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.RW_REFRESH )

    return S_OK()

  def initialize( self ):
    """ agent's initialization """

    # # data manager
    self.dataManager = DataManager()

    log = self.log.getSubLogger( "initialize" )

    self.FTSGRAPH_REFRESH = self.am_getOption( "FTSGraphValidityPeriod", self.FTSGRAPH_REFRESH )
    log.info( "FTSGraph validity period = %s s" % self.FTSGRAPH_REFRESH )
    self.RW_REFRESH = self.am_getOption( "RWAccessValidityPeriod", self.RW_REFRESH )
    log.info( "SEs R/W access validity period = %s s" % self.RW_REFRESH )
    self.SUBMIT_COMMAND = self.am_getOption( "SubmitCommand", self.SUBMIT_COMMAND )
    log.info( "FTS submit command = %s" % self.SUBMIT_COMMAND )
    self.MONITOR_COMMAND = self.am_getOption( "MonitorCommand", self.MONITOR_COMMAND )
    log.info( "FTS commands: submit = %s monitor = %s" % ( self.SUBMIT_COMMAND, self.MONITOR_COMMAND ) )
    self.PIN_TIME = self.am_getOption( "PinTime", self.PIN_TIME )
    log.info( "Stage files before submission = %s" % {True: "yes", False: "no"}[bool( self.PIN_TIME )] )
    self.MAX_ACTIVE_JOBS = self.am_getOption( "MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS )
    log.info( "Max active FTSJobs/route = %s" % self.MAX_ACTIVE_JOBS )
    self.MAX_FILES_PER_JOB = self.am_getOption( "MaxFilesPerJob", self.MAX_FILES_PER_JOB )
    log.info( "Max FTSFiles/FTSJob = %d" % self.MAX_FILES_PER_JOB )
    self.MAX_ATTEMPT = self.am_getOption( "MaxTransferAttempts", self.MAX_ATTEMPT )
    log.info( "Max transfer attempts = %s" % self.MAX_ATTEMPT )

    # # thread pool
    self.MIN_THREADS = self.am_getOption( "MinThreads", self.MIN_THREADS )
    self.MAX_THREADS = self.am_getOption( "MaxThreads", self.MAX_THREADS )
    minmax = ( abs( self.MIN_THREADS ), abs( self.MAX_THREADS ) )
    self.MIN_THREADS, self.MAX_THREADS = min( minmax ), max( minmax )
    log.info( "ThreadPool min threads = %s" % self.MIN_THREADS )
    log.info( "ThreadPool max threads = %s" % self.MAX_THREADS )

    log.info( "initialize: creation of FTSGraph..." )
    createGraph = self.resetFTSGraph()
    if not createGraph["OK"]:
      log.error( "initialize: %s" % createGraph["Message"] )
      return createGraph

    # This sets the Default Proxy to be used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    log.info( "will use DataManager proxy" )

    # # gMonitor stuff here
    gMonitor.registerActivity( "RequestsAtt", "Attempted requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsOK", "Successful requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsFail", "Failed requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsSubAtt", "FTSJobs creation attempts",
                               "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubOK", "FTSJobs submitted successfully",
                               "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubFail", "FTSJobs submissions failed",
                               "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSJobsMonAtt", "FTSJobs monitored",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonOK", "FTSJobs monitored successfully",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonFail", "FTSJobs attempts failed",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )

    gMonitor.registerActivity( "FTSMonitorFail", "Failed FTS monitor executions",
                               "FTSAgent", "Execution/mins", gMonitor.OP_SUM )

    pollingTime = self.am_getOption( "PollingTime", 60 )
    for status in list( FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES ):
      gMonitor.registerActivity( "FTSJobs%s" % status, "FTSJobs %s" % status,
                                 "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime )

    gMonitor.registerActivity( "FtSJobsPerRequest", "Average FTSJobs per request",
                               "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSFilesPerJob", "FTSFiles per FTSJob",
                               "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSSizePerJob", "Average FTSFiles size per FTSJob",
                               "FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN )
    return S_OK()

  def finalize( self ):
    """ finalize processing """
    # log = self.log.getSubLogger( "finalize" )
    # if self.__reqCache:
    #   log.info( 'putting back %d requests from cache' % len( self.__reqCache ) )
    # else:
# log.info( 'no requests to put back' ) # for request in self.__reqCache.values(): # put = self.requestClient().putRequest( request ) # if not put["OK"]: # log.error( "unable to put back request '%s': %s" % ( request.RequestName, put["Message"] ) ) return S_OK() def execute( self ): """ one cycle execution """ log = gLogger.getSubLogger( "execute" ) # # reset FTSGraph if expired now = datetime.datetime.now() if now > self.__ftsGraphValidStamp: log.info( "resetting expired FTS graph..." ) resetFTSGraph = self.resetFTSGraph() if not resetFTSGraph["OK"]: log.error( "FTSGraph recreation error: %s" % resetFTSGraph["Message"] ) return resetFTSGraph self.__ftsGraphValidStamp = now + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH ) # # update R/W access in FTSGraph if expired if now > self.__rwAccessValidStamp: log.info( "updating expired R/W access for SEs..." ) try: self.updateLock().acquire() self.__ftsGraph.updateRWAccess() finally: self.updateLock().release() self.__rwAccessValidStamp = now + datetime.timedelta( seconds = self.RW_REFRESH ) requestNames = self.requestClient().getRequestNamesList( [ "Scheduled" ] ) if not requestNames["OK"]: log.error( "unable to read scheduled request names: %s" % requestNames["Message"] ) return requestNames if not requestNames["Value"]: requestNames = self.__reqCache.keys() else: requestNames = [ req[0] for req in requestNames["Value"] ] requestNames = list( set ( requestNames + self.__reqCache.keys() ) ) if not requestNames: log.info( "no 'Scheduled' requests to process" ) return S_OK() log.info( "found %s requests to process:" % len( requestNames ) ) log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) ) log.info( " => new read from RMS: %s" % ( len( requestNames ) - len( self.__reqCache ) ) ) for requestName in requestNames: request = self.getRequest( requestName ) if not request["OK"]: log.error( request["Message"] ) continue request = request["Value"] sTJId = request.RequestName while True: queue = self.threadPool().generateJobAndQueueIt( self.processRequest, args = ( request, ), sTJId = sTJId ) if queue["OK"]: log.info( "request '%s' enqueued for execution" % sTJId ) gMonitor.addMark( "RequestsAtt", 1 ) break time.sleep( 1 ) # # process all results self.threadPool().processAllResults() return S_OK() def processRequest( self, request ): """ process one request :param Request request: ReqDB.Request """ log = self.log.getSubLogger( request.RequestName ) operation = request.getWaiting() if not operation["OK"]: log.error( "unable to find 'Scheduled' ReplicateAndRegister operation in request" ) return self.putRequest( request ) operation = operation["Value"] if not isinstance( operation, Operation ): log.error( "waiting returned operation is not an operation: %s" % type( operation ) ) return self.putRequest( request ) if operation.Type != "ReplicateAndRegister": log.error( "operation to be executed is not a ReplicateAndRegister but %s" % operation.Type ) return self.putRequest( request ) if operation.Status != "Scheduled": log.error( "operation in a wrong state, expecting 'Scheduled', got %s" % operation.Status ) return self.putRequest( request ) log.info( 'start processRequest' ) # # select FTSJobs, by default all in TRANS_STATES and INIT_STATES ftsJobs = self.ftsClient().getFTSJobsForRequest( request.RequestID ) if not ftsJobs["OK"]: log.error( ftsJobs["Message"] ) return ftsJobs ftsJobs = [ftsJob for ftsJob in ftsJobs.get( "Value", [] ) if ftsJob.Status not in FTSJob.FINALSTATES] # # Use a try: finally: for making sure FTS jobs are 
put back before returning try: # # dict keeping info about files to reschedule, submit, fail and register ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] ) if ftsJobs: log.info( "==> found %s FTSJobs to monitor" % len( ftsJobs ) ) # # PHASE 0 = monitor active FTSJobs for ftsJob in ftsJobs: monitor = self.__monitorJob( request, ftsJob ) if not monitor["OK"]: log.error( "unable to monitor FTSJob %s: %s" % ( ftsJob.FTSJobID, monitor["Message"] ) ) ftsJob.Status = "Submitted" else: ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, monitor["Value"] ) log.info( "monitoring of FTSJobs completed" ) for key, ftsFiles in ftsFilesDict.items(): if ftsFiles: log.debug( " => %s FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) ) # # PHASE ONE - check ready replicas missingReplicas = self.__checkReadyReplicas( request, operation ) if not missingReplicas["OK"]: log.error( missingReplicas["Message"] ) else: missingReplicas = missingReplicas["Value"] for opFile in operation: # Actually the condition below should never happen... Change printout for checking if opFile.LFN not in missingReplicas and opFile.Status not in ( 'Done', 'Failed' ): log.warn( "File should be set Done! %s is replicated at all targets" % opFile.LFN ) opFile.Status = "Done" if missingReplicas: # Check if these files are in the FTSDB ftsFiles = self.ftsClient().getAllFTSFilesForRequest( request.RequestID ) if not ftsFiles['OK']: log.error( ftsFiles['Message'] ) else: ftsFiles = ftsFiles['Value'] ftsLfns = set( [ftsFile.LFN for ftsFile in ftsFiles] ) # Recover files not in FTSDB toSchedule = set( missingReplicas ) - ftsLfns if toSchedule: log.warn( '%d files in operation are not in FTSDB, reset them Waiting' % len( toSchedule ) ) for opFile in operation: if opFile.LFN in toSchedule and opFile.Status == 'Scheduled': opFile.Status = 'Waiting' # Recover files with target not in FTSDB toSchedule = set( [missing for missing, missingSEs in missingReplicas.items() if not [ftsFile for ftsFile in ftsFiles if ftsFile.LFN == missing and ftsFile.TargetSE in missingSEs]] ) if toSchedule: log.warn( '%d targets in operation are not in FTSDB, reset files Waiting' % len( toSchedule ) ) for opFile in operation: if opFile.LFN in toSchedule and opFile.Status == 'Scheduled': opFile.Status = 'Waiting' # identify missing LFNs that are waiting for a replication which is finished for ftsFile in [f for f in ftsFiles if f.LFN in missingReplicas and f.Status.startswith( 'Waiting#' )]: targetSE = ftsFile.Status.split( '#' )[1] finishedFiles = [f for f in ftsFiles if f.LFN == ftsFile.LFN and f.Status == 'Finished' and f.TargetSE == targetSE and f not in ftsFilesDict['toUpdate']] if finishedFiles: log.warn( "%s is %s while replication was Finished to %s, update" % ( ftsFile.LFN, ftsFile.Status, targetSE ) ) ftsFilesDict['toUpdate'] += finishedFiles # identify Finished transfers for which the replica is still missing for ftsFile in [f for f in ftsFiles if f.Status == 'Finished' and f.TargetSE in missingReplicas.get( f.LFN, [] ) and f not in ftsFilesDict['toRegister'] ]: # Check if there is a registration operation for that file and that target regOp = [op for op in request if op.Type == 'RegisterReplica' and op.TargetSE == ftsFile.TargetSE and [f for f in op if f.LFN == ftsFile.LFN]] if not regOp: ftsFilesDict['toReschedule'].append( ftsFile ) toFail = ftsFilesDict.get( "toFail", [] ) toReschedule = ftsFilesDict.get( "toReschedule", [] ) toSubmit = ftsFilesDict.get( "toSubmit", [] ) toRegister = 
ftsFilesDict.get( "toRegister", [] ) toUpdate = ftsFilesDict.get( "toUpdate", [] ) # # PHASE TWO = Failed files? -> make request Failed and return if toFail: log.error( "==> found %s 'Failed' FTSFiles, but maybe other files can be processed..." % len( toFail ) ) for opFile in operation: for ftsFile in toFail: if opFile.FileID == ftsFile.FileID: opFile.Error = ftsFile.Error opFile.Status = "Failed" operation.Error = "%s files are missing any replicas" % len( toFail ) # # request.Status should be Failed if all files in the operation are "Failed" if request.Status == "Failed": request.Error = "ReplicateAndRegister %s failed" % operation.Order log.error( "request is set to 'Failed'" ) # # putRequest is done by the finally: clause... Not good to do it twice raise escapeTry # # PHASE THREE - update Waiting#TargetSE FTSFiles if toUpdate: log.info( "==> found %s possible FTSFiles to update..." % ( len( toUpdate ) ) ) byTarget = {} for ftsFile in toUpdate: byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile.FileID ) for targetSE, fileIDList in byTarget.items(): update = self.ftsClient().setFTSFilesWaiting( operation.OperationID, targetSE, fileIDList ) if not update["OK"]: log.error( "update FTSFiles failed: %s" % update["Message"] ) # # PHASE FOUR - add 'RegisterReplica' Operations if toRegister: log.info( "==> found %d Files waiting for registration, adding 'RegisterReplica' operations" % len( toRegister ) ) registerFiles = self.__insertRegisterOperation( request, operation, toRegister ) if not registerFiles["OK"]: log.error( "unable to create 'RegisterReplica' operations: %s" % registerFiles["Message"] ) # if request.Status == "Waiting": # log.info( "request is in 'Waiting' state, will put it back to RMS" ) # return self.putRequest( request ) # # PHASE FIVE - reschedule operation files if toReschedule: log.info( "==> found %s Files to reschedule" % len( toReschedule ) ) rescheduleFiles = self.__reschedule( request, operation, toReschedule ) if not rescheduleFiles["OK"]: log.error( rescheduleFiles["Message"] ) # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs. We also get Failed files to recover them if needed ftsFiles = self.ftsClient().getFTSFilesForRequest( request.RequestID, [ "Waiting", "Failed", 'Submitted', 'Canceled' ] ) if not ftsFiles["OK"]: log.error( ftsFiles["Message"] ) else: retryIds = set ( [ ftsFile.FTSFileID for ftsFile in toSubmit ] ) for ftsFile in ftsFiles["Value"]: if ftsFile.FTSFileID not in retryIds: if ftsFile.Status in ( 'Failed', 'Canceled' ): # If the file did not fail unrecoverably and is not yet in toSubmit _reschedule, submit, _fail = self.__checkFailed( ftsFile ) elif ftsFile.Status == 'Submitted': if ftsFile.FTSGUID not in [job.FTSGUID for job in ftsJobs]: log.warn( 'FTS GUID %s not found in FTS jobs, resubmit file transfer' % ftsFile.FTSGUID ) ftsFile.Status = 'Waiting' submit = True else: submit = False else: submit = True if submit: toSubmit.append( ftsFile ) retryIds.add( ftsFile.FTSFileID ) # # submit new ftsJobs if toSubmit: if request.Status != 'Scheduled': log.info( "Found %d FTSFiles to submit while request is no longer in Scheduled status (%s)" \ % ( len( toSubmit ), request.Status ) ) else: self.__checkDuplicates( request.RequestName, toSubmit ) log.info( "==> found %s FTSFiles to submit" % len( toSubmit ) ) submit = self.__submit( request, operation, toSubmit ) if not submit["OK"]: log.error( submit["Message"] ) else: ftsJobs += submit["Value"] # # status change? 
- put back request if request.Status != "Scheduled": log.info( "request no longer in 'Scheduled' state (%s), will put it back to RMS" % request.Status ) except escapeTry: # This clause is raised when one wants to return from within the try: clause pass except Exception, exceptMessage: log.exception( "Exception in processRequest", lException = exceptMessage ) finally:
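# NOTE: the excerpt cuts off inside the finally: clause above. Judging from the older
# revision of processRequest further below, which ends by putting the request and its
# FTSJobs back, the missing finally: body presumably does the same. A minimal sketch,
# assuming only the putRequest/putFTSJobs helpers defined above (the actual body is
# not part of this excerpt):
#
#   put = self.putRequest( request )
#   if not put["OK"]:
#     log.error( "unable to put back request: %s" % put["Message"] )
#   if ftsJobs:
#     putJobs = self.putFTSJobs( ftsJobs )
#     if not putJobs["OK"]:
#       log.error( "unable to put back FTSJobs: %s" % putJobs["Message"] )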
class FTSAgent( AgentModule ): """ .. class:: FTSAgent Agent propagating Scheduled request to Done or Failed state in the FTS system. Requests and associated FTSJobs (and so FTSFiles) are kept in cache. """ # # fts graph refresh in seconds FTSGRAPH_REFRESH = FTSHistoryView.INTERVAL / 2 # # SE R/W access refresh in seconds RW_REFRESH = 600 # # placeholder for max job per channel MAX_ACTIVE_JOBS = 50 # # min threads MIN_THREADS = 1 # # max threads MAX_THREADS = 10 # # files per job MAX_FILES_PER_JOB = 100 # # MAX FTS transfer per FTSFile MAX_ATTEMPT = 256 # # stage flag PIN_TIME = 0 # # FTS submission command SUBMIT_COMMAND = 'glite-transfer-submit' # # FTS monitoring command MONITOR_COMMAND = 'glite-transfer-status' # # placeholder for FTS client __ftsClient = None # # placeholder for request client __requestClient = None # # placeholder for resources helper __resources = None # # placeholder for RSS client __rssClient = None # # placeholder for FTSGraph __ftsGraph = None # # graph regeneration time delta __ftsGraphValidStamp = None # # r/w access valid stamp __rwAccessValidStamp = None # # placeholder for threadPool __threadPool = None # # update lock __updateLock = None # # se cache __seCache = dict() # # request cache __reqCache = dict() def updateLock( self ): """ update lock """ if not self.__updateLock: self.__updateLock = LockRing().getLock( "FTSAgentLock" ) return self.__updateLock @classmethod def requestClient( cls ): """ request client getter """ if not cls.__requestClient: cls.__requestClient = ReqClient() return cls.__requestClient @classmethod def ftsClient( cls ): """ FTS client """ if not cls.__ftsClient: cls.__ftsClient = FTSClient() return cls.__ftsClient @classmethod def rssClient( cls ): """ RSS client getter """ if not cls.__rssClient: cls.__rssClient = ResourceStatus() return cls.__rssClient @classmethod def getSE( cls, seName ): """ keep SEs in cache """ if seName not in cls.__seCache: cls.__seCache[seName] = StorageElement( seName ) return cls.__seCache[seName] @classmethod def getSECache( cls ): return cls.__seCache @classmethod def getRequest( cls, reqName ): """ get Requests systematically and refresh cache """ getRequest = cls.requestClient().getRequest( reqName ) if not getRequest["OK"]: cls.__reqCache.pop( reqName, None ) return getRequest getRequest = getRequest["Value"] if not getRequest: cls.__reqCache.pop( reqName, None ) return S_ERROR( "request of name '%s' not found in ReqDB" % reqName ) cls.__reqCache[reqName] = getRequest return S_OK( cls.__reqCache[reqName] ) @classmethod def putRequest( cls, request, clearCache = True ): """ put request back to ReqDB :param Request request: Request instance :param bool clearCache: clear the cache? 
also finalize request if status == Done """ # # put back request if request.RequestName not in cls.__reqCache: return S_OK() put = cls.requestClient().putRequest( request ) if not put["OK"]: return put # # finalize first if possible if request.Status == "Done" and request.JobID: finalizeRequest = cls.requestClient().finalizeRequest( request.RequestName, request.JobID ) if not finalizeRequest["OK"]: request.Status = "Scheduled" # # del request from cache if needed if clearCache: cls.__reqCache.pop( request.RequestName, None ) return S_OK() @classmethod def putFTSJobs( cls, ftsJobsList ): """ put back fts jobs to the FTSDB """ for ftsJob in ftsJobsList: put = cls.ftsClient().putFTSJob( ftsJob ) if not put["OK"]: return put return S_OK() @staticmethod def updateFTSFileDict( ftsFilesDict, toUpdateDict ): """ update :ftsFilesDict: with FTSFiles in :toUpdateDict: """ for category, ftsFileList in ftsFilesDict.items(): for ftsFile in toUpdateDict.get( category, [] ): if ftsFile not in ftsFileList: ftsFileList.append( ftsFile ) return ftsFilesDict # def resources( self ): # """ resource helper getter """ # if not self.__resources: # self.__resources = Resources() # return self.__resources def threadPool( self ): """ thread pool getter """ if not self.__threadPool: self.__threadPool = ThreadPool( self.MIN_THREADS, self.MAX_THREADS ) self.__threadPool.daemonize() return self.__threadPool def resetFTSGraph( self ): """ create fts graph """ log = gLogger.getSubLogger( "ftsGraph" ) ftsHistory = self.ftsClient().getFTSHistory() if not ftsHistory["OK"]: log.error( "unable to get FTS history:", ftsHistory["Message"] ) return ftsHistory ftsHistory = ftsHistory["Value"] try: self.updateLock().acquire() self.__ftsGraph = FTSGraph( "FTSGraph", ftsHistory, maxActiveJobs = self.MAX_ACTIVE_JOBS ) finally: self.updateLock().release() log.debug( "FTSSites:", len( self.__ftsGraph.nodes() ) ) for i, site in enumerate( self.__ftsGraph.nodes() ): log.debug( " [%02d] FTSSite: %-25s FTSServer: %s" % ( i, site.name, site.FTSServer ) ) log.debug( "FTSRoutes: %s" % len( self.__ftsGraph.edges() ) ) for i, route in enumerate( self.__ftsGraph.edges() ): log.debug( " [%02d] FTSRoute: %-25s Active FTSJobs (Max) = %s (%s)" % ( i, route.routeName, route.ActiveJobs, route.toNode.MaxActiveJobs ) ) # # save graph stamp self.__ftsGraphValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH ) # # refresh SE R/W access try: self.updateLock().acquire() self.__ftsGraph.updateRWAccess() finally: self.updateLock().release() # # save rw access stamp self.__rwAccessValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.RW_REFRESH ) return S_OK() def initialize( self ): """ agent's initialization """ # # data manager self.dataManager = DataManager() log = self.log.getSubLogger( "initialize" ) self.FTSGRAPH_REFRESH = self.am_getOption( "FTSGraphValidityPeriod", self.FTSGRAPH_REFRESH ) log.info( "FTSGraph validity period = %s s" % self.FTSGRAPH_REFRESH ) self.RW_REFRESH = self.am_getOption( "RWAccessValidityPeriod", self.RW_REFRESH ) log.info( "SEs R/W access validity period = %s s" % self.RW_REFRESH ) self.SUBMIT_COMMAND = self.am_getOption( "SubmitCommand", self.SUBMIT_COMMAND ) log.info( "FTS submit command = %s" % self.SUBMIT_COMMAND ) self.MONITOR_COMMAND = self.am_getOption( "MonitorCommand", self.MONITOR_COMMAND ) log.info( "FTS commands: submit = %s monitor %s" % ( self.SUBMIT_COMMAND, self.MONITOR_COMMAND ) ) self.PIN_TIME = self.am_getOption( "PinTime", self.PIN_TIME ) log.info( 
"Stage files before submission = ", {True: "yes", False: "no"}[bool( self.PIN_TIME )] ) self.MAX_ACTIVE_JOBS = self.am_getOption( "MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS ) log.info( "Max active FTSJobs/route = ", str( self.MAX_ACTIVE_JOBS ) ) self.MAX_FILES_PER_JOB = self.am_getOption( "MaxFilesPerJob", self.MAX_FILES_PER_JOB ) log.info( "Max FTSFiles/FTSJob = ", str( self.MAX_FILES_PER_JOB ) ) self.MAX_ATTEMPT = self.am_getOption( "MaxTransferAttempts", self.MAX_ATTEMPT ) log.info( "Max transfer attempts = ", str( self.MAX_ATTEMPT ) ) # # thread pool self.MIN_THREADS = self.am_getOption( "MinThreads", self.MIN_THREADS ) self.MAX_THREADS = self.am_getOption( "MaxThreads", self.MAX_THREADS ) minmax = ( abs( self.MIN_THREADS ), abs( self.MAX_THREADS ) ) self.MIN_THREADS, self.MAX_THREADS = min( minmax ), max( minmax ) log.info( "ThreadPool min threads = ", str( self.MIN_THREADS ) ) log.info( "ThreadPool max threads = ", str( self.MAX_THREADS ) ) log.info( "initialize: creation of FTSGraph..." ) createGraph = self.resetFTSGraph() if not createGraph["OK"]: log.error( "initialize: ", createGraph["Message"] ) return createGraph # This sets the Default Proxy to used as that defined under # /Operations/Shifter/DataManager # the shifterProxy option in the Configuration can be used to change this default. self.am_setOption( 'shifterProxy', 'DataManager' ) log.info( "will use DataManager proxy" ) # # gMonitor stuff here gMonitor.registerActivity( "RequestsAtt", "Attempted requests executions", "FTSAgent", "Requests/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "RequestsOK", "Successful requests executions", "FTSAgent", "Requests/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "RequestsFail", "Failed requests executions", "FTSAgent", "Requests/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsSubAtt", "FTSJobs creation attempts", "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsSubOK", "FTSJobs submitted successfully", "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsSubFail", "FTSJobs submissions failed", "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsMonAtt", "FTSJobs monitored", "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsMonOK", "FTSJobs monitored successfully", "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsMonFail", "FTSJobs attempts failed", "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSMonitorFail", "Failed FTS monitor executions", "FTSAgent", "Execution/mins", gMonitor.OP_SUM ) pollingTime = self.am_getOption( "PollingTime", 60 ) for status in list( FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES ): gMonitor.registerActivity( "FTSJobs%s" % status, "FTSJobs %s" % status , "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime ) gMonitor.registerActivity( "FtSJobsPerRequest", "Average FTSJobs per request", "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN ) gMonitor.registerActivity( "FTSFilesPerJob", "FTSFiles per FTSJob", "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN ) gMonitor.registerActivity( "FTSSizePerJob", "Average FTSFiles size per FTSJob", "FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN ) return S_OK() def finalize( self ): """ finalize processing """ # log = self.log.getSubLogger( "finalize" ) # if self.__reqCache: # log.info( 'putting back %d requests 
from cache' % len( self.__reqCache ) ) # else: # log.info( 'no requests to put back' ) # for request in self.__reqCache.values(): # put = self.requestClient().putRequest( request ) # if not put["OK"]: # log.error( "unable to put back request '%s': %s" % ( request.RequestName, put["Message"] ) ) return S_OK() def execute( self ): """ one cycle execution """ log = gLogger.getSubLogger( "execute" ) # # reset FTSGraph if expired now = datetime.datetime.now() if now > self.__ftsGraphValidStamp: log.info( "resetting expired FTS graph..." ) resetFTSGraph = self.resetFTSGraph() if not resetFTSGraph["OK"]: log.error( "FTSGraph recreation error: ", resetFTSGraph["Message"] ) return resetFTSGraph self.__ftsGraphValidStamp = now + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH ) # # update R/W access in FTSGraph if expired if now > self.__rwAccessValidStamp: log.info( "updating expired R/W access for SEs..." ) try: self.updateLock().acquire() self.__ftsGraph.updateRWAccess() finally: self.updateLock().release() self.__rwAccessValidStamp = now + datetime.timedelta( seconds = self.RW_REFRESH ) requestNames = self.requestClient().getRequestNamesList( [ "Scheduled" ] ) if not requestNames["OK"]: log.error( "Unable to read scheduled request names: ", requestNames["Message"] ) return requestNames if not requestNames["Value"]: requestNames = self.__reqCache.keys() else: requestNames = [ req[0] for req in requestNames["Value"] ] requestNames = list( set ( requestNames + self.__reqCache.keys() ) ) if not requestNames: log.info( "No 'Scheduled' requests to process" ) return S_OK() log.info( "Found requests to process:", str( len( requestNames ) ) ) log.info( " => from internal cache:", str( ( len( self.__reqCache ) ) ) ) log.info( " => new read from RMS:", str( ( len( requestNames ) - len( self.__reqCache ) ) ) ) for requestName in requestNames: request = self.getRequest( requestName ) if not request["OK"]: log.error( "Error getting request", "%s: %s" % ( requestName, request["Message"] ) ) continue request = request["Value"] sTJId = request.RequestName while True: queue = self.threadPool().generateJobAndQueueIt( self.processRequest, args = ( request, ), sTJId = sTJId ) if queue["OK"]: log.info( "Request enqueued for execution", sTJId ) gMonitor.addMark( "RequestsAtt", 1 ) break time.sleep( 1 ) # # process all results self.threadPool().processAllResults() return S_OK() def processRequest( self, request ): """ process one request :param Request request: ReqDB.Request """ log = self.log.getSubLogger( request.RequestName ) operation = request.getWaiting() if not operation["OK"]: log.error( "Unable to find 'Scheduled' ReplicateAndRegister operation in request" ) return self.putRequest( request ) operation = operation["Value"] if not isinstance( operation, Operation ): log.error( "Waiting returned operation is not an operation:", type( operation ) ) return self.putRequest( request ) if operation.Type != "ReplicateAndRegister": log.error( "operation to be executed is not a ReplicateAndRegister but", operation.Type ) return self.putRequest( request ) if operation.Status != "Scheduled": log.error( "operation in a wrong state, expecting 'Scheduled', got", operation.Status ) return self.putRequest( request ) log.info( 'start processRequest' ) # # select FTSJobs, by default all in TRANS_STATES and INIT_STATES ftsJobs = self.ftsClient().getFTSJobsForRequest( request.RequestID ) if not ftsJobs["OK"]: log.error( ftsJobs["Message"] ) return ftsJobs ftsJobs = [ftsJob for ftsJob in ftsJobs.get( "Value", [] ) if 
ftsJob.Status not in FTSJob.FINALSTATES] # # Use a try: finally: for making sure FTS jobs are put back before returning try: # # dict keeping info about files to reschedule, submit, fail and register ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] ) if ftsJobs: log.info( "==> found %s FTSJobs to monitor" % len( ftsJobs ) ) # # PHASE 0 = monitor active FTSJobs for ftsJob in ftsJobs: monitor = self.__monitorJob( request, ftsJob ) if not monitor["OK"]: log.error( "unable to monitor FTSJob %s: %s" % ( ftsJob.FTSJobID, monitor["Message"] ) ) ftsJob.Status = "Submitted" else: ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, monitor["Value"] ) log.info( "monitoring of FTSJobs completed" ) for key, ftsFiles in ftsFilesDict.items(): if ftsFiles: log.debug( " => %s FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) ) # # PHASE ONE - check ready replicas missingReplicas = self.__checkReadyReplicas( request, operation ) if not missingReplicas["OK"]: log.error( missingReplicas["Message"] ) else: missingReplicas = missingReplicas["Value"] for opFile in operation: # Actually the condition below should never happen... Change printout for checking if opFile.LFN not in missingReplicas and opFile.Status not in ( 'Done', 'Failed' ): log.warn( "File should be set Done! %s is replicated at all targets" % opFile.LFN ) opFile.Status = "Done" if missingReplicas: # Check if these files are in the FTSDB ftsFiles = self.ftsClient().getAllFTSFilesForRequest( request.RequestID ) if not ftsFiles['OK']: log.error( ftsFiles['Message'] ) else: ftsFiles = ftsFiles['Value'] ftsLfns = set( [ftsFile.LFN for ftsFile in ftsFiles] ) # Recover files not in FTSDB toSchedule = set( missingReplicas ) - ftsLfns if toSchedule: log.warn( '%d files in operation are not in FTSDB, reset them Waiting' % len( toSchedule ) ) for opFile in operation: if opFile.LFN in toSchedule and opFile.Status == 'Scheduled': opFile.Status = 'Waiting' # Recover files with target not in FTSDB toSchedule = set( [missing for missing, missingSEs in missingReplicas.items() if not [ftsFile for ftsFile in ftsFiles if ftsFile.LFN == missing and ftsFile.TargetSE in missingSEs]] ) if toSchedule: log.warn( '%d targets in operation are not in FTSDB, reset files Waiting' % len( toSchedule ) ) for opFile in operation: if opFile.LFN in toSchedule and opFile.Status == 'Scheduled': opFile.Status = 'Waiting' # identify missing LFNs that are waiting for a replication which is finished for ftsFile in [f for f in ftsFiles if f.LFN in missingReplicas and f.Status.startswith( 'Waiting#' )]: targetSE = ftsFile.Status.split( '#' )[1] finishedFiles = [f for f in ftsFiles if f.LFN == ftsFile.LFN and f.Status == 'Finished' and f.TargetSE == targetSE and f not in ftsFilesDict['toUpdate']] if finishedFiles: log.warn( "%s is %s while replication was Finished to %s, update" % ( ftsFile.LFN, ftsFile.Status, targetSE ) ) ftsFilesDict['toUpdate'] += finishedFiles # identify Finished transfers for which the replica is still missing for ftsFile in [f for f in ftsFiles if f.Status == 'Finished' and f.TargetSE in missingReplicas.get( f.LFN, [] ) and f not in ftsFilesDict['toRegister'] ]: # Check if there is a registration operation for that file and that target regOp = [op for op in request if op.Type == 'RegisterReplica' and op.TargetSE == ftsFile.TargetSE and [f for f in op if f.LFN == ftsFile.LFN]] if not regOp: ftsFilesDict['toReschedule'].append( ftsFile ) toFail = ftsFilesDict.get( "toFail", [] ) toReschedule = 
ftsFilesDict.get( "toReschedule", [] ) toSubmit = ftsFilesDict.get( "toSubmit", [] ) toRegister = ftsFilesDict.get( "toRegister", [] ) toUpdate = ftsFilesDict.get( "toUpdate", [] ) # # PHASE TWO = Failed files? -> make request Failed and return if toFail: log.error( "==> found %s 'Failed' FTSFiles, but maybe other files can be processed..." % len( toFail ) ) for opFile in operation: for ftsFile in toFail: if opFile.FileID == ftsFile.FileID: opFile.Error = ftsFile.Error opFile.Status = "Failed" operation.Error = "%s files are missing any replicas" % len( toFail ) # # request.Status should be Failed if all files in the operation are "Failed" if request.Status == "Failed": request.Error = "ReplicateAndRegister %s failed" % operation.Order log.error( "request is set to 'Failed'" ) # # putRequest is done by the finally: clause... Not good to do it twice raise escapeTry # # PHASE THREE - update Waiting#TargetSE FTSFiles if toUpdate: log.info( "==> found %s possible FTSFiles to update..." % ( len( toUpdate ) ) ) byTarget = {} for ftsFile in toUpdate: byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile.FileID ) for targetSE, fileIDList in byTarget.items(): update = self.ftsClient().setFTSFilesWaiting( operation.OperationID, targetSE, fileIDList ) if not update["OK"]: log.error( "update FTSFiles failed:", update["Message"] ) # # PHASE FOUR - add 'RegisterReplica' Operations if toRegister: log.info( "==> found %d Files waiting for registration, adding 'RegisterReplica' operations" % len( toRegister ) ) registerFiles = self.__insertRegisterOperation( request, operation, toRegister ) if not registerFiles["OK"]: log.error( "unable to create 'RegisterReplica' operations:", registerFiles["Message"] ) # if request.Status == "Waiting": # log.info( "request is in 'Waiting' state, will put it back to RMS" ) # return self.putRequest( request ) # # PHASE FIVE - reschedule operation files if toReschedule: log.info( "==> found %s Files to reschedule" % len( toReschedule ) ) rescheduleFiles = self.__reschedule( request, operation, toReschedule ) if not rescheduleFiles["OK"]: log.error( rescheduleFiles["Message"] ) # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs. 
We get also Failed files to recover them if needed ftsFiles = self.ftsClient().getFTSFilesForRequest( request.RequestID, [ "Waiting", "Failed", 'Submitted', 'Canceled' ] ) if not ftsFiles["OK"]: log.error( ftsFiles["Message"] ) else: retryIds = set ( [ ftsFile.FTSFileID for ftsFile in toSubmit ] ) for ftsFile in ftsFiles["Value"]: if ftsFile.FTSFileID not in retryIds: if ftsFile.Status in ( 'Failed', 'Canceled' ): # If the file was not unrecoverable failed and is not yet set toSubmit _reschedule, submit, _fail = self.__checkFailed( ftsFile ) elif ftsFile.Status == 'Submitted': if ftsFile.FTSGUID not in [job.FTSGUID for job in ftsJobs]: log.warn( 'FTS GUID %s not found in FTS jobs, resubmit file transfer' % ftsFile.FTSGUID ) ftsFile.Status = 'Waiting' submit = True else: submit = False else: submit = True if submit: toSubmit.append( ftsFile ) retryIds.add( ftsFile.FTSFileID ) # # submit new ftsJobs if toSubmit: if request.Status != 'Scheduled': log.info( "Found %d FTSFiles to submit while request is no longer in Scheduled status (%s)" \ % ( len( toSubmit ), request.Status ) ) else: self.__checkDuplicates( request.RequestName, toSubmit ) log.info( "==> found %s FTSFiles to submit" % len( toSubmit ) ) submit = self.__submit( request, operation, toSubmit ) if not submit["OK"]: log.error( submit["Message"] ) else: ftsJobs += submit["Value"] # # status change? - put back request if request.Status != "Scheduled": log.info( "request no longer in 'Scheduled' state (%s), will put it back to RMS" % request.Status ) except escapeTry: # This clause is raised when one wants to return from within the try: clause pass except Exception, exceptMessage: log.exception( "Exception in processRequest", lException = exceptMessage ) finally:
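# NOTE: the finally: body is truncated here as well. Also worth spelling out: 'escapeTry',
# raised twice in processRequest above to jump straight to the finally: clause, is never
# defined in this excerpt. A minimal sketch of its presumable module-level declaration
# (the name comes from the code above; the body is an assumption):
#
#   class escapeTry( Exception ):
#     """ dummy exception raised only to escape the try: clause """
#     pass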
class FTSAgent( AgentModule ): """ .. class:: FTSAgent Agent propagating Scheduled request to Done or Failed state in the FTS system. Requests and associated FTSJobs (and so FTSFiles) are kept in cache. """ # # fts graph refresh in seconds FTSGRAPH_REFRESH = FTSHistoryView.INTERVAL / 2 # # SE R/W access refresh in seconds RW_REFRESH = 600 # # placeholder for max job per channel MAX_ACTIVE_JOBS = 50 # # min threads MIN_THREADS = 1 # # max threads MAX_THREADS = 10 # # files per job MAX_FILES_PER_JOB = 100 # # MAX FTS transfer per FTSFile MAX_ATTEMPT = 256 # # stage flag STAGE_FILES = False # # replica manager __replicaManager = None # # placeholder for FTS client __ftsClient = None # # placeholder for request client __requestClient = None # # placeholder for resources helper __resources = None # # placeholder for RSS client __rssClient = None # # placeholder for FTSGraph __ftsGraph = None # # graph regeneration time delta __ftsGraphValidStamp = None # # r/w access valid stamp __rwAccessValidStamp = None # # placeholder for threadPool __threadPool = None # # update lock __updateLock = None # # se cache __seCache = dict() # # request cache __reqCache = dict() def updateLock( self ): """ update lock """ if not self.__updateLock: self.__updateLock = LockRing().getLock( "FTSAgentLock" ) return self.__updateLock @classmethod def requestClient( cls ): """ request client getter """ if not cls.__requestClient: cls.__requestClient = ReqClient() return cls.__requestClient @classmethod def ftsClient( cls ): """ FTS client """ if not cls.__ftsClient: cls.__ftsClient = FTSClient() return cls.__ftsClient @classmethod def replicaManager( cls ): """ replica manager getter """ if not cls.__replicaManager: cls.__replicaManager = ReplicaManager() return cls.__replicaManager @classmethod def rssClient( cls ): """ RSS client getter """ if not cls.__rssClient: cls.__rssClient = ResourceStatus() return cls.__rssClient @classmethod def getSE( cls, seName ): """ keep SEs in cache """ if seName not in cls.__seCache: cls.__seCache[seName] = StorageElement( seName ) return cls.__seCache[seName] @classmethod def getRequest( cls, reqName ): """ keep Requests in cache """ if reqName not in cls.__reqCache: getRequest = cls.requestClient().getRequest( reqName ) if not getRequest["OK"]: return getRequest getRequest = getRequest["Value"] if not getRequest: return S_ERROR( "request of name '%s' not found in ReqDB" % reqName ) cls.__reqCache[reqName] = getRequest return S_OK( cls.__reqCache[reqName] ) @classmethod def putRequest( cls, request ): """ put request back to ReqDB :param Request request: Request instance also finalize request if status == Done """ # # put back request put = cls.requestClient().putRequest( request ) if not put["OK"]: return put # # finalize first if possible if request.Status == "Done" and request.JobID: finalizeRequest = cls.requestClient().finalizeRequest( request.RequestName, request.JobID ) if not finalizeRequest["OK"]: request.Status = "Scheduled" # # del request from cache if request.RequestName in cls.__reqCache: del cls.__reqCache[ request.RequestName ] return S_OK() @classmethod def putFTSJobs( cls, ftsJobsList ): """ put back fts jobs to the FTSDB """ for ftsJob in ftsJobsList: put = cls.ftsClient().putFTSJob( ftsJob ) if not put["OK"]: return put return S_OK() @staticmethod def updateFTSFileDict( ftsFilesDict, toUpdateDict ): """ update :ftsFilesDict: with FTSFiles in :toUpdateDict: """ for category, ftsFileList in ftsFilesDict.items(): for ftsFile in toUpdateDict.get( category, [] ): if 
ftsFile not in ftsFileList: ftsFileList.append( ftsFile ) return ftsFilesDict # def resources( self ): # """ resource helper getter """ # if not self.__resources: # self.__resources = Resources() # return self.__resources def threadPool( self ): """ thread pool getter """ if not self.__threadPool: self.__threadPool = ThreadPool( self.MIN_THREADS, self.MAX_THREADS ) self.__threadPool.daemonize() return self.__threadPool def resetFTSGraph( self ): """ create fts graph """ log = gLogger.getSubLogger( "ftsGraph" ) ftsHistory = self.ftsClient().getFTSHistory() if not ftsHistory["OK"]: log.error( "unable to get FTS history: %s" % ftsHistory["Message"] ) return ftsHistory ftsHistory = ftsHistory["Value"] try: self.updateLock().acquire() self.__ftsGraph = FTSGraph( "FTSGraph", ftsHistory ) finally: self.updateLock().release() log.debug( "FTSSites: %s" % len( self.__ftsGraph.nodes() ) ) for i, site in enumerate( self.__ftsGraph.nodes() ): log.debug( " [%02d] FTSSite: %-25s FTSServer: %s" % ( i, site.name, site.FTSServer ) ) log.debug( "FTSRoutes: %s" % len( self.__ftsGraph.edges() ) ) for i, route in enumerate( self.__ftsGraph.edges() ): log.debug( " [%02d] FTSRoute: %-25s Active FTSJobs (Max) = %s (%s)" % ( i, route.routeName, route.ActiveJobs, route.toNode.MaxActiveJobs ) ) # # save graph stamp self.__ftsGraphValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH ) # # refresh SE R/W access try: self.updateLock().acquire() self.__ftsGraph.updateRWAccess() finally: self.updateLock().release() # # save rw access stamp self.__rwAccessValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.RW_REFRESH ) return S_OK() def initialize( self ): """ agent's initialization """ log = self.log.getSubLogger( "initialize" ) self.FTSGRAPH_REFRESH = self.am_getOption( "FTSGraphValidityPeriod", self.FTSGRAPH_REFRESH ) log.info( "FTSGraph validity period = %s s" % self.FTSGRAPH_REFRESH ) self.RW_REFRESH = self.am_getOption( "RWAccessValidityPeriod", self.RW_REFRESH ) log.info( "SEs R/W access validity period = %s s" % self.RW_REFRESH ) self.STAGE_FILES = self.am_getOption( "StageFiles", self.STAGE_FILES ) log.info( "Stage files before submission = %s" % {True: "yes", False: "no"}[bool( self.STAGE_FILES )] ) self.MAX_ACTIVE_JOBS = self.am_getOption( "MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS ) log.info( "Max active FTSJobs/route = %s" % self.MAX_ACTIVE_JOBS ) self.MAX_FILES_PER_JOB = self.am_getOption( "MaxFilesPerJob", self.MAX_FILES_PER_JOB ) log.info( "Max FTSFiles/FTSJob = %d" % self.MAX_FILES_PER_JOB ) self.MAX_ATTEMPT = self.am_getOption( "MaxTransferAttempts", self.MAX_ATTEMPT ) log.info( "Max transfer attempts = %s" % self.MAX_ATTEMPT ) # # thread pool self.MIN_THREADS = self.am_getOption( "MinThreads", self.MIN_THREADS ) self.MAX_THREADS = self.am_getOption( "MaxThreads", self.MAX_THREADS ) minmax = ( abs( self.MIN_THREADS ), abs( self.MAX_THREADS ) ) self.MIN_THREADS, self.MAX_THREADS = min( minmax ), max( minmax ) log.info( "ThreadPool min threads = %s" % self.MIN_THREADS ) log.info( "ThreadPool max threads = %s" % self.MAX_THREADS ) log.info( "initialize: creation of FTSGraph..." ) createGraph = self.resetFTSGraph() if not createGraph["OK"]: log.error( "initialize: %s" % createGraph["Message"] ) return createGraph # This sets the Default Proxy to used as that defined under # /Operations/Shifter/DataManager # the shifterProxy option in the Configuration can be used to change this default. 
self.am_setOption( 'shifterProxy', 'DataManager' ) log.info( "will use DataManager proxy" ) # # gMonitor stuff here gMonitor.registerActivity( "RequestsAtt", "Attempted requests executions", "FTSAgent", "Requests/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "RequestsOK", "Successful requests executions", "FTSAgent", "Requests/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "RequestsFail", "Failed requests executions", "FTSAgent", "Requests/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsSubAtt", "FTSJobs creation attempts", "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsSubOK", "FTSJobs submitted successfully", "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsSubFail", "FTSJobs submissions failed", "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsMonAtt", "FTSJobs monitored", "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsMonOK", "FTSJobs monitored successfully", "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsMonFail", "FTSJobs attempts failed", "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSMonitorFail", "Failed FTS monitor executions", "FTSAgent", "Execution/mins", gMonitor.OP_SUM ) pollingTime = self.am_getOption( "PollingTime", 60 ) for status in list( FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES ): gMonitor.registerActivity( "FTSJobs%s" % status, "FTSJobs %s" % status , "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime ) gMonitor.registerActivity( "FtSJobsPerRequest", "Average FTSJobs per request", "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN ) gMonitor.registerActivity( "FTSFilesPerJob", "FTSFiles per FTSJob", "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN ) gMonitor.registerActivity( "FTSSizePerJob", "Average FTSFiles size per FTSJob", "FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN ) return S_OK() def finalize( self ): """ finalize processing """ log = self.log.getSubLogger( "finalize" ) for request in self.__reqCache.values(): put = self.requestClient().putRequest( request ) if not put["OK"]: log.error( "unable to put back request '%s': %s" % ( request.RequestName, put["Message"] ) ) return S_OK() def execute( self ): """ one cycle execution """ log = gLogger.getSubLogger( "execute" ) # # reset FTSGraph if expired now = datetime.datetime.now() if now > self.__ftsGraphValidStamp: log.info( "resetting expired FTS graph..." ) resetFTSGraph = self.resetFTSGraph() if not resetFTSGraph["OK"]: log.error( "FTSGraph recreation error: %s" % resetFTSGraph["Message"] ) return resetFTSGraph self.__ftsGraphValidStamp = now + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH ) # # update R/W access in FTSGraph if expired if now > self.__rwAccessValidStamp: log.info( "updating expired R/W access for SEs..." 
) try: self.updateLock().acquire() self.__ftsGraph.updateRWAccess() finally: self.updateLock().release() self.__rwAccessValidStamp = now + datetime.timedelta( seconds = self.RW_REFRESH ) requestNames = self.requestClient().getRequestNamesList( [ "Scheduled" ] ) if not requestNames["OK"]: log.error( "unable to read scheduled request names: %s" % requestNames["Message"] ) return requestNames if not requestNames["Value"]: requestNames = self.__reqCache.keys() else: requestNames = [ req[0] for req in requestNames["Value"] ] requestNames = list( set ( requestNames + self.__reqCache.keys() ) ) if not requestNames: log.info( "no 'Scheduled' requests to process" ) return S_OK() log.info( "found %s requests to process:" % len( requestNames ) ) log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) ) log.info( " => new read from RMS: %s" % ( len( requestNames ) - len( self.__reqCache ) ) ) for requestName in requestNames: request = self.getRequest( requestName ) if not request["OK"]: log.error( request["Message"] ) continue request = request["Value"] sTJId = request.RequestName while True: queue = self.threadPool().generateJobAndQueueIt( self.processRequest, args = ( request, ), sTJId = sTJId ) if queue["OK"]: log.info( "request '%s' enqueued for execution" % sTJId ) gMonitor.addMark( "RequestsAtt", 1 ) break time.sleep( 1 ) # # process all results self.threadPool().processAllResults() return S_OK() def processRequest( self, request ): """ process one request :param Request request: ReqDB.Request """ log = self.log.getSubLogger( request.RequestName ) operation = request.getWaiting() if not operation["OK"]: log.error( "unable to find 'Scheduled' ReplicateAndRegister operation in request" ) return self.putRequest( request ) operation = operation["Value"] if operation.Type != "ReplicateAndRegister": log.error( "operation to be executed is not a ReplicateAndRegister but %s" % operation.Type ) return self.putRequest( request ) if operation.Status != "Scheduled": log.error( "operation in a wrong state, expecting 'Scheduled', got %s" % operation.Status ) return self.putRequest( request ) # # select FTSJobs, by default all in TRANS_STATES and INIT_STATES ftsJobs = self.ftsClient().getFTSJobsForRequest( request.RequestID ) if not ftsJobs["OK"]: log.error( ftsJobs["Message"] ) return ftsJobs ftsJobs = ftsJobs["Value"] if ftsJobs["Value"] else [] # # dict keeping info about files to reschedule, submit, fail and register ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] ) if ftsJobs: log.info( "==> found %s FTSJobs to monitor" % len( ftsJobs ) ) # # PHASE 0 = monitor active FTSJobs for ftsJob in ftsJobs: monitor = self.__monitorJob( request, ftsJob ) if not monitor["OK"]: log.error( "unable to monitor FTSJob %s: %s" % ( ftsJob.FTSJobID, monitor["Message"] ) ) ftsJob.Status = "Submitted" continue ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, monitor["Value"] ) log.info( "monitoring of FTSJobs completed" ) for key, ftsFiles in ftsFilesDict.items(): if ftsFiles: log.debug( " => %s FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) ) # # PHASE ONE - check ready replicas missingReplicas = self.__checkReadyReplicas( request, operation ) if not missingReplicas["OK"]: log.error( missingReplicas["Message"] ) else: missingReplicas = missingReplicas["Value"] for opFile in operation: # Actually the condition below should never happen... 
Change printout for checking if opFile.LFN not in missingReplicas and opFile.Status != 'Done': log.warn( "File should be set Done! %s is replicated at all targets" % opFile.LFN ) opFile.Status = "Done" toFail = ftsFilesDict.get( "toFail", [] ) toReschedule = ftsFilesDict.get( "toReschedule", [] ) toSubmit = ftsFilesDict.get( "toSubmit", [] ) toRegister = ftsFilesDict.get( "toRegister", [] ) toUpdate = ftsFilesDict.get( "toUpdate", [] ) # # PHASE TWO = Failed files? -> make request Failed and return if toFail: log.error( "==> found %s 'Failed' FTSFiles, request execution cannot proceed..." % len( toFail ) ) for opFile in operation: for ftsFile in toFail: if opFile.FileID == ftsFile.FileID: opFile.Error = ftsFile.Error opFile.Status = "Failed" operation.Error = "%s files are missing any replicas" % len( toFail ) # # request.Status should be Failed at this stage if request.Status == "Failed": request.Error = "ReplicateAndRegister %s failed" % operation.Order log.error( "request is set to 'Failed'" ) return self.putRequest( request ) # # PHASE THREE - update Waiting#TargetSE FTSFiles if toUpdate: log.info( "==> found %s possible FTSFiles to update..." % ( len( toUpdate ) ) ) byTarget = {} for ftsFile in toUpdate: byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile.FileID ) for targetSE, fileIDList in byTarget.items(): update = self.ftsClient().setFTSFilesWaiting( operation.OperationID, targetSE, fileIDList ) if not update["OK"]: log.error( "update FTSFiles failed: %s" % update["Message"] ) continue # # PHASE FOUR - add 'RegisterReplica' Operations if toRegister: log.info( "==> found %s Files waiting for registration, adding 'RegisterReplica' operations" % len( toRegister ) ) registerFiles = self.__register( request, operation, toRegister ) if not registerFiles["OK"]: log.error( "unable to create 'RegisterReplica' operations: %s" % registerFiles["Message"] ) if request.Status == "Waiting": log.info( "request is in 'Waiting' state, will put it back to RMS" ) return self.putRequest( request ) # # PHASE FIVE - reschedule operation files if toReschedule: log.info( "==> found %s Files to reschedule" % len( toReschedule ) ) rescheduleFiles = self.__reschedule( request, operation, toReschedule ) if not rescheduleFiles["OK"]: log.error( rescheduleFiles["Message"] ) if request.Status == "Waiting": log.info( "request is in 'Waiting' state, will put it back to ReqDB" ) return self.putRequest( request ) # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs ftsFiles = self.ftsClient().getFTSFilesForRequest( request.RequestID, [ "Waiting" ] ) if not ftsFiles["OK"]: log.error( ftsFiles["Message"] ) else: retryIds = list( set ( [ ftsFile.FTSFileID for ftsFile in toSubmit ] ) ) for ftsFile in ftsFiles["Value"]: if ftsFile.FTSFileID not in retryIds: toSubmit.append( ftsFile ) retryIds.append( ftsFile.FTSFileID ) # # submit new ftsJobs if operation.Status == "Scheduled" and toSubmit: log.info( "==> found %s FTSFiles to submit" % len( toSubmit ) ) submit = self.__submit( request, operation, toSubmit ) if not submit["OK"]: log.error( submit["Message"] ) else: ftsJobs += submit["Value"] # # status change? 
- put back request if request.Status != "Scheduled": put = self.putRequest( request ) if not put["OK"]: log.error( "unable to put back request: %s" % put["Message"] ) return put # # put back jobs if ftsJobs: putJobs = self.putFTSJobs( ftsJobs ) if not putJobs["OK"]: log.error( "unable to put back FTSJobs: %s" % putJobs["Message"] ) return putJobs return S_OK() def __reschedule( self, request, operation, toReschedule ): """ reschedule list of :toReschedule: files in request for operation :operation: :param Request request: :param Operation operation: :param list toReschedule: list of FTSFiles """ log = self.log.getSubLogger( "%s/reschedule" % request.RequestName ) log.info( "found %s files to reschedule" % len( toReschedule ) ) for opFile in operation: for ftsFile in toReschedule: if opFile.FileID == ftsFile.FileID: opFile.Status = "Waiting" toSchedule = [] # # filter files for opFile in operation.getWaitingFilesList(): replicas = self.__filterReplicas( opFile ) if not replicas["OK"]: continue replicas = replicas["Value"] validReplicas = replicas["Valid"] bannedReplicas = replicas["Banned"] if not validReplicas and bannedReplicas: log.warn( "unable to schedule '%s', replicas only at banned SEs" % opFile.LFN ) continue if validReplicas: validTargets = list( set( operation.targetSEList ) - set( validReplicas ) ) if not validTargets: log.info( "file %s is already present at all targets" % opFile.LFN ) opFile.Status = "Done" continue toSchedule.append( ( opFile.toJSON()["Value"], validReplicas, validTargets ) ) # # do real schedule here if toSchedule: ftsSchedule = self.ftsClient().ftsSchedule( request.RequestID, operation.OperationID, toSchedule ) if not ftsSchedule["OK"]: self.log.error( ftsSchedule["Message"] ) return ftsSchedule ftsSchedule = ftsSchedule["Value"] for fileID in ftsSchedule["Successful"]: for opFile in operation: if fileID == opFile.FileID: opFile.Status = "Scheduled" for fileID, reason in ftsSchedule["Failed"]: for opFile in operation: if fileID == opFile.FileID: opFile.Error = reason return S_OK() def __submit( self, request, operation, toSubmit ): """ create and submit new FTSJobs using list of FTSFiles :param Request request: ReqDB.Request instance :param list toSubmit: list of FTSFile instances :return: [ FTSJob, FTSJob, ...] 
""" log = self.log.getSubLogger( "%s/submit" % request.RequestName ) bySourceAndTarget = {} for ftsFile in toSubmit: bySourceAndTarget.setdefault( ftsFile.SourceSE, {} ).setdefault( ftsFile.TargetSE, [] ).append( ftsFile ) ftsJobs = [] for source, targetDict in bySourceAndTarget.items(): for target, ftsFileList in targetDict.items(): log.info( "found %s files to submit from %s to %s" % ( len( ftsFileList ), source, target ) ) route = self.__ftsGraph.findRoute( source, target ) if not route["OK"]: log.error( route["Message"] ) continue route = route["Value"] sourceRead = route.fromNode.SEs[source]["read"] if not sourceRead: log.error( "SourceSE %s is banned for reading right now" % source ) continue targetWrite = route.toNode.SEs[target]["write"] if not targetWrite: log.error( "TargetSE %s is banned for writing right now" % target ) continue if route.ActiveJobs > route.toNode.MaxActiveJobs: log.warn( "unable to submit new FTS job, max active jobs reached" ) continue # # create FTSJob ftsJob = FTSJob() ftsJob.RequestID = request.RequestID ftsJob.OperationID = operation.OperationID ftsJob.SourceSE = source ftsJob.TargetSE = target sourceSE = self.getSE( source ) sourceToken = sourceSE.getStorageParameters( "SRM2" ) if not sourceToken["OK"]: log.error( "unable to get sourceSE '%s' parameters: %s" % ( source, sourceToken["Message"] ) ) continue ftsJob.SourceToken = sourceToken["Value"].get( "SpaceToken", "" ) targetSE = self.getSE( target ) targetToken = targetSE.getStorageParameters( "SRM2" ) if not targetToken["OK"]: log.error( "unable to get targetSE '%s' parameters: %s" % ( target, targetToken["Message"] ) ) continue ftsJob.TargetToken = targetToken["Value"].get( "SpaceToken", "" ) ftsJob.FTSServer = route.toNode.FTSServer for ftsFile in ftsFileList: ftsFile.Attempt += 1 ftsFile.Error = "" ftsJob.addFile( ftsFile ) submit = ftsJob.submitFTS2( self.STAGE_FILES ) if not submit["OK"]: log.error( "unable to submit FTSJob: %s" % submit["Message"] ) continue log.info( "FTSJob '%s'@'%s' has been submitted" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) ) # # update statuses for job files ( Attempt was already incremented before submission ) for ftsFile in ftsJob: ftsFile.FTSGUID = ftsJob.FTSGUID ftsFile.Status = "Submitted" # # update graph route try: self.updateLock().acquire() route.ActiveJobs += 1 finally: self.updateLock().release() ftsJobs.append( ftsJob ) log.info( "%s new FTSJobs have been submitted" % len( ftsJobs ) ) return S_OK( ftsJobs ) def __monitorJob( self, request, ftsJob ): """ execute FTSJob.monitorFTS2 for a given :ftsJob: if ftsJob is in a final state, finalize it :param Request request: ReqDB.Request instance :param FTSJob ftsJob: FTSDB.FTSJob instance """ log = self.log.getSubLogger( "%s/monitor/%s" % ( request.RequestName, ftsJob.FTSGUID ) ) log.info( "FTSJob '%s'@'%s'" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) ) # # this will be returned ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] ) monitor = ftsJob.monitorFTS2() if not monitor["OK"]: gMonitor.addMark( "FTSMonitorFail", 1 ) log.error( monitor["Message"] ) if "getTransferJobSummary2: Not authorised to query request" in monitor["Message"]: log.error( "FTSJob not known (expired on server?)" ) for ftsFile in ftsJob: ftsFile.Status = "Waiting" ftsFilesDict["toSubmit"].append( ftsFile ) return S_OK( ftsFilesDict ) 
  def __monitorJob( self, request, ftsJob ):
    """ execute FTSJob.monitorFTS2 for a given :ftsJob:
        if ftsJob is in a final state, finalize it

    :param Request request: ReqDB.Request instance
    :param FTSJob ftsJob: FTSDB.FTSJob instance
    """
    log = self.log.getSubLogger( "%s/monitor/%s" % ( request.RequestName, ftsJob.FTSGUID ) )
    log.info( "FTSJob '%s'@'%s'" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )
    # # this will be returned
    ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] )
    monitor = ftsJob.monitorFTS2()
    if not monitor["OK"]:
      gMonitor.addMark( "FTSMonitorFail", 1 )
      log.error( monitor["Message"] )
      if "getTransferJobSummary2: Not authorised to query request" in monitor["Message"]:
        log.error( "FTSJob not known (expired on server?)" )
        for ftsFile in ftsJob:
          ftsFile.Status = "Waiting"
          ftsFilesDict["toSubmit"].append( ftsFile )
        return S_OK( ftsFilesDict )
      return monitor
    monitor = monitor["Value"]
    log.info( "FTSJob Status = %s Completeness = %s" % ( ftsJob.Status, ftsJob.Completeness ) )
    # # mark monitored status
    gMonitor.addMark( "FTSJobs%s" % ftsJob.Status, 1 )
    if ftsJob.Status in FTSJob.FINALSTATES:
      finalizeFTSJob = self.__finalizeFTSJob( request, ftsJob )
      if not finalizeFTSJob["OK"]:
        log.error( finalizeFTSJob["Message"] )
        return finalizeFTSJob
      ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, finalizeFTSJob["Value"] )
    return S_OK( ftsFilesDict )

  def __finalizeFTSJob( self, request, ftsJob ):
    """ finalize FTSJob

    :param Request request: ReqDB.Request instance
    :param FTSJob ftsJob: FTSDB.FTSJob instance
    """
    log = self.log.getSubLogger( "%s/monitor/%s/finalize" % ( request.RequestName, ftsJob.FTSJobID ) )
    log.info( "finalizing FTSJob %s@%s" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )
    # # this will be returned
    ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] )
    monitor = ftsJob.monitorFTS2( full = True )
    if not monitor["OK"]:
      log.error( monitor["Message"] )
      return monitor
    # # split FTSFiles into categories
    processFiles = self.__filterFiles( ftsJob )
    if not processFiles["OK"]:
      log.error( processFiles["Message"] )
      return processFiles
    ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, processFiles["Value"] )
    # # send accounting record for this job
    self.__sendAccounting( ftsJob, request.OwnerDN )
    # # update graph - remove this job from its route
    route = self.__ftsGraph.findRoute( ftsJob.SourceSE, ftsJob.TargetSE )
    if route["OK"]:
      try:
        self.updateLock().acquire()
        route["Value"].ActiveJobs -= 1
      finally:
        self.updateLock().release()
    log.info( "FTSJob is finalized" )
    return S_OK( ftsFilesDict )

  def __filterFiles( self, ftsJob ):
    """ split FTSFiles from a finished ftsJob into categories

    :param FTSJob ftsJob: monitored FTSJob instance
    """
    # # lists for the different categories
    toUpdate = []
    toReschedule = []
    toRegister = []
    toSubmit = []
    toFail = []
    # # loop over files in the FTS job
    for ftsFile in ftsJob:
      # # successful files
      if ftsFile.Status == "Finished":
        if ftsFile.Error == "AddCatalogReplicaFailed":
          toRegister.append( ftsFile )
        toUpdate.append( ftsFile )
        continue
      if ftsFile.Status == "Failed":
        if ftsFile.Error == "MissingSource":
          toReschedule.append( ftsFile )
        elif ftsFile.Attempt < self.MAX_ATTEMPT:
          toSubmit.append( ftsFile )
        else:
          ftsFile.Error = "Max attempts reached"
          toFail.append( ftsFile )
    return S_OK( { "toUpdate": toUpdate,
                   "toSubmit": toSubmit,
                   "toRegister": toRegister,
                   "toReschedule": toReschedule,
                   "toFail": toFail } )
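  # # NOTE: a minimal sketch, not part of the production agent. updateFTSFileDict
  # # is defined elsewhere in this class; this hypothetical helper only shows the
  # # dict-of-lists merge the monitoring code above relies on, assuming both
  # # arguments are { category: [ FTSFile, ... ] } dicts.
  @staticmethod
  def __mergeFilesDictSketch( ftsFilesDict, newFilesDict ):
    """ merge two category dicts without losing any entries """
    merged = dict( [ ( category, list( files ) ) for category, files in ftsFilesDict.items() ] )
    for category, files in newFilesDict.items():
      merged.setdefault( category, [] ).extend( files )
    return merged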
  def __register( self, request, operation, toRegister ):
    """ add RegisterReplica operation

    :param Request request: request instance
    :param Operation operation: 'ReplicateAndRegister' operation for this FTSJob
    :param list toRegister: [ FTSDB.FTSFile, ... ] - files that failed to register
    """
    log = self.log.getSubLogger( "%s/registerFiles" % request.RequestName )
    byTarget = {}
    for ftsFile in toRegister:
      byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile )
    log.info( "will create %s 'RegisterReplica' operations" % len( byTarget ) )
    for target, ftsFileList in byTarget.items():
      log.info( "creating 'RegisterReplica' operation for targetSE %s with %s files..." % ( target, len( ftsFileList ) ) )
      registerOperation = Operation()
      registerOperation.Type = "RegisterReplica"
      registerOperation.Status = "Waiting"
      registerOperation.TargetSE = target
      targetSE = self.getSE( target )
      for ftsFile in ftsFileList:
        opFile = File()
        opFile.LFN = ftsFile.LFN
        pfn = targetSE.getPfnForProtocol( ftsFile.TargetSURL, "SRM2", withPort = False )
        if not pfn["OK"]:
          log.warn( "unable to get pfn for %s: %s" % ( ftsFile.LFN, pfn["Message"] ) )
          continue
        opFile.PFN = pfn["Value"]
        registerOperation.addFile( opFile )
      request.insertBefore( registerOperation, operation )
    return S_OK()

  @staticmethod
  def __sendAccounting( ftsJob, ownerDN ):
    """ prepare and send a DataOperation record to the AccountingDB """
    dataOp = DataOperation()
    dataOp.setStartTime( fromString( ftsJob.SubmitTime ) )
    dataOp.setEndTime( fromString( ftsJob.LastUpdate ) )
    accountingDict = dict()
    accountingDict["OperationType"] = "ReplicateAndRegister"
    username = getUsernameForDN( ownerDN )
    accountingDict["User"] = username["Value"] if username["OK"] else ownerDN
    accountingDict["Protocol"] = "FTS3" if "fts3" in ftsJob.FTSServer.lower() else "FTS"
    accountingDict["ExecutionSite"] = ftsJob.FTSServer
    accountingDict["RegistrationTime"] = ftsJob._regTime
    accountingDict["RegistrationOK"] = ftsJob._regSuccess
    accountingDict["RegistrationTotal"] = ftsJob._regTotal
    accountingDict["TransferOK"] = len( [ ftsFile for ftsFile in ftsJob if ftsFile.Status in FTSFile.SUCCESS_STATES ] )
    accountingDict["TransferTotal"] = len( ftsJob )
    accountingDict["TransferSize"] = ftsJob.Size - ftsJob.FailedSize
    accountingDict["FinalStatus"] = ftsJob.Status
    accountingDict["Source"] = ftsJob.SourceSE
    accountingDict["Destination"] = ftsJob.TargetSE
    # # wall-clock transfer time in whole seconds
    dt = ftsJob.LastUpdate - ftsJob.SubmitTime
    accountingDict["TransferTime"] = dt.days * 86400 + dt.seconds
    # accountingDict["TransferTime"] = sum( [ ftsFile._duration for ftsFile in ftsJob ] )
    dataOp.setValuesFromDict( accountingDict )
    dataOp.commit()
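  # # NOTE: a minimal sketch, not part of the production agent. It shows that the
  # # hand-rolled duration above ( dt.days * 86400 + dt.seconds ) matches the
  # # truncated datetime.timedelta.total_seconds() for non-negative intervals;
  # # the method name is hypothetical.
  @staticmethod
  def __transferTimeSketch( submitTime, lastUpdate ):
    """ whole seconds between two datetime.datetime stamps, sub-second part dropped """
    dt = lastUpdate - submitTime
    # for dt >= 0 this equals int( dt.total_seconds() )
    return dt.days * 86400 + dt.seconds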
  def __checkReadyReplicas( self, request, operation ):
    """ check ready replicas for a transfer operation """
    log = self.log.getSubLogger( "%s/checkReadyReplicas" % request.RequestName )
    targetSESet = set( operation.targetSEList )
    # # { LFN: [ targetSE, ... ] }
    missingReplicas = {}
    scheduledFiles = dict( [ ( opFile.LFN, opFile ) for opFile in operation
                             if opFile.Status in ( "Scheduled", "Waiting" ) ] )
    # # get replicas
    replicas = self.replicaManager().getCatalogReplicas( scheduledFiles.keys() )
    if not replicas["OK"]:
      log.error( replicas["Message"] )
      return replicas
    replicas = replicas["Value"]
    fullyReplicated = 0
    missingSEs = {}
    for successfulLFN in replicas["Successful"]:
      reps = set( replicas["Successful"][successfulLFN] )
      if targetSESet.issubset( reps ):
        log.info( "%s has been replicated to all targets" % successfulLFN )
        fullyReplicated += 1
        scheduledFiles[successfulLFN].Status = "Done"
      else:
        missingReplicas[successfulLFN] = sorted( targetSESet - reps )
        ses = ",".join( missingReplicas[successfulLFN] )
        missingSEs[ses] = missingSEs.get( ses, 0 ) + 1
        log.verbose( "%s is still missing at %s" % ( successfulLFN, ses ) )
    if fullyReplicated:
      log.info( "%d new files have been replicated to all targets" % fullyReplicated )
    for ses, count in missingSEs.items():
      log.info( "%d replicas still missing at %s" % ( count, ses ) )
    reMissing = re.compile( "no such file or directory" )
    for failedLFN, errStr in replicas["Failed"].items():
      scheduledFiles[failedLFN].Error = errStr
      if reMissing.search( errStr.lower() ):
        log.error( "%s is missing, setting its status to 'Failed'" % failedLFN )
        scheduledFiles[failedLFN].Status = "Failed"
      else:
        log.warn( "unable to read replicas for %s: %s" % ( failedLFN, errStr ) )
    return S_OK( missingReplicas )

  def __filterReplicas( self, opFile ):
    """ filter out banned/invalid source SEs """
    log = self.log.getSubLogger( "filterReplicas" )
    ret = { "Valid" : [], "Banned" : [], "Bad" : [] }
    replicas = self.replicaManager().getActiveReplicas( opFile.LFN )
    if not replicas["OK"]:
      log.error( replicas["Message"] )
      return replicas
    replicas = replicas["Value"]
    reNotExists = re.compile( "no such file or directory" )
    failed = replicas["Failed"].get( opFile.LFN, "" )
    if reNotExists.match( failed.lower() ):
      opFile.Status = "Failed"
      opFile.Error = failed
      return S_ERROR( failed )
    replicas = replicas["Successful"].get( opFile.LFN, {} )
    for repSEName in replicas:
      repSE = self.getSE( repSEName )
      pfn = repSE.getPfnForLfn( opFile.LFN )
      if not pfn["OK"]:
        log.warn( "unable to create pfn for %s lfn: %s" % ( opFile.LFN, pfn["Message"] ) )
        ret["Banned"].append( repSEName )
        continue
      pfn = pfn["Value"]
      repSEMetadata = repSE.getFileMetadata( pfn, singleFile = True )
      if not repSEMetadata["OK"]:
        log.warn( repSEMetadata["Message"] )
        ret["Banned"].append( repSEName )
        continue
      repSEMetadata = repSEMetadata["Value"]
      seChecksum = repSEMetadata["Checksum"].replace( "x", "0" ).zfill( 8 ) if "Checksum" in repSEMetadata else None
      if opFile.Checksum and opFile.Checksum != seChecksum:
        log.warn( "%s checksum mismatch: %s %s:%s" % ( opFile.LFN, opFile.Checksum, repSEName, seChecksum ) )
        ret["Bad"].append( repSEName )
        continue
      # # if we are here, the replica at repSEName is OK
      ret["Valid"].append( repSEName )
    return S_OK( ret )
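  # # NOTE: a minimal sketch, not part of the production agent. It isolates the
  # # adler32 normalisation applied in __filterReplicas above: catalogue and SE
  # # checksums may differ in case, in a hex "0x" marker or in zero padding, so
  # # both sides are normalised here before comparison; the name is hypothetical.
  @staticmethod
  def __compareChecksumsSketch( catalogChecksum, seChecksum ):
    """ True if two adler32 checksum strings match after normalisation """
    def normalise( checksum ):
      # lower-case, turn a "0x" marker into zero padding, left-pad to 8 chars
      return checksum.lower().replace( "x", "0" ).zfill( 8 )
    return normalise( catalogChecksum ) == normalise( seChecksum )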