def __addPool( self, poolName ):
    """ Create a new thread pool registered under :poolName:.

    By default the pool has 2 executing threads and 40 requests in the queue
    (actual sizes come from the 'minThreadsInPool', 'maxThreadsInPool' and
    'totalThreadsInPool' agent options).

    :param str poolName: name under which the pool is stored in self.pools
    :return: the pool name on success, None if the name is empty or already taken
    """
    # reject empty names and names that are already registered
    if not poolName or poolName in self.pools:
      return None
    newPool = ThreadPool( self.am_getOption( 'minThreadsInPool' ),
                          self.am_getOption( 'maxThreadsInPool' ),
                          self.am_getOption( 'totalThreadsInPool' ) )
    # every pool except the 'Default' one runs daemonized threads
    if poolName != 'Default':
      newPool.daemonize()
    self.pools[poolName] = newPool
    return poolName
def __addPool(self, poolName):
    """Create and register a new thread pool.

    By default the pool has 2 executing threads and 40 requests in the queue;
    the actual limits are read from the agent options listed below.

    :param str poolName: key to register the pool under in ``self.pools``
    :return: ``poolName`` when a pool was created, ``None`` otherwise
    """
    if not poolName:
        return None
    if poolName in self.pools:
        return None
    # pool sizing is driven entirely by agent configuration
    optionNames = ('minThreadsInPool', 'maxThreadsInPool', 'totalThreadsInPool')
    pool = ThreadPool(*(self.am_getOption(name) for name in optionNames))
    # daemonize every pool except the "Default" one
    if poolName != 'Default':
        pool.daemonize()
    self.pools[poolName] = pool
    return poolName
def __addPool(self, poolName):
    """Create a new thread pool and store it under *poolName*.

    By default the pool has 2 executing threads and 40 requests in the
    queue; the concrete limits come from the agent options
    ``minThreadsInPool``, ``maxThreadsInPool`` and ``totalThreadsInPool``.

    :param poolName: registration key for the new pool
    :return: the pool name, or ``None`` for an empty or duplicate name
    """
    if poolName and poolName not in self.pools:
        pool = ThreadPool(
            self.am_getOption("minThreadsInPool"),
            self.am_getOption("maxThreadsInPool"),
            self.am_getOption("totalThreadsInPool"),
        )
        # all pools but "Default" are daemonized
        if poolName != "Default":
            pool.daemonize()
        self.pools[poolName] = pool
        return poolName
    return None
class FTSAgent( AgentModule ): """ .. class:: FTSAgent Agent propagating Scheduled request to Done or Failed state in the FTS system. Requests and associated FTSJobs (and so FTSFiles) are kept in cache. """ # # fts placement refresh in seconds FTSPLACEMENT_REFRESH = FTSHistoryView.INTERVAL / 2 # # placeholder for max job per channel MAX_ACTIVE_JOBS = 50 # # min threads MIN_THREADS = 1 # # max threads MAX_THREADS = 10 # # files per job MAX_FILES_PER_JOB = 100 # # MAX FTS transfer per FTSFile MAX_ATTEMPT = 256 # # stage flag PIN_TIME = 0 # # FTS submission command SUBMIT_COMMAND = 'glite-transfer-submit' # # FTS monitoring command MONITOR_COMMAND = 'glite-transfer-status' # Max number of requests fetched from the RMS MAX_REQUESTS = 100 # Minimum interval (seconds) between 2 job monitoring MONITORING_INTERVAL = 600 # # placeholder for FTS client __ftsClient = None # # placeholder for the FTS version __ftsVersion = None # # placeholder for request client __requestClient = None # # placeholder for resources helper __resources = None # # placeholder for RSS client __rssClient = None # # placeholder for FTSPlacement __ftsPlacement = None # # placement regeneration time delta __ftsPlacementValidStamp = None # # placeholder for threadPool __threadPool = None # # update lock __updateLock = None # # request cache __reqCache = dict() def updateLock( self ): """ update lock """ if not self.__updateLock: self.__updateLock = LockRing().getLock( "FTSAgentLock" ) return self.__updateLock @classmethod def requestClient( cls ): """ request client getter """ if not cls.__requestClient: cls.__requestClient = ReqClient() return cls.__requestClient @classmethod def ftsClient( cls ): """ FTS client """ if not cls.__ftsClient: cls.__ftsClient = FTSClient() return cls.__ftsClient @classmethod def rssClient( cls ): """ RSS client getter """ if not cls.__rssClient: cls.__rssClient = ResourceStatus() return cls.__rssClient @classmethod def getRequest( cls, reqID ): """ get Requests 
systematically and refresh cache """ getRequest = cls.requestClient().getRequest( reqID ) if not getRequest["OK"]: cls.__reqCache.pop( reqID, None ) return getRequest getRequest = getRequest["Value"] if not getRequest: cls.__reqCache.pop( reqID, None ) return S_ERROR( "request of id '%s' not found in ReqDB" % reqID ) cls.__reqCache[reqID] = getRequest return S_OK( cls.__reqCache[reqID] ) @classmethod def putRequest( cls, request, clearCache = True ): """ put request back to ReqDB :param Request request: Request instance :param bool clearCache: clear the cache? also finalize request if status == Done """ # # put back request if request.RequestID not in cls.__reqCache: return S_OK() put = cls.requestClient().putRequest( request ) if not put["OK"]: return put # # finalize first if possible if request.Status == "Done" and request.JobID: finalizeRequest = cls.requestClient().finalizeRequest( request.RequestID, request.JobID ) if not finalizeRequest["OK"]: request.Status = "Scheduled" # # del request from cache if needed if clearCache: cls.__reqCache.pop( request.RequestID, None ) return S_OK() @classmethod def putFTSJobs( cls, ftsJobsList ): """ put back fts jobs to the FTSDB """ for ftsJob in ftsJobsList: put = cls.ftsClient().putFTSJob( ftsJob ) if not put["OK"]: return put return S_OK() @staticmethod def updateFTSFileDict( ftsFilesDict, toUpdateDict ): """ update :ftsFilesDict: with FTSFiles in :toUpdateDict: """ for category, ftsFileList in ftsFilesDict.iteritems(): for ftsFile in toUpdateDict.get( category, [] ): if ftsFile not in ftsFileList: ftsFileList.append( ftsFile ) return ftsFilesDict # def resources( self ): # """ resource helper getter """ # if not self.__resources: # self.__resources = Resources() # return self.__resources def threadPool( self ): """ thread pool getter """ if not self.__threadPool: self.__threadPool = ThreadPool( self.MIN_THREADS, self.MAX_THREADS ) self.__threadPool.daemonize() return self.__threadPool def resetFTSPlacement( self ): """ 
create fts Placement """ ftsHistory = self.ftsClient().getFTSHistory() if not ftsHistory["OK"]: self.log.error( "unable to get FTS history:", ftsHistory["Message"] ) return ftsHistory ftsHistory = ftsHistory["Value"] try: self.updateLock().acquire() if not self.__ftsPlacement: self.__ftsPlacement = FTSPlacement( csPath = None, ftsHistoryViews = ftsHistory ) else: self.__ftsPlacement.refresh( ftsHistoryViews = ftsHistory ) finally: self.updateLock().release() # # save time stamp self.__ftsPlacementValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH ) return S_OK() def initialize( self ): """ agent's initialization """ # # data manager self.dataManager = DataManager() log = self.log.getSubLogger( "initialize" ) self.FTSPLACEMENT_REFRESH = self.am_getOption( "FTSPlacementValidityPeriod", self.FTSPLACEMENT_REFRESH ) log.info( "FTSPlacement validity period = %s s" % self.FTSPLACEMENT_REFRESH ) self.SUBMIT_COMMAND = self.am_getOption( "SubmitCommand", self.SUBMIT_COMMAND ) log.info( "FTS submit command = %s" % self.SUBMIT_COMMAND ) self.MONITOR_COMMAND = self.am_getOption( "MonitorCommand", self.MONITOR_COMMAND ) log.info( "FTS commands: submit = %s monitor %s" % ( self.SUBMIT_COMMAND, self.MONITOR_COMMAND ) ) self.PIN_TIME = self.am_getOption( "PinTime", self.PIN_TIME ) log.info( "Stage files before submission = ", {True: "yes", False: "no"}[bool( self.PIN_TIME )] ) self.MAX_ACTIVE_JOBS = self.am_getOption( "MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS ) log.info( "Max active FTSJobs/route = ", str( self.MAX_ACTIVE_JOBS ) ) self.MAX_FILES_PER_JOB = self.am_getOption( "MaxFilesPerJob", self.MAX_FILES_PER_JOB ) log.info( "Max FTSFiles/FTSJob = ", str( self.MAX_FILES_PER_JOB ) ) self.MAX_ATTEMPT = self.am_getOption( "MaxTransferAttempts", self.MAX_ATTEMPT ) log.info( "Max transfer attempts = ", str( self.MAX_ATTEMPT ) ) # # thread pool self.MIN_THREADS = self.am_getOption( "MinThreads", self.MIN_THREADS ) self.MAX_THREADS = 
self.am_getOption( "MaxThreads", self.MAX_THREADS ) minmax = ( abs( self.MIN_THREADS ), abs( self.MAX_THREADS ) ) self.MIN_THREADS, self.MAX_THREADS = min( minmax ), max( minmax ) log.info( "ThreadPool min threads = ", str( self.MIN_THREADS ) ) log.info( "ThreadPool max threads = ", str( self.MAX_THREADS ) ) self.MAX_REQUESTS = self.am_getOption( "MaxRequests", self.MAX_REQUESTS ) log.info( "Max Requests fetched = ", str( self.MAX_REQUESTS ) ) self.MONITORING_INTERVAL = self.am_getOption( "MonitoringInterval", self.MONITORING_INTERVAL ) log.info( "Minimum monitoring interval = ", str( self.MONITORING_INTERVAL ) ) self.__ftsVersion = Operations().getValue( 'DataManagement/FTSVersion', 'FTS2' ) log.info( "FTSVersion : %s" % self.__ftsVersion ) log.info( "initialize: creation of FTSPlacement..." ) createPlacement = self.resetFTSPlacement() if not createPlacement["OK"]: log.error( "initialize:", createPlacement["Message"] ) return createPlacement # This sets the Default Proxy to used as that defined under # /Operations/Shifter/DataManager # the shifterProxy option in the Configuration can be used to change this default. 
self.am_setOption( 'shifterProxy', 'DataManager' ) log.info( "will use DataManager proxy" ) self.registrationProtocols = getRegistrationProtocols() # # gMonitor stuff here gMonitor.registerActivity( "RequestsAtt", "Attempted requests executions", "FTSAgent", "Requests/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "RequestsOK", "Successful requests executions", "FTSAgent", "Requests/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "RequestsFail", "Failed requests executions", "FTSAgent", "Requests/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsSubAtt", "FTSJobs creation attempts", "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsSubOK", "FTSJobs submitted successfully", "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsSubFail", "FTSJobs submissions failed", "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsMonAtt", "FTSJobs monitored", "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsMonOK", "FTSJobs monitored successfully", "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsMonFail", "FTSJobs attempts failed", "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSMonitorFail", "Failed FTS monitor executions", "FTSAgent", "Execution/mins", gMonitor.OP_SUM ) pollingTime = self.am_getOption( "PollingTime", 60 ) for status in list( FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES ): gMonitor.registerActivity( "FTSJobs%s" % status, "FTSJobs %s" % status , "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime ) gMonitor.registerActivity( "FtSJobsPerRequest", "Average FTSJobs per request", "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN ) gMonitor.registerActivity( "FTSFilesPerJob", "FTSFiles per FTSJob", "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN ) gMonitor.registerActivity( 
"FTSSizePerJob", "Average FTSFiles size per FTSJob", "FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN ) return S_OK() def finalize( self ): """ finalize processing """ # log = self.log.getSubLogger( "finalize" ) # if self.__reqCache: # log.info( 'putting back %d requests from cache' % len( self.__reqCache ) ) # else: # log.info( 'no requests to put back' ) # for request in self.__reqCache.values(): # put = self.requestClient().putRequest( request ) # if not put["OK"]: # log.error( "unable to put back request '%s': %s" % ( request.RequestName, put["Message"] ) ) return S_OK() def execute( self ): """ one cycle execution """ # Don't use the server certificate otherwise the DFC wont let us write gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' ) log = gLogger.getSubLogger( "execute" ) # # reset FTSPlacement if expired now = datetime.datetime.now() if now > self.__ftsPlacementValidStamp: log.info( "resetting expired FTS placement..." ) resetFTSPlacement = self.resetFTSPlacement() if not resetFTSPlacement["OK"]: log.error( "FTSPlacement recreation error:" , resetFTSPlacement["Message"] ) return resetFTSPlacement self.__ftsPlacementValidStamp = now + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH ) requestIDs = self.requestClient().getRequestIDsList( statusList = [ "Scheduled" ], limit = self.MAX_REQUESTS ) if not requestIDs["OK"]: log.error( "unable to read scheduled request ids" , requestIDs["Message"] ) return requestIDs if not requestIDs["Value"]: requestIDs = [] else: requestIDs = [ req[0] for req in requestIDs["Value"] if req[0] not in self.__reqCache ] requestIDs += self.__reqCache.keys() if not requestIDs: log.info( "no 'Scheduled' requests to process" ) return S_OK() log.info( "found %s requests to process:" % len( requestIDs ) ) log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) ) log.info( " => new read from RMS: %s" % ( len( requestIDs ) - len( self.__reqCache ) ) ) for requestID in 
requestIDs: request = self.getRequest( requestID ) if not request["OK"]: log.error( "Error getting request", "%s: %s" % ( requestID, request["Message"] ) ) continue request = request["Value"] sTJId = request.RequestID while True: queue = self.threadPool().generateJobAndQueueIt( self.processRequest, args = ( request, ), sTJId = sTJId ) if queue["OK"]: log.info( "Request enqueued for execution", sTJId ) gMonitor.addMark( "RequestsAtt", 1 ) break time.sleep( 1 ) # # process all results self.threadPool().processAllResults() return S_OK() def processRequest( self, request ): """ process one request :param Request request: ReqDB.Request """ log = self.log.getSubLogger( "req_%s/%s" % ( request.RequestID, request.RequestName ) ) operation = request.getWaiting() if not operation["OK"]: log.error( "Unable to find 'Scheduled' ReplicateAndRegister operation in request" ) return self.putRequest( request ) operation = operation["Value"] if not isinstance( operation, Operation ): log.error( "Waiting returned operation is not an operation:", type( operation ) ) return self.putRequest( request ) if operation.Type != "ReplicateAndRegister": log.error( "operation to be executed is not a ReplicateAndRegister but", operation.Type ) return self.putRequest( request ) if operation.Status != "Scheduled": log.error( "operation in a wrong state, expecting 'Scheduled', got", operation.Status ) return self.putRequest( request ) log.info( 'start processRequest' ) # # select FTSJobs, by default all in TRANS_STATES and INIT_STATES ftsJobs = self.ftsClient().getFTSJobsForRequest( request.RequestID ) if not ftsJobs["OK"]: log.error( ftsJobs["Message"] ) return ftsJobs ftsJobs = [ftsJob for ftsJob in ftsJobs.get( "Value", [] ) if ftsJob.Status not in FTSJob.FINALSTATES] # # Use a try: finally: for making sure FTS jobs are put back before returning try: # # dict keeping info about files to reschedule, submit, fail and register ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", 
"toFail", "toReschedule", "toUpdate" ) ) now = datetime.datetime.utcnow() jobsToMonitor = [job for job in ftsJobs if ( now - job.LastUpdate ).seconds > ( self.MONITORING_INTERVAL * ( 3. if StorageElement( job.SourceSE ).getStatus().get( 'Value', {} ).get( 'TapeSE' ) else 1. ) ) ] if jobsToMonitor: log.info( "==> found %s FTSJobs to monitor" % len( jobsToMonitor ) ) # # PHASE 0 = monitor active FTSJobs for ftsJob in jobsToMonitor: monitor = self.__monitorJob( request, ftsJob ) if not monitor["OK"]: log.error( "unable to monitor FTSJob", "%s: %s" % ( ftsJob.FTSJobID, monitor["Message"] ) ) ftsJob.Status = "Submitted" else: ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, monitor["Value"] ) log.info( "monitoring of FTSJobs completed" ) for key, ftsFiles in ftsFilesDict.iteritems(): if ftsFiles: log.info( " => %d FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) ) if len( ftsJobs ) != len( jobsToMonitor ): log.info( "==> found %d FTSJobs that were monitored recently" % ( len( ftsJobs ) - len( jobsToMonitor ) ) ) if not jobsToMonitor: # Nothing to happen this time, escape raise EscapeTryException # # PHASE ONE - check ready replicas missingReplicas = self.__checkReadyReplicas( request, operation ) if not missingReplicas["OK"]: log.error( missingReplicas["Message"] ) else: missingReplicas = missingReplicas["Value"] for opFile in operation: # Actually the condition below should never happen... Change printout for checking if opFile.LFN not in missingReplicas and opFile.Status not in ( 'Done', 'Failed' ): log.warn( "File should be set Done! 
%s is replicated at all targets" % opFile.LFN ) opFile.Status = "Done" if missingReplicas: # Check if these files are in the FTSDB ftsFiles = self.ftsClient().getAllFTSFilesForRequest( request.RequestID ) if not ftsFiles['OK']: log.error( ftsFiles['Message'] ) else: ftsFiles = ftsFiles['Value'] ftsLfns = set( [ftsFile.LFN for ftsFile in ftsFiles] ) # Recover files not in FTSDB toSchedule = set( missingReplicas ) - ftsLfns if toSchedule: log.warn( '%d files in operation are not in FTSDB, reset them Waiting' % len( toSchedule ) ) for opFile in operation: if opFile.LFN in toSchedule and opFile.Status == 'Scheduled': opFile.Status = 'Waiting' # Recover files with target not in FTSDB toSchedule = set( [missing for missing, missingSEs in missingReplicas.iteritems() if not [ftsFile for ftsFile in ftsFiles if ftsFile.LFN == missing and ftsFile.TargetSE in missingSEs]] ) if toSchedule: log.warn( '%d targets in operation are not in FTSDB, reset files Waiting' % len( toSchedule ) ) for opFile in operation: if opFile.LFN in toSchedule and opFile.Status == 'Scheduled': opFile.Status = 'Waiting' # identify missing LFNs that are waiting for a replication which is finished for ftsFile in [f for f in ftsFiles if f.LFN in missingReplicas and f.Status.startswith( 'Waiting#' )]: targetSE = ftsFile.Status.split( '#' )[1] finishedFiles = [f for f in ftsFiles if f.LFN == ftsFile.LFN and f.Status == 'Finished' and f.TargetSE == targetSE and f not in ftsFilesDict['toUpdate']] if finishedFiles: log.warn( "%s is %s while replication was Finished to %s, update" % ( ftsFile.LFN, ftsFile.Status, targetSE ) ) ftsFilesDict['toUpdate'] += finishedFiles # identify Active transfers for which there is no FTS job any longer and reschedule them for ftsFile in [f for f in ftsFiles if f.Status == 'Active' and f.TargetSE in missingReplicas.get( f.LFN, [] )]: if not [ftsJob for ftsJob in ftsJobs if ftsJob.FTSGUID == ftsFile.FTSGUID]: ftsFilesDict['toReschedule'].append( ftsFile ) # identify Finished 
transfer for which the replica is still missing for ftsFile in [f for f in ftsFiles if f.Status == 'Finished' and f.TargetSE in missingReplicas.get( f.LFN, [] ) and f not in ftsFilesDict['toRegister'] ]: # Check if there is a registration operation for that file and that target regOp = [op for op in request if op.Type == 'RegisterReplica' and op.TargetSE == ftsFile.TargetSE and [f for f in op if f.LFN == ftsFile.LFN]] if not regOp: ftsFilesDict['toReschedule'].append( ftsFile ) # Recover files that are Failed but were not spotted for ftsFile in [f for f in ftsFiles if f.Status == 'Failed' and f.TargetSE in missingReplicas.get( f.LFN, [] )]: reschedule, submit, fail = self.__checkFailed( ftsFile ) if fail and ftsFile not in ftsFilesDict['toFail']: ftsFilesDict['toFail'].append( ftsFile ) elif reschedule and ftsFile not in ftsFilesDict['toReschedule']: ftsFilesDict['toReschedule'].append( ftsFile ) elif submit and ftsFile not in ftsFilesDict['toSubmit']: ftsFilesDict['toSubmit'].append( ftsFile ) # If all transfers are finished for unregistered files and there is already a registration operation, set it Done ftsLFNs = [f.LFN for f in ftsFiles] for lfn in missingReplicas: # We make sure here that the file is being processed by FTS if lfn in ftsLFNs: if not [f for f in ftsFiles if f.LFN == lfn and ( f.Status != 'Finished' or f in ftsFilesDict['toReschedule'] or f in ftsFilesDict['toRegister'] )]: for opFile in operation: if opFile.LFN == lfn: opFile.Status = 'Done' break else: # Temporary log log.warn( "File with missing replica not in FTS files", lfn ) for key, ftsFiles in ftsFilesDict.iteritems(): if ftsFiles: log.info( " => %d FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) ) toFail = ftsFilesDict.get( "toFail", [] ) toReschedule = ftsFilesDict.get( "toReschedule", [] ) toSubmit = ftsFilesDict.get( "toSubmit", [] ) toRegister = ftsFilesDict.get( "toRegister", [] ) toUpdate = ftsFilesDict.get( "toUpdate", [] ) # # PHASE TWO = Failed files? 
-> make request Failed and return if toFail: log.error( "==> found %d 'Failed' FTSFiles, but maybe other files can be processed..." % len( toFail ) ) for opFile in operation: for ftsFile in toFail: if opFile.FileID == ftsFile.FileID: opFile.Error = ftsFile.Error opFile.Status = "Failed" operation.Error = "%s files are missing any replicas" % len( toFail ) # # requets.Status should be Failed if all files in the operation "Failed" if request.Status == "Failed": request.Error = "ReplicateAndRegister %s failed" % operation.Order log.error( "request is set to 'Failed'" ) # # putRequest is done by the finally: clause... Not good to do it twice raise EscapeTryException # # PHASE THREE - update Waiting#TargetSE FTSFiles if toUpdate: log.info( "==> found %s possible FTSFiles to update..." % ( len( toUpdate ) ) ) byTarget = {} for ftsFile in toUpdate: byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile.FileID ) for targetSE, fileIDList in byTarget.iteritems(): update = self.ftsClient().setFTSFilesWaiting( operation.OperationID, targetSE, fileIDList ) if not update["OK"]: log.error( "update FTSFiles failed:", update["Message"] ) # # PHASE FOUR - add 'RegisterReplica' Operations if toRegister: log.info( "==> found %d Files waiting for registration, adding 'RegisterReplica' operations" % len( toRegister ) ) registerFiles = self.__insertRegisterOperation( request, operation, toRegister ) if not registerFiles["OK"]: log.error( "unable to create 'RegisterReplica' operations:", registerFiles["Message"] ) # if request.Status == "Waiting": # log.info( "request is in 'Waiting' state, will put it back to RMS" ) # return self.putRequest( request ) # # PHASE FIVE - reschedule operation files if toReschedule: log.info( "==> found %s Files to reschedule" % len( toReschedule ) ) rescheduleFiles = self.__reschedule( request, operation, toReschedule ) if not rescheduleFiles["OK"]: log.error( 'Failed to reschedule files', rescheduleFiles["Message"] ) # # PHASE SIX - read Waiting 
ftsFiles and submit new FTSJobs. We get also Failed files to recover them if needed ftsFiles = self.ftsClient().getFTSFilesForRequest( request.RequestID, [ "Waiting", "Failed", 'Submitted', 'Canceled' ] ) if not ftsFiles["OK"]: log.error( ftsFiles["Message"] ) else: retryIds = set ( [ ftsFile.FTSFileID for ftsFile in toSubmit ] ) for ftsFile in ftsFiles["Value"]: if ftsFile.FTSFileID not in retryIds: if ftsFile.Status in ( 'Failed', 'Canceled' ): # If the file was not unrecoverable failed and is not yet set toSubmit _reschedule, submit, _fail = self.__checkFailed( ftsFile ) elif ftsFile.Status == 'Submitted': if ftsFile.FTSGUID not in [job.FTSGUID for job in ftsJobs]: log.warn( 'FTS GUID %s not found in FTS jobs, resubmit file transfer' % ftsFile.FTSGUID ) ftsFile.Status = 'Waiting' submit = True else: submit = False else: submit = True if submit: toSubmit.append( ftsFile ) retryIds.add( ftsFile.FTSFileID ) # # should not put back jobs that have not been monitored this time ftsJobs = jobsToMonitor # # submit new ftsJobs if toSubmit: if request.Status != 'Scheduled': log.info( "Found %d FTSFiles to submit while request is no longer in Scheduled status (%s)" \ % ( len( toSubmit ), request.Status ) ) else: self.__checkDuplicates( request.RequestID, toSubmit ) log.info( "==> found %s FTSFiles to submit" % len( toSubmit ) ) submit = self.__submit( request, operation, toSubmit ) if not submit["OK"]: log.error( submit["Message"] ) else: ftsJobs += submit["Value"] # # status change? 
- put back request if request.Status != "Scheduled": log.info( "request no longer in 'Scheduled' state (%s), will put it back to RMS" % request.Status ) except EscapeTryException: # This clause is raised when one wants to return from within the try: clause # only put back jobs that were monitored ftsJobs = jobsToMonitor except Exception as exceptMessage: log.exception( "Exception in processRequest", lException = exceptMessage ) finally: putRequest = self.putRequest( request, clearCache = ( request.Status != "Scheduled" ) ) if not putRequest["OK"]: log.error( "unable to put back request:", putRequest["Message"] ) # # put back jobs in all cases if ftsJobs: for ftsJob in list( ftsJobs ): if not len( ftsJob ): log.warn( 'FTS job empty, removed: %s' % ftsJob.FTSGUID ) self.ftsClient().deleteFTSJob( ftsJob.FTSJobID ) ftsJobs.remove( ftsJob ) putJobs = self.putFTSJobs( ftsJobs ) if not putJobs["OK"]: log.error( "unable to put back FTSJobs:", putJobs["Message"] ) putRequest = putJobs # This is where one returns from after execution of the finally: block return putRequest def __checkDuplicates( self, reqID, toSubmit ): """ Check in a list of FTSFiles whether there are duplicates """ tupleList = [] log = self.log.getSubLogger( "%s/checkDuplicates" % reqID ) for ftsFile in list( toSubmit ): fTuple = ( ftsFile.LFN, ftsFile.SourceSE, ftsFile.TargetSE ) if fTuple in tupleList: log.warn( "Duplicate file to submit, removed:", ', '.join( fTuple ) ) toSubmit.remove( ftsFile ) self.ftsClient().deleteFTSFiles( ftsFile.OperationID, [ftsFile.FileID] ) else: tupleList.append( fTuple ) def __reschedule( self, request, operation, toReschedule ): """ reschedule list of :toReschedule: files in request for operation :operation: :param Request request: :param Operation operation: :param list toReschedule: list of FTSFiles """ log = self.log.getSubLogger( "req_%s/%s/reschedule" % ( request.RequestID, request.RequestName ) ) ftsFileIDs = [ftsFile.FileID for ftsFile in toReschedule] for opFile in 
operation: if opFile.FileID in ftsFileIDs: opFile.Status = "Waiting" toSchedule = [] # # filter files for opFile in [ opFile for opFile in operation if opFile.Status == "Waiting" ]: replicas = self.__filterReplicas( opFile ) if not replicas["OK"]: continue replicas = replicas["Value"] validReplicas = replicas["Valid"] noMetaReplicas = replicas["NoMetadata"] noReplicas = replicas["NoReplicas"] badReplicas = replicas['Bad'] if validReplicas: validTargets = list( set( operation.targetSEList ) - set( validReplicas ) ) if not validTargets: log.info( "file %s is already present at all targets" % opFile.LFN ) opFile.Status = "Done" else: toSchedule.append( ( opFile.toJSON()["Value"], validReplicas, validTargets ) ) elif noMetaReplicas: log.warn( "unable to schedule '%s', couldn't get metadata at %s" % ( opFile.LFN, ','.join( noMetaReplicas ) ) ) elif noReplicas: log.warn( "unable to schedule %s, file doesn't exist at %s" % ( opFile.LFN, ','.join( noReplicas ) ) ) opFile.Status = 'Failed' elif badReplicas: log.warn( "unable to schedule %s, all replicas have a bad checksum at %s" % ( opFile.LFN, ','.join( badReplicas ) ) ) opFile.Status = 'Failed' # # do real schedule here if toSchedule: log.info( "Rescheduling %d files" % len( toReschedule ) ) ftsSchedule = self.ftsClient().ftsSchedule( request.RequestID, operation.OperationID, toSchedule ) if not ftsSchedule["OK"]: log.error( "Error scheduling files", ftsSchedule["Message"] ) return ftsSchedule ftsSchedule = ftsSchedule["Value"] for opFile in operation: fileID = opFile.FileID if fileID in ftsSchedule["Successful"]: opFile.Status = "Scheduled" elif fileID in ftsSchedule["Failed"]: opFile.Error = ftsSchedule["Failed"][fileID] log.error( "Error scheduling file %s" % opFile.LFN, opFile.Error ) return S_OK() def __submit( self, request, operation, toSubmit ): """ create and submit new FTSJobs using list of FTSFiles :param Request request: ReqDB.Request instance :param list ftsFiles: list of FTSFile instances :return: [ FTSJob, 
FTSJob, ...] """ log = self.log.getSubLogger( "req_%s/%s/submit" % ( request.RequestID, request.RequestName ) ) bySourceAndTarget = {} for ftsFile in toSubmit: if ftsFile.SourceSE not in bySourceAndTarget: bySourceAndTarget.setdefault( ftsFile.SourceSE, {} ) if ftsFile.TargetSE not in bySourceAndTarget[ftsFile.SourceSE]: bySourceAndTarget[ftsFile.SourceSE].setdefault( ftsFile.TargetSE, [] ) bySourceAndTarget[ftsFile.SourceSE][ftsFile.TargetSE].append( ftsFile ) ftsJobs = [] for source, targetDict in bySourceAndTarget.iteritems(): for target, ftsFileList in targetDict.iteritems(): log.info( "found %s files to submit from %s to %s" % ( len( ftsFileList ), source, target ) ) route = self.__ftsPlacement.findRoute( source, target ) if not route["OK"]: log.error( route["Message"] ) continue route = route["Value"] routeValid = self.__ftsPlacement.isRouteValid( route ) if not routeValid['OK']: log.error( "Route invalid : %s" % routeValid['Message'] ) continue sourceSE = StorageElement( source ) sourceToken = sourceSE.getStorageParameters( protocol = 'srm' ) if not sourceToken["OK"]: log.error( "unable to get sourceSE parameters:", "(%s) %s" % ( source, sourceToken["Message"] ) ) continue seStatus = sourceSE.getStatus()['Value'] targetSE = StorageElement( target ) targetToken = targetSE.getStorageParameters( protocol = 'srm' ) if not targetToken["OK"]: log.error( "unable to get targetSE parameters:", "(%s) %s" % ( target, targetToken["Message"] ) ) continue # # create FTSJob for fileList in breakListIntoChunks( ftsFileList, self.MAX_FILES_PER_JOB ): ftsJob = FTSJob() ftsJob.RequestID = request.RequestID ftsJob.OperationID = operation.OperationID ftsJob.SourceSE = source ftsJob.TargetSE = target ftsJob.SourceToken = sourceToken["Value"].get( "SpaceToken", "" ) ftsJob.TargetToken = targetToken["Value"].get( "SpaceToken", "" ) ftsJob.FTSServer = route.ftsServer for ftsFile in fileList: ftsFile.Attempt += 1 ftsFile.Error = "" ftsJob.addFile( ftsFile ) submit = ftsJob.submitFTS( 
self.__ftsVersion, command = self.SUBMIT_COMMAND, pinTime = self.PIN_TIME if seStatus['TapeSE'] else 0 ) if not submit["OK"]: log.error( "unable to submit FTSJob:", submit["Message"] ) continue log.info( "FTSJob '%s'@'%s' has been submitted" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) ) # # update statuses for job files for ftsFile in ftsJob: ftsFile.FTSGUID = ftsJob.FTSGUID ftsFile.Status = "Submitted" ftsFile.Attempt += 1 # # update placement route try: self.updateLock().acquire() self.__ftsPlacement.startTransferOnRoute( route ) finally: self.updateLock().release() ftsJobs.append( ftsJob ) log.info( "%s new FTSJobs have been submitted" % len( ftsJobs ) ) return S_OK( ftsJobs ) def __monitorJob( self, request, ftsJob ): """ execute FTSJob.monitorFTS for a given :ftsJob: if ftsJob is in a final state, finalize it :param Request request: ReqDB.Request instance :param FTSJob ftsJob: FTSDB.FTSJob instance """ log = self.log.getSubLogger( "req_%s/%s/monitor/%s" % ( request.RequestID, request.RequestName, ftsJob.FTSGUID ) ) log.info( "FTSJob '%s'@'%s'" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) ) # # this will be returned ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ) monitor = ftsJob.monitorFTS( self.__ftsVersion , command = self.MONITOR_COMMAND ) if not monitor["OK"]: gMonitor.addMark( "FTSMonitorFail", 1 ) log.error( monitor["Message"] ) if "getTransferJobSummary2: Not authorised to query request" in monitor["Message"] or \ 'was not found' in monitor['Message'] or\ "Not found" in monitor['Message'] or\ 'Unknown transfer state' in monitor['Message']: log.error( "FTSJob not known (expired on server?): delete it" ) for ftsFile in ftsJob: ftsFile.Status = "Waiting" ftsFilesDict["toSubmit"].append( ftsFile ) # # No way further for that job: delete it res = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID ) if not res['OK']: log.error( "Unable to delete FTSJob", res['Message'] ) return S_OK( ftsFilesDict ) return 
def __finalizeFTSJob( self, request, ftsJob ):
  """ finalize a single FTSJob that reached a final FTS state

  :param Request request: ReqDB.Request instance owning this job
  :param FTSJob ftsJob: FTSDB.FTSJob instance to finalize
  :return: S_OK( ftsFilesDict ) with files split into
           toRegister/toSubmit/toFail/toReschedule/toUpdate, or S_ERROR
  """
  log = self.log.getSubLogger( "req_%s/%s/monitor/%s/finalize" % ( request.RequestID,
                                                                   request.RequestName,
                                                                   ftsJob.FTSJobID ) )
  log.info( "finalizing FTSJob %s@%s" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )
  # # this will be returned
  ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) )
  # # full monitor to get the per-file states before sorting them out
  monitor = ftsJob.monitorFTS( self.__ftsVersion, command = self.MONITOR_COMMAND, full = True )
  if not monitor["OK"]:
    log.error( monitor["Message"] )
    return monitor
  # # split FTSFiles to different categories
  processFiles = self.__filterFiles( ftsJob )
  if not processFiles["OK"]:
    log.error( processFiles["Message"] )
    return processFiles
  processFiles = processFiles['Value']
  if processFiles['toRegister']:
    log.error( "Some files could not be registered in FC:", len( processFiles['toRegister'] ) )
  ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, processFiles )
  # # send accounting record for this job
  self.__sendAccounting( ftsJob, request.OwnerDN )
  # # update placement - remove this job from placement
  route = self.__ftsPlacement.findRoute( ftsJob.SourceSE, ftsJob.TargetSE )
  if route["OK"]:
    # hold the lock while mutating the shared placement structure
    try:
      self.updateLock().acquire()
      self.__ftsPlacement.finishTransferOnRoute( route['Value'] )
    finally:
      self.updateLock().release()
  log.info( "FTSJob is finalized" )
  return S_OK( ftsFilesDict )

def __checkFailed( self, ftsFile ):
  """ decide what to do with a single non-finished FTSFile

  :param FTSFile ftsFile: file to inspect (reads Status, Error, Attempt)
  :return: tuple ( reschedule, submit, fail ) of booleans; at most one is True
  """
  reschedule = False
  submit = False
  fail = False
  if ftsFile.Status in ( "Failed", 'Canceled' ):
    if ftsFile.Error == "MissingSource":
      # source replica is gone: back to the scheduler to pick another source
      reschedule = True
    else:
      if ftsFile.Attempt < self.MAX_ATTEMPT:
        submit = True
      else:
        # transfer attempts exhausted
        fail = True
  return reschedule, submit, fail

def __filterFiles( self, ftsJob ):
  """ process ftsFiles from a finished ftsJob, splitting them by next action

  :param FTSJob ftsJob: monitored FTSJob instance (iterable over its FTSFiles)
  :return: S_OK( dict ) with keys toUpdate/toSubmit/toRegister/toReschedule/toFail
  """
  # # lists for different categories
  toUpdate = []
  toReschedule = []
  toRegister = []
  toSubmit = []
  toFail = []
  # # loop over files in fts job
  for ftsFile in ftsJob:
    # # successful files
    if ftsFile.Status == "Finished":
      if ftsFile.Error == "AddCatalogReplicaFailed":
        # transferred OK but catalog registration failed: needs a RegisterReplica op
        toRegister.append( ftsFile )
      toUpdate.append( ftsFile )
      continue
    reschedule, submit, fail = self.__checkFailed( ftsFile )
    if reschedule:
      toReschedule.append( ftsFile )
    elif submit:
      toSubmit.append( ftsFile )
    elif fail:
      toFail.append( ftsFile )
  return S_OK( { "toUpdate": toUpdate,
                 "toSubmit": toSubmit,
                 "toRegister": toRegister,
                 "toReschedule": toReschedule,
                 "toFail": toFail } )

def __insertRegisterOperation( self, request, operation, toRegister ):
  """ add one 'RegisterReplica' operation per target SE, before :operation:

  :param Request request: request instance
  :param Operation operation: 'ReplicateAndRegister' operation for this FTSJob
  :param list toRegister: [ FTSDB.FTSFile, ... ] - files that failed to register
  :return: S_OK()
  """
  log = self.log.getSubLogger( "req_%s/%s/registerFiles" % ( request.RequestID, request.RequestName ) )
  byTarget = {}
  for ftsFile in toRegister:
    # setdefault both inserts the empty list and returns it - no need for a prior membership test
    byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile )
  log.info( "will create %s 'RegisterReplica' operations" % len( byTarget ) )
  # iteritems() is Python-2-only; items() behaves the same here
  for target, ftsFileList in byTarget.items():
    log.info( "creating 'RegisterReplica' operation for targetSE %s with %s files..." % ( target,
                                                                                          len( ftsFileList ) ) )
    registerOperation = Operation()
    registerOperation.Type = "RegisterReplica"
    registerOperation.Status = "Waiting"
    registerOperation.TargetSE = target
    targetSE = StorageElement( target )
    for ftsFile in ftsFileList:
      opFile = File()
      opFile.LFN = ftsFile.LFN
      pfn = returnSingleResult( targetSE.getURL( ftsFile.LFN, protocol = self.registrationProtocols ) )
      if not pfn["OK"]:
        # no PFN -> file silently left out of the registration operation
        continue
      opFile.PFN = pfn["Value"]
      registerOperation.addFile( opFile )
    request.insertBefore( registerOperation, operation )
  return S_OK()

@staticmethod
def __sendAccounting( ftsJob, ownerDN ):
  """ prepare and send a DataOperation record to the AccountingDB

  :param FTSJob ftsJob: finished job to account
  :param str ownerDN: DN of the request owner (mapped to a username if possible)
  """
  dataOp = DataOperation()
  dataOp.setStartTime( fromString( ftsJob.SubmitTime ) )
  dataOp.setEndTime( fromString( ftsJob.LastUpdate ) )
  accountingDict = dict()
  accountingDict["OperationType"] = "ReplicateAndRegister"
  username = getUsernameForDN( ownerDN )
  if not username["OK"]:
    # fall back to the raw DN when the mapping is unknown
    username = ownerDN
  else:
    username = username["Value"]
  accountingDict["User"] = username
  accountingDict["Protocol"] = "FTS3" if 'fts3' in ftsJob.FTSServer.lower() else 'FTS'
  accountingDict['ExecutionSite'] = ftsJob.FTSServer
  accountingDict['RegistrationTime'] = ftsJob._regTime
  accountingDict['RegistrationOK'] = ftsJob._regSuccess
  accountingDict['RegistrationTotal'] = ftsJob._regTotal
  accountingDict["TransferOK"] = len( [ f for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES ] )
  accountingDict["TransferTotal"] = len( ftsJob )
  accountingDict["TransferSize"] = ftsJob.Size - ftsJob.FailedSize
  accountingDict["FinalStatus"] = ftsJob.Status
  accountingDict["Source"] = ftsJob.SourceSE
  accountingDict["Destination"] = ftsJob.TargetSE
  # sum of per-file durations, not wall-clock job time
  accountingDict['TransferTime'] = sum( int( f._duration ) for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES )
  dataOp.setValuesFromDict( accountingDict )
  dataOp.commit()

def __checkReadyReplicas( self, request, operation ):
  """ check ready replicas for a transfer operation and flip fully-replicated files to 'Done'

  :param Request request: request being processed
  :param Operation operation: 'ReplicateAndRegister' operation
  :return: S_OK( { LFN: [ missing targetSE, ... ] } ) or S_ERROR
  """
  log = self.log.getSubLogger( "req_%s/%s/checkReadyReplicas" % ( request.RequestID, request.RequestName ) )
  targetSESet = set( operation.targetSEList )
  # # { LFN: [ targetSE, ... ] }
  missingReplicas = {}
  scheduledFiles = dict( ( opFile.LFN, opFile ) for opFile in operation
                         if opFile.Status in ( "Scheduled", "Waiting" ) )
  # # get replicas
  replicas = FileCatalog().getReplicas( scheduledFiles.keys() )
  if not replicas["OK"]:
    self.log.error( replicas["Message"] )
    return replicas
  replicas = replicas["Value"]
  fullyReplicated = 0
  missingSEs = {}
  for successfulLFN in replicas["Successful"]:
    reps = set( replicas['Successful'][successfulLFN] )
    if targetSESet.issubset( reps ):
      log.verbose( "%s has been replicated to all targets" % successfulLFN )
      fullyReplicated += 1
      scheduledFiles[successfulLFN].Status = "Done"
    else:
      missingReplicas[successfulLFN] = sorted( targetSESet - reps )
      ses = ",".join( missingReplicas[ successfulLFN ] )
      missingSEs[ses] = missingSEs.get( ses, 0 ) + 1
      log.verbose( "%s is still missing at %s" % ( successfulLFN, ses ) )
  if fullyReplicated:
    log.info( "%d new files have been replicated to all targets" % fullyReplicated )
  if missingSEs:
    for ses in missingSEs:
      log.info( "%d replicas still missing at %s" % ( missingSEs[ses], ses ) )
  reMissing = re.compile( "no such file or directory" )
  # iteritems() is Python-2-only; items() behaves the same here
  for failedLFN, errStr in replicas["Failed"].items():
    scheduledFiles[failedLFN].Error = errStr
    if reMissing.search( errStr.lower() ):
      log.error( "%s is missing, setting its status to 'Failed'" % failedLFN )
      scheduledFiles[failedLFN].Status = "Failed"
    else:
      log.warn( "unable to read replicas for %s: %s" % ( failedLFN, errStr ) )
  return S_OK( missingReplicas )

def __filterReplicas( self, opFile ):
  """ filter out banned/invalid source SEs for :opFile: """
  # local import to avoid a module-level import cycle with ReplicateAndRegister
  from DIRAC.DataManagementSystem.Agent.RequestOperations.ReplicateAndRegister import filterReplicas
  return filterReplicas( opFile, logger = self.log, dataManager = self.dataManager )
class Service:
  """ DISET service container: wraps a request handler class, its thread pool,
      message broker and monitoring, and drives its initialization.
  """

  # map of transport-level action names to the handler-method prefix implementing them;
  # a value that is itself a key (e.g. 'Connection' -> 'Message') is a meta action
  SVC_VALID_ACTIONS = { 'RPC' : 'export',
                        'FileTransfer': 'transfer',
                        'Message' : 'msg',
                        'Connection' : 'Message' }
  # shared security-log client for all services in this process
  SVC_SECLOG_CLIENT = SecurityLogClient()

  def __init__( self, serviceData ):
    """ c'tor

    :param dict serviceData: loader info ('loadName', 'modName', 'standalone', ...)
    """
    self._svcData = serviceData
    self._name = serviceData[ 'loadName' ]
    self._startTime = Time.dateTime()
    self._validNames = [ serviceData[ 'modName' ] ]
    if serviceData[ 'loadName' ] not in self._validNames:
      self._validNames.append( serviceData[ 'loadName' ] )
    self._cfg = ServiceConfiguration( list( self._validNames ) )
    if serviceData[ 'standalone' ]:
      self._monitor = gMonitor
    else:
      # embedded services get their own monitoring client
      self._monitor = MonitoringClient()
    self.__monitorLastStatsUpdate = time.time()
    self._stats = { 'queries' : 0, 'connections' : 0 }
    self._authMgr = AuthManager( "%s/Authorization" % PathFinder.getServiceSection( serviceData[ 'loadName' ] ) )
    self._transportPool = getGlobalTransportPool()
    self.__cloneId = 0
    self.__maxFD = 0

  def setCloneProcessId( self, cloneId ):
    """ tag this (forked) service clone in the monitoring with its clone id """
    self.__cloneId = cloneId
    self._monitor.setComponentName( "%s-Clone:%s" % ( self._name, cloneId ) )

  def _isMetaAction( self, action ):
    """ return the referred action name if :action: is a meta action, False otherwise """
    referedAction = Service.SVC_VALID_ACTIONS[ action ]
    if referedAction in Service.SVC_VALID_ACTIONS:
      return referedAction
    return False

  def initialize( self ):
    """ build the service URL, load the handler, create thread pool and message
        broker, run the handler initialization functions and load the actions.

    :return: S_OK() / S_ERROR( reason )
    """
    # Build the URLs
    self._url = self._cfg.getURL()
    if not self._url:
      return S_ERROR( "Could not build service URL for %s" % self._name )
    gLogger.verbose( "Service URL is %s" % self._url )
    # Load handler
    result = self._loadHandlerInit()
    if not result[ 'OK' ]:
      return result
    self._handler = result[ 'Value' ]
    # Initialize lock manager
    self._lockManager = LockManager( self._cfg.getMaxWaitingPetitions() )
    self._initMonitoring()
    self._threadPool = ThreadPool( 1, max( 0, self._cfg.getMaxThreads() ), self._cfg.getMaxWaitingPetitions() )
    self._threadPool.daemonize()
    self._msgBroker = MessageBroker( "%sMSB" % self._name, threadPool = self._threadPool )
    # Create static dict
    self._serviceInfoDict = { 'serviceName' : self._name,
                              'serviceSectionPath' : PathFinder.getServiceSection( self._name ),
                              'URL' : self._cfg.getURL(),
                              'messageSender' : MessageSender( self._name, self._msgBroker ),
                              'validNames' : self._validNames,
                              'csPaths' : [ PathFinder.getServiceSection( svcName ) for svcName in self._validNames ] }
    # Call static initialization function
    try:
      self._handler[ 'class' ]._rh__initializeClass( dict( self._serviceInfoDict ),
                                                     self._lockManager,
                                                     self._msgBroker,
                                                     self._monitor )
      if self._handler[ 'init' ]:
        for initFunc in self._handler[ 'init' ]:
          gLogger.verbose( "Executing initialization function" )
          try:
            result = initFunc( dict( self._serviceInfoDict ) )
          # 'except E as e' replaces the Python-2-only 'except E, e' comma syntax
          except Exception as excp:
            gLogger.exception( "Exception while calling initialization function" )
            return S_ERROR( "Exception while calling initialization function: %s" % str( excp ) )
          if not isReturnStructure( result ):
            return S_ERROR( "Service initialization function %s must return S_OK/S_ERROR" % initFunc )
          if not result[ 'OK' ]:
            return S_ERROR( "Error while initializing %s: %s" % ( self._name, result[ 'Message' ] ) )
    except Exception:
      errMsg = "Exception while initializing %s" % self._name
      gLogger.exception( errMsg )
      return S_ERROR( errMsg )
    # Load actions after the handler has initialized itself
    result = self._loadActions()
    if not result[ 'OK' ]:
      return result
    self._actions = result[ 'Value' ]
    gThreadScheduler.addPeriodicTask( 30, self.__reportThreadPoolContents )
    return S_OK()
class OutputDataExecutor:
    """Transfers files from configured input paths/catalogs to output SEs and
    catalogs, using a thread pool; transfer definitions come from the CS under
    ``<csPath>`` (or ``/Operations/<vo>/OutputData`` by default).
    """

    def __init__(self, csPath=""):
        """
        :param str csPath: CS section holding the transfer definitions; empty
                           selects the VO default section
        """
        self.log = gLogger.getSubLogger("OutputDataExecutor")
        if not csPath:
            vo = gConfig.getValue("/DIRAC/VirtualOrganization", "")
            self.__transfersCSPath = '/Operations/%s/OutputData' % vo
        else:
            self.__transfersCSPath = csPath
        self.log.verbose("Reading transfer paths from %s" % self.__transfersCSPath)
        self.__requiredCSOptions = ['InputPath', 'InputFC', 'OutputPath', 'OutputFC', 'OutputSE']
        self.__threadPool = ThreadPool(
            gConfig.getValue("%s/MinTransfers" % self.__transfersCSPath, 1),
            gConfig.getValue("%s/MaxTransfers" % self.__transfersCSPath, 4),
            gConfig.getValue("%s/MaxQueuedTransfers" % self.__transfersCSPath, 100))
        self.__threadPool.daemonize()
        self.__processingFiles = set()
        self.__okTransferredFiles = 0
        self.__okTransferredBytes = 0
        self.__failedFiles = {}

    def getNumOKTransferredFiles(self):
        """Number of files successfully transferred so far."""
        return self.__okTransferredFiles

    def getNumOKTransferredBytes(self):
        """Total bytes successfully transferred so far."""
        return self.__okTransferredBytes

    def transfersPending(self):
        """True while the thread pool still has queued or running transfers."""
        return self.__threadPool.isWorking()

    def getDefinedTransferPaths(self):
        """Read transfer definitions from the CS.

        :return: S_OK({name: transferDict}); definitions missing a required
                 option are skipped with an error log
        """
        result = gConfig.getSections(self.__transfersCSPath)
        if not result['OK']:
            self.log.info('No Input/Output Pair defined in CS')
            return S_OK()
        pathList = result['Value']
        tPaths = {}
        for name in pathList:
            csPath = self.__transfersCSPath + '/%s' % name
            result = gConfig.getOptionsDict(csPath)
            if not result['OK']:
                continue
            transferDict = result['Value']
            ok = True
            for option in self.__requiredCSOptions:
                if option not in transferDict:
                    self.log.error('Missing Option %s in %s' % (option, csPath))
                    ok = False
                    break
            if not ok:
                continue
            tPaths[name] = transferDict
        return S_OK(tPaths)

    def getNumLocalOutgoingFiles(self):
        """Count files waiting on local disk across all LocalDisk transfer paths."""
        result = self.getDefinedTransferPaths()
        if not result['OK']:
            return 0
        localOutgoing = 0
        tPaths = result['Value']
        for name in tPaths:
            transferDict = tPaths[name]
            if 'LocalDisk' != transferDict['InputFC']:
                continue
            localOutgoing += len(self.getOutgoingFiles(transferDict))
        return localOutgoing

    def getOutgoingFiles(self, transferDict):
        """
        Get list of files to be processed from InputPath

        :param dict transferDict: one transfer definition (InputFC, InputPath, ...)
        :return: list of file names (empty on any error)
        """
        inputFCName = transferDict['InputFC']
        inputPath = transferDict['InputPath']
        if inputFCName == 'LocalDisk':
            files = []
            # best-effort listing: an unreadable/missing path yields an empty list
            try:
                for entry in os.listdir(inputPath):
                    if os.path.isfile(os.path.join(inputPath, entry)):
                        files.append(entry)
            except Exception:
                pass
            return files
        inputFC = FileCatalog([inputFCName])
        result = inputFC.listDirectory(inputPath, True)
        if not result['OK']:
            self.log.error(result['Message'])
            return []
        if inputPath not in result['Value']['Successful']:
            self.log.error(result['Value']['Failed'][inputPath])
            return []
        subDirs = result['Value']['Successful'][inputPath]['SubDirs']
        files = result['Value']['Successful'][inputPath]['Files']
        for subDir in subDirs:
            self.log.info('Ignoring subdirectory:', subDir)
        return files.keys()

    def checkForTransfers(self):
        """
        Check for transfers to do and start them
        """
        result = self.getDefinedTransferPaths()
        if not result['OK']:
            return result
        tPaths = result['Value']
        for name in tPaths:
            transferPath = tPaths[name]
            self.log.verbose("Checking %s transfer path" % name)
            filesToTransfer = self.getOutgoingFiles(tPaths[name])
            self.log.info("Transfer path %s has %d files" % (name, len(filesToTransfer)))
            ret = self.__addFilesToThreadPool(filesToTransfer, transferPath)
            if not ret['OK']:
                # The thread pool got full
                break

    def processAllPendingTransfers(self):
        """Block until every queued transfer has been processed."""
        self.__threadPool.processAllResults()

    @transferSync
    def __addFilesToThreadPool(self, files, transferDict):
        """Queue each not-yet-processing file for transfer; S_ERROR when the pool fills."""
        for fileName in files:
            fileName = os.path.basename(fileName)
            if fileName in self.__processingFiles:
                continue
            self.__processingFiles.add(fileName)
            # throttle enqueueing - presumably to avoid hammering the catalog; TODO confirm
            time.sleep(1)
            ret = self.__threadPool.generateJobAndQueueIt(
                self.__transferIfNotRegistered,
                args=(fileName, transferDict),
                oCallback=self.transferCallback,
                blocking=False)
            if not ret['OK']:
                # The thread pool got full
                return ret
        return S_OK()

    def __transferIfNotRegistered(self, file, transferDict):
        """Transfer *file*, or just clean it up from the input side when it is
        already registered in the output catalog."""
        result = self.isRegisteredInOutputCatalog(file, transferDict)
        if not result['OK']:
            self.log.error(result['Message'])
            return result
        # Already registered. Need to delete
        if result['Value']:
            self.log.info(
                "Transfer file %s is already registered in the output catalog" % file)
            # Delete
            filePath = os.path.join(transferDict['InputPath'], file)
            if transferDict['InputFC'] == 'LocalDisk':
                os.unlink(filePath)
            else:
                inputFC = FileCatalog([transferDict['InputFC']])
                replicaDict = inputFC.getReplicas(filePath)
                # BUGFIX: this branch previously referenced an undefined name
                # 'inFile' (NameError); the LFN here is filePath
                if not replicaDict['OK']:
                    self.log.error("Error deleting file", replicaDict['Message'])
                elif filePath not in replicaDict['Value']['Successful']:
                    self.log.error("Error deleting file",
                                   replicaDict['Value']['Failed'][filePath])
                else:
                    seList = replicaDict['Value']['Successful'][filePath].keys()
                    for se in seList:
                        se = StorageElement(se)
                        self.log.info('Removing from %s:' % se.name, filePath)
                        se.removeFile(filePath)
                    # remove the full LFN (not the bare basename) from the catalog
                    inputFC.removeFile(filePath)
            self.log.info("File %s deleted from %s" % (file, transferDict['InputFC']))
            self.__processingFiles.discard(file)
            return S_OK(file)
        # Do the transfer
        return self.__retrieveAndUploadFile(file, transferDict)

    def isRegisteredInOutputCatalog(self, file, transferDict):
        """S_OK(True) when the output catalog already has *file* registered at
        one of the configured output SEs."""
        fc = FileCatalog([transferDict['OutputFC']])
        lfn = os.path.join(transferDict['OutputPath'], os.path.basename(file))
        result = fc.getReplicas(lfn)
        if not result['OK']:
            return result
        if lfn not in result['Value']['Successful']:
            return S_OK(False)
        replicas = result['Value']['Successful'][lfn]
        for seName in List.fromChar(transferDict['OutputSE'], ","):
            if seName in replicas:
                self.log.verbose(
                    "Transfer file %s is already registered in %s SE" % (file, seName))
                return S_OK(True)
        return S_OK(False)

    def __retrieveAndUploadFile(self, file, outputDict):
        """
        Retrieve, Upload, and remove

        :param str file: file name (basename for catalogs, relative to InputPath)
        :param dict outputDict: transfer definition
        :return: S_OK(fileName) on success, S_ERROR(fileName) on any failure
        """
        fileName = file
        inputPath = outputDict['InputPath']
        inputFCName = outputDict['InputFC']
        inBytes = 0
        if inputFCName == 'LocalDisk':
            inFile = file
            file = os.path.join(inputPath, file)
        else:
            inputFC = FileCatalog([inputFCName])
            inFile = os.path.join(inputPath, file)
            replicaDict = inputFC.getReplicas(inFile)
            if not replicaDict['OK']:
                self.log.error(replicaDict['Message'])
                return S_ERROR(fileName)
            if inFile not in replicaDict['Value']['Successful']:
                self.log.error(replicaDict['Value']['Failed'][inFile])
                return S_ERROR(fileName)
            seList = replicaDict['Value']['Successful'][inFile].keys()
            inputSE = StorageElement(seList[0])
            self.log.info('Retrieving from %s:' % inputSE.name, inFile)
            # lcg_util binding prevent multithreading, use subprocess instead
            res = pythonCall(2 * 3600, inputSE.getFile, inFile)
            if not res['OK']:
                self.log.error(res['Message'])
                return S_ERROR(fileName)
            ret = res['Value']
            if not ret['OK']:
                self.log.error(ret['Message'])
                return S_ERROR(fileName)
            if inFile not in ret['Value']['Successful']:
                self.log.error(ret['Value']['Failed'][inFile])
                return S_ERROR(fileName)
        if os.path.isfile(file):
            inBytes = os.stat(file)[6]
        outputPath = outputDict['OutputPath']
        outputFCName = outputDict['OutputFC']
        replicaManager = ReplicaManager()
        outFile = os.path.join(outputPath, os.path.basename(file))
        transferOK = False
        for outputSEName in List.fromChar(outputDict['OutputSE'], ","):
            outputSE = StorageElement(outputSEName)
            self.log.info('Trying to upload to %s:' % outputSE.name, outFile)
            # lcg_util binding prevent multithreading, use subprocess instead
            result = pythonCall(2 * 3600, replicaManager.putAndRegister,
                                outFile, os.path.realpath(file), outputSE.name,
                                catalog=outputFCName)
            if result['OK'] and result['Value']['OK']:
                if outFile in result['Value']['Value']['Successful']:
                    transferOK = True
                    break
                else:
                    self.log.error(result['Value']['Value']['Failed'][outFile])
            else:
                if result['OK']:
                    self.log.error(result['Value']['Message'])
                else:
                    self.log.error(result['Message'])
        if not transferOK:
            return S_ERROR(fileName)
        if result['OK'] or not inputFCName == 'LocalDisk':
            os.unlink(file)
        if not result['OK']:
            # defensive (transferOK implies result['OK']); was logging the wrong
            # dict ('ret' instead of 'result')
            self.log.error(result['Message'])
            return S_ERROR(fileName)
        self.log.info("Finished transferring %s [%s bytes]" % (inFile, inBytes))
        self.__okTransferredFiles += 1
        self.__okTransferredBytes += inBytes
        if inputFCName == 'LocalDisk':
            return S_OK(fileName)
        # Now the file is on final SE/FC, remove from input SE/FC
        for se in seList:
            se = StorageElement(se)
            self.log.info('Removing from %s:' % se.name, inFile)
            se.removeFile(inFile)
        inputFC.removeFile(inFile)
        return S_OK(fileName)

    @transferSync
    def transferCallback(self, threadedJob, submitResult):
        """Thread-pool callback: track per-file failure counts and clear the
        in-flight marker for the file."""
        if not submitResult['OK']:
            fileName = submitResult['Message']
            if fileName not in self.__failedFiles:
                self.__failedFiles[fileName] = 0
            self.__failedFiles[fileName] += 1
        else:
            fileName = submitResult['Value']
            if fileName in self.__failedFiles:
                del self.__failedFiles[fileName]
        # Take out from processing files
        if fileName in self.__processingFiles:
            self.__processingFiles.discard(fileName)
class FTSAgent( AgentModule ):
  """ .. class:: FTSAgent

  Agent propagating Scheduled request to Done or Failed state in the FTS system.

  Requests and associated FTSJobs (and so FTSFiles) are kept in cache.
  """
  # # fts graph refresh in seconds
  FTSGRAPH_REFRESH = FTSHistoryView.INTERVAL / 2
  # # SE R/W access refresh in seconds
  RW_REFRESH = 600
  # # placeholder for max job per channel
  MAX_ACTIVE_JOBS = 50
  # # min threads
  MIN_THREADS = 1
  # # max threads
  MAX_THREADS = 10
  # # files per job
  MAX_FILES_PER_JOB = 100
  # # MAX FTS transfer per FTSFile
  MAX_ATTEMPT = 256
  # # stage flag
  STAGE_FILES = False
  # # replica manager
  __replicaManager = None
  # # placeholder for FTS client
  __ftsClient = None
  # # placeholder for request client
  __requestClient = None
  # # placeholder for resources helper
  __resources = None
  # # placeholder for RSS client
  __rssClient = None
  # # placeholder for FTSGraph
  __ftsGraph = None
  # # graph regeneration time delta
  __ftsGraphValidStamp = None
  # # r/w access valid stamp
  __rwAccessValidStamp = None
  # # placeholder for threadPool
  __threadPool = None
  # # update lock
  __updateLock = None
  # # se cache
  __seCache = dict()
  # # request cache
  __reqCache = dict()

  def updateLock( self ):
    """ update lock """
    if not self.__updateLock:
      self.__updateLock = LockRing().getLock( "FTSAgentLock" )
    return self.__updateLock

  @classmethod
  def requestClient( cls ):
    """ request client getter """
    if not cls.__requestClient:
      cls.__requestClient = ReqClient()
    return cls.__requestClient

  @classmethod
  def ftsClient( cls ):
    """ FTS client """
    if not cls.__ftsClient:
      cls.__ftsClient = FTSClient()
    return cls.__ftsClient

  @classmethod
  def replicaManager( cls ):
    """ replica manager getter """
    if not cls.__replicaManager:
      cls.__replicaManager = ReplicaManager()
    return cls.__replicaManager

  @classmethod
  def rssClient( cls ):
    """ RSS client getter """
    if not cls.__rssClient:
      cls.__rssClient = ResourceStatus()
    return cls.__rssClient

  @classmethod
  def getSE( cls, seName ):
    """ keep SEs in cache """
    if seName not in cls.__seCache:
      cls.__seCache[seName] = StorageElement( seName )
    return cls.__seCache[seName]

  @classmethod
  def getRequest( cls, reqName ):
    """ keep Requests in cache """
    if reqName not in cls.__reqCache:
      getRequest = cls.requestClient().getRequest( reqName )
      if not getRequest["OK"]:
        return getRequest
      getRequest = getRequest["Value"]
      if not getRequest:
        return S_ERROR( "request of name '%s' not found in ReqDB" % reqName )
      cls.__reqCache[reqName] = getRequest
    return S_OK( cls.__reqCache[reqName] )

  @classmethod
  def putRequest( cls, request ):
    """ put request back to ReqDB

    :param Request request: Request instance

    also finalize request if status == Done
    """
    # # put back request
    put = cls.requestClient().putRequest( request )
    if not put["OK"]:
      return put
    # # finalize first is possible
    if request.Status == "Done" and request.JobID:
      finalizeRequest = cls.requestClient().finalizeRequest( request.RequestName, request.JobID )
      if not finalizeRequest["OK"]:
        request.Status = "Scheduled"
    # # del request from cache
    if request.RequestName in cls.__reqCache:
      del cls.__reqCache[request.RequestName]
    return S_OK()

  @classmethod
  def putFTSJobs( cls, ftsJobsList ):
    """ put back fts jobs to the FTSDB """
    for ftsJob in ftsJobsList:
      put = cls.ftsClient().putFTSJob( ftsJob )
      if not put["OK"]:
        return put
    return S_OK()

  @staticmethod
  def updateFTSFileDict( ftsFilesDict, toUpdateDict ):
    """ update :ftsFilesDict: with FTSFiles in :toUpdateDict: """
    for category, ftsFileList in ftsFilesDict.items():
      for ftsFile in toUpdateDict.get( category, [] ):
        if ftsFile not in ftsFileList:
          ftsFileList.append( ftsFile )
    return ftsFilesDict

  def threadPool( self ):
    """ thread pool getter """
    if not self.__threadPool:
      self.__threadPool = ThreadPool( self.MIN_THREADS, self.MAX_THREADS )
      self.__threadPool.daemonize()
    return self.__threadPool

  def resetFTSGraph( self ):
    """ create fts graph """
    log = gLogger.getSubLogger( "ftsGraph" )
    ftsHistory = self.ftsClient().getFTSHistory()
    if not ftsHistory["OK"]:
      log.error( "unable to get FTS history: %s" % ftsHistory["Message"] )
      return ftsHistory
    ftsHistory = ftsHistory["Value"]
    try:
      self.updateLock().acquire()
      self.__ftsGraph = FTSGraph( "FTSGraph", ftsHistory )
    finally:
      self.updateLock().release()
    log.debug( "FTSSites: %s" % len( self.__ftsGraph.nodes() ) )
    for i, site in enumerate( self.__ftsGraph.nodes() ):
      log.debug( " [%02d] FTSSite: %-25s FTSServer: %s" % ( i, site.name, site.FTSServer ) )
    log.debug( "FTSRoutes: %s" % len( self.__ftsGraph.edges() ) )
    for i, route in enumerate( self.__ftsGraph.edges() ):
      log.debug( " [%02d] FTSRoute: %-25s Active FTSJobs (Max) = %s (%s)" % ( i,
                                                                              route.routeName,
                                                                              route.ActiveJobs,
                                                                              route.toNode.MaxActiveJobs ) )
    # # save graph stamp
    self.__ftsGraphValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH )
    # # refresh SE R/W access
    try:
      self.updateLock().acquire()
      self.__ftsGraph.updateRWAccess()
    finally:
      self.updateLock().release()
    # # save rw access stamp
    self.__rwAccessValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.RW_REFRESH )
    return S_OK()

  def initialize( self ):
    """ agent's initialization """
    log = self.log.getSubLogger( "initialize" )
    self.FTSGRAPH_REFRESH = self.am_getOption( "FTSGraphValidityPeriod", self.FTSGRAPH_REFRESH )
    log.info( "FTSGraph validity period = %s s" % self.FTSGRAPH_REFRESH )
    self.RW_REFRESH = self.am_getOption( "RWAccessValidityPeriod", self.RW_REFRESH )
    log.info( "SEs R/W access validity period = %s s" % self.RW_REFRESH )
    self.STAGE_FILES = self.am_getOption( "StageFiles", self.STAGE_FILES )
    log.info( "Stage files before submission = %s" % { True: "yes", False: "no" }[bool( self.STAGE_FILES )] )
    self.MAX_ACTIVE_JOBS = self.am_getOption( "MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS )
    log.info( "Max active FTSJobs/route = %s" % self.MAX_ACTIVE_JOBS )
    self.MAX_FILES_PER_JOB = self.am_getOption( "MaxFilesPerJob", self.MAX_FILES_PER_JOB )
    log.info( "Max FTSFiles/FTSJob = %d" % self.MAX_FILES_PER_JOB )
    self.MAX_ATTEMPT = self.am_getOption( "MaxTransferAttempts", self.MAX_ATTEMPT )
    log.info( "Max transfer attempts = %s" % self.MAX_ATTEMPT )
    # # thread pool
    self.MIN_THREADS = self.am_getOption( "MinThreads", self.MIN_THREADS )
    self.MAX_THREADS = self.am_getOption( "MaxThreads", self.MAX_THREADS )
    minmax = ( abs( self.MIN_THREADS ), abs( self.MAX_THREADS ) )
    self.MIN_THREADS, self.MAX_THREADS = min( minmax ), max( minmax )
    log.info( "ThreadPool min threads = %s" % self.MIN_THREADS )
    log.info( "ThreadPool max threads = %s" % self.MAX_THREADS )
    log.info( "initialize: creation of FTSGraph..." )
    createGraph = self.resetFTSGraph()
    if not createGraph["OK"]:
      log.error( "initialize: %s" % createGraph["Message"] )
      return createGraph
    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    log.info( "will use DataManager proxy" )
    # # gMonitor stuff here
    gMonitor.registerActivity( "RequestsAtt", "Attempted requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsOK", "Successful requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsFail", "Failed requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubAtt", "FTSJobs creation attempts",
                               "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubOK", "FTSJobs submitted successfully",
                               "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubFail", "FTSJobs submissions failed",
                               "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonAtt", "FTSJobs monitored",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonOK", "FTSJobs monitored successfully",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonFail", "FTSJobs attempts failed",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSMonitorFail", "Failed FTS monitor executions",
                               "FTSAgent", "Execution/mins", gMonitor.OP_SUM )
    pollingTime = self.am_getOption( "PollingTime", 60 )
    for status in list( FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES ):
      gMonitor.registerActivity( "FTSJobs%s" % status, "FTSJobs %s" % status,
                                 "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime )
    gMonitor.registerActivity( "FtSJobsPerRequest", "Average FTSJobs per request",
                               "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSFilesPerJob", "FTSFiles per FTSJob",
                               "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSSizePerJob", "Average FTSFiles size per FTSJob",
                               "FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN )
    return S_OK()

  def finalize( self ):
    """ finalize processing """
    log = self.log.getSubLogger( "finalize" )
    for request in self.__reqCache.values():
      put = self.requestClient().putRequest( request )
      if not put["OK"]:
        log.error( "unable to put back request '%s': %s" % ( request.RequestName, put["Message"] ) )
    return S_OK()

  def execute( self ):
    """ one cycle execution """
    log = gLogger.getSubLogger( "execute" )
    # # reset FTSGraph if expired
    now = datetime.datetime.now()
    if now > self.__ftsGraphValidStamp:
      log.info( "resetting expired FTS graph..." )
      resetFTSGraph = self.resetFTSGraph()
      if not resetFTSGraph["OK"]:
        log.error( "FTSGraph recreation error: %s" % resetFTSGraph["Message"] )
        return resetFTSGraph
      self.__ftsGraphValidStamp = now + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH )
    # # update R/W access in FTSGraph if expired
    if now > self.__rwAccessValidStamp:
      log.info( "updating expired R/W access for SEs..." )
      try:
        self.updateLock().acquire()
        self.__ftsGraph.updateRWAccess()
      finally:
        self.updateLock().release()
      self.__rwAccessValidStamp = now + datetime.timedelta( seconds = self.RW_REFRESH )
    requestNames = self.requestClient().getRequestNamesList( [ "Scheduled" ] )
    if not requestNames["OK"]:
      log.error( "unable to read scheduled request names: %s" % requestNames["Message"] )
      return requestNames
    if not requestNames["Value"]:
      requestNames = self.__reqCache.keys()
    else:
      requestNames = [ req[0] for req in requestNames["Value"] ]
      requestNames = list( set( requestNames + self.__reqCache.keys() ) )
    if not requestNames:
      log.info( "no 'Scheduled' requests to process" )
      return S_OK()
    log.info( "found %s requests to process:" % len( requestNames ) )
    log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) )
    log.info( " => new read from RMS: %s" % ( len( requestNames ) - len( self.__reqCache ) ) )
    for requestName in requestNames:
      request = self.getRequest( requestName )
      if not request["OK"]:
        log.error( request["Message"] )
        continue
      request = request["Value"]
      sTJId = request.RequestName
      # spin until the thread pool accepts the job
      while True:
        queue = self.threadPool().generateJobAndQueueIt( self.processRequest,
                                                         args = ( request, ),
                                                         sTJId = sTJId )
        if queue["OK"]:
          log.info( "request '%s' enqueued for execution" % sTJId )
          gMonitor.addMark( "RequestsAtt", 1 )
          break
        time.sleep( 1 )
    # # process all results
    self.threadPool().processAllResults()
    return S_OK()
= self.ftsClient().getFTSJobsForRequest(request.RequestID) if not ftsJobs["OK"]: log.error(ftsJobs["Message"]) return ftsJobs ftsJobs = ftsJobs["Value"] if ftsJobs["Value"] else [] # # dict keeping info about files to reschedule, submit, fail and register ftsFilesDict = dict([(k, list()) for k in ("toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate")]) if ftsJobs: log.info("==> found %s FTSJobs to monitor" % len(ftsJobs)) # # PHASE 0 = monitor active FTSJobs for ftsJob in ftsJobs: monitor = self.__monitorJob(request, ftsJob) if not monitor["OK"]: log.error("unable to monitor FTSJob %s: %s" % (ftsJob.FTSJobID, monitor["Message"])) ftsJob.Status = "Submitted" continue ftsFilesDict = self.updateFTSFileDict(ftsFilesDict, monitor["Value"]) log.info("monitoring of FTSJobs completed") for key, ftsFiles in ftsFilesDict.items(): if ftsFiles: log.debug(" => %s FTSFiles to %s" % (len(ftsFiles), key[2:].lower())) # # PHASE ONE - check ready replicas missingReplicas = self.__checkReadyReplicas(request, operation) if not missingReplicas["OK"]: log.error(missingReplicas["Message"]) else: missingReplicas = missingReplicas["Value"] for opFile in operation: # Actually the condition below should never happen... Change printout for checking if opFile.LFN not in missingReplicas and opFile.Status != 'Done': log.warn("Should be set! %s is replicated at all targets" % opFile.LFN) opFile.Status = "Done" toFail = ftsFilesDict.get("toFail", []) toReschedule = ftsFilesDict.get("toReschedule", []) toSubmit = ftsFilesDict.get("toSubmit", []) toRegister = ftsFilesDict.get("toRegister", []) toUpdate = ftsFilesDict.get("toUpdate", []) # # PHASE TWO = Failed files? -> make request Failed and return if toFail: log.error( "==> found %s 'Failed' FTSFiles, request execution cannot proceed..." 
% len(toFail)) for opFile in operation: for ftsFile in toFail: if opFile.FileID == ftsFile.FileID: opFile.Error = ftsFile.Error opFile.Status = "Failed" operation.Error = "%s files are missing any replicas" % len(toFail) # # requets.Status should be Failed at this stage "Failed" if request.Status == "Failed": request.Error = "ReplicateAndRegister %s failed" % operation.Order log.error("request is set to 'Failed'") return self.putRequest(request) # # PHASE THREE - update Waiting#SourceSE FTSFiles if toUpdate: log.info("==> found %s possible FTSFiles to update..." % (len(toUpdate))) byTarget = {} for ftsFile in toUpdate: if ftsFile.TargetSE not in byTarget: byTarget.setdefault(ftsFile.TargetSE, []) byTarget[ftsFile.TargetSE].append(ftsFile.FileID) for targetSE, fileIDList in byTarget.items(): update = self.ftsClient().setFTSFilesWaiting( operation.OperationID, targetSE, fileIDList) if not update["OK"]: log.error("update FTSFiles failed: %s" % update["Message"]) continue # # PHASE FOUR - add 'RegisterReplica' Operations if toRegister: log.info( "==> found %s Files waiting for registration, adding 'RegisterReplica' operations" ) registerFiles = self.__register(request, operation, toRegister) if not registerFiles["OK"]: log.error("unable to create 'RegisterReplica' operations: %s" % registerFiles["Message"]) if request.Status == "Waiting": log.info( "request is in 'Waiting' state, will put it back to RMS") return self.putRequest(request) # # PHASE FIVE - reschedule operation files if toReschedule: log.info("==> found %s Files to reschedule" % len(toReschedule)) rescheduleFiles = self.__reschedule(request, operation, toReschedule) if not rescheduleFiles["OK"]: log.error(rescheduleFiles["Message"]) if request.Status == "Waiting": log.info( "request is in 'Waiting' state, will put it back to ReqDB") return self.putRequest(request) # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs ftsFiles = self.ftsClient().getFTSFilesForRequest( request.RequestID, ["Waiting"]) 
if not ftsFiles["OK"]: log.error(ftsFiles["Message"]) else: retryIds = list(set([ftsFile.FTSFileID for ftsFile in toSubmit])) for ftsFile in ftsFiles["Value"]: if ftsFile.FTSFileID not in retryIds: toSubmit.append(ftsFile) retryIds.append(ftsFile.FTSFileID) # # submit new ftsJobs if operation.Status == "Scheduled" and toSubmit: log.info("==> found %s FTSFiles to submit" % len(toSubmit)) submit = self.__submit(request, operation, toSubmit) if not submit["OK"]: log.error(submit["Message"]) else: ftsJobs += submit["Value"] # # status change? - put back request if request.Status != "Scheduled": put = self.putRequest(request) if not put["OK"]: log.error("unable to put back request: %s" % put["Message"]) return put # # put back jobs if ftsJobs: putJobs = self.putFTSJobs(ftsJobs) if not putJobs["OK"]: log.error("unable to put back FTSJobs: %s" % putJobs["Message"]) return putJobs return S_OK() def __reschedule(self, request, operation, toReschedule): """ reschedule list of :toReschedule: files in request for operation :operation: :param Request request: :param Operation operation: :param list toReschedule: list of FTSFiles """ log = self.log.getSubLogger("%s/reschedule" % request.RequestName) log.info("found %s files to reschedule" % len(toReschedule)) for opFile in operation: for ftsFile in toReschedule: if opFile.FileID == ftsFile.FileID: opFile.Status = "Waiting" toSchedule = [] # # filter files for opFile in operation.getWaitingFilesList(): replicas = self.__filterReplicas(opFile) if not replicas["OK"]: continue replicas = replicas["Value"] if not replicas["Valid"] and replicas["Banned"]: log.warn( "unable to schedule '%s', replicas only at banned SEs" % opFile.LFN) continue validReplicas = replicas["Valid"] bannedReplicas = replicas["Banned"] if not validReplicas and bannedReplicas: log.warn( "unable to schedule '%s', replicas only at banned SEs" % opFile.LFN) continue if validReplicas: validTargets = list( set(operation.targetSEList) - set(validReplicas)) if not 
validTargets: log.info("file %s is already present at all targets" % opFile.LFN) opFile.Status = "Done" continue toSchedule.append( (opFile.toJSON()["Value"], validReplicas, validTargets)) # # do real schedule here if toSchedule: ftsSchedule = self.ftsClient().ftsSchedule(request.RequestID, operation.OperationID, toSchedule) if not ftsSchedule["OK"]: self.log.error(ftsSchedule["Message"]) return ftsSchedule ftsSchedule = ftsSchedule["Value"] for fileID in ftsSchedule["Successful"]: for opFile in operation: if fileID == opFile.FileID: opFile.Status = "Scheduled" for fileID, reason in ftsSchedule["Failed"]: for opFile in operation: if fileID == opFile.FileID: opFile.Error = reason return S_OK() def __submit(self, request, operation, toSubmit): """ create and submit new FTSJobs using list of FTSFiles :param Request request: ReqDB.Request instance :param list ftsFiles: list of FTSFile instances :return: [ FTSJob, FTSJob, ...] """ log = self.log.getSubLogger("%s/submit" % request.RequestName) bySourceAndTarget = {} for ftsFile in toSubmit: if ftsFile.SourceSE not in bySourceAndTarget: bySourceAndTarget.setdefault(ftsFile.SourceSE, {}) if ftsFile.TargetSE not in bySourceAndTarget[ftsFile.SourceSE]: bySourceAndTarget[ftsFile.SourceSE].setdefault( ftsFile.TargetSE, []) bySourceAndTarget[ftsFile.SourceSE][ftsFile.TargetSE].append( ftsFile) ftsJobs = [] for source, targetDict in bySourceAndTarget.items(): for target, ftsFileList in targetDict.items(): log.info("found %s files to submit from %s to %s" % (len(ftsFileList), source, target)) route = self.__ftsGraph.findRoute(source, target) if not route["OK"]: log.error(route["Message"]) continue route = route["Value"] sourceRead = route.fromNode.SEs[source]["read"] if not sourceRead: log.error("SourceSE %s is banned for reading right now" % source) continue targetWrite = route.toNode.SEs[target]["write"] if not targetWrite: log.error("TargetSE %s is banned for writing right now" % target) continue if route.ActiveJobs > 
route.toNode.MaxActiveJobs: log.warn( "unable to submit new FTS job, max active jobs reached" ) continue # # create FTSJob ftsJob = FTSJob() ftsJob.RequestID = request.RequestID ftsJob.OperationID = operation.OperationID ftsJob.SourceSE = source ftsJob.TargetSE = target sourceSE = self.getSE(source) sourceToken = sourceSE.getStorageParameters("SRM2") if not sourceToken["OK"]: log.error("unable to get sourceSE '%s' parameters: %s" % (source, sourceToken["Message"])) continue ftsJob.SourceToken = sourceToken["Value"].get("SpaceToken", "") targetSE = self.getSE(target) targetToken = targetSE.getStorageParameters("SRM2") if not targetToken["OK"]: log.error("unable to get targetSE '%s' parameters: %s" % (target, targetToken["Message"])) continue ftsJob.TargetToken = targetToken["Value"].get("SpaceToken", "") ftsJob.FTSServer = route.toNode.FTSServer for ftsFile in ftsFileList: ftsFile.Attempt += 1 ftsFile.Error = "" ftsJob.addFile(ftsFile) submit = ftsJob.submitFTS2(self.STAGE_FILES) if not submit["OK"]: log.error("unable to submit FTSJob: %s" % submit["Message"]) continue log.info("FTSJob '%s'@'%s' has been submitted" % (ftsJob.FTSGUID, ftsJob.FTSServer)) # # update statuses for job files for ftsFile in ftsJob: ftsFile.FTSGUID = ftsJob.FTSGUID ftsFile.Status = "Submitted" ftsFile.Attempt += 1 # # update graph route try: self.updateLock().acquire() route.ActiveJobs += 1 finally: self.updateLock().release() ftsJobs.append(ftsJob) log.info("%s new FTSJobs have been submitted" % len(ftsJobs)) return S_OK(ftsJobs) def __monitorJob(self, request, ftsJob): """ execute FTSJob.monitorFTS2 for a given :ftsJob: if ftsJob is in a final state, finalize it :param Request request: ReqDB.Request instance :param FTSJob ftsJob: FTSDB.FTSJob instance """ log = self.log.getSubLogger("%s/monitor/%s" % (request.RequestName, ftsJob.FTSGUID)) log.info("FTSJob '%s'@'%s'" % (ftsJob.FTSGUID, ftsJob.FTSServer)) # # this will be returned ftsFilesDict = dict([(k, list()) for k in ("toRegister", 
"toSubmit", "toFail", "toReschedule", "toUpdate")]) monitor = ftsJob.monitorFTS2() if not monitor["OK"]: gMonitor.addMark("FTSMonitorFail", 1) log.error(monitor["Message"]) if "getTransferJobSummary2: Not authorised to query request" in monitor[ "Message"]: log.error("FTSJob not known (expired on server?)") for ftsFile in ftsJob: ftsFile.Status = "Waiting" ftsFilesDict["toSubmit"] = ftsFile return S_OK(ftsFilesDict) return monitor monitor = monitor["Value"] log.info("FTSJob Status = %s Completeness = %s" % (ftsJob.Status, ftsJob.Completeness)) # # monitor status change gMonitor.addMark("FTSJobs%s" % ftsJob.Status, 1) if ftsJob.Status in FTSJob.FINALSTATES: finalizeFTSJob = self.__finalizeFTSJob(request, ftsJob) if not finalizeFTSJob["OK"]: log.error(finalizeFTSJob["Message"]) return finalizeFTSJob ftsFilesDict = self.updateFTSFileDict(ftsFilesDict, finalizeFTSJob["Value"]) return S_OK(ftsFilesDict) def __finalizeFTSJob(self, request, ftsJob): """ finalize FTSJob :param Request request: ReqDB.Request instance :param FTSJob ftsJob: FTSDB.FTSJob instance """ log = self.log.getSubLogger("%s/monitor/%s/finalize" % (request.RequestName, ftsJob.FTSJobID)) log.info("finalizing FTSJob %s@%s" % (ftsJob.FTSGUID, ftsJob.FTSServer)) # # this will be returned ftsFilesDict = dict([(k, list()) for k in ("toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate")]) monitor = ftsJob.monitorFTS2(full=True) if not monitor["OK"]: log.error(monitor["Message"]) return monitor # # split FTSFiles to different categories processFiles = self.__filterFiles(ftsJob) if not processFiles["OK"]: log.error(processFiles["Message"]) return processFiles ftsFilesDict = self.updateFTSFileDict(ftsFilesDict, processFiles["Value"]) # # send accounting record for this job self.__sendAccounting(ftsJob, request.OwnerDN) # # update graph - remove this job from graph route = self.__ftsGraph.findRoute(ftsJob.SourceSE, ftsJob.TargetSE) if route["OK"]: try: self.updateLock().acquire() route["Value"].ActiveJobs 
-= 1 finally: self.updateLock().release() log.info("FTSJob is finalized") return S_OK(ftsFilesDict) def __filterFiles(self, ftsJob): """ process ftsFiles from finished ftsJob :param FTSJob ftsJob: monitored FTSJob instance """ # # lists for different categories toUpdate = [] toReschedule = [] toRegister = [] toSubmit = [] toFail = [] # # loop over files in fts job for ftsFile in ftsJob: # # successful files if ftsFile.Status == "Finished": if ftsFile.Error == "AddCatalogReplicaFailed": toRegister.append(ftsFile) toUpdate.append(ftsFile) continue if ftsFile.Status == "Failed": if ftsFile.Error == "MissingSource": toReschedule.append(ftsFile) else: if ftsFile.Attempt < self.MAX_ATTEMPT: toSubmit.append(ftsFile) else: toFail.append(ftsFile) ftsFile.Error = "Max attempts reached" return S_OK({ "toUpdate": toUpdate, "toSubmit": toSubmit, "toRegister": toRegister, "toReschedule": toReschedule, "toFail": toFail }) def __register(self, request, operation, toRegister): """ add RegisterReplica operation :param Request request: request instance :param Operation transferOp: 'ReplicateAndRegister' operation for this FTSJob :param list toRegister: [ FTSDB.FTSFile, ... ] - files that failed to register """ log = self.log.getSubLogger("%s/registerFiles" % request.RequestName) byTarget = {} for ftsFile in toRegister: if ftsFile.TargetSE not in byTarget: byTarget.setdefault(ftsFile.TargetSE, []) byTarget[ftsFile.TargetSE].append(ftsFile) log.info("will create %s 'RegisterReplica' operations" % len(byTarget)) for target, ftsFileList in byTarget.items(): log.info( "creating 'RegisterReplica' operation for targetSE %s with %s files..." 
% (target, len(ftsFileList))) registerOperation = Operation() registerOperation.Type = "RegisterReplica" registerOperation.Status = "Waiting" registerOperation.TargetSE = target targetSE = self.getSE(target) for ftsFile in ftsFileList: opFile = File() opFile.LFN = ftsFile.LFN pfn = targetSE.getPfnForProtocol(ftsFile.TargetSURL, "SRM2", withPort=False) if not pfn["OK"]: continue opFile.PFN = pfn["Value"] registerOperation.addFile(opFile) request.insertBefore(registerOperation, operation) return S_OK() @staticmethod def __sendAccounting(ftsJob, ownerDN): """ prepare and send DataOperation to AccouringDB """ dataOp = DataOperation() dataOp.setStartTime(fromString(ftsJob.SubmitTime)) dataOp.setEndTime(fromString(ftsJob.LastUpdate)) accountingDict = dict() accountingDict["OperationType"] = "ReplicateAndRegister" username = getUsernameForDN(ownerDN) if not username["OK"]: username = ownerDN else: username = username["Value"] accountingDict["User"] = username accountingDict[ "Protocol"] = "FTS3" if 'fts3' in ftsJob.FTSServer.lower( ) else 'FTS' accountingDict['ExecutionSite'] = ftsJob.FTSServer accountingDict['RegistrationTime'] = ftsJob._regTime accountingDict['RegistrationOK'] = ftsJob._regSuccess accountingDict['RegistrationTotal'] = ftsJob._regTotal accountingDict["TransferOK"] = len( [f for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES]) accountingDict["TransferTotal"] = len(ftsJob) accountingDict["TransferSize"] = ftsJob.Size - ftsJob.FailedSize accountingDict["FinalStatus"] = ftsJob.Status accountingDict["Source"] = ftsJob.SourceSE accountingDict["Destination"] = ftsJob.TargetSE dt = ftsJob.LastUpdate - ftsJob.SubmitTime transferTime = dt.days * 86400 + dt.seconds accountingDict["TransferTime"] = transferTime # accountingDict['TransferTime'] = sum( [f._duration for f in ftsJob]) dataOp.setValuesFromDict(accountingDict) dataOp.commit() def __checkReadyReplicas(self, request, operation): """ check ready replicas for transferOperation """ log = 
self.log.getSubLogger("%s/checkReadyReplicas" % request.RequestName) targetSESet = set(operation.targetSEList) # # { LFN: [ targetSE, ... ] } missingReplicas = {} scheduledFiles = dict([(opFile.LFN, opFile) for opFile in operation if opFile.Status in ("Scheduled", "Waiting")]) # # get replicas replicas = self.replicaManager().getCatalogReplicas( scheduledFiles.keys()) if not replicas["OK"]: self.log.error(replicas["Message"]) return replicas replicas = replicas["Value"] fullyReplicated = 0 missingSEs = {} for successfulLFN in replicas["Successful"]: reps = set(replicas['Successful'][successfulLFN]) if targetSESet.issubset(reps): log.info("%s has been replicated to all targets" % successfulLFN) fullyReplicated += 1 scheduledFiles[successfulLFN].Status = "Done" else: missingReplicas[successfulLFN] = sorted(targetSESet - reps) ses = ",".join(missingReplicas[successfulLFN]) missingSEs[ses] = missingSEs.setdefault(ses, 0) + 1 log.verbose("%s is still missing at %s" % (successfulLFN, ses)) if fullyReplicated: log.info("%d new files have been replicated to all targets" % fullyReplicated) if missingSEs: for ses in missingSEs: log.info("%d replicas still missing at %s" % (missingSEs[ses], ses)) reMissing = re.compile("no such file or directory") for failedLFN, errStr in replicas["Failed"].items(): scheduledFiles[failedLFN].Error = errStr if reMissing.search(errStr.lower()): log.error("%s is missing, setting its status to 'Failed'" % failedLFN) scheduledFiles[failedLFN].Status = "Failed" else: log.warn("unable to read replicas for %s: %s" % (failedLFN, errStr)) return S_OK(missingReplicas) def __filterReplicas(self, opFile): """ filter out banned/invalid source SEs """ log = self.log.getSubLogger("filterReplicas") ret = {"Valid": [], "Banned": [], "Bad": []} replicas = self.replicaManager().getActiveReplicas(opFile.LFN) if not replicas["OK"]: log.error(replicas["Message"]) reNotExists = re.compile("not such file or directory") replicas = replicas["Value"] failed = 
replicas["Failed"].get(opFile.LFN, "") if reNotExists.match(failed.lower()): opFile.Status = "Failed" opFile.Error = failed return S_ERROR(failed) replicas = replicas["Successful"][ opFile.LFN] if opFile.LFN in replicas["Successful"] else {} for repSEName in replicas: repSE = self.getSE(repSEName) pfn = repSE.getPfnForLfn(opFile.LFN) if not pfn["OK"]: log.warn("unable to create pfn for %s lfn: %s" % (opFile.LFN, pfn["Message"])) ret["Banned"].append(repSEName) continue pfn = pfn["Value"] repSEMetadata = repSE.getFileMetadata(pfn, singleFile=True) if not repSEMetadata["OK"]: self.log.warn(repSEMetadata["Message"]) ret["Banned"].append(repSEName) continue repSEMetadata = repSEMetadata["Value"] seChecksum = repSEMetadata["Checksum"].replace( "x", "0").zfill(8) if "Checksum" in repSEMetadata else None if opFile.Checksum and opFile.Checksum != seChecksum: self.log.warn(" %s checksum mismatch: %s %s:%s" % (opFile.LFN, opFile.Checksum, repSE, seChecksum)) ret["Bad"].append(repSEName) continue # # if we're here repSE is OK ret["Valid"].append(repSEName) return S_OK(ret)
class Service( object ):
  """ DISET service container: owns the request handler, its thread pool,
  authorization manager and monitoring, and drives every client connection
  from handshake to action execution. """

  # map of action type -> handler method prefix; 'Connection' is a meta action
  # refering to 'Message' (see _isMetaAction)
  SVC_VALID_ACTIONS = { 'RPC' : 'export',
                        'FileTransfer': 'transfer',
                        'Message' : 'msg',
                        'Connection' : 'Message' }
  # shared security-log client, one per process
  SVC_SECLOG_CLIENT = SecurityLogClient()

  def __init__( self, serviceData ):
    """ c'tor

    :param dict serviceData: service description ('modName', 'loadName',
                             'standalone', 'classObj', 'moduleObj', ...)
    """
    self._svcData = serviceData
    self._name = serviceData[ 'modName' ]
    self._startTime = Time.dateTime()
    # names under which this service answers (module name + load name)
    self._validNames = [ serviceData[ 'modName' ] ]
    if serviceData[ 'loadName' ] not in self._validNames:
      self._validNames.append( serviceData[ 'loadName' ] )
    self._cfg = ServiceConfiguration( list( self._validNames ) )
    # standalone services reuse the global monitor, embedded ones get their own
    if serviceData[ 'standalone' ]:
      self._monitor = gMonitor
    else:
      self._monitor = MonitoringClient()
    self.__monitorLastStatsUpdate = time.time()
    self._stats = { 'queries' : 0, 'connections' : 0 }
    self._authMgr = AuthManager( "%s/Authorization" % PathFinder.getServiceSection( serviceData[ 'loadName' ] ) )
    self._transportPool = getGlobalTransportPool()
    self.__cloneId = 0
    # highest file descriptor seen, reported periodically then reset
    self.__maxFD = 0

  def setCloneProcessId( self, cloneId ):
    """ tag this (forked) clone with an id so its monitoring is distinguishable """
    self.__cloneId = cloneId
    self._monitor.setComponentName( "%s-Clone:%s" % ( self._name, cloneId ) )

  def _isMetaAction( self, action ):
    """ return the refered action name if :action: is a meta action
        (i.e. its mapped value is itself a valid action type), False otherwise """
    referedAction = Service.SVC_VALID_ACTIONS[ action ]
    if referedAction in Service.SVC_VALID_ACTIONS:
      return referedAction
    return False

  def initialize( self ):
    """ full service initialization: URL, handler, locks, monitoring,
        thread pool, message broker and action tables

    :return: S_OK() / S_ERROR( reason )
    """
    #Build the URLs
    self._url = self._cfg.getURL()
    if not self._url:
      return S_ERROR( "Could not build service URL for %s" % self._name )
    gLogger.verbose( "Service URL is %s" % self._url )
    #Load handler
    result = self._loadHandlerInit()
    if not result[ 'OK' ]:
      return result
    self._handler = result[ 'Value' ]
    #Initialize lock manager
    self._lockManager = LockManager( self._cfg.getMaxWaitingPetitions() )
    self._initMonitoring()
    self._threadPool = ThreadPool( max( 1, self._cfg.getMinThreads() ),
                                   max( 0, self._cfg.getMaxThreads() ),
                                   self._cfg.getMaxWaitingPetitions() )
    self._threadPool.daemonize()
    self._msgBroker = MessageBroker( "%sMSB" % self._name, threadPool = self._threadPool )
    #Create static dict
    self._serviceInfoDict = { 'serviceName' : self._name,
                              'serviceSectionPath' : PathFinder.getServiceSection( self._name ),
                              'URL' : self._cfg.getURL(),
                              'messageSender' : MessageSender( self._name, self._msgBroker ),
                              'validNames' : self._validNames,
                              'csPaths' : [ PathFinder.getServiceSection( svcName ) for svcName in self._validNames ]
                            }
    #Call static initialization function
    try:
      self._handler[ 'class' ]._rh__initializeClass( dict( self._serviceInfoDict ),
                                                     self._lockManager,
                                                     self._msgBroker,
                                                     self._monitor )
      if self._handler[ 'init' ]:
        for initFunc in self._handler[ 'init' ]:
          gLogger.verbose( "Executing initialization function" )
          try:
            result = initFunc( dict( self._serviceInfoDict ) )
          except Exception as excp:
            gLogger.exception( "Exception while calling initialization function", lException = excp )
            return S_ERROR( "Exception while calling initialization function: %s" % str( excp ) )
          if not isReturnStructure( result ):
            return S_ERROR( "Service initialization function %s must return S_OK/S_ERROR" % initFunc )
          if not result[ 'OK' ]:
            return S_ERROR( "Error while initializing %s: %s" % ( self._name, result[ 'Message' ] ) )
    except Exception as e:
      errMsg = "Exception while initializing %s" % self._name
      gLogger.exception( e )
      gLogger.exception( errMsg )
      return S_ERROR( errMsg )
    #Load actions after the handler has initialized itself
    result = self._loadActions()
    if not result[ 'OK' ]:
      return result
    self._actions = result[ 'Value' ]
    gThreadScheduler.addPeriodicTask( 30, self.__reportThreadPoolContents )
    return S_OK()

  def __searchInitFunctions( self, handlerClass, currentClass = None ):
    """ recursively collect 'initializeHandler' methods up the class hierarchy,
        ancestors first, ending with the handler class' own one """
    if not currentClass:
      currentClass = handlerClass
    initFuncs = []
    ancestorHasInit = False
    for ancestor in currentClass.__bases__:
      initFuncs += self.__searchInitFunctions( handlerClass, ancestor )
      if 'initializeHandler' in dir( ancestor ):
        ancestorHasInit = True
    if ancestorHasInit:
      initFuncs.append( super( currentClass, handlerClass ).initializeHandler )
    if currentClass == handlerClass and 'initializeHandler' in dir( handlerClass ):
      initFuncs.append( handlerClass.initializeHandler )
    return initFuncs

  def _loadHandlerInit( self ):
    """ gather handler class info and its initialization methods
        (class-level 'initializeHandler's plus an optional module-level
        'initialize<HandlerName>' function)

    :return: S_OK( { 'name', 'module', 'class', 'init' } )
    """
    handlerClass = self._svcData[ 'classObj' ]
    handlerName = handlerClass.__name__
    handlerInitMethods = self.__searchInitFunctions( handlerClass )
    try:
      handlerInitMethods.append( getattr( self._svcData[ 'moduleObj' ], "initialize%s" % handlerName ) )
    except AttributeError:
      gLogger.verbose( "Not found global initialization function for service" )
    if handlerInitMethods:
      gLogger.info( "Found %s initialization methods" % len( handlerInitMethods ) )
    handlerInfo = {}
    handlerInfo[ "name" ] = handlerName
    handlerInfo[ "module" ] = self._svcData[ 'moduleObj' ]
    handlerInfo[ "class" ] = handlerClass
    handlerInfo[ "init" ] = handlerInitMethods
    return S_OK( handlerInfo )

  def _loadActions( self ):
    """ discover exported methods on the handler class per action type
        (by method-name prefix), create their locks and collect type-check
        and auth rules; meta actions inherit auth props from their refered action

    :return: S_OK( { 'methods', 'auth', 'types' } )
    """
    handlerClass = self._handler[ 'class' ]
    authRules = {}
    typeCheck = {}
    methodsList = {}
    for actionType in Service.SVC_VALID_ACTIONS:
      if self._isMetaAction( actionType ):
        continue
      authRules[ actionType ] = {}
      typeCheck[ actionType ] = {}
      methodsList[ actionType ] = []
    handlerAttributeList = dir( handlerClass )
    for actionType in Service.SVC_VALID_ACTIONS:
      if self._isMetaAction( actionType ):
        continue
      methodPrefix = '%s_' % Service.SVC_VALID_ACTIONS[ actionType ]
      for attribute in handlerAttributeList:
        if attribute.find( methodPrefix ) != 0:
          continue
        exportedName = attribute[ len( methodPrefix ) : ]
        methodsList[ actionType ].append( exportedName )
        gLogger.verbose( "+ Found %s method %s" % ( actionType, exportedName ) )
        #Create lock for method
        self._lockManager.createLock( "%s/%s" % ( actionType, exportedName ),
                                      self._cfg.getMaxThreadsForMethod( actionType, exportedName ) )
        #Look for type and auth rules
        if actionType == 'RPC':
          typeAttr = "types_%s" % exportedName
          authAttr = "auth_%s" % exportedName
        else:
          typeAttr = "types_%s_%s" % ( Service.SVC_VALID_ACTIONS[ actionType ], exportedName )
          authAttr = "auth_%s_%s" % ( Service.SVC_VALID_ACTIONS[ actionType ], exportedName )
        if typeAttr in handlerAttributeList:
          obj = getattr( handlerClass, typeAttr )
          gLogger.verbose( "|- Found type definition %s: %s" % ( typeAttr, str( obj ) ) )
          typeCheck[ actionType ][ exportedName ] = obj
        if authAttr in handlerAttributeList:
          obj = getattr( handlerClass, authAttr )
          gLogger.verbose( "|- Found auth rules %s: %s" % ( authAttr, str( obj ) ) )
          authRules[ actionType ][ exportedName ] = obj
    for actionType in Service.SVC_VALID_ACTIONS:
      referedAction = self._isMetaAction( actionType )
      if not referedAction:
        continue
      gLogger.verbose( "Action %s is a meta action for %s" % ( actionType, referedAction ) )
      authRules[ actionType ] = []
      for method in authRules[ referedAction ]:
        for prop in authRules[ referedAction ][ method ]:
          if prop not in authRules[ actionType ]:
            authRules[ actionType ].append( prop )
      gLogger.verbose( "Meta action %s props are %s" % ( actionType, authRules[ actionType ] ) )
    return S_OK( { 'methods' : methodsList,
                   'auth' : authRules,
                   'types' : typeCheck } )

  def _initMonitoring( self ):
    """ register all monitoring activities and component metadata """
    #Init extra bits of monitoring
    self._monitor.setComponentType( MonitoringClient.COMPONENT_SERVICE )
    self._monitor.setComponentName( self._name )
    self._monitor.setComponentLocation( self._cfg.getURL() )
    self._monitor.initialize()
    self._monitor.registerActivity( "Connections", "Connections received", "Framework", "connections", MonitoringClient.OP_RATE )
    self._monitor.registerActivity( "Queries", "Queries served", "Framework", "queries", MonitoringClient.OP_RATE )
    self._monitor.registerActivity( 'CPU', "CPU Usage", 'Framework', "CPU,%", MonitoringClient.OP_MEAN, 600 )
    self._monitor.registerActivity( 'MEM', "Memory Usage", 'Framework', 'Memory,MB', MonitoringClient.OP_MEAN, 600 )
    self._monitor.registerActivity( 'PendingQueries', "Pending queries", 'Framework', 'queries', MonitoringClient.OP_MEAN )
    self._monitor.registerActivity( 'ActiveQueries', "Active queries", 'Framework', 'threads', MonitoringClient.OP_MEAN )
    self._monitor.registerActivity( 'RunningThreads', "Running threads", 'Framework', 'threads', MonitoringClient.OP_MEAN )
    self._monitor.registerActivity( 'MaxFD', "Max File Descriptors", 'Framework', 'fd', MonitoringClient.OP_MEAN )
    self._monitor.setComponentExtraParam( 'DIRACVersion', DIRAC.version )
    self._monitor.setComponentExtraParam( 'platform', DIRAC.getPlatform() )
    self._monitor.setComponentExtraParam( 'startTime', Time.dateTime() )
    for prop in ( ( "__RCSID__", "version" ), ( "__doc__", "description" ) ):
      try:
        value = getattr( self._handler[ 'module' ], prop[0] )
      except Exception as e:
        gLogger.exception( e )
        gLogger.error( "Missing property", prop[0] )
        value = 'unset'
      self._monitor.setComponentExtraParam( prop[1], value )
    for secondaryName in self._cfg.registerAlsoAs():
      gLogger.info( "Registering %s also as %s" % ( self._name, secondaryName ) )
      self._validNames.append( secondaryName )
    return S_OK()

  def __reportThreadPoolContents( self ):
    """ periodic task: publish thread-pool and fd statistics, then reset max fd """
    self._monitor.addMark( 'PendingQueries', self._threadPool.pendingJobs() )
    self._monitor.addMark( 'ActiveQueries', self._threadPool.numWorkingThreads() )
    self._monitor.addMark( 'RunningThreads', threading.activeCount() )
    self._monitor.addMark( 'MaxFD', self.__maxFD )
    self.__maxFD = 0

  def getConfig( self ):
    """ service configuration getter """
    return self._cfg

  #End of initialization functions

  def handleConnection( self, clientTransport ):
    """ account a new connection and hand it to a pool thread
        (NOTE(review): the 'queries' extra param is fed from the connections
        counter here - looks inconsistent, confirm before relying on it) """
    self._stats[ 'connections' ] += 1
    self._monitor.setComponentExtraParam( 'queries', self._stats[ 'connections' ] )
    self._threadPool.generateJobAndQueueIt( self._processInThread,
                                            args = ( clientTransport, ) )

  #Threaded process function
  def _processInThread( self, clientTransport ):
    """ full per-connection cycle, run inside a pool thread:
        handshake -> pool registration -> proposal check -> handler
        instantiation -> proposal execution -> optional close """
    self.__maxFD = max( self.__maxFD, clientTransport.oSocket.fileno() )
    self._lockManager.lockGlobal()
    try:
      monReport = self.__startReportToMonitoring()
    except Exception:
      monReport = False
    try:
      #Handshake
      try:
        result = clientTransport.handshake()
        if not result[ 'OK' ]:
          clientTransport.close()
          return
      except:
        return
      #Add to the transport pool
      trid = self._transportPool.add( clientTransport )
      if not trid:
        return
      #Receive and check proposal
      result = self._receiveAndCheckProposal( trid )
      if not result[ 'OK' ]:
        self._transportPool.sendAndClose( trid, result )
        return
      proposalTuple = result[ 'Value' ]
      #Instantiate handler
      result = self._instantiateHandler( trid, proposalTuple )
      if not result[ 'OK' ]:
        self._transportPool.sendAndClose( trid, result )
        return
      handlerObj = result[ 'Value' ]
      #Execute the action
      result = self._processProposal( trid, proposalTuple, handlerObj )
      #Close the connection if required
      if result[ 'closeTransport' ] or not result[ 'OK' ]:
        if not result[ 'OK' ]:
          gLogger.error( "Error processing proposal", result[ 'Message' ] )
        self._transportPool.close( trid )
      return result
    finally:
      self._lockManager.unlockGlobal()
      if monReport:
        self.__endReportToMonitoring( *monReport )

  def _createIdentityString( self, credDict, clientTransport = None ):
    """ build a human-readable "[user:group](DN){host:port}" identity string
        for logging, from credentials and optionally the transport address """
    if 'username' in credDict:
      if 'group' in credDict:
        identity = "[%s:%s]" % ( credDict[ 'username' ], credDict[ 'group' ] )
      else:
        identity = "[%s:unknown]" % credDict[ 'username' ]
    else:
      identity = 'unknown'
    if clientTransport:
      addr = clientTransport.getRemoteAddress()
      if addr:
        addr = "{%s:%s}" % ( addr[0], addr[1] )
    if 'DN' in credDict:
      identity += "(%s)" % credDict[ 'DN' ]
    return identity

  def _receiveAndCheckProposal( self, trid ):
    """ receive the client's action proposal on transport :trid:, validate the
        requested service/action and authorize it

    :return: S_OK( proposalTuple ) / S_ERROR
    """
    clientTransport = self._transportPool.get( trid )
    #Get the peer credentials
    credDict = clientTransport.getConnectingCredentials()
    #Receive the action proposal
    retVal = clientTransport.receiveData( 1024 )
    if not retVal[ 'OK' ]:
      gLogger.error( "Invalid action proposal", "%s %s" % ( self._createIdentityString( credDict,
                                                                                        clientTransport ),
                                                            retVal[ 'Message' ] ) )
      return S_ERROR( "Invalid action proposal" )
    proposalTuple = retVal[ 'Value' ]
    gLogger.debug( "Received action from client", "/".join( list( proposalTuple[1] ) ) )
    #Check if there are extra credentials
    if proposalTuple[2]:
      clientTransport.setExtraCredentials( proposalTuple[2] )
    #Check if this is the requested service
    requestedService = proposalTuple[0][0]
    if requestedService not in self._validNames:
      return S_ERROR( "%s is not up in this server" % requestedService )
    #Check if the action is valid
    requestedActionType = proposalTuple[1][0]
    if requestedActionType not in Service.SVC_VALID_ACTIONS:
      return S_ERROR( "%s is not a known action type" % requestedActionType )
    #Check if it's authorized
    result = self._authorizeProposal( proposalTuple[1], trid, credDict )
    if not result[ 'OK' ]:
      return result
    #Proposal is OK
    return S_OK( proposalTuple )

  def _authorizeProposal( self, actionTuple, trid, credDict ):
    """ authorize ( actionType, method ) against CS rules and hardcoded
        handler auth attributes; every decision is recorded in the security log

    :return: S_OK() / S_ERROR( "Unauthorized query" )
    """
    #Find CS path for the Auth rules
    referedAction = self._isMetaAction( actionTuple[0] )
    if referedAction:
      csAuthPath = "%s/Default" % actionTuple[0]
      hardcodedMethodAuth = self._actions[ 'auth' ][ actionTuple[0] ]
    else:
      if actionTuple[0] == 'RPC':
        csAuthPath = actionTuple[1]
      else:
        csAuthPath = "/".join( actionTuple )
      #Find if there are hardcoded auth rules in the code
      hardcodedMethodAuth = False
      if actionTuple[0] in self._actions[ 'auth' ]:
        hardcodedRulesByType = self._actions[ 'auth' ][ actionTuple[0] ]
        if actionTuple[0] == "FileTransfer":
          methodName = actionTuple[1][0].lower() + actionTuple[1][1:]
        else:
          methodName = actionTuple[1]
        if methodName in hardcodedRulesByType:
          hardcodedMethodAuth = hardcodedRulesByType[ methodName ]
    #Auth time!
    if not self._authMgr.authQuery( csAuthPath, credDict, hardcodedMethodAuth ):
      #Get the identity string
      identity = self._createIdentityString( credDict )
      fromHost = "unknown host"
      tr = self._transportPool.get( trid )
      if tr:
        fromHost = '/'.join( [ str( item ) for item in tr.getRemoteAddress() ] )
      gLogger.warn( "Unauthorized query", "to %s:%s by %s from %s" % ( self._name,
                                                                       "/".join( actionTuple ),
                                                                       identity, fromHost ) )
      result = S_ERROR( "Unauthorized query" )
    else:
      result = S_OK()
    #Security log
    tr = self._transportPool.get( trid )
    if not tr:
      return S_ERROR( "Client disconnected" )
    sourceAddress = tr.getRemoteAddress()
    identity = self._createIdentityString( credDict )
    Service.SVC_SECLOG_CLIENT.addMessage( result[ 'OK' ], sourceAddress[0], sourceAddress[1], identity,
                                          self._cfg.getHostname(),
                                          self._cfg.getPort(),
                                          self._name, "/".join( actionTuple ) )
    return result

  def _instantiateHandler( self, trid, proposalTuple = None ):
    """ Generate an instance of the handler for a given service
    """
    #Generate the client params
    clientParams = { 'serviceStartTime' : self._startTime }
    if proposalTuple:
      clientParams[ 'clientSetup' ] = proposalTuple[0][1]
      if len( proposalTuple[0] ) < 3:
        clientParams[ 'clientVO' ] = gConfig.getValue( "/DIRAC/VirtualOrganization", "unknown" )
      else:
        clientParams[ 'clientVO' ] = proposalTuple[0][2]
    clientTransport = self._transportPool.get( trid )
    if clientTransport:
      clientParams[ 'clientAddress' ] = clientTransport.getRemoteAddress()
    #Generate handler dict with per client info
    handlerInitDict = dict( self._serviceInfoDict )
    for key in clientParams:
      handlerInitDict[ key ] = clientParams[ key ]
    #Instantiate and initialize
    try:
      handlerInstance = self._handler[ 'class' ]( handlerInitDict, trid )
      handlerInstance.initialize()
    except Exception as e:
      gLogger.exception( "Server error while loading handler: %s" % str( e ) )
      return S_ERROR( "Server error while loading handler" )
    return S_OK( handlerInstance )

  def _processProposal( self, trid, proposalTuple, handlerObj ):
    """ acknowledge the proposal to the client and execute it; for
        ('Connection','new') proposals the transport is kept open and
        registered with the message broker

    :return: result dict, always carrying a 'closeTransport' key
    """
    #Notify the client we're ready to execute the action
    retVal = self._transportPool.send( trid, S_OK() )
    if not retVal[ 'OK' ]:
      return retVal
    messageConnection = False
    if proposalTuple[1] == ( 'Connection', 'new' ):
      messageConnection = True
    if messageConnection:
      if self._msgBroker.getNumConnections() > self._cfg.getMaxMessagingConnections():
        result = S_ERROR( "Maximum number of connections reached. Try later" )
        result[ 'closeTransport' ] = True
        return result
      #This is a stable connection
      self._msgBroker.addTransportId( trid, self._name,
                                      receiveMessageCallback = self._mbReceivedMsg,
                                      disconnectCallback = self._mbDisconnect,
                                      listenToConnection = False )
    result = self._executeAction( trid, proposalTuple, handlerObj )
    if result[ 'OK' ] and messageConnection:
      self._msgBroker.listenToTransport( trid )
      result = self._mbConnect( trid, handlerObj )
      if not result[ 'OK' ]:
        self._msgBroker.removeTransport( trid )
    result[ 'closeTransport' ] = not messageConnection or not result[ 'OK' ]
    return result

  def _mbConnect( self, trid, handlerObj = None ):
    """ fire the handler's 'connected' callback for a new message connection """
    if not handlerObj:
      result = self._instantiateHandler( trid )
      if not result[ 'OK' ]:
        return result
      handlerObj = result[ 'Value' ]
    return handlerObj._rh_executeConnectionCallback( 'connected' )

  def _executeAction( self, trid, proposalTuple, handlerObj ):
    """ delegate proposal execution to the handler, shielding against exceptions """
    try:
      return handlerObj._rh_executeAction( proposalTuple )
    except Exception as e:
      gLogger.exception( "Exception while executing handler action" )
      return S_ERROR( "Server error while executing action: %s" % str( e ) )

  def _mbReceivedMsg( self, trid, msgObj ):
    """ message-broker callback: authorize and dispatch an incoming message """
    result = self._authorizeProposal( ( 'Message', msgObj.getName() ),
                                      trid,
                                      self._transportPool.get( trid ).getConnectingCredentials() )
    if not result[ 'OK' ]:
      return result
    result = self._instantiateHandler( trid )
    if not result[ 'OK' ]:
      return result
    handlerObj = result[ 'Value' ]
    return handlerObj._rh_executeMessageCallback( msgObj )

  def _mbDisconnect( self, trid ):
    """ message-broker callback: notify the handler of a dropped connection """
    result = self._instantiateHandler( trid )
    if not result[ 'OK' ]:
      return result
    handlerObj = result[ 'Value' ]
    return handlerObj._rh_executeConnectionCallback( 'drop' )

  def __startReportToMonitoring( self ):
    """ mark a query and snapshot wall/CPU time; returns the ( wall, cpu )
        tuple later consumed by __endReportToMonitoring, or bails out early
        on clock skew (the "< 0" guard) """
    self._monitor.addMark( "Queries" )
    now = time.time()
    stats = os.times()
    cpuTime = stats[0] + stats[2]
    if now - self.__monitorLastStatsUpdate < 0:
      return ( now, cpuTime )
    # Send CPU consumption mark
    # NOTE(review): wallClock is computed but unused here - presumably a
    # leftover of a removed CPU-percentage calculation; confirm before cleanup
    wallClock = now - self.__monitorLastStatsUpdate
    self.__monitorLastStatsUpdate = now
    # Send Memory consumption mark
    membytes = MemStat.VmB( 'VmRSS:' )
    if membytes:
      mem = membytes / ( 1024. * 1024. )
      self._monitor.addMark( 'MEM', mem )
    return ( now, cpuTime )

  def __endReportToMonitoring( self, initialWallTime, initialCPUTime ):
    """ compute CPU usage percentage over the query's wall time and publish it """
    wallTime = time.time() - initialWallTime
    stats = os.times()
    cpuTime = stats[0] + stats[2] - initialCPUTime
    percentage = cpuTime / wallTime * 100.
    if percentage > 0:
      self._monitor.addMark( 'CPU', percentage )
class FTSAgent( AgentModule ):
  """
  .. class:: FTSAgent

  Agent propagating Scheduled request to Done or Failed state in the FTS system.

  Requests and associated FTSJobs (and so FTSFiles) are kept in cache.

  """
  # NOTE(review): a second 'class FTSAgent' definition appears later in this
  # file and, being defined afterwards, silently replaces this one at import
  # time -- confirm this earlier copy is meant to be removed.
  # # fts graph refresh in seconds
  FTSGRAPH_REFRESH = FTSHistoryView.INTERVAL / 2
  # # SE R/W access refresh in seconds
  RW_REFRESH = 600
  # # placeholder for max job per channel
  MAX_ACTIVE_JOBS = 50
  # # min threads
  MIN_THREADS = 1
  # # max threads
  MAX_THREADS = 10
  # # files per job
  MAX_FILES_PER_JOB = 100
  # # MAX FTS transfer per FTSFile
  MAX_ATTEMPT = 256
  # # stage flag
  STAGE_FILES = False
  # # placeholder for FTS client
  __ftsClient = None
  # # placeholder for request client
  __requestClient = None
  # # placeholder for resources helper
  __resources = None
  # # placeholder for RSS client
  __rssClient = None
  # # placeholder for FTSGraph
  __ftsGraph = None
  # # graph regeneration time delta
  __ftsGraphValidStamp = None
  # # r/w access valid stamp
  __rwAccessValidStamp = None
  # # placeholder for threadPool
  __threadPool = None
  # # update lock
  __updateLock = None
  # # se cache
  __seCache = dict()
  # # request cache
  __reqCache = dict()

  def updateLock( self ):
    """ update lock """
    if not self.__updateLock:
      self.__updateLock = LockRing().getLock( "FTSAgentLock" )
    return self.__updateLock

  @classmethod
  def requestClient( cls ):
    """ request client getter """
    if not cls.__requestClient:
      cls.__requestClient = ReqClient()
    return cls.__requestClient

  @classmethod
  def ftsClient( cls ):
    """ FTS client """
    if not cls.__ftsClient:
      cls.__ftsClient = FTSClient()
    return cls.__ftsClient

  @classmethod
  def rssClient( cls ):
    """ RSS client getter """
    if not cls.__rssClient:
      cls.__rssClient = ResourceStatus()
    return cls.__rssClient

  @classmethod
  def getSE( cls, seName ):
    """ keep SEs in cache """
    if seName not in cls.__seCache:
      cls.__seCache[seName] = StorageElement( seName )
    return cls.__seCache[seName]

  @classmethod
  def getSECache( cls ):
    # expose the raw StorageElement cache
    return cls.__seCache

  @classmethod
  def getRequest( cls, reqName ):
    """ get Requests systematically and refresh cache """
    getRequest = cls.requestClient().getRequest( reqName )
    if not getRequest["OK"]:
      # drop any stale cached copy on a failed read
      cls.__reqCache.pop( reqName, None )
      return getRequest
    getRequest = getRequest["Value"]
    if not getRequest:
      cls.__reqCache.pop( reqName, None )
      return S_ERROR( "request of name '%s' not found in ReqDB" % reqName )
    cls.__reqCache[reqName] = getRequest
    return S_OK( cls.__reqCache[reqName] )

  @classmethod
  def putRequest( cls, request, clearCache = True ):
    """ put request back to ReqDB

    :param Request request: Request instance
    :param bool clearCache: clear the cache?

    also finalize request if status == Done
    """
    # # put back request
    if request.RequestName not in cls.__reqCache:
      return S_OK()
    put = cls.requestClient().putRequest( request )
    if not put["OK"]:
      return put
    # # finalize first if possible
    if request.Status == "Done" and request.JobID:
      finalizeRequest = cls.requestClient().finalizeRequest( request.RequestName, request.JobID )
      if not finalizeRequest["OK"]:
        # finalization failed -- schedule the request again
        request.Status = "Scheduled"
    # # del request from cache if needed
    if clearCache:
      cls.__reqCache.pop( request.RequestName, None )
    return S_OK()

  @classmethod
  def putFTSJobs( cls, ftsJobsList ):
    """ put back fts jobs to the FTSDB """
    for ftsJob in ftsJobsList:
      put = cls.ftsClient().putFTSJob( ftsJob )
      if not put["OK"]:
        # stop at the first failure and propagate it
        return put
    return S_OK()

  @staticmethod
  def updateFTSFileDict( ftsFilesDict, toUpdateDict ):
    """ update :ftsFilesDict: with FTSFiles in :toUpdateDict: """
    for category, ftsFileList in ftsFilesDict.items():
      for ftsFile in toUpdateDict.get( category, [] ):
        if ftsFile not in ftsFileList:
          ftsFileList.append( ftsFile )
    return ftsFilesDict

  # def resources( self ):
  #   """ resource helper getter """
  #   if not self.__resources:
  #     self.__resources = Resources()
  #   return self.__resources

  def threadPool( self ):
    """ thread pool getter """
    if not self.__threadPool:
      self.__threadPool = ThreadPool( self.MIN_THREADS, self.MAX_THREADS )
      self.__threadPool.daemonize()
    return self.__threadPool

  def resetFTSGraph( self ):
    """ create fts graph """
    log = gLogger.getSubLogger( "ftsGraph" )
    ftsHistory = self.ftsClient().getFTSHistory()
    if not ftsHistory["OK"]:
      log.error( "unable to get FTS history: %s" % ftsHistory["Message"] )
      return ftsHistory
    ftsHistory = ftsHistory["Value"]
    # rebuild the graph under the shared update lock
    try:
      self.updateLock().acquire()
      self.__ftsGraph = FTSGraph( "FTSGraph", ftsHistory )
    finally:
      self.updateLock().release()
    log.debug( "FTSSites: %s" % len( self.__ftsGraph.nodes() ) )
    for i, site in enumerate( self.__ftsGraph.nodes() ):
      log.debug( " [%02d] FTSSite: %-25s FTSServer: %s" % ( i, site.name, site.FTSServer ) )
    log.debug( "FTSRoutes: %s" % len( self.__ftsGraph.edges() ) )
    for i, route in enumerate( self.__ftsGraph.edges() ):
      log.debug( " [%02d] FTSRoute: %-25s Active FTSJobs (Max) = %s (%s)" % ( i,
                                                                              route.routeName,
                                                                              route.ActiveJobs,
                                                                              route.toNode.MaxActiveJobs ) )
    # # save graph stamp
    self.__ftsGraphValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH )
    # # refresh SE R/W access
    try:
      self.updateLock().acquire()
      self.__ftsGraph.updateRWAccess()
    finally:
      self.updateLock().release()
    # # save rw access stamp
    self.__rwAccessValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.RW_REFRESH )
    return S_OK()

  def initialize( self ):
    """ agent's initialization """
    # # data manager
    self.dataManager = DataManager()
    log = self.log.getSubLogger( "initialize" )
    self.FTSGRAPH_REFRESH = self.am_getOption( "FTSGraphValidityPeriod", self.FTSGRAPH_REFRESH )
    log.info( "FTSGraph validity period = %s s" % self.FTSGRAPH_REFRESH )
    self.RW_REFRESH = self.am_getOption( "RWAccessValidityPeriod", self.RW_REFRESH )
    log.info( "SEs R/W access validity period = %s s" % self.RW_REFRESH )
    self.STAGE_FILES = self.am_getOption( "StageFiles", self.STAGE_FILES )
    log.info( "Stage files before submission = %s" % {True: "yes", False: "no"}[bool( self.STAGE_FILES )] )
    self.MAX_ACTIVE_JOBS = self.am_getOption( "MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS )
    log.info( "Max active FTSJobs/route = %s" % self.MAX_ACTIVE_JOBS )
    self.MAX_FILES_PER_JOB = self.am_getOption( "MaxFilesPerJob", self.MAX_FILES_PER_JOB )
    log.info( "Max FTSFiles/FTSJob = %d" % self.MAX_FILES_PER_JOB )
    self.MAX_ATTEMPT = self.am_getOption( "MaxTransferAttempts", self.MAX_ATTEMPT )
    log.info( "Max transfer attempts = %s" % self.MAX_ATTEMPT )
    # # thread pool
    self.MIN_THREADS = self.am_getOption( "MinThreads", self.MIN_THREADS )
    self.MAX_THREADS = self.am_getOption( "MaxThreads", self.MAX_THREADS )
    # normalise so MIN <= MAX whatever was configured
    minmax = ( abs( self.MIN_THREADS ), abs( self.MAX_THREADS ) )
    self.MIN_THREADS, self.MAX_THREADS = min( minmax ), max( minmax )
    log.info( "ThreadPool min threads = %s" % self.MIN_THREADS )
    log.info( "ThreadPool max threads = %s" % self.MAX_THREADS )
    log.info( "initialize: creation of FTSGraph..." )
    createGraph = self.resetFTSGraph()
    if not createGraph["OK"]:
      log.error( "initialize: %s" % createGraph["Message"] )
      return createGraph
    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    log.info( "will use DataManager proxy" )
    # # gMonitor stuff here
    gMonitor.registerActivity( "RequestsAtt", "Attempted requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsOK", "Successful requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsFail", "Failed requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubAtt", "FTSJobs creation attempts",
                               "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubOK", "FTSJobs submitted successfully",
                               "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubFail", "FTSJobs submissions failed",
                               "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonAtt", "FTSJobs monitored",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonOK", "FTSJobs monitored successfully",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonFail", "FTSJobs attempts failed",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSMonitorFail", "Failed FTS monitor executions",
                               "FTSAgent", "Execution/mins", gMonitor.OP_SUM )
    pollingTime = self.am_getOption( "PollingTime", 60 )
    # one per-cycle activity per FTSJob state
    for status in list( FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES ):
      gMonitor.registerActivity( "FTSJobs%s" % status, "FTSJobs %s" % status ,
                                 "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime )
    gMonitor.registerActivity( "FtSJobsPerRequest", "Average FTSJobs per request",
                               "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSFilesPerJob", "FTSFiles per FTSJob",
                               "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSSizePerJob", "Average FTSFiles size per FTSJob",
                               "FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN )
    return S_OK()

  def finalize( self ):
    """ finalize processing """
    # log = self.log.getSubLogger( "finalize" )
    # if self.__reqCache:
    #   log.info( 'putting back %d requests from cache' % len( self.__reqCache ) )
    # else:
    #   log.info( 'no requests to put back' )
    # for request in self.__reqCache.values():
    #   put = self.requestClient().putRequest( request )
    #   if not put["OK"]:
    #     log.error( "unable to put back request '%s': %s" % ( request.RequestName, put["Message"] ) )
    return S_OK()

  def execute( self ):
    """ one cycle execution """
    log = gLogger.getSubLogger( "execute" )
    # # reset FTSGraph if expired
    now = datetime.datetime.now()
    if now > self.__ftsGraphValidStamp:
      log.info( "resetting expired FTS graph..." )
      resetFTSGraph = self.resetFTSGraph()
      if not resetFTSGraph["OK"]:
        log.error( "FTSGraph recreation error: %s" % resetFTSGraph["Message"] )
        return resetFTSGraph
      self.__ftsGraphValidStamp = now + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH )
    # # update R/W access in FTSGraph if expired
    if now > self.__rwAccessValidStamp:
      log.info( "updating expired R/W access for SEs..." )
      try:
        self.updateLock().acquire()
        self.__ftsGraph.updateRWAccess()
      finally:
        self.updateLock().release()
      self.__rwAccessValidStamp = now + datetime.timedelta( seconds = self.RW_REFRESH )
    requestNames = self.requestClient().getRequestNamesList( [ "Scheduled" ] )
    if not requestNames["OK"]:
      log.error( "unable to read scheduled request names: %s" % requestNames["Message"] )
      return requestNames
    # merge freshly read names with whatever is still cached
    if not requestNames["Value"]:
      requestNames = self.__reqCache.keys()
    else:
      requestNames = [ req[0] for req in requestNames["Value"] ]
      requestNames = list( set ( requestNames + self.__reqCache.keys() ) )
    if not requestNames:
      log.info( "no 'Scheduled' requests to process" )
      return S_OK()
    log.info( "found %s requests to process:" % len( requestNames ) )
    log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) )
    log.info( " => new read from RMS: %s" % ( len( requestNames ) - len( self.__reqCache ) ) )
    for requestName in requestNames:
      request = self.getRequest( requestName )
      if not request["OK"]:
        log.error( request["Message"] )
        continue
      request = request["Value"]
      sTJId = request.RequestName
      # retry the enqueue until a pool slot is free
      while True:
        queue = self.threadPool().generateJobAndQueueIt( self.processRequest,
                                                         args = ( request, ),
                                                         sTJId = sTJId )
        if queue["OK"]:
          log.info( "request '%s' enqueued for execution" % sTJId )
          gMonitor.addMark( "RequestsAtt", 1 )
          break
        time.sleep( 1 )
    # # process all results
    self.threadPool().processAllResults()
    return S_OK()

  def processRequest( self, request ):
    """ process one request

    :param Request request: ReqDB.Request
    """
    log = self.log.getSubLogger( request.RequestName )
    operation = request.getWaiting()
    if not operation["OK"]:
      log.error( "unable to find 'Scheduled' ReplicateAndRegister operation in request" )
      return self.putRequest( request )
    operation = operation["Value"]
    if not isinstance( operation, Operation ):
      log.error( "waiting returned operation is not an operation: %s" % type( operation ) )
      return self.putRequest( request )
    if operation.Type != "ReplicateAndRegister":
      log.error( "operation to be executed is not a ReplicateAndRegister but %s" % operation.Type )
      return self.putRequest( request )
    if operation.Status != "Scheduled":
      log.error( "operation in a wrong state, expecting 'Scheduled', got %s" % operation.Status )
      return self.putRequest( request )
    log.info( 'start processRequest' )
    # # select FTSJobs, by default all in TRANS_STATES and INIT_STATES
    ftsJobs = self.ftsClient().getFTSJobsForRequest( request.RequestID )
    if not ftsJobs["OK"]:
      log.error( ftsJobs["Message"] )
      return ftsJobs
    ftsJobs = [ftsJob for ftsJob in ftsJobs.get( "Value", [] ) if ftsJob.Status not in FTSJob.FINALSTATES]

    # # Use a try: finally: for making sure FTS jobs are put back before returnin
    try:
      # # dict keeping info about files to reschedule, submit, fail and register
      ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] )

      if ftsJobs:
        log.info( "==> found %s FTSJobs to monitor" % len( ftsJobs ) )
        # # PHASE 0 = monitor active FTSJobs
        for ftsJob in ftsJobs:
          monitor = self.__monitorJob( request, ftsJob )
          if not monitor["OK"]:
            log.error( "unable to monitor FTSJob %s: %s" % ( ftsJob.FTSJobID, monitor["Message"] ) )
            # monitoring failed -- keep the job as Submitted to retry next cycle
            ftsJob.Status = "Submitted"
          else:
            ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, monitor["Value"] )
        log.info( "monitoring of FTSJobs completed" )
        for key, ftsFiles in ftsFilesDict.items():
          if ftsFiles:
            log.debug( " => %s FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) )

      # # PHASE ONE - check ready replicas
      missingReplicas = self.__checkReadyReplicas( request, operation )
      if not missingReplicas["OK"]:
        log.error( missingReplicas["Message"] )
      else:
        missingReplicas = missingReplicas["Value"]
        for opFile in operation:
          # Actually the condition below should never happen... Change printout for checking
          if opFile.LFN not in missingReplicas and opFile.Status not in ( 'Done', 'Failed' ):
            log.warn( "File should be set Done! %s is replicated at all targets" % opFile.LFN )
            opFile.Status = "Done"

        if missingReplicas:
          # Check if these files are in the FTSDB
          ftsFiles = self.ftsClient().getAllFTSFilesForRequest( request.RequestID )
          if not ftsFiles['OK']:
            log.error( ftsFiles['Message'] )
          else:
            ftsFiles = ftsFiles['Value']
            ftsLfns = set( [ftsFile.LFN for ftsFile in ftsFiles] )
            toSchedule = set( missingReplicas ) - ftsLfns
            if toSchedule:
              log.warn( '%d files in operation are not in FTSDB, reset them Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # identify missing LFNs that are waiting for a replication which is finished
            for ftsFile in [f for f in ftsFiles if f.LFN in missingReplicas and f.Status.startswith( 'Waiting#' )]:
              # 'Waiting#<SE>' encodes the SE the file is waiting on
              targetSE = ftsFile.Status.split( '#' )[1]
              finishedFiles = [f for f in ftsFiles if
                               f.LFN == ftsFile.LFN and
                               f.Status == 'Finished' and
                               f.TargetSE == targetSE and
                               f not in ftsFilesDict['toUpdate']]
              if finishedFiles:
                log.warn( "%s is %s while replication was Finished to %s, update" % ( ftsFile.LFN, ftsFile.Status, targetSE ) )
                ftsFilesDict['toUpdate'] += finishedFiles
            # identify Finished transfer for which the replica is still missing
            for ftsFile in [f for f in ftsFiles if f.Status == 'Finished' and f.TargetSE in missingReplicas.get( f.LFN, [] ) and f not in ftsFilesDict['toRegister'] ]:
              # Check if there is a registration operation for that file and that target
              regOp = [op for op in request if
                       op.Type == 'RegisterReplica' and
                       op.TargetSE == ftsFile.TargetSE and
                       [f for f in op if f.LFN == ftsFile.LFN]]
              if not regOp:
                ftsFilesDict['toReschedule'].append( ftsFile )

      toFail = ftsFilesDict.get( "toFail", [] )
      toReschedule = ftsFilesDict.get( "toReschedule", [] )
      toSubmit = ftsFilesDict.get( "toSubmit", [] )
      toRegister = ftsFilesDict.get( "toRegister", [] )
      toUpdate = ftsFilesDict.get( "toUpdate", [] )

      # # PHASE TWO = Failed files? -> make request Failed and return
      if toFail:
        log.error( "==> found %s 'Failed' FTSFiles, but maybe other files can be processed..." % len( toFail ) )
        for opFile in operation:
          for ftsFile in toFail:
            if opFile.FileID == ftsFile.FileID:
              opFile.Error = ftsFile.Error
              opFile.Status = "Failed"
        operation.Error = "%s files are missing any replicas" % len( toFail )
        # # requets.Status should be Failed if all files in the operation "Failed"
        if request.Status == "Failed":
          request.Error = "ReplicateAndRegister %s failed" % operation.Order
          log.error( "request is set to 'Failed'" )
          return self.putRequest( request )

      # # PHASE THREE - update Waiting#TargetSE FTSFiles
      if toUpdate:
        log.info( "==> found %s possible FTSFiles to update..." % ( len( toUpdate ) ) )
        byTarget = {}
        for ftsFile in toUpdate:
          byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile.FileID )
        for targetSE, fileIDList in byTarget.items():
          update = self.ftsClient().setFTSFilesWaiting( operation.OperationID, targetSE, fileIDList )
          if not update["OK"]:
            log.error( "update FTSFiles failed: %s" % update["Message"] )

      # # PHASE FOUR - add 'RegisterReplica' Operations
      if toRegister:
        log.info( "==> found %d Files waiting for registration, adding 'RegisterReplica' operations" % len( toRegister ) )
        registerFiles = self.__insertRegisterOperation( request, operation, toRegister )
        if not registerFiles["OK"]:
          log.error( "unable to create 'RegisterReplica' operations: %s" % registerFiles["Message"] )
        # if request.Status == "Waiting":
        #   log.info( "request is in 'Waiting' state, will put it back to RMS" )
        #   return self.putRequest( request )

      # # PHASE FIVE - reschedule operation files
      if toReschedule:
        log.info( "==> found %s Files to reschedule" % len( toReschedule ) )
        rescheduleFiles = self.__reschedule( request, operation, toReschedule )
        if not rescheduleFiles["OK"]:
          log.error( rescheduleFiles["Message"] )

      # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs. We get also Failed files to recover them if needed
      ftsFiles = self.ftsClient().getFTSFilesForRequest( request.RequestID, [ "Waiting", "Failed", 'Submitted' ] )
      if not ftsFiles["OK"]:
        log.error( ftsFiles["Message"] )
      else:
        retryIds = set ( [ ftsFile.FTSFileID for ftsFile in toSubmit ] )
        for ftsFile in ftsFiles["Value"]:
          if ftsFile.FTSFileID not in retryIds:
            if ftsFile.Status == 'Failed':
              # If the file was not unrecoverable failed and is not yet set toSubmit
              _reschedule, submit, _fail = self.__checkFailed( ftsFile )
            elif ftsFile.Status == 'Submitted':
              # Submitted file whose job vanished: resubmit the transfer
              if ftsFile.FTSGUID not in [job.FTSGUID for job in ftsJobs]:
                log.warn( 'FTS GUID %s not found in FTS jobs, resubmit file transfer' % ftsFile.FTSGUID )
                ftsFile.Status = 'Waiting'
                submit = True
              else:
                submit = False
            else:
              submit = True
            if submit:
              toSubmit.append( ftsFile )
              retryIds.add( ftsFile.FTSFileID )

      # # submit new ftsJobs
      if toSubmit:
        if request.Status != 'Scheduled':
          log.info( "Found %d FTSFiles to submit while request is no longer in Scheduled status (%s)" \
                    % ( len( toSubmit ), request.Status ) )
        else:
          self.__checkDuplicates( request.RequestName, toSubmit )
          log.info( "==> found %s FTSFiles to submit" % len( toSubmit ) )
          submit = self.__submit( request, operation, toSubmit )
          if not submit["OK"]:
            log.error( submit["Message"] )
          else:
            ftsJobs += submit["Value"]

      # # status change? - put back request
      if request.Status != "Scheduled":
        log.info( "request no longer in 'Scheduled' state (%s), will put it back to RMS" % request.Status )

    # NOTE(review): Python-2-only 'except E, name' syntax -- needs 'except E as name' for Python 3
    except Exception, exceptMessage:
      log.exception( "Exception in processRequest", exceptMessage )
    # NOTE(review): the finally: suite is missing here in this chunk -- the file
    # is truncated/corrupted at this point (a duplicate class definition follows)
    finally:
class FTSAgent( AgentModule ):
  """
  .. class:: FTSAgent

  Agent propagating Scheduled request to Done or Failed state in the FTS system.

  Requests and associated FTSJobs (and so FTSFiles) are kept in cache.

  """
  # NOTE(review): this re-definition shadows the earlier 'class FTSAgent' in this
  # module (RequestName-based API replaced by a RequestID-based one) -- confirm
  # the earlier copy is meant to be deleted.
  # # fts placement refresh in seconds
  FTSPLACEMENT_REFRESH = FTSHistoryView.INTERVAL / 2
  # # placeholder for max job per channel
  MAX_ACTIVE_JOBS = 50
  # # min threads
  MIN_THREADS = 1
  # # max threads
  MAX_THREADS = 10
  # # files per job
  MAX_FILES_PER_JOB = 100
  # # MAX FTS transfer per FTSFile
  MAX_ATTEMPT = 256
  # # stage flag
  PIN_TIME = 0
  # # FTS submission command
  SUBMIT_COMMAND = 'glite-transfer-submit'
  # # FTS monitoring command
  MONITOR_COMMAND = 'glite-transfer-status'
  # Max number of requests fetched from the RMS
  MAX_REQUESTS = 100
  # Minimum interval (seconds) between 2 job monitoring
  MONITORING_INTERVAL = 600
  # # placeholder for FTS client
  __ftsClient = None
  # # placeholder for the FTS version
  __ftsVersion = None
  # # placeholder for request client
  __requestClient = None
  # # placeholder for resources helper
  __resources = None
  # # placeholder for RSS client
  __rssClient = None
  # # placeholder for FTSPlacement
  __ftsPlacement = None
  # # placement regeneration time delta
  __ftsPlacementValidStamp = None
  # # placeholder for threadPool
  __threadPool = None
  # # update lock
  __updateLock = None
  # # request cache
  __reqCache = dict()

  def updateLock( self ):
    """ update lock """
    if not self.__updateLock:
      self.__updateLock = LockRing().getLock( "FTSAgentLock" )
    return self.__updateLock

  @classmethod
  def requestClient( cls ):
    """ request client getter """
    if not cls.__requestClient:
      cls.__requestClient = ReqClient()
    return cls.__requestClient

  @classmethod
  def ftsClient( cls ):
    """ FTS client """
    if not cls.__ftsClient:
      cls.__ftsClient = FTSClient()
    return cls.__ftsClient

  @classmethod
  def rssClient( cls ):
    """ RSS client getter """
    if not cls.__rssClient:
      cls.__rssClient = ResourceStatus()
    return cls.__rssClient

  @classmethod
  def getRequest( cls, reqID ):
    """ get Requests systematically and refresh cache """
    # Make sure the request is Scheduled
    res = cls.requestClient().getRequestStatus( reqID )
    if not res['OK']:
      cls.__reqCache.pop( reqID, None )
      return res
    status = res['Value']
    if status != 'Scheduled':
      cls.__reqCache.pop( reqID, None )
      return S_ERROR( "Request with id %s is not Scheduled:%s" % ( reqID, status ) )
    getRequest = cls.requestClient().getRequest( reqID )
    if not getRequest["OK"]:
      cls.__reqCache.pop( reqID, None )
      return getRequest
    getRequest = getRequest["Value"]
    if not getRequest:
      cls.__reqCache.pop( reqID, None )
      return S_ERROR( "request of id '%s' not found in ReqDB" % reqID )
    cls.__reqCache[reqID] = getRequest
    return S_OK( cls.__reqCache[reqID] )

  @classmethod
  def putRequest( cls, request, clearCache = True ):
    """ put request back to ReqDB

    :param Request request: Request instance
    :param bool clearCache: clear the cache?

    also finalize request if status == Done
    """
    # # put back request
    if request.RequestID not in cls.__reqCache:
      return S_OK()
    put = cls.requestClient().putRequest( request )
    if not put["OK"]:
      return put
    # # finalize first if possible
    if request.Status == "Done" and request.JobID:
      finalizeRequest = cls.requestClient().finalizeRequest( request.RequestID, request.JobID )
      if not finalizeRequest["OK"]:
        # finalization failed -- schedule the request again
        request.Status = "Scheduled"
    # # del request from cache if needed
    if clearCache:
      cls.__reqCache.pop( request.RequestID, None )
    return S_OK()

  @classmethod
  def putFTSJobs( cls, ftsJobsList ):
    """ put back fts jobs to the FTSDB """
    for ftsJob in ftsJobsList:
      put = cls.ftsClient().putFTSJob( ftsJob )
      if not put["OK"]:
        # stop at the first failure and propagate it
        return put
    return S_OK()

  @staticmethod
  def updateFTSFileDict( ftsFilesDict, toUpdateDict ):
    """ update :ftsFilesDict: with FTSFiles in :toUpdateDict: """
    for category, ftsFileList in ftsFilesDict.iteritems():
      for ftsFile in toUpdateDict.get( category, [] ):
        if ftsFile not in ftsFileList:
          ftsFileList.append( ftsFile )
    return ftsFilesDict

  # def resources( self ):
  #   """ resource helper getter """
  #   if not self.__resources:
  #     self.__resources = Resources()
  #   return self.__resources

  def threadPool( self ):
    """ thread pool getter """
    if not self.__threadPool:
      self.__threadPool = ThreadPool( self.MIN_THREADS, self.MAX_THREADS )
      self.__threadPool.daemonize()
    return self.__threadPool

  def resetFTSPlacement( self ):
    """ create fts Placement """
    ftsHistory = self.ftsClient().getFTSHistory()
    if not ftsHistory["OK"]:
      self.log.error( "unable to get FTS history:", ftsHistory["Message"] )
      return ftsHistory
    ftsHistory = ftsHistory["Value"]
    # build or refresh the placement under the shared update lock
    try:
      self.updateLock().acquire()
      if not self.__ftsPlacement:
        self.__ftsPlacement = FTSPlacement( csPath = None, ftsHistoryViews = ftsHistory )
      else:
        self.__ftsPlacement.refresh( ftsHistoryViews = ftsHistory )
    finally:
      self.updateLock().release()
    # # save time stamp
    self.__ftsPlacementValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH )
    return S_OK()

  def initialize( self ):
    """ agent's initialization """
    # # data manager
    self.dataManager = DataManager()
    log = self.log.getSubLogger( "initialize" )
    self.FTSPLACEMENT_REFRESH = self.am_getOption( "FTSPlacementValidityPeriod", self.FTSPLACEMENT_REFRESH )
    log.info( "FTSPlacement validity period = %s s" % self.FTSPLACEMENT_REFRESH )
    self.SUBMIT_COMMAND = self.am_getOption( "SubmitCommand", self.SUBMIT_COMMAND )
    log.info( "FTS submit command = %s" % self.SUBMIT_COMMAND )
    self.MONITOR_COMMAND = self.am_getOption( "MonitorCommand", self.MONITOR_COMMAND )
    log.info( "FTS commands: submit = %s monitor %s" % ( self.SUBMIT_COMMAND, self.MONITOR_COMMAND ) )
    self.PIN_TIME = self.am_getOption( "PinTime", self.PIN_TIME )
    log.info( "Stage files before submission = ", {True: "yes", False: "no"}[bool( self.PIN_TIME )] )
    self.MAX_ACTIVE_JOBS = self.am_getOption( "MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS )
    log.info( "Max active FTSJobs/route = ", str( self.MAX_ACTIVE_JOBS ) )
    self.MAX_FILES_PER_JOB = self.am_getOption( "MaxFilesPerJob", self.MAX_FILES_PER_JOB )
    log.info( "Max FTSFiles/FTSJob = ", str( self.MAX_FILES_PER_JOB ) )
    self.MAX_ATTEMPT = self.am_getOption( "MaxTransferAttempts", self.MAX_ATTEMPT )
    log.info( "Max transfer attempts = ", str( self.MAX_ATTEMPT ) )
    # # thread pool
    self.MIN_THREADS = self.am_getOption( "MinThreads", self.MIN_THREADS )
    self.MAX_THREADS = self.am_getOption( "MaxThreads", self.MAX_THREADS )
    # normalise so MIN <= MAX whatever was configured
    minmax = ( abs( self.MIN_THREADS ), abs( self.MAX_THREADS ) )
    self.MIN_THREADS, self.MAX_THREADS = min( minmax ), max( minmax )
    log.info( "ThreadPool min threads = ", str( self.MIN_THREADS ) )
    log.info( "ThreadPool max threads = ", str( self.MAX_THREADS ) )
    self.MAX_REQUESTS = self.am_getOption( "MaxRequests", self.MAX_REQUESTS )
    log.info( "Max Requests fetched = ", str( self.MAX_REQUESTS ) )
    self.MONITORING_INTERVAL = self.am_getOption( "MonitoringInterval", self.MONITORING_INTERVAL )
    log.info( "Minimum monitoring interval = ", str( self.MONITORING_INTERVAL ) )
    self.__ftsVersion = Operations().getValue( 'DataManagement/FTSVersion', 'FTS2' )
    log.info( "FTSVersion : %s" % self.__ftsVersion )
    log.info( "initialize: creation of FTSPlacement..." )
    createPlacement = self.resetFTSPlacement()
    if not createPlacement["OK"]:
      log.error( "initialize:", createPlacement["Message"] )
      return createPlacement
    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    log.info( "will use DataManager proxy" )
    self.registrationProtocols = DMSHelpers().getRegistrationProtocols()
    # # gMonitor stuff here
    gMonitor.registerActivity( "RequestsAtt", "Attempted requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsOK", "Successful requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsFail", "Failed requests executions",
                               "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubAtt", "FTSJobs creation attempts",
                               "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubOK", "FTSJobs submitted successfully",
                               "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubFail", "FTSJobs submissions failed",
                               "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonAtt", "FTSJobs monitored",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonOK", "FTSJobs monitored successfully",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonFail", "FTSJobs attempts failed",
                               "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSMonitorFail", "Failed FTS monitor executions",
                               "FTSAgent", "Execution/mins", gMonitor.OP_SUM )
    pollingTime = self.am_getOption( "PollingTime", 60 )
    # one per-cycle activity per FTSJob state
    for status in list( FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES ):
      gMonitor.registerActivity( "FTSJobs%s" % status, "FTSJobs %s" % status ,
                                 "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime )
    gMonitor.registerActivity( "FtSJobsPerRequest", "Average FTSJobs per request",
                               "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSFilesPerJob", "FTSFiles per FTSJob",
                               "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN )
gMonitor.registerActivity( "FTSSizePerJob", "Average FTSFiles size per FTSJob", "FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN ) return S_OK() def finalize( self ): """ finalize processing """ # log = self.log.getSubLogger( "finalize" ) # if self.__reqCache: # log.info( 'putting back %d requests from cache' % len( self.__reqCache ) ) # else: # log.info( 'no requests to put back' ) # for request in self.__reqCache.values(): # put = self.requestClient().putRequest( request ) # if not put["OK"]: # log.error( "unable to put back request '%s': %s" % ( request.RequestName, put["Message"] ) ) return S_OK() def execute( self ): """ one cycle execution """ # Don't use the server certificate otherwise the DFC wont let us write gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' ) log = gLogger.getSubLogger( "execute" ) # # reset FTSPlacement if expired now = datetime.datetime.now() if now > self.__ftsPlacementValidStamp: log.info( "resetting expired FTS placement..." 
) resetFTSPlacement = self.resetFTSPlacement() if not resetFTSPlacement["OK"]: log.error( "FTSPlacement recreation error:" , resetFTSPlacement["Message"] ) return resetFTSPlacement self.__ftsPlacementValidStamp = now + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH ) requestIDs = self.requestClient().getRequestIDsList( statusList = [ "Scheduled" ], limit = self.MAX_REQUESTS ) if not requestIDs["OK"]: log.error( "unable to read scheduled request ids" , requestIDs["Message"] ) return requestIDs if not requestIDs["Value"]: requestIDs = [] else: requestIDs = [ req[0] for req in requestIDs["Value"] if req[0] not in self.__reqCache ] requestIDs += self.__reqCache.keys() if not requestIDs: log.info( "no 'Scheduled' requests to process" ) return S_OK() log.info( "found %s requests to process:" % len( requestIDs ) ) log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) ) log.info( " => new read from RMS: %s" % ( len( requestIDs ) - len( self.__reqCache ) ) ) for requestID in requestIDs: request = self.getRequest( requestID ) if not request["OK"]: log.error( "Error getting request", "%s: %s" % ( requestID, request["Message"] ) ) continue request = request["Value"] sTJId = request.RequestID while True: queue = self.threadPool().generateJobAndQueueIt( self.processRequest, args = ( request, ), sTJId = sTJId ) if queue["OK"]: log.info( "Request enqueued for execution", sTJId ) gMonitor.addMark( "RequestsAtt", 1 ) break time.sleep( 1 ) # # process all results self.threadPool().processAllResults() return S_OK() def processRequest( self, request ): """ process one request :param Request request: ReqDB.Request """ log = self.log.getSubLogger( "req_%s/%s" % ( request.RequestID, request.RequestName ) ) operation = request.getWaiting() if not operation["OK"]: log.error( "Unable to find 'Scheduled' ReplicateAndRegister operation in request" ) return self.putRequest( request ) operation = operation["Value"] if not isinstance( operation, Operation ): log.error( 
# --- processRequest (continued): validation tail, job monitoring and FTSFile triage ---
    if operation.Type != "ReplicateAndRegister":
      log.error( "operation to be executed is not a ReplicateAndRegister but", operation.Type )
      return self.putRequest( request )
    if operation.Status != "Scheduled":
      log.error( "operation in a wrong state, expecting 'Scheduled', got", operation.Status )
      return self.putRequest( request )
    log.info( 'start processRequest' )
    # # select FTSJobs, by default all in TRANS_STATES and INIT_STATES
    ftsJobs = self.ftsClient().getFTSJobsForRequest( request.RequestID )
    if not ftsJobs["OK"]:
      log.error( ftsJobs["Message"] )
      return ftsJobs
    ftsJobs = [ftsJob for ftsJob in ftsJobs.get( "Value", [] ) if ftsJob.Status not in FTSJob.FINALSTATES]
    # # Use a try: finally: for making sure FTS jobs are put back before returning
    try:
      # # dict keeping info about files to reschedule, submit, fail and register
      ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) )
      now = datetime.datetime.utcnow()
      # monitor a job only if it has been idle longer than MONITORING_INTERVAL
      # (3x the interval for tape sources).
      # NOTE(review): `( now - job.LastUpdate ).seconds` ignores the `.days` part of the
      # timedelta, so the comparison wraps for jobs idle more than a day —
      # `.total_seconds()` was presumably intended; confirm before changing.
      jobsToMonitor = [job for job in ftsJobs
                       if ( now - job.LastUpdate ).seconds >
                       ( self.MONITORING_INTERVAL *
                         ( 3. if StorageElement( job.SourceSE ).getStatus().get( 'Value', {} ).get( 'TapeSE' ) else 1. ) )]
      if jobsToMonitor:
        log.info( "==> found %s FTSJobs to monitor" % len( jobsToMonitor ) )
        # # PHASE 0 = monitor active FTSJobs
        for ftsJob in jobsToMonitor:
          monitor = self.__monitorJob( request, ftsJob )
          if not monitor["OK"]:
            log.error( "unable to monitor FTSJob", "%s: %s" % ( ftsJob.FTSJobID, monitor["Message"] ) )
            ftsJob.Status = "Submitted"
          else:
            ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, monitor["Value"] )
        log.info( "monitoring of FTSJobs completed" )
        for key, ftsFiles in ftsFilesDict.iteritems():
          if ftsFiles:
            log.info( " => %d FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) )
      if len( ftsJobs ) != len( jobsToMonitor ):
        log.info( "==> found %d FTSJobs that were monitored recently" % ( len( ftsJobs ) - len( jobsToMonitor ) ) )
      if not jobsToMonitor:
        # Nothing to happen this time, escape
        raise EscapeTryException
      # # PHASE ONE - check ready replicas
      missingReplicas = self.__checkReadyReplicas( request, operation )
      if not missingReplicas["OK"]:
        log.error( missingReplicas["Message"] )
      else:
        missingReplicas = missingReplicas["Value"]
        for opFile in operation:
          # Actually the condition below should never happen... Change printout for checking
          if opFile.LFN not in missingReplicas and opFile.Status not in ( 'Done', 'Failed' ):
            log.warn( "File should be set Done! %s is replicated at all targets" % opFile.LFN )
            opFile.Status = "Done"
        if missingReplicas:
          # Check if these files are in the FTSDB
          ftsFiles = self.ftsClient().getAllFTSFilesForRequest( request.RequestID )
          if not ftsFiles['OK']:
            log.error( ftsFiles['Message'] )
          else:
            ftsFiles = ftsFiles['Value']
            ftsLfns = set( [ftsFile.LFN for ftsFile in ftsFiles] )
            # Recover files not in FTSDB
            toSchedule = set( missingReplicas ) - ftsLfns
            if toSchedule:
              log.warn( '%d files in operation are not in FTSDB, reset them Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # Recover files with target not in FTSDB
            toSchedule = set( [missing for missing, missingSEs in missingReplicas.iteritems()
                               if not [ftsFile for ftsFile in ftsFiles
                                       if ftsFile.LFN == missing and ftsFile.TargetSE in missingSEs]] )
            if toSchedule:
              log.warn( '%d targets in operation are not in FTSDB, reset files Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # identify missing LFNs that are waiting for a replication which is finished
            for ftsFile in [f for f in ftsFiles if f.LFN in missingReplicas and f.Status.startswith( 'Waiting#' )]:
              targetSE = ftsFile.Status.split( '#' )[1]
              finishedFiles = [f for f in ftsFiles
                               if f.LFN == ftsFile.LFN and f.Status == 'Finished' and
                               f.TargetSE == targetSE and f not in ftsFilesDict['toUpdate']]
              if finishedFiles:
                log.warn( "%s is %s while replication was Finished to %s, update" % ( ftsFile.LFN, ftsFile.Status, targetSE ) )
                ftsFilesDict['toUpdate'] += finishedFiles
            # identify Active transfers for which there is no FTS job any longer and reschedule them
            for ftsFile in [f for f in ftsFiles if f.Status == 'Active' and f.TargetSE in missingReplicas.get( f.LFN, [] )]:
              if not [ftsJob for ftsJob in ftsJobs if ftsJob.FTSGUID == ftsFile.FTSGUID]:
                ftsFilesDict['toReschedule'].append( ftsFile )
            # identify Finished transfer for which the replica is still missing
            for ftsFile in [f for f in ftsFiles
                            if f.Status == 'Finished' and
                            f.TargetSE in missingReplicas.get( f.LFN, [] ) and
                            f not in ftsFilesDict['toRegister'] ]:
              # Check if there is a registration operation for that file and that target
              regOp = [op for op in request
                       if op.Type == 'RegisterReplica' and op.TargetSE == ftsFile.TargetSE and
                       [f for f in op if f.LFN == ftsFile.LFN]]
              if not regOp:
                ftsFilesDict['toReschedule'].append( ftsFile )
            # Recover files that are Failed but were not spotted
            for ftsFile in [f for f in ftsFiles if f.Status == 'Failed' and f.TargetSE in missingReplicas.get( f.LFN, [] )]:
              reschedule, submit, fail = self.__checkFailed( ftsFile )
              if fail and ftsFile not in ftsFilesDict['toFail']:
                ftsFilesDict['toFail'].append( ftsFile )
              elif reschedule and ftsFile not in ftsFilesDict['toReschedule']:
                ftsFilesDict['toReschedule'].append( ftsFile )
              elif submit and ftsFile not in ftsFilesDict['toSubmit']:
                ftsFilesDict['toSubmit'].append( ftsFile )
            # If all transfers are finished for unregistered files and there is already a registration operation, set it Done
            ftsLFNs = [f.LFN for f in ftsFiles]
            for lfn in missingReplicas:
              # We make sure here that the file is being processed by FTS
              if lfn in ftsLFNs:
                if not [f for f in ftsFiles
                        if f.LFN == lfn and
                        ( f.Status != 'Finished' or f in ftsFilesDict['toReschedule'] or f in ftsFilesDict['toRegister'] )]:
                  for opFile in operation:
                    if opFile.LFN == lfn:
                      opFile.Status = 'Done'
                      break
              else:
                # Temporary log
                log.warn( "File with missing replica not in FTS files", lfn )
      for key, ftsFiles in ftsFilesDict.iteritems():
        if ftsFiles:
          log.info( " => %d FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) )
      toFail = ftsFilesDict.get( "toFail", [] )
      toReschedule = ftsFilesDict.get( "toReschedule", [] )
      toSubmit = ftsFilesDict.get( "toSubmit", [] )
      toRegister = ftsFilesDict.get( "toRegister", [] )
      toUpdate = ftsFilesDict.get( "toUpdate", [] )
      # # PHASE TWO = Failed files? -> make request Failed and return
      if toFail:
        log.error( "==> found %d 'Failed' FTSFiles, but maybe other files can be processed..." % len( toFail ) )
        for opFile in operation:
          for ftsFile in toFail:
            if opFile.FileID == ftsFile.FileID:
              opFile.Error = ftsFile.Error
              opFile.Status = "Failed"
        operation.Error = "%s files are missing any replicas" % len( toFail )
        # # requets.Status should be Failed if all files in the operation "Failed"
        if request.Status == "Failed":
          request.Error = "ReplicateAndRegister %s failed" % operation.Order
          log.error( "request is set to 'Failed'" )
          # # putRequest is done by the finally: clause... Not good to do it twice
          raise EscapeTryException
      # # PHASE THREE - update Waiting#TargetSE FTSFiles
      if toUpdate:
        log.info( "==> found %s possible FTSFiles to update..." % ( len( toUpdate ) ) )
        byTarget = {}
        for ftsFile in toUpdate:
          byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile.FileID )
        for targetSE, fileIDList in byTarget.iteritems():
          update = self.ftsClient().setFTSFilesWaiting( operation.OperationID, targetSE, fileIDList )
          if not update["OK"]:
            log.error( "update FTSFiles failed:", update["Message"] )
      # # PHASE FOUR - add 'RegisterReplica' Operations
      if toRegister:
        log.info( "==> found %d Files waiting for registration, adding 'RegisterReplica' operations" % len( toRegister ) )
        registerFiles = self.__insertRegisterOperation( request, operation, toRegister )
        if not registerFiles["OK"]:
          log.error( "unable to create 'RegisterReplica' operations:", registerFiles["Message"] )
        # if request.Status == "Waiting":
        #   log.info( "request is in 'Waiting' state, will put it back to RMS" )
        #   return self.putRequest( request )
      # # PHASE FIVE - reschedule operation files
      if toReschedule:
        log.info( "==> found %s Files to reschedule" % len( toReschedule ) )
        rescheduleFiles = self.__reschedule( request, operation, toReschedule )
        if not rescheduleFiles["OK"]:
          log.error( 'Failed to reschedule files', rescheduleFiles["Message"] )
      # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs. We get also Failed files to recover them if needed
      ftsFiles = self.ftsClient().getFTSFilesForRequest( request.RequestID, [ "Waiting", "Failed", 'Submitted', 'Canceled' ] )
      if not ftsFiles["OK"]:
        log.error( ftsFiles["Message"] )
      else:
        retryIds = set ( [ ftsFile.FTSFileID for ftsFile in toSubmit ] )
        for ftsFile in ftsFiles["Value"]:
          if ftsFile.FTSFileID not in retryIds:
            if ftsFile.Status in ( 'Failed', 'Canceled' ):
              # If the file was not unrecoverable failed and is not yet set toSubmit
              _reschedule, submit, _fail = self.__checkFailed( ftsFile )
            elif ftsFile.Status == 'Submitted':
              if ftsFile.FTSGUID not in [job.FTSGUID for job in ftsJobs]:
                log.warn( 'FTS GUID %s not found in FTS jobs, resubmit file transfer' % ftsFile.FTSGUID )
                ftsFile.Status = 'Waiting'
                submit = True
              else:
                submit = False
            else:
              submit = True
            if submit:
              toSubmit.append( ftsFile )
              retryIds.add( ftsFile.FTSFileID )
      # # should not put back jobs that have not been monitored this time
      ftsJobs = jobsToMonitor
      # # submit new ftsJobs
      if toSubmit:
        if request.Status != 'Scheduled':
          log.info( "Found %d FTSFiles to submit while request is no longer in Scheduled status (%s)" \
                    % ( len( toSubmit ), request.Status ) )
        else:
          self.__checkDuplicates( request.RequestID, toSubmit )
          log.info( "==> found %s FTSFiles to submit" % len( toSubmit ) )
          submit = self.__submit( request, operation, toSubmit )
          if not submit["OK"]:
            log.error( submit["Message"] )
          else:
            ftsJobs += submit["Value"]
- put back request if request.Status != "Scheduled": log.info( "request no longer in 'Scheduled' state (%s), will put it back to RMS" % request.Status ) except EscapeTryException: # This clause is raised when one wants to return from within the try: clause # only put back jobs that were monitored ftsJobs = jobsToMonitor except Exception as exceptMessage: log.exception( "Exception in processRequest", lException = exceptMessage ) finally: putRequest = self.putRequest( request, clearCache = ( request.Status != "Scheduled" ) ) if not putRequest["OK"]: log.error( "unable to put back request:", putRequest["Message"] ) # # put back jobs in all cases if ftsJobs: for ftsJob in list( ftsJobs ): if not len( ftsJob ): log.warn( 'FTS job empty, removed: %s' % ftsJob.FTSGUID ) self.ftsClient().deleteFTSJob( ftsJob.FTSJobID ) ftsJobs.remove( ftsJob ) putJobs = self.putFTSJobs( ftsJobs ) if not putJobs["OK"]: log.error( "unable to put back FTSJobs:", putJobs["Message"] ) putRequest = putJobs # This is where one returns from after execution of the finally: block return putRequest def __checkDuplicates( self, reqID, toSubmit ): """ Check in a list of FTSFiles whether there are duplicates """ tupleList = [] log = self.log.getSubLogger( "%s/checkDuplicates" % reqID ) for ftsFile in list( toSubmit ): fTuple = ( ftsFile.LFN, ftsFile.SourceSE, ftsFile.TargetSE ) if fTuple in tupleList: log.warn( "Duplicate file to submit, removed:", ', '.join( fTuple ) ) toSubmit.remove( ftsFile ) self.ftsClient().deleteFTSFiles( ftsFile.OperationID, [ftsFile.FileID] ) else: tupleList.append( fTuple ) def __reschedule( self, request, operation, toReschedule ): """ reschedule list of :toReschedule: files in request for operation :operation: :param Request request: :param Operation operation: :param list toReschedule: list of FTSFiles """ log = self.log.getSubLogger( "req_%s/%s/reschedule" % ( request.RequestID, request.RequestName ) ) ftsFileIDs = [ftsFile.FileID for ftsFile in toReschedule] for opFile in 
operation: if opFile.FileID in ftsFileIDs: opFile.Status = "Waiting" toSchedule = [] # # filter files for opFile in [ opFile for opFile in operation if opFile.Status == "Waiting" ]: replicas = self.__filterReplicas( opFile ) if not replicas["OK"]: continue replicas = replicas["Value"] validReplicas = replicas["Valid"] noMetaReplicas = replicas["NoMetadata"] noReplicas = replicas["NoReplicas"] badReplicas = replicas['Bad'] if validReplicas: validTargets = list( set( operation.targetSEList ) - set( validReplicas ) ) if not validTargets: log.info( "file %s is already present at all targets" % opFile.LFN ) opFile.Status = "Done" else: toSchedule.append( ( opFile.toJSON()["Value"], validReplicas, validTargets ) ) elif noMetaReplicas: log.warn( "unable to schedule '%s', couldn't get metadata at %s" % ( opFile.LFN, ','.join( noMetaReplicas ) ) ) elif noReplicas: log.warn( "unable to schedule %s, file doesn't exist at %s" % ( opFile.LFN, ','.join( noReplicas ) ) ) opFile.Status = 'Failed' elif badReplicas: log.warn( "unable to schedule %s, all replicas have a bad checksum at %s" % ( opFile.LFN, ','.join( badReplicas ) ) ) opFile.Status = 'Failed' # # do real schedule here if toSchedule: log.info( "Rescheduling %d files" % len( toReschedule ) ) ftsSchedule = self.ftsClient().ftsSchedule( request.RequestID, operation.OperationID, toSchedule ) if not ftsSchedule["OK"]: log.error( "Error scheduling files", ftsSchedule["Message"] ) return ftsSchedule ftsSchedule = ftsSchedule["Value"] for opFile in operation: fileID = opFile.FileID if fileID in ftsSchedule["Successful"]: opFile.Status = "Scheduled" elif fileID in ftsSchedule["Failed"]: opFile.Error = ftsSchedule["Failed"][fileID] log.error( "Error scheduling file %s" % opFile.LFN, opFile.Error ) return S_OK() def __submit( self, request, operation, toSubmit ): """ create and submit new FTSJobs using list of FTSFiles :param Request request: ReqDB.Request instance :param list ftsFiles: list of FTSFile instances :return: [ FTSJob, 
FTSJob, ...] """ log = self.log.getSubLogger( "req_%s/%s/submit" % ( request.RequestID, request.RequestName ) ) bySourceAndTarget = {} for ftsFile in toSubmit: if ftsFile.SourceSE not in bySourceAndTarget: bySourceAndTarget.setdefault( ftsFile.SourceSE, {} ) if ftsFile.TargetSE not in bySourceAndTarget[ftsFile.SourceSE]: bySourceAndTarget[ftsFile.SourceSE].setdefault( ftsFile.TargetSE, [] ) bySourceAndTarget[ftsFile.SourceSE][ftsFile.TargetSE].append( ftsFile ) ftsJobs = [] for source, targetDict in bySourceAndTarget.iteritems(): for target, ftsFileList in targetDict.iteritems(): log.info( "found %s files to submit from %s to %s" % ( len( ftsFileList ), source, target ) ) route = self.__ftsPlacement.findRoute( source, target ) if not route["OK"]: log.error( route["Message"] ) continue route = route["Value"] routeValid = self.__ftsPlacement.isRouteValid( route ) if not routeValid['OK']: log.error( "Route invalid : %s" % routeValid['Message'] ) continue sourceSE = StorageElement( source ) sourceToken = sourceSE.getStorageParameters( protocol = 'srm' ) if not sourceToken["OK"]: log.error( "unable to get sourceSE parameters:", "(%s) %s" % ( source, sourceToken["Message"] ) ) continue seStatus = sourceSE.getStatus()['Value'] targetSE = StorageElement( target ) targetToken = targetSE.getStorageParameters( protocol = 'srm' ) if not targetToken["OK"]: log.error( "unable to get targetSE parameters:", "(%s) %s" % ( target, targetToken["Message"] ) ) continue # # create FTSJob for fileList in breakListIntoChunks( ftsFileList, self.MAX_FILES_PER_JOB ): ftsJob = FTSJob() ftsJob.RequestID = request.RequestID ftsJob.OperationID = operation.OperationID ftsJob.SourceSE = source ftsJob.TargetSE = target ftsJob.SourceToken = sourceToken["Value"].get( "SpaceToken", "" ) ftsJob.TargetToken = targetToken["Value"].get( "SpaceToken", "" ) ftsJob.FTSServer = route.ftsServer for ftsFile in fileList: ftsFile.Attempt += 1 ftsFile.Error = "" ftsJob.addFile( ftsFile ) submit = ftsJob.submitFTS( 
self.__ftsVersion, command = self.SUBMIT_COMMAND, pinTime = self.PIN_TIME if seStatus['TapeSE'] else 0 ) if not submit["OK"]: log.error( "unable to submit FTSJob:", submit["Message"] ) continue log.info( "FTSJob '%s'@'%s' has been submitted" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) ) # # update statuses for job files for ftsFile in ftsJob: ftsFile.FTSGUID = ftsJob.FTSGUID ftsFile.Status = "Submitted" ftsFile.Attempt += 1 # # update placement route try: self.updateLock().acquire() self.__ftsPlacement.startTransferOnRoute( route ) finally: self.updateLock().release() ftsJobs.append( ftsJob ) log.info( "%s new FTSJobs have been submitted" % len( ftsJobs ) ) return S_OK( ftsJobs ) def __monitorJob( self, request, ftsJob ): """ execute FTSJob.monitorFTS for a given :ftsJob: if ftsJob is in a final state, finalize it :param Request request: ReqDB.Request instance :param FTSJob ftsJob: FTSDB.FTSJob instance """ log = self.log.getSubLogger( "req_%s/%s/monitor/%s" % ( request.RequestID, request.RequestName, ftsJob.FTSGUID ) ) log.info( "FTSJob '%s'@'%s'" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) ) # # this will be returned ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ) monitor = ftsJob.monitorFTS( self.__ftsVersion , command = self.MONITOR_COMMAND ) if not monitor["OK"]: gMonitor.addMark( "FTSMonitorFail", 1 ) log.error( monitor["Message"] ) if "getTransferJobSummary2: Not authorised to query request" in monitor["Message"] or \ 'was not found' in monitor['Message'] or\ "Not found" in monitor['Message'] or\ 'Unknown transfer state' in monitor['Message']: log.error( "FTSJob not known (expired on server?): delete it" ) for ftsFile in ftsJob: ftsFile.Status = "Waiting" ftsFilesDict["toSubmit"].append( ftsFile ) # # No way further for that job: delete it res = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID ) if not res['OK']: log.error( "Unable to delete FTSJob", res['Message'] ) return S_OK( ftsFilesDict ) return 
monitor monitor = monitor["Value"] log.info( "FTSJob Status = %s Completeness = %s%%" % ( ftsJob.Status, ftsJob.Completeness ) ) # # monitor status change gMonitor.addMark( "FTSJobs%s" % ftsJob.Status, 1 ) if ftsJob.Status in FTSJob.FINALSTATES: finalizeFTSJob = self.__finalizeFTSJob( request, ftsJob ) if not finalizeFTSJob["OK"]: if 'Unknown transfer state' in finalizeFTSJob['Message']: for ftsFile in ftsJob: ftsFile.Status = "Waiting" ftsFilesDict["toSubmit"].append( ftsFile ) # # No way further for that job: delete it res = self.ftsClient().deleteFTSJob( ftsJob.FTSJobID ) if not res['OK']: log.error( "Unable to delete FTSJob", res['Message'] ) else: log.error( finalizeFTSJob["Message"] ) return finalizeFTSJob else: ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, finalizeFTSJob["Value"] ) return S_OK( ftsFilesDict ) def __finalizeFTSJob( self, request, ftsJob ): """ finalize FTSJob :param Request request: ReqDB.Request instance :param FTSJob ftsJob: FTSDB.FTSJob instance """ log = self.log.getSubLogger( "req_%s/%s/monitor/%s/finalize" % ( request.RequestID, request.RequestName, ftsJob.FTSJobID ) ) log.info( "finalizing FTSJob %s@%s" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) ) # # this will be returned ftsFilesDict = dict( ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ) monitor = ftsJob.monitorFTS( self.__ftsVersion, command = self.MONITOR_COMMAND, full = True ) if not monitor["OK"]: log.error( monitor["Message"] ) return monitor # # split FTSFiles to different categories processFiles = self.__filterFiles( ftsJob ) if not processFiles["OK"]: log.error( processFiles["Message"] ) return processFiles processFiles = processFiles['Value'] if processFiles['toRegister']: log.error( "Some files could not be registered in FC:", len( processFiles['toRegister'] ) ) ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, processFiles ) # # send accounting record for this job self.__sendAccounting( ftsJob, request.OwnerDN ) # # update 
placement - remove this job from placement route = self.__ftsPlacement.findRoute( ftsJob.SourceSE, ftsJob.TargetSE ) if route["OK"]: try: self.updateLock().acquire() self.__ftsPlacement.finishTransferOnRoute( route['Value'] ) finally: self.updateLock().release() log.info( "FTSJob is finalized" ) return S_OK( ftsFilesDict ) def __checkFailed( self, ftsFile ): reschedule = False submit = False fail = False if ftsFile.Status in ( "Failed", 'Canceled' ): if ftsFile.Error == "MissingSource": reschedule = True else: if ftsFile.Attempt < self.MAX_ATTEMPT: submit = True else: fail = True return reschedule, submit, fail def __filterFiles( self, ftsJob ): """ process ftsFiles from finished ftsJob :param FTSJob ftsJob: monitored FTSJob instance """ # # lists for different categories toUpdate = [] toReschedule = [] toRegister = [] toSubmit = [] toFail = [] # # loop over files in fts job for ftsFile in ftsJob: # # successful files if ftsFile.Status == "Finished": if ftsFile.Error == "AddCatalogReplicaFailed": toRegister.append( ftsFile ) toUpdate.append( ftsFile ) continue reschedule, submit, fail = self.__checkFailed( ftsFile ) if reschedule: toReschedule.append( ftsFile ) elif submit: toSubmit.append( ftsFile ) elif fail: toFail.append( ftsFile ) return S_OK( { "toUpdate": toUpdate, "toSubmit": toSubmit, "toRegister": toRegister, "toReschedule": toReschedule, "toFail": toFail } ) def __insertRegisterOperation( self, request, operation, toRegister ): """ add RegisterReplica operation :param Request request: request instance :param Operation transferOp: 'ReplicateAndRegister' operation for this FTSJob :param list toRegister: [ FTSDB.FTSFile, ... 
] - files that failed to register """ log = self.log.getSubLogger( "req_%s/%s/registerFiles" % ( request.RequestID, request.RequestName ) ) byTarget = {} for ftsFile in toRegister: if ftsFile.TargetSE not in byTarget: byTarget.setdefault( ftsFile.TargetSE, [] ) byTarget[ftsFile.TargetSE].append( ftsFile ) log.info( "will create %s 'RegisterReplica' operations" % len( byTarget ) ) for target, ftsFileList in byTarget.iteritems(): log.info( "creating 'RegisterReplica' operation for targetSE %s with %s files..." % ( target, len( ftsFileList ) ) ) registerOperation = Operation() registerOperation.Type = "RegisterReplica" registerOperation.Status = "Waiting" registerOperation.TargetSE = target targetSE = StorageElement( target ) for ftsFile in ftsFileList: opFile = File() opFile.LFN = ftsFile.LFN pfn = returnSingleResult( targetSE.getURL( ftsFile.LFN, protocol = self.registrationProtocols ) ) if not pfn["OK"]: continue opFile.PFN = pfn["Value"] registerOperation.addFile( opFile ) request.insertBefore( registerOperation, operation ) return S_OK() @staticmethod def __sendAccounting( ftsJob, ownerDN ): """ prepare and send DataOperation to AccouringDB """ dataOp = DataOperation() dataOp.setStartTime( fromString( ftsJob.SubmitTime ) ) dataOp.setEndTime( fromString( ftsJob.LastUpdate ) ) accountingDict = dict() accountingDict["OperationType"] = "ReplicateAndRegister" username = getUsernameForDN( ownerDN ) if not username["OK"]: username = ownerDN else: username = username["Value"] accountingDict["User"] = username accountingDict["Protocol"] = "FTS3" if 'fts3' in ftsJob.FTSServer.lower() else 'FTS' accountingDict['ExecutionSite'] = ftsJob.FTSServer accountingDict['RegistrationTime'] = ftsJob._regTime accountingDict['RegistrationOK'] = ftsJob._regSuccess accountingDict['RegistrationTotal'] = ftsJob._regTotal accountingDict["TransferOK"] = len( [ f for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES ] ) accountingDict["TransferTotal"] = len( ftsJob ) 
accountingDict["TransferSize"] = ftsJob.Size - ftsJob.FailedSize accountingDict["FinalStatus"] = ftsJob.Status accountingDict["Source"] = ftsJob.SourceSE accountingDict["Destination"] = ftsJob.TargetSE # dt = ftsJob.LastUpdate - ftsJob.SubmitTime # transferTime = dt.days * 86400 + dt.seconds # accountingDict["TransferTime"] = transferTime accountingDict['TransferTime'] = sum( int( f._duration ) for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES ) dataOp.setValuesFromDict( accountingDict ) dataOp.commit() def __checkReadyReplicas( self, request, operation ): """ check ready replicas for transferOperation """ log = self.log.getSubLogger( "req_%s/%s/checkReadyReplicas" % ( request.RequestID, request.RequestName ) ) targetSESet = set( operation.targetSEList ) # # { LFN: [ targetSE, ... ] } missingReplicas = {} scheduledFiles = dict( ( opFile.LFN, opFile ) for opFile in operation if opFile.Status in ( "Scheduled", "Waiting" ) ) # # get replicas replicas = FileCatalog().getReplicas( scheduledFiles.keys() ) if not replicas["OK"]: self.log.error( replicas["Message"] ) return replicas replicas = replicas["Value"] fullyReplicated = 0 missingSEs = {} for successfulLFN in replicas["Successful"]: reps = set( replicas['Successful'][successfulLFN] ) if targetSESet.issubset( reps ): log.verbose( "%s has been replicated to all targets" % successfulLFN ) fullyReplicated += 1 scheduledFiles[successfulLFN].Status = "Done" else: missingReplicas[successfulLFN] = sorted( targetSESet - reps ) ses = ",".join( missingReplicas[ successfulLFN ] ) missingSEs[ses] = missingSEs.setdefault( ses, 0 ) + 1 log.verbose( "%s is still missing at %s" % ( successfulLFN, ses ) ) if fullyReplicated: log.info( "%d new files have been replicated to all targets" % fullyReplicated ) if missingSEs: for ses in missingSEs: log.info( "%d replicas still missing at %s" % ( missingSEs[ses], ses ) ) reMissing = re.compile( "no such file or directory" ) for failedLFN, errStr in replicas["Failed"].iteritems(): 
scheduledFiles[failedLFN].Error = errStr if reMissing.search( errStr.lower() ): log.error( "%s is missing, setting its status to 'Failed'" % failedLFN ) scheduledFiles[failedLFN].Status = "Failed" else: log.warn( "unable to read replicas for %s: %s" % ( failedLFN, errStr ) ) return S_OK( missingReplicas ) def __filterReplicas( self, opFile ): """ filter out banned/invalid source SEs """ from DIRAC.DataManagementSystem.Agent.RequestOperations.ReplicateAndRegister import filterReplicas return filterReplicas( opFile, logger = self.log, dataManager = self.dataManager )
class FTSAgent( AgentModule ):
  """
  .. class:: FTSAgent

  Agent propagating Scheduled request to Done or Failed state in the FTS system.

  Requests and associated FTSJobs (and so FTSFiles) are kept in cache.

  NOTE(review): this looks like a second, FTSGraph-based variant of the FTSAgent
  class also present earlier in this file (FTSPlacement-based) -- confirm whether
  both are meant to coexist.
  """
  # # fts graph refresh in seconds
  FTSGRAPH_REFRESH = FTSHistoryView.INTERVAL / 2
  # # SE R/W access refresh in seconds
  RW_REFRESH = 600
  # # placeholder for max job per channel
  MAX_ACTIVE_JOBS = 50
  # # min threads
  MIN_THREADS = 1
  # # max threads
  MAX_THREADS = 10
  # # files per job
  MAX_FILES_PER_JOB = 100
  # # MAX FTS transfer per FTSFile
  MAX_ATTEMPT = 256
  # # stage flag
  PIN_TIME = 0
  # # FTS submission command
  SUBMIT_COMMAND = 'glite-transfer-submit'
  # # FTS monitoring command
  MONITOR_COMMAND = 'glite-transfer-status'
  # # placeholder for FTS client
  __ftsClient = None
  # # placeholder for request client
  __requestClient = None
  # # placeholder for resources helper
  __resources = None
  # # placeholder for RSS client
  __rssClient = None
  # # placeholder for FTSGraph
  __ftsGraph = None
  # # graph regeneration time delta
  __ftsGraphValidStamp = None
  # # r/w access valid stamp
  __rwAccessValidStamp = None
  # # placeholder for threadPool
  __threadPool = None
  # # update lock
  __updateLock = None
  # # se cache (shared by all instances -- class-level dict)
  __seCache = dict()
  # # request cache (shared by all instances -- class-level dict)
  __reqCache = dict()

  def updateLock( self ):
    """ update lock -- lazily created, guards FTSGraph mutations """
    if not self.__updateLock:
      self.__updateLock = LockRing().getLock( "FTSAgentLock" )
    return self.__updateLock

  @classmethod
  def requestClient( cls ):
    """ request client getter (lazy singleton) """
    if not cls.__requestClient:
      cls.__requestClient = ReqClient()
    return cls.__requestClient

  @classmethod
  def ftsClient( cls ):
    """ FTS client (lazy singleton) """
    if not cls.__ftsClient:
      cls.__ftsClient = FTSClient()
    return cls.__ftsClient

  @classmethod
  def rssClient( cls ):
    """ RSS client getter (lazy singleton) """
    if not cls.__rssClient:
      cls.__rssClient = ResourceStatus()
    return cls.__rssClient

  @classmethod
  def getSE( cls, seName ):
    """ keep SEs in cache

    :param str seName: storage element name
    :return: cached StorageElement instance for :seName:
    """
    if seName not in cls.__seCache:
      cls.__seCache[seName] = StorageElement( seName )
    return cls.__seCache[seName]

  @classmethod
  def getSECache( cls ):
    # expose the raw class-level SE cache dict
    return cls.__seCache

  @classmethod
  def getRequest( cls, reqName ):
    """ get Requests systematically and refresh cache

    Always re-reads from the ReqDB; on any failure or miss the stale cache
    entry for :reqName: is dropped.

    :return: S_OK( Request ) or S_ERROR
    """
    getRequest = cls.requestClient().getRequest( reqName )
    if not getRequest["OK"]:
      cls.__reqCache.pop( reqName, None )
      return getRequest
    getRequest = getRequest["Value"]
    if not getRequest:
      cls.__reqCache.pop( reqName, None )
      return S_ERROR( "request of name '%s' not found in ReqDB" % reqName )
    cls.__reqCache[reqName] = getRequest
    return S_OK( cls.__reqCache[reqName] )

  @classmethod
  def putRequest( cls, request, clearCache = True ):
    """ put request back to ReqDB

    :param Request request: Request instance
    :param bool clearCache: clear the cache?

    also finalize request if status == Done
    """
    # # put back request -- only requests we actually hold in cache are written back
    if request.RequestName not in cls.__reqCache:
      return S_OK()
    put = cls.requestClient().putRequest( request )
    if not put["OK"]:
      return put
    # # finalize first if possible
    if request.Status == "Done" and request.JobID:
      finalizeRequest = cls.requestClient().finalizeRequest( request.RequestName, request.JobID )
      if not finalizeRequest["OK"]:
        # finalization failed: push the request back to Scheduled so it is retried
        request.Status = "Scheduled"
    # # del request from cache if needed
    if clearCache:
      cls.__reqCache.pop( request.RequestName, None )
    return S_OK()

  @classmethod
  def putFTSJobs( cls, ftsJobsList ):
    """ put back fts jobs to the FTSDB

    Stops and returns the error on the first failing putFTSJob.
    """
    for ftsJob in ftsJobsList:
      put = cls.ftsClient().putFTSJob( ftsJob )
      if not put["OK"]:
        return put
    return S_OK()

  @staticmethod
  def updateFTSFileDict( ftsFilesDict, toUpdateDict ):
    """ update :ftsFilesDict: with FTSFiles in :toUpdateDict:

    Merges per-category lists in place, skipping duplicates; returns the
    (mutated) :ftsFilesDict:.
    """
    for category, ftsFileList in ftsFilesDict.items():
      for ftsFile in toUpdateDict.get( category, [] ):
        if ftsFile not in ftsFileList:
          ftsFileList.append( ftsFile )
    return ftsFilesDict

#  def resources( self ):
#    """ resource helper getter """
#    if not self.__resources:
#      self.__resources = Resources()
#    return self.__resources

  def threadPool( self ):
    """ thread pool getter -- lazily created and daemonized """
    if not self.__threadPool:
      self.__threadPool = ThreadPool( self.MIN_THREADS, self.MAX_THREADS )
      self.__threadPool.daemonize()
    return self.__threadPool

  def resetFTSGraph( self ):
    """ create fts graph

    Rebuilds the FTSGraph from FTS history, refreshes SE R/W access, and
    resets both validity stamps. All graph mutations happen under updateLock.

    :return: S_OK() or the FTS history error
    """
    log = gLogger.getSubLogger( "ftsGraph" )

    ftsHistory = self.ftsClient().getFTSHistory()
    if not ftsHistory["OK"]:
      log.error( "unable to get FTS history:", ftsHistory["Message"] )
      return ftsHistory
    ftsHistory = ftsHistory["Value"]

    try:
      self.updateLock().acquire()
      self.__ftsGraph = FTSGraph( "FTSGraph", ftsHistory, maxActiveJobs = self.MAX_ACTIVE_JOBS )
    finally:
      self.updateLock().release()

    log.debug( "FTSSites:", len( self.__ftsGraph.nodes() ) )
    for i, site in enumerate( self.__ftsGraph.nodes() ):
      log.debug( " [%02d] FTSSite: %-25s FTSServer: %s" % ( i, site.name, site.FTSServer ) )

    log.debug( "FTSRoutes: %s" % len( self.__ftsGraph.edges() ) )
    for i, route in enumerate( self.__ftsGraph.edges() ):
      log.debug( " [%02d] FTSRoute: %-25s Active FTSJobs (Max) = %s (%s)" % ( i,
                                                                             route.routeName,
                                                                             route.ActiveJobs,
                                                                             route.toNode.MaxActiveJobs ) )
    # # save graph stamp
    self.__ftsGraphValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH )
    # # refresh SE R/W access
    try:
      self.updateLock().acquire()
      self.__ftsGraph.updateRWAccess()
    finally:
      self.updateLock().release()
    # # save rw access stamp
    self.__rwAccessValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.RW_REFRESH )

    return S_OK()

  def initialize( self ):
    """ agent's initialization

    Reads all tunables from the agent options (falling back to the class
    defaults), builds the initial FTSGraph, sets the shifter proxy and
    registers gMonitor activities.
    """

    # # data manager
    self.dataManager = DataManager()

    log = self.log.getSubLogger( "initialize" )

    self.FTSGRAPH_REFRESH = self.am_getOption( "FTSGraphValidityPeriod", self.FTSGRAPH_REFRESH )
    log.info( "FTSGraph validity period = %s s" % self.FTSGRAPH_REFRESH )
    self.RW_REFRESH = self.am_getOption( "RWAccessValidityPeriod", self.RW_REFRESH )
    log.info( "SEs R/W access validity period = %s s" % self.RW_REFRESH )
    self.SUBMIT_COMMAND = self.am_getOption( "SubmitCommand", self.SUBMIT_COMMAND )
    log.info( "FTS submit command = %s" % self.SUBMIT_COMMAND )
    self.MONITOR_COMMAND = self.am_getOption( "MonitorCommand", self.MONITOR_COMMAND )
    log.info( "FTS commands: submit = %s monitor %s" % ( self.SUBMIT_COMMAND, self.MONITOR_COMMAND ) )
    self.PIN_TIME = self.am_getOption( "PinTime", self.PIN_TIME )
    log.info( "Stage files before submission = ", {True: "yes", False: "no"}[bool( self.PIN_TIME )] )

    self.MAX_ACTIVE_JOBS = self.am_getOption( "MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS )
    log.info( "Max active FTSJobs/route = ", str( self.MAX_ACTIVE_JOBS ) )
    self.MAX_FILES_PER_JOB = self.am_getOption( "MaxFilesPerJob", self.MAX_FILES_PER_JOB )
    log.info( "Max FTSFiles/FTSJob = ", str( self.MAX_FILES_PER_JOB ) )

    self.MAX_ATTEMPT = self.am_getOption( "MaxTransferAttempts", self.MAX_ATTEMPT )
    log.info( "Max transfer attempts = ", str( self.MAX_ATTEMPT ) )

    # # thread pool
    self.MIN_THREADS = self.am_getOption( "MinThreads", self.MIN_THREADS )
    self.MAX_THREADS = self.am_getOption( "MaxThreads", self.MAX_THREADS )
    # normalize: force both positive and min <= max regardless of configuration
    minmax = ( abs( self.MIN_THREADS ), abs( self.MAX_THREADS ) )
    self.MIN_THREADS, self.MAX_THREADS = min( minmax ), max( minmax )
    log.info( "ThreadPool min threads = ", str( self.MIN_THREADS ) )
    log.info( "ThreadPool max threads = ", str( self.MAX_THREADS ) )

    log.info( "initialize: creation of FTSGraph..." )
    createGraph = self.resetFTSGraph()
    if not createGraph["OK"]:
      log.error( "initialize: ", createGraph["Message"] )
      return createGraph

    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    log.info( "will use DataManager proxy" )

    # # gMonitor stuff here
    gMonitor.registerActivity( "RequestsAtt", "Attempted requests executions", "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsOK", "Successful requests executions", "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsFail", "Failed requests executions", "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubAtt", "FTSJobs creation attempts", "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubOK", "FTSJobs submitted successfully", "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubFail", "FTSJobs submissions failed", "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonAtt", "FTSJobs monitored", "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonOK", "FTSJobs monitored successfully", "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonFail", "FTSJobs attempts failed", "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSMonitorFail", "Failed FTS monitor executions", "FTSAgent", "Execution/mins", gMonitor.OP_SUM )

    pollingTime = self.am_getOption( "PollingTime", 60 )
    # one accumulating activity per possible FTSJob state
    for status in list( FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES ):
      gMonitor.registerActivity( "FTSJobs%s" % status, "FTSJobs %s" % status , "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime )

    gMonitor.registerActivity( "FtSJobsPerRequest", "Average FTSJobs per request", "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSFilesPerJob", "FTSFiles per FTSJob", "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSSizePerJob", "Average FTSFiles size per FTSJob", "FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN )
    return S_OK()

  def finalize( self ):
    """ finalize processing """
    # log = self.log.getSubLogger( "finalize" )
    # if self.__reqCache:
    #   log.info( 'putting back %d requests from cache' % len( self.__reqCache ) )
    # else:
    #   log.info( 'no requests to put back' )
    # for request in self.__reqCache.values():
    #   put = self.requestClient().putRequest( request )
    #   if not put["OK"]:
    #     log.error( "unable to put back request '%s': %s" % ( request.RequestName, put["Message"] ) )
    return S_OK()

  def execute( self ):
    """ one cycle execution

    Refreshes the FTSGraph and SE R/W access when their validity stamps have
    expired, then fans out every 'Scheduled' (or cached) request to the thread
    pool via processRequest.
    """
    log = gLogger.getSubLogger( "execute" )
    # # reset FTSGraph if expired
    now = datetime.datetime.now()
    if now > self.__ftsGraphValidStamp:
      log.info( "resetting expired FTS graph..." )
      resetFTSGraph = self.resetFTSGraph()
      if not resetFTSGraph["OK"]:
        log.error( "FTSGraph recreation error: ", resetFTSGraph["Message"] )
        return resetFTSGraph
      self.__ftsGraphValidStamp = now + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH )
    # # update R/W access in FTSGraph if expired
    if now > self.__rwAccessValidStamp:
      log.info( "updating expired R/W access for SEs..."
                )
      try:
        self.updateLock().acquire()
        self.__ftsGraph.updateRWAccess()
      finally:
        self.updateLock().release()
      self.__rwAccessValidStamp = now + datetime.timedelta( seconds = self.RW_REFRESH )

    requestNames = self.requestClient().getRequestNamesList( [ "Scheduled" ] )
    if not requestNames["OK"]:
      log.error( "Unable to read scheduled request names: ", requestNames["Message"] )
      return requestNames
    if not requestNames["Value"]:
      # nothing new in RMS: fall back to whatever is still cached
      requestNames = self.__reqCache.keys()
    else:
      requestNames = [ req[0] for req in requestNames["Value"] ]
      # merge freshly read names with the cached ones, deduplicated
      requestNames = list( set ( requestNames + self.__reqCache.keys() ) )

    if not requestNames:
      log.info( "No 'Scheduled' requests to process" )
      return S_OK()

    log.info( "Found requests to process:", str( len( requestNames ) ) )
    log.info( " => from internal cache:", str( ( len( self.__reqCache ) ) ) )
    log.info( " => new read from RMS:", str( ( len( requestNames ) - len( self.__reqCache ) ) ) )

    for requestName in requestNames:
      request = self.getRequest( requestName )
      if not request["OK"]:
        log.error( "Error getting request", "%s: %s" % ( requestName, request["Message"] ) )
        continue
      request = request["Value"]
      sTJId = request.RequestName
      # busy-wait (1 s steps) until the thread pool accepts the job
      while True:
        queue = self.threadPool().generateJobAndQueueIt( self.processRequest,
                                                         args = ( request, ),
                                                         sTJId = sTJId )
        if queue["OK"]:
          log.info( "Request enqueued for execution", sTJId )
          gMonitor.addMark( "RequestsAtt", 1 )
          break
        time.sleep( 1 )

    # # process all results
    self.threadPool().processAllResults()
    return S_OK()

  def processRequest( self, request ):
    """ process one request

    Drives a single Scheduled ReplicateAndRegister operation through the FTS
    machinery: monitor active FTSJobs, reconcile replica status with the
    catalog, fail/update/register/reschedule files as needed, and submit new
    FTSJobs for whatever is still waiting.

    NOTE(review): the body of the trailing ``finally:`` clause lies outside
    this chunk of the file.

    :param Request request: ReqDB.Request
    """
    log = self.log.getSubLogger( request.RequestName )

    operation = request.getWaiting()
    if not operation["OK"]:
      log.error( "Unable to find 'Scheduled' ReplicateAndRegister operation in request" )
      return self.putRequest( request )
    operation = operation["Value"]
    if not isinstance( operation, Operation ):
      log.error( "Waiting returned operation is not an operation:", type( operation ) )
      return self.putRequest( request )
    if operation.Type != "ReplicateAndRegister":
      log.error( "operation to be executed is not a ReplicateAndRegister but", operation.Type )
      return self.putRequest( request )
    if operation.Status != "Scheduled":
      log.error( "operation in a wrong state, expecting 'Scheduled', got", operation.Status )
      return self.putRequest( request )

    log.info( 'start processRequest' )
    # # select FTSJobs, by default all in TRANS_STATES and INIT_STATES
    ftsJobs = self.ftsClient().getFTSJobsForRequest( request.RequestID )
    if not ftsJobs["OK"]:
      log.error( ftsJobs["Message"] )
      return ftsJobs
    ftsJobs = [ftsJob for ftsJob in ftsJobs.get( "Value", [] ) if ftsJob.Status not in FTSJob.FINALSTATES]

    # # Use a try: finally: for making sure FTS jobs are put back before returning
    try:
      # # dict keeping info about files to reschedule, submit, fail and register
      ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] )

      if ftsJobs:
        log.info( "==> found %s FTSJobs to monitor" % len( ftsJobs ) )
        # # PHASE 0 = monitor active FTSJobs
        for ftsJob in ftsJobs:
          monitor = self.__monitorJob( request, ftsJob )
          if not monitor["OK"]:
            log.error( "unable to monitor FTSJob %s: %s" % ( ftsJob.FTSJobID, monitor["Message"] ) )
            # monitoring failed: keep the job alive, it will be retried next cycle
            ftsJob.Status = "Submitted"
          else:
            ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, monitor["Value"] )

        log.info( "monitoring of FTSJobs completed" )
        for key, ftsFiles in ftsFilesDict.items():
          if ftsFiles:
            log.debug( " => %s FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) )

      # # PHASE ONE - check ready replicas
      missingReplicas = self.__checkReadyReplicas( request, operation )
      if not missingReplicas["OK"]:
        log.error( missingReplicas["Message"] )
      else:
        missingReplicas = missingReplicas["Value"]
        for opFile in operation:
          # Actually the condition below should never happen... Change printout for checking
          if opFile.LFN not in missingReplicas and opFile.Status not in ( 'Done', 'Failed' ):
            log.warn( "File should be set Done! %s is replicated at all targets" % opFile.LFN )
            opFile.Status = "Done"

        if missingReplicas:
          # Check if these files are in the FTSDB
          ftsFiles = self.ftsClient().getAllFTSFilesForRequest( request.RequestID )
          if not ftsFiles['OK']:
            log.error( ftsFiles['Message'] )
          else:
            ftsFiles = ftsFiles['Value']
            ftsLfns = set( [ftsFile.LFN for ftsFile in ftsFiles] )
            # Recover files not in FTSDB
            toSchedule = set( missingReplicas ) - ftsLfns
            if toSchedule:
              log.warn( '%d files in operation are not in FTSDB, reset them Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # Recover files with target not in FTSDB
            toSchedule = set( [missing for missing, missingSEs in missingReplicas.items()
                               if not [ftsFile for ftsFile in ftsFiles
                                       if ftsFile.LFN == missing and ftsFile.TargetSE in missingSEs]] )
            if toSchedule:
              log.warn( '%d targets in operation are not in FTSDB, reset files Waiting' % len( toSchedule ) )
              for opFile in operation:
                if opFile.LFN in toSchedule and opFile.Status == 'Scheduled':
                  opFile.Status = 'Waiting'
            # identify missing LFNs that are waiting for a replication which is finished
            for ftsFile in [f for f in ftsFiles if f.LFN in missingReplicas and f.Status.startswith( 'Waiting#' )]:
              # the target SE is encoded in the status string after the '#'
              targetSE = ftsFile.Status.split( '#' )[1]
              finishedFiles = [f for f in ftsFiles if f.LFN == ftsFile.LFN and
                               f.Status == 'Finished' and f.TargetSE == targetSE and
                               f not in ftsFilesDict['toUpdate']]
              if finishedFiles:
                log.warn( "%s is %s while replication was Finished to %s, update" % ( ftsFile.LFN, ftsFile.Status, targetSE ) )
                ftsFilesDict['toUpdate'] += finishedFiles
            # identify Finished transfer for which the replica is still missing
            for ftsFile in [f for f in ftsFiles if f.Status == 'Finished' and
                            f.TargetSE in missingReplicas.get( f.LFN, [] ) and
                            f not in ftsFilesDict['toRegister'] ]:
              # Check if there is a registration operation for that file and that target
              regOp = [op for op in request if
                       op.Type == 'RegisterReplica' and
                       op.TargetSE == ftsFile.TargetSE and
                       [f for f in op if f.LFN == ftsFile.LFN]]
              if not regOp:
                ftsFilesDict['toReschedule'].append( ftsFile )

      toFail = ftsFilesDict.get( "toFail", [] )
      toReschedule = ftsFilesDict.get( "toReschedule", [] )
      toSubmit = ftsFilesDict.get( "toSubmit", [] )
      toRegister = ftsFilesDict.get( "toRegister", [] )
      toUpdate = ftsFilesDict.get( "toUpdate", [] )

      # # PHASE TWO = Failed files? -> make request Failed and return
      if toFail:
        log.error( "==> found %s 'Failed' FTSFiles, but maybe other files can be processed..." % len( toFail ) )
        for opFile in operation:
          for ftsFile in toFail:
            if opFile.FileID == ftsFile.FileID:
              opFile.Error = ftsFile.Error
              opFile.Status = "Failed"
        operation.Error = "%s files are missing any replicas" % len( toFail )
        # # request.Status should be Failed if all files in the operation "Failed"
        if request.Status == "Failed":
          request.Error = "ReplicateAndRegister %s failed" % operation.Order
          log.error( "request is set to 'Failed'" )
          # # putRequest is done by the finally: clause... Not good to do it twice
          raise escapeTry

      # # PHASE THREE - update Waiting#TargetSE FTSFiles
      if toUpdate:
        log.info( "==> found %s possible FTSFiles to update..." % ( len( toUpdate ) ) )
        byTarget = {}
        for ftsFile in toUpdate:
          byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile.FileID )
        for targetSE, fileIDList in byTarget.items():
          update = self.ftsClient().setFTSFilesWaiting( operation.OperationID, targetSE, fileIDList )
          if not update["OK"]:
            log.error( "update FTSFiles failed:", update["Message"] )

      # # PHASE FOUR - add 'RegisterReplica' Operations
      if toRegister:
        log.info( "==> found %d Files waiting for registration, adding 'RegisterReplica' operations" % len( toRegister ) )
        registerFiles = self.__insertRegisterOperation( request, operation, toRegister )
        if not registerFiles["OK"]:
          log.error( "unable to create 'RegisterReplica' operations:", registerFiles["Message"] )
        # if request.Status == "Waiting":
        #  log.info( "request is in 'Waiting' state, will put it back to RMS" )
        #  return self.putRequest( request )

      # # PHASE FIVE - reschedule operation files
      if toReschedule:
        log.info( "==> found %s Files to reschedule" % len( toReschedule ) )
        rescheduleFiles = self.__reschedule( request, operation, toReschedule )
        if not rescheduleFiles["OK"]:
          log.error( rescheduleFiles["Message"] )

      # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs. We get also Failed files to recover them if needed
      ftsFiles = self.ftsClient().getFTSFilesForRequest( request.RequestID, [ "Waiting", "Failed", 'Submitted', 'Canceled' ] )
      if not ftsFiles["OK"]:
        log.error( ftsFiles["Message"] )
      else:
        retryIds = set ( [ ftsFile.FTSFileID for ftsFile in toSubmit ] )
        for ftsFile in ftsFiles["Value"]:
          if ftsFile.FTSFileID not in retryIds:
            if ftsFile.Status in ( 'Failed', 'Canceled' ):
              # If the file was not unrecoverable failed and is not yet set toSubmit
              _reschedule, submit, _fail = self.__checkFailed( ftsFile )
            elif ftsFile.Status == 'Submitted':
              if ftsFile.FTSGUID not in [job.FTSGUID for job in ftsJobs]:
                # orphaned 'Submitted' file: its FTS job is gone, resubmit
                log.warn( 'FTS GUID %s not found in FTS jobs, resubmit file transfer' % ftsFile.FTSGUID )
                ftsFile.Status = 'Waiting'
                submit = True
              else:
                submit = False
            else:
              submit = True
            if submit:
              toSubmit.append( ftsFile )
              retryIds.add( ftsFile.FTSFileID )

      # # submit new ftsJobs
      if toSubmit:
        if request.Status != 'Scheduled':
          log.info( "Found %d FTSFiles to submit while request is no longer in Scheduled status (%s)" \
                    % ( len( toSubmit ), request.Status ) )
        else:
          self.__checkDuplicates( request.RequestName, toSubmit )
          log.info( "==> found %s FTSFiles to submit" % len( toSubmit ) )
          submit = self.__submit( request, operation, toSubmit )
          if not submit["OK"]:
            log.error( submit["Message"] )
          else:
            ftsJobs += submit["Value"]

      # # status change? - put back request
      if request.Status != "Scheduled":
        log.info( "request no longer in 'Scheduled' state (%s), will put it back to RMS" % request.Status )

    except escapeTry:
      # This clause is raised when one wants to return from within the try: clause
      pass
    except Exception, exceptMessage:
      log.exception( "Exception in processRequest", lException = exceptMessage )
    finally:
class RemovalAgent( AgentModule, RequestAgentMixIn ):
  """
  This Agent takes care of executing "removal" request from the RequestManagement system
  """

  def __init__( self, *args ):
    """ Initialize the base class and define some extra data members """
    AgentModule.__init__( self, *args )
    # RequestClient, set in initialize()
    self.requestDBClient = None
    # ReplicaManager, set in initialize()
    self.replicaManager = None
    # thread pool sizing defaults; overridable via agent options in initialize()
    self.maxNumberOfThreads = 4
    self.maxRequestsInQueue = 100
    self.threadPool = None
    # counts gfal timeouts seen while removing replicas (reported in execute())
    self.timeOutCounter = 0
    # flag cleared by executeRequest() when the DB has no more requests
    self.pendingRequests = True

  def initialize( self ):
    """ Called by the framework upon startup, before any cycle (execute method bellow) """
    self.requestDBClient = RequestClient()
    # the RequestAgentMixIn needs the capitalized version, until is is fixed keep this.
    self.RequestDBClient = self.requestDBClient
    self.replicaManager = ReplicaManager()

    gMonitor.registerActivity( "Iteration", "Agent Loops", "RemovalAgent", "Loops/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Execute", "Request Processed", "RemovalAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Done", "Request Completed", "RemovalAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "PhysicalRemovalAtt", "Physical removals attempted", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "PhysicalRemovalDone", "Successful physical removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "PhysicalRemovalFail", "Failed physical removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "PhysicalRemovalSize", "Physically removed size", "RemovalAgent", "Bytes", gMonitor.OP_ACUM )
    gMonitor.registerActivity( "ReplicaRemovalAtt", "Replica removal attempted", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "ReplicaRemovalDone", "Successful replica removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "ReplicaRemovalFail", "Failed replica removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RemoveFileAtt", "File removal attempted", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RemoveFileDone", "File removal done", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RemoveFileFail", "File removal failed", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )

    self.maxNumberOfThreads = self.am_getOption( 'NumberOfThreads', self.maxNumberOfThreads )
    self.maxRequestsInQueue = self.am_getOption( 'RequestsInQueue', self.maxRequestsInQueue )
    self.threadPool = ThreadPool( 1, self.maxNumberOfThreads, self.maxRequestsInQueue )

    # Set the ThreadPool in daemon mode to process new ThreadedJobs as they are inserted
    self.threadPool.daemonize()

    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    return S_OK()

  def execute( self ):
    """ Fill the TreadPool with ThreadJobs

    Keeps queueing executeRequest jobs (0.1 s apart) until either the queue is
    full or executeRequest() clears self.pendingRequests.
    """
    self.pendingRequests = True
    while self.pendingRequests:
      requestExecutor = ThreadedJob( self.executeRequest )
      ret = self.threadPool.queueJob( requestExecutor )
      if not ret['OK']:
        # queue full: stop producing for this cycle
        break
      time.sleep( 0.1 )

    if self.timeOutCounter:
      gLogger.error( 'Timeouts during removal execution:', self.timeOutCounter )

    return S_OK()

  def executeRequest( self ):
    """ Do the actual work in the Thread

    Fetches one 'removal' request from the RequestDB and walks its
    sub-requests, dispatching on the Operation type (physicalRemoval,
    removeFile, replicaRemoval, reTransfer). On any exception the original
    request string is written back unchanged so nothing is lost.
    """
    ################################################
    # Get a request from request DB
    gMonitor.addMark( "Iteration", 1 )
    res = self.requestDBClient.getRequest( 'removal' )
    if not res['OK']:
      gLogger.info( "RemovalAgent.execute: Failed to get request from database." )
      return S_OK()
    elif not res['Value']:
      gLogger.info( "RemovalAgent.execute: No requests to be executed found." )
      # tell execute() to stop queueing more jobs this cycle
      self.pendingRequests = False
      return S_OK()
    requestString = res['Value']['RequestString']
    requestName = res['Value']['RequestName']
    sourceServer = res['Value']['Server']
    try:
      jobID = int( res['Value']['JobID'] )
    except ValueError:
      jobID = 0
    gLogger.info( "RemovalAgent.execute: Obtained request %s" % requestName )

    try:

      result = self.requestDBClient.getCurrentExecutionOrder( requestName, sourceServer )
      if result['OK']:
        currentOrder = result['Value']
      else:
        gLogger.error( 'Can not get the request execution order' )
        # give the untouched request back before bailing out
        self.requestDBClient.updateRequest( requestName, requestString, sourceServer )
        return S_OK( 'Can not get the request execution order' )

      oRequest = RequestContainer( request = requestString )

      ################################################
      # Find the number of sub-requests from the request
      res = oRequest.getNumSubRequests( 'removal' )
      if not res['OK']:
        errStr = "RemovalAgent.execute: Failed to obtain number of removal subrequests."
        gLogger.error( errStr, res['Message'] )
        return S_OK()
      gLogger.info( "RemovalAgent.execute: Found %s sub requests." % res['Value'] )

      ################################################
      # For all the sub-requests in the request
      modified = False
      for ind in range( res['Value'] ):
        gMonitor.addMark( "Execute", 1 )
        gLogger.info( "RemovalAgent.execute: Processing sub-request %s." % ind )
        subRequestAttributes = oRequest.getSubRequestAttributes( ind, 'removal' )['Value']
        subExecutionOrder = int( subRequestAttributes['ExecutionOrder'] )
        subStatus = subRequestAttributes['Status']
        if subStatus == 'Waiting' and subExecutionOrder <= currentOrder:
          subRequestFiles = oRequest.getSubRequestFiles( ind, 'removal' )['Value']
          operation = subRequestAttributes['Operation']

          ################################################
          #  If the sub-request is a physical removal operation
          if operation == 'physicalRemoval':
            gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
            diracSEs = subRequestAttributes['TargetSE'].split( ',' )
            physicalFiles = []
            pfnToLfn = {}
            for subRequestFile in subRequestFiles:
              if subRequestFile['Status'] == 'Waiting':
                pfn = str( subRequestFile['PFN'] )
                lfn = str( subRequestFile['LFN'] )
                pfnToLfn[pfn] = lfn
                physicalFiles.append( pfn )
            gMonitor.addMark( 'PhysicalRemovalAtt', len( physicalFiles ) )
            # failed: { pfn : { SE : error } }; errMsg: { SE : bulk error }
            failed = {}
            errMsg = {}
            for diracSE in diracSEs:
              res = self.replicaManager.removeStorageFile( physicalFiles, diracSE )
              if res['OK']:
                for pfn in res['Value']['Failed'].keys():
                  if not failed.has_key( pfn ):
                    failed[pfn] = {}
                  failed[pfn][diracSE] = res['Value']['Failed'][pfn]
              else:
                # whole-SE failure: mark every pfn failed at this SE
                errMsg[diracSE] = res['Message']
                for pfn in physicalFiles:
                  if not failed.has_key( pfn ):
                    failed[pfn] = {}
                  failed[pfn][diracSE] = 'Completely'
            # Now analyse the results
            failedPFNs = failed.keys()
            pfnsOK = [pfn for pfn in physicalFiles if not pfn in failedPFNs]
            gMonitor.addMark( 'PhysicalRemovalDone', len( pfnsOK ) )
            for pfn in pfnsOK:
              gLogger.info( "RemovalAgent.execute: Successfully removed %s at %s" % ( pfn, str( diracSEs ) ) )
              res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', pfnToLfn[pfn], 'Status', 'Done' )
              if not res['OK']:
                gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', pfnToLfn[pfn] ) )
              modified = True
            if failed:
              gMonitor.addMark( 'PhysicalRemovalFail', len( failedPFNs ) )
              for pfn in failedPFNs:
                for diracSE in failed[pfn].keys():
                  if type( failed[pfn][diracSE] ) in StringTypes:
                    # "no such file or directory" means the file is already gone -> Done
                    if re.search( 'no such file or directory', failed[pfn][diracSE].lower() ):
                      gLogger.info( "RemovalAgent.execute: File did not exist.", pfn )
                      res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', pfnToLfn[pfn], 'Status', 'Done' )
                      if not res['OK']:
                        gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', pfnToLfn[pfn] ) )
                      modified = True
                    else:
                      gLogger.info( "RemovalAgent.execute: Failed to remove file.", "%s at %s - %s" % ( pfn, diracSE, failed[pfn][diracSE] ) )
            if errMsg:
              for diracSE in errMsg.keys():
                errStr = "RemovalAgent.execute: Completely failed to remove replicas. At %s", diracSE
                gLogger.error( errStr, errMsg[diracSE] )

          ################################################
          #  If the sub-request is a file removal operation
          elif operation == 'removeFile':
            gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
            lfns = []
            for subRequestFile in subRequestFiles:
              if subRequestFile['Status'] == 'Waiting':
                lfn = str( subRequestFile['LFN'] )
                lfns.append( lfn )
            gMonitor.addMark( 'RemoveFileAtt', len( lfns ) )
            res = self.replicaManager.removeFile( lfns )
            if res['OK']:
              gMonitor.addMark( 'RemoveFileDone', len( res['Value']['Successful'].keys() ) )
              for lfn in res['Value']['Successful'].keys():
                gLogger.info( "RemovalAgent.execute: Successfully removed %s." % lfn )
                result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
                if not result['OK']:
                  gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
                modified = True
              gMonitor.addMark( 'RemoveFileFail', len( res['Value']['Failed'].keys() ) )
              for lfn in res['Value']['Failed'].keys():
                if type( res['Value']['Failed'][lfn] ) in StringTypes:
                  # already-absent files count as successfully removed
                  if re.search( 'no such file or directory', res['Value']['Failed'][lfn].lower() ):
                    gLogger.info( "RemovalAgent.execute: File did not exist.", lfn )
                    result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
                    if not result['OK']:
                      gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
                    modified = True
                  else:
                    gLogger.info( "RemovalAgent.execute: Failed to remove file:", "%s %s" % ( lfn, res['Value']['Failed'][lfn] ) )
            else:
              gMonitor.addMark( 'RemoveFileFail', len( lfns ) )
              errStr = "RemovalAgent.execute: Completely failed to remove files files."
              gLogger.error( errStr, res['Message'] )

          ################################################
          #  If the sub-request is a replica removal operation
          elif operation == 'replicaRemoval':
            gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
            diracSEs = subRequestAttributes['TargetSE'].split( ',' )
            lfns = []
            for subRequestFile in subRequestFiles:
              if subRequestFile['Status'] == 'Waiting':
                lfn = str( subRequestFile['LFN'] )
                lfns.append( lfn )
            gMonitor.addMark( 'ReplicaRemovalAtt', len( lfns ) )
            # failed: { lfn : { SE : error } }; errMsg: { SE : bulk error }
            failed = {}
            errMsg = {}
            for diracSE in diracSEs:
              res = self.replicaManager.removeReplica( diracSE, lfns )
              if res['OK']:
                for lfn in res['Value']['Failed'].keys():
                  errorMessage = str( res['Value']['Failed'][lfn] )
                  if errorMessage.find( 'Write access not permitted for this credential.' ) != -1:
                    # retry with a proxy belonging to the file owner
                    if self.__getProxyAndRemoveReplica( diracSE, lfn ):
                      continue
                  if errorMessage.find( 'seconds timeout for "__gfal_wrapper" call' ) != -1:
                    self.timeOutCounter += 1
                  if not failed.has_key( lfn ):
                    failed[lfn] = {}
                  failed[lfn][diracSE] = res['Value']['Failed'][lfn]
              else:
                errMsg[diracSE] = res['Message']
                for lfn in lfns:
                  if not failed.has_key( lfn ):
                    failed[lfn] = {}
                  failed[lfn][diracSE] = 'Completely'
            # Now analyse the results
            failedLFNs = failed.keys()
            lfnsOK = [lfn for lfn in lfns if not lfn in failedLFNs]
            gMonitor.addMark( 'ReplicaRemovalDone', len( lfnsOK ) )
            for lfn in lfnsOK:
              gLogger.info( "RemovalAgent.execute: Successfully removed %s at %s" % ( lfn, str( diracSEs ) ) )
              res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
              if not res['OK']:
                gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
              modified = True
            if failed:
              # NOTE(review): marks 'PhysicalRemovalFail' although this is the
              # replicaRemoval branch ('ReplicaRemovalFail' exists) -- confirm intent.
              gMonitor.addMark( 'PhysicalRemovalFail', len( failedLFNs ) )
              for lfn in failedLFNs:
                for diracSE in failed[lfn].keys():
                  if type( failed[lfn][diracSE] ) in StringTypes:
                    if re.search( 'no such file or directory', failed[lfn][diracSE].lower() ):
                      gLogger.info( "RemovalAgent.execute: File did not exist.", lfn )
                      res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
                      if not res['OK']:
                        gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
                      modified = True
                    else:
                      gLogger.info( "RemovalAgent.execute: Failed to remove file.", "%s at %s - %s" % ( lfn, diracSE, failed[lfn][diracSE] ) )
            if errMsg:
              for diracSE in errMsg.keys():
                errStr = "RemovalAgent.execute: Completely failed to remove replicas. At %s", diracSE
                gLogger.error( errStr, errMsg[diracSE] )

          ################################################
          #  If the sub-request is a request to the online system to retransfer
          elif operation == 'reTransfer':
            gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
            diracSE = subRequestAttributes['TargetSE']
            for subRequestFile in subRequestFiles:
              if subRequestFile['Status'] == 'Waiting':
                pfn = str( subRequestFile['PFN'] )
                lfn = str( subRequestFile['LFN'] )
                res = self.replicaManager.onlineRetransfer( diracSE, pfn )
                if res['OK']:
                  if res['Value']['Successful'].has_key( pfn ):
                    gLogger.info( "RemovalAgent.execute: Successfully requested retransfer of %s." % pfn )
                    result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
                    if not result['OK']:
                      gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
                    modified = True
                  else:
                    errStr = "RemovalAgent.execute: Failed to request retransfer."
                    gLogger.error( errStr, "%s %s %s" % ( pfn, diracSE, res['Value']['Failed'][pfn] ) )
                else:
                  errStr = "RemovalAgent.execute: Completely failed to request retransfer."
                  gLogger.error( errStr, res['Message'] )
              else:
                gLogger.info( "RemovalAgent.execute: File already completed." )

          ################################################
          #  If the sub-request is none of the above types
          else:
            gLogger.error( "RemovalAgent.execute: Operation not supported.", operation )

          ################################################
          #  Determine whether there are any active files
          if oRequest.isSubRequestEmpty( ind, 'removal' )['Value']:
            oRequest.setSubRequestStatus( ind, 'removal', 'Done' )
            gMonitor.addMark( "Done", 1 )

        ################################################
        #  If the sub-request is already in terminal state
        else:
          gLogger.info( "RemovalAgent.execute:",
                        "Sub-request %s is status '%s' and not to be executed." % ( ind, subRequestAttributes['Status'] ) )

      ################################################
      #  Generate the new request string after operation
      newrequestString = oRequest.toXML()['Value']
    except:
      # if something fails return the original request back to the server
      res = self.requestDBClient.updateRequest( requestName, requestString, sourceServer )
      return S_OK()

    res = self.requestDBClient.updateRequest( requestName, newrequestString, sourceServer )

    if modified and jobID:
      result = self.finalizeRequest( requestName, jobID, sourceServer )

    return S_OK()

  def __getProxyAndRemoveReplica( self, diracSE, lfn ):
    """
    get a proxy from the owner of the file and try to remove it
    returns True if it succeeds, False otherwise

    NOTE(review): this method is truncated at the end of this file chunk.
    """
    result = self.replicaManager.getCatalogDirectoryMetadata( lfn, singleFile = True )
    if not result[ 'OK' ]:
      gLogger.error( "Could not get metadata info", result[ 'Message' ] )
      return False
    ownerRole = result[ 'Value' ][ 'OwnerRole' ]
    ownerDN = result[ 'Value' ][ 'OwnerDN' ]
    # normalize the VOMS role to an absolute path form
    if ownerRole[0] != "/":
      ownerRole = "/%s" % ownerRole

    userProxy = ''
    for ownerGroup in Registry.getGroupsWithVOMSAttribute( ownerRole ):
      result = gProxyManager.downloadVOMSProxy( ownerDN, ownerGroup, limited = True,
                                                requiredVOMSAttribute = ownerRole )
      if not result[ 'OK' ]:
        gLogger.verbose ( 'Failed to retrieve voms proxy for %s : %s:' % ( ownerDN, ownerRole ),
                          result[ 'Message' ]
) continue userProxy = result[ 'Value' ] gLogger.verbose( "Got proxy for %s@%s [%s]" % ( ownerDN, ownerGroup, ownerRole ) ) break if not userProxy: return False result = userProxy.dumpAllToFile() if not result[ 'OK' ]: gLogger.verbose( result[ 'Message' ] ) return False upFile = result[ 'Value' ] prevProxyEnv = os.environ[ 'X509_USER_PROXY' ] os.environ[ 'X509_USER_PROXY' ] = upFile try: res = self.replicaManager.removeReplica( diracSE, lfn ) if res['OK'] and lfn in res[ 'Value' ]['Successful']: gLogger.verbose( 'Removed %s from %s' % ( lfn, diracSE ) ) return True finally: os.environ[ 'X509_USER_PROXY' ] = prevProxyEnv os.unlink( upFile ) return False def finalize( self ): """ Called by the Agent framework to cleanly end execution. In this case this module will wait until all pending ThreadedJbos in the ThreadPool get executed """ self.threadPool.processAllResults() return S_OK()
class OutputDataExecutor:
  """ Transfer files found in configured input locations to an output SE and
      register them in the output file catalog.

      Transfer path definitions are read from the CS under ``csPath`` (by
      default ``/Operations/<vo>/OutputData``); each path section must define
      InputPath, InputFC, OutputPath, OutputFC and OutputSE. Transfers run
      asynchronously through a daemonized ThreadPool.
  """

  def __init__( self, csPath = "" ):
    """ :param str csPath: CS path holding the transfer path definitions
                           (empty -> /Operations/<vo>/OutputData)
    """
    self.log = gLogger.getSubLogger( "OutputDataExecutor" )
    if not csPath:
      vo = gConfig.getValue( "/DIRAC/VirtualOrganization", "" )
      self.__transfersCSPath = '/Operations/%s/OutputData' % vo
    else:
      self.__transfersCSPath = csPath
    self.log.verbose( "Reading transfer paths from %s" % self.__transfersCSPath )
    self.__requiredCSOptions = ['InputPath', 'InputFC', 'OutputPath', 'OutputFC', 'OutputSE']
    self.__threadPool = ThreadPool( gConfig.getValue( "%s/MinTransfers" % self.__transfersCSPath, 1 ),
                                    gConfig.getValue( "%s/MaxTransfers" % self.__transfersCSPath, 4 ),
                                    gConfig.getValue( "%s/MaxQueuedTransfers" % self.__transfersCSPath, 100 ) )
    self.__threadPool.daemonize()
    # files currently queued or being transferred (basenames)
    self.__processingFiles = set()
    self.__okTransferredFiles = 0
    self.__okTransferredBytes = 0
    # basename -> number of consecutive failures
    self.__failedFiles = {}

  def getNumOKTransferredFiles( self ):
    """ Number of files successfully transferred so far. """
    return self.__okTransferredFiles

  def getNumOKTransferredBytes( self ):
    """ Number of bytes successfully transferred so far. """
    return self.__okTransferredBytes

  def transfersPending( self ):
    """ True while the thread pool still has work queued or running. """
    return self.__threadPool.isWorking()

  def getDefinedTransferPaths( self ):
    """ Read and validate the transfer path definitions from the CS.

        :return: S_OK( dict ) mapping path name -> transfer option dict
    """
    result = gConfig.getSections( self.__transfersCSPath )
    if not result['OK']:
      self.log.info( 'No Input/Output Pair defined in CS' )
      # FIX: used to return S_OK() (Value = None), which crashed callers that
      # iterate over result['Value']; return an empty dict instead
      return S_OK( {} )
    tPaths = {}
    for name in result['Value']:
      csPath = '%s/%s' % ( self.__transfersCSPath, name )
      result = gConfig.getOptionsDict( csPath )
      if not result['OK']:
        continue
      transferDict = result['Value']
      # skip sections missing any mandatory option
      missing = [opt for opt in self.__requiredCSOptions if opt not in transferDict]
      if missing:
        for opt in missing:
          self.log.error( 'Missing Option %s in %s' % ( opt, csPath ) )
        continue
      tPaths[name] = transferDict
    return S_OK( tPaths )

  def getNumLocalOutgoingFiles( self ):
    """ Count files waiting on local-disk input paths. """
    result = self.getDefinedTransferPaths()
    if not result['OK']:
      return 0
    localOutgoing = 0
    for transferDict in result['Value'].values():
      if transferDict['InputFC'] == 'LocalDisk':
        localOutgoing += len( self.getOutgoingFiles( transferDict ) )
    return localOutgoing

  def getOutgoingFiles( self, transferDict ):
    """ Get list of files to be processed from InputPath.

        :param dict transferDict: one transfer path definition
        :return: list of file names (basenames for LocalDisk, LFNs otherwise)
    """
    inputFCName = transferDict['InputFC']
    inputPath = transferDict['InputPath']
    if inputFCName == 'LocalDisk':
      files = []
      try:
        for fileName in os.listdir( inputPath ):
          if os.path.isfile( os.path.join( inputPath, fileName ) ):
            files.append( fileName )
      except OSError:
        # FIX: was a bare 'except:'; the input dir may legitimately not exist
        # yet, which simply means there is nothing to transfer
        pass
      return files
    inputFC = FileCatalog( [inputFCName] )
    result = inputFC.listDirectory( inputPath, True )
    if not result['OK']:
      self.log.error( result['Message'] )
      return []
    if inputPath not in result['Value']['Successful']:
      self.log.error( result['Value']['Failed'][inputPath] )
      return []
    dirContents = result['Value']['Successful'][inputPath]
    for subDir in dirContents['SubDirs']:
      self.log.info( 'Ignoring subdirectory:', subDir )
    return dirContents['Files'].keys()

  def checkForTransfers( self ):
    """ Check for transfers to do and start them. """
    result = self.getDefinedTransferPaths()
    if not result['OK']:
      return result
    tPaths = result['Value']
    for name in tPaths:
      transferPath = tPaths[name]
      self.log.verbose( "Checking %s transfer path" % name )
      filesToTransfer = self.getOutgoingFiles( transferPath )
      self.log.info( "Transfer path %s has %d files" % ( name, len( filesToTransfer ) ) )
      ret = self.__addFilesToThreadPool( filesToTransfer, transferPath )
      if not ret['OK']:
        # The thread pool got full
        break

  def processAllPendingTransfers( self ):
    """ Block until every queued transfer has been processed. """
    self.__threadPool.processAllResults()

  @transferSync
  def __addFilesToThreadPool( self, files, transferDict ):
    """ Queue one transfer job per file not already being processed.

        :return: S_ERROR as soon as the thread pool refuses a job, S_OK otherwise
    """
    for fileName in files:
      fileName = os.path.basename( fileName )
      if fileName in self.__processingFiles:
        continue
      self.__processingFiles.add( fileName )
      # small pause between submissions to avoid hammering the pool
      time.sleep( 1 )
      ret = self.__threadPool.generateJobAndQueueIt( self.__transferIfNotRegistered,
                                                     args = ( fileName, transferDict ),
                                                     oCallback = self.transferCallback,
                                                     blocking = False )
      if not ret['OK']:
        # The thread pool got full
        return ret
    return S_OK()

  def __transferIfNotRegistered( self, file, transferDict ):
    """ Transfer :file: unless it is already registered in the output catalog,
        in which case delete the input copy instead.
    """
    result = self.isRegisteredInOutputCatalog( file, transferDict )
    if not result['OK']:
      self.log.error( result['Message'] )
      return result
    # Already registered. Need to delete the input copy
    if result['Value']:
      self.log.info( "Transfer file %s is already registered in the output catalog" % file )
      filePath = os.path.join( transferDict['InputPath'], file )
      if transferDict['InputFC'] == 'LocalDisk':
        os.unlink( filePath )
      else:
        # FIX: this branch referenced an undefined name 'inFile' (see the old
        # FIXME); the input LFN is filePath = InputPath + file
        inputFC = FileCatalog( [ transferDict['InputFC'] ] )
        replicaDict = inputFC.getReplicas( filePath )
        if not replicaDict['OK']:
          self.log.error( "Error deleting file", replicaDict['Message'] )
        elif filePath not in replicaDict['Value']['Successful']:
          self.log.error( "Error deleting file", replicaDict['Value']['Failed'][filePath] )
        else:
          for seName in replicaDict['Value']['Successful'][filePath].keys():
            se = StorageElement( seName )
            self.log.info( 'Removing from %s:' % se.name, filePath )
            se.removeFile( filePath )
          inputFC.removeFile( filePath )
      self.log.info( "File %s deleted from %s" % ( file, transferDict['InputFC'] ) )
      self.__processingFiles.discard( file )
      return S_OK( file )
    # Do the transfer
    return self.__retrieveAndUploadFile( file, transferDict )

  def isRegisteredInOutputCatalog( self, file, transferDict ):
    """ Check whether :file: already has a replica at one of the output SEs.

        :return: S_OK( bool )
    """
    fc = FileCatalog( [ transferDict['OutputFC'] ] )
    lfn = os.path.join( transferDict['OutputPath'], os.path.basename( file ) )
    result = fc.getReplicas( lfn )
    if not result['OK']:
      return result
    if lfn not in result['Value']['Successful']:
      return S_OK( False )
    replicas = result['Value']['Successful'][lfn]
    for seName in List.fromChar( transferDict['OutputSE'], "," ):
      if seName in replicas:
        self.log.verbose( "Transfer file %s is already registered in %s SE" % ( file, seName ) )
        return S_OK( True )
    return S_OK( False )

  def __retrieveAndUploadFile( self, file, outputDict ):
    """ Retrieve, Upload, and remove.

        :return: S_OK( fileName ) on success, S_ERROR( fileName ) on failure
    """
    fileName = file
    inputPath = outputDict['InputPath']
    inputFCName = outputDict['InputFC']
    inBytes = 0
    if inputFCName == 'LocalDisk':
      inFile = file
      file = os.path.join( inputPath, file )
    else:
      inputFC = FileCatalog( [inputFCName] )
      inFile = os.path.join( inputPath, file )
      replicaDict = inputFC.getReplicas( inFile )
      if not replicaDict['OK']:
        self.log.error( replicaDict['Message'] )
        return S_ERROR( fileName )
      if inFile not in replicaDict['Value']['Successful']:
        self.log.error( replicaDict['Value']['Failed'][inFile] )
        return S_ERROR( fileName )
      seList = replicaDict['Value']['Successful'][inFile].keys()
      inputSE = StorageElement( seList[0] )
      self.log.info( 'Retrieving from %s:' % inputSE.name, inFile )
      # lcg_util binding prevents multithreading, use a subprocess instead
      res = pythonCall( 2 * 3600, inputSE.getFile, inFile )
      if not res['OK']:
        self.log.error( res['Message'] )
        return S_ERROR( fileName )
      ret = res['Value']
      if not ret['OK']:
        self.log.error( ret['Message'] )
        return S_ERROR( fileName )
      if inFile not in ret['Value']['Successful']:
        self.log.error( ret['Value']['Failed'][inFile] )
        return S_ERROR( fileName )
    if os.path.isfile( file ):
      inBytes = os.stat( file ).st_size
    outputPath = outputDict['OutputPath']
    outputFCName = outputDict['OutputFC']
    replicaManager = ReplicaManager()
    outFile = os.path.join( outputPath, os.path.basename( file ) )
    transferOK = False
    for outputSEName in List.fromChar( outputDict['OutputSE'], "," ):
      outputSE = StorageElement( outputSEName )
      self.log.info( 'Trying to upload to %s:' % outputSE.name, outFile )
      # lcg_util binding prevents multithreading, use a subprocess instead
      result = pythonCall( 2 * 3600, replicaManager.putAndRegister,
                           outFile, os.path.realpath( file ), outputSE.name,
                           catalog = outputFCName )
      if result['OK'] and result['Value']['OK']:
        if outFile in result['Value']['Value']['Successful']:
          transferOK = True
          break
        else:
          self.log.error( result['Value']['Value']['Failed'][outFile] )
      elif result['OK']:
        self.log.error( result['Value']['Message'] )
      else:
        self.log.error( result['Message'] )
    if not transferOK:
      return S_ERROR( fileName )
    # transferOK implies the last putAndRegister succeeded, so the local copy
    # can always be removed here (the old 'if not result["OK"]' branch after
    # this point was unreachable and referenced the wrong variable 'ret')
    os.unlink( file )
    self.log.info( "Finished transferring %s [%s bytes]" % ( inFile, inBytes ) )
    self.__okTransferredFiles += 1
    self.__okTransferredBytes += inBytes
    if inputFCName == 'LocalDisk':
      return S_OK( fileName )
    # Now the file is on final SE/FC, remove from input SE/FC
    for seName in seList:
      se = StorageElement( seName )
      self.log.info( 'Removing from %s:' % se.name, inFile )
      se.removeFile( inFile )
    inputFC.removeFile( inFile )
    return S_OK( fileName )

  @transferSync
  def transferCallback( self, threadedJob, submitResult ):
    """ ThreadPool callback: bookkeeping of failed / finished transfers. """
    if not submitResult['OK']:
      fileName = submitResult['Message']
      self.__failedFiles[fileName] = self.__failedFiles.get( fileName, 0 ) + 1
    else:
      fileName = submitResult['Value']
      self.__failedFiles.pop( fileName, None )
    # Take out from processing files
    self.__processingFiles.discard( fileName )
class FTSAgent( AgentModule ): """ .. class:: FTSAgent Agent propagating Scheduled request to Done or Failed state in the FTS system. Requests and associated FTSJobs (and so FTSFiles) are kept in cache. """ # # fts graph refresh in seconds FTSGRAPH_REFRESH = FTSHistoryView.INTERVAL / 2 # # SE R/W access refresh in seconds RW_REFRESH = 600 # # placeholder for max job per channel MAX_ACTIVE_JOBS = 50 # # min threads MIN_THREADS = 1 # # max threads MAX_THREADS = 10 # # files per job MAX_FILES_PER_JOB = 100 # # MAX FTS transfer per FTSFile MAX_ATTEMPT = 256 # # stage flag STAGE_FILES = False # # replica manager __replicaManager = None # # placeholder for FTS client __ftsClient = None # # placeholder for request client __requestClient = None # # placeholder for resources helper __resources = None # # placeholder for RSS client __rssClient = None # # placeholder for FTSGraph __ftsGraph = None # # graph regeneration time delta __ftsGraphValidStamp = None # # r/w access valid stamp __rwAccessValidStamp = None # # placeholder for threadPool __threadPool = None # # update lock __updateLock = None # # se cache __seCache = dict() # # request cache __reqCache = dict() def updateLock( self ): """ update lock """ if not self.__updateLock: self.__updateLock = LockRing().getLock( "FTSAgentLock" ) return self.__updateLock @classmethod def requestClient( cls ): """ request client getter """ if not cls.__requestClient: cls.__requestClient = ReqClient() return cls.__requestClient @classmethod def ftsClient( cls ): """ FTS client """ if not cls.__ftsClient: cls.__ftsClient = FTSClient() return cls.__ftsClient @classmethod def replicaManager( cls ): """ replica manager getter """ if not cls.__replicaManager: cls.__replicaManager = ReplicaManager() return cls.__replicaManager @classmethod def rssClient( cls ): """ RSS client getter """ if not cls.__rssClient: cls.__rssClient = ResourceStatus() return cls.__rssClient @classmethod def getSE( cls, seName ): """ keep SEs in cache """ if 
seName not in cls.__seCache: cls.__seCache[seName] = StorageElement( seName ) return cls.__seCache[seName] @classmethod def getRequest( cls, reqName ): """ keep Requests in cache """ if reqName not in cls.__reqCache: getRequest = cls.requestClient().getRequest( reqName ) if not getRequest["OK"]: return getRequest getRequest = getRequest["Value"] if not getRequest: return S_ERROR( "request of name '%s' not found in ReqDB" % reqName ) cls.__reqCache[reqName] = getRequest return S_OK( cls.__reqCache[reqName] ) @classmethod def putRequest( cls, request ): """ put request back to ReqDB :param Request request: Request instance also finalize request if status == Done """ # # put back request put = cls.requestClient().putRequest( request ) if not put["OK"]: return put # # finalize first is possible if request.Status == "Done" and request.JobID: finalizeRequest = cls.requestClient().finalizeRequest( request.RequestName, request.JobID ) if not finalizeRequest["OK"]: request.Status = "Scheduled" # # del request from cache if request.RequestName in cls.__reqCache: del cls.__reqCache[ request.RequestName ] return S_OK() @classmethod def putFTSJobs( cls, ftsJobsList ): """ put back fts jobs to the FTSDB """ for ftsJob in ftsJobsList: put = cls.ftsClient().putFTSJob( ftsJob ) if not put["OK"]: return put return S_OK() @staticmethod def updateFTSFileDict( ftsFilesDict, toUpdateDict ): """ update :ftsFilesDict: with FTSFiles in :toUpdateDict: """ for category, ftsFileList in ftsFilesDict.items(): for ftsFile in toUpdateDict.get( category, [] ): if ftsFile not in ftsFileList: ftsFileList.append( ftsFile ) return ftsFilesDict # def resources( self ): # """ resource helper getter """ # if not self.__resources: # self.__resources = Resources() # return self.__resources def threadPool( self ): """ thread pool getter """ if not self.__threadPool: self.__threadPool = ThreadPool( self.MIN_THREADS, self.MAX_THREADS ) self.__threadPool.daemonize() return self.__threadPool def resetFTSGraph( 
self ): """ create fts graph """ log = gLogger.getSubLogger( "ftsGraph" ) ftsHistory = self.ftsClient().getFTSHistory() if not ftsHistory["OK"]: log.error( "unable to get FTS history: %s" % ftsHistory["Message"] ) return ftsHistory ftsHistory = ftsHistory["Value"] try: self.updateLock().acquire() self.__ftsGraph = FTSGraph( "FTSGraph", ftsHistory ) finally: self.updateLock().release() log.debug( "FTSSites: %s" % len( self.__ftsGraph.nodes() ) ) for i, site in enumerate( self.__ftsGraph.nodes() ): log.debug( " [%02d] FTSSite: %-25s FTSServer: %s" % ( i, site.name, site.FTSServer ) ) log.debug( "FTSRoutes: %s" % len( self.__ftsGraph.edges() ) ) for i, route in enumerate( self.__ftsGraph.edges() ): log.debug( " [%02d] FTSRoute: %-25s Active FTSJobs (Max) = %s (%s)" % ( i, route.routeName, route.ActiveJobs, route.toNode.MaxActiveJobs ) ) # # save graph stamp self.__ftsGraphValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH ) # # refresh SE R/W access try: self.updateLock().acquire() self.__ftsGraph.updateRWAccess() finally: self.updateLock().release() # # save rw access stamp self.__rwAccessValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.RW_REFRESH ) return S_OK() def initialize( self ): """ agent's initialization """ log = self.log.getSubLogger( "initialize" ) self.FTSGRAPH_REFRESH = self.am_getOption( "FTSGraphValidityPeriod", self.FTSGRAPH_REFRESH ) log.info( "FTSGraph validity period = %s s" % self.FTSGRAPH_REFRESH ) self.RW_REFRESH = self.am_getOption( "RWAccessValidityPeriod", self.RW_REFRESH ) log.info( "SEs R/W access validity period = %s s" % self.RW_REFRESH ) self.STAGE_FILES = self.am_getOption( "StageFiles", self.STAGE_FILES ) log.info( "Stage files before submission = %s" % {True: "yes", False: "no"}[bool( self.STAGE_FILES )] ) self.MAX_ACTIVE_JOBS = self.am_getOption( "MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS ) log.info( "Max active FTSJobs/route = %s" % self.MAX_ACTIVE_JOBS ) 
self.MAX_FILES_PER_JOB = self.am_getOption( "MaxFilesPerJob", self.MAX_FILES_PER_JOB ) log.info( "Max FTSFiles/FTSJob = %d" % self.MAX_FILES_PER_JOB ) self.MAX_ATTEMPT = self.am_getOption( "MaxTransferAttempts", self.MAX_ATTEMPT ) log.info( "Max transfer attempts = %s" % self.MAX_ATTEMPT ) # # thread pool self.MIN_THREADS = self.am_getOption( "MinThreads", self.MIN_THREADS ) self.MAX_THREADS = self.am_getOption( "MaxThreads", self.MAX_THREADS ) minmax = ( abs( self.MIN_THREADS ), abs( self.MAX_THREADS ) ) self.MIN_THREADS, self.MAX_THREADS = min( minmax ), max( minmax ) log.info( "ThreadPool min threads = %s" % self.MIN_THREADS ) log.info( "ThreadPool max threads = %s" % self.MAX_THREADS ) log.info( "initialize: creation of FTSGraph..." ) createGraph = self.resetFTSGraph() if not createGraph["OK"]: log.error( "initialize: %s" % createGraph["Message"] ) return createGraph # This sets the Default Proxy to used as that defined under # /Operations/Shifter/DataManager # the shifterProxy option in the Configuration can be used to change this default. 
self.am_setOption( 'shifterProxy', 'DataManager' ) log.info( "will use DataManager proxy" ) # # gMonitor stuff here gMonitor.registerActivity( "RequestsAtt", "Attempted requests executions", "FTSAgent", "Requests/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "RequestsOK", "Successful requests executions", "FTSAgent", "Requests/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "RequestsFail", "Failed requests executions", "FTSAgent", "Requests/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsSubAtt", "FTSJobs creation attempts", "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsSubOK", "FTSJobs submitted successfully", "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsSubFail", "FTSJobs submissions failed", "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsMonAtt", "FTSJobs monitored", "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsMonOK", "FTSJobs monitored successfully", "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSJobsMonFail", "FTSJobs attempts failed", "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM ) gMonitor.registerActivity( "FTSMonitorFail", "Failed FTS monitor executions", "FTSAgent", "Execution/mins", gMonitor.OP_SUM ) pollingTime = self.am_getOption( "PollingTime", 60 ) for status in list( FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES ): gMonitor.registerActivity( "FTSJobs%s" % status, "FTSJobs %s" % status , "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime ) gMonitor.registerActivity( "FtSJobsPerRequest", "Average FTSJobs per request", "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN ) gMonitor.registerActivity( "FTSFilesPerJob", "FTSFiles per FTSJob", "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN ) gMonitor.registerActivity( "FTSSizePerJob", "Average FTSFiles size per FTSJob", 
"FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN ) return S_OK() def finalize( self ): """ finalize processing """ log = self.log.getSubLogger( "finalize" ) for request in self.__reqCache.values(): put = self.requestClient().putRequest( request ) if not put["OK"]: log.error( "unable to put back request '%s': %s" % ( request.RequestName, put["Message"] ) ) return S_OK() def execute( self ): """ one cycle execution """ log = gLogger.getSubLogger( "execute" ) # # reset FTSGraph if expired now = datetime.datetime.now() if now > self.__ftsGraphValidStamp: log.info( "resetting expired FTS graph..." ) resetFTSGraph = self.resetFTSGraph() if not resetFTSGraph["OK"]: log.error( "FTSGraph recreation error: %s" % resetFTSGraph["Message"] ) return resetFTSGraph self.__ftsGraphValidStamp = now + datetime.timedelta( seconds = self.FTSGRAPH_REFRESH ) # # update R/W access in FTSGraph if expired if now > self.__rwAccessValidStamp: log.info( "updating expired R/W access for SEs..." ) try: self.updateLock().acquire() self.__ftsGraph.updateRWAccess() finally: self.updateLock().release() self.__rwAccessValidStamp = now + datetime.timedelta( seconds = self.RW_REFRESH ) requestNames = self.requestClient().getRequestNamesList( [ "Scheduled" ] ) if not requestNames["OK"]: log.error( "unable to read scheduled request names: %s" % requestNames["Message"] ) return requestNames if not requestNames["Value"]: requestNames = self.__reqCache.keys() else: requestNames = [ req[0] for req in requestNames["Value"] ] requestNames = list( set ( requestNames + self.__reqCache.keys() ) ) if not requestNames: log.info( "no 'Scheduled' requests to process" ) return S_OK() log.info( "found %s requests to process:" % len( requestNames ) ) log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) ) log.info( " => new read from RMS: %s" % ( len( requestNames ) - len( self.__reqCache ) ) ) for requestName in requestNames: request = self.getRequest( requestName ) if not request["OK"]: 
log.error( request["Message"] ) continue request = request["Value"] sTJId = request.RequestName while True: queue = self.threadPool().generateJobAndQueueIt( self.processRequest, args = ( request, ), sTJId = sTJId ) if queue["OK"]: log.info( "request '%s' enqueued for execution" % sTJId ) gMonitor.addMark( "RequestsAtt", 1 ) break time.sleep( 1 ) # # process all results self.threadPool().processAllResults() return S_OK() def processRequest( self, request ): """ process one request :param Request request: ReqDB.Request """ log = self.log.getSubLogger( request.RequestName ) operation = request.getWaiting() if not operation["OK"]: log.error( "unable to find 'Scheduled' ReplicateAndRegister operation in request" ) return self.putRequest( request ) operation = operation["Value"] if operation.Type != "ReplicateAndRegister": log.error( "operation to be executed is not a ReplicateAndRegister but %s" % operation.Type ) return self.putRequest( request ) if operation.Status != "Scheduled": log.error( "operation in a wrong state, expecting 'Scheduled', got %s" % operation.Status ) return self.putRequest( request ) # # select FTSJobs, by default all in TRANS_STATES and INIT_STATES ftsJobs = self.ftsClient().getFTSJobsForRequest( request.RequestID ) if not ftsJobs["OK"]: log.error( ftsJobs["Message"] ) return ftsJobs ftsJobs = ftsJobs["Value"] if ftsJobs["Value"] else [] # # dict keeping info about files to reschedule, submit, fail and register ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] ) if ftsJobs: log.info( "==> found %s FTSJobs to monitor" % len( ftsJobs ) ) # # PHASE 0 = monitor active FTSJobs for ftsJob in ftsJobs: monitor = self.__monitorJob( request, ftsJob ) if not monitor["OK"]: log.error( "unable to monitor FTSJob %s: %s" % ( ftsJob.FTSJobID, monitor["Message"] ) ) ftsJob.Status = "Submitted" continue ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, monitor["Value"] ) log.info( "monitoring of 
FTSJobs completed" ) for key, ftsFiles in ftsFilesDict.items(): if ftsFiles: log.debug( " => %s FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) ) # # PHASE ONE - check ready replicas missingReplicas = self.__checkReadyReplicas( request, operation ) if not missingReplicas["OK"]: log.error( missingReplicas["Message"] ) else: missingReplicas = missingReplicas["Value"] for opFile in operation: # Actually the condition below should never happen... Change printout for checking if opFile.LFN not in missingReplicas and opFile.Status != 'Done': log.warn( "Should be set! %s is replicated at all targets" % opFile.LFN ) opFile.Status = "Done" toFail = ftsFilesDict.get( "toFail", [] ) toReschedule = ftsFilesDict.get( "toReschedule", [] ) toSubmit = ftsFilesDict.get( "toSubmit", [] ) toRegister = ftsFilesDict.get( "toRegister", [] ) toUpdate = ftsFilesDict.get( "toUpdate", [] ) # # PHASE TWO = Failed files? -> make request Failed and return if toFail: log.error( "==> found %s 'Failed' FTSFiles, request execution cannot proceed..." % len( toFail ) ) for opFile in operation: for ftsFile in toFail: if opFile.FileID == ftsFile.FileID: opFile.Error = ftsFile.Error opFile.Status = "Failed" operation.Error = "%s files are missing any replicas" % len( toFail ) # # requets.Status should be Failed at this stage "Failed" if request.Status == "Failed": request.Error = "ReplicateAndRegister %s failed" % operation.Order log.error( "request is set to 'Failed'" ) return self.putRequest( request ) # # PHASE THREE - update Waiting#SourceSE FTSFiles if toUpdate: log.info( "==> found %s possible FTSFiles to update..." 
% ( len( toUpdate ) ) )
      # NOTE(review): this chunk continues a request-processing method whose head is
      # above this span; the "% ( len( toUpdate ) ) )" closes a log.info call opened there.
      # Group FTSFiles to update by their target SE, then flag them 'Waiting' in FTSDB.
      byTarget = {}
      for ftsFile in toUpdate:
        if ftsFile.TargetSE not in byTarget:
          byTarget.setdefault( ftsFile.TargetSE, [] )
        byTarget[ftsFile.TargetSE].append( ftsFile.FileID )
      for targetSE, fileIDList in byTarget.items():
        update = self.ftsClient().setFTSFilesWaiting( operation.OperationID, targetSE, fileIDList )
        if not update["OK"]:
          log.error( "update FTSFiles failed: %s" % update["Message"] )
          continue
    # # PHASE FOUR - add 'RegisterReplica' Operations
    if toRegister:
      # NOTE(review): format string has a %s but no % args - the literal "%s" is logged; confirm
      log.info( "==> found %s Files waiting for registration, adding 'RegisterReplica' operations" )
      registerFiles = self.__register( request, operation, toRegister )
      if not registerFiles["OK"]:
        log.error( "unable to create 'RegisterReplica' operations: %s" % registerFiles["Message"] )
      if request.Status == "Waiting":
        log.info( "request is in 'Waiting' state, will put it back to RMS" )
        return self.putRequest( request )
    # # PHASE FIVE - reschedule operation files
    if toReschedule:
      log.info( "==> found %s Files to reschedule" % len( toReschedule ) )
      rescheduleFiles = self.__reschedule( request, operation, toReschedule )
      if not rescheduleFiles["OK"]:
        log.error( rescheduleFiles["Message"] )
      if request.Status == "Waiting":
        log.info( "request is in 'Waiting' state, will put it back to ReqDB" )
        return self.putRequest( request )
    # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs
    ftsFiles = self.ftsClient().getFTSFilesForRequest( request.RequestID, [ "Waiting" ] )
    if not ftsFiles["OK"]:
      log.error( ftsFiles["Message"] )
    else:
      # de-duplicate: only add DB 'Waiting' files not already queued for submission
      retryIds = list( set ( [ ftsFile.FTSFileID for ftsFile in toSubmit ] ) )
      for ftsFile in ftsFiles["Value"]:
        if ftsFile.FTSFileID not in retryIds:
          toSubmit.append( ftsFile )
          retryIds.append( ftsFile.FTSFileID )
    # # submit new ftsJobs
    if operation.Status == "Scheduled" and toSubmit:
      log.info( "==> found %s FTSFiles to submit" % len( toSubmit ) )
      submit = self.__submit( request, operation, toSubmit )
      if not submit["OK"]:
        log.error( submit["Message"] )
      else:
        ftsJobs += submit["Value"]
    # # status change? - put back request
    if request.Status != "Scheduled":
      put = self.putRequest( request )
      if not put["OK"]:
        log.error( "unable to put back request: %s" % put["Message"] )
        return put
    # # put back jobs
    if ftsJobs:
      putJobs = self.putFTSJobs( ftsJobs )
      if not putJobs["OK"]:
        log.error( "unable to put back FTSJobs: %s" % putJobs["Message"] )
        return putJobs
    return S_OK()

  def __reschedule( self, request, operation, toReschedule ):
    """ reschedule list of :toReschedule: files in request for operation :operation:

    Flags the matching operation files back to 'Waiting', filters their replicas
    and calls FTSClient.ftsSchedule for the files that still have valid sources.

    :param Request request:
    :param Operation operation:
    :param list toReschedule: list of FTSFiles
    """
    log = self.log.getSubLogger( "%s/reschedule" % request.RequestName )
    log.info( "found %s files to reschedule" % len( toReschedule ) )
    for opFile in operation:
      for ftsFile in toReschedule:
        if opFile.FileID == ftsFile.FileID:
          opFile.Status = "Waiting"
    toSchedule = []
    # # filter files
    for opFile in operation.getWaitingFilesList():
      replicas = self.__filterReplicas( opFile )
      if not replicas["OK"]:
        continue
      replicas = replicas["Value"]
      if not replicas["Valid"] and replicas["Banned"]:
        log.warn( "unable to schedule '%s', replicas only at banned SEs" % opFile.LFN )
        continue
      validReplicas = replicas["Valid"]
      bannedReplicas = replicas["Banned"]
      # NOTE(review): this duplicates the banned-only check just above; confirm intended
      if not validReplicas and bannedReplicas:
        log.warn( "unable to schedule '%s', replicas only at banned SEs" % opFile.LFN )
        continue
      if validReplicas:
        validTargets = list( set( operation.targetSEList ) - set( validReplicas ) )
        if not validTargets:
          log.info( "file %s is already present at all targets" % opFile.LFN )
          opFile.Status = "Done"
          continue
        toSchedule.append( ( opFile.toJSON()["Value"], validReplicas, validTargets ) )
    # # do real schedule here
    if toSchedule:
      ftsSchedule = self.ftsClient().ftsSchedule( request.RequestID, operation.OperationID, toSchedule )
      if not ftsSchedule["OK"]:
        self.log.error( ftsSchedule["Message"] )
        return ftsSchedule
      ftsSchedule = ftsSchedule["Value"]
      for fileID in ftsSchedule["Successful"]:
        for opFile in operation:
          if fileID == opFile.FileID:
            opFile.Status = "Scheduled"
      for fileID, reason in ftsSchedule["Failed"]:
        for opFile in operation:
          if fileID == opFile.FileID:
            opFile.Error = reason
    return S_OK()

  def __submit( self, request, operation, toSubmit ):
    """ create and submit new FTSJobs using list of FTSFiles

    Files are grouped by (SourceSE, TargetSE) pair; one FTSJob is created and
    submitted per pair, subject to route banning and MaxActiveJobs throttling.

    :param Request request: ReqDB.Request instance
    :param list ftsFiles: list of FTSFile instances

    :return: [ FTSJob, FTSJob, ...]
    """
    log = self.log.getSubLogger( "%s/submit" % request.RequestName )
    bySourceAndTarget = {}
    for ftsFile in toSubmit:
      if ftsFile.SourceSE not in bySourceAndTarget:
        bySourceAndTarget.setdefault( ftsFile.SourceSE, {} )
      if ftsFile.TargetSE not in bySourceAndTarget[ftsFile.SourceSE]:
        bySourceAndTarget[ftsFile.SourceSE].setdefault( ftsFile.TargetSE, [] )
      bySourceAndTarget[ftsFile.SourceSE][ftsFile.TargetSE].append( ftsFile )
    ftsJobs = []
    for source, targetDict in bySourceAndTarget.items():
      for target, ftsFileList in targetDict.items():
        log.info( "found %s files to submit from %s to %s" % ( len( ftsFileList ), source, target ) )
        route = self.__ftsGraph.findRoute( source, target )
        if not route["OK"]:
          log.error( route["Message"] )
          continue
        route = route["Value"]
        sourceRead = route.fromNode.SEs[source]["read"]
        if not sourceRead:
          log.error( "SourceSE %s is banned for reading right now" % source )
          continue
        targetWrite = route.toNode.SEs[target]["write"]
        if not targetWrite:
          log.error( "TargetSE %s is banned for writing right now" % target )
          continue
        if route.ActiveJobs > route.toNode.MaxActiveJobs:
          log.warn( "unable to submit new FTS job, max active jobs reached" )
          continue
        # # create FTSJob
        ftsJob = FTSJob()
        ftsJob.RequestID = request.RequestID
        ftsJob.OperationID = operation.OperationID
        ftsJob.SourceSE = source
        ftsJob.TargetSE = target
        sourceSE = self.getSE( source )
        sourceToken = sourceSE.getStorageParameters( "SRM2" )
        if not sourceToken["OK"]:
          log.error( "unable to get sourceSE '%s' parameters: %s" % ( source, sourceToken["Message"] ) )
          continue
        ftsJob.SourceToken = sourceToken["Value"].get( "SpaceToken", "" )
        targetSE = self.getSE( target )
        targetToken = targetSE.getStorageParameters( "SRM2" )
        if not targetToken["OK"]:
          log.error( "unable to get targetSE '%s' parameters: %s" % ( target, targetToken["Message"] ) )
          continue
        ftsJob.TargetToken = targetToken["Value"].get( "SpaceToken", "" )
        ftsJob.FTSServer = route.toNode.FTSServer
        for ftsFile in ftsFileList:
          ftsFile.Attempt += 1
          ftsFile.Error = ""
          ftsJob.addFile( ftsFile )
        submit = ftsJob.submitFTS2( self.STAGE_FILES )
        if not submit["OK"]:
          log.error( "unable to submit FTSJob: %s" % submit["Message"] )
          continue
        log.info( "FTSJob '%s'@'%s' has been submitted" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )
        # # update statuses for job files
        for ftsFile in ftsJob:
          ftsFile.FTSGUID = ftsJob.FTSGUID
          ftsFile.Status = "Submitted"
          # NOTE(review): Attempt was already incremented before submission above,
          # so each submitted file is counted twice per submit - confirm intended
          ftsFile.Attempt += 1
        # # update graph route
        try:
          self.updateLock().acquire()
          route.ActiveJobs += 1
        finally:
          self.updateLock().release()
        ftsJobs.append( ftsJob )
    log.info( "%s new FTSJobs have been submitted" % len( ftsJobs ) )
    return S_OK( ftsJobs )

  def __monitorJob( self, request, ftsJob ):
    """ execute FTSJob.monitorFTS2 for a given :ftsJob:
        if ftsJob is in a final state, finalize it

    :param Request request: ReqDB.Request instance
    :param FTSJob ftsJob: FTSDB.FTSJob instance
    """
    log = self.log.getSubLogger( "%s/monitor/%s" % ( request.RequestName, ftsJob.FTSGUID ) )
    log.info( "FTSJob '%s'@'%s'" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )
    # # this will be returned
    ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] )
    monitor = ftsJob.monitorFTS2()
    if not monitor["OK"]:
      gMonitor.addMark( "FTSMonitorFail", 1 )
      log.error( monitor["Message"] )
      if "getTransferJobSummary2: Not authorised to query request" in monitor["Message"]:
        log.error( "FTSJob not known (expired on server?)" )
        for ftsFile in ftsJob:
          ftsFile.Status = "Waiting"
          # NOTE(review): assignment replaces the 'toSubmit' list with a single FTSFile
          # on each iteration - looks like it should be .append( ftsFile ); confirm
          ftsFilesDict["toSubmit"] = ftsFile
        return S_OK( ftsFilesDict )
      return monitor
    monitor = monitor["Value"]
    log.info( "FTSJob Status = %s Completeness = %s" % ( ftsJob.Status, ftsJob.Completeness ) )
    # # monitor status change
    gMonitor.addMark( "FTSJobs%s" % ftsJob.Status, 1 )
    if ftsJob.Status in FTSJob.FINALSTATES:
      finalizeFTSJob = self.__finalizeFTSJob( request, ftsJob )
      if not finalizeFTSJob["OK"]:
        log.error( finalizeFTSJob["Message"] )
        return finalizeFTSJob
      ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, finalizeFTSJob["Value"] )
    return S_OK( ftsFilesDict )

  def __finalizeFTSJob( self, request, ftsJob ):
    """ finalize FTSJob

    Runs a full monitor pass, sorts the job's files into categories, sends the
    accounting record and releases the job's slot on the placement graph route.

    :param Request request: ReqDB.Request instance
    :param FTSJob ftsJob: FTSDB.FTSJob instance
    """
    log = self.log.getSubLogger( "%s/monitor/%s/finalize" % ( request.RequestName, ftsJob.FTSJobID ) )
    log.info( "finalizing FTSJob %s@%s" % ( ftsJob.FTSGUID, ftsJob.FTSServer ) )
    # # this will be returned
    ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", "toFail", "toReschedule", "toUpdate" ) ] )
    monitor = ftsJob.monitorFTS2( full = True )
    if not monitor["OK"]:
      log.error( monitor["Message"] )
      return monitor
    # # split FTSFiles to different categories
    processFiles = self.__filterFiles( ftsJob )
    if not processFiles["OK"]:
      log.error( processFiles["Message"] )
      return processFiles
    ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, processFiles["Value"] )
    # # send accounting record for this job
    self.__sendAccounting( ftsJob, request.OwnerDN )
    # # update graph - remove this job from graph
    route = self.__ftsGraph.findRoute( ftsJob.SourceSE, ftsJob.TargetSE )
    if route["OK"]:
      try:
        self.updateLock().acquire()
        route["Value"].ActiveJobs -= 1
      finally:
        self.updateLock().release()
    log.info( "FTSJob is finalized" )
    return S_OK( ftsFilesDict )

  def __filterFiles( self, ftsJob ):
    """ process ftsFiles from finished ftsJob

    Splits the job's files into the categories consumed by the caller:
    toUpdate (finished), toRegister (finished but catalog registration failed),
    toReschedule (missing source), toSubmit (failed, attempts left),
    toFail (failed, MAX_ATTEMPT reached).

    :param FTSJob ftsJob: monitored FTSJob instance
    """
    # # lists for different categories
    toUpdate = []
    toReschedule = []
    toRegister = []
    toSubmit = []
    toFail = []
    # # loop over files in fts job
    for ftsFile in ftsJob:
      # # successful files
      if ftsFile.Status == "Finished":
        if ftsFile.Error == "AddCatalogReplicaFailed":
          toRegister.append( ftsFile )
        toUpdate.append( ftsFile )
        continue
      if ftsFile.Status == "Failed":
        if ftsFile.Error == "MissingSource":
          toReschedule.append( ftsFile )
        else:
          if ftsFile.Attempt < self.MAX_ATTEMPT:
            toSubmit.append( ftsFile )
          else:
            toFail.append( ftsFile )
            ftsFile.Error = "Max attempts reached"
    return S_OK( { "toUpdate": toUpdate,
                   "toSubmit": toSubmit,
                   "toRegister": toRegister,
                   "toReschedule": toReschedule,
                   "toFail": toFail } )

  def __register( self, request, operation, toRegister ):
    """ add RegisterReplica operation

    One 'RegisterReplica' operation per target SE is inserted into the request
    just before :operation:.

    :param Request request: request instance
    :param Operation transferOp: 'ReplicateAndRegister' operation for this FTSJob
    :param list toRegister: [ FTSDB.FTSFile, ... ] - files that failed to register
    """
    log = self.log.getSubLogger( "%s/registerFiles" % request.RequestName )
    byTarget = {}
    for ftsFile in toRegister:
      if ftsFile.TargetSE not in byTarget:
        byTarget.setdefault( ftsFile.TargetSE, [] )
      byTarget[ftsFile.TargetSE].append( ftsFile )
    log.info( "will create %s 'RegisterReplica' operations" % len( byTarget ) )
    for target, ftsFileList in byTarget.items():
      log.info( "creating 'RegisterReplica' operation for targetSE %s with %s files..." % ( target, len( ftsFileList ) ) )
      registerOperation = Operation()
      registerOperation.Type = "RegisterReplica"
      registerOperation.Status = "Waiting"
      registerOperation.TargetSE = target
      targetSE = self.getSE( target )
      for ftsFile in ftsFileList:
        opFile = File()
        opFile.LFN = ftsFile.LFN
        pfn = targetSE.getPfnForProtocol( ftsFile.TargetSURL, "SRM2", withPort = False )
        # NOTE(review): files whose PFN cannot be built are silently dropped here - confirm
        if not pfn["OK"]:
          continue
        opFile.PFN = pfn["Value"]
        registerOperation.addFile( opFile )
      request.insertBefore( registerOperation, operation )
    return S_OK()

  @staticmethod
  def __sendAccounting( ftsJob, ownerDN ):
    """ prepare and send DataOperation record to the AccountingDB """
    dataOp = DataOperation()
    dataOp.setStartTime( fromString( ftsJob.SubmitTime ) )
    dataOp.setEndTime( fromString( ftsJob.LastUpdate ) )
    accountingDict = dict()
    accountingDict["OperationType"] = "ReplicateAndRegister"
    username = getUsernameForDN( ownerDN )
    if not username["OK"]:
      # fall back to the raw DN when the username cannot be resolved
      username = ownerDN
    else:
      username = username["Value"]
    accountingDict["User"] = username
    accountingDict["Protocol"] = "FTS3" if 'fts3' in ftsJob.FTSServer.lower() else 'FTS'
    accountingDict['ExecutionSite'] = ftsJob.FTSServer
    accountingDict['RegistrationTime'] = ftsJob._regTime
    accountingDict['RegistrationOK'] = ftsJob._regSuccess
    accountingDict['RegistrationTotal'] = ftsJob._regTotal
    accountingDict["TransferOK"] = len( [ f for f in ftsJob if f.Status in FTSFile.SUCCESS_STATES ] )
    accountingDict["TransferTotal"] = len( ftsJob )
    accountingDict["TransferSize"] = ftsJob.Size - ftsJob.FailedSize
    accountingDict["FinalStatus"] = ftsJob.Status
    accountingDict["Source"] = ftsJob.SourceSE
    accountingDict["Destination"] = ftsJob.TargetSE
    dt = ftsJob.LastUpdate - ftsJob.SubmitTime
    transferTime = dt.days * 86400 + dt.seconds
    accountingDict["TransferTime"] = transferTime
    # accountingDict['TransferTime'] = sum( [f._duration for f in ftsJob])
    dataOp.setValuesFromDict( accountingDict )
    dataOp.commit()

  def __checkReadyReplicas( self, request, operation ):
    """ check ready replicas for transferOperation

    Marks operation files 'Done' once they are present at every target SE and
    returns the { LFN : [ missing targetSE, ... ] } map for the rest.
    """
    log = self.log.getSubLogger( "%s/checkReadyReplicas" % request.RequestName )
    targetSESet = set( operation.targetSEList )
    # # { LFN: [ targetSE, ... ] }
    missingReplicas = {}
    scheduledFiles = dict( [ ( opFile.LFN, opFile ) for opFile in operation
                             if opFile.Status in ( "Scheduled", "Waiting" ) ] )
    # # get replicas
    replicas = self.replicaManager().getCatalogReplicas( scheduledFiles.keys() )
    if not replicas["OK"]:
      self.log.error( replicas["Message"] )
      return replicas
    replicas = replicas["Value"]
    fullyReplicated = 0
    missingSEs = {}
    for successfulLFN in replicas["Successful"]:
      reps = set( replicas['Successful'][successfulLFN] )
      if targetSESet.issubset( reps ):
        log.info( "%s has been replicated to all targets" % successfulLFN )
        fullyReplicated += 1
        scheduledFiles[successfulLFN].Status = "Done"
      else:
        missingReplicas[successfulLFN] = sorted( targetSESet - reps )
        ses = ",".join( missingReplicas[ successfulLFN ] )
        missingSEs[ses] = missingSEs.setdefault( ses, 0 ) + 1
        log.verbose( "%s is still missing at %s" % ( successfulLFN, ses ) )
    if fullyReplicated:
      log.info( "%d new files have been replicated to all targets" % fullyReplicated )
    if missingSEs:
      for ses in missingSEs:
        log.info( "%d replicas still missing at %s" % ( missingSEs[ses], ses ) )
    # files whose catalog lookup failed: flag lost files 'Failed', keep the rest
    reMissing = re.compile( "no such file or directory" )
    for failedLFN, errStr in replicas["Failed"].items():
      scheduledFiles[failedLFN].Error = errStr
      if reMissing.search( errStr.lower() ):
        log.error( "%s is missing, setting its status to 'Failed'" % failedLFN )
        scheduledFiles[failedLFN].Status = "Failed"
      else:
        log.warn( "unable to read replicas for %s: %s" % ( failedLFN, errStr ) )
    return S_OK( missingReplicas )

  def __filterReplicas( self, opFile ):
    """ filter out banned/invalid source SEs

    Returns { "Valid": [...], "Banned": [...], "Bad": [...] } SE name lists for
    the active replicas of :opFile:, checking PFN resolution and checksum.
    """
    log = self.log.getSubLogger( "filterReplicas" )
    ret = { "Valid" : [], "Banned" : [], "Bad" : [] }
    replicas = self.replicaManager().getActiveReplicas( opFile.LFN )
    if not replicas["OK"]:
      # NOTE(review): on error this only logs and still falls through to
      # replicas["Value"] below - confirm intended
      log.error( replicas["Message"] )
    # NOTE(review): pattern reads "not such file or directory" while other code in
    # this file matches "no such file or directory" - likely a typo, confirm
    reNotExists = re.compile( "not such file or directory" )
    replicas = replicas["Value"]
    failed = replicas["Failed"].get( opFile.LFN , "" )
    if reNotExists.match( failed.lower() ):
      opFile.Status = "Failed"
      opFile.Error = failed
      return S_ERROR( failed )
    replicas = replicas["Successful"][opFile.LFN] if opFile.LFN in replicas["Successful"] else {}
    for repSEName in replicas:
      repSE = self.getSE( repSEName )
      pfn = repSE.getPfnForLfn( opFile.LFN )
      if not pfn["OK"]:
        log.warn( "unable to create pfn for %s lfn: %s" % ( opFile.LFN, pfn["Message"] ) )
        ret["Banned"].append( repSEName )
        continue
      pfn = pfn["Value"]
      repSEMetadata = repSE.getFileMetadata( pfn, singleFile = True )
      if not repSEMetadata["OK"]:
        self.log.warn( repSEMetadata["Message"] )
        ret["Banned"].append( repSEName )
        continue
      repSEMetadata = repSEMetadata["Value"]
      # normalise SRM checksum: replace 'x' placeholders and left-pad to 8 hex digits
      seChecksum = repSEMetadata["Checksum"].replace( "x", "0" ).zfill( 8 ) if "Checksum" in repSEMetadata else None
      if opFile.Checksum and opFile.Checksum != seChecksum:
        self.log.warn( " %s checksum mismatch: %s %s:%s" % ( opFile.LFN, opFile.Checksum, repSE, seChecksum ) )
        ret["Bad"].append( repSEName )
        continue
      # # if we're here repSE is OK
      ret["Valid"].append( repSEName )
    return S_OK( ret )
class RemovalAgent( AgentModule, RequestAgentMixIn ):
  """
  This Agent takes care of executing "removal" request from the RequestManagement system

  It fetches 'removal' requests from the request DB and executes their waiting
  sub-requests ( physicalRemoval, removeFile, replicaRemoval, reTransfer ) in a
  thread pool, updating per-file statuses and gMonitor counters as it goes.
  """

  def __init__( self, *args ):
    """ Initialize the base class and define some extra data members
    """
    AgentModule.__init__( self, *args )
    self.requestDBClient = None
    self.replicaManager = None
    # defaults, overridden from configuration in initialize()
    self.maxNumberOfThreads = 4
    self.maxRequestsInQueue = 100
    self.threadPool = None

  def initialize( self ):
    """ Called by the framework upon startup, before any cycle (execute method bellow)

    :return: S_OK
    """
    self.requestDBClient = RequestClient()
    self.replicaManager = ReplicaManager()
    # register all monitoring activities used by executeRequest()
    gMonitor.registerActivity( "Iteration", "Agent Loops", "RemovalAgent", "Loops/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Execute", "Request Processed", "RemovalAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "Done", "Request Completed", "RemovalAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "PhysicalRemovalAtt", "Physical removals attempted", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "PhysicalRemovalDone", "Successful physical removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "PhysicalRemovalFail", "Failed physical removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "PhysicalRemovalSize", "Physically removed size", "RemovalAgent", "Bytes", gMonitor.OP_ACUM )
    gMonitor.registerActivity( "ReplicaRemovalAtt", "Replica removal attempted", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "ReplicaRemovalDone", "Successful replica removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "ReplicaRemovalFail", "Failed replica removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RemoveFileAtt", "File removal attempted", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RemoveFileDone", "File removal done", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RemoveFileFail", "File removal failed", "RemovalAgent", "Removal/min", gMonitor.OP_SUM )
    self.maxNumberOfThreads = self.am_getOption( 'NumberOfThreads', self.maxNumberOfThreads )
    self.maxRequestsInQueue = self.am_getOption( 'RequestsInQueue', self.maxRequestsInQueue )
    self.threadPool = ThreadPool( 1, self.maxNumberOfThreads, self.maxRequestsInQueue )
    # Set the ThreadPool in daemon mode to process new ThreadedJobs as they are inserted
    self.threadPool.daemonize()
    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    return S_OK()

  def execute( self ):
    """ Fill the TreadPool with ThreadJobs, one per request, until the pool
        queue refuses new jobs for this cycle.
    """
    while True:
      requestExecutor = ThreadedJob( self.executeRequest )
      ret = self.threadPool.queueJob( requestExecutor )
      if not ret['OK']:
        # queue is full - stop feeding it, the daemonized pool drains it
        break
    return S_OK()

  def executeRequest( self ):
    """ Do the actual work in the Thread: fetch one 'removal' request, execute
        all of its waiting sub-requests and upload the modified request back to
        the request DB.

    :return: S_OK in all handled cases (errors are logged, not propagated)
    """
    ################################################
    # Get a request from request DB
    gMonitor.addMark( "Iteration", 1 )
    res = self.requestDBClient.getRequest( 'removal' )
    if not res['OK']:
      gLogger.info( "RemovalAgent.execute: Failed to get request from database." )
      return S_OK()
    elif not res['Value']:
      gLogger.info( "RemovalAgent.execute: No requests to be executed found." )
      return S_OK()
    requestString = res['Value']['RequestString']
    requestName = res['Value']['RequestName']
    sourceServer = res['Value']['Server']
    try:
      jobID = int( res['Value']['JobID'] )
    except ( ValueError, TypeError ):
      # BUGFIX: JobID can be None (int(None) raises TypeError, not ValueError);
      # treat missing/malformed JobID as "no job attached"
      jobID = 0
    gLogger.info( "RemovalAgent.execute: Obtained request %s" % requestName )

    result = self.requestDBClient.getCurrentExecutionOrder( requestName, sourceServer )
    if result['OK']:
      currentOrder = result['Value']
    else:
      gLogger.error( 'Can not get the request execution order' )
      return S_OK( 'Can not get the request execution order' )

    oRequest = RequestContainer( request = requestString )

    ################################################
    # Find the number of sub-requests from the request
    res = oRequest.getNumSubRequests( 'removal' )
    if not res['OK']:
      errStr = "RemovalAgent.execute: Failed to obtain number of removal subrequests."
      gLogger.error( errStr, res['Message'] )
      return S_OK()
    gLogger.info( "RemovalAgent.execute: Found %s sub requests." % res['Value'] )

    ################################################
    # For all the sub-requests in the request
    modified = False
    for ind in range( res['Value'] ):
      gMonitor.addMark( "Execute", 1 )
      gLogger.info( "RemovalAgent.execute: Processing sub-request %s." % ind )
      subRequestAttributes = oRequest.getSubRequestAttributes( ind, 'removal' )['Value']
      subExecutionOrder = int( subRequestAttributes['ExecutionOrder'] )
      subStatus = subRequestAttributes['Status']
      # only execute sub-requests that are waiting and whose order has been reached
      if subStatus == 'Waiting' and subExecutionOrder <= currentOrder:
        subRequestFiles = oRequest.getSubRequestFiles( ind, 'removal' )['Value']
        operation = subRequestAttributes['Operation']

        ################################################
        #  If the sub-request is a physical removal operation
        if operation == 'physicalRemoval':
          gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
          diracSEs = subRequestAttributes['TargetSE'].split( ',' )
          physicalFiles = []
          pfnToLfn = {}
          for subRequestFile in subRequestFiles:
            if subRequestFile['Status'] == 'Waiting':
              pfn = str( subRequestFile['PFN'] )
              lfn = str( subRequestFile['LFN'] )
              pfnToLfn[pfn] = lfn
              physicalFiles.append( pfn )
          gMonitor.addMark( 'PhysicalRemovalAtt', len( physicalFiles ) )
          # failed: { pfn : { SE : reason } }, errMsg: { SE : whole-call error }
          failed = {}
          errMsg = {}
          for diracSE in diracSEs:
            res = self.replicaManager.removeStorageFile( physicalFiles, diracSE )
            if res['OK']:
              for pfn in res['Value']['Failed'].keys():
                if pfn not in failed:
                  failed[pfn] = {}
                failed[pfn][diracSE] = res['Value']['Failed'][pfn]
            else:
              # the whole call failed: flag every file as failed at this SE
              errMsg[diracSE] = res['Message']
              for pfn in physicalFiles:
                if pfn not in failed:
                  failed[pfn] = {}
                failed[pfn][diracSE] = 'Completely'
          # Now analyse the results
          failedPFNs = failed.keys()
          pfnsOK = [ pfn for pfn in physicalFiles if not pfn in failedPFNs ]
          gMonitor.addMark( 'PhysicalRemovalDone', len( pfnsOK ) )
          for pfn in pfnsOK:
            gLogger.info( "RemovalAgent.execute: Successfully removed %s at %s" % ( pfn, str( diracSEs ) ) )
            res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', pfnToLfn[pfn], 'Status', 'Done' )
            if not res['OK']:
              gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', pfnToLfn[pfn] ) )
            modified = True
          if failed:
            gMonitor.addMark( 'PhysicalRemovalFail', len( failedPFNs ) )
            for pfn in failedPFNs:
              for diracSE in failed[pfn].keys():
                if isinstance( failed[pfn][diracSE], StringTypes ):
                  # a missing file counts as successfully removed
                  if re.search( 'no such file or directory', failed[pfn][diracSE].lower() ):
                    gLogger.info( "RemovalAgent.execute: File did not exist.", pfn )
                    res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', pfnToLfn[pfn], 'Status', 'Done' )
                    if not res['OK']:
                      gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', pfnToLfn[pfn] ) )
                    modified = True
                  else:
                    gLogger.info( "RemovalAgent.execute: Failed to remove file.", "%s at %s - %s" % ( pfn, diracSE, failed[pfn][diracSE] ) )
          if errMsg:
            for diracSE in errMsg.keys():
              # BUGFIX: this used to build a (message, SE) tuple instead of a string
              errStr = "RemovalAgent.execute: Completely failed to remove replicas. At %s" % diracSE
              gLogger.error( errStr, errMsg[diracSE] )

        ################################################
        #  If the sub-request is a file removal operation
        elif operation == 'removeFile':
          gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
          lfns = []
          for subRequestFile in subRequestFiles:
            if subRequestFile['Status'] == 'Waiting':
              lfn = str( subRequestFile['LFN'] )
              lfns.append( lfn )
          gMonitor.addMark( 'RemoveFileAtt', len( lfns ) )
          res = self.replicaManager.removeFile( lfns )
          if res['OK']:
            gMonitor.addMark( 'RemoveFileDone', len( res['Value']['Successful'].keys() ) )
            for lfn in res['Value']['Successful'].keys():
              gLogger.info( "RemovalAgent.execute: Successfully removed %s." % lfn )
              result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
              if not result['OK']:
                gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
              modified = True
            gMonitor.addMark( 'RemoveFileFail', len( res['Value']['Failed'].keys() ) )
            for lfn in res['Value']['Failed'].keys():
              if isinstance( res['Value']['Failed'][lfn], StringTypes ):
                # a missing file counts as successfully removed
                if re.search( 'no such file or directory', res['Value']['Failed'][lfn].lower() ):
                  gLogger.info( "RemovalAgent.execute: File did not exist.", lfn )
                  result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
                  if not result['OK']:
                    gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
                  modified = True
                else:
                  gLogger.info( "RemovalAgent.execute: Failed to remove file:", "%s %s" % ( lfn, res['Value']['Failed'][lfn] ) )
          else:
            gMonitor.addMark( 'RemoveFileFail', len( lfns ) )
            errStr = "RemovalAgent.execute: Completely failed to remove files."
            gLogger.error( errStr, res['Message'] )

        ################################################
        #  If the sub-request is a replica removal operation
        elif operation == 'replicaRemoval':
          gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
          diracSEs = subRequestAttributes['TargetSE'].split( ',' )
          lfns = []
          for subRequestFile in subRequestFiles:
            if subRequestFile['Status'] == 'Waiting':
              lfn = str( subRequestFile['LFN'] )
              lfns.append( lfn )
          gMonitor.addMark( 'ReplicaRemovalAtt', len( lfns ) )
          # failed: { lfn : { SE : reason } }, errMsg: { SE : whole-call error }
          failed = {}
          errMsg = {}
          for diracSE in diracSEs:
            res = self.replicaManager.removeReplica( diracSE, lfns )
            if res['OK']:
              for lfn in res['Value']['Failed'].keys():
                if lfn not in failed:
                  failed[lfn] = {}
                failed[lfn][diracSE] = res['Value']['Failed'][lfn]
            else:
              errMsg[diracSE] = res['Message']
              for lfn in lfns:
                if lfn not in failed:
                  failed[lfn] = {}
                failed[lfn][diracSE] = 'Completely'
          # Now analyse the results
          failedLFNs = failed.keys()
          lfnsOK = [ lfn for lfn in lfns if not lfn in failedLFNs ]
          gMonitor.addMark( 'ReplicaRemovalDone', len( lfnsOK ) )
          for lfn in lfnsOK:
            gLogger.info( "RemovalAgent.execute: Successfully removed %s at %s" % ( lfn, str( diracSEs ) ) )
            res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
            if not res['OK']:
              gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
            modified = True
          if failed:
            # BUGFIX: replica removal failures were counted against 'PhysicalRemovalFail'
            gMonitor.addMark( 'ReplicaRemovalFail', len( failedLFNs ) )
            for lfn in failedLFNs:
              for diracSE in failed[lfn].keys():
                if isinstance( failed[lfn][diracSE], StringTypes ):
                  # a missing file counts as successfully removed
                  if re.search( 'no such file or directory', failed[lfn][diracSE].lower() ):
                    gLogger.info( "RemovalAgent.execute: File did not exist.", lfn )
                    res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
                    if not res['OK']:
                      gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
                    modified = True
                  else:
                    gLogger.info( "RemovalAgent.execute: Failed to remove file.", "%s at %s - %s" % ( lfn, diracSE, failed[lfn][diracSE] ) )
          if errMsg:
            for diracSE in errMsg.keys():
              # BUGFIX: this used to build a (message, SE) tuple instead of a string
              errStr = "RemovalAgent.execute: Completely failed to remove replicas. At %s" % diracSE
              gLogger.error( errStr, errMsg[diracSE] )

        ################################################
        #  If the sub-request is a request to the online system to retransfer
        elif operation == 'reTransfer':
          gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation )
          diracSE = subRequestAttributes['TargetSE']
          for subRequestFile in subRequestFiles:
            if subRequestFile['Status'] == 'Waiting':
              pfn = str( subRequestFile['PFN'] )
              lfn = str( subRequestFile['LFN'] )
              res = self.replicaManager.onlineRetransfer( diracSE, pfn )
              if res['OK']:
                if pfn in res['Value']['Successful']:
                  gLogger.info( "RemovalAgent.execute: Successfully requested retransfer of %s." % pfn )
                  result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done' )
                  if not result['OK']:
                    gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ( 'Done', lfn ) )
                  modified = True
                else:
                  errStr = "RemovalAgent.execute: Failed to request retransfer."
                  gLogger.error( errStr, "%s %s %s" % ( pfn, diracSE, res['Value']['Failed'][pfn] ) )
              else:
                errStr = "RemovalAgent.execute: Completely failed to request retransfer."
                gLogger.error( errStr, res['Message'] )
            else:
              gLogger.info( "RemovalAgent.execute: File already completed." )

        ################################################
        #  If the sub-request is none of the above types
        else:
          gLogger.error( "RemovalAgent.execute: Operation not supported.", operation )

        ################################################
        #  Determine whether there are any active files
        if oRequest.isSubRequestEmpty( ind, 'removal' )['Value']:
          oRequest.setSubRequestStatus( ind, 'removal', 'Done' )
          gMonitor.addMark( "Done", 1 )

      ################################################
      #  If the sub-request is already in terminal state
      else:
        gLogger.info( "RemovalAgent.execute:",
                      "Sub-request %s is status '%s' and not to be executed." % ( ind, subRequestAttributes['Status'] ) )

    ################################################
    #  Generate the new request string after operation
    requestString = oRequest.toXML()['Value']
    res = self.requestDBClient.updateRequest( requestName, requestString, sourceServer )
    if modified and jobID:
      result = self.finalizeRequest( requestName, jobID, sourceServer )
    return S_OK()

  def finalize( self ):
    """ Called by the Agent framework to cleanly end execution.
        In this case this module will wait until all pending ThreadedJbos in the
        ThreadPool get executed
    """
    self.threadPool.processAllResults()
    return S_OK()
class RemovalAgent(AgentModule, RequestAgentMixIn): """ This Agent takes care of executing "removal" request from the RequestManagement system """ def __init__(self, *args): """ Initialize the base class and define some extra data members """ AgentModule.__init__(self, *args) self.requestDBClient = None self.replicaManager = None self.maxNumberOfThreads = 4 self.maxRequestsInQueue = 100 self.threadPool = None self.timeOutCounter = 0 self.pendingRequests = True def initialize(self): """ Called by the framework upon startup, before any cycle (execute method bellow) """ self.requestDBClient = RequestClient() # the RequestAgentMixIn needs the capitalized version, until is is fixed keep this. self.RequestDBClient = self.requestDBClient self.replicaManager = ReplicaManager() gMonitor.registerActivity("Iteration", "Agent Loops", "RemovalAgent", "Loops/min", gMonitor.OP_SUM) gMonitor.registerActivity("Execute", "Request Processed", "RemovalAgent", "Requests/min", gMonitor.OP_SUM) gMonitor.registerActivity("Done", "Request Completed", "RemovalAgent", "Requests/min", gMonitor.OP_SUM) gMonitor.registerActivity("PhysicalRemovalAtt", "Physical removals attempted", "RemovalAgent", "Removal/min", gMonitor.OP_SUM) gMonitor.registerActivity("PhysicalRemovalDone", "Successful physical removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM) gMonitor.registerActivity("PhysicalRemovalFail", "Failed physical removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM) gMonitor.registerActivity("PhysicalRemovalSize", "Physically removed size", "RemovalAgent", "Bytes", gMonitor.OP_ACUM) gMonitor.registerActivity("ReplicaRemovalAtt", "Replica removal attempted", "RemovalAgent", "Removal/min", gMonitor.OP_SUM) gMonitor.registerActivity("ReplicaRemovalDone", "Successful replica removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM) gMonitor.registerActivity("ReplicaRemovalFail", "Failed replica removals", "RemovalAgent", "Removal/min", gMonitor.OP_SUM) 
gMonitor.registerActivity("RemoveFileAtt", "File removal attempted", "RemovalAgent", "Removal/min", gMonitor.OP_SUM) gMonitor.registerActivity("RemoveFileDone", "File removal done", "RemovalAgent", "Removal/min", gMonitor.OP_SUM) gMonitor.registerActivity("RemoveFileFail", "File removal failed", "RemovalAgent", "Removal/min", gMonitor.OP_SUM) self.maxNumberOfThreads = self.am_getOption('NumberOfThreads', self.maxNumberOfThreads) self.maxRequestsInQueue = self.am_getOption('RequestsInQueue', self.maxRequestsInQueue) self.threadPool = ThreadPool(1, self.maxNumberOfThreads, self.maxRequestsInQueue) # Set the ThreadPool in daemon mode to process new ThreadedJobs as they are inserted self.threadPool.daemonize() self.maxRequests = self.am_getOption('MaxRequestsPerCycle', 1200.) # This sets the Default Proxy to used as that defined under # /Operations/Shifter/DataManager # the shifterProxy option in the Configuration can be used to change this default. self.am_setOption('shifterProxy', 'DataManager') return S_OK() def execute(self): """ Fill the TreadPool with ThreadJobs """ self.pendingRequests = True self.maxRequests = min( 10000., self.am_getOption('MaxRequestsPerCycle', self.maxRequests)) requestCounter = 0 while self.pendingRequests: if requestCounter > self.maxRequests: break requestCounter += 1 requestExecutor = ThreadedJob(self.executeRequest) ret = self.threadPool.queueJob(requestExecutor) if not ret['OK']: break time.sleep(0.1) if self.timeOutCounter: gLogger.error('Timeouts during removal execution:', self.timeOutCounter) return S_OK() def executeRequest(self): """ Do the actual work in the Thread """ ################################################ # Get a request from request DB gMonitor.addMark("Iteration", 1) res = self.requestDBClient.getRequest('removal') if not res['OK']: gLogger.info( "RemovalAgent.execute: Failed to get request from database.") return S_OK() elif not res['Value']: gLogger.info( "RemovalAgent.execute: No requests to be executed found.") 
self.pendingRequests = False return S_OK() requestString = res['Value']['RequestString'] requestName = res['Value']['RequestName'] sourceServer = res['Value']['Server'] jobID = 0 try: jobID = int(res['Value']['JobID']) except: gLogger.warn( "RemovalAgent.execute: JobID not present or malformed in request '%s', will use 0 instead." % requestName) gLogger.info("RemovalAgent.execute: Obtained request %s" % requestName) try: result = self.requestDBClient.getCurrentExecutionOrder( requestName, sourceServer) if result['OK']: currentOrder = result['Value'] else: gLogger.error('Can not get the request execution order') self.requestDBClient.updateRequest(requestName, requestString, sourceServer) return S_OK('Can not get the request execution order') oRequest = RequestContainer(request=requestString) ################################################ # Find the number of sub-requests from the request res = oRequest.getNumSubRequests('removal') if not res['OK']: errStr = "RemovalAgent.execute: Failed to obtain number of removal subrequests." gLogger.error(errStr, res['Message']) return S_OK() gLogger.info("RemovalAgent.execute: Found %s sub requests." % res['Value']) ################################################ # For all the sub-requests in the request modified = False for ind in range(res['Value']): gMonitor.addMark("Execute", 1) gLogger.info( "RemovalAgent.execute: Processing sub-request %s." 
% ind) subRequestAttributes = oRequest.getSubRequestAttributes( ind, 'removal')['Value'] subExecutionOrder = int(subRequestAttributes['ExecutionOrder']) subStatus = subRequestAttributes['Status'] if subStatus == 'Waiting' and subExecutionOrder <= currentOrder: subRequestFiles = oRequest.getSubRequestFiles( ind, 'removal')['Value'] operation = subRequestAttributes['Operation'] ################################################ # If the sub-request is a physical removal operation if operation == 'physicalRemoval': gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation) diracSEs = subRequestAttributes['TargetSE'].split(',') physicalFiles = [] pfnToLfn = {} for subRequestFile in subRequestFiles: if subRequestFile['Status'] == 'Waiting': pfn = str(subRequestFile['PFN']) lfn = str(subRequestFile['LFN']) pfnToLfn[pfn] = lfn physicalFiles.append(pfn) gMonitor.addMark('PhysicalRemovalAtt', len(physicalFiles)) failed = {} errMsg = {} for diracSE in diracSEs: res = self.replicaManager.removeStorageFile( physicalFiles, diracSE) if res['OK']: for pfn in res['Value']['Failed'].keys(): if not failed.has_key(pfn): failed[pfn] = {} failed[pfn][diracSE] = res['Value'][ 'Failed'][pfn] else: errMsg[diracSE] = res['Message'] for pfn in physicalFiles: if not failed.has_key(pfn): failed[pfn] = {} failed[pfn][diracSE] = 'Completely' # Now analyse the results failedPFNs = failed.keys() pfnsOK = [ pfn for pfn in physicalFiles if not pfn in failedPFNs ] gMonitor.addMark('PhysicalRemovalDone', len(pfnsOK)) for pfn in pfnsOK: gLogger.info( "RemovalAgent.execute: Successfully removed %s at %s" % (pfn, str(diracSEs))) res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', pfnToLfn[pfn], 'Status', 'Done') if not res['OK']: gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ('Done', pfnToLfn[pfn])) modified = True if failed: gMonitor.addMark('PhysicalRemovalFail', len(failedPFNs)) for pfn in failedPFNs: for diracSE in 
failed[pfn].keys(): if type(failed[pfn] [diracSE]) in StringTypes: if re.search( 'no such file or directory', failed[pfn][diracSE].lower()): gLogger.info( "RemovalAgent.execute: File did not exist.", pfn) res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', pfnToLfn[pfn], 'Status', 'Done') if not res['OK']: gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ('Done', pfnToLfn[pfn])) modified = True else: gLogger.info( "RemovalAgent.execute: Failed to remove file.", "%s at %s - %s" % (pfn, diracSE, failed[pfn][diracSE])) if errMsg: for diracSE in errMsg.keys(): errStr = "RemovalAgent.execute: Completely failed to remove replicas. At %s", diracSE gLogger.error(errStr, errMsg[diracSE]) ################################################ # If the sub-request is a physical removal operation elif operation == 'removeFile': gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation) lfns = [] for subRequestFile in subRequestFiles: if subRequestFile['Status'] == 'Waiting': lfn = str(subRequestFile['LFN']) lfns.append(lfn) gMonitor.addMark('RemoveFileAtt', len(lfns)) res = self.replicaManager.removeFile(lfns) if res['OK']: gMonitor.addMark( 'RemoveFileDone', len(res['Value']['Successful'].keys())) for lfn in res['Value']['Successful'].keys(): gLogger.info( "RemovalAgent.execute: Successfully removed %s." 
% lfn) result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done') if not result['OK']: gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ('Done', lfn)) modified = True gMonitor.addMark( 'RemoveFileFail', len(res['Value']['Failed'].keys())) for lfn in res['Value']['Failed'].keys(): if type(res['Value']['Failed'] [lfn]) in StringTypes: if re.search( 'no such file or directory', res['Value']['Failed'] [lfn].lower()): gLogger.info( "RemovalAgent.execute: File did not exist.", lfn) result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done') if not result['OK']: gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ('Done', lfn)) modified = True else: gLogger.info( "RemovalAgent.execute: Failed to remove file:", "%s %s" % (lfn, res['Value']['Failed'][lfn])) else: gMonitor.addMark('RemoveFileFail', len(lfns)) errStr = "RemovalAgent.execute: Completely failed to remove files files." gLogger.error(errStr, res['Message']) ################################################ # If the sub-request is a physical removal operation elif operation == 'replicaRemoval': gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." % operation) diracSEs = subRequestAttributes['TargetSE'].split(',') lfns = [] for subRequestFile in subRequestFiles: if subRequestFile['Status'] == 'Waiting': lfn = str(subRequestFile['LFN']) lfns.append(lfn) gMonitor.addMark('ReplicaRemovalAtt', len(lfns)) failed = {} errMsg = {} for diracSE in diracSEs: res = self.replicaManager.removeReplica( diracSE, lfns) if res['OK']: for lfn in res['Value']['Failed'].keys(): errorMessage = str( res['Value']['Failed'][lfn]) if errorMessage.find( 'Write access not permitted for this credential.' 
) != -1: if self.__getProxyAndRemoveReplica( diracSE, lfn): continue if errorMessage.find( 'seconds timeout for "__gfal_wrapper" call' ) != -1: self.timeOutCounter += 1 if not failed.has_key(lfn): failed[lfn] = {} failed[lfn][diracSE] = res['Value'][ 'Failed'][lfn] else: errMsg[diracSE] = res['Message'] for lfn in lfns: if not failed.has_key(lfn): failed[lfn] = {} failed[lfn][diracSE] = 'Completely' # Now analyse the results failedLFNs = failed.keys() lfnsOK = [lfn for lfn in lfns if not lfn in failedLFNs] gMonitor.addMark('ReplicaRemovalDone', len(lfnsOK)) for lfn in lfnsOK: gLogger.info( "RemovalAgent.execute: Successfully removed %s at %s" % (lfn, str(diracSEs))) res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done') if not res['OK']: gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ('Done', lfn)) modified = True if failed: gMonitor.addMark('PhysicalRemovalFail', len(failedLFNs)) for lfn in failedLFNs: for diracSE in failed[lfn].keys(): if type(failed[lfn] [diracSE]) in StringTypes: if re.search( 'no such file or directory', failed[lfn][diracSE].lower()): gLogger.info( "RemovalAgent.execute: File did not exist.", lfn) res = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done') if not res['OK']: gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ('Done', lfn)) modified = True else: gLogger.info( "RemovalAgent.execute: Failed to remove file.", "%s at %s - %s" % (lfn, diracSE, failed[lfn][diracSE])) if errMsg: for diracSE in errMsg.keys(): errStr = "RemovalAgent.execute: Completely failed to remove replicas. At %s", diracSE gLogger.error(errStr, errMsg[diracSE]) ################################################ # If the sub-request is a request to the online system to retransfer elif operation == 'reTransfer': gLogger.info( "RemovalAgent.execute: Attempting to execute %s sub-request." 
% operation) diracSE = subRequestAttributes['TargetSE'] for subRequestFile in subRequestFiles: if subRequestFile['Status'] == 'Waiting': pfn = str(subRequestFile['PFN']) lfn = str(subRequestFile['LFN']) res = self.replicaManager.onlineRetransfer( diracSE, pfn) if res['OK']: if res['Value']['Successful'].has_key(pfn): gLogger.info( "RemovalAgent.execute: Successfully requested retransfer of %s." % pfn) result = oRequest.setSubRequestFileAttributeValue( ind, 'removal', lfn, 'Status', 'Done') if not result['OK']: gLogger.error( "RemovalAgent.execute: Error setting status to %s for %s" % ('Done', lfn)) modified = True else: errStr = "RemovalAgent.execute: Failed to request retransfer." gLogger.error( errStr, "%s %s %s" % (pfn, diracSE, res['Value']['Failed'][pfn])) else: errStr = "RemovalAgent.execute: Completely failed to request retransfer." gLogger.error(errStr, res['Message']) else: gLogger.info( "RemovalAgent.execute: File already completed." ) ################################################ # If the sub-request is none of the above types else: gLogger.error( "RemovalAgent.execute: Operation not supported.", operation) ################################################ # Determine whether there are any active files if oRequest.isSubRequestEmpty(ind, 'removal')['Value']: oRequest.setSubRequestStatus(ind, 'removal', 'Done') gMonitor.addMark("Done", 1) ################################################ # If the sub-request is already in terminal state else: gLogger.info( "RemovalAgent.execute:", "Sub-request %s is status '%s' and not to be executed." 
% (ind, subRequestAttributes['Status'])) ################################################ # Generate the new request string after operation newrequestString = oRequest.toXML()['Value'] except: # if something fails return the original request back to the server res = self.requestDBClient.updateRequest(requestName, requestString, sourceServer) return S_OK() res = self.requestDBClient.updateRequest(requestName, newrequestString, sourceServer) if modified and jobID: result = self.finalizeRequest(requestName, jobID, sourceServer) return S_OK() def __getProxyAndRemoveReplica(self, diracSE, lfn): """ get a proxy from the owner of the file and try to remove it returns True if it succeeds, False otherwise """ result = self.replicaManager.getCatalogDirectoryMetadata( lfn, singleFile=True) if not result['OK']: gLogger.error("Could not get metadata info", result['Message']) return False ownerRole = result['Value']['OwnerRole'] ownerDN = result['Value']['OwnerDN'] if ownerRole[0] != "/": ownerRole = "/%s" % ownerRole userProxy = '' for ownerGroup in Registry.getGroupsWithVOMSAttribute(ownerRole): result = gProxyManager.downloadVOMSProxy( ownerDN, ownerGroup, limited=True, requiredVOMSAttribute=ownerRole) if not result['OK']: gLogger.verbose( 'Failed to retrieve voms proxy for %s : %s:' % (ownerDN, ownerRole), result['Message']) continue userProxy = result['Value'] gLogger.verbose("Got proxy for %s@%s [%s]" % (ownerDN, ownerGroup, ownerRole)) break if not userProxy: return False result = userProxy.dumpAllToFile() if not result['OK']: gLogger.verbose(result['Message']) return False upFile = result['Value'] prevProxyEnv = os.environ['X509_USER_PROXY'] os.environ['X509_USER_PROXY'] = upFile try: res = self.replicaManager.removeReplica(diracSE, lfn) if res['OK'] and lfn in res['Value']['Successful']: gLogger.verbose('Removed %s from %s' % (lfn, diracSE)) return True finally: os.environ['X509_USER_PROXY'] = prevProxyEnv os.unlink(upFile) return False def finalize(self): """ Called by 
the Agent framework to cleanly end execution. In this case this module will wait until all pending ThreadedJbos in the ThreadPool get executed """ self.threadPool.processAllResults() return S_OK()
class GatewayService( Service ):
  """ Gateway service: accepts client connections and forwards their RPC,
      file-transfer and messaging actions to the real target services,
      authenticating on the client's behalf with a delegated proxy.
  """

  # Fixed system/name under which this service registers and resolves its configuration
  GATEWAY_NAME = "Framework/Gateway"

  def __init__( self ):
    """ Initialize the base Service under the fixed gateway name and set up
        the delegated-proxy cache and the per-call transfer size limit.
    """
    Service.__init__( self, GatewayService.GATEWAY_NAME )
    # Cache of delegated client credentials, keyed by (DN, group, extraCredentials, isLimitedProxy)
    self.__delegatedCredentials = DictCache()
    # Maximum bytes accepted for a single forwarded file transfer (100 MiB)
    self.__transferBytesLimit = 1024 * 1024 * 100

  def initialize( self ):
    """ Build the service URL, start monitoring, create the daemonized thread
        pool and wire up the message broker / forwarder.

        :return: S_OK() on success, S_ERROR if the service URL cannot be built
    """
    # Build the URLs
    self._url = self._cfg.getURL()
    if not self._url:
      return S_ERROR( "Could not build service URL for %s" % GatewayService.GATEWAY_NAME )
    gLogger.verbose( "Service URL is %s" % self._url )
    # Discover Handler
    self._initMonitoring()
    # Thread pool sized from the service configuration (min 1 thread)
    self._threadPool = ThreadPool( 1,
                                   max( 0, self._cfg.getMaxThreads() ),
                                   self._cfg.getMaxWaitingPetitions() )
    self._threadPool.daemonize()
    self._msgBroker = MessageBroker( "%sMSB" % GatewayService.GATEWAY_NAME, threadPool = self._threadPool )
    # Messages are relayed as raw data, not deserialized into message objects
    self._msgBroker.useMessageObjects( False )
    getGlobalMessageBroker().useMessageObjects( False )
    self._msgForwarder = MessageForwarder( self._msgBroker )
    return S_OK()

  # Threaded process function
  def _processInThread( self, clientTransport ):
    """ Handle one client connection end-to-end in a worker thread:
        handshake, register the transport, receive and validate the action
        proposal, build the client init arguments (delegated credentials)
        and dispatch the proposal.

        :param clientTransport: freshly accepted client transport
    """
    # Handshake
    try:
      clientTransport.handshake()
    except:
      # failed handshake: drop the connection silently
      return
    # Add to the transport pool
    trid = self._transportPool.add( clientTransport )
    if not trid:
      return
    # Receive and check proposal
    result = self._receiveAndCheckProposal( trid )
    if not result[ 'OK' ]:
      self._transportPool.sendAndClose( trid, result )
      return
    proposalTuple = result[ 'Value' ]
    # Instantiate handler
    result = self.__getClientInitArgs( trid, proposalTuple )
    if not result[ 'OK' ]:
      self._transportPool.sendAndClose( trid, result )
      return
    clientInitArgs = result[ 'Value' ]
    # Execute the action
    result = self._processProposal( trid, proposalTuple, clientInitArgs )
    # Close the connection if required
    if result[ 'closeTransport' ]:
      self._transportPool.close( trid )
    return result

  def _receiveAndCheckProposal( self, trid ):
    """ Receive the action proposal tuple from the client transport and
        record any extra credentials it carries.

        :param trid: transport id in the transport pool
        :return: S_OK( proposalTuple ) or S_ERROR on a malformed proposal
    """
    clientTransport = self._transportPool.get( trid )
    # Get the peer credentials
    credDict = clientTransport.getConnectingCredentials()
    # Receive the action proposal
    retVal = clientTransport.receiveData( 1024 )
    if not retVal[ 'OK' ]:
      gLogger.error( "Invalid action proposal", "%s %s" % ( self._createIdentityString( credDict,
                                                                                        clientTransport ),
                                                            retVal[ 'Message' ] ) )
      return S_ERROR( "Invalid action proposal" )
    proposalTuple = retVal[ 'Value' ]
    gLogger.debug( "Received action from client", "/".join( list( proposalTuple[1] ) ) )
    # Check if there are extra credentials
    if proposalTuple[2]:
      clientTransport.setExtraCredentials( proposalTuple[2] )
    return S_OK( proposalTuple )

  def __getClientInitArgs( self, trid, proposalTuple ):
    """ Build the keyword arguments used to instantiate forwarding clients
        on behalf of the connecting user, obtaining (and caching) a
        delegated proxy when the client connected with an X509 chain.

        :param trid: transport id in the transport pool
        :param proposalTuple: action proposal received from the client
        :return: S_OK( clientInitArgs ) — S_OK() without value when the
                 client has no x509Chain credential
    """
    clientTransport = self._transportPool.get( trid )
    # Get the peer credentials
    credDict = clientTransport.getConnectingCredentials()
    if 'x509Chain' not in credDict:
      return S_OK()
    cKey = ( credDict[ 'DN' ],
             credDict.get( 'group', False ),
             credDict.get( 'extraCredentials', False ),
             credDict[ 'isLimitedProxy' ] )
    # Reuse a cached delegation if it is still valid for at least an hour
    dP = self.__delegatedCredentials.get( cKey, 3600 )
    idString = self._createIdentityString( credDict, clientTransport )
    if dP:
      gLogger.verbose( "Proxy for %s is cached" % idString )
      return S_OK( dP )
    result = self.__requestDelegation( clientTransport, credDict )
    if not result[ 'OK' ]:
      gLogger.warn( "Could not get proxy for %s: %s" % ( idString, result[ 'Message' ] ) )
      return result
    delChain = result[ 'Value' ]
    delegatedChain = delChain.dumpAllToString()[ 'Value' ]
    secsLeft = delChain.getRemainingSecs()[ 'Value' ] - 1
    clientInitArgs = { BaseClient.KW_SETUP : proposalTuple[0][1],
                       BaseClient.KW_TIMEOUT : 600,
                       BaseClient.KW_IGNORE_GATEWAYS : True,
                       BaseClient.KW_USE_CERTIFICATES : False,
                       BaseClient.KW_PROXY_STRING : delegatedChain
                     }
    if BaseClient.KW_EXTRA_CREDENTIALS in credDict:
      clientInitArgs[ BaseClient.KW_EXTRA_CREDENTIALS ] = credDict[ BaseClient.KW_EXTRA_CREDENTIALS ]
    gLogger.warn( "Got delegated proxy for %s: %s secs left" % ( idString, secsLeft ) )
    # Cache the init args for the remaining lifetime of the delegated proxy
    self.__delegatedCredentials.add( cKey, secsLeft, clientInitArgs )
    return S_OK( clientInitArgs )

  def __requestDelegation( self, clientTransport, credDict ):
    """ Run the proxy delegation round-trip with the client: generate a
        proxy request from the peer certificate, send it, and load the
        signed chain the client returns.

        :param clientTransport: client transport to exchange data with
        :param credDict: connecting credentials containing 'x509Chain'
        :return: S_OK( delegatedChain ) or S_ERROR (also sent to the client)
    """
    peerChain = credDict[ 'x509Chain' ]
    retVal = peerChain.getCertInChain()[ 'Value' ].generateProxyRequest()
    if not retVal[ 'OK' ]:
      return retVal
    delegationRequest = retVal[ 'Value' ]
    retVal = delegationRequest.dumpRequest()
    if not retVal[ 'OK' ]:
      retVal = S_ERROR( "Server Error: Can't generate delegation request" )
      clientTransport.sendData( retVal )
      return retVal
    gLogger.info( "Sending delegation request for %s" % delegationRequest.getSubjectDN()[ 'Value' ] )
    clientTransport.sendData( S_OK( { 'delegate' : retVal[ 'Value' ] } ) )
    delegatedCertChain = clientTransport.receiveData()
    delegatedChain = X509Chain( keyObj = delegationRequest.getPKey() )
    retVal = delegatedChain.loadChainFromString( delegatedCertChain )
    if not retVal[ 'OK' ]:
      retVal = S_ERROR( "Error in receiving delegated proxy: %s" % retVal[ 'Message' ] )
      clientTransport.sendData( retVal )
      return retVal
    return S_OK( delegatedChain )

  # Msg

  def _mbConnect( self, trid, clientInitArgs ):
    """ Messaging connect hook: nothing to do at the gateway. """
    return S_OK()

  def _mbReceivedMsg( self, cliTrid, msgObj ):
    """ Relay a message received from a client to the forwarder. """
    return self._msgForwarder.msgFromClient( cliTrid, msgObj )

  def _mbDisconnect( self, cliTrid ):
    """ Notify the forwarder that a messaging client disconnected. """
    self._msgForwarder.cliDisconnect( cliTrid )

  # Execute action
  def _executeAction( self, trid, proposalTuple, clientInitArgs ):
    """ Dispatch a validated proposal: forward FileTransfer, RPC or new
        Connection actions to the target service and send the result back.

        :param trid: transport id in the transport pool
        :param proposalTuple: ( (service, setup), (actionType, actionMethod), extraCreds )
        :param clientInitArgs: kwargs to instantiate the forwarding client with
    """
    clientTransport = self._transportPool.get( trid )
    credDict = clientTransport.getConnectingCredentials()
    targetService = proposalTuple[0][0]
    actionType = proposalTuple[1][0]
    actionMethod = proposalTuple[1][1]
    idString = self._createIdentityString( credDict, clientTransport )
    # Okay! Let's do the magic!
    retVal = clientTransport.receiveData()
    if not retVal[ 'OK' ]:
      gLogger.error( "Error while receiving file description", retVal[ 'Message' ] )
      clientTransport.sendData( S_ERROR( "Error while receiving file description: %s" % retVal[ 'Message' ] ) )
      return
    if actionType == "FileTransfer":
      gLogger.warn( "Received a file transfer action from %s" % idString )
      clientTransport.sendData( S_OK( "Accepted" ) )
      retVal = self.__forwardFileTransferCall( targetService, clientInitArgs,
                                               actionMethod, retVal[ 'Value' ], clientTransport )
    elif actionType == "RPC":
      gLogger.info( "Forwarding %s/%s action to %s for %s" % ( actionType, actionMethod, targetService, idString ) )
      retVal = self.__forwardRPCCall( targetService, clientInitArgs, actionMethod, retVal[ 'Value' ] )
    elif actionType == "Connection" and actionMethod == "new":
      gLogger.info( "Initiating a messaging connection to %s for %s" % ( targetService, idString ) )
      retVal = self._msgForwarder.addClient( trid, targetService, clientInitArgs, retVal[ 'Value' ] )
    else:
      gLogger.warn( "Received an invalid %s/%s action from %s" % ( actionType, actionMethod, idString ) )
      retVal = S_ERROR( "Unknown type of action (%s)" % actionType )
    # TODO: Send back the data?
    # Strip the rpcStub before returning the result to the client
    if 'rpcStub' in retVal:
      retVal.pop( 'rpcStub' )
    clientTransport.sendData( retVal )
    return retVal

  def __forwardRPCCall( self, targetService, clientInitArgs, method, params ):
    """ Forward an RPC call to the target service. Configuration data
        queries are answered directly from the local configuration cache.

        :param targetService: System/Component name of the target service
        :param clientInitArgs: kwargs for the forwarding RPCClient
        :param method: remote method name
        :param params: positional parameters for the remote method
    """
    if targetService == "Configuration/Server":
      if method == "getCompressedDataIfNewer":
        # Relay CS data directly
        serviceVersion = gConfigurationData.getVersion()
        retDict = { 'newestVersion' : serviceVersion }
        clientVersion = params[0]
        if clientVersion < serviceVersion:
          retDict[ 'data' ] = gConfigurationData.getCompressedData()
        return S_OK( retDict )
    # Default
    rpcClient = RPCClient( targetService, **clientInitArgs )
    methodObj = getattr( rpcClient, method )
    return methodObj( *params )

  def __forwardFileTransferCall( self, targetService, clientInitArgs, method, params, clientTransport ):
    """ Forward a file transfer between the client and the target service
        through a TransferRelay, enforcing the transfer size limit on
        uploads.

        :param targetService: System/Component name of the target service
        :param clientInitArgs: kwargs for the forwarding TransferRelay
        :param method: transfer method name (contains "ToClient" or "FromClient")
        :param params: transfer call parameters (params[2] is the requested size)
        :param clientTransport: transport connected to the client
    """
    transferRelay = TransferRelay( targetService, **clientInitArgs )
    transferRelay.setTransferLimit( self.__transferBytesLimit )
    cliFH = FileHelper( clientTransport )
    # Check file size
    if method.find( "ToClient" ) > -1:
      cliFH.setDirection( "send" )
    elif method.find( "FromClient" ) > -1:
      cliFH.setDirection( "receive" )
      if not self.__ftCheckMaxTransferSize( params[2] ):
        cliFH.markAsTransferred()
        return S_ERROR( "Transfer size is too big" )
    # Forward queries
    try:
      relayMethodObject = getattr( transferRelay, 'forward%s' % method )
    except:
      return S_ERROR( "Cannot forward unknown method %s" % method )
    result = relayMethodObject( cliFH, params )
    return result

  def __ftCheckMaxTransferSize( self, requestedTransferSize ):
    """ Check the requested transfer size against the configured limit.

        A zero/unset limit or an unknown requested size is accepted.

        :param requestedTransferSize: size in bytes requested by the client
        :return: bool — True when the transfer is allowed
    """
    if not self.__transferBytesLimit:
      return True
    if not requestedTransferSize:
      return True
    if requestedTransferSize <= self.__transferBytesLimit:
      return True
    return False
class FTSMonitorAgent( AgentModule ):
  """ .. class:: FTSMonitorAgent

  Monitor submitted FTS jobs: each agent cycle pushes one monitoring task
  per active FTS request into a thread pool; terminal requests have their
  file, channel and registration states propagated back to the TransferDB.
  """
  # # transfer DB handle
  transferDB = None
  # # thread pool
  threadPool = None
  # # min threads
  minThreads = 1
  # # max threads
  maxThreads = 10

  # # missing source regexp patterns (messages the FTS server sends when the source file is absent)
  missingSourceErrors = [
    re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] Failed" ),
    re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] No such file or directory" ),
    re.compile( r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] Failed" ),
    re.compile( r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] The requested file either does not exist" ),
    re.compile( r"TRANSFER error during TRANSFER phase: \[INVALID_PATH\] the server sent an error response: 500 500"\
                " Command failed. : open error: No such file or directory" ),
    re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[USER_ERROR\] source file doesnt exist" ) ]

  def initialize( self ):
    """ agent's initialisation

    Creates the TransferDB handle and a daemonized thread pool whose size is
    read from the MinThreads/MaxThreads options (normalised so min <= max).
    """
    self.transferDB = TransferDB()
    self.am_setOption( "shifterProxy", "DataManager" )
    self.minThreads = self.am_getOption( "MinThreads", self.minThreads )
    self.maxThreads = self.am_getOption( "MaxThreads", self.maxThreads )
    # guard against negative or swapped option values
    minmax = ( abs( self.minThreads ), abs( self.maxThreads ) )
    self.minThreads, self.maxThreads = min( minmax ), max( minmax )
    self.log.info( "ThreadPool min threads = %s" % self.minThreads )
    self.log.info( "ThreadPool max threads = %s" % self.maxThreads )
    self.threadPool = ThreadPool( self.minThreads, self.maxThreads )
    self.threadPool.daemonize()
    return S_OK()

  def execute( self ):
    """ push jobs to the thread pool

    One monitoring job is queued per FTS request; when the pool queue is
    full the agent retries the same job after a one-second sleep.
    """
    self.log.info( "Obtaining requests to monitor" )
    res = self.transferDB.getFTSReq()
    if not res["OK"]:
      self.log.error( "Failed to get FTS requests", res['Message'] )
      return res
    if not res["Value"]:
      self.log.info( "No FTS requests found to monitor." )
      return S_OK()
    ftsReqs = res["Value"]
    self.log.info( "Found %s FTS jobs" % len( ftsReqs ) )
    i = 1
    for ftsJob in ftsReqs:
      while True:
        self.log.debug( "submitting FTS Job %s FTSReqID=%s to monitor" % ( i, ftsJob["FTSReqID"] ) )
        ret = self.threadPool.generateJobAndQueueIt( self.monitorTransfer, args = ( ftsJob, ), )
        if ret["OK"]:
          i += 1
          break
        # # sleep 1 second to proceed
        time.sleep( 1 )
    self.threadPool.processAllResults()
    return S_OK()

  def ftsJobExpired( self, ftsReqID, channelID ):
    """ clean up when FTS job had expired on the server side

    :param int ftsReqID: FTSReq.FTSReqID
    :param int channelID: FTSReq.ChannelID
    """
    log = gLogger.getSubLogger( "@%s" % str( ftsReqID ) )
    fileIDs = self.transferDB.getFTSReqFileIDs( ftsReqID )
    if not fileIDs["OK"]:
      log.error( "Unable to retrieve FileIDs associated to %s request" % ftsReqID )
      return fileIDs
    fileIDs = fileIDs["Value"]
    # # update FileToFTS table, this is just a clean up, no worry if somethings goes wrong
    for fileID in fileIDs:
      fileStatus = self.transferDB.setFileToFTSFileAttribute( ftsReqID, fileID, "Status", "Failed" )
      if not fileStatus["OK"]:
        log.error( "Unable to set FileToFTS status to 'Failed' for FileID %s: %s" % ( fileID,
                                                                                      fileStatus["Message"] ) )
      failReason = self.transferDB.setFileToFTSFileAttribute( ftsReqID, fileID, "Reason", "FTS job expired on server" )
      if not failReason["OK"]:
        log.error( "Unable to set FileToFTS reason for FileID %s: %s" % ( fileID,
                                                                          failReason["Message"] ) )
    # # update Channel table
    resetChannels = self.transferDB.resetFileChannelStatus( channelID, fileIDs )
    if not resetChannels["OK"]:
      log.error( "Failed to reset Channel table for files to retry" )
      return resetChannels
    # # update FTSReq table
    log.info( "Setting FTS request status to 'Finished'" )
    ftsReqStatus = self.transferDB.setFTSReqStatus( ftsReqID, "Finished" )
    if not ftsReqStatus["OK"]:
      log.error( "Failed update FTS Request status", ftsReqStatus["Message"] )
      return ftsReqStatus
    # # if we land here, everything should be OK
    return S_OK()

  def monitorTransfer( self, ftsReqDict ):
    """ monitors transfer obtained from TransferDB

    Performs a summary query against the FTS server, records the progress
    percentage, and hands terminal requests over to :meth:`terminalRequest`.

    :param dict ftsReqDict: FTS job dictionary
    """
    ftsReqID = ftsReqDict.get( "FTSReqID" )
    ftsGUID = ftsReqDict.get( "FTSGuid" )
    ftsServer = ftsReqDict.get( "FTSServer" )
    channelID = ftsReqDict.get( "ChannelID" )
    sourceSE = ftsReqDict.get( "SourceSE" )
    targetSE = ftsReqDict.get( "TargetSE" )

    oFTSRequest = FTSRequest()
    oFTSRequest.setFTSServer( ftsServer )
    oFTSRequest.setFTSGUID( ftsGUID )
    oFTSRequest.setSourceSE( sourceSE )
    oFTSRequest.setTargetSE( targetSE )

    log = gLogger.getSubLogger( "@%s" % str( ftsReqID ) )

    #########################################################################
    # Perform summary update of the FTS Request and update FTSReq entries.
    log.info( "Perform summary update of the FTS Request" )
    infoStr = [ "glite-transfer-status -s %s -l %s" % ( ftsServer, ftsGUID ) ]
    infoStr.append( "FTS GUID: %s" % ftsGUID )
    infoStr.append( "FTS Server: %s" % ftsServer )
    log.info( "\n".join( infoStr ) )
    res = oFTSRequest.summary()
    self.transferDB.setFTSReqLastMonitor( ftsReqID )
    if not res["OK"]:
      log.error( "Failed to update the FTS request summary", res["Message"] )
      if "getTransferJobSummary2: Not authorised to query request" in res["Message"]:
        # the job vanished server-side: clean up our own records
        log.error( "FTS job is not existing at the FTS server anymore, will clean it up on TransferDB side" )
        cleanUp = self.ftsJobExpired( ftsReqID, channelID )
        if not cleanUp["OK"]:
          log.error( cleanUp["Message"] )
        return cleanUp
      return res

    res = oFTSRequest.dumpSummary()
    if not res['OK']:
      log.error( "Failed to get FTS request summary", res["Message"] )
      return res
    log.info( res['Value'] )
    res = oFTSRequest.getPercentageComplete()
    if not res['OK']:
      log.error( "Failed to get FTS percentage complete", res["Message"] )
      return res
    log.info( 'FTS Request found to be %.1f percent complete' % res["Value"] )
    self.transferDB.setFTSReqAttribute( ftsReqID, "PercentageComplete", res["Value"] )
    self.transferDB.addLoggingEvent( ftsReqID, res["Value"] )

    #########################################################################
    # Update the information in the TransferDB if the transfer is terminal.
    res = oFTSRequest.isRequestTerminal()
    if not res["OK"]:
      log.error( "Failed to determine whether FTS request terminal", res["Message"] )
      return res
    if not res["Value"]:
      return S_OK()
    # # request is terminal
    return self.terminalRequest( oFTSRequest, ftsReqID, channelID, sourceSE )

  def terminalRequest( self, oFTSRequest, ftsReqID, channelID, sourceSE ):
    """ process terminal FTS job

    Performs per-file monitoring, classifies files as completed / to retry /
    to fail, and updates the FileToFTS, Channel and FileToCat tables.

    :param FTSRequest oFTSRequest: FTSRequest instance
    :param int ftsReqID: FTSReq.FTSReqID
    :param int channelID: FTSReq.ChannelID
    :param str sourceSE: FTSReq.SourceSE
    """
    log = gLogger.getSubLogger( "@%s" % ftsReqID )

    log.info( "FTS Request found to be terminal, updating file states" )
    #########################################################################
    # Get the LFNS associated to the FTS request
    log.info( "Obtaining the LFNs associated to this request" )
    res = self.transferDB.getFTSReqLFNs( ftsReqID, channelID, sourceSE )
    if not res["OK"]:
      log.error( "Failed to obtain FTS request LFNs", res['Message'] )
      return res
    files = res["Value"]
    if not files:
      log.error( "No files present for transfer" )
      return S_ERROR( "No files were found in the DB" )

    lfns = files.keys()
    log.debug( "Obtained %s files" % len( lfns ) )
    for lfn in lfns:
      oFTSRequest.setLFN( lfn )

    res = oFTSRequest.monitor()
    if not res["OK"]:
      log.error( "Failed to perform detailed monitoring of FTS request", res["Message"] )
      return res
    res = oFTSRequest.getFailed()
    if not res["OK"]:
      log.error( "Failed to obtained failed files for FTS request", res["Message"] )
      return res
    failedFiles = res["Value"]
    res = oFTSRequest.getDone()
    if not res["OK"]:
      log.error( "Failed to obtained successful files for FTS request", res["Message"] )
      return res
    completedFiles = res["Value"]

    # An LFN can be included more than once if it was entered into more than one Request.
    # FTS will only do the transfer once. We need to identify all FileIDs
    res = self.transferDB.getFTSReqFileIDs( ftsReqID )
    if not res["OK"]:
      log.error( "Failed to get FileIDs associated to FTS Request", res["Message"] )
      return res
    fileIDs = res["Value"]
    res = self.transferDB.getAttributesForFilesList( fileIDs, ["LFN"] )
    if not res["OK"]:
      log.error( "Failed to get LFNs associated to FTS Request", res["Message"] )
      return res
    fileIDDict = res["Value"]

    fileToFTSUpdates = []
    completedFileIDs = []
    filesToRetry = []
    filesToFail = []

    for fileID, fileDict in fileIDDict.items():
      lfn = fileDict['LFN']
      if lfn in completedFiles:
        completedFileIDs.append( fileID )
        transferTime = 0
        res = oFTSRequest.getTransferTime( lfn )
        if res["OK"]:
          transferTime = res["Value"]
        fileToFTSUpdates.append( ( fileID, "Completed", "", 0, transferTime ) )

      if lfn in failedFiles:
        failReason = ""
        res = oFTSRequest.getFailReason( lfn )
        if res["OK"]:
          failReason = res["Value"]
        if "Source file/user checksum mismatch" in failReason:
          # corrupted source: no point in retrying
          filesToFail.append( fileID )
          continue
        if self.missingSource( failReason ):
          log.error( "The source SURL does not exist.", "%s %s" % ( lfn, oFTSRequest.getSourceSURL( lfn ) ) )
          filesToFail.append( fileID )
        else:
          filesToRetry.append( fileID )
        log.error( "Failed to replicate file on channel.", "%s %s" % ( channelID, failReason ) )
        fileToFTSUpdates.append( ( fileID, "Failed", failReason, 0, 0 ) )

    # # update TransferDB.FileToFTS table
    updateFileToFTS = self.updateFileToFTS( ftsReqID, channelID,
                                            filesToRetry, filesToFail,
                                            completedFileIDs, fileToFTSUpdates )

    if updateFileToFTS["OK"] and updateFileToFTS["Value"]:
      res = oFTSRequest.finalize()
      if not res["OK"]:
        log.error( "Failed to perform the finalization for the FTS request", res["Message"] )
        return res

      log.info( 'Adding logging event for FTS request' )
      # Now set the FTSReq status to terminal so that it is not monitored again
      res = self.transferDB.addLoggingEvent( ftsReqID, 'Finished' )
      if not res['OK']:
        log.error( 'Failed to add logging event for FTS Request', res['Message'] )

      # update TransferDB.FileToCat table
      updateFileToCat = self.updateFileToCat( oFTSRequest, channelID, fileIDDict, completedFiles, filesToFail )
      if not updateFileToCat["OK"]:
        log.error( updateFileToCat["Message"] )

      log.debug( "Updating FTS request status" )
      res = self.transferDB.setFTSReqStatus( ftsReqID, 'Finished' )
      if not res['OK']:
        log.error( 'Failed update FTS Request status', res['Message'] )
    return S_OK()

  def updateFileToFTS( self, ftsReqID, channelID, filesToRetry, filesToFail, completedFileIDs, fileToFTSUpdates ):
    """ update TransferDB.FileToFTS table for finished request

    :param int ftsReqID: FTSReq.FTSReqID
    :param int channelID: FTSReq.ChannelID
    :param list filesToRetry: FileIDs to retry
    :param list filesToFail: FileIDs for failed files
    :param list completedFileIDs: files completed
    :param list fileToFTSUpdates: per-file (fileID, status, reason, retries, transferTime) tuples
    :return: S_OK( allUpdated ) — allUpdated is False when any update failed
    """
    log = gLogger.getSubLogger( "@%s" % ftsReqID )

    allUpdated = True

    res = self.transferDB.resetFileChannelStatus( channelID, filesToRetry ) if filesToRetry else S_OK()
    if not res["OK"]:
      log.error( "Failed to update the Channel table for file to retry.", res["Message"] )
      allUpdated = False

    for fileID in filesToFail:
      log.info( "Updating the Channel table for files to reschedule" )
      res = self.transferDB.setFileToReschedule( fileID )
      if not res["OK"]:
        log.error( "Failed to update Channel table for failed files.", res["Message"] )
        allUpdated = False
      elif res["Value"] == "max reschedule attempt reached":
        # BUGFIX: the format string had no %s placeholder, so the '%' operator
        # raised TypeError instead of logging the message
        log.error( "setting Channel status to 'Failed' : %s" % res["Value"] )
        res = self.transferDB.setFileChannelStatus( channelID, fileID, 'Failed' )
        if not res["OK"]:
          log.error( "Failed to update Channel table for failed files.", res["Message"] )
          allUpdated = False

    if completedFileIDs:
      res = self.transferDB.updateCompletedChannelStatus( channelID, completedFileIDs )
      if not res["OK"]:
        log.error( "Failed to update the Channel table for successful files.", res["Message"] )
        allUpdated = False
      res = self.transferDB.updateAncestorChannelStatus( channelID, completedFileIDs )
      if not res["OK"]:
        log.error( 'Failed to update the Channel table for ancestors of successful files.', res['Message'] )
        allUpdated = False

    if fileToFTSUpdates:
      res = self.transferDB.setFileToFTSFileAttributes( ftsReqID, channelID, fileToFTSUpdates )
      if not res["OK"]:
        log.error( "Failed to update the FileToFTS table for files.", res["Message"] )
        allUpdated = False

    return S_OK( allUpdated )

  def updateFileToCat( self, oFTSRequest, channelID, fileIDDict, completedFiles, filesToFail ):
    """ update TransferDB.FileToCat table for finished request

    :param FTSRequest oFTSRequest: FTSRequest instance
    :param int channelID: FTSReq.ChannelID
    :param dict fileIDDict: fileIDs dictionary
    :param list completedFiles: LFNs transferred successfully
    :param list filesToFail: FileIDs of permanently failed files
    """
    res = oFTSRequest.getFailedRegistrations()
    failedRegistrations = res["Value"]
    regFailedFileIDs = []
    regDoneFileIDs = []
    regForgetFileIDs = []

    for fileID, fileDict in fileIDDict.items():
      lfn = fileDict['LFN']

      if lfn in failedRegistrations:
        regFailedFileIDs.append( fileID )
        # if the LFN appears more than once, FileToCat needs to be reset only once
        del failedRegistrations[lfn]
      elif lfn in completedFiles:
        regDoneFileIDs.append( fileID )
      elif fileID in filesToFail:
        regForgetFileIDs.append( fileID )

    res = self.transferDB.setRegistrationWaiting( channelID, regFailedFileIDs ) if regFailedFileIDs else S_OK()
    if not res["OK"]:
      res["Message"] = "Failed to reset entries in FileToCat: %s" % res["Message"]
      return res

    res = self.transferDB.setRegistrationDone( channelID, regDoneFileIDs ) if regDoneFileIDs else S_OK()
    if not res["OK"]:
      res["Message"] = "Failed to set entries Done in FileToCat: %s" % res["Message"]
      return res

    # This entries could also be set to Failed, but currently there is no method to do so.
    res = self.transferDB.setRegistrationDone( channelID, regForgetFileIDs ) if regForgetFileIDs else S_OK()
    if not res["OK"]:
      res["Message"] = "Failed to set entries Done in FileToCat: %s" % res["Message"]
      return res

    return S_OK()

  @classmethod
  def missingSource( cls, failReason ):
    """ check if message sent by FTS server is concerning missing source file

    :param str failReason: message sent by FTS server
    :return: 1 when a missing-source pattern matches, 0 otherwise
    """
    for error in cls.missingSourceErrors:
      if error.search( failReason ):
        return 1
    return 0
class Service(object):
    """ Host a DIRAC request handler: build the service URL, load and
    initialize the handler, then accept incoming connections and dispatch
    RPC / FileTransfer / Message actions to it, enforcing per-method
    authorization and reporting activity to monitoring.
    """

    # Action type -> handler-method prefix exporting it.
    # 'Connection' is a meta action: its auth rules derive from 'Message'.
    SVC_VALID_ACTIONS = {
        'RPC': 'export',
        'FileTransfer': 'transfer',
        'Message': 'msg',
        'Connection': 'Message'
    }
    # Shared client recording every authorization decision
    SVC_SECLOG_CLIENT = SecurityLogClient()

    def __init__(self, serviceData):
        """
        Init the variables for the service

        :param serviceData: dict with modName, standalone, loadName, moduleObj, classObj. e.g.:
          {'modName': 'Framework/serviceName',
           'standalone': True,
           'loadName': 'Framework/serviceName',
           'moduleObj': <module 'serviceNameHandler' from '/home/DIRAC/FrameworkSystem/Service/serviceNameHandler.pyo'>,
           'classObj': <class 'serviceNameHandler.serviceHandler'>}

          Standalone is true if there is only one service started.
          If it's false, every service is linked to a different MonitoringClient.
        """
        self._svcData = serviceData
        self._name = serviceData['modName']
        self._startTime = Time.dateTime()
        self._validNames = [serviceData['modName']]
        if serviceData['loadName'] not in self._validNames:
            self._validNames.append(serviceData['loadName'])
        self._cfg = ServiceConfiguration(list(self._validNames))
        if serviceData['standalone']:
            self._monitor = gMonitor
        else:
            self._monitor = MonitoringClient()
        self.__monitorLastStatsUpdate = time.time()
        self._stats = {'queries': 0, 'connections': 0}
        self._authMgr = AuthManager(
            "%s/Authorization" %
            PathFinder.getServiceSection(serviceData['loadName']))
        self._transportPool = getGlobalTransportPool()
        self.__cloneId = 0
        # Highest socket fd seen since the last monitoring report
        self.__maxFD = 0

    def setCloneProcessId(self, cloneId):
        """ Tag this (forked) service clone so its monitoring data is
        reported under a distinct component name. """
        self.__cloneId = cloneId
        self._monitor.setComponentName("%s-Clone:%s" % (self._name, cloneId))

    def _isMetaAction(self, action):
        """ Return the action type an alias points to (e.g. 'Connection' ->
        'Message'), or False when *action* is a real action type. """
        referedAction = Service.SVC_VALID_ACTIONS[action]
        if referedAction in Service.SVC_VALID_ACTIONS:
            return referedAction
        return False

    def initialize(self):
        """ Build the service URL, load and initialize the handler, create the
        thread pool and message broker, and register the exported actions.

        :return: S_OK / S_ERROR
        """
        # Build the URLs
        self._url = self._cfg.getURL()
        if not self._url:
            return S_ERROR("Could not build service URL for %s" % self._name)
        gLogger.verbose("Service URL is %s" % self._url)
        # Load handler
        result = self._loadHandlerInit()
        if not result['OK']:
            return result
        self._handler = result['Value']
        # Initialize lock manager
        self._lockManager = LockManager(self._cfg.getMaxWaitingPetitions())
        self._initMonitoring()
        # TODO: remove ThreadPool
        if useThreadPoolExecutor:
            self._threadPool = ThreadPoolExecutor(
                max(0, self._cfg.getMaxThreads()))
        else:
            self._threadPool = ThreadPool(max(1, self._cfg.getMinThreads()),
                                          max(0, self._cfg.getMaxThreads()),
                                          self._cfg.getMaxWaitingPetitions())
            self._threadPool.daemonize()
        self._msgBroker = MessageBroker("%sMSB" % self._name,
                                       threadPool=self._threadPool)
        # Create static dict
        self._serviceInfoDict = {
            'serviceName': self._name,
            'serviceSectionPath': PathFinder.getServiceSection(self._name),
            'URL': self._cfg.getURL(),
            'messageSender': MessageSender(self._name, self._msgBroker),
            'validNames': self._validNames,
            'csPaths': [
                PathFinder.getServiceSection(svcName)
                for svcName in self._validNames
            ]
        }
        # Call static initialization function
        try:
            self._handler['class']._rh__initializeClass(
                dict(self._serviceInfoDict), self._lockManager,
                self._msgBroker, self._monitor)
            if self._handler['init']:
                for initFunc in self._handler['init']:
                    gLogger.verbose("Executing initialization function")
                    try:
                        result = initFunc(dict(self._serviceInfoDict))
                    except Exception as excp:
                        gLogger.exception(
                            "Exception while calling initialization function",
                            lException=excp)
                        return S_ERROR(
                            "Exception while calling initialization function: %s"
                            % str(excp))
                    if not isReturnStructure(result):
                        return S_ERROR(
                            "Service initialization function %s must return S_OK/S_ERROR"
                            % initFunc)
                    if not result['OK']:
                        return S_ERROR("Error while initializing %s: %s" %
                                       (self._name, result['Message']))
        except Exception as e:
            errMsg = "Exception while initializing %s" % self._name
            gLogger.exception(e)
            gLogger.exception(errMsg)
            return S_ERROR(errMsg)

        # Load actions after the handler has initialized itself
        result = self._loadActions()
        if not result['OK']:
            return result
        self._actions = result['Value']

        gThreadScheduler.addPeriodicTask(30, self.__reportThreadPoolContents)
        return S_OK()

    def __searchInitFunctions(self, handlerClass, currentClass=None):
        """ Recursively collect 'initializeHandler' class methods from
        *handlerClass* and all its ancestors, ancestors first. """
        if not currentClass:
            currentClass = handlerClass
        initFuncs = []
        ancestorHasInit = False
        for ancestor in currentClass.__bases__:
            initFuncs += self.__searchInitFunctions(handlerClass, ancestor)
            if 'initializeHandler' in dir(ancestor):
                ancestorHasInit = True
        if ancestorHasInit:
            initFuncs.append(
                super(currentClass, handlerClass).initializeHandler)
        if currentClass == handlerClass and 'initializeHandler' in dir(
                handlerClass):
            initFuncs.append(handlerClass.initializeHandler)
        return initFuncs

    def _loadHandlerInit(self):
        """ Gather the handler class and every initialization method attached
        to it: the class-level initializeHandler chain plus the optional
        module-level initialize<HandlerName> function.

        :return: S_OK with dict holding name / module / class / init
        """
        handlerClass = self._svcData['classObj']
        handlerName = handlerClass.__name__
        handlerInitMethods = self.__searchInitFunctions(handlerClass)
        try:
            handlerInitMethods.append(
                getattr(self._svcData['moduleObj'],
                        "initialize%s" % handlerName))
        except AttributeError:
            gLogger.verbose(
                "Not found global initialization function for service")
        if handlerInitMethods:
            gLogger.info("Found %s initialization methods" %
                         len(handlerInitMethods))
        handlerInfo = {}
        handlerInfo["name"] = handlerName
        handlerInfo["module"] = self._svcData['moduleObj']
        handlerInfo["class"] = handlerClass
        handlerInfo["init"] = handlerInitMethods
        return S_OK(handlerInfo)

    def _loadActions(self):
        """ Discover the methods exported by the handler (export_*,
        transfer_*, msg_*), their type-check definitions and auth rules, and
        derive the auth rules of meta actions from the action they refer to.

        :return: S_OK with {'methods': ..., 'auth': ..., 'types': ...}
        """
        handlerClass = self._handler['class']
        authRules = {}
        typeCheck = {}
        methodsList = {}
        for actionType in Service.SVC_VALID_ACTIONS:
            if self._isMetaAction(actionType):
                continue
            authRules[actionType] = {}
            typeCheck[actionType] = {}
            methodsList[actionType] = []
        handlerAttributeList = dir(handlerClass)
        for actionType in Service.SVC_VALID_ACTIONS:
            if self._isMetaAction(actionType):
                continue
            methodPrefix = '%s_' % Service.SVC_VALID_ACTIONS[actionType]
            for attribute in handlerAttributeList:
                if attribute.find(methodPrefix) != 0:
                    continue
                exportedName = attribute[len(methodPrefix):]
                methodsList[actionType].append(exportedName)
                gLogger.verbose("+ Found %s method %s" %
                                (actionType, exportedName))
                # Create lock for method
                self._lockManager.createLock(
                    "%s/%s" % (actionType, exportedName),
                    self._cfg.getMaxThreadsForMethod(actionType, exportedName))
                # Look for type and auth rules
                if actionType == 'RPC':
                    typeAttr = "types_%s" % exportedName
                    authAttr = "auth_%s" % exportedName
                else:
                    typeAttr = "types_%s_%s" % (
                        Service.SVC_VALID_ACTIONS[actionType], exportedName)
                    authAttr = "auth_%s_%s" % (
                        Service.SVC_VALID_ACTIONS[actionType], exportedName)
                if typeAttr in handlerAttributeList:
                    obj = getattr(handlerClass, typeAttr)
                    gLogger.verbose("|- Found type definition %s: %s" %
                                    (typeAttr, str(obj)))
                    typeCheck[actionType][exportedName] = obj
                if authAttr in handlerAttributeList:
                    obj = getattr(handlerClass, authAttr)
                    gLogger.verbose("|- Found auth rules %s: %s" %
                                    (authAttr, str(obj)))
                    authRules[actionType][exportedName] = obj
        for actionType in Service.SVC_VALID_ACTIONS:
            referedAction = self._isMetaAction(actionType)
            if not referedAction:
                continue
            gLogger.verbose("Action %s is a meta action for %s" %
                            (actionType, referedAction))
            authRules[actionType] = []
            for method in authRules[referedAction]:
                for prop in authRules[referedAction][method]:
                    if prop not in authRules[actionType]:
                        authRules[actionType].append(prop)
            gLogger.verbose("Meta action %s props are %s" %
                            (actionType, authRules[actionType]))
        return S_OK({
            'methods': methodsList,
            'auth': authRules,
            'types': typeCheck
        })

    def _initMonitoring(self):
        """ Register this service's activities and static metadata with the
        monitoring system. """
        # Init extra bits of monitoring
        self._monitor.setComponentType(MonitoringClient.COMPONENT_SERVICE)
        self._monitor.setComponentName(self._name)
        self._monitor.setComponentLocation(self._cfg.getURL())
        self._monitor.initialize()
        self._monitor.registerActivity("Connections", "Connections received",
                                       "Framework", "connections",
                                       MonitoringClient.OP_RATE)
        self._monitor.registerActivity("Queries", "Queries served",
                                       "Framework", "queries",
                                       MonitoringClient.OP_RATE)
        self._monitor.registerActivity('CPU', "CPU Usage", 'Framework',
                                       "CPU,%", MonitoringClient.OP_MEAN, 600)
        self._monitor.registerActivity('MEM', "Memory Usage", 'Framework',
                                       'Memory,MB', MonitoringClient.OP_MEAN,
                                       600)
        self._monitor.registerActivity('PendingQueries', "Pending queries",
                                       'Framework', 'queries',
                                       MonitoringClient.OP_MEAN)
        self._monitor.registerActivity('ActiveQueries', "Active queries",
                                       'Framework', 'threads',
                                       MonitoringClient.OP_MEAN)
        self._monitor.registerActivity('RunningThreads', "Running threads",
                                       'Framework', 'threads',
                                       MonitoringClient.OP_MEAN)
        self._monitor.registerActivity('MaxFD', "Max File Descriptors",
                                       'Framework', 'fd',
                                       MonitoringClient.OP_MEAN)
        self._monitor.setComponentExtraParam('DIRACVersion', DIRAC.version)
        self._monitor.setComponentExtraParam('platform', DIRAC.getPlatform())
        self._monitor.setComponentExtraParam('startTime', Time.dateTime())
        for prop in (("__RCSID__", "version"), ("__doc__", "description")):
            try:
                value = getattr(self._handler['module'], prop[0])
            except Exception as e:
                gLogger.exception(e)
                gLogger.error("Missing property", prop[0])
                value = 'unset'
            self._monitor.setComponentExtraParam(prop[1], value)
        for secondaryName in self._cfg.registerAlsoAs():
            gLogger.info("Registering %s also as %s" %
                         (self._name, secondaryName))
            self._validNames.append(secondaryName)
        return S_OK()

    def __reportThreadPoolContents(self):
        """ Periodic task: push thread-pool load figures to monitoring. """
        # TODO: remove later
        if useThreadPoolExecutor:
            pendingQueries = self._threadPool._work_queue.qsize()
            activeQueries = len(self._threadPool._threads)
        else:
            pendingQueries = self._threadPool.pendingJobs()
            activeQueries = self._threadPool.numWorkingThreads()
        self._monitor.addMark('PendingQueries', pendingQueries)
        self._monitor.addMark('ActiveQueries', activeQueries)
        self._monitor.addMark('RunningThreads', threading.activeCount())
        self._monitor.addMark('MaxFD', self.__maxFD)
        self.__maxFD = 0

    def getConfig(self):
        """ Return the ServiceConfiguration object. """
        return self._cfg

    # End of initialization functions

    def handleConnection(self, clientTransport):
        """
        This method may be called by ServiceReactor.
        The method stacks the opened connection in a queue; another thread
        reads this queue and handles the connection.

        :param clientTransport: Object which describes the opened connection
                                (PlainTransport or SSLTransport)
        """
        self._stats['connections'] += 1
        self._monitor.setComponentExtraParam('queries',
                                             self._stats['connections'])
        # TODO: remove later
        if useThreadPoolExecutor:
            self._threadPool.submit(self._processInThread, clientTransport)
        else:
            self._threadPool.generateJobAndQueueIt(self._processInThread,
                                                   args=(clientTransport, ))

    # Threaded process function
    def _processInThread(self, clientTransport):
        """
        This method handles a RPC, FileTransfer or Connection.
        Connection may be opened via ServiceReactor.__acceptIncomingConnection

        - Do the SSL/TLS Handshake (if dips is used) and extract credentials
        - Get the action called by the client
        - Check if the client is authorized to perform the action
            - If not, connection is closed
        - Instantiate the RequestHandler (RequestHandler contains all
          callable methods)

        (The following is not directly in this method but it describes what
        happens at #Execute the action)

        - Notify the client we're ready to execute the action (via
          _processProposal) and call RequestHandler._rh_executeAction()
        - Receive arguments/file/something else (depending on action) in the
          RequestHandler
        - Execute the action asked by the client

        :param clientTransport: Object which describes the opened connection
                                (SSLTransport or PlainTransport)
        :return: S_OK with "closeTransport" a boolean to indicate if the
                 connection has to be closed, e.g. after RPC,
                 closeTransport=True
        """
        self.__maxFD = max(self.__maxFD, clientTransport.oSocket.fileno())
        self._lockManager.lockGlobal()
        try:
            monReport = self.__startReportToMonitoring()
        except Exception:
            monReport = False
        try:
            # Handshake
            try:
                result = clientTransport.handshake()
                if not result['OK']:
                    clientTransport.close()
                    return
            except BaseException:
                return
            # Add to the transport pool
            trid = self._transportPool.add(clientTransport)
            if not trid:
                return
            # Receive and check proposal
            result = self._receiveAndCheckProposal(trid)
            if not result['OK']:
                self._transportPool.sendAndClose(trid, result)
                return
            proposalTuple = result['Value']
            # Instantiate handler
            result = self._instantiateHandler(trid, proposalTuple)
            if not result['OK']:
                self._transportPool.sendAndClose(trid, result)
                return
            handlerObj = result['Value']
            # Execute the action
            result = self._processProposal(trid, proposalTuple, handlerObj)
            # Close the connection if required
            if result['closeTransport'] or not result['OK']:
                if not result['OK']:
                    gLogger.error("Error processing proposal",
                                  result['Message'])
                self._transportPool.close(trid)
            return result
        finally:
            self._lockManager.unlockGlobal()
            if monReport:
                self.__endReportToMonitoring(*monReport)

    def _createIdentityString(self, credDict, clientTransport=None):
        """ Build a human-readable "[user:group](DN)" string out of the
        credentials dictionary, for logging. """
        if 'username' in credDict:
            if 'group' in credDict:
                identity = "[%s:%s]" % (credDict['username'],
                                        credDict['group'])
            else:
                identity = "[%s:unknown]" % credDict['username']
        else:
            identity = 'unknown'
        if clientTransport:
            # NOTE(review): addr is formatted here but never appended to the
            # returned identity -- looks like a leftover; confirm intended
            # log format before changing.
            addr = clientTransport.getRemoteAddress()
            if addr:
                addr = "{%s:%s}" % (addr[0], addr[1])
        if 'DN' in credDict:
            identity += "(%s)" % credDict['DN']
        return identity

    @staticmethod
    def _deserializeProposalTuple(serializedProposal):
        """ We receive the proposalTuple as a list.
        Turn it into a tuple again
        """
        proposalTuple = tuple(
            tuple(x) if isinstance(x, list) else x for x in serializedProposal)
        return proposalTuple

    def _receiveAndCheckProposal(self, trid):
        """ Receive the action proposal from the client, validate the
        requested service name and action type, and authorize it.

        :param int trid: transport ID
        :return: S_OK with the proposal tuple, or S_ERROR
        """
        clientTransport = self._transportPool.get(trid)
        # Get the peer credentials
        credDict = clientTransport.getConnectingCredentials()
        # Receive the action proposal
        retVal = clientTransport.receiveData(1024)
        if not retVal['OK']:
            gLogger.error(
                "Invalid action proposal",
                "%s %s" % (self._createIdentityString(
                    credDict, clientTransport), retVal['Message']))
            return S_ERROR("Invalid action proposal")
        proposalTuple = Service._deserializeProposalTuple(retVal['Value'])
        gLogger.debug("Received action from client",
                      "/".join(list(proposalTuple[1])))
        # Check if there are extra credentials
        if proposalTuple[2]:
            clientTransport.setExtraCredentials(proposalTuple[2])
        # Check if this is the requested service
        requestedService = proposalTuple[0][0]
        if requestedService not in self._validNames:
            return S_ERROR("%s is not up in this server" % requestedService)
        # Check if the action is valid
        requestedActionType = proposalTuple[1][0]
        if requestedActionType not in Service.SVC_VALID_ACTIONS:
            return S_ERROR("%s is not a known action type" %
                           requestedActionType)
        # Check if it's authorized
        result = self._authorizeProposal(proposalTuple[1], trid, credDict)
        if not result['OK']:
            return result
        # Proposal is OK
        return S_OK(proposalTuple)

    def _authorizeProposal(self, actionTuple, trid, credDict):
        """ Check the client's credentials against the CS and hardcoded auth
        rules for the requested action, and record the decision in the
        security log.

        :param tuple actionTuple: (actionType, actionName)
        :param int trid: transport ID
        :param dict credDict: peer credentials
        :return: S_OK / S_ERROR
        """
        # Find CS path for the Auth rules
        referedAction = self._isMetaAction(actionTuple[0])
        if referedAction:
            csAuthPath = "%s/Default" % actionTuple[0]
            hardcodedMethodAuth = self._actions['auth'][actionTuple[0]]
        else:
            if actionTuple[0] == 'RPC':
                csAuthPath = actionTuple[1]
            else:
                csAuthPath = "/".join(actionTuple)
            # Find if there are hardcoded auth rules in the code
            hardcodedMethodAuth = False
            if actionTuple[0] in self._actions['auth']:
                hardcodedRulesByType = self._actions['auth'][actionTuple[0]]
                if actionTuple[0] == "FileTransfer":
                    methodName = actionTuple[1][0].lower() + actionTuple[1][1:]
                else:
                    methodName = actionTuple[1]
                if methodName in hardcodedRulesByType:
                    hardcodedMethodAuth = hardcodedRulesByType[methodName]
        # Auth time!
        if not self._authMgr.authQuery(csAuthPath, credDict,
                                       hardcodedMethodAuth):
            # Get the identity string
            identity = self._createIdentityString(credDict)
            fromHost = "unknown host"
            tr = self._transportPool.get(trid)
            if tr:
                fromHost = '/'.join(
                    [str(item) for item in tr.getRemoteAddress()])
            gLogger.warn(
                "Unauthorized query", "to %s:%s by %s from %s" %
                (self._name, "/".join(actionTuple), identity, fromHost))
            result = S_ERROR(ENOAUTH, "Unauthorized query")
        else:
            result = S_OK()
        # Security log
        tr = self._transportPool.get(trid)
        if not tr:
            return S_ERROR("Client disconnected")
        sourceAddress = tr.getRemoteAddress()
        identity = self._createIdentityString(credDict)
        Service.SVC_SECLOG_CLIENT.addMessage(result['OK'], sourceAddress[0],
                                             sourceAddress[1], identity,
                                             self._cfg.getHostname(),
                                             self._cfg.getPort(), self._name,
                                             "/".join(actionTuple))
        return result

    def _instantiateHandler(self, trid, proposalTuple=None):
        """
        Generate an instance of the handler for a given service

        :param int trid: transport ID
        :param tuple proposalTuple: tuple describing the proposed action

        :return: S_OK/S_ERROR, Value is the handler object
        """
        # Generate the client params
        clientParams = {'serviceStartTime': self._startTime}
        if proposalTuple:
            # The 4th element is the client version
            clientParams['clientVersion'] = proposalTuple[3] if len(
                proposalTuple) > 3 else None
            clientParams['clientSetup'] = proposalTuple[0][1]
            if len(proposalTuple[0]) < 3:
                clientParams['clientVO'] = gConfig.getValue(
                    "/DIRAC/VirtualOrganization", "unknown")
            else:
                clientParams['clientVO'] = proposalTuple[0][2]
        clientTransport = self._transportPool.get(trid)
        if clientTransport:
            clientParams['clientAddress'] = clientTransport.getRemoteAddress()
        # Generate handler dict with per client info
        handlerInitDict = dict(self._serviceInfoDict)
        for key in clientParams:
            handlerInitDict[key] = clientParams[key]
        # Instantiate and initialize
        try:
            handlerInstance = self._handler['class'](handlerInitDict, trid)
            handlerInstance.initialize()
        except Exception as e:
            gLogger.exception("Server error while loading handler: %s" %
                              str(e))
            return S_ERROR("Server error while loading handler")
        return S_OK(handlerInstance)

    def _processProposal(self, trid, proposalTuple, handlerObj):
        """ Acknowledge the proposal to the client, set up a stable message
        connection when requested, and execute the action.

        :return: result dict enriched with 'closeTransport'
        """
        # Notify the client we're ready to execute the action
        retVal = self._transportPool.send(trid, S_OK())
        if not retVal['OK']:
            return retVal
        messageConnection = False
        if proposalTuple[1] == ('Connection', 'new'):
            messageConnection = True
        if messageConnection:
            if self._msgBroker.getNumConnections(
            ) > self._cfg.getMaxMessagingConnections():
                result = S_ERROR(
                    "Maximum number of connections reached. Try later")
                result['closeTransport'] = True
                return result
            # This is a stable connection
            self._msgBroker.addTransportId(
                trid,
                self._name,
                receiveMessageCallback=self._mbReceivedMsg,
                disconnectCallback=self._mbDisconnect,
                listenToConnection=False)
        result = self._executeAction(trid, proposalTuple, handlerObj)
        if result['OK'] and messageConnection:
            self._msgBroker.listenToTransport(trid)
            result = self._mbConnect(trid, handlerObj)
            if not result['OK']:
                self._msgBroker.removeTransport(trid)
        result['closeTransport'] = not messageConnection or not result['OK']
        return result

    def _mbConnect(self, trid, handlerObj=None):
        """ Run the handler's 'connected' callback for a new message
        connection, instantiating a handler when none is supplied. """
        if not handlerObj:
            result = self._instantiateHandler(trid)
            if not result['OK']:
                return result
            handlerObj = result['Value']
        return handlerObj._rh_executeConnectionCallback('connected')

    def _executeAction(self, trid, proposalTuple, handlerObj):
        """ Delegate the proposal to the handler, shielding the caller from
        handler exceptions. """
        try:
            return handlerObj._rh_executeAction(proposalTuple)
        except Exception as e:
            gLogger.exception("Exception while executing handler action")
            return S_ERROR("Server error while executing action: %s" % str(e))

    def _mbReceivedMsg(self, trid, msgObj):
        """ Authorize and dispatch an incoming message received on a stable
        connection. """
        result = self._authorizeProposal(
            ('Message', msgObj.getName()), trid,
            self._transportPool.get(trid).getConnectingCredentials())
        if not result['OK']:
            return result
        result = self._instantiateHandler(trid)
        if not result['OK']:
            return result
        handlerObj = result['Value']
        return handlerObj._rh_executeMessageCallback(msgObj)

    def _mbDisconnect(self, trid):
        """ Run the handler's 'drop' callback when a message connection goes
        away. """
        result = self._instantiateHandler(trid)
        if not result['OK']:
            return result
        handlerObj = result['Value']
        return handlerObj._rh_executeConnectionCallback('drop')

    def __startReportToMonitoring(self):
        """ Mark the start of a query: record a 'Queries' hit, send a memory
        mark, and return the wallclock/CPU starting point consumed by
        __endReportToMonitoring.

        :return: tuple (now, cpuTime)
        """
        self._monitor.addMark("Queries")
        now = time.time()
        stats = os.times()
        cpuTime = stats[0] + stats[2]
        # Guard against the clock going backwards: skip the periodic marks
        # but still return a valid starting point
        if now - self.__monitorLastStatsUpdate < 0:
            return (now, cpuTime)
        self.__monitorLastStatsUpdate = now
        # Send Memory consumption mark
        membytes = MemStat.VmB('VmRSS:')
        if membytes:
            mem = membytes / (1024. * 1024.)
            self._monitor.addMark('MEM', mem)
        return (now, cpuTime)

    def __endReportToMonitoring(self, initialWallTime, initialCPUTime):
        """ Mark the end of a query: send the CPU usage percentage over the
        query's wallclock span. """
        wallTime = time.time() - initialWallTime
        stats = os.times()
        cpuTime = stats[0] + stats[2] - initialCPUTime
        # Avoid ZeroDivisionError on a sub-resolution wallclock span
        if wallTime > 0:
            percentage = cpuTime / wallTime * 100.
            if percentage > 0:
                self._monitor.addMark('CPU', percentage)
class FTSMonitorAgent(AgentModule):
    """
    .. class:: FTSMonitorAgent

    Monitor submitted FTS jobs.
    """
    # # transfer DB handle
    transferDB = None
    # # thread pool
    threadPool = None
    # # min threads
    minThreads = 1
    # # max threads
    maxThreads = 10

    # # missing source regexp patterns
    missingSourceErrors = [
        re.compile(
            r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] Failed"
        ),
        re.compile(
            r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] No such file or directory"
        ),
        re.compile(
            r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] Failed"
        ),
        re.compile(
            r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] The requested file either does not exist"
        ),
        re.compile(
            r"TRANSFER error during TRANSFER phase: \[INVALID_PATH\] the server sent an error response: 500 500"
            " Command failed. : open error: No such file or directory"),
        re.compile(
            r"SOURCE error during TRANSFER_PREPARATION phase: \[USER_ERROR\] source file doesnt exist"
        )
    ]

    def initialize(self):
        """ agent's initialisation: create the TransferDB handle and the
        monitoring thread pool from the MinThreads/MaxThreads options. """
        self.transferDB = TransferDB()
        self.am_setOption("shifterProxy", "DataManager")
        self.minThreads = self.am_getOption("MinThreads", self.minThreads)
        self.maxThreads = self.am_getOption("MaxThreads", self.maxThreads)
        # Sanitize configured values: force positive and min <= max
        minmax = (abs(self.minThreads), abs(self.maxThreads))
        self.minThreads, self.maxThreads = min(minmax), max(minmax)
        self.log.info("ThreadPool min threads = %s" % self.minThreads)
        self.log.info("ThreadPool max threads = %s" % self.maxThreads)
        self.threadPool = ThreadPool(self.minThreads, self.maxThreads)
        self.threadPool.daemonize()
        return S_OK()

    def execute(self):
        """ push jobs to the thread pool """
        self.log.info("Obtaining requests to monitor")
        res = self.transferDB.getFTSReq()
        if not res["OK"]:
            self.log.error("Failed to get FTS requests", res['Message'])
            return res
        if not res["Value"]:
            self.log.info("No FTS requests found to monitor.")
            return S_OK()
        ftsReqs = res["Value"]
        self.log.info("Found %s FTS jobs" % len(ftsReqs))
        i = 1
        for ftsJob in ftsReqs:
            # Retry queueing until the pool accepts the job
            while True:
                self.log.debug(
                    "submitting FTS Job %s FTSReqID=%s to monitor" %
                    (i, ftsJob["FTSReqID"]))
                ret = self.threadPool.generateJobAndQueueIt(
                    self.monitorTransfer,
                    args=(ftsJob, ),
                )
                if ret["OK"]:
                    i += 1
                    break
                # # sleep 1 second to proceed
                time.sleep(1)
        self.threadPool.processAllResults()
        return S_OK()

    def ftsJobExpired(self, ftsReqID, channelID):
        """ clean up when FTS job had expired on the server side

        :param int ftsReqID: FTSReq.FTSReqID
        :param int channelID: FTSReq.ChannelID
        """
        log = gLogger.getSubLogger("@%s" % str(ftsReqID))
        fileIDs = self.transferDB.getFTSReqFileIDs(ftsReqID)
        if not fileIDs["OK"]:
            log.error("Unable to retrieve FileIDs associated to %s request" %
                      ftsReqID)
            return fileIDs
        fileIDs = fileIDs["Value"]
        # # update FileToFTS table, this is just a clean up, no worry if somethings goes wrong
        for fileID in fileIDs:
            fileStatus = self.transferDB.setFileToFTSFileAttribute(
                ftsReqID, fileID, "Status", "Failed")
            if not fileStatus["OK"]:
                log.error(
                    "Unable to set FileToFTS status to 'Failed' for FileID %s: %s"
                    % (fileID, fileStatus["Message"]))
            failReason = self.transferDB.setFileToFTSFileAttribute(
                ftsReqID, fileID, "Reason", "FTS job expired on server")
            if not failReason["OK"]:
                log.error("Unable to set FileToFTS reason for FileID %s: %s" %
                          (fileID, failReason["Message"]))
        # # update Channel table
        resetChannels = self.transferDB.resetFileChannelStatus(
            channelID, fileIDs)
        if not resetChannels["OK"]:
            log.error("Failed to reset Channel table for files to retry")
            return resetChannels
        # # update FTSReq table
        log.info("Setting FTS request status to 'Finished'")
        ftsReqStatus = self.transferDB.setFTSReqStatus(ftsReqID, "Finished")
        if not ftsReqStatus["OK"]:
            log.error("Failed update FTS Request status",
                      ftsReqStatus["Message"])
            return ftsReqStatus
        # # if we land here, everything should be OK
        return S_OK()

    def monitorTransfer(self, ftsReqDict):
        """ monitors transfer obtained from TransferDB

        :param dict ftsReqDict: FTS job dictionary
        """
        ftsReqID = ftsReqDict.get("FTSReqID")
        ftsGUID = ftsReqDict.get("FTSGuid")
        ftsServer = ftsReqDict.get("FTSServer")
        channelID = ftsReqDict.get("ChannelID")
        sourceSE = ftsReqDict.get("SourceSE")
        targetSE = ftsReqDict.get("TargetSE")

        oFTSRequest = FTSRequest()
        oFTSRequest.setFTSServer(ftsServer)
        oFTSRequest.setFTSGUID(ftsGUID)
        oFTSRequest.setSourceSE(sourceSE)
        oFTSRequest.setTargetSE(targetSE)

        log = gLogger.getSubLogger("@%s" % str(ftsReqID))

        #########################################################################
        # Perform summary update of the FTS Request and update FTSReq entries.
        log.info("Perform summary update of the FTS Request")
        infoStr = [
            "glite-transfer-status -s %s -l %s" % (ftsServer, ftsGUID)
        ]
        infoStr.append("FTS GUID: %s" % ftsGUID)
        infoStr.append("FTS Server: %s" % ftsServer)
        log.info("\n".join(infoStr))
        res = oFTSRequest.summary()
        self.transferDB.setFTSReqLastMonitor(ftsReqID)
        if not res["OK"]:
            log.error("Failed to update the FTS request summary",
                      res["Message"])
            if "getTransferJobSummary2: Not authorised to query request" in res[
                    "Message"]:
                log.error(
                    "FTS job is not existing at the FTS server anymore, will clean it up on TransferDB side"
                )
                cleanUp = self.ftsJobExpired(ftsReqID, channelID)
                if not cleanUp["OK"]:
                    log.error(cleanUp["Message"])
                return cleanUp
            return res

        res = oFTSRequest.dumpSummary()
        if not res['OK']:
            log.error("Failed to get FTS request summary", res["Message"])
            return res
        log.info(res['Value'])

        res = oFTSRequest.getPercentageComplete()
        if not res['OK']:
            log.error("Failed to get FTS percentage complete",
                      res["Message"])
            return res
        log.info('FTS Request found to be %.1f percent complete' %
                 res["Value"])
        self.transferDB.setFTSReqAttribute(ftsReqID, "PercentageComplete",
                                           res["Value"])
        self.transferDB.addLoggingEvent(ftsReqID, res["Value"])

        #########################################################################
        # Update the information in the TransferDB if the transfer is terminal.
        res = oFTSRequest.isRequestTerminal()
        if not res["OK"]:
            log.error("Failed to determine whether FTS request terminal",
                      res["Message"])
            return res
        if not res["Value"]:
            return S_OK()
        # # request is terminal
        return self.terminalRequest(oFTSRequest, ftsReqID, channelID,
                                    sourceSE)

    def terminalRequest(self, oFTSRequest, ftsReqID, channelID, sourceSE):
        """ process terminal FTS job

        :param FTSRequest oFTSRequest: FTSRequest instance
        :param int ftsReqID: FTSReq.FTSReqID
        :param int channelID: FTSReq.ChannelID
        :param str sourceSE: FTSReq.SourceSE
        """
        log = gLogger.getSubLogger("@%s" % ftsReqID)

        log.info("FTS Request found to be terminal, updating file states")
        #########################################################################
        # Get the LFNS associated to the FTS request
        log.info("Obtaining the LFNs associated to this request")
        res = self.transferDB.getFTSReqLFNs(ftsReqID, channelID, sourceSE)
        if not res["OK"]:
            log.error("Failed to obtain FTS request LFNs", res['Message'])
            return res
        files = res["Value"]
        if not files:
            log.error("No files present for transfer")
            return S_ERROR("No files were found in the DB")

        lfns = files.keys()
        log.debug("Obtained %s files" % len(lfns))
        for lfn in lfns:
            oFTSRequest.setLFN(lfn)

        res = oFTSRequest.monitor()
        if not res["OK"]:
            log.error("Failed to perform detailed monitoring of FTS request",
                      res["Message"])
            return res
        res = oFTSRequest.getFailed()
        if not res["OK"]:
            log.error("Failed to obtained failed files for FTS request",
                      res["Message"])
            return res
        failedFiles = res["Value"]
        res = oFTSRequest.getDone()
        if not res["OK"]:
            log.error("Failed to obtained successful files for FTS request",
                      res["Message"])
            return res
        completedFiles = res["Value"]

        # An LFN can be included more than once if it was entered into more than one Request.
        # FTS will only do the transfer once. We need to identify all FileIDs
        res = self.transferDB.getFTSReqFileIDs(ftsReqID)
        if not res["OK"]:
            log.error("Failed to get FileIDs associated to FTS Request",
                      res["Message"])
            return res
        fileIDs = res["Value"]
        res = self.transferDB.getAttributesForFilesList(fileIDs, ["LFN"])
        if not res["OK"]:
            log.error("Failed to get LFNs associated to FTS Request",
                      res["Message"])
            return res
        fileIDDict = res["Value"]

        fileToFTSUpdates = []
        completedFileIDs = []
        filesToRetry = []
        filesToFail = []

        for fileID, fileDict in fileIDDict.items():
            lfn = fileDict['LFN']
            if lfn in completedFiles:
                completedFileIDs.append(fileID)
                transferTime = 0
                res = oFTSRequest.getTransferTime(lfn)
                if res["OK"]:
                    transferTime = res["Value"]
                fileToFTSUpdates.append(
                    (fileID, "Completed", "", 0, transferTime))
            if lfn in failedFiles:
                failReason = ""
                res = oFTSRequest.getFailReason(lfn)
                if res["OK"]:
                    failReason = res["Value"]
                if "Source file/user checksum mismatch" in failReason:
                    filesToFail.append(fileID)
                    continue
                if self.missingSource(failReason):
                    log.error("The source SURL does not exist.",
                              "%s %s" % (lfn, oFTSRequest.getSourceSURL(lfn)))
                    filesToFail.append(fileID)
                else:
                    filesToRetry.append(fileID)
                log.error("Failed to replicate file on channel.",
                          "%s %s" % (channelID, failReason))
                fileToFTSUpdates.append((fileID, "Failed", failReason, 0, 0))

        # # update TransferDB.FileToFTS table
        updateFileToFTS = self.updateFileToFTS(ftsReqID, channelID,
                                               filesToRetry, filesToFail,
                                               completedFileIDs,
                                               fileToFTSUpdates)

        if updateFileToFTS["OK"] and updateFileToFTS["Value"]:
            res = oFTSRequest.finalize()
            if not res["OK"]:
                log.error(
                    "Failed to perform the finalization for the FTS request",
                    res["Message"])
                return res

            log.info('Adding logging event for FTS request')
            # Now set the FTSReq status to terminal so that it is not monitored again
            res = self.transferDB.addLoggingEvent(ftsReqID, 'Finished')
            if not res['OK']:
                log.error('Failed to add logging event for FTS Request',
                          res['Message'])

            # update TransferDB.FileToCat table
            updateFileToCat = self.updateFileToCat(oFTSRequest, channelID,
                                                   fileIDDict, completedFiles,
                                                   filesToFail)
            if not updateFileToCat["OK"]:
                log.error(updateFileToCat["Message"])

            log.debug("Updating FTS request status")
            res = self.transferDB.setFTSReqStatus(ftsReqID, 'Finished')
            if not res['OK']:
                log.error('Failed update FTS Request status', res['Message'])
        return S_OK()

    def updateFileToFTS(self, ftsReqID, channelID, filesToRetry, filesToFail,
                        completedFileIDs, fileToFTSUpdates):
        """ update TransferDB.FileToFTS table for finished request

        :param int ftsReqID: FTSReq.FTSReqID
        :param int channelID: FTSReq.ChannelID
        :param list filesToRetry: FileIDs to retry
        :param list filesToFail: FileIDs for failed files
        :param list completedFileIDs: files completed
        :param list fileToFTSUpdates: per-file (fileID, status, reason,
                                      retries, transferTime) tuples
        :return: S_OK(allUpdated) -- allUpdated is True only if every table
                 update succeeded
        """
        log = gLogger.getSubLogger("@%s" % ftsReqID)
        allUpdated = True

        res = self.transferDB.resetFileChannelStatus(
            channelID, filesToRetry) if filesToRetry else S_OK()
        if not res["OK"]:
            log.error("Failed to update the Channel table for file to retry.",
                      res["Message"])
            allUpdated = False

        for fileID in filesToFail:
            log.info("Updating the Channel table for files to reschedule")
            res = self.transferDB.setFileToReschedule(fileID)
            if not res["OK"]:
                log.error("Failed to update Channel table for failed files.",
                          res["Message"])
                allUpdated = False
            elif res["Value"] == "max reschedule attempt reached":
                # BUGFIX: the original format string had no %s placeholder,
                # so this log call raised TypeError whenever the branch ran
                log.error("setting Channel status to 'Failed' : %s" %
                          res["Value"])
                res = self.transferDB.setFileChannelStatus(
                    channelID, fileID, 'Failed')
                if not res["OK"]:
                    log.error(
                        "Failed to update Channel table for failed files.",
                        res["Message"])
                    allUpdated = False

        if completedFileIDs:
            res = self.transferDB.updateCompletedChannelStatus(
                channelID, completedFileIDs)
            if not res["OK"]:
                log.error(
                    "Failed to update the Channel table for successful files.",
                    res["Message"])
                allUpdated = False
            res = self.transferDB.updateAncestorChannelStatus(
                channelID, completedFileIDs)
            if not res["OK"]:
                log.error(
                    'Failed to update the Channel table for ancestors of successful files.',
                    res['Message'])
                allUpdated = False

        if fileToFTSUpdates:
            res = self.transferDB.setFileToFTSFileAttributes(
                ftsReqID, channelID, fileToFTSUpdates)
            if not res["OK"]:
                log.error("Failed to update the FileToFTS table for files.",
                          res["Message"])
                allUpdated = False

        return S_OK(allUpdated)

    def updateFileToCat(self, oFTSRequest, channelID, fileIDDict,
                        completedFiles, filesToFail):
        """ update TransferDB.FileToCat table for finished request

        :param FTSRequest oFTSRequest: FTSRequest instance
        :param int ftsReqID: FTSReq.FTSReqID
        :param dict fileIDDict: fileIDs dictionary
        :param int channelID: FTSReq.ChannelID
        """
        res = oFTSRequest.getFailedRegistrations()
        if not res["OK"]:
            # BUGFIX: res["Value"] was previously read unconditionally,
            # raising KeyError on failure -- propagate the error instead
            return res
        failedRegistrations = res["Value"]
        regFailedFileIDs = []
        regDoneFileIDs = []
        regForgetFileIDs = []
        for fileID, fileDict in fileIDDict.items():
            lfn = fileDict['LFN']
            if lfn in failedRegistrations:
                regFailedFileIDs.append(fileID)
                # if the LFN appears more than once, FileToCat needs to be reset only once
                del failedRegistrations[lfn]
            elif lfn in completedFiles:
                regDoneFileIDs.append(fileID)
            elif fileID in filesToFail:
                regForgetFileIDs.append(fileID)
        res = self.transferDB.setRegistrationWaiting(
            channelID, regFailedFileIDs) if regFailedFileIDs else S_OK()
        if not res["OK"]:
            res["Message"] = "Failed to reset entries in FileToCat: %s" % res[
                "Message"]
            return res
        res = self.transferDB.setRegistrationDone(
            channelID, regDoneFileIDs) if regDoneFileIDs else S_OK()
        if not res["OK"]:
            res["Message"] = "Failed to set entries Done in FileToCat: %s" % res[
                "Message"]
            return res
        # This entries could also be set to Failed, but currently there is no method to do so.
        res = self.transferDB.setRegistrationDone(
            channelID, regForgetFileIDs) if regForgetFileIDs else S_OK()
        if not res["OK"]:
            res["Message"] = "Failed to set entries Done in FileToCat: %s" % res[
                "Message"]
            return res
        return S_OK()

    @classmethod
    def missingSource(cls, failReason):
        """ check if message sent by FTS server is concering missing source file

        :param str failReason: message sent by FTS server
        :return: 1 if any known missing-source pattern matches, 0 otherwise
        """
        for error in cls.missingSourceErrors:
            if error.search(failReason):
                return 1
        return 0
class FTSAgent( AgentModule ): """ .. class:: FTSAgent Agent propagating Scheduled request to Done or Failed state in the FTS system. Requests and associated FTSJobs (and so FTSFiles) are kept in cache. """ # # fts placement refresh in seconds FTSPLACEMENT_REFRESH = FTSHistoryView.INTERVAL / 2 # # placeholder for max job per channel MAX_ACTIVE_JOBS = 50 # # min threads MIN_THREADS = 1 # # max threads MAX_THREADS = 10 # # files per job MAX_FILES_PER_JOB = 100 # # MAX FTS transfer per FTSFile MAX_ATTEMPT = 256 # # stage flag PIN_TIME = 0 # # FTS submission command SUBMIT_COMMAND = 'glite-transfer-submit' # # FTS monitoring command MONITOR_COMMAND = 'glite-transfer-status' # Max number of requests fetched from the RMS MAX_REQUESTS = 100 # Minimum interval (seconds) between 2 job monitoring MONITORING_INTERVAL = 600 # # placeholder for FTS client __ftsClient = None # # placeholder for the FTS version __ftsVersion = None # # placeholder for request client __requestClient = None # # placeholder for resources helper __resources = None # # placeholder for RSS client __rssClient = None # # placeholder for FTSPlacement __ftsPlacement = None # # placement regeneration time delta __ftsPlacementValidStamp = None # # placeholder for threadPool __threadPool = None # # update lock __updateLock = None # # request cache __reqCache = dict() def updateLock( self ): """ update lock """ if not self.__updateLock: self.__updateLock = LockRing().getLock( "FTSAgentLock" ) return self.__updateLock @classmethod def requestClient( cls ): """ request client getter """ if not cls.__requestClient: cls.__requestClient = ReqClient() return cls.__requestClient @classmethod def ftsClient( cls ): """ FTS client """ if not cls.__ftsClient: cls.__ftsClient = FTSClient() return cls.__ftsClient @classmethod def rssClient( cls ): """ RSS client getter """ if not cls.__rssClient: cls.__rssClient = ResourceStatus() return cls.__rssClient @classmethod def getRequest( cls, reqID ): """ get Requests 
    systematically and refresh cache """
    getRequest = cls.requestClient().getRequest( reqID )
    if not getRequest["OK"]:
      # drop a stale cache entry on failure
      cls.__reqCache.pop( reqID, None )
      return getRequest
    getRequest = getRequest["Value"]
    if not getRequest:
      cls.__reqCache.pop( reqID, None )
      return S_ERROR( "request of id '%s' not found in ReqDB" % reqID )
    cls.__reqCache[reqID] = getRequest
    return S_OK( cls.__reqCache[reqID] )

  @classmethod
  def putRequest( cls, request, clearCache = True ):
    """ put request back to ReqDB

    :param Request request: Request instance
    :param bool clearCache: clear the cache?

    also finalize request if status == Done
    """
    # # put back request
    if request.RequestID not in cls.__reqCache:
      return S_OK()
    put = cls.requestClient().putRequest( request )
    if not put["OK"]:
      return put
    # # finalize first if possible
    if request.Status == "Done" and request.JobID:
      finalizeRequest = cls.requestClient().finalizeRequest( request.RequestID, request.JobID )
      if not finalizeRequest["OK"]:
        # finalization failed: push the request back to Scheduled
        request.Status = "Scheduled"
    # # del request from cache if needed
    if clearCache:
      cls.__reqCache.pop( request.RequestID, None )
    return S_OK()

  @classmethod
  def putFTSJobs( cls, ftsJobsList ):
    """ put back fts jobs to the FTSDB

    :param ftsJobsList: iterable of FTSJob instances
    :return: S_OK, or the first S_ERROR returned by putFTSJob (remaining jobs are not put back)
    """
    for ftsJob in ftsJobsList:
      put = cls.ftsClient().putFTSJob( ftsJob )
      if not put["OK"]:
        return put
    return S_OK()

  @staticmethod
  def updateFTSFileDict( ftsFilesDict, toUpdateDict ):
    """ update :ftsFilesDict: with FTSFiles in :toUpdateDict:

    merges per-category lists, skipping FTSFiles already present; mutates
    and returns ftsFilesDict
    """
    for category, ftsFileList in ftsFilesDict.iteritems():
      for ftsFile in toUpdateDict.get( category, [] ):
        if ftsFile not in ftsFileList:
          ftsFileList.append( ftsFile )
    return ftsFilesDict

#  def resources( self ):
#    """ resource helper getter """
#    if not self.__resources:
#      self.__resources = Resources()
#    return self.__resources

  def threadPool( self ):
    """ thread pool getter (lazily created, daemonized) """
    if not self.__threadPool:
      self.__threadPool = ThreadPool( self.MIN_THREADS, self.MAX_THREADS )
      self.__threadPool.daemonize()
    return self.__threadPool

  def resetFTSPlacement( self ):
    """ create fts Placement

    (re)builds or refreshes the FTSPlacement from the FTS history views,
    under the update lock, and renews the validity time stamp
    """
    ftsHistory = self.ftsClient().getFTSHistory()
    if not ftsHistory["OK"]:
      self.log.error( "unable to get FTS history:", ftsHistory["Message"] )
      return ftsHistory
    ftsHistory = ftsHistory["Value"]
    try:
      self.updateLock().acquire()
      if not self.__ftsPlacement:
        self.__ftsPlacement = FTSPlacement( csPath = None, ftsHistoryViews = ftsHistory )
      else:
        self.__ftsPlacement.refresh( ftsHistoryViews = ftsHistory )
    finally:
      self.updateLock().release()
    # # save time stamp
    self.__ftsPlacementValidStamp = datetime.datetime.now() + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH )
    return S_OK()

  def initialize( self ):
    """ agent's initialization: read options, create FTSPlacement, register monitoring activities """
    # # data manager
    self.dataManager = DataManager()
    log = self.log.getSubLogger( "initialize" )
    self.FTSPLACEMENT_REFRESH = self.am_getOption( "FTSPlacementValidityPeriod", self.FTSPLACEMENT_REFRESH )
    log.info( "FTSPlacement validity period = %s s" % self.FTSPLACEMENT_REFRESH )
    self.SUBMIT_COMMAND = self.am_getOption( "SubmitCommand", self.SUBMIT_COMMAND )
    log.info( "FTS submit command = %s" % self.SUBMIT_COMMAND )
    self.MONITOR_COMMAND = self.am_getOption( "MonitorCommand", self.MONITOR_COMMAND )
    log.info( "FTS commands: submit = %s monitor %s" % ( self.SUBMIT_COMMAND, self.MONITOR_COMMAND ) )
    self.PIN_TIME = self.am_getOption( "PinTime", self.PIN_TIME )
    log.info( "Stage files before submission = ", {True: "yes", False: "no"}[bool( self.PIN_TIME )] )
    self.MAX_ACTIVE_JOBS = self.am_getOption( "MaxActiveJobsPerRoute", self.MAX_ACTIVE_JOBS )
    log.info( "Max active FTSJobs/route = ", str( self.MAX_ACTIVE_JOBS ) )
    self.MAX_FILES_PER_JOB = self.am_getOption( "MaxFilesPerJob", self.MAX_FILES_PER_JOB )
    log.info( "Max FTSFiles/FTSJob = ", str( self.MAX_FILES_PER_JOB ) )
    self.MAX_ATTEMPT = self.am_getOption( "MaxTransferAttempts", self.MAX_ATTEMPT )
    log.info( "Max transfer attempts = ", str( self.MAX_ATTEMPT ) )
    # # thread pool
    self.MIN_THREADS = self.am_getOption( "MinThreads", self.MIN_THREADS )
    self.MAX_THREADS =
    self.am_getOption( "MaxThreads", self.MAX_THREADS )
    # normalise thread limits: take absolute values and order them min <= max
    minmax = ( abs( self.MIN_THREADS ), abs( self.MAX_THREADS ) )
    self.MIN_THREADS, self.MAX_THREADS = min( minmax ), max( minmax )
    log.info( "ThreadPool min threads = ", str( self.MIN_THREADS ) )
    log.info( "ThreadPool max threads = ", str( self.MAX_THREADS ) )
    self.MAX_REQUESTS = self.am_getOption( "MaxRequests", self.MAX_REQUESTS )
    log.info( "Max Requests fetched = ", str( self.MAX_REQUESTS ) )
    self.MONITORING_INTERVAL = self.am_getOption( "MonitoringInterval", self.MONITORING_INTERVAL )
    log.info( "Minimum monitoring interval = ", str( self.MONITORING_INTERVAL ) )
    self.__ftsVersion = Operations().getValue( 'DataManagement/FTSVersion', 'FTS2' )
    log.info( "FTSVersion : %s" % self.__ftsVersion )
    log.info( "initialize: creation of FTSPlacement..." )
    createPlacement = self.resetFTSPlacement()
    if not createPlacement["OK"]:
      log.error( "initialize:", createPlacement["Message"] )
      return createPlacement
    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    log.info( "will use DataManager proxy" )
    self.registrationProtocols = getRegistrationProtocols()
    # # gMonitor stuff here
    gMonitor.registerActivity( "RequestsAtt", "Attempted requests executions", "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsOK", "Successful requests executions", "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "RequestsFail", "Failed requests executions", "FTSAgent", "Requests/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubAtt", "FTSJobs creation attempts", "FTSAgent", "Created FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubOK", "FTSJobs submitted successfully", "FTSAgent", "Successful FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsSubFail", "FTSJobs submissions failed", "FTSAgent", "Failed FTSJobs submissions/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonAtt", "FTSJobs monitored", "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonOK", "FTSJobs monitored successfully", "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSJobsMonFail", "FTSJobs attempts failed", "FTSAgent", "FTSJobs/min", gMonitor.OP_SUM )
    gMonitor.registerActivity( "FTSMonitorFail", "Failed FTS monitor executions", "FTSAgent", "Execution/mins", gMonitor.OP_SUM )
    pollingTime = self.am_getOption( "PollingTime", 60 )
    # one per-cycle counter for every possible FTSJob state
    for status in list( FTSJob.INITSTATES + FTSJob.TRANSSTATES + FTSJob.FAILEDSTATES + FTSJob.FINALSTATES ):
      gMonitor.registerActivity( "FTSJobs%s" % status, "FTSJobs %s" % status , "FTSAgent", "FTSJobs/cycle", gMonitor.OP_ACUM, pollingTime )
    gMonitor.registerActivity( "FtSJobsPerRequest", "Average FTSJobs per request", "FTSAgent", "FTSJobs/Request", gMonitor.OP_MEAN )
    gMonitor.registerActivity( "FTSFilesPerJob", "FTSFiles per FTSJob", "FTSAgent", "Number of FTSFiles per FTSJob", gMonitor.OP_MEAN )
    gMonitor.registerActivity(
"FTSSizePerJob", "Average FTSFiles size per FTSJob", "FTSAgent", "Average submitted size per FTSJob", gMonitor.OP_MEAN ) return S_OK() def finalize( self ): """ finalize processing """ # log = self.log.getSubLogger( "finalize" ) # if self.__reqCache: # log.info( 'putting back %d requests from cache' % len( self.__reqCache ) ) # else: # log.info( 'no requests to put back' ) # for request in self.__reqCache.values(): # put = self.requestClient().putRequest( request ) # if not put["OK"]: # log.error( "unable to put back request '%s': %s" % ( request.RequestName, put["Message"] ) ) return S_OK() def execute( self ): """ one cycle execution """ # Don't use the server certificate otherwise the DFC wont let us write gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' ) log = gLogger.getSubLogger( "execute" ) # # reset FTSPlacement if expired now = datetime.datetime.now() if now > self.__ftsPlacementValidStamp: log.info( "resetting expired FTS placement..." ) resetFTSPlacement = self.resetFTSPlacement() if not resetFTSPlacement["OK"]: log.error( "FTSPlacement recreation error:" , resetFTSPlacement["Message"] ) return resetFTSPlacement self.__ftsPlacementValidStamp = now + datetime.timedelta( seconds = self.FTSPLACEMENT_REFRESH ) requestIDs = self.requestClient().getRequestIDsList( statusList = [ "Scheduled" ], limit = self.MAX_REQUESTS ) if not requestIDs["OK"]: log.error( "unable to read scheduled request ids" , requestIDs["Message"] ) return requestIDs if not requestIDs["Value"]: requestIDs = [] else: requestIDs = [ req[0] for req in requestIDs["Value"] if req[0] not in self.__reqCache ] requestIDs += self.__reqCache.keys() if not requestIDs: log.info( "no 'Scheduled' requests to process" ) return S_OK() log.info( "found %s requests to process:" % len( requestIDs ) ) log.info( " => from internal cache: %s" % ( len( self.__reqCache ) ) ) log.info( " => new read from RMS: %s" % ( len( requestIDs ) - len( self.__reqCache ) ) ) for requestID in 
requestIDs: request = self.getRequest( requestID ) if not request["OK"]: log.error( "Error getting request", "%s: %s" % ( requestID, request["Message"] ) ) continue request = request["Value"] sTJId = request.RequestID while True: queue = self.threadPool().generateJobAndQueueIt( self.processRequest, args = ( request, ), sTJId = sTJId ) if queue["OK"]: log.info( "Request enqueued for execution", sTJId ) gMonitor.addMark( "RequestsAtt", 1 ) break time.sleep( 1 ) # # process all results self.threadPool().processAllResults() return S_OK() def processRequest( self, request ): """ process one request :param Request request: ReqDB.Request """ log = self.log.getSubLogger( "req_%s/%s" % ( request.RequestID, request.RequestName ) ) operation = request.getWaiting() if not operation["OK"]: log.error( "Unable to find 'Scheduled' ReplicateAndRegister operation in request" ) return self.putRequest( request ) operation = operation["Value"] if not isinstance( operation, Operation ): log.error( "Waiting returned operation is not an operation:", type( operation ) ) return self.putRequest( request ) if operation.Type != "ReplicateAndRegister": log.error( "operation to be executed is not a ReplicateAndRegister but", operation.Type ) return self.putRequest( request ) if operation.Status != "Scheduled": log.error( "operation in a wrong state, expecting 'Scheduled', got", operation.Status ) return self.putRequest( request ) log.info( 'start processRequest' ) # # select FTSJobs, by default all in TRANS_STATES and INIT_STATES ftsJobs = self.ftsClient().getFTSJobsForRequest( request.RequestID ) if not ftsJobs["OK"]: log.error( ftsJobs["Message"] ) return ftsJobs ftsJobs = [ftsJob for ftsJob in ftsJobs.get( "Value", [] ) if ftsJob.Status not in FTSJob.FINALSTATES] # # Use a try: finally: for making sure FTS jobs are put back before returning try: # # dict keeping info about files to reschedule, submit, fail and register ftsFilesDict = dict( [ ( k, list() ) for k in ( "toRegister", "toSubmit", 
"toFail", "toReschedule", "toUpdate" ) ] ) now = datetime.datetime.utcnow() jobsToMonitor = [job for job in ftsJobs if ( now - job.LastUpdate ).seconds > ( self.MONITORING_INTERVAL * ( 3. if job.Status == 'Staging' else 1. ) )] if jobsToMonitor: log.info( "==> found %s FTSJobs to monitor" % len( jobsToMonitor ) ) # # PHASE 0 = monitor active FTSJobs for ftsJob in jobsToMonitor: monitor = self.__monitorJob( request, ftsJob ) if not monitor["OK"]: log.error( "unable to monitor FTSJob", "%s: %s" % ( ftsJob.FTSJobID, monitor["Message"] ) ) ftsJob.Status = "Submitted" else: ftsFilesDict = self.updateFTSFileDict( ftsFilesDict, monitor["Value"] ) log.info( "monitoring of FTSJobs completed" ) for key, ftsFiles in ftsFilesDict.iteritems(): if ftsFiles: log.info( " => %d FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) ) if len( ftsJobs ) != len( jobsToMonitor ): log.info( "==> found %d FTSJobs that were monitored recently" % ( len( ftsJobs ) - len( jobsToMonitor ) ) ) # # PHASE ONE - check ready replicas missingReplicas = self.__checkReadyReplicas( request, operation ) if not missingReplicas["OK"]: log.error( missingReplicas["Message"] ) else: missingReplicas = missingReplicas["Value"] for opFile in operation: # Actually the condition below should never happen... Change printout for checking if opFile.LFN not in missingReplicas and opFile.Status not in ( 'Done', 'Failed' ): log.warn( "File should be set Done! 
%s is replicated at all targets" % opFile.LFN ) opFile.Status = "Done" if missingReplicas: # Check if these files are in the FTSDB ftsFiles = self.ftsClient().getAllFTSFilesForRequest( request.RequestID ) if not ftsFiles['OK']: log.error( ftsFiles['Message'] ) else: ftsFiles = ftsFiles['Value'] ftsLfns = set( [ftsFile.LFN for ftsFile in ftsFiles] ) # Recover files not in FTSDB toSchedule = set( missingReplicas ) - ftsLfns if toSchedule: log.warn( '%d files in operation are not in FTSDB, reset them Waiting' % len( toSchedule ) ) for opFile in operation: if opFile.LFN in toSchedule and opFile.Status == 'Scheduled': opFile.Status = 'Waiting' # Recover files with target not in FTSDB toSchedule = set( [missing for missing, missingSEs in missingReplicas.iteritems() if not [ftsFile for ftsFile in ftsFiles if ftsFile.LFN == missing and ftsFile.TargetSE in missingSEs]] ) if toSchedule: log.warn( '%d targets in operation are not in FTSDB, reset files Waiting' % len( toSchedule ) ) for opFile in operation: if opFile.LFN in toSchedule and opFile.Status == 'Scheduled': opFile.Status = 'Waiting' # identify missing LFNs that are waiting for a replication which is finished for ftsFile in [f for f in ftsFiles if f.LFN in missingReplicas and f.Status.startswith( 'Waiting#' )]: targetSE = ftsFile.Status.split( '#' )[1] finishedFiles = [f for f in ftsFiles if f.LFN == ftsFile.LFN and f.Status == 'Finished' and f.TargetSE == targetSE and f not in ftsFilesDict['toUpdate']] if finishedFiles: log.warn( "%s is %s while replication was Finished to %s, update" % ( ftsFile.LFN, ftsFile.Status, targetSE ) ) ftsFilesDict['toUpdate'] += finishedFiles # identify Active transfers for which there is no FTS job any longer and reschedule them for ftsFile in [f for f in ftsFiles if f.Status == 'Active' and f.TargetSE in missingReplicas.get( f.LFN, [] )]: if not [ftsJob for ftsJob in ftsJobs if ftsJob.FTSGUID == ftsFile.FTSGUID]: ftsFilesDict['toReschedule'].append( ftsFile ) # identify Finished 
transfer for which the replica is still missing for ftsFile in [f for f in ftsFiles if f.Status == 'Finished' and f.TargetSE in missingReplicas.get( f.LFN, [] ) and f not in ftsFilesDict['toRegister'] ]: # Check if there is a registration operation for that file and that target regOp = [op for op in request if op.Type == 'RegisterReplica' and op.TargetSE == ftsFile.TargetSE and [f for f in op if f.LFN == ftsFile.LFN]] if not regOp: ftsFilesDict['toReschedule'].append( ftsFile ) # Recover files that are Failed but were not spotted for ftsFile in [f for f in ftsFiles if f.Status == 'Failed' and f.TargetSE in missingReplicas.get( f.LFN, [] )]: reschedule, submit, fail = self.__checkFailed( ftsFile ) if fail and ftsFile not in ftsFilesDict['toFail']: ftsFilesDict['toFail'].append( ftsFile ) elif reschedule and ftsFile not in ftsFilesDict['toReschedule']: ftsFilesDict['toReschedule'].append( ftsFile ) elif submit and ftsFile not in ftsFilesDict['toSubmit']: ftsFilesDict['toSubmit'].append( ftsFile ) # If all transfers are finished for unregistered files and there is already a registration operation, set it Done for lfn in missingReplicas: if not [f for f in ftsFiles if f.LFN == lfn and ( f.Status != 'Finished' or f in ftsFilesDict['toReschedule'] or f in ftsFilesDict['toRegister'] )]: for opFile in operation: if opFile.LFN == lfn: opFile.Status = 'Done' break for key, ftsFiles in ftsFilesDict.iteritems(): if ftsFiles: log.info( " => %d FTSFiles to %s" % ( len( ftsFiles ), key[2:].lower() ) ) toFail = ftsFilesDict.get( "toFail", [] ) toReschedule = ftsFilesDict.get( "toReschedule", [] ) toSubmit = ftsFilesDict.get( "toSubmit", [] ) toRegister = ftsFilesDict.get( "toRegister", [] ) toUpdate = ftsFilesDict.get( "toUpdate", [] ) # # PHASE TWO = Failed files? -> make request Failed and return if toFail: log.error( "==> found %d 'Failed' FTSFiles, but maybe other files can be processed..." 
% len( toFail ) ) for opFile in operation: for ftsFile in toFail: if opFile.FileID == ftsFile.FileID: opFile.Error = ftsFile.Error opFile.Status = "Failed" operation.Error = "%s files are missing any replicas" % len( toFail ) # # requets.Status should be Failed if all files in the operation "Failed" if request.Status == "Failed": request.Error = "ReplicateAndRegister %s failed" % operation.Order log.error( "request is set to 'Failed'" ) # # putRequest is done by the finally: clause... Not good to do it twice raise escapeTry # # PHASE THREE - update Waiting#TargetSE FTSFiles if toUpdate: log.info( "==> found %s possible FTSFiles to update..." % ( len( toUpdate ) ) ) byTarget = {} for ftsFile in toUpdate: byTarget.setdefault( ftsFile.TargetSE, [] ).append( ftsFile.FileID ) for targetSE, fileIDList in byTarget.iteritems(): update = self.ftsClient().setFTSFilesWaiting( operation.OperationID, targetSE, fileIDList ) if not update["OK"]: log.error( "update FTSFiles failed:", update["Message"] ) # # PHASE FOUR - add 'RegisterReplica' Operations if toRegister: log.info( "==> found %d Files waiting for registration, adding 'RegisterReplica' operations" % len( toRegister ) ) registerFiles = self.__insertRegisterOperation( request, operation, toRegister ) if not registerFiles["OK"]: log.error( "unable to create 'RegisterReplica' operations:", registerFiles["Message"] ) # if request.Status == "Waiting": # log.info( "request is in 'Waiting' state, will put it back to RMS" ) # return self.putRequest( request ) # # PHASE FIVE - reschedule operation files if toReschedule: log.info( "==> found %s Files to reschedule" % len( toReschedule ) ) rescheduleFiles = self.__reschedule( request, operation, toReschedule ) if not rescheduleFiles["OK"]: log.error( 'Failed to reschedule files', rescheduleFiles["Message"] ) # # PHASE SIX - read Waiting ftsFiles and submit new FTSJobs. 
We get also Failed files to recover them if needed ftsFiles = self.ftsClient().getFTSFilesForRequest( request.RequestID, [ "Waiting", "Failed", 'Submitted', 'Canceled' ] ) if not ftsFiles["OK"]: log.error( ftsFiles["Message"] ) else: retryIds = set ( [ ftsFile.FTSFileID for ftsFile in toSubmit ] ) for ftsFile in ftsFiles["Value"]: if ftsFile.FTSFileID not in retryIds: if ftsFile.Status in ( 'Failed', 'Canceled' ): # If the file was not unrecoverable failed and is not yet set toSubmit _reschedule, submit, _fail = self.__checkFailed( ftsFile ) elif ftsFile.Status == 'Submitted': if ftsFile.FTSGUID not in [job.FTSGUID for job in ftsJobs]: log.warn( 'FTS GUID %s not found in FTS jobs, resubmit file transfer' % ftsFile.FTSGUID ) ftsFile.Status = 'Waiting' submit = True else: submit = False else: submit = True if submit: toSubmit.append( ftsFile ) retryIds.add( ftsFile.FTSFileID ) # # should not put back jobs that have not been monitored this time ftsJobs = jobsToMonitor # # submit new ftsJobs if toSubmit: if request.Status != 'Scheduled': log.info( "Found %d FTSFiles to submit while request is no longer in Scheduled status (%s)" \ % ( len( toSubmit ), request.Status ) ) else: self.__checkDuplicates( request.RequestID, toSubmit ) log.info( "==> found %s FTSFiles to submit" % len( toSubmit ) ) submit = self.__submit( request, operation, toSubmit ) if not submit["OK"]: log.error( submit["Message"] ) else: ftsJobs += submit["Value"] # # status change? - put back request if request.Status != "Scheduled": log.info( "request no longer in 'Scheduled' state (%s), will put it back to RMS" % request.Status ) except escapeTry: # This clause is raised when one wants to return from within the try: clause pass except Exception, exceptMessage: log.exception( "Exception in processRequest", lException = exceptMessage ) finally:
class GatewayService(Service):
    """ Inherits from Service so it can (and should) be run as a DIRAC service,
        but replaces several of the internal methods: it relays client calls
        (RPC, file transfer, messaging) to the target service on their behalf,
        using a proxy delegated by the client.
    """

    GATEWAY_NAME = "Framework/Gateway"

    def __init__(self):
        """ Initialize like a real service """
        super(GatewayService, self).__init__({
            'modName': GatewayService.GATEWAY_NAME,
            'loadName': GatewayService.GATEWAY_NAME,
            'standalone': True,
            'moduleObj': sys.modules[DIRAC.Core.DISET.private.GatewayService.GatewayService.
                                     __module__],
            'classObj': self.__class__
        })
        # cache of delegated client proxies, keyed by (DN, group, extraCreds, isLimited)
        self.__delegatedCredentials = DictCache()
        # hard cap on relayed file-transfer size (100 MiB)
        self.__transferBytesLimit = 1024 * 1024 * 100
        # to be resolved in initialize()
        self._url = None
        self._handler = None
        self._threadPool = None
        self._msgBroker = None
        self._msgForwarder = None

    def initialize(self):
        """ This replaces the standard initialize from Service

        Builds the service URL, loads the handler and creates the thread
        pool, message broker and message forwarder.
        """
        # Build the URLs
        self._url = self._cfg.getURL()
        if not self._url:
            return S_ERROR("Could not build service URL for %s" % GatewayService.GATEWAY_NAME)
        gLogger.verbose("Service URL is %s" % self._url)
        # Load handler
        result = self._loadHandlerInit()
        if not result['OK']:
            return result
        self._handler = result['Value']
        # Discover Handler
        self._threadPool = ThreadPool(1, max(0, self._cfg.getMaxThreads()),
                                      self._cfg.getMaxWaitingPetitions())
        self._threadPool.daemonize()
        self._msgBroker = MessageBroker("%sMSB" % GatewayService.GATEWAY_NAME,
                                        threadPool=self._threadPool)
        # Messages in the gateway are just forwarded raw, never materialized
        self._msgBroker.useMessageObjects(False)
        getGlobalMessageBroker().useMessageObjects(False)
        self._msgForwarder = MessageForwarder(self._msgBroker)
        return S_OK()

    def _processInThread(self, clientTransport):
        """ Threaded process function: handshake, receive and check the
            client's proposal, then execute it and close the transport if
            required.
        """
        # Handshake
        try:
            clientTransport.handshake()
        except Exception:
            # FIX: was a bare 'except:'; a failed handshake still just drops
            # the connection, but SystemExit/KeyboardInterrupt now propagate
            return
        # Add to the transport pool
        trid = self._transportPool.add(clientTransport)
        if not trid:
            return
        # Receive and check proposal
        result = self._receiveAndCheckProposal(trid)
        if not result['OK']:
            self._transportPool.sendAndClose(trid, result)
            return
        proposalTuple = result['Value']
        # Instantiate handler
        result = self.__getClientInitArgs(trid, proposalTuple)
        if not result['OK']:
            self._transportPool.sendAndClose(trid, result)
            return
        clientInitArgs = result['Value']
        # Execute the action
        result = self._processProposal(trid, proposalTuple, clientInitArgs)
        # Close the connection if required
        if result['closeTransport']:
            self._transportPool.close(trid)
        return result

    def _receiveAndCheckProposal(self, trid):
        """ Read the action proposal from the client transport and register
            any extra credentials it carries.

        :param trid: transport id in the transport pool
        :return: S_OK(proposalTuple) / S_ERROR
        """
        clientTransport = self._transportPool.get(trid)
        # Get the peer credentials
        credDict = clientTransport.getConnectingCredentials()
        # Receive the action proposal
        retVal = clientTransport.receiveData(1024)
        if not retVal['OK']:
            gLogger.error(
                "Invalid action proposal", "%s %s" % (self._createIdentityString(
                    credDict, clientTransport), retVal['Message']))
            return S_ERROR("Invalid action proposal")
        proposalTuple = retVal['Value']
        gLogger.debug("Received action from client", "/".join(list(proposalTuple[1])))
        # Check if there are extra credentials
        if proposalTuple[2]:
            clientTransport.setExtraCredentials(proposalTuple[2])
        return S_OK(proposalTuple)

    def __getClientInitArgs(self, trid, proposalTuple):
        """ Build the BaseClient init arguments used to act on behalf of the
            connecting client, requesting (and caching) a delegated proxy.

        :return: S_OK(clientInitArgs) / S_OK() when no x509 chain is present / S_ERROR
        """
        clientTransport = self._transportPool.get(trid)
        # Get the peer credentials
        credDict = clientTransport.getConnectingCredentials()
        if 'x509Chain' not in credDict:
            return S_OK()
        cKey = (credDict['DN'], credDict.get('group', False),
                credDict.get('extraCredentials', False), credDict['isLimitedProxy'])
        # still-valid cached delegation? (needs at least 3600 s left)
        dP = self.__delegatedCredentials.get(cKey, 3600)
        idString = self._createIdentityString(credDict, clientTransport)
        if dP:
            gLogger.verbose("Proxy for %s is cached" % idString)
            return S_OK(dP)
        result = self.__requestDelegation(clientTransport, credDict)
        if not result['OK']:
            gLogger.warn("Could not get proxy for %s: %s" % (idString, result['Message']))
            return result
        delChain = result['Value']
        delegatedChain = delChain.dumpAllToString()['Value']
        secsLeft = delChain.getRemainingSecs()['Value'] - 1
        clientInitArgs = {
            BaseClient.KW_SETUP: proposalTuple[0][1],
            BaseClient.KW_TIMEOUT: 600,
            BaseClient.KW_IGNORE_GATEWAYS: True,
            BaseClient.KW_USE_CERTIFICATES: False,
            BaseClient.KW_PROXY_STRING: delegatedChain
        }
        if BaseClient.KW_EXTRA_CREDENTIALS in credDict:
            clientInitArgs[BaseClient.KW_EXTRA_CREDENTIALS] = credDict[
                BaseClient.KW_EXTRA_CREDENTIALS]
        gLogger.warn("Got delegated proxy for %s: %s secs left" % (idString, secsLeft))
        self.__delegatedCredentials.add(cKey, secsLeft, clientInitArgs)
        return S_OK(clientInitArgs)

    def __requestDelegation(self, clientTransport, credDict):
        """ Ask the connected client to delegate a proxy for its identity.

        :return: S_OK(X509Chain) with the delegated chain / S_ERROR
        """
        peerChain = credDict['x509Chain']
        retVal = peerChain.getCertInChain()['Value'].generateProxyRequest()
        if not retVal['OK']:
            return retVal
        delegationRequest = retVal['Value']
        retVal = delegationRequest.dumpRequest()
        if not retVal['OK']:
            retVal = S_ERROR("Server Error: Can't generate delegation request")
            clientTransport.sendData(retVal)
            return retVal
        gLogger.info("Sending delegation request for %s" %
                     delegationRequest.getSubjectDN()['Value'])
        clientTransport.sendData(S_OK({'delegate': retVal['Value']}))
        delegatedCertChain = clientTransport.receiveData()
        delegatedChain = X509Chain(keyObj=delegationRequest.getPKey())
        retVal = delegatedChain.loadChainFromString(delegatedCertChain)
        if not retVal['OK']:
            retVal = S_ERROR("Error in receiving delegated proxy: %s" % retVal['Message'])
            clientTransport.sendData(retVal)
            return retVal
        return S_OK(delegatedChain)

    # Msg

    def _mbConnect(self, trid, handlerObj=None):
        """ messaging connect hook: nothing to do on the gateway side """
        return S_OK()

    def _mbReceivedMsg(self, cliTrid, msgObj):
        """ relay a client message through the forwarder """
        return self._msgForwarder.msgFromClient(cliTrid, msgObj)

    def _mbDisconnect(self, cliTrid):
        """ drop the forwarder state for a disconnecting client """
        self._msgForwarder.cliDisconnect(cliTrid)

    # Execute action

    def _executeAction(self, trid, proposalTuple, clientInitArgs):
        """ Dispatch the client's proposal: FileTransfer, RPC or a new
            messaging Connection are forwarded to the target service.
        """
        clientTransport = self._transportPool.get(trid)
        credDict = clientTransport.getConnectingCredentials()
        targetService = proposalTuple[0][0]
        actionType = proposalTuple[1][0]
        actionMethod = proposalTuple[1][1]
        idString = self._createIdentityString(credDict, clientTransport)
        # OOkay! Lets do the magic!
        retVal = clientTransport.receiveData()
        if not retVal['OK']:
            gLogger.error("Error while receiving file description", retVal['Message'])
            clientTransport.sendData(
                S_ERROR("Error while receiving file description: %s" % retVal['Message']))
            return
        if actionType == "FileTransfer":
            gLogger.warn("Received a file transfer action from %s" % idString)
            clientTransport.sendData(S_OK("Accepted"))
            retVal = self.__forwardFileTransferCall(targetService, clientInitArgs,
                                                    actionMethod, retVal['Value'],
                                                    clientTransport)
        elif actionType == "RPC":
            gLogger.info("Forwarding %s/%s action to %s for %s" %
                         (actionType, actionMethod, targetService, idString))
            retVal = self.__forwardRPCCall(targetService, clientInitArgs, actionMethod,
                                           retVal['Value'])
        elif actionType == "Connection" and actionMethod == "new":
            gLogger.info("Initiating a messaging connection to %s for %s" %
                         (targetService, idString))
            retVal = self._msgForwarder.addClient(trid, targetService, clientInitArgs,
                                                  retVal['Value'])
        else:
            gLogger.warn("Received an invalid %s/%s action from %s" %
                         (actionType, actionMethod, idString))
            retVal = S_ERROR("Unknown type of action (%s)" % actionType)
        # TODO: Send back the data?
        if 'rpcStub' in retVal:
            retVal.pop('rpcStub')
        clientTransport.sendData(retVal)
        return retVal

    def __forwardRPCCall(self, targetService, clientInitArgs, method, params):
        """ Relay an RPC call to the target service; CS data requests are
            served directly from the local configuration cache.
        """
        if targetService == "Configuration/Server":
            if method == "getCompressedDataIfNewer":
                # Relay CS data directly
                serviceVersion = gConfigurationData.getVersion()
                retDict = {'newestVersion': serviceVersion}
                clientVersion = params[0]
                if clientVersion < serviceVersion:
                    retDict['data'] = gConfigurationData.getCompressedData()
                return S_OK(retDict)
        # Default
        rpcClient = RPCClient(targetService, **clientInitArgs)
        methodObj = getattr(rpcClient, method)
        return methodObj(*params)

    def __forwardFileTransferCall(self, targetService, clientInitArgs, method, params,
                                  clientTransport):
        """ Relay a file transfer between the client and the target service,
            enforcing the configured transfer-size limit.
        """
        transferRelay = TransferRelay(targetService, **clientInitArgs)
        transferRelay.setTransferLimit(self.__transferBytesLimit)
        cliFH = FileHelper(clientTransport)
        # Check file size
        if method.find("ToClient") > -1:
            cliFH.setDirection("send")
        elif method.find("FromClient") > -1:
            cliFH.setDirection("receive")
            if not self.__ftCheckMaxTransferSize(params[2]):
                cliFH.markAsTransferred()
                return S_ERROR("Transfer size is too big")
        # Forward queries
        try:
            relayMethodObject = getattr(transferRelay, 'forward%s' % method)
        except AttributeError:
            # FIX: was a bare 'except:'; only a missing forwarder method is expected here
            return S_ERROR("Cannot forward unknown method %s" % method)
        result = relayMethodObject(cliFH, params)
        return result

    def __ftCheckMaxTransferSize(self, requestedTransferSize):
        """ accept the transfer when no limit is set, no size was declared,
            or the declared size fits within the limit """
        if not self.__transferBytesLimit:
            return True
        if not requestedTransferSize:
            return True
        if requestedTransferSize <= self.__transferBytesLimit:
            return True
        return False
class Service:
  """ Hosts one DIRAC service: loads the request handler described by
      'serviceData', sets up monitoring, the thread pool and the message
      broker, and exports the handler's actions.
      (Helper methods such as _loadHandlerInit/_loadActions live outside
      this excerpt.)
  """

  # Action type -> handler-method prefix implementing it.  "Connection" is an
  # alias resolved to "Message" by _isMetaAction.
  SVC_VALID_ACTIONS = {'RPC': 'export',
                       'FileTransfer': 'transfer',
                       'Message': 'msg',
                       'Connection': 'Message'}
  # One security-log client shared by every Service instance in the process.
  SVC_SECLOG_CLIENT = SecurityLogClient()

  def __init__(self, serviceData):
    """ :param serviceData: dict describing the service; this code reads the
                            'loadName', 'modName' and 'standalone' keys
    """
    self._svcData = serviceData
    self._name = serviceData['loadName']
    self._startTime = Time.dateTime()
    # Both the module name and the (possibly different) load name are
    # accepted names for this service.
    self._validNames = [serviceData['modName']]
    if serviceData['loadName'] not in self._validNames:
      self._validNames.append(serviceData['loadName'])
    self._cfg = ServiceConfiguration(list(self._validNames))
    # Standalone services reuse the process-global monitor; otherwise a
    # dedicated monitoring client is created.
    if serviceData['standalone']:
      self._monitor = gMonitor
    else:
      self._monitor = MonitoringClient()
    self.__monitorLastStatsUpdate = time.time()
    self._stats = {'queries': 0, 'connections': 0}
    self._authMgr = AuthManager("%s/Authorization" % PathFinder.getServiceSection(serviceData['loadName']))
    self._transportPool = getGlobalTransportPool()
    self.__cloneId = 0
    self.__maxFD = 0

  def setCloneProcessId(self, cloneId):
    """ Record this clone's id and tag monitoring data with it """
    self.__cloneId = cloneId
    self._monitor.setComponentName("%s-Clone:%s" % (self._name, cloneId))

  def _isMetaAction(self, action):
    """ Return the concrete action an alias points to, or False if 'action'
        is already concrete (its mapped value is not itself an action key).
    """
    referedAction = Service.SVC_VALID_ACTIONS[action]
    if referedAction in Service.SVC_VALID_ACTIONS:
      return referedAction
    return False

  def initialize(self):
    """ Bootstrap the service: build the URL, load the handler, create the
        lock manager, monitoring, thread pool and message broker, run the
        handler's initialization functions and finally load the actions.
        Returns S_OK / S_ERROR.  The ordering of these steps is deliberate:
        actions are loaded only after the handler initialized itself.
    """
    # Build the URLs
    self._url = self._cfg.getURL()
    if not self._url:
      return S_ERROR("Could not build service URL for %s" % self._name)
    gLogger.verbose("Service URL is %s" % self._url)
    # Load handler
    result = self._loadHandlerInit()
    if not result['OK']:
      return result
    self._handler = result['Value']
    # Initialize lock manager
    self._lockManager = LockManager(self._cfg.getMaxWaitingPetitions())
    self._initMonitoring()
    self._threadPool = ThreadPool(1, max(0, self._cfg.getMaxThreads()), self._cfg.getMaxWaitingPetitions())
    self._threadPool.daemonize()
    self._msgBroker = MessageBroker("%sMSB" % self._name, threadPool=self._threadPool)
    # Create static dict passed to the handler and its init functions.
    self._serviceInfoDict = {'serviceName': self._name,
                             'serviceSectionPath': PathFinder.getServiceSection(self._name),
                             'URL': self._cfg.getURL(),
                             'messageSender': MessageSender(self._name, self._msgBroker),
                             'validNames': self._validNames,
                             'csPaths': [PathFinder.getServiceSection(svcName) for svcName in self._validNames]}
    # Call static initialization function(s) declared by the handler.
    try:
      self._handler['class']._rh__initializeClass(dict(self._serviceInfoDict),
                                                  self._lockManager,
                                                  self._msgBroker,
                                                  self._monitor)
      if self._handler['init']:
        for initFunc in self._handler['init']:
          gLogger.verbose("Executing initialization function")
          try:
            # A fresh copy of the info dict per call, so init functions cannot
            # corrupt the shared state.
            result = initFunc(dict(self._serviceInfoDict))
          except Exception, excp:
            gLogger.exception("Exception while calling initialization function")
            return S_ERROR("Exception while calling initialization function: %s" % str(excp))
          if not isReturnStructure(result):
            return S_ERROR("Service initialization function %s must return S_OK/S_ERROR" % initFunc)
          if not result['OK']:
            return S_ERROR("Error while initializing %s: %s" % (self._name, result['Message']))
    except Exception, e:
      errMsg = "Exception while initializing %s" % self._name
      gLogger.exception(errMsg)
      return S_ERROR(errMsg)
    # Load actions after the handler has initialized itself
    result = self._loadActions()
    if not result['OK']:
      return result
    self._actions = result['Value']
    gThreadScheduler.addPeriodicTask(30, self.__reportThreadPoolContents)
    return S_OK()
class Service:
  """ Hosts one DIRAC service: discovers and loads the request handler,
      sets up monitoring, the thread pool and the message broker, and
      exports the handler's actions.
      (Helper methods such as _discoverHandlerLocation/_loadHandler/
      _loadActions live outside this excerpt.)
  """

  # Action type -> handler-method prefix implementing it.  "Connection" is an
  # alias resolved to "Message" by _isMetaAction.
  SVC_VALID_ACTIONS = {'RPC': 'export',
                       'FileTransfer': 'transfer',
                       'Message': 'msg',
                       'Connection': 'Message'}
  # One security-log client shared by every Service instance in the process.
  SVC_SECLOG_CLIENT = SecurityLogClient()

  def __init__(self, serviceName):
    """ :param serviceName: <System>/<Service> name of the service to run """
    self._name = serviceName
    self._startTime = Time.dateTime()
    self._cfg = ServiceConfiguration(serviceName)
    self._validNames = [self._name]
    self._monitor = MonitoringClient()
    self.__monitorLastStatsUpdate = time.time()
    self._stats = {'queries': 0, 'connections': 0}
    self._authMgr = AuthManager("%s/Authorization" % self._cfg.getServicePath())
    self._transportPool = getGlobalTransportPool()
    self.__cloneId = 0

  def setCloneProcessId(self, cloneId):
    """ Record this clone's id and tag monitoring data with it """
    self.__cloneId = cloneId
    self._monitor.setComponentName("%s-Clone:%s" % (self._name, cloneId))

  def _isMetaAction(self, action):
    """ Return the concrete action an alias points to, or False if 'action'
        is already concrete (its mapped value is not itself an action key).
    """
    referedAction = Service.SVC_VALID_ACTIONS[action]
    if referedAction in Service.SVC_VALID_ACTIONS:
      return referedAction
    return False

  def initialize(self):
    """ Bootstrap the service: build the URL, discover and load the handler,
        create the lock manager, load actions, set up monitoring, the thread
        pool and the message broker, then run the handler's static
        initialization function.  Returns S_OK / S_ERROR.
    """
    # Build the URLs
    self._url = self._cfg.getURL()
    if not self._url:
      return S_ERROR("Could not build service URL for %s" % self._name)
    gLogger.verbose("Service URL is %s" % self._url)
    # Discover Handler
    self._handlerLocation = self._discoverHandlerLocation()
    if not self._handlerLocation:
      return S_ERROR("Could not find handler location for %s" % self._name)
    gLogger.verbose("Handler found at %s" % self._handlerLocation)
    # Load handler
    result = self._loadHandler()
    if not result['OK']:
      return result
    self._handler = result['Value']
    # Initialize lock manager
    self._lockManager = LockManager(self._cfg.getMaxWaitingPetitions())
    # Load actions
    result = self._loadActions()
    if not result['OK']:
      return result
    self._actions = result['Value']
    self._initMonitoring()
    self._threadPool = ThreadPool(1, max(0, self._cfg.getMaxThreads()), self._cfg.getMaxWaitingPetitions())
    self._threadPool.daemonize()
    self._msgBroker = MessageBroker("%sMSB" % self._name, threadPool=self._threadPool)
    # Create static dict passed to the handler's init function.
    self._serviceInfoDict = {'serviceName': self._name,
                             'URL': self._cfg.getURL(),
                             'systemSectionPath': self._cfg.getSystemPath(),
                             'serviceSectionPath': self._cfg.getServicePath(),
                             'messageSender': MessageSender(self._msgBroker)}
    # Call static initialization function
    try:
      if self._handler['init']:
        result = self._handler['init'](dict(self._serviceInfoDict))
        if not isReturnStructure(result):
          return S_ERROR("Service initialization function must return S_OK/S_ERROR")
        if not result['OK']:
          return S_ERROR("Error while initializing %s: %s" % (self._name, result['Message']))
    # FIX: use 'except Exception:' (Py2.6+/Py3 compatible, exception variable
    # was unused) instead of the Py2-only 'except Exception, e:' form.
    except Exception:
      # FIX: typo "intializing" -> "initializing", matching the wording used
      # by the sibling copy of this class in this file.
      errMsg = "Exception while initializing %s" % self._name
      gLogger.exception(errMsg)
      return S_ERROR(errMsg)
    gThreadScheduler.addPeriodicTask(30, self.__reportThreadPoolContents)
    return S_OK()