def initialize( self ):
  """ Agent initialisation.

  Creates the TransferDB handler, reads the agent options (job limit and
  checksum settings) and forces the DataManager shifter proxy.

  :param self: self reference
  """
  # handler used for all TransferDB lookups and updates
  self.transferDB = TransferDB()
  # configured limit of concurrent jobs per channel
  self.maxJobsPerChannel = self.am_getOption( 'MaxJobsPerChannel', self.maxJobsPerChannel )
  self.log.info( "max jobs/channel = %s" % self.maxJobsPerChannel )
  # should transfers be checksum-verified at all?
  self.cksmTest = bool( self.am_getOption( "ChecksumTest", False ) )
  if self.cksmTest:
    # normalise the configured checksum type, falling back to the default
    # for anything that is not a recognised algorithm
    self.cksmType = str( self.am_getOption( "ChecksumType", self.__defaultCksmType ) ).upper()
    if self.cksmType and self.cksmType not in ( "ADLER32", "MD5", "SHA1" ):
      self.log.warn( "unknown checksum type: %s, will use default %s" % ( self.cksmType, self.__defaultCksmType ) )
      self.cksmType = self.__defaultCksmType
  if self.cksmTest:
    self.log.info( "checksum test is %s" % ( "enabled using %s checksum" % self.cksmType ) )
  else:
    self.log.info( "checksum test is %s" % "disabled" )
  # This sets the Default Proxy to used as that defined under
  # /Operations/Shifter/DataManager
  # the shifterProxy option in the Configuration can be used to change this default.
  self.am_setOption( 'shifterProxy', 'DataManager' )
  # fresh bookkeeping dict, reset at every (re)initialisation
  self.filesBeingStaged = {}
  return S_OK()
def initialize(self):
  """ Agent initialisation: set the shifter proxy and create the DB handler. """
  # This sets the Default Proxy to used as that defined under
  # /Operations/Shifter/DataManager
  # the shifterProxy option in the Configuration can be used to change this default.
  self.am_setOption('shifterProxy', 'DataManager')
  self.TransferDB = TransferDB()
  return S_OK()
def initialize(self):
    """Agent initialisation.

    Creates the TransferDB handler, forces the DataManager shifter proxy
    and spawns the daemonized thread pool used by the monitoring loop.
    """
    self.transferDB = TransferDB()
    self.am_setOption("shifterProxy", "DataManager")
    # Negative option values are folded to their absolute value and the
    # pair is ordered so that minThreads <= maxThreads always holds.
    lo = abs(self.am_getOption("MinThreads", self.minThreads))
    hi = abs(self.am_getOption("MaxThreads", self.maxThreads))
    self.minThreads, self.maxThreads = sorted((lo, hi))
    self.log.info("ThreadPool min threads = %s" % self.minThreads)
    self.log.info("ThreadPool max threads = %s" % self.maxThreads)
    self.threadPool = ThreadPool(self.minThreads, self.maxThreads)
    self.threadPool.daemonize()
    return S_OK()
def initialize(self):
    """Agent initialisation: instantiate every client the agent works with."""
    self.section = PathFinder.getAgentSection(AGENT_NAME)
    # database clients
    self.RequestDB = RequestDBMySQL()
    self.TransferDB = TransferDB()
    # logging / storage / replication helpers
    self.DataLog = DataLoggingClient()
    self.factory = StorageFactory()
    self.rm = ReplicaManager()
    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption('shifterProxy', 'DataManager')
    return S_OK()
def transferDB(cls):
  """ TransferDB getter — lazily creates the shared instance on first use.

  :param cls: class reference
  """
  cached = cls.__transferDB
  if not cached:
    cached = cls.__transferDB = TransferDB()
  return cached
def initialize( self ):
  """ Agent initialisation.

  Creates the TransferDB handler, reads the per-channel job limit and the
  checksum options, and forces the DataManager shifter proxy.

  :param self: self reference
  """
  # handler used for all TransferDB lookups and updates
  self.transferDB = TransferDB()
  # configured limit of concurrent jobs per channel (default 2)
  self.maxJobsPerChannel = self.am_getOption( 'MaxJobsPerChannel', 2 )
  self.log.info( "max jobs/channel = %s" % self.maxJobsPerChannel )
  # should transfers be checksum-verified at all?
  self.cksmTest = bool( self.am_getOption( "ChecksumTest", False ) )
  if self.cksmTest:
    # normalise the configured checksum type, falling back to the default
    # for anything that is not a recognised algorithm
    self.cksmType = str( self.am_getOption( "ChecksumType", self.__defaultCksmType ) ).upper()
    if self.cksmType and self.cksmType not in ( "ADLER32", "MD5", "SHA1" ):
      self.log.warn( "unknown checksum type: %s, will use default %s" % ( self.cksmType, self.__defaultCksmType ) )
      self.cksmType = self.__defaultCksmType
  if self.cksmTest:
    self.log.info( "checksum test is %s" % ( "enabled using %s checksum" % self.cksmType ) )
  else:
    self.log.info( "checksum test is %s" % "disabled" )
  # This sets the Default Proxy to used as that defined under
  # /Operations/Shifter/DataManager
  # the shifterProxy option in the Configuration can be used to change this default.
  self.am_setOption( 'shifterProxy', 'DataManager' )
  return S_OK()
def transferDB(self):
  """ TransferDB facade — create the client on first access and cache it.

  :param self: self reference
  """
  cached = self.__transferDB
  if not cached:
    cached = self.__transferDB = TransferDB()
  return cached
def initializeTransferDBMonitoringHandler(serviceInfo):
  """ Handler initialisation.

  Creates the module-wide TransferDB instance used by the service.

  :param tuple serviceInfo: service info
  """
  global gTransferDB
  gTransferDB = TransferDB()
  return S_OK()
def initialize( self ):
  """ Agent initialisation: set the shifter proxy and create the DB handler. """
  # This sets the Default Proxy to used as that defined under
  # /Operations/Shifter/DataManager
  # the shifterProxy option in the Configuration can be used to change this default.
  self.am_setOption( 'shifterProxy', 'DataManager' )
  self.transferDB = TransferDB()
  return S_OK()
def initialize( self ):
  """ Agent initialisation: DB handler, per-channel job limit, shifter proxy. """
  self.TransferDB = TransferDB()
  # at most this many concurrent jobs per channel (default 2)
  self.maxJobsPerChannel = self.am_getOption( 'MaxJobsPerChannel', 2 )
  # This sets the Default Proxy to used as that defined under
  # /Operations/Shifter/DataManager
  # the shifterProxy option in the Configuration can be used to change this default.
  self.am_setOption( 'shifterProxy', 'DataManager' )
  return S_OK()
def initialize( self ):
  """ Agent initialisation.

  Sets up the TransferDB handler, the DataManager shifter proxy and a
  daemonized thread pool sized from the MinThreads/MaxThreads options.
  """
  self.transferDB = TransferDB()
  self.am_setOption( "shifterProxy", "DataManager" )
  # negative option values are folded to their absolute value and the
  # pair is ordered so that minThreads <= maxThreads always holds
  lo = abs( self.am_getOption( "MinThreads", self.minThreads ) )
  hi = abs( self.am_getOption( "MaxThreads", self.maxThreads ) )
  self.minThreads, self.maxThreads = sorted( ( lo, hi ) )
  self.log.info( "ThreadPool min threads = %s" % self.minThreads )
  self.log.info( "ThreadPool max threads = %s" % self.maxThreads )
  self.threadPool = ThreadPool( self.minThreads, self.maxThreads )
  self.threadPool.daemonize()
  return S_OK()
def initialize( self ):
  """ Agent initialisation: instantiate every client the agent works with. """
  self.section = PathFinder.getAgentSection( AGENT_NAME )
  # database clients
  self.RequestDB = RequestDBMySQL()
  self.TransferDB = TransferDB()
  # logging / storage / replication helpers
  self.DataLog = DataLoggingClient()
  self.factory = StorageFactory()
  self.rm = ReplicaManager()
  # This sets the Default Proxy to used as that defined under
  # /Operations/Shifter/DataManager
  # the shifterProxy option in the Configuration can be used to change this default.
  self.am_setOption( 'shifterProxy', 'DataManager' )
  return S_OK()
def initializeTransferDBMonitoringHandler( serviceInfo ):
  """ Handler initialisation.

  Creates the module-wide TransferDB instance and makes sure the configured
  data location exists and is a writable directory, creating it if needed.

  :param tuple serviceInfo: service info
  :return: S_OK() when the data path is usable, S_ERROR otherwise
  """
  global transferDB
  transferDB = TransferDB()
  monitoringSection = PathFinder.getServiceSection( "DataManagement/TransferDBMonitoring" )
  # get data location
  retDict = gConfig.getOption( "%s/DataLocation" % monitoringSection )
  if not retDict["OK"]:
    return retDict
  dataPath = retDict["Value"].strip()
  if not dataPath:
    # an empty option would crash the absolute-path check below (dataPath[0])
    return S_ERROR( "DataLocation option is empty" )
  if not dataPath.startswith( "/" ):
    # relative path: anchor it at the local instance path
    dataPath = os.path.realpath( "%s/%s" % ( gConfig.getValue( '/LocalSite/InstancePath', rootPath ), dataPath ) )
  gLogger.info( "Data will be written into %s path" % dataPath )
  # check the data path with guard clauses instead of a nested if-pyramid
  try:
    if not os.path.exists( dataPath ):
      # create the whole tree and report success explicitly
      # (the previous version fell through here and returned None)
      os.makedirs( dataPath )
      return S_OK()
    if not os.path.isdir( dataPath ):
      return S_ERROR( "Data path %s exists, but points to a file!" % dataPath )
    if not os.access( dataPath, os.W_OK ):
      return S_ERROR( "Data path %s exists, but it is not writable!" % dataPath )
    return S_OK()
  except ( OSError, IOError ) as anError:  # `as` syntax works on Python 2.6+ and 3.x
    return S_ERROR( str( anError ) )
class FTSMonitorAgent( AgentModule ):
  """ Agent monitoring submitted FTS requests recorded in the TransferDB
  and propagating their terminal states back to the Channel / FileToFTS tables.
  """

  def initialize( self ):
    """ agent's initialisation: TransferDB handler and shifter proxy """
    self.TransferDB = TransferDB()
    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    return S_OK()

  def execute( self ):
    """ one agent cycle: fetch all active FTS requests and monitor each of them """
    #########################################################################
    # Get the details for all active FTS requests
    gLogger.info( 'Obtaining requests to monitor' )
    res = self.TransferDB.getFTSReq()
    if not res['OK']:
      gLogger.error( "Failed to get FTS requests", res['Message'] )
      return res
    if not res['Value']:
      # nothing to do this cycle
      gLogger.info( "FTSMonitorAgent. No FTS requests found to monitor." )
      return S_OK()
    ftsReqs = res['Value']
    gLogger.info( 'Found %s FTS jobs' % len( ftsReqs ) )
    #######################################################################
    # Check them all....
    i = 1
    for ftsReqDict in ftsReqs:
      infoStr = "\n\n##################################################################################\n\n"
      infoStr = "%sStarting monitoring loop %s of %s\n\n" % ( infoStr, i, len( ftsReqs ) )
      gLogger.info( infoStr )
      # result is not checked here: errors are logged inside monitorTransfer
      # and the loop simply proceeds to the next request
      res = self.monitorTransfer( ftsReqDict )
      i += 1
    return S_OK()

  def monitorTransfer( self, ftsReqDict ):
    """ Monitors transfer obtained from TransferDB

    :param dict ftsReqDict: one FTSReq record as returned by TransferDB.getFTSReq
    """
    ftsReqID = ftsReqDict['FTSReqID']
    ftsGUID = ftsReqDict['FTSGuid']
    ftsServer = ftsReqDict['FTSServer']
    channelID = ftsReqDict['ChannelID']
    sourceSE = ftsReqDict['SourceSE']
    targetSE = ftsReqDict['TargetSE']
    # build the FTSRequest helper pointing at the original job
    oFTSRequest = FTSRequest()
    oFTSRequest.setFTSServer( ftsServer )
    oFTSRequest.setFTSGUID( ftsGUID )
    oFTSRequest.setSourceSE( sourceSE )
    oFTSRequest.setTargetSE( targetSE )
    #########################################################################
    # Perform summary update of the FTS Request and update FTSReq entries.
    gLogger.info( 'Perform summary update of the FTS Request' )
    infoStr = "Monitoring FTS Job:\n\n"
    infoStr = "%sglite-transfer-status -s %s -l %s\n" % ( infoStr, ftsServer, ftsGUID )
    infoStr = "%s%s%s\n" % ( infoStr, 'FTS GUID:'.ljust( 20 ), ftsGUID )
    infoStr = "%s%s%s\n\n" % ( infoStr, 'FTS Server:'.ljust( 20 ), ftsServer )
    gLogger.info( infoStr )
    res = oFTSRequest.summary()
    # record the monitoring attempt regardless of the summary outcome
    self.TransferDB.setFTSReqLastMonitor( ftsReqID )
    if not res['OK']:
      gLogger.error( "Failed to update the FTS request summary", res['Message'] )
      return res
    res = oFTSRequest.dumpSummary()
    if not res['OK']:
      gLogger.error( "Failed to get FTS request summary", res['Message'] )
      return res
    gLogger.info( res['Value'] )
    res = oFTSRequest.getPercentageComplete()
    if not res['OK']:
      gLogger.error( "Failed to get FTS percentage complete", res['Message'] )
      return res
    gLogger.info( 'FTS Request found to be %.1f percent complete' % res['Value'] )
    self.TransferDB.setFTSReqAttribute( ftsReqID, 'PercentageComplete', res['Value'] )
    self.TransferDB.addLoggingEvent( ftsReqID, res['Value'] )
    #########################################################################
    # Update the information in the TransferDB if the transfer is terminal.
    res = oFTSRequest.isRequestTerminal()
    if not res['OK']:
      gLogger.error( "Failed to determine whether FTS request terminal", res['Message'] )
      return res
    if not res['Value']:
      # request still running: nothing more to do until the next cycle
      return S_OK()
    gLogger.info( 'FTS Request found to be terminal, updating file states' )
    #########################################################################
    # Get the LFNS associated to the FTS request
    gLogger.info( 'Obtaining the LFNs associated to this request' )
    res = self.TransferDB.getFTSReqLFNs( ftsReqID )
    if not res['OK']:
      gLogger.error( "Failed to obtain FTS request LFNs", res['Message'] )
      return res
    files = res['Value']
    if not files:
      gLogger.error( 'No files present for transfer' )
      return S_ERROR( 'No files were found in the DB' )
    lfns = files.keys()
    gLogger.info( 'Obtained %s files' % len( lfns ) )
    # register every LFN with the request helper before detailed monitoring
    for lfn in lfns:
      oFTSRequest.setLFN( lfn )
    res = oFTSRequest.monitor()
    if not res['OK']:
      gLogger.error( "Failed to perform detailed monitoring of FTS request", res['Message'] )
      return res
    res = oFTSRequest.getFailed()
    if not res['OK']:
      gLogger.error( "Failed to obtained failed files for FTS request", res['Message'] )
      return res
    failedFiles = res['Value']
    res = oFTSRequest.getDone()
    if not res['OK']:
      gLogger.error( "Failed to obtained successful files for FTS request", res['Message'] )
      return res
    completedFiles = res['Value']
    # collect per-file updates for the FileToFTS table
    fileToFTSUpdates = []
    completedFileIDs = []
    for lfn in completedFiles:
      fileID = files[lfn]
      completedFileIDs.append( fileID )
      # transfer time defaults to 0 when it cannot be retrieved
      transferTime = 0
      res = oFTSRequest.getTransferTime( lfn )
      if res['OK']:
        transferTime = res['Value']
      fileToFTSUpdates.append( ( fileID, 'Completed', '', 0, transferTime ) )
    # failed files are either retried (transient error) or rescheduled
    # (missing source file, retrying the same channel is pointless)
    filesToRetry = []
    filesToReschedule = []
    for lfn in failedFiles:
      fileID = files[lfn]
      failReason = ''
      res = oFTSRequest.getFailReason( lfn )
      if res['OK']:
        failReason = res['Value']
      if self.missingSource( failReason ):
        gLogger.error( 'The source SURL does not exist.', '%s %s' % ( lfn, oFTSRequest.getSourceSURL( lfn ) ) )
        filesToReschedule.append( fileID )
      else:
        filesToRetry.append( fileID )
        gLogger.error( 'Failed to replicate file on channel.', "%s %s" % ( channelID, failReason ) )
      fileToFTSUpdates.append( ( fileID, 'Failed', failReason, 0, 0 ) )
    # allUpdated tracks whether every DB update succeeded; only then is the
    # request finalized and marked 'Finished' below
    allUpdated = True
    if filesToRetry:
      gLogger.info( 'Updating the Channel table for files to retry' )
      res = self.TransferDB.resetFileChannelStatus( channelID, filesToRetry )
      if not res['OK']:
        gLogger.error( 'Failed to update the Channel table for file to retry.', res['Message'] )
        allUpdated = False
    for fileID in filesToReschedule:
      gLogger.info( 'Updating the Channel table for files to reschedule' )
      res = self.TransferDB.setFileChannelStatus( channelID, fileID, 'Failed' )
      if not res['OK']:
        gLogger.error( 'Failed to update Channel table for failed files.', res['Message'] )
        allUpdated = False
    if completedFileIDs:
      gLogger.info( 'Updating the Channel table for successful files' )
      res = self.TransferDB.updateCompletedChannelStatus( channelID, completedFileIDs )
      if not res['OK']:
        gLogger.error( 'Failed to update the Channel table for successful files.', res['Message'] )
        allUpdated = False
      gLogger.info( 'Updating the Channel table for ancestors of successful files' )
      res = self.TransferDB.updateAncestorChannelStatus( channelID, completedFileIDs )
      if not res['OK']:
        gLogger.error( 'Failed to update the Channel table for ancestors of successful files.', res['Message'] )
        allUpdated = False
      # # cibak: temporary switch off of regitstration in FileToCat # #
      #gLogger.info( 'Updating the FileToCat table for successful files' )
      #res = self.TransferDB.setRegistrationWaiting( channelID, completedFileIDs )
      #if not res['OK']:
      #  gLogger.error( 'Failed to update the FileToCat table for successful files.', res['Message'] )
      #  allUpdated = False
    if fileToFTSUpdates:
      gLogger.info( 'Updating the FileToFTS table for files' )
      res = self.TransferDB.setFileToFTSFileAttributes( ftsReqID, channelID, fileToFTSUpdates )
      if not res['OK']:
        gLogger.error( 'Failed to update the FileToFTS table for files.', res['Message'] )
        allUpdated = False
    if allUpdated:
      res = oFTSRequest.finalize()
      if not res['OK']:
        gLogger.error( "Failed to perform the finalization for the FTS request", res['Message'] )
        return res
      gLogger.info( 'Adding logging event for FTS request' )
      # Now set the FTSReq status to terminal so that it is not monitored again
      res = self.TransferDB.addLoggingEvent( ftsReqID, 'Finished' )
      if not res['OK']:
        gLogger.error( 'Failed to add logging event for FTS Request', res['Message'] )
      gLogger.info( 'Updating FTS request status' )
      res = self.TransferDB.setFTSReqStatus( ftsReqID, 'Finished' )
      if not res['OK']:
        gLogger.error( 'Failed update FTS Request status', res['Message'] )
    return S_OK()

  def missingSource( self, failReason ):
    """ Check if a failure message sent by the FTS server concerns a missing
    source file.

    :param self: self reference
    :param str failReason: message sent by FTS server
    :return: 1 when the message matches a known missing-source pattern, 0 otherwise
    """
    missingSourceErrors = ['SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] Failed',
                           'SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] No such file or directory',
                           'SOURCE error during PREPARATION phase: \[INVALID_PATH\] Failed',
                           'SOURCE error during PREPARATION phase: \[INVALID_PATH\] The requested file either does not exist',
                           'TRANSFER error during TRANSFER phase: \[INVALID_PATH\] the server sent an error response: 500 500 Command failed. : open error: No such file or directory',
                           'SOURCE error during TRANSFER_PREPARATION phase: \[USER_ERROR\] source file doesnt exist']
    for error in missingSourceErrors:
      if re.search( error, failReason ):
        return 1
    return 0
class FTSMonitorAgent( AgentModule ):
  """ .. class:: FTSMonitorAgent

  Monitors FTS requests recorded in the TransferDB and propagates their
  terminal states back to the Channel / FileToFTS / FileToCat tables.
  """
  ## transfer DB handle
  transferDB = None

  def initialize( self ):
    """ agent's initialisation: TransferDB handler and shifter proxy """
    self.transferDB = TransferDB()
    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    return S_OK()

  def execute( self ):
    """ execution in one cycle: fetch and monitor all active FTS requests """
    #########################################################################
    # Get the details for all active FTS requests
    self.log.info( 'Obtaining requests to monitor' )
    res = self.transferDB.getFTSReq()
    if not res['OK']:
      self.log.error( "Failed to get FTS requests", res['Message'] )
      return res
    if not res['Value']:
      # nothing to do this cycle
      self.log.info( "FTSMonitorAgent. No FTS requests found to monitor." )
      return S_OK()
    ftsReqs = res['Value']
    self.log.info( 'Found %s FTS jobs' % len( ftsReqs ) )
    #######################################################################
    # Check them all....
    i = 1
    for ftsReqDict in ftsReqs:
      infoStr = "\n\n##################################################################################\n\n"
      infoStr = "%sStarting monitoring loop %s of %s\n\n" % ( infoStr, i, len( ftsReqs ) )
      self.log.info( infoStr )
      # result is not checked here: errors are logged inside monitorTransfer
      # and the loop simply proceeds to the next request
      res = self.monitorTransfer( ftsReqDict )
      i += 1
    return S_OK()

  def monitorTransfer( self, ftsReqDict ):
    """ monitors transfer obtained from TransferDB

    :param dict ftsReqDict: one FTSReq record as returned by TransferDB.getFTSReq
    """
    ftsReqID = ftsReqDict['FTSReqID']
    ftsGUID = ftsReqDict['FTSGuid']
    ftsServer = ftsReqDict['FTSServer']
    channelID = ftsReqDict['ChannelID']
    sourceSE = ftsReqDict['SourceSE']
    targetSE = ftsReqDict['TargetSE']
    # build the FTSRequest helper pointing at the original job
    oFTSRequest = FTSRequest()
    oFTSRequest.setFTSServer( ftsServer )
    oFTSRequest.setFTSGUID( ftsGUID )
    oFTSRequest.setSourceSE( sourceSE )
    oFTSRequest.setTargetSE( targetSE )
    #########################################################################
    # Perform summary update of the FTS Request and update FTSReq entries.
    self.log.info( 'Perform summary update of the FTS Request' )
    infoStr = "Monitoring FTS Job:\n\n"
    infoStr = "%sglite-transfer-status -s %s -l %s\n" % ( infoStr, ftsServer, ftsGUID )
    infoStr = "%s%s%s\n" % ( infoStr, 'FTS GUID:'.ljust( 20 ), ftsGUID )
    infoStr = "%s%s%s\n\n" % ( infoStr, 'FTS Server:'.ljust( 20 ), ftsServer )
    self.log.info( infoStr )
    res = oFTSRequest.summary()
    # record the monitoring attempt regardless of the summary outcome
    self.transferDB.setFTSReqLastMonitor( ftsReqID )
    if not res['OK']:
      self.log.error( "Failed to update the FTS request summary", res['Message'] )
      if "getTransferJobSummary2: Not authorised to query request" in res["Message"]:
        # the job has expired server-side: clean up the TransferDB records
        # so that the request is not monitored forever
        self.log.error("FTS job is not existing at the FTS server anymore, will clean it up on TransferDB side")
        ## get fileIDs
        fileIDs = self.transferDB.getFTSReqFileIDs( ftsReqID )
        if not fileIDs["OK"]:
          self.log.error("Unable to retrieve FileIDs associated to %s request" % ftsReqID )
          return fileIDs
        fileIDs = fileIDs["Value"]
        ## update FileToFTS table, this is just a clean up, no worry if somethings goes wrong
        for fileID in fileIDs:
          fileStatus = self.transferDB.setFileToFTSFileAttribute( ftsReqID, fileID, "Status", "Failed" )
          if not fileStatus["OK"]:
            self.log.error("Unable to set FileToFTS status to Failed for FileID %s: %s" % ( fileID, fileStatus["Message"] ) )
          failReason = self.transferDB.setFileToFTSFileAttribute( ftsReqID, fileID, "Reason", "FTS job expired on server" )
          if not failReason["OK"]:
            self.log.error("Unable to set FileToFTS reason for FileID %s: %s" % ( fileID, failReason["Message"] ) )
        ## update Channel table
        resetChannels = self.transferDB.resetFileChannelStatus( channelID, fileIDs )
        if not resetChannels["OK"]:
          self.log.error("Failed to reset Channel table for files to retry")
          return resetChannels
        ## update FTSReq table
        self.log.info( 'Setting FTS request status to Finished' )
        ftsReqStatus = self.transferDB.setFTSReqStatus( ftsReqID, 'Finished' )
        if not ftsReqStatus['OK']:
          self.log.error( 'Failed update FTS Request status', ftsReqStatus['Message'] )
          return ftsReqStatus
        ## if we land here, everything should be OK
        return S_OK()
      return res
    res = oFTSRequest.dumpSummary()
    if not res['OK']:
      self.log.error( "Failed to get FTS request summary", res['Message'] )
      return res
    self.log.info( res['Value'] )
    res = oFTSRequest.getPercentageComplete()
    if not res['OK']:
      self.log.error( "Failed to get FTS percentage complete", res['Message'] )
      return res
    self.log.info( 'FTS Request found to be %.1f percent complete' % res['Value'] )
    self.transferDB.setFTSReqAttribute( ftsReqID, 'PercentageComplete', res['Value'] )
    self.transferDB.addLoggingEvent( ftsReqID, res['Value'] )
    #########################################################################
    # Update the information in the TransferDB if the transfer is terminal.
    res = oFTSRequest.isRequestTerminal()
    if not res['OK']:
      self.log.error( "Failed to determine whether FTS request terminal", res['Message'] )
      return res
    if not res['Value']:
      # request still running: nothing more to do until the next cycle
      return S_OK()
    self.log.info( 'FTS Request found to be terminal, updating file states' )
    #########################################################################
    # Get the LFNS associated to the FTS request
    self.log.info( 'Obtaining the LFNs associated to this request' )
    res = self.transferDB.getFTSReqLFNs( ftsReqID, channelID, sourceSE )
    if not res['OK']:
      self.log.error( "Failed to obtain FTS request LFNs", res['Message'] )
      return res
    files = res['Value']
    if not files:
      self.log.error( 'No files present for transfer' )
      return S_ERROR( 'No files were found in the DB' )
    lfns = files.keys()
    self.log.info( 'Obtained %s files' % len( lfns ) )
    # register every LFN with the request helper before detailed monitoring
    for lfn in lfns:
      oFTSRequest.setLFN( lfn )
    res = oFTSRequest.monitor()
    if not res['OK']:
      self.log.error( "Failed to perform detailed monitoring of FTS request", res['Message'] )
      return res
    res = oFTSRequest.getFailed()
    if not res['OK']:
      self.log.error( "Failed to obtained failed files for FTS request", res['Message'] )
      return res
    failedFiles = res['Value']
    res = oFTSRequest.getDone()
    if not res['OK']:
      self.log.error( "Failed to obtained successful files for FTS request", res['Message'] )
      return res
    completedFiles = res['Value']
    # An LFN can be included more than once if it was entered into more than one Request.
    # FTS will only do the transfer once. We need to identify all FileIDs
    res = self.transferDB.getFTSReqFileIDs( ftsReqID )
    if not res['OK']:
      self.log.error( 'Failed to get FileIDs associated to FTS Request', res['Message'] )
      return res
    fileIDs = res['Value']
    res = self.transferDB.getAttributesForFilesList( fileIDs, ['LFN'] )
    if not res['OK']:
      self.log.error( 'Failed to get LFNs associated to FTS Request', res['Message'] )
      return res
    fileIDDict = res['Value']
    # collect per-file updates for the FileToFTS table; failed files are
    # either retried (transient error) or failed for good (missing source)
    fileToFTSUpdates = []
    completedFileIDs = []
    filesToRetry = []
    filesToFail = []
    for fileID, fileDict in fileIDDict.items():
      lfn = fileDict['LFN']
      if lfn in completedFiles:
        completedFileIDs.append( fileID )
        # transfer time defaults to 0 when it cannot be retrieved
        transferTime = 0
        res = oFTSRequest.getTransferTime( lfn )
        if res['OK']:
          transferTime = res['Value']
        fileToFTSUpdates.append( ( fileID, 'Completed', '', 0, transferTime ) )
      if lfn in failedFiles:
        failReason = ''
        res = oFTSRequest.getFailReason( lfn )
        if res['OK']:
          failReason = res['Value']
        if self.missingSource( failReason ):
          self.log.error( 'The source SURL does not exist.', '%s %s' % ( lfn, oFTSRequest.getSourceSURL( lfn ) ) )
          filesToFail.append( fileID )
        else:
          filesToRetry.append( fileID )
          self.log.error( 'Failed to replicate file on channel.', "%s %s" % ( channelID, failReason ) )
        fileToFTSUpdates.append( ( fileID, 'Failed', failReason, 0, 0 ) )
    # allUpdated tracks whether every DB update succeeded; only then is the
    # request finalized and marked 'Finished' below
    allUpdated = True
    if filesToRetry:
      self.log.info( 'Updating the Channel table for files to retry' )
      res = self.transferDB.resetFileChannelStatus( channelID, filesToRetry )
      if not res['OK']:
        self.log.error( 'Failed to update the Channel table for file to retry.', res['Message'] )
        allUpdated = False
    for fileID in filesToFail:
      self.log.info( 'Updating the Channel table for files to reschedule' )
      res = self.transferDB.setFileChannelStatus( channelID, fileID, 'Failed' )
      if not res['OK']:
        self.log.error( 'Failed to update Channel table for failed files.', res['Message'] )
        allUpdated = False
    if completedFileIDs:
      self.log.info( 'Updating the Channel table for successful files' )
      res = self.transferDB.updateCompletedChannelStatus( channelID, completedFileIDs )
      if not res['OK']:
        self.log.error( 'Failed to update the Channel table for successful files.', res['Message'] )
        allUpdated = False
      self.log.info( 'Updating the Channel table for ancestors of successful files' )
      res = self.transferDB.updateAncestorChannelStatus( channelID, completedFileIDs )
      if not res['OK']:
        self.log.error( 'Failed to update the Channel table for ancestors of successful files.', res['Message'] )
        allUpdated = False
    if fileToFTSUpdates:
      self.log.info( 'Updating the FileToFTS table for files' )
      res = self.transferDB.setFileToFTSFileAttributes( ftsReqID, channelID, fileToFTSUpdates )
      if not res['OK']:
        self.log.error( 'Failed to update the FileToFTS table for files.', res['Message'] )
        allUpdated = False
    if allUpdated:
      res = oFTSRequest.finalize()
      if not res['OK']:
        self.log.error( "Failed to perform the finalization for the FTS request", res['Message'] )
        return res
      self.log.info( 'Adding logging event for FTS request' )
      # Now set the FTSReq status to terminal so that it is not monitored again
      res = self.transferDB.addLoggingEvent( ftsReqID, 'Finished' )
      if not res['OK']:
        self.log.error( 'Failed to add logging event for FTS Request', res['Message'] )
      # sort completed files into FileToCat registration buckets
      res = oFTSRequest.getFailedRegistrations()
      failedRegistrations = res['Value']
      regFailedFileIDs = []
      regDoneFileIDs = []
      regForgetFileIDs = []
      for fileID, fileDict in fileIDDict.items():
        lfn = fileDict['LFN']
        if lfn in failedRegistrations:
          self.log.info( 'Setting failed registration in FileToCat back to Waiting', lfn )
          regFailedFileIDs.append( fileID )
          # if the LFN appears more than once, FileToCat needs to be reset only once
          del failedRegistrations[lfn]
        elif lfn in completedFiles:
          regDoneFileIDs.append( fileID )
        elif fileID in filesToFail:
          regForgetFileIDs.append( fileID )
      if regFailedFileIDs:
        res = self.transferDB.setRegistrationWaiting( channelID, regFailedFileIDs )
        if not res['OK']:
          self.log.error( 'Failed to reset entries in FileToCat', res['Message'] )
          return res
      if regDoneFileIDs:
        res = self.transferDB.setRegistrationDone( channelID, regDoneFileIDs )
        if not res['OK']:
          self.log.error( 'Failed to set entries Done in FileToCat', res['Message'] )
          return res
      if regForgetFileIDs:
        # This entries could also be set to Failed, but currently there is no method to do so.
        res = self.transferDB.setRegistrationDone( channelID, regForgetFileIDs )
        if not res['OK']:
          self.log.error( 'Failed to set entries Done in FileToCat', res['Message'] )
          return res
      self.log.info( 'Updating FTS request status' )
      res = self.transferDB.setFTSReqStatus( ftsReqID, 'Finished' )
      if not res['OK']:
        self.log.error( 'Failed update FTS Request status', res['Message'] )
    return S_OK()

  def missingSource( self, failReason ):
    """ check if message sent by FTS server is concerning missing source file

    :param self: self reference
    :param str failReason: message sent by FTS server
    :return: 1 when the message matches a known missing-source pattern, 0 otherwise
    """
    missingSourceErrors = [ 'SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] Failed',
                            'SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] No such file or directory',
                            'SOURCE error during PREPARATION phase: \[INVALID_PATH\] Failed',
                            'SOURCE error during PREPARATION phase: \[INVALID_PATH\] The requested file either does not exist',
                            'TRANSFER error during TRANSFER phase: \[INVALID_PATH\] the server sent an error response: 500 500 Command failed. : open error: No such file or directory',
                            'SOURCE error during TRANSFER_PREPARATION phase: \[USER_ERROR\] source file doesnt exist' ]
    for error in missingSourceErrors:
      if re.search( error, failReason ):
        return 1
    return 0
class FTSMonitorAgent( AgentModule ):
  """ .. class:: FTSMonitorAgent

  Monitor submitted FTS jobs.
  """
  # # transfer DB handle
  transferDB = None
  # # thread pool
  threadPool = None
  # # min threads
  minThreads = 1
  # # max threads
  maxThreads = 10
  # # missing source regexp patterns (pre-compiled once at class creation)
  missingSourceErrors = [
    re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] Failed" ),
    re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] No such file or directory" ),
    re.compile( r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] Failed" ),
    re.compile( r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] The requested file either does not exist" ),
    re.compile( r"TRANSFER error during TRANSFER phase: \[INVALID_PATH\] the server sent an error response: 500 500"\
                " Command failed. : open error: No such file or directory" ),
    re.compile( r"SOURCE error during TRANSFER_PREPARATION phase: \[USER_ERROR\] source file doesnt exist" ) ]

  def initialize( self ):
    """ agent's initialisation

    Creates the TransferDB handler, forces the DataManager shifter proxy and
    spawns the daemonized thread pool sized from MinThreads/MaxThreads.
    """
    self.transferDB = TransferDB()
    self.am_setOption( "shifterProxy", "DataManager" )
    self.minThreads = self.am_getOption( "MinThreads", self.minThreads )
    self.maxThreads = self.am_getOption( "MaxThreads", self.maxThreads )
    # fold negative values and order the pair so minThreads <= maxThreads
    minmax = ( abs( self.minThreads ), abs( self.maxThreads ) )
    self.minThreads, self.maxThreads = min( minmax ), max( minmax )
    self.log.info( "ThreadPool min threads = %s" % self.minThreads )
    self.log.info( "ThreadPool max threads = %s" % self.maxThreads )
    self.threadPool = ThreadPool( self.minThreads, self.maxThreads )
    self.threadPool.daemonize()
    return S_OK()

  def execute( self ):
    """ push jobs to the thread pool """
    self.log.info( "Obtaining requests to monitor" )
    res = self.transferDB.getFTSReq()
    if not res["OK"]:
      self.log.error( "Failed to get FTS requests", res['Message'] )
      return res
    if not res["Value"]:
      # nothing to do this cycle
      self.log.info( "No FTS requests found to monitor." )
      return S_OK()
    ftsReqs = res["Value"]
    self.log.info( "Found %s FTS jobs" % len( ftsReqs ) )
    i = 1
    for ftsJob in ftsReqs:
      # keep retrying until the job is accepted by the thread pool
      while True:
        self.log.debug( "submitting FTS Job %s FTSReqID=%s to monitor" % ( i, ftsJob["FTSReqID"] ) )
        ret = self.threadPool.generateJobAndQueueIt( self.monitorTransfer, args = ( ftsJob, ), )
        if ret["OK"]:
          i += 1
          break
        # # sleep 1 second to proceed
        time.sleep( 1 )
    # block until all queued monitoring jobs have been processed
    self.threadPool.processAllResults()
    return S_OK()

  def ftsJobExpired( self, ftsReqID, channelID ):
    """ clean up when FTS job had expired on the server side

    :param int ftsReqID: FTSReq.FTSReqID
    :param int channelID: FTSReq.ChannelID
    """
    log = gLogger.getSubLogger( "@%s" % str( ftsReqID ) )
    fileIDs = self.transferDB.getFTSReqFileIDs( ftsReqID )
    if not fileIDs["OK"]:
      log.error( "Unable to retrieve FileIDs associated to %s request" % ftsReqID )
      return fileIDs
    fileIDs = fileIDs["Value"]
    # # update FileToFTS table, this is just a clean up, no worry if somethings goes wrong
    for fileID in fileIDs:
      fileStatus = self.transferDB.setFileToFTSFileAttribute( ftsReqID, fileID, "Status", "Failed" )
      if not fileStatus["OK"]:
        log.error( "Unable to set FileToFTS status to 'Failed' for FileID %s: %s" % ( fileID, fileStatus["Message"] ) )
      failReason = self.transferDB.setFileToFTSFileAttribute( ftsReqID, fileID, "Reason", "FTS job expired on server" )
      if not failReason["OK"]:
        log.error( "Unable to set FileToFTS reason for FileID %s: %s" % ( fileID, failReason["Message"] ) )
    # # update Channel table
    resetChannels = self.transferDB.resetFileChannelStatus( channelID, fileIDs )
    if not resetChannels["OK"]:
      log.error( "Failed to reset Channel table for files to retry" )
      return resetChannels
    # # update FTSReq table
    log.info( "Setting FTS request status to 'Finished'" )
    ftsReqStatus = self.transferDB.setFTSReqStatus( ftsReqID, "Finished" )
    if not ftsReqStatus["OK"]:
      log.error( "Failed update FTS Request status", ftsReqStatus["Message"] )
      return ftsReqStatus
    # # if we land here, everything should be OK
    return S_OK()
def monitorTransfer(self, ftsReqDict):
    """Monitor one FTS transfer job obtained from TransferDB.

    :param dict ftsReqDict: FTS job dictionary (keys: FTSReqID, FTSGuid,
                            FTSServer, ChannelID, SourceSE, TargetSE)
    """
    ftsReqID = ftsReqDict.get("FTSReqID")
    ftsGUID = ftsReqDict.get("FTSGuid")
    ftsServer = ftsReqDict.get("FTSServer")
    channelID = ftsReqDict.get("ChannelID")
    sourceSE = ftsReqDict.get("SourceSE")
    targetSE = ftsReqDict.get("TargetSE")

    oFTSRequest = FTSRequest()
    oFTSRequest.setFTSServer(ftsServer)
    oFTSRequest.setFTSGUID(ftsGUID)
    oFTSRequest.setSourceSE(sourceSE)
    oFTSRequest.setTargetSE(targetSE)

    log = gLogger.getSubLogger("@%s" % str(ftsReqID))

    #########################################################################
    # Perform summary update of the FTS Request and update FTSReq entries.
    log.info("Perform summary update of the FTS Request")
    infoStr = ["glite-transfer-status -s %s -l %s" % (ftsServer, ftsGUID)]
    infoStr.append("FTS GUID: %s" % ftsGUID)
    infoStr.append("FTS Server: %s" % ftsServer)
    log.info("\n".join(infoStr))
    res = oFTSRequest.summary()
    # Record the monitoring attempt regardless of the summary outcome.
    self.transferDB.setFTSReqLastMonitor(ftsReqID)
    if not res["OK"]:
        log.error("Failed to update the FTS request summary", res["Message"])
        if "getTransferJobSummary2: Not authorised to query request" in res["Message"]:
            # The job vanished server-side: clean up our own bookkeeping.
            log.error("FTS job is not existing at the FTS server anymore, will clean it up on TransferDB side")
            cleanUp = self.ftsJobExpired(ftsReqID, channelID)
            if not cleanUp["OK"]:
                log.error(cleanUp["Message"])
            return cleanUp
        return res

    res = oFTSRequest.dumpSummary()
    if not res['OK']:
        log.error("Failed to get FTS request summary", res["Message"])
        return res
    log.info(res['Value'])

    res = oFTSRequest.getPercentageComplete()
    if not res['OK']:
        log.error("Failed to get FTS percentage complete", res["Message"])
        return res
    log.info('FTS Request found to be %.1f percent complete' % res["Value"])
    self.transferDB.setFTSReqAttribute(ftsReqID, "PercentageComplete", res["Value"])
    self.transferDB.addLoggingEvent(ftsReqID, res["Value"])

    #########################################################################
    # Update the information in the TransferDB if the transfer is terminal.
    res = oFTSRequest.isRequestTerminal()
    if not res["OK"]:
        log.error("Failed to determine whether FTS request terminal", res["Message"])
        return res
    if not res["Value"]:
        return S_OK()
    # request is terminal
    return self.terminalRequest(oFTSRequest, ftsReqID, channelID, sourceSE)


def terminalRequest(self, oFTSRequest, ftsReqID, channelID, sourceSE):
    """Process a terminal FTS job: update file states and close the request.

    :param FTSRequest oFTSRequest: FTSRequest instance
    :param int ftsReqID: FTSReq.FTSReqID
    :param int channelID: FTSReq.ChannelID
    :param str sourceSE: FTSReq.SourceSE
    """
    log = gLogger.getSubLogger("@%s" % ftsReqID)
    log.info("FTS Request found to be terminal, updating file states")

    #########################################################################
    # Get the LFNs associated to the FTS request.
    log.info("Obtaining the LFNs associated to this request")
    res = self.transferDB.getFTSReqLFNs(ftsReqID, channelID, sourceSE)
    if not res["OK"]:
        log.error("Failed to obtain FTS request LFNs", res['Message'])
        return res
    files = res["Value"]
    if not files:
        log.error("No files present for transfer")
        return S_ERROR("No files were found in the DB")
    lfns = files.keys()
    log.debug("Obtained %s files" % len(lfns))
    for lfn in lfns:
        oFTSRequest.setLFN(lfn)

    res = oFTSRequest.monitor()
    if not res["OK"]:
        log.error("Failed to perform detailed monitoring of FTS request", res["Message"])
        return res
    res = oFTSRequest.getFailed()
    if not res["OK"]:
        log.error("Failed to obtained failed files for FTS request", res["Message"])
        return res
    failedFiles = res["Value"]
    res = oFTSRequest.getDone()
    if not res["OK"]:
        log.error("Failed to obtained successful files for FTS request", res["Message"])
        return res
    completedFiles = res["Value"]

    # An LFN can be included more than once if it was entered into more than one Request.
    # FTS will only do the transfer once. We need to identify all FileIDs.
    res = self.transferDB.getFTSReqFileIDs(ftsReqID)
    if not res["OK"]:
        log.error("Failed to get FileIDs associated to FTS Request", res["Message"])
        return res
    fileIDs = res["Value"]
    res = self.transferDB.getAttributesForFilesList(fileIDs, ["LFN"])
    if not res["OK"]:
        log.error("Failed to get LFNs associated to FTS Request", res["Message"])
        return res
    fileIDDict = res["Value"]

    fileToFTSUpdates = []
    completedFileIDs = []
    filesToRetry = []
    filesToFail = []

    for fileID, fileDict in fileIDDict.items():
        lfn = fileDict['LFN']
        if lfn in completedFiles:
            completedFileIDs.append(fileID)
            transferTime = 0
            res = oFTSRequest.getTransferTime(lfn)
            if res["OK"]:
                transferTime = res["Value"]
            fileToFTSUpdates.append((fileID, "Completed", "", 0, transferTime))
        if lfn in failedFiles:
            failReason = ""
            res = oFTSRequest.getFailReason(lfn)
            if res["OK"]:
                failReason = res["Value"]
            if "Source file/user checksum mismatch" in failReason:
                # Corrupted source: never retry it.
                filesToFail.append(fileID)
                continue
            if self.missingSource(failReason):
                log.error("The source SURL does not exist.", "%s %s" % (lfn, oFTSRequest.getSourceSURL(lfn)))
                filesToFail.append(fileID)
            else:
                filesToRetry.append(fileID)
            log.error("Failed to replicate file on channel.", "%s %s" % (channelID, failReason))
            fileToFTSUpdates.append((fileID, "Failed", failReason, 0, 0))

    # Update TransferDB.FileToFTS table.
    updateFileToFTS = self.updateFileToFTS(ftsReqID, channelID,
                                           filesToRetry, filesToFail,
                                           completedFileIDs, fileToFTSUpdates)

    if updateFileToFTS["OK"] and updateFileToFTS["Value"]:
        res = oFTSRequest.finalize()
        if not res["OK"]:
            log.error("Failed to perform the finalization for the FTS request", res["Message"])
            return res

        log.info('Adding logging event for FTS request')
        # Now set the FTSReq status to terminal so that it is not monitored again.
        res = self.transferDB.addLoggingEvent(ftsReqID, 'Finished')
        if not res['OK']:
            log.error('Failed to add logging event for FTS Request', res['Message'])

        # Update TransferDB.FileToCat table.
        updateFileToCat = self.updateFileToCat(oFTSRequest, channelID, fileIDDict, completedFiles, filesToFail)
        if not updateFileToCat["OK"]:
            log.error(updateFileToCat["Message"])

        log.debug("Updating FTS request status")
        res = self.transferDB.setFTSReqStatus(ftsReqID, 'Finished')
        if not res['OK']:
            log.error('Failed update FTS Request status', res['Message'])

    return S_OK()


def updateFileToFTS(self, ftsReqID, channelID, filesToRetry, filesToFail, completedFileIDs, fileToFTSUpdates):
    """Update TransferDB tables for a finished request.

    :param int ftsReqID: FTSReq.FTSReqID
    :param int channelID: FTSReq.ChannelID
    :param list filesToRetry: FileIDs to retry
    :param list filesToFail: FileIDs for failed files
    :param list completedFileIDs: FileIDs of completed files
    :param list fileToFTSUpdates: (fileID, status, reason, retries, transferTime) tuples
    :return: S_OK(bool) -- True when every table update succeeded
    """
    log = gLogger.getSubLogger("@%s" % ftsReqID)
    allUpdated = True

    res = self.transferDB.resetFileChannelStatus(channelID, filesToRetry) if filesToRetry else S_OK()
    if not res["OK"]:
        log.error("Failed to update the Channel table for file to retry.", res["Message"])
        allUpdated = False

    for fileID in filesToFail:
        log.info("Updating the Channel table for files to reschedule")
        res = self.transferDB.setFileToReschedule(fileID)
        if not res["OK"]:
            log.error("Failed to update Channel table for failed files.", res["Message"])
            allUpdated = False
        elif res["Value"] == "max reschedule attempt reached":
            # BUGFIX: the original string had no %s placeholder, so the '%'
            # operator raised TypeError whenever this branch was reached.
            log.error("setting Channel status to 'Failed' : %s" % res["Value"])
            res = self.transferDB.setFileChannelStatus(channelID, fileID, 'Failed')
            if not res["OK"]:
                log.error("Failed to update Channel table for failed files.", res["Message"])
                allUpdated = False

    if completedFileIDs:
        res = self.transferDB.updateCompletedChannelStatus(channelID, completedFileIDs)
        if not res["OK"]:
            log.error("Failed to update the Channel table for successful files.", res["Message"])
            allUpdated = False
        res = self.transferDB.updateAncestorChannelStatus(channelID, completedFileIDs)
        if not res["OK"]:
            log.error('Failed to update the Channel table for ancestors of successful files.', res['Message'])
            allUpdated = False

    if fileToFTSUpdates:
        res = self.transferDB.setFileToFTSFileAttributes(ftsReqID, channelID, fileToFTSUpdates)
        if not res["OK"]:
            log.error("Failed to update the FileToFTS table for files.", res["Message"])
            allUpdated = False

    return S_OK(allUpdated)


def updateFileToCat(self, oFTSRequest, channelID, fileIDDict, completedFiles, filesToFail):
    """Update TransferDB.FileToCat table for a finished request.

    :param FTSRequest oFTSRequest: FTSRequest instance
    :param int channelID: FTSReq.ChannelID
    :param dict fileIDDict: fileID -> {'LFN': ...} mapping
    :param list completedFiles: LFNs transferred successfully
    :param list filesToFail: FileIDs of permanently failed files
    """
    res = oFTSRequest.getFailedRegistrations()
    # Guard against a failed call before dereferencing "Value" (was unchecked).
    if not res["OK"]:
        return res
    failedRegistrations = res["Value"]

    regFailedFileIDs = []
    regDoneFileIDs = []
    regForgetFileIDs = []
    for fileID, fileDict in fileIDDict.items():
        lfn = fileDict['LFN']
        if lfn in failedRegistrations:
            regFailedFileIDs.append(fileID)
            # if the LFN appears more than once, FileToCat needs to be reset only once
            del failedRegistrations[lfn]
        elif lfn in completedFiles:
            regDoneFileIDs.append(fileID)
        elif fileID in filesToFail:
            regForgetFileIDs.append(fileID)

    res = self.transferDB.setRegistrationWaiting(channelID, regFailedFileIDs) if regFailedFileIDs else S_OK()
    if not res["OK"]:
        res["Message"] = "Failed to reset entries in FileToCat: %s" % res["Message"]
        return res

    res = self.transferDB.setRegistrationDone(channelID, regDoneFileIDs) if regDoneFileIDs else S_OK()
    if not res["OK"]:
        res["Message"] = "Failed to set entries Done in FileToCat: %s" % res["Message"]
        return res

    # These entries could also be set to Failed, but currently there is no method to do so.
    res = self.transferDB.setRegistrationDone(channelID, regForgetFileIDs) if regForgetFileIDs else S_OK()
    if not res["OK"]:
        res["Message"] = "Failed to set entries Done in FileToCat: %s" % res["Message"]
        return res

    return S_OK()


@classmethod
def missingSource(cls, failReason):
    """Check whether the message sent by the FTS server concerns a missing source file.

    :param str failReason: message sent by FTS server
    :return: 1 if any known missing-source pattern matches, 0 otherwise
    """
    for error in cls.missingSourceErrors:
        if error.search(failReason):
            return 1
    return 0
class FTSMonitorAgent(AgentModule):
    """Agent monitoring active FTS requests recorded in TransferDB."""

    # FTS error fragments indicating the source file is missing.
    # Pre-compiled once (raw strings: the originals used '\[' in plain
    # strings, which is an invalid escape sequence on modern Python, and
    # the list was rebuilt on every call).
    MISSING_SOURCE_ERRORS = [
        re.compile(pattern) for pattern in (
            r'SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] Failed',
            r'SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] No such file or directory',
            r'SOURCE error during PREPARATION phase: \[INVALID_PATH\] Failed',
            r'SOURCE error during PREPARATION phase: \[INVALID_PATH\] The requested file either does not exist',
            r'TRANSFER error during TRANSFER phase: \[INVALID_PATH\] the server sent an error response: 500 500 Command failed. : open error: No such file or directory',
            r'SOURCE error during TRANSFER_PREPARATION phase: \[USER_ERROR\] source file doesnt exist',
        )
    ]

    def initialize(self):
        """Create the TransferDB handler and set the shifter proxy."""
        self.TransferDB = TransferDB()
        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')
        return S_OK()

    def execute(self):
        """One agent cycle: fetch all active FTS requests and monitor each."""
        #########################################################################
        # Get the details for all active FTS requests
        gLogger.info('Obtaining requests to monitor')
        res = self.TransferDB.getFTSReq()
        if not res['OK']:
            gLogger.error("Failed to get FTS requests", res['Message'])
            return res
        if not res['Value']:
            gLogger.info("FTSMonitorAgent. No FTS requests found to monitor.")
            return S_OK()
        ftsReqs = res['Value']
        gLogger.info('Found %s FTS jobs' % len(ftsReqs))

        #######################################################################
        # Check them all....
        i = 1
        for ftsReqDict in ftsReqs:
            infoStr = "\n\n##################################################################################\n\n"
            infoStr = "%sStarting monitoring loop %s of %s\n\n" % (infoStr, i, len(ftsReqs))
            gLogger.info(infoStr)
            res = self.monitorTransfer(ftsReqDict)
            i += 1
        return S_OK()

    def monitorTransfer(self, ftsReqDict):
        """Monitor one FTS transfer obtained from TransferDB.

        :param dict ftsReqDict: FTS job dictionary (FTSReqID, FTSGuid,
                                FTSServer, ChannelID, SourceSE, TargetSE)
        """
        ftsReqID = ftsReqDict['FTSReqID']
        ftsGUID = ftsReqDict['FTSGuid']
        ftsServer = ftsReqDict['FTSServer']
        channelID = ftsReqDict['ChannelID']
        sourceSE = ftsReqDict['SourceSE']
        targetSE = ftsReqDict['TargetSE']

        oFTSRequest = FTSRequest()
        oFTSRequest.setFTSServer(ftsServer)
        oFTSRequest.setFTSGUID(ftsGUID)
        oFTSRequest.setSourceSE(sourceSE)
        oFTSRequest.setTargetSE(targetSE)

        #########################################################################
        # Perform summary update of the FTS Request and update FTSReq entries.
        gLogger.info('Perform summary update of the FTS Request')
        infoStr = "Monitoring FTS Job:\n\n"
        infoStr = "%sglite-transfer-status -s %s -l %s\n" % (infoStr, ftsServer, ftsGUID)
        infoStr = "%s%s%s\n" % (infoStr, 'FTS GUID:'.ljust(20), ftsGUID)
        infoStr = "%s%s%s\n\n" % (infoStr, 'FTS Server:'.ljust(20), ftsServer)
        gLogger.info(infoStr)
        res = oFTSRequest.summary()
        # Record the monitoring attempt regardless of the summary outcome.
        self.TransferDB.setFTSReqLastMonitor(ftsReqID)
        if not res['OK']:
            gLogger.error("Failed to update the FTS request summary", res['Message'])
            return res
        res = oFTSRequest.dumpSummary()
        if not res['OK']:
            gLogger.error("Failed to get FTS request summary", res['Message'])
            return res
        gLogger.info(res['Value'])
        res = oFTSRequest.getPercentageComplete()
        if not res['OK']:
            gLogger.error("Failed to get FTS percentage complete", res['Message'])
            return res
        gLogger.info('FTS Request found to be %.1f percent complete' % res['Value'])
        self.TransferDB.setFTSReqAttribute(ftsReqID, 'PercentageComplete', res['Value'])
        self.TransferDB.addLoggingEvent(ftsReqID, res['Value'])

        #########################################################################
        # Update the information in the TransferDB if the transfer is terminal.
        res = oFTSRequest.isRequestTerminal()
        if not res['OK']:
            gLogger.error("Failed to determine whether FTS request terminal", res['Message'])
            return res
        if not res['Value']:
            return S_OK()
        gLogger.info('FTS Request found to be terminal, updating file states')

        #########################################################################
        # Get the LFNs associated to the FTS request
        gLogger.info('Obtaining the LFNs associated to this request')
        res = self.TransferDB.getFTSReqLFNs(ftsReqID)
        if not res['OK']:
            gLogger.error("Failed to obtain FTS request LFNs", res['Message'])
            return res
        files = res['Value']
        if not files:
            gLogger.error('No files present for transfer')
            return S_ERROR('No files were found in the DB')
        lfns = files.keys()
        gLogger.info('Obtained %s files' % len(lfns))
        for lfn in lfns:
            oFTSRequest.setLFN(lfn)

        res = oFTSRequest.monitor()
        if not res['OK']:
            gLogger.error("Failed to perform detailed monitoring of FTS request", res['Message'])
            return res
        res = oFTSRequest.getFailed()
        if not res['OK']:
            gLogger.error("Failed to obtained failed files for FTS request", res['Message'])
            return res
        failedFiles = res['Value']
        res = oFTSRequest.getDone()
        if not res['OK']:
            gLogger.error("Failed to obtained successful files for FTS request", res['Message'])
            return res
        completedFiles = res['Value']

        fileToFTSUpdates = []
        completedFileIDs = []
        for lfn in completedFiles:
            fileID = files[lfn]
            completedFileIDs.append(fileID)
            transferTime = 0
            res = oFTSRequest.getTransferTime(lfn)
            if res['OK']:
                transferTime = res['Value']
            fileToFTSUpdates.append((fileID, 'Completed', '', 0, transferTime))

        filesToRetry = []
        filesToReschedule = []
        for lfn in failedFiles:
            fileID = files[lfn]
            failReason = ''
            res = oFTSRequest.getFailReason(lfn)
            if res['OK']:
                failReason = res['Value']
            if self.missingSource(failReason):
                gLogger.error('The source SURL does not exist.', '%s %s' % (lfn, oFTSRequest.getSourceSURL(lfn)))
                filesToReschedule.append(fileID)
            else:
                filesToRetry.append(fileID)
                gLogger.error('Failed to replicate file on channel.', "%s %s" % (channelID, failReason))
            fileToFTSUpdates.append((fileID, 'Failed', failReason, 0, 0))

        allUpdated = True
        if filesToRetry:
            gLogger.info('Updating the Channel table for files to retry')
            res = self.TransferDB.resetFileChannelStatus(channelID, filesToRetry)
            if not res['OK']:
                gLogger.error('Failed to update the Channel table for file to retry.', res['Message'])
                allUpdated = False

        for fileID in filesToReschedule:
            gLogger.info('Updating the Channel table for files to reschedule')
            res = self.TransferDB.setFileChannelStatus(channelID, fileID, 'Failed')
            if not res['OK']:
                gLogger.error('Failed to update Channel table for failed files.', res['Message'])
                allUpdated = False

        if completedFileIDs:
            gLogger.info('Updating the Channel table for successful files')
            res = self.TransferDB.updateCompletedChannelStatus(channelID, completedFileIDs)
            if not res['OK']:
                gLogger.error('Failed to update the Channel table for successful files.', res['Message'])
                allUpdated = False
            gLogger.info('Updating the Channel table for ancestors of successful files')
            res = self.TransferDB.updateAncestorChannelStatus(channelID, completedFileIDs)
            if not res['OK']:
                gLogger.error('Failed to update the Channel table for ancestors of successful files.', res['Message'])
                allUpdated = False
            # # cibak: temporary switch off of regitstration in FileToCat # #
            # gLogger.info( 'Updating the FileToCat table for successful files' )
            # res = self.TransferDB.setRegistrationWaiting( channelID, completedFileIDs )
            # if not res['OK']:
            #     gLogger.error( 'Failed to update the FileToCat table for successful files.', res['Message'] )
            #     allUpdated = False

        if fileToFTSUpdates:
            gLogger.info('Updating the FileToFTS table for files')
            res = self.TransferDB.setFileToFTSFileAttributes(ftsReqID, channelID, fileToFTSUpdates)
            if not res['OK']:
                gLogger.error('Failed to update the FileToFTS table for files.', res['Message'])
                allUpdated = False

        if allUpdated:
            res = oFTSRequest.finalize()
            if not res['OK']:
                gLogger.error("Failed to perform the finalization for the FTS request", res['Message'])
                return res
            gLogger.info('Adding logging event for FTS request')
            # Now set the FTSReq status to terminal so that it is not monitored again
            res = self.TransferDB.addLoggingEvent(ftsReqID, 'Finished')
            if not res['OK']:
                gLogger.error('Failed to add logging event for FTS Request', res['Message'])
            gLogger.info('Updating FTS request status')
            res = self.TransferDB.setFTSReqStatus(ftsReqID, 'Finished')
            if not res['OK']:
                gLogger.error('Failed update FTS Request status', res['Message'])
        return S_OK()

    def missingSource(self, failReason):
        """Return 1 when failReason matches a known missing-source FTS error, else 0.

        :param str failReason: message sent by FTS server
        """
        for error in self.MISSING_SOURCE_ERRORS:
            if error.search(failReason):
                return 1
        return 0
class FTSSubmitAgent(AgentModule):
    """Agent submitting FTS jobs for eligible channels read from TransferDB."""

    def initialize(self):
        """Create the TransferDB handler and read agent options."""
        self.TransferDB = TransferDB()
        self.maxJobsPerChannel = self.am_getOption('MaxJobsPerChannel', 2)
        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')
        return S_OK()

    def execute(self):
        """One agent cycle: select eligible channels and submit one job each."""
        #########################################################################
        # Obtain the eligible channels for submission.
        gLogger.info('Obtaining channels eligible for submission.')
        res = self.TransferDB.selectChannelsForSubmission(self.maxJobsPerChannel)
        if not res['OK']:
            gLogger.error("Failed to retrieve channels for submission.", res['Message'])
            return S_OK()
        elif not res['Value']:
            gLogger.info("FTSSubmitAgent. No channels eligible for submission.")
            return S_OK()
        channelDicts = res['Value']
        gLogger.info('Found %s eligible channels.' % len(channelDicts))

        #########################################################################
        # Submit to all the eligible waiting channels.
        i = 1
        for channelDict in channelDicts:
            infoStr = "\n\n##################################################################################\n\n"
            infoStr = "%sStarting submission loop %s of %s\n\n" % (infoStr, i, len(channelDicts))
            gLogger.info(infoStr)
            res = self.submitTransfer(channelDict)
            i += 1
        return S_OK()

    def submitTransfer(self, channelDict):
        """Create and submit one FTS job based on information from the DB.

        :param dict channelDict: channel info as read from selectChannelsForSubmission
        """
        # Create the FTSRequest object for preparing the submission
        oFTSRequest = FTSRequest()
        channelID = channelDict['ChannelID']
        filesPerJob = channelDict['NumFiles']

        #########################################################################
        # Obtain the first files in the selected channel.
        gLogger.info("FTSSubmitAgent.submitTransfer: Attempting to obtain files to transfer on channel %s" % channelID)
        res = self.TransferDB.getFilesForChannel(channelID, 2 * filesPerJob)
        if not res['OK']:
            errStr = 'FTSSubmitAgent.%s' % res['Message']
            gLogger.error(errStr)
            return S_OK()
        if not res['Value']:
            # BUGFIX: message typo ("No files to found") corrected.
            gLogger.info("FTSSubmitAgent.submitTransfer: No files found for channel.")
            return S_OK()
        filesDict = res['Value']
        gLogger.info('Obtained %s files for channel' % len(filesDict['Files']))
        sourceSE = filesDict['SourceSE']
        oFTSRequest.setSourceSE(sourceSE)
        targetSE = filesDict['TargetSE']
        oFTSRequest.setTargetSE(targetSE)
        gLogger.info("FTSSubmitAgent.submitTransfer: Attempting to obtain files for %s to %s channel." % (sourceSE, targetSE))
        files = filesDict['Files']

        #########################################################################
        # Populate the FTS Request with the files.
        gLogger.info('Populating the FTS request with file information')
        fileIDs = []
        totalSize = 0
        fileIDSizes = {}
        # 'fileInfo' instead of 'file' to avoid shadowing the builtin.
        for fileInfo in files:
            lfn = fileInfo['LFN']
            oFTSRequest.setLFN(lfn)
            oFTSRequest.setSourceSURL(lfn, fileInfo['SourceSURL'])
            oFTSRequest.setTargetSURL(lfn, fileInfo['TargetSURL'])
            fileID = fileInfo['FileID']
            fileIDs.append(fileID)
            totalSize += fileInfo['Size']
            fileIDSizes[fileID] = fileInfo['Size']

        #########################################################################
        # Submit the FTS request and retrieve the FTS GUID/Server
        gLogger.info('Submitting the FTS request')
        res = oFTSRequest.submit()
        if not res['OK']:
            errStr = "FTSSubmitAgent.%s" % res['Message']
            gLogger.error(errStr)
            # Put the files back into a schedulable state before giving up.
            gLogger.info('Updating the Channel table for files to retry')
            res = self.TransferDB.resetFileChannelStatus(channelID, fileIDs)
            if not res['OK']:
                gLogger.error('Failed to update the Channel table for file to retry.', res['Message'])
            return S_ERROR(errStr)
        ftsGUID = res['Value']['ftsGUID']
        ftsServer = res['Value']['ftsServer']
        infoStr = """Submitted FTS Job:

            FTS Guid: %s
            FTS Server: %s
            ChannelID: %s
            SourceSE: %s
            TargetSE: %s
            Files: %s

""" % (ftsGUID, ftsServer, str(channelID), sourceSE, targetSE, str(len(files)))
        gLogger.info(infoStr)

        #########################################################################
        # Insert the FTS Req details and add the number of files and size
        res = self.TransferDB.insertFTSReq(ftsGUID, ftsServer, channelID)
        if not res['OK']:
            errStr = "FTSSubmitAgent.%s" % res['Message']
            gLogger.error(errStr)
            return S_ERROR(errStr)
        ftsReqID = res['Value']
        gLogger.info('Obtained FTS RequestID %s' % ftsReqID)
        res = self.TransferDB.setFTSReqAttribute(ftsReqID, 'SourceSE', sourceSE)
        if not res['OK']:
            gLogger.error("Failed to set SourceSE for FTSRequest", res['Message'])
        res = self.TransferDB.setFTSReqAttribute(ftsReqID, 'TargetSE', targetSE)
        if not res['OK']:
            gLogger.error("Failed to set TargetSE for FTSRequest", res['Message'])
        res = self.TransferDB.setFTSReqAttribute(ftsReqID, 'NumberOfFiles', len(fileIDs))
        if not res['OK']:
            gLogger.error("Failed to set NumberOfFiles for FTSRequest", res['Message'])
        res = self.TransferDB.setFTSReqAttribute(ftsReqID, 'TotalSize', totalSize)
        if not res['OK']:
            gLogger.error("Failed to set TotalSize for FTSRequest", res['Message'])

        #########################################################################
        # Insert the submission event in the FTSReqLogging table
        event = 'Submitted'
        res = self.TransferDB.addLoggingEvent(ftsReqID, event)
        if not res['OK']:
            errStr = "FTSSubmitAgent.%s" % res['Message']
            gLogger.error(errStr)

        #########################################################################
        # Insert the FileToFTS details and remove the files from the channel
        gLogger.info('Setting the files as Executing in the Channel table')
        res = self.TransferDB.setChannelFilesExecuting(channelID, fileIDs)
        if not res['OK']:
            gLogger.error('Failed to update the Channel tables for files.', res['Message'])
        # NOTE: the original also accumulated an 'lfns' list here that was
        # never used; it has been removed.
        fileToFTSFileAttributes = [(fileInfo['FileID'], fileIDSizes[fileInfo['FileID']])
                                   for fileInfo in files]
        gLogger.info('Populating the FileToFTS table with file information')
        res = self.TransferDB.setFTSReqFiles(ftsReqID, channelID, fileToFTSFileAttributes)
        if not res['OK']:
            gLogger.error('Failed to populate the FileToFTS table with files.')
        # BUGFIX: the method previously fell off the end returning None while
        # every other exit point returns an S_OK/S_ERROR structure.
        return S_OK()
def setUp(self):
    """Prepare the test fixture: fresh DB handlers for every test case."""
    self.TransferDB = TransferDB()
    self.RequestDB = RequestDB('mysql')
class FTSSubmitAgent(AgentModule):
    """Agent submitting FTS jobs for eligible channels read from TransferDB."""

    def initialize(self):
        """Create the TransferDB handler and read agent options."""
        self.TransferDB = TransferDB()
        self.maxJobsPerChannel = self.am_getOption('MaxJobsPerChannel', 2)
        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')
        return S_OK()

    def execute(self):
        """One agent cycle: select eligible channels and submit one job each."""
        #########################################################################
        # Obtain the eligible channels for submission.
        gLogger.info('Obtaining channels eligible for submission.')
        res = self.TransferDB.selectChannelsForSubmission(self.maxJobsPerChannel)
        if not res['OK']:
            gLogger.error("Failed to retrieve channels for submission.", res['Message'])
            return S_OK()
        elif not res['Value']:
            gLogger.info("FTSSubmitAgent. No channels eligible for submission.")
            return S_OK()
        channelDicts = res['Value']
        gLogger.info('Found %s eligible channels.' % len(channelDicts))

        #########################################################################
        # Submit to all the eligible waiting channels.
        i = 1
        for channelDict in channelDicts:
            infoStr = "\n\n##################################################################################\n\n"
            infoStr = "%sStarting submission loop %s of %s\n\n" % (infoStr, i, len(channelDicts))
            gLogger.info(infoStr)
            res = self.submitTransfer(channelDict)
            i += 1
        return S_OK()

    def submitTransfer(self, channelDict):
        """Create and submit one FTS job based on information from the DB.

        :param dict channelDict: channel info as read from selectChannelsForSubmission
        """
        # Create the FTSRequest object for preparing the submission
        oFTSRequest = FTSRequest()
        channelID = channelDict['ChannelID']
        filesPerJob = channelDict['NumFiles']

        #########################################################################
        # Obtain the first files in the selected channel.
        gLogger.info("FTSSubmitAgent.submitTransfer: Attempting to obtain files to transfer on channel %s" % channelID)
        res = self.TransferDB.getFilesForChannel(channelID, 2 * filesPerJob)
        if not res['OK']:
            errStr = 'FTSSubmitAgent.%s' % res['Message']
            gLogger.error(errStr)
            return S_OK()
        if not res['Value']:
            # BUGFIX: message typo ("No files to found") corrected.
            gLogger.info("FTSSubmitAgent.submitTransfer: No files found for channel.")
            return S_OK()
        filesDict = res['Value']
        gLogger.info('Obtained %s files for channel' % len(filesDict['Files']))
        sourceSE = filesDict['SourceSE']
        oFTSRequest.setSourceSE(sourceSE)
        targetSE = filesDict['TargetSE']
        oFTSRequest.setTargetSE(targetSE)
        gLogger.info("FTSSubmitAgent.submitTransfer: Attempting to obtain files for %s to %s channel." % (sourceSE, targetSE))
        files = filesDict['Files']

        #########################################################################
        # Populate the FTS Request with the files.
        gLogger.info('Populating the FTS request with file information')
        fileIDs = []
        totalSize = 0
        fileIDSizes = {}
        # 'fileInfo' instead of 'file' to avoid shadowing the builtin.
        for fileInfo in files:
            lfn = fileInfo['LFN']
            oFTSRequest.setLFN(lfn)
            oFTSRequest.setSourceSURL(lfn, fileInfo['SourceSURL'])
            oFTSRequest.setTargetSURL(lfn, fileInfo['TargetSURL'])
            fileID = fileInfo['FileID']
            fileIDs.append(fileID)
            totalSize += fileInfo['Size']
            fileIDSizes[fileID] = fileInfo['Size']

        #########################################################################
        # Submit the FTS request and retrieve the FTS GUID/Server
        gLogger.info('Submitting the FTS request')
        res = oFTSRequest.submit()
        if not res['OK']:
            errStr = "FTSSubmitAgent.%s" % res['Message']
            gLogger.error(errStr)
            # Put the files back into a schedulable state before giving up.
            gLogger.info('Updating the Channel table for files to retry')
            res = self.TransferDB.resetFileChannelStatus(channelID, fileIDs)
            if not res['OK']:
                gLogger.error('Failed to update the Channel table for file to retry.', res['Message'])
            return S_ERROR(errStr)
        ftsGUID = res['Value']['ftsGUID']
        ftsServer = res['Value']['ftsServer']
        infoStr = """Submitted FTS Job:

            FTS Guid: %s
            FTS Server: %s
            ChannelID: %s
            SourceSE: %s
            TargetSE: %s
            Files: %s

""" % (ftsGUID, ftsServer, str(channelID), sourceSE, targetSE, str(len(files)))
        gLogger.info(infoStr)

        #########################################################################
        # Insert the FTS Req details and add the number of files and size
        res = self.TransferDB.insertFTSReq(ftsGUID, ftsServer, channelID)
        if not res['OK']:
            errStr = "FTSSubmitAgent.%s" % res['Message']
            gLogger.error(errStr)
            return S_ERROR(errStr)
        ftsReqID = res['Value']
        gLogger.info('Obtained FTS RequestID %s' % ftsReqID)
        res = self.TransferDB.setFTSReqAttribute(ftsReqID, 'SourceSE', sourceSE)
        if not res['OK']:
            gLogger.error("Failed to set SourceSE for FTSRequest", res['Message'])
        res = self.TransferDB.setFTSReqAttribute(ftsReqID, 'TargetSE', targetSE)
        if not res['OK']:
            gLogger.error("Failed to set TargetSE for FTSRequest", res['Message'])
        res = self.TransferDB.setFTSReqAttribute(ftsReqID, 'NumberOfFiles', len(fileIDs))
        if not res['OK']:
            gLogger.error("Failed to set NumberOfFiles for FTSRequest", res['Message'])
        res = self.TransferDB.setFTSReqAttribute(ftsReqID, 'TotalSize', totalSize)
        if not res['OK']:
            gLogger.error("Failed to set TotalSize for FTSRequest", res['Message'])

        #########################################################################
        # Insert the submission event in the FTSReqLogging table
        event = 'Submitted'
        res = self.TransferDB.addLoggingEvent(ftsReqID, event)
        if not res['OK']:
            errStr = "FTSSubmitAgent.%s" % res['Message']
            gLogger.error(errStr)

        #########################################################################
        # Insert the FileToFTS details and remove the files from the channel
        gLogger.info('Setting the files as Executing in the Channel table')
        res = self.TransferDB.setChannelFilesExecuting(channelID, fileIDs)
        if not res['OK']:
            gLogger.error('Failed to update the Channel tables for files.', res['Message'])
        # NOTE: the original also accumulated an 'lfns' list here that was
        # never used; it has been removed.
        fileToFTSFileAttributes = [(fileInfo['FileID'], fileIDSizes[fileInfo['FileID']])
                                   for fileInfo in files]
        gLogger.info('Populating the FileToFTS table with file information')
        res = self.TransferDB.setFTSReqFiles(ftsReqID, channelID, fileToFTSFileAttributes)
        if not res['OK']:
            gLogger.error('Failed to populate the FileToFTS table with files.')
        # BUGFIX: the method previously fell off the end returning None while
        # every other exit point returns an S_OK/S_ERROR structure.
        return S_OK()
class FTSSubmitAgent( AgentModule ):
  """ .. class:: FTSSubmitAgent

  This class is submitting previously scheduled files to the FTS system using helper class FTSRequest.

  Files to be transferred are read from TransferDB.Channel table, only those with Status = 'Waiting'.
  After submission TransferDB.Channel.Status is set to 'Executing'. The rest of state propagation is
  done in FTSMonitorAgent.

  An information about newly created FTS request is hold in TransferDB.FTSReq (request itself) table
  and TransferDB.FileToFTS (files in request) and TransferDB.FileToCat (files to be registered,
  for failover only).
  """
  ## placeholder for TransferDB reference
  transferDB = None
  ## placeholder for max jobs per channel (also the config default)
  maxJobsPerChannel = 2
  ## placeholder for checksum test option flag
  cksmTest = False
  ## placeholder for checksum type
  cksmType = ""
  ## default checksum type
  __defaultCksmType = "ADLER32"

  def initialize( self ):
    """ agent's initialisation

    :param self: self reference
    """
    ## save transferDB handler
    self.transferDB = TransferDB()
    ## read config options
    # FIX: use the class-level default instead of a duplicated hard-coded 2,
    # so changing maxJobsPerChannel in one place is enough (matches the other agent copy)
    self.maxJobsPerChannel = self.am_getOption( 'MaxJobsPerChannel', self.maxJobsPerChannel )
    self.log.info( "max jobs/channel = %s" % self.maxJobsPerChannel )
    ## checksum test
    self.cksmTest = bool( self.am_getOption( "ChecksumTest", False ) )
    ## checksum type -- only relevant when the test is enabled
    if self.cksmTest:
      self.cksmType = str( self.am_getOption( "ChecksumType", self.__defaultCksmType ) ).upper()
      if self.cksmType and self.cksmType not in ( "ADLER32", "MD5", "SHA1" ):
        self.log.warn( "unknown checksum type: %s, will use default %s" % ( self.cksmType,
                                                                            self.__defaultCksmType ) )
        self.cksmType = self.__defaultCksmType
    self.log.info( "checksum test is %s" % ( { True : "enabled using %s checksum" % self.cksmType,
                                               False : "disabled"}[self.cksmTest] ) )
    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    return S_OK()

  def execute( self ):
    """ execution in one agent's cycle

    Selects eligible channels and submits one FTS job per channel.

    :param self: self reference
    """
    #########################################################################
    #  Obtain the eligible channels for submission.
    self.log.info( 'Obtaining channels eligible for submission.' )
    res = self.transferDB.selectChannelsForSubmission( self.maxJobsPerChannel )
    if not res['OK']:
      self.log.error( "Failed to retrieve channels for submission.", res['Message'] )
      return S_OK()
    elif not res['Value']:
      self.log.info( "FTSSubmitAgent. No channels eligible for submission." )
      return S_OK()
    channelDicts = res['Value']
    self.log.info( 'Found %s eligible channels.' % len( channelDicts ) )

    #########################################################################
    #  Submit to all the eligible waiting channels.
    i = 1
    for channelDict in channelDicts:
      infoStr = "\n\n##################################################################################\n\n"
      infoStr = "%sStarting submission loop %s of %s\n\n" % ( infoStr, i, len( channelDicts ) )
      self.log.info( infoStr )
      res = self.submitTransfer( channelDict )
      i += 1
    return S_OK()

  def submitTransfer( self, channelDict ):
    """ create and submit FTS jobs based on information it gets from the DB

    :param self: self reference
    :param dict channelDict: dict with channel info as read from TransferDB.selectChannelsForSubmission
    """
    # Create the FTSRequest object for preparing the submission
    oFTSRequest = FTSRequest()
    channelID = channelDict['ChannelID']
    filesPerJob = channelDict['NumFiles']

    #########################################################################
    #  Obtain the first files in the selected channel.
    self.log.info( "FTSSubmitAgent.submitTransfer: Attempting to obtain files to transfer on channel %s" % channelID )
    res = self.transferDB.getFilesForChannel( channelID, 2 * filesPerJob )
    if not res['OK']:
      errStr = 'FTSSubmitAgent.%s' % res['Message']
      self.log.error( errStr )
      return S_OK()
    if not res['Value']:
      # FIX: log message typo ("No files to found")
      self.log.info( "FTSSubmitAgent.submitTransfer: No files found for channel." )
      return S_OK()
    filesDict = res['Value']
    self.log.info( 'Obtained %s files for channel' % len( filesDict['Files'] ) )
    sourceSE = filesDict['SourceSE']
    oFTSRequest.setSourceSE( sourceSE )
    targetSE = filesDict['TargetSE']
    oFTSRequest.setTargetSE( targetSE )
    self.log.info( "FTSSubmitAgent.submitTransfer: Attempting to obtain files for %s to %s channel." % ( sourceSE,
                                                                                                         targetSE ) )
    files = filesDict['Files']
    ## enable/disable cksm test
    oFTSRequest.setCksmTest( self.cksmTest )
    if self.cksmType:
      oFTSRequest.setCksmType( self.cksmType )

    #########################################################################
    #  Populate the FTS Request with the files.
    self.log.info( 'Populating the FTS request with file information' )
    fileIDs = []
    totalSize = 0
    fileIDSizes = {}
    for fileMeta in files:
      lfn = fileMeta['LFN']
      oFTSRequest.setLFN( lfn )
      oFTSRequest.setSourceSURL( lfn, fileMeta['SourceSURL'] )
      oFTSRequest.setTargetSURL( lfn, fileMeta['TargetSURL'] )
      fileID = fileMeta['FileID']
      fileIDs.append( fileID )
      totalSize += fileMeta['Size']
      fileIDSizes[fileID] = fileMeta['Size']

    #########################################################################
    #  Submit the FTS request and retrieve the FTS GUID/Server
    self.log.info( 'Submitting the FTS request' )
    res = oFTSRequest.submit()
    if not res['OK']:
      errStr = "FTSSubmitAgent.%s" % res['Message']
      self.log.error( errStr )
      # on submission failure the files go back to 'Waiting' so a later cycle retries them
      self.log.info( 'Updating the Channel table for files to retry' )
      res = self.transferDB.resetFileChannelStatus( channelID, fileIDs )
      if not res['OK']:
        self.log.error( 'Failed to update the Channel table for file to retry.', res['Message'] )
      return S_ERROR( errStr )
    ftsGUID = res['Value']['ftsGUID']
    ftsServer = res['Value']['ftsServer']
    infoStr = """Submitted FTS Job:

              FTS Guid: %s
              FTS Server: %s
              ChannelID: %s
              SourceSE: %s
              TargetSE: %s
              Files: %s

""" % ( ftsGUID, ftsServer, str( channelID ), sourceSE, targetSE, str( len( files ) ) )
    self.log.info( infoStr )
    ## filter out skipped files -- files FTS refused are not part of the job
    failedFiles = oFTSRequest.getFailed()
    if not failedFiles["OK"]:
      self.log.warn( "Unable to read skipped LFNs." )
    failedFiles = failedFiles["Value"] if "Value" in failedFiles else []
    failedIDs = [ meta["FileID"] for meta in files if meta["LFN"] in failedFiles ]
    ## only submitted
    fileIDs = [ fileID for fileID in fileIDs if fileID not in failedIDs ]
    ## subtract failed from total size
    totalSize -= sum( [ meta["Size"] for meta in files if meta["LFN"] in failedFiles ] )

    #########################################################################
    #  Insert the FTS Req details and add the number of files and size
    res = self.transferDB.insertFTSReq( ftsGUID, ftsServer, channelID )
    if not res['OK']:
      errStr = "FTSSubmitAgent.%s" % res['Message']
      self.log.error( errStr )
      return S_ERROR( errStr )
    ftsReqID = res['Value']
    self.log.info( 'Obtained FTS RequestID %s' % ftsReqID )
    # attribute updates below are best-effort: failures are logged but do not abort
    res = self.transferDB.setFTSReqAttribute( ftsReqID, 'SourceSE', sourceSE )
    if not res['OK']:
      self.log.error( "Failed to set SourceSE for FTSRequest", res['Message'] )
    res = self.transferDB.setFTSReqAttribute( ftsReqID, 'TargetSE', targetSE )
    if not res['OK']:
      self.log.error( "Failed to set TargetSE for FTSRequest", res['Message'] )
    res = self.transferDB.setFTSReqAttribute( ftsReqID, 'NumberOfFiles', len( fileIDs ) )
    if not res['OK']:
      self.log.error( "Failed to set NumberOfFiles for FTSRequest", res['Message'] )
    res = self.transferDB.setFTSReqAttribute( ftsReqID, 'TotalSize', totalSize )
    if not res['OK']:
      self.log.error( "Failed to set TotalSize for FTSRequest", res['Message'] )

    #########################################################################
    # Insert the submission event in the FTSReqLogging table
    event = 'Submitted'
    res = self.transferDB.addLoggingEvent( ftsReqID, event )
    if not res['OK']:
      errStr = "FTSSubmitAgent.%s" % res['Message']
      self.log.error( errStr )

    #########################################################################
    # Insert the FileToFTS details and remove the files from the channel
    self.log.info( 'Setting the files as Executing in the Channel table' )
    res = self.transferDB.setChannelFilesExecuting( channelID, fileIDs )
    if not res['OK']:
      self.log.error( 'Failed to update the Channel tables for files.', res['Message'] )
    lfns = []
    fileToFTSFileAttributes = []
    for fileMeta in files:
      lfn = fileMeta['LFN']
      fileID = fileMeta['FileID']
      lfns.append( lfn )
      fileToFTSFileAttributes.append( ( fileID, fileIDSizes[fileID] ) )
    self.log.info( 'Populating the FileToFTS table with file information' )
    res = self.transferDB.setFTSReqFiles( ftsReqID, channelID, fileToFTSFileAttributes )
    if not res['OK']:
      self.log.error( 'Failed to populate the FileToFTS table with files.' )
def setUp(self):
    """Test fixture hook: attach a fresh TransferDB handle before each test."""
    self.TransferDB = TransferDB()
class FTSMonitorAgent(AgentModule):
    """ .. class:: FTSMonitorAgent

    Monitor submitted FTS jobs: each FTSReq record found in TransferDB is
    pushed onto a thread pool and its state is propagated back to the
    FileToFTS, Channel, FileToCat and FTSReq tables.
    """
    # # transfer DB handle
    transferDB = None
    # # thread pool
    threadPool = None
    # # min threads
    minThreads = 1
    # # max threads
    maxThreads = 10

    # # missing source regexp patterns
    missingSourceErrors = [
        re.compile(r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] Failed"),
        re.compile(r"SOURCE error during TRANSFER_PREPARATION phase: \[INVALID_PATH\] No such file or directory"),
        re.compile(r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] Failed"),
        re.compile(r"SOURCE error during PREPARATION phase: \[INVALID_PATH\] The requested file either does not exist"),
        re.compile(r"TRANSFER error during TRANSFER phase: \[INVALID_PATH\] the server sent an error response: 500 500"
                   " Command failed. : open error: No such file or directory"),
        re.compile(r"SOURCE error during TRANSFER_PREPARATION phase: \[USER_ERROR\] source file doesnt exist")]

    def initialize(self):
        """ agent's initialisation """
        self.transferDB = TransferDB()
        self.am_setOption("shifterProxy", "DataManager")
        self.minThreads = self.am_getOption("MinThreads", self.minThreads)
        self.maxThreads = self.am_getOption("MaxThreads", self.maxThreads)
        # normalise config values: take absolute values and keep min <= max
        minmax = (abs(self.minThreads), abs(self.maxThreads))
        self.minThreads, self.maxThreads = min(minmax), max(minmax)
        self.log.info("ThreadPool min threads = %s" % self.minThreads)
        self.log.info("ThreadPool max threads = %s" % self.maxThreads)
        self.threadPool = ThreadPool(self.minThreads, self.maxThreads)
        self.threadPool.daemonize()
        return S_OK()

    def execute(self):
        """ push jobs to the thread pool """
        self.log.info("Obtaining requests to monitor")
        res = self.transferDB.getFTSReq()
        if not res["OK"]:
            self.log.error("Failed to get FTS requests", res['Message'])
            return res
        if not res["Value"]:
            self.log.info("No FTS requests found to monitor.")
            return S_OK()
        ftsReqs = res["Value"]
        self.log.info("Found %s FTS jobs" % len(ftsReqs))
        i = 1
        for ftsJob in ftsReqs:
            # retry queueing until the pool accepts the job
            while True:
                self.log.debug("submitting FTS Job %s FTSReqID=%s to monitor" % (i, ftsJob["FTSReqID"]))
                ret = self.threadPool.generateJobAndQueueIt(self.monitorTransfer, args=(ftsJob, ), )
                if ret["OK"]:
                    i += 1
                    break
                # # sleep 1 second to proceed
                time.sleep(1)
        self.threadPool.processAllResults()
        return S_OK()

    def ftsJobExpired(self, ftsReqID, channelID):
        """ clean up when FTS job had expired on the server side

        :param int ftsReqID: FTSReq.FTSReqID
        :param int channelID: FTSReq.ChannelID
        """
        log = gLogger.getSubLogger("@%s" % str(ftsReqID))
        fileIDs = self.transferDB.getFTSReqFileIDs(ftsReqID)
        if not fileIDs["OK"]:
            log.error("Unable to retrieve FileIDs associated to %s request" % ftsReqID)
            return fileIDs
        fileIDs = fileIDs["Value"]

        # # update FileToFTS table, this is just a clean up, no worry if somethings goes wrong
        for fileID in fileIDs:
            fileStatus = self.transferDB.setFileToFTSFileAttribute(ftsReqID, fileID,
                                                                   "Status", "Failed")
            if not fileStatus["OK"]:
                log.error("Unable to set FileToFTS status to 'Failed' for FileID %s: %s" %
                          (fileID, fileStatus["Message"]))

            failReason = self.transferDB.setFileToFTSFileAttribute(ftsReqID, fileID,
                                                                   "Reason", "FTS job expired on server")
            if not failReason["OK"]:
                log.error("Unable to set FileToFTS reason for FileID %s: %s" %
                          (fileID, failReason["Message"]))
        # # update Channel table
        resetChannels = self.transferDB.resetFileChannelStatus(channelID, fileIDs)
        if not resetChannels["OK"]:
            log.error("Failed to reset Channel table for files to retry")
            return resetChannels

        # # update FTSReq table
        log.info("Setting FTS request status to 'Finished'")
        ftsReqStatus = self.transferDB.setFTSReqStatus(ftsReqID, "Finished")
        if not ftsReqStatus["OK"]:
            log.error("Failed update FTS Request status", ftsReqStatus["Message"])
            return ftsReqStatus

        # # if we land here, everything should be OK
        return S_OK()

    def monitorTransfer(self, ftsReqDict):
        """ monitors transfer obtained from TransferDB

        :param dict ftsReqDict: FTS job dictionary
        """
        ftsReqID = ftsReqDict.get("FTSReqID")
        ftsGUID = ftsReqDict.get("FTSGuid")
        ftsServer = ftsReqDict.get("FTSServer")
        channelID = ftsReqDict.get("ChannelID")
        sourceSE = ftsReqDict.get("SourceSE")
        targetSE = ftsReqDict.get("TargetSE")

        oFTSRequest = FTSRequest()
        oFTSRequest.setFTSServer(ftsServer)
        oFTSRequest.setFTSGUID(ftsGUID)
        oFTSRequest.setSourceSE(sourceSE)
        oFTSRequest.setTargetSE(targetSE)

        log = gLogger.getSubLogger("@%s" % str(ftsReqID))

        #########################################################################
        # Perform summary update of the FTS Request and update FTSReq entries.
        log.info("Perform summary update of the FTS Request")
        infoStr = ["glite-transfer-status -s %s -l %s" % (ftsServer, ftsGUID)]
        infoStr.append("FTS GUID: %s" % ftsGUID)
        infoStr.append("FTS Server: %s" % ftsServer)
        log.info("\n".join(infoStr))
        res = oFTSRequest.summary()
        # record the monitoring attempt regardless of the summary outcome
        self.transferDB.setFTSReqLastMonitor(ftsReqID)
        if not res["OK"]:
            log.error("Failed to update the FTS request summary", res["Message"])
            if "getTransferJobSummary2: Not authorised to query request" in res["Message"]:
                log.error("FTS job is not existing at the FTS server anymore, will clean it up on TransferDB side")
                cleanUp = self.ftsJobExpired(ftsReqID, channelID)
                if not cleanUp["OK"]:
                    log.error(cleanUp["Message"])
                return cleanUp
            return res

        res = oFTSRequest.dumpSummary()
        if not res['OK']:
            log.error("Failed to get FTS request summary", res["Message"])
            return res
        log.info(res['Value'])
        res = oFTSRequest.getPercentageComplete()
        if not res['OK']:
            log.error("Failed to get FTS percentage complete", res["Message"])
            return res
        log.info('FTS Request found to be %.1f percent complete' % res["Value"])
        self.transferDB.setFTSReqAttribute(ftsReqID, "PercentageComplete", res["Value"])
        self.transferDB.addLoggingEvent(ftsReqID, res["Value"])

        #########################################################################
        # Update the information in the TransferDB if the transfer is terminal.
        res = oFTSRequest.isRequestTerminal()
        if not res["OK"]:
            log.error("Failed to determine whether FTS request terminal", res["Message"])
            return res
        if not res["Value"]:
            return S_OK()
        # # request is terminal
        return self.terminalRequest(oFTSRequest, ftsReqID, channelID, sourceSE)

    def terminalRequest(self, oFTSRequest, ftsReqID, channelID, sourceSE):
        """ process terminal FTS job

        :param FTSRequest oFTSRequest: FTSRequest instance
        :param int ftsReqID: FTSReq.FTSReqID
        :param int channelID: FTSReq.ChannelID
        :param str sourceSE: FTSReq.SourceSE
        """
        log = gLogger.getSubLogger("@%s" % ftsReqID)
        log.info("FTS Request found to be terminal, updating file states")

        #########################################################################
        # Get the LFNS associated to the FTS request
        log.info("Obtaining the LFNs associated to this request")
        res = self.transferDB.getFTSReqLFNs(ftsReqID, channelID, sourceSE)
        if not res["OK"]:
            log.error("Failed to obtain FTS request LFNs", res['Message'])
            return res
        files = res["Value"]
        if not files:
            log.error("No files present for transfer")
            return S_ERROR("No files were found in the DB")
        lfns = files.keys()
        log.debug("Obtained %s files" % len(lfns))
        for lfn in lfns:
            oFTSRequest.setLFN(lfn)

        res = oFTSRequest.monitor()
        if not res["OK"]:
            log.error("Failed to perform detailed monitoring of FTS request", res["Message"])
            return res
        res = oFTSRequest.getFailed()
        if not res["OK"]:
            log.error("Failed to obtained failed files for FTS request", res["Message"])
            return res
        failedFiles = res["Value"]
        res = oFTSRequest.getDone()
        if not res["OK"]:
            log.error("Failed to obtained successful files for FTS request", res["Message"])
            return res
        completedFiles = res["Value"]

        # An LFN can be included more than once if it was entered into more than one Request.
        # FTS will only do the transfer once. We need to identify all FileIDs
        res = self.transferDB.getFTSReqFileIDs(ftsReqID)
        if not res["OK"]:
            log.error("Failed to get FileIDs associated to FTS Request", res["Message"])
            return res
        fileIDs = res["Value"]
        res = self.transferDB.getAttributesForFilesList(fileIDs, ["LFN"])
        if not res["OK"]:
            log.error("Failed to get LFNs associated to FTS Request", res["Message"])
            return res
        fileIDDict = res["Value"]

        fileToFTSUpdates = []
        completedFileIDs = []
        filesToRetry = []
        filesToFail = []

        for fileID, fileDict in fileIDDict.items():
            lfn = fileDict['LFN']
            if lfn in completedFiles:
                completedFileIDs.append(fileID)
                transferTime = 0
                res = oFTSRequest.getTransferTime(lfn)
                if res["OK"]:
                    transferTime = res["Value"]
                fileToFTSUpdates.append((fileID, "Completed", "", 0, transferTime))
            if lfn in failedFiles:
                failReason = ""
                res = oFTSRequest.getFailReason(lfn)
                if res["OK"]:
                    failReason = res["Value"]
                if "Source file/user checksum mismatch" in failReason:
                    # corrupted source: no point in retrying
                    filesToFail.append(fileID)
                    continue
                if self.missingSource(failReason):
                    log.error("The source SURL does not exist.",
                              "%s %s" % (lfn, oFTSRequest.getSourceSURL(lfn)))
                    filesToFail.append(fileID)
                else:
                    filesToRetry.append(fileID)
                log.error("Failed to replicate file on channel.",
                          "%s %s" % (channelID, failReason))
                fileToFTSUpdates.append((fileID, "Failed", failReason, 0, 0))

        # # update TransferDB.FileToFTS table
        updateFileToFTS = self.updateFileToFTS(ftsReqID, channelID,
                                               filesToRetry, filesToFail,
                                               completedFileIDs, fileToFTSUpdates)

        if updateFileToFTS["OK"] and updateFileToFTS["Value"]:
            res = oFTSRequest.finalize()
            if not res["OK"]:
                log.error("Failed to perform the finalization for the FTS request", res["Message"])
                return res

            log.info('Adding logging event for FTS request')
            # Now set the FTSReq status to terminal so that it is not monitored again
            res = self.transferDB.addLoggingEvent(ftsReqID, 'Finished')
            if not res['OK']:
                log.error('Failed to add logging event for FTS Request', res['Message'])

            # update TransferDB.FileToCat table
            updateFileToCat = self.updateFileToCat(oFTSRequest, channelID,
                                                   fileIDDict, completedFiles, filesToFail)
            if not updateFileToCat["OK"]:
                log.error(updateFileToCat["Message"])

            log.debug("Updating FTS request status")
            res = self.transferDB.setFTSReqStatus(ftsReqID, 'Finished')
            if not res['OK']:
                log.error('Failed update FTS Request status', res['Message'])
        return S_OK()

    def updateFileToFTS(self, ftsReqID, channelID, filesToRetry, filesToFail,
                        completedFileIDs, fileToFTSUpdates):
        """ update TransferDB.FileToFTS table for finished request

        :param int ftsReqID: FTSReq.FTSReqID
        :param int channelID: FTSReq.ChannelID
        :param list filesToRetry: FileIDs to retry
        :param list filesToFail: FileIDs for failed files
        :param list completedFileIDs: files completed
        :param list fileToFTSUpdates: (fileID, status, reason, retries, transferTime) tuples

        :return: S_OK(bool) -- True when every table update succeeded
        """
        log = gLogger.getSubLogger("@%s" % ftsReqID)
        allUpdated = True

        res = self.transferDB.resetFileChannelStatus(channelID, filesToRetry) \
            if filesToRetry else S_OK()
        if not res["OK"]:
            log.error("Failed to update the Channel table for file to retry.", res["Message"])
            allUpdated = False

        for fileID in filesToFail:
            log.info("Updating the Channel table for files to reschedule")
            res = self.transferDB.setFileToReschedule(fileID)
            if not res["OK"]:
                log.error("Failed to update Channel table for failed files.", res["Message"])
                allUpdated = False
            elif res["Value"] == "max reschedule attempt reached":
                # FIX: the original "..." % res["Value"] had no %s placeholder and
                # raised TypeError ("not all arguments converted") on this path
                log.error("setting Channel status to 'Failed' : %s" % res["Value"])
                res = self.transferDB.setFileChannelStatus(channelID, fileID, 'Failed')
                if not res["OK"]:
                    log.error("Failed to update Channel table for failed files.", res["Message"])
                    allUpdated = False

        if completedFileIDs:
            res = self.transferDB.updateCompletedChannelStatus(channelID, completedFileIDs)
            if not res["OK"]:
                log.error("Failed to update the Channel table for successful files.", res["Message"])
                allUpdated = False
            res = self.transferDB.updateAncestorChannelStatus(channelID, completedFileIDs)
            if not res["OK"]:
                log.error('Failed to update the Channel table for ancestors of successful files.',
                          res['Message'])
                allUpdated = False

        if fileToFTSUpdates:
            res = self.transferDB.setFileToFTSFileAttributes(ftsReqID, channelID, fileToFTSUpdates)
            if not res["OK"]:
                log.error("Failed to update the FileToFTS table for files.", res["Message"])
                allUpdated = False

        return S_OK(allUpdated)

    def updateFileToCat(self, oFTSRequest, channelID, fileIDDict, completedFiles, filesToFail):
        """ update TransferDB.FileToCat table for finished request

        :param FTSRequest oFTSRequest: FTSRequest instance
        :param int channelID: FTSReq.ChannelID
        :param dict fileIDDict: fileIDs dictionary
        :param list completedFiles: LFNs transferred successfully
        :param list filesToFail: FileIDs of permanently failed files
        """
        res = oFTSRequest.getFailedRegistrations()
        # NOTE(review): res["OK"] is not checked here -- a failed call would raise
        # KeyError on "Value"; confirm getFailedRegistrations cannot fail before relying on this
        failedRegistrations = res["Value"]
        regFailedFileIDs = []
        regDoneFileIDs = []
        regForgetFileIDs = []
        for fileID, fileDict in fileIDDict.items():
            lfn = fileDict['LFN']
            if lfn in failedRegistrations:
                regFailedFileIDs.append(fileID)
                # if the LFN appears more than once, FileToCat needs to be reset only once
                del failedRegistrations[lfn]
            elif lfn in completedFiles:
                regDoneFileIDs.append(fileID)
            elif fileID in filesToFail:
                regForgetFileIDs.append(fileID)

        res = self.transferDB.setRegistrationWaiting(channelID, regFailedFileIDs) \
            if regFailedFileIDs else S_OK()
        if not res["OK"]:
            res["Message"] = "Failed to reset entries in FileToCat: %s" % res["Message"]
            return res

        res = self.transferDB.setRegistrationDone(channelID, regDoneFileIDs) \
            if regDoneFileIDs else S_OK()
        if not res["OK"]:
            res["Message"] = "Failed to set entries Done in FileToCat: %s" % res["Message"]
            return res

        # This entries could also be set to Failed, but currently there is no method to do so.
        res = self.transferDB.setRegistrationDone(channelID, regForgetFileIDs) \
            if regForgetFileIDs else S_OK()
        if not res["OK"]:
            res["Message"] = "Failed to set entries Done in FileToCat: %s" % res["Message"]
            return res

        return S_OK()

    @classmethod
    def missingSource(cls, failReason):
        """ check if message sent by FTS server is concerning missing source file

        :param str failReason: message sent by FTS server
        """
        for error in cls.missingSourceErrors:
            if error.search(failReason):
                return 1
        return 0
class ReplicationScheduler( AgentModule ):
  """Agent scheduling 'transfer' requests from the RequestDB onto TransferDB channels.

  For every Waiting file of every 'transfer' sub-request it determines a replication
  tree via the StrategyHandler and adds the file (with resolved source/target SURLs)
  to the corresponding channel, updating the request state in the RequestDB.
  """

  def initialize( self ):
    # handles to the services/DBs this agent drives
    self.section = PathFinder.getAgentSection( AGENT_NAME )
    self.RequestDB = RequestDBMySQL()
    self.TransferDB = TransferDB()
    self.DataLog = DataLoggingClient()
    self.factory = StorageFactory()
    self.rm = ReplicaManager()
    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    return S_OK()

  def execute( self ):
    """ The main agent execution method """
    # This allows dynamic changing of the throughput timescale
    self.throughputTimescale = self.am_getOption( 'ThroughputTimescale', 3600 )
    # NOTE(review): the config value read above is immediately overwritten here,
    # pinning the timescale to 3600 s -- confirm whether this override is intentional
    self.throughputTimescale = 60 * 60 * 1
    #print 'ThroughputTimescale:',self.throughputTimescale
    ######################################################################################
    #
    #  Obtain information on the current state of the channel queues
    #
    res = self.TransferDB.getChannelQueues()
    if not res['OK']:
      errStr = "ReplicationScheduler._execute: Failed to get channel queues from TransferDB."
      gLogger.error( errStr, res['Message'] )
      return S_OK()
    if not res['Value']:
      gLogger.info( "ReplicationScheduler._execute: No active channels found for replication." )
      return S_OK()
    channels = res['Value']

    res = self.TransferDB.getChannelObservedThroughput( self.throughputTimescale )
    if not res['OK']:
      errStr = "ReplicationScheduler._execute: Failed to get observed throughput from TransferDB."
      gLogger.error( errStr, res['Message'] )
      return S_OK()
    if not res['Value']:
      gLogger.info( "ReplicationScheduler._execute: No active channels found for replication." )
      return S_OK()
    bandwidths = res['Value']

    self.strategyHandler = StrategyHandler( bandwidths, channels, self.section )

    processedRequests = []
    requestsPresent = True
    while requestsPresent:

      ######################################################################################
      #
      #  The first step is to obtain a transfer request from the RequestDB which should be scheduled.
      #
      gLogger.info( "ReplicationScheduler._execute: Contacting RequestDB for suitable requests." )
      res = self.RequestDB.getRequest( 'transfer' )
      if not res['OK']:
        gLogger.error( "ReplicationScheduler._execute: Failed to get a request list from RequestDB.", res['Message'] )
        continue
      if not res['Value']:
        gLogger.info( "ReplicationScheduler._execute: No requests found in RequestDB." )
        requestsPresent = False
        return S_OK()
      requestString = res['Value']['RequestString']
      requestName = res['Value']['RequestName']
      gLogger.info( "ReplicationScheduler._execute: Obtained Request %s from RequestDB." % ( requestName ) )

      ######################################################################################
      #
      #  The request must then be parsed to obtain the sub-requests, their attributes and files.
      #
      logStr = 'ReplicationScheduler._execute: Parsing Request %s.' % ( requestName )
      gLogger.info( logStr )
      oRequest = RequestContainer( requestString )
      res = oRequest.getAttribute( 'RequestID' )
      if not res['OK']:
        gLogger.error( 'ReplicationScheduler._execute: Failed to get requestID.', res['Message'] )
        return S_ERROR( 'ReplicationScheduler._execute: Failed to get number of sub-requests.' )
      requestID = res['Value']
      if requestID in processedRequests:
        # Break the loop once we have iterated once over all requests
        res = self.RequestDB.updateRequest( requestName, requestString )
        if not res['OK']:
          gLogger.error( "Failed to update request", "%s %s" % ( requestName, res['Message'] ) )
        return S_OK()
      processedRequests.append( requestID )

      res = oRequest.getNumSubRequests( 'transfer' )
      if not res['OK']:
        gLogger.error( 'ReplicationScheduler._execute: Failed to get number of sub-requests.', res['Message'] )
        return S_ERROR( 'ReplicationScheduler._execute: Failed to get number of sub-requests.' )
      numberRequests = res['Value']
      gLogger.info( "ReplicationScheduler._execute: '%s' found with %s sub-requests." % ( requestName, numberRequests ) )

      ######################################################################################
      #
      #  The important request attributes are the source and target SEs.
      #
      for ind in range( numberRequests ):
        gLogger.info( "ReplicationScheduler._execute: Treating sub-request %s from '%s'." % ( ind, requestName ) )
        attributes = oRequest.getSubRequestAttributes( ind, 'transfer' )['Value']
        if attributes['Status'] != 'Waiting':
          #  If the sub-request is already in terminal state
          gLogger.info( "ReplicationScheduler._execute: Sub-request %s is status '%s' and not to be executed." % ( ind, attributes['Status'] ) )
          continue

        sourceSE = attributes['SourceSE']
        targetSE = attributes['TargetSE']
        """ This section should go in the transfer request class """
        # NOTE(review): targetSEs is only assigned when targetSE is a string type;
        # a non-string targetSE would leave it unbound and raise NameError further
        # down -- confirm the attribute is always a string
        if type( targetSE ) in types.StringTypes:
          if re.search( ',', targetSE ):
            targetSEs = targetSE.split( ',' )
          else:
            targetSEs = [targetSE]
        """----------------------------------------------------- """
        operation = attributes['Operation']
        reqRepStrategy = None
        if operation in self.strategyHandler.getSupportedStrategies():
          reqRepStrategy = operation

        ######################################################################################
        #
        # Then obtain the file attribute of interest are the LFN and FileID
        #
        res = oRequest.getSubRequestFiles( ind, 'transfer' )
        if not res['OK']:
          gLogger.error( 'ReplicationScheduler._execute: Failed to obtain sub-request files.' , res['Message'] )
          continue
        files = res['Value']
        gLogger.info( "ReplicationScheduler._execute: Sub-request %s found with %s files." % ( ind, len( files ) ) )
        filesDict = {}
        # NOTE(review): loop variable 'file' shadows the Python 2 builtin --
        # rename on the next functional change
        for file in files:
          lfn = file['LFN']
          if file['Status'] != 'Waiting':
            gLogger.debug( "ReplicationScheduler._execute: %s will not be scheduled because it is %s." % ( lfn, file['Status'] ) )
          else:
            fileID = file['FileID']
            filesDict[lfn] = fileID
        if not filesDict:
          gLogger.info( "ReplicationScheduler._execute: No Waiting files found for request" )
          continue
        notSched = len( files ) - len( filesDict )
        if notSched:
          gLogger.info( "ReplicationScheduler._execute: %d files found not Waiting" % notSched )

        ######################################################################################
        #
        # Now obtain replica information for the files associated to the sub-request.
        #
        lfns = filesDict.keys()
        gLogger.info( "ReplicationScheduler._execute: Obtaining replica information for %d sub-request files." % len( lfns ) )
        res = self.rm.getCatalogReplicas( lfns )
        if not res['OK']:
          gLogger.error( "ReplicationScheduler._execute: Failed to get replica information.", res['Message'] )
          continue
        for lfn, failure in res['Value']['Failed'].items():
          gLogger.error( "ReplicationScheduler._execute: Failed to get replicas.", '%s: %s' % ( lfn, failure ) )
        replicas = res['Value']['Successful']
        if not replicas.keys():
          gLogger.error( "ReplicationScheduler._execute: Failed to get replica information for all files." )
          continue

        ######################################################################################
        #
        # Now obtain the file sizes for the files associated to the sub-request.
        #
        lfns = replicas.keys()
        gLogger.info( "ReplicationScheduler._execute: Obtaining file sizes for %d sub-request files." % len( lfns ) )
        res = self.rm.getCatalogFileMetadata( lfns )
        if not res['OK']:
          gLogger.error( "ReplicationScheduler._execute: Failed to get file size information.", res['Message'] )
          continue
        for lfn, failure in res['Value']['Failed'].items():
          gLogger.error( 'ReplicationScheduler._execute: Failed to get file size.', '%s: %s' % ( lfn, failure ) )
        metadata = res['Value']['Successful']
        if not metadata.keys():
          gLogger.error( "ReplicationScheduler._execute: Failed to get metadata for all files." )
          continue

        ######################################################################################
        #
        # For each LFN determine the replication tree
        #
        for lfn in sortList( metadata.keys() ):
          fileSize = metadata[lfn]['Size']
          lfnReps = replicas[lfn]
          fileID = filesDict[lfn]

          # keep only target SEs that do not already hold a replica
          targets = []
          for targetSE in targetSEs:
            if targetSE in lfnReps.keys():
              gLogger.debug( "ReplicationScheduler.execute: %s already present at %s." % ( lfn, targetSE ) )
            else:
              targets.append( targetSE )
          if not targets:
            gLogger.info( "ReplicationScheduler.execute: %s present at all targets." % lfn )
            oRequest.setSubRequestFileAttributeValue( ind, 'transfer', lfn, 'Status', 'Done' )
            continue
          if not lfnReps:
            gLogger.error( "ReplicationScheduler.execute: The file has no replicas.", lfn )
            continue
          res = self.strategyHandler.determineReplicationTree( sourceSE, targets, lfnReps, fileSize, strategy = reqRepStrategy )
          if not res['OK']:
            gLogger.error( "ReplicationScheduler.execute: Failed to determine replication tree.", res['Message'] )
            continue
          tree = res['Value']

          ######################################################################################
          #
          # For each item in the replication tree obtain the source and target SURLS
          #
          # NOTE(review): loop variable 'dict' shadows the builtin -- rename on
          # the next functional change
          for channelID, dict in tree.items():
            gLogger.info( "ReplicationScheduler.execute: processing for channel %d %s" % ( channelID, str( dict ) ) )
            hopSourceSE = dict['SourceSE']
            hopDestSE = dict['DestSE']
            hopAncestor = dict['Ancestor']

            # Get the sourceSURL
            if hopAncestor:
              # hop depends on an earlier hop: mark it waiting on its ancestor
              status = 'Waiting%s' % ( hopAncestor )
              res = self.obtainLFNSURL( hopSourceSE, lfn )
              if not res['OK']:
                errStr = res['Message']
                gLogger.error( errStr )
                return S_ERROR( errStr )
              sourceSURL = res['Value']
            else:
              status = 'Waiting'
              res = self.resolvePFNSURL( hopSourceSE, lfnReps[hopSourceSE] )
              if not res['OK']:
                # fall back to the catalog PFN when protocol resolution fails
                sourceSURL = lfnReps[hopSourceSE]
              else:
                sourceSURL = res['Value']

            # Get the targetSURL
            res = self.obtainLFNSURL( hopDestSE, lfn )
            if not res['OK']:
              errStr = res['Message']
              gLogger.error( errStr )
              return S_ERROR( errStr )
            targetSURL = res['Value']

            ######################################################################################
            #
            # For each item in the replication tree add the file to the channel
            #
            res = self.TransferDB.addFileToChannel( channelID, fileID, hopSourceSE, sourceSURL, hopDestSE, targetSURL, fileSize, fileStatus = status )
            if not res['OK']:
              errStr = res['Message']
              gLogger.error( "ReplicationScheduler._execute: Failed to add File to Channel." , "%s %s" % ( fileID, channelID ) )
              return S_ERROR( errStr )
            res = self.TransferDB.addFileRegistration( channelID, fileID, lfn, targetSURL, hopDestSE )
            if not res['OK']:
              errStr = res['Message']
              gLogger.error( "ReplicationScheduler._execute: Failed to add File registration." , "%s %s" % ( fileID, channelID ) )
              # roll back: remove the file from the channel so it is not transferred unregistered
              result = self.TransferDB.removeFileFromChannel( channelID, fileID )
              if not result['OK']:
                errStr += result['Message']
                gLogger.error( "ReplicationScheduler._execute: Failed to remove File." , "%s %s" % ( fileID, channelID ) )
              return S_ERROR( errStr )
            oRequest.setSubRequestFileAttributeValue( ind, 'transfer', lfn, 'Status', 'Scheduled' )
          res = self.TransferDB.addReplicationTree( fileID, tree )

        if oRequest.isSubRequestEmpty( ind, 'transfer' )['Value']:
          oRequest.setSubRequestStatus( ind, 'transfer', 'Scheduled' )

      ################################################
      #  Generate the new request string after operation
      requestString = oRequest.toXML()['Value']
      res = self.RequestDB.updateRequest( requestName, requestString )
      if not res['OK']:
        gLogger.error( "ReplicationScheduler._execute: Failed to update request", "%s %s" % ( requestName, res['Message'] ) )

  def obtainLFNSURL( self, targetSE, lfn ):
    """ Creates the targetSURL for the storage and LFN supplied """
    res = self.factory.getStorages( targetSE, protocolList = ['SRM2'] )
    if not res['OK']:
      errStr = 'ReplicationScheduler._execute: Failed to create SRM2 storage for %s: %s. ' % ( targetSE, res['Message'] )
      gLogger.error( errStr )
      return S_ERROR( errStr )
    storageObjects = res['Value']['StorageObjects']
    # first storage object able to produce a URL wins
    for storageObject in storageObjects:
      res = storageObject.getCurrentURL( lfn )
      if res['OK']:
        return res
    gLogger.error( 'ReplicationScheduler._execute: Failed to get SRM compliant storage.' , targetSE )
    return S_ERROR( 'ReplicationScheduler._execute: Failed to get SRM compliant storage.' )

  def resolvePFNSURL( self, sourceSE, pfn ):
    """ Creates the targetSURL for the storage and LFN supplied """
    res = self.rm.getPfnForProtocol( [pfn], sourceSE )
    if not res['OK']:
      return res
    if pfn in res['Value']['Failed'].keys():
      return S_ERROR( res['Value']['Failed'][pfn] )
    return S_OK( res['Value']['Successful'][pfn] )
class FTSSubmitAgent( AgentModule ):
  """
  .. class:: FTSSubmitAgent

  This class is submitting previously scheduled files to the FTS system using helper class FTSRequest.

  Files to be transferred are read from TransferDB.Channel table, only those with Status = 'Waiting'.
  After submission TransferDB.Channel.Status is set to 'Executing'. The rest of state propagation is
  done in FTSMonitorAgent.

  An information about newly created FTS request is hold in TransferDB.FTSReq (request itself) table
  and TransferDB.FileToFTS (files in request) and TransferDB.FileToCat (files to be registered,
  for failover only).
  """
  # # placeholder for TransferDB reference
  transferDB = None
  # # placeholder for max jobs per channel
  maxJobsPerChannel = 2
  # # placeholder for checksum test option flag
  cksmTest = False
  # # placeholder for checksum type
  cksmType = ""
  # # default checksum type
  __defaultCksmType = "ADLER32"

  def initialize( self ):
    """ agent's initialisation: create TransferDB handler and read config options

    :param self: self reference
    """
    # # save transferDB handler
    self.transferDB = TransferDB()
    # # read config options
    self.maxJobsPerChannel = self.am_getOption( 'MaxJobsPerChannel', self.maxJobsPerChannel )
    self.log.info( "max jobs/channel = %s" % self.maxJobsPerChannel )
    # # checksum test flag
    self.cksmTest = bool( self.am_getOption( "ChecksumTest", False ) )
    # # checksum type, only read when the test is enabled; unknown values fall back to the default
    if self.cksmTest:
      self.cksmType = str( self.am_getOption( "ChecksumType", self.__defaultCksmType ) ).upper()
      if self.cksmType and self.cksmType not in ( "ADLER32", "MD5", "SHA1" ):
        self.log.warn( "unknown checksum type: %s, will use default %s" % ( self.cksmType,
                                                                            self.__defaultCksmType ) )
        self.cksmType = self.__defaultCksmType
    self.log.info( "checksum test is %s" % ( { True : "enabled using %s checksum" % self.cksmType,
                                               False : "disabled" }[self.cksmTest] ) )
    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    # # per-channel cache of LFNs known to be staging: { channelID : set( LFN, ... ) }
    self.filesBeingStaged = {}
    return S_OK()

  def execute( self ):
    """ execution in one agent's cycle: submit waiting files on every eligible channel

    :param self: self reference
    """
    # # obtain the eligible channels for submission
    self.log.info( 'Obtaining channels eligible for submission.' )
    res = self.transferDB.selectChannelsForSubmission( self.maxJobsPerChannel )
    if not res['OK']:
      self.log.error( "Failed to retrieve channels for submission.", res['Message'] )
      return S_OK()
    elif not res['Value']:
      self.log.info( "FTSSubmitAgent. No channels eligible for submission." )
      return S_OK()
    channelDicts = res['Value']
    self.log.info( 'Found %s eligible channels.' % len( channelDicts ) )
    # # submit to all the eligible waiting channels
    for i, channelDict in enumerate( channelDicts, 1 ):
      infoStr = "\n\n##################################################################################\n\n"
      infoStr = "%sStarting submission loop %s of %s\n\n" % ( infoStr, i, len( channelDicts ) )
      self.log.info( infoStr )
      # # per-channel failures are logged inside submitTransfer, the cycle continues regardless
      self.submitTransfer( channelDict )
    return S_OK()

  def submitTransfer( self, channelDict ):
    """ create and submit FTS jobs based on information it gets from the DB

    :param self: self reference
    :param dict channelDict: dict with channel info as read from TransferDB.selectChannelsForSubmission
    """
    channelID = channelDict['ChannelID']
    filesPerJob = channelDict['NumFiles']

    # # obtain the first files in the selected channel
    self.log.info( "FTSSubmitAgent.submitTransfer: Attempting to obtain files for channel %s: %s to %s" % ( channelID,
                                                                                                            channelDict['Source'],
                                                                                                            channelDict['Destination'] ) )
    res = self.transferDB.getFilesForChannel( channelID, 2 * filesPerJob )
    if not res['OK']:
      errStr = 'FTSSubmitAgent.%s' % res['Message']
      self.log.error( errStr )
      return S_OK()
    if not res['Value']:
      self.log.info( "FTSSubmitAgent.submitTransfer: No files found for channel." )
      return S_OK()
    filesDict = res['Value']
    sourceSE = filesDict['SourceSE']
    targetSE = filesDict['TargetSE']
    self.log.info( 'Obtained %s files for channel %s to %s' % ( len( filesDict['Files'] ), sourceSE, targetSE ) )
    if self.filesBeingStaged.get( channelID ):
      self.log.info( '%d files are currently in staging status' % len( self.filesBeingStaged[channelID] ) )

    # # create the FTSRequest object for preparing the submission
    oFTSRequest = FTSRequest()
    oFTSRequest.setSourceSE( sourceSE )
    oFTSRequest.setTargetSE( targetSE )
    files = filesDict['Files']
    # # enable/disable cksm test
    oFTSRequest.setCksmTest( self.cksmTest )
    if self.cksmType:
      oFTSRequest.setCksmType( self.cksmType )

    # # populate the FTS request with the files
    self.log.info( 'Populating the FTS request with file information' )
    fileIDs = []
    totalSize = 0
    fileIDSizes = {}
    lfns = set()
    for fileMeta in files:
      lfn = fileMeta['LFN']
      lfns.add( lfn )
      oFTSRequest.setLFN( lfn )
      oFTSRequest.setSourceSURL( lfn, fileMeta['SourceSURL'] )
      oFTSRequest.setTargetSURL( lfn, fileMeta['TargetSURL'] )
      # # files cached as staging on this channel keep their Staging status in the new request
      if lfn in self.filesBeingStaged.get( channelID, [] ):
        oFTSRequest.setStatus( lfn, 'Staging' )
      fileID = fileMeta['FileID']
      fileIDs.append( fileID )
      totalSize += fileMeta['Size']
      fileIDSizes[fileID] = fileMeta['Size']

    # # files without a usable source replica are rescheduled instead of submitted
    oFTSRequest.resolveSource()
    noSource = [ lfn for lfn, fileInfo in oFTSRequest.fileDict.items()
                 if fileInfo.get( "Status", "" ) == "Failed" and
                 fileInfo.get( "Reason", "" ) in ( "No replica at SourceSE", "Source file does not exist" ) ]
    toReschedule = [ fileMeta["FileID"] for fileMeta in files if fileMeta["LFN"] in noSource ]

    if toReschedule:
      self.log.info( "Found %s files to reschedule" % len( toReschedule ) )
      for fileID in toReschedule:
        res = self.transferDB.setFileToReschedule( fileID )
        if not res["OK"]:
          self.log.error( "Failed to update Channel table for failed files.", res["Message"] )
        elif res["Value"] == "max reschedule attempt reached":
          # # BUGFIX: the original format string had no %s placeholder, so applying '%' to it
          # # raised TypeError ("not all arguments converted") on this path
          self.log.error( "setting Channel status to 'Failed' : %s" % res["Value"] )
          res = self.transferDB.setFileChannelStatus( channelID, fileID, 'Failed' )
          if not res["OK"]:
            self.log.error( "Failed to update Channel table for failed files.", res["Message"] )

    # # submit the FTS request and retrieve the FTS GUID/Server
    self.log.info( 'Submitting the FTS request' )
    res = oFTSRequest.submit()
    if not res['OK']:
      errStr = "FTSSubmitAgent.submit: %s" % res['Message']
      self.log.error( errStr )
      # # submission failed: put the files back to Waiting so they are retried next cycle
      self.log.info( 'Updating the Channel table for files to retry' )
      res = self.transferDB.resetFileChannelStatus( channelID, fileIDs )
      if not res['OK']:
        self.log.error( 'Failed to update the Channel table for file to retry.', res['Message'] )
      return S_ERROR( errStr )
    ftsGUID = res['Value']['ftsGUID']
    ftsServer = res['Value']['ftsServer']
    nbSubmitted = res['Value']['submittedFiles']
    infoStr = """Submitted FTS Job:

              FTS Guid: %s
              FTS Server: %s
              ChannelID: %s
              SourceSE: %s
              TargetSE: %s
              Files: %s

""" % ( ftsGUID, ftsServer, str( channelID ), sourceSE, targetSE, str( nbSubmitted ) )
    self.log.info( infoStr )

    # # filter out skipped files
    failedFiles = oFTSRequest.getFailed()['Value']
    stagingFiles = oFTSRequest.getStaging()['Value']
    # # cache files being staged
    self.filesBeingStaged.setdefault( channelID, set() ).update( stagingFiles )
    submittedFiles = lfns.difference( failedFiles, stagingFiles )
    # # files being submitted are staged
    self.filesBeingStaged[channelID] -= submittedFiles
    failedIDs = set( [ meta["FileID"] for meta in files if meta["LFN"] in failedFiles ] )
    stagingIDs = set( [ meta["FileID"] for meta in files if meta["LFN"] in stagingFiles ] )
    # # only submitted
    submittedIDs = set( fileIDs ) - failedIDs - stagingIDs
    # # only count the submitted size
    totalSize = sum( [ meta["Size"] for meta in files if meta["FileID"] in submittedIDs ] )

    # # insert the FTS Req details and add the number of files and size
    res = self.transferDB.insertFTSReq( ftsGUID, ftsServer, channelID )
    if not res['OK']:
      errStr = "FTSSubmitAgent.%s" % res['Message']
      self.log.error( errStr )
      return S_ERROR( errStr )
    ftsReqID = res['Value']
    self.log.info( 'Obtained FTS RequestID %s' % ftsReqID )
    res = self.transferDB.setFTSReqAttribute( ftsReqID, 'SourceSE', sourceSE )
    if not res['OK']:
      self.log.error( "Failed to set SourceSE for FTSRequest", res['Message'] )
    res = self.transferDB.setFTSReqAttribute( ftsReqID, 'TargetSE', targetSE )
    if not res['OK']:
      self.log.error( "Failed to set TargetSE for FTSRequest", res['Message'] )
    res = self.transferDB.setFTSReqAttribute( ftsReqID, 'NumberOfFiles', len( submittedIDs ) )
    if not res['OK']:
      self.log.error( "Failed to set NumberOfFiles for FTSRequest", res['Message'] )
    res = self.transferDB.setFTSReqAttribute( ftsReqID, 'TotalSize', totalSize )
    if not res['OK']:
      self.log.error( "Failed to set TotalSize for FTSRequest", res['Message'] )

    # # insert the submission event in the FTSReqLogging table
    event = 'Submitted'
    res = self.transferDB.addLoggingEvent( ftsReqID, event )
    if not res['OK']:
      errStr = "FTSSubmitAgent.%s" % res['Message']
      self.log.error( errStr )

    # # insert the FileToFTS details and remove the files from the channel
    self.log.info( 'Setting the files as Executing in the Channel table' )
    res = self.transferDB.setChannelFilesExecuting( channelID, list( submittedIDs ) )
    if not res['OK']:
      self.log.error( 'Failed to update the Channel tables for files.', res['Message'] )

    lfns = []
    fileToFTSFileAttributes = []
    for fileMeta in files:
      fileID = fileMeta['FileID']
      # # Staging is not an error case
      if fileID not in stagingIDs:
        lfns.append( fileMeta['LFN'] )
        fileToFTSFileAttributes.append( ( fileID, fileIDSizes[fileID] ) )

    self.log.info( 'Populating the FileToFTS table with file information' )
    res = self.transferDB.setFTSReqFiles( ftsReqID, channelID, fileToFTSFileAttributes )
    if not res['OK']:
      self.log.error( 'Failed to populate the FileToFTS table with files.' )
    # # consistency fix: the original fell through returning None here, unlike every other exit
    return S_OK()
class ReplicationScheduler(AgentModule):
    """Agent turning Waiting 'transfer' sub-requests from the RequestDB into
    per-channel file transfers in the TransferDB.

    For every Waiting file it asks the StrategyHandler for a replication tree
    (one hop per channel) and registers each hop in the TransferDB Channel
    tables, then marks the file/sub-request as Scheduled.
    """

    def initialize(self):
        """Create handles to all services/DBs this agent talks to."""
        self.section = PathFinder.getAgentSection(AGENT_NAME)
        self.RequestDB = RequestDBMySQL()
        self.TransferDB = TransferDB()
        self.DataLog = DataLoggingClient()
        self.factory = StorageFactory()
        self.rm = ReplicaManager()
        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')
        return S_OK()

    def execute(self):
        """ The main agent execution method """

        # This allows dynamic changing of the throughput timescale
        # NOTE(review): the option value is immediately overwritten by the
        # hard-coded 3600 on the next line, so the CS option has no effect.
        self.throughputTimescale = self.am_getOption('ThroughputTimescale', 3600)
        self.throughputTimescale = 60 * 60 * 1
        #print 'ThroughputTimescale:',self.throughputTimescale
        ######################################################################################
        #
        #  Obtain information on the current state of the channel queues
        #
        res = self.TransferDB.getChannelQueues()
        if not res['OK']:
            errStr = "ReplicationScheduler._execute: Failed to get channel queues from TransferDB."
            gLogger.error(errStr, res['Message'])
            return S_OK()
        if not res['Value']:
            gLogger.info(
                "ReplicationScheduler._execute: No active channels found for replication."
            )
            return S_OK()
        channels = res['Value']
        res = self.TransferDB.getChannelObservedThroughput(
            self.throughputTimescale)
        if not res['OK']:
            errStr = "ReplicationScheduler._execute: Failed to get observed throughput from TransferDB."
            gLogger.error(errStr, res['Message'])
            return S_OK()
        if not res['Value']:
            gLogger.info(
                "ReplicationScheduler._execute: No active channels found for replication."
            )
            return S_OK()
        bandwidths = res['Value']
        # Strategy handler picks the replication tree per file from the
        # observed channel state.
        self.strategyHandler = StrategyHandler(bandwidths, channels, self.section)
        processedRequests = []
        requestsPresent = True
        while requestsPresent:
            ######################################################################################
            #
            #  The first step is to obtain a transfer request from the RequestDB which should be scheduled.
            #
            gLogger.info(
                "ReplicationScheduler._execute: Contacting RequestDB for suitable requests."
            )
            res = self.RequestDB.getRequest('transfer')
            if not res['OK']:
                # NOTE(review): 'continue' on a persistent DB error makes this a
                # tight retry loop with no backoff — confirm this is intended.
                gLogger.error(
                    "ReplicationScheduler._execute: Failed to get a request list from RequestDB.",
                    res['Message'])
                continue
            if not res['Value']:
                gLogger.info(
                    "ReplicationScheduler._execute: No requests found in RequestDB."
                )
                # The flag assignment is redundant given the immediate return.
                requestsPresent = False
                return S_OK()
            requestString = res['Value']['RequestString']
            requestName = res['Value']['RequestName']
            gLogger.info(
                "ReplicationScheduler._execute: Obtained Request %s from RequestDB."
                % (requestName))
            ######################################################################################
            #
            #  The request must then be parsed to obtain the sub-requests, their attributes and files.
            #
            logStr = 'ReplicationScheduler._execute: Parsing Request %s.' % (
                requestName)
            gLogger.info(logStr)
            oRequest = RequestContainer(requestString)
            res = oRequest.getAttribute('RequestID')
            if not res['OK']:
                gLogger.error(
                    'ReplicationScheduler._execute: Failed to get requestID.',
                    res['Message'])
                # NOTE(review): the returned message mentions sub-requests but
                # the failure here is getting the RequestID — copy/paste slip.
                return S_ERROR(
                    'ReplicationScheduler._execute: Failed to get number of sub-requests.'
                )
            requestID = res['Value']
            if requestID in processedRequests:
                # Break the loop once we have iterated once over all requests
                res = self.RequestDB.updateRequest(requestName, requestString)
                if not res['OK']:
                    gLogger.error("Failed to update request",
                                  "%s %s" % (requestName, res['Message']))
                return S_OK()
            processedRequests.append(requestID)
            res = oRequest.getNumSubRequests('transfer')
            if not res['OK']:
                gLogger.error(
                    'ReplicationScheduler._execute: Failed to get number of sub-requests.',
                    res['Message'])
                return S_ERROR(
                    'ReplicationScheduler._execute: Failed to get number of sub-requests.'
                )
            numberRequests = res['Value']
            gLogger.info(
                "ReplicationScheduler._execute: '%s' found with %s sub-requests."
                % (requestName, numberRequests))
            ######################################################################################
            #
            #  The important request attributes are the source and target SEs.
            #
            for ind in range(numberRequests):
                gLogger.info(
                    "ReplicationScheduler._execute: Treating sub-request %s from '%s'."
                    % (ind, requestName))
                attributes = oRequest.getSubRequestAttributes(
                    ind, 'transfer')['Value']
                if attributes['Status'] != 'Waiting':
                    # If the sub-request is already in terminal state
                    gLogger.info(
                        "ReplicationScheduler._execute: Sub-request %s is status '%s' and not to be executed."
                        % (ind, attributes['Status']))
                    continue
                sourceSE = attributes['SourceSE']
                targetSE = attributes['TargetSE']
                """ This section should go in the transfer request class """
                # NOTE(review): if targetSE is not a string, targetSEs is never
                # (re)assigned here and the loop below may use a stale or
                # undefined value (NameError on first iteration) — confirm
                # TargetSE is always a string in practice.
                if type(targetSE) in types.StringTypes:
                    if re.search(',', targetSE):
                        targetSEs = targetSE.split(',')
                    else:
                        targetSEs = [targetSE]
                """----------------------------------------------------- """
                operation = attributes['Operation']
                # Use the operation name as the strategy if the handler knows it.
                reqRepStrategy = None
                if operation in self.strategyHandler.getSupportedStrategies():
                    reqRepStrategy = operation
                ######################################################################################
                #
                # Then obtain the file attribute of interest are the LFN and FileID
                #
                res = oRequest.getSubRequestFiles(ind, 'transfer')
                if not res['OK']:
                    gLogger.error(
                        'ReplicationScheduler._execute: Failed to obtain sub-request files.',
                        res['Message'])
                    continue
                files = res['Value']
                gLogger.info(
                    "ReplicationScheduler._execute: Sub-request %s found with %s files."
                    % (ind, len(files)))
                # Map LFN -> FileID for the files still Waiting.
                # NOTE(review): 'file' shadows the builtin (Python 2 code).
                filesDict = {}
                for file in files:
                    lfn = file['LFN']
                    if file['Status'] != 'Waiting':
                        gLogger.debug(
                            "ReplicationScheduler._execute: %s will not be scheduled because it is %s."
                            % (lfn, file['Status']))
                    else:
                        fileID = file['FileID']
                        filesDict[lfn] = fileID
                if not filesDict:
                    gLogger.info(
                        "ReplicationScheduler._execute: No Waiting files found for request"
                    )
                    continue
                notSched = len(files) - len(filesDict)
                if notSched:
                    gLogger.info(
                        "ReplicationScheduler._execute: %d files found not Waiting"
                        % notSched)
                ######################################################################################
                #
                # Now obtain replica information for the files associated to the sub-request.
                #
                lfns = filesDict.keys()
                gLogger.info(
                    "ReplicationScheduler._execute: Obtaining replica information for %d sub-request files."
                    % len(lfns))
                res = self.rm.getCatalogReplicas(lfns)
                if not res['OK']:
                    gLogger.error(
                        "ReplicationScheduler._execute: Failed to get replica information.",
                        res['Message'])
                    continue
                for lfn, failure in res['Value']['Failed'].items():
                    gLogger.error(
                        "ReplicationScheduler._execute: Failed to get replicas.",
                        '%s: %s' % (lfn, failure))
                replicas = res['Value']['Successful']
                if not replicas.keys():
                    gLogger.error(
                        "ReplicationScheduler._execute: Failed to get replica information for all files."
                    )
                    continue
                ######################################################################################
                #
                # Now obtain the file sizes for the files associated to the sub-request.
                #
                lfns = replicas.keys()
                gLogger.info(
                    "ReplicationScheduler._execute: Obtaining file sizes for %d sub-request files."
                    % len(lfns))
                res = self.rm.getCatalogFileMetadata(lfns)
                if not res['OK']:
                    gLogger.error(
                        "ReplicationScheduler._execute: Failed to get file size information.",
                        res['Message'])
                    continue
                for lfn, failure in res['Value']['Failed'].items():
                    gLogger.error(
                        'ReplicationScheduler._execute: Failed to get file size.',
                        '%s: %s' % (lfn, failure))
                metadata = res['Value']['Successful']
                if not metadata.keys():
                    gLogger.error(
                        "ReplicationScheduler._execute: Failed to get metadata for all files."
                    )
                    continue
                ######################################################################################
                #
                # For each LFN determine the replication tree
                #
                for lfn in sortList(metadata.keys()):
                    fileSize = metadata[lfn]['Size']
                    lfnReps = replicas[lfn]
                    fileID = filesDict[lfn]
                    # Targets are the requested SEs that do not yet hold a replica.
                    targets = []
                    for targetSE in targetSEs:
                        if targetSE in lfnReps.keys():
                            gLogger.debug(
                                "ReplicationScheduler.execute: %s already present at %s."
                                % (lfn, targetSE))
                        else:
                            targets.append(targetSE)
                    if not targets:
                        gLogger.info(
                            "ReplicationScheduler.execute: %s present at all targets."
                            % lfn)
                        oRequest.setSubRequestFileAttributeValue(
                            ind, 'transfer', lfn, 'Status', 'Done')
                        continue
                    if not lfnReps:
                        gLogger.error(
                            "ReplicationScheduler.execute: The file has no replicas.",
                            lfn)
                        continue
                    res = self.strategyHandler.determineReplicationTree(
                        sourceSE,
                        targets,
                        lfnReps,
                        fileSize,
                        strategy=reqRepStrategy)
                    if not res['OK']:
                        gLogger.error(
                            "ReplicationScheduler.execute: Failed to determine replication tree.",
                            res['Message'])
                        continue
                    tree = res['Value']
                    ######################################################################################
                    #
                    # For each item in the replication tree obtain the source and target SURLS
                    #
                    # NOTE(review): loop variable 'dict' shadows the builtin.
                    for channelID, dict in tree.items():
                        gLogger.info(
                            "ReplicationScheduler.execute: processing for channel %d %s"
                            % (channelID, str(dict)))
                        hopSourceSE = dict['SourceSE']
                        hopDestSE = dict['DestSE']
                        hopAncestor = dict['Ancestor']
                        # Get the sourceSURL
                        if hopAncestor:
                            # Intermediate hop: wait for the ancestor hop to finish.
                            status = 'Waiting%s' % (hopAncestor)
                            res = self.obtainLFNSURL(hopSourceSE, lfn)
                            if not res['OK']:
                                errStr = res['Message']
                                gLogger.error(errStr)
                                return S_ERROR(errStr)
                            sourceSURL = res['Value']
                        else:
                            # First hop: source replica already exists; fall back to
                            # the catalog PFN if protocol resolution fails.
                            status = 'Waiting'
                            res = self.resolvePFNSURL(hopSourceSE,
                                                      lfnReps[hopSourceSE])
                            if not res['OK']:
                                sourceSURL = lfnReps[hopSourceSE]
                            else:
                                sourceSURL = res['Value']
                        # Get the targetSURL
                        res = self.obtainLFNSURL(hopDestSE, lfn)
                        if not res['OK']:
                            errStr = res['Message']
                            gLogger.error(errStr)
                            return S_ERROR(errStr)
                        targetSURL = res['Value']
                        ######################################################################################
                        #
                        # For each item in the replication tree add the file to the channel
                        #
                        res = self.TransferDB.addFileToChannel(
                            channelID,
                            fileID,
                            hopSourceSE,
                            sourceSURL,
                            hopDestSE,
                            targetSURL,
                            fileSize,
                            fileStatus=status)
                        if not res['OK']:
                            errStr = res['Message']
                            gLogger.error(
                                "ReplicationScheduler._execute: Failed to add File to Channel.",
                                "%s %s" % (fileID, channelID))
                            return S_ERROR(errStr)
                        res = self.TransferDB.addFileRegistration(
                            channelID, fileID, lfn, targetSURL, hopDestSE)
                        if not res['OK']:
                            errStr = res['Message']
                            gLogger.error(
                                "ReplicationScheduler._execute: Failed to add File registration.",
                                "%s %s" % (fileID, channelID))
                            # Roll back the channel entry added just above.
                            result = self.TransferDB.removeFileFromChannel(
                                channelID, fileID)
                            if not result['OK']:
                                errStr += result['Message']
                                gLogger.error(
                                    "ReplicationScheduler._execute: Failed to remove File.",
                                    "%s %s" % (fileID, channelID))
                            return S_ERROR(errStr)
                        oRequest.setSubRequestFileAttributeValue(
                            ind, 'transfer', lfn, 'Status', 'Scheduled')
                    res = self.TransferDB.addReplicationTree(fileID, tree)
                if oRequest.isSubRequestEmpty(ind, 'transfer')['Value']:
                    oRequest.setSubRequestStatus(ind, 'transfer', 'Scheduled')
            ################################################
            # Generate the new request string after operation
            requestString = oRequest.toXML()['Value']
            res = self.RequestDB.updateRequest(requestName, requestString)
            if not res['OK']:
                gLogger.error(
                    "ReplicationScheduler._execute: Failed to update request",
                    "%s %s" % (requestName, res['Message']))

    def obtainLFNSURL(self, targetSE, lfn):
        """ Creates the targetSURL for the storage and LFN supplied """
        res = self.factory.getStorages(targetSE, protocolList=['SRM2'])
        if not res['OK']:
            errStr = 'ReplicationScheduler._execute: Failed to create SRM2 storage for %s: %s. ' % (
                targetSE, res['Message'])
            gLogger.error(errStr)
            return S_ERROR(errStr)
        storageObjects = res['Value']['StorageObjects']
        # Return the first storage object able to produce a URL for this LFN.
        for storageObject in storageObjects:
            res = storageObject.getCurrentURL(lfn)
            if res['OK']:
                return res
        gLogger.error(
            'ReplicationScheduler._execute: Failed to get SRM compliant storage.',
            targetSE)
        return S_ERROR(
            'ReplicationScheduler._execute: Failed to get SRM compliant storage.'
        )

    def resolvePFNSURL(self, sourceSE, pfn):
        """ Creates the targetSURL for the storage and LFN supplied """
        res = self.rm.getPfnForProtocol([pfn], sourceSE)
        if not res['OK']:
            return res
        if pfn in res['Value']['Failed'].keys():
            return S_ERROR(res['Value']['Failed'][pfn])
        return S_OK(res['Value']['Successful'][pfn])