def resolveCatalogPFNSizeMismatch( self, problematicDict ):
  """ This takes the problematic dictionary returned by the integrity DB
      and resolves the CatalogPFNSizeMismatch prognosis
  """
  lfn = problematicDict['LFN']
  pfn = problematicDict['PFN']
  se = problematicDict['SE']
  fileID = problematicDict['FileID']

  res = Utils.executeSingleFileOrDirWrapper( self.fc.getFileSize( lfn ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  catalogSize = res['Value']
  res = Utils.executeSingleFileOrDirWrapper( StorageElement( se ).getFileSize( pfn ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  storageSize = res['Value']
  bkKCatalog = FileCatalog( ['BookkeepingDB'] )
  res = Utils.executeSingleFileOrDirWrapper( bkKCatalog.getFileSize( lfn ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  bookkeepingSize = res['Value']
  if bookkeepingSize == catalogSize == storageSize:
    gLogger.info( "CatalogPFNSizeMismatch replica (%d) matched all registered sizes." % fileID )
    return self.__updateReplicaToChecked( problematicDict )
  if ( catalogSize == bookkeepingSize ):
    gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to mismatch the bookkeeping also" % fileID )
    res = Utils.executeSingleFileOrDirWrapper( self.fc.getReplicas( lfn ) )
    if not res['OK']:
      return self.__returnProblematicError( fileID, res )
    if len( res['Value'] ) <= 1:
      gLogger.info( "CatalogPFNSizeMismatch replica (%d) has no other replicas." % fileID )
      return S_ERROR( "Not removing catalog file mismatch since the only replica" )
    else:
      gLogger.info( "CatalogPFNSizeMismatch replica (%d) has other replicas. Removing..." % fileID )
      res = self.dm.removeReplica( se, lfn )
      if not res['OK']:
        return self.__returnProblematicError( fileID, res )
      return self.__updateCompletedFiles( 'CatalogPFNSizeMismatch', fileID )
  if ( catalogSize != bookkeepingSize ) and ( bookkeepingSize == storageSize ):
    gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to match the bookkeeping size" % fileID )
    res = self.__updateReplicaToChecked( problematicDict )
    if not res['OK']:
      return self.__returnProblematicError( fileID, res )
    return self.changeProblematicPrognosis( fileID, 'BKCatalogSizeMismatch' )
  gLogger.info( "CatalogPFNSizeMismatch replica (%d) all sizes found mismatch. Updating retry count" % fileID )
  return self.incrementProblematicRetry( fileID )
def resolveCatalogPFNSizeMismatch( self, problematicDict ):
  """ This takes the problematic dictionary returned by the integrity DB
      and resolves the CatalogPFNSizeMismatch prognosis
  """
  lfn = problematicDict['LFN']
  se = problematicDict['SE']
  fileID = problematicDict['FileID']

  res = returnSingleResult( self.fc.getFileSize( lfn ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  catalogSize = res['Value']
  res = returnSingleResult( StorageElement( se ).getFileSize( lfn ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  storageSize = res['Value']
  bkKCatalog = FileCatalog( ['BookkeepingDB'] )
  res = returnSingleResult( bkKCatalog.getFileSize( lfn ) )
  if not res['OK']:
    return self.__returnProblematicError( fileID, res )
  bookkeepingSize = res['Value']
  if bookkeepingSize == catalogSize == storageSize:
    gLogger.info( "CatalogPFNSizeMismatch replica (%d) matched all registered sizes." % fileID )
    return self.__updateReplicaToChecked( problematicDict )
  if catalogSize == bookkeepingSize:
    gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to mismatch the bookkeeping also" % fileID )
    res = returnSingleResult( self.fc.getReplicas( lfn ) )
    if not res['OK']:
      return self.__returnProblematicError( fileID, res )
    if len( res['Value'] ) <= 1:
      gLogger.info( "CatalogPFNSizeMismatch replica (%d) has no other replicas." % fileID )
      return S_ERROR( "Not removing catalog file mismatch since the only replica" )
    else:
      gLogger.info( "CatalogPFNSizeMismatch replica (%d) has other replicas. Removing..." % fileID )
      res = self.dm.removeReplica( se, lfn )
      if not res['OK']:
        return self.__returnProblematicError( fileID, res )
      return self.__updateCompletedFiles( 'CatalogPFNSizeMismatch', fileID )
  if ( catalogSize != bookkeepingSize ) and ( bookkeepingSize == storageSize ):
    gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to match the bookkeeping size" % fileID )
    res = self.__updateReplicaToChecked( problematicDict )
    if not res['OK']:
      return self.__returnProblematicError( fileID, res )
    return self.changeProblematicPrognosis( fileID, 'BKCatalogSizeMismatch' )
  gLogger.info( "CatalogPFNSizeMismatch replica (%d) all sizes found mismatch. Updating retry count" % fileID )
  return self.incrementProblematicRetry( fileID )
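# Both variants above rely on DIRAC's dictionary return convention (S_OK/S_ERROR) and on
# collapsing a bulk per-LFN result into a single value before the three sizes are compared.
# The sketch below is a simplified, self-contained illustration of that unwrapping step;
# s_ok, s_error and unwrap_single_result are stand-ins written for this example, not the
# actual DIRAC utilities (Utils.executeSingleFileOrDirWrapper / returnSingleResult).

def s_ok( value = None ):
  # DIRAC-style success structure: {'OK': True, 'Value': ...}
  return {'OK': True, 'Value': value}

def s_error( message ):
  # DIRAC-style failure structure: {'OK': False, 'Message': ...}
  return {'OK': False, 'Message': message}

def unwrap_single_result( bulkResult ):
  """ Collapse a bulk {'Successful': {lfn: val}, 'Failed': {lfn: err}} result
      into a single-value result, as the size comparisons above expect.
  """
  if not bulkResult['OK']:
    return bulkResult
  successful = bulkResult['Value'].get( 'Successful', {} )
  failed = bulkResult['Value'].get( 'Failed', {} )
  if successful:
    return s_ok( list( successful.values() )[0] )
  if failed:
    return s_error( list( failed.values() )[0] )
  return s_error( "No result returned for the requested LFN" )

# Example: a catalog getFileSize()-style reply for one invented LFN
bulk = s_ok( {'Successful': {'/lhcb/data/file.dst': 104857600}, 'Failed': {}} )
res = unwrap_single_result( bulk )
if res['OK']:
  catalogSize = res['Value']  # 104857600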
class RequestPreparationAgent( AgentModule ): def initialize( self ): self.fileCatalog = FileCatalog() self.dm = DataManager() self.stagerClient = StorageManagerClient() self.dataIntegrityClient = DataIntegrityClient() # This sets the Default Proxy to used as that defined under # /Operations/Shifter/DataManager # the shifterProxy option in the Configuration can be used to change this default. self.am_setOption( 'shifterProxy', 'DataManager' ) return S_OK() def execute( self ): """ This is the first logical task to be executed and manages the New->Waiting transition of the Replicas """ res = self.__getNewReplicas() if not res['OK']: gLogger.fatal( "RequestPreparation.prepareNewReplicas: Failed to get replicas from StagerDB.", res['Message'] ) return res if not res['Value']: gLogger.info( "There were no New replicas found" ) return res replicas = res['Value']['Replicas'] replicaIDs = res['Value']['ReplicaIDs'] gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained %s New replicas for preparation." % len( replicaIDs ) ) # Check if the files exist in the FileCatalog res = self.__getExistingFiles( replicas ) if not res['OK']: return res exist = res['Value']['Exist'] terminal = res['Value']['Missing'] failed = res['Value']['Failed'] if not exist: gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed to determine the existence of any file' ) return S_OK() terminalReplicaIDs = {} for lfn, reason in terminal.items(): for replicaID in replicas[lfn].values(): terminalReplicaIDs[replicaID] = reason replicas.pop( lfn ) gLogger.info( "RequestPreparation.prepareNewReplicas: %s files exist in the FileCatalog." % len( exist ) ) if terminal: gLogger.info( "RequestPreparation.prepareNewReplicas: %s files do not exist in the FileCatalog." % len( terminal ) ) # Obtain the file sizes from the FileCatalog res = self.__getFileSize( exist ) if not res['OK']: return res failed.update( res['Value']['Failed'] ) terminal = res['Value']['ZeroSize'] fileSizes = res['Value']['FileSizes'] if not fileSizes: gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed determine sizes of any files' ) return S_OK() for lfn, reason in terminal.items(): for _se, replicaID in replicas[lfn].items(): terminalReplicaIDs[replicaID] = reason replicas.pop( lfn ) gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained %s file sizes from the FileCatalog." % len( fileSizes ) ) if terminal: gLogger.info( "RequestPreparation.prepareNewReplicas: %s files registered with zero size in the FileCatalog." % len( terminal ) ) # Obtain the replicas from the FileCatalog res = self.__getFileReplicas( fileSizes.keys() ) if not res['OK']: return res failed.update( res['Value']['Failed'] ) terminal = res['Value']['ZeroReplicas'] fileReplicas = res['Value']['Replicas'] if not fileReplicas: gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed determine replicas for any files' ) return S_OK() for lfn, reason in terminal.items(): for _se, replicaID in replicas[lfn].items(): terminalReplicaIDs[replicaID] = reason replicas.pop( lfn ) gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained replica information for %s file from the FileCatalog." % len( fileReplicas ) ) if terminal: gLogger.info( "RequestPreparation.prepareNewReplicas: %s files registered with zero replicas in the FileCatalog." 
% len( terminal ) ) # Check the replicas exist at the requested site replicaMetadata = [] for lfn, requestedSEs in replicas.items(): lfnReplicas = fileReplicas.get( lfn ) # This should not happen in principle, but it was seen # after a corrupted staging request has entered the DB if not lfnReplicas: gLogger.error( "Missing replicas information", "%s %s" % ( lfn, requestedSEs ) ) continue for requestedSE, replicaID in requestedSEs.items(): if not requestedSE in lfnReplicas.keys(): terminalReplicaIDs[replicaID] = "LFN not registered at requested SE" replicas[lfn].pop( requestedSE ) else: replicaMetadata.append( ( replicaID, lfnReplicas[requestedSE], fileSizes[lfn] ) ) # Update the states of the files in the database if terminalReplicaIDs: gLogger.info( "RequestPreparation.prepareNewReplicas: %s replicas are terminally failed." % len( terminalReplicaIDs ) ) # res = self.stagerClient.updateReplicaFailure( terminalReplicaIDs ) res = self.stagerClient.updateReplicaFailure( terminalReplicaIDs ) if not res['OK']: gLogger.error( "RequestPreparation.prepareNewReplicas: Failed to update replica failures.", res['Message'] ) if replicaMetadata: gLogger.info( "RequestPreparation.prepareNewReplicas: %s replica metadata to be updated." % len( replicaMetadata ) ) # Sets the Status='Waiting' of CacheReplicas records that are OK with catalogue checks res = self.stagerClient.updateReplicaInformation( replicaMetadata ) if not res['OK']: gLogger.error( "RequestPreparation.prepareNewReplicas: Failed to update replica metadata.", res['Message'] ) return S_OK() def __getNewReplicas( self ): """ This obtains the New replicas from the Replicas table and for each LFN the requested storage element """ # First obtain the New replicas from the CacheReplicas table res = self.stagerClient.getCacheReplicas( {'Status':'New'} ) if not res['OK']: gLogger.error( "RequestPreparation.__getNewReplicas: Failed to get replicas with New status.", res['Message'] ) return res if not res['Value']: gLogger.debug( "RequestPreparation.__getNewReplicas: No New replicas found to process." ) return S_OK() else: gLogger.debug( "RequestPreparation.__getNewReplicas: Obtained %s New replicas(s) to process." % len( res['Value'] ) ) replicas = {} replicaIDs = {} for replicaID, info in res['Value'].items(): lfn = info['LFN'] storageElement = info['SE'] replicas.setdefault( lfn, {} )[storageElement] = replicaID replicaIDs[replicaID] = ( lfn, storageElement ) return S_OK( {'Replicas':replicas, 'ReplicaIDs':replicaIDs} ) def __getExistingFiles( self, lfns ): """ This checks that the files exist in the FileCatalog. """ res = self.fileCatalog.exists( list( set( lfns ) ) ) if not res['OK']: gLogger.error( "RequestPreparation.__getExistingFiles: Failed to determine whether files exist.", res['Message'] ) return res failed = res['Value']['Failed'] success = res['Value']['Successful'] exist = [lfn for lfn, exists in success.items() if exists] missing = list( set( success ) - set( exist ) ) if missing: reason = 'LFN not registered in the FC' gLogger.warn( "RequestPreparation.__getExistingFiles: %s" % reason, '\n'.join( [''] + missing ) ) self.__reportProblematicFiles( missing, 'LFN-LFC-DoesntExist' ) missing = dict.fromkeys( missing, reason ) else: missing = {} return S_OK( {'Exist':exist, 'Missing':missing, 'Failed':failed} ) def __getFileSize( self, lfns ): """ This obtains the file size from the FileCatalog. 
""" fileSizes = {} zeroSize = {} res = self.fileCatalog.getFileSize( lfns ) if not res['OK']: gLogger.error( "RequestPreparation.__getFileSize: Failed to get sizes for files.", res['Message'] ) return res failed = res['Value']['Failed'] for lfn, size in res['Value']['Successful'].items(): if size == 0: zeroSize[lfn] = "LFN registered with zero size in the FileCatalog" else: fileSizes[lfn] = size if zeroSize: for lfn, reason in zeroSize.items(): gLogger.warn( "RequestPreparation.__getFileSize: %s" % reason, lfn ) self.__reportProblematicFiles( zeroSize.keys(), 'LFN-LFC-ZeroSize' ) return S_OK( {'FileSizes':fileSizes, 'ZeroSize':zeroSize, 'Failed':failed} ) def __getFileReplicas( self, lfns ): """ This obtains the replicas from the FileCatalog. """ replicas = {} noReplicas = {} res = self.dm.getActiveReplicas( lfns ) if not res['OK']: gLogger.error( "RequestPreparation.__getFileReplicas: Failed to obtain file replicas.", res['Message'] ) return res failed = res['Value']['Failed'] for lfn, lfnReplicas in res['Value']['Successful'].items(): if len( lfnReplicas.keys() ) == 0: noReplicas[lfn] = "LFN registered with zero replicas in the FileCatalog" else: replicas[lfn] = lfnReplicas if noReplicas: for lfn, reason in noReplicas.items(): gLogger.warn( "RequestPreparation.__getFileReplicas: %s" % reason, lfn ) self.__reportProblematicFiles( noReplicas.keys(), 'LFN-LFC-NoReplicas' ) return S_OK( {'Replicas':replicas, 'ZeroReplicas':noReplicas, 'Failed':failed} ) def __reportProblematicFiles( self, lfns, reason ): return S_OK() res = self.dataIntegrityClient.setFileProblematic( lfns, reason, sourceComponent = 'RequestPreparationAgent' ) if not res['OK']: gLogger.error( "RequestPreparation.__reportProblematicFiles: Failed to report missing files.", res['Message'] ) return res if res['Value']['Successful']: gLogger.info( "RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files." % len( res['Value']['Successful'] ) ) if res['Value']['Failed']: gLogger.info( "RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files." % len( res['Value']['Failed'] ) ) return res
class RequestPreparationAgent(AgentModule): def initialize(self): self.fileCatalog = FileCatalog() self.dm = DataManager() self.stagerClient = StorageManagerClient() self.dataIntegrityClient = DataIntegrityClient() # This sets the Default Proxy to used as that defined under # /Operations/Shifter/DataManager # the shifterProxy option in the Configuration can be used to change this default. self.am_setOption("shifterProxy", "DataManager") return S_OK() def execute(self): """This is the first logical task to be executed and manages the New->Waiting transition of the Replicas""" res = self.__getNewReplicas() if not res["OK"]: gLogger.fatal( "RequestPreparation.prepareNewReplicas: Failed to get replicas from StagerDB.", res["Message"] ) return res if not res["Value"]: gLogger.info("There were no New replicas found") return res replicas = res["Value"]["Replicas"] replicaIDs = res["Value"]["ReplicaIDs"] gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained %s New replicas for preparation." % len(replicaIDs) ) # Check if the files exist in the FileCatalog res = self.__getExistingFiles(replicas) if not res["OK"]: return res exist = res["Value"]["Exist"] terminal = res["Value"]["Missing"] failed = res["Value"]["Failed"] if not exist: gLogger.error("RequestPreparation.prepareNewReplicas: Failed to determine the existence of any file") return S_OK() terminalReplicaIDs = {} for lfn, reason in terminal.items(): for replicaID in replicas[lfn].values(): terminalReplicaIDs[replicaID] = reason replicas.pop(lfn) gLogger.info("RequestPreparation.prepareNewReplicas: %s files exist in the FileCatalog." % len(exist)) if terminal: gLogger.info( "RequestPreparation.prepareNewReplicas: %s files do not exist in the FileCatalog." % len(terminal) ) # Obtain the file sizes from the FileCatalog res = self.__getFileSize(exist) if not res["OK"]: return res failed.update(res["Value"]["Failed"]) terminal = res["Value"]["ZeroSize"] fileSizes = res["Value"]["FileSizes"] if not fileSizes: gLogger.error("RequestPreparation.prepareNewReplicas: Failed determine sizes of any files") return S_OK() for lfn, reason in terminal.items(): for _se, replicaID in replicas[lfn].items(): terminalReplicaIDs[replicaID] = reason replicas.pop(lfn) gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained %s file sizes from the FileCatalog." % len(fileSizes) ) if terminal: gLogger.info( "RequestPreparation.prepareNewReplicas: %s files registered with zero size in the FileCatalog." % len(terminal) ) # Obtain the replicas from the FileCatalog res = self.__getFileReplicas(list(fileSizes)) if not res["OK"]: return res failed.update(res["Value"]["Failed"]) terminal = res["Value"]["ZeroReplicas"] fileReplicas = res["Value"]["Replicas"] if not fileReplicas: gLogger.error("RequestPreparation.prepareNewReplicas: Failed determine replicas for any files") return S_OK() for lfn, reason in terminal.items(): for _se, replicaID in replicas[lfn].items(): terminalReplicaIDs[replicaID] = reason replicas.pop(lfn) gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained replica information for %s file from the FileCatalog." % len(fileReplicas) ) if terminal: gLogger.info( "RequestPreparation.prepareNewReplicas: %s files registered with zero replicas in the FileCatalog." 
% len(terminal) ) # Check the replicas exist at the requested site replicaMetadata = [] for lfn, requestedSEs in replicas.items(): lfnReplicas = fileReplicas.get(lfn) # This should not happen in principle, but it was seen # after a corrupted staging request has entered the DB if not lfnReplicas: gLogger.error("Missing replicas information", "%s %s" % (lfn, requestedSEs)) continue for requestedSE, replicaID in requestedSEs.items(): if requestedSE not in lfnReplicas.keys(): terminalReplicaIDs[replicaID] = "LFN not registered at requested SE" replicas[lfn].pop(requestedSE) else: replicaMetadata.append((replicaID, lfnReplicas[requestedSE], fileSizes[lfn])) # Update the states of the files in the database if terminalReplicaIDs: gLogger.info( "RequestPreparation.prepareNewReplicas: %s replicas are terminally failed." % len(terminalReplicaIDs) ) # res = self.stagerClient.updateReplicaFailure( terminalReplicaIDs ) res = self.stagerClient.updateReplicaFailure(terminalReplicaIDs) if not res["OK"]: gLogger.error( "RequestPreparation.prepareNewReplicas: Failed to update replica failures.", res["Message"] ) if replicaMetadata: gLogger.info( "RequestPreparation.prepareNewReplicas: %s replica metadata to be updated." % len(replicaMetadata) ) # Sets the Status='Waiting' of CacheReplicas records that are OK with catalogue checks res = self.stagerClient.updateReplicaInformation(replicaMetadata) if not res["OK"]: gLogger.error( "RequestPreparation.prepareNewReplicas: Failed to update replica metadata.", res["Message"] ) return S_OK() def __getNewReplicas(self): """This obtains the New replicas from the Replicas table and for each LFN the requested storage element""" # First obtain the New replicas from the CacheReplicas table res = self.stagerClient.getCacheReplicas({"Status": "New"}) if not res["OK"]: gLogger.error( "RequestPreparation.__getNewReplicas: Failed to get replicas with New status.", res["Message"] ) return res if not res["Value"]: gLogger.debug("RequestPreparation.__getNewReplicas: No New replicas found to process.") return S_OK() else: gLogger.debug( "RequestPreparation.__getNewReplicas: Obtained %s New replicas(s) to process." 
% len(res["Value"]) ) replicas = {} replicaIDs = {} for replicaID, info in res["Value"].items(): lfn = info["LFN"] storageElement = info["SE"] replicas.setdefault(lfn, {})[storageElement] = replicaID replicaIDs[replicaID] = (lfn, storageElement) return S_OK({"Replicas": replicas, "ReplicaIDs": replicaIDs}) def __getExistingFiles(self, lfns): """This checks that the files exist in the FileCatalog.""" res = self.fileCatalog.exists(list(set(lfns))) if not res["OK"]: gLogger.error( "RequestPreparation.__getExistingFiles: Failed to determine whether files exist.", res["Message"] ) return res failed = res["Value"]["Failed"] success = res["Value"]["Successful"] exist = [lfn for lfn, exists in success.items() if exists] missing = list(set(success) - set(exist)) if missing: reason = "LFN not registered in the FC" gLogger.warn("RequestPreparation.__getExistingFiles: %s" % reason, "\n".join([""] + missing)) self.__reportProblematicFiles(missing, "LFN-LFC-DoesntExist") missing = dict.fromkeys(missing, reason) else: missing = {} return S_OK({"Exist": exist, "Missing": missing, "Failed": failed}) def __getFileSize(self, lfns): """This obtains the file size from the FileCatalog.""" fileSizes = {} zeroSize = {} res = self.fileCatalog.getFileSize(lfns) if not res["OK"]: gLogger.error("RequestPreparation.__getFileSize: Failed to get sizes for files.", res["Message"]) return res failed = res["Value"]["Failed"] for lfn, size in res["Value"]["Successful"].items(): if size == 0: zeroSize[lfn] = "LFN registered with zero size in the FileCatalog" else: fileSizes[lfn] = size if zeroSize: for lfn, reason in zeroSize.items(): gLogger.warn("RequestPreparation.__getFileSize: %s" % reason, lfn) self.__reportProblematicFiles(zeroSize.keys(), "LFN-LFC-ZeroSize") return S_OK({"FileSizes": fileSizes, "ZeroSize": zeroSize, "Failed": failed}) def __getFileReplicas(self, lfns): """This obtains the replicas from the FileCatalog.""" replicas = {} noReplicas = {} res = self.dm.getActiveReplicas(lfns) if not res["OK"]: gLogger.error("RequestPreparation.__getFileReplicas: Failed to obtain file replicas.", res["Message"]) return res failed = res["Value"]["Failed"] for lfn, lfnReplicas in res["Value"]["Successful"].items(): if len(lfnReplicas) == 0: noReplicas[lfn] = "LFN registered with zero replicas in the FileCatalog" else: replicas[lfn] = lfnReplicas if noReplicas: for lfn, reason in noReplicas.items(): gLogger.warn("RequestPreparation.__getFileReplicas: %s" % reason, lfn) self.__reportProblematicFiles(list(noReplicas), "LFN-LFC-NoReplicas") return S_OK({"Replicas": replicas, "ZeroReplicas": noReplicas, "Failed": failed}) def __reportProblematicFiles(self, lfns, reason): return S_OK() res = self.dataIntegrityClient.setFileProblematic(lfns, reason, sourceComponent="RequestPreparationAgent") if not res["OK"]: gLogger.error( "RequestPreparation.__reportProblematicFiles: Failed to report missing files.", res["Message"] ) return res if res["Value"]["Successful"]: gLogger.info( "RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files." % len(res["Value"]["Successful"]) ) if res["Value"]["Failed"]: gLogger.info( "RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files." % len(res["Value"]["Failed"]) ) return res
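# For reference, a sketch of the structures __getNewReplicas() builds and execute() turns
# into replicaMetadata.  All identifiers, LFNs and storage element names below are invented
# for illustration; only the shapes are taken from the code above.

# __getNewReplicas() -> S_OK({'Replicas': ..., 'ReplicaIDs': ...})
replicas = {
    "/lhcb/user/a/file1": {"CERN-RAW": 101, "GRIDKA-RAW": 102},  # lfn -> {SE: replicaID}
}
replicaIDs = {
    101: ("/lhcb/user/a/file1", "CERN-RAW"),  # replicaID -> (lfn, SE)
    102: ("/lhcb/user/a/file1", "GRIDKA-RAW"),
}

# execute() keeps only requested SEs that really hold a replica and builds, per replica:
#   (replicaID, catalog replica value at that SE (typically a PFN/URL), file size in bytes)
replicaMetadata = [
    (101, "srm://storage.example/lhcb/user/a/file1", 2048),
]
# stagerClient.updateReplicaInformation(replicaMetadata) then moves those CacheReplicas
# records from New to Waiting, while terminalReplicaIDs go to updateReplicaFailure().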
class PluginUtilities(object): """ Utility class used by plugins """ def __init__(self, plugin='Standard', transClient=None, dataManager=None, fc=None, debug=False, transInThread=None, transID=None): """ c'tor Setting defaults """ # clients if transClient is None: self.transClient = TransformationClient() else: self.transClient = transClient if dataManager is None: self.dm = DataManager() else: self.dm = dataManager if fc is None: self.fc = FileCatalog() else: self.fc = fc self.dmsHelper = DMSHelpers() self.plugin = plugin self.transID = transID self.params = {} self.groupSize = 0 self.maxFiles = 0 self.cachedLFNSize = {} self.transString = '' self.debug = debug if transInThread is None: self.transInThread = {} else: self.transInThread = transInThread self.log = gLogger.getSubLogger(self.plugin + self.transInThread.get(self.transID, ' [NoThread] [%s] ' % self.transID)) # FIXME: This doesn't work (yet) but should soon, will allow scripts to get the context self.log.showHeaders(True) def logVerbose(self, message, param=''): """ logger helper """ if self.debug: log = gLogger.getSubLogger(self.plugin + ' (V)' + self.transInThread.get(self.transID, ' [NoThread] [%d] ' % self.transID)) log.info(message, param) else: self.log.verbose(message, param) def logDebug(self, message, param=''): """ logger helper """ self.log.debug(message, param) def logInfo(self, message, param=''): """ logger helper """ self.log.info(message, param) def logWarn(self, message, param=''): """ logger helper """ self.log.warn(message, param) def logError(self, message, param=''): """ logger helper """ self.log.error(message, param) def logException(self, message, param='', lException=False): """ logger helper """ self.log.exception(message, param, lException) def setParameters(self, params): """ Set the transformation parameters and extract transID """ self.params = params self.transID = params['TransformationID'] self.log = gLogger.getSubLogger(self.plugin + self.transInThread.get(self.transID, ' [NoThread] [%d] ' % self.transID)) # @timeThis def groupByReplicas(self, files, status): """ Generates tasks based on the location of the input data :param dict fileReplicas: {'/this/is/at.1': ['SE1'], '/this/is/at.12': ['SE1', 'SE2'], '/this/is/at.2': ['SE2'], '/this/is/at_123': ['SE1', 'SE2', 'SE3'], '/this/is/at_23': ['SE2', 'SE3'], '/this/is/at_4': ['SE4']} """ tasks = [] nTasks = 0 if not files: return S_OK(tasks) files = dict(files) # Parameters if not self.groupSize: self.groupSize = self.getPluginParam('GroupSize', 10) flush = (status == 'Flush') self.logVerbose( "groupByReplicas: %d files, groupSize %d, flush %s" % (len(files), self.groupSize, flush)) # Consider files by groups of SEs, a file is only in one group # Then consider files site by site, but a file can now be at more than one site for groupSE in (True, False): if not files: break seFiles = getFileGroups(files, groupSE=groupSE) self.logDebug("fileGroups set: ", seFiles) for replicaSE in sortSEs(seFiles): lfns = seFiles[replicaSE] if lfns: tasksLfns = breakListIntoChunks(lfns, self.groupSize) lfnsInTasks = [] for taskLfns in tasksLfns: if flush or (len(taskLfns) >= self.groupSize): tasks.append((replicaSE, taskLfns)) lfnsInTasks += taskLfns # In case the file was at more than one site, remove it from the other sites' list # Remove files from global list for lfn in lfnsInTasks: files.pop(lfn) if not groupSE: # Remove files from other SEs for se in [se for se in seFiles if se != replicaSE]: seFiles[se] = [lfn for lfn in seFiles[se] if lfn not in lfnsInTasks] 
self.logVerbose( "groupByReplicas: %d tasks created (groupSE %s)" % (len(tasks) - nTasks, str(groupSE)), "%d files not included in tasks" % len(files)) nTasks = len(tasks) return S_OK(tasks) def createTasksBySize(self, lfns, replicaSE, fileSizes=None, flush=False): """ Split files in groups according to the size and create tasks for a given SE """ tasks = [] if fileSizes is None: fileSizes = self._getFileSize(lfns).get('Value') if fileSizes is None: self.logWarn('Error getting file sizes, no tasks created') return tasks taskLfns = [] taskSize = 0 if not self.groupSize: # input size in GB converted to bytes self.groupSize = float(self.getPluginParam('GroupSize', 1.)) * 1000 * 1000 * 1000 if not self.maxFiles: # FIXME: prepare for chaging the name of the ambiguoug CS option self.maxFiles = self.getPluginParam('MaxFilesPerTask', self.getPluginParam('MaxFiles', 100)) lfns = sorted(lfns, key=fileSizes.get) for lfn in lfns: size = fileSizes.get(lfn, 0) if size: if size > self.groupSize: tasks.append((replicaSE, [lfn])) else: taskSize += size taskLfns.append(lfn) if (taskSize > self.groupSize) or (len(taskLfns) >= self.maxFiles): tasks.append((replicaSE, taskLfns)) taskLfns = [] taskSize = 0 if flush and taskLfns: tasks.append((replicaSE, taskLfns)) if not tasks and not flush and taskLfns: self.logVerbose( 'Not enough data to create a task, and flush not set (%d bytes for groupSize %d)' % (taskSize, self.groupSize)) return tasks # @timeThis def groupBySize(self, files, status): """ Generate a task for a given amount of data """ tasks = [] nTasks = 0 if not len(files): return S_OK(tasks) files = dict(files) # Parameters if not self.groupSize: # input size in GB converted to bytes self.groupSize = float(self.getPluginParam('GroupSize', 1)) * 1000 * 1000 * 1000 flush = (status == 'Flush') self.logVerbose( "groupBySize: %d files, groupSize: %d, flush: %s" % (len(files), self.groupSize, flush)) # Get the file sizes res = self._getFileSize(list(files)) if not res['OK']: return res fileSizes = res['Value'] for groupSE in (True, False): if not files: break seFiles = getFileGroups(files, groupSE=groupSE) for replicaSE in sorted(seFiles) if groupSE else sortSEs(seFiles): lfns = seFiles[replicaSE] newTasks = self.createTasksBySize(lfns, replicaSE, fileSizes=fileSizes, flush=flush) lfnsInTasks = [] for task in newTasks: lfnsInTasks += task[1] tasks += newTasks # Remove the selected files from the size cache self.clearCachedFileSize(lfnsInTasks) if not groupSE: # Remove files from other SEs for se in [se for se in seFiles if se != replicaSE]: seFiles[se] = [lfn for lfn in seFiles[se] if lfn not in lfnsInTasks] # Remove files from global list for lfn in lfnsInTasks: files.pop(lfn) self.logVerbose( "groupBySize: %d tasks created with groupSE %s" % (len(tasks) - nTasks, str(groupSE))) self.logVerbose("groupBySize: %d files have not been included in tasks" % len(files)) nTasks = len(tasks) self.logVerbose("Grouped %d files by size" % len(files)) return S_OK(tasks) def getExistingCounters(self, normalise=False, requestedSites=[]): res = self.transClient.getCounters('TransformationFiles', ['UsedSE'], {'TransformationID': self.params['TransformationID']}) if not res['OK']: return res usageDict = {} for usedDict, count in res['Value']: usedSE = usedDict['UsedSE'] if usedSE != 'Unknown': usageDict[usedSE] = count if requestedSites: siteDict = {} for se, count in usageDict.items(): res = getSitesForSE(se) if not res['OK']: return res for site in res['Value']: if site in requestedSites: siteDict[site] = count usageDict 
= siteDict.copy() if normalise: usageDict = self._normaliseShares(usageDict) return S_OK(usageDict) # @timeThis def _getFileSize(self, lfns): """ Get file size from a cache, if not from the catalog #FIXME: have to fill the cachedLFNSize! """ lfns = list(lfns) cachedLFNSize = dict(self.cachedLFNSize) fileSizes = {} for lfn in [lfn for lfn in lfns if lfn in cachedLFNSize]: fileSizes[lfn] = cachedLFNSize[lfn] self.logDebug( "Found cache hit for File size for %d files out of %d" % (len(fileSizes), len(lfns))) lfns = [lfn for lfn in lfns if lfn not in cachedLFNSize] if lfns: fileSizes = self._getFileSizeFromCatalog(lfns, fileSizes) if not fileSizes['OK']: self.logError(fileSizes['Message']) return fileSizes fileSizes = fileSizes['Value'] return S_OK(fileSizes) # @timeThis def _getFileSizeFromCatalog(self, lfns, fileSizes): """ Get file size from the catalog """ lfns = list(lfns) fileSizes = dict(fileSizes) res = self.fc.getFileSize(lfns) if not res['OK']: return S_ERROR("Failed to get sizes for all files: %s" % res['Message']) if res['Value']['Failed']: errorReason = sorted(set(res['Value']['Failed'].values())) self.logWarn("Failed to get sizes for %d files:" % len(res['Value']['Failed']), errorReason) fileSizes.update(res['Value']['Successful']) self.cachedLFNSize.update((res['Value']['Successful'])) self.logVerbose("Got size of %d files from catalog" % len(lfns)) return S_OK(fileSizes) def clearCachedFileSize(self, lfns): """ Utility function """ for lfn in [lfn for lfn in lfns if lfn in self.cachedLFNSize]: self.cachedLFNSize.pop(lfn) def getPluginParam(self, name, default=None): """ Get plugin parameters using specific settings or settings defined in the CS Caution: the type returned is that of the default value """ # get the value of a parameter looking 1st in the CS if default is not None: valueType = type(default) else: valueType = None # First look at a generic value... optionPath = "TransformationPlugins/%s" % (name) value = Operations().getValue(optionPath, None) self.logVerbose("Default plugin param %s: '%s'" % (optionPath, value)) # Then look at a plugin-specific value optionPath = "TransformationPlugins/%s/%s" % (self.plugin, name) value = Operations().getValue(optionPath, value) self.logVerbose("Specific plugin param %s: '%s'" % (optionPath, value)) if value is not None: default = value # Finally look at a transformation-specific parameter value = self.params.get(name, default) self.logVerbose( "Transformation plugin param %s: '%s'. Convert to %s" % (name, value, str(valueType))) if valueType and not isinstance(value, valueType): if valueType is list: try: value = ast.literal_eval(value) if value and value != 'None' else [] # literal_eval('SE-DST') -> ValueError # literal_eval('SE_MC-DST') -> SyntaxError # Don't ask... except (ValueError, SyntaxError): value = [val for val in value.replace(' ', '').split(',') if val] elif valueType is int: value = int(value) elif valueType is float: value = float(value) elif valueType is bool: if value in ('False', 'No', 'None', None, 0): value = False else: value = bool(value) elif valueType is not str: self.logWarn( "Unknown parameter type (%s) for %s, passed as string" % (str(valueType), name)) self.logVerbose("Final plugin param %s: '%s'" % (name, value)) return value @staticmethod def _normaliseShares(originalShares): """ Normalize shares to 1 """ total = sum(float(share) for share in originalShares.values()) return dict([(site, 100. * float(share) / total if total else 0.) 
for site, share in originalShares.items()]) def uniqueSEs(self, ses): """ return a list of SEs that are not physically the same """ newSEs = [] for se in ses: if not self.isSameSEInList(se, newSEs): newSEs.append(se) return newSEs def isSameSE(self, se1, se2): """ Check if 2 SEs are indeed the same. :param se1: name of the first StorageElement :param se2: name of the second StorageElement :returns: True/False if they are considered the same. See :py:mod:`~DIRAC.Resources.Storage.StorageElement.StorageElementItem.isSameSE` """ if se1 == se2: return True return StorageElement(se1).isSameSE(StorageElement(se2)) def isSameSEInList(self, se1, seList): """ Check if an SE is the same as any in a list """ if se1 in seList: return True for se in seList: if self.isSameSE(se1, se): return True return False def closerSEs(self, existingSEs, targetSEs, local=False): """ Order the targetSEs such that the first ones are closer to existingSEs. Keep all elements in targetSEs """ setTarget = set(targetSEs) sameSEs = set([se1 for se1 in setTarget for se2 in existingSEs if self.isSameSE(se1, se2)]) targetSEs = setTarget - set(sameSEs) if targetSEs: # Some SEs are left, look for sites existingSites = [self.dmsHelper.getLocalSiteForSE(se).get('Value') for se in existingSEs] existingSites = set([site for site in existingSites if site]) closeSEs = set([se for se in targetSEs if self.dmsHelper.getLocalSiteForSE(se).get('Value') in existingSites]) # print existingSEs, existingSites, targetSEs, closeSEs otherSEs = targetSEs - closeSEs targetSEs = list(closeSEs) random.shuffle(targetSEs) if not local and otherSEs: otherSEs = list(otherSEs) random.shuffle(otherSEs) targetSEs += otherSEs else: targetSEs = [] return (targetSEs + list(sameSEs)) if not local else targetSEs @staticmethod def seParamtoList(inputParam): """Transform ``inputParam`` to list. :param inputParam: can be string, list, or string representation of list :returns: list """ if not inputParam: return [] if inputParam.count('['): return eval(inputParam) # pylint: disable=eval-used elif isinstance(inputParam, list): return inputParam return [inputParam]
class DataIntegrityClient(Client): """Client exposing the DataIntegrity Service.""" def __init__(self, **kwargs): super(DataIntegrityClient, self).__init__(**kwargs) self.setServer('DataManagement/DataIntegrity') self.dm = DataManager() self.fc = FileCatalog() def setFileProblematic(self, lfn, reason, sourceComponent=''): """ This method updates the status of the file in the FileCatalog and the IntegrityDB lfn - the lfn of the file reason - this is given to the integrity DB and should reflect the problem observed with the file sourceComponent is the component issuing the request. """ if isinstance(lfn, list): lfns = lfn elif isinstance(lfn, six.string_types): lfns = [lfn] else: errStr = "DataIntegrityClient.setFileProblematic: Supplied file info must be list or a single LFN." gLogger.error(errStr) return S_ERROR(errStr) gLogger.info( "DataIntegrityClient.setFileProblematic: Attempting to update %s files." % len(lfns)) fileMetadata = {} for lfn in lfns: fileMetadata[lfn] = { 'Prognosis': reason, 'LFN': lfn, 'PFN': '', 'SE': '' } res = self.insertProblematic(sourceComponent, fileMetadata) if not res['OK']: gLogger.error( "DataIntegrityClient.setReplicaProblematic: Failed to insert problematics to integrity DB" ) return res def reportProblematicReplicas(self, replicaTuple, se, reason): """ Simple wrapper function around setReplicaProblematic """ gLogger.info('The following %s files had %s at %s' % (len(replicaTuple), reason, se)) for lfn, _pfn, se, reason in sorted(replicaTuple): if lfn: gLogger.info(lfn) res = self.setReplicaProblematic(replicaTuple, sourceComponent='DataIntegrityClient') if not res['OK']: gLogger.info('Failed to update integrity DB with replicas', res['Message']) else: gLogger.info('Successfully updated integrity DB with replicas') def setReplicaProblematic(self, replicaTuple, sourceComponent=''): """ This method updates the status of the replica in the FileCatalog and the IntegrityDB The supplied replicaDict should be of the form {lfn :{'PFN':pfn,'SE':se,'Prognosis':prognosis} lfn - the lfn of the file pfn - the pfn if available (otherwise '') se - the storage element of the problematic replica (otherwise '') prognosis - this is given to the integrity DB and should reflect the problem observed with the file sourceComponent is the component issuing the request. """ if isinstance(replicaTuple, tuple): replicaTuple = [replicaTuple] elif isinstance(replicaTuple, list): pass else: errStr = "DataIntegrityClient.setReplicaProblematic: Supplied replica info must be a tuple or list of tuples." gLogger.error(errStr) return S_ERROR(errStr) gLogger.info( "DataIntegrityClient.setReplicaProblematic: Attempting to update %s replicas." % len(replicaTuple)) replicaDict = {} for lfn, pfn, se, reason in replicaTuple: replicaDict[lfn] = { 'Prognosis': reason, 'LFN': lfn, 'PFN': pfn, 'SE': se } res = self.insertProblematic(sourceComponent, replicaDict) if not res['OK']: gLogger.error( "DataIntegrityClient.setReplicaProblematic: Failed to insert problematic to integrity DB" ) return res for lfn in replicaDict.keys(): replicaDict[lfn]['Status'] = 'Problematic' res = self.fc.setReplicaStatus(replicaDict) if not res['OK']: errStr = "DataIntegrityClient.setReplicaProblematic: Completely failed to update replicas." 
gLogger.error(errStr, res['Message']) return res failed = res['Value']['Failed'] successful = res['Value']['Successful'] resDict = {'Successful': successful, 'Failed': failed} return S_OK(resDict) ########################################################################## # # This section contains the resolution methods for various prognoses # def __updateCompletedFiles(self, prognosis, fileID): gLogger.info("%s file (%d) is resolved" % (prognosis, fileID)) return self.setProblematicStatus(fileID, 'Resolved') def __returnProblematicError(self, fileID, res): self.incrementProblematicRetry(fileID) gLogger.error('DataIntegrityClient failure', res['Message']) return res def __updateReplicaToChecked(self, problematicDict): lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] prognosis = problematicDict['Prognosis'] problematicDict['Status'] = 'Checked' res = returnSingleResult( self.fc.setReplicaStatus({lfn: problematicDict})) if not res['OK']: return self.__returnProblematicError(fileID, res) gLogger.info("%s replica (%d) is updated to Checked status" % (prognosis, fileID)) return self.__updateCompletedFiles(prognosis, fileID) def resolveCatalogPFNSizeMismatch(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolved the CatalogPFNSizeMismatch prognosis """ lfn = problematicDict['LFN'] se = problematicDict['SE'] fileID = problematicDict['FileID'] res = returnSingleResult(self.fc.getFileSize(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) catalogSize = res['Value'] res = returnSingleResult(StorageElement(se).getFileSize(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) storageSize = res['Value'] bkKCatalog = FileCatalog(['BookkeepingDB']) res = returnSingleResult(bkKCatalog.getFileSize(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) bookkeepingSize = res['Value'] if bookkeepingSize == catalogSize == storageSize: gLogger.info( "CatalogPFNSizeMismatch replica (%d) matched all registered sizes." % fileID) return self.__updateReplicaToChecked(problematicDict) if catalogSize == bookkeepingSize: gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to mismatch the bookkeeping also" % fileID) res = returnSingleResult(self.fc.getReplicas(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if len(res['Value']) <= 1: gLogger.info( "CatalogPFNSizeMismatch replica (%d) has no other replicas." % fileID) return S_ERROR( "Not removing catalog file mismatch since the only replica" ) else: gLogger.info( "CatalogPFNSizeMismatch replica (%d) has other replicas. Removing..." % fileID) res = self.dm.removeReplica(se, lfn) if not res['OK']: return self.__returnProblematicError(fileID, res) return self.__updateCompletedFiles('CatalogPFNSizeMismatch', fileID) if (catalogSize != bookkeepingSize) and (bookkeepingSize == storageSize): gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to match the bookkeeping size" % fileID) res = self.__updateReplicaToChecked(problematicDict) if not res['OK']: return self.__returnProblematicError(fileID, res) return self.changeProblematicPrognosis(fileID, 'BKCatalogSizeMismatch') gLogger.info( "CatalogPFNSizeMismatch replica (%d) all sizes found mismatch. 
Updating retry count" % fileID) return self.incrementProblematicRetry(fileID) ############################################################################################ def _reportProblematicFiles(self, lfns, reason): """ Simple wrapper function around setFileProblematic """ gLogger.info('The following %s files were found with %s' % (len(lfns), reason)) for lfn in sorted(lfns): gLogger.info(lfn) res = self.setFileProblematic(lfns, reason, sourceComponent='DataIntegrityClient') if not res['OK']: gLogger.info('Failed to update integrity DB with files', res['Message']) else: gLogger.info('Successfully updated integrity DB with files')
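# As the setReplicaProblematic docstring above spells out, the method expects
# (lfn, pfn, se, prognosis) tuples.  A usage sketch follows; the LFN, PFN and SE names are
# invented, and the DIRAC objects used by the class above (DataIntegrityClient, gLogger)
# are assumed to be importable in the calling script.

client = DataIntegrityClient()
replicaTuples = [
    ('/lhcb/data/run1234/file.dst',                        # lfn
     'srm://storage.example/lhcb/data/run1234/file.dst',   # pfn ('' if unknown)
     'CERN-DST',                                           # storage element
     'CatalogPFNSizeMismatch'),                            # prognosis / reason
]
res = client.setReplicaProblematic(replicaTuples, sourceComponent='MyCheckScript')
if not res['OK']:
    gLogger.error('Failed to flag replicas as problematic', res['Message'])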
class helper_TransferAgent(object): def __init__(self, transferAgent, gTransferDB): self.transferAgent =transferAgent self.transferDB = gTransferDB gLogger.info("Creating File Catalog") self.fileCatalog = FileCatalog() def helper_add_transfer(self, result): if not result: gLogger.error("There is no infomation") return False res = self.transferDB.get_TransferRequest(condDict={ "id": result.trans_req_id }) if not res["OK"]: return False req_list = res["Value"] if len(req_list) != 1: return False req = TransRequestEntryWithID._make(req_list[0]) # construct the info info = {"id": result.id, "LFN": result.LFN, "srcSE": req.srcSE, "dstSE": req.dstSE, "retransfer": -1, "error": ""} # Add the Transfer worker = gTransferFactory.generate(req.protocol, info) self.transferAgent.transfer_worker.append(worker) # Change the status self.helper_status_update( self.transferDB.tables["TransferFileList"], result.id, {"status":"transfer", "start_time":datetime.datetime.utcnow()}) # Add Accounting: d = {} d["User"] = req.username d["Source"] = req.srcSE d["Destination"] = req.dstSE d["Protocol"] = req.protocol d["FinalStatus"] = "OK" d["TransferSize"] = 0 # TODO r = self.fileCatalog.getFileSize(result.LFN) if r["OK"]: if r["Value"]["Successful"]: d["TransferSize"] = r["Value"]["Successful"][result.LFN] d["TransferTime"] = 1 # 1s d["TransferOK"] = 1 d["TransferTotal"] = 1 acct_dt = DataTransfer() acct_dt.setValuesFromDict(d) acct_dt.setNowAsStartAndEndTime() # save it worker.acct_dt = acct_dt return True def helper_remove_transfer(self, worker): info = worker.info gLogger.info("File.id = %d -> finish" % info["id"]) self.helper_status_update( self.transferDB.tables["TransferFileList"], info["id"], {"status":"finish", "finish_time": datetime.datetime.utcnow()}) # Accounting acct_dt = worker.acct_dt acct_dt.setEndTime() # TODO d = {} td = acct_dt.endTime-acct_dt.startTime td_s = (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6 d["TransferTime"] = td_s # 1s if info["error"]: d["FinalStatus"] = "FAILED" d["TransferOK"] = 0 else: d["FinalStatus"] = "OK" d["TransferOK"] = 1 acct_dt.setValuesFromDict(d) acct_dt.commit() gLogger.info("Submit Accounting Data") def helper_check_request(self): """ check if the *transfer* request are ok. if the whole files are *finish*, then this request will become *finish*. """ infoDict = {"status": "transfer"} res = self.transferDB.get_TransferRequest(condDict = infoDict) if not res["OK"]: return reqlist = map(TransRequestEntryWithID._make, res["Value"]) for req in reqlist: res = self.transferDB._query( 'select count(*) from %(table)s where trans_req_id = %(id)d and status not in %(status_list)s' % { "table": self.transferDB.tables["TransferFileList"], "id": req.id, "status_list": '("finish", "kill")' # XXX finish or kill means this request is ok. } ) if not res["OK"]: # TODO continue count = res["Value"][0][0] if count == 0: # if all status is finish, # the req status --> finish gLogger.info("req.id %d change from %s to finish" % (req.id, req.status)) self.helper_status_update( self.transferDB.tables["TransferRequest"], req.id, {"status":"finish"}) return def helper_get_new_request(self): # 1. get the *new* File in the <<Transfer File List>>. # if we get, goto <<Add New Transfer>> already_load_status = False result_new_file = self.helper_get_new_File() # 1.1 2014.04.20 # They want to the other requests are also loaded, # so I have to not return immediately if result_new_file: already_load_status = True # 2. 
if we can't get, use should get a *new* request # from the <<Transfer Request>>. # if we can't get, return False. STOP self.helper_check_request() result = self.helper_get_new_request_entry() if result: # 3. add the filelist in the dataset to the << Transfer File List >> condDict = {"name":result.dataset} res = self.transferDB.get_Dataset(condDict) if not res["OK"]: gLogger.error(res) return None filelist = res["Value"] # update the status in << Request >> if len(filelist) > 0: req_status = "transfer" else: req_status = "finish" self.helper_status_update(self.transferDB.tables["TransferRequest"], result.id, {"status":req_status}) self.transferDB.insert_TransferFileList(result.id, filelist) # 4. get the *new* File Again. # 5. can't get, return False. STOP # 4.prelude # If already loaded, return the last result if already_load_status and result_new_file: return result_new_file result = self.helper_get_new_File() return result def helper_get_new_request_entry(self): """ TransRequestEntryWithID( id=1L, username='******', dataset='my-dataset', srcSE='IHEP-USER', dstSE='IHEPD-USER', submit_time=datetime.datetime(2013, 3, 13, 20, 9, 34), status='new') """ condDict = {"status": "new"} res = self.transferDB.get_TransferRequest(condDict) if not res["OK"]: return None req_list = res["Value"] len_req = len(req_list) if len_req: # random select tmp_idx = random.randint(0, len_req-1) return TransRequestEntryWithID._make(req_list[tmp_idx]) pass def helper_get_new_File(self): """ >>> helper.helper_get_new_File() TransFileListEntryWithID( id=1L, LFN='/path/does/not/exist', trans_req_id=1L, start_time=None, finish_time=None, status='new') """ condDict = {"status": "new"} res = self.transferDB.get_TransferFileList(condDict) if not res["OK"]: gLogger.error(res) return None filelist = res["Value"] gLogger.info("Filelist:") gLogger.info(filelist) len_files = len(filelist) if len_files > 0: tmp_idx = random.randint(0, len_files-1) gLogger.info("get file entry index randomly: %d/%d"%(tmp_idx, len_files)) gLogger.info("get file entry", filelist[tmp_idx]) return TransFileListEntryWithID._make(filelist[tmp_idx]) return None def helper_status_update(self, table, id, toUpdate): res = self.transferDB.updateFields( table, updateDict = toUpdate, condDict = {"id":id}, ) print res def helper_error_report(self, worker, reason): self.helper_status_update(self.transferDB.tables["TransferFileList"], worker.info["id"], {"error": reason}) def check_worker_status(self, worker): """check whether the file transfer is kill(in DB)""" res = self.transferDB.getFields(self.transferDB.tables["TransferFileList"], outFields = ["status"], condDict = {"id":worker.info["id"]}) if not res["OK"]: gLogger.error(res) return if not res["Value"]: return if len(res["Value"]) != 1: gLogger.error[res] return status = res["Value"][0][0] if status == "kill": gLogger.info("check worker should be killed: ", status) worker.proc.kill()
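# The TransRequestEntryWithID / TransFileListEntryWithID records used above behave like
# namedtuples built straight from DB rows via _make.  Their real definitions are not part
# of this snippet, so the declarations below are reconstructed from the field lists shown
# in the docstrings; treat them as an assumption, not the project's actual code.

from collections import namedtuple

TransRequestEntryWithID = namedtuple(
    'TransRequestEntryWithID',
    ['id', 'username', 'dataset', 'srcSE', 'dstSE', 'submit_time', 'status'])
TransFileListEntryWithID = namedtuple(
    'TransFileListEntryWithID',
    ['id', 'LFN', 'trans_req_id', 'start_time', 'finish_time', 'status'])

# A DB row comes back as a plain tuple; _make maps it onto the named fields.
row = (1, 'someuser', 'my-dataset', 'IHEP-USER', 'IHEPD-USER', None, 'new')
req = TransRequestEntryWithID._make(row)
print("%s -> %s" % (req.srcSE, req.dstSE))  # IHEP-USER -> IHEPD-USER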
class TransformationPlugin( PluginBase ): """ A TransformationPlugin object should be instantiated by every transformation. """ def __init__( self, plugin, transClient = None, dataManager = None ): """ plugin name has to be passed in: it will then be executed as one of the functions below, e.g. plugin = 'BySize' will execute TransformationPlugin('BySize')._BySize() """ super( TransformationPlugin, self ).__init__( plugin ) self.data = {} self.files = False if transClient is None: self.transClient = TransformationClient() else: self.transClient = transClient if dataManager is None: self.dm = DataManager() else: self.dm = dataManager self.fc = FileCatalog() def isOK( self ): self.valid = True if ( not self.data ) or ( not self.params ): self.valid = False return self.valid def setInputData( self, data ): self.data = data def setTransformationFiles( self, files ): #TODO ADDED self.files = files def _Standard( self ): """ Simply group by replica location """ res = self._groupByReplicas() if not res['OK']: return res newTasks = [] for _se, lfns in res['Value']: newTasks.append( ( '', lfns ) ) return S_OK( newTasks ) def _BySize( self ): """ Alias for groupBySize """ return self._groupBySize() def _Broadcast( self ): """ This plug-in takes files found at the sourceSE and broadcasts to all (or a selection of) targetSEs. """ if not self.params: return S_ERROR( "TransformationPlugin._Broadcast: The 'Broadcast' plugin requires additional parameters." ) targetseParam = self.params['TargetSE'] targetSEs = [] sourceSEs = eval( self.params['SourceSE'] ) if targetseParam.count( '[' ): targetSEs = eval( targetseParam ) elif type(targetseParam)==type([]): targetSEs = targetseParam else: targetSEs = [targetseParam] #sourceSEs = eval(self.params['SourceSE']) #targetSEs = eval(self.params['TargetSE']) destinations = int( self.params.get( 'Destinations', 0 ) ) if destinations and ( destinations >= len(targetSEs) ): destinations = 0 status = self.params['Status'] groupSize = self.params['GroupSize']#Number of files per tasks fileGroups = self._getFileGroups( self.data )#groups by SE targetSELfns = {} for replicaSE, lfns in fileGroups.items(): ses = replicaSE.split( ',' ) #sourceSites = self._getSitesForSEs(ses) atSource = False for se in ses: if se in sourceSEs: atSource = True if not atSource: continue for lfn in lfns: targets = [] sources = self._getSitesForSEs( ses ) random.shuffle( targetSEs ) for targetSE in targetSEs: site = self._getSiteForSE( targetSE )['Value'] if not site in sources: if ( destinations ) and ( len( targets ) >= destinations ): continue sources.append( site ) targets.append( targetSE )#after all, if someone wants to copy to the source, it's his choice strTargetSEs = str.join( ',', sorted( targets ) ) if not targetSELfns.has_key( strTargetSEs ): targetSELfns[strTargetSEs] = [] targetSELfns[strTargetSEs].append( lfn ) tasks = [] for ses, lfns in targetSELfns.items(): tasksLfns = breakListIntoChunks(lfns, groupSize) for taskLfns in tasksLfns: if ( status == 'Flush' ) or ( len( taskLfns ) >= int( groupSize ) ): #do not allow groups smaller than the groupSize, except if transformation is in flush state tasks.append( ( ses, taskLfns ) ) return S_OK( tasks ) def _ByShare( self, shareType = 'CPU' ): """ first get the shares from the CS, and then makes the grouping looking at the history """ res = self._getShares( shareType, normalise = True ) if not res['OK']: return res cpuShares = res['Value'] gLogger.info( "Obtained the following target shares (%):" ) for site in sorted( cpuShares.keys() ): 
gLogger.info( "%s: %.1f" % ( site.ljust( 15 ), cpuShares[site] ) ) # Get the existing destinations from the transformationDB res = self._getExistingCounters( requestedSites = cpuShares.keys() ) if not res['OK']: gLogger.error( "Failed to get existing file share", res['Message'] ) return res existingCount = res['Value'] if existingCount: gLogger.info( "Existing site utilization (%):" ) normalisedExistingCount = self._normaliseShares( existingCount.copy() ) for se in sorted( normalisedExistingCount.keys() ): gLogger.info( "%s: %.1f" % ( se.ljust( 15 ), normalisedExistingCount[se] ) ) # Group the input files by their existing replicas res = self._groupByReplicas() if not res['OK']: return res replicaGroups = res['Value'] tasks = [] # For the replica groups for replicaSE, lfns in replicaGroups: possibleSEs = replicaSE.split( ',' ) # Determine the next site based on requested shares, existing usage and candidate sites res = self._getNextSite( existingCount, cpuShares, candidates = self._getSitesForSEs( possibleSEs ) ) if not res['OK']: gLogger.error( "Failed to get next destination SE", res['Message'] ) continue targetSite = res['Value'] # Resolve the ses for the target site res = getSEsForSite( targetSite ) if not res['OK']: continue ses = res['Value'] # Determine the selected SE and create the task for chosenSE in ses: if chosenSE in possibleSEs: tasks.append( ( chosenSE, lfns ) ) if not existingCount.has_key( targetSite ): existingCount[targetSite] = 0 existingCount[targetSite] += len( lfns ) return S_OK( tasks ) def _getShares( self, shareType, normalise = False ): """ Takes share from the CS, eventually normalize them """ res = gConfig.getOptionsDict( '/Resources/Shares/%s' % shareType ) if not res['OK']: return res if not res['Value']: return S_ERROR( "/Resources/Shares/%s option contains no shares" % shareType ) shares = res['Value'] for site, value in shares.items(): shares[site] = float( value ) if normalise: shares = self._normaliseShares( shares ) if not shares: return S_ERROR( "No non-zero shares defined" ) return S_OK( shares ) def _getExistingCounters( self, normalise = False, requestedSites = [] ): res = self.transClient.getCounters( 'TransformationFiles', ['UsedSE'], {'TransformationID':self.params['TransformationID']} ) if not res['OK']: return res usageDict = {} for usedDict, count in res['Value']: usedSE = usedDict['UsedSE'] if usedSE != 'Unknown': usageDict[usedSE] = count if requestedSites: siteDict = {} for se, count in usageDict.items(): res = getSitesForSE( se, gridName = 'LCG' ) if not res['OK']: return res for site in res['Value']: if site in requestedSites: siteDict[site] = count usageDict = siteDict.copy() if normalise: usageDict = self._normaliseShares( usageDict ) return S_OK( usageDict ) @classmethod def _normaliseShares( self, originalShares ): shares = originalShares.copy() total = 0.0 for site in shares.keys(): share = float( shares[site] ) shares[site] = share total += share for site in shares.keys(): share = 100.0 * ( shares[site] / total ) shares[site] = share return shares def _getNextSite( self, existingCount, cpuShares, candidates = [] ): # normalise the shares siteShare = self._normaliseShares( existingCount ) # then fill the missing share values to 0 for site in cpuShares.keys(): if ( not siteShare.has_key( site ) ): siteShare[site] = 0.0 # determine which site is furthest from its share chosenSite = '' minShareShortFall = -float( "inf" ) for site, cpuShare in cpuShares.items(): if ( candidates ) and not ( site in candidates ): continue if not cpuShare: 
continue existingShare = siteShare[site] shareShortFall = cpuShare - existingShare if shareShortFall > minShareShortFall: minShareShortFall = shareShortFall chosenSite = site return S_OK( chosenSite ) def _groupByReplicas( self ): """ Generates a job based on the location of the input data """ if not self.params: return S_ERROR( "TransformationPlugin._Standard: The 'Standard' plug-in requires parameters." ) status = self.params['Status'] groupSize = self.params['GroupSize'] # Group files by SE fileGroups = self._getFileGroups( self.data ) # Create tasks based on the group size tasks = [] for replicaSE in sorted( fileGroups.keys() ): lfns = fileGroups[replicaSE] tasksLfns = breakListIntoChunks( lfns, groupSize ) for taskLfns in tasksLfns: if ( status == 'Flush' ) or ( len( taskLfns ) >= int( groupSize ) ): tasks.append( ( replicaSE, taskLfns ) ) return S_OK( tasks ) def _groupBySize( self ): """ Generate a task for a given amount of data """ if not self.params: return S_ERROR( "TransformationPlugin._BySize: The 'BySize' plug-in requires parameters." ) status = self.params['Status'] requestedSize = float( self.params['GroupSize'] ) * 1000 * 1000 * 1000 # input size in GB converted to bytes maxFiles = self.params.get( 'MaxFiles', 100 ) # Group files by SE fileGroups = self._getFileGroups( self.data ) # Get the file sizes res = self.fc.getFileSize( self.data ) if not res['OK']: return S_ERROR( "Failed to get sizes for files" ) if res['Value']['Failed']: return S_ERROR( "Failed to get sizes for all files" ) fileSizes = res['Value']['Successful'] tasks = [] for replicaSE, lfns in fileGroups.items(): taskLfns = [] taskSize = 0 for lfn in lfns: taskSize += fileSizes[lfn] taskLfns.append( lfn ) if ( taskSize > requestedSize ) or ( len( taskLfns ) >= maxFiles ): tasks.append( ( replicaSE, taskLfns ) ) taskLfns = [] taskSize = 0 if ( status == 'Flush' ) and taskLfns: tasks.append( ( replicaSE, taskLfns ) ) return S_OK( tasks ) @classmethod def _getFileGroups( cls, fileReplicas ): """ get file groups dictionary { "SE1,SE2,SE3" : [ lfn1, lfn2 ], ... } :param dict fileReplicas: { lfn : [SE1, SE2, SE3], ... } """ fileGroups = {} for lfn, replicas in fileReplicas.items(): replicaSEs = ",".join( sorted( list( set( replicas ) ) ) ) if replicaSEs not in fileGroups: fileGroups[replicaSEs] = [] fileGroups[replicaSEs].append( lfn ) return fileGroups @classmethod def _getSiteForSE( cls, se ): """ Get site name for the given SE """ result = getSitesForSE( se, gridName = 'LCG' ) if not result['OK']: return result if result['Value']: return S_OK( result['Value'][0] ) return S_OK( '' ) @classmethod def _getSitesForSEs( cls, seList ): """ Get all the sites for the given SE list """ sites = [] for se in seList: result = getSitesForSE( se, gridName = 'LCG' ) if result['OK']: sites += result['Value'] return sites
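# _getFileGroups is what turns the per-LFN replica map into the per-SE-combination groups
# that _groupByReplicas and _Broadcast iterate over.  A small standalone illustration of
# that grouping; the LFNs and SE names are made up.

def get_file_groups( fileReplicas ):
  # Same grouping as TransformationPlugin._getFileGroups:
  # { lfn : [SE1, SE2] } -> { "SE1,SE2" : [lfn, ...] }
  fileGroups = {}
  for lfn, replicas in fileReplicas.items():
    replicaSEs = ",".join( sorted( set( replicas ) ) )
    fileGroups.setdefault( replicaSEs, [] ).append( lfn )
  return fileGroups

example = { '/this/is/at.1' : ['SE1'],
            '/this/is/at.12' : ['SE1', 'SE2'],
            '/this/is/at.2' : ['SE2'] }
print( get_file_groups( example ) )
# {'SE1': ['/this/is/at.1'], 'SE1,SE2': ['/this/is/at.12'], 'SE2': ['/this/is/at.2']}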
class DataIntegrityClient( Client ): """ The following methods are supported in the service but are not mentioned explicitly here: getProblematic() Obtains a problematic file from the IntegrityDB based on the LastUpdate time getPrognosisProblematics(prognosis) Obtains all the problematics of a particular prognosis from the integrityDB getProblematicsSummary() Obtains a count of the number of problematics for each prognosis found getDistinctPrognosis() Obtains the distinct prognosis found in the integrityDB getTransformationProblematics(prodID) Obtains the problematics for a given production incrementProblematicRetry(fileID) Increments the retry count for the supplied file ID changeProblematicPrognosis(fileID,newPrognosis) Changes the prognosis of the supplied file to the new prognosis setProblematicStatus(fileID,status) Updates the status of a problematic in the integrityDB removeProblematic(self,fileID) This removes the specified file ID from the integrity DB insertProblematic(sourceComponent,fileMetadata) Inserts file with supplied metadata into the integrity DB """ def __init__( self, **kwargs ): super( DataIntegrityClient, self ).__init__( **kwargs ) self.setServer( 'DataManagement/DataIntegrity' ) self.dm = DataManager() self.fc = FileCatalog() def setFileProblematic( self, lfn, reason, sourceComponent = '' ): """ This method updates the status of the file in the FileCatalog and the IntegrityDB lfn - the lfn of the file reason - this is given to the integrity DB and should reflect the problem observed with the file sourceComponent is the component issuing the request. """ if isinstance( lfn, list ): lfns = lfn elif isinstance( lfn, basestring ): lfns = [lfn] else: errStr = "DataIntegrityClient.setFileProblematic: Supplied file info must be list or a single LFN." gLogger.error( errStr ) return S_ERROR( errStr ) gLogger.info( "DataIntegrityClient.setFileProblematic: Attempting to update %s files." % len( lfns ) ) fileMetadata = {} for lfn in lfns: fileMetadata[lfn] = {'Prognosis':reason, 'LFN':lfn, 'PFN':'', 'SE':''} res = self.insertProblematic( sourceComponent, fileMetadata ) if not res['OK']: gLogger.error( "DataIntegrityClient.setReplicaProblematic: Failed to insert problematics to integrity DB" ) return res def reportProblematicReplicas( self, replicaTuple, se, reason ): """ Simple wrapper function around setReplicaProblematic """ gLogger.info( 'The following %s files had %s at %s' % ( len( replicaTuple ), reason, se ) ) for lfn, _pfn, se, reason in sorted( replicaTuple ): if lfn: gLogger.info( lfn ) res = self.setReplicaProblematic( replicaTuple, sourceComponent = 'DataIntegrityClient' ) if not res['OK']: gLogger.info( 'Failed to update integrity DB with replicas', res['Message'] ) else: gLogger.info( 'Successfully updated integrity DB with replicas' ) def setReplicaProblematic( self, replicaTuple, sourceComponent = '' ): """ This method updates the status of the replica in the FileCatalog and the IntegrityDB The supplied replicaDict should be of the form {lfn :{'PFN':pfn,'SE':se,'Prognosis':prognosis} lfn - the lfn of the file pfn - the pfn if available (otherwise '') se - the storage element of the problematic replica (otherwise '') prognosis - this is given to the integrity DB and should reflect the problem observed with the file sourceComponent is the component issuing the request. 
""" if isinstance( replicaTuple, tuple ): replicaTuple = [replicaTuple] elif isinstance( replicaTuple, list ): pass else: errStr = "DataIntegrityClient.setReplicaProblematic: Supplied replica info must be a tuple or list of tuples." gLogger.error( errStr ) return S_ERROR( errStr ) gLogger.info( "DataIntegrityClient.setReplicaProblematic: Attempting to update %s replicas." % len( replicaTuple ) ) replicaDict = {} for lfn, pfn, se, reason in replicaTuple: replicaDict[lfn] = {'Prognosis':reason, 'LFN':lfn, 'PFN':pfn, 'SE':se} res = self.insertProblematic( sourceComponent, replicaDict ) if not res['OK']: gLogger.error( "DataIntegrityClient.setReplicaProblematic: Failed to insert problematic to integrity DB" ) return res for lfn in replicaDict.keys(): replicaDict[lfn]['Status'] = 'Problematic' res = self.fc.setReplicaStatus( replicaDict ) if not res['OK']: errStr = "DataIntegrityClient.setReplicaProblematic: Completely failed to update replicas." gLogger.error( errStr, res['Message'] ) return res failed = res['Value']['Failed'] successful = res['Value']['Successful'] resDict = {'Successful':successful, 'Failed':failed} return S_OK( resDict ) ########################################################################## # # This section contains the resolution methods for various prognoses # def __updateCompletedFiles( self, prognosis, fileID ): gLogger.info( "%s file (%d) is resolved" % ( prognosis, fileID ) ) return self.setProblematicStatus( fileID, 'Resolved' ) def __returnProblematicError( self, fileID, res ): self.incrementProblematicRetry( fileID ) gLogger.error( 'DataIntegrityClient failure', res['Message'] ) return res def __updateReplicaToChecked( self, problematicDict ): lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] prognosis = problematicDict['Prognosis'] problematicDict['Status'] = 'Checked' res = returnSingleResult( self.fc.setReplicaStatus( {lfn:problematicDict} ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) gLogger.info( "%s replica (%d) is updated to Checked status" % ( prognosis, fileID ) ) return self.__updateCompletedFiles( prognosis, fileID ) def resolveCatalogPFNSizeMismatch( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolved the CatalogPFNSizeMismatch prognosis """ lfn = problematicDict['LFN'] se = problematicDict['SE'] fileID = problematicDict['FileID'] res = returnSingleResult( self.fc.getFileSize( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) catalogSize = res['Value'] res = returnSingleResult( StorageElement( se ).getFileSize( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) storageSize = res['Value'] bkKCatalog = FileCatalog( ['BookkeepingDB'] ) res = returnSingleResult( bkKCatalog.getFileSize( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) bookkeepingSize = res['Value'] if bookkeepingSize == catalogSize == storageSize: gLogger.info( "CatalogPFNSizeMismatch replica (%d) matched all registered sizes." % fileID ) return self.__updateReplicaToChecked( problematicDict ) if catalogSize == bookkeepingSize: gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to mismatch the bookkeeping also" % fileID ) res = returnSingleResult( self.fc.getReplicas( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if len( res['Value'] ) <= 1: gLogger.info( "CatalogPFNSizeMismatch replica (%d) has no other replicas." 
% fileID ) return S_ERROR( "Not removing catalog file mismatch since it is the only replica" ) else: gLogger.info( "CatalogPFNSizeMismatch replica (%d) has other replicas. Removing..." % fileID ) res = self.dm.removeReplica( se, lfn ) if not res['OK']: return self.__returnProblematicError( fileID, res ) return self.__updateCompletedFiles( 'CatalogPFNSizeMismatch', fileID ) if ( catalogSize != bookkeepingSize ) and ( bookkeepingSize == storageSize ): gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to match the bookkeeping size" % fileID ) res = self.__updateReplicaToChecked( problematicDict ) if not res['OK']: return self.__returnProblematicError( fileID, res ) return self.changeProblematicPrognosis( fileID, 'BKCatalogSizeMismatch' ) gLogger.info( "CatalogPFNSizeMismatch replica (%d) all sizes found to mismatch. Updating retry count" % fileID ) return self.incrementProblematicRetry( fileID ) ############################################################################################ def _reportProblematicFiles( self, lfns, reason ): """ Simple wrapper function around setFileProblematic """ gLogger.info( 'The following %s files were found with %s' % ( len( lfns ), reason ) ) for lfn in sorted( lfns ): gLogger.info( lfn ) res = self.setFileProblematic( lfns, reason, sourceComponent = 'DataIntegrityClient' ) if not res['OK']: gLogger.info( 'Failed to update integrity DB with files', res['Message'] ) else: gLogger.info( 'Successfully updated integrity DB with files' )
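# Illustrative sketch only: the decision table applied by resolveCatalogPFNSizeMismatch
# above, expressed as a standalone helper. It is not used by the client; the function
# name and arguments are invented for this example.
def _sizeMismatchAction( catalogSize, storageSize, bookkeepingSize, nReplicas ):
  if bookkeepingSize == catalogSize == storageSize:
    return 'update replica to Checked'  # all three sizes agree
  if catalogSize == bookkeepingSize:
    # the storage copy disagrees with both catalogues
    return 'remove replica' if nReplicas > 1 else 'keep the only replica'
  if bookkeepingSize == storageSize:
    return 'change prognosis to BKCatalogSizeMismatch'  # the catalogue entry is the odd one out
  return 'increment retry count'  # all three sizes disagree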
class DataIntegrityClient( Client ): """ The following methods are supported in the service but are not mentioned explicitly here: getProblematic() Obtains a problematic file from the IntegrityDB based on the LastUpdate time getPrognosisProblematics(prognosis) Obtains all the problematics of a particular prognosis from the integrityDB getProblematicsSummary() Obtains a count of the number of problematics for each prognosis found getDistinctPrognosis() Obtains the distinct prognosis found in the integrityDB getTransformationProblematics(prodID) Obtains the problematics for a given production incrementProblematicRetry(fileID) Increments the retry count for the supplied file ID changeProblematicPrognosis(fileID,newPrognosis) Changes the prognosis of the supplied file to the new prognosis setProblematicStatus(fileID,status) Updates the status of a problematic in the integrityDB removeProblematic(self,fileID) This removes the specified file ID from the integrity DB insertProblematic(sourceComponent,fileMetadata) Inserts file with supplied metadata into the integrity DB """ def __init__( self, **kwargs ): Client.__init__( self, **kwargs ) self.setServer( 'DataManagement/DataIntegrity' ) self.dm = DataManager() self.fc = FileCatalog() ########################################################################## # # This section contains the specific methods for LFC->SE checks # def catalogDirectoryToSE( self, lfnDir ): """ This obtains the replica and metadata information from the catalog for the supplied directory and checks against the storage elements. """ gLogger.info( "-" * 40 ) gLogger.info( "Performing the LFC->SE check" ) gLogger.info( "-" * 40 ) if type( lfnDir ) in types.StringTypes: lfnDir = [lfnDir] res = self.__getCatalogDirectoryContents( lfnDir ) if not res['OK']: return res replicas = res['Value']['Replicas'] catalogMetadata = res['Value']['Metadata'] res = self.__checkPhysicalFiles( replicas, catalogMetadata ) if not res['OK']: return res resDict = {'CatalogMetadata':catalogMetadata, 'CatalogReplicas':replicas} return S_OK( resDict ) def catalogFileToSE( self, lfns ): """ This obtains the replica and metadata information from the catalog and checks against the storage elements. """ gLogger.info( "-" * 40 ) gLogger.info( "Performing the LFC->SE check" ) gLogger.info( "-" * 40 ) if type( lfns ) in types.StringTypes: lfns = [lfns] res = self.__getCatalogMetadata( lfns ) if not res['OK']: return res catalogMetadata = res['Value'] res = self.__getCatalogReplicas( catalogMetadata.keys() ) if not res['OK']: return res replicas = res['Value'] res = self.__checkPhysicalFiles( replicas, catalogMetadata ) if not res['OK']: return res resDict = {'CatalogMetadata':catalogMetadata, 'CatalogReplicas':replicas} return S_OK( resDict ) def checkPhysicalFiles( self, replicas, catalogMetadata, ses = [] ): """ This obtains takes the supplied replica and metadata information obtained from the catalog and checks against the storage elements. 
""" gLogger.info( "-" * 40 ) gLogger.info( "Performing the LFC->SE check" ) gLogger.info( "-" * 40 ) return self.__checkPhysicalFiles( replicas, catalogMetadata, ses = ses ) def __checkPhysicalFiles( self, replicas, catalogMetadata, ses = [] ): """ This obtains the physical file metadata and checks the metadata against the catalog entries """ sePfns = {} pfnLfns = {} for lfn, replicaDict in replicas.items(): for se, pfn in replicaDict.items(): if ( ses ) and ( se not in ses ): continue if not sePfns.has_key( se ): sePfns[se] = [] sePfns[se].append( pfn ) pfnLfns[pfn] = lfn gLogger.info( '%s %s' % ( 'Storage Element'.ljust( 20 ), 'Replicas'.rjust( 20 ) ) ) for site in sortList( sePfns.keys() ): files = len( sePfns[site] ) gLogger.info( '%s %s' % ( site.ljust( 20 ), str( files ).rjust( 20 ) ) ) for se in sortList( sePfns.keys() ): pfns = sePfns[se] pfnDict = {} for pfn in pfns: pfnDict[pfn] = pfnLfns[pfn] sizeMismatch = [] res = self.__checkPhysicalFileMetadata( pfnDict, se ) if not res['OK']: gLogger.error( 'Failed to get physical file metadata.', res['Message'] ) return res for pfn, metadata in res['Value'].items(): if catalogMetadata.has_key( pfnLfns[pfn] ): if ( metadata['Size'] != catalogMetadata[pfnLfns[pfn]]['Size'] ) and ( metadata['Size'] != 0 ): sizeMismatch.append( ( pfnLfns[pfn], pfn, se, 'CatalogPFNSizeMismatch' ) ) if sizeMismatch: self.__reportProblematicReplicas( sizeMismatch, se, 'CatalogPFNSizeMismatch' ) return S_OK() def __checkPhysicalFileMetadata( self, pfnLfns, se ): """ Check obtain the physical file metadata and check the files are available """ gLogger.info( 'Checking the integrity of %s physical files at %s' % ( len( pfnLfns ), se ) ) res = StorageElement( se ).getFileMetadata( pfnLfns.keys() ) if not res['OK']: gLogger.error( 'Failed to get metadata for pfns.', res['Message'] ) return res pfnMetadataDict = res['Value']['Successful'] # If the replicas are completely missing missingReplicas = [] for pfn, reason in res['Value']['Failed'].items(): if re.search( 'File does not exist', reason ): missingReplicas.append( ( pfnLfns[pfn], pfn, se, 'PFNMissing' ) ) if missingReplicas: self.__reportProblematicReplicas( missingReplicas, se, 'PFNMissing' ) lostReplicas = [] unavailableReplicas = [] zeroSizeReplicas = [] # If the files are not accessible for pfn, pfnMetadata in pfnMetadataDict.items(): if pfnMetadata['Lost']: lostReplicas.append( ( pfnLfns[pfn], pfn, se, 'PFNLost' ) ) if pfnMetadata['Unavailable']: unavailableReplicas.append( ( pfnLfns[pfn], pfn, se, 'PFNUnavailable' ) ) if pfnMetadata['Size'] == 0: zeroSizeReplicas.append( ( pfnLfns[pfn], pfn, se, 'PFNZeroSize' ) ) if lostReplicas: self.__reportProblematicReplicas( lostReplicas, se, 'PFNLost' ) if unavailableReplicas: self.__reportProblematicReplicas( unavailableReplicas, se, 'PFNUnavailable' ) if zeroSizeReplicas: self.__reportProblematicReplicas( zeroSizeReplicas, se, 'PFNZeroSize' ) gLogger.info( 'Checking the integrity of physical files at %s complete' % se ) return S_OK( pfnMetadataDict ) ########################################################################## # # This section contains the specific methods for SE->LFC checks # def storageDirectoryToCatalog( self, lfnDir, storageElement ): """ This obtains the file found on the storage element in the supplied directories and determines whether they exist in the catalog and checks their metadata elements """ gLogger.info( "-" * 40 ) gLogger.info( "Performing the SE->LFC check at %s" % storageElement ) gLogger.info( "-" * 40 ) if type( lfnDir ) in 
types.StringTypes: lfnDir = [lfnDir] res = self.__getStorageDirectoryContents( lfnDir, storageElement ) if not res['OK']: return res storageFileMetadata = res['Value'] if storageFileMetadata: return self.__checkCatalogForSEFiles( storageFileMetadata, storageElement ) return S_OK( {'CatalogMetadata':{}, 'StorageMetadata':{}} ) def __checkCatalogForSEFiles( self, storageMetadata, storageElement ): gLogger.info( 'Checking %s storage files exist in the catalog' % len( storageMetadata ) ) # RF_NOTE : this comment is completely wrong # First get all the PFNs as they should be registered in the catalog res = StorageElement( storageElement ).getPfnForProtocol( storageMetadata.keys(), withPort = False ) if not res['OK']: gLogger.error( "Failed to get registered PFNs for physical files", res['Message'] ) return res for pfn, error in res['Value']['Failed'].items(): gLogger.error( 'Failed to obtain registered PFN for physical file', '%s %s' % ( pfn, error ) ) if res['Value']['Failed']: return S_ERROR( 'Failed to obtain registered PFNs from physical file' ) for original, registered in res['Value']['Successful'].items(): storageMetadata[registered] = storageMetadata.pop( original ) # Determine whether these PFNs are registered and if so obtain the LFN res = self.fc.getLFNForPFN( storageMetadata.keys() ) if not res['OK']: gLogger.error( "Failed to get registered LFNs for PFNs", res['Message'] ) return res failedPfns = res['Value']['Failed'] notRegisteredPfns = [] for pfn, error in failedPfns.items(): if re.search( 'No such file or directory', error ): notRegisteredPfns.append( ( storageMetadata[pfn]['LFN'], pfn, storageElement, 'PFNNotRegistered' ) ) failedPfns.pop( pfn ) if notRegisteredPfns: self.__reportProblematicReplicas( notRegisteredPfns, storageElement, 'PFNNotRegistered' ) if failedPfns: return S_ERROR( 'Failed to obtain LFNs for PFNs' ) pfnLfns = res['Value']['Successful'] for pfn in storageMetadata.keys(): pfnMetadata = storageMetadata.pop( pfn ) if pfn in pfnLfns.keys(): lfn = pfnLfns[pfn] storageMetadata[lfn] = pfnMetadata storageMetadata[lfn]['PFN'] = pfn # For the LFNs found to be registered obtain the file metadata from the catalog and verify against the storage metadata res = self.__getCatalogMetadata( storageMetadata.keys() ) if not res['OK']: return res catalogMetadata = res['Value'] sizeMismatch = [] for lfn, lfnCatalogMetadata in catalogMetadata.items(): lfnStorageMetadata = storageMetadata[lfn] if ( lfnStorageMetadata['Size'] != lfnCatalogMetadata['Size'] ) and ( lfnStorageMetadata['Size'] != 0 ): sizeMismatch.append( ( lfn, storageMetadata[lfn]['PFN'], storageElement, 'CatalogPFNSizeMismatch' ) ) if sizeMismatch: self.__reportProblematicReplicas( sizeMismatch, storageElement, 'CatalogPFNSizeMismatch' ) gLogger.info( 'Checking storage files exist in the catalog complete' ) resDict = {'CatalogMetadata':catalogMetadata, 'StorageMetadata':storageMetadata} return S_OK( resDict ) def getStorageDirectoryContents( self, lfnDir, storageElement ): """ This obtains takes the supplied lfn directories and recursively obtains the files in the supplied storage element """ return self.__getStorageDirectoryContents( lfnDir, storageElement ) def __getStorageDirectoryContents( self, lfnDir, storageElement ): """ Obtians the contents of the supplied directory on the storage """ gLogger.info( 'Obtaining the contents for %s directories at %s' % ( len( lfnDir ), storageElement ) ) se = StorageElement( storageElement ) res = se.getPfnForLfn( lfnDir ) if not res['OK']: gLogger.error( "Failed to get PFNs 
for directories", res['Message'] ) return res for directory, error in res['Value']['Failed'].items(): gLogger.error( 'Failed to obtain directory PFN from LFNs', '%s %s' % ( directory, error ) ) if res['Value']['Failed']: return S_ERROR( 'Failed to obtain directory PFN from LFNs' ) storageDirectories = res['Value']['Successful'].values() res = se.exists( storageDirectories ) if not res['OK']: gLogger.error( "Failed to obtain existance of directories", res['Message'] ) return res for directory, error in res['Value']['Failed'].items(): gLogger.error( 'Failed to determine existance of directory', '%s %s' % ( directory, error ) ) if res['Value']['Failed']: return S_ERROR( 'Failed to determine existance of directory' ) directoryExists = res['Value']['Successful'] activeDirs = [] for directory in sortList( directoryExists.keys() ): exists = directoryExists[directory] if exists: activeDirs.append( directory ) allFiles = {} while len( activeDirs ) > 0: currentDir = activeDirs[0] res = se.listDirectory( currentDir ) activeDirs.remove( currentDir ) if not res['OK']: gLogger.error( 'Failed to get directory contents', res['Message'] ) return res elif res['Value']['Failed'].has_key( currentDir ): gLogger.error( 'Failed to get directory contents', '%s %s' % ( currentDir, res['Value']['Failed'][currentDir] ) ) return S_ERROR( res['Value']['Failed'][currentDir] ) else: dirContents = res['Value']['Successful'][currentDir] activeDirs.extend( dirContents['SubDirs'] ) fileMetadata = dirContents['Files'] # RF_NOTE This ugly trick is needed because se.getPfnPath does not follow the Successful/Failed convention # res = { "Successful" : {}, "Failed" : {} } # for pfn in fileMetadata: # inRes = se.getPfnPath( pfn ) # if inRes["OK"]: # res["Successful"][pfn] = inRes["Value"] # else: # res["Failed"][pfn] = inRes["Message"] res = se.getLfnForPfn( fileMetadata.keys() ) if not res['OK']: gLogger.error( 'Failed to get directory content LFNs', res['Message'] ) return res for pfn, error in res['Value']['Failed'].items(): gLogger.error( "Failed to get LFN for PFN", "%s %s" % ( pfn, error ) ) if res['Value']['Failed']: return S_ERROR( "Failed to get LFNs for PFNs" ) pfnLfns = res['Value']['Successful'] for pfn, lfn in pfnLfns.items(): fileMetadata[pfn]['LFN'] = lfn allFiles.update( fileMetadata ) zeroSizeFiles = [] lostFiles = [] unavailableFiles = [] for pfn in sortList( allFiles.keys() ): if os.path.basename( pfn ) == 'dirac_directory': allFiles.pop( pfn ) else: metadata = allFiles[pfn] if metadata['Size'] == 0: zeroSizeFiles.append( ( metadata['LFN'], pfn, storageElement, 'PFNZeroSize' ) ) # if metadata['Lost']: # lostFiles.append((metadata['LFN'],pfn,storageElement,'PFNLost')) # if metadata['Unavailable']: # unavailableFiles.append((metadata['LFN'],pfn,storageElement,'PFNUnavailable')) if zeroSizeFiles: self.__reportProblematicReplicas( zeroSizeFiles, storageElement, 'PFNZeroSize' ) if lostFiles: self.__reportProblematicReplicas( lostFiles, storageElement, 'PFNLost' ) if unavailableFiles: self.__reportProblematicReplicas( unavailableFiles, storageElement, 'PFNUnavailable' ) gLogger.info( 'Obtained at total of %s files for directories at %s' % ( len( allFiles ), storageElement ) ) return S_OK( allFiles ) def __getStoragePathExists( self, lfnPaths, storageElement ): gLogger.info( 'Determining the existance of %d files at %s' % ( len( lfnPaths ), storageElement ) ) se = StorageElement( storageElement ) res = se.getPfnForLfn( lfnPaths ) if not res['OK']: gLogger.error( "Failed to get PFNs for LFNs", res['Message'] ) return 
res for lfnPath, error in res['Value']['Failed'].items(): gLogger.error( 'Failed to obtain PFN from LFN', '%s %s' % ( lfnPath, error ) ) if res['Value']['Failed']: return S_ERROR( 'Failed to obtain PFNs from LFNs' ) lfnPfns = res['Value']['Successful'] pfnLfns = {} for lfn, pfn in lfnPfns.items(): pfnLfns[pfn] = lfn res = se.exists( pfnLfns ) if not res['OK']: gLogger.error( "Failed to obtain existance of paths", res['Message'] ) return res for lfnPath, error in res['Value']['Failed'].items(): gLogger.error( 'Failed to determine existance of path', '%s %s' % ( lfnPath, error ) ) if res['Value']['Failed']: return S_ERROR( 'Failed to determine existance of paths' ) pathExists = res['Value']['Successful'] resDict = {} for pfn, exists in pathExists.items(): if exists: resDict[pfnLfns[pfn]] = pfn return S_OK( resDict ) ########################################################################## # # This section contains the specific methods for obtaining replica and metadata information from the catalog # def __getCatalogDirectoryContents( self, lfnDir ): """ Obtain the contents of the supplied directory """ gLogger.info( 'Obtaining the catalog contents for %s directories' % len( lfnDir ) ) activeDirs = lfnDir allFiles = {} while len( activeDirs ) > 0: currentDir = activeDirs[0] res = self.fc.listDirectory( currentDir ) activeDirs.remove( currentDir ) if not res['OK']: gLogger.error( 'Failed to get directory contents', res['Message'] ) return res elif res['Value']['Failed'].has_key( currentDir ): gLogger.error( 'Failed to get directory contents', '%s %s' % ( currentDir, res['Value']['Failed'][currentDir] ) ) else: dirContents = res['Value']['Successful'][currentDir] activeDirs.extend( dirContents['SubDirs'] ) allFiles.update( dirContents['Files'] ) zeroReplicaFiles = [] zeroSizeFiles = [] allReplicaDict = {} allMetadataDict = {} for lfn, lfnDict in allFiles.items(): lfnReplicas = {} for se, replicaDict in lfnDict['Replicas'].items(): lfnReplicas[se] = replicaDict['PFN'] if not lfnReplicas: zeroReplicaFiles.append( lfn ) allReplicaDict[lfn] = lfnReplicas allMetadataDict[lfn] = lfnDict['MetaData'] if lfnDict['MetaData']['Size'] == 0: zeroSizeFiles.append( lfn ) if zeroReplicaFiles: self.__reportProblematicFiles( zeroReplicaFiles, 'LFNZeroReplicas' ) if zeroSizeFiles: self.__reportProblematicFiles( zeroSizeFiles, 'LFNZeroSize' ) gLogger.info( 'Obtained at total of %s files for the supplied directories' % len( allMetadataDict ) ) resDict = {'Metadata':allMetadataDict, 'Replicas':allReplicaDict} return S_OK( resDict ) def __getCatalogReplicas( self, lfns ): """ Obtain the file replicas from the catalog while checking that there are replicas """ gLogger.info( 'Obtaining the replicas for %s files' % len( lfns ) ) zeroReplicaFiles = [] res = self.fc.getReplicas( lfns, allStatus = True ) if not res['OK']: gLogger.error( 'Failed to get catalog replicas', res['Message'] ) return res allReplicas = res['Value']['Successful'] for lfn, error in res['Value']['Failed'].items(): if re.search( 'File has zero replicas', error ): zeroReplicaFiles.append( lfn ) if zeroReplicaFiles: self.__reportProblematicFiles( zeroReplicaFiles, 'LFNZeroReplicas' ) gLogger.info( 'Obtaining the replicas for files complete' ) return S_OK( allReplicas ) def __getCatalogMetadata( self, lfns ): """ Obtain the file metadata from the catalog while checking they exist """ if not lfns: return S_OK( {} ) gLogger.info( 'Obtaining the catalog metadata for %s files' % len( lfns ) ) missingCatalogFiles = [] zeroSizeFiles = [] res = 
self.fc.getFileMetadata( lfns ) if not res['OK']: gLogger.error( 'Failed to get catalog metadata', res['Message'] ) return res allMetadata = res['Value']['Successful'] for lfn, error in res['Value']['Failed'].items(): if re.search( 'No such file or directory', error ): missingCatalogFiles.append( lfn ) if missingCatalogFiles: self.__reportProblematicFiles( missingCatalogFiles, 'LFNCatalogMissing' ) for lfn, metadata in allMetadata.items(): if metadata['Size'] == 0: zeroSizeFiles.append( lfn ) if zeroSizeFiles: self.__reportProblematicFiles( zeroSizeFiles, 'LFNZeroSize' ) gLogger.info( 'Obtaining the catalog metadata complete' ) return S_OK( allMetadata ) ########################################################################## # # This section contains the methods for inserting problematic files into the integrity DB # def __reportProblematicFiles( self, lfns, reason ): """ Simple wrapper function around setFileProblematic """ gLogger.info( 'The following %s files were found with %s' % ( len( lfns ), reason ) ) for lfn in sortList( lfns ): gLogger.info( lfn ) res = self.setFileProblematic( lfns, reason, sourceComponent = 'DataIntegrityClient' ) if not res['OK']: gLogger.info( 'Failed to update integrity DB with files', res['Message'] ) else: gLogger.info( 'Successfully updated integrity DB with files' ) def setFileProblematic( self, lfn, reason, sourceComponent = '' ): """ This method updates the status of the file in the FileCatalog and the IntegrityDB lfn - the lfn of the file reason - this is given to the integrity DB and should reflect the problem observed with the file sourceComponent is the component issuing the request. """ if type( lfn ) == types.ListType: lfns = lfn elif type( lfn ) == types.StringType: lfns = [lfn] else: errStr = "DataIntegrityClient.setFileProblematic: Supplied file info must be list or a single LFN." gLogger.error( errStr ) return S_ERROR( errStr ) gLogger.info( "DataIntegrityClient.setFileProblematic: Attempting to update %s files." % len( lfns ) ) fileMetadata = {} for lfn in lfns: fileMetadata[lfn] = {'Prognosis':reason, 'LFN':lfn, 'PFN':'', 'SE':''} res = self.insertProblematic( sourceComponent, fileMetadata ) if not res['OK']: gLogger.error( "DataIntegrityClient.setReplicaProblematic: Failed to insert problematics to integrity DB" ) return res def __reportProblematicReplicas( self, replicaTuple, se, reason ): """ Simple wrapper function around setReplicaProblematic """ gLogger.info( 'The following %s files had %s at %s' % ( len( replicaTuple ), reason, se ) ) for lfn, pfn, se, reason in sortList( replicaTuple ): if lfn: gLogger.info( lfn ) else: gLogger.info( pfn ) res = self.setReplicaProblematic( replicaTuple, sourceComponent = 'DataIntegrityClient' ) if not res['OK']: gLogger.info( 'Failed to update integrity DB with replicas', res['Message'] ) else: gLogger.info( 'Successfully updated integrity DB with replicas' ) def setReplicaProblematic( self, replicaTuple, sourceComponent = '' ): """ This method updates the status of the replica in the FileCatalog and the IntegrityDB The supplied replicaDict should be of the form {lfn :{'PFN':pfn,'SE':se,'Prognosis':prognosis} lfn - the lfn of the file pfn - the pfn if available (otherwise '') se - the storage element of the problematic replica (otherwise '') prognosis - this is given to the integrity DB and should reflect the problem observed with the file sourceComponent is the component issuing the request. 
""" if type( replicaTuple ) == types.TupleType: replicaTuple = [replicaTuple] elif type( replicaTuple ) == types.ListType: pass else: errStr = "DataIntegrityClient.setReplicaProblematic: Supplied replica info must be a tuple or list of tuples." gLogger.error( errStr ) return S_ERROR( errStr ) gLogger.info( "DataIntegrityClient.setReplicaProblematic: Attempting to update %s replicas." % len( replicaTuple ) ) replicaDict = {} for lfn, pfn, se, reason in replicaTuple: replicaDict[lfn] = {'Prognosis':reason, 'LFN':lfn, 'PFN':pfn, 'SE':se} res = self.insertProblematic( sourceComponent, replicaDict ) if not res['OK']: gLogger.error( "DataIntegrityClient.setReplicaProblematic: Failed to insert problematic to integrity DB" ) return res for lfn in replicaDict.keys(): replicaDict[lfn]['Status'] = 'Problematic' res = self.fc.setReplicaStatus( replicaDict ) if not res['OK']: errStr = "DataIntegrityClient.setReplicaProblematic: Completely failed to update replicas." gLogger.error( errStr, res['Message'] ) return res failed = res['Value']['Failed'] successful = res['Value']['Successful'] resDict = {'Successful':successful, 'Failed':failed} return S_OK( resDict ) ########################################################################## # # This section contains the resolution methods for various prognoses # def __updateCompletedFiles( self, prognosis, fileID ): gLogger.info( "%s file (%d) is resolved" % ( prognosis, fileID ) ) return self.setProblematicStatus( fileID, 'Resolved' ) def __returnProblematicError( self, fileID, res ): self.incrementProblematicRetry( fileID ) gLogger.error( res['Message'] ) return res def __getRegisteredPFNLFN( self, pfn, storageElement ): res = StorageElement( storageElement ).getPfnForProtocol( pfn, withPort = False ) if not res['OK']: gLogger.error( "Failed to get registered PFN for physical files", res['Message'] ) return res for pfn, error in res['Value']['Failed'].items(): gLogger.error( 'Failed to obtain registered PFN for physical file', '%s %s' % ( pfn, error ) ) return S_ERROR( 'Failed to obtain registered PFNs from physical file' ) registeredPFN = res['Value']['Successful'][pfn] res = Utils.executeSingleFileOrDirWrapper( self.fc.getLFNForPFN( registeredPFN ) ) if ( not res['OK'] ) and re.search( 'No such file or directory', res['Message'] ): return S_OK( False ) return S_OK( res['Value'] ) def __updateReplicaToChecked( self, problematicDict ): lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] prognosis = problematicDict['Prognosis'] problematicDict['Status'] = 'Checked' res = Utils.executeSingleFileOrDirWrapper( self.fc.setReplicaStatus( {lfn:problematicDict} ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) gLogger.info( "%s replica (%d) is updated to Checked status" % ( prognosis, fileID ) ) return self.__updateCompletedFiles( prognosis, fileID ) def resolveCatalogPFNSizeMismatch( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolved the CatalogPFNSizeMismatch prognosis """ lfn = problematicDict['LFN'] pfn = problematicDict['PFN'] se = problematicDict['SE'] fileID = problematicDict['FileID'] res = Utils.executeSingleFileOrDirWrapper( self.fc.getFileSize( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) catalogSize = res['Value'] res = Utils.executeSingleFileOrDirWrapper( StorageElement( se ).getFileSize( pfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) storageSize = res['Value'] bkKCatalog = FileCatalog( 
['BookkeepingDB'] ) res = Utils.executeSingleFileOrDirWrapper( bkKCatalog.getFileSize( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) bookkeepingSize = res['Value'] if bookkeepingSize == catalogSize == storageSize: gLogger.info( "CatalogPFNSizeMismatch replica (%d) matched all registered sizes." % fileID ) return self.__updateReplicaToChecked( problematicDict ) if ( catalogSize == bookkeepingSize ): gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to mismatch the bookkeeping also" % fileID ) res = Utils.executeSingleFileOrDirWrapper( self.fc.getReplicas( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if len( res['Value'] ) <= 1: gLogger.info( "CatalogPFNSizeMismatch replica (%d) has no other replicas." % fileID ) return S_ERROR( "Not removing catalog file mismatch since the only replica" ) else: gLogger.info( "CatalogPFNSizeMismatch replica (%d) has other replicas. Removing..." % fileID ) res = self.dm.removeReplica( se, lfn ) if not res['OK']: return self.__returnProblematicError( fileID, res ) return self.__updateCompletedFiles( 'CatalogPFNSizeMismatch', fileID ) if ( catalogSize != bookkeepingSize ) and ( bookkeepingSize == storageSize ): gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to match the bookkeeping size" % fileID ) res = self.__updateReplicaToChecked( problematicDict ) if not res['OK']: return self.__returnProblematicError( fileID, res ) return self.changeProblematicPrognosis( fileID, 'BKCatalogSizeMismatch' ) gLogger.info( "CatalogPFNSizeMismatch replica (%d) all sizes found mismatch. Updating retry count" % fileID ) return self.incrementProblematicRetry( fileID ) def resolvePFNNotRegistered( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNNotRegistered prognosis """ lfn = problematicDict['LFN'] pfn = problematicDict['PFN'] seName = problematicDict['SE'] fileID = problematicDict['FileID'] se = StorageElement( seName ) res = Utils.executeSingleFileOrDirWrapper( self.fc.exists( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if not res['Value']: # The file does not exist in the catalog res = Utils.executeSingleFileOrDirWrapper( se.removeFile( pfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) return self.__updateCompletedFiles( 'PFNNotRegistered', fileID ) res = Utils.executeSingleFileOrDirWrapper( se.getFileMetadata( pfn ) ) if ( not res['OK'] ) and ( re.search( 'File does not exist', res['Message'] ) ): gLogger.info( "PFNNotRegistered replica (%d) found to be missing." % fileID ) return self.__updateCompletedFiles( 'PFNNotRegistered', fileID ) elif not res['OK']: return self.__returnProblematicError( fileID, res ) storageMetadata = res['Value'] if storageMetadata['Lost']: gLogger.info( "PFNNotRegistered replica (%d) found to be Lost. Updating prognosis" % fileID ) return self.changeProblematicPrognosis( fileID, 'PFNLost' ) if storageMetadata['Unavailable']: gLogger.info( "PFNNotRegistered replica (%d) found to be Unavailable. 
Updating retry count" % fileID ) return self.incrementProblematicRetry( fileID ) # HACK until we can obtain the space token descriptions through GFAL site = seName.split( '_' )[0].split( '-' )[0] if not storageMetadata['Cached']: if lfn.endswith( '.raw' ): seName = '%s-RAW' % site else: seName = '%s-RDST' % site elif storageMetadata['Migrated']: if lfn.startswith( '/lhcb/data' ): seName = '%s_M-DST' % site else: seName = '%s_MC_M-DST' % site else: if lfn.startswith( '/lhcb/data' ): seName = '%s-DST' % site else: seName = '%s_MC-DST' % site problematicDict['SE'] = seName res = se.getPfnForProtocol( pfn, withPort = False ) if not res['OK']: return self.__returnProblematicError( fileID, res ) for pfn, error in res['Value']['Failed'].items(): gLogger.error( 'Failed to obtain registered PFN for physical file', '%s %s' % ( pfn, error ) ) return S_ERROR( 'Failed to obtain registered PFNs from physical file' ) problematicDict['PFN'] = res['Value']['Successful'][pfn] res = Utils.executeSingleFileOrDirWrapper( self.fc.addReplica( {lfn:problematicDict} ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) res = Utils.executeSingleFileOrDirWrapper( self.fc.getFileMetadata( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if res['Value']['Size'] != storageMetadata['Size']: gLogger.info( "PFNNotRegistered replica (%d) found with catalog size mismatch. Updating prognosis" % fileID ) return self.changeProblematicPrognosis( fileID, 'CatalogPFNSizeMismatch' ) return self.__updateCompletedFiles( 'PFNNotRegistered', fileID ) def resolveLFNCatalogMissing( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolved the LFNCatalogMissing prognosis """ lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = Utils.executeSingleFileOrDirWrapper( self.fc.exists( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if res['Value']: return self.__updateCompletedFiles( 'LFNCatalogMissing', fileID ) # Remove the file from all catalogs # RF_NOTE : here I can do it because it's a single file, but otherwise I would need to sort the path res = Utils.executeSingleFileOrDirWrapper( self.fc.removeFile( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) return self.__updateCompletedFiles( 'LFNCatalogMissing', fileID ) def resolvePFNMissing( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNMissing prognosis """ pfn = problematicDict['PFN'] se = problematicDict['SE'] lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = Utils.executeSingleFileOrDirWrapper( self.fc.exists( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if not res['Value']: gLogger.info( "PFNMissing file (%d) no longer exists in catalog" % fileID ) return self.__updateCompletedFiles( 'PFNMissing', fileID ) res = Utils.executeSingleFileOrDirWrapper( StorageElement( se ).exists( pfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if res['Value']: gLogger.info( "PFNMissing replica (%d) is no longer missing" % fileID ) return self.__updateReplicaToChecked( problematicDict ) gLogger.info( "PFNMissing replica (%d) does not exist" % fileID ) res = Utils.executeSingleFileOrDirWrapper( self.fc.getReplicas( lfn, allStatus = True ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) replicas = res['Value'] seSite = se.split( '_' )[0].split( '-' )[0] found = 
False for replicaSE in replicas.keys(): if re.search( seSite, replicaSE ): found = True problematicDict['SE'] = replicaSE se = replicaSE if not found: gLogger.info( "PFNMissing replica (%d) is no longer registered at SE. Resolved." % fileID ) return self.__updateCompletedFiles( 'PFNMissing', fileID ) gLogger.info( "PFNMissing replica (%d) does not exist. Removing from catalog..." % fileID ) res = Utils.executeSingleFileOrDirWrapper( self.fc.removeReplica( {lfn:problematicDict} ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if len( replicas ) == 1: gLogger.info( "PFNMissing replica (%d) had a single replica. Updating prognosis" % fileID ) return self.changeProblematicPrognosis( fileID, 'LFNZeroReplicas' ) res = self.dm.replicateAndRegister( problematicDict['LFN'], se ) if not res['OK']: return self.__returnProblematicError( fileID, res ) # If we get here the problem is solved so we can update the integrityDB return self.__updateCompletedFiles( 'PFNMissing', fileID ) def resolvePFNUnavailable( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolves the PFNUnavailable prognosis """ pfn = problematicDict['PFN'] se = problematicDict['SE'] fileID = problematicDict['FileID'] res = Utils.executeSingleFileOrDirWrapper( StorageElement( se ).getFileMetadata( pfn ) ) if ( not res['OK'] ) and ( re.search( 'File does not exist', res['Message'] ) ): # The file is no longer Unavailable but has now disappeared completely gLogger.info( "PFNUnavailable replica (%d) found to be missing. Updating prognosis" % fileID ) return self.changeProblematicPrognosis( fileID, 'PFNMissing' ) if ( not res['OK'] ) or res['Value']['Unavailable']: gLogger.info( "PFNUnavailable replica (%d) found to still be Unavailable" % fileID ) return self.incrementProblematicRetry( fileID ) if res['Value']['Lost']: gLogger.info( "PFNUnavailable replica (%d) is now found to be Lost. Updating prognosis" % fileID ) return self.changeProblematicPrognosis( fileID, 'PFNLost' ) gLogger.info( "PFNUnavailable replica (%d) is no longer Unavailable" % fileID ) # Need to make the replica okay in the Catalog return self.__updateReplicaToChecked( problematicDict ) def resolvePFNZeroSize( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolves the PFNZeroSize prognosis """ pfn = problematicDict['PFN'] seName = problematicDict['SE'] fileID = problematicDict['FileID'] se = StorageElement( seName ) res = Utils.executeSingleFileOrDirWrapper( se.getFileSize( pfn ) ) if ( not res['OK'] ) and ( re.search( 'File does not exist', res['Message'] ) ): gLogger.info( "PFNZeroSize replica (%d) found to be missing. Updating prognosis" % problematicDict['FileID'] ) return self.changeProblematicPrognosis( fileID, 'PFNMissing' ) elif not res['OK']: return self.__returnProblematicError( fileID, res ) storageSize = res['Value'] if storageSize == 0: res = Utils.executeSingleFileOrDirWrapper( se.removeFile( pfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) gLogger.info( "PFNZeroSize replica (%d) removed. Updating prognosis" % problematicDict['FileID'] ) return self.changeProblematicPrognosis( fileID, 'PFNMissing' ) res = self.__getRegisteredPFNLFN( pfn, seName ) if not res['OK']: return self.__returnProblematicError( fileID, res ) lfn = res['Value'] if not lfn: gLogger.info( "PFNZeroSize replica (%d) not registered in catalog.
Updating prognosis" % problematicDict['FileID'] ) return self.changeProblematicPrognosis( fileID, 'PFNNotRegistered' ) res = Utils.executeSingleFileOrDirWrapper( self.fc.getFileMetadata( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) catalogSize = res['Value']['Size'] if catalogSize != storageSize: gLogger.info( "PFNZeroSize replica (%d) size found to differ from registered metadata. Updating prognosis" % problematicDict['FileID'] ) return self.changeProblematicPrognosis( fileID, 'CatalogPFNSizeMismatch' ) return self.__updateCompletedFiles( 'PFNZeroSize', fileID ) ############################################################################################ def resolveLFNZeroReplicas( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolves the LFNZeroReplicas prognosis """ lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = Utils.executeSingleFileOrDirWrapper( self.fc.getReplicas( lfn, allStatus = True ) ) if res['OK'] and res['Value']: gLogger.info( "LFNZeroReplicas file (%d) found to have replicas" % fileID ) else: gLogger.info( "LFNZeroReplicas file (%d) does not have replicas. Checking storage..." % fileID ) pfnsFound = False for storageElementName in sortList( gConfig.getValue( 'Resources/StorageElementGroups/Tier1_MC_M-DST', [] ) ): res = self.__getStoragePathExists( [lfn], storageElementName ) if res['Value'].has_key( lfn ): gLogger.info( "LFNZeroReplicas file (%d) found storage file at %s" % ( fileID, storageElementName ) ) pfn = res['Value'][lfn] self.__reportProblematicReplicas( [( lfn, pfn, storageElementName, 'PFNNotRegistered' )], storageElementName, 'PFNNotRegistered' ) pfnsFound = True if not pfnsFound: gLogger.info( "LFNZeroReplicas file (%d) did not have storage files. Removing..." % fileID ) res = Utils.executeSingleFileOrDirWrapper( self.fc.removeFile( lfn ) ) if not res['OK']: gLogger.error( res['Message'] ) # Increment the number of retries for this file self.server.incrementProblematicRetry( fileID ) return res gLogger.info( "LFNZeroReplicas file (%d) removed from catalog" % fileID ) # If we get here the problem is solved so we can update the integrityDB return self.__updateCompletedFiles( 'LFNZeroReplicas', fileID )
class PluginUtilities(object): """ Utility class used by plugins """ def __init__(self, plugin='Standard', transClient=None, dataManager=None, fc=None, debug=False, transInThread=None, transID=None): """ c'tor Setting defaults """ # clients if transClient is None: self.transClient = TransformationClient() else: self.transClient = transClient if dataManager is None: self.dm = DataManager() else: self.dm = dataManager if fc is None: self.fc = FileCatalog() else: self.fc = fc self.dmsHelper = DMSHelpers() self.plugin = plugin self.transID = transID self.params = {} self.groupSize = 0 self.maxFiles = 0 self.cachedLFNSize = {} self.transString = '' self.debug = debug self.seConfig = {} if transInThread is None: self.transInThread = {} else: self.transInThread = transInThread self.log = gLogger.getSubLogger("%s/PluginUtilities" % plugin) def logVerbose(self, message, param=''): if self.debug: self.log.info('(V)' + self.transString + message, param) else: self.log.verbose(self.transString + message, param) def logDebug(self, message, param=''): self.log.debug(self.transString + message, param) def logInfo(self, message, param=''): self.log.info(self.transString + message, param) def logWarn(self, message, param=''): self.log.warn(self.transString + message, param) def logError(self, message, param=''): self.log.error(self.transString + message, param) def logException(self, message, param='', lException=False): self.log.exception(self.transString + message, param, lException) def setParameters(self, params): self.params = params self.transID = params['TransformationID'] self.transString = self.transInThread.get( self.transID, ' [NoThread] [%d] ' % self.transID) + '%s: ' % self.plugin @timeThis def groupByReplicas(self, files, status): """ Generates tasks based on the location of the input data :param dict fileReplicas: {'/this/is/at.1': ['SE1'], '/this/is/at.12': ['SE1', 'SE2'], '/this/is/at.2': ['SE2'], '/this/is/at_123': ['SE1', 'SE2', 'SE3'], '/this/is/at_23': ['SE2', 'SE3'], '/this/is/at_4': ['SE4']} """ tasks = [] nTasks = 0 if not len(files): return S_OK(tasks) files = dict(files) # Parameters if not self.groupSize: self.groupSize = self.getPluginParam('GroupSize', 10) flush = (status == 'Flush') self.logVerbose("groupByReplicas: %d files, groupSize %d, flush %s" % (len(files), self.groupSize, flush)) # Consider files by groups of SEs, a file is only in one group # Then consider files site by site, but a file can now be at more than one site for groupSE in (True, False): if not files: break seFiles = getFileGroups(files, groupSE=groupSE) self.logDebug("fileGroups set: ", seFiles) for replicaSE in sortSEs(seFiles): lfns = seFiles[replicaSE] if lfns: tasksLfns = breakListIntoChunks(lfns, self.groupSize) lfnsInTasks = [] for taskLfns in tasksLfns: if (flush and not groupSE) or (len(taskLfns) >= self.groupSize): tasks.append((replicaSE, taskLfns)) lfnsInTasks += taskLfns # In case the file was at more than one site, remove it from the other sites' list # Remove files from global list for lfn in lfnsInTasks: files.pop(lfn) if not groupSE: # Remove files from other SEs for se in [se for se in seFiles if se != replicaSE]: seFiles[se] = [ lfn for lfn in seFiles[se] if lfn not in lfnsInTasks ] self.logVerbose( "groupByReplicas: %d tasks created (groupSE %s), %d files not included in tasks" % (len(tasks) - nTasks, str(groupSE), len(files))) nTasks = len(tasks) return S_OK(tasks) def createTasksBySize(self, lfns, replicaSE, fileSizes=None, flush=False): """ Split files in groups according to the size 
and create tasks for a given SE """ tasks = [] if fileSizes is None: fileSizes = self._getFileSize(lfns).get('Value') if fileSizes is None: self.logWarn('Error getting file sizes, no tasks created') return tasks taskLfns = [] taskSize = 0 if not self.groupSize: self.groupSize = float( self.getPluginParam('GroupSize', 1.) ) * 1000 * 1000 * 1000 # input size in GB converted to bytes if not self.maxFiles: self.maxFiles = self.getPluginParam('MaxFiles', 100) lfns = sorted(lfns, key=fileSizes.get) for lfn in lfns: size = fileSizes.get(lfn, 0) if size: if size > self.groupSize: tasks.append((replicaSE, [lfn])) else: taskSize += size taskLfns.append(lfn) if (taskSize > self.groupSize) or (len(taskLfns) >= self.maxFiles): tasks.append((replicaSE, taskLfns)) taskLfns = [] taskSize = 0 if flush and taskLfns: tasks.append((replicaSE, taskLfns)) return tasks @timeThis def groupBySize(self, files, status): """ Generate a task for a given amount of data """ tasks = [] nTasks = 0 if not len(files): return S_OK(tasks) files = dict(files) # Parameters if not self.groupSize: self.groupSize = float(self.getPluginParam( 'GroupSize', 1)) * 1000 * 1000 * 1000 # input size in GB converted to bytes flush = (status == 'Flush') self.logVerbose("groupBySize: %d files, groupSize: %d, flush: %s" % (len(files), self.groupSize, flush)) # Get the file sizes res = self._getFileSize(files.keys()) if not res['OK']: return res fileSizes = res['Value'] for groupSE in (True, False): if not files: break seFiles = getFileGroups(files, groupSE=groupSE) for replicaSE in sorted(seFiles) if groupSE else sortSEs(seFiles): lfns = seFiles[replicaSE] newTasks = self.createTasksBySize(lfns, replicaSE, fileSizes=fileSizes, flush=flush) lfnsInTasks = [] for task in newTasks: lfnsInTasks += task[1] tasks += newTasks # Remove the selected files from the size cache self.clearCachedFileSize(lfnsInTasks) if not groupSE: # Remove files from other SEs for se in [se for se in seFiles if se != replicaSE]: seFiles[se] = [ lfn for lfn in seFiles[se] if lfn not in lfnsInTasks ] # Remove files from global list for lfn in lfnsInTasks: files.pop(lfn) self.logVerbose("groupBySize: %d tasks created with groupSE %s" % (len(tasks) - nTasks, str(groupSE))) self.logVerbose( "groupBySize: %d files have not been included in tasks" % len(files)) nTasks = len(tasks) self.logVerbose("Grouped %d files by size" % len(files)) return S_OK(tasks) def getExistingCounters(self, normalise=False, requestedSites=[]): res = self.transClient.getCounters( 'TransformationFiles', ['UsedSE'], {'TransformationID': self.params['TransformationID']}) if not res['OK']: return res usageDict = {} for usedDict, count in res['Value']: usedSE = usedDict['UsedSE'] if usedSE != 'Unknown': usageDict[usedSE] = count if requestedSites: siteDict = {} for se, count in usageDict.items(): res = getSitesForSE(se) if not res['OK']: return res for site in res['Value']: if site in requestedSites: siteDict[site] = count usageDict = siteDict.copy() if normalise: usageDict = self._normaliseShares(usageDict) return S_OK(usageDict) @timeThis def _getFileSize(self, lfns): """ Get file size from a cache, if not from the catalog #FIXME: have to fill the cachedLFNSize! 
""" lfns = list(lfns) cachedLFNSize = dict(self.cachedLFNSize) fileSizes = {} for lfn in [lfn for lfn in lfns if lfn in cachedLFNSize]: fileSizes[lfn] = cachedLFNSize[lfn] self.logDebug("Found cache hit for File size for %d files out of %d" % (len(fileSizes), len(lfns))) lfns = [lfn for lfn in lfns if lfn not in cachedLFNSize] if lfns: fileSizes = self._getFileSizeFromCatalog(lfns, fileSizes) if not fileSizes['OK']: self.logError(fileSizes['Message']) return fileSizes fileSizes = fileSizes['Value'] return S_OK(fileSizes) @timeThis def _getFileSizeFromCatalog(self, lfns, fileSizes): """ Get file size from the catalog """ lfns = list(lfns) fileSizes = dict(fileSizes) res = self.fc.getFileSize(lfns) if not res['OK']: return S_ERROR("Failed to get sizes for all files: %s" % res['Message']) if res['Value']['Failed']: errorReason = sorted(set(res['Value']['Failed'].values())) self.logWarn( "Failed to get sizes for %d files:" % len(res['Value']['Failed']), errorReason) fileSizes.update(res['Value']['Successful']) self.cachedLFNSize.update((res['Value']['Successful'])) self.logVerbose("Got size of %d files from catalog" % len(lfns)) return S_OK(fileSizes) def clearCachedFileSize(self, lfns): """ Utility function """ for lfn in [lfn for lfn in lfns if lfn in self.cachedLFNSize]: self.cachedLFNSize.pop(lfn) def getPluginParam(self, name, default=None): """ Get plugin parameters using specific settings or settings defined in the CS Caution: the type returned is that of the default value """ # get the value of a parameter looking 1st in the CS if default != None: valueType = type(default) else: valueType = None # First look at a generic value... optionPath = "TransformationPlugins/%s" % (name) value = Operations().getValue(optionPath, None) self.logVerbose("Default plugin param %s: '%s'" % (optionPath, value)) # Then look at a plugin-specific value optionPath = "TransformationPlugins/%s/%s" % (self.plugin, name) value = Operations().getValue(optionPath, value) self.logVerbose("Specific plugin param %s: '%s'" % (optionPath, value)) if value != None: default = value # Finally look at a transformation-specific parameter value = self.params.get(name, default) self.logVerbose("Transformation plugin param %s: '%s'. 
Convert to %s" % (name, value, str(valueType))) if valueType and type(value) is not valueType: if valueType is list: try: value = ast.literal_eval( value) if value and value != 'None' else [] except Exception: value = [ val for val in value.replace(' ', '').split(',') if val ] elif valueType is int: value = int(value) elif valueType is float: value = float(value) elif valueType is bool: if value in ('False', 'No', 'None', None, 0): value = False else: value = bool(value) elif valueType is not str: self.logWarn( "Unknown parameter type (%s) for %s, passed as string" % (str(valueType), name)) self.logVerbose("Final plugin param %s: '%s'" % (name, value)) return value @staticmethod def _normaliseShares(originalShares): shares = originalShares.copy() total = 0.0 for site in shares.keys(): share = float(shares[site]) shares[site] = share total += share for site in shares.keys(): share = 100.0 * (shares[site] / total) shares[site] = share return shares def uniqueSEs(self, ses): newSEs = [] for se in ses: if not self.isSameSEInList(se, newSEs): newSEs.append(se) return newSEs def isSameSE(self, se1, se2): if se1 == se2: return True for se in (se1, se2): if se not in self.seConfig: self.seConfig[se] = {} res = StorageElement(se).getStorageParameters('SRM2') if res['OK']: params = res['Value'] for item in ('Host', 'Path'): self.seConfig[se][item] = params[item].replace( 't1d1', 't0d1') else: self.logError( "Error getting StorageElement parameters for %s" % se, res['Message']) return self.seConfig[se1] == self.seConfig[se2] def isSameSEInList(self, se1, seList): if se1 in seList: return True for se in seList: if self.isSameSE(se1, se): return True return False def closerSEs(self, existingSEs, targetSEs, local=False): """ Order the targetSEs such that the first ones are closer to existingSEs. Keep all elements in targetSEs """ setTarget = set(targetSEs) sameSEs = set([ se1 for se1 in setTarget for se2 in existingSEs if self.isSameSE(se1, se2) ]) targetSEs = setTarget - set(sameSEs) if targetSEs: # Some SEs are left, look for sites existingSites = [ self.dmsHelper.getLocalSiteForSE(se).get('Value') for se in existingSEs if not self.dmsHelper.isSEArchive(se) ] existingSites = set([site for site in existingSites if site]) closeSEs = set([ se for se in targetSEs if self.dmsHelper.getLocalSiteForSE( se).get('Value') in existingSites ]) # print existingSEs, existingSites, targetSEs, closeSEs otherSEs = targetSEs - closeSEs targetSEs = list(closeSEs) random.shuffle(targetSEs) if not local and otherSEs: otherSEs = list(otherSEs) random.shuffle(otherSEs) targetSEs += otherSEs else: targetSEs = [] return (targetSEs + list(sameSEs)) if not local else targetSEs
class RequestPreparationAgent(AgentModule): def initialize(self): self.fileCatalog = FileCatalog() self.dm = DataManager() self.stagerClient = StorageManagerClient() self.dataIntegrityClient = DataIntegrityClient() # This sets the Default Proxy to used as that defined under # /Operations/Shifter/DataManager # the shifterProxy option in the Configuration can be used to change this default. self.am_setOption('shifterProxy', 'DataManager') return S_OK() def execute(self): res = self.prepareNewReplicas() return res def prepareNewReplicas(self): """ This is the first logical task to be executed and manages the New->Waiting transition of the Replicas """ res = self.__getNewReplicas() if not res['OK']: gLogger.fatal( "RequestPreparation.prepareNewReplicas: Failed to get replicas from StagerDB.", res['Message']) return res if not res['Value']: gLogger.info("There were no New replicas found") return res replicas = res['Value']['Replicas'] replicaIDs = res['Value']['ReplicaIDs'] gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained %s New replicas for preparation." % len(replicaIDs)) # Check that the files exist in the FileCatalog res = self.__getExistingFiles(replicas.keys()) if not res['OK']: return res exist = res['Value']['Exist'] terminal = res['Value']['Missing'] failed = res['Value']['Failed'] if not exist: gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed determine existance of any files' ) return S_OK() terminalReplicaIDs = {} for lfn, reason in terminal.items(): for _se, replicaID in replicas[lfn].items(): terminalReplicaIDs[replicaID] = reason replicas.pop(lfn) gLogger.info( "RequestPreparation.prepareNewReplicas: %s files exist in the FileCatalog." % len(exist)) if terminal: gLogger.info( "RequestPreparation.prepareNewReplicas: %s files do not exist in the FileCatalog." % len(terminal)) # Obtain the file sizes from the FileCatalog res = self.__getFileSize(exist) if not res['OK']: return res failed.update(res['Value']['Failed']) terminal = res['Value']['ZeroSize'] fileSizes = res['Value']['FileSizes'] if not fileSizes: gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed determine sizes of any files' ) return S_OK() for lfn, reason in terminal.items(): for _se, replicaID in replicas[lfn].items(): terminalReplicaIDs[replicaID] = reason replicas.pop(lfn) gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained %s file sizes from the FileCatalog." % len(fileSizes)) if terminal: gLogger.info( "RequestPreparation.prepareNewReplicas: %s files registered with zero size in the FileCatalog." % len(terminal)) # Obtain the replicas from the FileCatalog res = self.__getFileReplicas(fileSizes.keys()) if not res['OK']: return res failed.update(res['Value']['Failed']) terminal = res['Value']['ZeroReplicas'] fileReplicas = res['Value']['Replicas'] if not fileReplicas: gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed determine replicas for any files' ) return S_OK() for lfn, reason in terminal.items(): for _se, replicaID in replicas[lfn].items(): terminalReplicaIDs[replicaID] = reason replicas.pop(lfn) gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained replica information for %s file from the FileCatalog." % len(fileReplicas)) if terminal: gLogger.info( "RequestPreparation.prepareNewReplicas: %s files registered with zero replicas in the FileCatalog." 
% len(terminal)) # Check the replicas exist at the requested site replicaMetadata = [] for lfn, requestedSEs in replicas.items(): lfnReplicas = fileReplicas[lfn] for requestedSE, replicaID in requestedSEs.items(): if not requestedSE in lfnReplicas.keys(): terminalReplicaIDs[ replicaID] = "LFN not registered at requested SE" replicas[lfn].pop(requestedSE) else: replicaMetadata.append( (replicaID, lfnReplicas[requestedSE], fileSizes[lfn])) # Update the states of the files in the database if terminalReplicaIDs: gLogger.info( "RequestPreparation.prepareNewReplicas: %s replicas are terminally failed." % len(terminalReplicaIDs)) # res = self.stagerClient.updateReplicaFailure( terminalReplicaIDs ) res = self.stagerClient.updateReplicaFailure(terminalReplicaIDs) if not res['OK']: gLogger.error( "RequestPreparation.prepareNewReplicas: Failed to update replica failures.", res['Message']) if replicaMetadata: gLogger.info( "RequestPreparation.prepareNewReplicas: %s replica metadata to be updated." % len(replicaMetadata)) # Sets the Status='Waiting' of CacheReplicas records that are OK with catalogue checks res = self.stagerClient.updateReplicaInformation(replicaMetadata) if not res['OK']: gLogger.error( "RequestPreparation.prepareNewReplicas: Failed to update replica metadata.", res['Message']) return S_OK() def __getNewReplicas(self): """ This obtains the New replicas from the Replicas table and for each LFN the requested storage element """ # First obtain the New replicas from the CacheReplicas table res = self.stagerClient.getCacheReplicas({'Status': 'New'}) if not res['OK']: gLogger.error( "RequestPreparation.__getNewReplicas: Failed to get replicas with New status.", res['Message']) return res if not res['Value']: gLogger.debug( "RequestPreparation.__getNewReplicas: No New replicas found to process." ) return S_OK() else: gLogger.debug( "RequestPreparation.__getNewReplicas: Obtained %s New replicas(s) to process." % len(res['Value'])) replicas = {} replicaIDs = {} for replicaID, info in res['Value'].items(): lfn = info['LFN'] storageElement = info['SE'] if not replicas.has_key(lfn): replicas[lfn] = {} replicas[lfn][storageElement] = replicaID replicaIDs[replicaID] = (lfn, storageElement) return S_OK({'Replicas': replicas, 'ReplicaIDs': replicaIDs}) def __getExistingFiles(self, lfns): """ This checks that the files exist in the FileCatalog. """ filesExist = [] missing = {} res = self.fileCatalog.exists(lfns) if not res['OK']: gLogger.error( "RequestPreparation.__getExistingFiles: Failed to determine whether files exist.", res['Message']) return res failed = res['Value']['Failed'] for lfn, exists in res['Value']['Successful'].items(): if exists: filesExist.append(lfn) else: missing[lfn] = 'LFN not registered in the FileCatalog' if missing: for lfn, reason in missing.items(): gLogger.warn( "RequestPreparation.__getExistingFiles: %s" % reason, lfn) self.__reportProblematicFiles(missing.keys(), 'LFN-LFC-DoesntExist') return S_OK({ 'Exist': filesExist, 'Missing': missing, 'Failed': failed }) def __getFileSize(self, lfns): """ This obtains the file size from the FileCatalog. 
""" fileSizes = {} zeroSize = {} res = self.fileCatalog.getFileSize(lfns) if not res['OK']: gLogger.error( "RequestPreparation.__getFileSize: Failed to get sizes for files.", res['Message']) return res failed = res['Value']['Failed'] for lfn, size in res['Value']['Successful'].items(): if size == 0: zeroSize[ lfn] = "LFN registered with zero size in the FileCatalog" else: fileSizes[lfn] = size if zeroSize: for lfn, reason in zeroSize.items(): gLogger.warn("RequestPreparation.__getFileSize: %s" % reason, lfn) self.__reportProblematicFiles(zeroSize.keys(), 'LFN-LFC-ZeroSize') return S_OK({ 'FileSizes': fileSizes, 'ZeroSize': zeroSize, 'Failed': failed }) def __getFileReplicas(self, lfns): """ This obtains the replicas from the FileCatalog. """ replicas = {} noReplicas = {} res = self.dm.getActiveReplicas(lfns) if not res['OK']: gLogger.error( "RequestPreparation.__getFileReplicas: Failed to obtain file replicas.", res['Message']) return res failed = res['Value']['Failed'] for lfn, lfnReplicas in res['Value']['Successful'].items(): if len(lfnReplicas.keys()) == 0: noReplicas[ lfn] = "LFN registered with zero replicas in the FileCatalog" else: replicas[lfn] = lfnReplicas if noReplicas: for lfn, reason in noReplicas.items(): gLogger.warn( "RequestPreparation.__getFileReplicas: %s" % reason, lfn) self.__reportProblematicFiles(noReplicas.keys(), 'LFN-LFC-NoReplicas') return S_OK({ 'Replicas': replicas, 'ZeroReplicas': noReplicas, 'Failed': failed }) def __reportProblematicFiles(self, lfns, reason): return S_OK() res = self.dataIntegrityClient.setFileProblematic( lfns, reason, sourceComponent='RequestPreparationAgent') if not res['OK']: gLogger.error( "RequestPreparation.__reportProblematicFiles: Failed to report missing files.", res['Message']) return res if res['Value']['Successful']: gLogger.info( "RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files." % len(res['Value']['Successful'])) if res['Value']['Failed']: gLogger.info( "RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files." % len(res['Value']['Failed'])) return res
class helper_TransferAgent(object): def __init__(self, transferAgent, gTransferDB): self.transferAgent = transferAgent self.transferDB = gTransferDB gLogger.info("Creating File Catalog") self.fileCatalog = FileCatalog() def helper_add_transfer(self, result): if not result: gLogger.error("There is no infomation") return False res = self.transferDB.get_TransferRequest( condDict={"id": result.trans_req_id}) if not res["OK"]: return False req_list = res["Value"] if len(req_list) != 1: return False req = TransRequestEntryWithID._make(req_list[0]) # construct the info info = { "id": result.id, "LFN": result.LFN, "srcSE": req.srcSE, "dstSE": req.dstSE, "retransfer": -1, "error": "" } # Add the Transfer worker = gTransferFactory.generate(req.protocol, info) if worker is None: return True self.transferAgent.transfer_worker.append(worker) # Change the status self.helper_status_update(self.transferDB.tables["TransferFileList"], result.id, { "status": "transfer", "start_time": datetime.datetime.utcnow() }) # Add Accounting: d = {} d["User"] = req.username d["Source"] = req.srcSE d["Destination"] = req.dstSE d["Protocol"] = req.protocol d["FinalStatus"] = "OK" d["TransferSize"] = 0 # TODO r = self.fileCatalog.getFileSize(result.LFN) if r["OK"]: if r["Value"]["Successful"]: d["TransferSize"] = r["Value"]["Successful"][result.LFN] d["TransferTime"] = 1 # 1s d["TransferOK"] = 1 d["TransferTotal"] = 1 acct_dt = DataTransfer() acct_dt.setValuesFromDict(d) acct_dt.setNowAsStartAndEndTime() # save it worker.acct_dt = acct_dt return True def helper_remove_transfer(self, worker): info = worker.info gLogger.info("File.id = %d -> finish" % info["id"]) self.helper_status_update( self.transferDB.tables["TransferFileList"], info["id"], { "status": "finish", "finish_time": datetime.datetime.utcnow() }) # Accounting acct_dt = worker.acct_dt acct_dt.setEndTime() # TODO d = {} td = acct_dt.endTime - acct_dt.startTime td_s = (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6 d["TransferTime"] = td_s # 1s if info["error"]: d["FinalStatus"] = "FAILED" d["TransferOK"] = 0 else: d["FinalStatus"] = "OK" d["TransferOK"] = 1 acct_dt.setValuesFromDict(d) acct_dt.commit() gLogger.info("Submit Accounting Data") def helper_check_request(self): """ check if the *transfer* request are ok. if the whole files are *finish*, then this request will become *finish*. """ infoDict = {"status": "transfer"} res = self.transferDB.get_TransferRequest(condDict=infoDict) if not res["OK"]: return reqlist = map(TransRequestEntryWithID._make, res["Value"]) for req in reqlist: res = self.transferDB._query( 'select count(*) from %(table)s where trans_req_id = %(id)d and status not in %(status_list)s' % { "table": self.transferDB.tables["TransferFileList"], "id": req.id, "status_list": '("finish", "kill")' # XXX finish or kill means this request is ok. }) if not res["OK"]: # TODO continue count = res["Value"][0][0] if count == 0: # if all status is finish, # the req status --> finish gLogger.info("req.id %d change from %s to finish" % (req.id, req.status)) self.helper_status_update( self.transferDB.tables["TransferRequest"], req.id, {"status": "finish"}) return def helper_get_new_request(self): # 1. get the *new* File in the <<Transfer File List>>. # if we get, goto <<Add New Transfer>> already_load_status = False result_new_file = self.helper_get_new_File() # 1.1 2014.04.20 # They want to the other requests are also loaded, # so I have to not return immediately if result_new_file: already_load_status = True # 2. 
if we can't get, we should get a *new* request # from the <<Transfer Request>>. # if we can't get, return False. STOP self.helper_check_request() result = self.helper_get_new_request_entry() if result: # 3. add the filelist in the dataset to the << Transfer File List >> condDict = {"name": result.dataset} res = self.transferDB.get_Dataset(condDict) if not res["OK"]: gLogger.error(res) return None filelist = res["Value"] # update the status in << Request >> if len(filelist) > 0: req_status = "transfer" else: req_status = "finish" self.helper_status_update( self.transferDB.tables["TransferRequest"], result.id, {"status": req_status}) self.transferDB.insert_TransferFileList(result.id, filelist) # 4. get the *new* File Again. # 5. can't get, return False. STOP # 4.prelude # If already loaded, return the last result if already_load_status and result_new_file: return result_new_file result = self.helper_get_new_File() return result def helper_get_new_request_entry(self): """ TransRequestEntryWithID( id=1L, username='******', dataset='my-dataset', srcSE='IHEP-USER', dstSE='IHEPD-USER', submit_time=datetime.datetime(2013, 3, 13, 20, 9, 34), status='new') """ condDict = {"status": "new"} res = self.transferDB.get_TransferRequest(condDict) if not res["OK"]: return None req_list = res["Value"] len_req = len(req_list) if len_req: # random select tmp_idx = random.randint(0, len_req - 1) return TransRequestEntryWithID._make(req_list[tmp_idx]) pass def helper_get_new_File(self): """ >>> helper.helper_get_new_File() TransFileListEntryWithID( id=1L, LFN='/path/does/not/exist', trans_req_id=1L, start_time=None, finish_time=None, status='new') """ condDict = {"status": "new"} res = self.transferDB.get_TransferFileList(condDict) if not res["OK"]: gLogger.error(res) return None filelist = res["Value"] gLogger.info("Filelist:") gLogger.info(filelist) len_files = len(filelist) if len_files > 0: tmp_idx = random.randint(0, len_files - 1) gLogger.info("get file entry index randomly: %d/%d" % (tmp_idx, len_files)) gLogger.info("get file entry", filelist[tmp_idx]) return TransFileListEntryWithID._make(filelist[tmp_idx]) return None def helper_status_update(self, table, id, toUpdate): res = self.transferDB.updateFields( table, updateDict=toUpdate, condDict={"id": id}, ) print res def helper_error_report(self, worker, reason): self.helper_status_update(self.transferDB.tables["TransferFileList"], worker.info["id"], {"error": reason}) def check_worker_status(self, worker): """check whether the file transfer is killed (in DB)""" res = self.transferDB.getFields( self.transferDB.tables["TransferFileList"], outFields=["status"], condDict={"id": worker.info["id"]}) if not res["OK"]: gLogger.error(res) return if not res["Value"]: return if len(res["Value"]) != 1: gLogger.error(res) return status = res["Value"][0][0] if status == "kill": gLogger.info("check worker should be killed: ", status) worker.proc.kill()
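# --- Illustrative sketch (not part of the agent above): transfer accounting time ---
# helper_remove_transfer() converts the accounting timedelta to whole seconds by
# hand; the expression is the usual total-seconds formula.  A standalone check with
# made-up timestamps (under Python 2 the final division is integer division):
import datetime

def _transfer_time_example():
    start = datetime.datetime(2013, 3, 13, 20, 9, 34)                 # hypothetical
    end = start + datetime.timedelta(seconds=42, microseconds=500000)
    td = end - start
    td_s = (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6
    # Python 2: td_s == 42 (integer division); td.total_seconds() == 42.5
    return td_s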
class PluginUtilities(object): """ Utility class used by plugins """ def __init__(self, plugin='Standard', transClient=None, dataManager=None, fc=None, debug=False, transInThread=None, transID=None): """ c'tor Setting defaults """ # clients if transClient is None: self.transClient = TransformationClient() else: self.transClient = transClient if dataManager is None: self.dm = DataManager() else: self.dm = dataManager if fc is None: self.fc = FileCatalog() else: self.fc = fc self.dmsHelper = DMSHelpers() self.plugin = plugin self.transID = transID self.params = {} self.groupSize = 0 self.maxFiles = 0 self.cachedLFNSize = {} self.transString = '' self.debug = debug if transInThread is None: self.transInThread = {} else: self.transInThread = transInThread self.log = gLogger.getSubLogger(plugin) def logVerbose(self, message, param=''): """ logger helper """ if self.debug: self.log.info('(V)' + self.transString + message, param) else: self.log.verbose(self.transString + message, param) def logDebug(self, message, param=''): """ logger helper """ self.log.debug(self.transString + message, param) def logInfo(self, message, param=''): """ logger helper """ self.log.info(self.transString + message, param) def logWarn(self, message, param=''): """ logger helper """ self.log.warn(self.transString + message, param) def logError(self, message, param=''): """ logger helper """ self.log.error(self.transString + message, param) def logException(self, message, param='', lException=False): """ logger helper """ self.log.exception(self.transString + message, param, lException) def setParameters(self, params): """ Set the transformation parameters and extract transID """ self.params = params self.transID = params['TransformationID'] self.transString = self.transInThread.get(self.transID, ' [NoThread] [%d] ' % self.transID) # @timeThis def groupByReplicas(self, files, status): """ Generates tasks based on the location of the input data :param dict fileReplicas: {'/this/is/at.1': ['SE1'], '/this/is/at.12': ['SE1', 'SE2'], '/this/is/at.2': ['SE2'], '/this/is/at_123': ['SE1', 'SE2', 'SE3'], '/this/is/at_23': ['SE2', 'SE3'], '/this/is/at_4': ['SE4']} """ tasks = [] nTasks = 0 if not files: return S_OK(tasks) files = dict(files) # Parameters if not self.groupSize: self.groupSize = self.getPluginParam('GroupSize', 10) flush = (status == 'Flush') self.logVerbose( "groupByReplicas: %d files, groupSize %d, flush %s" % (len(files), self.groupSize, flush)) # Consider files by groups of SEs, a file is only in one group # Then consider files site by site, but a file can now be at more than one site for groupSE in (True, False): if not files: break seFiles = getFileGroups(files, groupSE=groupSE) self.logDebug("fileGroups set: ", seFiles) for replicaSE in sortSEs(seFiles): lfns = seFiles[replicaSE] if lfns: tasksLfns = breakListIntoChunks(lfns, self.groupSize) lfnsInTasks = [] for taskLfns in tasksLfns: if flush or (len(taskLfns) >= self.groupSize): tasks.append((replicaSE, taskLfns)) lfnsInTasks += taskLfns # In case the file was at more than one site, remove it from the other sites' list # Remove files from global list for lfn in lfnsInTasks: files.pop(lfn) if not groupSE: # Remove files from other SEs for se in [se for se in seFiles if se != replicaSE]: seFiles[se] = [lfn for lfn in seFiles[se] if lfn not in lfnsInTasks] self.logVerbose( "groupByReplicas: %d tasks created (groupSE %s)" % (len(tasks) - nTasks, str(groupSE)), "%d files not included in tasks" % len(files)) nTasks = len(tasks) return S_OK(tasks) def 
createTasksBySize(self, lfns, replicaSE, fileSizes=None, flush=False): """ Split files in groups according to the size and create tasks for a given SE """ tasks = [] if fileSizes is None: fileSizes = self._getFileSize(lfns).get('Value') if fileSizes is None: self.logWarn('Error getting file sizes, no tasks created') return tasks taskLfns = [] taskSize = 0 if not self.groupSize: # input size in GB converted to bytes self.groupSize = float(self.getPluginParam('GroupSize', 1.)) * 1000 * 1000 * 1000 if not self.maxFiles: # FIXME: prepare for chaging the name of the ambiguoug CS option self.maxFiles = self.getPluginParam('MaxFilesPerTask', self.getPluginParam('MaxFiles', 100)) lfns = sorted(lfns, key=fileSizes.get) for lfn in lfns: size = fileSizes.get(lfn, 0) if size: if size > self.groupSize: tasks.append((replicaSE, [lfn])) else: taskSize += size taskLfns.append(lfn) if (taskSize > self.groupSize) or (len(taskLfns) >= self.maxFiles): tasks.append((replicaSE, taskLfns)) taskLfns = [] taskSize = 0 if flush and taskLfns: tasks.append((replicaSE, taskLfns)) if not tasks and not flush and taskLfns: self.logVerbose( 'Not enough data to create a task, and flush not set (%d bytes for groupSize %d)' % (taskSize, self.groupSize)) return tasks # @timeThis def groupBySize(self, files, status): """ Generate a task for a given amount of data """ tasks = [] nTasks = 0 if not len(files): return S_OK(tasks) files = dict(files) # Parameters if not self.groupSize: # input size in GB converted to bytes self.groupSize = float(self.getPluginParam('GroupSize', 1)) * 1000 * 1000 * 1000 flush = (status == 'Flush') self.logVerbose( "groupBySize: %d files, groupSize: %d, flush: %s" % (len(files), self.groupSize, flush)) # Get the file sizes res = self._getFileSize(files.keys()) if not res['OK']: return res fileSizes = res['Value'] for groupSE in (True, False): if not files: break seFiles = getFileGroups(files, groupSE=groupSE) for replicaSE in sorted(seFiles) if groupSE else sortSEs(seFiles): lfns = seFiles[replicaSE] newTasks = self.createTasksBySize(lfns, replicaSE, fileSizes=fileSizes, flush=flush) lfnsInTasks = [] for task in newTasks: lfnsInTasks += task[1] tasks += newTasks # Remove the selected files from the size cache self.clearCachedFileSize(lfnsInTasks) if not groupSE: # Remove files from other SEs for se in [se for se in seFiles if se != replicaSE]: seFiles[se] = [lfn for lfn in seFiles[se] if lfn not in lfnsInTasks] # Remove files from global list for lfn in lfnsInTasks: files.pop(lfn) self.logVerbose( "groupBySize: %d tasks created with groupSE %s" % (len(tasks) - nTasks, str(groupSE))) self.logVerbose("groupBySize: %d files have not been included in tasks" % len(files)) nTasks = len(tasks) self.logVerbose("Grouped %d files by size" % len(files)) return S_OK(tasks) def getExistingCounters(self, normalise=False, requestedSites=[]): res = self.transClient.getCounters('TransformationFiles', ['UsedSE'], {'TransformationID': self.params['TransformationID']}) if not res['OK']: return res usageDict = {} for usedDict, count in res['Value']: usedSE = usedDict['UsedSE'] if usedSE != 'Unknown': usageDict[usedSE] = count if requestedSites: siteDict = {} for se, count in usageDict.items(): res = getSitesForSE(se) if not res['OK']: return res for site in res['Value']: if site in requestedSites: siteDict[site] = count usageDict = siteDict.copy() if normalise: usageDict = self._normaliseShares(usageDict) return S_OK(usageDict) # @timeThis def _getFileSize(self, lfns): """ Get file size from a cache, if not from the 
catalog #FIXME: have to fill the cachedLFNSize! """ lfns = list(lfns) cachedLFNSize = dict(self.cachedLFNSize) fileSizes = {} for lfn in [lfn for lfn in lfns if lfn in cachedLFNSize]: fileSizes[lfn] = cachedLFNSize[lfn] self.logDebug( "Found cache hit for File size for %d files out of %d" % (len(fileSizes), len(lfns))) lfns = [lfn for lfn in lfns if lfn not in cachedLFNSize] if lfns: fileSizes = self._getFileSizeFromCatalog(lfns, fileSizes) if not fileSizes['OK']: self.logError(fileSizes['Message']) return fileSizes fileSizes = fileSizes['Value'] return S_OK(fileSizes) # @timeThis def _getFileSizeFromCatalog(self, lfns, fileSizes): """ Get file size from the catalog """ lfns = list(lfns) fileSizes = dict(fileSizes) res = self.fc.getFileSize(lfns) if not res['OK']: return S_ERROR("Failed to get sizes for all files: %s" % res['Message']) if res['Value']['Failed']: errorReason = sorted(set(res['Value']['Failed'].values())) self.logWarn("Failed to get sizes for %d files:" % len(res['Value']['Failed']), errorReason) fileSizes.update(res['Value']['Successful']) self.cachedLFNSize.update((res['Value']['Successful'])) self.logVerbose("Got size of %d files from catalog" % len(lfns)) return S_OK(fileSizes) def clearCachedFileSize(self, lfns): """ Utility function """ for lfn in [lfn for lfn in lfns if lfn in self.cachedLFNSize]: self.cachedLFNSize.pop(lfn) def getPluginParam(self, name, default=None): """ Get plugin parameters using specific settings or settings defined in the CS Caution: the type returned is that of the default value """ # get the value of a parameter looking 1st in the CS if default is not None: valueType = type(default) else: valueType = None # First look at a generic value... optionPath = "TransformationPlugins/%s" % (name) value = Operations().getValue(optionPath, None) self.logVerbose("Default plugin param %s: '%s'" % (optionPath, value)) # Then look at a plugin-specific value optionPath = "TransformationPlugins/%s/%s" % (self.plugin, name) value = Operations().getValue(optionPath, value) self.logVerbose("Specific plugin param %s: '%s'" % (optionPath, value)) if value is not None: default = value # Finally look at a transformation-specific parameter value = self.params.get(name, default) self.logVerbose( "Transformation plugin param %s: '%s'. Convert to %s" % (name, value, str(valueType))) if valueType and not isinstance(value, valueType): if valueType is list: try: value = ast.literal_eval(value) if value and value != 'None' else [] # literal_eval('SE-DST') -> ValueError # literal_eval('SE_MC-DST') -> SyntaxError # Don't ask... except (ValueError, SyntaxError): value = [val for val in value.replace(' ', '').split(',') if val] elif valueType is int: value = int(value) elif valueType is float: value = float(value) elif valueType is bool: if value in ('False', 'No', 'None', None, 0): value = False else: value = bool(value) elif valueType is not str: self.logWarn( "Unknown parameter type (%s) for %s, passed as string" % (str(valueType), name)) self.logVerbose("Final plugin param %s: '%s'" % (name, value)) return value @staticmethod def _normaliseShares(originalShares): """ Normalize shares to 1 """ total = sum(float(share) for share in originalShares.values()) return dict([(site, 100. * float(share) / total if total else 0.) 
for site, share in originalShares.items()]) def uniqueSEs(self, ses): """ return a list of SEs that are not physically the same """ newSEs = [] for se in ses: if not self.isSameSEInList(se, newSEs): newSEs.append(se) return newSEs def isSameSE(self, se1, se2): """ Check if 2 SEs are indeed the same. :param se1: name of the first StorageElement :param se2: name of the second StorageElement :returns: True/False if they are considered the same. See :py:mod:`~DIRAC.Resources.Storage.StorageElement.StorageElementItem.isSameSE` """ if se1 == se2: return True return StorageElement(se1).isSameSE(StorageElement(se2)) def isSameSEInList(self, se1, seList): """ Check if an SE is the same as any in a list """ if se1 in seList: return True for se in seList: if self.isSameSE(se1, se): return True return False def closerSEs(self, existingSEs, targetSEs, local=False): """ Order the targetSEs such that the first ones are closer to existingSEs. Keep all elements in targetSEs """ setTarget = set(targetSEs) sameSEs = set([se1 for se1 in setTarget for se2 in existingSEs if self.isSameSE(se1, se2)]) targetSEs = setTarget - set(sameSEs) if targetSEs: # Some SEs are left, look for sites existingSites = [self.dmsHelper.getLocalSiteForSE(se).get('Value') for se in existingSEs] existingSites = set([site for site in existingSites if site]) closeSEs = set([se for se in targetSEs if self.dmsHelper.getLocalSiteForSE(se).get('Value') in existingSites]) # print existingSEs, existingSites, targetSEs, closeSEs otherSEs = targetSEs - closeSEs targetSEs = list(closeSEs) random.shuffle(targetSEs) if not local and otherSEs: otherSEs = list(otherSEs) random.shuffle(otherSEs) targetSEs += otherSEs else: targetSEs = [] return (targetSEs + list(sameSEs)) if not local else targetSEs
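# --- Illustrative sketch (not part of the plugin utilities above): size packing ---
# createTasksBySize() packs LFNs into (SE, [lfns]) tasks until the cumulative size
# exceeds groupSize or maxFiles is reached; oversized files become single-file
# tasks.  A stripped-down standalone version of that packing loop, with made-up
# sizes in bytes and a hypothetical SE name:
def _pack_by_size_example():
    group_size = 5 * 10 ** 9      # ~5 GB, analogous to the GroupSize parameter
    max_files = 3                 # analogous to MaxFilesPerTask
    file_sizes = {'/lfn/a': 1 * 10 ** 9, '/lfn/b': 2 * 10 ** 9,      # hypothetical LFNs
                  '/lfn/c': 3 * 10 ** 9, '/lfn/d': 6 * 10 ** 9}
    tasks, task_lfns, task_size = [], [], 0
    for lfn in sorted(file_sizes, key=file_sizes.get):
        size = file_sizes[lfn]
        if size > group_size:                       # oversized file -> its own task
            tasks.append(('SOME-SE', [lfn]))
            continue
        task_size += size
        task_lfns.append(lfn)
        if task_size > group_size or len(task_lfns) >= max_files:
            tasks.append(('SOME-SE', task_lfns))
            task_lfns, task_size = [], 0
    # -> [('SOME-SE', ['/lfn/a', '/lfn/b', '/lfn/c']), ('SOME-SE', ['/lfn/d'])]
    return tasks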
class DataIntegrityClient(Client): """ The following methods are supported in the service but are not mentioned explicitly here: getProblematic() Obtains a problematic file from the IntegrityDB based on the LastUpdate time getPrognosisProblematics(prognosis) Obtains all the problematics of a particular prognosis from the integrityDB getProblematicsSummary() Obtains a count of the number of problematics for each prognosis found getDistinctPrognosis() Obtains the distinct prognosis found in the integrityDB getTransformationProblematics(prodID) Obtains the problematics for a given production incrementProblematicRetry(fileID) Increments the retry count for the supplied file ID changeProblematicPrognosis(fileID,newPrognosis) Changes the prognosis of the supplied file to the new prognosis setProblematicStatus(fileID,status) Updates the status of a problematic in the integrityDB removeProblematic(self,fileID) This removes the specified file ID from the integrity DB insertProblematic(sourceComponent,fileMetadata) Inserts file with supplied metadata into the integrity DB """ def __init__(self, **kwargs): super(DataIntegrityClient, self).__init__(**kwargs) self.setServer('DataManagement/DataIntegrity') self.dm = DataManager() self.fc = FileCatalog() def setFileProblematic(self, lfn, reason, sourceComponent=''): """ This method updates the status of the file in the FileCatalog and the IntegrityDB lfn - the lfn of the file reason - this is given to the integrity DB and should reflect the problem observed with the file sourceComponent is the component issuing the request. """ if isinstance(lfn, list): lfns = lfn elif isinstance(lfn, basestring): lfns = [lfn] else: errStr = "DataIntegrityClient.setFileProblematic: Supplied file info must be list or a single LFN." gLogger.error(errStr) return S_ERROR(errStr) gLogger.info( "DataIntegrityClient.setFileProblematic: Attempting to update %s files." % len(lfns)) fileMetadata = {} for lfn in lfns: fileMetadata[lfn] = { 'Prognosis': reason, 'LFN': lfn, 'PFN': '', 'SE': '' } res = self.insertProblematic(sourceComponent, fileMetadata) if not res['OK']: gLogger.error( "DataIntegrityClient.setReplicaProblematic: Failed to insert problematics to integrity DB" ) return res def reportProblematicReplicas(self, replicaTuple, se, reason): """ Simple wrapper function around setReplicaProblematic """ gLogger.info('The following %s files had %s at %s' % (len(replicaTuple), reason, se)) for lfn, _pfn, se, reason in sorted(replicaTuple): if lfn: gLogger.info(lfn) res = self.setReplicaProblematic(replicaTuple, sourceComponent='DataIntegrityClient') if not res['OK']: gLogger.info('Failed to update integrity DB with replicas', res['Message']) else: gLogger.info('Successfully updated integrity DB with replicas') def setReplicaProblematic(self, replicaTuple, sourceComponent=''): """ This method updates the status of the replica in the FileCatalog and the IntegrityDB The supplied replicaDict should be of the form {lfn :{'PFN':pfn,'SE':se,'Prognosis':prognosis} lfn - the lfn of the file pfn - the pfn if available (otherwise '') se - the storage element of the problematic replica (otherwise '') prognosis - this is given to the integrity DB and should reflect the problem observed with the file sourceComponent is the component issuing the request. """ if isinstance(replicaTuple, tuple): replicaTuple = [replicaTuple] elif isinstance(replicaTuple, list): pass else: errStr = "DataIntegrityClient.setReplicaProblematic: Supplied replica info must be a tuple or list of tuples." 
gLogger.error(errStr) return S_ERROR(errStr) gLogger.info( "DataIntegrityClient.setReplicaProblematic: Attempting to update %s replicas." % len(replicaTuple)) replicaDict = {} for lfn, pfn, se, reason in replicaTuple: replicaDict[lfn] = { 'Prognosis': reason, 'LFN': lfn, 'PFN': pfn, 'SE': se } res = self.insertProblematic(sourceComponent, replicaDict) if not res['OK']: gLogger.error( "DataIntegrityClient.setReplicaProblematic: Failed to insert problematic to integrity DB" ) return res for lfn in replicaDict.keys(): replicaDict[lfn]['Status'] = 'Problematic' res = self.fc.setReplicaStatus(replicaDict) if not res['OK']: errStr = "DataIntegrityClient.setReplicaProblematic: Completely failed to update replicas." gLogger.error(errStr, res['Message']) return res failed = res['Value']['Failed'] successful = res['Value']['Successful'] resDict = {'Successful': successful, 'Failed': failed} return S_OK(resDict) ########################################################################## # # This section contains the resolution methods for various prognoses # def __updateCompletedFiles(self, prognosis, fileID): gLogger.info("%s file (%d) is resolved" % (prognosis, fileID)) return self.setProblematicStatus(fileID, 'Resolved') def __returnProblematicError(self, fileID, res): self.incrementProblematicRetry(fileID) gLogger.error('DataIntegrityClient failure', res['Message']) return res def __updateReplicaToChecked(self, problematicDict): lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] prognosis = problematicDict['Prognosis'] problematicDict['Status'] = 'Checked' res = returnSingleResult( self.fc.setReplicaStatus({lfn: problematicDict})) if not res['OK']: return self.__returnProblematicError(fileID, res) gLogger.info("%s replica (%d) is updated to Checked status" % (prognosis, fileID)) return self.__updateCompletedFiles(prognosis, fileID) def resolveCatalogPFNSizeMismatch(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolved the CatalogPFNSizeMismatch prognosis """ lfn = problematicDict['LFN'] se = problematicDict['SE'] fileID = problematicDict['FileID'] res = returnSingleResult(self.fc.getFileSize(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) catalogSize = res['Value'] res = returnSingleResult(StorageElement(se).getFileSize(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) storageSize = res['Value'] bkKCatalog = FileCatalog(['BookkeepingDB']) res = returnSingleResult(bkKCatalog.getFileSize(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) bookkeepingSize = res['Value'] if bookkeepingSize == catalogSize == storageSize: gLogger.info( "CatalogPFNSizeMismatch replica (%d) matched all registered sizes." % fileID) return self.__updateReplicaToChecked(problematicDict) if catalogSize == bookkeepingSize: gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to mismatch the bookkeeping also" % fileID) res = returnSingleResult(self.fc.getReplicas(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if len(res['Value']) <= 1: gLogger.info( "CatalogPFNSizeMismatch replica (%d) has no other replicas." % fileID) return S_ERROR( "Not removing catalog file mismatch since the only replica" ) else: gLogger.info( "CatalogPFNSizeMismatch replica (%d) has other replicas. Removing..." 
% fileID) res = self.dm.removeReplica(se, lfn) if not res['OK']: return self.__returnProblematicError(fileID, res) return self.__updateCompletedFiles('CatalogPFNSizeMismatch', fileID) if (catalogSize != bookkeepingSize) and (bookkeepingSize == storageSize): gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to match the bookkeeping size" % fileID) res = self.__updateReplicaToChecked(problematicDict) if not res['OK']: return self.__returnProblematicError(fileID, res) return self.changeProblematicPrognosis(fileID, 'BKCatalogSizeMismatch') gLogger.info( "CatalogPFNSizeMismatch replica (%d) all sizes found mismatch. Updating retry count" % fileID) return self.incrementProblematicRetry(fileID) #FIXME: Unused? def resolvePFNNotRegistered(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNNotRegistered prognosis """ lfn = problematicDict['LFN'] seName = problematicDict['SE'] fileID = problematicDict['FileID'] se = StorageElement(seName) res = returnSingleResult(self.fc.exists(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if not res['Value']: # The file does not exist in the catalog res = returnSingleResult(se.removeFile(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) return self.__updateCompletedFiles('PFNNotRegistered', fileID) res = returnSingleResult(se.getFileMetadata(lfn)) if (not res['OK']) and (re.search('File does not exist', res['Message'])): gLogger.info("PFNNotRegistered replica (%d) found to be missing." % fileID) return self.__updateCompletedFiles('PFNNotRegistered', fileID) elif not res['OK']: return self.__returnProblematicError(fileID, res) storageMetadata = res['Value'] if storageMetadata['Lost']: gLogger.info( "PFNNotRegistered replica (%d) found to be Lost. Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'PFNLost') if storageMetadata['Unavailable']: gLogger.info( "PFNNotRegistered replica (%d) found to be Unavailable. Updating retry count" % fileID) return self.incrementProblematicRetry(fileID) # HACK until we can obtain the space token descriptions through GFAL site = seName.split('_')[0].split('-')[0] if not storageMetadata['Cached']: if lfn.endswith('.raw'): seName = '%s-RAW' % site else: seName = '%s-RDST' % site elif storageMetadata['Migrated']: if lfn.startswith('/lhcb/data'): seName = '%s_M-DST' % site else: seName = '%s_MC_M-DST' % site else: if lfn.startswith('/lhcb/data'): seName = '%s-DST' % site else: seName = '%s_MC-DST' % site problematicDict['SE'] = seName res = returnSingleResult(se.getURL(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) problematicDict['PFN'] = res['Value'] res = returnSingleResult(self.fc.addReplica({lfn: problematicDict})) if not res['OK']: return self.__returnProblematicError(fileID, res) res = returnSingleResult(self.fc.getFileMetadata(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if res['Value']['Size'] != storageMetadata['Size']: gLogger.info( "PFNNotRegistered replica (%d) found with catalog size mismatch. Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'CatalogPFNSizeMismatch') return self.__updateCompletedFiles('PFNNotRegistered', fileID) #FIXME: Unused? 
def resolveLFNCatalogMissing(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolved the LFNCatalogMissing prognosis """ lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = returnSingleResult(self.fc.exists(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if res['Value']: return self.__updateCompletedFiles('LFNCatalogMissing', fileID) # Remove the file from all catalogs # RF_NOTE : here I can do it because it's a single file, but otherwise I would need to sort the path res = returnSingleResult(self.fc.removeFile(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) return self.__updateCompletedFiles('LFNCatalogMissing', fileID) #FIXME: Unused? def resolvePFNMissing(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNMissing prognosis """ se = problematicDict['SE'] lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = returnSingleResult(self.fc.exists(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if not res['Value']: gLogger.info("PFNMissing file (%d) no longer exists in catalog" % fileID) return self.__updateCompletedFiles('PFNMissing', fileID) res = returnSingleResult(StorageElement(se).exists(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if res['Value']: gLogger.info("PFNMissing replica (%d) is no longer missing" % fileID) return self.__updateReplicaToChecked(problematicDict) gLogger.info("PFNMissing replica (%d) does not exist" % fileID) res = returnSingleResult(self.fc.getReplicas(lfn, allStatus=True)) if not res['OK']: return self.__returnProblematicError(fileID, res) replicas = res['Value'] seSite = se.split('_')[0].split('-')[0] found = False print replicas for replicaSE in replicas.keys(): if re.search(seSite, replicaSE): found = True problematicDict['SE'] = replicaSE se = replicaSE if not found: gLogger.info( "PFNMissing replica (%d) is no longer registered at SE. Resolved." % fileID) return self.__updateCompletedFiles('PFNMissing', fileID) gLogger.info( "PFNMissing replica (%d) does not exist. Removing from catalog..." % fileID) res = returnSingleResult(self.fc.removeReplica({lfn: problematicDict})) if not res['OK']: return self.__returnProblematicError(fileID, res) if len(replicas) == 1: gLogger.info( "PFNMissing replica (%d) had a single replica. Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'LFNZeroReplicas') res = self.dm.replicateAndRegister(problematicDict['LFN'], se) if not res['OK']: return self.__returnProblematicError(fileID, res) # If we get here the problem is solved so we can update the integrityDB return self.__updateCompletedFiles('PFNMissing', fileID) #FIXME: Unused? def resolvePFNUnavailable(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNUnavailable prognosis """ lfn = problematicDict['LFN'] se = problematicDict['SE'] fileID = problematicDict['FileID'] res = returnSingleResult(StorageElement(se).getFileMetadata(lfn)) if (not res['OK']) and (re.search('File does not exist', res['Message'])): # The file is no longer Unavailable but has now dissapeared completely gLogger.info( "PFNUnavailable replica (%d) found to be missing. 
Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'PFNMissing') if (not res['OK']) or res['Value']['Unavailable']: gLogger.info( "PFNUnavailable replica (%d) found to still be Unavailable" % fileID) return self.incrementProblematicRetry(fileID) if res['Value']['Lost']: gLogger.info( "PFNUnavailable replica (%d) is now found to be Lost. Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'PFNLost') gLogger.info("PFNUnavailable replica (%d) is no longer Unavailable" % fileID) # Need to make the replica okay in the Catalog return self.__updateReplicaToChecked(problematicDict) #FIXME: Unused? def resolvePFNZeroSize(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolves the PFNZeroSize prognosis """ lfn = problematicDict['LFN'] seName = problematicDict['SE'] fileID = problematicDict['FileID'] se = StorageElement(seName) res = returnSingleResult(se.getFileSize(lfn)) if (not res['OK']) and (re.search('File does not exist', res['Message'])): gLogger.info( "PFNZeroSize replica (%d) found to be missing. Updating prognosis" % problematicDict['FileID']) return self.changeProblematicPrognosis(fileID, 'PFNMissing') storageSize = res['Value'] if storageSize == 0: res = returnSingleResult(se.removeFile(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) gLogger.info( "PFNZeroSize replica (%d) removed. Updating prognosis" % problematicDict['FileID']) return self.changeProblematicPrognosis(fileID, 'PFNMissing') res = returnSingleResult(self.fc.getReplicas(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if seName not in res['Value']: gLogger.info( "PFNZeroSize replica (%d) not registered in catalog. Updating prognosis" % problematicDict['FileID']) return self.changeProblematicPrognosis(fileID, 'PFNNotRegistered') res = returnSingleResult(self.fc.getFileMetadata(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) catalogSize = res['Value']['Size'] if catalogSize != storageSize: gLogger.info( "PFNZeroSize replica (%d) size found to differ from registered metadata. Updating prognosis" % problematicDict['FileID']) return self.changeProblematicPrognosis(fileID, 'CatalogPFNSizeMismatch') return self.__updateCompletedFiles('PFNZeroSize', fileID) ############################################################################################ #FIXME: Unused? def resolveLFNZeroReplicas(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolves the LFNZeroReplicas prognosis """ lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = returnSingleResult(self.fc.getReplicas(lfn, allStatus=True)) if res['OK'] and res['Value']: gLogger.info("LFNZeroReplicas file (%d) found to have replicas" % fileID) else: gLogger.info( "LFNZeroReplicas file (%d) does not have replicas. Checking storage..." % fileID) pfnsFound = False for storageElementName in sorted( gConfig.getValue( 'Resources/StorageElementGroups/Tier1_MC_M-DST', [])): res = self.__getStoragePathExists([lfn], storageElementName) if lfn in res['Value']: gLogger.info( "LFNZeroReplicas file (%d) found storage file at %s" % (fileID, storageElementName)) self.reportProblematicReplicas( [(lfn, 'deprecatedUrl', storageElementName, 'PFNNotRegistered')], storageElementName, 'PFNNotRegistered') pfnsFound = True if not pfnsFound: gLogger.info( "LFNZeroReplicas file (%d) did not have storage files. Removing..." 
% fileID) res = returnSingleResult(self.fc.removeFile(lfn)) if not res['OK']: gLogger.error('DataIntegrityClient: failed to remove file', res['Message']) # Increment the number of retries for this file self.incrementProblematicRetry(fileID) return res gLogger.info("LFNZeroReplicas file (%d) removed from catalog" % fileID) # If we get here the problem is solved so we can update the integrityDB return self.__updateCompletedFiles('LFNZeroReplicas', fileID) def _reportProblematicFiles(self, lfns, reason): """ Simple wrapper function around setFileProblematic """ gLogger.info('The following %s files were found with %s' % (len(lfns), reason)) for lfn in sorted(lfns): gLogger.info(lfn) res = self.setFileProblematic(lfns, reason, sourceComponent='DataIntegrityClient') if not res['OK']: gLogger.info('Failed to update integrity DB with files', res['Message']) else: gLogger.info('Successfully updated integrity DB with files')
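# --- Illustrative sketch (not part of the client above): problematic replica tuples ---
# setReplicaProblematic() expects (lfn, pfn, se, prognosis) tuples and converts them
# into the per-LFN dictionaries inserted into the integrity DB and then marked
# 'Problematic' in the file catalog.  A small standalone mock of that conversion,
# with hypothetical LFNs and SE names:
def _replica_tuple_example():
    replica_tuples = [('/some/lfn/file1', '', 'SE-A', 'PFNMissing'),        # hypothetical
                      ('/some/lfn/file2', '', 'SE-B', 'PFNZeroSize')]
    replica_dict = {}
    for lfn, pfn, se, reason in replica_tuples:
        replica_dict[lfn] = {'Prognosis': reason, 'LFN': lfn, 'PFN': pfn, 'SE': se}
    # replica_dict is the shape handed to insertProblematic(); setReplicaProblematic()
    # then adds 'Status': 'Problematic' before calling FileCatalog().setReplicaStatus()
    return replica_dict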
class DataIntegrityClient(Client): """ The following methods are supported in the service but are not mentioned explicitly here: getProblematic() Obtains a problematic file from the IntegrityDB based on the LastUpdate time getPrognosisProblematics(prognosis) Obtains all the problematics of a particular prognosis from the integrityDB getProblematicsSummary() Obtains a count of the number of problematics for each prognosis found getDistinctPrognosis() Obtains the distinct prognosis found in the integrityDB getTransformationProblematics(prodID) Obtains the problematics for a given production incrementProblematicRetry(fileID) Increments the retry count for the supplied file ID changeProblematicPrognosis(fileID,newPrognosis) Changes the prognosis of the supplied file to the new prognosis setProblematicStatus(fileID,status) Updates the status of a problematic in the integrityDB removeProblematic(self,fileID) This removes the specified file ID from the integrity DB insertProblematic(sourceComponent,fileMetadata) Inserts file with supplied metadata into the integrity DB """ def __init__(self, **kwargs): Client.__init__(self, **kwargs) self.setServer('DataManagement/DataIntegrity') self.dm = DataManager() self.fc = FileCatalog() ########################################################################## # # This section contains the specific methods for LFC->SE checks # def catalogDirectoryToSE(self, lfnDir): """ This obtains the replica and metadata information from the catalog for the supplied directory and checks against the storage elements. """ gLogger.info("-" * 40) gLogger.info("Performing the LFC->SE check") gLogger.info("-" * 40) if type(lfnDir) in types.StringTypes: lfnDir = [lfnDir] res = self.__getCatalogDirectoryContents(lfnDir) if not res['OK']: return res replicas = res['Value']['Replicas'] catalogMetadata = res['Value']['Metadata'] res = self.__checkPhysicalFiles(replicas, catalogMetadata) if not res['OK']: return res resDict = { 'CatalogMetadata': catalogMetadata, 'CatalogReplicas': replicas } return S_OK(resDict) def catalogFileToSE(self, lfns): """ This obtains the replica and metadata information from the catalog and checks against the storage elements. """ gLogger.info("-" * 40) gLogger.info("Performing the LFC->SE check") gLogger.info("-" * 40) if type(lfns) in types.StringTypes: lfns = [lfns] res = self.__getCatalogMetadata(lfns) if not res['OK']: return res catalogMetadata = res['Value'] res = self.__getCatalogReplicas(catalogMetadata.keys()) if not res['OK']: return res replicas = res['Value'] res = self.__checkPhysicalFiles(replicas, catalogMetadata) if not res['OK']: return res resDict = { 'CatalogMetadata': catalogMetadata, 'CatalogReplicas': replicas } return S_OK(resDict) def checkPhysicalFiles(self, replicas, catalogMetadata, ses=[]): """ This obtains takes the supplied replica and metadata information obtained from the catalog and checks against the storage elements. 
""" gLogger.info("-" * 40) gLogger.info("Performing the LFC->SE check") gLogger.info("-" * 40) return self.__checkPhysicalFiles(replicas, catalogMetadata, ses=ses) def __checkPhysicalFiles(self, replicas, catalogMetadata, ses=[]): """ This obtains the physical file metadata and checks the metadata against the catalog entries """ seLfns = {} for lfn, replicaDict in replicas.items(): for se, _url in replicaDict.items(): if (ses) and (se not in ses): continue seLfns.setdefault(se, []).append(lfn) gLogger.info('%s %s' % ('Storage Element'.ljust(20), 'Replicas'.rjust(20))) for se in sortList(seLfns): files = len(seLfns[se]) gLogger.info('%s %s' % (se.ljust(20), str(files).rjust(20))) lfns = seLfns[se] sizeMismatch = [] res = self.__checkPhysicalFileMetadata(lfns, se) if not res['OK']: gLogger.error('Failed to get physical file metadata.', res['Message']) return res for lfn, metadata in res['Value'].items(): if lfn in catalogMetadata: if (metadata['Size'] != catalogMetadata[lfn]['Size']) and ( metadata['Size'] != 0): sizeMismatch.append((lfn, 'deprecatedUrl', se, 'CatalogPFNSizeMismatch')) if sizeMismatch: self.__reportProblematicReplicas(sizeMismatch, se, 'CatalogPFNSizeMismatch') return S_OK() def __checkPhysicalFileMetadata(self, lfns, se): """ Check obtain the physical file metadata and check the files are available """ gLogger.info('Checking the integrity of %s physical files at %s' % (len(lfns), se)) res = StorageElement(se).getFileMetadata(lfns) if not res['OK']: gLogger.error('Failed to get metadata for lfns.', res['Message']) return res lfnMetadataDict = res['Value']['Successful'] # If the replicas are completely missing missingReplicas = [] for lfn, reason in res['Value']['Failed'].items(): if re.search('File does not exist', reason): missingReplicas.append( (lfn, 'deprecatedUrl', se, 'PFNMissing')) if missingReplicas: self.__reportProblematicReplicas(missingReplicas, se, 'PFNMissing') lostReplicas = [] unavailableReplicas = [] zeroSizeReplicas = [] # If the files are not accessible for lfn, lfnMetadata in lfnMetadataDict.items(): if lfnMetadata['Lost']: lostReplicas.append((lfn, 'deprecatedUrl', se, 'PFNLost')) if lfnMetadata['Unavailable']: unavailableReplicas.append( (lfn, 'deprecatedUrl', se, 'PFNUnavailable')) if lfnMetadata['Size'] == 0: zeroSizeReplicas.append( (lfn, 'deprecatedUrl', se, 'PFNZeroSize')) if lostReplicas: self.__reportProblematicReplicas(lostReplicas, se, 'PFNLost') if unavailableReplicas: self.__reportProblematicReplicas(unavailableReplicas, se, 'PFNUnavailable') if zeroSizeReplicas: self.__reportProblematicReplicas(zeroSizeReplicas, se, 'PFNZeroSize') gLogger.info( 'Checking the integrity of physical files at %s complete' % se) return S_OK(lfnMetadataDict) ########################################################################## # # This section contains the specific methods for SE->LFC checks # def storageDirectoryToCatalog(self, lfnDir, storageElement): """ This obtains the file found on the storage element in the supplied directories and determines whether they exist in the catalog and checks their metadata elements """ gLogger.info("-" * 40) gLogger.info("Performing the SE->LFC check at %s" % storageElement) gLogger.info("-" * 40) if type(lfnDir) in types.StringTypes: lfnDir = [lfnDir] res = self.__getStorageDirectoryContents(lfnDir, storageElement) if not res['OK']: return res storageFileMetadata = res['Value'] if storageFileMetadata: return self.__checkCatalogForSEFiles(storageFileMetadata, storageElement) return S_OK({'CatalogMetadata': {}, 
'StorageMetadata': {}}) def __checkCatalogForSEFiles(self, storageMetadata, storageElement): gLogger.info('Checking %s storage files exist in the catalog' % len(storageMetadata)) res = self.fc.getReplicas(storageMetadata) if not res['OK']: gLogger.error("Failed to get replicas for LFN", res['Message']) return res failedLfns = res['Value']['Failed'] successfulLfns = res['Value']['Successful'] notRegisteredLfns = [] for lfn in storageMetadata: if lfn in failedLfns: if 'No such file or directory' in failedLfns[lfn]: notRegisteredLfns.append( (lfn, 'deprecatedUrl', storageElement, 'LFNNotRegistered')) failedLfns.pop(lfn) elif storageElement not in successfulLfns[lfn]: notRegisteredLfns.append( (lfn, 'deprecatedUrl', storageElement, 'LFNNotRegistered')) if notRegisteredLfns: self.__reportProblematicReplicas(notRegisteredLfns, storageElement, 'LFNNotRegistered') if failedLfns: return S_ERROR('Failed to obtain replicas') # For the LFNs found to be registered obtain the file metadata from the catalog and verify against the storage metadata res = self.__getCatalogMetadata(storageMetadata) if not res['OK']: return res catalogMetadata = res['Value'] sizeMismatch = [] for lfn, lfnCatalogMetadata in catalogMetadata.items(): lfnStorageMetadata = storageMetadata[lfn] if (lfnStorageMetadata['Size'] != lfnCatalogMetadata['Size']) and ( lfnStorageMetadata['Size'] != 0): sizeMismatch.append((lfn, 'deprecatedUrl', storageElement, 'CatalogPFNSizeMismatch')) if sizeMismatch: self.__reportProblematicReplicas(sizeMismatch, storageElement, 'CatalogPFNSizeMismatch') gLogger.info('Checking storage files exist in the catalog complete') resDict = { 'CatalogMetadata': catalogMetadata, 'StorageMetadata': storageMetadata } return S_OK(resDict) def getStorageDirectoryContents(self, lfnDir, storageElement): """ This obtains takes the supplied lfn directories and recursively obtains the files in the supplied storage element """ return self.__getStorageDirectoryContents(lfnDir, storageElement) def __getStorageDirectoryContents(self, lfnDir, storageElement): """ Obtians the contents of the supplied directory on the storage """ gLogger.info('Obtaining the contents for %s directories at %s' % (len(lfnDir), storageElement)) se = StorageElement(storageElement) res = se.exists(lfnDir) if not res['OK']: gLogger.error("Failed to obtain existance of directories", res['Message']) return res for directory, error in res['Value']['Failed'].items(): gLogger.error('Failed to determine existance of directory', '%s %s' % (directory, error)) if res['Value']['Failed']: return S_ERROR('Failed to determine existance of directory') directoryExists = res['Value']['Successful'] activeDirs = [] for directory in sorted(directoryExists): exists = directoryExists[directory] if exists: activeDirs.append(directory) allFiles = {} while len(activeDirs) > 0: currentDir = activeDirs[0] res = se.listDirectory(currentDir) activeDirs.remove(currentDir) if not res['OK']: gLogger.error('Failed to get directory contents', res['Message']) return res elif currentDir in res['Value']['Failed']: gLogger.error( 'Failed to get directory contents', '%s %s' % (currentDir, res['Value']['Failed'][currentDir])) return S_ERROR(res['Value']['Failed'][currentDir]) else: dirContents = res['Value']['Successful'][currentDir] activeDirs.extend( se.getLFNFromURL(dirContents['SubDirs']).get( 'Value', {}).get('Successful', [])) fileURLMetadata = dirContents['Files'] fileMetadata = {} res = se.getLFNFromURL(fileURLMetadata) if not res['OK']: gLogger.error('Failed to get directory 
content LFNs', res['Message']) return res for url, error in res['Value']['Failed'].items(): gLogger.error("Failed to get LFN for URL", "%s %s" % (url, error)) if res['Value']['Failed']: return S_ERROR("Failed to get LFNs for PFNs") urlLfns = res['Value']['Successful'] for urlLfn, lfn in urlLfns.items(): fileMetadata[lfn] = fileURLMetadata[urlLfn] allFiles.update(fileMetadata) zeroSizeFiles = [] for lfn in sorted(allFiles): if os.path.basename(lfn) == 'dirac_directory': allFiles.pop(lfn) else: metadata = allFiles[lfn] if metadata['Size'] == 0: zeroSizeFiles.append( (lfn, 'deprecatedUrl', storageElement, 'PFNZeroSize')) if zeroSizeFiles: self.__reportProblematicReplicas(zeroSizeFiles, storageElement, 'PFNZeroSize') gLogger.info('Obtained at total of %s files for directories at %s' % (len(allFiles), storageElement)) return S_OK(allFiles) def __getStoragePathExists(self, lfnPaths, storageElement): gLogger.info('Determining the existance of %d files at %s' % (len(lfnPaths), storageElement)) se = StorageElement(storageElement) res = se.exists(lfnPaths) if not res['OK']: gLogger.error("Failed to obtain existance of paths", res['Message']) return res for lfnPath, error in res['Value']['Failed'].items(): gLogger.error('Failed to determine existance of path', '%s %s' % (lfnPath, error)) if res['Value']['Failed']: return S_ERROR('Failed to determine existance of paths') pathExists = res['Value']['Successful'] resDict = {} for lfn, exists in pathExists.items(): if exists: resDict[lfn] = True return S_OK(resDict) ########################################################################## # # This section contains the specific methods for obtaining replica and metadata information from the catalog # def __getCatalogDirectoryContents(self, lfnDir): """ Obtain the contents of the supplied directory """ gLogger.info('Obtaining the catalog contents for %s directories' % len(lfnDir)) activeDirs = lfnDir allFiles = {} while len(activeDirs) > 0: currentDir = activeDirs[0] res = self.fc.listDirectory(currentDir) activeDirs.remove(currentDir) if not res['OK']: gLogger.error('Failed to get directory contents', res['Message']) return res elif res['Value']['Failed'].has_key(currentDir): gLogger.error( 'Failed to get directory contents', '%s %s' % (currentDir, res['Value']['Failed'][currentDir])) else: dirContents = res['Value']['Successful'][currentDir] activeDirs.extend(dirContents['SubDirs']) allFiles.update(dirContents['Files']) zeroReplicaFiles = [] zeroSizeFiles = [] allReplicaDict = {} allMetadataDict = {} for lfn, lfnDict in allFiles.items(): lfnReplicas = {} for se, replicaDict in lfnDict['Replicas'].items(): lfnReplicas[se] = replicaDict['PFN'] if not lfnReplicas: zeroReplicaFiles.append(lfn) allReplicaDict[lfn] = lfnReplicas allMetadataDict[lfn] = lfnDict['MetaData'] if lfnDict['MetaData']['Size'] == 0: zeroSizeFiles.append(lfn) if zeroReplicaFiles: self.__reportProblematicFiles(zeroReplicaFiles, 'LFNZeroReplicas') if zeroSizeFiles: self.__reportProblematicFiles(zeroSizeFiles, 'LFNZeroSize') gLogger.info( 'Obtained at total of %s files for the supplied directories' % len(allMetadataDict)) resDict = {'Metadata': allMetadataDict, 'Replicas': allReplicaDict} return S_OK(resDict) def __getCatalogReplicas(self, lfns): """ Obtain the file replicas from the catalog while checking that there are replicas """ gLogger.info('Obtaining the replicas for %s files' % len(lfns)) zeroReplicaFiles = [] res = self.fc.getReplicas(lfns, allStatus=True) if not res['OK']: gLogger.error('Failed to get catalog replicas', 
res['Message']) return res allReplicas = res['Value']['Successful'] for lfn, error in res['Value']['Failed'].items(): if re.search('File has zero replicas', error): zeroReplicaFiles.append(lfn) if zeroReplicaFiles: self.__reportProblematicFiles(zeroReplicaFiles, 'LFNZeroReplicas') gLogger.info('Obtaining the replicas for files complete') return S_OK(allReplicas) def __getCatalogMetadata(self, lfns): """ Obtain the file metadata from the catalog while checking they exist """ if not lfns: return S_OK({}) gLogger.info('Obtaining the catalog metadata for %s files' % len(lfns)) missingCatalogFiles = [] zeroSizeFiles = [] res = self.fc.getFileMetadata(lfns) if not res['OK']: gLogger.error('Failed to get catalog metadata', res['Message']) return res allMetadata = res['Value']['Successful'] for lfn, error in res['Value']['Failed'].items(): if re.search('No such file or directory', error): missingCatalogFiles.append(lfn) if missingCatalogFiles: self.__reportProblematicFiles(missingCatalogFiles, 'LFNCatalogMissing') for lfn, metadata in allMetadata.items(): if metadata['Size'] == 0: zeroSizeFiles.append(lfn) if zeroSizeFiles: self.__reportProblematicFiles(zeroSizeFiles, 'LFNZeroSize') gLogger.info('Obtaining the catalog metadata complete') return S_OK(allMetadata) ########################################################################## # # This section contains the methods for inserting problematic files into the integrity DB # def __reportProblematicFiles(self, lfns, reason): """ Simple wrapper function around setFileProblematic """ gLogger.info('The following %s files were found with %s' % (len(lfns), reason)) for lfn in sortList(lfns): gLogger.info(lfn) res = self.setFileProblematic(lfns, reason, sourceComponent='DataIntegrityClient') if not res['OK']: gLogger.info('Failed to update integrity DB with files', res['Message']) else: gLogger.info('Successfully updated integrity DB with files') def setFileProblematic(self, lfn, reason, sourceComponent=''): """ This method updates the status of the file in the FileCatalog and the IntegrityDB lfn - the lfn of the file reason - this is given to the integrity DB and should reflect the problem observed with the file sourceComponent is the component issuing the request. """ if type(lfn) == types.ListType: lfns = lfn elif type(lfn) == types.StringType: lfns = [lfn] else: errStr = "DataIntegrityClient.setFileProblematic: Supplied file info must be list or a single LFN." gLogger.error(errStr) return S_ERROR(errStr) gLogger.info( "DataIntegrityClient.setFileProblematic: Attempting to update %s files." 
% len(lfns)) fileMetadata = {} for lfn in lfns: fileMetadata[lfn] = { 'Prognosis': reason, 'LFN': lfn, 'PFN': '', 'SE': '' } res = self.insertProblematic(sourceComponent, fileMetadata) if not res['OK']: gLogger.error( "DataIntegrityClient.setReplicaProblematic: Failed to insert problematics to integrity DB" ) return res def __reportProblematicReplicas(self, replicaTuple, se, reason): """ Simple wrapper function around setReplicaProblematic """ gLogger.info('The following %s files had %s at %s' % (len(replicaTuple), reason, se)) for lfn, _pfn, se, reason in sortList(replicaTuple): if lfn: gLogger.info(lfn) res = self.setReplicaProblematic(replicaTuple, sourceComponent='DataIntegrityClient') if not res['OK']: gLogger.info('Failed to update integrity DB with replicas', res['Message']) else: gLogger.info('Successfully updated integrity DB with replicas') def setReplicaProblematic(self, replicaTuple, sourceComponent=''): """ This method updates the status of the replica in the FileCatalog and the IntegrityDB The supplied replicaDict should be of the form {lfn :{'PFN':pfn,'SE':se,'Prognosis':prognosis} lfn - the lfn of the file pfn - the pfn if available (otherwise '') se - the storage element of the problematic replica (otherwise '') prognosis - this is given to the integrity DB and should reflect the problem observed with the file sourceComponent is the component issuing the request. """ if type(replicaTuple) == types.TupleType: replicaTuple = [replicaTuple] elif type(replicaTuple) == types.ListType: pass else: errStr = "DataIntegrityClient.setReplicaProblematic: Supplied replica info must be a tuple or list of tuples." gLogger.error(errStr) return S_ERROR(errStr) gLogger.info( "DataIntegrityClient.setReplicaProblematic: Attempting to update %s replicas." % len(replicaTuple)) replicaDict = {} for lfn, pfn, se, reason in replicaTuple: replicaDict[lfn] = { 'Prognosis': reason, 'LFN': lfn, 'PFN': pfn, 'SE': se } res = self.insertProblematic(sourceComponent, replicaDict) if not res['OK']: gLogger.error( "DataIntegrityClient.setReplicaProblematic: Failed to insert problematic to integrity DB" ) return res for lfn in replicaDict.keys(): replicaDict[lfn]['Status'] = 'Problematic' res = self.fc.setReplicaStatus(replicaDict) if not res['OK']: errStr = "DataIntegrityClient.setReplicaProblematic: Completely failed to update replicas." 
gLogger.error(errStr, res['Message']) return res failed = res['Value']['Failed'] successful = res['Value']['Successful'] resDict = {'Successful': successful, 'Failed': failed} return S_OK(resDict) ########################################################################## # # This section contains the resolution methods for various prognoses # def __updateCompletedFiles(self, prognosis, fileID): gLogger.info("%s file (%d) is resolved" % (prognosis, fileID)) return self.setProblematicStatus(fileID, 'Resolved') def __returnProblematicError(self, fileID, res): self.incrementProblematicRetry(fileID) gLogger.error('DataIntegrityClient failure', res['Message']) return res # def __getRegisteredPFNLFN( self, pfn, storageElement ): # # res = StorageElement( storageElement ).getURL( pfn ) # if not res['OK']: # gLogger.error( "Failed to get registered PFN for physical files", res['Message'] ) # return res # for pfn, error in res['Value']['Failed'].items(): # gLogger.error( 'Failed to obtain registered PFN for physical file', '%s %s' % ( pfn, error ) ) # return S_ERROR( 'Failed to obtain registered PFNs from physical file' ) # registeredPFN = res['Value']['Successful'][pfn] # res = returnSingleResult( self.fc.getLFNForPFN( registeredPFN ) ) # if ( not res['OK'] ) and re.search( 'No such file or directory', res['Message'] ): # return S_OK( False ) # return S_OK( res['Value'] ) def __updateReplicaToChecked(self, problematicDict): lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] prognosis = problematicDict['Prognosis'] problematicDict['Status'] = 'Checked' res = returnSingleResult( self.fc.setReplicaStatus({lfn: problematicDict})) if not res['OK']: return self.__returnProblematicError(fileID, res) gLogger.info("%s replica (%d) is updated to Checked status" % (prognosis, fileID)) return self.__updateCompletedFiles(prognosis, fileID) def resolveCatalogPFNSizeMismatch(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolved the CatalogPFNSizeMismatch prognosis """ lfn = problematicDict['LFN'] se = problematicDict['SE'] fileID = problematicDict['FileID'] res = returnSingleResult(self.fc.getFileSize(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) catalogSize = res['Value'] res = returnSingleResult(StorageElement(se).getFileSize(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) storageSize = res['Value'] bkKCatalog = FileCatalog(['BookkeepingDB']) res = returnSingleResult(bkKCatalog.getFileSize(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) bookkeepingSize = res['Value'] if bookkeepingSize == catalogSize == storageSize: gLogger.info( "CatalogPFNSizeMismatch replica (%d) matched all registered sizes." % fileID) return self.__updateReplicaToChecked(problematicDict) if (catalogSize == bookkeepingSize): gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to mismatch the bookkeeping also" % fileID) res = returnSingleResult(self.fc.getReplicas(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if len(res['Value']) <= 1: gLogger.info( "CatalogPFNSizeMismatch replica (%d) has no other replicas." % fileID) return S_ERROR( "Not removing catalog file mismatch since the only replica" ) else: gLogger.info( "CatalogPFNSizeMismatch replica (%d) has other replicas. Removing..." 
% fileID) res = self.dm.removeReplica(se, lfn) if not res['OK']: return self.__returnProblematicError(fileID, res) return self.__updateCompletedFiles('CatalogPFNSizeMismatch', fileID) if (catalogSize != bookkeepingSize) and (bookkeepingSize == storageSize): gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to match the bookkeeping size" % fileID) res = self.__updateReplicaToChecked(problematicDict) if not res['OK']: return self.__returnProblematicError(fileID, res) return self.changeProblematicPrognosis(fileID, 'BKCatalogSizeMismatch') gLogger.info( "CatalogPFNSizeMismatch replica (%d) all sizes found mismatch. Updating retry count" % fileID) return self.incrementProblematicRetry(fileID) def resolvePFNNotRegistered(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNNotRegistered prognosis """ lfn = problematicDict['LFN'] seName = problematicDict['SE'] fileID = problematicDict['FileID'] se = StorageElement(seName) res = returnSingleResult(self.fc.exists(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if not res['Value']: # The file does not exist in the catalog res = returnSingleResult(se.removeFile(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) return self.__updateCompletedFiles('PFNNotRegistered', fileID) res = returnSingleResult(se.getFileMetadata(lfn)) if (not res['OK']) and (re.search('File does not exist', res['Message'])): gLogger.info("PFNNotRegistered replica (%d) found to be missing." % fileID) return self.__updateCompletedFiles('PFNNotRegistered', fileID) elif not res['OK']: return self.__returnProblematicError(fileID, res) storageMetadata = res['Value'] if storageMetadata['Lost']: gLogger.info( "PFNNotRegistered replica (%d) found to be Lost. Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'PFNLost') if storageMetadata['Unavailable']: gLogger.info( "PFNNotRegistered replica (%d) found to be Unavailable. Updating retry count" % fileID) return self.incrementProblematicRetry(fileID) # HACK until we can obtain the space token descriptions through GFAL site = seName.split('_')[0].split('-')[0] if not storageMetadata['Cached']: if lfn.endswith('.raw'): seName = '%s-RAW' % site else: seName = '%s-RDST' % site elif storageMetadata['Migrated']: if lfn.startswith('/lhcb/data'): seName = '%s_M-DST' % site else: seName = '%s_MC_M-DST' % site else: if lfn.startswith('/lhcb/data'): seName = '%s-DST' % site else: seName = '%s_MC-DST' % site problematicDict['SE'] = seName res = returnSingleResult(se.getURL(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) problematicDict['PFN'] = res['Value'] res = returnSingleResult(self.fc.addReplica({lfn: problematicDict})) if not res['OK']: return self.__returnProblematicError(fileID, res) res = returnSingleResult(self.fc.getFileMetadata(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if res['Value']['Size'] != storageMetadata['Size']: gLogger.info( "PFNNotRegistered replica (%d) found with catalog size mismatch. 
Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'CatalogPFNSizeMismatch') return self.__updateCompletedFiles('PFNNotRegistered', fileID) def resolveLFNCatalogMissing(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolves the LFNCatalogMissing prognosis """ lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = returnSingleResult(self.fc.exists(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if res['Value']: return self.__updateCompletedFiles('LFNCatalogMissing', fileID) # Remove the file from all catalogs # RF_NOTE : here I can do it because it's a single file, but otherwise I would need to sort the path res = returnSingleResult(self.fc.removeFile(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) return self.__updateCompletedFiles('LFNCatalogMissing', fileID) def resolvePFNMissing(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolves the PFNMissing prognosis """ se = problematicDict['SE'] lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = returnSingleResult(self.fc.exists(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if not res['Value']: gLogger.info("PFNMissing file (%d) no longer exists in catalog" % fileID) return self.__updateCompletedFiles('PFNMissing', fileID) res = returnSingleResult(StorageElement(se).exists(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if res['Value']: gLogger.info("PFNMissing replica (%d) is no longer missing" % fileID) return self.__updateReplicaToChecked(problematicDict) gLogger.info("PFNMissing replica (%d) does not exist" % fileID) res = returnSingleResult(self.fc.getReplicas(lfn, allStatus=True)) if not res['OK']: return self.__returnProblematicError(fileID, res) replicas = res['Value'] seSite = se.split('_')[0].split('-')[0] found = False for replicaSE in replicas.keys(): if re.search(seSite, replicaSE): found = True problematicDict['SE'] = replicaSE se = replicaSE if not found: gLogger.info( "PFNMissing replica (%d) is no longer registered at SE. Resolved." % fileID) return self.__updateCompletedFiles('PFNMissing', fileID) gLogger.info( "PFNMissing replica (%d) does not exist. Removing from catalog..." % fileID) res = returnSingleResult(self.fc.removeReplica({lfn: problematicDict})) if not res['OK']: return self.__returnProblematicError(fileID, res) if len(replicas) == 1: gLogger.info( "PFNMissing replica (%d) had a single replica. Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'LFNZeroReplicas') res = self.dm.replicateAndRegister(problematicDict['LFN'], se) if not res['OK']: return self.__returnProblematicError(fileID, res) # If we get here the problem is solved so we can update the integrityDB return self.__updateCompletedFiles('PFNMissing', fileID) def resolvePFNUnavailable(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolves the PFNUnavailable prognosis """ lfn = problematicDict['LFN'] se = problematicDict['SE'] fileID = problematicDict['FileID'] res = returnSingleResult(StorageElement(se).getFileMetadata(lfn)) if (not res['OK']) and (re.search('File does not exist', res['Message'])): # The file is no longer Unavailable but has now disappeared completely gLogger.info( "PFNUnavailable replica (%d) found to be missing. 
Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'PFNMissing') if (not res['OK']) or res['Value']['Unavailable']: gLogger.info( "PFNUnavailable replica (%d) found to still be Unavailable" % fileID) return self.incrementProblematicRetry(fileID) if res['Value']['Lost']: gLogger.info( "PFNUnavailable replica (%d) is now found to be Lost. Updating prognosis" % fileID) return self.changeProblematicPrognosis(fileID, 'PFNLost') gLogger.info("PFNUnavailable replica (%d) is no longer Unavailable" % fileID) # Need to make the replica okay in the Catalog return self.__updateReplicaToChecked(problematicDict) def resolvePFNZeroSize(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolves the PFNZeroSize prognosis """ lfn = problematicDict['LFN'] seName = problematicDict['SE'] fileID = problematicDict['FileID'] se = StorageElement(seName) res = returnSingleResult(se.getFileSize(lfn)) if (not res['OK']) and (re.search('File does not exist', res['Message'])): gLogger.info( "PFNZeroSize replica (%d) found to be missing. Updating prognosis" % problematicDict['FileID']) return self.changeProblematicPrognosis(fileID, 'PFNMissing') storageSize = res['Value'] if storageSize == 0: res = returnSingleResult(se.removeFile(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) gLogger.info( "PFNZeroSize replica (%d) removed. Updating prognosis" % problematicDict['FileID']) return self.changeProblematicPrognosis(fileID, 'PFNMissing') res = returnSingleResult(self.fc.getReplicas(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) if seName not in res['Value']: gLogger.info( "PFNZeroSize replica (%d) not registered in catalog. Updating prognosis" % problematicDict['FileID']) return self.changeProblematicPrognosis(fileID, 'PFNNotRegistered') res = returnSingleResult(self.fc.getFileMetadata(lfn)) if not res['OK']: return self.__returnProblematicError(fileID, res) catalogSize = res['Value']['Size'] if catalogSize != storageSize: gLogger.info( "PFNZeroSize replica (%d) size found to differ from registered metadata. Updating prognosis" % problematicDict['FileID']) return self.changeProblematicPrognosis(fileID, 'CatalogPFNSizeMismatch') return self.__updateCompletedFiles('PFNZeroSize', fileID) ############################################################################################ def resolveLFNZeroReplicas(self, problematicDict): """ This takes the problematic dictionary returned by the integrity DB and resolves the LFNZeroReplicas prognosis """ lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = returnSingleResult(self.fc.getReplicas(lfn, allStatus=True)) if res['OK'] and res['Value']: gLogger.info("LFNZeroReplicas file (%d) found to have replicas" % fileID) else: gLogger.info( "LFNZeroReplicas file (%d) does not have replicas. Checking storage..." % fileID) pfnsFound = False for storageElementName in sorted( gConfig.getValue( 'Resources/StorageElementGroups/Tier1_MC_M-DST', [])): res = self.__getStoragePathExists([lfn], storageElementName) if lfn in res['Value']: gLogger.info( "LFNZeroReplicas file (%d) found storage file at %s" % (fileID, storageElementName)) self.__reportProblematicReplicas( [(lfn, 'deprecatedUrl', storageElementName, 'PFNNotRegistered')], storageElementName, 'PFNNotRegistered') pfnsFound = True if not pfnsFound: gLogger.info( "LFNZeroReplicas file (%d) did not have storage files. Removing..." 
% fileID) res = returnSingleResult(self.fc.removeFile(lfn)) if not res['OK']: gLogger.error('DataIntegrityClient: failed to remove file', res['Message']) # Increment the number of retries for this file self.incrementProblematicRetry(fileID) return res gLogger.info("LFNZeroReplicas file (%d) removed from catalog" % fileID) # If we get here the problem is solved so we can update the integrityDB return self.__updateCompletedFiles('LFNZeroReplicas', fileID)
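# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the production code): every resolve* method
# above consumes the "problematic dictionary" handed back by the integrity DB,
# i.e. a dict carrying at least 'FileID', 'LFN', 'PFN', 'SE' and 'Prognosis'.
# A hypothetical driver could dispatch such a record to the matching resolver
# by prognosis name; `client` is assumed to be an object exposing the resolve*
# methods, and the LFN/SE values below are invented examples.
#
#   problematic = {'FileID': 1234,
#                  'LFN': '/lhcb/data/example.raw',
#                  'PFN': '',
#                  'SE': 'CERN-RAW',
#                  'Prognosis': 'CatalogPFNSizeMismatch'}
#   resolver = getattr(client, 'resolve%s' % problematic['Prognosis'], None)
#   if resolver is None:
#       # No dedicated resolver: just bump the retry count for this file
#       result = client.incrementProblematicRetry(problematic['FileID'])
#   else:
#       result = resolver(problematic)
#   if not result['OK']:
#       gLogger.error('Prognosis resolution failed', result['Message'])
# ----------------------------------------------------------------------------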
class DataIntegrityClient( Client ): """ The following methods are supported in the service but are not mentioned explicitly here: getProblematic() Obtains a problematic file from the IntegrityDB based on the LastUpdate time getPrognosisProblematics(prognosis) Obtains all the problematics of a particular prognosis from the integrityDB getProblematicsSummary() Obtains a count of the number of problematics for each prognosis found getDistinctPrognosis() Obtains the distinct prognosis found in the integrityDB getTransformationProblematics(prodID) Obtains the problematics for a given production incrementProblematicRetry(fileID) Increments the retry count for the supplied file ID changeProblematicPrognosis(fileID,newPrognosis) Changes the prognosis of the supplied file to the new prognosis setProblematicStatus(fileID,status) Updates the status of a problematic in the integrityDB removeProblematic(self,fileID) This removes the specified file ID from the integrity DB insertProblematic(sourceComponent,fileMetadata) Inserts file with supplied metadata into the integrity DB """ def __init__( self, **kwargs ): super(DataIntegrityClient, self).__init__( **kwargs ) self.setServer( 'DataManagement/DataIntegrity' ) self.dm = DataManager() self.fc = FileCatalog() def setFileProblematic( self, lfn, reason, sourceComponent = '' ): """ This method updates the status of the file in the FileCatalog and the IntegrityDB lfn - the lfn of the file reason - this is given to the integrity DB and should reflect the problem observed with the file sourceComponent is the component issuing the request. """ if isinstance( lfn, list ): lfns = lfn elif isinstance( lfn, basestring ): lfns = [lfn] else: errStr = "DataIntegrityClient.setFileProblematic: Supplied file info must be list or a single LFN." gLogger.error( errStr ) return S_ERROR( errStr ) gLogger.info( "DataIntegrityClient.setFileProblematic: Attempting to update %s files." % len( lfns ) ) fileMetadata = {} for lfn in lfns: fileMetadata[lfn] = {'Prognosis':reason, 'LFN':lfn, 'PFN':'', 'SE':''} res = self.insertProblematic( sourceComponent, fileMetadata ) if not res['OK']: gLogger.error( "DataIntegrityClient.setReplicaProblematic: Failed to insert problematics to integrity DB" ) return res def reportProblematicReplicas( self, replicaTuple, se, reason ): """ Simple wrapper function around setReplicaProblematic """ gLogger.info( 'The following %s files had %s at %s' % ( len( replicaTuple ), reason, se ) ) for lfn, _pfn, se, reason in sorted( replicaTuple ): if lfn: gLogger.info( lfn ) res = self.setReplicaProblematic( replicaTuple, sourceComponent = 'DataIntegrityClient' ) if not res['OK']: gLogger.info( 'Failed to update integrity DB with replicas', res['Message'] ) else: gLogger.info( 'Successfully updated integrity DB with replicas' ) def setReplicaProblematic( self, replicaTuple, sourceComponent = '' ): """ This method updates the status of the replica in the FileCatalog and the IntegrityDB The supplied replicaDict should be of the form {lfn :{'PFN':pfn,'SE':se,'Prognosis':prognosis} lfn - the lfn of the file pfn - the pfn if available (otherwise '') se - the storage element of the problematic replica (otherwise '') prognosis - this is given to the integrity DB and should reflect the problem observed with the file sourceComponent is the component issuing the request. 
""" if isinstance( replicaTuple, tuple ): replicaTuple = [replicaTuple] elif isinstance( replicaTuple, list ): pass else: errStr = "DataIntegrityClient.setReplicaProblematic: Supplied replica info must be a tuple or list of tuples." gLogger.error( errStr ) return S_ERROR( errStr ) gLogger.info( "DataIntegrityClient.setReplicaProblematic: Attempting to update %s replicas." % len( replicaTuple ) ) replicaDict = {} for lfn, pfn, se, reason in replicaTuple: replicaDict[lfn] = {'Prognosis':reason, 'LFN':lfn, 'PFN':pfn, 'SE':se} res = self.insertProblematic( sourceComponent, replicaDict ) if not res['OK']: gLogger.error( "DataIntegrityClient.setReplicaProblematic: Failed to insert problematic to integrity DB" ) return res for lfn in replicaDict.keys(): replicaDict[lfn]['Status'] = 'Problematic' res = self.fc.setReplicaStatus( replicaDict ) if not res['OK']: errStr = "DataIntegrityClient.setReplicaProblematic: Completely failed to update replicas." gLogger.error( errStr, res['Message'] ) return res failed = res['Value']['Failed'] successful = res['Value']['Successful'] resDict = {'Successful':successful, 'Failed':failed} return S_OK( resDict ) ########################################################################## # # This section contains the resolution methods for various prognoses # def __updateCompletedFiles( self, prognosis, fileID ): gLogger.info( "%s file (%d) is resolved" % ( prognosis, fileID ) ) return self.setProblematicStatus( fileID, 'Resolved' ) def __returnProblematicError( self, fileID, res ): self.incrementProblematicRetry( fileID ) gLogger.error( 'DataIntegrityClient failure', res['Message'] ) return res def __updateReplicaToChecked( self, problematicDict ): lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] prognosis = problematicDict['Prognosis'] problematicDict['Status'] = 'Checked' res = returnSingleResult( self.fc.setReplicaStatus( {lfn:problematicDict} ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) gLogger.info( "%s replica (%d) is updated to Checked status" % ( prognosis, fileID ) ) return self.__updateCompletedFiles( prognosis, fileID ) def resolveCatalogPFNSizeMismatch( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolved the CatalogPFNSizeMismatch prognosis """ lfn = problematicDict['LFN'] se = problematicDict['SE'] fileID = problematicDict['FileID'] res = returnSingleResult( self.fc.getFileSize( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) catalogSize = res['Value'] res = returnSingleResult( StorageElement( se ).getFileSize( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) storageSize = res['Value'] bkKCatalog = FileCatalog( ['BookkeepingDB'] ) res = returnSingleResult( bkKCatalog.getFileSize( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) bookkeepingSize = res['Value'] if bookkeepingSize == catalogSize == storageSize: gLogger.info( "CatalogPFNSizeMismatch replica (%d) matched all registered sizes." % fileID ) return self.__updateReplicaToChecked( problematicDict ) if catalogSize == bookkeepingSize: gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to mismatch the bookkeeping also" % fileID ) res = returnSingleResult( self.fc.getReplicas( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if len( res['Value'] ) <= 1: gLogger.info( "CatalogPFNSizeMismatch replica (%d) has no other replicas." 
% fileID ) return S_ERROR( "Not removing catalog file mismatch since the only replica" ) else: gLogger.info( "CatalogPFNSizeMismatch replica (%d) has other replicas. Removing..." % fileID ) res = self.dm.removeReplica( se, lfn ) if not res['OK']: return self.__returnProblematicError( fileID, res ) return self.__updateCompletedFiles( 'CatalogPFNSizeMismatch', fileID ) if ( catalogSize != bookkeepingSize ) and ( bookkeepingSize == storageSize ): gLogger.info( "CatalogPFNSizeMismatch replica (%d) found to match the bookkeeping size" % fileID ) res = self.__updateReplicaToChecked( problematicDict ) if not res['OK']: return self.__returnProblematicError( fileID, res ) return self.changeProblematicPrognosis( fileID, 'BKCatalogSizeMismatch' ) gLogger.info( "CatalogPFNSizeMismatch replica (%d) all sizes found mismatch. Updating retry count" % fileID ) return self.incrementProblematicRetry( fileID ) #FIXME: Unused? def resolvePFNNotRegistered( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNNotRegistered prognosis """ lfn = problematicDict['LFN'] seName = problematicDict['SE'] fileID = problematicDict['FileID'] se = StorageElement( seName ) res = returnSingleResult( self.fc.exists( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if not res['Value']: # The file does not exist in the catalog res = returnSingleResult( se.removeFile( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) return self.__updateCompletedFiles( 'PFNNotRegistered', fileID ) res = returnSingleResult( se.getFileMetadata( lfn ) ) if ( not res['OK'] ) and ( re.search( 'File does not exist', res['Message'] ) ): gLogger.info( "PFNNotRegistered replica (%d) found to be missing." % fileID ) return self.__updateCompletedFiles( 'PFNNotRegistered', fileID ) elif not res['OK']: return self.__returnProblematicError( fileID, res ) storageMetadata = res['Value'] if storageMetadata['Lost']: gLogger.info( "PFNNotRegistered replica (%d) found to be Lost. Updating prognosis" % fileID ) return self.changeProblematicPrognosis( fileID, 'PFNLost' ) if storageMetadata['Unavailable']: gLogger.info( "PFNNotRegistered replica (%d) found to be Unavailable. Updating retry count" % fileID ) return self.incrementProblematicRetry( fileID ) # HACK until we can obtain the space token descriptions through GFAL site = seName.split( '_' )[0].split( '-' )[0] if not storageMetadata['Cached']: if lfn.endswith( '.raw' ): seName = '%s-RAW' % site else: seName = '%s-RDST' % site elif storageMetadata['Migrated']: if lfn.startswith( '/lhcb/data' ): seName = '%s_M-DST' % site else: seName = '%s_MC_M-DST' % site else: if lfn.startswith( '/lhcb/data' ): seName = '%s-DST' % site else: seName = '%s_MC-DST' % site problematicDict['SE'] = seName res = returnSingleResult( se.getURL( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) problematicDict['PFN'] = res['Value'] res = returnSingleResult( self.fc.addReplica( {lfn:problematicDict} ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) res = returnSingleResult( self.fc.getFileMetadata( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if res['Value']['Size'] != storageMetadata['Size']: gLogger.info( "PFNNotRegistered replica (%d) found with catalog size mismatch. 
Updating prognosis" % fileID ) return self.changeProblematicPrognosis( fileID, 'CatalogPFNSizeMismatch' ) return self.__updateCompletedFiles( 'PFNNotRegistered', fileID ) #FIXME: Unused? def resolveLFNCatalogMissing( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolves the LFNCatalogMissing prognosis """ lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = returnSingleResult( self.fc.exists( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if res['Value']: return self.__updateCompletedFiles( 'LFNCatalogMissing', fileID ) # Remove the file from all catalogs # RF_NOTE : here I can do it because it's a single file, but otherwise I would need to sort the path res = returnSingleResult( self.fc.removeFile( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) return self.__updateCompletedFiles( 'LFNCatalogMissing', fileID ) #FIXME: Unused? def resolvePFNMissing( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolves the PFNMissing prognosis """ se = problematicDict['SE'] lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = returnSingleResult( self.fc.exists( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if not res['Value']: gLogger.info( "PFNMissing file (%d) no longer exists in catalog" % fileID ) return self.__updateCompletedFiles( 'PFNMissing', fileID ) res = returnSingleResult( StorageElement( se ).exists( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if res['Value']: gLogger.info( "PFNMissing replica (%d) is no longer missing" % fileID ) return self.__updateReplicaToChecked( problematicDict ) gLogger.info( "PFNMissing replica (%d) does not exist" % fileID ) res = returnSingleResult( self.fc.getReplicas( lfn, allStatus = True ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) replicas = res['Value'] seSite = se.split( '_' )[0].split( '-' )[0] found = False for replicaSE in replicas.keys(): if re.search( seSite, replicaSE ): found = True problematicDict['SE'] = replicaSE se = replicaSE if not found: gLogger.info( "PFNMissing replica (%d) is no longer registered at SE. Resolved." % fileID ) return self.__updateCompletedFiles( 'PFNMissing', fileID ) gLogger.info( "PFNMissing replica (%d) does not exist. Removing from catalog..." % fileID ) res = returnSingleResult( self.fc.removeReplica( {lfn:problematicDict} ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if len( replicas ) == 1: gLogger.info( "PFNMissing replica (%d) had a single replica. Updating prognosis" % fileID ) return self.changeProblematicPrognosis( fileID, 'LFNZeroReplicas' ) res = self.dm.replicateAndRegister( problematicDict['LFN'], se ) if not res['OK']: return self.__returnProblematicError( fileID, res ) # If we get here the problem is solved so we can update the integrityDB return self.__updateCompletedFiles( 'PFNMissing', fileID ) #FIXME: Unused? 
def resolvePFNUnavailable( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolved the PFNUnavailable prognosis """ lfn = problematicDict['LFN'] se = problematicDict['SE'] fileID = problematicDict['FileID'] res = returnSingleResult( StorageElement( se ).getFileMetadata( lfn ) ) if ( not res['OK'] ) and ( re.search( 'File does not exist', res['Message'] ) ): # The file is no longer Unavailable but has now dissapeared completely gLogger.info( "PFNUnavailable replica (%d) found to be missing. Updating prognosis" % fileID ) return self.changeProblematicPrognosis( fileID, 'PFNMissing' ) if ( not res['OK'] ) or res['Value']['Unavailable']: gLogger.info( "PFNUnavailable replica (%d) found to still be Unavailable" % fileID ) return self.incrementProblematicRetry( fileID ) if res['Value']['Lost']: gLogger.info( "PFNUnavailable replica (%d) is now found to be Lost. Updating prognosis" % fileID ) return self.changeProblematicPrognosis( fileID, 'PFNLost' ) gLogger.info( "PFNUnavailable replica (%d) is no longer Unavailable" % fileID ) # Need to make the replica okay in the Catalog return self.__updateReplicaToChecked( problematicDict ) #FIXME: Unused? def resolvePFNZeroSize( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolves the PFNZeroSize prognosis """ lfn = problematicDict['LFN'] seName = problematicDict['SE'] fileID = problematicDict['FileID'] se = StorageElement( seName ) res = returnSingleResult( se.getFileSize( lfn ) ) if ( not res['OK'] ) and ( re.search( 'File does not exist', res['Message'] ) ): gLogger.info( "PFNZeroSize replica (%d) found to be missing. Updating prognosis" % problematicDict['FileID'] ) return self.changeProblematicPrognosis( fileID, 'PFNMissing' ) storageSize = res['Value'] if storageSize == 0: res = returnSingleResult( se.removeFile( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) gLogger.info( "PFNZeroSize replica (%d) removed. Updating prognosis" % problematicDict['FileID'] ) return self.changeProblematicPrognosis( fileID, 'PFNMissing' ) res = returnSingleResult( self.fc.getReplicas( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) if seName not in res['Value']: gLogger.info( "PFNZeroSize replica (%d) not registered in catalog. Updating prognosis" % problematicDict['FileID'] ) return self.changeProblematicPrognosis( fileID, 'PFNNotRegistered' ) res = returnSingleResult( self.fc.getFileMetadata( lfn ) ) if not res['OK']: return self.__returnProblematicError( fileID, res ) catalogSize = res['Value']['Size'] if catalogSize != storageSize: gLogger.info( "PFNZeroSize replica (%d) size found to differ from registered metadata. Updating prognosis" % problematicDict['FileID'] ) return self.changeProblematicPrognosis( fileID, 'CatalogPFNSizeMismatch' ) return self.__updateCompletedFiles( 'PFNZeroSize', fileID ) ############################################################################################ #FIXME: Unused? def resolveLFNZeroReplicas( self, problematicDict ): """ This takes the problematic dictionary returned by the integrity DB and resolves the LFNZeroReplicas prognosis """ lfn = problematicDict['LFN'] fileID = problematicDict['FileID'] res = returnSingleResult( self.fc.getReplicas( lfn, allStatus = True ) ) if res['OK'] and res['Value']: gLogger.info( "LFNZeroReplicas file (%d) found to have replicas" % fileID ) else: gLogger.info( "LFNZeroReplicas file (%d) does not have replicas. 
Checking storage..." % fileID ) pfnsFound = False for storageElementName in sorted( gConfig.getValue( 'Resources/StorageElementGroups/Tier1_MC_M-DST', [] ) ): res = self.__getStoragePathExists( [lfn], storageElementName ) if lfn in res['Value']: gLogger.info( "LFNZeroReplicas file (%d) found storage file at %s" % ( fileID, storageElementName ) ) self.reportProblematicReplicas( [( lfn, 'deprecatedUrl', storageElementName, 'PFNNotRegistered' )], storageElementName, 'PFNNotRegistered' ) pfnsFound = True if not pfnsFound: gLogger.info( "LFNZeroReplicas file (%d) did not have storage files. Removing..." % fileID ) res = returnSingleResult( self.fc.removeFile( lfn ) ) if not res['OK']: gLogger.error( 'DataIntegrityClient: failed to remove file', res['Message'] ) # Increment the number of retries for this file self.incrementProblematicRetry( fileID ) return res gLogger.info( "LFNZeroReplicas file (%d) removed from catalog" % fileID ) # If we get here the problem is solved so we can update the integrityDB return self.__updateCompletedFiles( 'LFNZeroReplicas', fileID ) def _reportProblematicFiles( self, lfns, reason ): """ Simple wrapper function around setFileProblematic """ gLogger.info( 'The following %s files were found with %s' % ( len( lfns ), reason ) ) for lfn in sorted( lfns ): gLogger.info( lfn ) res = self.setFileProblematic( lfns, reason, sourceComponent = 'DataIntegrityClient' ) if not res['OK']: gLogger.info( 'Failed to update integrity DB with files', res['Message'] ) else: gLogger.info( 'Successfully updated integrity DB with files' )
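# ----------------------------------------------------------------------------
# Minimal usage sketch (illustrative only; the LFN and SE name are invented,
# and the import path assumes the usual DIRAC module layout): reporting a
# problematic file and one of its replicas through the client so that they end
# up in the integrity DB with the chosen prognosis.
#
#   from DIRAC.DataManagementSystem.Client.DataIntegrityClient import DataIntegrityClient
#
#   integrityClient = DataIntegrityClient()
#   # Register an LFN-level problem
#   res = integrityClient.setFileProblematic( '/lhcb/data/example.dst', 'LFNZeroReplicas' )
#   # Register a replica-level problem as an ( lfn, pfn, se, prognosis ) tuple
#   res = integrityClient.setReplicaProblematic( ( '/lhcb/data/example.dst', '', 'CERN-DST', 'PFNMissing' ) )
#   if not res['OK']:
#       gLogger.error( 'Failed to report problematic data', res['Message'] )
# ----------------------------------------------------------------------------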