# Shared imports for the snippets below. The module paths follow the DIRAC v6
# layout; the THROTTLING_* constants are referenced but not defined in the
# original snippets, so the values here are assumptions flagged inline.
import re

from DIRAC import gLogger, gConfig, S_OK, S_ERROR
from DIRAC.Core.Base.AgentModule import AgentModule
from DIRAC.Core.DISET.RPCClient import RPCClient
from DIRAC.Core.Utilities.List import sortList
from DIRAC.DataManagementSystem.Client.ReplicaManager import ReplicaManager
from DIRAC.DataManagementSystem.Client.DataIntegrityClient import DataIntegrityClient
from DIRAC.Resources.Catalog.FileCatalog import FileCatalog
from DIRAC.StorageManagementSystem.DB.StorageManagementDB import StorageManagementDB

THROTTLING_TIME = 60 * 60 * 24  # pin lifetime default: 1 day
THROTTLING_STEPS = 12           # fraction of the disk cache considered per cycle (assumed default)


class StageRequestAgent(AgentModule):

    def initialize(self):
        self.replicaManager = ReplicaManager()
        #self.stagerClient = StorageManagerClient()
        self.dataIntegrityClient = DataIntegrityClient()
        self.storageDB = StorageManagementDB()
        # pin lifetime = 1 day
        self.pinLifetime = self.am_getOption('PinLifetime', THROTTLING_TIME)
        # This sets the Default Proxy to be used as that defined under
        # /Operations/Shifter/DataManager.
        # The shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')
        return S_OK()

    def execute(self):
        # Get the currently submitted stage space and the amount of pinned space
        # for each storage element
        res = self.getStorageUsage()
        if not res['OK']:
            return res
        return self.submitStageRequests()

    def getStorageUsage(self):
        """ Fill the current Status of the SE Caches from the DB """
        self.storageElementCache = {}
        res = self.storageDB.getSubmittedStagePins()
        if not res['OK']:
            gLogger.fatal("StageRequest.getStorageUsage: Failed to obtain submitted requests from StorageManagementDB.",
                          res['Message'])
            return res
        self.storageElementUsage = res['Value']
        if self.storageElementUsage:
            gLogger.info("StageRequest.getStorageUsage: Active stage/pin requests found at the following sites:")
            for storageElement in sortList(self.storageElementUsage.keys()):
                seDict = self.storageElementUsage[storageElement]
                # Convert to GB for printout
                seDict['TotalSize'] = seDict['TotalSize'] / (1000 * 1000 * 1000.0)
                gLogger.info("StageRequest.getStorageUsage: %s: %s replicas with a size of %.3f GB." %
                             (storageElement.ljust(15), str(seDict['Replicas']).rjust(6), seDict['TotalSize']))
        if not self.storageElementUsage:
            gLogger.info("StageRequest.getStorageUsage: No active stage/pin requests found.")
        return S_OK()

    def submitStageRequests(self):
        """ This manages the following transitions of the Replicas:
            * Waiting -> Offline (if the file is not found Cached)
            * Waiting -> StageSubmitted (if the file is found Cached)
            * Offline -> StageSubmitted (if there are no more Waiting replicas)
        """
        # Retry Replicas that have not been Staged in a previous attempt
        res = self._getMissingReplicas()
        if not res['OK']:
            gLogger.fatal("StageRequest.submitStageRequests: Failed to get replicas from StorageManagementDB.",
                          res['Message'])
            return res
        seReplicas = res['Value']['SEReplicas']
        allReplicaInfo = res['Value']['AllReplicaInfo']
        if seReplicas:
            gLogger.info("StageRequest.submitStageRequests: Completing partially Staged Tasks")
        for storageElement, seReplicaIDs in seReplicas.items():
            gLogger.debug('Staging at %s:' % storageElement, seReplicaIDs)
            self._issuePrestageRequests(storageElement, seReplicaIDs, allReplicaInfo)
        # Check Waiting Replicas and select those found Online and all other Replicas from the same Tasks
        res = self._getOnlineReplicas()
        if not res['OK']:
            gLogger.fatal("StageRequest.submitStageRequests: Failed to get replicas from StorageManagementDB.",
                          res['Message'])
            return res
        seReplicas = res['Value']['SEReplicas']
        allReplicaInfo = res['Value']['AllReplicaInfo']
        # Check Offline Replicas that fit in the Cache and all other Replicas from the same Tasks
        res = self._getOfflineReplicas()
        if not res['OK']:
            gLogger.fatal("StageRequest.submitStageRequests: Failed to get replicas from StorageManagementDB.",
                          res['Message'])
            return res
        # Merge the info from both results
        for storageElement, seReplicaIDs in res['Value']['SEReplicas'].items():
            if storageElement not in seReplicas:
                seReplicas[storageElement] = seReplicaIDs
            else:
                for replicaID in seReplicaIDs:
                    if replicaID not in seReplicas[storageElement]:
                        seReplicas[storageElement].append(replicaID)
        allReplicaInfo.update(res['Value']['AllReplicaInfo'])
        gLogger.info("StageRequest.submitStageRequests: Obtained %s replicas for staging." % len(allReplicaInfo))
        for storageElement, seReplicaIDs in seReplicas.items():
            gLogger.debug('Staging at %s:' % storageElement, seReplicaIDs)
            self._issuePrestageRequests(storageElement, seReplicaIDs, allReplicaInfo)
        return S_OK()

    def _getMissingReplicas(self):
        """ This recovers Replicas that were not Staged on a previous attempt (the
            stage request failed or timed out), while other Replicas of the same
            task are already Staged. If left behind they can produce a deadlock.
            All SEs are considered, even if their Cache is full.
        """
        # Get Replicas that are in Staged/StageSubmitted
        gLogger.info('StageRequest._getMissingReplicas: Checking Staged Replicas')
        res = self.__getStagedReplicas()
        if not res['OK']:
            gLogger.fatal("StageRequest._getMissingReplicas: Failed to get replicas from StorageManagementDB.",
                          res['Message'])
            return res
        seReplicas = {}
        allReplicaInfo = res['Value']['AllReplicaInfo']
        replicasToStage = []
        for storageElement, seReplicaIDs in res['Value']['SEReplicas'].items():
            # Consider all SEs
            replicasToStage.extend(seReplicaIDs)
        # Get Replicas from the same Tasks as those selected
        res = self.__addAssociatedReplicas(replicasToStage, seReplicas, allReplicaInfo)
        if not res['OK']:
            gLogger.fatal("StageRequest._getMissingReplicas: Failed to get associated Replicas.",
                          res['Message'])
        return res

    def _getOnlineReplicas(self):
        """ This manages the transition
            * Waiting -> Offline (if the file is not found Cached)
            and returns the list of Cached Replicas for which the pin time has to
            be extended. SEs for which the cache is currently full are not considered.
        """
        # Get all Replicas in Waiting Status associated to Staging Tasks
        gLogger.verbose('StageRequest._getOnlineReplicas: Checking Online Replicas to be handled')
        res = self.__getWaitingReplicas()
        if not res['OK']:
            gLogger.fatal("StageRequest._getOnlineReplicas: Failed to get replicas from StorageManagementDB.",
                          res['Message'])
            return res
        seReplicas = {}
        allReplicaInfo = res['Value']['AllReplicaInfo']
        if not len(allReplicaInfo):
            gLogger.info("StageRequest._getOnlineReplicas: There were no Waiting replicas found")
            return res
        gLogger.info("StageRequest._getOnlineReplicas: Obtained %s replicas Waiting for staging." % len(allReplicaInfo))
        replicasToStage = []
        for storageElement, seReplicaIDs in res['Value']['SEReplicas'].items():
            if not self.__usage(storageElement) < self.__cache(storageElement):
                gLogger.info('StageRequest._getOnlineReplicas: Skipping %s, current usage above limit ( %s GB )' %
                             (storageElement, self.__cache(storageElement)))
                # Do not consider those SEs that have the Cache full
                continue
            # Check if the Replica Metadata is OK and find out if they are Online or Offline
            res = self.__checkIntegrity(storageElement, seReplicaIDs, allReplicaInfo)
            if not res['OK']:
                gLogger.error('StageRequest._getOnlineReplicas: Failed to check Replica Metadata',
                              '(%s): %s' % (storageElement, res['Message']))
            else:
                # Keep only Online Replicas
                seReplicas[storageElement] = res['Value']['Online']
                replicasToStage.extend(res['Value']['Online'])
        # Get Replicas from the same Tasks as those selected
        res = self.__addAssociatedReplicas(replicasToStage, seReplicas, allReplicaInfo)
        if not res['OK']:
            gLogger.fatal("StageRequest._getOnlineReplicas: Failed to get associated Replicas.",
                          res['Message'])
        return res

    def _getOfflineReplicas(self):
        """ This checks Replicas in Offline status and returns the list of Replicas
            to be Staged. SEs for which the cache is currently full are not considered.
        """
        # Get all Replicas in Offline Status associated to Staging Tasks
        gLogger.verbose('StageRequest._getOfflineReplicas: Checking Offline Replicas to be handled')
        res = self.__getOfflineReplicas()
        if not res['OK']:
            gLogger.fatal("StageRequest._getOfflineReplicas: Failed to get replicas from StorageManagementDB.",
                          res['Message'])
            return res
        seReplicas = {}
        allReplicaInfo = res['Value']['AllReplicaInfo']
        if not len(allReplicaInfo):
            gLogger.info("StageRequest._getOfflineReplicas: There were no Offline replicas found")
            return res
        gLogger.info("StageRequest._getOfflineReplicas: Obtained %s replicas Offline for staging." % len(allReplicaInfo))
        replicasToStage = []
        for storageElement, seReplicaIDs in res['Value']['SEReplicas'].items():
            if not self.__usage(storageElement) < self.__cache(storageElement):
                gLogger.info('StageRequest._getOfflineReplicas: Skipping %s, current usage above limit ( %s GB )' %
                             (storageElement, self.__cache(storageElement)))
                # Do not consider those SEs that have the Cache full
                continue
            seReplicas[storageElement] = []
            for replicaID in sorted(seReplicaIDs):
                seReplicas[storageElement].append(replicaID)
                replicasToStage.append(replicaID)
                self.__add(storageElement, allReplicaInfo[replicaID]['Size'])
                if not self.__usage(storageElement) < self.__cache(storageElement):
                    # Stop adding Replicas when the cache is full
                    break
        # Get Replicas from the same Tasks as those selected
        res = self.__addAssociatedReplicas(replicasToStage, seReplicas, allReplicaInfo)
        if not res['OK']:
            gLogger.fatal("StageRequest._getOfflineReplicas: Failed to get associated Replicas.",
                          res['Message'])
        return res

    def __usage(self, storageElement):
        """ Retrieve the current usage of the SE (in GB) """
        if storageElement not in self.storageElementUsage:
            self.storageElementUsage[storageElement] = {'TotalSize': 0.}
        return self.storageElementUsage[storageElement]['TotalSize']

    def __cache(self, storageElement):
        """ Retrieve the cache size for the SE (in GB) """
        if storageElement not in self.storageElementCache:
            diskCacheTB = gConfig.getValue("/Resources/StorageElements/%s/DiskCacheTB" % storageElement, 1.)
            self.storageElementCache[storageElement] = diskCacheTB * 1000. / THROTTLING_STEPS
        return self.storageElementCache[storageElement]

    def __add(self, storageElement, size):
        """ Add size (in bytes) to the current usage of storageElement (kept in GB) """
        if storageElement not in self.storageElementUsage:
            self.storageElementUsage[storageElement] = {'TotalSize': 0.}
        size = size / (1000 * 1000 * 1000.0)
        self.storageElementUsage[storageElement]['TotalSize'] += size
        return size

    def _issuePrestageRequests(self, storageElement, seReplicaIDs, allReplicaInfo):
        """ Make the request to the SE and update the DB """
        pfnRepIDs = {}
        for replicaID in seReplicaIDs:
            pfn = allReplicaInfo[replicaID]['PFN']
            pfnRepIDs[pfn] = replicaID
        # Now issue the prestage requests for the remaining replicas
        stageRequestMetadata = {}
        updatedPfnIDs = []
        if pfnRepIDs:
            gLogger.info("StageRequest._issuePrestageRequests: Submitting %s stage requests for %s." %
                         (len(pfnRepIDs), storageElement))
            res = self.replicaManager.prestageStorageFile(pfnRepIDs.keys(), storageElement, lifetime=self.pinLifetime)
            gLogger.debug("StageRequest._issuePrestageRequests: replicaManager.prestageStorageFile: res=", res)
            # Daniela: fishy result from ReplicaManager!!! Should NOT return OK
            #res= {'OK': True, 'Value': {'Successful': {}, 'Failed': {'srm://srm-lhcb.cern.ch/castor/cern.ch/grid/lhcb/data/2010/RAW/EXPRESS/LHCb/COLLISION10/71476/071476_0000000241.raw': ' SRM2Storage.__gfal_exec: Failed to perform gfal_prestage.[SE][BringOnline][SRM_INVALID_REQUEST] httpg://srm-lhcb.cern.ch:8443/srm/managerv2: User not able to access specified space token\n'}}}
            #res= {'OK': True, 'Value': {'Successful': {'srm://gridka-dCache.fzk.de/pnfs/gridka.de/lhcb/data/2009/RAW/FULL/LHCb/COLLISION09/63495/063495_0000000001.raw': '-2083846379'}, 'Failed': {}}}
            if not res['OK']:
                gLogger.error("StageRequest._issuePrestageRequests: Completely failed to submit stage requests for replicas.",
                              res['Message'])
            else:
                for pfn, requestID in res['Value']['Successful'].items():
                    if requestID not in stageRequestMetadata:
                        stageRequestMetadata[requestID] = []
                    stageRequestMetadata[requestID].append(pfnRepIDs[pfn])
                    updatedPfnIDs.append(pfnRepIDs[pfn])
        if stageRequestMetadata:
            gLogger.info("StageRequest._issuePrestageRequests: %s stage request metadata to be updated." %
                         len(stageRequestMetadata))
            res = self.storageDB.insertStageRequest(stageRequestMetadata, self.pinLifetime)
            if not res['OK']:
                gLogger.error("StageRequest._issuePrestageRequests: Failed to insert stage request metadata.",
                              res['Message'])
                return res
            res = self.storageDB.updateReplicaStatus(updatedPfnIDs, 'StageSubmitted')
            if not res['OK']:
                gLogger.error("StageRequest._issuePrestageRequests: Failed to update replica status.",
                              res['Message'])
        return

    def __sortBySE(self, replicaDict):
        seReplicas = {}
        replicaIDs = {}
        for replicaID, info in replicaDict.items():
            lfn = info['LFN']
            storageElement = info['SE']
            size = info['Size']
            pfn = info['PFN']
            replicaIDs[replicaID] = {'LFN': lfn, 'PFN': pfn, 'Size': size, 'StorageElement': storageElement}
            if storageElement not in seReplicas:
                seReplicas[storageElement] = []
            seReplicas[storageElement].append(replicaID)
        return S_OK({'SEReplicas': seReplicas, 'AllReplicaInfo': replicaIDs})

    def __getStagedReplicas(self):
        """ This obtains the Staged replicas from the Replicas table and for each LFN the requested storage element """
        # First obtain the Staged replicas from the Replicas table
        res = self.storageDB.getStagedReplicas()
        if not res['OK']:
            gLogger.error("StageRequest.__getStagedReplicas: Failed to get replicas with Staged status.",
                          res['Message'])
            return res
        if not res['Value']:
            gLogger.debug("StageRequest.__getStagedReplicas: No Staged replicas found to process.")
        else:
            gLogger.debug("StageRequest.__getStagedReplicas: Obtained %s Staged replica(s) to process." % len(res['Value']))
        return self.__sortBySE(res['Value'])

    def __getWaitingReplicas(self):
        """ This obtains the Waiting replicas from the Replicas table and for each LFN the requested storage element """
        # First obtain the Waiting replicas from the Replicas table
        res = self.storageDB.getWaitingReplicas()
        if not res['OK']:
            gLogger.error("StageRequest.__getWaitingReplicas: Failed to get replicas with Waiting status.",
                          res['Message'])
            return res
        if not res['Value']:
            gLogger.debug("StageRequest.__getWaitingReplicas: No Waiting replicas found to process.")
        else:
            gLogger.debug("StageRequest.__getWaitingReplicas: Obtained %s Waiting replica(s) to process." % len(res['Value']))
        return self.__sortBySE(res['Value'])

    def __getOfflineReplicas(self):
        """ This obtains the Offline replicas from the Replicas table and for each LFN the requested storage element """
        # First obtain the Offline replicas from the Replicas table
        res = self.storageDB.getOfflineReplicas()
        if not res['OK']:
            gLogger.error("StageRequest.__getOfflineReplicas: Failed to get replicas with Offline status.",
                          res['Message'])
            return res
        if not res['Value']:
            gLogger.debug("StageRequest.__getOfflineReplicas: No Offline replicas found to process.")
        else:
            gLogger.debug("StageRequest.__getOfflineReplicas: Obtained %s Offline replica(s) to process." % len(res['Value']))
        return self.__sortBySE(res['Value'])

    def __addAssociatedReplicas(self, replicasToStage, seReplicas, allReplicaInfo):
        """ Retrieve the list of Replicas that belong to the same Tasks as the provided list """
        res = self.storageDB.getAssociatedReplicas(replicasToStage)
        if not res['OK']:
            gLogger.fatal("StageRequest.__addAssociatedReplicas: Failed to get associated Replicas.",
                          res['Message'])
            return res
        addReplicas = {'Offline': {}, 'Waiting': {}}
        replicaIDs = {}
        for replicaID, info in res['Value'].items():
            lfn = info['LFN']
            storageElement = info['SE']
            size = info['Size']
            pfn = info['PFN']
            status = info['Status']
            if status not in ['Waiting', 'Offline']:
                continue
            if storageElement not in addReplicas[status]:
                addReplicas[status][storageElement] = []
            replicaIDs[replicaID] = {'LFN': lfn, 'PFN': pfn, 'Size': size, 'StorageElement': storageElement}
            addReplicas[status][storageElement].append(replicaID)
        waitingReplicas = addReplicas['Waiting']
        offlineReplicas = addReplicas['Offline']
        newReplicaInfo = replicaIDs
        allReplicaInfo.update(newReplicaInfo)
        # First handle Waiting Replicas for which the metadata is to be checked
        for storageElement, seReplicaIDs in waitingReplicas.items():
            for replicaID in list(seReplicaIDs):
                if replicaID in replicasToStage:
                    seReplicaIDs.remove(replicaID)
            res = self.__checkIntegrity(storageElement, seReplicaIDs, allReplicaInfo)
            if not res['OK']:
                gLogger.error('StageRequest.__addAssociatedReplicas: Failed to check Replica Metadata',
                              '(%s): %s' % (storageElement, res['Message']))
            else:
                # Keep all Replicas (Online and Offline)
                if storageElement not in seReplicas:
                    seReplicas[storageElement] = []
                seReplicas[storageElement].extend(res['Value']['Online'])
                replicasToStage.extend(res['Value']['Online'])
                seReplicas[storageElement].extend(res['Value']['Offline'])
                replicasToStage.extend(res['Value']['Offline'])
        # Then handle Offline Replicas for which the metadata is already checked
        for storageElement, seReplicaIDs in offlineReplicas.items():
            if storageElement not in seReplicas:
                seReplicas[storageElement] = []
            for replicaID in sorted(seReplicaIDs):
                if replicaID in replicasToStage:
                    seReplicaIDs.remove(replicaID)
            seReplicas[storageElement].extend(seReplicaIDs)
            replicasToStage.extend(seReplicaIDs)
        for replicaID in allReplicaInfo.keys():
            if replicaID not in replicasToStage:
                del allReplicaInfo[replicaID]
        totalSize = 0
        for storageElement in sorted(seReplicas.keys()):
            replicaIDs = seReplicas[storageElement]
            size = 0
            for replicaID in replicaIDs:
                size += self.__add(storageElement, allReplicaInfo[replicaID]['Size'])
            gLogger.info('StageRequest.__addAssociatedReplicas: Considering %s GB to be staged at %s' %
                         (size, storageElement))
            totalSize += size
        gLogger.info("StageRequest.__addAssociatedReplicas: Obtained %s GB for staging." % totalSize)
        return S_OK({'SEReplicas': seReplicas, 'AllReplicaInfo': allReplicaInfo})

    def __checkIntegrity(self, storageElement, seReplicaIDs, allReplicaInfo):
        """ Check the integrity of the files to ensure they are available.
            Updates the status of Offline Replicas for a later pass.
            Returns the list of Online replicas to be Staged.
        """
        if not seReplicaIDs:
            return S_OK({'Online': [], 'Offline': []})
        pfnRepIDs = {}
        for replicaID in seReplicaIDs:
            pfn = allReplicaInfo[replicaID]['PFN']
            pfnRepIDs[pfn] = replicaID
        gLogger.info("StageRequest.__checkIntegrity: Checking the integrity of %s replicas at %s." %
                     (len(pfnRepIDs), storageElement))
        res = self.replicaManager.getStorageFileMetadata(pfnRepIDs.keys(), storageElement)
        if not res['OK']:
            gLogger.error("StageRequest.__checkIntegrity: Completely failed to obtain metadata for replicas.",
                          res['Message'])
            return res
        terminalReplicaIDs = {}
        onlineReplicaIDs = []
        offlineReplicaIDs = []
        for pfn, metadata in res['Value']['Successful'].items():
            if metadata['Size'] != allReplicaInfo[pfnRepIDs[pfn]]['Size']:
                gLogger.error("StageRequest.__checkIntegrity: PFN StorageElement size does not match FileCatalog", pfn)
                terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN StorageElement size does not match FileCatalog'
                pfnRepIDs.pop(pfn)
            elif metadata['Lost']:
                gLogger.error("StageRequest.__checkIntegrity: PFN has been Lost by the StorageElement", pfn)
                terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN has been Lost by the StorageElement'
                pfnRepIDs.pop(pfn)
            elif metadata['Unavailable']:
                gLogger.error("StageRequest.__checkIntegrity: PFN is declared Unavailable by the StorageElement", pfn)
                terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN is declared Unavailable by the StorageElement'
                pfnRepIDs.pop(pfn)
            else:
                if metadata['Cached']:
                    gLogger.verbose("StageRequest.__checkIntegrity: Cache hit for file.")
                    onlineReplicaIDs.append(pfnRepIDs[pfn])
                else:
                    offlineReplicaIDs.append(pfnRepIDs[pfn])
        for pfn, reason in res['Value']['Failed'].items():
            if re.search('File does not exist', reason):
                gLogger.error("StageRequest.__checkIntegrity: PFN does not exist in the StorageElement", pfn)
                terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN does not exist in the StorageElement'
                pfnRepIDs.pop(pfn)
        # Update the states of the replicas in the database
        # TODO: Send status to integrity DB
        if terminalReplicaIDs:
            gLogger.info("StageRequest.__checkIntegrity: %s replicas are terminally failed." % len(terminalReplicaIDs))
            res = self.storageDB.updateReplicaFailure(terminalReplicaIDs)
            if not res['OK']:
                gLogger.error("StageRequest.__checkIntegrity: Failed to update replica failures.", res['Message'])
        if onlineReplicaIDs:
            gLogger.info("StageRequest.__checkIntegrity: %s replicas found Online." % len(onlineReplicaIDs))
        if offlineReplicaIDs:
            gLogger.info("StageRequest.__checkIntegrity: %s replicas found Offline." % len(offlineReplicaIDs))
            res = self.storageDB.updateReplicaStatus(offlineReplicaIDs, 'Offline')
        return S_OK({'Online': onlineReplicaIDs, 'Offline': offlineReplicaIDs})

    def __reportProblematicFiles(self, lfns, reason):
        # Reporting to the DataIntegrityClient is currently disabled by this
        # early return; the code below is kept for when it is re-enabled.
        return S_OK()
        res = self.dataIntegrityClient.setFileProblematic(lfns, reason, self.name)
        if not res['OK']:
            gLogger.error("StageRequest.__reportProblematicFiles: Failed to report missing files.",
                          res['Message'])
            return res
        if res['Value']['Successful']:
            gLogger.info("StageRequest.__reportProblematicFiles: Successfully reported %s missing files." %
                         len(res['Value']['Successful']))
        if res['Value']['Failed']:
            gLogger.info("StageRequest.__reportProblematicFiles: Failed to report %s problematic files." %
                         len(res['Value']['Failed']))
        return res
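# A minimal, self-contained sketch (not part of the agent) of the cache
# accounting used by __usage/__cache/__add above: DiskCacheTB is read in TB,
# converted to GB, and divided by THROTTLING_STEPS so that only a fraction of
# the cache is filled per agent cycle. Sizes arrive in bytes and are
# accumulated in GB. All numbers below are illustrative only.

def demoCacheAccounting():
    THROTTLING_STEPS = 12
    diskCacheTB = 10.                                      # an SE advertising a 10 TB disk cache
    cacheLimitGB = diskCacheTB * 1000. / THROTTLING_STEPS  # ~833 GB usable per cycle
    usedGB = 0.
    fileSizesBytes = [5 * 10 ** 9] * 300                   # three hundred 5 GB files waiting
    staged = 0
    for size in fileSizesBytes:
        if not usedGB < cacheLimitGB:                      # same comparison the agent uses
            break                                          # stop adding replicas when the cache is full
        usedGB += size / (1000 * 1000 * 1000.0)
        staged += 1
    print('would stage %s files, using %.1f of %.1f GB' % (staged, usedGB, cacheLimitGB))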
def initializeHandler(cls, serviceInfoDict):
    """ Initialization of the DB object """
    cls.storageManagementDB = StorageManagementDB()
    return S_OK()
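# Hedged sketch of how a method on such a handler is typically exposed in
# DIRAC: an export_* method with a matching types_* signature list, forwarding
# to the DB object created in initializeHandler. getCacheReplicas exists on
# StorageManagementDB (it is used by the agents below); exposing it this way
# on the service is an assumption for illustration.

types_getCacheReplicas = [dict]

def export_getCacheReplicas(self, condDict):
    """ Return the CacheReplicas records selected by condDict (illustrative only) """
    return self.storageManagementDB.getCacheReplicas(condDict)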
def initializeStorageManagerHandler( serviceInfo ):
  global storageDB
  storageDB = StorageManagementDB()
  return S_OK()
class RequestPreparationAgent( AgentModule ):

  def initialize( self ):
    self.fileCatalog = FileCatalog()
    #self.stagerClient = StorageManagerClient()
    self.dataIntegrityClient = DataIntegrityClient()
    self.storageDB = StorageManagementDB()
    # This sets the Default Proxy to be used as that defined under
    # /Operations/Shifter/DataManager.
    # The shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    return S_OK()

  def execute( self ):
    res = self.prepareNewReplicas()
    return res

  def prepareNewReplicas( self ):
    """ This is the first logical task to be executed. It manages the
        New -> Waiting transition of the Replicas.
    """
    res = self.__getNewReplicas()
    if not res['OK']:
      gLogger.fatal( "RequestPreparation.prepareNewReplicas: Failed to get replicas from StagerDB.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.info( "There were no New replicas found" )
      return res
    replicas = res['Value']['Replicas']
    replicaIDs = res['Value']['ReplicaIDs']
    gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained %s New replicas for preparation." % len( replicaIDs ) )

    # Check that the files exist in the FileCatalog
    res = self.__getExistingFiles( replicas.keys() )
    if not res['OK']:
      return res
    exist = res['Value']['Exist']
    terminal = res['Value']['Missing']
    failed = res['Value']['Failed']
    if not exist:
      gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed to determine the existence of any files' )
      return S_OK()
    terminalReplicaIDs = {}
    for lfn, reason in terminal.items():
      for se, replicaID in replicas[lfn].items():
        terminalReplicaIDs[replicaID] = reason
      replicas.pop( lfn )
    gLogger.info( "RequestPreparation.prepareNewReplicas: %s files exist in the FileCatalog." % len( exist ) )
    if terminal:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s files do not exist in the FileCatalog." % len( terminal ) )

    # Obtain the file sizes from the FileCatalog
    res = self.__getFileSize( exist )
    if not res['OK']:
      return res
    failed.update( res['Value']['Failed'] )
    terminal = res['Value']['ZeroSize']
    fileSizes = res['Value']['FileSizes']
    if not fileSizes:
      gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed to determine the sizes of any files' )
      return S_OK()
    for lfn, reason in terminal.items():
      for se, replicaID in replicas[lfn].items():
        terminalReplicaIDs[replicaID] = reason
      replicas.pop( lfn )
    gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained %s file sizes from the FileCatalog." % len( fileSizes ) )
    if terminal:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s files registered with zero size in the FileCatalog." % len( terminal ) )

    # Obtain the replicas from the FileCatalog
    res = self.__getFileReplicas( fileSizes.keys() )
    if not res['OK']:
      return res
    failed.update( res['Value']['Failed'] )
    terminal = res['Value']['ZeroReplicas']
    fileReplicas = res['Value']['Replicas']
    if not fileReplicas:
      gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed to determine the replicas of any files' )
      return S_OK()
    for lfn, reason in terminal.items():
      for se, replicaID in replicas[lfn].items():
        terminalReplicaIDs[replicaID] = reason
      replicas.pop( lfn )
    gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained replica information for %s file(s) from the FileCatalog." % len( fileReplicas ) )
    if terminal:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s files registered with zero replicas in the FileCatalog." % len( terminal ) )

    # Check that the replicas exist at the requested site
    replicaMetadata = []
    for lfn, requestedSEs in replicas.items():
      lfnReplicas = fileReplicas[lfn]
      for requestedSE, replicaID in requestedSEs.items():
        if requestedSE not in lfnReplicas.keys():
          terminalReplicaIDs[replicaID] = "LFN not registered at requested SE"
          replicas[lfn].pop( requestedSE )
        else:
          replicaMetadata.append( ( replicaID, lfnReplicas[requestedSE], fileSizes[lfn] ) )

    # Update the states of the files in the database
    if terminalReplicaIDs:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s replicas are terminally failed." % len( terminalReplicaIDs ) )
      #res = self.stagerClient.updateReplicaFailure( terminalReplicaIDs )
      res = self.storageDB.updateReplicaFailure( terminalReplicaIDs )
      if not res['OK']:
        gLogger.error( "RequestPreparation.prepareNewReplicas: Failed to update replica failures.", res['Message'] )
    if replicaMetadata:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s replica metadata to be updated." % len( replicaMetadata ) )
      # Sets Status = 'Waiting' on the CacheReplicas records that pass the catalogue checks
      res = self.storageDB.updateReplicaInformation( replicaMetadata )
      if not res['OK']:
        gLogger.error( "RequestPreparation.prepareNewReplicas: Failed to update replica metadata.", res['Message'] )
    return S_OK()

  def __getNewReplicas( self ):
    """ This obtains the New replicas from the Replicas table and for each LFN the requested storage element """
    # First obtain the New replicas from the CacheReplicas table
    res = self.storageDB.getCacheReplicas( {'Status': 'New'} )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__getNewReplicas: Failed to get replicas with New status.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.debug( "RequestPreparation.__getNewReplicas: No New replicas found to process." )
      return S_OK()
    else:
      gLogger.debug( "RequestPreparation.__getNewReplicas: Obtained %s New replica(s) to process." % len( res['Value'] ) )
    replicas = {}
    replicaIDs = {}
    for replicaID, info in res['Value'].items():
      lfn = info['LFN']
      storageElement = info['SE']
      if lfn not in replicas:
        replicas[lfn] = {}
      replicas[lfn][storageElement] = replicaID
      replicaIDs[replicaID] = ( lfn, storageElement )
    return S_OK( {'Replicas': replicas, 'ReplicaIDs': replicaIDs} )

  def __getExistingFiles( self, lfns ):
    """ This checks that the files exist in the FileCatalog. """
    filesExist = []
    missing = {}
    res = self.fileCatalog.exists( lfns )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__getExistingFiles: Failed to determine whether files exist.", res['Message'] )
      return res
    failed = res['Value']['Failed']
    for lfn, exists in res['Value']['Successful'].items():
      if exists:
        filesExist.append( lfn )
      else:
        missing[lfn] = 'LFN not registered in the FileCatalog'
    if missing:
      for lfn, reason in missing.items():
        gLogger.warn( "RequestPreparation.__getExistingFiles: %s" % reason, lfn )
      self.__reportProblematicFiles( missing.keys(), 'LFN-LFC-DoesntExist' )
    return S_OK( {'Exist': filesExist, 'Missing': missing, 'Failed': failed} )

  def __getFileSize( self, lfns ):
    """ This obtains the file size from the FileCatalog. """
    fileSizes = {}
    zeroSize = {}
    res = self.fileCatalog.getFileSize( lfns )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__getFileSize: Failed to get sizes for files.", res['Message'] )
      return res
    failed = res['Value']['Failed']
    for lfn, size in res['Value']['Successful'].items():
      if size == 0:
        zeroSize[lfn] = "LFN registered with zero size in the FileCatalog"
      else:
        fileSizes[lfn] = size
    if zeroSize:
      for lfn, reason in zeroSize.items():
        gLogger.warn( "RequestPreparation.__getFileSize: %s" % reason, lfn )
      self.__reportProblematicFiles( zeroSize.keys(), 'LFN-LFC-ZeroSize' )
    return S_OK( {'FileSizes': fileSizes, 'ZeroSize': zeroSize, 'Failed': failed} )

  def __getFileReplicas( self, lfns ):
    """ This obtains the replicas from the FileCatalog. """
    replicas = {}
    noReplicas = {}
    res = self.fileCatalog.getReplicas( lfns )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__getFileReplicas: Failed to obtain file replicas.", res['Message'] )
      return res
    failed = res['Value']['Failed']
    for lfn, lfnReplicas in res['Value']['Successful'].items():
      if len( lfnReplicas.keys() ) == 0:
        noReplicas[lfn] = "LFN registered with zero replicas in the FileCatalog"
      else:
        replicas[lfn] = lfnReplicas
    if noReplicas:
      for lfn, reason in noReplicas.items():
        gLogger.warn( "RequestPreparation.__getFileReplicas: %s" % reason, lfn )
      self.__reportProblematicFiles( noReplicas.keys(), 'LFN-LFC-NoReplicas' )
    return S_OK( {'Replicas': replicas, 'ZeroReplicas': noReplicas, 'Failed': failed} )

  def __reportProblematicFiles( self, lfns, reason ):
    # Reporting to the DataIntegrityClient is currently disabled by this early
    # return; the code below is kept for when it is re-enabled.
    return S_OK()
    res = self.dataIntegrityClient.setFileProblematic( lfns, reason, self.name )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__reportProblematicFiles: Failed to report missing files.", res['Message'] )
      return res
    if res['Value']['Successful']:
      gLogger.info( "RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files." % len( res['Value']['Successful'] ) )
    if res['Value']['Failed']:
      gLogger.info( "RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files." % len( res['Value']['Failed'] ) )
    return res
class StageMonitorAgent( AgentModule ):

  def initialize( self ):
    self.replicaManager = ReplicaManager()
    #self.stagerClient = StorageManagerClient()
    self.dataIntegrityClient = DataIntegrityClient()
    self.storageDB = StorageManagementDB()
    # This sets the Default Proxy to be used as that defined under
    # /Operations/Shifter/DataManager.
    # The shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )
    return S_OK()

  def execute( self ):
    res = self.monitorStageRequests()
    return res

  def monitorStageRequests( self ):
    """ This is the third logical task: it manages the StageSubmitted -> Staged
        transition of the Replicas.
    """
    res = self.__getStageSubmittedReplicas()
    if not res['OK']:
      gLogger.fatal( "StageMonitor.monitorStageRequests: Failed to get replicas from StorageManagementDB.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.info( "StageMonitor.monitorStageRequests: There were no StageSubmitted replicas found" )
      return res
    seReplicas = res['Value']['SEReplicas']
    replicaIDs = res['Value']['ReplicaIDs']
    gLogger.info( "StageMonitor.monitorStageRequests: Obtained %s StageSubmitted replicas for monitoring." % len( replicaIDs ) )
    for storageElement, seReplicaIDs in seReplicas.items():
      self.__monitorStorageElementStageRequests( storageElement, seReplicaIDs, replicaIDs )
    return S_OK()

  def __monitorStorageElementStageRequests( self, storageElement, seReplicaIDs, replicaIDs ):
    terminalReplicaIDs = {}
    stagedReplicas = []
    pfnRepIDs = {}
    pfnReqIDs = {}
    for replicaID in seReplicaIDs:
      pfn = replicaIDs[replicaID]['PFN']
      pfnRepIDs[pfn] = replicaID
      requestID = replicaIDs[replicaID].get( 'RequestID', None )
      if requestID:
        pfnReqIDs[pfn] = replicaIDs[replicaID]['RequestID']
    gLogger.info( "StageMonitor.__monitorStorageElementStageRequests: Monitoring %s stage requests for %s." % ( len( pfnRepIDs ), storageElement ) )
    res = self.replicaManager.getStorageFileMetadata( pfnReqIDs.keys(), storageElement )
    if not res['OK']:
      gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Completely failed to monitor stage requests for replicas.", res['Message'] )
      return
    prestageStatus = res['Value']
    for pfn, reason in prestageStatus['Failed'].items():
      if re.search( 'File does not exist', reason ):
        gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: PFN did not exist in the StorageElement", pfn )
        terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN did not exist in the StorageElement'
    for pfn, staged in prestageStatus['Successful'].items():
      if staged and 'Cached' in staged and staged['Cached']:
        stagedReplicas.append( pfnRepIDs[pfn] )
    # Update the states of the replicas in the database
    if terminalReplicaIDs:
      gLogger.info( "StageMonitor.__monitorStorageElementStageRequests: %s replicas are terminally failed." % len( terminalReplicaIDs ) )
      res = self.storageDB.updateReplicaFailure( terminalReplicaIDs )
      if not res['OK']:
        gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Failed to update replica failures.", res['Message'] )
    if stagedReplicas:
      gLogger.info( "StageMonitor.__monitorStorageElementStageRequests: %s staged replicas to be updated." % len( stagedReplicas ) )
      res = self.storageDB.setStageComplete( stagedReplicas )
      if not res['OK']:
        gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Failed to update staged replicas.", res['Message'] )
      res = self.storageDB.updateReplicaStatus( stagedReplicas, 'Staged' )
      if not res['OK']:
        gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Failed to update replica status.", res['Message'] )
    return

  def __getStageSubmittedReplicas( self ):
    """ This obtains the StageSubmitted replicas from the Replicas table and the
        RequestID from the StageRequests table.
    """
    res = self.storageDB.getCacheReplicas( {'Status': 'StageSubmitted'} )
    if not res['OK']:
      gLogger.error( "StageMonitor.__getStageSubmittedReplicas: Failed to get replicas with StageSubmitted status.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.debug( "StageMonitor.__getStageSubmittedReplicas: No StageSubmitted replicas found to process." )
      return S_OK()
    else:
      gLogger.debug( "StageMonitor.__getStageSubmittedReplicas: Obtained %s StageSubmitted replica(s) to process." % len( res['Value'] ) )
    seReplicas = {}
    replicaIDs = res['Value']
    for replicaID, info in replicaIDs.items():
      storageElement = info['SE']
      if storageElement not in seReplicas:
        seReplicas[storageElement] = []
      seReplicas[storageElement].append( replicaID )
    # The RequestID is not returned with the replicas, so fetch it from the StageRequests table
    res = self.storageDB.getStageRequests( {'ReplicaID': replicaIDs.keys()} )
    if not res['OK']:
      return res
    if not res['Value']:
      return S_ERROR( 'Could not obtain request IDs for replicas %s from StageRequests table' % ( replicaIDs.keys() ) )
    for replicaID, info in res['Value'].items():
      reqID = info['RequestID']
      replicaIDs[replicaID]['RequestID'] = reqID
    return S_OK( {'SEReplicas': seReplicas, 'ReplicaIDs': replicaIDs} )

  def __reportProblematicFiles( self, lfns, reason ):
    # Reporting to the DataIntegrityClient is currently disabled by this early
    # return; the code below is kept for when it is re-enabled.
    return S_OK()
    res = self.dataIntegrityClient.setFileProblematic( lfns, reason, self.name )
    if not res['OK']:
      gLogger.error( "StageMonitor.__reportProblematicFiles: Failed to report missing files.", res['Message'] )
      return res
    if res['Value']['Successful']:
      gLogger.info( "StageMonitor.__reportProblematicFiles: Successfully reported %s missing files." % len( res['Value']['Successful'] ) )
    if res['Value']['Failed']:
      gLogger.info( "StageMonitor.__reportProblematicFiles: Failed to report %s problematic files." % len( res['Value']['Failed'] ) )
    return res
class StageRequestAgent(AgentModule):

    def initialize(self):
        self.replicaManager = ReplicaManager()
        #self.stagerClient = StorageManagerClient()
        self.dataIntegrityClient = DataIntegrityClient()
        self.storageDB = StorageManagementDB()
        # pin lifetime = 1 day
        self.pinLifetime = self.am_getOption('PinLifetime', 60 * 60 * 24)
        # This sets the Default Proxy to be used as that defined under
        # /Operations/Shifter/DataManager.
        # The shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')
        return S_OK()

    def execute(self):
        # Get the currently submitted stage space and the amount of pinned space
        # for each storage element
        res = self.storageDB.getSubmittedStagePins()
        if not res['OK']:
            gLogger.fatal("StageRequest.execute: Failed to obtain submitted requests from StorageManagementDB.",
                          res['Message'])
            return res
        self.storageElementUsage = res['Value']
        if self.storageElementUsage:
            gLogger.info("StageRequest.execute: Active stage/pin requests found at the following sites:")
            for storageElement in sortList(self.storageElementUsage.keys()):
                seDict = self.storageElementUsage[storageElement]
                # Daniela: fishy? Changed it to GB and division by 1024 instead of 1000
                gLogger.info("StageRequest.execute: %s: %s replicas with a size of %.3f GB." %
                             (storageElement.ljust(15), str(seDict['Replicas']).rjust(6),
                              seDict['TotalSize'] / (1024 * 1024 * 1024.0)))
        if not self.storageElementUsage:
            gLogger.info("StageRequest.execute: No active stage/pin requests found.")
        res = self.submitStageRequests()
        return res

    def submitStageRequests(self):
        """ This manages the Waiting -> StageSubmitted transition of the Replicas """
        res = self.__getWaitingReplicas()
        if not res['OK']:
            gLogger.fatal("StageRequest.submitStageRequests: Failed to get replicas from StorageManagementDB.",
                          res['Message'])
            return res
        if not res['Value']:
            gLogger.info("StageRequest.submitStageRequests: There were no Waiting replicas found")
            return res
        seReplicas = res['Value']['SEReplicas']
        allReplicaInfo = res['Value']['ReplicaIDs']
        gLogger.info("StageRequest.submitStageRequests: Obtained %s replicas Waiting for staging." % len(allReplicaInfo))
        for storageElement, seReplicaIDs in seReplicas.items():
            self.__issuePrestageRequests(storageElement, seReplicaIDs, allReplicaInfo)
        return S_OK()

    def __issuePrestageRequests(self, storageElement, seReplicaIDs, allReplicaInfo):
        # First select which files are eligible for prestaging based on the available space
        usedSpace = 0
        if storageElement in self.storageElementUsage:
            usedSpace = self.storageElementUsage[storageElement]['TotalSize']
        totalSpace = gConfig.getValue("/Resources/StorageElements/%s/CacheSize" % storageElement, 0)
        if not totalSpace:
            gLogger.info("StageRequest.__issuePrestageRequests: No space restriction at %s" % (storageElement))
            selectedReplicaIDs = seReplicaIDs
        elif (totalSpace > usedSpace):
            gLogger.debug("StageRequest.__issuePrestageRequests: total space = %s, used space = %s" %
                          (totalSpace, usedSpace))
            gLogger.info("StageRequest.__issuePrestageRequests: %.4f GB available at %s" %
                         ((totalSpace - usedSpace) / (1024 * 1024 * 1024.0), storageElement))
            selectedReplicaIDs = []
            # The logic was bad here: before the first comparison test, the single
            # selected file for staging could be larger than the available space
            for replicaID in seReplicaIDs:
                if (totalSpace - usedSpace) > allReplicaInfo[replicaID]['Size']:
                    usedSpace += allReplicaInfo[replicaID]['Size']
                    selectedReplicaIDs.append(replicaID)
        else:
            gLogger.info("StageRequest.__issuePrestageRequests: %.2f GB used at %s (limit %.2f GB)" %
                         ((usedSpace) / (1024 * 1024 * 1024.0), storageElement,
                          totalSpace / (1024 * 1024 * 1024.0)))
            return
        gLogger.info("StageRequest.__issuePrestageRequests: Selected %s files eligible for staging at %s." %
                     (len(selectedReplicaIDs), storageElement))
        # Now check the integrity of the eligible files
        pfnRepIDs = {}
        for replicaID in selectedReplicaIDs:
            pfn = allReplicaInfo[replicaID]['PFN']
            pfnRepIDs[pfn] = replicaID
        res = self.__checkIntegrity(storageElement, pfnRepIDs, allReplicaInfo)
        if not res['OK']:
            return res
        pfnRepIDs = res['Value']
        # Now issue the prestage requests for the remaining replicas
        stageRequestMetadata = {}
        updatedPfnIDs = []
        if pfnRepIDs:
            gLogger.info("StageRequest.__issuePrestageRequests: Submitting %s stage requests for %s." %
                         (len(pfnRepIDs), storageElement))
            res = self.replicaManager.prestageStorageFile(pfnRepIDs.keys(), storageElement, lifetime=self.pinLifetime)
            gLogger.debug("StageRequest.__issuePrestageRequests: replicaManager.prestageStorageFile: res=", res)
            #res= {'OK': True, 'Value': {'Successful': {}, 'Failed': {'srm://srm-lhcb.cern.ch/castor/cern.ch/grid/lhcb/data/2010/RAW/EXPRESS/LHCb/COLLISION10/71476/071476_0000000241.raw': ' SRM2Storage.__gfal_exec: Failed to perform gfal_prestage.[SE][BringOnline][SRM_INVALID_REQUEST] httpg://srm-lhcb.cern.ch:8443/srm/managerv2: User not able to access specified space token\n'}}}
            #res= {'OK': True, 'Value': {'Successful': {'srm://gridka-dCache.fzk.de/pnfs/gridka.de/lhcb/data/2009/RAW/FULL/LHCb/COLLISION09/63495/063495_0000000001.raw': '-2083846379'}, 'Failed': {}}}
            if not res['OK']:
                gLogger.error("StageRequest.__issuePrestageRequests: Completely failed to submit stage requests for replicas.",
                              res['Message'])
            else:
                for pfn, requestID in res['Value']['Successful'].items():
                    if requestID not in stageRequestMetadata:
                        stageRequestMetadata[requestID] = []
                    stageRequestMetadata[requestID].append(pfnRepIDs[pfn])
                    updatedPfnIDs.append(pfnRepIDs[pfn])
        if stageRequestMetadata:
            gLogger.info("StageRequest.__issuePrestageRequests: %s stage request metadata to be updated." %
                         len(stageRequestMetadata))
            res = self.storageDB.insertStageRequest(stageRequestMetadata, self.pinLifetime)
            if not res['OK']:
                gLogger.error("StageRequest.__issuePrestageRequests: Failed to insert stage request metadata.",
                              res['Message'])
            res = self.storageDB.updateReplicaStatus(updatedPfnIDs, 'StageSubmitted')
            if not res['OK']:
                gLogger.error("StageRequest.__issuePrestageRequests: Failed to update replica status.",
                              res['Message'])
        return

    def __getWaitingReplicas(self):
        """ This obtains the Waiting replicas from the Replicas table and for each LFN the requested storage element """
        # First obtain the Waiting replicas from the Replicas table
        res = self.storageDB.getWaitingReplicas()
        if not res['OK']:
            gLogger.error("StageRequest.__getWaitingReplicas: Failed to get replicas with Waiting status.",
                          res['Message'])
            return res
        if not res['Value']:
            gLogger.debug("StageRequest.__getWaitingReplicas: No Waiting replicas found to process.")
            return S_OK()
        else:
            gLogger.debug("StageRequest.__getWaitingReplicas: Obtained %s Waiting replica(s) to process." % len(res['Value']))
        seReplicas = {}
        replicaIDs = {}
        for replicaID, info in res['Value'].items():
            lfn = info['LFN']
            storageElement = info['SE']
            size = info['Size']
            pfn = info['PFN']
            replicaIDs[replicaID] = {'LFN': lfn, 'PFN': pfn, 'Size': size, 'StorageElement': storageElement}
            if storageElement not in seReplicas:
                seReplicas[storageElement] = []
            seReplicas[storageElement].append(replicaID)
        return S_OK({'SEReplicas': seReplicas, 'ReplicaIDs': replicaIDs})

    def __checkIntegrity(self, storageElement, pfnRepIDs, replicaIDs):
        # Check the integrity of the files to ensure they are available
        terminalReplicaIDs = {}
        gLogger.info("StageRequest.__checkIntegrity: Checking the integrity of %s replicas at %s." %
                     (len(pfnRepIDs), storageElement))
        res = self.replicaManager.getStorageFileMetadata(pfnRepIDs.keys(), storageElement)
        if not res['OK']:
            gLogger.error("StageRequest.__checkIntegrity: Completely failed to obtain metadata for replicas.",
                          res['Message'])
            return res
        for pfn, metadata in res['Value']['Successful'].items():
            if metadata['Cached']:
                gLogger.info("StageRequest.__checkIntegrity: Cache hit for file.")
            if metadata['Size'] != replicaIDs[pfnRepIDs[pfn]]['Size']:
                gLogger.error("StageRequest.__checkIntegrity: PFN StorageElement size does not match FileCatalog", pfn)
                terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN StorageElement size does not match FileCatalog'
                pfnRepIDs.pop(pfn)
            elif metadata['Lost']:
                gLogger.error("StageRequest.__checkIntegrity: PFN has been Lost by the StorageElement", pfn)
                terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN has been Lost by the StorageElement'
                pfnRepIDs.pop(pfn)
            elif metadata['Unavailable']:
                gLogger.error("StageRequest.__checkIntegrity: PFN is declared Unavailable by the StorageElement", pfn)
                terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN is declared Unavailable by the StorageElement'
                pfnRepIDs.pop(pfn)
        for pfn, reason in res['Value']['Failed'].items():
            if re.search('File does not exist', reason):
                gLogger.error("StageRequest.__checkIntegrity: PFN does not exist in the StorageElement", pfn)
                terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN does not exist in the StorageElement'
                pfnRepIDs.pop(pfn)
        # Update the states of the replicas in the database
        # TODO: Send status to integrity DB
        if terminalReplicaIDs:
            gLogger.info("StageRequest.__checkIntegrity: %s replicas are terminally failed." % len(terminalReplicaIDs))
            res = self.storageDB.updateReplicaFailure(terminalReplicaIDs)
            if not res['OK']:
                gLogger.error("StageRequest.__checkIntegrity: Failed to update replica failures.", res['Message'])
        return S_OK(pfnRepIDs)

    def __reportProblematicFiles(self, lfns, reason):
        # Reporting to the DataIntegrityClient is currently disabled by this
        # early return; the code below is kept for when it is re-enabled.
        return S_OK()
        res = self.dataIntegrityClient.setFileProblematic(lfns, reason, self.name)
        if not res['OK']:
            gLogger.error("StageRequest.__reportProblematicFiles: Failed to report missing files.",
                          res['Message'])
            return res
        if res['Value']['Successful']:
            gLogger.info("StageRequest.__reportProblematicFiles: Successfully reported %s missing files." %
                         len(res['Value']['Successful']))
        if res['Value']['Failed']:
            gLogger.info("StageRequest.__reportProblematicFiles: Failed to report %s problematic files." %
                         len(res['Value']['Failed']))
        return res
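# The space-based selection in __issuePrestageRequests above, extracted into a
# standalone sketch. Note the fix mentioned in the inline comment: each file
# is compared against the remaining space *before* it is added, so a first
# file larger than the free space is skipped rather than submitted.

def selectEligibleReplicas(replicaSizes, totalSpace, usedSpace):
    """ replicaSizes: {replicaID: size in bytes}; returns the IDs that fit """
    selected = []
    for replicaID, size in sorted(replicaSizes.items()):
        if (totalSpace - usedSpace) > size:
            usedSpace += size
            selected.append(replicaID)
    return selected

# e.g. selectEligibleReplicas({1: 6, 2: 3, 3: 1}, totalSpace=10, usedSpace=5)
# skips replica 1 (too big for the 5 units remaining) but keeps 2 and 3.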
class RequestFinalizationAgent( AgentModule ): def initialize( self ): # This sets the Default Proxy to used as that defined under # /Operations/Shifter/DataManager # the shifterProxy option in the Configuration can be used to change this default. self.am_setOption( 'shifterProxy', 'DataManager' ) self.storageDB = StorageManagementDB() #self.stagerClient = StorageManagerClient() return S_OK() def execute( self ): res = self.clearFailedTasks() res = self.callbackStagedTasks() res = self.removeUnlinkedReplicas() res = self.setOldTasksAsFailed( self.am_getOption( 'FailIntervalDay', 3 ) ) return res def clearFailedTasks( self ): """ This obtains the tasks which are marked as Failed and remove all the associated records """ res = self.storageDB.getTasksWithStatus( 'Failed' ) if not res['OK']: gLogger.fatal( "RequestFinalization.clearFailedTasks: Failed to get Failed Tasks from StagerDB.", res['Message'] ) return res failedTasks = res['Value'] gLogger.info( "RequestFinalization.clearFailedTasks: Obtained %s tasks in the 'Failed' status." % len( failedTasks ) ) for taskID, ( source, callback, sourceTask ) in failedTasks.items(): if ( callback and sourceTask ): res = self.__performCallback( 'Failed', callback, sourceTask ) if not res['OK']: failedTasks.pop( taskID ) if not failedTasks: gLogger.info( "RequestFinalization.clearFailedTasks: No tasks to remove." ) return S_OK() gLogger.info( "RequestFinalization.clearFailedTasks: Removing %s tasks..." % len( failedTasks ) ) res = self.storageDB.removeTasks( failedTasks.keys() ) if not res['OK']: gLogger.error( "RequestFinalization.clearFailedTasks: Failed to remove tasks.", res['Message'] ) return res gLogger.info( "RequestFinalization.clearFailedTasks: ...removed." ) return S_OK() def callbackDoneTasks( self ): """ This issues the call back message for the Tasks with a State='Done' """ res = self.storageDB.getTasksWithStatus( 'Done' ) if not res['OK']: gLogger.fatal( "RequestFinalization.callbackDoneTasks: Failed to get Done Tasks from StorageManagementDB.", res['Message'] ) return res doneTasks = res['Value'] gLogger.info( "RequestFinalization.callbackDoneTasks: Obtained %s tasks in the 'Done' status." % len( doneTasks ) ) for taskID, ( source, callback, sourceTask ) in doneTasks.items(): if ( callback and sourceTask ): res = self.__performCallback( 'Done', callback, sourceTask ) if not res['OK']: doneTasks.pop( taskID ) if not doneTasks: gLogger.info( "RequestFinalization.callbackDoneTasks: No tasks to update to Done." ) return S_OK() res = self.storageDB.removeTasks( doneTasks.keys() ) if not res['OK']: gLogger.fatal( "RequestFinalization.callbackDoneTasks: Failed to remove Done tasks.", res['Message'] ) return res def callbackStagedTasks( self ): """ This updates the status of the Tasks to Done then issues the call back message """ res = self.storageDB.getTasksWithStatus( 'Staged' ) if not res['OK']: gLogger.fatal( "RequestFinalization.callbackStagedTasks: Failed to get Staged Tasks from StagerDB.", res['Message'] ) return res stagedTasks = res['Value'] gLogger.info( "RequestFinalization.callbackStagedTasks: Obtained %s tasks in the 'Staged' status." 
% len( stagedTasks ) ) for taskID, ( source, callback, sourceTask ) in stagedTasks.items(): if ( callback and sourceTask ): res = self.__performCallback( 'Done', callback, sourceTask ) if not res['OK']: stagedTasks.pop( taskID ) else: gLogger.info( "RequestFinalization.callbackStagedTasks, Task = %s: %s" % ( sourceTask, res['Value'] ) ) if not stagedTasks: gLogger.info( "RequestFinalization.callbackStagedTasks: No tasks to update to Done." ) return S_OK() # Daniela: Why is the line below commented out? #res = self.stagerClient.setTasksDone(stagedTasks.keys()) res = self.storageDB.removeTasks( stagedTasks.keys() ) if not res['OK']: gLogger.fatal( "RequestFinalization.callbackStagedTasks: Failed to remove staged Tasks.", res['Message'] ) return res def __performCallback( self, status, callback, sourceTask ): method, service = callback.split( '@' ) gLogger.debug( "RequestFinalization.__performCallback: Attempting to perform call back for %s with %s status" % ( sourceTask, status ) ) client = RPCClient( service ) gLogger.debug( "RequestFinalization.__performCallback: Created RPCClient to %s" % service ) execString = "res = client.%s('%s','%s')" % ( method, sourceTask, status ) gLogger.debug( "RequestFinalization.__performCallback: Attempting to invoke %s service method" % method ) exec( execString ) if not res['OK']: gLogger.error( "RequestFinalization.__performCallback: Failed to perform callback", res['Message'] ) else: gLogger.info( "RequestFinalization.__performCallback: Successfully issued callback to %s for %s with %s status" % ( callback, sourceTask, status ) ) return res def removeUnlinkedReplicas( self ): gLogger.info( "RequestFinalization.removeUnlinkedReplicas: Attempting to cleanup unlinked Replicas." ) res = self.storageDB.removeUnlinkedReplicas() if not res['OK']: gLogger.error( "RequestFinalization.removeUnlinkedReplicas: Failed to cleanup unlinked Replicas.", res['Message'] ) else: gLogger.info( "RequestFinalization.removeUnlinkedReplicas: Successfully removed unlinked Replicas." ) return res def clearReleasedTasks( self ): # TODO: issue release of the pins associated to this task res = self.storageDB.getTasksWithStatus( 'Released' ) if not res['OK']: gLogger.fatal( "RequestFinalization.clearReleasedTasks: Failed to get Released Tasks from StagerDB.", res['Message'] ) return res stagedTasks = res['Value'] gLogger.info( "RequestFinalization.clearReleasedTasks: Removing %s tasks..." % len( stagedTasks ) ) res = self.storageDB.removeTasks( stagedTasks.keys() ) if not res['OK']: gLogger.error( "RequestFinalization.clearReleasedTasks: Failed to remove tasks.", res['Message'] ) return res gLogger.info( "RequestFinalization.clearReleasedTasks: ...removed." ) return S_OK() def setOldTasksAsFailed( self, daysOld ): gLogger.debug( "RequestFinalization.setOldTasksAsFailed: Attempting...." ) res = self.storageDB.setOldTasksAsFailed( daysOld ) if not res['OK']: gLogger.error( "RequestFinalization.setOldTasksAsFailed: Failed to set old tasks to a Failed state.", res['Message'] ) return res return S_OK()
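# A note on __performCallback above: it builds a source string and passes it
# to exec(). A safer, functionally equivalent sketch uses getattr() on the
# RPCClient proxy instead. This assumes the same 'method@service' callback
# format; it is an alternative sketch, not the code the agent currently runs:

from DIRAC import gLogger
from DIRAC.Core.DISET.RPCClient import RPCClient

def performCallbackSafely( status, callback, sourceTask ):
  """ Resolve 'method@service' and invoke the remote method by name. """
  method, service = callback.split( '@' )
  client = RPCClient( service )
  # getattr returns the remote-method proxy, so no exec or string building
  remoteMethod = getattr( client, method )
  res = remoteMethod( sourceTask, status )
  if not res['OK']:
    gLogger.error( "performCallbackSafely: Failed to perform callback", res['Message'] )
  return res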
def initializeStorageManagerHandler( serviceInfo ):
  global storageDB
  storageDB = StorageManagementDB()
  return storageDB._checkTable()
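# The initializer above installs a module-global database instance, as DIRAC
# service handlers conventionally do. A minimal sketch of the surrounding
# handler pattern; the types_/export_ method shown is hypothetical, purely to
# illustrate how such a handler delegates to the global storageDB:

from DIRAC.Core.DISET.RequestHandler import RequestHandler

class StorageManagerHandler( RequestHandler ):

  types_getCacheReplicas = [dict]
  def export_getCacheReplicas( self, condDict ):
    """ Hypothetical RPC passthrough to the module-global storageDB. """
    return storageDB.getCacheReplicas( condDict )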
class RequestPreparationAgent(AgentModule): def initialize(self): self.fileCatalog = FileCatalog() #self.stagerClient = StorageManagerClient() self.dataIntegrityClient = DataIntegrityClient() self.storageDB = StorageManagementDB() # This sets the Default Proxy to used as that defined under # /Operations/Shifter/DataManager # the shifterProxy option in the Configuration can be used to change this default. self.am_setOption('shifterProxy', 'DataManager') return S_OK() def execute(self): res = self.prepareNewReplicas() return res def prepareNewReplicas(self): """ This is the first logical task to be executed and manages the New->Waiting transition of the Replicas """ res = self.__getNewReplicas() if not res['OK']: gLogger.fatal( "RequestPreparation.prepareNewReplicas: Failed to get replicas from StagerDB.", res['Message']) return res if not res['Value']: gLogger.info("There were no New replicas found") return res replicas = res['Value']['Replicas'] replicaIDs = res['Value']['ReplicaIDs'] gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained %s New replicas for preparation." % len(replicaIDs)) # Check that the files exist in the FileCatalog res = self.__getExistingFiles(replicas.keys()) if not res['OK']: return res exist = res['Value']['Exist'] terminal = res['Value']['Missing'] failed = res['Value']['Failed'] if not exist: gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed determine existance of any files' ) return S_OK() terminalReplicaIDs = {} for lfn, reason in terminal.items(): for se, replicaID in replicas[lfn].items(): terminalReplicaIDs[replicaID] = reason replicas.pop(lfn) gLogger.info( "RequestPreparation.prepareNewReplicas: %s files exist in the FileCatalog." % len(exist)) if terminal: gLogger.info( "RequestPreparation.prepareNewReplicas: %s files do not exist in the FileCatalog." % len(terminal)) # Obtain the file sizes from the FileCatalog res = self.__getFileSize(exist) if not res['OK']: return res failed.update(res['Value']['Failed']) terminal = res['Value']['ZeroSize'] fileSizes = res['Value']['FileSizes'] if not fileSizes: gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed determine sizes of any files' ) return S_OK() for lfn, reason in terminal.items(): for se, replicaID in replicas[lfn].items(): terminalReplicaIDs[replicaID] = reason replicas.pop(lfn) gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained %s file sizes from the FileCatalog." % len(fileSizes)) if terminal: gLogger.info( "RequestPreparation.prepareNewReplicas: %s files registered with zero size in the FileCatalog." % len(terminal)) # Obtain the replicas from the FileCatalog res = self.__getFileReplicas(fileSizes.keys()) if not res['OK']: return res failed.update(res['Value']['Failed']) terminal = res['Value']['ZeroReplicas'] fileReplicas = res['Value']['Replicas'] if not fileReplicas: gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed determine replicas for any files' ) return S_OK() for lfn, reason in terminal.items(): for se, replicaID in replicas[lfn].items(): terminalReplicaIDs[replicaID] = reason replicas.pop(lfn) gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained replica information for %s file from the FileCatalog." % len(fileReplicas)) if terminal: gLogger.info( "RequestPreparation.prepareNewReplicas: %s files registered with zero replicas in the FileCatalog." 
% len(terminal)) # Check the replicas exist at the requested site replicaMetadata = [] for lfn, requestedSEs in replicas.items(): lfnReplicas = fileReplicas[lfn] for requestedSE, replicaID in requestedSEs.items(): if not requestedSE in lfnReplicas.keys(): terminalReplicaIDs[ replicaID] = "LFN not registered at requested SE" replicas[lfn].pop(requestedSE) else: replicaMetadata.append( (replicaID, lfnReplicas[requestedSE], fileSizes[lfn])) # Update the states of the files in the database if terminalReplicaIDs: gLogger.info( "RequestPreparation.prepareNewReplicas: %s replicas are terminally failed." % len(terminalReplicaIDs)) #res = self.stagerClient.updateReplicaFailure( terminalReplicaIDs ) res = self.storageDB.updateReplicaFailure(terminalReplicaIDs) if not res['OK']: gLogger.error( "RequestPreparation.prepareNewReplicas: Failed to update replica failures.", res['Message']) if replicaMetadata: gLogger.info( "RequestPreparation.prepareNewReplicas: %s replica metadata to be updated." % len(replicaMetadata)) # Sets the Status='Waiting' of CacheReplicas records that are OK with catalogue checks res = self.storageDB.updateReplicaInformation(replicaMetadata) if not res['OK']: gLogger.error( "RequestPreparation.prepareNewReplicas: Failed to update replica metadata.", res['Message']) return S_OK() def __getNewReplicas(self): """ This obtains the New replicas from the Replicas table and for each LFN the requested storage element """ # First obtain the New replicas from the CacheReplicas table res = self.storageDB.getCacheReplicas({'Status': 'New'}) if not res['OK']: gLogger.error( "RequestPreparation.__getNewReplicas: Failed to get replicas with New status.", res['Message']) return res if not res['Value']: gLogger.debug( "RequestPreparation.__getNewReplicas: No New replicas found to process." ) return S_OK() else: gLogger.debug( "RequestPreparation.__getNewReplicas: Obtained %s New replicas(s) to process." % len(res['Value'])) replicas = {} replicaIDs = {} for replicaID, info in res['Value'].items(): lfn = info['LFN'] storageElement = info['SE'] if not replicas.has_key(lfn): replicas[lfn] = {} replicas[lfn][storageElement] = replicaID replicaIDs[replicaID] = (lfn, storageElement) return S_OK({'Replicas': replicas, 'ReplicaIDs': replicaIDs}) def __getExistingFiles(self, lfns): """ This checks that the files exist in the FileCatalog. """ filesExist = [] missing = {} res = self.fileCatalog.exists(lfns) if not res['OK']: gLogger.error( "RequestPreparation.__getExistingFiles: Failed to determine whether files exist.", res['Message']) return res failed = res['Value']['Failed'] for lfn, exists in res['Value']['Successful'].items(): if exists: filesExist.append(lfn) else: missing[lfn] = 'LFN not registered in the FileCatalog' if missing: for lfn, reason in missing.items(): gLogger.warn( "RequestPreparation.__getExistingFiles: %s" % reason, lfn) self.__reportProblematicFiles(missing.keys(), 'LFN-LFC-DoesntExist') return S_OK({ 'Exist': filesExist, 'Missing': missing, 'Failed': failed }) def __getFileSize(self, lfns): """ This obtains the file size from the FileCatalog. 
""" failed = [] fileSizes = {} zeroSize = {} res = self.fileCatalog.getFileSize(lfns) if not res['OK']: gLogger.error( "RequestPreparation.__getFileSize: Failed to get sizes for files.", res['Message']) return res failed = res['Value']['Failed'] for lfn, size in res['Value']['Successful'].items(): if size == 0: zeroSize[ lfn] = "LFN registered with zero size in the FileCatalog" else: fileSizes[lfn] = size if zeroSize: for lfn, reason in zeroSize.items(): gLogger.warn("RequestPreparation.__getFileSize: %s" % reason, lfn) self.__reportProblematicFiles(zeroSize.keys(), 'LFN-LFC-ZeroSize') return S_OK({ 'FileSizes': fileSizes, 'ZeroSize': zeroSize, 'Failed': failed }) def __getFileReplicas(self, lfns): """ This obtains the replicas from the FileCatalog. """ replicas = {} noReplicas = {} res = self.fileCatalog.getReplicas(lfns) if not res['OK']: gLogger.error( "RequestPreparation.__getFileReplicas: Failed to obtain file replicas.", res['Message']) return res failed = res['Value']['Failed'] for lfn, lfnReplicas in res['Value']['Successful'].items(): if len(lfnReplicas.keys()) == 0: noReplicas[ lfn] = "LFN registered with zero replicas in the FileCatalog" else: replicas[lfn] = lfnReplicas if noReplicas: for lfn, reason in noReplicas.items(): gLogger.warn( "RequestPreparation.__getFileReplicas: %s" % reason, lfn) self.__reportProblematicFiles(noReplicas.keys(), 'LFN-LFC-NoReplicas') return S_OK({ 'Replicas': replicas, 'ZeroReplicas': noReplicas, 'Failed': failed }) def __reportProblematicFiles(self, lfns, reason): return S_OK() res = self.dataIntegrityClient.setFileProblematic( lfns, reason, self.name) if not res['OK']: gLogger.error( "RequestPreparation.__reportProblematicFiles: Failed to report missing files.", res['Message']) return res if res['Value']['Successful']: gLogger.info( "RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files." % len(res['Value']['Successful'])) if res['Value']['Failed']: gLogger.info( "RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files." % len(res['Value']['Failed'])) return res
class StageMonitorAgent( AgentModule ): def initialize( self ): self.replicaManager = ReplicaManager() #self.stagerClient = StorageManagerClient() self.dataIntegrityClient = DataIntegrityClient() self.storageDB = StorageManagementDB() # This sets the Default Proxy to used as that defined under # /Operations/Shifter/DataManager # the shifterProxy option in the Configuration can be used to change this default. self.am_setOption( 'shifterProxy', 'DataManager' ) return S_OK() def execute( self ): res = getProxyInfo( disableVOMS = True ) if not res['OK']: return res self.proxyInfoDict = res['Value'] res = self.monitorStageRequests() return res def monitorStageRequests( self ): """ This is the third logical task manages the StageSubmitted->Staged transition of the Replicas """ res = self.__getStageSubmittedReplicas() if not res['OK']: gLogger.fatal( "StageMonitor.monitorStageRequests: Failed to get replicas from StorageManagementDB.", res['Message'] ) return res if not res['Value']: gLogger.info( "StageMonitor.monitorStageRequests: There were no StageSubmitted replicas found" ) return res seReplicas = res['Value']['SEReplicas'] replicaIDs = res['Value']['ReplicaIDs'] gLogger.info( "StageMonitor.monitorStageRequests: Obtained %s StageSubmitted replicas for monitoring." % len( replicaIDs ) ) for storageElement, seReplicaIDs in seReplicas.items(): self.__monitorStorageElementStageRequests( storageElement, seReplicaIDs, replicaIDs ) gDataStoreClient.commit() return S_OK() def __monitorStorageElementStageRequests( self, storageElement, seReplicaIDs, replicaIDs ): terminalReplicaIDs = {} oldRequests = [] stagedReplicas = [] pfnRepIDs = {} pfnReqIDs = {} for replicaID in seReplicaIDs: pfn = replicaIDs[replicaID]['PFN'] pfnRepIDs[pfn] = replicaID requestID = replicaIDs[replicaID].get( 'RequestID', None ) if requestID: pfnReqIDs[pfn] = replicaIDs[replicaID]['RequestID'] gLogger.info( "StageMonitor.__monitorStorageElementStageRequests: Monitoring %s stage requests for %s." % ( len( pfnRepIDs ), storageElement ) ) oAccounting = DataOperation() oAccounting.setStartTime() res = self.replicaManager.getStorageFileMetadata( pfnReqIDs.keys(), storageElement ) if not res['OK']: gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Completely failed to monitor stage requests for replicas.", res['Message'] ) return prestageStatus = res['Value'] accountingDict = self.__newAccountingDict( storageElement ) for pfn, reason in prestageStatus['Failed'].items(): accountingDict['TransferTotal'] += 1 if re.search( 'File does not exist', reason ): gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: PFN did not exist in the StorageElement", pfn ) terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN did not exist in the StorageElement' for pfn, staged in prestageStatus['Successful'].items(): if staged and 'Cached' in staged and staged['Cached']: accountingDict['TransferTotal'] += 1 accountingDict['TransferOK'] += 1 accountingDict['TransferSize'] += staged['Size'] stagedReplicas.append( pfnRepIDs[pfn] ) if staged and 'Cached' in staged and not staged['Cached']: oldRequests.append( pfnRepIDs[pfn] ); #only ReplicaIDs oAccounting.setValuesFromDict( accountingDict ) oAccounting.setEndTime() gDataStoreClient.addRegister( oAccounting ) # Update the states of the replicas in the database if terminalReplicaIDs: gLogger.info( "StageMonitor.__monitorStorageElementStageRequests: %s replicas are terminally failed." 
% len( terminalReplicaIDs ) ) res = self.storageDB.updateReplicaFailure( terminalReplicaIDs ) if not res['OK']: gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Failed to update replica failures.", res['Message'] ) if stagedReplicas: gLogger.info( "StageMonitor.__monitorStorageElementStageRequests: %s staged replicas to be updated." % len( stagedReplicas ) ) res = self.storageDB.setStageComplete( stagedReplicas ) if not res['OK']: gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Failed to updated staged replicas.", res['Message'] ) res = self.storageDB.updateReplicaStatus( stagedReplicas, 'Staged' ) if not res['OK']: gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Failed to insert replica status.", res['Message'] ) if oldRequests: gLogger.info( "StageMonitor.__monitorStorageElementStageRequests: %s old requests will be retried." % len( oldRequests ) ) res = self.__wakeupOldRequests( oldRequests ) if not res['OK']: gLogger.error( "StageMonitor.__monitorStorageElementStageRequests: Failed to wakeup old requests.", res['Message'] ) return def __newAccountingDict( self, storageElement ): """ Generate a new accounting Dict """ accountingDict = {} accountingDict['OperationType'] = 'Stage' accountingDict['User'] = self.proxyInfoDict['username'] accountingDict['Protocol'] = 'Stager' accountingDict['RegistrationTime'] = 0.0 accountingDict['RegistrationOK'] = 0 accountingDict['RegistrationTotal'] = 0 accountingDict['FinalStatus'] = 'Successful' accountingDict['Source'] = storageElement accountingDict['Destination'] = storageElement accountingDict['ExecutionSite'] = siteName() accountingDict['TransferTotal'] = 0 accountingDict['TransferOK'] = 0 accountingDict['TransferSize'] = 0 accountingDict['TransferTime'] = self.am_getPollingTime() return accountingDict def __getStageSubmittedReplicas( self ): """ This obtains the StageSubmitted replicas from the Replicas table and the RequestID from the StageRequests table """ res = self.storageDB.getCacheReplicas( {'Status':'StageSubmitted'} ) if not res['OK']: gLogger.error( "StageMonitor.__getStageSubmittedReplicas: Failed to get replicas with StageSubmitted status.", res['Message'] ) return res if not res['Value']: gLogger.debug( "StageMonitor.__getStageSubmittedReplicas: No StageSubmitted replicas found to process." ) return S_OK() else: gLogger.debug( "StageMonitor.__getStageSubmittedReplicas: Obtained %s StageSubmitted replicas(s) to process." % len( res['Value'] ) ) seReplicas = {} replicaIDs = res['Value'] for replicaID, info in replicaIDs.items(): storageElement = info['SE'] if not seReplicas.has_key( storageElement ): seReplicas[storageElement] = [] seReplicas[storageElement].append( replicaID ) # RequestID was missing from replicaIDs dictionary BUGGY? 
res = self.storageDB.getStageRequests( {'ReplicaID':replicaIDs.keys()} ) if not res['OK']: return res if not res['Value']: return S_ERROR( 'Could not obtain request IDs for replicas %s from StageRequests table' % ( replicaIDs.keys() ) ) for replicaID, info in res['Value'].items(): reqID = info['RequestID'] replicaIDs[replicaID]['RequestID'] = reqID return S_OK( {'SEReplicas':seReplicas, 'ReplicaIDs':replicaIDs} ) def __reportProblematicFiles( self, lfns, reason ): return S_OK() res = self.dataIntegrityClient.setFileProblematic( lfns, reason, self.name ) if not res['OK']: gLogger.error( "StageMonitor.__reportProblematicFiles: Failed to report missing files.", res['Message'] ) return res if res['Value']['Successful']: gLogger.info( "StageMonitor.__reportProblematicFiles: Successfully reported %s missing files." % len( res['Value']['Successful'] ) ) if res['Value']['Failed']: gLogger.info( "StageMonitor.__reportProblematicFiles: Failed to report %s problematic files." % len( res['Value']['Failed'] ) ) return res def __wakeupOldRequests( self, oldRequests ): gLogger.info( "StageMonitor.__wakeupOldRequests: Attempting..." ) retryInterval = self.am_getOption( 'RetryIntervalHour', 2 ) res = self.storageDB.wakeupOldRequests( oldRequests, retryInterval ) if not res['OK']: gLogger.error( "StageMonitor.__wakeupOldRequests: Failed to resubmit old requests.", res['Message'] ) return res return S_OK()
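# The core of __monitorStorageElementStageRequests above is a three-way split
# of the prestage-monitoring result. A standalone sketch of that
# classification; the dictionary shapes follow the 'Successful'/'Failed'
# results handled above, while the helper name is invented:

def classifyPrestageResults( prestageStatus, pfnRepIDs ):
  """ Split monitored replicas into staged, still-migrating and terminal. """
  stagedReplicas = []
  oldRequests = []
  terminalReplicaIDs = {}
  for pfn, reason in prestageStatus['Failed'].items():
    if 'File does not exist' in reason:
      terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN did not exist in the StorageElement'
  for pfn, staged in prestageStatus['Successful'].items():
    if staged and staged.get( 'Cached' ):
      stagedReplicas.append( pfnRepIDs[pfn] )   # on disk: mark Staged
    elif staged and 'Cached' in staged:
      oldRequests.append( pfnRepIDs[pfn] )      # submitted, not yet cached: retry later
  return stagedReplicas, oldRequests, terminalReplicaIDs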
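# Taken together, the agents in this file drive each replica record through a
# small state machine. The table below summarizes the transitions implemented
# above; it is a reading aid built from this file, not DIRAC's actual schema:

REPLICA_TRANSITIONS = {
    'New':            ['Waiting', 'Failed'],            # RequestPreparationAgent
    'Waiting':        ['StageSubmitted', 'Offline', 'Failed'],  # StageRequestAgent
    'Offline':        ['StageSubmitted'],               # StageRequestAgent
    'StageSubmitted': ['Staged', 'Failed'],             # StageMonitorAgent
}

def isValidReplicaTransition( fromStatus, toStatus ):
  """ True if one of the agents above ever performs this transition. """
  return toStatus in REPLICA_TRANSITIONS.get( fromStatus, [] )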
class StageRequestAgent( AgentModule ): def initialize( self ): self.replicaManager = ReplicaManager() #self.stagerClient = StorageManagerClient() self.dataIntegrityClient = DataIntegrityClient() self.storageDB = StorageManagementDB() # pin lifetime = 1 day self.pinLifetime = self.am_getOption( 'PinLifetime', THROTTLING_TIME ) # Resources helper self.resources = Resources() # This sets the Default Proxy to used as that defined under # /Operations/Shifter/DataManager # the shifterProxy option in the Configuration can be used to change this default. self.am_setOption( 'shifterProxy', 'DataManager' ) return S_OK() def execute( self ): # Get the current submitted stage space and the amount of pinned space for each storage element res = self.getStorageUsage() if not res['OK']: return res return self.submitStageRequests() def getStorageUsage( self ): """ Fill the current Status of the SE Caches from the DB """ self.storageElementCache = {} res = self.storageDB.getSubmittedStagePins() if not res['OK']: gLogger.fatal( "StageRequest.getStorageUsage: Failed to obtain submitted requests from StorageManagementDB.", res['Message'] ) return res self.storageElementUsage = res['Value'] if self.storageElementUsage: gLogger.info( "StageRequest.getStorageUsage: Active stage/pin requests found at the following sites:" ) for storageElement in sortList( self.storageElementUsage.keys() ): seDict = self.storageElementUsage[storageElement] # Convert to GB for printout seDict['TotalSize'] = seDict['TotalSize'] / ( 1000 * 1000 * 1000.0 ) gLogger.info( "StageRequest.getStorageUsage: %s: %s replicas with a size of %.3f GB." % ( storageElement.ljust( 15 ), str( seDict['Replicas'] ).rjust( 6 ), seDict['TotalSize'] ) ) if not self.storageElementUsage: gLogger.info( "StageRequest.getStorageUsage: No active stage/pin requests found." 
) return S_OK() def submitStageRequests( self ): """ This manages the following transitions of the Replicas * Waiting -> Offline (if the file is not found Cached) * Waiting -> StageSubmitted (if the file is found Cached) * Offline -> StageSubmitted (if there are not more Waiting replicas) """ # Retry Replicas that have not been Staged in a previous attempt res = self._getMissingReplicas() if not res['OK']: gLogger.fatal( "StageRequest.submitStageRequests: Failed to get replicas from StorageManagementDB.", res['Message'] ) return res seReplicas = res['Value']['SEReplicas'] allReplicaInfo = res['Value']['AllReplicaInfo'] if seReplicas: gLogger.info( "StageRequest.submitStageRequests: Completing partially Staged Tasks" ) for storageElement, seReplicaIDs in seReplicas.items(): gLogger.debug( 'Staging at %s:' % storageElement, seReplicaIDs ) self._issuePrestageRequests( storageElement, seReplicaIDs, allReplicaInfo ) # Check Waiting Replicas and select those found Online and all other Replicas from the same Tasks res = self._getOnlineReplicas() if not res['OK']: gLogger.fatal( "StageRequest.submitStageRequests: Failed to get replicas from StorageManagementDB.", res['Message'] ) return res seReplicas = res['Value']['SEReplicas'] allReplicaInfo = res['Value']['AllReplicaInfo'] # Check Offline Replicas that fit in the Cache and all other Replicas from the same Tasks res = self._getOfflineReplicas() if not res['OK']: gLogger.fatal( "StageRequest.submitStageRequests: Failed to get replicas from StorageManagementDB.", res['Message'] ) return res # Merge info from both results for storageElement, seReplicaIDs in res['Value']['SEReplicas'].items(): if storageElement not in seReplicas: seReplicas[storageElement] = seReplicaIDs else: for replicaID in seReplicaIDs: if replicaID not in seReplicas[storageElement]: seReplicas[storageElement].append( replicaID ) allReplicaInfo.update( res['Value']['AllReplicaInfo'] ) gLogger.info( "StageRequest.submitStageRequests: Obtained %s replicas for staging." % len( allReplicaInfo ) ) for storageElement, seReplicaIDs in seReplicas.items(): gLogger.debug( 'Staging at %s:' % storageElement, seReplicaIDs ) self._issuePrestageRequests( storageElement, seReplicaIDs, allReplicaInfo ) return S_OK() def _getMissingReplicas( self ): """ This recovers Replicas that were not Staged on a previous attempt (the stage request failed or timed out), while other Replicas of the same task are already Staged. If left behind they can produce a deadlock. 
All SEs are considered, even if their Cache is full """ # Get Replicas that are in Staged/StageSubmitted gLogger.info( 'StageRequest._getMissingReplicas: Checking Staged Replicas' ) res = self.__getStagedReplicas() if not res['OK']: gLogger.fatal( "StageRequest._getMissingReplicas: Failed to get replicas from StorageManagementDB.", res['Message'] ) return res seReplicas = {} allReplicaInfo = res['Value']['AllReplicaInfo'] replicasToStage = [] for storageElement, seReplicaIDs in res['Value']['SEReplicas'].items(): # Consider all SEs replicasToStage.extend( seReplicaIDs ) # Get Replicas from the same Tasks as those selected res = self.__addAssociatedReplicas( replicasToStage, seReplicas, allReplicaInfo ) if not res['OK']: gLogger.fatal( "StageRequest._getMissingReplicas: Failed to get associated Replicas.", res['Message'] ) return res def _getOnlineReplicas( self ): """ This manages the transition * Waiting -> Offline (if the file is not found Cached) and returns the list of Cached Replicas for which the pin time has to be extended SEs for which the cache is currently full are not considered """ # Get all Replicas in Waiting Status associated to Staging Tasks gLogger.verbose( 'StageRequest._getOnlineReplicas: Checking Online Replicas to be handled' ) res = self.__getWaitingReplicas() if not res['OK']: gLogger.fatal( "StageRequest._getOnlineReplicas: Failed to get replicas from StorageManagementDB.", res['Message'] ) return res seReplicas = {} allReplicaInfo = res['Value']['AllReplicaInfo'] if not len( allReplicaInfo ): gLogger.info( "StageRequest._getOnlineReplicas: There were no Waiting replicas found" ) return res gLogger.info( "StageRequest._getOnlineReplicas: Obtained %s replicas Waiting for staging." % len( allReplicaInfo ) ) replicasToStage = [] for storageElement, seReplicaIDs in res['Value']['SEReplicas'].items(): if not self.__usage( storageElement ) < self.__cache( storageElement ): gLogger.info( 'StageRequest._getOnlineReplicas: Skipping %s, current usage above limit ( %s GB )' % ( storageElement, self.__cache( storageElement ) ) ) # Do not consider those SE that have the Cache full continue # Check if the Replica Metadata is OK and find out if they are Online or Offline res = self.__checkIntegrity( storageElement, seReplicaIDs, allReplicaInfo ) if not res['OK']: gLogger.error( 'StageRequest._getOnlineReplicas: Failed to check Replica Metadata', '(%s): %s' % ( storageElement, res['Message'] ) ) else: # keep only Online Replicas seReplicas[storageElement] = res['Value']['Online'] replicasToStage.extend( res['Value']['Online'] ) # Get Replicas from the same Tasks as those selected res = self.__addAssociatedReplicas( replicasToStage, seReplicas, allReplicaInfo ) if not res['OK']: gLogger.fatal( "StageRequest._getOnlineReplicas: Failed to get associated Replicas.", res['Message'] ) return res def _getOfflineReplicas( self ): """ This checks Replicas in Offline status and returns the list of Replicas to be Staged SEs for which the cache is currently full are not considered """ # Get all Replicas in Waiting Status associated to Staging Tasks gLogger.verbose( 'StageRequest._getOfflineReplicas: Checking Offline Replicas to be handled' ) res = self.__getOfflineReplicas() if not res['OK']: gLogger.fatal( "StageRequest._getOfflineReplicas: Failed to get replicas from StorageManagementDB.", res['Message'] ) return res seReplicas = {} allReplicaInfo = res['Value']['AllReplicaInfo'] if not len( allReplicaInfo ): gLogger.info( "StageRequest._getOfflineReplicas: There were no Offline replicas 
found" ) return res gLogger.info( "StageRequest._getOfflineReplicas: Obtained %s replicas Offline for staging." % len( allReplicaInfo ) ) replicasToStage = [] for storageElement, seReplicaIDs in res['Value']['SEReplicas'].items(): if not self.__usage( storageElement ) < self.__cache( storageElement ): gLogger.info( 'StageRequest._getOfflineReplicas: Skipping %s, current usage above limit ( %s GB )' % ( storageElement, self.__cache( storageElement ) ) ) # Do not consider those SE that have the Cache full continue seReplicas[storageElement] = [] for replicaID in sorted( seReplicaIDs ): seReplicas[storageElement].append( replicaID ) replicasToStage.append( replicaID ) self.__add( storageElement, allReplicaInfo[replicaID]['Size'] ) if not self.__usage( storageElement ) < self.__cache( storageElement ): # Stop adding Replicas when the cache is full break # Get Replicas from the same Tasks as those selected res = self.__addAssociatedReplicas( replicasToStage, seReplicas, allReplicaInfo ) if not res['OK']: gLogger.fatal( "StageRequest._getOfflineReplicas: Failed to get associated Replicas.", res['Message'] ) return res def __usage( self, storageElement ): """ Retrieve current usage of SE """ if not storageElement in self.storageElementUsage: self.storageElementUsage[storageElement] = {'TotalSize': 0.} return self.storageElementUsage[storageElement]['TotalSize'] def __cache( self, storageElement ): """ Retrieve cache size for SE """ if not storageElement in self.storageElementCache: diskCache = self.resources.getStorageElementValue( storageElement, 'DiskCacheTB', 1. ) self.storageElementCache[storageElement] = diskCache * 1000. / THROTTLING_STEPS return self.storageElementCache[storageElement] def __add( self, storageElement, size ): """ Add size (in bytes) to current usage of storageElement (in GB) """ if not storageElement in self.storageElementUsage: self.storageElementUsage[storageElement] = {'TotalSize': 0.} size = size / ( 1000 * 1000 * 1000.0 ) self.storageElementUsage[storageElement]['TotalSize'] += size return size def _issuePrestageRequests( self, storageElement, seReplicaIDs, allReplicaInfo ): """ Make the request to the SE and update the DB """ pfnRepIDs = {} for replicaID in seReplicaIDs: pfn = allReplicaInfo[replicaID]['PFN'] pfnRepIDs[pfn] = replicaID # Now issue the prestage requests for the remaining replicas stageRequestMetadata = {} updatedPfnIDs = [] if pfnRepIDs: gLogger.info( "StageRequest._issuePrestageRequests: Submitting %s stage requests for %s." % ( len( pfnRepIDs ), storageElement ) ) res = self.replicaManager.prestageStorageFile( pfnRepIDs.keys(), storageElement, lifetime = self.pinLifetime ) gLogger.debug( "StageRequest._issuePrestageRequests: replicaManager.prestageStorageFile: res=", res ) #Daniela: fishy result from ReplicaManager!!! 
Should NOT return OK #res= {'OK': True, 'Value': {'Successful': {}, 'Failed': {'srm://srm-lhcb.cern.ch/castor/cern.ch/grid/lhcb/data/2010/RAW/EXPRESS/LHCb/COLLISION10/71476/071476_0000000241.raw': ' SRM2Storage.__gfal_exec: Failed to perform gfal_prestage.[SE][BringOnline][SRM_INVALID_REQUEST] httpg://srm-lhcb.cern.ch:8443/srm/managerv2: User not able to access specified space token\n'}}} #res= {'OK': True, 'Value': {'Successful': {'srm://gridka-dCache.fzk.de/pnfs/gridka.de/lhcb/data/2009/RAW/FULL/LHCb/COLLISION09/63495/063495_0000000001.raw': '-2083846379'}, 'Failed': {}}} if not res['OK']: gLogger.error( "StageRequest._issuePrestageRequests: Completely failed to submit stage requests for replicas.", res['Message'] ) else: for pfn, requestID in res['Value']['Successful'].items(): if not stageRequestMetadata.has_key( requestID ): stageRequestMetadata[requestID] = [] stageRequestMetadata[requestID].append( pfnRepIDs[pfn] ) updatedPfnIDs.append( pfnRepIDs[pfn] ) if stageRequestMetadata: gLogger.info( "StageRequest._issuePrestageRequests: %s stage request metadata to be updated." % len( stageRequestMetadata ) ) res = self.storageDB.insertStageRequest( stageRequestMetadata, self.pinLifetime ) if not res['OK']: gLogger.error( "StageRequest._issuePrestageRequests: Failed to insert stage request metadata.", res['Message'] ) return res res = self.storageDB.updateReplicaStatus( updatedPfnIDs, 'StageSubmitted' ) if not res['OK']: gLogger.error( "StageRequest._issuePrestageRequests: Failed to insert replica status.", res['Message'] ) return def __sortBySE( self, replicaDict ): seReplicas = {} replicaIDs = {} for replicaID, info in replicaDict.items(): lfn = info['LFN'] storageElement = info['SE'] size = info['Size'] pfn = info['PFN'] replicaIDs[replicaID] = {'LFN':lfn, 'PFN':pfn, 'Size':size, 'StorageElement':storageElement} if not seReplicas.has_key( storageElement ): seReplicas[storageElement] = [] seReplicas[storageElement].append( replicaID ) return S_OK( {'SEReplicas':seReplicas, 'AllReplicaInfo':replicaIDs} ) def __getStagedReplicas( self ): """ This obtains the Staged replicas from the Replicas table and for each LFN the requested storage element """ # First obtain the Waiting replicas from the Replicas table res = self.storageDB.getStagedReplicas() if not res['OK']: gLogger.error( "StageRequest.__getStagedReplicas: Failed to get replicas with Waiting status.", res['Message'] ) return res if not res['Value']: gLogger.debug( "StageRequest.__getStagedReplicas: No Waiting replicas found to process." ) else: gLogger.debug( "StageRequest.__getStagedReplicas: Obtained %s Waiting replicas(s) to process." % len( res['Value'] ) ) return self.__sortBySE( res['Value'] ) def __getWaitingReplicas( self ): """ This obtains the Waiting replicas from the Replicas table and for each LFN the requested storage element """ # First obtain the Waiting replicas from the Replicas table res = self.storageDB.getWaitingReplicas() if not res['OK']: gLogger.error( "StageRequest.__getWaitingReplicas: Failed to get replicas with Waiting status.", res['Message'] ) return res if not res['Value']: gLogger.debug( "StageRequest.__getWaitingReplicas: No Waiting replicas found to process." ) else: gLogger.debug( "StageRequest.__getWaitingReplicas: Obtained %s Waiting replicas(s) to process." 
% len( res['Value'] ) ) return self.__sortBySE( res['Value'] ) def __getOfflineReplicas( self ): """ This obtains the Offline replicas from the Replicas table and for each LFN the requested storage element """ # First obtain the Waiting replicas from the Replicas table res = self.storageDB.getOfflineReplicas() if not res['OK']: gLogger.error( "StageRequest.__getOfflineReplicas: Failed to get replicas with Waiting status.", res['Message'] ) return res if not res['Value']: gLogger.debug( "StageRequest.__getOfflineReplicas: No Waiting replicas found to process." ) else: gLogger.debug( "StageRequest.__getOfflineReplicas: Obtained %s Waiting replicas(s) to process." % len( res['Value'] ) ) return self.__sortBySE( res['Value'] ) def __addAssociatedReplicas( self, replicasToStage, seReplicas, allReplicaInfo ): """ Retrieve the list of Replicas that belong to the same Tasks as the provided list """ res = self.storageDB.getAssociatedReplicas( replicasToStage ) if not res['OK']: gLogger.fatal( "StageRequest.__addAssociatedReplicas: Failed to get associated Replicas.", res['Message'] ) return res addReplicas = {'Offline': {}, 'Waiting': {}} replicaIDs = {} for replicaID, info in res['Value'].items(): lfn = info['LFN'] storageElement = info['SE'] size = info['Size'] pfn = info['PFN'] status = info['Status'] if status not in ['Waiting', 'Offline']: continue if not addReplicas[status].has_key( storageElement ): addReplicas[status][storageElement] = [] replicaIDs[replicaID] = {'LFN':lfn, 'PFN':pfn, 'Size':size, 'StorageElement':storageElement } addReplicas[status][storageElement].append( replicaID ) waitingReplicas = addReplicas['Waiting'] offlineReplicas = addReplicas['Offline'] newReplicaInfo = replicaIDs allReplicaInfo.update( newReplicaInfo ) # First handle Waiting Replicas for which metadata is to be checked for storageElement, seReplicaIDs in waitingReplicas.items(): for replicaID in list( seReplicaIDs ): if replicaID in replicasToStage: seReplicaIDs.remove( replicaID ) res = self.__checkIntegrity( storageElement, seReplicaIDs, allReplicaInfo ) if not res['OK']: gLogger.error( 'StageRequest.__addAssociatedReplicas: Failed to check Replica Metadata', '(%s): %s' % ( storageElement, res['Message'] ) ) else: # keep all Replicas (Online and Offline) if not storageElement in seReplicas: seReplicas[storageElement] = [] seReplicas[storageElement].extend( res['Value']['Online'] ) replicasToStage.extend( res['Value']['Online'] ) seReplicas[storageElement].extend( res['Value']['Offline'] ) replicasToStage.extend( res['Value']['Offline'] ) # Then handle Offline Replicas for which metadata is already checked for storageElement, seReplicaIDs in offlineReplicas.items(): if not storageElement in seReplicas: seReplicas[storageElement] = [] for replicaID in sorted( seReplicaIDs ): if replicaID in replicasToStage: seReplicaIDs.remove( replicaID ) seReplicas[storageElement].extend( seReplicaIDs ) replicasToStage.extend( seReplicaIDs ) for replicaID in allReplicaInfo.keys(): if replicaID not in replicasToStage: del allReplicaInfo[replicaID] totalSize = 0 for storageElement in sorted( seReplicas.keys() ): replicaIDs = seReplicas[storageElement] size = 0 for replicaID in replicaIDs: size += self.__add( storageElement, allReplicaInfo[replicaID]['Size'] ) gLogger.info( 'StageRequest.__addAssociatedReplicas: Considering %s GB to be staged at %s' % ( size, storageElement ) ) totalSize += size gLogger.info( "StageRequest.__addAssociatedReplicas: Obtained %s GB for staging." 
                 % totalSize )
    return S_OK( {'SEReplicas':seReplicas, 'AllReplicaInfo':allReplicaInfo} )

  def __checkIntegrity( self, storageElement, seReplicaIDs, allReplicaInfo ):
    """ Check the integrity of the files to ensure they are available.
        Updates the status of Offline Replicas for a later pass.
        Returns the list of Online replicas to be Staged.
    """
    if not seReplicaIDs:
      return S_OK( {'Online': [], 'Offline': []} )
    pfnRepIDs = {}
    for replicaID in seReplicaIDs:
      pfn = allReplicaInfo[replicaID]['PFN']
      pfnRepIDs[pfn] = replicaID
    gLogger.info( "StageRequest.__checkIntegrity: Checking the integrity of %s replicas at %s." % ( len( pfnRepIDs ), storageElement ) )
    res = self.replicaManager.getStorageFileMetadata( pfnRepIDs.keys(), storageElement )
    if not res['OK']:
      gLogger.error( "StageRequest.__checkIntegrity: Completely failed to obtain metadata for replicas.", res['Message'] )
      return res
    terminalReplicaIDs = {}
    onlineReplicaIDs = []
    offlineReplicaIDs = []
    for pfn, metadata in res['Value']['Successful'].items():
      if metadata['Size'] != allReplicaInfo[pfnRepIDs[pfn]]['Size']:
        gLogger.error( "StageRequest.__checkIntegrity: PFN StorageElement size does not match FileCatalog", pfn )
        terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN StorageElement size does not match FileCatalog'
        pfnRepIDs.pop( pfn )
      elif metadata['Lost']:
        gLogger.error( "StageRequest.__checkIntegrity: PFN has been Lost by the StorageElement", pfn )
        terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN has been Lost by the StorageElement'
        pfnRepIDs.pop( pfn )
      elif metadata['Unavailable']:
        gLogger.error( "StageRequest.__checkIntegrity: PFN is declared Unavailable by the StorageElement", pfn )
        terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN is declared Unavailable by the StorageElement'
        pfnRepIDs.pop( pfn )
      else:
        if metadata['Cached']:
          gLogger.verbose( "StageRequest.__checkIntegrity: Cache hit for file." )
          onlineReplicaIDs.append( pfnRepIDs[pfn] )
        else:
          offlineReplicaIDs.append( pfnRepIDs[pfn] )
    for pfn, reason in res['Value']['Failed'].items():
      if re.search( 'File does not exist', reason ):
        gLogger.error( "StageRequest.__checkIntegrity: PFN does not exist in the StorageElement", pfn )
        terminalReplicaIDs[pfnRepIDs[pfn]] = 'PFN does not exist in the StorageElement'
        pfnRepIDs.pop( pfn )
    # Update the states of the replicas in the database
    # TODO: Send status to the integrity DB
    if terminalReplicaIDs:
      gLogger.info( "StageRequest.__checkIntegrity: %s replicas are terminally failed." % len( terminalReplicaIDs ) )
      res = self.storageDB.updateReplicaFailure( terminalReplicaIDs )
      if not res['OK']:
        gLogger.error( "StageRequest.__checkIntegrity: Failed to update replica failures.", res['Message'] )
    if onlineReplicaIDs:
      gLogger.info( "StageRequest.__checkIntegrity: %s replicas found Online." % len( onlineReplicaIDs ) )
    if offlineReplicaIDs:
      gLogger.info( "StageRequest.__checkIntegrity: %s replicas found Offline." % len( offlineReplicaIDs ) )
      res = self.storageDB.updateReplicaStatus( offlineReplicaIDs, 'Offline' )
    return S_OK( {'Online': onlineReplicaIDs, 'Offline': offlineReplicaIDs} )

  def __reportProblematicFiles( self, lfns, reason ):
    # NOTE: reporting is currently disabled; this early return short-circuits
    # the DataIntegrityClient call below.
    return S_OK()
    res = self.dataIntegrityClient.setFileProblematic( lfns, reason, self.name )
    if not res['OK']:
      gLogger.error( "StageRequest.__reportProblematicFiles: Failed to report missing files.", res['Message'] )
      return res
    if res['Value']['Successful']:
      gLogger.info( "StageRequest.__reportProblematicFiles: Successfully reported %s missing files." % len( res['Value']['Successful'] ) )
    if res['Value']['Failed']:
      gLogger.info( "StageRequest.__reportProblematicFiles: Failed to report %s problematic files." % len( res['Value']['Failed'] ) )
    return res
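# _getOfflineReplicas above throttles staging against a per-SE cache budget
# via __usage/__cache/__add. A standalone sketch of that bookkeeping; the
# THROTTLING_STEPS value and the helper name are assumptions, while the GB
# conversions and the append-then-check loop follow the code above:

THROTTLING_STEPS = 10  # assumed for the example; defined module-level in the real file

def selectWithinCacheBudget( seReplicaIDs, sizesInBytes, diskCacheTB, usedGB = 0. ):
  """ Keep selecting replicas until the SE cache budget is exhausted. """
  capGB = diskCacheTB * 1000. / THROTTLING_STEPS
  selected = []
  for replicaID in sorted( seReplicaIDs ):
    selected.append( replicaID )
    usedGB += sizesInBytes[replicaID] / ( 1000 * 1000 * 1000.0 )
    if not usedGB < capGB:
      break  # cache full: stop adding replicas, as in _getOfflineReplicas
  return selected, usedGB

# Example: a 1 TB disk cache yields a 100 GB budget under these assumptions;
# like the agent, the loop may overshoot the budget by one replica.
selected, used = selectWithinCacheBudget( [1, 2], {1: 60e9, 2: 50e9}, 1. )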