def getFile(lfn, se=""):
    """Download a single LFN locally, preferring a given storage element.

    Each step is retried up to 5 times (replica lookup every 3 s, the
    actual download after a random 180-600 s back-off).

    :param str lfn: logical file name to download
    :param str se: optional SE name to try first; when empty or when the
                   file is not there, fall back to any SE via DataManager
    :return: S_OK({lfn: {'DownloadOK': code, 'Retry': attempts}}) where
             code is 1 (fetched from the requested SE) or 2 (fetched from
             a random SE); S_ERROR with the last error message otherwise
    """
    dm = DataManager()
    download_ok = 0
    get_active_replicas_ok = False
    lfn_on_se = False
    error_msg = ""

    if se:
        # Check whether the requested SE actually holds an active replica.
        for i in range(0, 5):
            result = dm.getActiveReplicas(lfn)
            if result["OK"] and result["Value"]["Successful"]:
                get_active_replicas_ok = True
                lfnReplicas = result["Value"]["Successful"]
                # BUGFIX: use .get() so a Successful dict that does not
                # contain this lfn no longer raises KeyError
                if se in lfnReplicas.get(lfn, {}):
                    lfn_on_se = True
                    break
            time.sleep(3)
            print("- Get replicas for %s failed, try again" % lfn)
        if not get_active_replicas_ok:
            return S_ERROR("Get replicas error: %s" % lfn)

    if lfn_on_se:
        # BUGFIX: keep the SE *name* in `se` instead of clobbering it with
        # the StorageElement object, so the messages below stay readable
        storageElement = StorageElement(se)
        # try 5 times
        for j in range(0, 5):
            result = storageElement.getFile(lfn)
            # BUGFIX: dict.has_key() is deprecated -> membership test
            if result["OK"] and lfn in result["Value"]["Successful"]:
                break
            time.sleep(random.randint(180, 600))
            print("- %s getStorageFile(%s) failed, try again" % (lfn, se))
        if result["OK"]:
            if lfn in result["Value"]["Successful"]:
                download_ok = 1
            else:
                error_msg = "Downloading %s from SE %s error!" % (lfn, se)
        else:
            error_msg = result["Message"]
    else:
        if se:
            # `i` is always bound here: the replica loop above ran
            print('File %s not found on SE "%s" after %s tries, trying other SE' % (lfn, se, i + 1))
        # try 5 times
        for j in range(0, 5):
            result = dm.getFile(lfn)
            if result["OK"] and lfn in result["Value"]["Successful"]:
                break
            time.sleep(random.randint(180, 600))
            print("- getFile(%s) failed, try again" % lfn)
        if result["OK"]:
            if lfn in result["Value"]["Successful"]:
                download_ok = 2
            else:
                error_msg = "Downloading %s from random SE error!" % lfn
        else:
            error_msg = result["Message"]

    if download_ok:
        return S_OK({lfn: {"DownloadOK": download_ok, "Retry": j + 1}})
    return S_ERROR(error_msg)
def getFilesToStage( lfnList ):
  """ Utility that returns out of a list of LFNs those files
      that are offline, and those for which at least one copy is online

  :param list lfnList: LFNs to check
  :return: S_OK( {'onlineLFNs': [lfn, ...], 'offlineLFNs': {se: [lfn, ...]}} )
  """
  onlineLFNs = set()
  offlineLFNsDict = {}
  if not lfnList:
    # nothing requested: return the empty skeleton
    return S_OK( {'onlineLFNs':list( onlineLFNs ), 'offlineLFNs': offlineLFNsDict} )
  dm = DataManager()
  lfnListReplicas = dm.getActiveReplicas( lfnList, getUrl = False )
  if not lfnListReplicas['OK']:
    return lfnListReplicas
  seToLFNs = dict()
  if lfnListReplicas['Value']['Failed']:
    return S_ERROR( "Failures in getting replicas" )
  # invert the LFN -> SEs mapping into SE -> LFNs so each SE is queried once
  for lfn, ld in lfnListReplicas['Value']['Successful'].iteritems():
    for se in ld:
      seToLFNs.setdefault( se, list() ).append( lfn )
  failed = {}
  for se, lfnsInSEList in seToLFNs.iteritems():
    fileMetadata = StorageElement( se ).getFileMetadata( lfnsInSEList )
    if not fileMetadata['OK']:
      # whole SE query failed: blame every LFN at this SE with the message
      failed[se] = dict.fromkeys( lfnsInSEList, fileMetadata['Message'] )
    else:
      # BUGFIX: only record a per-SE entry when there actually are failures,
      # otherwise an empty dict triggered spurious error logging below
      if fileMetadata['Value']['Failed']:
        failed[se] = fileMetadata['Value']['Failed']
      # is there at least one online?
      for lfn, mDict in fileMetadata['Value']['Successful'].iteritems():
        # BUGFIX: some SE plugins may omit the 'Cached' flag entirely;
        # record that as a failure instead of raising KeyError
        if 'Cached' not in mDict:
          failed.setdefault( se, {} )[lfn] = 'No Cached item returned as metadata'
        elif mDict['Cached']:
          onlineLFNs.add( lfn )
  # If the file was found staged, ignore possible errors, but print out errors
  if failed:
    for se, seFailed in failed.items():
      gLogger.error( "Errors when getting files metadata", 'at %s' % se )
      for lfn, reason in seFailed.items():
        gLogger.info( '%s: %s' % ( lfn, reason ) )
        if lfn in onlineLFNs:
          failed[se].pop( lfn )
      if not failed[se]:
        failed.pop( se )
    if failed:
      # avoid shadowing the lfnList parameter inside the comprehension
      return S_ERROR( 'Could not get metadata for %d files' % \
                      len( set( [lfn for seDict in failed.values() for lfn in seDict] ) ) )
  offlineLFNs = set( lfnList ) - onlineLFNs
  for offlineLFN in offlineLFNs:
    ses = lfnListReplicas['Value']['Successful'][offlineLFN].keys()
    # BUGFIX: an LFN with zero replicas used to crash on ses[0];
    # random.choice keeps the same "spread the load" behaviour
    if ses:
      offlineLFNsDict.setdefault( random.choice( ses ), list() ).append( offlineLFN )
  return S_OK( {'onlineLFNs':list( onlineLFNs ), 'offlineLFNs': offlineLFNsDict} )
def getFilesToStage( lfnList ):
  """ Utility that returns out of a list of LFNs those files
      that are offline, and those for which at least one copy is online

  :param list lfnList: LFNs to check
  :return: S_OK( {'onlineLFNs': [lfn, ...], 'offlineLFNs': {se: [lfn, ...]}} )
  """
  onlineLFNs = set()
  offlineLFNsDict = {}
  if not lfnList:
    # nothing requested: return the empty skeleton
    return S_OK( {'onlineLFNs':list( onlineLFNs ), 'offlineLFNs': offlineLFNsDict} )
  dm = DataManager()
  lfnListReplicas = dm.getActiveReplicas( lfnList, getUrl = False )
  if not lfnListReplicas['OK']:
    return lfnListReplicas
  seToLFNs = dict()
  if lfnListReplicas['Value']['Failed']:
    return S_ERROR( "Failures in getting replicas" )
  # invert the LFN -> SEs mapping into SE -> LFNs so each SE is queried once
  for lfn, ld in lfnListReplicas['Value']['Successful'].iteritems():
    for se in ld:
      seToLFNs.setdefault( se, list() ).append( lfn )
  failed = {}
  for se, lfnsInSEList in seToLFNs.iteritems():
    fileMetadata = StorageElement( se ).getFileMetadata( lfnsInSEList )
    if not fileMetadata['OK']:
      # whole SE query failed: blame every LFN at this SE with the message
      failed[se] = dict.fromkeys( lfnsInSEList, fileMetadata['Message'] )
    else:
      if fileMetadata['Value']['Failed']:
        failed[se] = fileMetadata['Value']['Failed']
      # is there at least one online?
      for lfn, mDict in fileMetadata['Value']['Successful'].iteritems():
        if 'Cached' not in mDict:
          # SE plugin returned no 'Cached' flag: treat as a metadata failure
          failed.setdefault( se, {} )[lfn] = 'No Cached item returned as metadata'
        elif mDict['Cached']:
          onlineLFNs.add( lfn )
  # If the file was found staged, ignore possible errors, but print out errors
  for se, seFailed in failed.items():
    gLogger.error( "Errors when getting files metadata", 'at %s' % se )
    for lfn, reason in seFailed.items():
      gLogger.info( '%s: %s' % ( lfn, reason ) )
      if lfn in onlineLFNs:
        # .items() yields a copy (Python 2 list), so popping here is safe
        failed[se].pop( lfn )
    if not failed[se]:
      failed.pop( se )
  if failed:
    gLogger.error( "Could not get metadata", "for %d files" % len( set( [lfn for lfnList in failed.values() for lfn in lfnList] ) ) )
    return S_ERROR( "Could not get metadata for files" )
  offlineLFNs = set( lfnList ) - onlineLFNs
  for offlineLFN in offlineLFNs:
    ses = lfnListReplicas['Value']['Successful'][offlineLFN].keys()
    # pick one SE at random to spread the staging load; an LFN with no
    # replicas is silently skipped here
    if ses:
      offlineLFNsDict.setdefault( random.choice( ses ), list() ).append( offlineLFN )
  return S_OK( {'onlineLFNs':list( onlineLFNs ), 'offlineLFNs': offlineLFNsDict} )
def doesFileExist(lfn):
    """Return True if `lfn` has at least one active replica registered.

    :param str lfn: logical file name
    :rtype: bool
    """
    from DIRAC.DataManagementSystem.Client.DataManager import DataManager
    dm = DataManager()
    result = dm.getActiveReplicas(lfn)
    # BUGFIX: an S_ERROR result has no 'Value' key, so indexing it crashed
    if not result['OK']:
        return False
    # BUGFIX: test membership of this specific LFN rather than whether the
    # Successful dict happens to be non-empty for *any* file
    return lfn in result['Value']['Successful']
def doesFileExist(lfn):
    """Check whether `lfn` is known to the catalog with an active replica.

    :param str lfn: logical file name
    :rtype: bool
    """
    from DIRAC.DataManagementSystem.Client.DataManager import DataManager
    replicaResult = DataManager().getActiveReplicas(lfn)
    # BUGFIX: guard the S_ERROR case first — it carries no 'Value' key and
    # the original lookup raised KeyError
    if not replicaResult['OK']:
        return False
    # BUGFIX: the original returned True whenever *any* file succeeded;
    # check this LFN explicitly
    return lfn in replicaResult['Value']['Successful']
def doesFileExist(lfn): from DIRAC.DataManagementSystem.Client.DataManager import DataManager dm = DataManager() result = dm.getActiveReplicas(lfn) if not result['OK']: print "ERROR",result['Message'] return False if lfn in result['Value']['Successful']: return True else: return False
def doesFileExist(lfn): from DIRAC.DataManagementSystem.Client.DataManager import DataManager dm = DataManager() result = dm.getActiveReplicas(lfn) if not result['OK']: print "ERROR", result['Message'] return False if lfn in result['Value']['Successful']: return True else: return False
def getFilesToStage(lfnList):
    """ Utility that returns out of a list of LFNs those files
        that are offline, and those for which at least one copy is online

    :param list lfnList: LFNs to check
    :return: S_OK({'onlineLFNs': [lfn, ...], 'offlineLFNs': {se: [lfn, ...]}})
    """
    onlineLFNs = set()
    offlineLFNsDict = {}
    if not lfnList:
        # nothing requested: return the empty skeleton
        return S_OK({ 'onlineLFNs': list(onlineLFNs), 'offlineLFNs': offlineLFNsDict })
    dm = DataManager()
    lfnListReplicas = dm.getActiveReplicas(lfnList)
    if not lfnListReplicas['OK']:
        return lfnListReplicas
    seObjectsDict = dict()
    seToLFNs = dict()
    if lfnListReplicas['Value']['Failed']:
        return S_ERROR("Failures in getting replicas")
    # invert the LFN -> SEs mapping, caching one StorageElement object per SE
    for lfn, ld in lfnListReplicas['Value']['Successful'].iteritems():
        for se in ld:  # iterate keys directly; values are unused
            seObjectsDict.setdefault(se, StorageElement(se))
            seToLFNs.setdefault(se, list()).append(lfn)
    for se, lfnsInSEList in seToLFNs.iteritems():
        fileMetadata = seObjectsDict[se].getFileMetadata(lfnsInSEList)
        if not fileMetadata['OK']:
            return fileMetadata
        if fileMetadata['Value']['Failed']:
            return S_ERROR("Failures in getting file metadata")
        # is there at least one online?
        for lfn, mDict in fileMetadata['Value']['Successful'].iteritems():
            # BUGFIX: .get() avoids a KeyError when the SE plugin returns
            # no 'Cached' flag in the metadata
            if mDict.get('Cached'):
                onlineLFNs.add(lfn)
    offlineLFNs = set(lfnList).difference(onlineLFNs)
    for offlineLFN in offlineLFNs:
        ses = lfnListReplicas['Value']['Successful'][offlineLFN].keys()
        # BUGFIX: an LFN with zero replicas used to crash on ses[0];
        # random.choice keeps the same load-spreading behaviour
        if ses:
            offlineLFNsDict.setdefault(random.choice(ses), list()).append(offlineLFN)
    return S_OK({ 'onlineLFNs': list(onlineLFNs), 'offlineLFNs': offlineLFNsDict })
def getFilesToStage( lfnList ):
  """ Utility that returns out of a list of LFNs those files
      that are offline, and those for which at least one copy is online

  :param list lfnList: LFNs to check
  :return: S_OK( {'onlineLFNs': [lfn, ...], 'offlineLFNs': {se: [lfn, ...]}} )
  """
  onlineLFNs = set()
  offlineLFNsDict = {}
  if not lfnList:
    # nothing requested: return the empty skeleton
    return S_OK( {'onlineLFNs':list( onlineLFNs ), 'offlineLFNs': offlineLFNsDict} )
  dm = DataManager()
  lfnListReplicas = dm.getActiveReplicas( lfnList )
  if not lfnListReplicas['OK']:
    return lfnListReplicas
  seObjectsDict = dict()
  seToLFNs = dict()
  if lfnListReplicas['Value']['Failed']:
    return S_ERROR( "Failures in getting replicas" )
  # invert the LFN -> SEs mapping, caching one StorageElement object per SE
  for lfn, ld in lfnListReplicas['Value']['Successful'].iteritems():
    for se in ld:  # iterate keys directly; values are unused
      seObjectsDict.setdefault( se, StorageElement( se ) )
      seToLFNs.setdefault( se, list() ).append( lfn )
  for se, lfnsInSEList in seToLFNs.iteritems():
    fileMetadata = seObjectsDict[se].getFileMetadata( lfnsInSEList )
    if not fileMetadata['OK']:
      return fileMetadata
    if fileMetadata['Value']['Failed']:
      return S_ERROR( "Failures in getting file metadata" )
    # is there at least one online?
    for lfn, mDict in fileMetadata['Value']['Successful'].iteritems():
      # BUGFIX: .get() avoids a KeyError when the SE plugin returns no
      # 'Cached' flag in the metadata
      if mDict.get( 'Cached' ):
        onlineLFNs.add( lfn )
  offlineLFNs = set( lfnList ).difference( onlineLFNs )
  for offlineLFN in offlineLFNs:
    ses = lfnListReplicas['Value']['Successful'][offlineLFN].keys()
    # BUGFIX: an LFN with zero replicas used to crash on ses[0];
    # random.choice keeps the same load-spreading behaviour
    if ses:
      offlineLFNsDict.setdefault( random.choice( ses ), list() ).append( offlineLFN )
  return S_OK( {'onlineLFNs':list( onlineLFNs ), 'offlineLFNs': offlineLFNsDict} )
class RequestPreparationAgent(AgentModule):
    """Agent driving the New -> Waiting transition of staging replicas.

    Validates each New replica against the FileCatalog (existence, size,
    replica presence at the requested SE) and updates the StagerDB state
    accordingly.
    """

    def initialize(self):
        # Clients used throughout the agent cycle
        self.fileCatalog = FileCatalog()
        self.dm = DataManager()
        self.stagerClient = StorageManagerClient()
        self.dataIntegrityClient = DataIntegrityClient()
        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption("shifterProxy", "DataManager")
        return S_OK()

    def execute(self):
        """This is the first logical task to be executed and manages the New->Waiting transition of the Replicas"""
        res = self.__getNewReplicas()
        if not res["OK"]:
            gLogger.fatal(
                "RequestPreparation.prepareNewReplicas: Failed to get replicas from StagerDB.", res["Message"]
            )
            return res
        if not res["Value"]:
            gLogger.info("There were no New replicas found")
            return res
        replicas = res["Value"]["Replicas"]
        replicaIDs = res["Value"]["ReplicaIDs"]
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained %s New replicas for preparation." % len(replicaIDs)
        )

        # Check if the files exist in the FileCatalog
        res = self.__getExistingFiles(replicas)
        if not res["OK"]:
            return res
        exist = res["Value"]["Exist"]
        terminal = res["Value"]["Missing"]
        failed = res["Value"]["Failed"]
        if not exist:
            gLogger.error("RequestPreparation.prepareNewReplicas: Failed to determine the existence of any file")
            return S_OK()
        # Missing files terminate every replica request that references them
        terminalReplicaIDs = {}
        for lfn, reason in terminal.items():
            for replicaID in replicas[lfn].values():
                terminalReplicaIDs[replicaID] = reason
            replicas.pop(lfn)
        gLogger.info("RequestPreparation.prepareNewReplicas: %s files exist in the FileCatalog." % len(exist))
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files do not exist in the FileCatalog."
                % len(terminal)
            )

        # Obtain the file sizes from the FileCatalog
        res = self.__getFileSize(exist)
        if not res["OK"]:
            return res
        failed.update(res["Value"]["Failed"])
        terminal = res["Value"]["ZeroSize"]
        fileSizes = res["Value"]["FileSizes"]
        if not fileSizes:
            gLogger.error("RequestPreparation.prepareNewReplicas: Failed determine sizes of any files")
            return S_OK()
        # Zero-size files are also terminal for their replica requests
        for lfn, reason in terminal.items():
            for _se, replicaID in replicas[lfn].items():
                terminalReplicaIDs[replicaID] = reason
            replicas.pop(lfn)
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained %s file sizes from the FileCatalog." % len(fileSizes)
        )
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files registered with zero size in the FileCatalog."
                % len(terminal)
            )

        # Obtain the replicas from the FileCatalog
        res = self.__getFileReplicas(list(fileSizes))
        if not res["OK"]:
            return res
        failed.update(res["Value"]["Failed"])
        terminal = res["Value"]["ZeroReplicas"]
        fileReplicas = res["Value"]["Replicas"]
        if not fileReplicas:
            gLogger.error("RequestPreparation.prepareNewReplicas: Failed determine replicas for any files")
            return S_OK()
        # Files with zero replicas are terminal as well
        for lfn, reason in terminal.items():
            for _se, replicaID in replicas[lfn].items():
                terminalReplicaIDs[replicaID] = reason
            replicas.pop(lfn)
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained replica information for %s file from the FileCatalog."
            % len(fileReplicas)
        )
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files registered with zero replicas in the FileCatalog."
                % len(terminal)
            )

        # Check the replicas exist at the requested site
        replicaMetadata = []
        for lfn, requestedSEs in replicas.items():
            lfnReplicas = fileReplicas.get(lfn)

            # This should not happen in principle, but it was seen
            # after a corrupted staging request has entered the DB
            if not lfnReplicas:
                gLogger.error("Missing replicas information", "%s %s" % (lfn, requestedSEs))
                continue

            for requestedSE, replicaID in requestedSEs.items():
                if requestedSE not in lfnReplicas.keys():
                    terminalReplicaIDs[replicaID] = "LFN not registered at requested SE"
                    replicas[lfn].pop(requestedSE)
                else:
                    replicaMetadata.append((replicaID, lfnReplicas[requestedSE], fileSizes[lfn]))

        # Update the states of the files in the database
        if terminalReplicaIDs:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s replicas are terminally failed." % len(terminalReplicaIDs)
            )
            # res = self.stagerClient.updateReplicaFailure( terminalReplicaIDs )
            res = self.stagerClient.updateReplicaFailure(terminalReplicaIDs)
            if not res["OK"]:
                gLogger.error(
                    "RequestPreparation.prepareNewReplicas: Failed to update replica failures.", res["Message"]
                )
        if replicaMetadata:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s replica metadata to be updated."
                % len(replicaMetadata)
            )
            # Sets the Status='Waiting' of CacheReplicas records that are OK with catalogue checks
            res = self.stagerClient.updateReplicaInformation(replicaMetadata)
            if not res["OK"]:
                gLogger.error(
                    "RequestPreparation.prepareNewReplicas: Failed to update replica metadata.", res["Message"]
                )
        return S_OK()

    def __getNewReplicas(self):
        """This obtains the New replicas from the Replicas table and for each LFN the requested storage element"""
        # First obtain the New replicas from the CacheReplicas table
        res = self.stagerClient.getCacheReplicas({"Status": "New"})
        if not res["OK"]:
            gLogger.error(
                "RequestPreparation.__getNewReplicas: Failed to get replicas with New status.", res["Message"]
            )
            return res
        if not res["Value"]:
            gLogger.debug("RequestPreparation.__getNewReplicas: No New replicas found to process.")
            return S_OK()
        else:
            gLogger.debug(
                "RequestPreparation.__getNewReplicas: Obtained %s New replicas(s) to process." % len(res["Value"])
            )
        # Build both lookup directions: lfn -> {se: replicaID} and replicaID -> (lfn, se)
        replicas = {}
        replicaIDs = {}
        for replicaID, info in res["Value"].items():
            lfn = info["LFN"]
            storageElement = info["SE"]
            replicas.setdefault(lfn, {})[storageElement] = replicaID
            replicaIDs[replicaID] = (lfn, storageElement)
        return S_OK({"Replicas": replicas, "ReplicaIDs": replicaIDs})

    def __getExistingFiles(self, lfns):
        """This checks that the files exist in the FileCatalog."""
        res = self.fileCatalog.exists(list(set(lfns)))
        if not res["OK"]:
            gLogger.error(
                "RequestPreparation.__getExistingFiles: Failed to determine whether files exist.", res["Message"]
            )
            return res
        failed = res["Value"]["Failed"]
        success = res["Value"]["Successful"]
        exist = [lfn for lfn, exists in success.items() if exists]
        missing = list(set(success) - set(exist))
        if missing:
            reason = "LFN not registered in the FC"
            gLogger.warn("RequestPreparation.__getExistingFiles: %s" % reason, "\n".join([""] + missing))
            self.__reportProblematicFiles(missing, "LFN-LFC-DoesntExist")
            missing = dict.fromkeys(missing, reason)
        else:
            missing = {}
        return S_OK({"Exist": exist, "Missing": missing, "Failed": failed})

    def __getFileSize(self, lfns):
        """This obtains the file size from the FileCatalog."""
        fileSizes = {}
        zeroSize = {}
        res = self.fileCatalog.getFileSize(lfns)
        if not res["OK"]:
            gLogger.error("RequestPreparation.__getFileSize: Failed to get sizes for files.", res["Message"])
            return res
        failed = res["Value"]["Failed"]
        for lfn, size in res["Value"]["Successful"].items():
            if size == 0:
                zeroSize[lfn] = "LFN registered with zero size in the FileCatalog"
            else:
                fileSizes[lfn] = size
        if zeroSize:
            for lfn, reason in zeroSize.items():
                gLogger.warn("RequestPreparation.__getFileSize: %s" % reason, lfn)
            self.__reportProblematicFiles(zeroSize.keys(), "LFN-LFC-ZeroSize")
        return S_OK({"FileSizes": fileSizes, "ZeroSize": zeroSize, "Failed": failed})

    def __getFileReplicas(self, lfns):
        """This obtains the replicas from the FileCatalog."""
        replicas = {}
        noReplicas = {}
        res = self.dm.getActiveReplicas(lfns)
        if not res["OK"]:
            gLogger.error("RequestPreparation.__getFileReplicas: Failed to obtain file replicas.", res["Message"])
            return res
        failed = res["Value"]["Failed"]
        for lfn, lfnReplicas in res["Value"]["Successful"].items():
            if len(lfnReplicas) == 0:
                noReplicas[lfn] = "LFN registered with zero replicas in the FileCatalog"
            else:
                replicas[lfn] = lfnReplicas
        if noReplicas:
            for lfn, reason in noReplicas.items():
                gLogger.warn("RequestPreparation.__getFileReplicas: %s" % reason, lfn)
            self.__reportProblematicFiles(list(noReplicas), "LFN-LFC-NoReplicas")
        return S_OK({"Replicas": replicas, "ZeroReplicas": noReplicas, "Failed": failed})

    def __reportProblematicFiles(self, lfns, reason):
        # NOTE(review): the early return below deliberately short-circuits the
        # reporting to the DataIntegrityClient — everything after it is
        # unreachable dead code, kept as-is. Confirm whether this disable is
        # still intended or the early return should be removed.
        return S_OK()
        res = self.dataIntegrityClient.setFileProblematic(lfns, reason, sourceComponent="RequestPreparationAgent")
        if not res["OK"]:
            gLogger.error(
                "RequestPreparation.__reportProblematicFiles: Failed to report missing files.", res["Message"]
            )
            return res
        if res["Value"]["Successful"]:
            gLogger.info(
                "RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files."
                % len(res["Value"]["Successful"])
            )
        if res["Value"]["Failed"]:
            gLogger.info(
                "RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files."
                % len(res["Value"]["Failed"])
            )
        return res
class FTSClient( Client ):
  """ .. class:: FTSClient

  Thin client wrapping the DataManagement/FTSManager RPC service; most
  methods forward the call and de-serialize the returned dictionaries into
  FTSFile / FTSJob / FTSHistoryView objects.
  """

  def __init__( self, useCertificates = False ):
    """c'tor

    :param self: self reference
    :param bool useCertificates: flag to enable/disable certificates
    """
    Client.__init__( self )
    self.log = gLogger.getSubLogger( "DataManagement/FTSClient" )
    self.setServer( "DataManagement/FTSManager" )
    # getting other clients
    self.ftsValidator = FTSValidator()
    self.dataManager = DataManager()
    self.storageFactory = StorageFactory()
    url = PathFinder.getServiceURL( "DataManagement/FTSManager" )
    if not url:
      raise RuntimeError( "CS option DataManagement/FTSManager URL is not set!" )
    self.ftsManager = RPCClient( url )

  def getFTSFileList( self, statusList = None, limit = None ):
    """ get list of FTSFiles with status in statusList

    :param list statusList: FTSFile statuses to select (default: ["Waiting"])
    :param int limit: maximum number of records (default: 1000)
    :return: S_OK( [ FTSFile, ... ] ) or the error from the service
    """
    statusList = statusList if statusList else [ "Waiting" ]
    limit = limit if limit else 1000
    getFTSFileList = self.ftsManager.getFTSFileList( statusList, limit )
    if not getFTSFileList['OK']:
      self.log.error( "getFTSFileList: %s" % getFTSFileList['Message'] )
      return getFTSFileList
    getFTSFileList = getFTSFileList['Value']
    return S_OK( [ FTSFile( ftsFile ) for ftsFile in getFTSFileList ] )

  def getFTSJobList( self, statusList = None, limit = None ):
    """ get FTSJobs wit statues in :statusList:

    :param list statusList: FTSJob statuses (default: init + transfer states)
    :param int limit: maximum number of records (default: 500)
    :return: S_OK( [ FTSJob, ... ] ) or the error from the service
    """
    statusList = statusList if statusList else list( FTSJob.INITSTATES + FTSJob.TRANSSTATES )
    limit = limit if limit else 500
    getFTSJobList = self.ftsManager.getFTSJobList( statusList, limit )
    if not getFTSJobList['OK']:
      self.log.error( "getFTSJobList: %s" % getFTSJobList['Message'] )
      return getFTSJobList
    getFTSJobList = getFTSJobList['Value']
    return S_OK( [ FTSJob( ftsJobDict ) for ftsJobDict in getFTSJobList ] )

  def getFTSFilesForRequest( self, requestID, statusList = None ):
    """ read FTSFiles for a given :requestID:

    :param int requestID: ReqDB.Request.RequestID
    :param list statusList: List of statuses (default: Waiting)
    """
    ftsFiles = self.ftsManager.getFTSFilesForRequest( requestID, statusList )
    if not ftsFiles['OK']:
      self.log.error( "getFTSFilesForRequest: %s" % ftsFiles['Message'] )
      return ftsFiles
    return S_OK( [ FTSFile( ftsFileDict ) for ftsFileDict in ftsFiles['Value'] ] )

  def getAllFTSFilesForRequest( self, requestID ):
    """ read FTSFiles for a given :requestID:

    :param int requestID: ReqDB.Request.RequestID
    """
    ftsFiles = self.ftsManager.getAllFTSFilesForRequest( requestID )
    if not ftsFiles['OK']:
      self.log.error( "getFTSFilesForRequest: %s" % ftsFiles['Message'] )
      return ftsFiles
    return S_OK( [ FTSFile( ftsFileDict ) for ftsFileDict in ftsFiles['Value'] ] )

  def getFTSJobsForRequest( self, requestID, statusList = None ):
    """ get list of FTSJobs with statues in :statusList: given requestID

    :param int requestID: ReqDB.Request.RequestID
    :param list statusList: list with FTSJob statuses

    :return: [ FTSJob, FTSJob, ... ]
    """
    statusList = statusList if statusList else list( FTSJob.INITSTATES + FTSJob.TRANSSTATES )
    getJobs = self.ftsManager.getFTSJobsForRequest( requestID, statusList )
    if not getJobs['OK']:
      self.log.error( "getFTSJobsForRequest: %s" % getJobs['Message'] )
      return getJobs
    return S_OK( [ FTSJob( ftsJobDict ) for ftsJobDict in getJobs['Value'] ] )

  def getFTSFile( self, ftsFileID = None ):
    """ get FTSFile

    :param int ftsFileID: FTSFileID
    """
    getFile = self.ftsManager.getFTSFile( ftsFileID )
    if not getFile['OK']:
      # NOTE(review): error is only logged; execution continues and the
      # method falls through to an implicit None return — confirm callers
      # tolerate a non-S_OK/None result here.
    self.log.error( getFile['Message'] )
    # # de-serialize
    if getFile['Value']:
      ftsFile = FTSFile( getFile['Value'] )
      return S_OK( ftsFile )

  def putFTSJob( self, ftsJob ):
    """ put FTSJob into FTSDB

    :param FTSJob ftsJob: FTSJob instance
    """
    isValid = self.ftsValidator.validate( ftsJob )
    if not isValid['OK']:
      self.log.error( isValid['Message'] )
      return isValid
    ftsJobJSON = ftsJob.toJSON()
    if not ftsJobJSON['OK']:
      self.log.error( ftsJobJSON['Message'] )
      return ftsJobJSON
    return self.ftsManager.putFTSJob( ftsJobJSON['Value'] )

  def getFTSJob( self, ftsJobID ):
    """ get FTS job, change its status to 'Assigned'

    :param int ftsJobID: FTSJobID
    """
    getJob = self.ftsManager.getFTSJob( ftsJobID )
    if not getJob['OK']:
      self.log.error( getJob['Message'] )
      return getJob
    # mark the job as taken; a failure here is logged but not fatal
    setStatus = self.ftsManager.setFTSJobStatus( ftsJobID, 'Assigned' )
    if not setStatus['OK']:
      self.log.error( setStatus['Message'] )
    # # de-serialize
    # if getJob['Value']:
    #  getJob = FTSJob( getJob['Value'] )
    return getJob

  def peekFTSJob( self, ftsJobID ):
    """ just peek FTSJob

    :param int ftsJobID: FTSJobID
    """
    getJob = self.ftsManager.getFTSJob( ftsJobID )
    if not getJob['OK']:
      self.log.error( getJob['Message'] )
      return getJob
    return getJob

  def deleteFTSJob( self, ftsJobID ):
    """ delete FTSJob into FTSDB

    :param int ftsJob: FTSJobID
    """
    deleteJob = self.ftsManager.deleteFTSJob( ftsJobID )
    if not deleteJob['OK']:
      self.log.error( deleteJob['Message'] )
    return deleteJob

  def getFTSJobIDs( self, statusList = None ):
    """ get list of FTSJobIDs for a given status list

    :param list statusList: FTSJob statuses (default: Submitted/Ready/Active)
    :return: the raw service result (S_OK with IDs, or S_ERROR)
    """
    statusList = statusList if statusList else [ "Submitted", "Ready", "Active" ]
    ftsJobIDs = self.ftsManager.getFTSJobIDs( statusList )
    if not ftsJobIDs['OK']:
      self.log.error( ftsJobIDs['Message'] )
    return ftsJobIDs

  def getFTSFileIDs( self, statusList = None ):
    """ get list of FTSFileIDs for a given status list

    :param list statusList: FTSFile statuses (default: ["Waiting"])
    :return: the raw service result (S_OK with IDs, or S_ERROR)
    """
    statusList = statusList if statusList else [ "Waiting" ]
    ftsFileIDs = self.ftsManager.getFTSFileIDs( statusList )
    if not ftsFileIDs['OK']:
      self.log.error( ftsFileIDs['Message'] )
    return ftsFileIDs

  def getFTSHistory( self ):
    """ get FTS history snapshot

    :return: S_OK( [ FTSHistoryView, ... ] ) or the error from the service
    """
    getFTSHistory = self.ftsManager.getFTSHistory()
    if not getFTSHistory['OK']:
      self.log.error( getFTSHistory['Message'] )
      return getFTSHistory
    getFTSHistory = getFTSHistory['Value']
    return S_OK( [ FTSHistoryView( ftsHistory ) for ftsHistory in getFTSHistory ] )

  def getDBSummary( self ):
    """ get FTDB summary

    :return: the raw service result, error already logged
    """
    dbSummary = self.ftsManager.getDBSummary()
    if not dbSummary['OK']:
      self.log.error( "getDBSummary: %s" % dbSummary['Message'] )
    return dbSummary

  def setFTSFilesWaiting( self, operationID, sourceSE, opFileIDList = None ):
    """ update status for waiting FTSFiles from 'Waiting#SourceSE' to 'Waiting'

    :param int operationID: ReqDB.Operation.OperationID
    :param str sourceSE: source SE name
    :param opFileIDList: [ ReqDB.File.FileID, ... ]
    """
    return self.ftsManager.setFTSFilesWaiting( operationID, sourceSE, opFileIDList )

  def deleteFTSFiles( self, operationID, opFileIDList = None ):
    """ delete FTSFiles for rescheduling

    :param int operationID: ReqDB.Operation.OperationID
    :param list opFileIDList: [ ReqDB.File.FileID, ... ]
    """
    return self.ftsManager.deleteFTSFiles( operationID, opFileIDList )

  def ftsSchedule( self, requestID, operationID, opFileList ):
    """ schedule lfn for FTS job

    :param int requestID: RequestDB.Request.RequestID
    :param int operationID: RequestDB.Operation.OperationID
    :param list opFileList: list of tuples ( File.toJSON()['Value'], sourcesList, targetList )
    :return: S_OK( { "Successful": [fileID, ...], "Failed": {fileID: reason} } )
    """
    # clear any stale FTSFiles for these file IDs before scheduling
    fileIDs = [int( fileJSON.get( 'FileID', 0 ) ) for fileJSON, _sourceSEs, _targetSEs in opFileList ]
    res = self.ftsManager.cleanUpFTSFiles( requestID, fileIDs )
    if not res['OK']:
      self.log.error( "ftsSchedule: %s" % res['Message'] )
      return S_ERROR( "ftsSchedule: %s" % res['Message'] )
    ftsFiles = []
    # # this will be returned on success
    result = { "Successful": [], "Failed": {} }
    for fileJSON, sourceSEs, targetSEs in opFileList:
      lfn = fileJSON.get( "LFN", "" )
      size = int( fileJSON.get( "Size", 0 ) )
      fileID = int( fileJSON.get( "FileID", 0 ) )
      opID = int( fileJSON.get( "OperationID", 0 ) )
      self.log.verbose( "ftsSchedule: LFN=%s FileID=%s OperationID=%s sources=%s targets=%s" % ( lfn, fileID, opID,
                                                                                                 sourceSEs, targetSEs ) )
      res = self.dataManager.getActiveReplicas( lfn )
      if not res['OK']:
        self.log.error( "ftsSchedule: %s" % res['Message'] )
        result["Failed"][fileID] = res['Message']
        continue
      replicaDict = res['Value']
      if lfn in replicaDict["Failed"] and lfn not in replicaDict["Successful"]:
        result["Failed"][fileID] = "no active replicas found"
        continue
      replicaDict = replicaDict["Successful"].get( lfn, {} )
      # # use valid replicas only
      validReplicasDict = dict( [ ( se, pfn ) for se, pfn in replicaDict.items() if se in sourceSEs ] )
      if not validReplicasDict:
        self.log.warn( "No active replicas found in sources" )
        result["Failed"][fileID] = "no active replicas found in sources"
        continue
      # ask the service for the replication strategy tree for this file
      tree = self.ftsManager.getReplicationTree( sourceSEs, targetSEs, size )
      if not tree['OK']:
        self.log.error( "ftsSchedule: %s cannot be scheduled: %s" % ( lfn, tree['Message'] ) )
        result["Failed"][fileID] = tree['Message']
        continue
      tree = tree['Value']
      self.log.verbose( "LFN=%s tree=%s" % ( lfn, tree ) )
      # one FTSFile per hop in the replication tree
      for repDict in tree.values():
        self.log.verbose( "Strategy=%s Ancestor=%s SourceSE=%s TargetSE=%s" % ( repDict["Strategy"], repDict["Ancestor"],
                                                                                repDict["SourceSE"], repDict["TargetSE"] ) )
        transferSURLs = self._getTransferURLs( lfn, repDict, sourceSEs, validReplicasDict )
        if not transferSURLs['OK']:
          result["Failed"][fileID] = transferSURLs['Message']
          continue
        sourceSURL, targetSURL, fileStatus = transferSURLs['Value']
        if sourceSURL == targetSURL:
          result["Failed"][fileID] = "sourceSURL equals to targetSURL for %s" % lfn
          continue
        self.log.verbose( "sourceURL=%s targetURL=%s FTSFile.Status=%s" % ( sourceSURL, targetSURL, fileStatus ) )
        ftsFile = FTSFile()
        for key in ( "LFN", "FileID", "OperationID", "Checksum", "ChecksumType", "Size" ):
          if fileJSON.get( key ):
            setattr( ftsFile, key, fileJSON.get( key ) )
        ftsFile.RequestID = requestID
        ftsFile.OperationID = operationID
        ftsFile.SourceSURL = sourceSURL
        ftsFile.TargetSURL = targetSURL
        ftsFile.SourceSE = repDict["SourceSE"]
        ftsFile.TargetSE = repDict["TargetSE"]
        ftsFile.Status = fileStatus
        ftsFiles.append( ftsFile )
    if not ftsFiles:
      self.log.info( "ftsSchedule: no FTSFiles to put for request %d" % requestID )
      return S_OK( result )
    ftsFilesJSONList = [ftsFile.toJSON()['Value'] for ftsFile in ftsFiles]
    res = self.ftsManager.putFTSFileList( ftsFilesJSONList )
    if not res['OK']:
      self.log.error( "ftsSchedule: %s" % res['Message'] )
      return S_ERROR( "ftsSchedule: %s" % res['Message'] )
    result['Successful'] += [ fileID for fileID in fileIDs if fileID not in result['Failed']]
    # # if we land here some files have been properly scheduled
    return S_OK( result )

  ################################################################################################################
  # Some utilities function

  def _getSurlForLFN( self, targetSE, lfn ):
    """ Get the targetSURL for the storage and LFN supplied.

    :param self: self reference
    :param str targetSE: target SE
    :param str lfn: LFN
    """
    res = self.storageFactory.getStorages( targetSE, protocolList = ["SRM2"] )
    if not res['OK']:
      errStr = "_getSurlForLFN: Failed to create SRM2 storage for %s: %s" % ( targetSE, res['Message'] )
      self.log.error( errStr )
      return S_ERROR( errStr )
    storageObjects = res['Value']["StorageObjects"]
    # first storage object that yields a URL wins
    for storageObject in storageObjects:
      res = storageObject.getCurrentURL( lfn )
      if res['OK']:
        return res
    self.log.error( "_getSurlForLFN: Failed to get SRM compliant storage.", targetSE )
    return S_ERROR( "_getSurlForLFN: Failed to get SRM compliant storage." )

  def _getTransferURLs( self, lfn, repDict, replicas, replicaDict ):
    """ prepare TURLs for given LFN and replication tree

    :param self: self reference
    :param str lfn: LFN
    :param dict repDict: replication dictionary
    :param dict replicas: LFN replicas
    :param dict replicaDict: valid replicas (SE -> PFN) used as a fallback source
    """
    # NOTE(review): the `replicas` parameter is not used in this body.
    hopSourceSE = repDict["SourceSE"]
    hopTargetSE = repDict["TargetSE"]
    hopAncestor = repDict["Ancestor"]
    # # get targetSURL
    res = self._getSurlForLFN( hopTargetSE, lfn )
    if not res['OK']:
      self.log.error( "_getTransferURLs: %s" % res['Message'] )
      return res
    targetSURL = res['Value']
    status = "Waiting"
    # # get the sourceSURL
    if hopAncestor:
      # this hop depends on a previous transfer finishing first
      status = "Waiting#%s" % ( hopAncestor )
    res = self._getSurlForLFN( hopSourceSE, lfn )
    # fall back to the known replica PFN when the SURL lookup fails
    sourceSURL = res.get( 'Value', replicaDict.get( hopSourceSE, None ) )
    if not sourceSURL:
      self.log.error( "_getTransferURLs: %s" % res['Message'] )
      return res
    return S_OK( ( sourceSURL, targetSURL, status ) )
class FTSClient( Client ):
  """ .. class:: FTSClient

  Thin client wrapper around the DataManagement/FTSManager service: it forwards
  calls through an RPCClient and de-serializes returned dicts into FTSFile /
  FTSJob / FTSHistoryView objects where applicable.
  """

  def __init__( self, useCertificates = False ):
    """c'tor

    :param self: self reference
    :param bool useCertificates: flag to enable/disable certificates
    """
    Client.__init__( self )
    self.log = gLogger.getSubLogger( "DataManagement/FTSClient" )
    self.setServer( "DataManagement/FTSManager" )
    # getting other clients
    self.ftsValidator = FTSValidator()
    self.dataManager = DataManager()
    self.storageFactory = StorageFactory()
    url = PathFinder.getServiceURL( "DataManagement/FTSManager" )
    if not url:
      raise RuntimeError( "CS option DataManagement/FTSManager URL is not set!" )
    self.ftsManager = RPCClient( url )

  def getFTSFileList( self, statusList = None, limit = None ):
    """ get list of FTSFiles with status in statusList """
    statusList = statusList if statusList else [ "Waiting" ]
    limit = limit if limit else 1000
    getFTSFileList = self.ftsManager.getFTSFileList( statusList, limit )
    if not getFTSFileList['OK']:
      self.log.error( "Failed getFTSFileList", "%s" % getFTSFileList['Message'] )
      return getFTSFileList
    getFTSFileList = getFTSFileList['Value']
    return S_OK( [ FTSFile( ftsFile ) for ftsFile in getFTSFileList ] )

  def getFTSJobList( self, statusList = None, limit = None ):
    """ get FTSJobs wit statues in :statusList: """
    statusList = statusList if statusList else list( FTSJob.INITSTATES + FTSJob.TRANSSTATES )
    limit = limit if limit else 500
    getFTSJobList = self.ftsManager.getFTSJobList( statusList, limit )
    if not getFTSJobList['OK']:
      self.log.error( "Failed getFTSJobList", "%s" % getFTSJobList['Message'] )
      return getFTSJobList
    getFTSJobList = getFTSJobList['Value']
    return S_OK( [ FTSJob( ftsJobDict ) for ftsJobDict in getFTSJobList ] )

  def getFTSFilesForRequest( self, requestID, statusList = None ):
    """ read FTSFiles for a given :requestID:

    :param int requestID: ReqDB.Request.RequestID
    :param list statusList: List of statuses (default: Waiting)
    """
    ftsFiles = self.ftsManager.getFTSFilesForRequest( requestID, statusList )
    if not ftsFiles['OK']:
      self.log.error( "Failed getFTSFilesForRequest", "%s" % ftsFiles['Message'] )
      return ftsFiles
    return S_OK( [ FTSFile( ftsFileDict ) for ftsFileDict in ftsFiles['Value'] ] )

  def getAllFTSFilesForRequest( self, requestID ):
    """ read FTSFiles for a given :requestID:

    :param int requestID: ReqDB.Request.RequestID
    """
    ftsFiles = self.ftsManager.getAllFTSFilesForRequest( requestID )
    if not ftsFiles['OK']:
      self.log.error( "Failed getFTSFilesForRequest", "%s" % ftsFiles['Message'] )
      return ftsFiles
    return S_OK( [ FTSFile( ftsFileDict ) for ftsFileDict in ftsFiles['Value'] ] )

  def getFTSJobsForRequest( self, requestID, statusList = None ):
    """ get list of FTSJobs with statues in :statusList: given requestID

    :param int requestID: ReqDB.Request.RequestID
    :param list statusList: list with FTSJob statuses

    :return: [ FTSJob, FTSJob, ... ]
    """
    statusList = statusList if statusList else list( FTSJob.INITSTATES + FTSJob.TRANSSTATES )
    getJobs = self.ftsManager.getFTSJobsForRequest( requestID, statusList )
    if not getJobs['OK']:
      self.log.error( "Failed getFTSJobsForRequest", "%s" % getJobs['Message'] )
      return getJobs
    return S_OK( [ FTSJob( ftsJobDict ) for ftsJobDict in getJobs['Value'] ] )

  def getFTSFile( self, ftsFileID = None ):
    """ get FTSFile

    :param int ftsFileID: FTSFileID
    """
    getFile = self.ftsManager.getFTSFile( ftsFileID )
    if not getFile['OK']:
      self.log.error( 'Failed to get FTS file', getFile['Message'] )
    # # de-serialize
    # NOTE(review): if getFile['Value'] is empty (or the call failed), ftsFile
    # is never assigned and the return below raises NameError — confirm callers
    # never hit that path.
    if getFile['Value']:
      ftsFile = FTSFile( getFile['Value'] )
    return S_OK( ftsFile )

  def putFTSJob( self, ftsJob ):
    """ put FTSJob into FTSDB

    :param FTSJob.FTSJob ftsJob: FTSJob instance
    """
    ftsJobJSON = ftsJob.toJSON()
    if not ftsJobJSON['OK']:
      self.log.error( 'Failed to get JSON of an FTS job', ftsJobJSON['Message'] )
      return ftsJobJSON
    isValid = self.ftsValidator.validate( ftsJob )
    if not isValid['OK']:
      self.log.error( "Failed to validate FTS job",
                      "%s %s" % ( isValid['Message'], str( ftsJobJSON['Value'] ) ) )
      return isValid
    return self.ftsManager.putFTSJob( ftsJobJSON['Value'] )

  def getFTSJob( self, ftsJobID ):
    """ get FTS job, change its status to 'Assigned'

    :param int ftsJobID: FTSJobID
    """
    getJob = self.ftsManager.getFTSJob( ftsJobID )
    if not getJob['OK']:
      self.log.error( 'Failed to get FTS job', getJob['Message'] )
      return getJob
    setStatus = self.ftsManager.setFTSJobStatus( ftsJobID, 'Assigned' )
    if not setStatus['OK']:
      self.log.error( 'Failed to set status of FTS job', setStatus['Message'] )
    # # de-serialize
    # if getJob['Value']:
    #   getJob = FTSJob( getJob['Value'] )
    return getJob

  def peekFTSJob( self, ftsJobID ):
    """ just peek FTSJob

    :param int ftsJobID: FTSJobID
    """
    getJob = self.ftsManager.getFTSJob( ftsJobID )
    if not getJob['OK']:
      self.log.error( 'Failed to get FTS job', getJob['Message'] )
      return getJob
    return getJob

  def deleteFTSJob( self, ftsJobID ):
    """ delete FTSJob into FTSDB

    :param int ftsJob: FTSJobID
    """
    deleteJob = self.ftsManager.deleteFTSJob( ftsJobID )
    if not deleteJob['OK']:
      self.log.error( 'Failed to delete FTS job', deleteJob['Message'] )
    return deleteJob

  def getFTSJobIDs( self, statusList = None ):
    """ get list of FTSJobIDs for a given status list """
    statusList = statusList if statusList else [ "Submitted", "Ready", "Active" ]
    ftsJobIDs = self.ftsManager.getFTSJobIDs( statusList )
    if not ftsJobIDs['OK']:
      self.log.error( 'Failed to get FTS job IDs', ftsJobIDs['Message'] )
    return ftsJobIDs

  def getFTSFileIDs( self, statusList = None ):
    """ get list of FTSFileIDs for a given status list """
    statusList = statusList if statusList else [ "Waiting" ]
    ftsFileIDs = self.ftsManager.getFTSFileIDs( statusList )
    if not ftsFileIDs['OK']:
      self.log.error( 'Failed to get FTS file IDs', ftsFileIDs['Message'] )
    return ftsFileIDs

  def getFTSHistory( self ):
    """ get FTS history snapshot """
    getFTSHistory = self.ftsManager.getFTSHistory()
    if not getFTSHistory['OK']:
      self.log.error( 'Failed to get FTS history', getFTSHistory['Message'] )
      return getFTSHistory
    getFTSHistory = getFTSHistory['Value']
    return S_OK( [ FTSHistoryView( ftsHistory ) for ftsHistory in getFTSHistory ] )

  def getDBSummary( self ):
    """ get FTDB summary """
    dbSummary = self.ftsManager.getDBSummary()
    if not dbSummary['OK']:
      self.log.error( "Failed getDBSummary", "%s" % dbSummary['Message'] )
    return dbSummary

  def setFTSFilesWaiting( self, operationID, sourceSE, opFileIDList = None ):
    """ update status for waiting FTSFiles from 'Waiting#SourceSE' to 'Waiting'

    :param int operationID: ReqDB.Operation.OperationID
    :param str sourceSE: source SE name
    :param opFileIDList: [ ReqDB.File.FileID, ... ]
    """
    return self.ftsManager.setFTSFilesWaiting( operationID, sourceSE, opFileIDList )

  def deleteFTSFiles( self, operationID, opFileIDList = None ):
    """ delete FTSFiles for rescheduling

    :param int operationID: ReqDB.Operation.OperationID
    :param list opFileIDList: [ ReqDB.File.FileID, ... ]
    """
    return self.ftsManager.deleteFTSFiles( operationID, opFileIDList )

  def ftsSchedule( self, requestID, operationID, opFileList ):
    """ schedule lfn for FTS job

    :param int requestID: RequestDB.Request.RequestID
    :param int operationID: RequestDB.Operation.OperationID
    :param list opFileList: list of tuples ( File.toJSON()['Value'], sourcesList, targetList )
    """
    # Check whether there are duplicates
    fList = []
    for fileJSON, sourceSEs, targetSEs in opFileList:
      fTuple = ( json.loads( fileJSON ), sourceSEs, targetSEs )
      if fTuple not in fList:
        fList.append( fTuple )
      else:
        self.log.warn( 'File list for FTS scheduling has duplicates, fix it:\n', fTuple )
    fileIDs = [int( fileJSON.get( 'FileID', 0 ) ) for fileJSON, _sourceSEs, _targetSEs in fList ]
    # drop any stale FTSFiles left over for these file IDs before re-scheduling
    res = self.ftsManager.cleanUpFTSFiles( requestID, fileIDs )
    if not res['OK']:
      self.log.error( "Failed ftsSchedule", "%s" % res['Message'] )
      return S_ERROR( "ftsSchedule: %s" % res['Message'] )
    ftsFiles = []
    # # this will be returned on success
    result = { "Successful": [], "Failed": {} }
    for fileJSON, sourceSEs, targetSEs in fList:
      lfn = fileJSON.get( "LFN", "" )
      size = int( fileJSON.get( "Size", 0 ) )
      fileID = int( fileJSON.get( "FileID", 0 ) )
      opID = int( fileJSON.get( "OperationID", 0 ) )
      self.log.verbose( "ftsSchedule: LFN=%s FileID=%s OperationID=%s sources=%s targets=%s" % ( lfn, fileID, opID,
                                                                                                 sourceSEs, targetSEs ) )
      res = self.dataManager.getActiveReplicas( lfn )
      if not res['OK']:
        self.log.error( "Failed ftsSchedule", "%s" % res['Message'] )
        result["Failed"][fileID] = res['Message']
        continue
      replicaDict = res['Value']
      if lfn in replicaDict["Failed"] and lfn not in replicaDict["Successful"]:
        result["Failed"][fileID] = "no active replicas found"
        continue
      replicaDict = replicaDict["Successful"].get( lfn, {} )
      # # use valid replicas only
      validReplicasDict = dict( [ ( se, pfn ) for se, pfn in replicaDict.items() if se in sourceSEs ] )
      if not validReplicasDict:
        self.log.warn( "No active replicas found in sources" )
        result["Failed"][fileID] = "no active replicas found in sources"
        continue
      tree = self.ftsManager.getReplicationTree( sourceSEs, targetSEs, size )
      if not tree['OK']:
        self.log.error( "Failed ftsSchedule", "%s cannot be scheduled: %s" % ( lfn, tree['Message'] ) )
        result["Failed"][fileID] = tree['Message']
        continue
      tree = tree['Value']
      self.log.verbose( "LFN=%s tree=%s" % ( lfn, tree ) )
      # de-duplicate identical branches of the replication tree, warning once
      treeBranches = []
      printed = False
      for repDict in tree.values():
        if repDict in treeBranches:
          if not printed:
            self.log.warn( 'Duplicate tree branch', str( tree ) )
            printed = True
        else:
          treeBranches.append( repDict )
      for repDict in treeBranches:
        self.log.verbose( "Strategy=%s Ancestor=%s SourceSE=%s TargetSE=%s" % ( repDict["Strategy"],
                                                                                repDict["Ancestor"],
                                                                                repDict["SourceSE"],
                                                                                repDict["TargetSE"] ) )
        transferSURLs = self._getTransferURLs( lfn, repDict, sourceSEs, validReplicasDict )
        if not transferSURLs['OK']:
          result["Failed"][fileID] = transferSURLs['Message']
          continue
        sourceSURL, targetSURL, fileStatus = transferSURLs['Value']
        if sourceSURL == targetSURL:
          result["Failed"][fileID] = "sourceSURL equals to targetSURL for %s" % lfn
          continue
        self.log.verbose( "sourceURL=%s targetURL=%s FTSFile.Status=%s" % ( sourceSURL, targetSURL, fileStatus ) )
        ftsFile = FTSFile()
        for key in ( "LFN", "FileID", "OperationID", "Checksum", "ChecksumType", "Size" ):
          if fileJSON.get( key ):
            setattr( ftsFile, key, fileJSON.get( key ) )
        ftsFile.RequestID = requestID
        ftsFile.OperationID = operationID
        ftsFile.SourceSURL = sourceSURL
        ftsFile.TargetSURL = targetSURL
        ftsFile.SourceSE = repDict["SourceSE"]
        ftsFile.TargetSE = repDict["TargetSE"]
        ftsFile.Status = fileStatus
        ftsFiles.append( ftsFile )
    if not ftsFiles:
      self.log.info( "ftsSchedule: no FTSFiles to put for request %d" % requestID )
      return S_OK( result )
    ftsFilesJSONList = [ftsFile.toJSON()['Value'] for ftsFile in ftsFiles]
    res = self.ftsManager.putFTSFileList( ftsFilesJSONList )
    if not res['OK']:
      self.log.error( "Failed ftsSchedule", "%s" % res['Message'] )
      return S_ERROR( "ftsSchedule: %s" % res['Message'] )
    result['Successful'] += [ fileID for fileID in fileIDs if fileID not in result['Failed']]
    # # if we land here some files have been properly scheduled
    return S_OK( result )

  ################################################################################################################
  # Some utilities function

  def _getSurlForLFN( self, targetSE, lfn ):
    """ Get the targetSURL for the storage and LFN supplied.

    :param self: self reference
    :param str targetSE: target SE
    :param str lfn: LFN
    """
    res = StorageFactory().getStorages( targetSE, pluginList = ["SRM2"] )
    if not res['OK']:
      errStr = "_getSurlForLFN: Failed to create SRM2 storage for %s: %s" % ( targetSE, res['Message'] )
      self.log.error( "_getSurlForLFN: Failed to create SRM2 storage", "%s: %s" % ( targetSE, res['Message'] ) )
      return S_ERROR( errStr )
    storageObjects = res['Value']["StorageObjects"]
    # return the first storage object able to produce a URL for this LFN
    for storageObject in storageObjects:
      res = storageObject.getCurrentURL( lfn )
      if res['OK']:
        return res
    self.log.error( "_getSurlForLFN: Failed to get SRM compliant storage.", targetSE )
    return S_ERROR( "_getSurlForLFN: Failed to get SRM compliant storage." )

  def _getTransferURLs( self, lfn, repDict, replicas, replicaDict ):
    """ prepare TURLs for given LFN and replication tree

    :param self: self reference
    :param str lfn: LFN
    :param dict repDict: replication dictionary
    :param dict replicas: LFN replicas
    """
    hopSourceSE = repDict["SourceSE"]
    hopTargetSE = repDict["TargetSE"]
    hopAncestor = repDict["Ancestor"]
    # # get targetSURL
    res = self._getSurlForLFN( hopTargetSE, lfn )
    if not res['OK']:
      self.log.error( "Failed _getTransferURLs", "%s" % res['Message'] )
      return res
    targetSURL = res['Value']
    status = "Waiting"
    # # get the sourceSURL
    if hopAncestor:
      # this hop depends on a previous one completing first
      status = "Waiting#%s" % ( hopAncestor )
    res = self._getSurlForLFN( hopSourceSE, lfn )
    # fall back to the catalog replica URL when no SURL could be built
    sourceSURL = res.get( 'Value', replicaDict.get( hopSourceSE, None ) )
    if not sourceSURL:
      self.log.error( "Failed _getTransferURLs", "%s" % res['Message'] )
      return res
    return S_OK( ( sourceSURL, targetSURL, status ) )
class RequestPreparationAgent(AgentModule):
    """ Agent driving the New -> Waiting transition of staging replicas: it checks
    New CacheReplicas records against the FileCatalog (existence, size, replicas)
    and updates the StorageManager DB accordingly.
    """

    def initialize(self):
        self.fileCatalog = FileCatalog()
        self.dm = DataManager()
        self.stagerClient = StorageManagerClient()
        self.dataIntegrityClient = DataIntegrityClient()
        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')
        return S_OK()

    def execute(self):
        # single logical task per cycle
        res = self.prepareNewReplicas()
        return res

    def prepareNewReplicas(self):
        """ This is the first logical task to be executed and manages the New->Waiting transition of the Replicas """
        res = self.__getNewReplicas()
        if not res['OK']:
            gLogger.fatal(
                "RequestPreparation.prepareNewReplicas: Failed to get replicas from StagerDB.",
                res['Message'])
            return res
        if not res['Value']:
            gLogger.info("There were no New replicas found")
            return res
        replicas = res['Value']['Replicas']
        replicaIDs = res['Value']['ReplicaIDs']
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained %s New replicas for preparation."
            % len(replicaIDs))
        # Check that the files exist in the FileCatalog
        res = self.__getExistingFiles(replicas.keys())
        if not res['OK']:
            return res
        exist = res['Value']['Exist']
        terminal = res['Value']['Missing']
        failed = res['Value']['Failed']
        if not exist:
            gLogger.error(
                'RequestPreparation.prepareNewReplicas: Failed determine existance of any files'
            )
            return S_OK()
        # mark every replica of a missing LFN as terminally failed
        terminalReplicaIDs = {}
        for lfn, reason in terminal.items():
            for _se, replicaID in replicas[lfn].items():
                terminalReplicaIDs[replicaID] = reason
            replicas.pop(lfn)
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: %s files exist in the FileCatalog."
            % len(exist))
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files do not exist in the FileCatalog."
                % len(terminal))
        # Obtain the file sizes from the FileCatalog
        res = self.__getFileSize(exist)
        if not res['OK']:
            return res
        failed.update(res['Value']['Failed'])
        terminal = res['Value']['ZeroSize']
        fileSizes = res['Value']['FileSizes']
        if not fileSizes:
            gLogger.error(
                'RequestPreparation.prepareNewReplicas: Failed determine sizes of any files'
            )
            return S_OK()
        for lfn, reason in terminal.items():
            for _se, replicaID in replicas[lfn].items():
                terminalReplicaIDs[replicaID] = reason
            replicas.pop(lfn)
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained %s file sizes from the FileCatalog."
            % len(fileSizes))
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files registered with zero size in the FileCatalog."
                % len(terminal))
        # Obtain the replicas from the FileCatalog
        res = self.__getFileReplicas(fileSizes.keys())
        if not res['OK']:
            return res
        failed.update(res['Value']['Failed'])
        terminal = res['Value']['ZeroReplicas']
        fileReplicas = res['Value']['Replicas']
        if not fileReplicas:
            gLogger.error(
                'RequestPreparation.prepareNewReplicas: Failed determine replicas for any files'
            )
            return S_OK()
        for lfn, reason in terminal.items():
            for _se, replicaID in replicas[lfn].items():
                terminalReplicaIDs[replicaID] = reason
            replicas.pop(lfn)
        gLogger.info(
            "RequestPreparation.prepareNewReplicas: Obtained replica information for %s file from the FileCatalog."
            % len(fileReplicas))
        if terminal:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s files registered with zero replicas in the FileCatalog."
                % len(terminal))
        # Check the replicas exist at the requested site
        # NOTE(review): fileReplicas[lfn] assumes every remaining lfn survived the
        # replica lookup; an lfn present in `replicas` but absent from
        # `fileReplicas` would raise KeyError here — confirm upstream pruning
        # guarantees this cannot happen.
        replicaMetadata = []
        for lfn, requestedSEs in replicas.items():
            lfnReplicas = fileReplicas[lfn]
            for requestedSE, replicaID in requestedSEs.items():
                if not requestedSE in lfnReplicas.keys():
                    terminalReplicaIDs[
                        replicaID] = "LFN not registered at requested SE"
                    replicas[lfn].pop(requestedSE)
                else:
                    replicaMetadata.append(
                        (replicaID, lfnReplicas[requestedSE], fileSizes[lfn]))
        # Update the states of the files in the database
        if terminalReplicaIDs:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s replicas are terminally failed."
                % len(terminalReplicaIDs))
            # res = self.stagerClient.updateReplicaFailure( terminalReplicaIDs )
            res = self.stagerClient.updateReplicaFailure(terminalReplicaIDs)
            if not res['OK']:
                gLogger.error(
                    "RequestPreparation.prepareNewReplicas: Failed to update replica failures.",
                    res['Message'])
        if replicaMetadata:
            gLogger.info(
                "RequestPreparation.prepareNewReplicas: %s replica metadata to be updated."
                % len(replicaMetadata))
            # Sets the Status='Waiting' of CacheReplicas records that are OK with catalogue checks
            res = self.stagerClient.updateReplicaInformation(replicaMetadata)
            if not res['OK']:
                gLogger.error(
                    "RequestPreparation.prepareNewReplicas: Failed to update replica metadata.",
                    res['Message'])
        return S_OK()

    def __getNewReplicas(self):
        """ This obtains the New replicas from the Replicas table and for each LFN the requested storage element """
        # First obtain the New replicas from the CacheReplicas table
        res = self.stagerClient.getCacheReplicas({'Status': 'New'})
        if not res['OK']:
            gLogger.error(
                "RequestPreparation.__getNewReplicas: Failed to get replicas with New status.",
                res['Message'])
            return res
        if not res['Value']:
            gLogger.debug(
                "RequestPreparation.__getNewReplicas: No New replicas found to process."
            )
            return S_OK()
        else:
            gLogger.debug(
                "RequestPreparation.__getNewReplicas: Obtained %s New replicas(s) to process."
                % len(res['Value']))
        # index by LFN -> {SE: replicaID} and replicaID -> (LFN, SE)
        replicas = {}
        replicaIDs = {}
        for replicaID, info in res['Value'].items():
            lfn = info['LFN']
            storageElement = info['SE']
            if not replicas.has_key(lfn):
                replicas[lfn] = {}
            replicas[lfn][storageElement] = replicaID
            replicaIDs[replicaID] = (lfn, storageElement)
        return S_OK({'Replicas': replicas, 'ReplicaIDs': replicaIDs})

    def __getExistingFiles(self, lfns):
        """ This checks that the files exist in the FileCatalog. """
        filesExist = []
        missing = {}
        res = self.fileCatalog.exists(lfns)
        if not res['OK']:
            gLogger.error(
                "RequestPreparation.__getExistingFiles: Failed to determine whether files exist.",
                res['Message'])
            return res
        failed = res['Value']['Failed']
        for lfn, exists in res['Value']['Successful'].items():
            if exists:
                filesExist.append(lfn)
            else:
                missing[lfn] = 'LFN not registered in the FileCatalog'
        if missing:
            for lfn, reason in missing.items():
                gLogger.warn(
                    "RequestPreparation.__getExistingFiles: %s" % reason, lfn)
            self.__reportProblematicFiles(missing.keys(),
                                          'LFN-LFC-DoesntExist')
        return S_OK({
            'Exist': filesExist,
            'Missing': missing,
            'Failed': failed
        })

    def __getFileSize(self, lfns):
        """ This obtains the file size from the FileCatalog. """
        fileSizes = {}
        zeroSize = {}
        res = self.fileCatalog.getFileSize(lfns)
        if not res['OK']:
            gLogger.error(
                "RequestPreparation.__getFileSize: Failed to get sizes for files.",
                res['Message'])
            return res
        failed = res['Value']['Failed']
        for lfn, size in res['Value']['Successful'].items():
            if size == 0:
                zeroSize[
                    lfn] = "LFN registered with zero size in the FileCatalog"
            else:
                fileSizes[lfn] = size
        if zeroSize:
            for lfn, reason in zeroSize.items():
                gLogger.warn("RequestPreparation.__getFileSize: %s" % reason,
                             lfn)
            self.__reportProblematicFiles(zeroSize.keys(), 'LFN-LFC-ZeroSize')
        return S_OK({
            'FileSizes': fileSizes,
            'ZeroSize': zeroSize,
            'Failed': failed
        })

    def __getFileReplicas(self, lfns):
        """ This obtains the replicas from the FileCatalog. """
        replicas = {}
        noReplicas = {}
        res = self.dm.getActiveReplicas(lfns)
        if not res['OK']:
            gLogger.error(
                "RequestPreparation.__getFileReplicas: Failed to obtain file replicas.",
                res['Message'])
            return res
        failed = res['Value']['Failed']
        for lfn, lfnReplicas in res['Value']['Successful'].items():
            if len(lfnReplicas.keys()) == 0:
                noReplicas[
                    lfn] = "LFN registered with zero replicas in the FileCatalog"
            else:
                replicas[lfn] = lfnReplicas
        if noReplicas:
            for lfn, reason in noReplicas.items():
                gLogger.warn(
                    "RequestPreparation.__getFileReplicas: %s" % reason, lfn)
            self.__reportProblematicFiles(noReplicas.keys(),
                                          'LFN-LFC-NoReplicas')
        return S_OK({
            'Replicas': replicas,
            'ZeroReplicas': noReplicas,
            'Failed': failed
        })

    def __reportProblematicFiles(self, lfns, reason):
        # NOTE(review): reporting is short-circuited by the bare return below —
        # everything after it is dead code; presumably disabled on purpose,
        # confirm before re-enabling.
        return S_OK()
        res = self.dataIntegrityClient.setFileProblematic(
            lfns, reason, sourceComponent='RequestPreparationAgent')
        if not res['OK']:
            gLogger.error(
                "RequestPreparation.__reportProblematicFiles: Failed to report missing files.",
                res['Message'])
            return res
        if res['Value']['Successful']:
            gLogger.info(
                "RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files."
                % len(res['Value']['Successful']))
        if res['Value']['Failed']:
            gLogger.info(
                "RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files."
                % len(res['Value']['Failed']))
        return res
def filterReplicas(opFile, logger=None, dataManager=None):
    """ filter out banned/invalid source SEs

    Classifies each replica SE of opFile.LFN into lists returned in a
    defaultdict: 'Valid', 'NoMetadata', 'Bad', 'NoReplicas',
    'NoActiveReplicas'.  May update opFile.Checksum / opFile.ChecksumType
    from the FileCatalog or the SE metadata as a side effect.
    """
    if logger is None:
        logger = gLogger
    if dataManager is None:
        dataManager = DataManager()

    log = logger.getSubLogger("filterReplicas")
    result = defaultdict(list)

    replicas = dataManager.getActiveReplicas(opFile.LFN, getUrl=False)
    if not replicas["OK"]:
        log.error('Failed to get active replicas', replicas["Message"])
        return replicas
    reNotExists = re.compile(r".*such file.*")
    replicas = replicas["Value"]
    failed = replicas["Failed"].get(opFile.LFN, "")
    # a "no such file" style failure from the catalog is terminal for this file
    if reNotExists.match(failed.lower()):
        opFile.Status = "Failed"
        opFile.Error = failed
        return S_ERROR(failed)

    replicas = replicas["Successful"].get(opFile.LFN, {})
    noReplicas = False
    if not replicas:
        # no *active* replicas — check whether any replica exists at all
        allReplicas = dataManager.getReplicas(opFile.LFN, getUrl=False)
        if allReplicas['OK']:
            allReplicas = allReplicas['Value']['Successful'].get(
                opFile.LFN, {})
            if not allReplicas:
                result['NoReplicas'].append(None)
                noReplicas = True
            else:
                # There are replicas but we cannot get metadata because the replica is not active
                result['NoActiveReplicas'] += list(allReplicas)
            log.verbose(
                "File has no%s replica in File Catalog" %
                ('' if noReplicas else ' active'), opFile.LFN)
        else:
            return allReplicas

    if not opFile.Checksum or hexAdlerToInt(opFile.Checksum) is False:
        # Set Checksum to FC checksum if not set in the request
        fcMetadata = FileCatalog().getFileMetadata(opFile.LFN)
        fcChecksum = fcMetadata.get('Value',
                                    {}).get('Successful',
                                            {}).get(opFile.LFN,
                                                    {}).get('Checksum')
        # Replace opFile.Checksum if it doesn't match a valid FC checksum
        if fcChecksum:
            if hexAdlerToInt(fcChecksum) is not False:
                opFile.Checksum = fcChecksum
                opFile.ChecksumType = fcMetadata['Value']['Successful'][
                    opFile.LFN].get('ChecksumType', 'Adler32')
            else:
                opFile.Checksum = None

    # If no replica was found, return what we collected as information
    if not replicas:
        return S_OK(result)

    for repSEName in replicas:
        repSEMetadata = StorageElement(repSEName).getFileMetadata(opFile.LFN)
        error = repSEMetadata.get(
            'Message',
            repSEMetadata.get('Value', {}).get('Failed', {}).get(opFile.LFN))
        if error:
            log.warn(
                'unable to get metadata at %s for %s' %
                (repSEName, opFile.LFN), error.replace('\n', ''))
            if 'File does not exist' in error:
                result['NoReplicas'].append(repSEName)
            else:
                result["NoMetadata"].append(repSEName)
        elif not noReplicas:
            repSEMetadata = repSEMetadata['Value']['Successful'][opFile.LFN]
            seChecksum = hexAdlerToInt(repSEMetadata.get("Checksum"))
            # As from here seChecksum is an integer or False, not a hex string!
            if seChecksum is False and opFile.Checksum:
                result['NoMetadata'].append(repSEName)
            elif not seChecksum and opFile.Checksum:
                opFile.Checksum = None
                opFile.ChecksumType = None
            elif seChecksum and (not opFile.Checksum
                                 or opFile.Checksum == 'False'):
                # Use the SE checksum (convert to hex) and force type to be Adler32
                opFile.Checksum = intAdlerToHex(seChecksum)
                opFile.ChecksumType = 'Adler32'
            if not opFile.Checksum or not seChecksum or compareAdler(
                    intAdlerToHex(seChecksum), opFile.Checksum):
                # # All checksums are OK
                result["Valid"].append(repSEName)
            else:
                log.warn(" %s checksum mismatch, FC: '%s' @%s: '%s'" %
                         (opFile.LFN, opFile.Checksum, repSEName,
                          intAdlerToHex(seChecksum)))
                result["Bad"].append(repSEName)
        else:
            # If a replica was found somewhere, don't set the file as no replicas
            result['NoReplicas'] = []

    return S_OK(result)
def filterReplicas( opFile, logger = None, dataManager = None ):
  """ filter out banned/invalid source SEs

  Classifies each replica SE of opFile.LFN into lists returned under the keys
  'Valid', 'NoMetadata', 'Bad', 'NoReplicas', 'NoPFN'.  May update
  opFile.Checksum / opFile.ChecksumType as a side effect.

  :param opFile: operation file (provides LFN, Checksum, ChecksumType)
  :param logger: logger to use (default: gLogger)
  :param dataManager: DataManager instance (default: a fresh one)
  :return: S_OK( dict of SE-name lists ) or S_ERROR
  """
  if logger is None:
    logger = gLogger
  if dataManager is None:
    dataManager = DataManager()

  log = logger.getSubLogger( "filterReplicas" )
  ret = { "Valid" : [], "NoMetadata" : [], "Bad" : [], 'NoReplicas':[], 'NoPFN':[] }

  replicas = dataManager.getActiveReplicas( opFile.LFN )
  if not replicas["OK"]:
    log.error( 'Failed to get active replicas', replicas["Message"] )
    return replicas
  reNotExists = re.compile( r".*such file.*" )
  replicas = replicas["Value"]
  failed = replicas["Failed"].get( opFile.LFN , "" )
  # a "no such file" style failure from the catalog is terminal for this file
  if reNotExists.match( failed.lower() ):
    opFile.Status = "Failed"
    opFile.Error = failed
    return S_ERROR( failed )

  replicas = replicas["Successful"].get( opFile.LFN, {} )
  noReplicas = False
  if not replicas:
    # no *active* replicas — check whether any replica exists at all
    allReplicas = dataManager.getReplicas( opFile.LFN )
    if allReplicas['OK']:
      allReplicas = allReplicas['Value']['Successful'].get( opFile.LFN, {} )
      if not allReplicas:
        ret['NoReplicas'].append( None )
        noReplicas = True
      else:
        # We try inactive replicas to see if maybe the file doesn't exist at all
        replicas = allReplicas
      log.warn( "File has no%s replica in File Catalog" % ( '' if noReplicas else ' active' ), opFile.LFN )
    else:
      return allReplicas

  # hexAdlerToInt() returns False on a malformed checksum; use an identity
  # test, because a legitimate integer checksum of 0 compares equal to False
  # with '==' and would be wrongly rejected.
  if not opFile.Checksum or hexAdlerToInt( opFile.Checksum ) is False:
    # Set Checksum to FC checksum if not set in the request
    fcMetadata = FileCatalog().getFileMetadata( opFile.LFN )
    fcChecksum = fcMetadata.get( 'Value', {} ).get( 'Successful', {} ).get( opFile.LFN, {} ).get( 'Checksum' )
    # Replace opFile.Checksum if it doesn't match a valid FC checksum
    if fcChecksum:
      if hexAdlerToInt( fcChecksum ) is not False:
        opFile.Checksum = fcChecksum
        opFile.ChecksumType = fcMetadata['Value']['Successful'][opFile.LFN].get( 'ChecksumType', 'Adler32' )
      else:
        opFile.Checksum = None

  for repSEName in replicas:
    repSEMetadata = StorageElement( repSEName ).getFileMetadata( opFile.LFN )
    error = repSEMetadata.get( 'Message', repSEMetadata.get( 'Value', {} ).get( 'Failed', {} ).get( opFile.LFN ) )
    if error:
      log.warn( 'unable to get metadata at %s for %s' % ( repSEName, opFile.LFN ), error.replace( '\n', '' ) )
      if 'File does not exist' in error:
        ret['NoReplicas'].append( repSEName )
      else:
        ret["NoMetadata"].append( repSEName )
    elif not noReplicas:
      repSEMetadata = repSEMetadata['Value']['Successful'][opFile.LFN]
      seChecksum = hexAdlerToInt( repSEMetadata.get( "Checksum" ) )
      # seChecksum is an integer, or False when the SE value was malformed
      if seChecksum is False and opFile.Checksum:
        ret['NoMetadata'].append( repSEName )
      elif not seChecksum and opFile.Checksum:
        opFile.Checksum = None
        opFile.ChecksumType = None
      elif seChecksum and ( not opFile.Checksum or opFile.Checksum == 'False' ):
        # Use the SE checksum and force type to be Adler32
        # NOTE(review): seChecksum is an integer here while checksums elsewhere
        # are hex strings — the later revision converts with intAdlerToHex();
        # confirm downstream consumers before changing.
        opFile.Checksum = seChecksum
        opFile.ChecksumType = 'Adler32'
      if not opFile.Checksum or not seChecksum or compareAdler( seChecksum, opFile.Checksum ):
        # # All checksums are OK
        ret["Valid"].append( repSEName )
      else:
        log.warn( " %s checksum mismatch, FC: '%s' @%s: '%s'" % ( opFile.LFN,
                                                                  opFile.Checksum,
                                                                  repSEName,
                                                                  seChecksum ) )
        ret["Bad"].append( repSEName )
    else:
      # If a replica was found somewhere, don't set the file as no replicas
      ret['NoReplicas'] = []

  return S_OK( ret )
def filterReplicas(opFile, logger=None, dataManager=None, seCache=None):
    """ filter out banned/invalid source SEs

    Classifies each replica SE of opFile.LFN into lists returned under the
    keys 'Valid', 'NoMetadata', 'Bad', 'NoReplicas', 'NoPFN'.  May update
    opFile.Checksum from the FileCatalog when the request checksum disagrees
    with the SE but the catalog agrees with the SE.

    :param opFile: operation file (provides LFN, Checksum)
    :param logger: logger to use (default: gLogger)
    :param dataManager: DataManager instance (default: a fresh one)
    :param dict seCache: optional cache of StorageElement objects keyed by SE name
    :return: S_OK( dict of SE-name lists ) or S_ERROR
    """
    if not logger:
        logger = gLogger
    if not dataManager:
        dataManager = DataManager()
    if not seCache:
        seCache = {}
    log = logger.getSubLogger("filterReplicas")
    ret = {
        "Valid": [],
        "NoMetadata": [],
        "Bad": [],
        'NoReplicas': [],
        'NoPFN': []
    }
    replicas = dataManager.getActiveReplicas(opFile.LFN)
    if not replicas["OK"]:
        log.error(replicas["Message"])
        return replicas
    # Catch "No such file or directory"-style failures; the previous pattern
    # ("not such file or directory", anchored by match()) could never fire on
    # the real error text.  Pattern kept consistent with the other versions of
    # this helper.
    reNotExists = re.compile(r".*such file.*")
    replicas = replicas["Value"]
    failed = replicas["Failed"].get(opFile.LFN, "")
    if reNotExists.match(failed.lower()):
        # the file is gone — terminal failure for this operation file
        opFile.Status = "Failed"
        opFile.Error = failed
        return S_ERROR(failed)
    replicas = replicas["Successful"].get(opFile.LFN, {})
    for repSEName in replicas:
        # reuse cached StorageElement objects when a cache is supplied
        repSE = seCache[repSEName] if repSEName in seCache else \
            seCache.setdefault(repSEName, StorageElement(repSEName))
        pfn = repSE.getPfnForLfn(opFile.LFN)
        if not pfn["OK"] or opFile.LFN not in pfn['Value']['Successful']:
            log.warn(
                "unable to create pfn for %s lfn at %s: %s" %
                (opFile.LFN, repSEName,
                 pfn.get(
                     'Message',
                     pfn.get('Value', {}).get('Failed', {}).get(opFile.LFN))))
            ret["NoPFN"].append(repSEName)
        else:
            pfn = pfn["Value"]['Successful'][opFile.LFN]
            repSEMetadata = repSE.getFileMetadata(pfn)
            error = repSEMetadata.get(
                'Message',
                repSEMetadata.get('Value', {}).get('Failed', {}).get(pfn))
            if error:
                log.warn(
                    'unable to get metadata at %s for %s' %
                    (repSEName, opFile.LFN), error.replace('\n', ''))
                if 'File does not exist' in error:
                    ret['NoReplicas'].append(repSEName)
                else:
                    ret["NoMetadata"].append(repSEName)
            else:
                repSEMetadata = repSEMetadata['Value']['Successful'][pfn]
                seChecksum = repSEMetadata.get("Checksum")
                if opFile.Checksum and seChecksum and not compareAdler(
                        seChecksum, opFile.Checksum):
                    # The checksum in the request may be wrong, check with FC
                    fcMetadata = FileCatalog().getFileMetadata(opFile.LFN)
                    fcChecksum = fcMetadata.get('Value', {}).get(
                        'Successful', {}).get(opFile.LFN, {}).get('Checksum')
                    if fcChecksum and fcChecksum != opFile.Checksum and compareAdler(
                            fcChecksum, seChecksum):
                        # catalog agrees with the SE — trust it over the request
                        opFile.Checksum = fcChecksum
                        ret['Valid'].append(repSEName)
                    else:
                        log.warn(" %s checksum mismatch, request: %s @%s: %s" %
                                 (opFile.LFN, opFile.Checksum, repSEName,
                                  seChecksum))
                        ret["Bad"].append(repSEName)
                else:
                    # # if we're here repSE is OK
                    ret["Valid"].append(repSEName)
    return S_OK(ret)
def filterReplicas( opFile, logger = None, dataManager = None ):
  """Classify the replicas of a request file as Valid/Bad/NoMetadata/NoReplicas.

  Reads ``opFile.LFN`` and ``opFile.Checksum``; may set ``opFile.Status``/``Error``
  when the catalog reports the file as missing, and fills ``opFile.Checksum``/
  ``ChecksumType`` from the File Catalog when the request carries no checksum.
  Returns S_OK with a dict of SE-name lists, or an error structure.
  """
  if logger is None:
    logger = gLogger
  if dataManager is None:
    dataManager = DataManager()

  log = logger.getSubLogger( "filterReplicas" )
  verdicts = { "Valid" : [], "NoMetadata" : [], "Bad" : [], 'NoReplicas':[], 'NoPFN':[] }

  activeRes = dataManager.getActiveReplicas( opFile.LFN )
  if not activeRes["OK"]:
    log.error( 'Failed to get active replicas', activeRes["Message"] )
    return activeRes

  lookupFailure = activeRes["Value"]["Failed"].get( opFile.LFN, "" )
  if re.compile( r".*such file.*" ).match( lookupFailure.lower() ):
    # The catalog says the file does not exist: fail the request file
    opFile.Status = "Failed"
    opFile.Error = lookupFailure
    return S_ERROR( lookupFailure )

  seReplicas = activeRes["Value"]["Successful"].get( opFile.LFN, {} )

  if not opFile.Checksum:
    # Set Checksum to FC checksum if not set in the request
    fcMetadata = FileCatalog().getFileMetadata( opFile.LFN )
    fcFileInfo = fcMetadata.get( 'Value', {} ).get( 'Successful', {} ).get( opFile.LFN, {} )
    fcChecksum = fcFileInfo.get( 'Checksum', '' )
    # Replace opFile.Checksum if it doesn't match a valid FC checksum
    if fcChecksum:
      opFile.Checksum = fcChecksum
      opFile.ChecksumType = fcMetadata['Value']['Successful'][opFile.LFN].get( 'ChecksumType', 'Adler32' )

  for seName in seReplicas:
    mdRes = StorageElement( seName ).getFileMetadata( opFile.LFN )
    # Failure may be a global Message or a per-LFN entry in Failed
    mdError = mdRes.get( 'Message', mdRes.get( 'Value', {} ).get( 'Failed', {} ).get( opFile.LFN ) )

    if mdError:
      log.warn( 'unable to get metadata at %s for %s' % ( seName, opFile.LFN ), mdError.replace( '\n', '' ) )
      bucket = 'NoReplicas' if 'File does not exist' in mdError else 'NoMetadata'
      verdicts[bucket].append( seName )
      continue

    seChecksum = mdRes['Value']['Successful'][opFile.LFN].get( "Checksum" )
    bothSet = opFile.Checksum and seChecksum
    bothUnset = not opFile.Checksum and not seChecksum
    if ( bothSet and compareAdler( seChecksum, opFile.Checksum ) ) or bothUnset:
      # # All checksums are OK
      verdicts["Valid"].append( seName )
    else:
      log.warn( " %s checksum mismatch, FC: '%s' @%s: '%s'" % ( opFile.LFN, opFile.Checksum, seName, seChecksum ) )
      verdicts["Bad"].append( seName )

  return S_OK( verdicts )
def filterReplicas( opFile, logger = None, dataManager = None, seCache = None ):
  """Sort the active replicas of ``opFile`` into quality buckets.

  :param opFile: request file object; reads ``LFN``/``Checksum``, may set
                 ``Status``/``Error`` or replace ``Checksum`` with the FC value
  :param logger: logger to use (defaults to ``gLogger``)
  :param dataManager: ``DataManager`` instance (created on demand)
  :param seCache: optional dict caching ``StorageElement`` objects across calls
  :return: S_OK with SE-name lists keyed Valid/NoMetadata/Bad/NoReplicas/NoPFN,
           the failed replica-lookup structure, or S_ERROR if the file is unknown
  """
  if not logger:
    logger = gLogger
  if not dataManager:
    dataManager = DataManager()
  if not seCache:
    seCache = {}

  log = logger.getSubLogger( "filterReplicas" )
  ret = { "Valid" : [], "NoMetadata" : [], "Bad" : [], 'NoReplicas':[], 'NoPFN':[] }

  replicas = dataManager.getActiveReplicas( opFile.LFN )
  if not replicas["OK"]:
    log.error( replicas["Message"] )
    return replicas
  # Matches the "no such file" wording in catalog error messages
  reNotExists = re.compile( r".*such file.*" )
  replicas = replicas["Value"]
  failed = replicas["Failed"].get( opFile.LFN, "" )
  if reNotExists.match( failed.lower() ):
    # The catalog reports the file as non-existent: fail the request file
    opFile.Status = "Failed"
    opFile.Error = failed
    return S_ERROR( failed )

  replicas = replicas["Successful"].get( opFile.LFN, {} )

  for repSEName in replicas:
    # Reuse a cached StorageElement when available, create (and cache) otherwise
    repSE = seCache[repSEName] if repSEName in seCache else \
            seCache.setdefault( repSEName, StorageElement( repSEName ) )

    pfn = repSE.getPfnForLfn( opFile.LFN )
    if not pfn["OK"] or opFile.LFN not in pfn['Value']['Successful']:
      log.warn( "unable to create pfn for %s lfn at %s: %s" % ( opFile.LFN, repSEName,
                pfn.get( 'Message', pfn.get( 'Value', {} ).get( 'Failed', {} ).get( opFile.LFN ) ) ) )
      ret["NoPFN"].append( repSEName )
    else:
      pfn = pfn["Value"]['Successful'][ opFile.LFN ]
      repSEMetadata = repSE.getFileMetadata( pfn )
      # Failure may be a global Message or a per-PFN entry in Failed
      error = repSEMetadata.get( 'Message', repSEMetadata.get( 'Value', {} ).get( 'Failed', {} ).get( pfn ) )
      if error:
        log.warn( 'unable to get metadata at %s for %s' % ( repSEName, opFile.LFN ), error.replace( '\n', '' ) )
        if 'File does not exist' in error:
          ret['NoReplicas'].append( repSEName )
        else:
          ret["NoMetadata"].append( repSEName )
      else:
        repSEMetadata = repSEMetadata['Value']['Successful'][pfn]
        seChecksum = repSEMetadata.get( "Checksum" )
        if opFile.Checksum and seChecksum and not compareAdler( seChecksum, opFile.Checksum ) :
          # The checksum in the request may be wrong, check with FC
          fcMetadata = FileCatalog().getFileMetadata( opFile.LFN )
          fcChecksum = fcMetadata.get( 'Value', {} ).get( 'Successful', {} ).get( opFile.LFN, {} ).get( 'Checksum' )
          if fcChecksum and fcChecksum != opFile.Checksum and compareAdler( fcChecksum , seChecksum ):
            # SE and catalog agree: trust them over the request checksum
            opFile.Checksum = fcChecksum
            ret['Valid'].append( repSEName )
          else:
            log.warn( " %s checksum mismatch, request: %s @%s: %s" % ( opFile.LFN, opFile.Checksum, repSEName, seChecksum ) )
            ret["Bad"].append( repSEName )
        else:
          # # if we're here repSE is OK
          ret["Valid"].append( repSEName )

  return S_OK( ret )
def getFile(lfn, se=''):
  """Download an LFN, preferring the given SE when it holds a replica.

  :param lfn: logical file name to download
  :param se: optional SE name to try first; when empty, download from any SE
  :return: S_OK({lfn: {'DownloadOK': 1|2, 'Retry': attempts}}) on success
           (1 = fetched from the requested SE, 2 = fetched via DataManager),
           otherwise S_ERROR with the last error message
  """
  dm = DataManager()
  download_ok = 0
  get_active_replicas_ok = False
  lfn_on_se = False
  error_msg = ''
  if se:
    # Check (with a few quick retries) whether the preferred SE has a replica
    for i in range(0, 5):
      result = dm.getActiveReplicas(lfn)
      if result['OK'] and result['Value']['Successful']:
        get_active_replicas_ok = True
        lfnReplicas = result['Value']['Successful']
        # BUGFIX: use .get() — the LFN may appear only under 'Failed', in
        # which case lfnReplicas[lfn] would raise a KeyError
        if se in lfnReplicas.get(lfn, {}):
          lfn_on_se = True
        break
      time.sleep(3)
      print('- Get replicas for %s failed, try again' % lfn)
    if not get_active_replicas_ok:
      return S_ERROR('Get replicas error: %s' % lfn)
  if lfn_on_se:
    se = StorageElement(se)
    # try 5 times
    for j in range(0, 5):
      result = se.getFile(lfn)
      # has_key() replaced with 'in' (works on both Python 2 and 3);
      # 'lfn in dict' is False for an empty dict, preserving behavior
      if result['OK'] and lfn in result['Value']['Successful']:
        break
      time.sleep(random.randint(180, 600))
      print('- %s getStorageFile(%s) failed, try again' % (lfn, se))
    if result['OK']:
      if lfn in result['Value']['Successful']:
        download_ok = 1
      else:
        error_msg = 'Downloading %s from SE %s error!' % (lfn, se)
    else:
      error_msg = result['Message']
  else:
    if se:
      print('File %s not found on SE "%s" after %s tries, trying other SE' % (lfn, se, i + 1))
    # try 5 times
    for j in range(0, 5):
      result = dm.getFile(lfn)
      if result['OK'] and lfn in result['Value']['Successful']:
        break
      time.sleep(random.randint(180, 600))
      print('- getFile(%s) failed, try again' % lfn)
    if result['OK']:
      if lfn in result['Value']['Successful']:
        download_ok = 2
      else:
        error_msg = 'Downloading %s from random SE error!' % lfn
    else:
      error_msg = result['Message']
  if download_ok:
    return S_OK({lfn: {'DownloadOK': download_ok, 'Retry': j + 1}})
  return S_ERROR(error_msg)
def filterReplicas(opFile, logger=None, dataManager=None):
    """Sort the replicas of ``opFile`` into quality buckets.

    Falls back to inactive replicas when no active one exists, and fills in
    the checksum from the File Catalog when the request carries none.
    May mutate ``opFile`` (``Status``, ``Error``, ``Checksum``, ``ChecksumType``).

    :return: S_OK with SE-name lists keyed Valid/NoMetadata/Bad/NoReplicas/NoPFN,
             a failed lookup structure, or S_ERROR when the file is unknown
    """
    if logger is None:
        logger = gLogger
    if dataManager is None:
        dataManager = DataManager()

    log = logger.getSubLogger("filterReplicas")
    ret = {"Valid": [], "NoMetadata": [], "Bad": [], "NoReplicas": [], "NoPFN": []}

    replicas = dataManager.getActiveReplicas(opFile.LFN)
    if not replicas["OK"]:
        log.error("Failed to get active replicas", replicas["Message"])
        return replicas
    # Matches the "no such file" wording of catalog error messages
    reNotExists = re.compile(r".*such file.*")
    replicas = replicas["Value"]
    failed = replicas["Failed"].get(opFile.LFN, "")
    if reNotExists.match(failed.lower()):
        # The catalog reports the file as non-existent: fail the request file
        opFile.Status = "Failed"
        opFile.Error = failed
        return S_ERROR(failed)

    replicas = replicas["Successful"].get(opFile.LFN, {})
    noReplicas = False
    if not replicas:
        # No active replica: look at all (possibly inactive) replicas
        allReplicas = dataManager.getReplicas(opFile.LFN)
        if allReplicas["OK"]:
            allReplicas = allReplicas["Value"]["Successful"].get(opFile.LFN, {})
            if not allReplicas:
                ret["NoReplicas"].append(None)
                noReplicas = True
            else:
                # We try inactive replicas to see if maybe the file doesn't exist at all
                replicas = allReplicas
            log.warn("File has no%s replica in File Catalog" % ("" if noReplicas else " active"), opFile.LFN)
        else:
            return allReplicas

    if not opFile.Checksum:
        # Set Checksum to FC checksum if not set in the request
        fcMetadata = FileCatalog().getFileMetadata(opFile.LFN)
        fcChecksum = fcMetadata.get("Value", {}).get("Successful", {}).get(opFile.LFN, {}).get("Checksum")
        # Replace opFile.Checksum if it doesn't match a valid FC checksum
        if fcChecksum:
            opFile.Checksum = fcChecksum
            opFile.ChecksumType = fcMetadata["Value"]["Successful"][opFile.LFN].get("ChecksumType", "Adler32")

    for repSEName in replicas:
        repSEMetadata = StorageElement(repSEName).getFileMetadata(opFile.LFN)
        # Failure may be a global Message or a per-LFN entry in Failed
        error = repSEMetadata.get("Message", repSEMetadata.get("Value", {}).get("Failed", {}).get(opFile.LFN))
        if error:
            log.warn("unable to get metadata at %s for %s" % (repSEName, opFile.LFN), error.replace("\n", ""))
            if "File does not exist" in error:
                ret["NoReplicas"].append(repSEName)
            else:
                ret["NoMetadata"].append(repSEName)
        elif not noReplicas:
            repSEMetadata = repSEMetadata["Value"]["Successful"][opFile.LFN]
            seChecksum = repSEMetadata.get("Checksum")
            # Reconcile request and SE checksums: drop or adopt as appropriate
            if not seChecksum and opFile.Checksum:
                opFile.Checksum = None
                opFile.ChecksumType = None
            elif seChecksum and not opFile.Checksum:
                opFile.Checksum = seChecksum
            if not opFile.Checksum or not seChecksum or compareAdler(seChecksum, opFile.Checksum):
                # # All checksums are OK
                ret["Valid"].append(repSEName)
            else:
                log.warn(
                    " %s checksum mismatch, FC: '%s' @%s: '%s'" % (opFile.LFN, opFile.Checksum, repSEName, seChecksum)
                )
                ret["Bad"].append(repSEName)
        else:
            # If a replica was found somewhere, don't set the file as no replicas
            ret["NoReplicas"] = []

    return S_OK(ret)
class SoftwareManager(object):
    """ Manage software setup """

    def __init__(self, soft_category):
        """ Constructor

        :param soft_category: dict mapping package name -> software category,
                              used to build installation paths
        """
        self.CVMFS_DIR = '/cvmfs/sw.cta-observatory.org/software'
        self.LFN_ROOT = '/vo.cta.in2p3.fr/software'
        self.SOFT_CATEGORY_DICT = soft_category
        self.dm = DataManager()

    def _search_software(self, package, version, compiler, use_cvmfs):
        ''' Look for sotfware package

        :return: S_OK({'Source': 'cvmfs'|'tarball', 'Path': package_dir}) or S_ERROR
        '''
        # software package category
        category = self.SOFT_CATEGORY_DICT[package]
        # look for software on cvmfs
        if use_cvmfs:
            package_dir = os.path.join(self.CVMFS_DIR, 'centos7', compiler,
                                       category, package, version)
            if os.path.isdir(package_dir):
                DIRAC.gLogger.notice('Found package %s version %s at:\n%s'
                                     % (package, version, package_dir))
                return DIRAC.S_OK({'Source':'cvmfs', 'Path':package_dir})
            else:
                DIRAC.gLogger.warn('%s\n not found on cvmfs'%package_dir)
        # look for tarball in the Dirac file catalog
        else:
            package_dir = os.path.join(self.LFN_ROOT, 'centos7', compiler,
                                       category, package, version)
            DIRAC.gLogger.notice('Looking for tarball in %s'%package_dir)
            results = self.dm.getFilesFromDirectory(package_dir)
            try:
                first_file_path = results['Value'][0]
                if first_file_path[-7:] == '.tar.gz':
                    results = self.dm.getActiveReplicas(first_file_path)
                    if results['OK']:
                        return DIRAC.S_OK({'Source':'tarball', 'Path':package_dir})
            # narrowed from a bare "except:"; only the expected lookup failures
            # (missing 'Value', empty directory listing, non-dict result)
            except (KeyError, IndexError, TypeError):
                DIRAC.gLogger.warn('No usual tarball found in the directory')
        return DIRAC.S_ERROR('Could not find package %s / %s / %s in any location'
                             % (package, version, compiler))

    def find_software(self, package, version, compiler='gcc48_default'):
        """ check if the software package is installed in any software area

        Keyword arguments:
        package -- package name as the directory name
        version -- software version as the directory name
        compiler -- compiler version and configuration
        """
        # first check if cvmfs is available
        ops_helper = Operations()
        use_cvmfs = ops_helper.getValue('SoftwarePolicy/UseCvmfs', bool)
        DIRAC.gLogger.notice('SoftwarePolicy for UseCvmfs is:', use_cvmfs)

        # get platform and cpu information
        try:
            os_name, cpu_name, inst = get_os_and_cpu_info()
            DIRAC.gLogger.notice('Running %s on a %s ' %(os_name, cpu_name))
        except Exception:  # narrowed from bare "except:"
            inst = 'sse4'
            DIRAC.gLogger.warn('Could not determine platform and cpu information')

        # resolve 'matchcpu' to the concrete instruction-set build first
        if compiler == 'gcc48_matchcpu':
            compiler = 'gcc48_%s'%inst

        known_compilers = ('gcc48_default', 'gcc48_sse4', 'gcc48_avx',
                           'gcc48_avx2', 'gcc48_avx512')
        if compiler not in known_compilers:
            # BUGFIX: the error result was previously built but never returned
            return DIRAC.S_ERROR('Unknown compiler specified: %s'%compiler)

        # fall back to the sse4 build when the CPU lacks the requested
        # instruction set (all supported CPUs are assumed to have sse4)
        if compiler == 'gcc48_avx' and inst not in ['avx', 'avx2', 'avx512']:
            DIRAC.gLogger.warn('CPU has no avx instructions, running sse4 version')
            compiler = 'gcc48_sse4'
        elif compiler == 'gcc48_avx2' and inst not in ['avx2', 'avx512']:
            DIRAC.gLogger.warn('CPU has no avx2 instructions, running sse4 version')
            compiler = 'gcc48_sse4'
        # BUGFIX: was "inst is 'avx512'" — identity comparison on a runtime
        # string is unreliable; use equality
        elif compiler == 'gcc48_avx512' and inst != 'avx512':
            DIRAC.gLogger.warn('CPU has no avx512 instructions, running sse4 version')
            compiler = 'gcc48_sse4'

        return self._search_software(package, version, compiler, use_cvmfs)

    def install_dirac_scripts(self, package_dir):
        """ copy DIRAC scripts in the current directory """
        dirac_scripts = glob.glob(os.path.join(package_dir, 'dirac_*'))
        try:
            for one_file in dirac_scripts:
                shutil.copy2(one_file, '.')
            return DIRAC.S_OK()
        except shutil.Error as error:
            return DIRAC.S_ERROR('Failed to install DIRAC scripts:\n%s'%error)

    def dump_setup_script_path(self, package_dir, textfilename = 'setup_script_path.txt'):
        """ dump the path to setupPackage.sh in a one line ascii file
            to be read and source by the following script
        """
        script_path = os.path.join(package_dir, 'setupPackage.sh')
        # use a context manager so the file handle is always closed
        with open(textfilename, 'w') as textfile:
            textfile.writelines(script_path + '\n')
        return DIRAC.S_OK()

    def install_software(self, tar_lfn, target_dir='.'):
        """ install software package in the current directory """
        DIRAC.gLogger.notice('Installing package at %s'%tar_lfn)

        # Download the tar file
        DIRAC.gLogger.notice('Trying to download package:', tar_lfn)
        res = self.dm.getFile(tar_lfn)
        if not res['OK']:
            return res

        if tar_lfn in res['Value']['Successful']:
            DIRAC.gLogger.notice(' Package downloaded successfully:', tar_lfn)
        else:
            error = 'Failed to download package:', tar_lfn
            return DIRAC.S_ERROR(error)

        # Extract the tar file to the target directory
        # NOTE(review): tarfile.extract does not sanitize member paths; a
        # malicious archive could write outside target_dir — the tarball is
        # assumed to come from the trusted software repository
        tar_mode = "r|*"
        with tarfile.open(tar_lfn, tar_mode) as tar:
            for tarInfo in tar:
                tar.extract(tarInfo, target_dir)
        os.unlink(tar_lfn)

        # Done
        DIRAC.gLogger.notice('Package %s installed successfully at:\n%s'
                             %(tar_lfn, target_dir))
        return DIRAC.S_OK(target_dir)
class InputDataAgent( OptimizerModule ):
  """
      The specific Optimizer must provide the following methods:
      - initializeOptimizer() before each execution cycle
      - checkJob() - the main method called for each job
  """

  #############################################################################
  def initializeOptimizer( self ):
    """Initialize specific parameters for JobSanityAgent.
    """
    self.failedMinorStatus = self.am_getOption( '/FailedJobStatus', 'Input Data Not Available' )
    #this will ignore failover SE files
    self.checkFileMetadata = self.am_getOption( 'CheckFileMetadata', True )

    self.dataManager = DataManager()
    self.resourceStatus = ResourceStatus()
    self.fc = FileCatalog()

    # Cache of SE -> site mapping, reset every self.cacheLength seconds
    self.seToSiteMapping = {}
    self.lastCScheck = 0
    self.cacheLength = 600

    return S_OK()

  #############################################################################
  def checkJob( self, job, classAdJob ):
    """
    This method does the optimization corresponding to this Agent,
    it is call for each job by the Optimizer framework
    """
    result = self.jobDB.getInputData( job )
    if not result['OK']:
      self.log.warn( 'Failed to get input data from JobdB for %s' % ( job ) )
      self.log.warn( result['Message'] )
      return result
    if not result['Value']:
      self.log.verbose( 'Job %s has no input data requirement' % ( job ) )
      return self.setNextOptimizer( job )

    #Check if we already executed this Optimizer and the input data is resolved
    res = self.getOptimizerJobInfo( job, self.am_getModuleParam( 'optimizerName' ) )
    if res['OK'] and len( res['Value'] ):
      # Input data already resolved on a previous pass: nothing to do
      pass
    else:
      self.log.verbose( 'Job %s has an input data requirement and will be processed' % ( job ) )
      inputData = result['Value']
      result = self.__resolveInputData( job, inputData )
      if not result['OK']:
        self.log.warn( result['Message'] )
        return result

    return self.setNextOptimizer( job )

  #############################################################################
  def __resolveInputData( self, job, inputData ):
    """This method checks the file catalog for replica information.
    """
    lfns = [ fname.replace( 'LFN:', '' ) for fname in inputData ]

    start = time.time()
    # In order to place jobs on Hold if a certain SE is banned we need first to check first if
    # if the replicas are really available
    replicas = self.dataManager.getActiveReplicas( lfns )
    timing = time.time() - start
    self.log.verbose( 'Catalog Replicas Lookup Time: %.2f seconds ' % ( timing ) )
    if not replicas['OK']:
      self.log.warn( replicas['Message'] )
      return replicas

    replicaDict = replicas['Value']

    siteCandidates = self.__checkReplicas( job, replicaDict )

    if not siteCandidates['OK']:
      self.log.warn( siteCandidates['Message'] )
      return siteCandidates

    if self.checkFileMetadata:
      guids = True
      start = time.time()
      guidDict = self.fc.getFileMetadata( lfns )
      timing = time.time() - start
      self.log.info( 'Catalog Metadata Lookup Time: %.2f seconds ' % ( timing ) )

      if not guidDict['OK']:
        self.log.warn( guidDict['Message'] )
        guids = False

      # NOTE(review): when guidDict['OK'] is false, guidDict may have no
      # 'Value' key and the next line would raise KeyError — confirm the
      # FileCatalog error contract before relying on this path
      failed = guidDict['Value']['Failed']
      if failed:
        self.log.warn( 'Failed to establish some GUIDs' )
        self.log.warn( failed )
        guids = False

      if guids:
        # Merge replica info into the per-LFN metadata records
        for lfn, reps in replicaDict['Successful'].items():
          guidDict['Value']['Successful'][lfn].update( reps )
        replicas = guidDict

    resolvedData = {}
    resolvedData['Value'] = replicas
    resolvedData['SiteCandidates'] = siteCandidates['Value']
    result = self.setOptimizerJobInfo( job, self.am_getModuleParam( 'optimizerName' ), resolvedData )
    if not result['OK']:
      self.log.warn( result['Message'] )
      return result
    return S_OK( resolvedData )

  #############################################################################
  def __checkReplicas( self, job, replicaDict ):
    """Check that all input lfns have valid replicas and can all be found at least in one single site.
    """
    badLFNs = []

    if replicaDict.has_key( 'Successful' ):
      for lfn, reps in replicaDict['Successful'].items():
        if not reps:
          badLFNs.append( 'LFN:%s Problem: No replicas available' % ( lfn ) )
    else:
      return S_ERROR( 'No replica Info available' )

    if replicaDict.has_key( 'Failed' ):
      for lfn, cause in replicaDict['Failed'].items():
        badLFNs.append( 'LFN:%s Problem: %s' % ( lfn, cause ) )

    if badLFNs:
      # Record the problem as a job parameter and fail the optimization
      self.log.info( 'Found %s problematic LFN(s) for job %s' % ( len( badLFNs ), job ) )
      param = '\n'.join( badLFNs )
      self.log.info( param )
      result = self.setJobParam( job, self.am_getModuleParam( 'optimizerName' ), param )
      if not result['OK']:
        self.log.error( result['Message'] )
      return S_ERROR( 'Input Data Not Available' )

    return self.__getSiteCandidates( replicaDict['Successful'] )

  #############################################################################
  # FIXME: right now this is unused...
  def __checkActiveSEs( self, job, replicaDict ):
    """
    Check active SE and replicas and identify possible Site candidates for
    the execution of the job
    """
    # Now let's check if some replicas might not be available due to banned SE's
    activeReplicas = self.dataManager.checkActiveReplicas( replicaDict )
    if not activeReplicas['OK']:
      # due to banned SE's input data might no be available
      msg = "On Hold: Missing replicas due to banned SE"
      self.log.info( msg )
      self.log.warn( activeReplicas['Message'] )
      return S_ERROR( msg )

    activeReplicaDict = activeReplicas['Value']

    siteCandidates = self.__checkReplicas( job, activeReplicaDict )

    if not siteCandidates['OK']:
      # due to a banned SE's input data is not available at a single site
      msg = "On Hold: Input data not Available due to banned SE"
      self.log.info( msg )
      self.log.warn( siteCandidates['Message'] )
      return S_ERROR( msg )

    resolvedData = {}
    resolvedData['Value'] = activeReplicas
    resolvedData['SiteCandidates'] = siteCandidates['Value']
    result = self.setOptimizerJobInfo( job, self.am_getModuleParam( 'optimizerName' ), resolvedData )
    if not result['OK']:
      self.log.warn( result['Message'] )
      return result
    return S_OK( resolvedData )

  #############################################################################
  def __getSitesForSE( self, se ):
    """ Returns a list of sites having the given SE as a local one.
        Uses the local cache of the site-se information
    """
    # Empty the cache if too old
    if ( time.time() - self.lastCScheck ) > self.cacheLength:
      self.log.verbose( 'Resetting the SE to site mapping cache' )
      self.seToSiteMapping = {}
      self.lastCScheck = time.time()

    if se not in self.seToSiteMapping:
      sites = getSitesForSE( se )
      if sites['OK']:
        self.seToSiteMapping[se] = list( sites['Value'] )
      return sites
    else:
      return S_OK( self.seToSiteMapping[se] )

  #############################################################################
  def __getSiteCandidates( self, inputData ):
    """This method returns a list of possible site candidates based on the
       job input data requirement.  For each site candidate, the number of files
       on disk and tape is resolved.
    """
    fileSEs = {}
    for lfn, replicas in inputData.items():
      siteList = []
      for se in replicas.keys():
        sites = self.__getSitesForSE( se )
        if sites['OK']:
          siteList += sites['Value']
      fileSEs[lfn] = uniqueElements( siteList )

    # Intersect the per-file site lists: a candidate site must hold every file
    siteCandidates = []
    i = 0
    for _fileName, sites in fileSEs.items():
      if not i:
        siteCandidates = sites
      else:
        tempSite = []
        for site in siteCandidates:
          if site in sites:
            tempSite.append( site )
        siteCandidates = tempSite
      i += 1

    if not len( siteCandidates ):
      return S_ERROR( 'No candidate sites available' )

    #In addition, check number of files on tape and disk for each site
    #for optimizations during scheduling
    siteResult = {}
    for site in siteCandidates:
      siteResult[site] = { 'disk': [], 'tape': [] }

    seDict = {}
    for lfn, replicas in inputData.items():
      for se in replicas.keys():
        if se not in seDict:
          sites = self.__getSitesForSE( se )
          if not sites['OK']:
            continue
          try:
            #storageElement = StorageElement( se )
            result = self.resourceStatus.getStorageElementStatus( se, statusType = 'ReadAccess' )
            if not result['OK']:
              continue
            seDict[se] = { 'Sites': sites['Value'], 'SEParams': result['Value'][se] }
            result = getStorageElementOptions( se )
            if not result['OK']:
              continue
            seDict[se]['SEParams'].update(result['Value'])
          except Exception:
            self.log.exception( 'Failed to instantiate StorageElement( %s )' % se )
            continue
        for site in seDict[se]['Sites']:
          if site in siteCandidates:
            # A readable disk replica counts as 'disk' and supersedes 'tape'
            if seDict[se]['SEParams']['ReadAccess'] and seDict[se]['SEParams']['DiskSE']:
              if lfn not in siteResult[site]['disk']:
                siteResult[site]['disk'].append( lfn )
                if lfn in siteResult[site]['tape']:
                  siteResult[site]['tape'].remove( lfn )
            if seDict[se]['SEParams']['ReadAccess'] and seDict[se]['SEParams']['TapeSE']:
              if lfn not in siteResult[site]['tape'] and lfn not in siteResult[site]['disk']:
                siteResult[site]['tape'].append( lfn )

    # Report counts, not the LFN lists themselves
    for site in siteResult:
      siteResult[site]['disk'] = len( siteResult[site]['disk'] )
      siteResult[site]['tape'] = len( siteResult[site]['tape'] )
    return S_OK( siteResult )
def filterReplicas(opFile, logger=None, dataManager=None):
    """Sort the replicas of ``opFile`` into quality buckets.

    Validates checksums as Adler32 integers (via ``hexAdlerToInt``) and
    repairs invalid request checksums from the File Catalog or the SE.
    May mutate ``opFile`` (``Status``, ``Error``, ``Checksum``, ``ChecksumType``).

    :return: S_OK(defaultdict(list)) with SE names keyed by bucket
             (Valid/NoMetadata/Bad/NoReplicas/NoActiveReplicas/...),
             a failed lookup structure, or S_ERROR when the file is unknown
    """
    if logger is None:
        logger = gLogger
    if dataManager is None:
        dataManager = DataManager()
    log = logger.getSubLogger("filterReplicas")
    # defaultdict so buckets can be appended to without pre-declaring them
    result = defaultdict(list)

    replicas = dataManager.getActiveReplicas(opFile.LFN, getUrl=False)
    if not replicas["OK"]:
        log.error('Failed to get active replicas', replicas["Message"])
        return replicas
    # Matches the "no such file" wording of catalog error messages
    reNotExists = re.compile(r".*such file.*")
    replicas = replicas["Value"]
    failed = replicas["Failed"].get(opFile.LFN, "")
    if reNotExists.match(failed.lower()):
        # The catalog reports the file as non-existent: fail the request file
        opFile.Status = "Failed"
        opFile.Error = failed
        return S_ERROR(failed)

    replicas = replicas["Successful"].get(opFile.LFN, {})
    noReplicas = False
    if not replicas:
        # No active replica: look at all (possibly inactive) replicas
        allReplicas = dataManager.getReplicas(opFile.LFN, getUrl=False)
        if allReplicas['OK']:
            allReplicas = allReplicas['Value']['Successful'].get(opFile.LFN, {})
            if not allReplicas:
                result['NoReplicas'].append(None)
                noReplicas = True
            else:
                # There are replicas but we cannot get metadata because the replica is not active
                result['NoActiveReplicas'] += list(allReplicas)
            log.verbose("File has no%s replica in File Catalog" % ('' if noReplicas else ' active'), opFile.LFN)
        else:
            return allReplicas

    if not opFile.Checksum or hexAdlerToInt(opFile.Checksum) is False:
        # Set Checksum to FC checksum if not set in the request
        fcMetadata = FileCatalog().getFileMetadata(opFile.LFN)
        fcChecksum = fcMetadata.get(
            'Value', {}).get(
            'Successful', {}).get(
            opFile.LFN, {}).get('Checksum')
        # Replace opFile.Checksum if it doesn't match a valid FC checksum
        if fcChecksum:
            if hexAdlerToInt(fcChecksum) is not False:
                opFile.Checksum = fcChecksum
                opFile.ChecksumType = fcMetadata['Value']['Successful'][opFile.LFN].get('ChecksumType', 'Adler32')
            else:
                # FC checksum is itself invalid: discard the checksum entirely
                opFile.Checksum = None

    # If no replica was found, return what we collected as information
    if not replicas:
        return S_OK(result)

    for repSEName in replicas:
        repSEMetadata = StorageElement(repSEName).getFileMetadata(opFile.LFN)
        # Failure may be a global Message or a per-LFN entry in Failed
        error = repSEMetadata.get('Message', repSEMetadata.get('Value', {}).get('Failed', {}).get(opFile.LFN))
        if error:
            log.warn('unable to get metadata at %s for %s' % (repSEName, opFile.LFN), error.replace('\n', ''))
            if 'File does not exist' in error:
                result['NoReplicas'].append(repSEName)
            else:
                result["NoMetadata"].append(repSEName)
        elif not noReplicas:
            repSEMetadata = repSEMetadata['Value']['Successful'][opFile.LFN]
            seChecksum = hexAdlerToInt(repSEMetadata.get("Checksum"))
            # As from here seChecksum is an integer or False, not a hex string!
            if seChecksum is False and opFile.Checksum:
                result['NoMetadata'].append(repSEName)
            elif not seChecksum and opFile.Checksum:
                opFile.Checksum = None
                opFile.ChecksumType = None
            elif seChecksum and (not opFile.Checksum or opFile.Checksum == 'False'):
                # Use the SE checksum (convert to hex) and force type to be Adler32
                opFile.Checksum = intAdlerToHex(seChecksum)
                opFile.ChecksumType = 'Adler32'
            if not opFile.Checksum or not seChecksum or compareAdler(
                    intAdlerToHex(seChecksum), opFile.Checksum):
                # # All checksums are OK
                result["Valid"].append(repSEName)
            else:
                log.warn(" %s checksum mismatch, FC: '%s' @%s: '%s'" %
                         (opFile.LFN, opFile.Checksum, repSEName, intAdlerToHex(seChecksum)))
                result["Bad"].append(repSEName)
        else:
            # If a replica was found somewhere, don't set the file as no replicas
            result['NoReplicas'] = []

    return S_OK(result)
class RequestPreparationAgent( AgentModule ):
  """Stager agent step: validates 'New' cache replicas against the File
  Catalog (existence, size, replica location) and moves the good ones to
  'Waiting' while terminally failing the bad ones."""

  def initialize( self ):
    """Create the clients used by the agent and configure the shifter proxy."""
    self.fileCatalog = FileCatalog()
    self.dm = DataManager()
    self.stagerClient = StorageManagerClient()
    self.dataIntegrityClient = DataIntegrityClient()
    # This sets the Default Proxy to used as that defined under
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    return S_OK()

  def execute( self ):
    """ This is the first logical task to be executed and manages the New->Waiting transition of the Replicas
    """
    res = self.__getNewReplicas()
    if not res['OK']:
      gLogger.fatal( "RequestPreparation.prepareNewReplicas: Failed to get replicas from StagerDB.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.info( "There were no New replicas found" )
      return res
    replicas = res['Value']['Replicas']
    replicaIDs = res['Value']['ReplicaIDs']
    gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained %s New replicas for preparation." % len( replicaIDs ) )

    # Check if the files exist in the FileCatalog
    res = self.__getExistingFiles( replicas )
    if not res['OK']:
      return res
    exist = res['Value']['Exist']
    terminal = res['Value']['Missing']
    failed = res['Value']['Failed']
    if not exist:
      gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed to determine the existence of any file' )
      return S_OK()
    # Replicas of files missing from the catalog are terminally failed
    terminalReplicaIDs = {}
    for lfn, reason in terminal.items():
      for replicaID in replicas[lfn].values():
        terminalReplicaIDs[replicaID] = reason
      replicas.pop( lfn )
    gLogger.info( "RequestPreparation.prepareNewReplicas: %s files exist in the FileCatalog." % len( exist ) )
    if terminal:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s files do not exist in the FileCatalog." % len( terminal ) )

    # Obtain the file sizes from the FileCatalog
    res = self.__getFileSize( exist )
    if not res['OK']:
      return res
    failed.update( res['Value']['Failed'] )
    terminal = res['Value']['ZeroSize']
    fileSizes = res['Value']['FileSizes']
    if not fileSizes:
      gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed determine sizes of any files' )
      return S_OK()
    # Zero-size files are terminally failed as well
    for lfn, reason in terminal.items():
      for _se, replicaID in replicas[lfn].items():
        terminalReplicaIDs[replicaID] = reason
      replicas.pop( lfn )
    gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained %s file sizes from the FileCatalog." % len( fileSizes ) )
    if terminal:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s files registered with zero size in the FileCatalog." % len( terminal ) )

    # Obtain the replicas from the FileCatalog
    res = self.__getFileReplicas( fileSizes.keys() )
    if not res['OK']:
      return res
    failed.update( res['Value']['Failed'] )
    terminal = res['Value']['ZeroReplicas']
    fileReplicas = res['Value']['Replicas']
    if not fileReplicas:
      gLogger.error( 'RequestPreparation.prepareNewReplicas: Failed determine replicas for any files' )
      return S_OK()
    # Files with no registered replicas are terminally failed
    for lfn, reason in terminal.items():
      for _se, replicaID in replicas[lfn].items():
        terminalReplicaIDs[replicaID] = reason
      replicas.pop( lfn )
    gLogger.info( "RequestPreparation.prepareNewReplicas: Obtained replica information for %s file from the FileCatalog." % len( fileReplicas ) )
    if terminal:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s files registered with zero replicas in the FileCatalog." % len( terminal ) )

    # Check the replicas exist at the requested site
    replicaMetadata = []
    for lfn, requestedSEs in replicas.items():
      lfnReplicas = fileReplicas.get( lfn )

      # This should not happen in principle, but it was seen
      # after a corrupted staging request has entered the DB
      if not lfnReplicas:
        gLogger.error( "Missing replicas information", "%s %s" % ( lfn, requestedSEs ) )
        continue

      for requestedSE, replicaID in requestedSEs.items():
        if not requestedSE in lfnReplicas.keys():
          terminalReplicaIDs[replicaID] = "LFN not registered at requested SE"
          replicas[lfn].pop( requestedSE )
        else:
          replicaMetadata.append( ( replicaID, lfnReplicas[requestedSE], fileSizes[lfn] ) )

    # Update the states of the files in the database
    if terminalReplicaIDs:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s replicas are terminally failed." % len( terminalReplicaIDs ) )
      # res = self.stagerClient.updateReplicaFailure( terminalReplicaIDs )
      res = self.stagerClient.updateReplicaFailure( terminalReplicaIDs )
      if not res['OK']:
        gLogger.error( "RequestPreparation.prepareNewReplicas: Failed to update replica failures.", res['Message'] )
    if replicaMetadata:
      gLogger.info( "RequestPreparation.prepareNewReplicas: %s replica metadata to be updated." % len( replicaMetadata ) )
      # Sets the Status='Waiting' of CacheReplicas records that are OK with catalogue checks
      res = self.stagerClient.updateReplicaInformation( replicaMetadata )
      if not res['OK']:
        gLogger.error( "RequestPreparation.prepareNewReplicas: Failed to update replica metadata.", res['Message'] )
    return S_OK()

  def __getNewReplicas( self ):
    """ This obtains the New replicas from the Replicas table and for each LFN the requested storage element """
    # First obtain the New replicas from the CacheReplicas table
    res = self.stagerClient.getCacheReplicas( {'Status':'New'} )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__getNewReplicas: Failed to get replicas with New status.", res['Message'] )
      return res
    if not res['Value']:
      gLogger.debug( "RequestPreparation.__getNewReplicas: No New replicas found to process." )
      return S_OK()
    else:
      gLogger.debug( "RequestPreparation.__getNewReplicas: Obtained %s New replicas(s) to process." % len( res['Value'] ) )
    # Index replicas both by LFN->SE->ID and by ID->(LFN, SE)
    replicas = {}
    replicaIDs = {}
    for replicaID, info in res['Value'].items():
      lfn = info['LFN']
      storageElement = info['SE']
      replicas.setdefault( lfn, {} )[storageElement] = replicaID
      replicaIDs[replicaID] = ( lfn, storageElement )
    return S_OK( {'Replicas':replicas, 'ReplicaIDs':replicaIDs} )

  def __getExistingFiles( self, lfns ):
    """ This checks that the files exist in the FileCatalog.
    """
    res = self.fileCatalog.exists( list( set( lfns ) ) )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__getExistingFiles: Failed to determine whether files exist.", res['Message'] )
      return res
    failed = res['Value']['Failed']
    success = res['Value']['Successful']
    exist = [lfn for lfn, exists in success.items() if exists]
    missing = list( set( success ) - set( exist ) )
    if missing:
      reason = 'LFN not registered in the FC'
      gLogger.warn( "RequestPreparation.__getExistingFiles: %s" % reason, '\n'.join( [''] + missing ) )
      self.__reportProblematicFiles( missing, 'LFN-LFC-DoesntExist' )
      missing = dict.fromkeys( missing, reason )
    else:
      missing = {}
    return S_OK( {'Exist':exist, 'Missing':missing, 'Failed':failed} )

  def __getFileSize( self, lfns ):
    """ This obtains the file size from the FileCatalog. """
    fileSizes = {}
    zeroSize = {}
    res = self.fileCatalog.getFileSize( lfns )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__getFileSize: Failed to get sizes for files.", res['Message'] )
      return res
    failed = res['Value']['Failed']
    for lfn, size in res['Value']['Successful'].items():
      if size == 0:
        zeroSize[lfn] = "LFN registered with zero size in the FileCatalog"
      else:
        fileSizes[lfn] = size
    if zeroSize:
      for lfn, reason in zeroSize.items():
        gLogger.warn( "RequestPreparation.__getFileSize: %s" % reason, lfn )
      self.__reportProblematicFiles( zeroSize.keys(), 'LFN-LFC-ZeroSize' )
    return S_OK( {'FileSizes':fileSizes, 'ZeroSize':zeroSize, 'Failed':failed} )

  def __getFileReplicas( self, lfns ):
    """ This obtains the replicas from the FileCatalog.
    """
    replicas = {}
    noReplicas = {}
    res = self.dm.getActiveReplicas( lfns )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__getFileReplicas: Failed to obtain file replicas.", res['Message'] )
      return res
    failed = res['Value']['Failed']
    for lfn, lfnReplicas in res['Value']['Successful'].items():
      if len( lfnReplicas.keys() ) == 0:
        noReplicas[lfn] = "LFN registered with zero replicas in the FileCatalog"
      else:
        replicas[lfn] = lfnReplicas
    if noReplicas:
      for lfn, reason in noReplicas.items():
        gLogger.warn( "RequestPreparation.__getFileReplicas: %s" % reason, lfn )
      self.__reportProblematicFiles( noReplicas.keys(), 'LFN-LFC-NoReplicas' )
    return S_OK( {'Replicas':replicas, 'ZeroReplicas':noReplicas, 'Failed':failed} )

  def __reportProblematicFiles( self, lfns, reason ):
    # NOTE(review): this early return deliberately(?) disables problematic-file
    # reporting — everything below it is dead code; confirm whether the
    # DataIntegrityClient reporting should be re-enabled or removed
    return S_OK()
    res = self.dataIntegrityClient.setFileProblematic( lfns, reason, sourceComponent = 'RequestPreparationAgent' )
    if not res['OK']:
      gLogger.error( "RequestPreparation.__reportProblematicFiles: Failed to report missing files.", res['Message'] )
      return res
    if res['Value']['Successful']:
      gLogger.info( "RequestPreparation.__reportProblematicFiles: Successfully reported %s missing files." % len( res['Value']['Successful'] ) )
    if res['Value']['Failed']:
      gLogger.info( "RequestPreparation.__reportProblematicFiles: Failed to report %s problematic files." % len( res['Value']['Failed'] ) )
    return res
def filterReplicas( opFile, logger = None, dataManager = None, seCache = None ):
  """ Filter out banned/invalid source SEs for a single operation file.

      :param opFile: operation file object; reads .LFN and .Checksum, sets
                     .Status / .Error when the catalog reports the file as
                     non-existent
      :param logger: optional logger (defaults to gLogger)
      :param dataManager: optional DataManager instance (a fresh one is
                          created when not supplied)
      :param seCache: optional dict memoising StorageElement objects across calls

      :return: S_OK( { "Valid": [...], "Banned": [...], "Bad": [...],
                       "NoReplicas": [...], "NoPFN": [...] } ) with SE names
               classified per category, or S_ERROR on lookup failure
  """
  from DIRAC.Core.Utilities.Adler import compareAdler
  if not logger:
    logger = gLogger
  if not dataManager:
    dataManager = DataManager()
  if not seCache:
    seCache = {}
  log = logger.getSubLogger( "filterReplicas" )
  ret = { "Valid" : [], "Banned" : [], "Bad" : [], 'NoReplicas':[], 'NoPFN':[] }

  replicas = dataManager.getActiveReplicas( opFile.LFN )
  if not replicas["OK"]:
    log.error( replicas["Message"] )
    return replicas
  # FIX: the pattern used to be "not such file or directory", which (with the
  # anchored re.match) could never match the actual "no such file or directory"
  # error text; use the same tolerant pattern as the newer filterReplicas
  # implementation in this file.
  reNotExists = re.compile( r".*such file.*" )
  replicas = replicas["Value"]
  failed = replicas["Failed"].get( opFile.LFN , "" )
  if reNotExists.match( failed.lower() ):
    # the file is not in the catalog at all: fail the operation file
    opFile.Status = "Failed"
    opFile.Error = failed
    return S_ERROR( failed )

  replicas = replicas["Successful"].get( opFile.LFN, {} )
  for repSEName in replicas:
    # reuse a cached StorageElement when available, create and cache otherwise
    repSE = seCache[repSEName] if repSEName in seCache else \
            seCache.setdefault( repSEName, StorageElement( repSEName ) )
    pfn = repSE.getPfnForLfn( opFile.LFN )
    if not pfn["OK"] or opFile.LFN not in pfn['Value']['Successful']:
      log.warn( "unable to create pfn for %s lfn at %s: %s" % ( opFile.LFN,
                repSEName,
                pfn.get( 'Message', pfn.get( 'Value', {} ).get( 'Failed', {} ).get( opFile.LFN ) ) ) )
      ret["NoPFN"].append( repSEName )
    else:
      pfn = pfn["Value"]['Successful'][ opFile.LFN ]
      repSEMetadata = repSE.getFileMetadata( pfn )
      # error is either the top-level Message or the per-PFN failure reason
      error = repSEMetadata.get( 'Message', repSEMetadata.get( 'Value', {} ).get( 'Failed', {} ).get( pfn ) )
      if error:
        log.warn( 'unable to get metadata at %s for %s' % ( repSEName, opFile.LFN ), error )
        if 'File does not exist' in error:
          ret['NoReplicas'].append( repSEName )
        else:
          log.verbose( "StorageElement '%s' is banned for reading" % ( repSEName ) )
          ret["Banned"].append( repSEName )
      else:
        repSEMetadata = repSEMetadata['Value']['Successful'][pfn]
        seChecksum = repSEMetadata.get( "Checksum" )
        # only compare when both sides actually carry a checksum
        if opFile.Checksum and seChecksum and not compareAdler( seChecksum, opFile.Checksum ):
          log.warn( " %s checksum mismatch: %s %s:%s" % ( opFile.LFN, opFile.Checksum, repSE, seChecksum ) )
          ret["Bad"].append( repSEName )
        else:
          # # if we're here repSE is OK
          ret["Valid"].append( repSEName )

  return S_OK( ret )
def filterReplicas( opFile, logger = None, dataManager = None ):
  """ filter out banned/invalid source SEs

      :param opFile: operation file object; reads .LFN, may update .Checksum /
                     .ChecksumType as a side effect, and sets .Status / .Error
                     when the replica lookup reports the file as non-existent
      :param logger: optional logger (defaults to gLogger)
      :param dataManager: optional DataManager (a fresh one is created if None)

      :return: S_OK( { "Valid": [...], "NoMetadata": [...], "Bad": [...],
                       "NoReplicas": [...], "NoPFN": [...] } ) or S_ERROR
  """
  if logger is None:
    logger = gLogger
  if dataManager is None:
    dataManager = DataManager()
  log = logger.getSubLogger( "filterReplicas" )
  ret = { "Valid" : [], "NoMetadata" : [], "Bad" : [], 'NoReplicas':[], 'NoPFN':[] }
  replicas = dataManager.getActiveReplicas( opFile.LFN )
  if not replicas["OK"]:
    log.error( 'Failed to get active replicas', replicas["Message"] )
    return replicas
  # matches e.g. "no such file or directory" anywhere in the failure reason
  reNotExists = re.compile( r".*such file.*" )
  replicas = replicas["Value"]
  failed = replicas["Failed"].get( opFile.LFN , "" )
  if reNotExists.match( failed.lower() ):
    # the catalog says the file does not exist: fail the operation file
    opFile.Status = "Failed"
    opFile.Error = failed
    return S_ERROR( failed )
  replicas = replicas["Successful"].get( opFile.LFN, {} )
  noReplicas = False
  if not replicas:
    # no ACTIVE replica: fall back to all (possibly inactive) replicas to
    # distinguish "file exists on a banned SE" from "file has no replica at all"
    allReplicas = dataManager.getReplicas( opFile.LFN )
    if allReplicas['OK']:
      allReplicas = allReplicas['Value']['Successful'].get( opFile.LFN, {} )
      if not allReplicas:
        ret['NoReplicas'].append( None )
        noReplicas = True
      else:
        # We try inactive replicas to see if maybe the file doesn't exist at all
        replicas = allReplicas
      log.warn( "File has no%s replica in File Catalog" % ( '' if noReplicas else ' active' ), opFile.LFN )
    else:
      return allReplicas

  if not opFile.Checksum:
    # Set Checksum to FC checksum if not set in the request
    fcMetadata = FileCatalog().getFileMetadata( opFile.LFN )
    fcChecksum = fcMetadata.get( 'Value', {} ).get( 'Successful', {} ).get( opFile.LFN, {} ).get( 'Checksum' )
    # Replace opFile.Checksum if it doesn't match a valid FC checksum
    if fcChecksum:
      opFile.Checksum = fcChecksum
      opFile.ChecksumType = fcMetadata['Value']['Successful'][opFile.LFN].get( 'ChecksumType', 'Adler32' )

  for repSEName in replicas:
    repSEMetadata = StorageElement( repSEName ).getFileMetadata( opFile.LFN )
    # error is either the top-level Message or the per-LFN failure reason
    error = repSEMetadata.get( 'Message', repSEMetadata.get( 'Value', {} ).get( 'Failed', {} ).get( opFile.LFN ) )
    if error:
      log.warn( 'unable to get metadata at %s for %s' % ( repSEName, opFile.LFN ), error.replace( '\n', '' ) )
      if 'File does not exist' in error:
        ret['NoReplicas'].append( repSEName )
      else:
        ret["NoMetadata"].append( repSEName )
    elif not noReplicas:
      repSEMetadata = repSEMetadata['Value']['Successful'][opFile.LFN]
      seChecksum = repSEMetadata.get( "Checksum" )
      # side effect: reconcile opFile.Checksum with what the SE reports when
      # one of the two is missing
      if not seChecksum and opFile.Checksum:
        opFile.Checksum = None
        opFile.ChecksumType = None
      elif seChecksum and not opFile.Checksum:
        opFile.Checksum = seChecksum
      if not opFile.Checksum or not seChecksum or compareAdler( seChecksum, opFile.Checksum ):
        # # All checksums are OK
        ret["Valid"].append( repSEName )
      else:
        log.warn( " %s checksum mismatch, FC: '%s' @%s: '%s'" % ( opFile.LFN, opFile.Checksum, repSEName, seChecksum ) )
        ret["Bad"].append( repSEName )
    else:
      # If a replica was found somewhere, don't set the file as no replicas
      # NOTE(review): reached when metadata succeeded at an SE although the FC
      # listed no replica at all -- TODO confirm intended attachment of this
      # branch against upstream history
      ret['NoReplicas'] = []
  return S_OK( ret )
class InputDataAgent(OptimizerModule):
    """
        Optimizer that resolves input-data replica information for jobs.

        The specific Optimizer must provide the following methods:
        - initializeOptimizer() before each execution cycle
        - checkJob() - the main method called for each job
    """

    #############################################################################
    def initializeOptimizer(self):
        """Initialize specific parameters for JobSanityAgent.

           Sets up the DataManager / ResourceStatus / FileCatalog clients and
           the SE-to-site mapping cache (refreshed every self.cacheLength s).
        """
        self.failedMinorStatus = self.am_getOption('/FailedJobStatus', 'Input Data Not Available')
        #this will ignore failover SE files
        self.checkFileMetadata = self.am_getOption('CheckFileMetadata', True)
        self.dataManager = DataManager()
        self.resourceStatus = ResourceStatus()
        self.fc = FileCatalog()
        self.seToSiteMapping = {}
        self.lastCScheck = 0
        # cache lifetime in seconds for the SE-to-site mapping
        self.cacheLength = 600
        return S_OK()

    #############################################################################
    def checkJob(self, job, classAdJob):
        """ This method does the optimization corresponding to this Agent,
            it is call for each job by the Optimizer framework

            :param job: job ID
            :param classAdJob: job class-ad (unused here)
            :return: S_OK / S_ERROR from the next optimizer step
        """
        result = self.jobDB.getInputData(job)
        if not result['OK']:
            self.log.warn('Failed to get input data from JobdB for %s' % (job))
            self.log.warn(result['Message'])
            return result
        if not result['Value']:
            # no input data: nothing to resolve, pass the job on
            self.log.verbose('Job %s has no input data requirement' % (job))
            return self.setNextOptimizer(job)
        #Check if we already executed this Optimizer and the input data is resolved
        res = self.getOptimizerJobInfo(job, self.am_getModuleParam('optimizerName'))
        if res['OK'] and len(res['Value']):
            # already resolved in a previous pass: skip re-resolution
            pass
        else:
            self.log.verbose('Job %s has an input data requirement and will be processed' % (job))
            inputData = result['Value']
            result = self.__resolveInputData(job, inputData)
            if not result['OK']:
                self.log.warn(result['Message'])
                return result
        return self.setNextOptimizer(job)

    #############################################################################
    def __resolveInputData(self, job, inputData):
        """This method checks the file catalog for replica information.

           Resolves active replicas (and optionally file metadata/GUIDs) for
           the job input data and stores the result as optimizer job info.
        """
        lfns = [fname.replace('LFN:', '') for fname in inputData]
        start = time.time()
        # In order to place jobs on Hold if a certain SE is banned we need first to check first if
        # if the replicas are really available
        replicas = self.dataManager.getActiveReplicas(lfns)
        timing = time.time() - start
        self.log.verbose('Catalog Replicas Lookup Time: %.2f seconds ' % (timing))
        if not replicas['OK']:
            self.log.warn(replicas['Message'])
            return replicas
        replicaDict = replicas['Value']
        siteCandidates = self.__checkReplicas(job, replicaDict)
        if not siteCandidates['OK']:
            self.log.warn(siteCandidates['Message'])
            return siteCandidates
        if self.checkFileMetadata:
            guids = True
            start = time.time()
            guidDict = self.fc.getFileMetadata(lfns)
            timing = time.time() - start
            self.log.info('Catalog Metadata Lookup Time: %.2f seconds ' % (timing))
            if not guidDict['OK']:
                self.log.warn(guidDict['Message'])
                guids = False
            # NOTE(review): accessed even when guidDict is not OK -- would
            # KeyError if 'Value' is absent from the error result; confirm
            # against the return convention of FileCatalog.getFileMetadata
            failed = guidDict['Value']['Failed']
            if failed:
                self.log.warn('Failed to establish some GUIDs')
                self.log.warn(failed)
                guids = False
            if guids:
                # merge replica info into the per-LFN metadata records
                for lfn, reps in replicaDict['Successful'].items():
                    guidDict['Value']['Successful'][lfn].update(reps)
                replicas = guidDict
        resolvedData = {}
        resolvedData['Value'] = replicas
        resolvedData['SiteCandidates'] = siteCandidates['Value']
        result = self.setOptimizerJobInfo(job, self.am_getModuleParam('optimizerName'), resolvedData)
        if not result['OK']:
            self.log.warn(result['Message'])
            return result
        return S_OK(resolvedData)

    #############################################################################
    def __checkReplicas(self, job, replicaDict):
        """Check that all input lfns have valid replicas and can all be found at least in one single site.
        """
        badLFNs = []
        if replicaDict.has_key('Successful'):
            for lfn, reps in replicaDict['Successful'].items():
                if not reps:
                    badLFNs.append('LFN:%s Problem: No replicas available' % (lfn))
        else:
            return S_ERROR('No replica Info available')
        if replicaDict.has_key('Failed'):
            for lfn, cause in replicaDict['Failed'].items():
                badLFNs.append('LFN:%s Problem: %s' % (lfn, cause))
        if badLFNs:
            # record the problem list as a job parameter, then fail the check
            self.log.info('Found %s problematic LFN(s) for job %s' % (len(badLFNs), job))
            param = '\n'.join(badLFNs)
            self.log.info(param)
            result = self.setJobParam(job, self.am_getModuleParam('optimizerName'), param)
            if not result['OK']:
                self.log.error(result['Message'])
            return S_ERROR('Input Data Not Available')
        return self.__getSiteCandidates(replicaDict['Successful'])

    #############################################################################
    # FIXME: right now this is unused...
    def __checkActiveSEs(self, job, replicaDict):
        """ Check active SE and replicas and identify possible Site candidates for
            the execution of the job
        """
        # Now let's check if some replicas might not be available due to banned SE's
        activeReplicas = self.dataManager.checkActiveReplicas(replicaDict)
        if not activeReplicas['OK']:
            # due to banned SE's input data might no be available
            msg = "On Hold: Missing replicas due to banned SE"
            self.log.info(msg)
            self.log.warn(activeReplicas['Message'])
            return S_ERROR(msg)
        activeReplicaDict = activeReplicas['Value']
        siteCandidates = self.__checkReplicas(job, activeReplicaDict)
        if not siteCandidates['OK']:
            # due to a banned SE's input data is not available at a single site
            msg = "On Hold: Input data not Available due to banned SE"
            self.log.info(msg)
            self.log.warn(siteCandidates['Message'])
            return S_ERROR(msg)
        resolvedData = {}
        resolvedData['Value'] = activeReplicas
        resolvedData['SiteCandidates'] = siteCandidates['Value']
        result = self.setOptimizerJobInfo(job, self.am_getModuleParam('optimizerName'), resolvedData)
        if not result['OK']:
            self.log.warn(result['Message'])
            return result
        return S_OK(resolvedData)

    #############################################################################
    def __getSitesForSE(self, se):
        """ Returns a list of sites having the given SE as a local one.
            Uses the local cache of the site-se information
        """
        # Empty the cache if too old
        if (time.time() - self.lastCScheck) > self.cacheLength:
            self.log.verbose('Resetting the SE to site mapping cache')
            self.seToSiteMapping = {}
            self.lastCScheck = time.time()
        if se not in self.seToSiteMapping:
            sites = getSitesForSE(se)
            if sites['OK']:
                self.seToSiteMapping[se] = list(sites['Value'])
            # the raw result (OK or not) is returned on a cache miss
            return sites
        else:
            return S_OK(self.seToSiteMapping[se])

    #############################################################################
    def __getSiteCandidates(self, inputData):
        """This method returns a list of possible site candidates based on the
           job input data requirement.  For each site candidate, the number of
           files on disk and tape is resolved.
        """
        # map each LFN to the unique list of sites holding one of its replicas
        fileSEs = {}
        for lfn, replicas in inputData.items():
            siteList = []
            for se in replicas.keys():
                sites = self.__getSitesForSE(se)
                if sites['OK']:
                    siteList += sites['Value']
            fileSEs[lfn] = uniqueElements(siteList)
        # intersect the per-file site lists: a candidate must hold ALL files
        siteCandidates = []
        i = 0
        for _fileName, sites in fileSEs.items():
            if not i:
                siteCandidates = sites
            else:
                tempSite = []
                for site in siteCandidates:
                    if site in sites:
                        tempSite.append(site)
                siteCandidates = tempSite
            i += 1
        if not len(siteCandidates):
            return S_ERROR('No candidate sites available')
        #In addition, check number of files on tape and disk for each site
        #for optimizations during scheduling
        siteResult = {}
        for site in siteCandidates:
            siteResult[site] = {'disk': [], 'tape': []}
        seDict = {}
        for lfn, replicas in inputData.items():
            for se in replicas.keys():
                if se not in seDict:
                    sites = self.__getSitesForSE(se)
                    if not sites['OK']:
                        continue
                    try:
                        #storageElement = StorageElement( se )
                        result = self.resourceStatus.getStorageElementStatus(se, statusType='ReadAccess')
                        if not result['OK']:
                            continue
                        seDict[se] = {'Sites': sites['Value'], 'SEParams': result['Value'][se]}
                        result = getStorageElementOptions(se)
                        if not result['OK']:
                            continue
                        seDict[se]['SEParams'].update(result['Value'])
                    except Exception:
                        self.log.exception('Failed to instantiate StorageElement( %s )' % se)
                        continue
                for site in seDict[se]['Sites']:
                    if site in siteCandidates:
                        # disk copies take precedence: a file counted on disk is
                        # removed from the tape tally
                        if seDict[se]['SEParams']['ReadAccess'] and seDict[se]['SEParams']['DiskSE']:
                            if lfn not in siteResult[site]['disk']:
                                siteResult[site]['disk'].append(lfn)
                                if lfn in siteResult[site]['tape']:
                                    siteResult[site]['tape'].remove(lfn)
                        if seDict[se]['SEParams']['ReadAccess'] and seDict[se]['SEParams']['TapeSE']:
                            if lfn not in siteResult[site]['tape'] and lfn not in siteResult[site]['disk']:
                                siteResult[site]['tape'].append(lfn)
        # collapse the LFN lists into per-site disk/tape counts
        for site in siteResult:
            siteResult[site]['disk'] = len(siteResult[site]['disk'])
            siteResult[site]['tape'] = len(siteResult[site]['tape'])
        return S_OK(siteResult)