def filterReplicas(opFile, logger=None, dataManager=None):
    """ filter out banned/invalid source SEs

    Classifies every storage element holding a replica of ``opFile.LFN`` and
    returns S_OK with a defaultdict(list) keyed by verdict:
    "Valid", "Bad", "NoMetadata", "NoReplicas", "NoActiveReplicas".
    May mutate ``opFile`` in place (Status, Error, Checksum, ChecksumType).

    :param opFile: request file object exposing LFN/Checksum/ChecksumType/Status/Error
    :param logger: optional logger; defaults to gLogger
    :param dataManager: optional DataManager; a fresh one is built when omitted
    :return: S_OK(dict) on success, S_ERROR / failed result structure otherwise
    """
    if logger is None:
        logger = gLogger
    if dataManager is None:
        dataManager = DataManager()
    log = logger.getSubLogger("filterReplicas")
    # defaultdict(list): verdict categories are created lazily on first append
    result = defaultdict(list)
    replicas = dataManager.getActiveReplicas(opFile.LFN, getUrl=False)
    if not replicas["OK"]:
        log.error('Failed to get active replicas', replicas["Message"])
        return replicas
    # Loose pattern: any failure message containing "such file" (lower-cased)
    # is treated as "the file does not exist at all"
    reNotExists = re.compile(r".*such file.*")
    replicas = replicas["Value"]
    failed = replicas["Failed"].get(opFile.LFN, "")
    if reNotExists.match(failed.lower()):
        opFile.Status = "Failed"
        opFile.Error = failed
        return S_ERROR(failed)

    replicas = replicas["Successful"].get(opFile.LFN, {})
    noReplicas = False
    if not replicas:
        # No *active* replica: check whether the file has any replica at all
        allReplicas = dataManager.getReplicas(opFile.LFN, getUrl=False)
        if allReplicas['OK']:
            allReplicas = allReplicas['Value']['Successful'].get(opFile.LFN, {})
            if not allReplicas:
                result['NoReplicas'].append(None)
                noReplicas = True
            else:
                # There are replicas but we cannot get metadata because the replica is not active
                result['NoActiveReplicas'] += list(allReplicas)
            log.verbose("File has no%s replica in File Catalog" % ('' if noReplicas else ' active'), opFile.LFN)
        else:
            return allReplicas

    # hexAdlerToInt returns False for a non-Adler hex string, i.e. the request
    # checksum is either missing or unusable and must be refreshed from the FC
    if not opFile.Checksum or hexAdlerToInt(opFile.Checksum) is False:
        # Set Checksum to FC checksum if not set in the request
        fcMetadata = FileCatalog().getFileMetadata(opFile.LFN)
        fcChecksum = fcMetadata.get('Value', {}).get('Successful', {}).get(opFile.LFN, {}).get('Checksum')
        # Replace opFile.Checksum if it doesn't match a valid FC checksum
        if fcChecksum:
            if hexAdlerToInt(fcChecksum) is not False:
                opFile.Checksum = fcChecksum
                opFile.ChecksumType = fcMetadata['Value']['Successful'][opFile.LFN].get('ChecksumType', 'Adler32')
            else:
                # FC checksum is invalid too: drop the checksum entirely
                opFile.Checksum = None

    # If no replica was found, return what we collected as information
    if not replicas:
        return S_OK(result)

    for repSEName in replicas:
        repSEMetadata = StorageElement(repSEName).getFileMetadata(opFile.LFN)
        # A per-SE failure may come either as a global 'Message' or in 'Failed'
        error = repSEMetadata.get('Message', repSEMetadata.get('Value', {}).get('Failed', {}).get(opFile.LFN))
        if error:
            log.warn('unable to get metadata at %s for %s' % (repSEName, opFile.LFN), error.replace('\n', ''))
            if 'File does not exist' in error:
                result['NoReplicas'].append(repSEName)
            else:
                result["NoMetadata"].append(repSEName)
        elif not noReplicas:
            repSEMetadata = repSEMetadata['Value']['Successful'][opFile.LFN]
            seChecksum = hexAdlerToInt(repSEMetadata.get("Checksum"))
            # As from here seChecksum is an integer or False, not a hex string!
            if seChecksum is False and opFile.Checksum:
                # NOTE(review): this SE also satisfies the "not seChecksum" test
                # below and therefore additionally lands in 'Valid' — confirm
                # this double classification is intended
                result['NoMetadata'].append(repSEName)
            elif not seChecksum and opFile.Checksum:
                opFile.Checksum = None
                opFile.ChecksumType = None
            elif seChecksum and (not opFile.Checksum or opFile.Checksum == 'False'):
                # Use the SE checksum (convert to hex) and force type to be Adler32
                opFile.Checksum = intAdlerToHex(seChecksum)
                opFile.ChecksumType = 'Adler32'
            if not opFile.Checksum or not seChecksum or compareAdler(intAdlerToHex(seChecksum), opFile.Checksum):
                # # All checksums are OK
                result["Valid"].append(repSEName)
            else:
                log.warn(" %s checksum mismatch, FC: '%s' @%s: '%s'" %
                         (opFile.LFN, opFile.Checksum, repSEName, intAdlerToHex(seChecksum)))
                result["Bad"].append(repSEName)
        else:
            # If a replica was found somewhere, don't set the file as no replicas
            result['NoReplicas'] = []

    return S_OK(result)
def filterReplicas(opFile, logger=None, dataManager=None):
    """Classify the storage elements holding ``opFile.LFN`` by usability.

    Returns S_OK with a dict of SE-name lists keyed by verdict
    ("Valid", "Bad", "NoMetadata", "NoReplicas", "NoPFN"); may update
    ``opFile`` in place (Status, Error, Checksum, ChecksumType).
    """
    logger = gLogger if logger is None else logger
    dataManager = DataManager() if dataManager is None else dataManager
    log = logger.getSubLogger("filterReplicas")
    verdicts = {"Valid": [], "NoMetadata": [], "Bad": [], 'NoReplicas': [], 'NoPFN': []}

    activeRes = dataManager.getActiveReplicas(opFile.LFN)
    if not activeRes["OK"]:
        log.error('Failed to get active replicas', activeRes["Message"])
        return activeRes
    activeVal = activeRes["Value"]

    # A catalog failure mentioning "such file" means the file does not exist
    failedMsg = activeVal["Failed"].get(opFile.LFN, "")
    if re.compile(r".*such file.*").match(failedMsg.lower()):
        opFile.Status = "Failed"
        opFile.Error = failedMsg
        return S_ERROR(failedMsg)

    replicas = activeVal["Successful"].get(opFile.LFN, {})
    noReplicas = False
    if not replicas:
        allRes = dataManager.getReplicas(opFile.LFN)
        if not allRes['OK']:
            return allRes
        inactive = allRes['Value']['Successful'].get(opFile.LFN, {})
        if inactive:
            # We try inactive replicas to see if maybe the file doesn't exist at all
            replicas = inactive
        else:
            verdicts['NoReplicas'].append(None)
            noReplicas = True
        log.warn("File has no%s replica in File Catalog" % ('' if noReplicas else ' active'), opFile.LFN)

    if not opFile.Checksum:
        # Set Checksum to FC checksum if not set in the request
        fcMetadata = FileCatalog().getFileMetadata(opFile.LFN)
        fcChecksum = fcMetadata.get('Value', {}).get('Successful', {}).get(opFile.LFN, {}).get('Checksum')
        # Replace opFile.Checksum if it doesn't match a valid FC checksum
        if fcChecksum:
            opFile.Checksum = fcChecksum
            opFile.ChecksumType = fcMetadata['Value']['Successful'][opFile.LFN].get('ChecksumType', 'Adler32')

    for seName in replicas:
        seMetaRes = StorageElement(seName).getFileMetadata(opFile.LFN)
        error = seMetaRes.get('Message', seMetaRes.get('Value', {}).get('Failed', {}).get(opFile.LFN))
        if error:
            log.warn('unable to get metadata at %s for %s' % (seName, opFile.LFN), error.replace('\n', ''))
            bucket = 'NoReplicas' if 'File does not exist' in error else "NoMetadata"
            verdicts[bucket].append(seName)
        elif noReplicas:
            # If a replica was found somewhere, don't set the file as no replicas
            verdicts['NoReplicas'] = []
        else:
            seChecksum = seMetaRes['Value']['Successful'][opFile.LFN].get("Checksum")
            if opFile.Checksum and not seChecksum:
                opFile.Checksum = None
                opFile.ChecksumType = None
            elif seChecksum and not opFile.Checksum:
                opFile.Checksum = seChecksum
            if not opFile.Checksum or not seChecksum or compareAdler(seChecksum, opFile.Checksum):
                # # All checksums are OK
                verdicts["Valid"].append(seName)
            else:
                log.warn(" %s checksum mismatch, FC: '%s' @%s: '%s'" %
                         (opFile.LFN, opFile.Checksum, seName, seChecksum))
                verdicts["Bad"].append(seName)

    return S_OK(verdicts)
def filterReplicas(opFile, logger=None, dataManager=None, seCache=None):
    """ filter out banned/invalid source SEs

    :param opFile: request file object; Status/Error/Checksum may be updated in place
    :param logger: optional logger; defaults to gLogger
    :param dataManager: optional DataManager; created on demand
    :param seCache: optional dict caching StorageElement objects by SE name
    :return: S_OK with a dict of SE-name lists keyed by verdict
             ("Valid", "Bad", "NoMetadata", "NoReplicas", "NoPFN"), or an error structure
    """
    if not logger:
        logger = gLogger
    if not dataManager:
        dataManager = DataManager()
    if not seCache:
        seCache = {}
    log = logger.getSubLogger("filterReplicas")
    ret = {"Valid": [], "NoMetadata": [], "Bad": [], 'NoReplicas': [], 'NoPFN': []}
    replicas = dataManager.getActiveReplicas(opFile.LFN)
    if not replicas["OK"]:
        log.error(replicas["Message"])
        return replicas
    # FIX: the pattern used to be the anchored literal "not such file or directory",
    # a typo ("not" vs "no") that could never match the real error text, so missing
    # files were never marked Failed here. Use the loose pattern matching any
    # message containing "such file" (as the other versions of this function do).
    reNotExists = re.compile(r".*such file.*")
    replicas = replicas["Value"]
    failed = replicas["Failed"].get(opFile.LFN, "")
    if reNotExists.match(failed.lower()):
        opFile.Status = "Failed"
        opFile.Error = failed
        return S_ERROR(failed)
    replicas = replicas["Successful"].get(opFile.LFN, {})
    for repSEName in replicas:
        # Reuse cached StorageElement objects; build one only on first use
        if repSEName in seCache:
            repSE = seCache[repSEName]
        else:
            repSE = seCache.setdefault(repSEName, StorageElement(repSEName))
        pfn = repSE.getPfnForLfn(opFile.LFN)
        if not pfn["OK"] or opFile.LFN not in pfn['Value']['Successful']:
            log.warn("unable to create pfn for %s lfn at %s: %s" %
                     (opFile.LFN, repSEName,
                      pfn.get('Message', pfn.get('Value', {}).get('Failed', {}).get(opFile.LFN))))
            ret["NoPFN"].append(repSEName)
        else:
            pfn = pfn["Value"]['Successful'][opFile.LFN]
            repSEMetadata = repSE.getFileMetadata(pfn)
            error = repSEMetadata.get('Message', repSEMetadata.get('Value', {}).get('Failed', {}).get(pfn))
            if error:
                log.warn('unable to get metadata at %s for %s' % (repSEName, opFile.LFN),
                         error.replace('\n', ''))
                if 'File does not exist' in error:
                    ret['NoReplicas'].append(repSEName)
                else:
                    ret["NoMetadata"].append(repSEName)
            else:
                repSEMetadata = repSEMetadata['Value']['Successful'][pfn]
                seChecksum = repSEMetadata.get("Checksum")
                if opFile.Checksum and seChecksum and not compareAdler(seChecksum, opFile.Checksum):
                    # The checksum in the request may be wrong, check with FC
                    fcMetadata = FileCatalog().getFileMetadata(opFile.LFN)
                    fcChecksum = fcMetadata.get('Value', {}).get('Successful', {}).get(opFile.LFN, {}).get('Checksum')
                    if fcChecksum and fcChecksum != opFile.Checksum and compareAdler(fcChecksum, seChecksum):
                        # FC agrees with the SE: the request checksum was stale
                        opFile.Checksum = fcChecksum
                        ret['Valid'].append(repSEName)
                    else:
                        log.warn(" %s checksum mismatch, request: %s @%s: %s" %
                                 (opFile.LFN, opFile.Checksum, repSEName, seChecksum))
                        ret["Bad"].append(repSEName)
                else:
                    # # if we're here repSE is OK
                    ret["Valid"].append(repSEName)
    return S_OK(ret)
def filterReplicas(opFile, logger=None, dataManager=None):
    """Sort the storage elements holding ``opFile.LFN`` into usability buckets.

    The S_OK value is a defaultdict(list) keyed by verdict: "Valid", "Bad",
    "NoMetadata", "NoReplicas", "NoActiveReplicas". The request checksum may
    be corrected in place from the File Catalog or the storage element.
    """
    log = (gLogger if logger is None else logger).getSubLogger("filterReplicas")
    dm = DataManager() if dataManager is None else dataManager
    verdicts = defaultdict(list)

    activeRes = dm.getActiveReplicas(opFile.LFN, getUrl=False)
    if not activeRes["OK"]:
        log.error('Failed to get active replicas', activeRes["Message"])
        return activeRes
    activeVal = activeRes["Value"]

    # Any failure text containing "such file" means the file does not exist
    failedMsg = activeVal["Failed"].get(opFile.LFN, "")
    if re.match(r".*such file.*", failedMsg.lower()):
        opFile.Status = "Failed"
        opFile.Error = failedMsg
        return S_ERROR(failedMsg)

    replicas = activeVal["Successful"].get(opFile.LFN, {})
    noReplicas = False
    if not replicas:
        allRes = dm.getReplicas(opFile.LFN, getUrl=False)
        if not allRes['OK']:
            return allRes
        inactive = allRes['Value']['Successful'].get(opFile.LFN, {})
        if inactive:
            # There are replicas but we cannot get metadata because the replica is not active
            verdicts['NoActiveReplicas'] += list(inactive)
        else:
            verdicts['NoReplicas'].append(None)
            noReplicas = True
        log.verbose("File has no%s replica in File Catalog" % ('' if noReplicas else ' active'), opFile.LFN)

    if not opFile.Checksum or hexAdlerToInt(opFile.Checksum) is False:
        # Set Checksum to FC checksum if not set in the request
        fcMetadata = FileCatalog().getFileMetadata(opFile.LFN)
        fcInfo = fcMetadata.get('Value', {}).get('Successful', {}).get(opFile.LFN, {})
        fcChecksum = fcInfo.get('Checksum')
        # Replace opFile.Checksum if it doesn't match a valid FC checksum
        if fcChecksum:
            if hexAdlerToInt(fcChecksum) is False:
                opFile.Checksum = None
            else:
                opFile.Checksum = fcChecksum
                opFile.ChecksumType = fcInfo.get('ChecksumType', 'Adler32')

    # If no replica was found, return what we collected as information
    if not replicas:
        return S_OK(verdicts)

    for seName in replicas:
        seMetaRes = StorageElement(seName).getFileMetadata(opFile.LFN)
        error = seMetaRes.get('Message', seMetaRes.get('Value', {}).get('Failed', {}).get(opFile.LFN))
        if error:
            log.warn('unable to get metadata at %s for %s' % (seName, opFile.LFN), error.replace('\n', ''))
            verdicts['NoReplicas' if 'File does not exist' in error else "NoMetadata"].append(seName)
            continue
        if noReplicas:
            # If a replica was found somewhere, don't set the file as no replicas
            verdicts['NoReplicas'] = []
            continue
        seChecksum = hexAdlerToInt(seMetaRes['Value']['Successful'][opFile.LFN].get("Checksum"))
        # As from here seChecksum is an integer or False, not a hex string!
        if seChecksum is False and opFile.Checksum:
            verdicts['NoMetadata'].append(seName)
        elif not seChecksum and opFile.Checksum:
            opFile.Checksum = None
            opFile.ChecksumType = None
        elif seChecksum and (not opFile.Checksum or opFile.Checksum == 'False'):
            # Use the SE checksum (convert to hex) and force type to be Adler32
            opFile.Checksum = intAdlerToHex(seChecksum)
            opFile.ChecksumType = 'Adler32'
        if not opFile.Checksum or not seChecksum or compareAdler(intAdlerToHex(seChecksum), opFile.Checksum):
            # # All checksums are OK
            verdicts["Valid"].append(seName)
        else:
            log.warn(" %s checksum mismatch, FC: '%s' @%s: '%s'" %
                     (opFile.LFN, opFile.Checksum, seName, intAdlerToHex(seChecksum)))
            verdicts["Bad"].append(seName)

    return S_OK(verdicts)
def filterReplicas(opFile, logger=None, dataManager=None, seCache=None):
    """ filter out banned/invalid source SEs

    :param opFile: request file object; Status/Error/Checksum may be updated in place
    :param logger: optional logger; defaults to gLogger
    :param dataManager: optional DataManager; created on demand
    :param seCache: optional dict caching StorageElement objects by SE name
    :return: S_OK with a dict of SE-name lists keyed by verdict
             ("Valid", "Bad", "NoMetadata", "NoReplicas", "NoPFN"), or an error structure
    """
    if not logger:
        logger = gLogger
    if not dataManager:
        dataManager = DataManager()
    if not seCache:
        seCache = {}
    log = logger.getSubLogger("filterReplicas")
    ret = {"Valid": [], "NoMetadata": [], "Bad": [], 'NoReplicas': [], 'NoPFN': []}
    replicas = dataManager.getActiveReplicas(opFile.LFN)
    if not replicas["OK"]:
        log.error(replicas["Message"])
        return replicas
    # FIX: the pattern used to be the anchored literal "not such file or directory",
    # a typo ("not" vs "no") that could never match the real error text, so missing
    # files were never marked Failed here. Use the loose pattern matching any
    # message containing "such file" (as the other versions of this function do).
    reNotExists = re.compile(r".*such file.*")
    replicas = replicas["Value"]
    failed = replicas["Failed"].get(opFile.LFN, "")
    if reNotExists.match(failed.lower()):
        opFile.Status = "Failed"
        opFile.Error = failed
        return S_ERROR(failed)
    replicas = replicas["Successful"].get(opFile.LFN, {})
    for repSEName in replicas:
        # Reuse cached StorageElement objects; build one only on first use
        if repSEName in seCache:
            repSE = seCache[repSEName]
        else:
            repSE = seCache.setdefault(repSEName, StorageElement(repSEName))
        pfn = repSE.getPfnForLfn(opFile.LFN)
        if not pfn["OK"] or opFile.LFN not in pfn['Value']['Successful']:
            log.warn("unable to create pfn for %s lfn at %s: %s" %
                     (opFile.LFN, repSEName,
                      pfn.get('Message', pfn.get('Value', {}).get('Failed', {}).get(opFile.LFN))))
            ret["NoPFN"].append(repSEName)
        else:
            pfn = pfn["Value"]['Successful'][opFile.LFN]
            repSEMetadata = repSE.getFileMetadata(pfn)
            error = repSEMetadata.get('Message', repSEMetadata.get('Value', {}).get('Failed', {}).get(pfn))
            if error:
                log.warn('unable to get metadata at %s for %s' % (repSEName, opFile.LFN),
                         error.replace('\n', ''))
                if 'File does not exist' in error:
                    ret['NoReplicas'].append(repSEName)
                else:
                    ret["NoMetadata"].append(repSEName)
            else:
                repSEMetadata = repSEMetadata['Value']['Successful'][pfn]
                seChecksum = repSEMetadata.get("Checksum")
                if opFile.Checksum and seChecksum and not compareAdler(seChecksum, opFile.Checksum):
                    # The checksum in the request may be wrong, check with FC
                    fcMetadata = FileCatalog().getFileMetadata(opFile.LFN)
                    fcChecksum = fcMetadata.get('Value', {}).get('Successful', {}).get(opFile.LFN, {}).get('Checksum')
                    if fcChecksum and fcChecksum != opFile.Checksum and compareAdler(fcChecksum, seChecksum):
                        # FC agrees with the SE: the request checksum was stale
                        opFile.Checksum = fcChecksum
                        ret['Valid'].append(repSEName)
                    else:
                        log.warn(" %s checksum mismatch, request: %s @%s: %s" %
                                 (opFile.LFN, opFile.Checksum, repSEName, seChecksum))
                        ret["Bad"].append(repSEName)
                else:
                    # # if we're here repSE is OK
                    ret["Valid"].append(repSEName)
    return S_OK(ret)
def filterReplicas(opFile, logger=None, dataManager=None):
    """Partition the storage elements holding ``opFile.LFN`` into verdict lists.

    Returns S_OK with a dict keyed by "Valid", "Bad", "NoMetadata",
    "NoReplicas", "NoPFN"; ``opFile`` may be updated in place.
    """
    logger = gLogger if logger is None else logger
    dataManager = DataManager() if dataManager is None else dataManager
    log = logger.getSubLogger("filterReplicas")
    verdicts = {"Valid": [], "NoMetadata": [], "Bad": [], 'NoReplicas': [], 'NoPFN': []}

    activeRes = dataManager.getActiveReplicas(opFile.LFN)
    if not activeRes["OK"]:
        log.error('Failed to get active replicas', activeRes["Message"])
        return activeRes
    activeVal = activeRes["Value"]

    # Catalog failure mentioning "such file" => the file does not exist
    failedMsg = activeVal["Failed"].get(opFile.LFN, "")
    if re.compile(r".*such file.*").match(failedMsg.lower()):
        opFile.Status = "Failed"
        opFile.Error = failedMsg
        return S_ERROR(failedMsg)

    replicas = activeVal["Successful"].get(opFile.LFN, {})

    if not opFile.Checksum:
        # Set Checksum to FC checksum if not set in the request
        fcMetadata = FileCatalog().getFileMetadata(opFile.LFN)
        fcChecksum = fcMetadata.get('Value', {}).get('Successful', {}).get(opFile.LFN, {}).get('Checksum', '')
        # Replace opFile.Checksum if it doesn't match a valid FC checksum
        if fcChecksum:
            opFile.Checksum = fcChecksum
            opFile.ChecksumType = fcMetadata['Value']['Successful'][opFile.LFN].get('ChecksumType', 'Adler32')

    for seName in replicas:
        se = StorageElement(seName)
        seMetaRes = se.getFileMetadata(opFile.LFN)
        error = seMetaRes.get('Message', seMetaRes.get('Value', {}).get('Failed', {}).get(opFile.LFN))
        if error:
            log.warn('unable to get metadata at %s for %s' % (seName, opFile.LFN), error.replace('\n', ''))
            bucket = 'NoReplicas' if 'File does not exist' in error else "NoMetadata"
            verdicts[bucket].append(seName)
            continue
        seChecksum = seMetaRes['Value']['Successful'][opFile.LFN].get("Checksum")
        # Valid when both checksums exist and agree, or both are absent
        if opFile.Checksum and seChecksum:
            checksumsOK = compareAdler(seChecksum, opFile.Checksum)
        else:
            checksumsOK = not opFile.Checksum and not seChecksum
        if checksumsOK:
            # # All checksums are OK
            verdicts["Valid"].append(seName)
        else:
            log.warn(" %s checksum mismatch, FC: '%s' @%s: '%s'" %
                     (opFile.LFN, opFile.Checksum, seName, seChecksum))
            verdicts["Bad"].append(seName)

    return S_OK(verdicts)
def filterReplicas(opFile, logger=None, dataManager=None):
    """ filter out banned/invalid source SEs

    Classifies every storage element holding a replica of ``opFile.LFN`` and
    returns S_OK with a dict of SE-name lists keyed by verdict:
    "Valid", "Bad", "NoMetadata", "NoReplicas", "NoPFN".
    May mutate ``opFile`` in place (Status, Error, Checksum, ChecksumType).

    :param opFile: request file object exposing LFN/Checksum/ChecksumType/Status/Error
    :param logger: optional logger; defaults to gLogger
    :param dataManager: optional DataManager; a fresh one is built when omitted
    :return: S_OK(dict) on success, S_ERROR / failed result structure otherwise
    """
    if logger is None:
        logger = gLogger
    if dataManager is None:
        dataManager = DataManager()
    log = logger.getSubLogger("filterReplicas")
    ret = {"Valid": [], "NoMetadata": [], "Bad": [], "NoReplicas": [], "NoPFN": []}
    replicas = dataManager.getActiveReplicas(opFile.LFN)
    if not replicas["OK"]:
        log.error("Failed to get active replicas", replicas["Message"])
        return replicas
    # Loose pattern: any failure message containing "such file" (lower-cased)
    # means the file does not exist at all
    reNotExists = re.compile(r".*such file.*")
    replicas = replicas["Value"]
    failed = replicas["Failed"].get(opFile.LFN, "")
    if reNotExists.match(failed.lower()):
        opFile.Status = "Failed"
        opFile.Error = failed
        return S_ERROR(failed)
    replicas = replicas["Successful"].get(opFile.LFN, {})
    noReplicas = False
    if not replicas:
        # No *active* replica: look for any replica at all
        allReplicas = dataManager.getReplicas(opFile.LFN)
        if allReplicas["OK"]:
            allReplicas = allReplicas["Value"]["Successful"].get(opFile.LFN, {})
            if not allReplicas:
                ret["NoReplicas"].append(None)
                noReplicas = True
            else:
                # We try inactive replicas to see if maybe the file doesn't exist at all
                replicas = allReplicas
            log.warn("File has no%s replica in File Catalog" % ("" if noReplicas else " active"), opFile.LFN)
        else:
            return allReplicas

    if not opFile.Checksum:
        # Set Checksum to FC checksum if not set in the request
        fcMetadata = FileCatalog().getFileMetadata(opFile.LFN)
        fcChecksum = fcMetadata.get("Value", {}).get("Successful", {}).get(opFile.LFN, {}).get("Checksum")
        # Replace opFile.Checksum if it doesn't match a valid FC checksum
        if fcChecksum:
            opFile.Checksum = fcChecksum
            opFile.ChecksumType = fcMetadata["Value"]["Successful"][opFile.LFN].get("ChecksumType", "Adler32")

    for repSEName in replicas:
        repSEMetadata = StorageElement(repSEName).getFileMetadata(opFile.LFN)
        # A per-SE failure may come either as a global 'Message' or in 'Failed'
        error = repSEMetadata.get("Message", repSEMetadata.get("Value", {}).get("Failed", {}).get(opFile.LFN))
        if error:
            log.warn("unable to get metadata at %s for %s" % (repSEName, opFile.LFN), error.replace("\n", ""))
            if "File does not exist" in error:
                ret["NoReplicas"].append(repSEName)
            else:
                ret["NoMetadata"].append(repSEName)
        elif not noReplicas:
            repSEMetadata = repSEMetadata["Value"]["Successful"][opFile.LFN]
            seChecksum = repSEMetadata.get("Checksum")
            if not seChecksum and opFile.Checksum:
                # SE has no checksum: drop the request checksum so the file is not rejected
                opFile.Checksum = None
                opFile.ChecksumType = None
            elif seChecksum and not opFile.Checksum:
                opFile.Checksum = seChecksum
            if not opFile.Checksum or not seChecksum or compareAdler(seChecksum, opFile.Checksum):
                # # All checksums are OK
                ret["Valid"].append(repSEName)
            else:
                log.warn(
                    " %s checksum mismatch, FC: '%s' @%s: '%s'" % (opFile.LFN, opFile.Checksum, repSEName, seChecksum)
                )
                ret["Bad"].append(repSEName)
        else:
            # If a replica was found somewhere, don't set the file as no replicas
            ret["NoReplicas"] = []
    return S_OK(ret)
def filterReplicas(opFile, logger=None, dataManager=None):
    """ filter out banned/invalid source SEs

    Classifies every storage element holding a replica of ``opFile.LFN`` and
    returns S_OK with a dict of SE-name lists keyed by verdict:
    "Valid", "Bad", "NoMetadata", "NoReplicas", "NoPFN".
    May mutate ``opFile`` in place (Status, Error, Checksum, ChecksumType).

    :param opFile: request file object exposing LFN/Checksum/ChecksumType/Status/Error
    :param logger: optional logger; defaults to gLogger
    :param dataManager: optional DataManager; a fresh one is built when omitted
    :return: S_OK(dict) on success, S_ERROR / failed result structure otherwise
    """
    if logger is None:
        logger = gLogger
    if dataManager is None:
        dataManager = DataManager()
    log = logger.getSubLogger("filterReplicas")
    ret = {"Valid": [], "NoMetadata": [], "Bad": [], 'NoReplicas': [], 'NoPFN': []}
    replicas = dataManager.getActiveReplicas(opFile.LFN)
    if not replicas["OK"]:
        log.error('Failed to get active replicas', replicas["Message"])
        return replicas
    reNotExists = re.compile(r".*such file.*")
    replicas = replicas["Value"]
    failed = replicas["Failed"].get(opFile.LFN, "")
    if reNotExists.match(failed.lower()):
        opFile.Status = "Failed"
        opFile.Error = failed
        return S_ERROR(failed)
    replicas = replicas["Successful"].get(opFile.LFN, {})
    noReplicas = False
    if not replicas:
        allReplicas = dataManager.getReplicas(opFile.LFN)
        if allReplicas['OK']:
            allReplicas = allReplicas['Value']['Successful'].get(opFile.LFN, {})
            if not allReplicas:
                ret['NoReplicas'].append(None)
                noReplicas = True
            else:
                # We try inactive replicas to see if maybe the file doesn't exist at all
                replicas = allReplicas
            log.warn("File has no%s replica in File Catalog" % ('' if noReplicas else ' active'), opFile.LFN)
        else:
            return allReplicas

    # FIX: hexAdlerToInt() returns the sentinel False for an invalid hex string;
    # the previous "== False" / "!= False" equality tests also matched a
    # legitimate integer checksum of 0 (0 == False in Python, PEP 8 E712).
    # Use identity ("is False" / "is not False") as the later revisions do.
    if not opFile.Checksum or hexAdlerToInt(opFile.Checksum) is False:
        # Set Checksum to FC checksum if not set in the request
        fcMetadata = FileCatalog().getFileMetadata(opFile.LFN)
        fcChecksum = fcMetadata.get('Value', {}).get('Successful', {}).get(opFile.LFN, {}).get('Checksum')
        # Replace opFile.Checksum if it doesn't match a valid FC checksum
        if fcChecksum:
            if hexAdlerToInt(fcChecksum) is not False:
                opFile.Checksum = fcChecksum
                opFile.ChecksumType = fcMetadata['Value']['Successful'][opFile.LFN].get('ChecksumType', 'Adler32')
            else:
                opFile.Checksum = None

    for repSEName in replicas:
        repSEMetadata = StorageElement(repSEName).getFileMetadata(opFile.LFN)
        error = repSEMetadata.get('Message', repSEMetadata.get('Value', {}).get('Failed', {}).get(opFile.LFN))
        if error:
            log.warn('unable to get metadata at %s for %s' % (repSEName, opFile.LFN), error.replace('\n', ''))
            if 'File does not exist' in error:
                ret['NoReplicas'].append(repSEName)
            else:
                ret["NoMetadata"].append(repSEName)
        elif not noReplicas:
            repSEMetadata = repSEMetadata['Value']['Successful'][opFile.LFN]
            seChecksum = hexAdlerToInt(repSEMetadata.get("Checksum"))
            # Same FIX as above: identity test against the False sentinel
            if seChecksum is False and opFile.Checksum:
                ret['NoMetadata'].append(repSEName)
            elif not seChecksum and opFile.Checksum:
                opFile.Checksum = None
                opFile.ChecksumType = None
            elif seChecksum and (not opFile.Checksum or opFile.Checksum == 'False'):
                # Use the SE checksum and force type to be Adler32
                # NOTE(review): seChecksum is an int here, while later revisions
                # store the hex form via intAdlerToHex — confirm downstream users
                opFile.Checksum = seChecksum
                opFile.ChecksumType = 'Adler32'
            if not opFile.Checksum or not seChecksum or compareAdler(seChecksum, opFile.Checksum):
                # # All checksums are OK
                ret["Valid"].append(repSEName)
            else:
                log.warn(" %s checksum mismatch, FC: '%s' @%s: '%s'" %
                         (opFile.LFN, opFile.Checksum, repSEName, seChecksum))
                ret["Bad"].append(repSEName)
        else:
            # If a replica was found somewhere, don't set the file as no replicas
            ret['NoReplicas'] = []
    return S_OK(ret)
def scanPopularity(since, getAllDatasets, topDirectory='/lhcb', csvFile=None):
    """
    That function does the job to cache the directories, get the corresponding datasets
    and join with the popularity.

    :param since: number of days of popularity history to retrieve (rounded up to whole weeks)
    :param getAllDatasets: when True, also scan all catalog directories to report unused datasets
    :param topDirectory: catalog directory under which to scan (default '/lhcb')
    :param csvFile: output CSV path; defaults to 'popularity-<since>days.csv'

    Side effects: resets and repopulates the module-level caches (bkPathForDir,
    bkPathUsage, bkPathPopularity, datasetStorage, ...), logs extensively, and
    writes a CSV summary file. Exits the process on fatal catalog/popularity errors.
    """
    # Reset global variables
    bkPathForDir.clear()
    cachedInvisible.clear()
    prodForBKPath.clear()
    bkPathUsage.clear()
    processingPass.clear()
    bkPathPopularity.clear()
    physicalStorageUsage.clear()
    datasetStorage.clear()
    for infoType in storageTypes:
        datasetStorage[infoType] = set()

    # set of used directories
    usedDirectories = set()
    usedSEs = {}
    binSize = 'week'
    # Round the requested period up to an integer number of weeks
    nbBins = int((since + 6) / 7)
    since = 7 * nbBins
    from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
    ignoreDirectories = Operations().getValue(
        'DataManagement/PopularityIgnoreDirectories',
        ['user', 'test', 'debug', 'dataquality', 'software',
         'database', 'swtest', 'certification', 'validation'])
    nowBin = getTimeBin(datetime.now() - timedelta(days=1))
    notCached = set()
    if getAllDatasets:
        # Get list of directories
        startTime = time.time()
        res = FileCatalog().listDirectory(topDirectory)
        if not res['OK']:
            gLogger.fatal("Cannot get list of directories", res['Message'])
            DIRAC.exit(1)
        # Keep only sub-directories not in the ignore list and not RAW/RDST/SDST
        directories = set(subDir for subDir in res['Value']['Successful'][topDirectory]['SubDirs']
                          if subDir.split('/')[2] not in ignoreDirectories and
                          'RAW' not in subDir and 'RDST' not in subDir and 'SDST' not in subDir)
        allDirectoriesSet = set()
        for baseDir in directories:
            allDirectoriesSet.update(getPhysicalUsage(baseDir))
        gLogger.always("Obtained %d directories storage usage information in %.1f seconds)" %
                       (len(allDirectoriesSet), time.time() - startTime))
        cacheDirectories(allDirectoriesSet)

    # Get the popularity raw information for the specified number of days
    if since:
        entries = 0
        now = datetime.now()
        # Start from last midnight and walk backwards one day at a time
        endTime = datetime(now.year, now.month, now.day, 0, 0, 0)
        startTime = endTime
        gLogger.always('Get popularity day-by-day')
        stTime = time.time()
        for _i in range(since):
            endTime = startTime
            startTime = endTime - timedelta(days=1)
            endTimeQuery = endTime.isoformat()
            startTimeQuery = startTime.isoformat()
            status = 'Used'
            # Getting the popularity with 10 retries for the given day
            for _i in range(10):
                res = duClient.getDataUsageSummary(startTimeQuery, endTimeQuery, status, timeout=7200)
                if res['OK']:
                    break
                gLogger.error("Error getting popularity entries, retrying...", res['Message'])
            if not res['OK']:
                gLogger.fatal("Error getting popularity entries", res['Message'])
                DIRAC.exit(1)
            val = res['Value']
            entries += len(val)
            # Get information on useful directories
            directories = set(row[1] for row in val if row[1].split('/')[2] not in ignoreDirectories)
            usedDirectories.update(directories)
            cacheDirectories(directories)

            # Get information in bins (day or week)
            for _rowId, dirLfn, se, count, insertTime in val:
                if dirLfn not in directories:
                    # print rowId, dirLfn, count, insertTime, 'ignored'
                    continue
                # get the binNumber (day or week)
                binNumber = getTimeBin(insertTime)
                bkPath = bkPathForDir.get(dirLfn)
                if not bkPath:
                    # Log each uncached directory only once
                    if dirLfn not in notCached:
                        notCached.add(dirLfn)
                        gLogger.error('Directory %s was not cached' % dirLfn)
                    bkPath = 'Unknown-' + dirLfn
                # Accumulate access counts per BK path and per time bin
                bkPathPopularity[bkPath][binNumber] = \
                    bkPathPopularity.setdefault(bkPath, {}).setdefault(binNumber, 0) + count
                usedSEs.setdefault(bkPath, set()).add(se)

        gLogger.always("\n=============================================================")
        gLogger.always("Retrieved %d entries from Popularity table in %.1f seconds" %
                       (entries, time.time() - stTime))
    gLogger.always('Found %d datasets used since %d days' % (len(bkPathPopularity), since))

    counters = {}
    # Datasets that were used but for which no LFN count is known
    strangeBKPaths = set(bkPath for bkPath in bkPathPopularity
                         if not bkPathUsage.get(bkPath, {}).get('LFN', (0, 0))[0])
    if strangeBKPaths:
        gLogger.always('%d used datasets do not have an LFN count:' % len(strangeBKPaths))
        gLogger.always('\n'.join("%s : %s" % (bkPath, str(bkPathUsage.get(bkPath, {})))
                                 for bkPath in strangeBKPaths))
    gLogger.always('\nDataset usage for %d datasets' % len(bkPathPopularity))
    # counters[infoType] = [number of files, total size] over all used datasets
    for infoType in ('All', 'LFN'):
        for i in range(2):
            counters.setdefault(infoType, []).append(
                sum(bkPathUsage.get(bkPath, {}).get(infoType, (0, 0))[i] for bkPath in bkPathPopularity))
    for bkPath in sorted(bkPathPopularity):
        # Assign a storage class from the SEs that served the dataset, if unknown
        if bkPath not in datasetStorage['Disk'] | datasetStorage['Archived'] | datasetStorage['Tape']:
            ses = usedSEs.get(bkPath)
            if ses is None:
                gLogger.error("BK path not in usedSEs", bkPath)
            else:
                datasetStorage[storageType(ses)].add(bkPath)
        nLfns, lfnSize = bkPathUsage.get(bkPath, {}).get('LFN', (0, 0))
        nPfns, pfnSize = bkPathUsage.get(bkPath, {}).get('All', (0, 0))
        gLogger.always('%s (%d LFNs, %s), (%d PFNs, %s, %.1f replicas)' %
                       (bkPath, nLfns, prSize(lfnSize), nPfns, prSize(pfnSize),
                        float(nPfns) / float(nLfns) if nLfns else 0.))
        bins = sorted(bkPathPopularity[bkPath])
        lastBin = bins[-1]
        accesses = sum(bkPathPopularity[bkPath][binNumber] for binNumber in bins)
        gLogger.always('\tUsed first in %s, %d accesses (%.1f%%), %d accesses during last %s %s' %
                       (prBinNumber(bins[0]), accesses,
                        accesses * 100. / nLfns if nLfns else 0.,
                        bkPathPopularity[bkPath][lastBin], binSize, prBinNumber(lastBin)))

    gLogger.always("\nA total of %d LFNs (%s), %d PFNs (%s) have been used" %
                   (counters['LFN'][0], prSize(counters['LFN'][1]),
                    counters['All'][0], prSize(counters['All'][1])))

    if getAllDatasets:
        # Consider only unused directories
        unusedDirectories = allDirectoriesSet - usedDirectories
        if unusedDirectories:
            gLogger.always("\n=============================================================")
            gLogger.always('%d directories have not been used' % len(unusedDirectories))
            # Remove the used datasets (from other directories)
            unusedBKPaths = set(bkPathForDir[dirLfn] for dirLfn in unusedDirectories
                                if dirLfn in bkPathForDir) - set(bkPathPopularity)
            # Remove empty datasets
            strangeBKPaths = set(bkPath for bkPath in unusedBKPaths
                                 if not bkPathUsage.get(bkPath, {}).get('LFN', (0, 0))[0])
            if strangeBKPaths:
                gLogger.always('%d unused datasets do not have an LFN count:' % len(strangeBKPaths))
                gLogger.always('\n'.join("%s : %s" % (bkPath, str(bkPathUsage.get(bkPath, {})))
                                         for bkPath in strangeBKPaths))
            unusedBKPaths = set(bkPath for bkPath in unusedBKPaths
                                if bkPathUsage.get(bkPath, {}).get('LFN', (0, 0))[0])
            # In case there are datasets both on tape and disk, priviledge tape
            datasetStorage['Disk'] -= datasetStorage['Tape']
            gLogger.always("\nThe following %d BK paths were not used since %d days" %
                           (len(unusedBKPaths), since))
            for infoType in storageTypes[0:3]:
                gLogger.always("\n=========== %s datasets ===========" % infoType)
                unusedPaths = unusedBKPaths & datasetStorage[infoType]
                counters = {}
                for ty in ('All', 'LFN'):
                    for i in range(2):
                        counters.setdefault(ty, []).append(
                            sum(bkPathUsage.get(bkPath, {}).get(ty, (0, 0))[i] for bkPath in unusedPaths))
                for bkPath in sorted(unusedPaths):
                    nLfns, lfnSize = bkPathUsage.get(bkPath, {}).get('LFN', (0, 0))
                    nPfns, pfnSize = bkPathUsage.get(bkPath, {}).get('All', (0, 0))
                    gLogger.always('\t%s (%d LFNs, %s), (%d PFNs, %s, %.1f replicas)' %
                                   (bkPath, nLfns, prSize(lfnSize), nPfns, prSize(pfnSize),
                                    float(nPfns) / float(nLfns)))
                gLogger.always("\nA total of %d %s LFNs (%s), %d PFNs (%s) were not used" %
                               (counters['LFN'][0], infoType, prSize(counters['LFN'][1]),
                                counters['All'][0], prSize(counters['All'][1])))
    else:
        unusedBKPaths = set()
        datasetStorage['Disk'] -= datasetStorage['Tape']

    # Now create a CSV file with all dataset information
    # Name, ProcessingPass, #files, size, SE type, each week's usage (before now)
    csvFile = 'popularity-%ddays.csv' % since if csvFile is None else csvFile
    gLogger.always("\n=============================================================")
    gLogger.always('Creating %s file with %d datasets' %
                   (csvFile, len(bkPathPopularity) + len(unusedBKPaths)))
    with open(csvFile, 'w') as fd:
        title = "Name;Configuration;ProcessingPass;FileType;Type;Creation-%s;" % binSize + \
                "NbLFN;LFNSize;NbDisk;DiskSize;NbTape;TapeSize;NbArchived;ArchivedSize;" + \
                ';'.join(site.split('.')[1] for site in storageSites) + \
                ";Nb Replicas;Nb ArchReps;Storage;FirstUsage;LastUsage;Now"
        # One column per usage bin
        for binNumber in range(nbBins):
            title += ';%d' % (1 + binNumber)
        fd.write(title + '\n')
        teraByte = 1000. * 1000. * 1000. * 1000.
        for bkPath in sorted(bkPathPopularity) + sorted(unusedBKPaths):
            # Skip unknown datasets
            if bkPath.startswith('Unknown-'):
                continue
            # Not interested in histograms
            splitBKPath = bkPath.split('/')
            fileType = splitBKPath[-1]
            if 'HIST' in fileType:
                continue
            # Only RAW for partition LHCb may be of interest (and even...)
            if fileType == 'RAW' and not bkPath.startswith('/LHCb'):
                continue
            info = bkPathUsage.get(bkPath, {})
            # check if the production is still active
            prods = prodForBKPath[bkPath]
            res = transClient.getTransformations({'TransformationID': list(prods)})
            creationTime = datetime.now()
            active = []
            for prodDict in res.get('Value', []):
                # Keep the earliest creation date over all productions
                creationTime = min(creationTime, prodDict['CreationDate'])
                if prodDict['Status'] in ('Active', 'Idle', 'Completed'):
                    active.append(str(prodDict['TransformationID']))
            if active:
                gLogger.always("Active productions %s found in %s" %
                               (','.join(sorted(active)), bkPath))
            if info['LFN'][0] == 0:
                continue
            # Convert all sizes to TB, in place
            for infoType in info:
                info[infoType][1] /= teraByte
            # Some BK paths contain a , to be replaces by a . for the CSV file!!
            config = '/'.join(splitBKPath[0:3])
            if ',' in bkPath:
                gLogger.always("BK path found with ',':", bkPath)
            # Name,Configuration,ProcessingPass, FileType
            row = '%s;%s;%s;%s' % (bkPath.replace('Real Data', 'RealData'), config,
                                   processingPass.get(bkPath, 'Unknown').replace('Real Data', 'RealData'),
                                   fileType)
            # Type
            configTypes = {'/MC/Dev': 2, '/MC/Upgrade': 3}
            configType = configTypes.get(config, 0 if bkPath.startswith('/MC') else 1)
            row += ';%d' % configType
            # CreationTime
            row += ';%d' % (getTimeBin(creationTime))
            # NbLFN,LFNSize,NbDisk,DiskSize,NbTape,TapeSize, NbArchived,ArchivedSize
            for infoType in ('LFN', 'Disk', 'Tape', 'Archived'):
                row += ';%d;%f' % tuple(info[infoType])
            for site in storageSites:
                row += ';%f' % info[site][1]
            # Average number of disk and archive replicas per LFN
            row += ';%f;%f' % (float(info['Disk'][0]) / float(info['LFN'][0]),
                               float(info['Archived'][0]) / float(info['LFN'][0]))
            if active:
                dsType = 'Active'
            else:
                dsType = 'Unknown'
                for infoType in storageTypes[0:3]:
                    if bkPath in datasetStorage[infoType]:
                        dsType = infoType
                        break
            row += ';%s' % dsType
            bins = sorted(bkPathPopularity.get(bkPath, {}))
            if not bins:
                bins = [0]
            row += ';%d;%d;%d' % (bins[0], bins[-1], nowBin)
            # Cumulative usage, one column per bin walking back from nowBin
            usage = 0
            for binNumber in range(nbBins):
                usage += bkPathPopularity.get(bkPath, {}).get(nowBin - binNumber, 0)
                row += ';%d' % usage
            fd.write(row + '\n')
    gLogger.always('\nSuccessfully wrote CSV file %s' % csvFile)