from time import time

# The module paths below are assumptions based on the names used in this
# file and the usual WMCore layout; adjust them if FeederImpl or File live
# elsewhere in your tree.
from WMCore.WMBSFeeder.FeederImpl import FeederImpl
from WMCore.WMBS.File import File

# DbsQueryHelper and PhEDExQueryHelper wrap the DBS and PhEDEx data
# services; their import paths are not shown in this file, so import them
# from wherever they are defined in the surrounding package.


class RunTransferNotifier(FeederImpl):
    """
    Run / transfer feeder implementation
    """
    class WatchedRun:
        """
        Records a run that is being watched, and maintains state about
        transferred files for monitored datasets
        """
        def __init__(self, run):
            self.run = run
            self.creationTime = int(time())
            self.accessTime = int(time())
            self.datasetCompletion = {}

        def addDatasetOfInterest(self, dataset):
            """
            Adds an entry to Dataset : CompleteLocation tracking
            """
            if dataset not in self.datasetCompletion:
                self.datasetCompletion[dataset] = set()
                self.accessTime = int(time())

        def addCompletedNodes(self, dataset, nodes):
            """
            Records completed transfers for a dataset to one or more nodes
            """
            # datasetCompletion values are sets, so use update / add (the
            # original called extend / append, and tested an undefined
            # name "node")
            if isinstance(nodes, (list, set)):
                self.datasetCompletion[dataset].update(nodes)
            else:
                self.datasetCompletion[dataset].add(nodes)
            self.accessTime = int(time())

        def getNewSites(self, dataset, sitesWithRun):
            """
            Returns all sites that have not been marked as complete
            """
            return sitesWithRun - self.datasetCompletion[dataset]

    def __init__(self, startRun=None, purgeTime=48,
                 phedexUrl="https://cmsweb.cern.ch/phedex/datasvc/json/prod/fileReplicas",
                 dbsHost="cmsweb.cern.ch/dbs_discovery/",
                 dbsInstance="cms_dbs_prod_global", dbsPort=443):
        """
        Configure the feeder
        """
        # Configure data service look-up components
        self.dbsHelper = DbsQueryHelper(dbsHost, dbsPort, dbsInstance)
        self.phedexHelper = PhEDExQueryHelper(phedexUrl)

        # Runs that are being watched
        self.watchedRuns = []

        # The last run that was identified as new, and run purge time
        self.lastRun = 0
        self.purgeTime = purgeTime * 3600  # Convert hours to seconds

        # Bootstrap run list
        if not startRun:
            # Determine the last run registered in DBS
            runs = self.dbsHelper.getRuns()
            runs.sort(reverse=True)
            if len(runs) == 0:
                msg = "Could not bootstrap RunTransferNotifier feeder"
                raise RuntimeError(msg)
            # There are runs; ensure we ignore them in the first query
            self.lastRun = runs[0]
        else:
            # Ensure we include startRun in the first query
            self.lastRun = startRun - 1

    def __call__(self, filesets):
        """
        The algorithm itself
        """
        # Update run list
        self.getNewRuns()

        # Do per fileset work, abandon fileset processing on exception
        for fileset in filesets:
            ds = fileset.name
            try:
                # Do per run work
                watchCompleteFiles = []
                for watch in self.watchedRuns:
                    # Ensure watcher has dataset listed
                    watch.addDatasetOfInterest(ds)

                    # Query DBS to find all blocks for this run / dataset
                    (files, blocks, fileInfoMap) = \
                        self.dbsHelper.getFileInfo(watch.run, ds)

                    # Now determine all required parent blocks
                    parentBlocks = set()
                    if fileset.requireParents:
                        parentDs = self.dbsHelper.getParentDataset(ds)
                        parentBlocks = self.dbsHelper.getBlockInfo(watch.run,
                                                                   parentDs)

                    # Final set of all required blocks (the original copied
                    # blocks as a list, which has no update method)
                    allBlocks = set(blocks)
                    allBlocks.update(parentBlocks)

                    # Find all sites where all required blocks are complete
                    # (the original queried only "blocks", leaving allBlocks
                    # unused, which looks like a bug)
                    sites = self.phedexHelper.getCompleteSites(allBlocks)

                    # Get sites with newly completed transfers
                    newSites = watch.getNewSites(ds, sites)

                    if len(newSites) > 0:
                        # Add the files for these blocks to the fileset
                        for lfn, fi in fileInfoMap.items():
                            # First add the parent file
                            if fileset.requireParents:
                                parentFile = File(lfn=fi["file.parent"])
                                parentFile.save()
                                parentFile.setLocation(newSites)

                            # Add the actual file
                            fileToAdd = File(lfn=lfn, size=fi["file.size"],
                                             events=fi["file.events"],
                                             run=watch.run,
                                             lumi=fi["file.lumi"])
                            if not fileToAdd.exists() \
                                    and fileset.requireParents:
                                fileToAdd.addParent(fi["file.parent"])

                            # Add new locations, but don't persist
                            # immediately
                            fileToAdd.setLocations(newSites,
                                                   immediateSave=False)

                            # Add the file to the new file list
                            fileset.addFile(fileToAdd)

                    # Add the site info to the watcher list
                    watchCompleteFiles.append([watch, ds, newSites])

                # Commit the fileset
                fileset.commit()

                # Mark the transfers as complete only after the commit
                for a in watchCompleteFiles:
                    a[0].addCompletedNodes(a[1], a[2])
            except Exception:
                # Reset the watch list so we re-evaluate next call
                watchCompleteFiles = []

        # Purge old runs
        self.purgeWatchedRuns()

    def getNewRuns(self):
        """
        Queries DBS to determine what new runs are present, and adds a
        watcher for each of them
        """
        runs = self.dbsHelper.getRuns(self.lastRun)
        runs.sort()
        for run in runs:
            self.watchedRuns.append(self.WatchedRun(run))
            self.lastRun = run

    def purgeWatchedRuns(self):
        """
        Purges watched runs that were last accessed longer than purgeTime
        seconds ago
        """
        validRuns = []
        for run in self.watchedRuns:
            if int(time()) - run.accessTime < self.purgeTime:
                validRuns.append(run)
        self.watchedRuns = validRuns
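# ---------------------------------------------------------------------------
# Illustrative sketch (an addition, not part of the original feeder): minimal
# stand-ins showing the duck-typed interface the feeder expects from its two
# helpers. The method names and argument lists are taken from the calls the
# feeder makes above; the return values are made-up examples.
# ---------------------------------------------------------------------------
class StubDbsQueryHelper:
    """Stand-in exposing the DBS calls RunTransferNotifier makes"""

    def __init__(self, host, port, instance):
        self.connection = (host, port, instance)

    def getRuns(self, lastRun=0):
        # All runs newer than lastRun, as a sortable list
        return [run for run in (1000, 1001, 1002) if run > lastRun]

    def getFileInfo(self, run, dataset):
        # Returns (files, blocks, fileInfoMap); fileInfoMap is keyed by LFN
        fileInfoMap = {
            "/store/data/example.root": {"file.size": 1024,
                                         "file.events": 10,
                                         "file.lumi": 1,
                                         "file.parent":
                                             "/store/data/parent.root"}}
        return (list(fileInfoMap), {dataset + "#block1"}, fileInfoMap)

    def getParentDataset(self, dataset):
        # Hypothetical naming scheme, purely for illustration
        return dataset + "-parent"

    def getBlockInfo(self, run, dataset):
        # Blocks belonging to the (parent) dataset for this run
        return {dataset + "#block1"}


class StubPhEDExQueryHelper:
    """Stand-in exposing the PhEDEx call RunTransferNotifier makes"""

    def __init__(self, url):
        self.url = url

    def getCompleteSites(self, blocks):
        # Names of sites holding complete replicas of every block
        return {"T2_XX_Example"}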
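# ---------------------------------------------------------------------------
# Illustrative usage sketch (an addition, not part of the original feeder):
# how the feeder is typically driven. It wires in the stub helpers above as
# DbsQueryHelper / PhEDExQueryHelper so no real DBS or PhEDEx service is
# needed; a real fileset is any object exposing the name / requireParents /
# addFile / commit interface used by __call__.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Point the feeder at the stubs (module-level names looked up by
    # __init__); replace with the real helpers in production
    DbsQueryHelper = StubDbsQueryHelper
    PhEDExQueryHelper = StubPhEDExQueryHelper

    # startRun=1000 makes the first DBS query start from run 1000; omitting
    # startRun would instead skip every run already registered in DBS
    feeder = RunTransferNotifier(startRun=1000, purgeTime=24)

    # Filesets normally come from WMBS; each call polls watched runs for
    # newly completed transfers and purges stale watchers
    feeder(filesets=[])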