def _getnevents(self, das=True, refresh=False, tree='Events', limit=-1, checkfiles=False, verb=0): """Get number of nanoAOD events from DAS (default), or from files on storage system (das=False).""" nevents = self.nevents filenevts = self.filenevts treename = tree if nevents <= 0 or refresh: if checkfiles or (self.storage and not das ): # get number of events from storage system files = self.getfiles(url=True, das=das, refresh=refresh, limit=limit, verb=verb) for fname in files: nevts = getnevents(fname, treename) filenevts[fname] = nevts # cache nevents += nevts LOG.verb( "_getnevents: Found %d events in %r." % (nevts, fname), verb, 3) else: # get number of events from DAS for daspath in self.paths: nevents += getdasnevents(daspath, instance=self.instance, verb=verb - 1) if limit < 0: self.nevents = nevents return nevents, filenevts
def getfiles(self, refresh=False, url=True, verb=0):
    """Return the list of ROOT files of this sample.

    The cached ``self.files`` is returned directly unless the sample is
    refreshable and either nothing is cached yet or ``refresh`` is set. In
    that case the list is rebuilt per dataset path, from the storage system
    if ``self.storage`` is set and otherwise from a ``dasgoclient`` file
    query, then sorted and stored back in ``self.files``.

    Args:
        refresh: Rebuild the list even if one is cached.
        url: Prefix ``self.url`` to names that carry no URL yet.
        verb: Verbosity level for logging and the helper calls.
    """
    cached = self.files
    if not self.refreshable or (cached and not refresh):
        return cached
    found = []
    for path in self.paths:
        if self.storage:
            # list the files on the storage system
            sepath = repkey(self.storage, PATH=path).replace('//', '/')
            storage = getstorage(sepath, verb=verb - 1)
            candidates = storage.getfiles(url=url, verb=verb - 1)
        else:
            # list the files known to DAS
            dascmd = 'dasgoclient --query="file dataset=%s instance=%s"' % (path, self.instance)
            LOG.verb(repr(dascmd), verb)
            candidates = execute(dascmd, verb=verb - 1).split(os.linesep)
        for entry in candidates:
            fname = entry.strip()
            if not fname.endswith('.root'):
                continue  # keep ROOT files only
            if any(bad.endswith(fname) for bad in self.blacklist):
                continue  # blacklisted file
            if url and self.url not in fname and 'root://' not in fname:
                fname = self.url + fname
            found.append(fname)
    found.sort()  # consistent list order
    self.files = found
    return found
def dasgoclient(query, **kwargs):
    """Run a ``dasgoclient`` query and return the command output.

    Keyword Args:
        verb: Verbosity level (default 0).
        instance: If non-empty, appended to the query as ``instance=...``.
        limit: If positive, passed on as ``--limit=N``.
        opts: Extra command-line options, appended after stripping.

    Returns:
        Whatever ``execute`` returns for the assembled command.

    Raises:
        CalledProcessError: Re-raised after an error message with
            troubleshooting hints has been printed.
    """
    verbosity = kwargs.get('verb', 0)
    instance = kwargs.get('instance', "")
    limit = kwargs.get('limit', 0)
    option = kwargs.get('opts', "")
    if instance:
        query += " instance=%s" % (instance)
    # assemble the command from its space-separated pieces
    pieces = ['dasgoclient --query="%s"' % (query)]
    if limit > 0:
        pieces.append("--limit=%d" % (limit))
    if option:
        pieces.append(option.strip())
    dascmd = " ".join(pieces)
    try:
        LOG.verb(repr(dascmd), verbosity)
        cmdout = execute(dascmd, verb=verbosity - 1)
    except CalledProcessError as e:
        message = ("Failed to call 'dasgoclient' command. Please make sure:\n"
                   " 1) 'dasgoclient' command exists.\n"
                   " 2) You have a valid VOMS proxy. Use 'voms-proxy-init -voms cms -valid 200:0' or 'source utils/setupVOMS.sh'.\n"
                   " 3) The DAS dataset in '%s' exists!\n" % (dascmd))
        print(LOG.error(message))
        raise e
    return cmdout
def getnevents(self, das=True, refresh=False, treename='Events', verb=0):
    """Get the number of nanoAOD events of this sample.

    The cached ``self.nevents`` is returned unchanged when it is positive and
    ``refresh`` is false. Otherwise the total is recomputed from scratch and
    stored back in ``self.nevents``.

    Args:
        das: Query DAS (default). With ``das=False`` and a storage system
            set, the entries of tree ``treename`` are summed over the files
            returned by ``self.getfiles``.
        refresh: Recompute the total even if a positive count is cached.
        treename: Name of the tree whose entries are counted per file.
        verb: Verbosity level for logging and the helper calls.

    Returns:
        The total number of events.
    """
    nevents = self.nevents
    if nevents <= 0 or refresh:
        # Restart the tally so a refresh does not add the fresh count on top
        # of the stale cached value.
        nevents = 0
        if self.storage and not das:
            # count tree entries file by file
            files = self.getfiles(url=True, refresh=refresh, verb=verb)
            for fname in files:
                rootfile = ensureTFile(fname)
                try:
                    tree = rootfile.Get(treename)
                    if not tree:
                        LOG.warning("getnevents: No %r tree in events in %r!" % (treename, fname))
                        continue
                    nevts = tree.GetEntries()
                finally:
                    # close the file on every path, including a missing tree
                    rootfile.Close()
                nevents += nevts
                LOG.verb("getnevents: Found %d events in %r." % (nevts, fname), verb, 3)
        else:
            # ask DAS for the summary of each dataset path
            for daspath in self.paths:
                cmdout = dasgoclient("summary dataset=%s instance=%s" % (daspath, self.instance), verb=verb - 1)
                # test for exactly the token that is split on below
                if '"nevents":' in cmdout:
                    ndasevts = int(cmdout.split('"nevents":')[1].split(',')[0])
                else:
                    ndasevts = 0
                    LOG.warning("Could not get number of events from DAS for %r." % (self.name))
                nevents += ndasevts
        self.nevents = nevents
    return nevents
def getnevents(self, refresh=False, verb=0):
    """Get the number of events of this sample from DAS.

    The cached ``self.nevents`` is returned unchanged when it is positive and
    ``refresh`` is false. Otherwise a ``dasgoclient`` summary query is run
    for every dataset path and the sum is stored back in ``self.nevents``.

    Args:
        refresh: Query DAS again even if a positive count is cached.
        verb: Verbosity level for logging and the command execution.

    Returns:
        The total number of events.
    """
    nevents = self.nevents
    if nevents <= 0 or refresh:
        # Restart the tally so a refresh does not add the fresh count on top
        # of the stale cached value.
        nevents = 0
        for path in self.paths:
            dascmd = 'dasgoclient --query="summary dataset=%s instance=%s"' % (path, self.instance)
            LOG.verb(repr(dascmd), verb)
            cmdout = execute(dascmd, verb=verb - 1)
            # test for exactly the token that is split on below
            if '"nevents":' in cmdout:
                ndasevts = int(cmdout.split('"nevents":')[1].split(',')[0])
            else:
                # count nothing for this path instead of hitting an unbound
                # (or stale, from the previous path) ndasevts
                ndasevts = 0
                LOG.warning("Could not get number of events from DAS for %r." % (self.name))
            nevents += ndasevts
        self.nevents = nevents
    return nevents
def getfiles(self, das=False, refresh=False, url=True, limit=-1, verb=0):
    """Return a copy of the list of files of this sample.

    Files come from the storage system by default, or from DAS when
    ``das=True`` or no storage system is set. The list is (re)derived when
    the sample is refreshable and nothing is cached, ``das`` is set, or
    ``refresh`` is set; otherwise the cached list is reused and only the URL
    prefix is added or removed to match ``url``.

    Args:
        das: Get the files from DAS instead of the storage system.
        refresh: Rebuild the list even if one is cached.
        url: Return names with the URL prefix (True) or without (False).
        limit: If positive, maximum number of files taken per dataset path.
        verb: Verbosity level for logging and the helper calls.
    """
    LOG.verb("getfiles: das=%r, refresh=%r, url=%r, limit=%r, filelist=%r, len(files)=%d, len(filenevts)=%d" % (
        das, refresh, url, limit, self.filelist, len(self.files), len(self.filenevts)), verb, 1)
    if self.filelist and not self.files:
        # first call: read the file list from the text file
        self.loadfiles(self.filelist)
    files = self.files  # cached list
    if das and self.storage:
        url_ = self.dasurl
    else:
        url_ = self.url
    rederive = self.refreshable and (not files or das or refresh)
    if rederive:
        if not files or das:
            LOG.verb("getfiles: Retrieving files...", verb, 2)
        else:
            LOG.verb("getfiles: Refreshing file list...", verb, 2)
        files = []
        for daspath in self.paths:
            perpath = self.pathfiles[daspath] = []
            if (self.storage and not das) or (not self.instance):
                # list the files on the storage system
                postfix = self.postfix + '.root'
                sepath = repkey(self.storepath, PATH=daspath, DAS=daspath).replace('//', '/')
                outlist = self.storage.getfiles(sepath, url=url, verb=verb - 1)
                if limit > 0:
                    outlist = outlist[:limit]
            else:
                # list the files known to DAS
                postfix = '.root'
                outlist = getdasfiles(daspath, instance=self.instance, limit=limit, verb=verb - 1)
            for entry in outlist:
                fname = entry.strip()
                if not fname.endswith(postfix):
                    continue  # keep ROOT files with the expected postfix only
                if any(bad.endswith(fname) for bad in self.blacklist):
                    continue  # blacklisted file
                if url and url_ not in fname and 'root://' not in fname:
                    fname = url_ + fname
                files.append(fname)
                perpath.append(fname)
            perpath.sort()
            if not perpath:
                LOG.warning("getfiles: Did not find any files for %s" % (daspath))
        files.sort()  # consistent list order
        if not (das and self.storage):
            self.files = files  # cache, unless this was a DAS lookup next to a storage system
    elif url and any(url_ not in name for name in files):
        # add the URL where it is missing
        files = [(url_ + name if url_ not in name else name) for name in files]
    elif not url and any(url_ in name for name in files):
        # strip the URL
        files = [name.replace(url_, "") for name in files]
    return files[:]  # hand out a copy to protect self.files
def loadfiles(self, listname_, **kwargs):
    """Load file names from a text file for fast look-up in the future.

    ``listname_`` is expanded with ``repkey`` (``ERA``, ``GROUP``, ``SAMPLE``
    and, per dataset path, ``PATH``). Each list file is read line by line:

    * blank lines and lines starting with ``#`` are skipped, and only the
      first whitespace-separated token of a line is used;
    * ``DASPATH=/...`` lines switch the dataset path the following files
      belong to; files under a path not in ``self.paths`` are skipped;
    * other lines are matched against ``fevtsexp`` (file name with an
      optional event count, per the inline note ``$FILENAM(:NEVTS)``).

    Side effects: fills ``self.pathfiles`` and ``self.filenevts``, sets
    ``self.files`` to the sorted list, sets ``self.refreshable = True`` if a
    list file is missing or yields no files, and updates ``self.nevents``.

    Args:
        listname_: Name (pattern) of the text file holding the file list.
        **kwargs: Passed to ``LOG.getverbosity`` to determine the verbosity.

    Returns:
        The sorted list of file names, which is ``self.files`` itself.
    """
    verbosity = LOG.getverbosity(self, kwargs)
    listname = repkey(listname_, ERA=self.era, GROUP=self.group, SAMPLE=self.name)
    LOG.verb("loadfiles: listname=%r -> %r, len(files)=%d, len(filenevts)=%d" % (
        listname_, listname, len(self.files), len(self.filenevts)), verbosity, 1)
    filenevts = self.filenevts
    nevents = 0
    filelist = []
    # one list per dataset path only if the name pattern contains $PATH
    paths = self.paths if '$PATH' in listname else [self.paths[0]]
    for path in paths:
        listname_ = repkey(listname, PATH=path.strip('/').replace('/', '__'))
        if self.verbosity >= 1:
            print(">>> Loading sample files from %r..." % (listname_))
        self.pathfiles[path] = []
        if os.path.isfile(listname_):
            skip = False
            subpaths = []  # DASPATH headers seen, for the sanity check below
            with open(listname_, 'r') as listfile:
                for line in listfile:
                    line = line.strip().split()  # split at space to allow trailing comments
                    if not line:
                        continue
                    line = line[0].strip()  # consider only the first part of the line
                    if line[0] == '#':
                        continue  # comment line
                    if line.startswith("DASPATH="):
                        # header that tracks which DAS dataset path the next files belong to
                        path = line.split('=')[-1]
                        LOG.insist(path.count('/') >= 3 and path.startswith('/'),
                                   "DAS path %r in %s has wrong format. Need /SAMPLE/CAMPAIGN/FORMAT..." % (path, listname_))
                        if path in self.paths:
                            self.pathfiles[path] = []
                            subpaths.append(path)
                            skip = False
                        else:
                            skip = True  # files of a foreign dataset path
                    else:
                        if skip:
                            continue  # only load files for this sample's DAS dataset paths
                        match = fevtsexp.match(line)
                        if not match:
                            continue
                        infile = match.group(1)
                        nevts = None  # unknown unless the line carries an event count
                        if match.group(2):
                            nevts = int(match.group(2))
                            filenevts[infile] = nevts  # cache per-file count
                            nevents += nevts
                        filelist.append(infile)
                        self.pathfiles[path].append(infile)
                        if self.verbosity >= 3:
                            # a missing count used to hit an unbound or stale
                            # nevts here; show '?' for it instead
                            if nevts is None:
                                print(">>> %7s events for %s" % ('?', infile))
                            else:
                                print(">>> %7d events for %s" % (nevts, infile))
            if not filelist:
                LOG.warning("loadfiles: Did not find any files in %s!" % (listname_))
                self.refreshable = True
            else:
                # sanity check for empty per-path lists
                for subpath in subpaths:
                    if not self.pathfiles[subpath]:
                        LOG.warning("loadfiles: Did not find any files for path %s in %s!" % (subpath, listname_))
        else:
            LOG.warning("loadfiles: file list %s does not exist!" % (listname_))
            self.refreshable = True
    for path in self.paths:
        if path not in self.pathfiles:  # no list was loaded for this path
            LOG.warning("loadfiles: Did not find any files for path %s in %s!" % (path, listname))
    if self.nevents <= 0:
        self.nevents = nevents
    elif self.nevents != nevents:
        LOG.warning("loadfiles: stored nevents=%d does not match the sum total of file events, %d!" % (self.nevents, nevents))
        # The original line was the no-op comparison `self.nevents == nevents`.
        # Adopt the per-file total, but never replace a stored count with an
        # empty tally (a list without any event counts sums to 0).
        if nevents > 0:
            self.nevents = nevents
    self.files = filelist
    self.files.sort()
    return self.files
def _getnevents(self, das=True, refresh=False, tree='Events', limit=-1, checkfiles=False, ncores=0, verb=0):
    """Get the number of nanoAOD events of this sample.

    On first use the file list is loaded from ``self.filelist`` if one is
    set. The cached ``self.nevents`` is then returned unchanged when it is
    positive and ``refresh`` is false. Otherwise the total is recomputed from
    scratch, either file by file or per DAS dataset path.

    Args:
        das: Query DAS (default). With ``das=False`` and a storage system
            set, the events are counted per file instead.
        refresh: Recompute the total even if a positive count is cached.
        tree: Tree name handed to ``iterevts``.
        limit: Forwarded to ``self.getfiles``. The recomputed total is only
            cached when ``limit <= 0``.
        checkfiles: Force the per-file count, regardless of ``das``.
        ncores: Forwarded to ``iterevts``.
        verb: Verbosity level for logging and the helper calls.

    Returns:
        Tuple ``(nevents, filenevts)``; ``filenevts`` is the
        ``self.filenevts`` mapping, updated in place with per-file counts.
    """
    LOG.verb("_getnevents: das=%r, refresh=%r, tree=%r, limit=%r, checkfiles=%r, filelist=%r, len(files)=%d, len(filenevts)=%d" % (
        das, refresh, tree, limit, checkfiles, self.filelist, len(self.files), len(self.filenevts)), verb, 1)
    if self.filelist and not self.files:
        # first call: read the file list from the text file
        self.loadfiles(self.filelist)
    nevents = self.nevents
    filenevts = self.filenevts
    bar = None
    if nevents <= 0 or refresh:
        # Restart the tally: continuing from the cached value would add the
        # fresh count on top of the stale one, and the progress shown below
        # (nevents/self.nevents) would start at 100%.
        nevents = 0
        if checkfiles or (self.storage and not das):
            # count events file by file
            LOG.verb("_getnevents: Get events per file (storage=%r, das=%r)..." % (self.storage, das), verb, 2)
            files = self.getfiles(url=True, das=das, refresh=refresh, limit=limit, verb=verb)
            if verb <= 0 and len(files) >= 5:
                # progress bar only when not verbose and there are enough files
                bar = LoadingBar(len(files), width=20, pre=">>> Getting number of events: ", counter=True, remove=True)
            for nevts, fname in iterevts(files, tree, filenevts, refresh, ncores=ncores, verb=verb):
                filenevts[fname] = nevts  # cache per-file count
                nevents += nevts
                LOG.verb("_getnevents: Found %d events in %r." % (nevts, fname), verb, 3)
                if bar:
                    if self.nevents > 0:
                        # progress relative to the previously known total
                        bar.count("files, %d/%d events (%d%%)" % (nevents, self.nevents, 100.0 * nevents / self.nevents))
                    else:
                        bar.count("files, %d events" % (nevents))
        else:
            # ask DAS for the total of each dataset path
            LOG.verb("_getnevents: Get total number of events per path (storage=%r, das=%r)..." % (self.storage, das), verb, 2)
            for daspath in self.paths:
                nevts = getdasnevents(daspath, instance=self.instance, verb=verb - 1)
                LOG.verb("_getnevents: %10d events for %s..." % (nevts, daspath), verb, 2)
                nevents += nevts
        if limit <= 0:
            # a positive limit gives a partial count, which is not cached
            self.nevents = nevents
    else:
        LOG.verb("_getnevents: Reusing old number of events (nevents=%r, refresh=%r)..." % (nevents, refresh), verb, 2)
    return nevents, filenevts