def __loadChain(self):
    ''' Load the TChain. Private. '''
    if len(self.files) == 0:
        raise helpers.EmptySampleError("Sample {name} has no input files! Cannot load.".format(name=self.name))
    else:
        self._chain = ROOT.TChain(self.treeName)
        counter = 0
        for f in self.files:
            logger.debug("Now adding file %s to sample '%s'", f, self.name)
            try:
                if helpers.checkRootFile(f, checkForObjects=[self.treeName]):
                    self._chain.Add(f)
                    counter += 1
                else:
                    logger.error("Check of root file failed. Skipping. File: %s", f)
            except IOError:
                logger.error("Could not load file %s", f)
        logger.debug("Loaded %i files for sample '%s'.", counter, self.name)

    # Add friends. Catch cases where cached samples have no default value for the friends attribute.
    if hasattr(self, 'friends'):
        for friend_sample, friend_treeName in self.friends:
            self.chain.AddFriend(friend_sample.chain, friend_treeName)
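# Usage sketch (hedged): __loadChain is private and is normally triggered lazily through the
# public 'chain' property. The friend mechanism above expects 'self.friends' to hold
# (friend_sample, friend_treeName) tuples; the sample names and file paths below are hypothetical.
from RootTools.core.Sample import Sample

s      = Sample.fromFiles("signal",  files=["/data/signal.root"],  treeName="Events")   # hypothetical files
friend = Sample.fromFiles("weights", files=["/data/weights.root"], treeName="Events")
s.friends = [(friend, "weights")]
s.chain.Draw("pt")   # first access loads the TChain and attaches the friend tree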
def __loadChain(self):
    ''' Load the TChain. Private. '''
    if len(self.files) == 0:
        raise helpers.EmptySampleError("Sample {name} has no input files! Cannot load.".format(name=self.name))
    else:
        self._chain = ROOT.TChain(self.treeName)
        counter = 0
        for f in self.files:
            logger.debug("Now adding file %s to sample '%s'", f, self.name)
            try:
                if helpers.checkRootFile(f, checkForObjects=[self.treeName]):
                    self._chain.Add(f)
                    counter += 1
            except IOError:
                logger.warning("Could not load file %s", f)
        logger.debug("Loaded %i files for sample '%s'.", counter, self.name)
def __init__(self, heppy_samples, dpm_directories, cache_file, multithreading=True,
             maxN=None, overwrite=False):  # maxN and overwrite were referenced but undefined in this snippet; assumed keyword arguments with these defaults
    # Read cache file, if it exists
    if os.path.exists(cache_file) and not overwrite:
        self.sample_map = pickle.load(open(cache_file, 'rb'))
        logger.info("Loaded cache file %s", cache_file)
    else:
        logger.info("Cache file %s not found. Recreate map.", cache_file)
        logger.info("Check proxy.")

        # Proxy certificate
        from RootTools.core.helpers import renew_proxy
        # Make proxy in afs to allow batch jobs to run
        proxy_path = os.path.expandvars('$HOME/private/.proxy')
        proxy = renew_proxy(proxy_path)
        logger.info("Using proxy %s", proxy)

        # Read dpm directories
        self.cmg_directories = {}
        for data_path in dpm_directories:
            logger.info("Walking dpm directory %s", data_path)
            walker = walk_dpm(data_path)
            self.cmg_directories[data_path] = walker.walk_dpm_cmgdirectories('.', maxN=maxN)

        logger.info("Now mapping directories to heppy samples")
        for heppy_sample in heppy_samples:
            heppy_sample.candidate_directories = []
            pd, era = heppy_sample.dataset.split('/')[1:3]
            for data_path in self.cmg_directories.keys():
                for dpm_directory in self.cmg_directories[data_path].keys():
                    if not ('/%s/' % pd in dpm_directory):
                        logger.debug("/%s/ not in dpm_directory %s", pd, dpm_directory)
                        continue
                    if not ('/' + era in dpm_directory):
                        logger.debug("/%s not in dpm_directory %s", era, dpm_directory)
                        continue
                    heppy_sample.candidate_directories.append([data_path, dpm_directory])
                    logger.debug("heppy sample %s in %s", heppy_sample.name, dpm_directory)
            logger.info("Found heppy sample %s in %i directories.", heppy_sample.name, len(heppy_sample.candidate_directories))

        # Merge
        from RootTools.core.Sample import Sample
        logger.info("Now making new sample map from %i directories and for %i heppy samples to be stored in %s",
                    len(dpm_directories), len(heppy_samples), cache_file)
        self.sample_map = {}
        for heppy_sample in heppy_samples:
            if len(heppy_sample.candidate_directories) == 0:
                logger.info("No directory found for %s", heppy_sample.name)
            else:
                # 'walker' is the last instance from the loop above; any instance can combine directories.
                normalization, files = walker.combine_cmg_directories(
                    cmg_directories={dpm_directory: self.cmg_directories[data_path][dpm_directory]
                                     for data_path, dpm_directory in heppy_sample.candidate_directories},
                    multithreading=multithreading,
                )
                logger.info("Sample %s: Found a total of %i files with normalization %3.2f",
                            heppy_sample.name, len(files), normalization)

                # Keep only files that pass the root-file check
                tmp_files = []
                for f in files:
                    isGoodFile = False
                    try:
                        isGoodFile = checkRootFile("root://hephyse.oeaw.ac.at/" + os.path.join(f))
                        logger.debug("File %s got added", f)
                    except IOError:
                        logger.info("File %s is corrupted, skipping", f)
                    if isGoodFile:
                        tmp_files.append(f)

                self.sample_map[heppy_sample] = Sample.fromFiles(
                    heppy_sample.name,
                    files=['root://hephyse.oeaw.ac.at/' + f for f in tmp_files],
                    normalization=normalization,
                    treeName='tree',
                    isData=heppy_sample.isData,
                    maxN=maxN)
                logger.info("Combined %i directories for sample %s to a total of %i files with normalization %3.2f",
                            len(heppy_sample.candidate_directories), heppy_sample.name, len(files), normalization)

        # Store cache file
        dir_name = os.path.dirname(cache_file)
        if len(self.sample_map.keys()) > 0:
            if not os.path.exists(dir_name):
                os.makedirs(dir_name)
            pickle.dump(self.sample_map, open(cache_file, 'wb'))
            logger.info("Created MC sample cache %s", cache_file)
        else:
            logger.info("Skipping writing %s because the map is empty.", cache_file)
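# Usage sketch (hedged): the enclosing class name is not shown in this snippet, so 'HeppySampleMap'
# is a hypothetical placeholder; 'my_heppy_samples' stands for a list of heppy sample objects with
# .name, .dataset and .isData attributes, and the dpm path and cache file location are made up.
import os

sample_map = HeppySampleMap(
    heppy_samples   = my_heppy_samples,
    dpm_directories = ['/dpm/oeaw.ac.at/home/cms/store/user/someuser/cmgTuples/'],
    cache_file      = os.path.expandvars('$HOME/cache/sample_map.pkl'),
).sample_map   # {heppy_sample: Sample}, rebuilt only when the pickle cache is missing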
def fromDAS(cls, name, dataset, instance = 'global', prefix = 'root://cms-xrd-global.cern.ch/',
            texName = None, maxN = None, dbFile = None, overwrite = False, skipCheck = False):
    ''' Make sample from DAS. '''
    # https://github.com/CERN-PH-CMG/cmg-cmssw/blob/0f1d3bf62e7ec91c2e249af1555644b7f414ab50/CMGTools/Production/python/dataset.py#L437
    maxN  = maxN if maxN is not None and maxN > 0 else None
    limit = maxN if maxN else 0
    DASname = dataset.rstrip('/')

    n_cache_files = 0
    # Don't use the cache on partial queries
    if dbFile is not None and (maxN is None or maxN < 0):
        cache = Database(dbFile, "fileCache", ["name"])
        n_cache_files = cache.contains({'name': name})
    else:
        cache = None

    if n_cache_files and not overwrite:
        files = [f["value"] for f in cache.getDicts({'name': name})]
        logger.info('Found sample %s in cache %s, return %i files.', name, dbFile, len(files))
    else:
        if overwrite and cache is not None:
            cache.removeObjects({"name": name})

        def _dasPopen(dbs):
            if 'LSB_JOBID' in os.environ:
                raise RuntimeError("Trying to do a DAS query while in a LXBatch job (env variable LSB_JOBID defined)\nquery was: %s" % dbs)
            logger.info('DAS query\t: %s', dbs)
            return os.popen(dbs)

        query, qwhat = DASname, "dataset"
        if "#" in DASname: qwhat = "block"

        dbs    = 'dasgoclient -query="file %s=%s instance=prod/%s" --limit %i' % (qwhat, query, instance, limit)
        dbsOut = _dasPopen(dbs).readlines()

        files = []
        for line in dbsOut:
            if line.startswith('/store/'):
                filename = line.rstrip()
                try:
                    if skipCheck or helpers.checkRootFile(prefix + filename):
                        files.append(filename)
                except IOError:
                    logger.warning("IOError for file %s. Skipping.", filename)

                if cache is not None:
                    cache.add({"name": name}, filename, save=True)

    if limit > 0: files = files[:limit]

    result = cls(name, files=[prefix + file for file in files], texName = texName)
    result.DASname = DASname
    return result
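# Usage sketch (hedged): querying DAS with dasgoclient through fromDAS. The dataset path and the
# dbFile name below are hypothetical; with a dbFile set and no maxN, the file list is cached and
# reused on the next run.
from RootTools.core.Sample import Sample

sample = Sample.fromDAS(
    "DYJetsToLL_M50",                                   # hypothetical sample name
    dataset = "/DYJetsToLL_M-50_TuneCP5_13TeV-madgraphMLM-pythia8/RunIIAutumn18MiniAOD-102X_v15-v1/MINIAODSIM",  # hypothetical DAS path
    dbFile  = "sample_cache.sql",                       # hypothetical cache location
)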
def fromCMGCrabDirectory(cls, name, baseDirectory, treeFilename = 'tree.root', treeName = 'tree', maxN = None, \
        selectionString = None, weightString = None, isData = False, color = 0, texName = None):
    '''Load a CMG crab output directory
    '''
    import tarfile
    from cmg_helpers import read_cmg_normalization
    maxN = maxN if maxN is not None and maxN > 0 else None

    # Walk through all subdirectories and pick up pairs of files '..._n.root' and '..._n.tgz' where n is the job number
    treeFiles = {}
    zipFiles  = {}
    for root, subFolders, filenames in os.walk(baseDirectory):
        for filename in filenames:
            base, ext = os.path.splitext(filename)
            try:
                n = int(base.split('_')[-1])
            except ValueError:
                # filename is not of the form 'xyz_n' where n is the job number
                continue
            # add the tgz and root files to the dicts
            filename_ = os.path.join(root, filename)
            if ext == '.root': treeFiles[n] = filename_
            if ext == '.tgz':  zipFiles[n]  = filename_

    # Find pairs of zip and root files
    pairs  = set(zipFiles.keys()) & set(treeFiles.keys())
    n_jobs = len(set(zipFiles.keys()) | set(treeFiles.keys()))

    normalization = 0
    files = []
    failedJobs = []
    for n in pairs:
        sumW = None
        tf = tarfile.open(zipFiles[n], 'r:gz')
        for f in tf.getmembers():
            if "SkimReport.txt" in f.name:
                sumW = read_cmg_normalization(tf.extractfile(f))
            if sumW is not None: break
        if sumW is None:
            logger.warning("No normalization found when reading tar file %s", zipFiles[n])
        tf.close()

        # Check the tree file for whether the tree 'treeName' can be found.
        # This is an implicit check for broken, recovered or otherwise corrupted root files.
        treeFile = treeFiles[n] if helpers.checkRootFile(treeFiles[n], checkForObjects=[treeName]) else None

        if treeFile is None:
            logger.warning("File %s looks broken. Checked for presence of tree %s.", treeFiles[n], treeName)

        # If both normalization and tree file are OK, call it successful.
        if sumW and treeFile:
            files.append(treeFile)
            normalization += sumW
            logger.debug("Successfully read job %i and incremented normalization by %7.2f", n, sumW)
        else:
            failedJobs.append(n)

    # Don't allow empty samples
    if len(files) == 0:
        raise helpers.EmptySampleError("Could not find valid crab CMG output for sample {0}. Total number of jobs: {1}. baseDirectory: {2}"\
            .format(name, len(pairs), baseDirectory))

    # Log statements (n_jobs is the total; pairs are the jobs with both tgz and root)
    eff = 100 * len(failedJobs) / float(n_jobs)
    logger.debug("Loaded CMGOutput sample %s. Total number of jobs: %i, both tgz and root: %i. Normalization: %7.2f Bad: %i. Inefficiency: %3.3f", \
        name, n_jobs, len(pairs), normalization, len(failedJobs), eff)

    logger.debug("Read %i chunks and total normalization of %f", len(files), normalization)
    return cls(name = name, treeName = treeName, files = files, normalization = normalization,
               selectionString = selectionString, weightString = weightString,
               isData = isData, color = color, texName = texName)
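# Usage sketch (hedged): loading crab output where each job n left a '<stem>_<n>.root' /
# '<stem>_<n>.tgz' pair somewhere below the base directory. The path below is hypothetical, and
# reading 'normalization' off the returned sample assumes the class stores the constructor argument.
from RootTools.core.Sample import Sample

sample = Sample.fromCMGCrabDirectory("TTJets", baseDirectory="/data/crab/TTJets/")
print("normalization: %f" % sample.normalization)   # summed sum-of-weights from the SkimReport.txt files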
def fromCMGOutput(cls, name, baseDirectory, treeFilename = 'tree.root', chunkString = None, treeName = 'tree', maxN = None, \
        selectionString = None, weightString = None, isData = False, color = 0, texName = None):
    '''Load a CMG output directory from e.g. unzipped crab output in the 'Chunks' directory structure.
       Expects the presence of the tree root file and the SkimReport.txt
    '''
    from cmg_helpers import read_cmg_normalization
    maxN = maxN if maxN is not None and maxN > 0 else None

    # Read all subdirectories in the base directory. If chunkString is not None, require cmg output name formatting
    chunkDirectories = []

    # FIXME: Better to loop only over subdirectories in the base directory?
    for x in os.listdir(baseDirectory):
        if os.path.isdir(os.path.join(baseDirectory, x)):
            if not chunkString or (x.startswith(chunkString) and x.endswith('_Chunk')) or x == chunkString:
                chunkDirectories.append(os.path.join(baseDirectory, x))
                if len(chunkDirectories) == maxN: break

    logger.debug("Found %i chunk directories with chunkString %s in base directory %s", \
        len(chunkDirectories), chunkString, baseDirectory)

    normalization = 0
    files = []
    failedChunks = []
    goodChunks   = []

    for i, chunkDirectory in enumerate(chunkDirectories):
        success = False
        logger.debug("Reading chunk %s", chunkDirectory)

        # Find normalization
        sumW = None
        for root, subFolders, filenames in os.walk(chunkDirectory):
            # Determine the normalization constant
            if 'SkimReport.txt' in filenames:
                skimReportFilename = os.path.join(root, 'SkimReport.txt')
                with open(skimReportFilename, 'r') as fin:
                    sumW = read_cmg_normalization(fin)
                if not sumW:
                    logger.warning("Read chunk %s and found report '%s' but could not read normalization.",
                                   chunkDirectory, skimReportFilename)

        # Find the tree file
        treeFile = None
        for root, subFolders, filenames in os.walk(chunkDirectory):
            # Load the tree file
            if treeFilename in filenames:
                treeFile = os.path.join(root, treeFilename)
                # Check whether the root file is OK and contains the tree
                if not helpers.checkRootFile(treeFile, checkForObjects=[treeName]):
                    logger.warning("Read chunk %s and found tree file '%s' but file looks broken.",
                                   chunkDirectory, treeFile)
                    treeFile = None  # do not count a broken file as success

        # If both normalization and tree file are OK, call it successful.
        if sumW and treeFile:
            files.append(treeFile)
            normalization += sumW
            logger.debug("Successfully read chunk %s and incremented normalization by %7.2f", chunkDirectory, sumW)
            success = True
            goodChunks.append(chunkDirectory)

        if not success:
            failedChunks.append(chunkDirectory)

    # Don't allow empty samples
    if len(goodChunks) == 0:
        raise helpers.EmptySampleError("Could not find good CMGOutput chunks for sample {0}. Total number of chunks: {1}. baseDirectory: {2}"\
            .format(name, len(chunkDirectories), baseDirectory))

    # Log statements
    eff = 100 * len(failedChunks) / float(len(chunkDirectories))
    logger.debug("Loaded CMGOutput sample %s. Total number of chunks: %i. Normalization: %7.2f Bad: %i. Inefficiency: %3.3f", \
        name, len(chunkDirectories), normalization, len(failedChunks), eff)
    for chunk in failedChunks:
        logger.debug("Failed to load chunk %s", chunk)

    logger.debug("Read %i chunks and total normalization of %f", len(files), normalization)
    return cls(name = name, treeName = treeName, files = files, normalization = normalization,
               selectionString = selectionString, weightString = weightString,
               isData = isData, color = color, texName = texName)
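# Usage sketch (hedged): reading unzipped 'Chunk' output; the base directory below is hypothetical.
# chunkString restricts the scan to subdirectories starting with it and ending in '_Chunk'.
from RootTools.core.Sample import Sample

sample = Sample.fromCMGOutput(
    "WJets",
    baseDirectory = "/data/cmgTuples/WJets/",   # hypothetical path
    chunkString   = "WJets",
)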
argParser.add_argument('--logLevel', action='store', nargs='?',
                       choices=['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG', 'TRACE', 'NOTSET'],
                       default='INFO', help="Log level for logging")

args = argParser.parse_args()
logger = logger.get_logger(args.logLevel, logFile=None)

# Check if the gridpack exists
if not os.path.exists(args.gridpack):
    logger.error("Gridpack %s not found. Exit", args.gridpack)
    sys.exit(0)

# Strip the '.tar.xz' suffix (str.rstrip would drop any trailing characters from that set, not the suffix)
gp = os.path.basename(args.gridpack)
if gp.endswith('.tar.xz'):
    gp = gp[:-len('.tar.xz')]

# Check if the output is there
out_file = os.path.join(skim_output_directory, 'gen', args.outDir, gp, gp + '.root')
if os.path.exists(out_file) and checkRootFile(out_file, checkForObjects=["Events"]):
    logger.info("Found output file %s.", out_file)
    if args.overwrite:
        os.remove(out_file)
        logger.info("Deleted, because I overwrite.")
    else:
        sys.exit(0)
else:
    logger.info("Did not find output file %s. Look for gen sample.", out_file)

# Check if the intermediate gen file is there
gen_file = os.path.join(args.genSampleDir, gp, 'events.root')
if os.path.exists(gen_file) and checkEDMRootFile(gen_file):
    logger.info("Found edm gen file %s.", gen_file)
    if args.overwriteGenFile:
        os.remove(gen_file)
def fromDPMDirectory(cls, name, directory, prefix='root://hephyse.oeaw.ac.at/',
                     texName=None, maxN=None, dbFile=None, overwrite=False, skipCheck=False):
    maxN  = maxN if maxN is not None and maxN > 0 else None
    limit = maxN if maxN else 0

    n_cache_files = 0
    # Don't use the cache on partial queries
    if dbFile is not None and (maxN is None or maxN < 0):
        cache = Database(dbFile, "fileCache", ["name"])
        n_cache_files = cache.contains({'name': name})
    else:
        cache = None

    if n_cache_files and not overwrite:
        files = [f["value"] for f in cache.getDicts({'name': name})]
        logger.info('Found sample %s in cache %s, return %i files.', name, dbFile, len(files))
    else:
        if overwrite and cache is not None:
            cache.removeObjects({"name": name})

        def _dasPopen(dbs):
            if 'LSB_JOBID' in os.environ:
                raise RuntimeError("Trying to do a DAS query while in a LXBatch job (env variable LSB_JOBID defined)\nquery was: %s" % dbs)
            logger.info('DAS query\t: %s', dbs)
            return os.popen(dbs)

        files  = []
        dbs    = 'xrdfs %s ls %s' % (prefix, directory)
        dbsOut = _dasPopen(dbs).readlines()

        for line in dbsOut:
            if line.startswith('/store/'):
                filename = line.rstrip()
                try:
                    if skipCheck or helpers.checkRootFile(prefix + filename):
                        files.append(filename)
                except IOError:
                    logger.warning("IOError for file %s. Skipping.", filename)

                if cache is not None:
                    cache.add({"name": name}, filename, save=True)

    if limit > 0: files = files[:limit]

    result = cls(name, files=[prefix + file for file in files], texName=texName)
    result.DASname = prefix + directory.rstrip("/")
    return result
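# Usage sketch (hedged): listing root files in a DPM directory over xrootd. The /store path below
# is hypothetical; note that only listing lines starting with '/store/' survive the filter above.
from RootTools.core.Sample import Sample

sample = Sample.fromDPMDirectory(
    "TTJets_local",
    directory = "/store/user/someuser/TTJets/",   # hypothetical DPM path
    dbFile    = "dpm_cache.sql",                  # hypothetical file-list cache
)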
def fromCMGCrabDirectory(cls, name, baseDirectory, treeFilename = 'tree.root', treeName = 'tree', maxN = None, xSection = -1, \
        selectionString = None, weightString = None, isData = False, color = 0, texName = None):
    '''Load a CMG crab output directory
    '''
    import tarfile
    from cmg_helpers import read_cmg_normalization
    maxN = maxN if maxN is not None and maxN > 0 else None

    # Walk through all subdirectories and pick up pairs of files '..._n.root' and '..._n.tgz' where n is the job number
    treeFiles = {}
    zipFiles  = {}
    for root, subFolders, filenames in os.walk(baseDirectory):
        for filename in filenames:
            base, ext = os.path.splitext(filename)
            try:
                n = int(base.split('_')[-1])
            except ValueError:
                # filename is not of the form 'xyz_n' where n is the job number
                continue
            # add the tgz and root files to the dicts
            filename_ = os.path.join(root, filename)
            if ext == '.root': treeFiles[n] = filename_
            if ext == '.tgz':  zipFiles[n]  = filename_

    # Find pairs of zip and root files
    pairs  = set(zipFiles.keys()) & set(treeFiles.keys())
    n_jobs = len(set(zipFiles.keys()) | set(treeFiles.keys()))

    normalization = 0
    files = []
    failedJobs = []
    for n in pairs:
        sumW = None
        tf = tarfile.open(zipFiles[n], 'r:gz')
        for f in tf.getmembers():
            if "SkimReport.txt" in f.name:
                sumW = read_cmg_normalization(tf.extractfile(f))
            if sumW is not None: break
        if sumW is None:
            logger.warning("No normalization found when reading tar file %s", zipFiles[n])
        tf.close()

        # Check the tree file for whether the tree 'treeName' can be found.
        # This is an implicit check for broken, recovered or otherwise corrupted root files.
        treeFile = treeFiles[n] if helpers.checkRootFile(treeFiles[n], checkForObjects=[treeName]) else None

        if treeFile is None:
            logger.warning("File %s looks broken. Checked for presence of tree %s.", treeFiles[n], treeName)

        # If both normalization and tree file are OK, call it successful.
        if sumW and treeFile:
            files.append(treeFile)
            normalization += sumW
            logger.debug("Successfully read job %i and incremented normalization by %7.2f", n, sumW)
        else:
            failedJobs.append(n)

    # Don't allow empty samples
    if len(files) == 0:
        raise helpers.EmptySampleError("Could not find valid crab CMG output for sample {0}. Total number of jobs: {1}. baseDirectory: {2}"\
            .format(name, len(pairs), baseDirectory))

    # Log statements (n_jobs is the total; pairs are the jobs with both tgz and root)
    eff = 100 * len(failedJobs) / float(n_jobs)
    logger.debug("Loaded CMGOutput sample %s. Total number of jobs: %i, both tgz and root: %i. Normalization: %7.2f Bad: %i. Inefficiency: %3.3f", \
        name, n_jobs, len(pairs), normalization, len(failedJobs), eff)

    logger.debug("Read %i chunks and total normalization of %f", len(files), normalization)
    return cls(name = name, treeName = treeName, files = files, normalization = normalization, xSection = xSection,
               selectionString = selectionString, weightString = weightString,
               isData = isData, color = color, texName = texName)
def fromCMGOutput(cls, name, baseDirectory, treeFilename = 'tree.root', chunkString = None, treeName = 'tree', maxN = None, \
        selectionString = None, xSection = -1, weightString = None, isData = False, color = 0, texName = None):
    '''Load a CMG output directory from e.g. unzipped crab output in the 'Chunks' directory structure.
       Expects the presence of the tree root file and the SkimReport.txt
    '''
    from cmg_helpers import read_cmg_normalization
    maxN = maxN if maxN is not None and maxN > 0 else None

    # Read all subdirectories in the base directory. If chunkString is not None, require cmg output name formatting
    chunkDirectories = []

    # FIXME: Better to loop only over subdirectories in the base directory?
    for x in os.listdir(baseDirectory):
        if os.path.isdir(os.path.join(baseDirectory, x)):
            if not chunkString or (x.startswith(chunkString) and x.endswith('_Chunk')) or x == chunkString:
                chunkDirectories.append(os.path.join(baseDirectory, x))
                if len(chunkDirectories) == maxN: break

    logger.debug("Found %i chunk directories with chunkString %s in base directory %s", \
        len(chunkDirectories), chunkString, baseDirectory)

    normalization = 0
    files = []
    failedChunks = []
    goodChunks   = []

    for i, chunkDirectory in enumerate(chunkDirectories):
        success = False
        logger.debug("Reading chunk %s", chunkDirectory)

        # Find normalization
        sumW = None
        for root, subFolders, filenames in os.walk(chunkDirectory):
            # Determine the normalization constant
            if 'SkimReport.txt' in filenames:
                skimReportFilename = os.path.join(root, 'SkimReport.txt')
                with open(skimReportFilename, 'r') as fin:
                    sumW = read_cmg_normalization(fin)
                if not sumW:
                    logger.warning("Read chunk %s and found report '%s' but could not read normalization.",
                                   chunkDirectory, skimReportFilename)

        # Find the tree file
        treeFile = None
        for root, subFolders, filenames in os.walk(chunkDirectory):
            # Load the tree file
            if treeFilename in filenames:
                treeFile = os.path.join(root, treeFilename)
                # Check whether the root file is OK and contains the tree
                if not helpers.checkRootFile(treeFile, checkForObjects=[treeName]):
                    logger.warning("Read chunk %s and found tree file '%s' but file looks broken.",
                                   chunkDirectory, treeFile)
                    treeFile = None  # do not count a broken file as success

        # If both normalization and tree file are OK, call it successful.
        if sumW and treeFile:
            files.append(treeFile)
            normalization += sumW
            logger.debug("Successfully read chunk %s and incremented normalization by %7.2f", chunkDirectory, sumW)
            success = True
            goodChunks.append(chunkDirectory)

        if not success:
            failedChunks.append(chunkDirectory)

    # Don't allow empty samples
    if len(goodChunks) == 0:
        raise helpers.EmptySampleError("Could not find good CMGOutput chunks for sample {0}. Total number of chunks: {1}. baseDirectory: {2}"\
            .format(name, len(chunkDirectories), baseDirectory))

    # Log statements
    eff = 100 * len(failedChunks) / float(len(chunkDirectories))
    logger.debug("Loaded CMGOutput sample %s. Total number of chunks: %i. Normalization: %7.2f Bad: %i. Inefficiency: %3.3f", \
        name, len(chunkDirectories), normalization, len(failedChunks), eff)
    for chunk in failedChunks:
        logger.debug("Failed to load chunk %s", chunk)

    logger.debug("Read %i chunks and total normalization of %f", len(files), normalization)
    return cls(name = name, treeName = treeName, files = files, normalization = normalization,
               selectionString = selectionString, weightString = weightString, xSection = xSection,
               isData = isData, color = color, texName = texName)