def __loadChain(self):
    ''' Build the TChain of this sample from its input files. Private.

    A new ROOT.TChain named self.treeName is stored in self._chain and every
    entry of self.files is added to it. Unless self.skipCheck is set, a file
    is only added if helpers.checkRootFile (called with
    checkForObjects=[self.treeName]) accepts it. Finally the chains of all
    friend samples are attached.

    Raises helpers.EmptySampleError if the sample has no input files or if
    not a single file could be added.
    '''
    if not len(self.files):
        message = "Sample {name} has no input files! Can not load.".format(
            name=self.name)
        raise helpers.EmptySampleError(message)

    self._chain = ROOT.TChain(self.treeName)
    nAdded = 0
    for fileName in self.files:
        logger.debug("Now adding file %s to sample '%s'", fileName, self.name)
        try:
            usable = self.skipCheck or helpers.checkRootFile(
                fileName, checkForObjects=[self.treeName])
            if usable:
                self._chain.Add(fileName)
                nAdded += 1
            else:
                logger.error(
                    "Check of root file failed. Skipping. File: %s", fileName)
        except IOError:
            # A file that cannot be read is reported and skipped; it does not
            # abort loading the remaining files.
            logger.error("Could not load file %s", fileName)

    if nAdded == 0:
        raise helpers.EmptySampleError("No root files for sample %s." % self.name)
    logger.debug("Loaded %i files for sample '%s'.", nAdded, self.name)

    # Add friends. Cached samples may have no 'friends' attribute at all.
    for friendSample, friendTreeName in getattr(self, 'friends', ()):
        self.chain.AddFriend(friendSample.chain, friendTreeName)
def reduceFiles( self, factor = 1, to = None ):
    ''' Shrink the list of files of the sample and rescale its normalization.

    factor: if different from 1, keep every factor-th file, starting with the first one.
    to:     only considered when factor is 1. Keep the first 'to' files; nothing
            happens if the sample does not have more than 'to' files.

    The fraction of files that is kept is multiplied into self.reduce_files_factor
    (created on the first reduction) and into self.normalization (if it is not None).

    Raises helpers.EmptySampleError if the factor based reduction leaves no file.
    '''
    nFilesBefore = len(self.files)
    normBefore   = self.normalization

    if factor != 1:
        self.files = self.files[0::factor]
        if not len(self.files):
            raise helpers.EmptySampleError( "No ROOT files for sample %s after reducing by factor %f" % (self.name, factor) )
    else:
        # Without a factor only 'to' can ask for a reduction.
        if to is None or to >= len(self.files):
            return
        self.files = self.files[:to]

    # Keeping track of reduceFile factors
    keptFraction = len(self.files)/float(nFilesBefore)
    if not hasattr(self, "reduce_files_factor"):
        self.reduce_files_factor = keptFraction
    else:
        self.reduce_files_factor *= keptFraction

    if self.normalization is not None:
        self.normalization = keptFraction*self.normalization

    logger.info("Sample %s: Reduced number of files from %i to %i. Old normalization: %r. New normalization: %r. factor: %3.3f", self.name, nFilesBefore, len(self.files), normBefore, self.normalization, keptFraction)
    return
def fromDirectory(cls, name, directory, treeName = "Events", normalization = None,
        selectionString = None, weightString = None, isData = False, color = 0, texName = None, maxN = None):
    '''Load sample from directory or list of directories. If the name is "", enumerate the sample

    name:       name of the sample; a falsy name is replaced by the result of new_name()
    directory:  a single directory path or an iterable of directory paths
    treeName:   name of the tree; a falsy value falls back to "Events"
    maxN:       keep at most maxN files; None or a non-positive value keeps all files
    normalization, selectionString, weightString, isData, color, texName:
                handed to the constructor unchanged

    Returns cls(...) built from all files ending in '.root' that are found
    directly inside the given directories.
    Raises helpers.EmptySampleError if a directory contains no such file.
    '''
    # Work with directories and list of directories.
    # isinstance also recognises str subclasses and (Python 2) unicode paths;
    # a type() equality test would iterate over the characters of such a path.
    if isinstance(directory, (str, type(u""))):
        directories = [directory]
    else:
        directories = directory

    # If no name, enumerate them.
    if not name:
        name = new_name()

    # find all files
    files = []
    for d in directories:
        fileNames = [ os.path.join(d, f) for f in os.listdir(d) if f.endswith('.root') ]
        if not fileNames:
            raise helpers.EmptySampleError( "No root files found in directory %s." %d )
        files.extend( fileNames )

    if not treeName:
        treeName = "Events"
        logger.debug("Argument 'treeName' not provided, using 'Events'.")

    # restrict files
    if maxN is not None and maxN > 0:
        files = files[:maxN]

    sample = cls(name = name, treeName = treeName, files = files, normalization = normalization,
        selectionString = selectionString, weightString = weightString,
        isData = isData, color=color, texName = texName)
    logger.debug("Loaded sample %s from %i files.", name, len(files))
    return sample
def fromDirectory(cls, name, directory, color=0, texName=None, maxN=None):
    '''Load sample from directory or list of directories. If the name is "", enumerate the sample

    name:       name of the sample; a falsy name is replaced by the result of newName()
    directory:  a single directory path or an iterable of directory paths
    color:      handed to the constructor unchanged
    texName:    handed to the constructor unchanged
    maxN:       keep at most maxN files; None or a non-positive value keeps all files

    Returns cls(name, files, color, texName) built from all files ending in
    '.root' that are found directly inside the given directories.
    Raises helpers.EmptySampleError if a directory contains no such file.
    '''
    # Work with directories and list of directories.
    # isinstance also recognises str subclasses and (Python 2) unicode paths;
    # a type() equality test would iterate over the characters of such a path.
    if isinstance(directory, (str, type(u""))):
        directories = [directory]
    else:
        directories = directory

    # If no name, enumerate them.
    if not name:
        name = newName()

    # find all files
    files = []
    for d in directories:
        fileNames = [
            os.path.join(d, f) for f in os.listdir(d) if f.endswith('.root')
        ]
        if not fileNames:
            raise helpers.EmptySampleError(
                "No root files found in directory %s." % d)
        files.extend(fileNames)

    # restrict files
    if maxN is not None and maxN > 0:
        files = files[:maxN]

    return cls(name=name, files=files, color=color, texName=texName)
def fromDPMDirectory(cls, name, directory, redirector='root://hephyse.oeaw.ac.at/', treeName = "Events", normalization = None, xSection = -1,
        selectionString = None, weightString = None, isData = False, color = 0, texName = None, maxN = None, noCheckProxy=False):
    '''Load sample from a DPM directory or list of DPM directories.

    The directories are listed with 'xrdfs <redirector> ls <directory>'; every
    listed entry ending in '.root' becomes an input file, prefixed with the
    redirector.

    name:          name of the sample; a falsy name is replaced by the result of new_name()
    directory:     a single path or an iterable of paths, each starting with '/dpm'
    redirector:    xrootd redirector used for the listing and as file prefix
    maxN:          keep at most maxN files in total; None or a non-positive value keeps all
    noCheckProxy:  if True, renew_proxy is not called and the proxy path is only logged
    treeName, normalization, xSection, selectionString, weightString, isData,
    color, texName: handed to the constructor unchanged

    Raises ValueError if a directory does not start with '/dpm'.
    Raises helpers.EmptySampleError if a directory can not be listed or
    contains no '.root' file.
    '''
    # Work with directories and list of directories.
    # isinstance also recognises str subclasses and (Python 2) unicode paths.
    if isinstance(directory, (str, type(u""))):
        directories = [directory]
    else:
        directories = directory
    if not all(d.startswith("/dpm") for d in directories):
        raise ValueError("DPM directories do not start with /dpm/")

    # If no name, enumerate them.
    if not name:
        name = new_name()

    # Renew proxy
    from RootTools.core.helpers import renew_proxy
    proxy_path = os.path.expandvars('$HOME/private/.proxy')
    if not noCheckProxy:
        proxy = renew_proxy(proxy_path)
    else:
        proxy = proxy_path
        logger.info(
            "Not checking your proxy. Asuming you know it's still valid.")
    logger.info("Using proxy %s", proxy)

    maxAttempts = 10
    files = []
    for d in directories:
        cmd = ["xrdfs", redirector, "ls", d]

        # The listing is retried a few times. Only failures of the command
        # itself are caught and each one is logged, so the reason is visible.
        listing = None
        for attempt in range(1, maxAttempts + 1):
            try:
                # universal_newlines: get text instead of bytes on Python 3
                listing = subprocess.check_output(cmd, universal_newlines=True)
                break
            except (subprocess.CalledProcessError, OSError) as err:
                logger.warning("Attempt %i/%i of '%s' failed: %s",
                               attempt, maxAttempts, " ".join(cmd), err)
        if listing is None:
            raise helpers.EmptySampleError(
                "Could not list directory %s using redirector %s." % (d, redirector))

        rootFiles = [entry for entry in listing.splitlines() if entry.endswith(".root")]
        if not rootFiles:
            raise helpers.EmptySampleError(
                "No root files found in directory %s." % d)
        files.extend(redirector + os.path.join(d, entry) for entry in rootFiles)

    # Restrict the total number of files. Doing this once after all
    # directories are read keeps the limit exact for several directories too.
    if maxN is not None and maxN > 0:
        files = files[:maxN]

    sample = cls(name = name, treeName = treeName, files = files, normalization = normalization, xSection = xSection,
        selectionString = selectionString, weightString = weightString,
        isData = isData, color=color, texName = texName)
    logger.debug("Loaded sample %s from %i files.", name, len(files))
    return sample
def fromDirectory(cls, name, directory, redirector = None, treeName = "Events", normalization = None, xSection = -1,
        selectionString = None, weightString = None, isData = False, color = 0, texName = None, maxN = None, skipCheck = False):
    '''Load sample from directory or list of directories. If the name is "", enumerate the sample

    name:       name of the sample; a falsy name is replaced by the result of new_name()
    directory:  a single directory path or an iterable of directory paths
    redirector: None reads the directories with os.listdir. Otherwise they are
                listed with 'xrdfs <redirector> ls <directory>' and every file
                name is prefixed with '<redirector>/'
    treeName:   name of the tree; a falsy value falls back to "Events"
    maxN:       keep at most maxN files; None or a non-positive value keeps all files
    normalization, xSection, selectionString, weightString, isData, color,
    texName, skipCheck: handed to the constructor unchanged

    Raises helpers.EmptySampleError if no file ending in '.root' is found for a directory.
    '''
    # Work with directories and list of directories.
    # isinstance also recognises str subclasses and (Python 2) unicode paths;
    # a type() equality test would iterate over the characters of such a path.
    if isinstance(directory, (str, type(u""))):
        directories = [directory]
    else:
        directories = directory

    # If no name, enumerate them.
    if not name:
        name = new_name()

    # find all files
    files = []
    for d in directories:
        if redirector is None:
            fileNames = [
                os.path.join(d, f) for f in os.listdir(d)
                if f.endswith('.root')
            ]
            logger.debug("Found %i files in directory %s", len(fileNames), d)
        else:
            try:
                # Argument list instead of a shell string: nothing in
                # 'redirector' or 'd' is interpreted by a shell.
                p = subprocess.Popen(["xrdfs", redirector, "ls", d],
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.STDOUT,
                                     universal_newlines=True)
                # communicate() reads all output, closes the pipe and waits
                # for the child process.
                output = p.communicate()[0]
            except OSError as err:
                # e.g. xrdfs is not installed; handled below as "no files found"
                logger.error("Could not run xrdfs for directory %s: %s", d, err)
                output = ""
            fileNames = [
                redirector + '/' + line for line in output.splitlines()
                if line.endswith('.root')
            ]
            logger.debug("Found %i files in directory (xrootd) %s",
                         len(fileNames), d)
        if not fileNames:
            raise helpers.EmptySampleError(
                "No root files found in directory %s." % d)
        files.extend(fileNames)

    if not treeName:
        treeName = "Events"
        logger.debug("Argument 'treeName' not provided, using 'Events'.")

    # restrict files
    if maxN is not None and maxN > 0:
        files = files[:maxN]

    sample = cls(name = name, treeName = treeName, files = files, normalization = normalization, xSection = xSection,
        selectionString = selectionString, weightString = weightString,
        isData = isData, color=color, texName = texName, skipCheck = skipCheck)
    logger.debug("Loaded sample %s from %i files.", name, len(files))
    return sample
def __init__(self, name, treeName , files = [], normalization = None, selectionString = None, weightString = None, xSection = -1, isData = False, color = 0, texName = None):
    ''' Sample backed by a TChain over root files with flat trees.

    'name':            name of the sample
    'treeName':        name of the TTree in the input files
    'files':           list of input files, must not be empty
    'normalization':   stored for later weight calculations, e.g. the total number of
                       events before all cuts or the sum of NLO gen weights
    'selectionString': sample specific selection, passed to setSelectionString
                       (can be list of strings)
    'weightString':    sample specific weight, passed to setWeightString
                       (can be list of strings)
    'xSection':        cross section of the sample
    'isData':          whether the sample is real data or not (simulation)
    'color':           ROOT color to be used in plot scripts
    'texName':         ROOT TeX string to be used in legends etc., defaults to 'name'

    Raises helpers.EmptySampleError if 'files' is empty.
    '''
    self.name, self.treeName = name, treeName
    self.files = files
    self.xSection = xSection

    if len(self.files) < 1:
        raise helpers.EmptySampleError( "No ROOT files for sample %s! Files: %s" % (self.name, self.files) )

    self.normalization = normalization
    # No TChain is built in the constructor.
    self._chain = None

    self.isData = isData
    self.color = color
    if texName is None:
        self.texName = name
    else:
        self.texName = texName

    # Other samples. Add friend elements (friend, treeName)
    self.friends = []

    # The lists have to exist before the setters are called.
    self.__selectionStrings = []
    self.setSelectionString( selectionString )
    self.__weightStrings = []
    self.setWeightString( weightString )

    logger.debug("Created new sample %s with %i files, treeName %s, selectionStrings %r and weightStrings %r.",
        name, len(self.files), treeName, self.__selectionStrings, self.__weightStrings)
def __init__(self, name, files = [], color = 0, texName = None):
    ''' Constructor shared by the sample classes.

    'name':    name of the sample
    'files':   list of input files, must not be empty
    'color':   ROOT color to be used in plot scripts
    'texName': ROOT TeX string to be used in legends etc., defaults to 'name'

    Raises helpers.EmptySampleError if 'files' is empty.
    '''
    self.name = name
    self.files = files

    if len(self.files) < 1:
        raise helpers.EmptySampleError( "No ROOT files for sample %s! Files: %s" % (self.name, self.files) )

    self.color = color
    if texName is None:
        self.texName = name
    else:
        self.texName = texName

    logger.debug("Created new sample %s with %i files.", name, len(self.files))
def __init__(self, name, files=[], color=0, texName=None):
    ''' Constructor of the FWLite sample.

    'name':    name of the sample
    'files':   list of input files, must not be empty
    'color':   ROOT color to be used in plot scripts
    'texName': ROOT TeX string to be used in legends etc.

    The parent constructor is called with normalization, xSection and isData
    set to None.
    Raises helpers.EmptySampleError if the sample has no files afterwards.
    '''
    parentArguments = dict(
        name=name,
        files=files,
        normalization=None,
        xSection=None,
        isData=None,
        color=color,
        texName=texName,
    )
    super(FWLiteSample, self).__init__(**parentArguments)

    if len(self.files) < 1:
        raise helpers.EmptySampleError(
            "No ROOT files for sample %s! Files: %s" % (self.name, self.files))

    logger.debug("Created new sample %s with %i files.", name, len(self.files))
def fromCMGCrabDirectory(cls, name, baseDirectory, treeFilename = 'tree.root', treeName = 'tree', maxN = None,
        selectionString = None, weightString = None, isData = False, color = 0, texName = None):
    '''Load a CMG crab output directory

    baseDirectory is searched recursively for files named '..._n.root' and
    '..._n.tgz', where n is the job number. A job is used if both files exist,
    read_cmg_normalization finds a non-zero normalization in a tar member whose
    name contains 'SkimReport.txt', and helpers.checkRootFile finds the tree
    'treeName' in the root file. The normalizations of all used jobs are summed.

    name:          name of the sample
    baseDirectory: directory that is searched recursively
    treeFilename:  not used by this method
    treeName:      name of the tree that must be present in every root file
    maxN:          look at the maxN lowest job numbers only; None or a
                   non-positive value looks at all jobs
    selectionString, weightString, isData, color, texName:
                   handed to the constructor unchanged

    Raises helpers.EmptySampleError if no job can be used.
    '''
    import tarfile
    from cmg_helpers import read_cmg_normalization

    maxN = maxN if maxN is not None and maxN>0 else None

    # Walk through all subdirectories and pick up pairs of files '..._n.root and ..._n.tgz where n is the job number'
    treeFiles = {}
    zipFiles = {}
    for root, subFolders, filenames in os.walk( baseDirectory ):
        for filename in filenames:
            base, ext = os.path.splitext( filename )
            try:
                n = int(base.split('_')[-1])
            except ValueError:
                # filename is not of the form 'xyz_n' where n is the job number
                continue
            # add the tgz and files to the dict.
            filename_ = os.path.join(root, filename)
            if ext=='.root':
                treeFiles[n] = filename_
            if ext=='.tgz':
                zipFiles[n] = filename_

    # Find pairs of zip and root files
    pairs  = set(zipFiles.keys()) & set(treeFiles.keys())
    n_jobs = len( set(zipFiles.keys()) | set(treeFiles.keys()) )

    normalization = 0
    files = []
    failedJobs = []
    # Sorting makes the selection reproducible; the slice applies maxN (None keeps all).
    for n in sorted(pairs)[:maxN]:
        sumW = None
        tf = tarfile.open( zipFiles[n], 'r:gz' )
        try:
            for member in tf.getmembers():
                if "SkimReport.txt" in member.name:
                    sumW = read_cmg_normalization(tf.extractfile(member))
                    if sumW is not None:
                        break
        finally:
            # close the archive even if reading the report fails
            tf.close()
        if sumW is None:
            logger.warning( "No normalization found when reading tar file %s", zipFiles[n] )

        # Check treefile for whether the tree 'treeName' can be found.
        # This is an implicit check for broken, recovered or otherwise corrupted root files.
        treeFile = treeFiles[n] if helpers.checkRootFile(treeFiles[n], checkForObjects = [treeName] ) else None
        if treeFile is None:
            logger.warning( "File %s looks broken. Checked for presence of tree %s.", treeFiles[n] , treeName )

        # If both, normalization and treefile are OK call it successful.
        if sumW and treeFile:
            files.append( treeFile )
            normalization += sumW
            logger.debug( "Successfully read job %i and incremented normalization by %7.2f", n, sumW )
        else:
            failedJobs.append( n )

    # Don't allow empty samples
    if len(files) == 0:
        raise helpers.EmptySampleError("Could not find valid crab CMG output for sample {0}. Total number of jobs: {1}. baseDirectory: {2}"\
              .format(name, n_jobs, baseDirectory))

    # Log statements
    eff = 100*len(failedJobs)/float( n_jobs )
    logger.debug("Loaded CMGOutput sample %s. Total number of jobs: %i, both tgz and root: %i. Normalization: %7.2f Bad: %i. Inefficiency: %3.3f",
                 name, n_jobs, len(pairs), normalization, len(failedJobs), eff)

    logger.debug( "Read %i chunks and total normalization of %f", len(files), normalization )
    return cls( name = name, treeName = treeName, files = files, normalization = normalization,
        selectionString = selectionString, weightString = weightString,
        isData = isData, color = color, texName = texName )
def fromCMGOutput(cls, name, baseDirectory, treeFilename = 'tree.root', chunkString = None, treeName = 'tree', maxN = None,
        selectionString = None, weightString = None, isData = False, color = 0, texName = None):
    '''Load a CMG output directory from e.g. unzipped crab output in the 'Chunks' directory structure.
       Expects the presence of the tree root file and the SkimReport.txt

    Every selected subdirectory of baseDirectory is a chunk. A chunk is used
    if read_cmg_normalization returns a non-zero normalization for its
    'SkimReport.txt' and helpers.checkRootFile finds the tree 'treeName' in
    its tree file. The normalizations of all used chunks are summed.

    name:          name of the sample
    baseDirectory: directory containing the chunk directories
    treeFilename:  file name of the tree file inside a chunk
    chunkString:   if set, only subdirectories are used whose name equals
                   chunkString, or starts with it and ends with '_Chunk'
    treeName:      name of the tree that must be present in the tree file
    maxN:          look at no more than maxN chunk directories; None or a
                   non-positive value looks at all of them
    selectionString, weightString, isData, color, texName:
                   handed to the constructor unchanged

    Raises helpers.EmptySampleError if no chunk can be used.
    '''
    from cmg_helpers import read_cmg_normalization

    maxN = maxN if maxN is not None and maxN>0 else None

    # Reading all subdirectories in base directory. If chunkString != None, require cmg output name formatting
    chunkDirectories = []

    # FIXME: Better to loop only over subdirectories in base directory?
    for x in os.listdir(baseDirectory):
        if os.path.isdir(os.path.join(baseDirectory, x)):
            if not chunkString or (x.startswith(chunkString) and x.endswith('_Chunk')) or x==chunkString:
                chunkDirectories.append(os.path.join(baseDirectory, x))
                if len(chunkDirectories)==maxN:break

    logger.debug( "Found %i chunk directories with chunkString %s in base directory %s",
                  len(chunkDirectories), chunkString, baseDirectory )

    normalization = 0
    files = []
    failedChunks=[]
    goodChunks  =[]

    for chunkDirectory in chunkDirectories:
        logger.debug("Reading chunk %s", chunkDirectory)

        # Find normalization
        sumW = None
        for root, subFolders, filenames in os.walk( chunkDirectory ):
            # Determine normalization constant
            if 'SkimReport.txt' in filenames:
                skimReportFilename = os.path.join(root, 'SkimReport.txt')
                with open(skimReportFilename, 'r') as fin:
                    sumW = read_cmg_normalization(fin)
                if not sumW:
                    logger.warning( "Read chunk %s and found report '%s' but could not read normalization.",
                                    chunkDirectory, skimReportFilename )

        # Find treefile
        treeFile = None
        for root, subFolders, filenames in os.walk( chunkDirectory ):
            # Load tree file
            if treeFilename in filenames:
                treeFile = os.path.join(root, treeFilename)
                # Checking whether root file is OK and contains a tree
                if not helpers.checkRootFile(treeFile, checkForObjects=[treeName] ):
                    logger.warning( "Read chunk %s and found tree file '%s' but file looks broken.", chunkDirectory, treeFile )
                    # A broken file must not end up in the sample.
                    treeFile = None

        # If both, normalization and treefile are OK call it successful.
        if sumW and treeFile:
            files.append( treeFile )
            normalization += sumW
            logger.debug( "Successfully read chunk %s and incremented normalization by %7.2f", chunkDirectory, sumW )
            goodChunks.append( chunkDirectory )
        else:
            failedChunks.append( chunkDirectory )

    # Don't allow empty samples
    if len(goodChunks) == 0:
        raise helpers.EmptySampleError("Could not find good CMGOutput chunks for sample {0}. Total number of chunks: {1}. baseDirectory: {2}"\
              .format(name, len(chunkDirectories), baseDirectory))

    # Log statements
    eff = 100*len(failedChunks)/float( len(chunkDirectories) )
    logger.debug("Loaded CMGOutput sample %s. Total number of chunks : %i. Normalization: %7.2f Bad: %i. Inefficiency: %3.3f",
                 name, len(chunkDirectories), normalization, len(failedChunks), eff)

    for chunk in failedChunks:
        logger.debug( "Failed to load chunk %s", chunk)
    logger.debug( "Read %i chunks and total normalization of %f", len(files), normalization )
    return cls( name = name, treeName = treeName, files = files, normalization = normalization,
        selectionString = selectionString, weightString = weightString,
        isData = isData, color = color, texName = texName )