def test_TreeCache1(self):
    """Write a skimmed tree to the cache, read it back, and verify it.

    Caches the sample tree with a cut applied, then re-opens the cache
    and checks that the number of entries in the cached tree equals the
    number of events passing the cut in the original tree.
    """
    # the tree to be skimmed
    inputTree = self.getTree()

    # writer: creates the skimmed (cached) tree
    writerCache = TreeCache.TreeCache(
        sample=self.sampleName,
        cutList=[self.someCut],
        inputFolder=self.scratchDirectory,
        tmpFolder=self.tmpDir,
        outputFolder=self.cacheDir,
        branches=['a', 'b', 'c'],
        debug=False)
    writerCache.setSampleTree(inputTree).cache()
    inputTree.process()

    # reader: same sample/cut, should find the cache written above
    readerCache = TreeCache.TreeCache(
        sample=self.sampleName,
        cutList=[self.someCut],
        inputFolder=self.scratchDirectory,
        tmpFolder=self.tmpDir,
        outputFolder=self.cacheDir,
        debug=False)
    self.assertTrue(readerCache.isCached())
    self.assertTrue(readerCache.isCachedAndValid())

    # entry count of the cached tree must match the cut applied to the original
    cachedTree = readerCache.getTree()
    expectedEvents = inputTree.tree.Draw("a", self.someCut, "goff")
    self.assertEqual(cachedTree.tree.GetEntries(), expectedEvents)
def prepare(self):
    """Collect cached skimmed-tree file names for all DATA and MC samples.

    For each sample the cut list is built from the sample subcut, the
    region cut (section 'Cuts'), an optional 'Datacut' and an optional
    blinding cut; the corresponding TreeCache entry is then looked up.

    Returns:
        self (fluent interface)

    Raises:
        Exception("NotCached"): if any sample has not been cached yet
            (run cacheplot first).
    """
    # add DATA + MC samples
    self.fileNames = []
    for sample in self.dataSamples + self.mcSamples:
        print(sample.identifier)
        # cuts: subcut + region cut + optional data cut + optional blinding cut
        sampleCuts = [sample.subcut]
        if self.config.has_option('Cuts', self.region):
            sampleCuts.append(self.config.get('Cuts', self.region))
        if self.config.has_option(self.configSection, 'Datacut'):
            sampleCuts.append(self.config.get(self.configSection, 'Datacut'))
        if self.addBlindingCut:
            sampleCuts.append(self.addBlindingCut)
        # get sample tree from cache
        # FIX: was `config=config` (undefined name) -> use self.config,
        # consistent with the other TreeCache call sites in this file
        tc = TreeCache.TreeCache(sample=sample,
                                 cutList=sampleCuts,
                                 inputFolder=self.samplesPath,
                                 config=self.config)
        if tc.isCached():
            self.fileNames += tc.findCachedFileNames()
        else:
            print("ERROR: not cached, run cacheplot again")
            raise Exception("NotCached")
    if len(self.fileNames) < 1:
        print("\x1b[31mERROR: no files found, run cacheplot!\x1b[0m")
    return self
def prepare(self):
    """Build the histogram stacks for all variables of this plot region.

    Creates one StackMaker per variable, then loops over all DATA and MC
    samples, fetches each sample's cached tree and adds it to every stack.

    Returns:
        self (fluent interface)

    Raises:
        Exception("CachedTreeMissing"): if a sample tree is not cached.
    """
    # FIX: was `region=region` (undefined name) -> use self.region
    print("INFO: starting plot for region \x1b[34m{region}\x1b[0m, variables:"
          .format(region=self.region))
    for var in self.vars:
        print(" > {var}".format(var=var))

    # one stack per variable
    self.histogramStacks = {}
    for var in self.vars:
        self.histogramStacks[var] = StackMaker(self.config,
                                               var,
                                               self.region,
                                               self.signalRegion,
                                               None,
                                               '_' + self.subcutPlotName,
                                               title=self.title)

    fileLocator = FileLocator(config=self.config,
                              useDirectoryListingCache=True)

    # add DATA + MC samples
    for sample in self.dataSamples + self.mcSamples:
        # cuts: subcut + region cut + optional data cut + optional blinding cut
        sampleCuts = [sample.subcut]
        if self.config.has_option('Cuts', self.region):
            sampleCuts.append(self.config.get('Cuts', self.region))
        if self.config.has_option(self.configSection, 'Datacut'):
            sampleCuts.append(self.config.get(self.configSection, 'Datacut'))
        if self.addBlindingCut:
            sampleCuts.append(self.addBlindingCut)

        # get sample tree from cache
        # FIX: was `config=config` (undefined name) -> use self.config
        tc = TreeCache.TreeCache(sample=sample,
                                 cutList=sampleCuts,
                                 inputFolder=self.samplesPath,
                                 config=self.config,
                                 fileLocator=fileLocator)
        sampleTree = tc.getTree()
        if sampleTree:
            groupName = self.getSampleGroup(sample)
            print(" > found the tree, #entries = ",
                  sampleTree.tree.GetEntries())
            print(" > group =", groupName)
            print(" > now adding the tree for vars=", self.vars)
            # add the sample tree for all the variables
            for var in self.vars:
                self.histogramStacks[var].addSampleTree(
                    sample=sample,
                    sampleTree=sampleTree,
                    groupName=groupName,
                    cut=self.subcut if self.subcut else '1')
        else:
            print("\x1b[31mERROR: sampleTree not available for ", sample,
                  ", run caching again!!\x1b[0m")
            raise Exception("CachedTreeMissing")
    return self
def test_NotCached(self):
    """A sample that was never cached must be reported as not cached."""
    missingCache = TreeCache.TreeCache(
        sample='ThisSampleDoesNotExistInCache',
        cutList=[self.someCut],
        inputFolder=self.scratchDirectory,
        tmpFolder=self.tmpDir,
        outputFolder=self.cacheDir,
        debug=False)
    self.assertFalse(missingCache.isCached())
def run(self):
    """Cache the plotting trees for one sample identifier.

    Determines the list of branches needed for plotting from the config,
    then creates (or verifies) one TreeCache entry per (subsample, plot
    region) and runs the skimming pass over the input tree if anything
    needs to be (re)cached.

    Raises:
        Exception("CreationOfSampleTreeFailed"): input tree could not be opened.
        Exception("SampleFilesHaveChanged"): file list changed since submission.
    """
    # keep additional branches for plotting
    try:
        keepBranchesPlot = eval(
            self.config.get('Branches', 'keep_branches_plot'))
    except:
        keepBranchesPlot = []
    try:
        keepBranchesPlot += eval(
            self.config.get('Branches', 'keep_branches'))
    except:
        pass
    # also keep some branches which might be used later in variables definition and weights
    try:
        for section in self.config.sections():
            try:
                if section.startswith('plotDef:') and self.config.has_option(
                        section, 'relPath'):
                    keepBranchesPlot.append(
                        self.config.get(section, 'relPath'))
            except Exception as e:
                print("\x1b[31mWARNING: config error in:", section, "=>", e,
                      "\x1b[0m")
    except Exception as e2:
        print(
            "\x1b[31mERROR: config file contains an error! automatic selection of branches to keep will not work!\x1b[0m"
        )
        print(e2)
    try:
        keepBranchesPlot.append(self.config.get('Weights', 'weightF'))
    except:
        pass
    # plotting region cut
    for region, regionInfo in self.regionsDict.iteritems():
        keepBranchesPlot.append(regionInfo['cut'])
    keepBranchesPlotFinal = BranchList(
        keepBranchesPlot).getListOfBranches()
    print("KEEP:", keepBranchesPlotFinal)

    # ----------------------------------------------------------------------------------------------------------------------
    # cache samples
    # ----------------------------------------------------------------------------------------------------------------------
    for sampleToCache in [self.sampleIdentifier]:
        print('*' * 80)
        print(' ', sampleToCache)
        print('*' * 80)
        # prepare caches for training and evaluation samples
        treeCaches = []
        # FIX: was a dead local `sampleTree = None`; the rest of this method
        # (and the sibling run() for MVA caching) uses self.sampleTree, so
        # initialize the attribute to avoid a possible AttributeError below
        self.sampleTree = None

        # for all (sub)samples which come from the same files (sampleIdentifier)
        subsamples = [
            x for x in self.samples if x.identifier == sampleToCache
        ]
        for sample in subsamples:
            # add cuts for all plotting regions
            for region, regionInfo in self.regionsDict.iteritems():
                configSection = 'Plot:%s' % region

                # cuts: subcut + region cut + optional data cut + optional blinding cut
                sampleCuts = [sample.subcut]
                if regionInfo['cut']:
                    sampleCuts.append(regionInfo['cut'])
                if self.config.has_option(configSection, 'Datacut'):
                    sampleCuts.append(
                        self.config.get(configSection, 'Datacut'))
                if self.config.has_option('Plot_general', 'addBlindingCut'):
                    # FIX: was config.has_option(...), which appended the
                    # boolean True instead of the actual blinding cut string
                    sampleCuts.append(
                        self.config.get('Plot_general', 'addBlindingCut'))

                # arbitrary (optional) name for the output tree, used for print-out
                # (the TreeCache object has no idea what it is doing, e.g. dc, plot etc.)
                cacheName = 'plot:{region}_{sample}'.format(
                    region=region, sample=sample.name)

                # add cache object
                tc = TreeCache.TreeCache(
                    name=cacheName,
                    sample=sample.name,
                    cutList=sampleCuts,
                    inputFolder=self.samplesPath,
                    splitFilesChunks=self.splitFilesChunks,
                    chunkNumber=self.chunkNumber,
                    splitFilesChunkSize=self.splitFilesChunkSize,
                    fileList=self.fileList,
                    branches=keepBranchesPlotFinal,
                    config=self.config,
                    debug=True)

                # check if this part of the sample is already cached
                isCached = tc.partIsCached()
                if not isCached or self.forceRedo:
                    if isCached:
                        tc.deleteCachedFiles(chunkNumber=self.chunkNumber)

                    # for the first sample which comes from these files, load the tree
                    if not self.sampleTree:
                        self.sampleTree = SampleTree(
                            {
                                'name': sample.identifier,
                                'folder': self.samplesPath
                            },
                            splitFilesChunkSize=self.splitFilesChunkSize,
                            chunkNumber=self.chunkNumber,
                            config=self.config,
                            saveMemory=True)
                        if not self.sampleTree or not self.sampleTree.tree:
                            print(
                                "\x1b[31mERROR: creation of sample tree failed!!\x1b[0m"
                            )
                            raise Exception("CreationOfSampleTreeFailed")
                        # consistency check on the file list at submission time and now
                        fileListNow = self.sampleTree.getSampleFileNameChunk(
                            self.chunkNumber)
                        if self.fileList and (sorted(self.fileList) !=
                                              sorted(fileListNow)):
                            print(
                                "\x1b[31mERROR: sample files have changed between submission and run of the job!\x1b[0m"
                            )
                            raise Exception("SampleFilesHaveChanged")

                    treeCaches.append(
                        tc.setSampleTree(self.sampleTree).cache())
                else:
                    print("INFO: already cached!", tc, "(", tc.hash, ")")

        if len(treeCaches) > 0:
            # run on the tree
            self.sampleTree.process()
        else:
            print("nothing to do!")
def prepare(self):
    """Initialize the TMVA factory and register signal/background trees.

    Opens the training output ROOT file, creates the TMVA factory, and
    adds one tree per (sample, train/eval cut) as signal or background.
    On newer TMVA versions where the factory lacks AddSignalTree /
    AddBackgroundTree, a DataLoader is created instead and used for both
    tree and variable registration.

    Returns:
        self (fluent interface)

    Raises:
        Exception("CachedTreeMissing"): if a sample tree is not cached.
    """
    self.trainingOutputFile = ROOT.TFile.Open(self.trainingOutputFileName,
                                              "RECREATE")
    # ----------------------------------------------------------------------------------------------------------------------
    # create TMVA factory
    # ----------------------------------------------------------------------------------------------------------------------
    self.factory = ROOT.TMVA.Factory(self.factoryname,
                                     self.trainingOutputFile,
                                     self.factorysettings)
    if self.trainingOutputFile and self.factory:
        print("INFO: initialized MvaTrainingHelper.", self.factory)
    else:
        print(
            "\x1b[31mERROR: initialization of MvaTrainingHelper failed!\x1b[0m"
        )
    # ----------------------------------------------------------------------------------------------------------------------
    # add sig/bkg x training/eval trees
    # ----------------------------------------------------------------------------------------------------------------------
    # old TMVA: trees are added on the factory; new TMVA: on a DataLoader
    try:
        addBackgroundTreeMethod = self.factory.AddBackgroundTree
        addSignalTreeMethod = self.factory.AddSignalTree
        self.dataLoader = None
    except:
        print("oh no..")
        # the DataLoader wants to be called '.'
        self.dataLoader = ROOT.TMVA.DataLoader(".")
        addBackgroundTreeMethod = self.dataLoader.AddBackgroundTree
        addSignalTreeMethod = self.dataLoader.AddSignalTree

    # DEBUG: restrict memory
    # resource.setrlimit(resource.RLIMIT_AS, (4.0*1024*1024*1024, 5.0*1024*1024*1024))
    self.sampleTrees = []
    for addTreeFcn, samples in [[
            addBackgroundTreeMethod, self.samples['BKG']
    ], [addSignalTreeMethod, self.samples['SIG']]]:
        for sample in samples:
            print('*' * 80, '\n%s\n' % sample, '*' * 80)
            for additionalCut in [self.TrainCut, self.EvalCut]:
                # cuts: subcut + train/eval split cut + mva region cut
                sampleCuts = [sample.subcut]
                if additionalCut:
                    sampleCuts.append(additionalCut)
                # cut from the mva region
                if self.treeCut:
                    sampleCuts.append(self.treeCut)
                tc = TreeCache.TreeCache(sample=sample,
                                         cutList=sampleCuts,
                                         inputFolder=self.samplesPath,
                                         config=self.config,
                                         debug=True)
                sampleTree = tc.getTree()
                if sampleTree:
                    # FIX: SetCacheSize and the keep-alive append used to run
                    # BEFORE this None-check, so a missing cached tree crashed
                    # with AttributeError instead of reaching the error branch
                    sampleTree.tree.SetCacheSize(32 * 1024)
                    # prevent garbage collection
                    self.sampleTrees.append(sampleTree)
                    treeScale = sampleTree.getScale(
                        sample) * self.globalRescale
                    # only non-empty trees can be added
                    if sampleTree.tree.GetEntries() > 0:
                        addTreeFcn(
                            sampleTree.tree, treeScale,
                            ROOT.TMVA.Types.kTraining if additionalCut ==
                            self.TrainCut else ROOT.TMVA.Types.kTesting)
                        print('max mem used = %d' % (resource.getrusage(
                            resource.RUSAGE_SELF).ru_maxrss))
                else:
                    print("\x1b[31mERROR: TREE NOT FOUND:", sample.name,
                          " -> not cached??\x1b[0m")
                    raise Exception("CachedTreeMissing")

    # register the MVA input variables on whichever object owns the trees
    if self.dataLoader:
        for var in self.MVA_Vars['Nominal']:
            self.dataLoader.AddVariable(var, 'D')
    else:
        for var in self.MVA_Vars['Nominal']:
            self.factory.AddVariable(var, 'D')
    return self
def prepare(self):
    """Set up TreeCache objects for all datacard regions and subsamples.

    For every (datacard maker, subsample) pair, combines the sample subcut
    with the OR of all systematics cache-cuts (so one cache file serves all
    systematics), selects the branches to keep, and schedules the caching.

    Returns:
        self (fluent interface)

    Raises:
        Exception("CreationOfSampleTreeFailed"): input tree could not be opened.
        Exception("SampleFilesHaveChanged"): file list changed since submission.
    """
    if len(self.dcMakers) > 0:
        self.treeCaches = []
        self.sampleTree = None

        # all subsamples coming from the same set of root files
        allSamples = self.getAllSamples()
        subsamples = [
            x for x in allSamples if x.identifier == self.sampleToCache
        ]

        # loop over all datacard regions
        for dcMaker in self.dcMakers:
            # loop over all subsamples (which come from the same root tree files)
            for sample in subsamples:
                # combine subcut and systematics cut with logical AND;
                # systematics cuts are combined with logical OR, such that
                # one cache file can be used for all the systematics
                isData = (sample.type == 'DATA')
                systList = dcMaker.getSystematicsList(isData=isData)
                systematicsCuts = sorted(
                    list(set([x['cachecut'] for x in systList])))
                sampleCuts = {
                    'AND': [sample.subcut, {
                        'OR': systematicsCuts
                    }]
                }
                if self.verbose:
                    print(
                        json.dumps(sampleCuts,
                                   sort_keys=True,
                                   indent=8,
                                   default=str))

                # make list of branches to keep in the cached root file
                branchList = BranchList(sample.subcut)
                for key in ['cachecut', 'cut', 'var', 'weight']:
                    branchList.addCut(
                        [x[key] for x in dcMaker.getSystematicsList()])
                branchList.addCut(self.config.get('Weights', 'weightF'))
                # NOTE(review): eval() on a config value — trusted config
                # assumed; do not point this at untrusted input
                branchList.addCut(
                    eval(self.config.get('Branches', 'keep_branches')))
                branchesToKeep = branchList.getListOfBranches()

                # arbitrary (optional) name for the output tree, used for
                # print-out (the TreeCache object has no idea what it is
                # doing, e.g. dc, plot etc.)
                cacheName = 'dc:{region}_{sample}'.format(
                    region=dcMaker.getRegion(), sample=sample.name)

                # add cache object
                tc = TreeCache.TreeCache(
                    name=cacheName,
                    sample=sample.name,
                    cutList=sampleCuts,
                    cutSequenceMode='TREE',
                    branches=branchesToKeep,
                    inputFolder=dcMaker.path,
                    splitFilesChunks=self.splitFilesChunks,
                    chunkNumber=self.chunkNumber,
                    splitFilesChunkSize=self.splitFilesChunkSize,
                    fileList=self.fileList,
                    config=self.config,
                    debug=self.verbose)

                # check if this part of the sample is already cached
                isCached = tc.partIsCached()
                print(
                    "check if sample \x1b[34m{sample}\x1b[0m part {part} is cached:"
                    .format(sample=sample.name, part=self.chunkNumber),
                    isCached)
                if not isCached or self.forceRedo:
                    if isCached:
                        tc.deleteCachedFiles(chunkNumber=self.chunkNumber)

                    # for the first sample which comes from these files, load the tree
                    if not self.sampleTree:
                        self.sampleTree = SampleTree(
                            {
                                'name': sample.identifier,
                                'folder': dcMaker.path
                            },
                            splitFilesChunkSize=self.splitFilesChunkSize,
                            chunkNumber=self.chunkNumber,
                            config=self.config,
                            saveMemory=True)
                        if not self.sampleTree or not self.sampleTree.tree:
                            print(
                                "\x1b[31mERROR: creation of sample tree failed!!\x1b[0m"
                            )
                            raise Exception("CreationOfSampleTreeFailed")
                        # consistency check on the file list at submission time and now
                        fileListNow = self.sampleTree.getSampleFileNameChunk(
                            self.chunkNumber)
                        if self.fileList and (sorted(self.fileList) !=
                                              sorted(fileListNow)):
                            print(
                                "\x1b[31mERROR: sample files have changed between submission and run of the job!\x1b[0m"
                            )
                            raise Exception("SampleFilesHaveChanged")

                    # connect the TreeCache object to the input sampleTree
                    # and add it to the list of cached trees
                    self.treeCaches.append(
                        tc.setSampleTree(self.sampleTree).cache())
    else:
        print("WARNING: no datacard regions added, nothing to do.")
    return self
def run(self):
    """Cache the MVA training/evaluation trees for one sample identifier.

    Collects the branches needed as MVA input variables, then creates (or
    verifies) one TreeCache entry per (subsample, training region,
    train/eval cut) and runs the skimming pass if anything needs caching.
    """
    # ----------------------------------------------------------------------------------------------------------------------
    # cache samples
    # ----------------------------------------------------------------------------------------------------------------------
    for sampleToCache in [self.sampleIdentifier]:
        print('*' * 80)
        print(' ', sampleToCache)
        print('*' * 80)
        # prepare caches for training and evaluation samples
        treeCaches = []
        self.sampleTree = None

        # use all (sub)samples which come from the same files (sampleIdentifier)
        subsamples = [
            s for s in self.samples if s.identifier == sampleToCache
        ]

        # list of branches to keep for use as MVA input variables
        branchListOfMVAVars = BranchList()
        for sample in subsamples:
            for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.iteritems():
                for additionalCut in [self.TrainCut, self.EvalCut]:
                    branchListOfMVAVars.addCut(trainingRegionInfo['vars'])
        branchListOfMVAVars.addCut(self.config.get('Weights', 'weightF'))
        mvaBranches = branchListOfMVAVars.getListOfBranches()

        # loop over all samples
        for sample in subsamples:
            # add cuts for all training regions
            for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.iteritems():
                # add cuts for training and evaluation
                for additionalCut in [self.TrainCut, self.EvalCut]:
                    # cuts: subcut + train/eval split cut + region cut
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    if trainingRegionInfo['cut']:
                        sampleCuts.append(trainingRegionInfo['cut'])

                    splitLabel = ('TRAIN' if additionalCut == self.TrainCut
                                  else 'EVAL')
                    # add cache object
                    cacheObject = TreeCache.TreeCache(
                        name='{region}_{sample}_{tr}'.format(
                            region=trainingRegion,
                            sample=sample.name,
                            tr=splitLabel),
                        sample=sample.name,
                        cutList=sampleCuts,
                        inputFolder=self.samplesPath,
                        splitFilesChunks=self.splitFilesChunks,
                        chunkNumber=self.chunkNumber,
                        splitFilesChunkSize=self.splitFilesChunkSize,
                        branches=mvaBranches,
                        config=self.config,
                        debug=True)

                    # check if this part of the sample is already cached
                    # NOTE(review): this class uses self.force while the
                    # sibling caching jobs use self.forceRedo — confirm the
                    # attribute name against the class __init__
                    alreadyCached = cacheObject.partIsCached()
                    if not alreadyCached or self.force:
                        if alreadyCached:
                            cacheObject.deleteCachedFiles(
                                chunkNumber=self.chunkNumber)
                        # for the first sample which comes from these files, load the tree
                        if not self.sampleTree:
                            self.sampleTree = SampleTree(
                                {
                                    'name': sample.identifier,
                                    'folder': self.samplesPath
                                },
                                splitFilesChunkSize=self.splitFilesChunkSize,
                                chunkNumber=self.chunkNumber,
                                config=self.config,
                                saveMemory=True)
                        treeCaches.append(
                            cacheObject.setSampleTree(self.sampleTree).cache())

        if treeCaches:
            # run on the tree
            self.sampleTree.process()
        else:
            print("nothing to do!")
def prepare(self):
    """Convert cached ROOT trees into numpy training/testing arrays.

    For each category (BKG/SIG) and dataset part (train/test), reads the
    cached sample tree, evaluates the MVA input features and the event
    weight per event, and fills numpy arrays. Results are stored in
    self.data and written to disk; a previously written numpy cache is
    loaded instead if available.

    Returns:
        self (fluent interface)

    Raises:
        Exception("CachedTreeMissing"): if a sample tree is not cached.
    """
    # ----------------------------------------------------------------------------------------------------------------------
    # add sig/bkg x training/testing trees
    # ----------------------------------------------------------------------------------------------------------------------
    self.sampleTrees = []
    categories = ['BKG', 'SIG']
    datasetParts = {'train': self.trainCut, 'test': self.evalCut}
    cachedFilesPath = self.getCachedNumpyArrayPath()
    try:
        os.makedirs(cachedFilesPath)
    except:
        pass

    # load numpy arrays from disk if they have been already created
    if self.loadCachedNumpyArrays(cachedFilesPath):
        return self

    featureArrays = {part: [] for part in datasetParts.iterkeys()}
    eventWeights = {part: [] for part in datasetParts.iterkeys()}
    classTargets = {part: [] for part in datasetParts.iterkeys()}

    # standard weight expression
    weightF = self.config.get('Weights', 'weightF')

    for category in categories:
        for sample in self.samples[category]:
            print('*' * 80, '\n%s\n' % sample, '*' * 80)
            for datasetName, additionalCut in datasetParts.iteritems():
                # cuts: subcut + train/test split cut + mva region cut
                sampleCuts = [sample.subcut]
                if additionalCut:
                    sampleCuts.append(additionalCut)
                # cut from the mva region
                if self.treeCut:
                    sampleCuts.append(self.treeCut)

                # get ROOT tree for selected sample & region cut
                cache = TreeCache.TreeCache(sample=sample,
                                            cutList=sampleCuts,
                                            inputFolder=self.samplesPath,
                                            config=self.config,
                                            debug=True)
                tree = cache.getTree()
                if not tree:
                    print("\x1b[31mERROR: TREE NOT FOUND:", sample.name,
                          " -> not cached??\x1b[0m")
                    raise Exception("CachedTreeMissing")

                treeScale = tree.getScale(sample) * self.globalRescale
                print('scale:', treeScale)

                # pre-allocate the feature matrix for this sample
                nSamples = tree.GetEntries()
                features = self.MVA_Vars['Nominal']
                nFeatures = len(features)
                print('nFeatures:', nFeatures)
                inputData = np.zeros((nSamples, nFeatures),
                                     dtype=np.float32)

                # initialize formulas for ROOT tree
                for feature in features:
                    tree.addFormula(feature)
                tree.addFormula(weightF)

                # fill numpy array from ROOT tree
                for i, event in enumerate(tree):
                    for j, feature in enumerate(features):
                        inputData[i, j] = tree.evaluate(feature)
                    # total weight comes from weightF (btag, lepton sf, ...)
                    # and treeScale to scale MC to x-section
                    totalWeight = treeScale * tree.evaluate(weightF)
                    eventWeights[datasetName].append(totalWeight)
                    classTargets[datasetName].append(
                        categories.index(category))
                featureArrays[datasetName].append(inputData)

    # concatenate all data from different samples
    self.data = {
        'train': {
            'X': np.concatenate(featureArrays['train'], axis=0),
            'y': np.array(classTargets['train'], dtype=np.float32),
            'sample_weight': np.array(eventWeights['train'],
                                      dtype=np.float32),
        },
        'test': {
            'X': np.concatenate(featureArrays['test'], axis=0),
            'y': np.array(classTargets['test'], dtype=np.float32),
            'sample_weight': np.array(eventWeights['test'],
                                      dtype=np.float32),
        },
    }

    # write numpy arrays to disk
    self.writeNumpyArrays(cachedFilesPath)
    return self
def run(self):
    """Build numpy training/testing arrays including systematics variations.

    Like the plain array preparation, but additionally evaluates the MVA
    features for every shape systematic and the event weight for every
    weight systematic. The resulting dictionary (train/test arrays, labels
    and metadata) is pickled gzip-compressed to ./<mvaName>.dmpz.

    Raises:
        Exception("CachedTreeMissing"): if a sample tree is not cached.
    """
    # ----------------------------------------------------------------------------------------------------------------------
    # add sig/bkg x training/testing trees
    # ----------------------------------------------------------------------------------------------------------------------
    categories = self.samples.keys()
    datasetParts = {'train': self.trainCut, 'test': self.evalCut}
    systematics = self.systematics

    # accumulators per dataset part (and per systematic where applicable)
    featureArrays = {part: [] for part in datasetParts.iterkeys()}
    featureArraysSys = {
        systName: {part: [] for part in datasetParts.iterkeys()}
        for systName in systematics
    }
    eventWeights = {part: [] for part in datasetParts.iterkeys()}
    classTargets = {part: [] for part in datasetParts.iterkeys()}
    eventWeightsSys = {
        systName: {part: [] for part in datasetParts.iterkeys()}
        for systName in self.weightSYS
    }

    # standard weight expression
    weightF = self.config.get('Weights', 'weightF')

    for category in categories:
        for sample in self.samples[category]:
            print('*' * 80, '\n%s\n' % sample, '*' * 80)
            for datasetName, additionalCut in datasetParts.iteritems():
                # cuts: subcut + train/test split cut + mva region cut
                sampleCuts = [sample.subcut]
                if additionalCut:
                    sampleCuts.append(additionalCut)
                # cut from the mva region
                if self.treeCut:
                    sampleCuts.append(self.treeCut)

                # get ROOT tree for selected sample & region cut
                cache = TreeCache.TreeCache(sample=sample,
                                            cutList=sampleCuts,
                                            inputFolder=self.samplesPath,
                                            config=self.config,
                                            debug=True)
                tree = cache.getTree()
                if not tree:
                    print("\x1b[31mERROR: TREE NOT FOUND:", sample.name,
                          " -> not cached??\x1b[0m")
                    raise Exception("CachedTreeMissing")

                treeScale = tree.getScale(sample) * self.globalRescale
                print('scale:', treeScale)

                # pre-allocate nominal + per-systematic feature matrices
                nSamples = tree.GetEntries()
                features = self.MVA_Vars['Nominal']
                featuresSys = {
                    systName: self.MVA_Vars[systName]
                    for systName in systematics
                }
                nFeatures = len(features)
                print('nFeatures:', nFeatures)
                inputData = np.zeros((nSamples, nFeatures),
                                     dtype=np.float32)
                inputDataSys = {
                    systName: np.zeros((nSamples, nFeatures),
                                       dtype=np.float32)
                    for systName in systematics
                }

                # initialize formulas for ROOT tree
                for feature in features:
                    tree.addFormula(feature)
                for systName, systFeatures in featuresSys.iteritems():
                    for feature in systFeatures:
                        tree.addFormula(feature)
                tree.addFormula(weightF)
                for systName in self.weightSYS:
                    tree.addFormula(self.weightSYSweights[systName])

                # fill numpy arrays from ROOT tree
                for i, event in enumerate(tree):
                    for j, feature in enumerate(features):
                        inputData[i, j] = tree.evaluate(feature)
                    # total weight comes from weightF (btag, lepton sf, ...)
                    # and treeScale to scale MC to x-section
                    totalWeight = treeScale * tree.evaluate(weightF)
                    eventWeights[datasetName].append(totalWeight)
                    classTargets[datasetName].append(
                        categories.index(category))
                    # add weights varied by (btag) systematics
                    for systName in self.weightSYS:
                        eventWeightsSys[systName][datasetName].append(
                            treeScale *
                            tree.evaluate(self.weightSYSweights[systName]))
                    # fill systematics-shifted features
                    for systName, systFeatures in featuresSys.iteritems():
                        for j, feature in enumerate(systFeatures):
                            inputDataSys[systName][i, j] = tree.evaluate(
                                feature)

                featureArrays[datasetName].append(inputData)
                for systName in systematics:
                    featureArraysSys[systName][datasetName].append(
                        inputDataSys[systName])

    # concatenate all data from different samples
    self.data = {
        'train': {
            'X': np.concatenate(featureArrays['train'], axis=0),
            'y': np.array(classTargets['train'], dtype=np.float32),
            'sample_weight': np.array(eventWeights['train'],
                                      dtype=np.float32),
        },
        'test': {
            'X': np.concatenate(featureArrays['test'], axis=0),
            'y': np.array(classTargets['test'], dtype=np.float32),
            'sample_weight': np.array(eventWeights['test'],
                                      dtype=np.float32),
        },
        'category_labels':
        {idx: label for idx, label in enumerate(categories)},
        'meta': {
            'version': self.dataFormatVersion,
            'region': self.mvaName,
            'cutName': self.treeCutName,
            'cut': self.treeCut,
            'trainCut': self.trainCut,
            'testCut': self.evalCut,
            'samples': self.sampleNames,
            'weightF': weightF,
            'weightSYS': self.weightSYS,
            'variables': ' '.join(self.MVA_Vars['Nominal'])
        }
    }

    # add systematics variations (training set only)
    for systName in systematics:
        self.data['train']['X_' + systName] = np.concatenate(
            featureArraysSys[systName]['train'], axis=0)
    for systName in self.weightSYS:
        self.data['train']['sample_weight_' + systName] = np.array(
            eventWeightsSys[systName]['train'], dtype=np.float32)

    # dump everything as a gzip-compressed pickle
    numpyOutputFileName = './' + self.mvaName + '.dmpz'
    with gzip.open(numpyOutputFileName, 'wb') as outputFile:
        pickle.dump(self.data, outputFile)
    print(self.data['meta'])
    print("written to:\x1b[34m", numpyOutputFileName, " \x1b[0m")