def __init__(self, config, sampleIdentifier, trainingRegions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, force=False):
    """Prepare cached training trees for one sample over a set of MVA training regions.

    config              -- analysis configuration object (BetterConfigParser-like)
    sampleIdentifier    -- identifier of the sample to process
    trainingRegions     -- list of config section names defining the training regions
    splitFilesChunks    -- file-list chunking parameter (stored as-is)
    chunkNumber         -- index of the chunk processed by this job (stored as-is)
    splitFilesChunkSize -- files per chunk, -1 apparently meaning no splitting (stored as-is)
    force               -- stored as-is; presumably forces re-creation of the cache
    """
    self.config = config
    self.force = force
    self.sampleIdentifier = sampleIdentifier
    self.trainingRegions = trainingRegions
    self.sampleTree = None
    # input directory and sample definitions
    self.samplesPath = self.config.get('Directories', 'MVAin')
    self.samplesDefinitions = self.config.get('Directories', 'samplesinfo')
    self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
    self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')
    # union of background/signal sample names over all training regions
    # (config values are python list literals, hence the eval)
    self.backgroundSampleNames = list(
        set(
            sum([
                eval(self.config.get(trainingRegion, 'backgrounds'))
                for trainingRegion in self.trainingRegions
            ], [])))
    self.signalSampleNames = list(
        set(
            sum([
                eval(self.config.get(trainingRegion, 'signals'))
                for trainingRegion in self.trainingRegions
            ], [])))
    self.samples = self.samplesInfo.get_samples(
        list(set(self.backgroundSampleNames + self.signalSampleNames)))
    # per-region selection cut and input variable list
    self.trainingRegionsDict = {}
    for trainingRegion in self.trainingRegions:
        treeCutName = config.get(trainingRegion, 'treeCut')
        treeVarSet = config.get(trainingRegion, 'treeVarSet').strip()
        # whitespace-separated list of systematics; blank entries dropped
        systematics = [
            x for x in config.get('systematics', 'systematics').split(' ')
            if len(x.strip()) > 0
        ]
        # concatenate the variable lists of all systematic variations
        mvaVars = []
        for systematic in systematics:
            mvaVars += config.get(treeVarSet, systematic).strip().split(' ')
        self.trainingRegionsDict[trainingRegion] = {
            'cut': config.get('Cuts', treeCutName),
            'vars': mvaVars,
        }
    # cuts used to split events into training and evaluation sets
    self.TrainCut = config.get('Cuts', 'TrainCut')
    self.EvalCut = config.get('Cuts', 'EvalCut')
    self.splitFilesChunks = splitFilesChunks
    self.chunkNumber = chunkNumber
    self.splitFilesChunkSize = splitFilesChunkSize
    # load the VHbb helper library into ROOT (return code not checked here)
    VHbbNameSpace = config.get('VHbbNameSpace', 'library')
    ROOT.gSystem.Load(VHbbNameSpace)
def __init__(self, config, sampleIdentifier, regions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, forceRedo=False, fileList=None):
    """Prepare cached trees of one sample for a set of plot regions.

    config              -- analysis configuration object (BetterConfigParser-like)
    sampleIdentifier    -- identifier of the sample to process
    regions             -- list of region names (duplicates are removed)
    splitFilesChunks    -- file-list chunking parameter (stored as-is)
    chunkNumber         -- index of the chunk processed by this job (stored as-is)
    splitFilesChunkSize -- files per chunk, -1 apparently meaning no splitting (stored as-is)
    forceRedo           -- stored as-is; presumably forces re-creation of the cache
    fileList            -- optional compressed file list, expanded via FileList.decompress
    """
    self.config = config
    self.sampleIdentifier = sampleIdentifier
    self.regions = list(set(regions))  # drop duplicate region names
    self.forceRedo = forceRedo
    self.sampleTree = None
    # input directory and sample definitions
    self.samplesPath = self.config.get('Directories', 'plottingSamples')
    self.samplesDefinitions = self.config.get('Directories','samplesinfo')
    self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
    self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')
    # MC and data sample names (config values are python list literals)
    self.sampleNames = eval(self.config.get('Plot_general', 'samples'))
    self.dataNames = eval(self.config.get('Plot_general', 'Data'))
    self.samples = self.samplesInfo.get_samples(self.sampleNames + self.dataNames)
    # per-region selection cut, looked up in the 'Cuts' section by region name
    self.regionsDict = {}
    for region in self.regions:
        treeCut = config.get('Cuts', region)
        self.regionsDict[region] = {'cut': treeCut}
    self.splitFilesChunkSize = splitFilesChunkSize
    self.splitFilesChunks = splitFilesChunks
    self.chunkNumber = chunkNumber
    # optional explicit list of input files (given in compressed form)
    self.fileList = FileList.decompress(fileList) if fileList else None
    # load the VHbb helper library into ROOT and report success/failure
    VHbbNameSpace=config.get('VHbbNameSpace','library')
    returnCode = ROOT.gSystem.Load(VHbbNameSpace)
    if returnCode != 0:
        print ("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"%returnCode)
    else:
        print ("INFO: loaded VHbbNameSpace: %s"%VHbbNameSpace)
def __init__(self, config, mvaName):
    """Set up an MVA training: factory settings, samples, variables and cuts.

    config  -- analysis configuration object (BetterConfigParser-like)
    mvaName -- name of the config section describing the MVA to train
    """
    get = config.get  # shorthand for the many config lookups below
    self.config = config
    self.mvaName = mvaName
    # TMVA factory configuration
    self.factoryname = get('factory', 'factoryname')
    self.factorysettings = get('factory', 'factorysettings')
    # input locations and sample definitions
    self.samplesPath = get('Directories', 'MVAin')
    self.samplesDefinitions = get('Directories', 'samplesinfo')
    self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
    self.sampleFilesFolder = get('Directories', 'samplefiles')
    # MVA method definition from the mvaName section
    self.treeVarSet = get(mvaName, 'treeVarSet')
    self.MVAtype = get(mvaName, 'MVAtype')
    self.MVAsettings = get(mvaName, 'MVAsettings')
    # load the VHbb helper library into ROOT
    ROOT.gSystem.Load(get('VHbbNameSpace', 'library'))
    # training variables: only the nominal set is used here
    self.MVA_Vars = {'Nominal': get(self.treeVarSet, 'Nominal').strip().split(' ')}
    # signal/background samples (config values are python list literals)
    self.samples = {
        'BKG': self.samplesInfo.get_samples(eval(get(mvaName, 'backgrounds'))),
        'SIG': self.samplesInfo.get_samples(eval(get(mvaName, 'signals'))),
    }
    # region cut plus the cuts splitting events into train/eval sets
    self.treeCutName = get(mvaName, 'treeCut')
    self.treeCut = get('Cuts', self.treeCutName)
    self.TrainCut = get('Cuts', 'TrainCut')
    self.EvalCut = get('Cuts', 'EvalCut')
    print("TRAINING CUT:", self.TrainCut)
    print("EVAL CUT:", self.EvalCut)
    # MC scaled by 2 to compensate for the 50/50 train/eval split
    self.globalRescale = 2.0
    self.trainingOutputFileName = 'mvatraining_{factoryname}_{region}.root'.format(
        factoryname=self.factoryname, region=mvaName)
    print("INFO: MvaTrainingHelper class created.")
def __init__(self, config, sampleIdentifier, regions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, forceRedo=False, fileList=None):
    """Prepare cached trees of one sample for a set of plot regions.

    config              -- analysis configuration object (BetterConfigParser-like)
    sampleIdentifier    -- identifier of the sample to process
    regions             -- list of region names (duplicates are removed)
    splitFilesChunks    -- file-list chunking parameter (stored as-is)
    chunkNumber         -- index of the chunk processed by this job (stored as-is)
    splitFilesChunkSize -- files per chunk, -1 apparently meaning no splitting (stored as-is)
    forceRedo           -- stored as-is; presumably forces re-creation of the cache
    fileList            -- optional compressed file list, expanded via FileList.decompress
    """
    self.config = config
    self.sampleIdentifier = sampleIdentifier
    self.regions = list(set(regions))  # drop duplicate region names
    self.forceRedo = forceRedo
    self.sampleTree = None
    # input directory and sample definitions
    # (here ParseInfo takes keyword arguments, unlike the positional form used elsewhere)
    self.samplesPath = self.config.get('Directories', 'plottingSamples')
    self.samplesInfo = ParseInfo(samples_path=self.samplesPath, config=self.config)
    self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')
    # MC and data sample names (config values are python list literals)
    self.sampleNames = list(
        eval(self.config.get('Plot_general', 'samples')))
    self.dataNames = list(eval(self.config.get('Plot_general', 'Data')))
    self.samples = self.samplesInfo.get_samples(self.sampleNames + self.dataNames)
    # per-region selection cut, looked up in the 'Cuts' section by region name
    self.regionsDict = {}
    for region in self.regions:
        treeCut = config.get('Cuts', region)
        self.regionsDict[region] = {'cut': treeCut}
    self.splitFilesChunkSize = splitFilesChunkSize
    self.splitFilesChunks = splitFilesChunks
    self.chunkNumber = chunkNumber
    # optional explicit list of input files (given in compressed form)
    self.fileList = FileList.decompress(fileList) if fileList else None
    # load the VHbb helper library into ROOT and report success/failure
    VHbbNameSpace = config.get('VHbbNameSpace', 'library')
    returnCode = ROOT.gSystem.Load(VHbbNameSpace)
    if returnCode != 0:
        print(
            "\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"
            % returnCode)
    else:
        print("INFO: loaded VHbbNameSpace: %s" % VHbbNameSpace)
def __init__(self, config, region, sampleIdentifier=None, opts=None):
    """Set up plotting of one region: paths, sample lists and optional cuts.

    config           -- analysis configuration object (BetterConfigParser-like)
    region           -- name of the plot region; section 'Plot:<region>' is read
    sampleIdentifier -- optional comma-separated list restricting the samples used
    opts             -- parsed command-line options; inputDir/outputDir are read from it
                        NOTE(review): opts defaults to None but opts.inputDir is
                        dereferenced unconditionally below, so calling without opts
                        raises AttributeError — confirm whether None is a valid value.
    """
    self.config = config
    self.region = region
    # comma-separated identifiers -> list; None means "use all samples"
    self.sampleIdentifiers = sampleIdentifier.split(',') if sampleIdentifier and len(sampleIdentifier) > 0 else None
    # VHbb namespace
    VHbbNameSpace=config.get('VHbbNameSpace','library')
    returnCode = ROOT.gSystem.Load(VHbbNameSpace)
    if returnCode != 0:
        print ("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"%returnCode)
    else:
        print ("INFO: loaded VHbbNameSpace: %s"%VHbbNameSpace)
    # input/output paths
    self.fileLocator = FileLocator(config=self.config)
    self.pathIN = self.config.get('Directories', opts.inputDir)
    self.pathOUT = self.config.get('Directories', opts.outputDir)
    self.tmpDir = self.config.get('Directories', 'scratch')
    self.samplesPath = config.get('Directories', 'plottingSamples')
    self.samplesDefinitions = config.get('Directories','samplesinfo')
    self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
    self.sampleFilesFolder = config.get('Directories', 'samplefiles')
    self.plotPath = config.get('Directories', 'plotpath')
    # plot regions
    self.configSection='Plot:%s'%region
    # additional cut to only plot a subset of the region
    self.subcut = None
    if self.config.has_option(self.configSection, 'subcut'):
        self.subcut = self.config.get(self.configSection, 'subcut')
        print("INFO: use cut:", self.subcut)
    # additional global blinding cut:
    self.addBlindingCut = None
    if self.config.has_option('Plot_general','addBlindingCut'): #contained in plots, cut on the event number
        self.addBlindingCut = self.config.get('Plot_general','addBlindingCut')
        print ('adding add. blinding cut:', self.addBlindingCut)
    # load samples (config values are python list literals, hence the eval)
    self.data = eval(self.config.get(self.configSection, 'Datas')) # read the data corresponding to each CR (section)
    self.mc = eval(self.config.get('Plot_general', 'samples')) # read the list of mc samples
    self.total_lumi = eval(self.config.get('General', 'lumi'))
    # a region with a 'Signal' option is treated as signal region and the
    # signal sample is appended to the MC list
    self.signalRegion = False
    if self.config.has_option(self.configSection, 'Signal'):
        self.mc.append(self.config.get(self.configSection, 'Signal'))
        self.signalRegion = True
    self.dataSamples = self.samplesInfo.get_samples(self.data)
    self.mcSamples = self.samplesInfo.get_samples(self.mc)
    # filter samples used in the plot
    if self.sampleIdentifiers:
        self.dataSamples = [x for x in self.dataSamples if x.identifier in self.sampleIdentifiers]
        self.mcSamples = [x for x in self.mcSamples if x.identifier in self.sampleIdentifiers]
def customInit(self, initVars):
    """Initialize the module from the per-sample init dict.

    Reads sample/tree/config out of initVars, collects the subsamples of the
    current sample (pre-registering their cut formulas on the tree), and
    registers output branches: one per sample group, a sampleIndex branch,
    and optionally a globally unique event number.

    initVars -- dict with keys 'sample', 'sampleTree' and 'config'

    Raises Exception("SampleGroup__customInit__not_implemented") when a unique
    event number is requested for a chain of more than one file.
    """
    self.sample = initVars['sample']
    self.sampleTree = initVars['sampleTree']
    self.config = initVars['config']
    self.samplesInfo = ParseInfo(samples_path=self.config.get(
        'Directories', 'dcSamples'), config=self.config)
    # subsamples of the current sample; their cut formulas are registered below
    self.subsamples = [
        x for x in self.samplesInfo
        if x.identifier == self.sample.identifier and x.subsample
    ]
    print("INFO: subsamples/cut")
    for s in self.subsamples:
        print(" >", s.name, s.subcut)
        self.sampleTree.addFormula(s.subcut)
    # group definition maps sample name -> group name; taken from config
    # unless one was already supplied
    if not self.groupDict:
        self.groupDict = eval(self.config.get('LimitGeneral', 'Group'))
    self.groupNames = list(set(self.groupDict.values()))
    # invert the mapping: group name -> list of sample names
    # NOTE: dict.iteritems() makes this python-2 only
    self.groups = {
        k: [x for x, y in self.groupDict.iteritems() if y == k]
        for k in self.groupNames
    }
    # one membership branch per group (sampleNames itself is not used here)
    for groupName, sampleNames in self.groups.iteritems():
        self.branches.append({
            'name': self.prefix + groupName,
            'formula': self.isInGroup,
            'arguments': groupName
        })
    self.branches.append({
        'name': 'sampleIndex',
        'formula': self.getSampleIndex,
        'type': 'i'
    })
    # optional unique event number; needs a per-file event-count offset and is
    # only implemented for single-file trees
    if self.eventCountsDict:
        self.branches.append({
            'name': 'event_unique',
            'formula': self.getEventNumber,
            'type': 'l'
        })
        if len(self.sampleTree.sampleFileNames) != 1:
            print(
                "ERROR: adding unique event numbers for chains is not implemented!"
            )
            raise Exception("SampleGroup__customInit__not_implemented")
        self.eventNumberOffset = self.eventCountsDict[
            self.sample.identifier][self.sampleTree.sampleFileNames[0]]
def __init__(self, config, region, vars = None, title=None):
    """Set up a per-region plot: variable list, sample lists and grouping.

    config -- analysis configuration object (BetterConfigParser-like)
    region -- name of the plot region; section 'Plot:<region>' is read
    vars   -- optional list of variable names; falls back to the region config
    title  -- optional plot title; empty strings are treated as None
    """
    self.config = config
    self.region = region
    self.vars = vars
    self.title = title if title and len(title)>0 else None
    # VHbb namespace
    VHbbNameSpace=config.get('VHbbNameSpace','library')
    returnCode = ROOT.gSystem.Load(VHbbNameSpace)
    if returnCode != 0:
        print ("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"%returnCode)
    else:
        print ("INFO: loaded VHbbNameSpace: %s"%VHbbNameSpace)
    # additional blinding cut:
    self.addBlindingCut = None
    if self.config.has_option('Plot_general','addBlindingCut'): #contained in plots, cut on the event number
        self.addBlindingCut = self.config.get('Plot_general','addBlindingCut')
        print ('adding add. blinding cut:', self.addBlindingCut)
    # input directories and sample definitions
    self.samplesPath = config.get('Directories', 'plottingSamples')
    self.samplesDefinitions = config.get('Directories','samplesinfo')
    self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
    self.sampleFilesFolder = config.get('Directories', 'samplefiles')
    self.plotPath = config.get('Directories', 'plotpath')
    # plot regions
    self.configSection='Plot:%s'%region
    # clean the user-supplied variable list; fall back to the region's config
    if self.vars and type(self.vars) == list:
        self.vars = [x.strip() for x in self.vars if len(x.strip()) > 0]
    if not self.vars or len(self.vars) < 1:
        varListFromConfig = self.config.get(self.configSection, 'vars').split(',')
        print ("VARS::", self.configSection, " => ", varListFromConfig)
        self.vars = [x.strip() for x in varListFromConfig if len(x.strip()) > 0]
    # load samples (config values are python list literals, hence the eval)
    self.data = eval(self.config.get(self.configSection, 'Datas')) # read the data corresponding to each CR (section)
    self.mc = eval(self.config.get('Plot_general', 'samples')) # read the list of mc samples
    self.total_lumi = eval(self.config.get('General', 'lumi'))
    # a region with a 'Signal' option is treated as signal region and the
    # signal sample is appended to the MC list
    self.signalRegion = False
    if self.config.has_option(self.configSection, 'Signal'):
        self.mc.append(self.config.get(self.configSection, 'Signal'))
        self.signalRegion = True
    self.dataSamples = self.samplesInfo.get_samples(self.data)
    self.mcSamples = self.samplesInfo.get_samples(self.mc)
    # grouping of samples for stacked plotting
    self.groupDict = eval(self.config.get('Plot_general', 'Group'))
    self.subcutPlotName = ''
    self.histogramStacks = {}
def __init__(self, config, sampleIdentifier, trainingRegions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, force=False):
    """Prepare cached training trees, including weight-systematic variations.

    Unlike the variant that collects variables per systematic, this one uses
    only the Nominal variable set and additionally collects up/down weight
    expressions for each configured systematic.

    config              -- analysis configuration object (BetterConfigParser-like)
    sampleIdentifier    -- identifier of the sample to process
    trainingRegions     -- list of config section names defining the training regions
    splitFilesChunks    -- file-list chunking parameter (stored as-is)
    chunkNumber         -- index of the chunk processed by this job (stored as-is)
    splitFilesChunkSize -- files per chunk, -1 apparently meaning no splitting (stored as-is)
    force               -- stored as-is; presumably forces re-creation of the cache
    """
    self.config = config
    self.force = force
    self.sampleIdentifier = sampleIdentifier
    self.trainingRegions = trainingRegions
    self.sampleTree = None
    # input directory and sample definitions
    self.samplesPath = self.config.get('Directories', 'MVAin')
    self.samplesDefinitions = self.config.get('Directories','samplesinfo')
    self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
    self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')
    # union of background/signal sample names over all training regions
    # (config values are python list literals, hence the eval)
    self.backgroundSampleNames = list(set(sum([eval(self.config.get(trainingRegion, 'backgrounds')) for trainingRegion in self.trainingRegions], [])))
    self.signalSampleNames = list(set(sum([eval(self.config.get(trainingRegion, 'signals')) for trainingRegion in self.trainingRegions], [])))
    self.samples = self.samplesInfo.get_samples(list(set(self.backgroundSampleNames + self.signalSampleNames)))
    # per-region cut, nominal variables and weight variations
    self.trainingRegionsDict = {}
    for trainingRegion in self.trainingRegions:
        treeCutName = config.get(trainingRegion, 'treeCut')
        treeVarSet = config.get(trainingRegion, 'treeVarSet').strip()
        #systematics = [x for x in config.get('systematics', 'systematics').split(' ') if len(x.strip())>0]
        # systematics are optional and given per training region as a list literal
        systematics = eval(config.get(trainingRegion, 'systematics')) if config.has_option(trainingRegion, 'systematics') else []
        # only the Nominal variable set is used for training inputs
        mvaVars = config.get(treeVarSet, 'Nominal').split(' ')
        # per-systematic up/down weight expressions; the Weights section may use
        # either the _UP/_DOWN or the _Up/_Down naming convention
        weightVars = []
        #for systematic in systematics:
        for syst in systematics:
            systNameUp = syst+'_UP' if self.config.has_option('Weights',syst+'_UP') else syst+'_Up'
            systNameDown = syst+'_DOWN' if self.config.has_option('Weights',syst+'_DOWN') else syst+'_Down'
            weightVars += [self.config.get('Weights',systNameUp), self.config.get('Weights',systNameDown)]
        self.trainingRegionsDict[trainingRegion] = {
            'cut': config.get('Cuts', treeCutName),
            'vars': mvaVars,
            'weightVars': weightVars,
        }
    # cuts used to split events into training and evaluation sets
    self.TrainCut = config.get('Cuts', 'TrainCut')
    self.EvalCut = config.get('Cuts', 'EvalCut')
    self.splitFilesChunks = splitFilesChunks
    self.chunkNumber = chunkNumber
    self.splitFilesChunkSize = splitFilesChunkSize
    # load the VHbb helper library into ROOT (return code not checked here)
    VHbbNameSpace=config.get('VHbbNameSpace','library')
    ROOT.gSystem.Load(VHbbNameSpace)
def __init__(self, config, mvaName):
    """Set up an MVA training: factory settings, samples, variables and cuts.

    config  -- analysis configuration object (BetterConfigParser-like)
    mvaName -- name of the config section describing the MVA to train
    """
    self.config = config
    # TMVA factory configuration
    self.factoryname = config.get('factory', 'factoryname')
    self.factorysettings = config.get('factory', 'factorysettings')
    # input locations and sample definitions
    self.samplesPath = config.get('Directories', 'MVAin')
    self.samplesDefinitions = config.get('Directories','samplesinfo')
    self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
    self.sampleFilesFolder = config.get('Directories', 'samplefiles')
    # MVA method definition from the mvaName section
    self.treeVarSet = config.get(mvaName, 'treeVarSet')
    self.MVAtype = config.get(mvaName, 'MVAtype')
    self.MVAsettings = config.get(mvaName,'MVAsettings')
    self.mvaName = mvaName
    # load the VHbb helper library into ROOT
    VHbbNameSpace = config.get('VHbbNameSpace', 'library')
    ROOT.gSystem.Load(VHbbNameSpace)
    # variables (only the nominal set is used here)
    self.MVA_Vars = {}
    self.MVA_Vars['Nominal'] = config.get(self.treeVarSet, 'Nominal').strip().split(' ')
    # samples (config values are python list literals, hence the eval)
    backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
    signalSampleNames = eval(config.get(mvaName, 'signals'))
    self.samples = {
        'BKG': self.samplesInfo.get_samples(backgroundSampleNames),
        'SIG': self.samplesInfo.get_samples(signalSampleNames),
    }
    # region cut plus the cuts splitting events into train/eval sets
    self.treeCutName = config.get(mvaName, 'treeCut')
    self.treeCut = config.get('Cuts', self.treeCutName)
    self.TrainCut = config.get('Cuts', 'TrainCut')
    self.EvalCut = config.get('Cuts', 'EvalCut')
    print("TRAINING CUT:", self.TrainCut)
    print("EVAL CUT:", self.EvalCut)
    # MC scaled by 2 to compensate for the 50/50 train/eval split
    self.globalRescale = 2.0
    self.trainingOutputFileName = 'mvatraining_{factoryname}_{region}.root'.format(factoryname=self.factoryname, region=mvaName)
    print("INFO: MvaTrainingHelper class created.")
def __init__(self, config, mvaName):
    """Set up MVA input data: region cuts, variables per systematic, samples.

    config  -- analysis configuration object (BetterConfigParser-like)
    mvaName -- name of the config section describing the MVA
    """
    self.mvaName = mvaName
    # load the VHbb helper library into ROOT
    VHbbNameSpace = config.get('VHbbNameSpace', 'library')
    ROOT.gSystem.Load(VHbbNameSpace)
    # version tag of the data format produced by this helper
    self.dataFormatVersion = 2
    self.sampleTrees = []
    self.config = config
    # input directory and sample definitions
    self.samplesPath = config.get('Directories', 'MVAin')
    self.samplesDefinitions = config.get('Directories','samplesinfo')
    self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
    # region
    self.treeCutName = config.get(mvaName, 'treeCut')
    self.treeCut = config.get('Cuts', self.treeCutName)
    # split in train/eval sets
    self.trainCut = config.get('Cuts', 'TrainCut')
    self.evalCut = config.get('Cuts', 'EvalCut')
    # rescale MC by 2 because of train/eval split
    self.globalRescale = 2.0
    # variables and systematics: one variable list per systematic variation
    self.treeVarSet = config.get(mvaName, 'treeVarSet')
    self.systematics = config.get('systematics', 'systematics').strip().split(' ')
    self.MVA_Vars = {'Nominal': [x for x in config.get(self.treeVarSet, 'Nominal').strip().split(' ') if len(x.strip()) > 0]}
    for sys in self.systematics:
        self.MVA_Vars[sys] = [x for x in config.get(self.treeVarSet, sys).strip().split(' ') if len(x.strip()) > 0]
    # samples, grouped into categories (config values are python list literals)
    self.sampleNames = {
        # 'BKG_TT': eval(self.config.get('Plot_general', 'TT')),
        # 'BKG_ST': eval(self.config.get('Plot_general', 'ST')),
        # 'BKG_VV': eval(self.config.get('Plot_general', 'VV')),
        # 'BKG_DY2b': eval(self.config.get('Plot_general', 'DY2b')),
        # 'BKG_DY1b': eval(self.config.get('Plot_general', 'DY1b')),
        # 'BKG_DY0b': eval(self.config.get('Plot_general', 'DYlight')),
        # 'SIG_ggZH': eval(self.config.get('Plot_general', 'ggZH')),
        # 'SIG_qqZH': eval(self.config.get('Plot_general', 'qqZH')),
        'SIG_ALL': eval(self.config.get('Plot_general', 'allSIG')),
        'BKG_ALL': eval(self.config.get('Plot_general', 'allBKG')),
    }
    # NOTE: dict.iteritems() makes this python-2 only
    self.samples = {category: self.samplesInfo.get_samples(samples) for category,samples in self.sampleNames.iteritems()}
class MvaTrainingHelper(object):
    """Drive a TMVA training: load cached trees, book and train the method,
    back up previous weight files, and estimate the expected significance.

    Supports both the ROOT 5 style TMVA API (trees/weights set directly on the
    factory) and the ROOT 6 style API (via a TMVA.DataLoader); the choice is
    made at runtime by try/except on the factory methods.
    """

    def __init__(self, config, mvaName):
        """Read factory settings, samples, variables and cuts from config.

        config  -- analysis configuration object (BetterConfigParser-like)
        mvaName -- name of the config section describing the MVA to train
        """
        self.config = config
        # TMVA factory configuration
        self.factoryname = config.get('factory', 'factoryname')
        self.factorysettings = config.get('factory', 'factorysettings')
        # input locations and sample definitions
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        # MVA method definition from the mvaName section
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.MVAtype = config.get(mvaName, 'MVAtype')
        self.MVAsettings = config.get(mvaName,'MVAsettings')
        self.mvaName = mvaName
        # load the VHbb helper library into ROOT
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
        # variables (only the nominal set is used for training)
        self.MVA_Vars = {}
        self.MVA_Vars['Nominal'] = config.get(self.treeVarSet, 'Nominal').strip().split(' ')
        # samples (config values are python list literals, hence the eval)
        backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
        signalSampleNames = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(backgroundSampleNames),
            'SIG': self.samplesInfo.get_samples(signalSampleNames),
        }
        # region cut plus the cuts splitting events into train/eval sets
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)
        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')
        print("TRAINING CUT:", self.TrainCut)
        print("EVAL CUT:", self.EvalCut)
        # MC scaled by 2 to compensate for the 50/50 train/eval split
        self.globalRescale = 2.0
        self.trainingOutputFileName = 'mvatraining_{factoryname}_{region}.root'.format(factoryname=self.factoryname, region=mvaName)
        print("INFO: MvaTrainingHelper class created.")

    def prepare(self):
        """Create the TMVA factory and feed it the cached sig/bkg trees.

        Returns self so calls can be chained (prepare().run()).
        Raises Exception("CachedTreeMissing") if a cached tree cannot be found.
        """
        self.trainingOutputFile = ROOT.TFile.Open(self.trainingOutputFileName, "RECREATE")
        # ----------------------------------------------------------------------------------------------------------------------
        # create TMVA factory
        # ----------------------------------------------------------------------------------------------------------------------
        self.factory = ROOT.TMVA.Factory(self.factoryname, self.trainingOutputFile, self.factorysettings)
        if self.trainingOutputFile and self.factory:
            print ("INFO: initialized MvaTrainingHelper.", self.factory)
        else:
            print ("\x1b[31mERROR: initialization of MvaTrainingHelper failed!\x1b[0m")
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/eval trees
        # ----------------------------------------------------------------------------------------------------------------------
        # ROOT 5 exposes AddBackgroundTree/AddSignalTree on the factory itself;
        # in ROOT 6 these moved to a DataLoader, hence the try/except dispatch
        try:
            addBackgroundTreeMethod = self.factory.AddBackgroundTree
            addSignalTreeMethod = self.factory.AddSignalTree
            self.dataLoader = None
        except:
            print("oh no..")
            # the DataLoader wants to be called '.'
            self.dataLoader = ROOT.TMVA.DataLoader(".")
            addBackgroundTreeMethod = self.dataLoader.AddBackgroundTree
            addSignalTreeMethod = self.dataLoader.AddSignalTree
        # DEBUG: restrict memory
        # resource.setrlimit(resource.RLIMIT_AS, (4.0*1024*1024*1024, 5.0*1024*1024*1024))
        self.sampleTrees = []
        # add every sample twice: once with the training cut, once with the
        # evaluation cut, scaled by globalRescale to compensate the split
        for addTreeFcn, samples in [
                [addBackgroundTreeMethod, self.samples['BKG']],
                [addSignalTreeMethod, self.samples['SIG']]
                ]:
            for sample in samples:
                print ('*'*80,'\n%s\n'%sample,'*'*80)
                for additionalCut in [self.TrainCut, self.EvalCut]:
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)
                    tc = TreeCache.TreeCache(
                            sample=sample,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            config=self.config,
                            debug=True
                        )
                    sampleTree = tc.getTree()
                    sampleTree.tree.SetCacheSize(32*1024)
                    # prevent garbage collection
                    self.sampleTrees.append(sampleTree)
                    if sampleTree:
                        treeScale = sampleTree.getScale(sample) * self.globalRescale
                        # only non-empty trees can be added
                        if sampleTree.tree.GetEntries() > 0:
                            addTreeFcn(sampleTree.tree, treeScale, ROOT.TMVA.Types.kTraining if additionalCut == self.TrainCut else ROOT.TMVA.Types.kTesting)
                            print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
                    else:
                        print ("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")
        # register the training variables on whichever object is in use
        if self.dataLoader:
            for var in self.MVA_Vars['Nominal']:
                self.dataLoader.AddVariable(var, 'D')
        else:
            for var in self.MVA_Vars['Nominal']:
                self.factory.AddVariable(var, 'D')
        return self

    # ----------------------------------------------------------------------------------------------------------------------
    # backup old .xml and .info files
    # ----------------------------------------------------------------------------------------------------------------------
    def backupOldFiles(self):
        """Copy existing weight/info files into a new numbered backup folder.

        Returns True on success, False if anything went wrong (errors are
        printed, not raised).
        """
        success = False
        MVAdir = self.config.get('Directories','vhbbpath')+'/python/weights/'
        backupDir = MVAdir + 'backup/'
        # best effort: the directory may already exist
        try:
            os.makedirs(backupDir)
        except:
            pass
        # find the next free version number vN by inspecting existing folders
        freeNumber = 1
        try:
            lastUsedBackupDirectories = sorted(glob.glob(backupDir + '/v*/'), key=lambda x: int(x.strip('/').split('/')[-1][1:]), reverse=True)
            freeNumber = 1 + int(lastUsedBackupDirectories[0].strip('/').split('/')[-1][1:]) if len(lastUsedBackupDirectories) > 0 else 1
        except Exception as e:
            print("\x1b[31mERROR: creating backup of MVA files failed!", e, "\x1b[0m")
            freeNumber = -1
        if freeNumber > -1:
            try:
                fileNamesToBackup = glob.glob(MVAdir + self.factoryname+'_'+self.mvaName + '.*')
                fileNamesToBackup += glob.glob(MVAdir + '/../mvatraining_MVA_ZllBDT_*.root')
                os.makedirs(backupDir + 'v%d/'%freeNumber)
                for fileNameToBackup in fileNamesToBackup:
                    shutil.copy(fileNameToBackup, backupDir + 'v%d/'%freeNumber)
                success = True
            except Exception as e:
                print("\x1b[31mERROR: creating backup of MVA files failed!", e, "\x1b[0m")
        return success

    def run(self):
        """Book the method and run TrainAllMethods/TestAllMethods/EvaluateAllMethods.

        Optionally backs up old weight files first (config MVAGeneral.backupWeights).
        Returns self.
        """
        backupFiles = False
        try:
            backupFiles = eval(self.config.get('MVAGeneral', 'backupWeights'))
        except:
            pass
        if backupFiles:
            print('backing up old BDT files')
            self.backupOldFiles()
        # ----------------------------------------------------------------------------------------------------------------------
        # Execute TMVA
        # ----------------------------------------------------------------------------------------------------------------------
        self.factory.Verbose()
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        print('Execute TMVA: factory.BookMethod("%s", "%s", "%s")'%(self.MVAtype, self.mvaName, self.MVAsettings))
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        weightF = self.config.get('Weights','weightF')
        # ROOT 5 books directly on the factory; ROOT 6 needs the data loader
        try:
            self.factory.BookMethod(self.MVAtype, self.mvaName, self.MVAsettings)
            print("ROOT 5 style TMVA found")
            self.factory.SetSignalWeightExpression(weightF)
            self.factory.SetBackgroundWeightExpression(weightF)
        except:
            print("ROOT 6 style TMVA found, using data loader object!!! >_<")
            print(" weights dir:", ROOT.TMVA.gConfig().GetIONames().fWeightFileDir)
            print(" data loader:", self.dataLoader)
            print(" type: ", self.MVAtype)
            print(" name: ", self.mvaName)
            print(" settings: ", self.MVAsettings)
            ROOT.TMVA.gConfig().GetIONames().fWeightFileDir = 'weights'
            self.dataLoader.SetSignalWeightExpression(weightF)
            self.dataLoader.SetBackgroundWeightExpression(weightF)
            self.factory.BookMethod(self.dataLoader, self.MVAtype, self.mvaName, self.MVAsettings)
        sys.stdout.flush()
        print('Execute TMVA: TrainAllMethods')
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.factory.TrainAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: TestAllMethods')
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.factory.TestAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: EvaluateAllMethods')
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.factory.EvaluateAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: output.Write')
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.trainingOutputFile.Close()
        return self

    def printInfo(self):
        """Pickle the training metadata (mvainfo object) into a .info file."""
        #WRITE INFOFILE
        MVAdir = self.config.get('Directories','vhbbpath')+'/python/weights/'
        infofile = open(MVAdir+self.factoryname+'_'+self.mvaName+'.info','w')
        print ('@DEBUG: output infofile name')
        print (infofile)
        info=mvainfo(self.mvaName)
        info.factoryname=self.factoryname
        info.factorysettings=self.factorysettings
        info.MVAtype=self.MVAtype
        info.MVAsettings=self.MVAsettings
        info.weightfilepath=MVAdir
        info.path=self.samplesPath
        info.varset=self.treeVarSet
        info.vars=self.MVA_Vars['Nominal']
        pickle.dump(info,infofile)
        infofile.close()

    def getExpectedSignificance(self, tree, nBins, xMin, xMax, power=1.0, rescaleSig=1.0, rescaleBkg=1.0):
        """Histogram the BDT score from *tree* and compute Z = sqrt(sum S^2/(S+B)).

        tree       -- TMVA test/train tree with branches <mvaName>, weight, classID
        nBins      -- number of score bins
        xMin, xMax -- score range; values outside are clamped into it
        power      -- optional power-law rescaling of the normalized score
        rescaleSig -- extra weight factor applied to signal (classID == 1)
        rescaleBkg -- extra weight factor applied to background

        Returns (expectedSignificance, signalSum, backgroundSum); also prints
        a per-bin S / B / S/sqrt(S+B) table.
        """
        hSIG = ROOT.TH1D("hSig","hSig",nBins,xMin,xMax)
        hBKG = ROOT.TH1D("hBkg","hBkg",nBins,xMin,xMax)
        print("INFO: GetEntries() = ", tree.GetEntries())
        if power != 1.0:
            print("INFO: rescale BDT score with power ", power)
        for event in tree:
            if power != 1.0:
                # normalize to [0,1), apply the power, map back to [xMin,xMax)
                x = (getattr(event, self.mvaName)-xMin)/(xMax-xMin)
                if x<0:
                    x=0
                if x>0.999999:
                    x=0.999999
                value = math.pow(x, power)*(xMax-xMin)+xMin
            else:
                # clamp the raw score into the histogram range
                value = max(min(getattr(event, self.mvaName),xMax-0.00001),xMin)
            weight = event.weight
            if event.classID == 1:
                hSIG.Fill(value, weight * rescaleSig)
            else:
                hBKG.Fill(value, weight * rescaleBkg)
        ssbSum = 0.0
        sSum = 0
        bSum = 0
        sbTableFormat = "{bin: <16}{signal: <16}{background: <16}{ssb: <16}"
        print("---- nBins =", nBins, " from ", xMin, "..", xMax, "-----")
        print(sbTableFormat.format(bin="bin", signal="signal", background="background", ssb="S/sqrt(S+B)"))
        for i in range(nBins):
            # empty bins contribute 0 instead of dividing by zero
            ssbSum += hSIG.GetBinContent(1+i)*hSIG.GetBinContent(1+i)/(hSIG.GetBinContent(1+i) + hBKG.GetBinContent(1+i)) if (hSIG.GetBinContent(1+i) + hBKG.GetBinContent(1+i)) > 0 else 0
            sSum += hSIG.GetBinContent(1+i)
            bSum += hBKG.GetBinContent(1+i)
            ssb = hSIG.GetBinContent(1+i)/math.sqrt(hSIG.GetBinContent(1+i) + hBKG.GetBinContent(1+i)) if (hSIG.GetBinContent(1+i) + hBKG.GetBinContent(1+i)) > 0 else 0
            print(sbTableFormat.format(bin=i, signal=round(hSIG.GetBinContent(1+i),1), background=round(hBKG.GetBinContent(1+i),1), ssb=round(ssb,3)))
        expectedSignificance = math.sqrt(ssbSum)
        print(sbTableFormat.format(bin="SUM", signal=round(sSum,1), background=round(bSum,1), ssb="\x1b[34mZ=%1.3f\x1b[0m"%expectedSignificance))
        print("-"*40)
        hSIG.Delete()
        hBKG.Delete()
        return expectedSignificance, sSum, bSum

    def estimateExpectedSignificance(self):
        """Scan a few binnings/rescalings of the BDT output and print the
        expected significance for the test and (renormalized) training trees."""
        print("INFO: open ", self.trainingOutputFileName)
        rootFile = ROOT.TFile.Open(self.trainingOutputFileName, "READ")
        print("INFO: ->", rootFile)
        testTree = rootFile.Get('./TestTree')
        # run a few tests with different binnings and rescaling of BDT score
        self.getExpectedSignificance(testTree, 15, -0.8, 1.0)
        self.getExpectedSignificance(testTree, 15, -0.8, 0.9)
        self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=0.5)
        self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=0.33)
        self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=1.5)
        self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=2.0)
        # close to nominal binning
        print("---- ~nominal TEST -----")
        esTest, sTest, bTest = self.getExpectedSignificance(testTree, 15, -0.8, 0.8)
        print("---- ~nominal TRAINING (without correct normalization) -----")
        trainTree = rootFile.Get('./TrainTree')
        esTrain, sTrain, bTrain = self.getExpectedSignificance(trainTree, 15, -0.8, 0.8)
        # the tree ./TrainTree contains the input events for training AFTER re-balancing the classes
        # therefore for SIG/BKG separately the normalization is fixed to the one of the TEST events
        rescaleSig = 1.0*sTest/sTrain
        rescaleBkg = 1.0*bTest/bTrain
        print("---- ~nominal TRAINING -----")
        trainTree = rootFile.Get('./TrainTree')
        esTrain, sTrain, bTrain = self.getExpectedSignificance(trainTree, 15, -0.8, 0.8, rescaleSig=rescaleSig, rescaleBkg=rescaleBkg)
# Script setup (python 2): read paths from the config, load the VHbb ROOT
# library, resolve the sample selected with -S and parse --addCollections.
TrainFlag = eval(config.get('Analysis','TrainFlag'))
btagLibrary = config.get('BTagReshaping','library')
samplesinfo=config.get('Directories','samplesinfo')
channel=config.get('Configuration','channel')
VHbbNameSpace=config.get('VHbbNameSpace','library')
ROOT.gSystem.Load(VHbbNameSpace)
pathIN = config.get('Directories','SYSin')
pathOUT = config.get('Directories','SYSout')
tmpDir = config.get('Directories','scratch')
print 'INput samples:\t%s'%pathIN
print 'OUTput samples:\t%s'%pathOUT
fileLocator = FileLocator(config=config)
# samples
info = ParseInfo(samplesinfo, pathIN)
# exactly one non-subsample must match the -S sample identifier
matchingSamples = [x for x in info if x.identifier==opts.sampleIdentifier and not x.subsample]
if len(matchingSamples) != 1:
    print "need exactly 1 sample identifier as input with -S !!"
    print matchingSamples
    exit(1)
sample = matchingSamples[0]
# TODO:
# comma-separated --addCollections option -> list of collection names
collections = [x.strip() for x in opts.addCollections.split(',') if len(x.strip()) > 0] if len(opts.addCollections.strip())>0 else []
if len(collections) < 1:
    print "\x1b[31mWARNING: no collections added! Specify the collections to add with the --addCollections option!\x1b[0m"
print 'collections to add:', collections
# loop body continues beyond this chunk
for fileName in filelist:
signals = eval(signals) #backgrounds backgrounds = config.get(run, 'backgrounds') backgrounds = eval(backgrounds) treeVarSet = config.get(run, 'treeVarSet') print 'signals are', signals print 'backgrounds are', backgrounds #variables #TreeVar Array MVA_Vars = {} MVA_Vars['Nominal'] = config.get(treeVarSet, 'Nominal') MVA_Vars['Nominal'] = MVA_Vars['Nominal'].split(' ') #Infofile info = ParseInfo(samplesinfo, path) #Workdir workdir = ROOT.gDirectory.GetPath() #Remove EventForTraining in order to run the MVA directly from the PREP step #TrainCut='%s & !((evt%s)==0 || isData)'%(TCut,'%2') #EvalCut= '%s & ((evt%s)==0 || isData)'%(TCut,'%2') TrainCut = '!((evt%2)==0 || isData)' EvalCut = '((evt%2)==0 || isData)' #TrainCut='%s & EventForTraining==1'%TCut #EvalCut='%s & EventForTraining==0'%TCut if data_as_signal: TrainCut = '1' EvalCut = '1'
#Import after configure to get help message from myutils import BetterConfigParser, progbar, printc, ParseInfo, MvaEvaluator config = BetterConfigParser() config.read(opts.config) anaTag = config.get("Analysis","tag") #get locations: Wdir = config.get('Directories','Wdir') samplesinfo = config.get('Directories','samplesinfo') #systematics INpath = config.get('Directories','MVAin') OUTpath = config.get('Directories','MVAout') info = ParseInfo(samplesinfo,INpath) arglist = opts.discr #RTight_blavla,bsbsb namelistIN = opts.names namelist = namelistIN.split(',') print ('\n-----> SampleList: ', namelist) MVAlist = arglist.split(',') print ('-----> MVAList:', MVAlist) #CONFIG #factory factoryname = config.get('factory','factoryname') # unique training name
def __init__(self, config, region, vars=None, title=None):
    """Set up a plot helper for one region.

    config -- BetterConfigParser with the analysis configuration
    region -- region name; its plot definition lives in section 'Plot:<region>'
    vars   -- optional list of variable names to plot; falls back to the
              'vars' option of the plot section when empty/None
    title  -- optional plot title; empty strings are normalized to None
    """
    self.config = config
    self.region = region
    self.vars = vars
    self.title = title if title and len(title) > 0 else None

    # VHbb namespace: load the shared C++ helper library into ROOT
    VHbbNameSpace = config.get('VHbbNameSpace', 'library')
    returnCode = ROOT.gSystem.Load(VHbbNameSpace)
    if returnCode != 0:
        print(
            "\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"
            % returnCode)
    else:
        print("INFO: loaded VHbbNameSpace: %s" % VHbbNameSpace)

    # additional blinding cut:
    self.addBlindingCut = None
    if self.config.has_option(
            'Plot_general', 'addBlindingCut'
    ):  #contained in plots, cut on the event number
        self.addBlindingCut = self.config.get('Plot_general',
                                              'addBlindingCut')
        print('adding add. blinding cut:', self.addBlindingCut)

    # input locations and sample definitions
    self.samplesPath = config.get('Directories', 'plottingSamples')
    self.samplesDefinitions = config.get('Directories', 'samplesinfo')
    self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
    self.sampleFilesFolder = config.get('Directories', 'samplefiles')
    self.plotPath = config.get('Directories', 'plotpath')

    # plot regions
    self.configSection = 'Plot:%s' % region
    # clean an explicitly passed variable list; otherwise read it from config
    if self.vars and type(self.vars) == list:
        self.vars = [x.strip() for x in self.vars if len(x.strip()) > 0]
    if not self.vars or len(self.vars) < 1:
        varListFromConfig = self.config.get(self.configSection,
                                            'vars').split(',')
        print("VARS::", self.configSection, " => ", varListFromConfig)
        self.vars = [
            x.strip() for x in varListFromConfig if len(x.strip()) > 0
        ]

    # load samples
    self.data = eval(self.config.get(
        self.configSection,
        'Datas'))  # read the data corresponding to each CR (section)
    self.mc = eval(self.config.get(
        'Plot_general', 'samples'))  # read the list of mc samples
    self.total_lumi = eval(self.config.get('General', 'lumi'))
    # a 'Signal' option in the plot section marks this as a signal region
    # and adds the signal sample to the MC stack
    self.signalRegion = False
    if self.config.has_option(self.configSection, 'Signal'):
        self.mc.append(self.config.get(self.configSection, 'Signal'))
        self.signalRegion = True
    self.dataSamples = self.samplesInfo.get_samples(self.data)
    self.mcSamples = self.samplesInfo.get_samples(self.mc)
    # grouping of MC samples into stack components
    self.groupDict = eval(self.config.get('Plot_general', 'Group'))
    self.subcutPlotName = ''
    self.histogramStacks = {}
class CachePlot(object):
    """Caches skimmed trees for plotting.

    For one sample identifier, applies the cuts of all requested plot regions
    and writes the selected events (with a reduced branch list) to the cache
    via TreeCache, so later plotting jobs do not re-read the full ntuples.
    """

    def __init__(self, config, sampleIdentifier, regions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, forceRedo=False, fileList=None):
        """
        config              -- BetterConfigParser with the analysis configuration
        sampleIdentifier    -- identifier of the sample whose files are cached
        regions             -- list of plot region names (duplicates removed)
        splitFilesChunks    -- total number of chunks the file list is split into
        chunkNumber         -- 1-based index of the chunk this job processes
        splitFilesChunkSize -- number of files per chunk (-1: no splitting)
        forceRedo           -- if True, delete and re-create existing caches
        fileList            -- compressed file list from job submission, used
                               for a consistency check at run time
        """
        self.config = config
        self.sampleIdentifier = sampleIdentifier
        self.regions = list(set(regions))
        self.forceRedo = forceRedo
        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = self.config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')
        # both MC and data samples are cached
        self.sampleNames = eval(self.config.get('Plot_general', 'samples'))
        self.dataNames = eval(self.config.get('Plot_general', 'Data'))
        self.samples = self.samplesInfo.get_samples(self.sampleNames + self.dataNames)
        # per-region selection cut
        self.regionsDict = {}
        for region in self.regions:
            treeCut = config.get('Cuts', region)
            self.regionsDict[region] = {'cut': treeCut}
        self.splitFilesChunkSize = splitFilesChunkSize
        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.fileList = FileList.decompress(fileList) if fileList else None
        # load the shared VHbb C++ helper library into ROOT
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print ("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"%returnCode)
        else:
            print ("INFO: loaded VHbbNameSpace: %s"%VHbbNameSpace)

    def printInfo(self):
        """Print the cut applied for each configured region."""
        print ("REGION:".ljust(24),"CUT:")
        for region, regionInfo in self.regionsDict.iteritems():
            print (" > ", region.ljust(20), regionInfo['cut'])

    def run(self):
        """Create (or verify) the cached trees for this sample and chunk."""
        # keep additional branches for plotting; missing config options are
        # tolerated on purpose (best-effort branch collection)
        try:
            keepBranchesPlot = eval(self.config.get('Branches', 'keep_branches_plot'))
        except:
            keepBranchesPlot = []
        try:
            keepBranchesPlot += eval(self.config.get('Branches', 'keep_branches'))
        except:
            pass
        # also keep some branches which might be used later in variables definition and weights
        try:
            for section in self.config.sections():
                if section.startswith('plotDef:') and self.config.has_option(section, 'relPath'):
                    keepBranchesPlot.append(self.config.get(section, 'relPath'))
        except Exception as e:
            print("\x1b[31mERROR: config file contains an error! automatic selection of branches to keep will not work!\x1b[0m")
            print(e)
        try:
            keepBranchesPlot.append(self.config.get('Weights', 'weightF'))
        except:
            pass
        # plotting region cut: branches referenced in the cuts must survive too
        for region, regionInfo in self.regionsDict.iteritems():
            keepBranchesPlot.append(regionInfo['cut'])
        keepBranchesPlotFinal = BranchList(keepBranchesPlot).getListOfBranches()
        print("KEEP:", keepBranchesPlotFinal)

        # ----------------------------------------------------------------------------------------------------------------------
        # cache samples
        # ----------------------------------------------------------------------------------------------------------------------
        for sampleToCache in [self.sampleIdentifier]:
            print ('*'*80)
            print (' ', sampleToCache)
            print ('*'*80)
            # prepare caches for training and evaluation samples
            treeCaches = []
            sampleTree = None

            # for all (sub)samples which come from the same files (sampleIdentifier)
            subsamples = [x for x in self.samples if x.identifier == sampleToCache]
            for sample in subsamples:
                # add cuts for all training regions
                for region, regionInfo in self.regionsDict.iteritems():
                    configSection = 'Plot:%s' % region
                    # cuts
                    sampleCuts = [sample.subcut]
                    if regionInfo['cut']:
                        sampleCuts.append(regionInfo['cut'])
                    if self.config.has_option(configSection, 'Datacut'):
                        sampleCuts.append(self.config.get(configSection, 'Datacut'))
                    if self.config.has_option('Plot_general', 'addBlindingCut'):
                        # BUGFIX: previously appended has_option(...) (the boolean True)
                        # instead of the actual blinding cut expression
                        sampleCuts.append(self.config.get('Plot_general', 'addBlindingCut'))

                    # arbitrary (optional) name for the output tree, used for print-out (the TreeCache object has no idea what it is doing, e.g. dc, plot etc.)
                    cacheName = 'plot:{region}_{sample}'.format(region=region, sample=sample.name)

                    # add cache object
                    tc = TreeCache.TreeCache(
                        name=cacheName,
                        sample=sample.name,
                        cutList=sampleCuts,
                        inputFolder=self.samplesPath,
                        splitFilesChunks=self.splitFilesChunks,
                        chunkNumber=self.chunkNumber,
                        splitFilesChunkSize=self.splitFilesChunkSize,
                        fileList=self.fileList,
                        branches=keepBranchesPlotFinal,
                        config=self.config,
                        debug=True
                    )

                    # check if this part of the sample is already cached
                    isCached = tc.partIsCached()
                    if not isCached or self.forceRedo:
                        if isCached:
                            tc.deleteCachedFiles(chunkNumber=self.chunkNumber)
                        # for the first sample which comes from this files, load the tree
                        if not self.sampleTree:
                            self.sampleTree = SampleTree({'name': sample.identifier, 'folder': self.samplesPath}, splitFilesChunkSize=self.splitFilesChunkSize, chunkNumber=self.chunkNumber, config=self.config, saveMemory=True)
                            if not self.sampleTree or not self.sampleTree.tree:
                                print ("\x1b[31mERROR: creation of sample tree failed!!\x1b[0m")
                                raise Exception("CreationOfSampleTreeFailed")
                            # consistency check on the file list at submission time and now
                            fileListNow = self.sampleTree.getSampleFileNameChunk(self.chunkNumber)
                            if self.fileList and (sorted(self.fileList) != sorted(fileListNow)):
                                print ("\x1b[31mERROR: sample files have changed between submission and run of the job!\x1b[0m")
                                raise Exception("SampleFilesHaveChanged")
                        treeCaches.append(tc.setSampleTree(self.sampleTree).cache())
                    else:
                        print ("INFO: already cached!", tc, "(", tc.hash, ")")

            if len(treeCaches) > 0:
                # run on the tree
                self.sampleTree.process()
            else:
                print ("nothing to do!")
def __init__(self, config, mvaName, useSyst=True, useWeightSyst=True, testRun=False):
    """Set up training inputs for one MVA.

    config        -- BetterConfigParser with the analysis configuration
    mvaName       -- config section of the MVA (cut, variable set, classes, ...)
    useSyst       -- if True, read the weight systematics listed in the MVA section
    useWeightSyst -- reserved flag, not used in this initializer
    testRun       -- if True, only a small subset of samples is used (debugging)
    """
    self.mvaName = mvaName
    # load the shared VHbb C++ helper library into ROOT
    VHbbNameSpace = config.get('VHbbNameSpace', 'library')
    ROOT.gSystem.Load(VHbbNameSpace)
    self.dataFormatVersion = 3
    self.sampleTrees = []
    self.config = config
    self.testRun = testRun
    self.samplesPath = config.get('Directories', 'MVAin')
    self.samplesDefinitions = config.get('Directories','samplesinfo')
    self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
    # region
    self.treeCutName = config.get(mvaName, 'treeCut')
    self.treeCut = config.get('Cuts', self.treeCutName)
    # split in train/eval sets
    self.trainCut = config.get('Cuts', 'TrainCut')
    self.evalCut = config.get('Cuts', 'EvalCut')
    # rescale MC by 2 because of train/eval split
    self.globalRescale = 2.0
    # variables and systematics
    self.treeVarSet = config.get(mvaName, 'treeVarSet')
    self.MVA_Vars = {'Nominal': [x for x in config.get(self.treeVarSet, 'Nominal').strip().split(' ') if len(x.strip()) > 0]}
    self.weightSYS = []
    self.weightSYSweights = {}
    self.systematics = []
    if useSyst:
        print('INFO: use systematics in training!')
        self.systList = eval(self.config.get(mvaName, 'systematics')) if self.config.has_option(mvaName, 'systematics') else []
        for syst in self.systList:
            # the Weights section may use either _UP/_DOWN or _Up/_Down suffixes
            systNameUp = syst+'_UP' if self.config.has_option('Weights',syst+'_UP') else syst+'_Up'
            systNameDown = syst+'_DOWN' if self.config.has_option('Weights',syst+'_DOWN') else syst+'_Down'
            self.systematics.append({
                'name': syst,
                'U': self.config.get('Weights', systNameUp),
                'D': self.config.get('Weights', systNameDown),
                })
    # default: signal vs. background
    self.sampleNames = {
            'SIG_ALL': eval(self.config.get('Plot_general', 'allSIG')),
            'BKG_ALL': eval(self.config.get('Plot_general', 'allBKG')),
            }
    # for multi-output classifiers load dictionary from config
    self.categories = None
    if self.config.has_option(mvaName, 'classDict'):
        self.sampleNames = eval(self.config.get(mvaName, 'classDict'))
        # BUGFIX: was self.samples.keys(), but self.samples is only assigned
        # below — referencing it here raised AttributeError
        self.categories = self.sampleNames.keys()
        print("classes dict:", self.sampleNames)
    elif self.config.has_option(mvaName, 'classes'):
        self.sampleNames = dict(eval(self.config.get(mvaName, 'classes')))
        self.categories = [x[0] for x in eval(self.config.get(mvaName, 'classes'))]
    # resolve sample-name lists into Sample objects, per category
    self.samples = {category: self.samplesInfo.get_samples(samples) for category, samples in self.sampleNames.iteritems()}
    if not self.categories:
        self.categories = self.samples.keys()
    if self.testRun:
        print("\x1b[31mDEBUG: TEST-RUN, using only small subset of samples!\x1b[0m")
def __init__(self, config, sampleIdentifier, trainingRegions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, force=False):
    """Set up caching of training inputs for one sample over several training regions.

    config              -- BetterConfigParser with the analysis configuration
    sampleIdentifier    -- identifier of the sample whose files are processed
    trainingRegions     -- list of training region config sections
    splitFilesChunks    -- total number of chunks the file list is split into
    chunkNumber         -- 1-based index of the chunk this job processes
    splitFilesChunkSize -- number of files per chunk (-1: no splitting)
    force               -- if True, redo existing output
    """
    self.config = config
    self.force = force
    self.sampleIdentifier = sampleIdentifier
    self.trainingRegions = trainingRegions
    self.sampleTree = None
    # prefer a dedicated training sample directory if configured
    if config.has_option('Directories', 'trainingSamples'):
        self.samplesPath = self.config.get('Directories', 'trainingSamples')
    else:
        self.samplesPath = self.config.get('Directories', 'MVAin')
    self.samplesInfo = ParseInfo(samples_path=self.samplesPath, config=self.config)
    self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')
    # union of background/signal sample names over all training regions
    self.backgroundSampleNames = list(
        set(
            sum([
                eval(self.config.get(trainingRegion, 'backgrounds'))
                for trainingRegion in self.trainingRegions
            ], [])))
    self.signalSampleNames = list(
        set(
            sum([
                eval(self.config.get(trainingRegion, 'signals'))
                for trainingRegion in self.trainingRegions
            ], [])))
    # can include DATA in the .h5 files for training
    self.dataSampleNames = list(
        set(
            sum([
                eval(self.config.get(trainingRegion, 'data'))
                if self.config.has_option(trainingRegion, 'data') else []
                for trainingRegion in self.trainingRegions
            ], [])))
    self.samples = self.samplesInfo.get_samples(
        list(
            set(self.backgroundSampleNames + self.signalSampleNames +
                self.dataSampleNames)))
    # per-region: selection cut, input variables, and systematic weight expressions
    self.trainingRegionsDict = {}
    for trainingRegion in self.trainingRegions:
        # the region name itself is used as cut name if no explicit treeCut is set
        treeCutName = config.get(
            trainingRegion, 'treeCut') if config.has_option(
                trainingRegion, 'treeCut') else trainingRegion
        treeVarSet = config.get(trainingRegion, 'treeVarSet').strip()
        #systematics = [x for x in config.get('systematics', 'systematics').split(' ') if len(x.strip())>0]
        # systematics option may be a python list literal or a space-separated string
        if config.has_option(trainingRegion, 'systematics'):
            systematicsString = config.get(trainingRegion, 'systematics').strip()
            if systematicsString.startswith('['):
                systematics = eval(systematicsString)
            else:
                systematics = systematicsString.split(' ')
        else:
            systematics = []
        mvaVars = config.get(treeVarSet, 'Nominal').split(' ')
        weightVars = []
        #for systematic in systematics:
        for syst in systematics:
            # the Weights section may use either _UP/_DOWN or _Up/_Down suffixes
            systNameUp = syst + '_UP' if self.config.has_option(
                'Weights', syst + '_UP') else syst + '_Up'
            systNameDown = syst + '_DOWN' if self.config.has_option(
                'Weights', syst + '_DOWN') else syst + '_Down'
            if self.config.has_option('Weights', systNameUp):
                weightVars.append(self.config.get('Weights', systNameUp))
            if self.config.has_option('Weights', systNameDown):
                weightVars.append(self.config.get('Weights', systNameDown))
        self.trainingRegionsDict[trainingRegion] = {
            'cut': config.get('Cuts', treeCutName),
            'vars': mvaVars,
            'weightVars': weightVars,
        }
    self.TrainCut = config.get('Cuts', 'TrainCut')
    self.EvalCut = config.get('Cuts', 'EvalCut')
    self.splitFilesChunks = splitFilesChunks
    self.chunkNumber = chunkNumber
    self.splitFilesChunkSize = splitFilesChunkSize
    # load the shared VHbb C++ helper library into ROOT
    VHbbNameSpace = config.get('VHbbNameSpace', 'library')
    ROOT.gSystem.Load(VHbbNameSpace)
# read the analysis configuration selected on the command line
config = BetterConfigParser()
config.read(opts.config)
anaTag = config.get("Analysis", "tag")
#get locations:
Wdir = config.get('Directories', 'Wdir')
samplesinfo = config.get('Directories', 'samplesinfo')
#systematics
INpath = config.get('Directories', 'MVAin')
OUTpath = config.get('Directories', 'MVAout')
#read shape systematics
systematics = config.get('systematics', 'systematics')
info = ParseInfo(samplesinfo, INpath)
# comma-separated lists from the command line: discriminators and sample names
arglist = opts.discr  #RTight_blavla,bsbsb
namelistIN = opts.names
namelist = namelistIN.split(',')
print('\n-----> SampleList: ', namelist)
MVAlist = arglist.split(',')
print('-----> MVAList:', MVAlist)
#CONFIG
#factory
factoryname = config.get('factory', 'factoryname')
# unique training name
class SkimsHelper(object):
    """Produce a skimmed ROOT file for one plot region from cached sample trees."""

    def __init__(self, config, region, sampleIdentifier=None, opts=None):
        """
        config           -- BetterConfigParser with the analysis configuration
        region           -- region name; plot definition lives in 'Plot:<region>'
        sampleIdentifier -- optional comma-separated list restricting the samples
        opts             -- parsed command line options (inputDir/outputDir keys)
        """
        self.config = config
        self.region = region
        self.sampleIdentifiers = sampleIdentifier.split(',') if sampleIdentifier and len(sampleIdentifier) > 0 else None
        # VHbb namespace
        VHbbNameSpace = config.get('VHbbNameSpace','library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print ("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"%returnCode)
        else:
            print ("INFO: loaded VHbbNameSpace: %s"%VHbbNameSpace)
        # input/output paths
        self.fileLocator = FileLocator(config=self.config)
        self.pathIN = self.config.get('Directories', opts.inputDir)
        self.pathOUT = self.config.get('Directories', opts.outputDir)
        self.tmpDir = self.config.get('Directories', 'scratch')
        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')
        # plot regions
        self.configSection = 'Plot:%s'%region
        # additional cut to only plot a subset of the region
        self.subcut = None
        if self.config.has_option(self.configSection, 'subcut'):
            self.subcut = self.config.get(self.configSection, 'subcut')
            print("INFO: use cut:", self.subcut)
        # additional global blinding cut:
        self.addBlindingCut = None
        if self.config.has_option('Plot_general','addBlindingCut'): #contained in plots, cut on the event number
            self.addBlindingCut = self.config.get('Plot_general','addBlindingCut')
            print ('adding add. blinding cut:', self.addBlindingCut)
        # load samples
        self.data = eval(self.config.get(self.configSection, 'Datas')) # read the data corresponding to each CR (section)
        self.mc = eval(self.config.get('Plot_general', 'samples')) # read the list of mc samples
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        self.signalRegion = False
        if self.config.has_option(self.configSection, 'Signal'):
            self.mc.append(self.config.get(self.configSection, 'Signal'))
            self.signalRegion = True
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)
        # filter samples used in the plot
        if self.sampleIdentifiers:
            self.dataSamples = [x for x in self.dataSamples if x.identifier in self.sampleIdentifiers]
            self.mcSamples = [x for x in self.mcSamples if x.identifier in self.sampleIdentifiers]

    def prepare(self):
        """Collect the cached file names for all DATA + MC samples; returns self."""
        # add DATA + MC samples
        self.fileNames = []
        for sample in self.dataSamples + self.mcSamples:
            print(sample.identifier)
            # cuts
            sampleCuts = [sample.subcut]
            if self.config.has_option('Cuts', self.region):
                sampleCuts.append(self.config.get('Cuts', self.region))
            if self.config.has_option(self.configSection, 'Datacut'):
                sampleCuts.append(self.config.get(self.configSection, 'Datacut'))
            if self.addBlindingCut:
                sampleCuts.append(self.addBlindingCut)
            # get sample tree from cache
            # BUGFIX: pass self.config instead of relying on a module-global `config`
            self.fileNames += TreeCache.TreeCache(
                sample=sample,
                cutList=sampleCuts,
                inputFolder=self.samplesPath,
                config=self.config
                ).findCachedFileNames()
        if len(self.fileNames) < 1:
            print("\x1b[31mERROR: no files found, run cacheplot!\x1b[0m")
        return self

    def run(self):
        """Write the skim to a scratch file and copy it to the output directory."""
        name = self.config.get('Configuration', 'channel') if self.config.has_option('Configuration', 'channel') else '_'
        timestamp = datetime.datetime.now().strftime("%y%m%d")
        # BUGFIX: use self.region instead of relying on a module-global `region`
        tmpName = self.tmpDir + '/skim_' + name + '_' + self.region + '_' + timestamp + '_tmp.root'
        destName = self.pathOUT + '/skim_' + name + '_' + self.region + '_' + timestamp + '.root'
        sampleTree = SampleTree(self.fileNames, config=self.config)
        # optionally tag every event with the control-sample index of this region
        if self.config.has_option('Plot_general', 'controlSample'):
            controlSampleDict = eval(self.config.get('Plot_general', 'controlSample'))
            controlSample = controlSampleDict[self.region] if self.region in controlSampleDict else -1
            sampleTree.addOutputBranch("controlSample", lambda x: controlSample, branchType="i")
            print("INFO: setting controlSample to", controlSample)
        sampleTree.addOutputTree(tmpName, cut='1', branches='*', friend=False)
        sampleTree.process()
        # copy to final destination
        if sampleTree.getNumberOfOutputTrees() > 0:
            try:
                self.fileLocator.cp(tmpName, destName, force=True)
                print('copy ', tmpName, destName)
                if not self.fileLocator.isValidRootFile(destName):
                    print("\x1b[31mERROR: copy failed, output is broken!\x1b[0m")
                else:
                    # only remove the scratch copy once the destination is verified
                    try:
                        self.fileLocator.rm(tmpName)
                    except Exception as e:
                        print(e)
            except Exception as e:
                print("\x1b[31mERROR: copy failed!", e, "\x1b[0m")
def __init__(self, config, mvaName):
    """Set up training inputs for one MVA (data format v2, CMVAv2 b-tag weights).

    config  -- BetterConfigParser with the analysis configuration
    mvaName -- config section of the MVA (cut, variable set, ...)
    """
    self.mvaName = mvaName
    # load the shared VHbb C++ helper library into ROOT
    VHbbNameSpace = config.get('VHbbNameSpace', 'library')
    ROOT.gSystem.Load(VHbbNameSpace)
    self.dataFormatVersion = 2
    self.sampleTrees = []
    self.config = config
    self.samplesPath = config.get('Directories', 'MVAin')
    self.samplesDefinitions = config.get('Directories', 'samplesinfo')
    self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
    # region
    self.treeCutName = config.get(mvaName, 'treeCut')
    self.treeCut = config.get('Cuts', self.treeCutName)
    # split in train/eval sets
    self.trainCut = config.get('Cuts', 'TrainCut')
    self.evalCut = config.get('Cuts', 'EvalCut')
    # rescale MC by 2 because of train/eval split
    self.globalRescale = 2.0
    # variables and systematics: one variable list per systematic variation
    self.treeVarSet = config.get(mvaName, 'treeVarSet')
    self.systematics = config.get('systematics', 'systematics').strip().split(' ')
    self.MVA_Vars = {
        'Nominal': [
            x for x in config.get(self.treeVarSet, 'Nominal').strip().split(' ')
            if len(x.strip()) > 0
        ]
    }
    for sys in self.systematics:
        self.MVA_Vars[sys] = [
            x for x in config.get(self.treeVarSet, sys).strip().split(' ')
            if len(x.strip()) > 0
        ]
    # build b-tag weight systematics: base weight without b-tag SF times the
    # per-source CMVAv2 variation weight, for Up and Down
    self.weightSYS = []
    self.weightWithoutBtag = self.config.get('Weights', 'weight_noBTag')
    self.weightSYSweights = {}
    for d in ['Up', 'Down']:
        for syst in [
                'HFStats1', 'HFStats2', 'LF', 'HF', 'LFStats1', 'LFStats2',
                'cErr2', 'cErr1', 'JES'
        ]:
            systFullName = "btag_" + syst + "_" + d
            weightName = "bTagWeightCMVAV2_Moriond_" + syst + d
            self.weightSYSweights[
                systFullName] = self.weightWithoutBtag + '*' + weightName
            self.weightSYS.append(systFullName)
    # samples: inclusive signal vs. inclusive background
    self.sampleNames = {
        # 'BKG_TT': eval(self.config.get('Plot_general', 'TT')),
        # 'BKG_ST': eval(self.config.get('Plot_general', 'ST')),
        # 'BKG_VV': eval(self.config.get('Plot_general', 'VV')),
        # 'BKG_DY2b': eval(self.config.get('Plot_general', 'DY2b')),
        # 'BKG_DY1b': eval(self.config.get('Plot_general', 'DY1b')),
        # 'BKG_DY0b': eval(self.config.get('Plot_general', 'DYlight')),
        # 'SIG_ggZH': eval(self.config.get('Plot_general', 'ggZH')),
        # 'SIG_qqZH': eval(self.config.get('Plot_general', 'qqZH')),
        'SIG_ALL': eval(self.config.get('Plot_general', 'allSIG')),
        'BKG_ALL': eval(self.config.get('Plot_general', 'allBKG')),
    }
    # resolve sample-name lists into Sample objects, per category
    self.samples = {
        category: self.samplesInfo.get_samples(samples)
        for category, samples in self.sampleNames.iteritems()
    }
                  default=None, help="max number of files to process")
(opts, args) = parser.parse_args(argv)
# read the analysis configuration selected on the command line
config = BetterConfigParser()
config.read(opts.config)
# file list comes compressed from the job submission step
fileList = FileList.decompress(
    opts.fileList) if len(opts.fileList) > 0 else None
pathOUT = config.get('Directories', 'PREPout')
samplefiles = config.get('Directories', 'samplefiles')
sampleconf = config
whereToLaunch = config.get('Configuration', 'whereToLaunch')
info = ParseInfo(samples_path=None, config=config)
# keep only top-level samples, optionally restricted by -S identifiers
samples = [
    x for x in info
    if not x.subsample and (len(opts.sampleIdentifier) == 0
                            or x.identifier in opts.sampleIdentifier.split(','))
]
treeCopier = copytreePSI.CopyTreePSI(config=config)
# optional cap on the number of samples processed in this job
if opts.limit and len(samples) > int(opts.limit):
    samples = samples[:int(opts.limit)]
# copy/skim the input trees for each selected sample
for sample in samples:
    treeCopier.copytreePSI(pathIN=samplefiles,
                           pathOUT=pathOUT,
                           folderName=sample.identifier,
                           skimmingCut=sample.addtreecut,
                           fileList=fileList)
class SkimsHelper(object):
    """Produce a skimmed ROOT file for one plot region from cached sample trees.

    This variant refuses to run when the required caches are missing instead of
    silently producing an incomplete skim.
    """

    def __init__(self, config, region, sampleIdentifier=None, opts=None):
        """
        config           -- BetterConfigParser with the analysis configuration
        region           -- region name; plot definition lives in 'Plot:<region>'
        sampleIdentifier -- optional comma-separated list restricting the samples
        opts             -- parsed command line options (inputDir/outputDir keys)
        """
        self.config = config
        self.region = region
        self.sampleIdentifiers = sampleIdentifier.split(
            ',') if sampleIdentifier and len(sampleIdentifier) > 0 else None
        # VHbb namespace
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print(
                "\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"
                % returnCode)
        else:
            print("INFO: loaded VHbbNameSpace: %s" % VHbbNameSpace)
        # input/output paths
        self.fileLocator = FileLocator(config=self.config)
        self.pathIN = self.config.get('Directories', opts.inputDir)
        self.pathOUT = self.config.get('Directories', opts.outputDir)
        self.tmpDir = self.config.get('Directories', 'scratch')
        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath, config=self.config)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')
        # plot regions
        self.configSection = 'Plot:%s' % region
        # additional cut to only plot a subset of the region
        self.subcut = None
        if self.config.has_option(self.configSection, 'subcut'):
            self.subcut = self.config.get(self.configSection, 'subcut')
            print("INFO: use cut:", self.subcut)
        # additional global blinding cut:
        self.addBlindingCut = None
        if self.config.has_option(
                'Plot_general', 'addBlindingCut'
        ):  #contained in plots, cut on the event number
            self.addBlindingCut = self.config.get('Plot_general',
                                                  'addBlindingCut')
            print('adding add. blinding cut:', self.addBlindingCut)
        # load samples
        self.data = eval(self.config.get(
            self.configSection,
            'Datas'))  # read the data corresponding to each CR (section)
        self.mc = eval(self.config.get(
            'Plot_general', 'samples'))  # read the list of mc samples
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        self.signalRegion = False
        if self.config.has_option(self.configSection, 'Signal'):
            self.mc.append(self.config.get(self.configSection, 'Signal'))
            self.signalRegion = True
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)
        # filter samples used in the plot
        if self.sampleIdentifiers:
            self.dataSamples = [
                x for x in self.dataSamples
                if x.identifier in self.sampleIdentifiers
            ]
            self.mcSamples = [
                x for x in self.mcSamples
                if x.identifier in self.sampleIdentifiers
            ]

    def prepare(self):
        """Collect the cached file names for all DATA + MC samples; returns self.

        Raises Exception("NotCached") when any sample has no cache yet.
        """
        # add DATA + MC samples
        self.fileNames = []
        for sample in self.dataSamples + self.mcSamples:
            print(sample.identifier)
            # cuts
            sampleCuts = [sample.subcut]
            if self.config.has_option('Cuts', self.region):
                sampleCuts.append(self.config.get('Cuts', self.region))
            if self.config.has_option(self.configSection, 'Datacut'):
                sampleCuts.append(
                    self.config.get(self.configSection, 'Datacut'))
            if self.addBlindingCut:
                sampleCuts.append(self.addBlindingCut)
            # get sample tree from cache
            # BUGFIX: pass self.config instead of relying on a module-global `config`
            tc = TreeCache.TreeCache(sample=sample,
                                     cutList=sampleCuts,
                                     inputFolder=self.samplesPath,
                                     config=self.config)
            if tc.isCached():
                self.fileNames += tc.findCachedFileNames()
            else:
                print("ERROR: not cached, run cacheplot again")
                raise Exception("NotCached")
        if len(self.fileNames) < 1:
            print("\x1b[31mERROR: no files found, run cacheplot!\x1b[0m")
        return self

    def run(self):
        """Write the skim to a scratch file and copy it to the output directory."""
        name = self.config.get('Configuration',
                               'channel') if self.config.has_option(
                                   'Configuration', 'channel') else '_'
        timestamp = datetime.datetime.now().strftime("%y%m%d")
        # BUGFIX: use self.region instead of relying on a module-global `region`
        tmpName = self.tmpDir + '/skim_' + name + '_' + self.region + '_' + timestamp + '_tmp.root'
        destName = self.pathOUT + '/skim_' + name + '_' + self.region + '_' + timestamp + '.root'
        sampleTree = SampleTree(self.fileNames, config=self.config)
        # optionally tag every event with the control-sample index of this region
        if self.config.has_option('Plot_general', 'controlSample'):
            controlSampleDict = eval(
                self.config.get('Plot_general', 'controlSample'))
            controlSample = controlSampleDict[
                self.region] if self.region in controlSampleDict else -1
            sampleTree.addOutputBranch("controlSample",
                                       lambda x: controlSample,
                                       branchType="i")
            print("INFO: setting controlSample to", controlSample)
        sampleTree.addOutputTree(tmpName, cut='1', branches='*', friend=False)
        sampleTree.process()
        # copy to final destination
        if sampleTree.getNumberOfOutputTrees() > 0:
            try:
                self.fileLocator.cp(tmpName, destName, force=True)
                print('copy ', tmpName, destName)
                if not self.fileLocator.isValidRootFile(destName):
                    print(
                        "\x1b[31mERROR: copy failed, output is broken!\x1b[0m")
                else:
                    # only remove the scratch copy once the destination is verified
                    try:
                        self.fileLocator.rm(tmpName)
                    except Exception as e:
                        print(e)
            except Exception as e:
                print("\x1b[31mERROR: copy failed!", e, "\x1b[0m")
def __init__(self, config, mvaName):
    """Set up a scikit-learn based MVA training from the config.

    config  -- BetterConfigParser with the analysis configuration
    mvaName -- config section of the MVA (variables, samples, MVAsettings, ...)
    """
    self.dataRepresentationVersion = 2
    self.config = config
    self.samplesPath = config.get('Directories', 'MVAin')
    self.samplesInfo = ParseInfo(samples_path=self.samplesPath, config=self.config)
    self.sampleFilesFolder = config.get('Directories', 'samplefiles')
    self.logpath = config.get('Directories', 'logpath')
    self.treeVarSet = config.get(mvaName, 'treeVarSet')
    self.mvaName = mvaName
    # TMVA-like 'key=value:key=value' classifier parameter string
    self.MVAsettings = config.get(mvaName,'MVAsettings')
    self.factoryname = 'scikit-test1'
    # load the shared VHbb C++ helper library into ROOT
    VHbbNameSpace = config.get('VHbbNameSpace', 'library')
    ROOT.gSystem.Load(VHbbNameSpace)
    # variables
    self.MVA_Vars = {}
    self.MVA_Vars['Nominal'] = config.get(self.treeVarSet, 'Nominal').strip().split(' ')
    # samples
    self.backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
    self.signalSampleNames = eval(config.get(mvaName, 'signals'))
    self.samples = {
        'BKG': self.samplesInfo.get_samples(self.backgroundSampleNames),
        'SIG': self.samplesInfo.get_samples(self.signalSampleNames),
    }
    # MVA signal region cuts
    self.treeCutName = config.get(mvaName, 'treeCut')
    self.treeCut = config.get('Cuts', self.treeCutName)
    # split in train/test samples
    self.datasets = ['train', 'test']
    self.varsets = ['X', 'y', 'sample_weight']
    self.trainCut = config.get('Cuts', 'TrainCut')
    self.evalCut = config.get('Cuts', 'EvalCut')
    print("TRAINING CUT:", self.trainCut)
    print("TEST CUT:", self.evalCut)
    # rescale MC by 2 because of train/eval split
    self.globalRescale = 2.0
    # default parameters; entries from MVAsettings override these below
    self.parameters = {
        'factoryname': self.factoryname,
        'mvaName': self.mvaName,
        'MVAregionCut': self.treeCutName + ': ' + self.treeCut,
        #'classifier': 'GradientBoostingClassifier',
        'classifier': 'RandomForestClassifier',
        #'classifier': 'ExtraTreesClassifier',
        #'classifier': 'FT_GradientBoostingClassifier',
        'max_depth': None,
        'max_leaf_nodes': None,
        'class_weight': 'balanced',
        #'criterion': 'friedman_mse',
        'criterion': 'gini',
        #'n_estimators': 3000,
        'n_estimators': 400,
        #'learning_rate': 0.1,
        'algorithm': 'SAMME.R',
        #'min_samples_leaf': 100,
        'splitter': 'best',
        'max_features': 4,
        'subsample': 0.6,
        'limit': -1,
        'additional_signal_weight': 1.0,
        'min_impurity_split': 0.0,
        'bootstrap': True,
    }
    # load parameters from config in a format similar to Root TMVA parameter string
    # NOTE(review): values are eval'ed — the config file must be trusted input
    self.MVAsettingsEvaluated = []
    for mvaSetting in self.MVAsettings.split(':'):
        self.parameters[mvaSetting.split('=')[0].strip()] = eval(mvaSetting.split('=')[1].strip())
        try:
            self.MVAsettingsEvaluated.append('%s'%mvaSetting.split('=')[0].strip() + '=' + '%r'%self.parameters[mvaSetting.split('=')[0].strip()])
        except:
            print("???:", mvaSetting)
            self.MVAsettingsEvaluated.append(mvaSetting)
    self.MVAsettingsEvaluated = ':'.join(self.MVAsettingsEvaluated)
class MvaTrainingHelper(object):
    """Train a scikit-learn classifier (BDT/forest/SVM/MLP/XGB) on cached ntuples.

    Reads sample definitions and cuts from the config, converts cached ROOT
    trees into numpy arrays (with disk caching of the arrays), and trains the
    classifier selected by parameters['classifier'].

    NOTE(review): run() ends abruptly at the end of this chunk (mid histogram
    fill) — the remainder of the method appears to be missing from this file.
    Written for Python 2 (iteritems/iterkeys, sha224 on str).
    """

    def __init__(self, config, mvaName):
        """Configure the training from the .ini config section `mvaName`."""
        # version tag baked into the numpy-array cache key
        self.dataRepresentationVersion = 2
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath, config=self.config)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.logpath = config.get('Directories', 'logpath')
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.mvaName = mvaName
        self.MVAsettings = config.get(mvaName,'MVAsettings')
        self.factoryname = 'scikit-test1'
        # load the shared C++ helper library used by tree formulas
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
        # variables: input feature expressions, space-separated in config
        self.MVA_Vars = {}
        self.MVA_Vars['Nominal'] = config.get(self.treeVarSet, 'Nominal').strip().split(' ')
        # samples
        # NOTE(review): eval() on config strings — config must be trusted input
        self.backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
        self.signalSampleNames = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(self.backgroundSampleNames),
            'SIG': self.samplesInfo.get_samples(self.signalSampleNames),
        }
        # MVA signal region cuts
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)
        # split in train/test samples
        self.datasets = ['train', 'test']
        self.varsets = ['X', 'y', 'sample_weight']
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')
        print("TRAINING CUT:", self.trainCut)
        print("TEST CUT:", self.evalCut)
        # compensates for the train/test split when scaling MC to x-section
        self.globalRescale = 2.0
        # default parameters; entries may be overridden below by MVAsettings
        self.parameters = {
            'factoryname': self.factoryname,
            'mvaName': self.mvaName,
            'MVAregionCut': self.treeCutName + ': ' + self.treeCut,
            #'classifier': 'GradientBoostingClassifier',
            'classifier': 'RandomForestClassifier',
            #'classifier': 'ExtraTreesClassifier',
            #'classifier': 'FT_GradientBoostingClassifier',
            'max_depth': None,
            'max_leaf_nodes': None,
            'class_weight': 'balanced',
            #'criterion': 'friedman_mse',
            'criterion': 'gini',
            #'n_estimators': 3000,
            'n_estimators': 400,
            #'learning_rate': 0.1,
            'algorithm': 'SAMME.R',
            #'min_samples_leaf': 100,
            'splitter': 'best',
            'max_features': 4,
            'subsample': 0.6,
            'limit': -1,
            'additional_signal_weight': 1.0,
            'min_impurity_split': 0.0,
            'bootstrap': True,
        }
        # load parameters from config in a format similar to Root TMVA parameter string
        # (colon-separated "key=value" pairs; values are eval'd Python expressions)
        self.MVAsettingsEvaluated = []
        for mvaSetting in self.MVAsettings.split(':'):
            self.parameters[mvaSetting.split('=')[0].strip()] = eval(mvaSetting.split('=')[1].strip())
            try:
                self.MVAsettingsEvaluated.append('%s'%mvaSetting.split('=')[0].strip() + '=' + '%r'%self.parameters[mvaSetting.split('=')[0].strip()])
            except:
                # keep the raw setting string if it cannot be re-rendered
                print("???:", mvaSetting)
                self.MVAsettingsEvaluated.append(mvaSetting)
        self.MVAsettingsEvaluated = ':'.join(self.MVAsettingsEvaluated)

    # load numpy arrays with training/testing data
    def loadCachedNumpyArrays(self, cachedFilesPath):
        """Load self.data from the pickle cache; return True on success.

        Any failure (missing file, unpickling error) is treated as a cache
        miss and returns False.
        """
        cached = True
        try:
            with open(cachedFilesPath + '/scikit_input.dmp', 'rb') as inputFile:
                self.data = pickle.load(inputFile)
            print("INFO: found numpy arrays for input in:", cachedFilesPath)
        except:
            cached = False
        return cached

    # save numpy arrays with training/testing data
    def writeNumpyArrays(self, cachedFilesPath):
        """Pickle self.data to the cache directory."""
        with open(cachedFilesPath + '/scikit_input.dmp', 'wb') as outputFile:
            pickle.dump(self.data, outputFile)
        print("INFO: wrote numpy arrays for input to:", cachedFilesPath)

    def getCachedNumpyArrayPath(self):
        """Return the cache directory, keyed by a sha224 of cut/vars/samples/version."""
        identifier = self.treeCut + '__VAR:' + ' '.join(self.MVA_Vars['Nominal']) + '__SIG:' + '/'.join(self.signalSampleNames) + '__BKG:' + '/'.join(self.backgroundSampleNames) + '__V:%r'%self.dataRepresentationVersion
        # Python 2: sha224 accepts str directly (would need .encode() on py3)
        varsHash = hashlib.sha224(identifier).hexdigest()
        cachedFilesPath = self.logpath + '/../cache/' + varsHash + '/'
        return cachedFilesPath

    def getHash(self):
        """Return a short (8 hex chars) id of this training configuration."""
        identifier = self.treeCut + '__VAR:' + ' '.join(self.MVA_Vars['Nominal']) + '__SIG:' + '/'.join(self.signalSampleNames) + '__BKG:' + '/'.join(self.backgroundSampleNames) + '__PAR:%r'%self.parameters
        return hashlib.sha224(identifier).hexdigest()[:8]

    def prepare(self):
        """Fill self.data (train/test X, y, sample_weight) from cached ROOT trees.

        Uses the on-disk numpy cache if present; otherwise iterates all
        events of all BKG/SIG samples and evaluates the feature formulas.
        Returns self for chaining.
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/testing trees
        # ----------------------------------------------------------------------------------------------------------------------
        self.sampleTrees = []
        categories = ['BKG', 'SIG']
        datasetParts = {'train': self.trainCut, 'test': self.evalCut}
        cachedFilesPath = self.getCachedNumpyArrayPath()
        try:
            os.makedirs(cachedFilesPath)
        except:
            # directory may already exist
            pass
        # load numpy arrays from disk if they have been already created
        if self.loadCachedNumpyArrays(cachedFilesPath):
            return self
        arrayLists = {datasetName:[] for datasetName in datasetParts.iterkeys()}
        weightLists = {datasetName:[] for datasetName in datasetParts.iterkeys()}
        targetLists = {datasetName:[] for datasetName in datasetParts.iterkeys()}
        # standard weight expression
        weightF = self.config.get('Weights','weightF')
        for category in categories:
            for sample in self.samples[category]:
                print ('*'*80,'\n%s\n'%sample,'*'*80)
                for datasetName, additionalCut in datasetParts.iteritems():
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)
                    # get ROOT tree for selected sample & region cut
                    tc = TreeCache.TreeCache(sample=sample, cutList=sampleCuts, inputFolder=self.samplesPath, config=self.config, debug=True)
                    sampleTree = tc.getTree()
                    if sampleTree:
                        treeScale = sampleTree.getScale(sample) * self.globalRescale
                        print ('scale:', treeScale)
                        # initialize numpy array
                        nSamples = sampleTree.GetEntries()
                        features = self.MVA_Vars['Nominal']
                        nFeatures = len(features)
                        print('nFeatures:', nFeatures)
                        inputData = np.zeros((nSamples, nFeatures), dtype=np.float32)
                        # initialize formulas for ROOT tree
                        for feature in features:
                            sampleTree.addFormula(feature)
                        sampleTree.addFormula(weightF)
                        # fill numpy array from ROOT tree
                        for i, event in enumerate(sampleTree):
                            for j, feature in enumerate(features):
                                inputData[i, j] = sampleTree.evaluate(feature)
                            # total weight comes from weightF (btag, lepton sf, ...) and treeScale to scale MC to x-section
                            totalWeight = treeScale * sampleTree.evaluate(weightF)
                            weightLists[datasetName].append(totalWeight)
                            # class label = index in categories (BKG=0, SIG=1)
                            targetLists[datasetName].append(categories.index(category))
                        arrayLists[datasetName].append(inputData)
                    else:
                        print ("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")
        # concatenate all data from different samples
        self.data = {
            'train': {
                'X': np.concatenate(arrayLists['train'], axis=0),
                'y': np.array(targetLists['train'], dtype=np.float32),
                'sample_weight': np.array(weightLists['train'], dtype=np.float32),
            },
            'test': {
                'X': np.concatenate(arrayLists['test'], axis=0),
                'y': np.array(targetLists['test'], dtype=np.float32),
                'sample_weight': np.array(weightLists['test'], dtype=np.float32),
            },
        }
        # write numpy arrays to disk
        self.writeNumpyArrays(cachedFilesPath)
        return self

    def verify_data(self):
        """Print shapes and check X/y/sample_weight agree in length per dataset."""
        valid = True
        for dataset in self.datasets:
            for var in self.varsets:
                print("DEBUG: self.data['{dataset}']['{var}'].shape = {shape}".format(dataset=dataset, var=var, shape=self.data[dataset][var].shape))
        for dataset in self.datasets:
            for i in range(len(self.varsets)-1):
                valid = valid and self.data[dataset][self.varsets[i]].shape[0] == self.data[dataset][self.varsets[i+1]].shape[0]
        return valid

    def run(self):
        """Build the classifier chosen in parameters['classifier'], preprocess,
        re-weight, train, and evaluate (ROC/quantiles/score histograms).

        NOTE(review): this method is truncated at the end of the available
        source — the final histogram-filling loop is incomplete.
        """
        if not self.verify_data():
            print ("\x1b[31mERROR: training input data array shapes are incompatible!\x1b[0m")
            raise Exception("BadTrainingInputData")
        # set when the chosen classifier relies on manual class balancing below
        applyClassWeights = False
        if self.parameters['classifier'] == 'GradientBoostingClassifier':
            clf = GradientBoostingClassifier(
                min_samples_leaf=self.parameters['min_samples_leaf'],
                max_depth=self.parameters['max_depth'],
                max_leaf_nodes=self.parameters['max_leaf_nodes'],
                criterion=self.parameters['criterion'],
                max_features=self.parameters['max_features'],
                n_estimators=self.parameters['n_estimators'],
                learning_rate=self.parameters['learning_rate'],
                subsample=self.parameters['subsample'],
                min_impurity_split=self.parameters['min_impurity_split'],
            )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'RandomForestClassifier':
            clf = RandomForestClassifier(
                min_samples_leaf=self.parameters['min_samples_leaf'],
                max_depth=self.parameters['max_depth'],
                max_leaf_nodes=self.parameters['max_leaf_nodes'],
                criterion=self.parameters['criterion'],
                max_features=self.parameters['max_features'],
                n_estimators=self.parameters['n_estimators'],
                bootstrap=self.parameters['bootstrap'],
            )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'ExtraTreesClassifier':
            clf = ExtraTreesClassifier(
                min_samples_leaf=self.parameters['min_samples_leaf'],
                max_depth=self.parameters['max_depth'],
                max_leaf_nodes=self.parameters['max_leaf_nodes'],
                criterion=self.parameters['criterion'],
                max_features=self.parameters['max_features'],
                n_estimators=self.parameters['n_estimators'],
                bootstrap=self.parameters['bootstrap'],
            )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'FT_GradientBoostingClassifier':
            # feature transform with random trees embedding, then GBC
            rt = RandomTreesEmbedding(max_depth=3, n_estimators=20, random_state=0)
            clf0 = GradientBoostingClassifier(
                min_samples_leaf=self.parameters['min_samples_leaf'],
                max_depth=self.parameters['max_depth'],
                max_leaf_nodes=self.parameters['max_leaf_nodes'],
                criterion=self.parameters['criterion'],
                max_features=self.parameters['max_features'],
                n_estimators=self.parameters['n_estimators'],
                learning_rate=self.parameters['learning_rate'],
                subsample=self.parameters['subsample'],
                min_impurity_split=self.parameters['min_impurity_split'],
            )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
            clf = make_pipeline(rt, clf0)
        elif self.parameters['classifier'] == 'XGBClassifier':
            clf = XGBClassifier(
                learning_rate=self.parameters['learning_rate'],
                max_depth=self.parameters['max_depth'],
                n_estimators=self.parameters['n_estimators'],
                objective='binary:logitraw',
                colsample_bytree=self.parameters['colsample_bytree'],
                subsample=self.parameters['subsample'],
                min_child_weight=self.parameters['min_child_weight'],
                gamma=self.parameters['gamma'] if 'gamma' in self.parameters else 0.0,
                #reg_alpha=8,
                reg_lambda=self.parameters['reg_lambda'] if 'reg_lambda' in self.parameters else 1.0,
                reg_alpha=self.parameters['reg_alpha'] if 'reg_alpha' in self.parameters else 0.0,
            )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'MLPClassifier':
            # pass through only the parameters MLPClassifier understands
            classifierParams = {k:v for k,v in self.parameters.iteritems() if k in ['solver', 'alpha', 'hidden_layer_sizes', 'max_iter', 'warm_start', 'learning_rate_init', 'learning_rate', 'momentum', 'epsilon', 'beta_1', 'beta_2', 'validation_fraction', 'early_stopping']}
            clf = MLPClassifier(**classifierParams)
        elif self.parameters['classifier'] in ['SVC', 'LinearSVC']:
            '''
            clf = SVC(
                C=1.0,
                cache_size=4000,
                class_weight='balanced',
                coef0=0.0,
                decision_function_shape='ovr',
                degree=3,
                gamma='auto',
                kernel='rbf',
                max_iter=100000,
                probability=False,
                random_state=None,
                shrinking=True,
                tol=0.001,
                verbose=True
            )
            '''
            # optional bagging: 'bagged' = number of bags (falsy disables)
            bagged = int(self.parameters['bagged']) if 'bagged' in self.parameters else False
            if self.parameters['classifier'] == 'LinearSVC':
                clf = LinearSVC(
                    class_weight='balanced',
                    dual=self.parameters['dual'],
                    max_iter=self.parameters['max_iter'],
                    C=self.parameters['C'],
                    penalty=self.parameters['penalty'],
                    loss=self.parameters['loss'],
                    tol=self.parameters['tol'],
                    verbose=True,
                )
            else:
                # classifier='SVC':C=random.choice([1.0, 10.0, 100.0, 500.0, 1000.0]):kernel=random.choice(['rbf','poly','linear']):degree=random.choice([2,3,4]):gamma=random.choice(['auto', 0.1, 0.3, 0.6]):shrinking=random.choice([True, False]):max_iter=10000:penalty=random.choice(['l1','l2']):tol=random.choice([0.005, 0.001, 0.0005, 0.0001]):cache_size=1000
                clf = SVC(
                    C=self.parameters['C'],
                    cache_size=self.parameters['cache_size'],
                    class_weight='balanced',
                    coef0=0.0,
                    decision_function_shape='ovr',
                    degree=self.parameters['degree'],
                    gamma=self.parameters['gamma'],
                    kernel=self.parameters['kernel'],
                    max_iter=self.parameters['max_iter'],
                    probability=False,
                    random_state=None,
                    shrinking=self.parameters['shrinking'],
                    tol=self.parameters['tol'],
                    verbose=True
                )
            if bagged:
                n_estimators = bagged
                if 'bag_oversampling' in self.parameters:
                    n_estimators = int(n_estimators * self.parameters['bag_oversampling'])
                clf0 = clf
                clf = BaggingClassifier(
                    clf0,
                    max_samples=1.0 / bagged,
                    max_features=self.parameters['baggedfeatures'] if 'baggedfeatures' in self.parameters else 1.0,
                    bootstrap_features=self.parameters['bootstrapfeatures'] if 'bootstrapfeatures' in self.parameters else False,
                    n_estimators=n_estimators,
                )
        else:
            # default: AdaBoost on a decision tree stump/tree
            clf = AdaBoostClassifier(
                DecisionTreeClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'],
                    max_depth=self.parameters['max_depth'],
                    class_weight=self.parameters['class_weight'],
                    criterion=self.parameters['criterion'],
                    splitter=self.parameters['splitter'],
                    max_features=self.parameters['max_features'],
                ),
                n_estimators=self.parameters['n_estimators'],
                learning_rate=self.parameters['learning_rate'],
                algorithm=self.parameters['algorithm'],
            )
        #with open("/mnt/t3nfs01/data01/shome/berger_p2/VHbb/CMSSW_9_4_0_pre3/src/Xbb/python/logs_v25//test-scikit-svm/Logs//../cache/b7d92f50a52f8474e66cf4e2c3ad3fa4725aa489e7a6b288e4ed3855//clf2018-01-31_18-22-38_be9479a2.pkl","rb") as inputFile:
        #    clf = pickle.load(inputFile)
        # preprocessing: optional feature scaling, fitted on the training set only
        print("transformation...")
        if 'scaler' in self.parameters:
            if self.parameters['scaler'] == 'standard':
                self.scaler = preprocessing.StandardScaler().fit(self.data['train']['X'])
            elif self.parameters['scaler'] == 'minmax':
                self.scaler = preprocessing.MinMaxScaler().fit(self.data['train']['X'])
            elif self.parameters['scaler'] == 'robust':
                self.scaler = preprocessing.RobustScaler().fit(self.data['train']['X'])
            else:
                self.scaler = None
        else:
            self.scaler = None
        if self.scaler:
            self.data['train']['X'] = self.scaler.transform(self.data['train']['X'])
            self.data['test']['X'] = self.scaler.transform(self.data['test']['X'])
        # SHUFFLE all samples before
        self.shuffle = False
        if self.shuffle:
            print("shuffle input data...")
            for dataset in self.datasets:
                nSamples = self.data[dataset][self.varsets[0]].shape[0]
                randomPermutation = np.random.permutation(nSamples)
                for var in self.varsets:
                    self.data[dataset][var] = np.take(self.data[dataset][var], randomPermutation, axis=0)
        # LIMIT number of training samples
        # recommended to also shuffle samples before, because they are ordered by signal/background
        limitNumTrainingSamples = self.parameters['limit']
        if (limitNumTrainingSamples > 0):
            print("limit training samples to:", limitNumTrainingSamples)
            #for dataset in self.datasets:
            #    for var in self.varsets:
            #        self.data[dataset][var] = self.data[dataset][var][0:limitNumTrainingSamples]
            for dataset in self.datasets:
                self.data[dataset] = resample(self.data[dataset], n_samples=limitNumTrainingSamples, replace=False)
        # oversample: duplicate events proportionally to their weight
        upscale = self.parameters['upscalefactor'] if 'upscalefactor' in self.parameters else None
        if upscale:
            upscalemax = self.parameters['upscalemax'] if 'upscalemax' in self.parameters else 10
            upscalesignal = self.parameters['upscalefactorsignal'] if 'upscalefactorsignal' in self.parameters else 1.0
            #upscalefactorsignal
            indices = []
            for i in range(len(self.data['train']['sample_weight'])):
                #print(x)
                x = self.data['train']['sample_weight'][i]
                if self.data['train']['y'][i] > 0.5:
                    x *= upscalesignal
                n = x * upscale
                # limit oversampling factor!
                if n > upscalemax:
                    n = upscalemax
                if n < 1:
                    n = 1
                intN = int(n)
                # repeat event index intN times
                indices += [i]*intN
                #floatN = n-intN
                #if floatN > 0:
                #    if random.uniform(0.0,1.0) < floatN:
                #        indices += [i]
            self.data['train']['X'] = self.data['train']['X'][indices]
            self.data['train']['y'] = self.data['train']['y'][indices]
            self.data['train']['sample_weight'] = self.data['train']['sample_weight'][indices]
            self.verify_data()
        # BALANCE weights
        # calculate total weights and class_weights
        nSig = len([x for x in self.data['train']['y'] if x >= 0.5])
        nBkg = len([x for x in self.data['train']['y'] if x < 0.5])
        print("#SIG:", nSig)
        print("#BKG:", nBkg)
        weightsSignal = []
        weightsBackground = []
        for i in range(len(self.data['train']['sample_weight'])):
            if self.data['train']['y'][i] < 0.5:
                weightsBackground.append(self.data['train']['sample_weight'][i])
            else:
                weightsSignal.append(self.data['train']['sample_weight'][i])
        weightsSignal.sort()
        weightsBackground.sort()
        totalWeightSignal = sum(weightsSignal)
        totalWeightBackground = sum(weightsBackground)
        signalReweight = (totalWeightSignal+totalWeightBackground)/totalWeightSignal * self.parameters['additional_signal_weight']
        backgroundReweight = (totalWeightSignal+totalWeightBackground)/totalWeightBackground
        print("SUM of weights for signal:", totalWeightSignal)
        print("SUM of weights for background:", totalWeightBackground)
        if applyClassWeights:
            print("re-weight signals by:", signalReweight)
            print("re-weight background by:", backgroundReweight)
            for i in range(len(self.data['train']['sample_weight'])):
                if self.data['train']['y'][i] < 0.5:
                    self.data['train']['sample_weight'][i] *= backgroundReweight
                else:
                    self.data['train']['sample_weight'][i] *= signalReweight
        else:
            print("DO NOT re-weight signals by:", signalReweight)
        print("...")
        # TRAINING
        learningCurve = []
        if self.parameters['classifier'] == 'XGBClassifier':
            clf = clf.fit(self.data['train']['X'], self.data['train']['y'], self.data['train']['sample_weight'], verbose=True)
        else:
            # some estimators do not accept sample_weight -> fall back to X, y only
            try:
                clf = clf.fit(**self.data['train'])
            except:
                clf = clf.fit(X=self.data['train']['X'], y=self.data['train']['y'])
        if 'rounds' in self.parameters and self.parameters['rounds'] > 1:
            # iterative refit, recording test AUC per round
            for rNumber in range(self.parameters['rounds']):
                results = clf.predict_proba(self.data['test']['X'])
                auc1 = roc_auc_score(self.data['test']['y'], results[:,1], sample_weight=self.data['test']['sample_weight'])
                print(" round ", rNumber, " AUC=", auc1)
                learningCurve.append(auc1)
                clf = clf.fit(X=self.data['train']['X'], y=self.data['train']['y'])
        print("***** FIT done")
        # TEST
        # prefer decision_function (margin classifiers); pad to 2 columns so
        # [:,1] indexing matches the predict_proba layout in the except branch
        try:
            results = clf.decision_function(self.data['test']['X'])
            print("***** EVALUATION on test sample done")
            results_train = clf.decision_function(self.data['train']['X'])
            print("***** EVALUATION on training sample done")
            print("R:", results.shape, results)
            results = np.c_[np.ones(results.shape[0]), results]
            results_train = np.c_[np.ones(results_train.shape[0]), results_train]
        except:
            results = clf.predict_proba(self.data['test']['X'])
            results_train = clf.predict_proba(self.data['train']['X'])
        # ROC curve
        print("calculating auc...")
        auc1 = roc_auc_score(self.data['test']['y'], results[:,1], sample_weight=self.data['test']['sample_weight'])
        auc_training = roc_auc_score(self.data['train']['y'], results_train[:,1], sample_weight=self.data['train']['sample_weight'])
        print("AUC:", auc1, " (training:", auc_training, ")")
        print("**** compute quantiles")
        # 1%/99% quantiles of the test score, via a fine-binned ROOT histogram
        qx = np.array([0.01, 0.99])
        qy = np.array([0.0, 0.0])
        thq = ROOT.TH1D("quant","quant",500000,-5.0,5.0)
        nS = len(results)
        for i in range(nS):
            thq.Fill(results[i][1])
        thq.GetQuantiles(2, qy, qx)
        # rescaling of SCORE to [0, 1]
        minProb = 2.0
        maxProb = -1.0
        #for i in range(len(self.data['train']['X'])):
        #    if results_train[i][1] > maxProb:
        #        maxProb = results_train[i][1]
        #    if results_train[i][1] < minProb:
        #        minProb = results_train[i][1]
        #for i in range(len(self.data['test']['X'])):
        #    if results[i][1] > maxProb:
        #        maxProb = results[i][1]
        #    if results[i][1] < minProb:
        #        minProb = results[i][1]
        minProb = qy[0]
        maxProb = qy[1]
        delta = maxProb-minProb
        minProb -= delta * 0.01
        maxProb += delta * 0.10
        useSqrt = False
        # fill TRAINING SCORE histogram (class probability)
        h1t = ROOT.TH1D("h1t","h1t",50,0.0,1.0)
        h2t = ROOT.TH1D("h2t","h2t",50,0.0,1.0)
        for i in range(len(self.data['train']['X'])):
            # NOTE(review): method body is truncated here in the source file
            result = (results_train[i][1]-minProb)/(maxProb-minProb)
class CacheTraining(object):
    """Create TreeCache entries (skimmed trees) for all training regions of one sample.

    For the given sampleIdentifier, builds one cache per (subsample, training
    region, train/eval cut) combination and runs a single pass over the input
    tree to fill all caches at once. Written for Python 2 (iteritems).
    """

    def __init__(self, config, sampleIdentifier, trainingRegions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, force=False):
        """Collect samples, per-region cuts and MVA variables from the config.

        :param sampleIdentifier: identifier of the input file group to cache
        :param trainingRegions: config section names of the training regions
        :param splitFilesChunks/chunkNumber/splitFilesChunkSize: file-chunking
            setup for parallel cache jobs
        :param force: re-create caches even if already present
        """
        self.config = config
        self.force = force
        self.sampleIdentifier = sampleIdentifier
        self.trainingRegions = trainingRegions
        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'MVAin')
        self.samplesDefinitions = self.config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')
        # union of background/signal sample names over all training regions
        # NOTE(review): eval() on config strings — config must be trusted input
        self.backgroundSampleNames = list(
            set(
                sum([
                    eval(self.config.get(trainingRegion, 'backgrounds'))
                    for trainingRegion in self.trainingRegions
                ], [])))
        self.signalSampleNames = list(
            set(
                sum([
                    eval(self.config.get(trainingRegion, 'signals'))
                    for trainingRegion in self.trainingRegions
                ], [])))
        self.samples = self.samplesInfo.get_samples(
            list(set(self.backgroundSampleNames + self.signalSampleNames)))
        # per-region cut string and list of MVA input variables (all systematics)
        self.trainingRegionsDict = {}
        for trainingRegion in self.trainingRegions:
            treeCutName = config.get(trainingRegion, 'treeCut')
            treeVarSet = config.get(trainingRegion, 'treeVarSet').strip()
            systematics = [
                x for x in config.get('systematics', 'systematics').split(' ')
                if len(x.strip()) > 0
            ]
            mvaVars = []
            for systematic in systematics:
                mvaVars += config.get(treeVarSet, systematic).strip().split(' ')
            self.trainingRegionsDict[trainingRegion] = {
                'cut': config.get('Cuts', treeCutName),
                'vars': mvaVars,
            }
        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')
        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.splitFilesChunkSize = splitFilesChunkSize
        # load the shared C++ helper library so cut formulas resolve
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

    def printInfo(self):
        """Print a table of training regions and their cut strings."""
        print("REGION:".ljust(24), "CUT:")
        for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.iteritems():
            print(" > ", trainingRegion.ljust(20), trainingRegionInfo['cut'])

    def run(self):
        """Create all missing caches for the sample and process the input tree once."""
        # ----------------------------------------------------------------------------------------------------------------------
        # cache samples
        # ----------------------------------------------------------------------------------------------------------------------
        for sampleToCache in [self.sampleIdentifier]:
            print('*' * 80)
            print(' ', sampleToCache)
            print('*' * 80)
            # prepare caches for training and evaluation samples
            treeCaches = []
            self.sampleTree = None
            # use all (sub)samples which come from the same files (sampleIdentifier)
            subsamples = [
                x for x in self.samples if x.identifier == sampleToCache
            ]
            # list of branches to keep for use as MVA input variables
            # NOTE(review): the loop adds the same region vars once per
            # (sample, additionalCut) — redundant but harmless for a set-like
            # BranchList; confirm against BranchList semantics
            branchListOfMVAVars = BranchList()
            for sample in subsamples:
                for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.iteritems():
                    for additionalCut in [self.TrainCut, self.EvalCut]:
                        branchListOfMVAVars.addCut(trainingRegionInfo['vars'])
            branchListOfMVAVars.addCut(self.config.get('Weights', 'weightF'))
            mvaBranches = branchListOfMVAVars.getListOfBranches()
            # loop over all samples
            for sample in subsamples:
                # add cuts for all training regions
                for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.iteritems():
                    # add cuts for training and evaluation
                    for additionalCut in [self.TrainCut, self.EvalCut]:
                        # cuts
                        sampleCuts = [sample.subcut]
                        if additionalCut:
                            sampleCuts.append(additionalCut)
                        if trainingRegionInfo['cut']:
                            sampleCuts.append(trainingRegionInfo['cut'])
                        # add cache object
                        tc = TreeCache.TreeCache(
                            name='{region}_{sample}_{tr}'.format(
                                region=trainingRegion,
                                sample=sample.name,
                                tr='TRAIN' if additionalCut == self.TrainCut else 'EVAL'),
                            sample=sample.name,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            splitFilesChunks=self.splitFilesChunks,
                            chunkNumber=self.chunkNumber,
                            splitFilesChunkSize=self.splitFilesChunkSize,
                            branches=mvaBranches,
                            config=self.config,
                            debug=True)
                        # check if this part of the sample is already cached
                        isCached = tc.partIsCached()
                        if not isCached or self.force:
                            if isCached:
                                # forced re-cache: drop the stale files first
                                tc.deleteCachedFiles(chunkNumber=self.chunkNumber)
                            # for the first sample which comes from this files, load the tree
                            if not self.sampleTree:
                                self.sampleTree = SampleTree(
                                    {
                                        'name': sample.identifier,
                                        'folder': self.samplesPath
                                    },
                                    splitFilesChunkSize=self.splitFilesChunkSize,
                                    chunkNumber=self.chunkNumber,
                                    config=self.config,
                                    saveMemory=True)
                            treeCaches.append(
                                tc.setSampleTree(self.sampleTree).cache())
            if len(treeCaches) > 0:
                # run on the tree: one pass fills all registered caches
                self.sampleTree.process()
            else:
                print("nothing to do!")
class MvaTrainingHelper(object):
    """Train a TMVA classifier (BDT etc.) from cached ROOT trees.

    Supports both ROOT 5 (factory-only) and ROOT 6 (DataLoader) TMVA APIs;
    the API flavor is detected at runtime via exceptions. Written for
    Python 2 (print spacing, pickle to text-mode file).

    NOTE(review): this class has the same name as the scikit-learn helper
    defined earlier in this file — if both live in one module, the later
    definition shadows the earlier one; confirm intent.
    """

    def __init__(self, config, mvaName):
        """Read factory/MVA settings, samples and cuts for section `mvaName`."""
        self.config = config
        self.factoryname = config.get('factory', 'factoryname')
        self.factorysettings = config.get('factory', 'factorysettings')
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.MVAtype = config.get(mvaName, 'MVAtype')
        self.MVAsettings = config.get(mvaName, 'MVAsettings')
        self.mvaName = mvaName
        # load the shared C++ helper library so tree formulas resolve
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
        # variables: input feature expressions, space-separated in config
        self.MVA_Vars = {}
        self.MVA_Vars['Nominal'] = config.get(self.treeVarSet, 'Nominal').strip().split(' ')
        # samples
        # NOTE(review): eval() on config strings — config must be trusted input
        backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
        signalSampleNames = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(backgroundSampleNames),
            'SIG': self.samplesInfo.get_samples(signalSampleNames),
        }
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)
        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')
        print("TRAINING CUT:", self.TrainCut)
        print("EVAL CUT:", self.EvalCut)
        # compensates for the train/eval split when scaling MC to x-section
        self.globalRescale = 2.0
        self.trainingOutputFileName = 'mvatraining_{factoryname}_{region}.root'.format(
            factoryname=self.factoryname, region=mvaName)
        print("INFO: MvaTrainingHelper class created.")

    def prepare(self):
        """Create the TMVA factory and register all sig/bkg train/eval trees.

        Returns self for chaining. Raises Exception("CachedTreeMissing") if a
        required cached tree is not found.
        """
        self.trainingOutputFile = ROOT.TFile.Open(self.trainingOutputFileName, "RECREATE")
        # ----------------------------------------------------------------------------------------------------------------------
        # create TMVA factory
        # ----------------------------------------------------------------------------------------------------------------------
        self.factory = ROOT.TMVA.Factory(self.factoryname, self.trainingOutputFile, self.factorysettings)
        if self.trainingOutputFile and self.factory:
            print("INFO: initialized MvaTrainingHelper.", self.factory)
        else:
            print(
                "\x1b[31mERROR: initialization of MvaTrainingHelper failed!\x1b[0m"
            )
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/eval trees
        # ----------------------------------------------------------------------------------------------------------------------
        # ROOT 5: factory has AddBackgroundTree/AddSignalTree directly;
        # ROOT 6: those moved to a DataLoader — detect via AttributeError
        try:
            addBackgroundTreeMethod = self.factory.AddBackgroundTree
            addSignalTreeMethod = self.factory.AddSignalTree
            self.dataLoader = None
        except:
            print("oh no..")
            # the DataLoader wants to be called '.'
            self.dataLoader = ROOT.TMVA.DataLoader(".")
            addBackgroundTreeMethod = self.dataLoader.AddBackgroundTree
            addSignalTreeMethod = self.dataLoader.AddSignalTree
        # DEBUG: restrict memory
        # resource.setrlimit(resource.RLIMIT_AS, (4.0*1024*1024*1024, 5.0*1024*1024*1024))
        self.sampleTrees = []
        for addTreeFcn, samples in [[
                addBackgroundTreeMethod, self.samples['BKG']
        ], [addSignalTreeMethod, self.samples['SIG']]]:
            for sample in samples:
                print('*' * 80, '\n%s\n' % sample, '*' * 80)
                for additionalCut in [self.TrainCut, self.EvalCut]:
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)
                    tc = TreeCache.TreeCache(sample=sample,
                                             cutList=sampleCuts,
                                             inputFolder=self.samplesPath,
                                             config=self.config,
                                             debug=True)
                    sampleTree = tc.getTree()
                    sampleTree.tree.SetCacheSize(32 * 1024)
                    # prevent garbage collection
                    self.sampleTrees.append(sampleTree)
                    if sampleTree:
                        treeScale = sampleTree.getScale(sample) * self.globalRescale
                        # only non-empty trees can be added
                        if sampleTree.tree.GetEntries() > 0:
                            addTreeFcn(
                                sampleTree.tree, treeScale,
                                ROOT.TMVA.Types.kTraining
                                if additionalCut == self.TrainCut else
                                ROOT.TMVA.Types.kTesting)
                            print('max mem used = %d' % (resource.getrusage(
                                resource.RUSAGE_SELF).ru_maxrss))
                    else:
                        print("\x1b[31mERROR: TREE NOT FOUND:", sample.name,
                              " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")
        # register input variables on whichever object owns them in this ROOT version
        if self.dataLoader:
            for var in self.MVA_Vars['Nominal']:
                self.dataLoader.AddVariable(var, 'D')
        else:
            for var in self.MVA_Vars['Nominal']:
                self.factory.AddVariable(var, 'D')
        return self

    # ----------------------------------------------------------------------------------------------------------------------
    # backup old .xml and .info files
    # ----------------------------------------------------------------------------------------------------------------------
    def backupOldFiles(self):
        """Copy existing weight/info files to weights/backup/v<N>/; return success flag."""
        success = False
        MVAdir = self.config.get('Directories', 'vhbbpath') + '/python/weights/'
        backupDir = MVAdir + 'backup/'
        try:
            os.makedirs(backupDir)
        except:
            # directory may already exist
            pass
        freeNumber = 1
        try:
            # find highest existing v<N> directory and use N+1
            lastUsedBackupDirectories = sorted(
                glob.glob(backupDir + '/v*/'),
                key=lambda x: int(x.strip('/').split('/')[-1][1:]),
                reverse=True)
            freeNumber = 1 + int(lastUsedBackupDirectories[0].strip('/').split(
                '/')[-1][1:]) if len(lastUsedBackupDirectories) > 0 else 1
        except Exception as e:
            print("\x1b[31mERROR: creating backup of MVA files failed!", e,
                  "\x1b[0m")
            freeNumber = -1
        if freeNumber > -1:
            try:
                fileNamesToBackup = glob.glob(MVAdir + self.factoryname + '_' +
                                              self.mvaName + '.*')
                fileNamesToBackup += glob.glob(
                    MVAdir + '/../mvatraining_MVA_ZllBDT_*.root')
                os.makedirs(backupDir + 'v%d/' % freeNumber)
                for fileNameToBackup in fileNamesToBackup:
                    shutil.copy(fileNameToBackup,
                                backupDir + 'v%d/' % freeNumber)
                success = True
            except Exception as e:
                print("\x1b[31mERROR: creating backup of MVA files failed!", e,
                      "\x1b[0m")
        return success

    def run(self):
        """Book the method, then run TMVA train/test/evaluate and close the output file."""
        backupFiles = False
        try:
            backupFiles = eval(self.config.get('MVAGeneral', 'backupWeights'))
        except:
            # option missing -> no backup
            pass
        if backupFiles:
            print('backing up old BDT files')
            self.backupOldFiles()
        # ----------------------------------------------------------------------------------------------------------------------
        # Execute TMVA
        # ----------------------------------------------------------------------------------------------------------------------
        self.factory.Verbose()
        print('max mem used = %d' % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        print('Execute TMVA: factory.BookMethod("%s", "%s", "%s")' % (self.MVAtype, self.mvaName, self.MVAsettings))
        print('max mem used = %d' % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        weightF = self.config.get('Weights', 'weightF')
        # ROOT 5 vs ROOT 6 API detection via exception, as in prepare()
        try:
            self.factory.BookMethod(self.MVAtype, self.mvaName, self.MVAsettings)
            print("ROOT 5 style TMVA found")
            self.factory.SetSignalWeightExpression(weightF)
            self.factory.SetBackgroundWeightExpression(weightF)
        except:
            print("ROOT 6 style TMVA found, using data loader object!!! >_<")
            print(" weights dir:", ROOT.TMVA.gConfig().GetIONames().fWeightFileDir)
            print(" data loader:", self.dataLoader)
            print(" type: ", self.MVAtype)
            print(" name: ", self.mvaName)
            print(" settings: ", self.MVAsettings)
            ROOT.TMVA.gConfig().GetIONames().fWeightFileDir = 'weights'
            self.dataLoader.SetSignalWeightExpression(weightF)
            self.dataLoader.SetBackgroundWeightExpression(weightF)
            self.factory.BookMethod(self.dataLoader, self.MVAtype, self.mvaName, self.MVAsettings)
        sys.stdout.flush()
        print('Execute TMVA: TrainAllMethods')
        print('max mem used = %d' % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.factory.TrainAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: TestAllMethods')
        print('max mem used = %d' % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.factory.TestAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: EvaluateAllMethods')
        print('max mem used = %d' % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.factory.EvaluateAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: output.Write')
        print('max mem used = %d' % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.trainingOutputFile.Close()
        return self

    def printInfo(self):
        """Pickle an mvainfo record describing this training next to the weight files."""
        #WRITE INFOFILE
        MVAdir = self.config.get('Directories', 'vhbbpath') + '/python/weights/'
        infofile = open(
            MVAdir + self.factoryname + '_' + self.mvaName + '.info', 'w')
        print('@DEBUG: output infofile name')
        print(infofile)
        info = mvainfo(self.mvaName)
        info.factoryname = self.factoryname
        info.factorysettings = self.factorysettings
        info.MVAtype = self.MVAtype
        info.MVAsettings = self.MVAsettings
        info.weightfilepath = MVAdir
        info.path = self.samplesPath
        info.varset = self.treeVarSet
        info.vars = self.MVA_Vars['Nominal']
        pickle.dump(info, infofile)
        infofile.close()

    def getExpectedSignificance(self, tree, nBins, xMin, xMax, power=1.0, rescaleSig=1.0, rescaleBkg=1.0):
        """Histogram the BDT score and compute Z = sqrt(sum_i S_i^2/(S_i+B_i)).

        :param tree: TMVA Test/Train tree (branches: <mvaName>, weight, classID)
        :param power: optional power-law rescaling of the score before binning
        :param rescaleSig/rescaleBkg: extra normalization factors per class
        :returns: (expectedSignificance, signalSum, backgroundSum)
        """
        hSIG = ROOT.TH1D("hSig", "hSig", nBins, xMin, xMax)
        hBKG = ROOT.TH1D("hBkg", "hBkg", nBins, xMin, xMax)
        print("INFO: GetEntries() = ", tree.GetEntries())
        if power != 1.0:
            print("INFO: rescale BDT score with power ", power)
        for event in tree:
            if power != 1.0:
                # normalize to [0,1), apply power, map back to [xMin,xMax)
                x = (getattr(event, self.mvaName) - xMin) / (xMax - xMin)
                if x < 0:
                    x = 0
                if x > 0.999999:
                    x = 0.999999
                value = math.pow(x, power) * (xMax - xMin) + xMin
            else:
                # clamp into histogram range so no events fall in over/underflow
                value = max(min(getattr(event, self.mvaName), xMax - 0.00001), xMin)
            weight = event.weight
            if event.classID == 1:
                hSIG.Fill(value, weight * rescaleSig)
            else:
                hBKG.Fill(value, weight * rescaleBkg)
        ssbSum = 0.0
        sSum = 0
        bSum = 0
        sbTableFormat = "{bin: <16}{signal: <16}{background: <16}{ssb: <16}"
        print("---- nBins =", nBins, " from ", xMin, "..", xMax, "-----")
        print(
            sbTableFormat.format(bin="bin",
                                 signal="signal",
                                 background="background",
                                 ssb="S/sqrt(S+B)"))
        for i in range(nBins):
            ssbSum += hSIG.GetBinContent(1 + i) * hSIG.GetBinContent(1 + i) / (
                hSIG.GetBinContent(1 + i) + hBKG.GetBinContent(1 + i)) if (
                    hSIG.GetBinContent(1 + i) + hBKG.GetBinContent(1 + i)) > 0 else 0
            sSum += hSIG.GetBinContent(1 + i)
            bSum += hBKG.GetBinContent(1 + i)
            ssb = hSIG.GetBinContent(1 + i) / math.sqrt(
                hSIG.GetBinContent(1 + i) + hBKG.GetBinContent(1 + i)) if (
                    hSIG.GetBinContent(1 + i) + hBKG.GetBinContent(1 + i)) > 0 else 0
            print(
                sbTableFormat.format(bin=i,
                                     signal=round(hSIG.GetBinContent(1 + i), 1),
                                     background=round(
                                         hBKG.GetBinContent(1 + i), 1),
                                     ssb=round(ssb, 3)))
        expectedSignificance = math.sqrt(ssbSum)
        print(
            sbTableFormat.format(bin="SUM",
                                 signal=round(sSum, 1),
                                 background=round(bSum, 1),
                                 ssb="\x1b[34mZ=%1.3f\x1b[0m" %
                                 expectedSignificance))
        print("-" * 40)
        # free ROOT-owned histograms (named objects would clash on next call)
        hSIG.Delete()
        hBKG.Delete()
        return expectedSignificance, sSum, bSum

    def estimateExpectedSignificance(self):
        """Scan several binnings/rescalings on the test tree, then compare with training."""
        print("INFO: open ", self.trainingOutputFileName)
        rootFile = ROOT.TFile.Open(self.trainingOutputFileName, "READ")
        print("INFO: ->", rootFile)
        testTree = rootFile.Get('./TestTree')
        # run a few tests with different binnings and rescaling of BDT score
        self.getExpectedSignificance(testTree, 15, -0.8, 1.0)
        self.getExpectedSignificance(testTree, 15, -0.8, 0.9)
        self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=0.5)
        self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=0.33)
        self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=1.5)
        self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=2.0)
        # close to nominal binning
        print("---- ~nominal TEST -----")
        esTest, sTest, bTest = self.getExpectedSignificance(
            testTree, 15, -0.8, 0.8)
        print("---- ~nominal TRAINING (without correct normalization) -----")
        trainTree = rootFile.Get('./TrainTree')
        esTrain, sTrain, bTrain = self.getExpectedSignificance(
            trainTree, 15, -0.8, 0.8)
        # the tree ./TrainTree contains the input events for training AFTER re-balancing the classes
        # therefore for SIG/BKG separately the normalization is fixed to the one of the TEST events
        rescaleSig = 1.0 * sTest / sTrain
        rescaleBkg = 1.0 * bTest / bTrain
        print("---- ~nominal TRAINING -----")
        trainTree = rootFile.Get('./TrainTree')
        esTrain, sTrain, bTrain = self.getExpectedSignificance(
            trainTree, 15, -0.8, 0.8, rescaleSig=rescaleSig, rescaleBkg=rescaleBkg)
class SampleTreesToNumpyConverter(object):
    """Convert cached ROOT sample trees for one MVA region into numpy arrays
    (train/test X, y, per-event weights, plus systematics variations) and dump
    everything as a gzipped pickle (data format version 2)."""

    def __init__(self, config, mvaName):
        # mvaName is also the config section holding treeCut/treeVarSet
        self.mvaName = mvaName
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
        self.dataFormatVersion = 2
        self.sampleTrees = []
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        # region
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)
        # split in train/eval sets
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')
        # rescale MC by 2 because of train/eval split
        self.globalRescale = 2.0
        # variables and systematics: one variable list per systematic shift
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.systematics = config.get('systematics', 'systematics').strip().split(' ')
        self.MVA_Vars = {'Nominal': [x for x in config.get(self.treeVarSet, 'Nominal').strip().split(' ') if len(x.strip()) > 0]}
        for sys in self.systematics:
            self.MVA_Vars[sys] = [x for x in config.get(self.treeVarSet, sys).strip().split(' ') if len(x.strip()) > 0]
        # build b-tag weight systematics as: (weight without b-tag) * (varied b-tag weight)
        self.weightSYS = []
        self.weightWithoutBtag = self.config.get('Weights', 'weight_noBTag')
        self.weightSYSweights = {}
        for d in ['Up', 'Down']:
            for syst in ['HFStats1', 'HFStats2', 'LF', 'HF', 'LFStats1', 'LFStats2', 'cErr2', 'cErr1', 'JES']:
                systFullName = "btag_" + syst + "_" + d
                weightName = "bTagWeightCMVAV2_Moriond_" + syst + d
                self.weightSYSweights[systFullName] = self.weightWithoutBtag + '*' + weightName
                self.weightSYS.append(systFullName)
        # samples: binary classification, all signal vs. all background
        self.sampleNames = {
            # 'BKG_TT': eval(self.config.get('Plot_general', 'TT')),
            # 'BKG_ST': eval(self.config.get('Plot_general', 'ST')),
            # 'BKG_VV': eval(self.config.get('Plot_general', 'VV')),
            # 'BKG_DY2b': eval(self.config.get('Plot_general', 'DY2b')),
            # 'BKG_DY1b': eval(self.config.get('Plot_general', 'DY1b')),
            # 'BKG_DY0b': eval(self.config.get('Plot_general', 'DYlight')),
            # 'SIG_ggZH': eval(self.config.get('Plot_general', 'ggZH')),
            # 'SIG_qqZH': eval(self.config.get('Plot_general', 'qqZH')),
            'SIG_ALL': eval(self.config.get('Plot_general', 'allSIG')),
            'BKG_ALL': eval(self.config.get('Plot_general', 'allBKG')),
        }
        self.samples = {category: self.samplesInfo.get_samples(samples) for category, samples in self.sampleNames.iteritems()}

    def run(self):
        """Fill numpy arrays from the cached trees of every sample and write
        the result as a gzipped pickle './<mvaName>.dmpz'.

        Raises Exception("CachedTreeMissing") if any sample tree is not cached.
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/testing trees
        # ----------------------------------------------------------------------------------------------------------------------
        categories = self.samples.keys()
        datasetParts = {'train': self.trainCut, 'test': self.evalCut}
        systematics = self.systematics
        # one list of per-sample arrays per dataset part; concatenated at the end
        arrayLists = {datasetName: [] for datasetName in datasetParts.iterkeys()}
        arrayLists_sys = {x: {datasetName: [] for datasetName in datasetParts.iterkeys()} for x in systematics}
        weightLists = {datasetName: [] for datasetName in datasetParts.iterkeys()}
        targetLists = {datasetName: [] for datasetName in datasetParts.iterkeys()}
        weightListsSYS = {x: {datasetName: [] for datasetName in datasetParts.iterkeys()} for x in self.weightSYS}
        # standard weight expression
        weightF = self.config.get('Weights', 'weightF')
        for category in categories:
            for sample in self.samples[category]:
                print('*' * 80, '\n%s\n' % sample, '*' * 80)
                for datasetName, additionalCut in datasetParts.iteritems():
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)
                    # get ROOT tree for selected sample & region cut
                    tc = TreeCache.TreeCache(sample=sample, cutList=sampleCuts, inputFolder=self.samplesPath, config=self.config, debug=True)
                    sampleTree = tc.getTree()
                    if sampleTree:
                        treeScale = sampleTree.getScale(sample) * self.globalRescale
                        print('scale:', treeScale)
                        # initialize numpy array
                        nSamples = sampleTree.GetEntries()
                        features = self.MVA_Vars['Nominal']
                        features_sys = {x: self.MVA_Vars[x] for x in systematics}
                        nFeatures = len(features)
                        print('nFeatures:', nFeatures)
                        inputData = np.zeros((nSamples, nFeatures), dtype=np.float32)
                        inputData_sys = {x: np.zeros((nSamples, nFeatures), dtype=np.float32) for x in systematics}
                        # initialize formulas for ROOT tree
                        for feature in features:
                            sampleTree.addFormula(feature)
                        for k, features_s in features_sys.iteritems():
                            for feature in features_s:
                                sampleTree.addFormula(feature)
                        sampleTree.addFormula(weightF)
                        for syst in self.weightSYS:
                            sampleTree.addFormula(self.weightSYSweights[syst])
                        # fill numpy array from ROOT tree
                        for i, event in enumerate(sampleTree):
                            for j, feature in enumerate(features):
                                inputData[i, j] = sampleTree.evaluate(feature)
                            # total weight comes from weightF (btag, lepton sf, ...) and treeScale to scale MC to x-section
                            totalWeight = treeScale * sampleTree.evaluate(weightF)
                            weightLists[datasetName].append(totalWeight)
                            # class label = index of the category in `categories`
                            targetLists[datasetName].append(categories.index(category))
                            # add weights varied by (btag) systematics
                            for syst in self.weightSYS:
                                weightListsSYS[syst][datasetName].append(treeScale * sampleTree.evaluate(self.weightSYSweights[syst]))
                            # fill systematics
                            for k, feature_s in features_sys.iteritems():
                                for j, feature in enumerate(feature_s):
                                    inputData_sys[k][i, j] = sampleTree.evaluate(feature)
                        arrayLists[datasetName].append(inputData)
                        for sys in systematics:
                            arrayLists_sys[sys][datasetName].append(inputData_sys[sys])
                    else:
                        print("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")
        # concatenate all data from different samples
        self.data = {
            'train': {
                'X': np.concatenate(arrayLists['train'], axis=0),
                'y': np.array(targetLists['train'], dtype=np.float32),
                'sample_weight': np.array(weightLists['train'], dtype=np.float32),
            },
            'test': {
                'X': np.concatenate(arrayLists['test'], axis=0),
                'y': np.array(targetLists['test'], dtype=np.float32),
                'sample_weight': np.array(weightLists['test'], dtype=np.float32),
            },
            'category_labels': {idx: label for idx, label in enumerate(categories)},
            'meta': {
                'version': self.dataFormatVersion,
                'region': self.mvaName,
                'cutName': self.treeCutName,
                'cut': self.treeCut,
                'trainCut': self.trainCut,
                'testCut': self.evalCut,
                'samples': self.sampleNames,
                'weightF': weightF,
                'weightSYS': self.weightSYS,
                'variables': ' '.join(self.MVA_Vars['Nominal'])
            }
        }
        # add systematics variations (training set only)
        for sys in systematics:
            self.data['train']['X_' + sys] = np.concatenate(arrayLists_sys[sys]['train'], axis=0)
        for syst in self.weightSYS:
            self.data['train']['sample_weight_' + syst] = np.array(weightListsSYS[syst]['train'], dtype=np.float32)
        numpyOutputFileName = './' + self.mvaName + '.dmpz'
        with gzip.open(numpyOutputFileName, 'wb') as outputFile:
            pickle.dump(self.data, outputFile)
        print(self.data['meta'])
        print("written to:\x1b[34m", numpyOutputFileName, " \x1b[0m")
class SampleTreesToNumpyConverter(object):
    """Convert cached ROOT sample trees for one MVA region into numpy arrays
    (train/test X, y, per-event weights and a combined weight-systematics
    error) and save them as gzipped pickle and/or HDF5 (data format v3).

    Fixes vs. previous revision:
      * the 'classDict' branch read self.samples.keys() before self.samples
        was built (AttributeError); categories are now derived after the
        samples dict exists.
      * the inner event/feature loops in run() reused the outer loop indices
        i and j, clobbering the category/sample counters in the progress
        print-out; the inner indices are now distinct names.
    """

    def __init__(self, config, mvaName, useSyst=True, useWeightSyst=True, testRun=False):
        # mvaName is also the config section holding treeCut/treeVarSet etc.
        self.mvaName = mvaName
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
        self.dataFormatVersion = 3
        self.sampleTrees = []
        self.config = config
        self.testRun = testRun
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        # region
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)
        # split in train/eval sets
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')
        # rescale MC by 2 because of train/eval split
        self.globalRescale = 2.0
        # variables
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.MVA_Vars = {'Nominal': [x for x in config.get(self.treeVarSet, 'Nominal').strip().split(' ') if len(x.strip()) > 0]}
        self.weightSYS = []
        self.weightSYSweights = {}
        # weight systematics: list of {'name','U','D'} weight expressions
        self.systematics = []
        if useSyst:
            print('INFO: use systematics in training!')
            self.systList = eval(self.config.get(mvaName, 'systematics')) if self.config.has_option(mvaName, 'systematics') else []
            for syst in self.systList:
                # accept both _UP/_DOWN and _Up/_Down spellings in the config
                systNameUp = syst + '_UP' if self.config.has_option('Weights', syst + '_UP') else syst + '_Up'
                systNameDown = syst + '_DOWN' if self.config.has_option('Weights', syst + '_DOWN') else syst + '_Down'
                self.systematics.append({
                    'name': syst,
                    'U': self.config.get('Weights', systNameUp),
                    'D': self.config.get('Weights', systNameDown),
                })
        # default: signal vs. background
        self.sampleNames = {
            'SIG_ALL': eval(self.config.get('Plot_general', 'allSIG')),
            'BKG_ALL': eval(self.config.get('Plot_general', 'allBKG')),
        }
        # for multi-output classifiers load dictionary from config
        self.categories = None
        if self.config.has_option(mvaName, 'classDict'):
            self.sampleNames = eval(self.config.get(mvaName, 'classDict'))
            # FIX: the original set self.categories = self.samples.keys() here,
            # but self.samples is only built below -> AttributeError. Leaving
            # categories as None lets the fallback below derive them once
            # self.samples exists.
            print("classes dict:", self.sampleNames)
        elif self.config.has_option(mvaName, 'classes'):
            # 'classes' is an ordered list of (label, samples) pairs
            self.sampleNames = dict(eval(self.config.get(mvaName, 'classes')))
            self.categories = [x[0] for x in eval(self.config.get(mvaName, 'classes'))]
        self.samples = {category: self.samplesInfo.get_samples(samples) for category, samples in self.sampleNames.iteritems()}
        if not self.categories:
            self.categories = self.samples.keys()
        if self.testRun:
            print("\x1b[31mDEBUG: TEST-RUN, using only small subset of samples!\x1b[0m")

    def run(self):
        """Fill numpy arrays from the cached trees of every category/sample
        and write the result (see saveAsPickledNumpy/saveAsHDF5).

        Returns True if at least one output file was written, False otherwise.
        Raises Exception("CachedTreeMissing") if a sample tree is not cached.
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/testing trees
        # ----------------------------------------------------------------------------------------------------------------------
        categories = self.categories
        if categories:
            print("categories:")
            for idx, category in enumerate(categories):
                print(" ", idx, ":", category)
        datasetParts = {'train': self.trainCut, 'test': self.evalCut}
        # one list of per-sample arrays per dataset part; concatenated at the end
        arrayLists = {datasetName: [] for datasetName in datasetParts.iterkeys()}
        weightLists = {datasetName: [] for datasetName in datasetParts.iterkeys()}
        targetLists = {datasetName: [] for datasetName in datasetParts.iterkeys()}
        # combined absolute error on the event weight from all weight systematics
        weightListSYStotal = {datasetName: [] for datasetName in datasetParts.iterkeys()}
        # standard weight expression
        weightF = self.config.get('Weights', 'weightF')
        # optional per-sample special weight (loop-invariant, hoisted out of the loops)
        useSpecialWeight = self.config.has_option('Weights', 'useSpecialWeight') and eval(self.config.get('Weights', 'useSpecialWeight'))
        for i, category in enumerate(categories):
            if self.testRun:
                self.samples[category] = self.samples[category][0:1]
            for j, sample in enumerate(self.samples[category]):
                print('*' * 80, '\n%s (category %d/%d sample %d/%d)\n' % (sample, i + 1, len(categories), j + 1, len(self.samples[category])), '*' * 80)
                for datasetName, additionalCut in datasetParts.iteritems():
                    # cuts: sample subcut + train/test split + mva region cut
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)
                    # get ROOT tree for selected sample & region cut
                    tc = TreeCache.TreeCache(
                        sample=sample,
                        cutList=sampleCuts,
                        inputFolder=self.samplesPath,
                        config=self.config,
                        debug=True
                    )
                    sampleTree = tc.getTree()
                    if not sampleTree:
                        print("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")
                    treeScale = sampleTree.getScale(sample) * self.globalRescale
                    print('scale:', treeScale)
                    # initialize numpy array
                    nSamples = sampleTree.GetEntries()
                    features = self.MVA_Vars['Nominal']
                    nFeatures = len(features)
                    print('nFeatures:', nFeatures)
                    inputData = np.zeros((nSamples, nFeatures), dtype=np.float32)
                    # initialize formulas for ROOT tree
                    for feature in features:
                        sampleTree.addFormula(feature)
                    sampleTree.addFormula(weightF)
                    for syst in self.systematics:
                        sampleTree.addFormula(syst['U'])
                        sampleTree.addFormula(syst['D'])
                    if useSpecialWeight:
                        sampleTree.addFormula(sample.specialweight)
                    # fill numpy array from ROOT tree
                    # FIX: distinct inner indices (iEvent/iFeature); the original
                    # reused i/j here and corrupted the progress counters above.
                    for iEvent, event in enumerate(sampleTree):
                        for iFeature, feature in enumerate(features):
                            inputData[iEvent, iFeature] = sampleTree.evaluate(feature)
                        # total weight comes from weightF (btag, lepton sf, ...)
                        # and treeScale to scale MC to x-section
                        eventWeight = sampleTree.evaluate(weightF)
                        specialWeight = sampleTree.evaluate(sample.specialweight) if useSpecialWeight else 1.0
                        totalWeight = treeScale * eventWeight * specialWeight
                        weightLists[datasetName].append(totalWeight)
                        targetLists[datasetName].append(categories.index(category))
                        # combine all weight systematics into one symmetrized,
                        # quadrature-summed absolute error on the event weight
                        deltas = []
                        for syst in self.systematics:
                            delta_up = sampleTree.evaluate(syst['U']) - eventWeight
                            delta_down = sampleTree.evaluate(syst['D']) - eventWeight
                            delta = 0.5 * (np.abs(delta_up) + np.abs(delta_down))
                            deltas.append(delta * delta)
                        totalDelta = np.sqrt(sum(deltas))
                        weightListSYStotal[datasetName].append(treeScale * totalDelta * specialWeight)
                    arrayLists[datasetName].append(inputData)
        # systematics names for the meta-data block
        puresystematics = [x['name'] for x in self.systematics]
        # concatenate all data from different samples
        self.data = {
            'train': {
                'X': np.concatenate(arrayLists['train'], axis=0),
                'y': np.array(targetLists['train'], dtype=np.float32),
                'sample_weight': np.array(weightLists['train'], dtype=np.float32),
                'sample_weight_error': np.array(weightListSYStotal['train'], dtype=np.float32),
            },
            'test': {
                'X': np.concatenate(arrayLists['test'], axis=0),
                'y': np.array(targetLists['test'], dtype=np.float32),
                'sample_weight': np.array(weightLists['test'], dtype=np.float32),
                'sample_weight_error': np.array(weightListSYStotal['test'], dtype=np.float32),
            },
            'category_labels': {idx: label for idx, label in enumerate(categories)},
            'meta': {
                'version': self.dataFormatVersion,
                'region': self.mvaName,
                'cutName': self.treeCutName,
                'cut': self.treeCut,
                'trainCut': self.trainCut,
                'testCut': self.evalCut,
                'samples': self.sampleNames,
                'weightF': weightF,
                'weightSYS': self.weightSYS,
                'variables': ' '.join(self.MVA_Vars['Nominal']),
                'systematics': puresystematics,
            }
        }
        if not os.path.exists("./dumps"):
            os.makedirs("dumps")
        baseName = './dumps/' + self.config.get("Directories", "Dname").split("_")[1] + '_' + self.mvaName + '_' + datetime.datetime.now().strftime("%y%m%d")
        numpyOutputFileName = baseName + '.dmpz'
        hdf5OutputFileName = baseName + '.h5'
        print("INFO: saving output...")
        # try both formats independently; succeed if at least one was written
        success = False
        try:
            if self.config.has_option(self.mvaName, 'writeNumpy') and eval(self.config.get(self.mvaName, 'writeNumpy')):
                self.saveAsPickledNumpy(numpyOutputFileName)
                success = True
        except Exception as e:
            print("ERROR: writing numpy array failed.", e)
        try:
            self.saveAsHDF5(hdf5OutputFileName)
            success = True
        except Exception as e:
            print("ERROR: writing HDF5 file failed.", e)
        if success:
            print("INFO: done.")
            return True
        else:
            print("ERROR: no output file written")
            return False

    def saveAsPickledNumpy(self, outputFileName):
        """Dump self.data as a gzipped pickle."""
        with gzip.open(outputFileName, 'wb') as outputFile:
            pickle.dump(self.data, outputFile)
        print("written to:\x1b[34m", outputFileName, " \x1b[0m")

    def saveAsHDF5(self, outputFileName):
        """Write self.data to HDF5: meta/category_labels as JSON attributes,
        train/test arrays as gzip-compressed datasets."""
        f = h5py.File(outputFileName, 'w')
        for k in ['meta', 'category_labels']:
            f.attrs[k] = json.dumps(self.data[k].items())
        for k in ['train', 'test']:
            for k2 in self.data[k].keys():
                f.create_dataset(k + '/' + k2, data=self.data[k][k2], compression="gzip", compression_opts=9)
        f.close()
        print("written to:\x1b[34m", outputFileName, " \x1b[0m")
ang_yield = eval(config.get('AngularLike', 'yields')) #path=opts.path pathIN = config.get('Directories', 'SYSin') pathOUT = config.get('Directories', 'SYSout') tmpDir = os.environ["TMPDIR"] print 'INput samples:\t%s' % pathIN print 'OUTput samples:\t%s' % pathOUT #storagesamples = config.get('Directories','storagesamples') namelist = opts.names.split(',') #load info info = ParseInfo(samplesinfo, pathIN) def deltaPhi(phi1, phi2): result = phi1 - phi2 while (result > math.pi): result -= 2 * math.pi while (result <= -math.pi): result += 2 * math.pi return result def resolutionBias(eta): if (eta < 0.5): return 0.052 if (eta < 1.1): return 0.057 if (eta < 1.7): return 0.096
class CachePlot(object):
    """Cache the trees of one sample (by identifier) for a set of plot
    regions, so that later plotting jobs can read pre-skimmed trees.

    Fix vs. previous revision: the optional 'addBlindingCut' was appended to
    the cut list as the *boolean* result of config.has_option(...) instead of
    the cut expression itself (config.get(...)), silently corrupting the cut.
    """

    def __init__(self, config, sampleIdentifier, regions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, forceRedo=False, fileList=None):
        self.config = config
        self.sampleIdentifier = sampleIdentifier
        # de-duplicate the region list
        self.regions = list(set(regions))
        self.forceRedo = forceRedo
        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'plottingSamples')
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath, config=self.config)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')
        self.sampleNames = list(eval(self.config.get('Plot_general', 'samples')))
        self.dataNames = list(eval(self.config.get('Plot_general', 'Data')))
        self.samples = self.samplesInfo.get_samples(self.sampleNames + self.dataNames)
        # per-region cut expressions
        self.regionsDict = {}
        for region in self.regions:
            treeCut = config.get('Cuts', region)
            self.regionsDict[region] = {'cut': treeCut}
        self.splitFilesChunkSize = splitFilesChunkSize
        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        # file list is passed compressed on the command line
        self.fileList = FileList.decompress(fileList) if fileList else None
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m" % returnCode)
        else:
            print("INFO: loaded VHbbNameSpace: %s" % VHbbNameSpace)

    def printInfo(self):
        """Print the region -> cut table."""
        print("REGION:".ljust(24), "CUT:")
        for region, regionInfo in self.regionsDict.iteritems():
            print(" > ", region.ljust(20), regionInfo['cut'])

    def run(self):
        """Create (or verify) the tree cache for this sample in all regions.

        Raises Exception("CreationOfSampleTreeFailed") or
        Exception("SampleFilesHaveChanged") on inconsistencies.
        """
        # keep additional branches for plotting
        try:
            keepBranchesPlot = eval(self.config.get('Branches', 'keep_branches_plot'))
        except:
            keepBranchesPlot = []
        try:
            keepBranchesPlot += eval(self.config.get('Branches', 'keep_branches'))
        except:
            pass
        # also keep some branches which might be used later in variables definition and weights
        try:
            for section in self.config.sections():
                try:
                    if section.startswith('plotDef:') and self.config.has_option(section, 'relPath'):
                        keepBranchesPlot.append(self.config.get(section, 'relPath'))
                except Exception as e:
                    print("\x1b[31mWARNING: config error in:", section, "=>", e, "\x1b[0m")
        except Exception as e2:
            print("\x1b[31mERROR: config file contains an error! automatic selection of branches to keep will not work!\x1b[0m")
            print(e2)
        try:
            keepBranchesPlot.append(self.config.get('Weights', 'weightF'))
        except:
            pass
        # plotting region cut
        for region, regionInfo in self.regionsDict.iteritems():
            keepBranchesPlot.append(regionInfo['cut'])
        keepBranchesPlotFinal = BranchList(keepBranchesPlot).getListOfBranches()
        print("KEEP:", keepBranchesPlotFinal)
        # ----------------------------------------------------------------------------------------------------------------------
        # cache samples
        # ----------------------------------------------------------------------------------------------------------------------
        for sampleToCache in [self.sampleIdentifier]:
            print('*' * 80)
            print(' ', sampleToCache)
            print('*' * 80)
            # prepare caches for training and evaluation samples
            treeCaches = []
            sampleTree = None
            # for all (sub)samples which come from the same files (sampleIdentifier)
            subsamples = [x for x in self.samples if x.identifier == sampleToCache]
            for sample in subsamples:
                # add cuts for all training regions
                for region, regionInfo in self.regionsDict.iteritems():
                    configSection = 'Plot:%s' % region
                    # cuts
                    sampleCuts = [sample.subcut]
                    if regionInfo['cut']:
                        sampleCuts.append(regionInfo['cut'])
                    if self.config.has_option(configSection, 'Datacut'):
                        sampleCuts.append(self.config.get(configSection, 'Datacut'))
                    if self.config.has_option('Plot_general', 'addBlindingCut'):
                        # FIX: append the cut expression; the original appended
                        # the boolean result of has_option() by mistake
                        sampleCuts.append(self.config.get('Plot_general', 'addBlindingCut'))
                    # arbitrary (optional) name for the output tree, used for print-out
                    # (the TreeCache object has no idea what it is doing, e.g. dc, plot etc.)
                    cacheName = 'plot:{region}_{sample}'.format(region=region, sample=sample.name)
                    # add cache object
                    tc = TreeCache.TreeCache(
                        name=cacheName,
                        sample=sample.name,
                        cutList=sampleCuts,
                        inputFolder=self.samplesPath,
                        splitFilesChunks=self.splitFilesChunks,
                        chunkNumber=self.chunkNumber,
                        splitFilesChunkSize=self.splitFilesChunkSize,
                        fileList=self.fileList,
                        branches=keepBranchesPlotFinal,
                        config=self.config,
                        debug=True)
                    # check if this part of the sample is already cached
                    isCached = tc.partIsCached()
                    if not isCached or self.forceRedo:
                        if isCached:
                            tc.deleteCachedFiles(chunkNumber=self.chunkNumber)
                        # for the first sample which comes from this files, load the tree
                        if not self.sampleTree:
                            self.sampleTree = SampleTree(
                                {'name': sample.identifier, 'folder': self.samplesPath},
                                splitFilesChunkSize=self.splitFilesChunkSize,
                                chunkNumber=self.chunkNumber,
                                config=self.config,
                                saveMemory=True)
                        if not self.sampleTree or not self.sampleTree.tree:
                            print("\x1b[31mERROR: creation of sample tree failed!!\x1b[0m")
                            raise Exception("CreationOfSampleTreeFailed")
                        # consistency check on the file list at submission time and now
                        fileListNow = self.sampleTree.getSampleFileNameChunk(self.chunkNumber)
                        if self.fileList and (sorted(self.fileList) != sorted(fileListNow)):
                            print("\x1b[31mERROR: sample files have changed between submission and run of the job!\x1b[0m")
                            raise Exception("SampleFilesHaveChanged")
                        treeCaches.append(tc.setSampleTree(self.sampleTree).cache())
                    else:
                        print("INFO: already cached!", tc, "(", tc.hash, ")")
            if len(treeCaches) > 0:
                # run on the tree
                self.sampleTree.process()
            else:
                print("nothing to do!")
# script-level setup: compile/load external ROOT macros and read plot locations
print "Compile external macros"
print "=======================\n"
#get locations:
Wdir = config.get('Directories', 'Wdir')  # working directory containing the output
samplesinfo = config.get('Directories', 'samplesinfo')  # samples_nosplit.cfg
path = config.get('Directories', 'plottingSamples')  # from which samples to plot
# NOTE(review): 'region' is not defined in this fragment — presumably a
# script-level global (e.g. from option parsing); verify.
section = 'Plot:%s' % region
info = ParseInfo(samplesinfo, path)  #creates a list of Samples by reading the info in samples_nosplit.cfg and the content of the path.
import os
# load pre-compiled helper libraries if present (skipped silently otherwise)
if os.path.exists("../interface/DrawFunctions_C.so"):
    print 'ROOT.gROOT.LoadMacro("../interface/DrawFunctions_C.so")'
    ROOT.gROOT.LoadMacro("../interface/DrawFunctions_C.so")
if os.path.exists("../interface/VHbbNameSpace_h.so"):
    print 'ROOT.gROOT.LoadMacro("../interface/VHbbNameSpace_h.so")'
    ROOT.gROOT.LoadMacro("../interface/VHbbNameSpace_h.so")
#----------Histo from trees------------
#Get the selections and the samples
# NOTE(review): the body of doPlot() lies beyond this chunk.
def doPlot():
class PlotHelper(object):
    """Draw stacked data/MC histograms for one plot region: reads the sample
    lists and cuts from the config, fetches cached trees, and fills one
    StackMaker per variable."""

    def __init__(self, config, region, vars=None, title=None):
        # vars: optional explicit variable list; falls back to config 'vars'
        self.config = config
        self.region = region
        self.vars = vars
        self.title = title if title and len(title) > 0 else None
        # VHbb namespace
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m" % returnCode)
        else:
            print("INFO: loaded VHbbNameSpace: %s" % VHbbNameSpace)
        # additional blinding cut:
        self.addBlindingCut = None
        if self.config.has_option('Plot_general', 'addBlindingCut'):  #contained in plots, cut on the event number
            self.addBlindingCut = self.config.get('Plot_general', 'addBlindingCut')
            print('adding add. blinding cut:', self.addBlindingCut)
        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')
        # plot regions
        self.configSection = 'Plot:%s' % region
        if self.vars and type(self.vars) == list:
            self.vars = [x.strip() for x in self.vars if len(x.strip()) > 0]
        if not self.vars or len(self.vars) < 1:
            varListFromConfig = self.config.get(self.configSection, 'vars').split(',')
            print("VARS::", self.configSection, " => ", varListFromConfig)
            self.vars = [x.strip() for x in varListFromConfig if len(x.strip()) > 0]
        # load samples
        self.data = eval(self.config.get(self.configSection, 'Datas'))  # read the data corresponding to each CR (section)
        self.mc = eval(self.config.get('Plot_general', 'samples'))  # read the list of mc samples
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        self.signalRegion = False
        if self.config.has_option(self.configSection, 'Signal'):
            self.mc.append(self.config.get(self.configSection, 'Signal'))
            self.signalRegion = True
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)
        self.groupDict = eval(self.config.get('Plot_general', 'Group'))
        self.subcutPlotName = ''
        self.histogramStacks = {}

    def prepare(self):
        """Create a StackMaker per variable and fill it with all data and MC
        sample trees. Returns self for chaining.

        Raises Exception("CachedTreeMissing") if a sample tree is not cached.
        """
        # NOTE(review): bare 'region' (not self.region) — works only if a
        # module-level global 'region' exists; verify against the calling
        # script.
        print("INFO: starting plot for region \x1b[34m{region}\x1b[0m, variables:".format(region=region))
        for var in self.vars:
            print(" > {var}".format(var=var))
        self.histogramStacks = {}
        for var in self.vars:
            self.histogramStacks[var] = StackMaker(self.config, var, self.region, self.signalRegion, None, '_' + self.subcutPlotName, title=self.title)
        # add DATA + MC samples
        for sample in self.dataSamples + self.mcSamples:
            # cuts
            sampleCuts = [sample.subcut]
            if self.config.has_option('Cuts', self.region):
                sampleCuts.append(self.config.get('Cuts', self.region))
            if self.config.has_option(self.configSection, 'Datacut'):
                sampleCuts.append(self.config.get(self.configSection, 'Datacut'))
            if self.addBlindingCut:
                sampleCuts.append(self.addBlindingCut)
            # get sample tree from cache
            # NOTE(review): bare 'config' (not self.config) passed here —
            # presumably the same module-level global; verify.
            tc = TreeCache.TreeCache(sample=sample, cutList=sampleCuts, inputFolder=self.samplesPath, config=config)
            sampleTree = tc.getTree()
            if sampleTree:
                groupName = self.groupDict[sample.name]
                print(" > found the tree, #entries = ", sampleTree.tree.GetEntries())
                print(" > group =", groupName)
                print(" > now adding the tree for vars=", self.vars)
                # add the sample tree for all the variables
                for var in self.vars:
                    self.histogramStacks[var].addSampleTree(sample=sample, sampleTree=sampleTree, groupName=groupName)
            else:
                print("\x1b[31mERROR: sampleTree not available for ", sample, ", run caching again!!\x1b[0m")
                raise Exception("CachedTreeMissing")
        return self

    def run(self):
        """Draw every prepared stack twice: absolute and shape-normalized.
        Returns self for chaining."""
        # draw
        for var in self.vars:
            self.histogramStacks[var].Draw(outputFolder=self.plotPath, prefix='{region}__{var}_'.format(region=self.region, var=var))
            self.histogramStacks[var].Draw(outputFolder=self.plotPath, prefix='comp_{region}__{var}_'.format(region=self.region, var=var), normalize=True)
        return self

    def getHistogramStack(self, var):
        # Return the StackMaker for `var`, or None if unknown/not prepared.
        if var in self.vars and var in self.histogramStacks:
            return self.histogramStacks[var]
        else:
            return None
from ROOT import TAxis from ROOT import TLorentzVector from ROOT import TMath from ROOT import TLegend #from ROOT import cmath from ROOT import gStyle from ROOT import gPad from ROOT import TCanvas, TColor, TGaxis, TH1F, TPad from ROOT import kBlack, kBlue, kRed, kViolet # load configuration and list of used samples config = XbbConfigReader.read('Zll2018') path = "Zll2018config/samples_nosplit.ini" sampleInfo = ParseInfo(config, path, config=config) usedSamples = sampleInfo.get_samples(XbbConfigTools(config).getMC()) #usedSamples = sampleInfo.get_samples(['ZJetsHT100', 'ZH_Znunu']) usedSampleIdentifiers = list(set([x.identifier for x in usedSamples])) print('usedSampleIdentifiers', usedSampleIdentifiers) # some samples come from same set of ROOT trees (=have same identifier) # -> find list of unique identifiers to avoid to process same tree file twice #sampleIdentifiers = sampleInfo.getSampleIdentifiers() #usedSampleIdentifiers = ParseInfo.filterIdentifiers(sampleIdentifiers, usedSamples) # from which step to take the root trees directory = config.get('Directories', 'sysOUT4')
config = BetterConfigParser() config.read(opts.config) #namelist=opts.names.split(',') #print "namelist:",namelist pathIN = config.get('Directories', 'PREPin') pathOUT = config.get('Directories', 'PREPout') samplesinfo = config.get('Directories', 'samplesinfo') sampleconf = BetterConfigParser() sampleconf.read(samplesinfo) prefix = sampleconf.get('General', 'prefix') info = ParseInfo(samples_path=pathIN, config=config) print "samplesinfo:", samplesinfo cross_sections = {} samples = [] for job in info: if not job.identifier in samples: if type(job.xsec) is list: job.xsec = job.xsec[0] cross_sections[job.identifier] = job.xsec samples.append(job.identifier) for sample in samples: print sample, "\t", cross_sections[sample] # print dir(job) # print "job.name:",job.name," job.cross_section:",job.xsec # print "job.prefix:",job.prefix # if not job.name in namelist:
def __init__(self, opts):
    """Set up a file-processing job from parsed command-line options.

    Reads the analysis config, locates the unique sample matching
    opts.sampleIdentifier and builds the list of sub-jobs (one per input
    file by default, or a single joined job with --join).
    (Python 2 code: uses print statements.)
    """
    # get file list
    self.filelist = FileList.decompress(opts.fileList) if len(opts.fileList) > 0 else None
    print "len(filelist)",len(self.filelist),
    if len(self.filelist) > 0:
        print "filelist[0]:", self.filelist[0]
    else:
        print ''

    # config
    self.debug = 'XBBDEBUG' in os.environ
    self.verifyCopy = True
    self.opts = opts
    self.config = BetterConfigParser()
    self.config.read(opts.config)
    self.channel = self.config.get('Configuration', 'channel')

    # load namespace, TODO
    VHbbNameSpace = self.config.get('VHbbNameSpace', 'library')
    ROOT.gSystem.Load(VHbbNameSpace)

    # directories
    self.pathIN = self.config.get('Directories', opts.inputDir)
    self.pathOUT = self.config.get('Directories', opts.outputDir)
    self.tmpDir = self.config.get('Directories', 'scratch')
    print 'INput samples:\t%s'%self.pathIN
    print 'OUTput samples:\t%s'%self.pathOUT
    self.fileLocator = FileLocator(config=self.config)

    # check if given sample identifier uniquely matches a samples from config
    matchingSamples = ParseInfo(samples_path=self.pathIN, config=self.config).find(identifier=opts.sampleIdentifier)
    if len(matchingSamples) != 1:
        print "ERROR: need exactly 1 sample identifier as input with -S !!"
        print matchingSamples
        exit(1)
    self.sample = matchingSamples[0]

    # collections
    self.collections = [x.strip() for x in opts.addCollections.split(',') if len(x.strip()) > 0] if len(opts.addCollections.strip())>0 else []
    if len(self.collections) < 1:
        print "\x1b[31mWARNING: no collections added! Specify the collections to add with the --addCollections option!\x1b[0m"
    print 'collections to add:', self.collections
    self.collections = self.parseCollectionList(self.collections)
    print 'after parsing:', self.collections

    # temorary folder to save the files of this job on the scratch
    temporaryName = self.sample.identifier + '/' + uuid.uuid4().hex

    # input files
    self.subJobs = []
    if opts.join:
        print("INFO: join input files! This is an experimental feature!")
        # translate naming convention of .txt file to imported files after the prep step
        inputFileNamesAfterPrep = [self.fileLocator.getFilenameAfterPrep(x) for x in self.filelist]
        # NOTE(review): the templates below contain the literal text "(unknown)"
        # where a {filename} placeholder would be expected; the unused filename=
        # keyword suggests the format string was mangled — confirm against VCS
        self.subJobs.append({
            'inputFileNames': self.filelist,
            'localInputFileNames': ["{path}/{subfolder}/(unknown)".format(path=self.pathIN, subfolder=self.sample.identifier, filename=localFileName) for localFileName in inputFileNamesAfterPrep],
            'outputFileName': "{path}/{subfolder}/(unknown)".format(path=self.pathOUT, subfolder=self.sample.identifier, filename=inputFileNamesAfterPrep[0]),
            'tmpFileName': "{path}/{subfolder}/(unknown)".format(path=self.tmpDir, subfolder=temporaryName, filename=inputFileNamesAfterPrep[0]),
        })
    else:
        # create separate subjob for all files (default!)
        for inputFileName in self.filelist:
            inputFileNamesAfterPrep = [self.fileLocator.getFilenameAfterPrep(inputFileName)]
            self.subJobs.append({
                'inputFileNames': [inputFileName],
                'localInputFileNames': ["{path}/{subfolder}/(unknown)".format(path=self.pathIN, subfolder=self.sample.identifier, filename=localFileName) for localFileName in inputFileNamesAfterPrep],
                'outputFileName': "{path}/{subfolder}/(unknown)".format(path=self.pathOUT, subfolder=self.sample.identifier, filename=inputFileNamesAfterPrep[0]),
                'tmpFileName': "{path}/{subfolder}/(unknown)".format(path=self.tmpDir, subfolder=temporaryName, filename=inputFileNamesAfterPrep[0]),
            })
# path=opts.path pathIN = config.get("Directories", "SYSin") pathOUT = config.get("Directories", "SYSout") tmpDir = os.environ["TMPDIR"] print "INput samples:\t%s" % pathIN print "OUTput samples:\t%s" % pathOUT # storagesamples = config.get('Directories','storagesamples') namelist = opts.names.split(",") # load info info = ParseInfo(samplesinfo, pathIN) def deltaPhi(phi1, phi2): result = phi1 - phi2 while result > math.pi: result -= 2 * math.pi while result <= -math.pi: result += 2 * math.pi return result def resolutionBias(eta): if eta < 0.5: return 0.052 if eta < 1.1:
train_list = (config.get('MVALists', 'List_for_submitscript')).split(',') print train_list for item in train_list: submit(item, repDict) if opts.task == 'dc': DC_vars = (config.get('LimitGeneral', 'List')).split(',') print DC_vars Plot_vars = [''] if opts.task == 'plot' or opts.task == 'singleplot' or opts.task == 'mergesingleplot' or opts.task == 'checksingleplot': Plot_vars = (config.get('Plot_general', 'List')).split(',') if not opts.task == 'prep': path = config.get("Directories", "samplepath") info = ParseInfo(samplesinfo, path) if opts.task == 'plot': repDict['queue'] = 'all.q' for item in Plot_vars: submit(item, repDict) if opts.task == 'trainReg': repDict['queue'] = 'all.q' submit('trainReg', repDict) elif opts.task == 'dc': repDict['queue'] = 'all.q' for item in DC_vars: # item here contains the dc name submit(item, repDict)
# Set rescale factor of 2 in case of TrainFlag if TrainFlag: MC_rescale_factor=1. #print 'I RESCALE BY 2.0' else: MC_rescale_factor = 1. #systematics up/down if doSYS == 'False': UD = [] else: UD = ['Up','Down'] #Parse samples configuration info = ParseInfo(samplesinfo,path) # get all the treeCut sets all_samples = info.get_samples(signals+backgrounds+additionals) signal_samples = info.get_samples(signals) background_samples = info.get_samples(backgrounds) data_sample_names = config.get('dc:%s'%var,'data').split(' ') data_samples = info.get_samples(data_sample_names) print '\n-----> Collecting all Samples...' print ' Signals : ', signals print ' Backgrounds : ', backgrounds print ' Data : ', data_sample_names #-------------------------------------------------------------------------------------------------
def __init__(self, config, mvaName):
    """Scikit-learn based MVA training helper.

    Reads sample lists, input variables, cuts and classifier hyperparameters
    for the MVA named `mvaName` from the analysis config.
    """
    # version tag of the numpy data representation handled by this class
    self.dataRepresentationVersion = 2
    self.config = config
    self.samplesPath = config.get('Directories', 'MVAin')
    self.samplesDefinitions = config.get('Directories','samplesinfo')
    self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
    self.sampleFilesFolder = config.get('Directories', 'samplefiles')
    self.logpath = config.get('Directories', 'logpath')
    self.treeVarSet = config.get(mvaName, 'treeVarSet')
    self.mvaName = mvaName
    self.MVAsettings = config.get(mvaName,'MVAsettings')
    self.factoryname = 'scikit-test1'
    # load shared library with analysis helper functions
    VHbbNameSpace = config.get('VHbbNameSpace', 'library')
    ROOT.gSystem.Load(VHbbNameSpace)

    # variables
    self.MVA_Vars = {}
    self.MVA_Vars['Nominal'] = config.get(self.treeVarSet, 'Nominal').strip().split(' ')

    # samples
    # NOTE(review): eval() on config values — a trusted config file is assumed
    self.backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
    self.signalSampleNames = eval(config.get(mvaName, 'signals'))
    self.samples = {
        'BKG': self.samplesInfo.get_samples(self.backgroundSampleNames),
        'SIG': self.samplesInfo.get_samples(self.signalSampleNames),
    }

    # MVA signal region cuts
    self.treeCutName = config.get(mvaName, 'treeCut')
    self.treeCut = config.get('Cuts', self.treeCutName)

    # split in train/test samples
    self.datasets = ['train', 'test']
    self.varsets = ['X', 'y', 'sample_weight']
    self.trainCut = config.get('Cuts', 'TrainCut')
    self.evalCut = config.get('Cuts', 'EvalCut')
    print("TRAINING CUT:", self.trainCut)
    print("TEST CUT:", self.evalCut)
    # presumably compensates the 50/50 train/test split — TODO confirm
    self.globalRescale = 2.0

    # default parameters
    self.parameters = {
        'factoryname': self.factoryname,
        'mvaName': self.mvaName,
        'MVAregionCut': self.treeCutName + ': ' + self.treeCut,
        #'classifier': 'GradientBoostingClassifier',
        'classifier': 'RandomForestClassifier',
        #'classifier': 'ExtraTreesClassifier',
        #'classifier': 'FT_GradientBoostingClassifier',
        'max_depth': None,
        'max_leaf_nodes': None,
        'class_weight': 'balanced',
        #'criterion': 'friedman_mse',
        'criterion': 'gini',
        #'n_estimators': 3000,
        'n_estimators': 400,
        #'learning_rate': 0.1,
        'algorithm': 'SAMME.R',
        #'min_samples_leaf': 100,
        'splitter': 'best',
        'max_features': 4,
        'subsample': 0.6,
        'limit': -1,
        'additional_signal_weight': 1.0,
        'min_impurity_split': 0.0,
        'bootstrap': True,
    }
    # load parameters from config in a format similar to Root TMVA parameter string
    # NOTE(review): eval() of each key=value right-hand side — trusted config assumed
    self.MVAsettingsEvaluated = []
    for mvaSetting in self.MVAsettings.split(':'):
        self.parameters[mvaSetting.split('=')[0].strip()] = eval(mvaSetting.split('=')[1].strip())
        try:
            self.MVAsettingsEvaluated.append('%s'%mvaSetting.split('=')[0].strip() + '=' + '%r'%self.parameters[mvaSetting.split('=')[0].strip()])
        except:
            # keep the raw setting string if re-formatting fails
            print("???:", mvaSetting)
            self.MVAsettingsEvaluated.append(mvaSetting)
    self.MVAsettingsEvaluated = ':'.join(self.MVAsettingsEvaluated)
class PlotHelper(object):
    """Produce stacked DATA/MC comparison plots for one analysis region.

    Reads the region/variable/sample definitions from the config, pulls the
    cached sample trees via TreeCache and fills one StackMaker per variable.
    Typical use: PlotHelper(...).prepare().run()
    """

    def __init__(self, config, region, vars = None, title=None, sampleIdentifier=None):
        # config:           BetterConfigParser-like configuration object
        # region:           plot region name, selects config section 'Plot:<region>'
        # vars:             optional list of variables; read from config if empty
        # title:            optional plot title (empty string treated as None)
        # sampleIdentifier: optional comma-separated list restricting the samples
        self.config = config
        self.region = region
        self.vars = vars
        self.title = title if title and len(title)>0 else None
        self.sampleIdentifiers = sampleIdentifier.split(',') if sampleIdentifier and len(sampleIdentifier) > 0 else None

        # VHbb namespace: load shared library with analysis helper functions
        VHbbNameSpace=config.get('VHbbNameSpace','library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print ("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"%returnCode)
        else:
            print ("INFO: loaded VHbbNameSpace: %s"%VHbbNameSpace)

        # input/output paths
        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')

        # plot regions
        self.configSection='Plot:%s'%region

        # variables
        if self.vars and type(self.vars) == list:
            self.vars = [x.strip() for x in self.vars if len(x.strip()) > 0]

        # if variables not specified in command line, read from config
        if not self.vars or len(self.vars) < 1:
            varListFromConfig = self.config.get(self.configSection, 'vars').split(',')
            print ("VARS::", self.configSection, " => ", varListFromConfig)
            self.vars = [x.strip() for x in varListFromConfig if len(x.strip()) > 0]

        # additional cut to only plot a subset of the region
        self.subcut = None
        if self.config.has_option(self.configSection, 'subcut'):
            self.subcut = self.config.get(self.configSection, 'subcut')
            print("INFO: use cut:", self.subcut)

        # additional global blinding cut:
        self.addBlindingCut = None
        if self.config.has_option('Plot_general','addBlindingCut'): #contained in plots, cut on the event number
            self.addBlindingCut = self.config.get('Plot_general','addBlindingCut')
            print ('adding add. blinding cut:', self.addBlindingCut)

        # load samples
        # NOTE(review): eval() on config values — a trusted config file is assumed
        self.data = eval(self.config.get(self.configSection, 'Datas')) # read the data corresponding to each CR (section)
        self.mc = eval(self.config.get('Plot_general', 'samples')) # read the list of mc samples
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        self.signalRegion = False
        if self.config.has_option(self.configSection, 'Signal'):
            self.mc.append(self.config.get(self.configSection, 'Signal'))
            self.signalRegion = True
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)

        # filter samples used in the plot
        if self.sampleIdentifiers:
            self.dataSamples = [x for x in self.dataSamples if x.identifier in self.sampleIdentifiers]
            self.mcSamples = [x for x in self.mcSamples if x.identifier in self.sampleIdentifiers]

        self.groupDict = eval(self.config.get('Plot_general', 'Group'))
        self.subcutPlotName = ''
        self.histogramStacks = {}

    def prepare(self):
        """Create one StackMaker per variable and add the cached DATA/MC trees.

        Returns self for chaining; raises Exception("CachedTreeMissing") when a
        sample tree is not found in the cache.
        """
        # FIX: was .format(region=region) — bare `region` is not defined in this
        # scope and raised NameError unless a same-named global happened to exist
        print ("INFO: starting plot for region \x1b[34m{region}\x1b[0m, variables:".format(region=self.region))
        for var in self.vars:
            print (" > {var}".format(var=var))

        self.histogramStacks = {}
        for var in self.vars:
            self.histogramStacks[var] = StackMaker(self.config, var, self.region, self.signalRegion, None, '_'+self.subcutPlotName, title=self.title)

        # add DATA + MC samples
        for sample in self.dataSamples + self.mcSamples:
            # cuts
            sampleCuts = [sample.subcut]
            if self.config.has_option('Cuts', self.region):
                sampleCuts.append(self.config.get('Cuts', self.region))
            if self.config.has_option(self.configSection, 'Datacut'):
                sampleCuts.append(self.config.get(self.configSection, 'Datacut'))
            if self.addBlindingCut:
                sampleCuts.append(self.addBlindingCut)

            # get sample tree from cache
            # FIX: was config=config (relied on a module-level global) — use the
            # config object stored on the instance
            tc = TreeCache.TreeCache(
                    sample=sample,
                    cutList=sampleCuts,
                    inputFolder=self.samplesPath,
                    config=self.config
                )
            sampleTree = tc.getTree()

            if sampleTree:
                groupName = self.groupDict[sample.name]
                print (" > found the tree, #entries = ", sampleTree.tree.GetEntries())
                print (" > group =", groupName)
                print (" > now adding the tree for vars=", self.vars)
                # add the sample tree for all the variables
                for var in self.vars:
                    self.histogramStacks[var].addSampleTree(sample=sample, sampleTree=sampleTree, groupName=groupName, cut=self.subcut if self.subcut else '1')
            else:
                print ("\x1b[31mERROR: sampleTree not available for ", sample,", run caching again!!\x1b[0m")
                raise Exception("CachedTreeMissing")
        return self

    def run(self):
        """Draw every stack; also shape-normalized ('comp_' prefix) if configured."""
        # draw
        for var in self.vars:
            self.histogramStacks[var].Draw(outputFolder=self.plotPath, prefix='{region}__{var}_'.format(region=self.region, var=var))
            if self.config.has_option('Plot_general', 'drawNormalizedPlots') and eval(self.config.get('Plot_general', 'drawNormalizedPlots')):
                self.histogramStacks[var].Draw(outputFolder=self.plotPath, prefix='comp_{region}__{var}_'.format(region=self.region, var=var), normalize=True)
        return self

    def getHistogramStack(self, var):
        """Return the StackMaker for `var`, or None if unknown/not prepared."""
        if var in self.vars and var in self.histogramStacks:
            return self.histogramStacks[var]
        else:
            return None
def __init__(self, config, region, sampleIdentifier=None, opts=None):
    """Plot helper __init__ (cache-based variant).

    Sets up paths, the 'Plot:<region>' config section and the filtered
    DATA/MC sample lists for one plot region.
    """
    self.config = config
    self.region = region
    self.sampleIdentifiers = sampleIdentifier.split(',') if sampleIdentifier and len(sampleIdentifier) > 0 else None

    # VHbb namespace: load shared library with analysis helper functions
    VHbbNameSpace = config.get('VHbbNameSpace', 'library')
    returnCode = ROOT.gSystem.Load(VHbbNameSpace)
    if returnCode != 0:
        print("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m" % returnCode)
    else:
        print("INFO: loaded VHbbNameSpace: %s" % VHbbNameSpace)

    # input/output paths
    self.fileLocator = FileLocator(config=self.config)
    self.pathIN = self.config.get('Directories', opts.inputDir)
    self.pathOUT = self.config.get('Directories', opts.outputDir)
    self.tmpDir = self.config.get('Directories', 'scratch')
    self.samplesPath = config.get('Directories', 'plottingSamples')
    self.samplesInfo = ParseInfo(samples_path=self.samplesPath, config=self.config)
    self.sampleFilesFolder = config.get('Directories', 'samplefiles')
    self.plotPath = config.get('Directories', 'plotpath')

    # plot regions
    self.configSection = 'Plot:%s' % region

    # additional cut to only plot a subset of the region
    self.subcut = None
    if self.config.has_option(self.configSection, 'subcut'):
        self.subcut = self.config.get(self.configSection, 'subcut')
        print("INFO: use cut:", self.subcut)

    # additional global blinding cut:
    self.addBlindingCut = None
    if self.config.has_option(
            'Plot_general', 'addBlindingCut'
    ): #contained in plots, cut on the event number
        self.addBlindingCut = self.config.get('Plot_general', 'addBlindingCut')
        print('adding add. blinding cut:', self.addBlindingCut)

    # load samples
    # NOTE(review): eval() on config values — a trusted config file is assumed
    self.data = eval(self.config.get(self.configSection, 'Datas')) # read the data corresponding to each CR (section)
    self.mc = eval(self.config.get('Plot_general', 'samples')) # read the list of mc samples
    self.total_lumi = eval(self.config.get('General', 'lumi'))
    self.signalRegion = False
    if self.config.has_option(self.configSection, 'Signal'):
        self.mc.append(self.config.get(self.configSection, 'Signal'))
        self.signalRegion = True
    self.dataSamples = self.samplesInfo.get_samples(self.data)
    self.mcSamples = self.samplesInfo.get_samples(self.mc)

    # filter samples used in the plot
    if self.sampleIdentifiers:
        self.dataSamples = [x for x in self.dataSamples if x.identifier in self.sampleIdentifiers]
        self.mcSamples = [x for x in self.mcSamples if x.identifier in self.sampleIdentifiers]
#path=opts.path pathIN = config.get('Directories','SYSin') pathOUT = config.get('Directories','SYSout') tmpDir = os.environ["TMPDIR"] print 'INput samples:\t%s'%pathIN print 'OUTput samples:\t%s'%pathOUT #storagesamples = config.get('Directories','storagesamples') namelist=opts.names.split(',') #load info info = ParseInfo(samplesinfo,pathIN) def deltaPhi(phi1, phi2): result = phi1 - phi2 while (result > math.pi): result -= 2*math.pi while (result <= -math.pi): result += 2*math.pi return result def resolutionBias(eta): if(eta< 0.5): return 0.052 if(eta< 1.1): return 0.057 if(eta< 1.7): return 0.096 if(eta< 2.3): return 0.134 if(eta< 5): return 0.28 return 0
class MvaTrainingHelper(object):
    """Steer a TMVA MVA training (ROOT 5 and ROOT 6 interfaces).

    Builds the TMVA factory, feeds it the cached signal/background trees,
    runs training/testing/evaluation and provides simple expected-significance
    estimates from the output trees.
    """

    def __init__(self, config, mvaName):
        # config:  BetterConfigParser-like configuration object
        # mvaName: config section name describing this MVA
        self.config = config
        self.factoryname = config.get('factory', 'factoryname')
        self.factorysettings = config.get('factory', 'factorysettings')
        # prefer a dedicated training-samples folder when configured
        if config.has_option('Directories', 'trainingSamples'):
            self.samplesPath = config.get('Directories', 'trainingSamples')
        else:
            self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath, config=self.config)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.MVAtype = config.get(mvaName, 'MVAtype')
        self.MVAsettings = config.get(mvaName,'MVAsettings')
        self.mvaName = mvaName
        # load shared library with analysis helper functions
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

        # variables
        self.MVA_Vars = {}
        self.MVA_Vars['Nominal'] = config.get(self.treeVarSet, 'Nominal').strip().split(' ')

        # samples
        # NOTE(review): eval() on config values — a trusted config file is assumed
        backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
        signalSampleNames = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(backgroundSampleNames),
            'SIG': self.samplesInfo.get_samples(signalSampleNames),
        }

        # MVA region cut; falls back to the section name itself
        self.treeCutName = config.get(mvaName, 'treeCut') if config.has_option(mvaName, 'treeCut') else mvaName
        self.treeCut = config.get('Cuts', self.treeCutName)
        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')
        print("TRAINING CUT:", self.TrainCut)
        print("EVAL CUT:", self.EvalCut)
        # presumably compensates the 50/50 train/eval split — TODO confirm
        self.globalRescale = 2.0

        self.trainingOutputFileName = 'mvatraining_{factoryname}_{region}.root'.format(factoryname=self.factoryname, region=mvaName)
        print("INFO: MvaTrainingHelper class created.")

    def prepare(self):
        """Create the TMVA factory and register all sample trees and variables.

        Returns self; raises on missing cached trees or unsupported special weights.
        """
        self.trainingOutputFile = ROOT.TFile.Open(self.trainingOutputFileName, "RECREATE")
        # ----------------------------------------------------------------------------------------------------------------------
        # create TMVA factory
        # ----------------------------------------------------------------------------------------------------------------------
        self.factory = ROOT.TMVA.Factory(self.factoryname, self.trainingOutputFile, self.factorysettings)
        if self.trainingOutputFile and self.factory:
            print ("INFO: initialized MvaTrainingHelper.", self.factory)
        else:
            print ("\x1b[31mERROR: initialization of MvaTrainingHelper failed!\x1b[0m")
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/eval trees
        # ----------------------------------------------------------------------------------------------------------------------
        # ROOT 5: trees are added to the factory directly; ROOT 6: via a DataLoader
        try:
            addBackgroundTreeMethod = self.factory.AddBackgroundTree
            addSignalTreeMethod = self.factory.AddSignalTree
            self.dataLoader = None
        except:
            print("oh no..")
            # the DataLoader wants to be called '.'
            self.dataLoader = ROOT.TMVA.DataLoader(".")
            addBackgroundTreeMethod = self.dataLoader.AddBackgroundTree
            addSignalTreeMethod = self.dataLoader.AddSignalTree

        if self.config.has_option('Weights','useSpecialWeight') and eval(self.config.get('Weights','useSpecialWeight')):
            print("\x1b[31mERROR: specialweight cannot be used with TMVA training, set it to false and add the DY_specialWeight to weightF!!\x1b[0m")
            raise Exception("SpecialWeightNotSupported")

        # DEBUG: restrict memory
        # resource.setrlimit(resource.RLIMIT_AS, (4.0*1024*1024*1024, 5.0*1024*1024*1024))

        self.sampleTrees = []
        for addTreeFcn, samples in [
                    [addBackgroundTreeMethod, self.samples['BKG']],
                    [addSignalTreeMethod, self.samples['SIG']]
                ]:
            for sample in samples:
                print ('*'*80,'\n%s\n'%sample,'*'*80)
                # each sample contributes a training and a testing tree
                for additionalCut in [self.TrainCut, self.EvalCut]:
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)
                    tc = TreeCache.TreeCache(
                        sample=sample,
                        cutList=sampleCuts,
                        inputFolder=self.samplesPath,
                        config=self.config,
                        debug=True
                    )
                    sampleTree = tc.getTree()
                    sampleTree.tree.SetCacheSize(32*1024)

                    # prevent garbage collection
                    self.sampleTrees.append(sampleTree)
                    if sampleTree:
                        treeScale = sampleTree.getScale(sample) * self.globalRescale
                        # only non-empty trees can be added
                        if sampleTree.tree.GetEntries() > 0:
                            addTreeFcn(sampleTree.tree, treeScale, ROOT.TMVA.Types.kTraining if additionalCut == self.TrainCut else ROOT.TMVA.Types.kTesting)
                            print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
                    else:
                        print ("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")
        # register the input variables (DataLoader in ROOT 6, factory in ROOT 5)
        if self.dataLoader:
            for var in self.MVA_Vars['Nominal']:
                self.dataLoader.AddVariable(var, 'D')
        else:
            for var in self.MVA_Vars['Nominal']:
                self.factory.AddVariable(var, 'D')
        return self

    # ----------------------------------------------------------------------------------------------------------------------
    # backup old .xml and .info files
    # ----------------------------------------------------------------------------------------------------------------------
    def backupOldFiles(self):
        """Copy existing weight/info files to a new weights/backup/v<N>/ folder."""
        success = False
        MVAdir = self.config.get('Directories','vhbbpath')+'/python/weights/'
        backupDir = MVAdir + 'backup/'
        try:
            os.makedirs(backupDir)
        except:
            pass
        freeNumber = 1
        try:
            # find highest existing v<N> folder and use the next number
            lastUsedBackupDirectories = sorted(glob.glob(backupDir + '/v*/'), key=lambda x: int(x.strip('/').split('/')[-1][1:]), reverse=True)
            freeNumber = 1 + int(lastUsedBackupDirectories[0].strip('/').split('/')[-1][1:]) if len(lastUsedBackupDirectories) > 0 else 1
        except Exception as e:
            print("\x1b[31mERROR: creating backup of MVA files failed!", e, "\x1b[0m")
            freeNumber = -1
        if freeNumber > -1:
            try:
                fileNamesToBackup = glob.glob(MVAdir + self.factoryname+'_'+self.mvaName + '.*')
                fileNamesToBackup += glob.glob(MVAdir + '/../mvatraining_MVA_ZllBDT_*.root')
                os.makedirs(backupDir + 'v%d/'%freeNumber)
                for fileNameToBackup in fileNamesToBackup:
                    shutil.copy(fileNameToBackup, backupDir + 'v%d/'%freeNumber)
                success = True
            except Exception as e:
                print("\x1b[31mERROR: creating backup of MVA files failed!", e, "\x1b[0m")
        return success

    def run(self):
        """Book the method and run TMVA train/test/evaluate; closes the output file."""
        backupFiles = False
        try:
            backupFiles = eval(self.config.get('MVAGeneral', 'backupWeights'))
        except:
            pass
        if backupFiles:
            print('backing up old BDT files')
            self.backupOldFiles()
        # ----------------------------------------------------------------------------------------------------------------------
        # Execute TMVA
        # ----------------------------------------------------------------------------------------------------------------------
        self.factory.Verbose()
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        print('Execute TMVA: factory.BookMethod("%s", "%s", "%s")'%(self.MVAtype, self.mvaName, self.MVAsettings))
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        weightF = self.config.get('Weights','weightF')
        # ROOT 5 booking first; the ROOT 6 signature needs the DataLoader argument
        try:
            self.factory.BookMethod(self.MVAtype, self.mvaName, self.MVAsettings)
            print("ROOT 5 style TMVA found")
            self.factory.SetSignalWeightExpression(weightF)
            self.factory.SetBackgroundWeightExpression(weightF)
        except:
            print("ROOT 6 style TMVA found, using data loader object!!! >_<")
            print(" weights dir:", ROOT.TMVA.gConfig().GetIONames().fWeightFileDir)
            print(" data loader:", self.dataLoader)
            print(" type: ", self.MVAtype)
            print(" name: ", self.mvaName)
            print(" settings: ", self.MVAsettings)
            ROOT.TMVA.gConfig().GetIONames().fWeightFileDir = 'weights'
            self.dataLoader.SetSignalWeightExpression(weightF)
            self.dataLoader.SetBackgroundWeightExpression(weightF)
            self.factory.BookMethod(self.dataLoader, self.MVAtype, self.mvaName, self.MVAsettings)
        sys.stdout.flush()
        print('Execute TMVA: TrainAllMethods')
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.factory.TrainAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: TestAllMethods')
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.factory.TestAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: EvaluateAllMethods')
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.factory.EvaluateAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: output.Write')
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.trainingOutputFile.Close()
        return self

    def printInfo(self):
        """Pickle the training metadata (.info file) next to the weight files."""
        #WRITE INFOFILE
        MVAdir = self.config.get('Directories','vhbbpath')+'/python/weights/'
        infofile = open(MVAdir+self.factoryname+'_'+self.mvaName+'.info','w')
        print ('@DEBUG: output infofile name')
        print (infofile)
        info=mvainfo(self.mvaName)
        info.factoryname=self.factoryname
        info.factorysettings=self.factorysettings
        info.MVAtype=self.MVAtype
        info.MVAsettings=self.MVAsettings
        info.weightfilepath=MVAdir
        info.path=self.samplesPath
        info.varset=self.treeVarSet
        info.vars=self.MVA_Vars['Nominal']
        pickle.dump(info,infofile)
        infofile.close()

    def getExpectedSignificance(self, tree, nBins, xMin, xMax, power=1.0, rescaleSig=1.0, rescaleBkg=1.0):
        """Histogram the classifier output and compute Z = sqrt(sum_i S_i^2/(S_i+B_i)).

        tree:     TMVA test/train tree with branches <mvaName>, weight, classID
        power:    optional power-law rescaling of the BDT score before binning
        Returns (expectedSignificance, signalSum, backgroundSum).
        """
        hSIG = ROOT.TH1D("hSig","hSig",nBins,xMin,xMax)
        hBKG = ROOT.TH1D("hBkg","hBkg",nBins,xMin,xMax)
        print("INFO: GetEntries() = ", tree.GetEntries())
        if power != 1.0:
            print("INFO: rescale BDT score with power ", power)
        for event in tree:
            if power != 1.0:
                # map score to [0,1), apply the power, map back to [xMin,xMax)
                x = (getattr(event, self.mvaName)-xMin)/(xMax-xMin)
                if x<0:
                    x=0
                if x>0.999999:
                    x=0.999999
                value = math.pow(x, power)*(xMax-xMin)+xMin
            else:
                # clamp into the histogram range
                value = max(min(getattr(event, self.mvaName),xMax-0.00001),xMin)
            weight = event.weight
            if event.classID == 1:
                hSIG.Fill(value, weight * rescaleSig)
            else:
                hBKG.Fill(value, weight * rescaleBkg)
        ssbSum = 0.0
        sSum = 0
        bSum = 0
        sbTableFormat = "{bin: <16}{signal: <16}{background: <16}{ssb: <16}"
        print("---- nBins =", nBins, " from ", xMin, "..", xMax, "-----")
        print(sbTableFormat.format(bin="bin", signal="signal", background="background", ssb="S/sqrt(S+B)"))
        for i in range(nBins):
            ssbSum += hSIG.GetBinContent(1+i)*hSIG.GetBinContent(1+i)/(hSIG.GetBinContent(1+i) + hBKG.GetBinContent(1+i)) if (hSIG.GetBinContent(1+i) + hBKG.GetBinContent(1+i)) > 0 else 0
            sSum += hSIG.GetBinContent(1+i)
            bSum += hBKG.GetBinContent(1+i)
            ssb = hSIG.GetBinContent(1+i)/math.sqrt(hSIG.GetBinContent(1+i) + hBKG.GetBinContent(1+i)) if (hSIG.GetBinContent(1+i) + hBKG.GetBinContent(1+i)) > 0 else 0
            print(sbTableFormat.format(bin=i, signal=round(hSIG.GetBinContent(1+i),2), background=round(hBKG.GetBinContent(1+i),2), ssb=round(ssb,3)))
        expectedSignificance = math.sqrt(ssbSum)
        print(sbTableFormat.format(bin="SUM", signal=round(sSum,1), background=round(bSum,1), ssb="\x1b[34mZ=%1.3f\x1b[0m"%expectedSignificance))
        print("-"*40)
        hSIG.Delete()
        hBKG.Delete()
        return expectedSignificance, sSum, bSum

    def estimateExpectedSignificance(self):
        """Scan several binnings/score transforms of the TMVA output trees.

        (continues beyond this chunk)
        """
        print("INFO: open ", self.trainingOutputFileName)
        rootFile = ROOT.TFile.Open(self.trainingOutputFileName, "READ")
        print("INFO: ->", rootFile)
        testTree = rootFile.Get('./TestTree')
        # run a few tests with different binnings and rescaling of BDT score
        self.getExpectedSignificance(testTree, 15, -0.8, 1.0)
        self.getExpectedSignificance(testTree, 15, -0.8, 0.9)
        self.getExpectedSignificance(testTree, 15, -0.8, 0.8)
        self.getExpectedSignificance(testTree, 15, -0.8, 0.75)
# ---- continuation of MvaTrainingHelper methods (class header defined above) ----
# tail of estimateExpectedSignificance:
self.getExpectedSignificance(testTree, 15, -0.8, 0.7)
self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=0.5)
self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=0.33)
self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=1.5)
self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=2.0)
# close to nominal binning
print("---- ~nominal TEST -----")
esTest, sTest, bTest = self.getExpectedSignificance(testTree, 15, -0.8, 0.8)
print("---- ~nominal TRAINING (without correct normalization) -----")
trainTree = rootFile.Get('./TrainTree')
esTrain, sTrain, bTrain = self.getExpectedSignificance(trainTree, 15, -0.8, 0.8)
# the tree ./TrainTree contains the input events for training AFTER re-balancing the classes
# therefore for SIG/BKG separately the normalization is fixed to the one of the TEST events
rescaleSig = 1.0*sTest/sTrain
rescaleBkg = 1.0*bTest/bTrain
print("---- ~nominal TRAINING -----")
trainTree = rootFile.Get('./TrainTree')
esTrain, sTrain, bTrain = self.getExpectedSignificance(trainTree, 15, -0.8, 0.8, rescaleSig=rescaleSig, rescaleBkg=rescaleBkg)

def getbdtHistogram(self, tree):
    """Return [hSIG, hBKG]: unweighted classifier-output histograms (40 bins, [-1,1])."""
    hSIG = ROOT.TH1D("hSig","TMVA overtraining check for classifier: %s"%self.mvaName,40,-1,1)
    hBKG = ROOT.TH1D("hBkg","TMVA overtraining check for classifier: %s"%self.mvaName,40,-1,1)
    print("INFO: GetEntries() = ", tree.GetEntries())
    for event in tree:
        value = getattr(event, self.mvaName)
        if event.classID == 1:
            hSIG.Fill(value)
        else:
            hBKG.Fill(value)
    return [hSIG, hBKG]

def setTMVASyle(self):
    """Apply the classic TMVA plotting style to the overtraining histograms.

    (Note: method name typo 'Syle' kept — callers elsewhere may use it.)
    """
    # style
    self.hSIGtest.SetLineColor(ROOT.TColor.GetColor("#0000ee"))
    self.hSIGtest.SetLineWidth(1)
    self.hSIGtest.SetFillStyle(1001)
    self.hSIGtest.SetFillColor(ROOT.TColor.GetColor("#7d99d1"))
    self.hSIGtest.SetTitle("TMVA overtraining check for classifier: %s"%self.mvaName )
    self.hBKGtest.SetLineColor(ROOT.TColor.GetColor("#ff0000"))
    self.hBKGtest.SetLineWidth(1)
    self.hBKGtest.SetFillStyle(3554)
    self.hBKGtest.SetFillColor(ROOT.TColor.GetColor("#ff0000"))
    self.hBKGtest.SetTitle(self.hSIGtest.GetTitle())
    self.hSIGtrain.SetMarkerColor(self.hSIGtest.GetLineColor())
    self.hSIGtrain.SetMarkerSize(0.7)
    self.hSIGtrain.SetMarkerStyle(20)
    self.hSIGtrain.SetLineWidth(1)
    self.hSIGtrain.SetLineColor(self.hSIGtest.GetLineColor())
    self.hSIGtrain.SetTitle(self.hSIGtest.GetTitle())
    self.hBKGtrain.SetMarkerColor(self.hBKGtest.GetLineColor())
    self.hBKGtrain.SetMarkerSize(0.7)
    self.hBKGtrain.SetMarkerStyle(20)
    self.hBKGtrain.SetLineWidth(1)
    self.hBKGtrain.SetLineColor(self.hBKGtest.GetLineColor())
    self.hBKGtrain.SetTitle(self.hSIGtest.GetTitle())
    # global ROOT style, ported from TMVA's tmvaglob.C
    TMVAStyle = ROOT.TStyle(ROOT.gROOT.GetStyle("Plain"))# // our style is based on Plain
    TMVAStyle.SetName("TMVA")
    TMVAStyle.SetTitle("TMVA style based on \"Plain\" with modifications defined in tmvaglob.C")
    ROOT.gROOT.GetListOfStyles().Add(TMVAStyle)
    ROOT.gROOT.SetStyle("TMVA")
    TMVAStyle.SetLineStyleString( 5, "[52 12]" )
    TMVAStyle.SetLineStyleString( 6, "[22 12]" )
    TMVAStyle.SetLineStyleString( 7, "[22 10 7 10]" )
    UsePaperStyle = False
    #// the pretty color palette of old
    TMVAStyle.SetPalette((18 if UsePaperStyle else 1))
    #// use plain black on white colors
    TMVAStyle.SetFrameBorderMode(0)
    TMVAStyle.SetCanvasBorderMode(0)
    TMVAStyle.SetPadBorderMode(0)
    TMVAStyle.SetPadColor(0)
    TMVAStyle.SetFillStyle(0)
    TMVAStyle.SetLegendBorderSize(0)
    c_TitleBox = ROOT.TColor.GetColor( "#5D6B7D" )
    c_TitleText = ROOT.TColor.GetColor( "#FFFFFF" )
    c_TitleBorder = ROOT.TColor.GetColor( "#7D8B9D" )
    c_FrameFill = ROOT.TColor.GetColor( "#fffffd" )
    c_Canvas = ROOT.TColor.GetColor( "#f0f0f0" )
    TMVAStyle.SetTitleFillColor( c_TitleBox )
    TMVAStyle.SetTitleTextColor( c_TitleText )
    TMVAStyle.SetTitleBorderSize( 1 )
    TMVAStyle.SetLineColor( c_TitleBorder )
    if not UsePaperStyle:
        TMVAStyle.SetFrameFillColor( c_FrameFill )
        TMVAStyle.SetCanvasColor( c_Canvas )
    #// set the paper & margin sizes
    TMVAStyle.SetPaperSize(20,26)
    TMVAStyle.SetPadTopMargin(0.10)
    TMVAStyle.SetPadRightMargin(0.05)
    TMVAStyle.SetPadBottomMargin(0.11)
    TMVAStyle.SetPadLeftMargin(0.12)
    #// use bold lines and markers
    TMVAStyle.SetMarkerStyle(21)
    TMVAStyle.SetMarkerSize(0.3)
    TMVAStyle.SetHistLineWidth(2)
    TMVAStyle.SetLineStyleString(2,"[12 12]")
    #// postscript dashes
    #// do not display any of the standard histogram decorations
    TMVAStyle.SetOptTitle(1)
    TMVAStyle.SetTitleH(0.052)
    TMVAStyle.SetOptStat(0)
    TMVAStyle.SetOptFit(0)
    #// put tick marks on top and RHS of plots
    TMVAStyle.SetPadTickX(1)
    TMVAStyle.SetPadTickY(1)

def nomrmaliseHist(self,hSIG, hBKG):
    # Normalize histograms to unit area per bin width (in place).
    # NOTE(review): name typo ('nomrmalise') kept — drawOvertraining calls it
    if (hSIG.GetSumw2N() == 0):
        hSIG.Sumw2()
    if (hBKG and hBKG.GetSumw2N() == 0):
        hBKG.Sumw2()
    if(hSIG.GetSumOfWeights()!=0):
        dx = (hSIG.GetXaxis().GetXmax() - hSIG.GetXaxis().GetXmin())/hSIG.GetNbinsX()
        hSIG.Scale(1.0/hSIG.GetSumOfWeights()/dx)
    if (hBKG != 0 and hBKG.GetSumOfWeights()!=0):
        dx = (hBKG.GetXaxis().GetXmax() - hBKG.GetXaxis().GetXmin())/hBKG.GetNbinsX()
        hBKG.Scale( 1.0/hBKG.GetSumOfWeights()/dx )

def drawOvertraining(self):
    """Draw the train/test overtraining-check comparison.

    (this definition continues beyond this chunk and is cut off here)
    """
    #normalise histograms
    self.nomrmaliseHist(self.hSIGtest, self.hBKGtest)
    self.nomrmaliseHist(self.hSIGtrain, self.hBKGtrain)
    c = ROOT.TCanvas("canvas1", "TMVA comparison %s"%self.mvaName, 0, 200, 600, 468)
    # frame limits (choose judicuous x range)
    nrms = 10
    xmin = ROOT.TMath.Max(ROOT.TMath.Min(self.hSIGtest.GetMean() - nrms*self.hSIGtest.GetRMS(), self.hBKGtest.GetMean() - nrms*self.hBKGtest.GetRMS() ),self.hSIGtest.GetXaxis().GetXmin() )
    xmax = ROOT.TMath.Min(ROOT.TMath.Max(self.hSIGtest.GetMean() + nrms*self.hSIGtest.GetRMS(), self.hBKGtest.GetMean() + nrms*self.hBKGtest.GetRMS()), self.hSIGtest.GetXaxis().GetXmax())
    ymin = 0
    maxMult = 1.3
    #maxMult = (htype == CompareType) ? 1.3 : 1.2
    ymax = ROOT.TMath.Max(self.hSIGtest.GetMaximum(), self.hBKGtest.GetMaximum())*maxMult
    ymax = ROOT.TMath.Max(ymax,ROOT.TMath.Max(self.hSIGtrain.GetMaximum(), self.hBKGtrain.GetMaximum())*maxMult)
    #print ('ymax is', ymax)
    #print (self.hSIGtest.GetMaximum())
    #print (self.hBKGtest.GetMaximum())
    #print (self.hSIGtrain.GetMaximum())
    #print (self.hBKGtrain.GetMaximum())
    #sys.exit()
    # build a frame
    nb = 500
    hFrameName = "frame" + self.mvaName
    #o = ROOT.gROOT.FindObject(hFrameName)
    frame = ROOT.TH2F(hFrameName, self.hSIGtest.GetTitle(), nb, xmin, xmax, nb, ymin, ymax )
    frame.GetXaxis().SetTitle(self.mvaName + " response")
    frame.GetYaxis().SetTitle("(1/N) dN^{ }/^{ }dx")
    #TMVAGlob.SetFrameStyle( frame )
    frame.SetLabelOffset( 0.012, "X" )
    frame.SetLabelOffset( 0.012, "Y" )
    frame.GetXaxis().SetTitleOffset( 1.25 )
    frame.GetYaxis().SetTitleOffset( 1.22 )
    frame.GetXaxis().SetTitleSize( 0.045)
    frame.GetYaxis().SetTitleSize( 0.045)
    frame.GetXaxis().SetLabelSize( 0.04)
    frame.GetYaxis().SetLabelSize( 0.04)
    #// global style settings
    ROOT.gPad.SetTicks()
    ROOT.gPad.SetLeftMargin ( 0.108)
    ROOT.gPad.SetRightMargin ( 0.050)
    ROOT.gPad.SetBottomMargin( 0.120)
    # eventually: draw the frame
    frame.Draw()
    c.GetPad(0).SetLeftMargin(0.105 )
    frame.GetYaxis().SetTitleOffset( 1.2 )
    # Draw legend
    legend = ROOT.TLegend(c.GetLeftMargin(), 1 - c.GetTopMargin() - 0.12, c.GetLeftMargin() + 0.40, 1 - c.GetTopMargin() )
    legend.SetFillStyle(1)
    legend.AddEntry(self.hSIGtest,"Signal" + " (test sample)", "F")
    legend.AddEntry(self.hBKGtest,"Background" + " (test sample)", "F")
    legend.SetBorderSize(1)
    legend.SetMargin(0.2)
    legend.Draw("same")
    legend2= ROOT.TLegend( 1 - c.GetRightMargin() - 0.42, 1 - c.GetTopMargin() - 0.12, 1 - c.GetRightMargin(), 1 - c.GetTopMargin() )
    legend2.SetFillStyle(1)
    legend2.SetBorderSize(1)
    legend2.AddEntry(self.hSIGtrain,"Signal (training sample)","P")
    legend2.AddEntry(self.hBKGtrain,"Background (training sample)","P")
    legend2.SetMargin( 0.1 )
    legend2.Draw("same")
self.setTMVASyle() self.hSIGtest.Draw('samehist') self.hBKGtest.Draw('samehist') self.hSIGtrain.Draw('e1same') self.hBKGtrain.Draw('e1same') #perform K-S test print("--- Perform Kolmogorov-Smirnov tests") #//Double_t kolS = sig->KolmogorovTest( self.hSIGtrain, "X" ); #//Double_t kolB = bgd->KolmogorovTest( bgdOv, "X" ); kolS = self.hSIGtest.KolmogorovTest( self.hSIGtrain); kolB = self.hBKGtest.KolmogorovTest( self.hBKGtrain); print ("--- Goodness of signal (background) consistency: " + str(kolS) + " (" + str(kolB) + ")") probatext = "Kolmogorov-Smirnov test: signal (background) probability = % 5.3g (%5.3g)"% (kolS, kolB) tt = ROOT.TText(0.12, 0.74, probatext) tt.SetNDC() tt.SetTextSize(0.032) tt.AppendPad() # redraw axes frame.Draw("sameaxis") #/text for overflows nbin = self.hSIGtest.GetNbinsX() dxu = self.hSIGtest.GetBinWidth(0) dxo = self.hSIGtest.GetBinWidth(nbin+1) uoflow = "U/O-flow (S,B): (%.1f, %.1f)%% / (%.1f, %.1f)%%"% (self.hSIGtest.GetBinContent(0)*dxu*100, self.hBKGtest.GetBinContent(0)*dxu*100, self.hSIGtest.GetBinContent(nbin+1)*dxo*100, self.hBKGtest.GetBinContent(nbin+1)*dxo*100) t = ROOT.TText( 0.975, 0.115, uoflow ) t.SetNDC() t.SetTextSize( 0.030 ) t.SetTextAngle( 90 ) t.AppendPad() # update canvas c.Update() MVAdir = self.config.get('Directories','vhbbpath')+'/python/weights/' c.SaveAs(MVAdir+'overtraining%s.pdf'%self.mvaName) print ('I saved the canvase in', MVAdir+'overtraining%s.pdf'%self.mvaName) def saveOvertrainingPlots(self): print("INFO: open ", self.trainingOutputFileName) rootFile = ROOT.TFile.Open(self.trainingOutputFileName, "READ") print("INFO: ->", rootFile) self.hSIGtest = rootFile.Get('./Method_%s/%s/MVA_%s_S'%(self.mvaName,self.mvaName,self.mvaName)) self.hBKGtest = rootFile.Get('./Method_%s/%s/MVA_%s_B'%(self.mvaName,self.mvaName,self.mvaName)) self.hSIGtrain = rootFile.Get('./Method_%s/%s/MVA_%s_Train_S'%(self.mvaName,self.mvaName,self.mvaName)) self.hBKGtrain = 
rootFile.Get('./Method_%s/%s/MVA_%s_Train_B'%(self.mvaName,self.mvaName,self.mvaName)) print("./Method_%s/%s/MVA_%s_Train_B"%(self.mvaName,self.mvaName,self.mvaName)) self.drawOvertraining()
class CacheTraining(object):
    """Produce skimmed TreeCache files for MVA training/evaluation.

    For one sampleIdentifier, caches every (sub)sample x training region x
    {TrainCut, EvalCut} combination, keeping only the branches needed as MVA
    inputs and weight systematics.
    """

    def __init__(self, config, sampleIdentifier, trainingRegions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, force=False):
        """Read sample/region/variable definitions from the config.

        :param config: BetterConfigParser-like config object
        :param sampleIdentifier: identifier of the input sample files to cache
        :param trainingRegions: list of config section names, one per region
        :param splitFilesChunks: number of chunks the file list is split into
        :param chunkNumber: 1-based index of the chunk to process
        :param splitFilesChunkSize: files per chunk (-1 = no limit)
        :param force: if True, delete and re-create existing caches
        """
        self.config = config
        self.force = force
        self.sampleIdentifier = sampleIdentifier
        self.trainingRegions = trainingRegions
        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'MVAin')
        self.samplesDefinitions = self.config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')
        # union of backgrounds/signals over all training regions (config values
        # are python-list strings, hence eval)
        self.backgroundSampleNames = list(set(sum([eval(self.config.get(trainingRegion, 'backgrounds')) for trainingRegion in self.trainingRegions], [])))
        self.signalSampleNames = list(set(sum([eval(self.config.get(trainingRegion, 'signals')) for trainingRegion in self.trainingRegions], [])))
        self.samples = self.samplesInfo.get_samples(list(set(self.backgroundSampleNames + self.signalSampleNames)))
        # per-region dict: selection cut, nominal MVA input variables and
        # up/down weight expressions for each configured systematic
        self.trainingRegionsDict = {}
        for trainingRegion in self.trainingRegions:
            treeCutName = config.get(trainingRegion, 'treeCut')
            treeVarSet = config.get(trainingRegion, 'treeVarSet').strip()
            #systematics = [x for x in config.get('systematics', 'systematics').split(' ') if len(x.strip())>0]
            systematics = eval(config.get(trainingRegion, 'systematics')) if config.has_option(trainingRegion, 'systematics') else []
            mvaVars = config.get(treeVarSet, 'Nominal').split(' ')
            weightVars = []
            #for systematic in systematics:
            for syst in systematics:
                # weight sections use either _UP/_DOWN or _Up/_Down suffixes
                systNameUp = syst+'_UP' if self.config.has_option('Weights',syst+'_UP') else syst+'_Up'
                systNameDown = syst+'_DOWN' if self.config.has_option('Weights',syst+'_DOWN') else syst+'_Down'
                weightVars += [self.config.get('Weights',systNameUp), self.config.get('Weights',systNameDown)]
            self.trainingRegionsDict[trainingRegion] = {
                'cut': config.get('Cuts', treeCutName),
                'vars': mvaVars,
                'weightVars': weightVars,
            }
        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')
        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.splitFilesChunkSize = splitFilesChunkSize
        VHbbNameSpace=config.get('VHbbNameSpace','library')
        ROOT.gSystem.Load(VHbbNameSpace)

    def printInfo(self):
        """Print the cut string configured for each training region."""
        print ("REGION:".ljust(24),"CUT:")
        for trainingRegion,trainingRegionInfo in self.trainingRegionsDict.iteritems():
            print (" > ",trainingRegion.ljust(20), trainingRegionInfo['cut'])

    def run(self):
        """Create the caches for this sample chunk and process the tree once."""
        # ----------------------------------------------------------------------------------------------------------------------
        # cache samples
        # ----------------------------------------------------------------------------------------------------------------------
        for sampleToCache in [self.sampleIdentifier]:
            print ('*'*80)
            print (' ',sampleToCache)
            print ('*'*80)
            # prepare caches for training and evaluation samples
            treeCaches = []
            self.sampleTree = None
            # use all (sub)samples which come from the same files (sampleIdentifier)
            subsamples = [x for x in self.samples if x.identifier == sampleToCache]
            # list of branches to keep for use as MVA input variables
            # NOTE(review): the additionalCut loop variable is unused here, so the
            # same vars/weightVars are added twice; BranchList presumably
            # de-duplicates — verify
            branchListOfMVAVars = BranchList()
            for sample in subsamples:
                for trainingRegion,trainingRegionInfo in self.trainingRegionsDict.iteritems():
                    for additionalCut in [self.TrainCut, self.EvalCut]:
                        branchListOfMVAVars.addCut(trainingRegionInfo['vars'])
                        for weightVar in trainingRegionInfo['weightVars']:
                            branchListOfMVAVars.addCut(weightVar)
            branchListOfMVAVars.addCut(self.config.get('Weights', 'weightF'))
            mvaBranches = branchListOfMVAVars.getListOfBranches()
            # loop over all samples
            for sample in subsamples:
                # add cuts for all training regions
                for trainingRegion,trainingRegionInfo in self.trainingRegionsDict.iteritems():
                    # add cuts for training and evaluation
                    for additionalCut in [self.TrainCut, self.EvalCut]:
                        # cuts
                        sampleCuts = [sample.subcut]
                        if additionalCut:
                            sampleCuts.append(additionalCut)
                        if trainingRegionInfo['cut']:
                            sampleCuts.append(trainingRegionInfo['cut'])
                        # add cache object
                        tc = TreeCache.TreeCache(
                            name='{region}_{sample}_{tr}'.format(region=trainingRegion, sample=sample.name, tr='TRAIN' if additionalCut==self.TrainCut else 'EVAL'),
                            sample=sample.name,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            splitFilesChunks=self.splitFilesChunks,
                            chunkNumber=self.chunkNumber,
                            splitFilesChunkSize=self.splitFilesChunkSize,
                            branches=mvaBranches,
                            config=self.config,
                            debug=True
                        )
                        # check if this part of the sample is already cached
                        isCached = tc.partIsCached()
                        if not isCached or self.force:
                            if isCached:
                                tc.deleteCachedFiles(chunkNumber=self.chunkNumber)
                            # for the first sample which comes from this files, load the tree
                            if not self.sampleTree:
                                self.sampleTree = SampleTree({'name': sample.identifier, 'folder': self.samplesPath}, splitFilesChunkSize=self.splitFilesChunkSize, chunkNumber=self.chunkNumber, config=self.config, saveMemory=True)
                            treeCaches.append(tc.setSampleTree(self.sampleTree).cache())
            if len(treeCaches) > 0:
                # run on the tree: all registered caches are filled in one pass
                self.sampleTree.process()
            else:
                print ("nothing to do!")
class SampleTreesToNumpyConverter(object):
    """Convert cached ROOT sample trees into numpy arrays for sklearn/keras.

    Produces a gzipped pickle ('<mvaName>.dmpz') with train/test feature
    matrices, targets, per-event weights, systematic variations of the
    features (train set only) and a metadata dict.
    """

    def __init__(self, config, mvaName):
        """Read region cut, variable sets, systematics and sample lists from config.

        :param config: BetterConfigParser-like config object
        :param mvaName: name of the MVA config section to convert
        """
        self.mvaName = mvaName
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
        self.dataFormatVersion = 2
        self.sampleTrees = []
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        # region
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)
        # split in train/eval sets
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')
        # rescale MC by 2 because of train/eval split
        self.globalRescale = 2.0
        # variables and systematics
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.systematics = config.get('systematics', 'systematics').strip().split(' ')
        self.MVA_Vars = {'Nominal': [x for x in config.get(self.treeVarSet, 'Nominal').strip().split(' ') if len(x.strip()) > 0]}
        # NOTE(review): loop variable 'sys' shadows the sys module inside __init__
        for sys in self.systematics:
            self.MVA_Vars[sys] = [x for x in config.get(self.treeVarSet, sys).strip().split(' ') if len(x.strip()) > 0]
        # samples: category label -> list of sample names (config values are
        # python-list strings, hence eval)
        self.sampleNames = {
            # 'BKG_TT': eval(self.config.get('Plot_general', 'TT')),
            # 'BKG_ST': eval(self.config.get('Plot_general', 'ST')),
            # 'BKG_VV': eval(self.config.get('Plot_general', 'VV')),
            # 'BKG_DY2b': eval(self.config.get('Plot_general', 'DY2b')),
            # 'BKG_DY1b': eval(self.config.get('Plot_general', 'DY1b')),
            # 'BKG_DY0b': eval(self.config.get('Plot_general', 'DYlight')),
            # 'SIG_ggZH': eval(self.config.get('Plot_general', 'ggZH')),
            # 'SIG_qqZH': eval(self.config.get('Plot_general', 'qqZH')),
            'SIG_ALL': eval(self.config.get('Plot_general', 'allSIG')),
            'BKG_ALL': eval(self.config.get('Plot_general', 'allBKG')),
        }
        self.samples = {category: self.samplesInfo.get_samples(samples) for category,samples in self.sampleNames.iteritems()}

    def run(self):
        """Fill the numpy arrays from the cached trees and write the output file.

        Raises Exception("CachedTreeMissing") if a sample tree is not cached.
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/testing trees
        # ----------------------------------------------------------------------------------------------------------------------
        categories = self.samples.keys()
        datasetParts = {'train': self.trainCut, 'test': self.evalCut}
        systematics = self.systematics
        arrayLists = {datasetName:[] for datasetName in datasetParts.iterkeys()}
        arrayLists_sys = {x: {datasetName:[] for datasetName in datasetParts.iterkeys()} for x in systematics}
        weightLists = {datasetName:[] for datasetName in datasetParts.iterkeys()}
        targetLists = {datasetName:[] for datasetName in datasetParts.iterkeys()}
        # standard weight expression
        weightF = self.config.get('Weights','weightF')
        for category in categories:
            for sample in self.samples[category]:
                print ('*'*80,'\n%s\n'%sample,'*'*80)
                for datasetName, additionalCut in datasetParts.iteritems():
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)
                    # get ROOT tree for selected sample & region cut
                    tc = TreeCache.TreeCache(
                        sample=sample,
                        cutList=sampleCuts,
                        inputFolder=self.samplesPath,
                        config=self.config,
                        debug=True
                    )
                    sampleTree = tc.getTree()
                    if sampleTree:
                        treeScale = sampleTree.getScale(sample) * self.globalRescale
                        print ('scale:', treeScale)
                        # initialize numpy array
                        nSamples = sampleTree.GetEntries()
                        features = self.MVA_Vars['Nominal']
                        features_sys = {x: self.MVA_Vars[x] for x in systematics}
                        nFeatures = len(features)
                        print('nFeatures:', nFeatures)
                        inputData = np.zeros((nSamples, nFeatures), dtype=np.float32)
                        inputData_sys = {x: np.zeros((nSamples, nFeatures), dtype=np.float32) for x in systematics}
                        # initialize formulas for ROOT tree
                        for feature in features:
                            sampleTree.addFormula(feature)
                        for k, features_s in features_sys.iteritems():
                            for feature in features_s:
                                sampleTree.addFormula(feature)
                        sampleTree.addFormula(weightF)
                        # fill numpy array from ROOT tree
                        for i, event in enumerate(sampleTree):
                            for j, feature in enumerate(features):
                                inputData[i, j] = sampleTree.evaluate(feature)
                            # total weight comes from weightF (btag, lepton sf, ...) and treeScale to scale MC to x-section
                            totalWeight = treeScale * sampleTree.evaluate(weightF)
                            weightLists[datasetName].append(totalWeight)
                            targetLists[datasetName].append(categories.index(category))
                            # fill systematics
                            for k, feature_s in features_sys.iteritems():
                                for j, feature in enumerate(feature_s):
                                    inputData_sys[k][i,j] = sampleTree.evaluate(feature)
                        arrayLists[datasetName].append(inputData)
                        for sys in systematics:
                            arrayLists_sys[sys][datasetName].append(inputData_sys[sys])
                    else:
                        print ("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")
        # concatenate all data from different samples
        self.data = {
            'train': {
                'X': np.concatenate(arrayLists['train'], axis=0),
                'y': np.array(targetLists['train'], dtype=np.float32),
                'sample_weight': np.array(weightLists['train'], dtype=np.float32),
            },
            'test': {
                'X': np.concatenate(arrayLists['test'], axis=0),
                'y': np.array(targetLists['test'], dtype=np.float32),
                'sample_weight': np.array(weightLists['test'], dtype=np.float32),
            },
            'category_labels': {idx: label for idx, label in enumerate(categories)},
            'meta': {
                'version': self.dataFormatVersion,
                'region': self.mvaName,
                'cutName': self.treeCutName,
                'cut': self.treeCut,
                'trainCut': self.trainCut,
                'testCut': self.evalCut,
                'samples': self.sampleNames,
                'weightF': weightF,
                'variables': ' '.join(self.MVA_Vars['Nominal'])
            }
        }
        # add systematics variations
        # NOTE(review): only the 'train' set gets X_<sys> arrays; the computed
        # 'test' variations are dropped — confirm this is intended
        for sys in systematics:
            self.data['train']['X_'+sys] = np.concatenate(arrayLists_sys[sys]['train'], axis=0)
        numpyOutputFileName = './' + self.mvaName + '.dmpz'
        with gzip.open(numpyOutputFileName, 'wb') as outputFile:
            pickle.dump(self.data, outputFile)
        print(self.data['meta'])
        print("written to:\x1b[34m", numpyOutputFileName, " \x1b[0m")
# --- training-script setup fragment (Python 2); 'backgrounds', 'config', 'run',
# 'samplesinfo', 'path' and 'TCut' are defined earlier in this script ---
backgrounds = eval(backgrounds)
print '\n -----> Training Backgrounds: ', backgrounds
treeVarSet = config.get(run,'treeVarSet')
#print '\n -----> Training Features: ', treeVarSet
#variables
#TreeVar Array
MVA_Vars={}
MVA_Vars['Nominal']=config.get(treeVarSet,'Nominal')
MVA_Vars['Nominal']=MVA_Vars['Nominal'].split(' ')
#Infofile
info = ParseInfo(samplesinfo,path)
#Workdir
workdir=ROOT.gDirectory.GetPath()
# Test and Train event cuts: split on even/odd event number
#TrainCut = '%s & EventForTraining==1' % TCut
#EvalCut = '%s & EventForTraining==0' % TCut
TrainCut= TCut +' & evt%2==0'
EvalCut = TCut +' & evt%2!=0'
cuts = [TrainCut,EvalCut]
print '\n ------> with Train Cuts: ', TrainCut
print ' Test Cuts : ', EvalCut
# --- datacard-script fragment (Python 2); fallback branch of a pt-region
# check defined earlier in this script ---
print "Unknown Pt region"
pt_region = 'NoSysRegion'
#sys.exit("Unknown Pt region")
# Set rescale factor of 2 in case of TrainFalg (factor compensates the
# train/eval 50/50 split of the MC)
if TrainFlag:
    MC_rescale_factor = 2.
    print 'I RESCALE BY 2.0'
else:
    MC_rescale_factor = 1.
#systematics up/down
UD = ['Up', 'Down']
print 'Parse the sample information'
print '============================\n'
#Parse samples configuration
info = ParseInfo(samplesinfo, path)
# get all the treeCut sets
# create different sample Lists
print 'Get the sample list'
print '===================\n'
all_samples = info.get_samples(signals + backgrounds + additionals)
print 'workspace_datacard-all_samples:', [job.name for job in all_samples]
signal_samples = info.get_samples(signals)
print 'signal samples:', [job.name for job in signal_samples]
background_samples = info.get_samples(backgrounds)
data_sample_names = config.get('dc:%s' % var, 'data').split(' ')
print 'data_sample_names are', data_sample_names
data_samples = info.get_samples(data_sample_names)
# --- batch-submission fragment (Python 2); dispatches jobs per task type,
# 'train_list', 'submit', 'repDict', 'opts' and 'config' are defined earlier ---
for item in train_list:
    submit(item,repDict)
if opts.task == 'dc':
    #DC_vars = config.items('Limit')
    DC_vars= (config.get('LimitGeneral','List')).split(',')
    print DC_vars
if opts.task == 'plot':
    Plot_vars= (config.get('Plot_general','List')).split(',')
if not opts.task == 'prep':
    # all tasks except 'prep' need the parsed sample info
    path = config.get("Directories","samplepath")
    samplesinfo = config.get("Directories","samplesinfo")
    info = ParseInfo(samplesinfo,path)
if opts.task == 'plot':
    repDict['queue'] = 'all.q'
    for item in Plot_vars:
        submit(item,repDict)
if opts.task == 'trainReg':
    repDict['queue'] = 'all.q'
    submit('trainReg',repDict)
elif opts.task == 'dc':
    repDict['queue'] = 'all.q'
    # only submit datacards matching the requested mass point
    for item in DC_vars:
        if 'ZH%s'%opts.mass in item:
"""Print the local path of the first file of the first sample matching a pattern.

Usage: script -T <configTag> -D <directoryKey> -S <samplePattern>
"""
import sys
import os
# FIX: OptionParser was used below without being imported
from optparse import OptionParser

from myutils.XbbConfig import XbbConfigReader, XbbConfigTools
from myutils import ParseInfo
from myutils.FileLocator import FileLocator
from myutils.XbbTools import XbbTools

argv = sys.argv

# command-line options
parser = OptionParser()
parser.add_option("-T","--tag", dest="tag", default='', help="config tag")
parser.add_option("-D","--directory", dest="directory", default='MVAout', help="directory name, e.g. MVAout")
parser.add_option("-S","--sample", dest="sample", default='TT*', help="sample")
(opts, args) = parser.parse_args(argv)

config = XbbConfigTools(config=XbbConfigReader.read(opts.tag))
path = config.get("Directories", opts.directory)
sampleInfoDirectory = config.get('Directories', 'samplefiles')
info = ParseInfo(samples_path=path, config=config)

# only take first sample which matches
sampleIdentifier = XbbTools.filterSampleList(info.getSampleIdentifiers(), XbbTools.parseSamplesList(opts.sample))[0]

# get list of ORIGINAL file names for this sample: /store/...
sampleTreeFileNames = XbbTools.getSampleTreeFileNames(sampleInfoDirectory, sampleIdentifier)
fileLocator = FileLocator(config=config)

# get local name of first file
localFilename = fileLocator.getFilePath(path, sampleIdentifier, sampleTreeFileNames[0])
print(localFilename)
# --- MVA-evaluation script fragment; 'opts', 'evaluate_optimisation' and
# 'weight' are defined earlier in this script ---
config = BetterConfigParser()
config.read(opts.config)
anaTag = config.get("Analysis", "tag")
#get locations:
Wdir = config.get('Directories', 'Wdir')
samplesinfo = config.get('Directories', 'samplesinfo')
#read shape systematics
systematics = config.get('systematics', 'systematics')
#systematics
INpath = config.get('Directories', 'MVAin')
OUTpath = config.get('Directories', 'MVAout')
info = ParseInfo(samplesinfo, INpath)
# arglist: MVA names to evaluate, or the weight expression when optimising
arglist = ''
if not evaluate_optimisation:
    arglist = opts.discr #RTight_blavla,bsbsb
else:
    # print '@INFO: Evaluating bdt for optimisation'
    arglist = weight
namelistIN = opts.names
namelist = namelistIN.split(',')
print('namelist', namelist)
# sys.exit(1)
class MvaTrainingHelper(object):
    """Train scikit-learn classifiers on numpy arrays built from cached trees.

    Converts the configured signal/background samples of one MVA region into
    train/test feature matrices (with per-event weights), caches them on disk
    keyed by a hash of the configuration, and runs the configured classifier.
    """

    def __init__(self, config, mvaName):
        """Read variables, samples, cuts and classifier parameters from config.

        :param config: BetterConfigParser-like config object
        :param mvaName: name of the MVA config section to train
        """
        self.dataRepresentationVersion = 2
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.logpath = config.get('Directories', 'logpath')
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.mvaName = mvaName
        self.MVAsettings = config.get(mvaName,'MVAsettings')
        self.factoryname = 'scikit-test1'
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
        # variables
        self.MVA_Vars = {}
        self.MVA_Vars['Nominal'] = config.get(self.treeVarSet, 'Nominal').strip().split(' ')
        # samples (config values are python-list strings, hence eval)
        self.backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
        self.signalSampleNames = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(self.backgroundSampleNames),
            'SIG': self.samplesInfo.get_samples(self.signalSampleNames),
        }
        # MVA signal region cuts
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)
        # split in train/test samples
        self.datasets = ['train', 'test']
        self.varsets = ['X', 'y', 'sample_weight']
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')
        print("TRAINING CUT:", self.trainCut)
        print("TEST CUT:", self.evalCut)
        # rescale MC by 2 because of the train/eval split
        self.globalRescale = 2.0
        # default parameters, overridden below by the MVAsettings string
        self.parameters = {
            'factoryname': self.factoryname,
            'mvaName': self.mvaName,
            'MVAregionCut': self.treeCutName + ': ' + self.treeCut,
            #'classifier': 'GradientBoostingClassifier',
            'classifier': 'RandomForestClassifier',
            #'classifier': 'ExtraTreesClassifier',
            #'classifier': 'FT_GradientBoostingClassifier',
            'max_depth': None,
            'max_leaf_nodes': None,
            'class_weight': 'balanced',
            #'criterion': 'friedman_mse',
            'criterion': 'gini',
            #'n_estimators': 3000,
            'n_estimators': 400,
            #'learning_rate': 0.1,
            'algorithm': 'SAMME.R',
            #'min_samples_leaf': 100,
            'splitter': 'best',
            'max_features': 4,
            'subsample': 0.6,
            'limit': -1,
            'additional_signal_weight': 1.0,
            'min_impurity_split': 0.0,
            'bootstrap': True,
        }
        # load parameters from config in a format similar to Root TMVA parameter string
        # ("key1=val1:key2=val2:..."; values are evaluated as python expressions)
        self.MVAsettingsEvaluated = []
        for mvaSetting in self.MVAsettings.split(':'):
            self.parameters[mvaSetting.split('=')[0].strip()] = eval(mvaSetting.split('=')[1].strip())
            try:
                self.MVAsettingsEvaluated.append('%s'%mvaSetting.split('=')[0].strip() + '=' + '%r'%self.parameters[mvaSetting.split('=')[0].strip()])
            except:
                print("???:", mvaSetting)
                self.MVAsettingsEvaluated.append(mvaSetting)
        self.MVAsettingsEvaluated = ':'.join(self.MVAsettingsEvaluated)

    # load numpy arrays with training/testing data; returns True on success
    def loadCachedNumpyArrays(self, cachedFilesPath):
        cached = True
        try:
            with open(cachedFilesPath + '/scikit_input.dmp', 'rb') as inputFile:
                self.data = pickle.load(inputFile)
            print("INFO: found numpy arrays for input in:", cachedFilesPath)
        except:
            # any failure (missing/corrupt file) falls back to re-creating the arrays
            cached = False
        return cached

    # save numpy arrays with training/testing data
    def writeNumpyArrays(self, cachedFilesPath):
        with open(cachedFilesPath + '/scikit_input.dmp', 'wb') as outputFile:
            pickle.dump(self.data, outputFile)
        print("INFO: wrote numpy arrays for input to:", cachedFilesPath)

    def getCachedNumpyArrayPath(self):
        """Return the cache directory derived from a hash of cut/vars/samples.

        NOTE(review): hashlib.sha224 is called on a str — fine on Python 2,
        would need .encode() on Python 3.
        """
        identifier = self.treeCut + '__VAR:' + ' '.join(self.MVA_Vars['Nominal']) + '__SIG:' + '/'.join(self.signalSampleNames) + '__BKG:' + '/'.join(self.backgroundSampleNames) + '__V:%r'%self.dataRepresentationVersion
        varsHash = hashlib.sha224(identifier).hexdigest()
        cachedFilesPath = self.logpath + '/../cache/' + varsHash + '/'
        return cachedFilesPath

    def getHash(self):
        """Return a short hash identifying this training configuration
        (also includes the classifier parameters, unlike the cache path)."""
        identifier = self.treeCut + '__VAR:' + ' '.join(self.MVA_Vars['Nominal']) + '__SIG:' + '/'.join(self.signalSampleNames) + '__BKG:' + '/'.join(self.backgroundSampleNames) + '__PAR:%r'%self.parameters
        return hashlib.sha224(identifier).hexdigest()[:8]

    def prepare(self):
        """Build (or load from cache) the train/test numpy arrays; returns self.

        Raises Exception("CachedTreeMissing") if a sample tree is not cached.
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/testing trees
        # ----------------------------------------------------------------------------------------------------------------------
        self.sampleTrees = []
        categories = ['BKG', 'SIG']
        datasetParts = {'train': self.trainCut, 'test': self.evalCut}
        cachedFilesPath = self.getCachedNumpyArrayPath()
        try:
            os.makedirs(cachedFilesPath)
        except:
            pass
        # load numpy arrays from disk if they have been already created
        if self.loadCachedNumpyArrays(cachedFilesPath):
            return self
        arrayLists = {datasetName:[] for datasetName in datasetParts.iterkeys()}
        weightLists = {datasetName:[] for datasetName in datasetParts.iterkeys()}
        targetLists = {datasetName:[] for datasetName in datasetParts.iterkeys()}
        # standard weight expression
        weightF = self.config.get('Weights','weightF')
        for category in categories:
            for sample in self.samples[category]:
                print ('*'*80,'\n%s\n'%sample,'*'*80)
                for datasetName, additionalCut in datasetParts.iteritems():
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)
                    # get ROOT tree for selected sample & region cut
                    tc = TreeCache.TreeCache(
                        sample=sample,
                        cutList=sampleCuts,
                        inputFolder=self.samplesPath,
                        config=self.config,
                        debug=True
                    )
                    sampleTree = tc.getTree()
                    if sampleTree:
                        treeScale = sampleTree.getScale(sample) * self.globalRescale
                        print ('scale:', treeScale)
                        # initialize numpy array
                        nSamples = sampleTree.GetEntries()
                        features = self.MVA_Vars['Nominal']
                        nFeatures = len(features)
                        print('nFeatures:', nFeatures)
                        inputData = np.zeros((nSamples, nFeatures), dtype=np.float32)
                        # initialize formulas for ROOT tree
                        for feature in features:
                            sampleTree.addFormula(feature)
                        sampleTree.addFormula(weightF)
                        # fill numpy array from ROOT tree
                        for i, event in enumerate(sampleTree):
                            for j, feature in enumerate(features):
                                inputData[i, j] = sampleTree.evaluate(feature)
                            # total weight comes from weightF (btag, lepton sf, ...) and treeScale to scale MC to x-section
                            totalWeight = treeScale * sampleTree.evaluate(weightF)
                            weightLists[datasetName].append(totalWeight)
                            targetLists[datasetName].append(categories.index(category))
                        arrayLists[datasetName].append(inputData)
                    else:
                        print ("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")
        # concatenate all data from different samples
        self.data = {
            'train': {
                'X': np.concatenate(arrayLists['train'], axis=0),
                'y': np.array(targetLists['train'], dtype=np.float32),
                'sample_weight': np.array(weightLists['train'], dtype=np.float32),
            },
            'test': {
                'X': np.concatenate(arrayLists['test'], axis=0),
                'y': np.array(targetLists['test'], dtype=np.float32),
                'sample_weight': np.array(weightLists['test'], dtype=np.float32),
            },
        }
        # write numpy arrays to disk
        self.writeNumpyArrays(cachedFilesPath)
        return self

    def verify_data(self):
        """Check that X/y/sample_weight have matching first dimensions per dataset."""
        valid = True
        for dataset in self.datasets:
            for var in self.varsets:
                print("DEBUG: self.data['{dataset}']['{var}'].shape = {shape}".format(dataset=dataset, var=var, shape=self.data[dataset][var].shape))
        for dataset in self.datasets:
            for i in range(len(self.varsets)-1):
                valid = valid and self.data[dataset][self.varsets[i]].shape[0] == self.data[dataset][self.varsets[i+1]].shape[0]
        return valid

    def run(self):
        """Instantiate the configured classifier and train it (continues below)."""
        if not self.verify_data():
            print ("\x1b[31mERROR: training input data array shapes are incompatible!\x1b[0m")
            raise Exception("BadTrainingInputData")
        applyClassWeights = False
        # build the classifier selected by self.parameters['classifier']
        if self.parameters['classifier'] == 'GradientBoostingClassifier':
            clf = GradientBoostingClassifier(
                min_samples_leaf=self.parameters['min_samples_leaf'],
                max_depth=self.parameters['max_depth'],
                max_leaf_nodes=self.parameters['max_leaf_nodes'],
                criterion=self.parameters['criterion'],
                max_features=self.parameters['max_features'],
                n_estimators=self.parameters['n_estimators'],
learning_rate=self.parameters['learning_rate'], subsample=self.parameters['subsample'], min_impurity_split=self.parameters['min_impurity_split'], ) if self.parameters['class_weight'] == 'balanced': applyClassWeights = True elif self.parameters['classifier'] == 'RandomForestClassifier': clf = RandomForestClassifier( min_samples_leaf=self.parameters['min_samples_leaf'], max_depth=self.parameters['max_depth'], max_leaf_nodes=self.parameters['max_leaf_nodes'], criterion=self.parameters['criterion'], max_features=self.parameters['max_features'], n_estimators=self.parameters['n_estimators'], bootstrap=self.parameters['bootstrap'], ) if self.parameters['class_weight'] == 'balanced': applyClassWeights = True elif self.parameters['classifier'] == 'ExtraTreesClassifier': clf = ExtraTreesClassifier( min_samples_leaf=self.parameters['min_samples_leaf'], max_depth=self.parameters['max_depth'], max_leaf_nodes=self.parameters['max_leaf_nodes'], criterion=self.parameters['criterion'], max_features=self.parameters['max_features'], n_estimators=self.parameters['n_estimators'], bootstrap=self.parameters['bootstrap'], ) if self.parameters['class_weight'] == 'balanced': applyClassWeights = True elif self.parameters['classifier'] == 'FT_GradientBoostingClassifier': rt = RandomTreesEmbedding(max_depth=3, n_estimators=20, random_state=0) clf0 = GradientBoostingClassifier( min_samples_leaf=self.parameters['min_samples_leaf'], max_depth=self.parameters['max_depth'], max_leaf_nodes=self.parameters['max_leaf_nodes'], criterion=self.parameters['criterion'], max_features=self.parameters['max_features'], n_estimators=self.parameters['n_estimators'], learning_rate=self.parameters['learning_rate'], subsample=self.parameters['subsample'], min_impurity_split=self.parameters['min_impurity_split'], ) if self.parameters['class_weight'] == 'balanced': applyClassWeights = True clf = make_pipeline(rt, clf0) elif self.parameters['classifier'] == 'XGBClassifier': clf = XGBClassifier( 
learning_rate=self.parameters['learning_rate'], max_depth=self.parameters['max_depth'], n_estimators=self.parameters['n_estimators'], objective='binary:logitraw', colsample_bytree=self.parameters['colsample_bytree'], subsample=self.parameters['subsample'], min_child_weight=self.parameters['min_child_weight'], gamma=self.parameters['gamma'] if 'gamma' in self.parameters else 0.0, #reg_alpha=8, reg_lambda=self.parameters['reg_lambda'] if 'reg_lambda' in self.parameters else 1.0, reg_alpha=self.parameters['reg_alpha'] if 'reg_alpha' in self.parameters else 0.0, ) if self.parameters['class_weight'] == 'balanced': applyClassWeights = True elif self.parameters['classifier'] == 'MLPClassifier': classifierParams = {k:v for k,v in self.parameters.iteritems() if k in ['solver', 'alpha', 'hidden_layer_sizes', 'max_iter', 'warm_start', 'learning_rate_init', 'learning_rate', 'momentum', 'epsilon', 'beta_1', 'beta_2', 'validation_fraction', 'early_stopping']} clf = MLPClassifier(**classifierParams) elif self.parameters['classifier'] in ['SVC', 'LinearSVC']: ''' clf = SVC( C=1.0, cache_size=4000, class_weight='balanced', coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf', max_iter=100000, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=True ) ''' bagged = int(self.parameters['bagged']) if 'bagged' in self.parameters else False if self.parameters['classifier'] == 'LinearSVC': clf = LinearSVC( class_weight='balanced', dual=self.parameters['dual'], max_iter=self.parameters['max_iter'], C=self.parameters['C'], penalty=self.parameters['penalty'], loss=self.parameters['loss'], tol=self.parameters['tol'], verbose=True, ) else: # classifier='SVC':C=random.choice([1.0, 10.0, 100.0, 500.0, 1000.0]):kernel=random.choice(['rbf','poly','linear']):degree=random.choice([2,3,4]):gamma=random.choice(['auto', 0.1, 0.3, 0.6]):shrinking=random.choice([True, False]):max_iter=10000:penalty=random.choice(['l1','l2']):tol=random.choice([0.005, 0.001, 
0.0005, 0.0001]):cache_size=1000 clf = SVC( C=self.parameters['C'], cache_size=self.parameters['cache_size'], class_weight='balanced', coef0=0.0, decision_function_shape='ovr', degree=self.parameters['degree'], gamma=self.parameters['gamma'], kernel=self.parameters['kernel'], max_iter=self.parameters['max_iter'], probability=False, random_state=None, shrinking=self.parameters['shrinking'], tol=self.parameters['tol'], verbose=True ) if bagged: n_estimators = bagged if 'bag_oversampling' in self.parameters: n_estimators = int(n_estimators * self.parameters['bag_oversampling']) clf0 = clf clf = BaggingClassifier( clf0, max_samples=1.0 / bagged, max_features=self.parameters['baggedfeatures'] if 'baggedfeatures' in self.parameters else 1.0, bootstrap_features=self.parameters['bootstrapfeatures'] if 'bootstrapfeatures' in self.parameters else False, n_estimators=n_estimators, ) else: clf = AdaBoostClassifier( DecisionTreeClassifier( min_samples_leaf=self.parameters['min_samples_leaf'], max_depth=self.parameters['max_depth'], class_weight=self.parameters['class_weight'], criterion=self.parameters['criterion'], splitter=self.parameters['splitter'], max_features=self.parameters['max_features'], ), n_estimators=self.parameters['n_estimators'], learning_rate=self.parameters['learning_rate'], algorithm=self.parameters['algorithm'], ) #with open("/mnt/t3nfs01/data01/shome/berger_p2/VHbb/CMSSW_9_4_0_pre3/src/Xbb/python/logs_v25//test-scikit-svm/Logs//../cache/b7d92f50a52f8474e66cf4e2c3ad3fa4725aa489e7a6b288e4ed3855//clf2018-01-31_18-22-38_be9479a2.pkl","rb") as inputFile: # clf = pickle.load(inputFile) # preprocessing print("transformation...") if 'scaler' in self.parameters: if self.parameters['scaler'] == 'standard': self.scaler = preprocessing.StandardScaler().fit(self.data['train']['X']) elif self.parameters['scaler'] == 'minmax': self.scaler = preprocessing.MinMaxScaler().fit(self.data['train']['X']) elif self.parameters['scaler'] == 'robust': self.scaler = 
preprocessing.RobustScaler().fit(self.data['train']['X']) else: self.scaler = None else: self.scaler = None if self.scaler: self.data['train']['X'] = self.scaler.transform(self.data['train']['X']) self.data['test']['X'] = self.scaler.transform(self.data['test']['X']) # SHUFFLE all samples before self.shuffle = False if self.shuffle: print("shuffle input data...") for dataset in self.datasets: nSamples = self.data[dataset][self.varsets[0]].shape[0] randomPermutation = np.random.permutation(nSamples) for var in self.varsets: self.data[dataset][var] = np.take(self.data[dataset][var], randomPermutation, axis=0) # LIMIT number of training samples # recommended to also shuffle samples before, because they are ordered by signal/background limitNumTrainingSamples = self.parameters['limit'] if (limitNumTrainingSamples > 0): print("limit training samples to:", limitNumTrainingSamples) #for dataset in self.datasets: # for var in self.varsets: # self.data[dataset][var] = self.data[dataset][var][0:limitNumTrainingSamples] for dataset in self.datasets: self.data[dataset] = resample(self.data[dataset], n_samples=limitNumTrainingSamples, replace=False) # oversample upscale = self.parameters['upscalefactor'] if 'upscalefactor' in self.parameters else None if upscale: upscalemax = self.parameters['upscalemax'] if 'upscalemax' in self.parameters else 10 upscalesignal = self.parameters['upscalefactorsignal'] if 'upscalefactorsignal' in self.parameters else 1.0 #upscalefactorsignal indices = [] for i in range(len(self.data['train']['sample_weight'])): #print(x) x= self.data['train']['sample_weight'][i] if self.data['train']['y'][i] > 0.5: x *= upscalesignal n = x * upscale # limit oversampling factor! 
if n > upscalemax: n=upscalemax if n<1: n=1 intN = int(n) indices += [i]*intN #floatN = n-intN #if floatN > 0: # if random.uniform(0.0,1.0) < floatN: # indices += [i] self.data['train']['X'] = self.data['train']['X'][indices] self.data['train']['y'] = self.data['train']['y'][indices] self.data['train']['sample_weight'] = self.data['train']['sample_weight'][indices] self.verify_data() # BALANCE weights # calculate total weights and class_weights nSig = len([x for x in self.data['train']['y'] if x >= 0.5]) nBkg = len([x for x in self.data['train']['y'] if x < 0.5]) print("#SIG:", nSig) print("#BKG:", nBkg) weightsSignal = [] weightsBackground = [] for i in range(len(self.data['train']['sample_weight'])): if self.data['train']['y'][i] < 0.5: weightsBackground.append(self.data['train']['sample_weight'][i]) else: weightsSignal.append(self.data['train']['sample_weight'][i]) weightsSignal.sort() weightsBackground.sort() totalWeightSignal = sum(weightsSignal) totalWeightBackground = sum(weightsBackground) signalReweight = (totalWeightSignal+totalWeightBackground)/totalWeightSignal * self.parameters['additional_signal_weight'] backgroundReweight = (totalWeightSignal+totalWeightBackground)/totalWeightBackground print("SUM of weights for signal:", totalWeightSignal) print("SUM of weights for background:", totalWeightBackground) if applyClassWeights: print("re-weight signals by:", signalReweight) print("re-weight background by:", backgroundReweight) for i in range(len(self.data['train']['sample_weight'])): if self.data['train']['y'][i] < 0.5: self.data['train']['sample_weight'][i] *= backgroundReweight else: self.data['train']['sample_weight'][i] *= signalReweight else: print("DO NOT re-weight signals by:", signalReweight) print("...") # TRAINING learningCurve = [] if self.parameters['classifier'] == 'XGBClassifier': clf = clf.fit(self.data['train']['X'], self.data['train']['y'], self.data['train']['sample_weight'], verbose=True) else: try: clf = clf.fit(**self.data['train']) 
except: clf = clf.fit(X=self.data['train']['X'], y=self.data['train']['y']) if 'rounds' in self.parameters and self.parameters['rounds'] > 1: for rNumber in range(self.parameters['rounds']): results = clf.predict_proba(self.data['test']['X']) auc1 = roc_auc_score(self.data['test']['y'], results[:,1], sample_weight=self.data['test']['sample_weight']) print(" round ", rNumber, " AUC=", auc1) learningCurve.append(auc1) clf = clf.fit(X=self.data['train']['X'], y=self.data['train']['y']) print("***** FIT done") # TEST try: results = clf.decision_function(self.data['test']['X']) print("***** EVALUATION on test sample done") results_train = clf.decision_function(self.data['train']['X']) print("***** EVALUATION on training sample done") print("R:", results.shape, results) results = np.c_[np.ones(results.shape[0]), results] results_train = np.c_[np.ones(results_train.shape[0]), results_train] except: results = clf.predict_proba(self.data['test']['X']) results_train = clf.predict_proba(self.data['train']['X']) # ROC curve print("calculating auc...") auc1 = roc_auc_score(self.data['test']['y'], results[:,1], sample_weight=self.data['test']['sample_weight']) auc_training = roc_auc_score(self.data['train']['y'], results_train[:,1], sample_weight=self.data['train']['sample_weight']) print("AUC:", auc1, " (training:", auc_training, ")") print("**** compute quantiles") qx = np.array([0.01, 0.99]) qy = np.array([0.0, 0.0]) thq = ROOT.TH1D("quant","quant",500000,-5.0,5.0) nS = len(results) for i in range(nS): thq.Fill(results[i][1]) thq.GetQuantiles(2, qy, qx) # rescaling of SCORE to [0, 1] minProb = 2.0 maxProb = -1.0 #for i in range(len(self.data['train']['X'])): # if results_train[i][1] > maxProb: # maxProb = results_train[i][1] # if results_train[i][1] < minProb: # minProb = results_train[i][1] #for i in range(len(self.data['test']['X'])): # if results[i][1] > maxProb: # maxProb = results[i][1] # if results[i][1] < minProb: # minProb = results[i][1] minProb = qy[0] maxProb = 
qy[1] delta = maxProb-minProb minProb -= delta * 0.01 maxProb += delta * 0.10 useSqrt = False # fill TRAINING SCORE histogram (class probability) h1t = ROOT.TH1D("h1t","h1t",50,0.0,1.0) h2t = ROOT.TH1D("h2t","h2t",50,0.0,1.0) for i in range(len(self.data['train']['X'])): result = (results_train[i][1]-minProb)/(maxProb-minProb)