Example #1
0
    def __init__(self,
                 config,
                 sampleIdentifier,
                 trainingRegions,
                 splitFilesChunks=1,
                 chunkNumber=1,
                 splitFilesChunkSize=-1,
                 force=False):
        """Collect samples, cuts and MVA variables for the given training regions.

        Args:
            config: configuration object, read via ``get(section, option)``.
            sampleIdentifier: stored as ``self.sampleIdentifier``.
            trainingRegions: names of the config sections describing the
                training regions. It is iterated several times, so it must
                be re-iterable (e.g. a list).
            splitFilesChunks: stored unchanged.
            chunkNumber: stored unchanged.
            splitFilesChunkSize: stored unchanged.
            force: stored as ``self.force``.
        """
        self.config = config
        self.force = force
        self.sampleIdentifier = sampleIdentifier
        self.trainingRegions = trainingRegions

        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'MVAin')
        self.samplesDefinitions = self.config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        # NOTE(review): eval() executes whatever expression the config holds;
        # this is only acceptable for trusted configuration files.
        backgroundNames = []
        for trainingRegion in self.trainingRegions:
            backgroundNames += eval(
                self.config.get(trainingRegion, 'backgrounds'))
        self.backgroundSampleNames = list(set(backgroundNames))

        signalNames = []
        for trainingRegion in self.trainingRegions:
            signalNames += eval(self.config.get(trainingRegion, 'signals'))
        self.signalSampleNames = list(set(signalNames))

        # every sample is requested once, even if used by several regions
        self.samples = self.samplesInfo.get_samples(
            list(set(self.backgroundSampleNames + self.signalSampleNames)))

        self.trainingRegionsDict = {}
        for trainingRegion in self.trainingRegions:
            treeCutName = config.get(trainingRegion, 'treeCut')
            treeVarSet = config.get(trainingRegion, 'treeVarSet').strip()
            systematics = [
                x for x in config.get('systematics', 'systematics').split(' ')
                if len(x.strip()) > 0
            ]
            mvaVars = []
            for systematic in systematics:
                # drop the empty tokens that repeated blanks in the config
                # value would otherwise produce as bogus variable names
                mvaVars += [
                    x for x in config.get(treeVarSet,
                                          systematic).strip().split(' ')
                    if len(x.strip()) > 0
                ]
            self.trainingRegionsDict[trainingRegion] = {
                'cut': config.get('Cuts', treeCutName),
                'vars': mvaVars,
            }

        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')

        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.splitFilesChunkSize = splitFilesChunkSize

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
Example #2
0
    def __init__(self, config, sampleIdentifier, regions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, forceRedo=False, fileList=None):
        """Read the plotting samples and the cuts of the requested regions.

        Args:
            config: configuration object, read via ``get(section, option)``.
            sampleIdentifier: stored as ``self.sampleIdentifier``.
            regions: region names; duplicates are removed. Each name must be
                an option in the 'Cuts' section.
            splitFilesChunks: stored unchanged.
            chunkNumber: stored unchanged.
            splitFilesChunkSize: stored unchanged.
            forceRedo: stored as ``self.forceRedo``.
            fileList: optional compressed file list, expanded with
                ``FileList.decompress``; ``None``/empty leaves it unset.
        """
        self.config = config
        self.sampleIdentifier = sampleIdentifier
        self.regions = list(set(regions))
        self.forceRedo = forceRedo

        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = self.config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        # NOTE(review): eval() executes whatever expression the config holds;
        # only use with trusted configuration files.
        # list() makes the concatenation below work even if one option is
        # written as a tuple and the other as a list.
        self.sampleNames = list(eval(self.config.get('Plot_general', 'samples')))
        self.dataNames = list(eval(self.config.get('Plot_general', 'Data')))
        self.samples = self.samplesInfo.get_samples(self.sampleNames + self.dataNames)

        self.regionsDict = {}
        for region in self.regions:
            self.regionsDict[region] = {'cut': config.get('Cuts', region)}
        self.splitFilesChunkSize = splitFilesChunkSize
        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.fileList = FileList.decompress(fileList) if fileList else None

        VHbbNameSpace = config.get('VHbbNameSpace','library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        # TSystem::Load returns 0 on success and 1 if the library was already
        # loaded; only negative codes are failures.
        if returnCode < 0:
            print ("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"%returnCode)
        else:
            print ("INFO: loaded VHbbNameSpace: %s"%VHbbNameSpace)
Example #3
0
    def __init__(self, config, mvaName):
        """Read factory, variable, sample and cut settings for one MVA training.

        Args:
            config: configuration object, read via ``get(section, option)``.
            mvaName: name of the config section describing the training;
                also used in the name of the training output file.
        """
        self.config = config
        self.factoryname = config.get('factory', 'factoryname')
        self.factorysettings = config.get('factory', 'factorysettings')
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)

        self.sampleFilesFolder = config.get('Directories', 'samplefiles')

        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.MVAtype = config.get(mvaName, 'MVAtype')
        self.MVAsettings = config.get(mvaName, 'MVAsettings')
        self.mvaName = mvaName

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

        # variables: space separated list; empty tokens caused by repeated
        # blanks are dropped so they are not treated as variable names
        nominalVariables = config.get(self.treeVarSet, 'Nominal')
        self.MVA_Vars = {
            'Nominal': [
                x for x in nominalVariables.strip().split(' ')
                if len(x.strip()) > 0
            ],
        }

        # samples
        # NOTE(review): eval() executes whatever expression the config holds;
        # only use with trusted configuration files.
        backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
        signalSampleNames = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(backgroundSampleNames),
            'SIG': self.samplesInfo.get_samples(signalSampleNames),
        }

        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')
        print("TRAINING CUT:", self.TrainCut)
        print("EVAL CUT:", self.EvalCut)

        self.globalRescale = 2.0

        self.trainingOutputFileName = 'mvatraining_{factoryname}_{region}.root'.format(
            factoryname=self.factoryname, region=mvaName)
        print("INFO: MvaTrainingHelper class created.")
Example #4
0
    def __init__(self,
                 config,
                 sampleIdentifier,
                 regions,
                 splitFilesChunks=1,
                 chunkNumber=1,
                 splitFilesChunkSize=-1,
                 forceRedo=False,
                 fileList=None):
        """Read the plotting samples and the cuts of the requested regions.

        Args:
            config: configuration object, read via ``get(section, option)``.
            sampleIdentifier: stored as ``self.sampleIdentifier``.
            regions: region names; duplicates are removed. Each name must be
                an option in the 'Cuts' section.
            splitFilesChunks: stored unchanged.
            chunkNumber: stored unchanged.
            splitFilesChunkSize: stored unchanged.
            forceRedo: stored as ``self.forceRedo``.
            fileList: optional compressed file list, expanded with
                ``FileList.decompress``; ``None``/empty leaves it unset.
        """
        self.config = config
        self.sampleIdentifier = sampleIdentifier
        self.regions = list(set(regions))
        self.forceRedo = forceRedo

        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'plottingSamples')
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath,
                                     config=self.config)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        # NOTE(review): eval() executes whatever expression the config holds;
        # only use with trusted configuration files.
        self.sampleNames = list(
            eval(self.config.get('Plot_general', 'samples')))
        self.dataNames = list(eval(self.config.get('Plot_general', 'Data')))
        self.samples = self.samplesInfo.get_samples(self.sampleNames +
                                                    self.dataNames)

        self.regionsDict = {}
        for region in self.regions:
            self.regionsDict[region] = {'cut': config.get('Cuts', region)}
        self.splitFilesChunkSize = splitFilesChunkSize
        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.fileList = FileList.decompress(fileList) if fileList else None

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        # TSystem::Load returns 0 on success and 1 if the library was already
        # loaded; only negative codes are failures.
        if returnCode < 0:
            print(
                "\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"
                % returnCode)
        else:
            print("INFO: loaded VHbbNameSpace: %s" % VHbbNameSpace)
Example #5
0
    def __init__(self, config, region, sampleIdentifier=None, opts=None):
        """Read paths, cuts and data/MC samples needed to plot one region.

        Args:
            config: configuration object, read via ``get``/``has_option``.
            region: region name; its settings are read from section
                ``'Plot:<region>'``.
            sampleIdentifier: optional comma separated list of sample
                identifiers; if given, only samples with one of these
                identifiers are kept.
            opts: object with ``inputDir`` and ``outputDir`` attributes naming
                options of the 'Directories' section.
        """
        self.config = config
        self.region = region
        self.sampleIdentifiers = sampleIdentifier.split(',') if sampleIdentifier else None

        # VHbb namespace
        VHbbNameSpace = config.get('VHbbNameSpace','library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        # TSystem::Load returns 0 on success and 1 if the library was already
        # loaded; only negative codes are failures.
        if returnCode < 0:
            print ("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"%returnCode)
        else:
            print ("INFO: loaded VHbbNameSpace: %s"%VHbbNameSpace)

        # input/output paths
        # NOTE(review): opts defaults to None but is dereferenced here, so
        # callers always have to pass it.
        self.fileLocator = FileLocator(config=self.config)
        self.pathIN = self.config.get('Directories', opts.inputDir)
        self.pathOUT = self.config.get('Directories', opts.outputDir)
        self.tmpDir = self.config.get('Directories', 'scratch')

        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')

        # plot regions
        self.configSection = 'Plot:%s'%region

        # additional cut to only plot a subset of the region
        self.subcut = None
        if self.config.has_option(self.configSection, 'subcut'):
            self.subcut = self.config.get(self.configSection, 'subcut')
            print("INFO: use cut:", self.subcut)

        # additional global blinding cut (contained in plots, cut on the event number)
        self.addBlindingCut = None
        if self.config.has_option('Plot_general','addBlindingCut'):
            self.addBlindingCut = self.config.get('Plot_general','addBlindingCut')
            print ('adding add. blinding cut:', self.addBlindingCut)

        # load samples
        # NOTE(review): eval() executes whatever expression the config holds;
        # only use with trusted configuration files.
        self.data = eval(self.config.get(self.configSection, 'Datas')) # data of this region's section
        # list() so that appending the signal also works for a tuple in the config
        self.mc = list(eval(self.config.get('Plot_general', 'samples')))
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        self.signalRegion = self.config.has_option(self.configSection, 'Signal')
        if self.signalRegion:
            self.mc.append(self.config.get(self.configSection, 'Signal'))
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)

        # filter samples used in the plot
        if self.sampleIdentifiers:
            self.dataSamples = [x for x in self.dataSamples if x.identifier in self.sampleIdentifiers]
            self.mcSamples = [x for x in self.mcSamples if x.identifier in self.sampleIdentifiers]
Example #6
0
    def customInit(self, initVars):
        """Set up subsample formulas and the output branches for sample groups.

        Args:
            initVars: dict providing the keys 'sample', 'sampleTree' and
                'config'.

        Raises:
            Exception: "SampleGroup__customInit__not_implemented" if unique
                event numbers are requested (``self.eventCountsDict`` set)
                for a sample tree that does not consist of exactly one file.
        """
        self.sample = initVars['sample']
        self.sampleTree = initVars['sampleTree']
        self.config = initVars['config']
        self.samplesInfo = ParseInfo(
            samples_path=self.config.get('Directories', 'dcSamples'),
            config=self.config)

        # subsamples sharing the identifier of the processed sample
        self.subsamples = [
            x for x in self.samplesInfo
            if x.identifier == self.sample.identifier and x.subsample
        ]
        print("INFO: subsamples/cut")
        for s in self.subsamples:
            print(" >", s.name, s.subcut)
            self.sampleTree.addFormula(s.subcut)

        if not self.groupDict:
            # NOTE(review): eval() executes whatever expression the config
            # holds; only use with trusted configuration files.
            self.groupDict = eval(self.config.get('LimitGeneral', 'Group'))

        # invert the sample -> group mapping into group -> list of samples;
        # items() instead of iteritems() works on Python 2 and Python 3
        self.groupNames = list(set(self.groupDict.values()))
        self.groups = {
            k: [x for x, y in self.groupDict.items() if y == k]
            for k in self.groupNames
        }

        # one branch per group
        for groupName in self.groups:
            self.branches.append({
                'name': self.prefix + groupName,
                'formula': self.isInGroup,
                'arguments': groupName
            })

        self.branches.append({
            'name': 'sampleIndex',
            'formula': self.getSampleIndex,
            'type': 'i'
        })

        if self.eventCountsDict:
            self.branches.append({
                'name': 'event_unique',
                'formula': self.getEventNumber,
                'type': 'l'
            })

            if len(self.sampleTree.sampleFileNames) != 1:
                print(
                    "ERROR: adding unique event numbers for chains is not implemented!"
                )
                raise Exception("SampleGroup__customInit__not_implemented")
            # offset looked up for this sample and its single input file
            self.eventNumberOffset = self.eventCountsDict[
                self.sample.identifier][self.sampleTree.sampleFileNames[0]]
Example #7
0
    def __init__(self, config, region, vars = None, title=None):
        """Read variables, cuts and data/MC samples needed to plot one region.

        Args:
            config: configuration object, read via ``get``/``has_option``.
            region: region name; its settings are read from section
                ``'Plot:<region>'``.
            vars: optional list of variable names; if missing or empty the
                comma separated 'vars' option of the region section is used.
            title: optional plot title; an empty title is stored as ``None``.
        """
        self.config = config
        self.region = region
        self.vars = vars
        self.title = title if title else None

        # VHbb namespace
        VHbbNameSpace = config.get('VHbbNameSpace','library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        # TSystem::Load returns 0 on success and 1 if the library was already
        # loaded; only negative codes are failures.
        if returnCode < 0:
            print ("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"%returnCode)
        else:
            print ("INFO: loaded VHbbNameSpace: %s"%VHbbNameSpace)

        # additional blinding cut (contained in plots, cut on the event number)
        self.addBlindingCut = None
        if self.config.has_option('Plot_general','addBlindingCut'):
            self.addBlindingCut = self.config.get('Plot_general','addBlindingCut')
            print ('adding add. blinding cut:', self.addBlindingCut)

        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')

        # plot regions
        self.configSection = 'Plot:%s'%region
        if self.vars and isinstance(self.vars, list):
            self.vars = [x.strip() for x in self.vars if len(x.strip()) > 0]

        # fall back to the variables configured for this region
        if not self.vars:
            varListFromConfig = self.config.get(self.configSection, 'vars').split(',')
            print ("VARS::", self.configSection, " => ", varListFromConfig)
            self.vars = [x.strip() for x in varListFromConfig if len(x.strip()) > 0]

        # load samples
        # NOTE(review): eval() executes whatever expression the config holds;
        # only use with trusted configuration files.
        self.data = eval(self.config.get(self.configSection, 'Datas')) # data of this region's section
        # list() so that appending the signal also works for a tuple in the config
        self.mc = list(eval(self.config.get('Plot_general', 'samples')))
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        self.signalRegion = self.config.has_option(self.configSection, 'Signal')
        if self.signalRegion:
            self.mc.append(self.config.get(self.configSection, 'Signal'))
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)

        self.groupDict = eval(self.config.get('Plot_general', 'Group'))
        self.subcutPlotName = ''
        self.histogramStacks = {}
Example #8
0
    def __init__(self, config, sampleIdentifier, trainingRegions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, force=False):
        """Collect samples, cuts, MVA variables and weight variations per training region.

        Args:
            config: configuration object, read via ``get``/``has_option``.
            sampleIdentifier: stored as ``self.sampleIdentifier``.
            trainingRegions: names of the config sections describing the
                training regions. It is iterated several times, so it must
                be re-iterable (e.g. a list).
            splitFilesChunks: stored unchanged.
            chunkNumber: stored unchanged.
            splitFilesChunkSize: stored unchanged.
            force: stored as ``self.force``.
        """
        self.config = config
        self.force = force
        self.sampleIdentifier = sampleIdentifier
        self.trainingRegions = trainingRegions

        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'MVAin')
        self.samplesDefinitions = self.config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        # NOTE(review): eval() executes whatever expression the config holds;
        # only use with trusted configuration files.
        backgroundNames = []
        for trainingRegion in self.trainingRegions:
            backgroundNames += eval(self.config.get(trainingRegion, 'backgrounds'))
        self.backgroundSampleNames = list(set(backgroundNames))

        signalNames = []
        for trainingRegion in self.trainingRegions:
            signalNames += eval(self.config.get(trainingRegion, 'signals'))
        self.signalSampleNames = list(set(signalNames))

        self.samples = self.samplesInfo.get_samples(list(set(self.backgroundSampleNames + self.signalSampleNames)))

        self.trainingRegionsDict = {}
        for trainingRegion in self.trainingRegions:
            treeCutName = config.get(trainingRegion, 'treeCut')
            treeVarSet = config.get(trainingRegion, 'treeVarSet').strip()
            # systematics are optional per region
            if config.has_option(trainingRegion, 'systematics'):
                systematics = eval(config.get(trainingRegion, 'systematics'))
            else:
                systematics = []
            # drop the empty tokens that leading/trailing/repeated blanks
            # would otherwise produce as bogus variable names
            mvaVars = [x for x in config.get(treeVarSet, 'Nominal').split(' ') if len(x.strip()) > 0]

            weightVars = []
            for syst in systematics:
                # the 'Weights' section may spell the variations _UP/_DOWN or _Up/_Down
                systNameUp = syst+'_UP' if self.config.has_option('Weights',syst+'_UP') else syst+'_Up'
                systNameDown = syst+'_DOWN' if self.config.has_option('Weights',syst+'_DOWN') else syst+'_Down'
                weightVars += [self.config.get('Weights',systNameUp), self.config.get('Weights',systNameDown)]

            self.trainingRegionsDict[trainingRegion] = {
                    'cut': config.get('Cuts', treeCutName),
                    'vars': mvaVars,
                    'weightVars': weightVars,
                    }

        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')

        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.splitFilesChunkSize = splitFilesChunkSize

        VHbbNameSpace = config.get('VHbbNameSpace','library')
        ROOT.gSystem.Load(VHbbNameSpace)
Example #9
0
    def __init__(self, config, mvaName):
        """Read factory, variable, sample and cut settings for one MVA training.

        Args:
            config: configuration object, read via ``get(section, option)``.
            mvaName: name of the config section describing the training;
                also used in the name of the training output file.
        """
        self.config = config
        self.factoryname = config.get('factory', 'factoryname')
        self.factorysettings = config.get('factory', 'factorysettings')
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)

        self.sampleFilesFolder = config.get('Directories', 'samplefiles')

        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.MVAtype = config.get(mvaName, 'MVAtype')
        self.MVAsettings = config.get(mvaName,'MVAsettings')
        self.mvaName = mvaName

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

        # variables: space separated list; empty tokens caused by repeated
        # blanks are dropped so they are not treated as variable names
        nominalTokens = config.get(self.treeVarSet, 'Nominal').strip().split(' ')
        self.MVA_Vars = {}
        self.MVA_Vars['Nominal'] = [x for x in nominalTokens if len(x.strip()) > 0]

        # samples
        # NOTE(review): eval() executes whatever expression the config holds;
        # only use with trusted configuration files.
        backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
        signalSampleNames = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(backgroundSampleNames),
            'SIG': self.samplesInfo.get_samples(signalSampleNames),
        }

        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')
        print("TRAINING CUT:", self.TrainCut)
        print("EVAL CUT:", self.EvalCut)

        self.globalRescale = 2.0

        self.trainingOutputFileName = 'mvatraining_{factoryname}_{region}.root'.format(factoryname=self.factoryname, region=mvaName)
        print("INFO: MvaTrainingHelper class created.")
    def __init__(self, config, mvaName):
        """Read region cut, variables (nominal and per systematic) and samples.

        Args:
            config: configuration object, read via ``get(section, option)``.
            mvaName: name of the config section providing 'treeCut' and
                'treeVarSet'.
        """
        self.mvaName = mvaName
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
        self.dataFormatVersion = 2
        self.sampleTrees = []
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)

        # region
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/eval sets
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')
        # rescale MC by 2 because of train/eval split
        self.globalRescale = 2.0

        def splitTokens(value):
            # space separated list without the empty tokens that repeated
            # blanks (or an empty value) would produce
            return [x for x in value.strip().split(' ') if len(x.strip()) > 0]

        # variables and systematics
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        # filtered as well: an empty 'systematics' option used to yield ['']
        # and a failing lookup of the option '' below
        self.systematics = splitTokens(config.get('systematics', 'systematics'))
        self.MVA_Vars = {'Nominal': splitTokens(config.get(self.treeVarSet, 'Nominal'))}
        for systematic in self.systematics:
            self.MVA_Vars[systematic] = splitTokens(config.get(self.treeVarSet, systematic))

        # samples: only the two inclusive categories are used
        # NOTE(review): eval() executes whatever expression the config holds;
        # only use with trusted configuration files.
        self.sampleNames = {
                    'SIG_ALL': eval(self.config.get('Plot_general', 'allSIG')),
                    'BKG_ALL': eval(self.config.get('Plot_general', 'allBKG')),
                }
        # items() instead of iteritems() works on Python 2 and Python 3
        self.samples = {category: self.samplesInfo.get_samples(samples) for category, samples in self.sampleNames.items()}
Example #11
0
class MvaTrainingHelper(object):

    def __init__(self, config, mvaName):
        """Read factory, variable, sample and cut settings for one MVA training.

        Args:
            config: configuration object, read via ``get(section, option)``.
            mvaName: name of the config section describing the training;
                also used in the name of the training output file.
        """
        self.config = config
        self.mvaName = mvaName

        # TMVA factory settings
        self.factoryname = config.get('factory', 'factoryname')
        self.factorysettings = config.get('factory', 'factorysettings')

        # sample locations and definitions
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')

        # settings of this training
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.MVAtype = config.get(mvaName, 'MVAtype')
        self.MVAsettings = config.get(mvaName,'MVAsettings')

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

        # variables: space separated list; empty tokens caused by repeated
        # blanks are dropped because prepare() registers every entry as a
        # TMVA input variable
        self.MVA_Vars = {
            'Nominal': [
                x for x in config.get(self.treeVarSet, 'Nominal').strip().split(' ')
                if len(x.strip()) > 0
            ],
        }

        # samples
        # NOTE(review): eval() executes whatever expression the config holds;
        # only use with trusted configuration files.
        backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
        signalSampleNames = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(backgroundSampleNames),
            'SIG': self.samplesInfo.get_samples(signalSampleNames),
        }

        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')
        print("TRAINING CUT:", self.TrainCut)
        print("EVAL CUT:", self.EvalCut)

        # scale factor applied to every tree weight in prepare()
        self.globalRescale = 2.0

        self.trainingOutputFileName = 'mvatraining_{factoryname}_{region}.root'.format(factoryname=self.factoryname, region=mvaName)
        print("INFO: MvaTrainingHelper class created.")


    def prepare(self):
        """Create the TMVA factory and register trees and input variables.

        Opens the training output file, creates the ``TMVA.Factory`` and adds,
        for every background and signal sample, one cached tree selected with
        the training cut and one selected with the evaluation cut. Finally
        all 'Nominal' variables are registered with type 'D'.

        Returns:
            self, to allow chaining.

        Raises:
            Exception: "CachedTreeMissing" if the tree cache does not return
                a tree for a sample.
        """
        self.trainingOutputFile = ROOT.TFile.Open(self.trainingOutputFileName, "RECREATE")
        # ----------------------------------------------------------------------------------------------------------------------
        # create TMVA factory
        # ----------------------------------------------------------------------------------------------------------------------
        self.factory = ROOT.TMVA.Factory(self.factoryname, self.trainingOutputFile, self.factorysettings)
        if self.trainingOutputFile and self.factory:
            print ("INFO: initialized MvaTrainingHelper.", self.factory) 
        else:
            print ("\x1b[31mERROR: initialization of MvaTrainingHelper failed!\x1b[0m") 

        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/eval trees
        # ----------------------------------------------------------------------------------------------------------------------
        # use the factory itself if it offers the Add*Tree methods, otherwise
        # fall back to a separate DataLoader object
        try:
            addBackgroundTreeMethod = self.factory.AddBackgroundTree
            addSignalTreeMethod = self.factory.AddSignalTree
            self.dataLoader = None
        except Exception:
            print("oh no..")
            # the DataLoader wants to be called '.'
            self.dataLoader = ROOT.TMVA.DataLoader(".")
            addBackgroundTreeMethod = self.dataLoader.AddBackgroundTree
            addSignalTreeMethod = self.dataLoader.AddSignalTree

        # DEBUG: restrict memory
        # resource.setrlimit(resource.RLIMIT_AS, (4.0*1024*1024*1024, 5.0*1024*1024*1024))

        # each cut is paired with the tree type it is registered as, so the
        # type does not depend on comparing the cut strings
        cutsAndTreeTypes = [
            (self.TrainCut, ROOT.TMVA.Types.kTraining),
            (self.EvalCut, ROOT.TMVA.Types.kTesting),
        ]

        self.sampleTrees = []
        for addTreeFcn, samples in [
                    [addBackgroundTreeMethod, self.samples['BKG']],
                    [addSignalTreeMethod, self.samples['SIG']]
                ]:
            for sample in samples:
                print ('*'*80,'\n%s\n'%sample,'*'*80)
                for additionalCut, treeType in cutsAndTreeTypes:
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)

                    tc = TreeCache.TreeCache(
                            sample=sample,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            config=self.config,
                            debug=True
                        )
                    sampleTree = tc.getTree()

                    # check BEFORE touching sampleTree.tree, otherwise a
                    # missing tree fails with an unrelated AttributeError
                    if not sampleTree:
                        print ("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")

                    sampleTree.tree.SetCacheSize(32*1024)

                    # prevent garbage collection
                    self.sampleTrees.append(sampleTree)

                    treeScale = sampleTree.getScale(sample) * self.globalRescale

                    # only non-empty trees can be added
                    if sampleTree.tree.GetEntries() > 0:
                        addTreeFcn(sampleTree.tree, treeScale, treeType)
                        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))

        # variables are registered on whichever object received the trees
        variableTarget = self.dataLoader if self.dataLoader else self.factory
        for var in self.MVA_Vars['Nominal']:
            variableTarget.AddVariable(var, 'D')

        return self

    # ----------------------------------------------------------------------------------------------------------------------
    # backup old .xml and .info files 
    # ----------------------------------------------------------------------------------------------------------------------
    def backupOldFiles(self):
        success = False
        MVAdir = self.config.get('Directories','vhbbpath')+'/python/weights/'
        backupDir = MVAdir + 'backup/'
        try:
            os.makedirs(backupDir)
        except:
            pass
        freeNumber = 1
        try:
            lastUsedBackupDirectories = sorted(glob.glob(backupDir + '/v*/'), key=lambda x: int(x.strip('/').split('/')[-1][1:]), reverse=True)
            freeNumber = 1 + int(lastUsedBackupDirectories[0].strip('/').split('/')[-1][1:]) if len(lastUsedBackupDirectories) > 0 else 1
        except Exception as e:
            print("\x1b[31mERROR: creating backup of MVA files failed!", e, "\x1b[0m")
            freeNumber = -1
        if freeNumber > -1:
            try:
                fileNamesToBackup = glob.glob(MVAdir + self.factoryname+'_'+self.mvaName + '.*')
                fileNamesToBackup += glob.glob(MVAdir + '/../mvatraining_MVA_ZllBDT_*.root')
                os.makedirs(backupDir + 'v%d/'%freeNumber)
                for fileNameToBackup in fileNamesToBackup:
                    shutil.copy(fileNameToBackup, backupDir + 'v%d/'%freeNumber)
                success = True
            except Exception as e:
                print("\x1b[31mERROR: creating backup of MVA files failed!", e, "\x1b[0m")
        return success


    def run(self):
        backupFiles = False
        try:
            backupFiles = eval(self.config.get('MVAGeneral', 'backupWeights'))
        except:
            pass
        if backupFiles:
            print('backing up old BDT files')
            self.backupOldFiles()
        # ----------------------------------------------------------------------------------------------------------------------
        # Execute TMVA
        # ----------------------------------------------------------------------------------------------------------------------
        self.factory.Verbose()
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        print('Execute TMVA: factory.BookMethod("%s", "%s", "%s")'%(self.MVAtype, self.mvaName, self.MVAsettings))
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        weightF = self.config.get('Weights','weightF')
        try:
            self.factory.BookMethod(self.MVAtype, self.mvaName, self.MVAsettings)
            print("ROOT 5 style TMVA found")
            self.factory.SetSignalWeightExpression(weightF)
            self.factory.SetBackgroundWeightExpression(weightF)
        except:
            print("ROOT 6 style TMVA found, using data loader object!!! >_<")
            print(" weights dir:", ROOT.TMVA.gConfig().GetIONames().fWeightFileDir)
            print(" data loader:", self.dataLoader)
            print(" type:       ", self.MVAtype)
            print(" name:       ", self.mvaName)
            print(" settings:   ", self.MVAsettings)
            ROOT.TMVA.gConfig().GetIONames().fWeightFileDir = 'weights'
            self.dataLoader.SetSignalWeightExpression(weightF)
            self.dataLoader.SetBackgroundWeightExpression(weightF)
            self.factory.BookMethod(self.dataLoader, self.MVAtype, self.mvaName, self.MVAsettings)
        sys.stdout.flush()
        print('Execute TMVA: TrainAllMethods')
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.factory.TrainAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: TestAllMethods')
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.factory.TestAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: EvaluateAllMethods')
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.factory.EvaluateAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: output.Write')
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.trainingOutputFile.Close()
        return self

    def printInfo(self):
        """Pickle the settings of this training into the '.info' file.

        The file is written to
        ``<vhbbpath>/python/weights/<factoryname>_<mvaName>.info``.
        """
        MVAdir = self.config.get('Directories','vhbbpath')+'/python/weights/'
        infoFileName = MVAdir+self.factoryname+'_'+self.mvaName+'.info'

        info = mvainfo(self.mvaName)
        info.factoryname = self.factoryname
        info.factorysettings = self.factorysettings
        info.MVAtype = self.MVAtype
        info.MVAsettings = self.MVAsettings
        info.weightfilepath = MVAdir
        info.path = self.samplesPath
        info.varset = self.treeVarSet
        info.vars = self.MVA_Vars['Nominal']

        # binary mode is what pickle expects; 'with' closes the file even if
        # dumping fails
        with open(infoFileName, 'wb') as infofile:
            print ('@DEBUG: output infofile name')
            print (infofile)
            pickle.dump(info, infofile)

    def getExpectedSignificance(self, tree, nBins, xMin, xMax, power=1.0, rescaleSig=1.0, rescaleBkg=1.0):
        """Histogram the MVA score of all events in a tree and print an S/sqrt(S+B) table.

        Events with classID == 1 are filled into the signal histogram, all
        other events into the background histogram, each weighted with the
        event weight times the corresponding rescale factor.

        Args:
            tree: iterable tree whose events provide the attributes
                <self.mvaName>, weight and classID
            nBins, xMin, xMax: binning of the two histograms
            power: if different from 1.0, the score is mapped to [0, 1),
                raised to this power and mapped back to the [xMin, xMax] range
            rescaleSig: factor applied to the weights of signal events
            rescaleBkg: factor applied to the weights of background events

        Returns:
            tuple (expectedSignificance, sSum, bSum), where the significance
            is the square root of the sum over all bins of S*S/(S+B)
        """
        hSIG = ROOT.TH1D("hSig","hSig",nBins,xMin,xMax)
        hBKG = ROOT.TH1D("hBkg","hBkg",nBins,xMin,xMax)
        print("INFO: GetEntries() = ", tree.GetEntries())
        usePower = power != 1.0
        if usePower:
            print("INFO: rescale BDT score with power ", power)

        # fill the signal and background histograms
        for event in tree:
            score = getattr(event, self.mvaName)
            if usePower:
                # normalize to the unit interval before applying the power
                x = (score-xMin)/(xMax-xMin)
                if x<0:
                    x=0
                elif x>0.999999:
                    x=0.999999
                value = math.pow(x, power)*(xMax-xMin)+xMin
            else:
                # keep the value inside the histogram range
                value = max(min(score,xMax-0.00001),xMin)

            weight = event.weight
            if event.classID == 1:
                hSIG.Fill(value, weight * rescaleSig)
            else:
                hBKG.Fill(value, weight * rescaleBkg)

        # per-bin table and sums
        ssbSum = 0.0
        sSum = 0
        bSum = 0
        sbTableFormat = "{bin: <16}{signal: <16}{background: <16}{ssb: <16}"
        print("---- nBins =", nBins, " from ", xMin, "..", xMax, "-----")
        print(sbTableFormat.format(bin="bin", signal="signal", background="background", ssb="S/sqrt(S+B)"))
        for i in range(nBins):
            # ROOT bin numbers start at 1
            nSig = hSIG.GetBinContent(1+i)
            nBkg = hBKG.GetBinContent(1+i)
            nTotal = nSig + nBkg
            if nTotal > 0:
                ssbSum += nSig*nSig/nTotal
                ssb = nSig/math.sqrt(nTotal)
            else:
                ssb = 0
            sSum += nSig
            bSum += nBkg
            print(sbTableFormat.format(bin=i, signal=round(nSig,1), background=round(nBkg,1), ssb=round(ssb,3)))
        expectedSignificance = math.sqrt(ssbSum)
        print(sbTableFormat.format(bin="SUM", signal=round(sSum,1), background=round(bSum,1), ssb="\x1b[34mZ=%1.3f\x1b[0m"%expectedSignificance))
        print("-"*40)
        hSIG.Delete()
        hBKG.Delete()
        return expectedSignificance, sSum, bSum

    def estimateExpectedSignificance(self):
        """Print expected significance tables for the test and training trees.

        Opens the training output file (self.trainingOutputFileName), runs
        getExpectedSignificance() on './TestTree' with several binnings and
        power rescalings, and finally compares the test tree with
        './TrainTree' using a binning close to the nominal one. The file is
        closed again before returning.

        Raises:
            Exception: if the training output file can't be opened
        """
        print("INFO: open ", self.trainingOutputFileName)
        rootFile = ROOT.TFile.Open(self.trainingOutputFileName, "READ")
        print("INFO: ->", rootFile)
        if not rootFile or rootFile.IsZombie():
            print("\x1b[31mERROR: could not open training output file!\x1b[0m")
            raise Exception("TrainingOutputFileNotFound")

        try:
            testTree = rootFile.Get('./TestTree')

            # run a few tests with different binnings and rescaling of BDT score
            self.getExpectedSignificance(testTree, 15, -0.8, 1.0)
            self.getExpectedSignificance(testTree, 15, -0.8, 0.9)
            self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=0.5)
            self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=0.33)
            self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=1.5)
            self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=2.0)

            # close to nominal binning
            print("---- ~nominal TEST -----")
            esTest, sTest, bTest = self.getExpectedSignificance(testTree, 15, -0.8, 0.8)
            print("---- ~nominal TRAINING (without correct normalization) -----")
            trainTree = rootFile.Get('./TrainTree')
            esTrain, sTrain, bTrain = self.getExpectedSignificance(trainTree, 15, -0.8, 0.8)

            # the tree ./TrainTree contains the input events for training AFTER re-balancing the classes
            # therefore for SIG/BKG separately the normalization is fixed to the one of the TEST events
            if sTrain != 0:
                rescaleSig = 1.0*sTest/sTrain
            else:
                # an empty class can't be normalized, leave it unscaled instead of dividing by zero
                print("\x1b[31mWARNING: no signal in the training tree, signal is not rescaled!\x1b[0m")
                rescaleSig = 1.0
            if bTrain != 0:
                rescaleBkg = 1.0*bTest/bTrain
            else:
                print("\x1b[31mWARNING: no background in the training tree, background is not rescaled!\x1b[0m")
                rescaleBkg = 1.0
            print("---- ~nominal TRAINING -----")
            esTrain, sTrain, bTrain = self.getExpectedSignificance(trainTree, 15, -0.8, 0.8, rescaleSig=rescaleSig, rescaleBkg=rescaleBkg)
        finally:
            # release the file handle, the trees are not needed anymore
            rootFile.Close()
Example #12
0
# Read the settings of this step from the configuration and select the sample to process.
# NOTE(review): eval() executes text taken from the config file, this is only safe for trusted configs
TrainFlag = eval(config.get('Analysis','TrainFlag'))
btagLibrary = config.get('BTagReshaping','library')
samplesinfo=config.get('Directories','samplesinfo')
channel=config.get('Configuration','channel')
VHbbNameSpace=config.get('VHbbNameSpace','library')
ROOT.gSystem.Load(VHbbNameSpace)
pathIN = config.get('Directories','SYSin')
pathOUT = config.get('Directories','SYSout')
tmpDir = config.get('Directories','scratch')
# single-argument print calls give the same output on Python 2 and Python 3
print('INput samples:\t%s'%pathIN)
print('OUTput samples:\t%s'%pathOUT)

fileLocator = FileLocator(config=config)

# samples: exactly one sample (not a subsample) has to match the requested identifier
info = ParseInfo(samplesinfo, pathIN)
matchingSamples = [x for x in info if x.identifier==opts.sampleIdentifier and not x.subsample]
if len(matchingSamples) != 1:
    print("need exactly 1 sample identifier as input with -S !!")
    print(matchingSamples)
    exit(1)
sample = matchingSamples[0]

# comma separated list of collections to add, empty entries are dropped
# (an empty option string therefore results in an empty list)
collections = [x.strip() for x in opts.addCollections.split(',') if len(x.strip()) > 0]
if len(collections) < 1:
    print("\x1b[31mWARNING: no collections added! Specify the collections to add with the --addCollections option!\x1b[0m")
print('collections to add: %s' % (collections,))


for fileName in filelist:
Example #13
0
# NOTE(review): eval() executes text taken from the config file, this is only safe for trusted configs
signals = eval(signals)
#backgrounds
backgrounds = eval(config.get(run, 'backgrounds'))
treeVarSet = config.get(run, 'treeVarSet')
# single-argument print calls give the same output on Python 2 and Python 3
print('signals are %s' % (signals,))
print('backgrounds are %s' % (backgrounds,))

#variables
#TreeVar Array: space separated list of the nominal input variables
MVA_Vars = {'Nominal': config.get(treeVarSet, 'Nominal').split(' ')}

#Infofile
info = ParseInfo(samplesinfo, path)

#Workdir
workdir = ROOT.gDirectory.GetPath()

#Remove EventForTraining in order to run the MVA directly from the PREP step
#TrainCut='%s & !((evt%s)==0 || isData)'%(TCut,'%2')
#EvalCut= '%s & ((evt%s)==0 || isData)'%(TCut,'%2')
# the two cuts are the logical negation of each other: events with even evt (or data) are used for evaluation
TrainCut = '!((evt%2)==0 || isData)'
EvalCut = '((evt%2)==0 || isData)'
#TrainCut='%s & EventForTraining==1'%TCut
#EvalCut='%s & EventForTraining==0'%TCut

# no train/eval split is applied if data is used as signal
if data_as_signal:
    TrainCut = '1'
    EvalCut = '1'
Example #14
0
#Import after configure to get help message
from myutils import BetterConfigParser, progbar, printc, ParseInfo, MvaEvaluator

# read the configuration file(s) given on the command line
config = BetterConfigParser()
config.read(opts.config)
anaTag = config.get("Analysis","tag")

# get locations:
Wdir = config.get('Directories','Wdir')
samplesinfo = config.get('Directories','samplesinfo')

# input and output sample directories (options 'MVAin' and 'MVAout')
INpath  = config.get('Directories','MVAin')
OUTpath = config.get('Directories','MVAout')

# sample definitions, using the input path
info = ParseInfo(samplesinfo,INpath)

# comma separated list of discriminators, e.g.: RTight_blavla,bsbsb
arglist = opts.discr #RTight_blavla,bsbsb

# comma separated list of sample names
namelistIN = opts.names
namelist   = namelistIN.split(',')
print ('\n-----> SampleList: ', namelist)

MVAlist = arglist.split(',')
print ('-----> MVAList:', MVAlist)

# CONFIG
# factory
factoryname = config.get('factory','factoryname')

# unique training name
Example #15
0
    def __init__(self, config, region, vars=None, title=None):
        """Read the plot settings of one region and resolve its data and MC samples.

        Args:
            config: configuration parser providing the sections read below
            region: name of the plot region, its options are taken from the
                section 'Plot:<region>'
            vars: optional list of variable names; if it is empty, the comma
                separated option 'vars' of the region section is used instead
            title: optional title, an empty title is stored as None
        """
        self.config = config
        self.region = region
        self.vars = vars
        if title and len(title) > 0:
            self.title = title
        else:
            self.title = None

        # VHbb namespace: a non-zero return code of Load() is reported as error
        libraryName = config.get('VHbbNameSpace', 'library')
        loadStatus = ROOT.gSystem.Load(libraryName)
        if loadStatus == 0:
            print("INFO: loaded VHbbNameSpace: %s" % libraryName)
        else:
            print("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m" % loadStatus)

        # additional blinding cut: contained in plots, cut on the event number
        if self.config.has_option('Plot_general', 'addBlindingCut'):
            self.addBlindingCut = self.config.get('Plot_general', 'addBlindingCut')
            print('adding add. blinding cut:', self.addBlindingCut)
        else:
            self.addBlindingCut = None

        # directories and sample definitions
        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')

        # plot regions
        self.configSection = 'Plot:%s' % region

        # clean up a list of variables passed by the caller
        if self.vars and type(self.vars) is list:
            strippedVars = [varName.strip() for varName in self.vars]
            self.vars = [varName for varName in strippedVars if len(varName) > 0]

        # fall back to the variables defined in the config for this region
        if not self.vars or len(self.vars) < 1:
            varListFromConfig = self.config.get(self.configSection, 'vars').split(',')
            print("VARS::", self.configSection, " => ", varListFromConfig)
            strippedVars = [varName.strip() for varName in varListFromConfig]
            self.vars = [varName for varName in strippedVars if len(varName) > 0]

        # load samples
        # data corresponding to each CR (section)
        self.data = eval(self.config.get(self.configSection, 'Datas'))
        # list of mc samples
        self.mc = eval(self.config.get('Plot_general', 'samples'))
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        # a region which defines a 'Signal' option is treated as signal region
        self.signalRegion = False
        if self.config.has_option(self.configSection, 'Signal'):
            signalName = self.config.get(self.configSection, 'Signal')
            self.mc.append(signalName)
            self.signalRegion = True
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)

        self.groupDict = eval(self.config.get('Plot_general', 'Group'))
        self.subcutPlotName = ''
        self.histogramStacks = {}
Example #16
0
class CachePlot(object):
    """Create the tree caches needed for plotting one sample in a set of regions.

    For the given sample identifier one TreeCache is created per (sub)sample
    and plot region. Parts which are already cached are skipped unless
    forceRedo is set.
    """

    def __init__(self, config, sampleIdentifier, regions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, forceRedo=False, fileList=None):
        """Read sample definitions and region cuts from the configuration.

        Args:
            config: configuration parser
            sampleIdentifier: identifier of the sample to cache
            regions: names of the regions, each has to be an option in the
                section 'Cuts'; duplicates are removed
            splitFilesChunks, chunkNumber, splitFilesChunkSize: passed on to
                TreeCache / SampleTree to select the part of the sample
            forceRedo: if True, existing cached files of this chunk are
                deleted and created again
            fileList: optional compressed file list, decompressed with
                FileList.decompress and compared to the current file list
        """
        self.config = config
        self.sampleIdentifier = sampleIdentifier
        self.regions = list(set(regions))
        self.forceRedo = forceRedo

        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = self.config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        # NOTE(review): eval() executes text taken from the config file, this is only safe for trusted configs
        self.sampleNames = eval(self.config.get('Plot_general', 'samples'))
        self.dataNames = eval(self.config.get('Plot_general', 'Data'))
        self.samples = self.samplesInfo.get_samples(self.sampleNames + self.dataNames)

        self.regionsDict = {region: {'cut': config.get('Cuts', region)} for region in self.regions}
        self.splitFilesChunkSize = splitFilesChunkSize
        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.fileList = FileList.decompress(fileList) if fileList else None

        VHbbNameSpace=config.get('VHbbNameSpace','library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print ("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"%returnCode)
        else:
            print ("INFO: loaded VHbbNameSpace: %s"%VHbbNameSpace)

    def printInfo(self):
        """Print the cut of every region."""
        print ("REGION:".ljust(24),"CUT:")
        # items() works with Python 2 and Python 3 (iteritems() only exists in Python 2)
        for region,regionInfo in self.regionsDict.items():
            print (" > ",region.ljust(20), regionInfo['cut'])

    def _getBranchesToKeep(self):
        """Return the list of branches which have to be kept in the cached trees."""
        keepBranchesPlot = []
        # both options are optional: a missing or broken option is skipped on purpose
        for option in ('keep_branches_plot', 'keep_branches'):
            try:
                keepBranchesPlot += eval(self.config.get('Branches', option))
            except Exception:
                pass

        # also keep some branches which might be used later in variables definition and weights
        try:
            for section in self.config.sections():
                if section.startswith('plotDef:') and self.config.has_option(section, 'relPath'):
                    keepBranchesPlot.append(self.config.get(section, 'relPath'))
        except Exception as e:
            print("\x1b[31mERROR: config file contains an error! automatic selection of branches to keep will not work!\x1b[0m")
            print(e)
        try:
            keepBranchesPlot.append(self.config.get('Weights', 'weightF'))
        except Exception:
            pass

        # plotting region cut
        for regionInfo in self.regionsDict.values():
            keepBranchesPlot.append(regionInfo['cut'])
        return BranchList(keepBranchesPlot).getListOfBranches()

    def _getSampleCuts(self, sample, region, regionInfo):
        """Return the list of cuts applied to one (sub)sample in one region."""
        configSection = 'Plot:%s'%region
        sampleCuts = [sample.subcut]
        if regionInfo['cut']:
            sampleCuts.append(regionInfo['cut'])
        if self.config.has_option(configSection, 'Datacut'):
            sampleCuts.append(self.config.get(configSection, 'Datacut'))
        if self.config.has_option('Plot_general','addBlindingCut'):
            # the cut expression itself is needed here, not the result of has_option()
            sampleCuts.append(self.config.get('Plot_general', 'addBlindingCut'))
        return sampleCuts

    def run(self):
        """Create all missing caches for the sample and process the sample tree.

        Raises:
            Exception: if the sample tree can't be created or if the sample
                files differ from the file list given at construction
        """
        # keep additional branches for plotting
        keepBranchesPlotFinal = self._getBranchesToKeep()
        print("KEEP:", keepBranchesPlotFinal)

        # ----------------------------------------------------------------------------------------------------------------------
        # cache samples
        # ----------------------------------------------------------------------------------------------------------------------
        sampleToCache = self.sampleIdentifier
        print ('*'*80)
        print (' ',sampleToCache)
        print ('*'*80)
        treeCaches = []

        # for all (sub)samples which come from the same files (sampleIdentifier)
        subsamples = [x for x in self.samples if x.identifier == sampleToCache]
        for sample in subsamples:

            # add cuts for all regions
            for region,regionInfo in self.regionsDict.items():
                sampleCuts = self._getSampleCuts(sample, region, regionInfo)

                # arbitrary (optional) name for the output tree, used for print-out (the TreeCache object has no idea what it is doing, e.g. dc, plot etc.)
                cacheName = 'plot:{region}_{sample}'.format(region=region, sample=sample.name)

                # add cache object
                tc = TreeCache.TreeCache(
                    name=cacheName,
                    sample=sample.name,
                    cutList=sampleCuts,
                    inputFolder=self.samplesPath,
                    splitFilesChunks=self.splitFilesChunks,
                    chunkNumber=self.chunkNumber,
                    splitFilesChunkSize=self.splitFilesChunkSize,
                    fileList=self.fileList,
                    branches=keepBranchesPlotFinal,
                    config=self.config,
                    debug=True
                )

                # check if this part of the sample is already cached
                isCached = tc.partIsCached()
                if isCached and not self.forceRedo:
                    print ("INFO: already cached!",tc, "(",tc.hash,")")
                    continue
                if isCached:
                    tc.deleteCachedFiles(chunkNumber=self.chunkNumber)

                # for the first sample which comes from this files, load the tree
                if not self.sampleTree:
                    self.sampleTree = SampleTree({'name': sample.identifier, 'folder': self.samplesPath}, splitFilesChunkSize=self.splitFilesChunkSize, chunkNumber=self.chunkNumber, config=self.config, saveMemory=True)
                    if not self.sampleTree or not self.sampleTree.tree:
                        print ("\x1b[31mERROR: creation of sample tree failed!!\x1b[0m")
                        raise Exception("CreationOfSampleTreeFailed")
                    # consistency check on the file list at submission time and now
                    fileListNow = self.sampleTree.getSampleFileNameChunk(self.chunkNumber)
                    if self.fileList and (sorted(self.fileList) != sorted(fileListNow)):
                        print ("\x1b[31mERROR: sample files have changed between submission and run of the job!\x1b[0m")
                        raise Exception("SampleFilesHaveChanged")

                treeCaches.append(tc.setSampleTree(self.sampleTree).cache())

        if len(treeCaches) > 0:
            # run on the tree
            self.sampleTree.process()
        else:
            print ("nothing to do!")
    def __init__(self, config, mvaName, useSyst=True, useWeightSyst=True, testRun=False):
        """Read the definition of one MVA training from the configuration.

        Args:
            config: configuration parser
            mvaName: name of the config section which defines the training
                (options 'treeCut', 'treeVarSet' and optionally 'systematics',
                'classDict' or 'classes')
            useSyst: if True, the systematics listed in the training section
                are read from the section 'Weights'
            useWeightSyst: not used in this part of the constructor
            testRun: stored as self.testRun, a debug message is printed if set
        """
        self.mvaName = mvaName
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
        self.dataFormatVersion = 3
        self.sampleTrees = []
        self.config = config
        self.testRun = testRun
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)

        # region
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/eval sets
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')

        # rescale MC by 2 because of train/eval split
        self.globalRescale = 2.0

        # variables and systematics
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.MVA_Vars = {'Nominal': [x for x in config.get(self.treeVarSet, 'Nominal').strip().split(' ') if len(x.strip()) > 0]}

        self.weightSYS = []
        self.weightSYSweights = {}

        self.systematics = []
        if useSyst:
            print('INFO: use systematics in training!')
            # NOTE(review): eval() executes text taken from the config file, this is only safe for trusted configs
            self.systList = eval(self.config.get(mvaName, 'systematics')) if self.config.has_option(mvaName, 'systematics') else []
            for syst in self.systList:
                # both spellings of the up/down suffix are supported
                systNameUp   = syst+'_UP'   if self.config.has_option('Weights',syst+'_UP')   else syst+'_Up'
                systNameDown = syst+'_DOWN' if self.config.has_option('Weights',syst+'_DOWN') else syst+'_Down'

                self.systematics.append({
                    'name': syst,
                    'U': self.config.get('Weights', systNameUp),
                    'D': self.config.get('Weights', systNameDown),
                    })

        # default: signal vs. background
        self.sampleNames = {
                    'SIG_ALL': eval(self.config.get('Plot_general', 'allSIG')),
                    'BKG_ALL': eval(self.config.get('Plot_general', 'allBKG')),
                }
        # for multi-output classifiers load dictionary from config
        self.categories = None
        if self.config.has_option(mvaName, 'classDict'):
            self.sampleNames = eval(self.config.get(mvaName, 'classDict'))
            # self.samples is not defined yet at this point, but it will have
            # the same keys as self.sampleNames
            self.categories = list(self.sampleNames.keys())
            print("classes dict:", self.sampleNames)
        elif self.config.has_option(mvaName, 'classes'):
            # list of (category, samples) pairs, which also fixes the order of the categories
            classes = eval(self.config.get(mvaName, 'classes'))
            self.sampleNames = dict(classes)
            self.categories = [x[0] for x in classes]
        # items() works with Python 2 and Python 3 (iteritems() only exists in Python 2)
        self.samples = {category: self.samplesInfo.get_samples(samples) for category,samples in self.sampleNames.items()}
        if not self.categories:
            self.categories = list(self.samples.keys())
        if self.testRun:
            print("\x1b[31mDEBUG: TEST-RUN, using only small subset of samples!\x1b[0m")
Example #18
0
    def __init__(self,
                 config,
                 sampleIdentifier,
                 trainingRegions,
                 splitFilesChunks=1,
                 chunkNumber=1,
                 splitFilesChunkSize=-1,
                 force=False):
        """Collect samples, cuts and variables of all given training regions.

        Args:
            config: configuration parser
            sampleIdentifier: identifier of the sample, stored for later use
            trainingRegions: names of the config sections defining the
                trainings (options 'backgrounds', 'signals', 'treeVarSet' and
                optionally 'data', 'treeCut', 'systematics')
            splitFilesChunks, chunkNumber, splitFilesChunkSize: stored as
                attributes with the same names
            force: stored as self.force
        """
        self.config = config
        self.force = force
        self.sampleIdentifier = sampleIdentifier
        self.trainingRegions = trainingRegions

        self.sampleTree = None
        # option 'trainingSamples' is used if present, otherwise 'MVAin'
        samplesPathOption = 'trainingSamples' if config.has_option('Directories', 'trainingSamples') else 'MVAin'
        self.samplesPath = self.config.get('Directories', samplesPathOption)
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath, config=self.config)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        # unique names of the background samples of all regions
        backgroundNames = []
        for trainingRegion in self.trainingRegions:
            backgroundNames = backgroundNames + eval(self.config.get(trainingRegion, 'backgrounds'))
        self.backgroundSampleNames = list(set(backgroundNames))

        # unique names of the signal samples of all regions
        signalNames = []
        for trainingRegion in self.trainingRegions:
            signalNames = signalNames + eval(self.config.get(trainingRegion, 'signals'))
        self.signalSampleNames = list(set(signalNames))

        # can include DATA in the .h5 files for training (option 'data' is optional)
        dataNames = []
        for trainingRegion in self.trainingRegions:
            if self.config.has_option(trainingRegion, 'data'):
                dataNames = dataNames + eval(self.config.get(trainingRegion, 'data'))
        self.dataSampleNames = list(set(dataNames))

        allSampleNames = set(self.backgroundSampleNames + self.signalSampleNames + self.dataSampleNames)
        self.samples = self.samplesInfo.get_samples(list(allSampleNames))

        self.trainingRegionsDict = {}
        for trainingRegion in self.trainingRegions:
            # without option 'treeCut' the region name itself is the name of the cut
            if config.has_option(trainingRegion, 'treeCut'):
                treeCutName = config.get(trainingRegion, 'treeCut')
            else:
                treeCutName = trainingRegion
            treeVarSet = config.get(trainingRegion, 'treeVarSet').strip()

            # systematics are given either as python list or as space separated string
            systematics = []
            if config.has_option(trainingRegion, 'systematics'):
                systematicsString = config.get(trainingRegion, 'systematics').strip()
                if systematicsString.startswith('['):
                    systematics = eval(systematicsString)
                else:
                    systematics = systematicsString.split(' ')

            mvaVars = config.get(treeVarSet, 'Nominal').split(' ')

            # weights of the up/down variations which are defined in section 'Weights'
            weightVars = []
            for syst in systematics:
                for upperCaseSuffix, mixedCaseSuffix in (('_UP', '_Up'), ('_DOWN', '_Down')):
                    if self.config.has_option('Weights', syst + upperCaseSuffix):
                        systName = syst + upperCaseSuffix
                    else:
                        systName = syst + mixedCaseSuffix
                    if self.config.has_option('Weights', systName):
                        weightVars.append(self.config.get('Weights', systName))

            self.trainingRegionsDict[trainingRegion] = {
                'cut': config.get('Cuts', treeCutName),
                'vars': mvaVars,
                'weightVars': weightVars,
            }

        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')

        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.splitFilesChunkSize = splitFilesChunkSize

        libraryName = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(libraryName)
Example #19
0
# read the configuration file(s) given on the command line
config = BetterConfigParser()
config.read(opts.config)
anaTag = config.get("Analysis", "tag")

# get locations:
Wdir = config.get('Directories', 'Wdir')
samplesinfo = config.get('Directories', 'samplesinfo')

# input and output sample directories (options 'MVAin' and 'MVAout')
INpath = config.get('Directories', 'MVAin')
OUTpath = config.get('Directories', 'MVAout')

# read shape systematics (kept as the raw string from the config)
systematics = config.get('systematics', 'systematics')

# sample definitions, using the input path
info = ParseInfo(samplesinfo, INpath)

# comma separated list of discriminators, e.g.: RTight_blavla,bsbsb
arglist = opts.discr  #RTight_blavla,bsbsb

# comma separated list of sample names
namelistIN = opts.names
namelist = namelistIN.split(',')
print('\n-----> SampleList: ', namelist)

MVAlist = arglist.split(',')
print('-----> MVAList:', MVAlist)

# CONFIG
# factory
factoryname = config.get('factory', 'factoryname')

# unique training name
Example #20
0
class SkimsHelper(object):
    """Write the cached trees of all samples of one plot region into a single skim file."""

    def __init__(self, config, region, sampleIdentifier=None, opts=None):
        """Read paths, cuts and samples of the region from the configuration.

        Args:
            config: configuration parser
            region: name of the plot region, its options are taken from the
                section 'Plot:<region>'
            sampleIdentifier: optional comma separated list of sample
                identifiers, only matching samples are used
            opts: options object, its attributes inputDir and outputDir are
                the names of the options in section 'Directories' used as
                input and output path (the default None is not usable here)
        """
        self.config = config
        self.region = region
        self.sampleIdentifiers = sampleIdentifier.split(',') if sampleIdentifier and len(sampleIdentifier) > 0 else None

        # VHbb namespace
        VHbbNameSpace=config.get('VHbbNameSpace','library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print ("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"%returnCode)
        else:
            print ("INFO: loaded VHbbNameSpace: %s"%VHbbNameSpace)

        # input/output paths
        self.fileLocator = FileLocator(config=self.config)
        self.pathIN = self.config.get('Directories', opts.inputDir)
        self.pathOUT = self.config.get('Directories', opts.outputDir)
        self.tmpDir = self.config.get('Directories', 'scratch')

        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')

        # plot regions
        self.configSection='Plot:%s'%region

        # additional cut to only plot a subset of the region
        self.subcut = None
        if self.config.has_option(self.configSection, 'subcut'):
            self.subcut = self.config.get(self.configSection, 'subcut')
            print("INFO: use cut:", self.subcut)

        # additional global blinding cut:
        self.addBlindingCut = None
        if self.config.has_option('Plot_general','addBlindingCut'): #contained in plots, cut on the event number
            self.addBlindingCut = self.config.get('Plot_general','addBlindingCut')
            print ('adding add. blinding cut:', self.addBlindingCut)

        # load samples
        # NOTE(review): eval() executes text taken from the config file, this is only safe for trusted configs
        self.data = eval(self.config.get(self.configSection, 'Datas')) # read the data corresponding to each CR (section)
        self.mc = eval(self.config.get('Plot_general', 'samples')) # read the list of mc samples
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        self.signalRegion = False
        if self.config.has_option(self.configSection, 'Signal'):
            self.mc.append(self.config.get(self.configSection, 'Signal'))
            self.signalRegion = True
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)

        # filter samples used in the plot
        if self.sampleIdentifiers:
            self.dataSamples = [x for x in self.dataSamples if x.identifier in self.sampleIdentifiers]
            self.mcSamples =   [x for x in self.mcSamples   if x.identifier in self.sampleIdentifiers]

    def prepare(self):
        """Collect the names of the cached files of all data and MC samples.

        Returns:
            self, to allow chaining of calls
        """
        # add DATA + MC samples
        self.fileNames = []
        for sample in self.dataSamples + self.mcSamples:
            print(sample.identifier)

            # cuts
            sampleCuts = [sample.subcut]
            if self.config.has_option('Cuts', self.region):
                sampleCuts.append(self.config.get('Cuts', self.region))
            if self.config.has_option(self.configSection, 'Datacut'):
                sampleCuts.append(self.config.get(self.configSection, 'Datacut'))
            if self.addBlindingCut:
                sampleCuts.append(self.addBlindingCut)

            # get sample tree from cache
            # (use the config of this object, a global 'config' does not have to exist)
            self.fileNames += TreeCache.TreeCache(
                    sample=sample,
                    cutList=sampleCuts,
                    inputFolder=self.samplesPath,
                    config=self.config
                ).findCachedFileNames()
        if len(self.fileNames) < 1:
            print("\x1b[31mERROR: no files found, run cacheplot!\x1b[0m")
        return self

    def _getOutputFileNames(self):
        """Return the names (temporary file, final file) of the skim output."""
        name = self.config.get('Configuration', 'channel') if self.config.has_option('Configuration', 'channel') else '_'
        timestamp = datetime.datetime.now().strftime("%y%m%d")
        # the region of this object is used, a global 'region' does not have to exist
        baseName = 'skim_' + name + '_' + self.region + '_' + timestamp
        tmpName = self.tmpDir + '/' + baseName + '_tmp.root'
        destName = self.pathOUT + '/' + baseName + '.root'
        return tmpName, destName

    def run(self):
        """Write all events of the files found by prepare() to one file and copy it to the output path."""
        tmpName, destName = self._getOutputFileNames()

        sampleTree = SampleTree(self.fileNames, config=self.config)

        # optional branch with a region dependent number, -1 if the region is not listed
        if self.config.has_option('Plot_general', 'controlSample'):
            controlSampleDict = eval(self.config.get('Plot_general', 'controlSample'))
            controlSample = controlSampleDict[self.region] if self.region in controlSampleDict else -1
            sampleTree.addOutputBranch("controlSample", lambda x: controlSample, branchType="i")
            print("INFO: setting controlSample to", controlSample)

        sampleTree.addOutputTree(tmpName, cut='1', branches='*', friend=False)
        sampleTree.process()

        # copy to final destination
        if sampleTree.getNumberOfOutputTrees() > 0:
            try:
                self.fileLocator.cp(tmpName, destName, force=True)
                print('copy ', tmpName, destName)

                # the temporary file is only removed after a successful copy
                if not self.fileLocator.isValidRootFile(destName):
                    print("\x1b[31mERROR: copy failed, output is broken!\x1b[0m")
                else:
                    try:
                        self.fileLocator.rm(tmpName)
                    except Exception as e:
                        print(e)
            except Exception as e:
                print("\x1b[31mERROR: copy failed!", e, "\x1b[0m")
Example #21
0
    def __init__(self, config, mvaName):
        """Read the definition of one MVA training from the configuration.

        Args:
            config: configuration parser
            mvaName: name of the config section which defines the training
                (options 'treeCut' and 'treeVarSet')
        """
        self.mvaName = mvaName
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
        self.dataFormatVersion = 2
        self.sampleTrees = []
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)

        # region
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/eval sets
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')
        # rescale MC by 2 because of train/eval split
        self.globalRescale = 2.0

        # variables and systematics
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        # empty entries (e.g. from repeated spaces) are dropped, they would
        # otherwise be looked up as option '' in the variable set below
        self.systematics = [
            x for x in config.get('systematics',
                                  'systematics').strip().split(' ')
            if len(x.strip()) > 0
        ]
        self.MVA_Vars = {
            'Nominal': [
                x for x in config.get(self.treeVarSet,
                                      'Nominal').strip().split(' ')
                if len(x.strip()) > 0
            ]
        }
        # one list of variables per systematic ('systematic' instead of 'sys'
        # as loop variable, to not shadow the sys module)
        for systematic in self.systematics:
            self.MVA_Vars[systematic] = [
                x for x in config.get(self.treeVarSet,
                                      systematic).strip().split(' ')
                if len(x.strip()) > 0
            ]

        # b-tag weight systematics: weight without b-tag weight times the varied b-tag weight
        self.weightSYS = []
        self.weightWithoutBtag = self.config.get('Weights', 'weight_noBTag')
        self.weightSYSweights = {}
        for d in ['Up', 'Down']:
            for syst in [
                    'HFStats1', 'HFStats2', 'LF', 'HF', 'LFStats1', 'LFStats2',
                    'cErr2', 'cErr1', 'JES'
            ]:
                systFullName = "btag_" + syst + "_" + d
                weightName = "bTagWeightCMVAV2_Moriond_" + syst + d
                self.weightSYSweights[
                    systFullName] = self.weightWithoutBtag + '*' + weightName
                self.weightSYS.append(systFullName)

        # samples
        # NOTE(review): eval() executes text taken from the config file, this is only safe for trusted configs
        self.sampleNames = {
            #                   'BKG_TT': eval(self.config.get('Plot_general', 'TT')),
            #                   'BKG_ST': eval(self.config.get('Plot_general', 'ST')),
            #                   'BKG_VV': eval(self.config.get('Plot_general', 'VV')),
            #                   'BKG_DY2b': eval(self.config.get('Plot_general', 'DY2b')),
            #                   'BKG_DY1b': eval(self.config.get('Plot_general', 'DY1b')),
            #                   'BKG_DY0b': eval(self.config.get('Plot_general', 'DYlight')),
            #                   'SIG_ggZH': eval(self.config.get('Plot_general', 'ggZH')),
            #                   'SIG_qqZH': eval(self.config.get('Plot_general', 'qqZH')),
            'SIG_ALL': eval(self.config.get('Plot_general', 'allSIG')),
            'BKG_ALL': eval(self.config.get('Plot_general', 'allBKG')),
        }
        # items() works with Python 2 and Python 3 (iteritems() only exists in Python 2)
        self.samples = {
            category: self.samplesInfo.get_samples(samples)
            for category, samples in self.sampleNames.items()
        }
                  default=None,
                  help="max number of files to process")
# parse the command line and read the configuration file(s) given there
opts, args = parser.parse_args(argv)
config = BetterConfigParser()
config.read(opts.config)

# optional file list, passed in compressed form
if len(opts.fileList) > 0:
    fileList = FileList.decompress(opts.fileList)
else:
    fileList = None

pathOUT = config.get('Directories', 'PREPout')
samplefiles = config.get('Directories', 'samplefiles')
sampleconf = config

whereToLaunch = config.get('Configuration', 'whereToLaunch')

# all samples which are not subsamples, restricted to the requested
# identifiers (comma separated) if any are given
info = ParseInfo(samples_path=None, config=config)
samples = [
    x for x in info
    if not x.subsample and (
        len(opts.sampleIdentifier) == 0
        or x.identifier in opts.sampleIdentifier.split(',')
    )
]
treeCopier = copytreePSI.CopyTreePSI(config=config)

# only keep the first samples if a limit is given
if opts.limit and len(samples) > int(opts.limit):
    del samples[int(opts.limit):]

# copy the trees of every selected sample, applying its skimming cut
for sample in samples:
    treeCopier.copytreePSI(
        pathIN=samplefiles,
        pathOUT=pathOUT,
        folderName=sample.identifier,
        skimmingCut=sample.addtreecut,
        fileList=fileList,
    )
Example #23
0
class SkimsHelper(object):
    """Collect the cached trees of one plot region and write them into one skim file.

    Typical use is ``SkimsHelper(...).prepare().run()``: prepare() looks up the
    cached files of all data and MC samples of the region, run() processes them
    with a SampleTree and copies the output to the final destination.
    """

    def __init__(self, config, region, sampleIdentifier=None, opts=None):
        """Read paths, cuts and sample lists for `region` from `config`.

        Args:
            config: configuration object (get/has_option are used).
            region: region name; the config section 'Plot:<region>' is read.
            sampleIdentifier: optional comma separated list of sample
                identifiers; if given, only these samples are kept.
            opts: object with attributes inputDir and outputDir which name
                options of the [Directories] section. Despite the None
                default it is required, since both attributes are read.
        """
        self.config = config
        self.region = region
        self.sampleIdentifiers = sampleIdentifier.split(',') if sampleIdentifier else None

        # VHbb namespace
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print(
                "\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"
                % returnCode)
        else:
            print("INFO: loaded VHbbNameSpace: %s" % VHbbNameSpace)

        # input/output paths
        self.fileLocator = FileLocator(config=self.config)
        self.pathIN = self.config.get('Directories', opts.inputDir)
        self.pathOUT = self.config.get('Directories', opts.outputDir)
        self.tmpDir = self.config.get('Directories', 'scratch')

        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath,
                                     config=self.config)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')

        # plot regions
        self.configSection = 'Plot:%s' % region

        # additional cut to only plot a subset of the region
        self.subcut = None
        if self.config.has_option(self.configSection, 'subcut'):
            self.subcut = self.config.get(self.configSection, 'subcut')
            print("INFO: use cut:", self.subcut)

        # additional global blinding cut (contained in plots, cut on the event number)
        self.addBlindingCut = None
        if self.config.has_option('Plot_general', 'addBlindingCut'):
            self.addBlindingCut = self.config.get('Plot_general',
                                                  'addBlindingCut')
            print('adding add. blinding cut:', self.addBlindingCut)

        # load samples
        # NOTE(review): the sample lists are eval()'ed, so the config has to be trusted
        # read the data corresponding to each CR (section)
        self.data = eval(self.config.get(self.configSection, 'Datas'))
        # read the list of mc samples
        self.mc = eval(self.config.get('Plot_general', 'samples'))
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        self.signalRegion = False
        if self.config.has_option(self.configSection, 'Signal'):
            self.mc.append(self.config.get(self.configSection, 'Signal'))
            self.signalRegion = True
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)

        # filter samples used in the plot
        if self.sampleIdentifiers:
            self.dataSamples = [
                x for x in self.dataSamples
                if x.identifier in self.sampleIdentifiers
            ]
            self.mcSamples = [
                x for x in self.mcSamples
                if x.identifier in self.sampleIdentifiers
            ]

    def prepare(self):
        """Collect the cached file names of all data and MC samples in self.fileNames.

        Returns:
            self, to allow chaining.

        Raises:
            Exception("NotCached"): if a sample is not cached for its cuts.
        """
        # add DATA + MC samples
        self.fileNames = []
        for sample in self.dataSamples + self.mcSamples:
            print(sample.identifier)

            # cuts
            sampleCuts = [sample.subcut]
            if self.config.has_option('Cuts', self.region):
                sampleCuts.append(self.config.get('Cuts', self.region))
            if self.config.has_option(self.configSection, 'Datacut'):
                sampleCuts.append(
                    self.config.get(self.configSection, 'Datacut'))
            if self.addBlindingCut:
                sampleCuts.append(self.addBlindingCut)

            # get sample tree from cache
            # (use the config of this instance instead of relying on a global named 'config')
            tc = TreeCache.TreeCache(sample=sample,
                                     cutList=sampleCuts,
                                     inputFolder=self.samplesPath,
                                     config=self.config)
            if tc.isCached():
                self.fileNames += tc.findCachedFileNames()
            else:
                print("ERROR: not cached, run cacheplot again")
                raise Exception("NotCached")
        if len(self.fileNames) < 1:
            print("\x1b[31mERROR: no files found, run cacheplot!\x1b[0m")
        return self

    def run(self):
        """Process the collected files and copy the skim to the output folder.

        The output is first written to the scratch folder and then copied to
        pathOUT; the temporary file is only removed if the copy is a valid
        ROOT file. Copy errors are printed, not raised.
        """
        if self.config.has_option('Configuration', 'channel'):
            name = self.config.get('Configuration', 'channel')
        else:
            name = '_'
        timestamp = datetime.datetime.now().strftime("%y%m%d")
        # (use the region of this instance instead of relying on a global named 'region')
        baseName = '/skim_' + name + '_' + self.region + '_' + timestamp
        tmpName = self.tmpDir + baseName + '_tmp.root'
        destName = self.pathOUT + baseName + '.root'

        sampleTree = SampleTree(self.fileNames, config=self.config)

        if self.config.has_option('Plot_general', 'controlSample'):
            controlSampleDict = eval(
                self.config.get('Plot_general', 'controlSample'))
            # regions without an entry get -1
            controlSample = controlSampleDict[
                self.region] if self.region in controlSampleDict else -1
            sampleTree.addOutputBranch("controlSample",
                                       lambda x: controlSample,
                                       branchType="i")
            print("INFO: setting controlSample to", controlSample)

        sampleTree.addOutputTree(tmpName, cut='1', branches='*', friend=False)
        sampleTree.process()

        # copy to final destination
        if sampleTree.getNumberOfOutputTrees() > 0:
            try:
                self.fileLocator.cp(tmpName, destName, force=True)
                print('copy ', tmpName, destName)

                if not self.fileLocator.isValidRootFile(destName):
                    print(
                        "\x1b[31mERROR: copy failed, output is broken!\x1b[0m")
                else:
                    # removing the temporary file is best effort only
                    try:
                        self.fileLocator.rm(tmpName)
                    except Exception as e:
                        print(e)
            except Exception as e:
                print("\x1b[31mERROR: copy failed!", e, "\x1b[0m")
Example #24
0
    def __init__(self, config, mvaName):
        """Read the training setup for `mvaName` from `config`.

        Args:
            config: configuration object; only get(section, option) is used here.
            mvaName: name of the config section which defines treeVarSet,
                MVAsettings, backgrounds, signals and treeCut of the training.

        The MVAsettings option is a ':' separated list of name=value pairs.
        Each value is evaluated as a Python expression and overrides the
        corresponding default in self.parameters.
        """
        self.dataRepresentationVersion = 2
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath, config=self.config)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.logpath = config.get('Directories', 'logpath')
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.mvaName = mvaName
        self.MVAsettings = config.get(mvaName, 'MVAsettings')
        self.factoryname = 'scikit-test1'

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

        # variables
        self.MVA_Vars = {}
        self.MVA_Vars['Nominal'] = config.get(self.treeVarSet, 'Nominal').strip().split(' ')

        # samples
        self.backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
        self.signalSampleNames = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(self.backgroundSampleNames),
            'SIG': self.samplesInfo.get_samples(self.signalSampleNames),
        }

        # MVA signal region cuts
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/test samples
        self.datasets = ['train', 'test']
        self.varsets = ['X', 'y', 'sample_weight']
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')

        print("TRAINING CUT:", self.trainCut)
        print("TEST CUT:", self.evalCut)

        self.globalRescale = 2.0

        # default parameters
        self.parameters = {
                'factoryname': self.factoryname,
                'mvaName': self.mvaName,
                'MVAregionCut': self.treeCutName + ': ' + self.treeCut,
                #'classifier': 'GradientBoostingClassifier',
                'classifier': 'RandomForestClassifier',
                #'classifier': 'ExtraTreesClassifier',
                #'classifier': 'FT_GradientBoostingClassifier',
                'max_depth': None,
                'max_leaf_nodes': None,
                'class_weight': 'balanced',
                #'criterion': 'friedman_mse',
                'criterion': 'gini',
                #'n_estimators': 3000,
                'n_estimators': 400,
                #'learning_rate': 0.1,
                'algorithm': 'SAMME.R',
                #'min_samples_leaf': 100,
                'splitter': 'best',
                'max_features': 4,
                'subsample': 0.6,
                'limit': -1,
                'additional_signal_weight': 1.0,
                'min_impurity_split': 0.0,
                'bootstrap': True,
                }

        # load parameters from config in a format similar to Root TMVA parameter string
        # NOTE(review): the values are passed to eval(), so the config file has to be trusted
        evaluatedSettings = []
        for mvaSetting in self.MVAsettings.split(':'):
            # tolerate empty segments, e.g. caused by a trailing ':'
            if not mvaSetting.strip():
                continue
            # split at the first '=' only, the value itself may contain '='
            settingParts = mvaSetting.split('=', 1)
            settingName = settingParts[0].strip()
            self.parameters[settingName] = eval(settingParts[1].strip())
            try:
                # repr() instead of '%r' % value, which would unpack tuple values
                evaluatedSettings.append(settingName + '=' + repr(self.parameters[settingName]))
            except Exception:
                # fall back to the unevaluated text of the setting
                print("???:", mvaSetting)
                evaluatedSettings.append(mvaSetting)

        self.MVAsettingsEvaluated = ':'.join(evaluatedSettings)
Example #25
0
class MvaTrainingHelper(object):

    def __init__(self, config, mvaName):
        """Read the training setup for `mvaName` from `config`.

        Args:
            config: configuration object; only get(section, option) is used here.
            mvaName: name of the config section which defines treeVarSet,
                MVAsettings, backgrounds, signals and treeCut of the training.

        The MVAsettings option is a ':' separated list of name=value pairs.
        Each value is evaluated as a Python expression and overrides the
        corresponding default in self.parameters.
        """
        self.dataRepresentationVersion = 2
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath, config=self.config)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.logpath = config.get('Directories', 'logpath')
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.mvaName = mvaName
        self.MVAsettings = config.get(mvaName, 'MVAsettings')
        self.factoryname = 'scikit-test1'

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

        # variables
        self.MVA_Vars = {}
        self.MVA_Vars['Nominal'] = config.get(self.treeVarSet, 'Nominal').strip().split(' ')

        # samples
        self.backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
        self.signalSampleNames = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(self.backgroundSampleNames),
            'SIG': self.samplesInfo.get_samples(self.signalSampleNames),
        }

        # MVA signal region cuts
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/test samples
        self.datasets = ['train', 'test']
        self.varsets = ['X', 'y', 'sample_weight']
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')

        print("TRAINING CUT:", self.trainCut)
        print("TEST CUT:", self.evalCut)

        self.globalRescale = 2.0

        # default parameters
        self.parameters = {
                'factoryname': self.factoryname,
                'mvaName': self.mvaName,
                'MVAregionCut': self.treeCutName + ': ' + self.treeCut,
                #'classifier': 'GradientBoostingClassifier',
                'classifier': 'RandomForestClassifier',
                #'classifier': 'ExtraTreesClassifier',
                #'classifier': 'FT_GradientBoostingClassifier',
                'max_depth': None,
                'max_leaf_nodes': None,
                'class_weight': 'balanced',
                #'criterion': 'friedman_mse',
                'criterion': 'gini',
                #'n_estimators': 3000,
                'n_estimators': 400,
                #'learning_rate': 0.1,
                'algorithm': 'SAMME.R',
                #'min_samples_leaf': 100,
                'splitter': 'best',
                'max_features': 4,
                'subsample': 0.6,
                'limit': -1,
                'additional_signal_weight': 1.0,
                'min_impurity_split': 0.0,
                'bootstrap': True,
                }

        # load parameters from config in a format similar to Root TMVA parameter string
        # NOTE(review): the values are passed to eval(), so the config file has to be trusted
        evaluatedSettings = []
        for mvaSetting in self.MVAsettings.split(':'):
            # tolerate empty segments, e.g. caused by a trailing ':'
            if not mvaSetting.strip():
                continue
            # split at the first '=' only, the value itself may contain '='
            settingParts = mvaSetting.split('=', 1)
            settingName = settingParts[0].strip()
            self.parameters[settingName] = eval(settingParts[1].strip())
            try:
                # repr() instead of '%r' % value, which would unpack tuple values
                evaluatedSettings.append(settingName + '=' + repr(self.parameters[settingName]))
            except Exception:
                # fall back to the unevaluated text of the setting
                print("???:", mvaSetting)
                evaluatedSettings.append(mvaSetting)

        self.MVAsettingsEvaluated = ':'.join(evaluatedSettings)

    # load numpy arrays with training/testing data
    def loadCachedNumpyArrays(self, cachedFilesPath):
        cached = True
        try:
            with open(cachedFilesPath + '/scikit_input.dmp', 'rb') as inputFile:
                self.data = pickle.load(inputFile)
            print("INFO: found numpy arrays for input in:", cachedFilesPath)
        except:
            cached = False
        return cached

    # save numpy arrays with training/testing data
    def writeNumpyArrays(self, cachedFilesPath):
        with open(cachedFilesPath + '/scikit_input.dmp', 'wb') as outputFile:
            pickle.dump(self.data, outputFile)
        print("INFO: wrote numpy arrays for input to:", cachedFilesPath)

    def getCachedNumpyArrayPath(self):
        identifier = self.treeCut + '__VAR:' + ' '.join(self.MVA_Vars['Nominal']) + '__SIG:' + '/'.join(self.signalSampleNames) + '__BKG:' + '/'.join(self.backgroundSampleNames) + '__V:%r'%self.dataRepresentationVersion
        varsHash = hashlib.sha224(identifier).hexdigest()
        cachedFilesPath = self.logpath + '/../cache/' + varsHash + '/'
        return cachedFilesPath

    def getHash(self):
        identifier = self.treeCut + '__VAR:' + ' '.join(self.MVA_Vars['Nominal']) + '__SIG:' + '/'.join(self.signalSampleNames) + '__BKG:' + '/'.join(self.backgroundSampleNames) + '__PAR:%r'%self.parameters
        return hashlib.sha224(identifier).hexdigest()[:8]

    def prepare(self):
        """Fill self.data with the training and testing arrays of all samples.

        If the arrays have been written to the cache folder before, they are
        loaded from there. Otherwise the cached ROOT tree of every signal and
        background sample is read once with the training cut and once with the
        evaluation cut, the arrays are built event by event and then written
        to the cache folder.

        Returns:
            self, to allow chaining.

        Raises:
            OSError: if the cache folder does not exist and cannot be created.
            Exception("CachedTreeMissing"): if no tree is found for a sample.
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/testing trees
        # ----------------------------------------------------------------------------------------------------------------------
        self.sampleTrees = []
        categories = ['BKG', 'SIG']
        datasetParts = {'train': self.trainCut, 'test': self.evalCut}

        cachedFilesPath = self.getCachedNumpyArrayPath()
        try:
            os.makedirs(cachedFilesPath)
        except OSError:
            # an existing folder is fine; any other problem would make writing
            # the arrays fail only after all trees have been processed
            if not os.path.isdir(cachedFilesPath):
                raise

        # load numpy arrays from disk if they have been already created
        if self.loadCachedNumpyArrays(cachedFilesPath):
            return self

        arrayLists = {datasetName: [] for datasetName in datasetParts}
        weightLists = {datasetName: [] for datasetName in datasetParts}
        targetLists = {datasetName: [] for datasetName in datasetParts}

        # standard weight expression
        weightF = self.config.get('Weights', 'weightF')

        for category in categories:
            # target value: position in the categories list (BKG -> 0, SIG -> 1)
            target = categories.index(category)
            for sample in self.samples[category]:
                print ('*'*80,'\n%s\n'%sample,'*'*80)
                for datasetName, additionalCut in datasetParts.items():
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)

                    # get ROOT tree for selected sample & region cut
                    tc = TreeCache.TreeCache(
                            sample=sample,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            config=self.config,
                            debug=True
                        )
                    sampleTree = tc.getTree()
                    if not sampleTree:
                        print ("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")

                    treeScale = sampleTree.getScale(sample) * self.globalRescale
                    print ('scale:', treeScale)

                    # initialize numpy array
                    # NOTE(review): assumes iterating the tree yields exactly GetEntries() events - confirm
                    nSamples = sampleTree.GetEntries()
                    features = self.MVA_Vars['Nominal']
                    nFeatures = len(features)
                    print('nFeatures:', nFeatures)
                    inputData = np.zeros((nSamples, nFeatures), dtype=np.float32)

                    # initialize formulas for ROOT tree
                    for feature in features:
                        sampleTree.addFormula(feature)
                    sampleTree.addFormula(weightF)

                    # fill numpy array from ROOT tree
                    for i, _ in enumerate(sampleTree):
                        for j, feature in enumerate(features):
                            inputData[i, j] = sampleTree.evaluate(feature)
                        # total weight comes from weightF (btag, lepton sf, ...) and treeScale to scale MC to x-section
                        totalWeight = treeScale * sampleTree.evaluate(weightF)
                        weightLists[datasetName].append(totalWeight)
                        targetLists[datasetName].append(target)

                    arrayLists[datasetName].append(inputData)

        # concatenate all data from different samples
        self.data = {
                'train': {
                    'X': np.concatenate(arrayLists['train'], axis=0),
                    'y': np.array(targetLists['train'], dtype=np.float32),
                    'sample_weight': np.array(weightLists['train'], dtype=np.float32),
                    },
                'test': {
                    'X': np.concatenate(arrayLists['test'], axis=0),
                    'y': np.array(targetLists['test'], dtype=np.float32),
                    'sample_weight': np.array(weightLists['test'], dtype=np.float32),
                    },
                }

        # write numpy arrays to disk
        self.writeNumpyArrays(cachedFilesPath)

        return self

    def verify_data(self):
        valid = True
        for dataset in self.datasets:
            for var in self.varsets:
                print("DEBUG: self.data['{dataset}']['{var}'].shape = {shape}".format(dataset=dataset, var=var, shape=self.data[dataset][var].shape))

        for dataset in self.datasets:
            for i in range(len(self.varsets)-1):
                valid = valid and self.data[dataset][self.varsets[i]].shape[0] == self.data[dataset][self.varsets[i+1]].shape[0]
        return valid

    def run(self):

        if not self.verify_data():
            print ("\x1b[31mERROR: training input data array shapes are incompatible!\x1b[0m")
            raise Exception("BadTrainingInputData")

        applyClassWeights = False
        if self.parameters['classifier'] == 'GradientBoostingClassifier':
            clf = GradientBoostingClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    learning_rate=self.parameters['learning_rate'], 
                    subsample=self.parameters['subsample'],
                    min_impurity_split=self.parameters['min_impurity_split'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'RandomForestClassifier':
            clf = RandomForestClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    bootstrap=self.parameters['bootstrap'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'ExtraTreesClassifier':
            clf = ExtraTreesClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    bootstrap=self.parameters['bootstrap'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'FT_GradientBoostingClassifier':
            rt = RandomTreesEmbedding(max_depth=3, n_estimators=20, random_state=0)
            clf0 = GradientBoostingClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    learning_rate=self.parameters['learning_rate'], 
                    subsample=self.parameters['subsample'],
                    min_impurity_split=self.parameters['min_impurity_split'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
            clf = make_pipeline(rt, clf0)
        elif self.parameters['classifier'] == 'XGBClassifier':
            clf = XGBClassifier(
                    learning_rate=self.parameters['learning_rate'],
                    max_depth=self.parameters['max_depth'],
                    n_estimators=self.parameters['n_estimators'],
                    objective='binary:logitraw',
                    colsample_bytree=self.parameters['colsample_bytree'],
                    subsample=self.parameters['subsample'],
                    min_child_weight=self.parameters['min_child_weight'],
                    gamma=self.parameters['gamma'] if 'gamma' in self.parameters else 0.0,
                    #reg_alpha=8,
                    reg_lambda=self.parameters['reg_lambda'] if 'reg_lambda' in self.parameters else 1.0,
                    reg_alpha=self.parameters['reg_alpha'] if 'reg_alpha' in self.parameters else 0.0,
                    ) 
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'MLPClassifier':
            classifierParams = {k:v for k,v in self.parameters.iteritems() if k in ['solver', 'alpha', 'hidden_layer_sizes', 'max_iter', 'warm_start', 'learning_rate_init', 'learning_rate', 'momentum', 'epsilon', 'beta_1', 'beta_2', 'validation_fraction', 'early_stopping']}
            clf = MLPClassifier(**classifierParams) 
        elif self.parameters['classifier'] in ['SVC', 'LinearSVC']:
            '''
            clf = SVC(
                        C=1.0,
                        cache_size=4000,
                        class_weight='balanced',
                        coef0=0.0,
                        decision_function_shape='ovr',
                        degree=3,
                        gamma='auto',
                        kernel='rbf',
                        max_iter=100000,
                        probability=False,
                        random_state=None,
                        shrinking=True,
                        tol=0.001,
                        verbose=True
                    )
            '''
            bagged = int(self.parameters['bagged']) if 'bagged' in self.parameters else False
            if self.parameters['classifier'] == 'LinearSVC':
                clf = LinearSVC(
                            class_weight='balanced',
                            dual=self.parameters['dual'],
                            max_iter=self.parameters['max_iter'],
                            C=self.parameters['C'],
                            penalty=self.parameters['penalty'],
                            loss=self.parameters['loss'],
                            tol=self.parameters['tol'],
                            verbose=True,
                        )
            else:
                # classifier='SVC':C=random.choice([1.0, 10.0, 100.0, 500.0, 1000.0]):kernel=random.choice(['rbf','poly','linear']):degree=random.choice([2,3,4]):gamma=random.choice(['auto', 0.1, 0.3, 0.6]):shrinking=random.choice([True, False]):max_iter=10000:penalty=random.choice(['l1','l2']):tol=random.choice([0.005, 0.001, 0.0005, 0.0001]):cache_size=1000
                clf =  SVC(
                        C=self.parameters['C'],
                        cache_size=self.parameters['cache_size'],
                        class_weight='balanced',
                        coef0=0.0,
                        decision_function_shape='ovr',
                        degree=self.parameters['degree'],
                        gamma=self.parameters['gamma'],
                        kernel=self.parameters['kernel'],
                        max_iter=self.parameters['max_iter'],
                        probability=False,
                        random_state=None,
                        shrinking=self.parameters['shrinking'],
                        tol=self.parameters['tol'],
                        verbose=True
                    )

            if bagged:
                n_estimators = bagged
                if 'bag_oversampling' in self.parameters:
                    n_estimators = int(n_estimators * self.parameters['bag_oversampling'])

                clf0 = clf
                clf = BaggingClassifier(
                        clf0,
                        max_samples=1.0 / bagged,
                        max_features=self.parameters['baggedfeatures'] if 'baggedfeatures' in self.parameters else 1.0,
                        bootstrap_features=self.parameters['bootstrapfeatures'] if 'bootstrapfeatures' in self.parameters else False,
                        n_estimators=n_estimators,
                    )

        else:
            clf = AdaBoostClassifier(
                    DecisionTreeClassifier(
                        min_samples_leaf=self.parameters['min_samples_leaf'], 
                        max_depth=self.parameters['max_depth'], 
                        class_weight=self.parameters['class_weight'], 
                        criterion=self.parameters['criterion'],
                        splitter=self.parameters['splitter'],
                        max_features=self.parameters['max_features'],
                        ), 
                    n_estimators=self.parameters['n_estimators'], 
                    learning_rate=self.parameters['learning_rate'], 
                    algorithm=self.parameters['algorithm'],
                )

        #with open("/mnt/t3nfs01/data01/shome/berger_p2/VHbb/CMSSW_9_4_0_pre3/src/Xbb/python/logs_v25//test-scikit-svm/Logs//../cache/b7d92f50a52f8474e66cf4e2c3ad3fa4725aa489e7a6b288e4ed3855//clf2018-01-31_18-22-38_be9479a2.pkl","rb") as inputFile:
        #    clf = pickle.load(inputFile)

        # preprocessing
        print("transformation...")

        if 'scaler' in self.parameters:
            if self.parameters['scaler'] == 'standard':
                self.scaler = preprocessing.StandardScaler().fit(self.data['train']['X'])
            elif self.parameters['scaler'] == 'minmax':
                self.scaler = preprocessing.MinMaxScaler().fit(self.data['train']['X'])
            elif self.parameters['scaler'] == 'robust':
                self.scaler = preprocessing.RobustScaler().fit(self.data['train']['X'])
            else:
                self.scaler = None
        else:
            self.scaler = None

        if self.scaler:
            self.data['train']['X'] = self.scaler.transform(self.data['train']['X'])
            self.data['test']['X'] = self.scaler.transform(self.data['test']['X'])

        # SHUFFLE all samples before
        self.shuffle = False
        if self.shuffle:
            print("shuffle input data...")
            for dataset in self.datasets:
                nSamples = self.data[dataset][self.varsets[0]].shape[0]
                randomPermutation = np.random.permutation(nSamples)
                for var in self.varsets:
                    self.data[dataset][var] = np.take(self.data[dataset][var], randomPermutation, axis=0)

        # LIMIT number of training samples
        # recommended to also shuffle samples before, because they are ordered by signal/background
        limitNumTrainingSamples = self.parameters['limit']
        if (limitNumTrainingSamples > 0):
            print("limit training samples to:", limitNumTrainingSamples)
            #for dataset in self.datasets:
            #    for var in self.varsets:
            #        self.data[dataset][var] = self.data[dataset][var][0:limitNumTrainingSamples]
            for dataset in self.datasets:
                self.data[dataset] = resample(self.data[dataset], n_samples=limitNumTrainingSamples, replace=False)

        # oversample
        upscale = self.parameters['upscalefactor'] if 'upscalefactor' in self.parameters else None
        if upscale:
            upscalemax =  self.parameters['upscalemax'] if 'upscalemax' in self.parameters else 10 
            upscalesignal = self.parameters['upscalefactorsignal'] if 'upscalefactorsignal' in self.parameters else 1.0 #upscalefactorsignal
            indices = []
            for i in range(len(self.data['train']['sample_weight'])):
                #print(x)
                x= self.data['train']['sample_weight'][i]
                if self.data['train']['y'][i] > 0.5:
                    x *= upscalesignal
                n = x * upscale
                # limit oversampling factor!
                if n > upscalemax:
                    n=upscalemax
                if n<1:
                    n=1
                intN = int(n)
                indices += [i]*intN
                #floatN = n-intN
                #if floatN > 0:
                #    if random.uniform(0.0,1.0) < floatN:
                #        indices += [i]

            self.data['train']['X'] = self.data['train']['X'][indices]
            self.data['train']['y'] = self.data['train']['y'][indices]
            self.data['train']['sample_weight'] = self.data['train']['sample_weight'][indices]
            self.verify_data()

        # BALANCE weights
        # calculate total weights and class_weights
        nSig = len([x for x in self.data['train']['y'] if x >= 0.5])
        nBkg = len([x for x in self.data['train']['y'] if x < 0.5])
        print("#SIG:", nSig)
        print("#BKG:", nBkg)
        weightsSignal = []
        weightsBackground = []
        for i in range(len(self.data['train']['sample_weight'])):
            if self.data['train']['y'][i] < 0.5:
                weightsBackground.append(self.data['train']['sample_weight'][i])
            else:
                weightsSignal.append(self.data['train']['sample_weight'][i])
        weightsSignal.sort()
        weightsBackground.sort()
        totalWeightSignal = sum(weightsSignal)
        totalWeightBackground = sum(weightsBackground)
        signalReweight = (totalWeightSignal+totalWeightBackground)/totalWeightSignal * self.parameters['additional_signal_weight']
        backgroundReweight = (totalWeightSignal+totalWeightBackground)/totalWeightBackground
        print("SUM of weights for signal:", totalWeightSignal)
        print("SUM of weights for background:", totalWeightBackground)
        
        if applyClassWeights:
            print("re-weight signals by:", signalReweight)
            print("re-weight background by:", backgroundReweight)
            for i in range(len(self.data['train']['sample_weight'])):
                if self.data['train']['y'][i] < 0.5:
                    self.data['train']['sample_weight'][i] *= backgroundReweight
                else:
                    self.data['train']['sample_weight'][i] *= signalReweight
        else:
            print("DO NOT re-weight signals by:", signalReweight)
        print("...")
        # TRAINING

        learningCurve = []
        if self.parameters['classifier'] == 'XGBClassifier':
            clf = clf.fit(self.data['train']['X'], self.data['train']['y'], self.data['train']['sample_weight'], verbose=True)
        else:
            try:
                clf = clf.fit(**self.data['train'])
            except:
                clf = clf.fit(X=self.data['train']['X'], y=self.data['train']['y'])
                
                if 'rounds' in self.parameters and self.parameters['rounds'] > 1:
                    for rNumber in range(self.parameters['rounds']):
                        results = clf.predict_proba(self.data['test']['X']) 
                        auc1 = roc_auc_score(self.data['test']['y'], results[:,1], sample_weight=self.data['test']['sample_weight'])
                        print(" round ", rNumber, " AUC=", auc1)
                        learningCurve.append(auc1)
                        clf = clf.fit(X=self.data['train']['X'], y=self.data['train']['y'])

        print("***** FIT done")

        # TEST
        try:
            results = clf.decision_function(self.data['test']['X'])
            print("***** EVALUATION on test sample done")
            results_train = clf.decision_function(self.data['train']['X'])
            print("***** EVALUATION on training sample done")

            print("R:", results.shape, results)

            results = np.c_[np.ones(results.shape[0]), results]
            results_train = np.c_[np.ones(results_train.shape[0]), results_train]
        except:
            results = clf.predict_proba(self.data['test']['X'])
            results_train = clf.predict_proba(self.data['train']['X'])

        # ROC curve
        print("calculating auc...")
        auc1 = roc_auc_score(self.data['test']['y'], results[:,1], sample_weight=self.data['test']['sample_weight'])
        auc_training = roc_auc_score(self.data['train']['y'], results_train[:,1], sample_weight=self.data['train']['sample_weight'])
        print("AUC:", auc1, " (training:", auc_training, ")")

        print("**** compute quantiles")
        qx = np.array([0.01, 0.99])
        qy = np.array([0.0, 0.0])
        thq = ROOT.TH1D("quant","quant",500000,-5.0,5.0)
        nS = len(results)
        for i in range(nS):
            thq.Fill(results[i][1])
        thq.GetQuantiles(2, qy, qx)

        # rescaling of SCORE to [0, 1]
        minProb = 2.0
        maxProb = -1.0
        #for i in range(len(self.data['train']['X'])):
        #    if results_train[i][1] > maxProb:
        #        maxProb = results_train[i][1]
        #    if results_train[i][1] < minProb:
        #        minProb = results_train[i][1]
        #for i in range(len(self.data['test']['X'])):
        #    if results[i][1] > maxProb:
        #        maxProb = results[i][1]
        #    if results[i][1] < minProb:
        #        minProb = results[i][1]

        minProb = qy[0]
        maxProb = qy[1]
        delta = maxProb-minProb
        minProb -= delta * 0.01
        maxProb += delta * 0.10

        useSqrt = False

        # fill TRAINING SCORE histogram (class probability)
        h1t = ROOT.TH1D("h1t","h1t",50,0.0,1.0)
        h2t = ROOT.TH1D("h2t","h2t",50,0.0,1.0)
        for i in range(len(self.data['train']['X'])):
            result = (results_train[i][1]-minProb)/(maxProb-minProb)
Example #26
0
class CacheTraining(object):
    """Cache the TRAIN and EVAL selections of one sample for MVA training.

    For every sub-sample that shares the identifier ``sampleIdentifier`` and
    for every training region, two ``TreeCache`` objects are registered: one
    with the ``Cuts.TrainCut`` selection and one with ``Cuts.EvalCut``.  The
    underlying ``SampleTree`` is then processed once for all of them.
    """

    def __init__(self,
                 config,
                 sampleIdentifier,
                 trainingRegions,
                 splitFilesChunks=1,
                 chunkNumber=1,
                 splitFilesChunkSize=-1,
                 force=False):
        """Read sample lists, cuts and input variables from the configuration.

        Args:
            config: configuration object; only ``config.get(section, option)``
                is used.
            sampleIdentifier: identifier of the sample to cache; compared with
                the ``identifier`` attribute of the configured samples.
            trainingRegions: list of config section names, one per region.
            splitFilesChunks: forwarded to ``TreeCache``.
            chunkNumber: forwarded to ``TreeCache`` and ``SampleTree``.
            splitFilesChunkSize: forwarded to ``TreeCache`` and ``SampleTree``.
            force: if true, parts that are already cached are deleted and
                cached again.
        """
        self.config = config
        self.force = force
        self.sampleIdentifier = sampleIdentifier
        self.trainingRegions = trainingRegions

        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'MVAin')
        self.samplesDefinitions = self.config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        self.backgroundSampleNames = self._uniqueSampleNames('backgrounds')
        self.signalSampleNames = self._uniqueSampleNames('signals')
        self.samples = self.samplesInfo.get_samples(
            list(set(self.backgroundSampleNames + self.signalSampleNames)))

        # per region: the selection and the list of MVA input variables,
        # collected over all configured systematics
        self.trainingRegionsDict = {}
        for trainingRegion in self.trainingRegions:
            treeCutName = config.get(trainingRegion, 'treeCut')
            treeVarSet = config.get(trainingRegion, 'treeVarSet').strip()
            systematics = [
                x for x in config.get('systematics', 'systematics').split(' ')
                if len(x.strip()) > 0
            ]
            mvaVars = []
            for systematic in systematics:
                mvaVars += config.get(treeVarSet,
                                      systematic).strip().split(' ')
            self.trainingRegionsDict[trainingRegion] = {
                'cut': config.get('Cuts', treeCutName),
                'vars': mvaVars,
            }

        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')

        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.splitFilesChunkSize = splitFilesChunkSize

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

    def _uniqueSampleNames(self, option):
        """Return the de-duplicated names listed under `option` in all regions.

        The option value of every training region is evaluated as a Python
        list literal and the lists are merged.
        """
        # NOTE(review): eval() executes whatever the configuration contains;
        # this is only acceptable as long as the config files are trusted.
        namesPerRegion = [
            eval(self.config.get(trainingRegion, option))
            for trainingRegion in self.trainingRegions
        ]
        return list(set(sum(namesPerRegion, [])))

    def printInfo(self):
        """Print a two-column table of the training regions and their cuts."""
        print("REGION:".ljust(24), "CUT:")
        # .items() instead of .iteritems(): available in Python 2 and 3
        for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.items():
            print(" > ", trainingRegion.ljust(20), trainingRegionInfo['cut'])

    def run(self):
        """Create all missing caches for ``self.sampleIdentifier``.

        Parts that are already cached are skipped unless ``self.force`` is
        set, in which case they are deleted first.  The sample tree is only
        opened when at least one cache has to be written.
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # cache samples
        # ----------------------------------------------------------------------------------------------------------------------
        sampleToCache = self.sampleIdentifier
        print('*' * 80)
        print(' ', sampleToCache)
        print('*' * 80)
        # prepare caches for training and evaluation samples
        treeCaches = []
        self.sampleTree = None

        # use all (sub)samples which come from the same files (sampleIdentifier)
        subsamples = [x for x in self.samples if x.identifier == sampleToCache]

        # list of branches to keep for use as MVA input variables; the variable
        # list depends only on the region, so it is registered once per region
        # instead of once per (sample, region, cut) combination
        branchListOfMVAVars = BranchList()
        if subsamples:
            for trainingRegionInfo in self.trainingRegionsDict.values():
                branchListOfMVAVars.addCut(trainingRegionInfo['vars'])
        branchListOfMVAVars.addCut(self.config.get('Weights', 'weightF'))
        mvaBranches = branchListOfMVAVars.getListOfBranches()

        # explicit labels: comparing the cut strings would call both caches
        # 'TRAIN' whenever TrainCut and EvalCut happen to be identical
        labelledCuts = [('TRAIN', self.TrainCut), ('EVAL', self.EvalCut)]

        # loop over all samples, training regions and the train/eval split
        for sample in subsamples:
            for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.items():
                for cutLabel, additionalCut in labelledCuts:

                    # cuts: sample sub-cut, train/eval split, region selection
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    if trainingRegionInfo['cut']:
                        sampleCuts.append(trainingRegionInfo['cut'])

                    # add cache object
                    tc = TreeCache.TreeCache(
                        name='{region}_{sample}_{tr}'.format(
                            region=trainingRegion,
                            sample=sample.name,
                            tr=cutLabel),
                        sample=sample.name,
                        cutList=sampleCuts,
                        inputFolder=self.samplesPath,
                        splitFilesChunks=self.splitFilesChunks,
                        chunkNumber=self.chunkNumber,
                        splitFilesChunkSize=self.splitFilesChunkSize,
                        branches=mvaBranches,
                        config=self.config,
                        debug=True)

                    # check if this part of the sample is already cached
                    isCached = tc.partIsCached()
                    if isCached and not self.force:
                        continue
                    if isCached:
                        tc.deleteCachedFiles(chunkNumber=self.chunkNumber)

                    # for the first sample which comes from this files, load the tree
                    if not self.sampleTree:
                        self.sampleTree = SampleTree(
                            {
                                'name': sample.identifier,
                                'folder': self.samplesPath
                            },
                            splitFilesChunkSize=self.splitFilesChunkSize,
                            chunkNumber=self.chunkNumber,
                            config=self.config,
                            saveMemory=True)
                    treeCaches.append(
                        tc.setSampleTree(self.sampleTree).cache())

        if treeCaches:
            # run on the tree
            self.sampleTree.process()
        else:
            print("nothing to do!")
Example #27
0
class MvaTrainingHelper(object):
    """Set up, run and evaluate a TMVA training for one MVA region.

    Typical call order, as far as visible from this class: ``prepare()``
    (creates the factory and adds the cached trees), ``run()`` (books, trains,
    tests and evaluates the method), then optionally ``printInfo()`` and
    ``estimateExpectedSignificance()``.
    """

    def __init__(self, config, mvaName):
        """Read factory settings, variables, samples and cuts from `config`.

        Args:
            config: configuration object; only ``config.get(section, option)``
                is used.
            mvaName: name of the config section describing this training; it
                is also used as the TMVA method name.
        """
        self.config = config
        self.factoryname = config.get('factory', 'factoryname')
        self.factorysettings = config.get('factory', 'factorysettings')
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)

        self.sampleFilesFolder = config.get('Directories', 'samplefiles')

        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.MVAtype = config.get(mvaName, 'MVAtype')
        self.MVAsettings = config.get(mvaName, 'MVAsettings')
        self.mvaName = mvaName

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

        # variables
        self.MVA_Vars = {}
        self.MVA_Vars['Nominal'] = config.get(self.treeVarSet,
                                              'Nominal').strip().split(' ')

        # samples
        # NOTE(review): eval() executes whatever the configuration contains;
        # this is only acceptable as long as the config files are trusted.
        backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
        signalSampleNames = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(backgroundSampleNames),
            'SIG': self.samplesInfo.get_samples(signalSampleNames),
        }

        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')
        print("TRAINING CUT:", self.TrainCut)
        print("EVAL CUT:", self.EvalCut)

        # applied on top of the per-sample scale when trees are added
        self.globalRescale = 2.0

        self.trainingOutputFileName = 'mvatraining_{factoryname}_{region}.root'.format(
            factoryname=self.factoryname, region=mvaName)
        print("INFO: MvaTrainingHelper class created.")

    def prepare(self):
        """Create the TMVA factory and add all signal/background trees.

        Returns:
            self, to allow call chaining.

        Raises:
            Exception: ``"CachedTreeMissing"`` if a cached tree is not found.
        """
        self.trainingOutputFile = ROOT.TFile.Open(self.trainingOutputFileName,
                                                  "RECREATE")
        # ----------------------------------------------------------------------------------------------------------------------
        # create TMVA factory
        # ----------------------------------------------------------------------------------------------------------------------
        self.factory = ROOT.TMVA.Factory(self.factoryname,
                                         self.trainingOutputFile,
                                         self.factorysettings)
        if self.trainingOutputFile and self.factory:
            print("INFO: initialized MvaTrainingHelper.", self.factory)
        else:
            print(
                "\x1b[31mERROR: initialization of MvaTrainingHelper failed!\x1b[0m"
            )

        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/eval trees
        # ----------------------------------------------------------------------------------------------------------------------
        try:
            # factories which still own the trees expose the Add*Tree methods
            addBackgroundTreeMethod = self.factory.AddBackgroundTree
            addSignalTreeMethod = self.factory.AddSignalTree
            self.dataLoader = None
        except AttributeError:
            print("oh no..")
            # the DataLoader wants to be called '.'
            self.dataLoader = ROOT.TMVA.DataLoader(".")
            addBackgroundTreeMethod = self.dataLoader.AddBackgroundTree
            addSignalTreeMethod = self.dataLoader.AddSignalTree

        # DEBUG: restrict memory
        # resource.setrlimit(resource.RLIMIT_AS, (4.0*1024*1024*1024, 5.0*1024*1024*1024))

        # explicit flags: comparing the cut strings would register both trees
        # as training trees whenever TrainCut and EvalCut are identical
        labelledCuts = [(True, self.TrainCut), (False, self.EvalCut)]

        self.sampleTrees = []
        for addTreeFcn, samples in [[
                addBackgroundTreeMethod, self.samples['BKG']
        ], [addSignalTreeMethod, self.samples['SIG']]]:
            for sample in samples:
                print('*' * 80, '\n%s\n' % sample, '*' * 80)
                for isTraining, additionalCut in labelledCuts:
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)

                    tc = TreeCache.TreeCache(sample=sample,
                                             cutList=sampleCuts,
                                             inputFolder=self.samplesPath,
                                             config=self.config,
                                             debug=True)
                    sampleTree = tc.getTree()

                    # check BEFORE touching the tree, otherwise a missing cache
                    # surfaces as an unrelated AttributeError
                    if not sampleTree:
                        print("\x1b[31mERROR: TREE NOT FOUND:", sample.name,
                              " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")

                    sampleTree.tree.SetCacheSize(32 * 1024)

                    # prevent garbage collection
                    self.sampleTrees.append(sampleTree)

                    treeScale = sampleTree.getScale(
                        sample) * self.globalRescale

                    # only non-empty trees can be added
                    if sampleTree.tree.GetEntries() > 0:
                        addTreeFcn(
                            sampleTree.tree, treeScale,
                            ROOT.TMVA.Types.kTraining
                            if isTraining else ROOT.TMVA.Types.kTesting)
                        print('max mem used = %d' % (resource.getrusage(
                            resource.RUSAGE_SELF).ru_maxrss))

        # variables are registered on whichever object received the trees
        variableOwner = self.dataLoader if self.dataLoader else self.factory
        for var in self.MVA_Vars['Nominal']:
            variableOwner.AddVariable(var, 'D')

        return self

    # ----------------------------------------------------------------------------------------------------------------------
    # backup old .xml and .info files
    # ----------------------------------------------------------------------------------------------------------------------
    def backupOldFiles(self):
        """Copy the existing weight/info files into a new ``backup/v<N>/`` folder.

        ``N`` is one more than the highest number among the existing
        ``backup/v<number>/`` directories (1 if there is none).  Directories
        matching ``v*`` whose suffix is not a number are ignored.

        Returns:
            True if the backup directory was created and all files were
            copied, False otherwise (the error is printed, not raised).
        """
        success = False
        MVAdir = self.config.get('Directories',
                                 'vhbbpath') + '/python/weights/'
        backupDir = MVAdir + 'backup/'
        try:
            os.makedirs(backupDir)
        except OSError:
            # usually: directory exists already; any real problem shows up
            # below when the versioned sub-directory is created
            pass

        # find the highest version number in use; a stray non-numeric 'v*'
        # directory must not disable the backup
        usedNumbers = []
        for path in glob.glob(backupDir + '/v*/'):
            suffix = path.rstrip('/').split('/')[-1][1:]
            if suffix.isdigit():
                usedNumbers.append(int(suffix))
        freeNumber = 1 + max(usedNumbers) if usedNumbers else 1

        try:
            fileNamesToBackup = glob.glob(MVAdir + self.factoryname + '_' +
                                          self.mvaName + '.*')
            # NOTE(review): this pattern is hard-coded and does not depend on
            # factoryname/mvaName -- confirm that this is intended
            fileNamesToBackup += glob.glob(
                MVAdir + '/../mvatraining_MVA_ZllBDT_*.root')
            targetDir = backupDir + 'v%d/' % freeNumber
            os.makedirs(targetDir)
            for fileNameToBackup in fileNamesToBackup:
                shutil.copy(fileNameToBackup, targetDir)
            success = True
        except Exception as e:
            # best effort: a failed backup is reported but never fatal
            print("\x1b[31mERROR: creating backup of MVA files failed!", e,
                  "\x1b[0m")
        return success

    def _printMemoryUsage(self):
        """Print the peak resident set size of this process."""
        print('max mem used = %d' %
              (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))

    def run(self):
        """Book the method, then train, test and evaluate it with TMVA.

        Old weight files are backed up first if ``MVAGeneral.backupWeights``
        evaluates to a true value.  The training output file is closed at the
        end.

        Returns:
            self, to allow call chaining.
        """
        backupFiles = False
        try:
            # NOTE(review): eval() on a config value; config must be trusted
            backupFiles = eval(self.config.get('MVAGeneral', 'backupWeights'))
        except Exception:
            # option missing or not evaluable -> no backup
            pass
        if backupFiles:
            print('backing up old BDT files')
            self.backupOldFiles()
        # ----------------------------------------------------------------------------------------------------------------------
        # Execute TMVA
        # ----------------------------------------------------------------------------------------------------------------------
        self.factory.Verbose()
        self._printMemoryUsage()
        print('Execute TMVA: factory.BookMethod("%s", "%s", "%s")' %
              (self.MVAtype, self.mvaName, self.MVAsettings))
        self._printMemoryUsage()
        weightF = self.config.get('Weights', 'weightF')
        try:
            self.factory.BookMethod(self.MVAtype, self.mvaName,
                                    self.MVAsettings)
            print("ROOT 5 style TMVA found")
            self.factory.SetSignalWeightExpression(weightF)
            self.factory.SetBackgroundWeightExpression(weightF)
        except Exception:
            # the three-argument BookMethod call was rejected: retry with the
            # data loader interface
            print("ROOT 6 style TMVA found, using data loader object!!! >_<")
            print(" weights dir:",
                  ROOT.TMVA.gConfig().GetIONames().fWeightFileDir)
            print(" data loader:", self.dataLoader)
            print(" type:       ", self.MVAtype)
            print(" name:       ", self.mvaName)
            print(" settings:   ", self.MVAsettings)
            ROOT.TMVA.gConfig().GetIONames().fWeightFileDir = 'weights'
            self.dataLoader.SetSignalWeightExpression(weightF)
            self.dataLoader.SetBackgroundWeightExpression(weightF)
            self.factory.BookMethod(self.dataLoader, self.MVAtype,
                                    self.mvaName, self.MVAsettings)
        sys.stdout.flush()
        print('Execute TMVA: TrainAllMethods')
        self._printMemoryUsage()
        self.factory.TrainAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: TestAllMethods')
        self._printMemoryUsage()
        self.factory.TestAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: EvaluateAllMethods')
        self._printMemoryUsage()
        self.factory.EvaluateAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: output.Write')
        self._printMemoryUsage()
        self.trainingOutputFile.Close()
        return self

    def printInfo(self):
        """Pickle an ``mvainfo`` description of this training to the weights folder.

        The file is written to
        ``<vhbbpath>/python/weights/<factoryname>_<mvaName>.info``.
        """
        #WRITE INFOFILE
        MVAdir = self.config.get('Directories',
                                 'vhbbpath') + '/python/weights/'
        infoFileName = MVAdir + self.factoryname + '_' + self.mvaName + '.info'

        info = mvainfo(self.mvaName)
        info.factoryname = self.factoryname
        info.factorysettings = self.factorysettings
        info.MVAtype = self.MVAtype
        info.MVAsettings = self.MVAsettings
        info.weightfilepath = MVAdir
        info.path = self.samplesPath
        info.varset = self.treeVarSet
        info.vars = self.MVA_Vars['Nominal']

        # pickle needs a binary file object on Python 3; 'wb' also works on
        # Python 2.  The with-block closes the file even if dumping fails.
        with open(infoFileName, 'wb') as infofile:
            print('@DEBUG: output infofile name')
            print(infofile)
            pickle.dump(info, infofile)

    def getExpectedSignificance(self,
                                tree,
                                nBins,
                                xMin,
                                xMax,
                                power=1.0,
                                rescaleSig=1.0,
                                rescaleBkg=1.0):
        """Estimate the expected significance from a binned MVA score.

        The score (branch named ``self.mvaName``) of every event in `tree` is
        filled into a signal or a background histogram depending on
        ``event.classID`` and weighted with ``event.weight``.  The result is
        ``sqrt(sum_i S_i^2 / (S_i + B_i))`` over all bins; a table with the
        per-bin yields is printed.

        Args:
            tree: iterable of events with attributes ``<mvaName>``, ``weight``
                and ``classID`` (events with ``classID == 1`` are filled as
                signal).
            nBins: number of histogram bins.
            xMin: lower edge of the histogram.
            xMax: upper edge of the histogram.
            power: if not 1.0, the score is mapped to [0, 1), raised to this
                power and mapped back to [xMin, xMax) before filling.
            rescaleSig: additional factor applied to signal weights.
            rescaleBkg: additional factor applied to background weights.

        Returns:
            Tuple ``(expectedSignificance, sSum, bSum)`` with the summed
            signal and background yields.
        """
        hSIG = ROOT.TH1D("hSig", "hSig", nBins, xMin, xMax)
        hBKG = ROOT.TH1D("hBkg", "hBkg", nBins, xMin, xMax)
        print("INFO: GetEntries() = ", tree.GetEntries())
        if power != 1.0:
            print("INFO: rescale BDT score with power ", power)
        for event in tree:
            score = getattr(event, self.mvaName)
            if power != 1.0:
                x = (score - xMin) / (xMax - xMin)
                if x < 0:
                    x = 0
                if x > 0.999999:
                    x = 0.999999
                value = math.pow(x, power) * (xMax - xMin) + xMin
            else:
                # clamp into the histogram range to avoid under-/overflow
                value = max(min(score, xMax - 0.00001), xMin)

            weight = event.weight
            if event.classID == 1:
                hSIG.Fill(value, weight * rescaleSig)
            else:
                hBKG.Fill(value, weight * rescaleBkg)
        ssbSum = 0.0
        sSum = 0
        bSum = 0
        sbTableFormat = "{bin: <16}{signal: <16}{background: <16}{ssb: <16}"
        print("---- nBins =", nBins, " from ", xMin, "..", xMax, "-----")
        print(
            sbTableFormat.format(bin="bin",
                                 signal="signal",
                                 background="background",
                                 ssb="S/sqrt(S+B)"))
        for i in range(nBins):
            # ROOT bins are 1-based (bin 0 is the underflow bin)
            s = hSIG.GetBinContent(1 + i)
            b = hBKG.GetBinContent(1 + i)
            if (s + b) > 0:
                ssbSum += s * s / (s + b)
                ssb = s / math.sqrt(s + b)
            else:
                ssb = 0
            sSum += s
            bSum += b
            print(
                sbTableFormat.format(bin=i,
                                     signal=round(s, 1),
                                     background=round(b, 1),
                                     ssb=round(ssb, 3)))
        expectedSignificance = math.sqrt(ssbSum)
        print(
            sbTableFormat.format(bin="SUM",
                                 signal=round(sSum, 1),
                                 background=round(bSum, 1),
                                 ssb="\x1b[34mZ=%1.3f\x1b[0m" %
                                 expectedSignificance))
        print("-" * 40)
        hSIG.Delete()
        hBKG.Delete()
        return expectedSignificance, sSum, bSum

    def estimateExpectedSignificance(self):
        """Print expected significances for the test and training trees.

        Reads ``TestTree`` and ``TrainTree`` from the training output file and
        evaluates them with several binnings and score transformations.  The
        file is closed again before returning.
        """
        print("INFO: open ", self.trainingOutputFileName)
        rootFile = ROOT.TFile.Open(self.trainingOutputFileName, "READ")
        print("INFO: ->", rootFile)
        try:
            testTree = rootFile.Get('./TestTree')

            # run a few tests with different binnings and rescaling of BDT score
            self.getExpectedSignificance(testTree, 15, -0.8, 1.0)
            self.getExpectedSignificance(testTree, 15, -0.8, 0.9)
            for power in [0.5, 0.33, 1.5, 2.0]:
                self.getExpectedSignificance(testTree, 15, -0.8, 0.8,
                                             power=power)

            # close to nominal binning
            print("---- ~nominal TEST -----")
            esTest, sTest, bTest = self.getExpectedSignificance(
                testTree, 15, -0.8, 0.8)
            print("---- ~nominal TRAINING (without correct normalization) -----")
            trainTree = rootFile.Get('./TrainTree')
            esTrain, sTrain, bTrain = self.getExpectedSignificance(
                trainTree, 15, -0.8, 0.8)

            # the tree ./TrainTree contains the input events for training AFTER re-balancing the classes
            # therefore for SIG/BKG separately the normalization is fixed to the one of the TEST events
            # (an empty class keeps factor 1.0 instead of dividing by zero)
            rescaleSig = 1.0 * sTest / sTrain if sTrain else 1.0
            rescaleBkg = 1.0 * bTest / bTrain if bTrain else 1.0
            print("---- ~nominal TRAINING -----")
            trainTree = rootFile.Get('./TrainTree')
            self.getExpectedSignificance(trainTree,
                                         15,
                                         -0.8,
                                         0.8,
                                         rescaleSig=rescaleSig,
                                         rescaleBkg=rescaleBkg)
        finally:
            # do not leak the file handle; skip if opening failed
            if rootFile:
                rootFile.Close()
Example #28
0
class SampleTreesToNumpyConverter(object):
    """Dump the cached sample trees of one MVA region into numpy arrays.

    For every sample of the signal and background categories the train and
    eval parts of the cached tree are read event by event.  Input variables
    (nominal and per systematic), event weights (nominal and b-tag varied)
    and the category index are collected and written as a gzip-compressed
    pickle ``./<mvaName>.dmpz`` (data format version 2).

    NOTE: several config options are passed to ``eval()``; only use trusted
    configuration files.
    """

    @staticmethod
    def _splitConfigList(text):
        """Split a space separated config value, dropping empty tokens."""
        return [x for x in text.strip().split(' ') if len(x.strip()) > 0]

    def __init__(self, config, mvaName):
        """Read region, cuts, variables, weights and samples from the config.

        Args:
            config: parsed configuration object (``get(section, option)``).
            mvaName: name of the config section describing the MVA region.
        """
        self.mvaName = mvaName
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
        self.dataFormatVersion = 2
        self.sampleTrees = []
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)

        # region
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/eval sets
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')
        # rescale MC by 2 because of train/eval split
        self.globalRescale = 2.0

        # variables and systematics
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        # empty tokens (e.g. from double spaces) are dropped so that no
        # option with an empty name is looked up below
        self.systematics = self._splitConfigList(
            config.get('systematics', 'systematics'))
        self.MVA_Vars = {
            'Nominal': self._splitConfigList(
                config.get(self.treeVarSet, 'Nominal'))
        }
        for systName in self.systematics:
            self.MVA_Vars[systName] = self._splitConfigList(
                config.get(self.treeVarSet, systName))

        # b-tag weight variations: nominal weight without b-tag part times
        # the varied b-tag weight
        self.weightSYS = []
        self.weightWithoutBtag = self.config.get('Weights', 'weight_noBTag')
        self.weightSYSweights = {}
        for d in ['Up', 'Down']:
            for syst in [
                    'HFStats1', 'HFStats2', 'LF', 'HF', 'LFStats1', 'LFStats2',
                    'cErr2', 'cErr1', 'JES'
            ]:
                systFullName = "btag_" + syst + "_" + d
                weightName = "bTagWeightCMVAV2_Moriond_" + syst + d
                self.weightSYSweights[
                    systFullName] = self.weightWithoutBtag + '*' + weightName
                self.weightSYS.append(systFullName)

        # samples
        self.sampleNames = {
            #                   'BKG_TT': eval(self.config.get('Plot_general', 'TT')),
            #                   'BKG_ST': eval(self.config.get('Plot_general', 'ST')),
            #                   'BKG_VV': eval(self.config.get('Plot_general', 'VV')),
            #                   'BKG_DY2b': eval(self.config.get('Plot_general', 'DY2b')),
            #                   'BKG_DY1b': eval(self.config.get('Plot_general', 'DY1b')),
            #                   'BKG_DY0b': eval(self.config.get('Plot_general', 'DYlight')),
            #                   'SIG_ggZH': eval(self.config.get('Plot_general', 'ggZH')),
            #                   'SIG_qqZH': eval(self.config.get('Plot_general', 'qqZH')),
            'SIG_ALL': eval(self.config.get('Plot_general', 'allSIG')),
            'BKG_ALL': eval(self.config.get('Plot_general', 'allBKG')),
        }
        self.samples = {
            category: self.samplesInfo.get_samples(samples)
            for category, samples in self.sampleNames.items()
        }

    def run(self):
        """Fill the arrays from the cached trees and write the output file.

        The result is also kept in ``self.data``.

        Raises:
            Exception: ``"CachedTreeMissing"`` if a cached tree is not found.
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/testing trees
        # ----------------------------------------------------------------------------------------------------------------------
        # list() keeps the positional lookup working where keys() is a view
        categories = list(self.samples.keys())
        datasetParts = {'train': self.trainCut, 'test': self.evalCut}

        systematics = self.systematics
        arrayLists = {datasetName: [] for datasetName in datasetParts}
        arrayLists_sys = {
            x: {datasetName: []
                for datasetName in datasetParts}
            for x in systematics
        }
        weightLists = {datasetName: [] for datasetName in datasetParts}
        targetLists = {datasetName: [] for datasetName in datasetParts}

        weightListsSYS = {
            x: {datasetName: []
                for datasetName in datasetParts}
            for x in self.weightSYS
        }

        # standard weight expression
        weightF = self.config.get('Weights', 'weightF')

        # the variable lists do not depend on the sample: build them once
        features = self.MVA_Vars['Nominal']
        features_sys = {x: self.MVA_Vars[x] for x in systematics}
        nFeatures = len(features)

        for categoryIndex, category in enumerate(categories):
            for sample in self.samples[category]:
                print('*' * 80, '\n%s\n' % sample, '*' * 80)
                for datasetName, additionalCut in datasetParts.items():
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)

                    # get ROOT tree for selected sample & region cut
                    tc = TreeCache.TreeCache(sample=sample,
                                             cutList=sampleCuts,
                                             inputFolder=self.samplesPath,
                                             config=self.config,
                                             debug=True)
                    sampleTree = tc.getTree()
                    if not sampleTree:
                        print("\x1b[31mERROR: TREE NOT FOUND:", sample.name,
                              " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")

                    treeScale = sampleTree.getScale(
                        sample) * self.globalRescale
                    print('scale:', treeScale)

                    # initialize numpy array
                    nSamples = sampleTree.GetEntries()
                    print('nFeatures:', nFeatures)
                    inputData = np.zeros((nSamples, nFeatures),
                                         dtype=np.float32)
                    # NOTE(review): the systematic arrays use the nominal
                    # number of columns -- assumes all variable lists have
                    # the same length; confirm against the config
                    inputData_sys = {
                        x: np.zeros((nSamples, nFeatures), dtype=np.float32)
                        for x in systematics
                    }

                    # initialize formulas for ROOT tree
                    for feature in features:
                        sampleTree.addFormula(feature)
                    for features_s in features_sys.values():
                        for feature in features_s:
                            sampleTree.addFormula(feature)
                    sampleTree.addFormula(weightF)
                    for syst in self.weightSYS:
                        sampleTree.addFormula(self.weightSYSweights[syst])

                    # fill numpy array from ROOT tree
                    for i, event in enumerate(sampleTree):
                        for j, feature in enumerate(features):
                            inputData[i, j] = sampleTree.evaluate(feature)
                        # total weight comes from weightF (btag, lepton sf, ...) and treeScale to scale MC to x-section
                        totalWeight = treeScale * sampleTree.evaluate(weightF)
                        weightLists[datasetName].append(totalWeight)
                        targetLists[datasetName].append(categoryIndex)

                        # add weights varied by (btag) systematics
                        for syst in self.weightSYS:
                            weightListsSYS[syst][datasetName].append(
                                treeScale * sampleTree.evaluate(
                                    self.weightSYSweights[syst]))

                        # fill systematics
                        for k, feature_s in features_sys.items():
                            for j, feature in enumerate(feature_s):
                                inputData_sys[k][
                                    i, j] = sampleTree.evaluate(feature)

                    arrayLists[datasetName].append(inputData)
                    for systName in systematics:
                        arrayLists_sys[systName][datasetName].append(
                            inputData_sys[systName])

        # concatenate all data from different samples
        self.data = {
            'train': {
                'X': np.concatenate(arrayLists['train'], axis=0),
                'y': np.array(targetLists['train'], dtype=np.float32),
                'sample_weight': np.array(weightLists['train'],
                                          dtype=np.float32),
            },
            'test': {
                'X': np.concatenate(arrayLists['test'], axis=0),
                'y': np.array(targetLists['test'], dtype=np.float32),
                'sample_weight': np.array(weightLists['test'],
                                          dtype=np.float32),
            },
            'category_labels':
            {idx: label
             for idx, label in enumerate(categories)},
            'meta': {
                'version': self.dataFormatVersion,
                'region': self.mvaName,
                'cutName': self.treeCutName,
                'cut': self.treeCut,
                'trainCut': self.trainCut,
                'testCut': self.evalCut,
                'samples': self.sampleNames,
                'weightF': weightF,
                'weightSYS': self.weightSYS,
                'variables': ' '.join(self.MVA_Vars['Nominal'])
            }
        }
        # add systematics variations (stored for the training part only)
        for systName in systematics:
            self.data['train']['X_' + systName] = np.concatenate(
                arrayLists_sys[systName]['train'], axis=0)
        for syst in self.weightSYS:
            self.data['train']['sample_weight_' + syst] = np.array(
                weightListsSYS[syst]['train'], dtype=np.float32)

        numpyOutputFileName = './' + self.mvaName + '.dmpz'
        with gzip.open(numpyOutputFileName, 'wb') as outputFile:
            pickle.dump(self.data, outputFile)
        print(self.data['meta'])
        print("written to:\x1b[34m", numpyOutputFileName, " \x1b[0m")
class SampleTreesToNumpyConverter(object):
    """Dump the cached sample trees of one MVA region into numpy arrays.

    For every sample of every class the train and eval parts of the cached
    tree are read event by event.  Input variables, event weights, a combined
    systematic weight error and the class index are collected and written to
    ``./dumps/`` as HDF5 file and, if enabled by the ``writeNumpy`` option,
    as gzip-compressed pickle (data format version 3).

    NOTE: several config options are passed to ``eval()``; only use trusted
    configuration files.
    """

    def __init__(self, config, mvaName, useSyst=True, useWeightSyst=True, testRun=False):
        """Read region, cuts, variables, systematics and classes from config.

        Args:
            config: parsed configuration object.
            mvaName: name of the config section describing the MVA region.
            useSyst: if true, read the weight systematics listed in the
                ``systematics`` option of the region section.
            useWeightSyst: accepted for compatibility, not used here.
            testRun: if true, ``run`` only processes the first sample of
                each class.
        """
        self.mvaName = mvaName
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
        self.dataFormatVersion = 3
        self.sampleTrees = []
        self.config = config
        self.testRun = testRun
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)

        # region
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/eval sets
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')

        # rescale MC by 2 because of train/eval split
        self.globalRescale = 2.0

        # variables and systematics
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.MVA_Vars = {'Nominal': [x for x in config.get(self.treeVarSet, 'Nominal').strip().split(' ') if len(x.strip()) > 0]}

        self.weightSYS = []
        self.weightSYSweights = {}

        self.systematics = []
        if useSyst:
            print('INFO: use systematics in training!')
            self.systList = eval(self.config.get(mvaName, 'systematics')) if self.config.has_option(mvaName, 'systematics') else []
            for syst in self.systList:
                # the up/down weight options exist in two spellings
                systNameUp   = syst+'_UP'   if self.config.has_option('Weights',syst+'_UP')   else syst+'_Up'
                systNameDown = syst+'_DOWN' if self.config.has_option('Weights',syst+'_DOWN') else syst+'_Down'

                self.systematics.append({
                    'name': syst,
                    'U': self.config.get('Weights', systNameUp),
                    'D': self.config.get('Weights', systNameDown),
                    })

        # default: signal vs. background
        self.sampleNames = {
                    'SIG_ALL': eval(self.config.get('Plot_general', 'allSIG')),
                    'BKG_ALL': eval(self.config.get('Plot_general', 'allBKG')),
                }
        # for multi-output classifiers load dictionary from config
        self.categories = None
        if self.config.has_option(mvaName, 'classDict'):
            # the class order is taken from self.samples below, which does
            # not exist yet at this point
            self.sampleNames = eval(self.config.get(mvaName, 'classDict'))
            print("classes dict:", self.sampleNames)
        elif self.config.has_option(mvaName, 'classes'):
            # list of (name, samples) pairs: keeps the order given in config
            classes = list(eval(self.config.get(mvaName, 'classes')))
            self.sampleNames = dict(classes)
            self.categories = [x[0] for x in classes]
        self.samples = {category: self.samplesInfo.get_samples(samples) for category,samples in self.sampleNames.items()}
        if not self.categories:
            self.categories = list(self.samples.keys())
        if self.testRun:
            print("\x1b[31mDEBUG: TEST-RUN, using only small subset of samples!\x1b[0m")

    def run(self):
        """Fill the arrays from the cached trees and write the output files.

        The result is also kept in ``self.data``.

        Returns:
            True if at least one output file was written, False otherwise.

        Raises:
            Exception: ``"CachedTreeMissing"`` if a cached tree is not found.
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/testing trees
        # ----------------------------------------------------------------------------------------------------------------------
        categories = self.categories
        if categories:
            print("categories:")
            for i,category in enumerate(categories):
                print(" ",i,":", category)
        datasetParts = {'train': self.trainCut, 'test': self.evalCut}

        arrayLists = {datasetName:[] for datasetName in datasetParts}
        weightLists = {datasetName:[] for datasetName in datasetParts}
        targetLists = {datasetName:[] for datasetName in datasetParts}
        weightListSYStotal = {datasetName:[] for datasetName in datasetParts}

        # standard weight expression
        weightF = self.config.get('Weights','weightF')

        # these do not depend on the sample: evaluate them once
        features = self.MVA_Vars['Nominal']
        nFeatures = len(features)
        useSpecialWeight = self.config.has_option('Weights', 'useSpecialWeight') and eval(self.config.get('Weights', 'useSpecialWeight'))

        for categoryIndex, category in enumerate(categories):
            if self.testRun:
                self.samples[category] = self.samples[category][0:1]
            categorySamples = self.samples[category]
            # class label stored for every event of this category
            targetIndex = categories.index(category)
            for sampleIndex, sample in enumerate(categorySamples):
                # dedicated index names: the event loop below must not
                # overwrite the counters shown in this progress line
                print ('*'*80,'\n%s (category %d/%d sample %d/%d)\n'%(sample, categoryIndex+1, len(categories), sampleIndex+1, len(categorySamples)),'*'*80)
                for datasetName, additionalCut in datasetParts.items():
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)

                    # get ROOT tree for selected sample & region cut
                    tc = TreeCache.TreeCache(
                            sample=sample,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            config=self.config,
                            debug=True
                        )
                    sampleTree = tc.getTree()
                    if not sampleTree:
                        print ("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")

                    treeScale = sampleTree.getScale(sample) * self.globalRescale
                    print ('scale:', treeScale)

                    # initialize numpy array
                    nSamples = sampleTree.GetEntries()
                    print('nFeatures:', nFeatures)
                    inputData = np.zeros((nSamples, nFeatures), dtype=np.float32)

                    # initialize formulas for ROOT tree
                    for feature in features:
                        sampleTree.addFormula(feature)
                    sampleTree.addFormula(weightF)
                    for syst in self.systematics:
                        sampleTree.addFormula(syst['U'])
                        sampleTree.addFormula(syst['D'])
                    if useSpecialWeight:
                        sampleTree.addFormula(sample.specialweight)

                    # fill numpy array from ROOT tree
                    for eventIndex, event in enumerate(sampleTree):
                        for featureIndex, feature in enumerate(features):
                            inputData[eventIndex, featureIndex] = sampleTree.evaluate(feature)
                        # total weight comes from weightF (btag, lepton sf, ...) and treeScale to scale MC to x-section
                        eventWeight = sampleTree.evaluate(weightF)
                        specialWeight = sampleTree.evaluate(sample.specialweight) if useSpecialWeight else 1.0
                        totalWeight = treeScale * eventWeight * specialWeight
                        weightLists[datasetName].append(totalWeight)
                        targetLists[datasetName].append(targetIndex)

                        # per systematic: mean absolute up/down shift of the
                        # event weight; all systematics added in quadrature
                        deltas = []
                        for syst in self.systematics:
                            delta_up   = sampleTree.evaluate(syst['U']) - eventWeight
                            delta_down = sampleTree.evaluate(syst['D']) - eventWeight
                            delta = 0.5 * (np.abs(delta_up) + np.abs(delta_down))
                            deltas.append(delta*delta)
                        totalDelta = np.sqrt(sum(deltas))

                        # convert to absolute error on total event weight
                        weightListSYStotal[datasetName].append(treeScale * totalDelta * specialWeight)

                    arrayLists[datasetName].append(inputData)

        puresystematics = [x['name'] for x in self.systematics]

        # concatenate all data from different samples
        self.data = {
                'train': {
                    'X': np.concatenate(arrayLists['train'], axis=0),
                    'y': np.array(targetLists['train'], dtype=np.float32),
                    'sample_weight': np.array(weightLists['train'], dtype=np.float32),
                    'sample_weight_error': np.array(weightListSYStotal['train'], dtype=np.float32),
                    },
                'test': {
                    'X': np.concatenate(arrayLists['test'], axis=0),
                    'y': np.array(targetLists['test'], dtype=np.float32),
                    'sample_weight': np.array(weightLists['test'], dtype=np.float32),
                    'sample_weight_error': np.array(weightListSYStotal['test'], dtype=np.float32),
                    },
                'category_labels': {idx: label for idx, label in enumerate(categories)},
                'meta': {
                    'version': self.dataFormatVersion,
                    'region': self.mvaName,
                    'cutName': self.treeCutName,
                    'cut': self.treeCut,
                    'trainCut': self.trainCut,
                    'testCut': self.evalCut,
                    'samples': self.sampleNames,
                    'weightF': weightF,
                    'weightSYS': self.weightSYS,
                    'variables': ' '.join(self.MVA_Vars['Nominal']),
                    'systematics': puresystematics,
                    }
                }

        if not os.path.exists("./dumps"):
            os.makedirs("dumps")
        baseName = './dumps/' +self.config.get("Directories","Dname").split("_")[1] + '_' + self.mvaName + '_' + datetime.datetime.now().strftime("%y%m%d")
        numpyOutputFileName = baseName + '.dmpz'
        hdf5OutputFileName = baseName + '.h5'
        print("INFO: saving output...")

        # both writers are best effort: a failure of one format must not
        # prevent the other one from being written
        success = False
        try:
            if self.config.has_option(self.mvaName, 'writeNumpy') and eval(self.config.get(self.mvaName, 'writeNumpy')):
                self.saveAsPickledNumpy(numpyOutputFileName)
                success = True
        except Exception as e:
            print("ERROR: writing numpy array failed.", e)

        try:
            self.saveAsHDF5(hdf5OutputFileName)
            success = True
        except Exception as e:
            print("ERROR: writing HDF5 file failed.", e)

        if success:
            print("INFO: done.")
            return True
        else:
            print("ERROR: no output file written")
            return False

    def saveAsPickledNumpy(self, outputFileName):
        """Write ``self.data`` as gzip-compressed pickle to outputFileName."""
        with gzip.open(outputFileName, 'wb') as outputFile:
            pickle.dump(self.data, outputFile)
        print("written to:\x1b[34m", outputFileName, " \x1b[0m")

    def saveAsHDF5(self, outputFileName):
        """Write ``self.data`` to the HDF5 file outputFileName.

        ``meta`` and ``category_labels`` are stored as JSON encoded lists of
        (key, value) pairs in the file attributes, every array of ``train``
        and ``test`` as gzip-compressed dataset ``<part>/<name>``.
        """
        f = h5py.File(outputFileName, 'w')
        try:
            for k in ['meta', 'category_labels']:
                # list(): a plain list of pairs is what json can serialize
                f.attrs[k] = json.dumps(list(self.data[k].items()))
            for k in ['train', 'test']:
                for k2 in self.data[k].keys():
                    f.create_dataset(k + '/' + k2, data=self.data[k][k2], compression="gzip", compression_opts=9)
        finally:
            # release the file handle also if writing fails half way
            f.close()
        print("written to:\x1b[34m", outputFileName, " \x1b[0m")
# Script-level setup: read yields and input/output locations from the config.
# NOTE(review): eval() executes the option text as Python code, so the config
# file has to be trusted.
ang_yield = eval(config.get('AngularLike', 'yields'))

#path=opts.path
pathIN = config.get('Directories', 'SYSin')
pathOUT = config.get('Directories', 'SYSout')
# raises KeyError if the TMPDIR environment variable is not set
tmpDir = os.environ["TMPDIR"]

print 'INput samples:\t%s' % pathIN
print 'OUTput samples:\t%s' % pathOUT

#storagesamples = config.get('Directories','storagesamples')

# comma separated list of names given on the command line
namelist = opts.names.split(',')

# load the sample information for the input path
info = ParseInfo(samplesinfo, pathIN)


def deltaPhi(phi1, phi2):
    """Return ``phi1 - phi2`` shifted by multiples of 2*pi into (-pi, pi]."""
    fullTurn = 2 * math.pi
    diff = phi1 - phi2
    while True:
        if diff > math.pi:
            diff -= fullTurn
        elif diff <= -math.pi:
            diff += fullTurn
        else:
            return diff


def resolutionBias(eta):
    """Return the bias value of the first eta range that contains ``eta``.

    The ranges are given by their exclusive upper edges.  For values at or
    above the last edge nothing is returned (implicit ``None``).
    NOTE(review): the definition looks cut off after the third range --
    confirm whether further ranges are missing.
    """
    for upperEdge, bias in ((0.5, 0.052), (1.1, 0.057), (1.7, 0.096)):
        if eta < upperEdge:
            return bias
Example #31
0
class CachePlot(object):
    """Cache the trees of one sample identifier for a set of plot regions.

    For every (sub)sample that belongs to the given sample identifier and
    every region a ``TreeCache`` is set up with the sample, region and
    optional data/blinding cuts.  Parts which are not cached yet (or all
    parts with ``forceRedo``) are then produced in one pass over the sample
    tree.

    NOTE: several config options are passed to ``eval()``; only use trusted
    configuration files.
    """

    def __init__(self,
                 config,
                 sampleIdentifier,
                 regions,
                 splitFilesChunks=1,
                 chunkNumber=1,
                 splitFilesChunkSize=-1,
                 forceRedo=False,
                 fileList=None):
        """Read samples and region cuts from the config.

        Args:
            config: parsed configuration object.
            sampleIdentifier: identifier of the sample files to cache.
            regions: names of the plot regions (options of section ``Cuts``);
                duplicates are removed.
            splitFilesChunks: passed on to ``TreeCache``.
            chunkNumber: number of the file chunk to process.
            splitFilesChunkSize: passed on to ``TreeCache``/``SampleTree``.
            forceRedo: if true, delete and redo parts that are cached already.
            fileList: optional compressed file list from submission time,
                used as consistency check against the files found now.
        """
        self.config = config
        self.sampleIdentifier = sampleIdentifier
        self.regions = list(set(regions))
        self.forceRedo = forceRedo

        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'plottingSamples')
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath,
                                     config=self.config)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        self.sampleNames = list(
            eval(self.config.get('Plot_general', 'samples')))
        self.dataNames = list(eval(self.config.get('Plot_general', 'Data')))
        self.samples = self.samplesInfo.get_samples(self.sampleNames +
                                                    self.dataNames)

        self.regionsDict = {}
        for region in self.regions:
            treeCut = config.get('Cuts', region)
            self.regionsDict[region] = {'cut': treeCut}
        self.splitFilesChunkSize = splitFilesChunkSize
        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.fileList = FileList.decompress(fileList) if fileList else None

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print(
                "\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"
                % returnCode)
        else:
            print("INFO: loaded VHbbNameSpace: %s" % VHbbNameSpace)

    def printInfo(self):
        """Print the list of regions together with their cuts."""
        print("REGION:".ljust(24), "CUT:")
        for region, regionInfo in self.regionsDict.items():
            print(" > ", region.ljust(20), regionInfo['cut'])

    def run(self):
        """Create the missing caches for all regions of the sample.

        Raises:
            Exception: ``"CreationOfSampleTreeFailed"`` if the sample tree
                cannot be created, ``"SampleFilesHaveChanged"`` if the file
                list differs from the one given at construction.
        """

        # keep additional branches for plotting (both options are optional)
        try:
            keepBranchesPlot = eval(
                self.config.get('Branches', 'keep_branches_plot'))
        except Exception:
            keepBranchesPlot = []
        try:
            keepBranchesPlot += eval(
                self.config.get('Branches', 'keep_branches'))
        except Exception:
            pass

        # also keep some branches which might be used later in variables definition and weights
        try:
            for section in self.config.sections():
                try:
                    if section.startswith(
                            'plotDef:') and self.config.has_option(
                                section, 'relPath'):
                        keepBranchesPlot.append(
                            self.config.get(section, 'relPath'))
                except Exception as e:
                    print("\x1b[31mWARNING: config error in:", section, "=>",
                          e, "\x1b[0m")
        except Exception as e2:
            print(
                "\x1b[31mERROR: config file contains an error! automatic selection of branches to keep will not work!\x1b[0m"
            )
            print(e2)
        try:
            keepBranchesPlot.append(self.config.get('Weights', 'weightF'))
        except Exception:
            pass
        # plotting region cut
        for regionInfo in self.regionsDict.values():
            keepBranchesPlot.append(regionInfo['cut'])
        keepBranchesPlotFinal = BranchList(
            keepBranchesPlot).getListOfBranches()
        print("KEEP:", keepBranchesPlotFinal)

        # ----------------------------------------------------------------------------------------------------------------------
        # cache samples
        # ----------------------------------------------------------------------------------------------------------------------
        for sampleToCache in [self.sampleIdentifier]:
            print('*' * 80)
            print(' ', sampleToCache)
            print('*' * 80)
            # caches which still have to be produced
            treeCaches = []

            # for all (sub)samples which come from the same files (sampleIdentifier)
            subsamples = [
                x for x in self.samples if x.identifier == sampleToCache
            ]
            for sample in subsamples:

                # add cuts for all plot regions
                for region, regionInfo in self.regionsDict.items():

                    configSection = 'Plot:%s' % region

                    # cuts
                    sampleCuts = [sample.subcut]
                    if regionInfo['cut']:
                        sampleCuts.append(regionInfo['cut'])
                    if self.config.has_option(configSection, 'Datacut'):
                        sampleCuts.append(
                            self.config.get(configSection, 'Datacut'))
                    if self.config.has_option('Plot_general',
                                              'addBlindingCut'):
                        # append the cut expression itself; has_option()
                        # would only add the boolean True to the cut list
                        sampleCuts.append(
                            self.config.get('Plot_general', 'addBlindingCut'))

                    # arbitrary (optional) name for the output tree, used for print-out (the TreeCache object has no idea what it is doing, e.g. dc, plot etc.)
                    cacheName = 'plot:{region}_{sample}'.format(
                        region=region, sample=sample.name)

                    # add cache object
                    tc = TreeCache.TreeCache(
                        name=cacheName,
                        sample=sample.name,
                        cutList=sampleCuts,
                        inputFolder=self.samplesPath,
                        splitFilesChunks=self.splitFilesChunks,
                        chunkNumber=self.chunkNumber,
                        splitFilesChunkSize=self.splitFilesChunkSize,
                        fileList=self.fileList,
                        branches=keepBranchesPlotFinal,
                        config=self.config,
                        debug=True)

                    # check if this part of the sample is already cached
                    isCached = tc.partIsCached()
                    if not isCached or self.forceRedo:
                        if isCached:
                            tc.deleteCachedFiles(chunkNumber=self.chunkNumber)

                        # for the first sample which comes from this files, load the tree
                        if not self.sampleTree:
                            self.sampleTree = SampleTree(
                                {
                                    'name': sample.identifier,
                                    'folder': self.samplesPath
                                },
                                splitFilesChunkSize=self.splitFilesChunkSize,
                                chunkNumber=self.chunkNumber,
                                config=self.config,
                                saveMemory=True)
                            if not self.sampleTree or not self.sampleTree.tree:
                                print(
                                    "\x1b[31mERROR: creation of sample tree failed!!\x1b[0m"
                                )
                                raise Exception("CreationOfSampleTreeFailed")
                            # consistency check on the file list at submission time and now
                            fileListNow = self.sampleTree.getSampleFileNameChunk(
                                self.chunkNumber)
                            if self.fileList and (sorted(self.fileList) !=
                                                  sorted(fileListNow)):
                                print(
                                    "\x1b[31mERROR: sample files have changed between submission and run of the job!\x1b[0m"
                                )
                                raise Exception("SampleFilesHaveChanged")

                        treeCaches.append(
                            tc.setSampleTree(self.sampleTree).cache())
                    else:
                        print("INFO: already cached!", tc, "(", tc.hash, ")")

            if len(treeCaches) > 0:
                # run on the tree
                self.sampleTree.process()
            else:
                print("nothing to do!")
Example #32
0
# Script-level setup for plotting: read locations from the config, load the
# sample information and the pre-compiled helper libraries.
print "Compile external macros"
print "=======================\n"

#get locations:
Wdir = config.get('Directories',
                  'Wdir')  # working directory containing the output
samplesinfo = config.get('Directories', 'samplesinfo')  # samples_nosplit.cfg

path = config.get('Directories',
                  'plottingSamples')  # from which samples to plot

# config section with the settings of the plot region
section = 'Plot:%s' % region

info = ParseInfo(
    samplesinfo, path
)  # creates a list of samples by reading the info in samples_nosplit.cfg and the content of the path.

import os
# load the shared libraries only if they have been built
if os.path.exists("../interface/DrawFunctions_C.so"):
    print 'ROOT.gROOT.LoadMacro("../interface/DrawFunctions_C.so")'
    ROOT.gROOT.LoadMacro("../interface/DrawFunctions_C.so")

if os.path.exists("../interface/VHbbNameSpace_h.so"):
    print 'ROOT.gROOT.LoadMacro("../interface/VHbbNameSpace_h.so")'
    ROOT.gROOT.LoadMacro("../interface/VHbbNameSpace_h.so")


#----------Histo from trees------------
#Get the selections and the samples
def doPlot():
Example #33
0
class PlotHelper(object):
    """Fill and draw stacked data/MC histograms for one plot region.

    ``prepare()`` creates one StackMaker per variable and feeds it the cached
    tree of every data and MC sample, ``run()`` draws the stacks. Both return
    ``self`` so the calls can be chained.
    """

    def __init__(self, config, region, vars=None, title=None):
        """Read the settings needed to plot ``region`` from the configuration.

        Args:
            config: configuration object providing ``get`` and ``has_option``.
            region: region name; its options are read from ``Plot:<region>``.
            vars: optional list of variable names. When it is empty or None,
                the comma separated ``vars`` option of the region section is
                used instead.
            title: optional plot title; an empty title is stored as None.
        """
        self.config = config
        self.region = region
        self.vars = vars
        self.title = title if title and len(title) > 0 else None

        # VHbb namespace
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            # a failed load is only reported, construction continues
            print(
                "\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"
                % returnCode)
        else:
            print("INFO: loaded VHbbNameSpace: %s" % VHbbNameSpace)

        # additional blinding cut (option is contained in the plot config)
        self.addBlindingCut = None
        if self.config.has_option('Plot_general', 'addBlindingCut'):
            self.addBlindingCut = self.config.get('Plot_general',
                                                  'addBlindingCut')
            print('adding add. blinding cut:', self.addBlindingCut)

        # input/output paths
        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')

        # plot regions
        self.configSection = 'Plot:%s' % region

        # variables passed by the caller: strip them and drop empty entries
        if self.vars and isinstance(self.vars, list):
            self.vars = [x.strip() for x in self.vars if len(x.strip()) > 0]

        # if no variables were passed, read them from the config
        if not self.vars or len(self.vars) < 1:
            varListFromConfig = self.config.get(self.configSection,
                                                'vars').split(',')
            print("VARS::", self.configSection, " => ", varListFromConfig)
            self.vars = [
                x.strip() for x in varListFromConfig if len(x.strip()) > 0
            ]

        # load samples
        # NOTE: eval() executes the text found in the configuration, so only
        # trusted configuration files must be used here.
        # data samples corresponding to each CR (section)
        self.data = eval(self.config.get(self.configSection, 'Datas'))
        # list of mc samples
        self.mc = eval(self.config.get('Plot_general', 'samples'))
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        self.signalRegion = False
        if self.config.has_option(self.configSection, 'Signal'):
            self.mc.append(self.config.get(self.configSection, 'Signal'))
            self.signalRegion = True
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)

        self.groupDict = eval(self.config.get('Plot_general', 'Group'))
        self.subcutPlotName = ''
        self.histogramStacks = {}

    def prepare(self):
        """Create the histogram stacks and add the cached tree of every sample.

        Returns:
            self, to allow chaining.

        Raises:
            Exception: "CachedTreeMissing" if the tree cache has no tree for
                one of the samples.
        """
        # use the instance attributes here: the bare names ``region`` and
        # ``config`` are not defined inside this method
        print(
            "INFO: starting plot for region \x1b[34m{region}\x1b[0m, variables:"
            .format(region=self.region))
        for var in self.vars:
            print("  > {var}".format(var=var))

        self.histogramStacks = {}
        for var in self.vars:
            self.histogramStacks[var] = StackMaker(self.config,
                                                   var,
                                                   self.region,
                                                   self.signalRegion,
                                                   None,
                                                   '_' + self.subcutPlotName,
                                                   title=self.title)

        # add DATA + MC samples
        for sample in self.dataSamples + self.mcSamples:

            # cuts: sample subcut, then the optional region cut, data cut and
            # blinding cut
            sampleCuts = [sample.subcut]
            if self.config.has_option('Cuts', self.region):
                sampleCuts.append(self.config.get('Cuts', self.region))
            if self.config.has_option(self.configSection, 'Datacut'):
                sampleCuts.append(
                    self.config.get(self.configSection, 'Datacut'))
            if self.addBlindingCut:
                sampleCuts.append(self.addBlindingCut)

            # get sample tree from cache
            tc = TreeCache.TreeCache(sample=sample,
                                     cutList=sampleCuts,
                                     inputFolder=self.samplesPath,
                                     config=self.config)
            sampleTree = tc.getTree()

            if not sampleTree:
                print("\x1b[31mERROR: sampleTree not available for ", sample,
                      ", run caching again!!\x1b[0m")
                raise Exception("CachedTreeMissing")

            groupName = self.groupDict[sample.name]
            print(" > found the tree, #entries = ",
                  sampleTree.tree.GetEntries())
            print("   > group =", groupName)
            print(" > now adding the tree for vars=", self.vars)

            # add the sample tree for all the variables
            for var in self.vars:
                self.histogramStacks[var].addSampleTree(
                    sample=sample,
                    sampleTree=sampleTree,
                    groupName=groupName)
        return self

    def run(self):
        """Draw every stack twice: once as is and once with normalize=True.

        Returns:
            self, to allow chaining.
        """
        for var in self.vars:
            self.histogramStacks[var].Draw(outputFolder=self.plotPath,
                                           prefix='{region}__{var}_'.format(
                                               region=self.region, var=var))
            self.histogramStacks[var].Draw(
                outputFolder=self.plotPath,
                prefix='comp_{region}__{var}_'.format(region=self.region,
                                                      var=var),
                normalize=True)

        return self

    def getHistogramStack(self, var):
        """Return the stack created for ``var``, or None if there is none."""
        if var in self.vars and var in self.histogramStacks:
            return self.histogramStacks[var]
        return None
Example #34
0
from ROOT import TAxis
from ROOT import TLorentzVector
from ROOT import TMath
from ROOT import TLegend
#from ROOT import cmath

from ROOT import gStyle
from ROOT import gPad

from ROOT import TCanvas, TColor, TGaxis, TH1F, TPad
from ROOT import kBlack, kBlue, kRed, kViolet

# read the configuration, then build the sample description from it
config = XbbConfigReader.read('Zll2018')
path = "Zll2018config/samples_nosplit.ini"
# NOTE(review): config is passed both as first positional argument and as
# the config keyword - confirm this matches the ParseInfo signature
sampleInfo = ParseInfo(config, path, config=config)

# only the MC samples are used
usedSamples = sampleInfo.get_samples(XbbConfigTools(config).getMC())
#usedSamples = sampleInfo.get_samples(['ZJetsHT100', 'ZH_Znunu'])

# several samples can share one identifier (= same set of ROOT trees), so
# reduce to the unique identifiers to avoid processing a tree file twice
usedSampleIdentifiers = list({x.identifier for x in usedSamples})
print('usedSampleIdentifiers', usedSampleIdentifiers)

#sampleIdentifiers = sampleInfo.getSampleIdentifiers()
#usedSampleIdentifiers = ParseInfo.filterIdentifiers(sampleIdentifiers, usedSamples)

# processing step from which the root trees are taken
directory = config.get('Directories', 'sysOUT4')
Example #35
0
config = BetterConfigParser()
config.read(opts.config)

#namelist=opts.names.split(',')
#print "namelist:",namelist

pathIN = config.get('Directories', 'PREPin')
pathOUT = config.get('Directories', 'PREPout')
samplesinfo = config.get('Directories', 'samplesinfo')
sampleconf = BetterConfigParser()
sampleconf.read(samplesinfo)

prefix = sampleconf.get('General', 'prefix')

info = ParseInfo(samples_path=pathIN, config=config)
print "samplesinfo:", samplesinfo
cross_sections = {}
samples = []
for job in info:
    if not job.identifier in samples:
        if type(job.xsec) is list: job.xsec = job.xsec[0]
        cross_sections[job.identifier] = job.xsec
        samples.append(job.identifier)

for sample in samples:
    print sample, "\t", cross_sections[sample]
#    print dir(job)
#    print "job.name:",job.name," job.cross_section:",job.xsec
#    print "job.prefix:",job.prefix
#    if not job.name in namelist:
Example #36
0
    def __init__(self, opts):

        # get file list
        self.filelist = FileList.decompress(opts.fileList) if len(opts.fileList) > 0 else None
        print "len(filelist)",len(self.filelist),
        if len(self.filelist) > 0:
            print "filelist[0]:", self.filelist[0]
        else:
            print ''

        # config
        self.debug = 'XBBDEBUG' in os.environ
        self.verifyCopy = True
        self.opts = opts
        self.config = BetterConfigParser()
        self.config.read(opts.config)
        self.channel = self.config.get('Configuration', 'channel')

        # load namespace, TODO
        VHbbNameSpace = self.config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

        # directories
        self.pathIN = self.config.get('Directories', opts.inputDir)
        self.pathOUT = self.config.get('Directories', opts.outputDir)
        self.tmpDir = self.config.get('Directories', 'scratch')
        print 'INput samples:\t%s'%self.pathIN
        print 'OUTput samples:\t%s'%self.pathOUT

        self.fileLocator = FileLocator(config=self.config)

        # check if given sample identifier uniquely matches a samples from config
        matchingSamples = ParseInfo(samples_path=self.pathIN, config=self.config).find(identifier=opts.sampleIdentifier)
        if len(matchingSamples) != 1:
            print "ERROR: need exactly 1 sample identifier as input with -S !!"
            print matchingSamples
            exit(1)
        self.sample = matchingSamples[0]

        # collections
        self.collections = [x.strip() for x in opts.addCollections.split(',') if len(x.strip()) > 0] if len(opts.addCollections.strip())>0  else []
        if len(self.collections) < 1:
            print "\x1b[31mWARNING: no collections added! Specify the collections to add with the --addCollections option!\x1b[0m"
        print 'collections to add:', self.collections
        self.collections = self.parseCollectionList(self.collections)
        print 'after parsing:', self.collections

        # temorary folder to save the files of this job on the scratch
        temporaryName = self.sample.identifier + '/' + uuid.uuid4().hex

        # input files
        self.subJobs = []
        if opts.join:
            print("INFO: join input files! This is an experimental feature!")

            # translate naming convention of .txt file to imported files after the prep step
            inputFileNamesAfterPrep = [self.fileLocator.getFilenameAfterPrep(x) for x in self.filelist]

            self.subJobs.append({
                'inputFileNames': self.filelist,
                'localInputFileNames': ["{path}/{subfolder}/{filename}".format(path=self.pathIN, subfolder=self.sample.identifier, filename=localFileName) for localFileName in inputFileNamesAfterPrep],
                'outputFileName': "{path}/{subfolder}/{filename}".format(path=self.pathOUT, subfolder=self.sample.identifier, filename=inputFileNamesAfterPrep[0]),
                'tmpFileName': "{path}/{subfolder}/{filename}".format(path=self.tmpDir, subfolder=temporaryName, filename=inputFileNamesAfterPrep[0]),
                })

        else:
            
            # create separate subjob for all files (default!)
            for inputFileName in self.filelist:
                inputFileNamesAfterPrep = [self.fileLocator.getFilenameAfterPrep(inputFileName)]

                self.subJobs.append({
                    'inputFileNames': [inputFileName],
                    'localInputFileNames': ["{path}/{subfolder}/{filename}".format(path=self.pathIN, subfolder=self.sample.identifier, filename=localFileName) for localFileName in inputFileNamesAfterPrep],
                    'outputFileName': "{path}/{subfolder}/{filename}".format(path=self.pathOUT, subfolder=self.sample.identifier, filename=inputFileNamesAfterPrep[0]),
                    'tmpFileName': "{path}/{subfolder}/{filename}".format(path=self.tmpDir, subfolder=temporaryName, filename=inputFileNamesAfterPrep[0]),
                    })
# path=opts.path
pathIN = config.get("Directories", "SYSin")
pathOUT = config.get("Directories", "SYSout")
tmpDir = os.environ["TMPDIR"]

print "INput samples:\t%s" % pathIN
print "OUTput samples:\t%s" % pathOUT


# storagesamples = config.get('Directories','storagesamples')


namelist = opts.names.split(",")

# load info
info = ParseInfo(samplesinfo, pathIN)


def deltaPhi(phi1, phi2):
    """Return phi1 - phi2 shifted by full turns into the range (-pi, pi]."""
    fullTurn = 2 * math.pi
    difference = phi1 - phi2
    # too large: take off full turns until the value is at most pi
    while difference > math.pi:
        difference -= fullTurn
    # too small: add full turns until the value is above -pi
    while difference <= -math.pi:
        difference += fullTurn
    return difference


def resolutionBias(eta):
    if eta < 0.5:
        return 0.052
    if eta < 1.1:
Example #38
0
    train_list = (config.get('MVALists', 'List_for_submitscript')).split(',')
    print train_list
    for item in train_list:
        submit(item, repDict)

if opts.task == 'dc':
    DC_vars = (config.get('LimitGeneral', 'List')).split(',')
    print DC_vars

Plot_vars = ['']
if opts.task == 'plot' or opts.task == 'singleplot' or opts.task == 'mergesingleplot' or opts.task == 'checksingleplot':
    Plot_vars = (config.get('Plot_general', 'List')).split(',')

if not opts.task == 'prep':
    path = config.get("Directories", "samplepath")
    info = ParseInfo(samplesinfo, path)

if opts.task == 'plot':
    repDict['queue'] = 'all.q'
    for item in Plot_vars:
        submit(item, repDict)

if opts.task == 'trainReg':
    repDict['queue'] = 'all.q'
    submit('trainReg', repDict)

elif opts.task == 'dc':
    repDict['queue'] = 'all.q'
    for item in DC_vars:
        # item here contains the dc name
        submit(item, repDict)
Example #39
0
# Set rescale factor of 2 in case of TrainFlag
if TrainFlag:
    MC_rescale_factor=1.
    #print 'I RESCALE BY 2.0'
else: MC_rescale_factor = 1.

#systematics up/down

if doSYS == 'False':
    UD = []
else:
    UD = ['Up','Down']

#Parse samples configuration
info = ParseInfo(samplesinfo,path)

# get all the treeCut sets
all_samples        = info.get_samples(signals+backgrounds+additionals)
signal_samples     = info.get_samples(signals) 
background_samples = info.get_samples(backgrounds) 
data_sample_names  = config.get('dc:%s'%var,'data').split(' ')
data_samples       = info.get_samples(data_sample_names)

print '\n-----> Collecting all Samples...'
print '         Signals     : ', signals
print '         Backgrounds : ', backgrounds
print '         Data        : ', data_sample_names


#-------------------------------------------------------------------------------------------------
Example #40
0
    def __init__(self, config, mvaName):
        """Read samples, cuts and classifier parameters for the training ``mvaName``.

        Args:
            config: configuration object providing ``get``.
            mvaName: name of the config section describing the training. The
                options treeVarSet, MVAsettings, backgrounds, signals and
                treeCut are read from it.

        Raises:
            IndexError: if an entry of MVAsettings does not contain a '='.
        """
        self.dataRepresentationVersion = 2
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.logpath = config.get('Directories', 'logpath')
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.mvaName = mvaName
        self.MVAsettings = config.get(mvaName,'MVAsettings')
        self.factoryname = 'scikit-test1'

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

        # variables
        self.MVA_Vars = {}
        self.MVA_Vars['Nominal'] = config.get(self.treeVarSet, 'Nominal').strip().split(' ')

        # samples
        # NOTE: eval() executes the text found in the configuration, so only
        # trusted configuration files must be used here.
        self.backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
        self.signalSampleNames = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(self.backgroundSampleNames),
            'SIG': self.samplesInfo.get_samples(self.signalSampleNames),
        }

        # MVA signal region cuts
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/test samples
        self.datasets = ['train', 'test']
        self.varsets = ['X', 'y', 'sample_weight']
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')

        print("TRAINING CUT:", self.trainCut)
        print("TEST CUT:", self.evalCut)

        self.globalRescale = 2.0

        # default parameters
        self.parameters = {
                'factoryname': self.factoryname,
                'mvaName': self.mvaName,
                'MVAregionCut': self.treeCutName + ': ' + self.treeCut,
                #'classifier': 'GradientBoostingClassifier',
                'classifier': 'RandomForestClassifier',
                #'classifier': 'ExtraTreesClassifier',
                #'classifier': 'FT_GradientBoostingClassifier',
                'max_depth': None,
                'max_leaf_nodes': None,
                'class_weight': 'balanced',
                #'criterion': 'friedman_mse',
                'criterion': 'gini',
                #'n_estimators': 3000,
                'n_estimators': 400,
                #'learning_rate': 0.1,
                'algorithm': 'SAMME.R',
                #'min_samples_leaf': 100,
                'splitter': 'best',
                'max_features': 4,
                'subsample': 0.6,
                'limit': -1,
                'additional_signal_weight': 1.0,
                'min_impurity_split': 0.0,
                'bootstrap': True,
                }

        # load parameters from config in a format similar to Root TMVA parameter string:
        # 'name=expression' entries separated by ':'; they override the defaults above
        evaluatedSettings = []
        for mvaSetting in self.MVAsettings.split(':'):
            # split at the first '=' only, so the expression itself may contain '='
            settingParts = mvaSetting.split('=', 1)
            settingName = settingParts[0].strip()
            self.parameters[settingName] = eval(settingParts[1].strip())
            # repr() instead of '%r' % value: the latter raises for tuples with several
            # entries and silently unwraps tuples with one entry
            evaluatedSettings.append(settingName + '=' + repr(self.parameters[settingName]))

        self.MVAsettingsEvaluated = ':'.join(evaluatedSettings)
Example #41
0
class PlotHelper(object):
    """Fill and draw stacked data/MC histograms for one plot region.

    ``prepare()`` creates one StackMaker per variable and feeds it the cached
    tree of every selected data and MC sample, ``run()`` draws the stacks.
    Both return ``self`` so the calls can be chained.
    """

    def __init__(self, config, region, vars = None, title=None, sampleIdentifier=None):
        """Read the settings needed to plot ``region`` from the configuration.

        Args:
            config: configuration object providing ``get`` and ``has_option``.
            region: region name; its options are read from ``Plot:<region>``.
            vars: optional list of variable names. When it is empty or None,
                the comma separated ``vars`` option of the region section is
                used instead.
            title: optional plot title; an empty title is stored as None.
            sampleIdentifier: optional comma separated list of sample
                identifiers; when given, only samples with one of these
                identifiers are kept.
        """
        self.config = config
        self.region = region
        self.vars = vars
        self.title = title if title and len(title)>0 else None
        self.sampleIdentifiers = sampleIdentifier.split(',') if sampleIdentifier and len(sampleIdentifier) > 0 else None

        # VHbb namespace
        VHbbNameSpace=config.get('VHbbNameSpace','library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            # a failed load is only reported, construction continues
            print("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"%returnCode)
        else:
            print("INFO: loaded VHbbNameSpace: %s"%VHbbNameSpace)

        # input/output paths
        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')

        # plot regions
        self.configSection='Plot:%s'%region

        # variables passed by the caller: strip them and drop empty entries
        if self.vars and isinstance(self.vars, list):
            self.vars = [x.strip() for x in self.vars if len(x.strip()) > 0]

        # if variables not specified in command line, read from config
        if not self.vars or len(self.vars) < 1:
            varListFromConfig = self.config.get(self.configSection, 'vars').split(',')
            print("VARS::", self.configSection, " => ", varListFromConfig)
            self.vars = [x.strip() for x in varListFromConfig if len(x.strip()) > 0]

        # additional cut to only plot a subset of the region
        self.subcut = None
        if self.config.has_option(self.configSection, 'subcut'):
            self.subcut = self.config.get(self.configSection, 'subcut')
            print("INFO: use cut:", self.subcut)

        # additional global blinding cut (option is contained in the plot config)
        self.addBlindingCut = None
        if self.config.has_option('Plot_general','addBlindingCut'):
            self.addBlindingCut = self.config.get('Plot_general','addBlindingCut')
            print('adding add. blinding cut:', self.addBlindingCut)

        # load samples
        # NOTE: eval() executes the text found in the configuration, so only
        # trusted configuration files must be used here.
        self.data = eval(self.config.get(self.configSection, 'Datas')) # read the data corresponding to each CR (section)
        self.mc = eval(self.config.get('Plot_general', 'samples')) # read the list of mc samples
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        self.signalRegion = False
        if self.config.has_option(self.configSection, 'Signal'):
            self.mc.append(self.config.get(self.configSection, 'Signal'))
            self.signalRegion = True
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)

        # filter samples used in the plot
        if self.sampleIdentifiers:
            self.dataSamples = [x for x in self.dataSamples if x.identifier in self.sampleIdentifiers]
            self.mcSamples =   [x for x in self.mcSamples   if x.identifier in self.sampleIdentifiers]

        self.groupDict = eval(self.config.get('Plot_general', 'Group'))
        self.subcutPlotName = ''
        self.histogramStacks = {}


    def prepare(self):
        """Create the histogram stacks and add the cached tree of every sample.

        Returns:
            self, to allow chaining.

        Raises:
            Exception: "CachedTreeMissing" if the tree cache has no tree for
                one of the samples.
        """
        # use the instance attributes here: the bare names ``region`` and
        # ``config`` are not defined inside this method
        print("INFO: starting plot for region \x1b[34m{region}\x1b[0m, variables:".format(region=self.region))
        for var in self.vars:
            print("  > {var}".format(var=var))

        self.histogramStacks = {}
        for var in self.vars:
            self.histogramStacks[var] = StackMaker(self.config, var, self.region, self.signalRegion, None, '_'+self.subcutPlotName, title=self.title)

        # add DATA + MC samples
        for sample in self.dataSamples + self.mcSamples:

            # cuts: sample subcut, then the optional region cut, data cut and
            # blinding cut
            sampleCuts = [sample.subcut]
            if self.config.has_option('Cuts', self.region):
                sampleCuts.append(self.config.get('Cuts', self.region))
            if self.config.has_option(self.configSection, 'Datacut'):
                sampleCuts.append(self.config.get(self.configSection, 'Datacut'))
            if self.addBlindingCut:
                sampleCuts.append(self.addBlindingCut)

            # get sample tree from cache
            tc = TreeCache.TreeCache(
                    sample=sample,
                    cutList=sampleCuts,
                    inputFolder=self.samplesPath,
                    config=self.config
                )
            sampleTree = tc.getTree()

            if not sampleTree:
                print("\x1b[31mERROR: sampleTree not available for ", sample,", run caching again!!\x1b[0m")
                raise Exception("CachedTreeMissing")

            groupName = self.groupDict[sample.name]
            print(" > found the tree, #entries = ", sampleTree.tree.GetEntries())
            print("   > group =", groupName)
            print(" > now adding the tree for vars=", self.vars)

            # add the sample tree for all the variables; '1' is passed as cut
            # when the region has no subcut
            for var in self.vars:
                self.histogramStacks[var].addSampleTree(sample=sample, sampleTree=sampleTree, groupName=groupName, cut=self.subcut if self.subcut else '1')
        return self

    def run(self):
        """Draw every stack, plus a normalized version if enabled in the config.

        The normalized plots are drawn when the option ``drawNormalizedPlots``
        of section ``Plot_general`` exists and evaluates to a true value.

        Returns:
            self, to allow chaining.
        """
        for var in self.vars:
            self.histogramStacks[var].Draw(outputFolder=self.plotPath, prefix='{region}__{var}_'.format(region=self.region, var=var))
            if self.config.has_option('Plot_general', 'drawNormalizedPlots') and eval(self.config.get('Plot_general', 'drawNormalizedPlots')):
                self.histogramStacks[var].Draw(outputFolder=self.plotPath, prefix='comp_{region}__{var}_'.format(region=self.region, var=var), normalize=True)

        return self

    def getHistogramStack(self, var):
        """Return the stack created for ``var``, or None if there is none."""
        if var in self.vars and var in self.histogramStacks:
            return self.histogramStacks[var]
        return None
Example #42
0
    def __init__(self, config, region, sampleIdentifier=None, opts=None):
        """Read paths, cuts and sample lists for the plot region ``region``.

        Args:
            config: configuration object providing ``get`` and ``has_option``.
            region: region name; its options are read from ``Plot:<region>``.
            sampleIdentifier: optional comma separated list of sample
                identifiers; when given, only samples with one of these
                identifiers are kept.
            opts: options object, its attributes inputDir and outputDir name
                the options of section ``Directories`` to read.
                NOTE(review): opts defaults to None but is dereferenced
                unconditionally - confirm callers always pass it.
        """
        self.config = config
        self.region = region
        if sampleIdentifier and len(sampleIdentifier) > 0:
            self.sampleIdentifiers = sampleIdentifier.split(',')
        else:
            self.sampleIdentifiers = None

        # VHbb namespace: the outcome of the load is only reported
        libraryName = config.get('VHbbNameSpace', 'library')
        loadStatus = ROOT.gSystem.Load(libraryName)
        if loadStatus == 0:
            print("INFO: loaded VHbbNameSpace: %s" % libraryName)
        else:
            print(
                "\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"
                % loadStatus)

        # input/output paths
        self.fileLocator = FileLocator(config=self.config)
        self.pathIN = self.config.get('Directories', opts.inputDir)
        self.pathOUT = self.config.get('Directories', opts.outputDir)
        self.tmpDir = self.config.get('Directories', 'scratch')

        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath,
                                     config=self.config)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')

        # plot regions
        self.configSection = 'Plot:%s' % region

        # additional cut to only plot a subset of the region
        if self.config.has_option(self.configSection, 'subcut'):
            self.subcut = self.config.get(self.configSection, 'subcut')
            print("INFO: use cut:", self.subcut)
        else:
            self.subcut = None

        # additional global blinding cut (option is contained in the plot config)
        if self.config.has_option('Plot_general', 'addBlindingCut'):
            self.addBlindingCut = self.config.get('Plot_general',
                                                  'addBlindingCut')
            print('adding add. blinding cut:', self.addBlindingCut)
        else:
            self.addBlindingCut = None

        # load samples: data of this region, the general list of mc samples
        # and the luminosity are python expressions in the config
        dataExpression = self.config.get(self.configSection, 'Datas')
        self.data = eval(dataExpression)
        mcExpression = self.config.get('Plot_general', 'samples')
        self.mc = eval(mcExpression)
        lumiExpression = self.config.get('General', 'lumi')
        self.total_lumi = eval(lumiExpression)

        # a region with a 'Signal' option gets the signal added to the mc list
        self.signalRegion = False
        if self.config.has_option(self.configSection, 'Signal'):
            self.mc.append(self.config.get(self.configSection, 'Signal'))
            self.signalRegion = True
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)

        # filter samples used in the plot
        if self.sampleIdentifiers:
            selected = self.sampleIdentifiers
            self.dataSamples = [
                s for s in self.dataSamples if s.identifier in selected
            ]
            self.mcSamples = [
                s for s in self.mcSamples if s.identifier in selected
            ]
#path=opts.path
pathIN = config.get('Directories','SYSin')
pathOUT = config.get('Directories','SYSout')
tmpDir = os.environ["TMPDIR"]

print 'INput samples:\t%s'%pathIN
print 'OUTput samples:\t%s'%pathOUT


#storagesamples = config.get('Directories','storagesamples')


namelist=opts.names.split(',')

#load info
info = ParseInfo(samplesinfo,pathIN)

def deltaPhi(phi1, phi2):
    """Return phi1 - phi2 shifted by multiples of 2*pi into (-pi, pi]."""
    wrapped = phi1 - phi2
    # above pi: subtract full turns
    while wrapped > math.pi:
        wrapped = wrapped - 2*math.pi
    # at or below -pi: add full turns
    while wrapped <= -math.pi:
        wrapped = wrapped + 2*math.pi
    return wrapped

def resolutionBias(eta):
    """Return the resolution bias of the first eta bin with eta below its upper edge.

    The value is compared as given (no absolute value is taken); 0 is returned
    when eta is not below the last edge.
    """
    # (exclusive upper edge in eta, bias), in increasing order of the edge
    biasByUpperEdge = (
        (0.5, 0.052),
        (1.1, 0.057),
        (1.7, 0.096),
        (2.3, 0.134),
        (5, 0.28),
    )
    for upperEdge, bias in biasByUpperEdge:
        if eta < upperEdge:
            return bias
    return 0
Example #44
0
class MvaTrainingHelper(object):

    def __init__(self, config, mvaName):
        """Read the TMVA training configuration of the section *mvaName*.

        Args:
            config: configuration object; only its ``get(section, option)`` and
                ``has_option(section, option)`` methods are used here.
            mvaName: name of the config section describing this training. It is
                also stored as ``self.mvaName`` and becomes part of the training
                output file name.

        NOTE: the 'backgrounds' and 'signals' options are passed to ``eval()``,
        so the configuration has to come from a trusted source.
        """
        self.config = config
        self.mvaName = mvaName

        # TMVA factory name and its option string
        self.factoryname = config.get('factory', 'factoryname')
        self.factorysettings = config.get('factory', 'factorysettings')

        # a dedicated 'trainingSamples' folder takes precedence over 'MVAin'
        if config.has_option('Directories', 'trainingSamples'):
            samplesPathOption = 'trainingSamples'
        else:
            samplesPathOption = 'MVAin'
        self.samplesPath = config.get('Directories', samplesPathOption)
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath, config=self.config)

        self.sampleFilesFolder = config.get('Directories', 'samplefiles')

        # per-training settings
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.MVAtype = config.get(mvaName, 'MVAtype')
        self.MVAsettings = config.get(mvaName,'MVAsettings')

        # load the library configured in [VHbbNameSpace]
        ROOT.gSystem.Load(config.get('VHbbNameSpace', 'library'))

        # input variables: space separated list stored in the variable-set section
        nominalVariables = config.get(self.treeVarSet, 'Nominal').strip().split(' ')
        self.MVA_Vars = {'Nominal': nominalVariables}

        # samples used as background and signal
        bkgNames = eval(config.get(mvaName, 'backgrounds'))
        sigNames = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(bkgNames),
            'SIG': self.samplesInfo.get_samples(sigNames),
        }

        # without an explicit 'treeCut' option the section name itself names the cut
        if config.has_option(mvaName, 'treeCut'):
            self.treeCutName = config.get(mvaName, 'treeCut')
        else:
            self.treeCutName = mvaName
        self.treeCut = config.get('Cuts', self.treeCutName)

        # cuts which define the training and the evaluation subsets
        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')
        print("TRAINING CUT:", self.TrainCut)
        print("EVAL CUT:", self.EvalCut)

        # NOTE(review): presumably compensates for splitting the events into a
        # training and an evaluation half - confirm
        self.globalRescale = 2.0

        self.trainingOutputFileName = 'mvatraining_{factoryname}_{region}.root'.format(
            factoryname=self.factoryname,
            region=mvaName,
        )
        print("INFO: MvaTrainingHelper class created.")


    def prepare(self):
        """Create the TMVA factory and register all signal/background trees.

        Opens the training output file, creates the ``ROOT.TMVA.Factory``, loads
        the cached tree of every background and signal sample twice (once with
        the training cut, once with the evaluation cut), hands every non-empty
        tree to TMVA and finally declares the nominal input variables.

        Returns:
            self, so that calls can be chained.

        Raises:
            Exception: "SpecialWeightNotSupported" if [Weights] useSpecialWeight
                evaluates to true; "CachedTreeMissing" if the tree of a sample
                could not be obtained from the cache.
        """
        self.trainingOutputFile = ROOT.TFile.Open(self.trainingOutputFileName, "RECREATE")
        # ----------------------------------------------------------------------------------------------------------------------
        # create TMVA factory
        # ----------------------------------------------------------------------------------------------------------------------
        self.factory = ROOT.TMVA.Factory(self.factoryname, self.trainingOutputFile, self.factorysettings)
        if self.trainingOutputFile and self.factory:
            print ("INFO: initialized MvaTrainingHelper.", self.factory)
        else:
            # only reported, execution continues as before
            print ("\x1b[31mERROR: initialization of MvaTrainingHelper failed!\x1b[0m")

        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/eval trees
        # ----------------------------------------------------------------------------------------------------------------------
        # If the factory itself offers AddBackgroundTree/AddSignalTree it is used
        # directly, otherwise the trees are registered through a TMVA.DataLoader.
        try:
            addBackgroundTreeMethod = self.factory.AddBackgroundTree
            addSignalTreeMethod = self.factory.AddSignalTree
            self.dataLoader = None
        except AttributeError:
            print("oh no..")
            # the DataLoader wants to be called '.'
            self.dataLoader = ROOT.TMVA.DataLoader(".")
            addBackgroundTreeMethod = self.dataLoader.AddBackgroundTree
            addSignalTreeMethod = self.dataLoader.AddSignalTree

        # NOTE: the option value is passed to eval(); the config must be trusted
        if self.config.has_option('Weights','useSpecialWeight') and eval(self.config.get('Weights','useSpecialWeight')):
            print("\x1b[31mERROR: specialweight cannot be used with TMVA training, set it to false and add the DY_specialWeight to weightF!!\x1b[0m")
            raise Exception("SpecialWeightNotSupported")

        # DEBUG: restrict memory
        # resource.setrlimit(resource.RLIMIT_AS, (4.0*1024*1024*1024, 5.0*1024*1024*1024))

        self.sampleTrees = []
        for addTreeFcn, samples in [
                    [addBackgroundTreeMethod, self.samples['BKG']],
                    [addSignalTreeMethod, self.samples['SIG']]
                ]:
            for sample in samples:
                print ('*'*80,'\n%s\n'%sample,'*'*80)
                for additionalCut in [self.TrainCut, self.EvalCut]:
                    # cuts: sample sub-cut, train/eval cut and the cut of the mva region
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)

                    tc = TreeCache.TreeCache(
                            sample=sample,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            config=self.config,
                            debug=True
                        )
                    sampleTree = tc.getTree()

                    # check the tree BEFORE touching it, so that a missing tree
                    # produces the intended error instead of an AttributeError
                    if not sampleTree:
                        print ("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")

                    sampleTree.tree.SetCacheSize(32*1024)

                    # prevent garbage collection
                    self.sampleTrees.append(sampleTree)

                    treeScale = sampleTree.getScale(sample) * self.globalRescale

                    # only non-empty trees can be added
                    if sampleTree.tree.GetEntries() > 0:
                        if additionalCut == self.TrainCut:
                            treeType = ROOT.TMVA.Types.kTraining
                        else:
                            treeType = ROOT.TMVA.Types.kTesting
                        addTreeFcn(sampleTree.tree, treeScale, treeType)
                        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))

        # the variables are declared on whichever object received the trees
        variableOwner = self.dataLoader if self.dataLoader else self.factory
        for var in self.MVA_Vars['Nominal']:
            variableOwner.AddVariable(var, 'D')

        return self

    # ----------------------------------------------------------------------------------------------------------------------
    # backup old .xml and .info files 
    # ----------------------------------------------------------------------------------------------------------------------
    def backupOldFiles(self):
        success = False
        MVAdir = self.config.get('Directories','vhbbpath')+'/python/weights/'
        backupDir = MVAdir + 'backup/'
        try:
            os.makedirs(backupDir)
        except:
            pass
        freeNumber = 1
        try:
            lastUsedBackupDirectories = sorted(glob.glob(backupDir + '/v*/'), key=lambda x: int(x.strip('/').split('/')[-1][1:]), reverse=True)
            freeNumber = 1 + int(lastUsedBackupDirectories[0].strip('/').split('/')[-1][1:]) if len(lastUsedBackupDirectories) > 0 else 1
        except Exception as e:
            print("\x1b[31mERROR: creating backup of MVA files failed!", e, "\x1b[0m")
            freeNumber = -1
        if freeNumber > -1:
            try:
                fileNamesToBackup = glob.glob(MVAdir + self.factoryname+'_'+self.mvaName + '.*')
                fileNamesToBackup += glob.glob(MVAdir + '/../mvatraining_MVA_ZllBDT_*.root')
                os.makedirs(backupDir + 'v%d/'%freeNumber)
                for fileNameToBackup in fileNamesToBackup:
                    shutil.copy(fileNameToBackup, backupDir + 'v%d/'%freeNumber)
                success = True
            except Exception as e:
                print("\x1b[31mERROR: creating backup of MVA files failed!", e, "\x1b[0m")
        return success


    def run(self):
        """Book the configured method and run TMVA training, testing and evaluation.

        If [MVAGeneral] backupWeights evaluates to true, the old files are backed
        up first. The method is booked on the factory directly; if that call
        fails, it is booked through ``self.dataLoader`` instead. Afterwards
        TrainAllMethods, TestAllMethods and EvaluateAllMethods are executed and
        the training output file is closed.

        Returns:
            self, so that calls can be chained.
        """
        def printMaxMemory():
            # ru_maxrss as reported by getrusage for this process
            print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))

        backupFiles = False
        try:
            # NOTE: the option value is passed to eval(); the config must be trusted
            backupFiles = eval(self.config.get('MVAGeneral', 'backupWeights'))
        except Exception:
            # option missing or not evaluable: keep the default (no backup).
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            pass
        if backupFiles:
            print('backing up old BDT files')
            self.backupOldFiles()
        # ----------------------------------------------------------------------------------------------------------------------
        # Execute TMVA
        # ----------------------------------------------------------------------------------------------------------------------
        self.factory.Verbose()
        printMaxMemory()
        print('Execute TMVA: factory.BookMethod("%s", "%s", "%s")'%(self.MVAtype, self.mvaName, self.MVAsettings))
        printMaxMemory()
        weightF = self.config.get('Weights','weightF')
        try:
            self.factory.BookMethod(self.MVAtype, self.mvaName, self.MVAsettings)
            print("ROOT 5 style TMVA found")
            self.factory.SetSignalWeightExpression(weightF)
            self.factory.SetBackgroundWeightExpression(weightF)
        except Exception:
            # the factory did not accept the calls above: book via the data loader
            print("ROOT 6 style TMVA found, using data loader object!!! >_<")
            print(" weights dir:", ROOT.TMVA.gConfig().GetIONames().fWeightFileDir)
            print(" data loader:", self.dataLoader)
            print(" type:       ", self.MVAtype)
            print(" name:       ", self.mvaName)
            print(" settings:   ", self.MVAsettings)
            ROOT.TMVA.gConfig().GetIONames().fWeightFileDir = 'weights'
            self.dataLoader.SetSignalWeightExpression(weightF)
            self.dataLoader.SetBackgroundWeightExpression(weightF)
            self.factory.BookMethod(self.dataLoader, self.MVAtype, self.mvaName, self.MVAsettings)

        # the three TMVA stages are announced and executed in this order
        for stageName in ('TrainAllMethods', 'TestAllMethods', 'EvaluateAllMethods'):
            sys.stdout.flush()
            print('Execute TMVA: ' + stageName)
            printMaxMemory()
            getattr(self.factory, stageName)()

        sys.stdout.flush()
        print('Execute TMVA: output.Write')
        printMaxMemory()
        self.trainingOutputFile.Close()
        return self

    def printInfo(self):
        """Pickle the training settings into ``<factoryname>_<mvaName>.info``.

        The file is written to ``<vhbbpath>/python/weights/``. It contains one
        ``mvainfo`` object carrying factory name/settings, MVA type/settings,
        the weights folder, the samples path, the variable set name and the
        list of nominal variables.
        """
        #WRITE INFOFILE
        MVAdir = self.config.get('Directories','vhbbpath')+'/python/weights/'
        infoFileName = MVAdir+self.factoryname+'_'+self.mvaName+'.info'

        info=mvainfo(self.mvaName)
        info.factoryname=self.factoryname
        info.factorysettings=self.factorysettings
        info.MVAtype=self.MVAtype
        info.MVAsettings=self.MVAsettings
        info.weightfilepath=MVAdir
        info.path=self.samplesPath
        info.varset=self.treeVarSet
        info.vars=self.MVA_Vars['Nominal']

        # pickle needs a binary file on Python 3 ('wb' is fine on Python 2 as well);
        # the with-block closes the file even if dumping fails
        with open(infoFileName, 'wb') as infofile:
            print ('@DEBUG: output infofile name')
            print (infofile)
            pickle.dump(info,infofile)

    def getExpectedSignificance(self, tree, nBins, xMin, xMax, power=1.0, rescaleSig=1.0, rescaleBkg=1.0):
        """Print a per-bin S/B table and return the combined expected significance.

        Every event of *tree* is filled into a signal (classID == 1) or a
        background histogram, using the branch named ``self.mvaName`` as score
        and ``event.weight`` times the rescale factor as weight.

        Args:
            tree: iterable of events with attributes <mvaName>, weight, classID.
            nBins, xMin, xMax: binning of the two histograms.
            power: if different from 1.0, the score is mapped to [0, 0.999999]
                within the histogram range, raised to this power and mapped back.
                Otherwise the score is only clamped to [xMin, xMax-0.00001].
            rescaleSig, rescaleBkg: factors applied to signal/background weights.

        Returns:
            Tuple (Z, sSum, bSum) with Z = sqrt(sum over bins of S^2/(S+B)) and
            the summed signal and background bin contents.
        """
        sigHist = ROOT.TH1D("hSig","hSig",nBins,xMin,xMax)
        bkgHist = ROOT.TH1D("hBkg","hBkg",nBins,xMin,xMax)
        print("INFO: GetEntries() = ", tree.GetEntries())
        transformScore = power != 1.0
        if transformScore:
            print("INFO: rescale BDT score with power ", power)

        for event in tree:
            score = getattr(event, self.mvaName)
            if transformScore:
                # relative position inside the histogram range, limited to [0, 0.999999]
                x = (score-xMin)/(xMax-xMin)
                if x<0:
                    x=0
                if x>0.999999:
                    x=0.999999
                value = math.pow(x, power)*(xMax-xMin)+xMin
            else:
                # keep the value inside the histogram range
                value = max(min(score,xMax-0.00001),xMin)

            weight = event.weight
            if event.classID == 1:
                sigHist.Fill(value, weight * rescaleSig)
            else:
                bkgHist.Fill(value, weight * rescaleBkg)

        ssbSum = 0.0
        sSum = 0
        bSum = 0
        sbTableFormat = "{bin: <16}{signal: <16}{background: <16}{ssb: <16}"
        print("---- nBins =", nBins, " from ", xMin, "..", xMax, "-----")
        print(sbTableFormat.format(bin="bin", signal="signal", background="background", ssb="S/sqrt(S+B)"))
        for i in range(nBins):
            # histogram bins are numbered from 1
            s = sigHist.GetBinContent(1+i)
            b = bkgHist.GetBinContent(1+i)
            if (s + b) > 0:
                ssbSum += s*s/(s + b)
                ssb = s/math.sqrt(s + b)
            else:
                # empty bins contribute nothing
                ssb = 0
            sSum += s
            bSum += b
            print(sbTableFormat.format(bin=i, signal=round(s,2), background=round(b,2), ssb=round(ssb,3)))
        expectedSignificance = math.sqrt(ssbSum)
        print(sbTableFormat.format(bin="SUM", signal=round(sSum,1), background=round(bSum,1), ssb="\x1b[34mZ=%1.3f\x1b[0m"%expectedSignificance))
        print("-"*40)
        sigHist.Delete()
        bkgHist.Delete()
        return expectedSignificance, sSum, bSum

    def estimateExpectedSignificance(self):
        """Print expected-significance tables for the test and the training tree.

        Reads ``TestTree`` and ``TrainTree`` from the training output file and
        calls ``getExpectedSignificance`` for several upper range limits and
        score powers on the test tree, then for a close-to-nominal binning on
        both trees. The training tree is evaluated a second time with signal and
        background rescaled to the test-tree sums. Nothing is returned; all
        results are printed.
        """
        print("INFO: open ", self.trainingOutputFileName)
        rootFile = ROOT.TFile.Open(self.trainingOutputFileName, "READ")
        print("INFO: ->", rootFile)
        try:
            testTree = rootFile.Get('./TestTree')

            # run a few tests with different binnings and rescaling of BDT score
            for xMax in (1.0, 0.9, 0.8, 0.75, 0.7):
                self.getExpectedSignificance(testTree, 15, -0.8, xMax)
            for power in (0.5, 0.33, 1.5, 2.0):
                self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=power)

            # close to nominal binning
            print("---- ~nominal TEST -----")
            esTest, sTest, bTest = self.getExpectedSignificance(testTree, 15, -0.8, 0.8)
            print("---- ~nominal TRAINING (without correct normalization) -----")
            trainTree = rootFile.Get('./TrainTree')
            esTrain, sTrain, bTrain = self.getExpectedSignificance(trainTree, 15, -0.8, 0.8)

            # the tree ./TrainTree contains the input events for training AFTER re-balancing the classes
            # therefore for SIG/BKG separately the normalization is fixed to the one of the TEST events
            # (a class with zero sum stays zero for any factor, so 1.0 avoids a ZeroDivisionError)
            rescaleSig = 1.0*sTest/sTrain if sTrain != 0 else 1.0
            rescaleBkg = 1.0*bTest/bTrain if bTrain != 0 else 1.0
            print("---- ~nominal TRAINING -----")
            esTrain, sTrain, bTrain = self.getExpectedSignificance(trainTree, 15, -0.8, 0.8, rescaleSig=rescaleSig, rescaleBkg=rescaleBkg)
        finally:
            # the trees are not used after this point, release the file handle
            if rootFile:
                rootFile.Close()

    def getbdtHistogram(self, tree):
        """Fill the score of every event of *tree* into a signal or background histogram.

        Both histograms have 40 bins from -1 to 1. Events with classID == 1 go
        into the first histogram, all others into the second. Entries are filled
        without weights.

        Returns:
            List [signal histogram, background histogram].
        """
        histTitle = "TMVA overtraining check for classifier: %s"%self.mvaName
        signalHist = ROOT.TH1D("hSig",histTitle,40,-1,1)
        backgroundHist = ROOT.TH1D("hBkg",histTitle,40,-1,1)
        print("INFO: GetEntries() = ", tree.GetEntries())
        for event in tree:
            score = getattr(event, self.mvaName)
            targetHist = signalHist if event.classID == 1 else backgroundHist
            targetHist.Fill(score)
        return [signalHist, backgroundHist]

    def setTMVASyle(self):
        """Style the four overtraining histograms and activate a global "TMVA" style.

        Expects ``self.hSIGtest``, ``self.hBKGtest``, ``self.hSIGtrain`` and
        ``self.hBKGtrain`` to be set. The test histograms are drawn as filled
        areas, the training histograms as markers in the line colour of the
        corresponding test histogram. A copy of ROOT's "Plain" style is then
        modified, registered under the name "TMVA" and made the current style.
        """
        # --- test sample: filled histograms -------------------------------------------
        self.hSIGtest.SetLineColor(ROOT.TColor.GetColor("#0000ee"))
        self.hSIGtest.SetLineWidth(1)
        self.hSIGtest.SetFillStyle(1001)
        self.hSIGtest.SetFillColor(ROOT.TColor.GetColor("#7d99d1"))
        self.hSIGtest.SetTitle("TMVA overtraining check for classifier: %s"%self.mvaName )

        # all other histograms take over the title of the signal test histogram
        commonTitle = self.hSIGtest.GetTitle()

        self.hBKGtest.SetLineColor(ROOT.TColor.GetColor("#ff0000"))
        self.hBKGtest.SetLineWidth(1)
        self.hBKGtest.SetFillStyle(3554)
        self.hBKGtest.SetFillColor(ROOT.TColor.GetColor("#ff0000"))
        self.hBKGtest.SetTitle(commonTitle)

        # --- training sample: markers in the colour of the matching test histogram ----
        for trainHist, testHist in (
                (self.hSIGtrain, self.hSIGtest),
                (self.hBKGtrain, self.hBKGtest),
        ):
            trainHist.SetMarkerColor(testHist.GetLineColor())
            trainHist.SetMarkerSize(0.7)
            trainHist.SetMarkerStyle(20)
            trainHist.SetLineWidth(1)
            trainHist.SetLineColor(testHist.GetLineColor())
            trainHist.SetTitle(commonTitle)

        # --- global style, based on "Plain" -------------------------------------------
        style = ROOT.TStyle(ROOT.gROOT.GetStyle("Plain"))
        style.SetName("TMVA")
        style.SetTitle("TMVA style based on \"Plain\" with modifications defined in tmvaglob.C")
        ROOT.gROOT.GetListOfStyles().Add(style)
        ROOT.gROOT.SetStyle("TMVA")

        for lineStyleIndex, dashPattern in ((5, "[52 12]"), (6, "[22 12]"), (7, "[22 10 7 10]")):
            style.SetLineStyleString(lineStyleIndex, dashPattern)

        UsePaperStyle = False

        # the pretty color palette of old
        if UsePaperStyle:
            style.SetPalette(18)
        else:
            style.SetPalette(1)

        # use plain black on white colors, no legend border
        for setterName in (
                'SetFrameBorderMode',
                'SetCanvasBorderMode',
                'SetPadBorderMode',
                'SetPadColor',
                'SetFillStyle',
                'SetLegendBorderSize',
        ):
            getattr(style, setterName)(0)

        # colour indices are looked up in this fixed order before they are used
        titleBoxColor = ROOT.TColor.GetColor( "#5D6B7D" )
        titleTextColor = ROOT.TColor.GetColor( "#FFFFFF" )
        titleBorderColor = ROOT.TColor.GetColor( "#7D8B9D" )
        frameFillColor = ROOT.TColor.GetColor( "#fffffd" )
        canvasColor = ROOT.TColor.GetColor( "#f0f0f0" )

        style.SetTitleFillColor(titleBoxColor)
        style.SetTitleTextColor(titleTextColor)
        style.SetTitleBorderSize(1)
        style.SetLineColor(titleBorderColor)
        if not UsePaperStyle:
            style.SetFrameFillColor(frameFillColor)
            style.SetCanvasColor(canvasColor)

        # set the paper & margin sizes
        style.SetPaperSize(20,26)
        style.SetPadTopMargin(0.10)
        style.SetPadRightMargin(0.05)
        style.SetPadBottomMargin(0.11)
        style.SetPadLeftMargin(0.12)

        # use bold lines and markers
        style.SetMarkerStyle(21)
        style.SetMarkerSize(0.3)
        style.SetHistLineWidth(2)
        style.SetLineStyleString(2,"[12 12]") # postscript dashes

        # title is shown, statistics and fit boxes are not
        style.SetOptTitle(1)
        style.SetTitleH(0.052)
        style.SetOptStat(0)
        style.SetOptFit(0)

        # put tick marks on top and RHS of plots
        style.SetPadTickX(1)
        style.SetPadTickY(1)

    def nomrmaliseHist(self,hSIG, hBKG):
        if (hSIG.GetSumw2N() == 0):
            hSIG.Sumw2()
        if (hBKG and hBKG.GetSumw2N() == 0): 
            hBKG.Sumw2()
     
        if(hSIG.GetSumOfWeights()!=0): 
            dx = (hSIG.GetXaxis().GetXmax() - hSIG.GetXaxis().GetXmin())/hSIG.GetNbinsX()
            hSIG.Scale(1.0/hSIG.GetSumOfWeights()/dx)
        if (hBKG != 0 and hBKG.GetSumOfWeights()!=0):
            dx = (hBKG.GetXaxis().GetXmax() - hBKG.GetXaxis().GetXmin())/hBKG.GetNbinsX()
            hBKG.Scale( 1.0/hBKG.GetSumOfWeights()/dx )

    def drawOvertraining(self):
        """Draw the train/test score distributions and save them as a PDF.

        Uses the histograms ``self.hSIGtest``, ``self.hBKGtest``,
        ``self.hSIGtrain`` and ``self.hBKGtrain``, which must be set before this
        method is called. The histograms are normalised in place, drawn on top
        of an empty frame together with two legends, the result of a
        Kolmogorov-Smirnov test between test and training histograms and the
        under-/overflow fractions. The canvas is written to
        ``<vhbbpath>/python/weights/overtraining<mvaName>.pdf``.
        """

        #normalise histograms
        self.nomrmaliseHist(self.hSIGtest, self.hBKGtest)
        self.nomrmaliseHist(self.hSIGtrain, self.hBKGtrain)

        c = ROOT.TCanvas("canvas1", "TMVA comparison %s"%self.mvaName, 0, 200, 600, 468) 

        # frame limits (choose judicious x range): mean +- nrms*RMS of the test
        # histograms, but never beyond the axis range of the signal test histogram
        nrms = 10
        xmin = ROOT.TMath.Max(ROOT.TMath.Min(self.hSIGtest.GetMean() - nrms*self.hSIGtest.GetRMS(), self.hBKGtest.GetMean() - nrms*self.hBKGtest.GetRMS() ),self.hSIGtest.GetXaxis().GetXmin() )
        xmax = ROOT.TMath.Min(ROOT.TMath.Max(self.hSIGtest.GetMean() + nrms*self.hSIGtest.GetRMS(), self.hBKGtest.GetMean() + nrms*self.hBKGtest.GetRMS()), self.hSIGtest.GetXaxis().GetXmax())
        ymin = 0
        # the y range leaves 30% headroom above the highest of the four histograms
        maxMult = 1.3
        #maxMult = (htype == CompareType) ? 1.3 : 1.2
        ymax = ROOT.TMath.Max(self.hSIGtest.GetMaximum(), self.hBKGtest.GetMaximum())*maxMult
        ymax = ROOT.TMath.Max(ymax,ROOT.TMath.Max(self.hSIGtrain.GetMaximum(), self.hBKGtrain.GetMaximum())*maxMult)
        #print ('ymax is', ymax)
        #print (self.hSIGtest.GetMaximum())
        #print (self.hBKGtest.GetMaximum())
        #print (self.hSIGtrain.GetMaximum())
        #print (self.hBKGtrain.GetMaximum())
        #sys.exit()
   
        # build a frame: an empty 2D histogram which only provides axes and title
        nb = 500
        hFrameName = "frame" + self.mvaName
        #o = ROOT.gROOT.FindObject(hFrameName)
        frame = ROOT.TH2F(hFrameName, self.hSIGtest.GetTitle(), nb, xmin, xmax, nb, ymin, ymax )
        frame.GetXaxis().SetTitle(self.mvaName + " response")
        frame.GetYaxis().SetTitle("(1/N) dN^{ }/^{ }dx")

        #TMVAGlob.SetFrameStyle( frame )
        frame.SetLabelOffset( 0.012, "X" )
        frame.SetLabelOffset( 0.012, "Y" )
        frame.GetXaxis().SetTitleOffset( 1.25 )
        frame.GetYaxis().SetTitleOffset( 1.22 )
        frame.GetXaxis().SetTitleSize( 0.045)
        frame.GetYaxis().SetTitleSize( 0.045)
        frame.GetXaxis().SetLabelSize( 0.04)
        frame.GetYaxis().SetLabelSize( 0.04)

        # global style settings of the current pad
        ROOT.gPad.SetTicks()
        ROOT.gPad.SetLeftMargin  ( 0.108)
        ROOT.gPad.SetRightMargin ( 0.050)
        ROOT.gPad.SetBottomMargin( 0.120)
   
        # eventually: draw the frame
        frame.Draw()  
    
        c.GetPad(0).SetLeftMargin(0.105 )
        frame.GetYaxis().SetTitleOffset( 1.2 )

        # Draw legend for the test histograms (top left corner of the pad)
        legend = ROOT.TLegend(c.GetLeftMargin(), 1 - c.GetTopMargin() - 0.12, c.GetLeftMargin() + 0.40, 1 - c.GetTopMargin() )
        legend.SetFillStyle(1)
        legend.AddEntry(self.hSIGtest,"Signal"     + " (test sample)", "F")
        legend.AddEntry(self.hBKGtest,"Background" + " (test sample)", "F")
        legend.SetBorderSize(1)
        legend.SetMargin(0.2)
        legend.Draw("same")

        # second legend for the training histograms (top right corner of the pad)
        legend2= ROOT.TLegend( 1 - c.GetRightMargin() - 0.42, 1 - c.GetTopMargin() - 0.12, 1 - c.GetRightMargin(), 1 - c.GetTopMargin() )
        legend2.SetFillStyle(1)
        legend2.SetBorderSize(1)
        legend2.AddEntry(self.hSIGtrain,"Signal (training sample)","P")
        legend2.AddEntry(self.hBKGtrain,"Background (training sample)","P")
        legend2.SetMargin( 0.1 )
        legend2.Draw("same")

        # colours, fill and marker styles of the histograms plus the global TMVA style
        self.setTMVASyle()

        # test samples as histograms, training samples as points with error bars
        self.hSIGtest.Draw('samehist')
        self.hBKGtest.Draw('samehist')
        self.hSIGtrain.Draw('e1same')
        self.hBKGtrain.Draw('e1same')

        #perform K-S test between the test and the training histogram of each class
        print("--- Perform Kolmogorov-Smirnov tests")
        #//Double_t kolS = sig->KolmogorovTest( self.hSIGtrain, "X" );
        #//Double_t kolB = bgd->KolmogorovTest( bgdOv, "X" );
        kolS = self.hSIGtest.KolmogorovTest( self.hSIGtrain);
        kolB = self.hBKGtest.KolmogorovTest( self.hBKGtrain);
        print ("--- Goodness of signal (background) consistency: " + str(kolS) + " (" + str(kolB) + ")")

        # show the K-S results on the plot (position given in NDC coordinates)
        probatext = "Kolmogorov-Smirnov test: signal (background) probability = % 5.3g (%5.3g)"% (kolS, kolB)
        tt = ROOT.TText(0.12, 0.74, probatext)
        tt.SetNDC()
        tt.SetTextSize(0.032)
        tt.AppendPad()

        # redraw axes
        frame.Draw("sameaxis")

        # text for overflows: bin 0 is the underflow bin, bin nbin+1 the overflow bin;
        # content times bin width times 100 is printed as percentage
        nbin = self.hSIGtest.GetNbinsX()
        dxu  = self.hSIGtest.GetBinWidth(0)
        dxo  = self.hSIGtest.GetBinWidth(nbin+1)
        uoflow =  "U/O-flow (S,B): (%.1f, %.1f)%% / (%.1f, %.1f)%%"% (self.hSIGtest.GetBinContent(0)*dxu*100, self.hBKGtest.GetBinContent(0)*dxu*100, self.hSIGtest.GetBinContent(nbin+1)*dxo*100, self.hBKGtest.GetBinContent(nbin+1)*dxo*100)
        t = ROOT.TText( 0.975, 0.115, uoflow )
        t.SetNDC()
        t.SetTextSize( 0.030 )
        t.SetTextAngle( 90 )
        t.AppendPad()    
   
        # update canvas
        c.Update()


        # the PDF is stored in the weights folder
        MVAdir = self.config.get('Directories','vhbbpath')+'/python/weights/'
        c.SaveAs(MVAdir+'overtraining%s.pdf'%self.mvaName)
        print ('I saved the canvase in', MVAdir+'overtraining%s.pdf'%self.mvaName)

    def saveOvertrainingPlots(self):
        """Load the four score histograms from the training output and plot them.

        The test and training histograms of signal and background are read from
        the ``Method_<mvaName>/<mvaName>/`` folder of the training output file
        and stored as ``self.hSIGtest``, ``self.hBKGtest``, ``self.hSIGtrain``
        and ``self.hBKGtrain``; afterwards ``drawOvertraining`` is called.
        """
        print("INFO: open ", self.trainingOutputFileName)
        rootFile = ROOT.TFile.Open(self.trainingOutputFileName, "READ")
        print("INFO: ->", rootFile)

        # every path template contains the MVA name three times
        nameTriple = (self.mvaName,self.mvaName,self.mvaName)
        histogramPaths = (
            ('hSIGtest', './Method_%s/%s/MVA_%s_S'),
            ('hBKGtest', './Method_%s/%s/MVA_%s_B'),
            ('hSIGtrain', './Method_%s/%s/MVA_%s_Train_S'),
            ('hBKGtrain', './Method_%s/%s/MVA_%s_Train_B'),
        )
        for attributeName, pathTemplate in histogramPaths:
            setattr(self, attributeName, rootFile.Get(pathTemplate%nameTriple))
        print("./Method_%s/%s/MVA_%s_Train_B"%nameTriple)
        self.drawOvertraining()
Example #45
0
class CacheTraining(object):

    def __init__(self, config, sampleIdentifier, trainingRegions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, force=False):
        """Collect samples, cuts and variables needed to cache the training trees.

        Args:
            config: configuration object (``get`` / ``has_option`` are used).
            sampleIdentifier: identifier of the sample whose files are cached.
            trainingRegions: list of config section names of the trainings.
            splitFilesChunks, chunkNumber, splitFilesChunkSize: stored and later
                passed on to the tree cache / sample tree.
            force: stored as ``self.force``.

        NOTE: the 'backgrounds', 'signals' and 'systematics' options are passed
        to ``eval()``, so the configuration has to come from a trusted source.
        """
        self.config = config
        self.force = force
        self.sampleIdentifier = sampleIdentifier
        self.trainingRegions = trainingRegions

        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'MVAin')
        self.samplesDefinitions = self.config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        # sample names of all regions, concatenated region by region and made unique
        allBackgrounds = [
            name
            for region in self.trainingRegions
            for name in eval(self.config.get(region, 'backgrounds'))
        ]
        self.backgroundSampleNames = list(set(allBackgrounds))
        allSignals = [
            name
            for region in self.trainingRegions
            for name in eval(self.config.get(region, 'signals'))
        ]
        self.signalSampleNames = list(set(allSignals))
        uniqueSampleNames = list(set(self.backgroundSampleNames + self.signalSampleNames))
        self.samples = self.samplesInfo.get_samples(uniqueSampleNames)

        # per region: cut string, input variables and systematic weight expressions
        self.trainingRegionsDict = {}
        for region in self.trainingRegions:
            cutName = config.get(region, 'treeCut')
            varSet = config.get(region, 'treeVarSet').strip()
            if config.has_option(region, 'systematics'):
                regionSystematics = eval(config.get(region, 'systematics'))
            else:
                regionSystematics = []
            inputVars = config.get(varSet, 'Nominal').split(' ')

            systWeights = []
            for syst in regionSystematics:
                # upper-case suffix if such an option exists in [Weights], else '_Up'/'_Down'
                if self.config.has_option('Weights',syst+'_UP'):
                    upName = syst+'_UP'
                else:
                    upName = syst+'_Up'
                if self.config.has_option('Weights',syst+'_DOWN'):
                    downName = syst+'_DOWN'
                else:
                    downName = syst+'_Down'
                systWeights.append(self.config.get('Weights',upName))
                systWeights.append(self.config.get('Weights',downName))

            self.trainingRegionsDict[region] = {
                    'cut': config.get('Cuts', cutName),
                    'vars': inputVars,
                    'weightVars': systWeights,
                    }

        # cuts which define the training and the evaluation subsets
        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')

        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.splitFilesChunkSize = splitFilesChunkSize

        # load the library configured in [VHbbNameSpace]
        ROOT.gSystem.Load(config.get('VHbbNameSpace','library'))
    
    def printInfo(self):
        print ("REGION:".ljust(24),"CUT:")
        for trainingRegion,trainingRegionInfo in self.trainingRegionsDict.iteritems():
            print (" > ",trainingRegion.ljust(20), trainingRegionInfo['cut'])

    def run(self):
        """Cache the training and evaluation sub-trees of ``self.sampleIdentifier``.

        For every (sub)sample with this identifier, every training region and
        both the training and the evaluation cut, a ``TreeCache`` is set up.
        Parts which are not cached yet (or all parts if ``self.force`` is set,
        after deleting the existing cache files) are attached to one shared
        ``SampleTree``, which is then processed once. If nothing has to be
        cached, "nothing to do!" is printed.
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # cache samples
        # ----------------------------------------------------------------------------------------------------------------------
        for sampleToCache in [self.sampleIdentifier]:
            print ('*'*80)
            print (' ',sampleToCache)
            print ('*'*80)
            # prepare caches for training and evaluation samples
            treeCaches = []
            self.sampleTree = None

            # use all (sub)samples which come from the same files (sampleIdentifier)
            subsamples = [x for x in self.samples if x.identifier == sampleToCache]

            # list of branches to keep for use as MVA input variables
            # NOTE(review): the same expressions are added once per subsample and
            # per train/eval cut; kept as is because it is not visible here
            # whether BranchList.addCut tolerates being called fewer times.
            # dict.values()/items() are used since iteritems() is Python 2 only.
            branchListOfMVAVars = BranchList()
            for sample in subsamples:
                for trainingRegionInfo in self.trainingRegionsDict.values():
                    for additionalCut in [self.TrainCut, self.EvalCut]:
                        branchListOfMVAVars.addCut(trainingRegionInfo['vars'])
                    for weightVar in trainingRegionInfo['weightVars']:
                        branchListOfMVAVars.addCut(weightVar)
            branchListOfMVAVars.addCut(self.config.get('Weights', 'weightF'))
            mvaBranches = branchListOfMVAVars.getListOfBranches()

            # loop over all samples
            for sample in subsamples:

                # add cuts for all training regions
                for trainingRegion,trainingRegionInfo in self.trainingRegionsDict.items():

                    # add cuts for training and evaluation
                    for additionalCut in [self.TrainCut, self.EvalCut]:

                        # cuts: sample sub-cut, train/eval cut and the cut of the region
                        sampleCuts = [sample.subcut]
                        if additionalCut:
                            sampleCuts.append(additionalCut)
                        if trainingRegionInfo['cut']:
                            sampleCuts.append(trainingRegionInfo['cut'])

                        # add cache object
                        tc = TreeCache.TreeCache(
                            name='{region}_{sample}_{tr}'.format(region=trainingRegion, sample=sample.name, tr='TRAIN' if additionalCut==self.TrainCut else 'EVAL'),
                            sample=sample.name,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            splitFilesChunks=self.splitFilesChunks,
                            chunkNumber=self.chunkNumber,
                            splitFilesChunkSize=self.splitFilesChunkSize,
                            branches=mvaBranches,
                            config=self.config,
                            debug=True
                        )

                        # check if this part of the sample is already cached
                        isCached = tc.partIsCached()
                        if not isCached or self.force:
                            if isCached:
                                # forced re-caching: remove the existing files first
                                tc.deleteCachedFiles(chunkNumber=self.chunkNumber)
                            # for the first sample which comes from these files, load the tree
                            if not self.sampleTree:
                                self.sampleTree = SampleTree({'name': sample.identifier, 'folder': self.samplesPath}, splitFilesChunkSize=self.splitFilesChunkSize, chunkNumber=self.chunkNumber, config=self.config, saveMemory=True)
                            treeCaches.append(tc.setSampleTree(self.sampleTree).cache())

            if len(treeCaches) > 0:
                # run on the tree
                self.sampleTree.process()
            else:
                print ("nothing to do!")
class SampleTreesToNumpyConverter(object):
    """Convert cached sample trees of one MVA region into numpy arrays.

    For every signal/background sample the nominal input variables, the
    systematically varied input variables, the event weights and the category
    index are collected separately for a train and a test split. ``run()``
    stores the result in ``self.data`` and writes it as a gzip-compressed
    pickle to ``./<mvaName>.dmpz``.
    """

    @staticmethod
    def _splitList(text):
        """Split a space-separated config value into its non-empty entries."""
        return [x for x in text.strip().split(' ') if len(x.strip()) > 0]

    def __init__(self, config, mvaName):
        """Read region, cut, variable and sample definitions from the config.

        :param config: config object providing ``get(section, option)``
        :param mvaName: name of the config section describing the MVA region;
            it must provide the options ``treeCut`` and ``treeVarSet``
        """
        self.mvaName = mvaName
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
        self.dataFormatVersion = 2
        self.sampleTrees = []
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)

        # region
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/eval sets
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')
        # rescale MC by 2 because of train/eval split
        self.globalRescale = 2.0

        # variables and systematics
        # Empty entries (double spaces, empty option) are dropped for the
        # systematics as well; a '' systematic would otherwise be looked up as
        # an option name in the treeVarSet section.
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.systematics = self._splitList(config.get('systematics', 'systematics'))
        self.MVA_Vars = {'Nominal': self._splitList(config.get(self.treeVarSet, 'Nominal'))}
        for syst in self.systematics:
            self.MVA_Vars[syst] = self._splitList(config.get(self.treeVarSet, syst))

        # samples
        # NOTE(review): eval() executes whatever expression the config holds;
        # this is only acceptable for trusted configuration files.
        self.sampleNames = {
#                   'BKG_TT': eval(self.config.get('Plot_general', 'TT')),
#                   'BKG_ST': eval(self.config.get('Plot_general', 'ST')),
#                   'BKG_VV': eval(self.config.get('Plot_general', 'VV')),
#                   'BKG_DY2b': eval(self.config.get('Plot_general', 'DY2b')),
#                   'BKG_DY1b': eval(self.config.get('Plot_general', 'DY1b')),
#                   'BKG_DY0b': eval(self.config.get('Plot_general', 'DYlight')),
#                   'SIG_ggZH': eval(self.config.get('Plot_general', 'ggZH')),
#                   'SIG_qqZH': eval(self.config.get('Plot_general', 'qqZH')),
                    'SIG_ALL': eval(self.config.get('Plot_general', 'allSIG')),
                    'BKG_ALL': eval(self.config.get('Plot_general', 'allBKG')),
                }
        # .items() instead of .iteritems(): works on Python 2 and Python 3
        self.samples = {category: self.samplesInfo.get_samples(samples) for category, samples in self.sampleNames.items()}


    def run(self):
        """Fill the numpy arrays from the cached trees and write them to disk.

        The result is kept in ``self.data`` and pickled (gzip-compressed) to
        ``./<mvaName>.dmpz``.

        :raises Exception: ``"CachedTreeMissing"`` if the tree cache returns no
            tree for one of the samples
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/testing trees
        # ----------------------------------------------------------------------------------------------------------------------
        # list(): the category order defines the target index, so it has to be
        # a real, index-able sequence (dict views on Python 3 are not)
        categories = list(self.samples.keys())
        datasetParts = {'train': self.trainCut, 'test': self.evalCut}

        systematics = self.systematics
        arrayLists = {datasetName: [] for datasetName in datasetParts}
        arrayLists_sys = {x: {datasetName: [] for datasetName in datasetParts} for x in systematics}
        weightLists = {datasetName: [] for datasetName in datasetParts}
        targetLists = {datasetName: [] for datasetName in datasetParts}

        # standard weight expression
        weightF = self.config.get('Weights','weightF')

        # the variable lists are identical for all samples
        features = self.MVA_Vars['Nominal']
        features_sys = {x: self.MVA_Vars[x] for x in systematics}
        nFeatures = len(features)

        # enumerate() provides the target index directly instead of calling
        # categories.index(category) once per event
        for categoryIndex, category in enumerate(categories):
            for sample in self.samples[category]:
                print ('*'*80,'\n%s\n'%sample,'*'*80)
                for datasetName, additionalCut in datasetParts.items():
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)

                    # get ROOT tree for selected sample & region cut
                    tc = TreeCache.TreeCache(
                            sample=sample,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            config=self.config,
                            debug=True
                        )
                    sampleTree = tc.getTree()
                    if not sampleTree:
                        print ("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")

                    treeScale = sampleTree.getScale(sample) * self.globalRescale
                    print ('scale:', treeScale)

                    # initialize numpy arrays
                    nSamples = sampleTree.GetEntries()
                    print('nFeatures:', nFeatures)
                    inputData = np.zeros((nSamples, nFeatures), dtype=np.float32)
                    # NOTE(review): the systematic arrays use the NOMINAL
                    # number of columns; this assumes every systematic
                    # variable list has the same length -- confirm.
                    inputData_sys = {x: np.zeros((nSamples, nFeatures), dtype=np.float32) for x in systematics}

                    # initialize formulas for ROOT tree
                    for feature in features:
                        sampleTree.addFormula(feature)
                    for systFeatures in features_sys.values():
                        for feature in systFeatures:
                            sampleTree.addFormula(feature)
                    sampleTree.addFormula(weightF)

                    # fill numpy arrays from ROOT tree
                    for i, event in enumerate(sampleTree):
                        for j, feature in enumerate(features):
                            inputData[i, j] = sampleTree.evaluate(feature)
                        # total weight comes from weightF (btag, lepton sf, ...) and treeScale to scale MC to x-section
                        totalWeight = treeScale * sampleTree.evaluate(weightF)
                        weightLists[datasetName].append(totalWeight)
                        targetLists[datasetName].append(categoryIndex)

                        # fill systematics
                        for k, systFeatures in features_sys.items():
                            for j, feature in enumerate(systFeatures):
                                inputData_sys[k][i, j] = sampleTree.evaluate(feature)

                    arrayLists[datasetName].append(inputData)
                    for syst in systematics:
                        arrayLists_sys[syst][datasetName].append(inputData_sys[syst])

        # concatenate all data from different samples
        self.data = {
                'train': {
                    'X': np.concatenate(arrayLists['train'], axis=0),
                    'y': np.array(targetLists['train'], dtype=np.float32),
                    'sample_weight': np.array(weightLists['train'], dtype=np.float32),
                    },
                'test': {
                    'X': np.concatenate(arrayLists['test'], axis=0),
                    'y': np.array(targetLists['test'], dtype=np.float32),
                    'sample_weight': np.array(weightLists['test'], dtype=np.float32),
                    },
                'category_labels': dict(enumerate(categories)),
                'meta': {
                    'version': self.dataFormatVersion,
                    'region': self.mvaName,
                    'cutName': self.treeCutName,
                    'cut': self.treeCut,
                    'trainCut': self.trainCut,
                    'testCut': self.evalCut,
                    'samples': self.sampleNames,
                    'weightF': weightF,
                    'variables': ' '.join(self.MVA_Vars['Nominal'])
                    }
                }
        # add systematics variations
        # NOTE(review): only the 'train' part is exported here although the
        # 'test' arrays are filled as well -- confirm this is intended.
        for syst in systematics:
            self.data['train']['X_'+syst] = np.concatenate(arrayLists_sys[syst]['train'], axis=0)

        numpyOutputFileName = './' + self.mvaName + '.dmpz'
        with gzip.open(numpyOutputFileName, 'wb') as outputFile:
            pickle.dump(self.data, outputFile)
        print(self.data['meta'])
        print("written to:\x1b[34m", numpyOutputFileName, " \x1b[0m")
Example #47
0
# Turn the background sample list from its string form into a Python object;
# `backgrounds` is assigned earlier in the script.
# NOTE(review): eval() executes arbitrary code -- only acceptable for trusted
# configuration files.
backgrounds = eval(backgrounds)
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print('\n -----> Training Backgrounds:  %s' % (backgrounds,))



treeVarSet  = config.get(run,'treeVarSet')
#print '\n -----> Training Features: ', treeVarSet
        
#variables
#TreeVar Array
# nominal input variables: space-separated list in the treeVarSet section
MVA_Vars={}
MVA_Vars['Nominal']=config.get(treeVarSet,'Nominal')
MVA_Vars['Nominal']=MVA_Vars['Nominal'].split(' ')    

#Infofile
info = ParseInfo(samplesinfo,path)

#Workdir
workdir=ROOT.gDirectory.GetPath()

# Test and Train event cuts
# Events are split by the parity of `evt`: even -> training, odd -> evaluation.
#TrainCut = '%s & EventForTraining==1' % TCut
#EvalCut  = '%s & EventForTraining==0' % TCut
TrainCut= TCut +' & evt%2==0'
EvalCut = TCut +' & evt%2!=0'


cuts = [TrainCut,EvalCut]  

print('\n ------> with Train Cuts:  %s' % (TrainCut,))
print('                Test Cuts :  %s' % (EvalCut,))
Example #48
0
    print "Unknown Pt region"
    pt_region = 'NoSysRegion'
    #sys.exit("Unknown Pt region")
# Set rescale factor of 2 in case of TrainFlag
if TrainFlag:
    MC_rescale_factor = 2.
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('I RESCALE BY 2.0')
else:
    MC_rescale_factor = 1.
#systematics up/down
UD = ['Up', 'Down']

print('Parse the sample information')
print('============================\n')
#Parse samples configuration
info = ParseInfo(samplesinfo, path)
# get all the treeCut sets
# create different sample Lists

print('Get the sample list')
print('===================\n')
all_samples = info.get_samples(signals + backgrounds + additionals)
print('workspace_datacard-all_samples: %s' % ([job.name for job in all_samples],))

signal_samples = info.get_samples(signals)
print('signal samples: %s' % ([job.name for job in signal_samples],))

background_samples = info.get_samples(backgrounds)
# names of the data samples: space-separated list in the 'dc:<var>' section
data_sample_names = config.get('dc:%s' % var, 'data').split(' ')
print('data_sample_names are %s' % (data_sample_names,))
data_samples = info.get_samples(data_sample_names)
Example #49
0
    for item in train_list:
        submit(item,repDict)


if opts.task == 'dc':
    # entries of LimitGeneral/List (comma-separated)
    #DC_vars = config.items('Limit')
    DC_vars= (config.get('LimitGeneral','List')).split(',')
    # print(x) with a single argument prints the same on Python 2 and Python 3
    print(DC_vars)

if opts.task == 'plot':
    # entries of Plot_general/List (comma-separated)
    Plot_vars= (config.get('Plot_general','List')).split(',')

# every task except 'prep' reads the sample definitions
if opts.task != 'prep':
    path = config.get("Directories","samplepath")
    samplesinfo = config.get("Directories","samplesinfo")
    info = ParseInfo(samplesinfo,path)

if opts.task == 'plot': 
    # submit one job per entry of the plot list
    repDict['queue'] = 'all.q'
    for item in Plot_vars:
        submit(item,repDict)

if opts.task == 'trainReg':
    repDict['queue'] = 'all.q'
    submit('trainReg',repDict)


elif opts.task == 'dc':
    repDict['queue'] = 'all.q'
    for item in DC_vars:
        if 'ZH%s'%opts.mass in item:
Example #50
0
import sys
import os
from myutils.XbbConfig import XbbConfigReader, XbbConfigTools
from myutils import ParseInfo
from myutils.FileLocator import FileLocator
from myutils.XbbTools import XbbTools

# ---- command line options ----
argv = sys.argv
parser = OptionParser()
for shortFlag, longFlag, destination, defaultValue, helpText in (
        ("-T", "--tag", "tag", '', "config tag"),
        ("-D", "--directory", "directory", 'MVAout', "directory name, e.g. MVAout"),
        ("-S", "--sample", "sample", 'TT*', "sample"),
):
    parser.add_option(shortFlag, longFlag, dest=destination, default=defaultValue, help=helpText)
opts, args = parser.parse_args(argv)

# ---- configuration and sample information ----
config = XbbConfigTools(config=XbbConfigReader.read(opts.tag))
path = config.get("Directories", opts.directory)
sampleInfoDirectory = config.get('Directories', 'samplefiles')
info = ParseInfo(samples_path=path, config=config)

# only take the first sample identifier which matches the -S option
availableIdentifiers = info.getSampleIdentifiers()
requestedSamples = XbbTools.parseSamplesList(opts.sample)
matchingIdentifiers = XbbTools.filterSampleList(availableIdentifiers, requestedSamples)
sampleIdentifier = matchingIdentifiers[0]

# get list of ORIGINAL file names for this sample: /store/...
sampleTreeFileNames = XbbTools.getSampleTreeFileNames(sampleInfoDirectory, sampleIdentifier)

fileLocator = FileLocator(config=config)

# get the local name of the first file and print it
firstOriginalFileName = sampleTreeFileNames[0]
localFilename = fileLocator.getFilePath(path, sampleIdentifier, firstOriginalFileName)
print(localFilename)
Example #51
0
# ---- read the configuration ----
config = BetterConfigParser()
config.read(opts.config)
anaTag = config.get("Analysis", "tag")

# working directory and sample definitions
Wdir = config.get('Directories', 'Wdir')
samplesinfo = config.get('Directories', 'samplesinfo')

# shape systematics (kept as the raw config string)
systematics = config.get('systematics', 'systematics')

# input and output folders of the MVA evaluation
INpath = config.get('Directories', 'MVAin')
OUTpath = config.get('Directories', 'MVAout')

info = ParseInfo(samplesinfo, INpath)

# Discriminators to evaluate: the value of opts.discr (e.g. RTight_blavla,bsbsb),
# or `weight` when evaluating for an optimisation.
arglist = weight if evaluate_optimisation else opts.discr

namelistIN = opts.names
namelist = namelistIN.split(',')

print('namelist', namelist)
# sys.exit(1)
Example #52
0
class MvaTrainingHelper(object):

    def __init__(self, config, mvaName):
        """Read the training setup for one MVA from the config.

        :param config: config object providing ``get(section, option)``
        :param mvaName: name of the config section describing the MVA; the
            options ``treeVarSet``, ``MVAsettings``, ``backgrounds``,
            ``signals`` and ``treeCut`` are read from it
        :raises IndexError: if a non-empty ``MVAsettings`` entry has no ``=``
        """
        self.dataRepresentationVersion = 2
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.logpath = config.get('Directories', 'logpath')
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.mvaName = mvaName
        self.MVAsettings = config.get(mvaName,'MVAsettings')
        self.factoryname = 'scikit-test1'

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

        # variables
        self.MVA_Vars = {}
        self.MVA_Vars['Nominal'] = config.get(self.treeVarSet, 'Nominal').strip().split(' ')

        # samples
        # NOTE(review): eval() executes whatever expression the config holds;
        # this is only acceptable for trusted configuration files.
        self.backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
        self.signalSampleNames = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(self.backgroundSampleNames),
            'SIG': self.samplesInfo.get_samples(self.signalSampleNames),
        }

        # MVA signal region cuts
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/test samples
        self.datasets = ['train', 'test']
        self.varsets = ['X', 'y', 'sample_weight']
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')

        print("TRAINING CUT:", self.trainCut)
        print("TEST CUT:", self.evalCut)

        self.globalRescale = 2.0

        # default parameters
        self.parameters = {
                'factoryname': self.factoryname,
                'mvaName': self.mvaName,
                'MVAregionCut': self.treeCutName + ': ' + self.treeCut,
                #'classifier': 'GradientBoostingClassifier',
                'classifier': 'RandomForestClassifier',
                #'classifier': 'ExtraTreesClassifier',
                #'classifier': 'FT_GradientBoostingClassifier',
                'max_depth': None,
                'max_leaf_nodes': None,
                'class_weight': 'balanced',
                #'criterion': 'friedman_mse',
                'criterion': 'gini',
                #'n_estimators': 3000,
                'n_estimators': 400,
                #'learning_rate': 0.1,
                'algorithm': 'SAMME.R',
                #'min_samples_leaf': 100,
                'splitter': 'best',
                'max_features': 4,
                'subsample': 0.6,
                'limit': -1,
                'additional_signal_weight': 1.0,
                'min_impurity_split': 0.0,
                'bootstrap': True,
                }

        # load parameters from config in a format similar to Root TMVA parameter string
        # ("name1=expression1:name2=expression2:..."); every expression is
        # evaluated and overrides/extends the defaults above
        self.MVAsettingsEvaluated = []
        for mvaSetting in self.MVAsettings.split(':'):
            # tolerate empty entries (empty option, doubled or trailing ':')
            if not mvaSetting.strip():
                continue
            # split at the FIRST '=' only, so that the expression itself may
            # contain '=' (e.g. keyword arguments or comparisons)
            settingName, separator, settingExpression = mvaSetting.partition('=')
            if not separator:
                # same exception type as the former split('=')[1] lookup
                raise IndexError("malformed MVA setting (expected name=value): %r" % (mvaSetting,))
            settingName = settingName.strip()
            # NOTE(review): eval() on config text, see the note above
            self.parameters[settingName] = eval(settingExpression.strip())
            # repr() instead of '%r' % value: the latter raises TypeError when
            # the value is a tuple, which used to hide the evaluated value
            self.MVAsettingsEvaluated.append(settingName + '=' + repr(self.parameters[settingName]))

        self.MVAsettingsEvaluated = ':'.join(self.MVAsettingsEvaluated)

    # load numpy arrays with training/testing data
    def loadCachedNumpyArrays(self, cachedFilesPath):
        cached = True
        try:
            with open(cachedFilesPath + '/scikit_input.dmp', 'rb') as inputFile:
                self.data = pickle.load(inputFile)
            print("INFO: found numpy arrays for input in:", cachedFilesPath)
        except:
            cached = False
        return cached

    # save numpy arrays with training/testing data
    def writeNumpyArrays(self, cachedFilesPath):
        with open(cachedFilesPath + '/scikit_input.dmp', 'wb') as outputFile:
            pickle.dump(self.data, outputFile)
        print("INFO: wrote numpy arrays for input to:", cachedFilesPath)

    def getCachedNumpyArrayPath(self):
        identifier = self.treeCut + '__VAR:' + ' '.join(self.MVA_Vars['Nominal']) + '__SIG:' + '/'.join(self.signalSampleNames) + '__BKG:' + '/'.join(self.backgroundSampleNames) + '__V:%r'%self.dataRepresentationVersion
        varsHash = hashlib.sha224(identifier).hexdigest()
        cachedFilesPath = self.logpath + '/../cache/' + varsHash + '/'
        return cachedFilesPath

    def getHash(self):
        identifier = self.treeCut + '__VAR:' + ' '.join(self.MVA_Vars['Nominal']) + '__SIG:' + '/'.join(self.signalSampleNames) + '__BKG:' + '/'.join(self.backgroundSampleNames) + '__PAR:%r'%self.parameters
        return hashlib.sha224(identifier).hexdigest()[:8]

    def prepare(self):
        """Provide the training/testing arrays in ``self.data``.

        The arrays are loaded from the cache directory if they have been
        written there before; otherwise they are filled from the cached sample
        trees and then written to the cache directory.

        :return: ``self``
        :raises OSError: if the cache directory cannot be created
        :raises Exception: ``"CachedTreeMissing"`` if the tree cache returns no
            tree for one of the samples
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/testing trees
        # ----------------------------------------------------------------------------------------------------------------------
        self.sampleTrees = []
        categories = ['BKG', 'SIG']
        datasetParts = {'train': self.trainCut, 'test': self.evalCut}

        cachedFilesPath = self.getCachedNumpyArrayPath()
        try:
            os.makedirs(cachedFilesPath)
        except OSError:
            # An already existing cache directory is fine. Any other failure
            # is reported now instead of after the expensive array filling,
            # where writing the cache would fail anyway.
            if not os.path.isdir(cachedFilesPath):
                raise

        # load numpy arrays from disk if they have been already created
        if self.loadCachedNumpyArrays(cachedFilesPath):
            return self

        arrayLists = {datasetName: [] for datasetName in datasetParts}
        weightLists = {datasetName: [] for datasetName in datasetParts}
        targetLists = {datasetName: [] for datasetName in datasetParts}

        # standard weight expression
        weightF = self.config.get('Weights','weightF')

        # the variable list is identical for all samples
        features = self.MVA_Vars['Nominal']
        nFeatures = len(features)

        # the position in `categories` is the training target (BKG=0, SIG=1)
        for categoryIndex, category in enumerate(categories):
            for sample in self.samples[category]:
                print ('*'*80,'\n%s\n'%sample,'*'*80)
                # .items() instead of .iteritems(): works on Python 2 and 3
                for datasetName, additionalCut in datasetParts.items():
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)

                    # get ROOT tree for selected sample & region cut
                    tc = TreeCache.TreeCache(
                            sample=sample,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            config=self.config,
                            debug=True
                        )
                    sampleTree = tc.getTree()
                    if not sampleTree:
                        print ("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")

                    treeScale = sampleTree.getScale(sample) * self.globalRescale
                    print ('scale:', treeScale)

                    # initialize numpy array
                    nSamples = sampleTree.GetEntries()
                    print('nFeatures:', nFeatures)
                    inputData = np.zeros((nSamples, nFeatures), dtype=np.float32)

                    # initialize formulas for ROOT tree
                    for feature in features:
                        sampleTree.addFormula(feature)
                    sampleTree.addFormula(weightF)

                    # fill numpy array from ROOT tree
                    for i, event in enumerate(sampleTree):
                        for j, feature in enumerate(features):
                            inputData[i, j] = sampleTree.evaluate(feature)
                        # total weight comes from weightF (btag, lepton sf, ...) and treeScale to scale MC to x-section
                        totalWeight = treeScale * sampleTree.evaluate(weightF)
                        weightLists[datasetName].append(totalWeight)
                        targetLists[datasetName].append(categoryIndex)

                    arrayLists[datasetName].append(inputData)

        # concatenate all data from different samples
        self.data = {
                'train': {
                    'X': np.concatenate(arrayLists['train'], axis=0),
                    'y': np.array(targetLists['train'], dtype=np.float32),
                    'sample_weight': np.array(weightLists['train'], dtype=np.float32),
                    },
                'test': {
                    'X': np.concatenate(arrayLists['test'], axis=0),
                    'y': np.array(targetLists['test'], dtype=np.float32),
                    'sample_weight': np.array(weightLists['test'], dtype=np.float32),
                    },
                }

        # write numpy arrays to disk
        self.writeNumpyArrays(cachedFilesPath)

        return self

    def verify_data(self):
        valid = True
        for dataset in self.datasets:
            for var in self.varsets:
                print("DEBUG: self.data['{dataset}']['{var}'].shape = {shape}".format(dataset=dataset, var=var, shape=self.data[dataset][var].shape))

        for dataset in self.datasets:
            for i in range(len(self.varsets)-1):
                valid = valid and self.data[dataset][self.varsets[i]].shape[0] == self.data[dataset][self.varsets[i+1]].shape[0]
        return valid

    def run(self):

        if not self.verify_data():
            print ("\x1b[31mERROR: training input data array shapes are incompatible!\x1b[0m")
            raise Exception("BadTrainingInputData")

        applyClassWeights = False
        if self.parameters['classifier'] == 'GradientBoostingClassifier':
            clf = GradientBoostingClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    learning_rate=self.parameters['learning_rate'], 
                    subsample=self.parameters['subsample'],
                    min_impurity_split=self.parameters['min_impurity_split'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'RandomForestClassifier':
            clf = RandomForestClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    bootstrap=self.parameters['bootstrap'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'ExtraTreesClassifier':
            clf = ExtraTreesClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    bootstrap=self.parameters['bootstrap'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'FT_GradientBoostingClassifier':
            rt = RandomTreesEmbedding(max_depth=3, n_estimators=20, random_state=0)
            clf0 = GradientBoostingClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    learning_rate=self.parameters['learning_rate'], 
                    subsample=self.parameters['subsample'],
                    min_impurity_split=self.parameters['min_impurity_split'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
            clf = make_pipeline(rt, clf0)
        elif self.parameters['classifier'] == 'XGBClassifier':
            clf = XGBClassifier(
                    learning_rate=self.parameters['learning_rate'],
                    max_depth=self.parameters['max_depth'],
                    n_estimators=self.parameters['n_estimators'],
                    objective='binary:logitraw',
                    colsample_bytree=self.parameters['colsample_bytree'],
                    subsample=self.parameters['subsample'],
                    min_child_weight=self.parameters['min_child_weight'],
                    gamma=self.parameters['gamma'] if 'gamma' in self.parameters else 0.0,
                    #reg_alpha=8,
                    reg_lambda=self.parameters['reg_lambda'] if 'reg_lambda' in self.parameters else 1.0,
                    reg_alpha=self.parameters['reg_alpha'] if 'reg_alpha' in self.parameters else 0.0,
                    ) 
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'MLPClassifier':
            classifierParams = {k:v for k,v in self.parameters.iteritems() if k in ['solver', 'alpha', 'hidden_layer_sizes', 'max_iter', 'warm_start', 'learning_rate_init', 'learning_rate', 'momentum', 'epsilon', 'beta_1', 'beta_2', 'validation_fraction', 'early_stopping']}
            clf = MLPClassifier(**classifierParams) 
        elif self.parameters['classifier'] in ['SVC', 'LinearSVC']:
            '''
            clf = SVC(
                        C=1.0,
                        cache_size=4000,
                        class_weight='balanced',
                        coef0=0.0,
                        decision_function_shape='ovr',
                        degree=3,
                        gamma='auto',
                        kernel='rbf',
                        max_iter=100000,
                        probability=False,
                        random_state=None,
                        shrinking=True,
                        tol=0.001,
                        verbose=True
                    )
            '''
            bagged = int(self.parameters['bagged']) if 'bagged' in self.parameters else False
            if self.parameters['classifier'] == 'LinearSVC':
                clf = LinearSVC(
                            class_weight='balanced',
                            dual=self.parameters['dual'],
                            max_iter=self.parameters['max_iter'],
                            C=self.parameters['C'],
                            penalty=self.parameters['penalty'],
                            loss=self.parameters['loss'],
                            tol=self.parameters['tol'],
                            verbose=True,
                        )
            else:
                # classifier='SVC':C=random.choice([1.0, 10.0, 100.0, 500.0, 1000.0]):kernel=random.choice(['rbf','poly','linear']):degree=random.choice([2,3,4]):gamma=random.choice(['auto', 0.1, 0.3, 0.6]):shrinking=random.choice([True, False]):max_iter=10000:penalty=random.choice(['l1','l2']):tol=random.choice([0.005, 0.001, 0.0005, 0.0001]):cache_size=1000
                clf =  SVC(
                        C=self.parameters['C'],
                        cache_size=self.parameters['cache_size'],
                        class_weight='balanced',
                        coef0=0.0,
                        decision_function_shape='ovr',
                        degree=self.parameters['degree'],
                        gamma=self.parameters['gamma'],
                        kernel=self.parameters['kernel'],
                        max_iter=self.parameters['max_iter'],
                        probability=False,
                        random_state=None,
                        shrinking=self.parameters['shrinking'],
                        tol=self.parameters['tol'],
                        verbose=True
                    )

            if bagged:
                n_estimators = bagged
                if 'bag_oversampling' in self.parameters:
                    n_estimators = int(n_estimators * self.parameters['bag_oversampling'])

                clf0 = clf
                clf = BaggingClassifier(
                        clf0,
                        max_samples=1.0 / bagged,
                        max_features=self.parameters['baggedfeatures'] if 'baggedfeatures' in self.parameters else 1.0,
                        bootstrap_features=self.parameters['bootstrapfeatures'] if 'bootstrapfeatures' in self.parameters else False,
                        n_estimators=n_estimators,
                    )

        else:
            clf = AdaBoostClassifier(
                    DecisionTreeClassifier(
                        min_samples_leaf=self.parameters['min_samples_leaf'], 
                        max_depth=self.parameters['max_depth'], 
                        class_weight=self.parameters['class_weight'], 
                        criterion=self.parameters['criterion'],
                        splitter=self.parameters['splitter'],
                        max_features=self.parameters['max_features'],
                        ), 
                    n_estimators=self.parameters['n_estimators'], 
                    learning_rate=self.parameters['learning_rate'], 
                    algorithm=self.parameters['algorithm'],
                )

        #with open("/mnt/t3nfs01/data01/shome/berger_p2/VHbb/CMSSW_9_4_0_pre3/src/Xbb/python/logs_v25//test-scikit-svm/Logs//../cache/b7d92f50a52f8474e66cf4e2c3ad3fa4725aa489e7a6b288e4ed3855//clf2018-01-31_18-22-38_be9479a2.pkl","rb") as inputFile:
        #    clf = pickle.load(inputFile)

        # preprocessing
        print("transformation...")

        if 'scaler' in self.parameters:
            if self.parameters['scaler'] == 'standard':
                self.scaler = preprocessing.StandardScaler().fit(self.data['train']['X'])
            elif self.parameters['scaler'] == 'minmax':
                self.scaler = preprocessing.MinMaxScaler().fit(self.data['train']['X'])
            elif self.parameters['scaler'] == 'robust':
                self.scaler = preprocessing.RobustScaler().fit(self.data['train']['X'])
            else:
                self.scaler = None
        else:
            self.scaler = None

        if self.scaler:
            self.data['train']['X'] = self.scaler.transform(self.data['train']['X'])
            self.data['test']['X'] = self.scaler.transform(self.data['test']['X'])

        # SHUFFLE all samples before limiting them (currently disabled: self.shuffle is hard-coded to False below)
        self.shuffle = False
        if self.shuffle:
            print("shuffle input data...")
            for dataset in self.datasets:
                nSamples = self.data[dataset][self.varsets[0]].shape[0]
                randomPermutation = np.random.permutation(nSamples)
                for var in self.varsets:
                    self.data[dataset][var] = np.take(self.data[dataset][var], randomPermutation, axis=0)

        # LIMIT the number of samples (applied to every dataset in self.datasets)
        # It is recommended to shuffle the samples first, because they are ordered by signal/background.
        limitNumTrainingSamples = self.parameters['limit']
        if (limitNumTrainingSamples > 0):
            print("limit training samples to:", limitNumTrainingSamples)
            #for dataset in self.datasets:
            #    for var in self.varsets:
            #        self.data[dataset][var] = self.data[dataset][var][0:limitNumTrainingSamples]
            for dataset in self.datasets:
                self.data[dataset] = resample(self.data[dataset], n_samples=limitNumTrainingSamples, replace=False)

        # oversample
        upscale = self.parameters['upscalefactor'] if 'upscalefactor' in self.parameters else None
        if upscale:
            upscalemax =  self.parameters['upscalemax'] if 'upscalemax' in self.parameters else 10 
            upscalesignal = self.parameters['upscalefactorsignal'] if 'upscalefactorsignal' in self.parameters else 1.0 #upscalefactorsignal
            indices = []
            for i in range(len(self.data['train']['sample_weight'])):
                #print(x)
                x= self.data['train']['sample_weight'][i]
                if self.data['train']['y'][i] > 0.5:
                    x *= upscalesignal
                n = x * upscale
                # limit oversampling factor!
                if n > upscalemax:
                    n=upscalemax
                if n<1:
                    n=1
                intN = int(n)
                indices += [i]*intN
                #floatN = n-intN
                #if floatN > 0:
                #    if random.uniform(0.0,1.0) < floatN:
                #        indices += [i]

            self.data['train']['X'] = self.data['train']['X'][indices]
            self.data['train']['y'] = self.data['train']['y'][indices]
            self.data['train']['sample_weight'] = self.data['train']['sample_weight'][indices]
            self.verify_data()

        # BALANCE weights
        # calculate total weights and class_weights
        nSig = len([x for x in self.data['train']['y'] if x >= 0.5])
        nBkg = len([x for x in self.data['train']['y'] if x < 0.5])
        print("#SIG:", nSig)
        print("#BKG:", nBkg)
        weightsSignal = []
        weightsBackground = []
        for i in range(len(self.data['train']['sample_weight'])):
            if self.data['train']['y'][i] < 0.5:
                weightsBackground.append(self.data['train']['sample_weight'][i])
            else:
                weightsSignal.append(self.data['train']['sample_weight'][i])
        weightsSignal.sort()
        weightsBackground.sort()
        totalWeightSignal = sum(weightsSignal)
        totalWeightBackground = sum(weightsBackground)
        signalReweight = (totalWeightSignal+totalWeightBackground)/totalWeightSignal * self.parameters['additional_signal_weight']
        backgroundReweight = (totalWeightSignal+totalWeightBackground)/totalWeightBackground
        print("SUM of weights for signal:", totalWeightSignal)
        print("SUM of weights for background:", totalWeightBackground)
        
        if applyClassWeights:
            print("re-weight signals by:", signalReweight)
            print("re-weight background by:", backgroundReweight)
            for i in range(len(self.data['train']['sample_weight'])):
                if self.data['train']['y'][i] < 0.5:
                    self.data['train']['sample_weight'][i] *= backgroundReweight
                else:
                    self.data['train']['sample_weight'][i] *= signalReweight
        else:
            print("DO NOT re-weight signals by:", signalReweight)
        print("...")
        # TRAINING

        learningCurve = []
        if self.parameters['classifier'] == 'XGBClassifier':
            clf = clf.fit(self.data['train']['X'], self.data['train']['y'], self.data['train']['sample_weight'], verbose=True)
        else:
            try:
                clf = clf.fit(**self.data['train'])
            except:
                clf = clf.fit(X=self.data['train']['X'], y=self.data['train']['y'])
                
                if 'rounds' in self.parameters and self.parameters['rounds'] > 1:
                    for rNumber in range(self.parameters['rounds']):
                        results = clf.predict_proba(self.data['test']['X']) 
                        auc1 = roc_auc_score(self.data['test']['y'], results[:,1], sample_weight=self.data['test']['sample_weight'])
                        print(" round ", rNumber, " AUC=", auc1)
                        learningCurve.append(auc1)
                        clf = clf.fit(X=self.data['train']['X'], y=self.data['train']['y'])

        print("***** FIT done")

        # TEST
        try:
            results = clf.decision_function(self.data['test']['X'])
            print("***** EVALUATION on test sample done")
            results_train = clf.decision_function(self.data['train']['X'])
            print("***** EVALUATION on training sample done")

            print("R:", results.shape, results)

            results = np.c_[np.ones(results.shape[0]), results]
            results_train = np.c_[np.ones(results_train.shape[0]), results_train]
        except:
            results = clf.predict_proba(self.data['test']['X'])
            results_train = clf.predict_proba(self.data['train']['X'])

        # ROC curve
        print("calculating auc...")
        auc1 = roc_auc_score(self.data['test']['y'], results[:,1], sample_weight=self.data['test']['sample_weight'])
        auc_training = roc_auc_score(self.data['train']['y'], results_train[:,1], sample_weight=self.data['train']['sample_weight'])
        print("AUC:", auc1, " (training:", auc_training, ")")

        print("**** compute quantiles")
        qx = np.array([0.01, 0.99])
        qy = np.array([0.0, 0.0])
        thq = ROOT.TH1D("quant","quant",500000,-5.0,5.0)
        nS = len(results)
        for i in range(nS):
            thq.Fill(results[i][1])
        thq.GetQuantiles(2, qy, qx)

        # rescaling of SCORE to [0, 1]
        minProb = 2.0
        maxProb = -1.0
        #for i in range(len(self.data['train']['X'])):
        #    if results_train[i][1] > maxProb:
        #        maxProb = results_train[i][1]
        #    if results_train[i][1] < minProb:
        #        minProb = results_train[i][1]
        #for i in range(len(self.data['test']['X'])):
        #    if results[i][1] > maxProb:
        #        maxProb = results[i][1]
        #    if results[i][1] < minProb:
        #        minProb = results[i][1]

        minProb = qy[0]
        maxProb = qy[1]
        delta = maxProb-minProb
        minProb -= delta * 0.01
        maxProb += delta * 0.10

        useSqrt = False

        # fill TRAINING SCORE histogram (class probability)
        h1t = ROOT.TH1D("h1t","h1t",50,0.0,1.0)
        h2t = ROOT.TH1D("h2t","h2t",50,0.0,1.0)
        for i in range(len(self.data['train']['X'])):
            result = (results_train[i][1]-minProb)/(maxProb-minProb)