Python ParseInfo Examples

Programming Language: Python

Namespace/Package Name: myutils

Class/Type: ParseInfo

Examples at hotexamples.com: 52

Python ParseInfo - 52 examples found. These are the top-rated real-world Python examples of myutils.ParseInfo, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

ParseInfo(20)

get_samples(13)

MVAsettings(1)

MVAtype(1)

checkSplittedSampleName(1)

factoryname(1)

factorysettings(1)

getSampleIdentifiers(1)

get_sample(1)

path(1)

vars(1)

varset(1)

weightfilepath(1)

Example #1

Show file

    def __init__(self,
                 config,
                 sampleIdentifier,
                 trainingRegions,
                 splitFilesChunks=1,
                 chunkNumber=1,
                 splitFilesChunkSize=-1,
                 force=False):
        """Read sample, cut and variable definitions for the training regions.

        Args:
            config: configuration object queried via ``get(section, option)``.
            sampleIdentifier: stored unchanged as ``self.sampleIdentifier``.
            trainingRegions: config section names, one per training region;
                iterated several times, so it must be re-iterable.
            splitFilesChunks: stored unchanged as ``self.splitFilesChunks``.
            chunkNumber: stored unchanged as ``self.chunkNumber``.
            splitFilesChunkSize: stored unchanged as
                ``self.splitFilesChunkSize``.
            force: stored unchanged as ``self.force``.
        """
        # constructor arguments are kept as they are
        self.config = config
        self.sampleIdentifier = sampleIdentifier
        self.trainingRegions = trainingRegions
        self.force = force
        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.splitFilesChunkSize = splitFilesChunkSize

        # sample definitions
        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'MVAin')
        self.samplesDefinitions = self.config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        # concatenate the background lists of all regions, then drop duplicates
        # NOTE(review): the option values are passed to eval()
        mergedBackgrounds = []
        for trainingRegion in self.trainingRegions:
            mergedBackgrounds = mergedBackgrounds + eval(
                self.config.get(trainingRegion, 'backgrounds'))
        self.backgroundSampleNames = list(set(mergedBackgrounds))

        # same for the signals
        mergedSignals = []
        for trainingRegion in self.trainingRegions:
            mergedSignals = mergedSignals + eval(
                self.config.get(trainingRegion, 'signals'))
        self.signalSampleNames = list(set(mergedSignals))

        uniqueSampleNames = list(
            set(self.backgroundSampleNames + self.signalSampleNames))
        self.samples = self.samplesInfo.get_samples(uniqueSampleNames)

        # per region: the cut expression and the variables of all systematics
        self.trainingRegionsDict = {}
        for trainingRegion in self.trainingRegions:
            cutName = config.get(trainingRegion, 'treeCut')
            varSetName = config.get(trainingRegion, 'treeVarSet').strip()
            systematics = []
            for token in config.get('systematics', 'systematics').split(' '):
                # skip the empty tokens produced by repeated blanks
                if token.strip():
                    systematics.append(token)
            regionVars = []
            for systematic in systematics:
                regionVars.extend(
                    config.get(varSetName, systematic).strip().split(' '))
            self.trainingRegionsDict[trainingRegion] = {
                'cut': config.get('Cuts', cutName),
                'vars': regionVars,
            }

        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')

        # load the shared library named in the config
        libraryName = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(libraryName)

Example #2

Show file

File: cache_plot.py Project: GLP90/Xbb

    def __init__(self, config, sampleIdentifier, regions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, forceRedo=False, fileList=None):
        """Read the sample and cut definitions needed for the plot regions.

        Args:
            config: configuration object queried via ``get(section, option)``.
            sampleIdentifier: stored unchanged as ``self.sampleIdentifier``.
            regions: region names; duplicates are removed and each name is
                looked up as an option of the 'Cuts' section.
            splitFilesChunks: stored unchanged as ``self.splitFilesChunks``.
            chunkNumber: stored unchanged as ``self.chunkNumber``.
            splitFilesChunkSize: stored unchanged as
                ``self.splitFilesChunkSize``.
            forceRedo: stored unchanged as ``self.forceRedo``.
            fileList: if truthy, passed through ``FileList.decompress``;
                otherwise ``self.fileList`` is None.
        """
        self.config = config
        self.sampleIdentifier = sampleIdentifier
        self.forceRedo = forceRedo
        self.regions = list(set(regions))

        # sample definitions
        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = self.config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        # MC and data sample names are read together
        # NOTE(review): the option values are passed to eval()
        self.sampleNames = eval(self.config.get('Plot_general', 'samples'))
        self.dataNames = eval(self.config.get('Plot_general', 'Data'))
        requestedNames = self.sampleNames + self.dataNames
        self.samples = self.samplesInfo.get_samples(requestedNames)

        # one cut expression per region
        self.regionsDict = {
            regionName: {'cut': config.get('Cuts', regionName)}
            for regionName in self.regions
        }

        self.splitFilesChunkSize = splitFilesChunkSize
        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        if fileList:
            self.fileList = FileList.decompress(fileList)
        else:
            self.fileList = None

        # load the shared library and report the outcome
        libraryName = config.get('VHbbNameSpace', 'library')
        loadStatus = ROOT.gSystem.Load(libraryName)
        if loadStatus == 0:
            print("INFO: loaded VHbbNameSpace: %s" % libraryName)
        else:
            print("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m" % loadStatus)

Example #3

Show file

    def __init__(self, config, mvaName):
        """Read everything needed to train the MVA defined in section ``mvaName``.

        Args:
            config: configuration object queried via ``get(section, option)``.
            mvaName: name of the config section describing the training; also
                used in the name of the training output file.
        """
        self.config = config
        self.mvaName = mvaName

        # factory settings
        self.factoryname = config.get('factory', 'factoryname')
        self.factorysettings = config.get('factory', 'factorysettings')

        # sample definitions
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')

        # settings of this particular training
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.MVAtype = config.get(mvaName, 'MVAtype')
        self.MVAsettings = config.get(mvaName, 'MVAsettings')

        # load the shared library named in the config
        libraryName = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(libraryName)

        # input variables: blank-separated list of the 'Nominal' option
        nominalVars = config.get(self.treeVarSet, 'Nominal').strip().split(' ')
        self.MVA_Vars = {'Nominal': nominalVars}

        # samples
        # NOTE(review): the option values are passed to eval()
        bkgNames = eval(config.get(mvaName, 'backgrounds'))
        sigNames = eval(config.get(mvaName, 'signals'))
        bkgSamples = self.samplesInfo.get_samples(bkgNames)
        sigSamples = self.samplesInfo.get_samples(sigNames)
        self.samples = {'BKG': bkgSamples, 'SIG': sigSamples}

        # region cut and the train/eval split
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)
        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')
        print("TRAINING CUT:", self.TrainCut)
        print("EVAL CUT:", self.EvalCut)

        self.globalRescale = 2.0

        outputNameTemplate = 'mvatraining_{factoryname}_{region}.root'
        self.trainingOutputFileName = outputNameTemplate.format(
            factoryname=self.factoryname, region=mvaName)
        print("INFO: MvaTrainingHelper class created.")

Example #4

Show file

File: cache_plot.py Project: acalandr/Xbb

    def __init__(self,
                 config,
                 sampleIdentifier,
                 regions,
                 splitFilesChunks=1,
                 chunkNumber=1,
                 splitFilesChunkSize=-1,
                 forceRedo=False,
                 fileList=None):
        """Read the sample and cut definitions needed for the plot regions.

        Args:
            config: configuration object queried via ``get(section, option)``;
                also handed to ``ParseInfo``.
            sampleIdentifier: stored unchanged as ``self.sampleIdentifier``.
            regions: region names; duplicates are removed and each name is
                looked up as an option of the 'Cuts' section.
            splitFilesChunks: stored unchanged as ``self.splitFilesChunks``.
            chunkNumber: stored unchanged as ``self.chunkNumber``.
            splitFilesChunkSize: stored unchanged as
                ``self.splitFilesChunkSize``.
            forceRedo: stored unchanged as ``self.forceRedo``.
            fileList: if truthy, passed through ``FileList.decompress``;
                otherwise ``self.fileList`` is None.
        """
        self.config = config
        self.sampleIdentifier = sampleIdentifier
        self.forceRedo = forceRedo
        self.regions = list(set(regions))

        # sample definitions
        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'plottingSamples')
        self.samplesInfo = ParseInfo(
            samples_path=self.samplesPath, config=self.config)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        # MC and data sample names, converted to lists so they can be joined
        # NOTE(review): the option values are passed to eval()
        mcNames = eval(self.config.get('Plot_general', 'samples'))
        self.sampleNames = list(mcNames)
        dataNames = eval(self.config.get('Plot_general', 'Data'))
        self.dataNames = list(dataNames)
        requestedNames = self.sampleNames + self.dataNames
        self.samples = self.samplesInfo.get_samples(requestedNames)

        # one cut expression per region
        self.regionsDict = {
            regionName: {'cut': config.get('Cuts', regionName)}
            for regionName in self.regions
        }

        self.splitFilesChunkSize = splitFilesChunkSize
        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        if fileList:
            self.fileList = FileList.decompress(fileList)
        else:
            self.fileList = None

        # load the shared library and report the outcome
        libraryName = config.get('VHbbNameSpace', 'library')
        loadStatus = ROOT.gSystem.Load(libraryName)
        if loadStatus == 0:
            print("INFO: loaded VHbbNameSpace: %s" % libraryName)
        else:
            print(
                "\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"
                % loadStatus)

Example #5

Show file

File: make_skims.py Project: GLP90/Xbb

    def __init__(self, config, region, sampleIdentifier=None, opts=None):
        """Read paths, cuts and sample lists for one plot region.

        Args:
            config: configuration object providing ``get`` and ``has_option``.
            region: region name; the section 'Plot:<region>' is read.
            sampleIdentifier: optional comma-separated sample identifiers. If
                given, only samples whose ``identifier`` is listed are kept.
            opts: object whose ``inputDir`` and ``outputDir`` attributes name
                the options of the 'Directories' section to use as input and
                output path. Required despite the ``None`` default.

        Raises:
            AttributeError: if ``opts`` lacks ``inputDir`` or ``outputDir``.
        """
        # opts is dereferenced below; fail up front with a readable message
        # instead of "'NoneType' object has no attribute 'inputDir'" after the
        # library has already been loaded
        if not (hasattr(opts, 'inputDir') and hasattr(opts, 'outputDir')):
            raise AttributeError(
                "opts must provide the attributes 'inputDir' and 'outputDir'")

        self.config = config
        self.region = region
        self.sampleIdentifiers = sampleIdentifier.split(',') if sampleIdentifier else None

        # VHbb namespace
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m" % returnCode)
        else:
            print("INFO: loaded VHbbNameSpace: %s" % VHbbNameSpace)

        # input/output paths
        self.fileLocator = FileLocator(config=self.config)
        self.pathIN = self.config.get('Directories', opts.inputDir)
        self.pathOUT = self.config.get('Directories', opts.outputDir)
        self.tmpDir = self.config.get('Directories', 'scratch')

        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')

        # plot regions
        self.configSection = 'Plot:%s' % region

        # additional cut to only plot a subset of the region
        self.subcut = None
        if self.config.has_option(self.configSection, 'subcut'):
            self.subcut = self.config.get(self.configSection, 'subcut')
            print("INFO: use cut:", self.subcut)

        # additional global blinding cut (contained in plots, cut on the event number)
        self.addBlindingCut = None
        if self.config.has_option('Plot_general', 'addBlindingCut'):
            self.addBlindingCut = self.config.get('Plot_general', 'addBlindingCut')
            print('adding add. blinding cut:', self.addBlindingCut)

        # load samples
        # NOTE(review): the option values are passed to eval(); only use trusted config files
        self.data = eval(self.config.get(self.configSection, 'Datas'))  # data of this region's section
        self.mc = eval(self.config.get('Plot_general', 'samples'))  # list of MC samples
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        self.signalRegion = False
        if self.config.has_option(self.configSection, 'Signal'):
            self.mc.append(self.config.get(self.configSection, 'Signal'))
            self.signalRegion = True
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)

        # filter samples used in the plot
        if self.sampleIdentifiers:
            self.dataSamples = [x for x in self.dataSamples if x.identifier in self.sampleIdentifiers]
            self.mcSamples = [x for x in self.mcSamples if x.identifier in self.sampleIdentifiers]

Example #6

Show file

    def customInit(self, initVars):
        """Register the group-membership, sample-index and event-number branches.

        Reads ``self.groupDict``, ``self.prefix``, ``self.branches`` and
        ``self.eventCountsDict``, which must have been set before this call.

        Args:
            initVars: dict with the keys 'sample', 'sampleTree' and 'config'.

        Raises:
            Exception: "SampleGroup__customInit__not_implemented" if
                ``self.eventCountsDict`` is set and the sample tree does not
                consist of exactly one file.
        """
        self.sample = initVars['sample']
        self.sampleTree = initVars['sampleTree']
        self.config = initVars['config']
        self.samplesInfo = ParseInfo(
            samples_path=self.config.get('Directories', 'dcSamples'),
            config=self.config)

        # subsamples that belong to the sample being processed
        self.subsamples = [
            x for x in self.samplesInfo
            if x.identifier == self.sample.identifier and x.subsample
        ]
        print("INFO: subsamples/cut")
        for s in self.subsamples:
            print(" >", s.name, s.subcut)
            self.sampleTree.addFormula(s.subcut)

        # fall back to the grouping from the config if none was given
        # NOTE(review): the option value is passed to eval()
        if not self.groupDict:
            self.groupDict = eval(self.config.get('LimitGeneral', 'Group'))

        # invert the sample -> group mapping; items() works on Python 2 and 3
        self.groupNames = list(set(self.groupDict.values()))
        self.groups = {
            k: [x for x, y in self.groupDict.items() if y == k]
            for k in self.groupNames
        }

        # one membership branch per group (only the group name is needed here)
        for groupName in self.groups:
            self.branches.append({
                'name': self.prefix + groupName,
                'formula': self.isInGroup,
                'arguments': groupName
            })

        self.branches.append({
            'name': 'sampleIndex',
            'formula': self.getSampleIndex,
            'type': 'i'
        })

        if self.eventCountsDict:
            self.branches.append({
                'name': 'event_unique',
                'formula': self.getEventNumber,
                'type': 'l'
            })

            # the offset is looked up per file, so exactly one file is required
            if len(self.sampleTree.sampleFileNames) != 1:
                print(
                    "ERROR: adding unique event numbers for chains is not implemented!"
                )
                raise Exception("SampleGroup__customInit__not_implemented")
            self.eventNumberOffset = self.eventCountsDict[
                self.sample.identifier][self.sampleTree.sampleFileNames[0]]

Example #7

Show file

File: run_plot.py Project: perrozzi/Xbb

    def __init__(self, config, region, vars=None, title=None):
        """Read cuts, variables and sample lists for one plot region.

        Args:
            config: configuration object providing ``get`` and ``has_option``.
            region: region name; the section 'Plot:<region>' is read.
            vars: optional list of variable names to plot. If empty or not
                given, the comma-separated 'vars' option of the region's
                section is used instead.
            title: optional plot title; an empty value is stored as None.
        """
        self.config = config
        self.region = region
        self.vars = vars
        self.title = title if title else None

        # VHbb namespace
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m" % returnCode)
        else:
            print("INFO: loaded VHbbNameSpace: %s" % VHbbNameSpace)

        # additional blinding cut (contained in plots, cut on the event number)
        self.addBlindingCut = None
        if self.config.has_option('Plot_general', 'addBlindingCut'):
            self.addBlindingCut = self.config.get('Plot_general', 'addBlindingCut')
            print('adding add. blinding cut:', self.addBlindingCut)

        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')

        # plot regions
        self.configSection = 'Plot:%s' % region

        # clean up a user-supplied variable list; isinstance also accepts
        # list subclasses, which an exact type comparison would skip
        if self.vars and isinstance(self.vars, list):
            self.vars = [x.strip() for x in self.vars if len(x.strip()) > 0]

        # nothing usable given: take the variables from the config
        if not self.vars:
            varListFromConfig = self.config.get(self.configSection, 'vars').split(',')
            print("VARS::", self.configSection, " => ", varListFromConfig)
            self.vars = [x.strip() for x in varListFromConfig if len(x.strip()) > 0]

        # load samples
        # NOTE(review): the option values are passed to eval(); only use trusted config files
        self.data = eval(self.config.get(self.configSection, 'Datas'))  # data of this region's section
        self.mc = eval(self.config.get('Plot_general', 'samples'))  # list of MC samples
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        self.signalRegion = False
        if self.config.has_option(self.configSection, 'Signal'):
            self.mc.append(self.config.get(self.configSection, 'Signal'))
            self.signalRegion = True
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)

        self.groupDict = eval(self.config.get('Plot_general', 'Group'))
        self.subcutPlotName = ''
        self.histogramStacks = {}

Example #8

Show file

File: cache_training.py Project: GLP90/Xbb

    def __init__(self, config, sampleIdentifier, trainingRegions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, force=False):
        """Read sample, cut, variable and weight definitions for the training regions.

        Args:
            config: configuration object providing ``get`` and ``has_option``.
            sampleIdentifier: stored unchanged as ``self.sampleIdentifier``.
            trainingRegions: config section names, one per training region;
                iterated several times, so it must be re-iterable.
            splitFilesChunks: stored unchanged as ``self.splitFilesChunks``.
            chunkNumber: stored unchanged as ``self.chunkNumber``.
            splitFilesChunkSize: stored unchanged as
                ``self.splitFilesChunkSize``.
            force: stored unchanged as ``self.force``.
        """
        # constructor arguments are kept as they are
        self.config = config
        self.sampleIdentifier = sampleIdentifier
        self.trainingRegions = trainingRegions
        self.force = force
        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.splitFilesChunkSize = splitFilesChunkSize

        # sample definitions
        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'MVAin')
        self.samplesDefinitions = self.config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        # concatenate the background lists of all regions, then drop duplicates
        # NOTE(review): the option values are passed to eval()
        mergedBackgrounds = []
        for trainingRegion in self.trainingRegions:
            mergedBackgrounds = mergedBackgrounds + eval(self.config.get(trainingRegion, 'backgrounds'))
        self.backgroundSampleNames = list(set(mergedBackgrounds))

        # same for the signals
        mergedSignals = []
        for trainingRegion in self.trainingRegions:
            mergedSignals = mergedSignals + eval(self.config.get(trainingRegion, 'signals'))
        self.signalSampleNames = list(set(mergedSignals))

        uniqueSampleNames = list(set(self.backgroundSampleNames + self.signalSampleNames))
        self.samples = self.samplesInfo.get_samples(uniqueSampleNames)

        # per region: cut expression, nominal variables and systematic weights
        self.trainingRegionsDict = {}
        for trainingRegion in self.trainingRegions:
            cutName = config.get(trainingRegion, 'treeCut')
            varSetName = config.get(trainingRegion, 'treeVarSet').strip()

            # the list of systematics is optional for a region
            if config.has_option(trainingRegion, 'systematics'):
                systematics = eval(config.get(trainingRegion, 'systematics'))
            else:
                systematics = []

            nominalVars = config.get(varSetName, 'Nominal').split(' ')

            # each systematic contributes an up and a down weight; the
            # upper-case suffix is used if present, the capitalized one otherwise
            weightVars = []
            for syst in systematics:
                upName = syst + '_UP'
                if not self.config.has_option('Weights', upName):
                    upName = syst + '_Up'
                downName = syst + '_DOWN'
                if not self.config.has_option('Weights', downName):
                    downName = syst + '_Down'
                weightVars.append(self.config.get('Weights', upName))
                weightVars.append(self.config.get('Weights', downName))

            self.trainingRegionsDict[trainingRegion] = {
                'cut': config.get('Cuts', cutName),
                'vars': nominalVars,
                'weightVars': weightVars,
            }

        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')

        # load the shared library named in the config
        libraryName = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(libraryName)

Example #9

Show file

File: run_training.py Project: perrozzi/Xbb

    def __init__(self, config, mvaName):
        """Collect the configuration of the MVA training described by ``mvaName``.

        Args:
            config: configuration object queried via ``get(section, option)``.
            mvaName: config section of the training; it also becomes part of
                the training output file name.
        """
        self.config = config
        self.mvaName = mvaName

        # TMVA factory
        self.factoryname = config.get('factory', 'factoryname')
        self.factorysettings = config.get('factory', 'factorysettings')

        # where the samples are and how they are defined
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')

        # method type, settings and variable set of this training
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.MVAtype = config.get(mvaName, 'MVAtype')
        self.MVAsettings = config.get(mvaName, 'MVAsettings')

        # shared library named in the config
        sharedLibrary = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(sharedLibrary)

        # variables: the 'Nominal' option is a blank-separated list
        nominalOption = config.get(self.treeVarSet, 'Nominal')
        self.MVA_Vars = {'Nominal': nominalOption.strip().split(' ')}

        # samples, split into background and signal
        # NOTE(review): the option values are passed to eval()
        namesOfBackgrounds = eval(config.get(mvaName, 'backgrounds'))
        namesOfSignals = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(namesOfBackgrounds),
            'SIG': self.samplesInfo.get_samples(namesOfSignals),
        }

        # cuts: region selection plus the train/eval split
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)
        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')
        print("TRAINING CUT:", self.TrainCut)
        print("EVAL CUT:", self.EvalCut)

        self.globalRescale = 2.0

        fileNamePattern = 'mvatraining_{factoryname}_{region}.root'
        self.trainingOutputFileName = fileNamePattern.format(factoryname=self.factoryname, region=mvaName)
        print("INFO: MvaTrainingHelper class created.")

Example #10

Show file

File: write_numpy_array_for_training.py Project: perrozzi/Xbb

    def __init__(self, config, mvaName):
        """Read cuts, variables and sample lists for exporting training data.

        Args:
            config: configuration object queried via ``get(section, option)``.
            mvaName: config section describing the training.
        """
        self.mvaName = mvaName
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
        self.dataFormatVersion = 2
        self.sampleTrees = []
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)

        # region
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/eval sets
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')
        # rescale MC by 2 because of train/eval split
        self.globalRescale = 2.0

        # variables and systematics
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        # drop empty tokens (empty option, repeated blanks): they would be
        # looked up as an option named '' below
        self.systematics = [
            x for x in config.get('systematics', 'systematics').strip().split(' ')
            if len(x.strip()) > 0
        ]
        self.MVA_Vars = {'Nominal': [x for x in config.get(self.treeVarSet, 'Nominal').strip().split(' ') if len(x.strip()) > 0]}
        # loop variable deliberately not called 'sys' to avoid hiding the module name
        for systematic in self.systematics:
            self.MVA_Vars[systematic] = [x for x in config.get(self.treeVarSet, systematic).strip().split(' ') if len(x.strip()) > 0]

        # samples
        # NOTE(review): the option values are passed to eval(); only use trusted config files
        # further categories can be added in the same way, e.g.
        # 'BKG_TT': eval(self.config.get('Plot_general', 'TT'))
        self.sampleNames = {
            'SIG_ALL': eval(self.config.get('Plot_general', 'allSIG')),
            'BKG_ALL': eval(self.config.get('Plot_general', 'allBKG')),
        }
        # items() instead of iteritems(): works on Python 2 and Python 3
        self.samples = {category: self.samplesInfo.get_samples(samples) for category, samples in self.sampleNames.items()}

Example #11

Show file

File: run_training.py Project: perrozzi/Xbb

class MvaTrainingHelper(object):

    def __init__(self, config, mvaName):
        """Gather all settings for training the MVA of config section ``mvaName``.

        Args:
            config: configuration object queried via ``get(section, option)``.
            mvaName: config section describing the training; it is also used
                to build the name of the training output file.
        """
        self.config = config
        self.mvaName = mvaName

        # factory name and options
        self.factoryname = config.get('factory', 'factoryname')
        self.factorysettings = config.get('factory', 'factorysettings')

        # sample location and definitions
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')

        # what to train and with which settings
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.MVAtype = config.get(mvaName, 'MVAtype')
        self.MVAsettings = config.get(mvaName, 'MVAsettings')

        # shared library named in the config
        namespaceLibrary = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(namespaceLibrary)

        # variables, given as blank-separated list in the 'Nominal' option
        variableNames = config.get(self.treeVarSet, 'Nominal').strip().split(' ')
        self.MVA_Vars = {'Nominal': variableNames}

        # samples
        # NOTE(review): the option values are passed to eval()
        backgroundNames = eval(config.get(mvaName, 'backgrounds'))
        signalNames = eval(config.get(mvaName, 'signals'))
        backgroundSamples = self.samplesInfo.get_samples(backgroundNames)
        signalSamples = self.samplesInfo.get_samples(signalNames)
        self.samples = {'BKG': backgroundSamples, 'SIG': signalSamples}

        # region cut and the cuts defining the train/eval split
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)
        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')
        print("TRAINING CUT:", self.TrainCut)
        print("EVAL CUT:", self.EvalCut)

        self.globalRescale = 2.0

        rootFileTemplate = 'mvatraining_{factoryname}_{region}.root'
        self.trainingOutputFileName = rootFileTemplate.format(factoryname=self.factoryname, region=mvaName)
        print("INFO: MvaTrainingHelper class created.")


    def prepare(self):
        """Create the TMVA factory and register trees and input variables.

        Opens the training output file, creates the factory, adds one training
        and one testing tree per background/signal sample (taken from the tree
        cache) and declares the nominal input variables.

        Returns:
            self, so that calls can be chained.

        Raises:
            Exception: "CachedTreeMissing" if the tree cache returns no tree
                for a sample.
        """
        self.trainingOutputFile = ROOT.TFile.Open(self.trainingOutputFileName, "RECREATE")

        # ----------------------------------------------------------------------------------------------------------------------
        # create TMVA factory
        # ----------------------------------------------------------------------------------------------------------------------
        self.factory = ROOT.TMVA.Factory(self.factoryname, self.trainingOutputFile, self.factorysettings)
        if self.trainingOutputFile and self.factory:
            print("INFO: initialized MvaTrainingHelper.", self.factory)
        else:
            print("\x1b[31mERROR: initialization of MvaTrainingHelper failed!\x1b[0m")

        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/eval trees
        # ----------------------------------------------------------------------------------------------------------------------
        # use the factory's own AddBackgroundTree/AddSignalTree if it has them,
        # otherwise go through a DataLoader; only a missing attribute may
        # trigger the fallback, any other error is a real problem
        try:
            addBackgroundTreeMethod = self.factory.AddBackgroundTree
            addSignalTreeMethod = self.factory.AddSignalTree
            self.dataLoader = None
        except AttributeError:
            print("oh no..")
            # the DataLoader wants to be called '.'
            self.dataLoader = ROOT.TMVA.DataLoader(".")
            addBackgroundTreeMethod = self.dataLoader.AddBackgroundTree
            addSignalTreeMethod = self.dataLoader.AddSignalTree

        # DEBUG: restrict memory
        # resource.setrlimit(resource.RLIMIT_AS, (4.0*1024*1024*1024, 5.0*1024*1024*1024))

        # pair each cut with its tree type explicitly; comparing the cut
        # strings instead would label both trees as training if they are equal
        cutsAndTreeTypes = [
            (self.TrainCut, ROOT.TMVA.Types.kTraining),
            (self.EvalCut, ROOT.TMVA.Types.kTesting),
        ]

        self.sampleTrees = []
        for addTreeFcn, samples in [
                    [addBackgroundTreeMethod, self.samples['BKG']],
                    [addSignalTreeMethod, self.samples['SIG']]
                ]:
            for sample in samples:
                print('*'*80, '\n%s\n' % sample, '*'*80)
                for additionalCut, treeType in cutsAndTreeTypes:
                    # cuts: subsample cut, train/eval cut, cut from the mva region
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)

                    tc = TreeCache.TreeCache(
                            sample=sample,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            config=self.config,
                            debug=True
                        )
                    sampleTree = tc.getTree()

                    # check before touching sampleTree.tree, so that a missing
                    # tree gives this message and not an AttributeError
                    if not sampleTree:
                        print("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")

                    sampleTree.tree.SetCacheSize(32*1024)

                    # prevent garbage collection
                    self.sampleTrees.append(sampleTree)

                    treeScale = sampleTree.getScale(sample) * self.globalRescale

                    # only non-empty trees can be added
                    if sampleTree.tree.GetEntries() > 0:
                        addTreeFcn(sampleTree.tree, treeScale, treeType)
                        print('max mem used = %d' % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))

        # variables are declared on the data loader if one is used
        variableTarget = self.dataLoader if self.dataLoader else self.factory
        for var in self.MVA_Vars['Nominal']:
            variableTarget.AddVariable(var, 'D')

        return self

    # ----------------------------------------------------------------------------------------------------------------------
    # backup old .xml and .info files 
    # ----------------------------------------------------------------------------------------------------------------------
    def backupOldFiles(self):
        success = False
        MVAdir = self.config.get('Directories','vhbbpath')+'/python/weights/'
        backupDir = MVAdir + 'backup/'
        try:
            os.makedirs(backupDir)
        except:
            pass
        freeNumber = 1
        try:
            lastUsedBackupDirectories = sorted(glob.glob(backupDir + '/v*/'), key=lambda x: int(x.strip('/').split('/')[-1][1:]), reverse=True)
            freeNumber = 1 + int(lastUsedBackupDirectories[0].strip('/').split('/')[-1][1:]) if len(lastUsedBackupDirectories) > 0 else 1
        except Exception as e:
            print("\x1b[31mERROR: creating backup of MVA files failed!", e, "\x1b[0m")
            freeNumber = -1
        if freeNumber > -1:
            try:
                fileNamesToBackup = glob.glob(MVAdir + self.factoryname+'_'+self.mvaName + '.*')
                fileNamesToBackup += glob.glob(MVAdir + '/../mvatraining_MVA_ZllBDT_*.root')
                os.makedirs(backupDir + 'v%d/'%freeNumber)
                for fileNameToBackup in fileNamesToBackup:
                    shutil.copy(fileNameToBackup, backupDir + 'v%d/'%freeNumber)
                success = True
            except Exception as e:
                print("\x1b[31mERROR: creating backup of MVA files failed!", e, "\x1b[0m")
        return success


    def run(self):
        """Book the MVA method and run TMVA training, testing and evaluation.

        Optionally backs up the old files first (option 'backupWeights' of
        section 'MVAGeneral'), then books the method on the factory, runs
        TrainAllMethods, TestAllMethods and EvaluateAllMethods and closes the
        training output file.

        Returns:
            self, so that calls can be chained.
        """
        # the backup is optional: a missing or unreadable option means "off"
        # NOTE(review): the option value is passed to eval(); only use trusted config files
        backupFiles = False
        try:
            backupFiles = eval(self.config.get('MVAGeneral', 'backupWeights'))
        except Exception:
            pass
        if backupFiles:
            print('backing up old BDT files')
            self.backupOldFiles()
        # ----------------------------------------------------------------------------------------------------------------------
        # Execute TMVA
        # ----------------------------------------------------------------------------------------------------------------------
        self.factory.Verbose()
        print('max mem used = %d' % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        print('Execute TMVA: factory.BookMethod("%s", "%s", "%s")' % (self.MVAtype, self.mvaName, self.MVAsettings))
        print('max mem used = %d' % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        weightF = self.config.get('Weights', 'weightF')
        try:
            self.factory.BookMethod(self.MVAtype, self.mvaName, self.MVAsettings)
            print("ROOT 5 style TMVA found")
            self.factory.SetSignalWeightExpression(weightF)
            self.factory.SetBackgroundWeightExpression(weightF)
        except Exception:
            # without a data loader there is nothing to fall back to, so the
            # original error is the one worth reporting
            if not getattr(self, 'dataLoader', None):
                raise
            print("ROOT 6 style TMVA found, using data loader object!!! >_<")
            print(" weights dir:", ROOT.TMVA.gConfig().GetIONames().fWeightFileDir)
            print(" data loader:", self.dataLoader)
            print(" type:       ", self.MVAtype)
            print(" name:       ", self.mvaName)
            print(" settings:   ", self.MVAsettings)
            ROOT.TMVA.gConfig().GetIONames().fWeightFileDir = 'weights'
            self.dataLoader.SetSignalWeightExpression(weightF)
            self.dataLoader.SetBackgroundWeightExpression(weightF)
            self.factory.BookMethod(self.dataLoader, self.MVAtype, self.mvaName, self.MVAsettings)
        # flush after each step so the printed progress is not held in the buffer
        sys.stdout.flush()
        print('Execute TMVA: TrainAllMethods')
        print('max mem used = %d' % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.factory.TrainAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: TestAllMethods')
        print('max mem used = %d' % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.factory.TestAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: EvaluateAllMethods')
        print('max mem used = %d' % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.factory.EvaluateAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: output.Write')
        print('max mem used = %d' % (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.trainingOutputFile.Close()
        return self

    def printInfo(self):
        """Pickle the description of this training into an '.info' file.

        The file is written to <vhbbpath>/python/weights/<factoryname>_<mvaName>.info, where
        'vhbbpath' is read from section 'Directories' of the config. It contains one mvainfo
        object holding the factory name/settings, MVA type/settings, weight file path, samples
        path, variable set name and the list of 'Nominal' variables.
        """
        #WRITE INFOFILE
        MVAdir = self.config.get('Directories','vhbbpath')+'/python/weights/'
        infoFileName = MVAdir+self.factoryname+'_'+self.mvaName+'.info'

        info=mvainfo(self.mvaName)
        info.factoryname=self.factoryname
        info.factorysettings=self.factorysettings
        info.MVAtype=self.MVAtype
        info.MVAsettings=self.MVAsettings
        info.weightfilepath=MVAdir
        info.path=self.samplesPath
        info.varset=self.treeVarSet
        info.vars=self.MVA_Vars['Nominal']

        # pickle needs a binary file handle on Python 3 ('wb' also works on Python 2);
        # the 'with' block closes the file even if dumping fails
        with open(infoFileName,'wb') as infofile:
            print ('@DEBUG: output infofile name')
            print (infofile)
            pickle.dump(info,infofile)

    def getExpectedSignificance(self, tree, nBins, xMin, xMax, power=1.0, rescaleSig=1.0, rescaleBkg=1.0):
        """Histogram the MVA score for signal and background and print a per-bin S/sqrt(S+B) table.

        Args:
            tree: iterable of events providing the attributes <mvaName>, 'weight' and 'classID'.
            nBins: number of histogram bins.
            xMin: lower edge of the histogram.
            xMax: upper edge of the histogram.
            power: if different from 1.0, the score is mapped to [0, 1), raised to this power and
                mapped back to the [xMin, xMax) range before filling.
            rescaleSig: factor applied to the weight of events with classID == 1.
            rescaleBkg: factor applied to the weight of all other events.

        Returns:
            Tuple (expectedSignificance, sSum, bSum): the square root of the sum over bins of
            S*S/(S+B), the summed signal bin contents and the summed background bin contents.
        """
        histSignal = ROOT.TH1D("hSig","hSig",nBins,xMin,xMax)
        histBackground = ROOT.TH1D("hBkg","hBkg",nBins,xMin,xMax)
        print("INFO: GetEntries() = ", tree.GetEntries())
        transformScore = power != 1.0
        if transformScore:
            print("INFO: rescale BDT score with power ", power)

        scoreRange = xMax-xMin
        for event in tree:
            score = getattr(event, self.mvaName)
            if transformScore:
                # normalize to the unit interval, clip, apply the power and scale back
                x = (score-xMin)/scoreRange
                if x<0:
                    x=0
                elif x>0.999999:
                    x=0.999999
                value = math.pow(x, power)*scoreRange+xMin
            else:
                # clip the score into the histogram range (just below the upper edge)
                value = max(min(score,xMax-0.00001),xMin)

            weight = event.weight
            if event.classID == 1:
                histSignal.Fill(value, weight * rescaleSig)
            else:
                histBackground.Fill(value, weight * rescaleBkg)

        ssbSum = 0.0
        sSum = 0
        bSum = 0
        sbTableFormat = "{bin: <16}{signal: <16}{background: <16}{ssb: <16}"
        print("---- nBins =", nBins, " from ", xMin, "..", xMax, "-----")
        print(sbTableFormat.format(bin="bin", signal="signal", background="background", ssb="S/sqrt(S+B)"))
        for binIndex in range(nBins):
            # histogram bins are numbered starting at 1
            nSignal = histSignal.GetBinContent(1+binIndex)
            nBackground = histBackground.GetBinContent(1+binIndex)
            nTotal = nSignal + nBackground
            if nTotal > 0:
                ssbSum += nSignal*nSignal/nTotal
                ssb = nSignal/math.sqrt(nTotal)
            else:
                # empty bins do not contribute
                ssb = 0
            sSum += nSignal
            bSum += nBackground
            print(sbTableFormat.format(bin=binIndex, signal=round(nSignal,1), background=round(nBackground,1), ssb=round(ssb,3)))
        expectedSignificance = math.sqrt(ssbSum)
        print(sbTableFormat.format(bin="SUM", signal=round(sSum,1), background=round(bSum,1), ssb="\x1b[34mZ=%1.3f\x1b[0m"%expectedSignificance))
        print("-"*40)
        histSignal.Delete()
        histBackground.Delete()
        return expectedSignificance, sSum, bSum

    def estimateExpectedSignificance(self):
        """Print expected-significance tables for the test and training trees of the training output.

        Opens self.trainingOutputFileName, evaluates getExpectedSignificance() on './TestTree' for
        several binnings and score transformations, then on './TrainTree' once as-is and once with
        signal and background rescaled to the yields of the test tree. The file is closed afterwards.

        Raises:
            IOError: if the training output file cannot be opened.
        """
        print("INFO: open ", self.trainingOutputFileName)
        rootFile = ROOT.TFile.Open(self.trainingOutputFileName, "READ")
        print("INFO: ->", rootFile)
        # fail with a clear message instead of an obscure error on the first Get()
        if not rootFile or rootFile.IsZombie():
            raise IOError("could not open training output file: %s"%self.trainingOutputFileName)

        try:
            testTree = rootFile.Get('./TestTree')

            # run a few tests with different binnings and rescaling of BDT score
            self.getExpectedSignificance(testTree, 15, -0.8, 1.0)
            self.getExpectedSignificance(testTree, 15, -0.8, 0.9)
            self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=0.5)
            self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=0.33)
            self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=1.5)
            self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=2.0)

            # close to nominal binning
            print("---- ~nominal TEST -----")
            _, sTest, bTest = self.getExpectedSignificance(testTree, 15, -0.8, 0.8)
            print("---- ~nominal TRAINING (without correct normalization) -----")
            trainTree = rootFile.Get('./TrainTree')
            _, sTrain, bTrain = self.getExpectedSignificance(trainTree, 15, -0.8, 0.8)

            # the tree ./TrainTree contains the input events for training AFTER re-balancing the classes
            # therefore for SIG/BKG separately the normalization is fixed to the one of the TEST events
            if sTrain != 0:
                rescaleSig = 1.0*sTest/sTrain
            else:
                # no signal in the training tree: nothing to rescale, avoid a division by zero
                print("\x1b[31mWARNING: no signal in training tree, signal is not rescaled!\x1b[0m")
                rescaleSig = 1.0
            if bTrain != 0:
                rescaleBkg = 1.0*bTest/bTrain
            else:
                # no background in the training tree: nothing to rescale, avoid a division by zero
                print("\x1b[31mWARNING: no background in training tree, background is not rescaled!\x1b[0m")
                rescaleBkg = 1.0
            print("---- ~nominal TRAINING -----")
            self.getExpectedSignificance(trainTree, 15, -0.8, 0.8, rescaleSig=rescaleSig, rescaleBkg=rescaleBkg)
        finally:
            # always release the file handle, also if one of the evaluations fails
            rootFile.Close()

Example #12

Show file

# Script setup: read the general options and input/output locations from the config.
# NOTE(review): TrainFlag is obtained by eval() of a config value.
TrainFlag = eval(config.get('Analysis','TrainFlag'))
btagLibrary = config.get('BTagReshaping','library')
samplesinfo=config.get('Directories','samplesinfo')
channel=config.get('Configuration','channel')
# load the shared library configured in section 'VHbbNameSpace' into ROOT (return code is not checked)
VHbbNameSpace=config.get('VHbbNameSpace','library')
ROOT.gSystem.Load(VHbbNameSpace)
pathIN = config.get('Directories','SYSin')
pathOUT = config.get('Directories','SYSout')
tmpDir = config.get('Directories','scratch')
print 'INput samples:\t%s'%pathIN
print 'OUTput samples:\t%s'%pathOUT

fileLocator = FileLocator(config=config)

# samples: select the single entry whose identifier matches the -S option and which is not a subsample
info = ParseInfo(samplesinfo, pathIN)
matchingSamples = [x for x in info if x.identifier==opts.sampleIdentifier and not x.subsample]
if len(matchingSamples) != 1:
    # anything other than exactly one match is treated as a usage error
    print "need exactly 1 sample identifier as input with -S !!"
    print matchingSamples
    exit(1)
sample = matchingSamples[0]

# split the comma-separated --addCollections option into a list of non-empty, stripped names
collections = [x.strip() for x in opts.addCollections.split(',') if len(x.strip()) > 0] if len(opts.addCollections.strip())>0  else []
if len(collections) < 1:
    # only a warning: the script continues with an empty list
    print "\x1b[31mWARNING: no collections added! Specify the collections to add with the --addCollections option!\x1b[0m"
print 'collections to add:', collections


for fileName in filelist:

Example #13

Show file

# the signal and background sample lists are stored as Python expressions in the config
# NOTE(review): eval() of config values, only acceptable for trusted config files
signals = eval(signals)
#backgrounds
backgrounds = config.get(run, 'backgrounds')
backgrounds = eval(backgrounds)
# name of the config section which holds the list of input variables
treeVarSet = config.get(run, 'treeVarSet')
print 'signals are', signals
print 'backgrounds are', backgrounds

#variables
#TreeVar Array
# the 'Nominal' variables are given as one string separated by single spaces
MVA_Vars = {}
MVA_Vars['Nominal'] = config.get(treeVarSet, 'Nominal')
MVA_Vars['Nominal'] = MVA_Vars['Nominal'].split(' ')

#Infofile
info = ParseInfo(samplesinfo, path)

#Workdir
workdir = ROOT.gDirectory.GetPath()

#Remove EventForTraining in order to run the MVA directly from the PREP step
# (older variants of the cuts, kept for reference)
#TrainCut='%s & !((evt%s)==0 || isData)'%(TCut,'%2')
#EvalCut= '%s & ((evt%s)==0 || isData)'%(TCut,'%2')
# training uses events with odd 'evt' which are not data, evaluation uses even 'evt' or data
TrainCut = '!((evt%2)==0 || isData)'
EvalCut = '((evt%2)==0 || isData)'
#TrainCut='%s & EventForTraining==1'%TCut
#EvalCut='%s & EventForTraining==0'%TCut

# with data_as_signal set, no train/eval splitting is applied: both cuts accept every event
if data_as_signal:
    TrainCut = '1'
    EvalCut = '1'

Example #14

Show file

File: evaluateMVA.py Project: dcurry09/Heppy

#Import after configure to get help message
from myutils import BetterConfigParser, progbar, printc, ParseInfo, MvaEvaluator

# read the configuration file(s) given on the command line
config = BetterConfigParser()
config.read(opts.config)
anaTag = config.get("Analysis","tag")

#get locations:
Wdir = config.get('Directories','Wdir')
samplesinfo = config.get('Directories','samplesinfo')

# input and output directories, taken from options 'MVAin' and 'MVAout'
INpath  = config.get('Directories','MVAin')
OUTpath = config.get('Directories','MVAout')

# sample definitions, resolved relative to the input directory
info = ParseInfo(samplesinfo,INpath)

# comma-separated list of discriminator names from the command line
arglist = opts.discr #RTight_blavla,bsbsb

# comma-separated list of sample names from the command line
namelistIN = opts.names
namelist   = namelistIN.split(',')
print ('\n-----> SampleList: ', namelist)

MVAlist = arglist.split(',')
print ('-----> MVAList:', MVAlist)

#CONFIG
#factory
factoryname = config.get('factory','factoryname')

# unique training name

Example #15

Show file

    def __init__(self, config, region, vars=None, title=None):
        """Read the plot configuration of one region and resolve the data and MC samples.

        Args:
            config: configuration object providing get() and has_option().
            region: name of the plot region, its options are read from section 'Plot:<region>'.
            vars: optional list of variable names; if empty or None, the comma-separated option
                'vars' of the region section is used instead.
            title: optional title, an empty value is stored as None.
        """
        self.config = config
        self.region = region
        self.vars = vars
        if title and len(title) > 0:
            self.title = title
        else:
            self.title = None

        # VHbb namespace: a failure to load the library is only reported, not raised
        libraryName = config.get('VHbbNameSpace', 'library')
        loadStatus = ROOT.gSystem.Load(libraryName)
        if loadStatus == 0:
            print("INFO: loaded VHbbNameSpace: %s" % libraryName)
        else:
            print("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m" % loadStatus)

        # additional blinding cut (contained in plots, cut on the event number)
        self.addBlindingCut = None
        if self.config.has_option('Plot_general', 'addBlindingCut'):
            self.addBlindingCut = self.config.get('Plot_general', 'addBlindingCut')
            print('adding add. blinding cut:', self.addBlindingCut)

        # locations of samples, sample definitions and plots
        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')

        # plot regions
        self.configSection = 'Plot:%s' % region

        # clean up a variable list passed by the caller: strip names and drop empty ones
        if self.vars and type(self.vars) is list:
            strippedVars = (name.strip() for name in self.vars)
            self.vars = [name for name in strippedVars if len(name) > 0]

        # no variables given: fall back to the list defined for this region in the config
        if not self.vars or len(self.vars) < 1:
            varListFromConfig = self.config.get(self.configSection, 'vars').split(',')
            print("VARS::", self.configSection, " => ", varListFromConfig)
            strippedVars = (name.strip() for name in varListFromConfig)
            self.vars = [name for name in strippedVars if len(name) > 0]

        # load samples
        # data corresponding to each CR (section)
        dataDefinition = self.config.get(self.configSection, 'Datas')
        self.data = eval(dataDefinition)
        # list of mc samples
        mcDefinition = self.config.get('Plot_general', 'samples')
        self.mc = eval(mcDefinition)
        self.total_lumi = eval(self.config.get('General', 'lumi'))

        # a region with a 'Signal' option is a signal region, the signal is added to the MC list
        self.signalRegion = self.config.has_option(self.configSection, 'Signal')
        if self.signalRegion:
            self.mc.append(self.config.get(self.configSection, 'Signal'))
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)

        self.groupDict = eval(self.config.get('Plot_general', 'Group'))
        self.subcutPlotName = ''
        self.histogramStacks = {}

Example #16

Show file

File: cache_plot.py Project: GLP90/Xbb

class CachePlot(object):
    """Caches, for one sample identifier, the trees selected by the cuts of the given plot regions.

    For every (sub)sample belonging to the sample identifier and every region a TreeCache object
    is created. Parts which are not cached yet (or all parts, if forceRedo is set) are written
    by processing the sample tree once.
    """

    def __init__(self, config, sampleIdentifier, regions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, forceRedo=False, fileList=None):
        """Read sample definitions and region cuts from the config.

        Args:
            config: configuration object providing get(), has_option() and sections().
            sampleIdentifier: identifier of the sample whose (sub)samples are cached.
            regions: names of the regions, each one must be an option in section 'Cuts'.
            splitFilesChunks: passed through to TreeCache.
            chunkNumber: number of the chunk of sample files to process.
            splitFilesChunkSize: passed through to TreeCache and SampleTree.
            forceRedo: if True, existing cached files of this chunk are deleted and recreated.
            fileList: optional compressed file list, decompressed with FileList.decompress().
        """
        self.config = config
        self.sampleIdentifier = sampleIdentifier
        self.regions = list(set(regions))
        self.forceRedo = forceRedo

        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = self.config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        # NOTE(review): eval() of config values, only acceptable for trusted config files
        self.sampleNames = eval(self.config.get('Plot_general', 'samples'))
        self.dataNames = eval(self.config.get('Plot_general', 'Data'))
        self.samples = self.samplesInfo.get_samples(self.sampleNames + self.dataNames)

        self.regionsDict = {}
        for region in self.regions:
            treeCut = config.get('Cuts', region)
            self.regionsDict[region] = {'cut': treeCut}
        self.splitFilesChunkSize = splitFilesChunkSize
        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.fileList = FileList.decompress(fileList) if fileList else None

        # a failure to load the library is only reported, not raised
        VHbbNameSpace=config.get('VHbbNameSpace','library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print ("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"%returnCode)
        else:
            print ("INFO: loaded VHbbNameSpace: %s"%VHbbNameSpace)

    def printInfo(self):
        """Print the list of regions together with their cuts."""
        print ("REGION:".ljust(24),"CUT:")
        # items() instead of iteritems(): works with Python 2 and Python 3
        for region,regionInfo in self.regionsDict.items():
            print (" > ",region.ljust(20), regionInfo['cut'])

    def _getSampleCuts(self, sample, region, regionInfo):
        """Return the list of cuts to apply to one (sub)sample in one plot region."""
        configSection = 'Plot:%s'%region

        sampleCuts = [sample.subcut]
        if regionInfo['cut']:
            sampleCuts.append(regionInfo['cut'])
        if self.config.has_option(configSection, 'Datacut'):
            sampleCuts.append(self.config.get(configSection, 'Datacut'))
        if self.config.has_option('Plot_general','addBlindingCut'):
            # append the cut expression itself, not the boolean returned by has_option()
            sampleCuts.append(self.config.get('Plot_general', 'addBlindingCut'))
        return sampleCuts

    def _getBranchesToKeep(self):
        """Return the list of branches needed for plotting, variable definitions, weights and cuts."""
        # keep additional branches for plotting; both options are optional
        try:
            keepBranchesPlot = eval(self.config.get('Branches', 'keep_branches_plot'))
        except Exception:
            keepBranchesPlot = []
        try:
            keepBranchesPlot += eval(self.config.get('Branches', 'keep_branches'))
        except Exception:
            pass

        # also keep some branches which might be used later in variables definition and weights
        try:
            for section in self.config.sections():
                if section.startswith('plotDef:') and self.config.has_option(section, 'relPath'):
                    keepBranchesPlot.append(self.config.get(section, 'relPath'))
        except Exception as e:
            print("\x1b[31mERROR: config file contains an error! automatic selection of branches to keep will not work!\x1b[0m")
            print(e)
        try:
            keepBranchesPlot.append(self.config.get('Weights', 'weightF'))
        except Exception:
            pass
        # plotting region cut
        for regionInfo in self.regionsDict.values():
            keepBranchesPlot.append(regionInfo['cut'])
        return BranchList(keepBranchesPlot).getListOfBranches()

    def run(self):
        """Create the caches for all (sub)samples and regions which are not cached yet.

        Raises:
            Exception: "CreationOfSampleTreeFailed" if the sample tree cannot be created,
                "SampleFilesHaveChanged" if the file list given at construction differs from
                the current file list of this chunk.
        """
        keepBranchesPlotFinal = self._getBranchesToKeep()
        print("KEEP:", keepBranchesPlotFinal)


        # ----------------------------------------------------------------------------------------------------------------------
        # cache samples
        # ----------------------------------------------------------------------------------------------------------------------
        for sampleToCache in [self.sampleIdentifier]:
            print ('*'*80)
            print (' ',sampleToCache)
            print ('*'*80)
            # prepare caches for training and evaluation samples
            treeCaches = []

            # for all (sub)samples which come from the same files (sampleIdentifier)
            subsamples = [x for x in self.samples if x.identifier == sampleToCache]
            for sample in subsamples:

                # add cuts for all training regions
                for region,regionInfo in self.regionsDict.items():

                    # cuts
                    sampleCuts = self._getSampleCuts(sample, region, regionInfo)

                    # arbitrary (optional) name for the output tree, used for print-out (the TreeCache object has no idea what it is doing, e.g. dc, plot etc.)
                    cacheName = 'plot:{region}_{sample}'.format(region=region, sample=sample.name)

                    # add cache object
                    tc = TreeCache.TreeCache(
                        name=cacheName,
                        sample=sample.name,
                        cutList=sampleCuts,
                        inputFolder=self.samplesPath,
                        splitFilesChunks=self.splitFilesChunks,
                        chunkNumber=self.chunkNumber,
                        splitFilesChunkSize=self.splitFilesChunkSize,
                        fileList=self.fileList,
                        branches=keepBranchesPlotFinal,
                        config=self.config,
                        debug=True
                    )

                    # check if this part of the sample is already cached
                    isCached = tc.partIsCached()
                    if not isCached or self.forceRedo:
                        if isCached:
                            tc.deleteCachedFiles(chunkNumber=self.chunkNumber)

                        # for the first sample which comes from this files, load the tree
                        if not self.sampleTree:
                            self.sampleTree = SampleTree({'name': sample.identifier, 'folder': self.samplesPath}, splitFilesChunkSize=self.splitFilesChunkSize, chunkNumber=self.chunkNumber, config=self.config, saveMemory=True)
                            if not self.sampleTree or not self.sampleTree.tree:
                                print ("\x1b[31mERROR: creation of sample tree failed!!\x1b[0m")
                                raise Exception("CreationOfSampleTreeFailed")
                            # consistency check on the file list at submission time and now
                            fileListNow = self.sampleTree.getSampleFileNameChunk(self.chunkNumber)
                            if self.fileList and (sorted(self.fileList) != sorted(fileListNow)):
                                print ("\x1b[31mERROR: sample files have changed between submission and run of the job!\x1b[0m")
                                raise Exception("SampleFilesHaveChanged")

                        treeCaches.append(tc.setSampleTree(self.sampleTree).cache())
                    else:
                        print ("INFO: already cached!",tc, "(",tc.hash,")")

            if len(treeCaches) > 0:
                # run on the tree
                self.sampleTree.process()
            else:
                print ("nothing to do!")

Example #17

Show file

File: write_numpy_array_for_training.py Project: GLP90/Xbb

    def __init__(self, config, mvaName, useSyst=True, useWeightSyst=True, testRun=False):
        """Read the training definition of one MVA from the config and resolve its samples.

        Args:
            config: configuration object providing get() and has_option().
            mvaName: name of the config section which defines the training.
            useSyst: if True, the systematics listed in option 'systematics' of the MVA section
                are read together with their up/down expressions from section 'Weights'.
            useWeightSyst: not used in this constructor.
            testRun: if True, only a debug message is printed here and the flag is stored.
        """
        self.mvaName = mvaName
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
        self.dataFormatVersion = 3
        self.sampleTrees = []
        self.config = config
        self.testRun = testRun
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)

        # region
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/eval sets
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')

        # rescale MC by 2 because of train/eval split
        self.globalRescale = 2.0

        # variables and systematics
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.MVA_Vars = {'Nominal': [x for x in config.get(self.treeVarSet, 'Nominal').strip().split(' ') if len(x.strip()) > 0]}

        self.weightSYS = []
        self.weightSYSweights = {}

        self.systematics = []
        if useSyst:
            print('INFO: use systematics in training!')
            # NOTE(review): eval() of config values, only acceptable for trusted config files
            self.systList = eval(self.config.get(mvaName, 'systematics')) if self.config.has_option(mvaName, 'systematics') else []
            for syst in self.systList:
                # both spellings of the suffix are supported: _UP/_DOWN is preferred over _Up/_Down
                systNameUp   = syst+'_UP'   if self.config.has_option('Weights',syst+'_UP')   else syst+'_Up'
                systNameDown = syst+'_DOWN' if self.config.has_option('Weights',syst+'_DOWN') else syst+'_Down'

                self.systematics.append({
                    'name': syst,
                    'U': self.config.get('Weights', systNameUp),
                    'D': self.config.get('Weights', systNameDown),
                    })

        # default: signal vs. background
        self.sampleNames = {
                    'SIG_ALL': eval(self.config.get('Plot_general', 'allSIG')),
                    'BKG_ALL': eval(self.config.get('Plot_general', 'allBKG')),
                }
        # for multi-output classifiers load dictionary from config
        self.categories = None
        if self.config.has_option(mvaName, 'classDict'):
            self.sampleNames = eval(self.config.get(mvaName, 'classDict'))
            # self.samples is only created below, so the categories are taken from the class
            # dictionary itself (it has the same keys)
            self.categories = list(self.sampleNames.keys())
            print("classes dict:", self.sampleNames)
        elif self.config.has_option(mvaName, 'classes'):
            # list of (category, samples) pairs: the order of the categories is kept
            classes = eval(self.config.get(mvaName, 'classes'))
            self.sampleNames = dict(classes)
            self.categories = [x[0] for x in classes]
        # items() instead of iteritems(): works with Python 2 and Python 3
        self.samples = {category: self.samplesInfo.get_samples(samples) for category,samples in self.sampleNames.items()}
        if not self.categories:
            self.categories = list(self.samples.keys())
        if self.testRun:
            print("\x1b[31mDEBUG: TEST-RUN, using only small subset of samples!\x1b[0m")

Example #18

Show file

    def __init__(self,
                 config,
                 sampleIdentifier,
                 trainingRegions,
                 splitFilesChunks=1,
                 chunkNumber=1,
                 splitFilesChunkSize=-1,
                 force=False):
        """Collect samples, cuts and variables of all training regions from the config.

        Args:
            config: configuration object providing get() and has_option().
            sampleIdentifier: identifier of the sample to process.
            trainingRegions: names of the config sections defining the training regions.
            splitFilesChunks: stored for later use.
            chunkNumber: stored for later use.
            splitFilesChunkSize: stored for later use.
            force: stored for later use.
        """
        self.config = config
        self.force = force
        self.sampleIdentifier = sampleIdentifier
        self.trainingRegions = trainingRegions

        self.sampleTree = None
        # option 'trainingSamples' takes precedence over 'MVAin'
        samplesOption = 'trainingSamples' if config.has_option('Directories', 'trainingSamples') else 'MVAin'
        self.samplesPath = self.config.get('Directories', samplesOption)
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath, config=self.config)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        # union of the sample names over all training regions, without duplicates
        backgroundLists = [
            eval(self.config.get(trainingRegion, 'backgrounds'))
            for trainingRegion in self.trainingRegions
        ]
        self.backgroundSampleNames = list(set(sum(backgroundLists, [])))
        signalLists = [
            eval(self.config.get(trainingRegion, 'signals'))
            for trainingRegion in self.trainingRegions
        ]
        self.signalSampleNames = list(set(sum(signalLists, [])))
        # can include DATA in the .h5 files for training
        dataLists = [
            eval(self.config.get(trainingRegion, 'data'))
            if self.config.has_option(trainingRegion, 'data') else []
            for trainingRegion in self.trainingRegions
        ]
        self.dataSampleNames = list(set(sum(dataLists, [])))
        allSampleNames = self.backgroundSampleNames + self.signalSampleNames + self.dataSampleNames
        self.samples = self.samplesInfo.get_samples(list(set(allSampleNames)))

        self.trainingRegionsDict = {}
        for trainingRegion in self.trainingRegions:
            # without an explicit 'treeCut' option the cut is named like the region itself
            if config.has_option(trainingRegion, 'treeCut'):
                treeCutName = config.get(trainingRegion, 'treeCut')
            else:
                treeCutName = trainingRegion
            treeVarSet = config.get(trainingRegion, 'treeVarSet').strip()

            # systematics are given either as a Python list or as a space separated string
            systematics = []
            if config.has_option(trainingRegion, 'systematics'):
                systematicsString = config.get(trainingRegion, 'systematics').strip()
                if systematicsString.startswith('['):
                    systematics = eval(systematicsString)
                else:
                    systematics = systematicsString.split(' ')
            mvaVars = config.get(treeVarSet, 'Nominal').split(' ')

            # weight expressions of the up/down variations, as far as they are defined
            weightVars = []
            for syst in systematics:
                # both spellings of the suffix are supported: _UP/_DOWN is preferred over _Up/_Down
                systNameUp = syst + '_UP' if self.config.has_option('Weights', syst + '_UP') else syst + '_Up'
                systNameDown = syst + '_DOWN' if self.config.has_option('Weights', syst + '_DOWN') else syst + '_Down'
                for systName in (systNameUp, systNameDown):
                    if self.config.has_option('Weights', systName):
                        weightVars.append(self.config.get('Weights', systName))

            regionDefinition = {}
            regionDefinition['cut'] = config.get('Cuts', treeCutName)
            regionDefinition['vars'] = mvaVars
            regionDefinition['weightVars'] = weightVars
            self.trainingRegionsDict[trainingRegion] = regionDefinition

        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')

        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.splitFilesChunkSize = splitFilesChunkSize

        # load the shared library configured in section 'VHbbNameSpace' (return code is not checked)
        libraryName = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(libraryName)

Example #19

Show file

File: evaluateMVA.py Project: GLP90/Xbb_Mjj

# read the configuration file(s) given on the command line
config = BetterConfigParser()
config.read(opts.config)
anaTag = config.get("Analysis", "tag")

#get locations:
Wdir = config.get('Directories', 'Wdir')
samplesinfo = config.get('Directories', 'samplesinfo')

# input and output directories, taken from options 'MVAin' and 'MVAout'
INpath = config.get('Directories', 'MVAin')
OUTpath = config.get('Directories', 'MVAout')

#read shape systematics
systematics = config.get('systematics', 'systematics')

# sample definitions, resolved relative to the input directory
info = ParseInfo(samplesinfo, INpath)

# comma-separated list of discriminator names from the command line
arglist = opts.discr  #RTight_blavla,bsbsb

# comma-separated list of sample names from the command line
namelistIN = opts.names
namelist = namelistIN.split(',')
print('\n-----> SampleList: ', namelist)

MVAlist = arglist.split(',')
print('-----> MVAList:', MVAlist)

#CONFIG
#factory
factoryname = config.get('factory', 'factoryname')

# unique training name

Example #20

Show file

File: make_skims.py Project: GLP90/Xbb

class SkimsHelper(object):
    """Write the cached sample files of one plot region into a single skim file.

    Intended call sequence: ``SkimsHelper(config, region, ...).prepare().run()``.
    """

    def __init__(self, config, region, sampleIdentifier=None, opts=None):
        """Read directories, cuts and sample lists for ``region`` from ``config``.

        Args:
            config: configuration object providing ``get(section, option)``
                and ``has_option(section, option)``.
            region: region name; its options are read from ``Plot:<region>``.
            sampleIdentifier: optional comma-separated sample identifiers;
                when given, only samples with these identifiers are kept.
            opts: object whose ``inputDir`` and ``outputDir`` attributes name
                options of the ``Directories`` section. The default of None
                only exists in the signature: both attributes are read
                unconditionally, so passing None raises AttributeError.
        """
        self.config = config
        self.region = region
        # truthiness covers both None and the empty string
        self.sampleIdentifiers = sampleIdentifier.split(',') if sampleIdentifier else None

        # VHbb namespace: a failed load is only reported, not treated as fatal
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m" % returnCode)
        else:
            print("INFO: loaded VHbbNameSpace: %s" % VHbbNameSpace)

        # input/output paths
        self.fileLocator = FileLocator(config=self.config)
        self.pathIN = self.config.get('Directories', opts.inputDir)
        self.pathOUT = self.config.get('Directories', opts.outputDir)
        self.tmpDir = self.config.get('Directories', 'scratch')

        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')

        # plot regions
        self.configSection = 'Plot:%s' % region

        # additional cut to only plot a subset of the region
        # NOTE(review): self.subcut is read and printed here but never added
        # to the cut list in prepare() -- confirm whether that is intended.
        self.subcut = None
        if self.config.has_option(self.configSection, 'subcut'):
            self.subcut = self.config.get(self.configSection, 'subcut')
            print("INFO: use cut:", self.subcut)

        # additional global blinding cut (contained in plots, cut on the event number)
        self.addBlindingCut = None
        if self.config.has_option('Plot_general', 'addBlindingCut'):
            self.addBlindingCut = self.config.get('Plot_general', 'addBlindingCut')
            print('adding add. blinding cut:', self.addBlindingCut)

        # load samples
        # NOTE: eval() executes whatever expression the config contains, so
        # the configuration files must come from a trusted source.
        self.data = eval(self.config.get(self.configSection, 'Datas'))  # data samples of this region
        self.mc = eval(self.config.get('Plot_general', 'samples'))  # list of MC samples
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        self.signalRegion = False
        if self.config.has_option(self.configSection, 'Signal'):
            self.mc.append(self.config.get(self.configSection, 'Signal'))
            self.signalRegion = True
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)

        # filter samples used in the plot
        if self.sampleIdentifiers:
            self.dataSamples = [x for x in self.dataSamples if x.identifier in self.sampleIdentifiers]
            self.mcSamples = [x for x in self.mcSamples if x.identifier in self.sampleIdentifiers]

    def prepare(self):
        """Collect the cached file names of all data and MC samples.

        The result is stored in ``self.fileNames``. An empty result is only
        reported on stdout.

        Returns:
            self, so that calls can be chained.
        """
        self.fileNames = []
        for sample in self.dataSamples + self.mcSamples:
            print(sample.identifier)

            # cuts: sample subcut, then the optional region, data and blinding cuts
            sampleCuts = [sample.subcut]
            if self.config.has_option('Cuts', self.region):
                sampleCuts.append(self.config.get('Cuts', self.region))
            if self.config.has_option(self.configSection, 'Datacut'):
                sampleCuts.append(self.config.get(self.configSection, 'Datacut'))
            if self.addBlindingCut:
                sampleCuts.append(self.addBlindingCut)

            # get sample tree from cache; use the instance's own config
            # instead of relying on a module-level global named `config`
            self.fileNames += TreeCache.TreeCache(
                    sample=sample,
                    cutList=sampleCuts,
                    inputFolder=self.samplesPath,
                    config=self.config
                ).findCachedFileNames()
        if not self.fileNames:
            print("\x1b[31mERROR: no files found, run cacheplot!\x1b[0m")
        return self

    def run(self):
        """Write the collected files into one tree file and copy it to the output path.

        The tree is first written below the scratch directory and then copied
        to ``self.pathOUT``. Copy and cleanup errors are reported on stdout
        and not re-raised.
        """
        name = self.config.get('Configuration', 'channel') if self.config.has_option('Configuration', 'channel') else '_'
        timestamp = datetime.datetime.now().strftime("%y%m%d")
        # use self.region: the bare name `region` only resolved when a
        # module-level global of that name happened to exist
        tmpName = self.tmpDir + '/skim_' + name + '_' + self.region + '_' + timestamp + '_tmp.root'
        destName = self.pathOUT + '/skim_' + name + '_' + self.region + '_' + timestamp + '.root'

        sampleTree = SampleTree(self.fileNames, config=self.config)

        if self.config.has_option('Plot_general', 'controlSample'):
            # NOTE: eval() on a config value, see the remark in __init__
            controlSampleDict = eval(self.config.get('Plot_general', 'controlSample'))
            # regions missing from the dictionary are tagged with -1
            controlSample = controlSampleDict[self.region] if self.region in controlSampleDict else -1
            sampleTree.addOutputBranch("controlSample", lambda x: controlSample, branchType="i")
            print("INFO: setting controlSample to", controlSample)

        sampleTree.addOutputTree(tmpName, cut='1', branches='*', friend=False)
        sampleTree.process()

        # copy to final destination
        if sampleTree.getNumberOfOutputTrees() > 0:
            try:
                self.fileLocator.cp(tmpName, destName, force=True)
                print('copy ', tmpName, destName)

                if not self.fileLocator.isValidRootFile(destName):
                    print("\x1b[31mERROR: copy failed, output is broken!\x1b[0m")
                else:
                    # the temporary file is only removed after a verified copy
                    try:
                        self.fileLocator.rm(tmpName)
                    except Exception as e:
                        print(e)
            except Exception as e:
                print("\x1b[31mERROR: copy failed!", e, "\x1b[0m")

Example #21

Show file

    def __init__(self, config, mvaName):
        """Collect cuts, variables, weights and samples for the MVA ``mvaName``.

        Args:
            config: configuration object providing ``get(section, option)``.
            mvaName: name of the config section holding the ``treeCut`` and
                ``treeVarSet`` options of this MVA.
        """
        self.mvaName = mvaName
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
        self.dataFormatVersion = 2
        self.sampleTrees = []
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)

        # region
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/eval sets
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')
        # rescale MC by 2 because of train/eval split
        self.globalRescale = 2.0

        # variables and systematics
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        # drop empty tokens (e.g. from doubled spaces) exactly as done for the
        # variable lists below; an empty name would otherwise be looked up as
        # an option of the variable set
        self.systematics = [
            x for x in config.get('systematics', 'systematics').strip().split(' ')
            if len(x.strip()) > 0
        ]
        self.MVA_Vars = {
            'Nominal': [
                x for x in config.get(self.treeVarSet, 'Nominal').strip().split(' ')
                if len(x.strip()) > 0
            ]
        }
        for systematic in self.systematics:
            self.MVA_Vars[systematic] = [
                x for x in config.get(self.treeVarSet, systematic).strip().split(' ')
                if len(x.strip()) > 0
            ]

        # b-tag weight systematics: each entry replaces the b-tag part of the
        # weight by the corresponding up/down varied weight
        self.weightSYS = []
        self.weightWithoutBtag = self.config.get('Weights', 'weight_noBTag')
        self.weightSYSweights = {}
        for d in ['Up', 'Down']:
            for syst in [
                    'HFStats1', 'HFStats2', 'LF', 'HF', 'LFStats1', 'LFStats2',
                    'cErr2', 'cErr1', 'JES'
            ]:
                systFullName = "btag_" + syst + "_" + d
                weightName = "bTagWeightCMVAV2_Moriond_" + syst + d
                self.weightSYSweights[systFullName] = self.weightWithoutBtag + '*' + weightName
                self.weightSYS.append(systFullName)

        # samples: only the inclusive signal/background lists are used; the
        # per-process categories (BKG_TT, BKG_ST, BKG_VV, BKG_DY2b, BKG_DY1b,
        # BKG_DY0b, SIG_ggZH, SIG_qqZH) were disabled in the original code.
        # NOTE: eval() executes the config value, so the config must be trusted.
        self.sampleNames = {
            'SIG_ALL': eval(self.config.get('Plot_general', 'allSIG')),
            'BKG_ALL': eval(self.config.get('Plot_general', 'allBKG')),
        }
        # items() works on Python 2 and 3, iteritems() only on Python 2
        self.samples = {
            category: self.samplesInfo.get_samples(samples)
            for category, samples in self.sampleNames.items()
        }

Example #22

Show file

File: prepare_environment_with_config.py Project: acalandr/Xbb

                  default=None,
                  help="max number of files to process")
opts, args = parser.parse_args(argv)

# configuration file named on the command line
config = BetterConfigParser()
config.read(opts.config)

# optional explicit file list; an empty option means "no list"
if len(opts.fileList) > 0:
    fileList = FileList.decompress(opts.fileList)
else:
    fileList = None

pathOUT = config.get('Directories', 'PREPout')
samplefiles = config.get('Directories', 'samplefiles')
sampleconf = config

whereToLaunch = config.get('Configuration', 'whereToLaunch')

# keep every non-subsample entry, optionally restricted to the identifiers
# given as a comma-separated list on the command line
info = ParseInfo(samples_path=None, config=config)
samples = []
for x in info:
    if x.subsample:
        continue
    if len(opts.sampleIdentifier) == 0 or x.identifier in opts.sampleIdentifier.split(','):
        samples.append(x)

treeCopier = copytreePSI.CopyTreePSI(config=config)

# optionally cap the number of samples to process
if opts.limit:
    if len(samples) > int(opts.limit):
        samples = samples[:int(opts.limit)]

# one copy job per selected sample
for sample in samples:
    treeCopier.copytreePSI(
        pathIN=samplefiles,
        pathOUT=pathOUT,
        folderName=sample.identifier,
        skimmingCut=sample.addtreecut,
        fileList=fileList,
    )

Example #23

Show file

class SkimsHelper(object):
    """Write the cached sample files of one plot region into a single skim file.

    Intended call sequence: ``SkimsHelper(config, region, ...).prepare().run()``.
    """

    def __init__(self, config, region, sampleIdentifier=None, opts=None):
        """Read directories, cuts and sample lists for ``region`` from ``config``.

        Args:
            config: configuration object providing ``get(section, option)``
                and ``has_option(section, option)``.
            region: region name; its options are read from ``Plot:<region>``.
            sampleIdentifier: optional comma-separated sample identifiers;
                when given, only samples with these identifiers are kept.
            opts: object whose ``inputDir`` and ``outputDir`` attributes name
                options of the ``Directories`` section. The default of None
                only exists in the signature: both attributes are read
                unconditionally, so passing None raises AttributeError.
        """
        self.config = config
        self.region = region
        # truthiness covers both None and the empty string
        self.sampleIdentifiers = sampleIdentifier.split(',') if sampleIdentifier else None

        # VHbb namespace: a failed load is only reported, not treated as fatal
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print(
                "\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"
                % returnCode)
        else:
            print("INFO: loaded VHbbNameSpace: %s" % VHbbNameSpace)

        # input/output paths
        self.fileLocator = FileLocator(config=self.config)
        self.pathIN = self.config.get('Directories', opts.inputDir)
        self.pathOUT = self.config.get('Directories', opts.outputDir)
        self.tmpDir = self.config.get('Directories', 'scratch')

        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath,
                                     config=self.config)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')

        # plot regions
        self.configSection = 'Plot:%s' % region

        # additional cut to only plot a subset of the region
        # NOTE(review): self.subcut is read and printed here but never added
        # to the cut list in prepare() -- confirm whether that is intended.
        self.subcut = None
        if self.config.has_option(self.configSection, 'subcut'):
            self.subcut = self.config.get(self.configSection, 'subcut')
            print("INFO: use cut:", self.subcut)

        # additional global blinding cut (contained in plots, cut on the event number)
        self.addBlindingCut = None
        if self.config.has_option('Plot_general', 'addBlindingCut'):
            self.addBlindingCut = self.config.get('Plot_general',
                                                  'addBlindingCut')
            print('adding add. blinding cut:', self.addBlindingCut)

        # load samples
        # NOTE: eval() executes whatever expression the config contains, so
        # the configuration files must come from a trusted source.
        self.data = eval(self.config.get(self.configSection, 'Datas'))  # data samples of this region
        self.mc = eval(self.config.get('Plot_general', 'samples'))  # list of MC samples
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        self.signalRegion = False
        if self.config.has_option(self.configSection, 'Signal'):
            self.mc.append(self.config.get(self.configSection, 'Signal'))
            self.signalRegion = True
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)

        # filter samples used in the plot
        if self.sampleIdentifiers:
            self.dataSamples = [
                x for x in self.dataSamples
                if x.identifier in self.sampleIdentifiers
            ]
            self.mcSamples = [
                x for x in self.mcSamples
                if x.identifier in self.sampleIdentifiers
            ]

    def prepare(self):
        """Collect the cached file names of all data and MC samples.

        The result is stored in ``self.fileNames``. An empty result is only
        reported on stdout.

        Returns:
            self, so that calls can be chained.

        Raises:
            Exception: ``"NotCached"`` when a sample has no cached tree.
        """
        self.fileNames = []
        for sample in self.dataSamples + self.mcSamples:
            print(sample.identifier)

            # cuts: sample subcut, then the optional region, data and blinding cuts
            sampleCuts = [sample.subcut]
            if self.config.has_option('Cuts', self.region):
                sampleCuts.append(self.config.get('Cuts', self.region))
            if self.config.has_option(self.configSection, 'Datacut'):
                sampleCuts.append(
                    self.config.get(self.configSection, 'Datacut'))
            if self.addBlindingCut:
                sampleCuts.append(self.addBlindingCut)

            # get sample tree from cache; use the instance's own config
            # instead of relying on a module-level global named `config`
            tc = TreeCache.TreeCache(sample=sample,
                                     cutList=sampleCuts,
                                     inputFolder=self.samplesPath,
                                     config=self.config)
            if tc.isCached():
                self.fileNames += tc.findCachedFileNames()
            else:
                print("ERROR: not cached, run cacheplot again")
                raise Exception("NotCached")
        if not self.fileNames:
            print("\x1b[31mERROR: no files found, run cacheplot!\x1b[0m")
        return self

    def run(self):
        """Write the collected files into one tree file and copy it to the output path.

        The tree is first written below the scratch directory and then copied
        to ``self.pathOUT``. Copy and cleanup errors are reported on stdout
        and not re-raised.
        """
        name = self.config.get('Configuration', 'channel') if self.config.has_option(
            'Configuration', 'channel') else '_'
        timestamp = datetime.datetime.now().strftime("%y%m%d")
        # use self.region: the bare name `region` only resolved when a
        # module-level global of that name happened to exist
        tmpName = self.tmpDir + '/skim_' + name + '_' + self.region + '_' + timestamp + '_tmp.root'
        destName = self.pathOUT + '/skim_' + name + '_' + self.region + '_' + timestamp + '.root'

        sampleTree = SampleTree(self.fileNames, config=self.config)

        if self.config.has_option('Plot_general', 'controlSample'):
            # NOTE: eval() on a config value, see the remark in __init__
            controlSampleDict = eval(
                self.config.get('Plot_general', 'controlSample'))
            # regions missing from the dictionary are tagged with -1
            controlSample = controlSampleDict[
                self.region] if self.region in controlSampleDict else -1
            sampleTree.addOutputBranch("controlSample",
                                       lambda x: controlSample,
                                       branchType="i")
            print("INFO: setting controlSample to", controlSample)

        sampleTree.addOutputTree(tmpName, cut='1', branches='*', friend=False)
        sampleTree.process()

        # copy to final destination
        if sampleTree.getNumberOfOutputTrees() > 0:
            try:
                self.fileLocator.cp(tmpName, destName, force=True)
                print('copy ', tmpName, destName)

                if not self.fileLocator.isValidRootFile(destName):
                    print(
                        "\x1b[31mERROR: copy failed, output is broken!\x1b[0m")
                else:
                    # the temporary file is only removed after a verified copy
                    try:
                        self.fileLocator.rm(tmpName)
                    except Exception as e:
                        print(e)
            except Exception as e:
                print("\x1b[31mERROR: copy failed!", e, "\x1b[0m")

Example #24

Show file

    def __init__(self, config, mvaName):
        """Read paths, samples, cuts and classifier parameters for ``mvaName``.

        Args:
            config: configuration object providing ``get(section, option)``.
            mvaName: name of the config section holding ``treeVarSet``,
                ``MVAsettings``, ``backgrounds``, ``signals`` and ``treeCut``.
        """
        self.dataRepresentationVersion = 2
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath, config=self.config)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.logpath = config.get('Directories', 'logpath')
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.mvaName = mvaName
        self.MVAsettings = config.get(mvaName, 'MVAsettings')
        self.factoryname = 'scikit-test1'

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

        # variables
        self.MVA_Vars = {}
        self.MVA_Vars['Nominal'] = config.get(self.treeVarSet, 'Nominal').strip().split(' ')

        # samples
        # NOTE: eval() executes the config value, so the config must be trusted.
        self.backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
        self.signalSampleNames = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(self.backgroundSampleNames),
            'SIG': self.samplesInfo.get_samples(self.signalSampleNames),
        }

        # MVA signal region cuts
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/test samples
        self.datasets = ['train', 'test']
        self.varsets = ['X', 'y', 'sample_weight']
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')

        print("TRAINING CUT:", self.trainCut)
        print("TEST CUT:", self.evalCut)

        self.globalRescale = 2.0

        # default parameters; entries from MVAsettings override them below
        self.parameters = {
                'factoryname': self.factoryname,
                'mvaName': self.mvaName,
                'MVAregionCut': self.treeCutName + ': ' + self.treeCut,
                #'classifier': 'GradientBoostingClassifier',
                'classifier': 'RandomForestClassifier',
                #'classifier': 'ExtraTreesClassifier',
                #'classifier': 'FT_GradientBoostingClassifier',
                'max_depth': None,
                'max_leaf_nodes': None,
                'class_weight': 'balanced',
                #'criterion': 'friedman_mse',
                'criterion': 'gini',
                #'n_estimators': 3000,
                'n_estimators': 400,
                #'learning_rate': 0.1,
                'algorithm': 'SAMME.R',
                #'min_samples_leaf': 100,
                'splitter': 'best',
                'max_features': 4,
                'subsample': 0.6,
                'limit': -1,
                'additional_signal_weight': 1.0,
                'min_impurity_split': 0.0,
                'bootstrap': True,
                }

        # load parameters from config in a format similar to Root TMVA
        # parameter string: "name1=value1:name2=value2:..."
        evaluatedSettings = []
        for mvaSetting in self.MVAsettings.split(':'):
            # tolerate empty tokens (empty option, doubled or trailing ':')
            if not mvaSetting.strip():
                continue
            # split on the first '=' only, so values may contain '=' themselves
            nameAndValue = mvaSetting.split('=', 1)
            parameterName = nameAndValue[0].strip()
            # NOTE: eval() on a config value, see the remark above
            self.parameters[parameterName] = eval(nameAndValue[1].strip())
            try:
                # '%r' % value raises TypeError for tuple values; those fall
                # back to the raw setting text below
                evaluatedSettings.append('%s' % parameterName + '=' + '%r' % self.parameters[parameterName])
            except Exception:
                print("???:", mvaSetting)
                evaluatedSettings.append(mvaSetting)

        self.MVAsettingsEvaluated = ':'.join(evaluatedSettings)

Example #25

Show file

class MvaTrainingHelper(object):

    def __init__(self, config, mvaName):
        """Read paths, samples, cuts and classifier parameters for ``mvaName``.

        Args:
            config: configuration object providing ``get(section, option)``.
            mvaName: name of the config section holding ``treeVarSet``,
                ``MVAsettings``, ``backgrounds``, ``signals`` and ``treeCut``.
        """
        self.dataRepresentationVersion = 2
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath, config=self.config)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.logpath = config.get('Directories', 'logpath')
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.mvaName = mvaName
        self.MVAsettings = config.get(mvaName, 'MVAsettings')
        self.factoryname = 'scikit-test1'

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

        # variables
        self.MVA_Vars = {}
        self.MVA_Vars['Nominal'] = config.get(self.treeVarSet, 'Nominal').strip().split(' ')

        # samples
        # NOTE: eval() executes the config value, so the config must be trusted.
        self.backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
        self.signalSampleNames = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(self.backgroundSampleNames),
            'SIG': self.samplesInfo.get_samples(self.signalSampleNames),
        }

        # MVA signal region cuts
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/test samples
        self.datasets = ['train', 'test']
        self.varsets = ['X', 'y', 'sample_weight']
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')

        print("TRAINING CUT:", self.trainCut)
        print("TEST CUT:", self.evalCut)

        self.globalRescale = 2.0

        # default parameters; entries from MVAsettings override them below
        # NOTE(review): run() reads self.parameters['min_samples_leaf'] (and
        # 'learning_rate' for the boosting classifiers), which have no default
        # here, so they have to be supplied through MVAsettings.
        self.parameters = {
                'factoryname': self.factoryname,
                'mvaName': self.mvaName,
                'MVAregionCut': self.treeCutName + ': ' + self.treeCut,
                #'classifier': 'GradientBoostingClassifier',
                'classifier': 'RandomForestClassifier',
                #'classifier': 'ExtraTreesClassifier',
                #'classifier': 'FT_GradientBoostingClassifier',
                'max_depth': None,
                'max_leaf_nodes': None,
                'class_weight': 'balanced',
                #'criterion': 'friedman_mse',
                'criterion': 'gini',
                #'n_estimators': 3000,
                'n_estimators': 400,
                #'learning_rate': 0.1,
                'algorithm': 'SAMME.R',
                #'min_samples_leaf': 100,
                'splitter': 'best',
                'max_features': 4,
                'subsample': 0.6,
                'limit': -1,
                'additional_signal_weight': 1.0,
                'min_impurity_split': 0.0,
                'bootstrap': True,
                }

        # load parameters from config in a format similar to Root TMVA
        # parameter string: "name1=value1:name2=value2:..."
        evaluatedSettings = []
        for mvaSetting in self.MVAsettings.split(':'):
            # tolerate empty tokens (empty option, doubled or trailing ':')
            if not mvaSetting.strip():
                continue
            # split on the first '=' only, so values may contain '=' themselves
            nameAndValue = mvaSetting.split('=', 1)
            parameterName = nameAndValue[0].strip()
            # NOTE: eval() on a config value, see the remark above
            self.parameters[parameterName] = eval(nameAndValue[1].strip())
            try:
                # '%r' % value raises TypeError for tuple values; those fall
                # back to the raw setting text below
                evaluatedSettings.append('%s' % parameterName + '=' + '%r' % self.parameters[parameterName])
            except Exception:
                print("???:", mvaSetting)
                evaluatedSettings.append(mvaSetting)

        self.MVAsettingsEvaluated = ':'.join(evaluatedSettings)

    # load numpy arrays with training/testing data
    def loadCachedNumpyArrays(self, cachedFilesPath):
        cached = True
        try:
            with open(cachedFilesPath + '/scikit_input.dmp', 'rb') as inputFile:
                self.data = pickle.load(inputFile)
            print("INFO: found numpy arrays for input in:", cachedFilesPath)
        except:
            cached = False
        return cached

    # save numpy arrays with training/testing data
    def writeNumpyArrays(self, cachedFilesPath):
        with open(cachedFilesPath + '/scikit_input.dmp', 'wb') as outputFile:
            pickle.dump(self.data, outputFile)
        print("INFO: wrote numpy arrays for input to:", cachedFilesPath)

    def getCachedNumpyArrayPath(self):
        identifier = self.treeCut + '__VAR:' + ' '.join(self.MVA_Vars['Nominal']) + '__SIG:' + '/'.join(self.signalSampleNames) + '__BKG:' + '/'.join(self.backgroundSampleNames) + '__V:%r'%self.dataRepresentationVersion
        varsHash = hashlib.sha224(identifier).hexdigest()
        cachedFilesPath = self.logpath + '/../cache/' + varsHash + '/'
        return cachedFilesPath

    def getHash(self):
        identifier = self.treeCut + '__VAR:' + ' '.join(self.MVA_Vars['Nominal']) + '__SIG:' + '/'.join(self.signalSampleNames) + '__BKG:' + '/'.join(self.backgroundSampleNames) + '__PAR:%r'%self.parameters
        return hashlib.sha224(identifier).hexdigest()[:8]

    def prepare(self):
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/testing trees
        # ----------------------------------------------------------------------------------------------------------------------
        self.sampleTrees = []
        categories = ['BKG', 'SIG']
        datasetParts = {'train': self.trainCut, 'test': self.evalCut}

        cachedFilesPath = self.getCachedNumpyArrayPath() 
        try:
            os.makedirs(cachedFilesPath)
        except:
            pass
        
        # load numpy arrays from disk if they have been already created
        if self.loadCachedNumpyArrays(cachedFilesPath):
            return self

        arrayLists = {datasetName:[] for datasetName in datasetParts.iterkeys()}
        weightLists = {datasetName:[] for datasetName in datasetParts.iterkeys()}
        targetLists = {datasetName:[] for datasetName in datasetParts.iterkeys()}
        
        # standard weight expression
        weightF = self.config.get('Weights','weightF')

        for category in categories:
            for sample in self.samples[category]:
                print ('*'*80,'\n%s\n'%sample,'*'*80)
                for datasetName, additionalCut in datasetParts.iteritems():
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)

                    # get ROOT tree for selected sample & region cut
                    tc = TreeCache.TreeCache(
                            sample=sample,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            config=self.config,
                            debug=True
                        )
                    sampleTree = tc.getTree()
                    if sampleTree:
                        treeScale = sampleTree.getScale(sample) * self.globalRescale
                        print ('scale:', treeScale)
                        
                        # initialize numpy array
                        nSamples = sampleTree.GetEntries()
                        features = self.MVA_Vars['Nominal']
                        nFeatures = len(features) 
                        print('nFeatures:', nFeatures)
                        inputData = np.zeros((nSamples, nFeatures), dtype=np.float32)

                        # initialize formulas for ROOT tree
                        for feature in features:
                            sampleTree.addFormula(feature)
                        sampleTree.addFormula(weightF)
                        
                        # fill numpy array from ROOT tree
                        for i, event in enumerate(sampleTree):
                            for j, feature in enumerate(features):
                                inputData[i, j] = sampleTree.evaluate(feature)
                            # total weight comes from weightF (btag, lepton sf, ...) and treeScale to scale MC to x-section
                            totalWeight = treeScale * sampleTree.evaluate(weightF)
                            weightLists[datasetName].append(totalWeight)
                            targetLists[datasetName].append(categories.index(category))

                        arrayLists[datasetName].append(inputData)

                    else:
                        print ("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")

        # concatenate all data from different samples
        self.data = {
                'train': {
                    'X': np.concatenate(arrayLists['train'], axis=0),
                    'y': np.array(targetLists['train'], dtype=np.float32),
                    'sample_weight': np.array(weightLists['train'], dtype=np.float32),
                    },
                'test': {
                    'X': np.concatenate(arrayLists['test'], axis=0),
                    'y': np.array(targetLists['test'], dtype=np.float32),
                    'sample_weight': np.array(weightLists['test'], dtype=np.float32),
                    },
                }

        # write numpy arrays to disk
        self.writeNumpyArrays(cachedFilesPath)

        return self

    def verify_data(self):
        valid = True
        for dataset in self.datasets:
            for var in self.varsets:
                print("DEBUG: self.data['{dataset}']['{var}'].shape = {shape}".format(dataset=dataset, var=var, shape=self.data[dataset][var].shape))

        for dataset in self.datasets:
            for i in range(len(self.varsets)-1):
                valid = valid and self.data[dataset][self.varsets[i]].shape[0] == self.data[dataset][self.varsets[i+1]].shape[0]
        return valid

    def run(self):

        if not self.verify_data():
            print ("\x1b[31mERROR: training input data array shapes are incompatible!\x1b[0m")
            raise Exception("BadTrainingInputData")

        applyClassWeights = False
        if self.parameters['classifier'] == 'GradientBoostingClassifier':
            clf = GradientBoostingClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    learning_rate=self.parameters['learning_rate'], 
                    subsample=self.parameters['subsample'],
                    min_impurity_split=self.parameters['min_impurity_split'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'RandomForestClassifier':
            clf = RandomForestClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    bootstrap=self.parameters['bootstrap'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'ExtraTreesClassifier':
            clf = ExtraTreesClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    bootstrap=self.parameters['bootstrap'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'FT_GradientBoostingClassifier':
            rt = RandomTreesEmbedding(max_depth=3, n_estimators=20, random_state=0)
            clf0 = GradientBoostingClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    learning_rate=self.parameters['learning_rate'], 
                    subsample=self.parameters['subsample'],
                    min_impurity_split=self.parameters['min_impurity_split'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
            clf = make_pipeline(rt, clf0)
        elif self.parameters['classifier'] == 'XGBClassifier':
            clf = XGBClassifier(
                    learning_rate=self.parameters['learning_rate'],
                    max_depth=self.parameters['max_depth'],
                    n_estimators=self.parameters['n_estimators'],
                    objective='binary:logitraw',
                    colsample_bytree=self.parameters['colsample_bytree'],
                    subsample=self.parameters['subsample'],
                    min_child_weight=self.parameters['min_child_weight'],
                    gamma=self.parameters['gamma'] if 'gamma' in self.parameters else 0.0,
                    #reg_alpha=8,
                    reg_lambda=self.parameters['reg_lambda'] if 'reg_lambda' in self.parameters else 1.0,
                    reg_alpha=self.parameters['reg_alpha'] if 'reg_alpha' in self.parameters else 0.0,
                    ) 
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'MLPClassifier':
            classifierParams = {k:v for k,v in self.parameters.iteritems() if k in ['solver', 'alpha', 'hidden_layer_sizes', 'max_iter', 'warm_start', 'learning_rate_init', 'learning_rate', 'momentum', 'epsilon', 'beta_1', 'beta_2', 'validation_fraction', 'early_stopping']}
            clf = MLPClassifier(**classifierParams) 
        elif self.parameters['classifier'] in ['SVC', 'LinearSVC']:
            '''
            clf = SVC(
                        C=1.0,
                        cache_size=4000,
                        class_weight='balanced',
                        coef0=0.0,
                        decision_function_shape='ovr',
                        degree=3,
                        gamma='auto',
                        kernel='rbf',
                        max_iter=100000,
                        probability=False,
                        random_state=None,
                        shrinking=True,
                        tol=0.001,
                        verbose=True
                    )
            '''
            bagged = int(self.parameters['bagged']) if 'bagged' in self.parameters else False
            if self.parameters['classifier'] == 'LinearSVC':
                clf = LinearSVC(
                            class_weight='balanced',
                            dual=self.parameters['dual'],
                            max_iter=self.parameters['max_iter'],
                            C=self.parameters['C'],
                            penalty=self.parameters['penalty'],
                            loss=self.parameters['loss'],
                            tol=self.parameters['tol'],
                            verbose=True,
                        )
            else:
                # classifier='SVC':C=random.choice([1.0, 10.0, 100.0, 500.0, 1000.0]):kernel=random.choice(['rbf','poly','linear']):degree=random.choice([2,3,4]):gamma=random.choice(['auto', 0.1, 0.3, 0.6]):shrinking=random.choice([True, False]):max_iter=10000:penalty=random.choice(['l1','l2']):tol=random.choice([0.005, 0.001, 0.0005, 0.0001]):cache_size=1000
                clf =  SVC(
                        C=self.parameters['C'],
                        cache_size=self.parameters['cache_size'],
                        class_weight='balanced',
                        coef0=0.0,
                        decision_function_shape='ovr',
                        degree=self.parameters['degree'],
                        gamma=self.parameters['gamma'],
                        kernel=self.parameters['kernel'],
                        max_iter=self.parameters['max_iter'],
                        probability=False,
                        random_state=None,
                        shrinking=self.parameters['shrinking'],
                        tol=self.parameters['tol'],
                        verbose=True
                    )

            if bagged:
                n_estimators = bagged
                if 'bag_oversampling' in self.parameters:
                    n_estimators = int(n_estimators * self.parameters['bag_oversampling'])

                clf0 = clf
                clf = BaggingClassifier(
                        clf0,
                        max_samples=1.0 / bagged,
                        max_features=self.parameters['baggedfeatures'] if 'baggedfeatures' in self.parameters else 1.0,
                        bootstrap_features=self.parameters['bootstrapfeatures'] if 'bootstrapfeatures' in self.parameters else False,
                        n_estimators=n_estimators,
                    )

        else:
            clf = AdaBoostClassifier(
                    DecisionTreeClassifier(
                        min_samples_leaf=self.parameters['min_samples_leaf'], 
                        max_depth=self.parameters['max_depth'], 
                        class_weight=self.parameters['class_weight'], 
                        criterion=self.parameters['criterion'],
                        splitter=self.parameters['splitter'],
                        max_features=self.parameters['max_features'],
                        ), 
                    n_estimators=self.parameters['n_estimators'], 
                    learning_rate=self.parameters['learning_rate'], 
                    algorithm=self.parameters['algorithm'],
                )

        #with open("/mnt/t3nfs01/data01/shome/berger_p2/VHbb/CMSSW_9_4_0_pre3/src/Xbb/python/logs_v25//test-scikit-svm/Logs//../cache/b7d92f50a52f8474e66cf4e2c3ad3fa4725aa489e7a6b288e4ed3855//clf2018-01-31_18-22-38_be9479a2.pkl","rb") as inputFile:
        #    clf = pickle.load(inputFile)

        # preprocessing
        print("transformation...")

        if 'scaler' in self.parameters:
            if self.parameters['scaler'] == 'standard':
                self.scaler = preprocessing.StandardScaler().fit(self.data['train']['X'])
            elif self.parameters['scaler'] == 'minmax':
                self.scaler = preprocessing.MinMaxScaler().fit(self.data['train']['X'])
            elif self.parameters['scaler'] == 'robust':
                self.scaler = preprocessing.RobustScaler().fit(self.data['train']['X'])
            else:
                self.scaler = None
        else:
            self.scaler = None

        if self.scaler:
            self.data['train']['X'] = self.scaler.transform(self.data['train']['X'])
            self.data['test']['X'] = self.scaler.transform(self.data['test']['X'])

        # SHUFFLE all samples before
        self.shuffle = False
        if self.shuffle:
            print("shuffle input data...")
            for dataset in self.datasets:
                nSamples = self.data[dataset][self.varsets[0]].shape[0]
                randomPermutation = np.random.permutation(nSamples)
                for var in self.varsets:
                    self.data[dataset][var] = np.take(self.data[dataset][var], randomPermutation, axis=0)

        # LIMIT number of training samples
        # recommended to also shuffle samples before, because they are ordered by signal/background
        limitNumTrainingSamples = self.parameters['limit']
        if (limitNumTrainingSamples > 0):
            print("limit training samples to:", limitNumTrainingSamples)
            #for dataset in self.datasets:
            #    for var in self.varsets:
            #        self.data[dataset][var] = self.data[dataset][var][0:limitNumTrainingSamples]
            for dataset in self.datasets:
                self.data[dataset] = resample(self.data[dataset], n_samples=limitNumTrainingSamples, replace=False)

        # oversample
        upscale = self.parameters['upscalefactor'] if 'upscalefactor' in self.parameters else None
        if upscale:
            upscalemax =  self.parameters['upscalemax'] if 'upscalemax' in self.parameters else 10 
            upscalesignal = self.parameters['upscalefactorsignal'] if 'upscalefactorsignal' in self.parameters else 1.0 #upscalefactorsignal
            indices = []
            for i in range(len(self.data['train']['sample_weight'])):
                #print(x)
                x= self.data['train']['sample_weight'][i]
                if self.data['train']['y'][i] > 0.5:
                    x *= upscalesignal
                n = x * upscale
                # limit oversampling factor!
                if n > upscalemax:
                    n=upscalemax
                if n<1:
                    n=1
                intN = int(n)
                indices += [i]*intN
                #floatN = n-intN
                #if floatN > 0:
                #    if random.uniform(0.0,1.0) < floatN:
                #        indices += [i]

            self.data['train']['X'] = self.data['train']['X'][indices]
            self.data['train']['y'] = self.data['train']['y'][indices]
            self.data['train']['sample_weight'] = self.data['train']['sample_weight'][indices]
            self.verify_data()

        # BALANCE weights
        # calculate total weights and class_weights
        nSig = len([x for x in self.data['train']['y'] if x >= 0.5])
        nBkg = len([x for x in self.data['train']['y'] if x < 0.5])
        print("#SIG:", nSig)
        print("#BKG:", nBkg)
        weightsSignal = []
        weightsBackground = []
        for i in range(len(self.data['train']['sample_weight'])):
            if self.data['train']['y'][i] < 0.5:
                weightsBackground.append(self.data['train']['sample_weight'][i])
            else:
                weightsSignal.append(self.data['train']['sample_weight'][i])
        weightsSignal.sort()
        weightsBackground.sort()
        totalWeightSignal = sum(weightsSignal)
        totalWeightBackground = sum(weightsBackground)
        signalReweight = (totalWeightSignal+totalWeightBackground)/totalWeightSignal * self.parameters['additional_signal_weight']
        backgroundReweight = (totalWeightSignal+totalWeightBackground)/totalWeightBackground
        print("SUM of weights for signal:", totalWeightSignal)
        print("SUM of weights for background:", totalWeightBackground)
        
        if applyClassWeights:
            print("re-weight signals by:", signalReweight)
            print("re-weight background by:", backgroundReweight)
            for i in range(len(self.data['train']['sample_weight'])):
                if self.data['train']['y'][i] < 0.5:
                    self.data['train']['sample_weight'][i] *= backgroundReweight
                else:
                    self.data['train']['sample_weight'][i] *= signalReweight
        else:
            print("DO NOT re-weight signals by:", signalReweight)
        print("...")
        # TRAINING

        learningCurve = []
        if self.parameters['classifier'] == 'XGBClassifier':
            clf = clf.fit(self.data['train']['X'], self.data['train']['y'], self.data['train']['sample_weight'], verbose=True)
        else:
            try:
                clf = clf.fit(**self.data['train'])
            except:
                clf = clf.fit(X=self.data['train']['X'], y=self.data['train']['y'])
                
                if 'rounds' in self.parameters and self.parameters['rounds'] > 1:
                    for rNumber in range(self.parameters['rounds']):
                        results = clf.predict_proba(self.data['test']['X']) 
                        auc1 = roc_auc_score(self.data['test']['y'], results[:,1], sample_weight=self.data['test']['sample_weight'])
                        print(" round ", rNumber, " AUC=", auc1)
                        learningCurve.append(auc1)
                        clf = clf.fit(X=self.data['train']['X'], y=self.data['train']['y'])

        print("***** FIT done")

        # TEST
        try:
            results = clf.decision_function(self.data['test']['X'])
            print("***** EVALUATION on test sample done")
            results_train = clf.decision_function(self.data['train']['X'])
            print("***** EVALUATION on training sample done")

            print("R:", results.shape, results)

            results = np.c_[np.ones(results.shape[0]), results]
            results_train = np.c_[np.ones(results_train.shape[0]), results_train]
        except:
            results = clf.predict_proba(self.data['test']['X'])
            results_train = clf.predict_proba(self.data['train']['X'])

        # ROC curve
        print("calculating auc...")
        auc1 = roc_auc_score(self.data['test']['y'], results[:,1], sample_weight=self.data['test']['sample_weight'])
        auc_training = roc_auc_score(self.data['train']['y'], results_train[:,1], sample_weight=self.data['train']['sample_weight'])
        print("AUC:", auc1, " (training:", auc_training, ")")

        print("**** compute quantiles")
        qx = np.array([0.01, 0.99])
        qy = np.array([0.0, 0.0])
        thq = ROOT.TH1D("quant","quant",500000,-5.0,5.0)
        nS = len(results)
        for i in range(nS):
            thq.Fill(results[i][1])
        thq.GetQuantiles(2, qy, qx)

        # rescaling of SCORE to [0, 1]
        minProb = 2.0
        maxProb = -1.0
        #for i in range(len(self.data['train']['X'])):
        #    if results_train[i][1] > maxProb:
        #        maxProb = results_train[i][1]
        #    if results_train[i][1] < minProb:
        #        minProb = results_train[i][1]
        #for i in range(len(self.data['test']['X'])):
        #    if results[i][1] > maxProb:
        #        maxProb = results[i][1]
        #    if results[i][1] < minProb:
        #        minProb = results[i][1]

        minProb = qy[0]
        maxProb = qy[1]
        delta = maxProb-minProb
        minProb -= delta * 0.01
        maxProb += delta * 0.10

        useSqrt = False

        # fill TRAINING SCORE histogram (class probability)
        h1t = ROOT.TH1D("h1t","h1t",50,0.0,1.0)
        h2t = ROOT.TH1D("h2t","h2t",50,0.0,1.0)
        for i in range(len(self.data['train']['X'])):
            result = (results_train[i][1]-minProb)/(maxProb-minProb)

Example #26

Show file

class CacheTraining(object):
    """Create the cached trees needed for MVA training of one sample identifier.

    For every subsample belonging to ``sampleIdentifier``, every training
    region and both the training and the evaluation cut, a ``TreeCache`` is
    set up; all caches that are missing (or all of them when ``force`` is
    set) are then filled in a single pass over the input tree.
    """

    def __init__(self,
                 config,
                 sampleIdentifier,
                 trainingRegions,
                 splitFilesChunks=1,
                 chunkNumber=1,
                 splitFilesChunkSize=-1,
                 force=False):
        """Read sample definitions, cuts and MVA variables from ``config``.

        Args:
            config: configuration object providing ``get(section, option)``.
            sampleIdentifier: identifier of the sample whose subsamples are
                cached by ``run``.
            trainingRegions: iterable of config section names; each section
                provides 'backgrounds', 'signals', 'treeCut' and 'treeVarSet'.
            splitFilesChunks: forwarded unchanged to ``TreeCache``.
            chunkNumber: forwarded unchanged to ``TreeCache``/``SampleTree``.
            splitFilesChunkSize: forwarded unchanged to
                ``TreeCache``/``SampleTree``.
            force: if true, delete and re-create parts that are already cached.
        """
        self.config = config
        self.force = force
        self.sampleIdentifier = sampleIdentifier
        self.trainingRegions = trainingRegions

        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'MVAin')
        self.samplesDefinitions = self.config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        self.backgroundSampleNames = self._sampleNamesFromConfig('backgrounds')
        self.signalSampleNames = self._sampleNamesFromConfig('signals')
        self.samples = self.samplesInfo.get_samples(
            list(set(self.backgroundSampleNames + self.signalSampleNames)))

        # per training region: the cut string and the list of MVA input
        # variables (collected over all configured systematics)
        self.trainingRegionsDict = {}
        for trainingRegion in self.trainingRegions:
            treeCutName = config.get(trainingRegion, 'treeCut')
            treeVarSet = config.get(trainingRegion, 'treeVarSet').strip()
            systematics = [
                x for x in config.get('systematics', 'systematics').split(' ')
                if x.strip()
            ]
            mvaVars = []
            for systematic in systematics:
                mvaVars += config.get(treeVarSet,
                                      systematic).strip().split(' ')
            self.trainingRegionsDict[trainingRegion] = {
                'cut': config.get('Cuts', treeCutName),
                'vars': mvaVars,
            }

        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')

        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.splitFilesChunkSize = splitFilesChunkSize

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

    def _sampleNamesFromConfig(self, option):
        """Return the unique sample names listed under ``option`` in all training regions."""
        # NOTE: the option values are Python literals and are passed to
        # eval(), so the configuration files must come from a trusted source.
        names = set()
        for trainingRegion in self.trainingRegions:
            names.update(eval(self.config.get(trainingRegion, option)))
        return list(names)

    def printInfo(self):
        """Print a table of all training regions and their cut strings."""
        print("REGION:".ljust(24), "CUT:")
        # items() instead of iteritems(): works on Python 2 and Python 3
        for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.items():
            print(" > ", trainingRegion.ljust(20), trainingRegionInfo['cut'])

    def run(self):
        """Create all missing tree caches for ``self.sampleIdentifier``.

        The input ``SampleTree`` is opened lazily, only once the first cache
        that actually has to be written is encountered, and is processed once
        at the end for all registered caches.
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # cache samples
        # ----------------------------------------------------------------------------------------------------------------------
        sampleToCache = self.sampleIdentifier
        print('*' * 80)
        print(' ', sampleToCache)
        print('*' * 80)
        # prepare caches for training and evaluation samples
        treeCaches = []
        self.sampleTree = None

        # use all (sub)samples which come from the same files (sampleIdentifier)
        subsamples = [
            x for x in self.samples if x.identifier == sampleToCache
        ]

        # list of branches to keep for use as MVA input variables
        # NOTE(review): the same variable list is added once per
        # (subsample, cut) combination, exactly as before; this looks
        # redundant but is kept because BranchList is not visible here.
        branchListOfMVAVars = BranchList()
        for sample in subsamples:
            for trainingRegionInfo in self.trainingRegionsDict.values():
                for additionalCut in [self.TrainCut, self.EvalCut]:
                    branchListOfMVAVars.addCut(trainingRegionInfo['vars'])
        branchListOfMVAVars.addCut(self.config.get('Weights', 'weightF'))
        mvaBranches = branchListOfMVAVars.getListOfBranches()

        # loop over all samples
        for sample in subsamples:

            # add cuts for all training regions
            for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.items():

                # add cuts for training and evaluation
                for additionalCut in [self.TrainCut, self.EvalCut]:

                    # cuts: subsample cut + train/eval cut + region cut
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    if trainingRegionInfo['cut']:
                        sampleCuts.append(trainingRegionInfo['cut'])

                    # add cache object
                    tc = TreeCache.TreeCache(
                        name='{region}_{sample}_{tr}'.format(
                            region=trainingRegion,
                            sample=sample.name,
                            tr='TRAIN'
                            if additionalCut == self.TrainCut else 'EVAL'),
                        sample=sample.name,
                        cutList=sampleCuts,
                        inputFolder=self.samplesPath,
                        splitFilesChunks=self.splitFilesChunks,
                        chunkNumber=self.chunkNumber,
                        splitFilesChunkSize=self.splitFilesChunkSize,
                        branches=mvaBranches,
                        config=self.config,
                        debug=True)

                    # check if this part of the sample is already cached
                    isCached = tc.partIsCached()
                    if isCached and not self.force:
                        continue
                    if isCached:
                        # forced re-caching: remove the old files first
                        tc.deleteCachedFiles(chunkNumber=self.chunkNumber)
                    # for the first sample which comes from this files, load the tree
                    if not self.sampleTree:
                        self.sampleTree = SampleTree(
                            {
                                'name': sample.identifier,
                                'folder': self.samplesPath
                            },
                            splitFilesChunkSize=self.splitFilesChunkSize,
                            chunkNumber=self.chunkNumber,
                            config=self.config,
                            saveMemory=True)
                    treeCaches.append(
                        tc.setSampleTree(self.sampleTree).cache())

        if treeCaches:
            # run on the tree
            self.sampleTree.process()
        else:
            print("nothing to do!")

Example #27

Show file

class MvaTrainingHelper(object):
    def __init__(self, config, mvaName):
        """Collect all settings for the TMVA training of ``mvaName``.

        Args:
            config: configuration object providing ``get(section, option)``.
            mvaName: name of the config section describing this training
                ('treeVarSet', 'MVAtype', 'MVAsettings', 'backgrounds',
                'signals' and 'treeCut' are read from it).
        """
        self.config = config

        # TMVA factory options
        factorySection = 'factory'
        self.factoryname = config.get(factorySection, 'factoryname')
        self.factorysettings = config.get(factorySection, 'factorysettings')

        # input sample location and definitions
        directoriesSection = 'Directories'
        self.samplesPath = config.get(directoriesSection, 'MVAin')
        self.samplesDefinitions = config.get(directoriesSection, 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)

        self.sampleFilesFolder = config.get(directoriesSection, 'samplefiles')

        # settings of this particular training
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.MVAtype = config.get(mvaName, 'MVAtype')
        self.MVAsettings = config.get(mvaName, 'MVAsettings')
        self.mvaName = mvaName

        # load the shared library named in the config
        ROOT.gSystem.Load(config.get('VHbbNameSpace', 'library'))

        # variables: space separated list stored under the 'Nominal' option
        nominalVariables = config.get(self.treeVarSet, 'Nominal')
        self.MVA_Vars = {'Nominal': nominalVariables.strip().split(' ')}

        # samples: the config holds Python literals which are eval()'ed
        namesBackground = eval(config.get(mvaName, 'backgrounds'))
        namesSignal = eval(config.get(mvaName, 'signals'))
        samplesByClass = {}
        samplesByClass['BKG'] = self.samplesInfo.get_samples(namesBackground)
        samplesByClass['SIG'] = self.samplesInfo.get_samples(namesSignal)
        self.samples = samplesByClass

        # cut defining the training region
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # cuts selecting training and evaluation events
        cutsSection = 'Cuts'
        self.TrainCut = config.get(cutsSection, 'TrainCut')
        self.EvalCut = config.get(cutsSection, 'EvalCut')
        print("TRAINING CUT:", self.TrainCut)
        print("EVAL CUT:", self.EvalCut)

        # factor multiplied onto every tree scale in prepare()
        self.globalRescale = 2.0

        outputNameTemplate = 'mvatraining_{factoryname}_{region}.root'
        self.trainingOutputFileName = outputNameTemplate.format(
            factoryname=self.factoryname, region=mvaName)
        print("INFO: MvaTrainingHelper class created.")

    def prepare(self):
        """Create the TMVA factory and register all trees and input variables.

        Opens the training output file, creates the ``ROOT.TMVA.Factory`` and
        adds, for every background and signal sample, the cached tree for the
        training cut (as training tree) and for the evaluation cut (as testing
        tree). Finally all 'Nominal' variables are declared as type 'D'.

        Returns:
            self, to allow call chaining.

        Raises:
            Exception: "CachedTreeMissing" if the cached tree of a sample
                cannot be obtained.
        """
        self.trainingOutputFile = ROOT.TFile.Open(self.trainingOutputFileName,
                                                  "RECREATE")
        # ----------------------------------------------------------------------------------------------------------------------
        # create TMVA factory
        # ----------------------------------------------------------------------------------------------------------------------
        self.factory = ROOT.TMVA.Factory(self.factoryname,
                                         self.trainingOutputFile,
                                         self.factorysettings)
        if self.trainingOutputFile and self.factory:
            print("INFO: initialized MvaTrainingHelper.", self.factory)
        else:
            print(
                "\x1b[31mERROR: initialization of MvaTrainingHelper failed!\x1b[0m"
            )

        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/eval trees
        # ----------------------------------------------------------------------------------------------------------------------
        try:
            addBackgroundTreeMethod = self.factory.AddBackgroundTree
            addSignalTreeMethod = self.factory.AddSignalTree
            self.dataLoader = None
        except AttributeError:
            # the factory has no Add*Tree methods: trees have to be added
            # through a DataLoader instead
            print("oh no..")
            # the DataLoader wants to be called '.'
            self.dataLoader = ROOT.TMVA.DataLoader(".")
            addBackgroundTreeMethod = self.dataLoader.AddBackgroundTree
            addSignalTreeMethod = self.dataLoader.AddSignalTree

        # DEBUG: restrict memory
        # resource.setrlimit(resource.RLIMIT_AS, (4.0*1024*1024*1024, 5.0*1024*1024*1024))

        self.sampleTrees = []
        treeAdders = (
            (addBackgroundTreeMethod, self.samples['BKG']),
            (addSignalTreeMethod, self.samples['SIG']),
        )
        for addTreeFcn, samples in treeAdders:
            for sample in samples:
                print('*' * 80, '\n%s\n' % sample, '*' * 80)
                for additionalCut in [self.TrainCut, self.EvalCut]:
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)

                    tc = TreeCache.TreeCache(sample=sample,
                                             cutList=sampleCuts,
                                             inputFolder=self.samplesPath,
                                             config=self.config,
                                             debug=True)
                    sampleTree = tc.getTree()

                    # validate BEFORE touching sampleTree.tree, so that a
                    # missing cache gives the intended error and not an
                    # AttributeError on a None/invalid object
                    if not sampleTree:
                        print("\x1b[31mERROR: TREE NOT FOUND:", sample.name,
                              " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")

                    sampleTree.tree.SetCacheSize(32 * 1024)

                    # prevent garbage collection
                    self.sampleTrees.append(sampleTree)

                    treeScale = sampleTree.getScale(
                        sample) * self.globalRescale

                    # only non-empty trees can be added
                    if sampleTree.tree.GetEntries() > 0:
                        addTreeFcn(
                            sampleTree.tree, treeScale,
                            ROOT.TMVA.Types.kTraining if additionalCut
                            == self.TrainCut else ROOT.TMVA.Types.kTesting)
                        print('max mem used = %d' % (resource.getrusage(
                            resource.RUSAGE_SELF).ru_maxrss))

        # variables are declared on the DataLoader when one is in use,
        # otherwise directly on the factory
        variableTarget = self.dataLoader if self.dataLoader else self.factory
        for var in self.MVA_Vars['Nominal']:
            variableTarget.AddVariable(var, 'D')

        return self

    # ----------------------------------------------------------------------------------------------------------------------
    # backup old .xml and .info files
    # ----------------------------------------------------------------------------------------------------------------------
    def backupOldFiles(self):
        success = False
        MVAdir = self.config.get('Directories',
                                 'vhbbpath') + '/python/weights/'
        backupDir = MVAdir + 'backup/'
        try:
            os.makedirs(backupDir)
        except:
            pass
        freeNumber = 1
        try:
            lastUsedBackupDirectories = sorted(
                glob.glob(backupDir + '/v*/'),
                key=lambda x: int(x.strip('/').split('/')[-1][1:]),
                reverse=True)
            freeNumber = 1 + int(lastUsedBackupDirectories[0].strip('/').split(
                '/')[-1][1:]) if len(lastUsedBackupDirectories) > 0 else 1
        except Exception as e:
            print("\x1b[31mERROR: creating backup of MVA files failed!", e,
                  "\x1b[0m")
            freeNumber = -1
        if freeNumber > -1:
            try:
                fileNamesToBackup = glob.glob(MVAdir + self.factoryname + '_' +
                                              self.mvaName + '.*')
                fileNamesToBackup += glob.glob(
                    MVAdir + '/../mvatraining_MVA_ZllBDT_*.root')
                os.makedirs(backupDir + 'v%d/' % freeNumber)
                for fileNameToBackup in fileNamesToBackup:
                    shutil.copy(fileNameToBackup,
                                backupDir + 'v%d/' % freeNumber)
                success = True
            except Exception as e:
                print("\x1b[31mERROR: creating backup of MVA files failed!", e,
                      "\x1b[0m")
        return success

    def run(self):
        """Book the MVA method and run TMVA training, testing and evaluation.

        If the optional config value ``MVAGeneral/backupWeights`` evaluates
        to true, the old weight files are backed up first. The method is
        booked directly on the factory; if that fails, the DataLoader-based
        call is used instead. The output file is closed at the end.

        Returns:
            self, to allow call chaining.
        """

        def printMemoryUsage():
            # peak resident set size of this process as reported by getrusage
            print('max mem used = %d' %
                  (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))

        # the option is optional: a missing or unparsable value means 'no backup'
        # NOTE: the value is eval()'ed, the config must be trusted.
        try:
            backupFiles = eval(self.config.get('MVAGeneral', 'backupWeights'))
        except Exception:
            backupFiles = False
        if backupFiles:
            print('backing up old BDT files')
            self.backupOldFiles()
        # ----------------------------------------------------------------------------------------------------------------------
        # Execute TMVA
        # ----------------------------------------------------------------------------------------------------------------------
        self.factory.Verbose()
        printMemoryUsage()
        print('Execute TMVA: factory.BookMethod("%s", "%s", "%s")' %
              (self.MVAtype, self.mvaName, self.MVAsettings))
        printMemoryUsage()
        weightF = self.config.get('Weights', 'weightF')
        try:
            self.factory.BookMethod(self.MVAtype, self.mvaName,
                                    self.MVAsettings)
            print("ROOT 5 style TMVA found")
            self.factory.SetSignalWeightExpression(weightF)
            self.factory.SetBackgroundWeightExpression(weightF)
        except Exception:
            # 'except Exception' instead of a bare except, so Ctrl-C and
            # SystemExit are not mistaken for "old interface not available"
            print("ROOT 6 style TMVA found, using data loader object!!! >_<")
            print(" weights dir:",
                  ROOT.TMVA.gConfig().GetIONames().fWeightFileDir)
            print(" data loader:", self.dataLoader)
            print(" type:       ", self.MVAtype)
            print(" name:       ", self.mvaName)
            print(" settings:   ", self.MVAsettings)
            ROOT.TMVA.gConfig().GetIONames().fWeightFileDir = 'weights'
            self.dataLoader.SetSignalWeightExpression(weightF)
            self.dataLoader.SetBackgroundWeightExpression(weightF)
            self.factory.BookMethod(self.dataLoader, self.MVAtype,
                                    self.mvaName, self.MVAsettings)
        sys.stdout.flush()
        print('Execute TMVA: TrainAllMethods')
        printMemoryUsage()
        self.factory.TrainAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: TestAllMethods')
        printMemoryUsage()
        self.factory.TestAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: EvaluateAllMethods')
        printMemoryUsage()
        self.factory.EvaluateAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: output.Write')
        printMemoryUsage()
        self.trainingOutputFile.Close()
        return self

    def printInfo(self):
        """Pickle the training meta data to '<factoryname>_<mvaName>.info'.

        The file is written to '<vhbbpath>/python/weights/', where vhbbpath
        is read from the 'Directories' section of the config. The pickled
        object is an ``mvainfo`` instance carrying the factory and MVA
        settings, the weight file path, the samples path, the variable set
        name and the list of nominal variables.
        """
        # WRITE INFOFILE
        MVAdir = self.config.get('Directories',
                                 'vhbbpath') + '/python/weights/'
        infoFileName = MVAdir + self.factoryname + '_' + self.mvaName + '.info'

        # build the object first, so that a failure here does not leave an
        # empty (truncated) info file behind
        info = mvainfo(self.mvaName)
        info.factoryname = self.factoryname
        info.factorysettings = self.factorysettings
        info.MVAtype = self.MVAtype
        info.MVAsettings = self.MVAsettings
        info.weightfilepath = MVAdir
        info.path = self.samplesPath
        info.varset = self.treeVarSet
        info.vars = self.MVA_Vars['Nominal']

        # pickle needs a binary file object; 'with' closes it even on errors
        with open(infoFileName, 'wb') as infofile:
            print('@DEBUG: output infofile name')
            print(infofile)
            pickle.dump(info, infofile)

    def getExpectedSignificance(self,
                                tree,
                                nBins,
                                xMin,
                                xMax,
                                power=1.0,
                                rescaleSig=1.0,
                                rescaleBkg=1.0):
        """Histogram the MVA score of `tree` and print a per-bin S/B table.

        Events with ``classID == 1`` are filled into the signal histogram,
        all others into the background histogram, each weighted with
        ``event.weight`` times the corresponding rescale factor. The score
        is read from the event attribute named ``self.mvaName``.

        Args:
            tree: iterable of events providing the score, ``weight`` and
                ``classID`` attributes, and a ``GetEntries()`` method.
            nBins, xMin, xMax: binning of the two histograms.
            power: if different from 1.0, the score is mapped to [0, 1),
                raised to this power and mapped back to the [xMin, xMax) range.
            rescaleSig, rescaleBkg: extra factors applied to the event weight.

        Returns:
            Tuple (expectedSignificance, sSum, bSum) where
            expectedSignificance is sqrt(sum over bins of S*S/(S+B)) and
            sSum/bSum are the summed bin contents.
        """
        signalHist = ROOT.TH1D("hSig", "hSig", nBins, xMin, xMax)
        backgroundHist = ROOT.TH1D("hBkg", "hBkg", nBins, xMin, xMax)
        print("INFO: GetEntries() = ", tree.GetEntries())
        rescaleScore = power != 1.0
        if rescaleScore:
            print("INFO: rescale BDT score with power ", power)

        for event in tree:
            score = getattr(event, self.mvaName)
            if rescaleScore:
                # normalize to the unit interval and clamp before applying the power
                x = (score - xMin) / (xMax - xMin)
                if x < 0:
                    x = 0
                elif x > 0.999999:
                    x = 0.999999
                value = math.pow(x, power) * (xMax - xMin) + xMin
            else:
                # keep the value inside the histogram range (no under-/overflow)
                value = max(min(score, xMax - 0.00001), xMin)

            weight = event.weight
            if event.classID == 1:
                signalHist.Fill(value, weight * rescaleSig)
            else:
                backgroundHist.Fill(value, weight * rescaleBkg)

        ssbSum = 0.0
        sSum = 0
        bSum = 0
        sbTableFormat = "{bin: <16}{signal: <16}{background: <16}{ssb: <16}"
        print("---- nBins =", nBins, " from ", xMin, "..", xMax, "-----")
        print(
            sbTableFormat.format(bin="bin",
                                 signal="signal",
                                 background="background",
                                 ssb="S/sqrt(S+B)"))
        for binIndex in range(nBins):
            # histogram bins are numbered starting from 1
            s = signalHist.GetBinContent(1 + binIndex)
            b = backgroundHist.GetBinContent(1 + binIndex)
            if (s + b) > 0:
                ssbSum += s * s / (s + b)
                ssb = s / math.sqrt(s + b)
            else:
                # empty bin: contributes nothing
                ssb = 0
            sSum += s
            bSum += b
            print(
                sbTableFormat.format(bin=binIndex,
                                     signal=round(s, 1),
                                     background=round(b, 1),
                                     ssb=round(ssb, 3)))

        expectedSignificance = math.sqrt(ssbSum)
        print(
            sbTableFormat.format(bin="SUM",
                                 signal=round(sSum, 1),
                                 background=round(bSum, 1),
                                 ssb="\x1b[34mZ=%1.3f\x1b[0m" %
                                 expectedSignificance))
        print("-" * 40)
        signalHist.Delete()
        backgroundHist.Delete()
        return expectedSignificance, sSum, bSum

    def estimateExpectedSignificance(self):
        """Print expected-significance tables for the test and training trees.

        Opens ``self.trainingOutputFileName`` read-only, evaluates
        ``getExpectedSignificance`` on './TestTree' for several binnings and
        score rescalings, and then on './TrainTree' once without and once
        with normalization to the test tree yields. All results are only
        printed; the method returns None.
        """
        print("INFO: open ", self.trainingOutputFileName)
        rootFile = ROOT.TFile.Open(self.trainingOutputFileName, "READ")
        print("INFO: ->", rootFile)
        try:
            testTree = rootFile.Get('./TestTree')

            # run a few tests with different binnings and rescaling of BDT score
            self.getExpectedSignificance(testTree, 15, -0.8, 1.0)
            self.getExpectedSignificance(testTree, 15, -0.8, 0.9)
            for power in (0.5, 0.33, 1.5, 2.0):
                self.getExpectedSignificance(testTree, 15, -0.8, 0.8,
                                             power=power)

            # close to nominal binning
            print("---- ~nominal TEST -----")
            _, sTest, bTest = self.getExpectedSignificance(
                testTree, 15, -0.8, 0.8)
            print(
                "---- ~nominal TRAINING (without correct normalization) -----")
            trainTree = rootFile.Get('./TrainTree')
            _, sTrain, bTrain = self.getExpectedSignificance(
                trainTree, 15, -0.8, 0.8)

            # the tree ./TrainTree contains the input events for training AFTER re-balancing the classes
            # therefore for SIG/BKG separately the normalization is fixed to the one of the TEST events
            if not sTrain or not bTrain:
                # a zero sum would make the rescale factor undefined
                print("\x1b[31mERROR: training tree has zero signal or "
                      "background sum, normalized estimate skipped\x1b[0m")
                return
            rescaleSig = 1.0 * sTest / sTrain
            rescaleBkg = 1.0 * bTest / bTrain
            print("---- ~nominal TRAINING -----")
            self.getExpectedSignificance(trainTree,
                                         15,
                                         -0.8,
                                         0.8,
                                         rescaleSig=rescaleSig,
                                         rescaleBkg=rescaleBkg)
        finally:
            # release the file handle also when one of the estimates fails
            if rootFile:
                rootFile.Close()

Example #28

Show file

class SampleTreesToNumpyConverter(object):
    """Dump the cached sample trees of one MVA region as numpy arrays.

    For every sample of the signal and background categories the nominal
    input variables, their systematic variations, the event weights and the
    b-tag weight variations are evaluated event by event and collected in
    numpy arrays, separately for the 'train' and 'test' parts of the data.
    run() pickles the result into a gzip file './<mvaName>.dmpz'.
    """

    def __init__(self, config, mvaName):
        """Read region, cuts, variables, weights and samples from the config.

        Args:
            config: config object providing get(section, option).
            mvaName: name of the config section describing the MVA region.
        """
        self.mvaName = mvaName
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
        self.dataFormatVersion = 2
        self.sampleTrees = []
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)

        # region
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/eval sets
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')
        # rescale MC by 2 because of train/eval split
        self.globalRescale = 2.0

        # variables and systematics; empty tokens (e.g. from double spaces)
        # are dropped from both lists
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.systematics = self._splitTokens(
            config.get('systematics', 'systematics'))
        self.MVA_Vars = {
            'Nominal': self._splitTokens(
                config.get(self.treeVarSet, 'Nominal'))
        }
        for systematic in self.systematics:
            self.MVA_Vars[systematic] = self._splitTokens(
                config.get(self.treeVarSet, systematic))

        # weight expressions with the b-tag weight replaced by its variations
        self.weightSYS = []
        self.weightWithoutBtag = self.config.get('Weights', 'weight_noBTag')
        self.weightSYSweights = {}
        for direction in ['Up', 'Down']:
            for syst in [
                    'HFStats1', 'HFStats2', 'LF', 'HF', 'LFStats1', 'LFStats2',
                    'cErr2', 'cErr1', 'JES'
            ]:
                systFullName = "btag_" + syst + "_" + direction
                weightName = "bTagWeightCMVAV2_Moriond_" + syst + direction
                self.weightSYSweights[
                    systFullName] = self.weightWithoutBtag + '*' + weightName
                self.weightSYS.append(systFullName)

        # samples: one signal and one background category
        # NOTE: eval() executes the config values, the config must be trusted
        self.sampleNames = {
            'SIG_ALL': eval(self.config.get('Plot_general', 'allSIG')),
            'BKG_ALL': eval(self.config.get('Plot_general', 'allBKG')),
        }
        self.samples = {
            category: self.samplesInfo.get_samples(samples)
            for category, samples in self.sampleNames.items()
        }

    @staticmethod
    def _splitTokens(text):
        """Split a space separated config value into its non-empty tokens."""
        return [x for x in text.strip().split(' ') if len(x.strip()) > 0]

    def run(self):
        """Fill the numpy arrays from the cached trees and write the dump file.

        Raises:
            Exception: "CachedTreeMissing" if the tree of a sample could not
                be obtained from the tree cache.
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/testing trees
        # ----------------------------------------------------------------------------------------------------------------------
        # a list is needed: the position of a category is used as its target value
        categories = list(self.samples.keys())
        datasetParts = {'train': self.trainCut, 'test': self.evalCut}

        systematics = self.systematics
        arrayLists = {datasetName: [] for datasetName in datasetParts}
        arrayLists_sys = {
            x: {datasetName: []
                for datasetName in datasetParts}
            for x in systematics
        }
        weightLists = {datasetName: [] for datasetName in datasetParts}
        targetLists = {datasetName: [] for datasetName in datasetParts}
        weightListsSYS = {
            x: {datasetName: []
                for datasetName in datasetParts}
            for x in self.weightSYS
        }

        # standard weight expression
        weightF = self.config.get('Weights', 'weightF')

        # the variable lists are the same for all samples
        features = self.MVA_Vars['Nominal']
        features_sys = {x: self.MVA_Vars[x] for x in systematics}
        nFeatures = len(features)

        for categoryIndex, category in enumerate(categories):
            for sample in self.samples[category]:
                print('*' * 80, '\n%s\n' % sample, '*' * 80)
                for datasetName, additionalCut in datasetParts.items():
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)

                    # get ROOT tree for selected sample & region cut
                    tc = TreeCache.TreeCache(sample=sample,
                                             cutList=sampleCuts,
                                             inputFolder=self.samplesPath,
                                             config=self.config,
                                             debug=True)
                    sampleTree = tc.getTree()
                    if not sampleTree:
                        print("\x1b[31mERROR: TREE NOT FOUND:", sample.name,
                              " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")

                    treeScale = sampleTree.getScale(
                        sample) * self.globalRescale
                    print('scale:', treeScale)

                    # initialize numpy array
                    nSamples = sampleTree.GetEntries()
                    print('nFeatures:', nFeatures)
                    inputData = np.zeros((nSamples, nFeatures),
                                         dtype=np.float32)
                    inputData_sys = {
                        x: np.zeros((nSamples, nFeatures), dtype=np.float32)
                        for x in systematics
                    }

                    # initialize formulas for ROOT tree
                    for feature in features:
                        sampleTree.addFormula(feature)
                    for systFeatures in features_sys.values():
                        for feature in systFeatures:
                            sampleTree.addFormula(feature)
                    sampleTree.addFormula(weightF)
                    for syst in self.weightSYS:
                        sampleTree.addFormula(self.weightSYSweights[syst])

                    # fill numpy array from ROOT tree
                    for eventIndex, event in enumerate(sampleTree):
                        for featureIndex, feature in enumerate(features):
                            inputData[eventIndex,
                                      featureIndex] = sampleTree.evaluate(
                                          feature)
                        # total weight comes from weightF (btag, lepton sf, ...) and treeScale to scale MC to x-section
                        totalWeight = treeScale * sampleTree.evaluate(weightF)
                        weightLists[datasetName].append(totalWeight)
                        targetLists[datasetName].append(categoryIndex)

                        # add weights varied by (btag) systematics
                        for syst in self.weightSYS:
                            weightListsSYS[syst][datasetName].append(
                                treeScale * sampleTree.evaluate(
                                    self.weightSYSweights[syst]))

                        # fill systematics
                        for systematic, systFeatures in features_sys.items():
                            for featureIndex, feature in enumerate(
                                    systFeatures):
                                inputData_sys[systematic][
                                    eventIndex,
                                    featureIndex] = sampleTree.evaluate(
                                        feature)

                    arrayLists[datasetName].append(inputData)
                    for systematic in systematics:
                        arrayLists_sys[systematic][datasetName].append(
                            inputData_sys[systematic])

        # concatenate all data from different samples
        self.data = {
            'train': {
                'X': np.concatenate(arrayLists['train'], axis=0),
                'y': np.array(targetLists['train'], dtype=np.float32),
                'sample_weight': np.array(weightLists['train'],
                                          dtype=np.float32),
            },
            'test': {
                'X': np.concatenate(arrayLists['test'], axis=0),
                'y': np.array(targetLists['test'], dtype=np.float32),
                'sample_weight': np.array(weightLists['test'],
                                          dtype=np.float32),
            },
            'category_labels':
            {idx: label
             for idx, label in enumerate(categories)},
            'meta': {
                'version': self.dataFormatVersion,
                'region': self.mvaName,
                'cutName': self.treeCutName,
                'cut': self.treeCut,
                'trainCut': self.trainCut,
                'testCut': self.evalCut,
                'samples': self.sampleNames,
                'weightF': weightF,
                'weightSYS': self.weightSYS,
                'variables': ' '.join(self.MVA_Vars['Nominal'])
            }
        }
        # add systematics variations (stored for the training part only)
        for systematic in systematics:
            self.data['train']['X_' + systematic] = np.concatenate(
                arrayLists_sys[systematic]['train'], axis=0)
        for syst in self.weightSYS:
            self.data['train']['sample_weight_' + syst] = np.array(
                weightListsSYS[syst]['train'], dtype=np.float32)

        numpyOutputFileName = './' + self.mvaName + '.dmpz'
        with gzip.open(numpyOutputFileName, 'wb') as outputFile:
            pickle.dump(self.data, outputFile)
        print(self.data['meta'])
        print("written to:\x1b[34m", numpyOutputFileName, " \x1b[0m")

Example #29

Show file

File: write_numpy_array_for_training.py Project: GLP90/Xbb

class SampleTreesToNumpyConverter(object):
    """Dump the cached sample trees of one MVA region as numpy arrays.

    For every sample of every class the nominal input variables, the event
    weight and a combined systematic uncertainty on the event weight are
    evaluated event by event, separately for the 'train' and 'test' parts of
    the data. run() writes the result as HDF5 file and, if enabled in the
    config, as gzipped pickle file into the './dumps' folder.
    """

    def __init__(self, config, mvaName, useSyst=True, useWeightSyst=True, testRun=False):
        """Read region, cuts, variables, systematics and samples from the config.

        Args:
            config: config object providing get() and has_option().
            mvaName: name of the config section describing the MVA region.
            useSyst: if true, read the weight systematics listed in the
                'systematics' option of the region (if present).
            useWeightSyst: accepted for compatibility, not used in this class.
            testRun: if true, run() only processes the first sample of each
                class.
        """
        self.mvaName = mvaName
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
        self.dataFormatVersion = 3
        self.sampleTrees = []
        self.config = config
        self.testRun = testRun
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)

        # region
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/eval sets
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')

        # rescale MC by 2 because of train/eval split
        self.globalRescale = 2.0

        # variables and systematics
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.MVA_Vars = {'Nominal': [x for x in config.get(self.treeVarSet, 'Nominal').strip().split(' ') if len(x.strip()) > 0]}

        self.weightSYS = []
        self.weightSYSweights = {}

        self.systematics = []
        if useSyst:
            print('INFO: use systematics in training!')
            # NOTE: eval() executes the config value, the config must be trusted
            self.systList = eval(self.config.get(mvaName, 'systematics')) if self.config.has_option(mvaName, 'systematics') else []
            for syst in self.systList:
                # both spellings of the up/down suffix are accepted
                systNameUp = syst + '_UP' if self.config.has_option('Weights', syst + '_UP') else syst + '_Up'
                systNameDown = syst + '_DOWN' if self.config.has_option('Weights', syst + '_DOWN') else syst + '_Down'

                self.systematics.append({
                    'name': syst,
                    'U': self.config.get('Weights', systNameUp),
                    'D': self.config.get('Weights', systNameDown),
                    })

        self.sampleNames, self.categories = self._loadClassDefinitions(mvaName)
        self.samples = {category: self.samplesInfo.get_samples(samples) for category, samples in self.sampleNames.items()}
        if not self.categories:
            self.categories = list(self.samples.keys())
        if self.testRun:
            print("\x1b[31mDEBUG: TEST-RUN, using only small subset of samples!\x1b[0m")

    def _loadClassDefinitions(self, mvaName):
        """Return (sampleNames, categories) for the given MVA region.

        sampleNames maps class name to the list of sample names. categories
        is the list of class names, or None if no explicit classes are
        configured (default: signal vs. background from 'Plot_general').
        For multi-output classifiers the classes are taken from the option
        'classDict' (a dict) or 'classes' (a list of (name, samples) pairs,
        which also fixes the order of the categories).
        """
        # NOTE: eval() executes the config values, the config must be trusted
        if self.config.has_option(mvaName, 'classDict'):
            sampleNames = eval(self.config.get(mvaName, 'classDict'))
            print("classes dict:", sampleNames)
            return sampleNames, list(sampleNames.keys())
        if self.config.has_option(mvaName, 'classes'):
            classes = eval(self.config.get(mvaName, 'classes'))
            return dict(classes), [x[0] for x in classes]
        defaultNames = {
            'SIG_ALL': eval(self.config.get('Plot_general', 'allSIG')),
            'BKG_ALL': eval(self.config.get('Plot_general', 'allBKG')),
        }
        return defaultNames, None

    def run(self):
        """Fill the numpy arrays from the cached trees and write the output.

        Returns:
            True if at least one output file has been written, else False.

        Raises:
            Exception: "CachedTreeMissing" if the tree of a sample could not
                be obtained from the tree cache.
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/testing trees
        # ----------------------------------------------------------------------------------------------------------------------
        categories = self.categories
        if categories:
            print("categories:")
            for i, category in enumerate(categories):
                print(" ", i, ":", category)
        datasetParts = {'train': self.trainCut, 'test': self.evalCut}

        arrayLists = {datasetName: [] for datasetName in datasetParts}
        weightLists = {datasetName: [] for datasetName in datasetParts}
        targetLists = {datasetName: [] for datasetName in datasetParts}
        weightListSYStotal = {datasetName: [] for datasetName in datasetParts}

        # standard weight expression
        weightF = self.config.get('Weights', 'weightF')
        useSpecialWeight = self.config.has_option('Weights', 'useSpecialWeight') and eval(self.config.get('Weights', 'useSpecialWeight'))

        features = self.MVA_Vars['Nominal']
        nFeatures = len(features)

        for categoryIndex, category in enumerate(categories):
            # target value of all events of this class
            targetValue = categories.index(category)
            if self.testRun:
                self.samples[category] = self.samples[category][0:1]
            for sampleIndex, sample in enumerate(self.samples[category]):
                print ('*'*80, '\n%s (category %d/%d sample %d/%d)\n' % (sample, categoryIndex + 1, len(categories), sampleIndex + 1, len(self.samples[category])), '*'*80)
                for datasetName, additionalCut in datasetParts.items():
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)

                    # get ROOT tree for selected sample & region cut
                    tc = TreeCache.TreeCache(
                            sample=sample,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            config=self.config,
                            debug=True
                        )
                    sampleTree = tc.getTree()
                    if not sampleTree:
                        print ("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")

                    treeScale = sampleTree.getScale(sample) * self.globalRescale
                    print ('scale:', treeScale)

                    # initialize numpy array
                    nSamples = sampleTree.GetEntries()
                    print('nFeatures:', nFeatures)
                    inputData = np.zeros((nSamples, nFeatures), dtype=np.float32)

                    # initialize formulas for ROOT tree
                    for feature in features:
                        sampleTree.addFormula(feature)
                    sampleTree.addFormula(weightF)
                    for syst in self.systematics:
                        sampleTree.addFormula(syst['U'])
                        sampleTree.addFormula(syst['D'])
                    if useSpecialWeight:
                        sampleTree.addFormula(sample.specialweight)

                    # fill numpy array from ROOT tree; the indices are named
                    # differently from the outer loops so that they do not
                    # overwrite the category/sample counters
                    for eventIndex, event in enumerate(sampleTree):
                        for featureIndex, feature in enumerate(features):
                            inputData[eventIndex, featureIndex] = sampleTree.evaluate(feature)
                        # total weight comes from weightF (btag, lepton sf, ...) and treeScale to scale MC to x-section
                        eventWeight = sampleTree.evaluate(weightF)
                        specialWeight = sampleTree.evaluate(sample.specialweight) if useSpecialWeight else 1.0
                        totalWeight = treeScale * eventWeight * specialWeight
                        weightLists[datasetName].append(totalWeight)
                        targetLists[datasetName].append(targetValue)

                        # symmetrized up/down variations, added in quadrature
                        deltas = []
                        for syst in self.systematics:
                            delta_up = sampleTree.evaluate(syst['U']) - eventWeight
                            delta_down = sampleTree.evaluate(syst['D']) - eventWeight
                            delta = 0.5 * (np.abs(delta_up) + np.abs(delta_down))
                            deltas.append(delta * delta)
                        totalDelta = np.sqrt(sum(deltas))

                        # convert to absolute error on total event weight
                        weightListSYStotal[datasetName].append(treeScale * totalDelta * specialWeight)

                    arrayLists[datasetName].append(inputData)

        puresystematics = [x['name'] for x in self.systematics]

        # concatenate all data from different samples
        self.data = {
                'train': {
                    'X': np.concatenate(arrayLists['train'], axis=0),
                    'y': np.array(targetLists['train'], dtype=np.float32),
                    'sample_weight': np.array(weightLists['train'], dtype=np.float32),
                    'sample_weight_error': np.array(weightListSYStotal['train'], dtype=np.float32),
                    },
                'test': {
                    'X': np.concatenate(arrayLists['test'], axis=0),
                    'y': np.array(targetLists['test'], dtype=np.float32),
                    'sample_weight': np.array(weightLists['test'], dtype=np.float32),
                    'sample_weight_error': np.array(weightListSYStotal['test'], dtype=np.float32),
                    },
                'category_labels': {idx: label for idx, label in enumerate(categories)},
                'meta': {
                    'version': self.dataFormatVersion,
                    'region': self.mvaName,
                    'cutName': self.treeCutName,
                    'cut': self.treeCut,
                    'trainCut': self.trainCut,
                    'testCut': self.evalCut,
                    'samples': self.sampleNames,
                    'weightF': weightF,
                    'weightSYS': self.weightSYS,
                    'variables': ' '.join(self.MVA_Vars['Nominal']),
                    'systematics': puresystematics,
                    }
                }

        if not os.path.exists("./dumps"):
            os.makedirs("dumps")
        baseName = './dumps/' + self.config.get("Directories", "Dname").split("_")[1] + '_' + self.mvaName + '_' + datetime.datetime.now().strftime("%y%m%d")
        numpyOutputFileName = baseName + '.dmpz'
        hdf5OutputFileName = baseName + '.h5'
        print("INFO: saving output...")

        # the two output formats are written independently of each other
        success = False
        try:
            if self.config.has_option(self.mvaName, 'writeNumpy') and eval(self.config.get(self.mvaName, 'writeNumpy')):
                self.saveAsPickledNumpy(numpyOutputFileName)
                success = True
        except Exception as e:
            print("ERROR: writing numpy array failed.", e)

        try:
            self.saveAsHDF5(hdf5OutputFileName)
            success = True
        except Exception as e:
            print("ERROR: writing HDF5 file failed.", e)

        if success:
            print("INFO: done.")
            return True
        else:
            print("ERROR: no output file written")
            return False

    def saveAsPickledNumpy(self, outputFileName):
        """Write self.data as gzip compressed pickle file."""
        with gzip.open(outputFileName, 'wb') as outputFile:
            pickle.dump(self.data, outputFile)
        print("written to:\x1b[34m", outputFileName, " \x1b[0m")

    def saveAsHDF5(self, outputFileName):
        """Write self.data as HDF5 file.

        'meta' and 'category_labels' are stored as JSON encoded lists of
        (key, value) pairs in the file attributes, the 'train' and 'test'
        arrays as gzip compressed datasets '<part>/<name>'.
        """
        # 'with' closes the file also if writing one of the datasets fails
        with h5py.File(outputFileName, 'w') as f:
            for k in ['meta', 'category_labels']:
                f.attrs[k] = json.dumps(list(self.data[k].items()))
            for k in ['train', 'test']:
                for k2 in self.data[k].keys():
                    f.create_dataset(k + '/' + k2, data=self.data[k][k2], compression="gzip", compression_opts=9)
        print("written to:\x1b[34m", outputFileName, " \x1b[0m")

Example #30

Show file

File: write_regression_systematics.py Project: perrozzi/VHbb

# yields as configured in the 'AngularLike' section
# NOTE: eval() executes the config value, the config must be trusted
ang_yield = eval(config.get('AngularLike', 'yields'))

#path=opts.path
# input and output locations of the samples, and the scratch directory
pathIN = config.get('Directories', 'SYSin')
pathOUT = config.get('Directories', 'SYSout')
tmpDir = os.environ["TMPDIR"]

print('INput samples:\t%s' % pathIN)
print('OUTput samples:\t%s' % pathOUT)

#storagesamples = config.get('Directories','storagesamples')

# the names are passed as one comma separated option
namelist = opts.names.split(',')

# load the sample definitions
info = ParseInfo(samplesinfo, pathIN)


def deltaPhi(phi1, phi2):
    """Return the angle difference phi1 - phi2 wrapped into (-pi, pi].

    A difference that is not finite (NaN or infinite) yields NaN.
    """
    result = phi1 - phi2
    if math.isnan(result) or math.isinf(result):
        # no meaningful angle; the wrap-around loops below would never
        # terminate for an infinite value
        return float('nan')
    twoPi = 2 * math.pi
    # reduce large values in one exact step: subtracting 2*pi repeatedly is
    # slow and does not change the value at all once it exceeds the floating
    # point precision (e.g. 1e20 - 2*pi == 1e20)
    result = math.fmod(result, twoPi)
    while (result > math.pi):
        result -= twoPi
    while (result <= -math.pi):
        result += twoPi
    return result


def resolutionBias(eta):
    if (eta < 0.5): return 0.052
    if (eta < 1.1): return 0.057
    if (eta < 1.7): return 0.096

Example #31

Show file

File: cache_plot.py Project: acalandr/Xbb

class CachePlot(object):
    """Create the cached trees needed to plot one sample in a set of regions.

    For every plot region and every (sub)sample that belongs to the given
    sample identifier a TreeCache is configured with the combined cuts. Parts
    which are not cached yet (or all parts, if forceRedo is set) are written
    while the sample tree is processed.
    """

    def __init__(self,
                 config,
                 sampleIdentifier,
                 regions,
                 splitFilesChunks=1,
                 chunkNumber=1,
                 splitFilesChunkSize=-1,
                 forceRedo=False,
                 fileList=None):
        """Read sample and region definitions from the config.

        Args:
            config: config object providing get(), has_option() and sections().
            sampleIdentifier: identifier of the sample whose trees are cached.
            regions: iterable of plot region names, duplicates are dropped.
            splitFilesChunks: passed through to TreeCache.
            chunkNumber: chunk to process, passed to TreeCache and SampleTree.
            splitFilesChunkSize: passed through to TreeCache and SampleTree.
            forceRedo: if true, already cached parts are deleted and recreated.
            fileList: compressed file list understood by FileList.decompress,
                or None.
        """
        self.config = config
        self.sampleIdentifier = sampleIdentifier
        self.regions = list(set(regions))
        self.forceRedo = forceRedo

        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'plottingSamples')
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath,
                                     config=self.config)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        # NOTE(review): eval() runs the config values as Python code, so the
        # config file has to come from a trusted source
        self.sampleNames = list(
            eval(self.config.get('Plot_general', 'samples')))
        self.dataNames = list(eval(self.config.get('Plot_general', 'Data')))
        self.samples = self.samplesInfo.get_samples(self.sampleNames +
                                                    self.dataNames)

        # region name -> {'cut': cut string from the 'Cuts' section}
        self.regionsDict = {}
        for region in self.regions:
            treeCut = config.get('Cuts', region)
            self.regionsDict[region] = {'cut': treeCut}
        self.splitFilesChunkSize = splitFilesChunkSize
        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.fileList = FileList.decompress(fileList) if fileList else None

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print(
                "\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"
                % returnCode)
        else:
            print("INFO: loaded VHbbNameSpace: %s" % VHbbNameSpace)

    def printInfo(self):
        """Print a table of all regions and their cuts."""
        print("REGION:".ljust(24), "CUT:")
        # items() instead of iteritems(): works in Python 2 and Python 3
        for region, regionInfo in self.regionsDict.items():
            print(" > ", region.ljust(20), regionInfo['cut'])

    def _getBranchesToKeep(self):
        """Return the list of branches that have to be kept in the cached trees."""
        # keep additional branches for plotting (both options are optional)
        try:
            keepBranchesPlot = eval(
                self.config.get('Branches', 'keep_branches_plot'))
        except Exception:
            keepBranchesPlot = []
        try:
            keepBranchesPlot += eval(
                self.config.get('Branches', 'keep_branches'))
        except Exception:
            pass

        # also keep some branches which might be used later in variables definition and weights
        try:
            for section in self.config.sections():
                try:
                    if section.startswith(
                            'plotDef:') and self.config.has_option(
                                section, 'relPath'):
                        keepBranchesPlot.append(
                            self.config.get(section, 'relPath'))
                except Exception as e:
                    print("\x1b[31mWARNING: config error in:", section, "=>",
                          e, "\x1b[0m")
        except Exception as e2:
            print(
                "\x1b[31mERROR: config file contains an error! automatic selection of branches to keep will not work!\x1b[0m"
            )
            print(e2)
        try:
            keepBranchesPlot.append(self.config.get('Weights', 'weightF'))
        except Exception:
            # the weight expression is optional here
            pass
        # plotting region cut
        for regionInfo in self.regionsDict.values():
            keepBranchesPlot.append(regionInfo['cut'])
        return BranchList(keepBranchesPlot).getListOfBranches()

    def _getSampleCuts(self, sample, region, regionInfo):
        """Return the list of cuts applied to `sample` in `region`."""
        configSection = 'Plot:%s' % region

        sampleCuts = [sample.subcut]
        if regionInfo['cut']:
            sampleCuts.append(regionInfo['cut'])
        if self.config.has_option(configSection, 'Datacut'):
            sampleCuts.append(self.config.get(configSection, 'Datacut'))
        if self.config.has_option('Plot_general', 'addBlindingCut'):
            # append the cut string itself; has_option() only returns a
            # boolean, which is not a cut expression
            sampleCuts.append(
                self.config.get('Plot_general', 'addBlindingCut'))
        return sampleCuts

    def run(self):
        """Create all missing caches for the sample identifier.

        Raises:
            Exception: "CreationOfSampleTreeFailed" if the sample tree could
                not be created, "SampleFilesHaveChanged" if the files of the
                chunk differ from the file list given to the constructor.
        """
        keepBranchesPlotFinal = self._getBranchesToKeep()
        print("KEEP:", keepBranchesPlotFinal)

        # ----------------------------------------------------------------------------------------------------------------------
        # cache samples
        # ----------------------------------------------------------------------------------------------------------------------
        for sampleToCache in [self.sampleIdentifier]:
            print('*' * 80)
            print(' ', sampleToCache)
            print('*' * 80)
            # caches which still have to be written for this sample
            treeCaches = []

            # for all (sub)samples which come from the same files (sampleIdentifier)
            subsamples = [
                x for x in self.samples if x.identifier == sampleToCache
            ]
            for sample in subsamples:

                # add cuts for all plot regions
                for region, regionInfo in self.regionsDict.items():
                    sampleCuts = self._getSampleCuts(sample, region,
                                                     regionInfo)

                    # arbitrary (optional) name for the output tree, used for print-out (the TreeCache object has no idea what it is doing, e.g. dc, plot etc.)
                    cacheName = 'plot:{region}_{sample}'.format(
                        region=region, sample=sample.name)

                    # add cache object
                    tc = TreeCache.TreeCache(
                        name=cacheName,
                        sample=sample.name,
                        cutList=sampleCuts,
                        inputFolder=self.samplesPath,
                        splitFilesChunks=self.splitFilesChunks,
                        chunkNumber=self.chunkNumber,
                        splitFilesChunkSize=self.splitFilesChunkSize,
                        fileList=self.fileList,
                        branches=keepBranchesPlotFinal,
                        config=self.config,
                        debug=True)

                    # check if this part of the sample is already cached
                    isCached = tc.partIsCached()
                    if not isCached or self.forceRedo:
                        if isCached:
                            tc.deleteCachedFiles(chunkNumber=self.chunkNumber)

                        # for the first sample which comes from this files, load the tree
                        if not self.sampleTree:
                            self.sampleTree = SampleTree(
                                {
                                    'name': sample.identifier,
                                    'folder': self.samplesPath
                                },
                                splitFilesChunkSize=self.splitFilesChunkSize,
                                chunkNumber=self.chunkNumber,
                                config=self.config,
                                saveMemory=True)
                            if not self.sampleTree or not self.sampleTree.tree:
                                print(
                                    "\x1b[31mERROR: creation of sample tree failed!!\x1b[0m"
                                )
                                raise Exception("CreationOfSampleTreeFailed")
                            # consistency check on the file list at submission time and now
                            fileListNow = self.sampleTree.getSampleFileNameChunk(
                                self.chunkNumber)
                            if self.fileList and (sorted(self.fileList) !=
                                                  sorted(fileListNow)):
                                print(
                                    "\x1b[31mERROR: sample files have changed between submission and run of the job!\x1b[0m"
                                )
                                raise Exception("SampleFilesHaveChanged")

                        treeCaches.append(
                            tc.setSampleTree(self.sampleTree).cache())
                    else:
                        print("INFO: already cached!", tc, "(", tc.hash, ")")

            if treeCaches:
                # run on the tree
                self.sampleTree.process()
            else:
                print("nothing to do!")

Example #32

Show file

File: tree_stack.py Project: jmduarte/Xbb

print "Compile external macros"
print "=======================\n"

# get locations:
Wdir = config.get('Directories',
                  'Wdir')  # working directory containing the output
samplesinfo = config.get('Directories', 'samplesinfo')  # samples_nosplit.cfg

path = config.get('Directories',
                  'plottingSamples')  # from which samples to plot

# name of the config section of the plot region
section = 'Plot:%s' % region

info = ParseInfo(
    samplesinfo, path
)  # creates a list of samples by reading the info in samples_nosplit.cfg and the content of the path.

# load the compiled macros only if the shared libraries exist
import os
if os.path.exists("../interface/DrawFunctions_C.so"):
    print 'ROOT.gROOT.LoadMacro("../interface/DrawFunctions_C.so")'
    ROOT.gROOT.LoadMacro("../interface/DrawFunctions_C.so")

if os.path.exists("../interface/VHbbNameSpace_h.so"):
    print 'ROOT.gROOT.LoadMacro("../interface/VHbbNameSpace_h.so")'
    ROOT.gROOT.LoadMacro("../interface/VHbbNameSpace_h.so")


#----------Histo from trees------------
#Get the selections and the samples
def doPlot():

Example #33

Show file

class PlotHelper(object):
    """Build and draw stacked histograms for all variables of one plot region.

    Typical use is ``PlotHelper(config, region).prepare().run()``: prepare()
    fills one StackMaker per variable from the cached sample trees, run()
    draws them.
    """

    def __init__(self, config, region, vars=None, title=None):
        """Read region, variable and sample definitions from the config.

        Args:
            config: config object providing get() and has_option().
            region: name of the plot region, the section 'Plot:<region>' is used.
            vars: optional list of variable names; if empty or None the
                comma separated 'vars' option of the region section is used.
            title: optional plot title, an empty title is treated as None.
        """
        self.config = config
        self.region = region
        self.vars = vars
        self.title = title if title and len(title) > 0 else None

        # VHbb namespace
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print(
                "\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"
                % returnCode)
        else:
            print("INFO: loaded VHbbNameSpace: %s" % VHbbNameSpace)

        # additional blinding cut:
        self.addBlindingCut = None
        if self.config.has_option(
                'Plot_general', 'addBlindingCut'
        ):  #contained in plots, cut on the event number
            self.addBlindingCut = self.config.get('Plot_general',
                                                  'addBlindingCut')
            print('adding add. blinding cut:', self.addBlindingCut)

        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')

        # plot regions
        self.configSection = 'Plot:%s' % region
        # drop empty entries from a variable list given by the caller
        if self.vars and isinstance(self.vars, list):
            self.vars = [x.strip() for x in self.vars if len(x.strip()) > 0]

        # if no variables were given, read them from the config
        if not self.vars or len(self.vars) < 1:
            varListFromConfig = self.config.get(self.configSection,
                                                'vars').split(',')
            print("VARS::", self.configSection, " => ", varListFromConfig)
            self.vars = [
                x.strip() for x in varListFromConfig if len(x.strip()) > 0
            ]

        # load samples
        # NOTE(review): eval() runs the config values as Python code, so the
        # config file has to come from a trusted source
        self.data = eval(self.config.get(
            self.configSection,
            'Datas'))  # read the data corresponding to each CR (section)
        self.mc = eval(self.config.get(
            'Plot_general', 'samples'))  # read the list of mc samples
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        self.signalRegion = False
        if self.config.has_option(self.configSection, 'Signal'):
            self.mc.append(self.config.get(self.configSection, 'Signal'))
            self.signalRegion = True
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)

        self.groupDict = eval(self.config.get('Plot_general', 'Group'))
        self.subcutPlotName = ''
        self.histogramStacks = {}

    def prepare(self):
        """Create one StackMaker per variable and add all cached sample trees.

        Returns:
            self, to allow chaining.

        Raises:
            Exception: "CachedTreeMissing" if the cached tree of a sample is
                not available.
        """
        # use the instance attributes here: this method has no local or
        # module level 'region'/'config' of its own to rely on
        print(
            "INFO: starting plot for region \x1b[34m{region}\x1b[0m, variables:"
            .format(region=self.region))
        for var in self.vars:
            print("  > {var}".format(var=var))

        self.histogramStacks = {}
        for var in self.vars:
            self.histogramStacks[var] = StackMaker(self.config,
                                                   var,
                                                   self.region,
                                                   self.signalRegion,
                                                   None,
                                                   '_' + self.subcutPlotName,
                                                   title=self.title)

        # add DATA + MC samples
        for sample in self.dataSamples + self.mcSamples:

            # cuts
            sampleCuts = [sample.subcut]
            if self.config.has_option('Cuts', self.region):
                sampleCuts.append(self.config.get('Cuts', self.region))
            if self.config.has_option(self.configSection, 'Datacut'):
                sampleCuts.append(
                    self.config.get(self.configSection, 'Datacut'))
            if self.addBlindingCut:
                sampleCuts.append(self.addBlindingCut)

            # get sample tree from cache
            tc = TreeCache.TreeCache(sample=sample,
                                     cutList=sampleCuts,
                                     inputFolder=self.samplesPath,
                                     config=self.config)
            sampleTree = tc.getTree()

            if sampleTree:
                groupName = self.groupDict[sample.name]
                print(" > found the tree, #entries = ",
                      sampleTree.tree.GetEntries())
                print("   > group =", groupName)
                print(" > now adding the tree for vars=", self.vars)

                # add the sample tree for all the variables
                for var in self.vars:
                    self.histogramStacks[var].addSampleTree(
                        sample=sample,
                        sampleTree=sampleTree,
                        groupName=groupName)
            else:
                print("\x1b[31mERROR: sampleTree not available for ", sample,
                      ", run caching again!!\x1b[0m")
                raise Exception("CachedTreeMissing")
        return self

    def run(self):
        """Draw every variable twice: once as is and once with normalize=True.

        Returns:
            self, to allow chaining.
        """
        # draw
        for var in self.vars:
            self.histogramStacks[var].Draw(outputFolder=self.plotPath,
                                           prefix='{region}__{var}_'.format(
                                               region=self.region, var=var))
            self.histogramStacks[var].Draw(
                outputFolder=self.plotPath,
                prefix='comp_{region}__{var}_'.format(region=self.region,
                                                      var=var),
                normalize=True)

        return self

    def getHistogramStack(self, var):
        """Return the histogram stack of `var`, or None if there is none."""
        if var in self.vars and var in self.histogramStacks:
            return self.histogramStacks[var]
        return None

Example #34

Show file

from ROOT import TAxis
from ROOT import TLorentzVector
from ROOT import TMath
from ROOT import TLegend
#from ROOT import cmath

from ROOT import gStyle
from ROOT import gPad

from ROOT import TCanvas, TColor, TGaxis, TH1F, TPad
from ROOT import kBlack, kBlue, kRed, kViolet

# load configuration and list of used samples
config = XbbConfigReader.read('Zll2018')
path = "Zll2018config/samples_nosplit.ini"
# NOTE(review): config is passed both as first positional argument and as
# keyword argument here - confirm this against the ParseInfo signature
sampleInfo = ParseInfo(config, path, config=config)

# sample objects for the MC sample names returned by XbbConfigTools.getMC()
usedSamples = sampleInfo.get_samples(XbbConfigTools(config).getMC())
#usedSamples = sampleInfo.get_samples(['ZJetsHT100', 'ZH_Znunu'])

# unique identifiers of the used samples (the order is not defined)
usedSampleIdentifiers = list(set([x.identifier for x in usedSamples]))
print('usedSampleIdentifiers', usedSampleIdentifiers)

# some samples come from the same set of ROOT trees (= have the same identifier)
# -> find the list of unique identifiers to avoid processing the same tree file twice
#sampleIdentifiers = sampleInfo.getSampleIdentifiers()
#usedSampleIdentifiers = ParseInfo.filterIdentifiers(sampleIdentifiers, usedSamples)

# from which step to take the root trees
directory = config.get('Directories', 'sysOUT4')

Example #35

Show file

# read the main config given on the command line
config = BetterConfigParser()
config.read(opts.config)

#namelist=opts.names.split(',')
#print "namelist:",namelist

pathIN = config.get('Directories', 'PREPin')
pathOUT = config.get('Directories', 'PREPout')
# the sample definitions are kept in a separate config file
samplesinfo = config.get('Directories', 'samplesinfo')
sampleconf = BetterConfigParser()
sampleconf.read(samplesinfo)

prefix = sampleconf.get('General', 'prefix')

info = ParseInfo(samples_path=pathIN, config=config)
print "samplesinfo:", samplesinfo
# collect one cross section per sample identifier, in order of first appearance
cross_sections = {}
samples = []
for job in info:
    if not job.identifier in samples:
        # if the cross section is a list, only its first entry is used
        if type(job.xsec) is list: job.xsec = job.xsec[0]
        cross_sections[job.identifier] = job.xsec
        samples.append(job.identifier)

# print a tab separated table: identifier, cross section
for sample in samples:
    print sample, "\t", cross_sections[sample]
#    print dir(job)
#    print "job.name:",job.name," job.cross_section:",job.xsec
#    print "job.prefix:",job.prefix
#    if not job.name in namelist:

Example #36

Show file

    def __init__(self, opts):
        """Set up the sub jobs for adding collections to the files of one sample.

        Reads from opts: fileList, config, inputDir, outputDir,
        sampleIdentifier, addCollections and join.
        Exits with code 1 unless exactly one sample matches
        opts.sampleIdentifier.
        """

        # get file list
        # NOTE(review): for an empty opts.fileList this is None and the len()
        # calls below raise TypeError - confirm whether an empty list is a valid input
        self.filelist = FileList.decompress(opts.fileList) if len(opts.fileList) > 0 else None
        print "len(filelist)",len(self.filelist),
        if len(self.filelist) > 0:
            print "filelist[0]:", self.filelist[0]
        else:
            print ''

        # config
        self.debug = 'XBBDEBUG' in os.environ
        self.verifyCopy = True
        self.opts = opts
        self.config = BetterConfigParser()
        self.config.read(opts.config)
        self.channel = self.config.get('Configuration', 'channel')

        # load namespace, TODO
        # NOTE(review): the return code of Load() is not checked here
        VHbbNameSpace = self.config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

        # directories
        # opts.inputDir / opts.outputDir are names of options in the 'Directories' section
        self.pathIN = self.config.get('Directories', opts.inputDir)
        self.pathOUT = self.config.get('Directories', opts.outputDir)
        self.tmpDir = self.config.get('Directories', 'scratch')
        print 'INput samples:\t%s'%self.pathIN
        print 'OUTput samples:\t%s'%self.pathOUT

        self.fileLocator = FileLocator(config=self.config)

        # check if given sample identifier uniquely matches a samples from config
        matchingSamples = ParseInfo(samples_path=self.pathIN, config=self.config).find(identifier=opts.sampleIdentifier)
        if len(matchingSamples) != 1:
            print "ERROR: need exactly 1 sample identifier as input with -S !!"
            print matchingSamples
            exit(1)
        self.sample = matchingSamples[0]

        # collections
        # comma separated list from the command line, empty entries are dropped
        self.collections = [x.strip() for x in opts.addCollections.split(',') if len(x.strip()) > 0] if len(opts.addCollections.strip())>0  else []
        if len(self.collections) < 1:
            print "\x1b[31mWARNING: no collections added! Specify the collections to add with the --addCollections option!\x1b[0m"
        print 'collections to add:', self.collections
        self.collections = self.parseCollectionList(self.collections)
        print 'after parsing:', self.collections

        # temporary folder to save the files of this job on the scratch
        # (a random uuid keeps the folders of different jobs apart)
        temporaryName = self.sample.identifier + '/' + uuid.uuid4().hex

        # input files
        # each sub job is a dict with the keys 'inputFileNames',
        # 'localInputFileNames', 'outputFileName' and 'tmpFileName'
        self.subJobs = []
        if opts.join:
            print("INFO: join input files! This is an experimental feature!")

            # translate naming convention of .txt file to imported files after the prep step
            inputFileNamesAfterPrep = [self.fileLocator.getFilenameAfterPrep(x) for x in self.filelist]

            # one single sub job for all files, named after the first input file
            self.subJobs.append({
                'inputFileNames': self.filelist,
                'localInputFileNames': ["{path}/{subfolder}/{filename}".format(path=self.pathIN, subfolder=self.sample.identifier, filename=localFileName) for localFileName in inputFileNamesAfterPrep],
                'outputFileName': "{path}/{subfolder}/{filename}".format(path=self.pathOUT, subfolder=self.sample.identifier, filename=inputFileNamesAfterPrep[0]),
                'tmpFileName': "{path}/{subfolder}/{filename}".format(path=self.tmpDir, subfolder=temporaryName, filename=inputFileNamesAfterPrep[0]),
                })

        else:
            
            # create separate subjob for all files (default!)
            for inputFileName in self.filelist:
                inputFileNamesAfterPrep = [self.fileLocator.getFilenameAfterPrep(inputFileName)]

                self.subJobs.append({
                    'inputFileNames': [inputFileName],
                    'localInputFileNames': ["{path}/{subfolder}/{filename}".format(path=self.pathIN, subfolder=self.sample.identifier, filename=localFileName) for localFileName in inputFileNamesAfterPrep],
                    'outputFileName': "{path}/{subfolder}/{filename}".format(path=self.pathOUT, subfolder=self.sample.identifier, filename=inputFileNamesAfterPrep[0]),
                    'tmpFileName': "{path}/{subfolder}/{filename}".format(path=self.tmpDir, subfolder=temporaryName, filename=inputFileNamesAfterPrep[0]),
                    })

Example #37

Show file

File: write_regression_systematics.py Project: dcurry09/Xbb

# input and output sample locations are taken from the config
# path=opts.path
pathIN = config.get("Directories", "SYSin")
pathOUT = config.get("Directories", "SYSout")
# raises KeyError if the TMPDIR environment variable is not set
tmpDir = os.environ["TMPDIR"]

print "INput samples:\t%s" % pathIN
print "OUTput samples:\t%s" % pathOUT


# storagesamples = config.get('Directories','storagesamples')


# comma separated list of names given on the command line
namelist = opts.names.split(",")

# load the sample definitions for the input path
info = ParseInfo(samplesinfo, pathIN)


def deltaPhi(phi1, phi2):
    """Return phi1 - phi2 shifted by full turns into the interval (-pi, pi]."""
    upperEdge = math.pi
    lowerEdge = -math.pi
    angle = phi1 - phi2
    # first bring the angle down to at most +pi ...
    while not angle <= upperEdge and angle == angle:
        angle -= 2 * math.pi
    # ... then lift it above -pi
    while angle <= lowerEdge:
        angle += 2 * math.pi
    return angle


def resolutionBias(eta):
    if eta < 0.5:
        return 0.052
    if eta < 1.1:

Example #38

Show file

File: submitThem_old.py Project: jmduarte/Xbb

    train_list = (config.get('MVALists', 'List_for_submitscript')).split(',')
    print train_list
    for item in train_list:
        submit(item, repDict)

# datacard task: read the comma separated list of datacard names
if opts.task == 'dc':
    DC_vars = (config.get('LimitGeneral', 'List')).split(',')
    print DC_vars

# plot tasks: read the comma separated list of plots, default is one empty name
Plot_vars = ['']
if opts.task == 'plot' or opts.task == 'singleplot' or opts.task == 'mergesingleplot' or opts.task == 'checksingleplot':
    Plot_vars = (config.get('Plot_general', 'List')).split(',')

# every task except 'prep' loads the sample definitions
if not opts.task == 'prep':
    path = config.get("Directories", "samplepath")
    info = ParseInfo(samplesinfo, path)

# submit one job per plot
if opts.task == 'plot':
    repDict['queue'] = 'all.q'
    for item in Plot_vars:
        submit(item, repDict)

if opts.task == 'trainReg':
    repDict['queue'] = 'all.q'
    submit('trainReg', repDict)

# submit one job per datacard
elif opts.task == 'dc':
    repDict['queue'] = 'all.q'
    for item in DC_vars:
        # item here contains the dc name
        submit(item, repDict)

Example #39

Show file

File: workspace_datacard.py Project: dcurry09/Heppy

# MC rescale factor
# NOTE(review): both branches set the factor to 1.0, so no rescaling is
# applied regardless of TrainFlag - confirm that the factor of 2 mentioned
# in the commented-out print is intentionally disabled
if TrainFlag:
    MC_rescale_factor=1.
    #print 'I RESCALE BY 2.0'
else: MC_rescale_factor = 1.

# systematics up/down
# NOTE(review): doSYS is compared to the string 'False', not the boolean

if doSYS == 'False':
    UD = []
else:
    UD = ['Up','Down']

# parse samples configuration
info = ParseInfo(samplesinfo,path)

# look up the sample objects for each group of sample names
all_samples        = info.get_samples(signals+backgrounds+additionals)
signal_samples     = info.get_samples(signals) 
background_samples = info.get_samples(backgrounds) 
# data sample names are given as a space separated list in the config
data_sample_names  = config.get('dc:%s'%var,'data').split(' ')
data_samples       = info.get_samples(data_sample_names)

print '\n-----> Collecting all Samples...'
print '         Signals     : ', signals
print '         Backgrounds : ', backgrounds
print '         Data        : ', data_sample_names


#-------------------------------------------------------------------------------------------------

Example #40

Show file

File: run_training_scikit.py Project: GLP90/Xbb

    def __init__(self, config, mvaName):
        """Read the training setup for `mvaName` from the config.

        Args:
            config: config object providing get().
            mvaName: name of the config section of the MVA; the options
                'treeVarSet', 'MVAsettings', 'backgrounds', 'signals' and
                'treeCut' are read from it.

        Raises:
            IndexError: if an entry of the ':' separated MVAsettings string
                contains no '='.
        """
        self.dataRepresentationVersion = 2
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.logpath = config.get('Directories', 'logpath')
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.mvaName = mvaName
        self.MVAsettings = config.get(mvaName, 'MVAsettings')
        self.factoryname = 'scikit-test1'

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

        # variables: space separated list in the 'Nominal' option
        self.MVA_Vars = {}
        self.MVA_Vars['Nominal'] = config.get(self.treeVarSet, 'Nominal').strip().split(' ')

        # samples
        # NOTE(review): eval() runs the config values as Python code, so the
        # config file has to come from a trusted source
        self.backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
        self.signalSampleNames = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(self.backgroundSampleNames),
            'SIG': self.samplesInfo.get_samples(self.signalSampleNames),
        }

        # MVA signal region cuts
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/test samples
        self.datasets = ['train', 'test']
        self.varsets = ['X', 'y', 'sample_weight']
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')

        print("TRAINING CUT:", self.trainCut)
        print("TEST CUT:", self.evalCut)

        self.globalRescale = 2.0

        # default parameters
        self.parameters = {
                'factoryname': self.factoryname,
                'mvaName': self.mvaName,
                'MVAregionCut': self.treeCutName + ': ' + self.treeCut,
                #'classifier': 'GradientBoostingClassifier',
                'classifier': 'RandomForestClassifier',
                #'classifier': 'ExtraTreesClassifier',
                #'classifier': 'FT_GradientBoostingClassifier',
                'max_depth': None,
                'max_leaf_nodes': None,
                'class_weight': 'balanced',
                #'criterion': 'friedman_mse',
                'criterion': 'gini',
                #'n_estimators': 3000,
                'n_estimators': 400,
                #'learning_rate': 0.1,
                'algorithm': 'SAMME.R',
                #'min_samples_leaf': 100,
                'splitter': 'best',
                'max_features': 4,
                'subsample': 0.6,
                'limit': -1,
                'additional_signal_weight': 1.0,
                'min_impurity_split': 0.0,
                'bootstrap': True,
                }

        # load parameters from config in a format similar to Root TMVA parameter string
        settingsEvaluated = []
        for mvaSetting in self.MVAsettings.split(':'):
            # split at the first '=' only, so that a value which itself
            # contains '=' is evaluated as a whole; an entry without '='
            # still raises IndexError
            settingParts = mvaSetting.split('=', 1)
            settingName = settingParts[0].strip()
            self.parameters[settingName] = eval(settingParts[1].strip())
            try:
                settingsEvaluated.append('%s' % settingName + '=' + '%r' % self.parameters[settingName])
            except Exception:
                # the %-formatting can fail, e.g. for a tuple value; the
                # setting is then kept as written in the config
                print("???:", mvaSetting)
                settingsEvaluated.append(mvaSetting)

        self.MVAsettingsEvaluated = ':'.join(settingsEvaluated)

Example #41

Show file

File: run_plot.py Project: GLP90/Xbb

class PlotHelper(object):
    """Prepare and draw the stacked data/MC histograms of one plot region.

    The constructor reads paths, cuts and sample lists from the configuration,
    ``prepare()`` fills one ``StackMaker`` per variable from the cached sample
    trees and ``run()`` draws them. ``prepare`` and ``run`` return ``self`` so
    the calls can be chained.
    """

    def __init__(self, config, region, vars = None, title=None, sampleIdentifier=None):
        """Read all settings needed to plot ``region``.

        Args:
            config: configuration object providing ``get`` and ``has_option``.
            region: plot region name; its options live in section ``Plot:<region>``.
            vars: optional list of variable names. If empty or None, the
                comma-separated ``vars`` option of the region section is used.
            title: optional plot title; an empty value is stored as None.
            sampleIdentifier: optional comma-separated sample identifiers;
                when given, only samples with these identifiers are kept.
        """
        self.config = config
        self.region = region
        self.vars = vars
        self.title = title if title and len(title)>0 else None
        self.sampleIdentifiers = sampleIdentifier.split(',') if sampleIdentifier and len(sampleIdentifier) > 0 else None

        # VHbb namespace
        VHbbNameSpace=config.get('VHbbNameSpace','library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        # TSystem::Load returns 0 on success and 1 if the library was already
        # loaded; only negative codes indicate a failure
        if returnCode < 0:
            print ("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"%returnCode)
        else:
            print ("INFO: loaded VHbbNameSpace: %s"%VHbbNameSpace)

        # input/output paths
        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')

        # plot regions
        self.configSection='Plot:%s'%region

        # variables given by the caller: strip whitespace, drop empty entries
        if self.vars and isinstance(self.vars, list):
            self.vars = [x.strip() for x in self.vars if x.strip()]

        # if variables not specified in command line, read from config
        if not self.vars:
            varListFromConfig = self.config.get(self.configSection, 'vars').split(',')
            print ("VARS::", self.configSection, " => ", varListFromConfig)
            self.vars = [x.strip() for x in varListFromConfig if x.strip()]

        # additional cut to only plot a subset of the region
        self.subcut = None
        if self.config.has_option(self.configSection, 'subcut'):
            self.subcut = self.config.get(self.configSection, 'subcut')
            print("INFO: use cut:", self.subcut)

        # additional global blinding cut:
        self.addBlindingCut = None
        if self.config.has_option('Plot_general','addBlindingCut'): #contained in plots, cut on the event number
            self.addBlindingCut = self.config.get('Plot_general','addBlindingCut')
            print ('adding add. blinding cut:', self.addBlindingCut)

        # load samples
        # NOTE: eval() executes the option text as Python code, so the
        # configuration files must come from a trusted source
        self.data = eval(self.config.get(self.configSection, 'Datas')) # read the data corresponding to each CR (section)
        self.mc = eval(self.config.get('Plot_general', 'samples')) # read the list of mc samples
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        self.signalRegion = False
        if self.config.has_option(self.configSection, 'Signal'):
            self.mc.append(self.config.get(self.configSection, 'Signal'))
            self.signalRegion = True
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)

        # filter samples used in the plot
        if self.sampleIdentifiers:
            self.dataSamples = [x for x in self.dataSamples if x.identifier in self.sampleIdentifiers]
            self.mcSamples =   [x for x in self.mcSamples   if x.identifier in self.sampleIdentifiers]

        self.groupDict = eval(self.config.get('Plot_general', 'Group'))
        self.subcutPlotName = ''
        self.histogramStacks = {}


    def prepare(self):
        """Create one histogram stack per variable and fill it from the cached trees.

        Returns:
            self, to allow call chaining.

        Raises:
            Exception: ``"CachedTreeMissing"`` if the tree cache has no tree
                for one of the samples.
        """
        # use the instance attributes here: a bare ``region``/``config`` only
        # resolves if the calling script happens to define such globals
        print ("INFO: starting plot for region \x1b[34m{region}\x1b[0m, variables:".format(region=self.region))
        for var in self.vars:
            print ("  > {var}".format(var=var))

        self.histogramStacks = {}
        for var in self.vars:
            self.histogramStacks[var] = StackMaker(self.config, var, self.region, self.signalRegion, None, '_'+self.subcutPlotName, title=self.title)

        # add DATA + MC samples
        for sample in self.dataSamples + self.mcSamples:

            # cuts: sample specific cut + region cut + optional data/blinding cuts
            sampleCuts = [sample.subcut]
            if self.config.has_option('Cuts', self.region):
                sampleCuts.append(self.config.get('Cuts', self.region))
            if self.config.has_option(self.configSection, 'Datacut'):
                sampleCuts.append(self.config.get(self.configSection, 'Datacut'))
            if self.addBlindingCut:
                sampleCuts.append(self.addBlindingCut)

            # get sample tree from cache
            tc = TreeCache.TreeCache(
                    sample=sample,
                    cutList=sampleCuts,
                    inputFolder=self.samplesPath,
                    config=self.config
                )
            sampleTree = tc.getTree()

            if not sampleTree:
                print ("\x1b[31mERROR: sampleTree not available for ", sample,", run caching again!!\x1b[0m")
                raise Exception("CachedTreeMissing")

            groupName = self.groupDict[sample.name]
            print (" > found the tree, #entries = ", sampleTree.tree.GetEntries())
            print ("   > group =", groupName)
            print (" > now adding the tree for vars=", self.vars)

            # add the sample tree for all the variables
            for var in self.vars:
                self.histogramStacks[var].addSampleTree(sample=sample, sampleTree=sampleTree, groupName=groupName, cut=self.subcut if self.subcut else '1')
        return self

    def run(self):
        """Draw every prepared stack (and, if configured, its normalized version).

        Returns:
            self, to allow call chaining.
        """
        # the option does not depend on the variable, so read it only once
        drawNormalized = self.config.has_option('Plot_general', 'drawNormalizedPlots') and eval(self.config.get('Plot_general', 'drawNormalizedPlots'))

        # draw
        for var in self.vars:
            self.histogramStacks[var].Draw(outputFolder=self.plotPath, prefix='{region}__{var}_'.format(region=self.region, var=var))
            if drawNormalized:
                self.histogramStacks[var].Draw(outputFolder=self.plotPath, prefix='comp_{region}__{var}_'.format(region=self.region, var=var), normalize=True)

        return self

    def getHistogramStack(self, var):
        """Return the stack of ``var``, or None if ``var`` is not plotted or not prepared."""
        if var in self.vars:
            return self.histogramStacks.get(var)
        return None

Example #42

Show file

    def __init__(self, config, region, sampleIdentifier=None, opts=None):
        """Read input/output paths, cuts and sample lists for ``region``.

        Args:
            config: configuration object providing ``get`` and ``has_option``.
            region: plot region name; its options live in section ``Plot:<region>``.
            sampleIdentifier: optional comma-separated sample identifiers;
                when given, only samples with these identifiers are kept.
            opts: object whose ``inputDir`` and ``outputDir`` attributes name
                options of the ``Directories`` section. It defaults to None
                but is dereferenced unconditionally, so callers must pass it.
        """
        self.config = config
        self.region = region
        self.sampleIdentifiers = sampleIdentifier.split(
            ',') if sampleIdentifier and len(sampleIdentifier) > 0 else None

        # VHbb namespace
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        # TSystem::Load returns 0 on success and 1 if the library was already
        # loaded; only negative codes indicate a failure
        if returnCode < 0:
            print(
                "\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"
                % returnCode)
        else:
            print("INFO: loaded VHbbNameSpace: %s" % VHbbNameSpace)

        # input/output paths
        self.fileLocator = FileLocator(config=self.config)
        self.pathIN = self.config.get('Directories', opts.inputDir)
        self.pathOUT = self.config.get('Directories', opts.outputDir)
        self.tmpDir = self.config.get('Directories', 'scratch')

        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath,
                                     config=self.config)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')

        # plot regions
        self.configSection = 'Plot:%s' % region

        # additional cut to only plot a subset of the region
        self.subcut = None
        if self.config.has_option(self.configSection, 'subcut'):
            self.subcut = self.config.get(self.configSection, 'subcut')
            print("INFO: use cut:", self.subcut)

        # additional global blinding cut (contained in plots, cut on the event number)
        self.addBlindingCut = None
        if self.config.has_option('Plot_general', 'addBlindingCut'):
            self.addBlindingCut = self.config.get('Plot_general',
                                                  'addBlindingCut')
            print('adding add. blinding cut:', self.addBlindingCut)

        # load samples
        # NOTE: eval() executes the option text as Python code, so the
        # configuration files must come from a trusted source
        # data samples of this control/signal region (section)
        self.data = eval(self.config.get(self.configSection, 'Datas'))
        # list of mc samples
        self.mc = eval(self.config.get('Plot_general', 'samples'))
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        self.signalRegion = False
        if self.config.has_option(self.configSection, 'Signal'):
            self.mc.append(self.config.get(self.configSection, 'Signal'))
            self.signalRegion = True
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)

        # filter samples used in the plot
        if self.sampleIdentifiers:
            self.dataSamples = [
                x for x in self.dataSamples
                if x.identifier in self.sampleIdentifiers
            ]
            self.mcSamples = [
                x for x in self.mcSamples
                if x.identifier in self.sampleIdentifiers
            ]

Example #43

Show file

File: write_regression_systematics.py Project: perrozzi/VHbb

#path=opts.path
# Input and output sample locations, read from the [Directories] section of
# the configuration.
pathIN = config.get('Directories','SYSin')
pathOUT = config.get('Directories','SYSout')
# Temporary directory taken from the environment; raises KeyError if TMPDIR
# is not set.
tmpDir = os.environ["TMPDIR"]

print 'INput samples:\t%s'%pathIN
print 'OUTput samples:\t%s'%pathOUT


#storagesamples = config.get('Directories','storagesamples')


# opts.names is a comma-separated string; split it into a list of names
namelist=opts.names.split(',')

# load the sample information for the input path
# NOTE(review): samplesinfo is not defined in this excerpt - presumably set
# earlier in the script; confirm.
info = ParseInfo(samplesinfo,pathIN)

def deltaPhi(phi1, phi2):
    """Return phi1 - phi2 shifted by multiples of 2*pi into (-pi, pi]."""
    fullTurn = 2*math.pi
    delta = phi1 - phi2
    while True:
        if delta > math.pi:
            delta -= fullTurn
        elif delta <= -math.pi:
            delta += fullTurn
        else:
            return delta

def resolutionBias(eta):
    """Return the resolution bias for the eta bin that contains ``eta``.

    The bins are defined by their upper edges only; values of 5 or above
    yield 0.
    NOTE(review): negative values fall into the first bin - presumably
    callers pass |eta|; confirm.
    """
    # (exclusive upper edge, bias), in increasing order of the edge
    binning = (
        (0.5, 0.052),
        (1.1, 0.057),
        (1.7, 0.096),
        (2.3, 0.134),
        (5, 0.28),
    )
    for upperEdge, bias in binning:
        if eta < upperEdge:
            return bias
    return 0

Example #44

Show file

File: run_training.py Project: acalandr/Xbb

class MvaTrainingHelper(object):

    def __init__(self, config, mvaName):
        """Read everything needed to train the MVA ``mvaName`` from ``config``.

        Args:
            config: configuration object providing ``get`` and ``has_option``.
            mvaName: name of the configuration section that describes the MVA
                (variable set, MVA type and settings, signal/background samples).
        """
        self.config = config
        self.factoryname = config.get('factory', 'factoryname')
        self.factorysettings = config.get('factory', 'factorysettings')
        # prefer the dedicated training samples folder, fall back to 'MVAin'
        if config.has_option('Directories', 'trainingSamples'):
            self.samplesPath = config.get('Directories', 'trainingSamples')
        else:
            self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath, config=self.config)

        self.sampleFilesFolder = config.get('Directories', 'samplefiles')

        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.MVAtype = config.get(mvaName, 'MVAtype')
        self.MVAsettings = config.get(mvaName,'MVAsettings')
        self.mvaName = mvaName

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        # TSystem::Load returns 0 on success and 1 if the library was already
        # loaded; negative codes indicate a failure, which is reported
        if returnCode < 0:
            print("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"%returnCode)

        # variables
        # split() without argument splits on any whitespace run, so repeated
        # blanks or line breaks in the option do not produce empty variable names
        self.MVA_Vars = {}
        self.MVA_Vars['Nominal'] = config.get(self.treeVarSet, 'Nominal').split()

        # samples
        # NOTE: eval() executes the option text as Python code, so the
        # configuration files must come from a trusted source
        backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
        signalSampleNames = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(backgroundSampleNames),
            'SIG': self.samplesInfo.get_samples(signalSampleNames),
        }

        # name of the region cut: the 'treeCut' option if present, else the MVA name
        self.treeCutName = config.get(mvaName, 'treeCut') if config.has_option(mvaName, 'treeCut') else mvaName
        self.treeCut = config.get('Cuts', self.treeCutName)

        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')
        print("TRAINING CUT:", self.TrainCut)
        print("EVAL CUT:", self.EvalCut)

        # factor applied on top of the sample scale when trees are added in prepare()
        self.globalRescale = 2.0

        self.trainingOutputFileName = 'mvatraining_{factoryname}_{region}.root'.format(factoryname=self.factoryname, region=mvaName)
        print("INFO: MvaTrainingHelper class created.")


    def prepare(self):
        """Create the TMVA factory and register all training/testing trees and variables.

        For every background and signal sample the cached tree is fetched once
        with the training cut and once with the evaluation cut and, if it is
        not empty, added to TMVA as training or testing tree respectively.

        Returns:
            self, to allow call chaining.

        Raises:
            Exception: ``"SpecialWeightNotSupported"`` if ``useSpecialWeight``
                is enabled in the configuration, ``"CachedTreeMissing"`` if
                the tree cache has no tree for a sample.
        """
        self.trainingOutputFile = ROOT.TFile.Open(self.trainingOutputFileName, "RECREATE")
        # ----------------------------------------------------------------------------------------------------------------------
        # create TMVA factory
        # ----------------------------------------------------------------------------------------------------------------------
        self.factory = ROOT.TMVA.Factory(self.factoryname, self.trainingOutputFile, self.factorysettings)
        if self.trainingOutputFile and self.factory:
            print ("INFO: initialized MvaTrainingHelper.", self.factory)
        else:
            print ("\x1b[31mERROR: initialization of MvaTrainingHelper failed!\x1b[0m")

        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/eval trees
        # ----------------------------------------------------------------------------------------------------------------------
        try:
            addBackgroundTreeMethod = self.factory.AddBackgroundTree
            addSignalTreeMethod = self.factory.AddSignalTree
            self.dataLoader = None
        except AttributeError:
            # the factory has no Add*Tree methods: use a DataLoader instead
            print("oh no..")
            # the DataLoader wants to be called '.'
            self.dataLoader = ROOT.TMVA.DataLoader(".")
            addBackgroundTreeMethod = self.dataLoader.AddBackgroundTree
            addSignalTreeMethod = self.dataLoader.AddSignalTree

        if self.config.has_option('Weights','useSpecialWeight') and eval(self.config.get('Weights','useSpecialWeight')):
            print("\x1b[31mERROR: specialweight cannot be used with TMVA training, set it to false and add the DY_specialWeight to weightF!!\x1b[0m")
            raise Exception("SpecialWeightNotSupported")

        # DEBUG: restrict memory
        # resource.setrlimit(resource.RLIMIT_AS, (4.0*1024*1024*1024, 5.0*1024*1024*1024))

        # pair every cut with its tree type explicitly, so that the assignment
        # stays correct even if both cut strings happen to be identical
        cutsAndTreeTypes = [
                    (self.TrainCut, ROOT.TMVA.Types.kTraining),
                    (self.EvalCut, ROOT.TMVA.Types.kTesting),
                ]

        self.sampleTrees = []
        for addTreeFcn, samples in [
                    [addBackgroundTreeMethod, self.samples['BKG']],
                    [addSignalTreeMethod, self.samples['SIG']]
                ]:
            for sample in samples:
                print ('*'*80,'\n%s\n'%sample,'*'*80)
                for additionalCut, treeType in cutsAndTreeTypes:
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)

                    tc = TreeCache.TreeCache(
                            sample=sample,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            config=self.config,
                            debug=True
                        )
                    sampleTree = tc.getTree()

                    # check the tree BEFORE touching it, otherwise a missing
                    # tree ends in an AttributeError instead of this message
                    if not sampleTree:
                        print ("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")

                    sampleTree.tree.SetCacheSize(32*1024)

                    # prevent garbage collection
                    self.sampleTrees.append(sampleTree)

                    treeScale = sampleTree.getScale(sample) * self.globalRescale

                    # only non-empty trees can be added
                    if sampleTree.tree.GetEntries() > 0:
                        addTreeFcn(sampleTree.tree, treeScale, treeType)
                        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))

        # the variables are registered on the DataLoader if one is used, else on the factory
        variableTarget = self.dataLoader if self.dataLoader else self.factory
        for var in self.MVA_Vars['Nominal']:
            variableTarget.AddVariable(var, 'D')

        return self

    # ----------------------------------------------------------------------------------------------------------------------
    # backup old .xml and .info files 
    # ----------------------------------------------------------------------------------------------------------------------
    def backupOldFiles(self):
        success = False
        MVAdir = self.config.get('Directories','vhbbpath')+'/python/weights/'
        backupDir = MVAdir + 'backup/'
        try:
            os.makedirs(backupDir)
        except:
            pass
        freeNumber = 1
        try:
            lastUsedBackupDirectories = sorted(glob.glob(backupDir + '/v*/'), key=lambda x: int(x.strip('/').split('/')[-1][1:]), reverse=True)
            freeNumber = 1 + int(lastUsedBackupDirectories[0].strip('/').split('/')[-1][1:]) if len(lastUsedBackupDirectories) > 0 else 1
        except Exception as e:
            print("\x1b[31mERROR: creating backup of MVA files failed!", e, "\x1b[0m")
            freeNumber = -1
        if freeNumber > -1:
            try:
                fileNamesToBackup = glob.glob(MVAdir + self.factoryname+'_'+self.mvaName + '.*')
                fileNamesToBackup += glob.glob(MVAdir + '/../mvatraining_MVA_ZllBDT_*.root')
                os.makedirs(backupDir + 'v%d/'%freeNumber)
                for fileNameToBackup in fileNamesToBackup:
                    shutil.copy(fileNameToBackup, backupDir + 'v%d/'%freeNumber)
                success = True
            except Exception as e:
                print("\x1b[31mERROR: creating backup of MVA files failed!", e, "\x1b[0m")
        return success


    def run(self):
        backupFiles = False
        try:
            backupFiles = eval(self.config.get('MVAGeneral', 'backupWeights'))
        except:
            pass
        if backupFiles:
            print('backing up old BDT files')
            self.backupOldFiles()
        # ----------------------------------------------------------------------------------------------------------------------
        # Execute TMVA
        # ----------------------------------------------------------------------------------------------------------------------
        self.factory.Verbose()
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        print('Execute TMVA: factory.BookMethod("%s", "%s", "%s")'%(self.MVAtype, self.mvaName, self.MVAsettings))
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        weightF = self.config.get('Weights','weightF')
        try:
            self.factory.BookMethod(self.MVAtype, self.mvaName, self.MVAsettings)
            print("ROOT 5 style TMVA found")
            self.factory.SetSignalWeightExpression(weightF)
            self.factory.SetBackgroundWeightExpression(weightF)
        except:
            print("ROOT 6 style TMVA found, using data loader object!!! >_<")
            print(" weights dir:", ROOT.TMVA.gConfig().GetIONames().fWeightFileDir)
            print(" data loader:", self.dataLoader)
            print(" type:       ", self.MVAtype)
            print(" name:       ", self.mvaName)
            print(" settings:   ", self.MVAsettings)
            ROOT.TMVA.gConfig().GetIONames().fWeightFileDir = 'weights'
            self.dataLoader.SetSignalWeightExpression(weightF)
            self.dataLoader.SetBackgroundWeightExpression(weightF)
            self.factory.BookMethod(self.dataLoader, self.MVAtype, self.mvaName, self.MVAsettings)
        sys.stdout.flush()
        print('Execute TMVA: TrainAllMethods')
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.factory.TrainAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: TestAllMethods')
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.factory.TestAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: EvaluateAllMethods')
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.factory.EvaluateAllMethods()
        sys.stdout.flush()
        print('Execute TMVA: output.Write')
        print('max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
        self.trainingOutputFile.Close()
        return self

    def printInfo(self):
        """Pickle the training meta data into ``<factoryname>_<mvaName>.info``.

        The file is written to ``<vhbbpath>/python/weights/`` and contains an
        ``mvainfo`` object with the factory and MVA settings, the samples
        path and the variable set/list used for the training.
        """
        #WRITE INFOFILE
        MVAdir = self.config.get('Directories','vhbbpath')+'/python/weights/'

        # build the payload first, so that a failure here does not leave an
        # empty/truncated .info file behind
        info=mvainfo(self.mvaName)
        info.factoryname=self.factoryname
        info.factorysettings=self.factorysettings
        info.MVAtype=self.MVAtype
        info.MVAsettings=self.MVAsettings
        info.weightfilepath=MVAdir
        info.path=self.samplesPath
        info.varset=self.treeVarSet
        info.vars=self.MVA_Vars['Nominal']

        # pickle needs a file opened in binary mode (text mode fails on
        # Python 3); the with-block closes the file even if dump() raises
        with open(MVAdir+self.factoryname+'_'+self.mvaName+'.info','wb') as infofile:
            print ('@DEBUG: output infofile name')
            print (infofile)
            pickle.dump(info,infofile)

    def getExpectedSignificance(self, tree, nBins, xMin, xMax, power=1.0, rescaleSig=1.0, rescaleBkg=1.0):
        """Estimate the expected significance from the MVA score distribution in ``tree``.

        The score (branch named like the MVA) of every event is filled into a
        signal (``classID == 1``) or background histogram, weighted by the
        event weight times the matching rescale factor. The significance is
        the square root of the sum of S^2/(S+B) over all bins with S+B > 0.

        Args:
            tree: tree to loop over; events provide the score, ``weight`` and ``classID``.
            nBins, xMin, xMax: binning of the score histograms.
            power: if different from 1.0, the score is mapped to [0, 1),
                raised to this power and mapped back before filling.
            rescaleSig, rescaleBkg: extra factors on signal/background weights.

        Returns:
            Tuple (expected significance, signal sum, background sum).
        """
        hSIG = ROOT.TH1D("hSig","hSig",nBins,xMin,xMax)
        hBKG = ROOT.TH1D("hBkg","hBkg",nBins,xMin,xMax)
        print("INFO: GetEntries() = ", tree.GetEntries())
        transformScore = power != 1.0
        if transformScore:
            print("INFO: rescale BDT score with power ", power)

        for event in tree:
            score = getattr(event, self.mvaName)
            if transformScore:
                # relative position in the range, clamped to [0, 0.999999]
                x = (score-xMin)/(xMax-xMin)
                if x<0:
                    x=0
                elif x>0.999999:
                    x=0.999999
                value = math.pow(x, power)*(xMax-xMin)+xMin
            else:
                # clamp into the histogram range so that nothing ends up in under-/overflow
                value = max(min(score,xMax-0.00001),xMin)

            weight = event.weight
            if event.classID == 1:
                hSIG.Fill(value, weight * rescaleSig)
            else:
                hBKG.Fill(value, weight * rescaleBkg)

        ssbSum = 0.0
        sSum = 0
        bSum = 0
        sbTableFormat = "{bin: <16}{signal: <16}{background: <16}{ssb: <16}"
        print("---- nBins =", nBins, " from ", xMin, "..", xMax, "-----")
        print(sbTableFormat.format(bin="bin", signal="signal", background="background", ssb="S/sqrt(S+B)"))
        for i in range(nBins):
            # histogram bins are counted from 1
            nSig = hSIG.GetBinContent(1+i)
            nBkg = hBKG.GetBinContent(1+i)
            if (nSig + nBkg) > 0:
                ssbSum += nSig*nSig/(nSig + nBkg)
                ssb = nSig/math.sqrt(nSig + nBkg)
            else:
                ssb = 0
            sSum += nSig
            bSum += nBkg
            print(sbTableFormat.format(bin=i, signal=round(nSig,2), background=round(nBkg,2), ssb=round(ssb,3)))
        expectedSignificance = math.sqrt(ssbSum)
        print(sbTableFormat.format(bin="SUM", signal=round(sSum,1), background=round(bSum,1), ssb="\x1b[34mZ=%1.3f\x1b[0m"%expectedSignificance))
        print("-"*40)
        hSIG.Delete()
        hBKG.Delete()
        return expectedSignificance, sSum, bSum

    def estimateExpectedSignificance(self):
        """Print expected-significance estimates for the test and training trees.

        Opens the training output file, runs ``getExpectedSignificance`` on
        ``TestTree`` for several binnings and score transformations, and
        finally on ``TrainTree``, once as is and once with signal and
        background rescaled to the yields of the test tree.
        """
        print("INFO: open ", self.trainingOutputFileName)
        rootFile = ROOT.TFile.Open(self.trainingOutputFileName, "READ")
        print("INFO: ->", rootFile)
        try:
            testTree = rootFile.Get('./TestTree')

            # run a few tests with different binnings and rescaling of BDT score
            for xMax in (1.0, 0.9, 0.8, 0.75, 0.7):
                self.getExpectedSignificance(testTree, 15, -0.8, xMax)
            for power in (0.5, 0.33, 1.5, 2.0):
                self.getExpectedSignificance(testTree, 15, -0.8, 0.8, power=power)

            # close to nominal binning
            print("---- ~nominal TEST -----")
            esTest, sTest, bTest = self.getExpectedSignificance(testTree, 15, -0.8, 0.8)
            print("---- ~nominal TRAINING (without correct normalization) -----")
            trainTree = rootFile.Get('./TrainTree')
            esTrain, sTrain, bTrain = self.getExpectedSignificance(trainTree, 15, -0.8, 0.8)

            # without signal or background yield in the training tree the
            # rescale factors below are undefined (division by zero)
            if sTrain == 0 or bTrain == 0:
                print("\x1b[31mERROR: training tree has no signal or no background yield, cannot normalize to the test tree\x1b[0m")
                return

            # the tree ./TrainTree contains the input events for training AFTER re-balancing the classes
            # therefore for SIG/BKG separately the normalization is fixed to the one of the TEST events
            rescaleSig = 1.0*sTest/sTrain
            rescaleBkg = 1.0*bTest/bTrain
            print("---- ~nominal TRAINING -----")
            esTrain, sTrain, bTrain = self.getExpectedSignificance(trainTree, 15, -0.8, 0.8, rescaleSig=rescaleSig, rescaleBkg=rescaleBkg)
        finally:
            # release the file handle also if one of the estimates fails
            if rootFile:
                rootFile.Close()

    def getbdtHistogram(self, tree):
        """Fill the MVA score of all events in ``tree`` into two unweighted histograms.

        Events with ``classID == 1`` go into the signal histogram, all others
        into the background histogram (40 bins from -1 to 1 each).

        Returns:
            List ``[signal histogram, background histogram]``.
        """
        histTitle = "TMVA overtraining check for classifier: %s"%self.mvaName
        sigHist = ROOT.TH1D("hSig", histTitle, 40, -1, 1)
        bkgHist = ROOT.TH1D("hBkg", histTitle, 40, -1, 1)
        print("INFO: GetEntries() = ", tree.GetEntries())
        for event in tree:
            score = getattr(event, self.mvaName)
            target = sigHist if event.classID == 1 else bkgHist
            target.Fill(score)
        return [sigHist, bkgHist]

    def setTMVASyle(self):
        """Style the four overtraining histograms and activate a ROOT style named "TMVA".

        Expects ``self.hSIGtest``, ``self.hBKGtest``, ``self.hSIGtrain`` and
        ``self.hBKGtrain`` to be set. The test histograms get line and fill
        attributes, the training histograms get markers in the line colour of
        the matching test histogram. Afterwards a copy of the "Plain" style is
        modified, registered under the name "TMVA" and made the current style.
        """
        sigTest = self.hSIGtest
        bkgTest = self.hBKGtest

        # test samples: signal
        sigTest.SetLineColor(ROOT.TColor.GetColor("#0000ee"))
        sigTest.SetLineWidth(1)
        sigTest.SetFillStyle(1001)
        sigTest.SetFillColor(ROOT.TColor.GetColor("#7d99d1"))
        sigTest.SetTitle("TMVA overtraining check for classifier: %s"%self.mvaName )

        # test samples: background
        bkgTest.SetLineColor(ROOT.TColor.GetColor("#ff0000"))
        bkgTest.SetLineWidth(1)
        bkgTest.SetFillStyle(3554)
        bkgTest.SetFillColor(ROOT.TColor.GetColor("#ff0000"))
        bkgTest.SetTitle(sigTest.GetTitle())

        # training samples: markers coloured like the line of the test histogram
        for trainHist, testHist in ((self.hSIGtrain, sigTest), (self.hBKGtrain, bkgTest)):
            trainHist.SetMarkerColor(testHist.GetLineColor())
            trainHist.SetMarkerSize(0.7)
            trainHist.SetMarkerStyle(20)
            trainHist.SetLineWidth(1)
            trainHist.SetLineColor(testHist.GetLineColor())
            trainHist.SetTitle(sigTest.GetTitle())

        # our style is based on Plain
        tmvaStyle = ROOT.TStyle(ROOT.gROOT.GetStyle("Plain"))
        tmvaStyle.SetName("TMVA")
        tmvaStyle.SetTitle("TMVA style based on \"Plain\" with modifications defined in tmvaglob.C")
        ROOT.gROOT.GetListOfStyles().Add(tmvaStyle)
        ROOT.gROOT.SetStyle("TMVA")

        # additional dashed line styles
        for lineStyle, dashPattern in ((5, "[52 12]"), (6, "[22 12]"), (7, "[22 10 7 10]")):
            tmvaStyle.SetLineStyleString(lineStyle, dashPattern)

        UsePaperStyle = False

        # the pretty color palette of old
        if UsePaperStyle:
            tmvaStyle.SetPalette(18)
        else:
            tmvaStyle.SetPalette(1)

        # use plain black on white colors
        tmvaStyle.SetFrameBorderMode(0)
        tmvaStyle.SetCanvasBorderMode(0)
        tmvaStyle.SetPadBorderMode(0)
        tmvaStyle.SetPadColor(0)
        tmvaStyle.SetFillStyle(0)

        tmvaStyle.SetLegendBorderSize(0)

        # colours of the title box, the frame and the canvas
        titleBoxColor = ROOT.TColor.GetColor( "#5D6B7D" )
        titleTextColor = ROOT.TColor.GetColor( "#FFFFFF" )
        titleBorderColor = ROOT.TColor.GetColor( "#7D8B9D" )
        frameFillColor = ROOT.TColor.GetColor( "#fffffd" )
        canvasColor = ROOT.TColor.GetColor( "#f0f0f0" )

        tmvaStyle.SetTitleFillColor(titleBoxColor)
        tmvaStyle.SetTitleTextColor(titleTextColor)
        tmvaStyle.SetTitleBorderSize(1)
        tmvaStyle.SetLineColor(titleBorderColor)
        if not UsePaperStyle:
            tmvaStyle.SetFrameFillColor(frameFillColor)
            tmvaStyle.SetCanvasColor(canvasColor)

        # set the paper & margin sizes
        tmvaStyle.SetPaperSize(20,26)
        tmvaStyle.SetPadTopMargin(0.10)
        tmvaStyle.SetPadRightMargin(0.05)
        tmvaStyle.SetPadBottomMargin(0.11)
        tmvaStyle.SetPadLeftMargin(0.12)

        # use bold lines and markers
        tmvaStyle.SetMarkerStyle(21)
        tmvaStyle.SetMarkerSize(0.3)
        tmvaStyle.SetHistLineWidth(2)
        tmvaStyle.SetLineStyleString(2,"[12 12]") # postscript dashes

        # show the title, but neither the statistics nor the fit box
        tmvaStyle.SetOptTitle(1)
        tmvaStyle.SetTitleH(0.052)

        tmvaStyle.SetOptStat(0)
        tmvaStyle.SetOptFit(0)

        # put tick marks on top and RHS of plots
        tmvaStyle.SetPadTickX(1)
        tmvaStyle.SetPadTickY(1)

    def nomrmaliseHist(self,hSIG, hBKG):
        if (hSIG.GetSumw2N() == 0):
            hSIG.Sumw2()
        if (hBKG and hBKG.GetSumw2N() == 0): 
            hBKG.Sumw2()
     
        if(hSIG.GetSumOfWeights()!=0): 
            dx = (hSIG.GetXaxis().GetXmax() - hSIG.GetXaxis().GetXmin())/hSIG.GetNbinsX()
            hSIG.Scale(1.0/hSIG.GetSumOfWeights()/dx)
        if (hBKG != 0 and hBKG.GetSumOfWeights()!=0):
            dx = (hBKG.GetXaxis().GetXmax() - hBKG.GetXaxis().GetXmin())/hBKG.GetNbinsX()
            hBKG.Scale( 1.0/hBKG.GetSumOfWeights()/dx )

    def drawOvertraining(self):
        """Draw the overtraining check plot and save it as PDF.

        Expects ``self.hSIGtest``, ``self.hBKGtest``, ``self.hSIGtrain`` and
        ``self.hBKGtrain`` to be set. The histograms are normalised, drawn on
        top of an empty frame together with two legends, the result of the
        Kolmogorov-Smirnov tests (test vs. training) and the under-/overflow
        fractions, and the canvas is saved to
        ``<vhbbpath>/python/weights/overtraining<mvaName>.pdf``.
        """

        #normalise histograms
        self.nomrmaliseHist(self.hSIGtest, self.hBKGtest)
        self.nomrmaliseHist(self.hSIGtrain, self.hBKGtrain)

        c = ROOT.TCanvas("canvas1", "TMVA comparison %s"%self.mvaName, 0, 200, 600, 468) 

        # frame limits (choose judicious x range): mean +- nrms*RMS of the test
        # histograms, restricted to the axis range of the signal test histogram
        nrms = 10
        xmin = ROOT.TMath.Max(ROOT.TMath.Min(self.hSIGtest.GetMean() - nrms*self.hSIGtest.GetRMS(), self.hBKGtest.GetMean() - nrms*self.hBKGtest.GetRMS() ),self.hSIGtest.GetXaxis().GetXmin() )
        xmax = ROOT.TMath.Min(ROOT.TMath.Max(self.hSIGtest.GetMean() + nrms*self.hSIGtest.GetRMS(), self.hBKGtest.GetMean() + nrms*self.hBKGtest.GetRMS()), self.hSIGtest.GetXaxis().GetXmax())
        ymin = 0
        # head room above the highest of the four histograms
        maxMult = 1.3
        #maxMult = (htype == CompareType) ? 1.3 : 1.2
        ymax = ROOT.TMath.Max(self.hSIGtest.GetMaximum(), self.hBKGtest.GetMaximum())*maxMult
        ymax = ROOT.TMath.Max(ymax,ROOT.TMath.Max(self.hSIGtrain.GetMaximum(), self.hBKGtrain.GetMaximum())*maxMult)
        #print ('ymax is', ymax)
        #print (self.hSIGtest.GetMaximum())
        #print (self.hBKGtest.GetMaximum())
        #print (self.hSIGtrain.GetMaximum())
        #print (self.hBKGtrain.GetMaximum())
        #sys.exit()
   
        # build a frame: an empty 2D histogram that only provides the axes
        nb = 500
        hFrameName = "frame" + self.mvaName
        #o = ROOT.gROOT.FindObject(hFrameName)
        frame = ROOT.TH2F(hFrameName, self.hSIGtest.GetTitle(), nb, xmin, xmax, nb, ymin, ymax )
        frame.GetXaxis().SetTitle(self.mvaName + " response")
        frame.GetYaxis().SetTitle("(1/N) dN^{ }/^{ }dx")

        #TMVAGlob.SetFrameStyle( frame )
        frame.SetLabelOffset( 0.012, "X" )
        frame.SetLabelOffset( 0.012, "Y" )
        frame.GetXaxis().SetTitleOffset( 1.25 )
        frame.GetYaxis().SetTitleOffset( 1.22 )
        frame.GetXaxis().SetTitleSize( 0.045)
        frame.GetYaxis().SetTitleSize( 0.045)
        frame.GetXaxis().SetLabelSize( 0.04)
        frame.GetYaxis().SetLabelSize( 0.04)

        # global style settings
        ROOT.gPad.SetTicks()
        ROOT.gPad.SetLeftMargin  ( 0.108)
        ROOT.gPad.SetRightMargin ( 0.050)
        ROOT.gPad.SetBottomMargin( 0.120)
   
        # eventually: draw the frame
        frame.Draw()  
    
        c.GetPad(0).SetLeftMargin(0.105 )
        frame.GetYaxis().SetTitleOffset( 1.2 )

        # Draw legend: test samples in the upper left corner
        legend = ROOT.TLegend(c.GetLeftMargin(), 1 - c.GetTopMargin() - 0.12, c.GetLeftMargin() + 0.40, 1 - c.GetTopMargin() )
        legend.SetFillStyle(1)
        legend.AddEntry(self.hSIGtest,"Signal"     + " (test sample)", "F")
        legend.AddEntry(self.hBKGtest,"Background" + " (test sample)", "F")
        legend.SetBorderSize(1)
        legend.SetMargin(0.2)
        legend.Draw("same")

        # second legend: training samples in the upper right corner
        legend2= ROOT.TLegend( 1 - c.GetRightMargin() - 0.42, 1 - c.GetTopMargin() - 0.12, 1 - c.GetRightMargin(), 1 - c.GetTopMargin() )
        legend2.SetFillStyle(1)
        legend2.SetBorderSize(1)
        legend2.AddEntry(self.hSIGtrain,"Signal (training sample)","P")
        legend2.AddEntry(self.hBKGtrain,"Background (training sample)","P")
        legend2.SetMargin( 0.1 )
        legend2.Draw("same")

        # apply colours/markers to the histograms before drawing them
        self.setTMVASyle()

        # test samples as histograms, training samples as points with error bars
        self.hSIGtest.Draw('samehist')
        self.hBKGtest.Draw('samehist')
        self.hSIGtrain.Draw('e1same')
        self.hBKGtrain.Draw('e1same')

        #perform K-S test
        print("--- Perform Kolmogorov-Smirnov tests")
        #//Double_t kolS = sig->KolmogorovTest( self.hSIGtrain, "X" );
        #//Double_t kolB = bgd->KolmogorovTest( bgdOv, "X" );
        kolS = self.hSIGtest.KolmogorovTest( self.hSIGtrain);
        kolB = self.hBKGtest.KolmogorovTest( self.hBKGtrain);
        print ("--- Goodness of signal (background) consistency: " + str(kolS) + " (" + str(kolB) + ")")

        # write the K-S results onto the plot
        probatext = "Kolmogorov-Smirnov test: signal (background) probability = % 5.3g (%5.3g)"% (kolS, kolB)
        tt = ROOT.TText(0.12, 0.74, probatext)
        tt.SetNDC()
        tt.SetTextSize(0.032)
        tt.AppendPad()

        # redraw axes
        frame.Draw("sameaxis")

        # text for overflows: bin 0 is the underflow bin, bin nbin+1 the overflow bin
        nbin = self.hSIGtest.GetNbinsX()
        dxu  = self.hSIGtest.GetBinWidth(0)
        dxo  = self.hSIGtest.GetBinWidth(nbin+1)
        uoflow =  "U/O-flow (S,B): (%.1f, %.1f)%% / (%.1f, %.1f)%%"% (self.hSIGtest.GetBinContent(0)*dxu*100, self.hBKGtest.GetBinContent(0)*dxu*100, self.hSIGtest.GetBinContent(nbin+1)*dxo*100, self.hBKGtest.GetBinContent(nbin+1)*dxo*100)
        t = ROOT.TText( 0.975, 0.115, uoflow )
        t.SetNDC()
        t.SetTextSize( 0.030 )
        t.SetTextAngle( 90 )
        t.AppendPad()    
   
        # update canvas
        c.Update()


        # save the plot next to the weight files
        MVAdir = self.config.get('Directories','vhbbpath')+'/python/weights/'
        c.SaveAs(MVAdir+'overtraining%s.pdf'%self.mvaName)
        print ('I saved the canvase in', MVAdir+'overtraining%s.pdf'%self.mvaName)

    def saveOvertrainingPlots(self):
        """Load the MVA score histograms from the training output and draw the overtraining plot.

        The signal/background histograms of the test and training samples are
        read from the folder ``Method_<mvaName>/<mvaName>`` of the training
        output file, stored on the instance and passed on to
        ``drawOvertraining()``.
        """
        print("INFO: open ", self.trainingOutputFileName)
        rootFile = ROOT.TFile.Open(self.trainingOutputFileName, "READ")
        print("INFO: ->", rootFile)

        # the MVA name appears three times in every histogram path
        nameTriple = (self.mvaName,self.mvaName,self.mvaName)
        self.hSIGtest = rootFile.Get('./Method_%s/%s/MVA_%s_S'%nameTriple)
        self.hBKGtest = rootFile.Get('./Method_%s/%s/MVA_%s_B'%nameTriple)
        self.hSIGtrain = rootFile.Get('./Method_%s/%s/MVA_%s_Train_S'%nameTriple)
        self.hBKGtrain = rootFile.Get('./Method_%s/%s/MVA_%s_Train_B'%nameTriple)
        print("./Method_%s/%s/MVA_%s_Train_B"%nameTriple)
        self.drawOvertraining()

Example #45

Show file

File: cache_training.py Project: GLP90/Xbb

class CacheTraining(object):
    """Prepare tree caches of one sample for a set of MVA training regions.

    For every training region the cut, the nominal MVA input variables and
    the weight expressions of the systematics are read from ``config``.
    :meth:`run` creates one ``TreeCache`` per (sub-sample, training region,
    TRAIN/EVAL cut) combination and finally calls ``process()`` on the shared
    sample tree.
    """

    def __init__(self, config, sampleIdentifier, trainingRegions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, force=False):
        """Collect samples, cuts and variables for the requested training regions.

        Args:
            config: configuration object providing ``get`` and ``has_option``.
            sampleIdentifier: identifier of the sample whose (sub)samples are
                cached by :meth:`run`.
            trainingRegions: names of the config sections describing the
                training regions.
            splitFilesChunks: forwarded to ``TreeCache``.
            chunkNumber: forwarded to ``TreeCache`` and ``SampleTree``.
            splitFilesChunkSize: forwarded to ``TreeCache`` and ``SampleTree``.
            force: if true, parts which are already cached are deleted and
                cached again.
        """
        self.config = config
        self.force = force
        self.sampleIdentifier = sampleIdentifier
        self.trainingRegions = trainingRegions

        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'MVAin')
        self.samplesDefinitions = self.config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        # NOTE(review): the 'backgrounds', 'signals' and 'systematics' options are
        # evaluated as Python expressions with eval(), so the config must be trusted.
        self.backgroundSampleNames = list(set(sum([eval(self.config.get(trainingRegion, 'backgrounds')) for trainingRegion in self.trainingRegions], [])))
        self.signalSampleNames = list(set(sum([eval(self.config.get(trainingRegion, 'signals')) for trainingRegion in self.trainingRegions], [])))
        self.samples = self.samplesInfo.get_samples(list(set(self.backgroundSampleNames + self.signalSampleNames)))

        self.trainingRegionsDict = {}
        for trainingRegion in self.trainingRegions:
            treeCutName = config.get(trainingRegion, 'treeCut')
            treeVarSet = config.get(trainingRegion, 'treeVarSet').strip()
            systematics = eval(config.get(trainingRegion, 'systematics')) if config.has_option(trainingRegion, 'systematics') else []
            mvaVars = config.get(treeVarSet, 'Nominal').split(' ')
            weightVars = []
            for syst in systematics:
                # use the upper-case option name if it exists, otherwise the capitalized one
                systNameUp = syst + '_UP' if self.config.has_option('Weights', syst + '_UP') else syst + '_Up'
                systNameDown = syst + '_DOWN' if self.config.has_option('Weights', syst + '_DOWN') else syst + '_Down'
                weightVars += [self.config.get('Weights', systNameUp), self.config.get('Weights', systNameDown)]

            self.trainingRegionsDict[trainingRegion] = {
                    'cut': config.get('Cuts', treeCutName),
                    'vars': mvaVars,
                    'weightVars': weightVars,
                    }

        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')

        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.splitFilesChunkSize = splitFilesChunkSize

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

    def printInfo(self):
        """Print the name and the cut of every training region."""
        print ("REGION:".ljust(24),"CUT:")
        # items() works on Python 2 and 3, iteritems() only exists on Python 2
        for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.items():
            print (" > ",trainingRegion.ljust(20), trainingRegionInfo['cut'])

    def run(self):
        """Create the tree caches for ``self.sampleIdentifier`` and process the tree.

        A cache is only (re)created if its part is not cached yet or if
        ``self.force`` is set; in the latter case the existing cached files of
        this chunk are deleted first. Prints "nothing to do!" if no cache had
        to be created.
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # cache samples
        # ----------------------------------------------------------------------------------------------------------------------
        sampleToCache = self.sampleIdentifier
        print ('*'*80)
        print (' ',sampleToCache)
        print ('*'*80)
        # prepare caches for training and evaluation samples
        treeCaches = []
        self.sampleTree = None

        # use all (sub)samples which come from the same files (sampleIdentifier)
        subsamples = [x for x in self.samples if x.identifier == sampleToCache]

        # The label is carried alongside the cut instead of being derived by comparing
        # cut strings, so the two caches get distinct names even if both cuts are equal.
        datasetCuts = [('TRAIN', self.TrainCut), ('EVAL', self.EvalCut)]

        # list of branches to keep for use as MVA input variables
        branchListOfMVAVars = BranchList()
        for sample in subsamples:
            for trainingRegionInfo in self.trainingRegionsDict.values():
                # NOTE(review): the variables are added once per train/eval cut although the
                # cut itself is not used here; kept because BranchList.addCut semantics are
                # not visible from this file.
                for _ in datasetCuts:
                    branchListOfMVAVars.addCut(trainingRegionInfo['vars'])
                for weightVar in trainingRegionInfo['weightVars']:
                    branchListOfMVAVars.addCut(weightVar)
        branchListOfMVAVars.addCut(self.config.get('Weights', 'weightF'))
        mvaBranches = branchListOfMVAVars.getListOfBranches()

        # loop over all samples
        for sample in subsamples:

            # add cuts for all training regions
            for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.items():

                # add cuts for training and evaluation
                for datasetLabel, additionalCut in datasetCuts:

                    # cuts: sample sub-cut, train/eval cut and region cut (empty cuts are skipped)
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    if trainingRegionInfo['cut']:
                        sampleCuts.append(trainingRegionInfo['cut'])

                    # add cache object
                    tc = TreeCache.TreeCache(
                        name='{region}_{sample}_{tr}'.format(region=trainingRegion, sample=sample.name, tr=datasetLabel),
                        sample=sample.name,
                        cutList=sampleCuts,
                        inputFolder=self.samplesPath,
                        splitFilesChunks=self.splitFilesChunks,
                        chunkNumber=self.chunkNumber,
                        splitFilesChunkSize=self.splitFilesChunkSize,
                        branches=mvaBranches,
                        config=self.config,
                        debug=True
                    )

                    # check if this part of the sample is already cached
                    isCached = tc.partIsCached()
                    if not isCached or self.force:
                        if isCached:
                            tc.deleteCachedFiles(chunkNumber=self.chunkNumber)
                        # for the first sample which comes from this files, load the tree
                        if not self.sampleTree:
                            self.sampleTree = SampleTree({'name': sample.identifier, 'folder': self.samplesPath}, splitFilesChunkSize=self.splitFilesChunkSize, chunkNumber=self.chunkNumber, config=self.config, saveMemory=True)
                        treeCaches.append(tc.setSampleTree(self.sampleTree).cache())

        if treeCaches:
            # run on the tree
            self.sampleTree.process()
        else:
            print ("nothing to do!")

Example #46

Show file

File: write_numpy_array_for_training.py Project: perrozzi/Xbb

class SampleTreesToNumpyConverter(object):
    """Convert the cached sample trees of one MVA region into numpy arrays.

    :meth:`run` collects features, targets and weights for the train and test
    parts (plus systematic variations of the training features) in
    ``self.data`` and writes them gzip-compressed to ``./<mvaName>.dmpz``.
    """

    def __init__(self, config, mvaName):
        """Read cuts, variables, systematics and sample lists from the config.

        Args:
            config: configuration object providing ``get(section, option)``.
            mvaName: name of the config section of the MVA region; it provides
                the ``treeCut`` and ``treeVarSet`` options and is used as the
                base name of the output file.
        """
        self.mvaName = mvaName
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)
        self.dataFormatVersion = 2
        self.sampleTrees = []
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)

        # region
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/eval sets
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')
        # rescale MC by 2 because of train/eval split
        self.globalRescale = 2.0

        # variables and systematics
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        # empty entries (empty option, repeated spaces) are dropped so that they are
        # not looked up as option names below
        self.systematics = [x for x in config.get('systematics', 'systematics').strip().split(' ') if len(x.strip()) > 0]
        self.MVA_Vars = {'Nominal': [x for x in config.get(self.treeVarSet, 'Nominal').strip().split(' ') if len(x.strip()) > 0]}
        for syst in self.systematics:
            self.MVA_Vars[syst] = [x for x in config.get(self.treeVarSet, syst).strip().split(' ') if len(x.strip()) > 0]

        # samples: one entry per category, the position of a category in the key list
        # is used as training target in run()
        # NOTE(review): the sample lists are evaluated with eval(), the config must be trusted
        self.sampleNames = {
                    'SIG_ALL': eval(self.config.get('Plot_general', 'allSIG')),
                    'BKG_ALL': eval(self.config.get('Plot_general', 'allBKG')),
                }
        # items() works on Python 2 and 3, iteritems() only exists on Python 2
        self.samples = {category: self.samplesInfo.get_samples(samples) for category, samples in self.sampleNames.items()}

    def run(self):
        """Fill the numpy arrays from the cached trees and write them to disk.

        Raises:
            Exception: ``"CachedTreeMissing"`` if ``TreeCache.getTree()`` returns
                nothing for one of the samples.
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/testing trees
        # ----------------------------------------------------------------------------------------------------------------------
        # a real list is needed: positions are used as targets and Python 3 key views
        # have no index()
        categories = list(self.samples)
        datasetParts = {'train': self.trainCut, 'test': self.evalCut}

        systematics = self.systematics
        arrayLists = {datasetName: [] for datasetName in datasetParts}
        arrayLists_sys = {syst: {datasetName: [] for datasetName in datasetParts} for syst in systematics}
        weightLists = {datasetName: [] for datasetName in datasetParts}
        targetLists = {datasetName: [] for datasetName in datasetParts}

        # standard weight expression
        weightF = self.config.get('Weights', 'weightF')

        # the feature lists are the same for all samples
        features = self.MVA_Vars['Nominal']
        features_sys = {syst: self.MVA_Vars[syst] for syst in systematics}
        nFeatures = len(features)

        for categoryIndex, category in enumerate(categories):
            for sample in self.samples[category]:
                print ('*'*80,'\n%s\n'%sample,'*'*80)
                for datasetName, additionalCut in datasetParts.items():
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)

                    # get ROOT tree for selected sample & region cut
                    tc = TreeCache.TreeCache(
                            sample=sample,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            config=self.config,
                            debug=True
                        )
                    sampleTree = tc.getTree()
                    if not sampleTree:
                        print ("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")

                    treeScale = sampleTree.getScale(sample) * self.globalRescale
                    print ('scale:', treeScale)

                    # initialize numpy arrays
                    # NOTE(review): the systematics arrays use the number of nominal features
                    # as width - presumably all variable sets have the same length; confirm
                    nSamples = sampleTree.GetEntries()
                    print('nFeatures:', nFeatures)
                    inputData = np.zeros((nSamples, nFeatures), dtype=np.float32)
                    inputData_sys = {syst: np.zeros((nSamples, nFeatures), dtype=np.float32) for syst in systematics}

                    # initialize formulas for ROOT tree
                    for feature in features:
                        sampleTree.addFormula(feature)
                    for features_s in features_sys.values():
                        for feature in features_s:
                            sampleTree.addFormula(feature)
                    sampleTree.addFormula(weightF)

                    # fill numpy array from ROOT tree
                    for i, _event in enumerate(sampleTree):
                        for j, feature in enumerate(features):
                            inputData[i, j] = sampleTree.evaluate(feature)
                        # total weight comes from weightF (btag, lepton sf, ...) and treeScale to scale MC to x-section
                        totalWeight = treeScale * sampleTree.evaluate(weightF)
                        weightLists[datasetName].append(totalWeight)
                        targetLists[datasetName].append(categoryIndex)

                        # fill systematics
                        for syst, features_s in features_sys.items():
                            for j, feature in enumerate(features_s):
                                inputData_sys[syst][i, j] = sampleTree.evaluate(feature)

                    arrayLists[datasetName].append(inputData)
                    for syst in systematics:
                        arrayLists_sys[syst][datasetName].append(inputData_sys[syst])

        # concatenate all data from different samples
        self.data = {
                'train': {
                    'X': np.concatenate(arrayLists['train'], axis=0),
                    'y': np.array(targetLists['train'], dtype=np.float32),
                    'sample_weight': np.array(weightLists['train'], dtype=np.float32),
                    },
                'test': {
                    'X': np.concatenate(arrayLists['test'], axis=0),
                    'y': np.array(targetLists['test'], dtype=np.float32),
                    'sample_weight': np.array(weightLists['test'], dtype=np.float32),
                    },
                'category_labels': dict(enumerate(categories)),
                'meta': {
                    'version': self.dataFormatVersion,
                    'region': self.mvaName,
                    'cutName': self.treeCutName,
                    'cut': self.treeCut,
                    'trainCut': self.trainCut,
                    'testCut': self.evalCut,
                    'samples': self.sampleNames,
                    'weightF': weightF,
                    'variables': ' '.join(self.MVA_Vars['Nominal'])
                    }
                }
        # add systematics variations
        # NOTE(review): only the training part of the variations is stored although the
        # test part is collected as well - confirm that this is intended
        for syst in systematics:
            self.data['train']['X_' + syst] = np.concatenate(arrayLists_sys[syst]['train'], axis=0)

        numpyOutputFileName = './' + self.mvaName + '.dmpz'
        with gzip.open(numpyOutputFileName, 'wb') as outputFile:
            pickle.dump(self.data, outputFile)
        print(self.data['meta'])
        print("written to:\x1b[34m", numpyOutputFileName, " \x1b[0m")

Example #47

Show file

File: train.py Project: dcurry09/Heppy

backgrounds = eval(backgrounds)
print '\n -----> Training Backgrounds: ', backgrounds



treeVarSet  = config.get(run,'treeVarSet')
#print '\n -----> Training Features: ', treeVarSet
        
#variables
#TreeVar Array
MVA_Vars={}
MVA_Vars['Nominal']=config.get(treeVarSet,'Nominal')
MVA_Vars['Nominal']=MVA_Vars['Nominal'].split(' ')    

#Infofile
info = ParseInfo(samplesinfo,path)

#Workdir
workdir=ROOT.gDirectory.GetPath()

# Test and Train event cuts
#TrainCut = '%s & EventForTraining==1' % TCut
#EvalCut  = '%s & EventForTraining==0' % TCut
TrainCut= TCut +' & evt%2==0'
EvalCut = TCut +' & evt%2!=0'


cuts = [TrainCut,EvalCut]  

print '\n ------> with Train Cuts: ', TrainCut
print '                Test Cuts : ', EvalCut

Example #48

Show file

File: workspace_datacard.py Project: jmduarte/Xbb

    print "Unknown Pt region"
    pt_region = 'NoSysRegion'
    #sys.exit("Unknown Pt region")
# Rescale MC by a factor of 2 when TrainFlag is set, otherwise leave it unscaled
if TrainFlag:
    MC_rescale_factor = 2.
    print 'I RESCALE BY 2.0'
else:
    MC_rescale_factor = 1.
# suffixes of the up/down systematic variations
UD = ['Up', 'Down']

print 'Parse the sample information'
print '============================\n'
# parse the samples configuration
info = ParseInfo(samplesinfo, path)
# look up the sample objects for the different sample lists:
# all samples, signals, backgrounds and data

print 'Get the sample list'
print '===================\n'
all_samples = info.get_samples(signals + backgrounds + additionals)
print 'workspace_datacard-all_samples:', [job.name for job in all_samples]

signal_samples = info.get_samples(signals)
print 'signal samples:', [job.name for job in signal_samples]

background_samples = info.get_samples(backgrounds)
# the data sample names are a space-separated list in the 'dc:<var>' section
data_sample_names = config.get('dc:%s' % var, 'data').split(' ')
print 'data_sample_names are', data_sample_names
data_samples = info.get_samples(data_sample_names)

Example #49

Show file

File: submitThem.py Project: perrozzi/VHbb

    for item in train_list:
        submit(item,repDict)


if opts.task == 'dc':
    #DC_vars = config.items('Limit')
    DC_vars= (config.get('LimitGeneral','List')).split(',')
    print DC_vars

if opts.task == 'plot':
    Plot_vars= (config.get('Plot_general','List')).split(',')

if not opts.task == 'prep':
    path = config.get("Directories","samplepath")
    samplesinfo = config.get("Directories","samplesinfo")
    info = ParseInfo(samplesinfo,path)

if opts.task == 'plot': 
    repDict['queue'] = 'all.q'
    for item in Plot_vars:
        submit(item,repDict)

if opts.task == 'trainReg':
    repDict['queue'] = 'all.q'
    submit('trainReg',repDict)


elif opts.task == 'dc':
    repDict['queue'] = 'all.q'
    for item in DC_vars:
        if 'ZH%s'%opts.mass in item:

Example #50

Show file

import sys
import os
from myutils.XbbConfig import XbbConfigReader, XbbConfigTools
from myutils import ParseInfo
from myutils.FileLocator import FileLocator
from myutils.XbbTools import XbbTools

# command line options
argv = sys.argv
parser = OptionParser()
parser.add_option("-T", "--tag", dest="tag", default='', help="config tag")
parser.add_option("-D", "--directory", dest="directory", default='MVAout', help="directory name, e.g. MVAout")
parser.add_option("-S", "--sample", dest="sample", default='TT*', help="sample")
opts, args = parser.parse_args(argv)

# configuration and sample definitions
config = XbbConfigTools(config=XbbConfigReader.read(opts.tag))
path = config.get("Directories", opts.directory)
sampleInfoDirectory = config.get('Directories', 'samplefiles')
info = ParseInfo(samples_path=path, config=config)

# only take the first sample which matches
availableIdentifiers = info.getSampleIdentifiers()
requestedSamples = XbbTools.parseSamplesList(opts.sample)
sampleIdentifier = XbbTools.filterSampleList(availableIdentifiers, requestedSamples)[0]

# get list of ORIGINAL file names for this sample: /store/...
sampleTreeFileNames = XbbTools.getSampleTreeFileNames(sampleInfoDirectory, sampleIdentifier)

fileLocator = FileLocator(config=config)

# get local name of the first file
firstFileName = sampleTreeFileNames[0]
localFilename = fileLocator.getFilePath(path, sampleIdentifier, firstFileName)
print(localFilename)

Example #51

Show file

File: evaluateMVA.py Project: jmduarte/Xbb

config = BetterConfigParser()
config.read(opts.config)
anaTag = config.get("Analysis", "tag")

# locations
Wdir = config.get('Directories', 'Wdir')
samplesinfo = config.get('Directories', 'samplesinfo')

# shape systematics
systematics = config.get('systematics', 'systematics')

# 'MVAin' and 'MVAout' directories
INpath = config.get('Directories', 'MVAin')
OUTpath = config.get('Directories', 'MVAout')

info = ParseInfo(samplesinfo, INpath)

# use the discriminators from the command line (e.g. RTight_blavla,bsbsb)
# unless an optimisation is evaluated, in which case 'weight' is used
arglist = weight if evaluate_optimisation else opts.discr

# comma-separated list of names from the command line
namelistIN = opts.names
namelist = namelistIN.split(',')

print('namelist', namelist)

Example #52

Show file

File: run_training_scikit.py Project: GLP90/Xbb

class MvaTrainingHelper(object):

    def __init__(self, config, mvaName):
        """Read samples, cuts, variables and classifier settings for ``mvaName``.

        Args:
            config: configuration object providing ``get(section, option)``.
            mvaName: name of the config section describing the training; it
                provides ``treeVarSet``, ``MVAsettings``, ``backgrounds``,
                ``signals`` and ``treeCut``.

        The classifier parameters start from the defaults defined below and
        are overridden by ``MVAsettings``, a ``:``-separated list of
        ``name=value`` pairs whose values are evaluated as Python expressions.
        Empty segments (e.g. from a trailing ``:``) are ignored.
        """
        self.dataRepresentationVersion = 2
        self.config = config
        self.samplesPath = config.get('Directories', 'MVAin')
        self.samplesDefinitions = config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.logpath = config.get('Directories', 'logpath')
        self.treeVarSet = config.get(mvaName, 'treeVarSet')
        self.mvaName = mvaName
        self.MVAsettings = config.get(mvaName,'MVAsettings')
        self.factoryname = 'scikit-test1'

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

        # variables
        self.MVA_Vars = {}
        self.MVA_Vars['Nominal'] = config.get(self.treeVarSet, 'Nominal').strip().split(' ')

        # samples
        # NOTE(review): sample lists and setting values are evaluated with eval(),
        # so the config must be trusted
        self.backgroundSampleNames = eval(config.get(mvaName, 'backgrounds'))
        self.signalSampleNames = eval(config.get(mvaName, 'signals'))
        self.samples = {
            'BKG': self.samplesInfo.get_samples(self.backgroundSampleNames),
            'SIG': self.samplesInfo.get_samples(self.signalSampleNames),
        }

        # MVA signal region cuts
        self.treeCutName = config.get(mvaName, 'treeCut')
        self.treeCut = config.get('Cuts', self.treeCutName)

        # split in train/test samples
        self.datasets = ['train', 'test']
        self.varsets = ['X', 'y', 'sample_weight']
        self.trainCut = config.get('Cuts', 'TrainCut')
        self.evalCut = config.get('Cuts', 'EvalCut')

        print("TRAINING CUT:", self.trainCut)
        print("TEST CUT:", self.evalCut)

        self.globalRescale = 2.0

        # default parameters
        self.parameters = {
                'factoryname': self.factoryname,
                'mvaName': self.mvaName,
                'MVAregionCut': self.treeCutName + ': ' + self.treeCut,
                #'classifier': 'GradientBoostingClassifier',
                'classifier': 'RandomForestClassifier',
                #'classifier': 'ExtraTreesClassifier',
                #'classifier': 'FT_GradientBoostingClassifier',
                'max_depth': None,
                'max_leaf_nodes': None,
                'class_weight': 'balanced',
                #'criterion': 'friedman_mse',
                'criterion': 'gini',
                #'n_estimators': 3000,
                'n_estimators': 400,
                #'learning_rate': 0.1,
                'algorithm': 'SAMME.R',
                #'min_samples_leaf': 100,
                'splitter': 'best',
                'max_features': 4,
                'subsample': 0.6,
                'limit': -1,
                'additional_signal_weight': 1.0,
                'min_impurity_split': 0.0,
                'bootstrap': True,
                }

        # load parameters from config in a format similar to Root TMVA parameter string
        evaluatedSettings = []
        for mvaSetting in self.MVAsettings.split(':'):
            if not mvaSetting.strip():
                # tolerate empty segments instead of failing with an IndexError
                continue
            settingParts = mvaSetting.split('=')
            settingName = settingParts[0].strip()
            self.parameters[settingName] = eval(settingParts[1].strip())
            try:
                evaluatedSettings.append('%s' % settingName + '=' + '%r' % self.parameters[settingName])
            except Exception:
                # '%r' % value fails e.g. for tuple values; keep the raw text in that case
                print("???:", mvaSetting)
                evaluatedSettings.append(mvaSetting)

        self.MVAsettingsEvaluated = ':'.join(evaluatedSettings)

    # load numpy arrays with training/testing data
    def loadCachedNumpyArrays(self, cachedFilesPath):
        cached = True
        try:
            with open(cachedFilesPath + '/scikit_input.dmp', 'rb') as inputFile:
                self.data = pickle.load(inputFile)
            print("INFO: found numpy arrays for input in:", cachedFilesPath)
        except:
            cached = False
        return cached

    # save numpy arrays with training/testing data
    def writeNumpyArrays(self, cachedFilesPath):
        with open(cachedFilesPath + '/scikit_input.dmp', 'wb') as outputFile:
            pickle.dump(self.data, outputFile)
        print("INFO: wrote numpy arrays for input to:", cachedFilesPath)

    def getCachedNumpyArrayPath(self):
        identifier = self.treeCut + '__VAR:' + ' '.join(self.MVA_Vars['Nominal']) + '__SIG:' + '/'.join(self.signalSampleNames) + '__BKG:' + '/'.join(self.backgroundSampleNames) + '__V:%r'%self.dataRepresentationVersion
        varsHash = hashlib.sha224(identifier).hexdigest()
        cachedFilesPath = self.logpath + '/../cache/' + varsHash + '/'
        return cachedFilesPath

    def getHash(self):
        identifier = self.treeCut + '__VAR:' + ' '.join(self.MVA_Vars['Nominal']) + '__SIG:' + '/'.join(self.signalSampleNames) + '__BKG:' + '/'.join(self.backgroundSampleNames) + '__PAR:%r'%self.parameters
        return hashlib.sha224(identifier).hexdigest()[:8]

    def prepare(self):
        """Build the numpy arrays for training and testing, or load them from the cache.

        If a cache file exists in the directory given by
        ``getCachedNumpyArrayPath()`` it is loaded into ``self.data``.
        Otherwise the arrays are filled from the cached trees of all background
        and signal samples and written to that directory.

        Returns:
            self

        Raises:
            OSError: if the cache directory does not exist and cannot be created.
            Exception: ``"CachedTreeMissing"`` if ``TreeCache.getTree()`` returns
                nothing for one of the samples.
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/testing trees
        # ----------------------------------------------------------------------------------------------------------------------
        self.sampleTrees = []
        # the position in this list is used as training target (BKG: 0, SIG: 1)
        categories = ['BKG', 'SIG']
        datasetParts = {'train': self.trainCut, 'test': self.evalCut}

        cachedFilesPath = self.getCachedNumpyArrayPath()
        try:
            os.makedirs(cachedFilesPath)
        except OSError:
            # An existing directory is fine. Anything else would make the final write
            # fail after all arrays have been filled, so fail early instead.
            if not os.path.isdir(cachedFilesPath):
                raise

        # load numpy arrays from disk if they have been already created
        if self.loadCachedNumpyArrays(cachedFilesPath):
            return self

        # plain dict iteration and items() work on Python 2 and 3
        arrayLists = {datasetName: [] for datasetName in datasetParts}
        weightLists = {datasetName: [] for datasetName in datasetParts}
        targetLists = {datasetName: [] for datasetName in datasetParts}

        # standard weight expression
        weightF = self.config.get('Weights','weightF')

        for categoryIndex, category in enumerate(categories):
            for sample in self.samples[category]:
                print ('*'*80,'\n%s\n'%sample,'*'*80)
                for datasetName, additionalCut in datasetParts.items():
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)

                    # get ROOT tree for selected sample & region cut
                    tc = TreeCache.TreeCache(
                            sample=sample,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            config=self.config,
                            debug=True
                        )
                    sampleTree = tc.getTree()
                    if not sampleTree:
                        print ("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")

                    treeScale = sampleTree.getScale(sample) * self.globalRescale
                    print ('scale:', treeScale)

                    # initialize numpy array
                    nSamples = sampleTree.GetEntries()
                    features = self.MVA_Vars['Nominal']
                    nFeatures = len(features)
                    print('nFeatures:', nFeatures)
                    inputData = np.zeros((nSamples, nFeatures), dtype=np.float32)

                    # initialize formulas for ROOT tree
                    for feature in features:
                        sampleTree.addFormula(feature)
                    sampleTree.addFormula(weightF)

                    # fill numpy array from ROOT tree
                    for i, _event in enumerate(sampleTree):
                        for j, feature in enumerate(features):
                            inputData[i, j] = sampleTree.evaluate(feature)
                        # total weight comes from weightF (btag, lepton sf, ...) and treeScale to scale MC to x-section
                        totalWeight = treeScale * sampleTree.evaluate(weightF)
                        weightLists[datasetName].append(totalWeight)
                        targetLists[datasetName].append(categoryIndex)

                    arrayLists[datasetName].append(inputData)

        # concatenate all data from different samples
        self.data = {
                'train': {
                    'X': np.concatenate(arrayLists['train'], axis=0),
                    'y': np.array(targetLists['train'], dtype=np.float32),
                    'sample_weight': np.array(weightLists['train'], dtype=np.float32),
                    },
                'test': {
                    'X': np.concatenate(arrayLists['test'], axis=0),
                    'y': np.array(targetLists['test'], dtype=np.float32),
                    'sample_weight': np.array(weightLists['test'], dtype=np.float32),
                    },
                }

        # write numpy arrays to disk
        self.writeNumpyArrays(cachedFilesPath)

        return self

    def verify_data(self):
        valid = True
        for dataset in self.datasets:
            for var in self.varsets:
                print("DEBUG: self.data['{dataset}']['{var}'].shape = {shape}".format(dataset=dataset, var=var, shape=self.data[dataset][var].shape))

        for dataset in self.datasets:
            for i in range(len(self.varsets)-1):
                valid = valid and self.data[dataset][self.varsets[i]].shape[0] == self.data[dataset][self.varsets[i+1]].shape[0]
        return valid

    def run(self):

        if not self.verify_data():
            print ("\x1b[31mERROR: training input data array shapes are incompatible!\x1b[0m")
            raise Exception("BadTrainingInputData")

        applyClassWeights = False
        if self.parameters['classifier'] == 'GradientBoostingClassifier':
            clf = GradientBoostingClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    learning_rate=self.parameters['learning_rate'], 
                    subsample=self.parameters['subsample'],
                    min_impurity_split=self.parameters['min_impurity_split'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'RandomForestClassifier':
            clf = RandomForestClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    bootstrap=self.parameters['bootstrap'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'ExtraTreesClassifier':
            clf = ExtraTreesClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    bootstrap=self.parameters['bootstrap'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'FT_GradientBoostingClassifier':
            rt = RandomTreesEmbedding(max_depth=3, n_estimators=20, random_state=0)
            clf0 = GradientBoostingClassifier(
                    min_samples_leaf=self.parameters['min_samples_leaf'], 
                    max_depth=self.parameters['max_depth'], 
                    max_leaf_nodes=self.parameters['max_leaf_nodes'],
                    criterion=self.parameters['criterion'],
                    max_features=self.parameters['max_features'],
                    n_estimators=self.parameters['n_estimators'], 
                    learning_rate=self.parameters['learning_rate'], 
                    subsample=self.parameters['subsample'],
                    min_impurity_split=self.parameters['min_impurity_split'],
                )
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
            clf = make_pipeline(rt, clf0)
        elif self.parameters['classifier'] == 'XGBClassifier':
            clf = XGBClassifier(
                    learning_rate=self.parameters['learning_rate'],
                    max_depth=self.parameters['max_depth'],
                    n_estimators=self.parameters['n_estimators'],
                    objective='binary:logitraw',
                    colsample_bytree=self.parameters['colsample_bytree'],
                    subsample=self.parameters['subsample'],
                    min_child_weight=self.parameters['min_child_weight'],
                    gamma=self.parameters['gamma'] if 'gamma' in self.parameters else 0.0,
                    #reg_alpha=8,
                    reg_lambda=self.parameters['reg_lambda'] if 'reg_lambda' in self.parameters else 1.0,
                    reg_alpha=self.parameters['reg_alpha'] if 'reg_alpha' in self.parameters else 0.0,
                    ) 
            if self.parameters['class_weight'] == 'balanced':
                applyClassWeights = True
        elif self.parameters['classifier'] == 'MLPClassifier':
            classifierParams = {k:v for k,v in self.parameters.iteritems() if k in ['solver', 'alpha', 'hidden_layer_sizes', 'max_iter', 'warm_start', 'learning_rate_init', 'learning_rate', 'momentum', 'epsilon', 'beta_1', 'beta_2', 'validation_fraction', 'early_stopping']}
            clf = MLPClassifier(**classifierParams) 
        elif self.parameters['classifier'] in ['SVC', 'LinearSVC']:
            '''
            clf = SVC(
                        C=1.0,
                        cache_size=4000,
                        class_weight='balanced',
                        coef0=0.0,
                        decision_function_shape='ovr',
                        degree=3,
                        gamma='auto',
                        kernel='rbf',
                        max_iter=100000,
                        probability=False,
                        random_state=None,
                        shrinking=True,
                        tol=0.001,
                        verbose=True
                    )
            '''
            bagged = int(self.parameters['bagged']) if 'bagged' in self.parameters else False
            if self.parameters['classifier'] == 'LinearSVC':
                clf = LinearSVC(
                            class_weight='balanced',
                            dual=self.parameters['dual'],
                            max_iter=self.parameters['max_iter'],
                            C=self.parameters['C'],
                            penalty=self.parameters['penalty'],
                            loss=self.parameters['loss'],
                            tol=self.parameters['tol'],
                            verbose=True,
                        )
            else:
                # classifier='SVC':C=random.choice([1.0, 10.0, 100.0, 500.0, 1000.0]):kernel=random.choice(['rbf','poly','linear']):degree=random.choice([2,3,4]):gamma=random.choice(['auto', 0.1, 0.3, 0.6]):shrinking=random.choice([True, False]):max_iter=10000:penalty=random.choice(['l1','l2']):tol=random.choice([0.005, 0.001, 0.0005, 0.0001]):cache_size=1000
                clf =  SVC(
                        C=self.parameters['C'],
                        cache_size=self.parameters['cache_size'],
                        class_weight='balanced',
                        coef0=0.0,
                        decision_function_shape='ovr',
                        degree=self.parameters['degree'],
                        gamma=self.parameters['gamma'],
                        kernel=self.parameters['kernel'],
                        max_iter=self.parameters['max_iter'],
                        probability=False,
                        random_state=None,
                        shrinking=self.parameters['shrinking'],
                        tol=self.parameters['tol'],
                        verbose=True
                    )

            if bagged:
                n_estimators = bagged
                if 'bag_oversampling' in self.parameters:
                    n_estimators = int(n_estimators * self.parameters['bag_oversampling'])

                clf0 = clf
                clf = BaggingClassifier(
                        clf0,
                        max_samples=1.0 / bagged,
                        max_features=self.parameters['baggedfeatures'] if 'baggedfeatures' in self.parameters else 1.0,
                        bootstrap_features=self.parameters['bootstrapfeatures'] if 'bootstrapfeatures' in self.parameters else False,
                        n_estimators=n_estimators,
                    )

        else:
            clf = AdaBoostClassifier(
                    DecisionTreeClassifier(
                        min_samples_leaf=self.parameters['min_samples_leaf'], 
                        max_depth=self.parameters['max_depth'], 
                        class_weight=self.parameters['class_weight'], 
                        criterion=self.parameters['criterion'],
                        splitter=self.parameters['splitter'],
                        max_features=self.parameters['max_features'],
                        ), 
                    n_estimators=self.parameters['n_estimators'], 
                    learning_rate=self.parameters['learning_rate'], 
                    algorithm=self.parameters['algorithm'],
                )

        #with open("/mnt/t3nfs01/data01/shome/berger_p2/VHbb/CMSSW_9_4_0_pre3/src/Xbb/python/logs_v25//test-scikit-svm/Logs//../cache/b7d92f50a52f8474e66cf4e2c3ad3fa4725aa489e7a6b288e4ed3855//clf2018-01-31_18-22-38_be9479a2.pkl","rb") as inputFile:
        #    clf = pickle.load(inputFile)

        # preprocessing
        print("transformation...")

        if 'scaler' in self.parameters:
            if self.parameters['scaler'] == 'standard':
                self.scaler = preprocessing.StandardScaler().fit(self.data['train']['X'])
            elif self.parameters['scaler'] == 'minmax':
                self.scaler = preprocessing.MinMaxScaler().fit(self.data['train']['X'])
            elif self.parameters['scaler'] == 'robust':
                self.scaler = preprocessing.RobustScaler().fit(self.data['train']['X'])
            else:
                self.scaler = None
        else:
            self.scaler = None

        if self.scaler:
            self.data['train']['X'] = self.scaler.transform(self.data['train']['X'])
            self.data['test']['X'] = self.scaler.transform(self.data['test']['X'])

        # SHUFFLE all samples before limiting/oversampling (currently disabled: self.shuffle is hard-coded to False)
        self.shuffle = False
        if self.shuffle:
            print("shuffle input data...")
            for dataset in self.datasets:
                nSamples = self.data[dataset][self.varsets[0]].shape[0]
                randomPermutation = np.random.permutation(nSamples)
                for var in self.varsets:
                    self.data[dataset][var] = np.take(self.data[dataset][var], randomPermutation, axis=0)

        # LIMIT the number of samples (applied to every dataset in self.datasets, not only 'train')
        # shuffling beforehand is recommended, because the samples are ordered by signal/background
        limitNumTrainingSamples = self.parameters['limit']
        if (limitNumTrainingSamples > 0):
            print("limit training samples to:", limitNumTrainingSamples)
            #for dataset in self.datasets:
            #    for var in self.varsets:
            #        self.data[dataset][var] = self.data[dataset][var][0:limitNumTrainingSamples]
            for dataset in self.datasets:
                self.data[dataset] = resample(self.data[dataset], n_samples=limitNumTrainingSamples, replace=False)

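        # OVERSAMPLE: repeat each training event int(sample_weight * upscalefactor) times, clamped to [1, upscalemax];
        # signal events (y > 0.5) additionally have their weight multiplied by upscalefactorsignal first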
        upscale = self.parameters['upscalefactor'] if 'upscalefactor' in self.parameters else None
        if upscale:
            upscalemax =  self.parameters['upscalemax'] if 'upscalemax' in self.parameters else 10 
            upscalesignal = self.parameters['upscalefactorsignal'] if 'upscalefactorsignal' in self.parameters else 1.0 #upscalefactorsignal
            indices = []
            for i in range(len(self.data['train']['sample_weight'])):
                #print(x)
                x= self.data['train']['sample_weight'][i]
                if self.data['train']['y'][i] > 0.5:
                    x *= upscalesignal
                n = x * upscale
                # limit oversampling factor!
                if n > upscalemax:
                    n=upscalemax
                if n<1:
                    n=1
                intN = int(n)
                indices += [i]*intN
                #floatN = n-intN
                #if floatN > 0:
                #    if random.uniform(0.0,1.0) < floatN:
                #        indices += [i]

            self.data['train']['X'] = self.data['train']['X'][indices]
            self.data['train']['y'] = self.data['train']['y'][indices]
            self.data['train']['sample_weight'] = self.data['train']['sample_weight'][indices]
            self.verify_data()

        # BALANCE weights
        # calculate total weights and class_weights
        nSig = len([x for x in self.data['train']['y'] if x >= 0.5])
        nBkg = len([x for x in self.data['train']['y'] if x < 0.5])
        print("#SIG:", nSig)
        print("#BKG:", nBkg)
        weightsSignal = []
        weightsBackground = []
        for i in range(len(self.data['train']['sample_weight'])):
            if self.data['train']['y'][i] < 0.5:
                weightsBackground.append(self.data['train']['sample_weight'][i])
            else:
                weightsSignal.append(self.data['train']['sample_weight'][i])
        weightsSignal.sort()
        weightsBackground.sort()
        totalWeightSignal = sum(weightsSignal)
        totalWeightBackground = sum(weightsBackground)
        signalReweight = (totalWeightSignal+totalWeightBackground)/totalWeightSignal * self.parameters['additional_signal_weight']
        backgroundReweight = (totalWeightSignal+totalWeightBackground)/totalWeightBackground
        print("SUM of weights for signal:", totalWeightSignal)
        print("SUM of weights for background:", totalWeightBackground)
        
        if applyClassWeights:
            print("re-weight signals by:", signalReweight)
            print("re-weight background by:", backgroundReweight)
            for i in range(len(self.data['train']['sample_weight'])):
                if self.data['train']['y'][i] < 0.5:
                    self.data['train']['sample_weight'][i] *= backgroundReweight
                else:
                    self.data['train']['sample_weight'][i] *= signalReweight
        else:
            print("DO NOT re-weight signals by:", signalReweight)
        print("...")
        # TRAINING

        learningCurve = []
        if self.parameters['classifier'] == 'XGBClassifier':
            clf = clf.fit(self.data['train']['X'], self.data['train']['y'], self.data['train']['sample_weight'], verbose=True)
        else:
            try:
                clf = clf.fit(**self.data['train'])
            except:
                clf = clf.fit(X=self.data['train']['X'], y=self.data['train']['y'])
                
                if 'rounds' in self.parameters and self.parameters['rounds'] > 1:
                    for rNumber in range(self.parameters['rounds']):
                        results = clf.predict_proba(self.data['test']['X']) 
                        auc1 = roc_auc_score(self.data['test']['y'], results[:,1], sample_weight=self.data['test']['sample_weight'])
                        print(" round ", rNumber, " AUC=", auc1)
                        learningCurve.append(auc1)
                        clf = clf.fit(X=self.data['train']['X'], y=self.data['train']['y'])

        print("***** FIT done")

        # TEST
        try:
            results = clf.decision_function(self.data['test']['X'])
            print("***** EVALUATION on test sample done")
            results_train = clf.decision_function(self.data['train']['X'])
            print("***** EVALUATION on training sample done")

            print("R:", results.shape, results)

            results = np.c_[np.ones(results.shape[0]), results]
            results_train = np.c_[np.ones(results_train.shape[0]), results_train]
        except:
            results = clf.predict_proba(self.data['test']['X'])
            results_train = clf.predict_proba(self.data['train']['X'])

        # ROC curve
        print("calculating auc...")
        auc1 = roc_auc_score(self.data['test']['y'], results[:,1], sample_weight=self.data['test']['sample_weight'])
        auc_training = roc_auc_score(self.data['train']['y'], results_train[:,1], sample_weight=self.data['train']['sample_weight'])
        print("AUC:", auc1, " (training:", auc_training, ")")

        print("**** compute quantiles")
        qx = np.array([0.01, 0.99])
        qy = np.array([0.0, 0.0])
        thq = ROOT.TH1D("quant","quant",500000,-5.0,5.0)
        nS = len(results)
        for i in range(nS):
            thq.Fill(results[i][1])
        thq.GetQuantiles(2, qy, qx)

        # rescaling of SCORE to [0, 1]
        minProb = 2.0
        maxProb = -1.0
        #for i in range(len(self.data['train']['X'])):
        #    if results_train[i][1] > maxProb:
        #        maxProb = results_train[i][1]
        #    if results_train[i][1] < minProb:
        #        minProb = results_train[i][1]
        #for i in range(len(self.data['test']['X'])):
        #    if results[i][1] > maxProb:
        #        maxProb = results[i][1]
        #    if results[i][1] < minProb:
        #        minProb = results[i][1]

        minProb = qy[0]
        maxProb = qy[1]
        delta = maxProb-minProb
        minProb -= delta * 0.01
        maxProb += delta * 0.10

        useSqrt = False

        # fill TRAINING SCORE histogram (class probability)
        h1t = ROOT.TH1D("h1t","h1t",50,0.0,1.0)
        h2t = ROOT.TH1D("h2t","h2t",50,0.0,1.0)
        for i in range(len(self.data['train']['X'])):
            result = (results_train[i][1]-minProb)/(maxProb-minProb)