Exemple #1
0
    def test_TreeCache1(self):
        """Write a skimmed cache of the sample tree and read it back.

        A first TreeCache is attached to the input tree and filled while the
        tree is processed. A second TreeCache, created with the same sample,
        cuts and folders, must then report the cache as present and valid and
        return a tree whose entry count equals the number of input events
        selected by the cut.
        """

        def cacheSettings(**extra):
            # fresh keyword arguments for every TreeCache instance, so that no
            # objects (e.g. the cut list) are shared between the two caches
            settings = {
                'sample': self.sampleName,
                'cutList': [self.someCut],
                'inputFolder': self.scratchDirectory,
                'tmpFolder': self.tmpDir,
                'outputFolder': self.cacheDir,
                'debug': False,
            }
            settings.update(extra)
            return settings

        # input tree which is going to be skimmed
        inputTree = self.getTree()

        # writing side: register the cache on the input tree and run over it
        writer = TreeCache.TreeCache(
            **cacheSettings(branches=['a', 'b', 'c']))
        writer.setSampleTree(inputTree).cache()
        inputTree.process()

        # reading side: same sample/cuts/folders, no branch list given
        reader = TreeCache.TreeCache(**cacheSettings())
        self.assertTrue(reader.isCached())
        self.assertTrue(reader.isCachedAndValid())

        cachedTree = reader.getTree()

        # the cached tree must contain exactly the events passing the cut
        nExpected = inputTree.tree.Draw("a", self.someCut, "goff")
        self.assertEqual(cachedTree.tree.GetEntries(), nExpected)
Exemple #2
0
    def prepare(self):
        """Collect the cached file names of all data and MC samples.

        For every sample the cut list is built from the sample's subcut, the
        region cut from the 'Cuts' config section (if defined), the 'Datacut'
        option of ``self.configSection`` (if defined) and the blinding cut (if
        set). The file names found by the matching TreeCache are appended to
        ``self.fileNames``.

        Returns:
            self, to allow call chaining.

        Raises:
            Exception: "NotCached" if a sample is not cached for these cuts.
        """
        # add DATA + MC samples
        self.fileNames = []
        for sample in self.dataSamples + self.mcSamples:
            print(sample.identifier)

            # cuts
            sampleCuts = [sample.subcut]
            if self.config.has_option('Cuts', self.region):
                sampleCuts.append(self.config.get('Cuts', self.region))
            if self.config.has_option(self.configSection, 'Datacut'):
                sampleCuts.append(
                    self.config.get(self.configSection, 'Datacut'))
            if self.addBlindingCut:
                sampleCuts.append(self.addBlindingCut)

            # get sample tree from cache; use the config object of this
            # instance (the same one the cuts above are read from) instead of
            # relying on a global name 'config' being defined
            tc = TreeCache.TreeCache(sample=sample,
                                     cutList=sampleCuts,
                                     inputFolder=self.samplesPath,
                                     config=self.config)
            if not tc.isCached():
                print("ERROR: not cached, run cacheplot again")
                raise Exception("NotCached")
            self.fileNames += tc.findCachedFileNames()

        # an empty file list is only reported, not treated as fatal
        if not self.fileNames:
            print("\x1b[31mERROR: no files found, run cacheplot!\x1b[0m")
        return self
Exemple #3
0
    def prepare(self):
        """Create one histogram stack per variable and fill in all samples.

        A StackMaker is created for every variable in ``self.vars``. Then, for
        each data and MC sample, the cached tree matching the sample's cut list
        (subcut, region cut, optional 'Datacut' and optional blinding cut) is
        loaded and added to the stack of every variable.

        Returns:
            self, to allow call chaining.

        Raises:
            Exception: "CachedTreeMissing" if no cached tree is available for
                a sample.
        """
        # use the region of this instance instead of relying on a global name
        # 'region' being defined
        print(
            "INFO: starting plot for region \x1b[34m{region}\x1b[0m, variables:"
            .format(region=self.region))
        for var in self.vars:
            print("  > {var}".format(var=var))

        self.histogramStacks = {}
        for var in self.vars:
            self.histogramStacks[var] = StackMaker(self.config,
                                                   var,
                                                   self.region,
                                                   self.signalRegion,
                                                   None,
                                                   '_' + self.subcutPlotName,
                                                   title=self.title)

        # one locator shared by all TreeCache objects created below
        fileLocator = FileLocator(config=self.config,
                                  useDirectoryListingCache=True)

        # cut applied when adding the tree to the stacks, '1' if no subcut set
        stackCut = self.subcut if self.subcut else '1'

        # add DATA + MC samples
        for sample in self.dataSamples + self.mcSamples:

            # cuts
            sampleCuts = [sample.subcut]
            if self.config.has_option('Cuts', self.region):
                sampleCuts.append(self.config.get('Cuts', self.region))
            if self.config.has_option(self.configSection, 'Datacut'):
                sampleCuts.append(
                    self.config.get(self.configSection, 'Datacut'))
            if self.addBlindingCut:
                sampleCuts.append(self.addBlindingCut)

            # get sample tree from cache; the config object of this instance
            # is passed instead of an undefined global 'config'
            tc = TreeCache.TreeCache(sample=sample,
                                     cutList=sampleCuts,
                                     inputFolder=self.samplesPath,
                                     config=self.config,
                                     fileLocator=fileLocator)
            sampleTree = tc.getTree()

            if not sampleTree:
                print("\x1b[31mERROR: sampleTree not available for ", sample,
                      ", run caching again!!\x1b[0m")
                raise Exception("CachedTreeMissing")

            groupName = self.getSampleGroup(sample)
            print(" > found the tree, #entries = ",
                  sampleTree.tree.GetEntries())
            print("   > group =", groupName)
            print(" > now adding the tree for vars=", self.vars)

            # add the sample tree for all the variables
            for var in self.vars:
                self.histogramStacks[var].addSampleTree(
                    sample=sample,
                    sampleTree=sampleTree,
                    groupName=groupName,
                    cut=stackCut)
        return self
Exemple #4
0
    def test_NotCached(self):
        """Check that isCached() is false for a sample name with no cache."""
        # same folders and cut as the other cache tests, but a sample name
        # for which nothing is expected to have been cached
        lookupSettings = {
            'sample': 'ThisSampleDoesNotExistInCache',
            'cutList': [self.someCut],
            'inputFolder': self.scratchDirectory,
            'tmpFolder': self.tmpDir,
            'outputFolder': self.cacheDir,
            'debug': False,
        }
        missingCache = TreeCache.TreeCache(**lookupSettings)
        self.assertFalse(missingCache.isCached())
Exemple #5
0
    def run(self):
        """Create the plot caches for all regions of ``self.sampleIdentifier``.

        First the list of branches to keep is assembled from the 'Branches'
        config section, the 'relPath' option of every 'plotDef:*' section, the
        'weightF' weight expression and the region cuts. Then, for every
        (sub)sample sharing the sample identifier and every region in
        ``self.regionsDict``, a TreeCache is set up. Parts which are not cached
        yet (or all parts if ``self.forceRedo`` is set) are attached to one
        common SampleTree, which is processed once at the end.

        Raises:
            Exception: "CreationOfSampleTreeFailed" if the input tree could not
                be created, "SampleFilesHaveChanged" if ``self.fileList`` is
                given and differs from the files found for this chunk.
        """

        # keep additional branches for plotting
        # NOTE(review): eval() executes whatever expression is stored in the
        # config file; only use config files from trusted sources.
        try:
            keepBranchesPlot = eval(
                self.config.get('Branches', 'keep_branches_plot'))
        except Exception:
            # option missing or not evaluable: start with an empty list
            keepBranchesPlot = []
        try:
            keepBranchesPlot += eval(
                self.config.get('Branches', 'keep_branches'))
        except Exception:
            # option is optional
            pass

        # also keep some branches which might be used later in variables definition and weights
        try:
            for section in self.config.sections():
                try:
                    if section.startswith(
                            'plotDef:') and self.config.has_option(
                                section, 'relPath'):
                        keepBranchesPlot.append(
                            self.config.get(section, 'relPath'))
                except Exception as e:
                    print("\x1b[31mWARNING: config error in:", section, "=>",
                          e, "\x1b[0m")
        except Exception as e2:
            print(
                "\x1b[31mERROR: config file contains an error! automatic selection of branches to keep will not work!\x1b[0m"
            )
            print(e2)
        try:
            keepBranchesPlot.append(self.config.get('Weights', 'weightF'))
        except Exception:
            # weight expression is optional here
            pass
        # plotting region cut
        for region, regionInfo in self.regionsDict.items():
            keepBranchesPlot.append(regionInfo['cut'])
        keepBranchesPlotFinal = BranchList(
            keepBranchesPlot).getListOfBranches()
        print("KEEP:", keepBranchesPlotFinal)

        # ----------------------------------------------------------------------------------------------------------------------
        # cache samples
        # ----------------------------------------------------------------------------------------------------------------------
        for sampleToCache in [self.sampleIdentifier]:
            print('*' * 80)
            print(' ', sampleToCache)
            print('*' * 80)
            # prepare caches for training and evaluation samples
            treeCaches = []
            # reset the attribute which is actually tested and filled below
            # (a local variable was reset here before, leaving the attribute
            # untouched)
            self.sampleTree = None

            # for all (sub)samples which come from the same files (sampleIdentifier)
            subsamples = [
                x for x in self.samples if x.identifier == sampleToCache
            ]
            for sample in subsamples:

                # add cuts for all training regions
                for region, regionInfo in self.regionsDict.items():

                    configSection = 'Plot:%s' % region

                    # cuts
                    sampleCuts = [sample.subcut]
                    if regionInfo['cut']:
                        sampleCuts.append(regionInfo['cut'])
                    if self.config.has_option(configSection, 'Datacut'):
                        sampleCuts.append(
                            self.config.get(configSection, 'Datacut'))
                    if self.config.has_option('Plot_general',
                                              'addBlindingCut'):
                        # append the cut expression itself, not the boolean
                        # result of has_option()
                        sampleCuts.append(
                            self.config.get('Plot_general',
                                            'addBlindingCut'))

                    # arbitrary (optional) name for the output tree, used for print-out (the TreeCache object has no idea what it is doing, e.g. dc, plot etc.)
                    cacheName = 'plot:{region}_{sample}'.format(
                        region=region, sample=sample.name)

                    # add cache object
                    tc = TreeCache.TreeCache(
                        name=cacheName,
                        sample=sample.name,
                        cutList=sampleCuts,
                        inputFolder=self.samplesPath,
                        splitFilesChunks=self.splitFilesChunks,
                        chunkNumber=self.chunkNumber,
                        splitFilesChunkSize=self.splitFilesChunkSize,
                        fileList=self.fileList,
                        branches=keepBranchesPlotFinal,
                        config=self.config,
                        debug=True)

                    # check if this part of the sample is already cached
                    isCached = tc.partIsCached()
                    if not isCached or self.forceRedo:
                        if isCached:
                            # forced redo: remove the existing part first
                            tc.deleteCachedFiles(chunkNumber=self.chunkNumber)

                        # for the first sample which comes from this files, load the tree
                        if not self.sampleTree:
                            self.sampleTree = SampleTree(
                                {
                                    'name': sample.identifier,
                                    'folder': self.samplesPath
                                },
                                splitFilesChunkSize=self.splitFilesChunkSize,
                                chunkNumber=self.chunkNumber,
                                config=self.config,
                                saveMemory=True)
                            if not self.sampleTree or not self.sampleTree.tree:
                                print(
                                    "\x1b[31mERROR: creation of sample tree failed!!\x1b[0m"
                                )
                                raise Exception("CreationOfSampleTreeFailed")
                            # consistency check on the file list at submission time and now
                            fileListNow = self.sampleTree.getSampleFileNameChunk(
                                self.chunkNumber)
                            if self.fileList and (sorted(self.fileList) !=
                                                  sorted(fileListNow)):
                                print(
                                    "\x1b[31mERROR: sample files have changed between submission and run of the job!\x1b[0m"
                                )
                                raise Exception("SampleFilesHaveChanged")

                        treeCaches.append(
                            tc.setSampleTree(self.sampleTree).cache())
                    else:
                        print("INFO: already cached!", tc, "(", tc.hash, ")")

            if treeCaches:
                # run on the tree
                self.sampleTree.process()
            else:
                print("nothing to do!")
Exemple #6
0
    def prepare(self):
        """Set up the TMVA factory and register all training/testing trees.

        Opens the training output file, creates the TMVA factory and adds, for
        every background and signal sample, the cached trees selected with the
        training cut and with the evaluation cut. If the factory does not
        offer AddSignalTree/AddBackgroundTree, a TMVA DataLoader is used
        instead. Finally the nominal MVA input variables are declared.

        Returns:
            self, to allow call chaining.

        Raises:
            Exception: "CachedTreeMissing" if a cached tree is not available.
        """

        self.trainingOutputFile = ROOT.TFile.Open(self.trainingOutputFileName,
                                                  "RECREATE")
        # ----------------------------------------------------------------------------------------------------------------------
        # create TMVA factory
        # ----------------------------------------------------------------------------------------------------------------------
        self.factory = ROOT.TMVA.Factory(self.factoryname,
                                         self.trainingOutputFile,
                                         self.factorysettings)
        if self.trainingOutputFile and self.factory:
            print("INFO: initialized MvaTrainingHelper.", self.factory)
        else:
            print(
                "\x1b[31mERROR: initialization of MvaTrainingHelper failed!\x1b[0m"
            )

        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/eval trees
        # ----------------------------------------------------------------------------------------------------------------------
        try:
            addBackgroundTreeMethod = self.factory.AddBackgroundTree
            addSignalTreeMethod = self.factory.AddSignalTree
            self.dataLoader = None
        except Exception:
            # factory has no Add*Tree methods: fall back to a DataLoader
            print("oh no..")
            # the DataLoader wants to be called '.'
            self.dataLoader = ROOT.TMVA.DataLoader(".")
            addBackgroundTreeMethod = self.dataLoader.AddBackgroundTree
            addSignalTreeMethod = self.dataLoader.AddSignalTree

        # DEBUG: restrict memory
        # resource.setrlimit(resource.RLIMIT_AS, (4.0*1024*1024*1024, 5.0*1024*1024*1024))

        self.sampleTrees = []
        for addTreeFcn, samples in [[
                addBackgroundTreeMethod, self.samples['BKG']
        ], [addSignalTreeMethod, self.samples['SIG']]]:
            for sample in samples:
                print('*' * 80, '\n%s\n' % sample, '*' * 80)
                for additionalCut in [self.TrainCut, self.EvalCut]:
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)

                    tc = TreeCache.TreeCache(sample=sample,
                                             cutList=sampleCuts,
                                             inputFolder=self.samplesPath,
                                             config=self.config,
                                             debug=True)
                    sampleTree = tc.getTree()

                    # check for a missing tree BEFORE touching it, so that the
                    # intended error is raised instead of an AttributeError
                    if not sampleTree:
                        print("\x1b[31mERROR: TREE NOT FOUND:", sample.name,
                              " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")

                    sampleTree.tree.SetCacheSize(32 * 1024)

                    # prevent garbage collection
                    self.sampleTrees.append(sampleTree)

                    treeScale = sampleTree.getScale(
                        sample) * self.globalRescale

                    # only non-empty trees can be added
                    if sampleTree.tree.GetEntries() > 0:
                        addTreeFcn(
                            sampleTree.tree, treeScale,
                            ROOT.TMVA.Types.kTraining if additionalCut
                            == self.TrainCut else ROOT.TMVA.Types.kTesting)
                        print('max mem used = %d' % (resource.getrusage(
                            resource.RUSAGE_SELF).ru_maxrss))

        # the variables are declared on whichever object received the trees
        variableTarget = self.dataLoader if self.dataLoader else self.factory
        for var in self.MVA_Vars['Nominal']:
            variableTarget.AddVariable(var, 'D')

        return self
Exemple #7
0
    def prepare(self):
        """Set up the tree caches for all datacard regions of one sample.

        For every datacard maker and every (sub)sample whose identifier equals
        ``self.sampleToCache`` a TreeCache is created. Its cut is the sample's
        subcut AND the OR of all systematics cache cuts; its branch list
        covers the cuts, variables and weights of all systematics plus the
        weight expression and the configured branches to keep. Parts which
        are not cached yet (or all parts if ``self.forceRedo`` is set) are
        connected to one common SampleTree and collected in
        ``self.treeCaches``.

        Returns:
            self, to allow call chaining.

        Raises:
            Exception: "CreationOfSampleTreeFailed" if the input tree could not
                be created, "SampleFilesHaveChanged" if ``self.fileList`` is
                given and differs from the files found for this chunk.
        """
        if len(self.dcMakers) < 1:
            print("WARNING: no datacard regions added, nothing to do.")
            return self

        self.treeCaches = []
        self.sampleTree = None

        # all (sub)samples which come from the same root tree files
        candidates = self.getAllSamples()
        matchingSamples = [
            s for s in candidates if s.identifier == self.sampleToCache
        ]

        # loop over all datacard regions
        for dcMaker in self.dcMakers:
            for sample in matchingSamples:

                # systematics cuts are OR-ed, so that one cache file serves
                # all systematics; the result is AND-ed with the subcut
                sampleIsData = (sample.type == 'DATA')
                uniqueCacheCuts = {
                    syst['cachecut']
                    for syst in dcMaker.getSystematicsList(
                        isData=sampleIsData)
                }
                orCuts = sorted(uniqueCacheCuts)
                sampleCuts = {'AND': [sample.subcut, {'OR': orCuts}]}
                if self.verbose:
                    print(
                        json.dumps(sampleCuts,
                                   sort_keys=True,
                                   indent=8,
                                   default=str))

                # branches which have to be kept in the root file
                branchList = BranchList(sample.subcut)
                for key in ('cachecut', 'cut', 'var', 'weight'):
                    branchList.addCut(
                        [syst[key] for syst in dcMaker.getSystematicsList()])
                branchList.addCut(self.config.get('Weights', 'weightF'))
                # the option is expected to hold a Python expression
                branchList.addCut(
                    eval(self.config.get('Branches', 'keep_branches')))
                branchesToKeep = branchList.getListOfBranches()

                # optional name of the output tree, only used for print-out
                cacheName = 'dc:{region}_{sample}'.format(
                    region=dcMaker.getRegion(), sample=sample.name)

                # cache object for this region/sample combination
                tc = TreeCache.TreeCache(
                    name=cacheName,
                    sample=sample.name,
                    cutList=sampleCuts,
                    cutSequenceMode='TREE',
                    branches=branchesToKeep,
                    inputFolder=dcMaker.path,
                    splitFilesChunks=self.splitFilesChunks,
                    chunkNumber=self.chunkNumber,
                    splitFilesChunkSize=self.splitFilesChunkSize,
                    fileList=self.fileList,
                    config=self.config,
                    debug=self.verbose)

                # is this part of the sample already cached?
                isCached = tc.partIsCached()
                print(
                    "check if sample \x1b[34m{sample}\x1b[0m part {part} is cached:"
                    .format(sample=sample.name,
                            part=self.chunkNumber), isCached)
                if isCached and not self.forceRedo:
                    continue

                # forced redo of an existing part: remove the old files
                if isCached:
                    tc.deleteCachedFiles(chunkNumber=self.chunkNumber)

                # the input tree is loaded once, for the first sample needing it
                if not self.sampleTree:
                    treeSource = {
                        'name': sample.identifier,
                        'folder': dcMaker.path
                    }
                    self.sampleTree = SampleTree(
                        treeSource,
                        splitFilesChunkSize=self.splitFilesChunkSize,
                        chunkNumber=self.chunkNumber,
                        config=self.config,
                        saveMemory=True)
                    if not self.sampleTree or not self.sampleTree.tree:
                        print(
                            "\x1b[31mERROR: creation of sample tree failed!!\x1b[0m"
                        )
                        raise Exception("CreationOfSampleTreeFailed")

                    # the files must be the same as at submission time
                    fileListNow = self.sampleTree.getSampleFileNameChunk(
                        self.chunkNumber)
                    if self.fileList and (sorted(self.fileList) !=
                                          sorted(fileListNow)):
                        print(
                            "\x1b[31mERROR: sample files have changed between submission and run of the job!\x1b[0m"
                        )
                        raise Exception("SampleFilesHaveChanged")

                # connect the cache to the input tree and remember it
                connectedCache = tc.setSampleTree(self.sampleTree).cache()
                self.treeCaches.append(connectedCache)
        return self
Exemple #8
0
    def run(self):
        """Create the MVA training/evaluation caches for one sample identifier.

        The branches needed for the MVA input variables of all training
        regions and for the weight expression are collected first. Then a
        TreeCache is created for every (sub)sample, training region and
        train/eval cut. Parts which are not cached yet (or all parts if
        ``self.force`` is set) are attached to one common SampleTree, which is
        processed once at the end.
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # cache samples
        # ----------------------------------------------------------------------------------------------------------------------
        for sampleToCache in [self.sampleIdentifier]:
            separator = '*' * 80
            print(separator)
            print(' ', sampleToCache)
            print(separator)

            # caches which still have to be filled
            treeCaches = []
            self.sampleTree = None

            # all (sub)samples reading from the same files (sampleIdentifier)
            subsamples = [
                s for s in self.samples if s.identifier == sampleToCache
            ]

            # branches used as MVA input variables, plus the weight expression
            branchListOfMVAVars = BranchList()
            for sample in subsamples:
                for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.iteritems(
                ):
                    for additionalCut in [self.TrainCut, self.EvalCut]:
                        regionVars = trainingRegionInfo['vars']
                        branchListOfMVAVars.addCut(regionVars)
            weightExpression = self.config.get('Weights', 'weightF')
            branchListOfMVAVars.addCut(weightExpression)
            mvaBranches = branchListOfMVAVars.getListOfBranches()

            # sample x training region x (train, eval)
            for sample in subsamples:
                for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.iteritems(
                ):
                    for additionalCut in [self.TrainCut, self.EvalCut]:

                        # cuts: subcut, train/eval cut, region cut
                        sampleCuts = [sample.subcut]
                        if additionalCut:
                            sampleCuts.append(additionalCut)
                        if trainingRegionInfo['cut']:
                            sampleCuts.append(trainingRegionInfo['cut'])

                        # label of the dataset part, used in the cache name
                        if additionalCut == self.TrainCut:
                            datasetTag = 'TRAIN'
                        else:
                            datasetTag = 'EVAL'
                        cacheName = '{region}_{sample}_{tr}'.format(
                            region=trainingRegion,
                            sample=sample.name,
                            tr=datasetTag)

                        # cache object for this combination
                        tc = TreeCache.TreeCache(
                            name=cacheName,
                            sample=sample.name,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            splitFilesChunks=self.splitFilesChunks,
                            chunkNumber=self.chunkNumber,
                            splitFilesChunkSize=self.splitFilesChunkSize,
                            branches=mvaBranches,
                            config=self.config,
                            debug=True)

                        # skip parts which are cached, unless a redo is forced
                        isCached = tc.partIsCached()
                        if isCached and not self.force:
                            continue
                        if isCached:
                            tc.deleteCachedFiles(
                                chunkNumber=self.chunkNumber)

                        # the input tree is loaded once, for the first sample needing it
                        if not self.sampleTree:
                            treeSource = {
                                'name': sample.identifier,
                                'folder': self.samplesPath
                            }
                            self.sampleTree = SampleTree(
                                treeSource,
                                splitFilesChunkSize=self.splitFilesChunkSize,
                                chunkNumber=self.chunkNumber,
                                config=self.config,
                                saveMemory=True)
                        connectedCache = tc.setSampleTree(
                            self.sampleTree).cache()
                        treeCaches.append(connectedCache)

            if len(treeCaches) > 0:
                # run on the tree
                self.sampleTree.process()
            else:
                print("nothing to do!")
Exemple #9
0
    def prepare(self):
        """Build the train/test numpy datasets from the cached trees.

        If the arrays have already been written to the cache directory they
        are loaded from there and nothing else is done. Otherwise, for every
        background and signal sample and for the training and the evaluation
        cut, the cached tree is read event by event: the nominal MVA input
        variables fill a float32 array with one row per event and one column
        per feature, the event weight is weightF times the sample scale, and
        the target is the index of the category in ['BKG', 'SIG']. The result
        is stored in ``self.data`` and written to disk.

        Returns:
            self, to allow call chaining.

        Raises:
            Exception: "CachedTreeMissing" if a cached tree is not available.
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/testing trees
        # ----------------------------------------------------------------------------------------------------------------------
        self.sampleTrees = []
        categories = ['BKG', 'SIG']
        datasetParts = {'train': self.trainCut, 'test': self.evalCut}

        cachedFilesPath = self.getCachedNumpyArrayPath()
        try:
            os.makedirs(cachedFilesPath)
        except OSError as e:
            # an already existing directory is the expected case; any other
            # failure is reported instead of being silently ignored
            if not os.path.isdir(cachedFilesPath):
                print("\x1b[31mWARNING: could not create directory",
                      cachedFilesPath, "=>", e, "\x1b[0m")

        # load numpy arrays from disk if they have been already created
        if self.loadCachedNumpyArrays(cachedFilesPath):
            return self

        # per dataset part: list of feature arrays (one per sample) and flat
        # per-event lists of weights and targets
        arrayLists = {datasetName: [] for datasetName in datasetParts}
        weightLists = {datasetName: [] for datasetName in datasetParts}
        targetLists = {datasetName: [] for datasetName in datasetParts}

        # standard weight expression
        weightF = self.config.get('Weights', 'weightF')

        # the position of the category in the list is used as target value
        for target, category in enumerate(categories):
            for sample in self.samples[category]:
                print ('*'*80,'\n%s\n'%sample,'*'*80)
                for datasetName, additionalCut in datasetParts.items():
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)

                    # get ROOT tree for selected sample & region cut
                    tc = TreeCache.TreeCache(
                            sample=sample,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            config=self.config,
                            debug=True
                        )
                    sampleTree = tc.getTree()
                    if not sampleTree:
                        print ("\x1b[31mERROR: TREE NOT FOUND:", sample.name, " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")

                    treeScale = sampleTree.getScale(sample) * self.globalRescale
                    print ('scale:', treeScale)

                    # initialize numpy array
                    nSamples = sampleTree.GetEntries()
                    features = self.MVA_Vars['Nominal']
                    nFeatures = len(features)
                    print('nFeatures:', nFeatures)
                    inputData = np.zeros((nSamples, nFeatures), dtype=np.float32)

                    # initialize formulas for ROOT tree
                    for feature in features:
                        sampleTree.addFormula(feature)
                    sampleTree.addFormula(weightF)

                    # fill numpy array from ROOT tree
                    for i, event in enumerate(sampleTree):
                        for j, feature in enumerate(features):
                            inputData[i, j] = sampleTree.evaluate(feature)
                        # total weight comes from weightF (btag, lepton sf, ...) and treeScale to scale MC to x-section
                        totalWeight = treeScale * sampleTree.evaluate(weightF)
                        weightLists[datasetName].append(totalWeight)
                        targetLists[datasetName].append(target)

                    arrayLists[datasetName].append(inputData)

        # concatenate all data from different samples
        self.data = {
                'train': {
                    'X': np.concatenate(arrayLists['train'], axis=0),
                    'y': np.array(targetLists['train'], dtype=np.float32),
                    'sample_weight': np.array(weightLists['train'], dtype=np.float32),
                    },
                'test': {
                    'X': np.concatenate(arrayLists['test'], axis=0),
                    'y': np.array(targetLists['test'], dtype=np.float32),
                    'sample_weight': np.array(weightLists['test'], dtype=np.float32),
                    },
                }

        # write numpy arrays to disk
        self.writeNumpyArrays(cachedFilesPath)

        return self
Exemple #10
0
    def run(self):
        """Convert the cached ROOT trees into numpy arrays and dump them to disk.

        For every category in ``self.samples`` and every sample of that
        category, the cached tree is requested from ``TreeCache`` once per
        dataset part ('train' with ``self.trainCut``, 'test' with
        ``self.evalCut``).  For each event the nominal features
        (``self.MVA_Vars['Nominal']``), the features of every systematic in
        ``self.systematics``, the nominal weight and the weights listed in
        ``self.weightSYS`` are evaluated and collected.

        The result is stored in ``self.data`` and written as a gzip-compressed
        pickle to ``./<self.mvaName>.dmpz``.

        Raises:
            Exception: ``"CachedTreeMissing"`` if ``TreeCache`` yields no tree
                for one of the samples.
        """
        # ----------------------------------------------------------------------------------------------------------------------
        # add sig/bkg x training/testing trees
        # ----------------------------------------------------------------------------------------------------------------------
        # materialize the keys: a Python 3 dict view has no .index() and the
        # same ordering is reused below for the 'category_labels' mapping
        categories = list(self.samples.keys())
        datasetParts = {'train': self.trainCut, 'test': self.evalCut}
        systematics = self.systematics

        # per dataset part accumulators; iterating the dict directly works on
        # both Python 2 and Python 3 (iterkeys()/iteritems() are Python 2 only)
        arrayLists = {datasetName: [] for datasetName in datasetParts}
        arrayLists_sys = {
            x: {datasetName: [] for datasetName in datasetParts}
            for x in systematics
        }
        weightLists = {datasetName: [] for datasetName in datasetParts}
        targetLists = {datasetName: [] for datasetName in datasetParts}
        weightListsSYS = {
            x: {datasetName: [] for datasetName in datasetParts}
            for x in self.weightSYS
        }

        # standard weight expression
        weightF = self.config.get('Weights', 'weightF')

        # feature lists do not depend on the sample, look them up only once
        features = self.MVA_Vars['Nominal']
        features_sys = {x: self.MVA_Vars[x] for x in systematics}
        nFeatures = len(features)

        # the enumerate index is the class label (same value the per-event
        # categories.index(category) lookup would give, keys being unique)
        for categoryIndex, category in enumerate(categories):
            for sample in self.samples[category]:
                print('*' * 80, '\n%s\n' % sample, '*' * 80)
                for datasetName, additionalCut in datasetParts.items():
                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    # cut from the mva region
                    if self.treeCut:
                        sampleCuts.append(self.treeCut)

                    # get ROOT tree for selected sample & region cut
                    tc = TreeCache.TreeCache(sample=sample,
                                             cutList=sampleCuts,
                                             inputFolder=self.samplesPath,
                                             config=self.config,
                                             debug=True)
                    sampleTree = tc.getTree()
                    if not sampleTree:
                        print("\x1b[31mERROR: TREE NOT FOUND:", sample.name,
                              " -> not cached??\x1b[0m")
                        raise Exception("CachedTreeMissing")

                    treeScale = sampleTree.getScale(
                        sample) * self.globalRescale
                    print('scale:', treeScale)

                    # initialize numpy arrays
                    nSamples = sampleTree.GetEntries()
                    print('nFeatures:', nFeatures)
                    inputData = np.zeros((nSamples, nFeatures),
                                         dtype=np.float32)
                    # NOTE(review): systematic arrays are sized with the
                    # nominal feature count; this presumably relies on every
                    # self.MVA_Vars[x] having the same length -- confirm
                    inputData_sys = {
                        x: np.zeros((nSamples, nFeatures), dtype=np.float32)
                        for x in systematics
                    }

                    # initialize formulas for ROOT tree
                    for feature in features:
                        sampleTree.addFormula(feature)
                    for features_s in features_sys.values():
                        for feature in features_s:
                            sampleTree.addFormula(feature)
                    sampleTree.addFormula(weightF)
                    for syst in self.weightSYS:
                        sampleTree.addFormula(self.weightSYSweights[syst])

                    # fill numpy array from ROOT tree
                    for i, _event in enumerate(sampleTree):
                        for j, feature in enumerate(features):
                            inputData[i, j] = sampleTree.evaluate(feature)
                        # total weight comes from weightF (btag, lepton sf, ...) and treeScale to scale MC to x-section
                        totalWeight = treeScale * sampleTree.evaluate(weightF)
                        weightLists[datasetName].append(totalWeight)
                        targetLists[datasetName].append(categoryIndex)

                        # add weights varied by (btag) systematics
                        for syst in self.weightSYS:
                            weightListsSYS[syst][datasetName].append(
                                treeScale * sampleTree.evaluate(
                                    self.weightSYSweights[syst]))

                        # fill systematics
                        for k, feature_s in features_sys.items():
                            for j, feature in enumerate(feature_s):
                                inputData_sys[k][i, j] = sampleTree.evaluate(
                                    feature)

                    arrayLists[datasetName].append(inputData)
                    for sys in systematics:
                        arrayLists_sys[sys][datasetName].append(
                            inputData_sys[sys])

        # concatenate all data from different samples
        self.data = {
            'train': {
                'X': np.concatenate(arrayLists['train'], axis=0),
                'y': np.array(targetLists['train'], dtype=np.float32),
                'sample_weight': np.array(weightLists['train'],
                                          dtype=np.float32),
            },
            'test': {
                'X': np.concatenate(arrayLists['test'], axis=0),
                'y': np.array(targetLists['test'], dtype=np.float32),
                'sample_weight': np.array(weightLists['test'],
                                          dtype=np.float32),
            },
            'category_labels':
            {idx: label
             for idx, label in enumerate(categories)},
            'meta': {
                'version': self.dataFormatVersion,
                'region': self.mvaName,
                'cutName': self.treeCutName,
                'cut': self.treeCut,
                'trainCut': self.trainCut,
                'testCut': self.evalCut,
                'samples': self.sampleNames,
                'weightF': weightF,
                'weightSYS': self.weightSYS,
                'variables': ' '.join(self.MVA_Vars['Nominal'])
            }
        }
        # add systematics variations (only the 'train' part is stored)
        for sys in systematics:
            self.data['train']['X_' + sys] = np.concatenate(
                arrayLists_sys[sys]['train'], axis=0)
        for syst in self.weightSYS:
            self.data['train']['sample_weight_' + syst] = np.array(
                weightListsSYS[syst]['train'], dtype=np.float32)

        # write everything as one gzip-compressed pickle
        numpyOutputFileName = './' + self.mvaName + '.dmpz'
        with gzip.open(numpyOutputFileName, 'wb') as outputFile:
            pickle.dump(self.data, outputFile)
        print(self.data['meta'])
        print("written to:\x1b[34m", numpyOutputFileName, " \x1b[0m")