Example 1
    def getentries(self, selection, **kwargs):
        """Get number of events for a given selection string."""
        verbosity = LOG.getverbosity(kwargs)
        norm = kwargs.get('norm', True)  # normalize to cross section
        norm = self.norm if norm else 1.
        parallel = kwargs.get('parallel', False)
        kwargs['cuts'] = joincuts(kwargs.get('cuts'), self.cuts)
        kwargs['weight'] = joinweights(kwargs.get('weight', ""),
                                       self.weight)  # pass weight down
        kwargs['scale'] = kwargs.get('scale', 1.0) * self.scale * norm  # pass scale down
        kwargs['parallel'] = False

        # GET NUMBER OF EVENTS
        nevents = 0
        if parallel and len(self.samples) > 1:
            processor = MultiProcessor()
            for sample in self.samples:
                processor.start(sample.getentries, (selection, ), kwargs)
            for process in processor:
                nevents += process.join()
        else:
            for sample in self.samples:
                nevents += sample.getentries(selection, **kwargs)

        # PRINT
        if verbosity >= 3:
            print ">>>\n>>> MergedSample.getentries: %s" % (color(
                self.name, color="grey"))
            print ">>>   entries: %d" % (nevents)
            print ">>>   scale: %.6g (scale=%.6g, norm=%.6g)" % (
                scale, self.scale, self.norm)
            print ">>>   %r" % (cuts)

        return nevents
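
In the parallel branch above, each sub-sample's getentries call is started on a MultiProcessor and the joined results are summed. A minimal, self-contained sketch of that fan-out-and-sum pattern, using the standard multiprocessing.Pool as a stand-in for TauFW's MultiProcessor (count_entries and the toy sub-sample lists are purely illustrative):

from multiprocessing import Pool

def count_entries(events):
    """Illustrative stand-in for sample.getentries(selection, **kwargs)."""
    return len(events)

if __name__ == '__main__':
    subsamples = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]  # toy "sub-samples"
    pool = Pool(processes=2)
    results = [pool.apply_async(count_entries, (s, )) for s in subsamples]  # start
    nevents = sum(r.get() for r in results)  # join and sum, as in the loop above
    pool.close()
    pool.join()
    print("total entries: %d" % nevents)  # 9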
Example 2
def iterevts(fnames,
             tree,
             filenevts,
             refresh=False,
             nchunks=None,
             ncores=0,
             verb=0):
    """Help function for Sample._getnevents to iterate over file names and get number of events processed."""
    if ncores >= 2 and len(fnames) > 5:  # run events check in parallel
        from TauFW.Plotter.plot.MultiThread import MultiProcessor
        from TauFW.common.tools.math import partition

        def loopevts(fnames_):
            """Help function for parallel running on subsets."""
            return [(getnevents(f, tree), f) for f in fnames_]

        processor = MultiProcessor(max=ncores)
        if not nchunks:
            if len(fnames) < 100:
                nchunks = 10
            elif len(fnames) < 500:
                nchunks = 20
            elif len(fnames) < 1000:
                nchunks = 50
            else:
                nchunks = 100
            nchunks = max(nchunks, 2 * ncores)
        if nchunks >= len(fnames):
            nchunks = len(fnames) - 1
        if verb >= 2:
            print ">>> iterevts: partitioning %d files into %d chunks for ncores=%d" % (
                len(fnames), nchunks, ncores)
        for i, subset in enumerate(partition(
                fnames, nchunks)):  # process in ncores chunks
            for fname in subset[:]:  # check cache
                if not refresh and fname in filenevts:
                    nevts = filenevts[fname]
                    subset.remove(fname)  # don't run again
                    yield nevts, fname
            if not subset:  # whole chunk was already cached
                continue
            name = "iterevts_%d" % (i)
            processor.start(loopevts, subset, name=name)
        for process in processor:  # collect output from parallel processes
            if verb >= 2:
                print ">>> iterevts: joining process %r..." % (process.name)
            nevtfiles = process.join()
            for nevts, fname in nevtfiles:
                yield nevts, fname
    else:  # run events check in series
        for fname in fnames:
            if refresh or fname not in filenevts:
                nevts = getnevents(fname, tree)
            else:  # get from cache for efficiency
                nevts = filenevts[fname]
            yield nevts, fname
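
The chunk-count heuristic above grows with the number of files, is floored at two chunks per core, and is capped below the file count. A small standalone sketch of just that heuristic (the file counts are arbitrary):

def pick_nchunks(nfiles, ncores):
    """Same heuristic as iterevts: more files -> more chunks, at least 2 per core."""
    nchunks = 10 if nfiles < 100 else 20 if nfiles < 500 else 50 if nfiles < 1000 else 100
    nchunks = max(nchunks, 2 * ncores)
    if nchunks >= nfiles:  # never request more chunks than files
        nchunks = nfiles - 1
    return nchunks

for nfiles in (20, 120, 600, 2000):
    print("%4d files on 8 cores -> %3d chunks" % (nfiles, pick_nchunks(nfiles, 8)))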
Example 3
def itervalid(fnames,
              checkevts=True,
              nchunks=None,
              ncores=4,
              verb=0,
              **kwargs):
    """Iterate over file names and get number of events processed & check for corruption."""
    if not checkevts:  # just skip validation step and return 0
        for fname in fnames:
            yield 0, fname
    elif ncores >= 2 and len(fnames) > 5:  # run validation in parallel
        from TauFW.Plotter.plot.MultiThread import MultiProcessor
        from TauFW.common.tools.math import partition
        processor = MultiProcessor(max=ncores)

        def loopvalid(fnames_, **kwargs):
            """Help function for parallel running on subsets."""
            return [(isvalid(f, **kwargs), f) for f in fnames_]

        if not nchunks:
            if len(fnames) < 100:
                nchunks = 10
            elif len(fnames) < 500:
                nchunks = 20
            elif len(fnames) < 1000:
                nchunks = 50
            else:
                nchunks = 100
            nchunks = max(nchunks, 2 * ncores)
        if nchunks >= len(fnames):
            nchunks = len(fnames) - 1
        if verb >= 2:
            print ">>> itervalid: partitioning %d files into %d chunks for ncores=%d" % (
                len(fnames), nchunks, ncores)
        for i, subset in enumerate(partition(
                fnames, nchunks)):  # process in ncores chunks
            if not subset: break
            name = "itervalid_%d" % (i)
            processor.start(loopvalid, subset, kwargs, name=name)
        for process in processor:
            if verb >= 2:
                print ">>> joining process %r..." % (process.name)
            nevtfiles = process.join()
            for nevts, fname in nevtfiles:
                yield nevts, fname
    else:  # run validation in series
        for fname in fnames:
            if verb >= 2:
                print ">>>   Validating job output '%s'..." % (fname)
            nevts = isvalid(fname, **kwargs)  # pass options down, as in the parallel branch
            yield nevts, fname
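
Both itervalid and iterevts are generators yielding (nevts, fname) pairs as they become available, so a caller can total events and fill a cache on the fly. A hedged consumption sketch with a dummy generator standing in for them (file names and counts are made up):

def dummy_itervalid(fnames):
    """Dummy stand-in yielding (nevts, fname) pairs like itervalid/iterevts."""
    for i, fname in enumerate(fnames):
        yield 100 * (i + 1), fname  # fake event counts

filenevts = {}  # cache: file name -> number of events (cf. iterevts' filenevts)
total = 0
for nevts, fname in dummy_itervalid(["f1.root", "f2.root", "f3.root"]):
    filenevts[fname] = nevts  # remember, so a later call with refresh=False can skip it
    total += nevts
print("total events: %d" % total)  # 600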
Example 4
    def gethist(self, *args, **kwargs):
        """Create and fill histgram for multiple samples. Overrides Sample.gethist."""
        variables, selection, issingle = unwrap_gethist_args(*args)
        verbosity = LOG.getverbosity(kwargs)
        name = kwargs.get('name', self.name)
        name += kwargs.get('tag', "")
        title = kwargs.get('title', self.title)
        parallel = kwargs.get('parallel', False)
        kwargs['cuts'] = joincuts(kwargs.get('cuts'), self.cuts)
        kwargs['weight'] = joinweights(kwargs.get('weight', ""),
                                       self.weight)  # pass weight down
        kwargs['scale'] = kwargs.get(
            'scale', 1.0) * self.scale * self.norm  # pass scale down

        # HISTOGRAMS
        allhists = []
        garbage = []
        hargs = (variables, selection)
        hkwargs = kwargs.copy()
        if parallel and len(self.samples) > 1:
            hkwargs['parallel'] = False
            processor = MultiProcessor()
            for sample in self.samples:
                processor.start(sample.gethist,
                                hargs,
                                hkwargs,
                                name=sample.title)
            for process in processor:
                allhists.append(process.join())
        else:
            for sample in self.samples:
                if 'name' in kwargs:  # prevent memory leaks
                    hkwargs['name'] = makehistname(kwargs.get('name', ""),
                                                   sample.name)
                allhists.append(sample.gethist(*hargs, **hkwargs))

        # SUM
        sumhists = []
        if any(len(subhists) < len(variables) for subhists in allhists):
            LOG.error("MergedSample.gethist: len(subhists) = %s < %s = len(variables)" % (
                min(len(subhists) for subhists in allhists), len(variables)))
        for ivar, variable in enumerate(variables):
            subhists = [subhists[ivar] for subhists in allhists]
            sumhist = None
            for subhist in subhists:
                if sumhist is None:
                    sumhist = subhist.Clone(
                        makehistname(variable.filename, name))
                    sumhist.SetTitle(title)
                    sumhist.SetDirectory(0)
                    sumhist.SetLineColor(self.linecolor)
                    sumhist.SetFillColor(kWhite if self.isdata or self.issignal
                                         else self.fillcolor)
                    sumhist.SetMarkerColor(self.fillcolor)
                    sumhists.append(sumhist)
                else:
                    sumhist.Add(subhist)
            if verbosity >= 4:
                printhist(sumhist, pre=">>>   ")
            deletehist(subhists)

        # PRINT
        if verbosity >= 2:
            nentries, integral = -1, -1
            for sumhist in sumhists:
                if sumhist.GetEntries() > nentries:
                    nentries = sumhist.GetEntries()
                    integral = sumhist.Integral()
            print ">>>\n>>> MergedSample.gethist - %s" % (color(name,
                                                                color="grey"))
            print ">>>    entries: %d (%.2f integral)" % (nentries, integral)

        if issingle:
            return sumhists[0]
        return sumhists
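
The SUM step effectively transposes allhists (one list of histograms per sample) into one list of per-sample histograms per variable, then accumulates each list into a single histogram. A tiny sketch of that transpose-and-accumulate pattern with plain numbers standing in for TH1 objects:

# allhists[i][j]: "histogram" of sample i for variable j (floats stand in for TH1s)
allhists = [[1.0, 2.0, 3.0],   # sample A
            [0.5, 1.5, 2.5],   # sample B
            [2.0, 0.0, 1.0]]   # sample C
sumhists = []
for ivar in range(3):  # loop over variables
    subhists = [subhists[ivar] for subhists in allhists]  # same slicing as above
    sumhists.append(sum(subhists))  # the TH1 version clones the first, then Add()s the rest
print(sumhists)  # [3.5, 3.5, 6.5]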
Example 5
    def gethists(self, *args, **kwargs):
        """Create and fill histograms for all samples and return lists of histograms."""
        verbosity = LOG.getverbosity(kwargs)
        if verbosity >= 1:
            print ">>> gethists"
        variables, selection, issingle = unwrap_gethist_args(*args)
        datavars = filter(lambda v: v.data,
                          variables)  # filter out gen-level variables
        dodata = kwargs.get('data', True)  # create data hists
        domc = kwargs.get('mc', True)  # create MC hists (default for exp and signal)
        doexp = kwargs.get('exp',
                           domc)  # create expected (SM background) hists
        dosignal = kwargs.get(
            'signal', domc and
            self.sigsamples)  # create signal hists (for new physics searches)
        weight = kwargs.get('weight', "")  # extra weight (for MC only)
        dataweight = kwargs.get('dataweight', "")  # extra weight for data
        replaceweight = kwargs.get('replaceweight',
                                   None)  # replace substring of weight
        split = kwargs.get('split', True)  # split samples into components
        blind = kwargs.get(
            'blind',
            True)  # blind data in some given range: blind={xvar:(xmin,xmax)}
        scaleup = kwargs.get('scaleup', 0.0)  # scale up histograms
        reset = kwargs.get('reset', False)  # reset scales
        parallel = kwargs.get('parallel',
                              False)  # create and fill hists in parallel
        tag = kwargs.get('tag', "")
        method = kwargs.get(
            'method', None
        )  # data-driven method; 'QCD_OSSS', 'QCD_ABCD', 'JTF', 'FakeFactor', ...
        imethod = kwargs.get(
            'imethod', -1)  # position on list; -1 = last (bottom of stack)
        filters = kwargs.get('filter', None) or []  # filter these samples
        vetoes = kwargs.get('veto', None) or []  # filter out these samples
        #makeJTF       = kwargs.get('JTF',           False   ) and data
        #nojtf         = kwargs.get('nojtf',         makeJTF ) and data
        #keepWJ        = kwargs.get('keepWJ',        False   )
        #makeQCD       = kwargs.get('QCD',           False   ) and data and not makeJTF
        #ratio_WJ_QCD  = kwargs.get('ratio_WJ_QCD_SS', False   )
        #QCDshift      = kwargs.get('QCDshift',      0.0     )
        #QCDrelax      = kwargs.get('QCDrelax',      False   )
        #JTFshift      = kwargs.get('JTFshift',      [ ]     )
        sysvars = kwargs.get(
            'sysvars',
            {})  # list or dict to be filled up with systematic variations
        addsys = kwargs.get('addsys', True)
        task = kwargs.get('task',
                          "Creating histograms")  # task title for loading bar
        #saveto        = kwargs.get('saveto',        ""     ) # save to TFile
        #file          = createFile(saveto,text=cuts) if saveto else None
        filters = ensurelist(filters)
        vetoes = ensurelist(vetoes)
        if method and not hasattr(self, method):
            ensuremodule(method,
                         'Plotter.methods')  # load SampleSet class method

        # FILTER
        samples = []
        for sample in self.samples:
            if not dosignal and sample.issignal: continue
            if not dodata and sample.isdata: continue
            if split and sample.splitsamples:
                subsamples = sample.splitsamples
            else:
                subsamples = [sample]  # sample itself
            for subsample in subsamples:
                if filters and not subsample.match(*filters): continue
                if vetoes and subsample.match(*vetoes): continue
                samples.append(subsample)
        #if nojtf:
        #  samples = [s for s in samples if not ((not keepWJ and s.match('WJ',"W*J","W*j")) or "gen_match_2==6" in s.cuts or "genPartFlav_2==0" in s.cuts)]

        # INPUT / OUTPUT
        mcargs = (variables, selection)
        dataargs = (datavars, selection)
        expkwargs = {
            'tag': tag,
            'weight': weight,
            'replaceweight': replaceweight,
            'verbosity': verbosity,
        }  #'nojtf': nojtf
        sigkwargs = {
            'tag': tag,
            'weight': weight,
            'replaceweight': replaceweight,
            'verbosity': verbosity,
            'scaleup': scaleup
        }
        datakwargs = {
            'tag': tag,
            'weight': dataweight,
            'verbosity': verbosity,
            'blind': blind,
            'parallel': parallel
        }
        result = HistSet(
            variables, dodata, doexp, dosignal
        )  # holds histogram dicts keyed by variable: data (single hist), exp and signal (lists)
        if not variables:
            LOG.warning(
                "Sample.gethists: No variables to make histograms for...")
            return result

        # PRINT
        bar = None
        if verbosity >= 2:
            if not ('QCD' in task or 'JFR' in task):
                LOG.header("Creating histograms for %s" % selection)  #.title
            print ">>> variables: '%s'" % ("', '".join(v.filename
                                                       for v in variables))
            #print ">>> split=%s, makeQCD=%s, makeJTF=%s, nojtf=%s, keepWJ=%s"%(split,makeQCD,makeJTF,nojtf,keepWJ)
            print '>>>   with extra weights "%s" for MC and "%s" for data' % (
                weight, dataweight)
        elif self.loadingbar and verbosity <= 1:
            bar = LoadingBar(len(samples),
                             width=16,
                             pre=">>> %s: " % (task),
                             counter=True,
                             remove=True)  # %s: selection.title

        # GET HISTOGRAMS (PARALLEL)
        if parallel:
            expproc = MultiProcessor()
            sigproc = MultiProcessor()
            dataproc = MultiProcessor()
            for sample in samples:
                if reset: sample.resetscale()
                if sample.name in self.ignore: continue
                if dosignal and sample.issignal:  # SIGNAL
                    sigproc.start(sample.gethist,
                                  mcargs,
                                  sigkwargs,
                                  name=sample.title)
                elif doexp and sample.isexp:  # EXPECTED (SM BACKGROUND)
                    expproc.start(sample.gethist,
                                  mcargs,
                                  expkwargs,
                                  name=sample.title)
                elif dodata and sample.isdata:  # DATA
                    dataproc.start(sample.gethist,
                                   dataargs,
                                   datakwargs,
                                   name=sample.title)
            for dtype, processor, varset in [('exp', expproc, variables),
                                             ('sig', sigproc, variables),
                                             ('data', dataproc, datavars)]:
                for process in processor:
                    if bar: bar.message(process.name)
                    newhists = process.join()
                    for var, hist in zip(
                            varset,
                            newhists):  # assume match variables -> histograms
                        if dtype == 'data':
                            getattr(result, dtype)[var] = hist
                        else:
                            getattr(result, dtype)[var].append(hist)
                    if bar: bar.count("%s done" % process.name)

        # GET HISTOGRAMS (SEQUENTIAL)
        else:
            for sample in samples:
                if bar: bar.message(sample.title)
                if reset: sample.resetscale()
                if sample.name in self.ignore:
                    if bar: bar.count("%s skipped" % sample.title)
                    continue
                if dosignal and sample.issignal:  # SIGNAL
                    hists = sample.gethist(*mcargs, **sigkwargs)
                    for var, hist in zip(variables, hists):
                        result.signal[var].append(hist)
                elif doexp and sample.isexp:  # EXPECTED (SM BACKGROUND)
                    hists = sample.gethist(*mcargs, **expkwargs)
                    for var, hist in zip(variables, hists):
                        result.exp[var].append(hist)
                elif dodata and sample.isdata:  # DATA
                    hists = sample.gethist(*dataargs, **datakwargs)
                    for var, hist in zip(datavars, hists):
                        result.data[var] = hist
                if bar: bar.count("%s done" % sample.title)

        # EXTRA METHODS
        if method:
            hists = getattr(self, method)(*dataargs, **kwargs)
            for var, hist in zip(datavars, hists):
                idx = imethod if imethod >= 0 else len(
                    result.exp[var]) + 1 + imethod
                result.exp[var].insert(idx, hist)

        ## ADD QCD
        #if makeJTF:
        #  hists = self.jetTauFake(*argsD,tag=tag,weight=weight,replaceweight=replaceweight,verbosity=verbosity,saveToFile=file,parallel=parallel,shift=JTFshift,sysvars=sysvars,addsys=addsys)
        #  for var, hist in zip(variablesD,hists):
        #    result.exp[var].insert(0,hist)
        #elif makeQCD:
        #  hists = self.QCD(*argsD,tag=tag,weight=weight,replaceweight=replaceweight,verbosity=verbosity,shift=QCDshift,ratio_WJ_QCD_SS=ratio_WJ_QCD,saveToFile=file,parallel=parallel)
        #  for var, hist in zip(variablesD,hists):
        #    result.exp[var].insert(0,hist)

        ## SAVE histograms
        #if file:
        #  file.cd()
        #  for hist in histsD + result.exp + result.exp:
        #    hist.GetXaxis().SetTitle(var)
        #    hist.Write(hist.GetName())
        #    #file.Write(hist.GetName())
        #  file.Close()

        # YIELDS
        if verbosity >= 2 and len(variables) > 0:
            var = variables[0]
            print ">>> selection:"
            print ">>>  %r" % (selection.selection)
            print ">>> yields: "
            TAB = LOG.table("%11.1f %11.2f    %r")
            TAB.printheader("entries", "integral", "hist name")
            totint = 0
            totent = 0
            if dodata:
                TAB.printrow(result.data[var].Integral(),
                             result.data[var].GetEntries(),
                             result.data[var].GetName())
            for hist in result.exp[var]:
                totint += hist.Integral()
                totent += hist.GetEntries()
                TAB.printrow(hist.Integral(), hist.GetEntries(),
                             hist.GetName())
            TAB.printrow(totint, totent, "total exp.")
            if dosignal:
                for hist in result.signal[var]:
                    TAB.printrow(hist.Integral(), hist.GetEntries(),
                                 hist.GetName())

        if issingle:
            result.setsingle()
        return result
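
When a data-driven method is added, the insertion index (imethod if non-negative, otherwise len(result.exp[var]) + 1 + imethod) counts from the end of the expected list for negative imethod; the default -1 places the new histogram last, i.e. at the bottom of the stack. A small standalone sketch of that rule (the process labels are illustrative stand-ins):

def insert_index(imethod, nexp):
    """Same rule as above: negative imethod counts from the end of the exp list."""
    return imethod if imethod >= 0 else nexp + 1 + imethod

exphists = ["ZTT", "TT", "W"]  # stand-ins for expected (SM background) histograms
for imethod in (0, -1):
    stack = list(exphists)
    stack.insert(insert_index(imethod, len(exphists)), "QCD")  # e.g. method='QCD_OSSS'
    print("imethod=%2d -> %s" % (imethod, stack))
# imethod= 0 -> ['QCD', 'ZTT', 'TT', 'W']
# imethod=-1 -> ['ZTT', 'TT', 'W', 'QCD']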
Example 6
    def gethist2D(self, *args, **kwargs):
        """Create and fill 2D histgram for multiple samples. Overrides Sample.gethist2D."""
        variables, selection, issingle = unwrap_gethist2D_args(*args)
        verbosity = LOG.getverbosity(kwargs)
        name = kwargs.get('name', self.name + "_merged")
        name += kwargs.get('tag', "")
        title = kwargs.get('title', self.title)
        parallel = kwargs.get('parallel', False)
        kwargs['cuts'] = joincuts(kwargs.get('cuts'), self.cuts)
        kwargs['weight'] = joinweights(kwargs.get('weight', ""),
                                       self.weight)  # pass weight down
        kwargs['scale'] = kwargs.get(
            'scale', 1.0) * self.scale * self.norm  # pass scale down
        if verbosity >= 2:
            print ">>>\n>>> MergedSample.gethist2D: %s: %s" % (color(
                name, color="grey"), self.fnameshort)
            #print ">>>    norm=%.4f, scale=%.4f, total %.4f"%(self.norm,kwargs['scale'],self.scale)

        # HISTOGRAMS
        allhists = []
        hargs = (variables, selection)
        hkwargs = kwargs.copy()
        if parallel and len(self.samples) > 1:
            hkwargs['parallel'] = False
            processor = MultiProcessor()
            for sample in self.samples:
                processor.start(sample.gethist2D,
                                hargs,
                                hkwargs,
                                name=sample.title)
            for process in processor:
                allhists.append(process.join())
        else:
            for sample in self.samples:
                if 'name' in kwargs:  # prevent memory leaks
                    hkwargs['name'] = makehistname(kwargs.get('name', ""),
                                                   sample.name)
                allhists.append(sample.gethist2D(*hargs, **hkwargs))

        # SUM
        sumhists = []
        if any(len(subhists) < len(variables) for subhists in allhists):
            LOG.error("MergedSample.gethist2D: len(subhists) = %s < %s = len(variables)" % (
                min(len(subhists) for subhists in allhists), len(variables)))
        for ivar, (xvariable, yvariable) in enumerate(variables):
            subhists = [subhists[ivar] for subhists in allhists]
            sumhist = None
            for subhist in subhists:
                if sumhist is None:
                    hname = makehistname(
                        "%s_vs_%s" % (xvariable.filename, yvariable.filename),
                        name)
                    sumhist = subhist.Clone(hname)
                    sumhist.SetTitle(title)
                    sumhist.SetDirectory(0)
                    sumhist.SetLineColor(self.linecolor)
                    sumhist.SetFillColor(kWhite if self.isdata or self.issignal
                                         else self.fillcolor)
                    sumhist.SetMarkerColor(self.fillcolor)
                    sumhists.append(sumhist)
                else:
                    sumhist.Add(subhist)
            if verbosity >= 4:
                printhist(sumhist, pre=">>>   ")
            deletehist(subhists)

        if issingle:
            return sumhists[0]
        return sumhists
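
The clone-or-add accumulation is the same as in gethist, now keyed by (xvariable, yvariable) pairs. A minimal standalone PyROOT sketch of that pattern for one variable pair (histogram names and binning are illustrative):

import ROOT

# Three per-sample 2D histograms for one (x, y) variable pair (toy binning/fills).
subhists = []
for i in range(3):
    hist = ROOT.TH2D("h_sample%d" % i, "", 10, 0., 100., 10, 0., 3.)
    hist.Fill(10. * i + 5., 1.5)
    subhists.append(hist)

sumhist = None
for subhist in subhists:
    if sumhist is None:
        sumhist = subhist.Clone("pt_vs_eta_merged")  # first one: clone under the merged name
        sumhist.SetDirectory(0)                      # keep it out of any open file
    else:
        sumhist.Add(subhist)                         # remaining ones: add in place
print("summed entries: %d" % sumhist.GetEntries())   # 3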