def getentries(self, selection, **kwargs): """Get number of events for a given selection string.""" verbosity = LOG.getverbosity(kwargs) norm = kwargs.get('norm', True) # normalize to cross section norm = self.norm if norm else 1. parallel = kwargs.get('parallel', False) kwargs['cuts'] = joincuts(kwargs.get('cuts'), self.cuts) kwargs['weight'] = joinweights(kwargs.get('weight', ""), self.weight) # pass weight down kwargs['scale'] = kwargs.get( 'scale', 1.0) * self.scale * self.norm # pass scale down kwargs['parallel'] = False # GET NUMBER OF EVENTS nevents = 0 if parallel and len(self.samples) > 1: processor = MultiProcessor() for sample in self.samples: processor.start(sample.getentries, (selection, ), kwargs) for process in processor: nevents += process.join() else: for sample in self.samples: nevents += sample.getentries(selection, **kwargs) # PRINT if verbosity >= 3: print ">>>\n>>> MergedSample.getentries: %s" % (color( self.name, color="grey")) print ">>> entries: %d" % (nevents) print ">>> scale: %.6g (scale=%.6g, norm=%.6g)" % ( scale, self.scale, self.norm) print ">>> %r" % (cuts) return nevents
def iterevts(fnames, tree, filenevts, refresh=False, nchunks=None, ncores=0, verb=0):
    """Help function for Sample._getnevents to iterate over file names and get
    number of events processed. Generator yielding (nevts, fname) pairs.

    Parameters:
      fnames    (list): file names to inspect.
      tree:             tree (name) to count events in; passed to getnevents.
      filenevts (dict): cache mapping fname -> nevts (read, not written, here).
      refresh   (bool): if True, ignore the cache and recount every file.
      nchunks   (int):  number of subsets for parallel processing; auto-chosen
                        from the number of files when not given.
      ncores    (int):  >= 2 enables parallel counting (only for > 5 files).
      verb      (int):  verbosity level.
    """
    if ncores >= 2 and len(fnames) > 5:  # run events check in parallel
        from TauFW.Plotter.plot.MultiThread import MultiProcessor
        from TauFW.common.tools.math import partition

        def loopevts(fnames_):
            """Help function for parallel running on subsets: count each file."""
            return [(getnevents(f, tree), f) for f in fnames_]
        processor = MultiProcessor(max=ncores)
        if not nchunks:
            # heuristic: more chunks for larger file lists, at least 2 per core
            nchunks = 10 if len(fnames) < 100 else 20 if len(fnames) < 500 else 50 if len(fnames) < 1000 else 100
            nchunks = max(nchunks, 2 * ncores)
        if nchunks >= len(fnames):
            nchunks = len(fnames) - 1  # at least 6 files in this branch, so nchunks >= 5
        if verb >= 2:
            print ">>> iterevts: partitioning %d files into %d chunks for ncores=%d" % (len(fnames), nchunks, ncores)
        for i, subset in enumerate(partition(fnames, nchunks)):  # process in ncores chunks
            for fname in subset[:]:  # iterate a copy: cached entries are removed from subset below
                if not refresh and fname in filenevts:  # found in cache
                    nevts = filenevts[fname]
                    subset.remove(fname)  # don't run again
                    yield nevts, fname
            if not subset:
                # NOTE(review): this 'break' abandons ALL remaining chunks as soon
                # as one subset is fully served from the cache — verify that a
                # 'continue' (skip only this chunk) was not intended
                break
            name = "iterevts_%d" % (i)
            processor.start(loopevts, subset, name=name)
        for process in processor:  # collect output from parallel processes
            if verb >= 2:
                print ">>> iterevts: joining process %r..." % (process.name)
            nevtfiles = process.join()
            for nevts, fname in nevtfiles:
                yield nevts, fname
    else:  # run events check in series
        for fname in fnames:
            if refresh or fname not in filenevts:
                nevts = getnevents(fname, tree)
            else:  # get from cache for efficiency
                nevts = filenevts[fname]
            yield nevts, fname
def itervalid(fnames, checkevts=True, nchunks=None, ncores=4, verb=0, **kwargs): """Iterate over file names and get number of events processed & check for corruption.""" if not checkevts: # just skip validation step and return 0 for fname in fnames: yield 0, fname elif ncores >= 2 and len(fnames) > 5: # run validation in parallel from TauFW.Plotter.plot.MultiThread import MultiProcessor from TauFW.common.tools.math import partition processor = MultiProcessor(max=ncores) def loopvalid(fnames_, **kwargs): """Help function for parallel running on subsets.""" return [(isvalid(f, **kwargs), f) for f in fnames_] if not nchunks: nchunks = 10 if len(fnames) < 100 else 20 if len( fnames) < 500 else 50 if len(fnames) < 1000 else 100 nchunks = max(nchunks, 2 * ncores) if nchunks >= len(fnames): nchunks = len(fnames) - 1 if verb >= 2: print ">>> itervalid: partitioning %d files into %d chunks for ncores=%d" % ( len(fnames), nchunks, ncores) for i, subset in enumerate(partition( fnames, nchunks)): # process in ncores chunks if not subset: break name = "itervalid_%d" % (i) processor.start(loopvalid, subset, kwargs, name=name) for process in processor: if verb >= 2: print ">>> joining process %r..." % (process.name) nevtfiles = process.join() for nevts, fname in nevtfiles: yield nevts, fname else: # run validation in series for fname in fnames: if verb >= 2: print ">>> Validating job output '%s'..." % (fname) nevts = isvalid(fname) yield nevts, fname
def gethist(self, *args, **kwargs): """Create and fill histgram for multiple samples. Overrides Sample.gethist.""" variables, selection, issingle = unwrap_gethist_args(*args) verbosity = LOG.getverbosity(kwargs) name = kwargs.get('name', self.name) name += kwargs.get('tag', "") title = kwargs.get('title', self.title) parallel = kwargs.get('parallel', False) kwargs['cuts'] = joincuts(kwargs.get('cuts'), self.cuts) kwargs['weight'] = joinweights(kwargs.get('weight', ""), self.weight) # pass weight down kwargs['scale'] = kwargs.get( 'scale', 1.0) * self.scale * self.norm # pass scale down # HISTOGRAMS allhists = [] garbage = [] hargs = (variables, selection) hkwargs = kwargs.copy() if parallel and len(self.samples) > 1: hkwargs['parallel'] = False processor = MultiProcessor() for sample in self.samples: processor.start(sample.gethist, hargs, hkwargs, name=sample.title) for process in processor: allhists.append(process.join()) else: for sample in self.samples: if 'name' in kwargs: # prevent memory leaks hkwargs['name'] = makehistname(kwargs.get('name', ""), sample.name) allhists.append(sample.gethist(*hargs, **hkwargs)) # SUM sumhists = [] if any(len(subhists) < len(variables) for subhists in allhists): LOG.error( "MergedSample.gethist: len(subhists) = %s < %s = len(variables)" % (len(subhists), len(variables))) for ivar, variable in enumerate(variables): subhists = [subhists[ivar] for subhists in allhists] sumhist = None for subhist in subhists: if sumhist == None: sumhist = subhist.Clone( makehistname(variable.filename, name)) sumhist.SetTitle(title) sumhist.SetDirectory(0) sumhist.SetLineColor(self.linecolor) sumhist.SetFillColor(kWhite if self.isdata or self.issignal else self.fillcolor) sumhist.SetMarkerColor(self.fillcolor) sumhists.append(sumhist) else: sumhist.Add(subhist) if verbosity >= 4: printhist(sumhist, pre=">>> ") deletehist(subhists) # PRINT if verbosity >= 2: nentries, integral = -1, -1 for sumhist in sumhists: if sumhist.GetEntries() > nentries: 
nentries = sumhist.GetEntries() integral = sumhist.Integral() print ">>>\n>>> MergedSample.gethist - %s" % (color(name, color="grey")) print ">>> entries: %d (%.2f integral)" % (nentries, integral) if issingle: return sumhists[0] return sumhists
def gethists(self, *args, **kwargs): """Create and fill histograms for all samples and return lists of histograms.""" verbosity = LOG.getverbosity(kwargs) if verbosity >= 1: print ">>> gethists" variables, selection, issingle = unwrap_gethist_args(*args) datavars = filter(lambda v: v.data, variables) # filter out gen-level variables dodata = kwargs.get('data', True) # create data hists domc = kwargs.get('mc', True) # create expected (SM background) hists doexp = kwargs.get('exp', domc) # create expected (SM background) hists dosignal = kwargs.get( 'signal', domc and self.sigsamples) # create signal hists (for new physics searches) weight = kwargs.get('weight', "") # extra weight (for MC only) dataweight = kwargs.get('dataweight', "") # extra weight for data replaceweight = kwargs.get('replaceweight', None) # replace substring of weight split = kwargs.get('split', True) # split samples into components blind = kwargs.get( 'blind', True) # blind data in some given range: blind={xvar:(xmin,xmax)} scaleup = kwargs.get('scaleup', 0.0) # scale up histograms reset = kwargs.get('reset', False) # reset scales parallel = kwargs.get('parallel', False) # create and fill hists in parallel tag = kwargs.get('tag', "") method = kwargs.get( 'method', None ) # data-driven method; 'QCD_OSSS', 'QCD_ABCD', 'JTF', 'FakeFactor', ... 
imethod = kwargs.get( 'imethod', -1) # position on list; -1 = last (bottom of stack) filters = kwargs.get('filter', None) or [] # filter these samples vetoes = kwargs.get('veto', None) or [] # filter out these samples #makeJTF = kwargs.get('JTF', False ) and data #nojtf = kwargs.get('nojtf', makeJTF ) and data #keepWJ = kwargs.get('keepWJ', False ) #makeQCD = kwargs.get('QCD', False ) and data and not makeJTF #ratio_WJ_QCD = kwargs.get('ratio_WJ_QCD_SS', False ) #QCDshift = kwargs.get('QCDshift', 0.0 ) #QCDrelax = kwargs.get('QCDrelax', False ) #JTFshift = kwargs.get('JTFshift', [ ] ) sysvars = kwargs.get( 'sysvars', {}) # list or dict to be filled up with systematic variations addsys = kwargs.get('addsys', True) task = kwargs.get('task', "Creating histograms") # task title for loading bar #saveto = kwargs.get('saveto', "" ) # save to TFile #file = createFile(saveto,text=cuts) if saveto else None filters = ensurelist(filters) vetoes = ensurelist(vetoes) if method and not hasattr(self, method): ensuremodule(method, 'Plotter.methods') # load SampleSet class method # FILTER samples = [] for sample in self.samples: if not dosignal and sample.issignal: continue if not dodata and sample.isdata: continue if split and sample.splitsamples: subsamples = sample.splitsamples else: subsamples = [sample] # sample itself for subsample in subsamples: if filters and not subsample.match(*filters): continue if vetoes and subsample.match(*vetoes): continue samples.append(subsample) #if nojtf: # samples = [s for s in samples if not ((not keepWJ and s.match('WJ',"W*J","W*j")) or "gen_match_2==6" in s.cuts or "genPartFlav_2==0" in s.cuts)] # INPUT / OUTPUT mcargs = (variables, selection) dataargs = (datavars, selection) expkwargs = { 'tag': tag, 'weight': weight, 'replaceweight': replaceweight, 'verbosity': verbosity, } #'nojtf': nojtf sigkwargs = { 'tag': tag, 'weight': weight, 'replaceweight': replaceweight, 'verbosity': verbosity, 'scaleup': scaleup } datakwargs = { 'tag': tag, 
'weight': dataweight, 'verbosity': verbosity, 'blind': blind, 'parallel': parallel } result = HistSet( variables, dodata, doexp, dosignal ) # container for dictionaries of histogram (list): data, exp, signal if not variables: LOG.warning( "Sample.gethists: No variables to make histograms for...") return result # PRINT bar = None if verbosity >= 2: if not ('QCD' in task or 'JFR' in task): LOG.header("Creating histograms for %s" % selection) #.title print ">>> variables: '%s'" % ("', '".join(v.filename for v in variables)) #print ">>> split=%s, makeQCD=%s, makeJTF=%s, nojtf=%s, keepWJ=%s"%(split,makeQCD,makeJTF,nojtf,keepWJ) print '>>> with extra weights "%s" for MC and "%s" for data' % ( weight, dataweight) elif self.loadingbar and verbosity <= 1: bar = LoadingBar(len(samples), width=16, pre=">>> %s: " % (task), counter=True, remove=True) # %s: selection.title # GET HISTOGRAMS (PARALLEL) if parallel: expproc = MultiProcessor() sigproc = MultiProcessor() dataproc = MultiProcessor() for sample in samples: if reset: sample.resetscale() if sample.name in self.ignore: continue if dosignal and sample.issignal: # SIGNAL sigproc.start(sample.gethist, mcargs, sigkwargs, name=sample.title) elif doexp and sample.isexp: # EXPECTED (SM BACKGROUND) expproc.start(sample.gethist, mcargs, expkwargs, name=sample.title) elif dodata and sample.isdata: # DATA dataproc.start(sample.gethist, dataargs, datakwargs, name=sample.title) for dtype, processor, varset in [('exp', expproc, variables), ('sig', sigproc, variables), ('data', dataproc, datavars)]: for process in processor: if bar: bar.message(process.name) newhists = process.join() for var, hist in zip( varset, newhists): # assume match variables -> histograms if dtype == 'data': getattr(result, dtype)[var] = hist else: getattr(result, dtype)[var].append(hist) if bar: bar.count("%s done" % process.name) # GET HISTOGRAMS (SEQUENTIAL) else: for sample in samples: if bar: bar.message(sample.title) if reset: sample.resetscale() if 
sample.name in self.ignore: if bar: bar.count("%s skipped" % sample.title) continue if dosignal and sample.issignal: # SIGNAL hists = sample.gethist(*mcargs, **sigkwargs) for var, hist in zip(variables, hists): result.signal[var].append(hist) elif doexp and sample.isexp: # EXPECTED (SM BACKGROUND) hists = sample.gethist(*mcargs, **expkwargs) for var, hist in zip(variables, hists): result.exp[var].append(hist) elif dodata and sample.isdata: # DATA hists = sample.gethist(*mcargs, **datakwargs) for var, hist in zip(datavars, hists): result.data[var] = hist if bar: bar.count("%s done" % sample.title) # EXTRA METHODS if method: hists = getattr(self, method)(*dataargs, **kwargs) for var, hist in zip(datavars, hists): idx = imethod if imethod >= 0 else len( result.exp[var]) + 1 + imethod result.exp[var].insert(idx, hist) ## ADD QCD #if makeJTF: # hists = self.jetTauFake(*argsD,tag=tag,weight=weight,replaceweight=replaceweight,verbosity=verbosity,saveToFile=file,parallel=parallel,shift=JTFshift,sysvars=sysvars,addsys=addsys) # for var, hist in zip(variablesD,hists): # result.exp[var].insert(0,hist) #elif makeQCD: # hists = self.QCD(*argsD,tag=tag,weight=weight,replaceweight=replaceweight,verbosity=verbosity,shift=QCDshift,ratio_WJ_QCD_SS=ratio_WJ_QCD,saveToFile=file,parallel=parallel) # for var, hist in zip(variablesD,hists): # result.exp[var].insert(0,hist) ## SAVE histograms #if file: # file.cd() # for hist in histsD + result.exp + result.exp: # hist.GetXaxis().SetTitle(var) # hist.Write(hist.GetName()) # #file.Write(hist.GetName()) # file.Close() # YIELDS if verbosity >= 2 and len(variables) > 0: var = variables[0] print ">>> selection:" print ">>> %r" % (selection.selection) print ">>> yields: " TAB = LOG.table("%11.1f %11.2f %r") TAB.printheader("entries", "integral", "hist name") totint = 0 totent = 0 if dodata: TAB.printrow(result.data[var].Integral(), result.data[var].GetEntries(), result.data[var].GetName()) for hist in result.exp[var]: totint += hist.Integral() 
totent += hist.GetEntries() TAB.printrow(hist.Integral(), hist.GetEntries(), hist.GetName()) TAB.printrow(totint, totent, "total exp.") if dosignal: for hist in result.signal[var]: TAB.printrow(hist.Integral(), hist.GetEntries(), hist.GetName()) if issingle: result.setsingle() return result return result
def gethist2D(self, *args, **kwargs): """Create and fill 2D histgram for multiple samples. Overrides Sample.gethist2D.""" variables, selection, issingle = unwrap_gethist2D_args(*args) verbosity = LOG.getverbosity(kwargs) name = kwargs.get('name', self.name + "_merged") name += kwargs.get('tag', "") title = kwargs.get('title', self.title) parallel = kwargs.get('parallel', False) kwargs['cuts'] = joincuts(kwargs.get('cuts'), self.cuts) kwargs['weight'] = joinweights(kwargs.get('weight', ""), self.weight) # pass scale down kwargs['scale'] = kwargs.get( 'scale', 1.0) * self.scale * self.norm # pass scale down if verbosity >= 2: print ">>>\n>>> MergedSample.gethist2D: %s: %s" % (color( name, color="grey"), self.fnameshort) #print ">>> norm=%.4f, scale=%.4f, total %.4f"%(self.norm,kwargs['scale'],self.scale) # HISTOGRAMS allhists = [] hargs = (variables, selection) hkwargs = kwargs.copy() if parallel and len(self.samples) > 1: hkwargs['parallel'] = False processor = MultiProcessor() for sample in self.samples: processor.start(sample.gethist2D, hargs, hkwargs, name=sample.title) for process in processor: allhists.append(process.join()) else: for sample in self.samples: if 'name' in kwargs: # prevent memory leaks hkwargs['name'] = makehistname(kwargs.get('name', ""), sample.name) allhists.append(sample.gethist2D(*hargs, **hkwargs)) # SUM sumhists = [] if any(len(subhists) < len(variables) for subhists in allhists): LOG.error( "MergedSample.gethist2D: len(subhists) = %s < %s = len(variables)" % (len(subhists), len(variables))) for ivar, (xvariable, yvariable) in enumerate(variables): subhists = [subhists[ivar] for subhists in allhists] sumhist = None for subhist in subhists: if sumhist == None: hname = makehistname( "%s_vs_%s" % (xvariable.filename, yvariable.filename), name) sumhist = subhist.Clone(hname) sumhist.SetTitle(title) sumhist.SetDirectory(0) sumhist.SetLineColor(self.linecolor) sumhist.SetFillColor(kWhite if self.isdata or self.issignal else self.fillcolor) 
sumhist.SetMarkerColor(self.fillcolor) sumhists.append(sumhist) else: sumhist.Add(subhist) if verbosity >= 4: printhist(sumhist, pre=">>> ") deletehist(subhists) if issingle: return sumhists[0] return sumhists