def _make_dataset_(tree, variables, selection='', name='', title='', silent=False):
    """Create the dataset from the tree

    >>> tree = ...
    >>> ds , stat = tree.make_dataset ( [ 'px' , 'py' , 'pz' ] )

    Each entry of ``variables`` may be a string, a tuple/list (positional
    arguments for ``Variable``), a dict (keyword arguments for ``Variable``)
    or a ready ``Variable`` instance.  Only ``trivial`` variables whose name
    is an attribute of ``tree`` are accepted (enforced via ``assert``).

    The finite edges of each variable range are and-ed into the selection.

    Returns the pair ``( dataset , stat )`` where ``stat`` is the tuple
    ``( total , processed , processed - n_full )``:
      - ``total``     : ``len ( tree )``
      - ``processed`` : entries counted with the bare ``selection``
      - ``n_full``    : entries counted with selection and range cuts
    """
    import ostap.trees.cuts
    import ostap.fitting.roofit

    cuts = ROOT.TCut(selection)
    varset = ROOT.RooArgSet()

    # Keep every Variable wrapper referenced until the dataset is built:
    # ``varset.add`` only stores a reference to ``var.var`` (same precaution
    # as in ``make_dataset``).  NOTE(review): assumes the Variable owns its
    # RooRealVar - confirm.
    keep_alive = []

    for item in variables:

        if isinstance(item, str):
            var = Variable(item)
        elif isinstance(item, (tuple, list)):
            var = Variable(*item)
        elif isinstance(item, dict):
            var = Variable(**item)
        else:
            var = item

        assert isinstance(var, Variable), \
            "Can't create Variable from %s/%s" % (var, type(var))
        assert var.trivial, "Variable %s is not ``trivial''" % var.name
        assert hasattr(tree, var.name), \
            "Tree/Chain has no branch ``%s''" % var.name

        keep_alive.append(var)
        varset.add(var.var)

        # Turn the finite range edges into explicit cuts
        mn, mx = var.minmax
        if _minv < mn:
            cuts &= "%.16g <= %s" % (mn, var.name)
        if _maxv > mx:
            cuts &= "%s <= %.16g" % (var.name, mx)

    if not name:
        from ostap.core.core import dsID
        name = '%s_%s' % (dsID(), tree.GetName())
    if not title:
        title = '%s/%s' % (name, tree.GetTitle())

    total = len(tree)
    processed = tree.statVar('1', selection).nEntries()
    skipped = tree.statVar('1', str(cuts)).nEntries()
    stat = total, processed, processed - skipped

    from ostap.logger.utils import rooSilent, rootError
    with rooSilent(ROOT.RooFit.ERROR, True):
        with rootError(ROOT.kWarning):
            ds = ROOT.RooDataSet(name, title, tree, varset, str(cuts))

    if not silent:
        from ostap.logger.logger import attention
        skipped = 'Skipped:%d' % stat[2]
        skipped = '/' + attention(skipped) if stat[2] else ''
        logger.info(
            'make_dataset: Events Total:%d/Processed:%s%s CUTS: "%s"\n# %s' %
            (stat[0], stat[1], skipped, selection, ds))

    return ds, stat
def pprocess(chain,
             selector,
             nevents=-1,
             first=0,
             shortcut=True,      ## important
             chunk_size=100000,  ## important
             max_files=5,
             ppservers=(),
             use_frame=20000,    ## important
             silent=False):
    """Parallel processing of loooong chain/tree

    >>> chain    = ...
    >>> selector = ...
    >>> chain.pprocess ( selector )

    The chain is split into pieces (``chunk_size``, ``max_files``), a
    ``FillTask`` is run over the pieces by a ``WorkManager`` and the merged
    results are stored in ``selector.data`` and ``selector.stat``.
    Always returns 1.
    """
    from ostap.trees.trees import Chain

    the_chain = Chain(chain)

    sel_cuts = selector.selection
    sel_vars = selector.variables

    is_trivial = selector.really_trivial and not selector.morecuts

    # The whole chain is requested: start at 0 and no effective event limit
    whole_chain = first == 0 and (nevents < 0 or len(chain) <= nevents)

    if whole_chain and is_trivial and len(the_chain.files) > 1:
        logger.info(
            "Configuration is ``trivial'': redefine ``chunk-size'' to -1")
        chunk_size = -1

    task = FillTask(sel_vars, sel_cuts, is_trivial, use_frame)
    manager = WorkManager(ppservers=ppservers, silent=silent)

    pieces = the_chain.split(chunk_size=chunk_size, max_files=max_files)
    manager.process(task, pieces)
    del pieces

    dataset, stat = task.results()

    selector.data = dataset
    selector.stat = stat

    from ostap.logger.logger import attention

    label = 'Skipped:%d' % stat.skipped
    if stat.skipped:
        suffix = '/' + attention(label)
    else:
        suffix = ''

    message = 'Selector(%s): Events Processed:%d/Total:%d%s CUTS: "%s"\n# %s' % (
        selector.name, stat.processed, stat.total, suffix, selector.cuts(),
        dataset)
    logger.info(message)

    return 1
def _pprocess_(chain,
               selector,
               nevents=-1,
               first=0,
               shortcut=True,
               chunk_size=100000,
               ppservers=(),
               silent=False):
    """Parallel processing of loooong chain/tree

    >>> chain    = ...
    >>> selector = ...
    >>> chain.pprocess ( selector )

    The chain is split into pieces of ``chunk_size``, a ``FillTask`` is run
    over them by ``Parallel.WorkManager`` and the task output is stored in
    ``selector.data`` and ``selector.stat``.  Always returns 1.
    """
    from ostap.trees.trees import Chain

    the_chain = Chain(chain)

    sel_cuts = selector.selection
    sel_vars = selector.variables
    is_trivial = selector.trivial

    # The whole chain is requested: start at 0 and no effective event limit
    whole_chain = first == 0 and (nevents < 0 or len(chain) <= nevents)

    if whole_chain and is_trivial and len(the_chain.files) > 1:
        logger.info(
            "Configuration is ``trivial'': redefine ``chunk-size'' to -1")
        chunk_size = -1

    task = FillTask(sel_vars, sel_cuts, is_trivial)
    manager = Parallel.WorkManager(ppservers=ppservers, silent=silent)
    manager.process(task, the_chain.split(chunk_size=chunk_size))

    dataset, stat = task.output

    selector.data = dataset
    selector.stat = stat

    from ostap.logger.logger import attention

    # stat is indexed as ( total , processed , skipped )
    label = 'Skipped:%d' % stat[2]
    if stat[2]:
        suffix = '/' + attention(label)
    else:
        suffix = ''

    message = 'Selector(%s): Events Processed:%d/Total:%d%s CUTS: "%s"\n# %s' % (
        selector.name, stat[1], stat[0], suffix, selector.cuts(), dataset)
    logger.info(message)

    return 1
def parallel_fill(chain,
                  selector,
                  nevents=-1,
                  first=0,
                  shortcut=True,       ## important
                  chunk_size=1000000,  ## important
                  max_files=5,
                  use_frame=20000,     ## important
                  silent=False,
                  job_chunk=-1,
                  **kwargs):
    """Parallel processing of loooong chain/tree

    >>> chain    = ...
    >>> selector = ...
    >>> chain.pprocess ( selector )

    ``selector`` must be a ``SelectorWithVars``.  The chain is split into
    pieces (``chunk_size``, ``max_files``), a ``FillTask`` is run over them
    by a ``WorkManager`` (which receives ``silent`` and ``**kwargs``) and the
    merged results are stored in ``selector.data`` / ``selector.stat``.

    Returns the pair ``( dataset , stat )``.
    """
    import ostap.fitting.roofit
    from ostap.fitting.pyselectors import SelectorWithVars
    from ostap.trees.trees import Chain

    assert isinstance(selector, SelectorWithVars), \
        "Invalid type of ``selector'': %s" % type(selector)

    the_chain = Chain(chain)

    sel_cuts = selector.selection
    sel_vars = selector.variables
    sel_roo_cuts = selector.roo_cuts

    is_trivial = selector.really_trivial and not selector.morecuts

    # The whole chain is requested: start at 0 and no effective event limit
    whole_chain = first == 0 and (nevents < 0 or len(chain) <= nevents)

    if whole_chain and is_trivial and len(the_chain.files) > 1:
        logger.info(
            "Configuration is ``trivial'': redefine ``chunk-size'' to -1")
        chunk_size = -1

    task = FillTask(variables=sel_vars,
                    selection=sel_cuts,
                    roo_cuts=sel_roo_cuts,
                    trivial=is_trivial,
                    use_frame=use_frame)

    manager = WorkManager(silent=silent, **kwargs)

    pieces = the_chain.split(chunk_size=chunk_size, max_files=max_files)
    manager.process(task, pieces, chunk_size=job_chunk)
    del pieces

    dataset, stat = task.results()

    selector.data = dataset
    selector.stat = stat

    from ostap.logger.logger import attention

    label = 'Skipped:%d' % stat.skipped
    if stat.skipped:
        suffix = '/' + attention(label)
    else:
        suffix = ''

    message = 'Selector(%s): Events Processed:%d/Total:%d%s CUTS: "%s"\n%s' % (
        selector.name, stat.processed, stat.total, suffix, selector.cuts(),
        dataset.table(prefix='# '))
    logger.info(message)

    return dataset, stat
def Terminate ( self ) :
    """Finalize the processing: report the results and release the variables

    - ends the progress bar (if any)
    - if ``GetAbort()`` is non-zero: logs a fatal message, drops the dataset
      and the variables and returns immediately
    - otherwise stores ``self.event()`` as the total number of events and,
      unless silenced, logs the event statistics and a per-variable table
      (mean/rms/min/max/number of skipped entries)
    - warns when the resulting dataset is empty
    - finally replaces the variable containers by empty tuples
    """
    #
    if self.__progress :
        self.__progress.end()
    #
    ## Aborted?
    if 0 != self.GetAbort() :
        self.__logger.fatal ( 'Selector(%s): process has been aborted!' % self.__name )

        self.__data = None
        del self.__varset
        del self.__variables
        self.__varset    = ()
        self.__variables = ()

        return  ## RETURN

    ## get total number of input events from base class
    self.__stat[0] = self.event()

    if not self.__silence :
        skipped = 'Skipped:%d' % self.skipped
        skipped = '/' + attention ( skipped ) if self.skipped else ''
        if self.trivial_sel :
            cuts = allright  ( '"%s"' % self.cuts() )
        else :
            cuts = attention ( '"%s"' % self.cuts() )
        self.__logger.info (
            'Selector(%s): Events Total:%d/Processed:%d%s CUTS: %s' % (
                self.__name    ,
                self.total     ,
                self.processed ,
                skipped        ,
                cuts           ) )
        self.__logger.info ( 'Selector(%s): dataset created:%s' % ( self.__name , self.__data ) )

    if self.__data and not self.__silence :

        ## one row per variable: ( name , description , mean , rms , min , max , skip )
        rows = []
        for v in self.__variables :
            stat = self.__data.statVar ( v.name )
            mnmx = stat.minmax ()
            mean = stat.mean   ()
            rms  = stat.rms    ()
            nskip = self.__skip [ v.name ]
            skip  = '%-d' % nskip if nskip else ''
            rows.append ( ( v.name                            ,  ## 0
                            v.description                     ,  ## 1
                            ( '%+.5g' % mean.value() ).strip() ,  ## 2
                            ( '%.5g'  % rms          ).strip() ,  ## 3
                            ( '%+.5g' % mnmx[0]      ).strip() ,  ## 4
                            ( '%+.5g' % mnmx[1]      ).strip() ,  ## 5
                            skip                              ) ) ## 6
        rows.sort()

        ## column widths: at least as wide as the (padded) header
        name_l = len ( 'Variable'    ) + 2
        desc_l = len ( 'Description' ) + 2
        mean_l = len ( 'mean'        ) + 2
        rms_l  = len ( 'rms'         ) + 2
        min_l  = len ( 'min'         ) + 2
        max_l  = len ( 'max'         ) + 2
        skip_l = len ( 'Skip'        )
        for row in rows :
            name_l = max ( name_l , len ( row[0] ) )
            desc_l = max ( desc_l , len ( row[1] ) )
            mean_l = max ( mean_l , len ( row[2] ) )
            rms_l  = max ( rms_l  , len ( row[3] ) )
            min_l  = max ( min_l  , len ( row[4] ) )
            max_l  = max ( max_l  , len ( row[5] ) )
            skip_l = max ( skip_l , len ( row[6] ) )

        sep = '# -%s+%s+%s+%s+%s-' % ( ( name_l          + 2 ) * '-' ,
                                       ( desc_l          + 2 ) * '-' ,
                                       ( mean_l + rms_l  + 5 ) * '-' ,
                                       ( min_l  + max_l  + 5 ) * '-' ,
                                       ( skip_l          + 2 ) * '-' )
        fmt = '# %%%ds | %%-%ds | %%%ds / %%-%ds | %%%ds / %%-%ds | %%-%ds ' % (
            name_l , desc_l , mean_l , rms_l , min_l , max_l , skip_l )

        report  = 'Dataset(%s) created:' % self.__name
        report += ' ' + allright ( '%s entries, %s variables' % ( len ( self.__data    ) ,
                                                                  len ( self.variables ) ) )
        if self.trivial_vars : report += ' Vars:' + allright  ( 'trivial'     ) + ';'
        else                 : report += ' Vars:' + attention ( 'non-trivial' ) + ';'
        if self.trivial_sel  : report += ' Cuts:' + allright  ( 'trivial'     ) + ';'
        else                 : report += ' Cuts:' + attention ( 'non-trivial' ) + ';'
        if not self.__cuts   : report += ' ' + allright  ( 'no py-cuts'   )
        else                 : report += ' ' + attention ( 'with py-cuts' )

        header = fmt % ( 'Variable' , 'Description' , 'mean' , 'rms' , 'min' , 'max' , 'skip' )

        lines = [ report , sep , header , sep ]
        for row in rows :
            lines.append ( fmt % ( row[0] , row[1] , row[2] , row[3] ,
                                   row[4] , row[5] , attention ( row[6] ) ) )
        lines.append ( sep )

        self.__logger.info ( '\n'.join ( lines ) )

    if not len ( self.__data ) :
        ## ``values()`` works for both Python 2 and 3 (``iteritems`` is Python 2 only)
        skip = sum ( self.__skip.values() )
        self.__logger.warning ( "Selector(%s): empty dataset! Total:%s/Processed:%s/Skipped:%d"
                                % ( self.__name , self.total , self.processed , skip ) )

    ## attention: delete these
    del self.__varset
    del self.__variables

    self.__varset    = ()
    self.__variables = ()
def __init__ ( self              ,
               variables         ,  ## list of variables
               selection         ,  ## Tree-selection
               cuts      = None  ,
               name      = ''    ,
               fullname  = ''    ,
               silence   = False ) :
    """Book the variables and the (empty) dataset

    - variables : sequence of variable specifications; each entry may be a
                  string, a ``ROOT.RooAbsReal``, a tuple/list (positional
                  arguments for ``Variable``), a dict (keyword arguments for
                  ``Variable``) or a ``Variable`` instance
    - selection : the selection, passed to ``SelectorWithCuts.__init__``
    - cuts      : optional python-level cuts; any true value marks the
                  configuration as non-trivial
    - name      : the name; ``dsID()`` is used when empty
    - fullname  : the dataset title; defaults to ``name``
    - silence   : suppress the informational printout
    """
    if not name :
        from ostap.core.core import dsID
        name = dsID()

    if not fullname : fullname = name

    self.__name = name
    #
    ## the module-level logger is used
    #
    self.__logger  = logger
    self.__silence = silence

    ## assert 0 < len(variables) , "Empty list of variables"
    #
    ## instantiate the base class
    #
    SelectorWithCuts.__init__ ( self , selection ) ## initialize the base

    self.__cuts      = cuts
    self.__variables = []
    self.__varset    = ROOT.RooArgSet()
    self.__triv_vars = True

    for v in variables :

        vv = v
        if   isinstance ( v , str              ) : vv = Variable (   v )
        elif isinstance ( v , ROOT.RooAbsReal  ) : vv = Variable (   v )
        elif isinstance ( v , ( tuple , list ) ) : vv = Variable (  *v )
        elif isinstance ( v , dict             ) : vv = Variable ( **v )

        assert isinstance ( vv , Variable ), 'Invalid variable %s/%s' % ( vv , type ( vv ) )

        self.__variables.append ( vv     )
        self.__varset   .add    ( vv.var )
        #
        ## the check must use the converted ``vv``: the raw ``v`` can be
        ## a string, a tuple or a dict and has no such attributes
        plain_branch = vv.trivial and vv.name == vv.formula
        if not ( plain_branch or vv.formula ) :
            self.__triv_vars = False

    self.__variables = tuple ( self.__variables )

    self.__triv_sel = valid_formula ( selection , self.__varset )
    triv_cuts       = not cuts

    self.__trivial  = self.__triv_vars and self.__triv_sel and triv_cuts

    if not silence :
        tv = allright ( 'True' ) if self.__triv_vars else attention ( 'False' )
        ts = allright ( 'True' ) if self.__triv_sel  else attention ( 'False' )
        tc = allright ( 'True' ) if triv_cuts        else attention ( 'False' )
        self.__logger.info ( "Suitable for fast processing: variables:%s, selection:%s, py-cuts:%s" % ( tv , ts , tc ) )

    if not self.__silence :
        ## table with the booked variables
        nl = 0
        dl = 0
        for v in self.__variables :
            nl = max ( nl , len ( v.name        ) )
            dl = max ( dl , len ( v.description ) )
        dl = max ( dl , len ( 'Description' ) + 2 )
        nl = max ( nl , len ( 'Variable'    ) + 2 )

        line1    = '\n# | %%%ds | %%-%ds | min / max | Trivial? | ' % ( nl , dl )
        line2    = '\n# | %%%ds | %%-%ds | %%+11.3g / %%-+11.3g | %%s | ' % ( nl , dl )
        the_line = 'Booked %d variables:' % len ( self.variables )
        sep      = '\n# +%s+%s+%s+%s+' % ( ( nl + 2 ) * '-' , ( dl + 2 ) * '-' , 27 * '-' , 10 * '-' )
        the_line += sep
        the_line += line1 % ( 'Variable' , 'Description' )
        the_line += sep
        for v in self.__variables :
            trivial = allright ( 'True' ) + 4 * ' ' if v.trivial else attention ( 'False' ) + 3 * ' '
            the_line += line2 % ( v.name        ,
                                  v.description ,
                                  v.minmax[0]   ,
                                  v.minmax[1]   ,
                                  trivial       )
        the_line += sep
        self.__logger.info ( the_line )

    ## Book dataset
    self.__data = ROOT.RooDataSet ( self.name     ,
                                    fullname      ,
                                    self.__varset )
    #
    ## it is still very puzzling for me: should this line be here at all??
    ROOT.SetOwnership ( self.__data , False )

    self.__progress = None
    from collections import defaultdict
    self.__skip     = defaultdict ( int )  ## per-variable counters, read in Terminate
    self.__notifier = None
    self.__stat     = [ 0 , 0 , 0 ]
def make_dataset ( tree , variables , selection = '' , name = '' , title = '' , silent = False ) :
    """Create the dataset from the tree

    >>> tree = ...
    >>> ds , stat = tree.make_dataset ( [ 'px' , 'py' , 'pz' ] )

    Each entry of ``variables`` may be a string, a ``ROOT.RooRealVar``,
    a tuple/list (positional arguments for ``Variable``), a dict (keyword
    arguments for ``Variable``) or a ``Variable``; other entries are reported
    via ``logger.error`` and skipped.

    - ``trivial`` variables (name equal to formula) are read directly from
      the tree; their finite range edges are added to the cuts
    - variables with a formula are added afterwards as ``RooFormulaVar``
      columns; their finite range edges are applied as additional cuts
    - leaves needed only by formulas/selection are loaded temporarily and
      removed from the final dataset

    Returns the pair ``( dataset , stat )`` where ``stat`` is the tuple
    ``( total , processed , processed - n_full )``.
    """
    import ostap.trees.cuts
    import ostap.fitting.roofit
    ## Imported unconditionally: a conditional import would leave the local
    ## name ``dsID`` unbound when ``name`` is given but ``dsID`` is still
    ## needed for the intermediate dataset below.
    from ostap.core.core import dsID, fID

    varset   = ROOT.RooArgSet()
    kept     = set()   ## keeps the Variable objects referenced while ``varset`` is in use
    formulas = []

    if isinstance ( selection , ROOT.TCut ) : selection = str ( selection )
    if isinstance ( selection , str       ) : selection = selection.strip()

    cuts = [ selection ] if selection else []

    for v in variables :

        if   isinstance ( v , str              ) : vv = Variable (   v )
        elif isinstance ( v , ROOT.RooRealVar  ) : vv = Variable (   v )
        elif isinstance ( v , ( tuple , list ) ) : vv = Variable (  *v )
        elif isinstance ( v , dict             ) : vv = Variable ( **v )
        elif isinstance ( v , Variable         ) : vv = v
        else :
            logger.error ( "Do not know how to treat the variable %s/%s, skip it" % ( v , type ( v ) ) )
            continue

        if vv.trivial and vv.name == vv.formula :
            assert hasattr ( tree , vv.name ) , "Tree/Chain has no branch ``%s''" % vv.name
            varset.add ( vv.var )
            kept.add   ( vv     )
        elif vv.formula :
            formulas.append ( vv )
            continue
        else :
            logger.error ( "Do not know how to treat the variable %s, skip it" % vv.name )
            continue

        mn , mx = vv.minmax
        if _minv < mn : cuts.append ( "(%.16g <= %s)" % ( mn      , vv.name ) )
        if _maxv > mx : cuts.append ( "(%s <= %.16g)" % ( vv.name , mx      ) )

    ##
    cuts = ROOT.TCut ( ' && '.join ( cuts ) ) if cuts else ROOT.TCut()

    ## extended varset: requested variables + leaves used by the expressions
    stor    = set()   ## keeps the temporary Variable objects referenced
    varsete = ROOT.RooArgSet()
    for v in varset : varsete.add ( v )

    expressions = [ f.formula for f in formulas ]
    if selection : expressions.append ( selection )

    if expressions :

        ## for a chain, take the first valid element to parse the formulas
        tt = None
        if isinstance ( tree , ROOT.TChain ) :
            nf = len ( tree.files() )
            for i in range ( nf ) :
                tt = tree [ i ]
                if tt : break
        if not tt : tt = tree

        for expression in expressions :
            tf = Ostap.Formula ( fID () , str ( expression ) , tt )
            assert tf.ok() , 'Invalid formula %s' % expression
            ## collect all leaves used by the formula
            i    = 0
            leaf = tf.GetLeaf ( i )
            while leaf :
                lname = leaf.GetName()
                if lname not in varsete :
                    v = Variable ( lname )
                    varsete.add ( v.var )
                    stor.add    ( v     )
                i   += 1
                leaf = tf.GetLeaf ( i )
            del tf

    if not name  : name  = '%s_%s' % ( dsID() , tree.GetName () )
    if not title : title = '%s/%s' % ( name   , tree.GetTitle() )

    total     = len ( tree )
    processed = tree.statVar ( '1' , selection    ).nEntries()
    skipped   = tree.statVar ( '1' , str ( cuts ) ).nEntries()
    stat      = total , processed , processed - skipped

    from ostap.logger.utils import rooSilent, rootError
    with rooSilent ( ROOT.RooFit.ERROR , True ) :
        with rootError ( ROOT.kWarning ) :
            ds      = ROOT.RooDataSet ( name , title , tree , varsete , str ( cuts ) )
            varsete = ds.get()

    ## add complex expressions
    if formulas :

        vset = ds.get()
        vlst = ROOT.RooArgList()
        for v in vset : vlst.add ( v )

        fcols = ROOT.RooArgList()

        ffs   = []   ## keeps the RooFormulaVar objects referenced
        fcuts = []
        for f in formulas :
            fv = ROOT.RooFormulaVar ( f.name , f.description , f.formula , vlst )
            assert fv.ok() , 'Invalid formula: %s' % f.formula
            ffs.append ( fv )
            fcols.add  ( fv )
            mn , mx = f.minmax
            if _minv < mn : fcuts.append ( "(%.16g <= %s)" % ( mn      , fv.name ) )
            if _maxv > mx : fcuts.append ( "(%s <= %.16g)" % ( fv.name , mx      ) )

        ds.addColumns ( fcols )

        ## apply cuts (if any) for the complex expressions
        if fcuts :
            fcuts = [ '(%s)' % f for f in fcuts ]
            fcuts = ' && '.join ( fcuts )
            _vars = ds.get()
            ds1   = ROOT.RooDataSet ( dsID() , ds.title , ds , _vars , fcuts )
            ds.clear()
            del ds
            ds      = ds1
            varsete = ds.get()

        nvars = ROOT.RooArgSet()
        for v in varset   : nvars.add ( v     )
        for v in formulas : nvars.add ( v.var )
        varset  = nvars
        varsete = ds.get()

    ## remove all temporary variables
    if len ( varset ) != len ( varsete ) :
        ds1 = ds.reduce ( varset )
        ds.clear()
        del ds
        ds = ds1

    if not silent :
        skipped = 'Skipped:%d' % stat[2]
        skipped = '/' + attention ( skipped ) if stat[2] else ''
        logger.info (
            'make_dataset: Events Total:%d/Processed:%s%s CUTS: "%s"\n# %s' % (
                stat[0] , stat[1] , skipped , selection , ds ) )

    return ds , stat
def _ds_table_0_(dataset, variables=(), cuts='', first=0, last=2**62):
    """Print data set as table

    - dataset   : the dataset to be summarized
    - variables : variable names as a string (separated by spaces, commas or
                  semicolons) or as a sequence; all variables when empty
    - cuts      : cuts passed to ``statVar``
    - first     : first entry passed to ``statVar``
    - last      : last entry passed to ``statVar``

    For each selected variable the name, title, mean, rms, minimum and
    maximum are shown; for a weighted dataset a row for the weight variable
    is appended when the weight can be accessed.

    Returns the pair ``( table , width )``; ``( header , 120 )`` is returned
    when no variables are selected.
    NOTE(review): for an invalid dataset a bare empty string is returned,
    not a pair; kept unchanged for compatibility.
    """

    def _row(label, title, stat):
        ## format one table row from a statistics object
        mnmx = stat.minmax()
        mean = stat.mean()
        rms = stat.rms()
        return (label,                              ## 0
                title,                              ## 1
                ('%+.5g' % mean.value()).strip(),   ## 2
                ('%.5g' % rms).strip(),             ## 3
                ('%+.5g' % mnmx[0]).strip(),        ## 4
                ('%+.5g' % mnmx[1]).strip())        ## 5

    varset = dataset.get()
    if not valid_pointer(varset):
        logger.error('Invalid dataset')
        return ''

    if isinstance(variables, str):
        variables = variables.strip()
        variables = variables.replace(',', ' ')
        variables = variables.replace(';', ' ')
        variables = variables.split()

    ## Unwrap only a single *name*: a one-element sequence of variable
    ## objects must stay a sequence for the membership test below.
    if 1 == len(variables) and isinstance(variables[0], str):
        variables = variables[0]

    if isinstance(variables, str):
        if variables in varset:
            names = [variables]
        else:
            names = list(dataset.branches(variables))
    elif variables:
        names = [i.GetName() for i in varset if i in variables]
    else:
        names = [i.GetName() for i in varset]

    #
    _vars = []
    for vname in names:
        vv = getattr(varset, vname)
        _vars.append(_row(vv.GetName(), vv.GetTitle(),
                          dataset.statVar(vname, cuts, first, last)))
    _vars.sort()

    report = '# %s("%s","%s"):' % (dataset.__class__.__name__,
                                   dataset.GetName(), dataset.GetTitle())
    report += allright('%d entries, %d variables' %
                       (len(dataset), len(varset)))

    if not _vars:
        return report, 120

    weight = None

    if isinstance(dataset, ROOT.RooDataHist):

        if dataset.isNonPoissonWeighted():
            report += attention(' Binned/Weighted')
        else:
            report += allright(' Binned')

    elif dataset.isWeighted():

        if dataset.isNonPoissonWeighted():
            report += attention(' Weighted')
        else:
            report += attention(' Weighted(Poisson)')

        dstmp = None
        wvar = None

        ## 1) try to get the name of the weight variable
        store = dataset.store()
        if not valid_pointer(store):
            store = None

        if store and not isinstance(store, ROOT.RooTreeDataStore):
            ## use an empty clone converted to the tree-based storage
            dstmp = dataset.emptyClone()
            dstmp.convertToTreeStore()
            store = dstmp.store()
            if not valid_pointer(store):
                store = None

        if store and hasattr(store, 'tree') and valid_pointer(store.tree()):
            tree = store.tree()
            branches = set(tree.branches())
            vvars = set([i.GetName() for i in varset])
            ## a single extra branch is taken as the weight
            wvars = branches - vvars
            if 1 == len(wvars):
                wvar = wvars.pop()

        if not wvar:
            wvar = Ostap.Utils.getWeight(dataset)
        if wvar:
            report += attention(' with "%s"' % wvar)

        store = None
        ## explicit ``None`` test: the clone is empty by construction
        if dstmp is not None:
            dstmp.reset()
            dstmp = None

        ## 2) if weight name is known, try to get information about the weight
        if wvar:

            store = dataset.store()
            if not valid_pointer(store):
                store = None

            if store and not isinstance(store, ROOT.RooTreeDataStore):

                rargs = ROOT.RooFit.EventRange(first, last),
                if cuts:
                    ## need all variables
                    dstmp = dataset.reduce(ROOT.RooFit.Cut(cuts), *rargs)
                else:
                    ## enough to keep only 1 variable
                    vvs = ROOT.RooArgSet(varset[names[0]])
                    dstmp = dataset.reduce(ROOT.RooFit.SelectVars(vvs), *rargs)

                dstmp.convertToTreeStore()
                store = dstmp.store()
                ## cuts and range are already applied by ``reduce``
                cuts, first, last = '', 0, 2**62

            if hasattr(store, 'tree') and valid_pointer(store.tree()):
                tree = store.tree()
                if wvar in tree.branches():
                    weight = '*%s*' % wvar
                    _vars.append(_row(weight, 'Weight variable',
                                      tree.statVar(wvar, cuts, first, last)))

            store = None
            if dstmp is not None:
                dstmp.reset()
                dstmp = None

    # ==============================================================================================
    # build the actual table
    # ==============================================================================================

    name_l = len('Variable') + 2
    desc_l = len('Description') + 2
    mean_l = len('mean') + 2
    rms_l = len('rms') + 2
    min_l = len('min') + 2
    max_l = len('max') + 2
    for v in _vars:
        name_l = max(name_l, len(v[0]))
        desc_l = max(desc_l, len(v[1]))
        mean_l = max(mean_l, len(v[2]))
        rms_l = max(rms_l, len(v[3]))
        min_l = max(min_l, len(v[4]))
        max_l = max(max_l, len(v[5]))

    sep = '# +%s+%s+%s+%s+' % ((name_l + 2) * '-', (desc_l + 2) * '-',
                               (mean_l + rms_l + 5) * '-',
                               (min_l + max_l + 5) * '-')
    fmt = '# | %%-%ds | %%-%ds | %%%ds / %%-%ds | %%%ds / %%-%ds |' % (
        name_l, desc_l, mean_l, rms_l, min_l, max_l)

    header = fmt % ('Variable', 'Description', 'mean', 'rms', 'min', 'max')

    report += '\n' + sep
    report += '\n' + header
    report += '\n' + sep

    ## the weight row (always the last one) is printed separately
    vlst = _vars[:-1] if weight else _vars
    for v in vlst:
        report += '\n' + fmt % (v[0], v[1], v[2], v[3], v[4], v[5])
    report += '\n' + sep

    if weight:
        v = _vars[-1]
        line = fmt % (v[0], v[1], v[2], v[3], v[4], v[5])
        report += '\n' + line.replace(weight, attention(weight))
        report += '\n' + sep

    return report, len(sep)