def prepare_data(tmpdir, nfiles=100, nentries=100, ppservers=(), silent=True):
    """Prepare a set of test ROOT files in ``tmpdir`` (in parallel) and
    collect the successfully created ones into a ``Data`` object.
    - tmpdir    : directory for the temporary files
    - nfiles    : number of files to create
    - nentries  : number of entries per file
    - ppservers : parallel-python servers for the work manager
    - silent    : silent processing?
    FIX: removed the dead locals ``tmpfile`` and ``fname`` -- both were
    computed but never used.
    """
    ## Use generic Task from Kisa
    from ostap.parallel.parallel import GenericTask as Task
    task = Task(processor=create_tree)

    wmgr = Parallel.WorkManager(ppservers=ppservers, silent=silent)

    from ostap.utils.cleanup import CleanUp
    files = [
        CleanUp.tempfile(prefix='test_kisa_', suffix='.root', dir=tmpdir)
        for i in range(nfiles)
    ]

    wmgr.process(task, [(f, nentries) for f in files])

    ## keep only the files that were actually produced
    the_files = set()
    for f in task.results():
        if os.path.exists(f):
            the_files.add(f)

    from ostap.trees.data import Data
    return Data('S', sorted(the_files))
def cproject(chain, histo, what, cuts, nentries=-1, first=0, chunk_size=1000000, silent=False):
    """Parallel projection of a (very) long chain into a histogram.
    >>> chain = ...   ## large chain
    >>> histo = ...   ## histogram template
    >>> cproject ( chain , histo , 'mass' , 'pt>10' )
    >>> chain.cproject ( histo , 'mass' , 'pt>0' )  ## ditto
    For a 12-core machine a clear speedup factor of about 8 is achieved.
    """
    from ostap.trees.trees import Chain

    ## wrap the chain into a splittable helper object
    the_chain = Chain(chain, first=first, nevents=nentries)

    the_task = ProjectTask(histo, what, cuts)
    manager = Parallel.WorkManager(silent=silent)
    manager.process(the_task, the_chain.split(chunk_size=chunk_size))

    ## merged results: (number of processed entries, summed histogram)
    n_filtered = the_task.output[0]
    histo += the_task.output[1]

    return n_filtered, histo
def _pprocess_(chain, selector, nevents=-1, first=0, shortcut=True, chunk_size=100000, ppservers=(), max_files=10, silent=False):
    """Parallel processing of a loooong chain/tree with a selector
    >>> chain    = ...
    >>> selector = ...
    >>> chain.pprocess ( selector )
    - fills ``selector.data`` (the dataset) and ``selector.stat``
      (processing statistics) as a side effect; returns 1.
    FIX: the local flag was named ``all``, shadowing the builtin --
    renamed to ``all_events``.
    """
    from ostap.trees.trees import Chain

    ch = Chain(chain)

    selection = selector.selection
    variables = selector.variables

    ## ``trivial'' configuration: plain variables and no extra cuts
    trivial = selector.trivial_vars and not selector.morecuts

    ## are we asked to process the full event range?
    all_events = 0 == first and (0 > nevents or len(chain) <= nevents)

    if all_events and trivial and 1 < len(ch.files):
        logger.info(
            "Configuration is ``trivial'': redefine ``chunk-size'' to -1")
        chunk_size = -1

    task = FillTask(variables, selection, trivial)
    wmgr = Parallel.WorkManager(ppservers=ppservers, silent=silent)
    trees = ch.split(chunk_size=chunk_size, max_files=max_files)
    wmgr.process(task, trees)
    del trees

    dataset, stat = task.output

    selector.data = dataset
    selector.stat = stat

    from ostap.logger.logger import attention
    skipped = 'Skipped:%d' % stat[2]
    skipped = '/' + attention(skipped) if stat[2] else ''
    logger.info(
        'Selector(%s): Events Processed:%d/Total:%d%s CUTS: "%s"\n# %s' %
        (selector.name, stat[1], stat[0], skipped, selector.cuts(), dataset))

    return 1
def cproject(chain, histo, what, cuts):
    """Project a (very) long TChain into a histogram, parallelising
    over the files of the chain.
    >>> chain = ...   ## large chain
    >>> histo = ...   ## histogram template
    >>> cproject ( chain , histo , 'mass' , 'pt>10' )
    >>> chain.cproject ( histo , 'mass' , 'pt>0' )  ## ditto
    For a 12-core machine a clear speedup factor of about 8 is achieved.
    """
    ## trivial guards
    if not chain:
        return 0, histo
    if not histo:
        logger.error('cproject: invalid histogram')
        return 0, histo

    import ROOT
    histo.Reset()

    ## only a TChain can be split file-by-file
    if not isinstance(chain, ROOT.TChain):
        logger.warning(
            'cproject method is TChain-specific, skip parallelization')
        from ostap.trees.trees import _tt_project_
        return _tt_project_(chain, histo, what, cuts)

    if isinstance(cuts, ROOT.TCut):
        cuts = str(cuts)
    ## normalize 'what' into a list of expressions
    if isinstance(what, str):
        what = what.split(';')
    if isinstance(what, str):
        what = [what]

    import ostap.trees.trees

    tree_name = chain.GetName()
    params = [(fname, tree_name, str(w), cuts)
              for fname in chain.files()
              for w in what]

    task = ProjectTask(histo)
    wmgr = Parallel.WorkManager()
    wmgr.process(task, params)

    filtered = task.output[0]
    histo += task.output[1]
    return filtered, histo
def fillDataSet(chain, variables, selection, ppservers=()):
    """Fill a dataset from a loooong TChain, parallelising per file.
    >>> chain =
    >>> vars  = ...
    >>> dset  = fillDataSet ( chain , vars , 'pt>10' )
    - for a 12-core machine a clear speed-up factor of about 8 is achieved
    """
    task = FillTask(variables, selection)
    manager = Parallel.WorkManager(ppservers=ppservers)

    ## one job per file of the chain
    tree_name = chain.GetName()
    jobs = [(tree_name, fname) for fname in chain.files()]

    manager.process(task, jobs)
    return task.output
def _pStatVar_(chain, what, cuts='', nevents=-1, first=0, chunk_size=100000, max_files=10, ppservers=(), silent=True):
    """Parallel statVar for a loooong chain/tree
    >>> chain = ...
    >>> chain.pstatVar( 'mass' , 'pt>1')
    """
    ## last event of the requested range (clamped by n_large)
    last = min(n_large, first + nevents if 0 < nevents else n_large)

    ## trivial cases: short ranges and small trees go sequentially
    small = False
    if 0 <= first and 0 < nevents < chunk_size:
        small = True
    elif isinstance(chain, ROOT.TChain):
        small = chain.nFiles() < 5 and len(chain) < chunk_size
    elif isinstance(chain, ROOT.TTree):
        small = len(chain) < chunk_size

    if small:
        return chain.statVar(what, cuts, first, last)

    from ostap.trees.trees import Chain
    ch = Chain(chain, first=first, nevents=nevents)

    task = StatVarTask(what, cuts)
    wmgr = Parallel.WorkManager(ppservers=ppservers, silent=silent)

    chunks = ch.split(chunk_size=chunk_size, max_files=max_files)
    wmgr.process(task, chunks)
    del chunks
    del ch

    return task.output
def tproject(
        tree,                ## the tree
        histo,               ## histogram
        what,                ## variable/expression/list to be projected
        cuts='',             ## selection/weighting criteria
        nentries=-1,         ## number of entries
        first=0,             ## the first entry
        chunk_size=1000000,  ## chunk size
        silent=False):       ## silent processing
    """Make a projection of the loooong tree into histogram
    >>> tree  = ...  ## large chain
    >>> histo = ...  ## histogram template
    >>> tproject ( tree , histo , 'mass' , 'pt>10' )
    >>> tree.pproject ( histo , 'mass' , 'pt>10' )  ## ditto
    - significant gain can be achieved for very large TTrees with
      complicated expressions and cuts
    Arguments:
    - tree        the tree
    - histo       the histogram
    - what        variable/expression/varlist to be projected
    - cuts        selection/weighting criteria
    - nentries    number of entries to process (<0: all entries in the tree)
    - first       the first entry to process
    - chunk_size  chunk size for parallel processing
    """
    from ostap.trees.trees import Tree
    ## FIX: was ``nevents=nevents`` -- ``nevents`` is undefined here
    ## (the parameter is ``nentries``), causing a guaranteed NameError
    ch = Tree(tree, first=first, nevents=nentries)

    task = ProjectTask(histo, what, cuts)
    wmgr = Parallel.WorkManager(silent=silent)
    wmgr.process(task, ch.split(chunk_size=chunk_size))

    filtered = task.output[0]
    histo += task.output[1]
    return filtered, histo
def tproject(
        tree,                 ## the tree
        histo,                ## histogram
        what,                 ## variable/expression/list to be projected
        cuts='',              ## selection/weighting criteria
        nentries=-1,          ## number of entries
        first=0,              ## the first entry
        maxentries=1000000):  ## chunk size
    """Make a projection of the loooong tree into histogram
    >>> tree  = ...  ## large chain
    >>> histo = ...  ## histogram template
    >>> tproject ( tree , histo , 'mass' , 'pt>10' )
    >>> tree.pproject ( histo , 'mass' , 'pt>10' )  ## ditto
    - significant gain can be achieved for very large TTrees with
      complicated expressions and cuts
    - maxentries parameter should be rather large
    Arguments:
    - tree       the tree
    - histo      the histogram
    - what       variable/expression/varlist to be projected
    - cuts       selection/weighting criteria
    - nentries   number of entries to process (<0: all entries in the tree)
    - first      the first entry to process
    - maxentries chunk size for parallel processing
    FIXES:
    - ``total`` was used by several early-return branches before it was
      defined (NameError); it is now computed up-front
    - the tail chunk used the already-incremented chunk counter as its
      offset, silently skipping ``csize`` events, and its size was the
      remainder w.r.t. ``maxentries`` instead of the actual chunk size
    - guard against ZeroDivisionError when ``nchunks == 0``
    """
    if not tree:
        return 0, histo
    if not histo:
        logger.error('tproject: invalid histogram')
        return 0, histo

    import ROOT
    histo.Reset()

    num = len(tree)
    if num <= first:
        return 0, histo

    if 0 > nentries:
        nentries = n_large
    maxentries = long(maxentries)
    if 0 >= maxentries:
        maxentries = n_large
    if 0 > first:
        first = 0

    ## total number of events to process (computed BEFORE any early return
    ## that needs it)
    total = min(num - first, nentries)

    ## the regular (sequential) projection
    from ostap.trees.trees import _tt_project_

    fname = None
    tname = None

    if isinstance(tree, ROOT.TChain):
        if 1 == len(tree.files()):
            fname = tree.files()[0]
            tname = tree.GetName()
        else:
            logger.warning('``tproject'
                           ' method is TTree-specific, skip parallelization')
            return _tt_project_(tree, histo, what, cuts, '', nentries, first)
    else:
        tdir = tree.GetDirectory()
        ftree = tdir.GetFile()
        if not ftree:
            logger.debug('TTree is not file resident, skip parallelization')
            return _tt_project_(tree, histo, what, cuts, '', total, first)
        fname = ftree.GetName()
        tpath = tdir.GetPath()
        pr, d, path = tpath.rpartition(':')
        tname = path + '/' + tree.GetName()

    if not fname:
        logger.info("Can't determine fname, skip parallelization")
        return _tt_project_(tree, histo, what, cuts, '', total, first)
    if not tname:
        logger.info("Can't determine tname, skip parallelization")
        return _tt_project_(tree, histo, what, cuts, '', total, first)

    if isinstance(cuts, ROOT.TCut):
        cuts = str(cuts)
    if isinstance(what, ROOT.TCut):
        what = str(what)
    ## normalize 'what' into a list of expressions
    if isinstance(what, str):
        what = what.split(',')
    if isinstance(what, str):
        what = what.split(';')
    if isinstance(what, str):
        what = [what]

    ## nothing to project
    if not what:
        return 0, histo

    ## the event range is rather short, no real need in parallel processing
    if total * len(what) < maxentries and len(what) < 4:
        return _tt_project_(tree, histo, what, cuts, '', total, first)

    ## number of chunks & chunk size
    nchunks = total // maxentries
    if not nchunks:
        nchunks = 1  ## avoid division by zero for short ranges
    csize = int(total / nchunks)  ## chunk size

    ## final list of parameters:
    ## [ (file_name, tree_name, what, cuts, first_event, num_events), ... ]
    params = []
    for i in range(nchunks):
        for w in what:
            params.append(
                (fname, tname, str(w), cuts, first + i * csize, csize))

    ## tail chunk: starts right after the last full chunk and covers the
    ## remaining ``total - nchunks * csize`` events
    tail = total - nchunks * csize
    if tail:
        for w in what:
            params.append(
                (fname, tname, str(w), cuts, first + nchunks * csize, tail))
        nchunks += 1

    task = ProjectTask(histo)
    wmgr = Parallel.WorkManager()
    wmgr.process(task, params)

    filtered = task.output[0]
    histo += task.output[1]
    return filtered, histo