Example #1
0
def chunkify_by_evts(fnames, nmax, evenly=True):
    """Test implementation of event-based splitting: break files with many events into
    several chunks so no job processes more than roughly nmax events, while grouping
    small files together as long as their combined event count stays below nmax.
    For the full implementation, see TauFW.PicoProducer.batch.utils"""
    chunks = []  # final list of chunks (each chunk is a list of 'fname[:first:max]' strings)
    large = {}   # nevts -> files with at least nmax events (to be split)
    small = {}   # nevts -> files with fewer than nmax events (to be grouped)
    for fname in fnames:
        tfile = ensureTFile(fname, 'READ')
        nevts = tfile.Get('Events').GetEntries()
        tfile.Close()
        print("%10d %s" % (nevts, fname))
        # route the file into the small or large category by its event count
        target = small if nevts < nmax else large
        target.setdefault(nevts, []).append(fname)
    print("nlarge = %s" % (large,))
    print("nsmall = %s" % (small,))
    for nevts in large:
        for fname in large[nevts]:  # one chunk per slice of nmax_ events
            nmax_ = nmax
            if evenly:  # rebalance so all slices of this file are about equal
                nchunks = ceil(float(nevts) / nmax)
                nmax_ = int(ceil(nevts / nchunks))
                print("%s %s %s %s" % (nevts, nmax, nmax_, nchunks))
            ifirst = 0
            while ifirst < nevts:
                chunks.append(["%s:%d:%d" % (fname, ifirst, nmax_)])
                ifirst += nmax_
    # flatten the small-file event counts: one entry per file
    counts = [n for n in small for _ in range(len(small[n]))]
    for group in partition_by_max(counts, nmax):
        chunk = []
        for nevts in group:
            fname = small[nevts].pop(0)  # consume one file with this event count
            chunk.append(fname + ":%d" % nevts)
        chunks.append(chunk)
    return chunks
Example #2
0
def chunkify_by_evts(fnames,maxevts,evenly=True,evtdict=None,verb=0):
  """Split list of files into chunks with total events per chunk less than the given maximum,
  and update input fnames in place to bookkeep first event and maximum events.
  E.g. ['nano_1.root','nano_2.root','nano_3.root','nano_4.root']
        -> [ ['nano_1.root:0:1000'], ['nano_1.root:1000:1000'], # 'fname:firstevt:maxevts'
             ['nano_2.root','nano_3.root','nano_4.root'] ]
  Parameters:
    fnames  - list of input file names; modified in place (large files are replaced
              by their 'fname:firstevt:maxevts' split entries)
    maxevts - maximum number of events per chunk
    evenly  - rebalance maxevts so events are spread evenly over a large file's chunks
    evtdict - optional {fname: nevts} cache to avoid reopening files; updated in place
    verb    - verbosity level
  Returns (ntot,result): total number of events and the list of chunks.
  """
  result   = [ ] # list of chunks
  nlarge   = { } # nevts -> files with nevts>=maxevts (will be split)
  nsmall   = { } # nevts -> files with nevts<maxevts (will be grouped)
  ntot     = 0   # total number of events over all files
  if verb>=4:
    print(">>> chunkify_by_evts: events per file:")
  for fname in fnames[:]: # iterate over a copy; fnames is modified in the loop
    if evtsplitexp.match(fname): # already split; cannot be split again
      # TODO: add maxevts to ntot ?
      result.append([fname]) # do not split again, keep in single chunk
      continue
    if evtdict and fname in evtdict: # get number of events from sample's dictionary to speed up
      nevts = evtdict[fname]
      if verb>=4:
        print(">>> %10d %s (dict)"%(nevts,fname))
    else: # get number of events from the file itself
      file  = ensureTFile(fname,'READ')
      nevts = file.Get('Events').GetEntries()
      file.Close()
      if isinstance(evtdict,dict):
        evtdict[fname] = nevts # store for possible later reuse (if same sample is submitted multiple times)
      if verb>=4:
        print(">>> %10d %s"%(nevts,fname))
    if nevts<maxevts: # small file: do not split, group with others in chunks, if possible
      nsmall.setdefault(nevts,[ ]).append(fname)
    else: # large file: split into several chunks below
      nlarge.setdefault(nevts,[ ]).append(fname)
      fnames.remove(fname) # replaced below by its 'fname:firstevt:maxevts' entries
    ntot += nevts
  if verb>=1:
    # count files in each category (len(nsmall)/len(nlarge) would only count distinct event counts)
    nsmallfiles = sum(len(flist) for flist in nsmall.values())
    nlargefiles = sum(len(flist) for flist in nlarge.values())
    print(">>> chunkify_by_evts: %d small files (<%d events) and %d large files (>=%d events)"%(
      nsmallfiles,maxevts,nlargefiles,maxevts))
  for nevts in nlarge:
    for fname in nlarge[nevts]: # split large files into several chunks
      maxevts_ = maxevts
      if evenly: # split events evenly over chunks
        nchunks  = ceil(float(nevts)/maxevts)
        maxevts_ = int(ceil(nevts/nchunks)) # new maxevts per chunk
        if verb>=3:
          print(">>>   nevts/maxevts = %d/%d = %.2f => make %d chunks with max. %d events"%(
            nevts,maxevts,nevts/float(maxevts),nchunks,maxevts_))
      ifirst = 0 # first event to process in first chunk
      while ifirst<nevts:
        infname = "%s:%d:%d"%(fname,ifirst,maxevts_)
        fnames.append(infname) # update for bookkeeping
        result.append([infname])
        ifirst += maxevts_
  mylist = [ ] # flat list of event counts, one entry per small file
  for nevts in nsmall:
    mylist.extend([nevts]*len(nsmall[nevts]))
  for part in partition_by_max(mylist,maxevts): # group small files into one chunk
    result.append([ ])
    for nevts in part:
      fname = nsmall[nevts][0]
      nsmall[nevts].remove(fname)
      result[-1].append(fname)
  if verb>=4:
    print(">>> chunkify_by_evts: chunks = [")
    for chunk in result:
      print(">>>   %s"%(chunk))
    print(">>> ]")
  return ntot, result
Example #3
0
def chunkify_by_evts(fnames, maxevts, evenly=True, verb=0):
    """Split list of files into chunks with total events per chunk less than the given maximum,
  and update input fnames in place to bookkeep first event and maximum events.
  E.g. ['nano_1.root','nano_2.root','nano_3.root','nano_4.root']
        -> [ ['nano_1.root:0:1000'], ['nano_1.root:1000:1000'], # 'fname:firstevt:maxevts'
             ['nano_2.root','nano_3.root','nano_4.root'] ]
  Parameters:
    fnames  - list of input file names; modified in place (large files are replaced
              by their 'fname:firstevt:maxevts' split entries)
    maxevts - maximum number of events per chunk
    evenly  - rebalance maxevts so events are spread evenly over a large file's chunks
    verb    - verbosity level
  Returns the list of chunks.
  """
    result = []  # list of chunks
    nlarge = {}  # nevts -> files with nevts>=maxevts (will be split)
    nsmall = {}  # nevts -> files with nevts<maxevts (will be grouped)
    if verb >= 4:
        print(">>> chunkify_by_evts: events per file:")
    for fname in fnames[:]:  # iterate over a copy; fnames is modified in the loop
        if evtsplitexp.match(fname):  # already split; cannot be split again
            result.append([fname])
        else:  # get number of events from the file
            file = ensureTFile(fname, 'READ')
            nevts = file.Get('Events').GetEntries()
            file.Close()
            if verb >= 4:
                print("%10d %s" % (nevts, fname))
            if nevts < maxevts:  # small file: do not split, group with others in chunks, if possible
                nsmall.setdefault(nevts, []).append(fname)
            else:  # large file: split into several chunks below
                nlarge.setdefault(nevts, []).append(fname)
                fnames.remove(fname)  # replaced below by its 'fname:firstevt:maxevts' entries
    if verb >= 1:
        # count files in each category (len(nsmall)/len(nlarge) would only count distinct event counts)
        nsmallfiles = sum(len(flist) for flist in nsmall.values())
        nlargefiles = sum(len(flist) for flist in nlarge.values())
        print(">>> chunkify_by_evts: %d small files (<%d events) and %d large files (>=%d events)" % (
            nsmallfiles, maxevts, nlargefiles, maxevts))
    for nevts in nlarge:
        for fname in nlarge[nevts]:  # split large files into several chunks
            maxevts_ = maxevts
            if evenly:  # split events evenly over chunks
                nchunks = ceil(float(nevts) / maxevts)
                maxevts_ = int(ceil(nevts / nchunks))  # new maxevts per chunk
                if verb >= 3:
                    print(">>>   nevts/maxevts = %d/%d = %.2f => make %d chunks with max. %d events" % (
                        nevts, maxevts, nevts / float(maxevts), nchunks,
                        maxevts_))
            ifirst = 0  # first event to process in first chunk
            while ifirst < nevts:
                infname = "%s:%d:%d" % (fname, ifirst, maxevts_)
                fnames.append(infname)  # update for bookkeeping
                result.append([infname])
                ifirst += maxevts_
    mylist = []  # flat list of event counts, one entry per small file
    for nevts in nsmall:
        mylist.extend([nevts] * len(nsmall[nevts]))
    for part in partition_by_max(mylist,
                                 maxevts):  # group small files into one chunk
        result.append([])
        for nevts in part:
            fname = nsmall[nevts][0]
            nsmall[nevts].remove(fname)
            result[-1].append(fname)
    if verb >= 4:
        print(">>> chunkify_by_evts: chunks = [")
        for chunk in result:
            print(">>>   %s" % (chunk))
        print(">>> ]")
    return result