Esempio n. 1
0
class Looper(object):
    """Creates a set of analyzers, and schedules the event processing."""

    def __init__(self, name,
                 config,
                 nEvents=None,
                 firstEvent=0,
                 nPrint=0,
                 timeReport=False,
                 quiet=False):
        """Handles the processing of an event sample.

        An Analyzer is built for each Config.Analyzer present
        in sequence. The Looper can then be used to process an event,
        or a collection of events.

        Parameters:
        name    : name of the Looper, will be used as the output directory name
        config  : process configuration information, see Config
        nEvents : number of events to process. Defaults to all.
        firstEvent : first event to process. Defaults to the first one.
        nPrint  : number of events to print at the beginning
        timeReport : if true, per-analyzer timing is accumulated and a
                  summary table is logged at the end of loop()
        quiet   : if true, the logger only writes to log.txt and not to stdout

        Raises:
        ValueError   : the component has no input file, an entry of the
                       sequence/services has no class_object, or no output
                       directory could be prepared
        RuntimeError : the component requests fineSplit > 1 but does not
                       have exactly one input file
        """

        self.name = self._prepareOutput(name)
        self.outDir = self.name
        # self.logger writes to stdout and to log.txt.
        # configured in the users cfg by doing:
        # import logging
        #   logging.basicConfig(level=logging.ERROR)
        self.logger = logging.getLogger(self.name)
        self.logger.addHandler(
            logging.FileHandler('/'.join([self.name, 'log.txt'])))
        self.logger.propagate = False
        if not quiet:
            self.logger.addHandler(logging.StreamHandler(sys.stdout))

        self.cfg_comp = config.components[0]
        self.classes = {}
        # a real list (not a lazy map object): it is iterated many times
        self.analyzers = [self._build(anacfg) for anacfg in config.sequence]
        self.nEvents = nEvents
        self.firstEvent = firstEvent
        self.nPrint = int(nPrint)
        if timeReport:
            self.timeReport = [{'time': 0.0, 'events': 0}
                               for _ in self.analyzers]
        else:
            self.timeReport = False

        tree_name = None
        if hasattr(self.cfg_comp, 'tree_name'):
            tree_name = self.cfg_comp.tree_name
        if len(self.cfg_comp.files) == 0:
            errmsg = 'please provide at least an input file in the files attribute of this component\n' + str(self.cfg_comp)
            raise ValueError(errmsg)
        if hasattr(config, "preprocessor") and config.preprocessor is not None:
            # the preprocessor returns the component to be used from now on
            self.cfg_comp = config.preprocessor.run(self.cfg_comp,
                                                    self.outDir,
                                                    firstEvent,
                                                    nEvents)
        if hasattr(self.cfg_comp, "options"):
            print('%s %s' % (self.cfg_comp.files, self.cfg_comp.options))
            self.events = config.events_class(self.cfg_comp.files,
                                              tree_name,
                                              options=self.cfg_comp.options)
        else:
            self.events = config.events_class(self.cfg_comp.files, tree_name)

        if hasattr(self.cfg_comp, 'fineSplit'):
            fineSplitIndex, fineSplitFactor = self.cfg_comp.fineSplit
            if fineSplitFactor > 1:
                if len(self.cfg_comp.files) != 1:
                    raise RuntimeError("Any component with fineSplit > 1 is supposed to have just a single file, while %s has %s" % (self.cfg_comp.name, self.cfg_comp.files))
                # a false nEvents (None, 0) or -1 selects the whole sample
                if nEvents and int(nEvents) not in [-1, 0]:
                    totevents = min(len(self.events), int(nEvents))
                else:
                    totevents = len(self.events)
                # each split gets an equal share; the last one takes the rest
                self.nEvents = int(ceil(totevents / float(fineSplitFactor)))
                self.firstEvent = firstEvent + fineSplitIndex * self.nEvents
                if self.firstEvent + self.nEvents >= totevents:
                    self.nEvents = totevents - self.firstEvent
        # self.event is set in self.process
        self.event = None
        services = dict()
        for cfg_serv in config.services:
            services[cfg_serv.name] = self._build(cfg_serv)
        # would like to provide a copy of the config to the setup,
        # so that analyzers cannot modify the config of other analyzers.
        # but cannot copy the autofill config.
        self.setup = Setup(config, services)

    def _build(self, cfg):
        """Instantiate the object described by the configuration cfg.

        cfg.class_object is called with (cfg, self.cfg_comp, self.outDir).
        Raises ValueError if cfg has no class_object attribute.
        """
        try:
            theClass = cfg.class_object
        except AttributeError:
            if type(cfg) is type:
                # the user passed a class instead of a configuration object
                errfgmt = 'a class named {class_name}'.format(
                    class_name=cfg.__name__
                )
            else:
                errfgmt = 'an object of class {cfg_class}'.format(
                    cfg_class=cfg.__class__
                )
            err = (
                '\n'
                'The looper is trying to build an analyzer configured by {errfgmt}. \n'
                '\n'
                'Make sure that the configuration object is of class cfg.Analyzer.\n'
                '            '
            ).format(errfgmt=errfgmt)
            raise ValueError(err)
        return theClass(cfg, self.cfg_comp, self.outDir)

    def _prepareOutput(self, name):
        """Create the output directory and return the name actually used.

        If name already exists and is not empty, name_1, name_2, ... are
        tried in turn. An existing but empty directory is reused.
        Raises ValueError after 2000 unsuccessful attempts.
        """
        index = 0
        tmpname = name
        while index < 2000:
            try:
                os.mkdir(tmpname)
                break
            except OSError:
                # failed to create the directory; is it empty?
                if not os.listdir(tmpname):
                    break  # it is, so use it
                # if not we append a number to the directory name
                index += 1
                tmpname = '%s_%d' % (name, index)
        if index == 2000:
            raise ValueError("More than 2000 output folder with same name or 2000 attempts failed, please clean-up, change name or check permissions")
        return tmpname

    def _logProgress(self, iEv, logFirst, logNext):
        """Log a progress message every 100 events.

        The first message (sent to logFirst) starts the clock; the
        following ones (sent to logNext) also report the event rate.
        """
        if iEv % 100 != 0:
            return
        if not hasattr(self, 'start_time'):
            logFirst('event {iEv}'.format(iEv=iEv))
            self.start_time = timeit.default_timer()
            self.start_time_event = iEv
            return
        elapsed = float(timeit.default_timer() - self.start_time)
        # guard against a timer that did not advance
        rate = (iEv - self.start_time_event) / elapsed if elapsed > 0 else 0.0
        logNext('event %d (%.1f ev/s)' % (iEv, rate))

    def _logTimeReport(self, log):
        """Send the per-analyzer timing table to the callable log."""
        allev = max([x['events'] for x in self.timeReport])
        log("\n      ---- TimeReport (all times in ms; first evt is skipped) ---- ")
        log("%9s   %9s    %9s   %9s %6s   %s" % ("processed","all evts","time/proc", " time/all", "  [%] ", "analyer"))
        log("%9s   %9s    %9s   %9s %6s   %s" % ("---------","--------","---------", "---------", " -----", "-------------"))
        sumtime = sum(rep['time'] for rep in self.timeReport)
        passev = self.timeReport[-1]['events']
        for ana, rep in zip(self.analyzers, self.timeReport):
            timePerProcEv = rep['time'] / (rep['events'] - 1) if rep['events'] > 1 else 0
            timePerAllEv = rep['time'] / (allev - 1) if allev > 1 else 0
            # sumtime is zero when fewer than two events were processed
            fracAllEv = rep['time'] / sumtime if sumtime > 0 else 0
            log("%9d   %9d   %10.2f  %10.2f %5.1f%%   %s" % (rep['events'], allev, 1000*timePerProcEv, 1000*timePerAllEv, 100.0*fracAllEv, ana.name))
        totPerProcEv = sumtime / (passev - 1) if passev > 1 else 0
        totPerAllEv = sumtime / (allev - 1) if allev > 1 else 0
        log("%9s   %9s    %9s   %9s   %s" % ("---------","--------","---------", "---------", "-------------"))
        log("%9d   %9d   %10.2f  %10.2f %5.1f%%   %s" % (passev, allev, 1000*totPerProcEv, 1000*totPerAllEv, 100.0, "TOTAL"))
        log("")

    def loop(self):
        """Loop on a given number of events.

        At the beginning of the loop,
        Analyzer.beginLoop is called for each Analyzer.
        At each event, self.process is called.
        At the end of the loop, Analyzer.endLoop is called.

        The number of processed events is stored in self.nEvProcessed
        and appended to log.txt in the output directory.
        """
        nEvents = self.nEvents
        firstEvent = self.firstEvent
        self.nEvProcessed = 0
        nAvailable = len(self.events)
        # never run past the end of the sample
        if nEvents is None or firstEvent + int(nEvents) > nAvailable:
            nEvents = nAvailable - firstEvent
        else:
            nEvents = int(nEvents)
        self.logger.info(
            'starting loop at event {firstEvent} '
            'to process {nEvents} events.'.format(firstEvent=firstEvent,
                                                  nEvents=nEvents))
        self.logger.info(str(self.cfg_comp))
        for analyzer in self.analyzers:
            analyzer.beginLoop(self.setup)

        if hasattr(self.events, '__getitem__'):
            # events backend supports indexing, e.g. CMS, FCC, bare root
            for iEv in range(firstEvent, firstEvent + nEvents):
                self._logProgress(iEv, self.logger.info, self.logger.warning)
                try:
                    self.process(iEv)
                    self.nEvProcessed += 1
                    if iEv < self.nPrint:
                        print(self.event.__str__())
                except UserStop as err:
                    print('Stopped loop following a UserStop exception:')
                    print(err)
                    break
        else:
            # events backend does not support indexing, e.g. LCIO
            # NOTE(review): this branch runs to the end of the sample and
            # does not stop after nEvents events - confirm this is intended.
            iEv = 0
            for ii, event in enumerate(self.events):
                if ii < firstEvent:
                    continue
                iEv += 1
                self._logProgress(iEv, self.logger.warning, self.logger.info)
                try:
                    self.event = Event(iEv, event, self.setup)
                    self.iEvent = iEv
                    self._run_analyzers_on_event()
                    self.nEvProcessed += 1
                    if iEv < self.nPrint:
                        print(self.event.__str__())
                except UserStop as err:
                    print('Stopped loop following a UserStop exception:')
                    print(err)
                    break

        warning = self.logger.warning
        warning('')
        warning(self.cfg_comp)
        warning('')
        for analyzer in self.analyzers:
            analyzer.endLoop(self.setup)
        if self.timeReport:
            self._logTimeReport(warning)
        with open('/'.join([self.name, 'log.txt']), 'a') as logfile:
            logfile.write('number of events processed: {nEv}\n'.format(
                nEv=self.nEvProcessed)
            )

    def process(self, iEv):
        """Run event processing for all analyzers in the sequence.

        This function can be called directly from
        the python interpreter, to jump to a given event and process it.

        Returns the tuple given by _run_analyzers_on_event.
        Raises TypeError if the events backend does not support indexing.
        """
        if not hasattr(self.events, '__getitem__'):
            msg = (
                '\n'
                'Your events backend, of type \n'
                '{evclass}\n'
                'does not support indexing. \n'
                'Therefore, you cannot directly access a given event using Loop.process.\n'
                'However, you may still iterate on your events using Loop.loop, \n'
                'possibly skipping a number of events at the beginning.\n'
            ).format(evclass=self.events.__class__)
            raise TypeError(msg)
        self.event = Event(iEv, self.events[iEv], self.setup)
        self.iEvent = iEv
        return self._run_analyzers_on_event()

    def _run_analyzers_on_event(self):
        '''Run all analysers on the current event, self.event.

        Returns a tuple (success?, last_analyzer_name). The sequence stops
        at the first analyzer whose process() returns a value equal to
        False. With an empty sequence, (True, None) is returned.
        '''
        lastName = None
        for i, analyzer in enumerate(self.analyzers):
            if not analyzer.beginLoopCalled:
                analyzer.beginLoop(self.setup)
            start = timeit.default_timer()
            ret = analyzer.process(self.event)
            if self.timeReport:
                report = self.timeReport[i]
                report['events'] += 1
                # the first event is counted but not timed: the report
                # averages over (events - 1)
                if report['events'] > 1:
                    report['time'] += timeit.default_timer() - start
            # deliberately '== False': a None return value means "go on"
            if ret == False:
                return (False, analyzer.name)
            lastName = analyzer.name
        return (True, lastName)

    def write(self):
        """Writes all analyzers.

        See Analyzer.Write for more information.
        The setup is closed afterwards.
        """
        for analyzer in self.analyzers:
            analyzer.write(self.setup)
        self.setup.close()
Esempio n. 2
0
class Looper(object):
    """Creates a set of analyzers, and schedules the event processing."""

    def __init__(self, name,
                 config,
                 nEvents=None,
                 firstEvent=0,
                 nPrint=0,
                 timeReport=False,
                 quiet=False,
                 memCheckFromEvent=-1,
                 stopFlag=None):
        """Handles the processing of an event sample.

        An Analyzer is built for each Config.Analyzer present
        in sequence. The Looper can then be used to process an event,
        or a collection of events.

        Parameters:
        name    : name of the Looper, will be used as the output directory name
        config  : process configuration information, see Config
        nEvents : number of events to process. Defaults to all.
        firstEvent : first event to process. Defaults to the first one.
        nPrint  : number of events to print at the beginning
        timeReport : if true, per-analyzer timing is accumulated and a
                  summary table is logged at the end of loop()
        quiet   : if true, the logger only writes to log.txt and not to stdout
        memCheckFromEvent : from this event number on, ru_maxrss (as given by
                  resource.getrusage) is sampled before and after each
                  analyzer, and every increase is printed.
                  A negative value (the default) disables the check.

        stopFlag: it should be a multiprocessing.Value instance, that is set to 1
                  when this thread, or any other, receives a SIGUSR2 to ask for
                  a graceful job termination. In this case, the looper will also
                  set up a signal handler for SIGUSR2.
                  (if set to None, nothing of all this happens)

        Raises:
        ValueError   : the component has no input file, an entry of the
                       sequence/services has no class_object, or no output
                       directory could be prepared
        RuntimeError : the component requests fineSplit > 1 but does not
                       have exactly one input file
        """

        self.config = config
        self.name = self._prepareOutput(name)
        self.outDir = self.name
        # self.logger writes to log.txt and, unless quiet, to stdout
        self.logger = logging.getLogger(self.name)
        self.logger.addHandler(
            logging.FileHandler('/'.join([self.name, 'log.txt'])))
        self.logger.propagate = False
        if not quiet:
            self.logger.addHandler(logging.StreamHandler(sys.stdout))

        self.cfg_comp = config.components[0]
        self.classes = {}
        # keep track of analyzers in a list for sequential event processing
        self._analyzers = []
        # and in a dict for easy user access
        self._analyzer_dict = dict()
        for anacfg in self.config.sequence:
            anaobj = self._build(anacfg)
            self._analyzers.append(anaobj)
            self._analyzer_dict[anacfg.name] = anaobj
        self.nEvents = nEvents
        self.firstEvent = firstEvent
        self.nPrint = int(nPrint)
        if timeReport:
            self.timeReport = [{'time': 0.0, 'events': 0}
                               for _ in self._analyzers]
        else:
            self.timeReport = False
        self.memReportFirstEvent = memCheckFromEvent
        self.memLast = 0
        self.stopFlag = stopFlag
        if stopFlag:
            import signal

            def doSigUsr2(sig, frame):
                print('SIGUSR2 received, signaling graceful stop')
                self.stopFlag.value = 1
            signal.signal(signal.SIGUSR2, doSigUsr2)

        tree_name = None
        if hasattr(self.cfg_comp, 'tree_name'):
            tree_name = self.cfg_comp.tree_name
        if len(self.cfg_comp.files) == 0:
            errmsg = 'please provide at least an input file in the files attribute of this component\n' + str(self.cfg_comp)
            raise ValueError(errmsg)
        if hasattr(config, "preprocessor") and config.preprocessor is not None:
            # the preprocessor returns the component to be used from now on
            self.cfg_comp = config.preprocessor.run(self.cfg_comp,
                                                    self.outDir,
                                                    firstEvent,
                                                    nEvents)
        if hasattr(self.cfg_comp, "options"):
            print('%s %s' % (self.cfg_comp.files, self.cfg_comp.options))
            self.events = config.events_class(self.cfg_comp.files,
                                              tree_name,
                                              options=self.cfg_comp.options)
        else:
            self.events = config.events_class(self.cfg_comp.files, tree_name)

        if hasattr(self.cfg_comp, 'fineSplit'):
            fineSplitIndex, fineSplitFactor = self.cfg_comp.fineSplit
            if fineSplitFactor > 1:
                if len(self.cfg_comp.files) != 1:
                    raise RuntimeError("Any component with fineSplit > 1 is supposed to have just a single file, while %s has %s" % (self.cfg_comp.name, self.cfg_comp.files))
                # a false nEvents (None, 0) or -1 selects the whole sample
                if nEvents and int(nEvents) not in [-1, 0]:
                    totevents = min(len(self.events), int(nEvents))
                else:
                    totevents = len(self.events)
                # each split gets an equal share; the last one takes the rest
                self.nEvents = int(ceil(totevents / float(fineSplitFactor)))
                self.firstEvent = firstEvent + fineSplitIndex * self.nEvents
                if self.firstEvent + self.nEvents >= totevents:
                    self.nEvents = totevents - self.firstEvent
        # self.event is set in self.process
        self.event = None
        services = dict()
        for cfg_serv in config.services:
            services[cfg_serv.name] = self._build(cfg_serv)
        # would like to provide a copy of the config to the setup,
        # so that analyzers cannot modify the config of other analyzers.
        # but cannot copy the autofill config.
        self.setup = Setup(config, services)

    def _build(self, cfg):
        """Instantiate the object described by the configuration cfg.

        cfg.class_object is called with (cfg, self.cfg_comp, self.outDir).
        Raises ValueError if cfg has no class_object attribute.
        """
        try:
            theClass = cfg.class_object
        except AttributeError:
            if type(cfg) is type:
                # the user passed a class instead of a configuration object
                errfgmt = 'a class named {class_name}'.format(
                    class_name=cfg.__name__
                )
            else:
                errfgmt = 'an object of class {cfg_class}'.format(
                    cfg_class=cfg.__class__
                )
            err = (
                '\n'
                'The looper is trying to build an analyzer configured by {errfgmt}. \n'
                '\n'
                'Make sure that the configuration object is of class cfg.Analyzer.\n'
                '            '
            ).format(errfgmt=errfgmt)
            raise ValueError(err)
        return theClass(cfg, self.cfg_comp, self.outDir)

    #----------------------------------------------------------------------
    def analyzer(self, name):
        """@return: analyzer with this name."""
        return self._analyzer_dict[name]

    def _prepareOutput(self, name):
        """Create the output directory and return the name actually used.

        If name already exists and is not empty, name_1, name_2, ... are
        tried in turn. An existing but empty directory is reused.
        Raises ValueError after 2000 unsuccessful attempts.
        """
        index = 0
        tmpname = name
        while index < 2000:
            try:
                os.mkdir(tmpname)
                break
            except OSError:
                # failed to create the directory; is it empty?
                if not os.listdir(tmpname):
                    break  # it is, so use it
                # if not we append a number to the directory name
                index += 1
                tmpname = '%s_%d' % (name, index)
        if index == 2000:
            raise ValueError("More than 2000 output folder with same name or 2000 attempts failed, please clean-up, change name or check permissions")
        return tmpname

    def _logProgress(self, iEv, logFirst, logNext):
        """Log a progress message every 100 events.

        The first message (sent to logFirst) starts the clock; the
        following ones (sent to logNext) also report the event rate.
        """
        if iEv % 100 != 0:
            return
        if not hasattr(self, 'start_time'):
            logFirst('event {iEv}'.format(iEv=iEv))
            self.start_time = timeit.default_timer()
            self.start_time_event = iEv
            return
        elapsed = float(timeit.default_timer() - self.start_time)
        # guard against a timer that did not advance
        rate = (iEv - self.start_time_event) / elapsed if elapsed > 0 else 0.0
        logNext('event %d (%.1f ev/s)' % (iEv, rate))

    def _logTimeReport(self, log):
        """Send the per-analyzer timing table to the callable log."""
        allev = max([x['events'] for x in self.timeReport])
        log("\n      ---- TimeReport (all times in ms; first evt is skipped) ---- ")
        log("%9s   %9s    %9s   %9s %6s   %s" % ("processed","all evts","time/proc", " time/all", "  [%] ", "analyer"))
        log("%9s   %9s    %9s   %9s %6s   %s" % ("---------","--------","---------", "---------", " -----", "-------------"))
        sumtime = sum(rep['time'] for rep in self.timeReport)
        passev = self.timeReport[-1]['events']
        for ana, rep in zip(self._analyzers, self.timeReport):
            timePerProcEv = rep['time'] / (rep['events'] - 1) if rep['events'] > 1 else 0
            timePerAllEv = rep['time'] / (allev - 1) if allev > 1 else 0
            # sumtime is zero when fewer than two events were processed
            fracAllEv = rep['time'] / sumtime if sumtime > 0 else 0
            log("%9d   %9d   %10.2f  %10.2f %5.1f%%   %s" % (rep['events'], allev, 1000*timePerProcEv, 1000*timePerAllEv, 100.0*fracAllEv, ana.name))
        totPerProcEv = sumtime / (passev - 1) if passev > 1 else 0
        totPerAllEv = sumtime / (allev - 1) if allev > 1 else 0
        log("%9s   %9s    %9s   %9s   %s" % ("---------","--------","---------", "---------", "-------------"))
        log("%9d   %9d   %10.2f  %10.2f %5.1f%%   %s" % (passev, allev, 1000*totPerProcEv, 1000*totPerAllEv, 100.0, "TOTAL"))
        log("")

    def _reportMemJump(self, where, analyzer, iEv):
        """Print a message if ru_maxrss grew since the last sample.

        where is 'before analyzer' or 'in analyzer'; the new value is
        remembered in self.memLast.
        """
        memNow = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        if memNow > self.memLast:
            print("Mem Jump detected %s %s at event %s. RSS(before,after,difference) %s %s %s " % (where, analyzer.name, iEv, self.memLast, memNow, memNow - self.memLast))
        self.memLast = memNow

    def loop(self):
        """Loop on a given number of events.

        At the beginning of the loop,
        Analyzer.beginLoop is called for each Analyzer.
        At each event, self.process is called.
        At the end of the loop, Analyzer.endLoop is called.

        The loop ends early on a UserStop exception, or once the stopFlag
        given to the constructor has a true value.
        The number of processed events is stored in self.nEvProcessed
        and appended to log.txt in the output directory.
        """
        nEvents = self.nEvents
        firstEvent = self.firstEvent
        self.nEvProcessed = 0
        nAvailable = len(self.events)
        # never run past the end of the sample
        if nEvents is None or firstEvent + int(nEvents) > nAvailable:
            nEvents = nAvailable - firstEvent
        else:
            nEvents = int(nEvents)
        self.logger.info(
            'starting loop at event {firstEvent} '
            'to process {nEvents} events.'.format(firstEvent=firstEvent,
                                                  nEvents=nEvents))
        self.logger.info(str(self.cfg_comp))
        for analyzer in self._analyzers:
            analyzer.beginLoop(self.setup)

        if hasattr(self.events, '__getitem__'):
            # events backend supports indexing, e.g. CMS, FCC, bare root
            for iEv in range(firstEvent, firstEvent + nEvents):
                self._logProgress(iEv, self.logger.info, self.logger.warning)
                try:
                    self.process(iEv)
                    self.nEvProcessed += 1
                    if iEv < self.nPrint:
                        self.logger.info(self.event.__str__())
                    if self.stopFlag and self.stopFlag.value:
                        print('stopping gracefully at event %d' % (iEv))
                        break
                except UserStop as err:
                    print('Stopped loop following a UserStop exception:')
                    print(err)
                    break
        else:
            # events backend does not support indexing, e.g. LCIO
            # NOTE(review): this branch runs to the end of the sample and
            # does not stop after nEvents events - confirm this is intended.
            iEv = 0
            for ii, event in enumerate(self.events):
                if ii < firstEvent:
                    continue
                iEv += 1
                self._logProgress(iEv, self.logger.warning, self.logger.info)
                try:
                    self.event = Event(iEv, event, self.setup)
                    self.iEvent = iEv
                    self._run_analyzers_on_event()
                    self.nEvProcessed += 1
                    if iEv < self.nPrint:
                        self.logger.info(self.event.__str__())
                    if self.stopFlag and self.stopFlag.value:
                        print('stopping gracefully at event %d' % (iEv))
                        break
                except UserStop as err:
                    print('Stopped loop following a UserStop exception:')
                    print(err)
                    break

        warning = self.logger.warning
        warning('')
        warning(self.cfg_comp)
        warning('')
        for analyzer in self._analyzers:
            analyzer.endLoop(self.setup)
        if self.timeReport:
            self._logTimeReport(warning)
        with open('/'.join([self.name, 'log.txt']), 'a') as logfile:
            logfile.write('number of events processed: {nEv}\n'.format(
                nEv=self.nEvProcessed)
            )

    def process(self, iEv):
        """Run event processing for all analyzers in the sequence.

        This function can be called directly from
        the python interpreter, to jump to a given event and process it.

        Returns the tuple given by _run_analyzers_on_event.
        Raises TypeError if the events backend does not support indexing.
        """
        if not hasattr(self.events, '__getitem__'):
            msg = (
                '\n'
                'Your events backend, of type \n'
                '{evclass}\n'
                'does not support indexing. \n'
                'Therefore, you cannot directly access a given event using Loop.process.\n'
                'However, you may still iterate on your events using Loop.loop, \n'
                'possibly skipping a number of events at the beginning.\n'
            ).format(evclass=self.events.__class__)
            raise TypeError(msg)
        self.event = Event(iEv, self.events[iEv], self.setup)
        self.iEvent = iEv
        return self._run_analyzers_on_event()

    def _run_analyzers_on_event(self):
        '''Run all analysers on the current event, self.event.

        Returns a tuple (success?, last_analyzer_name). The sequence stops
        at the first analyzer whose process() returns a value equal to
        False. With an empty sequence, (True, None) is returned.
        '''
        # the event number is the one stored by process() / loop()
        iEv = getattr(self, 'iEvent', None)
        memCheck = (self.memReportFirstEvent >= 0
                    and iEv is not None
                    and iEv >= self.memReportFirstEvent)
        lastName = None
        for i, analyzer in enumerate(self._analyzers):
            if not analyzer.beginLoopCalled:
                analyzer.beginLoop(self.setup)
            start = timeit.default_timer()
            if memCheck:
                self._reportMemJump('before analyzer', analyzer, iEv)
            ret = analyzer.process(self.event)
            if memCheck:
                self._reportMemJump('in analyzer', analyzer, iEv)
            if self.timeReport:
                report = self.timeReport[i]
                report['events'] += 1
                # the first event is counted but not timed: the report
                # averages over (events - 1)
                if report['events'] > 1:
                    report['time'] += timeit.default_timer() - start
            # deliberately '== False': a None return value means "go on"
            if ret == False:
                return (False, analyzer.name)
            lastName = analyzer.name
        return (True, lastName)

    def write(self):
        """Writes all analyzers.

        See Analyzer.Write for more information.
        The setup is closed afterwards.
        """
        for analyzer in self._analyzers:
            analyzer.write(self.setup)
        self.setup.close()
Esempio n. 3
0
class Looper(object):
    """Creates a set of analyzers, and schedules the event processing."""

    def __init__( self, name,
                  config, 
                  nEvents=None,
                  firstEvent=0,
                  nPrint=0,
                  timeReport=False,
                  quiet=False):
        """Handles the processing of an event sample.
        An Analyzer is built for each Config.Analyzer present
        in sequence. The Looper can then be used to process an event,
        or a collection of events.

        Parameters:
        name    : name of the Looper, will be used as the output directory name
        config  : process configuration information, see Config
        nEvents : number of events to process. Defaults to all.
        firstEvent : first event to process. Defaults to the first one.
        nPrint  : number of events to print at the beginning
        timeReport : if true, the time spent in each analyzer is accumulated
                  and a table is logged at the end of loop()
        quiet   : if true, the logger does not write to stdout

        Raises:
        ValueError   : the component has no input file, no output directory
                  could be created, or a configuration object has no
                  class_object attribute
        RuntimeError : the component has fineSplit > 1 and more than one file
        """

        self.name = self._prepareOutput(name)
        self.outDir = self.name
        # the logger always writes to <outDir>/log.txt,
        # and also to stdout unless quiet is set
        self.logger = logging.getLogger( self.name )
        self.logger.addHandler(logging.FileHandler('/'.join([self.name,
                                                             'log.txt'])))
        self.logger.propagate = False
        if not quiet: 
            self.logger.addHandler( logging.StreamHandler(sys.stdout) )

        self.cfg_comp = config.components[0]
        self.classes = {}
        # a real list is needed: the analyzers are iterated several times
        self.analyzers = [ self._build(cfg_ana) for cfg_ana in config.sequence ]
        self.nEvents = nEvents
        self.firstEvent = firstEvent
        self.nPrint = int(nPrint)
        self.timeReport = [ {'time':0.0,'events':0} for a in self.analyzers ] if timeReport else False
        tree_name = None
        if( hasattr(self.cfg_comp, 'tree_name') ):
            tree_name = self.cfg_comp.tree_name
        if len(self.cfg_comp.files)==0:
            errmsg = 'please provide at least an input file in the files attribute of this component\n' + str(self.cfg_comp)
            raise ValueError( errmsg )
        if hasattr(config,"preprocessor") and config.preprocessor is not None :
            # the preprocessor returns the component to be used from now on
            self.cfg_comp = config.preprocessor.run(self.cfg_comp,self.outDir,firstEvent,nEvents)
        if hasattr(self.cfg_comp,"options"):
            print('%s %s' % (self.cfg_comp.files, self.cfg_comp.options))
            self.events = config.events_class(self.cfg_comp.files, tree_name,options=self.cfg_comp.options)
        else :
            self.events = config.events_class(self.cfg_comp.files, tree_name)
        if hasattr(self.cfg_comp, 'fineSplit'):
            fineSplitIndex, fineSplitFactor = self.cfg_comp.fineSplit
            if fineSplitFactor > 1:
                if len(self.cfg_comp.files) != 1:
                    raise RuntimeError("Any component with fineSplit > 1 is supposed to have just a single file, while %s has %s" % (self.cfg_comp.name, self.cfg_comp.files))
                # nEvents of None, 0 or -1 means all events of the sample
                totevents = min(len(self.events),int(nEvents)) if (nEvents and int(nEvents) not in [-1,0]) else len(self.events)
                # each of the fineSplitFactor chunks gets the same size,
                # except the last one which is shortened below
                self.nEvents = int(ceil(totevents/float(fineSplitFactor)))
                self.firstEvent = firstEvent + fineSplitIndex * self.nEvents
                # NOTE(review): totevents is a count, while
                # self.firstEvent + self.nEvents includes the firstEvent
                # offset given by the caller; confirm the intended
                # behaviour when firstEvent > 0.
                if self.firstEvent + self.nEvents >= totevents:
                    self.nEvents = totevents - self.firstEvent 
        # self.event is set in self.process
        self.event = None
        services = dict()
        for cfg_serv in config.services:
            services[cfg_serv.name] = self._build(cfg_serv)
        # would like to provide a copy of the config to the setup,
        # so that analyzers cannot modify the config of other analyzers. 
        # but cannot copy the autofill config.
        self.setup = Setup(config, services)

    def _build(self, cfg):
        """Instantiate the class referenced by a configuration object.

        cfg.class_object is called with (cfg, self.cfg_comp, self.outDir)
        and the resulting object is returned.

        Raises ValueError if cfg has no class_object attribute.
        """
        try: 
            theClass = cfg.class_object
        except AttributeError:
            # tell the user whether a class or an instance was given
            if type(cfg) is type:
                errfgmt = 'a class named {class_name}'.format(
                    class_name=cfg.__name__
                )
            else:
                errfgmt = 'an object of class {cfg_class}'.format(
                    cfg_class=cfg.__class__
                )
            err='''
The looper is trying to build an analyzer configured by {errfgmt}. 

Make sure that the configuration object is of class cfg.Analyzer.
            '''.format(errfgmt=errfgmt)
            raise ValueError(err)
        return theClass( cfg, self.cfg_comp, self.outDir )
        
    def _prepareOutput(self, name):
        """Create the output directory and return its name.

        name is tried first, then name_1, name_2, ... up to name_1999,
        until os.mkdir succeeds.

        Raises ValueError when all 2000 attempts fail with an OSError.
        """
        tmpname = name
        for index in range(1, 2001):
            try:
                os.mkdir( tmpname )
                return tmpname
            except OSError:
                tmpname = '%s_%d' % (name, index)
        raise ValueError( "More than 2000 output folder with same name or 2000 attempts failed, please clean-up, change name or check permissions")


    def loop(self):
        """Loop on a given number of events.

        At the beginning of the loop, 
        Analyzer.beginLoop is called for each Analyzer.
        At each event, self.process is called.
        At the end of the loop, Analyzer.endLoop is called.

        The loop starts at self.firstEvent, and never goes beyond
        the last event of self.events. A UserWarning raised during
        the processing stops the loop; endLoop is still called.
        """
        firstEvent = self.firstEvent
        nEvents = self.nEvents
        if nEvents is None or int(nEvents) > len(self.events) :
            nEvents = len(self.events)
        else:
            nEvents = int(nEvents)
        # only len(self.events) - firstEvent events are left
        # when starting at firstEvent
        eventSize = min(nEvents, max(0, len(self.events) - firstEvent))
        self.logger.info(
            'starting loop at event {firstEvent} '\
                'to process {eventSize} events.'.format(firstEvent=firstEvent,
                                                        eventSize=eventSize))
        self.logger.info( str( self.cfg_comp ) )
        for analyzer in self.analyzers:
            analyzer.beginLoop(self.setup)
        # counted explicitly: the last event index is not a count when
        # firstEvent > 0, when the range is empty, or when the loop is stopped
        nProcessed = 0
        try:
            for iEv in range(firstEvent, firstEvent+eventSize):
                if iEv%100 ==0:
                    if not hasattr(self,'start_time'):
                        print('event %s' % iEv)
                        self.start_time = timeit.default_timer()
                        self.start_time_event = iEv
                    else:
                        print('event %d (%.1f ev/s)' % (iEv, (iEv-self.start_time_event)/float(timeit.default_timer() - self.start_time)))

                self.process( iEv )
                nProcessed += 1
                if iEv<self.nPrint:
                    print(self.event)

        except UserWarning:
            print('Stopped loop following a UserWarning exception')

        warning = self.logger.warning
        warning('number of events processed: {nEv}'.format(nEv=nProcessed))
        warning('')
        warning( self.cfg_comp )
        warning('')        
        for analyzer in self.analyzers:
            analyzer.endLoop(self.setup)
        if self.timeReport:
            self._logTimeReport(warning)

    def _logTimeReport(self, log):
        """Write the per-analyzer timing table, one call to log per line.

        log is a callable taking the message, e.g. self.logger.warning.
        self.timeReport must be a non-empty list of timing records.
        """
        allev = max([x['events'] for x in self.timeReport])
        log("\n      ---- TimeReport (all times in ms; first evt is skipped) ---- ")
        log("%9s   %9s    %9s   %9s %6s   %s" % ("processed","all evts","time/proc", " time/all", "  [%] ", "analyer"))
        log("%9s   %9s    %9s   %9s %6s   %s" % ("---------","--------","---------", "---------", " -----", "-------------"))
        sumtime = sum(rep['time'] for rep in self.timeReport)
        passev  = self.timeReport[-1]['events']
        for ana,rep in zip(self.analyzers,self.timeReport):
            timePerProcEv = rep['time']/(rep['events']-1) if rep['events'] > 1 else 0
            timePerAllEv  = rep['time']/(allev-1)         if allev > 1         else 0
            # sumtime is 0.0 when no event was timed
            fracAllEv     = rep['time']/sumtime           if sumtime > 0       else 0
            log( "%9d   %9d   %10.2f  %10.2f %5.1f%%   %s" % ( rep['events'], allev, 1000*timePerProcEv, 1000*timePerAllEv, 100.0*fracAllEv, ana.name))
        totPerProcEv = sumtime/(passev-1) if passev > 1 else 0
        totPerAllEv  = sumtime/(allev-1)  if allev > 1  else 0
        log("%9s   %9s    %9s   %9s   %s" % ("---------","--------","---------", "---------", "-------------"))
        log("%9d   %9d   %10.2f  %10.2f %5.1f%%   %s" % ( passev, allev, 1000*totPerProcEv, 1000*totPerAllEv, 100.0, "TOTAL"))
        log("")

    def process(self, iEv ):
        """Run event processing for all analyzers in the sequence.

        This function is called by self.loop,
        but can also be called directly from
        the python interpreter, to jump to a given event.

        Returns a tuple (success, analyzer name). success is False as
        soon as an analyzer returns False, and the name is then the one
        of this analyzer. Otherwise the name is the one of the last
        analyzer, or None if there is no analyzer.
        """
        self.event = Event(iEv, self.events[iEv], self.setup)
        self.iEvent = iEv
        analyzer = None
        for i,analyzer in enumerate(self.analyzers):
            if not analyzer.beginLoopCalled:
                analyzer.beginLoop(self.setup)
            start = timeit.default_timer()
            ret = analyzer.process( self.event )
            if self.timeReport:
                report = self.timeReport[i]
                report['events'] += 1
                # the first event seen by an analyzer is not timed:
                # the time report divides by events-1
                if report['events'] > 1:
                    report['time'] += timeit.default_timer() - start
            if ret == False:
                return (False, analyzer.name)
        if iEv<self.nPrint:
            self.logger.info( self.event.__str__() )
        return (True, analyzer.name if analyzer is not None else None)

    def write(self):
        """Writes all analyzers.

        Analyzer.write is called on each analyzer with self.setup,
        then self.setup is closed.
        """
        for analyzer in self.analyzers:
            analyzer.write(self.setup)
        self.setup.close() 
Esempio n. 4
0
class Looper(object):
    """Creates a set of analyzers, and schedules the event processing."""

    def __init__( self, name,
                  config, 
                  nEvents=None,
                  firstEvent=0,
                  nPrint=0,
                  timeReport=False,
                  quiet=False,
                  memCheckFromEvent=-1,
                  stopFlag = None):
        """Handles the processing of an event sample.
        An Analyzer is built for each Config.Analyzer present
        in sequence. The Looper can then be used to process an event,
        or a collection of events.

        Parameters:
        name    : name of the Looper, will be used as the output directory name
        config  : process configuration information, see Config
        nEvents : number of events to process. Defaults to all.
        firstEvent : first event to process. Defaults to the first one.
        nPrint  : number of events to print at the beginning
        timeReport : if true, the time spent in each analyzer is accumulated
                  and a table is logged at the end of loop()
        quiet   : if true, the logger does not write to stdout
        memCheckFromEvent : index of the first event for which ru_maxrss is
                  checked before and after each analyzer.
                  A negative value disables the check.
    
        stopFlag: it should be a multiprocessing.Value instance, that is set to 1 
                  when this thread, or any other, receives a SIGUSR2 to ask for
                  a graceful job termination. In this case, the looper will also
                  set up a signal handler for SIGUSR2.
                  (if set to None, nothing of all this happens)

        Raises:
        ValueError   : the component has no input file, or no output
                  directory could be created
        RuntimeError : the component has fineSplit > 1 and more than one file
        """

        self.config = config
        self.name = self._prepareOutput(name)
        self.outDir = self.name
        # the logger always writes to <outDir>/log.txt,
        # and also to stdout unless quiet is set
        self.logger = logging.getLogger( self.name )
        self.logger.addHandler(logging.FileHandler('/'.join([self.name,
                                                             'log.txt'])))
        self.logger.propagate = False
        if not quiet: 
            self.logger.addHandler( logging.StreamHandler(sys.stdout) )

        self.cfg_comp = config.components[0]
        self.classes = {}
        # a real list is needed: the analyzers are iterated several times
        self.analyzers = [ self._build(cfg_ana) for cfg_ana in config.sequence ]
        self.nEvents = nEvents
        self.firstEvent = firstEvent
        self.nPrint = int(nPrint)
        self.timeReport = [ {'time':0.0,'events':0} for a in self.analyzers ] if timeReport else False
        self.memReportFirstEvent = memCheckFromEvent
        self.memLast=0
        self.stopFlag = stopFlag
        if stopFlag:
            import signal
            def doSigUsr2(sig,frame):
                print('SIGUSR2 received, signaling graceful stop')
                self.stopFlag.value = 1
            signal.signal(signal.SIGUSR2, doSigUsr2)
        tree_name = None
        if( hasattr(self.cfg_comp, 'tree_name') ):
            tree_name = self.cfg_comp.tree_name
        if len(self.cfg_comp.files)==0:
            errmsg = 'please provide at least an input file in the files attribute of this component\n' + str(self.cfg_comp)
            raise ValueError( errmsg )
        if hasattr(config,"preprocessor") and config.preprocessor is not None :
            # the preprocessor returns the component to be used from now on
            self.cfg_comp = config.preprocessor.run(self.cfg_comp,self.outDir,firstEvent,nEvents)
        if hasattr(self.cfg_comp,"options"):
            print('%s %s' % (self.cfg_comp.files, self.cfg_comp.options))
            self.events = config.events_class(self.cfg_comp.files, tree_name,options=self.cfg_comp.options)
        else :
            self.events = config.events_class(self.cfg_comp.files, tree_name)
        if hasattr(self.cfg_comp, 'fineSplit'):
            fineSplitIndex, fineSplitFactor = self.cfg_comp.fineSplit
            if fineSplitFactor > 1:
                if len(self.cfg_comp.files) != 1:
                    raise RuntimeError("Any component with fineSplit > 1 is supposed to have just a single file, while %s has %s" % (self.cfg_comp.name, self.cfg_comp.files))
                # nEvents of None, 0 or -1 means all events of the sample
                totevents = min(len(self.events),int(nEvents)) if (nEvents and int(nEvents) not in [-1,0]) else len(self.events)
                # each of the fineSplitFactor chunks gets the same size,
                # except the last one which is shortened below
                self.nEvents = int(ceil(totevents/float(fineSplitFactor)))
                self.firstEvent = firstEvent + fineSplitIndex * self.nEvents
                # NOTE(review): totevents is a count, while
                # self.firstEvent + self.nEvents includes the firstEvent
                # offset given by the caller; confirm the intended
                # behaviour when firstEvent > 0.
                if self.firstEvent + self.nEvents >= totevents:
                    self.nEvents = totevents - self.firstEvent 
        # self.event is set in self.process
        self.event = None
        services = dict()
        for cfg_serv in config.services:
            services[cfg_serv.name] = self._build(cfg_serv)
        # would like to provide a copy of the config to the setup,
        # so that analyzers cannot modify the config of other analyzers. 
        # but cannot copy the autofill config.
        self.setup = Setup(config, services)

    def _build(self, cfg):
        """Instantiate the class referenced by a configuration object.

        cfg.class_object is called with (cfg, self.cfg_comp, self.outDir)
        and the resulting object is returned.
        """
        theClass = cfg.class_object
        return theClass( cfg, self.cfg_comp, self.outDir )
        
    def _prepareOutput(self, name):
        """Create the output directory and return its name.

        name is tried first, then name_1, name_2, ... up to name_1999,
        until os.mkdir succeeds.

        Raises ValueError when all 2000 attempts fail with an OSError.
        """
        tmpname = name
        for index in range(1, 2001):
            try:
                os.mkdir( tmpname )
                return tmpname
            except OSError:
                tmpname = '%s_%d' % (name, index)
        raise ValueError( "More than 2000 output folder with same name or 2000 attempts failed, please clean-up, change name or check permissions")


    def loop(self):
        """Loop on a given number of events.

        At the beginning of the loop, 
        Analyzer.beginLoop is called for each Analyzer.
        At each event, self.process is called.
        At the end of the loop, Analyzer.endLoop is called,
        then endLoop of the events and of the preprocessor, when they
        provide one.

        The loop starts at self.firstEvent, and never goes beyond
        the last event of self.events. It stops early if a UserWarning
        is raised during the processing, or once self.stopFlag.value is
        set; endLoop is still called in both cases.
        """
        firstEvent = self.firstEvent
        nEvents = self.nEvents
        if nEvents is None or int(nEvents) > len(self.events) :
            nEvents = len(self.events)
        else:
            nEvents = int(nEvents)
        # only len(self.events) - firstEvent events are left
        # when starting at firstEvent
        eventSize = min(nEvents, max(0, len(self.events) - firstEvent))
        self.logger.info(
            'starting loop at event {firstEvent} '\
                'to process {eventSize} events.'.format(firstEvent=firstEvent,
                                                        eventSize=eventSize))
        self.logger.info( str( self.cfg_comp ) )
        for analyzer in self.analyzers:
            analyzer.beginLoop(self.setup)
        # counted explicitly: the last event index is not a count when
        # firstEvent > 0, when the range is empty, or when the loop is stopped
        nProcessed = 0
        try:
            for iEv in range(firstEvent, firstEvent+eventSize):
                if iEv%100 ==0:
                    if not hasattr(self,'start_time'):
                        print('event %s' % iEv)
                        self.start_time = timeit.default_timer()
                        self.start_time_event = iEv
                    else:
                        print('event %d (%.1f ev/s)' % (iEv, (iEv-self.start_time_event)/float(timeit.default_timer() - self.start_time)))

                self.process( iEv )
                nProcessed += 1
                if iEv<self.nPrint:
                    print(self.event)
                if self.stopFlag and self.stopFlag.value:
                    print('stopping gracefully at event %d' % (iEv))
                    break

        except UserWarning:
            print('Stopped loop following a UserWarning exception')

        info = self.logger.info
        warning = self.logger.warning
        warning('number of events processed: {nEv}'.format(nEv=nProcessed))
        warning('')
        info( self.cfg_comp )
        info('')        
        for analyzer in self.analyzers:
            analyzer.endLoop(self.setup)
        if self.timeReport:
            self._logTimeReport(warning)
        if hasattr(self.events, 'endLoop'): self.events.endLoop()
        if hasattr(self.config,"preprocessor") and self.config.preprocessor is not None:
            if hasattr(self.config.preprocessor,"endLoop"):
                self.config.preprocessor.endLoop(self.cfg_comp)

    def _logTimeReport(self, log):
        """Write the per-analyzer timing table, one call to log per line.

        log is a callable taking the message, e.g. self.logger.warning.
        self.timeReport must be a non-empty list of timing records.
        """
        allev = max([x['events'] for x in self.timeReport])
        log("\n      ---- TimeReport (all times in ms; first evt is skipped) ---- ")
        log("%9s   %9s    %9s   %9s %6s   %s" % ("processed","all evts","time/proc", " time/all", "  [%] ", "analyer"))
        log("%9s   %9s    %9s   %9s %6s   %s" % ("---------","--------","---------", "---------", " -----", "-------------"))
        sumtime = sum(rep['time'] for rep in self.timeReport)
        passev  = self.timeReport[-1]['events']
        for ana,rep in zip(self.analyzers,self.timeReport):
            timePerProcEv = rep['time']/(rep['events']-1) if rep['events'] > 1 else 0
            timePerAllEv  = rep['time']/(allev-1)         if allev > 1         else 0
            # sumtime is 0.0 when no event was timed
            fracAllEv     = rep['time']/sumtime           if sumtime > 0       else 0
            log( "%9d   %9d   %10.2f  %10.2f %5.1f%%   %s" % ( rep['events'], allev, 1000*timePerProcEv, 1000*timePerAllEv, 100.0*fracAllEv, ana.name))
        totPerProcEv = sumtime/(passev-1) if passev > 1 else 0
        totPerAllEv  = sumtime/(allev-1)  if allev > 1  else 0
        log("%9s   %9s    %9s   %9s   %s" % ("---------","--------","---------", "---------", "-------------"))
        log("%9d   %9d   %10.2f  %10.2f %5.1f%%   %s" % ( passev, allev, 1000*totPerProcEv, 1000*totPerAllEv, 100.0, "TOTAL"))
        log("")

    def _checkMemJump(self, where, analyzer, iEv):
        """Print a message if ru_maxrss grew since the previous check.

        Does nothing unless self.memReportFirstEvent is >= 0 and iEv has
        reached it. where is the word inserted in the message, 'before'
        or 'in'. self.memLast is updated to the value just read.
        """
        if self.memReportFirstEvent >=0 and iEv >= self.memReportFirstEvent:
            memNow=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
            if memNow > self.memLast :
                print("Mem Jump detected %s analyzer %s at event %s. RSS(before,after,difference) %s %s %s "%( where, analyzer.name, iEv, self.memLast, memNow, memNow-self.memLast))
            self.memLast=memNow

    def process(self, iEv ):
        """Run event processing for all analyzers in the sequence.

        This function is called by self.loop,
        but can also be called directly from
        the python interpreter, to jump to a given event.

        Returns a tuple (success, analyzer name). success is False as
        soon as an analyzer returns False, and the name is then the one
        of this analyzer. Otherwise the name is the one of the last
        analyzer, or None if there is no analyzer.
        """
        self.event = Event(iEv, self.events[iEv], self.setup)
        self.iEvent = iEv
        analyzer = None
        for i,analyzer in enumerate(self.analyzers):
            if not analyzer.beginLoopCalled:
                analyzer.beginLoop(self.setup)
            start = timeit.default_timer()
            self._checkMemJump('before', analyzer, iEv)
            ret = analyzer.process( self.event )
            self._checkMemJump('in', analyzer, iEv)
            if self.timeReport:
                report = self.timeReport[i]
                report['events'] += 1
                # the first event seen by an analyzer is not timed:
                # the time report divides by events-1
                if report['events'] > 1:
                    report['time'] += timeit.default_timer() - start
            if ret == False:
                return (False, analyzer.name)
        if iEv<self.nPrint:
            self.logger.info( self.event.__str__() )
        return (True, analyzer.name if analyzer is not None else None)

    def write(self):
        """Writes all analyzers.

        Analyzer.write is called on each analyzer with self.setup,
        then self.setup is closed.
        """
        for analyzer in self.analyzers:
            analyzer.write(self.setup)
        self.setup.close() 
Esempio n. 5
0
	def __str__( self ):
		"""Return the Event description followed by the uri and clsName attributes."""
		base = Event.__str__(self)
		uri = str(self.uri)
		clsName = str(self.clsName)
		return '%s, uri: %s, clsName: %s' % ( base, uri, clsName )
Esempio n. 6
0
class Looper(object):
    """Creates a set of analyzers, and schedules the event processing."""
    def __init__(self,
                 name,
                 config,
                 nEvents=None,
                 firstEvent=0,
                 nPrint=0,
                 timeReport=False,
                 quiet=False):
        """Handles the processing of an event sample.
        An Analyzer is built for each Config.Analyzer present
        in sequence. The Looper can then be used to process an event,
        or a collection of events.

        Parameters:
        name    : name of the Looper, will be used as the output directory name
        config  : process configuration information, see Config
        nEvents : number of events to process. Defaults to all.
        firstEvent : first event to process. Defaults to the first one.
        nPrint  : number of events to print at the beginning
        timeReport : if true, the time spent in each analyzer is accumulated
                  and a table is logged at the end of loop()
        quiet   : if true, the logger does not write to stdout

        Raises:
        ValueError   : the component has no input file, or no output
                  directory could be created
        RuntimeError : the component has fineSplit > 1 and more than one file
        """

        self.name = self._prepareOutput(name)
        self.outDir = self.name
        # the logger always writes to <outDir>/log.txt,
        # and also to stdout unless quiet is set
        self.logger = logging.getLogger(self.name)
        self.logger.addHandler(
            logging.FileHandler('/'.join([self.name, 'log.txt'])))
        self.logger.propagate = False
        if not quiet:
            self.logger.addHandler(logging.StreamHandler(sys.stdout))

        self.cfg_comp = config.components[0]
        self.classes = {}
        # a real list is needed: the analyzers are iterated several times
        self.analyzers = [self._build(cfg_ana) for cfg_ana in config.sequence]
        self.nEvents = nEvents
        self.firstEvent = firstEvent
        self.nPrint = int(nPrint)
        self.timeReport = [{
            'time': 0.0,
            'events': 0
        } for a in self.analyzers] if timeReport else False
        tree_name = None
        if (hasattr(self.cfg_comp, 'tree_name')):
            tree_name = self.cfg_comp.tree_name
        if len(self.cfg_comp.files) == 0:
            errmsg = 'please provide at least an input file in the files attribute of this component\n' + str(
                self.cfg_comp)
            raise ValueError(errmsg)
        self.events = config.events_class(self.cfg_comp.files, tree_name)
        if hasattr(self.cfg_comp, 'fineSplit'):
            fineSplitIndex, fineSplitFactor = self.cfg_comp.fineSplit
            if fineSplitFactor > 1:
                if len(self.cfg_comp.files) != 1:
                    raise RuntimeError(
                        "Any component with fineSplit > 1 is supposed to have just a single file, while %s has %s"
                        % (self.cfg_comp.name, self.cfg_comp.files))
                # nEvents of None, 0 or -1 means all events of the sample
                totevents = min(len(self.events), int(nEvents)) if (
                    nEvents and int(nEvents) not in [-1, 0]) else len(
                        self.events)
                # every chunk gets the same size; loop() stops at the
                # end of the sample if the last chunk is too long
                self.nEvents = int(ceil(totevents / float(fineSplitFactor)))
                self.firstEvent = firstEvent + fineSplitIndex * self.nEvents
        # self.event is set in self.process
        self.event = None
        services = dict()
        for cfg_serv in config.services:
            services[cfg_serv.name] = self._build(cfg_serv)
        # would like to provide a copy of the config to the setup,
        # so that analyzers cannot modify the config of other analyzers.
        # but cannot copy the autofill config.
        self.setup = Setup(config, services)

    def _build(self, cfg):
        """Instantiate the class referenced by a configuration object.

        cfg.class_object is called with (cfg, self.cfg_comp, self.outDir)
        and the resulting object is returned.
        """
        theClass = cfg.class_object
        return theClass(cfg, self.cfg_comp, self.outDir)

    def _prepareOutput(self, name):
        """Create the output directory and return its name.

        name is tried first, then name_1, name_2, ... up to name_1999,
        until os.mkdir succeeds.

        Raises ValueError when all 2000 attempts fail with an OSError.
        """
        tmpname = name
        for index in range(1, 2001):
            try:
                os.mkdir(tmpname)
                return tmpname
            except OSError:
                tmpname = '%s_%d' % (name, index)
        raise ValueError(
            "More than 2000 output folder with same name or 2000 attempts failed, please clean-up, change name or check permissions"
        )

    def loop(self):
        """Loop on a given number of events.

        At the beginning of the loop, 
        Analyzer.beginLoop is called for each Analyzer.
        At each event, self.process is called.
        At the end of the loop, Analyzer.endLoop is called.

        The loop starts at self.firstEvent, and never goes beyond
        the last event of self.events. A UserWarning raised during
        the processing stops the loop; endLoop is still called.
        """
        firstEvent = self.firstEvent
        nEvents = self.nEvents
        if nEvents is None or int(nEvents) > len(self.events):
            nEvents = len(self.events)
        else:
            nEvents = int(nEvents)
        # only len(self.events) - firstEvent events are left
        # when starting at firstEvent
        eventSize = min(nEvents, max(0, len(self.events) - firstEvent))
        self.logger.info(
            'starting loop at event {firstEvent} '\
                'to process {eventSize} events.'.format(firstEvent=firstEvent,
                                                        eventSize=eventSize))
        self.logger.info(str(self.cfg_comp))
        for analyzer in self.analyzers:
            analyzer.beginLoop(self.setup)
        # counted explicitly: the last event index is not a count when
        # firstEvent > 0, when the range is empty, or when the loop is stopped
        nProcessed = 0
        try:
            for iEv in range(firstEvent, firstEvent + eventSize):
                if iEv % 100 == 0:
                    if not hasattr(self, 'start_time'):
                        print('event %s' % iEv)
                        self.start_time = timeit.default_timer()
                        self.start_time_event = iEv
                    else:
                        print('event %d (%.1f ev/s)' %
                              (iEv, (iEv - self.start_time_event) /
                               float(timeit.default_timer() - self.start_time)))

                self.process(iEv)
                nProcessed += 1
                if iEv < self.nPrint:
                    print(self.event)

        except UserWarning:
            print('Stopped loop following a UserWarning exception')

        info = self.logger.info
        info('number of events processed: {nEv}'.format(nEv=nProcessed))
        info('')
        info(self.cfg_comp)
        info('')
        for analyzer in self.analyzers:
            analyzer.endLoop(self.setup)
        if self.timeReport:
            self._logTimeReport(info)

    def _logTimeReport(self, log):
        """Write the per-analyzer timing table, one call to log per line.

        log is a callable taking the message, e.g. self.logger.info.
        self.timeReport must be a non-empty list of timing records.
        """
        allev = max([x['events'] for x in self.timeReport])
        log("\n      ---- TimeReport (all times in ms; first evt is skipped) ---- "
            )
        log("%9s   %9s    %9s   %9s   %s" %
            ("processed", "all evts", "time/proc", " time/all", "analyer"))
        log("%9s   %9s    %9s   %9s   %s" %
            ("---------", "--------", "---------", "---------",
             "-------------"))
        for ana, rep in zip(self.analyzers, self.timeReport):
            nProc = rep['events']
            # averages are over events-1: the first event is not timed
            timePerProcEv = 1000 * rep['time'] / (nProc - 1) if nProc > 1 else 0
            timePerAllEv = 1000 * rep['time'] / (allev - 1) if allev > 1 else 0
            log("%9d   %9d   %10.2f  %10.2f   %s" %
                (nProc, allev, timePerProcEv, timePerAllEv, ana.name))
        log("")

    def process(self, iEv):
        """Run event processing for all analyzers in the sequence.

        This function is called by self.loop,
        but can also be called directly from
        the python interpreter, to jump to a given event.

        Returns a tuple (success, analyzer name). success is False as
        soon as an analyzer returns False, and the name is then the one
        of this analyzer. Otherwise the name is the one of the last
        analyzer, or None if there is no analyzer.
        """
        self.event = Event(iEv, self.events[iEv], self.setup)
        self.iEvent = iEv
        analyzer = None
        for i, analyzer in enumerate(self.analyzers):
            if not analyzer.beginLoopCalled:
                analyzer.beginLoop(self.setup)
            start = timeit.default_timer()
            ret = analyzer.process(self.event)
            if self.timeReport:
                report = self.timeReport[i]
                report['events'] += 1
                # the first event seen by an analyzer is not timed:
                # the time report divides by events-1
                if report['events'] > 1:
                    report['time'] += timeit.default_timer() - start
            if ret == False:
                return (False, analyzer.name)
        if iEv < self.nPrint:
            self.logger.info(self.event.__str__())
        return (True, analyzer.name if analyzer is not None else None)

    def write(self):
        """Writes all analyzers.

        Analyzer.write is called on each analyzer with self.setup,
        then self.setup is closed.
        """
        for analyzer in self.analyzers:
            analyzer.write(self.setup)
        self.setup.close()
Esempio n. 7
0
class Looper(object):
    """Creates a set of analyzers, and schedules the event processing."""
    def __init__(self,
                 name,
                 config,
                 nEvents=None,
                 firstEvent=0,
                 nPrint=0,
                 timeReport=True,
                 quiet=False,
                 memCheckFromEvent=-1,
                 stopFlag=None):
        """Handles the processing of an event sample.
        An Analyzer is built for each Config.Analyzer present
        in sequence. The Looper can then be used to process an event,
        or a collection of events.

        Parameters:
        name    : name of the Looper, will be used as the output directory name
                  (a numeric suffix is appended if that directory is in use)
        config  : process configuration information, see Config
        nEvents : number of events to process. Defaults to all.
        firstEvent : first event to process. Defaults to the first one.
        nPrint  : number of events to print at the beginning
        timeReport : if true, one timing record per analyzer is kept,
                  and reported in the log at the end of the loop.
        quiet   : if true, the logger only writes to log.txt,
                  and not to stdout.
        memCheckFromEvent : first event from which memory jumps are
                  reported around each analyzer. A negative value
                  disables this check.

        stopFlag: it should be a multiprocessing.Value instance, that is set to 1
                  when this thread, or any other, receives a SIGUSR2 to ask for
                  a graceful job termination. In this case, the looper will also
                  set up a signal handler for SIGUSR2.
                  (if set to None, nothing of all this happens)

        Raises:
        ValueError   : the component has no input file.
        RuntimeError : the component asks for a fine split of more than
                       one file.
        """

        self.config = config
        self.name = self._prepareOutput(name)
        self.outDir = self.name

        # set up logger: always to log.txt in the output directory,
        # and to stdout unless quiet is requested.
        self.logger = logging.getLogger(self.name)
        self.logger.addHandler(
            logging.FileHandler('/'.join([self.name, 'log.txt'])))
        self.logger.propagate = False
        if not quiet:
            self.logger.addHandler(logging.StreamHandler(sys.stdout))

        # only the first component is processed by this looper
        self.cfg_comp = config.components[0]
        self.classes = {}
        # keep track of analyzers in a list for sequential event processing
        self._analyzers = []
        # and in a dict for easy user access
        self._analyzer_dict = dict()
        self.analyzer_counter = Counter('analyzers')
        for anacfg in self.config.sequence:
            anaobj = self._build(anacfg)
            self._analyzers.append(anaobj)
            self._analyzer_dict[anacfg.name] = anaobj
            self.analyzer_counter.register(anacfg.name)
        self.nEvents = nEvents
        self.firstEvent = firstEvent
        self.nPrint = int(nPrint)
        # one timing record per analyzer, in the order of the sequence
        self.timeReport = [{
            'time': 0.0,
            'events': 0
        } for a in self._analyzers] if timeReport else False
        self.memReportFirstEvent = memCheckFromEvent
        self.memLast = 0
        self.stopFlag = stopFlag
        if stopFlag:
            import signal

            def doSigUsr2(sig, frame):
                print('SIGUSR2 received, signaling graceful stop')
                self.stopFlag.value = 1

            signal.signal(signal.SIGUSR2, doSigUsr2)
        tree_name = None
        if (hasattr(self.cfg_comp, 'tree_name')):
            tree_name = self.cfg_comp.tree_name
        if len(self.cfg_comp.files) == 0:
            errmsg = 'please provide at least an input file in the files attribute of this component\n' + str(
                self.cfg_comp)
            raise ValueError(errmsg)
        # the preprocessor, if any, returns the component to be used
        # from now on.
        if hasattr(config, "preprocessor") and config.preprocessor is not None:
            self.cfg_comp = config.preprocessor.run(self.cfg_comp, self.outDir,
                                                    firstEvent, nEvents)
        if hasattr(self.cfg_comp, "options"):
            # same output as the statement "print files, options"
            print('%s %s' % (self.cfg_comp.files, self.cfg_comp.options))
            self.events = config.events_class(self.cfg_comp.files,
                                              tree_name,
                                              options=self.cfg_comp.options)
        else:
            self.events = config.events_class(self.cfg_comp.files, tree_name)
        if hasattr(self.cfg_comp, 'fineSplit'):
            # fine splitting: this looper only processes slice number
            # fineSplitIndex out of fineSplitFactor slices of a single file.
            fineSplitIndex, fineSplitFactor = self.cfg_comp.fineSplit
            if fineSplitFactor > 1:
                if len(self.cfg_comp.files) != 1:
                    raise RuntimeError(
                        "Any component with fineSplit > 1 is supposed to have just a single file, while %s has %s"
                        % (self.cfg_comp.name, self.cfg_comp.files))
                # nEvents equal to None, 0 or -1 means all events
                totevents = min(len(self.events), int(nEvents)) if (
                    nEvents and int(nEvents) not in [-1, 0]) else len(
                        self.events)
                self.nEvents = int(ceil(totevents / float(fineSplitFactor)))
                self.firstEvent = firstEvent + fineSplitIndex * self.nEvents
                # the last slice may be shorter than the others
                if self.firstEvent + self.nEvents >= totevents:
                    self.nEvents = totevents - self.firstEvent
        # self.event is set in self.process
        self.event = None
        services = dict()
        for cfg_serv in config.services:
            service = self._build(cfg_serv)
            services[cfg_serv.name] = service
        # would like to provide a copy of the config to the setup,
        # so that analyzers cannot modify the config of other analyzers.
        # but cannot copy the autofill config.
        self.setup = Setup(config, services)

        ######### Save processing information to output directory

        # save the versions
        if self.config.versions:
            self.config.versions.write_yaml('/'.join(
                [self.outDir, 'software.yaml']))
        # remove versions from the config as it can't be pickled.
        # the attribute is only removed from the shallow copy, self.config
        # is left untouched.
        config_no_versions = copy.copy(self.config)
        delattr(config_no_versions, 'versions')

        # save the config.
        # pickle data is binary (protocol=-1 selects the highest protocol),
        # so the file must be opened in binary mode.
        pck_fname = '/'.join([self.outDir, 'config.pck'])
        with open(pck_fname, 'wb') as out:
            pickle.dump(config_no_versions, out, protocol=-1)

        # later, it is possible that unpickling the config
        # does not work, e.g. because of
        # - changes in the type of the stored objects
        # - different machine
        # so we also keep the component in a simple form:
        comp_data = dict(name=self.cfg_comp.name, files=self.cfg_comp.files)
        pck_fname = '/'.join([self.outDir, 'component.pck'])
        with open(pck_fname, 'wb') as out:
            pickle.dump(comp_data, out)

    def _build(self, cfg):
        try:
            theClass = cfg.class_object
        except AttributeError:
            errfgmt = 'an object of class {cfg_class}'.format(
                cfg_class=cfg.__class__)
            if type(cfg) is type:
                errfgmt = 'a class named {class_name}'.format(
                    class_name=cfg.__name__)
            err = '''
The looper is trying to build an analyzer configured by {errfgmt}. 

Make sure that the configuration object is of class cfg.Analyzer.
            '''.format(errfgmt=errfgmt)
            raise ValueError(err)
        obj = theClass(cfg, self.cfg_comp, self.outDir)
        return obj

    #----------------------------------------------------------------------
    def analyzer(self, name):
        """@return: analyzer with this name."""
        return self._analyzer_dict[name]

    def _prepareOutput(self, name):
        index = 0
        tmpname = name
        while True and index < 2000:
            try:
                os.makedirs(tmpname)
                break
            except OSError:
                # failed to create the directory
                # is it empty?
                if not os.listdir(tmpname):
                    break  # it is, so use it
                else:
                    # if not we append a number to the directory name
                    index += 1
                    tmpname = '%s_%d' % (name, index)
        if index == 2000:
            raise ValueError(
                "More than 2000 output folder with same name or 2000 attempts failed, please clean-up, change name or check permissions"
            )
        return tmpname

    def loop(self):
        """Loop on a given number of events.

        At the beginning of the loop, 
        Analyzer.beginLoop is called for each Analyzer.
        At each event, self.process is called.
        At the end of the loop, Analyzer.endLoop is called.
        """
        def initialize_timer(iEv):
            if iEv % 100 == 0:
                if not hasattr(self, 'start_time'):
                    self.logger.info('event {iEv}'.format(iEv=iEv))
                    self.start_time = timeit.default_timer()
                    self.start_time_event = iEv
                else:
                    self.logger.warning(
                        'event %d (%.1f ev/s)' %
                        (iEv, (iEv - self.start_time_event) /
                         float(timeit.default_timer() - self.start_time)))

        nEvents = self.nEvents
        firstEvent = self.firstEvent
        iEv = firstEvent
        self.nEvProcessed = 0
        if nEvents is None or int(nEvents) - firstEvent > len(self.events):
            nEvents = len(self.events) - firstEvent
        else:
            nEvents = int(nEvents)
        self.logger.info(
            'starting loop at event {firstEvent} '\
                'to process {nEvents} events.'.format(firstEvent=firstEvent,
                                                        nEvents=nEvents))
        self.logger.info(str(self.cfg_comp))
        for analyzer in self._analyzers:
            analyzer.beginLoop(self.setup)

        if hasattr(self.events, '__getitem__'):
            # events backend supports indexing, e.g. CMS, FCC, bare root
            for iEv in range(firstEvent, firstEvent + nEvents):
                initialize_timer(iEv)
                try:
                    self.process(iEv)
                    self.nEvProcessed += 1
                    if iEv < self.nPrint:
                        self.logger.info(self.event.__str__())
                    if self.stopFlag and self.stopFlag.value:
                        print 'stopping gracefully at event %d' % (iEv)
                        break
                except UserStop as err:
                    print 'Stopped loop following a UserStop exception:'
                    print err
                    break
        else:
            # events backend does not support indexing, e.g. LCIO
            iEv = 0
            for ii, event in enumerate(self.events):
                if ii < firstEvent:
                    continue
                initialize_timer(iEv)
                iEv += 1
                try:
                    self.event = Event(iEv, event, self.setup)
                    self.iEvent = iEv
                    self._run_analyzers_on_event()
                    self.nEvProcessed += 1
                    if iEv < self.nPrint:
                        self.logger.info(self.event.__str__())
                    if self.stopFlag and self.stopFlag.value:
                        print 'stopping gracefully at event %d' % (iEv)
                        break
                except UserStop as err:
                    print 'Stopped loop following a UserStop exception:'
                    print err
                    break
        for analyzer in self._analyzers:
            analyzer.endLoop(self.setup)
        self._write_log()

    def _write_log(self):
        warning = self.logger.warning
        warning('')
        warning(self.cfg_comp)
        warning('')
        if self.timeReport:
            allev = max([x['events'] for x in self.timeReport])
            warning(
                "\n      ---- TimeReport (all times in ms; first evt is skipped) ---- "
            )
            warning("%9s   %9s    %9s   %9s %6s   %s" %
                    ("processed", "all evts", "time/proc", " time/all",
                     "  [%] ", "analyer"))
            warning("%9s   %9s    %9s   %9s %6s   %s" %
                    ("---------", "--------", "---------", "---------",
                     " -----", "-------------"))
            sumtime = sum(rep['time'] for rep in self.timeReport)
            passev = self.timeReport[-1]['events']
            for ana, rep in zip(self._analyzers, self.timeReport):
                timePerProcEv = rep['time'] / (rep['events'] -
                                               1) if rep['events'] > 1 else 0
                timePerAllEv = rep['time'] / (allev - 1) if allev > 1 else 0
                fracAllEv = rep['time'] / sumtime
                warning("%9d   %9d   %10.2f  %10.2f %5.1f%%   %s" %
                        (rep['events'], allev, 1000 * timePerProcEv,
                         1000 * timePerAllEv, 100.0 * fracAllEv, ana.name))
            totPerProcEv = sumtime / (passev - 1) if passev > 1 else 0
            totPerAllEv = sumtime / (allev - 1) if allev > 1 else 0
            warning("%9s   %9s    %9s   %9s   %s" %
                    ("---------", "--------", "---------", "---------",
                     "-------------"))
            warning("%9d   %9d   %10.2f  %10.2f %5.1f%%   %s" %
                    (passev, allev, 1000 * totPerProcEv, 1000 * totPerAllEv,
                     100.0, "TOTAL"))
            warning("")
        warning(self.analyzer_counter)
        # the following must be printed to the log file in all cases,
        # as the heppy batch scripts rely on this line to decide whether
        # processing is succesful.
        logfile = open('/'.join([self.name, 'log.txt']), 'a')
        logfile.write('number of events processed: {nEv}\n'.format(
            nEv=self.nEvProcessed))
        logfile.close()

    def process(self, iEv):
        """Run event processing for all analyzers in the sequence.

        This function can be called directly from
        the python interpreter, to jump to a given event and process it.
        """
        if not hasattr(self.events, '__getitem__'):
            msg = '''
Your events backend, of type 
{evclass}
does not support indexing. 
Therefore, you cannot directly access a given event using Loop.process.
However, you may still iterate on your events using Loop.loop, 
possibly skipping a number of events at the beginning.
'''.format(evclass=self.events.__class__)
            raise TypeError(msg)
        self.event = Event(iEv, self.events[iEv], self.setup)
        self.iEvent = iEv
        return self._run_analyzers_on_event()

    def _run_analyzers_on_event(self):
        '''Run all analysers on the current event, self.event. 
        Returns a tuple (success?, last_analyzer_name).
        '''
        for i, analyzer in enumerate(self._analyzers):
            if not analyzer.beginLoopCalled:
                analyzer.beginLoop(self.setup)
            start = timeit.default_timer()
            if self.memReportFirstEvent >= 0 and iEv >= self.memReportFirstEvent:
                memNow = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
                if memNow > self.memLast:
                    print "Mem Jump detected before analyzer %s at event %s. RSS(before,after,difference) %s %s %s " % (
                        analyzer.name, iEv, self.memLast, memNow,
                        memNow - self.memLast)
                self.memLast = memNow
            ret = False
            try:
                ret = analyzer.process(self.event)
                ret = True if ret is None else ret
                self.event.analyzers.append((analyzer, ret))
            except:
                #TODO: check that this works fine with non podio inputs, e.g. plain TChains.
                if hasattr(self.event, 'input') and \
                   hasattr(self.event.input, 'current_filename'):
                    # TODO re-enable analyzer printout when next podio is available
                    ##                    print 'exception running analyzer {ana} on event {iev} in file\n {fname}'.format(
                    ##                        ana=analyzer.cfg_ana.name,
                    ##                        iev=self.event.iEv,
                    ##                        fname=self.event.input.current_filename()
                    ##                    )
                    pass
                raise
            if self.memReportFirstEvent >= 0 and iEv >= self.memReportFirstEvent:
                memNow = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
                if memNow > self.memLast:
                    print "Mem Jump detected in analyzer %s at event %s. RSS(before,after,difference) %s %s %s " % (
                        analyzer.name, iEv, self.memLast, memNow,
                        memNow - self.memLast)
                self.memLast = memNow
            if self.timeReport:
                self.timeReport[i]['events'] += 1
                if self.timeReport[i]['events'] > 0:
                    self.timeReport[i]['time'] += timeit.default_timer(
                    ) - start
            if ret == False:
                return (False, analyzer.name)
            else:
                self.analyzer_counter.inc(analyzer.name)
        return (True, analyzer.name)

    def write(self):
        """Writes the configuration, the software versions,
        and the output of all analyzers in the output directory.

        See Analyzer.Write for more information.
        """
        for analyzer in self._analyzers:
            analyzer.write(self.setup)
        self.setup.close()
Esempio n. 8
0
	def __str__( self ):
		"""Return a one-line description of this object.

		The description starts with Event.__str__(self), followed by the
		user name, subscription source, name, resource URI, event,
		content type, expiry and silent flag.
		"""
		# one placeholder per value: the 'name' placeholder was missing,
		# so that the formatting always failed with a TypeError.
		return '%s, user: %s, source = %s, name: %s, resource: %s, event: %s, type: %s, expires: %s, silent: %s' % ( Event.__str__(self), str(self.user.name), str(self.subscriptionSource), str(self.name), str(self.resourceUri), str(self.event), str(self.contentType), str(self.expires), str(self.silent) )