Code Example #1
class Cacher(AgentBase):
    # constructor
    def __init__(self, communicator, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.communicator = communicator

    # main loop
    def run(self):
        while True:
            # execute
            self.execute()
            # check if being terminated
            if self.terminated(harvester_config.cacher.sleepTime,
                               randomize=False):
                return

    # main
    def execute(self, force_update=False, skip_lock=False):
        mainLog = self.make_logger(_logger,
                                   'id={0}'.format(self.get_pid()),
                                   method_name='execute')
        # get lock
        locked = self.dbProxy.get_process_lock(
            'cacher', self.get_pid(), harvester_config.cacher.sleepTime)
        if locked or skip_lock:
            mainLog.debug('getting information')
            timeLimit = datetime.datetime.utcnow() - \
                datetime.timedelta(minutes=harvester_config.cacher.refreshInterval)
            itemsList = []
            keysForceUpdate = []
            nItems = 4
            for tmpStr in harvester_config.cacher.data:
                tmpItems = tmpStr.split('|')
                if len(tmpItems) < 3:
                    continue
                tmpItems += [None] * (nItems - len(tmpItems))
                tmpItems = tmpItems[:nItems]
                itemsList.append(tmpItems)
            # add queues_config
            if core_utils.get_queues_config_url() is not None:
                tmpKey = 'queues_config_file'
                # fourth element (dumpFile) is None so the 4-way unpack below works
                itemsList.append(
                    (tmpKey, None, core_utils.get_queues_config_url(), None))
                keysForceUpdate.append(tmpKey)
            # loop over all items
            for mainKey, subKey, infoURL, dumpFile in itemsList:
                if subKey == '':
                    subKey = None
                # check last update time
                lastUpdateTime = self.dbProxy.get_cache_last_update_time(
                    mainKey, subKey)
                if (not force_update or mainKey not in keysForceUpdate) and lastUpdateTime is not None \
                        and lastUpdateTime > timeLimit:
                    continue
                # get information
                tmpStat, newInfo = self.get_data(infoURL, mainLog)
                if not tmpStat:
                    mainLog.error(
                        'failed to get info for key={0} subKey={1}'.format(
                            mainKey, subKey))
                    continue
                # update
                tmpStat = self.dbProxy.refresh_cache(mainKey, subKey, newInfo)
                if tmpStat:
                    mainLog.debug('refreshed key={0} subKey={1}'.format(
                        mainKey, subKey))
                    if dumpFile is not None:
                        try:
                            tmpFileName = dumpFile + '.tmp'
                            with open(tmpFileName, 'w') as tmpFile:
                                json.dump(newInfo, tmpFile)
                            shutil.move(tmpFileName, dumpFile)
                        except Exception:
                            core_utils.dump_error_message(mainLog)
                else:
                    mainLog.error(
                        'failed to refresh key={0} subKey={1} due to a DB error'
                        .format(mainKey, subKey))
            mainLog.debug('done')

    # get new data
    def get_data(self, info_url, tmp_log):
        retStat = False
        retVal = None
        if info_url.startswith('file:'):
            try:
                with open(info_url.split(':')[-1], 'r') as infoFile:
                    retVal = infoFile.read()
                    try:
                        retVal = json.loads(retVal)
                    except Exception:
                        pass
            except Exception:
                core_utils.dump_error_message(tmp_log)
        elif info_url.startswith('http'):
            try:
                res = requests.get(info_url, timeout=60)
                if res.status_code == 200:
                    try:
                        retVal = res.json()
                    except Exception:
                        errMsg = 'corrupted json from {0} : {1}'.format(
                            info_url, res.text)
                        tmp_log.error(errMsg)
                else:
                    errMsg = 'failed to get {0} with StatusCode={1} {2}'.format(
                        info_url, res.status_code, res.text)
                    tmp_log.error(errMsg)
            except requests.exceptions.ReadTimeout:
                tmp_log.error(
                    'read timeout when getting data from {0}'.format(info_url))
            except Exception:
                core_utils.dump_error_message(tmp_log)
        elif info_url.startswith('panda_cache:'):
            try:
                publicKey, privateKey = info_url.split(':')[-1].split('&')
                retVal, outStr = self.communicator.get_key_pair(
                    publicKey, privateKey)
                if retVal is None:
                    tmp_log.error(outStr)
            except Exception:
                core_utils.dump_error_message(tmp_log)
        elif info_url.startswith('panda_server:'):
            try:
                retVal, outStr = self.communicator.get_resource_types()
                if not retVal:
                    tmp_log.error(outStr)
            except Exception:
                core_utils.dump_error_message(tmp_log)
        else:
            errMsg = 'unsupported protocol for {0}'.format(info_url)
            tmp_log.error(errMsg)
        if retVal is not None:
            retStat = True
        return retStat, retVal

    # set single mode
    def set_single_mode(self, single_mode):
        self.singleMode = single_mode
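
A minimal usage sketch, not part of the original listing, showing how the Cacher above could be driven once for testing. It assumes each harvester_config.cacher.data entry is a 'mainKey|subKey|infoURL|dumpFile' string as parsed in execute(); the CommunicatorPool import path and its no-argument constructor are assumptions taken from the wider harvester code base rather than from this listing.

# minimal sketch (assumptions noted above): refresh every configured cache item once
from pandaharvester.harvestercore.communicator_pool import CommunicatorPool  # assumed import path

communicator = CommunicatorPool()                  # assumed communicator factory
cacher = Cacher(communicator, single_mode=True)
# bypass the DB process lock and force-update the keys listed in keysForceUpdate
cacher.execute(force_update=True, skip_lock=True)
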
Code Example #2
File: cred_manager.py  Project: HSF/harvester
class CredManager(AgentBase):

    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queue_config_mapper = queue_config_mapper
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        # plugin cores
        self.exeCores = []
        self.queue_exe_cores = []
        # get plugin from harvester config
        self.get_cores_from_harvester_config()
        # update plugin cores from queue config
        self.update_cores_from_queue_config()

    # get list
    def get_list(self, data):
        if isinstance(data, list):
            return data
        else:
            return [data]

    # get plugin cores from harvester config
    def get_cores_from_harvester_config(self):
        # get module and class names
        if hasattr(harvester_config.credmanager, 'moduleName'):
            moduleNames = self.get_list(
                harvester_config.credmanager.moduleName)
        else:
            moduleNames = []
        if hasattr(harvester_config.credmanager, 'className'):
            classNames = self.get_list(harvester_config.credmanager.className)
        else:
            classNames = []
        # file names of original certificates
        if hasattr(harvester_config.credmanager, 'inCertFile'):
            inCertFiles = self.get_list(
                harvester_config.credmanager.inCertFile)
        elif hasattr(harvester_config.credmanager, 'certFile'):
            inCertFiles = self.get_list(harvester_config.credmanager.certFile)
        else:
            inCertFiles = []
        # file names of certificates to be generated
        if hasattr(harvester_config.credmanager, 'outCertFile'):
            outCertFiles = self.get_list(
                harvester_config.credmanager.outCertFile)
        else:
            # use the file name of the certificate for panda connection as output name
            outCertFiles = self.get_list(harvester_config.pandacon.cert_file)
        # VOMS
        if hasattr(harvester_config.credmanager, 'voms'):
            vomses = self.get_list(harvester_config.credmanager.voms)
        else:
            vomses = []
        # direct and merged plugin configuration in json
        if hasattr(harvester_config.credmanager, 'pluginConfigs'):
            pluginConfigs = harvester_config.credmanager.pluginConfigs
        else:
            pluginConfigs = []
        # from traditional attributes
        for moduleName, className, inCertFile, outCertFile, voms in \
                zip(moduleNames, classNames, inCertFiles, outCertFiles, vomses):
            pluginPar = {}
            pluginPar['module'] = moduleName
            pluginPar['name'] = className
            pluginPar['inCertFile'] = inCertFile
            pluginPar['outCertFile'] = outCertFile
            pluginPar['voms'] = voms
            try:
                exeCore = self.pluginFactory.get_plugin(pluginPar)
                self.exeCores.append(exeCore)
            except Exception:
                _logger.error(
                    'failed to launch credmanager with traditional attributes for {0}'
                    .format(pluginPar))
                core_utils.dump_error_message(_logger)
        # from pluginConfigs
        for pc in pluginConfigs:
            try:
                setup_maps = pc['configs']
                for setup_name, setup_map in setup_maps.items():
                    try:
                        pluginPar = {}
                        pluginPar['module'] = pc['module']
                        pluginPar['name'] = pc['name']
                        pluginPar['setup_name'] = setup_name
                        pluginPar.update(setup_map)
                        exeCore = self.pluginFactory.get_plugin(pluginPar)
                        self.exeCores.append(exeCore)
                    except Exception:
                        _logger.error(
                            'failed to launch credmanager in pluginConfigs for {0}'
                            .format(pluginPar))
                        core_utils.dump_error_message(_logger)
            except Exception:
                _logger.error('failed to parse pluginConfigs {0}'.format(pc))
                core_utils.dump_error_message(_logger)

    # update plugin cores from queue config
    def update_cores_from_queue_config(self):
        self.queue_exe_cores = []
        for queue_name, queue_config in self.queue_config_mapper.get_all_queues(
        ).items():
            if queue_config.queueStatus == 'offline' \
                    or not hasattr(queue_config, 'credmanagers') \
                    or not isinstance(queue_config.credmanagers, list):
                continue
            for cm_setup in queue_config.credmanagers:
                try:
                    pluginPar = {}
                    pluginPar['module'] = cm_setup['module']
                    pluginPar['name'] = cm_setup['name']
                    pluginPar['setup_name'] = queue_name
                    for k, v in cm_setup.items():
                        if k in ('module', 'name'):
                            # already filled in above
                            continue
                        if isinstance(v, str) and '$' in v:
                            # replace placeholders
                            value = v
                            patts = re.findall(r'\$\{([a-zA-Z\d_.]+)\}', v)
                            for patt in patts:
                                tmp_ph = '${' + patt + '}'
                                tmp_val = None
                                if patt == 'harvesterID':
                                    tmp_val = harvester_config.master.harvester_id
                                elif patt == 'queueName':
                                    tmp_val = queue_name
                                elif patt.startswith('common.'):
                                    # values from common blocks
                                    attr = patt.replace('common.', '')
                                    if hasattr(
                                            queue_config, 'common'
                                    ) and attr in queue_config.common:
                                        tmp_val = queue_config.common[attr]
                                if tmp_val is not None:
                                    value = value.replace(tmp_ph, tmp_val)
                            # fill in
                            pluginPar[k] = value
                        else:
                            # fill in
                            pluginPar[k] = v
                    exe_core = self.pluginFactory.get_plugin(pluginPar)
                    self.queue_exe_cores.append(exe_core)
                except Exception:
                    _logger.error(
                        'failed to launch credmanager for queue={0} with {1}'.format(
                            queue_name, pluginPar))
                    core_utils.dump_error_message(_logger)

    # main loop
    def run(self):
        while True:
            # update plugin cores from queue config
            self.update_cores_from_queue_config()

            # execute
            self.execute()  # this is the main run

            # check if being terminated
            if self.terminated(harvester_config.credmanager.sleepTime,
                               randomize=False):
                return

    # main
    def execute(self):
        # get lock
        locked = self.dbProxy.get_process_lock(
            'credmanager', self.get_pid(),
            harvester_config.credmanager.sleepTime)
        if not locked:
            return
        # loop over all plugins
        for exeCore in itertools.chain(self.exeCores, self.queue_exe_cores):
            # do nothing
            if exeCore is None:
                continue
            # make logger
            credmanager_name = ''
            if hasattr(exeCore, 'setup_name'):
                credmanager_name = exeCore.setup_name
            else:
                credmanager_name = '{0} {1}'.format(exeCore.inCertFile,
                                                    exeCore.outCertFile)
            mainLog = self.make_logger(_logger,
                                       '{0} {1}'.format(
                                           exeCore.__class__.__name__,
                                           credmanager_name),
                                       method_name='execute')
            try:
                # check credential
                mainLog.debug('check credential')
                isValid = exeCore.check_credential()
                if isValid:
                    mainLog.debug('valid')
                else:
                    # renew it if necessary
                    mainLog.debug('invalid')
                    mainLog.debug('renew credential')
                    tmpStat, tmpOut = exeCore.renew_credential()
                    if not tmpStat:
                        mainLog.error('failed : {0}'.format(tmpOut))
                        continue
            except Exception:
                core_utils.dump_error_message(mainLog)
            mainLog.debug('done')

    # monit main
    def execute_monit(self):
        self.update_cores_from_queue_config()

        metrics = {}
        # loop over all plugins
        for exeCore in itertools.chain(self.exeCores, self.queue_exe_cores):
            # do nothing
            if exeCore is None:
                continue

            # make logger
            if hasattr(exeCore, 'setup_name'):
                credmanager_name = exeCore.setup_name
            else:
                credmanager_name = '{0} {1}'.format(exeCore.inCertFile,
                                                    exeCore.outCertFile)

            subLog = self.make_logger(_logger,
                                      '{0} {1}'.format(
                                          exeCore.__class__.__name__,
                                          credmanager_name),
                                      method_name='execute_monit')
            try:
                # check credential
                subLog.debug('check credential lifetime')
                lifetime = exeCore.check_credential_lifetime()
                if lifetime is not None:
                    metrics[exeCore.outCertFile] = lifetime
            except Exception:
                core_utils.dump_error_message(subLog)

            subLog.debug('done')

        return metrics
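
A minimal sketch, not part of the original listing, of exercising the CredManager above outside the harvester daemon. The QueueConfigMapper import path and its no-argument constructor are assumptions based on the wider harvester code base; execute() and execute_monit() are the methods defined above.

from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper  # assumed import path

queue_config_mapper = QueueConfigMapper()          # assumed constructor
cred_manager = CredManager(queue_config_mapper, single_mode=True)
cred_manager.execute()                             # check and, if invalid, renew each configured credential
lifetimes = cred_manager.execute_monit()           # {outCertFile: lifetime} as built in execute_monit() above
print(lifetimes)
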
Code Example #3
class CommandManager(AgentBase):
    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.db_proxy = DBProxy()
        self.communicator = communicator
        self.queueConfigMapper = queue_config_mapper
        self.nodeName = socket.gethostname()
        self.lastHeartbeat = None

    # set single mode
    def set_single_mode(self, single_mode):
        self.singleMode = single_mode

    def convert_to_command_specs(self, commands):
        """
        Generates a list of CommandSpec objects
        """
        command_specs = []
        for command in commands:
            command_spec = CommandSpec()
            command_spec.convert_command_json(command)
            for comStr, receiver in iteritems(CommandSpec.receiver_map):
                if command_spec.command.startswith(comStr):
                    command_spec.receiver = receiver
                    break
            if command_spec.receiver is not None:
                command_specs.append(command_spec)
        return command_specs

    def run(self):
        """
        main
        """
        main_log = core_utils.make_logger(_logger,
                                          'id={0}'.format(self.ident),
                                          method_name='run')
        bulk_size = harvester_config.commandmanager.commands_bulk_size
        locked = self.db_proxy.get_process_lock(
            'commandmanager', self.get_pid(),
            harvester_config.commandmanager.sleepTime)
        if locked:
            # send command list to be received
            siteNames = set()
            commandList = []
            for queueName, queueConfig in iteritems(
                    self.queueConfigMapper.get_active_queues()):
                if queueConfig is None or queueConfig.runMode != 'slave':
                    continue
                # one command for all queues in one site
                if queueConfig.siteName not in siteNames:
                    commandItem = {
                        'command': CommandSpec.COM_reportWorkerStats,
                        'computingSite': queueConfig.siteName,
                        'resourceType': queueConfig.resourceType
                    }
                    commandList.append(commandItem)
                siteNames.add(queueConfig.siteName)
                # one command for each queue
                commandItem = {
                    'command': CommandSpec.COM_setNWorkers,
                    'computingSite': queueConfig.siteName,
                    'resourceType': queueConfig.resourceType
                }
                commandList.append(commandItem)
            if len(commandList) > 0:
                main_log.debug('sending command list to receive')
                self.communicator.is_alive({
                    'startTime':
                    datetime.datetime.utcnow(),
                    'commands':
                    commandList
                })

        # main loop
        while True:
            # get lock
            locked = self.db_proxy.get_process_lock(
                'commandmanager', self.get_pid(),
                harvester_config.commandmanager.sleepTime)
            if locked or self.singleMode:

                main_log.debug('polling commands loop')

                # send heartbeat
                if self.lastHeartbeat is None \
                        or self.lastHeartbeat < datetime.datetime.utcnow() - datetime.timedelta(minutes=10):
                    self.lastHeartbeat = datetime.datetime.utcnow()
                    self.communicator.is_alive(
                        {'startTime': datetime.datetime.utcnow()})

                continuous_loop = True  # as long as there are commands, retrieve them

                while continuous_loop:

                    # get commands from panda server for this harvester instance
                    commands = self.communicator.get_commands(bulk_size)
                    main_log.debug('got {0} commands (bulk size: {1})'.format(
                        len(commands), bulk_size))
                    command_specs = self.convert_to_command_specs(commands)

                    # cache commands in internal DB
                    self.db_proxy.store_commands(command_specs)
                    main_log.debug('cached {0} commands in internal DB'.format(
                        len(command_specs)))

                    # retrieve processed commands from harvester cache
                    command_ids_ack = self.db_proxy.get_commands_ack()

                    for shard in core_utils.create_shards(
                            command_ids_ack, bulk_size):
                        # post acknowledgements to panda server
                        self.communicator.ack_commands(shard)
                        main_log.debug(
                            'acknowledged {0} commands to panda server'.format(
                                len(shard)))

                        # clean acknowledged commands
                        self.db_proxy.clean_commands_by_id(shard)

                    # clean commands that have been processed and do not need acknowledgement
                    self.db_proxy.clean_processed_commands()

                    # if we didn't collect the full bulk, give panda server a break
                    if len(commands) < bulk_size:
                        continuous_loop = False

            # check if being terminated
            if self.terminated(harvester_config.commandmanager.sleepTime,
                               randomize=False):
                main_log.debug('terminated')
                return
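
A minimal sketch, not part of the original listing, of wiring up the CommandManager above for a single polling pass. The import paths, the no-argument constructors, and the expectation that AgentBase.terminated() returns immediately in single mode are assumptions based on the rest of the harvester code base, not confirmed by this listing.

from pandaharvester.harvestercore.communicator_pool import CommunicatorPool     # assumed import path
from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper  # assumed import path

communicator = CommunicatorPool()                  # assumed communicator factory
queue_config_mapper = QueueConfigMapper()          # assumed constructor
command_manager = CommandManager(communicator, queue_config_mapper, single_mode=True)
# one pass: fetch commands from the PanDA server, cache them in the local DB,
# acknowledge processed ones; single_mode is assumed to let terminated() end the loop
command_manager.run()
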
Code Example #4
class Monitor(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.queueConfigMapper = queue_config_mapper
        self.dbProxy = DBProxy()
        self.pluginFactory = PluginFactory()
        self.startTimestamp = time.time()
        self.monitor_fifo = MonitorFIFO()
        if self.monitor_fifo.enabled:
            self.monitor_event_fifo = MonitorEventFIFO()
        else:
            self.monitor_event_fifo = None
        self.apfmon = Apfmon(self.queueConfigMapper)
        self.eventBasedMonCoreList = []
        if getattr(harvester_config.monitor, 'eventBasedEnable', False):
            for pluginConf in harvester_config.monitor.eventBasedPlugins:
                pluginFactory = PluginFactory()
                self.eventBasedMonCoreList.append(pluginFactory.get_plugin(pluginConf))

    # main loop
    def run(self):
        lockedBy = 'monitor-{0}'.format(self.get_pid())
        # init messengers
        for queueConfig in self.queueConfigMapper.get_all_queues().values():
            # just import for module initialization
            self.pluginFactory.get_plugin(queueConfig.messenger)
        # main
        fifoSleepTimeMilli = getattr(harvester_config.monitor, 'fifoSleepTimeMilli', 5000)
        fifoCheckDuration = getattr(harvester_config.monitor, 'fifoCheckDuration', 30)
        fifoMaxWorkersPerChunk = getattr(harvester_config.monitor, 'fifoMaxWorkersPerChunk', 500)
        fifoProtectiveDequeue = getattr(harvester_config.monitor, 'fifoProtectiveDequeue', True)
        eventBasedCheckInterval = getattr(harvester_config.monitor, 'eventBasedCheckInterval', 300)
        eventBasedTimeWindow = getattr(harvester_config.monitor, 'eventBasedTimeWindow', 450)
        eventBasedCheckMaxEvents = getattr(harvester_config.monitor, 'eventBasedCheckMaxEvents', 500)
        eventBasedEventLifetime = getattr(harvester_config.monitor, 'eventBasedEventLifetime', 1800)
        eventBasedRemoveMaxEvents = getattr(harvester_config.monitor, 'eventBasedRemoveMaxEvents', 2000)
        last_DB_cycle_timestamp = 0
        last_event_delivery_timestamp = 0
        last_event_digest_timestamp = 0
        last_event_dispose_timestamp = 0
        monitor_fifo = self.monitor_fifo
        sleepTime = (fifoSleepTimeMilli / 1000.0) \
                        if monitor_fifo.enabled else harvester_config.monitor.sleepTime
        adjusted_sleepTime = sleepTime
        if monitor_fifo.enabled:
            monitor_fifo.restore()
        mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog.debug('start a monitor cycle')
            if time.time() >= last_DB_cycle_timestamp + harvester_config.monitor.sleepTime and \
                    not (monitor_fifo.enabled and self.singleMode):
                # run with workers from DB
                sw_db = core_utils.get_stopwatch()
                mainLog.debug('starting run with DB')
                mainLog.debug('getting workers to monitor')
                workSpecsPerQueue = self.dbProxy.get_workers_to_update(harvester_config.monitor.maxWorkers,
                                                                       harvester_config.monitor.checkInterval,
                                                                       harvester_config.monitor.lockInterval,
                                                                       lockedBy)
                mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
                # loop over all workers
                for queueName, configIdWorkSpecs in iteritems(workSpecsPerQueue):
                    for configID, workSpecsList in iteritems(configIdWorkSpecs):
                        retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList, config_id=configID, check_source='DB')
                        if monitor_fifo.enabled and retVal is not None:
                            workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal
                            if workSpecsToEnqueue:
                                mainLog.debug('putting workers to FIFO')
                                try:
                                    score = fifoCheckInterval + timeNow_timestamp
                                    monitor_fifo.put((queueName, workSpecsToEnqueue), score)
                                    mainLog.info('put workers of {0} to FIFO with score {1}'.format(queueName, score))
                                except Exception as errStr:
                                    mainLog.error('failed to put object to FIFO: {0}'.format(errStr))
                            if workSpecsToEnqueueToHead:
                                mainLog.debug('putting workers to FIFO head')
                                try:
                                    score = fifoCheckInterval - timeNow_timestamp
                                    monitor_fifo.put((queueName, workSpecsToEnqueueToHead), score)
                                    mainLog.info('put workers of {0} to FIFO with score {1}'.format(queueName, score))
                                except Exception as errStr:
                                    mainLog.error('failed to put object to FIFO head: {0}'.format(errStr))
                last_DB_cycle_timestamp = time.time()
                if sw_db.get_elapsed_time_in_sec() > harvester_config.monitor.lockInterval:
                    mainLog.warning('a single DB cycle was longer than lockInterval ' + sw_db.get_elapsed_time())
                else:
                    mainLog.debug('done a DB cycle' + sw_db.get_elapsed_time())
                mainLog.debug('ended run with DB')
            elif monitor_fifo.enabled:
                # with FIFO
                sw = core_utils.get_stopwatch()
                to_run_fifo_check = True
                n_loops = 0
                n_loops_hit = 0
                last_fifo_cycle_timestamp = time.time()
                to_break = False
                obj_dequeued_id_list = []
                obj_to_enqueue_dict = collections.defaultdict(lambda: [[], 0, 0])
                obj_to_enqueue_to_head_dict = collections.defaultdict(lambda: [[], 0, 0])
                remaining_obj_to_enqueue_dict = {}
                remaining_obj_to_enqueue_to_head_dict = {}
                n_chunk_peeked_stat, sum_overhead_time_stat = 0, 0.0
                # go get workers
                if self.monitor_event_fifo.enabled:
                    # run with workers reported from plugin (event-based check)
                    to_deliver = time.time() >= last_event_delivery_timestamp + eventBasedCheckInterval
                    to_digest = time.time() >= last_event_digest_timestamp + eventBasedCheckInterval/4
                    to_dispose = time.time() >= last_event_dispose_timestamp + eventBasedCheckInterval/2
                    if to_deliver:
                        # deliver events of worker update
                        got_lock = self.dbProxy.get_process_lock('monitor_event_deliverer', lockedBy,
                                                                   eventBasedCheckInterval)
                        if got_lock:
                            self.monitor_event_deliverer(time_window=eventBasedTimeWindow)
                        else:
                            mainLog.debug('did not get lock. Skip monitor_event_deliverer')
                        last_event_delivery_timestamp = time.time()
                    if to_digest:
                        # digest events of worker update
                        to_run_fifo_check = False
                        retMap = self.monitor_event_digester(locked_by=lockedBy, max_events=eventBasedCheckMaxEvents)
                        for qc_key, retVal in iteritems(retMap):
                            workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal
                            # only enqueue postprocessing workers to FIFO
                            obj_to_enqueue_to_head_dict[qc_key][0].extend(workSpecsToEnqueueToHead)
                            obj_to_enqueue_to_head_dict[qc_key][1] = max(obj_to_enqueue_to_head_dict[qc_key][1], timeNow_timestamp)
                            obj_to_enqueue_to_head_dict[qc_key][2] = max(obj_to_enqueue_to_head_dict[qc_key][2], fifoCheckInterval)
                        last_event_digest_timestamp = time.time()
                    if to_dispose:
                        # dispose of outdated events of worker update
                        self.monitor_event_disposer(event_lifetime=eventBasedEventLifetime, max_events=eventBasedRemoveMaxEvents)
                        last_event_dispose_timestamp = time.time()
                if to_run_fifo_check:
                    # run with workers from FIFO
                    while time.time() < last_fifo_cycle_timestamp + fifoCheckDuration:
                        sw.reset()
                        n_loops += 1
                        retVal, overhead_time = monitor_fifo.to_check_workers()
                        if overhead_time is not None:
                            n_chunk_peeked_stat += 1
                            sum_overhead_time_stat += overhead_time
                        if retVal:
                            # check fifo size
                            fifo_size = monitor_fifo.size()
                            mainLog.debug('FIFO size is {0}'.format(fifo_size))
                            mainLog.debug('starting run with FIFO')
                            try:
                                obj_gotten = monitor_fifo.get(timeout=1, protective=fifoProtectiveDequeue)
                            except Exception as errStr:
                                mainLog.error('failed to get object from FIFO: {0}'.format(errStr))
                            else:
                                if obj_gotten is not None:
                                    sw_fifo = core_utils.get_stopwatch()
                                    if fifoProtectiveDequeue:
                                        obj_dequeued_id_list.append(obj_gotten.id)
                                    queueName, workSpecsList = obj_gotten.item
                                    mainLog.debug('got a chunk of {0} workers of {1} from FIFO'.format(len(workSpecsList), queueName) + sw.get_elapsed_time())
                                    sw.reset()
                                    configID = None
                                    for workSpecs in workSpecsList:
                                        if configID is None and len(workSpecs) > 0:
                                            configID = workSpecs[0].configID
                                        for workSpec in workSpecs:
                                            if workSpec.pandaid_list is None:
                                                _jobspec_list = workSpec.get_jobspec_list()
                                                if _jobspec_list is not None:
                                                    workSpec.pandaid_list = [j.PandaID for j in workSpec.get_jobspec_list()]
                                                else:
                                                    workSpec.pandaid_list = []
                                                workSpec.force_update('pandaid_list')
                                    retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList, from_fifo=True,
                                                                     config_id=configID, check_source='FIFO')
                                    if retVal is not None:
                                        workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal
                                        qc_key = (queueName, configID)
                                        try:
                                            if len(obj_to_enqueue_dict[qc_key][0]) + len(workSpecsToEnqueue) <= fifoMaxWorkersPerChunk:
                                                obj_to_enqueue_dict[qc_key][0].extend(workSpecsToEnqueue)
                                                obj_to_enqueue_dict[qc_key][1] = max(obj_to_enqueue_dict[qc_key][1], timeNow_timestamp)
                                                obj_to_enqueue_dict[qc_key][2] = max(obj_to_enqueue_dict[qc_key][2], fifoCheckInterval)
                                            else:
                                                to_break = True
                                                remaining_obj_to_enqueue_dict[qc_key] = [workSpecsToEnqueue, timeNow_timestamp, fifoCheckInterval]
                                        except Exception as errStr:
                                            mainLog.error('failed to gather workers for FIFO: {0}'.format(errStr))
                                            to_break = True
                                        try:
                                            if len(obj_to_enqueue_to_head_dict[qc_key][0]) + len(workSpecsToEnqueueToHead) <= fifoMaxWorkersPerChunk:
                                                obj_to_enqueue_to_head_dict[qc_key][0].extend(workSpecsToEnqueueToHead)
                                                obj_to_enqueue_to_head_dict[qc_key][1] = max(obj_to_enqueue_to_head_dict[qc_key][1], timeNow_timestamp)
                                                obj_to_enqueue_to_head_dict[qc_key][2] = max(obj_to_enqueue_to_head_dict[qc_key][2], fifoCheckInterval)
                                            else:
                                                to_break = True
                                                remaining_obj_to_enqueue_to_head_dict[qc_key] = [workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval]
                                        except Exception as errStr:
                                            mainLog.error('failed to gather workers for FIFO head: {0}'.format(errStr))
                                            to_break = True
                                        mainLog.debug('checked {0} workers from FIFO'.format(len(workSpecsList)) + sw.get_elapsed_time())
                                    else:
                                        mainLog.debug('monitor_agent_core returned None. Skipped putting to FIFO')
                                    if sw_fifo.get_elapsed_time_in_sec() > harvester_config.monitor.lockInterval:
                                        mainLog.warning('a single FIFO cycle was longer than lockInterval ' + sw_fifo.get_elapsed_time())
                                    else:
                                        mainLog.debug('done a FIFO cycle' + sw_fifo.get_elapsed_time())
                                        n_loops_hit += 1
                                    if to_break:
                                        break
                                else:
                                    mainLog.debug('got nothing in FIFO')
                        else:
                            mainLog.debug('workers in FIFO too young to check. Skipped')
                            if self.singleMode:
                                break
                            if overhead_time is not None:
                                time.sleep(max(-overhead_time*random.uniform(0.1, 1), adjusted_sleepTime))
                            else:
                                time.sleep(max(fifoCheckDuration*random.uniform(0.1, 1), adjusted_sleepTime))
                    mainLog.debug('run {0} loops, including {1} FIFO cycles'.format(n_loops, n_loops_hit))
                # enqueue to fifo
                sw.reset()
                n_chunk_put = 0
                mainLog.debug('putting worker chunks to FIFO')
                for _dct in (obj_to_enqueue_dict, remaining_obj_to_enqueue_dict):
                    for ((queueName, configID), obj_to_enqueue) in iteritems(_dct):
                        try:
                            workSpecsToEnqueue, timeNow_timestamp, fifoCheckInterval = obj_to_enqueue
                            if workSpecsToEnqueue:
                                score = fifoCheckInterval + timeNow_timestamp
                                monitor_fifo.put((queueName, workSpecsToEnqueue), score)
                                n_chunk_put += 1
                                mainLog.info('put a chunk of {0} workers of {1} to FIFO with score {2}'.format(
                                                len(workSpecsToEnqueue), queueName, score))
                        except Exception as errStr:
                            mainLog.error('failed to put object to FIFO: {0}'.format(errStr))
                mainLog.debug('putting worker chunks to FIFO head')
                for _dct in (obj_to_enqueue_to_head_dict, remaining_obj_to_enqueue_to_head_dict):
                    for ((queueName, configID), obj_to_enqueue_to_head) in iteritems(_dct):
                        try:
                            workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = obj_to_enqueue_to_head
                            if workSpecsToEnqueueToHead:
                                score = fifoCheckInterval + timeNow_timestamp - 2**32
                                monitor_fifo.put((queueName, workSpecsToEnqueueToHead), score)
                                n_chunk_put += 1
                                mainLog.info('put a chunk of {0} workers of {1} to FIFO with score {2}'.format(
                                                len(workSpecsToEnqueueToHead), queueName, score))
                        except Exception as errStr:
                            mainLog.error('failed to put object to FIFO head: {0}'.format(errStr))
                # delete protective dequeued objects
                if fifoProtectiveDequeue and len(obj_dequeued_id_list) > 0:
                    monitor_fifo.delete(ids=obj_dequeued_id_list)
                mainLog.debug('put {0} worker chunks into FIFO'.format(n_chunk_put) + sw.get_elapsed_time())
                # adjust adjusted_sleepTime
                if n_chunk_peeked_stat > 0 and sum_overhead_time_stat > sleepTime:
                    speedup_factor = (sum_overhead_time_stat - sleepTime) / (n_chunk_peeked_stat * harvester_config.monitor.checkInterval)
                    speedup_factor = max(speedup_factor, 0)
                    adjusted_sleepTime = adjusted_sleepTime / (1. + speedup_factor)
                elif n_chunk_peeked_stat == 0 or sum_overhead_time_stat < 0:
                    adjusted_sleepTime = (sleepTime + adjusted_sleepTime)/2
                mainLog.debug('adjusted_sleepTime becomes {0:.3f} sec'.format(adjusted_sleepTime))
                # end run with fifo
                mainLog.debug('ended run with FIFO')
            # time the cycle
            mainLog.debug('done a monitor cycle' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(adjusted_sleepTime):
                mainLog.debug('terminated')
                return

    # core of monitor agent to check workers in workSpecsList of queueName
    def monitor_agent_core(self, lockedBy, queueName, workSpecsList, from_fifo=False, config_id=None, check_source=None):
        tmpQueLog = self.make_logger(_logger, 'id={0} queue={1}'.format(lockedBy, queueName),
                                     method_name='run')
        # check queue
        if not self.queueConfigMapper.has_queue(queueName, config_id):
            tmpQueLog.error('config not found')
            return None
        # get queue
        queueConfig = self.queueConfigMapper.get_queue(queueName, config_id)
        # get plugins
        monCore = self.pluginFactory.get_plugin(queueConfig.monitor)
        messenger = self.pluginFactory.get_plugin(queueConfig.messenger)
        # workspec chunk of active workers
        workSpecsToEnqueue_dict = {}
        workSpecsToEnqueueToHead_dict = {}
        timeNow_timestamp = time.time()
        # get fifoCheckInterval for PQ and other fifo attributes
        try:
            fifoCheckInterval = monCore.fifoCheckInterval
        except Exception:
            if hasattr(harvester_config.monitor, 'fifoCheckInterval'):
                fifoCheckInterval = harvester_config.monitor.fifoCheckInterval
            else:
                fifoCheckInterval = harvester_config.monitor.checkInterval
        try:
            forceEnqueueInterval = harvester_config.monitor.fifoForceEnqueueInterval
        except AttributeError:
            forceEnqueueInterval = 3600
        try:
            fifoMaxPreemptInterval = harvester_config.monitor.fifoMaxPreemptInterval
        except AttributeError:
            fifoMaxPreemptInterval = 60
        # check workers
        allWorkers = [item for sublist in workSpecsList for item in sublist]
        tmpQueLog.debug('checking {0} workers'.format(len(allWorkers)))
        tmpStat, tmpRetMap = self.check_workers(monCore, messenger, allWorkers, queueConfig, tmpQueLog, from_fifo)
        if tmpStat:
            # loop over all worker chunks
            tmpQueLog.debug('update jobs and workers')
            iWorker = 0
            for workSpecs in workSpecsList:
                jobSpecs = None
                pandaIDsList = []
                eventsToUpdateList = []
                filesToStageOutList = dict()
                isCheckedList = []
                mapType = workSpecs[0].mapType
                # loop over workSpecs
                for workSpec in workSpecs:
                    tmpLog = self.make_logger(_logger,
                                              'id={0} workerID={1} from={2}'.format(
                                                    lockedBy, workSpec.workerID, check_source),
                                                    method_name='run')
                    tmpOut = tmpRetMap[workSpec.workerID]
                    oldStatus = tmpOut['oldStatus']
                    newStatus = tmpOut['newStatus']
                    monStatus = tmpOut['monStatus']
                    diagMessage = tmpOut['diagMessage']
                    workAttributes = tmpOut['workAttributes']
                    eventsToUpdate = tmpOut['eventsToUpdate']
                    filesToStageOut = tmpOut['filesToStageOut']
                    eventsRequestParams = tmpOut['eventsRequestParams']
                    nJobsToReFill = tmpOut['nJobsToReFill']
                    pandaIDs = tmpOut['pandaIDs']
                    isChecked = tmpOut['isChecked']
                    tmpStr = 'newStatus={0} monitoredStatus={1} diag={2} '
                    tmpStr += 'postProcessed={3} files={4}'
                    tmpLog.debug(tmpStr.format(newStatus, monStatus, diagMessage,
                                               workSpec.is_post_processed(),
                                               str(filesToStageOut)))
                    iWorker += 1
                    # check status
                    if newStatus not in WorkSpec.ST_LIST:
                        tmpLog.error('unknown status={0}'.format(newStatus))
                        return
                    # update worker
                    workSpec.set_status(newStatus)
                    workSpec.set_work_attributes(workAttributes)
                    workSpec.set_dialog_message(diagMessage)
                    if isChecked:
                        workSpec.checkTime = datetime.datetime.utcnow()
                    isCheckedList.append(isChecked)
                    if monStatus == WorkSpec.ST_failed:
                        if not workSpec.has_pilot_error():
                            workSpec.set_pilot_error(PilotErrors.ERR_GENERALERROR, diagMessage)
                    elif monStatus == WorkSpec.ST_cancelled:
                        if not workSpec.has_pilot_error():
                            workSpec.set_pilot_error(PilotErrors.ERR_PANDAKILL, diagMessage)
                    if monStatus in [WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled]:
                        workSpec.set_work_params({'finalMonStatus': monStatus})
                    # request events
                    if eventsRequestParams != {}:
                        workSpec.eventsRequest = WorkSpec.EV_requestEvents
                        workSpec.eventsRequestParams = eventsRequestParams
                    # jobs to refill
                    if nJobsToReFill is not None:
                        workSpec.nJobsToReFill = nJobsToReFill
                    # get associated jobs for the worker chunk
                    if workSpec.hasJob == 1 and jobSpecs is None:
                        jobSpecs = self.dbProxy.get_jobs_with_worker_id(workSpec.workerID,
                                                                        None,
                                                                        only_running=True,
                                                                        slim=True)
                    # pandaIDs for push
                    pandaIDsList.append(pandaIDs)
                    if len(eventsToUpdate) > 0:
                        eventsToUpdateList.append(eventsToUpdate)
                    if len(filesToStageOut) > 0:
                        filesToStageOutList[workSpec.workerID] = filesToStageOut
                    # apfmon status update
                    if newStatus != oldStatus:
                        tmpQueLog.debug('newStatus: {0} monStatus: {1} oldStatus: {2} workSpecStatus: {3}'.
                                        format(newStatus, monStatus, oldStatus, workSpec.status))
                        self.apfmon.update_worker(workSpec, monStatus)

                # lock workers for fifo
                if from_fifo:
                    # collect some attributes to be updated when workers are locked
                    worker_id_list = dict()
                    for workSpec, isChecked in zip(workSpecs, isCheckedList):
                        attrs = dict()
                        if isChecked:
                            attrs['checkTime'] = workSpec.checkTime
                            workSpec.force_not_update('checkTime')
                        if workSpec.has_updated_attributes():
                            attrs['lockedBy'] = lockedBy
                            workSpec.lockedBy = lockedBy
                            workSpec.force_not_update('lockedBy')
                        else:
                            attrs['lockedBy'] = None
                        worker_id_list[workSpec.workerID] = attrs
                    temRetLockWorker = self.dbProxy.lock_workers(worker_id_list,
                                                                 harvester_config.monitor.lockInterval)
                    # skip if not locked
                    if not temRetLockWorker:
                        continue
                # update jobs and workers
                if jobSpecs is not None and len(jobSpecs) > 0:
                    tmpQueLog.debug('updating {0} jobs with {1} workers'.format(len(jobSpecs), len(workSpecs)))
                    core_utils.update_job_attributes_with_workers(mapType, jobSpecs, workSpecs,
                                                                  filesToStageOutList, eventsToUpdateList)
                # update local database
                tmpRet = self.dbProxy.update_jobs_workers(jobSpecs, workSpecs, lockedBy, pandaIDsList)
                if not tmpRet:
                    for workSpec in workSpecs:
                        tmpLog = self.make_logger(_logger,
                                                  'id={0} workerID={1}'.format(lockedBy, workSpec.workerID),
                                                  method_name='run')
                        if from_fifo:
                            tmpLog.info('failed to update the DB. Maybe locked by other thread running with DB')
                        else:
                            if workSpec.status in [WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled, WorkSpec.ST_missed]:
                                tmpLog.info('worker already in final status. Skipped')
                            else:
                                tmpLog.error('failed to update the DB. lockInterval may be too short')
                else:
                    if jobSpecs is not None:
                        for jobSpec in jobSpecs:
                            tmpLog = self.make_logger(_logger,
                                                      'id={0} PandaID={1}'.format(lockedBy, jobSpec.PandaID),
                                                      method_name='run')
                            tmpLog.debug('new status={0} subStatus={1} status_in_metadata={2}'.format(
                                jobSpec.status,
                                jobSpec.subStatus,
                                jobSpec.get_job_status_from_attributes()))
                # send ACK to workers for events and files
                if len(eventsToUpdateList) > 0 or len(filesToStageOutList) > 0:
                    for workSpec in workSpecs:
                        try:
                            messenger.acknowledge_events_files(workSpec)
                        except Exception:
                            core_utils.dump_error_message(tmpQueLog)
                            tmpQueLog.error('failed to send ACK to workerID={0}'.format(workSpec.workerID))
                # active workers for fifo
                if self.monitor_fifo.enabled and workSpecs:
                    workSpec = workSpecs[0]
                    tmpOut = tmpRetMap[workSpec.workerID]
                    newStatus = tmpOut['newStatus']
                    monStatus = tmpOut['monStatus']
                    if newStatus in [WorkSpec.ST_submitted, WorkSpec.ST_running, WorkSpec.ST_idle] \
                        and workSpec.mapType != WorkSpec.MT_MultiWorkers \
                            and workSpec.workAttributes is not None:
                        timeNow = datetime.datetime.utcnow()
                        timeNow_timestamp = time.time()
                        # get lastCheckAt
                        _bool, lastCheckAt = workSpec.get_work_params('lastCheckAt')
                        try:
                            last_check_period = timeNow_timestamp - lastCheckAt
                        except TypeError:
                            last_check_period = forceEnqueueInterval + 1.0
                        # get lastForceEnqueueAt
                        _bool, lastForceEnqueueAt = workSpec.get_work_params('lastForceEnqueueAt')
                        if not (_bool and lastForceEnqueueAt is not None):
                            lastForceEnqueueAt = 0
                        # notification
                        intolerable_delay = max(forceEnqueueInterval*2, harvester_config.monitor.checkInterval * 4)
                        if _bool and lastCheckAt is not None \
                                and last_check_period > harvester_config.monitor.checkInterval \
                                and timeNow_timestamp - harvester_config.monitor.checkInterval > self.startTimestamp:
                            if last_check_period > intolerable_delay:
                                tmpQueLog.error('last check period of workerID={0} is {1} sec, intolerably longer than monitor checkInterval. Will NOT enqueue worker by force. Please check why monitor checks worker slowly'.format(
                                                    workSpec.workerID, last_check_period))
                            else:
                                tmpQueLog.warning('last check period of workerID={0} is {1} sec, longer than monitor checkInterval'.format(
                                                    workSpec.workerID, last_check_period))
                        # preparation to enqueue fifo
                        if (from_fifo) \
                            or (not from_fifo
                                and timeNow_timestamp - harvester_config.monitor.sleepTime > self.startTimestamp
                                and last_check_period > forceEnqueueInterval
                                and last_check_period < intolerable_delay
                                and timeNow_timestamp - lastForceEnqueueAt > 86400 + forceEnqueueInterval):
                            if not from_fifo:
                                # in DB cycle
                                tmpQueLog.warning('last check period of workerID={0} is {1} sec, longer than monitor forceEnqueueInterval. Enqueue the worker by force'.format(
                                                    workSpec.workerID, last_check_period))
                                workSpec.set_work_params({'lastForceEnqueueAt': timeNow_timestamp})
                            workSpec.set_work_params({'lastCheckAt': timeNow_timestamp})
                            workSpec.lockedBy = None
                            workSpec.force_update('lockedBy')
                            if monStatus in [WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled]:
                                # for post-processing
                                _bool, startFifoPreemptAt = workSpec.get_work_params('startFifoPreemptAt')
                                if not _bool or startFifoPreemptAt is None:
                                    startFifoPreemptAt = timeNow_timestamp
                                    workSpec.set_work_params({'startFifoPreemptAt': startFifoPreemptAt})
                                tmpQueLog.debug('workerID={0} , startFifoPreemptAt: {1}'.format(workSpec.workerID, startFifoPreemptAt))
                                if timeNow_timestamp - startFifoPreemptAt < fifoMaxPreemptInterval:
                                    workSpecsToEnqueueToHead_dict[workSpec.workerID] = workSpecs
                                else:
                                    workSpec.set_work_params({'startFifoPreemptAt': timeNow_timestamp})
                                    workSpec.modificationTime = timeNow
                                    workSpec.force_update('modificationTime')
                                    workSpecsToEnqueue_dict[workSpec.workerID] = workSpecs
                            else:
                                workSpec.modificationTime = timeNow
                                workSpec.force_update('modificationTime')
                                workSpecsToEnqueue_dict[workSpec.workerID] = workSpecs
        else:
            tmpQueLog.error('failed to check workers')
        workSpecsToEnqueue = list(workSpecsToEnqueue_dict.values())
        workSpecsToEnqueueToHead = list(workSpecsToEnqueueToHead_dict.values())
        retVal = workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval
        tmpQueLog.debug('done')
        return retVal

    # wrapper for checkWorkers
    def check_workers(self, mon_core, messenger, all_workers, queue_config, tmp_log, from_fifo):
        # check timeout value
        try:
            checkTimeout = mon_core.checkTimeout
        except Exception:
            try:
                checkTimeout = harvester_config.monitor.checkTimeout
            except Exception:
                checkTimeout = None
        try:
            workerQueueTimeLimit = harvester_config.monitor.workerQueueTimeLimit
        except AttributeError:
            workerQueueTimeLimit = 172800
        workersToCheck = []
        thingsToPostProcess = []
        retMap = dict()
        for workSpec in all_workers:
            eventsRequestParams = {}
            eventsToUpdate = []
            pandaIDs = []
            workStatus = None
            workAttributes = None
            filesToStageOut = []
            nJobsToReFill = None
            if workSpec.has_work_params('finalMonStatus'):
                # to post-process
                _bool, finalMonStatus = workSpec.get_work_params('finalMonStatus')
                _thing = (workSpec, (finalMonStatus, ''))
                thingsToPostProcess.append(_thing)
            else:
                # job-level late binding
                if workSpec.hasJob == 0 and workSpec.mapType != WorkSpec.MT_NoJob:
                    # check if job is requested
                    jobRequested = messenger.job_requested(workSpec)
                    if jobRequested:
                        # set ready when job is requested
                        workStatus = WorkSpec.ST_ready
                    else:
                        workStatus = workSpec.status
                elif workSpec.nJobsToReFill in [0, None]:
                    # check if job is requested to refill free slots
                    jobRequested = messenger.job_requested(workSpec)
                    if jobRequested:
                        nJobsToReFill = jobRequested
                    workersToCheck.append(workSpec)
                else:
                    workersToCheck.append(workSpec)
            # add
            retMap[workSpec.workerID] = {'oldStatus': workSpec.status,
                                         'newStatus': workStatus,
                                         'monStatus': workStatus,
                                         'workAttributes': workAttributes,
                                         'filesToStageOut': filesToStageOut,
                                         'eventsRequestParams': eventsRequestParams,
                                         'eventsToUpdate': eventsToUpdate,
                                         'diagMessage': '',
                                         'pandaIDs': pandaIDs,
                                         'nJobsToReFill': nJobsToReFill,
                                         'isChecked': True}
        # check workers
        tmp_log.debug('checking workers with plugin')
        try:
            if workersToCheck:
                tmpStat, tmpOut = mon_core.check_workers(workersToCheck)
                if not tmpStat:
                    tmp_log.error('failed to check workers with: {0}'.format(tmpOut))
                    workersToCheck = []
                    tmpOut = []
                else:
                    tmp_log.debug('checked')
            else:
                tmp_log.debug('Nothing to be checked with plugin')
                tmpOut = []
            timeNow = datetime.datetime.utcnow()
            for workSpec, (newStatus, diagMessage) in itertools.chain(
                    zip(workersToCheck, tmpOut), thingsToPostProcess):
                workerID = workSpec.workerID
                tmp_log.debug('Going to check workerID={0}'.format(workerID))
                pandaIDs = []
                if workerID in retMap:
                    # failed to check status
                    if newStatus is None:
                        tmp_log.warning('Failed to check workerID={0} with {1}'.format(workerID, diagMessage))
                        retMap[workerID]['isChecked'] = False
                        # set status
                        if workSpec.checkTime is not None and checkTimeout is not None and \
                                timeNow - workSpec.checkTime > datetime.timedelta(seconds=checkTimeout):
                            # kill due to timeout
                            tmp_log.debug('kill workerID={0} due to consecutive check failures'.format(workerID))
                            self.dbProxy.kill_worker(workSpec.workerID)
                            newStatus = WorkSpec.ST_cancelled
                            diagMessage = 'Killed by Harvester due to consecutive worker check failures. ' + diagMessage
                            workSpec.set_pilot_error(PilotErrors.ERR_FAILEDBYSERVER, diagMessage)
                        else:
                            # use original status
                            newStatus = workSpec.status
                    # request kill
                    if messenger.kill_requested(workSpec):
                        tmp_log.debug('kill workerID={0} as requested'.format(workerID))
                        self.dbProxy.kill_worker(workSpec.workerID)
                    # stuck queuing for too long
                    if workSpec.status == WorkSpec.ST_submitted \
                        and timeNow > workSpec.submitTime + datetime.timedelta(seconds=workerQueueTimeLimit):
                        tmp_log.debug('kill workerID={0} due to queuing longer than {1} seconds'.format(
                                        workerID, workerQueueTimeLimit))
                        self.dbProxy.kill_worker(workSpec.workerID)
                        diagMessage = 'Killed by Harvester due to worker queuing too long. ' + diagMessage
                        workSpec.set_pilot_error(PilotErrors.ERR_FAILEDBYSERVER, diagMessage)
                    # expired heartbeat - only when requested in the configuration
                    try:
                        # check if the queue configuration requires checking for worker heartbeat
                        worker_heartbeat_limit = int(queue_config.messenger['worker_heartbeat'])
                    except (AttributeError, KeyError):
                        worker_heartbeat_limit = None
                    tmp_log.debug(
                        'workerID={0} heartbeat limit is configured to {1}'.format(workerID,
                                                                                   worker_heartbeat_limit))
                    if worker_heartbeat_limit:
                        if messenger.is_alive(workSpec, worker_heartbeat_limit):
                            tmp_log.debug('heartbeat for workerID={0} is valid'.format(workerID))
                        else:
                            tmp_log.debug('heartbeat for workerID={0} expired: sending kill request'.format(
                                workerID))
                            self.dbProxy.kill_worker(workSpec.workerID)
                            diagMessage = 'Killed by Harvester due to worker heartbeat expired. ' + diagMessage
                            workSpec.set_pilot_error(PilotErrors.ERR_FAILEDBYSERVER, diagMessage)
                    # get work attributes
                    workAttributes = messenger.get_work_attributes(workSpec)
                    retMap[workerID]['workAttributes'] = workAttributes
                    # get output files
                    filesToStageOut = messenger.get_files_to_stage_out(workSpec)
                    retMap[workerID]['filesToStageOut'] = filesToStageOut
                    # get events to update
                    if workSpec.eventsRequest in [WorkSpec.EV_useEvents, WorkSpec.EV_requestEvents]:
                        eventsToUpdate = messenger.events_to_update(workSpec)
                        retMap[workerID]['eventsToUpdate'] = eventsToUpdate
                    # request events
                    if workSpec.eventsRequest == WorkSpec.EV_useEvents:
                        eventsRequestParams = messenger.events_requested(workSpec)
                        retMap[workerID]['eventsRequestParams'] = eventsRequestParams
                    # get PandaIDs for pull model
                    if workSpec.mapType == WorkSpec.MT_NoJob:
                        pandaIDs = messenger.get_panda_ids(workSpec)
                    retMap[workerID]['pandaIDs'] = pandaIDs
                    # keep original new status
                    retMap[workerID]['monStatus'] = newStatus
                    # set running or idle while there are events to update or files to stage out
                    if newStatus in [WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled]:
                        isOK = True
                        if len(retMap[workerID]['filesToStageOut']) > 0 or \
                                len(retMap[workerID]['eventsToUpdate']) > 0:
                            if workSpec.status == WorkSpec.ST_running:
                                newStatus = WorkSpec.ST_running
                            else:
                                newStatus = WorkSpec.ST_idle
                        elif not workSpec.is_post_processed():
                            if not queue_config.is_no_heartbeat_status(newStatus):
                                # post processing unless heartbeat is suppressed
                                jobSpecs = self.dbProxy.get_jobs_with_worker_id(workSpec.workerID,
                                                                                None, True,
                                                                                only_running=True,
                                                                                slim=True)
                                # post processing
                                tmpStat = messenger.post_processing(workSpec, jobSpecs, workSpec.mapType)
                                if tmpStat is None:
                                    # retry
                                    ppTimeOut = getattr(harvester_config.monitor, 'postProcessTimeout', 0)
                                    if ppTimeOut > 0:
                                        timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=ppTimeOut)
                                        if workSpec.endTime is None or workSpec.endTime > timeLimit:
                                            isOK = False
                                            # set end time just in case for timeout
                                            workSpec.set_end_time()
                            if isOK:
                                workSpec.post_processed()
                            if workSpec.status == WorkSpec.ST_running:
                                newStatus = WorkSpec.ST_running
                            else:
                                newStatus = WorkSpec.ST_idle
                        # reset modification time to immediately trigger subsequent lookup
                        if isOK and not self.monitor_fifo.enabled:
                            workSpec.trigger_next_lookup()
                    retMap[workerID]['newStatus'] = newStatus
                    retMap[workerID]['diagMessage'] = diagMessage
                else:
                    tmp_log.debug('workerID={0} not in retMap'.format(workerID))
            return True, retMap
        except Exception:
            core_utils.dump_error_message(tmp_log)
            return False, None

    # ask plugin for workers to update, get workspecs, and queue the event
    def monitor_event_deliverer(self, time_window):
        tmpLog = self.make_logger(_logger, 'id=monitor-{0}'.format(self.get_pid()), method_name='monitor_event_deliverer')
        tmpLog.debug('start')
        for mon_core in self.eventBasedMonCoreList:
            tmpLog.debug('run with {0}'.format(mon_core.__class__.__name__))
            worker_update_list = mon_core.report_updated_workers(time_window=time_window)
            for workerID, updateTimestamp in worker_update_list:
                retVal = self.monitor_event_fifo.putbyid(id=workerID, item=True, score=updateTimestamp)
                if not retVal:
                    retVal = self.monitor_event_fifo.update(id=workerID, score=updateTimestamp, temporary=0, cond_score='gt')
                    if retVal:
                        tmpLog.debug('updated event with workerID={0}'.format(workerID))
                    else:
                        tmpLog.debug('event with workerID={0} is already up to date. Skipped'.format(workerID))
                else:
                    tmpLog.debug('put event with workerID={0}'.format(workerID))
        tmpLog.debug('done')

    # get events and check workers
    def monitor_event_digester(self, locked_by, max_events):
        tmpLog = self.make_logger(_logger, 'id=monitor-{0}'.format(self.get_pid()), method_name='monitor_event_digester')
        tmpLog.debug('start')
        timeNow_timestamp = time.time()
        retMap = {}
        obj_gotten_list = self.monitor_event_fifo.getmany(mode='first', count=max_events, protective=True)
        workerID_list = [ obj_gotten.id for obj_gotten in obj_gotten_list ]
        tmpLog.debug('got {0} worker events'.format(len(workerID_list)))
        if len(workerID_list) > 0:
            updated_workers_dict = self.dbProxy.get_workers_from_ids(workerID_list)
            tmpLog.debug('got workspecs for worker events')
            for queueName, _val in iteritems(updated_workers_dict):
                for configID, workSpecsList in iteritems(_val):
                    qc_key = (queueName, configID)
                    tmpLog.debug('checking workers of queueName={0} configID={1}'.format(*qc_key))
                    retVal = self.monitor_agent_core(locked_by, queueName, workSpecsList,
                                                        from_fifo=True, config_id=configID,
                                                        check_source='Event')
                    retMap[qc_key] = retVal
        tmpLog.debug('done')
        return retMap

    # remove outdated events
    def monitor_event_disposer(self, event_lifetime, max_events):
        tmpLog = self.make_logger(_logger, 'id=monitor-{0}'.format(self.get_pid()), method_name='monitor_event_disposer')
        tmpLog.debug('start')
        timeNow_timestamp = time.time()
        obj_gotten_list = self.monitor_event_fifo.getmany(mode='first',
                                                            maxscore=(timeNow_timestamp-event_lifetime),
                                                            count=max_events, temporary=True)
        tmpLog.debug('removed {0} events'.format(len(obj_gotten_list)))
        n_events = self.monitor_event_fifo.size()
        tmpLog.debug('now {0} events in monitor-event fifo'.format(n_events))
        tmpLog.debug('done')
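
Taken together, monitor_event_deliverer, monitor_event_digester and monitor_event_disposer implement a deliver/digest/dispose cycle around a scored FIFO: the deliverer queues one event per updated worker (scored by update timestamp), the digester drains up to max_events of them and re-checks the corresponding workers, and the disposer drops events older than event_lifetime. Below is a minimal, self-contained sketch of that flow using a toy in-memory stand-in for the FIFO; ToyEventFifo is purely illustrative, while the real Harvester FIFO plugins are backed by external stores and expose more options.

import time
from collections import namedtuple

_Event = namedtuple('_Event', 'id item score')

class ToyEventFifo(object):
    """Toy in-memory stand-in for the monitor-event FIFO (illustration only)."""

    def __init__(self):
        self._events = {}  # workerID -> _Event

    def putbyid(self, id, item, score):
        # mimic putbyid: refuse to insert if an event for this id already exists
        if id in self._events:
            return False
        self._events[id] = _Event(id, item, score)
        return True

    def getmany(self, mode='first', count=None, maxscore=None, **kwargs):
        # return (and remove) events ordered by score, optionally capped and filtered
        events = sorted(self._events.values(), key=lambda e: e.score)
        if maxscore is not None:
            events = [e for e in events if e.score <= maxscore]
        if count is not None:
            events = events[:count]
        for e in events:
            del self._events[e.id]
        return events

    def size(self):
        return len(self._events)

fifo = ToyEventFifo()
fifo.putbyid(id=101, item=True, score=time.time())               # deliverer side
recent = fifo.getmany(mode='first', count=10)                    # digester side
stale = fifo.getmany(mode='first', maxscore=time.time() - 3600)  # disposer side
print([e.id for e in recent], len(stale), fifo.size())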
Code example #5
class CredManager(AgentBase):

    # constructor
    def __init__(self, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.pluginFactory = PluginFactory()
        self.dbProxy = DBProxy()
        # get module and class names
        moduleNames = self.get_list(harvester_config.credmanager.moduleName)
        classNames = self.get_list(harvester_config.credmanager.className)
        # file names of original certificates
        if hasattr(harvester_config.credmanager, 'inCertFile'):
            inCertFiles = self.get_list(
                harvester_config.credmanager.inCertFile)
        else:
            inCertFiles = self.get_list(harvester_config.credmanager.certFile)
        # file names of certificates to be generated
        if hasattr(harvester_config.credmanager, 'outCertFile'):
            outCertFiles = self.get_list(
                harvester_config.credmanager.outCertFile)
        else:
            # use the file name of the certificate for panda connection as output name
            outCertFiles = self.get_list(harvester_config.pandacon.cert_file)
        # VOMS
        vomses = self.get_list(harvester_config.credmanager.voms)
        # get plugin
        self.exeCores = []
        for moduleName, className, inCertFile, outCertFile, voms in \
                zip(moduleNames, classNames, inCertFiles, outCertFiles, vomses):
            pluginPar = {}
            pluginPar['module'] = moduleName
            pluginPar['name'] = className
            pluginPar['inCertFile'] = inCertFile
            pluginPar['outCertFile'] = outCertFile
            pluginPar['voms'] = voms
            exeCore = self.pluginFactory.get_plugin(pluginPar)
            self.exeCores.append(exeCore)

    # get list
    def get_list(self, data):
        if isinstance(data, list):
            return data
        else:
            return [data]

    # main loop
    def run(self):
        while True:
            # execute
            self.execute()
            # check if being terminated
            if self.terminated(harvester_config.credmanager.sleepTime,
                               randomize=False):
                return

    # main
    def execute(self):
        # get lock
        locked = self.dbProxy.get_process_lock(
            'credmanager', self.get_pid(),
            harvester_config.credmanager.sleepTime)
        if not locked:
            return
        # loop over all plugins
        for exeCore in self.exeCores:
            # do nothing
            if exeCore is None:
                continue

            # make logger
            mainLog = self.make_logger(
                _logger,
                "{0} {1} {2}".format(exeCore.__class__.__name__,
                                     exeCore.inCertFile, exeCore.outCertFile),
                method_name='execute')
            try:
                # check credential
                mainLog.debug('check credential')
                isValid = exeCore.check_credential()
                if isValid:
                    mainLog.debug('valid')
                else:
                    # renew it if necessary
                    mainLog.debug('invalid')
                    mainLog.debug('renew credential')
                    tmpStat, tmpOut = exeCore.renew_credential()
                    if not tmpStat:
                        mainLog.error('failed : {0}'.format(tmpOut))
                        continue
            except Exception:
                core_utils.dump_error_message(mainLog)
            mainLog.debug('done')
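
Note that execute() above only relies on two plugin methods: check_credential(), returning a boolean, and renew_credential(), returning a (status, diagnostics) pair. The following is a minimal sketch of a plugin honouring that contract; the class name, the 12-hour freshness threshold and the touch-the-file "renewal" are placeholders, and a real plugin would subclass Harvester's PluginBase and regenerate the proxy (e.g. via voms-proxy-init).

import os
import time

class DummyCredManagerPlugin(object):
    """Placeholder credential-manager plugin; only the interface matters here."""

    def __init__(self, **params):
        # these attributes mirror the pluginPar dictionary built in __init__ above
        self.inCertFile = params.get('inCertFile')
        self.outCertFile = params.get('outCertFile')
        self.voms = params.get('voms')

    def check_credential(self):
        # consider the credential valid if the output file exists and is fresh enough
        if not os.path.isfile(self.outCertFile):
            return False
        return time.time() - os.path.getmtime(self.outCertFile) < 12 * 3600

    def renew_credential(self):
        # a real plugin would regenerate the proxy here; this just touches the file
        try:
            open(self.outCertFile, 'a').close()
            return True, ''
        except Exception as exc:
            return False, str(exc)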
Code example #6
class CommandManager(AgentBase):
    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.db_proxy = DBProxy()
        self.communicator = communicator
        self.queueConfigMapper = queue_config_mapper
        self.nodeName = socket.gethostname()
        self.lastHeartbeat = None

    # set single mode
    def set_single_mode(self, single_mode):
        self.singleMode = single_mode

    def convert_to_command_specs(self, commands):
        """
        Generates a list of CommandSpec objects
        """
        command_specs = []
        for command in commands:
            command_spec = CommandSpec()
            command_spec.convert_command_json(command)
            for comStr, receiver in iteritems(CommandSpec.receiver_map):
                if command_spec.command.startswith(comStr):
                    command_spec.receiver = receiver
                    break
            if command_spec.receiver is not None:
                command_specs.append(command_spec)
        return command_specs

    def run(self):
        """
        main
        """
        main_log = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run')
        bulk_size = harvester_config.commandmanager.commands_bulk_size
        locked = self.db_proxy.get_process_lock('commandmanager', self.get_pid(),
                                                harvester_config.commandmanager.sleepTime)
        if locked:
            # send command list to be received
            siteNames = set()
            commandList = []
            for queueName, queueConfig in iteritems(self.queueConfigMapper.get_active_queues()):
                if queueConfig is None or queueConfig.runMode != 'slave':
                    continue
                # one command for all queues in one site
                if queueConfig.siteName not in siteNames:
                    commandItem = {'command': CommandSpec.COM_reportWorkerStats,
                                   'computingSite': queueConfig.siteName,
                                   'resourceType': queueConfig.resourceType
                                   }
                    commandList.append(commandItem)
                siteNames.add(queueConfig.siteName)
                # one command for each queue
                commandItem = {'command': CommandSpec.COM_setNWorkers,
                               'computingSite': queueConfig.siteName,
                               'resourceType': queueConfig.resourceType
                               }
                commandList.append(commandItem)
            data = {'startTime': datetime.datetime.utcnow(),
                    'sw_version': panda_pkg_info.release_version,
                    'commit_stamp': commit_timestamp.timestamp}
            if len(commandList) > 0:
                main_log.debug('sending command list to be received')
                data['commands'] = commandList
            self.communicator.is_alive(data)

        # main loop
        while True:
            # get lock
            locked = self.db_proxy.get_process_lock('commandmanager', self.get_pid(),
                                                    harvester_config.commandmanager.sleepTime)
            if locked or self.singleMode:

                main_log.debug('polling commands loop')

                # send heartbeat
                if self.lastHeartbeat is None \
                        or self.lastHeartbeat < datetime.datetime.utcnow() - datetime.timedelta(minutes=10):
                    self.lastHeartbeat = datetime.datetime.utcnow()
                    self.communicator.is_alive({})

                continuous_loop = True  # as long as there are commands, retrieve them

                while continuous_loop:

                    # get commands from panda server for this harvester instance
                    commands = self.communicator.get_commands(bulk_size)
                    main_log.debug('got {0} commands (bulk size: {1})'.format(len(commands), bulk_size))
                    command_specs = self.convert_to_command_specs(commands)

                    # cache commands in internal DB
                    self.db_proxy.store_commands(command_specs)
                    main_log.debug('cached {0} commands in internal DB'.format(len(command_specs)))

                    # retrieve processed commands from harvester cache
                    command_ids_ack = self.db_proxy.get_commands_ack()

                    for shard in core_utils.create_shards(command_ids_ack, bulk_size):
                        # post acknowledgements to panda server
                        self.communicator.ack_commands(shard)
                        main_log.debug('acknowledged {0} commands to panda server'.format(len(shard)))

                        # clean acknowledged commands
                        self.db_proxy.clean_commands_by_id(shard)

                    # clean commands that have been processed and do not need acknowledgement
                    self.db_proxy.clean_processed_commands()

                    # if we didn't collect the full bulk, give panda server a break
                    if len(commands) < bulk_size:
                        continuous_loop = False

            # check if being terminated
            if self.terminated(harvester_config.commandmanager.sleepTime, randomize=False):
                main_log.debug('terminated')
                return
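
convert_to_command_specs() above routes each incoming command by matching the command string against prefixes in CommandSpec.receiver_map and silently drops commands without a matching receiver. Below is the same prefix-routing idea in isolation, with a stand-in map; the entries are illustrative and not the real receiver_map contents.

# stand-in mapping; the real one lives in CommandSpec.receiver_map
receiver_map = {
    'SET_N_WORKERS': 'submitter',
    'KILL_WORKERS': 'sweeper',
    'REPORT_WORKER_STATS': 'propagator',
}

def route_command(command_string):
    """Return the receiver for a command, or None if no prefix matches."""
    for prefix, receiver in receiver_map.items():
        if command_string.startswith(prefix):
            return receiver
    return None

print(route_command('KILL_WORKERS:SITE_A'))  # -> 'sweeper'
print(route_command('UNKNOWN_COMMAND'))      # -> None (dropped, as in the agent)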
Code example #7
File: sweeper.py Project: dougbenjamin/harvester
class Sweeper(AgentBase):
    # constructor
    def __init__(self, queue_config_mapper, single_mode=False):
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.queueConfigMapper = queue_config_mapper
        self.pluginFactory = PluginFactory()

    # main loop
    def run(self):
        lockedBy = 'sweeper-{0}'.format(self.get_pid())
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger,
                                       'id={0}'.format(lockedBy),
                                       method_name='run')
            # get commands to kill
            sw_getcomm = core_utils.get_stopwatch()
            mainLog.debug('try to get commands')
            comStr = CommandSpec.COM_killWorkers
            commandSpecs = self.dbProxy.get_commands_for_receiver(
                'sweeper', comStr)
            mainLog.debug('got {0} {1} commands'.format(
                len(commandSpecs), comStr))
            for commandSpec in commandSpecs:
                n_to_kill = self.dbProxy.kill_workers_by_query(
                    commandSpec.params)
                mainLog.debug('will kill {0} workers with {1}'.format(
                    n_to_kill, commandSpec.params))
            mainLog.debug('done handling commands' +
                          sw_getcomm.get_elapsed_time())
            # killing stage
            sw_kill = core_utils.get_stopwatch()
            mainLog.debug('try to get workers to kill')
            # get workers to kill
            workersToKill = self.dbProxy.get_workers_to_kill(
                harvester_config.sweeper.maxWorkers,
                harvester_config.sweeper.checkInterval)
            mainLog.debug('got {0} queues to kill workers'.format(
                len(workersToKill)))
            # loop over all workers
            sw = core_utils.get_stopwatch()
            for queueName, configIdWorkSpecList in iteritems(workersToKill):
                for configID, workspec_list in iteritems(configIdWorkSpecList):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(
                            queueName, configID):
                        mainLog.error(
                            'queue config for {0}/{1} not found'.format(
                                queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(
                        queueName, configID)
                    try:
                        sweeperCore = self.pluginFactory.get_plugin(
                            queueConfig.sweeper)
                    except Exception:
                        mainLog.error(
                            'failed to launch sweeper plugin for {0}/{1}'.
                            format(queueName, configID))
                        core_utils.dump_error_message(mainLog)
                        continue
                    sw.reset()
                    n_workers = len(workspec_list)
                    try:
                        # try bulk method
                        tmpLog = self.make_logger(_logger,
                                                  'id={0}'.format(lockedBy),
                                                  method_name='run')
                        tmpLog.debug('start killing')
                        tmpList = sweeperCore.kill_workers(workspec_list)
                    except AttributeError:
                        # fall back to single-worker method
                        for workspec in workspec_list:
                            tmpLog = self.make_logger(_logger,
                                                      'workerID={0}'.format(
                                                          workspec.workerID),
                                                      method_name='run')
                            try:
                                tmpLog.debug('start killing one worker')
                                tmpStat, tmpOut = sweeperCore.kill_worker(
                                    workspec)
                                tmpLog.debug(
                                    'done killing with status={0} diag={1}'.
                                    format(tmpStat, tmpOut))
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                    except Exception:
                        core_utils.dump_error_message(mainLog)
                    else:
                        # bulk method
                        n_killed = 0
                        for workspec, (tmpStat,
                                       tmpOut) in zip(workspec_list, tmpList):
                            tmpLog.debug(
                                'done killing workerID={0} with status={1} diag={2}'
                                .format(workspec.workerID, tmpStat, tmpOut))
                            if tmpStat:
                                n_killed += 1
                        tmpLog.debug('killed {0}/{1} workers'.format(
                            n_killed, n_workers))
                    mainLog.debug(
                        'done killing {0} workers'.format(n_workers) +
                        sw.get_elapsed_time())
            mainLog.debug('done all killing' + sw_kill.get_elapsed_time())
            # cleanup stage
            sw_cleanup = core_utils.get_stopwatch()
            # timeout for missed
            try:
                keepMissed = harvester_config.sweeper.keepMissed
            except Exception:
                keepMissed = 24
            try:
                keepPending = harvester_config.sweeper.keepPending
            except Exception:
                keepPending = 24
            # get workers for cleanup
            statusTimeoutMap = {
                'finished': harvester_config.sweeper.keepFinished,
                'failed': harvester_config.sweeper.keepFailed,
                'cancelled': harvester_config.sweeper.keepCancelled,
                'missed': keepMissed,
                'pending': keepPending
            }
            workersForCleanup = self.dbProxy.get_workers_for_cleanup(
                harvester_config.sweeper.maxWorkers, statusTimeoutMap)
            mainLog.debug('got {0} queues for workers cleanup'.format(
                len(workersForCleanup)))
            sw = core_utils.get_stopwatch()
            for queueName, configIdWorkSpecList in iteritems(
                    workersForCleanup):
                for configID, workspec_list in iteritems(configIdWorkSpecList):
                    # get sweeper
                    if not self.queueConfigMapper.has_queue(
                            queueName, configID):
                        mainLog.error(
                            'queue config for {0}/{1} not found'.format(
                                queueName, configID))
                        continue
                    queueConfig = self.queueConfigMapper.get_queue(
                        queueName, configID)
                    sweeperCore = self.pluginFactory.get_plugin(
                        queueConfig.sweeper)
                    messenger = self.pluginFactory.get_plugin(
                        queueConfig.messenger)
                    sw.reset()
                    n_workers = len(workspec_list)
                    # make sure workers to clean up are all terminated
                    mainLog.debug(
                        'making sure workers to clean up are all terminated')
                    try:
                        # try bulk method
                        tmpList = sweeperCore.kill_workers(workspec_list)
                    except AttributeError:
                        # fall back to single-worker method
                        for workspec in workspec_list:
                            tmpLog = self.make_logger(_logger,
                                                      'workerID={0}'.format(
                                                          workspec.workerID),
                                                      method_name='run')
                            try:
                                tmpStat, tmpOut = sweeperCore.kill_worker(
                                    workspec)
                            except Exception:
                                core_utils.dump_error_message(tmpLog)
                    except Exception:
                        core_utils.dump_error_message(mainLog)
                    mainLog.debug(
                        'made sure workers to clean up are all terminated')
                    # start cleanup
                    for workspec in workspec_list:
                        tmpLog = self.make_logger(_logger,
                                                  'workerID={0}'.format(
                                                      workspec.workerID),
                                                  method_name='run')
                        try:
                            tmpLog.debug('start cleaning up one worker')
                            # sweep worker
                            tmpStat, tmpOut = sweeperCore.sweep_worker(
                                workspec)
                            tmpLog.debug(
                                'swept one worker with status={0} diag={1}'.format(
                                    tmpStat, tmpOut))
                            tmpLog.debug('start messenger cleanup')
                            mc_tmpStat, mc_tmpOut = messenger.clean_up(
                                workspec)
                            tmpLog.debug(
                                'messenger cleaned up with status={0} diag={1}'
                                .format(mc_tmpStat, mc_tmpOut))
                            if tmpStat:
                                self.dbProxy.delete_worker(workspec.workerID)
                        except Exception:
                            core_utils.dump_error_message(tmpLog)
                    mainLog.debug(
                        'done cleaning up {0} workers'.format(n_workers) +
                        sw.get_elapsed_time())
            mainLog.debug('done all cleanup' + sw_cleanup.get_elapsed_time())
            # old-job-deletion stage
            sw_delete = core_utils.get_stopwatch()
            mainLog.debug('delete old jobs')
            jobTimeout = max(statusTimeoutMap.values()) + 1
            self.dbProxy.delete_old_jobs(jobTimeout)
            # delete orphaned job info
            self.dbProxy.delete_orphaned_job_info()
            mainLog.debug('done deletion of old jobs' +
                          sw_delete.get_elapsed_time())
            # disk cleanup
            if hasattr(harvester_config.sweeper, 'diskCleanUpInterval') and \
                    hasattr(harvester_config.sweeper, 'diskHighWatermark'):
                locked = self.dbProxy.get_process_lock(
                    'sweeper', self.get_pid(),
                    harvester_config.sweeper.diskCleanUpInterval * 60 * 60)
                if locked:
                    try:
                        all_active_files = None
                        for item in harvester_config.sweeper.diskHighWatermark.split(
                                ','):
                            # dir name and watermark in GB
                            dir_name, watermark = item.split('|')
                            mainLog.debug(
                                'checking {0} for cleanup with watermark {1} GB'
                                .format(dir_name, watermark))
                            watermark = int(watermark) * 10**9
                            total_size = 0
                            file_dict = {}
                            # scan dir
                            for root, dirs, filenames in walk(dir_name):
                                for base_name in filenames:
                                    full_name = os.path.join(root, base_name)
                                    f_size = os.path.getsize(full_name)
                                    total_size += f_size
                                    mtime = os.path.getmtime(full_name)
                                    file_dict.setdefault(mtime, set())
                                    file_dict[mtime].add(
                                        (base_name, full_name, f_size))
                            # delete if necessary
                            if total_size < watermark:
                                mainLog.debug(
                                    'skip cleanup {0} due to total_size {1} GB < watermark {2} GB'
                                    .format(dir_name, total_size // (10**9),
                                            watermark // (10**9)))
                            else:
                                mainLog.debug(
                                    'cleanup {0} due to total_size {1} GB >= watermark {2} GB'
                                    .format(dir_name, total_size // (10**9),
                                            watermark // (10**9)))
                                # get active input files
                                if all_active_files is None:
                                    all_active_files = self.dbProxy.get_all_active_input_files()
                                deleted_size = 0
                                mtimes = sorted(file_dict.keys())
                                for mtime in mtimes:
                                    for base_name, full_name, f_size in file_dict[
                                            mtime]:
                                        # keep if active
                                        if base_name in all_active_files:
                                            continue
                                        try:
                                            os.remove(full_name)
                                        except Exception:
                                            core_utils.dump_error_message(
                                                mainLog)
                                        deleted_size += f_size
                                        if total_size - deleted_size < watermark:
                                            break
                                    if total_size - deleted_size < watermark:
                                        break
                    except Exception:
                        core_utils.dump_error_message(mainLog)
            # time the cycle
            mainLog.debug('done a sweeper cycle' + sw_main.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.sweeper.sleepTime):
                mainLog.debug('terminated')
                return
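
The disk-cleanup branch above expects sweeper.diskHighWatermark to be a comma-separated list of 'directory|watermark-in-GB' pairs and deletes the oldest files that are not active inputs until usage drops back under the watermark. Here is a small sketch of just that deletion policy, with made-up paths and sizes.

def plan_cleanup(file_dict, total_size, watermark, active_files):
    """Pick files to delete, oldest mtime first, skipping active inputs,
    until total usage would drop below the watermark (sizes in bytes)."""
    to_delete = []
    deleted_size = 0
    for mtime in sorted(file_dict):
        for base_name, full_name, f_size in file_dict[mtime]:
            if base_name in active_files:
                continue
            to_delete.append(full_name)
            deleted_size += f_size
            if total_size - deleted_size < watermark:
                return to_delete
    return to_delete

# made-up example: 11 GB used, 8 GB watermark, b.root is an active input
file_dict = {100.0: {('a.log', '/data/a.log', 6 * 10**9)},
             200.0: {('b.root', '/data/b.root', 5 * 10**9)}}
print(plan_cleanup(file_dict, total_size=11 * 10**9,
                   watermark=8 * 10**9, active_files={'b.root'}))
# -> ['/data/a.log']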