Esempio n. 1
0
def HarvesterReport():
    """Build a plain-text summary of Harvester worker statistics.

    The Harvester DB proxy is asked for its bulk worker statistics, which this
    function walks as ``site -> prodSourceLabel -> resource -> {state: count}``.
    One row is produced per ``site-resource`` pair showing
    ``submitted/running`` for every prodSourceLabel, followed by a totals row.

    This is a best-effort report: if Harvester is not importable, the
    ``VIRTUAL_ENV`` environment variable is missing, or the query fails, an
    empty string is returned instead of raising.

    Returns:
        str: the formatted report (terminated by a blank line), or ``''``
        when the report could not be produced.
    """
    try:
        import sysconfig
        from collections import defaultdict

        # distutils was removed from the standard library in Python 3.12;
        # sysconfig's "purelib" path is the site-packages directory that
        # distutils.sysconfig.get_python_lib() used to provide.
        pandacommon_dir = sysconfig.get_path('purelib') + '/pandacommon'
        if pandacommon_dir not in sys.path:
            sys.path.append(pandacommon_dir)

        # Must be set before the Harvester import below.
        os.environ['PANDA_HOME'] = os.environ['VIRTUAL_ENV']

        from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy  # pylint: disable=import-error

        dbProxy = DBProxy()
        workers = dbProxy.get_worker_stats_bulk(None)

        rep = defaultdict(dict)   # "site-resource" -> {prodSourceLabel: {state: count}}
        rtot = defaultdict(int)   # state -> count summed over everything

        for site, prodsourcelabels in workers.items():
            for prodsourcelabel, resources in prodsourcelabels.items():
                for resource, jobs in resources.items():
                    rep[f'{site}-{resource}'][prodsourcelabel or 'empty'] = jobs
                    for state, count in jobs.items():
                        rtot[state] += count

        lines = [f"All Harvester jobs: {sum(rtot.values())}       prodSourceLabel: submitted/running"]
        for key in sorted(rep):
            # A state with no workers may be absent from the stats; show 0
            # rather than aborting the whole report on a KeyError.
            cells = ''.join(
                f"{psl:>10}: {jobs.get('submitted', 0)}/{jobs.get('running', 0)}"
                for psl, jobs in rep[key].items()
            )
            lines.append(f"{key:>28.28}:{cells}")
        lines.append(f"{'Totals':>28}:  submitted: {rtot['submitted']}  running: {rtot['running']}")
        return '\n'.join(lines) + '\n\n'
    except Exception:  # deliberate best-effort: no Harvester means no report
        return ''
Esempio n. 2
0
class Propagator(AgentBase):
    """Agent that periodically pushes local job, worker, statistics, metrics
    and dialog-message updates to the central service through a communicator.

    All local state is read and written through ``self.dbProxy``; all remote
    calls go through ``self.communicator``.
    """

    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        """Store the collaborators and reset the periodic-update timestamps.

        :param communicator: object used for every remote call made in ``run``
        :param queue_config_mapper: provides ``get_queue`` and ``get_active_ups_queues``
        :param single_mode: forwarded unchanged to ``AgentBase.__init__``
        """
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.communicator = communicator
        self.queueConfigMapper = queue_config_mapper
        # time.time() value of the last successful bulk worker-stats push (None = never)
        self._last_stats_update = None
        # datetime (UTC) of the last service-metrics push attempt that was settled (None = never)
        self._last_metrics_update = None

    # main loop
    def run(self):
        """Run propagation cycles until ``self.terminated(...)`` returns true.

        Each cycle, in order: propagates jobs, propagates workers, handles
        commands addressed to 'propagator', pushes bulk worker stats (at most
        every STATS_PERIOD seconds), pushes service metrics (at most every
        METRICS_PERIOD seconds) and sends dialog messages.
        """
        while True:
            sw_main = core_utils.get_stopwatch()
            mainLog = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run')
            mainLog.debug('getting jobs to propagate')
            sw = core_utils.get_stopwatch()
            jobSpecs = self.dbProxy.get_jobs_to_propagate(harvester_config.propagator.maxJobs,
                                                          harvester_config.propagator.lockInterval,
                                                          harvester_config.propagator.updateInterval,
                                                          self.get_pid())
            mainLog.debug('got {0} jobs {1}'.format(len(jobSpecs), sw.get_elapsed_time()))
            # update jobs in central database
            iJobs = 0
            nJobs = harvester_config.propagator.nJobsInBulk
            # per-site cache of the statuses for which heartbeats are suppressed
            hbSuppressMap = dict()
            # process the jobs in chunks of nJobs
            while iJobs < len(jobSpecs):
                jobList = jobSpecs[iJobs:iJobs + nJobs]
                iJobs += nJobs
                # collect jobs to update or check
                jobListToSkip = []
                jobListToUpdate = []
                jobListToCheck = []
                # results, kept in the order skip -> check -> update so they can be
                # zipped positionally with the three job lists below
                retList = []
                for tmpJobSpec in jobList:
                    if tmpJobSpec.computingSite not in hbSuppressMap:
                        queueConfig = self.queueConfigMapper.get_queue(tmpJobSpec.computingSite,
                                                                       tmpJobSpec.configID)
                        hbSuppressMap[tmpJobSpec.computingSite] = queueConfig.get_no_heartbeat_status()
                    # heartbeat is suppressed
                    if tmpJobSpec.get_status() in hbSuppressMap[tmpJobSpec.computingSite] and \
                            not tmpJobSpec.not_suppress_heartbeat():
                        # check running job to detect lost heartbeat
                        if tmpJobSpec.status == 'running':
                            jobListToCheck.append(tmpJobSpec)
                        else:
                            jobListToSkip.append(tmpJobSpec)
                            # synthesize a successful result for the skipped job
                            retList.append({'StatusCode': 0, 'command': None})
                    else:
                        jobListToUpdate.append(tmpJobSpec)
                sw.reset()
                retList += self.communicator.check_jobs(jobListToCheck)
                mainLog.debug('check_jobs for {0} jobs {1}'.format(len(jobListToCheck), sw.get_elapsed_time()))
                sw.reset()
                retList += self.communicator.update_jobs(jobListToUpdate, self.get_pid())
                mainLog.debug('update_jobs for {0} jobs took {1}'.format(len(jobListToUpdate),
                                                                              sw.get_elapsed_time()))
                # logging
                # NOTE(review): the pairing relies on check_jobs/update_jobs returning one
                # entry per input job, in input order - confirm against the communicator
                for tmpJobSpec, tmpRet in zip(jobListToSkip+jobListToCheck+jobListToUpdate, retList):
                    if tmpRet['StatusCode'] == 0:
                        if tmpJobSpec in jobListToUpdate:
                            mainLog.debug('updated PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                  tmpJobSpec.status))
                        else:
                            mainLog.debug('skip updating PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                        tmpJobSpec.status))
                        # release job
                        tmpJobSpec.propagatorLock = None
                        if tmpJobSpec.is_final_status() and tmpJobSpec.status == tmpJobSpec.get_status():
                            # unset to disable further updating
                            tmpJobSpec.propagatorTime = None
                            tmpJobSpec.subStatus = 'done'
                            tmpJobSpec.modificationTime = datetime.datetime.utcnow()
                        elif tmpJobSpec.is_final_status() and not tmpJobSpec.all_events_done():
                            # trigger next propagation to update remaining events
                            tmpJobSpec.trigger_propagation()
                        else:
                            # check event availability
                            if tmpJobSpec.status == 'starting' and 'eventService' in tmpJobSpec.jobParams and \
                                    tmpJobSpec.subStatus != 'submitted':
                                tmpEvStat, tmpEvRet = self.communicator.check_event_availability(tmpJobSpec)
                                if tmpEvStat:
                                    if tmpEvRet is not None:
                                        tmpJobSpec.nRemainingEvents = tmpEvRet
                                    if tmpEvRet == 0:
                                        mainLog.debug('kill PandaID={0} due to no event'.format(tmpJobSpec.PandaID))
                                        # fall through to the kill handling just below
                                        tmpRet['command'] = 'tobekilled'
                            # got kill command
                            if 'command' in tmpRet and tmpRet['command'] in ['tobekilled']:
                                # here nWorkers is the value returned by kill_workers_with_job;
                                # the same name is reused further down for the worker bulk size
                                nWorkers = self.dbProxy.kill_workers_with_job(tmpJobSpec.PandaID)
                                if nWorkers == 0:
                                    # no workers
                                    tmpJobSpec.status = 'cancelled'
                                    tmpJobSpec.subStatus = 'killed'
                                    tmpJobSpec.set_pilot_error(PilotErrors.ERR_PANDAKILL,
                                                               PilotErrors.pilotError[PilotErrors.ERR_PANDAKILL])
                                    tmpJobSpec.stateChangeTime = datetime.datetime.utcnow()
                                    tmpJobSpec.trigger_propagation()
                        # persist the changes; the second argument is the criteria the
                        # row must match (presumably: still locked by this agent - confirm)
                        self.dbProxy.update_job(tmpJobSpec, {'propagatorLock': self.get_pid()})
                    else:
                        # job is left untouched (including its lock) on a failed update
                        mainLog.error('failed to update PandaID={0} status={1}'.format(tmpJobSpec.PandaID,
                                                                                       tmpJobSpec.status))
            mainLog.debug('getting workers to propagate')
            sw.reset()
            workSpecs = self.dbProxy.get_workers_to_propagate(harvester_config.propagator.maxWorkers,
                                                              harvester_config.propagator.updateInterval)
            mainLog.debug('got {0} workers {1}'.format(len(workSpecs), sw.get_elapsed_time()))
            # update workers in central database
            sw.reset()
            iWorkers = 0
            nWorkers = harvester_config.propagator.nWorkersInBulk
            # process the workers in chunks of nWorkers
            while iWorkers < len(workSpecs):
                workList = workSpecs[iWorkers:iWorkers + nWorkers]
                iWorkers += nWorkers
                retList, tmpErrStr = self.communicator.update_workers(workList)
                # logging
                if retList is None:
                    mainLog.error('failed to update workers with {0}'.format(tmpErrStr))
                else:
                    for tmpWorkSpec, tmpRet in zip(workList, retList):
                        if tmpRet:
                            mainLog.debug('updated workerID={0} status={1}'.format(tmpWorkSpec.workerID,
                                                                                   tmpWorkSpec.status))
                            # update logs
                            for logFilePath, logOffset, logSize, logRemoteName in \
                                    tmpWorkSpec.get_log_files_to_upload():
                                with open(logFilePath, 'rb') as logFileObj:
                                    tmpStat, tmpErr = self.communicator.upload_file(logRemoteName, logFileObj,
                                                                                    logOffset, logSize)
                                    if tmpStat:
                                        # advance the recorded offset only after a successful upload
                                        tmpWorkSpec.update_log_files_to_upload(logFilePath, logOffset+logSize)
                            # disable further update
                            if tmpWorkSpec.is_final_status():
                                tmpWorkSpec.disable_propagation()
                            self.dbProxy.update_worker(tmpWorkSpec, {'workerID': tmpWorkSpec.workerID})
                        else:
                            mainLog.error('failed to update workerID={0} status={1}'.format(tmpWorkSpec.workerID,
                                                                                            tmpWorkSpec.status))
            # NOTE(review): iWorkers advances by whole chunks, so the number logged
            # here can exceed len(workSpecs) when the last chunk is partial
            mainLog.debug('update_workers for {0} workers took {1}'.format(iWorkers,
                                                                      sw.get_elapsed_time()))
            mainLog.debug('getting commands')
            commandSpecs = self.dbProxy.get_commands_for_receiver('propagator')
            mainLog.debug('got {0} commands'.format(len(commandSpecs)))
            for commandSpec in commandSpecs:
                # only the report-worker-stats command is handled; others are ignored here
                if commandSpec.command.startswith(CommandSpec.COM_reportWorkerStats):
                    # get worker stats
                    # the site name is whatever follows the last ':' in the command string
                    siteName = commandSpec.command.split(':')[-1]
                    workerStats = self.dbProxy.get_worker_stats(siteName)
                    if len(workerStats) == 0:
                        mainLog.error('failed to get worker stats for {0}'.format(siteName))
                    else:
                        # report worker stats
                        tmpRet, tmpStr = self.communicator.update_worker_stats(siteName, workerStats)
                        if tmpRet:
                            mainLog.debug('updated worker stats (command) for {0}'.format(siteName))
                        else:
                            mainLog.error('failed to update worker stats (command) for {0} err={1}'.format(siteName,
                                                                                                           tmpStr))

            # periodic bulk stats push: on the first cycle, then once STATS_PERIOD seconds
            # have elapsed since the last successful push
            if not self._last_stats_update or time.time() - self._last_stats_update > STATS_PERIOD:

                # get active UPS queues. PanDA server needs to know about them and which harvester instance is taking
                # care of them
                active_ups_queues = self.queueConfigMapper.get_active_ups_queues()

                # update worker stats for all sites
                worker_stats_bulk = self.dbProxy.get_worker_stats_bulk(active_ups_queues)
                if not worker_stats_bulk:
                    mainLog.error('failed to get worker stats in bulk')
                else:
                    for site_name in worker_stats_bulk:
                        tmp_ret, tmp_str = self.communicator.update_worker_stats(site_name,
                                                                                 worker_stats_bulk[site_name])
                        if tmp_ret:
                            mainLog.debug('update of worker stats (bulk) for {0}'.format(site_name))
                            # a single successful site is enough to restart the period
                            self._last_stats_update = time.time()
                        else:
                            mainLog.error('failed to update worker stats (bulk) for {0} err={1}'.format(site_name,
                                                                                                        tmp_str))

            # periodic service-metrics push, same scheme with METRICS_PERIOD (naive UTC datetimes)
            if not self._last_metrics_update \
                    or datetime.datetime.utcnow() - self._last_metrics_update > datetime.timedelta(seconds=METRICS_PERIOD):
                # get latest metrics from DB
                service_metrics_list = self.dbProxy.get_service_metrics(self._last_metrics_update)
                if not service_metrics_list:
                    mainLog.error('failed to get service metrics')
                    # the timestamp is advanced even though nothing was sent, so the
                    # query is not retried before the next period
                    self._last_metrics_update = datetime.datetime.utcnow()
                else:
                    tmp_ret, tmp_str = self.communicator.update_service_metrics(service_metrics_list)
                    if tmp_ret:
                        mainLog.debug('update of service metrics OK')
                        self._last_metrics_update = datetime.datetime.utcnow()
                    else:
                        # timestamp not advanced: the push is retried on the next cycle
                        mainLog.error('failed to update service metrics err={0}'.format(tmp_str))

            # send dialog messages
            mainLog.debug('getting dialog messages to propagate')
            try:
                maxDialogs = harvester_config.propagator.maxDialogs
            except Exception:
                # fall back to 50 when maxDialogs cannot be read from the config
                maxDialogs = 50
            diagSpecs = self.dbProxy.get_dialog_messages_to_send(maxDialogs,
                                                                 harvester_config.propagator.lockInterval)
            mainLog.debug('got {0} dialogs'.format(len(diagSpecs)))
            if len(diagSpecs) > 0:
                tmpStat, tmpStr = self.communicator.send_dialog_messages(diagSpecs)
                if tmpStat:
                    # messages are deleted locally only after a successful send
                    diagIDs = [diagSpec.diagID for diagSpec in diagSpecs]
                    self.dbProxy.delete_dialog_messages(diagIDs)
                    mainLog.debug('sent {0} dialogs'.format(len(diagSpecs)))

                else:
                    mainLog.error('failed to send dialogs err={0}'.format(tmpStr))
            # warn when the whole cycle took longer than the configured lockInterval
            if sw_main.get_elapsed_time_in_sec() > harvester_config.propagator.lockInterval:
                mainLog.warning('a single cycle was longer than lockInterval. done' + sw_main.get_elapsed_time())
            else:
                mainLog.debug('done' + sw_main.get_elapsed_time())
            # check if being terminated
            # (presumably terminated() also waits up to sleepTime between cycles - confirm in AgentBase)
            if self.terminated(harvester_config.propagator.sleepTime):
                mainLog.debug('terminated')
                return
Esempio n. 3
0
class aCTReport:
    '''Print summary info on jobs in DB. Use --web to print html that is
    automatically refreshed. Add filenames to query more than one aCT DB'''

    def __init__(self, args):
        """Create the loggers and DB handles and start the output buffer.

        ``args`` must provide the attributes ``harvester``, ``web`` and
        ``conffiles``. When ``args.web`` is truthy the output is opened with
        an auto-refresh HTML header and a timestamp.
        """
        self.output = ""
        self.harvester = args.harvester
        self.outfile = args.web
        self.actconfs = args.conffiles or ['']  # empty string for default behaviour

        self.logger = aCTLogger.aCTLogger("aCTReport")
        self.actlog = self.logger()
        self.actlog.logger.setLevel(logging.ERROR)
        self.criticallogger = aCTLogger.aCTLogger('aCTCritical', arclog=False)
        self.criticallog = self.criticallogger()

        if self.outfile:
            self.log('<META HTTP-EQUIV="refresh" CONTENT="60"><pre>')
            self.log(time.asctime() + '\n')

        self.db = aCTDBArc.aCTDBArc(self.actlog)
        self.pandadb = aCTDBPanda.aCTDBPanda(self.actlog)

    def log(self, message=''):
        """Append ``message`` plus a newline to the accumulated report text."""
        self.output += message + '\n'

    @staticmethod
    def _count(rep, rtot, row, column, amount=1):
        """Add ``amount`` to cell (row, column) of ``rep`` and to ``rtot[column]``."""
        cells = rep.setdefault(row, {})
        cells[column] = cells.get(column, 0) + amount
        rtot[column] = rtot.get(column, 0) + amount

    def _log_table(self, title, rep, rtot, labels, columns, width, row_keys):
        """Log a title, a header row, one row per key and a totals row.

        ``labels`` are the header texts and ``columns`` the matching keys used
        in ``rep[row]`` and ``rtot``; missing cells are shown as ``-``. Row
        names are right-aligned and truncated to ``width`` characters.
        """
        self.log(title)
        self.log(' ' * (width + 1) + ' ' + ' '.join(f'{label:>9}' for label in labels))
        for key in row_keys:
            cells = rep[key]
            row = ''.join(f"{cells.get(col, '-'):>10}" for col in columns)
            self.log(f"{key:>{width}.{width}}:" + row)
        totals = ''.join(f"{rtot.get(col, '-'):>10}" for col in columns)
        self.log(f"{'Totals':>{width}}:" + totals + '\n\n')

    def ProcessReport(self):
        """Log the aCT processes found by ``ps``, grouped by cluster argument.

        Skipped for combined (multi-config) reports. Processes that have a
        cluster argument and whose elapsed time is not of the plain ``mm:ss``
        form (i.e. one hour or more) are reported and killed with SIGKILL.
        """
        if self.actconfs != ['']:
            return  # don't print processes for combined report
        actprocscmd = 'ps ax -ww -o pid,etime,args'
        try:
            # stderr is captured so that the error message below has content
            out = subprocess.run(actprocscmd.split(), check=True, encoding='utf-8',
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE).stdout
        except subprocess.CalledProcessError as e:
            self.log('Error: could not run ps command: %s' % e.stderr)
            return
        except OSError as e:
            # e.g. the ps executable is missing
            self.log('Error: could not run ps command: %s' % e)
            return

        # Group processes by cluster
        cluster_procs = {}
        longprocesses = []
        for line in out.split('\n'):
            reg = re.match(r'\s*(\d*)\s*(.*) .*python.* .*(aCT\w*)\.py\s?(\S*)', line)
            if not reg:
                continue
            pid, runningtime, process, cluster = reg.groups()
            # ignore Main and this process
            if process in ['aCTReport', 'aCTMain', 'aCTHeartbeatWatchdog']:
                continue
            if cluster == '':
                # processes without a cluster argument are never treated as overrunning
                cluster = '(no cluster defined)'
            elif not re.match(r'\d\d:\d\d$', runningtime):
                # Check for overrunning processes
                longprocesses.append((process, pid, cluster, runningtime))
            cluster_procs.setdefault(cluster, []).append(process)

        for proc in longprocesses:
            self.log('WARNING: %s (pid %s) for %s running for more than one hour (%s), this process will be killed' % proc)
            # Kill the process. A critical log (which sends email) used to be
            # emitted here but was disabled because it produced too many emails.
            try:
                os.kill(int(proc[1]), signal.SIGKILL)
            except OSError:
                # process already gone or not ours to kill
                pass
        self.log()
        self.log('Active processes per cluster:')
        for cluster in sorted(cluster_procs):
            procs = sorted(cluster_procs[cluster])
            self.log(f'{cluster:>38.38}: {" ".join(procs)}')
        self.log()

    def PandaReport(self):
        """Log a per-site table of panda job counts by ``actpandastatus``.

        The extra ``slots`` column is the sum of ``corecount`` (1 when NULL)
        over running jobs; it is excluded from the "All Panda jobs" total.
        """
        rep = {}
        rtot = {}
        states = ["sent", "starting", "running", "slots", "tovalidate", "toresubmit",
                  "toclean", "finished", "done", "failed", "donefailed",
                  "tobekilled", "cancelled", "donecancelled"]

        for conf in self.actconfs:
            if conf:
                os.environ['ACTCONFIGARC'] = conf

            db = aCTDBArc.aCTDBArc(self.actlog)
            c = db.db.conn.cursor()
            c.execute("select sitename, actpandastatus, corecount from pandajobs")
            for r in c.fetchall():
                site, state = str(r[0]), str(r[1])
                corecount = 1 if r[2] is None else int(r[2])
                self._count(rep, rtot, site, state)
                if state == "running":
                    self._count(rep, rtot, site, "slots", corecount)

        njobs = sum(v for k, v in rtot.items() if k != 'slots')
        self._log_table(f"All Panda jobs: {njobs}", rep, rtot,
                        states, states, 28, sorted(rep))

    def ArcJobReport(self):
        """Log a per-cluster table of ARC job counts by state.

        The cluster is the host part of the job id (text between ``//`` and
        the next ``:``); jobs whose id does not match are counted under
        ``WaitingSubmission``. A NULL state is counted as ``Other``.
        """
        rep = {}
        rtot = {}
        states = ["Undefined", "Accepted", "Preparing", "Submitting",
                  "Queuing", "Running", "Finishing", "Finished", "Hold", "Killed",
                  "Failed", "Deleted", "Other"]

        for conf in self.actconfs:
            if conf:
                os.environ['ACTCONFIGARC'] = conf

            db = aCTDBArc.aCTDBArc(self.actlog)
            c = db.db.conn.cursor()
            c.execute("select jobid,state from arcjobs")
            for r in c.fetchall():
                reg = re.search('.+//([^:]+)', str(r[0]))
                cl = reg.group(1) if reg else 'WaitingSubmission'

                jid = str(r[1])
                if jid == 'None':
                    jid = "Other"

                self._count(rep, rtot, cl, jid)

        # rows are ordered by the last dot-separated component of the host name
        self._log_table(f"All ARC jobs: {sum(rtot.values())}", rep, rtot,
                        states, states, 38,
                        sorted(rep, key=lambda x: x.split('.')[-1]))

    def CondorJobReport(self):
        """Log a per-cluster table of Condor job counts by ``JobStatus``.

        ``JobStatus`` values are used as indices into the state-name list for
        the column headers. Jobs with an empty or NULL cluster are counted
        under ``WaitingSubmission``.
        """
        rep = {}
        rtot = {}
        condorjobstatemap = ['Undefined',  # used before real state is known
                             'Idle',
                             'Running',
                             'Removed',
                             'Completed',
                             'Held',
                             'Transferring',
                             'Suspended']

        for conf in self.actconfs:
            if conf:
                os.environ['ACTCONFIGARC'] = conf

            db = aCTDBArc.aCTDBArc(self.actlog)
            c = db.db.conn.cursor()
            c.execute("select cluster, JobStatus from condorjobs")
            for r in c.fetchall():
                # test the raw value: str(None) is the truthy string 'None'
                cl = str(r[0]) if r[0] else 'WaitingSubmission'
                self._count(rep, rtot, cl, r[1])

        self._log_table(f"All Condor jobs: {sum(rtot.values())}", rep, rtot,
                        condorjobstatemap, range(len(condorjobstatemap)), 38,
                        sorted(rep, key=lambda x: x.split('.')[-1]))

    def StuckReport(self):
        """Log, per cluster host, how many submitted/running ARC jobs have not
        been updated for more than a day (86400 seconds)."""

        # Query for lost jobs older than lostlimit
        lostlimit = 86400
        select = "(arcstate='submitted' or arcstate='running') and " \
                 + self.db.timeStampLessThan("tarcstate", lostlimit) + \
                 " and sendhb=1 and arcjobs.id=pandajobs.arcjobid order by tarcstate"
        columns = ['cluster']
        jobs = self.db.getArcJobsInfo(select, columns, tables='arcjobs,pandajobs')

        if not jobs:
            return

        self.log('Found %d jobs not updated in over %d seconds:\n' % (len(jobs), lostlimit))

        clustercount = {}
        for job in jobs:
            cluster = job['cluster']
            reg = re.search('.+//([^:]+)', cluster) if isinstance(cluster, str) else None
            if reg:
                host = reg.group(1)
            else:
                # Never reuse the host of a previous job: fall back to the raw
                # value, or to a placeholder when there is none.
                host = str(cluster) if cluster else '(unknown cluster)'
            clustercount[host] = clustercount.get(host, 0) + 1

        for cluster, count in clustercount.items():
            self.log(f'{count} {cluster}')
        self.log()

    def HarvesterReport(self):
        """Log a summary of Harvester worker statistics, if Harvester is usable.

        The bulk worker stats are walked as
        ``site -> prodSourceLabel -> resource -> {state: count}``. This is a
        best-effort section: when Harvester cannot be imported, ``VIRTUAL_ENV``
        is not set, or the query fails, nothing is raised.
        """
        try:
            import sysconfig
            from collections import defaultdict

            # distutils was removed from the standard library in Python 3.12;
            # sysconfig's "purelib" path is the site-packages directory that
            # distutils.sysconfig.get_python_lib() used to provide.
            pandacommon_dir = sysconfig.get_path('purelib') + '/pandacommon'
            if pandacommon_dir not in sys.path:
                sys.path.append(pandacommon_dir)

            # Must be set before the Harvester import below.
            os.environ['PANDA_HOME'] = os.environ['VIRTUAL_ENV']

            from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy  # pylint: disable=import-error

            self.dbProxy = DBProxy()

            workers = self.dbProxy.get_worker_stats_bulk(None)
            rep = defaultdict(dict)
            rtot = defaultdict(int)

            for site, prodsourcelabels in workers.items():
                for prodsourcelabel, resources in prodsourcelabels.items():
                    for resource, jobs in resources.items():
                        rep[f'{site}-{resource}'][prodsourcelabel or 'empty'] = jobs
                        for state, count in jobs.items():
                            rtot[state] += count

            self.log(f"All Harvester jobs: {sum(rtot.values())}       prodSourceLabel: submitted/running")
            for k in sorted(rep):
                # a state with no workers may be absent; show 0 instead of failing
                cells = ''.join(
                    f"{psl:>10}: {jobs.get('submitted', 0)}/{jobs.get('running', 0)}"
                    for psl, jobs in rep[k].items()
                )
                self.log(f"{k:>28.28}:" + cells)
            self.log(f"{'Totals':>28}:  submitted: {rtot['submitted']}  running: {rtot['running']}" + '\n\n')
        except Exception:  # deliberate best-effort: no Harvester means no section
            pass

    def end(self):
        """Close the ``<pre>`` block when producing web output."""
        if self.outfile:
            self.log('</pre>')
Esempio n. 4
0
class Propagator(AgentBase):
    """Agent that propagates job and worker updates to the central database.

    Each cycle of :meth:`run` pushes pending job updates, pushes pending
    worker updates (including worker log uploads), serves queued
    ``reportWorkerStats`` commands and, at most once per ``STATS_PERIOD``
    seconds, reports worker statistics for all sites in bulk.
    """

    # constructor
    def __init__(self, communicator, queue_config_mapper, single_mode=False):
        """Set up the agent.

        :param communicator: object used for every exchange with the central
            service (``check_jobs``, ``update_jobs``, ``update_workers``,
            ``upload_file``, ``update_worker_stats``,
            ``check_event_availability``).
        :param queue_config_mapper: provides ``get_queue(site)``; the queue
            configuration tells for which job statuses heartbeats are
            suppressed.
        :param single_mode: forwarded unchanged to ``AgentBase.__init__``.
        """
        AgentBase.__init__(self, single_mode)
        self.dbProxy = DBProxy()
        self.communicator = communicator
        self.queueConfigMapper = queue_config_mapper
        # time.time() of the last successful bulk worker-stats update
        self._last_stats_update = None

    # main loop
    def run(self):
        """Run propagation cycles until the agent is terminated."""
        while True:
            sw = core_utils.get_stopwatch()
            mainLog = core_utils.make_logger(_logger,
                                             'id={0}'.format(self.ident),
                                             method_name='run')
            self._propagate_jobs(mainLog)
            self._propagate_workers(mainLog)
            self._process_commands(mainLog)
            self._report_bulk_worker_stats(mainLog)
            mainLog.debug('done' + sw.get_elapsed_time())
            # check if being terminated
            if self.terminated(harvester_config.propagator.sleepTime):
                mainLog.debug('terminated')
                return

    # update jobs in central database
    def _propagate_jobs(self, mainLog):
        """Push pending job updates in bulks of ``propagator.nJobsInBulk``."""
        mainLog.debug('getting jobs to propagate')
        jobSpecs = self.dbProxy.get_jobs_to_propagate(
            harvester_config.propagator.maxJobs,
            harvester_config.propagator.lockInterval,
            harvester_config.propagator.updateInterval, self.ident)
        mainLog.debug('got {0} jobs'.format(len(jobSpecs)))
        iJobs = 0
        nJobs = harvester_config.propagator.nJobsInBulk
        # computingSite -> statuses for which heartbeats are suppressed
        hbSuppressMap = dict()
        while iJobs < len(jobSpecs):
            jobList = jobSpecs[iJobs:iJobs + nJobs]
            iJobs += nJobs
            # collect jobs to update or check
            jobListToSkip = []
            jobListToUpdate = []
            jobListToCheck = []
            retList = []
            for tmpJobSpec in jobList:
                site = tmpJobSpec.computingSite
                if site not in hbSuppressMap:
                    queueConfig = self.queueConfigMapper.get_queue(site)
                    hbSuppressMap[site] = queueConfig.get_no_heartbeat_status()
                # heartbeat is suppressed
                if tmpJobSpec.status in hbSuppressMap[site]:
                    # check running job to detect lost heartbeat
                    if tmpJobSpec.status == 'running':
                        jobListToCheck.append(tmpJobSpec)
                    else:
                        jobListToSkip.append(tmpJobSpec)
                        retList.append({'StatusCode': 0, 'command': None})
                else:
                    jobListToUpdate.append(tmpJobSpec)
            # retList is filled in the order skip, check, update so that it
            # lines up with the concatenated job list zipped below
            retList += self.communicator.check_jobs(jobListToCheck)
            retList += self.communicator.update_jobs(jobListToUpdate)
            # logging
            for tmpJobSpec, tmpRet in zip(
                    jobListToSkip + jobListToCheck + jobListToUpdate,
                    retList):
                if tmpRet['StatusCode'] != 0:
                    mainLog.error(
                        'failed to update PandaID={0} status={1}'.format(
                            tmpJobSpec.PandaID, tmpJobSpec.status))
                    continue
                if tmpJobSpec in jobListToUpdate:
                    mainLog.debug(
                        'updated PandaID={0} status={1}'.format(
                            tmpJobSpec.PandaID, tmpJobSpec.status))
                else:
                    mainLog.debug(
                        'skip updating PandaID={0} status={1}'.format(
                            tmpJobSpec.PandaID, tmpJobSpec.status))
                # release job
                tmpJobSpec.propagatorLock = None
                if tmpJobSpec.is_final_status() and \
                        tmpJobSpec.status == tmpJobSpec.get_status():
                    # unset to disable further updating
                    tmpJobSpec.propagatorTime = None
                    tmpJobSpec.subStatus = 'done'
                else:
                    # check event availability
                    if tmpJobSpec.status == 'starting' and 'eventService' in tmpJobSpec.jobParams and \
                            tmpJobSpec.subStatus != 'submitted':
                        tmpEvStat, tmpEvRet = self.communicator.check_event_availability(
                            tmpJobSpec)
                        if tmpEvStat and tmpEvRet == 0:
                            mainLog.debug(
                                'kill PandaID={0} due to no event'.format(
                                    tmpJobSpec.PandaID))
                            tmpRet['command'] = 'tobekilled'
                    # got kill command
                    if 'command' in tmpRet and tmpRet['command'] in ['tobekilled']:
                        nWorkers = self.dbProxy.kill_workers_with_job(
                            tmpJobSpec.PandaID)
                        if nWorkers == 0:
                            # no remaining workers
                            tmpJobSpec.status = 'cancelled'
                            tmpJobSpec.subStatus = 'killed'
                            tmpJobSpec.stateChangeTime = datetime.datetime.utcnow()
                            tmpJobSpec.trigger_propagation()
                self.dbProxy.update_job(tmpJobSpec,
                                        {'propagatorLock': self.ident})

    # update workers in central database
    def _propagate_workers(self, mainLog):
        """Push pending worker updates in bulks of ``propagator.nWorkersInBulk``."""
        mainLog.debug('getting workers to propagate')
        workSpecs = self.dbProxy.get_workers_to_propagate(
            harvester_config.propagator.maxWorkers,
            harvester_config.propagator.updateInterval)
        mainLog.debug('got {0} workers'.format(len(workSpecs)))
        iWorkers = 0
        nWorkers = harvester_config.propagator.nWorkersInBulk
        while iWorkers < len(workSpecs):
            # The slice width must equal the step used to advance the offset;
            # slicing with the job bulk size would resend or drop workers
            # whenever the two bulk sizes differ.
            workList = workSpecs[iWorkers:iWorkers + nWorkers]
            iWorkers += nWorkers
            retList, tmpErrStr = self.communicator.update_workers(workList)
            # logging
            if retList is None:
                mainLog.error(
                    'failed to update workers with {0}'.format(tmpErrStr))
                continue
            for tmpWorkSpec, tmpRet in zip(workList, retList):
                if not tmpRet:
                    mainLog.error(
                        'failed to update workerID={0} status={1}'.format(
                            tmpWorkSpec.workerID, tmpWorkSpec.status))
                    continue
                mainLog.debug(
                    'updated workerID={0} status={1}'.format(
                        tmpWorkSpec.workerID, tmpWorkSpec.status))
                # update logs
                for logFilePath, logOffset, logSize, logRemoteName in \
                        tmpWorkSpec.get_log_files_to_upload():
                    with open(logFilePath, 'rb') as logFileObj:
                        tmpStat, tmpErr = self.communicator.upload_file(
                            logRemoteName, logFileObj, logOffset, logSize)
                        if tmpStat:
                            # record the new offset only after a successful upload
                            tmpWorkSpec.update_log_files_to_upload(
                                logFilePath, logOffset + logSize)
                # disable further update
                if tmpWorkSpec.is_final_status():
                    tmpWorkSpec.disable_propagation()
                self.dbProxy.update_worker(
                    tmpWorkSpec, {'workerID': tmpWorkSpec.workerID})

    # serve commands addressed to the propagator
    def _process_commands(self, mainLog):
        """Handle queued ``reportWorkerStats`` commands, one site per command."""
        mainLog.debug('getting commands')
        commandSpecs = self.dbProxy.get_commands_for_receiver('propagator')
        mainLog.debug('got {0} commands'.format(len(commandSpecs)))
        for commandSpec in commandSpecs:
            if not commandSpec.command.startswith(
                    CommandSpec.COM_reportWorkerStats):
                continue
            # get worker stats; the site name is the text after the last ':'
            siteName = commandSpec.command.split(':')[-1]
            workerStats = self.dbProxy.get_worker_stats(siteName)
            if len(workerStats) == 0:
                mainLog.error(
                    'failed to get worker stats for {0}'.format(siteName))
                continue
            # report worker stats
            tmpRet, tmpStr = self.communicator.update_worker_stats(
                siteName, workerStats)
            if tmpRet:
                mainLog.debug(
                    'updated worker stats (command) for {0}'.format(siteName))
            else:
                mainLog.error(
                    'failed to update worker stats (command) for {0} err={1}'
                    .format(siteName, tmpStr))

    # update worker stats for all sites
    def _report_bulk_worker_stats(self, mainLog):
        """Report worker stats for all sites at most once per ``STATS_PERIOD``."""
        if self._last_stats_update and \
                time.time() - self._last_stats_update <= STATS_PERIOD:
            return
        worker_stats_bulk = self.dbProxy.get_worker_stats_bulk()
        if not worker_stats_bulk:
            mainLog.error('failed to get worker stats in bulk')
            return
        for site_name in worker_stats_bulk:
            tmp_ret, tmp_str = self.communicator.update_worker_stats(
                site_name, worker_stats_bulk[site_name])
            if tmp_ret:
                mainLog.debug(
                    'update of worker stats (bulk) for {0}'.format(site_name))
                # any successful site update restarts the reporting period
                self._last_stats_update = time.time()
            else:
                mainLog.error(
                    'failed to update worker stats (bulk) for {0} err={1}'
                    .format(site_name, tmp_str))