def run(self):
    try:
        while True:
            _logger.debug('%s start' % self.pandaID)
            # query job
            job = self.taskBuffer.peekJobs([self.pandaID], fromDefined=False,
                                           fromArchived=False, fromWaiting=False)[0]
            # check job status
            if job is None:
                _logger.debug('%s escape : not found' % self.pandaID)
                return
            if job.jobStatus not in ['running', 'sent', 'starting', 'holding',
                                     'stagein', 'stageout']:
                if job.jobStatus == 'transferring' and job.prodSourceLabel in ['user', 'panda']:
                    pass
                else:
                    _logger.debug('%s escape : %s' % (self.pandaID, job.jobStatus))
                    return
            # time limit
            timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=self.sleepTime)
            if job.modificationTime < timeLimit or (job.endTime != 'NULL' and job.endTime < timeLimit):
                _logger.debug('%s %s lastmod:%s endtime:%s' % (job.PandaID, job.jobStatus,
                                                               str(job.modificationTime),
                                                               str(job.endTime)))
                # retry ES merge jobs
                if EventServiceUtils.isEventServiceMerge(job):
                    self.taskBuffer.retryJob(job.PandaID, {}, getNewPandaID=True,
                                             attemptNr=job.attemptNr,
                                             recoverableEsMerge=True)
                    # read back
                    job = self.taskBuffer.peekJobs([self.pandaID], fromDefined=False,
                                                   fromArchived=False, fromWaiting=False)[0]
                destDBList = []
                if job.jobStatus == 'sent':
                    # sent job didn't receive reply from pilot within 30 min
                    job.jobDispatcherErrorCode = ErrorCode.EC_SendError
                    job.jobDispatcherErrorDiag = "Sent job didn't receive reply from pilot within 30 min"
                elif job.exeErrorDiag == 'NULL' and job.pilotErrorDiag == 'NULL':
                    # lost heartbeat
                    job.jobDispatcherErrorCode = ErrorCode.EC_Watcher
                    if job.jobDispatcherErrorDiag == 'NULL':
                        if job.endTime == 'NULL':
                            # normal lost heartbeat
                            job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.modificationTime)
                        else:
                            # job recovery failed
                            job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.endTime)
                            if job.jobStatus == 'transferring':
                                job.jobDispatcherErrorDiag += ' in transferring'
                else:
                    # job recovery failed
                    job.jobDispatcherErrorCode = ErrorCode.EC_Recovery
                    job.jobDispatcherErrorDiag = 'job recovery failed for %s hours' % (self.sleepTime / 60)
                # set job status
                job.jobStatus = 'failed'
                # set endTime for lost heartbeat
                if job.endTime == 'NULL':
                    # normal lost heartbeat
                    job.endTime = job.modificationTime
                # set files status
                for file in job.Files:
                    if file.type == 'output' or file.type == 'log':
                        file.status = 'failed'
                        if file.destinationDBlock not in destDBList:
                            destDBList.append(file.destinationDBlock)
                # event service
                if EventServiceUtils.isEventServiceJob(job) and not EventServiceUtils.isJobCloningJob(job):
                    eventStat = self.taskBuffer.getEventStat(job.jediTaskID, job.PandaID)
                    # set sub status when no successful events
                    if EventServiceUtils.ST_finished not in eventStat:
                        job.jobSubStatus = 'es_heartbeat'
                # update job
                self.taskBuffer.updateJobs([job], False)
                # start closer
                if job.jobStatus == 'failed':
                    source = 'jobDispatcherErrorCode'
                    error_code = job.jobDispatcherErrorCode
                    error_diag = job.jobDispatcherErrorDiag
                    try:
                        _logger.debug("Watcher will call apply_retrial_rules")
                        retryModule.apply_retrial_rules(self.taskBuffer, job.PandaID, source,
                                                        error_code, error_diag, job.attemptNr)
                        _logger.debug("apply_retrial_rules is back")
                    except Exception as e:
                        _logger.debug("apply_retrial_rules excepted and needs to be investigated (%s): %s"
                                      % (e, traceback.format_exc()))
                    # updateJobs was successful and it failed a job with taskBufferErrorCode
                    try:
                        _logger.debug("Watcher.run will peek the job")
                        job_tmp = self.taskBuffer.peekJobs([job.PandaID], fromDefined=False,
                                                           fromArchived=True, fromWaiting=False)[0]
                        if job_tmp.taskBufferErrorCode:
                            source = 'taskBufferErrorCode'
                            error_code = job_tmp.taskBufferErrorCode
                            error_diag = job_tmp.taskBufferErrorDiag
                            _logger.debug("Watcher.run 2 will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(self.taskBuffer, job_tmp.PandaID, source,
                                                            error_code, error_diag, job_tmp.attemptNr)
                            _logger.debug("apply_retrial_rules 2 is back")
                    except IndexError:
                        pass
                    except Exception as e:
                        _logger.error("apply_retrial_rules 2 excepted and needs to be investigated (%s): %s"
                                      % (e, traceback.format_exc()))
                cThr = Closer(self.taskBuffer, destDBList, job)
                cThr.start()
                cThr.join()
                _logger.debug('%s end' % job.PandaID)
                return
            # single action
            if self.single:
                return
            # sleep
            time.sleep(60 * self.sleepTime)
    except:
        errType, errValue, traceBack = sys.exc_info()
        _logger.error("run() : %s %s" % (errType, errValue))
        return
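
# ---------------------------------------------------------------------------
# Usage sketch (hypothetical): Watcher is a thread that babysits one PandaID.
# The constructor arguments below are inferred from the attributes this run()
# uses (taskBuffer, pandaID, single, sleepTime) and are not guaranteed to
# match the real signature.
#
#     thr = Watcher(taskBuffer, pandaID, single=True, sleepTime=120)
#     thr.start()
#     thr.join()
#
# With single=True the loop body runs once; otherwise it re-checks the job
# every sleepTime minutes until the job either leaves the watched states or
# exceeds the heartbeat time limit and is failed.
# ---------------------------------------------------------------------------
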
def run(self):
    try:
        self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus, self.attemptNr))
        # lock XML
        self.lockXML = open(self.xmlFile)
        try:
            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
        except:
            self.logger.debug("cannot get lock : %s" % self.xmlFile)
            self.lockXML.close()
            # remove XML just in case for the final attempt
            if not self.ignoreTmpError:
                try:
                    # remove Catalog
                    os.remove(self.xmlFile)
                except:
                    pass
            return
        # check if file exists
        if not os.path.exists(self.xmlFile):
            self.logger.debug("not exist : %s" % self.xmlFile)
            try:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                self.lockXML.close()
            except:
                pass
            return
        # query job
        self.job = self.taskBuffer.peekJobs([self.jobID], fromDefined=False,
                                            fromWaiting=False, forAnal=True)[0]
        # check if job has finished
        if self.job is None:
            self.logger.debug(': job not found in DB')
        elif self.job.jobStatus in ['finished', 'failed', 'unknown', 'merging']:
            self.logger.error(': invalid state -> %s' % self.job.jobStatus)
        elif self.attemptNr is not None and self.job.attemptNr != self.attemptNr:
            self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr, self.attemptNr))
        elif self.attemptNr is not None and self.job.jobStatus == 'transferring':
            errMsg = 'XML with attemptNr for {0}'.format(self.job.jobStatus)
            self.logger.error(errMsg)
            # FIXME
            raise RuntimeError(errMsg)
        elif self.jobStatus == EventServiceUtils.esRegStatus:
            # instantiate concrete plugin
            adderPluginClass = self.getPluginClass(self.job.VO)
            adderPlugin = adderPluginClass(self.job,
                                           taskBuffer=self.taskBuffer,
                                           siteMapper=self.siteMapper,
                                           logger=self.logger)
            # execute
            self.logger.debug('plugin is ready for ES file registration')
            adderPlugin.registerEventServiceFiles()
        else:
            # check file status in JEDI
            if not self.job.isCancelled() and \
                    self.job.taskBufferErrorCode not in [taskbuffer.ErrorCode.EC_PilotRetried]:
                fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job)
                self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI))
                if fileCheckInJEDI is None:
                    raise RuntimeError('failed to check file status in JEDI')
                if fileCheckInJEDI is False:
                    # set job status to failed since some file status is wrong in JEDI
                    self.jobStatus = 'failed'
                    self.job.ddmErrorCode = ErrorCode.EC_Adder
                    errStr = "inconsistent file status between Panda and JEDI. "
                    errStr += "failed to avoid duplicated processing caused by synchronization failure"
                    self.job.ddmErrorDiag = errStr
                    self.logger.debug("set jobStatus={0} since input is inconsistent between "
                                      "Panda and JEDI".format(self.jobStatus))
                elif self.job.jobSubStatus in ['pilot_closed']:
                    # terminated by the pilot
                    self.logger.debug("going to closed since terminated by the pilot")
                    retClosed = self.taskBuffer.killJobs([self.jobID], 'pilot', '60', True)
                    if retClosed[0] is True:
                        self.logger.debug("end")
                        try:
                            # remove Catalog
                            os.remove(self.xmlFile)
                        except:
                            pass
                        # unlock XML
                        if self.lockXML is not None:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        return
                # check for cloned jobs
                if EventServiceUtils.isJobCloningJob(self.job):
                    checkJC = self.taskBuffer.checkClonedJob(self.job)
                    if checkJC is None:
                        raise RuntimeError('failed to check the cloned job')
                    # failed to lock semaphore
                    if checkJC['lock'] is False:
                        self.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "failed to lock semaphore for job cloning"
                        self.logger.debug("set jobStatus={0} since did not get semaphore "
                                          "for job cloning".format(self.jobStatus))
            # use failed for cancelled/closed jobs
            if self.job.isCancelled():
                self.jobStatus = 'failed'
                # reset error codes to skip retrial module
                self.job.pilotErrorCode = 0
                self.job.exeErrorCode = 0
                self.job.ddmErrorCode = 0
            # keep old status
            oldJobStatus = self.job.jobStatus
            # set job status
            if self.job.jobStatus not in ['transferring']:
                self.job.jobStatus = self.jobStatus
            addResult = None
            adderPlugin = None
            # parse XML
            parseResult = self.parseXML()
            if parseResult < 2:
                # interaction with DDM
                try:
                    # instantiate concrete plugin
                    adderPluginClass = self.getPluginClass(self.job.VO)
                    adderPlugin = adderPluginClass(self.job,
                                                   taskBuffer=self.taskBuffer,
                                                   siteMapper=self.siteMapper,
                                                   extraInfo=self.extraInfo,
                                                   logger=self.logger)
                    # execute
                    self.logger.debug('plugin is ready')
                    adderPlugin.execute()
                    addResult = adderPlugin.result
                    self.logger.debug('plugin done with %s' % addResult.statusCode)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    self.logger.error("failed to execute AdderPlugin for VO={0} with {1}:{2}"
                                      .format(self.job.VO, errtype, errvalue))
                    addResult = None
                    self.job.ddmErrorCode = ErrorCode.EC_Adder
                    self.job.ddmErrorDiag = "AdderPlugin failure"
                # ignore temporary errors
                if self.ignoreTmpError and addResult is not None and addResult.isTemporary():
                    self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag)
                    self.logger.debug('escape')
                    # unlock XML
                    try:
                        fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                        self.lockXML.close()
                    except:
                        errType, errValue, traceBack = sys.exc_info()
                        self.logger.debug(": %s %s" % (errType, errValue))
                        self.logger.debug("cannot unlock XML")
                    return
                # failed
                if addResult is None or not addResult.isSucceeded():
                    self.job.jobStatus = 'failed'
            # set file status for failed jobs or failed transferring jobs
            self.logger.debug("status after plugin call :job.jobStatus=%s jobStatus=%s"
                              % (self.job.jobStatus, self.jobStatus))
            if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                # First of all: check if job failed and in this case take first actions according to error table
                source, error_code, error_diag = None, None, None
                if self.job.pilotErrorCode:
                    source = 'pilotErrorCode'
                    error_code = self.job.pilotErrorCode
                    error_diag = self.job.pilotErrorDiag
                elif self.job.exeErrorCode:
                    source = 'exeErrorCode'
                    error_code = self.job.exeErrorCode
                    error_diag = self.job.exeErrorDiag
                elif self.job.ddmErrorCode:
                    source = 'ddmErrorCode'
                    error_code = self.job.ddmErrorCode
                    error_diag = self.job.ddmErrorDiag
                elif self.job.transExitCode:
                    source = 'transExitCode'
                    error_code = self.job.transExitCode
                    error_diag = ''
                # _logger.info("updatejob has source %s, error_code %s and error_diag %s"
                #              % (source, error_code, error_diag))
                if source and error_code:
                    try:
                        self.logger.debug("AdderGen.run will call apply_retrial_rules")
                        retryModule.apply_retrial_rules(self.taskBuffer, self.job.PandaID, source,
                                                        error_code, error_diag, self.job.attemptNr)
                        self.logger.debug("apply_retrial_rules is back")
                    except Exception as e:
                        self.logger.error("apply_retrial_rules excepted and needs to be investigated (%s): %s"
                                          % (e, traceback.format_exc()))
                self.job.jobStatus = 'failed'
                for file in self.job.Files:
                    if file.type in ['output', 'log']:
                        if addResult is not None and file.lfn in addResult.mergingFiles:
                            file.status = 'merging'
                        else:
                            file.status = 'failed'
            else:
                # reset errors
                self.job.jobDispatcherErrorCode = 0
                self.job.jobDispatcherErrorDiag = 'NULL'
                # set status
                if addResult is not None and addResult.mergingFiles != []:
                    # set status for merging
                    for file in self.job.Files:
                        if file.lfn in addResult.mergingFiles:
                            file.status = 'merging'
                    self.job.jobStatus = 'merging'
                    # propagate transition to prodDB
                    self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
                elif addResult is not None and addResult.transferringFiles != []:
                    # set status for transferring
                    for file in self.job.Files:
                        if file.lfn in addResult.transferringFiles:
                            file.status = 'transferring'
                    self.job.jobStatus = 'transferring'
                    # propagate transition to prodDB
                    self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
                else:
                    self.job.jobStatus = 'finished'
            # endtime
            if self.job.endTime == 'NULL':
                self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
            # output size and # of outputs
            self.job.nOutputDataFiles = 0
            self.job.outputFileBytes = 0
            for tmpFile in self.job.Files:
                if tmpFile.type == 'output':
                    self.job.nOutputDataFiles += 1
                    try:
                        self.job.outputFileBytes += tmpFile.fsize
                    except:
                        pass
            # protection
            maxOutputFileBytes = 99999999999
            if self.job.outputFileBytes > maxOutputFileBytes:
                self.job.outputFileBytes = maxOutputFileBytes
            # set cancelled state
            if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                self.job.jobStatus = 'cancelled'
            # update job
            if oldJobStatus in ['cancelled', 'closed']:
                pass
            else:
                self.logger.debug("updating DB")
                retU = self.taskBuffer.updateJobs([self.job], False,
                                                  oldJobStatusList=[oldJobStatus],
                                                  extraInfo=self.extraInfo)
                self.logger.debug("retU: %s" % retU)
                # failed
                if not retU[0]:
                    self.logger.error('failed to update DB for pandaid={0}'.format(self.job.PandaID))
                    # unlock XML
                    try:
                        fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                        self.lockXML.close()
                    except:
                        errType, errValue, traceBack = sys.exc_info()
                        self.logger.debug(": %s %s" % (errType, errValue))
                        self.logger.debug("cannot unlock XML")
                    return
                try:
                    # updateJobs was successful and it failed a job with taskBufferErrorCode
                    self.logger.debug("AdderGen.run will peek the job")
                    job_tmp = self.taskBuffer.peekJobs([self.job.PandaID], fromDefined=False,
                                                       fromArchived=True, fromWaiting=False)[0]
                    self.logger.debug("status {0}, taskBufferErrorCode {1}, taskBufferErrorDiag {2}"
                                      .format(job_tmp.jobStatus, job_tmp.taskBufferErrorCode,
                                              job_tmp.taskBufferErrorDiag))
                    if job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode:
                        source = 'taskBufferErrorCode'
                        error_code = job_tmp.taskBufferErrorCode
                        error_diag = job_tmp.taskBufferErrorDiag
                        self.logger.debug("AdderGen.run 2 will call apply_retrial_rules")
                        retryModule.apply_retrial_rules(self.taskBuffer, job_tmp.PandaID, source,
                                                        error_code, error_diag, job_tmp.attemptNr)
                        self.logger.debug("apply_retrial_rules 2 is back")
                except IndexError:
                    pass
                except Exception as e:
                    self.logger.error("apply_retrial_rules 2 excepted and needs to be investigated (%s): %s"
                                      % (e, traceback.format_exc()))
            # setup for closer
            if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.isCancelled()):
                destDBList = []
                guidList = []
                for file in self.job.Files:
                    # ignore inputs
                    if file.type == 'input':
                        continue
                    # skip pseudo datasets
                    if file.destinationDBlock in ['', None, 'NULL']:
                        continue
                    # start closer for output/log datasets
                    if file.destinationDBlock not in destDBList:
                        destDBList.append(file.destinationDBlock)
                    # collect GUIDs
                    if (self.job.prodSourceLabel == 'panda' or
                        (self.job.prodSourceLabel in ['ptest', 'rc_test', 'rucio_test'] and
                         self.job.processingType in ['pathena', 'prun', 'gangarobot-rctest', 'hammercloud'])) \
                            and file.type == 'output':
                        # extract base LFN since LFN was changed to full LFN for CMS
                        baseLFN = file.lfn.split('/')[-1]
                        guidList.append({'lfn': baseLFN, 'guid': file.GUID, 'type': file.type,
                                         'checksum': file.checksum, 'md5sum': file.md5sum,
                                         'fsize': file.fsize, 'scope': file.scope})
                if guidList != []:
                    retG = self.taskBuffer.setGUIDs(guidList)
                if destDBList != []:
                    # start Closer
                    if adderPlugin is not None and hasattr(adderPlugin, 'datasetMap') \
                            and adderPlugin.datasetMap != {}:
                        cThr = Closer.Closer(self.taskBuffer, destDBList, self.job,
                                             datasetMap=adderPlugin.datasetMap)
                    else:
                        cThr = Closer.Closer(self.taskBuffer, destDBList, self.job)
                    self.logger.debug("start Closer")
                    cThr.start()
                    cThr.join()
                    self.logger.debug("end Closer")
                    # run closer for associated parallel jobs
                    if EventServiceUtils.isJobCloningJob(self.job):
                        assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(
                            self.job.jediTaskID, self.job.PandaID, destDBList)
                        for assJobID, assDBlocks in assDBlockMap.iteritems():
                            assJob = self.taskBuffer.peekJobs([assJobID], fromDefined=False,
                                                              fromArchived=False,
                                                              fromWaiting=False, forAnal=True)[0]
                            if assJob is None:
                                self.logger.debug(': associated job PandaID={0} not found in DB'
                                                  .format(assJobID))
                            else:
                                cThr = Closer.Closer(self.taskBuffer, assDBlocks, assJob)
                                self.logger.debug("start Closer for PandaID={0}".format(assJobID))
                                cThr.start()
                                cThr.join()
                                self.logger.debug("end Closer for PandaID={0}".format(assJobID))
        self.logger.debug("end")
        try:
            # remove Catalog
            os.remove(self.xmlFile)
        except:
            pass
        # unlock XML
        if self.lockXML is not None:
            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
            self.lockXML.close()
    except:
        errType, errValue, traceBack = sys.exc_info()
        errStr = ": %s %s " % (errType, errValue)
        errStr += traceback.format_exc()
        self.logger.error(errStr)
        self.logger.error("except")
        # unlock XML just in case
        try:
            if self.lockXML is not None:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
        except:
            errType, errValue, traceBack = sys.exc_info()
            self.logger.error(": %s %s" % (errType, errValue))
            self.logger.error("cannot unlock XML")
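
# ---------------------------------------------------------------------------
# Usage sketch (hypothetical): AdderGen consumes one pilot-produced XML
# catalog per job under an exclusive flock. The constructor arguments are
# inferred from the attributes used above (taskBuffer, jobID, jobStatus,
# xmlFile, attemptNr, ignoreTmpError) and may not match the real signature.
#
#     adder = AdderGen(taskBuffer, jobID, 'finished', xmlFile,
#                      attemptNr=0, ignoreTmpError=True)
#     adder.run()
#
# With ignoreTmpError=True a temporary DDM failure leaves the XML in place so
# a later pass can retry; on the final attempt it is False and the XML is
# removed even when the lock cannot be acquired.
# ---------------------------------------------------------------------------
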
def run(self):
    try:
        while True:
            _logger.debug('%s start' % self.pandaID)
            # query job
            job = self.taskBuffer.peekJobs([self.pandaID], fromDefined=False,
                                           fromArchived=False, fromWaiting=False)[0]
            # check job status
            if job is None:
                _logger.debug('%s escape : not found' % self.pandaID)
                return
            if job.jobStatus not in ['running', 'sent', 'starting', 'holding',
                                     'stagein', 'stageout']:
                if job.jobStatus == 'transferring' and job.prodSourceLabel in ['user', 'panda']:
                    pass
                else:
                    _logger.debug('%s escape : %s' % (self.pandaID, job.jobStatus))
                    return
            # time limit
            timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=self.sleepTime)
            if job.modificationTime < timeLimit or (job.endTime != 'NULL' and job.endTime < timeLimit):
                _logger.debug('%s %s lastmod:%s endtime:%s' % (job.PandaID, job.jobStatus,
                                                               str(job.modificationTime),
                                                               str(job.endTime)))
                # retry ES merge jobs
                if EventServiceUtils.isEventServiceMerge(job):
                    self.taskBuffer.retryJob(job.PandaID, {}, getNewPandaID=True,
                                             attemptNr=job.attemptNr,
                                             recoverableEsMerge=True)
                    # read back
                    job = self.taskBuffer.peekJobs([self.pandaID], fromDefined=False,
                                                   fromArchived=False, fromWaiting=False)[0]
                destDBList = []
                # retry analysis jobs
                if (job.prodSourceLabel in ['user', 'panda']) \
                        and (job.attemptNr < 2 or job.jobStatus == 'sent') \
                        and job.commandToPilot != 'tobekilled' \
                        and job.processingType not in ['ITB_INTEGRATION'] \
                        and job.taskBufferErrorCode not in [taskbuffer.ErrorCode.EC_Reassigned,
                                                            taskbuffer.ErrorCode.EC_Retried,
                                                            taskbuffer.ErrorCode.EC_PilotRetried] \
                        and not job.processingType.startswith('gangarobot') \
                        and not job.processingType.startswith('hammercloud'):
                    # reset
                    _logger.debug(' -> reset %s job with %s : PandaID:%s #%s'
                                  % (job.prodSourceLabel, job.jobStatus, job.PandaID, job.attemptNr))
                    job.jobStatus = 'activated'
                    job.startTime = None
                    job.endTime = None
                    job.attemptNr = job.attemptNr + 1
                    # remove flag related to pledge-resource handling
                    if job.specialHandling not in [None, 'NULL', '']:
                        newSpecialHandling = re.sub(',*localpool', '', job.specialHandling)
                        if newSpecialHandling == '':
                            job.specialHandling = None
                        else:
                            job.specialHandling = newSpecialHandling
                    # TEMPORARY : send it to long queue
                    oldComputingSite = job.computingSite
                    if job.jobStatus != 'sent' and job.computingSite.startswith('ANALY') \
                            and not job.computingSite.startswith('ANALY_LONG_'):
                        tmpLongSiteList = []
                        tmpLongSite = re.sub(r'^ANALY_', 'ANALY_LONG_', job.computingSite)
                        tmpLongSite = re.sub(r'_\d+$', '', tmpLongSite)
                        tmpLongSiteList.append(tmpLongSite)
                        tmpLongSite = job.computingSite + '_LONG'
                        tmpLongSiteList.append(tmpLongSite)
                        tmpLongSite = re.sub('SHORT', 'LONG', job.computingSite)
                        if tmpLongSite != job.computingSite:
                            tmpLongSiteList.append(tmpLongSite)
                        for longSite in tmpLongSiteList:
                            if self.siteMapper.checkSite(longSite):
                                tmpSiteSpec = self.siteMapper.getSite(longSite)
                                if tmpSiteSpec.status == 'online':
                                    job.computingSite = longSite
                                    _logger.debug(' -> sending PandaID:%s to %s'
                                                  % (job.PandaID, job.computingSite))
                                    # set destinationSE
                                    if job.destinationSE == oldComputingSite:
                                        job.destinationSE = job.computingSite
                                    break
                    # modify LFNs and destinationSE
                    for file in job.Files:
                        modTypes = ('output', 'log')
                        if file.type in modTypes:
                            # set destinationSE
                            if file.destinationSE == oldComputingSite:
                                file.destinationSE = job.computingSite
                        if job.prodSourceLabel == 'panda':
                            # doesn't change output for buildJob
                            modTypes = ('log',)
                        if file.type in modTypes:
                            # set new GUID
                            if file.type == 'log':
                                file.GUID = commands.getoutput('uuidgen')
                            # add attempt nr
                            oldName = file.lfn
                            file.lfn = re.sub(r"\.\d+$", "", file.lfn)
                            file.lfn = "%s.%d" % (file.lfn, job.attemptNr)
                            newName = file.lfn
                            # modify jobParameters
                            sepPatt = "(\'|\"|%20|:)" + oldName + "(\'|\"|%20| )"
                            matches = re.findall(sepPatt, job.jobParameters)
                            for match in matches:
                                oldPatt = match[0] + oldName + match[-1]
                                newPatt = match[0] + newName + match[-1]
                                job.jobParameters = re.sub(oldPatt, newPatt, job.jobParameters)
                else:
                    if job.jobStatus == 'sent':
                        # sent job didn't receive reply from pilot within 30 min
                        job.jobDispatcherErrorCode = ErrorCode.EC_SendError
                        job.jobDispatcherErrorDiag = "Sent job didn't receive reply from pilot within 30 min"
                    elif job.exeErrorDiag == 'NULL' and job.pilotErrorDiag == 'NULL':
                        # lost heartbeat
                        job.jobDispatcherErrorCode = ErrorCode.EC_Watcher
                        if job.jobDispatcherErrorDiag == 'NULL':
                            if job.endTime == 'NULL':
                                # normal lost heartbeat
                                job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.modificationTime)
                            else:
                                # job recovery failed
                                job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.endTime)
                                if job.jobStatus == 'transferring':
                                    job.jobDispatcherErrorDiag += ' in transferring'
                    else:
                        # job recovery failed
                        job.jobDispatcherErrorCode = ErrorCode.EC_Recovery
                        job.jobDispatcherErrorDiag = 'job recovery failed for %s hours' % (self.sleepTime / 60)
                    # set job status
                    job.jobStatus = 'failed'
                    # set endTime for lost heartbeat
                    if job.endTime == 'NULL':
                        # normal lost heartbeat
                        job.endTime = job.modificationTime
                    # set files status
                    for file in job.Files:
                        if file.type == 'output' or file.type == 'log':
                            file.status = 'failed'
                            if file.destinationDBlock not in destDBList:
                                destDBList.append(file.destinationDBlock)
                # update job
                self.taskBuffer.updateJobs([job], False)
                # start closer
                if job.jobStatus == 'failed':
                    source = 'jobDispatcherErrorCode'
                    error_code = job.jobDispatcherErrorCode
                    error_diag = job.jobDispatcherErrorDiag
                    try:
                        _logger.debug("Watcher will call apply_retrial_rules")
                        retryModule.apply_retrial_rules(self.taskBuffer, job.PandaID, source,
                                                        error_code, error_diag, job.attemptNr)
                        _logger.debug("apply_retrial_rules is back")
                    except Exception as e:
                        _logger.debug("apply_retrial_rules excepted and needs to be investigated (%s)" % e)
                cThr = Closer(self.taskBuffer, destDBList, job)
                cThr.start()
                cThr.join()
                _logger.debug('%s end' % job.PandaID)
                return
            # single action
            if self.single:
                return
            # sleep
            time.sleep(60 * self.sleepTime)
    except:
        errType, errValue, traceBack = sys.exc_info()
        _logger.error("run() : %s %s" % (errType, errValue))
        return
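
# ---------------------------------------------------------------------------
# Note on the retry branch above: the LFN rename strips any existing numeric
# attempt suffix before appending the new one, so repeated resets do not
# stack suffixes. A standalone illustration (same regexes, made-up name):
#
#     import re
#     lfn = 'user.jdoe.job.log.tgz.3'
#     lfn = re.sub(r"\.\d+$", "", lfn)   # -> 'user.jdoe.job.log.tgz'
#     lfn = "%s.%d" % (lfn, 4)           # -> 'user.jdoe.job.log.tgz.4'
#
# The jobParameters rewrite then only touches occurrences of the old name
# delimited by quotes, '%20', ':' or a space, so substrings of longer names
# are left alone.
# ---------------------------------------------------------------------------
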
def run(self):
    try:
        self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus, self.attemptNr))
        # lock XML
        self.lockXML = open(self.xmlFile)
        try:
            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
        except:
            self.logger.debug("cannot get lock : %s" % self.xmlFile)
            self.lockXML.close()
            # remove XML just in case for the final attempt
            if not self.ignoreTmpError:
                try:
                    # remove Catalog
                    os.remove(self.xmlFile)
                except:
                    pass
            return
        # check if file exists
        if not os.path.exists(self.xmlFile):
            self.logger.debug("not exist : %s" % self.xmlFile)
            try:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                self.lockXML.close()
            except:
                pass
            return
        # query job
        self.job = self.taskBuffer.peekJobs([self.jobID], fromDefined=False,
                                            fromWaiting=False, forAnal=True)[0]
        # check if job has finished
        if self.job is None:
            self.logger.debug(': job not found in DB')
        elif self.job.jobStatus in ['finished', 'failed', 'unknown', 'merging']:
            self.logger.error(': invalid state -> %s' % self.job.jobStatus)
        elif self.attemptNr is not None and self.job.attemptNr != self.attemptNr:
            self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr, self.attemptNr))
        elif self.attemptNr is not None and self.job.jobStatus == 'transferring':
            errMsg = 'XML with attemptNr for {0}'.format(self.job.jobStatus)
            self.logger.error(errMsg)
            # FIXME
            raise RuntimeError(errMsg)
        elif self.jobStatus == EventServiceUtils.esRegStatus:
            # instantiate concrete plugin
            adderPluginClass = self.getPluginClass(self.job.VO)
            adderPlugin = adderPluginClass(self.job,
                                           taskBuffer=self.taskBuffer,
                                           siteMapper=self.siteMapper,
                                           logger=self.logger)
            # execute
            self.logger.debug('plugin is ready for ES file registration')
            adderPlugin.registerEventServiceFiles()
        else:
            # check file status in JEDI
            if not self.job.isCancelled() and \
                    self.job.taskBufferErrorCode not in [taskbuffer.ErrorCode.EC_PilotRetried]:
                fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job)
                self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI))
                if fileCheckInJEDI is None:
                    raise RuntimeError('failed to check file status in JEDI')
                if fileCheckInJEDI is False:
                    # set job status to failed since some file status is wrong in JEDI
                    self.jobStatus = 'failed'
                    self.job.ddmErrorCode = ErrorCode.EC_Adder
                    errStr = "inconsistent file status between Panda and JEDI. "
                    errStr += "failed to avoid duplicated processing caused by synchronization failure"
                    self.job.ddmErrorDiag = errStr
                    self.logger.debug("set jobStatus={0} since input is inconsistent between "
                                      "Panda and JEDI".format(self.jobStatus))
                elif self.job.jobSubStatus in ['pilot_closed']:
                    # terminated by the pilot
                    self.logger.debug("going to closed since terminated by the pilot")
                    retClosed = self.taskBuffer.killJobs([self.jobID], 'pilot', '60', True)
                    if retClosed[0] is True:
                        self.logger.debug("end")
                        try:
                            # remove Catalog
                            os.remove(self.xmlFile)
                        except:
                            pass
                        # unlock XML
                        if self.lockXML is not None:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        return
                # check for cloned jobs
                if EventServiceUtils.isJobCloningJob(self.job):
                    checkJC = self.taskBuffer.checkClonedJob(self.job)
                    if checkJC is None:
                        raise RuntimeError('failed to check the cloned job')
                    # failed to lock semaphore
                    if checkJC['lock'] is False:
                        self.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "failed to lock semaphore for job cloning"
                        self.logger.debug("set jobStatus={0} since did not get semaphore "
                                          "for job cloning".format(self.jobStatus))
            # use failed for cancelled/closed jobs
            if self.job.isCancelled():
                self.jobStatus = 'failed'
                # reset error codes to skip retrial module
                self.job.pilotErrorCode = 0
                self.job.exeErrorCode = 0
                self.job.ddmErrorCode = 0
            # keep old status
            oldJobStatus = self.job.jobStatus
            # set job status
            if self.job.jobStatus not in ['transferring']:
                self.job.jobStatus = self.jobStatus
            addResult = None
            adderPlugin = None
            # parse XML
            parseResult = self.parseXML()
            if parseResult < 2:
                # interaction with DDM
                try:
                    # instantiate concrete plugin
                    adderPluginClass = self.getPluginClass(self.job.VO)
                    adderPlugin = adderPluginClass(self.job,
                                                   taskBuffer=self.taskBuffer,
                                                   siteMapper=self.siteMapper,
                                                   extraInfo=self.extraInfo,
                                                   logger=self.logger)
                    # execute
                    self.logger.debug('plugin is ready')
                    adderPlugin.execute()
                    addResult = adderPlugin.result
                    self.logger.debug('plugin done with %s' % addResult.statusCode)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    self.logger.error("failed to execute AdderPlugin for VO={0} with {1}:{2}"
                                      .format(self.job.VO, errtype, errvalue))
                    addResult = None
                    self.job.ddmErrorCode = ErrorCode.EC_Adder
                    self.job.ddmErrorDiag = "AdderPlugin failure"
                # ignore temporary errors
                if self.ignoreTmpError and addResult is not None and addResult.isTemporary():
                    self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag)
                    self.logger.debug('escape')
                    # unlock XML
                    try:
                        fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                        self.lockXML.close()
                    except:
                        errType, errValue, traceBack = sys.exc_info()
                        self.logger.debug(": %s %s" % (errType, errValue))
                        self.logger.debug("cannot unlock XML")
                    return
                # failed
                if addResult is None or not addResult.isSucceeded():
                    self.job.jobStatus = 'failed'
            # set file status for failed jobs or failed transferring jobs
            self.logger.debug("status after plugin call :job.jobStatus=%s jobStatus=%s"
                              % (self.job.jobStatus, self.jobStatus))
            if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                # First of all: check if job failed and in this case take first actions according to error table
                source, error_code, error_diag = None, None, None
                if self.job.pilotErrorCode:
                    source = 'pilotErrorCode'
                    error_code = self.job.pilotErrorCode
                    error_diag = self.job.pilotErrorDiag
                elif self.job.exeErrorCode:
                    source = 'exeErrorCode'
                    error_code = self.job.exeErrorCode
                    error_diag = self.job.exeErrorDiag
                elif self.job.ddmErrorCode:
                    source = 'ddmErrorCode'
                    error_code = self.job.ddmErrorCode
                    error_diag = self.job.ddmErrorDiag
                elif self.job.transExitCode:
                    source = 'transExitCode'
                    error_code = self.job.transExitCode
                    error_diag = ''
                # _logger.info("updatejob has source %s, error_code %s and error_diag %s"
                #              % (source, error_code, error_diag))
                if source and error_code:
                    try:
                        self.logger.debug("AdderGen.run will call apply_retrial_rules")
                        retryModule.apply_retrial_rules(self.taskBuffer, self.job.PandaID, source,
                                                        error_code, error_diag, self.job.attemptNr)
                        self.logger.debug("apply_retrial_rules is back")
                    except Exception as e:
                        self.logger.error("apply_retrial_rules excepted and needs to be investigated (%s): %s"
                                          % (e, traceback.format_exc()))
                self.job.jobStatus = 'failed'
                for file in self.job.Files:
                    if file.type in ['output', 'log']:
                        if addResult is not None and file.lfn in addResult.mergingFiles:
                            file.status = 'merging'
                        else:
                            file.status = 'failed'
            else:
                # reset errors
                self.job.jobDispatcherErrorCode = 0
                self.job.jobDispatcherErrorDiag = 'NULL'
                # set status
                if addResult is not None and addResult.mergingFiles != []:
                    # set status for merging
                    for file in self.job.Files:
                        if file.lfn in addResult.mergingFiles:
                            file.status = 'merging'
                    self.job.jobStatus = 'merging'
                    # propagate transition to prodDB
                    self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
                elif addResult is not None and addResult.transferringFiles != []:
                    # set status for transferring
                    for file in self.job.Files:
                        if file.lfn in addResult.transferringFiles:
                            file.status = 'transferring'
                    self.job.jobStatus = 'transferring'
                    self.job.jobSubStatus = None
                    # propagate transition to prodDB
                    self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
                else:
                    self.job.jobStatus = 'finished'
            # endtime
            if self.job.endTime == 'NULL':
                self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
            # output size and # of outputs
            self.job.nOutputDataFiles = 0
            self.job.outputFileBytes = 0
            for tmpFile in self.job.Files:
                if tmpFile.type == 'output':
                    self.job.nOutputDataFiles += 1
                    try:
                        self.job.outputFileBytes += tmpFile.fsize
                    except:
                        pass
            # protection
            maxOutputFileBytes = 99999999999
            if self.job.outputFileBytes > maxOutputFileBytes:
                self.job.outputFileBytes = maxOutputFileBytes
            # set cancelled state
            if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                self.job.jobStatus = 'cancelled'
            # update job
            if oldJobStatus in ['cancelled', 'closed']:
                pass
            else:
                self.logger.debug("updating DB")
                retU = self.taskBuffer.updateJobs([self.job], False,
                                                  oldJobStatusList=[oldJobStatus],
                                                  extraInfo=self.extraInfo)
                self.logger.debug("retU: %s" % retU)
                # failed
                if not retU[0]:
                    self.logger.error('failed to update DB for pandaid={0}'.format(self.job.PandaID))
                    # unlock XML
                    try:
                        fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                        self.lockXML.close()
                    except:
                        errType, errValue, traceBack = sys.exc_info()
                        self.logger.debug(": %s %s" % (errType, errValue))
                        self.logger.debug("cannot unlock XML")
                    return
                try:
                    # updateJobs was successful and it failed a job with taskBufferErrorCode
                    self.logger.debug("AdderGen.run will peek the job")
                    job_tmp = self.taskBuffer.peekJobs([self.job.PandaID], fromDefined=False,
                                                       fromArchived=True, fromWaiting=False)[0]
                    self.logger.debug("status {0}, taskBufferErrorCode {1}, taskBufferErrorDiag {2}"
                                      .format(job_tmp.jobStatus, job_tmp.taskBufferErrorCode,
                                              job_tmp.taskBufferErrorDiag))
                    if job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode:
                        source = 'taskBufferErrorCode'
                        error_code = job_tmp.taskBufferErrorCode
                        error_diag = job_tmp.taskBufferErrorDiag
                        self.logger.debug("AdderGen.run 2 will call apply_retrial_rules")
                        retryModule.apply_retrial_rules(self.taskBuffer, job_tmp.PandaID, source,
                                                        error_code, error_diag, job_tmp.attemptNr)
                        self.logger.debug("apply_retrial_rules 2 is back")
                except IndexError:
                    pass
                except Exception as e:
                    self.logger.error("apply_retrial_rules 2 excepted and needs to be investigated (%s): %s"
                                      % (e, traceback.format_exc()))
            # setup for closer
            if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.isCancelled()):
                destDBList = []
                guidList = []
                for file in self.job.Files:
                    # ignore inputs
                    if file.type == 'input':
                        continue
                    # skip pseudo datasets
                    if file.destinationDBlock in ['', None, 'NULL']:
                        continue
                    # start closer for output/log datasets
                    if file.destinationDBlock not in destDBList:
                        destDBList.append(file.destinationDBlock)
                    # collect GUIDs
                    if (self.job.prodSourceLabel == 'panda' or
                        (self.job.prodSourceLabel in ['rucio_test'] + JobUtils.list_ptest_prod_sources and
                         self.job.processingType in ['pathena', 'prun', 'gangarobot-rctest', 'hammercloud'])) \
                            and file.type == 'output':
                        # extract base LFN since LFN was changed to full LFN for CMS
                        baseLFN = file.lfn.split('/')[-1]
                        guidList.append({'lfn': baseLFN, 'guid': file.GUID, 'type': file.type,
                                         'checksum': file.checksum, 'md5sum': file.md5sum,
                                         'fsize': file.fsize, 'scope': file.scope})
                if guidList != []:
                    retG = self.taskBuffer.setGUIDs(guidList)
                if destDBList != []:
                    # start Closer
                    if adderPlugin is not None and hasattr(adderPlugin, 'datasetMap') \
                            and adderPlugin.datasetMap != {}:
                        cThr = Closer.Closer(self.taskBuffer, destDBList, self.job,
                                             datasetMap=adderPlugin.datasetMap)
                    else:
                        cThr = Closer.Closer(self.taskBuffer, destDBList, self.job)
                    self.logger.debug("start Closer")
                    cThr.start()
                    cThr.join()
                    self.logger.debug("end Closer")
                    # run closer for associated parallel jobs
                    if EventServiceUtils.isJobCloningJob(self.job):
                        assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(
                            self.job.jediTaskID, self.job.PandaID, destDBList)
                        for assJobID, assDBlocks in assDBlockMap.iteritems():
                            assJob = self.taskBuffer.peekJobs([assJobID], fromDefined=False,
                                                              fromArchived=False,
                                                              fromWaiting=False, forAnal=True)[0]
                            if assJob is None:
                                self.logger.debug(': associated job PandaID={0} not found in DB'
                                                  .format(assJobID))
                            else:
                                cThr = Closer.Closer(self.taskBuffer, assDBlocks, assJob)
                                self.logger.debug("start Closer for PandaID={0}".format(assJobID))
                                cThr.start()
                                cThr.join()
                                self.logger.debug("end Closer for PandaID={0}".format(assJobID))
        self.logger.debug("end")
        try:
            # remove Catalog
            os.remove(self.xmlFile)
        except:
            pass
        # unlock XML
        if self.lockXML is not None:
            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
            self.lockXML.close()
    except:
        errType, errValue, traceBack = sys.exc_info()
        errStr = ": %s %s " % (errType, errValue)
        errStr += traceback.format_exc()
        self.logger.error(errStr)
        self.logger.error("except")
        # unlock XML just in case
        try:
            if self.lockXML is not None:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
        except:
            errType, errValue, traceBack = sys.exc_info()
            self.logger.error(": %s %s" % (errType, errValue))
            self.logger.error("cannot unlock XML")
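
# ---------------------------------------------------------------------------
# For reference, each guidList entry built above is a plain dict per output
# file, registered in one batch via taskBuffer.setGUIDs(). Illustrative
# values only:
#
#     {'lfn': 'panda.0101.AOD.pool.root', 'guid': 'A1B2C3D4-...',
#      'type': 'output', 'checksum': 'ad:89abcdef', 'md5sum': None,
#      'fsize': 1048576, 'scope': 'user.jdoe'}
# ---------------------------------------------------------------------------
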
def run(self):
    try:
        while True:
            _logger.debug('%s start' % self.pandaID)
            # query job
            job = self.taskBuffer.peekJobs([self.pandaID], fromDefined=False,
                                           fromArchived=False, fromWaiting=False)[0]
            # check job status
            if job is None:
                _logger.debug('%s escape : not found' % self.pandaID)
                return
            _logger.debug('%s in %s' % (self.pandaID, job.jobStatus))
            if job.jobStatus not in ['running', 'sent', 'starting', 'holding',
                                     'stagein', 'stageout']:
                if job.jobStatus == 'transferring' and \
                        (job.prodSourceLabel in ['user', 'panda']
                         or job.jobSubStatus not in [None, 'NULL', '']):
                    pass
                else:
                    _logger.debug('%s escape : %s' % (self.pandaID, job.jobStatus))
                    return
            # time limit
            timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=self.sleepTime)
            if job.modificationTime < timeLimit or (job.endTime != 'NULL' and job.endTime < timeLimit):
                _logger.debug('%s %s lastmod:%s endtime:%s' % (job.PandaID, job.jobStatus,
                                                               str(job.modificationTime),
                                                               str(job.endTime)))
                destDBList = []
                if job.jobStatus == 'sent':
                    # sent job didn't receive reply from pilot within 30 min
                    job.jobDispatcherErrorCode = ErrorCode.EC_SendError
                    job.jobDispatcherErrorDiag = "Sent job didn't receive reply from pilot within 30 min"
                elif job.exeErrorDiag == 'NULL' and job.pilotErrorDiag == 'NULL':
                    # lost heartbeat
                    job.jobDispatcherErrorCode = ErrorCode.EC_Watcher
                    if job.jobDispatcherErrorDiag == 'NULL':
                        if job.endTime == 'NULL':
                            # normal lost heartbeat
                            job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.modificationTime)
                        else:
                            # job recovery failed
                            job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.endTime)
                            if job.jobStatus == 'transferring':
                                job.jobDispatcherErrorDiag += ' in transferring'
                    # get worker
                    workerSpecs = self.taskBuffer.getWorkersForJob(job.PandaID)
                    if len(workerSpecs) > 0:
                        workerSpec = workerSpecs[0]
                        if workerSpec.status in ['finished', 'failed', 'cancelled', 'missed']:
                            job.supErrorCode = SupErrors.error_codes['WORKER_ALREADY_DONE']
                            job.supErrorDiag = 'worker already {0} at {1} with {2}'.format(
                                workerSpec.status, str(workerSpec.endTime), workerSpec.diagMessage)
                            job.supErrorDiag = JobSpec.truncateStringAttr('supErrorDiag', job.supErrorDiag)
                else:
                    # job recovery failed
                    job.jobDispatcherErrorCode = ErrorCode.EC_Recovery
                    job.jobDispatcherErrorDiag = 'job recovery failed for %s hours' % (self.sleepTime / 60)
                # set job status
                job.jobStatus = 'failed'
                # set endTime for lost heartbeat
                if job.endTime == 'NULL':
                    # normal lost heartbeat
                    job.endTime = job.modificationTime
                # set files status
                for file in job.Files:
                    if file.type == 'output' or file.type == 'log':
                        file.status = 'failed'
                        if file.destinationDBlock not in destDBList:
                            destDBList.append(file.destinationDBlock)
                # event service
                if EventServiceUtils.isEventServiceJob(job) and not EventServiceUtils.isJobCloningJob(job):
                    eventStat = self.taskBuffer.getEventStat(job.jediTaskID, job.PandaID)
                    # set sub status when no successful events
                    if EventServiceUtils.ST_finished not in eventStat:
                        job.jobSubStatus = 'es_heartbeat'
                # update job
                self.taskBuffer.updateJobs([job], False)
                # start closer
                if job.jobStatus == 'failed':
                    source = 'jobDispatcherErrorCode'
                    error_code = job.jobDispatcherErrorCode
                    error_diag = job.jobDispatcherErrorDiag
                    try:
                        _logger.debug("Watcher will call apply_retrial_rules")
                        retryModule.apply_retrial_rules(self.taskBuffer, job.PandaID, source,
                                                        error_code, error_diag, job.attemptNr)
                        _logger.debug("apply_retrial_rules is back")
                    except Exception as e:
                        _logger.debug("apply_retrial_rules excepted and needs to be investigated (%s): %s"
                                      % (e, traceback.format_exc()))
                    # updateJobs was successful and it failed a job with taskBufferErrorCode
                    try:
                        _logger.debug("Watcher.run will peek the job")
                        job_tmp = self.taskBuffer.peekJobs([job.PandaID], fromDefined=False,
                                                           fromArchived=True, fromWaiting=False)[0]
                        if job_tmp.taskBufferErrorCode:
                            source = 'taskBufferErrorCode'
                            error_code = job_tmp.taskBufferErrorCode
                            error_diag = job_tmp.taskBufferErrorDiag
                            _logger.debug("Watcher.run 2 will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(self.taskBuffer, job_tmp.PandaID, source,
                                                            error_code, error_diag, job_tmp.attemptNr)
                            _logger.debug("apply_retrial_rules 2 is back")
                    except IndexError:
                        pass
                    except Exception as e:
                        _logger.error("apply_retrial_rules 2 excepted and needs to be investigated (%s): %s"
                                      % (e, traceback.format_exc()))
                cThr = Closer(self.taskBuffer, destDBList, job)
                cThr.start()
                cThr.join()
                _logger.debug('%s end' % job.PandaID)
                return
            # single action
            if self.single:
                return
            # sleep
            time.sleep(60 * self.sleepTime)
    except:
        errType, errValue, traceBack = sys.exc_info()
        _logger.error("run() : %s %s" % (errType, errValue))
        return
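
# ---------------------------------------------------------------------------
# This variant cross-checks a lost heartbeat against the job's worker record:
# if the worker already reached a terminal state, the failure is annotated
# with supErrorCode/supErrorDiag instead of being treated as a bare lost
# heartbeat. Sketch of that lookup (names as used above):
#
#     workerSpecs = taskBuffer.getWorkersForJob(job.PandaID)
#     if workerSpecs and workerSpecs[0].status in ('finished', 'failed',
#                                                  'cancelled', 'missed'):
#         job.supErrorCode = SupErrors.error_codes['WORKER_ALREADY_DONE']
# ---------------------------------------------------------------------------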