Example #1
    def run(self):
        try:
            while True:
                _logger.debug('%s start' % self.pandaID)
                # query job
                job = self.taskBuffer.peekJobs([self.pandaID],
                                               fromDefined=False,
                                               fromArchived=False,
                                               fromWaiting=False)[0]
                # check job status
                if job == None:
                    _logger.debug('%s escape : not found' % self.pandaID)
                    return
                if not job.jobStatus in [
                        'running', 'sent', 'starting', 'holding', 'stagein',
                        'stageout'
                ]:
                    if job.jobStatus == 'transferring' and job.prodSourceLabel in [
                            'user', 'panda'
                    ]:
                        pass
                    else:
                        _logger.debug('%s escape : %s' %
                                      (self.pandaID, job.jobStatus))
                        return
                # time limit
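                # a job is treated as lost if its modificationTime (or endTime) is older than sleepTime minutes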
                timeLimit = datetime.datetime.utcnow() - datetime.timedelta(
                    minutes=self.sleepTime)
                if job.modificationTime < timeLimit or (
                        job.endTime != 'NULL' and job.endTime < timeLimit):
                    _logger.debug(
                        '%s %s lastmod:%s endtime:%s' %
                        (job.PandaID, job.jobStatus, str(
                            job.modificationTime), str(job.endTime)))
                    # retry ES merge jobs
                    if EventServiceUtils.isEventServiceMerge(job):
                        self.taskBuffer.retryJob(job.PandaID, {},
                                                 getNewPandaID=True,
                                                 attemptNr=job.attemptNr,
                                                 recoverableEsMerge=True)
                        # read back
                        job = self.taskBuffer.peekJobs([self.pandaID],
                                                       fromDefined=False,
                                                       fromArchived=False,
                                                       fromWaiting=False)[0]
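                    # collect destination datasets of output/log files so the Closer can finalize them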
                    destDBList = []
                    if job.jobStatus == 'sent':
                        # sent job didn't receive reply from pilot within 30 min
                        job.jobDispatcherErrorCode = ErrorCode.EC_SendError
                        job.jobDispatcherErrorDiag = "Sent job didn't receive reply from pilot within 30 min"
                    elif job.exeErrorDiag == 'NULL' and job.pilotErrorDiag == 'NULL':
                        # lost heartbeat
                        job.jobDispatcherErrorCode = ErrorCode.EC_Watcher
                        if job.jobDispatcherErrorDiag == 'NULL':
                            if job.endTime == 'NULL':
                                # normal lost heartbeat
                                job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(
                                    job.modificationTime)
                            else:
                                # job recovery failed
                                job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(
                                    job.endTime)
                                if job.jobStatus == 'transferring':
                                    job.jobDispatcherErrorDiag += ' in transferring'
                    else:
                        # job recovery failed
                        job.jobDispatcherErrorCode = ErrorCode.EC_Recovery
                        job.jobDispatcherErrorDiag = 'job recovery failed for %s hours' % (
                            self.sleepTime / 60)
                    # set job status
                    job.jobStatus = 'failed'
                    # set endTime for lost heartbeat
                    if job.endTime == 'NULL':
                        # normal lost heartbeat
                        job.endTime = job.modificationTime
                    # set files status
                    for file in job.Files:
                        if file.type == 'output' or file.type == 'log':
                            file.status = 'failed'
                            if not file.destinationDBlock in destDBList:
                                destDBList.append(file.destinationDBlock)
                    # event service
                    if EventServiceUtils.isEventServiceJob(
                            job
                    ) and not EventServiceUtils.isJobCloningJob(job):
                        eventStat = self.taskBuffer.getEventStat(
                            job.jediTaskID, job.PandaID)
                        # set sub status when no successful events
                        if EventServiceUtils.ST_finished not in eventStat:
                            job.jobSubStatus = 'es_heartbeat'
                    # update job
                    self.taskBuffer.updateJobs([job], False)
                    # start closer
                    if job.jobStatus == 'failed':

                        source = 'jobDispatcherErrorCode'
                        error_code = job.jobDispatcherErrorCode
                        error_diag = job.jobDispatcherErrorDiag

                        try:
                            _logger.debug(
                                "Watcher will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(
                                self.taskBuffer, job.PandaID, source,
                                error_code, error_diag, job.attemptNr)
                            _logger.debug("apply_retrial_rules is back")
                        except Exception as e:
                            _logger.debug(
                                "apply_retrial_rules excepted and needs to be investigated (%s): %s"
                                % (e, traceback.format_exc()))

                        # updateJobs may have failed the job with a taskBufferErrorCode; read it back and re-apply retrial rules
                        try:

                            _logger.debug("Watcher.run will peek the job")
                            job_tmp = self.taskBuffer.peekJobs(
                                [job.PandaID],
                                fromDefined=False,
                                fromArchived=True,
                                fromWaiting=False)[0]
                            if job_tmp.taskBufferErrorCode:
                                source = 'taskBufferErrorCode'
                                error_code = job_tmp.taskBufferErrorCode
                                error_diag = job_tmp.taskBufferErrorDiag
                                _logger.debug(
                                    "Watcher.run 2 will call apply_retrial_rules"
                                )
                                retryModule.apply_retrial_rules(
                                    self.taskBuffer, job_tmp.PandaID, source,
                                    error_code, error_diag, job_tmp.attemptNr)
                                _logger.debug("apply_retrial_rules 2 is back")
                        except IndexError:
                            pass
                        except Exception as e:
                            _logger.error(
                                "apply_retrial_rules 2 excepted and needs to be investigated (%s): %s"
                                % (e, traceback.format_exc()))

                        cThr = Closer(self.taskBuffer, destDBList, job)
                        cThr.start()
                        cThr.join()
                    _logger.debug('%s end' % job.PandaID)
                    return
                # single action
                if self.single:
                    return
                # sleep
                time.sleep(60 * self.sleepTime)
        except:
            type, value, traceBack = sys.exc_info()
            _logger.error("run() : %s %s" % (type, value))
            return
 def getJob(self, siteName, prodSourceLabel, cpu, mem, diskSpace, node,
            timeout, computingElement, atlasRelease, prodUserID,
            getProxyKey, countryGroup, workingGroup, allowOtherCountry,
            realDN, taskID, nJobs, acceptJson):
     jobs = []
     useGLEXEC = False
     useProxyCache = False
      try:
          tmpNumJobs = int(nJobs)
      except:
          # default to a single job when nJobs is missing or malformed
          tmpNumJobs = 1
     # wrapper function for timeout
     if hasattr(panda_config,
                'global_shares') and panda_config.global_shares == True:
         tmpWrapper = _TimedMethod(self.taskBuffer.getJobsGShare, timeout)
     else:
         tmpWrapper = _TimedMethod(self.taskBuffer.getJobs, timeout)
     tmpWrapper.run(tmpNumJobs, siteName, prodSourceLabel, cpu, mem,
                    diskSpace, node, timeout, computingElement,
                    atlasRelease, prodUserID, getProxyKey, countryGroup,
                    workingGroup, allowOtherCountry, taskID)
      if isinstance(tmpWrapper.result, list):
         jobs = jobs + tmpWrapper.result
     # make response
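      # on success the result list is [job1, ..., jobN, nSent, proxyKey]; strip the two trailing metadata entries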
     if len(jobs) > 0:
         proxyKey = jobs[-1]
         nSent = jobs[-2]
         jobs = jobs[:-2]
     if len(jobs) != 0:
         # succeed
         self.siteMapperCache.update()
         responseList = []
         # append Jobs
         for tmpJob in jobs:
             response = Protocol.Response(Protocol.SC_Success)
             response.appendJob(tmpJob, self.siteMapperCache)
             # append nSent
             response.appendNode('nSent', nSent)
             # set proxy key
             if getProxyKey:
                 response.setProxyKey(proxyKey)
             # check if glexec or proxy cache is used
             if hasattr(panda_config, 'useProxyCache'
                        ) and panda_config.useProxyCache == True:
                 self.specialDispatchParams.update()
                 if not 'glexecSites' in self.specialDispatchParams:
                     glexecSites = {}
                 else:
                     glexecSites = self.specialDispatchParams['glexecSites']
                 if siteName in glexecSites:
                     if glexecSites[siteName] == 'True':
                         useGLEXEC = True
                     elif glexecSites[siteName] == 'test' and \
                             (prodSourceLabel in ['test','prod_test'] or \
                                  (tmpJob.processingType in ['gangarobot'])):
                         useGLEXEC = True
                 if not 'proxyCacheSites' in self.specialDispatchParams:
                     proxyCacheSites = {}
                 else:
                     proxyCacheSites = self.specialDispatchParams[
                         'proxyCacheSites']
                 if siteName in proxyCacheSites:
                     useProxyCache = True
             # set proxy
             if useGLEXEC or useProxyCache:
                 try:
                     #  get compact
                     compactDN = self.taskBuffer.cleanUserID(realDN)
                     # check permission
                     self.specialDispatchParams.update()
                     if not 'allowProxy' in self.specialDispatchParams:
                         allowProxy = []
                     else:
                         allowProxy = self.specialDispatchParams[
                             'allowProxy']
                     if not compactDN in allowProxy:
                         _logger.warning(
                             "getJob : %s %s '%s' no permission to retrive user proxy"
                             % (siteName, node, compactDN))
                     else:
                         if useProxyCache:
                             tmpStat, tmpOut = response.setUserProxy(
                                 proxyCacheSites[siteName]['dn'],
                                 proxyCacheSites[siteName]['role'])
                         else:
                             tmpStat, tmpOut = response.setUserProxy()
                         if not tmpStat:
                             _logger.warning(
                                 "getJob : %s %s failed to get user proxy : %s"
                                 % (siteName, node, tmpOut))
                 except:
                     errtype, errvalue = sys.exc_info()[:2]
                     _logger.warning(
                         "getJob : %s %s failed to get user proxy with %s:%s"
                         % (siteName, node, errtype.__name__, errvalue))
             # panda proxy
             if 'pandaProxySites' in self.specialDispatchParams and siteName in self.specialDispatchParams['pandaProxySites'] \
                     and (EventServiceUtils.isEventServiceJob(tmpJob) or EventServiceUtils.isEventServiceMerge(tmpJob)):
                 # get secret key
                 tmpSecretKey, tmpErrMsg = DispatcherUtils.getSecretKey(
                     tmpJob.PandaID)
                 if tmpSecretKey == None:
                     _logger.warning(
                         "getJob : PandaID=%s site=%s failed to get panda proxy secret key : %s"
                         % (tmpJob.PandaID, siteName, tmpErrMsg))
                 else:
                     # set secret key
                     _logger.debug("getJob : PandaID=%s key=%s" %
                                   (tmpJob.PandaID, tmpSecretKey))
                     response.setPandaProxySecretKey(tmpSecretKey)
             # add
             responseList.append(response.data)
         # make response for bulk
         if nJobs != None:
             response = Protocol.Response(Protocol.SC_Success)
             if not acceptJson:
                 response.appendNode('jobs', json.dumps(responseList))
             else:
                 response.appendNode('jobs', responseList)
     else:
         if tmpWrapper.result == Protocol.TimeOutToken:
             # timeout
             response = Protocol.Response(Protocol.SC_TimeOut)
         else:
             # no available jobs
             response = Protocol.Response(Protocol.SC_NoJobs)
             _pilotReqLogger.info('method=noJob,site=%s,node=%s,type=%s' %
                                  (siteName, node, prodSourceLabel))
     # return
     _logger.debug("getJob : %s %s useGLEXEC=%s ret -> %s" %
                   (siteName, node, useGLEXEC, response.encode(acceptJson)))
     return response.encode(acceptJson)
Example #3
    def run(self):
        try:
            self.logger.debug("new start: %s attemptNr=%s" %
                              (self.jobStatus, self.attemptNr))
            # lock XML
            self.lockXML = open(self.xmlFile)
            try:
                fcntl.flock(self.lockXML.fileno(),
                            fcntl.LOCK_EX | fcntl.LOCK_NB)
            except:
                self.logger.debug("cannot get lock : %s" % self.xmlFile)
                self.lockXML.close()
                # remove XML just in case for the final attempt
                if not self.ignoreTmpError:
                    try:
                        # remove Catalog
                        os.remove(self.xmlFile)
                    except:
                        pass
                return
            # check if file exists
            if not os.path.exists(self.xmlFile):
                self.logger.debug("not exist : %s" % self.xmlFile)
                try:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                    self.lockXML.close()
                except:
                    pass
                return
            # query job
            self.job = self.taskBuffer.peekJobs([self.jobID],
                                                fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
            # check if job has finished
            if self.job == None:
                self.logger.debug(': job not found in DB')
            elif self.job.jobStatus in [
                    'finished', 'failed', 'unknown', 'merging'
            ]:
                self.logger.error(': invalid state -> %s' % self.job.jobStatus)
            elif self.attemptNr != None and self.job.attemptNr != self.attemptNr:
                self.logger.error('wrong attemptNr -> job=%s <> %s' %
                                  (self.job.attemptNr, self.attemptNr))
            elif self.attemptNr is not None and self.job.jobStatus == 'transferring':
                errMsg = 'XML with attemptNr for {0}'.format(
                    self.job.jobStatus)
                self.logger.error(errMsg)
                # FIXME
                raise RuntimeError, errMsg
            elif self.jobStatus == EventServiceUtils.esRegStatus:
                # instantiate concrete plugin
                adderPluginClass = self.getPluginClass(self.job.VO)
                adderPlugin = adderPluginClass(self.job,
                                               taskBuffer=self.taskBuffer,
                                               siteMapper=self.siteMapper,
                                               logger=self.logger)
                # execute
                self.logger.debug('plugin is ready for ES file registration')
                adderPlugin.registerEventServiceFiles()
            else:
                # check file status in JEDI
                if not self.job.isCancelled(
                ) and not self.job.taskBufferErrorCode in [
                        taskbuffer.ErrorCode.EC_PilotRetried
                ]:
                    fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(
                        self.job)
                    self.logger.debug("check file status in JEDI : {0}".format(
                        fileCheckInJEDI))
                    if fileCheckInJEDI == None:
                        raise RuntimeError, 'failed to check file status in JEDI'
                    if fileCheckInJEDI == False:
                        # set job status to failed since some file status is wrong in JEDI
                        self.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        errStr = "inconsistent file status between Panda and JEDI. "
                        errStr += "failed to avoid duplicated processing caused by synchronization failure"
                        self.job.ddmErrorDiag = errStr
                        self.logger.debug(
                            "set jobStatus={0} since input is inconsistent between Panda and JEDI"
                            .format(self.jobStatus))
                    elif self.job.jobSubStatus in ['pilot_closed']:
                        # terminated by the pilot
                        self.logger.debug(
                            "going to closed since terminated by the pilot")
                        retClosed = self.taskBuffer.killJobs([self.jobID],
                                                             'pilot', '60',
                                                             True)
                        if retClosed[0] == True:
                            self.logger.debug("end")
                            try:
                                # remove Catalog
                                os.remove(self.xmlFile)
                            except:
                                pass
                            # unlock XML
                            if self.lockXML != None:
                                fcntl.flock(self.lockXML.fileno(),
                                            fcntl.LOCK_UN)
                                self.lockXML.close()
                            return
                    # check for cloned jobs
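                    # for job cloning, only the consumer that locks the semaphore may proceed; the others are failed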
                    if EventServiceUtils.isJobCloningJob(self.job):
                        checkJC = self.taskBuffer.checkClonedJob(self.job)
                        if checkJC == None:
                            raise RuntimeError, 'failed to check the cloned job'
                        # failed to lock semaphore
                        if checkJC['lock'] == False:
                            self.jobStatus = 'failed'
                            self.job.ddmErrorCode = ErrorCode.EC_Adder
                            self.job.ddmErrorDiag = "failed to lock semaphore for job cloning"
                            self.logger.debug(
                                "set jobStatus={0} since did not get semaphore for job cloning"
                                .format(self.jobStatus))
                # use failed for cancelled/closed jobs
                if self.job.isCancelled():
                    self.jobStatus = 'failed'
                    # reset error codes to skip retrial module
                    self.job.pilotErrorCode = 0
                    self.job.exeErrorCode = 0
                    self.job.ddmErrorCode = 0
                # keep old status
                oldJobStatus = self.job.jobStatus
                # set job status
                if not self.job.jobStatus in ['transferring']:
                    self.job.jobStatus = self.jobStatus
                addResult = None
                adderPlugin = None
                # parse XML
                parseResult = self.parseXML()
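                # parseXML returns 0 on success, 1 when the XML was already deleted, and 2 on fatal errors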
                if parseResult < 2:
                    # interaction with DDM
                    try:
                        # instantiate concrete plugin
                        adderPluginClass = self.getPluginClass(self.job.VO)
                        adderPlugin = adderPluginClass(
                            self.job,
                            taskBuffer=self.taskBuffer,
                            siteMapper=self.siteMapper,
                            extraInfo=self.extraInfo,
                            logger=self.logger)
                        # execute
                        self.logger.debug('plugin is ready')
                        adderPlugin.execute()
                        addResult = adderPlugin.result
                        self.logger.debug('plugin done with %s' %
                                          (addResult.statusCode))
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        self.logger.error(
                            "failed to execute AdderPlugin for VO={0} with {1}:{2}"
                            .format(self.job.VO, errtype, errvalue))
                        addResult = None
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "AdderPlugin failure"

                    # ignore temporary errors
                    if self.ignoreTmpError and addResult != None and addResult.isTemporary(
                    ):
                        self.logger.debug(': ignore %s ' %
                                          self.job.ddmErrorDiag)
                        self.logger.debug('escape')
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type, value))
                            self.logger.debug("cannot unlock XML")
                        return
                    # failed
                    if addResult == None or not addResult.isSucceeded():
                        self.job.jobStatus = 'failed'
                # set file status for failed jobs or failed transferring jobs
                self.logger.debug(
                    "status after plugin call :job.jobStatus=%s jobStatus=%s" %
                    (self.job.jobStatus, self.jobStatus))
                if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                    # First of all: check if job failed and in this case take first actions according to error table
                    source, error_code, error_diag = None, None, None
                    if self.job.pilotErrorCode:
                        source = 'pilotErrorCode'
                        error_code = self.job.pilotErrorCode
                        error_diag = self.job.pilotErrorDiag
                    elif self.job.exeErrorCode:
                        source = 'exeErrorCode'
                        error_code = self.job.exeErrorCode
                        error_diag = self.job.exeErrorDiag
                    elif self.job.ddmErrorCode:
                        source = 'ddmErrorCode'
                        error_code = self.job.ddmErrorCode
                        error_diag = self.job.ddmErrorDiag
                    elif self.job.transExitCode:
                        source = 'transExitCode'
                        error_code = self.job.transExitCode
                        error_diag = ''

                    # _logger.info("updatejob has source %s, error_code %s and error_diag %s"%(source, error_code, error_diag))

                    if source and error_code:
                        try:
                            self.logger.debug(
                                "AdderGen.run will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(
                                self.taskBuffer, self.job.PandaID, source,
                                error_code, error_diag, self.job.attemptNr)
                            self.logger.debug("apply_retrial_rules is back")
                        except Exception as e:
                            self.logger.error(
                                "apply_retrial_rules excepted and needs to be investigated (%s): %s"
                                % (e, traceback.format_exc()))

                    self.job.jobStatus = 'failed'
                    for file in self.job.Files:
                        if file.type in ['output', 'log']:
                            if addResult != None and file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                            else:
                                file.status = 'failed'
                else:
                    # reset errors
                    self.job.jobDispatcherErrorCode = 0
                    self.job.jobDispatcherErrorDiag = 'NULL'
                    # set status
                    if addResult != None and addResult.mergingFiles != []:
                        # set status for merging:
                        for file in self.job.Files:
                            if file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                        self.job.jobStatus = 'merging'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.gmtime())
                    elif addResult != None and addResult.transferringFiles != []:
                        # set status for transferring
                        for file in self.job.Files:
                            if file.lfn in addResult.transferringFiles:
                                file.status = 'transferring'
                        self.job.jobStatus = 'transferring'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.gmtime())
                    else:
                        self.job.jobStatus = 'finished'
                # endtime
                if self.job.endTime == 'NULL':
                    self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',
                                                     time.gmtime())
                # output size and # of outputs
                self.job.nOutputDataFiles = 0
                self.job.outputFileBytes = 0
                for tmpFile in self.job.Files:
                    if tmpFile.type == 'output':
                        self.job.nOutputDataFiles += 1
                        try:
                            self.job.outputFileBytes += tmpFile.fsize
                        except:
                            pass
                # protection
                maxOutputFileBytes = 99999999999
                if self.job.outputFileBytes > maxOutputFileBytes:
                    self.job.outputFileBytes = maxOutputFileBytes
                # set cancelled state
                if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                    self.job.jobStatus = 'cancelled'
                # update job
                if oldJobStatus in ['cancelled', 'closed']:
                    pass
                else:
                    self.logger.debug("updating DB")
                    retU = self.taskBuffer.updateJobs(
                        [self.job],
                        False,
                        oldJobStatusList=[oldJobStatus],
                        extraInfo=self.extraInfo)
                    self.logger.debug("retU: %s" % retU)
                    # failed
                    if not retU[0]:
                        self.logger.error(
                            'failed to update DB for pandaid={0}'.format(
                                self.job.PandaID))
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type, value))
                            self.logger.debug("cannot unlock XML")
                        return

                    try:
                        # updateJobs may have failed the job with a taskBufferErrorCode; read it back and re-apply retrial rules
                        self.logger.debug("AdderGen.run will peek the job")
                        job_tmp = self.taskBuffer.peekJobs(
                            [self.job.PandaID],
                            fromDefined=False,
                            fromArchived=True,
                            fromWaiting=False)[0]
                        self.logger.debug(
                            "status {0}, taskBufferErrorCode {1}, taskBufferErrorDiag {2}"
                            .format(job_tmp.jobStatus,
                                    job_tmp.taskBufferErrorCode,
                                    job_tmp.taskBufferErrorDiag))
                        if job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode:
                            source = 'taskBufferErrorCode'
                            error_code = job_tmp.taskBufferErrorCode
                            error_diag = job_tmp.taskBufferErrorDiag
                            self.logger.debug(
                                "AdderGen.run 2 will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(
                                self.taskBuffer, job_tmp.PandaID, source,
                                error_code, error_diag, job_tmp.attemptNr)
                            self.logger.debug("apply_retrial_rules 2 is back")
                    except IndexError:
                        pass
                    except Exception as e:
                        self.logger.error(
                            "apply_retrial_rules 2 excepted and needs to be investigated (%s): %s"
                            % (e, traceback.format_exc()))

                    # setup for closer
                    if not (EventServiceUtils.isEventServiceJob(self.job)
                            and self.job.isCancelled()):
                        destDBList = []
                        guidList = []
                        for file in self.job.Files:
                            # ignore inputs
                            if file.type == 'input':
                                continue
                            # skip pseudo datasets
                            if file.destinationDBlock in ['', None, 'NULL']:
                                continue
                            # start closer for output/log datasets
                            if not file.destinationDBlock in destDBList:
                                destDBList.append(file.destinationDBlock)
                            # collect GUIDs
                            if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['ptest','rc_test','rucio_test'] and \
                                                                      self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \
                                                                      and file.type == 'output':
                                # extract base LFN since LFN was changed to full LFN for CMS
                                baseLFN = file.lfn.split('/')[-1]
                                guidList.append({
                                    'lfn': baseLFN,
                                    'guid': file.GUID,
                                    'type': file.type,
                                    'checksum': file.checksum,
                                    'md5sum': file.md5sum,
                                    'fsize': file.fsize,
                                    'scope': file.scope
                                })
                        if guidList != []:
                            retG = self.taskBuffer.setGUIDs(guidList)
                        if destDBList != []:
                            # start Closer
                            if adderPlugin != None and hasattr(
                                    adderPlugin, 'datasetMap'
                            ) and adderPlugin.datasetMap != {}:
                                cThr = Closer.Closer(
                                    self.taskBuffer,
                                    destDBList,
                                    self.job,
                                    datasetMap=adderPlugin.datasetMap)
                            else:
                                cThr = Closer.Closer(self.taskBuffer,
                                                     destDBList, self.job)
                            self.logger.debug("start Closer")
                            cThr.start()
                            cThr.join()
                            self.logger.debug("end Closer")
                        # run Closer for associated parallel jobs
                        if EventServiceUtils.isJobCloningJob(self.job):
                            assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(
                                self.job.jediTaskID, self.job.PandaID,
                                destDBList)
                            for assJobID, assDBlocks in assDBlockMap.iteritems(
                            ):
                                assJob = self.taskBuffer.peekJobs(
                                    [assJobID],
                                    fromDefined=False,
                                    fromArchived=False,
                                    fromWaiting=False,
                                    forAnal=True)[0]
                                if assJob == None:
                                    self.logger.debug(
                                        ': associated job PandaID={0} not found in DB'
                                        .format(assJobID))
                                else:
                                    cThr = Closer.Closer(
                                        self.taskBuffer, assDBlocks, assJob)
                                    self.logger.debug(
                                        "start Closer for PandaID={0}".format(
                                            assJobID))
                                    cThr.start()
                                    cThr.join()
                                    self.logger.debug(
                                        "end Closer for PandaID={0}".format(
                                            assJobID))
            self.logger.debug("end")
            try:
                # remove Catalog
                os.remove(self.xmlFile)
            except:
                pass
            # unlock XML
            if self.lockXML != None:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                self.lockXML.close()
        except:
            type, value, traceBack = sys.exc_info()
            errStr = ": %s %s " % (type, value)
            errStr += traceback.format_exc()
            self.logger.error(errStr)
            self.logger.error("except")
            # unlock XML just in case
            try:
                if self.lockXML != None:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
            except:
                type, value, traceBack = sys.exc_info()
                self.logger.error(": %s %s" % (type, value))
                self.logger.error("cannot unlock XML")
Example #4
 def appendJob(self,job):
     # PandaID
     self.data['PandaID'] = job.PandaID
     # prodSourceLabel
     self.data['prodSourceLabel'] = job.prodSourceLabel
     # swRelease
     self.data['swRelease'] = job.AtlasRelease
     # homepackage
     self.data['homepackage'] = job.homepackage
     # transformation
     self.data['transformation'] = job.transformation
     # job name
     self.data['jobName'] = job.jobName
     # job definition ID
     self.data['jobDefinitionID'] = job.jobDefinitionID
     # cloud
     self.data['cloud'] = job.cloud
     # files
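      # serialize per-file attributes into comma-separated strings, one entry per file, aligned by position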
     strIFiles = ''
     strOFiles = ''
     strDispatch = ''
     strDisToken = ''
     strDisTokenForOutput = ''                
     strDestination = ''
     strRealDataset = ''
     strRealDatasetIn = ''
     strProdDBlock = ''
     strDestToken = ''
     strProdToken = ''
     strProdTokenForOutput = ''
     strGUID = ''
     strFSize = ''
     strCheckSum = ''
     strFileDestinationSE = ''
     strScopeIn  = ''
     strScopeOut = ''
     strScopeLog = ''        
     logFile = ''
     logGUID = ''        
     for file in job.Files:
         if file.type == 'input':
             if strIFiles != '':
                 strIFiles += ','
             strIFiles += file.lfn
             if strDispatch != '':
                 strDispatch += ','
             strDispatch += file.dispatchDBlock
             if strDisToken != '':
                 strDisToken += ','
             strDisToken += file.dispatchDBlockToken
             strProdDBlock += '%s,' % file.prodDBlock 
             if strProdToken != '':
                 strProdToken += ','
             strProdToken += file.prodDBlockToken
             if strGUID != '':
                 strGUID += ','
             strGUID += file.GUID
             strRealDatasetIn += '%s,' % file.dataset
             strFSize += '%s,' % file.fsize
             if not file.checksum in ['','NULL',None]:
                 strCheckSum += '%s,' % file.checksum
             else:
                 strCheckSum += '%s,' % file.md5sum
             strScopeIn += '%s,' % file.scope    
         if file.type == 'output' or file.type == 'log':
             if strOFiles != '':
                 strOFiles += ','
             strOFiles += file.lfn
             if strDestination != '':
                 strDestination += ','
             strDestination += file.destinationDBlock
             if strRealDataset != '':
                 strRealDataset += ','
             strRealDataset += file.dataset
             strFileDestinationSE += '%s,' % file.destinationSE
             if file.type == 'log':
                 logFile = file.lfn
                 logGUID = file.GUID
                 strScopeLog = file.scope
             else:
                 strScopeOut += '%s,' % file.scope                        
             if strDestToken != '':
                 strDestToken += ','
             strDestToken += file.destinationDBlockToken.split(',')[0]
             strDisTokenForOutput += '%s,' % file.dispatchDBlockToken
             strProdTokenForOutput += '%s,' % file.prodDBlockToken
     # inFiles
     self.data['inFiles'] = strIFiles
     # dispatch DBlock
     self.data['dispatchDblock'] = strDispatch
     # dispatch DBlock space token
     self.data['dispatchDBlockToken'] = strDisToken
     # dispatch DBlock space token for output
     self.data['dispatchDBlockTokenForOut'] = strDisTokenForOutput[:-1]
     # outFiles
     self.data['outFiles'] = strOFiles
     # destination DBlock
     self.data['destinationDblock'] = strDestination
     # destination DBlock space token
     self.data['destinationDBlockToken'] = strDestToken
     # prod DBlocks
     self.data['prodDBlocks'] = strProdDBlock[:-1]
     # prod DBlock space token
     self.data['prodDBlockToken'] = strProdToken
     # real output datasets
     self.data['realDatasets'] = strRealDataset
     # real output datasets
     self.data['realDatasetsIn'] = strRealDatasetIn[:-1]
     # file's destinationSE
     self.data['fileDestinationSE'] = strFileDestinationSE[:-1]
     # log filename
     self.data['logFile'] = logFile
     # log GUID
     self.data['logGUID'] = logGUID
     # jobPars
     self.data['jobPars'] = job.jobParameters
     # attempt number
     self.data['attemptNr'] = job.attemptNr
     # GUIDs
     self.data['GUID'] = strGUID
     # checksum
     self.data['checksum'] = strCheckSum[:-1]
     # fsize
     self.data['fsize'] = strFSize[:-1]
     # scope
     self.data['scopeIn']  = strScopeIn[:-1]
     self.data['scopeOut'] = strScopeOut[:-1]
     self.data['scopeLog'] = strScopeLog
     # destinationSE
     self.data['destinationSE'] = job.destinationSE
     # user ID
     self.data['prodUserID'] = job.prodUserID
     # CPU count
     self.data['maxCpuCount'] = job.maxCpuCount
     # RAM count
     self.data['minRamCount'] = job.minRamCount
     # disk count
     self.data['maxDiskCount'] = job.maxDiskCount
     # cmtconfig
     self.data['cmtConfig'] = job.cmtConfig
     # processingType
     self.data['processingType'] = job.processingType
     # transferType
     self.data['transferType'] = job.transferType
     # sourceSite
     self.data['sourceSite'] = job.sourceSite
     # current priority
     self.data['currentPriority'] = job.currentPriority
     # taskID
     if job.lockedby == 'jedi':
         self.data['taskID'] = job.jediTaskID
     else:
         self.data['taskID'] = job.taskID
     # core count
     self.data['coreCount'] = job.coreCount
     # jobsetID
     self.data['jobsetID'] = job.jobsetID
     # debug mode
     if job.specialHandling != None and 'debug' in job.specialHandling:
         self.data['debug'] = 'True'
     # event service
     if EventServiceUtils.isEventServiceJob(job):
         self.data['eventService'] = 'True'
         # prod DBlock space token for pre-merging output
         self.data['prodDBlockTokenForOutput'] = strProdTokenForOutput[:-1]
     # event service merge
     if EventServiceUtils.isEventServiceMerge(job):
         self.data['eventServiceMerge'] = 'True'
         # write to file
         writeToFileStr = ''
         try:
             for outputName,inputList in job.metadata.iteritems():
                 writeToFileStr += 'inputFor_{0}:'.format(outputName)
                 for tmpInput in inputList:
                     writeToFileStr += '{0},'.format(tmpInput)
                 writeToFileStr = writeToFileStr[:-1]
                 writeToFileStr += '^'
             writeToFileStr = writeToFileStr[:-1]
         except:
             pass
         self.data['writeToFile'] = writeToFileStr
 def execute(self):
     try:
         self.logger.debug("start plugin : %s" % self.jobStatus)
         # backend
         self.ddmBackEnd = self.job.getDdmBackEnd()
          if self.ddmBackEnd == None:
              self.ddmBackEnd = 'rucio'
          # instantiate DQ2 with the resolved backend (always set at this point)
          self.dq2api = DQ2.DQ2(force_backend=self.ddmBackEnd)
         self.logger.debug("ddm backend = {0}".format(self.ddmBackEnd))
         # add files only to top-level datasets for transferring jobs
         if self.job.jobStatus == 'transferring':
             self.addToTopOnly = True
             self.logger.debug("adder for transferring")
         # use PandaDDM for ddm jobs                                                                                                                
         if self.job.prodSourceLabel == 'ddm':
             self.pandaDDM = True
         # check if the job goes to merging
         if self.job.produceUnMerge():
             self.goToMerging = True
          # check if the job should go to transferring
         tmpSrcDDM = self.siteMapper.getSite(self.job.computingSite).ddm
         tmpSrcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.computingSite).se)
         destSEwasSet = False
         brokenSched = False
         if self.job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(self.job.destinationSE):
             # DQ2 ID was set by using --destSE for analysis job to transfer output
             destSEwasSet = True
             tmpDstDDM = self.job.destinationSE
             tmpDstSEs = self.job.destinationSE
         else:
             tmpDstDDM = self.siteMapper.getSite(self.job.destinationSE).ddm
             tmpDstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.destinationSE).se)
             # protection against disappearance of dest from schedconfig
             if not self.siteMapper.checkSite(self.job.destinationSE) and self.job.destinationSE != 'local':
                 self.job.ddmErrorCode = ErrorCode.EC_Adder
                 self.job.ddmErrorDiag = "destinaitonSE %s is unknown in schedconfig" % self.job.destinationSE
                 self.logger.error("%s" % self.job.ddmErrorDiag)
                 # set fatal error code and return
                 self.result.setFatal()
                 return 
         # protection against disappearance of src from schedconfig        
         if not self.siteMapper.checkSite(self.job.computingSite):
             self.job.ddmErrorCode = ErrorCode.EC_Adder
             self.job.ddmErrorDiag = "computingSite %s is unknown in schedconfig" % self.job.computingSite
             self.logger.error("%s" % self.job.ddmErrorDiag)
             # set fatal error code and return
             self.result.setFatal()
             return
         self.logger.debug('DDM src:%s dst:%s' % (tmpSrcDDM,tmpDstDDM))
         self.logger.debug('SE src:%s dst:%s' % (tmpSrcSEs,tmpDstSEs))
         if re.search('^ANALY_',self.job.computingSite) != None:
             # analysis site
             pass
         elif self.job.computingSite == self.job.destinationSE:
             # same site ID for computingSite and destinationSE
             pass
         elif tmpSrcDDM == tmpDstDDM:
             # same DQ2ID for src/dest
             pass
         elif tmpSrcSEs == tmpDstSEs:
             # same SEs
             pass
         elif self.addToTopOnly:
             # already in transferring
             pass
         elif self.goToMerging:
             # no transferring for merging
             pass
         elif self.job.jobStatus == 'failed':
             # failed jobs
             if self.job.prodSourceLabel in ['managed','test']:
                 self.logTransferring = True
         elif self.job.jobStatus == 'finished' and EventServiceUtils.isEventServiceJob(self.job):
             # transfer only log file for ES jobs 
             self.logTransferring = True
         else:
             self.goToTransferring = True
         self.logger.debug('goToTransferring=%s' % self.goToTransferring)
         self.logger.debug('logTransferring=%s' % self.logTransferring)
         self.logger.debug('goToMerging=%s' % self.goToMerging)
         retOut = self._updateOutputs()
         self.logger.debug('added outputs with %s' % retOut)
         if retOut != 0:
             self.logger.debug('terminated when adding')
             return
         # remove unmerged
         if self.job.processingType == 'usermerge' and self.job.prodSourceLabel == 'user' and \
                self.jobStatus == 'finished' and self.job.ddmErrorDiag == 'NULL':
             retMerge = self._removeUnmerged()
             # failed
             if not retMerge:
                 self.logger.debug('terminated when removing unmerged')
                 return
         # succeeded    
         self.result.setSucceeded()    
         self.logger.debug("end plugin")
     except:
         type, value, traceBack = sys.exc_info()
         self.logger.debug(": %s %s" % (type,value))
         # set fatal error code
         self.result.setFatal()
     # return
     return
Example #6
 def parseXML(self):
     # get LFN and GUID
     self.logger.debug('XML filename : %s' % self.xmlFile)
     # no outputs
     if self.job.Files == []:
         self.logger.debug("has no outputs")
         self.logger.debug("parseXML end")
         return 0
     # get input files
     inputLFNs = []
     for file in self.job.Files:
         if file.type == 'input':
             inputLFNs.append(file.lfn)
     # parse XML
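      # the pilot may report the file catalog as XML or JSON; try XML first and fall back to JSON on failure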
     lfns = []
     guids = []
     fsizes = []
     md5sums = []
     chksums = []
     surls = []
     fullLfnMap = {}
     nEventsMap = {}
     try:
         root = xml.dom.minidom.parse(self.xmlFile)
         files = root.getElementsByTagName('File')
         for file in files:
             # get GUID
             guid = str(file.getAttribute('ID'))
             # get PFN and LFN nodes
             logical = file.getElementsByTagName('logical')[0]
             lfnNode = logical.getElementsByTagName('lfn')[0]
             # convert UTF8 to Raw
             lfn = str(lfnNode.getAttribute('name'))
             # get metadata
             fsize = None
             md5sum = None
             adler32 = None
             surl = None
             fullLFN = None
             for meta in file.getElementsByTagName('metadata'):
                  # get attribute name
                 name = str(meta.getAttribute('att_name'))
                 if name == 'fsize':
                     fsize = long(meta.getAttribute('att_value'))
                 elif name == 'md5sum':
                     md5sum = str(meta.getAttribute('att_value'))
                     # check
                     if re.search("^[a-fA-F0-9]{32}$", md5sum) == None:
                         md5sum = None
                 elif name == 'adler32':
                     adler32 = str(meta.getAttribute('att_value'))
                 elif name == 'surl':
                     surl = str(meta.getAttribute('att_value'))
                 elif name == 'full_lfn':
                     fullLFN = str(meta.getAttribute('att_value'))
             # endpoints
             self.extraInfo['endpoint'][lfn] = []
             for epNode in file.getElementsByTagName('endpoint'):
                 self.extraInfo['endpoint'][lfn].append(
                     str(epNode.firstChild.data))
             # error check
             if (not lfn in inputLFNs) and (fsize == None or
                                            (md5sum == None
                                             and adler32 == None)):
                 if EventServiceUtils.isEventServiceMerge(self.job):
                     continue
                 else:
                     raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
             # append
             lfns.append(lfn)
             guids.append(guid)
             fsizes.append(fsize)
             md5sums.append(md5sum)
             surls.append(surl)
             if adler32 != None:
                 # use adler32 if available
                 chksums.append("ad:%s" % adler32)
             else:
                 chksums.append("md5:%s" % md5sum)
             if fullLFN != None:
                 fullLfnMap[lfn] = fullLFN
     except:
         # parse json
         try:
             import json
             with open(self.xmlFile) as tmpF:
                 jsonDict = json.load(tmpF)
                 for lfn, fileData in jsonDict.iteritems():
                     lfn = str(lfn)
                     fsize = None
                     md5sum = None
                     adler32 = None
                     surl = None
                     fullLFN = None
                     guid = str(fileData['guid'])
                     if 'fsize' in fileData:
                         fsize = long(fileData['fsize'])
                     if 'md5sum' in fileData:
                         md5sum = str(fileData['md5sum'])
                         # check
                         if re.search("^[a-fA-F0-9]{32}$", md5sum) == None:
                             md5sum = None
                     if 'adler32' in fileData:
                         adler32 = str(fileData['adler32'])
                     if 'surl' in fileData:
                         surl = str(fileData['surl'])
                     if 'full_lfn' in fileData:
                         fullLFN = str(fileData['full_lfn'])
                     # endpoints
                     self.extraInfo['endpoint'][lfn] = []
                     if 'endpoint' in fileData:
                         self.extraInfo['endpoint'][lfn] = fileData[
                             'endpoint']
                     # error check
                     if (not lfn in inputLFNs) and (fsize == None or
                                                    (md5sum == None
                                                     and adler32 == None)):
                         if EventServiceUtils.isEventServiceMerge(self.job):
                             continue
                         else:
                             raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
                     # append
                     lfns.append(lfn)
                     guids.append(guid)
                     fsizes.append(fsize)
                     md5sums.append(md5sum)
                     surls.append(surl)
                     if adler32 != None:
                         # use adler32 if available
                         chksums.append("ad:%s" % adler32)
                     else:
                         chksums.append("md5:%s" % md5sum)
                     if fullLFN != None:
                         fullLfnMap[lfn] = fullLFN
         except:
             # check if file exists
             if os.path.exists(self.xmlFile):
                 type, value, traceBack = sys.exc_info()
                 self.logger.error(": %s %s" % (type, value))
                 # set failed anyway
                 self.job.jobStatus = 'failed'
                 # XML error happens when pilot got killed due to wall-time limit or failures in wrapper
                 if (self.job.pilotErrorCode in [0,'0','NULL']) and \
                    (self.job.transExitCode  in [0,'0','NULL']):
                     self.job.ddmErrorCode = ErrorCode.EC_Adder
                     self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
                 return 2
             else:
                 # XML was deleted
                 return 1
     # parse metadata to get nEvents
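      # job.metadata may be in XML or JSON format; both parsers are tried and failures are silently ignored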
     try:
         root = xml.dom.minidom.parseString(self.job.metadata)
         files = root.getElementsByTagName('File')
         for file in files:
             # get GUID
             guid = str(file.getAttribute('ID'))
             # get PFN and LFN nodes
             logical = file.getElementsByTagName('logical')[0]
             lfnNode = logical.getElementsByTagName('lfn')[0]
             # convert UTF8 to Raw
             lfn = str(lfnNode.getAttribute('name'))
             # get metadata
             nevents = None
             for meta in file.getElementsByTagName('metadata'):
                  # get attribute name
                 name = str(meta.getAttribute('att_name'))
                 if name == 'events':
                     nevents = long(meta.getAttribute('att_value'))
                     nEventsMap[lfn] = nevents
                     break
     except:
         pass
     self.logger.debug('nEventsMap=%s' % str(nEventsMap))
     # parse json
     try:
         import json
         jsonDict = json.loads(self.job.metadata)
         for jsonFileItem in jsonDict['files']['output']:
             for jsonSubFileItem in jsonFileItem['subFiles']:
                 lfn = str(jsonSubFileItem['name'])
                 try:
                     nevents = long(jsonSubFileItem['nentries'])
                     nEventsMap[lfn] = nevents
                 except:
                     pass
     except:
         pass
     self.logger.debug('nEventsMapJson=%s' % str(nEventsMap))
     # get lumi block number
     lumiBlockNr = self.job.getLumiBlockNr()
     # copy files for variable number of outputs
     tmpStat = self.copyFilesForVariableNumOutputs(lfns)
     if not tmpStat:
         self.logger.error(
             "failed to copy files for variable number of outputs")
         return 2
     # check files
     fileList = []
     for file in self.job.Files:
         fileList.append(file.lfn)
         if file.type == 'input':
             if file.lfn in lfns:
                 if self.job.prodSourceLabel in ['user', 'panda']:
                     # skipped file
                     file.status = 'skipped'
                 elif self.job.prodSourceLabel in [
                         'managed', 'test', 'rc_test', 'ptest'
                 ]:
                     # failed by pilot
                     file.status = 'failed'
         elif file.type == 'output' or file.type == 'log':
             # add only log file for failed jobs
             if self.jobStatus == 'failed' and file.type != 'log':
                 file.status = 'failed'
                 continue
             # set failed if it is missing in XML
             if not file.lfn in lfns:
                 if self.job.jobStatus == 'finished' and \
                         (EventServiceUtils.isEventServiceJob(self.job) or EventServiceUtils.isJumboJob(self.job)):
                     # unset file status for ES jobs
                     pass
                 elif file.isAllowedNoOutput():
                     # allowed not to be produced
                     file.status = 'nooutput'
                     self.logger.debug('set {0} to status={1}'.format(
                         file.lfn, file.status))
                 else:
                     file.status = 'failed'
                     self.job.jobStatus = 'failed'
                     self.job.ddmErrorCode = ErrorCode.EC_Adder
                     self.job.ddmErrorDiag = "expected output {0} is missing in pilot XML".format(
                         file.lfn)
                     self.logger.error(self.job.ddmErrorDiag)
                 continue
             # look for GUID with LFN
             try:
                 i = lfns.index(file.lfn)
                 file.GUID = guids[i]
                 file.fsize = fsizes[i]
                 file.md5sum = md5sums[i]
                 file.checksum = chksums[i]
                 surl = surls[i]
                 # status
                 file.status = 'ready'
                 # change to full LFN
                 if fullLfnMap.has_key(file.lfn):
                     file.lfn = fullLfnMap[file.lfn]
                 # add SURL to extraInfo
                 self.extraInfo['surl'][file.lfn] = surl
                 # add nevents
                 if nEventsMap.has_key(file.lfn):
                     self.extraInfo['nevents'][file.lfn] = nEventsMap[
                         file.lfn]
             except:
                 # status
                 file.status = 'failed'
                 type, value, traceBack = sys.exc_info()
                 self.logger.error(": %s %s" % (type, value))
             # set lumi block number
             if lumiBlockNr != None and file.status != 'failed':
                 self.extraInfo['lbnr'][file.lfn] = lumiBlockNr
     # check consistency between XML and filesTable
     for lfn in lfns:
         if not lfn in fileList:
             self.logger.error("%s is not found in filesTable" % lfn)
             self.job.jobStatus = 'failed'
             for tmpFile in self.job.Files:
                 tmpFile.status = 'failed'
             self.job.ddmErrorCode = ErrorCode.EC_Adder
             self.job.ddmErrorDiag = "pilot produced {0} inconsistently with jobdef".format(
                 lfn)
             return 2
     # return
     self.logger.debug("parseXML end")
     return 0
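A note on the block above: parseXML ends with a two-way consistency check. Every output expected in the job's filesTable must appear in the pilot XML (otherwise the file, and possibly the job, is marked failed), and every LFN reported in the XML must map back to the job definition (otherwise the whole job fails with EC_Adder). A minimal standalone sketch of that check, with illustrative names that are not part of the PanDA code:

def cross_check_lfns(job_lfns, xml_lfns):
    # outputs the job definition expects but the pilot never reported
    missing_in_xml = sorted(set(job_lfns) - set(xml_lfns))
    # files the pilot reported that are absent from the job definition
    unknown_in_xml = sorted(set(xml_lfns) - set(job_lfns))
    return missing_in_xml, unknown_in_xml

# e.g. cross_check_lfns(['a.root', 'log.tgz'], ['a.root', 'b.root'])
# -> (['log.tgz'], ['b.root'])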
Example #7
0
 def appendJob(self, job, siteMapperCache=None):
     # event service merge
     if EventServiceUtils.isEventServiceMerge(job):
         isEventServiceMerge = True
     else:
         isEventServiceMerge = False
     # PandaID
     self.data['PandaID'] = job.PandaID
     # prodSourceLabel
     self.data['prodSourceLabel'] = job.prodSourceLabel
     # swRelease
     self.data['swRelease'] = job.AtlasRelease
     # homepackage
     self.data['homepackage'] = job.homepackage
     # transformation
     self.data['transformation'] = job.transformation
     # job name
     self.data['jobName'] = job.jobName
     # job definition ID
     self.data['jobDefinitionID'] = job.jobDefinitionID
     # cloud
     self.data['cloud'] = job.cloud
     # files
     strIFiles = ''
     strOFiles = ''
     strDispatch = ''
     strDisToken = ''
     strDisTokenForOutput = ''
     strDestination = ''
     strRealDataset = ''
     strRealDatasetIn = ''
     strProdDBlock = ''
     strDestToken = ''
     strProdToken = ''
     strProdTokenForOutput = ''
     strGUID = ''
     strFSize = ''
     strCheckSum = ''
     strFileDestinationSE = ''
     strScopeIn = ''
     strScopeOut = ''
     strScopeLog = ''
     logFile = ''
     logGUID = ''
     ddmEndPointIn = []
     ddmEndPointOut = []
     noOutput = []
     siteSpec = None
     inDsLfnMap = {}
     if siteMapperCache != None:
         siteMapper = siteMapperCache.getObj()
         siteSpec = siteMapper.getSite(job.computingSite)
          # resolve destSE
         try:
             job.destinationSE = siteMapper.resolveNucleus(
                 job.destinationSE)
             for tmpFile in job.Files:
                 tmpFile.destinationSE = siteMapper.resolveNucleus(
                     tmpFile.destinationSE)
         except:
             pass
         siteMapperCache.releaseObj()
     for file in job.Files:
         if file.type == 'input':
             if strIFiles != '':
                 strIFiles += ','
             strIFiles += file.lfn
             if strDispatch != '':
                 strDispatch += ','
             strDispatch += file.dispatchDBlock
             if strDisToken != '':
                 strDisToken += ','
             strDisToken += file.dispatchDBlockToken
             strProdDBlock += '%s,' % file.prodDBlock
             if not isEventServiceMerge:
                 strProdToken += '%s,' % file.prodDBlockToken
             else:
                 strProdToken += '%s,' % job.metadata[1][file.lfn]
             if strGUID != '':
                 strGUID += ','
             strGUID += file.GUID
             strRealDatasetIn += '%s,' % file.dataset
             strFSize += '%s,' % file.fsize
             if not file.checksum in ['', 'NULL', None]:
                 strCheckSum += '%s,' % file.checksum
             else:
                 strCheckSum += '%s,' % file.md5sum
             strScopeIn += '%s,' % file.scope
             ddmEndPointIn.append(
                 self.getDdmEndpoint(siteSpec, file.dispatchDBlockToken))
             if not file.dataset in inDsLfnMap:
                 inDsLfnMap[file.dataset] = []
             inDsLfnMap[file.dataset].append(file.lfn)
         if file.type == 'output' or file.type == 'log':
             if strOFiles != '':
                 strOFiles += ','
             strOFiles += file.lfn
             if strDestination != '':
                 strDestination += ','
             strDestination += file.destinationDBlock
             if strRealDataset != '':
                 strRealDataset += ','
             strRealDataset += file.dataset
             strFileDestinationSE += '%s,' % file.destinationSE
             if file.type == 'log':
                 logFile = file.lfn
                 logGUID = file.GUID
                 strScopeLog = file.scope
             else:
                 strScopeOut += '%s,' % file.scope
             if strDestToken != '':
                 strDestToken += ','
             strDestToken += re.sub(
                 '^ddd:', 'dst:',
                 file.destinationDBlockToken.split(',')[0])
             strDisTokenForOutput += '%s,' % file.dispatchDBlockToken
             strProdTokenForOutput += '%s,' % file.prodDBlockToken
             ddmEndPointOut.append(
                 self.getDdmEndpoint(
                     siteSpec,
                     file.destinationDBlockToken.split(',')[0]))
             if file.isAllowedNoOutput():
                 noOutput.append(file.lfn)
     # inFiles
     self.data['inFiles'] = strIFiles
     # dispatch DBlock
     self.data['dispatchDblock'] = strDispatch
     # dispatch DBlock space token
     self.data['dispatchDBlockToken'] = strDisToken
     # dispatch DBlock space token for output
     self.data['dispatchDBlockTokenForOut'] = strDisTokenForOutput[:-1]
     # outFiles
     self.data['outFiles'] = strOFiles
     # destination DBlock
     self.data['destinationDblock'] = strDestination
     # destination DBlock space token
     self.data['destinationDBlockToken'] = strDestToken
     # prod DBlocks
     self.data['prodDBlocks'] = strProdDBlock[:-1]
     # prod DBlock space token
     self.data['prodDBlockToken'] = strProdToken[:-1]
     # real output datasets
     self.data['realDatasets'] = strRealDataset
      # real input datasets
     self.data['realDatasetsIn'] = strRealDatasetIn[:-1]
     # file's destinationSE
     self.data['fileDestinationSE'] = strFileDestinationSE[:-1]
     # log filename
     self.data['logFile'] = logFile
     # log GUID
     self.data['logGUID'] = logGUID
     # jobPars
     self.data['jobPars'] = job.jobParameters
     # attempt number
     self.data['attemptNr'] = job.attemptNr
     # GUIDs
     self.data['GUID'] = strGUID
     # checksum
     self.data['checksum'] = strCheckSum[:-1]
     # fsize
     self.data['fsize'] = strFSize[:-1]
     # scope
     self.data['scopeIn'] = strScopeIn[:-1]
     self.data['scopeOut'] = strScopeOut[:-1]
     self.data['scopeLog'] = strScopeLog
     # DDM endpoints
     self.data['ddmEndPointIn'] = ','.join(ddmEndPointIn)
     self.data['ddmEndPointOut'] = ','.join(ddmEndPointOut)
     # destinationSE
     self.data['destinationSE'] = job.destinationSE
     # user ID
     self.data['prodUserID'] = job.prodUserID
     # CPU count
     self.data['maxCpuCount'] = job.maxCpuCount
     # RAM count
     self.data['minRamCount'] = job.minRamCount
     # disk count
     self.data['maxDiskCount'] = job.maxDiskCount
     # cmtconfig
     self.data['cmtConfig'] = job.cmtConfig
     # processingType
     self.data['processingType'] = job.processingType
     # transferType
     self.data['transferType'] = job.transferType
     # sourceSite
     self.data['sourceSite'] = job.sourceSite
     # current priority
     self.data['currentPriority'] = job.currentPriority
     # taskID
     if job.lockedby == 'jedi':
         self.data['taskID'] = job.jediTaskID
     else:
         self.data['taskID'] = job.taskID
     # core count
     self.data['coreCount'] = job.coreCount
     # jobsetID
     self.data['jobsetID'] = job.jobsetID
     # debug mode
     if job.specialHandling != None and 'debug' in job.specialHandling:
         self.data['debug'] = 'True'
     # event service or job cloning
     if EventServiceUtils.isJobCloningJob(job):
         self.data['cloneJob'] = EventServiceUtils.getJobCloningType(job)
     elif EventServiceUtils.isEventServiceJob(
             job) or EventServiceUtils.isJumboJob(job):
         self.data['eventService'] = 'True'
         # prod DBlock space token for pre-merging output
         self.data['prodDBlockTokenForOutput'] = strProdTokenForOutput[:-1]
     # event service merge
     if isEventServiceMerge:
         self.data['eventServiceMerge'] = 'True'
         # write to file for ES merge
         writeToFileStr = ''
         try:
             for outputName, inputList in job.metadata[0].iteritems():
                 writeToFileStr += 'inputFor_{0}:'.format(outputName)
                 for tmpInput in inputList:
                     writeToFileStr += '{0},'.format(tmpInput)
                 writeToFileStr = writeToFileStr[:-1]
                 writeToFileStr += '^'
             writeToFileStr = writeToFileStr[:-1]
         except:
             pass
         self.data['writeToFile'] = writeToFileStr
     elif job.writeInputToFile():
         try:
             # write input to file
             writeToFileStr = ''
             for inDS, inputList in inDsLfnMap.iteritems():
                 inDS = re.sub('/$', '', inDS)
                 inDS = inDS.split(':')[-1]
                 writeToFileStr += 'tmpin_{0}:'.format(inDS)
                 writeToFileStr += ','.join(inputList)
                 writeToFileStr += '^'
             writeToFileStr = writeToFileStr[:-1]
             self.data['writeToFile'] = writeToFileStr
         except:
             pass
     # no output
     if noOutput != []:
         self.data['allowNoOutput'] = ','.join(noOutput)
     # alternative stage-out
     if job.getAltStgOut() != None:
         self.data['altStageOut'] = job.getAltStgOut()
     # log to OS
     if job.putLogToOS():
         self.data['putLogToOS'] = 'True'
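The appendJob method above flattens the job's file metadata into the comma-separated strings the pilot expects, and for event-service merge jobs or jobs with writeInputToFile it additionally packs a writeToFile payload: '^'-separated records of the form 'tmpin_<dataset>:lfn,lfn' (or 'inputFor_<output>:...' for ES merging). A short sketch of the same encoding using join() instead of the trailing-comma trimming above; the function name and the sample dataset are illustrative only:

def build_write_to_file(in_ds_lfn_map):
    records = []
    for in_ds, lfns in sorted(in_ds_lfn_map.items()):
        ds = in_ds.rstrip('/').split(':')[-1]  # drop trailing slash and scope prefix
        records.append('tmpin_{0}:{1}'.format(ds, ','.join(lfns)))
    return '^'.join(records)

# e.g. build_write_to_file({'mc16:mc16.ds1/': ['f1.root', 'f2.root']})
# -> 'tmpin_mc16.ds1:f1.root,f2.root'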
Example #8
0
 def _updateOutputs(self):
     # return if non-DQ2
     if self.pandaDDM or self.job.destinationSE == 'local':
         return 0
     # check files
     idMap = {}
     fileList = []
     subMap = {}        
     dsDestMap = {}
     for file in self.job.Files:
         if file.type == 'output' or file.type == 'log':
             # append to fileList
             fileList.append(file.lfn)
             # add only log file for failed jobs
             if self.jobStatus == 'failed' and file.type != 'log':
                 continue
             # add only log file for successful ES jobs
             if self.job.jobStatus == 'finished' and EventServiceUtils.isEventServiceJob(self.job) and file.type != 'log':
                 continue
             try:
                 # fsize
                 fsize = None
                 if not file.fsize in ['NULL','',0]:
                     try:
                         fsize = long(file.fsize)
                     except:
                         type, value, traceBack = sys.exc_info()
                         self.logger.error("%s : %s %s" % (self.jobID,type,value))
                 # append to map
                 if not idMap.has_key(file.destinationDBlock):
                     idMap[file.destinationDBlock] = []
                 fileAttrs = {'guid'     : file.GUID,
                              'lfn'      : file.lfn,
                              'size'     : fsize,
                              'checksum' : file.checksum}
                 # add SURLs if LFC registration is required
                 if self.useCentralLFC():
                     fileAttrs['surl'] = self.extraInfo['surl'][file.lfn]
                     if fileAttrs['surl'] == None:
                         raise TypeError,"{0} has SURL=None".format(file.lfn)
                     # get destination
                     if not dsDestMap.has_key(file.destinationDBlock):
                         if file.destinationDBlockToken in ['',None,'NULL']:
                             tmpDestList = [self.siteMapper.getSite(self.job.computingSite).ddm]
                         elif DataServiceUtils.getDestinationSE(file.destinationDBlockToken) != None and \
                                 self.siteMapper.getSite(self.job.computingSite).ddm == self.siteMapper.getSite(file.destinationSE).ddm:
                             tmpDestList = [DataServiceUtils.getDestinationSE(file.destinationDBlockToken)]
                         elif self.siteMapper.getSite(self.job.computingSite).cloud != self.job.cloud and \
                                 (not self.siteMapper.getSite(self.job.computingSite).ddm.endswith('PRODDISK')) and  \
                                 (not self.job.prodSourceLabel in ['user','panda']):
                             # T1 used as T2
                             tmpDestList = [self.siteMapper.getSite(self.job.computingSite).ddm]
                         else:
                             tmpDestList = []
                             tmpSeTokens = self.siteMapper.getSite(self.job.computingSite).setokens
                             for tmpDestToken in file.destinationDBlockToken.split(','):
                                 if tmpSeTokens.has_key(tmpDestToken):
                                     tmpDest = tmpSeTokens[tmpDestToken]
                                 else:
                                     tmpDest = self.siteMapper.getSite(self.job.computingSite).ddm
                                 if not tmpDest in tmpDestList:
                                     tmpDestList.append(tmpDest)
                         dsDestMap[file.destinationDBlock] = tmpDestList
                 # extra meta data
                 if self.ddmBackEnd == 'rucio':
                     if file.lfn in self.extraInfo['lbnr']:
                         fileAttrs['lumiblocknr'] = self.extraInfo['lbnr'][file.lfn]
                     if file.lfn in self.extraInfo['nevents']:
                         fileAttrs['events'] = self.extraInfo['nevents'][file.lfn]
                     elif self.extraInfo['nevents'] != {}:
                         fileAttrs['events'] = None
                     #if not file.jediTaskID in [0,None,'NULL']:
                     #    fileAttrs['task_id'] = file.jediTaskID
                     #fileAttrs['panda_id'] = file.PandaID
                 idMap[file.destinationDBlock].append(fileAttrs)
                 # for subscription
                 if self.job.prodSourceLabel in ['managed','test','software','rc_test','ptest','user','rucio_test'] and \
                        re.search('_sub\d+$',file.destinationDBlock) != None and (not self.addToTopOnly) and \
                        self.job.destinationSE != 'local':
                     if self.siteMapper == None:
                         self.logger.error("SiteMapper==None")
                     else:
                         # get dataset spec
                         if not self.datasetMap.has_key(file.destinationDBlock):
                             tmpDS = self.taskBuffer.queryDatasetWithMap({'name':file.destinationDBlock})
                             self.datasetMap[file.destinationDBlock] = tmpDS
                         # check if valid dataset        
                         if self.datasetMap[file.destinationDBlock] == None:
                             self.logger.error(": cannot find %s in DB" % file.destinationDBlock)
                         else:
                             if not self.datasetMap[file.destinationDBlock].status in ['defined']:
                                 # not a fresh dataset
                                 self.logger.debug(": subscription was already made for %s:%s" % \
                                               (self.datasetMap[file.destinationDBlock].status,
                                                file.destinationDBlock))
                             else:
                                 # get DQ2 IDs
                                 tmpSrcDDM = self.siteMapper.getSite(self.job.computingSite).ddm
                                 tmpSrcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.computingSite).se)
                                 if self.job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(file.destinationSE):
                                     # DQ2 ID was set by using --destSE for analysis job to transfer output
                                     tmpDstDDM = file.destinationSE
                                     tmpDstSEs = file.destinationSE
                                 else:
                                     if DataServiceUtils.getDestinationSE(file.destinationDBlockToken) != None:
                                         tmpDstDDM = DataServiceUtils.getDestinationSE(file.destinationDBlockToken)
                                     else:
                                         tmpDstDDM = self.siteMapper.getSite(file.destinationSE).ddm
                                     tmpDstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(file.destinationSE).se)
                                 # if src != dest or multi-token
                                 if (tmpSrcDDM != tmpDstDDM and tmpSrcSEs != tmpDstSEs) or \
                                    (tmpSrcDDM == tmpDstDDM and file.destinationDBlockToken.count(',') != 0):
                                     optSub = {'DATASET_COMPLETE_EVENT' : ['http://%s:%s/server/panda/datasetCompleted' % \
                                                                           (panda_config.pserverhosthttp,panda_config.pserverporthttp)]}
                                     # append
                                     if not subMap.has_key(file.destinationDBlock):
                                         subMap[file.destinationDBlock] = []
                                         # sources
                                         optSource = {}
                                         # set sources
                                         if file.destinationDBlockToken in ['NULL','']:
                                             # use default DQ2 ID as source
                                             optSource[tmpSrcDDM] = {'policy' : 0}
                                         else:
                                             # convert token to DQ2 ID
                                             dq2ID = tmpSrcDDM
                                             # use the first token's location as source for T1D1
                                             tmpSrcToken = file.destinationDBlockToken.split(',')[0]
                                             if self.siteMapper.getSite(self.job.computingSite).setokens.has_key(tmpSrcToken):
                                                 dq2ID = self.siteMapper.getSite(self.job.computingSite).setokens[tmpSrcToken]
                                             optSource[dq2ID] = {'policy' : 0}
                                         # T1 used as T2
                                         if self.siteMapper.getSite(self.job.computingSite).cloud != self.job.cloud and \
                                            (not tmpSrcDDM.endswith('PRODDISK')) and  \
                                            (not self.job.prodSourceLabel in ['user','panda']):
                                             # register both DATADISK and PRODDISK as source locations
                                             if self.siteMapper.getSite(self.job.computingSite).setokens.has_key('ATLASPRODDISK'):
                                                 dq2ID = self.siteMapper.getSite(self.job.computingSite).setokens['ATLASPRODDISK']
                                                 optSource[dq2ID] = {'policy' : 0}
                                             if not optSource.has_key(tmpSrcDDM):
                                                 optSource[tmpSrcDDM] = {'policy' : 0}
                                         # use another location when token is set
                                         if not file.destinationDBlockToken in ['NULL','']:
                                             tmpDQ2IDList = []
                                             tmpDstTokens = file.destinationDBlockToken.split(',')
                                             # remove the first one because it is already used as a location
                                             if tmpSrcDDM == tmpDstDDM:
                                                 tmpDstTokens = tmpDstTokens[1:]
                                             # loop over all tokens
                                             for idxToken,tmpDstToken in enumerate(tmpDstTokens):
                                                 dq2ID = tmpDstDDM
                                                 if self.siteMapper.getSite(file.destinationSE).setokens.has_key(tmpDstToken):
                                                     dq2ID = self.siteMapper.getSite(file.destinationSE).setokens[tmpDstToken]
                                                  # keep the first destination for multi-hop
                                                 if idxToken == 0:
                                                     firstDestDDM = dq2ID
                                                 else:
                                                      # use the first destination as source for T1D1
                                                     optSource = {}                                                        
                                                     optSource[firstDestDDM] = {'policy' : 0}
                                                 # remove looping subscription
                                                 if dq2ID == tmpSrcDDM:
                                                     continue
                                                  # avoid duplication
                                                  if not dq2ID in tmpDQ2IDList:
                                                      tmpDQ2IDList.append(dq2ID)
                                                      subMap[file.destinationDBlock].append((dq2ID,optSub,optSource))
                                         else:
                                             # use default DDM
                                             for dq2ID in tmpDstDDM.split(','):
                                                 subMap[file.destinationDBlock].append((dq2ID,optSub,optSource))
             except:
                 errStr = '%s %s' % sys.exc_info()[:2]
                 self.logger.error(errStr)
                 self.result.setFatal()
                 self.job.ddmErrorDiag = 'failed before adding files : ' + errStr
                 return 1
     # cleanup submap
     tmpKeys = subMap.keys()
     for tmpKey in tmpKeys:
         if subMap[tmpKey] == []:
             del subMap[tmpKey]
     # add data to original dataset
     for destinationDBlock in idMap.keys():
         origDBlock = None
         match = re.search('^(.+)_sub\d+$',destinationDBlock)
         if match != None:
             # add files to top-level datasets
             origDBlock = match.group(1)
             if not self.goToTransferring:
                 idMap[origDBlock] = idMap[destinationDBlock]
         # add files to top-level datasets only 
         if self.addToTopOnly or self.goToMerging:
             del idMap[destinationDBlock]
         # skip sub unless getting transferred
         if origDBlock != None:
             if not self.goToTransferring and not self.logTransferring \
                    and idMap.has_key(destinationDBlock):
                 del idMap[destinationDBlock]
     # print idMap
     self.logger.debug("idMap = %s" % idMap)
     self.logger.debug("subMap = %s" % subMap)
     self.logger.debug("dsDestMap = %s" % dsDestMap)
     self.logger.debug("extraInfo = %s" % str(self.extraInfo))
     # check consistency of destinationDBlock
     hasSub = False
     for destinationDBlock in idMap.keys():
         match = re.search('^(.+)_sub\d+$',destinationDBlock)
         if match != None:
             hasSub = True
             break
     if idMap != {} and self.goToTransferring and not hasSub:
         errStr = 'no sub datasets for transferring. destinationDBlock may be wrong'
         self.logger.error(errStr)
         self.result.setFatal()
         self.job.ddmErrorDiag = 'failed before adding files : ' + errStr
         return 1
     # add data
     self.logger.debug("addFiles start")
     # count the number of files
     regNumFiles = 0
     regFileList = []
     for tmpRegDS,tmpRegList in idMap.iteritems():
         for tmpRegItem in tmpRegList:
             if not tmpRegItem['lfn'] in regFileList:
                 regNumFiles += 1
                 regFileList.append(tmpRegItem['lfn'])
     # decompose idMap
     if not self.useCentralLFC():
         destIdMap = {None:idMap}
     else:
         destIdMap = self.decomposeIdMap(idMap,dsDestMap)          
     # add files
     nTry = 3
     for iTry in range(nTry):
         isFatal  = False
         isFailed = False
         regStart = datetime.datetime.utcnow()
         try:
             if not self.useCentralLFC():
                 regMsgStr = "DQ2 registraion for %s files " % regNumFiles                    
             else:
                 regMsgStr = "LFC+DQ2 registraion with backend={0} for {1} files ".format(self.ddmBackEnd,
                                                                                          regNumFiles)
             self.logger.debug('%s %s' % ('registerFilesInDatasets',str(destIdMap)))
             out = rucioAPI.registerFilesInDataset(destIdMap)
         except (DQ2.DQClosedDatasetException,
                 DQ2.DQFrozenDatasetException,
                 DQ2.DQUnknownDatasetException,
                 DQ2.DQDatasetExistsException,
                 DQ2.DQFileMetaDataMismatchException,
                 FileCatalogUnknownFactory,
                 FileCatalogException,
                 DataIdentifierNotFound,
                 RucioFileCatalogException,
                 FileConsistencyMismatch,
                 UnsupportedOperation,
                 exceptions.KeyError):
             # fatal errors
             errType,errValue = sys.exc_info()[:2]
             out = '%s : %s' % (errType,errValue)
             isFatal = True
             isFailed = True
         except:
             # unknown errors
             errType,errValue = sys.exc_info()[:2]
             out = '%s : %s' % (errType,errValue)
             isFatal = False
             isFailed = True                
         regTime = datetime.datetime.utcnow() - regStart
         self.logger.debug(regMsgStr + \
                               'took %s.%03d sec' % (regTime.seconds,regTime.microseconds/1000))
         # failed
         if isFailed or isFatal:
             self.logger.error('%s' % out)
             if (iTry+1) == nTry or isFatal:
                 self.job.ddmErrorCode = ErrorCode.EC_Adder
                 # extract important error string
                 extractedErrStr = DataServiceUtils.extractImportantError(out)
                 errMsg = "Could not add files to DDM: "
                 if extractedErrStr == '':
                     self.job.ddmErrorDiag = errMsg + out.split('\n')[-1]
                 else:
                     self.job.ddmErrorDiag = errMsg + extractedErrStr
                 if isFatal:
                     self.result.setFatal()
                 else:
                     self.result.setTemporary()
                 return 1
             self.logger.error("Try:%s" % iTry)
             # sleep
             time.sleep(10)                    
         else:
             self.logger.debug('%s' % str(out))
             break
     # register dataset subscription
     subActivity = 'Production Output'
     if not self.job.prodSourceLabel in ['user']:
         for tmpName,tmpVal in subMap.iteritems():
             for dq2ID,optSub,optSource in tmpVal:
                 if not self.goToMerging:
                     # make DQ2 subscription for prod jobs
                     self.logger.debug("%s %s %s" % ('registerDatasetSubscription',
                                                     (tmpName,dq2ID),
                                                     {'version':0,'archived':0,'callbacks':optSub,
                                                      'sources':optSource,'sources_policy':(001000 | 010000),
                                                      'wait_for_sources':0,'destination':None,'query_more_sources':0,
                                                      'sshare':"production",'group':None,'activity':subActivity,
                                                      'acl_alias':None,'replica_lifetime':"14 days"}))
                     for iDDMTry in range(3):
                         out = 'OK'
                         isFailed = False                        
                         try:
                             self.dq2api.registerDatasetSubscription(tmpName,dq2ID,version=0,archived=0,callbacks=optSub,
                                                                     sources=optSource,sources_policy=(001000 | 010000),
                                                                     wait_for_sources=0,destination=None,query_more_sources=0,
                                                                     sshare="production",group=None,activity=subActivity,
                                                                     acl_alias=None,replica_lifetime="14 days")
                         except DQ2.DQSubscriptionExistsException:
                             # harmless error
                             errType,errValue = sys.exc_info()[:2]
                             out = '%s : %s' % (errType,errValue)
                         except:
                             # unknown errors
                             errType,errValue = sys.exc_info()[:2]
                             out = '%s : %s' % (errType,errValue)
                             isFailed = True
                             if 'is not a Tiers of Atlas Destination' in str(errValue) or \
                                     'is not in Tiers of Atlas' in str(errValue) or \
                                     'RSE Expression resulted in an empty set' in str(errValue) or \
                                     'RSE excluded due to write blacklisting' in str(errValue) or \
                                     'used/quota' in str(errValue):
                                 # fatal error
                                 self.job.ddmErrorCode = ErrorCode.EC_Subscription
                             else:
                                 # retry for temporary errors
                                 time.sleep(10)
                         else:
                             break
                     if isFailed:
                         self.logger.error('%s' % out)
                         # extract important error string
                         extractedErrStr = DataServiceUtils.extractImportantError(out)
                         if self.job.ddmErrorCode == ErrorCode.EC_Subscription:
                             # fatal error
                             if extractedErrStr == '':
                                 self.job.ddmErrorDiag = "subscription failure with %s" % out
                             else:
                                 self.logger.error(extractedErrStr)
                                 self.job.ddmErrorDiag = "subscription failure with %s" % extractedErrStr
                             self.result.setFatal()
                         else:
                              # temporary errors
                             self.job.ddmErrorCode = ErrorCode.EC_Adder                
                             self.job.ddmErrorDiag = "could not register subscription : %s" % tmpName
                             self.result.setTemporary()
                         return 1
                     self.logger.debug('%s' % str(out))
                 else:
                     # register location
                     tmpDsNameLoc = re.sub('_sub\d+$','',tmpName)
                     for tmpLocName in optSource.keys():
                         self.logger.debug("%s %s %s %s" % ('registerDatasetLocation',tmpDsNameLoc,tmpLocName,
                                                            {'lifetime':"14 days"}))
                         for iDDMTry in range(3):
                             out = 'OK'
                             isFailed = False                        
                             try:                        
                                 self.dq2api.registerDatasetLocation(tmpDsNameLoc,tmpLocName,lifetime="14 days")
                             except DQ2.DQLocationExistsException:
                                 # harmless error
                                 errType,errValue = sys.exc_info()[:2]
                                 out = '%s : %s' % (errType,errValue)
                             except:
                                 # unknown errors
                                 errType,errValue = sys.exc_info()[:2]
                                 out = '%s : %s' % (errType,errValue)
                                 isFailed = True
                                 # retry for temporary errors
                                 time.sleep(10)
                             else:
                                 break
                         if isFailed:
                             self.logger.error('%s' % out)
                             if self.job.ddmErrorCode == ErrorCode.EC_Location:
                                 # fatal error
                                 self.job.ddmErrorDiag = "location registration failure with %s" % out
                                 self.result.setFatal()
                             else:
                                  # temporary errors
                                 self.job.ddmErrorCode = ErrorCode.EC_Adder                
                                 self.job.ddmErrorDiag = "could not register location : %s" % tmpDsNameLoc
                                 self.result.setTemporary()
                             return 1
                         self.logger.debug('%s' % str(out))
                 # set dataset status
                 self.datasetMap[tmpName].status = 'running'
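Both the file registration and the subscription calls in _updateOutputs follow the same retry pattern: up to three attempts, an immediate abort on exception types known to be fatal, and a short sleep before retrying anything else. A condensed, self-contained sketch of that pattern; the helper name is mine, not a PanDA or DQ2 API:

import time

def call_with_retries(action, fatal_exceptions, n_try=3, pause=10):
    for i_try in range(n_try):
        try:
            return True, action()
        except fatal_exceptions as exc:
            # fatal: retrying cannot help, report immediately
            return False, 'fatal: %s' % exc
        except Exception as exc:
            # temporary: give up on the last attempt, otherwise back off
            if i_try + 1 == n_try:
                return False, 'gave up after %s tries: %s' % (n_try, exc)
            time.sleep(pause)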
Example #9
0
    def run(self):
        try:
            self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus,self.attemptNr))
            # lock XML
            self.lockXML = open(self.xmlFile)
            try:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB)
            except:
                self.logger.debug("cannot get lock : %s" % self.xmlFile)
                self.lockXML.close()
                # remove XML just in case for the final attempt
                if not self.ignoreTmpError:
                    try:
                        # remove Catalog
                        os.remove(self.xmlFile)
                    except:
                        pass
                return
            # check if file exists
            if not os.path.exists(self.xmlFile):
                self.logger.debug("not exist : %s" % self.xmlFile)
                try:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                    self.lockXML.close()
                except:
                    pass
                return
            # query job
            self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
            # check if job has finished
            if self.job == None:
                self.logger.debug(': job not found in DB')
            elif self.job.jobStatus in ['finished','failed','unknown','merging']:
                self.logger.error(': invalid state -> %s' % self.job.jobStatus)
            elif self.attemptNr != None and self.job.attemptNr != self.attemptNr:
                self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr,self.attemptNr))
            elif self.attemptNr is not None and self.job.jobStatus == 'transferring':
                errMsg = 'XML with attemptNr for {0}'.format(self.job.jobStatus)
                self.logger.error(errMsg)
                # FIXME
                raise RuntimeError, errMsg
            elif self.jobStatus == EventServiceUtils.esRegStatus:
                # instantiate concrete plugin
                adderPluginClass = self.getPluginClass(self.job.VO)
                adderPlugin = adderPluginClass(self.job,
                                               taskBuffer=self.taskBuffer,
                                               siteMapper=self.siteMapper,
                                               logger=self.logger)
                # execute
                self.logger.debug('plugin is ready for ES file registration')
                adderPlugin.registerEventServiceFiles()
            else:
                # check file status in JEDI
                if not self.job.isCancelled() and not self.job.taskBufferErrorCode in [taskbuffer.ErrorCode.EC_PilotRetried]:
                    fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job)
                    self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI))                
                    if fileCheckInJEDI == None:
                        raise RuntimeError,'failed to check file status in JEDI'
                    if fileCheckInJEDI == False:
                        # set job status to failed since some file status is wrong in JEDI 
                        self.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        errStr = "inconsistent file status between Panda and JEDI. "
                        errStr += "failed to avoid duplicated processing caused by synchronization failure"
                        self.job.ddmErrorDiag = errStr
                        self.logger.debug("set jobStatus={0} since input is inconsistent between Panda and JEDI".format(self.jobStatus))
                    elif self.job.jobSubStatus in ['pilot_closed']:
                        # terminated by the pilot
                        self.logger.debug("going to closed since terminated by the pilot")
                        retClosed = self.taskBuffer.killJobs([self.jobID],'pilot','60',True)
                        if retClosed[0] == True:
                            self.logger.debug("end")
                            try:
                                # remove Catalog
                                os.remove(self.xmlFile)
                            except:
                                pass
                            # unlock XML
                            if self.lockXML != None:
                                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                                self.lockXML.close()
                            return
                    # check for cloned jobs
                    if EventServiceUtils.isJobCloningJob(self.job):
                        checkJC = self.taskBuffer.checkClonedJob(self.job)
                        if checkJC == None:
                            raise RuntimeError,'failed to check the cloned job'
                        # failed to lock semaphore
                        if checkJC['lock'] == False:
                            self.jobStatus = 'failed'
                            self.job.ddmErrorCode = ErrorCode.EC_Adder
                            self.job.ddmErrorDiag = "failed to lock semaphore for job cloning"
                            self.logger.debug("set jobStatus={0} since did not get semaphore for job cloning".format(self.jobStatus))
                # use failed for cancelled/closed jobs
                if self.job.isCancelled():
                    self.jobStatus = 'failed'
                    # reset error codes to skip retrial module
                    self.job.pilotErrorCode = 0
                    self.job.exeErrorCode = 0
                    self.job.ddmErrorCode = 0
                # keep old status
                oldJobStatus = self.job.jobStatus
                # set job status
                if not self.job.jobStatus in ['transferring']:
                    self.job.jobStatus = self.jobStatus
                addResult = None
                adderPlugin = None
                # parse XML
                parseResult = self.parseXML()
                if parseResult < 2:
                    # interaction with DDM
                    try:
                        # instantiate concrete plugin
                        adderPluginClass = self.getPluginClass(self.job.VO)
                        adderPlugin = adderPluginClass(self.job,
                                                       taskBuffer=self.taskBuffer,
                                                       siteMapper=self.siteMapper,
                                                       extraInfo=self.extraInfo,
                                                       logger=self.logger)
                        # execute
                        self.logger.debug('plugin is ready')
                        adderPlugin.execute()
                        addResult = adderPlugin.result
                        self.logger.debug('plugin done with %s' % (addResult.statusCode))
                    except:
                        errtype,errvalue = sys.exc_info()[:2]
                        self.logger.error("failed to execute AdderPlugin for VO={0} with {1}:{2}".format(self.job.VO,
                                                                                                         errtype,
                                                                                                         errvalue)) 
                        addResult = None
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "AdderPlugin failure"
                        
                    # ignore temporary errors
                    if self.ignoreTmpError and addResult != None and addResult.isTemporary():
                        self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag)
                        self.logger.debug('escape')
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type,value))
                            self.logger.debug("cannot unlock XML")
                        return
                    # failed
                    if addResult == None or not addResult.isSucceeded():
                        self.job.jobStatus = 'failed'
                # set file status for failed jobs or failed transferring jobs
                self.logger.debug("status after plugin call :job.jobStatus=%s jobStatus=%s" % (self.job.jobStatus, self.jobStatus))
                if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                    # if the job failed, first take actions according to the error table
                    source, error_code, error_diag = None, None, None
                    if self.job.pilotErrorCode:
                        source = 'pilotErrorCode'
                        error_code = self.job.pilotErrorCode
                        error_diag = self.job.pilotErrorDiag
                    elif self.job.exeErrorCode:
                        source = 'exeErrorCode'
                        error_code = self.job.exeErrorCode
                        error_diag = self.job.exeErrorDiag
                    elif self.job.ddmErrorCode:
                        source = 'ddmErrorCode'
                        error_code = self.job.ddmErrorCode
                        error_diag = self.job.ddmErrorDiag
                    elif self.job.transExitCode:
                        source = 'transExitCode'
                        error_code = self.job.transExitCode
                        error_diag = ''
            
                    # _logger.info("updatejob has source %s, error_code %s and error_diag %s"%(source, error_code, error_diag))
                    
                    if source and error_code:
                        try:
                            self.logger.debug("AdderGen.run will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(self.taskBuffer, self.job.PandaID, source, error_code, error_diag, self.job.attemptNr)
                            self.logger.debug("apply_retrial_rules is back")
                        except Exception as e:
                            self.logger.error("apply_retrial_rules excepted and needs to be investigated (%s): %s"%(e, traceback.format_exc()))
                    
                    self.job.jobStatus = 'failed'
                    for file in self.job.Files:
                        if file.type in ['output','log']:
                            if addResult != None and file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                            else:
                                file.status = 'failed'
                else:
                    # reset errors
                    self.job.jobDispatcherErrorCode = 0
                    self.job.jobDispatcherErrorDiag = 'NULL'
                    # set status
                    if addResult != None and addResult.mergingFiles != []:
                        # set status for merging
                        for file in self.job.Files:
                            if file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                        self.job.jobStatus = 'merging'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                    elif addResult != None and addResult.transferringFiles != []:
                        # set status for transferring
                        for file in self.job.Files:
                            if file.lfn in addResult.transferringFiles:
                                file.status = 'transferring'
                        self.job.jobStatus = 'transferring'
                        self.job.jobSubStatus = None
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                    else:
                        self.job.jobStatus = 'finished'
                # endtime
                if self.job.endTime=='NULL':
                    self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                # output size and # of outputs
                self.job.nOutputDataFiles = 0
                self.job.outputFileBytes = 0
                for tmpFile in self.job.Files:
                    if tmpFile.type == 'output':
                        self.job.nOutputDataFiles += 1
                        try:
                            self.job.outputFileBytes += tmpFile.fsize
                        except:
                            pass
                # protection
                maxOutputFileBytes = 99999999999
                if self.job.outputFileBytes > maxOutputFileBytes:
                    self.job.outputFileBytes = maxOutputFileBytes
                # set cancelled state
                if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                    self.job.jobStatus = 'cancelled'
                # update job
                if oldJobStatus in ['cancelled','closed']:
                    pass
                else:
                    self.logger.debug("updating DB")
                    retU = self.taskBuffer.updateJobs([self.job],False,oldJobStatusList=[oldJobStatus],
                                                      extraInfo=self.extraInfo)
                    self.logger.debug("retU: %s" % retU)
                    # failed
                    if not retU[0]:
                        self.logger.error('failed to update DB for pandaid={0}'.format(self.job.PandaID))
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()                            
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type,value))
                            self.logger.debug("cannot unlock XML")
                        return

                    try:
                        # updateJobs succeeded but may have failed the job with a taskBufferErrorCode
                        self.logger.debug("AdderGen.run will peek the job")
                        job_tmp = self.taskBuffer.peekJobs([self.job.PandaID], fromDefined=False, fromArchived=True,
                                                           fromWaiting=False)[0]
                        self.logger.debug("status {0}, taskBufferErrorCode {1}, taskBufferErrorDiag {2}".format(job_tmp.jobStatus,
                                                                                                                job_tmp.taskBufferErrorCode,
                                                                                                                job_tmp.taskBufferErrorDiag))
                        if job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode:
                            source = 'taskBufferErrorCode'
                            error_code = job_tmp.taskBufferErrorCode
                            error_diag = job_tmp.taskBufferErrorDiag
                            self.logger.debug("AdderGen.run 2 will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(self.taskBuffer, job_tmp.PandaID, source, error_code,
                                                            error_diag, job_tmp.attemptNr)
                            self.logger.debug("apply_retrial_rules 2 is back")
                    except IndexError:
                        pass
                    except Exception as e:
                        self.logger.error("apply_retrial_rules 2 excepted and needs to be investigated (%s): %s" % (e, traceback.format_exc()))

                    # setup for closer
                    if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.isCancelled()):
                        destDBList = []
                        guidList = []
                        for file in self.job.Files:
                            # ignore inputs
                            if file.type == 'input':
                                continue
                            # skip pseudo datasets
                            if file.destinationDBlock in ['',None,'NULL']:
                                continue
                            # start closer for output/log datasets
                            if not file.destinationDBlock in destDBList:
                                destDBList.append(file.destinationDBlock)
                            # collect GUIDs
                            if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['rucio_test'] + JobUtils.list_ptest_prod_sources and \
                                                                      self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \
                                                                      and file.type == 'output':
                                # extract base LFN since LFN was changed to full LFN for CMS
                                baseLFN = file.lfn.split('/')[-1]
                                guidList.append({'lfn':baseLFN,'guid':file.GUID,'type':file.type,
                                                 'checksum':file.checksum,'md5sum':file.md5sum,
                                                 'fsize':file.fsize,'scope':file.scope})
                        if guidList != []:
                            retG = self.taskBuffer.setGUIDs(guidList)
                        if destDBList != []:
                            # start Closer
                            if adderPlugin != None and hasattr(adderPlugin,'datasetMap') and adderPlugin.datasetMap != {}:
                                cThr = Closer.Closer(self.taskBuffer,destDBList,self.job,datasetMap=adderPlugin.datasetMap)
                            else:
                                cThr = Closer.Closer(self.taskBuffer,destDBList,self.job)
                            self.logger.debug("start Closer")
                            cThr.start()
                            cThr.join()
                            self.logger.debug("end Closer")
                        # run closer for associated parallel jobs
                        if EventServiceUtils.isJobCloningJob(self.job):
                            assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(self.job.jediTaskID,self.job.PandaID,
                                                                                            destDBList)
                            for assJobID,assDBlocks in assDBlockMap.iteritems():
                                assJob = self.taskBuffer.peekJobs([assJobID],fromDefined=False,
                                                                  fromArchived=False,
                                                                  fromWaiting=False,
                                                                  forAnal=True)[0]
                                if assJob is None:
                                    self.logger.debug(': associated job PandaID={0} not found in DB'.format(assJobID))
                                else:
                                    cThr = Closer.Closer(self.taskBuffer,assDBlocks,assJob)
                                    self.logger.debug("start Closer for PandaID={0}".format(assJobID))
                                    cThr.start()
                                    cThr.join()
                                    self.logger.debug("end Closer for PandaID={0}".format(assJobID))
            self.logger.debug("end")
            try:
                # remove Catalog
                os.remove(self.xmlFile)
            except:
                pass
            # unlock XML
            if self.lockXML != None:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                self.lockXML.close()            
        except:
            type, value, traceBack = sys.exc_info()
            errStr = ": %s %s " % (type,value)
            errStr += traceback.format_exc()
            self.logger.error(errStr)
            self.logger.error("except")
            # unlock XML just in case
            try:
                if self.lockXML != None:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
            except:
                type, value, traceBack = sys.exc_info()
                self.logger.error(": %s %s" % (type,value))
                self.logger.error("cannot unlock XML")
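
The fragment above closes with a pattern every example in this file repeats: gather the unique destinationDBlock values of the job's non-input files, then run a single Closer per job to finalize those datasets. A minimal sketch of that collection step, assuming only the JobSpec file attributes used above (collect_dest_blocks is a hypothetical helper, not PanDA API):

    def collect_dest_blocks(job):
        # unique destination datasets over output/log files,
        # skipping pseudo datasets exactly as the examples do
        dest_blocks = []
        for f in job.Files:
            if f.type == 'input':
                continue
            if f.destinationDBlock in ['', None, 'NULL']:
                continue
            if f.destinationDBlock not in dest_blocks:
                dest_blocks.append(f.destinationDBlock)
        return dest_blocks

The result feeds Closer.Closer(taskBuffer, destDBList, job), which the examples start and join synchronously, so dataset closing completes before the XML catalog is removed.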
Example #10
 def parseXML(self):
     # get LFN and GUID
     self.logger.debug('XML filename : %s' % self.xmlFile)
     # no outputs
     if self.job.Files == []:
         self.logger.debug("has no outputs")
         self.logger.debug("parseXML end")
         return 0
     # get input files
     inputLFNs = []
     for file in self.job.Files:
         if file.type == 'input':
             inputLFNs.append(file.lfn)
     # parse XML
     lfns    = []
     guids   = []
     fsizes  = []
     md5sums = []
     chksums = []
     surls   = []
     fullLfnMap = {}
     nEventsMap = {}
     guidMap = dict()
     try:
         root  = xml.dom.minidom.parse(self.xmlFile)
         files = root.getElementsByTagName('File')
         for file in files:
             # get GUID
             guid = str(file.getAttribute('ID'))
             # get PFN and LFN nodes
             logical  = file.getElementsByTagName('logical')[0]
             lfnNode  = logical.getElementsByTagName('lfn')[0]
             # convert UTF8 to Raw
             lfn = str(lfnNode.getAttribute('name'))
             # get metadata
             fsize   = None
             md5sum  = None
             adler32 = None
             surl    = None
             fullLFN = None
             for meta in file.getElementsByTagName('metadata'):
                 # get fsize
                 name = str(meta.getAttribute('att_name'))
                 if name == 'fsize':
                     fsize = long(meta.getAttribute('att_value'))
                 elif name == 'md5sum':
                     md5sum = str(meta.getAttribute('att_value'))
                     # check
                     if re.search("^[a-fA-F0-9]{32}$",md5sum) == None:
                         md5sum = None
                 elif name == 'adler32':
                     adler32 = str(meta.getAttribute('att_value'))
                 elif name == 'surl':
                     surl = str(meta.getAttribute('att_value'))
                 elif name == 'full_lfn':
                     fullLFN = str(meta.getAttribute('att_value'))
             # endpoints
             self.extraInfo['endpoint'][lfn] = []
             for epNode in file.getElementsByTagName('endpoint'):
                 self.extraInfo['endpoint'][lfn].append(str(epNode.firstChild.data))
             # error check
             if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)):
                 if EventServiceUtils.isEventServiceMerge(self.job):
                     continue
                 else:
                     raise RuntimeError('fsize/md5sum/adler32/surl=None')
             # append
             lfns.append(lfn)
             guids.append(guid)
             fsizes.append(fsize)
             md5sums.append(md5sum)
             surls.append(surl)
             if adler32 != None:
                 # use adler32 if available
                 chksums.append("ad:%s" % adler32)
             else:
                 chksums.append("md5:%s" % md5sum)
             if fullLFN != None:
                 fullLfnMap[lfn] = fullLFN
     except:
         # parse json
         try:
             import json
             with open(self.xmlFile) as tmpF:
                 jsonDict = json.load(tmpF)
                 for lfn, fileData in jsonDict.iteritems():
                     lfn = str(lfn)
                     fsize   = None
                     md5sum  = None
                     adler32 = None
                     surl    = None
                     fullLFN = None
                     guid = str(fileData['guid'])
                     if 'fsize' in fileData:
                         fsize = long(fileData['fsize'])
                     if 'md5sum' in fileData:
                         md5sum = str(fileData['md5sum'])
                         # check
                         if re.search("^[a-fA-F0-9]{32}$",md5sum) == None:
                             md5sum = None
                     if 'adler32' in fileData:
                         adler32 = str(fileData['adler32'])
                     if 'surl' in fileData:
                         surl = str(fileData['surl'])
                     if 'full_lfn' in fileData:
                         fullLFN = str(fileData['full_lfn'])
                     # endpoints
                     self.extraInfo['endpoint'][lfn] = []
                     if 'endpoint' in fileData:
                         self.extraInfo['endpoint'][lfn] = fileData['endpoint']
                     # error check
                     if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)):
                         if EventServiceUtils.isEventServiceMerge(self.job):
                             continue
                         else:
                             raise RuntimeError('fsize/md5sum/adler32/surl=None')
                     # append
                     lfns.append(lfn)
                     guids.append(guid)
                     fsizes.append(fsize)
                     md5sums.append(md5sum)
                     surls.append(surl)
                     if adler32 != None:
                         # use adler32 if available
                         chksums.append("ad:%s" % adler32)
                     else:
                         chksums.append("md5:%s" % md5sum)
                     if fullLFN != None:
                         fullLfnMap[lfn] = fullLFN
         except:
             # check if file exists
             if os.path.exists(self.xmlFile):
                 type, value, traceBack = sys.exc_info()
                 self.logger.error(": %s %s" % (type,value))
                 # set failed anyway
                 self.job.jobStatus = 'failed'
                 # XML error happens when pilot got killed due to wall-time limit or failures in wrapper
                 if (self.job.pilotErrorCode in [0,'0','NULL']) and \
                    (self.job.taskBufferErrorCode not in [taskbuffer.ErrorCode.EC_WorkerDone]) and \
                    (self.job.transExitCode  in [0,'0','NULL']):
                     self.job.ddmErrorCode = ErrorCode.EC_Adder
                     self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
                 return 2
             else:
                 # XML was deleted
                 return 1
     # parse metadata to get nEvents
     try:
         root  = xml.dom.minidom.parseString(self.job.metadata)
         files = root.getElementsByTagName('File')
         for file in files:
             # get GUID
             guid = str(file.getAttribute('ID'))
             # get PFN and LFN nodes
             logical  = file.getElementsByTagName('logical')[0]
             lfnNode  = logical.getElementsByTagName('lfn')[0]
             # convert UTF8 to Raw
             lfn = str(lfnNode.getAttribute('name'))
             guidMap[lfn] = guid
             # get metadata
             nevents = None
             for meta in file.getElementsByTagName('metadata'):
                 # get fsize
                 name = str(meta.getAttribute('att_name'))
                 if name == 'events':
                     nevents = long(meta.getAttribute('att_value'))
                     nEventsMap[lfn] = nevents
                     break
     except:
         pass
     # parse json
     try:
         import json
         jsonDict = json.loads(self.job.metadata)
         for jsonFileItem in jsonDict['files']['output']:
             for jsonSubFileItem in jsonFileItem['subFiles']:
                 lfn = str(jsonSubFileItem['name'])
                 try:
                     nevents = long(jsonSubFileItem['nentries'])
                     nEventsMap[lfn] = nevents
                 except:
                     pass
                 try:
                     guid = str(jsonSubFileItem['file_guid'])
                     guidMap[lfn] = guid
                 except:
                     pass
     except:
         pass
     self.logger.debug('nEventsMap=%s' % str(nEventsMap))
     self.logger.debug('guidMap=%s' % str(guidMap))
     # get lumi block number
     lumiBlockNr = self.job.getLumiBlockNr()
     # copy files for variable number of outputs
     tmpStat = self.copyFilesForVariableNumOutputs(lfns)
     if not tmpStat:
         self.logger.error("failed to copy files for variable number of outputs")
         return 2
     # check files
     fileList = []
     for file in self.job.Files:
         fileList.append(file.lfn)
         if file.type == 'input':
             if file.lfn in lfns:
                 if self.job.prodSourceLabel in ['user','panda']:
                     # skipped file
                     file.status = 'skipped'
                 elif self.job.prodSourceLabel in ['managed','test'] + JobUtils.list_ptest_prod_sources:
                     # failed by pilot
                     file.status = 'failed'
         elif file.type == 'output' or file.type == 'log':
             # add only log file for failed jobs
             if self.jobStatus == 'failed' and file.type != 'log':
                 file.status = 'failed'
                 continue
             # set failed if it is missing in XML
             if not file.lfn in lfns:
                 if self.job.jobStatus == 'finished' and \
                         (EventServiceUtils.isEventServiceJob(self.job) or EventServiceUtils.isJumboJob(self.job)):
                     # unset file status for ES jobs
                     pass
                 elif file.isAllowedNoOutput():
                     # allowed not to be produced
                     file.status = 'nooutput'
                     self.logger.debug('set {0} to status={1}'.format(file.lfn,file.status))
                 else:
                     file.status = 'failed'
                     self.job.jobStatus = 'failed'
                     self.job.ddmErrorCode = ErrorCode.EC_Adder
                     self.job.ddmErrorDiag = "expected output {0} is missing in pilot XML".format(file.lfn)
                     self.logger.error(self.job.ddmErrorDiag)
                 continue
             # look for GUID with LFN
             try:
                 i = lfns.index(file.lfn)
                 file.GUID   = guids[i]
                 file.fsize  = fsizes[i]
                 file.md5sum = md5sums[i]
                 file.checksum = chksums[i]
                 surl = surls[i]
                 # status
                 file.status = 'ready'
                 # change to full LFN
                 if fullLfnMap.has_key(file.lfn):
                     file.lfn = fullLfnMap[file.lfn]
                 # add SURL to extraInfo
                 self.extraInfo['surl'][file.lfn] = surl
                 # add nevents 
                 if nEventsMap.has_key(file.lfn):
                     self.extraInfo['nevents'][file.lfn] = nEventsMap[file.lfn]
             except:
                 # status
                 file.status = 'failed'
                 type, value, traceBack = sys.exc_info()
                 self.logger.error(": %s %s" % (type,value))
             # set lumi block number
             if lumiBlockNr != None and file.status != 'failed':
                 self.extraInfo['lbnr'][file.lfn] = lumiBlockNr 
     self.extraInfo['guid'] = guidMap
     # check consistency between XML and filesTable
     for lfn in lfns:
         if not lfn in fileList:
             self.logger.error("%s is not found in filesTable" % lfn)
             self.job.jobStatus = 'failed'
             for tmpFile in self.job.Files:
                 tmpFile.status = 'failed'
             self.job.ddmErrorCode = ErrorCode.EC_Adder
             self.job.ddmErrorDiag = "pilot produced {0} inconsistently with jobdef".format(lfn)
             return 2
     # return
     self.logger.debug("parseXML end")
     return 0
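
parseXML above consumes the pilot's PoolFileCatalog-style XML: each File element carries the GUID in its ID attribute, the LFN under logical/lfn, and fsize/md5sum/adler32/surl/full_lfn as metadata name/value pairs. A minimal, hypothetical catalog and the same minidom calls the method relies on (the GUID, LFN, and values are placeholders):

    import xml.dom.minidom

    sample = """<?xml version="1.0"?>
    <POOLFILECATALOG>
      <File ID="00000000-0000-0000-0000-000000000000">
        <logical><lfn name="EVNT.01234._000001.pool.root"/></logical>
        <metadata att_name="fsize" att_value="1048576"/>
        <metadata att_name="adler32" att_value="0a1b2c3d"/>
      </File>
    </POOLFILECATALOG>"""

    root = xml.dom.minidom.parseString(sample)
    for node in root.getElementsByTagName('File'):
        # same traversal as parseXML: GUID from the ID attribute,
        # LFN from logical/lfn, metadata as att_name/att_value pairs
        guid = str(node.getAttribute('ID'))
        logical = node.getElementsByTagName('logical')[0]
        lfn = str(logical.getElementsByTagName('lfn')[0].getAttribute('name'))
        meta = dict((str(m.getAttribute('att_name')), str(m.getAttribute('att_value')))
                    for m in node.getElementsByTagName('metadata'))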
Example #11
 def run(self):
     try:
         self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus,self.attemptNr))
         # lock XML
         self.lockXML = open(self.xmlFile)
         try:
             fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB)
         except:
             self.logger.debug("cannot get lock : %s" % self.xmlFile)
             self.lockXML.close()
             # remove XML just in case for the final attempt
             if not self.ignoreTmpError:
                 try:
                     # remove Catalog
                     os.remove(self.xmlFile)
                 except:
                     pass
             return
         # check if file exists
         if not os.path.exists(self.xmlFile):
             self.logger.debug("not exist : %s" % self.xmlFile)
             try:
                 fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                 self.lockXML.close()
             except:
                 pass
             return
         # query job
         self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False,
                                             fromArchived=False,
                                             fromWaiting=False,
                                             forAnal=True)[0]
         # check if job has finished
         if self.job == None:
             self.logger.debug(': job not found in DB')
         elif self.job.jobStatus in ['finished','failed','unknown','cancelled','merging']:
             self.logger.error(': invalid state -> %s' % self.job.jobStatus)
         elif self.attemptNr != None and self.job.attemptNr != self.attemptNr:
             self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr,self.attemptNr))
         else:
             # check file status in JEDI
             fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job)
             self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI))                
             if fileCheckInJEDI == None:
                  raise RuntimeError('failed to check file status in JEDI')
             if fileCheckInJEDI == False:
                 # set job status to failed since some file status is wrong in JEDI 
                 self.jobStatus = 'failed'
                 self.job.ddmErrorCode = ErrorCode.EC_Adder
                 self.job.ddmErrorDiag = "wrong file status in JEDI"
                 self.logger.debug("set jobStatus={0} since input are already cancelled in JEDI".format(self.jobStatus))
             # keep old status
             oldJobStatus = self.job.jobStatus
             # set job status
             if not self.job.jobStatus in ['transferring']:
                 self.job.jobStatus = self.jobStatus
             addResult = None
             adderPlugin = None
             # parse XML
             parseResult = self.parseXML()
             if parseResult < 2:
                  # interaction with DDM
                 try:
                     # set VO=local for DDM free
                     if self.job.destinationSE == 'local':
                         tmpVO = 'local'
                     else:
                         tmpVO = self.job.VO
                     # instantiate concrete plugin
                     adderPluginClass = panda_config.getPlugin('adder_plugins',tmpVO)
                     if adderPluginClass == None:
                         # use ATLAS plugin by default
                         from AdderAtlasPlugin import AdderAtlasPlugin
                         adderPluginClass = AdderAtlasPlugin
                     self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__))
                     adderPlugin = adderPluginClass(self.job,
                                                    taskBuffer=self.taskBuffer,
                                                    siteMapper=self.siteMapper,
                                                    extraInfo=self.extraInfo,
                                                    logger=self.logger)
                     # execute
                     self.logger.debug('plugin is ready')
                     adderPlugin.execute()
                     addResult = adderPlugin.result
                     self.logger.debug('plugin done with %s' % (addResult.statusCode))
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     self.logger.error("failed to execute AdderPlugin for VO={0} with {1}:{2}".format(tmpVO,
                                                                                                      errtype,
                                                                                                      errvalue)) 
                     addResult = None
                     self.job.ddmErrorCode = ErrorCode.EC_Adder
                     self.job.ddmErrorDiag = "AdderPlugin failure"
                 # ignore temporary errors
                 if self.ignoreTmpError and addResult != None and addResult.isTemporary():
                     self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag)
                     self.logger.debug('escape')
                     # unlock XML
                     try:
                         fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                         self.lockXML.close()
                     except:
                         type, value, traceBack = sys.exc_info()
                         self.logger.debug(": %s %s" % (type,value))
                         self.logger.debug("cannot unlock XML")
                     return
                 # failed
                 if addResult == None or not addResult.isSucceeded():
                     self.job.jobStatus = 'failed'
             # set file status for failed jobs or failed transferring jobs
             if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                 self.job.jobStatus = 'failed'
                 for file in self.job.Files:
                     if file.type in ['output','log']:
                         if addResult != None and file.lfn in addResult.mergingFiles:
                             file.status = 'merging'
                         else:
                             file.status = 'failed'
             else:
                 # reset errors
                 self.job.jobDispatcherErrorCode = 0
                 self.job.jobDispatcherErrorDiag = 'NULL'
                 # set status
                 if addResult != None and addResult.mergingFiles != []:
                     # set status for merging:                        
                     for file in self.job.Files:
                         if file.lfn in addResult.mergingFiles:
                             file.status = 'merging'
                     self.job.jobStatus = 'merging'
                     # propagate transition to prodDB
                     self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                 elif addResult != None and addResult.transferringFiles != []:
                     # set status for transferring
                     for file in self.job.Files:
                         if file.lfn in addResult.transferringFiles:
                             file.status = 'transferring'
                     self.job.jobStatus = 'transferring'
                     # propagate transition to prodDB
                     self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                 else:
                     self.job.jobStatus = 'finished'
             # endtime
             if self.job.endTime=='NULL':
                 self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
             # output size and # of outputs
             self.job.nOutputDataFiles = 0
             self.job.outputFileBytes = 0
             for tmpFile in self.job.Files:
                 if tmpFile.type == 'output':
                     self.job.nOutputDataFiles += 1
                     try:
                         self.job.outputFileBytes += tmpFile.fsize
                     except:
                         pass
             # protection
             maxOutputFileBytes = 99999999999
             if self.job.outputFileBytes > maxOutputFileBytes:
                 self.job.outputFileBytes = maxOutputFileBytes
             # set cancelled state
             if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                 self.job.jobStatus = 'cancelled'
             # update job
             self.logger.debug("updating DB")
             retU = self.taskBuffer.updateJobs([self.job],False,oldJobStatusList=[oldJobStatus],
                                               extraInfo=self.extraInfo)
             self.logger.debug("retU: %s" % retU)
             # failed
             if not retU[0]:
                 self.logger.error('failed to update DB')
                 # unlock XML
                 try:
                     fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                     self.lockXML.close()                            
                 except:
                     type, value, traceBack = sys.exc_info()
                     self.logger.debug(": %s %s" % (type,value))
                     self.logger.debug("cannot unlock XML")
                 return
             # setup for closer
             if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.jobStatus == 'cancelled'):
                 destDBList = []
                 guidList = []
                 for file in self.job.Files:
                     # ignore inputs
                     if file.type == 'input':
                         continue
                     # skip pseudo datasets
                     if file.destinationDBlock in ['',None,'NULL']:
                         continue
                     # start closer for output/log datasets
                     if not file.destinationDBlock in destDBList:
                         destDBList.append(file.destinationDBlock)
                     # collect GUIDs
                     if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['ptest','rc_test','rucio_test'] and \
                                                               self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \
                                                               and file.type == 'output':
                         # extract base LFN since LFN was changed to full LFN for CMS
                         baseLFN = file.lfn.split('/')[-1]
                         guidList.append({'lfn':baseLFN,'guid':file.GUID,'type':file.type,
                                          'checksum':file.checksum,'md5sum':file.md5sum,
                                          'fsize':file.fsize,'scope':file.scope})
                 if guidList != []:
                     retG = self.taskBuffer.setGUIDs(guidList)
                 if destDBList != []:
                     # start Closer
                     if adderPlugin != None and hasattr(adderPlugin,'datasetMap') and adderPlugin.datasetMap != {}:
                         cThr = Closer.Closer(self.taskBuffer,destDBList,self.job,datasetMap=adderPlugin.datasetMap)
                     else:
                         cThr = Closer.Closer(self.taskBuffer,destDBList,self.job)
                     self.logger.debug("start Closer")
                     cThr.start()
                     cThr.join()
                     self.logger.debug("end Closer")
         self.logger.debug("end")
         try:
             # remove Catalog
             os.remove(self.xmlFile)
         except:
             pass
         # unlock XML
         if self.lockXML != None:
             fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
             self.lockXML.close()            
     except:
         type, value, traceBack = sys.exc_info()
         self.logger.debug(": %s %s" % (type,value))
         self.logger.debug("except")
         # unlock XML just in case
         try:
             if self.lockXML != None:
                 fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
         except:
             type, value, traceBack = sys.exc_info()
             self.logger.debug(": %s %s" % (type,value))
             self.logger.debug("cannot unlock XML")
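
Each run() above serializes work on a catalog file with an advisory, non-blocking lock: LOCK_EX|LOCK_NB either succeeds immediately or raises, so a concurrent worker skips the file instead of blocking, and LOCK_UN plus close() releases it. A self-contained sketch of the pattern (the caller must keep the returned file object alive while holding the lock):

    import fcntl

    def try_lock(path):
        # returns an open file object holding an exclusive lock,
        # or None if another process already holds it
        f = open(path)
        try:
            fcntl.flock(f.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
        except IOError:
            f.close()
            return None
        return f

    def unlock(f):
        fcntl.flock(f.fileno(), fcntl.LOCK_UN)
        f.close()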
Example #12
 def parseXML(self):
     # get LFN and GUID
     self.logger.debug('XML filename : %s' % self.xmlFile)
     # no outputs
     if self.job.Files == []:
         self.logger.debug("has no outputs")
         self.logger.debug("parseXML end")
         return 0
     # get input files
     inputLFNs = []
     for file in self.job.Files:
         if file.type == 'input':
             inputLFNs.append(file.lfn)
     # parse XML
     lfns    = []
     guids   = []
     fsizes  = []
     md5sums = []
     chksums = []
     surls   = []
     fullLfnMap = {}
     nEventsMap = {}
     try:
         root  = xml.dom.minidom.parse(self.xmlFile)
         files = root.getElementsByTagName('File')
         for file in files:
             # get GUID
             guid = str(file.getAttribute('ID'))
             # get PFN and LFN nodes
             logical  = file.getElementsByTagName('logical')[0]
             lfnNode  = logical.getElementsByTagName('lfn')[0]
             # convert UTF8 to Raw
             lfn = str(lfnNode.getAttribute('name'))
             # get metadata
             fsize   = None
             md5sum  = None
             adler32 = None
             surl    = None
             fullLFN = None
             for meta in file.getElementsByTagName('metadata'):
                 # get fsize
                 name = str(meta.getAttribute('att_name'))
                 if name == 'fsize':
                     fsize = long(meta.getAttribute('att_value'))
                 elif name == 'md5sum':
                     md5sum = str(meta.getAttribute('att_value'))
                     # check
                     if re.search("^[a-fA-F0-9]{32}$",md5sum) == None:
                         md5sum = None
                 elif name == 'adler32':
                     adler32 = str(meta.getAttribute('att_value'))
                 elif name == 'surl':
                     surl = str(meta.getAttribute('att_value'))
                 elif name == 'full_lfn':
                     fullLFN = str(meta.getAttribute('att_value'))
             # error check
             if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)):
                 if EventServiceUtils.isEventServiceMerge(self.job):
                     continue
                 else:
                     raise RuntimeError('fsize/md5sum/adler32/surl=None')
             # append
             lfns.append(lfn)
             guids.append(guid)
             fsizes.append(fsize)
             md5sums.append(md5sum)
             surls.append(surl)
             if adler32 != None:
                 # use adler32 if available
                 chksums.append("ad:%s" % adler32)
             else:
                 chksums.append("md5:%s" % md5sum)
             if fullLFN != None:
                 fullLfnMap[lfn] = fullLFN
     except:
         # check if file exists
         if os.path.exists(self.xmlFile):
             type, value, traceBack = sys.exc_info()
             self.logger.error(": %s %s" % (type,value))
             # set failed anyway
             self.job.jobStatus = 'failed'
             # XML error happens when pilot got killed due to wall-time limit or failures in wrapper
             if (self.job.pilotErrorCode in [0,'0','NULL']) and \
                (self.job.transExitCode  in [0,'0','NULL']):
                 self.job.ddmErrorCode = ErrorCode.EC_Adder
                 self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
             return 2
         else:
             # XML was deleted
             return 1
     # parse metadata to get nEvents
     try:
         root  = xml.dom.minidom.parseString(self.job.metadata)
         files = root.getElementsByTagName('File')
         for file in files:
             # get GUID
             guid = str(file.getAttribute('ID'))
             # get PFN and LFN nodes
             logical  = file.getElementsByTagName('logical')[0]
             lfnNode  = logical.getElementsByTagName('lfn')[0]
             # convert UTF8 to Raw
             lfn = str(lfnNode.getAttribute('name'))
             # get metadata
             nevents = None
             for meta in file.getElementsByTagName('metadata'):
                 # get fsize
                 name = str(meta.getAttribute('att_name'))
                 if name == 'events':
                     nevents = long(meta.getAttribute('att_value'))
                     nEventsMap[lfn] = nevents
                     break
     except:
         pass
     self.logger.debug('nEventsMap=%s' % str(nEventsMap))
     # get lumi block number
     lumiBlockNr = self.job.getLumiBlockNr()
     # check files
     fileList = []
     for file in self.job.Files:
         fileList.append(file.lfn)
         if file.type == 'input':
             if file.lfn in lfns:
                 if self.job.prodSourceLabel in ['user','panda']:
                     # skipped file
                     file.status = 'skipped'
                 elif self.job.prodSourceLabel in ['managed','test','rc_test','ptest']:
                     # failed by pilot
                     file.status = 'failed'
         elif file.type == 'output' or file.type == 'log':
             # add only log file for failed jobs
             if self.jobStatus == 'failed' and file.type != 'log':
                 file.status = 'failed'
                 continue
             # set failed if it is missing in XML
             if not file.lfn in lfns:
                 if self.job.jobStatus == 'finished' and EventServiceUtils.isEventServiceJob(self.job):
                     # unset file status for ES jobs
                     pass
                 else:
                     file.status = 'failed'
                 continue
             # look for GUID with LFN
             try:
                 i = lfns.index(file.lfn)
                 file.GUID   = guids[i]
                 file.fsize  = fsizes[i]
                 file.md5sum = md5sums[i]
                 file.checksum = chksums[i]
                 surl = surls[i]
                 # status
                 file.status = 'ready'
                 # change to full LFN
                 if fullLfnMap.has_key(file.lfn):
                     file.lfn = fullLfnMap[file.lfn]
                 # add SURL to extraInfo
                 self.extraInfo['surl'][file.lfn] = surl
                 # add nevents 
                 if nEventsMap.has_key(file.lfn):
                     self.extraInfo['nevents'][file.lfn] = nEventsMap[file.lfn]
             except:
                 # status
                 file.status = 'failed'
                 type, value, traceBack = sys.exc_info()
                 self.logger.error(": %s %s" % (type,value))
             # set lumi block number
             if lumiBlockNr != None and file.status != 'failed':
                 self.extraInfo['lbnr'][file.lfn] = lumiBlockNr 
     # check consistency between XML and filesTable
     for lfn in lfns:
         if not lfn in fileList:
             self.logger.error("%s is not found in filesTable" % lfn)
             self.job.jobStatus = 'failed'
             for tmpFile in self.job.Files:
                 tmpFile.status = 'failed'
             self.job.ddmErrorCode = ErrorCode.EC_Adder
             self.job.ddmErrorDiag = "pilot XML is inconsistent with filesTable"
             return 2
     # return
     self.logger.debug("parseXML end")
     return 0
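
Both parseXML variants normalize checksums the same way: prefer adler32 and store it as "ad:<value>", otherwise fall back to an md5sum that must match a 32-hex-digit pattern. A small sketch of that rule, plus one way to compute a matching adler32 string for a local file (assumption: the pilot produces an equivalent eight-hex-digit value):

    import re
    import zlib

    def normalize_checksum(md5sum, adler32):
        # mirror the preference order used in parseXML
        if adler32 is not None:
            return "ad:%s" % adler32
        if md5sum is not None and re.search("^[a-fA-F0-9]{32}$", md5sum) is not None:
            return "md5:%s" % md5sum
        return None

    def adler32_of(path):
        # stream the file in 1 MB chunks; mask to keep the value unsigned
        value = 1
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(1 << 20), b''):
                value = zlib.adler32(chunk, value)
        return "%08x" % (value & 0xffffffff)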
Example #13
    def run(self):
        try:
            while True:
                _logger.debug('%s start' % self.pandaID)
                # query job
                job = self.taskBuffer.peekJobs([self.pandaID],fromDefined=False,
                                               fromArchived=False,fromWaiting=False)[0]
                # check job status
                if job == None:
                    _logger.debug('%s escape : not found' % self.pandaID)
                    return
                _logger.debug('%s in %s' % (self.pandaID, job.jobStatus))
                if not job.jobStatus in ['running','sent','starting','holding',
                                         'stagein','stageout']:
                    if job.jobStatus == 'transferring' and (job.prodSourceLabel in ['user','panda'] or job.jobSubStatus not in [None, 'NULL', '']):
                        pass
                    else:
                        _logger.debug('%s escape : %s' % (self.pandaID,job.jobStatus))
                        return
                # time limit
                timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=self.sleepTime)
                if job.modificationTime < timeLimit or (job.endTime != 'NULL' and job.endTime < timeLimit):
                    _logger.debug('%s %s lastmod:%s endtime:%s' % (job.PandaID,job.jobStatus,
                                                                   str(job.modificationTime),
                                                                   str(job.endTime)))
                    destDBList = []
                    if job.jobStatus == 'sent':
                        # sent job didn't receive reply from pilot within 30 min
                        job.jobDispatcherErrorCode = ErrorCode.EC_SendError
                        job.jobDispatcherErrorDiag = "Sent job didn't receive reply from pilot within 30 min"
                    elif job.exeErrorDiag == 'NULL' and job.pilotErrorDiag == 'NULL':
                        # lost heartbeat
                        job.jobDispatcherErrorCode = ErrorCode.EC_Watcher
                        if job.jobDispatcherErrorDiag == 'NULL':
                            if job.endTime == 'NULL':
                                # normal lost heartbeat
                                job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.modificationTime)
                            else:
                                # job recovery failed
                                job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.endTime)
                                if job.jobStatus == 'transferring':
                                    job.jobDispatcherErrorDiag += ' in transferring'
                            # get worker
                            workerSpecs = self.taskBuffer.getWorkersForJob(job.PandaID)
                            if len(workerSpecs) > 0:
                                workerSpec = workerSpecs[0]
                                if workerSpec.status in ['finished', 'failed', 'cancelled', 'missed']:
                                    job.supErrorCode = SupErrors.error_codes['WORKER_ALREADY_DONE']
                                    job.supErrorDiag = 'worker already {0} at {1} with {2}'.format(workerSpec.status, str(workerSpec.endTime),
                                                                                                   workerSpec.diagMessage)
                                    job.supErrorDiag = JobSpec.truncateStringAttr('supErrorDiag', job.supErrorDiag)
                    else:
                        # job recovery failed
                        job.jobDispatcherErrorCode = ErrorCode.EC_Recovery
                        job.jobDispatcherErrorDiag = 'job recovery failed for %s hours' % (self.sleepTime/60)
                    # set job status
                    job.jobStatus = 'failed'
                    # set endTime for lost heartbeat
                    if job.endTime == 'NULL':
                        # normal lost heartbeat
                        job.endTime = job.modificationTime
                    # set files status
                    for file in job.Files:
                        if file.type == 'output' or file.type == 'log':
                            file.status = 'failed'
                            if not file.destinationDBlock in destDBList:
                                destDBList.append(file.destinationDBlock)
                    # event service
                    if EventServiceUtils.isEventServiceJob(job) and not EventServiceUtils.isJobCloningJob(job):
                        eventStat = self.taskBuffer.getEventStat(job.jediTaskID, job.PandaID)
                        # set sub status when no successful events
                        if EventServiceUtils.ST_finished not in eventStat:
                            job.jobSubStatus = 'es_heartbeat'
                    # update job
                    self.taskBuffer.updateJobs([job],False)
                    # start closer
                    if job.jobStatus == 'failed':

                        source = 'jobDispatcherErrorCode'
                        error_code = job.jobDispatcherErrorCode
                        error_diag = job.jobDispatcherErrorDiag

                        try:
                            _logger.debug("Watcher will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(self.taskBuffer, job.PandaID, source, error_code, error_diag, job.attemptNr)
                            _logger.debug("apply_retrial_rules is back")
                        except Exception as e:
                            _logger.error("apply_retrial_rules excepted and needs to be investigated (%s): %s" % (e, traceback.format_exc()))

                        # updateJobs was successful and it failed a job with taskBufferErrorCode
                        try:

                            _logger.debug("Watcher.run will peek the job")
                            job_tmp = self.taskBuffer.peekJobs([job.PandaID], fromDefined=False, fromArchived=True,
                                                               fromWaiting=False)[0]
                            if job_tmp.taskBufferErrorCode:
                                source = 'taskBufferErrorCode'
                                error_code = job_tmp.taskBufferErrorCode
                                error_diag = job_tmp.taskBufferErrorDiag
                                _logger.debug("Watcher.run 2 will call apply_retrial_rules")
                                retryModule.apply_retrial_rules(self.taskBuffer, job_tmp.PandaID, source, error_code,
                                                                error_diag, job_tmp.attemptNr)
                                _logger.debug("apply_retrial_rules 2 is back")
                        except IndexError:
                            pass
                        except Exception as e:
                            _logger.error("apply_retrial_rules 2 excepted and needs to be investigated (%s): %s" % (e, traceback.format_exc()))

                        cThr = Closer(self.taskBuffer,destDBList,job)
                        cThr.start()
                        cThr.join()
                    _logger.debug('%s end' % job.PandaID)                        
                    return
                # single action
                if self.single:
                    return
                # sleep
                time.sleep(60*self.sleepTime)
        except:
            type, value, traceBack = sys.exc_info()
            _logger.error("run() : %s %s" % (type,value))
            return
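
The Watcher's trigger is the time-limit test near the top of the loop: a job counts as lost when neither modificationTime nor endTime has advanced within sleepTime minutes. Isolated as a sketch, assuming datetime values and the examples' convention that an unset endTime is the literal string 'NULL':

    import datetime

    def heartbeat_expired(modification_time, end_time, sleep_time_minutes):
        # anything last touched before this cutoff is considered lost
        time_limit = datetime.datetime.utcnow() - datetime.timedelta(minutes=sleep_time_minutes)
        if modification_time < time_limit:
            return True
        return end_time != 'NULL' and end_time < time_limit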