def run(self):
    try:
        while True:
            _logger.debug('%s start' % self.pandaID)
            # query job
            job = self.taskBuffer.peekJobs([self.pandaID], fromDefined=False,
                                           fromArchived=False, fromWaiting=False)[0]
            # check job status
            if job is None:
                _logger.debug('%s escape : not found' % self.pandaID)
                return
            if job.jobStatus not in ['running', 'sent', 'starting', 'holding',
                                     'stagein', 'stageout']:
                if job.jobStatus == 'transferring' and job.prodSourceLabel in ['user', 'panda']:
                    pass
                else:
                    _logger.debug('%s escape : %s' % (self.pandaID, job.jobStatus))
                    return
            # time limit
            timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=self.sleepTime)
            if job.modificationTime < timeLimit or (job.endTime != 'NULL' and job.endTime < timeLimit):
                _logger.debug('%s %s lastmod:%s endtime:%s' % (job.PandaID, job.jobStatus,
                                                               str(job.modificationTime),
                                                               str(job.endTime)))
                # retry ES merge jobs
                if EventServiceUtils.isEventServiceMerge(job):
                    self.taskBuffer.retryJob(job.PandaID, {}, getNewPandaID=True,
                                             attemptNr=job.attemptNr,
                                             recoverableEsMerge=True)
                    # read back
                    job = self.taskBuffer.peekJobs([self.pandaID], fromDefined=False,
                                                   fromArchived=False, fromWaiting=False)[0]
                destDBList = []
                if job.jobStatus == 'sent':
                    # sent job didn't receive reply from pilot within 30 min
                    job.jobDispatcherErrorCode = ErrorCode.EC_SendError
                    job.jobDispatcherErrorDiag = "Sent job didn't receive reply from pilot within 30 min"
                elif job.exeErrorDiag == 'NULL' and job.pilotErrorDiag == 'NULL':
                    # lost heartbeat
                    job.jobDispatcherErrorCode = ErrorCode.EC_Watcher
                    if job.jobDispatcherErrorDiag == 'NULL':
                        if job.endTime == 'NULL':
                            # normal lost heartbeat
                            job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.modificationTime)
                        else:
                            # job recovery failed
                            job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.endTime)
                            if job.jobStatus == 'transferring':
                                job.jobDispatcherErrorDiag += ' in transferring'
                else:
                    # job recovery failed
                    job.jobDispatcherErrorCode = ErrorCode.EC_Recovery
                    job.jobDispatcherErrorDiag = 'job recovery failed for %s hours' % (self.sleepTime / 60)
                # set job status
                job.jobStatus = 'failed'
                # set endTime for lost heartbeat
                if job.endTime == 'NULL':
                    # normal lost heartbeat
                    job.endTime = job.modificationTime
                # set files status
                for file in job.Files:
                    if file.type == 'output' or file.type == 'log':
                        file.status = 'failed'
                        if file.destinationDBlock not in destDBList:
                            destDBList.append(file.destinationDBlock)
                # event service
                if EventServiceUtils.isEventServiceJob(job) and not EventServiceUtils.isJobCloningJob(job):
                    eventStat = self.taskBuffer.getEventStat(job.jediTaskID, job.PandaID)
                    # set sub status when no successful events
                    if EventServiceUtils.ST_finished not in eventStat:
                        job.jobSubStatus = 'es_heartbeat'
                # update job
                self.taskBuffer.updateJobs([job], False)
                # start closer
                if job.jobStatus == 'failed':
                    source = 'jobDispatcherErrorCode'
                    error_code = job.jobDispatcherErrorCode
                    error_diag = job.jobDispatcherErrorDiag
                    try:
                        _logger.debug("Watcher will call apply_retrial_rules")
                        retryModule.apply_retrial_rules(self.taskBuffer, job.PandaID, source,
                                                        error_code, error_diag, job.attemptNr)
                        _logger.debug("apply_retrial_rules is back")
                    except Exception as e:
                        _logger.debug("apply_retrial_rules excepted and needs to be investigated (%s): %s"
                                      % (e, traceback.format_exc()))
                    # updateJobs was successful and it failed a job with taskBufferErrorCode
                    try:
                        _logger.debug("Watcher.run will peek the job")
                        job_tmp = self.taskBuffer.peekJobs([job.PandaID], fromDefined=False,
                                                           fromArchived=True, fromWaiting=False)[0]
                        if job_tmp.taskBufferErrorCode:
                            source = 'taskBufferErrorCode'
                            error_code = job_tmp.taskBufferErrorCode
                            error_diag = job_tmp.taskBufferErrorDiag
                            _logger.debug("Watcher.run 2 will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(self.taskBuffer, job_tmp.PandaID, source,
                                                            error_code, error_diag, job_tmp.attemptNr)
                            _logger.debug("apply_retrial_rules 2 is back")
                    except IndexError:
                        pass
                    except Exception as e:
                        _logger.error("apply_retrial_rules 2 excepted and needs to be investigated (%s): %s"
                                      % (e, traceback.format_exc()))
                cThr = Closer(self.taskBuffer, destDBList, job)
                cThr.start()
                cThr.join()
                _logger.debug('%s end' % job.PandaID)
                return
            # single action
            if self.single:
                return
            # sleep
            time.sleep(60 * self.sleepTime)
    except:
        errType, errValue, traceBack = sys.exc_info()
        _logger.error("run() : %s %s" % (errType, errValue))
        return
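
# ---------------------------------------------------------------------------
# Usage sketch (hypothetical): Watcher is a thread that babysits one PandaID.
# The constructor arguments below are inferred from the attributes this run()
# uses (taskBuffer, pandaID, single, sleepTime) and are not guaranteed to
# match the real signature.
#
#     thr = Watcher(taskBuffer, pandaID, single=True, sleepTime=120)
#     thr.start()
#     thr.join()
#
# With single=True the loop body runs once; otherwise it re-checks the job
# every sleepTime minutes until the job either leaves the watched states or
# exceeds the heartbeat time limit and is failed.
# ---------------------------------------------------------------------------
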
def run(self):
    try:
        self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus, self.attemptNr))
        # lock XML
        self.lockXML = open(self.xmlFile)
        try:
            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
        except:
            self.logger.debug("cannot get lock : %s" % self.xmlFile)
            self.lockXML.close()
            # remove XML just in case for the final attempt
            if not self.ignoreTmpError:
                try:
                    # remove Catalog
                    os.remove(self.xmlFile)
                except:
                    pass
            return
        # check if file exists
        if not os.path.exists(self.xmlFile):
            self.logger.debug("not exist : %s" % self.xmlFile)
            try:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                self.lockXML.close()
            except:
                pass
            return
        # query job
        self.job = self.taskBuffer.peekJobs([self.jobID], fromDefined=False,
                                            fromWaiting=False, forAnal=True)[0]
        # check if job has finished
        if self.job is None:
            self.logger.debug(': job not found in DB')
        elif self.job.jobStatus in ['finished', 'failed', 'unknown', 'merging']:
            self.logger.error(': invalid state -> %s' % self.job.jobStatus)
        elif self.attemptNr is not None and self.job.attemptNr != self.attemptNr:
            self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr, self.attemptNr))
        elif self.attemptNr is not None and self.job.jobStatus == 'transferring':
            errMsg = 'XML with attemptNr for {0}'.format(self.job.jobStatus)
            self.logger.error(errMsg)
            # FIXME
            raise RuntimeError(errMsg)
        elif self.jobStatus == EventServiceUtils.esRegStatus:
            # instantiate concrete plugin
            adderPluginClass = self.getPluginClass(self.job.VO)
            adderPlugin = adderPluginClass(self.job,
                                           taskBuffer=self.taskBuffer,
                                           siteMapper=self.siteMapper,
                                           logger=self.logger)
            # execute
            self.logger.debug('plugin is ready for ES file registration')
            adderPlugin.registerEventServiceFiles()
        else:
            # check file status in JEDI
            if not self.job.isCancelled() and \
                    self.job.taskBufferErrorCode not in [taskbuffer.ErrorCode.EC_PilotRetried]:
                fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job)
                self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI))
                if fileCheckInJEDI is None:
                    raise RuntimeError('failed to check file status in JEDI')
                if fileCheckInJEDI is False:
                    # set job status to failed since some file status is wrong in JEDI
                    self.jobStatus = 'failed'
                    self.job.ddmErrorCode = ErrorCode.EC_Adder
                    errStr = "inconsistent file status between Panda and JEDI. "
                    errStr += "failed to avoid duplicated processing caused by synchronization failure"
                    self.job.ddmErrorDiag = errStr
                    self.logger.debug("set jobStatus={0} since input is inconsistent between "
                                      "Panda and JEDI".format(self.jobStatus))
                elif self.job.jobSubStatus in ['pilot_closed']:
                    # terminated by the pilot
                    self.logger.debug("going to closed since terminated by the pilot")
                    retClosed = self.taskBuffer.killJobs([self.jobID], 'pilot', '60', True)
                    if retClosed[0] is True:
                        self.logger.debug("end")
                        try:
                            # remove Catalog
                            os.remove(self.xmlFile)
                        except:
                            pass
                        # unlock XML
                        if self.lockXML is not None:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        return
                # check for cloned jobs
                if EventServiceUtils.isJobCloningJob(self.job):
                    checkJC = self.taskBuffer.checkClonedJob(self.job)
                    if checkJC is None:
                        raise RuntimeError('failed to check the cloned job')
                    # failed to lock semaphore
                    if checkJC['lock'] is False:
                        self.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "failed to lock semaphore for job cloning"
                        self.logger.debug("set jobStatus={0} since did not get semaphore "
                                          "for job cloning".format(self.jobStatus))
            # use failed for cancelled/closed jobs
            if self.job.isCancelled():
                self.jobStatus = 'failed'
                # reset error codes to skip retrial module
                self.job.pilotErrorCode = 0
                self.job.exeErrorCode = 0
                self.job.ddmErrorCode = 0
            # keep old status
            oldJobStatus = self.job.jobStatus
            # set job status
            if self.job.jobStatus not in ['transferring']:
                self.job.jobStatus = self.jobStatus
            addResult = None
            adderPlugin = None
            # parse XML
            parseResult = self.parseXML()
            if parseResult < 2:
                # interaction with DDM
                try:
                    # instantiate concrete plugin
                    adderPluginClass = self.getPluginClass(self.job.VO)
                    adderPlugin = adderPluginClass(self.job,
                                                   taskBuffer=self.taskBuffer,
                                                   siteMapper=self.siteMapper,
                                                   extraInfo=self.extraInfo,
                                                   logger=self.logger)
                    # execute
                    self.logger.debug('plugin is ready')
                    adderPlugin.execute()
                    addResult = adderPlugin.result
                    self.logger.debug('plugin done with %s' % addResult.statusCode)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    self.logger.error("failed to execute AdderPlugin for VO={0} with {1}:{2}"
                                      .format(self.job.VO, errtype, errvalue))
                    addResult = None
                    self.job.ddmErrorCode = ErrorCode.EC_Adder
                    self.job.ddmErrorDiag = "AdderPlugin failure"
                # ignore temporary errors
                if self.ignoreTmpError and addResult is not None and addResult.isTemporary():
                    self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag)
                    self.logger.debug('escape')
                    # unlock XML
                    try:
                        fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                        self.lockXML.close()
                    except:
                        errType, errValue, traceBack = sys.exc_info()
                        self.logger.debug(": %s %s" % (errType, errValue))
                        self.logger.debug("cannot unlock XML")
                    return
                # failed
                if addResult is None or not addResult.isSucceeded():
                    self.job.jobStatus = 'failed'
            # set file status for failed jobs or failed transferring jobs
            self.logger.debug("status after plugin call :job.jobStatus=%s jobStatus=%s"
                              % (self.job.jobStatus, self.jobStatus))
            if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                # First of all: check if job failed and in this case take first actions according to error table
                source, error_code, error_diag = None, None, None
                if self.job.pilotErrorCode:
                    source = 'pilotErrorCode'
                    error_code = self.job.pilotErrorCode
                    error_diag = self.job.pilotErrorDiag
                elif self.job.exeErrorCode:
                    source = 'exeErrorCode'
                    error_code = self.job.exeErrorCode
                    error_diag = self.job.exeErrorDiag
                elif self.job.ddmErrorCode:
                    source = 'ddmErrorCode'
                    error_code = self.job.ddmErrorCode
                    error_diag = self.job.ddmErrorDiag
                elif self.job.transExitCode:
                    source = 'transExitCode'
                    error_code = self.job.transExitCode
                    error_diag = ''
                # _logger.info("updatejob has source %s, error_code %s and error_diag %s"
                #              % (source, error_code, error_diag))
                if source and error_code:
                    try:
                        self.logger.debug("AdderGen.run will call apply_retrial_rules")
                        retryModule.apply_retrial_rules(self.taskBuffer, self.job.PandaID, source,
                                                        error_code, error_diag, self.job.attemptNr)
                        self.logger.debug("apply_retrial_rules is back")
                    except Exception as e:
                        self.logger.error("apply_retrial_rules excepted and needs to be investigated (%s): %s"
                                          % (e, traceback.format_exc()))
                self.job.jobStatus = 'failed'
                for file in self.job.Files:
                    if file.type in ['output', 'log']:
                        if addResult is not None and file.lfn in addResult.mergingFiles:
                            file.status = 'merging'
                        else:
                            file.status = 'failed'
            else:
                # reset errors
                self.job.jobDispatcherErrorCode = 0
                self.job.jobDispatcherErrorDiag = 'NULL'
                # set status
                if addResult is not None and addResult.mergingFiles != []:
                    # set status for merging
                    for file in self.job.Files:
                        if file.lfn in addResult.mergingFiles:
                            file.status = 'merging'
                    self.job.jobStatus = 'merging'
                    # propagate transition to prodDB
                    self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
                elif addResult is not None and addResult.transferringFiles != []:
                    # set status for transferring
                    for file in self.job.Files:
                        if file.lfn in addResult.transferringFiles:
                            file.status = 'transferring'
                    self.job.jobStatus = 'transferring'
                    # propagate transition to prodDB
                    self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
                else:
                    self.job.jobStatus = 'finished'
            # endtime
            if self.job.endTime == 'NULL':
                self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
            # output size and # of outputs
            self.job.nOutputDataFiles = 0
            self.job.outputFileBytes = 0
            for tmpFile in self.job.Files:
                if tmpFile.type == 'output':
                    self.job.nOutputDataFiles += 1
                    try:
                        self.job.outputFileBytes += tmpFile.fsize
                    except:
                        pass
            # protection
            maxOutputFileBytes = 99999999999
            if self.job.outputFileBytes > maxOutputFileBytes:
                self.job.outputFileBytes = maxOutputFileBytes
            # set cancelled state
            if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                self.job.jobStatus = 'cancelled'
            # update job
            if oldJobStatus in ['cancelled', 'closed']:
                pass
            else:
                self.logger.debug("updating DB")
                retU = self.taskBuffer.updateJobs([self.job], False,
                                                  oldJobStatusList=[oldJobStatus],
                                                  extraInfo=self.extraInfo)
                self.logger.debug("retU: %s" % retU)
                # failed
                if not retU[0]:
                    self.logger.error('failed to update DB for pandaid={0}'.format(self.job.PandaID))
                    # unlock XML
                    try:
                        fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                        self.lockXML.close()
                    except:
                        errType, errValue, traceBack = sys.exc_info()
                        self.logger.debug(": %s %s" % (errType, errValue))
                        self.logger.debug("cannot unlock XML")
                    return
                try:
                    # updateJobs was successful and it failed a job with taskBufferErrorCode
                    self.logger.debug("AdderGen.run will peek the job")
                    job_tmp = self.taskBuffer.peekJobs([self.job.PandaID], fromDefined=False,
                                                       fromArchived=True, fromWaiting=False)[0]
                    self.logger.debug("status {0}, taskBufferErrorCode {1}, taskBufferErrorDiag {2}"
                                      .format(job_tmp.jobStatus, job_tmp.taskBufferErrorCode,
                                              job_tmp.taskBufferErrorDiag))
                    if job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode:
                        source = 'taskBufferErrorCode'
                        error_code = job_tmp.taskBufferErrorCode
                        error_diag = job_tmp.taskBufferErrorDiag
                        self.logger.debug("AdderGen.run 2 will call apply_retrial_rules")
                        retryModule.apply_retrial_rules(self.taskBuffer, job_tmp.PandaID, source,
                                                        error_code, error_diag, job_tmp.attemptNr)
                        self.logger.debug("apply_retrial_rules 2 is back")
                except IndexError:
                    pass
                except Exception as e:
                    self.logger.error("apply_retrial_rules 2 excepted and needs to be investigated (%s): %s"
                                      % (e, traceback.format_exc()))
            # setup for closer
            if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.isCancelled()):
                destDBList = []
                guidList = []
                for file in self.job.Files:
                    # ignore inputs
                    if file.type == 'input':
                        continue
                    # skip pseudo datasets
                    if file.destinationDBlock in ['', None, 'NULL']:
                        continue
                    # start closer for output/log datasets
                    if file.destinationDBlock not in destDBList:
                        destDBList.append(file.destinationDBlock)
                    # collect GUIDs
                    if (self.job.prodSourceLabel == 'panda' or
                        (self.job.prodSourceLabel in ['ptest', 'rc_test', 'rucio_test'] and
                         self.job.processingType in ['pathena', 'prun', 'gangarobot-rctest', 'hammercloud'])) \
                            and file.type == 'output':
                        # extract base LFN since LFN was changed to full LFN for CMS
                        baseLFN = file.lfn.split('/')[-1]
                        guidList.append({'lfn': baseLFN, 'guid': file.GUID, 'type': file.type,
                                         'checksum': file.checksum, 'md5sum': file.md5sum,
                                         'fsize': file.fsize, 'scope': file.scope})
                if guidList != []:
                    retG = self.taskBuffer.setGUIDs(guidList)
                if destDBList != []:
                    # start Closer
                    if adderPlugin is not None and hasattr(adderPlugin, 'datasetMap') \
                            and adderPlugin.datasetMap != {}:
                        cThr = Closer.Closer(self.taskBuffer, destDBList, self.job,
                                             datasetMap=adderPlugin.datasetMap)
                    else:
                        cThr = Closer.Closer(self.taskBuffer, destDBList, self.job)
                    self.logger.debug("start Closer")
                    cThr.start()
                    cThr.join()
                    self.logger.debug("end Closer")
                    # run closer for associated parallel jobs
                    if EventServiceUtils.isJobCloningJob(self.job):
                        assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(
                            self.job.jediTaskID, self.job.PandaID, destDBList)
                        for assJobID, assDBlocks in assDBlockMap.iteritems():
                            assJob = self.taskBuffer.peekJobs([assJobID], fromDefined=False,
                                                              fromArchived=False,
                                                              fromWaiting=False, forAnal=True)[0]
                            if assJob is None:
                                self.logger.debug(': associated job PandaID={0} not found in DB'
                                                  .format(assJobID))
                            else:
                                cThr = Closer.Closer(self.taskBuffer, assDBlocks, assJob)
                                self.logger.debug("start Closer for PandaID={0}".format(assJobID))
                                cThr.start()
                                cThr.join()
                                self.logger.debug("end Closer for PandaID={0}".format(assJobID))
        self.logger.debug("end")
        try:
            # remove Catalog
            os.remove(self.xmlFile)
        except:
            pass
        # unlock XML
        if self.lockXML is not None:
            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
            self.lockXML.close()
    except:
        errType, errValue, traceBack = sys.exc_info()
        errStr = ": %s %s " % (errType, errValue)
        errStr += traceback.format_exc()
        self.logger.error(errStr)
        self.logger.error("except")
        # unlock XML just in case
        try:
            if self.lockXML is not None:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
        except:
            errType, errValue, traceBack = sys.exc_info()
            self.logger.error(": %s %s" % (errType, errValue))
            self.logger.error("cannot unlock XML")
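
# ---------------------------------------------------------------------------
# Usage sketch (hypothetical): AdderGen consumes one pilot-produced XML
# catalog per job under an exclusive flock. The constructor arguments are
# inferred from the attributes used above (taskBuffer, jobID, jobStatus,
# xmlFile, attemptNr, ignoreTmpError) and may not match the real signature.
#
#     adder = AdderGen(taskBuffer, jobID, 'finished', xmlFile,
#                      attemptNr=0, ignoreTmpError=True)
#     adder.run()
#
# With ignoreTmpError=True a temporary DDM failure leaves the XML in place so
# a later pass can retry; on the final attempt it is False and the XML is
# removed even when the lock cannot be acquired.
# ---------------------------------------------------------------------------
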
def run(self):
    try:
        while True:
            _logger.debug('%s start' % self.pandaID)
            # query job
            job = self.taskBuffer.peekJobs([self.pandaID], fromDefined=False,
                                           fromArchived=False, fromWaiting=False)[0]
            # check job status
            if job is None:
                _logger.debug('%s escape : not found' % self.pandaID)
                return
            if job.jobStatus not in ['running', 'sent', 'starting', 'holding',
                                     'stagein', 'stageout']:
                if job.jobStatus == 'transferring' and job.prodSourceLabel in ['user', 'panda']:
                    pass
                else:
                    _logger.debug('%s escape : %s' % (self.pandaID, job.jobStatus))
                    return
            # time limit
            timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=self.sleepTime)
            if job.modificationTime < timeLimit or (job.endTime != 'NULL' and job.endTime < timeLimit):
                _logger.debug('%s %s lastmod:%s endtime:%s' % (job.PandaID, job.jobStatus,
                                                               str(job.modificationTime),
                                                               str(job.endTime)))
                # retry ES merge jobs
                if EventServiceUtils.isEventServiceMerge(job):
                    self.taskBuffer.retryJob(job.PandaID, {}, getNewPandaID=True,
                                             attemptNr=job.attemptNr,
                                             recoverableEsMerge=True)
                    # read back
                    job = self.taskBuffer.peekJobs([self.pandaID], fromDefined=False,
                                                   fromArchived=False, fromWaiting=False)[0]
                destDBList = []
                # retry analysis jobs
                if (job.prodSourceLabel in ['user', 'panda']) \
                        and (job.attemptNr < 2 or job.jobStatus == 'sent') \
                        and job.commandToPilot != 'tobekilled' \
                        and job.processingType not in ['ITB_INTEGRATION'] \
                        and job.taskBufferErrorCode not in [taskbuffer.ErrorCode.EC_Reassigned,
                                                            taskbuffer.ErrorCode.EC_Retried,
                                                            taskbuffer.ErrorCode.EC_PilotRetried] \
                        and not job.processingType.startswith('gangarobot') \
                        and not job.processingType.startswith('hammercloud'):
                    # reset
                    _logger.debug(' -> reset %s job with %s : PandaID:%s #%s'
                                  % (job.prodSourceLabel, job.jobStatus, job.PandaID, job.attemptNr))
                    job.jobStatus = 'activated'
                    job.startTime = None
                    job.endTime = None
                    job.attemptNr = job.attemptNr + 1
                    # remove flag related to pledge-resource handling
                    if job.specialHandling not in [None, 'NULL', '']:
                        newSpecialHandling = re.sub(',*localpool', '', job.specialHandling)
                        if newSpecialHandling == '':
                            job.specialHandling = None
                        else:
                            job.specialHandling = newSpecialHandling
                    # TEMPORARY : send it to long queue
                    oldComputingSite = job.computingSite
                    if job.jobStatus != 'sent' and job.computingSite.startswith('ANALY') \
                            and not job.computingSite.startswith('ANALY_LONG_'):
                        tmpLongSiteList = []
                        tmpLongSite = re.sub(r'^ANALY_', 'ANALY_LONG_', job.computingSite)
                        tmpLongSite = re.sub(r'_\d+$', '', tmpLongSite)
                        tmpLongSiteList.append(tmpLongSite)
                        tmpLongSite = job.computingSite + '_LONG'
                        tmpLongSiteList.append(tmpLongSite)
                        tmpLongSite = re.sub('SHORT', 'LONG', job.computingSite)
                        if tmpLongSite != job.computingSite:
                            tmpLongSiteList.append(tmpLongSite)
                        for longSite in tmpLongSiteList:
                            if self.siteMapper.checkSite(longSite):
                                tmpSiteSpec = self.siteMapper.getSite(longSite)
                                if tmpSiteSpec.status == 'online':
                                    job.computingSite = longSite
                                    _logger.debug(' -> sending PandaID:%s to %s'
                                                  % (job.PandaID, job.computingSite))
                                    # set destinationSE
                                    if job.destinationSE == oldComputingSite:
                                        job.destinationSE = job.computingSite
                                    break
                    # modify LFNs and destinationSE
                    for file in job.Files:
                        modTypes = ('output', 'log')
                        if file.type in modTypes:
                            # set destinationSE
                            if file.destinationSE == oldComputingSite:
                                file.destinationSE = job.computingSite
                        if job.prodSourceLabel == 'panda':
                            # doesn't change output for buildJob
                            modTypes = ('log',)
                        if file.type in modTypes:
                            # set new GUID
                            if file.type == 'log':
                                file.GUID = commands.getoutput('uuidgen')
                            # add attempt nr
                            oldName = file.lfn
                            file.lfn = re.sub(r"\.\d+$", "", file.lfn)
                            file.lfn = "%s.%d" % (file.lfn, job.attemptNr)
                            newName = file.lfn
                            # modify jobParameters
                            sepPatt = "(\'|\"|%20|:)" + oldName + "(\'|\"|%20| )"
                            matches = re.findall(sepPatt, job.jobParameters)
                            for match in matches:
                                oldPatt = match[0] + oldName + match[-1]
                                newPatt = match[0] + newName + match[-1]
                                job.jobParameters = re.sub(oldPatt, newPatt, job.jobParameters)
                else:
                    if job.jobStatus == 'sent':
                        # sent job didn't receive reply from pilot within 30 min
                        job.jobDispatcherErrorCode = ErrorCode.EC_SendError
                        job.jobDispatcherErrorDiag = "Sent job didn't receive reply from pilot within 30 min"
                    elif job.exeErrorDiag == 'NULL' and job.pilotErrorDiag == 'NULL':
                        # lost heartbeat
                        job.jobDispatcherErrorCode = ErrorCode.EC_Watcher
                        if job.jobDispatcherErrorDiag == 'NULL':
                            if job.endTime == 'NULL':
                                # normal lost heartbeat
                                job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.modificationTime)
                            else:
                                # job recovery failed
                                job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.endTime)
                                if job.jobStatus == 'transferring':
                                    job.jobDispatcherErrorDiag += ' in transferring'
                    else:
                        # job recovery failed
                        job.jobDispatcherErrorCode = ErrorCode.EC_Recovery
                        job.jobDispatcherErrorDiag = 'job recovery failed for %s hours' % (self.sleepTime / 60)
                    # set job status
                    job.jobStatus = 'failed'
                    # set endTime for lost heartbeat
                    if job.endTime == 'NULL':
                        # normal lost heartbeat
                        job.endTime = job.modificationTime
                    # set files status
                    for file in job.Files:
                        if file.type == 'output' or file.type == 'log':
                            file.status = 'failed'
                            if file.destinationDBlock not in destDBList:
                                destDBList.append(file.destinationDBlock)
                # update job
                self.taskBuffer.updateJobs([job], False)
                # start closer
                if job.jobStatus == 'failed':
                    source = 'jobDispatcherErrorCode'
                    error_code = job.jobDispatcherErrorCode
                    error_diag = job.jobDispatcherErrorDiag
                    try:
                        _logger.debug("Watcher will call apply_retrial_rules")
                        retryModule.apply_retrial_rules(self.taskBuffer, job.PandaID, source,
                                                        error_code, error_diag, job.attemptNr)
                        _logger.debug("apply_retrial_rules is back")
                    except Exception as e:
                        _logger.debug("apply_retrial_rules excepted and needs to be investigated (%s)" % e)
                cThr = Closer(self.taskBuffer, destDBList, job)
                cThr.start()
                cThr.join()
                _logger.debug('%s end' % job.PandaID)
                return
            # single action
            if self.single:
                return
            # sleep
            time.sleep(60 * self.sleepTime)
    except:
        errType, errValue, traceBack = sys.exc_info()
        _logger.error("run() : %s %s" % (errType, errValue))
        return
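
# ---------------------------------------------------------------------------
# Note on the retry branch above: the LFN rename strips any existing numeric
# attempt suffix before appending the new one, so repeated resets do not
# stack suffixes. A standalone illustration (same regexes, made-up name):
#
#     import re
#     lfn = 'user.jdoe.job.log.tgz.3'
#     lfn = re.sub(r"\.\d+$", "", lfn)   # -> 'user.jdoe.job.log.tgz'
#     lfn = "%s.%d" % (lfn, 4)           # -> 'user.jdoe.job.log.tgz.4'
#
# The jobParameters rewrite then only touches occurrences of the old name
# delimited by quotes, '%20', ':' or a space, so substrings of longer names
# are left alone.
# ---------------------------------------------------------------------------
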
def run(self):
    try:
        self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus, self.attemptNr))
        # lock XML
        self.lockXML = open(self.xmlFile)
        try:
            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
        except:
            self.logger.debug("cannot get lock : %s" % self.xmlFile)
            self.lockXML.close()
            # remove XML just in case for the final attempt
            if not self.ignoreTmpError:
                try:
                    # remove Catalog
                    os.remove(self.xmlFile)
                except:
                    pass
            return
        # check if file exists
        if not os.path.exists(self.xmlFile):
            self.logger.debug("not exist : %s" % self.xmlFile)
            try:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                self.lockXML.close()
            except:
                pass
            return
        # query job
        self.job = self.taskBuffer.peekJobs([self.jobID], fromDefined=False,
                                            fromWaiting=False, forAnal=True)[0]
        # check if job has finished
        if self.job is None:
            self.logger.debug(': job not found in DB')
        elif self.job.jobStatus in ['finished', 'failed', 'unknown', 'merging']:
            self.logger.error(': invalid state -> %s' % self.job.jobStatus)
        elif self.attemptNr is not None and self.job.attemptNr != self.attemptNr:
            self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr, self.attemptNr))
        elif self.attemptNr is not None and self.job.jobStatus == 'transferring':
            errMsg = 'XML with attemptNr for {0}'.format(self.job.jobStatus)
            self.logger.error(errMsg)
            # FIXME
            raise RuntimeError(errMsg)
        elif self.jobStatus == EventServiceUtils.esRegStatus:
            # instantiate concrete plugin
            adderPluginClass = self.getPluginClass(self.job.VO)
            adderPlugin = adderPluginClass(self.job,
                                           taskBuffer=self.taskBuffer,
                                           siteMapper=self.siteMapper,
                                           logger=self.logger)
            # execute
            self.logger.debug('plugin is ready for ES file registration')
            adderPlugin.registerEventServiceFiles()
        else:
            # check file status in JEDI
            if not self.job.isCancelled() and \
                    self.job.taskBufferErrorCode not in [taskbuffer.ErrorCode.EC_PilotRetried]:
                fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job)
                self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI))
                if fileCheckInJEDI is None:
                    raise RuntimeError('failed to check file status in JEDI')
                if fileCheckInJEDI is False:
                    # set job status to failed since some file status is wrong in JEDI
                    self.jobStatus = 'failed'
                    self.job.ddmErrorCode = ErrorCode.EC_Adder
                    errStr = "inconsistent file status between Panda and JEDI. "
                    errStr += "failed to avoid duplicated processing caused by synchronization failure"
                    self.job.ddmErrorDiag = errStr
                    self.logger.debug("set jobStatus={0} since input is inconsistent between "
                                      "Panda and JEDI".format(self.jobStatus))
                elif self.job.jobSubStatus in ['pilot_closed']:
                    # terminated by the pilot
                    self.logger.debug("going to closed since terminated by the pilot")
                    retClosed = self.taskBuffer.killJobs([self.jobID], 'pilot', '60', True)
                    if retClosed[0] is True:
                        self.logger.debug("end")
                        try:
                            # remove Catalog
                            os.remove(self.xmlFile)
                        except:
                            pass
                        # unlock XML
                        if self.lockXML is not None:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        return
                # check for cloned jobs
                if EventServiceUtils.isJobCloningJob(self.job):
                    checkJC = self.taskBuffer.checkClonedJob(self.job)
                    if checkJC is None:
                        raise RuntimeError('failed to check the cloned job')
                    # failed to lock semaphore
                    if checkJC['lock'] is False:
                        self.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "failed to lock semaphore for job cloning"
                        self.logger.debug("set jobStatus={0} since did not get semaphore "
                                          "for job cloning".format(self.jobStatus))
            # use failed for cancelled/closed jobs
            if self.job.isCancelled():
                self.jobStatus = 'failed'
                # reset error codes to skip retrial module
                self.job.pilotErrorCode = 0
                self.job.exeErrorCode = 0
                self.job.ddmErrorCode = 0
            # keep old status
            oldJobStatus = self.job.jobStatus
            # set job status
            if self.job.jobStatus not in ['transferring']:
                self.job.jobStatus = self.jobStatus
            addResult = None
            adderPlugin = None
            # parse XML
            parseResult = self.parseXML()
            if parseResult < 2:
                # interaction with DDM
                try:
                    # instantiate concrete plugin
                    adderPluginClass = self.getPluginClass(self.job.VO)
                    adderPlugin = adderPluginClass(self.job,
                                                   taskBuffer=self.taskBuffer,
                                                   siteMapper=self.siteMapper,
                                                   extraInfo=self.extraInfo,
                                                   logger=self.logger)
                    # execute
                    self.logger.debug('plugin is ready')
                    adderPlugin.execute()
                    addResult = adderPlugin.result
                    self.logger.debug('plugin done with %s' % addResult.statusCode)
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    self.logger.error("failed to execute AdderPlugin for VO={0} with {1}:{2}"
                                      .format(self.job.VO, errtype, errvalue))
                    addResult = None
                    self.job.ddmErrorCode = ErrorCode.EC_Adder
                    self.job.ddmErrorDiag = "AdderPlugin failure"
                # ignore temporary errors
                if self.ignoreTmpError and addResult is not None and addResult.isTemporary():
                    self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag)
                    self.logger.debug('escape')
                    # unlock XML
                    try:
                        fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                        self.lockXML.close()
                    except:
                        errType, errValue, traceBack = sys.exc_info()
                        self.logger.debug(": %s %s" % (errType, errValue))
                        self.logger.debug("cannot unlock XML")
                    return
                # failed
                if addResult is None or not addResult.isSucceeded():
                    self.job.jobStatus = 'failed'
            # set file status for failed jobs or failed transferring jobs
            self.logger.debug("status after plugin call :job.jobStatus=%s jobStatus=%s"
                              % (self.job.jobStatus, self.jobStatus))
            if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                # First of all: check if job failed and in this case take first actions according to error table
                source, error_code, error_diag = None, None, None
                if self.job.pilotErrorCode:
                    source = 'pilotErrorCode'
                    error_code = self.job.pilotErrorCode
                    error_diag = self.job.pilotErrorDiag
                elif self.job.exeErrorCode:
                    source = 'exeErrorCode'
                    error_code = self.job.exeErrorCode
                    error_diag = self.job.exeErrorDiag
                elif self.job.ddmErrorCode:
                    source = 'ddmErrorCode'
                    error_code = self.job.ddmErrorCode
                    error_diag = self.job.ddmErrorDiag
                elif self.job.transExitCode:
                    source = 'transExitCode'
                    error_code = self.job.transExitCode
                    error_diag = ''
                # _logger.info("updatejob has source %s, error_code %s and error_diag %s"
                #              % (source, error_code, error_diag))
                if source and error_code:
                    try:
                        self.logger.debug("AdderGen.run will call apply_retrial_rules")
                        retryModule.apply_retrial_rules(self.taskBuffer, self.job.PandaID, source,
                                                        error_code, error_diag, self.job.attemptNr)
                        self.logger.debug("apply_retrial_rules is back")
                    except Exception as e:
                        self.logger.error("apply_retrial_rules excepted and needs to be investigated (%s): %s"
                                          % (e, traceback.format_exc()))
                self.job.jobStatus = 'failed'
                for file in self.job.Files:
                    if file.type in ['output', 'log']:
                        if addResult is not None and file.lfn in addResult.mergingFiles:
                            file.status = 'merging'
                        else:
                            file.status = 'failed'
            else:
                # reset errors
                self.job.jobDispatcherErrorCode = 0
                self.job.jobDispatcherErrorDiag = 'NULL'
                # set status
                if addResult is not None and addResult.mergingFiles != []:
                    # set status for merging
                    for file in self.job.Files:
                        if file.lfn in addResult.mergingFiles:
                            file.status = 'merging'
                    self.job.jobStatus = 'merging'
                    # propagate transition to prodDB
                    self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
                elif addResult is not None and addResult.transferringFiles != []:
                    # set status for transferring
                    for file in self.job.Files:
                        if file.lfn in addResult.transferringFiles:
                            file.status = 'transferring'
                    self.job.jobStatus = 'transferring'
                    self.job.jobSubStatus = None
                    # propagate transition to prodDB
                    self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
                else:
                    self.job.jobStatus = 'finished'
            # endtime
            if self.job.endTime == 'NULL':
                self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
            # output size and # of outputs
            self.job.nOutputDataFiles = 0
            self.job.outputFileBytes = 0
            for tmpFile in self.job.Files:
                if tmpFile.type == 'output':
                    self.job.nOutputDataFiles += 1
                    try:
                        self.job.outputFileBytes += tmpFile.fsize
                    except:
                        pass
            # protection
            maxOutputFileBytes = 99999999999
            if self.job.outputFileBytes > maxOutputFileBytes:
                self.job.outputFileBytes = maxOutputFileBytes
            # set cancelled state
            if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                self.job.jobStatus = 'cancelled'
            # update job
            if oldJobStatus in ['cancelled', 'closed']:
                pass
            else:
                self.logger.debug("updating DB")
                retU = self.taskBuffer.updateJobs([self.job], False,
                                                  oldJobStatusList=[oldJobStatus],
                                                  extraInfo=self.extraInfo)
                self.logger.debug("retU: %s" % retU)
                # failed
                if not retU[0]:
                    self.logger.error('failed to update DB for pandaid={0}'.format(self.job.PandaID))
                    # unlock XML
                    try:
                        fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                        self.lockXML.close()
                    except:
                        errType, errValue, traceBack = sys.exc_info()
                        self.logger.debug(": %s %s" % (errType, errValue))
                        self.logger.debug("cannot unlock XML")
                    return
                try:
                    # updateJobs was successful and it failed a job with taskBufferErrorCode
                    self.logger.debug("AdderGen.run will peek the job")
                    job_tmp = self.taskBuffer.peekJobs([self.job.PandaID], fromDefined=False,
                                                       fromArchived=True, fromWaiting=False)[0]
                    self.logger.debug("status {0}, taskBufferErrorCode {1}, taskBufferErrorDiag {2}"
                                      .format(job_tmp.jobStatus, job_tmp.taskBufferErrorCode,
                                              job_tmp.taskBufferErrorDiag))
                    if job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode:
                        source = 'taskBufferErrorCode'
                        error_code = job_tmp.taskBufferErrorCode
                        error_diag = job_tmp.taskBufferErrorDiag
                        self.logger.debug("AdderGen.run 2 will call apply_retrial_rules")
                        retryModule.apply_retrial_rules(self.taskBuffer, job_tmp.PandaID, source,
                                                        error_code, error_diag, job_tmp.attemptNr)
                        self.logger.debug("apply_retrial_rules 2 is back")
                except IndexError:
                    pass
                except Exception as e:
                    self.logger.error("apply_retrial_rules 2 excepted and needs to be investigated (%s): %s"
                                      % (e, traceback.format_exc()))
            # setup for closer
            if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.isCancelled()):
                destDBList = []
                guidList = []
                for file in self.job.Files:
                    # ignore inputs
                    if file.type == 'input':
                        continue
                    # skip pseudo datasets
                    if file.destinationDBlock in ['', None, 'NULL']:
                        continue
                    # start closer for output/log datasets
                    if file.destinationDBlock not in destDBList:
                        destDBList.append(file.destinationDBlock)
                    # collect GUIDs
                    if (self.job.prodSourceLabel == 'panda' or
                        (self.job.prodSourceLabel in ['rucio_test'] + JobUtils.list_ptest_prod_sources and
                         self.job.processingType in ['pathena', 'prun', 'gangarobot-rctest', 'hammercloud'])) \
                            and file.type == 'output':
                        # extract base LFN since LFN was changed to full LFN for CMS
                        baseLFN = file.lfn.split('/')[-1]
                        guidList.append({'lfn': baseLFN, 'guid': file.GUID, 'type': file.type,
                                         'checksum': file.checksum, 'md5sum': file.md5sum,
                                         'fsize': file.fsize, 'scope': file.scope})
                if guidList != []:
                    retG = self.taskBuffer.setGUIDs(guidList)
                if destDBList != []:
                    # start Closer
                    if adderPlugin is not None and hasattr(adderPlugin, 'datasetMap') \
                            and adderPlugin.datasetMap != {}:
                        cThr = Closer.Closer(self.taskBuffer, destDBList, self.job,
                                             datasetMap=adderPlugin.datasetMap)
                    else:
                        cThr = Closer.Closer(self.taskBuffer, destDBList, self.job)
                    self.logger.debug("start Closer")
                    cThr.start()
                    cThr.join()
                    self.logger.debug("end Closer")
                    # run closer for associated parallel jobs
                    if EventServiceUtils.isJobCloningJob(self.job):
                        assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(
                            self.job.jediTaskID, self.job.PandaID, destDBList)
                        for assJobID, assDBlocks in assDBlockMap.iteritems():
                            assJob = self.taskBuffer.peekJobs([assJobID], fromDefined=False,
                                                              fromArchived=False,
                                                              fromWaiting=False, forAnal=True)[0]
                            if assJob is None:
                                self.logger.debug(': associated job PandaID={0} not found in DB'
                                                  .format(assJobID))
                            else:
                                cThr = Closer.Closer(self.taskBuffer, assDBlocks, assJob)
                                self.logger.debug("start Closer for PandaID={0}".format(assJobID))
                                cThr.start()
                                cThr.join()
                                self.logger.debug("end Closer for PandaID={0}".format(assJobID))
        self.logger.debug("end")
        try:
            # remove Catalog
            os.remove(self.xmlFile)
        except:
            pass
        # unlock XML
        if self.lockXML is not None:
            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
            self.lockXML.close()
    except:
        errType, errValue, traceBack = sys.exc_info()
        errStr = ": %s %s " % (errType, errValue)
        errStr += traceback.format_exc()
        self.logger.error(errStr)
        self.logger.error("except")
        # unlock XML just in case
        try:
            if self.lockXML is not None:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
        except:
            errType, errValue, traceBack = sys.exc_info()
            self.logger.error(": %s %s" % (errType, errValue))
            self.logger.error("cannot unlock XML")
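
# ---------------------------------------------------------------------------
# For reference, each guidList entry built above is a plain dict per output
# file, registered in one batch via taskBuffer.setGUIDs(). Illustrative
# values only:
#
#     {'lfn': 'panda.0101.AOD.pool.root', 'guid': 'A1B2C3D4-...',
#      'type': 'output', 'checksum': 'ad:89abcdef', 'md5sum': None,
#      'fsize': 1048576, 'scope': 'user.jdoe'}
# ---------------------------------------------------------------------------
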
def run(self):
    try:
        while True:
            _logger.debug('%s start' % self.pandaID)
            # query job
            job = self.taskBuffer.peekJobs([self.pandaID], fromDefined=False,
                                           fromArchived=False, fromWaiting=False)[0]
            # check job status
            if job is None:
                _logger.debug('%s escape : not found' % self.pandaID)
                return
            _logger.debug('%s in %s' % (self.pandaID, job.jobStatus))
            if job.jobStatus not in ['running', 'sent', 'starting', 'holding',
                                     'stagein', 'stageout']:
                if job.jobStatus == 'transferring' and \
                        (job.prodSourceLabel in ['user', 'panda']
                         or job.jobSubStatus not in [None, 'NULL', '']):
                    pass
                else:
                    _logger.debug('%s escape : %s' % (self.pandaID, job.jobStatus))
                    return
            # time limit
            timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=self.sleepTime)
            if job.modificationTime < timeLimit or (job.endTime != 'NULL' and job.endTime < timeLimit):
                _logger.debug('%s %s lastmod:%s endtime:%s' % (job.PandaID, job.jobStatus,
                                                               str(job.modificationTime),
                                                               str(job.endTime)))
                destDBList = []
                if job.jobStatus == 'sent':
                    # sent job didn't receive reply from pilot within 30 min
                    job.jobDispatcherErrorCode = ErrorCode.EC_SendError
                    job.jobDispatcherErrorDiag = "Sent job didn't receive reply from pilot within 30 min"
                elif job.exeErrorDiag == 'NULL' and job.pilotErrorDiag == 'NULL':
                    # lost heartbeat
                    job.jobDispatcherErrorCode = ErrorCode.EC_Watcher
                    if job.jobDispatcherErrorDiag == 'NULL':
                        if job.endTime == 'NULL':
                            # normal lost heartbeat
                            job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.modificationTime)
                        else:
                            # job recovery failed
                            job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.endTime)
                            if job.jobStatus == 'transferring':
                                job.jobDispatcherErrorDiag += ' in transferring'
                    # get worker
                    workerSpecs = self.taskBuffer.getWorkersForJob(job.PandaID)
                    if len(workerSpecs) > 0:
                        workerSpec = workerSpecs[0]
                        if workerSpec.status in ['finished', 'failed', 'cancelled', 'missed']:
                            job.supErrorCode = SupErrors.error_codes['WORKER_ALREADY_DONE']
                            job.supErrorDiag = 'worker already {0} at {1} with {2}'.format(
                                workerSpec.status, str(workerSpec.endTime), workerSpec.diagMessage)
                            job.supErrorDiag = JobSpec.truncateStringAttr('supErrorDiag', job.supErrorDiag)
                else:
                    # job recovery failed
                    job.jobDispatcherErrorCode = ErrorCode.EC_Recovery
                    job.jobDispatcherErrorDiag = 'job recovery failed for %s hours' % (self.sleepTime / 60)
                # set job status
                job.jobStatus = 'failed'
                # set endTime for lost heartbeat
                if job.endTime == 'NULL':
                    # normal lost heartbeat
                    job.endTime = job.modificationTime
                # set files status
                for file in job.Files:
                    if file.type == 'output' or file.type == 'log':
                        file.status = 'failed'
                        if file.destinationDBlock not in destDBList:
                            destDBList.append(file.destinationDBlock)
                # event service
                if EventServiceUtils.isEventServiceJob(job) and not EventServiceUtils.isJobCloningJob(job):
                    eventStat = self.taskBuffer.getEventStat(job.jediTaskID, job.PandaID)
                    # set sub status when no successful events
                    if EventServiceUtils.ST_finished not in eventStat:
                        job.jobSubStatus = 'es_heartbeat'
                # update job
                self.taskBuffer.updateJobs([job], False)
                # start closer
                if job.jobStatus == 'failed':
                    source = 'jobDispatcherErrorCode'
                    error_code = job.jobDispatcherErrorCode
                    error_diag = job.jobDispatcherErrorDiag
                    try:
                        _logger.debug("Watcher will call apply_retrial_rules")
                        retryModule.apply_retrial_rules(self.taskBuffer, job.PandaID, source,
                                                        error_code, error_diag, job.attemptNr)
                        _logger.debug("apply_retrial_rules is back")
                    except Exception as e:
                        _logger.debug("apply_retrial_rules excepted and needs to be investigated (%s): %s"
                                      % (e, traceback.format_exc()))
                    # updateJobs was successful and it failed a job with taskBufferErrorCode
                    try:
                        _logger.debug("Watcher.run will peek the job")
                        job_tmp = self.taskBuffer.peekJobs([job.PandaID], fromDefined=False,
                                                           fromArchived=True, fromWaiting=False)[0]
                        if job_tmp.taskBufferErrorCode:
                            source = 'taskBufferErrorCode'
                            error_code = job_tmp.taskBufferErrorCode
                            error_diag = job_tmp.taskBufferErrorDiag
                            _logger.debug("Watcher.run 2 will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(self.taskBuffer, job_tmp.PandaID, source,
                                                            error_code, error_diag, job_tmp.attemptNr)
                            _logger.debug("apply_retrial_rules 2 is back")
                    except IndexError:
                        pass
                    except Exception as e:
                        _logger.error("apply_retrial_rules 2 excepted and needs to be investigated (%s): %s"
                                      % (e, traceback.format_exc()))
                cThr = Closer(self.taskBuffer, destDBList, job)
                cThr.start()
                cThr.join()
                _logger.debug('%s end' % job.PandaID)
                return
            # single action
            if self.single:
                return
            # sleep
            time.sleep(60 * self.sleepTime)
    except:
        errType, errValue, traceBack = sys.exc_info()
        _logger.error("run() : %s %s" % (errType, errValue))
        return
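
# ---------------------------------------------------------------------------
# This variant cross-checks a lost heartbeat against the job's worker record:
# if the worker already reached a terminal state, the failure is annotated
# with supErrorCode/supErrorDiag instead of being treated as a bare lost
# heartbeat. Sketch of that lookup (names as used above):
#
#     workerSpecs = taskBuffer.getWorkersForJob(job.PandaID)
#     if workerSpecs and workerSpecs[0].status in ('finished', 'failed',
#                                                  'cancelled', 'missed'):
#         job.supErrorCode = SupErrors.error_codes['WORKER_ALREADY_DONE']
# ---------------------------------------------------------------------------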