Code example #1
 def updateJobs(self, jobList, tmpLog):
     updateJobs = []
     failedJobs = []
     activateJobs = []
     waitingJobs = []
     closeJobs = []
     # sort out jobs
     for job in jobList:
         # failed jobs
         if job.jobStatus in ['failed', 'cancelled']:
             failedJobs.append(job)
         # waiting
         elif job.jobStatus == 'waiting':
             waitingJobs.append(job)
         # no input jobs
         elif job.dispatchDBlock == 'NULL':
             activateJobs.append(job)
         # normal jobs
         else:
             # change status
             job.jobStatus = "assigned"
             updateJobs.append(job)
     # update DB
     tmpLog.debug('# of activated jobs : {0}'.format(len(activateJobs)))
     self.taskBuffer.activateJobs(activateJobs)
     tmpLog.debug('# of updated jobs : {0}'.format(len(updateJobs)))
     self.taskBuffer.updateJobs(updateJobs, True)
     tmpLog.debug('# of failed jobs : {0}'.format(len(failedJobs)))
     self.taskBuffer.updateJobs(failedJobs, True)
     tmpLog.debug('# of waiting jobs : {0}'.format(len(waitingJobs)))
     self.taskBuffer.keepJobs(waitingJobs)
     # to trigger merge generation if all events are done
     finishedJobs = []
     for job in activateJobs:
         if job.notDiscardEvents() and job.allOkEvents() and not EventServiceUtils.isEventServiceMerge(job):
             # change status
             job.jobStatus = "finished"
             finishedJobs.append(job)
     tmpLog.debug('# of finished jobs in activated : {0}'.format(
         len(finishedJobs)))
     self.taskBuffer.updateJobs(finishedJobs, False)
     finishedJobs = []
     for job in updateJobs:
         if job.notDiscardEvents() and job.allOkEvents() and not EventServiceUtils.isEventServiceMerge(job):
             # change status
             job.jobStatus = "finished"
             finishedJobs.append(job)
     tmpLog.debug('# of finished jobs in defined : {0}'.format(
         len(finishedJobs)))
     self.taskBuffer.updateJobs(finishedJobs, True)
     # delete local values
     del updateJobs
     del failedJobs
     del activateJobs
     del waitingJobs
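A minimal, self-contained sketch of the status bucketing performed above; the FakeJob class and the statuses below are invented for illustration and are not part of panda-server:

    class FakeJob(object):
        # hypothetical stand-in for a PanDA job spec, just enough to drive the sorting
        def __init__(self, jobStatus, dispatchDBlock):
            self.jobStatus = jobStatus
            self.dispatchDBlock = dispatchDBlock

    jobList = [FakeJob('failed', 'NULL'), FakeJob('waiting', 'NULL'),
               FakeJob('defined', 'NULL'), FakeJob('defined', 'panda.dis.0001')]
    failedJobs, waitingJobs, activateJobs, updateJobs = [], [], [], []
    for job in jobList:
        if job.jobStatus in ['failed', 'cancelled']:
            failedJobs.append(job)        # stored as-is via updateJobs(...)
        elif job.jobStatus == 'waiting':
            waitingJobs.append(job)       # kept via keepJobs(...)
        elif job.dispatchDBlock == 'NULL':
            activateJobs.append(job)      # nothing to stage in, activate directly
        else:
            job.jobStatus = 'assigned'
            updateJobs.append(job)        # waits for its dispatch dataset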
Code example #2
 def __init__(self,taskBuffer,jobs,logger,params,defaultMap):
     self.jobs = []
     self.jumboJobs = []
     # separate normal and jumbo jobs
     for tmpJob in jobs:
         if EventServiceUtils.isJumboJob(tmpJob):
             self.jumboJobs.append(tmpJob)
         else:
             self.jobs.append(tmpJob)
     self.taskBuffer = taskBuffer
     self.logger = logger
     # set named parameters
     for tmpKey,tmpVal in params.iteritems():
         setattr(self,tmpKey,tmpVal)
     # set defaults
     for tmpKey,tmpVal in defaultMap.iteritems():
         if not hasattr(self,tmpKey):
             setattr(self,tmpKey,tmpVal)
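The constructor applies explicitly passed parameters first and fills in defaults only for attributes that are still missing. A small stand-alone sketch of that pattern (the class name and keys below are illustrative):

    class ParamHolder(object):
        def __init__(self, params, defaultMap):
            # explicit parameters win
            for key, val in params.items():
                setattr(self, key, val)
            # defaults only fill in what was not set above
            for key, val in defaultMap.items():
                if not hasattr(self, key):
                    setattr(self, key, val)

    holder = ParamHolder({'ddmAttempt': 2}, {'ddmAttempt': 0, 'onlyTA': False})
    assert holder.ddmAttempt == 2 and holder.onlyTA is False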
Code example #3
 def _updateOutputs(self):
     # return if non-DQ2
     if self.pandaDDM or self.job.destinationSE == 'local':
         return 0
     # check files
     idMap = {}
     fileList = []
     subMap = {}        
     dsDestMap = {}
     for file in self.job.Files:
         if file.type == 'output' or file.type == 'log':
             # append to fileList
             fileList.append(file.lfn)
             # add only log file for failed jobs
             if self.jobStatus == 'failed' and file.type != 'log':
                 continue
             # add only log file for successful ES jobs
             if self.job.jobStatus == 'finished' and EventServiceUtils.isEventServiceJob(self.job) and file.type != 'log':
                 continue
             try:
                 # fsize
                 fsize = None
                 if not file.fsize in ['NULL','',0]:
                     try:
                         fsize = long(file.fsize)
                     except:
                         type, value, traceBack = sys.exc_info()
                         self.logger.error("%s : %s %s" % (self.jobID,type,value))
                 # append to map
                 if not idMap.has_key(file.destinationDBlock):
                     idMap[file.destinationDBlock] = []
                 fileAttrs = {'guid'     : file.GUID,
                              'lfn'      : file.lfn,
                              'size'     : fsize,
                              'checksum' : file.checksum}
                 # add SURLs if LFC registration is required
                 if self.useCentralLFC():
                     fileAttrs['surl'] = self.extraInfo['surl'][file.lfn]
                     if fileAttrs['surl'] == None:
                         raise TypeError,"{0} has SURL=None".format(file.lfn)
                     # get destination
                     if not dsDestMap.has_key(file.destinationDBlock):
                         if file.destinationDBlockToken in ['',None,'NULL']:
                             tmpDestList = [self.siteMapper.getSite(self.job.computingSite).ddm]
                         elif DataServiceUtils.getDestinationSE(file.destinationDBlockToken) != None and \
                                 self.siteMapper.getSite(self.job.computingSite).ddm == self.siteMapper.getSite(file.destinationSE).ddm:
                             tmpDestList = [DataServiceUtils.getDestinationSE(file.destinationDBlockToken)]
                         elif self.siteMapper.getSite(self.job.computingSite).cloud != self.job.cloud and \
                                 (not self.siteMapper.getSite(self.job.computingSite).ddm.endswith('PRODDISK')) and  \
                                 (not self.job.prodSourceLabel in ['user','panda']):
                             # T1 used as T2
                             tmpDestList = [self.siteMapper.getSite(self.job.computingSite).ddm]
                         else:
                             tmpDestList = []
                             tmpSeTokens = self.siteMapper.getSite(self.job.computingSite).setokens
                             for tmpDestToken in file.destinationDBlockToken.split(','):
                                 if tmpSeTokens.has_key(tmpDestToken):
                                     tmpDest = tmpSeTokens[tmpDestToken]
                                 else:
                                     tmpDest = self.siteMapper.getSite(self.job.computingSite).ddm
                                 if not tmpDest in tmpDestList:
                                     tmpDestList.append(tmpDest)
                         dsDestMap[file.destinationDBlock] = tmpDestList
                 # extra meta data
                 if self.ddmBackEnd == 'rucio':
                     if file.lfn in self.extraInfo['lbnr']:
                         fileAttrs['lumiblocknr'] = self.extraInfo['lbnr'][file.lfn]
                     if file.lfn in self.extraInfo['nevents']:
                         fileAttrs['events'] = self.extraInfo['nevents'][file.lfn]
                     elif self.extraInfo['nevents'] != {}:
                         fileAttrs['events'] = None
                     #if not file.jediTaskID in [0,None,'NULL']:
                     #    fileAttrs['task_id'] = file.jediTaskID
                     #fileAttrs['panda_id'] = file.PandaID
                 idMap[file.destinationDBlock].append(fileAttrs)
                 # for subscription
                 if self.job.prodSourceLabel in ['managed','test','software','rc_test','ptest','user','rucio_test'] and \
                        re.search('_sub\d+$',file.destinationDBlock) != None and (not self.addToTopOnly) and \
                        self.job.destinationSE != 'local':
                     if self.siteMapper == None:
                         self.logger.error("SiteMapper==None")
                     else:
                         # get dataset spec
                         if not self.datasetMap.has_key(file.destinationDBlock):
                             tmpDS = self.taskBuffer.queryDatasetWithMap({'name':file.destinationDBlock})
                             self.datasetMap[file.destinationDBlock] = tmpDS
                         # check if valid dataset        
                         if self.datasetMap[file.destinationDBlock] == None:
                             self.logger.error(": cannot find %s in DB" % file.destinationDBlock)
                         else:
                             if not self.datasetMap[file.destinationDBlock].status in ['defined']:
                                 # not a fresh dataset
                                 self.logger.debug(": subscription was already made for %s:%s" % \
                                               (self.datasetMap[file.destinationDBlock].status,
                                                file.destinationDBlock))
                             else:
                                 # get DQ2 IDs
                                 tmpSrcDDM = self.siteMapper.getSite(self.job.computingSite).ddm
                                 tmpSrcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.computingSite).se)
                                 if self.job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(file.destinationSE):
                                     # DQ2 ID was set by using --destSE for analysis job to transfer output
                                     tmpDstDDM = file.destinationSE
                                     tmpDstSEs = file.destinationSE
                                 else:
                                     if DataServiceUtils.getDestinationSE(file.destinationDBlockToken) != None:
                                         tmpDstDDM = DataServiceUtils.getDestinationSE(file.destinationDBlockToken)
                                     else:
                                         tmpDstDDM = self.siteMapper.getSite(file.destinationSE).ddm
                                     tmpDstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(file.destinationSE).se)
                                 # if src != dest or multi-token
                                 if (tmpSrcDDM != tmpDstDDM and tmpSrcSEs != tmpDstSEs) or \
                                    (tmpSrcDDM == tmpDstDDM and file.destinationDBlockToken.count(',') != 0):
                                     optSub = {'DATASET_COMPLETE_EVENT' : ['http://%s:%s/server/panda/datasetCompleted' % \
                                                                           (panda_config.pserverhosthttp,panda_config.pserverporthttp)]}
                                     # append
                                     if not subMap.has_key(file.destinationDBlock):
                                         subMap[file.destinationDBlock] = []
                                         # sources
                                         optSource = {}
                                         # set sources
                                         if file.destinationDBlockToken in ['NULL','']:
                                             # use default DQ2 ID as source
                                             optSource[tmpSrcDDM] = {'policy' : 0}
                                         else:
                                             # convert token to DQ2 ID
                                             dq2ID = tmpSrcDDM
                                             # use the first token's location as source for T1D1
                                             tmpSrcToken = file.destinationDBlockToken.split(',')[0]
                                             if self.siteMapper.getSite(self.job.computingSite).setokens.has_key(tmpSrcToken):
                                                 dq2ID = self.siteMapper.getSite(self.job.computingSite).setokens[tmpSrcToken]
                                             optSource[dq2ID] = {'policy' : 0}
                                         # T1 used as T2
                                         if self.siteMapper.getSite(self.job.computingSite).cloud != self.job.cloud and \
                                            (not tmpSrcDDM.endswith('PRODDISK')) and  \
                                            (not self.job.prodSourceLabel in ['user','panda']):
                                             # register both DATADISK and PRODDISK as source locations
                                             if self.siteMapper.getSite(self.job.computingSite).setokens.has_key('ATLASPRODDISK'):
                                                 dq2ID = self.siteMapper.getSite(self.job.computingSite).setokens['ATLASPRODDISK']
                                                 optSource[dq2ID] = {'policy' : 0}
                                             if not optSource.has_key(tmpSrcDDM):
                                                 optSource[tmpSrcDDM] = {'policy' : 0}
                                         # use another location when token is set
                                         if not file.destinationDBlockToken in ['NULL','']:
                                             tmpDQ2IDList = []
                                             tmpDstTokens = file.destinationDBlockToken.split(',')
                                             # remove the first one because it is already used as a location
                                             if tmpSrcDDM == tmpDstDDM:
                                                 tmpDstTokens = tmpDstTokens[1:]
                                             # loop over all tokens
                                             for idxToken,tmpDstToken in enumerate(tmpDstTokens):
                                                 dq2ID = tmpDstDDM
                                                 if self.siteMapper.getSite(file.destinationSE).setokens.has_key(tmpDstToken):
                                                     dq2ID = self.siteMapper.getSite(file.destinationSE).setokens[tmpDstToken]
                                                 # keep the first destination for multi-hop
                                                 if idxToken == 0:
                                                     firstDestDDM = dq2ID
                                                 else:
                                                     # use the first destination as source for T1D1
                                                     optSource = {}                                                        
                                                     optSource[firstDestDDM] = {'policy' : 0}
                                                 # remove looping subscription
                                                 if dq2ID == tmpSrcDDM:
                                                     continue
                                                 # avoid duplication
                                                 if not dq2ID in tmpDQ2IDList:
                                                     subMap[file.destinationDBlock].append((dq2ID,optSub,optSource))
                                         else:
                                             # use default DDM
                                             for dq2ID in tmpDstDDM.split(','):
                                                 subMap[file.destinationDBlock].append((dq2ID,optSub,optSource))
             except:
                 errStr = '%s %s' % sys.exc_info()[:2]
                 self.logger.error(errStr)
                 self.result.setFatal()
                 self.job.ddmErrorDiag = 'failed before adding files : ' + errStr
                 return 1
     # cleanup submap
     tmpKeys = subMap.keys()
     for tmpKey in tmpKeys:
         if subMap[tmpKey] == []:
             del subMap[tmpKey]
     # add data to original dataset
     for destinationDBlock in idMap.keys():
         origDBlock = None
         match = re.search('^(.+)_sub\d+$',destinationDBlock)
         if match != None:
             # add files to top-level datasets
             origDBlock = match.group(1)
             if not self.goToTransferring:
                 idMap[origDBlock] = idMap[destinationDBlock]
         # add files to top-level datasets only 
         if self.addToTopOnly or self.goToMerging:
             del idMap[destinationDBlock]
         # skip sub unless getting transferred
         if origDBlock != None:
             if not self.goToTransferring and not self.logTransferring \
                    and idMap.has_key(destinationDBlock):
                 del idMap[destinationDBlock]
     # print idMap
     self.logger.debug("idMap = %s" % idMap)
     self.logger.debug("subMap = %s" % subMap)
     self.logger.debug("dsDestMap = %s" % dsDestMap)
     self.logger.debug("extraInfo = %s" % str(self.extraInfo))
     # check consistency of destinationDBlock
     hasSub = False
     for destinationDBlock in idMap.keys():
         match = re.search('^(.+)_sub\d+$',destinationDBlock)
         if match != None:
             hasSub = True
             break
     if idMap != {} and self.goToTransferring and not hasSub:
         errStr = 'no sub datasets for transferring. destinationDBlock may be wrong'
         self.logger.error(errStr)
         self.result.setFatal()
         self.job.ddmErrorDiag = 'failed before adding files : ' + errStr
         return 1
     # add data
     self.logger.debug("addFiles start")
     # count the number of files
     regNumFiles = 0
     regFileList = []
     for tmpRegDS,tmpRegList in idMap.iteritems():
         for tmpRegItem in tmpRegList:
             if not tmpRegItem['lfn'] in regFileList:
                 regNumFiles += 1
                 regFileList.append(tmpRegItem['lfn'])
     # decompose idMap
     if not self.useCentralLFC():
         destIdMap = {None:idMap}
     else:
         destIdMap = self.decomposeIdMap(idMap,dsDestMap)          
     # add files
     nTry = 3
     for iTry in range(nTry):
         isFatal  = False
         isFailed = False
         regStart = datetime.datetime.utcnow()
         try:
             if not self.useCentralLFC():
                 regMsgStr = "DQ2 registraion for %s files " % regNumFiles                    
             else:
                 regMsgStr = "LFC+DQ2 registraion with backend={0} for {1} files ".format(self.ddmBackEnd,
                                                                                          regNumFiles)
             self.logger.debug('%s %s' % ('registerFilesInDatasets',str(destIdMap)))
             out = rucioAPI.registerFilesInDataset(destIdMap)
         except (DQ2.DQClosedDatasetException,
                 DQ2.DQFrozenDatasetException,
                 DQ2.DQUnknownDatasetException,
                 DQ2.DQDatasetExistsException,
                 DQ2.DQFileMetaDataMismatchException,
                 FileCatalogUnknownFactory,
                 FileCatalogException,
                 DataIdentifierNotFound,
                 RucioFileCatalogException,
                 FileConsistencyMismatch,
                 UnsupportedOperation,
                 exceptions.KeyError):
             # fatal errors
             errType,errValue = sys.exc_info()[:2]
             out = '%s : %s' % (errType,errValue)
             isFatal = True
             isFailed = True
         except:
             # unknown errors
             errType,errValue = sys.exc_info()[:2]
             out = '%s : %s' % (errType,errValue)
             isFatal = False
             isFailed = True                
         regTime = datetime.datetime.utcnow() - regStart
         self.logger.debug(regMsgStr + \
                               'took %s.%03d sec' % (regTime.seconds,regTime.microseconds/1000))
         # failed
         if isFailed or isFatal:
             self.logger.error('%s' % out)
             if (iTry+1) == nTry or isFatal:
                 self.job.ddmErrorCode = ErrorCode.EC_Adder
                 # extract important error string
                 extractedErrStr = DataServiceUtils.extractImportantError(out)
                 errMsg = "Could not add files to DDM: "
                 if extractedErrStr == '':
                     self.job.ddmErrorDiag = errMsg + out.split('\n')[-1]
                 else:
                     self.job.ddmErrorDiag = errMsg + extractedErrStr
                 if isFatal:
                     self.result.setFatal()
                 else:
                     self.result.setTemporary()
                 return 1
             self.logger.error("Try:%s" % iTry)
             # sleep
             time.sleep(10)                    
         else:
             self.logger.debug('%s' % str(out))
             break
     # register dataset subscription
     subActivity = 'Production Output'
     if not self.job.prodSourceLabel in ['user']:
         for tmpName,tmpVal in subMap.iteritems():
             for dq2ID,optSub,optSource in tmpVal:
                 if not self.goToMerging:
                     # make DQ2 subscription for prod jobs
                     self.logger.debug("%s %s %s" % ('registerDatasetSubscription',
                                                     (tmpName,dq2ID),
                                                     {'version':0,'archived':0,'callbacks':optSub,
                                                      'sources':optSource,'sources_policy':(001000 | 010000),
                                                      'wait_for_sources':0,'destination':None,'query_more_sources':0,
                                                      'sshare':"production",'group':None,'activity':subActivity,
                                                      'acl_alias':None,'replica_lifetime':"14 days"}))
                     for iDDMTry in range(3):
                         out = 'OK'
                         isFailed = False                        
                         try:
                             self.dq2api.registerDatasetSubscription(tmpName,dq2ID,version=0,archived=0,callbacks=optSub,
                                                                     sources=optSource,sources_policy=(001000 | 010000),
                                                                     wait_for_sources=0,destination=None,query_more_sources=0,
                                                                     sshare="production",group=None,activity=subActivity,
                                                                     acl_alias=None,replica_lifetime="14 days")
                         except DQ2.DQSubscriptionExistsException:
                             # harmless error
                             errType,errValue = sys.exc_info()[:2]
                             out = '%s : %s' % (errType,errValue)
                         except:
                             # unknown errors
                             errType,errValue = sys.exc_info()[:2]
                             out = '%s : %s' % (errType,errValue)
                             isFailed = True
                             if 'is not a Tiers of Atlas Destination' in str(errValue) or \
                                     'is not in Tiers of Atlas' in str(errValue) or \
                                     'RSE Expression resulted in an empty set' in str(errValue) or \
                                     'RSE excluded due to write blacklisting' in str(errValue) or \
                                     'used/quota' in str(errValue):
                                 # fatal error
                                 self.job.ddmErrorCode = ErrorCode.EC_Subscription
                             else:
                                 # retry for temporary errors
                                 time.sleep(10)
                         else:
                             break
                     if isFailed:
                         self.logger.error('%s' % out)
                         # extract important error string
                         extractedErrStr = DataServiceUtils.extractImportantError(out)
                         if self.job.ddmErrorCode == ErrorCode.EC_Subscription:
                             # fatal error
                             if extractedErrStr == '':
                                 self.job.ddmErrorDiag = "subscription failure with %s" % out
                             else:
                                 self.logger.error(extractedErrStr)
                                 self.job.ddmErrorDiag = "subscription failure with %s" % extractedErrStr
                             self.result.setFatal()
                         else:
                             # temporary errors
                             self.job.ddmErrorCode = ErrorCode.EC_Adder                
                             self.job.ddmErrorDiag = "could not register subscription : %s" % tmpName
                             self.result.setTemporary()
                         return 1
                     self.logger.debug('%s' % str(out))
                 else:
                     # register location
                     tmpDsNameLoc = re.sub('_sub\d+$','',tmpName)
                     for tmpLocName in optSource.keys():
                         self.logger.debug("%s %s %s %s" % ('registerDatasetLocation',tmpDsNameLoc,tmpLocName,
                                                            {'lifetime':"14 days"}))
                         for iDDMTry in range(3):
                             out = 'OK'
                             isFailed = False                        
                             try:                        
                                 self.dq2api.registerDatasetLocation(tmpDsNameLoc,tmpLocName,lifetime="14 days")
                             except DQ2.DQLocationExistsException:
                                 # harmless error
                                 errType,errValue = sys.exc_info()[:2]
                                 out = '%s : %s' % (errType,errValue)
                             except:
                                 # unknown errors
                                 errType,errValue = sys.exc_info()[:2]
                                 out = '%s : %s' % (errType,errValue)
                                 isFailed = True
                                 # retry for temporary errors
                                 time.sleep(10)
                             else:
                                 break
                         if isFailed:
                             self.logger.error('%s' % out)
                             if self.job.ddmErrorCode == ErrorCode.EC_Location:
                                 # fatal error
                                 self.job.ddmErrorDiag = "location registration failure with %s" % out
                                 self.result.setFatal()
                             else:
                                 # temporary errors
                                 self.job.ddmErrorCode = ErrorCode.EC_Adder                
                                 self.job.ddmErrorDiag = "could not register location : %s" % tmpDsNameLoc
                                 self.result.setTemporary()
                             return 1
                         self.logger.debug('%s' % str(out))
                 # set dataset status
                 self.datasetMap[tmpName].status = 'running'
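The routine above relies on the _subNNN suffix convention to map sub datasets back to their parent dataset when deciding what to register and what to subscribe. A minimal sketch of that mapping (the dataset name is made up):

    import re

    destinationDBlock = 'mc16_13TeV.987654.EVNT.e0000_sub0123456'  # hypothetical name
    match = re.search(r'^(.+)_sub\d+$', destinationDBlock)
    if match is not None:
        origDBlock = match.group(1)  # 'mc16_13TeV.987654.EVNT.e0000'
        # files attached to the _sub block are also added to the parent dataset,
        # unless the job only adds to the top-level dataset or goes to merging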
Code example #4
File: Watcher.py Project: wguanicedew/panda-server
    def run(self):
        try:
            while True:
                _logger.debug('%s start' % self.pandaID)
                # query job
                job = self.taskBuffer.peekJobs([self.pandaID],
                                               fromDefined=False,
                                               fromArchived=False,
                                               fromWaiting=False)[0]
                # check job status
                if job == None:
                    _logger.debug('%s escape : not found' % self.pandaID)
                    return
                if not job.jobStatus in [
                        'running', 'sent', 'starting', 'holding', 'stagein',
                        'stageout'
                ]:
                    if job.jobStatus == 'transferring' and job.prodSourceLabel in [
                            'user', 'panda'
                    ]:
                        pass
                    else:
                        _logger.debug('%s escape : %s' %
                                      (self.pandaID, job.jobStatus))
                        return
                # time limit
                timeLimit = datetime.datetime.utcnow() - datetime.timedelta(
                    minutes=self.sleepTime)
                if job.modificationTime < timeLimit or (
                        job.endTime != 'NULL' and job.endTime < timeLimit):
                    _logger.debug(
                        '%s %s lastmod:%s endtime:%s' %
                        (job.PandaID, job.jobStatus, str(
                            job.modificationTime), str(job.endTime)))
                    # retry ES merge jobs
                    if EventServiceUtils.isEventServiceMerge(job):
                        self.taskBuffer.retryJob(job.PandaID, {},
                                                 getNewPandaID=True,
                                                 attemptNr=job.attemptNr,
                                                 recoverableEsMerge=True)
                        # read back
                        job = self.taskBuffer.peekJobs([self.pandaID],
                                                       fromDefined=False,
                                                       fromArchived=False,
                                                       fromWaiting=False)[0]
                    destDBList = []
                    if job.jobStatus == 'sent':
                        # sent job didn't receive reply from pilot within 30 min
                        job.jobDispatcherErrorCode = ErrorCode.EC_SendError
                        job.jobDispatcherErrorDiag = "Sent job didn't receive reply from pilot within 30 min"
                    elif job.exeErrorDiag == 'NULL' and job.pilotErrorDiag == 'NULL':
                        # lost heartbeat
                        job.jobDispatcherErrorCode = ErrorCode.EC_Watcher
                        if job.jobDispatcherErrorDiag == 'NULL':
                            if job.endTime == 'NULL':
                                # normal lost heartbeat
                                job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(
                                    job.modificationTime)
                            else:
                                # job recovery failed
                                job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(
                                    job.endTime)
                                if job.jobStatus == 'transferring':
                                    job.jobDispatcherErrorDiag += ' in transferring'
                    else:
                        # job recovery failed
                        job.jobDispatcherErrorCode = ErrorCode.EC_Recovery
                        job.jobDispatcherErrorDiag = 'job recovery failed for %s hours' % (
                            self.sleepTime / 60)
                    # set job status
                    job.jobStatus = 'failed'
                    # set endTime for lost heartbeat
                    if job.endTime == 'NULL':
                        # normal lost heartbeat
                        job.endTime = job.modificationTime
                    # set files status
                    for file in job.Files:
                        if file.type == 'output' or file.type == 'log':
                            file.status = 'failed'
                            if not file.destinationDBlock in destDBList:
                                destDBList.append(file.destinationDBlock)
                    # event service
                    if EventServiceUtils.isEventServiceJob(job) and not EventServiceUtils.isJobCloningJob(job):
                        eventStat = self.taskBuffer.getEventStat(
                            job.jediTaskID, job.PandaID)
                        # set sub status when no successful events
                        if EventServiceUtils.ST_finished not in eventStat:
                            job.jobSubStatus = 'es_heartbeat'
                    # update job
                    self.taskBuffer.updateJobs([job], False)
                    # start closer
                    if job.jobStatus == 'failed':

                        source = 'jobDispatcherErrorCode'
                        error_code = job.jobDispatcherErrorCode
                        error_diag = job.jobDispatcherErrorDiag

                        try:
                            _logger.debug(
                                "Watcher will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(
                                self.taskBuffer, job.PandaID, source,
                                error_code, error_diag, job.attemptNr)
                            _logger.debug("apply_retrial_rules is back")
                        except Exception as e:
                            _logger.debug(
                                "apply_retrial_rules excepted and needs to be investigated (%s): %s"
                                % (e, traceback.format_exc()))

                        # updateJobs was successful and it failed a job with taskBufferErrorCode
                        try:

                            _logger.debug("Watcher.run will peek the job")
                            job_tmp = self.taskBuffer.peekJobs(
                                [job.PandaID],
                                fromDefined=False,
                                fromArchived=True,
                                fromWaiting=False)[0]
                            if job_tmp.taskBufferErrorCode:
                                source = 'taskBufferErrorCode'
                                error_code = job_tmp.taskBufferErrorCode
                                error_diag = job_tmp.taskBufferErrorDiag
                                _logger.debug(
                                    "Watcher.run 2 will call apply_retrial_rules"
                                )
                                retryModule.apply_retrial_rules(
                                    self.taskBuffer, job_tmp.PandaID, source,
                                    error_code, error_diag, job_tmp.attemptNr)
                                _logger.debug("apply_retrial_rules 2 is back")
                        except IndexError:
                            pass
                        except Exception as e:
                            _logger.error(
                                "apply_retrial_rules 2 excepted and needs to be investigated (%s): %s"
                                % (e, traceback.format_exc()))

                        cThr = Closer(self.taskBuffer, destDBList, job)
                        cThr.start()
                        cThr.join()
                    _logger.debug('%s end' % job.PandaID)
                    return
                # single action
                if self.single:
                    return
                # sleep
                time.sleep(60 * self.sleepTime)
        except:
            type, value, traceBack = sys.exc_info()
            _logger.error("run() : %s %s" % (type, value))
            return
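At the heart of the watcher is a simple timestamp comparison: a job whose last modification (or end time) is older than sleepTime minutes is declared a lost heartbeat and failed. A self-contained sketch of that check, with invented values:

    import datetime

    sleepTime = 120  # minutes; illustrative value, the real one comes from the caller
    timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=sleepTime)
    lastModification = datetime.datetime.utcnow() - datetime.timedelta(minutes=180)

    if lastModification < timeLimit:
        # untouched for longer than sleepTime: treated as a lost heartbeat,
        # jobDispatcherErrorCode is set to EC_Watcher and the job is failed
        print('lost heartbeat : %s' % lastModification)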
Code example #5
 def getJob(self, siteName, prodSourceLabel, cpu, mem, diskSpace, node,
            timeout, computingElement, atlasRelease, prodUserID,
            getProxyKey, countryGroup, workingGroup, allowOtherCountry,
            realDN, taskID, nJobs, acceptJson):
     jobs = []
     useGLEXEC = False
     useProxyCache = False
     try:
         tmpNumJobs = int(nJobs)
     except:
         tmpNumJobs = None
     if tmpNumJobs == None:
         tmpNumJobs = 1
     # wrapper function for timeout
     if hasattr(panda_config, 'global_shares') and panda_config.global_shares == True:
         tmpWrapper = _TimedMethod(self.taskBuffer.getJobsGShare, timeout)
     else:
         tmpWrapper = _TimedMethod(self.taskBuffer.getJobs, timeout)
     tmpWrapper.run(tmpNumJobs, siteName, prodSourceLabel, cpu, mem,
                    diskSpace, node, timeout, computingElement,
                    atlasRelease, prodUserID, getProxyKey, countryGroup,
                    workingGroup, allowOtherCountry, taskID)
     if isinstance(tmpWrapper.result, types.ListType):
         jobs = jobs + tmpWrapper.result
     # make response
     if len(jobs) > 0:
         proxyKey = jobs[-1]
         nSent = jobs[-2]
         jobs = jobs[:-2]
     if len(jobs) != 0:
         # succeed
         self.siteMapperCache.update()
         responseList = []
         # append Jobs
         for tmpJob in jobs:
             response = Protocol.Response(Protocol.SC_Success)
             response.appendJob(tmpJob, self.siteMapperCache)
             # append nSent
             response.appendNode('nSent', nSent)
             # set proxy key
             if getProxyKey:
                 response.setProxyKey(proxyKey)
             # check if glexec or proxy cache is used
             if hasattr(panda_config, 'useProxyCache') and panda_config.useProxyCache == True:
                 self.specialDispatchParams.update()
                 if not 'glexecSites' in self.specialDispatchParams:
                     glexecSites = {}
                 else:
                     glexecSites = self.specialDispatchParams['glexecSites']
                 if siteName in glexecSites:
                     if glexecSites[siteName] == 'True':
                         useGLEXEC = True
                     elif glexecSites[siteName] == 'test' and \
                             (prodSourceLabel in ['test','prod_test'] or \
                                  (tmpJob.processingType in ['gangarobot'])):
                         useGLEXEC = True
                 if not 'proxyCacheSites' in self.specialDispatchParams:
                     proxyCacheSites = {}
                 else:
                     proxyCacheSites = self.specialDispatchParams[
                         'proxyCacheSites']
                 if siteName in proxyCacheSites:
                     useProxyCache = True
             # set proxy
             if useGLEXEC or useProxyCache:
                 try:
                     #  get compact
                     compactDN = self.taskBuffer.cleanUserID(realDN)
                     # check permission
                     self.specialDispatchParams.update()
                     if not 'allowProxy' in self.specialDispatchParams:
                         allowProxy = []
                     else:
                         allowProxy = self.specialDispatchParams[
                             'allowProxy']
                     if not compactDN in allowProxy:
                         _logger.warning(
                             "getJob : %s %s '%s' no permission to retrive user proxy"
                             % (siteName, node, compactDN))
                     else:
                         if useProxyCache:
                             tmpStat, tmpOut = response.setUserProxy(
                                 proxyCacheSites[siteName]['dn'],
                                 proxyCacheSites[siteName]['role'])
                         else:
                             tmpStat, tmpOut = response.setUserProxy()
                         if not tmpStat:
                             _logger.warning(
                                 "getJob : %s %s failed to get user proxy : %s"
                                 % (siteName, node, tmpOut))
                 except:
                     errtype, errvalue = sys.exc_info()[:2]
                     _logger.warning(
                         "getJob : %s %s failed to get user proxy with %s:%s"
                         % (siteName, node, errtype.__name__, errvalue))
             # panda proxy
             if 'pandaProxySites' in self.specialDispatchParams and siteName in self.specialDispatchParams['pandaProxySites'] \
                     and (EventServiceUtils.isEventServiceJob(tmpJob) or EventServiceUtils.isEventServiceMerge(tmpJob)):
                 # get secret key
                 tmpSecretKey, tmpErrMsg = DispatcherUtils.getSecretKey(
                     tmpJob.PandaID)
                 if tmpSecretKey == None:
                     _logger.warning(
                         "getJob : PandaID=%s site=%s failed to get panda proxy secret key : %s"
                         % (tmpJob.PandaID, siteName, tmpErrMsg))
                 else:
                     # set secret key
                     _logger.debug("getJob : PandaID=%s key=%s" %
                                   (tmpJob.PandaID, tmpSecretKey))
                     response.setPandaProxySecretKey(tmpSecretKey)
             # add
             responseList.append(response.data)
         # make response for bulk
         if nJobs != None:
             response = Protocol.Response(Protocol.SC_Success)
             if not acceptJson:
                 response.appendNode('jobs', json.dumps(responseList))
             else:
                 response.appendNode('jobs', responseList)
     else:
         if tmpWrapper.result == Protocol.TimeOutToken:
             # timeout
             response = Protocol.Response(Protocol.SC_TimeOut)
         else:
             # no available jobs
             response = Protocol.Response(Protocol.SC_NoJobs)
             _pilotReqLogger.info('method=noJob,site=%s,node=%s,type=%s' %
                                  (siteName, node, prodSourceLabel))
     # return
     _logger.debug("getJob : %s %s useGLEXEC=%s ret -> %s" %
                   (siteName, node, useGLEXEC, response.encode(acceptJson)))
     return response.encode(acceptJson)
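The list returned by the taskBuffer getter is assumed here to carry the job specs followed by nSent and the proxy key, which is why the code above peels the last two elements off before looping. A tiny sketch with placeholder values:

    result = ['jobSpec1', 'jobSpec2', 3, 'proxy-key']  # placeholder layout
    jobs = list(result)
    if len(jobs) > 0:
        proxyKey = jobs[-1]  # last element: proxy key
        nSent = jobs[-2]     # second to last: number of jobs already sent
        jobs = jobs[:-2]     # the remaining entries are the actual job specs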
Code example #6
File: Setupper.py Project: PanDAWMS/panda-server
 def updateJobs(self,jobList,tmpLog):
     updateJobs   = []
     failedJobs   = []
     activateJobs = []
     waitingJobs  = []
     closeJobs  = []
     # sort out jobs
     for job in jobList:
         # failed jobs
         if job.jobStatus in ['failed','cancelled']:
             failedJobs.append(job)
         # waiting
         elif job.jobStatus == 'waiting':
             waitingJobs.append(job)
         # no input jobs
         elif job.dispatchDBlock=='NULL':
             activateJobs.append(job)
         # normal jobs
         else:
             # change status
             job.jobStatus = "assigned"
             updateJobs.append(job)
     # trigger merge generation if all events are done
     newActivateJobs = []
     nFinished = 0
     for job in activateJobs:
         if job.notDiscardEvents() and job.allOkEvents() and not EventServiceUtils.isEventServiceMerge(job):
             self.taskBuffer.activateJobs([job])
             # change status
             job.jobStatus = "finished"
             self.taskBuffer.updateJobs([job], False)
             nFinished += 1
         else:
             newActivateJobs.append(job)
     activateJobs = newActivateJobs
     tmpLog.debug('# of finished jobs in activated : {0}'.format(nFinished))
     newUpdateJobs = []
     nFinished = 0
     for job in updateJobs:
         if job.notDiscardEvents() and job.allOkEvents() and not EventServiceUtils.isEventServiceMerge(job):
             self.taskBuffer.updateJobs([job], True)
             # change status
             job.jobStatus = "finished"
             self.taskBuffer.updateJobs([job], True)
             nFinished += 1
         else:
             newUpdateJobs.append(job)
     updateJobs = newUpdateJobs
     tmpLog.debug('# of finished jobs in defined : {0}'.format(nFinished))
     # update DB
     tmpLog.debug('# of activated jobs : {0}'.format(len(activateJobs)))
     self.taskBuffer.activateJobs(activateJobs)
     tmpLog.debug('# of updated jobs : {0}'.format(len(updateJobs)))
     self.taskBuffer.updateJobs(updateJobs,True)
     tmpLog.debug('# of failed jobs : {0}'.format(len(failedJobs)))
     self.taskBuffer.updateJobs(failedJobs,True)
     tmpLog.debug('# of waiting jobs : {0}'.format(len(waitingJobs)))
     self.taskBuffer.keepJobs(waitingJobs)
     # delete local values
     del updateJobs
     del failedJobs
     del activateJobs
     del waitingJobs
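Both variants of updateJobs promote a job straight to finished when its event bookkeeping says every event range is done and nothing was discarded, as long as it is not an event-service merge job. A tiny sketch of that decision, with a hypothetical stand-in for the job spec:

    class FakeEsJob(object):
        # hypothetical stand-in exposing only the two bookkeeping checks
        jobStatus = 'activated'
        def notDiscardEvents(self):
            return True   # no event ranges were discarded
        def allOkEvents(self):
            return True   # every event range finished successfully

    job = FakeEsJob()
    isMergeJob = False  # EventServiceUtils.isEventServiceMerge(job) in the real code
    if job.notDiscardEvents() and job.allOkEvents() and not isMergeJob:
        job.jobStatus = 'finished'  # skip activation and trigger merge generation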
Code example #7
 def appendJob(self,job):
     # PandaID
     self.data['PandaID'] = job.PandaID
     # prodSourceLabel
     self.data['prodSourceLabel'] = job.prodSourceLabel
     # swRelease
     self.data['swRelease'] = job.AtlasRelease
     # homepackage
     self.data['homepackage'] = job.homepackage
     # transformation
     self.data['transformation'] = job.transformation
     # job name
     self.data['jobName'] = job.jobName
     # job definition ID
     self.data['jobDefinitionID'] = job.jobDefinitionID
     # cloud
     self.data['cloud'] = job.cloud
     # files
     strIFiles = ''
     strOFiles = ''
     strDispatch = ''
     strDisToken = ''
     strDisTokenForOutput = ''                
     strDestination = ''
     strRealDataset = ''
     strRealDatasetIn = ''
     strProdDBlock = ''
     strDestToken = ''
     strProdToken = ''
     strProdTokenForOutput = ''
     strGUID = ''
     strFSize = ''
     strCheckSum = ''
     strFileDestinationSE = ''
     strScopeIn  = ''
     strScopeOut = ''
     strScopeLog = ''        
     logFile = ''
     logGUID = ''        
     for file in job.Files:
         if file.type == 'input':
             if strIFiles != '':
                 strIFiles += ','
             strIFiles += file.lfn
             if strDispatch != '':
                 strDispatch += ','
             strDispatch += file.dispatchDBlock
             if strDisToken != '':
                 strDisToken += ','
             strDisToken += file.dispatchDBlockToken
             strProdDBlock += '%s,' % file.prodDBlock 
             if strProdToken != '':
                 strProdToken += ','
             strProdToken += file.prodDBlockToken
             if strGUID != '':
                 strGUID += ','
             strGUID += file.GUID
             strRealDatasetIn += '%s,' % file.dataset
             strFSize += '%s,' % file.fsize
             if not file.checksum in ['','NULL',None]:
                 strCheckSum += '%s,' % file.checksum
             else:
                 strCheckSum += '%s,' % file.md5sum
             strScopeIn += '%s,' % file.scope    
         if file.type == 'output' or file.type == 'log':
             if strOFiles != '':
                 strOFiles += ','
             strOFiles += file.lfn
             if strDestination != '':
                 strDestination += ','
             strDestination += file.destinationDBlock
             if strRealDataset != '':
                 strRealDataset += ','
             strRealDataset += file.dataset
             strFileDestinationSE += '%s,' % file.destinationSE
             if file.type == 'log':
                 logFile = file.lfn
                 logGUID = file.GUID
                 strScopeLog = file.scope
             else:
                 strScopeOut += '%s,' % file.scope                        
             if strDestToken != '':
                 strDestToken += ','
             strDestToken += file.destinationDBlockToken.split(',')[0]
             strDisTokenForOutput += '%s,' % file.dispatchDBlockToken
             strProdTokenForOutput += '%s,' % file.prodDBlockToken
     # inFiles
     self.data['inFiles'] = strIFiles
     # dispatch DBlock
     self.data['dispatchDblock'] = strDispatch
     # dispatch DBlock space token
     self.data['dispatchDBlockToken'] = strDisToken
     # dispatch DBlock space token for output
     self.data['dispatchDBlockTokenForOut'] = strDisTokenForOutput[:-1]
     # outFiles
     self.data['outFiles'] = strOFiles
     # destination DBlock
     self.data['destinationDblock'] = strDestination
     # destination DBlock space token
     self.data['destinationDBlockToken'] = strDestToken
     # prod DBlocks
     self.data['prodDBlocks'] = strProdDBlock[:-1]
     # prod DBlock space token
     self.data['prodDBlockToken'] = strProdToken
     # real output datasets
     self.data['realDatasets'] = strRealDataset
     # real output datasets
     self.data['realDatasetsIn'] = strRealDatasetIn[:-1]
     # file's destinationSE
     self.data['fileDestinationSE'] = strFileDestinationSE[:-1]
     # log filename
     self.data['logFile'] = logFile
     # log GUID
     self.data['logGUID'] = logGUID
     # jobPars
     self.data['jobPars'] = job.jobParameters
     # attempt number
     self.data['attemptNr'] = job.attemptNr
     # GUIDs
     self.data['GUID'] = strGUID
     # checksum
     self.data['checksum'] = strCheckSum[:-1]
     # fsize
     self.data['fsize'] = strFSize[:-1]
     # scope
     self.data['scopeIn']  = strScopeIn[:-1]
     self.data['scopeOut'] = strScopeOut[:-1]
     self.data['scopeLog'] = strScopeLog
     # destinationSE
     self.data['destinationSE'] = job.destinationSE
     # user ID
     self.data['prodUserID'] = job.prodUserID
     # CPU count
     self.data['maxCpuCount'] = job.maxCpuCount
     # RAM count
     self.data['minRamCount'] = job.minRamCount
     # disk count
     self.data['maxDiskCount'] = job.maxDiskCount
     # cmtconfig
     self.data['cmtConfig'] = job.cmtConfig
     # processingType
     self.data['processingType'] = job.processingType
     # transferType
     self.data['transferType'] = job.transferType
     # sourceSite
     self.data['sourceSite'] = job.sourceSite
     # current priority
     self.data['currentPriority'] = job.currentPriority
     # taskID
     if job.lockedby == 'jedi':
         self.data['taskID'] = job.jediTaskID
     else:
         self.data['taskID'] = job.taskID
     # core count
     self.data['coreCount'] = job.coreCount
     # jobsetID
     self.data['jobsetID'] = job.jobsetID
     # debug mode
     if job.specialHandling != None and 'debug' in job.specialHandling:
         self.data['debug'] = 'True'
     # event service
     if EventServiceUtils.isEventServiceJob(job):
         self.data['eventService'] = 'True'
         # prod DBlock space token for pre-merging output
         self.data['prodDBlockTokenForOutput'] = strProdTokenForOutput[:-1]
     # event service merge
     if EventServiceUtils.isEventServiceMerge(job):
         self.data['eventServiceMerge'] = 'True'
         # write to file
         writeToFileStr = ''
         try:
             for outputName,inputList in job.metadata.iteritems():
                 writeToFileStr += 'inputFor_{0}:'.format(outputName)
                 for tmpInput in inputList:
                     writeToFileStr += '{0},'.format(tmpInput)
                 writeToFileStr = writeToFileStr[:-1]
                 writeToFileStr += '^'
             writeToFileStr = writeToFileStr[:-1]
         except:
             pass
         self.data['writeToFile'] = writeToFileStr
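Most of the fields filled in above are flat, comma-separated strings built one file at a time; two joining idioms alternate throughout the method. A minimal sketch of both, with made-up LFNs:

    lfns = ['EVNT.01.pool.root.1', 'EVNT.02.pool.root.1']

    # idiom 1: prepend a comma only once the string is non-empty
    strIFiles = ''
    for lfn in lfns:
        if strIFiles != '':
            strIFiles += ','
        strIFiles += lfn

    # idiom 2: always append a trailing comma, then strip it with [:-1]
    strFSize = ''
    for fsize in [1024, 2048]:
        strFSize += '%s,' % fsize
    strFSize = strFSize[:-1]

    assert strIFiles == 'EVNT.01.pool.root.1,EVNT.02.pool.root.1'
    assert strFSize == '1024,2048'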
Code example #8
File: AdderGen.py Project: wguanicedew/panda-server
 def parseXML(self):
     # get LFN and GUID
     self.logger.debug('XML filename : %s' % self.xmlFile)
     # no outputs
     if self.job.Files == []:
         self.logger.debug("has no outputs")
         self.logger.debug("parseXML end")
         return 0
     # get input files
     inputLFNs = []
     for file in self.job.Files:
         if file.type == 'input':
             inputLFNs.append(file.lfn)
     # parse XML
     lfns = []
     guids = []
     fsizes = []
     md5sums = []
     chksums = []
     surls = []
     fullLfnMap = {}
     nEventsMap = {}
     try:
         root = xml.dom.minidom.parse(self.xmlFile)
         files = root.getElementsByTagName('File')
         for file in files:
             # get GUID
             guid = str(file.getAttribute('ID'))
             # get PFN and LFN nodes
             logical = file.getElementsByTagName('logical')[0]
             lfnNode = logical.getElementsByTagName('lfn')[0]
             # convert UTF8 to Raw
             lfn = str(lfnNode.getAttribute('name'))
             # get metadata
             fsize = None
             md5sum = None
             adler32 = None
             surl = None
             fullLFN = None
             for meta in file.getElementsByTagName('metadata'):
                 # get fsize
                 name = str(meta.getAttribute('att_name'))
                 if name == 'fsize':
                     fsize = long(meta.getAttribute('att_value'))
                 elif name == 'md5sum':
                     md5sum = str(meta.getAttribute('att_value'))
                     # check
                     if re.search("^[a-fA-F0-9]{32}$", md5sum) == None:
                         md5sum = None
                 elif name == 'adler32':
                     adler32 = str(meta.getAttribute('att_value'))
                 elif name == 'surl':
                     surl = str(meta.getAttribute('att_value'))
                 elif name == 'full_lfn':
                     fullLFN = str(meta.getAttribute('att_value'))
             # endpoints
             self.extraInfo['endpoint'][lfn] = []
             for epNode in file.getElementsByTagName('endpoint'):
                 self.extraInfo['endpoint'][lfn].append(
                     str(epNode.firstChild.data))
             # error check
             if (not lfn in inputLFNs) and (fsize == None or
                                            (md5sum == None
                                             and adler32 == None)):
                 if EventServiceUtils.isEventServiceMerge(self.job):
                     continue
                 else:
                     raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
             # append
             lfns.append(lfn)
             guids.append(guid)
             fsizes.append(fsize)
             md5sums.append(md5sum)
             surls.append(surl)
             if adler32 != None:
                 # use adler32 if available
                 chksums.append("ad:%s" % adler32)
             else:
                 chksums.append("md5:%s" % md5sum)
             if fullLFN != None:
                 fullLfnMap[lfn] = fullLFN
     except:
         # parse json
         try:
             import json
             with open(self.xmlFile) as tmpF:
                 jsonDict = json.load(tmpF)
                 for lfn, fileData in jsonDict.iteritems():
                     lfn = str(lfn)
                     fsize = None
                     md5sum = None
                     adler32 = None
                     surl = None
                     fullLFN = None
                     guid = str(fileData['guid'])
                     if 'fsize' in fileData:
                         fsize = long(fileData['fsize'])
                     if 'md5sum' in fileData:
                         md5sum = str(fileData['md5sum'])
                         # check
                         if re.search("^[a-fA-F0-9]{32}$", md5sum) == None:
                             md5sum = None
                     if 'adler32' in fileData:
                         adler32 = str(fileData['adler32'])
                     if 'surl' in fileData:
                         surl = str(fileData['surl'])
                     if 'full_lfn' in fileData:
                         fullLFN = str(fileData['full_lfn'])
                     # endpoints
                     self.extraInfo['endpoint'][lfn] = []
                     if 'endpoint' in fileData:
                         self.extraInfo['endpoint'][lfn] = fileData[
                             'endpoint']
                     # error check
                     if (not lfn in inputLFNs) and (fsize == None or
                                                    (md5sum == None
                                                     and adler32 == None)):
                         if EventServiceUtils.isEventServiceMerge(self.job):
                             continue
                         else:
                             raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
                     # append
                     lfns.append(lfn)
                     guids.append(guid)
                     fsizes.append(fsize)
                     md5sums.append(md5sum)
                     surls.append(surl)
                     if adler32 != None:
                         # use adler32 if available
                         chksums.append("ad:%s" % adler32)
                     else:
                         chksums.append("md5:%s" % md5sum)
                     if fullLFN != None:
                         fullLfnMap[lfn] = fullLFN
         except:
             # check if file exists
             if os.path.exists(self.xmlFile):
                 type, value, traceBack = sys.exc_info()
                 self.logger.error(": %s %s" % (type, value))
                 # set failed anyway
                 self.job.jobStatus = 'failed'
                 # XML error happens when pilot got killed due to wall-time limit or failures in wrapper
                 if (self.job.pilotErrorCode in [0,'0','NULL']) and \
                    (self.job.transExitCode  in [0,'0','NULL']):
                     self.job.ddmErrorCode = ErrorCode.EC_Adder
                     self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
                 return 2
             else:
                 # XML was deleted
                 return 1
     # parse metadata to get nEvents
     try:
         root = xml.dom.minidom.parseString(self.job.metadata)
         files = root.getElementsByTagName('File')
         for file in files:
             # get GUID
             guid = str(file.getAttribute('ID'))
             # get PFN and LFN nodes
             logical = file.getElementsByTagName('logical')[0]
             lfnNode = logical.getElementsByTagName('lfn')[0]
             # convert UTF8 to Raw
             lfn = str(lfnNode.getAttribute('name'))
             # get metadata
             nevents = None
             for meta in file.getElementsByTagName('metadata'):
                 # get fsize
                 name = str(meta.getAttribute('att_name'))
                 if name == 'events':
                     nevents = long(meta.getAttribute('att_value'))
                     nEventsMap[lfn] = nevents
                     break
     except:
         pass
     self.logger.debug('nEventsMap=%s' % str(nEventsMap))
     # parse json
     try:
         import json
         jsonDict = json.loads(self.job.metadata)
         for jsonFileItem in jsonDict['files']['output']:
             for jsonSubFileItem in jsonFileItem['subFiles']:
                 lfn = str(jsonSubFileItem['name'])
                 try:
                     nevents = long(jsonSubFileItem['nentries'])
                     nEventsMap[lfn] = nevents
                 except:
                     pass
     except:
         pass
     self.logger.debug('nEventsMapJson=%s' % str(nEventsMap))
     # get lumi block number
     lumiBlockNr = self.job.getLumiBlockNr()
     # copy files for variable number of outputs
     tmpStat = self.copyFilesForVariableNumOutputs(lfns)
     if not tmpStat:
         self.logger.error(
             "failed to copy files for variable number of outputs")
         return 2
     # check files
     fileList = []
     for file in self.job.Files:
         fileList.append(file.lfn)
         if file.type == 'input':
             if file.lfn in lfns:
                 if self.job.prodSourceLabel in ['user', 'panda']:
                     # skipped file
                     file.status = 'skipped'
                 elif self.job.prodSourceLabel in [
                         'managed', 'test', 'rc_test', 'ptest'
                 ]:
                     # failed by pilot
                     file.status = 'failed'
         elif file.type == 'output' or file.type == 'log':
             # add only log file for failed jobs
             if self.jobStatus == 'failed' and file.type != 'log':
                 file.status = 'failed'
                 continue
             # set failed if it is missing in XML
             if not file.lfn in lfns:
                 if self.job.jobStatus == 'finished' and \
                         (EventServiceUtils.isEventServiceJob(self.job) or EventServiceUtils.isJumboJob(self.job)):
                     # unset file status for ES jobs
                     pass
                 elif file.isAllowedNoOutput():
                     # allowed not to be produced
                     file.status = 'nooutput'
                     self.logger.debug('set {0} to status={1}'.format(
                         file.lfn, file.status))
                 else:
                     file.status = 'failed'
                     self.job.jobStatus = 'failed'
                     self.job.ddmErrorCode = ErrorCode.EC_Adder
                     self.job.ddmErrorDiag = "expected output {0} is missing in pilot XML".format(
                         file.lfn)
                     self.logger.error(self.job.ddmErrorDiag)
                 continue
             # look for GUID with LFN
             try:
                 i = lfns.index(file.lfn)
                 file.GUID = guids[i]
                 file.fsize = fsizes[i]
                 file.md5sum = md5sums[i]
                 file.checksum = chksums[i]
                 surl = surls[i]
                 # status
                 file.status = 'ready'
                 # change to full LFN
                 if fullLfnMap.has_key(file.lfn):
                     file.lfn = fullLfnMap[file.lfn]
                 # add SURL to extraInfo
                 self.extraInfo['surl'][file.lfn] = surl
                 # add nevents
                 if nEventsMap.has_key(file.lfn):
                     self.extraInfo['nevents'][file.lfn] = nEventsMap[
                         file.lfn]
             except:
                 # status
                 file.status = 'failed'
                 type, value, traceBack = sys.exc_info()
                 self.logger.error(": %s %s" % (type, value))
             # set lumi block number
             if lumiBlockNr != None and file.status != 'failed':
                 self.extraInfo['lbnr'][file.lfn] = lumiBlockNr
     # check consistency between XML and filesTable
     for lfn in lfns:
         if not lfn in fileList:
             self.logger.error("%s is not found in filesTable" % lfn)
             self.job.jobStatus = 'failed'
             for tmpFile in self.job.Files:
                 tmpFile.status = 'failed'
             self.job.ddmErrorCode = ErrorCode.EC_Adder
             self.job.ddmErrorDiag = "pilot produced {0} inconsistently with jobdef".format(
                 lfn)
             return 2
     # return
     self.logger.debug("parseXML end")
     return 0
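For reference, the parser above distinguishes three outcomes: 0 when the catalog was parsed and the file attributes were filled in, 1 when the XML catalog had already been deleted, and 2 when a fatal inconsistency was found and the job was marked failed. The run() examples below only proceed to the DDM interaction when the result is below 2. A minimal sketch of that dispatch, with parse_catalog and run_ddm_plugin as hypothetical stand-ins rather than panda-server APIs:

def handle_catalog(parse_catalog, run_ddm_plugin):
    # 0: catalog parsed, file attributes filled in
    # 1: catalog file already deleted
    # 2: fatal inconsistency, job already marked failed by the parser
    result = parse_catalog()
    if result < 2:
        run_ddm_plugin()
    return result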
Code example #9
File: AdderGen.py Project: wguanicedew/panda-server
    def run(self):
        try:
            self.logger.debug("new start: %s attemptNr=%s" %
                              (self.jobStatus, self.attemptNr))
            # lock XML
            self.lockXML = open(self.xmlFile)
            try:
                fcntl.flock(self.lockXML.fileno(),
                            fcntl.LOCK_EX | fcntl.LOCK_NB)
            except:
                self.logger.debug("cannot get lock : %s" % self.xmlFile)
                self.lockXML.close()
                # remove XML just in case for the final attempt
                if not self.ignoreTmpError:
                    try:
                        # remove Catalog
                        os.remove(self.xmlFile)
                    except:
                        pass
                return
            # check if file exists
            if not os.path.exists(self.xmlFile):
                self.logger.debug("not exist : %s" % self.xmlFile)
                try:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                    self.lockXML.close()
                except:
                    pass
                return
            # query job
            self.job = self.taskBuffer.peekJobs([self.jobID],
                                                fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
            # check if job has finished
            if self.job == None:
                self.logger.debug(': job not found in DB')
            elif self.job.jobStatus in [
                    'finished', 'failed', 'unknown', 'merging'
            ]:
                self.logger.error(': invalid state -> %s' % self.job.jobStatus)
            elif self.attemptNr != None and self.job.attemptNr != self.attemptNr:
                self.logger.error('wrong attemptNr -> job=%s <> %s' %
                                  (self.job.attemptNr, self.attemptNr))
            elif self.attemptNr is not None and self.job.jobStatus == 'transferring':
                errMsg = 'XML with attemptNr for {0}'.format(
                    self.job.jobStatus)
                self.logger.error(errMsg)
                # FIXME
                raise RuntimeError, errMsg
            elif self.jobStatus == EventServiceUtils.esRegStatus:
                # instantiate concrete plugin
                adderPluginClass = self.getPluginClass(self.job.VO)
                adderPlugin = adderPluginClass(self.job,
                                               taskBuffer=self.taskBuffer,
                                               siteMapper=self.siteMapper,
                                               logger=self.logger)
                # execute
                self.logger.debug('plugin is ready for ES file registration')
                adderPlugin.registerEventServiceFiles()
            else:
                # check file status in JEDI
                if not self.job.isCancelled(
                ) and not self.job.taskBufferErrorCode in [
                        taskbuffer.ErrorCode.EC_PilotRetried
                ]:
                    fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(
                        self.job)
                    self.logger.debug("check file status in JEDI : {0}".format(
                        fileCheckInJEDI))
                    if fileCheckInJEDI == None:
                        raise RuntimeError, 'failed to check file status in JEDI'
                    if fileCheckInJEDI == False:
                        # set job status to failed since some file status is wrong in JEDI
                        self.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        errStr = "inconsistent file status between Panda and JEDI. "
                        errStr += "failed to avoid duplicated processing caused by synchronization failure"
                        self.job.ddmErrorDiag = errStr
                        self.logger.debug(
                            "set jobStatus={0} since input is inconsistent between Panda and JEDI"
                            .format(self.jobStatus))
                    elif self.job.jobSubStatus in ['pilot_closed']:
                        # terminated by the pilot
                        self.logger.debug(
                            "going to closed since terminated by the pilot")
                        retClosed = self.taskBuffer.killJobs([self.jobID],
                                                             'pilot', '60',
                                                             True)
                        if retClosed[0] == True:
                            self.logger.debug("end")
                            try:
                                # remove Catalog
                                os.remove(self.xmlFile)
                            except:
                                pass
                            # unlock XML
                            if self.lockXML != None:
                                fcntl.flock(self.lockXML.fileno(),
                                            fcntl.LOCK_UN)
                                self.lockXML.close()
                            return
                    # check for cloned jobs
                    if EventServiceUtils.isJobCloningJob(self.job):
                        checkJC = self.taskBuffer.checkClonedJob(self.job)
                        if checkJC == None:
                            raise RuntimeError, 'failed to check the cloned job'
                        # failed to lock semaphore
                        if checkJC['lock'] == False:
                            self.jobStatus = 'failed'
                            self.job.ddmErrorCode = ErrorCode.EC_Adder
                            self.job.ddmErrorDiag = "failed to lock semaphore for job cloning"
                            self.logger.debug(
                                "set jobStatus={0} since did not get semaphore for job cloning"
                                .format(self.jobStatus))
                # use failed for cancelled/closed jobs
                if self.job.isCancelled():
                    self.jobStatus = 'failed'
                    # reset error codes to skip retrial module
                    self.job.pilotErrorCode = 0
                    self.job.exeErrorCode = 0
                    self.job.ddmErrorCode = 0
                # keep old status
                oldJobStatus = self.job.jobStatus
                # set job status
                if not self.job.jobStatus in ['transferring']:
                    self.job.jobStatus = self.jobStatus
                addResult = None
                adderPlugin = None
                # parse XML
                parseResult = self.parseXML()
                if parseResult < 2:
                    # interaction with DDM
                    try:
                        # instantiate concrete plugin
                        adderPluginClass = self.getPluginClass(self.job.VO)
                        adderPlugin = adderPluginClass(
                            self.job,
                            taskBuffer=self.taskBuffer,
                            siteMapper=self.siteMapper,
                            extraInfo=self.extraInfo,
                            logger=self.logger)
                        # execute
                        self.logger.debug('plugin is ready')
                        adderPlugin.execute()
                        addResult = adderPlugin.result
                        self.logger.debug('plugin done with %s' %
                                          (addResult.statusCode))
                    except:
                        errtype, errvalue = sys.exc_info()[:2]
                        self.logger.error(
                            "failed to execute AdderPlugin for VO={0} with {1}:{2}"
                            .format(self.job.VO, errtype, errvalue))
                        addResult = None
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "AdderPlugin failure"

                    # ignore temporary errors
                    if self.ignoreTmpError and addResult != None and addResult.isTemporary(
                    ):
                        self.logger.debug(': ignore %s ' %
                                          self.job.ddmErrorDiag)
                        self.logger.debug('escape')
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type, value))
                            self.logger.debug("cannot unlock XML")
                        return
                    # failed
                    if addResult == None or not addResult.isSucceeded():
                        self.job.jobStatus = 'failed'
                # set file status for failed jobs or failed transferring jobs
                self.logger.debug(
                    "status after plugin call :job.jobStatus=%s jobStatus=%s" %
                    (self.job.jobStatus, self.jobStatus))
                if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                    # First of all: check if job failed and in this case take first actions according to error table
                    source, error_code, error_diag = None, None, None
                    if self.job.pilotErrorCode:
                        source = 'pilotErrorCode'
                        error_code = self.job.pilotErrorCode
                        error_diag = self.job.pilotErrorDiag
                    elif self.job.exeErrorCode:
                        source = 'exeErrorCode'
                        error_code = self.job.exeErrorCode
                        error_diag = self.job.exeErrorDiag
                    elif self.job.ddmErrorCode:
                        source = 'ddmErrorCode'
                        error_code = self.job.ddmErrorCode
                        error_diag = self.job.ddmErrorDiag
                    elif self.job.transExitCode:
                        source = 'transExitCode'
                        error_code = self.job.transExitCode
                        error_diag = ''

                    # _logger.info("updatejob has source %s, error_code %s and error_diag %s"%(source, error_code, error_diag))

                    if source and error_code:
                        try:
                            self.logger.debug(
                                "AdderGen.run will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(
                                self.taskBuffer, self.job.PandaID, source,
                                error_code, error_diag, self.job.attemptNr)
                            self.logger.debug("apply_retrial_rules is back")
                        except Exception as e:
                            self.logger.error(
                                "apply_retrial_rules excepted and needs to be investigated (%s): %s"
                                % (e, traceback.format_exc()))

                    self.job.jobStatus = 'failed'
                    for file in self.job.Files:
                        if file.type in ['output', 'log']:
                            if addResult != None and file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                            else:
                                file.status = 'failed'
                else:
                    # reset errors
                    self.job.jobDispatcherErrorCode = 0
                    self.job.jobDispatcherErrorDiag = 'NULL'
                    # set status
                    if addResult != None and addResult.mergingFiles != []:
                        # set status for merging:
                        for file in self.job.Files:
                            if file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                        self.job.jobStatus = 'merging'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.gmtime())
                    elif addResult != None and addResult.transferringFiles != []:
                        # set status for transferring
                        for file in self.job.Files:
                            if file.lfn in addResult.transferringFiles:
                                file.status = 'transferring'
                        self.job.jobStatus = 'transferring'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.gmtime())
                    else:
                        self.job.jobStatus = 'finished'
                # endtime
                if self.job.endTime == 'NULL':
                    self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',
                                                     time.gmtime())
                # output size and # of outputs
                self.job.nOutputDataFiles = 0
                self.job.outputFileBytes = 0
                for tmpFile in self.job.Files:
                    if tmpFile.type == 'output':
                        self.job.nOutputDataFiles += 1
                        try:
                            self.job.outputFileBytes += tmpFile.fsize
                        except:
                            pass
                # protection
                maxOutputFileBytes = 99999999999
                if self.job.outputFileBytes > maxOutputFileBytes:
                    self.job.outputFileBytes = maxOutputFileBytes
                # set cancelled state
                if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                    self.job.jobStatus = 'cancelled'
                # update job
                if oldJobStatus in ['cancelled', 'closed']:
                    pass
                else:
                    self.logger.debug("updating DB")
                    retU = self.taskBuffer.updateJobs(
                        [self.job],
                        False,
                        oldJobStatusList=[oldJobStatus],
                        extraInfo=self.extraInfo)
                    self.logger.debug("retU: %s" % retU)
                    # failed
                    if not retU[0]:
                        self.logger.error(
                            'failed to update DB for pandaid={0}'.format(
                                self.job.PandaID))
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type, value))
                            self.logger.debug("cannot unlock XML")
                        return

                    try:
                        # updateJobs was successful and it failed a job with taskBufferErrorCode
                        self.logger.debug("AdderGen.run will peek the job")
                        job_tmp = self.taskBuffer.peekJobs(
                            [self.job.PandaID],
                            fromDefined=False,
                            fromArchived=True,
                            fromWaiting=False)[0]
                        self.logger.debug(
                            "status {0}, taskBufferErrorCode {1}, taskBufferErrorDiag {2}"
                            .format(job_tmp.jobStatus,
                                    job_tmp.taskBufferErrorCode,
                                    job_tmp.taskBufferErrorDiag))
                        if job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode:
                            source = 'taskBufferErrorCode'
                            error_code = job_tmp.taskBufferErrorCode
                            error_diag = job_tmp.taskBufferErrorDiag
                            self.logger.debug(
                                "AdderGen.run 2 will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(
                                self.taskBuffer, job_tmp.PandaID, source,
                                error_code, error_diag, job_tmp.attemptNr)
                            self.logger.debug("apply_retrial_rules 2 is back")
                    except IndexError:
                        pass
                    except Exception as e:
                        self.logger.error(
                            "apply_retrial_rules 2 excepted and needs to be investigated (%s): %s"
                            % (e, traceback.format_exc()))

                    # setup for closer
                    if not (EventServiceUtils.isEventServiceJob(self.job)
                            and self.job.isCancelled()):
                        destDBList = []
                        guidList = []
                        for file in self.job.Files:
                            # ignore inputs
                            if file.type == 'input':
                                continue
                            # skip pseudo datasets
                            if file.destinationDBlock in ['', None, 'NULL']:
                                continue
                            # start closer for output/log datasets
                            if not file.destinationDBlock in destDBList:
                                destDBList.append(file.destinationDBlock)
                            # collect GUIDs
                            if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['ptest','rc_test','rucio_test'] and \
                                                                      self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \
                                                                      and file.type == 'output':
                                # extract base LFN since LFN was changed to full LFN for CMS
                                baseLFN = file.lfn.split('/')[-1]
                                guidList.append({
                                    'lfn': baseLFN,
                                    'guid': file.GUID,
                                    'type': file.type,
                                    'checksum': file.checksum,
                                    'md5sum': file.md5sum,
                                    'fsize': file.fsize,
                                    'scope': file.scope
                                })
                        if guidList != []:
                            retG = self.taskBuffer.setGUIDs(guidList)
                        if destDBList != []:
                            # start Closer
                            if adderPlugin != None and hasattr(
                                    adderPlugin, 'datasetMap'
                            ) and adderPlugin.datasetMap != {}:
                                cThr = Closer.Closer(
                                    self.taskBuffer,
                                    destDBList,
                                    self.job,
                                    datasetMap=adderPlugin.datasetMap)
                            else:
                                cThr = Closer.Closer(self.taskBuffer,
                                                     destDBList, self.job)
                            self.logger.debug("start Closer")
                            cThr.start()
                            cThr.join()
                            self.logger.debug("end Closer")
                        # run closer for associated parallel jobs
                        if EventServiceUtils.isJobCloningJob(self.job):
                            assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(
                                self.job.jediTaskID, self.job.PandaID,
                                destDBList)
                            for assJobID, assDBlocks in assDBlockMap.iteritems(
                            ):
                                assJob = self.taskBuffer.peekJobs(
                                    [assJobID],
                                    fromDefined=False,
                                    fromArchived=False,
                                    fromWaiting=False,
                                    forAnal=True)[0]
                                if assJob == None:
                                    self.logger.debug(
                                        ': associated job PandaID={0} not found in DB'
                                        .format(assJobID))
                                else:
                                    cThr = Closer.Closer(
                                        self.taskBuffer, assDBlocks, assJob)
                                    self.logger.debug(
                                        "start Closer for PandaID={0}".format(
                                            assJobID))
                                    cThr.start()
                                    cThr.join()
                                    self.logger.debug(
                                        "end Closer for PandaID={0}".format(
                                            assJobID))
            self.logger.debug("end")
            try:
                # remove Catalog
                os.remove(self.xmlFile)
            except:
                pass
            # unlock XML
            if self.lockXML != None:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                self.lockXML.close()
        except:
            type, value, traceBack = sys.exc_info()
            errStr = ": %s %s " % (type, value)
            errStr += traceback.format_exc()
            self.logger.error(errStr)
            self.logger.error("except")
            # unlock XML just in case
            try:
                if self.lockXML != None:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
            except:
                type, value, traceBack = sys.exc_info()
                self.logger.error(": %s %s" % (type, value))
                self.logger.error("cannot unlock XML")
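The run() method above serializes work on a catalog file with a non-blocking fcntl lock: the file is opened, flock(LOCK_EX | LOCK_NB) is attempted, the worker backs off if another process already holds the lock, and the file is removed and unlocked once processing is done. Below is a minimal standalone sketch of the same pattern, assuming a Unix platform; process_with_lock and handler are hypothetical names, not panda-server code.

import fcntl
import os

def process_with_lock(path, handler):
    # the catalog file itself doubles as the lock
    try:
        lock_file = open(path)
    except IOError:
        return False  # file already gone
    try:
        # non-blocking exclusive lock; fails if another worker holds it
        fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
        lock_file.close()
        return False
    try:
        handler(path)
        os.remove(path)  # remove the catalog once fully processed
    finally:
        fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
        lock_file.close()
    return True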
Code example #10
File: Watcher.py Project: PanDAWMS/panda-server
    def run(self):
        try:
            while True:
                _logger.debug('%s start' % self.pandaID)
                # query job
                job = self.taskBuffer.peekJobs([self.pandaID],fromDefined=False,
                                               fromArchived=False,fromWaiting=False)[0]
                # check job status
                if job == None:
                    _logger.debug('%s escape : not found' % self.pandaID)
                    return
                _logger.debug('%s in %s' % (self.pandaID, job.jobStatus))
                if not job.jobStatus in ['running','sent','starting','holding',
                                         'stagein','stageout']:
                    if job.jobStatus == 'transferring' and (job.prodSourceLabel in ['user','panda'] or job.jobSubStatus not in [None, 'NULL', '']):
                        pass
                    else:
                        _logger.debug('%s escape : %s' % (self.pandaID,job.jobStatus))
                        return
                # time limit
                timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=self.sleepTime)
                if job.modificationTime < timeLimit or (job.endTime != 'NULL' and job.endTime < timeLimit):
                    _logger.debug('%s %s lastmod:%s endtime:%s' % (job.PandaID,job.jobStatus,
                                                                   str(job.modificationTime),
                                                                   str(job.endTime)))
                    destDBList = []
                    if job.jobStatus == 'sent':
                        # sent job didn't receive reply from pilot within 30 min
                        job.jobDispatcherErrorCode = ErrorCode.EC_SendError
                        job.jobDispatcherErrorDiag = "Sent job didn't receive reply from pilot within 30 min"
                    elif job.exeErrorDiag == 'NULL' and job.pilotErrorDiag == 'NULL':
                        # lost heartbeat
                        job.jobDispatcherErrorCode = ErrorCode.EC_Watcher
                        if job.jobDispatcherErrorDiag == 'NULL':
                            if job.endTime == 'NULL':
                                # normal lost heartbeat
                                job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.modificationTime)
                            else:
                                # job recovery failed
                                job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.endTime)
                                if job.jobStatus == 'transferring':
                                    job.jobDispatcherErrorDiag += ' in transferring'
                            # get worker
                            workerSpecs = self.taskBuffer.getWorkersForJob(job.PandaID)
                            if len(workerSpecs) > 0:
                                workerSpec = workerSpecs[0]
                                if workerSpec.status in ['finished', 'failed', 'cancelled', 'missed']:
                                    job.supErrorCode = SupErrors.error_codes['WORKER_ALREADY_DONE']
                                    job.supErrorDiag = 'worker already {0} at {1} with {2}'.format(workerSpec.status, str(workerSpec.endTime),
                                                                                                   workerSpec.diagMessage)
                                    job.supErrorDiag = JobSpec.truncateStringAttr('supErrorDiag', job.supErrorDiag)
                    else:
                        # job recovery failed
                        job.jobDispatcherErrorCode = ErrorCode.EC_Recovery
                        job.jobDispatcherErrorDiag = 'job recovery failed for %s hours' % (self.sleepTime/60)
                    # set job status
                    job.jobStatus = 'failed'
                    # set endTime for lost heartbeat
                    if job.endTime == 'NULL':
                        # normal lost heartbeat
                        job.endTime = job.modificationTime
                    # set files status
                    for file in job.Files:
                        if file.type == 'output' or file.type == 'log':
                            file.status = 'failed'
                            if not file.destinationDBlock in destDBList:
                                destDBList.append(file.destinationDBlock)
                    # event service
                    if EventServiceUtils.isEventServiceJob(job) and not EventServiceUtils.isJobCloningJob(job):
                        eventStat = self.taskBuffer.getEventStat(job.jediTaskID, job.PandaID)
                        # set sub status when no successful events
                        if EventServiceUtils.ST_finished not in eventStat:
                            job.jobSubStatus = 'es_heartbeat'
                    # update job
                    self.taskBuffer.updateJobs([job],False)
                    # start closer
                    if job.jobStatus == 'failed':

                        source = 'jobDispatcherErrorCode'
                        error_code = job.jobDispatcherErrorCode
                        error_diag = job.jobDispatcherErrorDiag

                        try:
                            _logger.debug("Watcher will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(self.taskBuffer, job.PandaID, source, error_code, error_diag, job.attemptNr)
                            _logger.debug("apply_retrial_rules is back")
                        except Exception as e:
                            _logger.debug("apply_retrial_rules excepted and needs to be investigated (%s): %s"%(e, traceback.format_exc()))

                        # updateJobs was successful and it failed a job with taskBufferErrorCode
                        try:

                            _logger.debug("Watcher.run will peek the job")
                            job_tmp = self.taskBuffer.peekJobs([job.PandaID], fromDefined=False, fromArchived=True,
                                                               fromWaiting=False)[0]
                            if job_tmp.taskBufferErrorCode:
                                source = 'taskBufferErrorCode'
                                error_code = job_tmp.taskBufferErrorCode
                                error_diag = job_tmp.taskBufferErrorDiag
                                _logger.debug("Watcher.run 2 will call apply_retrial_rules")
                                retryModule.apply_retrial_rules(self.taskBuffer, job_tmp.PandaID, source, error_code,
                                                                error_diag, job_tmp.attemptNr)
                                _logger.debug("apply_retrial_rules 2 is back")
                        except IndexError:
                            pass
                        except Exception as e:
                            _logger.error("apply_retrial_rules 2 excepted and needs to be investigated (%s): %s" % (e, traceback.format_exc()))

                        cThr = Closer(self.taskBuffer,destDBList,job)
                        cThr.start()
                        cThr.join()
                    _logger.debug('%s end' % job.PandaID)                        
                    return
                # single action
                if self.single:
                    return
                # sleep
                time.sleep(60*self.sleepTime)
        except:
            type, value, traceBack = sys.exc_info()
            _logger.error("run() : %s %s" % (type,value))
            return
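The watcher above declares a job lost when neither its modificationTime nor its endTime has been updated within the configured window of sleepTime minutes. That check reduces to a UTC timestamp comparison; a minimal sketch with plain datetimes follows, where is_heartbeat_lost is a hypothetical helper rather than a panda-server function.

import datetime

def is_heartbeat_lost(last_modification, last_end, timeout_minutes):
    # stale if the last modification is older than the window,
    # or a recorded end time is older than the window
    limit = datetime.datetime.utcnow() - datetime.timedelta(minutes=timeout_minutes)
    if last_modification < limit:
        return True
    return last_end is not None and last_end < limit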
Code example #11
 def execute(self):
     try:
         self.logger.debug("start plugin : %s" % self.jobStatus)
         # backend
         self.ddmBackEnd = self.job.getDdmBackEnd()
         if self.ddmBackEnd == None:
             self.ddmBackEnd = 'rucio'
         # instantiate DQ2
         if self.ddmBackEnd != None:
             self.dq2api = DQ2.DQ2(force_backend=self.ddmBackEnd)
         else:
             self.dq2api = DQ2.DQ2()
         self.logger.debug("ddm backend = {0}".format(self.ddmBackEnd))
         # add files only to top-level datasets for transferring jobs
         if self.job.jobStatus == 'transferring':
             self.addToTopOnly = True
             self.logger.debug("adder for transferring")
         # use PandaDDM for ddm jobs                                                                                                                
         if self.job.prodSourceLabel == 'ddm':
             self.pandaDDM = True
         # check if the job goes to merging
         if self.job.produceUnMerge():
             self.goToMerging = True
         # check if the job should go to transferring
         tmpSrcDDM = self.siteMapper.getSite(self.job.computingSite).ddm
         tmpSrcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.computingSite).se)
         destSEwasSet = False
         brokenSched = False
         if self.job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(self.job.destinationSE):
             # DQ2 ID was set by using --destSE for analysis job to transfer output
             destSEwasSet = True
             tmpDstDDM = self.job.destinationSE
             tmpDstSEs = self.job.destinationSE
         else:
             tmpDstDDM = self.siteMapper.getSite(self.job.destinationSE).ddm
             tmpDstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.destinationSE).se)
             # protection against disappearance of dest from schedconfig
             if not self.siteMapper.checkSite(self.job.destinationSE) and self.job.destinationSE != 'local':
                 self.job.ddmErrorCode = ErrorCode.EC_Adder
                 self.job.ddmErrorDiag = "destinationSE %s is unknown in schedconfig" % self.job.destinationSE
                 self.logger.error("%s" % self.job.ddmErrorDiag)
                 # set fatal error code and return
                 self.result.setFatal()
                 return 
         # protection against disappearance of src from schedconfig        
         if not self.siteMapper.checkSite(self.job.computingSite):
             self.job.ddmErrorCode = ErrorCode.EC_Adder
             self.job.ddmErrorDiag = "computingSite %s is unknown in schedconfig" % self.job.computingSite
             self.logger.error("%s" % self.job.ddmErrorDiag)
             # set fatal error code and return
             self.result.setFatal()
             return
         self.logger.debug('DDM src:%s dst:%s' % (tmpSrcDDM,tmpDstDDM))
         self.logger.debug('SE src:%s dst:%s' % (tmpSrcSEs,tmpDstSEs))
         if re.search('^ANALY_',self.job.computingSite) != None:
             # analysis site
             pass
         elif self.job.computingSite == self.job.destinationSE:
             # same site ID for computingSite and destinationSE
             pass
         elif tmpSrcDDM == tmpDstDDM:
             # same DQ2ID for src/dest
             pass
         elif tmpSrcSEs == tmpDstSEs:
             # same SEs
             pass
         elif self.addToTopOnly:
             # already in transferring
             pass
         elif self.goToMerging:
             # no transferring for merging
             pass
         elif self.job.jobStatus == 'failed':
             # failed jobs
             if self.job.prodSourceLabel in ['managed','test']:
                 self.logTransferring = True
         elif self.job.jobStatus == 'finished' and EventServiceUtils.isEventServiceJob(self.job):
             # transfer only log file for ES jobs 
             self.logTransferring = True
         else:
             self.goToTransferring = True
         self.logger.debug('goToTransferring=%s' % self.goToTransferring)
         self.logger.debug('logTransferring=%s' % self.logTransferring)
         self.logger.debug('goToMerging=%s' % self.goToMerging)
         retOut = self._updateOutputs()
         self.logger.debug('added outputs with %s' % retOut)
         if retOut != 0:
             self.logger.debug('terminated when adding')
             return
         # remove unmerged
         if self.job.processingType == 'usermerge' and self.job.prodSourceLabel == 'user' and \
                self.jobStatus == 'finished' and self.job.ddmErrorDiag == 'NULL':
             retMerge = self._removeUnmerged()
             # failed
             if not retMerge:
                 self.logger.debug('terminated when removing unmerged')
                 return
         # succeeded    
         self.result.setSucceeded()    
         self.logger.debug("end plugin")
     except:
         type, value, traceBack = sys.exc_info()
         self.logger.debug(": %s %s" % (type,value))
         # set fatal error code
         self.result.setFatal()
     # return
     return
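execute() above decides between no transfer, log-only transfer, and full output transfer by walking a ladder of conditions on the site, the storage endpoints, and the job status. A condensed sketch of that ladder with plain values instead of JobSpec/SiteMapper objects; decide_transfer_mode is a hypothetical helper and only the branches visible above are reproduced.

def decide_transfer_mode(computing_site, destination_se, src_ddm, dst_ddm,
                         src_ses, dst_ses, job_status, prod_source_label,
                         is_es_job, add_to_top_only=False, go_to_merging=False):
    # returns 'none', 'log-only' or 'transferring', mirroring the
    # goToTransferring / logTransferring flags set in execute() above
    if computing_site.startswith('ANALY_'):
        return 'none'                 # analysis site
    if computing_site == destination_se:
        return 'none'                 # same site for compute and destination
    if src_ddm == dst_ddm or src_ses == dst_ses:
        return 'none'                 # same storage on both ends
    if add_to_top_only or go_to_merging:
        return 'none'                 # already transferring, or going to merging
    if job_status == 'failed':
        return 'log-only' if prod_source_label in ['managed', 'test'] else 'none'
    if job_status == 'finished' and is_es_job:
        return 'log-only'
    return 'transferring'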
Code example #12
File: AdderGen.py Project: PanDAWMS/panda-server
    def run(self):
        try:
            self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus,self.attemptNr))
            # lock XML
            self.lockXML = open(self.xmlFile)
            try:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB)
            except:
                self.logger.debug("cannot get lock : %s" % self.xmlFile)
                self.lockXML.close()
                # remove XML just in case for the final attempt
                if not self.ignoreTmpError:
                    try:
                        # remove Catalog
                        os.remove(self.xmlFile)
                    except:
                        pass
                return
            # check if file exists
            if not os.path.exists(self.xmlFile):
                self.logger.debug("not exist : %s" % self.xmlFile)
                try:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                    self.lockXML.close()
                except:
                    pass
                return
            # query job
            self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
            # check if job has finished
            if self.job == None:
                self.logger.debug(': job not found in DB')
            elif self.job.jobStatus in ['finished','failed','unknown','merging']:
                self.logger.error(': invalid state -> %s' % self.job.jobStatus)
            elif self.attemptNr != None and self.job.attemptNr != self.attemptNr:
                self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr,self.attemptNr))
            elif self.attemptNr is not None and self.job.jobStatus == 'transferring':
                errMsg = 'XML with attemptNr for {0}'.format(self.job.jobStatus)
                self.logger.error(errMsg)
                # FIXME
                raise RuntimeError, errMsg
            elif self.jobStatus == EventServiceUtils.esRegStatus:
                # instantiate concrete plugin
                adderPluginClass = self.getPluginClass(self.job.VO)
                adderPlugin = adderPluginClass(self.job,
                                               taskBuffer=self.taskBuffer,
                                               siteMapper=self.siteMapper,
                                               logger=self.logger)
                # execute
                self.logger.debug('plugin is ready for ES file registration')
                adderPlugin.registerEventServiceFiles()
            else:
                # check file status in JEDI
                if not self.job.isCancelled() and not self.job.taskBufferErrorCode in [taskbuffer.ErrorCode.EC_PilotRetried]:
                    fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job)
                    self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI))                
                    if fileCheckInJEDI == None:
                        raise RuntimeError,'failed to check file status in JEDI'
                    if fileCheckInJEDI == False:
                        # set job status to failed since some file status is wrong in JEDI 
                        self.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        errStr = "inconsistent file status between Panda and JEDI. "
                        errStr += "failed to avoid duplicated processing caused by synchronization failure"
                        self.job.ddmErrorDiag = errStr
                        self.logger.debug("set jobStatus={0} since input is inconsistent between Panda and JEDI".format(self.jobStatus))
                    elif self.job.jobSubStatus in ['pilot_closed']:
                        # terminated by the pilot
                        self.logger.debug("going to closed since terminated by the pilot")
                        retClosed = self.taskBuffer.killJobs([self.jobID],'pilot','60',True)
                        if retClosed[0] == True:
                            self.logger.debug("end")
                            try:
                                # remove Catalog
                                os.remove(self.xmlFile)
                            except:
                                pass
                            # unlock XML
                            if self.lockXML != None:
                                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                                self.lockXML.close()
                            return
                    # check for cloned jobs
                    if EventServiceUtils.isJobCloningJob(self.job):
                        checkJC = self.taskBuffer.checkClonedJob(self.job)
                        if checkJC == None:
                            raise RuntimeError,'failed to check the cloned job'
                        # failed to lock semaphore
                        if checkJC['lock'] == False:
                            self.jobStatus = 'failed'
                            self.job.ddmErrorCode = ErrorCode.EC_Adder
                            self.job.ddmErrorDiag = "failed to lock semaphore for job cloning"
                            self.logger.debug("set jobStatus={0} since did not get semaphore for job cloning".format(self.jobStatus))
                # use failed for cancelled/closed jobs
                if self.job.isCancelled():
                    self.jobStatus = 'failed'
                    # reset error codes to skip retrial module
                    self.job.pilotErrorCode = 0
                    self.job.exeErrorCode = 0
                    self.job.ddmErrorCode = 0
                # keep old status
                oldJobStatus = self.job.jobStatus
                # set job status
                if not self.job.jobStatus in ['transferring']:
                    self.job.jobStatus = self.jobStatus
                addResult = None
                adderPlugin = None
                # parse XML
                parseResult = self.parseXML()
                if parseResult < 2:
                    # interaction with DDM
                    try:
                        # instantiate concrete plugin
                        adderPluginClass = self.getPluginClass(self.job.VO)
                        adderPlugin = adderPluginClass(self.job,
                                                       taskBuffer=self.taskBuffer,
                                                       siteMapper=self.siteMapper,
                                                       extraInfo=self.extraInfo,
                                                       logger=self.logger)
                        # execute
                        self.logger.debug('plugin is ready')
                        adderPlugin.execute()
                        addResult = adderPlugin.result
                        self.logger.debug('plugin done with %s' % (addResult.statusCode))
                    except:
                        errtype,errvalue = sys.exc_info()[:2]
                        self.logger.error("failed to execute AdderPlugin for VO={0} with {1}:{2}".format(self.job.VO,
                                                                                                         errtype,
                                                                                                         errvalue)) 
                        addResult = None
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "AdderPlugin failure"
                        
                    # ignore temporary errors
                    if self.ignoreTmpError and addResult != None and addResult.isTemporary():
                        self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag)
                        self.logger.debug('escape')
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type,value))
                            self.logger.debug("cannot unlock XML")
                        return
                    # failed
                    if addResult == None or not addResult.isSucceeded():
                        self.job.jobStatus = 'failed'
                # set file status for failed jobs or failed transferring jobs
                self.logger.debug("status after plugin call :job.jobStatus=%s jobStatus=%s" % (self.job.jobStatus, self.jobStatus))
                if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                    # First of all: check if job failed and in this case take first actions according to error table
                    source, error_code, error_diag = None, None, None
                    if self.job.pilotErrorCode:
                        source = 'pilotErrorCode'
                        error_code = self.job.pilotErrorCode
                        error_diag = self.job.pilotErrorDiag
                    elif self.job.exeErrorCode:
                        source = 'exeErrorCode'
                        error_code = self.job.exeErrorCode
                        error_diag = self.job.exeErrorDiag
                    elif self.job.ddmErrorCode:
                        source = 'ddmErrorCode'
                        error_code = self.job.ddmErrorCode
                        error_diag = self.job.ddmErrorDiag
                    elif self.job.transExitCode:
                        source = 'transExitCode'
                        error_code = self.job.transExitCode
                        error_diag = ''
            
                    # _logger.info("updatejob has source %s, error_code %s and error_diag %s"%(source, error_code, error_diag))
                    
                    if source and error_code:
                        try:
                            self.logger.debug("AdderGen.run will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(self.taskBuffer, self.job.PandaID, source, error_code, error_diag, self.job.attemptNr)
                            self.logger.debug("apply_retrial_rules is back")
                        except Exception as e:
                            self.logger.error("apply_retrial_rules excepted and needs to be investigated (%s): %s"%(e, traceback.format_exc()))
                    
                    self.job.jobStatus = 'failed'
                    for file in self.job.Files:
                        if file.type in ['output','log']:
                            if addResult != None and file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                            else:
                                file.status = 'failed'
                else:
                    # reset errors
                    self.job.jobDispatcherErrorCode = 0
                    self.job.jobDispatcherErrorDiag = 'NULL'
                    # set status
                    if addResult != None and addResult.mergingFiles != []:
                        # set status for merging:                        
                        for file in self.job.Files:
                            if file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                        self.job.jobStatus = 'merging'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                    elif addResult != None and addResult.transferringFiles != []:
                        # set status for transferring
                        for file in self.job.Files:
                            if file.lfn in addResult.transferringFiles:
                                file.status = 'transferring'
                        self.job.jobStatus = 'transferring'
                        self.job.jobSubStatus = None
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                    else:
                        self.job.jobStatus = 'finished'
                # endtime
                if self.job.endTime=='NULL':
                    self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                # output size and # of outputs
                self.job.nOutputDataFiles = 0
                self.job.outputFileBytes = 0
                for tmpFile in self.job.Files:
                    if tmpFile.type == 'output':
                        self.job.nOutputDataFiles += 1
                        try:
                            self.job.outputFileBytes += tmpFile.fsize
                        except:
                            pass
                # protection
                maxOutputFileBytes = 99999999999
                if self.job.outputFileBytes > maxOutputFileBytes:
                    self.job.outputFileBytes = maxOutputFileBytes
                # set cancelled state
                if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                    self.job.jobStatus = 'cancelled'
                # update job
                if oldJobStatus in ['cancelled','closed']:
                    pass
                else:
                    self.logger.debug("updating DB")
                    retU = self.taskBuffer.updateJobs([self.job],False,oldJobStatusList=[oldJobStatus],
                                                      extraInfo=self.extraInfo)
                    self.logger.debug("retU: %s" % retU)
                    # failed
                    if not retU[0]:
                        self.logger.error('failed to update DB for pandaid={0}'.format(self.job.PandaID))
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()                            
                        except:
                            type, value, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (type,value))
                            self.logger.debug("cannot unlock XML")
                        return

                    try:
                        # updateJobs was successful and it failed a job with taskBufferErrorCode
                        self.logger.debug("AdderGen.run will peek the job")
                        job_tmp = self.taskBuffer.peekJobs([self.job.PandaID], fromDefined=False, fromArchived=True,
                                                           fromWaiting=False)[0]
                        self.logger.debug("status {0}, taskBufferErrorCode {1}, taskBufferErrorDiag {2}".format(job_tmp.jobStatus,
                                                                                                                job_tmp.taskBufferErrorCode,
                                                                                                                job_tmp.taskBufferErrorDiag))
                        if job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode:
                            source = 'taskBufferErrorCode'
                            error_code = job_tmp.taskBufferErrorCode
                            error_diag = job_tmp.taskBufferErrorDiag
                            self.logger.debug("AdderGen.run 2 will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(self.taskBuffer, job_tmp.PandaID, source, error_code,
                                                            error_diag, job_tmp.attemptNr)
                            self.logger.debug("apply_retrial_rules 2 is back")
                    except IndexError:
                        pass
                    except Exception as e:
                        self.logger.error("apply_retrial_rules 2 excepted and needs to be investigated (%s): %s" % (e, traceback.format_exc()))

                    # setup for closer
                    if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.isCancelled()):
                        destDBList = []
                        guidList = []
                        for file in self.job.Files:
                            # ignore inputs
                            if file.type == 'input':
                                continue
                            # skip pseudo datasets
                            if file.destinationDBlock in ['',None,'NULL']:
                                continue
                            # start closer for output/log datasets
                            if not file.destinationDBlock in destDBList:
                                destDBList.append(file.destinationDBlock)
                            # collect GUIDs
                            if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['rucio_test'] + JobUtils.list_ptest_prod_sources and \
                                                                      self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \
                                                                      and file.type == 'output':
                                # extract base LFN since LFN was changed to full LFN for CMS
                                baseLFN = file.lfn.split('/')[-1]
                                guidList.append({'lfn':baseLFN,'guid':file.GUID,'type':file.type,
                                                 'checksum':file.checksum,'md5sum':file.md5sum,
                                                 'fsize':file.fsize,'scope':file.scope})
                        if guidList != []:
                            retG = self.taskBuffer.setGUIDs(guidList)
                        if destDBList != []:
                            # start Closer
                            if adderPlugin != None and hasattr(adderPlugin,'datasetMap') and adderPlugin.datasetMap != {}:
                                cThr = Closer.Closer(self.taskBuffer,destDBList,self.job,datasetMap=adderPlugin.datasetMap)
                            else:
                                cThr = Closer.Closer(self.taskBuffer,destDBList,self.job)
                            self.logger.debug("start Closer")
                            cThr.start()
                            cThr.join()
                            self.logger.debug("end Closer")
                        # run closer for associated parallel jobs
                        if EventServiceUtils.isJobCloningJob(self.job):
                            assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(self.job.jediTaskID,self.job.PandaID,
                                                                                            destDBList)
                            for assJobID,assDBlocks in assDBlockMap.iteritems():
                                assJob = self.taskBuffer.peekJobs([assJobID],fromDefined=False,
                                                                  fromArchived=False,
                                                                  fromWaiting=False,
                                                                  forAnal=True)[0]
                                if assJob == None:
                                    self.logger.debug(': associated job PandaID={0} not found in DB'.format(assJobID))
                                else:
                                    cThr = Closer.Closer(self.taskBuffer,assDBlocks,assJob)
                                    self.logger.debug("start Closer for PandaID={0}".format(assJobID))
                                    cThr.start()
                                    cThr.join()
                                    self.logger.debug("end Closer for PandaID={0}".format(assJobID))
            self.logger.debug("end")
            try:
                # remove Catalog
                os.remove(self.xmlFile)
            except:
                pass
            # unlock XML
            if self.lockXML != None:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                self.lockXML.close()            
        except:
            type, value, traceBack = sys.exc_info()
            errStr = ": %s %s " % (type,value)
            errStr += traceback.format_exc()
            self.logger.error(errStr)
            self.logger.error("except")
            # unlock XML just in case
            try:
                if self.lockXML != None:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
            except:
                type, value, traceBack = sys.exc_info()
                self.logger.error(": %s %s" % (type,value))
                self.logger.error("cannot unlock XML")
Code example #13
0
 def appendJob(self, job, siteMapperCache=None):
     # event service merge
     if EventServiceUtils.isEventServiceMerge(job):
         isEventServiceMerge = True
     else:
         isEventServiceMerge = False
     # PandaID
     self.data['PandaID'] = job.PandaID
     # prodSourceLabel
     self.data['prodSourceLabel'] = job.prodSourceLabel
     # swRelease
     self.data['swRelease'] = job.AtlasRelease
     # homepackage
     self.data['homepackage'] = job.homepackage
     # transformation
     self.data['transformation'] = job.transformation
     # job name
     self.data['jobName'] = job.jobName
     # job definition ID
     self.data['jobDefinitionID'] = job.jobDefinitionID
     # cloud
     self.data['cloud'] = job.cloud
     # files
     strIFiles = ''
     strOFiles = ''
     strDispatch = ''
     strDisToken = ''
     strDisTokenForOutput = ''
     strDestination = ''
     strRealDataset = ''
     strRealDatasetIn = ''
     strProdDBlock = ''
     strDestToken = ''
     strProdToken = ''
     strProdTokenForOutput = ''
     strGUID = ''
     strFSize = ''
     strCheckSum = ''
     strFileDestinationSE = ''
     strScopeIn = ''
     strScopeOut = ''
     strScopeLog = ''
     logFile = ''
     logGUID = ''
     ddmEndPointIn = []
     ddmEndPointOut = []
     noOutput = []
     siteSpec = None
     inDsLfnMap = {}
     if siteMapperCache != None:
         siteMapper = siteMapperCache.getObj()
         siteSpec = siteMapper.getSite(job.computingSite)
         # resolve destSE
         try:
             job.destinationSE = siteMapper.resolveNucleus(
                 job.destinationSE)
             for tmpFile in job.Files:
                 tmpFile.destinationSE = siteMapper.resolveNucleus(
                     tmpFile.destinationSE)
         except:
             pass
         siteMapperCache.releaseObj()
     for file in job.Files:
         if file.type == 'input':
             if strIFiles != '':
                 strIFiles += ','
             strIFiles += file.lfn
             if strDispatch != '':
                 strDispatch += ','
             strDispatch += file.dispatchDBlock
             if strDisToken != '':
                 strDisToken += ','
             strDisToken += file.dispatchDBlockToken
             strProdDBlock += '%s,' % file.prodDBlock
             if not isEventServiceMerge:
                 strProdToken += '%s,' % file.prodDBlockToken
             else:
                 strProdToken += '%s,' % job.metadata[1][file.lfn]
             if strGUID != '':
                 strGUID += ','
             strGUID += file.GUID
             strRealDatasetIn += '%s,' % file.dataset
             strFSize += '%s,' % file.fsize
             if not file.checksum in ['', 'NULL', None]:
                 strCheckSum += '%s,' % file.checksum
             else:
                 strCheckSum += '%s,' % file.md5sum
             strScopeIn += '%s,' % file.scope
             ddmEndPointIn.append(
                 self.getDdmEndpoint(siteSpec, file.dispatchDBlockToken))
             if not file.dataset in inDsLfnMap:
                 inDsLfnMap[file.dataset] = []
             inDsLfnMap[file.dataset].append(file.lfn)
         if file.type == 'output' or file.type == 'log':
             if strOFiles != '':
                 strOFiles += ','
             strOFiles += file.lfn
             if strDestination != '':
                 strDestination += ','
             strDestination += file.destinationDBlock
             if strRealDataset != '':
                 strRealDataset += ','
             strRealDataset += file.dataset
             strFileDestinationSE += '%s,' % file.destinationSE
             if file.type == 'log':
                 logFile = file.lfn
                 logGUID = file.GUID
                 strScopeLog = file.scope
             else:
                 strScopeOut += '%s,' % file.scope
             if strDestToken != '':
                 strDestToken += ','
             strDestToken += re.sub(
                 '^ddd:', 'dst:',
                 file.destinationDBlockToken.split(',')[0])
             strDisTokenForOutput += '%s,' % file.dispatchDBlockToken
             strProdTokenForOutput += '%s,' % file.prodDBlockToken
             ddmEndPointOut.append(
                 self.getDdmEndpoint(
                     siteSpec,
                     file.destinationDBlockToken.split(',')[0]))
             if file.isAllowedNoOutput():
                 noOutput.append(file.lfn)
     # inFiles
     self.data['inFiles'] = strIFiles
     # dispatch DBlock
     self.data['dispatchDblock'] = strDispatch
     # dispatch DBlock space token
     self.data['dispatchDBlockToken'] = strDisToken
     # dispatch DBlock space token for output
     self.data['dispatchDBlockTokenForOut'] = strDisTokenForOutput[:-1]
     # outFiles
     self.data['outFiles'] = strOFiles
     # destination DBlock
     self.data['destinationDblock'] = strDestination
     # destination DBlock space token
     self.data['destinationDBlockToken'] = strDestToken
     # prod DBlocks
     self.data['prodDBlocks'] = strProdDBlock[:-1]
     # prod DBlock space token
     self.data['prodDBlockToken'] = strProdToken[:-1]
     # real output datasets
     self.data['realDatasets'] = strRealDataset
     # real input datasets
     self.data['realDatasetsIn'] = strRealDatasetIn[:-1]
     # file's destinationSE
     self.data['fileDestinationSE'] = strFileDestinationSE[:-1]
     # log filename
     self.data['logFile'] = logFile
     # log GUID
     self.data['logGUID'] = logGUID
     # jobPars
     self.data['jobPars'] = job.jobParameters
     # attempt number
     self.data['attemptNr'] = job.attemptNr
     # GUIDs
     self.data['GUID'] = strGUID
     # checksum
     self.data['checksum'] = strCheckSum[:-1]
     # fsize
     self.data['fsize'] = strFSize[:-1]
     # scope
     self.data['scopeIn'] = strScopeIn[:-1]
     self.data['scopeOut'] = strScopeOut[:-1]
     self.data['scopeLog'] = strScopeLog
     # DDM endpoints
     self.data['ddmEndPointIn'] = ','.join(ddmEndPointIn)
     self.data['ddmEndPointOut'] = ','.join(ddmEndPointOut)
     # destinationSE
     self.data['destinationSE'] = job.destinationSE
     # user ID
     self.data['prodUserID'] = job.prodUserID
     # CPU count
     self.data['maxCpuCount'] = job.maxCpuCount
     # RAM count
     self.data['minRamCount'] = job.minRamCount
     # disk count
     self.data['maxDiskCount'] = job.maxDiskCount
     # cmtconfig
     self.data['cmtConfig'] = job.cmtConfig
     # processingType
     self.data['processingType'] = job.processingType
     # transferType
     self.data['transferType'] = job.transferType
     # sourceSite
     self.data['sourceSite'] = job.sourceSite
     # current priority
     self.data['currentPriority'] = job.currentPriority
     # taskID
     if job.lockedby == 'jedi':
         self.data['taskID'] = job.jediTaskID
     else:
         self.data['taskID'] = job.taskID
     # core count
     self.data['coreCount'] = job.coreCount
     # jobsetID
     self.data['jobsetID'] = job.jobsetID
     # debug mode
     if job.specialHandling != None and 'debug' in job.specialHandling:
         self.data['debug'] = 'True'
     # event service or job cloning
     if EventServiceUtils.isJobCloningJob(job):
         self.data['cloneJob'] = EventServiceUtils.getJobCloningType(job)
     elif EventServiceUtils.isEventServiceJob(
             job) or EventServiceUtils.isJumboJob(job):
         self.data['eventService'] = 'True'
         # prod DBlock space token for pre-merging output
         self.data['prodDBlockTokenForOutput'] = strProdTokenForOutput[:-1]
     # event service merge
     if isEventServiceMerge:
         self.data['eventServiceMerge'] = 'True'
         # write to file for ES merge
         writeToFileStr = ''
         try:
             for outputName, inputList in job.metadata[0].iteritems():
                 writeToFileStr += 'inputFor_{0}:'.format(outputName)
                 for tmpInput in inputList:
                     writeToFileStr += '{0},'.format(tmpInput)
                 writeToFileStr = writeToFileStr[:-1]
                 writeToFileStr += '^'
             writeToFileStr = writeToFileStr[:-1]
         except:
             pass
         self.data['writeToFile'] = writeToFileStr
     elif job.writeInputToFile():
         try:
             # write input to file
             writeToFileStr = ''
             for inDS, inputList in inDsLfnMap.iteritems():
                 inDS = re.sub('/$', '', inDS)
                 inDS = inDS.split(':')[-1]
                 writeToFileStr += 'tmpin_{0}:'.format(inDS)
                 writeToFileStr += ','.join(inputList)
                 writeToFileStr += '^'
             writeToFileStr = writeToFileStr[:-1]
             self.data['writeToFile'] = writeToFileStr
         except:
             pass
     # no output
     if noOutput != []:
         self.data['allowNoOutput'] = ','.join(noOutput)
     # alternative stage-out
     if job.getAltStgOut() != None:
         self.data['altStageOut'] = job.getAltStgOut()
     # log to OS
     if job.putLogToOS():
         self.data['putLogToOS'] = 'True'
Code example #14
0
File: AdderGen.py Project: PanDAWMS/panda-server
 def parseXML(self):
     # get LFN and GUID
     self.logger.debug('XML filename : %s' % self.xmlFile)
     # no outputs
     if self.job.Files == []:
         self.logger.debug("has no outputs")
         self.logger.debug("parseXML end")
         return 0
     # get input files
     inputLFNs = []
     for file in self.job.Files:
         if file.type == 'input':
             inputLFNs.append(file.lfn)
     # parse XML
     lfns    = []
     guids   = []
     fsizes  = []
     md5sums = []
     chksums = []
     surls   = []
     fullLfnMap = {}
     nEventsMap = {}
     guidMap = dict()
     try:
         root  = xml.dom.minidom.parse(self.xmlFile)
         files = root.getElementsByTagName('File')
         for file in files:
             # get GUID
             guid = str(file.getAttribute('ID'))
             # get PFN and LFN nodes
             logical  = file.getElementsByTagName('logical')[0]
             lfnNode  = logical.getElementsByTagName('lfn')[0]
             # convert UTF8 to Raw
             lfn = str(lfnNode.getAttribute('name'))
             # get metadata
             fsize   = None
             md5sum  = None
             adler32 = None
             surl    = None
             fullLFN = None
             for meta in file.getElementsByTagName('metadata'):
                 # get fsize
                 name = str(meta.getAttribute('att_name'))
                 if name == 'fsize':
                     fsize = long(meta.getAttribute('att_value'))
                 elif name == 'md5sum':
                     md5sum = str(meta.getAttribute('att_value'))
                     # check
                     if re.search("^[a-fA-F0-9]{32}$",md5sum) == None:
                         md5sum = None
                 elif name == 'adler32':
                     adler32 = str(meta.getAttribute('att_value'))
                 elif name == 'surl':
                     surl = str(meta.getAttribute('att_value'))
                 elif name == 'full_lfn':
                     fullLFN = str(meta.getAttribute('att_value'))
             # endpoints
             self.extraInfo['endpoint'][lfn] = []
             for epNode in file.getElementsByTagName('endpoint'):
                 self.extraInfo['endpoint'][lfn].append(str(epNode.firstChild.data))
             # error check
             if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)):
                 if EventServiceUtils.isEventServiceMerge(self.job):
                     continue
                 else:
                     raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
             # append
             lfns.append(lfn)
             guids.append(guid)
             fsizes.append(fsize)
             md5sums.append(md5sum)
             surls.append(surl)
             if adler32 != None:
                 # use adler32 if available
                 chksums.append("ad:%s" % adler32)
             else:
                 chksums.append("md5:%s" % md5sum)
             if fullLFN != None:
                 fullLfnMap[lfn] = fullLFN
     except:
         # parse json
         try:
             import json
             with open(self.xmlFile) as tmpF:
                 jsonDict = json.load(tmpF)
                 for lfn, fileData in jsonDict.iteritems():
                     lfn = str(lfn)
                     fsize   = None
                     md5sum  = None
                     adler32 = None
                     surl    = None
                     fullLFN = None
                     guid = str(fileData['guid'])
                     if 'fsize' in fileData:
                         fsize = long(fileData['fsize'])
                     if 'md5sum' in fileData:
                         md5sum = str(fileData['md5sum'])
                         # check
                         if re.search("^[a-fA-F0-9]{32}$",md5sum) == None:
                             md5sum = None
                     if 'adler32' in fileData:
                         adler32 = str(fileData['adler32'])
                     if 'surl' in fileData:
                         surl = str(fileData['surl'])
                     if 'full_lfn' in fileData:
                         fullLFN = str(fileData['full_lfn'])
                     # endpoints
                     self.extraInfo['endpoint'][lfn] = []
                     if 'endpoint' in fileData:
                         self.extraInfo['endpoint'][lfn] = fileData['endpoint']
                     # error check
                     if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)):
                         if EventServiceUtils.isEventServiceMerge(self.job):
                             continue
                         else:
                             raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
                     # append
                     lfns.append(lfn)
                     guids.append(guid)
                     fsizes.append(fsize)
                     md5sums.append(md5sum)
                     surls.append(surl)
                     if adler32 != None:
                         # use adler32 if available
                         chksums.append("ad:%s" % adler32)
                     else:
                         chksums.append("md5:%s" % md5sum)
                     if fullLFN != None:
                         fullLfnMap[lfn] = fullLFN
         except:
             # check if file exists
             if os.path.exists(self.xmlFile):
                 type, value, traceBack = sys.exc_info()
                 self.logger.error(": %s %s" % (type,value))
                 # set failed anyway
                 self.job.jobStatus = 'failed'
                 # XML errors typically happen when the pilot was killed by the wall-time limit or failed in the wrapper
                 if (self.job.pilotErrorCode in [0,'0','NULL']) and \
                    (self.job.taskBufferErrorCode not in [taskbuffer.ErrorCode.EC_WorkerDone]) and \
                    (self.job.transExitCode  in [0,'0','NULL']):
                     self.job.ddmErrorCode = ErrorCode.EC_Adder
                     self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
                 return 2
             else:
                 # XML was deleted
                 return 1
     # parse metadata to get nEvents
     try:
         root  = xml.dom.minidom.parseString(self.job.metadata)
         files = root.getElementsByTagName('File')
         for file in files:
             # get GUID
             guid = str(file.getAttribute('ID'))
             # get PFN and LFN nodes
             logical  = file.getElementsByTagName('logical')[0]
             lfnNode  = logical.getElementsByTagName('lfn')[0]
             # convert UTF8 to Raw
             lfn = str(lfnNode.getAttribute('name'))
             guidMap[lfn] = guid
             # get metadata
             nevents = None
             for meta in file.getElementsByTagName('metadata'):
                 # get fsize
                 name = str(meta.getAttribute('att_name'))
                 if name == 'events':
                     nevents = long(meta.getAttribute('att_value'))
                     nEventsMap[lfn] = nevents
                     break
     except:
         pass
     # parse json
     try:
         import json
         jsonDict = json.loads(self.job.metadata)
         for jsonFileItem in jsonDict['files']['output']:
             for jsonSubFileItem in jsonFileItem['subFiles']:
                 lfn = str(jsonSubFileItem['name'])
                 try:
                     nevents = long(jsonSubFileItem['nentries'])
                     nEventsMap[lfn] = nevents
                 except:
                     pass
                 try:
                     guid = str(jsonSubFileItem['file_guid'])
                     guidMap[lfn] = guid
                 except:
                     pass
     except:
         pass
     self.logger.debug('nEventsMap=%s' % str(nEventsMap))
     self.logger.debug('guidMap=%s' % str(guidMap))
     # get lumi block number
     lumiBlockNr = self.job.getLumiBlockNr()
     # copy files for variable number of outputs
     tmpStat = self.copyFilesForVariableNumOutputs(lfns)
     if not tmpStat:
         self.logger.error("failed to copy files for variable number of outputs")
         return 2
     # check files
     fileList = []
     for file in self.job.Files:
         fileList.append(file.lfn)
         if file.type == 'input':
             if file.lfn in lfns:
                 if self.job.prodSourceLabel in ['user','panda']:
                     # skipped file
                     file.status = 'skipped'
                 elif self.job.prodSourceLabel in ['managed','test'] + JobUtils.list_ptest_prod_sources:
                     # failed by pilot
                     file.status = 'failed'
         elif file.type == 'output' or file.type == 'log':
             # add only log file for failed jobs
             if self.jobStatus == 'failed' and file.type != 'log':
                 file.status = 'failed'
                 continue
             # set failed if it is missing in XML
             if not file.lfn in lfns:
                 if self.job.jobStatus == 'finished' and \
                         (EventServiceUtils.isEventServiceJob(self.job) or EventServiceUtils.isJumboJob(self.job)):
                     # unset file status for ES jobs
                     pass
                 elif file.isAllowedNoOutput():
                     # allowed not to be produced
                     file.status = 'nooutput'
                     self.logger.debug('set {0} to status={1}'.format(file.lfn,file.status))
                 else:
                     file.status = 'failed'
                     self.job.jobStatus = 'failed'
                     self.job.ddmErrorCode = ErrorCode.EC_Adder
                     self.job.ddmErrorDiag = "expected output {0} is missing in pilot XML".format(file.lfn)
                     self.logger.error(self.job.ddmErrorDiag)
                 continue
             # look for GUID with LFN
             try:
                 i = lfns.index(file.lfn)
                 file.GUID   = guids[i]
                 file.fsize  = fsizes[i]
                 file.md5sum = md5sums[i]
                 file.checksum = chksums[i]
                 surl = surls[i]
                 # status
                 file.status = 'ready'
                 # change to full LFN
                 if fullLfnMap.has_key(file.lfn):
                     file.lfn = fullLfnMap[file.lfn]
                 # add SURL to extraInfo
                 self.extraInfo['surl'][file.lfn] = surl
                 # add nevents 
                 if nEventsMap.has_key(file.lfn):
                     self.extraInfo['nevents'][file.lfn] = nEventsMap[file.lfn]
             except:
                 # status
                 file.status = 'failed'
                 type, value, traceBack = sys.exc_info()
                 self.logger.error(": %s %s" % (type,value))
             # set lumi block number
             if lumiBlockNr != None and file.status != 'failed':
                 self.extraInfo['lbnr'][file.lfn] = lumiBlockNr 
     self.extraInfo['guid'] = guidMap
     # check consistency between XML and filesTable
     for lfn in lfns:
         if not lfn in fileList:
             self.logger.error("%s is not found in filesTable" % lfn)
             self.job.jobStatus = 'failed'
             for tmpFile in self.job.Files:
                 tmpFile.status = 'failed'
             self.job.ddmErrorCode = ErrorCode.EC_Adder
             self.job.ddmErrorDiag = "pilot produced {0} inconsistently with jobdef".format(lfn)
             return 2
     # return
     self.logger.debug("parseXML end")
     return 0
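
For orientation, parseXML above accepts two report formats from the pilot. Below is a hand-written Python sketch of the XML variant and the minidom calls used to walk it; the GUID, LFN, values and the root tag are invented for illustration, and the JSON fallback parsed by the same method simply maps each LFN to a dict with 'guid', 'fsize', 'adler32'/'md5sum', 'surl', 'full_lfn' and 'endpoint' entries.

import xml.dom.minidom

example = """<?xml version="1.0"?>
<POOLFILECATALOG>
  <File ID="0EE48CF2-1A3B-4C5D-8E9F-0123456789AB">
    <logical><lfn name="AOD.12345678._000001.pool.root"/></logical>
    <metadata att_name="fsize"   att_value="123456789"/>
    <metadata att_name="adler32" att_value="0a1b2c3d"/>
    <metadata att_name="surl"    att_value="srm://se.example.org/atlasdatadisk/AOD.12345678._000001.pool.root"/>
    <endpoint>EXAMPLE_DATADISK</endpoint>
  </File>
</POOLFILECATALOG>"""

root = xml.dom.minidom.parseString(example)
for f in root.getElementsByTagName('File'):
    guid = str(f.getAttribute('ID'))
    lfn = str(f.getElementsByTagName('logical')[0].getElementsByTagName('lfn')[0].getAttribute('name'))
    meta = dict((str(m.getAttribute('att_name')), str(m.getAttribute('att_value')))
                for m in f.getElementsByTagName('metadata'))
    # guid, lfn, meta['fsize'], meta['adler32'], meta['surl'] are what parseXML extracts
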
Code example #15
0
 def run(self):
     try:
         _logger.debug('%s Start %s' % (self.pandaID,self.job.jobStatus))
         flagComplete    = True
         topUserDsList   = []
         usingMerger     = False        
         disableNotifier = False
         firstIndvDS     = True
         finalStatusDS   = []
         for destinationDBlock in self.destinationDBlocks:
             dsList = []
             _logger.debug('%s start %s' % (self.pandaID,destinationDBlock))
             # ignore tid datasets
             if re.search('_tid[\d_]+$',destinationDBlock):
                 _logger.debug('%s skip %s' % (self.pandaID,destinationDBlock))                
                 continue
             # ignore HC datasets
             if re.search('^hc_test\.',destinationDBlock) != None or re.search('^user\.gangarbt\.',destinationDBlock) != None:
                 if re.search('_sub\d+$',destinationDBlock) == None and re.search('\.lib$',destinationDBlock) == None:
                     _logger.debug('%s skip HC %s' % (self.pandaID,destinationDBlock))                
                     continue
             # query dataset
             if self.datasetMap.has_key(destinationDBlock):
                 dataset = self.datasetMap[destinationDBlock]
             else:
                 dataset = self.taskBuffer.queryDatasetWithMap({'name':destinationDBlock})
             if dataset == None:
                 _logger.error('%s Not found : %s' % (self.pandaID,destinationDBlock))
                 flagComplete = False
                 continue
             # skip tobedeleted/tobeclosed 
             if dataset.status in ['cleanup','tobeclosed','completed','deleted']:
                 _logger.debug('%s skip %s due to %s' % (self.pandaID,destinationDBlock,dataset.status))
                 continue
             dsList.append(dataset)
             # sort
             dsList.sort()
             # count number of completed files
             notFinish = self.taskBuffer.countFilesWithMap({'destinationDBlock':destinationDBlock,
                                                            'status':'unknown'})
             if notFinish < 0:
                 _logger.error('%s Invalid DB return : %s' % (self.pandaID,notFinish))
                 flagComplete = False                
                 continue
             # check if completed
             _logger.debug('%s notFinish:%s' % (self.pandaID,notFinish))
             if self.job.destinationSE == 'local' and self.job.prodSourceLabel in ['user','panda']:
                 # close non-DQ2 destinationDBlock immediately
                 finalStatus = 'closed'
             elif self.job.lockedby == 'jedi' and self.isTopLevelDS(destinationDBlock):
                 # set it closed in order not to trigger DDM cleanup. It will be closed by JEDI
                 finalStatus = 'closed'
             elif self.job.prodSourceLabel in ['user'] and "--mergeOutput" in self.job.jobParameters \
                      and self.job.processingType != 'usermerge':
                 # merge output files
                 if firstIndvDS:
                     # set 'tobemerged' to only the first dataset to avoid triggering many Mergers for --individualOutDS
                     finalStatus = 'tobemerged'
                     firstIndvDS = False
                 else:
                     finalStatus = 'tobeclosed'
                 # set merging to top dataset
                 usingMerger = True
                 # disable Notifier
                 disableNotifier = True
             elif self.job.produceUnMerge():
                 finalStatus = 'doing'
             else:
                 # set status to 'tobeclosed' to trigger DQ2 closing
                 finalStatus = 'tobeclosed'
             if notFinish == 0 and EventServiceUtils.isEventServiceMerge(self.job):
                 allInJobsetFinished = self.checkSubDatasetsInJobset()
             else:
                 allInJobsetFinished = True
             if notFinish == 0 and allInJobsetFinished: 
                 _logger.debug('%s set %s to dataset : %s' % (self.pandaID,finalStatus,destinationDBlock))
                 # set status
                 dataset.status = finalStatus
                 # update dataset in DB
                 retT = self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ",
                                                       criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'})
                 if len(retT) > 0 and retT[0]==1:
                     finalStatusDS += dsList
                     # close user datasets
                     if self.job.prodSourceLabel in ['user'] and self.job.destinationDBlock.endswith('/') \
                            and (dataset.name.startswith('user') or dataset.name.startswith('group')):
                         # get top-level user dataset 
                         topUserDsName = re.sub('_sub\d+$','',dataset.name)
                         # update if it is the first attempt
                         if topUserDsName != dataset.name and not topUserDsName in topUserDsList and self.job.lockedby != 'jedi':
                             topUserDs = self.taskBuffer.queryDatasetWithMap({'name':topUserDsName})
                             if topUserDs != None:
                                 # check status
                                 if topUserDs.status in ['completed','cleanup','tobeclosed','deleted',
                                                         'tobemerged','merging']:
                                     _logger.debug('%s skip %s due to status=%s' % (self.pandaID,topUserDsName,topUserDs.status))
                                 else:
                                     # set status
                                     if self.job.processingType.startswith('gangarobot') or \
                                            self.job.processingType.startswith('hammercloud'):
                                         # don't trigger freezing for HC datasets so that files can be appended
                                         topUserDs.status = 'completed'
                                     elif not usingMerger:
                                         topUserDs.status = finalStatus
                                     else:
                                         topUserDs.status = 'merging'
                                     # append to avoid repetition
                                     topUserDsList.append(topUserDsName)
                                     # update DB
                                     retTopT = self.taskBuffer.updateDatasets([topUserDs],withLock=True,withCriteria="status<>:crStatus",
                                                                              criteriaMap={':crStatus':topUserDs.status})
                                     if len(retTopT) > 0 and retTopT[0]==1:
                                         _logger.debug('%s set %s to top dataset : %s' % (self.pandaID,topUserDs.status,topUserDsName))
                                     else:
                                         _logger.debug('%s failed to update top dataset : %s' % (self.pandaID,topUserDsName))
                         # get parent dataset for merge job
                         if self.job.processingType == 'usermerge':
                             tmpMatch = re.search('--parentDS ([^ \'\"]+)',self.job.jobParameters)
                             if tmpMatch == None:
                                 _logger.error('%s failed to extract parentDS' % self.pandaID)
                             else:
                                 unmergedDsName = tmpMatch.group(1)
                                 # update if it is the first attempt
                                 if not unmergedDsName in topUserDsList:
                                     unmergedDs = self.taskBuffer.queryDatasetWithMap({'name':unmergedDsName})
                                     if unmergedDs == None:
                                         _logger.error('%s failed to get parentDS=%s from DB' % (self.pandaID,unmergedDsName))
                                     else:
                                         # check status
                                         if unmergedDs.status in ['completed','cleanup','tobeclosed']:
                                             _logger.debug('%s skip %s due to status=%s' % (self.pandaID,unmergedDsName,unmergedDs.status))
                                         else:
                                             # set status
                                             unmergedDs.status = finalStatus
                                             # append to avoid repetition
                                             topUserDsList.append(unmergedDsName)
                                             # update DB
                                             retTopT = self.taskBuffer.updateDatasets([unmergedDs],withLock=True,withCriteria="status<>:crStatus",
                                                                                      criteriaMap={':crStatus':unmergedDs.status})
                                             if len(retTopT) > 0 and retTopT[0]==1:
                                                 _logger.debug('%s set %s to parent dataset : %s' % (self.pandaID,unmergedDs.status,unmergedDsName))
                                             else:
                                                 _logger.debug('%s failed to update parent dataset : %s' % (self.pandaID,unmergedDsName))
                     # start Activator
                     if re.search('_sub\d+$',dataset.name) == None:
                         if self.job.prodSourceLabel=='panda' and self.job.processingType in ['merge','unmerge']:
                             # don't trigger Activator for merge jobs
                             pass
                         else:
                             if self.job.jobStatus == 'finished':
                                 aThr = Activator(self.taskBuffer,dataset)
                                 aThr.start()
                                 aThr.join()
                 else:
                     # unset flag since another thread already updated 
                     #flagComplete = False
                     pass
             else:
                 # update dataset in DB
                 self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ",
                                                criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'})
                 # unset flag
                 flagComplete = False
             # end
             _logger.debug('%s end %s' % (self.pandaID,destinationDBlock))
         # special actions for vo
         if flagComplete:
             closerPluginClass = panda_config.getPlugin('closer_plugins',self.job.VO)
             if closerPluginClass == None and self.job.VO == 'atlas':
                 # use ATLAS plugin for ATLAS
                 from CloserAtlasPlugin import CloserAtlasPlugin
                 closerPluginClass = CloserAtlasPlugin
             if closerPluginClass != None:
                 closerPlugin = closerPluginClass(self.job,finalStatusDS,_logger)
                 closerPlugin.execute()
         # change pending jobs to failed
         finalizedFlag = True
         if flagComplete and self.job.prodSourceLabel=='user':
             _logger.debug('%s finalize %s %s' % (self.pandaID,self.job.prodUserName,self.job.jobDefinitionID))
             finalizedFlag = self.taskBuffer.finalizePendingJobs(self.job.prodUserName,self.job.jobDefinitionID)
             _logger.debug('%s finalized with %s' % (self.pandaID,finalizedFlag))
         # update unmerged datasets in JEDI to trigger merging
         if flagComplete and self.job.produceUnMerge() and finalStatusDS != []:
             if finalizedFlag:
                 tmpStat = self.taskBuffer.updateUnmergedDatasets(self.job,finalStatusDS)
                 _logger.debug('%s updated unmerged datasets with %s' % (self.pandaID,tmpStat))
         # start notifier
         _logger.debug('%s source:%s complete:%s' % (self.pandaID,self.job.prodSourceLabel,flagComplete))
         if (self.job.jobStatus != 'transferring') and ((flagComplete and self.job.prodSourceLabel=='user') or \
            (self.job.jobStatus=='failed' and self.job.prodSourceLabel=='panda')) and \
            self.job.lockedby != 'jedi':
             # don't send email for merge jobs
             if (not disableNotifier) and not self.job.processingType in ['merge','unmerge']:
                 useNotifier = True
                 summaryInfo = {}
                 # check all jobDefIDs in jobsetID
                 if not self.job.jobsetID in [0,None,'NULL']:
                     useNotifier,summaryInfo = self.taskBuffer.checkDatasetStatusForNotifier(self.job.jobsetID,self.job.jobDefinitionID,
                                                                                             self.job.prodUserName)
                     _logger.debug('%s useNotifier:%s' % (self.pandaID,useNotifier))
                 if useNotifier:
                     _logger.debug('%s start Notifier' % self.pandaID)
                     nThr = Notifier.Notifier(self.taskBuffer,self.job,self.destinationDBlocks,summaryInfo)
                     nThr.run()
                     _logger.debug('%s end Notifier' % self.pandaID)                    
         _logger.debug('%s End' % self.pandaID)
     except:
         errType,errValue = sys.exc_info()[:2]
         _logger.error("%s %s" % (errType,errValue))
Code example #16
0
File: AdderGen.py Project: lukewayne123/panda-server
 def run(self):
     try:
         self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus,self.attemptNr))
         # lock XML
         self.lockXML = open(self.xmlFile)
         try:
             fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB)
         except:
             self.logger.debug("cannot get lock : %s" % self.xmlFile)
             self.lockXML.close()
             # remove XML just in case for the final attempt
             if not self.ignoreTmpError:
                 try:
                     # remove Catalog
                     os.remove(self.xmlFile)
                 except:
                     pass
             return
         # check if file exists
         if not os.path.exists(self.xmlFile):
             self.logger.debug("not exist : %s" % self.xmlFile)
             try:
                 fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                 self.lockXML.close()
             except:
                 pass
             return
         # query job
         self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False,
                                             fromArchived=False,
                                             fromWaiting=False,
                                             forAnal=True)[0]
         # check if job has finished
         if self.job == None:
             self.logger.debug(': job not found in DB')
         elif self.job.jobStatus in ['finished','failed','unknown','cancelled','merging']:
             self.logger.error(': invalid state -> %s' % self.job.jobStatus)
         elif self.attemptNr != None and self.job.attemptNr != self.attemptNr:
             self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr,self.attemptNr))
         else:
             # check file status in JEDI
             fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job)
             self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI))                
             if fileCheckInJEDI == None:
                 raise RuntimeError,'failed to check file status in JEDI'
             if fileCheckInJEDI == False:
                 # set job status to failed since some file status is wrong in JEDI 
                 self.jobStatus = 'failed'
                 self.job.ddmErrorCode = ErrorCode.EC_Adder
                 self.job.ddmErrorDiag = "wrong file status in JEDI"
                 self.logger.debug("set jobStatus={0} since input are already cancelled in JEDI".format(self.jobStatus))
             # keep old status
             oldJobStatus = self.job.jobStatus
             # set job status
             if not self.job.jobStatus in ['transferring']:
                 self.job.jobStatus = self.jobStatus
             addResult = None
             adderPlugin = None
             # parse XML
             parseResult = self.parseXML()
             if parseResult < 2:
                 # interaction with DDM
                 try:
                     # set VO=local for DDM free
                     if self.job.destinationSE == 'local':
                         tmpVO = 'local'
                     else:
                         tmpVO = self.job.VO
                     # instantiate concrete plugin
                     adderPluginClass = panda_config.getPlugin('adder_plugins',tmpVO)
                     if adderPluginClass == None:
                         # use ATLAS plugin by default
                         from AdderAtlasPlugin import AdderAtlasPlugin
                         adderPluginClass = AdderAtlasPlugin
                     self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__))
                     adderPlugin = adderPluginClass(self.job,
                                                    taskBuffer=self.taskBuffer,
                                                    siteMapper=self.siteMapper,
                                                    extraInfo=self.extraInfo,
                                                    logger=self.logger)
                     # execute
                     self.logger.debug('plugin is ready')
                     adderPlugin.execute()
                     addResult = adderPlugin.result
                     self.logger.debug('plugin done with %s' % (addResult.statusCode))
                 except:
                     errtype,errvalue = sys.exc_info()[:2]
                     self.logger.error("failed to execute AdderPlugin for VO={0} with {1}:{2}".format(tmpVO,
                                                                                                      errtype,
                                                                                                      errvalue)) 
                     addResult = None
                     self.job.ddmErrorCode = ErrorCode.EC_Adder
                     self.job.ddmErrorDiag = "AdderPlugin failure"
                 # ignore temporary errors
                 if self.ignoreTmpError and addResult != None and addResult.isTemporary():
                     self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag)
                     self.logger.debug('escape')
                     # unlock XML
                     try:
                         fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                         self.lockXML.close()
                     except:
                         type, value, traceBack = sys.exc_info()
                         self.logger.debug(": %s %s" % (type,value))
                         self.logger.debug("cannot unlock XML")
                     return
                 # failed
                 if addResult == None or not addResult.isSucceeded():
                     self.job.jobStatus = 'failed'
             # set file status for failed jobs or failed transferring jobs
             if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                 self.job.jobStatus = 'failed'
                 for file in self.job.Files:
                     if file.type in ['output','log']:
                         if addResult != None and file.lfn in addResult.mergingFiles:
                             file.status = 'merging'
                         else:
                             file.status = 'failed'
             else:
                 # reset errors
                 self.job.jobDispatcherErrorCode = 0
                 self.job.jobDispatcherErrorDiag = 'NULL'
                 # set status
                 if addResult != None and addResult.mergingFiles != []:
                     # set status for merging:                        
                     for file in self.job.Files:
                         if file.lfn in addResult.mergingFiles:
                             file.status = 'merging'
                     self.job.jobStatus = 'merging'
                     # propagate transition to prodDB
                     self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                 elif addResult != None and addResult.transferringFiles != []:
                     # set status for transferring
                     for file in self.job.Files:
                         if file.lfn in addResult.transferringFiles:
                             file.status = 'transferring'
                     self.job.jobStatus = 'transferring'
                     # propagate transition to prodDB
                     self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                 else:
                     self.job.jobStatus = 'finished'
             # endtime
             if self.job.endTime=='NULL':
                 self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
             # output size and # of outputs
             self.job.nOutputDataFiles = 0
             self.job.outputFileBytes = 0
             for tmpFile in self.job.Files:
                 if tmpFile.type == 'output':
                     self.job.nOutputDataFiles += 1
                     try:
                         self.job.outputFileBytes += tmpFile.fsize
                     except:
                         pass
             # protection
             maxOutputFileBytes = 99999999999
             if self.job.outputFileBytes > maxOutputFileBytes:
                 self.job.outputFileBytes = maxOutputFileBytes
             # set cancelled state
             if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                 self.job.jobStatus = 'cancelled'
             # update job
             self.logger.debug("updating DB")
             retU = self.taskBuffer.updateJobs([self.job],False,oldJobStatusList=[oldJobStatus],
                                               extraInfo=self.extraInfo)
             self.logger.debug("retU: %s" % retU)
             # failed
             if not retU[0]:
                 self.logger.error('failed to update DB')
                 # unlock XML
                 try:
                     fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                     self.lockXML.close()                            
                 except:
                     type, value, traceBack = sys.exc_info()
                     self.logger.debug(": %s %s" % (type,value))
                     self.logger.debug("cannot unlock XML")
                 return
             # setup for closer
             if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.jobStatus == 'cancelled'):
                 destDBList = []
                 guidList = []
                 for file in self.job.Files:
                     # ignore inputs
                     if file.type == 'input':
                         continue
                     # skip pseudo datasets
                     if file.destinationDBlock in ['',None,'NULL']:
                         continue
                     # start closer for output/log datasets
                     if not file.destinationDBlock in destDBList:
                         destDBList.append(file.destinationDBlock)
                     # collect GUIDs
                     if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['ptest','rc_test','rucio_test'] and \
                                                               self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \
                                                               and file.type == 'output':
                         # extract base LFN since LFN was changed to full LFN for CMS
                         baseLFN = file.lfn.split('/')[-1]
                         guidList.append({'lfn':baseLFN,'guid':file.GUID,'type':file.type,
                                          'checksum':file.checksum,'md5sum':file.md5sum,
                                          'fsize':file.fsize,'scope':file.scope})
                 if guidList != []:
                     retG = self.taskBuffer.setGUIDs(guidList)
                 if destDBList != []:
                     # start Closer
                     if adderPlugin != None and hasattr(adderPlugin,'datasetMap') and adderPlugin.datasetMap != {}:
                         cThr = Closer.Closer(self.taskBuffer,destDBList,self.job,datasetMap=adderPlugin.datasetMap)
                     else:
                         cThr = Closer.Closer(self.taskBuffer,destDBList,self.job)
                     self.logger.debug("start Closer")
                     cThr.start()
                     cThr.join()
                     self.logger.debug("end Closer")
         self.logger.debug("end")
         try:
             # remove Catalog
             os.remove(self.xmlFile)
         except:
             pass
         # unlock XML
         if self.lockXML != None:
             fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
             self.lockXML.close()            
     except:
         type, value, traceBack = sys.exc_info()
         self.logger.debug(": %s %s" % (type,value))
         self.logger.debug("except")
         # unlock XML just in case
         try:
             if self.lockXML != None:
                 fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
         except:
             type, value, traceBack = sys.exc_info()
             self.logger.debug(": %s %s" % (type,value))
             self.logger.debug("cannot unlock XML")
Code example #17
0
File: AdderGen.py Project: lukewayne123/panda-server
 def parseXML(self):
     # get LFN and GUID
     self.logger.debug('XML filename : %s' % self.xmlFile)
     # no outputs
     if self.job.Files == []:
         self.logger.debug("has no outputs")
         self.logger.debug("parseXML end")
         return 0
     # get input files
     inputLFNs = []
     for file in self.job.Files:
         if file.type == 'input':
             inputLFNs.append(file.lfn)
     # parse XML
     lfns    = []
     guids   = []
     fsizes  = []
     md5sums = []
     chksums = []
     surls   = []
     fullLfnMap = {}
     nEventsMap = {}
     try:
         root  = xml.dom.minidom.parse(self.xmlFile)
         files = root.getElementsByTagName('File')
         for file in files:
             # get GUID
             guid = str(file.getAttribute('ID'))
             # get PFN and LFN nodes
             logical  = file.getElementsByTagName('logical')[0]
             lfnNode  = logical.getElementsByTagName('lfn')[0]
             # convert UTF8 to Raw
             lfn = str(lfnNode.getAttribute('name'))
             # get metadata
             fsize   = None
             md5sum  = None
             adler32 = None
             surl    = None
             fullLFN = None
             for meta in file.getElementsByTagName('metadata'):
                 # get fsize
                 name = str(meta.getAttribute('att_name'))
                 if name == 'fsize':
                     fsize = long(meta.getAttribute('att_value'))
                 elif name == 'md5sum':
                     md5sum = str(meta.getAttribute('att_value'))
                     # check
                     if re.search("^[a-fA-F0-9]{32}$",md5sum) == None:
                         md5sum = None
                 elif name == 'adler32':
                     adler32 = str(meta.getAttribute('att_value'))
                 elif name == 'surl':
                     surl = str(meta.getAttribute('att_value'))
                 elif name == 'full_lfn':
                     fullLFN = str(meta.getAttribute('att_value'))
             # error check
             if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)):
                 if EventServiceUtils.isEventServiceMerge(self.job):
                     continue
                 else:
                     raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
             # append
             lfns.append(lfn)
             guids.append(guid)
             fsizes.append(fsize)
             md5sums.append(md5sum)
             surls.append(surl)
             if adler32 != None:
                 # use adler32 if available
                 chksums.append("ad:%s" % adler32)
             else:
                 chksums.append("md5:%s" % md5sum)
             if fullLFN != None:
                 fullLfnMap[lfn] = fullLFN
     except:
         # check if file exists
         if os.path.exists(self.xmlFile):
             type, value, traceBack = sys.exc_info()
             self.logger.error(": %s %s" % (type,value))
             # set failed anyway
             self.job.jobStatus = 'failed'
             # XML errors typically happen when the pilot was killed by the wall-time limit or failed in the wrapper
             if (self.job.pilotErrorCode in [0,'0','NULL']) and \
                (self.job.transExitCode  in [0,'0','NULL']):
                 self.job.ddmErrorCode = ErrorCode.EC_Adder
                 self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
             return 2
         else:
             # XML was deleted
             return 1
     # parse metadata to get nEvents
     try:
         root  = xml.dom.minidom.parseString(self.job.metadata)
         files = root.getElementsByTagName('File')
         for file in files:
             # get GUID
             guid = str(file.getAttribute('ID'))
             # get PFN and LFN nodes
             logical  = file.getElementsByTagName('logical')[0]
             lfnNode  = logical.getElementsByTagName('lfn')[0]
             # convert UTF8 to Raw
             lfn = str(lfnNode.getAttribute('name'))
             # get metadata
             nevents = None
             for meta in file.getElementsByTagName('metadata'):
                  # get the number of events
                 name = str(meta.getAttribute('att_name'))
                 if name == 'events':
                     nevents = long(meta.getAttribute('att_value'))
                     nEventsMap[lfn] = nevents
                     break
     except:
         pass
     self.logger.debug('nEventsMap=%s' % str(nEventsMap))
     # get lumi block number
     lumiBlockNr = self.job.getLumiBlockNr()
     # check files
     fileList = []
     for file in self.job.Files:
         fileList.append(file.lfn)
         if file.type == 'input':
             if file.lfn in lfns:
                 if self.job.prodSourceLabel in ['user','panda']:
                     # skipped file
                     file.status = 'skipped'
                 elif self.job.prodSourceLabel in ['managed','test','rc_test','ptest']:
                     # failed by pilot
                     file.status = 'failed'
         elif file.type == 'output' or file.type == 'log':
             # add only log file for failed jobs
             if self.jobStatus == 'failed' and file.type != 'log':
                 file.status = 'failed'
                 continue
             # set failed if it is missing in XML
             if not file.lfn in lfns:
                 if self.job.jobStatus == 'finished' and EventServiceUtils.isEventServiceJob(self.job):
                     # unset file status for ES jobs
                     pass
                 else:
                     file.status = 'failed'
                 continue
             # look for GUID with LFN
             try:
                 i = lfns.index(file.lfn)
                 file.GUID   = guids[i]
                 file.fsize  = fsizes[i]
                 file.md5sum = md5sums[i]
                 file.checksum = chksums[i]
                 surl = surls[i]
                 # status
                 file.status = 'ready'
                 # change to full LFN
                 if fullLfnMap.has_key(file.lfn):
                     file.lfn = fullLfnMap[file.lfn]
                 # add SURL to extraInfo
                 self.extraInfo['surl'][file.lfn] = surl
                 # add nevents 
                 if nEventsMap.has_key(file.lfn):
                     self.extraInfo['nevents'][file.lfn] = nEventsMap[file.lfn]
             except:
                 # status
                 file.status = 'failed'
                 type, value, traceBack = sys.exc_info()
                  self.logger.error("parseXML : %s %s" % (type,value))
             # set lumi block number
             if lumiBlockNr != None and file.status != 'failed':
                 self.extraInfo['lbnr'][file.lfn] = lumiBlockNr 
     # check consistency between XML and filesTable
     for lfn in lfns:
         if not lfn in fileList:
             self.logger.error("%s is not found in filesTable" % lfn)
             self.job.jobStatus = 'failed'
             for tmpFile in self.job.Files:
                 tmpFile.status = 'failed'
             self.job.ddmErrorCode = ErrorCode.EC_Adder
             self.job.ddmErrorDiag = "pilot XML is inconsistent with filesTable"
             return 2
     # return
     self.logger.debug("parseXML end")
     return 0
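
For reference, here is a minimal, self-contained sketch of the kind of file report the parser above consumes. The element and attribute names (File/ID, logical, lfn/name, metadata att_name/att_value) are taken directly from the getElementsByTagName/getAttribute calls in the snippet; the root tag, the LFN and the metadata values are invented for illustration, and since the parser only looks at the File elements the surrounding layout does not matter.

import xml.dom.minidom

# hypothetical pilot file report; the PoolFileCatalog-style root tag is an assumption
sampleXML = """<?xml version="1.0" encoding="UTF-8"?>
<POOLFILECATALOG>
  <File ID="A1B2C3D4-0000-1111-2222-333344445555">
    <logical>
      <lfn name="user.someuser.1234567.EXT0._000001.pool.root.1"/>
    </logical>
    <metadata att_name="fsize" att_value="123456789"/>
    <metadata att_name="adler32" att_value="0a1b2c3d"/>
    <metadata att_name="surl" att_value="srm://se.example.org/atlas/user.someuser.1234567.EXT0._000001.pool.root.1"/>
  </File>
</POOLFILECATALOG>"""

root = xml.dom.minidom.parseString(sampleXML)
for fileNode in root.getElementsByTagName('File'):
    # GUID comes from the ID attribute, the LFN from logical/lfn/name
    guid = str(fileNode.getAttribute('ID'))
    logical = fileNode.getElementsByTagName('logical')[0]
    lfn = str(logical.getElementsByTagName('lfn')[0].getAttribute('name'))
    # collect the metadata key/value pairs (fsize, checksum, surl, ...)
    metaMap = {}
    for meta in fileNode.getElementsByTagName('metadata'):
        metaMap[str(meta.getAttribute('att_name'))] = str(meta.getAttribute('att_value'))
    print('GUID=%s LFN=%s meta=%s' % (guid, lfn, metaMap))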
コード例 #18
0
    def run(self):
        try:
            while True:
                _logger.debug('%s start' % self.pandaID)
                # query job
                job = self.taskBuffer.peekJobs([self.pandaID],
                                               fromDefined=False,
                                               fromArchived=False,
                                               fromWaiting=False)[0]
                # check job status
                if job == None:
                    _logger.debug('%s escape : not found' % self.pandaID)
                    return
                if not job.jobStatus in [
                        'running', 'sent', 'starting', 'holding', 'stagein',
                        'stageout'
                ]:
                    if job.jobStatus == 'transferring' and job.prodSourceLabel in [
                            'user', 'panda'
                    ]:
                        pass
                    else:
                        _logger.debug('%s escape : %s' %
                                      (self.pandaID, job.jobStatus))
                        return
                # time limit
                timeLimit = datetime.datetime.utcnow() - datetime.timedelta(
                    minutes=self.sleepTime)
                if job.modificationTime < timeLimit or (
                        job.endTime != 'NULL' and job.endTime < timeLimit):
                    _logger.debug(
                        '%s %s lastmod:%s endtime:%s' %
                        (job.PandaID, job.jobStatus, str(
                            job.modificationTime), str(job.endTime)))
                    # retry ES merge jobs
                    if EventServiceUtils.isEventServiceMerge(job):
                        self.taskBuffer.retryJob(job.PandaID, {},
                                                 getNewPandaID=True,
                                                 attemptNr=job.attemptNr,
                                                 recoverableEsMerge=True)
                        # read back
                        job = self.taskBuffer.peekJobs([self.pandaID],
                                                       fromDefined=False,
                                                       fromArchived=False,
                                                       fromWaiting=False)[0]
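                    # destination datasets to be passed to Closer if the job ends up failed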
                    destDBList = []
                    # retry analysis jobs
                    if (job.prodSourceLabel in ['user','panda']) and (job.attemptNr<2 or job.jobStatus == 'sent') \
                             and job.commandToPilot != 'tobekilled' and (not job.processingType in ['ITB_INTEGRATION']) \
                             and not job.taskBufferErrorCode in [taskbuffer.ErrorCode.EC_Reassigned,
                                                                 taskbuffer.ErrorCode.EC_Retried,
                                                                 taskbuffer.ErrorCode.EC_PilotRetried] \
                             and not job.processingType.startswith('gangarobot') \
                             and not job.processingType.startswith('hammercloud'):
                        # reset
                        _logger.debug(
                            ' -> reset %s job with %s : PandaID:%s #%s' %
                            (job.prodSourceLabel, job.jobStatus, job.PandaID,
                             job.attemptNr))
                        job.jobStatus = 'activated'
                        job.startTime = None
                        job.endTime = None
                        job.attemptNr = job.attemptNr + 1
                        # remove the flag related to pledge-resource handling
                        if not job.specialHandling in [None, 'NULL', '']:
                            newSpecialHandling = re.sub(
                                ',*localpool', '', job.specialHandling)
                            if newSpecialHandling == '':
                                job.specialHandling = None
                            else:
                                job.specialHandling = newSpecialHandling
                        # TEMPORARY : send it to long queue
                        oldComputingSite = job.computingSite
                        if job.jobStatus != 'sent' and job.computingSite.startswith(
                                'ANALY') and (not job.computingSite.startswith(
                                    'ANALY_LONG_')):
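                            # build candidate long-queue site names from the current site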
                            tmpLongSiteList = []
                            tmpLongSite = re.sub('^ANALY_', 'ANALY_LONG_',
                                                 job.computingSite)
                            tmpLongSite = re.sub('_\d+$', '', tmpLongSite)
                            tmpLongSiteList.append(tmpLongSite)
                            tmpLongSite = job.computingSite + '_LONG'
                            tmpLongSiteList.append(tmpLongSite)
                            tmpLongSite = re.sub('SHORT', 'LONG',
                                                 job.computingSite)
                            if tmpLongSite != job.computingSite:
                                tmpLongSiteList.append(tmpLongSite)
                            for longSite in tmpLongSiteList:
                                if self.siteMapper.checkSite(longSite):
                                    tmpSiteSpec = self.siteMapper.getSite(
                                        longSite)
                                    if tmpSiteSpec.status == 'online':
                                        job.computingSite = longSite
                                        _logger.debug(
                                            ' -> sending PandaID:%s to %s' %
                                            (job.PandaID, job.computingSite))
                                        # set destinationSE
                                        if job.destinationSE == oldComputingSite:
                                            job.destinationSE = job.computingSite
                                        break
                        # modify LFNs and destinationSE
                        for file in job.Files:
                            modTypes = ('output', 'log')
                            if file.type in modTypes:
                                # set destinationSE
                                if file.destinationSE == oldComputingSite:
                                    file.destinationSE = job.computingSite
                            if job.prodSourceLabel == 'panda':
                                # do not change output files for buildJob
                                modTypes = ('log', )
                            if file.type in modTypes:
                                # set new GUID
                                if file.type == 'log':
                                    file.GUID = commands.getoutput('uuidgen')
                                # add attempt nr (see the standalone sketch after this example)
                                oldName = file.lfn
                                file.lfn = re.sub("\.\d+$", "", file.lfn)
                                file.lfn = "%s.%d" % (file.lfn, job.attemptNr)
                                newName = file.lfn
                                # modify jobParameters
                                sepPatt = "(\'|\"|%20|:)" + oldName + "(\'|\"|%20| )"
                                matches = re.findall(sepPatt,
                                                     job.jobParameters)
                                for match in matches:
                                    oldPatt = match[0] + oldName + match[-1]
                                    newPatt = match[0] + newName + match[-1]
                                    job.jobParameters = re.sub(
                                        oldPatt, newPatt, job.jobParameters)
                    else:
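                        # no more retries : record the dispatcher error and fail the job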
                        if job.jobStatus == 'sent':
                            # sent job didn't receive reply from pilot within 30 min
                            job.jobDispatcherErrorCode = ErrorCode.EC_SendError
                            job.jobDispatcherErrorDiag = "Sent job didn't receive reply from pilot within 30 min"
                        elif job.exeErrorDiag == 'NULL' and job.pilotErrorDiag == 'NULL':
                            # lost heartbeat
                            job.jobDispatcherErrorCode = ErrorCode.EC_Watcher
                            if job.jobDispatcherErrorDiag == 'NULL':
                                if job.endTime == 'NULL':
                                    # normal lost heartbeat
                                    job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(
                                        job.modificationTime)
                                else:
                                    # job recovery failed
                                    job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(
                                        job.endTime)
                                    if job.jobStatus == 'transferring':
                                        job.jobDispatcherErrorDiag += ' in transferring'
                        else:
                            # job recovery failed
                            job.jobDispatcherErrorCode = ErrorCode.EC_Recovery
                            job.jobDispatcherErrorDiag = 'job recovery failed for %s hours' % (
                                self.sleepTime / 60)
                        # set job status
                        job.jobStatus = 'failed'
                        # set endTime for lost heartbeat
                        if job.endTime == 'NULL':
                            # normal lost heartbeat
                            job.endTime = job.modificationTime
                        # set files status
                        for file in job.Files:
                            if file.type == 'output' or file.type == 'log':
                                file.status = 'failed'
                                if not file.destinationDBlock in destDBList:
                                    destDBList.append(file.destinationDBlock)
                    # update job
                    self.taskBuffer.updateJobs([job], False)
                    # start closer
                    if job.jobStatus == 'failed':

                        source = 'jobDispatcherErrorCode'
                        error_code = job.jobDispatcherErrorCode
                        error_diag = job.jobDispatcherErrorDiag

                        try:
                            _logger.debug(
                                "Watcher will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(
                                self.taskBuffer, job.PandaID, source,
                                error_code, error_diag, job.attemptNr)
                            _logger.debug("apply_retrial_rules is back")
                        except Exception as e:
                            _logger.debug(
                                "apply_retrial_rules excepted and needs to be investigated (%s)"
                                % (e))

                        cThr = Closer(self.taskBuffer, destDBList, job)
                        cThr.start()
                        cThr.join()
                    _logger.debug('%s end' % job.PandaID)
                    return
                # single action
                if self.single:
                    return
                # sleep
                time.sleep(60 * self.sleepTime)
        except:
            type, value, traceBack = sys.exc_info()
            _logger.error("run() : %s %s" % (type, value))
            return
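
The LFN rewrite applied above when an analysis job is reset (strip any trailing '.N', append the new attempt number, and patch every quoted or encoded occurrence in jobParameters) can be exercised in isolation. Below is a simplified sketch reusing the same regular expressions; the helper name, the sample LFN and the sample parameter string are made up for illustration.

import re

def renameForAttempt(oldName, attemptNr, jobParameters):
    # drop a trailing '.N' and append the new attempt number
    newName = re.sub("\.\d+$", "", oldName)
    newName = "%s.%d" % (newName, attemptNr)
    # same separator pattern as in the snippet, so names wrapped in quotes,
    # %20 or ':' are caught without touching unrelated substrings
    sepPatt = "(\'|\"|%20|:)" + oldName + "(\'|\"|%20| )"
    for match in re.findall(sepPatt, jobParameters):
        oldPatt = match[0] + oldName + match[-1]
        newPatt = match[0] + newName + match[-1]
        jobParameters = re.sub(oldPatt, newPatt, jobParameters)
    return newName, jobParameters

# example: second reset of a log file referenced in the job parameters
print(renameForAttempt("job.log.tgz.1", 2, "-o 'job.log.tgz.1' --x y"))
# -> ('job.log.tgz.2', "-o 'job.log.tgz.2' --x y")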