else:
    # activate
    if useLib and libStatus == 'ready' and (not libGUID in [None,'']) and (not libDSName in [None,'']):
        # update GUID
        tmpLog.debug(" set GUID:%s for %s" % (libGUID,libLFN))
        #retG = taskBuffer.setGUIDs([{'lfn':libLFN,'guid':libGUID}])
        # FIXME
        retG = True
        if not retG:
            tmpLog.error(" failed to update GUID for %s" % libLFN)
        else:
            # get PandaID with lib.tgz
            #ids = taskBuffer.updateInFilesReturnPandaIDs(libDSName,'ready')
            ids = []
            # get jobs
            jobs = taskBuffer.peekJobs(ids,fromActive=False,fromArchived=False,fromWaiting=False)
            # remove None and unknown
            acJobs = []
            for job in jobs:
                if job == None or job.jobStatus == 'unknown':
                    continue
                acJobs.append(job)
            # activate
            tmpLog.debug(" -> activate downstream jobs")
            #taskBuffer.activateJobs(acJobs)
    else:
        # wait
        tmpLog.debug(" -> wait")
varMap = {}
varMap[':prodSourceLabel'] = 'user'
varMap[':jobDefinitionID'] = jobDefinitionID
def run(self):
    self.lock.acquire()
    try:
        for vuid,name,modDate in self.datasets:
            _logger.debug("start %s %s" % (modDate,name))
            self.proxyLock.acquire()
            retF,resF = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ lfn FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND NOT status IN (:status1,:status2,:status3,:status4,:status5)",
                                             {':destinationDBlock':name,':status1':'ready',':status2':'failed',
                                              ':status3':'skipped',':status4':'merging',
                                              ':status5':'finished'})
            self.proxyLock.release()
            if retF < 0:
                _logger.error("SQL error")
            else:
                # no files in filesTable
                if len(resF) == 0:
                    _logger.debug("freeze %s " % name)
                    dsExists = True
                    if name.startswith('pandaddm_') or name.startswith('user.') or name.startswith('group.') \
                           or name.startswith('hc_test.') or name.startswith('panda.um.'):
                        dsExists = False
                    if name.startswith('panda.um.'):
                        self.proxyLock.acquire()
                        retMer,resMer = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND status IN (:statusM,:statusF) ",
                                                             {':destinationDBlock':name,
                                                              ':statusM':'merging',
                                                              ':statusF':'failed'})
                        self.proxyLock.release()
                        if resMer != None and len(resMer)>0:
                            mergeID = resMer[0][0]
                            # get merging jobs
                            self.proxyLock.acquire()
                            mergingJobs = taskBuffer.peekJobs([mergeID],fromDefined=False,fromArchived=False,fromWaiting=False)
                            self.proxyLock.release()
                            mergeJob = mergingJobs[0]
                            if mergeJob != None:
                                tmpDestDBlocks = []
                                # get destDBlock
                                for tmpFile in mergeJob.Files:
                                    if tmpFile.type in ['output','log']:
                                        if not tmpFile.destinationDBlock in tmpDestDBlocks:
                                            tmpDestDBlocks.append(tmpFile.destinationDBlock)
                                # run
                                _logger.debug("start JEDI closer for %s " % name)
                                self.proxyLock.acquire()
                                cThr = Closer(taskBuffer,tmpDestDBlocks,mergeJob)
                                cThr.start()
                                cThr.join()
                                self.proxyLock.release()
                                _logger.debug("end JEDI closer for %s " % name)
                                continue
                            else:
                                _logger.debug("failed to get merging job for %s " % name)
                        else:
                            _logger.debug("failed to get merging file for %s " % name)
                        status,out = 0,''
                    elif dsExists:
                        # check if dataset exists
                        status,out = rucioAPI.getMetaData(name)
                        if status == True:
                            if out != None:
                                status,out = ddm.DQ2.main('freezeDataset',name)
                            else:
                                # dataset not exist
                                status,out = 0,''
                                dsExists = False
                    else:
                        status,out = 0,''
                    if status != 0 and out.find('DQFrozenDatasetException') == -1 and \
                           out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \
                           out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1:
                        _logger.error('{0} failed to freeze with {1}'.format(name,out))
                    else:
                        self.proxyLock.acquire()
                        varMap = {}
                        varMap[':vuid'] = vuid
                        varMap[':status'] = 'completed'
                        taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid", varMap)
                        self.proxyLock.release()
                        if name.startswith('pandaddm_') or name.startswith('panda.um.') or not dsExists:
                            continue
                        # set tobedeleted to dis
                        setTobeDeletedToDis(name)
                        # count # of files
                        status,out = ddm.DQ2.main('getNumberOfFiles',name)
                        if status != 0:
                            if not 'DQUnknownDatasetException' in out:
                                _logger.error(out)
                        else:
                            _logger.debug(out)
                            try:
                                nFile = int(out)
                                _logger.debug(nFile)
                                if nFile == 0:
                                    # erase dataset
                                    _logger.debug('erase %s' % name)
                                    status,out = ddm.DQ2.main('eraseDataset',name)
                                    _logger.debug('OK with %s' % name)
                            except:
                                pass
                else:
                    _logger.debug("wait %s " % name)
                    self.proxyLock.acquire()
                    taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE WHERE vuid=:vuid", {':vuid':vuid})
                    self.proxyLock.release()
            _logger.debug("end %s " % name)
    except:
        pass
    self.pool.remove(self)
    self.lock.release()
def run(self):
    self.lock.acquire()
    try:
        # get jobs from DB
        ids = self.ids
        self.proxyLock.acquire()
        jobs = taskBuffer.peekJobs(ids,fromDefined=False,fromArchived=False,fromWaiting=False)
        self.proxyLock.release()
        upJobs = []
        finJobs = []
        for job in jobs:
            if job == None or job.jobStatus == 'unknown':
                continue
            # use BNL by default
            dq2URL = siteMapper.getSite('BNL_ATLAS_1').dq2url
            dq2SE = []
            # get LFC and SEs
            if job.prodSourceLabel == 'user' and not siteMapper.siteSpecList.has_key(job.destinationSE):
                # using --destSE for analysis job to transfer output
                try:
                    dq2URL = 'rucio://atlas-rucio.cern.ch:/grid/atlas'
                    match = re.search('.+://([^:/]+):*\d*/*',dataservice.DDM.toa.getSiteProperty(job.destinationSE,'srm')[-1])
                    if match != None:
                        dq2SE.append(match.group(1))
                except:
                    type, value, traceBack = sys.exc_info()
                    _logger.error("%s Failed to get DQ2/SE with %s %s" % (job.PandaID,type,value))
                    continue
            elif siteMapper.checkCloud(job.cloud):
                # normal production jobs
                if DataServiceUtils.checkJobDestinationSE(job) == None:
                    tmpDstID = siteMapper.getCloud(job.cloud)['dest']
                else:
                    tmpDstID = job.destinationSE
                tmpDstSite = siteMapper.getSite(tmpDstID)
                # get catalog URL
                dq2URL = 'rucio://atlas-rucio.cern.ch:/grid/atlas'
                if tmpDstSite.se != None:
                    for tmpDstSiteSE in tmpDstSite.se.split(','):
                        match = re.search('.+://([^:/]+):*\d*/*',tmpDstSiteSE)
                        if match != None:
                            dq2SE.append(match.group(1))
            # get LFN list
            lfns = []
            guids = []
            scopes = []
            nTokens = 0
            for file in job.Files:
                # only output files are checked
                if file.type == 'output' or file.type == 'log':
                    if file.status == 'nooutput':
                        continue
                    if DataServiceUtils.getDistributedDestination(file.destinationDBlockToken) != None:
                        continue
                    lfns.append(file.lfn)
                    guids.append(file.GUID)
                    scopes.append(file.scope)
                    nTokens += len(file.destinationDBlockToken.split(','))
            # get files in LRC
            _logger.debug("%s Cloud:%s DQ2URL:%s" % (job.PandaID,job.cloud,dq2URL))
            okFiles = brokerage.broker_util.getFilesFromLRC(lfns,dq2URL,guids,dq2SE,
                                                            getPFN=True,scopeList=scopes)
            # count files
            nOkTokens = 0
            for okLFN,okPFNs in okFiles.iteritems():
                nOkTokens += len(okPFNs)
            # check all files are ready
            _logger.debug("%s nToken:%s nOkToken:%s" % (job.PandaID,nTokens,nOkTokens))
            if nTokens <= nOkTokens:
                _logger.debug("%s Finisher : Finish" % job.PandaID)
                for file in job.Files:
                    if file.type == 'output' or file.type == 'log':
                        if file.status != 'nooutput':
                            file.status = 'ready'
                # append to run Finisher
                finJobs.append(job)
            else:
                endTime = job.endTime
                if endTime == 'NULL':
                    endTime = job.startTime
                # priority-dependent timeout
                tmpCloudSpec = siteMapper.getCloud(job.cloud)
                if job.currentPriority >= 800 and (not job.prodSourceLabel in ['user']):
                    if tmpCloudSpec.has_key('transtimehi'):
                        timeOutValue = tmpCloudSpec['transtimehi']
                    else:
                        timeOutValue = 1
                else:
                    if tmpCloudSpec.has_key('transtimelo'):
                        timeOutValue = tmpCloudSpec['transtimelo']
                    else:
                        timeOutValue = 2
                # protection
                if timeOutValue < 1:
                    timeOutValue = 1
                timeOut = self.timeNow - datetime.timedelta(days=timeOutValue)
                _logger.debug("%s Priority:%s Limit:%s End:%s" % (job.PandaID,job.currentPriority,str(timeOut),str(endTime)))
                if endTime < timeOut:
                    # timeout
                    _logger.debug("%s Finisher : Kill" % job.PandaID)
                    strMiss = ''
                    for lfn in lfns:
                        if not lfn in okFiles:
                            strMiss += ' %s' % lfn
                    job.jobStatus = 'failed'
                    job.taskBufferErrorCode = taskbuffer.ErrorCode.EC_Transfer
                    job.taskBufferErrorDiag = 'transfer timeout for '+strMiss
                    guidMap = {}
                    for file in job.Files:
                        # set file status
                        if file.status == 'transferring' or file.type in ['log','output']:
                            file.status = 'failed'
                        # collect GUIDs to delete files from _tid datasets
                        if file.type == 'output' or file.type == 'log':
                            if not guidMap.has_key(file.destinationDBlock):
                                guidMap[file.destinationDBlock] = []
                            guidMap[file.destinationDBlock].append(file.GUID)
                else:
                    # wait
                    _logger.debug("%s Finisher : Wait" % job.PandaID)
                    for lfn in lfns:
                        if not lfn in okFiles:
                            _logger.debug("%s -> %s" % (job.PandaID,lfn))
            upJobs.append(job)
        # update
        _logger.debug("updating ...")
        self.proxyLock.acquire()
        taskBuffer.updateJobs(upJobs,False)
        self.proxyLock.release()
        # run Finisher
        for job in finJobs:
            fThr = Finisher(taskBuffer,None,job)
            fThr.start()
            fThr.join()
        _logger.debug("done")
        time.sleep(1)
    except:
        errtype,errvalue = sys.exc_info()[:2]
        errStr = "FinisherThr failed with %s %s" % (errtype,errvalue)
        errStr += traceback.format_exc()
        _logger.error(errStr)
    self.pool.remove(self)
    self.lock.release()
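# --- Illustration (not part of the original script) ---------------------------
# A minimal, standalone sketch of the priority-dependent transfer timeout used
# in the FinisherThr block above, assuming a plain dict stands in for the cloud
# configuration. The function name and the example values are hypothetical.
import datetime

def get_transfer_timeout(now, currentPriority, prodSourceLabel, cloudSpec):
    """Return the cutoff time before which a still-transferring job times out."""
    if currentPriority >= 800 and prodSourceLabel not in ['user']:
        # high-priority production jobs get the short limit
        timeOutValue = cloudSpec.get('transtimehi', 1)
    else:
        timeOutValue = cloudSpec.get('transtimelo', 2)
    # protection against non-positive values
    if timeOutValue < 1:
        timeOutValue = 1
    return now - datetime.timedelta(days=timeOutValue)

# example: a priority-900 production job with a 3-day high-priority limit
cutoff = get_transfer_timeout(datetime.datetime.utcnow(), 900, 'managed',
                              {'transtimehi': 3, 'transtimelo': 7})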
def run(self):
    self.lock.acquire()
    try:
        # get jobs from DB
        ids = self.ids
        self.proxyLock.acquire()
        jobs = taskBuffer.peekJobs(ids,fromActive=False,fromArchived=False,fromWaiting=False)
        self.proxyLock.release()
        actJobs = []
        for tmpJob in jobs:
            if tmpJob == None or tmpJob.jobStatus == 'unknown':
                continue
            # get LFN list
            lfns = []
            guids = []
            scopes = []
            for tmpFile in tmpJob.Files:
                # only input files are checked
                if tmpFile.type == 'input' and tmpFile.status != 'ready':
                    lfns.append(tmpFile.lfn)
                    scopes.append(tmpFile.scope)
            # get file replicas
            _logger.debug("%s check input files at %s" % (tmpJob.PandaID,tmpJob.computingSite))
            tmpStat,okFiles = rucioAPI.listFileReplicas(scopes,lfns)
            if not tmpStat:
                pass
            else:
                # check if locally available
                siteSpec = siteMapper.getSite(tmpJob.computingSite)
                allOK = True
                for tmpFile in tmpJob.Files:
                    # only input
                    if tmpFile.type == 'input' and tmpFile.status != 'ready':
                        # check RSEs
                        if tmpFile.lfn in okFiles:
                            for rse in okFiles[tmpFile.lfn]:
                                if siteSpec.ddm_endpoints.isAssociated(rse) and \
                                        siteSpec.ddm_endpoints.getEndPoint(rse)['is_tape'] == 'N':
                                    tmpFile.status = 'ready'
                                    break
                        # missing
                        if tmpFile.status != 'ready':
                            allOK = False
                            _logger.debug("%s skip since %s:%s is missing" % (tmpJob.PandaID,tmpFile.scope,tmpFile.lfn))
                            break
                if not allOK:
                    continue
                # append to run activator
                _logger.debug("%s to activate" % tmpJob.PandaID)
                actJobs.append(tmpJob)
        # update
        _logger.debug("activating ...")
        self.proxyLock.acquire()
        taskBuffer.activateJobs(actJobs)
        self.proxyLock.release()
        _logger.debug("done")
        time.sleep(1)
    except:
        errtype,errvalue = sys.exc_info()[:2]
        _logger.error("ActivatorThr failed with %s %s" % (errtype,errvalue))
    self.pool.remove(self)
    self.lock.release()
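# --- Illustration (not part of the original script) ---------------------------
# The disk-replica check performed by ActivatorThr, redone with plain data
# structures: `replicaMap` stands in for the rucioAPI.listFileReplicas reply
# (LFN -> list of RSEs) and `endpointInfo` for the site's DDM endpoint
# configuration. Names and structures here are hypothetical, not the real
# PanDA/Rucio objects.
def all_inputs_on_disk(inputLfns, replicaMap, endpointInfo):
    """True if every input LFN has a replica on an associated non-tape RSE."""
    for lfn in inputLfns:
        onDisk = False
        for rse in replicaMap.get(lfn, []):
            info = endpointInfo.get(rse)
            if info is not None and info.get('is_tape') == 'N':
                onDisk = True
                break
        if not onDisk:
            return False
    return True

# example: the only replica is on tape, so the job would not be activated
print(all_inputs_on_disk(['EVNT.01_tid1._0001.pool.root'],
                         {'EVNT.01_tid1._0001.pool.root': ['SITE_DATATAPE']},
                         {'SITE_DATADISK': {'is_tape': 'N'},
                          'SITE_DATATAPE': {'is_tape': 'Y'}}))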
def run(self):
    self.lock.acquire()
    try:
        # get jobs from DB
        ids = self.ids
        self.proxyLock.acquire()
        jobs = taskBuffer.peekJobs(ids,fromDefined=False,fromArchived=False,fromWaiting=False)
        self.proxyLock.release()
        upJobs = []
        finJobs = []
        for job in jobs:
            if job == None or job.jobStatus == 'unknown':
                continue
            # use BNL by default
            dq2URL = siteMapper.getSite('BNL_ATLAS_1').dq2url
            dq2SE = []
            # get LFC and SEs
            if job.prodSourceLabel == 'user' and not siteMapper.siteSpecList.has_key(job.destinationSE):
                # using --destSE for analysis job to transfer output
                try:
                    dq2URL = dataservice.DDM.toa.getLocalCatalog(job.destinationSE)[-1]
                    match = re.search('.+://([^:/]+):*\d*/*',dataservice.DDM.toa.getSiteProperty(job.destinationSE,'srm')[-1])
                    if match != None:
                        dq2SE.append(match.group(1))
                except:
                    type, value, traceBack = sys.exc_info()
                    _logger.error("%s Failed to get DQ2/SE with %s %s" % (job.PandaID,type,value))
                    continue
            elif siteMapper.checkCloud(job.cloud):
                # normal production jobs
                if DataServiceUtils.checkJobDestinationSE(job) == None:
                    tmpDstID = siteMapper.getCloud(job.cloud)['dest']
                else:
                    tmpDstID = job.destinationSE
                tmpDstSite = siteMapper.getSite(tmpDstID)
                # get catalog URL
                tmpStat,dq2URL = dataservice.DDM.toa.getLocalCatalog(tmpDstSite.ddm)
                if tmpDstSite.se != None:
                    for tmpDstSiteSE in tmpDstSite.se.split(','):
                        match = re.search('.+://([^:/]+):*\d*/*',tmpDstSiteSE)
                        if match != None:
                            dq2SE.append(match.group(1))
            # get LFN list
            lfns = []
            guids = []
            scopes = []
            nTokens = 0
            for file in job.Files:
                # only output files are checked
                if file.type == 'output' or file.type == 'log':
                    lfns.append(file.lfn)
                    guids.append(file.GUID)
                    scopes.append(file.scope)
                    nTokens += len(file.destinationDBlockToken.split(','))
            # get files in LRC
            _logger.debug("%s Cloud:%s DQ2URL:%s" % (job.PandaID,job.cloud,dq2URL))
            okFiles = brokerage.broker_util.getFilesFromLRC(lfns,dq2URL,guids,dq2SE,
                                                            getPFN=True,scopeList=scopes)
            # count files
            nOkTokens = 0
            for okLFN,okPFNs in okFiles.iteritems():
                nOkTokens += len(okPFNs)
            # check all files are ready
            _logger.debug("%s nToken:%s nOkToken:%s" % (job.PandaID,nTokens,nOkTokens))
            if nTokens <= nOkTokens:
                _logger.debug("%s Finisher : Finish" % job.PandaID)
                for file in job.Files:
                    if file.type == 'output' or file.type == 'log':
                        file.status = 'ready'
                # append to run Finisher
                finJobs.append(job)
            else:
                endTime = job.endTime
                if endTime == 'NULL':
                    endTime = job.startTime
                # priority-dependent timeout
                tmpCloudSpec = siteMapper.getCloud(job.cloud)
                if job.currentPriority >= 800 and (not job.prodSourceLabel in ['user']):
                    if tmpCloudSpec.has_key('transtimehi'):
                        timeOutValue = tmpCloudSpec['transtimehi']
                    else:
                        timeOutValue = 1
                else:
                    if tmpCloudSpec.has_key('transtimelo'):
                        timeOutValue = tmpCloudSpec['transtimelo']
                    else:
                        timeOutValue = 2
                # protection
                if timeOutValue < 1:
                    timeOutValue = 1
                timeOut = self.timeNow - datetime.timedelta(days=timeOutValue)
                _logger.debug("%s Priority:%s Limit:%s End:%s" % (job.PandaID,job.currentPriority,str(timeOut),str(endTime)))
                if endTime < timeOut:
                    # timeout
                    _logger.debug("%s Finisher : Kill" % job.PandaID)
                    strMiss = ''
                    for lfn in lfns:
                        if not lfn in okFiles:
                            strMiss += ' %s' % lfn
                    job.jobStatus = 'failed'
                    job.taskBufferErrorCode = taskbuffer.ErrorCode.EC_Transfer
                    job.taskBufferErrorDiag = 'transfer timeout for '+strMiss
                    guidMap = {}
                    for file in job.Files:
                        # set file status
                        if file.status == 'transferring':
                            file.status = 'failed'
                        # collect GUIDs to delete files from _tid datasets
                        if file.type == 'output' or file.type == 'log':
                            if not guidMap.has_key(file.destinationDBlock):
                                guidMap[file.destinationDBlock] = []
                            guidMap[file.destinationDBlock].append(file.GUID)
                else:
                    # wait
                    _logger.debug("%s Finisher : Wait" % job.PandaID)
                    for lfn in lfns:
                        if not lfn in okFiles:
                            _logger.debug("%s -> %s" % (job.PandaID,lfn))
            upJobs.append(job)
        # update
        _logger.debug("updating ...")
        self.proxyLock.acquire()
        taskBuffer.updateJobs(upJobs,False)
        self.proxyLock.release()
        # run Finisher
        for job in finJobs:
            fThr = Finisher(taskBuffer,None,job)
            fThr.start()
            fThr.join()
        _logger.debug("done")
        time.sleep(1)
    except:
        errtype,errvalue = sys.exc_info()[:2]
        _logger.error("FinisherThr failed with %s %s" % (errtype,errvalue))
    self.pool.remove(self)
    self.lock.release()
def run(self):
    self.lock.acquire()
    try:
        for vuid,name,modDate in self.datasets:
            _logger.debug("start %s %s" % (modDate,name))
            self.proxyLock.acquire()
            retF,resF = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ lfn FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND NOT status IN (:status1,:status2,:status3,:status4,:status5)",
                                             {':destinationDBlock':name,':status1':'ready',':status2':'failed',
                                              ':status3':'skipped',':status4':'merging',
                                              ':status5':'finished'})
            self.proxyLock.release()
            if retF < 0:
                _logger.error("SQL error")
            else:
                # no files in filesTable
                if len(resF) == 0:
                    _logger.debug("freeze %s " % name)
                    if name.startswith('panda.um.'):
                        self.proxyLock.acquire()
                        retMer,resMer = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND status IN (:statusM,:statusF) ",
                                                             {':destinationDBlock':name,
                                                              ':statusM':'merging',
                                                              ':statusF':'failed'})
                        self.proxyLock.release()
                        if resMer != None and len(resMer)>0:
                            mergeID = resMer[0][0]
                            # get merging jobs
                            self.proxyLock.acquire()
                            mergingJobs = taskBuffer.peekJobs([mergeID],fromDefined=False,fromArchived=False,fromWaiting=False)
                            self.proxyLock.release()
                            mergeJob = mergingJobs[0]
                            if mergeJob != None:
                                tmpDestDBlocks = []
                                # get destDBlock
                                for tmpFile in mergeJob.Files:
                                    if tmpFile.type in ['output','log']:
                                        if not tmpFile.destinationDBlock in tmpDestDBlocks:
                                            tmpDestDBlocks.append(tmpFile.destinationDBlock)
                                # run
                                _logger.debug("start JEDI closer for %s " % name)
                                self.proxyLock.acquire()
                                cThr = Closer(taskBuffer,tmpDestDBlocks,mergeJob)
                                cThr.start()
                                cThr.join()
                                self.proxyLock.release()
                                _logger.debug("end JEDI closer for %s " % name)
                                continue
                            else:
                                _logger.debug("failed to get merging job for %s " % name)
                        else:
                            _logger.debug("failed to get merging file for %s " % name)
                        status,out = 0,''
                    elif not name.startswith('pandaddm_'):
                        status,out = ddm.DQ2.main('freezeDataset',name)
                    else:
                        status,out = 0,''
                    if status != 0 and out.find('DQFrozenDatasetException') == -1 and \
                           out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \
                           out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1:
                        _logger.error(out)
                    else:
                        self.proxyLock.acquire()
                        varMap = {}
                        varMap[':vuid'] = vuid
                        varMap[':status'] = 'completed'
                        taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid", varMap)
                        self.proxyLock.release()
                        if name.startswith('pandaddm_') or name.startswith('panda.um.'):
                            continue
                        # set tobedeleted to dis
                        setTobeDeletedToDis(name)
                        # count # of files
                        status,out = ddm.DQ2.main('getNumberOfFiles',name)
                        if status != 0:
                            if not 'DQUnknownDatasetException' in out:
                                _logger.error(out)
                        else:
                            _logger.debug(out)
                            try:
                                nFile = int(out)
                                _logger.debug(nFile)
                                if nFile == 0:
                                    # erase dataset
                                    _logger.debug('erase %s' % name)
                                    status,out = ddm.DQ2.main('eraseDataset',name)
                                    _logger.debug('OK with %s' % name)
                            except:
                                pass
                else:
                    _logger.debug("wait %s " % name)
                    self.proxyLock.acquire()
                    taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE WHERE vuid=:vuid", {':vuid':vuid})
                    self.proxyLock.release()
            _logger.debug("end %s " % name)
    except:
        pass
    self.pool.remove(self)
    self.lock.release()
# start sender
mailSender = MailSender()
mailSender.start()

# session for co-jumbo jobs
tmpLog.debug("co-jumbo session")
try:
    ret = taskBuffer.getCoJumboJobsToBeFinished(30,0,1000)
    if ret is None:
        tmpLog.debug("failed to get co-jumbo jobs to finish")
    else:
        coJumboA,coJumboD,coJumboW,coJumboTokill = ret
        tmpLog.debug("finish {0} co-jumbo jobs in Active".format(len(coJumboA)))
        if len(coJumboA) > 0:
            jobSpecs = taskBuffer.peekJobs(coJumboA,fromDefined=False,fromActive=True,fromArchived=False,fromWaiting=False)
            for jobSpec in jobSpecs:
                fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(jobSpec)
                if not fileCheckInJEDI:
                    jobSpec.jobStatus = 'closed'
                    jobSpec.jobSubStatus = 'cojumbo_wrong'
                    jobSpec.taskBufferErrorCode = taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                taskBuffer.archiveJobs([jobSpec],False)
        tmpLog.debug("finish {0} co-jumbo jobs in Defined".format(len(coJumboD)))
        if len(coJumboD) > 0:
            jobSpecs = taskBuffer.peekJobs(coJumboD,fromDefined=True,fromActive=False,fromArchived=False,fromWaiting=False)
            for jobSpec in jobSpecs:
                fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(jobSpec)
                if not fileCheckInJEDI:
                    jobSpec.jobStatus = 'closed'
                    jobSpec.jobSubStatus = 'cojumbo_wrong'
def run(self):
    self.lock.acquire()
    try:
        # loop over all datasets
        for vuid,name,modDate,verNum in self.datasets:
            try:
                try:
                    verNum = int(verNum)
                except:
                    verNum = 0
                _logger.debug("Merge %s %s %s" % (modDate,name,verNum))
                toBeClosed = False
                # close old datasets anyway
                if modDate < timeLimitX or verNum >= self.maxTry:
                    toBeClosed = True
                # check version
                dsSpec = taskBuffer.queryDatasetWithMap({'vuid':vuid})
                if dsSpec == None:
                    _logger.error("failed to get dataset spec for %s:%s" % (name,vuid))
                    continue
                try:
                    if int(dsSpec.version) != verNum+1:
                        _logger.debug("skip %s due to version mismatch %s != %s+1" % (name,dsSpec.version,verNum))
                        continue
                except:
                    _logger.error("failed to convert version='%s' to int for %s" % (dsSpec.version,name))
                    continue
                # get PandaID
                self.proxyLock.acquire()
                proxyS = taskBuffer.proxyPool.getProxy()
                pandaID = proxyS.getPandaIDwithDestDBlock(name)
                taskBuffer.proxyPool.putProxy(proxyS)
                self.proxyLock.release()
                if pandaID == None:
                    _logger.error("failed to find PandaID for %s" % name)
                    toBeClosed = True
                else:
                    # get job
                    self.proxyLock.acquire()
                    pandaJob = taskBuffer.peekJobs([pandaID])[0]
                    self.proxyLock.release()
                    if pandaJob == None:
                        _logger.error("failed to get job for %s PandaID=%s" % (name,pandaID))
                        toBeClosed = True
                    else:
                        # run merger
                        _logger.debug("run merger for %s" % name)
                        merger = Merger(taskBuffer,pandaJob)
                        mRet = merger.run()
                        if mRet == None:
                            _logger.debug("got unrecoverable for %s" % name)
                            toBeClosed = True
                        elif mRet == True:
                            _logger.debug("succeeded for %s" % name)
                            toBeClosed = True
                        else:
                            _logger.debug("failed for %s" % name)
                # close dataset
                if toBeClosed:
                    _logger.debug("close %s" % name)
                    self.proxyLock.acquire()
                    varMap = {}
                    varMap[':vuid'] = vuid
                    varMap[':status'] = 'tobeclosed'
                    taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid", varMap)
                    self.proxyLock.release()
            except:
                errType,errValue = sys.exc_info()[:2]
                _logger.error("Failed %s with %s:%s" % (name,errType,errValue))
    except:
        errType,errValue = sys.exc_info()[:2]
        _logger.error("MergerThr failed with %s:%s" % (errType,errValue))
    self.pool.remove(self)
    self.lock.release()
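# --- Illustration (not part of the original script) ---------------------------
# Every run() above starts with self.lock.acquire() and ends with
# self.pool.remove(self); self.lock.release(), which suggests a shared semaphore
# bounding concurrency plus a pool tracking live workers. Below is a minimal
# stdlib-only sketch of that driver pattern under those assumptions; the
# WorkerPool/DummyThr names are hypothetical, and the real ThreadPool helpers
# used by these daemons are not shown in this section.
import threading

class WorkerPool:
    """Tracks live worker threads, mirroring the pool.add/pool.remove calls above."""
    def __init__(self):
        self._mutex = threading.Lock()
        self.workers = []

    def add(self, worker):
        with self._mutex:
            self.workers.append(worker)

    def remove(self, worker):
        with self._mutex:
            self.workers.remove(worker)

class DummyThr(threading.Thread):
    def __init__(self, lock, pool):
        threading.Thread.__init__(self)
        self.lock = lock   # shared semaphore bounding how many workers run at once
        self.pool = pool

    def run(self):
        self.lock.acquire()          # wait for a free slot, as the run() methods above do
        try:
            pass                     # real work (DB queries, DDM calls) would go here
        finally:
            self.pool.remove(self)   # deregister from the pool
            self.lock.release()      # free the slot for the next worker

sem = threading.Semaphore(3)
pool = WorkerPool()
allThreads = []
for _ in range(10):
    thr = DummyThr(sem, pool)
    pool.add(thr)
    thr.start()
    allThreads.append(thr)
for thr in allThreads:
    thr.join()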