Example #1
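What appears to be activation logic from the ATLAS PanDA server: once a build job's lib.tgz output is ready and has a GUID, the downstream jobs are looked up with taskBuffer.peekJobs (skipping the Active, Archived, and Waiting tables) so they can be activated; otherwise the session keeps waiting. Example #4 shows the same block with more surrounding context.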
 if useLib and libStatus == 'ready' and (not libGUID in [None, '']) and (not libDSName in [None, '']):
     # update GUID
     tmpLog.debug("  set GUID:%s for %s" % (libGUID, libLFN))
     #retG = taskBuffer.setGUIDs([{'lfn':libLFN,'guid':libGUID}])
     # FIXME
     retG = True
     if not retG:
         tmpLog.error("  failed to update GUID for %s" % libLFN)
     else:
         # get PandaID with lib.tgz
         #ids = taskBuffer.updateInFilesReturnPandaIDs(libDSName,'ready')
         ids = []
         # get jobs
         jobs = taskBuffer.peekJobs(ids,
                                    fromActive=False,
                                    fromArchived=False,
                                    fromWaiting=False)
         # remove None and unknown
         acJobs = []
         for job in jobs:
             if job == None or job.jobStatus == 'unknown':
                 continue
             acJobs.append(job)
         # activate
         tmpLog.debug("  -> activate downstream jobs")
         #taskBuffer.activateJobs(acJobs)
 else:
     # wait
     tmpLog.debug("  -> wait")
     varMap = {}
     varMap[':prodSourceLabel'] = 'user'
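Throughout these examples taskBuffer.peekJobs takes a list of PandaIDs plus fromDefined/fromActive/fromArchived/fromWaiting flags that select which job tables are searched, and the callers then drop entries that come back as None or with jobStatus 'unknown'. A minimal sketch of that recurring idiom, under the signature the examples suggest (the helper name is mine, not PanDA's):

    def peek_resolved_jobs(task_buffer, panda_ids, **table_flags):
        """Peek jobs by PandaID and drop unresolved entries.

        Assumes the peekJobs behaviour seen in the examples: it returns
        one entry per ID, with None (or a spec whose jobStatus is
        'unknown') for IDs it cannot resolve in the selected tables.
        """
        jobs = task_buffer.peekJobs(panda_ids, **table_flags)
        return [j for j in jobs if j is not None and j.jobStatus != 'unknown']

    # usage, mirroring Example #1:
    # acJobs = peek_resolved_jobs(taskBuffer, ids,
    #                             fromActive=False, fromArchived=False,
    #                             fromWaiting=False)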
Example #2
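A maintenance pass over co-jumbo jobs, apparently from a PanDA archival agent: after the mail sender is started, jobs to be finished are fetched in batches, peeked from the Active or Defined tables, closed with sub-status cojumbo_wrong when their input file status is inconsistent in JEDI, and archived.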
# start sender
mailSender = MailSender()
mailSender.start()


# session for co-jumbo jobs
tmpLog.debug("co-jumbo session")
try:
    ret = taskBuffer.getCoJumboJobsToBeFinished(30,0,1000)
    if ret is None:
        tmpLog.debug("failed to get co-jumbo jobs to finish")
    else:
        coJumboA,coJumboD,coJumboW = ret
        tmpLog.debug("finish {0} co-jumbo jobs in Active".format(len(coJumboA)))
        if len(coJumboA) > 0:
            jobSpecs = taskBuffer.peekJobs(coJumboA,fromDefined=False,fromActive=True,fromArchived=False,fromWaiting=False)
            for jobSpec in jobSpecs:
                fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(jobSpec)
                if not fileCheckInJEDI:
                    jobSpec.jobStatus = 'closed'
                    jobSpec.jobSubStatus = 'cojumbo_wrong'
                    jobSpec.taskBufferErrorCode = taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                taskBuffer.archiveJobs([jobSpec],False)
        tmpLog.debug("finish {0} co-jumbo jobs in Defined".format(len(coJumboD)))
        if len(coJumboD) > 0:
            jobSpecs = taskBuffer.peekJobs(coJumboD,fromDefined=True,fromActive=False,fromArchived=False,fromWaiting=False)
            for jobSpec in jobSpecs:
                fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(jobSpec)
                if not fileCheckInJEDI:
                    jobSpec.jobStatus = 'closed'
                    jobSpec.jobSubStatus = 'cojumbo_wrong'
Example #3
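The run method of a dataset-merger thread: for each dataset it checks the version number, locates the producing job via getPandaIDwithDestDBlock and peekJobs, runs the Merger, and sets the dataset to tobeclosed when merging succeeds, is unrecoverable, or the dataset is too old or has exceeded maxTry.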
 def run(self):
     self.lock.acquire()
     try:
         # loop over all datasets
         for vuid, name, modDate, verNum in self.datasets:
             try:
                 try:
                     verNum = int(verNum)
                 except:
                     verNum = 0
                 _logger.debug("Merge %s %s %s" % (modDate, name, verNum))
                 toBeClosed = False
                 # close old datasets anyway
                 if modDate < timeLimitX or verNum >= self.maxTry:
                     toBeClosed = True
                 # check version
                 dsSpec = taskBuffer.queryDatasetWithMap({'vuid': vuid})
                 if dsSpec == None:
                     _logger.error("failed to get dataset spec for %s:%s" %
                                   (name, vuid))
                     continue
                 try:
                     if int(dsSpec.version) != verNum + 1:
                         _logger.debug(
                             "skip %s due to version mismatch %s != %s+1" %
                             (name, dsSpec.version, verNum))
                         continue
                 except:
                     _logger.error(
                         "failed to convert version='%s' to int for %s" %
                         (dsSpec.version, name))
                     continue
                 # get PandaID
                 self.proxyLock.acquire()
                 proxyS = taskBuffer.proxyPool.getProxy()
                 pandaID = proxyS.getPandaIDwithDestDBlock(name)
                 taskBuffer.proxyPool.putProxy(proxyS)
                 self.proxyLock.release()
                 if pandaID == None:
                     _logger.error("failed to find PandaID for %s" % name)
                     toBeClosed = True
                 else:
                     # get job
                     self.proxyLock.acquire()
                     pandaJob = taskBuffer.peekJobs([pandaID])[0]
                     self.proxyLock.release()
                     if pandaJob == None:
                         _logger.error(
                             "failed to get job for %s PandaID=%s" %
                             (name, pandaID))
                         toBeClosed = True
                     else:
                         # run merger
                         _logger.debug("run merger for %s" % name)
                         merger = Merger(taskBuffer, pandaJob)
                         mRet = merger.run()
                         if mRet == None:
                             _logger.debug("got unrecoverable for %s" %
                                           name)
                             toBeClosed = True
                         elif mRet == True:
                             _logger.debug("succeeded for %s" % name)
                             toBeClosed = True
                         else:
                             _logger.debug("failed for %s" % name)
                 # close dataset
                 if toBeClosed:
                     _logger.debug("close %s" % name)
                     self.proxyLock.acquire()
                     varMap = {}
                     varMap[':vuid'] = vuid
                     varMap[':status'] = 'tobeclosed'
                     taskBuffer.querySQLS(
                         "UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid",
                         varMap)
                     self.proxyLock.release()
             except:
                 errType, errValue = sys.exc_info()[:2]
                 _logger.error("Failed %s with %s:%s" %
                               (name, errType, errValue))
     except:
         errType, errValue = sys.exc_info()[:2]
         _logger.error("MergerThr failed with %s:%s" % (errType, errValue))
     self.pool.remove(self)
     self.lock.release()
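Every database access in these threads is bracketed by manual proxyLock.acquire() and release() calls; if the guarded call raises, the release on the normal path is skipped and the lock stays held (the broad except above masks this). A sketch of the same discipline made exception-safe (the helper name is mine; a plain threading.Lock can also be used directly as "with lock:"):

    from contextlib import contextmanager

    @contextmanager
    def held(lock):
        """Hold lock for the duration of a with-block, releasing on any exit."""
        lock.acquire()
        try:
            yield
        finally:
            lock.release()

    # mirrors the getPandaIDwithDestDBlock block in Example #3:
    # with held(self.proxyLock):
    #     proxyS = taskBuffer.proxyPool.getProxy()
    #     pandaID = proxyS.getPandaIDwithDestDBlock(name)
    #     taskBuffer.proxyPool.putProxy(proxyS)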
Example #4
			else:
				# activate
				if useLib and libStatus == 'ready' and (not libGUID in [None,'']) and (not libDSName in [None,'']):
					# update GUID
					tmpLog.debug("  set GUID:%s for %s" % (libGUID,libLFN))
					#retG = taskBuffer.setGUIDs([{'lfn':libLFN,'guid':libGUID}])
					# FIXME
					retG = True
					if not retG:
						tmpLog.error("  failed to update GUID for %s" % libLFN)
					else:
						# get PandaID with lib.tgz
						#ids = taskBuffer.updateInFilesReturnPandaIDs(libDSName,'ready')
						ids = []
						# get jobs
						jobs = taskBuffer.peekJobs(ids,fromActive=False,fromArchived=False,fromWaiting=False)
						# remove None and unknown
						acJobs = []
						for job in jobs:
							if job == None or job.jobStatus == 'unknown':
								continue
							acJobs.append(job)
						# activate
						tmpLog.debug("  -> activate downstream jobs")
						#taskBuffer.activateJobs(acJobs)
				else:
					# wait
					tmpLog.debug("  -> wait")
					varMap = {}
					varMap[':prodSourceLabel'] = 'user'
					varMap[':jobDefinitionID'] = jobDefinitionID
Example #5
 def run(self):
     self.lock.acquire()
     try:
         for vuid,name,modDate in self.datasets:
             _logger.debug("start %s %s" % (modDate,name))
             self.proxyLock.acquire()
             retF,resF = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ lfn FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND NOT status IN (:status1,:status2,:status3,:status4,:status5)",
                                          {':destinationDBlock':name,':status1':'ready',':status2':'failed',
                                           ':status3':'skipped',':status4':'merging',
                                           ':status5':'finished'})
             self.proxyLock.release()
             if retF<0:
                 _logger.error("SQL error")
             else:
                 # no files in filesTable
                 if len(resF) == 0:
                     _logger.debug("freeze %s " % name)
                     dsExists = True
                     if name.startswith('pandaddm_') or name.startswith('user.') or name.startswith('group.') \
                             or name.startswith('hc_test.') or name.startswith('panda.um.'):
                         dsExists = False
                     if name.startswith('panda.um.'):
                         self.proxyLock.acquire()
                         retMer,resMer = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND status IN (:statusM,:statusF) ",
                                                              {':destinationDBlock':name,
                                                               ':statusM':'merging',
                                                               ':statusF':'failed'})
                         self.proxyLock.release()
                         if resMer != None and len(resMer)>0:
                             mergeID = resMer[0][0]
                             # get merging jobs
                             self.proxyLock.acquire()
                             mergingJobs = taskBuffer.peekJobs([mergeID],fromDefined=False,fromArchived=False,fromWaiting=False)
                             self.proxyLock.release()    
                             mergeJob = mergingJobs[0]
                             if mergeJob != None:
                                 tmpDestDBlocks = []
                                 # get destDBlock
                                 for tmpFile in mergeJob.Files:
                                     if tmpFile.type in ['output','log']:
                                         if not tmpFile.destinationDBlock in tmpDestDBlocks:
                                             tmpDestDBlocks.append(tmpFile.destinationDBlock)
                                 # run
                                 _logger.debug("start JEDI closer for %s " % name)
                                 self.proxyLock.acquire()
                                 cThr = Closer(taskBuffer,tmpDestDBlocks,mergeJob)
                                 cThr.start()
                                 cThr.join()
                                 self.proxyLock.release()
                                 _logger.debug("end JEDI closer for %s " % name)
                                 continue
                             else:
                                 _logger.debug("failed to get merging job for %s " % name)
                         else:
                             _logger.debug("failed to get merging file for %s " % name)
                         status,out = 0,''
                     elif dsExists:
                         # check if dataset exists
                         status,out = rucioAPI.getMetaData(name)
                         if status == True:
                             if out != None:
                                 status,out = ddm.DQ2.main('freezeDataset',name)
                             else:
                                 # dataset not exist
                                 status,out = 0,''
                                 dsExists = False
                     else:
                         status,out = 0,''
                     if status != 0 and out.find('DQFrozenDatasetException') == -1 and \
                            out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \
                            out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1:
                         _logger.error('{0} failed to freeze with {1}'.format(name,out))
                     else:
                         self.proxyLock.acquire()
                         varMap = {}
                         varMap[':vuid'] = vuid
                         varMap[':status'] = 'completed' 
                         taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid",
                                          varMap)
                         self.proxyLock.release()                            
                         if name.startswith('pandaddm_') or name.startswith('panda.um.') or not dsExists:
                             continue
                         # set tobedeleted to dis
                         setTobeDeletedToDis(name)
                         # count # of files
                         status,out = ddm.DQ2.main('getNumberOfFiles',name)
                         if status != 0:
                             if not 'DQUnknownDatasetException' in out:
                                 _logger.error(out)
                         else:
                             _logger.debug(out)                                            
                             try:
                                 nFile = int(out)
                                 _logger.debug(nFile)
                                 if nFile == 0:
                                     # erase dataset
                                     _logger.debug('erase %s' % name)                                
                                     status,out = ddm.DQ2.main('eraseDataset',name)
                                     _logger.debug('OK with %s' % name)
                             except:
                                 pass
                 else:
                     _logger.debug("wait %s " % name)
                     self.proxyLock.acquire()                        
                     taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE WHERE vuid=:vuid", {':vuid':vuid})
                     self.proxyLock.release()                                                    
             _logger.debug("end %s " % name)
     except:
         pass
     self.pool.remove(self)
     self.lock.release()
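The freezer drives raw SQL through taskBuffer.querySQLS(sql, varMap), binding Oracle-style :name placeholders from a dict and getting back a (status, result) pair. A sketch of the completed-status update above as a helper (the function name is mine; the SQL is copied from the example):

    def mark_dataset_completed(task_buffer, vuid):
        """Set one Datasets row to 'completed', as in the freezer above."""
        varMap = {':vuid': vuid, ':status': 'completed'}
        sql = ("UPDATE ATLAS_PANDA.Datasets "
               "SET status=:status,modificationdate=CURRENT_DATE "
               "WHERE vuid=:vuid")
        return task_buffer.querySQLS(sql, varMap)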
Example #6
 def run(self):
     self.lock.acquire()
     try:
         # get jobs from DB
         ids = self.ids
         self.proxyLock.acquire()
         jobs = taskBuffer.peekJobs(ids,fromDefined=False,fromArchived=False,fromWaiting=False)
         self.proxyLock.release()
         upJobs = []
         finJobs = []
         for job in jobs:
             if job == None or job.jobStatus == 'unknown':
                 continue
             # use BNL by default
             dq2URL = siteMapper.getSite('BNL_ATLAS_1').dq2url
             dq2SE  = []
             # get LFC and SEs
             if job.prodSourceLabel == 'user' and not siteMapper.siteSpecList.has_key(job.destinationSE):
                 # using --destSE for analysis job to transfer output
                 try:
                     dq2URL = 'rucio://atlas-rucio.cern.ch:/grid/atlas'
                     match = re.search('.+://([^:/]+):*\d*/*',dataservice.DDM.toa.getSiteProperty(job.destinationSE,'srm')[-1])
                     if match != None:
                         dq2SE.append(match.group(1))
                 except:
                     type, value, traceBack = sys.exc_info()
                     _logger.error("%s Failed to get DQ2/SE with %s %s" % (job.PandaID,type,value))
                     continue
             elif siteMapper.checkCloud(job.cloud):
                 # normal production jobs
                 if DataServiceUtils.checkJobDestinationSE(job) == None:
                     tmpDstID = siteMapper.getCloud(job.cloud)['dest']
                 else:
                     tmpDstID = job.destinationSE
                 tmpDstSite = siteMapper.getSite(tmpDstID)
                 # get catalog URL
                 dq2URL = 'rucio://atlas-rucio.cern.ch:/grid/atlas'
                 if tmpDstSite.se != None:
                     for tmpDstSiteSE in tmpDstSite.se.split(','):
                         match = re.search('.+://([^:/]+):*\d*/*',tmpDstSiteSE)
                         if match != None:
                             dq2SE.append(match.group(1))
             # get LFN list
             lfns   = []
             guids  = []
             scopes = []
             nTokens = 0
             for file in job.Files:
                 # only output files are checked
                 if file.type == 'output' or file.type == 'log':
                     if file.status == 'nooutput':
                         continue
                     if DataServiceUtils.getDistributedDestination(file.destinationDBlockToken) != None:
                         continue
                     lfns.append(file.lfn)
                     guids.append(file.GUID)
                     scopes.append(file.scope)
                     nTokens += len(file.destinationDBlockToken.split(','))
             # get files in LRC
             _logger.debug("%s Cloud:%s DQ2URL:%s" % (job.PandaID,job.cloud,dq2URL))
             okFiles = brokerage.broker_util.getFilesFromLRC(lfns,dq2URL,guids,dq2SE,
                                                             getPFN=True,scopeList=scopes)
             # count files
             nOkTokens = 0
             for okLFN,okPFNs in okFiles.iteritems():
                 nOkTokens += len(okPFNs)
             # check all files are ready    
             _logger.debug("%s nToken:%s nOkToken:%s" % (job.PandaID,nTokens,nOkTokens))
             if nTokens <= nOkTokens:
                 _logger.debug("%s Finisher : Finish" % job.PandaID)
                 for file in job.Files:
                     if file.type == 'output' or file.type == 'log':
                         if file.status != 'nooutput':
                             file.status = 'ready'
                 # append to run Finisher
                 finJobs.append(job)                        
             else:
                 endTime = job.endTime
                 if endTime == 'NULL':
                     endTime = job.startTime
                 # priority-dependent timeout
                 tmpCloudSpec = siteMapper.getCloud(job.cloud)
                 if job.currentPriority >= 800 and (not job.prodSourceLabel in ['user']):
                     if tmpCloudSpec.has_key('transtimehi'):
                         timeOutValue = tmpCloudSpec['transtimehi']
                     else:
                         timeOutValue = 1
                 else:
                     if tmpCloudSpec.has_key('transtimelo'):                    
                         timeOutValue = tmpCloudSpec['transtimelo']
                     else:
                         timeOutValue = 2                        
                 # protection
                 if timeOutValue < 1:
                     timeOutValue  = 1
                 timeOut = self.timeNow - datetime.timedelta(days=timeOutValue)
                 _logger.debug("%s  Priority:%s Limit:%s End:%s" % (job.PandaID,job.currentPriority,str(timeOut),str(endTime)))
                 if endTime < timeOut:
                     # timeout
                     _logger.debug("%s Finisher : Kill" % job.PandaID)
                     strMiss = ''
                     for lfn in lfns:
                         if not lfn in okFiles:
                             strMiss += ' %s' % lfn
                     job.jobStatus = 'failed'
                     job.taskBufferErrorCode = taskbuffer.ErrorCode.EC_Transfer
                     job.taskBufferErrorDiag = 'transfer timeout for '+strMiss
                     guidMap = {}
                     for file in job.Files:
                         # set file status
                         if file.status == 'transferring' or file.type in ['log','output']:
                             file.status = 'failed'
                         # collect GUIDs to delete files from _tid datasets
                         if file.type == 'output' or file.type == 'log':
                             if not guidMap.has_key(file.destinationDBlock):
                                 guidMap[file.destinationDBlock] = []
                             guidMap[file.destinationDBlock].append(file.GUID)
                 else:
                     # wait
                     _logger.debug("%s Finisher : Wait" % job.PandaID)
                     for lfn in lfns:
                         if not lfn in okFiles:
                             _logger.debug("%s    -> %s" % (job.PandaID,lfn))
             upJobs.append(job)
         # update
         _logger.debug("updating ...")
         self.proxyLock.acquire()
         taskBuffer.updateJobs(upJobs,False)
         self.proxyLock.release()
         # run Finisher
         for job in finJobs:
             fThr = Finisher(taskBuffer,None,job)
             fThr.start()
             fThr.join()
         _logger.debug("done")
         time.sleep(1)
     except:
         errtype,errvalue = sys.exc_info()[:2]
         errStr  = "FinisherThr failed with %s %s" % (errtype,errvalue)
         errStr += traceback.format_exc()
         _logger.error(errStr)
     self.pool.remove(self)
     self.lock.release()
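The Finisher's timeout rule: high-priority production jobs (currentPriority >= 800 and not user jobs) get the cloud's transtimehi limit, defaulting to 1 day, everything else gets transtimelo, defaulting to 2 days, clamped to at least one day. A compact restatement (the function name is mine; it assumes the cloud spec is dict-like, as its has_key calls suggest):

    import datetime

    def transfer_deadline(now, cloud_spec, priority, prod_source_label):
        """Priority-dependent transfer deadline, as in FinisherThr above."""
        if priority >= 800 and prod_source_label != 'user':
            days = cloud_spec.get('transtimehi', 1)
        else:
            days = cloud_spec.get('transtimelo', 2)
        days = max(days, 1)  # protection against bad config values
        return now - datetime.timedelta(days=days)

    # a job whose endTime is older than transfer_deadline(...) is failed
    # with EC_Transfer; otherwise the Finisher keeps waiting.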
Example #7
 def run(self):
     self.lock.acquire()
     try:
         # get jobs from DB
         ids = self.ids
         self.proxyLock.acquire()
         jobs = taskBuffer.peekJobs(ids,fromActive=False,fromArchived=False,fromWaiting=False)
         self.proxyLock.release()
         actJobs = []
         for tmpJob in jobs:
             if tmpJob == None or tmpJob.jobStatus == 'unknown':
                 continue
             # get LFN list
             lfns   = []
             guids  = []
             scopes = []
             for tmpFile in tmpJob.Files:
                 # only input files are checked
                 if tmpFile.type == 'input' and tmpFile.status != 'ready':
                     lfns.append(tmpFile.lfn)
                     scopes.append(tmpFile.scope)
             # get file replicas
             _logger.debug("%s check input files at %s" % (tmpJob.PandaID,tmpJob.computingSite))
             tmpStat,okFiles = rucioAPI.listFileReplicas(scopes,lfns)
             if not tmpStat:
                 pass
             else:
                 # check if locally available
                 siteSpec = siteMapper.getSite(tmpJob.computingSite)
                 allOK = True
                 for tmpFile in tmpJob.Files:
                     # only input
                     if tmpFile.type == 'input' and tmpFile.status != 'ready':
                         # check RSEs
                         if tmpFile.lfn in okFiles:
                             for rse in okFiles[tmpFile.lfn]:
                                 if siteSpec.ddm_endpoints.isAssociated(rse) and \
                                         siteSpec.ddm_endpoints.getEndPoint(rse)['is_tape'] == 'N':
                                     tmpFile.status = 'ready'
                                     break
                         # missing
                         if tmpFile.status != 'ready':
                             allOK = False
                             _logger.debug("%s skip since %s:%s is missing" % (tmpJob.PandaID,tmpFile.scope,tmpFile.lfn))
                             break
                 if not allOK:
                     continue
                 # append to run activator
                 _logger.debug("%s to activate" % tmpJob.PandaID)
                 actJobs.append(tmpJob)
         # update
         _logger.debug("activating ...")
         self.proxyLock.acquire()
         taskBuffer.activateJobs(actJobs)
         self.proxyLock.release()
         _logger.debug("done")
         time.sleep(1)
     except:
         errtype,errvalue = sys.exc_info()[:2]
         _logger.error("ActivatorThr failed with %s %s" % (errtype,errvalue))
     self.pool.remove(self)
     self.lock.release()
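The Activator's readiness test, factored out: an input file counts as available only if one of its replica RSEs is associated with the computing site and is not a tape endpoint. A sketch using the attribute names from the example:

    def locally_available(site_spec, replica_rses):
        """True if any replica RSE is an associated, non-tape endpoint."""
        for rse in replica_rses:
            if site_spec.ddm_endpoints.isAssociated(rse) and \
                    site_spec.ddm_endpoints.getEndPoint(rse)['is_tape'] == 'N':
                return True
        return False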
Example #8
 def run(self):
     self.lock.acquire()
     try:
         # get jobs from DB
         ids = self.ids
         self.proxyLock.acquire()
         jobs = taskBuffer.peekJobs(ids,fromDefined=False,fromArchived=False,fromWaiting=False)
         self.proxyLock.release()
         upJobs = []
         finJobs = []
         for job in jobs:
             if job == None or job.jobStatus == 'unknown':
                 continue
             # use BNL by default
             dq2URL = siteMapper.getSite('BNL_ATLAS_1').dq2url
             dq2SE  = []
             # get LFC and SEs
             if job.prodSourceLabel == 'user' and not siteMapper.siteSpecList.has_key(job.destinationSE):
                 # using --destSE for analysis job to transfer output
                 try:
                     dq2URL = dataservice.DDM.toa.getLocalCatalog(job.destinationSE)[-1]
                     match = re.search('.+://([^:/]+):*\d*/*',dataservice.DDM.toa.getSiteProperty(job.destinationSE,'srm')[-1])
                     if match != None:
                         dq2SE.append(match.group(1))
                 except:
                     type, value, traceBack = sys.exc_info()
                     _logger.error("%s Failed to get DQ2/SE with %s %s" % (job.PandaID,type,value))
                     continue
             elif siteMapper.checkCloud(job.cloud):
                 # normal production jobs
                 if DataServiceUtils.checkJobDestinationSE(job) == None:
                     tmpDstID = siteMapper.getCloud(job.cloud)['dest']
                 else:
                     tmpDstID = job.destinationSE
                 tmpDstSite = siteMapper.getSite(tmpDstID)
                 # get catalog URL
                 tmpStat,dq2URL = dataservice.DDM.toa.getLocalCatalog(tmpDstSite.ddm)
                 if tmpDstSite.se != None:
                     for tmpDstSiteSE in tmpDstSite.se.split(','):
                         match = re.search('.+://([^:/]+):*\d*/*',tmpDstSiteSE)
                         if match != None:
                             dq2SE.append(match.group(1))
             # get LFN list
             lfns   = []
             guids  = []
             scopes = []
             nTokens = 0
             for file in job.Files:
                 # only output files are checked
                 if file.type == 'output' or file.type == 'log':
                     lfns.append(file.lfn)
                     guids.append(file.GUID)
                     scopes.append(file.scope)
                     nTokens += len(file.destinationDBlockToken.split(','))
             # get files in LRC
             _logger.debug("%s Cloud:%s DQ2URL:%s" % (job.PandaID,job.cloud,dq2URL))
             okFiles = brokerage.broker_util.getFilesFromLRC(lfns,dq2URL,guids,dq2SE,
                                                             getPFN=True,scopeList=scopes)
             # count files
             nOkTokens = 0
             for okLFN,okPFNs in okFiles.iteritems():
                 nOkTokens += len(okPFNs)
             # check all files are ready    
             _logger.debug("%s nToken:%s nOkToken:%s" % (job.PandaID,nTokens,nOkTokens))
             if nTokens <= nOkTokens:
                 _logger.debug("%s Finisher : Finish" % job.PandaID)
                 for file in job.Files:
                     if file.type == 'output' or file.type == 'log':
                         file.status = 'ready'
                 # append to run Finisher
                 finJobs.append(job)                        
             else:
                 endTime = job.endTime
                 if endTime == 'NULL':
                     endTime = job.startTime
                 # priority-dependent timeout
                 tmpCloudSpec = siteMapper.getCloud(job.cloud)
                 if job.currentPriority >= 800 and (not job.prodSourceLabel in ['user']):
                     if tmpCloudSpec.has_key('transtimehi'):
                         timeOutValue = tmpCloudSpec['transtimehi']
                     else:
                         timeOutValue = 1
                 else:
                     if tmpCloudSpec.has_key('transtimelo'):                    
                         timeOutValue = tmpCloudSpec['transtimelo']
                     else:
                         timeOutValue = 2                        
                 # protection
                 if timeOutValue < 1:
                     timeOutValue  = 1
                 timeOut = self.timeNow - datetime.timedelta(days=timeOutValue)
                 _logger.debug("%s  Priority:%s Limit:%s End:%s" % (job.PandaID,job.currentPriority,str(timeOut),str(endTime)))
                 if endTime < timeOut:
                     # timeout
                     _logger.debug("%s Finisher : Kill" % job.PandaID)
                     strMiss = ''
                     for lfn in lfns:
                         if not lfn in okFiles:
                             strMiss += ' %s' % lfn
                     job.jobStatus = 'failed'
                     job.taskBufferErrorCode = taskbuffer.ErrorCode.EC_Transfer
                     job.taskBufferErrorDiag = 'transfer timeout for '+strMiss
                     guidMap = {}
                     for file in job.Files:
                         # set file status
                         if file.status == 'transferring':
                             file.status = 'failed'
                         # collect GUIDs to delete files from _tid datasets
                         if file.type == 'output' or file.type == 'log':
                             if not guidMap.has_key(file.destinationDBlock):
                                 guidMap[file.destinationDBlock] = []
                             guidMap[file.destinationDBlock].append(file.GUID)
                 else:
                     # wait
                     _logger.debug("%s Finisher : Wait" % job.PandaID)
                     for lfn in lfns:
                         if not lfn in okFiles:
                             _logger.debug("%s    -> %s" % (job.PandaID,lfn))
             upJobs.append(job)
         # update
         _logger.debug("updating ...")
         self.proxyLock.acquire()
         taskBuffer.updateJobs(upJobs,False)
         self.proxyLock.release()
         # run Finisher
         for job in finJobs:
             fThr = Finisher(taskBuffer,None,job)
             fThr.start()
             fThr.join()
         _logger.debug("done")
         time.sleep(1)
     except:
         errtype,errvalue = sys.exc_info()[:2]
         _logger.error("FinisherThr failed with %s %s" % (errtype,errvalue))
     self.pool.remove(self)
     self.lock.release()
Example #9
 def run(self):
     self.lock.acquire()
     try:
         for vuid,name,modDate in self.datasets:
             _logger.debug("start %s %s" % (modDate,name))
             self.proxyLock.acquire()
             retF,resF = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ lfn FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND NOT status IN (:status1,:status2,:status3,:status4,:status5)",
                                          {':destinationDBlock':name,':status1':'ready',':status2':'failed',
                                           ':status3':'skipped',':status4':'merging',
                                           ':status5':'finished'})
             self.proxyLock.release()
             if retF<0:
                 _logger.error("SQL error")
             else:
                 # no files in filesTable
                 if len(resF) == 0:
                     _logger.debug("freeze %s " % name)
                     if name.startswith('panda.um.'):
                         self.proxyLock.acquire()
                         retMer,resMer = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND status IN (:statusM,:statusF) ",
                                                              {':destinationDBlock':name,
                                                               ':statusM':'merging',
                                                               ':statusF':'failed'})
                         self.proxyLock.release()
                         if resMer != None and len(resMer)>0:
                             mergeID = resMer[0][0]
                             # get merging jobs
                             self.proxyLock.acquire()
                             mergingJobs = taskBuffer.peekJobs([mergeID],fromDefined=False,fromArchived=False,fromWaiting=False)
                             self.proxyLock.release()    
                             mergeJob = mergingJobs[0]
                             if mergeJob != None:
                                 tmpDestDBlocks = []
                                 # get destDBlock
                                 for tmpFile in mergeJob.Files:
                                     if tmpFile.type in ['output','log']:
                                         if not tmpFile.destinationDBlock in tmpDestDBlocks:
                                             tmpDestDBlocks.append(tmpFile.destinationDBlock)
                                 # run
                                 _logger.debug("start JEDI closer for %s " % name)
                                 self.proxyLock.acquire()
                                 cThr = Closer(taskBuffer,tmpDestDBlocks,mergeJob)
                                 cThr.start()
                                 cThr.join()
                                 self.proxyLock.release()
                                 _logger.debug("end JEDI closer for %s " % name)
                                 continue
                             else:
                                 _logger.debug("failed to get merging job for %s " % name)
                         else:
                             _logger.debug("failed to get merging file for %s " % name)
                         status,out = 0,''
                     elif not name.startswith('pandaddm_'):
                         status,out = ddm.DQ2.main('freezeDataset',name)
                     else:
                         status,out = 0,''
                     if status != 0 and out.find('DQFrozenDatasetException') == -1 and \
                            out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \
                            out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1:
                         _logger.error(out)
                     else:
                         self.proxyLock.acquire()
                         varMap = {}
                         varMap[':vuid'] = vuid
                         varMap[':status'] = 'completed' 
                         taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid",
                                          varMap)
                         self.proxyLock.release()                            
                         if name.startswith('pandaddm_') or name.startswith('panda.um.'):
                             continue
                         # set tobedeleted to dis
                         setTobeDeletedToDis(name)
                         # count # of files
                         status,out = ddm.DQ2.main('getNumberOfFiles',name)
                         if status != 0:
                             if not 'DQUnknownDatasetException' in out:
                                 _logger.error(out)
                         else:
                             _logger.debug(out)                                            
                             try:
                                 nFile = int(out)
                                 _logger.debug(nFile)
                                 if nFile == 0:
                                     # erase dataset
                                     _logger.debug('erase %s' % name)                                
                                     status,out = ddm.DQ2.main('eraseDataset',name)
                                     _logger.debug('OK with %s' % name)
                             except:
                                 pass
                 else:
                     _logger.debug("wait %s " % name)
                     self.proxyLock.acquire()                        
                     taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE WHERE vuid=:vuid", {':vuid':vuid})
                     self.proxyLock.release()                                                    
             _logger.debug("end %s " % name)
     except:
         pass
     self.pool.remove(self)
     self.lock.release()
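Examples #8 and #9 are earlier revisions of the Finisher and dataset-freezing threads shown in Examples #6 and #5: they resolve catalogs through dataservice.DDM.toa and call DQ2 directly, where the newer code uses a fixed Rucio catalog URL and rucioAPI, but the peekJobs and status-update flow is the same.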
Example #10
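A later revision of the co-jumbo session from Example #2: getCoJumboJobsToBeFinished now returns a fourth list, coJumboTokill, while the finish-and-archive flow for jobs in Active and Defined is unchanged.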
# start sender
mailSender = MailSender()
mailSender.start()


# session for co-jumbo jobs
tmpLog.debug("co-jumbo session")
try:
    ret = taskBuffer.getCoJumboJobsToBeFinished(30,0,1000)
    if ret is None:
        tmpLog.debug("failed to get co-jumbo jobs to finish")
    else:
        coJumboA,coJumboD,coJumboW,coJumboTokill = ret
        tmpLog.debug("finish {0} co-jumbo jobs in Active".format(len(coJumboA)))
        if len(coJumboA) > 0:
            jobSpecs = taskBuffer.peekJobs(coJumboA,fromDefined=False,fromActive=True,fromArchived=False,fromWaiting=False)
            for jobSpec in jobSpecs:
                fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(jobSpec)
                if not fileCheckInJEDI:
                    jobSpec.jobStatus = 'closed'
                    jobSpec.jobSubStatus = 'cojumbo_wrong'
                    jobSpec.taskBufferErrorCode = taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                taskBuffer.archiveJobs([jobSpec],False)
        tmpLog.debug("finish {0} co-jumbo jobs in Defined".format(len(coJumboD)))
        if len(coJumboD) > 0:
            jobSpecs = taskBuffer.peekJobs(coJumboD,fromDefined=True,fromActive=False,fromArchived=False,fromWaiting=False)
            for jobSpec in jobSpecs:
                fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(jobSpec)
                if not fileCheckInJEDI:
                    jobSpec.jobStatus = 'closed'
                    jobSpec.jobSubStatus = 'cojumbo_wrong'
Example #11
 def run(self):
     self.lock.acquire()
     try:
         # loop over all datasets
         for vuid,name,modDate,verNum in self.datasets:
             try:
                 try:
                     verNum = int(verNum)
                 except:
                     verNum = 0
                 _logger.debug("Merge %s %s %s" % (modDate,name,verNum))
                 toBeClosed = False
                 # close old datasets anyway
                 if modDate < timeLimitX or verNum >= self.maxTry:
                     toBeClosed = True
                 # check version
                 dsSpec = taskBuffer.queryDatasetWithMap({'vuid':vuid})
                 if dsSpec == None:
                     _logger.error("failed to get dataset spec for %s:%s" % (name,vuid))
                     continue
                 try:
                     if int(dsSpec.version) != verNum+1:
                         _logger.debug("skip %s due to version mismatch %s != %s+1" % (name,dsSpec.version,verNum))
                         continue
                 except:
                     _logger.error("failed to convert version='%s' to int for %s" % (dsSpec.version,name))
                     continue
                 # get PandaID
                 self.proxyLock.acquire()                
                 proxyS = taskBuffer.proxyPool.getProxy()
                 pandaID = proxyS.getPandaIDwithDestDBlock(name)
                 taskBuffer.proxyPool.putProxy(proxyS)
                 self.proxyLock.release()                
                 if pandaID == None:
                     _logger.error("failed to find PandaID for %s" % name)
                     toBeClosed = True
                 else:
                     # get job
                     self.proxyLock.acquire()
                     pandaJob = taskBuffer.peekJobs([pandaID])[0]
                     self.proxyLock.release()
                     if pandaJob == None:
                         _logger.error("failed to get job for %s PandaID=%s" % (name,pandaID))
                         toBeClosed = True
                     else:
                         # run merger
                         _logger.debug("run merger for %s" % name)
                         merger = Merger(taskBuffer,pandaJob)
                         mRet = merger.run()
                         if mRet == None:
                             _logger.debug("got unrecoverable for %s" % name)
                             toBeClosed = True
                         elif mRet == True:
                             _logger.debug("succeeded for %s" % name)
                             toBeClosed = True
                         else:
                             _logger.debug("failed for %s" % name)                            
                 # close dataset
                 if toBeClosed:
                     _logger.debug("close %s" % name)                    
                     self.proxyLock.acquire()
                     varMap = {}
                     varMap[':vuid'] = vuid
                     varMap[':status'] = 'tobeclosed'
                     taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid",
                                      varMap)
                     self.proxyLock.release()
             except:
                 errType,errValue = sys.exc_info()[:2]
                 _logger.error("Failed %s with %s:%s" % (name,errType,errValue))                    
     except:
         errType,errValue = sys.exc_info()[:2]
         _logger.error("MergerThr failed with %s:%s" % (errType,errValue))
     self.pool.remove(self)
     self.lock.release()