Example #1
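ActivatorThr.run from the PanDA server: it peeks the given jobs from the database, uses rucioAPI.listDatasetReplicas to check whether every input file has a complete replica on a non-tape RSE associated with the computing site, and activates the jobs whose inputs are all available.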
 def run(self):
     self.lock.acquire()
     try:
         # get jobs from DB
         ids = self.ids
         self.proxyLock.acquire()
         jobs = taskBuffer.peekJobs(ids,fromActive=False,fromArchived=False,fromWaiting=False)
         self.proxyLock.release()
         actJobs = []
         replicaMap = dict()
         for tmpJob in jobs:
             if tmpJob is None or tmpJob.jobStatus == 'unknown':
                 continue
             # check if locally available
             siteSpec = siteMapper.getSite(tmpJob.computingSite)
             scope_input, scope_output = select_scope(siteSpec, tmpJob.prodSourceLabel)
             allOK = True
             for tmpFile in tmpJob.Files:
                 # only input files are checked
                 if tmpFile.type == 'input' and tmpFile.status != 'ready':
                     # get replicas
                     if tmpFile.dispatchDBlock not in replicaMap:
                         tmpStat, repMap = rucioAPI.listDatasetReplicas(tmpFile.dispatchDBlock)
                         if tmpStat != 0:
                             repMap = {}
                         replicaMap[tmpFile.dispatchDBlock] = repMap
                     # check RSEs
                     for rse in replicaMap[tmpFile.dispatchDBlock]:
                         repInfo = replicaMap[tmpFile.dispatchDBlock][rse]
                         if siteSpec.ddm_endpoints_input[scope_input].isAssociated(rse) and \
                                 siteSpec.ddm_endpoints_input[scope_input].getEndPoint(rse)['is_tape'] == 'N' and \
                                 repInfo[0]['total'] is not None and repInfo[0]['total'] == repInfo[0]['found']:
                             tmpFile.status = 'ready'
                             break
                     # missing
                     if tmpFile.status != 'ready':
                         allOK = False
                         _logger.debug("%s skip since %s:%s is missing with rule" % (tmpJob.PandaID,tmpFile.scope,tmpFile.lfn))
                         break
             if not allOK:
                 continue
             # append to run activator
             _logger.debug("%s to activate with rule" % tmpJob.PandaID)
             actJobs.append(tmpJob)
         # update
         _logger.debug("activating ...")
         self.proxyLock.acquire()
         taskBuffer.activateJobs(actJobs)
         self.proxyLock.release()
         _logger.debug("done")
         time.sleep(1)
     except Exception:
         errtype,errvalue = sys.exc_info()[:2]
         _logger.error("ActivatorThr failed with %s %s" % (errtype,errvalue))
     self.pool.remove(self)
     self.lock.release()
Example #2
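An excerpt of the co-jumbo session from the housekeeping script (shown in full in Example #8): co-jumbo jobs whose input-file status is inconsistent in JEDI are closed before being archived. The original snippet broke off mid-call; the tail below is restored from the identical full version in Example #8.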
mailSender.start()

# session for co-jumbo jobs
tmpLog.debug("co-jumbo session")
try:
    ret = taskBuffer.getCoJumboJobsToBeFinished(30, 0, 1000)
    if ret is None:
        tmpLog.debug("failed to get co-jumbo jobs to finish")
    else:
        coJumboA, coJumboD, coJumboW, coJumboTokill = ret
        tmpLog.debug("finish {0} co-jumbo jobs in Active".format(
            len(coJumboA)))
        if len(coJumboA) > 0:
            jobSpecs = taskBuffer.peekJobs(coJumboA,
                                           fromDefined=False,
                                           fromActive=True,
                                           fromArchived=False,
                                           fromWaiting=False)
            for jobSpec in jobSpecs:
                fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                    jobSpec)
                if not fileCheckInJEDI:
                    jobSpec.jobStatus = 'closed'
                    jobSpec.jobSubStatus = 'cojumbo_wrong'
                    jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                taskBuffer.archiveJobs([jobSpec], False)
        tmpLog.debug("finish {0} co-jumbo jobs in Defined".format(
            len(coJumboD)))
        if len(coJumboD) > 0:
            jobSpecs = taskBuffer.peekJobs(coJumboD,
                                           fromDefined=True,
                                           fromActive=False,
                                           fromArchived=False,
                                           fromWaiting=False)
            for jobSpec in jobSpecs:
                fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                    jobSpec)
                if not fileCheckInJEDI:
                    jobSpec.jobStatus = 'closed'
                    jobSpec.jobSubStatus = 'cojumbo_wrong'
                    jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                taskBuffer.archiveJobs([jobSpec], True)
except Exception:
    errStr = traceback.format_exc()
    tmpLog.error(errStr)
Example #3
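A fragment that records the GUID of a lib.tgz build output and then activates the downstream jobs waiting on it; the opening condition was cut off in the original snippet and is reconstructed from context below.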
 # NOTE: the opening of this snippet was truncated; the condition below is
 # reconstructed from context: both the GUID and the lib dataset name must be set
 if (libGUID not in [None, '']) and (libDSName not in [None, '']):
     # update GUID
     tmpLog.debug("  set GUID:%s for %s" % (libGUID, libLFN))
     #retG = taskBuffer.setGUIDs([{'lfn':libLFN,'guid':libGUID}])
     # FIXME
     retG = True
     if not retG:
         tmpLog.error("  failed to update GUID for %s" % libLFN)
     else:
         # get PandaID with lib.tgz
         #ids = taskBuffer.updateInFilesReturnPandaIDs(libDSName,'ready')
         ids = []
         # get jobs
         jobs = taskBuffer.peekJobs(ids,
                                    fromActive=False,
                                    fromArchived=False,
                                    fromWaiting=False)
         # remove None and unknown
         acJobs = []
         for job in jobs:
             if job is None or job.jobStatus == 'unknown':
                 continue
             acJobs.append(job)
         # activate
         tmpLog.debug("  -> activate downstream jobs")
         #taskBuffer.activateJobs(acJobs)
 else:
     # wait
     tmpLog.debug("  -> wait")
     varMap = {}
     varMap[':prodSourceLabel'] = 'user'
Example #4
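A variant of ActivatorThr.run that resolves individual input-file replicas with rucioAPI.listFileReplicas and activates a job once every missing input is found on a non-tape RSE associated with the computing site.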
 def run(self):
     self.lock.acquire()
     try:
         # get jobs from DB
         ids = self.ids
         self.proxyLock.acquire()
         jobs = taskBuffer.peekJobs(ids,fromActive=False,fromArchived=False,fromWaiting=False)
         self.proxyLock.release()
         actJobs = []
         for tmpJob in jobs:
             if tmpJob is None or tmpJob.jobStatus == 'unknown':
                 continue
             # get LFN list
             lfns   = []
             guids  = []
             scopes = []
             for tmpFile in tmpJob.Files:
                 # only input files are checked
                 if tmpFile.type == 'input' and tmpFile.status != 'ready':
                     lfns.append(tmpFile.lfn)
                     scopes.append(tmpFile.scope)
             # get file replicas
             _logger.debug("%s check input files at %s" % (tmpJob.PandaID, tmpJob.computingSite))
             tmpStat,okFiles = rucioAPI.listFileReplicas(scopes,lfns)
             if not tmpStat:
                 # replica lookup failed; leave these files for the next cycle
                 pass
             else:
                 # check if locally available
                 siteSpec = siteMapper.getSite(tmpJob.computingSite)
                 scope_input, scope_output = select_scope(siteSpec, tmpJob.prodSourceLabel)
                 allOK = True
                 for tmpFile in tmpJob.Files:
                     # only input
                     if tmpFile.type == 'input' and tmpFile.status != 'ready':
                         # check RSEs
                         if tmpFile.lfn in okFiles:
                             for rse in okFiles[tmpFile.lfn]:
                                 if siteSpec.ddm_endpoints_input[scope_input].isAssociated(rse) and \
                                         siteSpec.ddm_endpoints_input[scope_input].getEndPoint(rse)['is_tape'] == 'N':
                                     tmpFile.status = 'ready'
                                     break
                         # missing
                         if tmpFile.status != 'ready':
                             allOK = False
                             _logger.debug("%s skip since %s:%s is missing" % (tmpJob.PandaID,tmpFile.scope,tmpFile.lfn))
                             break
                 if not allOK:
                     continue
                 # append to run activator
                 _logger.debug("%s to activate" % tmpJob.PandaID)
                 actJobs.append(tmpJob)
         # update
         _logger.debug("activating ...")
         self.proxyLock.acquire()
         taskBuffer.activateJobs(actJobs)
         self.proxyLock.release()
         _logger.debug("done")
         time.sleep(1)
     except Exception:
         errtype,errvalue = sys.exc_info()[:2]
         _logger.error("ActivatorThr failed with %s %s" % (errtype,errvalue))
     self.pool.remove(self)
     self.lock.release()
Example #5
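FinisherThr.run: it counts the expected output-replica tokens against what Rucio reports, marks fully transferred jobs for the Finisher, and fails jobs whose transfers exceed a priority-dependent timeout taken from the cloud configuration.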
 def run(self):
     self.lock.acquire()
     try:
         # get jobs from DB
         ids = self.ids
         self.proxyLock.acquire()
         jobs = taskBuffer.peekJobs(ids,fromDefined=False,fromArchived=False,fromWaiting=False)
         self.proxyLock.release()
         upJobs = []
         finJobs = []
         for job in jobs:
             if job is None or job.jobStatus == 'unknown':
                 continue
             seList = ['dummy']
             tmpNucleus = siteMapper.getNucleus(job.nucleus)
             # get SEs
             if job.prodSourceLabel == 'user' and job.destinationSE not in siteMapper.siteSpecList:
                 # using --destSE for analysis job to transfer output
                 seList = [job.destinationSE]
             elif tmpNucleus is not None:
                 seList = list(tmpNucleus.allDdmEndPoints)
             elif siteMapper.checkCloud(job.cloud):
                 # normal production jobs
                 if DataServiceUtils.checkJobDestinationSE(job) is None:
                     tmpDstID = siteMapper.getCloud(job.cloud)['dest']
                 else:
                     tmpDstID = job.destinationSE
                 tmpDstSite = siteMapper.getSite(tmpDstID)
                 scope_input, scope_output = select_scope(tmpDstSite, job.prodSourceLabel)
                 seList = tmpDstSite.ddm_endpoints_output[scope_output].getLocalEndPoints()
             # get LFN list
             lfns   = []
             guids  = []
             scopes = []
             nTokens = 0
             for file in job.Files:
                 # only output files are checked
                 if file.type == 'output' or file.type == 'log':
                     if file.status == 'nooutput':
                         continue
                     if DataServiceUtils.getDistributedDestination(file.destinationDBlockToken) is not None:
                         continue
                     lfns.append(file.lfn)
                     guids.append(file.GUID)
                     scopes.append(file.scope)
                     nTokens += len(file.destinationDBlockToken.split(','))
             # get files in LRC
             _logger.debug("%s Cloud:%s" % (job.PandaID,job.cloud))
             tmpStat,okFiles = rucioAPI.listFileReplicas(scopes,lfns,seList)
             if not tmpStat:
                 _logger.error("%s failed to get file replicas" % job.PandaID)
                 okFiles = {}
             # count files
             nOkTokens = 0
             for okLFN in okFiles:
                 okSEs = okFiles[okLFN]
                 nOkTokens += len(okSEs)
             # check all files are ready    
             _logger.debug("%s nToken:%s nOkToken:%s" % (job.PandaID,nTokens,nOkTokens))
             if nTokens <= nOkTokens:
                 _logger.debug("%s Finisher : Finish" % job.PandaID)
                 for file in job.Files:
                     if file.type == 'output' or file.type == 'log':
                         if file.status != 'nooutput':
                             file.status = 'ready'
                 # append to run Finisher
                 finJobs.append(job)                        
             else:
                 endTime = job.endTime
                 if endTime == 'NULL':
                     endTime = job.startTime
                 # priority-dependent timeout
                 tmpCloudSpec = siteMapper.getCloud(job.cloud)
                 if job.currentPriority >= 800 and job.prodSourceLabel not in ['user']:
                     if 'transtimehi' in tmpCloudSpec:
                         timeOutValue = tmpCloudSpec['transtimehi']
                     else:
                         timeOutValue = 1
                 else:
                     if 'transtimelo' in tmpCloudSpec:
                         timeOutValue = tmpCloudSpec['transtimelo']
                     else:
                         timeOutValue = 2                        
                 # protection
                 if timeOutValue < 1:
                     timeOutValue  = 1
                 timeOut = self.timeNow - datetime.timedelta(days=timeOutValue)
                 _logger.debug("%s  Priority:%s Limit:%s End:%s" % (job.PandaID,job.currentPriority,str(timeOut),str(endTime)))
                 if endTime < timeOut:
                     # timeout
                     _logger.debug("%s Finisher : Kill" % job.PandaID)
                     strMiss = ''
                     for lfn in lfns:
                         if lfn not in okFiles:
                             strMiss += ' %s' % lfn
                     job.jobStatus = 'failed'
                     job.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_Transfer
                     job.taskBufferErrorDiag = 'transfer timeout for '+strMiss
                     guidMap = {}
                     for file in job.Files:
                         # set file status
                         if file.status == 'transferring' or file.type in ['log','output']:
                             file.status = 'failed'
                         # collect GUIDs to delete files from _tid datasets
                         if file.type == 'output' or file.type == 'log':
                             if file.destinationDBlock not in guidMap:
                                 guidMap[file.destinationDBlock] = []
                             guidMap[file.destinationDBlock].append(file.GUID)
                 else:
                     # wait
                     _logger.debug("%s Finisher : Wait" % job.PandaID)
                     for lfn in lfns:
                         if lfn not in okFiles:
                             _logger.debug("%s    -> %s" % (job.PandaID,lfn))
             upJobs.append(job)
         # update
         _logger.debug("updating ...")
         self.proxyLock.acquire()
         taskBuffer.updateJobs(upJobs,False)
         self.proxyLock.release()
         # run Finisher
         for job in finJobs:
             fThr = Finisher(taskBuffer,None,job)
             fThr.start()
             fThr.join()
         _logger.debug("done")
         time.sleep(1)
     except Exception:
         errtype,errvalue = sys.exc_info()[:2]
         errStr  = "FinisherThr failed with %s %s" % (errtype,errvalue)
         errStr += traceback.format_exc()
         _logger.error(errStr)
     self.pool.remove(self)
     self.lock.release()
Example #6
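A dataset freezer thread: once every file bound to a destination dataset has reached a terminal status, it closes the dataset in Rucio, marks it completed in ATLAS_PANDA.Datasets, and erases it when it holds no files; panda.um.* datasets instead trigger a JEDI Closer for the corresponding merge job.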
 def run(self):
     self.lock.acquire()
     try:
         for vuid,name,modDate in self.datasets:
             _logger.debug("Freezer start %s %s" % (modDate,name))
             self.proxyLock.acquire()
             retF,resF = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID,status FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock ",
                                          {':destinationDBlock':name})
             self.proxyLock.release()
             if retF < 0:
                 _logger.error("SQL error")
             else:
                 allFinished = True
                 onePandaID = None
                 for tmpPandaID,tmpFileStatus in resF:
                     onePandaID = tmpPandaID
                     if tmpFileStatus not in ['ready', 'failed', 'skipped', 'merging', 'finished']:
                         allFinished = False
                         break
                 # check sub datasets in the jobset for event service job
                 if allFinished:
                     self.proxyLock.acquire()
                     tmpJobs = taskBuffer.getFullJobStatus([onePandaID])
                     self.proxyLock.release()
                     if len(tmpJobs) > 0 and tmpJobs[0] is not None:
                         if EventServiceUtils.isEventServiceMerge(tmpJobs[0]):
                             self.proxyLock.acquire()
                             cThr = Closer(taskBuffer, [], tmpJobs[0])
                             allFinished = cThr.checkSubDatasetsInJobset()
                             self.proxyLock.release()
                             _logger.debug("closer checked sub datasets in the jobset for %s : %s" % (name, allFinished))
                 # no files in filesTable
                 if allFinished:
                     _logger.debug("freeze %s " % name)
                     dsExists = True
                     if name.startswith('pandaddm_') or name.startswith('user.') or name.startswith('group.') \
                             or name.startswith('hc_test.') or name.startswith('panda.um.'):
                         dsExists = False
                     if name.startswith('panda.um.'):
                         self.proxyLock.acquire()
                         retMer,resMer = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND status IN (:statusM,:statusF) ",
                                                              {':destinationDBlock':name,
                                                               ':statusM':'merging',
                                                               ':statusF':'failed'})
                         self.proxyLock.release()
                         if resMer is not None and len(resMer)>0:
                             mergeID = resMer[0][0]
                             # get merging jobs
                             self.proxyLock.acquire()
                             mergingJobs = taskBuffer.peekJobs([mergeID],fromDefined=False,fromArchived=False,fromWaiting=False)
                             self.proxyLock.release()    
                             mergeJob = mergingJobs[0]
                             if mergeJob is not None:
                                 tmpDestDBlocks = []
                                 # get destDBlock
                                 for tmpFile in mergeJob.Files:
                                     if tmpFile.type in ['output','log']:
                                         if tmpFile.destinationDBlock not in tmpDestDBlocks:
                                             tmpDestDBlocks.append(tmpFile.destinationDBlock)
                                 # run
                                 _logger.debug("start JEDI closer for %s " % name)
                                 self.proxyLock.acquire()
                                 cThr = Closer(taskBuffer,tmpDestDBlocks,mergeJob)
                                 cThr.start()
                                 cThr.join()
                                 self.proxyLock.release()
                                 _logger.debug("end JEDI closer for %s " % name)
                                 continue
                             else:
                                 _logger.debug("failed to get merging job for %s " % name)
                         else:
                             _logger.debug("failed to get merging file for %s " % name)
                         status,out = True,''
                     elif dsExists:
                         # check if dataset exists
                         status,out = rucioAPI.getMetaData(name)
                         if status is True:
                             if out is not None:
                                 try:
                                     rucioAPI.closeDataset(name)
                                     status = True
                                 except Exception:
                                     errtype,errvalue = sys.exc_info()[:2]
                                     out = 'failed to freeze : {0} {1}'.format(errtype,errvalue)
                                     status = False
                             else:
                                 # dataset not exist
                                 status,out = True,''
                                 dsExists = False
                     else:
                         status,out = True,''
                     if not status:
                         _logger.error('{0} failed to freeze with {1}'.format(name,out))
                     else:
                         self.proxyLock.acquire()
                         varMap = {}
                         varMap[':vuid'] = vuid
                         varMap[':status'] = 'completed' 
                         taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid",
                                          varMap)
                         self.proxyLock.release()                            
                         if name.startswith('pandaddm_') or name.startswith('panda.um.') or not dsExists:
                             continue
                         # set tobedeleted to dis
                         setTobeDeletedToDis(name)
                         # count # of files
                         status,out = rucioAPI.getNumberOfFiles(name)
                         if status is not True:
                             if status is False:
                                 _logger.error(out)
                         else:
                             _logger.debug(out)                                            
                             try:
                                 nFile = int(out)
                                 _logger.debug(nFile)
                                 if nFile == 0:
                                     # erase dataset
                                     _logger.debug('erase %s' % name)                                
                                     status,out = rucioAPI.eraseDataset(name)
                                     _logger.debug('OK with %s' % name)
                             except Exception:
                                 pass
                 else:
                     _logger.debug("wait %s " % name)
                     self.proxyLock.acquire()                        
                     taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE WHERE vuid=:vuid", {':vuid':vuid})
                     self.proxyLock.release()                                                    
             _logger.debug("end %s " % name)
     except Exception:
         errStr = traceback.format_exc()
         _logger.error(errStr)
     self.pool.remove(self)
     self.lock.release()
Example #7
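The header of a standalone test script: it initializes the TaskBuffer and SiteMapper, peeks a single job by PandaID taken from the command line, and starts assembling a POOL file catalog XML for it.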
import sys

import pandaserver.userinterface.Client as Client
from pandaserver.userinterface.Client import baseURLSSL

from pandaserver.taskbuffer.TaskBuffer import taskBuffer
from pandaserver.brokerage.SiteMapper import SiteMapper
from pandaserver.config import panda_config
from pandaserver.dataservice import DataServiceUtils
from pandaserver.dataservice.DataServiceUtils import select_scope

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

job_id = sys.argv[1]
job = taskBuffer.peekJobs([job_id])[0]

if job is None:
    print("got None")
    sys.exit(0)

xml = """<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
<!-- ATLAS file meta-data catalog -->
<!DOCTYPE POOLFILECATALOG SYSTEM "InMemory">
<POOLFILECATALOG>
"""
try:
    att = sys.argv[2]
except Exception:
    att = job.attemptNr
Example #8
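main() of a housekeeping cron: it deletes stale records from jobsDefined4, aggregates pilot getJob/updateJob counts from the dispatcher log, inserts nRunning site data, finishes or kills co-jumbo jobs, and forks setupper processes for recent set.* files.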
def main(argv=tuple(), tbuf=None, **kwargs):

    # Python 2/3 compatibility shim: the 'long' builtin no longer exists in Python 3
    try:
        long
    except NameError:
        long = int

    tmpLog = LogWrapper(_logger, None)

    tmpLog.debug("===================== start =====================")

    # current minute
    currentMinute = datetime.datetime.utcnow().minute

    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost,
                        panda_config.dbpasswd,
                        nDBConnection=1)
    else:
        taskBuffer = tbuf

    # instantiate sitemapper
    aSiteMapper = SiteMapper(taskBuffer)

    # delete
    tmpLog.debug("Del session")
    status, retSel = taskBuffer.querySQLS(
        "SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4", {})
    if retSel is not None:
        try:
            maxID = retSel[0][0]
            tmpLog.debug("maxID : %s" % maxID)
            if maxID is not None:
                varMap = {}
                varMap[':maxID'] = maxID
                varMap[':jobStatus1'] = 'activated'
                varMap[':jobStatus2'] = 'waiting'
                varMap[':jobStatus3'] = 'failed'
                varMap[':jobStatus4'] = 'cancelled'
                status, retDel = taskBuffer.querySQLS(
                    "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID<:maxID AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3,:jobStatus4)",
                    varMap)
        except Exception:
            pass

    # count # of getJob/updateJob in dispatcher's log
    try:
        # don't update when logrotate is running
        timeNow = datetime.datetime.utcnow()
        logRotateTime = timeNow.replace(hour=3,
                                        minute=2,
                                        second=0,
                                        microsecond=0)
        if (timeNow > logRotateTime and (timeNow-logRotateTime) < datetime.timedelta(minutes=5)) or \
               (logRotateTime > timeNow and (logRotateTime-timeNow) < datetime.timedelta(minutes=5)):
            tmpLog.debug("skip pilotCounts session for logrotate")
        else:
            # log filename
            dispLogName = '%s/panda-PilotRequests.log' % panda_config.logdir
            # time limit
            timeLimit = datetime.datetime.utcnow() - datetime.timedelta(
                hours=3)
            timeLimitS = datetime.datetime.utcnow() - datetime.timedelta(
                hours=1)
            # check if tgz is required
            com = 'head -1 %s' % dispLogName
            lostat, loout = commands_get_status_output(com)
            useLogTgz = True
            if lostat == 0:
                match = re.search(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',
                                  loout)
                if match is not None:
                    startTime = datetime.datetime(*time.strptime(
                        match.group(0), '%Y-%m-%d %H:%M:%S')[:6])
                    # current log contains all info
                    if startTime < timeLimit:
                        useLogTgz = False
            # log files
            dispLogNameList = [dispLogName]
            if useLogTgz:
                today = datetime.date.today()
                dispLogNameList.append('{0}-{1}.gz'.format(
                    dispLogName, today.strftime('%Y%m%d')))
            # delete tmp
            commands_get_status_output('rm -f %s.tmp-*' % dispLogName)
            # tmp name
            tmpLogName = '%s.tmp-%s' % (dispLogName, datetime.datetime.utcnow(
            ).strftime('%Y-%m-%d-%H-%M-%S'))
            # loop over all files
            pilotCounts = {}
            pilotCountsS = {}
            for tmpDispLogName in dispLogNameList:
                # expand or copy
                if tmpDispLogName.endswith('.gz'):
                    com = 'gunzip -c %s > %s' % (tmpDispLogName, tmpLogName)
                else:
                    com = 'cp %s %s' % (tmpDispLogName, tmpLogName)
                lostat, loout = commands_get_status_output(com)
                if lostat != 0:
                    errMsg = 'failed to expand/copy %s with : %s' % (
                        tmpDispLogName, loout)
                    raise RuntimeError(errMsg)
                # search string
                sStr = r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}).*'
                sStr += r'method=(.+),site=(.+),node=(.+),type=(.+)'
                # read
                logFH = open(tmpLogName)
                for line in logFH:
                    # check format
                    match = re.search(sStr, line)
                    if match is not None:
                        # check timerange
                        timeStamp = datetime.datetime(*time.strptime(
                            match.group(1), '%Y-%m-%d %H:%M:%S')[:6])
                        if timeStamp < timeLimit:
                            continue
                        tmpMethod = match.group(2)
                        tmpSite = match.group(3)
                        tmpNode = match.group(4)
                        tmpType = match.group(5)

                        # protection against corrupted entries from pilot,
                        # e.g. pilot reading site json from cvmfs while it was being updated
                        if tmpSite not in aSiteMapper.siteSpecList:
                            continue
                        # sum
                        pilotCounts.setdefault(tmpSite, {})
                        pilotCounts[tmpSite].setdefault(tmpMethod, {})
                        pilotCounts[tmpSite][tmpMethod].setdefault(tmpNode, 0)
                        pilotCounts[tmpSite][tmpMethod][tmpNode] += 1
                        # short
                        if timeStamp > timeLimitS:
                            pilotCountsS.setdefault(tmpSite, {})
                            pilotCountsS[tmpSite].setdefault(tmpMethod, {})
                            pilotCountsS[tmpSite][tmpMethod].setdefault(tmpNode, 0)
                            pilotCountsS[tmpSite][tmpMethod][tmpNode] += 1
                # close
                logFH.close()
            # delete tmp
            commands_get_status_output('rm %s' % tmpLogName)
            # update
            hostID = panda_config.pserverhost.split('.')[0]
            tmpLog.debug("pilotCounts session")
            retPC = taskBuffer.updateSiteData(hostID, pilotCounts, interval=3)
            tmpLog.debug(retPC)
            retPC = taskBuffer.updateSiteData(hostID, pilotCountsS, interval=1)
            tmpLog.debug(retPC)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("updateJob/getJob : %s %s" % (errType, errValue))

    # nRunning
    tmpLog.debug("nRunning session")
    try:
        # floor division so the modulo check also works under Python 3
        if (currentMinute // panda_config.nrun_interval
            ) % panda_config.nrun_hosts == panda_config.nrun_snum:
            retNR = taskBuffer.insertnRunningInSiteData()
            tmpLog.debug(retNR)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("nRunning : %s %s" % (errType, errValue))

    # session for co-jumbo jobs
    tmpLog.debug("co-jumbo session")
    try:
        ret = taskBuffer.getCoJumboJobsToBeFinished(30, 0, 1000)
        if ret is None:
            tmpLog.debug("failed to get co-jumbo jobs to finish")
        else:
            coJumboA, coJumboD, coJumboW, coJumboTokill = ret
            tmpLog.debug("finish {0} co-jumbo jobs in Active".format(
                len(coJumboA)))
            if len(coJumboA) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboA,
                                               fromDefined=False,
                                               fromActive=True,
                                               fromArchived=False,
                                               fromWaiting=False)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], False)
            tmpLog.debug("finish {0} co-jumbo jobs in Defined".format(
                len(coJumboD)))
            if len(coJumboD) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboD,
                                               fromDefined=True,
                                               fromActive=False,
                                               fromArchived=False,
                                               fromWaiting=False)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], True)
            tmpLog.debug("finish {0} co-jumbo jobs in Waiting".format(
                len(coJumboW)))
            if len(coJumboW) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboW,
                                               fromDefined=False,
                                               fromActive=False,
                                               fromArchived=False,
                                               fromWaiting=True)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], False, True)
            tmpLog.debug("kill {0} co-jumbo jobs in Waiting".format(
                len(coJumboTokill)))
            if len(coJumboTokill) > 0:
                jediJobs = list(coJumboTokill)
                nJob = 100
                iJob = 0
                while iJob < len(jediJobs):
                    tmpLog.debug(' killing %s' %
                                 str(jediJobs[iJob:iJob + nJob]))
                    Client.killJobs(jediJobs[iJob:iJob + nJob],
                                    51,
                                    keepUnmerged=True)
                    iJob += nJob
    except Exception:
        errStr = traceback.format_exc()
        tmpLog.error(errStr)

    tmpLog.debug("Fork session")

    # thread for fork
    class ForkThr(threading.Thread):
        def __init__(self, fileName):
            threading.Thread.__init__(self)
            self.fileName = fileName

        def run(self):
            if 'VIRTUAL_ENV' in os.environ:
                prefix = os.environ['VIRTUAL_ENV']
            else:
                prefix = ''
            setupStr = 'source {0}/etc/sysconfig/panda_server; '.format(prefix)
            runStr = '%s/python -Wignore ' % panda_config.native_python
            runStr += panda_config.pandaPython_dir + '/dataservice/forkSetupper.py -i '
            runStr += self.fileName
            if self.fileName.split('/')[-1].startswith('set.NULL.'):
                runStr += ' -t'
            comStr = setupStr + runStr
            tmpLog.debug(comStr)
            commands_get_status_output(comStr)

    # get set.* files
    filePatt = panda_config.logdir + '/' + 'set.*'
    fileList = glob.glob(filePatt)

    # the max number of threads
    maxThr = 10
    nThr = 0

    # loop over all files
    forkThrList = []
    timeNow = datetime.datetime.utcnow()
    for tmpName in fileList:
        if not os.path.exists(tmpName):
            continue
        try:
            # only consider recently modified files; slice [:6] so that
            # tm_wday does not leak into datetime's microsecond argument
            modTime = datetime.datetime(
                *(time.gmtime(os.path.getmtime(tmpName))[:6]))
            if (timeNow - modTime) > datetime.timedelta(minutes=1) and \
                    (timeNow - modTime) < datetime.timedelta(hours=1):
                cSt, cOut = commands_get_status_output(
                    'ps aux | grep fork | grep -v PYTH')
                # if no process is running for the file
                if cSt == 0 and tmpName not in cOut:
                    nThr += 1
                    thr = ForkThr(tmpName)
                    thr.start()
                    forkThrList.append(thr)
                    if nThr >= maxThr:
                        break
        except Exception:
            errType, errValue = sys.exc_info()[:2]
            tmpLog.error("%s %s" % (errType, errValue))

    # join fork threads
    for thr in forkThrList:
        thr.join()

    # terminate TaskBuffer IF
    # taskBufferIF.terminate()

    tmpLog.debug("===================== end =====================")